From 8421f23975ff6155c027dc38d87533a3c14b9acf Mon Sep 17 00:00:00 2001 From: Blargian Date: Sun, 28 Jan 2024 22:56:47 +0100 Subject: [PATCH 001/392] #56257 - add failing test and new setting for parsing TSV files with crlf --- src/Core/Settings.h | 1 + src/Formats/FormatFactory.cpp | 1 + src/Formats/FormatSettings.h | 1 + .../02973_parse_crlf_with_tsv_files.sh | 23 +++++++++++++++++++ 4 files changed, 26 insertions(+) create mode 100755 tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.sh diff --git a/src/Core/Settings.h b/src/Core/Settings.h index e0b3ca39899..a62380ad926 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -1012,6 +1012,7 @@ class IColumn; M(Bool, input_format_csv_skip_trailing_empty_lines, false, "Skip trailing empty lines in CSV format", 0) \ M(Bool, input_format_tsv_skip_trailing_empty_lines, false, "Skip trailing empty lines in TSV format", 0) \ M(Bool, input_format_custom_skip_trailing_empty_lines, false, "Skip trailing empty lines in CustomSeparated format", 0) \ + M(Bool, input_format_tsv_crlf_end_of_line, false, "If it is set true, file function will read TSV format with \\r\\n instead of \\n.", 0) \ \ M(Bool, input_format_native_allow_types_conversion, true, "Allow data types conversion in Native input format", 0) \ \ diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 62cbadec4f4..a4a6e1ab83a 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -175,6 +175,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.tsv.try_detect_header = settings.input_format_tsv_detect_header; format_settings.tsv.skip_trailing_empty_lines = settings.input_format_tsv_skip_trailing_empty_lines; format_settings.tsv.allow_variable_number_of_columns = settings.input_format_tsv_allow_variable_number_of_columns; + format_settings.tsv.crlf_end_of_line_input = settings.input_format_tsv_crlf_end_of_line; format_settings.values.accurate_types_of_literals = settings.input_format_values_accurate_types_of_literals; format_settings.values.allow_data_after_semicolon = settings.input_format_values_allow_data_after_semicolon; format_settings.values.deduce_templates_of_expressions = settings.input_format_values_deduce_templates_of_expressions; diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index 30e4dd04513..7231e10a763 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -351,6 +351,7 @@ struct FormatSettings bool try_detect_header = true; bool skip_trailing_empty_lines = false; bool allow_variable_number_of_columns = false; + bool crlf_end_of_line_input = false; } tsv; struct diff --git a/tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.sh b/tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.sh new file mode 100755 index 00000000000..6f7308e18a4 --- /dev/null +++ b/tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +USER_FILES_PATH=$($CLICKHOUSE_CLIENT --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') + +# Test setup +touch ${USER_FILES_PATH:?}/02973_data_without_crlf.tsv +touch ${USER_FILES_PATH:?}/02973_data_with_crlf.tsv +echo -e 'Akiba_Hebrew_Academy\t2017-08-01\t241\nAegithina_tiphia\t2018-02-01\t34\n1971-72_Utah_Stars_season\t2016-10-01\t1' > "$USER_FILES_PATH/02973_data_without_crlf.tsv" +echo -e 'Akiba_Hebrew_Academy\t2017-08-01\t241\r\nAegithina_tiphia\t2018-02-01\t34\r\n1971-72_Utah_Stars_season\t2016-10-01\t1\r' > "$USER_FILES_PATH/02973_data_with_crlf.tsv" + +$CLICKHOUSE_CLIENT --multiquery "SELECT * FROM file(02973_data_without_crlf.tsv, 'TabSeparated', 'SearchTerm String, Date Date, Hits UInt32');" +$CLICKHOUSE_CLIENT --multiquery "SELECT * FROM file(02973_data_with_crlf.tsv, 'TabSeparated', 'SearchTerm String, Date Date, Hits UInt32'); --{clientError 117}" + +# Change setting to escape \r +$CLICKHOUSE_CLIENT --multiquery "SELECT * FROM file(02973_data_with_crlf.tsv, 'TabSeparated', 'SearchTerm String, Date Date, Hits UInt32');" + +# Test teardown +rm "$USER_FILES_PATH/02973_data_without_crlf.tsv" +rm "$USER_FILES_PATH/02973_data_with_crlf.tsv" From 7ae202376f29e56a9dc82ad911155ab451c0317b Mon Sep 17 00:00:00 2001 From: Blargian Date: Wed, 31 Jan 2024 21:03:06 +0100 Subject: [PATCH 002/392] missed place for documentation change --- docs/en/operations/settings/settings-formats.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/docs/en/operations/settings/settings-formats.md b/docs/en/operations/settings/settings-formats.md index eb09af44efd..e5c555af018 100644 --- a/docs/en/operations/settings/settings-formats.md +++ b/docs/en/operations/settings/settings-formats.md @@ -823,7 +823,13 @@ Default value: `0`. ### output_format_tsv_crlf_end_of_line {#output_format_tsv_crlf_end_of_line} -Use DOC/Windows-style line separator (CRLF) in TSV instead of Unix style (LF). +Use DOS/Windows-style line separator (CRLF) in TSV instead of Unix style (LF). + +Disabled by default. + +### input_format_tsv_crlf_end_of_line {#input_format_tsv_crlf_end_of_line} + +Use DOS/Windows-style line separator (CRLF) for TSV input files instead of Unix style (LF). Disabled by default. 
From 31416bc4885a5d6302e8e59235921cc018b121b4 Mon Sep 17 00:00:00 2001 From: Blargian Date: Wed, 31 Jan 2024 21:03:47 +0100 Subject: [PATCH 003/392] 2 tests fail - not sure if related to changes, try again From ab384f86527641a6a9c28179fe995e957072e157 Mon Sep 17 00:00:00 2001 From: Blargian Date: Sun, 4 Feb 2024 15:29:57 +0100 Subject: [PATCH 004/392] add support_crlf for TSV format --- .../SerializationFixedString.cpp | 8 +++- .../Serializations/SerializationNullable.cpp | 5 +- .../Serializations/SerializationString.cpp | 8 +++- src/Formats/EscapingRuleUtils.cpp | 4 +- src/IO/ReadHelpers.cpp | 46 +++++++++++++++---- src/IO/ReadHelpers.h | 6 ++- .../Formats/Impl/TSKVRowInputFormat.cpp | 2 +- .../Impl/TabSeparatedRowInputFormat.cpp | 11 +++-- 8 files changed, 68 insertions(+), 22 deletions(-) diff --git a/src/DataTypes/Serializations/SerializationFixedString.cpp b/src/DataTypes/Serializations/SerializationFixedString.cpp index fa50af52f2f..cf731409fd0 100644 --- a/src/DataTypes/Serializations/SerializationFixedString.cpp +++ b/src/DataTypes/Serializations/SerializationFixedString.cpp @@ -151,9 +151,13 @@ static inline void read(const SerializationFixedString & self, IColumn & column, } -void SerializationFixedString::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +void SerializationFixedString::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { - read(*this, column, [&istr](ColumnFixedString::Chars & data) { readEscapedStringInto(data, istr); }); + read(*this, column, [&istr, &settings](ColumnFixedString::Chars & data) + { + settings.tsv.crlf_end_of_line_input ? readEscapedStringInto(data, istr) + : readEscapedStringInto(data, istr); + }); } diff --git a/src/DataTypes/Serializations/SerializationNullable.cpp b/src/DataTypes/Serializations/SerializationNullable.cpp index 4b0ad0b54ba..c0fbdfbb022 100644 --- a/src/DataTypes/Serializations/SerializationNullable.cpp +++ b/src/DataTypes/Serializations/SerializationNullable.cpp @@ -290,6 +290,7 @@ ReturnType SerializationNullable::deserializeTextEscapedAndRawImpl(IColumn & col const SerializationPtr & nested_serialization) { const String & null_representation = settings.tsv.null_representation; + const bool supports_crlf = settings.tsv.crlf_end_of_line_input; /// Some data types can deserialize absence of data (e.g. empty string), so eof is ok. if (istr.eof() || (!null_representation.empty() && *istr.position() != null_representation[0])) @@ -309,10 +310,10 @@ ReturnType SerializationNullable::deserializeTextEscapedAndRawImpl(IColumn & col /// Check if we have enough data in buffer to check if it's a null. 
if (istr.available() > null_representation.size()) { - auto check_for_null = [&istr, &null_representation]() + auto check_for_null = [&istr, &null_representation, &supports_crlf]() { auto * pos = istr.position(); - if (checkString(null_representation, istr) && (*istr.position() == '\t' || *istr.position() == '\n')) + if (checkString(null_representation, istr) && (*istr.position() == '\t' || *istr.position() == '\n' || (supports_crlf && *istr.position() == '\r'))) return true; istr.position() = pos; return false; diff --git a/src/DataTypes/Serializations/SerializationString.cpp b/src/DataTypes/Serializations/SerializationString.cpp index b2b083fd466..4ff0ba9a400 100644 --- a/src/DataTypes/Serializations/SerializationString.cpp +++ b/src/DataTypes/Serializations/SerializationString.cpp @@ -301,9 +301,13 @@ void SerializationString::deserializeWholeText(IColumn & column, ReadBuffer & is } -void SerializationString::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +void SerializationString::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { - read(column, [&](ColumnString::Chars & data) { readEscapedStringInto(data, istr); }); + read(column, [&](ColumnString::Chars & data) + { + settings.tsv.crlf_end_of_line_input ? readEscapedStringInto,true>(data, istr) + : readEscapedStringInto,false>(data, istr); + }); } diff --git a/src/Formats/EscapingRuleUtils.cpp b/src/Formats/EscapingRuleUtils.cpp index a7e9fb8e99f..481696edc49 100644 --- a/src/Formats/EscapingRuleUtils.cpp +++ b/src/Formats/EscapingRuleUtils.cpp @@ -76,7 +76,7 @@ void skipFieldByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule esca /// Empty field, just skip spaces break; case FormatSettings::EscapingRule::Escaped: - readEscapedStringInto(out, buf); + readEscapedStringInto(out, buf); break; case FormatSettings::EscapingRule::Quoted: readQuotedFieldInto(out, buf); @@ -236,7 +236,7 @@ String readByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escapin if constexpr (read_string) readEscapedString(result, buf); else - readTSVField(result, buf); + readTSVField(result, buf); break; default: throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot read value with {} escaping rule", escapingRuleToString(escaping_rule)); diff --git a/src/IO/ReadHelpers.cpp b/src/IO/ReadHelpers.cpp index 05d35a57b12..90168325d99 100644 --- a/src/IO/ReadHelpers.cpp +++ b/src/IO/ReadHelpers.cpp @@ -496,13 +496,19 @@ static ReturnType parseJSONEscapeSequence(Vector & s, ReadBuffer & buf) } -template +template void readEscapedStringIntoImpl(Vector & s, ReadBuffer & buf) { while (!buf.eof()) { - char * next_pos = find_first_symbols<'\t', '\n', '\\'>(buf.position(), buf.buffer().end()); - + char * next_pos; + if constexpr (support_crlf) + { + next_pos = find_first_symbols<'\t', '\n', '\\','\r'>(buf.position(), buf.buffer().end()); + } else { + next_pos = find_first_symbols<'\t', '\n', '\\'>(buf.position(), buf.buffer().end()); + } + appendToStringOrVector(s, buf, next_pos); buf.position() = next_pos; @@ -529,25 +535,41 @@ void readEscapedStringIntoImpl(Vector & s, ReadBuffer & buf) } } } + + if (*buf.position() == '\r') + { + ++buf.position(); // advance to \n after \r + } } } -template +template void readEscapedStringInto(Vector & s, ReadBuffer & buf) { - readEscapedStringIntoImpl(s, buf); + readEscapedStringIntoImpl(s, buf); } void readEscapedString(String & s, ReadBuffer & buf) { s.clear(); - readEscapedStringInto(s, buf); + readEscapedStringInto(s, buf); 
} -template void readEscapedStringInto>(PaddedPODArray & s, ReadBuffer & buf); -template void readEscapedStringInto(NullOutput & s, ReadBuffer & buf); +template +void readEscapedStringCRLF(String & s, ReadBuffer & buf) +{ + s.clear(); + readEscapedStringInto(s, buf); +} +template void readEscapedStringInto,false>(PaddedPODArray & s, ReadBuffer & buf); +template void readEscapedStringInto(NullOutput & s, ReadBuffer & buf); +template void readEscapedStringInto,true>(PaddedPODArray & s, ReadBuffer & buf); +template void readEscapedStringInto(NullOutput & s, ReadBuffer & buf); + +template void readEscapedStringCRLF(String & s, ReadBuffer & buf); +template void readEscapedStringCRLF(String & s, ReadBuffer & buf); /** If enable_sql_style_quoting == true, * strings like 'abc''def' will be parsed as abc'def. @@ -1761,10 +1783,16 @@ void readJSONField(String & s, ReadBuffer & buf) readParsedValueInto(s, buf, parse_func); } +template void readTSVField(String & s, ReadBuffer & buf) { s.clear(); - readEscapedStringIntoImpl(s, buf); + readEscapedStringIntoImpl(s, buf); } +template void readTSVField(String & s, ReadBuffer & buf); +template void readTSVField(String & s, ReadBuffer & buf); + } + + diff --git a/src/IO/ReadHelpers.h b/src/IO/ReadHelpers.h index 85584d63ee8..5ee56201035 100644 --- a/src/IO/ReadHelpers.h +++ b/src/IO/ReadHelpers.h @@ -548,6 +548,9 @@ void readString(String & s, ReadBuffer & buf); void readEscapedString(String & s, ReadBuffer & buf); +template +void readEscapedStringCRLF(String & s, ReadBuffer & buf); + void readQuotedString(String & s, ReadBuffer & buf); void readQuotedStringWithSQLStyle(String & s, ReadBuffer & buf); @@ -601,7 +604,7 @@ void readStringInto(Vector & s, ReadBuffer & buf); template void readNullTerminated(Vector & s, ReadBuffer & buf); -template +template void readEscapedStringInto(Vector & s, ReadBuffer & buf); template @@ -1757,6 +1760,7 @@ void readQuotedField(String & s, ReadBuffer & buf); void readJSONField(String & s, ReadBuffer & buf); +template void readTSVField(String & s, ReadBuffer & buf); /** Parse the escape sequence, which can be simple (one character after backslash) or more complex (multiple characters). diff --git a/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp b/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp index 432e944a246..d59b5cdd2d0 100644 --- a/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp @@ -134,7 +134,7 @@ bool TSKVRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & ex /// If the key is not found, skip the value. NullOutput sink; - readEscapedStringInto(sink, *in); + readEscapedStringInto(sink, *in); } else { diff --git a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp index 6f6dae334e5..afd91e913d2 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp @@ -10,6 +10,7 @@ #include #include #include +#include "Formats/FormatSettings.h" namespace DB { @@ -105,14 +106,17 @@ template String TabSeparatedFormatReader::readFieldIntoString() { String field; + bool support_crlf = format_settings.tsv.crlf_end_of_line_input; if (is_raw) readString(field, *buf); else { if constexpr (read_string) - readEscapedString(field, *buf); + support_crlf ? readEscapedStringCRLF(field, *buf) + : readEscapedStringCRLF(field, *buf); else - readTSVField(field, *buf); + support_crlf ? 
readTSVField(field, *buf) + : readTSVField(field, *buf); } return field; } @@ -123,7 +127,8 @@ void TabSeparatedFormatReader::skipField() if (is_raw) readStringInto(out, *buf); else - readEscapedStringInto(out, *buf); + format_settings.tsv.crlf_end_of_line_input ? readEscapedStringInto(out, *buf) + : readEscapedStringInto(out, *buf); } void TabSeparatedFormatReader::skipHeaderRow() From a12d8d749dc660da64c34188cff4dbc2d33946a8 Mon Sep 17 00:00:00 2001 From: Blargian Date: Tue, 6 Feb 2024 17:17:24 +0100 Subject: [PATCH 005/392] modify skipRowEndDelimiter for \r --- .../Serializations/SerializationNullable.cpp | 5 ++--- .../Formats/Impl/TabSeparatedRowInputFormat.cpp | 11 ++++++++--- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/src/DataTypes/Serializations/SerializationNullable.cpp b/src/DataTypes/Serializations/SerializationNullable.cpp index c0fbdfbb022..4b0ad0b54ba 100644 --- a/src/DataTypes/Serializations/SerializationNullable.cpp +++ b/src/DataTypes/Serializations/SerializationNullable.cpp @@ -290,7 +290,6 @@ ReturnType SerializationNullable::deserializeTextEscapedAndRawImpl(IColumn & col const SerializationPtr & nested_serialization) { const String & null_representation = settings.tsv.null_representation; - const bool supports_crlf = settings.tsv.crlf_end_of_line_input; /// Some data types can deserialize absence of data (e.g. empty string), so eof is ok. if (istr.eof() || (!null_representation.empty() && *istr.position() != null_representation[0])) @@ -310,10 +309,10 @@ ReturnType SerializationNullable::deserializeTextEscapedAndRawImpl(IColumn & col /// Check if we have enough data in buffer to check if it's a null. if (istr.available() > null_representation.size()) { - auto check_for_null = [&istr, &null_representation, &supports_crlf]() + auto check_for_null = [&istr, &null_representation]() { auto * pos = istr.position(); - if (checkString(null_representation, istr) && (*istr.position() == '\t' || *istr.position() == '\n' || (supports_crlf && *istr.position() == '\r'))) + if (checkString(null_representation, istr) && (*istr.position() == '\t' || *istr.position() == '\n')) return true; istr.position() = pos; return false; diff --git a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp index afd91e913d2..5a94a505984 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp @@ -23,9 +23,13 @@ namespace ErrorCodes /** Check for a common error case - usage of Windows line feed. */ +template static void checkForCarriageReturn(ReadBuffer & in) { - if (!in.eof() && (in.position()[0] == '\r' || (in.position() != in.buffer().begin() && in.position()[-1] == '\r'))) + bool crlf_escaped = false; + if constexpr (supports_crlf) + crlf_escaped = true; + if (!in.eof() && (in.position()[0] == '\r' || (crlf_escaped ? false : (in.position() != in.buffer().begin() && in.position()[-1] == '\r')))) throw Exception(ErrorCodes::INCORRECT_DATA, "\nYou have carriage return (\\r, 0x0D, ASCII 13) at end of first row." "\nIt's like your input data has DOS/Windows style line separators, that are illegal in TabSeparated format." " You must transform your file to Unix format." 
@@ -90,12 +94,13 @@ void TabSeparatedFormatReader::skipFieldDelimiter() void TabSeparatedFormatReader::skipRowEndDelimiter() { + bool supports_crfl = format_settings.tsv.crlf_end_of_line_input; if (buf->eof()) return; if (unlikely(first_row)) - { - checkForCarriageReturn(*buf); + { + supports_crfl ? checkForCarriageReturn(*buf) : checkForCarriageReturn(*buf); first_row = false; } From a2dfc4856712ad8003eef902d33bafb3f47cc6aa Mon Sep 17 00:00:00 2001 From: Blargian Date: Thu, 8 Feb 2024 07:41:50 +0100 Subject: [PATCH 006/392] change typo crfl to crlf in skipRowEndDelimiter function --- src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp index 3f18aad3bd1..dbd939effe1 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp @@ -95,16 +95,16 @@ void TabSeparatedFormatReader::skipFieldDelimiter() void TabSeparatedFormatReader::skipRowEndDelimiter() { - bool supports_crfl = format_settings.tsv.crlf_end_of_line_input; + bool supports_crlf = format_settings.tsv.crlf_end_of_line_input; if (buf->eof()) return; - if (supports_crfl && first_row==false) + if (supports_crlf && first_row==false) { ++buf->position(); } if (unlikely(first_row)) { - supports_crfl ? checkForCarriageReturn(*buf) : checkForCarriageReturn(*buf); + supports_crlf ? checkForCarriageReturn(*buf) : checkForCarriageReturn(*buf); first_row = false; } assertChar('\n', *buf); From 04abd62288a55a0d6b3a315e08a6410a39e70199 Mon Sep 17 00:00:00 2001 From: Blargian Date: Thu, 8 Feb 2024 07:43:41 +0100 Subject: [PATCH 007/392] rename reference file to fix typo of crfl to crlf --- ..._files.reference => 02973_parse_crlf_with_tsv_files.reference} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/queries/0_stateless/{02973_parse_crfl_with_tsv_files.reference => 02973_parse_crlf_with_tsv_files.reference} (100%) diff --git a/tests/queries/0_stateless/02973_parse_crfl_with_tsv_files.reference b/tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.reference similarity index 100% rename from tests/queries/0_stateless/02973_parse_crfl_with_tsv_files.reference rename to tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.reference From d53632d61ea85040572c7f4f449e48b54737090d Mon Sep 17 00:00:00 2001 From: Blargian Date: Thu, 8 Feb 2024 07:50:13 +0100 Subject: [PATCH 008/392] update SettingsChangesHistory --- src/Core/SettingsChangesHistory.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index db3a76e29cd..8b918c1c064 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -90,7 +90,8 @@ static std::map sett {"async_insert_busy_timeout_min_ms", 50, 50, "The minimum value of the asynchronous insert timeout in milliseconds; it also serves as the initial value, which may be increased later by the adaptive algorithm"}, {"async_insert_busy_timeout_max_ms", 200, 200, "The minimum value of the asynchronous insert timeout in milliseconds; async_insert_busy_timeout_ms is aliased to async_insert_busy_timeout_max_ms"}, {"async_insert_busy_timeout_increase_rate", 0.2, 0.2, "The exponential growth rate at which the adaptive asynchronous insert timeout increases"}, - {"async_insert_busy_timeout_decrease_rate", 0.2, 0.2, "The exponential 
growth rate at which the adaptive asynchronous insert timeout decreases"}}}, + {"async_insert_busy_timeout_decrease_rate", 0.2, 0.2, "The exponential growth rate at which the adaptive asynchronous insert timeout decreases"}, + {"input_format_tsv_crlf_end_of_line", false, false, "Enables reading of CRLF line endings with TSV formats"}}}, {"24.1", {{"print_pretty_type_names", false, true, "Better user experience."}, {"input_format_json_read_bools_as_strings", false, true, "Allow to read bools as strings in JSON formats by default"}, {"output_format_arrow_use_signed_indexes_for_dictionary", false, true, "Use signed indexes type for Arrow dictionaries by default as it's recommended"}, From debc804b777ec8c0355b29d9f325defd461e5e63 Mon Sep 17 00:00:00 2001 From: Blargian Date: Thu, 8 Feb 2024 08:10:53 +0100 Subject: [PATCH 009/392] documentation changes --- docs/en/interfaces/formats.md | 1 + docs/ru/interfaces/formats.md | 1 + 2 files changed, 2 insertions(+) diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index a11c3e5ef19..0a5a9c6a076 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -195,6 +195,7 @@ SELECT * FROM nestedt FORMAT TSV - [input_format_tsv_enum_as_number](/docs/en/operations/settings/settings-formats.md/#input_format_tsv_enum_as_number) - treat inserted enum values in TSV formats as enum indices. Default value - `false`. - [input_format_tsv_use_best_effort_in_schema_inference](/docs/en/operations/settings/settings-formats.md/#input_format_tsv_use_best_effort_in_schema_inference) - use some tweaks and heuristics to infer schema in TSV format. If disabled, all fields will be inferred as Strings. Default value - `true`. - [output_format_tsv_crlf_end_of_line](/docs/en/operations/settings/settings-formats.md/#output_format_tsv_crlf_end_of_line) - if it is set true, end of line in TSV output format will be `\r\n` instead of `\n`. Default value - `false`. +- [input_format_tsv_crlf_end_of_line](/docs/en/operations/settings/settings-formats.md/#input_format_tsv_crlf_end_of_line) - if it is set true, end of line in TSV input format will be `\r\n` instead of `\n`. Default value - `false`. - [input_format_tsv_skip_first_lines](/docs/en/operations/settings/settings-formats.md/#input_format_tsv_skip_first_lines) - skip specified number of lines at the beginning of data. Default value - `0`. - [input_format_tsv_detect_header](/docs/en/operations/settings/settings-formats.md/#input_format_tsv_detect_header) - automatically detect header with names and types in TSV format. Default value - `true`. - [input_format_tsv_skip_trailing_empty_lines](/docs/en/operations/settings/settings-formats.md/#input_format_tsv_skip_trailing_empty_lines) - skip trailing empty lines at the end of data. Default value - `false`. diff --git a/docs/ru/interfaces/formats.md b/docs/ru/interfaces/formats.md index b4794b02743..c4892c74515 100644 --- a/docs/ru/interfaces/formats.md +++ b/docs/ru/interfaces/formats.md @@ -119,6 +119,7 @@ Hello\nworld Hello\ world ``` +`\n\r` (CRLF) поддерживается с помощью настройки `input_format_tsv_crlf_end_of_line`. Второй вариант поддерживается, так как его использует MySQL при записи tab-separated дампа. 
From 3cca8410385c216ced1c9366a8e8cda8503f3407 Mon Sep 17 00:00:00 2001 From: kssenii Date: Fri, 9 Feb 2024 18:55:21 +0100 Subject: [PATCH 010/392] Unite s3/hdfs/azure storage implementations into a single class on top of IObjectStorage --- src/Backups/BackupIO_AzureBlobStorage.cpp | 26 +- src/Backups/BackupIO_AzureBlobStorage.h | 46 +- .../registerBackupEngineAzureBlobStorage.cpp | 18 +- src/CMakeLists.txt | 1 + .../AzureBlobStorage/AzureObjectStorage.cpp | 8 +- ...jectStorageRemoteMetadataRestoreHelper.cpp | 14 +- src/Disks/ObjectStorages/IObjectStorage.h | 5 +- src/Disks/ObjectStorages/IObjectStorage_fwd.h | 3 + .../MetadataStorageFromPlainObjectStorage.cpp | 2 +- .../ObjectStorages/ObjectStorageIterator.cpp | 2 +- .../ObjectStorages/ObjectStorageIterator.h | 22 +- .../ObjectStorageIteratorAsync.cpp | 4 +- .../ObjectStorageIteratorAsync.h | 4 +- .../ObjectStorages/S3/S3ObjectStorage.cpp | 26 +- .../copyAzureBlobStorageFile.h | 3 +- src/Interpreters/InterpreterSystemQuery.cpp | 6 +- src/Server/TCPHandler.cpp | 2 +- .../DataLakes/DeltaLakeMetadataParser.cpp | 87 +- .../DataLakes/DeltaLakeMetadataParser.h | 10 +- src/Storages/DataLakes/HudiMetadataParser.cpp | 181 +- src/Storages/DataLakes/HudiMetadataParser.h | 15 +- src/Storages/DataLakes/IStorageDataLake.h | 144 +- .../DataLakes/Iceberg/IcebergMetadata.cpp | 65 +- .../DataLakes/Iceberg/IcebergMetadata.h | 27 +- .../DataLakes/Iceberg/StorageIceberg.cpp | 79 - .../DataLakes/Iceberg/StorageIceberg.h | 117 +- src/Storages/DataLakes/S3MetadataReader.cpp | 86 - src/Storages/DataLakes/S3MetadataReader.h | 25 - src/Storages/DataLakes/StorageDeltaLake.h | 7 +- src/Storages/DataLakes/StorageHudi.h | 7 +- src/Storages/DataLakes/registerDataLakes.cpp | 38 +- src/Storages/HDFS/StorageHDFS.cpp | 1117 ---------- src/Storages/HDFS/StorageHDFS.h | 179 -- src/Storages/HDFS/StorageHDFSCluster.cpp | 98 - src/Storages/HDFS/StorageHDFSCluster.h | 56 - src/Storages/IStorage.h | 9 +- .../ObjectStorage/AzureConfiguration.cpp | 451 ++++ .../ObjectStorage/AzureConfiguration.h | 54 + src/Storages/ObjectStorage/Configuration.h | 55 + .../ObjectStorage/HDFSConfiguration.h | 81 + .../ObjectStorage/ReadBufferIterator.h | 197 ++ .../ObjectStorage/ReadFromObjectStorage.h | 105 + .../ObjectStorage/S3Configuration.cpp | 491 +++++ src/Storages/ObjectStorage/S3Configuration.h | 46 + src/Storages/ObjectStorage/Settings.h | 86 + .../ObjectStorage/StorageObjectStorage.cpp | 303 +++ .../ObjectStorage/StorageObjectStorage.h | 116 + .../StorageObjectStorageCluster.cpp | 107 + .../StorageObjectStorageCluster.h | 72 + .../ObjectStorage/StorageObjectStorageSink.h | 155 ++ .../StorageObjectStorageSource.cpp | 464 ++++ .../StorageObjectStorageSource.h | 217 ++ .../registerStorageObjectStorage.cpp | 166 ++ src/Storages/ObjectStorageConfiguration.h | 0 src/Storages/S3Queue/S3QueueSource.cpp | 85 +- src/Storages/S3Queue/S3QueueSource.h | 42 +- src/Storages/S3Queue/S3QueueTableMetadata.cpp | 3 +- src/Storages/S3Queue/S3QueueTableMetadata.h | 7 +- src/Storages/S3Queue/StorageS3Queue.cpp | 101 +- src/Storages/S3Queue/StorageS3Queue.h | 14 +- src/Storages/StorageAzureBlob.cpp | 1478 ------------- src/Storages/StorageAzureBlob.h | 339 --- src/Storages/StorageAzureBlobCluster.cpp | 89 - src/Storages/StorageAzureBlobCluster.h | 56 - src/Storages/StorageS3.cpp | 1905 ----------------- src/Storages/StorageS3.h | 399 ---- src/Storages/StorageS3Cluster.cpp | 103 - src/Storages/StorageS3Cluster.h | 58 - .../StorageSystemSchemaInferenceCache.cpp | 6 +- src/Storages/registerStorages.cpp | 17 +- 
src/TableFunctions/ITableFunctionCluster.h | 6 +- src/TableFunctions/ITableFunctionDataLake.h | 22 +- .../TableFunctionAzureBlobStorage.cpp | 323 --- .../TableFunctionAzureBlobStorage.h | 80 - .../TableFunctionAzureBlobStorageCluster.cpp | 85 - .../TableFunctionAzureBlobStorageCluster.h | 55 - src/TableFunctions/TableFunctionDeltaLake.cpp | 24 +- src/TableFunctions/TableFunctionHDFS.cpp | 54 - src/TableFunctions/TableFunctionHDFS.h | 50 - .../TableFunctionHDFSCluster.cpp | 61 - src/TableFunctions/TableFunctionHDFSCluster.h | 54 - src/TableFunctions/TableFunctionHudi.cpp | 24 +- src/TableFunctions/TableFunctionIceberg.cpp | 7 +- .../TableFunctionObjectStorage.cpp | 224 ++ .../TableFunctionObjectStorage.h | 150 ++ .../TableFunctionObjectStorageCluster.cpp | 113 + .../TableFunctionObjectStorageCluster.h | 91 + src/TableFunctions/TableFunctionS3.cpp | 464 ---- src/TableFunctions/TableFunctionS3.h | 86 - src/TableFunctions/TableFunctionS3Cluster.cpp | 74 - src/TableFunctions/TableFunctionS3Cluster.h | 64 - src/TableFunctions/registerTableFunctions.cpp | 23 +- src/TableFunctions/registerTableFunctions.h | 9 +- .../test_storage_azure_blob_storage/test.py | 8 +- 94 files changed, 4403 insertions(+), 8155 deletions(-) delete mode 100644 src/Storages/DataLakes/S3MetadataReader.cpp delete mode 100644 src/Storages/DataLakes/S3MetadataReader.h delete mode 100644 src/Storages/HDFS/StorageHDFS.cpp delete mode 100644 src/Storages/HDFS/StorageHDFS.h delete mode 100644 src/Storages/HDFS/StorageHDFSCluster.cpp delete mode 100644 src/Storages/HDFS/StorageHDFSCluster.h create mode 100644 src/Storages/ObjectStorage/AzureConfiguration.cpp create mode 100644 src/Storages/ObjectStorage/AzureConfiguration.h create mode 100644 src/Storages/ObjectStorage/Configuration.h create mode 100644 src/Storages/ObjectStorage/HDFSConfiguration.h create mode 100644 src/Storages/ObjectStorage/ReadBufferIterator.h create mode 100644 src/Storages/ObjectStorage/ReadFromObjectStorage.h create mode 100644 src/Storages/ObjectStorage/S3Configuration.cpp create mode 100644 src/Storages/ObjectStorage/S3Configuration.h create mode 100644 src/Storages/ObjectStorage/Settings.h create mode 100644 src/Storages/ObjectStorage/StorageObjectStorage.cpp create mode 100644 src/Storages/ObjectStorage/StorageObjectStorage.h create mode 100644 src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp create mode 100644 src/Storages/ObjectStorage/StorageObjectStorageCluster.h create mode 100644 src/Storages/ObjectStorage/StorageObjectStorageSink.h create mode 100644 src/Storages/ObjectStorage/StorageObjectStorageSource.cpp create mode 100644 src/Storages/ObjectStorage/StorageObjectStorageSource.h create mode 100644 src/Storages/ObjectStorage/registerStorageObjectStorage.cpp create mode 100644 src/Storages/ObjectStorageConfiguration.h delete mode 100644 src/Storages/StorageAzureBlob.cpp delete mode 100644 src/Storages/StorageAzureBlob.h delete mode 100644 src/Storages/StorageAzureBlobCluster.cpp delete mode 100644 src/Storages/StorageAzureBlobCluster.h delete mode 100644 src/Storages/StorageS3.cpp delete mode 100644 src/Storages/StorageS3.h delete mode 100644 src/Storages/StorageS3Cluster.cpp delete mode 100644 src/Storages/StorageS3Cluster.h delete mode 100644 src/TableFunctions/TableFunctionAzureBlobStorage.cpp delete mode 100644 src/TableFunctions/TableFunctionAzureBlobStorage.h delete mode 100644 src/TableFunctions/TableFunctionAzureBlobStorageCluster.cpp delete mode 100644 src/TableFunctions/TableFunctionAzureBlobStorageCluster.h delete mode 100644 
src/TableFunctions/TableFunctionHDFS.cpp delete mode 100644 src/TableFunctions/TableFunctionHDFS.h delete mode 100644 src/TableFunctions/TableFunctionHDFSCluster.cpp delete mode 100644 src/TableFunctions/TableFunctionHDFSCluster.h create mode 100644 src/TableFunctions/TableFunctionObjectStorage.cpp create mode 100644 src/TableFunctions/TableFunctionObjectStorage.h create mode 100644 src/TableFunctions/TableFunctionObjectStorageCluster.cpp create mode 100644 src/TableFunctions/TableFunctionObjectStorageCluster.h delete mode 100644 src/TableFunctions/TableFunctionS3.cpp delete mode 100644 src/TableFunctions/TableFunctionS3.h delete mode 100644 src/TableFunctions/TableFunctionS3Cluster.cpp delete mode 100644 src/TableFunctions/TableFunctionS3Cluster.h diff --git a/src/Backups/BackupIO_AzureBlobStorage.cpp b/src/Backups/BackupIO_AzureBlobStorage.cpp index 52ce20d5108..dc636f90be7 100644 --- a/src/Backups/BackupIO_AzureBlobStorage.cpp +++ b/src/Backups/BackupIO_AzureBlobStorage.cpp @@ -6,7 +6,6 @@ #include #include #include -#include #include #include #include @@ -29,7 +28,7 @@ namespace ErrorCodes } BackupReaderAzureBlobStorage::BackupReaderAzureBlobStorage( - StorageAzureBlob::Configuration configuration_, + const StorageAzureBlobConfiguration & configuration_, const ReadSettings & read_settings_, const WriteSettings & write_settings_, const ContextPtr & context_) @@ -37,10 +36,10 @@ BackupReaderAzureBlobStorage::BackupReaderAzureBlobStorage( , data_source_description{DataSourceType::ObjectStorage, ObjectStorageType::Azure, MetadataStorageType::None, configuration_.container, false, false} , configuration(configuration_) { - auto client_ptr = StorageAzureBlob::createClient(configuration, /* is_read_only */ false); + auto client_ptr = configuration.createClient(/* is_read_only */ false); object_storage = std::make_unique("BackupReaderAzureBlobStorage", std::move(client_ptr), - StorageAzureBlob::createSettings(context_), + configuration.createSettings(context_), configuration_.container); client = object_storage->getAzureBlobStorageClient(); settings = object_storage->getSettings(); @@ -137,7 +136,7 @@ void BackupReaderAzureBlobStorage::copyFileToDisk(const String & path_in_backup, BackupWriterAzureBlobStorage::BackupWriterAzureBlobStorage( - StorageAzureBlob::Configuration configuration_, + const StorageAzureBlobConfiguration & configuration_, const ReadSettings & read_settings_, const WriteSettings & write_settings_, const ContextPtr & context_) @@ -145,17 +144,22 @@ BackupWriterAzureBlobStorage::BackupWriterAzureBlobStorage( , data_source_description{DataSourceType::ObjectStorage, ObjectStorageType::Azure, MetadataStorageType::None, configuration_.container, false, false} , configuration(configuration_) { - auto client_ptr = StorageAzureBlob::createClient(configuration, /* is_read_only */ false); + auto client_ptr = configuration.createClient(/* is_read_only */ false); object_storage = std::make_unique("BackupWriterAzureBlobStorage", std::move(client_ptr), - StorageAzureBlob::createSettings(context_), - configuration_.container); + configuration.createSettings(context_), + configuration.container); client = object_storage->getAzureBlobStorageClient(); settings = object_storage->getSettings(); } -void BackupWriterAzureBlobStorage::copyFileFromDisk(const String & path_in_backup, DiskPtr src_disk, const String & src_path, - bool copy_encrypted, UInt64 start_pos, UInt64 length) +void BackupWriterAzureBlobStorage::copyFileFromDisk( + const String & path_in_backup, + DiskPtr src_disk, + const 
String & src_path, + bool copy_encrypted, + UInt64 start_pos, + UInt64 length) { /// Use the native copy as a more optimal way to copy a file from AzureBlobStorage to AzureBlobStorage if it's possible. auto source_data_source_description = src_disk->getDataSourceDescription(); @@ -241,7 +245,7 @@ UInt64 BackupWriterAzureBlobStorage::getFileSize(const String & file_name) object_storage->listObjects(key,children,/*max_keys*/0); if (children.empty()) throw Exception(ErrorCodes::AZURE_BLOB_STORAGE_ERROR, "Object must exist"); - return children[0].metadata.size_bytes; + return children[0]->metadata.size_bytes; } std::unique_ptr BackupWriterAzureBlobStorage::readFile(const String & file_name, size_t /*expected_file_size*/) diff --git a/src/Backups/BackupIO_AzureBlobStorage.h b/src/Backups/BackupIO_AzureBlobStorage.h index 95325044a62..99002c53769 100644 --- a/src/Backups/BackupIO_AzureBlobStorage.h +++ b/src/Backups/BackupIO_AzureBlobStorage.h @@ -5,8 +5,8 @@ #if USE_AZURE_BLOB_STORAGE #include #include -#include #include +#include namespace DB @@ -16,20 +16,30 @@ namespace DB class BackupReaderAzureBlobStorage : public BackupReaderDefault { public: - BackupReaderAzureBlobStorage(StorageAzureBlob::Configuration configuration_, const ReadSettings & read_settings_, const WriteSettings & write_settings_, const ContextPtr & context_); + BackupReaderAzureBlobStorage( + const StorageAzureBlobConfiguration & configuration_, + const ReadSettings & read_settings_, + const WriteSettings & write_settings_, + const ContextPtr & context_); + ~BackupReaderAzureBlobStorage() override; bool fileExists(const String & file_name) override; UInt64 getFileSize(const String & file_name) override; std::unique_ptr readFile(const String & file_name) override; - void copyFileToDisk(const String & path_in_backup, size_t file_size, bool encrypted_in_backup, - DiskPtr destination_disk, const String & destination_path, WriteMode write_mode) override; + void copyFileToDisk( + const String & path_in_backup, + size_t file_size, + bool encrypted_in_backup, + DiskPtr destination_disk, + const String & destination_path, + WriteMode write_mode) override; private: const DataSourceDescription data_source_description; std::shared_ptr client; - StorageAzureBlob::Configuration configuration; + StorageAzureBlobConfiguration configuration; std::unique_ptr object_storage; std::shared_ptr settings; }; @@ -37,16 +47,31 @@ private: class BackupWriterAzureBlobStorage : public BackupWriterDefault { public: - BackupWriterAzureBlobStorage(StorageAzureBlob::Configuration configuration_, const ReadSettings & read_settings_, const WriteSettings & write_settings_, const ContextPtr & context_); + BackupWriterAzureBlobStorage( + const StorageAzureBlobConfiguration & configuration_, + const ReadSettings & read_settings_, + const WriteSettings & write_settings_, + const ContextPtr & context_); + ~BackupWriterAzureBlobStorage() override; bool fileExists(const String & file_name) override; UInt64 getFileSize(const String & file_name) override; std::unique_ptr writeFile(const String & file_name) override; - void copyDataToFile(const String & path_in_backup, const CreateReadBufferFunction & create_read_buffer, UInt64 start_pos, UInt64 length) override; - void copyFileFromDisk(const String & path_in_backup, DiskPtr src_disk, const String & src_path, - bool copy_encrypted, UInt64 start_pos, UInt64 length) override; + void copyDataToFile( + const String & path_in_backup, + const CreateReadBufferFunction & create_read_buffer, + UInt64 start_pos, + UInt64 length) 
override; + + void copyFileFromDisk( + const String & path_in_backup, + DiskPtr src_disk, + const String & src_path, + bool copy_encrypted, + UInt64 start_pos, + UInt64 length) override; void copyFile(const String & destination, const String & source, size_t size) override; @@ -56,9 +81,10 @@ public: private: std::unique_ptr readFile(const String & file_name, size_t expected_file_size) override; void removeFilesBatch(const Strings & file_names); + const DataSourceDescription data_source_description; std::shared_ptr client; - StorageAzureBlob::Configuration configuration; + StorageAzureBlobConfiguration configuration; std::unique_ptr object_storage; std::shared_ptr settings; }; diff --git a/src/Backups/registerBackupEngineAzureBlobStorage.cpp b/src/Backups/registerBackupEngineAzureBlobStorage.cpp index 48f66569304..9408c7ccdcf 100644 --- a/src/Backups/registerBackupEngineAzureBlobStorage.cpp +++ b/src/Backups/registerBackupEngineAzureBlobStorage.cpp @@ -5,11 +5,11 @@ #if USE_AZURE_BLOB_STORAGE #include -#include #include #include #include #include +#include #include #endif @@ -49,7 +49,7 @@ void registerBackupEngineAzureBlobStorage(BackupFactory & factory) const String & id_arg = params.backup_info.id_arg; const auto & args = params.backup_info.args; - StorageAzureBlob::Configuration configuration; + StorageAzureBlobConfiguration configuration; if (!id_arg.empty()) { @@ -59,6 +59,9 @@ void registerBackupEngineAzureBlobStorage(BackupFactory & factory) if (!config.has(config_prefix)) throw Exception(ErrorCodes::BAD_ARGUMENTS, "There is no collection named `{}` in config", id_arg); + if (!config.has(config_prefix)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "There is no `{}` in config", config_prefix); + if (config.has(config_prefix + ".connection_string")) { configuration.connection_url = config.getString(config_prefix + ".connection_string"); @@ -75,10 +78,11 @@ void registerBackupEngineAzureBlobStorage(BackupFactory & factory) } if (args.size() > 1) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Backup AzureBlobStorage requires 1 or 2 arguments: named_collection, [filename]"); + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Backup AzureBlobStorage requires 1 or 2 arguments: named_collection, [filename]"); if (args.size() == 1) - configuration.blob_path = args[0].safeGet(); + configuration.setPath(args[0].safeGet()); } else @@ -110,12 +114,14 @@ void registerBackupEngineAzureBlobStorage(BackupFactory & factory) } BackupImpl::ArchiveParams archive_params; - if (hasRegisteredArchiveFileExtension(configuration.blob_path)) + if (hasRegisteredArchiveFileExtension(configuration.getPath())) { if (params.is_internal_backup) throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Using archives with backups on clusters is disabled"); - archive_params.archive_name = removeFileNameFromURL(configuration.blob_path); + auto path = configuration.getPath(); + configuration.setPath(removeFileNameFromURL(path)); + archive_params.archive_name = configuration.getPath(); archive_params.compression_method = params.compression_method; archive_params.compression_level = params.compression_level; archive_params.password = params.password; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 08913ed1b5a..50130e6abd0 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -119,6 +119,7 @@ endif() add_headers_and_sources(dbms Storages/DataLakes) add_headers_and_sources(dbms Storages/DataLakes/Iceberg) +add_headers_and_sources(dbms Storages/ObjectStorage) 
add_headers_and_sources(dbms Common/NamedCollections) if (TARGET ch_contrib::amqp_cpp) diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp index 74389aedb64..2ca44137442 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp @@ -65,14 +65,14 @@ private: for (const auto & blob : blobs_list) { - batch.emplace_back( + batch.emplace_back(std::make_shared( blob.Name, ObjectMetadata{ static_cast(blob.BlobSize), Poco::Timestamp::fromEpochTime( std::chrono::duration_cast( static_cast(blob.Details.LastModified).time_since_epoch()).count()), - {}}); + {}})); } if (!blob_list_response.NextPageToken.HasValue() || blob_list_response.NextPageToken.Value().empty()) @@ -156,14 +156,14 @@ void AzureObjectStorage::listObjects(const std::string & path, RelativePathsWith for (const auto & blob : blobs_list) { - children.emplace_back( + children.emplace_back(std::make_shared( blob.Name, ObjectMetadata{ static_cast(blob.BlobSize), Poco::Timestamp::fromEpochTime( std::chrono::duration_cast( static_cast(blob.Details.LastModified).time_since_epoch()).count()), - {}}); + {}})); } if (max_keys) diff --git a/src/Disks/ObjectStorages/DiskObjectStorageRemoteMetadataRestoreHelper.cpp b/src/Disks/ObjectStorages/DiskObjectStorageRemoteMetadataRestoreHelper.cpp index 0314e0a7e92..cc9ee3db505 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorageRemoteMetadataRestoreHelper.cpp +++ b/src/Disks/ObjectStorages/DiskObjectStorageRemoteMetadataRestoreHelper.cpp @@ -363,18 +363,18 @@ void DiskObjectStorageRemoteMetadataRestoreHelper::restoreFiles(IObjectStorage * for (const auto & object : objects) { - LOG_INFO(disk->log, "Calling restore for key for disk {}", object.relative_path); + LOG_INFO(disk->log, "Calling restore for key for disk {}", object->relative_path); /// Skip file operations objects. They will be processed separately. - if (object.relative_path.find("/operations/") != String::npos) + if (object->relative_path.find("/operations/") != String::npos) continue; - const auto [revision, _] = extractRevisionAndOperationFromKey(object.relative_path); + const auto [revision, _] = extractRevisionAndOperationFromKey(object->relative_path); /// Filter early if it's possible to get revision from key. 
if (revision > restore_information.revision) continue; - keys_names.push_back(object.relative_path); + keys_names.push_back(object->relative_path); } if (!keys_names.empty()) @@ -474,10 +474,10 @@ void DiskObjectStorageRemoteMetadataRestoreHelper::restoreFileOperations(IObject for (const auto & object : objects) { - const auto [revision, operation] = extractRevisionAndOperationFromKey(object.relative_path); + const auto [revision, operation] = extractRevisionAndOperationFromKey(object->relative_path); if (revision == UNKNOWN_REVISION) { - LOG_WARNING(disk->log, "Skip key {} with unknown revision", object.relative_path); + LOG_WARNING(disk->log, "Skip key {} with unknown revision", object->relative_path); continue; } @@ -490,7 +490,7 @@ void DiskObjectStorageRemoteMetadataRestoreHelper::restoreFileOperations(IObject if (send_metadata) revision_counter = revision - 1; - auto object_attributes = *(source_object_storage->getObjectMetadata(object.relative_path).attributes); + auto object_attributes = *(source_object_storage->getObjectMetadata(object->relative_path).attributes); if (operation == rename) { auto from_path = object_attributes["from_path"]; diff --git a/src/Disks/ObjectStorages/IObjectStorage.h b/src/Disks/ObjectStorages/IObjectStorage.h index 049935ad60c..7d354e6383d 100644 --- a/src/Disks/ObjectStorages/IObjectStorage.h +++ b/src/Disks/ObjectStorages/IObjectStorage.h @@ -62,6 +62,8 @@ struct RelativePathWithMetadata : relative_path(std::move(relative_path_)) , metadata(std::move(metadata_)) {} + + virtual ~RelativePathWithMetadata() = default; }; struct ObjectKeyWithMetadata @@ -77,7 +79,8 @@ struct ObjectKeyWithMetadata {} }; -using RelativePathsWithMetadata = std::vector; +using RelativePathWithMetadataPtr = std::shared_ptr; +using RelativePathsWithMetadata = std::vector; using ObjectKeysWithMetadata = std::vector; class IObjectStorageIterator; diff --git a/src/Disks/ObjectStorages/IObjectStorage_fwd.h b/src/Disks/ObjectStorages/IObjectStorage_fwd.h index f6ebc883682..67efa4aae2b 100644 --- a/src/Disks/ObjectStorages/IObjectStorage_fwd.h +++ b/src/Disks/ObjectStorages/IObjectStorage_fwd.h @@ -10,4 +10,7 @@ using ObjectStoragePtr = std::shared_ptr; class IMetadataStorage; using MetadataStoragePtr = std::shared_ptr; +class IObjectStorageIterator; +using ObjectStorageIteratorPtr = std::shared_ptr; + } diff --git a/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.cpp b/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.cpp index b03809f5b39..f07cf23106f 100644 --- a/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.cpp +++ b/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.cpp @@ -77,7 +77,7 @@ std::vector MetadataStorageFromPlainObjectStorage::listDirectory(co std::vector result; for (const auto & path_size : files) { - result.push_back(path_size.relative_path); + result.push_back(path_size->relative_path); } std::unordered_set duplicates_filter; diff --git a/src/Disks/ObjectStorages/ObjectStorageIterator.cpp b/src/Disks/ObjectStorages/ObjectStorageIterator.cpp index 72ec6e0e500..3d939ce9230 100644 --- a/src/Disks/ObjectStorages/ObjectStorageIterator.cpp +++ b/src/Disks/ObjectStorages/ObjectStorageIterator.cpp @@ -9,7 +9,7 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -RelativePathWithMetadata ObjectStorageIteratorFromList::current() +RelativePathWithMetadataPtr ObjectStorageIteratorFromList::current() { if (!isValid()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Trying to access invalid iterator"); diff --git 
a/src/Disks/ObjectStorages/ObjectStorageIterator.h b/src/Disks/ObjectStorages/ObjectStorageIterator.h index 841b0ea6664..e934fc2056d 100644 --- a/src/Disks/ObjectStorages/ObjectStorageIterator.h +++ b/src/Disks/ObjectStorages/ObjectStorageIterator.h @@ -12,9 +12,9 @@ public: virtual void next() = 0; virtual void nextBatch() = 0; virtual bool isValid() = 0; - virtual RelativePathWithMetadata current() = 0; + virtual RelativePathWithMetadataPtr current() = 0; virtual RelativePathsWithMetadata currentBatch() = 0; - virtual std::optional getCurrrentBatchAndScheduleNext() = 0; + virtual std::optional getCurrentBatchAndScheduleNext() = 0; virtual size_t getAccumulatedSize() const = 0; virtual ~IObjectStorageIterator() = default; @@ -47,22 +47,14 @@ public: return batch_iterator != batch.end(); } - RelativePathWithMetadata current() override; + RelativePathWithMetadataPtr current() override; - RelativePathsWithMetadata currentBatch() override - { - return batch; - } + RelativePathsWithMetadata currentBatch() override { return batch; } - virtual std::optional getCurrrentBatchAndScheduleNext() override - { - return std::nullopt; - } + std::optional getCurrentBatchAndScheduleNext() override { return std::nullopt; } + + size_t getAccumulatedSize() const override { return batch.size(); } - size_t getAccumulatedSize() const override - { - return batch.size(); - } private: RelativePathsWithMetadata batch; RelativePathsWithMetadata::iterator batch_iterator; diff --git a/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.cpp b/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.cpp index 990e66fc4e5..b7729623a64 100644 --- a/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.cpp +++ b/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.cpp @@ -82,7 +82,7 @@ bool IObjectStorageIteratorAsync::isValid() return current_batch_iterator != current_batch.end(); } -RelativePathWithMetadata IObjectStorageIteratorAsync::current() +RelativePathWithMetadataPtr IObjectStorageIteratorAsync::current() { if (!isValid()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Trying to access invalid iterator"); @@ -101,7 +101,7 @@ RelativePathsWithMetadata IObjectStorageIteratorAsync::currentBatch() return current_batch; } -std::optional IObjectStorageIteratorAsync::getCurrrentBatchAndScheduleNext() +std::optional IObjectStorageIteratorAsync::getCurrentBatchAndScheduleNext() { std::lock_guard lock(mutex); if (!is_initialized) diff --git a/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.h b/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.h index a6abe03bac9..8d155f7ec8d 100644 --- a/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.h +++ b/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.h @@ -26,10 +26,10 @@ public: void next() override; void nextBatch() override; bool isValid() override; - RelativePathWithMetadata current() override; + RelativePathWithMetadataPtr current() override; RelativePathsWithMetadata currentBatch() override; size_t getAccumulatedSize() const override; - std::optional getCurrrentBatchAndScheduleNext() override; + std::optional getCurrentBatchAndScheduleNext() override; ~IObjectStorageIteratorAsync() override { diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index 4cc49288af6..cc138c43c71 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -120,25 +120,22 @@ private: { ProfileEvents::increment(ProfileEvents::S3ListObjects); - bool result = false; auto outcome = 
client->ListObjectsV2(request); + /// Outcome failure will be handled on the caller side. if (outcome.IsSuccess()) { + request.SetContinuationToken(outcome.GetResult().GetNextContinuationToken()); + auto objects = outcome.GetResult().GetContents(); - - result = !objects.empty(); - for (const auto & object : objects) - batch.emplace_back( - object.GetKey(), - ObjectMetadata{static_cast(object.GetSize()), Poco::Timestamp::fromEpochTime(object.GetLastModified().Seconds()), {}} - ); + { + ObjectMetadata metadata{static_cast(object.GetSize()), Poco::Timestamp::fromEpochTime(object.GetLastModified().Seconds()), {}}; + batch.emplace_back(std::make_shared(object.GetKey(), std::move(metadata))); + } - if (result) - request.SetContinuationToken(outcome.GetResult().GetNextContinuationToken()); - - return result; + /// It returns false when all objects were returned + return outcome.GetResult().GetIsTruncated(); } throw S3Exception(outcome.GetError().GetErrorType(), "Could not list objects in bucket {} with prefix {}, S3 exception: {}, message: {}", @@ -249,7 +246,6 @@ std::unique_ptr S3ObjectStorage::writeObject( /// NOLIN if (write_settings.s3_allow_parallel_part_upload) scheduler = threadPoolCallbackRunner(getThreadPoolWriter(), "VFSWrite"); - auto blob_storage_log = BlobStorageLogWriter::create(disk_name); if (blob_storage_log) blob_storage_log->local_path = object.local_path; @@ -300,12 +296,12 @@ void S3ObjectStorage::listObjects(const std::string & path, RelativePathsWithMet break; for (const auto & object : objects) - children.emplace_back( + children.emplace_back(std::make_shared( object.GetKey(), ObjectMetadata{ static_cast(object.GetSize()), Poco::Timestamp::fromEpochTime(object.GetLastModified().Seconds()), - {}}); + {}})); if (max_keys) { diff --git a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h index 83814f42693..cc23f604278 100644 --- a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h +++ b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h @@ -4,9 +4,8 @@ #if USE_AZURE_BLOB_STORAGE -#include -#include #include +#include #include #include #include diff --git a/src/Interpreters/InterpreterSystemQuery.cpp b/src/Interpreters/InterpreterSystemQuery.cpp index 9a80553f149..d697d90c8a6 100644 --- a/src/Interpreters/InterpreterSystemQuery.cpp +++ b/src/Interpreters/InterpreterSystemQuery.cpp @@ -52,11 +52,9 @@ #include #include #include -#include #include -#include +#include #include -#include #include #include #include @@ -482,7 +480,7 @@ BlockIO InterpreterSystemQuery::execute() StorageURL::getSchemaCache(getContext()).clear(); #if USE_AZURE_BLOB_STORAGE if (caches_to_drop.contains("AZURE")) - StorageAzureBlob::getSchemaCache(getContext()).clear(); + StorageAzureBlobStorage::getSchemaCache(getContext()).clear(); #endif break; } diff --git a/src/Server/TCPHandler.cpp b/src/Server/TCPHandler.cpp index e1086ac5833..58672a72563 100644 --- a/src/Server/TCPHandler.cpp +++ b/src/Server/TCPHandler.cpp @@ -35,7 +35,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Storages/DataLakes/DeltaLakeMetadataParser.cpp b/src/Storages/DataLakes/DeltaLakeMetadataParser.cpp index 3584f137225..55ff8fefdd5 100644 --- a/src/Storages/DataLakes/DeltaLakeMetadataParser.cpp +++ b/src/Storages/DataLakes/DeltaLakeMetadataParser.cpp @@ -4,8 +4,6 @@ #include #if USE_AWS_S3 && USE_PARQUET -#include -#include #include #include #include @@ -13,10 +11,10 @@ #include #include #include +#include #include #include #include 
-#include namespace fs = std::filesystem; @@ -29,8 +27,7 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; } -template -struct DeltaLakeMetadataParser::Impl +struct DeltaLakeMetadataParser::Impl { /** * Useful links: @@ -65,10 +62,13 @@ struct DeltaLakeMetadataParser::Impl * An action changes one aspect of the table's state, for example, adding or removing a file. * Note: it is not a valid json, but a list of json's, so we read it in a while cycle. */ - std::set processMetadataFiles(const Configuration & configuration, ContextPtr context) + std::set processMetadataFiles( + ObjectStoragePtr object_storage, + const StorageObjectStorageConfiguration & configuration, + ContextPtr context) { std::set result_files; - const auto checkpoint_version = getCheckpointIfExists(result_files, configuration, context); + const auto checkpoint_version = getCheckpointIfExists(result_files, object_storage, configuration, context); if (checkpoint_version) { @@ -78,10 +78,10 @@ struct DeltaLakeMetadataParser::Impl const auto filename = withPadding(++current_version) + metadata_file_suffix; const auto file_path = fs::path(configuration.getPath()) / deltalake_metadata_directory / filename; - if (!MetadataReadHelper::exists(file_path, configuration)) + if (!object_storage->exists(StoredObject(file_path))) break; - processMetadataFile(file_path, result_files, configuration, context); + processMetadataFile(file_path, result_files, object_storage, configuration, context); } LOG_TRACE( @@ -90,16 +90,33 @@ struct DeltaLakeMetadataParser::Impl } else { - const auto keys = MetadataReadHelper::listFiles( - configuration, deltalake_metadata_directory, metadata_file_suffix); - + const auto keys = listFiles(object_storage, configuration, deltalake_metadata_directory, metadata_file_suffix); for (const String & key : keys) - processMetadataFile(key, result_files, configuration, context); + processMetadataFile(key, result_files, object_storage, configuration, context); } return result_files; } + std::vector listFiles( + const ObjectStoragePtr & object_storage, + const StorageObjectStorageConfiguration & configuration, + const String & prefix, const String & suffix) + { + auto key = std::filesystem::path(configuration.getPath()) / prefix; + RelativePathsWithMetadata files_with_metadata; + object_storage->listObjects(key, files_with_metadata, 0); + Strings res; + for (const auto & file_with_metadata : files_with_metadata) + { + const auto & filename = file_with_metadata->relative_path; + if (filename.ends_with(suffix)) + res.push_back(filename); + } + LOG_TRACE(getLogger("DataLakeMetadataReadHelper"), "Listed {} files", res.size()); + return res; + } + /** * Example of content of a single .json metadata file: * " @@ -132,10 +149,12 @@ struct DeltaLakeMetadataParser::Impl void processMetadataFile( const String & key, std::set & result, - const Configuration & configuration, + ObjectStoragePtr object_storage, + const StorageObjectStorageConfiguration & configuration, ContextPtr context) { - auto buf = MetadataReadHelper::createReadBuffer(key, context, configuration); + auto read_settings = context->getReadSettings(); + auto buf = object_storage->readObject(StoredObject(key), read_settings); char c; while (!buf->eof()) @@ -180,14 +199,18 @@ struct DeltaLakeMetadataParser::Impl * * We need to get "version", which is the version of the checkpoint we need to read. 
*/ - size_t readLastCheckpointIfExists(const Configuration & configuration, ContextPtr context) + size_t readLastCheckpointIfExists( + ObjectStoragePtr object_storage, + const StorageObjectStorageConfiguration & configuration, + ContextPtr context) const { const auto last_checkpoint_file = fs::path(configuration.getPath()) / deltalake_metadata_directory / "_last_checkpoint"; - if (!MetadataReadHelper::exists(last_checkpoint_file, configuration)) + if (!object_storage->exists(StoredObject(last_checkpoint_file))) return 0; String json_str; - auto buf = MetadataReadHelper::createReadBuffer(last_checkpoint_file, context, configuration); + auto read_settings = context->getReadSettings(); + auto buf = object_storage->readObject(StoredObject(last_checkpoint_file), read_settings); readJSONObjectPossiblyInvalid(json_str, *buf); const JSON json(json_str); @@ -237,9 +260,13 @@ struct DeltaLakeMetadataParser::Impl throw Exception(ErrorCodes::BAD_ARGUMENTS, "Arrow error: {}", _s.ToString()); \ } while (false) - size_t getCheckpointIfExists(std::set & result, const Configuration & configuration, ContextPtr context) + size_t getCheckpointIfExists( + std::set & result, + ObjectStoragePtr object_storage, + const StorageObjectStorageConfiguration & configuration, + ContextPtr context) { - const auto version = readLastCheckpointIfExists(configuration, context); + const auto version = readLastCheckpointIfExists(object_storage, configuration, context); if (!version) return 0; @@ -248,7 +275,8 @@ struct DeltaLakeMetadataParser::Impl LOG_TRACE(log, "Using checkpoint file: {}", checkpoint_path.string()); - auto buf = MetadataReadHelper::createReadBuffer(checkpoint_path, context, configuration); + auto read_settings = context->getReadSettings(); + auto buf = object_storage->readObject(StoredObject(checkpoint_path), read_settings); auto format_settings = getFormatSettings(context); /// Force nullable, because this parquet file for some reason does not have nullable @@ -317,22 +345,17 @@ struct DeltaLakeMetadataParser::Impl LoggerPtr log = getLogger("DeltaLakeMetadataParser"); }; +DeltaLakeMetadataParser::DeltaLakeMetadataParser() : impl(std::make_unique()) {} -template -DeltaLakeMetadataParser::DeltaLakeMetadataParser() : impl(std::make_unique()) +Strings DeltaLakeMetadataParser::getFiles( + ObjectStoragePtr object_storage, + StorageObjectStorageConfigurationPtr configuration, + ContextPtr context) { -} - -template -Strings DeltaLakeMetadataParser::getFiles(const Configuration & configuration, ContextPtr context) -{ - auto result = impl->processMetadataFiles(configuration, context); + auto result = impl->processMetadataFiles(object_storage, *configuration, context); return Strings(result.begin(), result.end()); } -template DeltaLakeMetadataParser::DeltaLakeMetadataParser(); -template Strings DeltaLakeMetadataParser::getFiles( - const StorageS3::Configuration & configuration, ContextPtr); } #endif diff --git a/src/Storages/DataLakes/DeltaLakeMetadataParser.h b/src/Storages/DataLakes/DeltaLakeMetadataParser.h index df7276b90b4..f94024597d6 100644 --- a/src/Storages/DataLakes/DeltaLakeMetadataParser.h +++ b/src/Storages/DataLakes/DeltaLakeMetadataParser.h @@ -2,17 +2,21 @@ #include #include +#include +#include namespace DB { -template struct DeltaLakeMetadataParser { public: - DeltaLakeMetadataParser(); + DeltaLakeMetadataParser(); - Strings getFiles(const Configuration & configuration, ContextPtr context); + Strings getFiles( + ObjectStoragePtr object_storage, + StorageObjectStorageConfigurationPtr configuration, + 
ContextPtr context); private: struct Impl; diff --git a/src/Storages/DataLakes/HudiMetadataParser.cpp b/src/Storages/DataLakes/HudiMetadataParser.cpp index 699dfe8fda0..8571c035b32 100644 --- a/src/Storages/DataLakes/HudiMetadataParser.cpp +++ b/src/Storages/DataLakes/HudiMetadataParser.cpp @@ -1,16 +1,11 @@ #include +#include #include -#include #include #include #include "config.h" -#include #include -#if USE_AWS_S3 -#include -#include - namespace DB { @@ -19,98 +14,98 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -template -struct HudiMetadataParser::Impl -{ - /** - * Useful links: - * - https://hudi.apache.org/tech-specs/ - * - https://hudi.apache.org/docs/file_layouts/ - */ +/** + * Useful links: + * - https://hudi.apache.org/tech-specs/ + * - https://hudi.apache.org/docs/file_layouts/ + */ - /** - * Hudi tables store metadata files and data files. - * Metadata files are stored in .hoodie/metadata directory. Though unlike DeltaLake and Iceberg, - * metadata is not required in order to understand which files we need to read, moreover, - * for Hudi metadata does not always exist. - * - * There can be two types of data files - * 1. base files (columnar file formats like Apache Parquet/Orc) - * 2. log files - * Currently we support reading only `base files`. - * Data file name format: - * [File Id]_[File Write Token]_[Transaction timestamp].[File Extension] - * - * To find needed parts we need to find out latest part file for every file group for every partition. - * Explanation why: - * Hudi reads in and overwrites the entire table/partition with each update. - * Hudi controls the number of file groups under a single partition according to the - * hoodie.parquet.max.file.size option. Once a single Parquet file is too large, Hudi creates a second file group. - * Each file group is identified by File Id. - */ - Strings processMetadataFiles(const Configuration & configuration) +/** + * Hudi tables store metadata files and data files. + * Metadata files are stored in .hoodie/metadata directory. Though unlike DeltaLake and Iceberg, + * metadata is not required in order to understand which files we need to read, moreover, + * for Hudi metadata does not always exist. + * + * There can be two types of data files + * 1. base files (columnar file formats like Apache Parquet/Orc) + * 2. log files + * Currently we support reading only `base files`. + * Data file name format: + * [File Id]_[File Write Token]_[Transaction timestamp].[File Extension] + * + * To find needed parts we need to find out latest part file for every file group for every partition. + * Explanation why: + * Hudi reads in and overwrites the entire table/partition with each update. + * Hudi controls the number of file groups under a single partition according to the + * hoodie.parquet.max.file.size option. Once a single Parquet file is too large, Hudi creates a second file group. + * Each file group is identified by File Id. 
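+ * For example (file names are illustrative): a partition containing
+ *     fileId1_0-1-0_20240121124500.parquet
+ *     fileId1_0-2-0_20240122183000.parquet
+ * has a single file group (fileId1); only the file with the larger transaction
+ * timestamp (20240122183000) is kept as a data file by getFiles() below.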
+ */ +std::vector listFiles( + const ObjectStoragePtr & object_storage, + const StorageObjectStorageConfiguration & configuration, + const String & prefix, const String & suffix) +{ + auto key = std::filesystem::path(configuration.getPath()) / prefix; + RelativePathsWithMetadata files_with_metadata; + object_storage->listObjects(key, files_with_metadata, 0); + Strings res; + for (const auto & file_with_metadata : files_with_metadata) { - auto log = getLogger("HudiMetadataParser"); - - const auto keys = MetadataReadHelper::listFiles(configuration, "", Poco::toLower(configuration.format)); - - using Partition = std::string; - using FileID = std::string; - struct FileInfo - { - String key; - UInt64 timestamp = 0; - }; - std::unordered_map> data_files; - - for (const auto & key : keys) - { - auto key_file = std::filesystem::path(key); - Strings file_parts; - const String stem = key_file.stem(); - splitInto<'_'>(file_parts, stem); - if (file_parts.size() != 3) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected format for file: {}", key); - - const auto partition = key_file.parent_path().stem(); - const auto & file_id = file_parts[0]; - const auto timestamp = parse(file_parts[2]); - - auto & file_info = data_files[partition][file_id]; - if (file_info.timestamp == 0 || file_info.timestamp < timestamp) - { - file_info.key = std::move(key); - file_info.timestamp = timestamp; - } - } - - Strings result; - for (auto & [partition, partition_data] : data_files) - { - LOG_TRACE(log, "Adding {} data files from partition {}", partition, partition_data.size()); - for (auto & [file_id, file_data] : partition_data) - result.push_back(std::move(file_data.key)); - } - return result; + const auto & filename = file_with_metadata->relative_path; + if (filename.ends_with(suffix)) + res.push_back(filename); } -}; + LOG_TRACE(getLogger("DataLakeMetadataReadHelper"), "Listed {} files", res.size()); + return res; +} - -template -HudiMetadataParser::HudiMetadataParser() : impl(std::make_unique()) +Strings HudiMetadataParser::getFiles( + ObjectStoragePtr object_storage, + StorageObjectStorageConfigurationPtr configuration, + ContextPtr) { + auto log = getLogger("HudiMetadataParser"); + + const auto keys = listFiles(object_storage, *configuration, "", Poco::toLower(configuration->format)); + + using Partition = std::string; + using FileID = std::string; + struct FileInfo + { + String key; + UInt64 timestamp = 0; + }; + std::unordered_map> data_files; + + for (const auto & key : keys) + { + auto key_file = std::filesystem::path(key); + Strings file_parts; + const String stem = key_file.stem(); + splitInto<'_'>(file_parts, stem); + if (file_parts.size() != 3) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected format for file: {}", key); + + const auto partition = key_file.parent_path().stem(); + const auto & file_id = file_parts[0]; + const auto timestamp = parse(file_parts[2]); + + auto & file_info = data_files[partition][file_id]; + if (file_info.timestamp == 0 || file_info.timestamp < timestamp) + { + file_info.key = key; + file_info.timestamp = timestamp; + } + } + + Strings result; + for (auto & [partition, partition_data] : data_files) + { + LOG_TRACE(log, "Adding {} data files from partition {}", partition, partition_data.size()); + for (auto & [file_id, file_data] : partition_data) + result.push_back(std::move(file_data.key)); + } + return result; } -template -Strings HudiMetadataParser::getFiles(const Configuration & configuration, ContextPtr) -{ - return impl->processMetadataFiles(configuration); } - 
-template HudiMetadataParser::HudiMetadataParser(); -template Strings HudiMetadataParser::getFiles( - const StorageS3::Configuration & configuration, ContextPtr); - -} - -#endif diff --git a/src/Storages/DataLakes/HudiMetadataParser.h b/src/Storages/DataLakes/HudiMetadataParser.h index 6727ba2f718..2fc004595ca 100644 --- a/src/Storages/DataLakes/HudiMetadataParser.h +++ b/src/Storages/DataLakes/HudiMetadataParser.h @@ -1,22 +1,17 @@ #pragma once #include -#include +#include +#include namespace DB { -template struct HudiMetadataParser { -public: - HudiMetadataParser(); - - Strings getFiles(const Configuration & configuration, ContextPtr context); - -private: - struct Impl; - std::shared_ptr impl; + Strings getFiles( + ObjectStoragePtr object_storage, + StorageObjectStorageConfigurationPtr configuration, ContextPtr context); }; } diff --git a/src/Storages/DataLakes/IStorageDataLake.h b/src/Storages/DataLakes/IStorageDataLake.h index db3f835494f..934bf227c42 100644 --- a/src/Storages/DataLakes/IStorageDataLake.h +++ b/src/Storages/DataLakes/IStorageDataLake.h @@ -8,127 +8,91 @@ #include #include #include -#include +#include +#include namespace DB { -template -class IStorageDataLake : public Storage +template +class IStorageDataLake : public StorageObjectStorage { public: static constexpr auto name = Name::name; - using Configuration = typename Storage::Configuration; - template - explicit IStorageDataLake(const Configuration & configuration_, ContextPtr context_, bool attach, Args && ...args) - : Storage(getConfigurationForDataRead(configuration_, context_, {}, attach), context_, std::forward(args)...) - , base_configuration(configuration_) - , log(getLogger(getName())) {} // NOLINT(clang-analyzer-optin.cplusplus.VirtualCall) + using Storage = StorageObjectStorage; + using ConfigurationPtr = Storage::ConfigurationPtr; - template - static StoragePtr create(const Configuration & configuration_, ContextPtr context_, bool attach, Args && ...args) + static StoragePtr create( + ConfigurationPtr base_configuration, + ContextPtr context, + const String & engine_name_, + const StorageID & table_id_, + const ColumnsDescription & columns_, + const ConstraintsDescription & constraints_, + const String & comment_, + std::optional format_settings_, + bool /* attach */) { - return std::make_shared>(configuration_, context_, attach, std::forward(args)...); + auto object_storage = base_configuration->createOrUpdateObjectStorage(context); + + auto configuration = base_configuration->clone(); + configuration->getPaths() = MetadataParser().getFiles(object_storage, configuration, context); + + return std::make_shared>( + base_configuration, configuration, object_storage, engine_name_, context, + table_id_, columns_, constraints_, comment_, format_settings_); } String getName() const override { return name; } static ColumnsDescription getTableStructureFromData( - Configuration & base_configuration, - const std::optional & format_settings, + ObjectStoragePtr object_storage_, + ConfigurationPtr base_configuration, + const std::optional &, ContextPtr local_context) { - auto configuration = getConfigurationForDataRead(base_configuration, local_context); - return Storage::getTableStructureFromData(configuration, format_settings, local_context); + auto metadata = parseIcebergMetadata(object_storage_, base_configuration, local_context); + return ColumnsDescription(metadata->getTableSchema()); } - static Configuration getConfiguration(ASTs & engine_args, ContextPtr local_context) + std::pair 
updateConfigurationAndGetCopy(ContextPtr local_context) override { - return Storage::getConfiguration(engine_args, local_context, /* get_format_from_file */false); + std::lock_guard lock(Storage::configuration_update_mutex); + + auto new_object_storage = base_configuration->createOrUpdateObjectStorage(local_context); + bool updated = new_object_storage != nullptr; + if (updated) + Storage::object_storage = new_object_storage; + + auto new_keys = MetadataParser().getFiles(Storage::object_storage, base_configuration, local_context); + + if (updated || new_keys != Storage::configuration->getPaths()) + { + auto updated_configuration = base_configuration->clone(); + /// If metadata wasn't changed, we won't list data files again. + updated_configuration->getPaths() = new_keys; + Storage::configuration = updated_configuration; + } + return {Storage::configuration, Storage::object_storage}; } - Configuration updateConfigurationAndGetCopy(ContextPtr local_context) override + template + explicit IStorageDataLake( + ConfigurationPtr base_configuration_, + Args &&... args) + : Storage(std::forward(args)...) + , base_configuration(base_configuration_) { - std::lock_guard lock(configuration_update_mutex); - updateConfigurationImpl(local_context); - return Storage::getConfiguration(); - } - - void updateConfiguration(ContextPtr local_context) override - { - std::lock_guard lock(configuration_update_mutex); - updateConfigurationImpl(local_context); } private: - static Configuration getConfigurationForDataRead( - const Configuration & base_configuration, ContextPtr local_context, const Strings & keys = {}, bool attach = false) - { - auto configuration{base_configuration}; - configuration.update(local_context); - configuration.static_configuration = true; - - try - { - if (keys.empty()) - configuration.keys = getDataFiles(configuration, local_context); - else - configuration.keys = keys; - - LOG_TRACE( - getLogger("DataLake"), - "New configuration path: {}, keys: {}", - configuration.getPath(), fmt::join(configuration.keys, ", ")); - - configuration.connect(local_context); - return configuration; - } - catch (...) - { - if (!attach) - throw; - tryLogCurrentException(__PRETTY_FUNCTION__); - return configuration; - } - } - - static Strings getDataFiles(const Configuration & configuration, ContextPtr local_context) - { - return MetadataParser().getFiles(configuration, local_context); - } - - void updateConfigurationImpl(ContextPtr local_context) - { - const bool updated = base_configuration.update(local_context); - auto new_keys = getDataFiles(base_configuration, local_context); - - if (!updated && new_keys == Storage::getConfiguration().keys) - return; - - Storage::useConfiguration(getConfigurationForDataRead(base_configuration, local_context, new_keys)); - } - - Configuration base_configuration; - std::mutex configuration_update_mutex; + ConfigurationPtr base_configuration; LoggerPtr log; }; -template -static StoragePtr createDataLakeStorage(const StorageFactory::Arguments & args) -{ - auto configuration = DataLake::getConfiguration(args.engine_args, args.getLocalContext()); - - /// Data lakes use parquet format, no need for schema inference. 
- if (configuration.format == "auto") - configuration.format = "Parquet"; - - return DataLake::create(configuration, args.getContext(), args.attach, args.table_id, args.columns, args.constraints, - args.comment, getFormatSettings(args.getContext())); -} - } #endif diff --git a/src/Storages/DataLakes/Iceberg/IcebergMetadata.cpp b/src/Storages/DataLakes/Iceberg/IcebergMetadata.cpp index df1536f53fc..08cebb3f396 100644 --- a/src/Storages/DataLakes/Iceberg/IcebergMetadata.cpp +++ b/src/Storages/DataLakes/Iceberg/IcebergMetadata.cpp @@ -21,11 +21,11 @@ #include #include #include +#include #include #include #include -#include -#include +#include #include #include @@ -44,7 +44,8 @@ namespace ErrorCodes } IcebergMetadata::IcebergMetadata( - const StorageS3::Configuration & configuration_, + ObjectStoragePtr object_storage_, + StorageObjectStorageConfigurationPtr configuration_, DB::ContextPtr context_, Int32 metadata_version_, Int32 format_version_, @@ -52,6 +53,7 @@ IcebergMetadata::IcebergMetadata( Int32 current_schema_id_, DB::NamesAndTypesList schema_) : WithContext(context_) + , object_storage(object_storage_) , configuration(configuration_) , metadata_version(metadata_version_) , format_version(format_version_) @@ -331,21 +333,42 @@ MutableColumns parseAvro( return columns; } +std::vector listFiles( + const ObjectStoragePtr & object_storage, + const StorageObjectStorageConfiguration & configuration, + const String & prefix, const String & suffix) +{ + auto key = std::filesystem::path(configuration.getPath()) / prefix; + RelativePathsWithMetadata files_with_metadata; + object_storage->listObjects(key, files_with_metadata, 0); + Strings res; + for (const auto & file_with_metadata : files_with_metadata) + { + const auto & filename = file_with_metadata->relative_path; + if (filename.ends_with(suffix)) + res.push_back(filename); + } + LOG_TRACE(getLogger("DataLakeMetadataReadHelper"), "Listed {} files", res.size()); + return res; +} + /** * Each version of table metadata is stored in a `metadata` directory and * has one of 2 formats: * 1) v.metadata.json, where V - metadata version. 
* 2) -.metadata.json, where V - metadata version */ -std::pair getMetadataFileAndVersion(const StorageS3::Configuration & configuration) +std::pair getMetadataFileAndVersion( + ObjectStoragePtr object_storage, + const StorageObjectStorageConfiguration & configuration) { - const auto metadata_files = S3DataLakeMetadataReadHelper::listFiles(configuration, "metadata", ".metadata.json"); + const auto metadata_files = listFiles(object_storage, configuration, "metadata", ".metadata.json"); if (metadata_files.empty()) { throw Exception( ErrorCodes::FILE_DOESNT_EXIST, "The metadata file for Iceberg table with path {} doesn't exist", - configuration.url.key); + configuration.getPath()); } std::vector> metadata_files_with_versions; @@ -372,11 +395,15 @@ std::pair getMetadataFileAndVersion(const StorageS3::Configuratio } -std::unique_ptr parseIcebergMetadata(const StorageS3::Configuration & configuration, ContextPtr context_) +std::unique_ptr parseIcebergMetadata( + ObjectStoragePtr object_storage, + StorageObjectStorageConfigurationPtr configuration, + ContextPtr context_) { - const auto [metadata_version, metadata_file_path] = getMetadataFileAndVersion(configuration); + const auto [metadata_version, metadata_file_path] = getMetadataFileAndVersion(object_storage, *configuration); LOG_DEBUG(getLogger("IcebergMetadata"), "Parse metadata {}", metadata_file_path); - auto buf = S3DataLakeMetadataReadHelper::createReadBuffer(metadata_file_path, context_, configuration); + auto read_settings = context_->getReadSettings(); + auto buf = object_storage->readObject(StoredObject(metadata_file_path), read_settings); String json_str; readJSONObjectPossiblyInvalid(json_str, *buf); @@ -397,12 +424,12 @@ std::unique_ptr parseIcebergMetadata(const StorageS3::Configura if (snapshot->getValue("snapshot-id") == current_snapshot_id) { const auto path = snapshot->getValue("manifest-list"); - manifest_list_file = std::filesystem::path(configuration.url.key) / "metadata" / std::filesystem::path(path).filename(); + manifest_list_file = std::filesystem::path(configuration->getPath()) / "metadata" / std::filesystem::path(path).filename(); break; } } - return std::make_unique(configuration, context_, metadata_version, format_version, manifest_list_file, schema_id, schema); + return std::make_unique(object_storage, configuration, context_, metadata_version, format_version, manifest_list_file, schema_id, schema); } /** @@ -441,12 +468,14 @@ Strings IcebergMetadata::getDataFiles() LOG_TEST(log, "Collect manifest files from manifest list {}", manifest_list_file); - auto manifest_list_buf = S3DataLakeMetadataReadHelper::createReadBuffer(manifest_list_file, getContext(), configuration); + auto context = getContext(); + auto read_settings = context->getReadSettings(); + auto manifest_list_buf = object_storage->readObject(StoredObject(manifest_list_file), read_settings); auto manifest_list_file_reader = std::make_unique(std::make_unique(*manifest_list_buf)); auto data_type = AvroSchemaReader::avroNodeToDataType(manifest_list_file_reader->dataSchema().root()->leafAt(0)); Block header{{data_type->createColumn(), data_type, "manifest_path"}}; - auto columns = parseAvro(*manifest_list_file_reader, header, getFormatSettings(getContext())); + auto columns = parseAvro(*manifest_list_file_reader, header, getFormatSettings(context)); auto & col = columns.at(0); if (col->getDataType() != TypeIndex::String) @@ -462,7 +491,7 @@ Strings IcebergMetadata::getDataFiles() { const auto file_path = col_str->getDataAt(i).toView(); const auto filename = 
std::filesystem::path(file_path).filename(); - manifest_files.emplace_back(std::filesystem::path(configuration.url.key) / "metadata" / filename); + manifest_files.emplace_back(std::filesystem::path(configuration->getPath()) / "metadata" / filename); } NameSet files; @@ -471,7 +500,7 @@ Strings IcebergMetadata::getDataFiles() { LOG_TEST(log, "Process manifest file {}", manifest_file); - auto buffer = S3DataLakeMetadataReadHelper::createReadBuffer(manifest_file, getContext(), configuration); + auto buffer = object_storage->readObject(StoredObject(manifest_file), read_settings); auto manifest_file_reader = std::make_unique(std::make_unique(*buffer)); /// Manifest file should always have table schema in avro file metadata. By now we don't support tables with evolved schema, @@ -482,7 +511,7 @@ Strings IcebergMetadata::getDataFiles() Poco::JSON::Parser parser; Poco::Dynamic::Var json = parser.parse(schema_json_string); Poco::JSON::Object::Ptr schema_object = json.extract(); - if (!getContext()->getSettingsRef().iceberg_engine_ignore_schema_evolution && schema_object->getValue("schema-id") != current_schema_id) + if (!context->getSettingsRef().iceberg_engine_ignore_schema_evolution && schema_object->getValue("schema-id") != current_schema_id) throw Exception( ErrorCodes::UNSUPPORTED_METHOD, "Cannot read Iceberg table: the table schema has been changed at least 1 time, reading tables with evolved schema is not " @@ -595,9 +624,9 @@ Strings IcebergMetadata::getDataFiles() const auto status = status_int_column->getInt(i); const auto data_path = std::string(file_path_string_column->getDataAt(i).toView()); - const auto pos = data_path.find(configuration.url.key); + const auto pos = data_path.find(configuration->getPath()); if (pos == std::string::npos) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Expected to find {} in data path: {}", configuration.url.key, data_path); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Expected to find {} in data path: {}", configuration->getPath(), data_path); const auto file_path = data_path.substr(pos); diff --git a/src/Storages/DataLakes/Iceberg/IcebergMetadata.h b/src/Storages/DataLakes/Iceberg/IcebergMetadata.h index 3e6a2ec3415..92946e4192b 100644 --- a/src/Storages/DataLakes/Iceberg/IcebergMetadata.h +++ b/src/Storages/DataLakes/Iceberg/IcebergMetadata.h @@ -2,9 +2,10 @@ #if USE_AWS_S3 && USE_AVRO /// StorageIceberg depending on Avro to parse metadata with Avro format. -#include #include #include +#include +#include namespace DB { @@ -59,13 +60,15 @@ namespace DB class IcebergMetadata : WithContext { public: - IcebergMetadata(const StorageS3::Configuration & configuration_, - ContextPtr context_, - Int32 metadata_version_, - Int32 format_version_, - String manifest_list_file_, - Int32 current_schema_id_, - NamesAndTypesList schema_); + IcebergMetadata( + ObjectStoragePtr object_storage_, + StorageObjectStorageConfigurationPtr configuration_, + ContextPtr context_, + Int32 metadata_version_, + Int32 format_version_, + String manifest_list_file_, + Int32 current_schema_id_, + NamesAndTypesList schema_); /// Get data files. On first request it reads manifest_list file and iterates through manifest files to find all data files. 
/// All subsequent calls will return saved list of files (because it cannot be changed without changing metadata file) @@ -77,7 +80,8 @@ public: size_t getVersion() const { return metadata_version; } private: - const StorageS3::Configuration configuration; + ObjectStoragePtr object_storage; + StorageObjectStorageConfigurationPtr configuration; Int32 metadata_version; Int32 format_version; String manifest_list_file; @@ -88,7 +92,10 @@ private: }; -std::unique_ptr parseIcebergMetadata(const StorageS3::Configuration & configuration, ContextPtr context); +std::unique_ptr parseIcebergMetadata( + ObjectStoragePtr object_storage, + StorageObjectStorageConfigurationPtr configuration, + ContextPtr context); } diff --git a/src/Storages/DataLakes/Iceberg/StorageIceberg.cpp b/src/Storages/DataLakes/Iceberg/StorageIceberg.cpp index 8a1a2cdbd8f..ad1a27c312b 100644 --- a/src/Storages/DataLakes/Iceberg/StorageIceberg.cpp +++ b/src/Storages/DataLakes/Iceberg/StorageIceberg.cpp @@ -5,85 +5,6 @@ namespace DB { -StoragePtr StorageIceberg::create( - const DB::StorageIceberg::Configuration & base_configuration, - DB::ContextPtr context_, - bool attach, - const DB::StorageID & table_id_, - const DB::ColumnsDescription & columns_, - const DB::ConstraintsDescription & constraints_, - const String & comment, - std::optional format_settings_) -{ - auto configuration{base_configuration}; - configuration.update(context_); - std::unique_ptr metadata; - NamesAndTypesList schema_from_metadata; - try - { - metadata = parseIcebergMetadata(configuration, context_); - schema_from_metadata = metadata->getTableSchema(); - configuration.keys = metadata->getDataFiles(); - } - catch (...) - { - if (!attach) - throw; - tryLogCurrentException(__PRETTY_FUNCTION__); - } - - return std::make_shared( - std::move(metadata), - configuration, - context_, - table_id_, - columns_.empty() ? ColumnsDescription(schema_from_metadata) : columns_, - constraints_, - comment, - format_settings_); -} - -StorageIceberg::StorageIceberg( - std::unique_ptr metadata_, - const Configuration & configuration_, - ContextPtr context_, - const StorageID & table_id_, - const ColumnsDescription & columns_, - const ConstraintsDescription & constraints_, - const String & comment, - std::optional format_settings_) - : StorageS3(configuration_, context_, table_id_, columns_, constraints_, comment, format_settings_) - , current_metadata(std::move(metadata_)) - , base_configuration(configuration_) -{ -} - -ColumnsDescription StorageIceberg::getTableStructureFromData( - Configuration & base_configuration, - const std::optional &, - ContextPtr local_context) -{ - auto configuration{base_configuration}; - configuration.update(local_context); - auto metadata = parseIcebergMetadata(configuration, local_context); - return ColumnsDescription(metadata->getTableSchema()); -} - -void StorageIceberg::updateConfigurationImpl(ContextPtr local_context) -{ - const bool updated = base_configuration.update(local_context); - auto new_metadata = parseIcebergMetadata(base_configuration, local_context); - - if (!current_metadata || new_metadata->getVersion() != current_metadata->getVersion()) - current_metadata = std::move(new_metadata); - else if (!updated) - return; - - auto updated_configuration{base_configuration}; - /// If metadata wasn't changed, we won't list data files again. 
- updated_configuration.keys = current_metadata->getDataFiles(); - StorageS3::useConfiguration(updated_configuration); -} } diff --git a/src/Storages/DataLakes/Iceberg/StorageIceberg.h b/src/Storages/DataLakes/Iceberg/StorageIceberg.h index 4e63da5508a..bca6e3c868f 100644 --- a/src/Storages/DataLakes/Iceberg/StorageIceberg.h +++ b/src/Storages/DataLakes/Iceberg/StorageIceberg.h @@ -4,13 +4,13 @@ #if USE_AWS_S3 && USE_AVRO -# include -# include -# include -# include -# include -# include -# include +#include +#include +#include +#include +#include +#include +#include namespace DB @@ -21,65 +21,100 @@ namespace DB /// many Iceberg features like schema evolution, partitioning, positional and equality deletes. /// TODO: Implement Iceberg as a separate storage using IObjectStorage /// (to support all object storages, not only S3) and add support for missing Iceberg features. -class StorageIceberg : public StorageS3 +template +class StorageIceberg : public StorageObjectStorage { public: static constexpr auto name = "Iceberg"; + using Storage = StorageObjectStorage; + using ConfigurationPtr = Storage::ConfigurationPtr; - using Configuration = StorageS3::Configuration; - - static StoragePtr create(const Configuration & base_configuration, - ContextPtr context_, - bool attach, + static StoragePtr create( + ConfigurationPtr base_configuration, + ContextPtr context, + const String & engine_name_, const StorageID & table_id_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, - const String & comment, - std::optional format_settings_); + const String & comment_, + std::optional format_settings_, + bool attach) + { + auto object_storage = base_configuration->createOrUpdateObjectStorage(context); + std::unique_ptr metadata; + NamesAndTypesList schema_from_metadata; + try + { + metadata = parseIcebergMetadata(object_storage, base_configuration, context); + schema_from_metadata = metadata->getTableSchema(); + } + catch (...) + { + if (!attach) + throw; + tryLogCurrentException(__PRETTY_FUNCTION__); + } - StorageIceberg( - std::unique_ptr metadata_, - const Configuration & configuration_, - ContextPtr context_, - const StorageID & table_id_, - const ColumnsDescription & columns_, - const ConstraintsDescription & constraints_, - const String & comment, - std::optional format_settings_); + auto configuration = base_configuration->clone(); + configuration->getPaths() = metadata->getDataFiles(); + + return std::make_shared>( + base_configuration, std::move(metadata), configuration, object_storage, engine_name_, context, + table_id_, + columns_.empty() ? 
ColumnsDescription(schema_from_metadata) : columns_, + constraints_, comment_, format_settings_); + } String getName() const override { return name; } static ColumnsDescription getTableStructureFromData( - Configuration & base_configuration, + ObjectStoragePtr object_storage_, + ConfigurationPtr base_configuration, const std::optional &, - ContextPtr local_context); - - static Configuration getConfiguration(ASTs & engine_args, ContextPtr local_context) + ContextPtr local_context) { - return StorageS3::getConfiguration(engine_args, local_context, /* get_format_from_file */false); + auto metadata = parseIcebergMetadata(object_storage_, base_configuration, local_context); + return ColumnsDescription(metadata->getTableSchema()); } - Configuration updateConfigurationAndGetCopy(ContextPtr local_context) override + std::pair updateConfigurationAndGetCopy(ContextPtr local_context) override { - std::lock_guard lock(configuration_update_mutex); - updateConfigurationImpl(local_context); - return StorageS3::getConfiguration(); + std::lock_guard lock(Storage::configuration_update_mutex); + + auto new_object_storage = base_configuration->createOrUpdateObjectStorage(local_context); + bool updated = new_object_storage != nullptr; + if (updated) + Storage::object_storage = new_object_storage; + + auto new_metadata = parseIcebergMetadata(Storage::object_storage, base_configuration, local_context); + + if (!current_metadata || new_metadata->getVersion() != current_metadata->getVersion()) + current_metadata = std::move(new_metadata); + else if (updated) + { + auto updated_configuration = base_configuration->clone(); + /// If metadata wasn't changed, we won't list data files again. + updated_configuration->getPaths() = current_metadata->getDataFiles(); + Storage::configuration = updated_configuration; + } + return {Storage::configuration, Storage::object_storage}; } - void updateConfiguration(ContextPtr local_context) override + template + StorageIceberg( + ConfigurationPtr base_configuration_, + std::unique_ptr metadata_, + Args &&... args) + : Storage(std::forward(args)...) 
+ , base_configuration(base_configuration_) + , current_metadata(std::move(metadata_)) { - std::lock_guard lock(configuration_update_mutex); - updateConfigurationImpl(local_context); } private: - void updateConfigurationImpl(ContextPtr local_context); - + ConfigurationPtr base_configuration; std::unique_ptr current_metadata; - Configuration base_configuration; - std::mutex configuration_update_mutex; }; - } #endif diff --git a/src/Storages/DataLakes/S3MetadataReader.cpp b/src/Storages/DataLakes/S3MetadataReader.cpp deleted file mode 100644 index d66e21550a3..00000000000 --- a/src/Storages/DataLakes/S3MetadataReader.cpp +++ /dev/null @@ -1,86 +0,0 @@ -#include - -#if USE_AWS_S3 - -#include -#include -#include -#include -#include -#include -#include - - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int S3_ERROR; -} - -std::shared_ptr -S3DataLakeMetadataReadHelper::createReadBuffer(const String & key, ContextPtr context, const StorageS3::Configuration & base_configuration) -{ - S3Settings::RequestSettings request_settings; - request_settings.max_single_read_retries = context->getSettingsRef().s3_max_single_read_retries; - return std::make_shared( - base_configuration.client, - base_configuration.url.bucket, - key, - base_configuration.url.version_id, - request_settings, - context->getReadSettings()); -} - -bool S3DataLakeMetadataReadHelper::exists(const String & key, const StorageS3::Configuration & configuration) -{ - return S3::objectExists(*configuration.client, configuration.url.bucket, key); -} - -std::vector S3DataLakeMetadataReadHelper::listFiles( - const StorageS3::Configuration & base_configuration, const String & prefix, const String & suffix) -{ - const auto & table_path = base_configuration.url.key; - const auto & bucket = base_configuration.url.bucket; - const auto & client = base_configuration.client; - - std::vector res; - S3::ListObjectsV2Request request; - Aws::S3::Model::ListObjectsV2Outcome outcome; - - request.SetBucket(bucket); - request.SetPrefix(std::filesystem::path(table_path) / prefix); - - bool is_finished{false}; - while (!is_finished) - { - outcome = client->ListObjectsV2(request); - if (!outcome.IsSuccess()) - throw S3Exception( - outcome.GetError().GetErrorType(), - "Could not list objects in bucket {} with key {}, S3 exception: {}, message: {}", - quoteString(bucket), - quoteString(base_configuration.url.key), - backQuote(outcome.GetError().GetExceptionName()), - quoteString(outcome.GetError().GetMessage())); - - const auto & result_batch = outcome.GetResult().GetContents(); - for (const auto & obj : result_batch) - { - const auto & filename = obj.GetKey(); - if (filename.ends_with(suffix)) - res.push_back(filename); - } - - request.SetContinuationToken(outcome.GetResult().GetNextContinuationToken()); - is_finished = !outcome.GetResult().GetIsTruncated(); - } - - LOG_TRACE(getLogger("S3DataLakeMetadataReadHelper"), "Listed {} files", res.size()); - - return res; -} - -} -#endif diff --git a/src/Storages/DataLakes/S3MetadataReader.h b/src/Storages/DataLakes/S3MetadataReader.h deleted file mode 100644 index cae7dd1fa3d..00000000000 --- a/src/Storages/DataLakes/S3MetadataReader.h +++ /dev/null @@ -1,25 +0,0 @@ -#pragma once - -#include - -#if USE_AWS_S3 - -#include - -class ReadBuffer; - -namespace DB -{ - -struct S3DataLakeMetadataReadHelper -{ - static std::shared_ptr createReadBuffer( - const String & key, ContextPtr context, const StorageS3::Configuration & base_configuration); - - static bool exists(const String & key, const 
StorageS3::Configuration & configuration); - - static std::vector listFiles(const StorageS3::Configuration & configuration, const std::string & prefix = "", const std::string & suffix = ""); -}; -} - -#endif diff --git a/src/Storages/DataLakes/StorageDeltaLake.h b/src/Storages/DataLakes/StorageDeltaLake.h index 8b4ba28d6f7..07c2205d2df 100644 --- a/src/Storages/DataLakes/StorageDeltaLake.h +++ b/src/Storages/DataLakes/StorageDeltaLake.h @@ -5,11 +5,6 @@ #include #include "config.h" -#if USE_AWS_S3 -#include -#include -#endif - namespace DB { @@ -19,7 +14,7 @@ struct StorageDeltaLakeName }; #if USE_AWS_S3 && USE_PARQUET -using StorageDeltaLakeS3 = IStorageDataLake>; +using StorageDeltaLakeS3 = IStorageDataLake; #endif } diff --git a/src/Storages/DataLakes/StorageHudi.h b/src/Storages/DataLakes/StorageHudi.h index 84666f51405..3fd52c82d32 100644 --- a/src/Storages/DataLakes/StorageHudi.h +++ b/src/Storages/DataLakes/StorageHudi.h @@ -5,11 +5,6 @@ #include #include "config.h" -#if USE_AWS_S3 -#include -#include -#endif - namespace DB { @@ -19,7 +14,7 @@ struct StorageHudiName }; #if USE_AWS_S3 -using StorageHudiS3 = IStorageDataLake>; +using StorageHudiS3 = IStorageDataLake; #endif } diff --git a/src/Storages/DataLakes/registerDataLakes.cpp b/src/Storages/DataLakes/registerDataLakes.cpp index 118600f7212..2647fbce39d 100644 --- a/src/Storages/DataLakes/registerDataLakes.cpp +++ b/src/Storages/DataLakes/registerDataLakes.cpp @@ -6,43 +6,43 @@ #include #include #include +#include namespace DB { -#define REGISTER_DATA_LAKE_STORAGE(STORAGE, NAME) \ - factory.registerStorage( \ - NAME, \ - [](const StorageFactory::Arguments & args) \ - { \ - return createDataLakeStorage(args);\ - }, \ - { \ - .supports_settings = false, \ - .supports_schema_inference = true, \ - .source_access_type = AccessType::S3, \ - }); - #if USE_PARQUET -void registerStorageDeltaLake(StorageFactory & factory) +void registerStorageDeltaLake(StorageFactory & ) { - REGISTER_DATA_LAKE_STORAGE(StorageDeltaLakeS3, StorageDeltaLakeName::name) + // factory.registerStorage( + // StorageDeltaLakeName::name, + // [&](const StorageFactory::Arguments & args) + // { + // auto configuration = std::make_shared(); + // return IStorageDataLake::create( + // configuration, args.getContext(), "deltaLake", args.table_id, args.columns, + // args.constraints, args.comment, std::nullopt, args.attach); + // }, + // { + // .supports_settings = false, + // .supports_schema_inference = true, + // .source_access_type = AccessType::S3, + // }); } #endif #if USE_AVRO /// StorageIceberg depending on Avro to parse metadata with Avro format. 
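The Iceberg registration just below is stubbed out by this patch, like the DeltaLake one earlier in the file. For illustration, re-enabling it would presumably mirror the commented-out DeltaLake lambda above; this is only a sketch, and StorageS3Configuration is an assumed placeholder since the patch does not name the configuration type:

    /// Sketch only; mirrors the commented-out DeltaLake registration above.
    /// StorageS3Configuration is an assumed placeholder for the configuration type.
    factory.registerStorage(
        "Iceberg",
        [&](const StorageFactory::Arguments & args)
        {
            auto configuration = std::make_shared<StorageS3Configuration>();
            return StorageIceberg<StorageS3Configuration>::create(
                configuration, args.getContext(), "Iceberg", args.table_id, args.columns,
                args.constraints, args.comment, std::nullopt, args.attach);
        },
        {
            .supports_settings = false,
            .supports_schema_inference = true,
            .source_access_type = AccessType::S3,
        });
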
-void registerStorageIceberg(StorageFactory & factory) +void registerStorageIceberg(StorageFactory &) { - REGISTER_DATA_LAKE_STORAGE(StorageIceberg, StorageIceberg::name) + // REGISTER_DATA_LAKE_STORAGE(StorageIceberg, StorageIceberg::name) } #endif -void registerStorageHudi(StorageFactory & factory) +void registerStorageHudi(StorageFactory &) { - REGISTER_DATA_LAKE_STORAGE(StorageHudiS3, StorageHudiName::name) } } diff --git a/src/Storages/HDFS/StorageHDFS.cpp b/src/Storages/HDFS/StorageHDFS.cpp deleted file mode 100644 index ab21c4946e4..00000000000 --- a/src/Storages/HDFS/StorageHDFS.cpp +++ /dev/null @@ -1,1117 +0,0 @@ -#include "config.h" - -#if USE_HDFS - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include - -#include -#include - -#include - -namespace fs = std::filesystem; - -namespace ProfileEvents -{ - extern const Event EngineFileLikeReadFiles; -} - -namespace DB -{ -namespace ErrorCodes -{ - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; - extern const int ACCESS_DENIED; - extern const int DATABASE_ACCESS_DENIED; - extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; - extern const int BAD_ARGUMENTS; - extern const int LOGICAL_ERROR; - extern const int CANNOT_COMPILE_REGEXP; -} -namespace -{ - struct HDFSFileInfoDeleter - { - /// Can have only one entry (see hdfsGetPathInfo()) - void operator()(hdfsFileInfo * info) { hdfsFreeFileInfo(info, 1); } - }; - using HDFSFileInfoPtr = std::unique_ptr; - - /* Recursive directory listing with matched paths as a result. - * Have the same method in StorageFile. - */ - std::vector LSWithRegexpMatching( - const String & path_for_ls, - const HDFSFSPtr & fs, - const String & for_match) - { - std::vector result; - - const size_t first_glob_pos = for_match.find_first_of("*?{"); - - if (first_glob_pos == std::string::npos) - { - const String path = fs::path(path_for_ls + for_match.substr(1)).lexically_normal(); - HDFSFileInfoPtr hdfs_info(hdfsGetPathInfo(fs.get(), path.c_str())); - if (hdfs_info) // NOLINT - { - result.push_back(StorageHDFS::PathWithInfo{ - String(path), - StorageHDFS::PathInfo{hdfs_info->mLastMod, static_cast(hdfs_info->mSize)}}); - } - return result; - } - - const size_t end_of_path_without_globs = for_match.substr(0, first_glob_pos).rfind('/'); - const String suffix_with_globs = for_match.substr(end_of_path_without_globs); /// begin with '/' - const String prefix_without_globs = path_for_ls + for_match.substr(1, end_of_path_without_globs); /// ends with '/' - - const size_t next_slash_after_glob_pos = suffix_with_globs.find('/', 1); - - const std::string current_glob = suffix_with_globs.substr(0, next_slash_after_glob_pos); - - re2::RE2 matcher(makeRegexpPatternFromGlobs(current_glob)); - if (!matcher.ok()) - throw Exception(ErrorCodes::CANNOT_COMPILE_REGEXP, - "Cannot compile regex from glob ({}): {}", for_match, matcher.error()); - - HDFSFileInfo ls; - ls.file_info = hdfsListDirectory(fs.get(), prefix_without_globs.data(), &ls.length); - if (ls.file_info == nullptr && errno != ENOENT) // NOLINT - { - // ignore file not found exception, keep throw other exception, libhdfs3 doesn't have function to get exception type, so use errno. 
- throw Exception( - ErrorCodes::ACCESS_DENIED, "Cannot list directory {}: {}", prefix_without_globs, String(hdfsGetLastError())); - } - - if (!ls.file_info && ls.length > 0) - throw Exception(ErrorCodes::LOGICAL_ERROR, "file_info shouldn't be null"); - for (int i = 0; i < ls.length; ++i) - { - const String full_path = fs::path(ls.file_info[i].mName).lexically_normal(); - const size_t last_slash = full_path.rfind('/'); - const String file_name = full_path.substr(last_slash); - const bool looking_for_directory = next_slash_after_glob_pos != std::string::npos; - const bool is_directory = ls.file_info[i].mKind == 'D'; - /// Condition with type of current file_info means what kind of path is it in current iteration of ls - if (!is_directory && !looking_for_directory) - { - if (re2::RE2::FullMatch(file_name, matcher)) - result.push_back(StorageHDFS::PathWithInfo{ - String(full_path), - StorageHDFS::PathInfo{ls.file_info[i].mLastMod, static_cast(ls.file_info[i].mSize)}}); - } - else if (is_directory && looking_for_directory) - { - if (re2::RE2::FullMatch(file_name, matcher)) - { - std::vector result_part = LSWithRegexpMatching(fs::path(full_path) / "", fs, - suffix_with_globs.substr(next_slash_after_glob_pos)); - /// Recursion depth is limited by pattern. '*' works only for depth = 1, for depth = 2 pattern path is '*/*'. So we do not need additional check. - std::move(result_part.begin(), result_part.end(), std::back_inserter(result)); - } - } - } - - return result; - } - - std::pair getPathFromUriAndUriWithoutPath(const String & uri) - { - auto pos = uri.find("//"); - if (pos != std::string::npos && pos + 2 < uri.length()) - { - pos = uri.find('/', pos + 2); - if (pos != std::string::npos) - return {uri.substr(pos), uri.substr(0, pos)}; - } - - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Storage HDFS requires valid URL to be set"); - } - - std::vector getPathsList(const String & path_from_uri, const String & uri_without_path, ContextPtr context) - { - HDFSBuilderWrapper builder = createHDFSBuilder(uri_without_path + "/", context->getGlobalContext()->getConfigRef()); - HDFSFSPtr fs = createHDFSFS(builder.get()); - - Strings paths = expandSelectionGlob(path_from_uri); - - std::vector res; - - for (const auto & path : paths) - { - auto part_of_res = LSWithRegexpMatching("/", fs, path); - res.insert(res.end(), part_of_res.begin(), part_of_res.end()); - } - return res; - } -} - -StorageHDFS::StorageHDFS( - const String & uri_, - const StorageID & table_id_, - const String & format_name_, - const ColumnsDescription & columns_, - const ConstraintsDescription & constraints_, - const String & comment, - ContextPtr context_, - const String & compression_method_, - const bool distributed_processing_, - ASTPtr partition_by_) - : IStorage(table_id_) - , WithContext(context_) - , uris({uri_}) - , format_name(format_name_) - , compression_method(compression_method_) - , distributed_processing(distributed_processing_) - , partition_by(partition_by_) -{ - FormatFactory::instance().checkFormatName(format_name); - context_->getRemoteHostFilter().checkURL(Poco::URI(uri_)); - checkHDFSURL(uri_); - - String path = uri_.substr(uri_.find('/', uri_.find("//") + 2)); - is_path_with_globs = path.find_first_of("*?{") != std::string::npos; - - StorageInMemoryMetadata storage_metadata; - - if (columns_.empty()) - { - auto columns = getTableStructureFromData(format_name, uri_, compression_method, context_); - storage_metadata.setColumns(columns); - } - else - { - /// We don't allow special columns in HDFS storage. 
- if (!columns_.hasOnlyOrdinary()) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Table engine HDFS doesn't support special columns like MATERIALIZED, ALIAS or EPHEMERAL"); - storage_metadata.setColumns(columns_); - } - - storage_metadata.setConstraints(constraints_); - storage_metadata.setComment(comment); - setInMemoryMetadata(storage_metadata); - - virtual_columns = VirtualColumnUtils::getPathFileAndSizeVirtualsForStorage(storage_metadata.getSampleBlock().getNamesAndTypesList()); -} - -namespace -{ - class ReadBufferIterator : public IReadBufferIterator, WithContext - { - public: - ReadBufferIterator( - const std::vector & paths_with_info_, - const String & uri_without_path_, - const String & format_, - const String & compression_method_, - const ContextPtr & context_) - : WithContext(context_) - , paths_with_info(paths_with_info_) - , uri_without_path(uri_without_path_) - , format(format_) - , compression_method(compression_method_) - { - } - - std::pair, std::optional> next() override - { - bool is_first = current_index == 0; - /// For default mode check cached columns for all paths on first iteration. - if (is_first && getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT) - { - if (auto cached_columns = tryGetColumnsFromCache(paths_with_info)) - return {nullptr, cached_columns}; - } - - StorageHDFS::PathWithInfo path_with_info; - - while (true) - { - if (current_index == paths_with_info.size()) - { - if (is_first) - throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "Cannot extract table structure from {} format file, because all files are empty. " - "You must specify table structure manually", format); - return {nullptr, std::nullopt}; - } - - path_with_info = paths_with_info[current_index++]; - if (getContext()->getSettingsRef().hdfs_skip_empty_files && path_with_info.info && path_with_info.info->size == 0) - continue; - - if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::UNION) - { - std::vector paths = {path_with_info}; - if (auto cached_columns = tryGetColumnsFromCache(paths)) - return {nullptr, cached_columns}; - } - - auto compression = chooseCompressionMethod(path_with_info.path, compression_method); - auto impl = std::make_unique(uri_without_path, path_with_info.path, getContext()->getGlobalContext()->getConfigRef(), getContext()->getReadSettings()); - if (!getContext()->getSettingsRef().hdfs_skip_empty_files || !impl->eof()) - { - const Int64 zstd_window_log_max = getContext()->getSettingsRef().zstd_window_log_max; - return {wrapReadBufferWithCompressionMethod(std::move(impl), compression, static_cast(zstd_window_log_max)), std::nullopt}; - } - } - } - - void setNumRowsToLastFile(size_t num_rows) override - { - if (!getContext()->getSettingsRef().schema_inference_use_cache_for_hdfs) - return; - - String source = uri_without_path + paths_with_info[current_index - 1].path; - auto key = getKeyForSchemaCache(source, format, std::nullopt, getContext()); - StorageHDFS::getSchemaCache(getContext()).addNumRows(key, num_rows); - } - - void setSchemaToLastFile(const ColumnsDescription & columns) override - { - if (!getContext()->getSettingsRef().schema_inference_use_cache_for_hdfs - || getContext()->getSettingsRef().schema_inference_mode != SchemaInferenceMode::UNION) - return; - - String source = uri_without_path + paths_with_info[current_index - 1].path; - auto key = getKeyForSchemaCache(source, format, std::nullopt, getContext()); - StorageHDFS::getSchemaCache(getContext()).addColumns(key, columns); - } - - 
void setResultingSchema(const ColumnsDescription & columns) override - { - if (!getContext()->getSettingsRef().schema_inference_use_cache_for_hdfs - || getContext()->getSettingsRef().schema_inference_mode != SchemaInferenceMode::DEFAULT) - return; - - Strings sources; - sources.reserve(paths_with_info.size()); - std::transform(paths_with_info.begin(), paths_with_info.end(), std::back_inserter(sources), [&](const StorageHDFS::PathWithInfo & path_with_info){ return uri_without_path + path_with_info.path; }); - auto cache_keys = getKeysForSchemaCache(sources, format, {}, getContext()); - StorageHDFS::getSchemaCache(getContext()).addManyColumns(cache_keys, columns); - } - - String getLastFileName() const override - { - if (current_index != 0) - return paths_with_info[current_index - 1].path; - - return ""; - } - - private: - std::optional tryGetColumnsFromCache(const std::vector & paths_with_info_) - { - if (!getContext()->getSettingsRef().schema_inference_use_cache_for_hdfs) - return std::nullopt; - - auto & schema_cache = StorageHDFS::getSchemaCache(getContext()); - for (const auto & path_with_info : paths_with_info_) - { - auto get_last_mod_time = [&]() -> std::optional - { - if (path_with_info.info) - return path_with_info.info->last_mod_time; - - auto builder = createHDFSBuilder(uri_without_path + "/", getContext()->getGlobalContext()->getConfigRef()); - auto fs = createHDFSFS(builder.get()); - HDFSFileInfoPtr hdfs_info(hdfsGetPathInfo(fs.get(), path_with_info.path.c_str())); - if (hdfs_info) - return hdfs_info->mLastMod; - - return std::nullopt; - }; - - String url = uri_without_path + path_with_info.path; - auto cache_key = getKeyForSchemaCache(url, format, {}, getContext()); - auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time); - if (columns) - return columns; - } - - return std::nullopt; - } - - const std::vector & paths_with_info; - const String & uri_without_path; - const String & format; - const String & compression_method; - size_t current_index = 0; - }; -} - -ColumnsDescription StorageHDFS::getTableStructureFromData( - const String & format, - const String & uri, - const String & compression_method, - ContextPtr ctx) -{ - const auto [path_from_uri, uri_without_path] = getPathFromUriAndUriWithoutPath(uri); - auto paths_with_info = getPathsList(path_from_uri, uri, ctx); - - if (paths_with_info.empty() && !FormatFactory::instance().checkIfFormatHasExternalSchemaReader(format)) - throw Exception( - ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "Cannot extract table structure from {} format file, because there are no files in HDFS with provided path." 
- " You must specify table structure manually", format); - - ReadBufferIterator read_buffer_iterator(paths_with_info, uri_without_path, format, compression_method, ctx); - return readSchemaFromFormat(format, std::nullopt, read_buffer_iterator, paths_with_info.size() > 1, ctx); -} - -class HDFSSource::DisclosedGlobIterator::Impl -{ -public: - Impl(const String & uri, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, const ContextPtr & context) - { - const auto [path_from_uri, uri_without_path] = getPathFromUriAndUriWithoutPath(uri); - uris = getPathsList(path_from_uri, uri_without_path, context); - ActionsDAGPtr filter_dag; - if (!uris.empty()) - filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns); - - if (filter_dag) - { - std::vector paths; - paths.reserve(uris.size()); - for (const auto & path_with_info : uris) - paths.push_back(path_with_info.path); - - VirtualColumnUtils::filterByPathOrFile(uris, paths, filter_dag, virtual_columns, context); - } - auto file_progress_callback = context->getFileProgressCallback(); - - for (auto & elem : uris) - { - elem.path = uri_without_path + elem.path; - if (file_progress_callback && elem.info) - file_progress_callback(FileProgress(0, elem.info->size)); - } - uris_iter = uris.begin(); - } - - StorageHDFS::PathWithInfo next() - { - std::lock_guard lock(mutex); - if (uris_iter != uris.end()) - { - auto answer = *uris_iter; - ++uris_iter; - return answer; - } - return {}; - } -private: - std::mutex mutex; - std::vector uris; - std::vector::iterator uris_iter; -}; - -class HDFSSource::URISIterator::Impl : WithContext -{ -public: - explicit Impl(const std::vector & uris_, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, const ContextPtr & context_) - : WithContext(context_), uris(uris_), file_progress_callback(context_->getFileProgressCallback()) - { - ActionsDAGPtr filter_dag; - if (!uris.empty()) - filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns); - - if (filter_dag) - { - std::vector paths; - paths.reserve(uris.size()); - for (const auto & uri : uris) - paths.push_back(getPathFromUriAndUriWithoutPath(uri).first); - - VirtualColumnUtils::filterByPathOrFile(uris, paths, filter_dag, virtual_columns, getContext()); - } - - if (!uris.empty()) - { - auto path_and_uri = getPathFromUriAndUriWithoutPath(uris[0]); - builder = createHDFSBuilder(path_and_uri.second + "/", getContext()->getGlobalContext()->getConfigRef()); - fs = createHDFSFS(builder.get()); - } - } - - StorageHDFS::PathWithInfo next() - { - String uri; - HDFSFileInfoPtr hdfs_info; - do - { - size_t current_index = index.fetch_add(1); - if (current_index >= uris.size()) - return {"", {}}; - - uri = uris[current_index]; - auto path_and_uri = getPathFromUriAndUriWithoutPath(uri); - hdfs_info.reset(hdfsGetPathInfo(fs.get(), path_and_uri.first.c_str())); - } - /// Skip non-existed files. 
- while (!hdfs_info && String(hdfsGetLastError()).find("FileNotFoundException") != std::string::npos); - - std::optional info; - if (hdfs_info) - { - info = StorageHDFS::PathInfo{hdfs_info->mLastMod, static_cast(hdfs_info->mSize)}; - if (file_progress_callback) - file_progress_callback(FileProgress(0, hdfs_info->mSize)); - } - - return {uri, info}; - } - -private: - std::atomic_size_t index = 0; - Strings uris; - HDFSBuilderWrapper builder; - HDFSFSPtr fs; - std::function file_progress_callback; -}; - -HDFSSource::DisclosedGlobIterator::DisclosedGlobIterator(const String & uri, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, const ContextPtr & context) - : pimpl(std::make_shared(uri, predicate, virtual_columns, context)) {} - -StorageHDFS::PathWithInfo HDFSSource::DisclosedGlobIterator::next() -{ - return pimpl->next(); -} - -HDFSSource::URISIterator::URISIterator(const std::vector & uris_, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, const ContextPtr & context) - : pimpl(std::make_shared(uris_, predicate, virtual_columns, context)) -{ -} - -StorageHDFS::PathWithInfo HDFSSource::URISIterator::next() -{ - return pimpl->next(); -} - -HDFSSource::HDFSSource( - const ReadFromFormatInfo & info, - StorageHDFSPtr storage_, - ContextPtr context_, - UInt64 max_block_size_, - std::shared_ptr file_iterator_, - bool need_only_count_) - : ISource(info.source_header, false) - , WithContext(context_) - , storage(std::move(storage_)) - , block_for_format(info.format_header) - , requested_columns(info.requested_columns) - , requested_virtual_columns(info.requested_virtual_columns) - , max_block_size(max_block_size_) - , file_iterator(file_iterator_) - , columns_description(info.columns_description) - , need_only_count(need_only_count_) -{ - initialize(); -} - -bool HDFSSource::initialize() -{ - bool skip_empty_files = getContext()->getSettingsRef().hdfs_skip_empty_files; - StorageHDFS::PathWithInfo path_with_info; - while (true) - { - path_with_info = (*file_iterator)(); - if (path_with_info.path.empty()) - return false; - - if (path_with_info.info && skip_empty_files && path_with_info.info->size == 0) - continue; - - current_path = path_with_info.path; - const auto [path_from_uri, uri_without_path] = getPathFromUriAndUriWithoutPath(current_path); - - std::optional file_size; - if (!path_with_info.info) - { - auto builder = createHDFSBuilder(uri_without_path + "/", getContext()->getGlobalContext()->getConfigRef()); - auto fs = createHDFSFS(builder.get()); - HDFSFileInfoPtr hdfs_info(hdfsGetPathInfo(fs.get(), path_from_uri.c_str())); - if (hdfs_info) - path_with_info.info = StorageHDFS::PathInfo{hdfs_info->mLastMod, static_cast(hdfs_info->mSize)}; - } - - if (path_with_info.info) - file_size = path_with_info.info->size; - - auto compression = chooseCompressionMethod(path_from_uri, storage->compression_method); - auto impl = std::make_unique( - uri_without_path, path_from_uri, getContext()->getGlobalContext()->getConfigRef(), getContext()->getReadSettings(), 0, false, file_size); - if (!skip_empty_files || !impl->eof()) - { - impl->setProgressCallback(getContext()); - const Int64 zstd_window_log_max = getContext()->getSettingsRef().zstd_window_log_max; - read_buf = wrapReadBufferWithCompressionMethod(std::move(impl), compression, static_cast(zstd_window_log_max)); - break; - } - } - - current_path = path_with_info.path; - current_file_size = path_with_info.info ? 
std::optional(path_with_info.info->size) : std::nullopt; - - QueryPipelineBuilder builder; - std::optional num_rows_from_cache = need_only_count && getContext()->getSettingsRef().use_cache_for_count_from_files ? tryGetNumRowsFromCache(path_with_info) : std::nullopt; - if (num_rows_from_cache) - { - /// We should not return single chunk with all number of rows, - /// because there is a chance that this chunk will be materialized later - /// (it can cause memory problems even with default values in columns or when virtual columns are requested). - /// Instead, we use a special ConstChunkGenerator that will generate chunks - /// with max_block_size rows until total number of rows is reached. - auto source = std::make_shared(block_for_format, *num_rows_from_cache, max_block_size); - builder.init(Pipe(source)); - } - else - { - std::optional max_parsing_threads; - if (need_only_count) - max_parsing_threads = 1; - - input_format = getContext()->getInputFormat(storage->format_name, *read_buf, block_for_format, max_block_size, std::nullopt, max_parsing_threads); - - if (need_only_count) - input_format->needOnlyCount(); - - builder.init(Pipe(input_format)); - if (columns_description.hasDefaults()) - { - builder.addSimpleTransform([&](const Block & header) - { - return std::make_shared(header, columns_description, *input_format, getContext()); - }); - } - } - - /// Add ExtractColumnsTransform to extract requested columns/subcolumns - /// from the chunk read by IInputFormat. - builder.addSimpleTransform([&](const Block & header) - { - return std::make_shared(header, requested_columns); - }); - - pipeline = std::make_unique(QueryPipelineBuilder::getPipeline(std::move(builder))); - reader = std::make_unique(*pipeline); - - ProfileEvents::increment(ProfileEvents::EngineFileLikeReadFiles); - return true; -} - -String HDFSSource::getName() const -{ - return "HDFSSource"; -} - -Chunk HDFSSource::generate() -{ - while (true) - { - if (isCancelled() || !reader) - { - if (reader) - reader->cancel(); - break; - } - - Chunk chunk; - if (reader->pull(chunk)) - { - UInt64 num_rows = chunk.getNumRows(); - total_rows_in_file += num_rows; - size_t chunk_size = 0; - if (input_format) - chunk_size = input_format->getApproxBytesReadForChunk(); - progress(num_rows, chunk_size ? 
chunk_size : chunk.bytes()); - VirtualColumnUtils::addRequestedPathFileAndSizeVirtualsToChunk(chunk, requested_virtual_columns, current_path, current_file_size); - return chunk; - } - - if (input_format && getContext()->getSettingsRef().use_cache_for_count_from_files) - addNumRowsToCache(current_path, total_rows_in_file); - - total_rows_in_file = 0; - - reader.reset(); - pipeline.reset(); - input_format.reset(); - read_buf.reset(); - - if (!initialize()) - break; - } - return {}; -} - -void HDFSSource::addNumRowsToCache(const String & path, size_t num_rows) -{ - auto cache_key = getKeyForSchemaCache(path, storage->format_name, std::nullopt, getContext()); - StorageHDFS::getSchemaCache(getContext()).addNumRows(cache_key, num_rows); -} - -std::optional HDFSSource::tryGetNumRowsFromCache(const StorageHDFS::PathWithInfo & path_with_info) -{ - auto cache_key = getKeyForSchemaCache(path_with_info.path, storage->format_name, std::nullopt, getContext()); - auto get_last_mod_time = [&]() -> std::optional - { - if (path_with_info.info) - return path_with_info.info->last_mod_time; - return std::nullopt; - }; - - return StorageHDFS::getSchemaCache(getContext()).tryGetNumRows(cache_key, get_last_mod_time); -} - -class HDFSSink : public SinkToStorage -{ -public: - HDFSSink(const String & uri, - const String & format, - const Block & sample_block, - ContextPtr context, - const CompressionMethod compression_method) - : SinkToStorage(sample_block) - { - const auto & settings = context->getSettingsRef(); - write_buf = wrapWriteBufferWithCompressionMethod( - std::make_unique( - uri, context->getGlobalContext()->getConfigRef(), context->getSettingsRef().hdfs_replication, context->getWriteSettings()), - compression_method, - static_cast(settings.output_format_compression_level), - static_cast(settings.output_format_compression_zstd_window_log)); - writer = FormatFactory::instance().getOutputFormatParallelIfPossible(format, *write_buf, sample_block, context); - } - - String getName() const override { return "HDFSSink"; } - - void consume(Chunk chunk) override - { - std::lock_guard lock(cancel_mutex); - if (cancelled) - return; - writer->write(getHeader().cloneWithColumns(chunk.detachColumns())); - } - - void onCancel() override - { - std::lock_guard lock(cancel_mutex); - finalize(); - cancelled = true; - } - - void onException(std::exception_ptr exception) override - { - std::lock_guard lock(cancel_mutex); - try - { - std::rethrow_exception(exception); - } - catch (...) - { - /// An exception context is needed to proper delete write buffers without finalization - release(); - } - } - - void onFinish() override - { - std::lock_guard lock(cancel_mutex); - finalize(); - } - -private: - void finalize() - { - if (!writer) - return; - - try - { - writer->finalize(); - writer->flush(); - write_buf->sync(); - write_buf->finalize(); - } - catch (...) - { - /// Stop ParallelFormattingOutputFormat correctly. 
- release(); - throw; - } - } - - void release() - { - writer.reset(); - write_buf->finalize(); - } - - std::unique_ptr write_buf; - OutputFormatPtr writer; - std::mutex cancel_mutex; - bool cancelled = false; -}; - -class PartitionedHDFSSink : public PartitionedSink -{ -public: - PartitionedHDFSSink( - const ASTPtr & partition_by, - const String & uri_, - const String & format_, - const Block & sample_block_, - ContextPtr context_, - const CompressionMethod compression_method_) - : PartitionedSink(partition_by, context_, sample_block_) - , uri(uri_) - , format(format_) - , sample_block(sample_block_) - , context(context_) - , compression_method(compression_method_) - { - } - - SinkPtr createSinkForPartition(const String & partition_id) override - { - auto path = PartitionedSink::replaceWildcards(uri, partition_id); - PartitionedSink::validatePartitionKey(path, true); - return std::make_shared(path, format, sample_block, context, compression_method); - } - -private: - const String uri; - const String format; - const Block sample_block; - ContextPtr context; - const CompressionMethod compression_method; -}; - - -bool StorageHDFS::supportsSubsetOfColumns(const ContextPtr & context_) const -{ - return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(format_name, context_); -} - -class ReadFromHDFS : public SourceStepWithFilter -{ -public: - std::string getName() const override { return "ReadFromHDFS"; } - void initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &) override; - void applyFilters() override; - - ReadFromHDFS( - Block sample_block, - ReadFromFormatInfo info_, - bool need_only_count_, - std::shared_ptr storage_, - ContextPtr context_, - size_t max_block_size_, - size_t num_streams_) - : SourceStepWithFilter(DataStream{.header = std::move(sample_block)}) - , info(std::move(info_)) - , need_only_count(need_only_count_) - , storage(std::move(storage_)) - , context(std::move(context_)) - , max_block_size(max_block_size_) - , num_streams(num_streams_) - { - } - -private: - ReadFromFormatInfo info; - const bool need_only_count; - std::shared_ptr storage; - - ContextPtr context; - size_t max_block_size; - size_t num_streams; - - std::shared_ptr iterator_wrapper; - - void createIterator(const ActionsDAG::Node * predicate); -}; - -void ReadFromHDFS::applyFilters() -{ - auto filter_actions_dag = ActionsDAG::buildFilterActionsDAG(filter_nodes.nodes); - const ActionsDAG::Node * predicate = nullptr; - if (filter_actions_dag) - predicate = filter_actions_dag->getOutputs().at(0); - - createIterator(predicate); -} - -void StorageHDFS::read( - QueryPlan & query_plan, - const Names & column_names, - const StorageSnapshotPtr & storage_snapshot, - SelectQueryInfo & query_info, - ContextPtr context_, - QueryProcessingStage::Enum /*processed_stage*/, - size_t max_block_size, - size_t num_streams) -{ - auto read_from_format_info = prepareReadingFromFormat(column_names, storage_snapshot, supportsSubsetOfColumns(context_), virtual_columns); - bool need_only_count = (query_info.optimize_trivial_count || read_from_format_info.requested_columns.empty()) - && context_->getSettingsRef().optimize_count_from_files; - - auto this_ptr = std::static_pointer_cast(shared_from_this()); - - auto reading = std::make_unique( - read_from_format_info.source_header, - std::move(read_from_format_info), - need_only_count, - std::move(this_ptr), - context_, - max_block_size, - num_streams); - - query_plan.addStep(std::move(reading)); -} - -void ReadFromHDFS::createIterator(const 
ActionsDAG::Node * predicate) -{ - if (iterator_wrapper) - return; - - if (storage->distributed_processing) - { - iterator_wrapper = std::make_shared( - [callback = context->getReadTaskCallback()]() -> StorageHDFS::PathWithInfo { - return StorageHDFS::PathWithInfo{callback(), std::nullopt}; - }); - } - else if (storage->is_path_with_globs) - { - /// Iterate through disclosed globs and make a source for each file - auto glob_iterator = std::make_shared(storage->uris[0], predicate, storage->virtual_columns, context); - iterator_wrapper = std::make_shared([glob_iterator]() - { - return glob_iterator->next(); - }); - } - else - { - auto uris_iterator = std::make_shared(storage->uris, predicate, storage->virtual_columns, context); - iterator_wrapper = std::make_shared([uris_iterator]() - { - return uris_iterator->next(); - }); - } -} - -void ReadFromHDFS::initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &) -{ - createIterator(nullptr); - - Pipes pipes; - for (size_t i = 0; i < num_streams; ++i) - { - pipes.emplace_back(std::make_shared( - info, - storage, - context, - max_block_size, - iterator_wrapper, - need_only_count)); - } - - auto pipe = Pipe::unitePipes(std::move(pipes)); - if (pipe.empty()) - pipe = Pipe(std::make_shared(info.source_header)); - - for (const auto & processor : pipe.getProcessors()) - processors.emplace_back(processor); - - pipeline.init(std::move(pipe)); -} - -SinkToStoragePtr StorageHDFS::write(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr context_, bool /*async_insert*/) -{ - String current_uri = uris.back(); - - bool has_wildcards = current_uri.find(PartitionedSink::PARTITION_ID_WILDCARD) != String::npos; - const auto * insert_query = dynamic_cast(query.get()); - auto partition_by_ast = insert_query ? (insert_query->partition_by ? insert_query->partition_by : partition_by) : nullptr; - bool is_partitioned_implementation = partition_by_ast && has_wildcards; - - if (is_partitioned_implementation) - { - return std::make_shared( - partition_by_ast, - current_uri, - format_name, - metadata_snapshot->getSampleBlock(), - context_, - chooseCompressionMethod(current_uri, compression_method)); - } - else - { - if (is_path_with_globs) - throw Exception(ErrorCodes::DATABASE_ACCESS_DENIED, "URI '{}' contains globs, so the table is in readonly mode", uris.back()); - - const auto [path_from_uri, uri_without_path] = getPathFromUriAndUriWithoutPath(current_uri); - - HDFSBuilderWrapper builder = createHDFSBuilder(uri_without_path + "/", context_->getGlobalContext()->getConfigRef()); - HDFSFSPtr fs = createHDFSFS(builder.get()); - - bool truncate_on_insert = context_->getSettingsRef().hdfs_truncate_on_insert; - if (!truncate_on_insert && !hdfsExists(fs.get(), path_from_uri.c_str())) - { - if (context_->getSettingsRef().hdfs_create_new_file_on_insert) - { - auto pos = uris[0].find_first_of('.', uris[0].find_last_of('/')); - size_t index = uris.size(); - String new_uri; - do - { - new_uri = uris[0].substr(0, pos) + "." + std::to_string(index) + (pos == std::string::npos ? "" : uris[0].substr(pos)); - ++index; - } - while (!hdfsExists(fs.get(), new_uri.c_str())); - uris.push_back(new_uri); - current_uri = new_uri; - } - else - throw Exception( - ErrorCodes::BAD_ARGUMENTS, - "File with path {} already exists. 
If you want to overwrite it, enable setting hdfs_truncate_on_insert, " - "if you want to create new file on each insert, enable setting hdfs_create_new_file_on_insert", - path_from_uri); - } - - return std::make_shared(current_uri, - format_name, - metadata_snapshot->getSampleBlock(), - context_, - chooseCompressionMethod(current_uri, compression_method)); - } -} - -void StorageHDFS::truncate(const ASTPtr & /* query */, const StorageMetadataPtr &, ContextPtr local_context, TableExclusiveLockHolder &) -{ - const size_t begin_of_path = uris[0].find('/', uris[0].find("//") + 2); - const String url = uris[0].substr(0, begin_of_path); - - HDFSBuilderWrapper builder = createHDFSBuilder(url + "/", local_context->getGlobalContext()->getConfigRef()); - auto fs = createHDFSFS(builder.get()); - - for (const auto & uri : uris) - { - const String path = uri.substr(begin_of_path); - int ret = hdfsDelete(fs.get(), path.data(), 0); - if (ret) - throw Exception(ErrorCodes::ACCESS_DENIED, "Unable to truncate hdfs table: {}", std::string(hdfsGetLastError())); - } -} - - -void registerStorageHDFS(StorageFactory & factory) -{ - factory.registerStorage("HDFS", [](const StorageFactory::Arguments & args) - { - ASTs & engine_args = args.engine_args; - - if (engine_args.empty() || engine_args.size() > 3) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, - "Storage HDFS requires 1, 2 or 3 arguments: " - "url, name of used format (taken from file extension by default) and optional compression method."); - - engine_args[0] = evaluateConstantExpressionOrIdentifierAsLiteral(engine_args[0], args.getLocalContext()); - - String url = checkAndGetLiteralArgument(engine_args[0], "url"); - - String format_name = "auto"; - if (engine_args.size() > 1) - { - engine_args[1] = evaluateConstantExpressionOrIdentifierAsLiteral(engine_args[1], args.getLocalContext()); - format_name = checkAndGetLiteralArgument(engine_args[1], "format_name"); - } - - if (format_name == "auto") - format_name = FormatFactory::instance().getFormatFromFileName(url, true); - - String compression_method; - if (engine_args.size() == 3) - { - engine_args[2] = evaluateConstantExpressionOrIdentifierAsLiteral(engine_args[2], args.getLocalContext()); - compression_method = checkAndGetLiteralArgument(engine_args[2], "compression_method"); - } else compression_method = "auto"; - - ASTPtr partition_by; - if (args.storage_def->partition_by) - partition_by = args.storage_def->partition_by->clone(); - - return std::make_shared( - url, args.table_id, format_name, args.columns, args.constraints, args.comment, args.getContext(), compression_method, false, partition_by); - }, - { - .supports_sort_order = true, // for partition by - .supports_schema_inference = true, - .source_access_type = AccessType::HDFS, - }); -} - -NamesAndTypesList StorageHDFS::getVirtuals() const -{ - return virtual_columns; -} - -Names StorageHDFS::getVirtualColumnNames() -{ - return VirtualColumnUtils::getPathFileAndSizeVirtualsForStorage({}).getNames(); -} - -SchemaCache & StorageHDFS::getSchemaCache(const ContextPtr & ctx) -{ - static SchemaCache schema_cache(ctx->getConfigRef().getUInt("schema_inference_cache_max_elements_for_hdfs", DEFAULT_SCHEMA_CACHE_ELEMENTS)); - return schema_cache; -} - -} - -#endif diff --git a/src/Storages/HDFS/StorageHDFS.h b/src/Storages/HDFS/StorageHDFS.h deleted file mode 100644 index 7170763c959..00000000000 --- a/src/Storages/HDFS/StorageHDFS.h +++ /dev/null @@ -1,179 +0,0 @@ -#pragma once - -#include "config.h" - -#if USE_HDFS - -#include -#include 
-#include -#include -#include -#include - -namespace DB -{ - -class IInputFormat; - -/** - * This class represents table engine for external hdfs files. - * Read method is supported for now. - */ -class StorageHDFS final : public IStorage, WithContext -{ -public: - struct PathInfo - { - time_t last_mod_time; - size_t size; - }; - - struct PathWithInfo - { - PathWithInfo() = default; - PathWithInfo(const String & path_, const std::optional & info_) : path(path_), info(info_) {} - String path; - std::optional info; - }; - - StorageHDFS( - const String & uri_, - const StorageID & table_id_, - const String & format_name_, - const ColumnsDescription & columns_, - const ConstraintsDescription & constraints_, - const String & comment, - ContextPtr context_, - const String & compression_method_ = "", - bool distributed_processing_ = false, - ASTPtr partition_by = nullptr); - - String getName() const override { return "HDFS"; } - - void read( - QueryPlan & query_plan, - const Names & column_names, - const StorageSnapshotPtr & storage_snapshot, - SelectQueryInfo & query_info, - ContextPtr context, - QueryProcessingStage::Enum processed_stage, - size_t max_block_size, - size_t num_streams) override; - - SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr context, bool async_insert) override; - - void truncate( - const ASTPtr & query, - const StorageMetadataPtr & metadata_snapshot, - ContextPtr local_context, - TableExclusiveLockHolder &) override; - - NamesAndTypesList getVirtuals() const override; - static Names getVirtualColumnNames(); - - bool supportsPartitionBy() const override { return true; } - - /// Check if the format is column-oriented. - /// Is is useful because column oriented formats could effectively skip unknown columns - /// So we can create a header of only required columns in read method and ask - /// format to read only them. Note: this hack cannot be done with ordinary formats like TSV. 
- bool supportsSubsetOfColumns(const ContextPtr & context_) const; - - bool supportsSubcolumns() const override { return true; } - - static ColumnsDescription getTableStructureFromData( - const String & format, - const String & uri, - const String & compression_method, - ContextPtr ctx); - - static SchemaCache & getSchemaCache(const ContextPtr & ctx); - - bool supportsTrivialCountOptimization() const override { return true; } - -protected: - friend class HDFSSource; - friend class ReadFromHDFS; - -private: - std::vector uris; - String format_name; - String compression_method; - const bool distributed_processing; - ASTPtr partition_by; - bool is_path_with_globs; - NamesAndTypesList virtual_columns; - - LoggerPtr log = getLogger("StorageHDFS"); -}; - -class PullingPipelineExecutor; - -class HDFSSource : public ISource, WithContext -{ -public: - class DisclosedGlobIterator - { - public: - DisclosedGlobIterator(const String & uri_, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, const ContextPtr & context); - StorageHDFS::PathWithInfo next(); - private: - class Impl; - /// shared_ptr to have copy constructor - std::shared_ptr pimpl; - }; - - class URISIterator - { - public: - URISIterator(const std::vector & uris_, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, const ContextPtr & context); - StorageHDFS::PathWithInfo next(); - private: - class Impl; - /// shared_ptr to have copy constructor - std::shared_ptr pimpl; - }; - - using IteratorWrapper = std::function; - using StorageHDFSPtr = std::shared_ptr; - - HDFSSource( - const ReadFromFormatInfo & info, - StorageHDFSPtr storage_, - ContextPtr context_, - UInt64 max_block_size_, - std::shared_ptr file_iterator_, - bool need_only_count_); - - String getName() const override; - - Chunk generate() override; - -private: - void addNumRowsToCache(const String & path, size_t num_rows); - std::optional tryGetNumRowsFromCache(const StorageHDFS::PathWithInfo & path_with_info); - - StorageHDFSPtr storage; - Block block_for_format; - NamesAndTypesList requested_columns; - NamesAndTypesList requested_virtual_columns; - UInt64 max_block_size; - std::shared_ptr file_iterator; - ColumnsDescription columns_description; - bool need_only_count; - size_t total_rows_in_file = 0; - - std::unique_ptr read_buf; - std::shared_ptr input_format; - std::unique_ptr pipeline; - std::unique_ptr reader; - String current_path; - std::optional current_file_size; - - /// Recreate ReadBuffer and PullingPipelineExecutor for each file. 
- bool initialize(); -}; -} - -#endif diff --git a/src/Storages/HDFS/StorageHDFSCluster.cpp b/src/Storages/HDFS/StorageHDFSCluster.cpp deleted file mode 100644 index fad29436102..00000000000 --- a/src/Storages/HDFS/StorageHDFSCluster.cpp +++ /dev/null @@ -1,98 +0,0 @@ -#include "config.h" -#include "Interpreters/Context_fwd.h" - -#if USE_HDFS - -#include - -#include -#include -#include -#include -#include - -#include - -#include -#include -#include - -#include -#include -#include -#include -#include - -#include -#include - - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int LOGICAL_ERROR; -} - -StorageHDFSCluster::StorageHDFSCluster( - ContextPtr context_, - const String & cluster_name_, - const String & uri_, - const StorageID & table_id_, - const String & format_name_, - const ColumnsDescription & columns_, - const ConstraintsDescription & constraints_, - const String & compression_method_, - bool structure_argument_was_provided_) - : IStorageCluster(cluster_name_, table_id_, getLogger("StorageHDFSCluster (" + table_id_.table_name + ")"), structure_argument_was_provided_) - , uri(uri_) - , format_name(format_name_) - , compression_method(compression_method_) -{ - checkHDFSURL(uri_); - context_->getRemoteHostFilter().checkURL(Poco::URI(uri_)); - - StorageInMemoryMetadata storage_metadata; - - if (columns_.empty()) - { - auto columns = StorageHDFS::getTableStructureFromData(format_name, uri_, compression_method, context_); - storage_metadata.setColumns(columns); - } - else - storage_metadata.setColumns(columns_); - - storage_metadata.setConstraints(constraints_); - setInMemoryMetadata(storage_metadata); - - virtual_columns = VirtualColumnUtils::getPathFileAndSizeVirtualsForStorage(storage_metadata.getSampleBlock().getNamesAndTypesList()); -} - -void StorageHDFSCluster::addColumnsStructureToQuery(ASTPtr & query, const String & structure, const ContextPtr & context) -{ - ASTExpressionList * expression_list = extractTableFunctionArgumentsFromSelectQuery(query); - if (!expression_list) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected SELECT query from table function hdfsCluster, got '{}'", queryToString(query)); - - TableFunctionHDFSCluster::addColumnsStructureToArguments(expression_list->children, structure, context); -} - - -RemoteQueryExecutor::Extension StorageHDFSCluster::getTaskIteratorExtension(const ActionsDAG::Node * predicate, const ContextPtr & context) const -{ - auto iterator = std::make_shared(uri, predicate, virtual_columns, context); - auto callback = std::make_shared>([iter = std::move(iterator)]() mutable -> String { return iter->next().path; }); - return RemoteQueryExecutor::Extension{.task_iterator = std::move(callback)}; -} - -NamesAndTypesList StorageHDFSCluster::getVirtuals() const -{ - return NamesAndTypesList{ - {"_path", std::make_shared(std::make_shared())}, - {"_file", std::make_shared(std::make_shared())}}; -} - -} - -#endif diff --git a/src/Storages/HDFS/StorageHDFSCluster.h b/src/Storages/HDFS/StorageHDFSCluster.h deleted file mode 100644 index 7c4c41a573a..00000000000 --- a/src/Storages/HDFS/StorageHDFSCluster.h +++ /dev/null @@ -1,56 +0,0 @@ -#pragma once - -#include "config.h" - -#if USE_HDFS - -#include -#include - -#include -#include -#include -#include - -namespace DB -{ - -class Context; - -class StorageHDFSCluster : public IStorageCluster -{ -public: - StorageHDFSCluster( - ContextPtr context_, - const String & cluster_name_, - const String & uri_, - const StorageID & table_id_, - const String & format_name_, - const ColumnsDescription 
& columns_, - const ConstraintsDescription & constraints_, - const String & compression_method_, - bool structure_argument_was_provided_); - - std::string getName() const override { return "HDFSCluster"; } - - NamesAndTypesList getVirtuals() const override; - - RemoteQueryExecutor::Extension getTaskIteratorExtension(const ActionsDAG::Node * predicate, const ContextPtr & context) const override; - - bool supportsSubcolumns() const override { return true; } - - bool supportsTrivialCountOptimization() const override { return true; } - -private: - void addColumnsStructureToQuery(ASTPtr & query, const String & structure, const ContextPtr & context) override; - - String uri; - String format_name; - String compression_method; - NamesAndTypesList virtual_columns; -}; - - -} - -#endif diff --git a/src/Storages/IStorage.h b/src/Storages/IStorage.h index 4fa6bfdd617..26301472f24 100644 --- a/src/Storages/IStorage.h +++ b/src/Storages/IStorage.h @@ -98,9 +98,14 @@ class IStorage : public std::enable_shared_from_this, public TypePromo public: IStorage() = delete; /// Storage metadata can be set separately in setInMemoryMetadata method - explicit IStorage(StorageID storage_id_) + explicit IStorage(StorageID storage_id_, std::unique_ptr metadata_ = nullptr) : storage_id(std::move(storage_id_)) - , metadata(std::make_unique()) {} + { + if (metadata_) + metadata.set(std::move(metadata_)); + else + metadata.set(std::make_unique()); + } IStorage(const IStorage &) = delete; IStorage & operator=(const IStorage &) = delete; diff --git a/src/Storages/ObjectStorage/AzureConfiguration.cpp b/src/Storages/ObjectStorage/AzureConfiguration.cpp new file mode 100644 index 00000000000..ba3e796223a --- /dev/null +++ b/src/Storages/ObjectStorage/AzureConfiguration.cpp @@ -0,0 +1,451 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; +} + +const std::unordered_set required_configuration_keys = { + "blob_path", + "container", +}; + +const std::unordered_set optional_configuration_keys = { + "format", + "compression", + "structure", + "compression_method", + "account_name", + "account_key", + "connection_string", + "storage_account_url", +}; + +using AzureClient = Azure::Storage::Blobs::BlobContainerClient; +using AzureClientPtr = std::unique_ptr; + +namespace +{ + bool isConnectionString(const std::string & candidate) + { + return !candidate.starts_with("http"); + } + + bool containerExists(std::unique_ptr & blob_service_client, std::string container_name) + { + Azure::Storage::Blobs::ListBlobContainersOptions options; + options.Prefix = container_name; + options.PageSizeHint = 1; + + auto containers_list_response = blob_service_client->ListBlobContainers(options); + auto containers_list = containers_list_response.BlobContainers; + + for (const auto & container : containers_list) + { + if (container_name == container.Name) + return true; + } + return false; + } +} + +void StorageAzureBlobConfiguration::check(ContextPtr context) const +{ + Poco::URI url_to_check; + if (is_connection_string) + { + auto parsed_connection_string = Azure::Storage::_internal::ParseConnectionString(connection_url); + url_to_check = Poco::URI(parsed_connection_string.BlobServiceUrl.GetAbsoluteUrl()); + } + else + url_to_check = Poco::URI(connection_url); + + context->getGlobalContext()->getRemoteHostFilter().checkURL(url_to_check); +} + 
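// [Annotation, not part of the patch] The helpers in the anonymous namespace above treat any
// endpoint that does not start with "http" as an Azure connection string, and containerExists()
// lists containers using the name as a prefix and then compares names exactly.
// A minimal standalone sketch of the connection-string heuristic (all names here are hypothetical):
//
//     #include <cassert>
//     #include <string>
//
//     static bool looksLikeConnectionString(const std::string & endpoint)
//     {
//         // Connection strings are ';'-separated key=value pairs, e.g.
//         // "DefaultEndpointsProtocol=https;AccountName=acc;AccountKey=key;EndpointSuffix=core.windows.net",
//         // while storage account endpoints are plain URLs starting with "http"/"https".
//         return !endpoint.starts_with("http");
//     }
//
//     int main()
//     {
//         assert(looksLikeConnectionString("DefaultEndpointsProtocol=https;AccountName=acc;AccountKey=key"));
//         assert(!looksLikeConnectionString("https://acc.blob.core.windows.net/"));
//         return 0;
//     }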
+StorageObjectStorageConfigurationPtr StorageAzureBlobConfiguration::clone() +{ + auto configuration = std::make_shared(); + configuration->connection_url = connection_url; + configuration->is_connection_string = is_connection_string; + configuration->account_name = account_name; + configuration->account_key = account_key; + configuration->container = container; + configuration->blob_path = blob_path; + configuration->blobs_paths = blobs_paths; + return configuration; +} + +AzureObjectStorage::SettingsPtr StorageAzureBlobConfiguration::createSettings(ContextPtr context) +{ + const auto & context_settings = context->getSettingsRef(); + auto settings_ptr = std::make_unique(); + settings_ptr->max_single_part_upload_size = context_settings.azure_max_single_part_upload_size; + settings_ptr->max_single_read_retries = context_settings.azure_max_single_read_retries; + settings_ptr->list_object_keys_size = static_cast(context_settings.azure_list_object_keys_size); + return settings_ptr; +} + +ObjectStoragePtr StorageAzureBlobConfiguration::createOrUpdateObjectStorage(ContextPtr context, bool is_readonly) /// NOLINT +{ + auto client = createClient(is_readonly); + auto settings = createSettings(context); + return std::make_unique("AzureBlobStorage", std::move(client), std::move(settings), container); +} + +AzureClientPtr StorageAzureBlobConfiguration::createClient(bool is_read_only) +{ + using namespace Azure::Storage::Blobs; + + AzureClientPtr result; + + if (is_connection_string) + { + auto blob_service_client = std::make_unique(BlobServiceClient::CreateFromConnectionString(connection_url)); + result = std::make_unique(BlobContainerClient::CreateFromConnectionString(connection_url, container)); + bool container_exists = containerExists(blob_service_client, container); + + if (!container_exists) + { + if (is_read_only) + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "AzureBlobStorage container does not exist '{}'", + container); + + try + { + result->CreateIfNotExists(); + } catch (const Azure::Storage::StorageException & e) + { + if (!(e.StatusCode == Azure::Core::Http::HttpStatusCode::Conflict + && e.ReasonPhrase == "The specified container already exists.")) + { + throw; + } + } + } + } + else + { + std::shared_ptr storage_shared_key_credential; + if (account_name.has_value() && account_key.has_value()) + { + storage_shared_key_credential = + std::make_shared(*account_name, *account_key); + } + + std::unique_ptr blob_service_client; + if (storage_shared_key_credential) + { + blob_service_client = std::make_unique(connection_url, storage_shared_key_credential); + } + else + { + blob_service_client = std::make_unique(connection_url); + } + + bool container_exists = containerExists(blob_service_client, container); + + std::string final_url; + size_t pos = connection_url.find('?'); + if (pos != std::string::npos) + { + auto url_without_sas = connection_url.substr(0, pos); + final_url = url_without_sas + (url_without_sas.back() == '/' ? "" : "/") + container + + connection_url.substr(pos); + } + else + final_url + = connection_url + (connection_url.back() == '/' ? 
"" : "/") + container; + + if (container_exists) + { + if (storage_shared_key_credential) + result = std::make_unique(final_url, storage_shared_key_credential); + else + result = std::make_unique(final_url); + } + else + { + if (is_read_only) + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "AzureBlobStorage container does not exist '{}'", + container); + try + { + result = std::make_unique(blob_service_client->CreateBlobContainer(container).Value); + } + catch (const Azure::Storage::StorageException & e) + { + if (e.StatusCode == Azure::Core::Http::HttpStatusCode::Conflict + && e.ReasonPhrase == "The specified container already exists.") + { + if (storage_shared_key_credential) + result = std::make_unique(final_url, storage_shared_key_credential); + else + result = std::make_unique(final_url); + } + else + { + throw; + } + } + } + } + + return result; +} + +void StorageAzureBlobConfiguration::fromNamedCollection(const NamedCollection & collection) +{ + validateNamedCollection(collection, required_configuration_keys, optional_configuration_keys); + + if (collection.has("connection_string")) + { + connection_url = collection.get("connection_string"); + is_connection_string = true; + } + + if (collection.has("storage_account_url")) + { + connection_url = collection.get("storage_account_url"); + is_connection_string = false; + } + + container = collection.get("container"); + blob_path = collection.get("blob_path"); + + if (collection.has("account_name")) + account_name = collection.get("account_name"); + + if (collection.has("account_key")) + account_key = collection.get("account_key"); + + structure = collection.getOrDefault("structure", "auto"); + format = collection.getOrDefault("format", format); + compression_method = collection.getOrDefault("compression_method", collection.getOrDefault("compression", "auto")); + + blobs_paths = {blob_path}; + if (format == "auto") + format = FormatFactory::instance().getFormatFromFileName(blob_path, true); +} + +void StorageAzureBlobConfiguration::fromAST(ASTs & engine_args, ContextPtr context, bool with_structure) +{ + if (engine_args.size() < 3 || engine_args.size() > (with_structure ? 
8 : 7)) + { + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Storage AzureBlobStorage requires 3 to 7 arguments: " + "AzureBlobStorage(connection_string|storage_account_url, container_name, blobpath, " + "[account_name, account_key, format, compression, structure)])"); + } + + for (auto & engine_arg : engine_args) + engine_arg = evaluateConstantExpressionOrIdentifierAsLiteral(engine_arg, context); + + std::unordered_map engine_args_to_idx; + + connection_url = checkAndGetLiteralArgument(engine_args[0], "connection_string/storage_account_url"); + is_connection_string = isConnectionString(connection_url); + + container = checkAndGetLiteralArgument(engine_args[1], "container"); + blob_path = checkAndGetLiteralArgument(engine_args[2], "blobpath"); + + auto is_format_arg = [] (const std::string & s) -> bool + { + return s == "auto" || FormatFactory::instance().getAllFormats().contains(s); + }; + + if (engine_args.size() == 4) + { + //'c1 UInt64, c2 UInt64 + auto fourth_arg = checkAndGetLiteralArgument(engine_args[3], "format/account_name"); + if (is_format_arg(fourth_arg)) + { + format = fourth_arg; + } + else + { + if (with_structure) + structure = fourth_arg; + else + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown format or account name specified without account key"); + } + } + else if (engine_args.size() == 5) + { + auto fourth_arg = checkAndGetLiteralArgument(engine_args[3], "format/account_name"); + if (is_format_arg(fourth_arg)) + { + format = fourth_arg; + compression_method = checkAndGetLiteralArgument(engine_args[4], "compression"); + } + else + { + account_name = fourth_arg; + account_key = checkAndGetLiteralArgument(engine_args[4], "account_key"); + } + } + else if (engine_args.size() == 6) + { + auto fourth_arg = checkAndGetLiteralArgument(engine_args[3], "format/account_name"); + if (is_format_arg(fourth_arg)) + { + if (with_structure) + { + format = fourth_arg; + compression_method = checkAndGetLiteralArgument(engine_args[4], "compression"); + structure = checkAndGetLiteralArgument(engine_args[5], "structure"); + } + else + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Format and compression must be last arguments"); + } + else + { + account_name = fourth_arg; + account_key = checkAndGetLiteralArgument(engine_args[4], "account_key"); + auto sixth_arg = checkAndGetLiteralArgument(engine_args[5], "format/account_name"); + if (is_format_arg(sixth_arg)) + format = sixth_arg; + else + { + if (with_structure) + structure = sixth_arg; + else + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown format {}", sixth_arg); + } + } + } + else if (engine_args.size() == 7) + { + auto fourth_arg = checkAndGetLiteralArgument(engine_args[3], "format/account_name"); + if (!with_structure && is_format_arg(fourth_arg)) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Format and compression must be last arguments"); + } + else + { + account_name = fourth_arg; + account_key = checkAndGetLiteralArgument(engine_args[4], "account_key"); + auto sixth_arg = checkAndGetLiteralArgument(engine_args[5], "format/account_name"); + if (!is_format_arg(sixth_arg)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown format {}", sixth_arg); + format = sixth_arg; + compression_method = checkAndGetLiteralArgument(engine_args[6], "compression"); + } + } + else if (with_structure && engine_args.size() == 8) + { + auto fourth_arg = checkAndGetLiteralArgument(engine_args[3], "format/account_name"); + account_name = fourth_arg; + account_key = checkAndGetLiteralArgument(engine_args[4], "account_key"); 
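/// In this 8-argument (with structure) form the positional layout is
/// connection_string|storage_account_url, container, blobpath, account_name, account_key,
/// format, compression, structure; the remaining format, compression and structure arguments are read just below.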
+ auto sixth_arg = checkAndGetLiteralArgument(engine_args[5], "format/account_name"); + if (!is_format_arg(sixth_arg)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown format {}", sixth_arg); + format = sixth_arg; + compression_method = checkAndGetLiteralArgument(engine_args[6], "compression"); + structure = checkAndGetLiteralArgument(engine_args[7], "structure"); + } + + blobs_paths = {blob_path}; + + if (format == "auto") + format = FormatFactory::instance().getFormatFromFileName(blob_path, true); +} + +void StorageAzureBlobConfiguration::addStructureToArgs(ASTs & args, const String & structure_, ContextPtr context) +{ + if (tryGetNamedCollectionWithOverrides(args, context)) + { + /// In case of named collection, just add key-value pair "structure='...'" + /// at the end of arguments to override existed structure. + ASTs equal_func_args = {std::make_shared("structure"), std::make_shared(structure_)}; + auto equal_func = makeASTFunction("equals", std::move(equal_func_args)); + args.push_back(equal_func); + } + else + { + if (args.size() < 3 || args.size() > 8) + { + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Storage Azure requires 3 to 7 arguments: " + "StorageObjectStorage(connection_string|storage_account_url, container_name, blobpath, [account_name, account_key, format, compression, structure])"); + } + + auto structure_literal = std::make_shared(structure_); + auto is_format_arg + = [](const std::string & s) -> bool { return s == "auto" || FormatFactory::instance().getAllFormats().contains(s); }; + + if (args.size() == 3) + { + /// Add format=auto & compression=auto before structure argument. + args.push_back(std::make_shared("auto")); + args.push_back(std::make_shared("auto")); + args.push_back(structure_literal); + } + else if (args.size() == 4) + { + auto fourth_arg = checkAndGetLiteralArgument(args[3], "format/account_name/structure"); + if (is_format_arg(fourth_arg)) + { + /// Add compression=auto before structure argument. + args.push_back(std::make_shared("auto")); + args.push_back(structure_literal); + } + else + { + args.back() = structure_literal; + } + } + else if (args.size() == 5) + { + auto fourth_arg = checkAndGetLiteralArgument(args[3], "format/account_name"); + if (!is_format_arg(fourth_arg)) + { + /// Add format=auto & compression=auto before structure argument. + args.push_back(std::make_shared("auto")); + args.push_back(std::make_shared("auto")); + } + args.push_back(structure_literal); + } + else if (args.size() == 6) + { + auto fourth_arg = checkAndGetLiteralArgument(args[3], "format/account_name"); + if (!is_format_arg(fourth_arg)) + { + /// Add compression=auto before structure argument. 
+ args.push_back(std::make_shared("auto")); + args.push_back(structure_literal); + } + else + { + args.back() = structure_literal; + } + } + else if (args.size() == 7) + { + args.push_back(structure_literal); + } + else if (args.size() == 8) + { + args.back() = structure_literal; + } + } +} + +} diff --git a/src/Storages/ObjectStorage/AzureConfiguration.h b/src/Storages/ObjectStorage/AzureConfiguration.h new file mode 100644 index 00000000000..40d718d7690 --- /dev/null +++ b/src/Storages/ObjectStorage/AzureConfiguration.h @@ -0,0 +1,54 @@ +#pragma once +#include +#include + +namespace DB +{ +class BackupFactory; + +class StorageAzureBlobConfiguration : public StorageObjectStorageConfiguration +{ + friend class BackupReaderAzureBlobStorage; + friend class BackupWriterAzureBlobStorage; + friend void registerBackupEngineAzureBlobStorage(BackupFactory & factory); + +public: + StorageAzureBlobConfiguration() = default; + StorageAzureBlobConfiguration(const StorageAzureBlobConfiguration & other); + + Path getPath() const override { return blob_path; } + void setPath(const Path & path) override { blob_path = path; } + + const Paths & getPaths() const override { return blobs_paths; } + Paths & getPaths() override { return blobs_paths; } + + String getDataSourceDescription() override { return fs::path(connection_url) / container; } + String getNamespace() const override { return container; } + + void check(ContextPtr context) const override; + StorageObjectStorageConfigurationPtr clone() override; + ObjectStoragePtr createOrUpdateObjectStorage(ContextPtr context, bool is_readonly = true) override; /// NOLINT + + void fromNamedCollection(const NamedCollection & collection) override; + void fromAST(ASTs & args, ContextPtr context, bool with_structure) override; + static void addStructureToArgs(ASTs & args, const String & structure, ContextPtr context); + +protected: + using AzureClient = Azure::Storage::Blobs::BlobContainerClient; + using AzureClientPtr = std::unique_ptr; + + std::string connection_url; + bool is_connection_string; + + std::optional account_name; + std::optional account_key; + + std::string container; + std::string blob_path; + std::vector blobs_paths; + + AzureClientPtr createClient(bool is_read_only); + AzureObjectStorage::SettingsPtr createSettings(ContextPtr local_context); +}; + +} diff --git a/src/Storages/ObjectStorage/Configuration.h b/src/Storages/ObjectStorage/Configuration.h new file mode 100644 index 00000000000..708041980e3 --- /dev/null +++ b/src/Storages/ObjectStorage/Configuration.h @@ -0,0 +1,55 @@ +#pragma once +#include +#include + +namespace DB +{ + +class StorageObjectStorageConfiguration; +using StorageObjectStorageConfigurationPtr = std::shared_ptr; + +class StorageObjectStorageConfiguration +{ +public: + StorageObjectStorageConfiguration() = default; + virtual ~StorageObjectStorageConfiguration() = default; + + using Path = std::string; + using Paths = std::vector; + + virtual Path getPath() const = 0; + virtual void setPath(const Path & path) = 0; + + virtual const Paths & getPaths() const = 0; + virtual Paths & getPaths() = 0; + + virtual String getDataSourceDescription() = 0; + virtual String getNamespace() const = 0; + + bool isPathWithGlobs() const { return getPath().find_first_of("*?{") != std::string::npos; } + bool isNamespaceWithGlobs() const { return getNamespace().find_first_of("*?{") != std::string::npos; } + + std::string getPathWithoutGlob() const { return getPath().substr(0, getPath().find_first_of("*?{")); } + + virtual bool withWildcard() 
const + { + static const String PARTITION_ID_WILDCARD = "{_partition_id}"; + return getPath().find(PARTITION_ID_WILDCARD) != String::npos; + } + + virtual void check(ContextPtr context) const = 0; + virtual StorageObjectStorageConfigurationPtr clone() = 0; + + virtual ObjectStoragePtr createOrUpdateObjectStorage(ContextPtr context, bool is_readonly = true) = 0; /// NOLINT + + virtual void fromNamedCollection(const NamedCollection & collection) = 0; + virtual void fromAST(ASTs & args, ContextPtr context, bool with_structure) = 0; + + String format = "auto"; + String compression_method = "auto"; + String structure = "auto"; +}; + +using StorageObjectStorageConfigurationPtr = std::shared_ptr; + +} diff --git a/src/Storages/ObjectStorage/HDFSConfiguration.h b/src/Storages/ObjectStorage/HDFSConfiguration.h new file mode 100644 index 00000000000..f42cedf459d --- /dev/null +++ b/src/Storages/ObjectStorage/HDFSConfiguration.h @@ -0,0 +1,81 @@ +#pragma once +#include "config.h" + +#if USE_HDFS + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ +namespace ErrorCodes +{ + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; +} + +class StorageHDFSConfiguration : public StorageObjectStorageConfiguration +{ +public: + Path getPath() const override { return path; } + void setPath(const Path & path_) override { path = path_; } + + const Paths & getPaths() const override { return paths; } + Paths & getPaths() override { return paths; } + + String getNamespace() const override { return ""; } + String getDataSourceDescription() override { return url; } + + void check(ContextPtr context) const override + { + context->getRemoteHostFilter().checkURL(Poco::URI(url)); + checkHDFSURL(url); + } + StorageObjectStorageConfigurationPtr clone() override + { + auto configuration = std::make_shared(); + return configuration; + } + + ObjectStoragePtr createOrUpdateObjectStorage(ContextPtr context, bool is_readonly = true) override /// NOLINT + { + UNUSED(is_readonly); + auto settings = std::make_unique(); + return std::make_shared(url, std::move(settings), context->getConfigRef()); + } + + void fromNamedCollection(const NamedCollection &) override {} + void fromAST(ASTs & args, ContextPtr, bool /* with_structure */) override + { + url = checkAndGetLiteralArgument(args[0], "url"); + + String format_name = "auto"; + if (args.size() > 1) + format_name = checkAndGetLiteralArgument(args[1], "format_name"); + + if (format_name == "auto") + format_name = FormatFactory::instance().getFormatFromFileName(url, true); + + String compression_method; + if (args.size() == 3) + { + compression_method = checkAndGetLiteralArgument(args[2], "compression_method"); + } else compression_method = "auto"; + + } + static void addStructureToArgs(ASTs &, const String &, ContextPtr) {} + +private: + String url; + String path; + std::vector paths; +}; + +} + +#endif diff --git a/src/Storages/ObjectStorage/ReadBufferIterator.h b/src/Storages/ObjectStorage/ReadBufferIterator.h new file mode 100644 index 00000000000..248700e2edf --- /dev/null +++ b/src/Storages/ObjectStorage/ReadBufferIterator.h @@ -0,0 +1,197 @@ +#pragma once +#include +#include +#include +#include +#include +#include + + +namespace DB +{ +namespace ErrorCodes +{ + extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; + +} + +template +class ReadBufferIterator : public IReadBufferIterator, WithContext +{ +public: + using Storage = StorageObjectStorage; + using Source = StorageObjectStorageSource; + using FileIterator = std::shared_ptr; + 
using ObjectInfos = typename Storage::ObjectInfos; + + ReadBufferIterator( + ObjectStoragePtr object_storage_, + Storage::ConfigurationPtr configuration_, + const FileIterator & file_iterator_, + const std::optional & format_settings_, + ObjectInfos & read_keys_, + const ContextPtr & context_) + : WithContext(context_) + , object_storage(object_storage_) + , configuration(configuration_) + , file_iterator(file_iterator_) + , format_settings(format_settings_) + , storage_settings(StorageSettings::create(context_->getSettingsRef())) + , read_keys(read_keys_) + , prev_read_keys_size(read_keys_.size()) + { + } + + std::pair, std::optional> next() override + { + /// For default mode check cached columns for currently read keys on first iteration. + if (first && storage_settings.schema_inference_mode == SchemaInferenceMode::DEFAULT) + { + if (auto cached_columns = tryGetColumnsFromCache(read_keys.begin(), read_keys.end())) + return {nullptr, cached_columns}; + } + + current_object_info = file_iterator->next(0); + if (current_object_info->relative_path.empty()) + { + if (first) + { + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "Cannot extract table structure from {} format file, " + "because there are no files with provided path. " + "You must specify table structure manually", + configuration->format); + } + return {nullptr, std::nullopt}; + } + + first = false; + + /// File iterator could get new keys after new iteration, + /// check them in schema cache if schema inference mode is default. + if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT + && read_keys.size() > prev_read_keys_size) + { + auto columns_from_cache = tryGetColumnsFromCache(read_keys.begin() + prev_read_keys_size, read_keys.end()); + prev_read_keys_size = read_keys.size(); + if (columns_from_cache) + return {nullptr, columns_from_cache}; + } + else if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::UNION) + { + ObjectInfos paths = {current_object_info}; + if (auto columns_from_cache = tryGetColumnsFromCache(paths.begin(), paths.end())) + return {nullptr, columns_from_cache}; + } + + first = false; + + std::unique_ptr read_buffer = object_storage->readObject( + StoredObject(current_object_info->relative_path), + getContext()->getReadSettings(), + {}, + current_object_info->metadata.size_bytes); + + read_buffer = wrapReadBufferWithCompressionMethod( + std::move(read_buffer), + chooseCompressionMethod(current_object_info->relative_path, configuration->compression_method), + static_cast(getContext()->getSettingsRef().zstd_window_log_max)); + + return {std::move(read_buffer), std::nullopt}; + } + + void setNumRowsToLastFile(size_t num_rows) override + { + if (storage_settings.schema_inference_use_cache) + { + Storage::getSchemaCache(getContext()).addNumRows( + getKeyForSchemaCache(current_object_info->relative_path), num_rows); + } + } + + void setSchemaToLastFile(const ColumnsDescription & columns) override + { + if (storage_settings.schema_inference_use_cache + && storage_settings.schema_inference_mode == SchemaInferenceMode::UNION) + { + Storage::getSchemaCache(getContext()).addColumns( + getKeyForSchemaCache(current_object_info->relative_path), columns); + } + } + + void setResultingSchema(const ColumnsDescription & columns) override + { + if (storage_settings.schema_inference_use_cache + && storage_settings.schema_inference_mode == SchemaInferenceMode::DEFAULT) + { + 
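/// In DEFAULT schema inference mode one schema is inferred for the whole set of files,
/// so the resulting columns are cached under every read key at once; in UNION mode the
/// per-file schemas were already cached by setSchemaToLastFile() above.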
Storage::getSchemaCache(getContext()).addManyColumns(getPathsForSchemaCache(), columns); + } + } + + String getLastFileName() const override { return current_object_info->relative_path; } + +private: + SchemaCache::Key getKeyForSchemaCache(const String & path) const + { + auto source = fs::path(configuration->getDataSourceDescription()) / path; + return DB::getKeyForSchemaCache(source, configuration->format, format_settings, getContext()); + } + + SchemaCache::Keys getPathsForSchemaCache() const + { + Strings sources; + sources.reserve(read_keys.size()); + std::transform( + read_keys.begin(), read_keys.end(), + std::back_inserter(sources), + [&](const auto & elem) + { + return fs::path(configuration->getDataSourceDescription()) / elem->relative_path; + }); + return DB::getKeysForSchemaCache(sources, configuration->format, format_settings, getContext()); + } + + std::optional tryGetColumnsFromCache( + const ObjectInfos::iterator & begin, + const ObjectInfos::iterator & end) + { + if (!storage_settings.schema_inference_use_cache) + return std::nullopt; + + auto & schema_cache = Storage::getSchemaCache(getContext()); + for (auto it = begin; it < end; ++it) + { + const auto & object_info = (*it); + auto get_last_mod_time = [&] -> std::optional + { + if (object_info->metadata.last_modified) + return object_info->metadata.last_modified->epochMicroseconds(); + else + { + object_info->metadata = object_storage->getObjectMetadata(object_info->relative_path); + return object_info->metadata.last_modified->epochMicroseconds(); + } + }; + + auto cache_key = getKeyForSchemaCache(object_info->relative_path); + auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time); + if (columns) + return columns; + } + + return std::nullopt; + } + + ObjectStoragePtr object_storage; + const Storage::ConfigurationPtr configuration; + const FileIterator file_iterator; + const std::optional & format_settings; + const StorageObjectStorageSettings storage_settings; + ObjectInfos & read_keys; + + size_t prev_read_keys_size; + Storage::ObjectInfoPtr current_object_info; + bool first = true; +}; +} diff --git a/src/Storages/ObjectStorage/ReadFromObjectStorage.h b/src/Storages/ObjectStorage/ReadFromObjectStorage.h new file mode 100644 index 00000000000..9cb77dcc25e --- /dev/null +++ b/src/Storages/ObjectStorage/ReadFromObjectStorage.h @@ -0,0 +1,105 @@ +#pragma once +#include +#include +#include +#include + +namespace DB +{ + +template +class ReadFromStorageObejctStorage : public SourceStepWithFilter +{ +public: + using Storage = StorageObjectStorage; + using Source = StorageObjectStorageSource; + + ReadFromStorageObejctStorage( + ObjectStoragePtr object_storage_, + Storage::ConfigurationPtr configuration_, + const String & name_, + const NamesAndTypesList & virtual_columns_, + const std::optional & format_settings_, + bool distributed_processing_, + ReadFromFormatInfo info_, + const bool need_only_count_, + ContextPtr context_, + size_t max_block_size_, + size_t num_streams_) + : SourceStepWithFilter(DataStream{.header = info_.source_header}) + , object_storage(object_storage_) + , configuration(configuration_) + , context(std::move(context_)) + , info(std::move(info_)) + , virtual_columns(virtual_columns_) + , format_settings(format_settings_) + , name(name_ + "Source") + , need_only_count(need_only_count_) + , max_block_size(max_block_size_) + , num_streams(num_streams_) + , distributed_processing(distributed_processing_) + { + } + + std::string getName() const override { return name; } + + void applyFilters() 
override + { + auto filter_actions_dag = ActionsDAG::buildFilterActionsDAG(filter_nodes.nodes); + const ActionsDAG::Node * predicate = nullptr; + if (filter_actions_dag) + predicate = filter_actions_dag->getOutputs().at(0); + + createIterator(predicate); + } + + void initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &) override + { + createIterator(nullptr); + + Pipes pipes; + for (size_t i = 0; i < num_streams; ++i) + { + pipes.emplace_back(std::make_shared( + getName(), object_storage, configuration, info, format_settings, + context, max_block_size, iterator_wrapper, need_only_count)); + } + + auto pipe = Pipe::unitePipes(std::move(pipes)); + if (pipe.empty()) + pipe = Pipe(std::make_shared(info.source_header)); + + for (const auto & processor : pipe.getProcessors()) + processors.emplace_back(processor); + + pipeline.init(std::move(pipe)); + } + +private: + ObjectStoragePtr object_storage; + Storage::ConfigurationPtr configuration; + ContextPtr context; + + const ReadFromFormatInfo info; + const NamesAndTypesList virtual_columns; + const std::optional format_settings; + const String name; + const bool need_only_count; + const size_t max_block_size; + const size_t num_streams; + const bool distributed_processing; + + std::shared_ptr iterator_wrapper; + + void createIterator(const ActionsDAG::Node * predicate) + { + if (!iterator_wrapper) + { + iterator_wrapper = Source::createFileIterator( + configuration, object_storage, distributed_processing, context, + predicate, virtual_columns, nullptr, context->getFileProgressCallback()); + } + } +}; + +} diff --git a/src/Storages/ObjectStorage/S3Configuration.cpp b/src/Storages/ObjectStorage/S3Configuration.cpp new file mode 100644 index 00000000000..5a5412019f5 --- /dev/null +++ b/src/Storages/ObjectStorage/S3Configuration.cpp @@ -0,0 +1,491 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ +namespace ErrorCodes +{ + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; +} + +static const std::unordered_set required_configuration_keys = { + "url", +}; + +static const std::unordered_set optional_configuration_keys = { + "format", + "compression", + "compression_method", + "structure", + "access_key_id", + "secret_access_key", + "session_token", + "filename", + "use_environment_credentials", + "max_single_read_retries", + "min_upload_part_size", + "upload_part_size_multiply_factor", + "upload_part_size_multiply_parts_count_threshold", + "max_single_part_upload_size", + "max_connections", + "expiration_window_seconds", + "no_sign_request" +}; + +String StorageS3Configuration::getDataSourceDescription() +{ + return fs::path(url.uri.getHost() + std::to_string(url.uri.getPort())) / url.bucket; +} + +void StorageS3Configuration::check(ContextPtr context) const +{ + context->getGlobalContext()->getRemoteHostFilter().checkURL(url.uri); + context->getGlobalContext()->getHTTPHeaderFilter().checkHeaders(headers_from_ast); +} + +StorageObjectStorageConfigurationPtr StorageS3Configuration::clone() +{ + auto configuration = std::make_shared(); + configuration->url = url; + configuration->auth_settings = auth_settings; + configuration->request_settings = request_settings; + configuration->static_configuration = static_configuration; + configuration->headers_from_ast = headers_from_ast; + configuration->keys = keys; + configuration->initialized = initialized; + return configuration; +} + +ObjectStoragePtr StorageS3Configuration::createOrUpdateObjectStorage(ContextPtr 
context, bool /* is_readonly */) /// NOLINT +{ + auto s3_settings = context->getStorageS3Settings().getSettings(url.uri.toString()); + request_settings = s3_settings.request_settings; + request_settings.updateFromSettings(context->getSettings()); + + if (!initialized || (!static_configuration && auth_settings.hasUpdates(s3_settings.auth_settings))) + { + auth_settings.updateFrom(s3_settings.auth_settings); + keys[0] = url.key; + initialized = true; + } + + const auto & config = context->getConfigRef(); + auto s3_capabilities = S3Capabilities + { + .support_batch_delete = config.getBool("s3.support_batch_delete", true), + .support_proxy = config.getBool("s3.support_proxy", config.has("s3.proxy")), + }; + + auto s3_storage_settings = std::make_unique( + request_settings, + config.getUInt64("s3.min_bytes_for_seek", 1024 * 1024), + config.getInt("s3.list_object_keys_size", 1000), + config.getInt("s3.objects_chunk_size_to_delete", 1000), + config.getBool("s3.readonly", false)); + + auto key_generator = createObjectStorageKeysGeneratorAsIsWithPrefix(url.key); + auto client = createClient(context); + std::string disk_name = "StorageS3"; + + return std::make_shared( + std::move(client), std::move(s3_storage_settings), url, s3_capabilities, key_generator, /*disk_name*/disk_name); +} + +std::unique_ptr StorageS3Configuration::createClient(ContextPtr context) +{ + const Settings & global_settings = context->getGlobalContext()->getSettingsRef(); + const Settings & local_settings = context->getSettingsRef(); + + auto client_configuration = S3::ClientFactory::instance().createClientConfiguration( + auth_settings.region, + context->getRemoteHostFilter(), + static_cast(global_settings.s3_max_redirects), + static_cast(global_settings.s3_retry_attempts), + global_settings.enable_s3_requests_logging, + /* for_disk_s3 = */ false, + request_settings.get_request_throttler, + request_settings.put_request_throttler, + url.uri.getScheme()); + + client_configuration.endpointOverride = url.endpoint; + client_configuration.maxConnections = static_cast(request_settings.max_connections); + client_configuration.http_connection_pool_size = global_settings.s3_http_connection_pool_size; + + auto headers = auth_settings.headers; + if (!headers_from_ast.empty()) + headers.insert(headers.end(), headers_from_ast.begin(), headers_from_ast.end()); + + client_configuration.requestTimeoutMs = request_settings.request_timeout_ms; + + S3::ClientSettings client_settings{ + .use_virtual_addressing = url.is_virtual_hosted_style, + .disable_checksum = local_settings.s3_disable_checksum, + .gcs_issue_compose_request = context->getConfigRef().getBool("s3.gcs_issue_compose_request", false), + }; + + auto credentials = Aws::Auth::AWSCredentials(auth_settings.access_key_id, + auth_settings.secret_access_key, + auth_settings.session_token); + + auto credentials_configuration = S3::CredentialsConfiguration + { + auth_settings.use_environment_credentials.value_or(context->getConfigRef().getBool("s3.use_environment_credentials", true)), + auth_settings.use_insecure_imds_request.value_or(context->getConfigRef().getBool("s3.use_insecure_imds_request", false)), + auth_settings.expiration_window_seconds.value_or(context->getConfigRef().getUInt64("s3.expiration_window_seconds", S3::DEFAULT_EXPIRATION_WINDOW_SECONDS)), + auth_settings.no_sign_request.value_or(context->getConfigRef().getBool("s3.no_sign_request", false)), + }; + + return S3::ClientFactory::instance().create( + client_configuration, + client_settings, + credentials.GetAWSAccessKeyId(), 
+        credentials.GetAWSSecretKey(),
+        auth_settings.server_side_encryption_customer_key_base64,
+        auth_settings.server_side_encryption_kms_config,
+        std::move(headers),
+        credentials_configuration);
+}
+
+void StorageS3Configuration::fromNamedCollection(const NamedCollection & collection)
+{
+    validateNamedCollection(collection, required_configuration_keys, optional_configuration_keys);
+
+    auto filename = collection.getOrDefault("filename", "");
+    if (!filename.empty())
+        url = S3::URI(std::filesystem::path(collection.get("url")) / filename);
+    else
+        url = S3::URI(collection.get("url"));
+
+    auth_settings.access_key_id = collection.getOrDefault("access_key_id", "");
+    auth_settings.secret_access_key = collection.getOrDefault("secret_access_key", "");
+    auth_settings.use_environment_credentials = collection.getOrDefault("use_environment_credentials", 1);
+    auth_settings.no_sign_request = collection.getOrDefault("no_sign_request", false);
+    auth_settings.expiration_window_seconds = collection.getOrDefault("expiration_window_seconds", S3::DEFAULT_EXPIRATION_WINDOW_SECONDS);
+
+    format = collection.getOrDefault("format", format);
+    compression_method = collection.getOrDefault("compression_method", collection.getOrDefault("compression", "auto"));
+    structure = collection.getOrDefault("structure", "auto");
+
+    request_settings = S3Settings::RequestSettings(collection);
+
+    static_configuration = !auth_settings.access_key_id.empty() || auth_settings.no_sign_request.has_value();
+
+    keys = {url.key};
+
+    //if (format == "auto" && get_format_from_file)
+    if (format == "auto")
+        format = FormatFactory::instance().getFormatFromFileName(url.key, true);
+}
+
+void StorageS3Configuration::fromAST(ASTs & args, ContextPtr context, bool with_structure)
+{
+    /// Supported signatures:
+    /// S3('url')
+    /// S3('url', 'format')
+    /// S3('url', 'format', 'compression')
+    /// S3('url', NOSIGN)
+    /// S3('url', NOSIGN, 'format')
+    /// S3('url', NOSIGN, 'format', 'compression')
+    /// S3('url', 'aws_access_key_id', 'aws_secret_access_key')
+    /// S3('url', 'aws_access_key_id', 'aws_secret_access_key', 'session_token')
+    /// S3('url', 'aws_access_key_id', 'aws_secret_access_key', 'format')
+    /// S3('url', 'aws_access_key_id', 'aws_secret_access_key', 'session_token', 'format')
+    /// S3('url', 'aws_access_key_id', 'aws_secret_access_key', 'format', 'compression')
+    /// S3('url', 'aws_access_key_id', 'aws_secret_access_key', 'session_token', 'format', 'compression')
+    /// with optional headers() function
+
+    size_t count = StorageURL::evalArgsAndCollectHeaders(args, headers_from_ast, context);
+
+    if (count == 0 || count > (with_structure ? 7 : 6))
+        throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
+                        "Storage S3 requires 1 to 6 arguments: "
+                        "url, [NOSIGN | access_key_id, secret_access_key, [session_token]], name of used format and [compression_method]");
+
+    std::unordered_map engine_args_to_idx;
+    bool no_sign_request = false;
+
+    /// For 2 arguments we support 2 possible variants:
+    /// - s3(source, format)
+    /// - s3(source, NOSIGN)
+    /// We can distinguish them by looking at the 2-nd argument: check if it's NOSIGN or not.
+    if (count == 2)
+    {
+        auto second_arg = checkAndGetLiteralArgument(args[1], "format/NOSIGN");
+        if (boost::iequals(second_arg, "NOSIGN"))
+            no_sign_request = true;
+        else
+            engine_args_to_idx = {{"format", 1}};
+    }
+    /// For 3 arguments we support 3 possible variants:
+    /// - s3(source, format, compression_method)
+    /// - s3(source, access_key_id, secret_access_key)
+    /// - s3(source, NOSIGN, format)
+    /// We can distinguish them by looking at the 2-nd argument: check if it's NOSIGN or a format name.
+    else if (count == 3)
+    {
+        auto second_arg = checkAndGetLiteralArgument(args[1], "format/access_key_id/NOSIGN");
+        if (boost::iequals(second_arg, "NOSIGN"))
+        {
+            no_sign_request = true;
+            engine_args_to_idx = {{"format", 2}};
+        }
+        else if (second_arg == "auto" || FormatFactory::instance().getAllFormats().contains(second_arg))
+        {
+            if (with_structure)
+                engine_args_to_idx = {{"format", 1}, {"structure", 2}};
+            else
+                engine_args_to_idx = {{"format", 1}, {"compression_method", 2}};
+        }
+        else
+            engine_args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}};
+    }
+    /// For 4 arguments we support the following variants:
+    /// if with_structure == 0:
+    /// - s3(source, access_key_id, secret_access_key, session_token)
+    /// - s3(source, access_key_id, secret_access_key, format)
+    /// - s3(source, NOSIGN, format, compression_method)
+    /// if with_structure == 1:
+    /// - s3(source, format, structure, compression_method)
+    /// - s3(source, access_key_id, secret_access_key, format)
+    /// - s3(source, access_key_id, secret_access_key, session_token)
+    /// - s3(source, NOSIGN, format, structure)
+    /// We can distinguish them by looking at the 2-nd argument: check if it's NOSIGN or not.
+    else if (count == 4)
+    {
+        auto second_arg = checkAndGetLiteralArgument(args[1], "access_key_id/NOSIGN");
+        if (boost::iequals(second_arg, "NOSIGN"))
+        {
+            no_sign_request = true;
+            if (with_structure)
+                engine_args_to_idx = {{"format", 2}, {"structure", 3}};
+            else
+                engine_args_to_idx = {{"format", 2}, {"compression_method", 3}};
+        }
+        else if (with_structure && (second_arg == "auto" || FormatFactory::instance().getAllFormats().contains(second_arg)))
+        {
+            engine_args_to_idx = {{"format", 1}, {"structure", 2}, {"compression_method", 3}};
+        }
+        else
+        {
+            auto fourth_arg = checkAndGetLiteralArgument(args[3], "session_token/format");
+            if (fourth_arg == "auto" || FormatFactory::instance().getAllFormats().contains(fourth_arg))
+            {
+                engine_args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}, {"format", 3}};
+            }
+            else
+            {
+                engine_args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}, {"session_token", 3}};
+            }
+        }
+    }
+    /// For 5 arguments we support the following variants:
+    /// if with_structure == 0:
+    /// - s3(source, access_key_id, secret_access_key, session_token, format)
+    /// - s3(source, access_key_id, secret_access_key, format, compression)
+    /// if with_structure == 1:
+    /// - s3(source, access_key_id, secret_access_key, format, structure)
+    /// - s3(source, access_key_id, secret_access_key, session_token, format)
+    /// - s3(source, NOSIGN, format, structure, compression_method)
+    else if (count == 5)
+    {
+        if (with_structure)
+        {
+            auto second_arg = checkAndGetLiteralArgument(args[1], "NOSIGN/access_key_id");
+            if (boost::iequals(second_arg, "NOSIGN"))
+            {
+                no_sign_request = true;
+                engine_args_to_idx = {{"format", 2}, {"structure", 3}, {"compression_method", 4}};
+            }
+            else
+            {
+                auto fourth_arg = checkAndGetLiteralArgument(args[3], "format/session_token");
+                if (fourth_arg ==
"auto" || FormatFactory::instance().getAllFormats().contains(fourth_arg)) + { + engine_args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}, {"format", 3}, {"structure", 4}}; + } + else + { + engine_args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}, {"session_token", 3}, {"format", 4}}; + } + } + } + else + { + auto fourth_arg = checkAndGetLiteralArgument(args[3], "session_token/format"); + if (fourth_arg == "auto" || FormatFactory::instance().getAllFormats().contains(fourth_arg)) + { + engine_args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}, {"format", 3}, {"compression_method", 4}}; + } + else + { + engine_args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}, {"session_token", 3}, {"format", 4}}; + } + } + } + else if (count == 6) + { + if (with_structure) + { + /// - s3(source, access_key_id, secret_access_key, format, structure, compression_method) + /// - s3(source, access_key_id, secret_access_key, session_token, format, structure) + /// We can distinguish them by looking at the 4-th argument: check if it's a format name or not + auto fourth_arg = checkAndGetLiteralArgument(args[3], "format/session_token"); + if (fourth_arg == "auto" || FormatFactory::instance().getAllFormats().contains(fourth_arg)) + { + engine_args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}, {"format", 3}, {"structure", 4}, {"compression_method", 5}}; + } + else + { + engine_args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}, {"session_token", 3}, {"format", 4}, {"structure", 5}}; + } + } + else + { + engine_args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}, {"session_token", 3}, {"format", 4}, {"compression_method", 5}}; + } + } + else if (with_structure && count == 7) + { + engine_args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}, {"session_token", 3}, {"format", 4}, {"structure", 5}, {"compression_method", 6}}; + } + + /// This argument is always the first + url = S3::URI(checkAndGetLiteralArgument(args[0], "url")); + + if (engine_args_to_idx.contains("format")) + { + format = checkAndGetLiteralArgument(args[engine_args_to_idx["format"]], "format"); + /// Set format to configuration only of it's not 'auto', + /// because we can have default format set in configuration. 
+ if (format != "auto") + format = format; + } + + if (engine_args_to_idx.contains("structure")) + structure = checkAndGetLiteralArgument(args[engine_args_to_idx["structure"]], "structure"); + + if (engine_args_to_idx.contains("compression_method")) + compression_method = checkAndGetLiteralArgument(args[engine_args_to_idx["compression_method"]], "compression_method"); + + if (engine_args_to_idx.contains("access_key_id")) + auth_settings.access_key_id = checkAndGetLiteralArgument(args[engine_args_to_idx["access_key_id"]], "access_key_id"); + + if (engine_args_to_idx.contains("secret_access_key")) + auth_settings.secret_access_key = checkAndGetLiteralArgument(args[engine_args_to_idx["secret_access_key"]], "secret_access_key"); + + if (engine_args_to_idx.contains("session_token")) + auth_settings.session_token = checkAndGetLiteralArgument(args[engine_args_to_idx["session_token"]], "session_token"); + + if (no_sign_request) + auth_settings.no_sign_request = no_sign_request; + + static_configuration = !auth_settings.access_key_id.empty() || auth_settings.no_sign_request.has_value(); + auth_settings.no_sign_request = no_sign_request; + + keys = {url.key}; + + // if (format == "auto" && get_format_from_file) + if (format == "auto") + format = FormatFactory::instance().getFormatFromFileName(url.key, true); +} + +void StorageS3Configuration::addStructureToArgs(ASTs & args, const String & structure_, ContextPtr context) +{ + if (tryGetNamedCollectionWithOverrides(args, context)) + { + /// In case of named collection, just add key-value pair "structure='...'" + /// at the end of arguments to override existed structure. + ASTs equal_func_args = {std::make_shared("structure"), std::make_shared(structure_)}; + auto equal_func = makeASTFunction("equals", std::move(equal_func_args)); + args.push_back(equal_func); + } + else + { + HTTPHeaderEntries tmp_headers; + size_t count = StorageURL::evalArgsAndCollectHeaders(args, tmp_headers, context); + + if (count == 0 || count > 6) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected 1 to 6 arguments in table function, got {}", count); + + auto structure_literal = std::make_shared(structure_); + + /// s3(s3_url) + if (count == 1) + { + /// Add format=auto before structure argument. + args.push_back(std::make_shared("auto")); + args.push_back(structure_literal); + } + /// s3(s3_url, format) or s3(s3_url, NOSIGN) + /// We can distinguish them by looking at the 2-nd argument: check if it's NOSIGN or not. + else if (count == 2) + { + auto second_arg = checkAndGetLiteralArgument(args[1], "format/NOSIGN"); + /// If there is NOSIGN, add format=auto before structure. + if (boost::iequals(second_arg, "NOSIGN")) + args.push_back(std::make_shared("auto")); + args.push_back(structure_literal); + } + /// s3(source, format, structure) or + /// s3(source, access_key_id, secret_access_key) or + /// s3(source, NOSIGN, format) + /// We can distinguish them by looking at the 2-nd argument: check if it's NOSIGN, format name or neither. + else if (count == 3) + { + auto second_arg = checkAndGetLiteralArgument(args[1], "format/NOSIGN"); + if (boost::iequals(second_arg, "NOSIGN")) + { + args.push_back(structure_literal); + } + else if (second_arg == "auto" || FormatFactory::instance().getAllFormats().contains(second_arg)) + { + args[count - 1] = structure_literal; + } + else + { + /// Add format=auto before structure argument. 
+ args.push_back(std::make_shared("auto")); + args.push_back(structure_literal); + } + } + /// s3(source, format, structure, compression_method) or + /// s3(source, access_key_id, secret_access_key, format) or + /// s3(source, NOSIGN, format, structure) + /// We can distinguish them by looking at the 2-nd argument: check if it's NOSIGN, format name or neither. + else if (count == 4) + { + auto second_arg = checkAndGetLiteralArgument(args[1], "format/NOSIGN"); + if (boost::iequals(second_arg, "NOSIGN")) + { + args[count - 1] = structure_literal; + } + else if (second_arg == "auto" || FormatFactory::instance().getAllFormats().contains(second_arg)) + { + args[count - 2] = structure_literal; + } + else + { + args.push_back(structure_literal); + } + } + /// s3(source, access_key_id, secret_access_key, format, structure) or + /// s3(source, NOSIGN, format, structure, compression_method) + /// We can distinguish them by looking at the 2-nd argument: check if it's a NOSIGN keyword name or not. + else if (count == 5) + { + auto sedond_arg = checkAndGetLiteralArgument(args[1], "format/NOSIGN"); + if (boost::iequals(sedond_arg, "NOSIGN")) + { + args[count - 2] = structure_literal; + } + else + { + args[count - 1] = structure_literal; + } + } + /// s3(source, access_key_id, secret_access_key, format, structure, compression) + else if (count == 6) + { + args[count - 2] = structure_literal; + } + } +} + +} diff --git a/src/Storages/ObjectStorage/S3Configuration.h b/src/Storages/ObjectStorage/S3Configuration.h new file mode 100644 index 00000000000..34f5735e02a --- /dev/null +++ b/src/Storages/ObjectStorage/S3Configuration.h @@ -0,0 +1,46 @@ +#pragma once +#include +#include +#include + +namespace DB +{ + +class StorageS3Configuration : public StorageObjectStorageConfiguration +{ +public: + Path getPath() const override { return url.key; } + void setPath(const Path & path) override { url.key = path; } + + const Paths & getPaths() const override { return keys; } + Paths & getPaths() override { return keys; } + + String getNamespace() const override { return url.bucket; } + String getDataSourceDescription() override; + + void check(ContextPtr context) const override; + StorageObjectStorageConfigurationPtr clone() override; + + ObjectStoragePtr createOrUpdateObjectStorage(ContextPtr context, bool is_readonly = true) override; /// NOLINT + + void fromNamedCollection(const NamedCollection & collection) override; + void fromAST(ASTs & args, ContextPtr context, bool with_structure) override; + static void addStructureToArgs(ASTs & args, const String & structure, ContextPtr context); + +private: + S3::URI url; + S3::AuthSettings auth_settings; + S3Settings::RequestSettings request_settings; + /// If s3 configuration was passed from ast, then it is static. + /// If from config - it can be changed with config reload. + bool static_configuration = true; + /// Headers from ast is a part of static configuration. 
+ HTTPHeaderEntries headers_from_ast; + std::vector keys; + + std::unique_ptr createClient(ContextPtr context); + + bool initialized = false; +}; + +} diff --git a/src/Storages/ObjectStorage/Settings.h b/src/Storages/ObjectStorage/Settings.h new file mode 100644 index 00000000000..015cf9bc01d --- /dev/null +++ b/src/Storages/ObjectStorage/Settings.h @@ -0,0 +1,86 @@ +#pragma once +#include +#include +#include + +namespace CurrentMetrics +{ + extern const Metric ObjectStorageAzureThreads; + extern const Metric ObjectStorageAzureThreadsActive; + extern const Metric ObjectStorageAzureThreadsScheduled; + + extern const Metric ObjectStorageS3Threads; + extern const Metric ObjectStorageS3ThreadsActive; + extern const Metric ObjectStorageS3ThreadsScheduled; +} + +namespace DB +{ + +struct StorageObjectStorageSettings +{ + bool truncate_on_insert; + bool create_new_file_on_insert; + bool schema_inference_use_cache; + SchemaInferenceMode schema_inference_mode; +}; + +struct S3StorageSettings +{ + static StorageObjectStorageSettings create(const Settings & settings) + { + return StorageObjectStorageSettings{ + .truncate_on_insert = settings.s3_truncate_on_insert, + .create_new_file_on_insert = settings.s3_create_new_file_on_insert, + .schema_inference_use_cache = settings.schema_inference_use_cache_for_s3, + .schema_inference_mode = settings.schema_inference_mode, + }; + } + + static constexpr auto SCHEMA_CACHE_MAX_ELEMENTS_CONFIG_SETTING = "schema_inference_cache_max_elements_for_s3"; + + static CurrentMetrics::Metric ObjectStorageThreads() { return CurrentMetrics::ObjectStorageS3Threads; } /// NOLINT + static CurrentMetrics::Metric ObjectStorageThreadsActive() { return CurrentMetrics::ObjectStorageS3ThreadsActive; } /// NOLINT + static CurrentMetrics::Metric ObjectStorageThreadsScheduled() { return CurrentMetrics::ObjectStorageS3ThreadsScheduled; } /// NOLINT +}; + +struct AzureStorageSettings +{ + static StorageObjectStorageSettings create(const Settings & settings) + { + return StorageObjectStorageSettings{ + .truncate_on_insert = settings.azure_truncate_on_insert, + .create_new_file_on_insert = settings.azure_create_new_file_on_insert, + .schema_inference_use_cache = settings.schema_inference_use_cache_for_azure, + .schema_inference_mode = settings.schema_inference_mode, + }; + } + + static constexpr auto SCHEMA_CACHE_MAX_ELEMENTS_CONFIG_SETTING = "schema_inference_cache_max_elements_for_azure"; + + static CurrentMetrics::Metric ObjectStorageThreads() { return CurrentMetrics::ObjectStorageAzureThreads; } /// NOLINT + static CurrentMetrics::Metric ObjectStorageThreadsActive() { return CurrentMetrics::ObjectStorageAzureThreadsActive; } /// NOLINT + static CurrentMetrics::Metric ObjectStorageThreadsScheduled() { return CurrentMetrics::ObjectStorageAzureThreadsScheduled; } /// NOLINT +}; + +struct HDFSStorageSettings +{ + static StorageObjectStorageSettings create(const Settings & settings) + { + return StorageObjectStorageSettings{ + .truncate_on_insert = settings.hdfs_truncate_on_insert, + .create_new_file_on_insert = settings.hdfs_create_new_file_on_insert, + .schema_inference_use_cache = settings.schema_inference_use_cache_for_hdfs, + .schema_inference_mode = settings.schema_inference_mode, + }; + } + + static constexpr auto SCHEMA_CACHE_MAX_ELEMENTS_CONFIG_SETTING = "schema_inference_cache_max_elements_for_hdfs"; + + /// TODO: s3 -> hdfs + static CurrentMetrics::Metric ObjectStorageThreads() { return CurrentMetrics::ObjectStorageS3Threads; } /// NOLINT + static CurrentMetrics::Metric 
ObjectStorageThreadsActive() { return CurrentMetrics::ObjectStorageS3ThreadsActive; } /// NOLINT + static CurrentMetrics::Metric ObjectStorageThreadsScheduled() { return CurrentMetrics::ObjectStorageS3ThreadsScheduled; } /// NOLINT +}; + +} diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp new file mode 100644 index 00000000000..9250ab8ecbe --- /dev/null +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -0,0 +1,303 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; + extern const int DATABASE_ACCESS_DENIED; + extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; + extern const int LOGICAL_ERROR; + extern const int NOT_IMPLEMENTED; + +} + +template +std::unique_ptr getStorageMetadata( + ObjectStoragePtr object_storage, + const StorageObjectStorageConfigurationPtr & configuration, + const ColumnsDescription & columns, + const ConstraintsDescription & constraints, + std::optional format_settings, + const String & comment, + const std::string & engine_name, + const ContextPtr & context) +{ + auto storage_metadata = std::make_unique(); + if (columns.empty()) + { + auto fetched_columns = StorageObjectStorage::getTableStructureFromData( + object_storage, configuration, format_settings, context); + storage_metadata->setColumns(fetched_columns); + } + else + { + /// We don't allow special columns. + if (!columns.hasOnlyOrdinary()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Table engine {} doesn't support special columns " + "like MATERIALIZED, ALIAS or EPHEMERAL", + engine_name); + + storage_metadata->setColumns(columns); + } + + storage_metadata->setConstraints(constraints); + storage_metadata->setComment(comment); + return storage_metadata; +} + +template +StorageObjectStorage::StorageObjectStorage( + ConfigurationPtr configuration_, + ObjectStoragePtr object_storage_, + const String & engine_name_, + ContextPtr context, + const StorageID & table_id_, + const ColumnsDescription & columns_, + const ConstraintsDescription & constraints_, + const String & comment, + std::optional format_settings_, + bool distributed_processing_, + ASTPtr partition_by_) + : IStorage(table_id_, getStorageMetadata( + object_storage_, configuration_, columns_, constraints_, format_settings_, + comment, engine_name, context)) + , engine_name(engine_name_) + , virtual_columns(VirtualColumnUtils::getPathFileAndSizeVirtualsForStorage( + getInMemoryMetadataPtr()->getSampleBlock().getNamesAndTypesList())) + , format_settings(format_settings_) + , partition_by(partition_by_) + , distributed_processing(distributed_processing_) + , object_storage(object_storage_) + , configuration(configuration_) +{ + FormatFactory::instance().checkFormatName(configuration->format); + configuration->check(context); + + StoredObjects objects; + for (const auto & key : configuration->getPaths()) + objects.emplace_back(key); +} + +template +Names StorageObjectStorage::getVirtualColumnNames() +{ + return VirtualColumnUtils::getPathFileAndSizeVirtualsForStorage({}).getNames(); +} + +template +bool StorageObjectStorage::supportsSubsetOfColumns(const ContextPtr & context) const +{ + return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(configuration->format, context, format_settings); +} + +template +bool StorageObjectStorage::prefersLargeBlocks() const +{ + return 
FormatFactory::instance().checkIfOutputFormatPrefersLargeBlocks(configuration->format); +} + +template +bool StorageObjectStorage::parallelizeOutputAfterReading(ContextPtr context) const +{ + return FormatFactory::instance().checkParallelizeOutputAfterReading(configuration->format, context); +} + +template +std::pair +StorageObjectStorage::updateConfigurationAndGetCopy(ContextPtr local_context) +{ + std::lock_guard lock(configuration_update_mutex); + auto new_object_storage = configuration->createOrUpdateObjectStorage(local_context); + if (new_object_storage) + object_storage = new_object_storage; + return {configuration, object_storage}; +} + +template +SchemaCache & StorageObjectStorage::getSchemaCache(const ContextPtr & context) +{ + static SchemaCache schema_cache( + context->getConfigRef().getUInt( + StorageSettings::SCHEMA_CACHE_MAX_ELEMENTS_CONFIG_SETTING, + DEFAULT_SCHEMA_CACHE_ELEMENTS)); + return schema_cache; +} + +template +void StorageObjectStorage::read( + QueryPlan & query_plan, + const Names & column_names, + const StorageSnapshotPtr & storage_snapshot, + SelectQueryInfo & query_info, + ContextPtr local_context, + QueryProcessingStage::Enum /*processed_stage*/, + size_t max_block_size, + size_t num_streams) +{ + if (partition_by && configuration->withWildcard()) + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, + "Reading from a partitioned {} storage is not implemented yet", + getName()); + } + + auto this_ptr = std::static_pointer_cast(shared_from_this()); + auto read_from_format_info = prepareReadingFromFormat( + column_names, storage_snapshot, supportsSubsetOfColumns(local_context), getVirtuals()); + bool need_only_count = (query_info.optimize_trivial_count || read_from_format_info.requested_columns.empty()) + && local_context->getSettingsRef().optimize_count_from_files; + + auto [query_configuration, query_object_storage] = updateConfigurationAndGetCopy(local_context); + auto reading = std::make_unique>( + query_object_storage, + query_configuration, + getName(), + virtual_columns, + format_settings, + distributed_processing, + std::move(read_from_format_info), + need_only_count, + local_context, + max_block_size, + num_streams); + + query_plan.addStep(std::move(reading)); +} + +template +SinkToStoragePtr StorageObjectStorage::write( + const ASTPtr & query, + const StorageMetadataPtr & metadata_snapshot, + ContextPtr local_context, + bool /* async_insert */) +{ + auto insert_query = std::dynamic_pointer_cast(query); + auto partition_by_ast = insert_query + ? (insert_query->partition_by ? 
insert_query->partition_by : partition_by) + : nullptr; + bool is_partitioned_implementation = partition_by_ast && configuration->withWildcard(); + + auto sample_block = metadata_snapshot->getSampleBlock(); + auto storage_settings = StorageSettings::create(local_context->getSettingsRef()); + + if (is_partitioned_implementation) + { + return std::make_shared( + object_storage, configuration, format_settings, sample_block, local_context, partition_by_ast); + } + + if (configuration->isPathWithGlobs() || configuration->isNamespaceWithGlobs()) + { + throw Exception(ErrorCodes::DATABASE_ACCESS_DENIED, + "{} key '{}' contains globs, so the table is in readonly mode", + getName(), configuration->getPath()); + } + + if (!storage_settings.truncate_on_insert + && object_storage->exists(StoredObject(configuration->getPath()))) + { + if (storage_settings.create_new_file_on_insert) + { + size_t index = configuration->getPaths().size(); + const auto & first_key = configuration->getPaths()[0]; + auto pos = first_key.find_first_of('.'); + String new_key; + + do + { + new_key = first_key.substr(0, pos) + + "." + + std::to_string(index) + + (pos == std::string::npos ? "" : first_key.substr(pos)); + ++index; + } + while (object_storage->exists(StoredObject(new_key))); + + configuration->getPaths().push_back(new_key); + } + else + { + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Object in bucket {} with key {} already exists. " + "If you want to overwrite it, enable setting [engine_name]_truncate_on_insert, if you " + "want to create a new file on each insert, enable setting [engine_name]_create_new_file_on_insert", + configuration->getNamespace(), configuration->getPaths().back()); + } + } + + return std::make_shared( + object_storage, configuration, format_settings, sample_block, local_context); +} + +template +void StorageObjectStorage::truncate( + const ASTPtr &, + const StorageMetadataPtr &, + ContextPtr, + TableExclusiveLockHolder &) +{ + if (configuration->isPathWithGlobs() || configuration->isNamespaceWithGlobs()) + { + throw Exception( + ErrorCodes::DATABASE_ACCESS_DENIED, + "{} key '{}' contains globs, so the table is in readonly mode and cannot be truncated", + getName(), configuration->getPath()); + } + + StoredObjects objects; + for (const auto & key : configuration->getPaths()) + objects.emplace_back(key); + + object_storage->removeObjectsIfExist(objects); +} + +template +ColumnsDescription StorageObjectStorage::getTableStructureFromData( + ObjectStoragePtr object_storage, + const ConfigurationPtr & configuration, + const std::optional & format_settings, + ContextPtr context) +{ + using Source = StorageObjectStorageSource; + + ObjectInfos read_keys; + auto file_iterator = Source::createFileIterator( + configuration, object_storage, /* distributed_processing */false, + context, /* predicate */{}, /* virtual_columns */{}, &read_keys); + + ReadBufferIterator read_buffer_iterator( + object_storage, configuration, file_iterator, + format_settings, read_keys, context); + + const bool retry = configuration->isPathWithGlobs() || configuration->isNamespaceWithGlobs(); + return readSchemaFromFormat( + configuration->format, format_settings, + read_buffer_iterator, retry, context); +} + +template class StorageObjectStorage; +template class StorageObjectStorage; +template class StorageObjectStorage; + +} diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.h b/src/Storages/ObjectStorage/StorageObjectStorage.h new file mode 100644 index 00000000000..0b29845ba5c --- /dev/null +++ 
b/src/Storages/ObjectStorage/StorageObjectStorage.h @@ -0,0 +1,116 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +struct SelectQueryInfo; +class StorageObjectStorageConfiguration; +struct S3StorageSettings; +struct HDFSStorageSettings; +struct AzureStorageSettings; +class PullingPipelineExecutor; +using ReadTaskCallback = std::function; +class IOutputFormat; +class IInputFormat; +class SchemaCache; + + +template +class StorageObjectStorage : public IStorage +{ +public: + using Configuration = StorageObjectStorageConfiguration; + using ConfigurationPtr = std::shared_ptr; + using ObjectInfo = RelativePathWithMetadata; + using ObjectInfoPtr = std::shared_ptr; + using ObjectInfos = std::vector; + + StorageObjectStorage( + ConfigurationPtr configuration_, + ObjectStoragePtr object_storage_, + const String & engine_name_, + ContextPtr context_, + const StorageID & table_id_, + const ColumnsDescription & columns_, + const ConstraintsDescription & constraints_, + const String & comment, + std::optional format_settings_, + bool distributed_processing_ = false, + ASTPtr partition_by_ = nullptr); + + String getName() const override { return engine_name; } + + void read( + QueryPlan & query_plan, + const Names &, + const StorageSnapshotPtr &, + SelectQueryInfo &, + ContextPtr, + QueryProcessingStage::Enum, + size_t, + size_t) override; + + SinkToStoragePtr write( + const ASTPtr & query, + const StorageMetadataPtr & metadata_snapshot, + ContextPtr context, + bool async_insert) override; + + void truncate( + const ASTPtr & query, + const StorageMetadataPtr & metadata_snapshot, + ContextPtr local_context, + TableExclusiveLockHolder &) override; + + NamesAndTypesList getVirtuals() const override { return virtual_columns; } + + static Names getVirtualColumnNames(); + + bool supportsPartitionBy() const override { return true; } + + bool supportsSubcolumns() const override { return true; } + + bool supportsTrivialCountOptimization() const override { return true; } + + bool supportsSubsetOfColumns(const ContextPtr & context) const; + + bool prefersLargeBlocks() const override; + + bool parallelizeOutputAfterReading(ContextPtr context) const override; + + static SchemaCache & getSchemaCache(const ContextPtr & context); + + static ColumnsDescription getTableStructureFromData( + ObjectStoragePtr object_storage, + const ConfigurationPtr & configuration, + const std::optional & format_settings, + ContextPtr context); + +protected: + virtual std::pair + updateConfigurationAndGetCopy(ContextPtr local_context); + + const std::string engine_name; + const NamesAndTypesList virtual_columns; + std::optional format_settings; + const ASTPtr partition_by; + const bool distributed_processing; + + ObjectStoragePtr object_storage; + ConfigurationPtr configuration; + std::mutex configuration_update_mutex; +}; + +using StorageS3 = StorageObjectStorage; +using StorageAzureBlobStorage = StorageObjectStorage; +using StorageHDFS = StorageObjectStorage; + +} diff --git a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp new file mode 100644 index 00000000000..414932016f4 --- /dev/null +++ b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp @@ -0,0 +1,107 @@ +#include "Storages/ObjectStorage/StorageObjectStorageCluster.h" + +#include "config.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + 
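+/// StorageObjectStorageCluster is the *Cluster counterpart of StorageObjectStorage (e.g. s3Cluster):
+/// the initiator replica builds the file iterator and hands out one object path per request to the
+/// worker replicas through the task iterator extension implemented below.
+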
+namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + +template +StorageObjectStorageCluster::StorageObjectStorageCluster( + const String & cluster_name_, + const Storage::ConfigurationPtr & configuration_, + ObjectStoragePtr object_storage_, + const String & engine_name_, + const StorageID & table_id_, + const ColumnsDescription & columns_, + const ConstraintsDescription & constraints_, + ContextPtr context_, + bool structure_argument_was_provided_) + : IStorageCluster(cluster_name_, + table_id_, + getLogger(fmt::format("{}({})", engine_name_, table_id_.table_name)), + structure_argument_was_provided_) + , engine_name(engine_name_) + , configuration{configuration_} + , object_storage(object_storage_) +{ + configuration->check(context_); + StorageInMemoryMetadata storage_metadata; + + if (columns_.empty()) + { + /// `format_settings` is set to std::nullopt, because StorageObjectStorageCluster is used only as table function + auto columns = StorageObjectStorage::getTableStructureFromData( + object_storage, configuration, /*format_settings=*/std::nullopt, context_); + storage_metadata.setColumns(columns); + } + else + storage_metadata.setColumns(columns_); + + storage_metadata.setConstraints(constraints_); + setInMemoryMetadata(storage_metadata); + + virtual_columns = VirtualColumnUtils::getPathFileAndSizeVirtualsForStorage( + storage_metadata.getSampleBlock().getNamesAndTypesList()); +} + +template +void StorageObjectStorageCluster::addColumnsStructureToQuery( + ASTPtr & query, + const String & structure, + const ContextPtr & context) +{ + ASTExpressionList * expression_list = extractTableFunctionArgumentsFromSelectQuery(query); + if (!expression_list) + { + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Expected SELECT query from table function {}, got '{}'", + engine_name, queryToString(query)); + } + using TableFunction = TableFunctionObjectStorageCluster; + TableFunction::addColumnsStructureToArguments(expression_list->children, structure, context); +} + +template +RemoteQueryExecutor::Extension +StorageObjectStorageCluster::getTaskIteratorExtension(const ActionsDAG::Node * predicate, const ContextPtr &) const +{ + auto iterator = std::make_shared( + object_storage, configuration, predicate, virtual_columns, nullptr); + + auto callback = std::make_shared>([iterator]() mutable -> String{ return iterator->next(0)->relative_path; }); + return RemoteQueryExecutor::Extension{ .task_iterator = std::move(callback) }; +} + + +#if USE_AWS_S3 +template class StorageObjectStorageCluster; +#endif + +#if USE_AZURE_BLOB_STORAGE +template class StorageObjectStorageCluster; +#endif + +#if USE_HDFS +template class StorageObjectStorageCluster; +#endif + +} diff --git a/src/Storages/ObjectStorage/StorageObjectStorageCluster.h b/src/Storages/ObjectStorage/StorageObjectStorageCluster.h new file mode 100644 index 00000000000..b1f9af14e03 --- /dev/null +++ b/src/Storages/ObjectStorage/StorageObjectStorageCluster.h @@ -0,0 +1,72 @@ +#pragma once + +#include "config.h" + +#include +#include +#include +#include +#include + +namespace DB +{ + +class StorageS3Settings; +class StorageAzureBlobSettings; + +class Context; + +template +class StorageObjectStorageCluster : public IStorageCluster +{ +public: + using Storage = StorageObjectStorage; + using Source = StorageObjectStorageSource; + + StorageObjectStorageCluster( + const String & cluster_name_, + const Storage::ConfigurationPtr & configuration_, + ObjectStoragePtr object_storage_, + const String & engine_name_, + const StorageID & table_id_, + const 
ColumnsDescription & columns_, + const ConstraintsDescription & constraints_, + ContextPtr context_, + bool structure_argument_was_provided_); + + std::string getName() const override { return engine_name; } + + NamesAndTypesList getVirtuals() const override { return virtual_columns; } + + RemoteQueryExecutor::Extension + getTaskIteratorExtension( + const ActionsDAG::Node * predicate, + const ContextPtr & context) const override; + + bool supportsSubcolumns() const override { return true; } + + bool supportsTrivialCountOptimization() const override { return true; } + +private: + void updateBeforeRead(const ContextPtr & /* context */) override {} + + void addColumnsStructureToQuery( + ASTPtr & query, + const String & structure, + const ContextPtr & context) override; + + const String & engine_name; + const Storage::ConfigurationPtr configuration; + const ObjectStoragePtr object_storage; + NamesAndTypesList virtual_columns; +}; + + +#if USE_AWS_S3 +using StorageS3Cluster = StorageObjectStorageCluster; +#endif +#if USE_AZURE_BLOB_STORAGE +using StorageAzureBlobCluster = StorageObjectStorageCluster; +#endif + +} diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSink.h b/src/Storages/ObjectStorage/StorageObjectStorageSink.h new file mode 100644 index 00000000000..34ab8ebec66 --- /dev/null +++ b/src/Storages/ObjectStorage/StorageObjectStorageSink.h @@ -0,0 +1,155 @@ +#pragma once +#include +#include +#include +#include +#include + +namespace DB +{ +class StorageObjectStorageSink : public SinkToStorage +{ +public: + StorageObjectStorageSink( + ObjectStoragePtr object_storage, + StorageObjectStorageConfigurationPtr configuration, + std::optional format_settings_, + const Block & sample_block_, + ContextPtr context, + const std::string & blob_path = "") + : SinkToStorage(sample_block_) + , sample_block(sample_block_) + , format_settings(format_settings_) + { + const auto & settings = context->getSettingsRef(); + const auto path = blob_path.empty() ? configuration->getPaths().back() : blob_path; + const auto chosen_compression_method = chooseCompressionMethod(path, configuration->compression_method); + + auto buffer = object_storage->writeObject( + StoredObject(path), WriteMode::Rewrite, std::nullopt, DBMS_DEFAULT_BUFFER_SIZE, context->getWriteSettings()); + + write_buf = wrapWriteBufferWithCompressionMethod( + std::move(buffer), + chosen_compression_method, + static_cast(settings.output_format_compression_level), + static_cast(settings.output_format_compression_zstd_window_log)); + + writer = FormatFactory::instance().getOutputFormatParallelIfPossible( + configuration->format, *write_buf, sample_block, context, format_settings); + } + + String getName() const override { return "StorageObjectStorageSink"; } + + void consume(Chunk chunk) override + { + std::lock_guard lock(cancel_mutex); + if (cancelled) + return; + writer->write(getHeader().cloneWithColumns(chunk.detachColumns())); + } + + void onCancel() override + { + std::lock_guard lock(cancel_mutex); + finalize(); + cancelled = true; + } + + void onException(std::exception_ptr exception) override + { + std::lock_guard lock(cancel_mutex); + try + { + std::rethrow_exception(exception); + } + catch (...) + { + /// An exception context is needed to proper delete write buffers without finalization. 
+ release(); + } + } + + void onFinish() override + { + std::lock_guard lock(cancel_mutex); + finalize(); + } + +private: + const Block sample_block; + const std::optional format_settings; + + std::unique_ptr write_buf; + OutputFormatPtr writer; + bool cancelled = false; + std::mutex cancel_mutex; + + void finalize() + { + if (!writer) + return; + + try + { + writer->finalize(); + writer->flush(); + write_buf->finalize(); + } + catch (...) + { + /// Stop ParallelFormattingOutputFormat correctly. + release(); + throw; + } + } + + void release() + { + writer.reset(); + write_buf->finalize(); + } +}; + +class PartitionedStorageObjectStorageSink : public PartitionedSink +{ +public: + PartitionedStorageObjectStorageSink( + ObjectStoragePtr object_storage_, + StorageObjectStorageConfigurationPtr configuration_, + std::optional format_settings_, + const Block & sample_block_, + ContextPtr context_, + const ASTPtr & partition_by) + : PartitionedSink(partition_by, context_, sample_block_) + , object_storage(object_storage_) + , configuration(configuration_) + , format_settings(format_settings_) + , sample_block(sample_block_) + , context(context_) + { + } + + SinkPtr createSinkForPartition(const String & partition_id) override + { + auto blob = configuration->getPaths().back(); + auto partition_key = replaceWildcards(blob, partition_id); + validatePartitionKey(partition_key, true); + return std::make_shared( + object_storage, + configuration, + format_settings, + sample_block, + context, + partition_key + ); + } + +private: + ObjectStoragePtr object_storage; + StorageObjectStorageConfigurationPtr configuration; + const std::optional format_settings; + const Block sample_block; + const ContextPtr context; +}; + +} diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp new file mode 100644 index 00000000000..9fc7925a6d1 --- /dev/null +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -0,0 +1,464 @@ +#include "StorageObjectStorageSource.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace ProfileEvents +{ + extern const Event EngineFileLikeReadFiles; +} + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int CANNOT_COMPILE_REGEXP; +} + +template +std::shared_ptr::IIterator> +StorageObjectStorageSource::createFileIterator( + Storage::ConfigurationPtr configuration, + ObjectStoragePtr object_storage, + bool distributed_processing, + const ContextPtr & local_context, + const ActionsDAG::Node * predicate, + const NamesAndTypesList & virtual_columns, + ObjectInfos * read_keys, + std::function file_progress_callback) +{ + if (distributed_processing) + return std::make_shared(local_context->getReadTaskCallback()); + + if (configuration->isNamespaceWithGlobs()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Expression can not have wildcards inside namespace name"); + + if (configuration->isPathWithGlobs()) + { + /// Iterate through disclosed globs and make a source for each file + return std::make_shared( + object_storage, configuration, predicate, virtual_columns, read_keys, file_progress_callback); + } + else + { + return std::make_shared( + object_storage, configuration, virtual_columns, read_keys, file_progress_callback); + } +} + +template +StorageObjectStorageSource::GlobIterator::GlobIterator( + ObjectStoragePtr object_storage_, + Storage::ConfigurationPtr configuration_, + const ActionsDAG::Node * 
predicate, + const NamesAndTypesList & virtual_columns_, + ObjectInfos * read_keys_, + std::function file_progress_callback_) + : object_storage(object_storage_) + , configuration(configuration_) + , virtual_columns(virtual_columns_) + , read_keys(read_keys_) + , file_progress_callback(file_progress_callback_) +{ + if (configuration->isNamespaceWithGlobs()) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Expression can not have wildcards inside namespace name"); + } + else if (configuration->isPathWithGlobs()) + { + const auto key_with_globs = configuration_->getPath(); + const auto key_prefix = configuration->getPathWithoutGlob(); + object_storage_iterator = object_storage->iterate(key_prefix); + + matcher = std::make_unique(makeRegexpPatternFromGlobs(key_with_globs)); + if (matcher->ok()) + { + recursive = key_with_globs == "/**"; + filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns); + } + else + { + throw Exception( + ErrorCodes::CANNOT_COMPILE_REGEXP, + "Cannot compile regex from glob ({}): {}", key_with_globs, matcher->error()); + } + } + else + { + const auto key_with_globs = configuration_->getPath(); + auto object_metadata = object_storage->getObjectMetadata(key_with_globs); + auto object_info = std::make_shared(key_with_globs, object_metadata); + + object_infos.emplace_back(object_info); + if (read_keys) + read_keys->emplace_back(object_info); + + if (file_progress_callback) + file_progress_callback(FileProgress(0, object_metadata.size_bytes)); + + is_finished = true; + } +} + +template +StorageObjectStorageSource::ObjectInfoPtr +StorageObjectStorageSource::GlobIterator::next(size_t /* processor */) +{ + std::lock_guard lock(next_mutex); + + if (is_finished && index >= object_infos.size()) + return {}; + + bool need_new_batch = object_infos.empty() || index >= object_infos.size(); + + if (need_new_batch) + { + ObjectInfos new_batch; + while (new_batch.empty()) + { + auto result = object_storage_iterator->getCurrentBatchAndScheduleNext(); + if (result.has_value()) + { + new_batch = result.value(); + } + else + { + is_finished = true; + return {}; + } + + for (auto it = new_batch.begin(); it != new_batch.end();) + { + if (!recursive && !re2::RE2::FullMatch((*it)->relative_path, *matcher)) + it = new_batch.erase(it); + else + ++it; + } + } + + index = 0; + + if (filter_dag) + { + std::vector paths; + paths.reserve(new_batch.size()); + for (auto & object_info : new_batch) + paths.push_back(fs::path(configuration->getNamespace()) / object_info->relative_path); + + VirtualColumnUtils::filterByPathOrFile(new_batch, paths, filter_dag, virtual_columns, getContext()); + } + + if (read_keys) + read_keys->insert(read_keys->end(), new_batch.begin(), new_batch.end()); + + object_infos = std::move(new_batch); + if (file_progress_callback) + { + for (const auto & object_info : object_infos) + { + file_progress_callback(FileProgress(0, object_info->metadata.size_bytes)); + } + } + } + + size_t current_index = index++; + if (current_index >= object_infos.size()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Index out of bound for blob metadata"); + + return object_infos[current_index]; +} + +template +StorageObjectStorageSource::KeysIterator::KeysIterator( + ObjectStoragePtr object_storage_, + Storage::ConfigurationPtr configuration_, + const NamesAndTypesList & virtual_columns_, + ObjectInfos * read_keys_, + std::function file_progress_callback_) + : object_storage(object_storage_) + , configuration(configuration_) + , virtual_columns(virtual_columns_) + , 
file_progress_callback(file_progress_callback_) + , keys(configuration->getPaths()) +{ + if (read_keys_) + { + /// TODO: should we add metadata if we anyway fetch it if file_progress_callback is passed? + for (auto && key : keys) + { + auto object_info = std::make_shared(key, ObjectMetadata{}); + read_keys_->emplace_back(object_info); + } + } +} + +template +StorageObjectStorageSource::ObjectInfoPtr +StorageObjectStorageSource::KeysIterator::next(size_t /* processor */) +{ + size_t current_index = index.fetch_add(1, std::memory_order_relaxed); + if (current_index >= keys.size()) + return {}; + + auto key = keys[current_index]; + + ObjectMetadata metadata{}; + if (file_progress_callback) + { + metadata = object_storage->getObjectMetadata(key); + file_progress_callback(FileProgress(0, metadata.size_bytes)); + } + + return std::make_shared(key, metadata); +} + +template +Chunk StorageObjectStorageSource::generate() +{ + while (true) + { + if (isCancelled() || !reader) + { + if (reader) + reader->cancel(); + break; + } + + Chunk chunk; + if (reader->pull(chunk)) + { + UInt64 num_rows = chunk.getNumRows(); + total_rows_in_file += num_rows; + size_t chunk_size = 0; + if (const auto * input_format = reader.getInputFormat()) + chunk_size = input_format->getApproxBytesReadForChunk(); + progress(num_rows, chunk_size ? chunk_size : chunk.bytes()); + + VirtualColumnUtils::addRequestedPathFileAndSizeVirtualsToChunk( + chunk, + read_from_format_info.requested_virtual_columns, + fs::path(configuration->getNamespace()) / reader.getRelativePath(), + reader.getObjectInfo().metadata.size_bytes); + + return chunk; + } + + if (reader.getInputFormat() && getContext()->getSettingsRef().use_cache_for_count_from_files) + addNumRowsToCache(reader.getRelativePath(), total_rows_in_file); + + total_rows_in_file = 0; + + assert(reader_future.valid()); + reader = reader_future.get(); + + if (!reader) + break; + + /// Even if task is finished the thread may be not freed in pool. + /// So wait until it will be freed before scheduling a new task. 
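+        /// While the current `reader` is being drained above, `reader_future` already prepares the reader
+        /// for the next object on `create_reader_pool`, so parsing one file overlaps with opening the next.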
+ create_reader_pool.wait(); + reader_future = createReaderAsync(); + } + + return {}; +} + +template +void StorageObjectStorageSource::addNumRowsToCache(const String & path, size_t num_rows) +{ + String source = fs::path(configuration->getDataSourceDescription()) / path; + auto cache_key = getKeyForSchemaCache(source, configuration->format, format_settings, getContext()); + Storage::getSchemaCache(getContext()).addNumRows(cache_key, num_rows); +} + +template +std::optional StorageObjectStorageSource::tryGetNumRowsFromCache(const ObjectInfoPtr & object_info) +{ + String source = fs::path(configuration->getDataSourceDescription()) / object_info->relative_path; + auto cache_key = getKeyForSchemaCache(source, configuration->format, format_settings, getContext()); + auto get_last_mod_time = [&]() -> std::optional + { + auto last_mod = object_info->metadata.last_modified; + if (last_mod) + return last_mod->epochTime(); + else + { + object_info->metadata = object_storage->getObjectMetadata(object_info->relative_path); + return object_info->metadata.last_modified->epochMicroseconds(); + } + }; + return Storage::getSchemaCache(getContext()).tryGetNumRows(cache_key, get_last_mod_time); +} + +template +StorageObjectStorageSource::StorageObjectStorageSource( + String name_, + ObjectStoragePtr object_storage_, + Storage::ConfigurationPtr configuration_, + const ReadFromFormatInfo & info, + std::optional format_settings_, + ContextPtr context_, + UInt64 max_block_size_, + std::shared_ptr file_iterator_, + bool need_only_count_) + :ISource(info.source_header, false) + , WithContext(context_) + , name(std::move(name_)) + , object_storage(object_storage_) + , configuration(configuration_) + , format_settings(format_settings_) + , max_block_size(max_block_size_) + , need_only_count(need_only_count_) + , read_from_format_info(info) + , columns_desc(info.columns_description) + , file_iterator(file_iterator_) + , create_reader_pool(StorageSettings::ObjectStorageThreads(), + StorageSettings::ObjectStorageThreadsActive(), + StorageSettings::ObjectStorageThreadsScheduled(), 1) + , create_reader_scheduler(threadPoolCallbackRunner(create_reader_pool, "Reader")) +{ + reader = createReader(); + if (reader) + reader_future = createReaderAsync(); +} + +template +StorageObjectStorageSource::~StorageObjectStorageSource() +{ + create_reader_pool.wait(); +} + +template +StorageObjectStorageSource::ReaderHolder +StorageObjectStorageSource::createReader(size_t processor) +{ + auto object_info = file_iterator->next(processor); + if (object_info->relative_path.empty()) + return {}; + + if (object_info->metadata.size_bytes == 0) + object_info->metadata = object_storage->getObjectMetadata(object_info->relative_path); + + QueryPipelineBuilder builder; + std::shared_ptr source; + std::unique_ptr read_buf; + std::optional num_rows_from_cache = need_only_count + && getContext()->getSettingsRef().use_cache_for_count_from_files + ? tryGetNumRowsFromCache(object_info) + : std::nullopt; + + if (num_rows_from_cache) + { + /// We should not return single chunk with all number of rows, + /// because there is a chance that this chunk will be materialized later + /// (it can cause memory problems even with default values in columns or when virtual columns are requested). + /// Instead, we use special ConstChunkGenerator that will generate chunks + /// with max_block_size rows until total number of rows is reached. 
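+        /// For example, if num_rows_from_cache = 1'000'000 and max_block_size = 65536,
+        /// this produces 15 chunks of 65536 rows and a final chunk of 16960 rows.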
+ source = std::make_shared( + read_from_format_info.format_header, *num_rows_from_cache, max_block_size); + builder.init(Pipe(source)); + } + else + { + std::optional max_parsing_threads; + if (need_only_count) + max_parsing_threads = 1; + + auto compression_method = chooseCompressionMethod( + object_info->relative_path, configuration->compression_method); + + read_buf = createReadBuffer(object_info->relative_path, object_info->metadata.size_bytes); + + auto input_format = FormatFactory::instance().getInput( + configuration->format, *read_buf, read_from_format_info.format_header, + getContext(), max_block_size, format_settings, max_parsing_threads, + std::nullopt, /* is_remote_fs */ true, compression_method); + + if (need_only_count) + input_format->needOnlyCount(); + + builder.init(Pipe(input_format)); + + if (columns_desc.hasDefaults()) + { + builder.addSimpleTransform( + [&](const Block & header) + { + return std::make_shared(header, columns_desc, *input_format, getContext()); + }); + } + + source = input_format; + } + + /// Add ExtractColumnsTransform to extract requested columns/subcolumns + /// from chunk read by IInputFormat. + builder.addSimpleTransform([&](const Block & header) + { + return std::make_shared(header, read_from_format_info.requested_columns); + }); + + auto pipeline = std::make_unique(QueryPipelineBuilder::getPipeline(std::move(builder))); + auto current_reader = std::make_unique(*pipeline); + + ProfileEvents::increment(ProfileEvents::EngineFileLikeReadFiles); + + return ReaderHolder{object_info, std::move(read_buf), + std::move(source), std::move(pipeline), std::move(current_reader)}; +} + +template +std::future::ReaderHolder> +StorageObjectStorageSource::createReaderAsync(size_t processor) +{ + return create_reader_scheduler([=, this] { return createReader(processor); }, Priority{}); +} + +template +std::unique_ptr StorageObjectStorageSource::createReadBuffer(const String & key, size_t object_size) +{ + auto read_settings = getContext()->getReadSettings().adjustBufferSize(object_size); + read_settings.enable_filesystem_cache = false; + read_settings.remote_read_min_bytes_for_seek = read_settings.remote_fs_buffer_size; + + // auto download_buffer_size = getContext()->getSettings().max_download_buffer_size; + // const bool object_too_small = object_size <= 2 * download_buffer_size; + + // Create a read buffer that will prefetch the first ~1 MB of the file. + // When reading lots of tiny files, this prefetching almost doubles the throughput. + // For bigger files, parallel reading is more useful. 
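+    // Note: the prefetching branch below is currently commented out, so this function always falls
+    // through to the plain object_storage->readObject() call at the end.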
+ // if (object_too_small && read_settings.remote_fs_method == RemoteFSReadMethod::threadpool) + // { + // LOG_TRACE(log, "Downloading object of size {} with initial prefetch", object_size); + + // auto async_reader = object_storage->readObjects( + // StoredObjects{StoredObject{key, /* local_path */ "", object_size}}, read_settings); + + // async_reader->setReadUntilEnd(); + // if (read_settings.remote_fs_prefetch) + // async_reader->prefetch(DEFAULT_PREFETCH_PRIORITY); + + // return async_reader; + // } + // else + return object_storage->readObject(StoredObject(key), read_settings); +} + +template class StorageObjectStorageSource; +template class StorageObjectStorageSource; +template class StorageObjectStorageSource; + +} diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.h b/src/Storages/ObjectStorage/StorageObjectStorageSource.h new file mode 100644 index 00000000000..f68a5d47456 --- /dev/null +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.h @@ -0,0 +1,217 @@ +#pragma once +#include +#include +#include + + +namespace DB +{ +template +class StorageObjectStorageSource : public ISource, WithContext +{ + friend class StorageS3QueueSource; +public: + using Source = StorageObjectStorageSource; + using Storage = StorageObjectStorage; + using ObjectInfo = Storage::ObjectInfo; + using ObjectInfoPtr = Storage::ObjectInfoPtr; + using ObjectInfos = Storage::ObjectInfos; + + class IIterator : public WithContext + { + public: + virtual ~IIterator() = default; + + virtual size_t estimatedKeysCount() = 0; + virtual ObjectInfoPtr next(size_t processor) = 0; + }; + + class ReadTaskIterator; + class GlobIterator; + class KeysIterator; + + StorageObjectStorageSource( + String name_, + ObjectStoragePtr object_storage_, + Storage::ConfigurationPtr configuration, + const ReadFromFormatInfo & info, + std::optional format_settings_, + ContextPtr context_, + UInt64 max_block_size_, + std::shared_ptr file_iterator_, + bool need_only_count_); + + ~StorageObjectStorageSource() override; + + String getName() const override { return name; } + + Chunk generate() override; + + static std::shared_ptr createFileIterator( + Storage::ConfigurationPtr configuration, + ObjectStoragePtr object_storage, + bool distributed_processing, + const ContextPtr & local_context, + const ActionsDAG::Node * predicate, + const NamesAndTypesList & virtual_columns, + ObjectInfos * read_keys, + std::function file_progress_callback = {}); + +protected: + void addNumRowsToCache(const String & path, size_t num_rows); + std::optional tryGetNumRowsFromCache(const ObjectInfoPtr & object_info); + + const String name; + ObjectStoragePtr object_storage; + const Storage::ConfigurationPtr configuration; + const std::optional format_settings; + const UInt64 max_block_size; + const bool need_only_count; + const ReadFromFormatInfo read_from_format_info; + + ColumnsDescription columns_desc; + std::shared_ptr file_iterator; + size_t total_rows_in_file = 0; + + struct ReaderHolder + { + public: + ReaderHolder( + ObjectInfoPtr object_info_, + std::unique_ptr read_buf_, + std::shared_ptr source_, + std::unique_ptr pipeline_, + std::unique_ptr reader_) + : object_info(std::move(object_info_)) + , read_buf(std::move(read_buf_)) + , source(std::move(source_)) + , pipeline(std::move(pipeline_)) + , reader(std::move(reader_)) + { + } + + ReaderHolder() = default; + ReaderHolder(const ReaderHolder & other) = delete; + ReaderHolder & operator=(const ReaderHolder & other) = delete; + ReaderHolder(ReaderHolder && other) noexcept { *this = 
std::move(other); } + + ReaderHolder & operator=(ReaderHolder && other) noexcept + { + /// The order of destruction is important. + /// reader uses pipeline, pipeline uses read_buf. + reader = std::move(other.reader); + pipeline = std::move(other.pipeline); + source = std::move(other.source); + read_buf = std::move(other.read_buf); + object_info = std::move(other.object_info); + return *this; + } + + explicit operator bool() const { return reader != nullptr; } + PullingPipelineExecutor * operator->() { return reader.get(); } + const PullingPipelineExecutor * operator->() const { return reader.get(); } + const String & getRelativePath() const { return object_info->relative_path; } + const ObjectInfo & getObjectInfo() const { return *object_info; } + const IInputFormat * getInputFormat() const { return dynamic_cast(source.get()); } + + private: + ObjectInfoPtr object_info; + std::unique_ptr read_buf; + std::shared_ptr source; + std::unique_ptr pipeline; + std::unique_ptr reader; + }; + + ReaderHolder reader; + LoggerPtr log = getLogger("StorageObjectStorageSource"); + ThreadPool create_reader_pool; + ThreadPoolCallbackRunner create_reader_scheduler; + std::future reader_future; + + /// Recreate ReadBuffer and Pipeline for each file. + ReaderHolder createReader(size_t processor = 0); + std::future createReaderAsync(size_t processor = 0); + + std::unique_ptr createReadBuffer(const String & key, size_t object_size); +}; + +template +class StorageObjectStorageSource::ReadTaskIterator : public IIterator +{ +public: + explicit ReadTaskIterator(const ReadTaskCallback & callback_) : callback(callback_) {} + + size_t estimatedKeysCount() override { return 0; } /// TODO FIXME + + ObjectInfoPtr next(size_t) override { return std::make_shared( callback(), ObjectMetadata{} ); } + +private: + ReadTaskCallback callback; +}; + +template +class StorageObjectStorageSource::GlobIterator : public IIterator +{ +public: + GlobIterator( + ObjectStoragePtr object_storage_, + Storage::ConfigurationPtr configuration_, + const ActionsDAG::Node * predicate, + const NamesAndTypesList & virtual_columns_, + ObjectInfos * read_keys_, + std::function file_progress_callback_ = {}); + + ~GlobIterator() override = default; + + size_t estimatedKeysCount() override { return object_infos.size(); } + + ObjectInfoPtr next(size_t processor) override; + +private: + ObjectStoragePtr object_storage; + Storage::ConfigurationPtr configuration; + ActionsDAGPtr filter_dag; + NamesAndTypesList virtual_columns; + + size_t index = 0; + + ObjectInfos object_infos; + ObjectInfos * read_keys; + ObjectStorageIteratorPtr object_storage_iterator; + bool recursive{false}; + + std::unique_ptr matcher; + + void createFilterAST(const String & any_key); + bool is_finished = false; + std::mutex next_mutex; + + std::function file_progress_callback; +}; + +template +class StorageObjectStorageSource::KeysIterator : public IIterator +{ +public: + KeysIterator( + ObjectStoragePtr object_storage_, + Storage::ConfigurationPtr configuration_, + const NamesAndTypesList & virtual_columns_, + ObjectInfos * read_keys_, + std::function file_progress_callback = {}); + + ~KeysIterator() override = default; + + size_t estimatedKeysCount() override { return keys.size(); } + + ObjectInfoPtr next(size_t processor) override; + +private: + const ObjectStoragePtr object_storage; + const Storage::ConfigurationPtr configuration; + const NamesAndTypesList virtual_columns; + const std::function file_progress_callback; + const std::vector keys; + std::atomic index = 0; +}; +} diff 
--git a/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp b/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp new file mode 100644 index 00000000000..bc9f93690f5 --- /dev/null +++ b/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp @@ -0,0 +1,166 @@ +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +static void initializeConfiguration( + StorageObjectStorageConfiguration & configuration, + ASTs & engine_args, + ContextPtr local_context, + bool with_table_structure) +{ + if (auto named_collection = tryGetNamedCollectionWithOverrides(engine_args, local_context)) + configuration.fromNamedCollection(*named_collection); + else + configuration.fromAST(engine_args, local_context, with_table_structure); +} + +template +static std::shared_ptr> createStorageObjectStorage( + const StorageFactory::Arguments & args, + typename StorageObjectStorage::ConfigurationPtr configuration, + const String & engine_name, + ContextPtr context) +{ + auto & engine_args = args.engine_args; + if (engine_args.empty()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "External data source must have arguments"); + + // Use format settings from global server context + settings from + // the SETTINGS clause of the create query. Settings from current + // session and user are ignored. + std::optional format_settings; + if (args.storage_def->settings) + { + FormatFactorySettings user_format_settings; + + // Apply changed settings from global context, but ignore the + // unknown ones, because we only have the format settings here. + const auto & changes = context->getSettingsRef().changes(); + for (const auto & change : changes) + { + if (user_format_settings.has(change.name)) + user_format_settings.set(change.name, change.value); + } + + // Apply changes from SETTINGS clause, with validation. 
+ user_format_settings.applyChanges(args.storage_def->settings->changes); + format_settings = getFormatSettings(context, user_format_settings); + } + else + { + format_settings = getFormatSettings(context); + } + + ASTPtr partition_by; + if (args.storage_def->partition_by) + partition_by = args.storage_def->partition_by->clone(); + + return std::make_shared>( + configuration, + configuration->createOrUpdateObjectStorage(context), + engine_name, + args.getContext(), + args.table_id, + args.columns, + args.constraints, + args.comment, + format_settings, + /* distributed_processing */ false, + partition_by); +} + +#if USE_AZURE_BLOB_STORAGE +void registerStorageAzure(StorageFactory & factory) +{ + factory.registerStorage("AzureBlobStorage", [](const StorageFactory::Arguments & args) + { + auto context = args.getLocalContext(); + auto configuration = std::make_shared(); + initializeConfiguration(*configuration, args.engine_args, context, false); + return createStorageObjectStorage(args, configuration, "Azure", context); + }, + { + .supports_settings = true, + .supports_sort_order = true, // for partition by + .supports_schema_inference = true, + .source_access_type = AccessType::AZURE, + }); +} +#endif + +#if USE_AWS_S3 +void registerStorageS3Impl(const String & name, StorageFactory & factory) +{ + factory.registerStorage(name, [=](const StorageFactory::Arguments & args) + { + auto context = args.getLocalContext(); + auto configuration = std::make_shared(); + initializeConfiguration(*configuration, args.engine_args, context, false); + return createStorageObjectStorage(args, configuration, name, context); + }, + { + .supports_settings = true, + .supports_sort_order = true, // for partition by + .supports_schema_inference = true, + .source_access_type = AccessType::S3, + }); +} + +void registerStorageS3(StorageFactory & factory) +{ + return registerStorageS3Impl("S3", factory); +} + +void registerStorageCOS(StorageFactory & factory) +{ + return registerStorageS3Impl("COSN", factory); +} + +void registerStorageOSS(StorageFactory & factory) +{ + return registerStorageS3Impl("OSS", factory); +} + +#endif + +#if USE_HDFS +void registerStorageHDFS(StorageFactory & factory) +{ + factory.registerStorage("HDFS", [=](const StorageFactory::Arguments & args) + { + auto context = args.getLocalContext(); + auto configuration = std::make_shared(); + initializeConfiguration(*configuration, args.engine_args, context, false); + return createStorageObjectStorage(args, configuration, "HDFS", context); + }, + { + .supports_settings = true, + .supports_sort_order = true, // for partition by + .supports_schema_inference = true, + .source_access_type = AccessType::HDFS, + }); +} +#endif + +void registerStorageObjectStorage(StorageFactory & factory) +{ +#if USE_AWS_S3 + registerStorageS3(factory); + registerStorageCOS(factory); + registerStorageOSS(factory); +#endif +#if USE_AZURE_BLOB_STORAGE + registerStorageAzure(factory); +#endif +#if USE_HDFS + registerStorageHDFS(factory); +#endif +} + +} diff --git a/src/Storages/ObjectStorageConfiguration.h b/src/Storages/ObjectStorageConfiguration.h new file mode 100644 index 00000000000..e69de29bb2d diff --git a/src/Storages/S3Queue/S3QueueSource.cpp b/src/Storages/S3Queue/S3QueueSource.cpp index b4f5f957f76..bd34d1ec093 100644 --- a/src/Storages/S3Queue/S3QueueSource.cpp +++ b/src/Storages/S3Queue/S3QueueSource.cpp @@ -5,9 +5,9 @@ #include #include #include -#include #include #include +#include namespace CurrentMetrics @@ -31,11 +31,11 @@ namespace ErrorCodes extern const 
int LOGICAL_ERROR; } -StorageS3QueueSource::S3QueueKeyWithInfo::S3QueueKeyWithInfo( +StorageS3QueueSource::S3QueueObjectInfo::S3QueueObjectInfo( const std::string & key_, - std::optional info_, + const ObjectMetadata & object_metadata_, Metadata::ProcessingNodeHolderPtr processing_holder_) - : StorageS3Source::KeyWithInfo(key_, info_) + : Source::ObjectInfo(key_, object_metadata_) , processing_holder(processing_holder_) { } @@ -55,15 +55,15 @@ StorageS3QueueSource::FileIterator::FileIterator( if (sharded_processing) { for (const auto & id : metadata->getProcessingIdsForShard(current_shard)) - sharded_keys.emplace(id, std::deque{}); + sharded_keys.emplace(id, std::deque{}); } } -StorageS3QueueSource::KeyWithInfoPtr StorageS3QueueSource::FileIterator::next(size_t idx) +StorageS3QueueSource::Source::ObjectInfoPtr StorageS3QueueSource::FileIterator::next(size_t processor) { while (!shutdown_called) { - KeyWithInfoPtr val{nullptr}; + Source::ObjectInfoPtr val{nullptr}; { std::unique_lock lk(sharded_keys_mutex, std::defer_lock); @@ -73,7 +73,7 @@ StorageS3QueueSource::KeyWithInfoPtr StorageS3QueueSource::FileIterator::next(si /// we need to check sharded_keys and to next() under lock. lk.lock(); - if (auto it = sharded_keys.find(idx); it != sharded_keys.end()) + if (auto it = sharded_keys.find(processor); it != sharded_keys.end()) { auto & keys = it->second; if (!keys.empty()) @@ -86,24 +86,24 @@ StorageS3QueueSource::KeyWithInfoPtr StorageS3QueueSource::FileIterator::next(si { throw Exception(ErrorCodes::LOGICAL_ERROR, "Processing id {} does not exist (Expected ids: {})", - idx, fmt::join(metadata->getProcessingIdsForShard(current_shard), ", ")); + processor, fmt::join(metadata->getProcessingIdsForShard(current_shard), ", ")); } } if (!val) { - val = glob_iterator->next(); + val = glob_iterator->next(processor); if (val && sharded_processing) { - const auto processing_id_for_key = metadata->getProcessingIdForPath(val->key); - if (idx != processing_id_for_key) + const auto processing_id_for_key = metadata->getProcessingIdForPath(val->relative_path); + if (processor != processing_id_for_key) { if (metadata->isProcessingIdBelongsToShard(processing_id_for_key, current_shard)) { LOG_TEST(log, "Putting key {} into queue of processor {} (total: {})", - val->key, processing_id_for_key, sharded_keys.size()); + val->relative_path, processing_id_for_key, sharded_keys.size()); - if (auto it = sharded_keys.find(idx); it != sharded_keys.end()) + if (auto it = sharded_keys.find(processor); it != sharded_keys.end()) { it->second.push_back(val); } @@ -111,7 +111,7 @@ StorageS3QueueSource::KeyWithInfoPtr StorageS3QueueSource::FileIterator::next(si { throw Exception(ErrorCodes::LOGICAL_ERROR, "Processing id {} does not exist (Expected ids: {})", - idx, fmt::join(metadata->getProcessingIdsForShard(current_shard), ", ")); + processor, fmt::join(metadata->getProcessingIdsForShard(current_shard), ", ")); } } continue; @@ -129,25 +129,25 @@ StorageS3QueueSource::KeyWithInfoPtr StorageS3QueueSource::FileIterator::next(si return {}; } - auto processing_holder = metadata->trySetFileAsProcessing(val->key); + auto processing_holder = metadata->trySetFileAsProcessing(val->relative_path); if (shutdown_called) { LOG_TEST(log, "Shutdown was called, stopping file iterator"); return {}; } - LOG_TEST(log, "Checking if can process key {} for processing_id {}", val->key, idx); + LOG_TEST(log, "Checking if can process key {} for processing_id {}", val->relative_path, processor); if (processing_holder) { - return 
std::make_shared(val->key, val->info, processing_holder); + return std::make_shared(val->relative_path, val->metadata, processing_holder); } else if (sharded_processing - && metadata->getFileStatus(val->key)->state == S3QueueFilesMetadata::FileStatus::State::Processing) + && metadata->getFileStatus(val->relative_path)->state == S3QueueFilesMetadata::FileStatus::State::Processing) { throw Exception(ErrorCodes::LOGICAL_ERROR, "File {} is processing by someone else in sharded processing. " - "It is a bug", val->key); + "It is a bug", val->relative_path); } } return {}; @@ -161,7 +161,7 @@ size_t StorageS3QueueSource::FileIterator::estimatedKeysCount() StorageS3QueueSource::StorageS3QueueSource( String name_, const Block & header_, - std::unique_ptr internal_source_, + std::unique_ptr internal_source_, std::shared_ptr files_metadata_, size_t processing_id_, const S3QueueAction & action_, @@ -190,38 +190,19 @@ StorageS3QueueSource::StorageS3QueueSource( { } -StorageS3QueueSource::~StorageS3QueueSource() -{ - internal_source->create_reader_pool.wait(); -} - String StorageS3QueueSource::getName() const { return name; } -void StorageS3QueueSource::lazyInitialize() -{ - if (initialized) - return; - - internal_source->lazyInitialize(processing_id); - reader = std::move(internal_source->reader); - if (reader) - reader_future = std::move(internal_source->reader_future); - initialized = true; -} - Chunk StorageS3QueueSource::generate() { - lazyInitialize(); - while (true) { if (!reader) break; - const auto * key_with_info = dynamic_cast(&reader.getKeyWithInfo()); + const auto * key_with_info = dynamic_cast(&reader.getObjectInfo()); auto file_status = key_with_info->processing_holder->getFileStatus(); if (isCancelled()) @@ -239,7 +220,7 @@ Chunk StorageS3QueueSource::generate() tryLogCurrentException(__PRETTY_FUNCTION__); } - appendLogElement(reader.getFile(), *file_status, processed_rows_from_file, false); + appendLogElement(reader.getRelativePath(), *file_status, processed_rows_from_file, false); } break; @@ -254,7 +235,7 @@ Chunk StorageS3QueueSource::generate() { LOG_DEBUG( log, "Table is being dropped, {} rows are already processed from {}, but file is not fully processed", - processed_rows_from_file, reader.getFile()); + processed_rows_from_file, reader.getRelativePath()); try { @@ -265,7 +246,7 @@ Chunk StorageS3QueueSource::generate() tryLogCurrentException(__PRETTY_FUNCTION__); } - appendLogElement(reader.getFile(), *file_status, processed_rows_from_file, false); + appendLogElement(reader.getRelativePath(), *file_status, processed_rows_from_file, false); /// Leave the file half processed. Table is being dropped, so we do not care. break; @@ -273,7 +254,7 @@ Chunk StorageS3QueueSource::generate() LOG_DEBUG(log, "Shutdown called, but file {} is partially processed ({} rows). 
" "Will process the file fully and then shutdown", - reader.getFile(), processed_rows_from_file); + reader.getRelativePath(), processed_rows_from_file); } auto * prev_scope = CurrentThread::get().attachProfileCountersScope(&file_status->profile_counters); @@ -287,30 +268,30 @@ Chunk StorageS3QueueSource::generate() Chunk chunk; if (reader->pull(chunk)) { - LOG_TEST(log, "Read {} rows from file: {}", chunk.getNumRows(), reader.getPath()); + LOG_TEST(log, "Read {} rows from file: {}", chunk.getNumRows(), reader.getRelativePath()); file_status->processed_rows += chunk.getNumRows(); processed_rows_from_file += chunk.getNumRows(); - VirtualColumnUtils::addRequestedPathFileAndSizeVirtualsToChunk(chunk, requested_virtual_columns, reader.getPath(), reader.getKeyWithInfo().info->size); + VirtualColumnUtils::addRequestedPathFileAndSizeVirtualsToChunk(chunk, requested_virtual_columns, reader.getRelativePath(), reader.getObjectInfo().metadata.size_bytes); return chunk; } } catch (...) { const auto message = getCurrentExceptionMessage(true); - LOG_ERROR(log, "Got an error while pulling chunk. Will set file {} as failed. Error: {} ", reader.getFile(), message); + LOG_ERROR(log, "Got an error while pulling chunk. Will set file {} as failed. Error: {} ", reader.getRelativePath(), message); files_metadata->setFileFailed(key_with_info->processing_holder, message); - appendLogElement(reader.getFile(), *file_status, processed_rows_from_file, false); + appendLogElement(reader.getRelativePath(), *file_status, processed_rows_from_file, false); throw; } files_metadata->setFileProcessed(key_with_info->processing_holder); - applyActionAfterProcessing(reader.getFile()); + applyActionAfterProcessing(reader.getRelativePath()); - appendLogElement(reader.getFile(), *file_status, processed_rows_from_file, true); + appendLogElement(reader.getRelativePath(), *file_status, processed_rows_from_file, true); file_status.reset(); processed_rows_from_file = 0; @@ -326,7 +307,7 @@ Chunk StorageS3QueueSource::generate() if (!reader) break; - file_status = files_metadata->getFileStatus(reader.getFile()); + file_status = files_metadata->getFileStatus(reader.getRelativePath()); /// Even if task is finished the thread may be not freed in pool. /// So wait until it will be freed before scheduling a new task. 
diff --git a/src/Storages/S3Queue/S3QueueSource.h b/src/Storages/S3Queue/S3QueueSource.h index 8fc7305ea08..fcf5c5c0160 100644 --- a/src/Storages/S3Queue/S3QueueSource.h +++ b/src/Storages/S3Queue/S3QueueSource.h @@ -5,7 +5,9 @@ #include #include #include -#include +#include +#include +#include #include @@ -14,28 +16,32 @@ namespace Poco { class Logger; } namespace DB { +struct ObjectMetadata; + class StorageS3QueueSource : public ISource, WithContext { public: - using IIterator = StorageS3Source::IIterator; - using KeyWithInfoPtr = StorageS3Source::KeyWithInfoPtr; - using GlobIterator = StorageS3Source::DisclosedGlobIterator; + using Storage = StorageObjectStorage; + using Source = StorageObjectStorageSource; + + using ConfigurationPtr = Storage::ConfigurationPtr; + using GlobIterator = Source::GlobIterator; using ZooKeeperGetter = std::function; using RemoveFileFunc = std::function; using FileStatusPtr = S3QueueFilesMetadata::FileStatusPtr; using Metadata = S3QueueFilesMetadata; - struct S3QueueKeyWithInfo : public StorageS3Source::KeyWithInfo + struct S3QueueObjectInfo : public Source::ObjectInfo { - S3QueueKeyWithInfo( - const std::string & key_, - std::optional info_, - Metadata::ProcessingNodeHolderPtr processing_holder_); + S3QueueObjectInfo( + const std::string & key_, + const ObjectMetadata & object_metadata_, + Metadata::ProcessingNodeHolderPtr processing_holder_); Metadata::ProcessingNodeHolderPtr processing_holder; }; - class FileIterator : public IIterator + class FileIterator : public Source::IIterator { public: FileIterator( @@ -47,7 +53,7 @@ public: /// Note: /// List results in s3 are always returned in UTF-8 binary order. /// (https://docs.aws.amazon.com/AmazonS3/latest/userguide/ListingKeysUsingAPIs.html) - KeyWithInfoPtr next(size_t idx) override; + Source::ObjectInfoPtr next(size_t processor) override; size_t estimatedKeysCount() override; @@ -60,14 +66,14 @@ public: const bool sharded_processing; const size_t current_shard; - std::unordered_map> sharded_keys; + std::unordered_map> sharded_keys; std::mutex sharded_keys_mutex; }; StorageS3QueueSource( String name_, const Block & header_, - std::unique_ptr internal_source_, + std::unique_ptr internal_source_, std::shared_ptr files_metadata_, size_t processing_id_, const S3QueueAction & action_, @@ -80,8 +86,6 @@ public: const StorageID & storage_id_, LoggerPtr log_); - ~StorageS3QueueSource() override; - static Block getHeader(Block sample_block, const std::vector & requested_virtual_columns); String getName() const override; @@ -93,7 +97,7 @@ private: const S3QueueAction action; const size_t processing_id; const std::shared_ptr files_metadata; - const std::shared_ptr internal_source; + const std::shared_ptr internal_source; const NamesAndTypesList requested_virtual_columns; const std::atomic & shutdown_called; const std::atomic & table_is_being_dropped; @@ -103,13 +107,11 @@ private: RemoveFileFunc remove_file_func; LoggerPtr log; - using ReaderHolder = StorageS3Source::ReaderHolder; - ReaderHolder reader; - std::future reader_future; + Source::ReaderHolder reader; + std::future reader_future; std::atomic initialized{false}; size_t processed_rows_from_file = 0; - void lazyInitialize(); void applyActionAfterProcessing(const String & path); void appendLogElement(const std::string & filename, S3QueueFilesMetadata::FileStatus & file_status_, size_t processed_rows, bool processed); }; diff --git a/src/Storages/S3Queue/S3QueueTableMetadata.cpp b/src/Storages/S3Queue/S3QueueTableMetadata.cpp index 3ee2594135d..94816619aaa 
100644 --- a/src/Storages/S3Queue/S3QueueTableMetadata.cpp +++ b/src/Storages/S3Queue/S3QueueTableMetadata.cpp @@ -7,7 +7,6 @@ #include #include #include -#include namespace DB @@ -33,7 +32,7 @@ namespace S3QueueTableMetadata::S3QueueTableMetadata( - const StorageS3::Configuration & configuration, + const StorageObjectStorageConfiguration & configuration, const S3QueueSettings & engine_settings, const StorageInMemoryMetadata & storage_metadata) { diff --git a/src/Storages/S3Queue/S3QueueTableMetadata.h b/src/Storages/S3Queue/S3QueueTableMetadata.h index 30642869930..942ce7973ef 100644 --- a/src/Storages/S3Queue/S3QueueTableMetadata.h +++ b/src/Storages/S3Queue/S3QueueTableMetadata.h @@ -3,7 +3,7 @@ #if USE_AWS_S3 #include -#include +#include #include namespace DB @@ -27,7 +27,10 @@ struct S3QueueTableMetadata UInt64 s3queue_processing_threads_num; S3QueueTableMetadata() = default; - S3QueueTableMetadata(const StorageS3::Configuration & configuration, const S3QueueSettings & engine_settings, const StorageInMemoryMetadata & storage_metadata); + S3QueueTableMetadata( + const StorageObjectStorageConfiguration & configuration, + const S3QueueSettings & engine_settings, + const StorageInMemoryMetadata & storage_metadata); void read(const String & metadata_str); static S3QueueTableMetadata parse(const String & metadata_str); diff --git a/src/Storages/S3Queue/StorageS3Queue.cpp b/src/Storages/S3Queue/StorageS3Queue.cpp index 0723205b544..fa7132f705a 100644 --- a/src/Storages/S3Queue/StorageS3Queue.cpp +++ b/src/Storages/S3Queue/StorageS3Queue.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -50,11 +51,6 @@ namespace ErrorCodes namespace { - bool containsGlobs(const S3::URI & url) - { - return url.key.find_first_of("*?{") != std::string::npos; - } - std::string chooseZooKeeperPath(const StorageID & table_id, const Settings & settings, const S3QueueSettings & s3queue_settings) { std::string zk_path_prefix = settings.s3queue_default_zookeeper_path.value; @@ -98,7 +94,7 @@ namespace StorageS3Queue::StorageS3Queue( std::unique_ptr s3queue_settings_, - const StorageS3::Configuration & configuration_, + const ConfigurationPtr configuration_, const StorageID & table_id_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, @@ -116,29 +112,29 @@ StorageS3Queue::StorageS3Queue( , reschedule_processing_interval_ms(s3queue_settings->s3queue_polling_min_timeout_ms) , log(getLogger("StorageS3Queue (" + table_id_.table_name + ")")) { - if (configuration.url.key.empty()) + if (configuration->getPath().empty()) { - configuration.url.key = "/*"; + configuration->setPath("/*"); } - else if (configuration.url.key.ends_with('/')) + else if (configuration->getPath().ends_with('/')) { - configuration.url.key += '*'; + configuration->setPath(configuration->getPath() + '*'); } - else if (!containsGlobs(configuration.url)) + else if (!configuration->isPathWithGlobs()) { throw Exception(ErrorCodes::QUERY_NOT_ALLOWED, "S3Queue url must either end with '/' or contain globs"); } checkAndAdjustSettings(*s3queue_settings, context_->getSettingsRef()); - configuration.update(context_); - FormatFactory::instance().checkFormatName(configuration.format); - context_->getRemoteHostFilter().checkURL(configuration.url.uri); + object_storage = configuration->createOrUpdateObjectStorage(context_); + FormatFactory::instance().checkFormatName(configuration->format); + configuration->check(context_); StorageInMemoryMetadata storage_metadata; if (columns_.empty()) { - auto columns = 
StorageS3::getTableStructureFromDataImpl(configuration, format_settings, context_); + auto columns = Storage::getTableStructureFromData(object_storage, configuration, format_settings, context_); storage_metadata.setColumns(columns); } else @@ -226,7 +222,7 @@ void StorageS3Queue::drop() bool StorageS3Queue::supportsSubsetOfColumns(const ContextPtr & context_) const { - return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(configuration.format, context_, format_settings); + return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(configuration->format, context_, format_settings); } class ReadFromS3Queue : public SourceStepWithFilter @@ -345,38 +341,20 @@ std::shared_ptr StorageS3Queue::createSource( size_t max_block_size, ContextPtr local_context) { - auto configuration_snapshot = updateConfigurationAndGetCopy(local_context); - - auto internal_source = std::make_unique( - info, configuration.format, getName(), local_context, format_settings, + auto internal_source = std::make_unique( + getName(), + object_storage, + configuration, + info, + format_settings, + local_context, max_block_size, - configuration_snapshot.request_settings, - configuration_snapshot.compression_method, - configuration_snapshot.client, - configuration_snapshot.url.bucket, - configuration_snapshot.url.version_id, - configuration_snapshot.url.uri.getHost() + std::to_string(configuration_snapshot.url.uri.getPort()), - file_iterator, local_context->getSettingsRef().max_download_threads, false); + file_iterator, + false); - auto file_deleter = [this, bucket = configuration_snapshot.url.bucket, client = configuration_snapshot.client, blob_storage_log = BlobStorageLogWriter::create()](const std::string & path) mutable + auto file_deleter = [=, this](const std::string & path) mutable { - S3::DeleteObjectRequest request; - request.WithKey(path).WithBucket(bucket); - auto outcome = client->DeleteObject(request); - if (blob_storage_log) - blob_storage_log->addEvent( - BlobStorageLogElement::EventType::Delete, - bucket, path, {}, 0, outcome.IsSuccess() ? nullptr : &outcome.GetError()); - - if (!outcome.IsSuccess()) - { - const auto & err = outcome.GetError(); - LOG_ERROR(log, "{} (Code: {})", err.GetMessage(), static_cast(err.GetErrorType())); - } - else - { - LOG_TRACE(log, "Object with path {} was removed from S3", path); - } + object_storage->removeObject(StoredObject(path)); }; auto s3_queue_log = s3queue_settings->s3queue_enable_logging_to_s3queue_log ? 
local_context->getS3QueueLog() : nullptr; return std::make_shared( @@ -470,7 +448,6 @@ bool StorageS3Queue::streamToViews() auto s3queue_context = Context::createCopy(getContext()); s3queue_context->makeQueryContext(); - auto query_configuration = updateConfigurationAndGetCopy(s3queue_context); // Create a stream for each consumer and join them in a union stream // Only insert into dependent views and expect that input blocks contain virtual columns @@ -505,12 +482,6 @@ bool StorageS3Queue::streamToViews() return rows > 0; } -StorageS3Queue::Configuration StorageS3Queue::updateConfigurationAndGetCopy(ContextPtr local_context) -{ - configuration.update(local_context); - return configuration; -} - zkutil::ZooKeeperPtr StorageS3Queue::getZooKeeper() const { return getContext()->getZooKeeper(); @@ -530,7 +501,7 @@ void StorageS3Queue::createOrCheckMetadata(const StorageInMemoryMetadata & stora } else { - std::string metadata = S3QueueTableMetadata(configuration, *s3queue_settings, storage_metadata).toString(); + std::string metadata = S3QueueTableMetadata(*configuration, *s3queue_settings, storage_metadata).toString(); requests.emplace_back(zkutil::makeCreateRequest(zk_path, "", zkutil::CreateMode::Persistent)); requests.emplace_back(zkutil::makeCreateRequest(zk_path / "processed", "", zkutil::CreateMode::Persistent)); requests.emplace_back(zkutil::makeCreateRequest(zk_path / "failed", "", zkutil::CreateMode::Persistent)); @@ -568,7 +539,7 @@ void StorageS3Queue::checkTableStructure(const String & zookeeper_prefix, const String metadata_str = zookeeper->get(fs::path(zookeeper_prefix) / "metadata"); auto metadata_from_zk = S3QueueTableMetadata::parse(metadata_str); - S3QueueTableMetadata old_metadata(configuration, *s3queue_settings, storage_metadata); + S3QueueTableMetadata old_metadata(*configuration, *s3queue_settings, storage_metadata); old_metadata.checkEquals(metadata_from_zk); auto columns_from_zk = ColumnsDescription::parse(metadata_from_zk.columns); @@ -584,14 +555,25 @@ void StorageS3Queue::checkTableStructure(const String & zookeeper_prefix, const } } -std::shared_ptr StorageS3Queue::createFileIterator(ContextPtr local_context, const ActionsDAG::Node * predicate) +std::shared_ptr StorageS3Queue::createFileIterator(ContextPtr , const ActionsDAG::Node * predicate) { - auto glob_iterator = std::make_unique( - *configuration.client, configuration.url, predicate, virtual_columns, local_context, - /* read_keys */nullptr, configuration.request_settings); + auto glob_iterator = std::make_unique(object_storage, configuration, predicate, virtual_columns, nullptr); + return std::make_shared(files_metadata, std::move(glob_iterator), s3queue_settings->s3queue_current_shard_num, shutdown_called); } +static void initializeConfiguration( + StorageObjectStorageConfiguration & configuration, + ASTs & engine_args, + ContextPtr local_context, + bool with_table_structure) +{ + if (auto named_collection = tryGetNamedCollectionWithOverrides(engine_args, local_context)) + configuration.fromNamedCollection(*named_collection); + else + configuration.fromAST(engine_args, local_context, with_table_structure); +} + void registerStorageS3QueueImpl(const String & name, StorageFactory & factory) { factory.registerStorage( @@ -602,7 +584,8 @@ void registerStorageS3QueueImpl(const String & name, StorageFactory & factory) if (engine_args.empty()) throw Exception(ErrorCodes::BAD_ARGUMENTS, "External data source must have arguments"); - auto configuration = StorageS3::getConfiguration(engine_args, 
args.getLocalContext()); + auto configuration = std::make_shared(); + initializeConfiguration(*configuration, args.engine_args, args.getContext(), false); // Use format settings from global server context + settings from // the SETTINGS clause of the create query. Settings from current diff --git a/src/Storages/S3Queue/StorageS3Queue.h b/src/Storages/S3Queue/StorageS3Queue.h index fd3b4bb4914..88f9bd65093 100644 --- a/src/Storages/S3Queue/StorageS3Queue.h +++ b/src/Storages/S3Queue/StorageS3Queue.h @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include #include @@ -26,11 +26,13 @@ class S3QueueFilesMetadata; class StorageS3Queue : public IStorage, WithContext { public: - using Configuration = typename StorageS3::Configuration; + using Storage = StorageObjectStorage; + using Source = StorageObjectStorageSource; + using ConfigurationPtr = Storage::ConfigurationPtr; StorageS3Queue( std::unique_ptr s3queue_settings_, - const Configuration & configuration_, + ConfigurationPtr configuration_, const StorageID & table_id_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, @@ -53,8 +55,6 @@ public: NamesAndTypesList getVirtuals() const override { return virtual_columns; } - const auto & getFormatName() const { return configuration.format; } - const fs::path & getZooKeeperPath() const { return zk_path; } zkutil::ZooKeeperPtr getZooKeeper() const; @@ -68,7 +68,8 @@ private: const S3QueueAction after_processing; std::shared_ptr files_metadata; - Configuration configuration; + ConfigurationPtr configuration; + ObjectStoragePtr object_storage; const std::optional format_settings; NamesAndTypesList virtual_columns; @@ -103,7 +104,6 @@ private: void createOrCheckMetadata(const StorageInMemoryMetadata & storage_metadata); void checkTableStructure(const String & zookeeper_prefix, const StorageInMemoryMetadata & storage_metadata); - Configuration updateConfigurationAndGetCopy(ContextPtr local_context); }; } diff --git a/src/Storages/StorageAzureBlob.cpp b/src/Storages/StorageAzureBlob.cpp deleted file mode 100644 index c09db0bfb7b..00000000000 --- a/src/Storages/StorageAzureBlob.cpp +++ /dev/null @@ -1,1478 +0,0 @@ -#include - -#if USE_AZURE_BLOB_STORAGE -#include -#include -#include -#include -#include -#include - -#include - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include - -using namespace Azure::Storage::Blobs; - -namespace CurrentMetrics -{ - extern const Metric ObjectStorageAzureThreads; - extern const Metric ObjectStorageAzureThreadsActive; - extern const Metric ObjectStorageAzureThreadsScheduled; -} - -namespace ProfileEvents -{ - extern const Event EngineFileLikeReadFiles; -} - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; - extern const int BAD_ARGUMENTS; - extern const int DATABASE_ACCESS_DENIED; - extern const int CANNOT_COMPILE_REGEXP; - extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; - extern const int LOGICAL_ERROR; - extern const int NOT_IMPLEMENTED; - -} - -namespace -{ - -const std::unordered_set required_configuration_keys = { - "blob_path", - "container", -}; - -const std::unordered_set optional_configuration_keys = { - "format", - "compression", - "structure", - "compression_method", - "account_name", - "account_key", - "connection_string", - 
"storage_account_url", -}; - -bool isConnectionString(const std::string & candidate) -{ - return !candidate.starts_with("http"); -} - -} - -void StorageAzureBlob::processNamedCollectionResult(StorageAzureBlob::Configuration & configuration, const NamedCollection & collection) -{ - validateNamedCollection(collection, required_configuration_keys, optional_configuration_keys); - - if (collection.has("connection_string")) - { - configuration.connection_url = collection.get("connection_string"); - configuration.is_connection_string = true; - } - - if (collection.has("storage_account_url")) - { - configuration.connection_url = collection.get("storage_account_url"); - configuration.is_connection_string = false; - } - - configuration.container = collection.get("container"); - configuration.blob_path = collection.get("blob_path"); - - if (collection.has("account_name")) - configuration.account_name = collection.get("account_name"); - - if (collection.has("account_key")) - configuration.account_key = collection.get("account_key"); - - configuration.structure = collection.getOrDefault("structure", "auto"); - configuration.format = collection.getOrDefault("format", configuration.format); - configuration.compression_method = collection.getOrDefault("compression_method", collection.getOrDefault("compression", "auto")); -} - - -StorageAzureBlob::Configuration StorageAzureBlob::getConfiguration(ASTs & engine_args, ContextPtr local_context) -{ - StorageAzureBlob::Configuration configuration; - - /// Supported signatures: - /// - /// AzureBlobStorage(connection_string|storage_account_url, container_name, blobpath, [account_name, account_key, format, compression]) - /// - - if (auto named_collection = tryGetNamedCollectionWithOverrides(engine_args, local_context)) - { - processNamedCollectionResult(configuration, *named_collection); - - configuration.blobs_paths = {configuration.blob_path}; - - if (configuration.format == "auto") - configuration.format = FormatFactory::instance().getFormatFromFileName(configuration.blob_path, true); - - return configuration; - } - - if (engine_args.size() < 3 || engine_args.size() > 7) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, - "Storage AzureBlobStorage requires 3 to 7 arguments: " - "AzureBlobStorage(connection_string|storage_account_url, container_name, blobpath, [account_name, account_key, format, compression])"); - - for (auto & engine_arg : engine_args) - engine_arg = evaluateConstantExpressionOrIdentifierAsLiteral(engine_arg, local_context); - - std::unordered_map engine_args_to_idx; - - configuration.connection_url = checkAndGetLiteralArgument(engine_args[0], "connection_string/storage_account_url"); - configuration.is_connection_string = isConnectionString(configuration.connection_url); - - configuration.container = checkAndGetLiteralArgument(engine_args[1], "container"); - configuration.blob_path = checkAndGetLiteralArgument(engine_args[2], "blobpath"); - - auto is_format_arg = [] (const std::string & s) -> bool - { - return s == "auto" || FormatFactory::instance().getAllFormats().contains(s); - }; - - if (engine_args.size() == 4) - { - //'c1 UInt64, c2 UInt64 - auto fourth_arg = checkAndGetLiteralArgument(engine_args[3], "format/account_name"); - if (is_format_arg(fourth_arg)) - { - configuration.format = fourth_arg; - } - else - { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown format or account name specified without account key"); - } - } - else if (engine_args.size() == 5) - { - auto fourth_arg = 
checkAndGetLiteralArgument(engine_args[3], "format/account_name"); - if (is_format_arg(fourth_arg)) - { - configuration.format = fourth_arg; - configuration.compression_method = checkAndGetLiteralArgument(engine_args[4], "compression"); - } - else - { - configuration.account_name = fourth_arg; - configuration.account_key = checkAndGetLiteralArgument(engine_args[4], "account_key"); - } - } - else if (engine_args.size() == 6) - { - auto fourth_arg = checkAndGetLiteralArgument(engine_args[3], "format/account_name"); - if (fourth_arg == "auto" || FormatFactory::instance().getAllFormats().contains(fourth_arg)) - { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Format and compression must be last arguments"); - } - else - { - configuration.account_name = fourth_arg; - - configuration.account_key = checkAndGetLiteralArgument(engine_args[4], "account_key"); - auto sixth_arg = checkAndGetLiteralArgument(engine_args[5], "format/account_name"); - if (!is_format_arg(sixth_arg)) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown format {}", sixth_arg); - configuration.format = sixth_arg; - } - } - else if (engine_args.size() == 7) - { - auto fourth_arg = checkAndGetLiteralArgument(engine_args[3], "format/account_name"); - if (fourth_arg == "auto" || FormatFactory::instance().getAllFormats().contains(fourth_arg)) - { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Format and compression must be last arguments"); - } - else - { - configuration.account_name = fourth_arg; - configuration.account_key = checkAndGetLiteralArgument(engine_args[4], "account_key"); - auto sixth_arg = checkAndGetLiteralArgument(engine_args[5], "format/account_name"); - if (!is_format_arg(sixth_arg)) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown format {}", sixth_arg); - configuration.format = sixth_arg; - configuration.compression_method = checkAndGetLiteralArgument(engine_args[6], "compression"); - } - } - - configuration.blobs_paths = {configuration.blob_path}; - - if (configuration.format == "auto") - configuration.format = FormatFactory::instance().getFormatFromFileName(configuration.blob_path, true); - - return configuration; -} - - -AzureObjectStorage::SettingsPtr StorageAzureBlob::createSettings(ContextPtr local_context) -{ - const auto & context_settings = local_context->getSettingsRef(); - auto settings_ptr = std::make_unique(); - settings_ptr->max_single_part_upload_size = context_settings.azure_max_single_part_upload_size; - settings_ptr->max_single_read_retries = context_settings.azure_max_single_read_retries; - settings_ptr->list_object_keys_size = static_cast(context_settings.azure_list_object_keys_size); - - return settings_ptr; -} - -void registerStorageAzureBlob(StorageFactory & factory) -{ - factory.registerStorage("AzureBlobStorage", [](const StorageFactory::Arguments & args) - { - auto & engine_args = args.engine_args; - if (engine_args.empty()) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "External data source must have arguments"); - - auto configuration = StorageAzureBlob::getConfiguration(engine_args, args.getLocalContext()); - auto client = StorageAzureBlob::createClient(configuration, /* is_read_only */ false); - // Use format settings from global server context + settings from - // the SETTINGS clause of the create query. Settings from current - // session and user are ignored. 
- std::optional format_settings; - if (args.storage_def->settings) - { - FormatFactorySettings user_format_settings; - - // Apply changed settings from global context, but ignore the - // unknown ones, because we only have the format settings here. - const auto & changes = args.getContext()->getSettingsRef().changes(); - for (const auto & change : changes) - { - if (user_format_settings.has(change.name)) - user_format_settings.set(change.name, change.value); - } - - // Apply changes from SETTINGS clause, with validation. - user_format_settings.applyChanges(args.storage_def->settings->changes); - format_settings = getFormatSettings(args.getContext(), user_format_settings); - } - else - { - format_settings = getFormatSettings(args.getContext()); - } - - ASTPtr partition_by; - if (args.storage_def->partition_by) - partition_by = args.storage_def->partition_by->clone(); - - auto settings = StorageAzureBlob::createSettings(args.getContext()); - - return std::make_shared( - std::move(configuration), - std::make_unique("AzureBlobStorage", std::move(client), std::move(settings),configuration.container), - args.getContext(), - args.table_id, - args.columns, - args.constraints, - args.comment, - format_settings, - /* distributed_processing */ false, - partition_by); - }, - { - .supports_settings = true, - .supports_sort_order = true, // for partition by - .supports_schema_inference = true, - .source_access_type = AccessType::AZURE, - }); -} - -static bool containerExists(std::unique_ptr &blob_service_client, std::string container_name) -{ - Azure::Storage::Blobs::ListBlobContainersOptions options; - options.Prefix = container_name; - options.PageSizeHint = 1; - - auto containers_list_response = blob_service_client->ListBlobContainers(options); - auto containers_list = containers_list_response.BlobContainers; - - for (const auto & container : containers_list) - { - if (container_name == container.Name) - return true; - } - return false; -} - -AzureClientPtr StorageAzureBlob::createClient(StorageAzureBlob::Configuration configuration, bool is_read_only) -{ - AzureClientPtr result; - - if (configuration.is_connection_string) - { - std::unique_ptr blob_service_client = std::make_unique(BlobServiceClient::CreateFromConnectionString(configuration.connection_url)); - result = std::make_unique(BlobContainerClient::CreateFromConnectionString(configuration.connection_url, configuration.container)); - bool container_exists = containerExists(blob_service_client,configuration.container); - - if (!container_exists) - { - if (is_read_only) - throw Exception( - ErrorCodes::DATABASE_ACCESS_DENIED, - "AzureBlobStorage container does not exist '{}'", - configuration.container); - - try - { - result->CreateIfNotExists(); - } catch (const Azure::Storage::StorageException & e) - { - if (!(e.StatusCode == Azure::Core::Http::HttpStatusCode::Conflict - && e.ReasonPhrase == "The specified container already exists.")) - { - throw; - } - } - } - } - else - { - std::shared_ptr storage_shared_key_credential; - if (configuration.account_name.has_value() && configuration.account_key.has_value()) - { - storage_shared_key_credential - = std::make_shared(*configuration.account_name, *configuration.account_key); - } - - std::unique_ptr blob_service_client; - if (storage_shared_key_credential) - { - blob_service_client = std::make_unique(configuration.connection_url, storage_shared_key_credential); - } - else - { - blob_service_client = std::make_unique(configuration.connection_url); - } - - bool container_exists = 
containerExists(blob_service_client,configuration.container); - - std::string final_url; - size_t pos = configuration.connection_url.find('?'); - if (pos != std::string::npos) - { - auto url_without_sas = configuration.connection_url.substr(0, pos); - final_url = url_without_sas + (url_without_sas.back() == '/' ? "" : "/") + configuration.container - + configuration.connection_url.substr(pos); - } - else - final_url - = configuration.connection_url + (configuration.connection_url.back() == '/' ? "" : "/") + configuration.container; - - if (container_exists) - { - if (storage_shared_key_credential) - result = std::make_unique(final_url, storage_shared_key_credential); - else - result = std::make_unique(final_url); - } - else - { - if (is_read_only) - throw Exception( - ErrorCodes::DATABASE_ACCESS_DENIED, - "AzureBlobStorage container does not exist '{}'", - configuration.container); - try - { - result = std::make_unique(blob_service_client->CreateBlobContainer(configuration.container).Value); - } catch (const Azure::Storage::StorageException & e) - { - if (e.StatusCode == Azure::Core::Http::HttpStatusCode::Conflict - && e.ReasonPhrase == "The specified container already exists.") - { - if (storage_shared_key_credential) - result = std::make_unique(final_url, storage_shared_key_credential); - else - result = std::make_unique(final_url); - } - else - { - throw; - } - } - } - } - - return result; -} - -Poco::URI StorageAzureBlob::Configuration::getConnectionURL() const -{ - if (!is_connection_string) - return Poco::URI(connection_url); - - auto parsed_connection_string = Azure::Storage::_internal::ParseConnectionString(connection_url); - return Poco::URI(parsed_connection_string.BlobServiceUrl.GetAbsoluteUrl()); -} - - -StorageAzureBlob::StorageAzureBlob( - const Configuration & configuration_, - std::unique_ptr && object_storage_, - ContextPtr context, - const StorageID & table_id_, - const ColumnsDescription & columns_, - const ConstraintsDescription & constraints_, - const String & comment, - std::optional format_settings_, - bool distributed_processing_, - ASTPtr partition_by_) - : IStorage(table_id_) - , name("AzureBlobStorage") - , configuration(configuration_) - , object_storage(std::move(object_storage_)) - , distributed_processing(distributed_processing_) - , format_settings(format_settings_) - , partition_by(partition_by_) -{ - FormatFactory::instance().checkFormatName(configuration.format); - context->getGlobalContext()->getRemoteHostFilter().checkURL(configuration.getConnectionURL()); - - StorageInMemoryMetadata storage_metadata; - if (columns_.empty()) - { - auto columns = getTableStructureFromData(object_storage.get(), configuration, format_settings, context, distributed_processing); - storage_metadata.setColumns(columns); - } - else - { - /// We don't allow special columns in File storage. 
- if (!columns_.hasOnlyOrdinary()) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Table engine AzureBlobStorage doesn't support special columns like MATERIALIZED, ALIAS or EPHEMERAL"); - storage_metadata.setColumns(columns_); - } - - storage_metadata.setConstraints(constraints_); - storage_metadata.setComment(comment); - setInMemoryMetadata(storage_metadata); - - StoredObjects objects; - for (const auto & key : configuration.blobs_paths) - objects.emplace_back(key); - - virtual_columns = VirtualColumnUtils::getPathFileAndSizeVirtualsForStorage(storage_metadata.getSampleBlock().getNamesAndTypesList()); -} - -void StorageAzureBlob::truncate(const ASTPtr &, const StorageMetadataPtr &, ContextPtr, TableExclusiveLockHolder &) -{ - if (configuration.withGlobs()) - { - throw Exception( - ErrorCodes::DATABASE_ACCESS_DENIED, - "AzureBlobStorage key '{}' contains globs, so the table is in readonly mode", - configuration.blob_path); - } - - StoredObjects objects; - for (const auto & key : configuration.blobs_paths) - objects.emplace_back(key); - - object_storage->removeObjectsIfExist(objects); -} - -namespace -{ - -class StorageAzureBlobSink : public SinkToStorage -{ -public: - StorageAzureBlobSink( - const String & format, - const Block & sample_block_, - ContextPtr context, - std::optional format_settings_, - const CompressionMethod compression_method, - AzureObjectStorage * object_storage, - const String & blob_path) - : SinkToStorage(sample_block_) - , sample_block(sample_block_) - , format_settings(format_settings_) - { - StoredObject object(blob_path); - const auto & settings = context->getSettingsRef(); - write_buf = wrapWriteBufferWithCompressionMethod( - object_storage->writeObject(object, WriteMode::Rewrite), - compression_method, - static_cast(settings.output_format_compression_level), - static_cast(settings.output_format_compression_zstd_window_log)); - writer = FormatFactory::instance().getOutputFormatParallelIfPossible(format, *write_buf, sample_block, context, format_settings); - } - - String getName() const override { return "StorageAzureBlobSink"; } - - void consume(Chunk chunk) override - { - std::lock_guard lock(cancel_mutex); - if (cancelled) - return; - writer->write(getHeader().cloneWithColumns(chunk.detachColumns())); - } - - void onCancel() override - { - std::lock_guard lock(cancel_mutex); - finalize(); - cancelled = true; - } - - void onException(std::exception_ptr exception) override - { - std::lock_guard lock(cancel_mutex); - try - { - std::rethrow_exception(exception); - } - catch (...) - { - /// An exception context is needed to proper delete write buffers without finalization - release(); - } - } - - void onFinish() override - { - std::lock_guard lock(cancel_mutex); - finalize(); - } - -private: - void finalize() - { - if (!writer) - return; - - try - { - writer->finalize(); - writer->flush(); - write_buf->finalize(); - } - catch (...) - { - /// Stop ParallelFormattingOutputFormat correctly. 
- release(); - throw; - } - } - - void release() - { - writer.reset(); - write_buf->finalize(); - } - - Block sample_block; - std::optional format_settings; - std::unique_ptr write_buf; - OutputFormatPtr writer; - bool cancelled = false; - std::mutex cancel_mutex; -}; - -class PartitionedStorageAzureBlobSink : public PartitionedSink -{ -public: - PartitionedStorageAzureBlobSink( - const ASTPtr & partition_by, - const String & format_, - const Block & sample_block_, - ContextPtr context_, - std::optional format_settings_, - const CompressionMethod compression_method_, - AzureObjectStorage * object_storage_, - const String & blob_) - : PartitionedSink(partition_by, context_, sample_block_) - , format(format_) - , sample_block(sample_block_) - , context(context_) - , compression_method(compression_method_) - , object_storage(object_storage_) - , blob(blob_) - , format_settings(format_settings_) - { - } - - SinkPtr createSinkForPartition(const String & partition_id) override - { - auto partition_key = replaceWildcards(blob, partition_id); - validateKey(partition_key); - - return std::make_shared( - format, - sample_block, - context, - format_settings, - compression_method, - object_storage, - partition_key - ); - } - -private: - const String format; - const Block sample_block; - const ContextPtr context; - const CompressionMethod compression_method; - AzureObjectStorage * object_storage; - const String blob; - const std::optional format_settings; - - ExpressionActionsPtr partition_by_expr; - - static void validateKey(const String & str) - { - validatePartitionKey(str, true); - } -}; - -} - -class ReadFromAzureBlob : public SourceStepWithFilter -{ -public: - std::string getName() const override { return "ReadFromAzureBlob"; } - void initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &) override; - void applyFilters() override; - - ReadFromAzureBlob( - Block sample_block, - std::shared_ptr storage_, - ReadFromFormatInfo info_, - const bool need_only_count_, - ContextPtr context_, - size_t max_block_size_, - size_t num_streams_) - : SourceStepWithFilter(DataStream{.header = std::move(sample_block)}) - , storage(std::move(storage_)) - , info(std::move(info_)) - , need_only_count(need_only_count_) - , context(std::move(context_)) - , max_block_size(max_block_size_) - , num_streams(num_streams_) - { - } - -private: - std::shared_ptr storage; - ReadFromFormatInfo info; - const bool need_only_count; - - ContextPtr context; - - size_t max_block_size; - const size_t num_streams; - - std::shared_ptr iterator_wrapper; - - void createIterator(const ActionsDAG::Node * predicate); -}; - -void ReadFromAzureBlob::applyFilters() -{ - auto filter_actions_dag = ActionsDAG::buildFilterActionsDAG(filter_nodes.nodes); - const ActionsDAG::Node * predicate = nullptr; - if (filter_actions_dag) - predicate = filter_actions_dag->getOutputs().at(0); - - createIterator(predicate); -} - -void StorageAzureBlob::read( - QueryPlan & query_plan, - const Names & column_names, - const StorageSnapshotPtr & storage_snapshot, - SelectQueryInfo & query_info, - ContextPtr local_context, - QueryProcessingStage::Enum /*processed_stage*/, - size_t max_block_size, - size_t num_streams) -{ - if (partition_by && configuration.withWildcard()) - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Reading from a partitioned Azure storage is not implemented yet"); - - auto this_ptr = std::static_pointer_cast(shared_from_this()); - - auto read_from_format_info = prepareReadingFromFormat(column_names, storage_snapshot, 
supportsSubsetOfColumns(local_context), getVirtuals()); - bool need_only_count = (query_info.optimize_trivial_count || read_from_format_info.requested_columns.empty()) - && local_context->getSettingsRef().optimize_count_from_files; - - auto reading = std::make_unique( - read_from_format_info.source_header, - std::move(this_ptr), - std::move(read_from_format_info), - need_only_count, - local_context, - max_block_size, - num_streams); - - query_plan.addStep(std::move(reading)); -} - -void ReadFromAzureBlob::createIterator(const ActionsDAG::Node * predicate) -{ - if (iterator_wrapper) - return; - - const auto & configuration = storage->configuration; - - if (storage->distributed_processing) - { - iterator_wrapper = std::make_shared(context, - context->getReadTaskCallback()); - } - else if (configuration.withGlobs()) - { - /// Iterate through disclosed globs and make a source for each file - iterator_wrapper = std::make_shared( - storage->object_storage.get(), configuration.container, configuration.blob_path, - predicate, storage->virtual_columns, context, nullptr, context->getFileProgressCallback()); - } - else - { - iterator_wrapper = std::make_shared( - storage->object_storage.get(), configuration.container, configuration.blobs_paths, - predicate, storage->virtual_columns, context, nullptr, context->getFileProgressCallback()); - } -} - -void ReadFromAzureBlob::initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &) -{ - createIterator(nullptr); - - const auto & configuration = storage->configuration; - Pipes pipes; - - for (size_t i = 0; i < num_streams; ++i) - { - pipes.emplace_back(std::make_shared( - info, - configuration.format, - getName(), - context, - storage->format_settings, - max_block_size, - configuration.compression_method, - storage->object_storage.get(), - configuration.container, - configuration.connection_url, - iterator_wrapper, - need_only_count)); - } - - auto pipe = Pipe::unitePipes(std::move(pipes)); - if (pipe.empty()) - pipe = Pipe(std::make_shared(info.source_header)); - - for (const auto & processor : pipe.getProcessors()) - processors.emplace_back(processor); - - pipeline.init(std::move(pipe)); -} - -SinkToStoragePtr StorageAzureBlob::write(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context, bool /*async_insert*/) -{ - auto sample_block = metadata_snapshot->getSampleBlock(); - auto chosen_compression_method = chooseCompressionMethod(configuration.blobs_paths.back(), configuration.compression_method); - auto insert_query = std::dynamic_pointer_cast(query); - - auto partition_by_ast = insert_query ? (insert_query->partition_by ? 
insert_query->partition_by : partition_by) : nullptr; - bool is_partitioned_implementation = partition_by_ast && configuration.withWildcard(); - - if (is_partitioned_implementation) - { - return std::make_shared( - partition_by_ast, - configuration.format, - sample_block, - local_context, - format_settings, - chosen_compression_method, - object_storage.get(), - configuration.blobs_paths.back()); - } - else - { - if (configuration.withGlobs()) - throw Exception(ErrorCodes::DATABASE_ACCESS_DENIED, - "AzureBlobStorage key '{}' contains globs, so the table is in readonly mode", configuration.blob_path); - - bool truncate_in_insert = local_context->getSettingsRef().azure_truncate_on_insert; - - if (!truncate_in_insert && object_storage->exists(StoredObject(configuration.blob_path))) - { - - if (local_context->getSettingsRef().azure_create_new_file_on_insert) - { - size_t index = configuration.blobs_paths.size(); - const auto & first_key = configuration.blobs_paths[0]; - auto pos = first_key.find_first_of('.'); - String new_key; - - do - { - new_key = first_key.substr(0, pos) + "." + std::to_string(index) + (pos == std::string::npos ? "" : first_key.substr(pos)); - ++index; - } - while (object_storage->exists(StoredObject(new_key))); - - configuration.blobs_paths.push_back(new_key); - } - else - { - throw Exception( - ErrorCodes::BAD_ARGUMENTS, - "Object in bucket {} with key {} already exists. " - "If you want to overwrite it, enable setting azure_truncate_on_insert, if you " - "want to create a new file on each insert, enable setting azure_create_new_file_on_insert", - configuration.container, configuration.blobs_paths.back()); - } - } - - return std::make_shared( - configuration.format, - sample_block, - local_context, - format_settings, - chosen_compression_method, - object_storage.get(), - configuration.blobs_paths.back()); - } -} - -NamesAndTypesList StorageAzureBlob::getVirtuals() const -{ - return virtual_columns; -} - -Names StorageAzureBlob::getVirtualColumnNames() -{ - return VirtualColumnUtils::getPathFileAndSizeVirtualsForStorage({}).getNames(); -} - -bool StorageAzureBlob::supportsPartitionBy() const -{ - return true; -} - -bool StorageAzureBlob::supportsSubsetOfColumns(const ContextPtr & context) const -{ - return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(configuration.format, context, format_settings); -} - -bool StorageAzureBlob::prefersLargeBlocks() const -{ - return FormatFactory::instance().checkIfOutputFormatPrefersLargeBlocks(configuration.format); -} - -bool StorageAzureBlob::parallelizeOutputAfterReading(ContextPtr context) const -{ - return FormatFactory::instance().checkParallelizeOutputAfterReading(configuration.format, context); -} - -StorageAzureBlobSource::GlobIterator::GlobIterator( - AzureObjectStorage * object_storage_, - const std::string & container_, - String blob_path_with_globs_, - const ActionsDAG::Node * predicate, - const NamesAndTypesList & virtual_columns_, - ContextPtr context_, - RelativePathsWithMetadata * outer_blobs_, - std::function file_progress_callback_) - : IIterator(context_) - , object_storage(object_storage_) - , container(container_) - , blob_path_with_globs(blob_path_with_globs_) - , virtual_columns(virtual_columns_) - , outer_blobs(outer_blobs_) - , file_progress_callback(file_progress_callback_) -{ - - const String key_prefix = blob_path_with_globs.substr(0, blob_path_with_globs.find_first_of("*?{")); - - /// We don't have to list bucket, because there is no asterisks. 
- if (key_prefix.size() == blob_path_with_globs.size()) - { - auto object_metadata = object_storage->getObjectMetadata(blob_path_with_globs); - blobs_with_metadata.emplace_back( - blob_path_with_globs, - object_metadata); - if (outer_blobs) - outer_blobs->emplace_back(blobs_with_metadata.back()); - if (file_progress_callback) - file_progress_callback(FileProgress(0, object_metadata.size_bytes)); - is_finished = true; - return; - } - - object_storage_iterator = object_storage->iterate(key_prefix); - - matcher = std::make_unique(makeRegexpPatternFromGlobs(blob_path_with_globs)); - - if (!matcher->ok()) - throw Exception( - ErrorCodes::CANNOT_COMPILE_REGEXP, "Cannot compile regex from glob ({}): {}", blob_path_with_globs, matcher->error()); - - recursive = blob_path_with_globs == "/**" ? true : false; - - filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns); -} - -RelativePathWithMetadata StorageAzureBlobSource::GlobIterator::next() -{ - std::lock_guard lock(next_mutex); - - if (is_finished && index >= blobs_with_metadata.size()) - { - return {}; - } - - bool need_new_batch = blobs_with_metadata.empty() || index >= blobs_with_metadata.size(); - - if (need_new_batch) - { - RelativePathsWithMetadata new_batch; - while (new_batch.empty()) - { - auto result = object_storage_iterator->getCurrrentBatchAndScheduleNext(); - if (result.has_value()) - { - new_batch = result.value(); - } - else - { - is_finished = true; - return {}; - } - - for (auto it = new_batch.begin(); it != new_batch.end();) - { - if (!recursive && !re2::RE2::FullMatch(it->relative_path, *matcher)) - it = new_batch.erase(it); - else - ++it; - } - } - - index = 0; - - if (filter_dag) - { - std::vector paths; - paths.reserve(new_batch.size()); - for (auto & path_with_metadata : new_batch) - paths.push_back(fs::path(container) / path_with_metadata.relative_path); - - VirtualColumnUtils::filterByPathOrFile(new_batch, paths, filter_dag, virtual_columns, getContext()); - } - - if (outer_blobs) - outer_blobs->insert(outer_blobs->end(), new_batch.begin(), new_batch.end()); - - blobs_with_metadata = std::move(new_batch); - if (file_progress_callback) - { - for (const auto & [relative_path, info] : blobs_with_metadata) - { - file_progress_callback(FileProgress(0, info.size_bytes)); - } - } - } - - size_t current_index = index++; - if (current_index >= blobs_with_metadata.size()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Index out of bound for blob metadata"); - return blobs_with_metadata[current_index]; -} - -StorageAzureBlobSource::KeysIterator::KeysIterator( - AzureObjectStorage * object_storage_, - const std::string & container_, - const Strings & keys_, - const ActionsDAG::Node * predicate, - const NamesAndTypesList & virtual_columns_, - ContextPtr context_, - RelativePathsWithMetadata * outer_blobs, - std::function file_progress_callback) - : IIterator(context_) - , object_storage(object_storage_) - , container(container_) - , virtual_columns(virtual_columns_) -{ - Strings all_keys = keys_; - - ASTPtr filter_ast; - if (!all_keys.empty()) - filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns); - - if (filter_dag) - { - Strings paths; - paths.reserve(all_keys.size()); - for (const auto & key : all_keys) - paths.push_back(fs::path(container) / key); - - VirtualColumnUtils::filterByPathOrFile(all_keys, paths, filter_dag, virtual_columns, getContext()); - } - - for (auto && key : all_keys) - { - ObjectMetadata object_metadata = object_storage->getObjectMetadata(key); 
- if (file_progress_callback) - file_progress_callback(FileProgress(0, object_metadata.size_bytes)); - keys.emplace_back(key, object_metadata); - } - - if (outer_blobs) - *outer_blobs = keys; -} - -RelativePathWithMetadata StorageAzureBlobSource::KeysIterator::next() -{ - size_t current_index = index.fetch_add(1, std::memory_order_relaxed); - if (current_index >= keys.size()) - return {}; - - return keys[current_index]; -} - -Chunk StorageAzureBlobSource::generate() -{ - while (true) - { - if (isCancelled() || !reader) - { - if (reader) - reader->cancel(); - break; - } - - Chunk chunk; - if (reader->pull(chunk)) - { - UInt64 num_rows = chunk.getNumRows(); - total_rows_in_file += num_rows; - size_t chunk_size = 0; - if (const auto * input_format = reader.getInputFormat()) - chunk_size = input_format->getApproxBytesReadForChunk(); - progress(num_rows, chunk_size ? chunk_size : chunk.bytes()); - VirtualColumnUtils::addRequestedPathFileAndSizeVirtualsToChunk( - chunk, - requested_virtual_columns, - fs::path(container) / reader.getRelativePath(), - reader.getRelativePathWithMetadata().metadata.size_bytes); - return chunk; - } - - if (reader.getInputFormat() && getContext()->getSettingsRef().use_cache_for_count_from_files) - addNumRowsToCache(reader.getRelativePath(), total_rows_in_file); - - total_rows_in_file = 0; - - assert(reader_future.valid()); - reader = reader_future.get(); - - if (!reader) - break; - - /// Even if task is finished the thread may be not freed in pool. - /// So wait until it will be freed before scheduling a new task. - create_reader_pool.wait(); - reader_future = createReaderAsync(); - } - - return {}; -} - -void StorageAzureBlobSource::addNumRowsToCache(const String & path, size_t num_rows) -{ - String source = fs::path(connection_url) / container / path; - auto cache_key = getKeyForSchemaCache(source, format, format_settings, getContext()); - StorageAzureBlob::getSchemaCache(getContext()).addNumRows(cache_key, num_rows); -} - -std::optional StorageAzureBlobSource::tryGetNumRowsFromCache(const DB::RelativePathWithMetadata & path_with_metadata) -{ - String source = fs::path(connection_url) / container / path_with_metadata.relative_path; - auto cache_key = getKeyForSchemaCache(source, format, format_settings, getContext()); - auto get_last_mod_time = [&]() -> std::optional - { - auto last_mod = path_with_metadata.metadata.last_modified; - if (last_mod) - return last_mod->epochTime(); - return std::nullopt; - }; - - return StorageAzureBlob::getSchemaCache(getContext()).tryGetNumRows(cache_key, get_last_mod_time); -} - -StorageAzureBlobSource::StorageAzureBlobSource( - const ReadFromFormatInfo & info, - const String & format_, - String name_, - ContextPtr context_, - std::optional format_settings_, - UInt64 max_block_size_, - String compression_hint_, - AzureObjectStorage * object_storage_, - const String & container_, - const String & connection_url_, - std::shared_ptr file_iterator_, - bool need_only_count_) - :ISource(info.source_header, false) - , WithContext(context_) - , requested_columns(info.requested_columns) - , requested_virtual_columns(info.requested_virtual_columns) - , format(format_) - , name(std::move(name_)) - , sample_block(info.format_header) - , format_settings(format_settings_) - , columns_desc(info.columns_description) - , max_block_size(max_block_size_) - , compression_hint(compression_hint_) - , object_storage(std::move(object_storage_)) - , container(container_) - , connection_url(connection_url_) - , file_iterator(file_iterator_) - , 
need_only_count(need_only_count_)
-    , create_reader_pool(CurrentMetrics::ObjectStorageAzureThreads, CurrentMetrics::ObjectStorageAzureThreadsActive, CurrentMetrics::ObjectStorageAzureThreadsScheduled, 1)
-    , create_reader_scheduler(threadPoolCallbackRunner(create_reader_pool, "AzureReader"))
-{
-    reader = createReader();
-    if (reader)
-        reader_future = createReaderAsync();
-}
-
-
-StorageAzureBlobSource::~StorageAzureBlobSource()
-{
-    create_reader_pool.wait();
-}
-
-String StorageAzureBlobSource::getName() const
-{
-    return name;
-}
-
-StorageAzureBlobSource::ReaderHolder StorageAzureBlobSource::createReader()
-{
-    auto path_with_metadata = file_iterator->next();
-    if (path_with_metadata.relative_path.empty())
-        return {};
-
-    if (path_with_metadata.metadata.size_bytes == 0)
-        path_with_metadata.metadata = object_storage->getObjectMetadata(path_with_metadata.relative_path);
-
-    QueryPipelineBuilder builder;
-    std::shared_ptr source;
-    std::unique_ptr read_buf;
-    std::optional num_rows_from_cache = need_only_count && getContext()->getSettingsRef().use_cache_for_count_from_files
-        ? tryGetNumRowsFromCache(path_with_metadata) : std::nullopt;
-    if (num_rows_from_cache)
-    {
-        /// We should not return single chunk with all number of rows,
-        /// because there is a chance that this chunk will be materialized later
-        /// (it can cause memory problems even with default values in columns or when virtual columns are requested).
-        /// Instead, we use special ConstChunkGenerator that will generate chunks
-        /// with max_block_size rows until total number of rows is reached.
-        source = std::make_shared(sample_block, *num_rows_from_cache, max_block_size);
-        builder.init(Pipe(source));
-    }
-    else
-    {
-        std::optional max_parsing_threads;
-        if (need_only_count)
-            max_parsing_threads = 1;
-
-        auto compression_method = chooseCompressionMethod(path_with_metadata.relative_path, compression_hint);
-        read_buf = createAzureReadBuffer(path_with_metadata.relative_path, path_with_metadata.metadata.size_bytes);
-        auto input_format = FormatFactory::instance().getInput(
-            format, *read_buf, sample_block, getContext(), max_block_size,
-            format_settings, max_parsing_threads, std::nullopt,
-            /* is_remote_fs */ true, compression_method);
-
-        if (need_only_count)
-            input_format->needOnlyCount();
-
-        builder.init(Pipe(input_format));
-
-        if (columns_desc.hasDefaults())
-        {
-            builder.addSimpleTransform(
-                [&](const Block & header)
-                { return std::make_shared(header, columns_desc, *input_format, getContext()); });
-        }
-
-        source = input_format;
-    }
-
-    /// Add ExtractColumnsTransform to extract requested columns/subcolumns
-    /// from chunk read by IInputFormat.
- builder.addSimpleTransform([&](const Block & header) - { - return std::make_shared(header, requested_columns); - }); - - auto pipeline = std::make_unique(QueryPipelineBuilder::getPipeline(std::move(builder))); - auto current_reader = std::make_unique(*pipeline); - - ProfileEvents::increment(ProfileEvents::EngineFileLikeReadFiles); - - return ReaderHolder{path_with_metadata, std::move(read_buf), std::move(source), std::move(pipeline), std::move(current_reader)}; -} - -std::future StorageAzureBlobSource::createReaderAsync() -{ - return create_reader_scheduler([this] { return createReader(); }, Priority{}); -} - -std::unique_ptr StorageAzureBlobSource::createAzureReadBuffer(const String & key, size_t object_size) -{ - auto read_settings = getContext()->getReadSettings().adjustBufferSize(object_size); - read_settings.enable_filesystem_cache = false; - auto download_buffer_size = getContext()->getSettings().max_download_buffer_size; - const bool object_too_small = object_size <= 2 * download_buffer_size; - - // Create a read buffer that will prefetch the first ~1 MB of the file. - // When reading lots of tiny files, this prefetching almost doubles the throughput. - // For bigger files, parallel reading is more useful. - if (object_too_small && read_settings.remote_fs_method == RemoteFSReadMethod::threadpool) - { - LOG_TRACE(log, "Downloading object of size {} from Azure with initial prefetch", object_size); - return createAsyncAzureReadBuffer(key, read_settings, object_size); - } - - return object_storage->readObject(StoredObject(key), read_settings, {}, object_size); -} - -namespace -{ - class ReadBufferIterator : public IReadBufferIterator, WithContext - { - public: - ReadBufferIterator( - const std::shared_ptr & file_iterator_, - AzureObjectStorage * object_storage_, - const StorageAzureBlob::Configuration & configuration_, - const std::optional & format_settings_, - const RelativePathsWithMetadata & read_keys_, - const ContextPtr & context_) - : WithContext(context_) - , file_iterator(file_iterator_) - , object_storage(object_storage_) - , configuration(configuration_) - , format_settings(format_settings_) - , read_keys(read_keys_) - , prev_read_keys_size(read_keys_.size()) - { - } - - std::pair, std::optional> next() override - { - /// For default mode check cached columns for currently read keys on first iteration. - if (first && getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT) - { - if (auto cached_columns = tryGetColumnsFromCache(read_keys.begin(), read_keys.end())) - return {nullptr, cached_columns}; - } - - current_path_with_metadata = file_iterator->next(); - - if (current_path_with_metadata.relative_path.empty()) - { - if (first) - throw Exception( - ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "Cannot extract table structure from {} format file, because there are no files with provided path " - "in AzureBlobStorage. You must specify table structure manually", configuration.format); - - return {nullptr, std::nullopt}; - } - - first = false; - - /// AzureBlobStorage file iterator could get new keys after new iteration, check them in schema cache if schema inference mode is default. 
- if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT && read_keys.size() > prev_read_keys_size) - { - auto columns_from_cache = tryGetColumnsFromCache(read_keys.begin() + prev_read_keys_size, read_keys.end()); - prev_read_keys_size = read_keys.size(); - if (columns_from_cache) - return {nullptr, columns_from_cache}; - } - else if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::UNION) - { - RelativePathsWithMetadata paths = {current_path_with_metadata}; - if (auto columns_from_cache = tryGetColumnsFromCache(paths.begin(), paths.end())) - return {nullptr, columns_from_cache}; - } - - first = false; - int zstd_window_log_max = static_cast(getContext()->getSettingsRef().zstd_window_log_max); - return {wrapReadBufferWithCompressionMethod( - object_storage->readObject(StoredObject(current_path_with_metadata.relative_path), getContext()->getReadSettings(), {}, current_path_with_metadata.metadata.size_bytes), - chooseCompressionMethod(current_path_with_metadata.relative_path, configuration.compression_method), - zstd_window_log_max), std::nullopt}; - } - - void setNumRowsToLastFile(size_t num_rows) override - { - if (!getContext()->getSettingsRef().schema_inference_use_cache_for_azure) - return; - - String source = fs::path(configuration.connection_url) / configuration.container / current_path_with_metadata.relative_path; - auto key = getKeyForSchemaCache(source, configuration.format, format_settings, getContext()); - StorageAzureBlob::getSchemaCache(getContext()).addNumRows(key, num_rows); - } - - void setSchemaToLastFile(const ColumnsDescription & columns) override - { - if (!getContext()->getSettingsRef().schema_inference_use_cache_for_azure - || getContext()->getSettingsRef().schema_inference_mode != SchemaInferenceMode::UNION) - return; - - String source = fs::path(configuration.connection_url) / configuration.container / current_path_with_metadata.relative_path; - auto key = getKeyForSchemaCache(source, configuration.format, format_settings, getContext()); - StorageAzureBlob::getSchemaCache(getContext()).addColumns(key, columns); - } - - void setResultingSchema(const ColumnsDescription & columns) override - { - if (!getContext()->getSettingsRef().schema_inference_use_cache_for_azure - || getContext()->getSettingsRef().schema_inference_mode != SchemaInferenceMode::DEFAULT) - return; - - auto host_and_bucket = configuration.connection_url + '/' + configuration.container; - Strings sources; - sources.reserve(read_keys.size()); - std::transform(read_keys.begin(), read_keys.end(), std::back_inserter(sources), [&](const auto & elem){ return host_and_bucket + '/' + elem.relative_path; }); - auto cache_keys = getKeysForSchemaCache(sources, configuration.format, format_settings, getContext()); - StorageAzureBlob::getSchemaCache(getContext()).addManyColumns(cache_keys, columns); - } - - String getLastFileName() const override { return current_path_with_metadata.relative_path; } - - private: - std::optional tryGetColumnsFromCache(const RelativePathsWithMetadata::const_iterator & begin, const RelativePathsWithMetadata::const_iterator & end) - { - auto & schema_cache = StorageAzureBlob::getSchemaCache(getContext()); - for (auto it = begin; it < end; ++it) - { - auto get_last_mod_time = [&] -> std::optional - { - if (it->metadata.last_modified) - return it->metadata.last_modified->epochTime(); - return std::nullopt; - }; - - auto host_and_bucket = configuration.connection_url + '/' + configuration.container; - String source = 
host_and_bucket + '/' + it->relative_path; - auto cache_key = getKeyForSchemaCache(source, configuration.format, format_settings, getContext()); - auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time); - if (columns) - return columns; - } - - return std::nullopt; - } - - std::shared_ptr file_iterator; - AzureObjectStorage * object_storage; - const StorageAzureBlob::Configuration & configuration; - const std::optional & format_settings; - const RelativePathsWithMetadata & read_keys; - size_t prev_read_keys_size; - RelativePathWithMetadata current_path_with_metadata; - bool first = true; - }; -} - -ColumnsDescription StorageAzureBlob::getTableStructureFromData( - AzureObjectStorage * object_storage, - const Configuration & configuration, - const std::optional & format_settings, - ContextPtr ctx, - bool distributed_processing) -{ - RelativePathsWithMetadata read_keys; - std::shared_ptr file_iterator; - if (distributed_processing) - { - file_iterator = std::make_shared(ctx, - ctx->getReadTaskCallback()); - } - else if (configuration.withGlobs()) - { - file_iterator = std::make_shared( - object_storage, configuration.container, configuration.blob_path, nullptr, NamesAndTypesList{}, ctx, &read_keys); - } - else - { - file_iterator = std::make_shared( - object_storage, configuration.container, configuration.blobs_paths, nullptr, NamesAndTypesList{}, ctx, &read_keys); - } - - ReadBufferIterator read_buffer_iterator(file_iterator, object_storage, configuration, format_settings, read_keys, ctx); - return readSchemaFromFormat(configuration.format, format_settings, read_buffer_iterator, configuration.withGlobs(), ctx); -} - -SchemaCache & StorageAzureBlob::getSchemaCache(const ContextPtr & ctx) -{ - static SchemaCache schema_cache(ctx->getConfigRef().getUInt("schema_inference_cache_max_elements_for_azure", DEFAULT_SCHEMA_CACHE_ELEMENTS)); - return schema_cache; -} - - -std::unique_ptr StorageAzureBlobSource::createAsyncAzureReadBuffer( - const String & key, const ReadSettings & read_settings, size_t object_size) -{ - auto modified_settings{read_settings}; - modified_settings.remote_read_min_bytes_for_seek = modified_settings.remote_fs_buffer_size; - auto async_reader = object_storage->readObjects(StoredObjects{StoredObject{key, /* local_path */ "", object_size}}, modified_settings); - - async_reader->setReadUntilEnd(); - if (read_settings.remote_fs_prefetch) - async_reader->prefetch(DEFAULT_PREFETCH_PRIORITY); - - return async_reader; -} - -} - -#endif diff --git a/src/Storages/StorageAzureBlob.h b/src/Storages/StorageAzureBlob.h deleted file mode 100644 index 6fc3c5ce592..00000000000 --- a/src/Storages/StorageAzureBlob.h +++ /dev/null @@ -1,339 +0,0 @@ -#pragma once - -#include "config.h" - -#if USE_AZURE_BLOB_STORAGE - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace DB -{ - -class StorageAzureBlob : public IStorage -{ -public: - - using AzureClient = Azure::Storage::Blobs::BlobContainerClient; - using AzureClientPtr = std::unique_ptr; - - struct Configuration : public StatelessTableEngineConfiguration - { - Configuration() = default; - - String getPath() const { return blob_path; } - - bool update(ContextPtr context); - - void connect(ContextPtr context); - - bool withGlobs() const { return blob_path.find_first_of("*?{") != std::string::npos; } - - bool withWildcard() const - { - static const String PARTITION_ID_WILDCARD = "{_partition_id}"; - return blobs_paths.back().find(PARTITION_ID_WILDCARD) != String::npos; - } 
- - Poco::URI getConnectionURL() const; - - std::string connection_url; - bool is_connection_string; - - std::optional account_name; - std::optional account_key; - - std::string container; - std::string blob_path; - std::vector blobs_paths; - }; - - StorageAzureBlob( - const Configuration & configuration_, - std::unique_ptr && object_storage_, - ContextPtr context_, - const StorageID & table_id_, - const ColumnsDescription & columns_, - const ConstraintsDescription & constraints_, - const String & comment, - std::optional format_settings_, - bool distributed_processing_, - ASTPtr partition_by_); - - static StorageAzureBlob::Configuration getConfiguration(ASTs & engine_args, ContextPtr local_context); - static AzureClientPtr createClient(StorageAzureBlob::Configuration configuration, bool is_read_only); - - static AzureObjectStorage::SettingsPtr createSettings(ContextPtr local_context); - - static void processNamedCollectionResult(StorageAzureBlob::Configuration & configuration, const NamedCollection & collection); - - String getName() const override - { - return name; - } - - void read( - QueryPlan & query_plan, - const Names &, - const StorageSnapshotPtr &, - SelectQueryInfo &, - ContextPtr, - QueryProcessingStage::Enum, - size_t, - size_t) override; - - SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /* metadata_snapshot */, ContextPtr context, bool /*async_insert*/) override; - - void truncate(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context, TableExclusiveLockHolder &) override; - - NamesAndTypesList getVirtuals() const override; - static Names getVirtualColumnNames(); - - bool supportsPartitionBy() const override; - - bool supportsSubcolumns() const override { return true; } - - bool supportsSubsetOfColumns(const ContextPtr & context) const; - - bool supportsTrivialCountOptimization() const override { return true; } - - bool prefersLargeBlocks() const override; - - bool parallelizeOutputAfterReading(ContextPtr context) const override; - - static SchemaCache & getSchemaCache(const ContextPtr & ctx); - - static ColumnsDescription getTableStructureFromData( - AzureObjectStorage * object_storage, - const Configuration & configuration, - const std::optional & format_settings, - ContextPtr ctx, - bool distributed_processing = false); - -private: - friend class ReadFromAzureBlob; - - std::string name; - Configuration configuration; - std::unique_ptr object_storage; - NamesAndTypesList virtual_columns; - - const bool distributed_processing; - std::optional format_settings; - ASTPtr partition_by; -}; - -class StorageAzureBlobSource : public ISource, WithContext -{ -public: - class IIterator : public WithContext - { - public: - IIterator(ContextPtr context_):WithContext(context_) {} - virtual ~IIterator() = default; - virtual RelativePathWithMetadata next() = 0; - - RelativePathWithMetadata operator ()() { return next(); } - }; - - class GlobIterator : public IIterator - { - public: - GlobIterator( - AzureObjectStorage * object_storage_, - const std::string & container_, - String blob_path_with_globs_, - const ActionsDAG::Node * predicate, - const NamesAndTypesList & virtual_columns_, - ContextPtr context_, - RelativePathsWithMetadata * outer_blobs_, - std::function file_progress_callback_ = {}); - - RelativePathWithMetadata next() override; - ~GlobIterator() override = default; - - private: - AzureObjectStorage * object_storage; - std::string container; - String blob_path_with_globs; - ActionsDAGPtr filter_dag; - 
NamesAndTypesList virtual_columns; - - size_t index = 0; - - RelativePathsWithMetadata blobs_with_metadata; - RelativePathsWithMetadata * outer_blobs; - ObjectStorageIteratorPtr object_storage_iterator; - bool recursive{false}; - - std::unique_ptr matcher; - - void createFilterAST(const String & any_key); - bool is_finished = false; - std::mutex next_mutex; - - std::function file_progress_callback; - }; - - class ReadIterator : public IIterator - { - public: - explicit ReadIterator(ContextPtr context_, - const ReadTaskCallback & callback_) - : IIterator(context_), callback(callback_) { } - RelativePathWithMetadata next() override - { - return {callback(), {}}; - } - - private: - ReadTaskCallback callback; - }; - - class KeysIterator : public IIterator - { - public: - KeysIterator( - AzureObjectStorage * object_storage_, - const std::string & container_, - const Strings & keys_, - const ActionsDAG::Node * predicate, - const NamesAndTypesList & virtual_columns_, - ContextPtr context_, - RelativePathsWithMetadata * outer_blobs, - std::function file_progress_callback = {}); - - RelativePathWithMetadata next() override; - ~KeysIterator() override = default; - - private: - AzureObjectStorage * object_storage; - std::string container; - RelativePathsWithMetadata keys; - - ActionsDAGPtr filter_dag; - NamesAndTypesList virtual_columns; - - std::atomic index = 0; - }; - - StorageAzureBlobSource( - const ReadFromFormatInfo & info, - const String & format_, - String name_, - ContextPtr context_, - std::optional format_settings_, - UInt64 max_block_size_, - String compression_hint_, - AzureObjectStorage * object_storage_, - const String & container_, - const String & connection_url_, - std::shared_ptr file_iterator_, - bool need_only_count_); - ~StorageAzureBlobSource() override; - - Chunk generate() override; - - String getName() const override; - -private: - void addNumRowsToCache(const String & path, size_t num_rows); - std::optional tryGetNumRowsFromCache(const RelativePathWithMetadata & path_with_metadata); - - NamesAndTypesList requested_columns; - NamesAndTypesList requested_virtual_columns; - String format; - String name; - Block sample_block; - std::optional format_settings; - ColumnsDescription columns_desc; - UInt64 max_block_size; - String compression_hint; - AzureObjectStorage * object_storage; - String container; - String connection_url; - std::shared_ptr file_iterator; - bool need_only_count; - size_t total_rows_in_file = 0; - - struct ReaderHolder - { - public: - ReaderHolder( - RelativePathWithMetadata relative_path_with_metadata_, - std::unique_ptr read_buf_, - std::shared_ptr source_, - std::unique_ptr pipeline_, - std::unique_ptr reader_) - : relative_path_with_metadata(std::move(relative_path_with_metadata_)) - , read_buf(std::move(read_buf_)) - , source(std::move(source_)) - , pipeline(std::move(pipeline_)) - , reader(std::move(reader_)) - { - } - - ReaderHolder() = default; - ReaderHolder(const ReaderHolder & other) = delete; - ReaderHolder & operator=(const ReaderHolder & other) = delete; - - ReaderHolder(ReaderHolder && other) noexcept - { - *this = std::move(other); - } - - ReaderHolder & operator=(ReaderHolder && other) noexcept - { - /// The order of destruction is important. - /// reader uses pipeline, pipeline uses read_buf. 
- reader = std::move(other.reader); - pipeline = std::move(other.pipeline); - source = std::move(other.source); - read_buf = std::move(other.read_buf); - relative_path_with_metadata = std::move(other.relative_path_with_metadata); - return *this; - } - - explicit operator bool() const { return reader != nullptr; } - PullingPipelineExecutor * operator->() { return reader.get(); } - const PullingPipelineExecutor * operator->() const { return reader.get(); } - const String & getRelativePath() const { return relative_path_with_metadata.relative_path; } - const RelativePathWithMetadata & getRelativePathWithMetadata() const { return relative_path_with_metadata; } - const IInputFormat * getInputFormat() const { return dynamic_cast(source.get()); } - - private: - RelativePathWithMetadata relative_path_with_metadata; - std::unique_ptr read_buf; - std::shared_ptr source; - std::unique_ptr pipeline; - std::unique_ptr reader; - }; - - ReaderHolder reader; - - LoggerPtr log = getLogger("StorageAzureBlobSource"); - - ThreadPool create_reader_pool; - ThreadPoolCallbackRunner create_reader_scheduler; - std::future reader_future; - - /// Recreate ReadBuffer and Pipeline for each file. - ReaderHolder createReader(); - std::future createReaderAsync(); - - std::unique_ptr createAzureReadBuffer(const String & key, size_t object_size); - std::unique_ptr createAsyncAzureReadBuffer( - const String & key, const ReadSettings & read_settings, size_t object_size); -}; - -} - -#endif diff --git a/src/Storages/StorageAzureBlobCluster.cpp b/src/Storages/StorageAzureBlobCluster.cpp deleted file mode 100644 index 1d587512f38..00000000000 --- a/src/Storages/StorageAzureBlobCluster.cpp +++ /dev/null @@ -1,89 +0,0 @@ -#include "Storages/StorageAzureBlobCluster.h" - -#include "config.h" - -#if USE_AZURE_BLOB_STORAGE - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int LOGICAL_ERROR; -} - -StorageAzureBlobCluster::StorageAzureBlobCluster( - const String & cluster_name_, - const StorageAzureBlob::Configuration & configuration_, - std::unique_ptr && object_storage_, - const StorageID & table_id_, - const ColumnsDescription & columns_, - const ConstraintsDescription & constraints_, - ContextPtr context_, - bool structure_argument_was_provided_) - : IStorageCluster(cluster_name_, table_id_, getLogger("StorageAzureBlobCluster (" + table_id_.table_name + ")"), structure_argument_was_provided_) - , configuration{configuration_} - , object_storage(std::move(object_storage_)) -{ - context_->getGlobalContext()->getRemoteHostFilter().checkURL(configuration_.getConnectionURL()); - StorageInMemoryMetadata storage_metadata; - - if (columns_.empty()) - { - /// `format_settings` is set to std::nullopt, because StorageAzureBlobCluster is used only as table function - auto columns = StorageAzureBlob::getTableStructureFromData(object_storage.get(), configuration, /*format_settings=*/std::nullopt, context_, false); - storage_metadata.setColumns(columns); - } - else - storage_metadata.setColumns(columns_); - - storage_metadata.setConstraints(constraints_); - setInMemoryMetadata(storage_metadata); - - virtual_columns = VirtualColumnUtils::getPathFileAndSizeVirtualsForStorage(storage_metadata.getSampleBlock().getNamesAndTypesList()); -} - -void StorageAzureBlobCluster::addColumnsStructureToQuery(ASTPtr & query, const String & structure, const ContextPtr & context) -{ - ASTExpressionList 
* expression_list = extractTableFunctionArgumentsFromSelectQuery(query); - if (!expression_list) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected SELECT query from table function s3Cluster, got '{}'", queryToString(query)); - - TableFunctionAzureBlobStorageCluster::addColumnsStructureToArguments(expression_list->children, structure, context); -} - -RemoteQueryExecutor::Extension StorageAzureBlobCluster::getTaskIteratorExtension(const ActionsDAG::Node * predicate, const ContextPtr & context) const -{ - auto iterator = std::make_shared( - object_storage.get(), configuration.container, configuration.blob_path, - predicate, virtual_columns, context, nullptr); - auto callback = std::make_shared>([iterator]() mutable -> String{ return iterator->next().relative_path; }); - return RemoteQueryExecutor::Extension{ .task_iterator = std::move(callback) }; -} - -NamesAndTypesList StorageAzureBlobCluster::getVirtuals() const -{ - return virtual_columns; -} - - -} - -#endif diff --git a/src/Storages/StorageAzureBlobCluster.h b/src/Storages/StorageAzureBlobCluster.h deleted file mode 100644 index 2831b94f825..00000000000 --- a/src/Storages/StorageAzureBlobCluster.h +++ /dev/null @@ -1,56 +0,0 @@ -#pragma once - -#include "config.h" - -#if USE_AZURE_BLOB_STORAGE - -#include -#include - -#include "Client/Connection.h" -#include -#include -#include - -namespace DB -{ - -class Context; - -class StorageAzureBlobCluster : public IStorageCluster -{ -public: - StorageAzureBlobCluster( - const String & cluster_name_, - const StorageAzureBlob::Configuration & configuration_, - std::unique_ptr && object_storage_, - const StorageID & table_id_, - const ColumnsDescription & columns_, - const ConstraintsDescription & constraints_, - ContextPtr context_, - bool structure_argument_was_provided_); - - std::string getName() const override { return "AzureBlobStorageCluster"; } - - NamesAndTypesList getVirtuals() const override; - - RemoteQueryExecutor::Extension getTaskIteratorExtension(const ActionsDAG::Node * predicate, const ContextPtr & context) const override; - - bool supportsSubcolumns() const override { return true; } - - bool supportsTrivialCountOptimization() const override { return true; } - -private: - void updateBeforeRead(const ContextPtr & /*context*/) override {} - - void addColumnsStructureToQuery(ASTPtr & query, const String & structure, const ContextPtr & context) override; - - StorageAzureBlob::Configuration configuration; - NamesAndTypesList virtual_columns; - std::unique_ptr object_storage; -}; - - -} - -#endif diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp deleted file mode 100644 index 4fde6fd04f3..00000000000 --- a/src/Storages/StorageS3.cpp +++ /dev/null @@ -1,1905 +0,0 @@ -#include "config.h" - -#if USE_AWS_S3 - -#include - -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include - -#include -#include - -#include -#include -#include -#include -#include -#include -#include - - -#include -#include -#include - -#include - -#include - -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include - -#ifdef __clang__ -# pragma clang diagnostic push -# pragma clang diagnostic ignored "-Wzero-as-null-pointer-constant" -#endif -#include -#ifdef __clang__ -# pragma clang diagnostic pop -#endif - -namespace fs = std::filesystem; - - 
-namespace CurrentMetrics -{ - extern const Metric StorageS3Threads; - extern const Metric StorageS3ThreadsActive; - extern const Metric StorageS3ThreadsScheduled; -} - -namespace ProfileEvents -{ - extern const Event S3DeleteObjects; - extern const Event S3ListObjects; - extern const Event EngineFileLikeReadFiles; -} - -namespace DB -{ - -static const std::unordered_set required_configuration_keys = { - "url", -}; -static const std::unordered_set optional_configuration_keys = { - "format", - "compression", - "compression_method", - "structure", - "access_key_id", - "secret_access_key", - "session_token", - "filename", - "use_environment_credentials", - "max_single_read_retries", - "min_upload_part_size", - "upload_part_size_multiply_factor", - "upload_part_size_multiply_parts_count_threshold", - "max_single_part_upload_size", - "max_connections", - "expiration_window_seconds", - "no_sign_request" -}; - -namespace ErrorCodes -{ - extern const int CANNOT_PARSE_TEXT; - extern const int BAD_ARGUMENTS; - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; - extern const int S3_ERROR; - extern const int UNEXPECTED_EXPRESSION; - extern const int DATABASE_ACCESS_DENIED; - extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; - extern const int NOT_IMPLEMENTED; - extern const int CANNOT_COMPILE_REGEXP; - extern const int FILE_DOESNT_EXIST; -} - - -class ReadFromStorageS3Step : public SourceStepWithFilter -{ -public: - std::string getName() const override { return "ReadFromStorageS3Step"; } - - void initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &) override; - - void applyFilters() override; - - ReadFromStorageS3Step( - Block sample_block, - const Names & column_names_, - StorageSnapshotPtr storage_snapshot_, - StorageS3 & storage_, - ReadFromFormatInfo read_from_format_info_, - bool need_only_count_, - ContextPtr context_, - size_t max_block_size_, - size_t num_streams_) - : SourceStepWithFilter(DataStream{.header = std::move(sample_block)}) - , column_names(column_names_) - , storage_snapshot(std::move(storage_snapshot_)) - , storage(storage_) - , read_from_format_info(std::move(read_from_format_info_)) - , need_only_count(need_only_count_) - , local_context(std::move(context_)) - , max_block_size(max_block_size_) - , num_streams(num_streams_) - { - query_configuration = storage.updateConfigurationAndGetCopy(local_context); - virtual_columns = storage.getVirtuals(); - } - -private: - Names column_names; - StorageSnapshotPtr storage_snapshot; - StorageS3 & storage; - ReadFromFormatInfo read_from_format_info; - bool need_only_count; - StorageS3::Configuration query_configuration; - NamesAndTypesList virtual_columns; - - ContextPtr local_context; - - size_t max_block_size; - size_t num_streams; - - std::shared_ptr iterator_wrapper; - - void createIterator(const ActionsDAG::Node * predicate); -}; - - -class IOutputFormat; -using OutputFormatPtr = std::shared_ptr; - -class StorageS3Source::DisclosedGlobIterator::Impl : WithContext -{ -public: - Impl( - const S3::Client & client_, - const S3::URI & globbed_uri_, - const ActionsDAG::Node * predicate, - const NamesAndTypesList & virtual_columns_, - ContextPtr context_, - KeysWithInfo * read_keys_, - const S3Settings::RequestSettings & request_settings_, - std::function file_progress_callback_) - : WithContext(context_) - , client(client_.clone()) - , globbed_uri(globbed_uri_) - , virtual_columns(virtual_columns_) - , read_keys(read_keys_) - , request_settings(request_settings_) - , 
list_objects_pool(CurrentMetrics::StorageS3Threads, CurrentMetrics::StorageS3ThreadsActive, CurrentMetrics::StorageS3ThreadsScheduled, 1) - , list_objects_scheduler(threadPoolCallbackRunner(list_objects_pool, "ListObjects")) - , file_progress_callback(file_progress_callback_) - { - if (globbed_uri.bucket.find_first_of("*?{") != globbed_uri.bucket.npos) - throw Exception(ErrorCodes::UNEXPECTED_EXPRESSION, "Expression can not have wildcards inside bucket name"); - - const String key_prefix = globbed_uri.key.substr(0, globbed_uri.key.find_first_of("*?{")); - - /// We don't have to list bucket, because there is no asterisks. - if (key_prefix.size() == globbed_uri.key.size()) - { - buffer.emplace_back(std::make_shared(globbed_uri.key, std::nullopt)); - buffer_iter = buffer.begin(); - is_finished = true; - return; - } - - request.SetBucket(globbed_uri.bucket); - request.SetPrefix(key_prefix); - request.SetMaxKeys(static_cast(request_settings.list_object_keys_size)); - - outcome_future = listObjectsAsync(); - - matcher = std::make_unique(makeRegexpPatternFromGlobs(globbed_uri.key)); - if (!matcher->ok()) - throw Exception(ErrorCodes::CANNOT_COMPILE_REGEXP, - "Cannot compile regex from glob ({}): {}", globbed_uri.key, matcher->error()); - - recursive = globbed_uri.key == "/**" ? true : false; - - filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns); - fillInternalBufferAssumeLocked(); - } - - KeyWithInfoPtr next(size_t) - { - std::lock_guard lock(mutex); - return nextAssumeLocked(); - } - - size_t objectsCount() - { - return buffer.size(); - } - - ~Impl() - { - list_objects_pool.wait(); - } - -private: - using ListObjectsOutcome = Aws::S3::Model::ListObjectsV2Outcome; - - KeyWithInfoPtr nextAssumeLocked() - { - if (buffer_iter != buffer.end()) - { - auto answer = *buffer_iter; - ++buffer_iter; - - /// If url doesn't contain globs, we didn't list s3 bucket and didn't get object info for the key. - /// So we get object info lazily here on 'next()' request. - if (!answer->info) - { - answer->info = S3::getObjectInfo(*client, globbed_uri.bucket, answer->key, globbed_uri.version_id, request_settings); - if (file_progress_callback) - file_progress_callback(FileProgress(0, answer->info->size)); - } - - return answer; - } - - if (is_finished) - return {}; - - try - { - fillInternalBufferAssumeLocked(); - } - catch (...) - { - /// In case of exception thrown while listing new batch of files - /// iterator may be partially initialized and its further using may lead to UB. - /// Iterator is used by several processors from several threads and - /// it may take some time for threads to stop processors and they - /// may still use this iterator after exception is thrown. - /// To avoid this UB, reset the buffer and return defaults for further calls. 
- is_finished = true; - buffer.clear(); - buffer_iter = buffer.begin(); - throw; - } - - return nextAssumeLocked(); - } - - void fillInternalBufferAssumeLocked() - { - buffer.clear(); - assert(outcome_future.valid()); - auto outcome = outcome_future.get(); - - if (!outcome.IsSuccess()) - { - throw S3Exception(outcome.GetError().GetErrorType(), "Could not list objects in bucket {} with prefix {}, S3 exception: {}, message: {}", - quoteString(request.GetBucket()), quoteString(request.GetPrefix()), - backQuote(outcome.GetError().GetExceptionName()), quoteString(outcome.GetError().GetMessage())); - } - - const auto & result_batch = outcome.GetResult().GetContents(); - - /// It returns false when all objects were returned - is_finished = !outcome.GetResult().GetIsTruncated(); - - if (!is_finished) - { - /// Even if task is finished the thread may be not freed in pool. - /// So wait until it will be freed before scheduling a new task. - list_objects_pool.wait(); - outcome_future = listObjectsAsync(); - } - - if (request_settings.throw_on_zero_files_match && result_batch.empty()) - throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "Can not match any files using prefix {}", request.GetPrefix()); - - KeysWithInfo temp_buffer; - temp_buffer.reserve(result_batch.size()); - - for (const auto & row : result_batch) - { - String key = row.GetKey(); - if (recursive || re2::RE2::FullMatch(key, *matcher)) - { - S3::ObjectInfo info = - { - .size = size_t(row.GetSize()), - .last_modification_time = row.GetLastModified().Millis() / 1000, - }; - - temp_buffer.emplace_back(std::make_shared(std::move(key), std::move(info))); - } - } - - if (temp_buffer.empty()) - { - buffer_iter = buffer.begin(); - return; - } - - if (filter_dag) - { - std::vector paths; - paths.reserve(temp_buffer.size()); - for (const auto & key_with_info : temp_buffer) - paths.push_back(fs::path(globbed_uri.bucket) / key_with_info->key); - - VirtualColumnUtils::filterByPathOrFile(temp_buffer, paths, filter_dag, virtual_columns, getContext()); - } - - buffer = std::move(temp_buffer); - - if (file_progress_callback) - { - for (const auto & key_with_info : buffer) - file_progress_callback(FileProgress(0, key_with_info->info->size)); - } - - /// Set iterator only after the whole batch is processed - buffer_iter = buffer.begin(); - - if (read_keys) - read_keys->insert(read_keys->end(), buffer.begin(), buffer.end()); - } - - std::future listObjectsAsync() - { - return list_objects_scheduler([this] - { - ProfileEvents::increment(ProfileEvents::S3ListObjects); - auto outcome = client->ListObjectsV2(request); - - /// Outcome failure will be handled on the caller side. 
- if (outcome.IsSuccess()) - request.SetContinuationToken(outcome.GetResult().GetNextContinuationToken()); - - return outcome; - }, Priority{}); - } - - std::mutex mutex; - - KeysWithInfo buffer; - KeysWithInfo::iterator buffer_iter; - - std::unique_ptr client; - S3::URI globbed_uri; - ASTPtr query; - NamesAndTypesList virtual_columns; - ActionsDAGPtr filter_dag; - std::unique_ptr matcher; - bool recursive{false}; - bool is_finished{false}; - KeysWithInfo * read_keys; - - S3::ListObjectsV2Request request; - S3Settings::RequestSettings request_settings; - - ThreadPool list_objects_pool; - ThreadPoolCallbackRunner list_objects_scheduler; - std::future outcome_future; - std::function file_progress_callback; -}; - -StorageS3Source::DisclosedGlobIterator::DisclosedGlobIterator( - const S3::Client & client_, - const S3::URI & globbed_uri_, - const ActionsDAG::Node * predicate, - const NamesAndTypesList & virtual_columns_, - ContextPtr context, - KeysWithInfo * read_keys_, - const S3Settings::RequestSettings & request_settings_, - std::function file_progress_callback_) - : pimpl(std::make_shared(client_, globbed_uri_, predicate, virtual_columns_, context, read_keys_, request_settings_, file_progress_callback_)) -{ -} - -StorageS3Source::KeyWithInfoPtr StorageS3Source::DisclosedGlobIterator::next(size_t idx) /// NOLINT -{ - return pimpl->next(idx); -} - -size_t StorageS3Source::DisclosedGlobIterator::estimatedKeysCount() -{ - return pimpl->objectsCount(); -} - -class StorageS3Source::KeysIterator::Impl -{ -public: - explicit Impl( - const S3::Client & client_, - const std::string & version_id_, - const std::vector & keys_, - const String & bucket_, - const S3Settings::RequestSettings & request_settings_, - KeysWithInfo * read_keys_, - std::function file_progress_callback_) - : keys(keys_) - , client(client_.clone()) - , version_id(version_id_) - , bucket(bucket_) - , request_settings(request_settings_) - , file_progress_callback(file_progress_callback_) - { - if (read_keys_) - { - for (const auto & key : keys) - read_keys_->push_back(std::make_shared(key)); - } - } - - KeyWithInfoPtr next(size_t) - { - size_t current_index = index.fetch_add(1, std::memory_order_relaxed); - if (current_index >= keys.size()) - return {}; - auto key = keys[current_index]; - std::optional info; - if (file_progress_callback) - { - info = S3::getObjectInfo(*client, bucket, key, version_id, request_settings); - file_progress_callback(FileProgress(0, info->size)); - } - - return std::make_shared(key, info); - } - - size_t objectsCount() - { - return keys.size(); - } - -private: - Strings keys; - std::atomic_size_t index = 0; - std::unique_ptr client; - String version_id; - String bucket; - S3Settings::RequestSettings request_settings; - std::function file_progress_callback; -}; - -StorageS3Source::KeysIterator::KeysIterator( - const S3::Client & client_, - const std::string & version_id_, - const std::vector & keys_, - const String & bucket_, - const S3Settings::RequestSettings & request_settings_, - KeysWithInfo * read_keys, - std::function file_progress_callback_) - : pimpl(std::make_shared( - client_, version_id_, keys_, bucket_, request_settings_, - read_keys, file_progress_callback_)) -{ -} - -StorageS3Source::KeyWithInfoPtr StorageS3Source::KeysIterator::next(size_t idx) /// NOLINT -{ - return pimpl->next(idx); -} - -size_t StorageS3Source::KeysIterator::estimatedKeysCount() -{ - return pimpl->objectsCount(); -} - -StorageS3Source::ReadTaskIterator::ReadTaskIterator( - const DB::ReadTaskCallback & callback_, - 
size_t max_threads_count) - : callback(callback_) -{ - ThreadPool pool(CurrentMetrics::StorageS3Threads, CurrentMetrics::StorageS3ThreadsActive, CurrentMetrics::StorageS3ThreadsScheduled, max_threads_count); - auto pool_scheduler = threadPoolCallbackRunner(pool, "S3ReadTaskItr"); - - std::vector> keys; - keys.reserve(max_threads_count); - for (size_t i = 0; i < max_threads_count; ++i) - keys.push_back(pool_scheduler([this] { return callback(); }, Priority{})); - - pool.wait(); - buffer.reserve(max_threads_count); - for (auto & key_future : keys) - buffer.emplace_back(std::make_shared(key_future.get(), std::nullopt)); -} - -StorageS3Source::KeyWithInfoPtr StorageS3Source::ReadTaskIterator::next(size_t) /// NOLINT -{ - size_t current_index = index.fetch_add(1, std::memory_order_relaxed); - if (current_index >= buffer.size()) - return std::make_shared(callback()); - - return buffer[current_index]; -} - -size_t StorageS3Source::ReadTaskIterator::estimatedKeysCount() -{ - return buffer.size(); -} - -StorageS3Source::StorageS3Source( - const ReadFromFormatInfo & info, - const String & format_, - String name_, - ContextPtr context_, - std::optional format_settings_, - UInt64 max_block_size_, - const S3Settings::RequestSettings & request_settings_, - String compression_hint_, - const std::shared_ptr & client_, - const String & bucket_, - const String & version_id_, - const String & url_host_and_port_, - std::shared_ptr file_iterator_, - const size_t max_parsing_threads_, - bool need_only_count_) - : SourceWithKeyCondition(info.source_header, false) - , WithContext(context_) - , name(std::move(name_)) - , bucket(bucket_) - , version_id(version_id_) - , url_host_and_port(url_host_and_port_) - , format(format_) - , columns_desc(info.columns_description) - , requested_columns(info.requested_columns) - , max_block_size(max_block_size_) - , request_settings(request_settings_) - , compression_hint(std::move(compression_hint_)) - , client(client_) - , sample_block(info.format_header) - , format_settings(format_settings_) - , requested_virtual_columns(info.requested_virtual_columns) - , file_iterator(file_iterator_) - , max_parsing_threads(max_parsing_threads_) - , need_only_count(need_only_count_) - , create_reader_pool(CurrentMetrics::StorageS3Threads, CurrentMetrics::StorageS3ThreadsActive, CurrentMetrics::StorageS3ThreadsScheduled, 1) - , create_reader_scheduler(threadPoolCallbackRunner(create_reader_pool, "CreateS3Reader")) -{ -} - -void StorageS3Source::lazyInitialize(size_t idx) -{ - if (initialized) - return; - - reader = createReader(idx); - if (reader) - reader_future = createReaderAsync(idx); - initialized = true; -} - -StorageS3Source::ReaderHolder StorageS3Source::createReader(size_t idx) -{ - KeyWithInfoPtr key_with_info; - do - { - key_with_info = file_iterator->next(idx); - if (!key_with_info || key_with_info->key.empty()) - return {}; - - if (!key_with_info->info) - key_with_info->info = S3::getObjectInfo(*client, bucket, key_with_info->key, version_id, request_settings); - } - while (getContext()->getSettingsRef().s3_skip_empty_files && key_with_info->info->size == 0); - - QueryPipelineBuilder builder; - std::shared_ptr source; - std::unique_ptr read_buf; - std::optional num_rows_from_cache = need_only_count && getContext()->getSettingsRef().use_cache_for_count_from_files ? 
tryGetNumRowsFromCache(*key_with_info) : std::nullopt; - if (num_rows_from_cache) - { - /// We should not return single chunk with all number of rows, - /// because there is a chance that this chunk will be materialized later - /// (it can cause memory problems even with default values in columns or when virtual columns are requested). - /// Instead, we use special ConstChunkGenerator that will generate chunks - /// with max_block_size rows until total number of rows is reached. - source = std::make_shared(sample_block, *num_rows_from_cache, max_block_size); - builder.init(Pipe(source)); - } - else - { - auto compression_method = chooseCompressionMethod(key_with_info->key, compression_hint); - read_buf = createS3ReadBuffer(key_with_info->key, key_with_info->info->size); - - auto input_format = FormatFactory::instance().getInput( - format, - *read_buf, - sample_block, - getContext(), - max_block_size, - format_settings, - max_parsing_threads, - /* max_download_threads= */ std::nullopt, - /* is_remote_fs */ true, - compression_method, - need_only_count); - - if (key_condition) - input_format->setKeyCondition(key_condition); - - if (need_only_count) - input_format->needOnlyCount(); - - builder.init(Pipe(input_format)); - - if (columns_desc.hasDefaults()) - { - builder.addSimpleTransform( - [&](const Block & header) - { return std::make_shared(header, columns_desc, *input_format, getContext()); }); - } - - source = input_format; - } - - /// Add ExtractColumnsTransform to extract requested columns/subcolumns - /// from chunk read by IInputFormat. - builder.addSimpleTransform([&](const Block & header) - { - return std::make_shared(header, requested_columns); - }); - - auto pipeline = std::make_unique(QueryPipelineBuilder::getPipeline(std::move(builder))); - auto current_reader = std::make_unique(*pipeline); - - ProfileEvents::increment(ProfileEvents::EngineFileLikeReadFiles); - - return ReaderHolder{key_with_info, bucket, std::move(read_buf), std::move(source), std::move(pipeline), std::move(current_reader)}; -} - -std::future StorageS3Source::createReaderAsync(size_t idx) -{ - return create_reader_scheduler([=, this] { return createReader(idx); }, Priority{}); -} - -std::unique_ptr StorageS3Source::createS3ReadBuffer(const String & key, size_t object_size) -{ - auto read_settings = getContext()->getReadSettings().adjustBufferSize(object_size); - read_settings.enable_filesystem_cache = false; - auto download_buffer_size = getContext()->getSettings().max_download_buffer_size; - const bool object_too_small = object_size <= 2 * download_buffer_size; - - // Create a read buffer that will prefetch the first ~1 MB of the file. - // When reading lots of tiny files, this prefetching almost doubles the throughput. - // For bigger files, parallel reading is more useful. 
- if (object_too_small && read_settings.remote_fs_method == RemoteFSReadMethod::threadpool) - { - LOG_TRACE(log, "Downloading object of size {} from S3 with initial prefetch", object_size); - return createAsyncS3ReadBuffer(key, read_settings, object_size); - } - - return std::make_unique( - client, bucket, key, version_id, request_settings, read_settings, - /*use_external_buffer*/ false, /*offset_*/ 0, /*read_until_position_*/ 0, - /*restricted_seek_*/ false, object_size); -} - -std::unique_ptr StorageS3Source::createAsyncS3ReadBuffer( - const String & key, const ReadSettings & read_settings, size_t object_size) -{ - auto context = getContext(); - auto read_buffer_creator = - [this, read_settings, object_size] - (const std::string & path, size_t read_until_position) -> std::unique_ptr - { - return std::make_unique( - client, - bucket, - path, - version_id, - request_settings, - read_settings, - /* use_external_buffer */true, - /* offset */0, - read_until_position, - /* restricted_seek */true, - object_size); - }; - - auto s3_impl = std::make_unique( - std::move(read_buffer_creator), - StoredObjects{StoredObject{key, /* local_path */ "", object_size}}, - read_settings, - /* cache_log */nullptr, /* use_external_buffer */true); - - auto modified_settings{read_settings}; - /// FIXME: Changing this setting to default value breaks something around parquet reading - modified_settings.remote_read_min_bytes_for_seek = modified_settings.remote_fs_buffer_size; - - auto & pool_reader = context->getThreadPoolReader(FilesystemReaderType::ASYNCHRONOUS_REMOTE_FS_READER); - auto async_reader = std::make_unique( - std::move(s3_impl), pool_reader, modified_settings, - context->getAsyncReadCounters(), context->getFilesystemReadPrefetchesLog()); - - async_reader->setReadUntilEnd(); - if (read_settings.remote_fs_prefetch) - async_reader->prefetch(DEFAULT_PREFETCH_PRIORITY); - - return async_reader; -} - -StorageS3Source::~StorageS3Source() -{ - create_reader_pool.wait(); -} - -String StorageS3Source::getName() const -{ - return name; -} - -Chunk StorageS3Source::generate() -{ - lazyInitialize(); - - while (true) - { - if (isCancelled() || !reader) - { - if (reader) - reader->cancel(); - break; - } - - Chunk chunk; - if (reader->pull(chunk)) - { - UInt64 num_rows = chunk.getNumRows(); - total_rows_in_file += num_rows; - size_t chunk_size = 0; - if (const auto * input_format = reader.getInputFormat()) - chunk_size = reader.getInputFormat()->getApproxBytesReadForChunk(); - progress(num_rows, chunk_size ? chunk_size : chunk.bytes()); - VirtualColumnUtils::addRequestedPathFileAndSizeVirtualsToChunk(chunk, requested_virtual_columns, reader.getPath(), reader.getFileSize()); - return chunk; - } - - if (reader.getInputFormat() && getContext()->getSettingsRef().use_cache_for_count_from_files) - addNumRowsToCache(reader.getFile(), total_rows_in_file); - - total_rows_in_file = 0; - - assert(reader_future.valid()); - reader = reader_future.get(); - - if (!reader) - break; - - /// Even if task is finished the thread may be not freed in pool. - /// So wait until it will be freed before scheduling a new task. 
- create_reader_pool.wait(); - reader_future = createReaderAsync(); - } - - return {}; -} - -void StorageS3Source::addNumRowsToCache(const String & key, size_t num_rows) -{ - String source = fs::path(url_host_and_port) / bucket / key; - auto cache_key = getKeyForSchemaCache(source, format, format_settings, getContext()); - StorageS3::getSchemaCache(getContext()).addNumRows(cache_key, num_rows); -} - -std::optional StorageS3Source::tryGetNumRowsFromCache(const KeyWithInfo & key_with_info) -{ - String source = fs::path(url_host_and_port) / bucket / key_with_info.key; - auto cache_key = getKeyForSchemaCache(source, format, format_settings, getContext()); - auto get_last_mod_time = [&]() -> std::optional - { - return key_with_info.info->last_modification_time; - }; - - return StorageS3::getSchemaCache(getContext()).tryGetNumRows(cache_key, get_last_mod_time); -} - -class StorageS3Sink : public SinkToStorage -{ -public: - StorageS3Sink( - const String & format, - const Block & sample_block_, - ContextPtr context, - std::optional format_settings_, - const CompressionMethod compression_method, - const StorageS3::Configuration & configuration_, - const String & bucket, - const String & key) - : SinkToStorage(sample_block_) - , sample_block(sample_block_) - , format_settings(format_settings_) - { - BlobStorageLogWriterPtr blob_log = nullptr; - if (auto blob_storage_log = context->getBlobStorageLog()) - { - blob_log = std::make_shared(std::move(blob_storage_log)); - blob_log->query_id = context->getCurrentQueryId(); - } - - const auto & settings = context->getSettingsRef(); - write_buf = wrapWriteBufferWithCompressionMethod( - std::make_unique( - configuration_.client, - bucket, - key, - DBMS_DEFAULT_BUFFER_SIZE, - configuration_.request_settings, - std::move(blob_log), - std::nullopt, - threadPoolCallbackRunner(getIOThreadPool().get(), "S3ParallelWrite"), - context->getWriteSettings()), - compression_method, - static_cast(settings.output_format_compression_level), - static_cast(settings.output_format_compression_zstd_window_log)); - writer - = FormatFactory::instance().getOutputFormatParallelIfPossible(format, *write_buf, sample_block, context, format_settings); - } - - String getName() const override { return "StorageS3Sink"; } - - void consume(Chunk chunk) override - { - std::lock_guard lock(cancel_mutex); - if (cancelled) - return; - writer->write(getHeader().cloneWithColumns(chunk.detachColumns())); - } - - void onCancel() override - { - std::lock_guard lock(cancel_mutex); - finalize(); - cancelled = true; - } - - void onException(std::exception_ptr exception) override - { - std::lock_guard lock(cancel_mutex); - try - { - std::rethrow_exception(exception); - } - catch (...) - { - /// An exception context is needed to proper delete write buffers without finalization - release(); - } - } - - void onFinish() override - { - std::lock_guard lock(cancel_mutex); - finalize(); - } - -private: - void finalize() - { - if (!writer) - return; - - try - { - writer->finalize(); - writer->flush(); - write_buf->finalize(); - } - catch (...) - { - /// Stop ParallelFormattingOutputFormat correctly. 
- release(); - throw; - } - } - - void release() - { - writer.reset(); - write_buf.reset(); - } - - Block sample_block; - std::optional format_settings; - std::unique_ptr write_buf; - OutputFormatPtr writer; - bool cancelled = false; - std::mutex cancel_mutex; -}; - - -class PartitionedStorageS3Sink : public PartitionedSink -{ -public: - PartitionedStorageS3Sink( - const ASTPtr & partition_by, - const String & format_, - const Block & sample_block_, - ContextPtr context_, - std::optional format_settings_, - const CompressionMethod compression_method_, - const StorageS3::Configuration & configuration_, - const String & bucket_, - const String & key_) - : PartitionedSink(partition_by, context_, sample_block_) - , format(format_) - , sample_block(sample_block_) - , context(context_) - , compression_method(compression_method_) - , configuration(configuration_) - , bucket(bucket_) - , key(key_) - , format_settings(format_settings_) - { - } - - SinkPtr createSinkForPartition(const String & partition_id) override - { - auto partition_bucket = replaceWildcards(bucket, partition_id); - validateBucket(partition_bucket); - - auto partition_key = replaceWildcards(key, partition_id); - validateKey(partition_key); - - return std::make_shared( - format, - sample_block, - context, - format_settings, - compression_method, - configuration, - partition_bucket, - partition_key - ); - } - -private: - const String format; - const Block sample_block; - const ContextPtr context; - const CompressionMethod compression_method; - const StorageS3::Configuration configuration; - const String bucket; - const String key; - const std::optional format_settings; - - static void validateBucket(const String & str) - { - S3::URI::validateBucket(str, {}); - - if (!DB::UTF8::isValidUTF8(reinterpret_cast(str.data()), str.size())) - throw Exception(ErrorCodes::CANNOT_PARSE_TEXT, "Incorrect non-UTF8 sequence in bucket name"); - - validatePartitionKey(str, false); - } - - static void validateKey(const String & str) - { - /// See: - /// - https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-keys.html - /// - https://cloud.ibm.com/apidocs/cos/cos-compatibility#putobject - - if (str.empty() || str.size() > 1024) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Incorrect key length (not empty, max 1023 characters), got: {}", str.size()); - - if (!DB::UTF8::isValidUTF8(reinterpret_cast(str.data()), str.size())) - throw Exception(ErrorCodes::CANNOT_PARSE_TEXT, "Incorrect non-UTF8 sequence in key"); - - validatePartitionKey(str, true); - } -}; - - -StorageS3::StorageS3( - const Configuration & configuration_, - ContextPtr context_, - const StorageID & table_id_, - const ColumnsDescription & columns_, - const ConstraintsDescription & constraints_, - const String & comment, - std::optional format_settings_, - bool distributed_processing_, - ASTPtr partition_by_) - : IStorage(table_id_) - , configuration(configuration_) - , name(configuration.url.storage_name) - , distributed_processing(distributed_processing_) - , format_settings(format_settings_) - , partition_by(partition_by_) -{ - updateConfiguration(context_); // NOLINT(clang-analyzer-optin.cplusplus.VirtualCall) - - FormatFactory::instance().checkFormatName(configuration.format); - context_->getGlobalContext()->getRemoteHostFilter().checkURL(configuration.url.uri); - context_->getGlobalContext()->getHTTPHeaderFilter().checkHeaders(configuration.headers_from_ast); - - StorageInMemoryMetadata storage_metadata; - if (columns_.empty()) - { - auto columns = 
getTableStructureFromDataImpl(configuration, format_settings, context_); - storage_metadata.setColumns(columns); - } - else - { - /// We don't allow special columns in S3 storage. - if (!columns_.hasOnlyOrdinary()) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Table engine S3 doesn't support special columns like MATERIALIZED, ALIAS or EPHEMERAL"); - storage_metadata.setColumns(columns_); - } - - storage_metadata.setConstraints(constraints_); - storage_metadata.setComment(comment); - setInMemoryMetadata(storage_metadata); - - virtual_columns = VirtualColumnUtils::getPathFileAndSizeVirtualsForStorage(storage_metadata.getSampleBlock().getNamesAndTypesList()); -} - -static std::shared_ptr createFileIterator( - const StorageS3::Configuration & configuration, - bool distributed_processing, - ContextPtr local_context, - const ActionsDAG::Node * predicate, - const NamesAndTypesList & virtual_columns, - StorageS3::KeysWithInfo * read_keys = nullptr, - std::function file_progress_callback = {}) -{ - if (distributed_processing) - { - return std::make_shared(local_context->getReadTaskCallback(), local_context->getSettingsRef().max_threads); - } - else if (configuration.withGlobs()) - { - /// Iterate through disclosed globs and make a source for each file - return std::make_shared( - *configuration.client, configuration.url, predicate, virtual_columns, - local_context, read_keys, configuration.request_settings, file_progress_callback); - } - else - { - Strings keys = configuration.keys; - auto filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns); - if (filter_dag) - { - std::vector paths; - paths.reserve(keys.size()); - for (const auto & key : keys) - paths.push_back(fs::path(configuration.url.bucket) / key); - VirtualColumnUtils::filterByPathOrFile(keys, paths, filter_dag, virtual_columns, local_context); - } - - return std::make_shared( - *configuration.client, configuration.url.version_id, keys, - configuration.url.bucket, configuration.request_settings, read_keys, file_progress_callback); - } -} - -bool StorageS3::supportsSubsetOfColumns(const ContextPtr & context) const -{ - return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(configuration.format, context, format_settings); -} - -bool StorageS3::prefersLargeBlocks() const -{ - return FormatFactory::instance().checkIfOutputFormatPrefersLargeBlocks(configuration.format); -} - -bool StorageS3::parallelizeOutputAfterReading(ContextPtr context) const -{ - return FormatFactory::instance().checkParallelizeOutputAfterReading(configuration.format, context); -} - -void StorageS3::read( - QueryPlan & query_plan, - const Names & column_names, - const StorageSnapshotPtr & storage_snapshot, - SelectQueryInfo & query_info, - ContextPtr local_context, - QueryProcessingStage::Enum /*processed_stage*/, - size_t max_block_size, - size_t num_streams) -{ - auto read_from_format_info = prepareReadingFromFormat(column_names, storage_snapshot, supportsSubsetOfColumns(local_context), virtual_columns); - - bool need_only_count = (query_info.optimize_trivial_count || read_from_format_info.requested_columns.empty()) - && local_context->getSettingsRef().optimize_count_from_files; - - auto reading = std::make_unique( - read_from_format_info.source_header, - column_names, - storage_snapshot, - *this, - std::move(read_from_format_info), - need_only_count, - local_context, - max_block_size, - num_streams); - - query_plan.addStep(std::move(reading)); -} - -void ReadFromStorageS3Step::applyFilters() -{ - auto filter_actions_dag = 
ActionsDAG::buildFilterActionsDAG(filter_nodes.nodes); - const ActionsDAG::Node * predicate = nullptr; - if (filter_actions_dag) - predicate = filter_actions_dag->getOutputs().at(0); - - createIterator(predicate); -} - -void ReadFromStorageS3Step::createIterator(const ActionsDAG::Node * predicate) -{ - if (iterator_wrapper) - return; - - iterator_wrapper = createFileIterator( - query_configuration, storage.distributed_processing, local_context, predicate, - virtual_columns, nullptr, local_context->getFileProgressCallback()); -} - -void ReadFromStorageS3Step::initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &) -{ - if (storage.partition_by && query_configuration.withWildcard()) - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Reading from a partitioned S3 storage is not implemented yet"); - - createIterator(nullptr); - - size_t estimated_keys_count = iterator_wrapper->estimatedKeysCount(); - if (estimated_keys_count > 1) - num_streams = std::min(num_streams, estimated_keys_count); - else - /// Disclosed glob iterator can underestimate the amount of keys in some cases. We will keep one stream for this particular case. - num_streams = 1; - - const size_t max_threads = local_context->getSettingsRef().max_threads; - const size_t max_parsing_threads = num_streams >= max_threads ? 1 : (max_threads / std::max(num_streams, 1ul)); - LOG_DEBUG(getLogger("StorageS3"), "Reading in {} streams, {} threads per stream", num_streams, max_parsing_threads); - - Pipes pipes; - pipes.reserve(num_streams); - for (size_t i = 0; i < num_streams; ++i) - { - auto source = std::make_shared( - read_from_format_info, - query_configuration.format, - storage.getName(), - local_context, - storage.format_settings, - max_block_size, - query_configuration.request_settings, - query_configuration.compression_method, - query_configuration.client, - query_configuration.url.bucket, - query_configuration.url.version_id, - query_configuration.url.uri.getHost() + std::to_string(query_configuration.url.uri.getPort()), - iterator_wrapper, - max_parsing_threads, - need_only_count); - - source->setKeyCondition(filter_nodes.nodes, local_context); - pipes.emplace_back(std::move(source)); - } - - auto pipe = Pipe::unitePipes(std::move(pipes)); - if (pipe.empty()) - pipe = Pipe(std::make_shared(read_from_format_info.source_header)); - - for (const auto & processor : pipe.getProcessors()) - processors.emplace_back(processor); - - pipeline.init(std::move(pipe)); -} - -SinkToStoragePtr StorageS3::write(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context, bool /*async_insert*/) -{ - auto query_configuration = updateConfigurationAndGetCopy(local_context); - - auto sample_block = metadata_snapshot->getSampleBlock(); - auto chosen_compression_method = chooseCompressionMethod(query_configuration.keys.back(), query_configuration.compression_method); - auto insert_query = std::dynamic_pointer_cast(query); - - auto partition_by_ast = insert_query ? (insert_query->partition_by ? 
insert_query->partition_by : partition_by) : nullptr; - bool is_partitioned_implementation = partition_by_ast && query_configuration.withWildcard(); - - if (is_partitioned_implementation) - { - return std::make_shared( - partition_by_ast, - query_configuration.format, - sample_block, - local_context, - format_settings, - chosen_compression_method, - query_configuration, - query_configuration.url.bucket, - query_configuration.keys.back()); - } - else - { - if (query_configuration.withGlobs()) - throw Exception(ErrorCodes::DATABASE_ACCESS_DENIED, - "S3 key '{}' contains globs, so the table is in readonly mode", query_configuration.url.key); - - bool truncate_in_insert = local_context->getSettingsRef().s3_truncate_on_insert; - - if (!truncate_in_insert && S3::objectExists(*query_configuration.client, query_configuration.url.bucket, query_configuration.keys.back(), query_configuration.url.version_id, query_configuration.request_settings)) - { - if (local_context->getSettingsRef().s3_create_new_file_on_insert) - { - size_t index = query_configuration.keys.size(); - const auto & first_key = query_configuration.keys[0]; - auto pos = first_key.find_first_of('.'); - String new_key; - do - { - new_key = first_key.substr(0, pos) + "." + std::to_string(index) + (pos == std::string::npos ? "" : first_key.substr(pos)); - ++index; - } - while (S3::objectExists(*query_configuration.client, query_configuration.url.bucket, new_key, query_configuration.url.version_id, query_configuration.request_settings)); - - query_configuration.keys.push_back(new_key); - configuration.keys.push_back(new_key); - } - else - { - throw Exception( - ErrorCodes::BAD_ARGUMENTS, - "Object in bucket {} with key {} already exists. " - "If you want to overwrite it, enable setting s3_truncate_on_insert, if you " - "want to create a new file on each insert, enable setting s3_create_new_file_on_insert", - query_configuration.url.bucket, query_configuration.keys.back()); - } - } - - return std::make_shared( - query_configuration.format, - sample_block, - local_context, - format_settings, - chosen_compression_method, - query_configuration, - query_configuration.url.bucket, - query_configuration.keys.back()); - } -} - -void StorageS3::truncate(const ASTPtr & /* query */, const StorageMetadataPtr &, ContextPtr local_context, TableExclusiveLockHolder &) -{ - auto query_configuration = updateConfigurationAndGetCopy(local_context); - - if (query_configuration.withGlobs()) - { - throw Exception( - ErrorCodes::DATABASE_ACCESS_DENIED, - "S3 key '{}' contains globs, so the table is in readonly mode", - query_configuration.url.key); - } - - Aws::S3::Model::Delete delkeys; - - for (const auto & key : query_configuration.keys) - { - Aws::S3::Model::ObjectIdentifier obj; - obj.SetKey(key); - delkeys.AddObjects(std::move(obj)); - } - - ProfileEvents::increment(ProfileEvents::S3DeleteObjects); - S3::DeleteObjectsRequest request; - request.SetBucket(query_configuration.url.bucket); - request.SetDelete(delkeys); - - auto response = query_configuration.client->DeleteObjects(request); - - const auto * response_error = response.IsSuccess() ? 
nullptr : &response.GetError(); - auto time_now = std::chrono::system_clock::now(); - if (auto blob_storage_log = BlobStorageLogWriter::create()) - { - for (const auto & key : query_configuration.keys) - blob_storage_log->addEvent(BlobStorageLogElement::EventType::Delete, query_configuration.url.bucket, key, {}, 0, response_error, time_now); - } - - if (!response.IsSuccess()) - { - const auto & err = response.GetError(); - throw S3Exception(err.GetMessage(), err.GetErrorType()); - } - - for (const auto & error : response.GetResult().GetErrors()) - LOG_WARNING(getLogger("StorageS3"), "Failed to delete {}, error: {}", error.GetKey(), error.GetMessage()); -} - -StorageS3::Configuration StorageS3::updateConfigurationAndGetCopy(ContextPtr local_context) -{ - std::lock_guard lock(configuration_update_mutex); - configuration.update(local_context); - return configuration; -} - -void StorageS3::updateConfiguration(ContextPtr local_context) -{ - std::lock_guard lock(configuration_update_mutex); - configuration.update(local_context); -} - -void StorageS3::useConfiguration(const Configuration & new_configuration) -{ - std::lock_guard lock(configuration_update_mutex); - configuration = new_configuration; -} - -const StorageS3::Configuration & StorageS3::getConfiguration() -{ - std::lock_guard lock(configuration_update_mutex); - return configuration; -} - -bool StorageS3::Configuration::update(ContextPtr context) -{ - auto s3_settings = context->getStorageS3Settings().getSettings(url.uri.toString()); - request_settings = s3_settings.request_settings; - request_settings.updateFromSettings(context->getSettings()); - - if (client && (static_configuration || !auth_settings.hasUpdates(s3_settings.auth_settings))) - return false; - - auth_settings.updateFrom(s3_settings.auth_settings); - keys[0] = url.key; - connect(context); - return true; -} - -void StorageS3::Configuration::connect(ContextPtr context) -{ - const Settings & global_settings = context->getGlobalContext()->getSettingsRef(); - const Settings & local_settings = context->getSettingsRef(); - - S3::PocoHTTPClientConfiguration client_configuration = S3::ClientFactory::instance().createClientConfiguration( - auth_settings.region, - context->getRemoteHostFilter(), - static_cast(global_settings.s3_max_redirects), - static_cast(global_settings.s3_retry_attempts), - global_settings.enable_s3_requests_logging, - /* for_disk_s3 = */ false, - request_settings.get_request_throttler, - request_settings.put_request_throttler, - url.uri.getScheme()); - - client_configuration.endpointOverride = url.endpoint; - client_configuration.maxConnections = static_cast(request_settings.max_connections); - client_configuration.http_connection_pool_size = global_settings.s3_http_connection_pool_size; - auto headers = auth_settings.headers; - if (!headers_from_ast.empty()) - headers.insert(headers.end(), headers_from_ast.begin(), headers_from_ast.end()); - - client_configuration.requestTimeoutMs = request_settings.request_timeout_ms; - - S3::ClientSettings client_settings{ - .use_virtual_addressing = url.is_virtual_hosted_style, - .disable_checksum = local_settings.s3_disable_checksum, - .gcs_issue_compose_request = context->getConfigRef().getBool("s3.gcs_issue_compose_request", false), - }; - - auto credentials = Aws::Auth::AWSCredentials(auth_settings.access_key_id, auth_settings.secret_access_key, auth_settings.session_token); - client = S3::ClientFactory::instance().create( - client_configuration, - client_settings, - credentials.GetAWSAccessKeyId(), - 
credentials.GetAWSSecretKey(), - auth_settings.server_side_encryption_customer_key_base64, - auth_settings.server_side_encryption_kms_config, - std::move(headers), - S3::CredentialsConfiguration{ - auth_settings.use_environment_credentials.value_or(context->getConfigRef().getBool("s3.use_environment_credentials", true)), - auth_settings.use_insecure_imds_request.value_or(context->getConfigRef().getBool("s3.use_insecure_imds_request", false)), - auth_settings.expiration_window_seconds.value_or( - context->getConfigRef().getUInt64("s3.expiration_window_seconds", S3::DEFAULT_EXPIRATION_WINDOW_SECONDS)), - auth_settings.no_sign_request.value_or(context->getConfigRef().getBool("s3.no_sign_request", false)), - }); -} - -void StorageS3::processNamedCollectionResult(StorageS3::Configuration & configuration, const NamedCollection & collection) -{ - validateNamedCollection(collection, required_configuration_keys, optional_configuration_keys); - - auto filename = collection.getOrDefault("filename", ""); - if (!filename.empty()) - configuration.url = S3::URI(std::filesystem::path(collection.get("url")) / filename); - else - configuration.url = S3::URI(collection.get("url")); - - configuration.auth_settings.access_key_id = collection.getOrDefault("access_key_id", ""); - configuration.auth_settings.secret_access_key = collection.getOrDefault("secret_access_key", ""); - configuration.auth_settings.use_environment_credentials = collection.getOrDefault("use_environment_credentials", 1); - configuration.auth_settings.no_sign_request = collection.getOrDefault("no_sign_request", false); - configuration.auth_settings.expiration_window_seconds = collection.getOrDefault("expiration_window_seconds", S3::DEFAULT_EXPIRATION_WINDOW_SECONDS); - - configuration.format = collection.getOrDefault("format", configuration.format); - configuration.compression_method = collection.getOrDefault("compression_method", collection.getOrDefault("compression", "auto")); - configuration.structure = collection.getOrDefault("structure", "auto"); - - configuration.request_settings = S3Settings::RequestSettings(collection); -} - -StorageS3::Configuration StorageS3::getConfiguration(ASTs & engine_args, ContextPtr local_context, bool get_format_from_file) -{ - StorageS3::Configuration configuration; - - if (auto named_collection = tryGetNamedCollectionWithOverrides(engine_args, local_context)) - { - processNamedCollectionResult(configuration, *named_collection); - } - else - { - /// Supported signatures: - /// - /// S3('url') - /// S3('url', 'format') - /// S3('url', 'format', 'compression') - /// S3('url', NOSIGN) - /// S3('url', NOSIGN, 'format') - /// S3('url', NOSIGN, 'format', 'compression') - /// S3('url', 'aws_access_key_id', 'aws_secret_access_key') - /// S3('url', 'aws_access_key_id', 'aws_secret_access_key', 'session_token') - /// S3('url', 'aws_access_key_id', 'aws_secret_access_key', 'format') - /// S3('url', 'aws_access_key_id', 'aws_secret_access_key', 'session_token', 'format') - /// S3('url', 'aws_access_key_id', 'aws_secret_access_key', 'format', 'compression') - /// S3('url', 'aws_access_key_id', 'aws_secret_access_key', 'session_token', 'format', 'compression') - /// with optional headers() function - - size_t count = StorageURL::evalArgsAndCollectHeaders(engine_args, configuration.headers_from_ast, local_context); - - if (count == 0 || count > 6) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, - "Storage S3 requires 1 to 5 arguments: " - "url, [NOSIGN | access_key_id, secret_access_key], name of used 
format and [compression_method]"); - - std::unordered_map engine_args_to_idx; - bool no_sign_request = false; - - /// For 2 arguments we support 2 possible variants: - /// - s3(source, format) - /// - s3(source, NOSIGN) - /// We can distinguish them by looking at the 2-nd argument: check if it's NOSIGN or not. - if (count == 2) - { - auto second_arg = checkAndGetLiteralArgument(engine_args[1], "format/NOSIGN"); - if (boost::iequals(second_arg, "NOSIGN")) - no_sign_request = true; - else - engine_args_to_idx = {{"format", 1}}; - } - /// For 3 arguments we support 2 possible variants: - /// - s3(source, format, compression_method) - /// - s3(source, access_key_id, secret_access_key) - /// - s3(source, NOSIGN, format) - /// We can distinguish them by looking at the 2-nd argument: check if it's NOSIGN or format name. - else if (count == 3) - { - auto second_arg = checkAndGetLiteralArgument(engine_args[1], "format/access_key_id/NOSIGN"); - if (boost::iequals(second_arg, "NOSIGN")) - { - no_sign_request = true; - engine_args_to_idx = {{"format", 2}}; - } - else if (second_arg == "auto" || FormatFactory::instance().getAllFormats().contains(second_arg)) - engine_args_to_idx = {{"format", 1}, {"compression_method", 2}}; - else - engine_args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}}; - } - /// For 4 arguments we support 3 possible variants: - /// - s3(source, access_key_id, secret_access_key, session_token) - /// - s3(source, access_key_id, secret_access_key, format) - /// - s3(source, NOSIGN, format, compression_method) - /// We can distinguish them by looking at the 2-nd argument: check if it's a NOSIGN or not. - else if (count == 4) - { - auto second_arg = checkAndGetLiteralArgument(engine_args[1], "access_key_id/NOSIGN"); - if (boost::iequals(second_arg, "NOSIGN")) - { - no_sign_request = true; - engine_args_to_idx = {{"format", 2}, {"compression_method", 3}}; - } - else - { - auto fourth_arg = checkAndGetLiteralArgument(engine_args[3], "session_token/format"); - if (fourth_arg == "auto" || FormatFactory::instance().getAllFormats().contains(fourth_arg)) - { - engine_args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}, {"format", 3}}; - } - else - { - engine_args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}, {"session_token", 3}}; - } - } - } - /// For 5 arguments we support 2 possible variants: - /// - s3(source, access_key_id, secret_access_key, session_token, format) - /// - s3(source, access_key_id, secret_access_key, format, compression) - else if (count == 5) - { - auto fourth_arg = checkAndGetLiteralArgument(engine_args[3], "session_token/format"); - if (fourth_arg == "auto" || FormatFactory::instance().getAllFormats().contains(fourth_arg)) - { - engine_args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}, {"format", 3}, {"compression", 4}}; - } - else - { - engine_args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}, {"session_token", 3}, {"format", 4}}; - } - } - else if (count == 6) - { - engine_args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}, {"session_token", 3}, {"format", 4}, {"compression_method", 5}}; - } - - /// This argument is always the first - configuration.url = S3::URI(checkAndGetLiteralArgument(engine_args[0], "url")); - - if (engine_args_to_idx.contains("format")) - configuration.format = checkAndGetLiteralArgument(engine_args[engine_args_to_idx["format"]], "format"); - - if (engine_args_to_idx.contains("compression_method")) - configuration.compression_method = 
checkAndGetLiteralArgument(engine_args[engine_args_to_idx["compression_method"]], "compression_method"); - - if (engine_args_to_idx.contains("access_key_id")) - configuration.auth_settings.access_key_id = checkAndGetLiteralArgument(engine_args[engine_args_to_idx["access_key_id"]], "access_key_id"); - - if (engine_args_to_idx.contains("secret_access_key")) - configuration.auth_settings.secret_access_key = checkAndGetLiteralArgument(engine_args[engine_args_to_idx["secret_access_key"]], "secret_access_key"); - - if (engine_args_to_idx.contains("session_token")) - configuration.auth_settings.session_token = checkAndGetLiteralArgument(engine_args[engine_args_to_idx["session_token"]], "session_token"); - - if (no_sign_request) - configuration.auth_settings.no_sign_request = no_sign_request; - } - - configuration.static_configuration = !configuration.auth_settings.access_key_id.empty() || configuration.auth_settings.no_sign_request.has_value(); - - configuration.keys = {configuration.url.key}; - - if (configuration.format == "auto" && get_format_from_file) - configuration.format = FormatFactory::instance().getFormatFromFileName(configuration.url.key, true); - - return configuration; -} - -ColumnsDescription StorageS3::getTableStructureFromData( - const StorageS3::Configuration & configuration, - const std::optional & format_settings, - ContextPtr ctx) -{ - return getTableStructureFromDataImpl(configuration, format_settings, ctx); -} - -namespace -{ - class ReadBufferIterator : public IReadBufferIterator, WithContext - { - public: - ReadBufferIterator( - std::shared_ptr file_iterator_, - const StorageS3Source::KeysWithInfo & read_keys_, - const StorageS3::Configuration & configuration_, - const std::optional & format_settings_, - const ContextPtr & context_) - : WithContext(context_) - , file_iterator(file_iterator_) - , read_keys(read_keys_) - , configuration(configuration_) - , format_settings(format_settings_) - , prev_read_keys_size(read_keys_.size()) - { - } - - std::pair, std::optional> next() override - { - /// For default mode check cached columns for currently read keys on first iteration. - if (first && getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT) - { - if (auto cached_columns = tryGetColumnsFromCache(read_keys.begin(), read_keys.end())) - return {nullptr, cached_columns}; - } - - while (true) - { - current_key_with_info = (*file_iterator)(); - - if (!current_key_with_info || current_key_with_info->key.empty()) - { - if (first) - throw Exception( - ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "Cannot extract table structure from {} format file, because there are no files with provided path " - "in S3 or all files are empty. You must specify table structure manually", - configuration.format); - - return {nullptr, std::nullopt}; - } - - /// S3 file iterator could get new keys after new iteration, check them in schema cache if schema inference mode is default. - if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT && read_keys.size() > prev_read_keys_size) - { - auto columns_from_cache = tryGetColumnsFromCache(read_keys.begin() + prev_read_keys_size, read_keys.end()); - prev_read_keys_size = read_keys.size(); - if (columns_from_cache) - return {nullptr, columns_from_cache}; - } - - if (getContext()->getSettingsRef().s3_skip_empty_files && current_key_with_info->info && current_key_with_info->info->size == 0) - continue; - - /// In union mode, check cached columns only for current key. 
- if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::UNION) - { - StorageS3::KeysWithInfo keys = {current_key_with_info}; - if (auto columns_from_cache = tryGetColumnsFromCache(keys.begin(), keys.end())) - { - first = false; - return {nullptr, columns_from_cache}; - } - } - - int zstd_window_log_max = static_cast(getContext()->getSettingsRef().zstd_window_log_max); - auto impl = std::make_unique(configuration.client, configuration.url.bucket, current_key_with_info->key, configuration.url.version_id, configuration.request_settings, getContext()->getReadSettings()); - if (!getContext()->getSettingsRef().s3_skip_empty_files || !impl->eof()) - { - first = false; - return {wrapReadBufferWithCompressionMethod(std::move(impl), chooseCompressionMethod(current_key_with_info->key, configuration.compression_method), zstd_window_log_max), std::nullopt}; - } - } - } - - void setNumRowsToLastFile(size_t num_rows) override - { - if (!getContext()->getSettingsRef().schema_inference_use_cache_for_s3) - return; - - String source = fs::path(configuration.url.uri.getHost() + std::to_string(configuration.url.uri.getPort())) / configuration.url.bucket / current_key_with_info->key; - auto key = getKeyForSchemaCache(source, configuration.format, format_settings, getContext()); - StorageS3::getSchemaCache(getContext()).addNumRows(key, num_rows); - } - - void setSchemaToLastFile(const ColumnsDescription & columns) override - { - if (!getContext()->getSettingsRef().schema_inference_use_cache_for_s3 - || getContext()->getSettingsRef().schema_inference_mode != SchemaInferenceMode::UNION) - return; - - String source = fs::path(configuration.url.uri.getHost() + std::to_string(configuration.url.uri.getPort())) / configuration.url.bucket / current_key_with_info->key; - auto cache_key = getKeyForSchemaCache(source, configuration.format, format_settings, getContext()); - StorageS3::getSchemaCache(getContext()).addColumns(cache_key, columns); - } - - void setResultingSchema(const ColumnsDescription & columns) override - { - if (!getContext()->getSettingsRef().schema_inference_use_cache_for_s3 - || getContext()->getSettingsRef().schema_inference_mode != SchemaInferenceMode::DEFAULT) - return; - - auto host_and_bucket = fs::path(configuration.url.uri.getHost() + std::to_string(configuration.url.uri.getPort())) / configuration.url.bucket; - Strings sources; - sources.reserve(read_keys.size()); - std::transform(read_keys.begin(), read_keys.end(), std::back_inserter(sources), [&](const auto & elem){ return host_and_bucket / elem->key; }); - auto cache_keys = getKeysForSchemaCache(sources, configuration.format, format_settings, getContext()); - StorageS3::getSchemaCache(getContext()).addManyColumns(cache_keys, columns); - } - - String getLastFileName() const override - { - if (current_key_with_info) - return current_key_with_info->key; - return ""; - } - - private: - std::optional tryGetColumnsFromCache( - const StorageS3::KeysWithInfo::const_iterator & begin, - const StorageS3::KeysWithInfo::const_iterator & end) - { - if (!getContext()->getSettingsRef().schema_inference_use_cache_for_s3) - return std::nullopt; - - auto & schema_cache = StorageS3::getSchemaCache(getContext()); - for (auto it = begin; it < end; ++it) - { - auto get_last_mod_time = [&] - { - time_t last_modification_time = 0; - if ((*it)->info) - { - last_modification_time = (*it)->info->last_modification_time; - } - else - { - /// Note that in case of exception in getObjectInfo returned info will be empty, - /// but schema cache will 
handle this case and won't return columns from cache - /// because we can't say that it's valid without last modification time. - last_modification_time = S3::getObjectInfo( - *configuration.client, - configuration.url.bucket, - (*it)->key, - configuration.url.version_id, - configuration.request_settings, - /*with_metadata=*/ false, - /*for_disk_s3=*/ false, - /*throw_on_error= */ false).last_modification_time; - } - - return last_modification_time ? std::make_optional(last_modification_time) : std::nullopt; - }; - - String path = fs::path(configuration.url.bucket) / (*it)->key; - String source = fs::path(configuration.url.uri.getHost() + std::to_string(configuration.url.uri.getPort())) / path; - auto cache_key = getKeyForSchemaCache(source, configuration.format, format_settings, getContext()); - auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time); - if (columns) - return columns; - } - - return std::nullopt; - } - - std::shared_ptr file_iterator; - const StorageS3Source::KeysWithInfo & read_keys; - const StorageS3::Configuration & configuration; - const std::optional & format_settings; - StorageS3Source::KeyWithInfoPtr current_key_with_info; - size_t prev_read_keys_size; - bool first = true; - }; - -} - -ColumnsDescription StorageS3::getTableStructureFromDataImpl( - const Configuration & configuration, - const std::optional & format_settings, - ContextPtr ctx) -{ - KeysWithInfo read_keys; - - auto file_iterator = createFileIterator(configuration, false, ctx, {}, {}, &read_keys); - - ReadBufferIterator read_buffer_iterator(file_iterator, read_keys, configuration, format_settings, ctx); - return readSchemaFromFormat(configuration.format, format_settings, read_buffer_iterator, configuration.withGlobs(), ctx); -} - -void registerStorageS3Impl(const String & name, StorageFactory & factory) -{ - factory.registerStorage(name, [](const StorageFactory::Arguments & args) - { - auto & engine_args = args.engine_args; - if (engine_args.empty()) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "External data source must have arguments"); - - auto configuration = StorageS3::getConfiguration(engine_args, args.getLocalContext()); - // Use format settings from global server context + settings from - // the SETTINGS clause of the create query. Settings from current - // session and user are ignored. - std::optional format_settings; - if (args.storage_def->settings) - { - FormatFactorySettings user_format_settings; - - // Apply changed settings from global context, but ignore the - // unknown ones, because we only have the format settings here. - const auto & changes = args.getContext()->getSettingsRef().changes(); - for (const auto & change : changes) - { - if (user_format_settings.has(change.name)) - user_format_settings.set(change.name, change.value); - } - - // Apply changes from SETTINGS clause, with validation. 
- user_format_settings.applyChanges(args.storage_def->settings->changes); - format_settings = getFormatSettings(args.getContext(), user_format_settings); - } - else - { - format_settings = getFormatSettings(args.getContext()); - } - - ASTPtr partition_by; - if (args.storage_def->partition_by) - partition_by = args.storage_def->partition_by->clone(); - - return std::make_shared( - std::move(configuration), - args.getContext(), - args.table_id, - args.columns, - args.constraints, - args.comment, - format_settings, - /* distributed_processing_ */false, - partition_by); - }, - { - .supports_settings = true, - .supports_sort_order = true, // for partition by - .supports_schema_inference = true, - .source_access_type = AccessType::S3, - }); -} - -void registerStorageS3(StorageFactory & factory) -{ - return registerStorageS3Impl("S3", factory); -} - -void registerStorageCOS(StorageFactory & factory) -{ - return registerStorageS3Impl("COSN", factory); -} - -void registerStorageOSS(StorageFactory & factory) -{ - return registerStorageS3Impl("OSS", factory); -} - -NamesAndTypesList StorageS3::getVirtuals() const -{ - return virtual_columns; -} - -Names StorageS3::getVirtualColumnNames() -{ - return VirtualColumnUtils::getPathFileAndSizeVirtualsForStorage({}).getNames(); -} - -bool StorageS3::supportsPartitionBy() const -{ - return true; -} - -SchemaCache & StorageS3::getSchemaCache(const ContextPtr & ctx) -{ - static SchemaCache schema_cache(ctx->getConfigRef().getUInt("schema_inference_cache_max_elements_for_s3", DEFAULT_SCHEMA_CACHE_ELEMENTS)); - return schema_cache; -} - -} - -#endif diff --git a/src/Storages/StorageS3.h b/src/Storages/StorageS3.h deleted file mode 100644 index 81a03cc5ad5..00000000000 --- a/src/Storages/StorageS3.h +++ /dev/null @@ -1,399 +0,0 @@ -#pragma once - -#include "config.h" - -#if USE_AWS_S3 - -#include - -#include - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace Aws::S3 -{ - class Client; -} - -namespace DB -{ - -class PullingPipelineExecutor; -class NamedCollection; - -class StorageS3Source : public SourceWithKeyCondition, WithContext -{ -public: - - struct KeyWithInfo - { - KeyWithInfo() = default; - - explicit KeyWithInfo(String key_, std::optional info_ = std::nullopt) - : key(std::move(key_)), info(std::move(info_)) {} - - virtual ~KeyWithInfo() = default; - - String key; - std::optional info; - }; - using KeyWithInfoPtr = std::shared_ptr; - - using KeysWithInfo = std::vector; - - class IIterator - { - public: - virtual ~IIterator() = default; - virtual KeyWithInfoPtr next(size_t idx = 0) = 0; /// NOLINT - - /// Estimates how many streams we need to process all files. - /// If keys count >= max_threads_count, the returned number may not represent the actual number of the keys. - /// Intended to be called before any next() calls, may underestimate otherwise - /// fixme: May underestimate if the glob has a strong filter, so there are few matches among the first 1000 ListObjects results. 
- virtual size_t estimatedKeysCount() = 0; - - KeyWithInfoPtr operator ()() { return next(); } - }; - - class DisclosedGlobIterator : public IIterator - { - public: - DisclosedGlobIterator( - const S3::Client & client_, - const S3::URI & globbed_uri_, - const ActionsDAG::Node * predicate, - const NamesAndTypesList & virtual_columns, - ContextPtr context, - KeysWithInfo * read_keys_ = nullptr, - const S3Settings::RequestSettings & request_settings_ = {}, - std::function progress_callback_ = {}); - - KeyWithInfoPtr next(size_t idx = 0) override; /// NOLINT - size_t estimatedKeysCount() override; - - private: - class Impl; - /// shared_ptr to have copy constructor - std::shared_ptr pimpl; - }; - - class KeysIterator : public IIterator - { - public: - explicit KeysIterator( - const S3::Client & client_, - const std::string & version_id_, - const std::vector & keys_, - const String & bucket_, - const S3Settings::RequestSettings & request_settings_, - KeysWithInfo * read_keys = nullptr, - std::function progress_callback_ = {}); - - KeyWithInfoPtr next(size_t idx = 0) override; /// NOLINT - size_t estimatedKeysCount() override; - - private: - class Impl; - /// shared_ptr to have copy constructor - std::shared_ptr pimpl; - }; - - class ReadTaskIterator : public IIterator - { - public: - explicit ReadTaskIterator(const ReadTaskCallback & callback_, size_t max_threads_count); - - KeyWithInfoPtr next(size_t idx = 0) override; /// NOLINT - size_t estimatedKeysCount() override; - - private: - KeysWithInfo buffer; - std::atomic_size_t index = 0; - - ReadTaskCallback callback; - }; - - StorageS3Source( - const ReadFromFormatInfo & info, - const String & format, - String name_, - ContextPtr context_, - std::optional format_settings_, - UInt64 max_block_size_, - const S3Settings::RequestSettings & request_settings_, - String compression_hint_, - const std::shared_ptr & client_, - const String & bucket, - const String & version_id, - const String & url_host_and_port, - std::shared_ptr file_iterator_, - size_t max_parsing_threads, - bool need_only_count_); - - ~StorageS3Source() override; - - String getName() const override; - - void setKeyCondition(const ActionsDAG::NodeRawConstPtrs & nodes, ContextPtr context_) override - { - setKeyConditionImpl(nodes, context_, sample_block); - } - - Chunk generate() override; - -private: - friend class StorageS3QueueSource; - - String name; - String bucket; - String version_id; - String url_host_and_port; - String format; - ColumnsDescription columns_desc; - NamesAndTypesList requested_columns; - UInt64 max_block_size; - S3Settings::RequestSettings request_settings; - String compression_hint; - std::shared_ptr client; - Block sample_block; - std::optional format_settings; - - struct ReaderHolder - { - public: - ReaderHolder( - KeyWithInfoPtr key_with_info_, - String bucket_, - std::unique_ptr read_buf_, - std::shared_ptr source_, - std::unique_ptr pipeline_, - std::unique_ptr reader_) - : key_with_info(key_with_info_) - , bucket(std::move(bucket_)) - , read_buf(std::move(read_buf_)) - , source(std::move(source_)) - , pipeline(std::move(pipeline_)) - , reader(std::move(reader_)) - { - } - - ReaderHolder() = default; - ReaderHolder(const ReaderHolder & other) = delete; - ReaderHolder & operator=(const ReaderHolder & other) = delete; - - ReaderHolder(ReaderHolder && other) noexcept - { - *this = std::move(other); - } - - ReaderHolder & operator=(ReaderHolder && other) noexcept - { - /// The order of destruction is important. 
- /// reader uses pipeline, pipeline uses read_buf. - reader = std::move(other.reader); - pipeline = std::move(other.pipeline); - source = std::move(other.source); - read_buf = std::move(other.read_buf); - key_with_info = std::move(other.key_with_info); - bucket = std::move(other.bucket); - return *this; - } - - explicit operator bool() const { return reader != nullptr; } - PullingPipelineExecutor * operator->() { return reader.get(); } - const PullingPipelineExecutor * operator->() const { return reader.get(); } - String getPath() const { return fs::path(bucket) / key_with_info->key; } - const String & getFile() const { return key_with_info->key; } - const KeyWithInfo & getKeyWithInfo() const { return *key_with_info; } - std::optional getFileSize() const { return key_with_info->info ? std::optional(key_with_info->info->size) : std::nullopt; } - - const IInputFormat * getInputFormat() const { return dynamic_cast(source.get()); } - - private: - KeyWithInfoPtr key_with_info; - String bucket; - std::unique_ptr read_buf; - std::shared_ptr source; - std::unique_ptr pipeline; - std::unique_ptr reader; - }; - - ReaderHolder reader; - - NamesAndTypesList requested_virtual_columns; - std::shared_ptr file_iterator; - size_t max_parsing_threads = 1; - bool need_only_count; - - LoggerPtr log = getLogger("StorageS3Source"); - - ThreadPool create_reader_pool; - ThreadPoolCallbackRunner create_reader_scheduler; - std::future reader_future; - std::atomic initialized{false}; - - size_t total_rows_in_file = 0; - - /// Notice: we should initialize reader and future_reader lazily in generate to make sure key_condition - /// is set before createReader is invoked for key_condition is read in createReader. - void lazyInitialize(size_t idx = 0); - - /// Recreate ReadBuffer and Pipeline for each file. - ReaderHolder createReader(size_t idx = 0); - std::future createReaderAsync(size_t idx = 0); - - std::unique_ptr createS3ReadBuffer(const String & key, size_t object_size); - std::unique_ptr createAsyncS3ReadBuffer(const String & key, const ReadSettings & read_settings, size_t object_size); - - void addNumRowsToCache(const String & key, size_t num_rows); - std::optional tryGetNumRowsFromCache(const KeyWithInfo & key_with_info); -}; - -/** - * This class represents table engine for external S3 urls. - * It sends HTTP GET to server when select is called and - * HTTP PUT when insert is called. - */ -class StorageS3 : public IStorage -{ -public: - struct Configuration : public StatelessTableEngineConfiguration - { - Configuration() = default; - - String getPath() const { return url.key; } - - bool update(ContextPtr context); - - void connect(ContextPtr context); - - bool withGlobs() const { return url.key.find_first_of("*?{") != std::string::npos; } - - bool withWildcard() const - { - static const String PARTITION_ID_WILDCARD = "{_partition_id}"; - return url.bucket.find(PARTITION_ID_WILDCARD) != String::npos - || keys.back().find(PARTITION_ID_WILDCARD) != String::npos; - } - - S3::URI url; - S3::AuthSettings auth_settings; - S3Settings::RequestSettings request_settings; - /// If s3 configuration was passed from ast, then it is static. - /// If from config - it can be changed with config reload. - bool static_configuration = true; - /// Headers from ast is a part of static configuration. 
- HTTPHeaderEntries headers_from_ast; - - std::shared_ptr client; - std::vector keys; - }; - - StorageS3( - const Configuration & configuration_, - ContextPtr context_, - const StorageID & table_id_, - const ColumnsDescription & columns_, - const ConstraintsDescription & constraints_, - const String & comment, - std::optional format_settings_, - bool distributed_processing_ = false, - ASTPtr partition_by_ = nullptr); - - String getName() const override - { - return name; - } - - void read( - QueryPlan & query_plan, - const Names & column_names, - const StorageSnapshotPtr & storage_snapshot, - SelectQueryInfo & query_info, - ContextPtr context, - QueryProcessingStage::Enum processed_stage, - size_t max_block_size, - size_t num_streams) override; - - SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr context, bool async_insert) override; - - void truncate(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context, TableExclusiveLockHolder &) override; - - NamesAndTypesList getVirtuals() const override; - static Names getVirtualColumnNames(); - - bool supportsPartitionBy() const override; - - static void processNamedCollectionResult(StorageS3::Configuration & configuration, const NamedCollection & collection); - - static SchemaCache & getSchemaCache(const ContextPtr & ctx); - - static StorageS3::Configuration getConfiguration(ASTs & engine_args, ContextPtr local_context, bool get_format_from_file = true); - - static ColumnsDescription getTableStructureFromData( - const StorageS3::Configuration & configuration, - const std::optional & format_settings, - ContextPtr ctx); - - using KeysWithInfo = StorageS3Source::KeysWithInfo; - - bool supportsTrivialCountOptimization() const override { return true; } - -protected: - virtual Configuration updateConfigurationAndGetCopy(ContextPtr local_context); - - virtual void updateConfiguration(ContextPtr local_context); - - void useConfiguration(const Configuration & new_configuration); - - const Configuration & getConfiguration(); - -private: - friend class StorageS3Cluster; - friend class TableFunctionS3Cluster; - friend class StorageS3Queue; - friend class ReadFromStorageS3Step; - - Configuration configuration; - std::mutex configuration_update_mutex; - NamesAndTypesList virtual_columns; - - String name; - const bool distributed_processing; - std::optional format_settings; - ASTPtr partition_by; - - static ColumnsDescription getTableStructureFromDataImpl( - const Configuration & configuration, - const std::optional & format_settings, - ContextPtr ctx); - - bool supportsSubcolumns() const override { return true; } - - bool supportsSubsetOfColumns(const ContextPtr & context) const; - - bool prefersLargeBlocks() const override; - - bool parallelizeOutputAfterReading(ContextPtr context) const override; -}; - -} - -#endif diff --git a/src/Storages/StorageS3Cluster.cpp b/src/Storages/StorageS3Cluster.cpp deleted file mode 100644 index 25c2b42b766..00000000000 --- a/src/Storages/StorageS3Cluster.cpp +++ /dev/null @@ -1,103 +0,0 @@ -#include "Storages/StorageS3Cluster.h" - -#include "config.h" - -#if USE_AWS_S3 - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int LOGICAL_ERROR; -} - -StorageS3Cluster::StorageS3Cluster( - const String & cluster_name_, - const StorageS3::Configuration 
& configuration_, - const StorageID & table_id_, - const ColumnsDescription & columns_, - const ConstraintsDescription & constraints_, - ContextPtr context_, - bool structure_argument_was_provided_) - : IStorageCluster(cluster_name_, table_id_, getLogger("StorageS3Cluster (" + table_id_.table_name + ")"), structure_argument_was_provided_) - , s3_configuration{configuration_} -{ - context_->getGlobalContext()->getRemoteHostFilter().checkURL(configuration_.url.uri); - context_->getGlobalContext()->getHTTPHeaderFilter().checkHeaders(configuration_.headers_from_ast); - - StorageInMemoryMetadata storage_metadata; - updateConfigurationIfChanged(context_); - - if (columns_.empty()) - { - /// `format_settings` is set to std::nullopt, because StorageS3Cluster is used only as table function - auto columns = StorageS3::getTableStructureFromDataImpl(s3_configuration, /*format_settings=*/std::nullopt, context_); - storage_metadata.setColumns(columns); - } - else - storage_metadata.setColumns(columns_); - - storage_metadata.setConstraints(constraints_); - setInMemoryMetadata(storage_metadata); - - virtual_columns = VirtualColumnUtils::getPathFileAndSizeVirtualsForStorage(storage_metadata.getSampleBlock().getNamesAndTypesList()); -} - -void StorageS3Cluster::addColumnsStructureToQuery(ASTPtr & query, const String & structure, const ContextPtr & context) -{ - ASTExpressionList * expression_list = extractTableFunctionArgumentsFromSelectQuery(query); - if (!expression_list) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected SELECT query from table function s3Cluster, got '{}'", queryToString(query)); - - TableFunctionS3Cluster::addColumnsStructureToArguments(expression_list->children, structure, context); -} - -void StorageS3Cluster::updateConfigurationIfChanged(ContextPtr local_context) -{ - s3_configuration.update(local_context); -} - -RemoteQueryExecutor::Extension StorageS3Cluster::getTaskIteratorExtension(const ActionsDAG::Node * predicate, const ContextPtr & context) const -{ - auto iterator = std::make_shared( - *s3_configuration.client, s3_configuration.url, predicate, virtual_columns, context, nullptr, s3_configuration.request_settings, context->getFileProgressCallback()); - - auto callback = std::make_shared>([iterator]() mutable -> String - { - if (auto next = iterator->next()) - return next->key; - return ""; - }); - return RemoteQueryExecutor::Extension{ .task_iterator = std::move(callback) }; -} - -NamesAndTypesList StorageS3Cluster::getVirtuals() const -{ - return virtual_columns; -} - - -} - -#endif diff --git a/src/Storages/StorageS3Cluster.h b/src/Storages/StorageS3Cluster.h deleted file mode 100644 index c526f14834a..00000000000 --- a/src/Storages/StorageS3Cluster.h +++ /dev/null @@ -1,58 +0,0 @@ -#pragma once - -#include "config.h" - -#if USE_AWS_S3 - -#include -#include - -#include "Client/Connection.h" -#include -#include -#include -#include - -namespace DB -{ - -class Context; - -class StorageS3Cluster : public IStorageCluster -{ -public: - StorageS3Cluster( - const String & cluster_name_, - const StorageS3::Configuration & configuration_, - const StorageID & table_id_, - const ColumnsDescription & columns_, - const ConstraintsDescription & constraints_, - ContextPtr context_, - bool structure_argument_was_provided_); - - std::string getName() const override { return "S3Cluster"; } - - NamesAndTypesList getVirtuals() const override; - - RemoteQueryExecutor::Extension getTaskIteratorExtension(const ActionsDAG::Node * predicate, const ContextPtr & context) const override; - - bool 
supportsSubcolumns() const override { return true; } - - bool supportsTrivialCountOptimization() const override { return true; } - -protected: - void updateConfigurationIfChanged(ContextPtr local_context); - -private: - void updateBeforeRead(const ContextPtr & context) override { updateConfigurationIfChanged(context); } - - void addColumnsStructureToQuery(ASTPtr & query, const String & structure, const ContextPtr & context) override; - - StorageS3::Configuration s3_configuration; - NamesAndTypesList virtual_columns; -}; - - -} - -#endif diff --git a/src/Storages/System/StorageSystemSchemaInferenceCache.cpp b/src/Storages/System/StorageSystemSchemaInferenceCache.cpp index 1426ea83800..77d5be3698c 100644 --- a/src/Storages/System/StorageSystemSchemaInferenceCache.cpp +++ b/src/Storages/System/StorageSystemSchemaInferenceCache.cpp @@ -1,9 +1,7 @@ #include #include -#include #include -#include -#include +#include #include #include #include @@ -83,7 +81,7 @@ void StorageSystemSchemaInferenceCache::fillData(MutableColumns & res_columns, C #endif fillDataImpl(res_columns, StorageURL::getSchemaCache(context), "URL"); #if USE_AZURE_BLOB_STORAGE - fillDataImpl(res_columns, StorageAzureBlob::getSchemaCache(context), "Azure"); + fillDataImpl(res_columns, StorageAzureBlobStorage::getSchemaCache(context), "Azure"); /// FIXME #endif } diff --git a/src/Storages/registerStorages.cpp b/src/Storages/registerStorages.cpp index dea9feaf28b..0b72d7e94fd 100644 --- a/src/Storages/registerStorages.cpp +++ b/src/Storages/registerStorages.cpp @@ -45,8 +45,6 @@ void registerStorageIceberg(StorageFactory & factory); #endif #if USE_HDFS -void registerStorageHDFS(StorageFactory & factory); - #if USE_HIVE void registerStorageHive(StorageFactory & factory); #endif @@ -99,9 +97,7 @@ void registerStorageSQLite(StorageFactory & factory); void registerStorageKeeperMap(StorageFactory & factory); -#if USE_AZURE_BLOB_STORAGE -void registerStorageAzureBlob(StorageFactory & factory); -#endif +void registerStorageObjectStorage(StorageFactory & factory); void registerStorages() { @@ -131,9 +127,7 @@ void registerStorages() #endif #if USE_AWS_S3 - registerStorageS3(factory); - registerStorageCOS(factory); - registerStorageOSS(factory); + // registerStorageS3(factory); registerStorageHudi(factory); registerStorageS3Queue(factory); @@ -148,12 +142,9 @@ void registerStorages() #endif #if USE_HDFS - registerStorageHDFS(factory); - #if USE_HIVE registerStorageHive(factory); #endif - #endif registerStorageODBC(factory); @@ -201,9 +192,7 @@ void registerStorages() registerStorageKeeperMap(factory); - #if USE_AZURE_BLOB_STORAGE - registerStorageAzureBlob(factory); - #endif + registerStorageObjectStorage(factory); } } diff --git a/src/TableFunctions/ITableFunctionCluster.h b/src/TableFunctions/ITableFunctionCluster.h index 7e81d6d21b7..0559472325b 100644 --- a/src/TableFunctions/ITableFunctionCluster.h +++ b/src/TableFunctions/ITableFunctionCluster.h @@ -1,14 +1,10 @@ #pragma once -#include "config.h" - #include #include -#include #include #include -#include -#include +#include namespace DB diff --git a/src/TableFunctions/ITableFunctionDataLake.h b/src/TableFunctions/ITableFunctionDataLake.h index 961e5683fe2..884e1f5c4a2 100644 --- a/src/TableFunctions/ITableFunctionDataLake.h +++ b/src/TableFunctions/ITableFunctionDataLake.h @@ -10,6 +10,9 @@ # include # include # include +#include +#include +#include namespace DB { @@ -30,12 +33,13 @@ protected: bool /*is_insert_query*/) const override { ColumnsDescription columns; - if 
(TableFunction::configuration.structure != "auto") - columns = parseColumnsListFromString(TableFunction::configuration.structure, context); + if (TableFunction::configuration->structure != "auto") + columns = parseColumnsListFromString(TableFunction::configuration->structure, context); - StoragePtr storage = Storage::create( - TableFunction::configuration, context, false, StorageID(TableFunction::getDatabaseName(), table_name), - columns, ConstraintsDescription{}, String{}, std::nullopt); + StorageObjectStorageConfigurationPtr configuration = TableFunction::configuration; + StoragePtr storage = StorageIceberg>::create( + configuration, context, "", StorageID(TableFunction::getDatabaseName(), table_name), + columns, ConstraintsDescription{}, String{}, std::nullopt, false); storage->startup(); return storage; @@ -45,19 +49,19 @@ protected: ColumnsDescription getActualTableStructure(ContextPtr context, bool /*is_insert_query*/) const override { - if (TableFunction::configuration.structure == "auto") + if (TableFunction::configuration->structure == "auto") { context->checkAccess(TableFunction::getSourceAccessType()); - return Storage::getTableStructureFromData(TableFunction::configuration, std::nullopt, context); + return Storage::getTableStructureFromData(TableFunction::object_storage, TableFunction::configuration, std::nullopt, context); } - return parseColumnsListFromString(TableFunction::configuration.structure, context); + return parseColumnsListFromString(TableFunction::configuration->structure, context); } void parseArguments(const ASTPtr & ast_function, ContextPtr context) override { /// Set default format to Parquet if it's not specified in arguments. - TableFunction::configuration.format = "Parquet"; + TableFunction::configuration->format = "Parquet"; TableFunction::parseArguments(ast_function, context); } }; diff --git a/src/TableFunctions/TableFunctionAzureBlobStorage.cpp b/src/TableFunctions/TableFunctionAzureBlobStorage.cpp deleted file mode 100644 index b098cac5144..00000000000 --- a/src/TableFunctions/TableFunctionAzureBlobStorage.cpp +++ /dev/null @@ -1,323 +0,0 @@ -#include "config.h" - -#if USE_AZURE_BLOB_STORAGE - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "registerTableFunctions.h" -#include -#include -#include - -#include - - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; - extern const int BAD_ARGUMENTS; -} - -namespace -{ - -bool isConnectionString(const std::string & candidate) -{ - return !candidate.starts_with("http"); -} - -} - -void TableFunctionAzureBlobStorage::parseArgumentsImpl(ASTs & engine_args, const ContextPtr & local_context) -{ - /// Supported signatures: - /// - /// AzureBlobStorage(connection_string|storage_account_url, container_name, blobpath, [account_name, account_key, format, compression, structure]) - /// - - if (auto named_collection = tryGetNamedCollectionWithOverrides(engine_args, local_context)) - { - StorageAzureBlob::processNamedCollectionResult(configuration, *named_collection); - - configuration.blobs_paths = {configuration.blob_path}; - - if (configuration.format == "auto") - configuration.format = FormatFactory::instance().getFormatFromFileName(configuration.blob_path, true); - } - else - { - if (engine_args.size() < 3 || engine_args.size() > 8) - throw Exception( - ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, - "Storage Azure requires 3 to 7 arguments: " - 
"AzureBlobStorage(connection_string|storage_account_url, container_name, blobpath, [account_name, account_key, format, compression, structure])"); - - for (auto & engine_arg : engine_args) - engine_arg = evaluateConstantExpressionOrIdentifierAsLiteral(engine_arg, local_context); - - std::unordered_map engine_args_to_idx; - - configuration.connection_url = checkAndGetLiteralArgument(engine_args[0], "connection_string/storage_account_url"); - configuration.is_connection_string = isConnectionString(configuration.connection_url); - - configuration.container = checkAndGetLiteralArgument(engine_args[1], "container"); - configuration.blob_path = checkAndGetLiteralArgument(engine_args[2], "blobpath"); - - auto is_format_arg - = [](const std::string & s) -> bool { return s == "auto" || FormatFactory::instance().getAllFormats().contains(s); }; - - if (engine_args.size() == 4) - { - auto fourth_arg = checkAndGetLiteralArgument(engine_args[3], "format/account_name/structure"); - if (is_format_arg(fourth_arg)) - { - configuration.format = fourth_arg; - } - else - { - configuration.structure = fourth_arg; - } - } - else if (engine_args.size() == 5) - { - auto fourth_arg = checkAndGetLiteralArgument(engine_args[3], "format/account_name"); - if (is_format_arg(fourth_arg)) - { - configuration.format = fourth_arg; - configuration.compression_method = checkAndGetLiteralArgument(engine_args[4], "compression"); - } - else - { - configuration.account_name = fourth_arg; - configuration.account_key = checkAndGetLiteralArgument(engine_args[4], "account_key"); - } - } - else if (engine_args.size() == 6) - { - auto fourth_arg = checkAndGetLiteralArgument(engine_args[3], "format/account_name"); - if (is_format_arg(fourth_arg)) - { - configuration.format = fourth_arg; - configuration.compression_method = checkAndGetLiteralArgument(engine_args[4], "compression"); - configuration.structure = checkAndGetLiteralArgument(engine_args[5], "structure"); - } - else - { - configuration.account_name = fourth_arg; - configuration.account_key = checkAndGetLiteralArgument(engine_args[4], "account_key"); - auto sixth_arg = checkAndGetLiteralArgument(engine_args[5], "format/account_name/structure"); - if (is_format_arg(sixth_arg)) - configuration.format = sixth_arg; - else - configuration.structure = sixth_arg; - } - } - else if (engine_args.size() == 7) - { - auto fourth_arg = checkAndGetLiteralArgument(engine_args[3], "format/account_name"); - configuration.account_name = fourth_arg; - configuration.account_key = checkAndGetLiteralArgument(engine_args[4], "account_key"); - auto sixth_arg = checkAndGetLiteralArgument(engine_args[5], "format/account_name"); - if (!is_format_arg(sixth_arg)) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown format {}", sixth_arg); - configuration.format = sixth_arg; - configuration.compression_method = checkAndGetLiteralArgument(engine_args[6], "compression"); - } - else if (engine_args.size() == 8) - { - auto fourth_arg = checkAndGetLiteralArgument(engine_args[3], "format/account_name"); - configuration.account_name = fourth_arg; - configuration.account_key = checkAndGetLiteralArgument(engine_args[4], "account_key"); - auto sixth_arg = checkAndGetLiteralArgument(engine_args[5], "format/account_name"); - if (!is_format_arg(sixth_arg)) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown format {}", sixth_arg); - configuration.format = sixth_arg; - configuration.compression_method = checkAndGetLiteralArgument(engine_args[6], "compression"); - configuration.structure = 
checkAndGetLiteralArgument(engine_args[7], "structure"); - } - - configuration.blobs_paths = {configuration.blob_path}; - - if (configuration.format == "auto") - configuration.format = FormatFactory::instance().getFormatFromFileName(configuration.blob_path, true); - } -} - -void TableFunctionAzureBlobStorage::parseArguments(const ASTPtr & ast_function, ContextPtr context) -{ - /// Clone ast function, because we can modify its arguments like removing headers. - auto ast_copy = ast_function->clone(); - - ASTs & args_func = ast_function->children; - - if (args_func.size() != 1) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Table function '{}' must have arguments.", getName()); - - auto & args = args_func.at(0)->children; - - parseArgumentsImpl(args, context); -} - -void TableFunctionAzureBlobStorage::addColumnsStructureToArguments(ASTs & args, const String & structure, const ContextPtr & context) -{ - if (tryGetNamedCollectionWithOverrides(args, context)) - { - /// In case of named collection, just add key-value pair "structure='...'" - /// at the end of arguments to override existed structure. - ASTs equal_func_args = {std::make_shared("structure"), std::make_shared(structure)}; - auto equal_func = makeASTFunction("equals", std::move(equal_func_args)); - args.push_back(equal_func); - } - else - { - if (args.size() < 3 || args.size() > 8) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, - "Storage Azure requires 3 to 7 arguments: " - "AzureBlobStorage(connection_string|storage_account_url, container_name, blobpath, [account_name, account_key, format, compression, structure])"); - - auto structure_literal = std::make_shared(structure); - - auto is_format_arg - = [](const std::string & s) -> bool { return s == "auto" || FormatFactory::instance().getAllFormats().contains(s); }; - - - if (args.size() == 3) - { - /// Add format=auto & compression=auto before structure argument. - args.push_back(std::make_shared("auto")); - args.push_back(std::make_shared("auto")); - args.push_back(structure_literal); - } - else if (args.size() == 4) - { - auto fourth_arg = checkAndGetLiteralArgument(args[3], "format/account_name/structure"); - if (is_format_arg(fourth_arg)) - { - /// Add compression=auto before structure argument. - args.push_back(std::make_shared("auto")); - args.push_back(structure_literal); - } - else - { - args.back() = structure_literal; - } - } - else if (args.size() == 5) - { - auto fourth_arg = checkAndGetLiteralArgument(args[3], "format/account_name"); - if (!is_format_arg(fourth_arg)) - { - /// Add format=auto & compression=auto before structure argument. - args.push_back(std::make_shared("auto")); - args.push_back(std::make_shared("auto")); - } - args.push_back(structure_literal); - } - else if (args.size() == 6) - { - auto fourth_arg = checkAndGetLiteralArgument(args[3], "format/account_name"); - if (!is_format_arg(fourth_arg)) - { - /// Add compression=auto before structure argument. 
- args.push_back(std::make_shared("auto")); - args.push_back(structure_literal); - } - else - { - args.back() = structure_literal; - } - } - else if (args.size() == 7) - { - args.push_back(structure_literal); - } - else if (args.size() == 8) - { - args.back() = structure_literal; - } - } -} - -ColumnsDescription TableFunctionAzureBlobStorage::getActualTableStructure(ContextPtr context, bool is_insert_query) const -{ - if (configuration.structure == "auto") - { - context->checkAccess(getSourceAccessType()); - auto client = StorageAzureBlob::createClient(configuration, !is_insert_query); - auto settings = StorageAzureBlob::createSettings(context); - - auto object_storage = std::make_unique("AzureBlobStorageTableFunction", std::move(client), std::move(settings), configuration.container); - return StorageAzureBlob::getTableStructureFromData(object_storage.get(), configuration, std::nullopt, context, false); - } - - return parseColumnsListFromString(configuration.structure, context); -} - -bool TableFunctionAzureBlobStorage::supportsReadingSubsetOfColumns(const ContextPtr & context) -{ - return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(configuration.format, context); -} - -std::unordered_set TableFunctionAzureBlobStorage::getVirtualsToCheckBeforeUsingStructureHint() const -{ - auto virtual_column_names = StorageAzureBlob::getVirtualColumnNames(); - return {virtual_column_names.begin(), virtual_column_names.end()}; -} - -StoragePtr TableFunctionAzureBlobStorage::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/, bool is_insert_query) const -{ - auto client = StorageAzureBlob::createClient(configuration, !is_insert_query); - auto settings = StorageAzureBlob::createSettings(context); - - ColumnsDescription columns; - if (configuration.structure != "auto") - columns = parseColumnsListFromString(configuration.structure, context); - else if (!structure_hint.empty()) - columns = structure_hint; - - StoragePtr storage = std::make_shared( - configuration, - std::make_unique(table_name, std::move(client), std::move(settings), configuration.container), - context, - StorageID(getDatabaseName(), table_name), - columns, - ConstraintsDescription{}, - String{}, - /// No format_settings for table function Azure - std::nullopt, - /* distributed_processing */ false, - nullptr); - - storage->startup(); - - return storage; -} - -void registerTableFunctionAzureBlobStorage(TableFunctionFactory & factory) -{ - factory.registerFunction( - {.documentation - = {.description=R"(The table function can be used to read the data stored on Azure Blob Storage.)", - .examples{{"azureBlobStorage", "SELECT * FROM azureBlobStorage(connection_string|storage_account_url, container_name, blobpath, [account_name, account_key, format, compression, structure])", ""}}}, - .allow_readonly = false}); -} - -} - -#endif diff --git a/src/TableFunctions/TableFunctionAzureBlobStorage.h b/src/TableFunctions/TableFunctionAzureBlobStorage.h deleted file mode 100644 index 1a221f60c55..00000000000 --- a/src/TableFunctions/TableFunctionAzureBlobStorage.h +++ /dev/null @@ -1,80 +0,0 @@ -#pragma once - -#include "config.h" - -#if USE_AZURE_BLOB_STORAGE - -#include -#include - - -namespace DB -{ - -class Context; - -/* AzureBlob(source, [access_key_id, secret_access_key,] [format, compression, structure]) - creates a temporary storage for a file in AzureBlob. 
- */ -class TableFunctionAzureBlobStorage : public ITableFunction -{ -public: - static constexpr auto name = "azureBlobStorage"; - - static constexpr auto signature = " - connection_string, container_name, blobpath\n" - " - connection_string, container_name, blobpath, structure \n" - " - connection_string, container_name, blobpath, format \n" - " - connection_string, container_name, blobpath, format, compression \n" - " - connection_string, container_name, blobpath, format, compression, structure \n" - " - storage_account_url, container_name, blobpath, account_name, account_key\n" - " - storage_account_url, container_name, blobpath, account_name, account_key, structure\n" - " - storage_account_url, container_name, blobpath, account_name, account_key, format\n" - " - storage_account_url, container_name, blobpath, account_name, account_key, format, compression\n" - " - storage_account_url, container_name, blobpath, account_name, account_key, format, compression, structure\n"; - - static size_t getMaxNumberOfArguments() { return 8; } - - String getName() const override - { - return name; - } - - virtual String getSignature() const - { - return signature; - } - - bool hasStaticStructure() const override { return configuration.structure != "auto"; } - - bool needStructureHint() const override { return configuration.structure == "auto"; } - - void setStructureHint(const ColumnsDescription & structure_hint_) override { structure_hint = structure_hint_; } - - bool supportsReadingSubsetOfColumns(const ContextPtr & context) override; - - std::unordered_set getVirtualsToCheckBeforeUsingStructureHint() const override; - - virtual void parseArgumentsImpl(ASTs & args, const ContextPtr & context); - - static void addColumnsStructureToArguments(ASTs & args, const String & structure, const ContextPtr & context); - -protected: - - StoragePtr executeImpl( - const ASTPtr & ast_function, - ContextPtr context, - const std::string & table_name, - ColumnsDescription cached_columns, - bool is_insert_query) const override; - - const char * getStorageTypeName() const override { return "Azure"; } - - ColumnsDescription getActualTableStructure(ContextPtr context, bool is_insert_query) const override; - void parseArguments(const ASTPtr & ast_function, ContextPtr context) override; - - mutable StorageAzureBlob::Configuration configuration; - ColumnsDescription structure_hint; -}; - -} - -#endif diff --git a/src/TableFunctions/TableFunctionAzureBlobStorageCluster.cpp b/src/TableFunctions/TableFunctionAzureBlobStorageCluster.cpp deleted file mode 100644 index 1c3b302a186..00000000000 --- a/src/TableFunctions/TableFunctionAzureBlobStorageCluster.cpp +++ /dev/null @@ -1,85 +0,0 @@ -#include "config.h" - -#if USE_AZURE_BLOB_STORAGE - -#include -#include -#include -#include - -#include "registerTableFunctions.h" - -#include - - -namespace DB -{ - -StoragePtr TableFunctionAzureBlobStorageCluster::executeImpl( - const ASTPtr & /*function*/, ContextPtr context, - const std::string & table_name, ColumnsDescription /*cached_columns*/, bool is_insert_query) const -{ - StoragePtr storage; - ColumnsDescription columns; - bool structure_argument_was_provided = configuration.structure != "auto"; - - if (structure_argument_was_provided) - { - columns = parseColumnsListFromString(configuration.structure, context); - } - else if (!structure_hint.empty()) - { - columns = structure_hint; - } - - auto client = StorageAzureBlob::createClient(configuration, !is_insert_query); - auto settings = StorageAzureBlob::createSettings(context); - - if 
(context->getClientInfo().query_kind == ClientInfo::QueryKind::SECONDARY_QUERY) - { - /// On worker node this filename won't contains globs - storage = std::make_shared( - configuration, - std::make_unique(table_name, std::move(client), std::move(settings), configuration.container), - context, - StorageID(getDatabaseName(), table_name), - columns, - ConstraintsDescription{}, - /* comment */String{}, - /* format_settings */std::nullopt, /// No format_settings - /* distributed_processing */ true, - /*partition_by_=*/nullptr); - } - else - { - storage = std::make_shared( - cluster_name, - configuration, - std::make_unique(table_name, std::move(client), std::move(settings), configuration.container), - StorageID(getDatabaseName(), table_name), - columns, - ConstraintsDescription{}, - context, - structure_argument_was_provided); - } - - storage->startup(); - - return storage; -} - - -void registerTableFunctionAzureBlobStorageCluster(TableFunctionFactory & factory) -{ - factory.registerFunction( - {.documentation - = {.description=R"(The table function can be used to read the data stored on Azure Blob Storage in parallel for many nodes in a specified cluster.)", - .examples{{"azureBlobStorageCluster", "SELECT * FROM azureBlobStorageCluster(cluster, connection_string|storage_account_url, container_name, blobpath, [account_name, account_key, format, compression, structure])", ""}}}, - .allow_readonly = false} - ); -} - - -} - -#endif diff --git a/src/TableFunctions/TableFunctionAzureBlobStorageCluster.h b/src/TableFunctions/TableFunctionAzureBlobStorageCluster.h deleted file mode 100644 index 58f79328f63..00000000000 --- a/src/TableFunctions/TableFunctionAzureBlobStorageCluster.h +++ /dev/null @@ -1,55 +0,0 @@ -#pragma once - -#include "config.h" - -#if USE_AZURE_BLOB_STORAGE - -#include -#include -#include -#include - - -namespace DB -{ - -class Context; - -/** - * azureBlobStorageCluster(cluster_name, source, [access_key_id, secret_access_key,] format, compression_method, structure) - * A table function, which allows to process many files from Azure Blob Storage on a specific cluster - * On initiator it creates a connection to _all_ nodes in cluster, discloses asterisks - * in Azure Blob Storage file path and dispatch each file dynamically. - * On worker node it asks initiator about next task to process, processes it. - * This is repeated until the tasks are finished. 
- */ -class TableFunctionAzureBlobStorageCluster : public ITableFunctionCluster -{ -public: - static constexpr auto name = "azureBlobStorageCluster"; - static constexpr auto signature = " - cluster, connection_string|storage_account_url, container_name, blobpath, [account_name, account_key, format, compression, structure]"; - - String getName() const override - { - return name; - } - - String getSignature() const override - { - return signature; - } - -protected: - StoragePtr executeImpl( - const ASTPtr & ast_function, - ContextPtr context, - const std::string & table_name, - ColumnsDescription cached_columns, - bool is_insert_query) const override; - - const char * getStorageTypeName() const override { return "AzureBlobStorageCluster"; } -}; - -} - -#endif diff --git a/src/TableFunctions/TableFunctionDeltaLake.cpp b/src/TableFunctions/TableFunctionDeltaLake.cpp index b8bf810f6fa..08b62ed2612 100644 --- a/src/TableFunctions/TableFunctionDeltaLake.cpp +++ b/src/TableFunctions/TableFunctionDeltaLake.cpp @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include "registerTableFunctions.h" namespace DB @@ -16,17 +16,17 @@ struct TableFunctionDeltaLakeName static constexpr auto name = "deltaLake"; }; -using TableFunctionDeltaLake = ITableFunctionDataLake; - -void registerTableFunctionDeltaLake(TableFunctionFactory & factory) -{ - factory.registerFunction( - {.documentation = { - .description=R"(The table function can be used to read the DeltaLake table stored on object store.)", - .examples{{"deltaLake", "SELECT * FROM deltaLake(url, access_key_id, secret_access_key)", ""}}, - .categories{"DataLake"}}, - .allow_readonly = false}); -} +// using TableFunctionDeltaLake = ITableFunctionDataLake; +// +// void registerTableFunctionDeltaLake(TableFunctionFactory & factory) +// { +// factory.registerFunction( +// {.documentation = { +// .description=R"(The table function can be used to read the DeltaLake table stored on object store.)", +// .examples{{"deltaLake", "SELECT * FROM deltaLake(url, access_key_id, secret_access_key)", ""}}, +// .categories{"DataLake"}}, +// .allow_readonly = false}); +// } } diff --git a/src/TableFunctions/TableFunctionHDFS.cpp b/src/TableFunctions/TableFunctionHDFS.cpp deleted file mode 100644 index 8d48a7ba30e..00000000000 --- a/src/TableFunctions/TableFunctionHDFS.cpp +++ /dev/null @@ -1,54 +0,0 @@ -#include "config.h" -#include "registerTableFunctions.h" - -#if USE_HDFS -#include -#include -#include -#include -#include -#include -#include - -namespace DB -{ - -StoragePtr TableFunctionHDFS::getStorage( - const String & source, const String & format_, const ColumnsDescription & columns, ContextPtr global_context, - const std::string & table_name, const String & compression_method_) const -{ - return std::make_shared( - source, - StorageID(getDatabaseName(), table_name), - format_, - columns, - ConstraintsDescription{}, - String{}, - global_context, - compression_method_); -} - -ColumnsDescription TableFunctionHDFS::getActualTableStructure(ContextPtr context, bool /*is_insert_query*/) const -{ - if (structure == "auto") - { - context->checkAccess(getSourceAccessType()); - return StorageHDFS::getTableStructureFromData(format, filename, compression_method, context); - } - - return parseColumnsListFromString(structure, context); -} - -std::unordered_set TableFunctionHDFS::getVirtualsToCheckBeforeUsingStructureHint() const -{ - auto virtual_column_names = StorageHDFS::getVirtualColumnNames(); - return {virtual_column_names.begin(), virtual_column_names.end()}; -} - 
-void registerTableFunctionHDFS(TableFunctionFactory & factory) -{ - factory.registerFunction(); -} - -} -#endif diff --git a/src/TableFunctions/TableFunctionHDFS.h b/src/TableFunctions/TableFunctionHDFS.h deleted file mode 100644 index 3a719496b26..00000000000 --- a/src/TableFunctions/TableFunctionHDFS.h +++ /dev/null @@ -1,50 +0,0 @@ -#pragma once - -#include "config.h" - -#if USE_HDFS - -#include - - -namespace DB -{ - -class Context; - -/* hdfs(URI, [format, structure, compression]) - creates a temporary storage from hdfs files - * - */ -class TableFunctionHDFS : public ITableFunctionFileLike -{ -public: - static constexpr auto name = "hdfs"; - static constexpr auto signature = " - uri\n" - " - uri, format\n" - " - uri, format, structure\n" - " - uri, format, structure, compression_method\n"; - - String getName() const override - { - return name; - } - - String getSignature() const override - { - return signature; - } - - ColumnsDescription getActualTableStructure(ContextPtr context, bool is_insert_query) const override; - - std::unordered_set getVirtualsToCheckBeforeUsingStructureHint() const override; - -private: - StoragePtr getStorage( - const String & source, const String & format_, const ColumnsDescription & columns, ContextPtr global_context, - const std::string & table_name, const String & compression_method_) const override; - const char * getStorageTypeName() const override { return "HDFS"; } -}; - -} - -#endif diff --git a/src/TableFunctions/TableFunctionHDFSCluster.cpp b/src/TableFunctions/TableFunctionHDFSCluster.cpp deleted file mode 100644 index 6fb7ed0fce5..00000000000 --- a/src/TableFunctions/TableFunctionHDFSCluster.cpp +++ /dev/null @@ -1,61 +0,0 @@ -#include "config.h" - -#if USE_HDFS - -#include -#include - -#include -#include -#include "registerTableFunctions.h" - -#include - - -namespace DB -{ - -StoragePtr TableFunctionHDFSCluster::getStorage( - const String & /*source*/, const String & /*format_*/, const ColumnsDescription & columns, ContextPtr context, - const std::string & table_name, const String & /*compression_method_*/) const -{ - StoragePtr storage; - if (context->getClientInfo().query_kind == ClientInfo::QueryKind::SECONDARY_QUERY) - { - /// On worker node this uri won't contains globs - storage = std::make_shared( - filename, - StorageID(getDatabaseName(), table_name), - format, - columns, - ConstraintsDescription{}, - String{}, - context, - compression_method, - /*distributed_processing=*/true, - nullptr); - } - else - { - storage = std::make_shared( - context, - cluster_name, - filename, - StorageID(getDatabaseName(), table_name), - format, - columns, - ConstraintsDescription{}, - compression_method, - structure != "auto"); - } - return storage; -} - -void registerTableFunctionHDFSCluster(TableFunctionFactory & factory) -{ - factory.registerFunction(); -} - -} - -#endif diff --git a/src/TableFunctions/TableFunctionHDFSCluster.h b/src/TableFunctions/TableFunctionHDFSCluster.h deleted file mode 100644 index 0253217feb7..00000000000 --- a/src/TableFunctions/TableFunctionHDFSCluster.h +++ /dev/null @@ -1,54 +0,0 @@ -#pragma once - -#include "config.h" - -#if USE_HDFS - -#include -#include -#include - - -namespace DB -{ - -class Context; - -/** - * hdfsCluster(cluster, URI, format, structure, compression_method) - * A table function, which allows to process many files from HDFS on a specific cluster - * On initiator it creates a connection to _all_ nodes in cluster, discloses asterisks - * in HDFS file path and dispatch each file dynamically. 
- * On worker node it asks initiator about next task to process, processes it. - * This is repeated until the tasks are finished. - */ -class TableFunctionHDFSCluster : public ITableFunctionCluster -{ -public: - static constexpr auto name = "hdfsCluster"; - static constexpr auto signature = " - cluster_name, uri\n" - " - cluster_name, uri, format\n" - " - cluster_name, uri, format, structure\n" - " - cluster_name, uri, format, structure, compression_method\n"; - - String getName() const override - { - return name; - } - - String getSignature() const override - { - return signature; - } - -protected: - StoragePtr getStorage( - const String & source, const String & format_, const ColumnsDescription & columns, ContextPtr global_context, - const std::string & table_name, const String & compression_method_) const override; - - const char * getStorageTypeName() const override { return "HDFSCluster"; } -}; - -} - -#endif diff --git a/src/TableFunctions/TableFunctionHudi.cpp b/src/TableFunctions/TableFunctionHudi.cpp index 436e708b72d..c6d84504c40 100644 --- a/src/TableFunctions/TableFunctionHudi.cpp +++ b/src/TableFunctions/TableFunctionHudi.cpp @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include "registerTableFunctions.h" namespace DB @@ -15,17 +15,17 @@ struct TableFunctionHudiName { static constexpr auto name = "hudi"; }; -using TableFunctionHudi = ITableFunctionDataLake; - -void registerTableFunctionHudi(TableFunctionFactory & factory) -{ - factory.registerFunction( - {.documentation - = {.description=R"(The table function can be used to read the Hudi table stored on object store.)", - .examples{{"hudi", "SELECT * FROM hudi(url, access_key_id, secret_access_key)", ""}}, - .categories{"DataLake"}}, - .allow_readonly = false}); -} +// using TableFunctionHudi = ITableFunctionDataLake; +// +// void registerTableFunctionHudi(TableFunctionFactory & factory) +// { +// factory.registerFunction( +// {.documentation +// = {.description=R"(The table function can be used to read the Hudi table stored on object store.)", +// .examples{{"hudi", "SELECT * FROM hudi(url, access_key_id, secret_access_key)", ""}}, +// .categories{"DataLake"}}, +// .allow_readonly = false}); +// } } #endif diff --git a/src/TableFunctions/TableFunctionIceberg.cpp b/src/TableFunctions/TableFunctionIceberg.cpp index d37aace01c6..1a28f9292d1 100644 --- a/src/TableFunctions/TableFunctionIceberg.cpp +++ b/src/TableFunctions/TableFunctionIceberg.cpp @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include "registerTableFunctions.h" @@ -17,7 +17,10 @@ struct TableFunctionIcebergName static constexpr auto name = "iceberg"; }; -using TableFunctionIceberg = ITableFunctionDataLake; +using TableFunctionIceberg = ITableFunctionDataLake< + TableFunctionIcebergName, + StorageIceberg, + TableFunctionS3>; void registerTableFunctionIceberg(TableFunctionFactory & factory) { diff --git a/src/TableFunctions/TableFunctionObjectStorage.cpp b/src/TableFunctions/TableFunctionObjectStorage.cpp new file mode 100644 index 00000000000..d009a9347f3 --- /dev/null +++ b/src/TableFunctions/TableFunctionObjectStorage.cpp @@ -0,0 +1,224 @@ +#include "config.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "registerTableFunctions.h" + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int BAD_ARGUMENTS; +} + +static void initializeConfiguration( + 
StorageObjectStorageConfiguration & configuration, + ASTs & engine_args, + ContextPtr local_context, + bool with_table_structure) +{ + if (auto named_collection = tryGetNamedCollectionWithOverrides(engine_args, local_context)) + configuration.fromNamedCollection(*named_collection); + else + configuration.fromAST(engine_args, local_context, with_table_structure); +} + +template +ObjectStoragePtr TableFunctionObjectStorage::getObjectStorage(const ContextPtr & context, bool create_readonly) const +{ + if (!object_storage) + object_storage = configuration->createOrUpdateObjectStorage(context, create_readonly); + return object_storage; +} + +template +std::vector TableFunctionObjectStorage::skipAnalysisForArguments(const QueryTreeNodePtr & query_node_table_function, ContextPtr) const +{ + auto & table_function_node = query_node_table_function->as(); + auto & table_function_arguments_nodes = table_function_node.getArguments().getNodes(); + size_t table_function_arguments_size = table_function_arguments_nodes.size(); + + std::vector result; + for (size_t i = 0; i < table_function_arguments_size; ++i) + { + auto * function_node = table_function_arguments_nodes[i]->as(); + if (function_node && function_node->getFunctionName() == "headers") + result.push_back(i); + } + return result; +} + +template +void TableFunctionObjectStorage::addColumnsStructureToArguments(ASTs & args, const String & structure, const ContextPtr & context) +{ + Configuration::addStructureToArgs(args, structure, context); +} + +template +void TableFunctionObjectStorage::parseArgumentsImpl(ASTs & engine_args, const ContextPtr & local_context) +{ + configuration = std::make_shared(); + initializeConfiguration(*configuration, engine_args, local_context, true); +} + +template +void TableFunctionObjectStorage::parseArguments(const ASTPtr & ast_function, ContextPtr context) +{ + /// Clone ast function, because we can modify its arguments like removing headers. 
+ auto ast_copy = ast_function->clone(); + ASTs & args_func = ast_copy->children; + if (args_func.size() != 1) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Table function '{}' must have arguments.", getName()); + + auto & args = args_func.at(0)->children; + parseArgumentsImpl(args, context); +} + +template +ColumnsDescription TableFunctionObjectStorage::getActualTableStructure(ContextPtr context, bool is_insert_query) const +{ + if (configuration->structure == "auto") + { + context->checkAccess(getSourceAccessType()); + auto storage = getObjectStorage(context, !is_insert_query); + return StorageObjectStorage::getTableStructureFromData(storage, configuration, std::nullopt, context); + } + + return parseColumnsListFromString(configuration->structure, context); +} + +template +bool TableFunctionObjectStorage::supportsReadingSubsetOfColumns(const ContextPtr & context) +{ + return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(configuration->format, context); +} + +template +std::unordered_set TableFunctionObjectStorage::getVirtualsToCheckBeforeUsingStructureHint() const +{ + auto virtual_column_names = StorageObjectStorage::getVirtualColumnNames(); + return {virtual_column_names.begin(), virtual_column_names.end()}; +} + +template +StoragePtr TableFunctionObjectStorage::executeImpl( + const ASTPtr & /* ast_function */, + ContextPtr context, + const std::string & table_name, + ColumnsDescription cached_columns, + bool is_insert_query) const +{ + ColumnsDescription columns; + if (configuration->structure != "auto") + columns = parseColumnsListFromString(configuration->structure, context); + else if (!structure_hint.empty()) + columns = structure_hint; + else if (!cached_columns.empty()) + columns = cached_columns; + + StoragePtr storage = std::make_shared>( + configuration, + getObjectStorage(context, !is_insert_query), + Definition::storage_type_name, + context, + StorageID(getDatabaseName(), table_name), + columns, + ConstraintsDescription{}, + String{}, + /// No format_settings for table function Azure + std::nullopt, + /* distributed_processing */ false, + nullptr); + + storage->startup(); + return storage; +} + +void registerTableFunctionObjectStorage(TableFunctionFactory & factory) +{ +#if USE_AWS_S3 + factory.registerFunction>( + { + .documentation = + { + .description=R"(The table function can be used to read the data stored on AWS S3.)", + .examples{{"s3", "SELECT * FROM s3(url, access_key_id, secret_access_key)", ""} + }, + .categories{"DataLake"}}, + .allow_readonly = false + }); + + factory.registerFunction>( + { + .allow_readonly = false + }); + + factory.registerFunction>( + { + .allow_readonly = false + }); + factory.registerFunction>( + { + .allow_readonly = false + }); +#endif + +#if USE_AZURE_BLOB_STORAGE + factory.registerFunction>( + { + .documentation = + { + .description=R"(The table function can be used to read the data stored on Azure Blob Storage.)", + .examples{ + { + "azureBlobStorage", + "SELECT * FROM azureBlobStorage(connection_string|storage_account_url, container_name, blobpath, " + "[account_name, account_key, format, compression, structure])", "" + }} + }, + .allow_readonly = false + }); +#endif +#if USE_HDFS + factory.registerFunction>( + { + .allow_readonly = false + }); +#endif +} + +#if USE_AZURE_BLOB_STORAGE +template class TableFunctionObjectStorage; +template class TableFunctionObjectStorage; +#endif + +#if USE_AWS_S3 +template class TableFunctionObjectStorage; +template class TableFunctionObjectStorage; +template class 
TableFunctionObjectStorage; +template class TableFunctionObjectStorage; +template class TableFunctionObjectStorage; +#endif + +#if USE_HDFS +template class TableFunctionObjectStorage; +template class TableFunctionObjectStorage; +#endif + +} diff --git a/src/TableFunctions/TableFunctionObjectStorage.h b/src/TableFunctions/TableFunctionObjectStorage.h new file mode 100644 index 00000000000..1df0ba2f843 --- /dev/null +++ b/src/TableFunctions/TableFunctionObjectStorage.h @@ -0,0 +1,150 @@ +#pragma once + +#include "config.h" + +#if USE_AZURE_BLOB_STORAGE + +#include +#include +#include + + +namespace DB +{ + +class Context; +class StorageS3Configuration; +class StorageAzureBlobConfiguration; +class StorageHDFSConfiguration; +struct S3StorageSettings; +struct AzureStorageSettings; +struct HDFSStorageSettings; + +struct AzureDefinition +{ + static constexpr auto name = "azureBlobStorage"; + static constexpr auto storage_type_name = "Azure"; + static constexpr auto signature = " - connection_string, container_name, blobpath\n" + " - connection_string, container_name, blobpath, structure \n" + " - connection_string, container_name, blobpath, format \n" + " - connection_string, container_name, blobpath, format, compression \n" + " - connection_string, container_name, blobpath, format, compression, structure \n" + " - storage_account_url, container_name, blobpath, account_name, account_key\n" + " - storage_account_url, container_name, blobpath, account_name, account_key, structure\n" + " - storage_account_url, container_name, blobpath, account_name, account_key, format\n" + " - storage_account_url, container_name, blobpath, account_name, account_key, format, compression\n" + " - storage_account_url, container_name, blobpath, account_name, account_key, format, compression, structure\n"; +}; + +struct S3Definition +{ + static constexpr auto name = "s3"; + static constexpr auto storage_type_name = "S3"; + static constexpr auto signature = " - url\n" + " - url, format\n" + " - url, format, structure\n" + " - url, format, structure, compression_method\n" + " - url, access_key_id, secret_access_key\n" + " - url, access_key_id, secret_access_key, session_token\n" + " - url, access_key_id, secret_access_key, format\n" + " - url, access_key_id, secret_access_key, session_token, format\n" + " - url, access_key_id, secret_access_key, format, structure\n" + " - url, access_key_id, secret_access_key, session_token, format, structure\n" + " - url, access_key_id, secret_access_key, format, structure, compression_method\n" + " - url, access_key_id, secret_access_key, session_token, format, structure, compression_method\n" + "All signatures supports optional headers (specified as `headers('name'='value', 'name2'='value2')`)"; +}; + +struct GCSDefinition +{ + static constexpr auto name = "gcs"; + static constexpr auto storage_type_name = "GCS"; + static constexpr auto signature = S3Definition::signature; +}; + +struct COSNDefinition +{ + static constexpr auto name = "cosn"; + static constexpr auto storage_type_name = "COSN"; + static constexpr auto signature = S3Definition::signature; +}; + +struct OSSDefinition +{ + static constexpr auto name = "oss"; + static constexpr auto storage_type_name = "OSS"; + static constexpr auto signature = S3Definition::signature; +}; + +struct HDFSDefinition +{ + static constexpr auto name = "hdfs"; + static constexpr auto storage_type_name = "HDFS"; + static constexpr auto signature = " - uri\n" + " - uri, format\n" + " - uri, format, structure\n" + " - uri, format, structure, 
compression_method\n"; +}; + +template +class TableFunctionObjectStorage : public ITableFunction +{ +public: + static constexpr auto name = Definition::name; + static constexpr auto signature = Definition::signature; + + static size_t getMaxNumberOfArguments() { return 8; } + + String getName() const override { return name; } + + virtual String getSignature() const { return signature; } + + bool hasStaticStructure() const override { return configuration->structure != "auto"; } + + bool needStructureHint() const override { return configuration->structure == "auto"; } + + void setStructureHint(const ColumnsDescription & structure_hint_) override { structure_hint = structure_hint_; } + + bool supportsReadingSubsetOfColumns(const ContextPtr & context) override; + + std::unordered_set getVirtualsToCheckBeforeUsingStructureHint() const override; + + virtual void parseArgumentsImpl(ASTs & args, const ContextPtr & context); + + static void addColumnsStructureToArguments(ASTs & args, const String & structure, const ContextPtr & context); + +protected: + StoragePtr executeImpl( + const ASTPtr & ast_function, + ContextPtr context, + const std::string & table_name, + ColumnsDescription cached_columns, + bool is_insert_query) const override; + + const char * getStorageTypeName() const override { return Definition::storage_type_name; } + + ColumnsDescription getActualTableStructure(ContextPtr context, bool is_insert_query) const override; + void parseArguments(const ASTPtr & ast_function, ContextPtr context) override; + ObjectStoragePtr getObjectStorage(const ContextPtr & context, bool create_readonly) const; + + mutable typename StorageObjectStorage::ConfigurationPtr configuration; + mutable ObjectStoragePtr object_storage; + ColumnsDescription structure_hint; + + std::vector skipAnalysisForArguments(const QueryTreeNodePtr & query_node_table_function, ContextPtr context) const override; +}; + +#if USE_AWS_S3 +using TableFunctionS3 = TableFunctionObjectStorage; +#endif + +#if USE_AZURE_BLOB_STORAGE +using TableFunctionAzureBlob = TableFunctionObjectStorage; +#endif + +#if USE_HDFS +using TableFunctionHDFS = TableFunctionObjectStorage; +#endif +} + +#endif diff --git a/src/TableFunctions/TableFunctionObjectStorageCluster.cpp b/src/TableFunctions/TableFunctionObjectStorageCluster.cpp new file mode 100644 index 00000000000..1d27a857cea --- /dev/null +++ b/src/TableFunctions/TableFunctionObjectStorageCluster.cpp @@ -0,0 +1,113 @@ +#include "config.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +template +StoragePtr TableFunctionObjectStorageCluster::executeImpl( + const ASTPtr & /*function*/, ContextPtr context, + const std::string & table_name, ColumnsDescription /*cached_columns*/, bool is_insert_query) const +{ + using Base = TableFunctionObjectStorage; + + StoragePtr storage; + ColumnsDescription columns; + bool structure_argument_was_provided = Base::configuration->structure != "auto"; + + if (structure_argument_was_provided) + { + columns = parseColumnsListFromString(Base::configuration->structure, context); + } + else if (!Base::structure_hint.empty()) + { + columns = Base::structure_hint; + } + + if (context->getClientInfo().query_kind == ClientInfo::QueryKind::SECONDARY_QUERY) + { + /// On worker node this filename won't contains globs + storage = std::make_shared>( + Base::configuration, + Base::configuration->createOrUpdateObjectStorage(context, !is_insert_query), + Definition::storage_type_name, + context, + 
StorageID(Base::getDatabaseName(), table_name), + columns, + ConstraintsDescription{}, + /* comment */String{}, + /* format_settings */std::nullopt, /// No format_settings + /* distributed_processing */ true, + /*partition_by_=*/nullptr); + } + else + { + storage = std::make_shared>( + ITableFunctionCluster::cluster_name, + Base::configuration, + Base::configuration->createOrUpdateObjectStorage(context, !is_insert_query), + Definition::storage_type_name, + StorageID(Base::getDatabaseName(), table_name), + columns, + ConstraintsDescription{}, + context, + structure_argument_was_provided); + } + + storage->startup(); + return storage; +} + + +void registerTableFunctionObjectStorageCluster(TableFunctionFactory & factory) +{ +#if USE_AWS_S3 + factory.registerFunction( + { + .documentation = { + .description=R"(The table function can be used to read the data stored on Azure Blob Storage in parallel for many nodes in a specified cluster.)", + .examples{{"azureBlobStorageCluster", "SELECT * FROM azureBlobStorageCluster(cluster, connection_string|storage_account_url, container_name, blobpath, [account_name, account_key, format, compression, structure])", ""}}}, + .allow_readonly = false + } + ); +#endif + +#if USE_AZURE_BLOB_STORAGE + factory.registerFunction( + { + .documentation = { + .description=R"(The table function can be used to read the data stored on Azure Blob Storage in parallel for many nodes in a specified cluster.)", + .examples{{"azureBlobStorageCluster", "SELECT * FROM azureBlobStorageCluster(cluster, connection_string|storage_account_url, container_name, blobpath, [account_name, account_key, format, compression, structure])", ""}}}, + .allow_readonly = false + } + ); +#endif + +#if USE_HDFS + factory.registerFunction(); +#endif +} + +#if USE_AWS_S3 +template class TableFunctionObjectStorageCluster; +#endif + +#if USE_AZURE_BLOB_STORAGE +template class TableFunctionObjectStorageCluster; +#endif + +#if USE_HDFS +template class TableFunctionObjectStorageCluster; +#endif +} diff --git a/src/TableFunctions/TableFunctionObjectStorageCluster.h b/src/TableFunctions/TableFunctionObjectStorageCluster.h new file mode 100644 index 00000000000..461456e37df --- /dev/null +++ b/src/TableFunctions/TableFunctionObjectStorageCluster.h @@ -0,0 +1,91 @@ +#pragma once +#include "config.h" +#include +#include +#include + + +namespace DB +{ + +class Context; + +class StorageS3Settings; +class StorageAzureBlobSettings; +class StorageS3Configuration; +class StorageAzureBlobConfiguration; + +struct AzureClusterDefinition +{ + /** + * azureBlobStorageCluster(cluster_name, source, [access_key_id, secret_access_key,] format, compression_method, structure) + * A table function, which allows to process many files from Azure Blob Storage on a specific cluster + * On initiator it creates a connection to _all_ nodes in cluster, discloses asterisks + * in Azure Blob Storage file path and dispatch each file dynamically. + * On worker node it asks initiator about next task to process, processes it. + * This is repeated until the tasks are finished. 
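+     *
+     * Hedged usage sketch (not part of this patch): 'my_cluster' and the connection values below are
+     * hypothetical placeholders that merely follow the signature documented above.
+     *
+     *   SELECT count()
+     *   FROM azureBlobStorageCluster('my_cluster',
+     *       'https://myaccount.blob.core.windows.net', 'mycontainer', 'data/*.parquet',
+     *       'myaccount', 'mykey', 'Parquet');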
+ */ + static constexpr auto name = "azureBlobStorageCluster"; + static constexpr auto storage_type_name = "AzureBlobStorageCluster"; + static constexpr auto signature = " - cluster, connection_string|storage_account_url, container_name, blobpath, [account_name, account_key, format, compression, structure]"; +}; + +struct S3ClusterDefinition +{ + static constexpr auto name = "s3Cluster"; + static constexpr auto storage_type_name = "S3Cluster"; + static constexpr auto signature = " - cluster, url\n" + " - cluster, url, format\n" + " - cluster, url, format, structure\n" + " - cluster, url, access_key_id, secret_access_key\n" + " - cluster, url, format, structure, compression_method\n" + " - cluster, url, access_key_id, secret_access_key, format\n" + " - cluster, url, access_key_id, secret_access_key, format, structure\n" + " - cluster, url, access_key_id, secret_access_key, format, structure, compression_method\n" + " - cluster, url, access_key_id, secret_access_key, session_token, format, structure, compression_method\n" + "All signatures supports optional headers (specified as `headers('name'='value', 'name2'='value2')`)"; +}; + +struct HDFSClusterDefinition +{ + static constexpr auto name = "hdfsCluster"; + static constexpr auto storage_type_name = "HDFSCluster"; + static constexpr auto signature = " - cluster_name, uri\n" + " - cluster_name, uri, format\n" + " - cluster_name, uri, format, structure\n" + " - cluster_name, uri, format, structure, compression_method\n"; +}; + +template +class TableFunctionObjectStorageCluster : public ITableFunctionCluster> +{ +public: + static constexpr auto name = Definition::name; + static constexpr auto signature = Definition::signature; + + String getName() const override { return name; } + String getSignature() const override { return signature; } + +protected: + StoragePtr executeImpl( + const ASTPtr & ast_function, + ContextPtr context, + const std::string & table_name, + ColumnsDescription cached_columns, + bool is_insert_query) const override; + + const char * getStorageTypeName() const override { return Definition::storage_type_name; } +}; + +#if USE_AWS_S3 +using TableFunctionS3Cluster = TableFunctionObjectStorageCluster; +#endif + +#if USE_AZURE_BLOB_STORAGE +using TableFunctionAzureBlobCluster = TableFunctionObjectStorageCluster; +#endif + +#if USE_HDFS +using TableFunctionHDFSCluster = TableFunctionObjectStorageCluster; +#endif +} diff --git a/src/TableFunctions/TableFunctionS3.cpp b/src/TableFunctions/TableFunctionS3.cpp deleted file mode 100644 index a9c5a5c99f0..00000000000 --- a/src/TableFunctions/TableFunctionS3.cpp +++ /dev/null @@ -1,464 +0,0 @@ -#include "config.h" - -#if USE_AWS_S3 - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "registerTableFunctions.h" -#include -#include - -#include - - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; - extern const int LOGICAL_ERROR; -} - - -std::vector TableFunctionS3::skipAnalysisForArguments(const QueryTreeNodePtr & query_node_table_function, ContextPtr) const -{ - auto & table_function_node = query_node_table_function->as(); - auto & table_function_arguments_nodes = table_function_node.getArguments().getNodes(); - size_t table_function_arguments_size = table_function_arguments_nodes.size(); - - std::vector result; - - for (size_t i = 0; i < table_function_arguments_size; ++i) - { - auto * function_node = 
table_function_arguments_nodes[i]->as(); - if (function_node && function_node->getFunctionName() == "headers") - result.push_back(i); - } - - return result; -} - -/// This is needed to avoid copy-paste. Because s3Cluster arguments only differ in additional argument (first) - cluster name -void TableFunctionS3::parseArgumentsImpl(ASTs & args, const ContextPtr & context) -{ - if (auto named_collection = tryGetNamedCollectionWithOverrides(args, context)) - { - StorageS3::processNamedCollectionResult(configuration, *named_collection); - if (configuration.format == "auto") - { - String file_path = named_collection->getOrDefault("filename", Poco::URI(named_collection->get("url")).getPath()); - configuration.format = FormatFactory::instance().getFormatFromFileName(file_path, true); - } - } - else - { - - size_t count = StorageURL::evalArgsAndCollectHeaders(args, configuration.headers_from_ast, context); - - if (count == 0 || count > 7) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "The signature of table function {} shall be the following:\n{}", getName(), getSignature()); - - std::unordered_map args_to_idx; - - bool no_sign_request = false; - - /// For 2 arguments we support 2 possible variants: - /// - s3(source, format) - /// - s3(source, NOSIGN) - /// We can distinguish them by looking at the 2-nd argument: check if it's NOSIGN or not. - if (count == 2) - { - auto second_arg = checkAndGetLiteralArgument(args[1], "format/NOSIGN"); - if (boost::iequals(second_arg, "NOSIGN")) - no_sign_request = true; - else - args_to_idx = {{"format", 1}}; - } - /// For 3 arguments we support 3 possible variants: - /// - s3(source, format, structure) - /// - s3(source, access_key_id, secret_access_key) - /// - s3(source, NOSIGN, format) - /// We can distinguish them by looking at the 2-nd argument: check if it's a format name or not. - else if (count == 3) - { - auto second_arg = checkAndGetLiteralArgument(args[1], "format/access_key_id/NOSIGN"); - if (boost::iequals(second_arg, "NOSIGN")) - { - no_sign_request = true; - args_to_idx = {{"format", 2}}; - } - else if (second_arg == "auto" || FormatFactory::instance().getAllFormats().contains(second_arg)) - args_to_idx = {{"format", 1}, {"structure", 2}}; - else - args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}}; - } - /// For 4 arguments we support 4 possible variants: - /// - s3(source, format, structure, compression_method), - /// - s3(source, access_key_id, secret_access_key, format), - /// - s3(source, access_key_id, secret_access_key, session_token) - /// - s3(source, NOSIGN, format, structure) - /// We can distinguish them by looking at the 2-nd and 4-th argument: check if it's a format name or not. 
- else if (count == 4) - { - auto second_arg = checkAndGetLiteralArgument(args[1], "format/access_key_id/NOSIGN"); - if (boost::iequals(second_arg, "NOSIGN")) - { - no_sign_request = true; - args_to_idx = {{"format", 2}, {"structure", 3}}; - } - else if (second_arg == "auto" || FormatFactory::instance().getAllFormats().contains(second_arg)) - { - args_to_idx = {{"format", 1}, {"structure", 2}, {"compression_method", 3}}; - } - else - { - auto fourth_arg = checkAndGetLiteralArgument(args[3], "format/session_token"); - if (fourth_arg == "auto" || FormatFactory::instance().getAllFormats().contains(fourth_arg)) - { - args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}, {"format", 3}}; - } - else - { - args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}, {"session_token", 3}}; - } - } - } - /// For 5 arguments we support 3 possible variants: - /// - s3(source, access_key_id, secret_access_key, format, structure) - /// - s3(source, access_key_id, secret_access_key, session_token, format) - /// - s3(source, NOSIGN, format, structure, compression_method) - /// We can distinguish them by looking at the 2-nd argument: check if it's a NOSIGN keyword name or no, - /// and by the 4-th argument, check if it's a format name or not - else if (count == 5) - { - auto second_arg = checkAndGetLiteralArgument(args[1], "NOSIGN/access_key_id"); - if (boost::iequals(second_arg, "NOSIGN")) - { - no_sign_request = true; - args_to_idx = {{"format", 2}, {"structure", 3}, {"compression_method", 4}}; - } - else - { - auto fourth_arg = checkAndGetLiteralArgument(args[3], "format/session_token"); - if (fourth_arg == "auto" || FormatFactory::instance().getAllFormats().contains(fourth_arg)) - { - args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}, {"format", 3}, {"structure", 4}}; - } - else - { - args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}, {"session_token", 3}, {"format", 4}}; - } - } - } - // For 6 arguments we support 2 possible variants: - /// - s3(source, access_key_id, secret_access_key, format, structure, compression_method) - /// - s3(source, access_key_id, secret_access_key, session_token, format, structure) - /// We can distinguish them by looking at the 4-th argument: check if it's a format name or not - else if (count == 6) - { - auto fourth_arg = checkAndGetLiteralArgument(args[3], "format/session_token"); - if (fourth_arg == "auto" || FormatFactory::instance().getAllFormats().contains(fourth_arg)) - { - args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}, {"format", 3}, {"structure", 4}, {"compression_method", 5}}; - } - else - { - args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}, {"session_token", 3}, {"format", 4}, {"structure", 5}}; - } - } - else if (count == 7) - { - args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}, {"session_token", 3}, {"format", 4}, {"structure", 5}, {"compression_method", 6}}; - } - - /// This argument is always the first - String url = checkAndGetLiteralArgument(args[0], "url"); - configuration.url = S3::URI(url); - - if (args_to_idx.contains("format")) - { - auto format = checkAndGetLiteralArgument(args[args_to_idx["format"]], "format"); - /// Set format to configuration only of it's not 'auto', - /// because we can have default format set in configuration. 
- if (format != "auto") - configuration.format = format; - } - - if (args_to_idx.contains("structure")) - configuration.structure = checkAndGetLiteralArgument(args[args_to_idx["structure"]], "structure"); - - if (args_to_idx.contains("compression_method")) - configuration.compression_method = checkAndGetLiteralArgument(args[args_to_idx["compression_method"]], "compression_method"); - - if (args_to_idx.contains("access_key_id")) - configuration.auth_settings.access_key_id = checkAndGetLiteralArgument(args[args_to_idx["access_key_id"]], "access_key_id"); - - if (args_to_idx.contains("secret_access_key")) - configuration.auth_settings.secret_access_key = checkAndGetLiteralArgument(args[args_to_idx["secret_access_key"]], "secret_access_key"); - - if (args_to_idx.contains("session_token")) - configuration.auth_settings.session_token = checkAndGetLiteralArgument(args[args_to_idx["session_token"]], "session_token"); - - configuration.auth_settings.no_sign_request = no_sign_request; - - if (configuration.format == "auto") - configuration.format = FormatFactory::instance().getFormatFromFileName(Poco::URI(url).getPath(), true); - } - - configuration.keys = {configuration.url.key}; -} - -void TableFunctionS3::parseArguments(const ASTPtr & ast_function, ContextPtr context) -{ - /// Clone ast function, because we can modify its arguments like removing headers. - auto ast_copy = ast_function->clone(); - - /// Parse args - ASTs & args_func = ast_function->children; - - if (args_func.size() != 1) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Table function '{}' must have arguments.", getName()); - - auto & args = args_func.at(0)->children; - - parseArgumentsImpl(args, context); -} - -void TableFunctionS3::addColumnsStructureToArguments(ASTs & args, const String & structure, const ContextPtr & context) -{ - if (tryGetNamedCollectionWithOverrides(args, context)) - { - /// In case of named collection, just add key-value pair "structure='...'" - /// at the end of arguments to override existed structure. - ASTs equal_func_args = {std::make_shared("structure"), std::make_shared(structure)}; - auto equal_func = makeASTFunction("equals", std::move(equal_func_args)); - args.push_back(equal_func); - } - else - { - HTTPHeaderEntries tmp_headers; - size_t count = StorageURL::evalArgsAndCollectHeaders(args, tmp_headers, context); - - if (count == 0 || count > getMaxNumberOfArguments()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected 1 to {} arguments in table function, got {}", getMaxNumberOfArguments(), count); - - auto structure_literal = std::make_shared(structure); - - /// s3(s3_url) - if (count == 1) - { - /// Add format=auto before structure argument. - args.push_back(std::make_shared("auto")); - args.push_back(structure_literal); - } - /// s3(s3_url, format) or s3(s3_url, NOSIGN) - /// We can distinguish them by looking at the 2-nd argument: check if it's NOSIGN or not. - else if (count == 2) - { - auto second_arg = checkAndGetLiteralArgument(args[1], "format/NOSIGN"); - /// If there is NOSIGN, add format=auto before structure. - if (boost::iequals(second_arg, "NOSIGN")) - args.push_back(std::make_shared("auto")); - args.push_back(structure_literal); - } - /// s3(source, format, structure) or - /// s3(source, access_key_id, secret_access_key) or - /// s3(source, NOSIGN, format) - /// We can distinguish them by looking at the 2-nd argument: check if it's NOSIGN, format name or neither. 
- else if (count == 3) - { - auto second_arg = checkAndGetLiteralArgument(args[1], "format/NOSIGN"); - if (boost::iequals(second_arg, "NOSIGN")) - { - args.push_back(structure_literal); - } - else if (second_arg == "auto" || FormatFactory::instance().getAllFormats().contains(second_arg)) - { - args[count - 1] = structure_literal; - } - else - { - /// Add format=auto before structure argument. - args.push_back(std::make_shared("auto")); - args.push_back(structure_literal); - } - } - /// s3(source, format, structure, compression_method) or - /// s3(source, access_key_id, secret_access_key, format) or - /// s3(source, NOSIGN, format, structure) - /// We can distinguish them by looking at the 2-nd argument: check if it's NOSIGN, format name or neither. - else if (count == 4) - { - auto second_arg = checkAndGetLiteralArgument(args[1], "format/NOSIGN"); - if (boost::iequals(second_arg, "NOSIGN")) - { - args[count - 1] = structure_literal; - } - else if (second_arg == "auto" || FormatFactory::instance().getAllFormats().contains(second_arg)) - { - args[count - 2] = structure_literal; - } - else - { - args.push_back(structure_literal); - } - } - /// s3(source, access_key_id, secret_access_key, format, structure) or - /// s3(source, NOSIGN, format, structure, compression_method) - /// We can distinguish them by looking at the 2-nd argument: check if it's a NOSIGN keyword name or not. - else if (count == 5) - { - auto sedond_arg = checkAndGetLiteralArgument(args[1], "format/NOSIGN"); - if (boost::iequals(sedond_arg, "NOSIGN")) - { - args[count - 2] = structure_literal; - } - else - { - args[count - 1] = structure_literal; - } - } - /// s3(source, access_key_id, secret_access_key, format, structure, compression) - else if (count == 6) - { - args[count - 2] = structure_literal; - } - } -} - -ColumnsDescription TableFunctionS3::getActualTableStructure(ContextPtr context, bool /*is_insert_query*/) const -{ - if (configuration.structure == "auto") - { - context->checkAccess(getSourceAccessType()); - configuration.update(context); - return StorageS3::getTableStructureFromData(configuration, std::nullopt, context); - } - - return parseColumnsListFromString(configuration.structure, context); -} - -bool TableFunctionS3::supportsReadingSubsetOfColumns(const ContextPtr & context) -{ - return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(configuration.format, context); -} - -std::unordered_set TableFunctionS3::getVirtualsToCheckBeforeUsingStructureHint() const -{ - auto virtual_column_names = StorageS3::getVirtualColumnNames(); - return {virtual_column_names.begin(), virtual_column_names.end()}; -} - -StoragePtr TableFunctionS3::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns, bool /*is_insert_query*/) const -{ - S3::URI s3_uri (configuration.url); - - ColumnsDescription columns; - if (configuration.structure != "auto") - columns = parseColumnsListFromString(configuration.structure, context); - else if (!structure_hint.empty()) - columns = structure_hint; - else if (!cached_columns.empty()) - columns = cached_columns; - - StoragePtr storage = std::make_shared( - configuration, - context, - StorageID(getDatabaseName(), table_name), - columns, - ConstraintsDescription{}, - String{}, - /// No format_settings for table function S3 - std::nullopt); - - storage->startup(); - - return storage; -} - - -class TableFunctionGCS : public TableFunctionS3 -{ -public: - static constexpr auto name = "gcs"; - std::string getName() 
const override - { - return name; - } -private: - const char * getStorageTypeName() const override { return "GCS"; } -}; - -class TableFunctionCOS : public TableFunctionS3 -{ -public: - static constexpr auto name = "cosn"; - std::string getName() const override - { - return name; - } -private: - const char * getStorageTypeName() const override { return "COSN"; } -}; - -class TableFunctionOSS : public TableFunctionS3 -{ -public: - static constexpr auto name = "oss"; - std::string getName() const override - { - return name; - } -private: - const char * getStorageTypeName() const override { return "OSS"; } -}; - - -void registerTableFunctionGCS(TableFunctionFactory & factory) -{ - factory.registerFunction( - {.documentation - = {.description=R"(The table function can be used to read the data stored on Google Cloud Storage.)", - .examples{{"gcs", "SELECT * FROM gcs(url, hmac_key, hmac_secret)", ""}}, - .categories{"DataLake"}}, - .allow_readonly = false}); -} - -void registerTableFunctionS3(TableFunctionFactory & factory) -{ - factory.registerFunction( - {.documentation - = {.description=R"(The table function can be used to read the data stored on AWS S3.)", - .examples{{"s3", "SELECT * FROM s3(url, access_key_id, secret_access_key)", ""}}, - .categories{"DataLake"}}, - .allow_readonly = false}); -} - - -void registerTableFunctionCOS(TableFunctionFactory & factory) -{ - factory.registerFunction(); -} - -void registerTableFunctionOSS(TableFunctionFactory & factory) -{ - factory.registerFunction(); -} - -} - -#endif diff --git a/src/TableFunctions/TableFunctionS3.h b/src/TableFunctions/TableFunctionS3.h deleted file mode 100644 index fa73c1d313e..00000000000 --- a/src/TableFunctions/TableFunctionS3.h +++ /dev/null @@ -1,86 +0,0 @@ -#pragma once - -#include "config.h" - -#if USE_AWS_S3 - -#include -#include - - -namespace DB -{ - -class Context; - -/* s3(source, [access_key_id, secret_access_key,] [format, structure, compression]) - creates a temporary storage for a file in S3. 
- */ -class TableFunctionS3 : public ITableFunction -{ -public: - static constexpr auto name = "s3"; - static constexpr auto signature = " - url\n" - " - url, format\n" - " - url, format, structure\n" - " - url, format, structure, compression_method\n" - " - url, access_key_id, secret_access_key\n" - " - url, access_key_id, secret_access_key, session_token\n" - " - url, access_key_id, secret_access_key, format\n" - " - url, access_key_id, secret_access_key, session_token, format\n" - " - url, access_key_id, secret_access_key, format, structure\n" - " - url, access_key_id, secret_access_key, session_token, format, structure\n" - " - url, access_key_id, secret_access_key, format, structure, compression_method\n" - " - url, access_key_id, secret_access_key, session_token, format, structure, compression_method\n" - "All signatures supports optional headers (specified as `headers('name'='value', 'name2'='value2')`)"; - - static size_t getMaxNumberOfArguments() { return 6; } - - String getName() const override - { - return name; - } - - virtual String getSignature() const - { - return signature; - } - - bool hasStaticStructure() const override { return configuration.structure != "auto"; } - - bool needStructureHint() const override { return configuration.structure == "auto"; } - - void setStructureHint(const ColumnsDescription & structure_hint_) override { structure_hint = structure_hint_; } - - bool supportsReadingSubsetOfColumns(const ContextPtr & context) override; - - std::unordered_set getVirtualsToCheckBeforeUsingStructureHint() const override; - - virtual void parseArgumentsImpl(ASTs & args, const ContextPtr & context); - - static void addColumnsStructureToArguments(ASTs & args, const String & structure, const ContextPtr & context); - -protected: - - StoragePtr executeImpl( - const ASTPtr & ast_function, - ContextPtr context, - const std::string & table_name, - ColumnsDescription cached_columns, - bool is_insert_query) const override; - - const char * getStorageTypeName() const override { return "S3"; } - - ColumnsDescription getActualTableStructure(ContextPtr context, bool is_insert_query) const override; - void parseArguments(const ASTPtr & ast_function, ContextPtr context) override; - - mutable StorageS3::Configuration configuration; - ColumnsDescription structure_hint; - -private: - - std::vector skipAnalysisForArguments(const QueryTreeNodePtr & query_node_table_function, ContextPtr context) const override; -}; - -} - -#endif diff --git a/src/TableFunctions/TableFunctionS3Cluster.cpp b/src/TableFunctions/TableFunctionS3Cluster.cpp deleted file mode 100644 index ce96f7f580b..00000000000 --- a/src/TableFunctions/TableFunctionS3Cluster.cpp +++ /dev/null @@ -1,74 +0,0 @@ -#include "config.h" - -#if USE_AWS_S3 - -#include -#include -#include -#include - -#include "registerTableFunctions.h" - -#include - - -namespace DB -{ - -StoragePtr TableFunctionS3Cluster::executeImpl( - const ASTPtr & /*function*/, ContextPtr context, - const std::string & table_name, ColumnsDescription /*cached_columns*/, bool /*is_insert_query*/) const -{ - StoragePtr storage; - ColumnsDescription columns; - bool structure_argument_was_provided = configuration.structure != "auto"; - - if (structure_argument_was_provided) - { - columns = parseColumnsListFromString(configuration.structure, context); - } - else if (!structure_hint.empty()) - { - columns = structure_hint; - } - - if (context->getClientInfo().query_kind == ClientInfo::QueryKind::SECONDARY_QUERY) - { - /// On worker node this filename won't contains globs 
- storage = std::make_shared( - configuration, - context, - StorageID(getDatabaseName(), table_name), - columns, - ConstraintsDescription{}, - /* comment */String{}, - /* format_settings */std::nullopt, /// No format_settings for S3Cluster - /*distributed_processing=*/true); - } - else - { - storage = std::make_shared( - cluster_name, - configuration, - StorageID(getDatabaseName(), table_name), - columns, - ConstraintsDescription{}, - context, - structure_argument_was_provided); - } - - storage->startup(); - - return storage; -} - - -void registerTableFunctionS3Cluster(TableFunctionFactory & factory) -{ - factory.registerFunction(); -} - - -} - -#endif diff --git a/src/TableFunctions/TableFunctionS3Cluster.h b/src/TableFunctions/TableFunctionS3Cluster.h deleted file mode 100644 index 718b0d90de8..00000000000 --- a/src/TableFunctions/TableFunctionS3Cluster.h +++ /dev/null @@ -1,64 +0,0 @@ -#pragma once - -#include "config.h" - -#if USE_AWS_S3 - -#include -#include -#include -#include - - -namespace DB -{ - -class Context; - -/** - * s3cluster(cluster_name, source, [access_key_id, secret_access_key,] format, structure, compression_method) - * A table function, which allows to process many files from S3 on a specific cluster - * On initiator it creates a connection to _all_ nodes in cluster, discloses asterisks - * in S3 file path and dispatch each file dynamically. - * On worker node it asks initiator about next task to process, processes it. - * This is repeated until the tasks are finished. - */ -class TableFunctionS3Cluster : public ITableFunctionCluster -{ -public: - static constexpr auto name = "s3Cluster"; - static constexpr auto signature = " - cluster, url\n" - " - cluster, url, format\n" - " - cluster, url, format, structure\n" - " - cluster, url, access_key_id, secret_access_key\n" - " - cluster, url, format, structure, compression_method\n" - " - cluster, url, access_key_id, secret_access_key, format\n" - " - cluster, url, access_key_id, secret_access_key, format, structure\n" - " - cluster, url, access_key_id, secret_access_key, format, structure, compression_method\n" - " - cluster, url, access_key_id, secret_access_key, session_token, format, structure, compression_method\n" - "All signatures supports optional headers (specified as `headers('name'='value', 'name2'='value2')`)"; - - String getName() const override - { - return name; - } - - String getSignature() const override - { - return signature; - } - -protected: - StoragePtr executeImpl( - const ASTPtr & ast_function, - ContextPtr context, - const std::string & table_name, - ColumnsDescription cached_columns, - bool is_insert_query) const override; - - const char * getStorageTypeName() const override { return "S3Cluster"; } -}; - -} - -#endif diff --git a/src/TableFunctions/registerTableFunctions.cpp b/src/TableFunctions/registerTableFunctions.cpp index 8c18c298f45..627d945fbf3 100644 --- a/src/TableFunctions/registerTableFunctions.cpp +++ b/src/TableFunctions/registerTableFunctions.cpp @@ -28,26 +28,17 @@ void registerTableFunctions() #endif #if USE_AWS_S3 - registerTableFunctionS3(factory); - registerTableFunctionS3Cluster(factory); - registerTableFunctionCOS(factory); - registerTableFunctionOSS(factory); - registerTableFunctionGCS(factory); - registerTableFunctionHudi(factory); + // registerTableFunctionS3Cluster(factory); + // registerTableFunctionHudi(factory); #if USE_PARQUET - registerTableFunctionDeltaLake(factory); + // registerTableFunctionDeltaLake(factory); #endif #if USE_AVRO - 
registerTableFunctionIceberg(factory); + // registerTableFunctionIceberg(factory); #endif #endif -#if USE_HDFS - registerTableFunctionHDFS(factory); - registerTableFunctionHDFSCluster(factory); -#endif - #if USE_HIVE registerTableFunctionHive(factory); #endif @@ -75,10 +66,8 @@ void registerTableFunctions() registerTableFunctionFormat(factory); registerTableFunctionExplain(factory); -#if USE_AZURE_BLOB_STORAGE - registerTableFunctionAzureBlobStorage(factory); - registerTableFunctionAzureBlobStorageCluster(factory); -#endif + registerTableFunctionObjectStorage(factory); + registerTableFunctionObjectStorageCluster(factory); } diff --git a/src/TableFunctions/registerTableFunctions.h b/src/TableFunctions/registerTableFunctions.h index fae763e7dc8..cefb198273e 100644 --- a/src/TableFunctions/registerTableFunctions.h +++ b/src/TableFunctions/registerTableFunctions.h @@ -39,11 +39,6 @@ void registerTableFunctionIceberg(TableFunctionFactory & factory); #endif #endif -#if USE_HDFS -void registerTableFunctionHDFS(TableFunctionFactory & factory); -void registerTableFunctionHDFSCluster(TableFunctionFactory & factory); -#endif - #if USE_HIVE void registerTableFunctionHive(TableFunctionFactory & factory); #endif @@ -73,8 +68,8 @@ void registerTableFunctionFormat(TableFunctionFactory & factory); void registerTableFunctionExplain(TableFunctionFactory & factory); #if USE_AZURE_BLOB_STORAGE -void registerTableFunctionAzureBlobStorage(TableFunctionFactory & factory); -void registerTableFunctionAzureBlobStorageCluster(TableFunctionFactory & factory); +void registerTableFunctionObjectStorage(TableFunctionFactory & factory); +void registerTableFunctionObjectStorageCluster(TableFunctionFactory & factory); #endif void registerTableFunctions(); diff --git a/tests/integration/test_storage_azure_blob_storage/test.py b/tests/integration/test_storage_azure_blob_storage/test.py index 3cccd07c134..41218e41069 100644 --- a/tests/integration/test_storage_azure_blob_storage/test.py +++ b/tests/integration/test_storage_azure_blob_storage/test.py @@ -29,6 +29,8 @@ def cluster(): with_azurite=True, ) cluster.start() + container_client = cluster.blob_service_client.get_container_client("cont") + container_client.create_container() yield cluster finally: cluster.shutdown() @@ -129,8 +131,10 @@ def test_create_table_connection_string(cluster): node = cluster.instances["node"] azure_query( node, - f"CREATE TABLE test_create_table_conn_string (key UInt64, data String) Engine = AzureBlobStorage('{cluster.env_variables['AZURITE_CONNECTION_STRING']}'," - f"'cont', 'test_create_connection_string', 'CSV')", + f""" + CREATE TABLE test_create_table_conn_string (key UInt64, data String) + Engine = AzureBlobStorage('{cluster.env_variables['AZURITE_CONNECTION_STRING']}', 'cont', 'test_create_connection_string', 'CSV') + """, ) From 6d91d92601c04f160ba95a743fca270371b65eb8 Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 12 Feb 2024 18:17:22 +0100 Subject: [PATCH 011/392] Better --- src/Backups/BackupIO_AzureBlobStorage.cpp | 13 +- .../AzureBlobStorage/AzureObjectStorage.cpp | 8 +- .../AzureBlobStorage/AzureObjectStorage.h | 4 +- .../Cached/CachedObjectStorage.cpp | 2 +- .../Cached/CachedObjectStorage.h | 2 +- src/Disks/ObjectStorages/IObjectStorage.cpp | 6 +- src/Disks/ObjectStorages/IObjectStorage.h | 8 +- .../ObjectStorageIteratorAsync.cpp | 63 +- .../ObjectStorages/S3/S3ObjectStorage.cpp | 19 +- src/Disks/ObjectStorages/S3/S3ObjectStorage.h | 4 +- .../DataLakes/DeltaLakeMetadataParser.h | 2 +- src/Storages/DataLakes/HudiMetadataParser.h 
| 3 +- .../DataLakes/Iceberg/IcebergMetadata.cpp | 1 - .../DataLakes/Iceberg/IcebergMetadata.h | 2 +- .../ObjectStorage/AzureConfiguration.cpp | 11 + .../ObjectStorage/AzureConfiguration.h | 2 +- .../ObjectStorage/HDFSConfiguration.h | 2 +- .../ObjectStorage/ReadBufferIterator.cpp | 179 ++++++ .../ObjectStorage/ReadBufferIterator.h | 179 +----- .../ObjectStorage/ReadFromObjectStorage.h | 105 ---- .../ReadFromStorageObjectStorage.cpp | 94 +++ .../ReadFromStorageObjectStorage.h | 60 ++ src/Storages/ObjectStorage/S3Configuration.h | 2 +- ....h => StorageObejctStorageConfiguration.h} | 28 +- .../ObjectStorage/StorageObjectStorage.cpp | 91 +-- .../StorageObjectStorageCluster.cpp | 9 +- .../StorageObjectStorageCluster.h | 1 - .../StorageObjectStorageConfiguration.cpp | 40 ++ ....h => StorageObjectStorageQuerySettings.h} | 8 + .../ObjectStorage/StorageObjectStorageSink.h | 2 +- .../StorageObjectStorageSource.cpp | 539 +++++++++--------- .../StorageObjectStorageSource.h | 98 ++-- .../StorageObjectStorage_fwd_internal.h | 11 + .../registerStorageObjectStorage.cpp | 18 +- src/Storages/S3Queue/S3QueueSource.cpp | 17 +- src/Storages/S3Queue/S3QueueSource.h | 25 +- src/Storages/S3Queue/S3QueueTableMetadata.h | 2 +- src/Storages/S3Queue/StorageS3Queue.cpp | 32 +- src/Storages/S3Queue/StorageS3Queue.h | 1 - src/TableFunctions/ITableFunctionDataLake.h | 2 +- .../TableFunctionObjectStorage.cpp | 55 +- .../TableFunctionObjectStorageCluster.cpp | 14 +- 42 files changed, 973 insertions(+), 791 deletions(-) create mode 100644 src/Storages/ObjectStorage/ReadBufferIterator.cpp delete mode 100644 src/Storages/ObjectStorage/ReadFromObjectStorage.h create mode 100644 src/Storages/ObjectStorage/ReadFromStorageObjectStorage.cpp create mode 100644 src/Storages/ObjectStorage/ReadFromStorageObjectStorage.h rename src/Storages/ObjectStorage/{Configuration.h => StorageObejctStorageConfiguration.h} (73%) create mode 100644 src/Storages/ObjectStorage/StorageObjectStorageConfiguration.cpp rename src/Storages/ObjectStorage/{Settings.h => StorageObjectStorageQuerySettings.h} (86%) create mode 100644 src/Storages/ObjectStorage/StorageObjectStorage_fwd_internal.h diff --git a/src/Backups/BackupIO_AzureBlobStorage.cpp b/src/Backups/BackupIO_AzureBlobStorage.cpp index dc636f90be7..f12cc4c1d58 100644 --- a/src/Backups/BackupIO_AzureBlobStorage.cpp +++ b/src/Backups/BackupIO_AzureBlobStorage.cpp @@ -208,10 +208,15 @@ void BackupWriterAzureBlobStorage::copyFile(const String & destination, const St /* for_disk_azure_blob_storage= */ true); } -void BackupWriterAzureBlobStorage::copyDataToFile(const String & path_in_backup, const CreateReadBufferFunction & create_read_buffer, UInt64 start_pos, UInt64 length) +void BackupWriterAzureBlobStorage::copyDataToFile( + const String & path_in_backup, + const CreateReadBufferFunction & create_read_buffer, + UInt64 start_pos, + UInt64 length) { - copyDataToAzureBlobStorageFile(create_read_buffer, start_pos, length, client, configuration.container, path_in_backup, settings, - threadPoolCallbackRunner(getBackupsIOThreadPool().get(), "BackupWRAzure")); + copyDataToAzureBlobStorageFile( + create_read_buffer, start_pos, length, client, configuration.container, + path_in_backup, settings, threadPoolCallbackRunner(getBackupsIOThreadPool().get(), "BackupWRAzure")); } BackupWriterAzureBlobStorage::~BackupWriterAzureBlobStorage() = default; @@ -245,7 +250,7 @@ UInt64 BackupWriterAzureBlobStorage::getFileSize(const String & file_name) object_storage->listObjects(key,children,/*max_keys*/0); if 
(children.empty()) throw Exception(ErrorCodes::AZURE_BLOB_STORAGE_ERROR, "Object must exist"); - return children[0]->metadata.size_bytes; + return children[0]->metadata->size_bytes; } std::unique_ptr BackupWriterAzureBlobStorage::readFile(const String & file_name, size_t /*expected_file_size*/) diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp index 2ca44137442..bbbb5357505 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp @@ -128,15 +128,15 @@ bool AzureObjectStorage::exists(const StoredObject & object) const return false; } -ObjectStorageIteratorPtr AzureObjectStorage::iterate(const std::string & path_prefix) const +ObjectStorageIteratorPtr AzureObjectStorage::iterate(const std::string & path_prefix, size_t max_keys) const { auto settings_ptr = settings.get(); auto client_ptr = client.get(); - return std::make_shared(path_prefix, client_ptr, settings_ptr->list_object_keys_size); + return std::make_shared(path_prefix, client_ptr, max_keys); } -void AzureObjectStorage::listObjects(const std::string & path, RelativePathsWithMetadata & children, int max_keys) const +void AzureObjectStorage::listObjects(const std::string & path, RelativePathsWithMetadata & children, size_t max_keys) const { auto client_ptr = client.get(); @@ -168,7 +168,7 @@ void AzureObjectStorage::listObjects(const std::string & path, RelativePathsWith if (max_keys) { - int keys_left = max_keys - static_cast(children.size()); + size_t keys_left = max_keys - children.size(); if (keys_left <= 0) break; options.PageSizeHint = keys_left; diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h index f16c35fb52c..31eb78924f9 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h @@ -69,9 +69,9 @@ public: SettingsPtr && settings_, const String & container_); - void listObjects(const std::string & path, RelativePathsWithMetadata & children, int max_keys) const override; + void listObjects(const std::string & path, RelativePathsWithMetadata & children, size_t max_keys) const override; - ObjectStorageIteratorPtr iterate(const std::string & path_prefix) const override; + ObjectStorageIteratorPtr iterate(const std::string & path_prefix, size_t max_keys) const override; std::string getName() const override { return "AzureObjectStorage"; } diff --git a/src/Disks/ObjectStorages/Cached/CachedObjectStorage.cpp b/src/Disks/ObjectStorages/Cached/CachedObjectStorage.cpp index 1444f4c9c76..9f195b787a8 100644 --- a/src/Disks/ObjectStorages/Cached/CachedObjectStorage.cpp +++ b/src/Disks/ObjectStorages/Cached/CachedObjectStorage.cpp @@ -180,7 +180,7 @@ std::unique_ptr CachedObjectStorage::cloneObjectStorage( return object_storage->cloneObjectStorage(new_namespace, config, config_prefix, context); } -void CachedObjectStorage::listObjects(const std::string & path, RelativePathsWithMetadata & children, int max_keys) const +void CachedObjectStorage::listObjects(const std::string & path, RelativePathsWithMetadata & children, size_t max_keys) const { object_storage->listObjects(path, children, max_keys); } diff --git a/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h b/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h index 437baead7be..ec116b63d01 100644 --- 
a/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h +++ b/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h @@ -80,7 +80,7 @@ public: const std::string & config_prefix, ContextPtr context) override; - void listObjects(const std::string & path, RelativePathsWithMetadata & children, int max_keys) const override; + void listObjects(const std::string & path, RelativePathsWithMetadata & children, size_t max_keys) const override; ObjectMetadata getObjectMetadata(const std::string & path) const override; diff --git a/src/Disks/ObjectStorages/IObjectStorage.cpp b/src/Disks/ObjectStorages/IObjectStorage.cpp index 78fbdcaddfa..d36ef4f414a 100644 --- a/src/Disks/ObjectStorages/IObjectStorage.cpp +++ b/src/Disks/ObjectStorages/IObjectStorage.cpp @@ -24,16 +24,16 @@ bool IObjectStorage::existsOrHasAnyChild(const std::string & path) const return !files.empty(); } -void IObjectStorage::listObjects(const std::string &, RelativePathsWithMetadata &, int) const +void IObjectStorage::listObjects(const std::string &, RelativePathsWithMetadata &, size_t) const { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "listObjects() is not supported"); } -ObjectStorageIteratorPtr IObjectStorage::iterate(const std::string & path_prefix) const +ObjectStorageIteratorPtr IObjectStorage::iterate(const std::string & path_prefix, size_t max_keys) const { RelativePathsWithMetadata files; - listObjects(path_prefix, files, 0); + listObjects(path_prefix, files, max_keys); return std::make_shared(std::move(files)); } diff --git a/src/Disks/ObjectStorages/IObjectStorage.h b/src/Disks/ObjectStorages/IObjectStorage.h index 7d354e6383d..4955b0e6924 100644 --- a/src/Disks/ObjectStorages/IObjectStorage.h +++ b/src/Disks/ObjectStorages/IObjectStorage.h @@ -54,11 +54,11 @@ struct ObjectMetadata struct RelativePathWithMetadata { String relative_path; - ObjectMetadata metadata; + std::optional metadata; RelativePathWithMetadata() = default; - RelativePathWithMetadata(String relative_path_, ObjectMetadata metadata_) + explicit RelativePathWithMetadata(String relative_path_, std::optional metadata_ = std::nullopt) : relative_path(std::move(relative_path_)) , metadata(std::move(metadata_)) {} @@ -111,9 +111,9 @@ public: /// /, /a, /a/b, /a/b/c, /a/b/c/d while exists will return true only for /a/b/c/d virtual bool existsOrHasAnyChild(const std::string & path) const; - virtual void listObjects(const std::string & path, RelativePathsWithMetadata & children, int max_keys) const; + virtual void listObjects(const std::string & path, RelativePathsWithMetadata & children, size_t max_keys) const; - virtual ObjectStorageIteratorPtr iterate(const std::string & path_prefix) const; + virtual ObjectStorageIteratorPtr iterate(const std::string & path_prefix, size_t max_keys) const; /// Get object metadata if supported. 
It should be possible to receive /// at least size of object diff --git a/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.cpp b/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.cpp index b7729623a64..62bdd0ed0c8 100644 --- a/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.cpp +++ b/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.cpp @@ -14,27 +14,32 @@ namespace ErrorCodes void IObjectStorageIteratorAsync::nextBatch() { std::lock_guard lock(mutex); - if (!is_finished) + if (is_finished) { + LOG_TEST(&Poco::Logger::get("kssenii"), "KSSENII: here 3"); + current_batch.clear(); + current_batch_iterator = current_batch.begin(); + } + else + { + LOG_TEST(&Poco::Logger::get("kssenii"), "KSSENII: here 4"); if (!is_initialized) { outcome_future = scheduleBatch(); is_initialized = true; } - BatchAndHasNext next_batch = outcome_future.get(); - current_batch = std::move(next_batch.batch); - accumulated_size.fetch_add(current_batch.size(), std::memory_order_relaxed); - current_batch_iterator = current_batch.begin(); - if (next_batch.has_next) - outcome_future = scheduleBatch(); - else - is_finished = true; - } - else - { - current_batch.clear(); + chassert(outcome_future.valid()); + auto [batch, has_next] = outcome_future.get(); + current_batch = std::move(batch); current_batch_iterator = current_batch.begin(); + + accumulated_size.fetch_add(current_batch.size(), std::memory_order_relaxed); + + if (has_next) + outcome_future = scheduleBatch(); + else + is_finished = true; } } @@ -42,24 +47,10 @@ void IObjectStorageIteratorAsync::next() { std::lock_guard lock(mutex); - if (current_batch_iterator != current_batch.end()) - { + if (current_batch_iterator == current_batch.end()) + nextBatch(); + else ++current_batch_iterator; - } - else if (!is_finished) - { - if (outcome_future.valid()) - { - BatchAndHasNext next_batch = outcome_future.get(); - current_batch = std::move(next_batch.batch); - accumulated_size.fetch_add(current_batch.size(), std::memory_order_relaxed); - current_batch_iterator = current_batch.begin(); - if (next_batch.has_next) - outcome_future = scheduleBatch(); - else - is_finished = true; - } - } } std::future IObjectStorageIteratorAsync::scheduleBatch() @@ -107,14 +98,16 @@ std::optional IObjectStorageIteratorAsync::getCurrent if (!is_initialized) nextBatch(); - if (current_batch_iterator != current_batch.end()) + if (current_batch_iterator == current_batch.end()) { - auto temp_current_batch = current_batch; - nextBatch(); - return temp_current_batch; + LOG_TEST(&Poco::Logger::get("kssenii"), "KSSENII: here 2"); + return std::nullopt; } - return std::nullopt; + auto temp_current_batch = std::move(current_batch); + LOG_TEST(&Poco::Logger::get("kssenii"), "KSSENII: here 1: {}", temp_current_batch.size()); + nextBatch(); + return temp_current_batch; } size_t IObjectStorageIteratorAsync::getAccumulatedSize() const diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index cc138c43c71..a9bd520e6e9 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -138,9 +138,10 @@ private: return outcome.GetResult().GetIsTruncated(); } - throw S3Exception(outcome.GetError().GetErrorType(), "Could not list objects in bucket {} with prefix {}, S3 exception: {}, message: {}", - quoteString(request.GetBucket()), quoteString(request.GetPrefix()), - backQuote(outcome.GetError().GetExceptionName()), quoteString(outcome.GetError().GetMessage())); + throw 
S3Exception(outcome.GetError().GetErrorType(), + "Could not list objects in bucket {} with prefix {}, S3 exception: {}, message: {}", + quoteString(request.GetBucket()), quoteString(request.GetPrefix()), + backQuote(outcome.GetError().GetExceptionName()), quoteString(outcome.GetError().GetMessage())); } std::shared_ptr client; @@ -263,13 +264,13 @@ std::unique_ptr S3ObjectStorage::writeObject( /// NOLIN } -ObjectStorageIteratorPtr S3ObjectStorage::iterate(const std::string & path_prefix) const +ObjectStorageIteratorPtr S3ObjectStorage::iterate(const std::string & path_prefix, size_t max_keys) const { auto settings_ptr = s3_settings.get(); - return std::make_shared(uri.bucket, path_prefix, client.get(), settings_ptr->list_object_keys_size); + return std::make_shared(uri.bucket, path_prefix, client.get(), max_keys); } -void S3ObjectStorage::listObjects(const std::string & path, RelativePathsWithMetadata & children, int max_keys) const +void S3ObjectStorage::listObjects(const std::string & path, RelativePathsWithMetadata & children, size_t max_keys) const { auto settings_ptr = s3_settings.get(); @@ -277,7 +278,7 @@ void S3ObjectStorage::listObjects(const std::string & path, RelativePathsWithMet request.SetBucket(uri.bucket); request.SetPrefix(path); if (max_keys) - request.SetMaxKeys(max_keys); + request.SetMaxKeys(static_cast(max_keys)); else request.SetMaxKeys(settings_ptr->list_object_keys_size); @@ -305,10 +306,10 @@ void S3ObjectStorage::listObjects(const std::string & path, RelativePathsWithMet if (max_keys) { - int keys_left = max_keys - static_cast(children.size()); + size_t keys_left = max_keys - children.size(); if (keys_left <= 0) break; - request.SetMaxKeys(keys_left); + request.SetMaxKeys(static_cast(keys_left)); } request.SetContinuationToken(outcome.GetResult().GetNextContinuationToken()); diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.h b/src/Disks/ObjectStorages/S3/S3ObjectStorage.h index ab0fa5bed68..a6843a383e5 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.h +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.h @@ -100,9 +100,9 @@ public: size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, const WriteSettings & write_settings = {}) override; - void listObjects(const std::string & path, RelativePathsWithMetadata & children, int max_keys) const override; + void listObjects(const std::string & path, RelativePathsWithMetadata & children, size_t max_keys) const override; - ObjectStorageIteratorPtr iterate(const std::string & path_prefix) const override; + ObjectStorageIteratorPtr iterate(const std::string & path_prefix, size_t max_keys) const override; /// Uses `DeleteObjectRequest`. 
void removeObject(const StoredObject & object) override; diff --git a/src/Storages/DataLakes/DeltaLakeMetadataParser.h b/src/Storages/DataLakes/DeltaLakeMetadataParser.h index f94024597d6..251ea3e3f15 100644 --- a/src/Storages/DataLakes/DeltaLakeMetadataParser.h +++ b/src/Storages/DataLakes/DeltaLakeMetadataParser.h @@ -3,7 +3,7 @@ #include #include #include -#include +#include namespace DB { diff --git a/src/Storages/DataLakes/HudiMetadataParser.h b/src/Storages/DataLakes/HudiMetadataParser.h index 2fc004595ca..72766a95876 100644 --- a/src/Storages/DataLakes/HudiMetadataParser.h +++ b/src/Storages/DataLakes/HudiMetadataParser.h @@ -2,7 +2,8 @@ #include #include -#include +#include +#include namespace DB { diff --git a/src/Storages/DataLakes/Iceberg/IcebergMetadata.cpp b/src/Storages/DataLakes/Iceberg/IcebergMetadata.cpp index 08cebb3f396..5543e60e7a7 100644 --- a/src/Storages/DataLakes/Iceberg/IcebergMetadata.cpp +++ b/src/Storages/DataLakes/Iceberg/IcebergMetadata.cpp @@ -25,7 +25,6 @@ #include #include #include -#include #include #include diff --git a/src/Storages/DataLakes/Iceberg/IcebergMetadata.h b/src/Storages/DataLakes/Iceberg/IcebergMetadata.h index 92946e4192b..a289715848f 100644 --- a/src/Storages/DataLakes/Iceberg/IcebergMetadata.h +++ b/src/Storages/DataLakes/Iceberg/IcebergMetadata.h @@ -5,7 +5,7 @@ #include #include #include -#include +#include namespace DB { diff --git a/src/Storages/ObjectStorage/AzureConfiguration.cpp b/src/Storages/ObjectStorage/AzureConfiguration.cpp index ba3e796223a..04f6f26111b 100644 --- a/src/Storages/ObjectStorage/AzureConfiguration.cpp +++ b/src/Storages/ObjectStorage/AzureConfiguration.cpp @@ -89,6 +89,17 @@ StorageObjectStorageConfigurationPtr StorageAzureBlobConfiguration::clone() return configuration; } +StorageAzureBlobConfiguration::StorageAzureBlobConfiguration(const StorageAzureBlobConfiguration & other) +{ + connection_url = other.connection_url; + is_connection_string = other.is_connection_string; + account_name = other.account_name; + account_key = other.account_key; + container = other.container; + blob_path = other.blob_path; + blobs_paths = other.blobs_paths; +} + AzureObjectStorage::SettingsPtr StorageAzureBlobConfiguration::createSettings(ContextPtr context) { const auto & context_settings = context->getSettingsRef(); diff --git a/src/Storages/ObjectStorage/AzureConfiguration.h b/src/Storages/ObjectStorage/AzureConfiguration.h index 40d718d7690..4f285128241 100644 --- a/src/Storages/ObjectStorage/AzureConfiguration.h +++ b/src/Storages/ObjectStorage/AzureConfiguration.h @@ -1,6 +1,6 @@ #pragma once #include -#include +#include namespace DB { diff --git a/src/Storages/ObjectStorage/HDFSConfiguration.h b/src/Storages/ObjectStorage/HDFSConfiguration.h index f42cedf459d..aa45c634042 100644 --- a/src/Storages/ObjectStorage/HDFSConfiguration.h +++ b/src/Storages/ObjectStorage/HDFSConfiguration.h @@ -3,7 +3,7 @@ #if USE_HDFS -#include +#include #include #include #include diff --git a/src/Storages/ObjectStorage/ReadBufferIterator.cpp b/src/Storages/ObjectStorage/ReadBufferIterator.cpp new file mode 100644 index 00000000000..dcdf36dbcf5 --- /dev/null +++ b/src/Storages/ObjectStorage/ReadBufferIterator.cpp @@ -0,0 +1,179 @@ +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; + +} + +ReadBufferIterator::ReadBufferIterator( + ObjectStoragePtr object_storage_, + ConfigurationPtr configuration_, + const FileIterator & file_iterator_, + const std::optional & 
format_settings_, + const StorageObjectStorageSettings & query_settings_, + SchemaCache & schema_cache_, + ObjectInfos & read_keys_, + const ContextPtr & context_) + : WithContext(context_) + , object_storage(object_storage_) + , configuration(configuration_) + , file_iterator(file_iterator_) + , format_settings(format_settings_) + , query_settings(query_settings_) + , schema_cache(schema_cache_) + , read_keys(read_keys_) + , prev_read_keys_size(read_keys_.size()) +{ +} + +SchemaCache::Key ReadBufferIterator::getKeyForSchemaCache(const String & path) const +{ + auto source = fs::path(configuration->getDataSourceDescription()) / path; + return DB::getKeyForSchemaCache(source, configuration->format, format_settings, getContext()); +} + +SchemaCache::Keys ReadBufferIterator::getPathsForSchemaCache() const +{ + Strings sources; + sources.reserve(read_keys.size()); + std::transform( + read_keys.begin(), read_keys.end(), + std::back_inserter(sources), + [&](const auto & elem) + { + return fs::path(configuration->getDataSourceDescription()) / elem->relative_path; + }); + return DB::getKeysForSchemaCache(sources, configuration->format, format_settings, getContext()); +} + +std::optional ReadBufferIterator::tryGetColumnsFromCache( + const ObjectInfos::iterator & begin, + const ObjectInfos::iterator & end) +{ + if (!query_settings.schema_inference_use_cache) + return std::nullopt; + + for (auto it = begin; it < end; ++it) + { + const auto & object_info = (*it); + auto get_last_mod_time = [&] -> std::optional + { + if (object_info->metadata) + return object_info->metadata->last_modified->epochMicroseconds(); + else + { + object_info->metadata = object_storage->getObjectMetadata(object_info->relative_path); + return object_info->metadata->last_modified->epochMicroseconds(); + } + }; + + auto cache_key = getKeyForSchemaCache(object_info->relative_path); + auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time); + if (columns) + return columns; + } + + return std::nullopt; +} + +void ReadBufferIterator::setNumRowsToLastFile(size_t num_rows) +{ + if (query_settings.schema_inference_use_cache) + schema_cache.addNumRows(getKeyForSchemaCache(current_object_info->relative_path), num_rows); +} + +void ReadBufferIterator::setSchemaToLastFile(const ColumnsDescription & columns) +{ + if (query_settings.schema_inference_use_cache + && query_settings.schema_inference_mode == SchemaInferenceMode::UNION) + { + schema_cache.addColumns(getKeyForSchemaCache(current_object_info->relative_path), columns); + } +} + +void ReadBufferIterator::setResultingSchema(const ColumnsDescription & columns) +{ + if (query_settings.schema_inference_use_cache + && query_settings.schema_inference_mode == SchemaInferenceMode::DEFAULT) + { + schema_cache.addManyColumns(getPathsForSchemaCache(), columns); + } +} + +String ReadBufferIterator::getLastFileName() const +{ + if (current_object_info) + return current_object_info->relative_path; + else + return ""; +} + +std::pair, std::optional> ReadBufferIterator::next() +{ + /// For default mode check cached columns for currently read keys on first iteration. 
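+    /// Control flow note: in DEFAULT schema inference mode the cache is consulted once for all
+    /// currently known keys (and again below whenever the file iterator discovers new keys);
+    /// in UNION mode the cache is checked per file. Only when no cached columns are found is the
+    /// next object actually opened and its contents handed to the format reader.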
+ if (first && query_settings.schema_inference_mode == SchemaInferenceMode::DEFAULT) + { + if (auto cached_columns = tryGetColumnsFromCache(read_keys.begin(), read_keys.end())) + return {nullptr, cached_columns}; + } + + current_object_info = file_iterator->next(0); + if (!current_object_info || current_object_info->relative_path.empty()) + { + if (first) + { + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "Cannot extract table structure from {} format file, " + "because there are no files with provided path. " + "You must specify table structure manually", + configuration->format); + } + return {nullptr, std::nullopt}; + } + + first = false; + + /// File iterator could get new keys after new iteration, + /// check them in schema cache if schema inference mode is default. + if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT + && read_keys.size() > prev_read_keys_size) + { + auto columns_from_cache = tryGetColumnsFromCache(read_keys.begin() + prev_read_keys_size, read_keys.end()); + prev_read_keys_size = read_keys.size(); + if (columns_from_cache) + return {nullptr, columns_from_cache}; + } + else if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::UNION) + { + ObjectInfos paths = {current_object_info}; + if (auto columns_from_cache = tryGetColumnsFromCache(paths.begin(), paths.end())) + return {nullptr, columns_from_cache}; + } + + first = false; + + chassert(current_object_info->metadata); + std::unique_ptr read_buffer = object_storage->readObject( + StoredObject(current_object_info->relative_path), + getContext()->getReadSettings(), + {}, + current_object_info->metadata->size_bytes); + + read_buffer = wrapReadBufferWithCompressionMethod( + std::move(read_buffer), + chooseCompressionMethod(current_object_info->relative_path, configuration->compression_method), + static_cast(getContext()->getSettingsRef().zstd_window_log_max)); + + return {std::move(read_buffer), std::nullopt}; +} + +} diff --git a/src/Storages/ObjectStorage/ReadBufferIterator.h b/src/Storages/ObjectStorage/ReadBufferIterator.h index 248700e2edf..4e9b8cfcfca 100644 --- a/src/Storages/ObjectStorage/ReadBufferIterator.h +++ b/src/Storages/ObjectStorage/ReadBufferIterator.h @@ -1,197 +1,54 @@ #pragma once #include -#include +#include #include -#include -#include #include namespace DB { -namespace ErrorCodes -{ - extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; -} - -template class ReadBufferIterator : public IReadBufferIterator, WithContext { public: - using Storage = StorageObjectStorage; - using Source = StorageObjectStorageSource; - using FileIterator = std::shared_ptr; - using ObjectInfos = typename Storage::ObjectInfos; + using FileIterator = std::shared_ptr; ReadBufferIterator( ObjectStoragePtr object_storage_, - Storage::ConfigurationPtr configuration_, + ConfigurationPtr configuration_, const FileIterator & file_iterator_, const std::optional & format_settings_, + const StorageObjectStorageSettings & query_settings_, + SchemaCache & schema_cache_, ObjectInfos & read_keys_, - const ContextPtr & context_) - : WithContext(context_) - , object_storage(object_storage_) - , configuration(configuration_) - , file_iterator(file_iterator_) - , format_settings(format_settings_) - , storage_settings(StorageSettings::create(context_->getSettingsRef())) - , read_keys(read_keys_) - , prev_read_keys_size(read_keys_.size()) - { - } + const ContextPtr & context_); - std::pair, std::optional> next() override - { - /// For default mode check cached 
columns for currently read keys on first iteration. - if (first && storage_settings.schema_inference_mode == SchemaInferenceMode::DEFAULT) - { - if (auto cached_columns = tryGetColumnsFromCache(read_keys.begin(), read_keys.end())) - return {nullptr, cached_columns}; - } + std::pair, std::optional> next() override; - current_object_info = file_iterator->next(0); - if (current_object_info->relative_path.empty()) - { - if (first) - { - throw Exception( - ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "Cannot extract table structure from {} format file, " - "because there are no files with provided path. " - "You must specify table structure manually", - configuration->format); - } - return {nullptr, std::nullopt}; - } + void setNumRowsToLastFile(size_t num_rows) override; - first = false; + void setSchemaToLastFile(const ColumnsDescription & columns) override; - /// File iterator could get new keys after new iteration, - /// check them in schema cache if schema inference mode is default. - if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT - && read_keys.size() > prev_read_keys_size) - { - auto columns_from_cache = tryGetColumnsFromCache(read_keys.begin() + prev_read_keys_size, read_keys.end()); - prev_read_keys_size = read_keys.size(); - if (columns_from_cache) - return {nullptr, columns_from_cache}; - } - else if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::UNION) - { - ObjectInfos paths = {current_object_info}; - if (auto columns_from_cache = tryGetColumnsFromCache(paths.begin(), paths.end())) - return {nullptr, columns_from_cache}; - } + void setResultingSchema(const ColumnsDescription & columns) override; - first = false; - - std::unique_ptr read_buffer = object_storage->readObject( - StoredObject(current_object_info->relative_path), - getContext()->getReadSettings(), - {}, - current_object_info->metadata.size_bytes); - - read_buffer = wrapReadBufferWithCompressionMethod( - std::move(read_buffer), - chooseCompressionMethod(current_object_info->relative_path, configuration->compression_method), - static_cast(getContext()->getSettingsRef().zstd_window_log_max)); - - return {std::move(read_buffer), std::nullopt}; - } - - void setNumRowsToLastFile(size_t num_rows) override - { - if (storage_settings.schema_inference_use_cache) - { - Storage::getSchemaCache(getContext()).addNumRows( - getKeyForSchemaCache(current_object_info->relative_path), num_rows); - } - } - - void setSchemaToLastFile(const ColumnsDescription & columns) override - { - if (storage_settings.schema_inference_use_cache - && storage_settings.schema_inference_mode == SchemaInferenceMode::UNION) - { - Storage::getSchemaCache(getContext()).addColumns( - getKeyForSchemaCache(current_object_info->relative_path), columns); - } - } - - void setResultingSchema(const ColumnsDescription & columns) override - { - if (storage_settings.schema_inference_use_cache - && storage_settings.schema_inference_mode == SchemaInferenceMode::DEFAULT) - { - Storage::getSchemaCache(getContext()).addManyColumns(getPathsForSchemaCache(), columns); - } - } - - String getLastFileName() const override { return current_object_info->relative_path; } + String getLastFileName() const override; private: - SchemaCache::Key getKeyForSchemaCache(const String & path) const - { - auto source = fs::path(configuration->getDataSourceDescription()) / path; - return DB::getKeyForSchemaCache(source, configuration->format, format_settings, getContext()); - } - - SchemaCache::Keys getPathsForSchemaCache() 
const - { - Strings sources; - sources.reserve(read_keys.size()); - std::transform( - read_keys.begin(), read_keys.end(), - std::back_inserter(sources), - [&](const auto & elem) - { - return fs::path(configuration->getDataSourceDescription()) / elem->relative_path; - }); - return DB::getKeysForSchemaCache(sources, configuration->format, format_settings, getContext()); - } - + SchemaCache::Key getKeyForSchemaCache(const String & path) const; + SchemaCache::Keys getPathsForSchemaCache() const; std::optional tryGetColumnsFromCache( - const ObjectInfos::iterator & begin, - const ObjectInfos::iterator & end) - { - if (!storage_settings.schema_inference_use_cache) - return std::nullopt; - - auto & schema_cache = Storage::getSchemaCache(getContext()); - for (auto it = begin; it < end; ++it) - { - const auto & object_info = (*it); - auto get_last_mod_time = [&] -> std::optional - { - if (object_info->metadata.last_modified) - return object_info->metadata.last_modified->epochMicroseconds(); - else - { - object_info->metadata = object_storage->getObjectMetadata(object_info->relative_path); - return object_info->metadata.last_modified->epochMicroseconds(); - } - }; - - auto cache_key = getKeyForSchemaCache(object_info->relative_path); - auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time); - if (columns) - return columns; - } - - return std::nullopt; - } + const ObjectInfos::iterator & begin, const ObjectInfos::iterator & end); ObjectStoragePtr object_storage; - const Storage::ConfigurationPtr configuration; + const ConfigurationPtr configuration; const FileIterator file_iterator; const std::optional & format_settings; - const StorageObjectStorageSettings storage_settings; + const StorageObjectStorageSettings query_settings; + SchemaCache & schema_cache; ObjectInfos & read_keys; size_t prev_read_keys_size; - Storage::ObjectInfoPtr current_object_info; + ObjectInfoPtr current_object_info; bool first = true; }; } diff --git a/src/Storages/ObjectStorage/ReadFromObjectStorage.h b/src/Storages/ObjectStorage/ReadFromObjectStorage.h deleted file mode 100644 index 9cb77dcc25e..00000000000 --- a/src/Storages/ObjectStorage/ReadFromObjectStorage.h +++ /dev/null @@ -1,105 +0,0 @@ -#pragma once -#include -#include -#include -#include - -namespace DB -{ - -template -class ReadFromStorageObejctStorage : public SourceStepWithFilter -{ -public: - using Storage = StorageObjectStorage; - using Source = StorageObjectStorageSource; - - ReadFromStorageObejctStorage( - ObjectStoragePtr object_storage_, - Storage::ConfigurationPtr configuration_, - const String & name_, - const NamesAndTypesList & virtual_columns_, - const std::optional & format_settings_, - bool distributed_processing_, - ReadFromFormatInfo info_, - const bool need_only_count_, - ContextPtr context_, - size_t max_block_size_, - size_t num_streams_) - : SourceStepWithFilter(DataStream{.header = info_.source_header}) - , object_storage(object_storage_) - , configuration(configuration_) - , context(std::move(context_)) - , info(std::move(info_)) - , virtual_columns(virtual_columns_) - , format_settings(format_settings_) - , name(name_ + "Source") - , need_only_count(need_only_count_) - , max_block_size(max_block_size_) - , num_streams(num_streams_) - , distributed_processing(distributed_processing_) - { - } - - std::string getName() const override { return name; } - - void applyFilters() override - { - auto filter_actions_dag = ActionsDAG::buildFilterActionsDAG(filter_nodes.nodes); - const ActionsDAG::Node * predicate = nullptr; - if 
(filter_actions_dag) - predicate = filter_actions_dag->getOutputs().at(0); - - createIterator(predicate); - } - - void initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &) override - { - createIterator(nullptr); - - Pipes pipes; - for (size_t i = 0; i < num_streams; ++i) - { - pipes.emplace_back(std::make_shared( - getName(), object_storage, configuration, info, format_settings, - context, max_block_size, iterator_wrapper, need_only_count)); - } - - auto pipe = Pipe::unitePipes(std::move(pipes)); - if (pipe.empty()) - pipe = Pipe(std::make_shared(info.source_header)); - - for (const auto & processor : pipe.getProcessors()) - processors.emplace_back(processor); - - pipeline.init(std::move(pipe)); - } - -private: - ObjectStoragePtr object_storage; - Storage::ConfigurationPtr configuration; - ContextPtr context; - - const ReadFromFormatInfo info; - const NamesAndTypesList virtual_columns; - const std::optional format_settings; - const String name; - const bool need_only_count; - const size_t max_block_size; - const size_t num_streams; - const bool distributed_processing; - - std::shared_ptr iterator_wrapper; - - void createIterator(const ActionsDAG::Node * predicate) - { - if (!iterator_wrapper) - { - iterator_wrapper = Source::createFileIterator( - configuration, object_storage, distributed_processing, context, - predicate, virtual_columns, nullptr, context->getFileProgressCallback()); - } - } -}; - -} diff --git a/src/Storages/ObjectStorage/ReadFromStorageObjectStorage.cpp b/src/Storages/ObjectStorage/ReadFromStorageObjectStorage.cpp new file mode 100644 index 00000000000..2c27c816078 --- /dev/null +++ b/src/Storages/ObjectStorage/ReadFromStorageObjectStorage.cpp @@ -0,0 +1,94 @@ +#include +#include +#include + +namespace DB +{ + +ReadFromStorageObejctStorage::ReadFromStorageObejctStorage( + ObjectStoragePtr object_storage_, + ConfigurationPtr configuration_, + const String & name_, + const NamesAndTypesList & virtual_columns_, + const std::optional & format_settings_, + const StorageObjectStorageSettings & query_settings_, + bool distributed_processing_, + ReadFromFormatInfo info_, + SchemaCache & schema_cache_, + const bool need_only_count_, + ContextPtr context_, + size_t max_block_size_, + size_t num_streams_, + CurrentMetrics::Metric metric_threads_count_, + CurrentMetrics::Metric metric_threads_active_, + CurrentMetrics::Metric metric_threads_scheduled_) + : SourceStepWithFilter(DataStream{.header = info_.source_header}) + , WithContext(context_) + , object_storage(object_storage_) + , configuration(configuration_) + , info(std::move(info_)) + , virtual_columns(virtual_columns_) + , format_settings(format_settings_) + , query_settings(query_settings_) + , schema_cache(schema_cache_) + , name(name_ + "Source") + , need_only_count(need_only_count_) + , max_block_size(max_block_size_) + , num_streams(num_streams_) + , distributed_processing(distributed_processing_) + , metric_threads_count(metric_threads_count_) + , metric_threads_active(metric_threads_active_) + , metric_threads_scheduled(metric_threads_scheduled_) +{ +} + +void ReadFromStorageObejctStorage::createIterator(const ActionsDAG::Node * predicate) +{ + if (!iterator_wrapper) + { + auto context = getContext(); + iterator_wrapper = StorageObjectStorageSource::createFileIterator( + configuration, object_storage, distributed_processing, context, predicate, + virtual_columns, nullptr, query_settings.list_object_keys_size, context->getFileProgressCallback()); + } +} + +void 
ReadFromStorageObejctStorage::applyFilters() +{ + auto filter_actions_dag = ActionsDAG::buildFilterActionsDAG(filter_nodes.nodes); + const ActionsDAG::Node * predicate = nullptr; + if (filter_actions_dag) + predicate = filter_actions_dag->getOutputs().at(0); + + createIterator(predicate); +} + +void ReadFromStorageObejctStorage::initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &) +{ + createIterator(nullptr); + auto context = getContext(); + + Pipes pipes; + for (size_t i = 0; i < num_streams; ++i) + { + auto threadpool = std::make_shared( + metric_threads_count, metric_threads_active, metric_threads_scheduled, /* max_threads */1); + + auto source = std::make_shared( + getName(), object_storage, configuration, info, format_settings, query_settings, + context, max_block_size, iterator_wrapper, need_only_count, schema_cache, std::move(threadpool)); + + pipes.emplace_back(std::move(source)); + } + + auto pipe = Pipe::unitePipes(std::move(pipes)); + if (pipe.empty()) + pipe = Pipe(std::make_shared(info.source_header)); + + for (const auto & processor : pipe.getProcessors()) + processors.emplace_back(processor); + + pipeline.init(std::move(pipe)); +} + +} diff --git a/src/Storages/ObjectStorage/ReadFromStorageObjectStorage.h b/src/Storages/ObjectStorage/ReadFromStorageObjectStorage.h new file mode 100644 index 00000000000..f5e057d297f --- /dev/null +++ b/src/Storages/ObjectStorage/ReadFromStorageObjectStorage.h @@ -0,0 +1,60 @@ +#pragma once +#include +#include +#include + +namespace DB +{ + +class ReadFromStorageObejctStorage : public SourceStepWithFilter, WithContext +{ +public: + using ConfigurationPtr = StorageObjectStorageConfigurationPtr; + + ReadFromStorageObejctStorage( + ObjectStoragePtr object_storage_, + ConfigurationPtr configuration_, + const String & name_, + const NamesAndTypesList & virtual_columns_, + const std::optional & format_settings_, + const StorageObjectStorageSettings & query_settings_, + bool distributed_processing_, + ReadFromFormatInfo info_, + SchemaCache & schema_cache_, + bool need_only_count_, + ContextPtr context_, + size_t max_block_size_, + size_t num_streams_, + CurrentMetrics::Metric metric_threads_count_, + CurrentMetrics::Metric metric_threads_active_, + CurrentMetrics::Metric metric_threads_scheduled_); + + std::string getName() const override { return name; } + + void applyFilters() override; + + void initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &) override; + +private: + ObjectStoragePtr object_storage; + ConfigurationPtr configuration; + std::shared_ptr iterator_wrapper; + + const ReadFromFormatInfo info; + const NamesAndTypesList virtual_columns; + const std::optional format_settings; + const StorageObjectStorageSettings query_settings; + SchemaCache & schema_cache; + const String name; + const bool need_only_count; + const size_t max_block_size; + const size_t num_streams; + const bool distributed_processing; + const CurrentMetrics::Metric metric_threads_count; + const CurrentMetrics::Metric metric_threads_active; + const CurrentMetrics::Metric metric_threads_scheduled; + + void createIterator(const ActionsDAG::Node * predicate); +}; + +} diff --git a/src/Storages/ObjectStorage/S3Configuration.h b/src/Storages/ObjectStorage/S3Configuration.h index 34f5735e02a..c953bc25c4e 100644 --- a/src/Storages/ObjectStorage/S3Configuration.h +++ b/src/Storages/ObjectStorage/S3Configuration.h @@ -1,7 +1,7 @@ #pragma once #include #include -#include +#include namespace DB { diff --git 
a/src/Storages/ObjectStorage/Configuration.h b/src/Storages/ObjectStorage/StorageObejctStorageConfiguration.h similarity index 73% rename from src/Storages/ObjectStorage/Configuration.h rename to src/Storages/ObjectStorage/StorageObejctStorageConfiguration.h index 708041980e3..427d6a8d453 100644 --- a/src/Storages/ObjectStorage/Configuration.h +++ b/src/Storages/ObjectStorage/StorageObejctStorageConfiguration.h @@ -17,6 +17,12 @@ public: using Path = std::string; using Paths = std::vector; + static void initialize( + StorageObjectStorageConfiguration & configuration, + ASTs & engine_args, + ContextPtr local_context, + bool with_table_structure); + virtual Path getPath() const = 0; virtual void setPath(const Path & path) = 0; @@ -26,28 +32,24 @@ public: virtual String getDataSourceDescription() = 0; virtual String getNamespace() const = 0; - bool isPathWithGlobs() const { return getPath().find_first_of("*?{") != std::string::npos; } - bool isNamespaceWithGlobs() const { return getNamespace().find_first_of("*?{") != std::string::npos; } - - std::string getPathWithoutGlob() const { return getPath().substr(0, getPath().find_first_of("*?{")); } - - virtual bool withWildcard() const - { - static const String PARTITION_ID_WILDCARD = "{_partition_id}"; - return getPath().find(PARTITION_ID_WILDCARD) != String::npos; - } + bool withWildcard() const; + bool withGlobs() const { return isPathWithGlobs() || isNamespaceWithGlobs(); } + bool isPathWithGlobs() const; + bool isNamespaceWithGlobs() const; + std::string getPathWithoutGlob() const; virtual void check(ContextPtr context) const = 0; virtual StorageObjectStorageConfigurationPtr clone() = 0; virtual ObjectStoragePtr createOrUpdateObjectStorage(ContextPtr context, bool is_readonly = true) = 0; /// NOLINT - virtual void fromNamedCollection(const NamedCollection & collection) = 0; - virtual void fromAST(ASTs & args, ContextPtr context, bool with_structure) = 0; - String format = "auto"; String compression_method = "auto"; String structure = "auto"; + +protected: + virtual void fromNamedCollection(const NamedCollection & collection) = 0; + virtual void fromAST(ASTs & args, ContextPtr context, bool with_structure) = 0; }; using StorageObjectStorageConfigurationPtr = std::shared_ptr; diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index 9250ab8ecbe..9a7260ea47c 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -9,12 +9,12 @@ #include #include #include -#include -#include +#include +#include #include #include #include -#include +#include namespace DB @@ -154,34 +154,38 @@ void StorageObjectStorage::read( size_t max_block_size, size_t num_streams) { - if (partition_by && configuration->withWildcard()) + auto [query_configuration, query_object_storage] = updateConfigurationAndGetCopy(local_context); + if (partition_by && query_configuration->withWildcard()) { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Reading from a partitioned {} storage is not implemented yet", getName()); } - auto this_ptr = std::static_pointer_cast(shared_from_this()); - auto read_from_format_info = prepareReadingFromFormat( + const auto read_from_format_info = prepareReadingFromFormat( column_names, storage_snapshot, supportsSubsetOfColumns(local_context), getVirtuals()); - bool need_only_count = (query_info.optimize_trivial_count || read_from_format_info.requested_columns.empty()) + const bool need_only_count = 
(query_info.optimize_trivial_count || read_from_format_info.requested_columns.empty()) && local_context->getSettingsRef().optimize_count_from_files; - auto [query_configuration, query_object_storage] = updateConfigurationAndGetCopy(local_context); - auto reading = std::make_unique>( + auto read_step = std::make_unique( query_object_storage, query_configuration, getName(), virtual_columns, format_settings, + StorageSettings::create(local_context->getSettingsRef()), distributed_processing, std::move(read_from_format_info), + getSchemaCache(local_context), need_only_count, local_context, max_block_size, - num_streams); + num_streams, + StorageSettings::ObjectStorageThreads(), + StorageSettings::ObjectStorageThreadsActive(), + StorageSettings::ObjectStorageThreadsScheduled()); - query_plan.addStep(std::move(reading)); + query_plan.addStep(std::move(read_step)); } template @@ -191,35 +195,43 @@ SinkToStoragePtr StorageObjectStorage::write( ContextPtr local_context, bool /* async_insert */) { - auto insert_query = std::dynamic_pointer_cast(query); - auto partition_by_ast = insert_query - ? (insert_query->partition_by ? insert_query->partition_by : partition_by) - : nullptr; - bool is_partitioned_implementation = partition_by_ast && configuration->withWildcard(); + auto [query_configuration, query_object_storage] = updateConfigurationAndGetCopy(local_context); + const auto sample_block = metadata_snapshot->getSampleBlock(); - auto sample_block = metadata_snapshot->getSampleBlock(); - auto storage_settings = StorageSettings::create(local_context->getSettingsRef()); - - if (is_partitioned_implementation) + if (query_configuration->withWildcard()) { - return std::make_shared( - object_storage, configuration, format_settings, sample_block, local_context, partition_by_ast); + ASTPtr partition_by_ast = nullptr; + if (auto insert_query = std::dynamic_pointer_cast(query)) + { + if (insert_query->partition_by) + partition_by_ast = insert_query->partition_by; + else + partition_by_ast = partition_by; + } + + if (partition_by_ast) + { + return std::make_shared( + object_storage, query_configuration, format_settings, sample_block, local_context, partition_by_ast); + } } - if (configuration->isPathWithGlobs() || configuration->isNamespaceWithGlobs()) + if (query_configuration->withGlobs()) { throw Exception(ErrorCodes::DATABASE_ACCESS_DENIED, "{} key '{}' contains globs, so the table is in readonly mode", - getName(), configuration->getPath()); + getName(), query_configuration->getPath()); } + const auto storage_settings = StorageSettings::create(local_context->getSettingsRef()); if (!storage_settings.truncate_on_insert - && object_storage->exists(StoredObject(configuration->getPath()))) + && object_storage->exists(StoredObject(query_configuration->getPath()))) { if (storage_settings.create_new_file_on_insert) { - size_t index = configuration->getPaths().size(); - const auto & first_key = configuration->getPaths()[0]; + auto & paths = query_configuration->getPaths(); + size_t index = paths.size(); + const auto & first_key = paths[0]; auto pos = first_key.find_first_of('.'); String new_key; @@ -233,7 +245,7 @@ SinkToStoragePtr StorageObjectStorage::write( } while (object_storage->exists(StoredObject(new_key))); - configuration->getPaths().push_back(new_key); + paths.push_back(new_key); } else { @@ -242,12 +254,12 @@ SinkToStoragePtr StorageObjectStorage::write( "Object in bucket {} with key {} already exists. 
" "If you want to overwrite it, enable setting [engine_name]_truncate_on_insert, if you " "want to create a new file on each insert, enable setting [engine_name]_create_new_file_on_insert", - configuration->getNamespace(), configuration->getPaths().back()); + query_configuration->getNamespace(), query_configuration->getPaths().back()); } } return std::make_shared( - object_storage, configuration, format_settings, sample_block, local_context); + object_storage, query_configuration, format_settings, sample_block, local_context); } template @@ -257,7 +269,7 @@ void StorageObjectStorage::truncate( ContextPtr, TableExclusiveLockHolder &) { - if (configuration->isPathWithGlobs() || configuration->isNamespaceWithGlobs()) + if (configuration->withGlobs()) { throw Exception( ErrorCodes::DATABASE_ACCESS_DENIED, @@ -279,21 +291,18 @@ ColumnsDescription StorageObjectStorage::getTableStructureFromD const std::optional & format_settings, ContextPtr context) { - using Source = StorageObjectStorageSource; - ObjectInfos read_keys; - auto file_iterator = Source::createFileIterator( + const auto settings = StorageSettings::create(context->getSettingsRef()); + auto file_iterator = StorageObjectStorageSource::createFileIterator( configuration, object_storage, /* distributed_processing */false, - context, /* predicate */{}, /* virtual_columns */{}, &read_keys); + context, /* predicate */{}, /* virtual_columns */{}, &read_keys, settings.list_object_keys_size); - ReadBufferIterator read_buffer_iterator( + ReadBufferIterator read_buffer_iterator( object_storage, configuration, file_iterator, - format_settings, read_keys, context); + format_settings, StorageSettings::create(context->getSettingsRef()), getSchemaCache(context), read_keys, context); - const bool retry = configuration->isPathWithGlobs() || configuration->isNamespaceWithGlobs(); - return readSchemaFromFormat( - configuration->format, format_settings, - read_buffer_iterator, retry, context); + const bool retry = configuration->withGlobs(); + return readSchemaFromFormat(configuration->format, format_settings, read_buffer_iterator, retry, context); } template class StorageObjectStorage; diff --git a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp index 414932016f4..39cd5d8eca6 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp @@ -11,8 +11,8 @@ #include #include #include -#include #include +#include #include #include @@ -82,10 +82,11 @@ void StorageObjectStorageCluster::ad template RemoteQueryExecutor::Extension -StorageObjectStorageCluster::getTaskIteratorExtension(const ActionsDAG::Node * predicate, const ContextPtr &) const +StorageObjectStorageCluster::getTaskIteratorExtension(const ActionsDAG::Node * predicate, const ContextPtr & local_context) const { - auto iterator = std::make_shared( - object_storage, configuration, predicate, virtual_columns, nullptr); + const auto settings = StorageSettings::create(local_context->getSettingsRef()); + auto iterator = std::make_shared( + object_storage, configuration, predicate, virtual_columns, local_context, nullptr, settings.list_object_keys_size); auto callback = std::make_shared>([iterator]() mutable -> String{ return iterator->next(0)->relative_path; }); return RemoteQueryExecutor::Extension{ .task_iterator = std::move(callback) }; diff --git a/src/Storages/ObjectStorage/StorageObjectStorageCluster.h 
b/src/Storages/ObjectStorage/StorageObjectStorageCluster.h index b1f9af14e03..aae8f704a73 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageCluster.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageCluster.h @@ -21,7 +21,6 @@ class StorageObjectStorageCluster : public IStorageCluster { public: using Storage = StorageObjectStorage; - using Source = StorageObjectStorageSource; StorageObjectStorageCluster( const String & cluster_name_, diff --git a/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.cpp b/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.cpp new file mode 100644 index 00000000000..2d5760ed9d8 --- /dev/null +++ b/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.cpp @@ -0,0 +1,40 @@ +#include + + +namespace DB +{ + +void StorageObjectStorageConfiguration::initialize( + StorageObjectStorageConfiguration & configuration, + ASTs & engine_args, + ContextPtr local_context, + bool with_table_structure) +{ + if (auto named_collection = tryGetNamedCollectionWithOverrides(engine_args, local_context)) + configuration.fromNamedCollection(*named_collection); + else + configuration.fromAST(engine_args, local_context, with_table_structure); +} + +bool StorageObjectStorageConfiguration::withWildcard() const +{ + static const String PARTITION_ID_WILDCARD = "{_partition_id}"; + return getPath().find(PARTITION_ID_WILDCARD) != String::npos; +} + +bool StorageObjectStorageConfiguration::isPathWithGlobs() const +{ + return getPath().find_first_of("*?{") != std::string::npos; +} + +bool StorageObjectStorageConfiguration::isNamespaceWithGlobs() const +{ + return getNamespace().find_first_of("*?{") != std::string::npos; +} + +std::string StorageObjectStorageConfiguration::getPathWithoutGlob() const +{ + return getPath().substr(0, getPath().find_first_of("*?{")); +} + +} diff --git a/src/Storages/ObjectStorage/Settings.h b/src/Storages/ObjectStorage/StorageObjectStorageQuerySettings.h similarity index 86% rename from src/Storages/ObjectStorage/Settings.h rename to src/Storages/ObjectStorage/StorageObjectStorageQuerySettings.h index 015cf9bc01d..454da7c355f 100644 --- a/src/Storages/ObjectStorage/Settings.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageQuerySettings.h @@ -23,6 +23,8 @@ struct StorageObjectStorageSettings bool create_new_file_on_insert; bool schema_inference_use_cache; SchemaInferenceMode schema_inference_mode; + bool skip_empty_files; + size_t list_object_keys_size; }; struct S3StorageSettings @@ -34,6 +36,8 @@ struct S3StorageSettings .create_new_file_on_insert = settings.s3_create_new_file_on_insert, .schema_inference_use_cache = settings.schema_inference_use_cache_for_s3, .schema_inference_mode = settings.schema_inference_mode, + .skip_empty_files = settings.s3_skip_empty_files, + .list_object_keys_size = settings.s3_list_object_keys_size, }; } @@ -53,6 +57,8 @@ struct AzureStorageSettings .create_new_file_on_insert = settings.azure_create_new_file_on_insert, .schema_inference_use_cache = settings.schema_inference_use_cache_for_azure, .schema_inference_mode = settings.schema_inference_mode, + .skip_empty_files = settings.s3_skip_empty_files, /// TODO: add setting for azure + .list_object_keys_size = settings.azure_list_object_keys_size, }; } @@ -72,6 +78,8 @@ struct HDFSStorageSettings .create_new_file_on_insert = settings.hdfs_create_new_file_on_insert, .schema_inference_use_cache = settings.schema_inference_use_cache_for_hdfs, .schema_inference_mode = settings.schema_inference_mode, + .skip_empty_files = 
settings.s3_skip_empty_files, /// TODO: add setting for hdfs + .list_object_keys_size = settings.s3_list_object_keys_size, /// TODO: add a setting for hdfs }; } diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSink.h b/src/Storages/ObjectStorage/StorageObjectStorageSink.h index 34ab8ebec66..a2d42d7fa9f 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSink.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageSink.h @@ -1,6 +1,6 @@ #pragma once #include -#include +#include #include #include #include diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index 9fc7925a6d1..f170a46112f 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -9,8 +9,8 @@ #include #include #include -#include -#include +#include +#include #include #include @@ -28,20 +28,55 @@ namespace ErrorCodes extern const int CANNOT_COMPILE_REGEXP; } -template -std::shared_ptr::IIterator> -StorageObjectStorageSource::createFileIterator( - Storage::ConfigurationPtr configuration, +StorageObjectStorageSource::StorageObjectStorageSource( + String name_, + ObjectStoragePtr object_storage_, + ConfigurationPtr configuration_, + const ReadFromFormatInfo & info, + std::optional format_settings_, + const StorageObjectStorageSettings & query_settings_, + ContextPtr context_, + UInt64 max_block_size_, + std::shared_ptr file_iterator_, + bool need_only_count_, + SchemaCache & schema_cache_, + std::shared_ptr reader_pool_) + : SourceWithKeyCondition(info.source_header, false) + , WithContext(context_) + , name(std::move(name_)) + , object_storage(object_storage_) + , configuration(configuration_) + , format_settings(format_settings_) + , query_settings(query_settings_) + , max_block_size(max_block_size_) + , need_only_count(need_only_count_) + , read_from_format_info(info) + , create_reader_pool(reader_pool_) + , columns_desc(info.columns_description) + , file_iterator(file_iterator_) + , schema_cache(schema_cache_) + , create_reader_scheduler(threadPoolCallbackRunner(*create_reader_pool, "Reader")) +{ +} + +StorageObjectStorageSource::~StorageObjectStorageSource() +{ + create_reader_pool->wait(); +} + +std::shared_ptr StorageObjectStorageSource::createFileIterator( + ConfigurationPtr configuration, ObjectStoragePtr object_storage, bool distributed_processing, const ContextPtr & local_context, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, ObjectInfos * read_keys, + size_t list_object_keys_size, std::function file_progress_callback) { if (distributed_processing) - return std::make_shared(local_context->getReadTaskCallback()); + return std::make_shared(local_context->getReadTaskCallback()); if (configuration->isNamespaceWithGlobs()) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Expression can not have wildcards inside namespace name"); @@ -49,25 +84,240 @@ StorageObjectStorageSource::createFileIterator( if (configuration->isPathWithGlobs()) { /// Iterate through disclosed globs and make a source for each file - return std::make_shared( - object_storage, configuration, predicate, virtual_columns, read_keys, file_progress_callback); + return std::make_shared( + object_storage, configuration, predicate, virtual_columns, local_context, read_keys, list_object_keys_size, file_progress_callback); } else { - return std::make_shared( + return std::make_shared( object_storage, configuration, virtual_columns, read_keys, 
file_progress_callback); } } -template -StorageObjectStorageSource::GlobIterator::GlobIterator( +void StorageObjectStorageSource::lazyInitialize(size_t processor) +{ + if (initialized) + return; + + reader = createReader(processor); + if (reader) + reader_future = createReaderAsync(processor); + initialized = true; +} + +Chunk StorageObjectStorageSource::generate() +{ + lazyInitialize(0); + + while (true) + { + if (isCancelled() || !reader) + { + if (reader) + reader->cancel(); + break; + } + + Chunk chunk; + if (reader->pull(chunk)) + { + UInt64 num_rows = chunk.getNumRows(); + total_rows_in_file += num_rows; + + size_t chunk_size = 0; + if (const auto * input_format = reader.getInputFormat()) + chunk_size = input_format->getApproxBytesReadForChunk(); + + progress(num_rows, chunk_size ? chunk_size : chunk.bytes()); + + const auto & object_info = reader.getObjectInfo(); + chassert(object_info.metadata); + VirtualColumnUtils::addRequestedPathFileAndSizeVirtualsToChunk( + chunk, + read_from_format_info.requested_virtual_columns, + fs::path(configuration->getNamespace()) / reader.getRelativePath(), + object_info.metadata->size_bytes); + + return chunk; + } + + if (reader.getInputFormat() && getContext()->getSettingsRef().use_cache_for_count_from_files) + addNumRowsToCache(reader.getRelativePath(), total_rows_in_file); + + total_rows_in_file = 0; + + assert(reader_future.valid()); + reader = reader_future.get(); + + if (!reader) + break; + + /// Even if task is finished the thread may be not freed in pool. + /// So wait until it will be freed before scheduling a new task. + create_reader_pool->wait(); + reader_future = createReaderAsync(); + } + + return {}; +} + +void StorageObjectStorageSource::addNumRowsToCache(const String & path, size_t num_rows) +{ + const auto cache_key = getKeyForSchemaCache( + fs::path(configuration->getDataSourceDescription()) / path, + configuration->format, + format_settings, + getContext()); + + schema_cache.addNumRows(cache_key, num_rows); +} + +std::optional StorageObjectStorageSource::tryGetNumRowsFromCache(const ObjectInfoPtr & object_info) +{ + const auto cache_key = getKeyForSchemaCache( + fs::path(configuration->getDataSourceDescription()) / object_info->relative_path, + configuration->format, + format_settings, + getContext()); + + auto get_last_mod_time = [&]() -> std::optional + { + return object_info->metadata && object_info->metadata->last_modified + ? object_info->metadata->last_modified->epochMicroseconds() + : 0; + }; + return schema_cache.tryGetNumRows(cache_key, get_last_mod_time); +} + +StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReader(size_t processor) +{ + ObjectInfoPtr object_info; + do + { + object_info = file_iterator->next(processor); + if (!object_info || object_info->relative_path.empty()) + return {}; + + if (!object_info->metadata) + object_info->metadata = object_storage->getObjectMetadata(object_info->relative_path); + } + while (query_settings.skip_empty_files && object_info->metadata->size_bytes == 0); + + QueryPipelineBuilder builder; + std::shared_ptr source; + std::unique_ptr read_buf; + + std::optional num_rows_from_cache = need_only_count + && getContext()->getSettingsRef().use_cache_for_count_from_files + ? 
tryGetNumRowsFromCache(object_info) + : std::nullopt; + + if (num_rows_from_cache) + { + /// We should not return single chunk with all number of rows, + /// because there is a chance that this chunk will be materialized later + /// (it can cause memory problems even with default values in columns or when virtual columns are requested). + /// Instead, we use special ConstChunkGenerator that will generate chunks + /// with max_block_size rows until total number of rows is reached. + builder.init(Pipe(std::make_shared( + read_from_format_info.format_header, *num_rows_from_cache, max_block_size))); + } + else + { + const auto compression_method = chooseCompressionMethod(object_info->relative_path, configuration->compression_method); + const auto max_parsing_threads = need_only_count ? std::optional(1) : std::nullopt; + read_buf = createReadBuffer(object_info->relative_path, object_info->metadata->size_bytes); + + auto input_format = FormatFactory::instance().getInput( + configuration->format, *read_buf, read_from_format_info.format_header, + getContext(), max_block_size, format_settings, max_parsing_threads, + std::nullopt, /* is_remote_fs */ true, compression_method); + + if (key_condition) + input_format->setKeyCondition(key_condition); + + if (need_only_count) + input_format->needOnlyCount(); + + builder.init(Pipe(input_format)); + + if (columns_desc.hasDefaults()) + { + builder.addSimpleTransform( + [&](const Block & header) + { + return std::make_shared(header, columns_desc, *input_format, getContext()); + }); + } + + source = input_format; + } + + /// Add ExtractColumnsTransform to extract requested columns/subcolumns + /// from chunk read by IInputFormat. + builder.addSimpleTransform([&](const Block & header) + { + return std::make_shared(header, read_from_format_info.requested_columns); + }); + + auto pipeline = std::make_unique(QueryPipelineBuilder::getPipeline(std::move(builder))); + auto current_reader = std::make_unique(*pipeline); + + ProfileEvents::increment(ProfileEvents::EngineFileLikeReadFiles); + + return ReaderHolder( + object_info, std::move(read_buf), std::move(source), std::move(pipeline), std::move(current_reader)); +} + +std::future StorageObjectStorageSource::createReaderAsync(size_t processor) +{ + return create_reader_scheduler([=, this] { return createReader(processor); }, Priority{}); +} + +std::unique_ptr StorageObjectStorageSource::createReadBuffer(const String & key, size_t object_size) +{ + auto read_settings = getContext()->getReadSettings().adjustBufferSize(object_size); + read_settings.enable_filesystem_cache = false; + read_settings.remote_read_min_bytes_for_seek = read_settings.remote_fs_buffer_size; + + const bool object_too_small = object_size <= 2 * getContext()->getSettings().max_download_buffer_size; + const bool use_prefetch = object_too_small && read_settings.remote_fs_method == RemoteFSReadMethod::threadpool; + read_settings.remote_fs_method = use_prefetch ? RemoteFSReadMethod::threadpool : RemoteFSReadMethod::read; + + // Create a read buffer that will prefetch the first ~1 MB of the file. + // When reading lots of tiny files, this prefetching almost doubles the throughput. + // For bigger files, parallel reading is more useful. 
+ if (use_prefetch) + { + LOG_TRACE(log, "Downloading object of size {} with initial prefetch", object_size); + + auto async_reader = object_storage->readObjects( + StoredObjects{StoredObject{key, /* local_path */ "", object_size}}, read_settings); + + async_reader->setReadUntilEnd(); + if (read_settings.remote_fs_prefetch) + async_reader->prefetch(DEFAULT_PREFETCH_PRIORITY); + + return async_reader; + } + else + { + /// FIXME: this is inconsistent that readObject always reads synchronously ignoring read_method setting. + return object_storage->readObject(StoredObject(key), read_settings); + } +} + +StorageObjectStorageSource::GlobIterator::GlobIterator( ObjectStoragePtr object_storage_, - Storage::ConfigurationPtr configuration_, + ConfigurationPtr configuration_, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns_, + ContextPtr context_, ObjectInfos * read_keys_, + size_t list_object_keys_size, std::function file_progress_callback_) - : object_storage(object_storage_) + : WithContext(context_) + , object_storage(object_storage_) , configuration(configuration_) , virtual_columns(virtual_columns_) , read_keys(read_keys_) @@ -81,7 +331,7 @@ StorageObjectStorageSource::GlobIterator::GlobIterator( { const auto key_with_globs = configuration_->getPath(); const auto key_prefix = configuration->getPathWithoutGlob(); - object_storage_iterator = object_storage->iterate(key_prefix); + object_storage_iterator = object_storage->iterate(key_prefix, list_object_keys_size); matcher = std::make_unique(makeRegexpPatternFromGlobs(key_with_globs)); if (matcher->ok()) @@ -113,13 +363,11 @@ StorageObjectStorageSource::GlobIterator::GlobIterator( } } -template -StorageObjectStorageSource::ObjectInfoPtr -StorageObjectStorageSource::GlobIterator::next(size_t /* processor */) +ObjectInfoPtr StorageObjectStorageSource::GlobIterator::next(size_t /* processor */) { std::lock_guard lock(next_mutex); - if (is_finished && index >= object_infos.size()) + if (is_finished) return {}; bool need_new_batch = object_infos.empty() || index >= object_infos.size(); @@ -130,9 +378,10 @@ StorageObjectStorageSource::GlobIterator::next(size_t /* proces while (new_batch.empty()) { auto result = object_storage_iterator->getCurrentBatchAndScheduleNext(); + LOG_TEST(&Poco::Logger::get("kssenii"), "KSSENII: {}", result.has_value()); if (result.has_value()) { - new_batch = result.value(); + new_batch = std::move(result.value()); } else { @@ -169,7 +418,8 @@ StorageObjectStorageSource::GlobIterator::next(size_t /* proces { for (const auto & object_info : object_infos) { - file_progress_callback(FileProgress(0, object_info->metadata.size_bytes)); + chassert(object_info->metadata); + file_progress_callback(FileProgress(0, object_info->metadata->size_bytes)); } } } @@ -181,10 +431,9 @@ StorageObjectStorageSource::GlobIterator::next(size_t /* proces return object_infos[current_index]; } -template -StorageObjectStorageSource::KeysIterator::KeysIterator( +StorageObjectStorageSource::KeysIterator::KeysIterator( ObjectStoragePtr object_storage_, - Storage::ConfigurationPtr configuration_, + ConfigurationPtr configuration_, const NamesAndTypesList & virtual_columns_, ObjectInfos * read_keys_, std::function file_progress_callback_) @@ -199,15 +448,13 @@ StorageObjectStorageSource::KeysIterator::KeysIterator( /// TODO: should we add metadata if we anyway fetch it if file_progress_callback is passed? 
for (auto && key : keys) { - auto object_info = std::make_shared(key, ObjectMetadata{}); + auto object_info = std::make_shared(key); read_keys_->emplace_back(object_info); } } } -template -StorageObjectStorageSource::ObjectInfoPtr -StorageObjectStorageSource::KeysIterator::next(size_t /* processor */) +ObjectInfoPtr StorageObjectStorageSource::KeysIterator::next(size_t /* processor */) { size_t current_index = index.fetch_add(1, std::memory_order_relaxed); if (current_index >= keys.size()) @@ -225,240 +472,4 @@ StorageObjectStorageSource::KeysIterator::next(size_t /* proces return std::make_shared(key, metadata); } -template -Chunk StorageObjectStorageSource::generate() -{ - while (true) - { - if (isCancelled() || !reader) - { - if (reader) - reader->cancel(); - break; - } - - Chunk chunk; - if (reader->pull(chunk)) - { - UInt64 num_rows = chunk.getNumRows(); - total_rows_in_file += num_rows; - size_t chunk_size = 0; - if (const auto * input_format = reader.getInputFormat()) - chunk_size = input_format->getApproxBytesReadForChunk(); - progress(num_rows, chunk_size ? chunk_size : chunk.bytes()); - - VirtualColumnUtils::addRequestedPathFileAndSizeVirtualsToChunk( - chunk, - read_from_format_info.requested_virtual_columns, - fs::path(configuration->getNamespace()) / reader.getRelativePath(), - reader.getObjectInfo().metadata.size_bytes); - - return chunk; - } - - if (reader.getInputFormat() && getContext()->getSettingsRef().use_cache_for_count_from_files) - addNumRowsToCache(reader.getRelativePath(), total_rows_in_file); - - total_rows_in_file = 0; - - assert(reader_future.valid()); - reader = reader_future.get(); - - if (!reader) - break; - - /// Even if task is finished the thread may be not freed in pool. - /// So wait until it will be freed before scheduling a new task. 
- create_reader_pool.wait(); - reader_future = createReaderAsync(); - } - - return {}; -} - -template -void StorageObjectStorageSource::addNumRowsToCache(const String & path, size_t num_rows) -{ - String source = fs::path(configuration->getDataSourceDescription()) / path; - auto cache_key = getKeyForSchemaCache(source, configuration->format, format_settings, getContext()); - Storage::getSchemaCache(getContext()).addNumRows(cache_key, num_rows); -} - -template -std::optional StorageObjectStorageSource::tryGetNumRowsFromCache(const ObjectInfoPtr & object_info) -{ - String source = fs::path(configuration->getDataSourceDescription()) / object_info->relative_path; - auto cache_key = getKeyForSchemaCache(source, configuration->format, format_settings, getContext()); - auto get_last_mod_time = [&]() -> std::optional - { - auto last_mod = object_info->metadata.last_modified; - if (last_mod) - return last_mod->epochTime(); - else - { - object_info->metadata = object_storage->getObjectMetadata(object_info->relative_path); - return object_info->metadata.last_modified->epochMicroseconds(); - } - }; - return Storage::getSchemaCache(getContext()).tryGetNumRows(cache_key, get_last_mod_time); -} - -template -StorageObjectStorageSource::StorageObjectStorageSource( - String name_, - ObjectStoragePtr object_storage_, - Storage::ConfigurationPtr configuration_, - const ReadFromFormatInfo & info, - std::optional format_settings_, - ContextPtr context_, - UInt64 max_block_size_, - std::shared_ptr file_iterator_, - bool need_only_count_) - :ISource(info.source_header, false) - , WithContext(context_) - , name(std::move(name_)) - , object_storage(object_storage_) - , configuration(configuration_) - , format_settings(format_settings_) - , max_block_size(max_block_size_) - , need_only_count(need_only_count_) - , read_from_format_info(info) - , columns_desc(info.columns_description) - , file_iterator(file_iterator_) - , create_reader_pool(StorageSettings::ObjectStorageThreads(), - StorageSettings::ObjectStorageThreadsActive(), - StorageSettings::ObjectStorageThreadsScheduled(), 1) - , create_reader_scheduler(threadPoolCallbackRunner(create_reader_pool, "Reader")) -{ - reader = createReader(); - if (reader) - reader_future = createReaderAsync(); -} - -template -StorageObjectStorageSource::~StorageObjectStorageSource() -{ - create_reader_pool.wait(); -} - -template -StorageObjectStorageSource::ReaderHolder -StorageObjectStorageSource::createReader(size_t processor) -{ - auto object_info = file_iterator->next(processor); - if (object_info->relative_path.empty()) - return {}; - - if (object_info->metadata.size_bytes == 0) - object_info->metadata = object_storage->getObjectMetadata(object_info->relative_path); - - QueryPipelineBuilder builder; - std::shared_ptr source; - std::unique_ptr read_buf; - std::optional num_rows_from_cache = need_only_count - && getContext()->getSettingsRef().use_cache_for_count_from_files - ? tryGetNumRowsFromCache(object_info) - : std::nullopt; - - if (num_rows_from_cache) - { - /// We should not return single chunk with all number of rows, - /// because there is a chance that this chunk will be materialized later - /// (it can cause memory problems even with default values in columns or when virtual columns are requested). - /// Instead, we use special ConstChunkGenerator that will generate chunks - /// with max_block_size rows until total number of rows is reached. 
- source = std::make_shared( - read_from_format_info.format_header, *num_rows_from_cache, max_block_size); - builder.init(Pipe(source)); - } - else - { - std::optional max_parsing_threads; - if (need_only_count) - max_parsing_threads = 1; - - auto compression_method = chooseCompressionMethod( - object_info->relative_path, configuration->compression_method); - - read_buf = createReadBuffer(object_info->relative_path, object_info->metadata.size_bytes); - - auto input_format = FormatFactory::instance().getInput( - configuration->format, *read_buf, read_from_format_info.format_header, - getContext(), max_block_size, format_settings, max_parsing_threads, - std::nullopt, /* is_remote_fs */ true, compression_method); - - if (need_only_count) - input_format->needOnlyCount(); - - builder.init(Pipe(input_format)); - - if (columns_desc.hasDefaults()) - { - builder.addSimpleTransform( - [&](const Block & header) - { - return std::make_shared(header, columns_desc, *input_format, getContext()); - }); - } - - source = input_format; - } - - /// Add ExtractColumnsTransform to extract requested columns/subcolumns - /// from chunk read by IInputFormat. - builder.addSimpleTransform([&](const Block & header) - { - return std::make_shared(header, read_from_format_info.requested_columns); - }); - - auto pipeline = std::make_unique(QueryPipelineBuilder::getPipeline(std::move(builder))); - auto current_reader = std::make_unique(*pipeline); - - ProfileEvents::increment(ProfileEvents::EngineFileLikeReadFiles); - - return ReaderHolder{object_info, std::move(read_buf), - std::move(source), std::move(pipeline), std::move(current_reader)}; -} - -template -std::future::ReaderHolder> -StorageObjectStorageSource::createReaderAsync(size_t processor) -{ - return create_reader_scheduler([=, this] { return createReader(processor); }, Priority{}); -} - -template -std::unique_ptr StorageObjectStorageSource::createReadBuffer(const String & key, size_t object_size) -{ - auto read_settings = getContext()->getReadSettings().adjustBufferSize(object_size); - read_settings.enable_filesystem_cache = false; - read_settings.remote_read_min_bytes_for_seek = read_settings.remote_fs_buffer_size; - - // auto download_buffer_size = getContext()->getSettings().max_download_buffer_size; - // const bool object_too_small = object_size <= 2 * download_buffer_size; - - // Create a read buffer that will prefetch the first ~1 MB of the file. - // When reading lots of tiny files, this prefetching almost doubles the throughput. - // For bigger files, parallel reading is more useful. 
- // if (object_too_small && read_settings.remote_fs_method == RemoteFSReadMethod::threadpool) - // { - // LOG_TRACE(log, "Downloading object of size {} with initial prefetch", object_size); - - // auto async_reader = object_storage->readObjects( - // StoredObjects{StoredObject{key, /* local_path */ "", object_size}}, read_settings); - - // async_reader->setReadUntilEnd(); - // if (read_settings.remote_fs_prefetch) - // async_reader->prefetch(DEFAULT_PREFETCH_PRIORITY); - - // return async_reader; - // } - // else - return object_storage->readObject(StoredObject(key), read_settings); -} - -template class StorageObjectStorageSource; -template class StorageObjectStorageSource; -template class StorageObjectStorageSource; - } diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.h b/src/Storages/ObjectStorage/StorageObjectStorageSource.h index f68a5d47456..0d6a6b71271 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.h @@ -1,31 +1,19 @@ #pragma once -#include +#include +#include #include #include +#include +#include namespace DB { -template -class StorageObjectStorageSource : public ISource, WithContext +class StorageObjectStorageSource : public SourceWithKeyCondition, WithContext { friend class StorageS3QueueSource; public: - using Source = StorageObjectStorageSource; - using Storage = StorageObjectStorage; - using ObjectInfo = Storage::ObjectInfo; - using ObjectInfoPtr = Storage::ObjectInfoPtr; - using ObjectInfos = Storage::ObjectInfos; - - class IIterator : public WithContext - { - public: - virtual ~IIterator() = default; - - virtual size_t estimatedKeysCount() = 0; - virtual ObjectInfoPtr next(size_t processor) = 0; - }; - + class IIterator; class ReadTaskIterator; class GlobIterator; class KeysIterator; @@ -33,13 +21,16 @@ public: StorageObjectStorageSource( String name_, ObjectStoragePtr object_storage_, - Storage::ConfigurationPtr configuration, + ConfigurationPtr configuration, const ReadFromFormatInfo & info, std::optional format_settings_, + const StorageObjectStorageSettings & query_settings_, ContextPtr context_, UInt64 max_block_size_, std::shared_ptr file_iterator_, - bool need_only_count_); + bool need_only_count_, + SchemaCache & schema_cache_, + std::shared_ptr reader_pool_); ~StorageObjectStorageSource() override; @@ -48,32 +39,35 @@ public: Chunk generate() override; static std::shared_ptr createFileIterator( - Storage::ConfigurationPtr configuration, + ConfigurationPtr configuration, ObjectStoragePtr object_storage, bool distributed_processing, const ContextPtr & local_context, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, ObjectInfos * read_keys, + size_t list_object_keys_size, std::function file_progress_callback = {}); protected: - void addNumRowsToCache(const String & path, size_t num_rows); - std::optional tryGetNumRowsFromCache(const ObjectInfoPtr & object_info); - const String name; ObjectStoragePtr object_storage; - const Storage::ConfigurationPtr configuration; + const ConfigurationPtr configuration; const std::optional format_settings; + const StorageObjectStorageSettings query_settings; const UInt64 max_block_size; const bool need_only_count; const ReadFromFormatInfo read_from_format_info; - + const std::shared_ptr create_reader_pool; ColumnsDescription columns_desc; std::shared_ptr file_iterator; - size_t total_rows_in_file = 0; + SchemaCache & schema_cache; + bool initialized = false; - struct ReaderHolder + size_t total_rows_in_file = 
0; + LoggerPtr log = getLogger("StorageObjectStorageSource"); + + struct ReaderHolder : private boost::noncopyable { public: ReaderHolder( @@ -86,15 +80,15 @@ protected: , read_buf(std::move(read_buf_)) , source(std::move(source_)) , pipeline(std::move(pipeline_)) - , reader(std::move(reader_)) - { - } + , reader(std::move(reader_)) {} ReaderHolder() = default; - ReaderHolder(const ReaderHolder & other) = delete; - ReaderHolder & operator=(const ReaderHolder & other) = delete; ReaderHolder(ReaderHolder && other) noexcept { *this = std::move(other); } + explicit operator bool() const { return reader != nullptr; } + PullingPipelineExecutor * operator->() { return reader.get(); } + const PullingPipelineExecutor * operator->() const { return reader.get(); } + ReaderHolder & operator=(ReaderHolder && other) noexcept { /// The order of destruction is important. @@ -107,9 +101,6 @@ protected: return *this; } - explicit operator bool() const { return reader != nullptr; } - PullingPipelineExecutor * operator->() { return reader.get(); } - const PullingPipelineExecutor * operator->() const { return reader.get(); } const String & getRelativePath() const { return object_info->relative_path; } const ObjectInfo & getObjectInfo() const { return *object_info; } const IInputFormat * getInputFormat() const { return dynamic_cast(source.get()); } @@ -123,20 +114,29 @@ protected: }; ReaderHolder reader; - LoggerPtr log = getLogger("StorageObjectStorageSource"); - ThreadPool create_reader_pool; ThreadPoolCallbackRunner create_reader_scheduler; std::future reader_future; /// Recreate ReadBuffer and Pipeline for each file. ReaderHolder createReader(size_t processor = 0); std::future createReaderAsync(size_t processor = 0); - std::unique_ptr createReadBuffer(const String & key, size_t object_size); + + void addNumRowsToCache(const String & path, size_t num_rows); + std::optional tryGetNumRowsFromCache(const ObjectInfoPtr & object_info); + void lazyInitialize(size_t processor); }; -template -class StorageObjectStorageSource::ReadTaskIterator : public IIterator +class StorageObjectStorageSource::IIterator +{ +public: + virtual ~IIterator() = default; + + virtual size_t estimatedKeysCount() = 0; + virtual ObjectInfoPtr next(size_t processor) = 0; +}; + +class StorageObjectStorageSource::ReadTaskIterator : public IIterator { public: explicit ReadTaskIterator(const ReadTaskCallback & callback_) : callback(callback_) {} @@ -149,16 +149,17 @@ private: ReadTaskCallback callback; }; -template -class StorageObjectStorageSource::GlobIterator : public IIterator +class StorageObjectStorageSource::GlobIterator : public IIterator, WithContext { public: GlobIterator( ObjectStoragePtr object_storage_, - Storage::ConfigurationPtr configuration_, + ConfigurationPtr configuration_, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns_, + ContextPtr context_, ObjectInfos * read_keys_, + size_t list_object_keys_size, std::function file_progress_callback_ = {}); ~GlobIterator() override = default; @@ -169,7 +170,7 @@ public: private: ObjectStoragePtr object_storage; - Storage::ConfigurationPtr configuration; + ConfigurationPtr configuration; ActionsDAGPtr filter_dag; NamesAndTypesList virtual_columns; @@ -189,13 +190,12 @@ private: std::function file_progress_callback; }; -template -class StorageObjectStorageSource::KeysIterator : public IIterator +class StorageObjectStorageSource::KeysIterator : public IIterator { public: KeysIterator( ObjectStoragePtr object_storage_, - Storage::ConfigurationPtr 
configuration_, + ConfigurationPtr configuration_, const NamesAndTypesList & virtual_columns_, ObjectInfos * read_keys_, std::function file_progress_callback = {}); @@ -208,7 +208,7 @@ public: private: const ObjectStoragePtr object_storage; - const Storage::ConfigurationPtr configuration; + const ConfigurationPtr configuration; const NamesAndTypesList virtual_columns; const std::function file_progress_callback; const std::vector keys; diff --git a/src/Storages/ObjectStorage/StorageObjectStorage_fwd_internal.h b/src/Storages/ObjectStorage/StorageObjectStorage_fwd_internal.h new file mode 100644 index 00000000000..51be7419e1c --- /dev/null +++ b/src/Storages/ObjectStorage/StorageObjectStorage_fwd_internal.h @@ -0,0 +1,11 @@ +#include + +namespace DB +{ + +using ConfigurationPtr = StorageObjectStorageConfigurationPtr; +using ObjectInfo = RelativePathWithMetadata; +using ObjectInfoPtr = std::shared_ptr; +using ObjectInfos = std::vector; + +} diff --git a/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp b/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp index bc9f93690f5..f7ab37490e1 100644 --- a/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp @@ -8,18 +8,6 @@ namespace DB { -static void initializeConfiguration( - StorageObjectStorageConfiguration & configuration, - ASTs & engine_args, - ContextPtr local_context, - bool with_table_structure) -{ - if (auto named_collection = tryGetNamedCollectionWithOverrides(engine_args, local_context)) - configuration.fromNamedCollection(*named_collection); - else - configuration.fromAST(engine_args, local_context, with_table_structure); -} - template static std::shared_ptr> createStorageObjectStorage( const StorageFactory::Arguments & args, @@ -82,7 +70,7 @@ void registerStorageAzure(StorageFactory & factory) { auto context = args.getLocalContext(); auto configuration = std::make_shared(); - initializeConfiguration(*configuration, args.engine_args, context, false); + StorageObjectStorageConfiguration::initialize(*configuration, args.engine_args, context, false); return createStorageObjectStorage(args, configuration, "Azure", context); }, { @@ -101,7 +89,7 @@ void registerStorageS3Impl(const String & name, StorageFactory & factory) { auto context = args.getLocalContext(); auto configuration = std::make_shared(); - initializeConfiguration(*configuration, args.engine_args, context, false); + StorageObjectStorageConfiguration::initialize(*configuration, args.engine_args, context, false); return createStorageObjectStorage(args, configuration, name, context); }, { @@ -136,7 +124,7 @@ void registerStorageHDFS(StorageFactory & factory) { auto context = args.getLocalContext(); auto configuration = std::make_shared(); - initializeConfiguration(*configuration, args.engine_args, context, false); + StorageObjectStorageConfiguration::initialize(*configuration, args.engine_args, context, false); return createStorageObjectStorage(args, configuration, "HDFS", context); }, { diff --git a/src/Storages/S3Queue/S3QueueSource.cpp b/src/Storages/S3Queue/S3QueueSource.cpp index bd34d1ec093..b64aa23d47c 100644 --- a/src/Storages/S3Queue/S3QueueSource.cpp +++ b/src/Storages/S3Queue/S3QueueSource.cpp @@ -35,7 +35,7 @@ StorageS3QueueSource::S3QueueObjectInfo::S3QueueObjectInfo( const std::string & key_, const ObjectMetadata & object_metadata_, Metadata::ProcessingNodeHolderPtr processing_holder_) - : Source::ObjectInfo(key_, object_metadata_) + : ObjectInfo(key_, object_metadata_) , 
processing_holder(processing_holder_) { } @@ -55,15 +55,15 @@ StorageS3QueueSource::FileIterator::FileIterator( if (sharded_processing) { for (const auto & id : metadata->getProcessingIdsForShard(current_shard)) - sharded_keys.emplace(id, std::deque{}); + sharded_keys.emplace(id, std::deque{}); } } -StorageS3QueueSource::Source::ObjectInfoPtr StorageS3QueueSource::FileIterator::next(size_t processor) +StorageS3QueueSource::ObjectInfoPtr StorageS3QueueSource::FileIterator::next(size_t processor) { while (!shutdown_called) { - Source::ObjectInfoPtr val{nullptr}; + ObjectInfoPtr val{nullptr}; { std::unique_lock lk(sharded_keys_mutex, std::defer_lock); @@ -140,7 +140,7 @@ StorageS3QueueSource::Source::ObjectInfoPtr StorageS3QueueSource::FileIterator:: if (processing_holder) { - return std::make_shared(val->relative_path, val->metadata, processing_holder); + return std::make_shared(val->relative_path, val->metadata.value(), processing_holder); } else if (sharded_processing && metadata->getFileStatus(val->relative_path)->state == S3QueueFilesMetadata::FileStatus::State::Processing) @@ -161,7 +161,7 @@ size_t StorageS3QueueSource::FileIterator::estimatedKeysCount() StorageS3QueueSource::StorageS3QueueSource( String name_, const Block & header_, - std::unique_ptr internal_source_, + std::unique_ptr internal_source_, std::shared_ptr files_metadata_, size_t processing_id_, const S3QueueAction & action_, @@ -273,7 +273,8 @@ Chunk StorageS3QueueSource::generate() file_status->processed_rows += chunk.getNumRows(); processed_rows_from_file += chunk.getNumRows(); - VirtualColumnUtils::addRequestedPathFileAndSizeVirtualsToChunk(chunk, requested_virtual_columns, reader.getRelativePath(), reader.getObjectInfo().metadata.size_bytes); + VirtualColumnUtils::addRequestedPathFileAndSizeVirtualsToChunk( + chunk, requested_virtual_columns, reader.getRelativePath(), reader.getObjectInfo().metadata->size_bytes); return chunk; } } @@ -311,7 +312,7 @@ Chunk StorageS3QueueSource::generate() /// Even if task is finished the thread may be not freed in pool. /// So wait until it will be freed before scheduling a new task. 
- internal_source->create_reader_pool.wait(); + internal_source->create_reader_pool->wait(); reader_future = internal_source->createReaderAsync(processing_id); } diff --git a/src/Storages/S3Queue/S3QueueSource.h b/src/Storages/S3Queue/S3QueueSource.h index fcf5c5c0160..2bdac7f2311 100644 --- a/src/Storages/S3Queue/S3QueueSource.h +++ b/src/Storages/S3Queue/S3QueueSource.h @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include @@ -22,16 +22,19 @@ class StorageS3QueueSource : public ISource, WithContext { public: using Storage = StorageObjectStorage; - using Source = StorageObjectStorageSource; using ConfigurationPtr = Storage::ConfigurationPtr; - using GlobIterator = Source::GlobIterator; + using GlobIterator = StorageObjectStorageSource::GlobIterator; using ZooKeeperGetter = std::function; using RemoveFileFunc = std::function; using FileStatusPtr = S3QueueFilesMetadata::FileStatusPtr; + using ReaderHolder = StorageObjectStorageSource::ReaderHolder; using Metadata = S3QueueFilesMetadata; + using ObjectInfo = RelativePathWithMetadata; + using ObjectInfoPtr = std::shared_ptr; + using ObjectInfos = std::vector; - struct S3QueueObjectInfo : public Source::ObjectInfo + struct S3QueueObjectInfo : public ObjectInfo { S3QueueObjectInfo( const std::string & key_, @@ -41,7 +44,7 @@ public: Metadata::ProcessingNodeHolderPtr processing_holder; }; - class FileIterator : public Source::IIterator + class FileIterator : public StorageObjectStorageSource::IIterator { public: FileIterator( @@ -53,7 +56,7 @@ public: /// Note: /// List results in s3 are always returned in UTF-8 binary order. /// (https://docs.aws.amazon.com/AmazonS3/latest/userguide/ListingKeysUsingAPIs.html) - Source::ObjectInfoPtr next(size_t processor) override; + ObjectInfoPtr next(size_t processor) override; size_t estimatedKeysCount() override; @@ -66,14 +69,14 @@ public: const bool sharded_processing; const size_t current_shard; - std::unordered_map> sharded_keys; + std::unordered_map> sharded_keys; std::mutex sharded_keys_mutex; }; StorageS3QueueSource( String name_, const Block & header_, - std::unique_ptr internal_source_, + std::unique_ptr internal_source_, std::shared_ptr files_metadata_, size_t processing_id_, const S3QueueAction & action_, @@ -97,7 +100,7 @@ private: const S3QueueAction action; const size_t processing_id; const std::shared_ptr files_metadata; - const std::shared_ptr internal_source; + const std::shared_ptr internal_source; const NamesAndTypesList requested_virtual_columns; const std::atomic & shutdown_called; const std::atomic & table_is_being_dropped; @@ -107,8 +110,8 @@ private: RemoveFileFunc remove_file_func; LoggerPtr log; - Source::ReaderHolder reader; - std::future reader_future; + ReaderHolder reader; + std::future reader_future; std::atomic initialized{false}; size_t processed_rows_from_file = 0; diff --git a/src/Storages/S3Queue/S3QueueTableMetadata.h b/src/Storages/S3Queue/S3QueueTableMetadata.h index 942ce7973ef..70dd8f27d71 100644 --- a/src/Storages/S3Queue/S3QueueTableMetadata.h +++ b/src/Storages/S3Queue/S3QueueTableMetadata.h @@ -3,7 +3,7 @@ #if USE_AWS_S3 #include -#include +#include #include namespace DB diff --git a/src/Storages/S3Queue/StorageS3Queue.cpp b/src/Storages/S3Queue/StorageS3Queue.cpp index fa7132f705a..fc4ef77ebb9 100644 --- a/src/Storages/S3Queue/StorageS3Queue.cpp +++ b/src/Storages/S3Queue/StorageS3Queue.cpp @@ -341,16 +341,23 @@ std::shared_ptr StorageS3Queue::createSource( size_t max_block_size, ContextPtr local_context) { - auto internal_source = 
std::make_unique( + auto threadpool = std::make_shared(CurrentMetrics::ObjectStorageS3Threads, + CurrentMetrics::ObjectStorageS3ThreadsActive, + CurrentMetrics::ObjectStorageS3ThreadsScheduled, + /* max_threads */1); + auto internal_source = std::make_unique( getName(), object_storage, configuration, info, format_settings, + S3StorageSettings::create(local_context->getSettingsRef()), local_context, max_block_size, file_iterator, - false); + false, + Storage::getSchemaCache(local_context), + threadpool); auto file_deleter = [=, this](const std::string & path) mutable { @@ -555,25 +562,14 @@ void StorageS3Queue::checkTableStructure(const String & zookeeper_prefix, const } } -std::shared_ptr StorageS3Queue::createFileIterator(ContextPtr , const ActionsDAG::Node * predicate) +std::shared_ptr StorageS3Queue::createFileIterator(ContextPtr local_context, const ActionsDAG::Node * predicate) { - auto glob_iterator = std::make_unique(object_storage, configuration, predicate, virtual_columns, nullptr); - + auto settings = S3StorageSettings::create(local_context->getSettingsRef()); + auto glob_iterator = std::make_unique( + object_storage, configuration, predicate, virtual_columns, local_context, nullptr, settings.list_object_keys_size); return std::make_shared(files_metadata, std::move(glob_iterator), s3queue_settings->s3queue_current_shard_num, shutdown_called); } -static void initializeConfiguration( - StorageObjectStorageConfiguration & configuration, - ASTs & engine_args, - ContextPtr local_context, - bool with_table_structure) -{ - if (auto named_collection = tryGetNamedCollectionWithOverrides(engine_args, local_context)) - configuration.fromNamedCollection(*named_collection); - else - configuration.fromAST(engine_args, local_context, with_table_structure); -} - void registerStorageS3QueueImpl(const String & name, StorageFactory & factory) { factory.registerStorage( @@ -585,7 +581,7 @@ void registerStorageS3QueueImpl(const String & name, StorageFactory & factory) throw Exception(ErrorCodes::BAD_ARGUMENTS, "External data source must have arguments"); auto configuration = std::make_shared(); - initializeConfiguration(*configuration, args.engine_args, args.getContext(), false); + StorageObjectStorageConfiguration::initialize(*configuration, args.engine_args, args.getContext(), false); // Use format settings from global server context + settings from // the SETTINGS clause of the create query. 
Settings from current diff --git a/src/Storages/S3Queue/StorageS3Queue.h b/src/Storages/S3Queue/StorageS3Queue.h index 88f9bd65093..46a8b8d82c1 100644 --- a/src/Storages/S3Queue/StorageS3Queue.h +++ b/src/Storages/S3Queue/StorageS3Queue.h @@ -27,7 +27,6 @@ class StorageS3Queue : public IStorage, WithContext { public: using Storage = StorageObjectStorage; - using Source = StorageObjectStorageSource; using ConfigurationPtr = Storage::ConfigurationPtr; StorageS3Queue( diff --git a/src/TableFunctions/ITableFunctionDataLake.h b/src/TableFunctions/ITableFunctionDataLake.h index 884e1f5c4a2..0ffa1460d78 100644 --- a/src/TableFunctions/ITableFunctionDataLake.h +++ b/src/TableFunctions/ITableFunctionDataLake.h @@ -10,7 +10,7 @@ # include # include # include -#include +#include #include #include diff --git a/src/TableFunctions/TableFunctionObjectStorage.cpp b/src/TableFunctions/TableFunctionObjectStorage.cpp index d009a9347f3..de46c13af37 100644 --- a/src/TableFunctions/TableFunctionObjectStorage.cpp +++ b/src/TableFunctions/TableFunctionObjectStorage.cpp @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include #include #include @@ -27,20 +27,9 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; } -static void initializeConfiguration( - StorageObjectStorageConfiguration & configuration, - ASTs & engine_args, - ContextPtr local_context, - bool with_table_structure) -{ - if (auto named_collection = tryGetNamedCollectionWithOverrides(engine_args, local_context)) - configuration.fromNamedCollection(*named_collection); - else - configuration.fromAST(engine_args, local_context, with_table_structure); -} - template -ObjectStoragePtr TableFunctionObjectStorage::getObjectStorage(const ContextPtr & context, bool create_readonly) const +ObjectStoragePtr TableFunctionObjectStorage< + Definition, StorageSettings, Configuration>::getObjectStorage(const ContextPtr & context, bool create_readonly) const { if (!object_storage) object_storage = configuration->createOrUpdateObjectStorage(context, create_readonly); @@ -48,7 +37,8 @@ ObjectStoragePtr TableFunctionObjectStorage -std::vector TableFunctionObjectStorage::skipAnalysisForArguments(const QueryTreeNodePtr & query_node_table_function, ContextPtr) const +std::vector TableFunctionObjectStorage< + Definition, StorageSettings, Configuration>::skipAnalysisForArguments(const QueryTreeNodePtr & query_node_table_function, ContextPtr) const { auto & table_function_node = query_node_table_function->as(); auto & table_function_arguments_nodes = table_function_node.getArguments().getNodes(); @@ -65,16 +55,18 @@ std::vector TableFunctionObjectStorage -void TableFunctionObjectStorage::addColumnsStructureToArguments(ASTs & args, const String & structure, const ContextPtr & context) +void TableFunctionObjectStorage< + Definition, StorageSettings, Configuration>::addColumnsStructureToArguments(ASTs & args, const String & structure, const ContextPtr & context) { Configuration::addStructureToArgs(args, structure, context); } template -void TableFunctionObjectStorage::parseArgumentsImpl(ASTs & engine_args, const ContextPtr & local_context) +void TableFunctionObjectStorage< + Definition, StorageSettings, Configuration>::parseArgumentsImpl(ASTs & engine_args, const ContextPtr & local_context) { configuration = std::make_shared(); - initializeConfiguration(*configuration, engine_args, local_context, true); + StorageObjectStorageConfiguration::initialize(*configuration, engine_args, local_context, true); } template @@ -91,7 +83,8 @@ void 
TableFunctionObjectStorage::par } template -ColumnsDescription TableFunctionObjectStorage::getActualTableStructure(ContextPtr context, bool is_insert_query) const +ColumnsDescription TableFunctionObjectStorage< + Definition, StorageSettings, Configuration>::getActualTableStructure(ContextPtr context, bool is_insert_query) const { if (configuration->structure == "auto") { @@ -104,13 +97,15 @@ ColumnsDescription TableFunctionObjectStorage -bool TableFunctionObjectStorage::supportsReadingSubsetOfColumns(const ContextPtr & context) +bool TableFunctionObjectStorage< + Definition, StorageSettings, Configuration>::supportsReadingSubsetOfColumns(const ContextPtr & context) { return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(configuration->format, context); } template -std::unordered_set TableFunctionObjectStorage::getVirtualsToCheckBeforeUsingStructureHint() const +std::unordered_set TableFunctionObjectStorage< + Definition, StorageSettings, Configuration>::getVirtualsToCheckBeforeUsingStructureHint() const { auto virtual_column_names = StorageObjectStorage::getVirtualColumnNames(); return {virtual_column_names.begin(), virtual_column_names.end()}; @@ -166,15 +161,33 @@ void registerTableFunctionObjectStorage(TableFunctionFactory & factory) factory.registerFunction>( { + .documentation = + { + .description=R"(The table function can be used to read the data stored on GCS.)", + .examples{{"gcs", "SELECT * FROM gcs(url, access_key_id, secret_access_key)", ""} + }, + .categories{"DataLake"}}, .allow_readonly = false }); factory.registerFunction>( { + .documentation = + { + .description=R"(The table function can be used to read the data stored on COSN.)", + .examples{{"cosn", "SELECT * FROM cosn(url, access_key_id, secret_access_key)", ""} + }, + .categories{"DataLake"}}, .allow_readonly = false }); factory.registerFunction>( { + .documentation = + { + .description=R"(The table function can be used to read the data stored on OSS.)", + .examples{{"oss", "SELECT * FROM oss(url, access_key_id, secret_access_key)", ""} + }, + .categories{"DataLake"}}, .allow_readonly = false }); #endif diff --git a/src/TableFunctions/TableFunctionObjectStorageCluster.cpp b/src/TableFunctions/TableFunctionObjectStorageCluster.cpp index 1d27a857cea..8e6c96a3f2a 100644 --- a/src/TableFunctions/TableFunctionObjectStorageCluster.cpp +++ b/src/TableFunctions/TableFunctionObjectStorageCluster.cpp @@ -6,7 +6,6 @@ #include #include #include -#include #include #include #include @@ -76,8 +75,8 @@ void registerTableFunctionObjectStorageCluster(TableFunctionFactory & factory) factory.registerFunction( { .documentation = { - .description=R"(The table function can be used to read the data stored on Azure Blob Storage in parallel for many nodes in a specified cluster.)", - .examples{{"azureBlobStorageCluster", "SELECT * FROM azureBlobStorageCluster(cluster, connection_string|storage_account_url, container_name, blobpath, [account_name, account_key, format, compression, structure])", ""}}}, + .description=R"(The table function can be used to read the data stored on S3 in parallel for many nodes in a specified cluster.)", + .examples{{"s3Cluster", "SELECT * FROM s3Cluster(cluster, url, format, structure)", ""}}}, .allow_readonly = false } ); @@ -95,7 +94,14 @@ void registerTableFunctionObjectStorageCluster(TableFunctionFactory & factory) #endif #if USE_HDFS - factory.registerFunction(); + factory.registerFunction( + { + .documentation = { + .description=R"(The table function can be used to read the data stored on HDFS in 
parallel for many nodes in a specified cluster.)", + .examples{{"HDFSCluster", "SELECT * FROM HDFSCluster(cluster_name, uri, format)", ""}}}, + .allow_readonly = false + } + ); #endif } From 27a8bcc4383578b267ebcf0c8e0f65e83053c750 Mon Sep 17 00:00:00 2001 From: Shaun Struwig <41984034+Blargian@users.noreply.github.com> Date: Tue, 13 Feb 2024 20:16:37 +0100 Subject: [PATCH 012/392] Update ReadHelpers.cpp to fix failing style check --- src/IO/ReadHelpers.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/IO/ReadHelpers.cpp b/src/IO/ReadHelpers.cpp index 3f9ceef50d4..ddf932b98a6 100644 --- a/src/IO/ReadHelpers.cpp +++ b/src/IO/ReadHelpers.cpp @@ -540,7 +540,7 @@ void readEscapedStringIntoImpl(Vector & s, ReadBuffer & buf) if (*buf.position() == '\r') { - ++buf.position(); // advance to \n after \r + ++buf.position(); } } } From 80b2276599024032ca656206042b2d5f1fdc1571 Mon Sep 17 00:00:00 2001 From: Blargian Date: Wed, 14 Feb 2024 10:38:20 +0100 Subject: [PATCH 013/392] fix style check --- src/IO/ReadHelpers.cpp | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/IO/ReadHelpers.cpp b/src/IO/ReadHelpers.cpp index ddf932b98a6..af66cbb4cb5 100644 --- a/src/IO/ReadHelpers.cpp +++ b/src/IO/ReadHelpers.cpp @@ -510,7 +510,6 @@ void readEscapedStringIntoImpl(Vector & s, ReadBuffer & buf) { next_pos = find_first_symbols<'\t', '\n', '\\'>(buf.position(), buf.buffer().end()); } - appendToStringOrVector(s, buf, next_pos); buf.position() = next_pos; @@ -539,9 +538,8 @@ void readEscapedStringIntoImpl(Vector & s, ReadBuffer & buf) } if (*buf.position() == '\r') - { - ++buf.position(); - } + ++buf.position(); + } } @@ -1987,7 +1985,4 @@ void readTSVField(String & s, ReadBuffer & buf) template void readTSVField(String & s, ReadBuffer & buf); template void readTSVField(String & s, ReadBuffer & buf); - } - - From 84b0fe670a4d73cc0b5c26bb922e90369025dae6 Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 13 Feb 2024 17:03:11 +0100 Subject: [PATCH 014/392] Refactor data lakes --- src/Backups/BackupIO_AzureBlobStorage.h | 2 +- .../registerBackupEngineAzureBlobStorage.cpp | 5 +- src/CMakeLists.txt | 7 +- .../AzureBlobStorage/AzureObjectStorage.cpp | 6 +- ...jectStorageRemoteMetadataRestoreHelper.cpp | 28 ++-- src/Disks/ObjectStorages/IObjectStorage.h | 4 +- .../ObjectStorageIteratorAsync.cpp | 30 ++++- .../ObjectStorageIteratorAsync.h | 6 +- src/Interpreters/InterpreterSystemQuery.cpp | 2 +- .../DataLakes/DeltaLakeMetadataParser.h | 26 ---- src/Storages/DataLakes/HudiMetadataParser.h | 18 --- src/Storages/DataLakes/IStorageDataLake.h | 98 -------------- .../DataLakes/Iceberg/StorageIceberg.cpp | 11 -- src/Storages/DataLakes/StorageDeltaLake.h | 20 --- src/Storages/DataLakes/StorageHudi.h | 20 --- src/Storages/DataLakes/registerDataLakes.cpp | 50 ------- .../Configuration.cpp} | 49 +++---- .../Configuration.h} | 11 +- .../ObjectStorage/DataLakes/Common.cpp | 28 ++++ src/Storages/ObjectStorage/DataLakes/Common.h | 15 +++ .../DataLakes/DeltaLakeMetadata.cpp} | 110 +++++++-------- .../DataLakes/DeltaLakeMetadata.h | 48 +++++++ .../DataLakes/HudiMetadata.cpp} | 55 ++++---- .../ObjectStorage/DataLakes/HudiMetadata.h | 51 +++++++ .../DataLakes/IDataLakeMetadata.h | 19 +++ .../DataLakes/IStorageDataLake.h} | 58 ++++---- .../DataLakes}/IcebergMetadata.cpp | 36 ++--- .../DataLakes}/IcebergMetadata.h | 40 +++--- .../DataLakes/registerDataLakeStorages.cpp | 83 ++++++++++++ .../ObjectStorage/HDFS/Configuration.cpp | 57 ++++++++ .../ObjectStorage/HDFS/Configuration.h | 45 +++++++ 
.../ObjectStorage/HDFSConfiguration.h | 81 ----------- .../ObjectStorage/ReadBufferIterator.cpp | 4 +- .../ReadFromStorageObjectStorage.cpp | 1 - .../Configuration.cpp} | 30 +++-- .../{S3Configuration.h => S3/Configuration.h} | 15 ++- .../ObjectStorage/StorageObjectStorage.cpp | 10 +- .../ObjectStorage/StorageObjectStorage.h | 5 +- .../StorageObjectStorageCluster.cpp | 2 +- .../StorageObjectStorageCluster.h | 3 + .../StorageObjectStorageConfiguration.cpp | 2 +- ....h => StorageObjectStorageConfiguration.h} | 3 +- .../StorageObjectStorageSink.cpp | 127 ++++++++++++++++++ .../ObjectStorage/StorageObjectStorageSink.h | 113 ++-------------- .../StorageObjectStorageSource.cpp | 33 ++++- .../StorageObjectStorageSource.h | 22 +-- .../StorageObjectStorage_fwd_internal.h | 3 +- .../registerStorageObjectStorage.cpp | 12 +- src/Storages/ObjectStorageConfiguration.h | 0 src/Storages/S3Queue/S3QueueTableMetadata.h | 2 +- src/Storages/S3Queue/StorageS3Queue.cpp | 9 +- .../StorageSystemSchemaInferenceCache.cpp | 2 +- src/TableFunctions/ITableFunctionDataLake.h | 76 +++++++---- src/TableFunctions/TableFunctionDeltaLake.cpp | 33 ----- src/TableFunctions/TableFunctionHudi.cpp | 31 ----- src/TableFunctions/TableFunctionIceberg.cpp | 37 ----- .../TableFunctionObjectStorage.cpp | 22 ++- .../TableFunctionObjectStorage.h | 13 +- .../TableFunctionObjectStorageCluster.cpp | 8 +- .../registerDataLakeTableFunctions.cpp | 69 ++++++++++ src/TableFunctions/registerTableFunctions.cpp | 3 +- src/TableFunctions/registerTableFunctions.h | 10 +- 62 files changed, 946 insertions(+), 873 deletions(-) delete mode 100644 src/Storages/DataLakes/DeltaLakeMetadataParser.h delete mode 100644 src/Storages/DataLakes/HudiMetadataParser.h delete mode 100644 src/Storages/DataLakes/IStorageDataLake.h delete mode 100644 src/Storages/DataLakes/Iceberg/StorageIceberg.cpp delete mode 100644 src/Storages/DataLakes/StorageDeltaLake.h delete mode 100644 src/Storages/DataLakes/StorageHudi.h delete mode 100644 src/Storages/DataLakes/registerDataLakes.cpp rename src/Storages/ObjectStorage/{AzureConfiguration.cpp => AzureBlob/Configuration.cpp} (92%) rename src/Storages/ObjectStorage/{AzureConfiguration.h => AzureBlob/Configuration.h} (88%) create mode 100644 src/Storages/ObjectStorage/DataLakes/Common.cpp create mode 100644 src/Storages/ObjectStorage/DataLakes/Common.h rename src/Storages/{DataLakes/DeltaLakeMetadataParser.cpp => ObjectStorage/DataLakes/DeltaLakeMetadata.cpp} (79%) create mode 100644 src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.h rename src/Storages/{DataLakes/HudiMetadataParser.cpp => ObjectStorage/DataLakes/HudiMetadata.cpp} (68%) create mode 100644 src/Storages/ObjectStorage/DataLakes/HudiMetadata.h create mode 100644 src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h rename src/Storages/{DataLakes/Iceberg/StorageIceberg.h => ObjectStorage/DataLakes/IStorageDataLake.h} (61%) rename src/Storages/{DataLakes/Iceberg => ObjectStorage/DataLakes}/IcebergMetadata.cpp (96%) rename src/Storages/{DataLakes/Iceberg => ObjectStorage/DataLakes}/IcebergMetadata.h (76%) create mode 100644 src/Storages/ObjectStorage/DataLakes/registerDataLakeStorages.cpp create mode 100644 src/Storages/ObjectStorage/HDFS/Configuration.cpp create mode 100644 src/Storages/ObjectStorage/HDFS/Configuration.h delete mode 100644 src/Storages/ObjectStorage/HDFSConfiguration.h rename src/Storages/ObjectStorage/{S3Configuration.cpp => S3/Configuration.cpp} (97%) rename src/Storages/ObjectStorage/{S3Configuration.h => S3/Configuration.h} (81%) rename 
src/Storages/ObjectStorage/{StorageObejctStorageConfiguration.h => StorageObjectStorageConfiguration.h} (99%) create mode 100644 src/Storages/ObjectStorage/StorageObjectStorageSink.cpp delete mode 100644 src/Storages/ObjectStorageConfiguration.h delete mode 100644 src/TableFunctions/TableFunctionDeltaLake.cpp delete mode 100644 src/TableFunctions/TableFunctionHudi.cpp delete mode 100644 src/TableFunctions/TableFunctionIceberg.cpp create mode 100644 src/TableFunctions/registerDataLakeTableFunctions.cpp diff --git a/src/Backups/BackupIO_AzureBlobStorage.h b/src/Backups/BackupIO_AzureBlobStorage.h index 99002c53769..9f1702cb3a3 100644 --- a/src/Backups/BackupIO_AzureBlobStorage.h +++ b/src/Backups/BackupIO_AzureBlobStorage.h @@ -6,7 +6,7 @@ #include #include #include -#include +#include namespace DB diff --git a/src/Backups/registerBackupEngineAzureBlobStorage.cpp b/src/Backups/registerBackupEngineAzureBlobStorage.cpp index 9408c7ccdcf..c4c04bbc057 100644 --- a/src/Backups/registerBackupEngineAzureBlobStorage.cpp +++ b/src/Backups/registerBackupEngineAzureBlobStorage.cpp @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #endif @@ -59,9 +59,6 @@ void registerBackupEngineAzureBlobStorage(BackupFactory & factory) if (!config.has(config_prefix)) throw Exception(ErrorCodes::BAD_ARGUMENTS, "There is no collection named `{}` in config", id_arg); - if (!config.has(config_prefix)) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "There is no `{}` in config", config_prefix); - if (config.has(config_prefix + ".connection_string")) { configuration.connection_url = config.getString(config_prefix + ".connection_string"); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 50130e6abd0..118e0131b37 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -105,6 +105,7 @@ add_library(clickhouse_compression ${clickhouse_compression_headers} ${clickhous add_headers_and_sources(dbms Disks/IO) add_headers_and_sources(dbms Disks/ObjectStorages) +add_headers_and_sources(dbms Disks/ObjectStorages) if (TARGET ch_contrib::sqlite) add_headers_and_sources(dbms Databases/SQLite) endif() @@ -117,9 +118,11 @@ if (TARGET ch_contrib::nats_io) add_headers_and_sources(dbms Storages/NATS) endif() -add_headers_and_sources(dbms Storages/DataLakes) -add_headers_and_sources(dbms Storages/DataLakes/Iceberg) add_headers_and_sources(dbms Storages/ObjectStorage) +add_headers_and_sources(dbms Storages/ObjectStorage/AzureBlob) +add_headers_and_sources(dbms Storages/ObjectStorage/S3) +add_headers_and_sources(dbms Storages/ObjectStorage/HDFS) +add_headers_and_sources(dbms Storages/ObjectStorage/DataLakes) add_headers_and_sources(dbms Common/NamedCollections) if (TARGET ch_contrib::amqp_cpp) diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp index bbbb5357505..bcc75f91e2a 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp @@ -323,10 +323,8 @@ void AzureObjectStorage::removeObjectsIfExist(const StoredObjects & objects) { removeObjectIfExists(object); } - } - ObjectMetadata AzureObjectStorage::getObjectMetadata(const std::string & path) const { auto client_ptr = client.get(); @@ -338,9 +336,9 @@ ObjectMetadata AzureObjectStorage::getObjectMetadata(const std::string & path) c { result.attributes.emplace(); for (const auto & [key, value] : properties.Metadata) - (*result.attributes)[key] = value; + result.attributes[key] = 
value; } - result.last_modified.emplace(static_cast(properties.LastModified).time_since_epoch().count()); + result.last_modified = static_cast(properties.LastModified).time_since_epoch().count(); return result; } diff --git a/src/Disks/ObjectStorages/DiskObjectStorageRemoteMetadataRestoreHelper.cpp b/src/Disks/ObjectStorages/DiskObjectStorageRemoteMetadataRestoreHelper.cpp index cc9ee3db505..9f9efad9615 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorageRemoteMetadataRestoreHelper.cpp +++ b/src/Disks/ObjectStorages/DiskObjectStorageRemoteMetadataRestoreHelper.cpp @@ -404,26 +404,20 @@ void DiskObjectStorageRemoteMetadataRestoreHelper::processRestoreFiles( { for (const auto & key : keys) { - auto meta = source_object_storage->getObjectMetadata(key); - auto object_attributes = meta.attributes; + auto metadata = source_object_storage->getObjectMetadata(key); + auto object_attributes = metadata.attributes; String path; - if (object_attributes.has_value()) + /// Restore file if object has 'path' in metadata. + auto path_entry = object_attributes.find("path"); + if (path_entry == object_attributes.end()) { - /// Restore file if object has 'path' in metadata. - auto path_entry = object_attributes->find("path"); - if (path_entry == object_attributes->end()) - { - /// Such keys can remain after migration, we can skip them. - LOG_WARNING(disk->log, "Skip key {} because it doesn't have 'path' in metadata", key); - continue; - } - - path = path_entry->second; - } - else + /// Such keys can remain after migration, we can skip them. + LOG_WARNING(disk->log, "Skip key {} because it doesn't have 'path' in metadata", key); continue; + } + path = path_entry->second; disk->createDirectories(directoryPath(path)); auto object_key = ObjectStorageKey::createAsRelative(disk->object_key_prefix, shrinkKey(source_path, key)); @@ -435,7 +429,7 @@ void DiskObjectStorageRemoteMetadataRestoreHelper::processRestoreFiles( source_object_storage->copyObjectToAnotherObjectStorage(object_from, object_to, read_settings, write_settings, *disk->object_storage); auto tx = disk->metadata_storage->createTransaction(); - tx->addBlobToMetadata(path, object_key, meta.size_bytes); + tx->addBlobToMetadata(path, object_key, metadata.size_bytes); tx->commit(); LOG_TRACE(disk->log, "Restored file {}", path); @@ -490,7 +484,7 @@ void DiskObjectStorageRemoteMetadataRestoreHelper::restoreFileOperations(IObject if (send_metadata) revision_counter = revision - 1; - auto object_attributes = *(source_object_storage->getObjectMetadata(object->relative_path).attributes); + auto object_attributes = source_object_storage->getObjectMetadata(object->relative_path).attributes; if (operation == rename) { auto from_path = object_attributes["from_path"]; diff --git a/src/Disks/ObjectStorages/IObjectStorage.h b/src/Disks/ObjectStorages/IObjectStorage.h index 4955b0e6924..8a5352e71ca 100644 --- a/src/Disks/ObjectStorages/IObjectStorage.h +++ b/src/Disks/ObjectStorages/IObjectStorage.h @@ -47,8 +47,8 @@ using ObjectAttributes = std::map; struct ObjectMetadata { uint64_t size_bytes = 0; - std::optional last_modified; - std::optional attributes; + Poco::Timestamp last_modified; + ObjectAttributes attributes; }; struct RelativePathWithMetadata diff --git a/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.cpp b/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.cpp index 62bdd0ed0c8..f441b18d59d 100644 --- a/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.cpp +++ b/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.cpp @@ -11,18 +11,26 @@ namespace 
ErrorCodes extern const int LOGICAL_ERROR; } +IObjectStorageIteratorAsync::IObjectStorageIteratorAsync( + CurrentMetrics::Metric threads_metric, + CurrentMetrics::Metric threads_active_metric, + CurrentMetrics::Metric threads_scheduled_metric, + const std::string & thread_name) + : list_objects_pool(threads_metric, threads_active_metric, threads_scheduled_metric, 1) + , list_objects_scheduler(threadPoolCallbackRunner(list_objects_pool, thread_name)) +{ +} + void IObjectStorageIteratorAsync::nextBatch() { std::lock_guard lock(mutex); if (is_finished) { - LOG_TEST(&Poco::Logger::get("kssenii"), "KSSENII: here 3"); current_batch.clear(); current_batch_iterator = current_batch.begin(); } else { - LOG_TEST(&Poco::Logger::get("kssenii"), "KSSENII: here 4"); if (!is_initialized) { outcome_future = scheduleBatch(); @@ -30,13 +38,23 @@ void IObjectStorageIteratorAsync::nextBatch() } chassert(outcome_future.valid()); - auto [batch, has_next] = outcome_future.get(); - current_batch = std::move(batch); + BatchAndHasNext result; + try + { + result = outcome_future.get(); + } + catch (...) + { + is_finished = true; + throw; + } + + current_batch = std::move(result.batch); current_batch_iterator = current_batch.begin(); accumulated_size.fetch_add(current_batch.size(), std::memory_order_relaxed); - if (has_next) + if (result.has_next) outcome_future = scheduleBatch(); else is_finished = true; @@ -100,12 +118,10 @@ std::optional IObjectStorageIteratorAsync::getCurrent if (current_batch_iterator == current_batch.end()) { - LOG_TEST(&Poco::Logger::get("kssenii"), "KSSENII: here 2"); return std::nullopt; } auto temp_current_batch = std::move(current_batch); - LOG_TEST(&Poco::Logger::get("kssenii"), "KSSENII: here 1: {}", temp_current_batch.size()); nextBatch(); return temp_current_batch; } diff --git a/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.h b/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.h index 8d155f7ec8d..86e5feb3010 100644 --- a/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.h +++ b/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.h @@ -17,11 +17,7 @@ public: CurrentMetrics::Metric threads_metric, CurrentMetrics::Metric threads_active_metric, CurrentMetrics::Metric threads_scheduled_metric, - const std::string & thread_name) - : list_objects_pool(threads_metric, threads_active_metric, threads_scheduled_metric, 1) - , list_objects_scheduler(threadPoolCallbackRunner(list_objects_pool, thread_name)) - { - } + const std::string & thread_name); void next() override; void nextBatch() override; diff --git a/src/Interpreters/InterpreterSystemQuery.cpp b/src/Interpreters/InterpreterSystemQuery.cpp index d697d90c8a6..36f5bd73ca6 100644 --- a/src/Interpreters/InterpreterSystemQuery.cpp +++ b/src/Interpreters/InterpreterSystemQuery.cpp @@ -480,7 +480,7 @@ BlockIO InterpreterSystemQuery::execute() StorageURL::getSchemaCache(getContext()).clear(); #if USE_AZURE_BLOB_STORAGE if (caches_to_drop.contains("AZURE")) - StorageAzureBlobStorage::getSchemaCache(getContext()).clear(); + StorageAzureBlob::getSchemaCache(getContext()).clear(); #endif break; } diff --git a/src/Storages/DataLakes/DeltaLakeMetadataParser.h b/src/Storages/DataLakes/DeltaLakeMetadataParser.h deleted file mode 100644 index 251ea3e3f15..00000000000 --- a/src/Storages/DataLakes/DeltaLakeMetadataParser.h +++ /dev/null @@ -1,26 +0,0 @@ -#pragma once - -#include -#include -#include -#include - -namespace DB -{ - -struct DeltaLakeMetadataParser -{ -public: - DeltaLakeMetadataParser(); - - Strings getFiles( - ObjectStoragePtr 
object_storage, - StorageObjectStorageConfigurationPtr configuration, - ContextPtr context); - -private: - struct Impl; - std::shared_ptr impl; -}; - -} diff --git a/src/Storages/DataLakes/HudiMetadataParser.h b/src/Storages/DataLakes/HudiMetadataParser.h deleted file mode 100644 index 72766a95876..00000000000 --- a/src/Storages/DataLakes/HudiMetadataParser.h +++ /dev/null @@ -1,18 +0,0 @@ -#pragma once - -#include -#include -#include -#include - -namespace DB -{ - -struct HudiMetadataParser -{ - Strings getFiles( - ObjectStoragePtr object_storage, - StorageObjectStorageConfigurationPtr configuration, ContextPtr context); -}; - -} diff --git a/src/Storages/DataLakes/IStorageDataLake.h b/src/Storages/DataLakes/IStorageDataLake.h deleted file mode 100644 index 934bf227c42..00000000000 --- a/src/Storages/DataLakes/IStorageDataLake.h +++ /dev/null @@ -1,98 +0,0 @@ -#pragma once - -#include "config.h" - -#if USE_AWS_S3 - -#include -#include -#include -#include -#include -#include - - -namespace DB -{ - -template -class IStorageDataLake : public StorageObjectStorage -{ -public: - static constexpr auto name = Name::name; - - using Storage = StorageObjectStorage; - using ConfigurationPtr = Storage::ConfigurationPtr; - - static StoragePtr create( - ConfigurationPtr base_configuration, - ContextPtr context, - const String & engine_name_, - const StorageID & table_id_, - const ColumnsDescription & columns_, - const ConstraintsDescription & constraints_, - const String & comment_, - std::optional format_settings_, - bool /* attach */) - { - auto object_storage = base_configuration->createOrUpdateObjectStorage(context); - - auto configuration = base_configuration->clone(); - configuration->getPaths() = MetadataParser().getFiles(object_storage, configuration, context); - - return std::make_shared>( - base_configuration, configuration, object_storage, engine_name_, context, - table_id_, columns_, constraints_, comment_, format_settings_); - } - - String getName() const override { return name; } - - static ColumnsDescription getTableStructureFromData( - ObjectStoragePtr object_storage_, - ConfigurationPtr base_configuration, - const std::optional &, - ContextPtr local_context) - { - auto metadata = parseIcebergMetadata(object_storage_, base_configuration, local_context); - return ColumnsDescription(metadata->getTableSchema()); - } - - std::pair updateConfigurationAndGetCopy(ContextPtr local_context) override - { - std::lock_guard lock(Storage::configuration_update_mutex); - - auto new_object_storage = base_configuration->createOrUpdateObjectStorage(local_context); - bool updated = new_object_storage != nullptr; - if (updated) - Storage::object_storage = new_object_storage; - - auto new_keys = MetadataParser().getFiles(Storage::object_storage, base_configuration, local_context); - - if (updated || new_keys != Storage::configuration->getPaths()) - { - auto updated_configuration = base_configuration->clone(); - /// If metadata wasn't changed, we won't list data files again. - updated_configuration->getPaths() = new_keys; - Storage::configuration = updated_configuration; - } - return {Storage::configuration, Storage::object_storage}; - } - - template - explicit IStorageDataLake( - ConfigurationPtr base_configuration_, - Args &&... args) - : Storage(std::forward(args)...) 
- , base_configuration(base_configuration_) - { - } - -private: - ConfigurationPtr base_configuration; - LoggerPtr log; -}; - - -} - -#endif diff --git a/src/Storages/DataLakes/Iceberg/StorageIceberg.cpp b/src/Storages/DataLakes/Iceberg/StorageIceberg.cpp deleted file mode 100644 index ad1a27c312b..00000000000 --- a/src/Storages/DataLakes/Iceberg/StorageIceberg.cpp +++ /dev/null @@ -1,11 +0,0 @@ -#include - -#if USE_AWS_S3 && USE_AVRO - -namespace DB -{ - - -} - -#endif diff --git a/src/Storages/DataLakes/StorageDeltaLake.h b/src/Storages/DataLakes/StorageDeltaLake.h deleted file mode 100644 index 07c2205d2df..00000000000 --- a/src/Storages/DataLakes/StorageDeltaLake.h +++ /dev/null @@ -1,20 +0,0 @@ -#pragma once - -#include -#include -#include -#include "config.h" - -namespace DB -{ - -struct StorageDeltaLakeName -{ - static constexpr auto name = "DeltaLake"; -}; - -#if USE_AWS_S3 && USE_PARQUET -using StorageDeltaLakeS3 = IStorageDataLake; -#endif - -} diff --git a/src/Storages/DataLakes/StorageHudi.h b/src/Storages/DataLakes/StorageHudi.h deleted file mode 100644 index 3fd52c82d32..00000000000 --- a/src/Storages/DataLakes/StorageHudi.h +++ /dev/null @@ -1,20 +0,0 @@ -#pragma once - -#include -#include -#include -#include "config.h" - -namespace DB -{ - -struct StorageHudiName -{ - static constexpr auto name = "Hudi"; -}; - -#if USE_AWS_S3 -using StorageHudiS3 = IStorageDataLake; -#endif - -} diff --git a/src/Storages/DataLakes/registerDataLakes.cpp b/src/Storages/DataLakes/registerDataLakes.cpp deleted file mode 100644 index 2647fbce39d..00000000000 --- a/src/Storages/DataLakes/registerDataLakes.cpp +++ /dev/null @@ -1,50 +0,0 @@ -#include -#include "config.h" - -#if USE_AWS_S3 - -#include -#include -#include -#include - - -namespace DB -{ - -#if USE_PARQUET -void registerStorageDeltaLake(StorageFactory & ) -{ - // factory.registerStorage( - // StorageDeltaLakeName::name, - // [&](const StorageFactory::Arguments & args) - // { - // auto configuration = std::make_shared(); - // return IStorageDataLake::create( - // configuration, args.getContext(), "deltaLake", args.table_id, args.columns, - // args.constraints, args.comment, std::nullopt, args.attach); - // }, - // { - // .supports_settings = false, - // .supports_schema_inference = true, - // .source_access_type = AccessType::S3, - // }); -} -#endif - -#if USE_AVRO /// StorageIceberg depending on Avro to parse metadata with Avro format. 
- -void registerStorageIceberg(StorageFactory &) -{ - // REGISTER_DATA_LAKE_STORAGE(StorageIceberg, StorageIceberg::name) -} - -#endif - -void registerStorageHudi(StorageFactory &) -{ -} - -} - -#endif diff --git a/src/Storages/ObjectStorage/AzureConfiguration.cpp b/src/Storages/ObjectStorage/AzureBlob/Configuration.cpp similarity index 92% rename from src/Storages/ObjectStorage/AzureConfiguration.cpp rename to src/Storages/ObjectStorage/AzureBlob/Configuration.cpp index 04f6f26111b..109918dfc8b 100644 --- a/src/Storages/ObjectStorage/AzureConfiguration.cpp +++ b/src/Storages/ObjectStorage/AzureBlob/Configuration.cpp @@ -1,4 +1,7 @@ -#include +#include + +#if USE_AZURE_BLOB_STORAGE + #include #include #include @@ -44,21 +47,19 @@ namespace return !candidate.starts_with("http"); } - bool containerExists(std::unique_ptr & blob_service_client, std::string container_name) + bool containerExists(Azure::Storage::Blobs::BlobServiceClient & blob_service_client, std::string container_name) { Azure::Storage::Blobs::ListBlobContainersOptions options; options.Prefix = container_name; options.PageSizeHint = 1; - auto containers_list_response = blob_service_client->ListBlobContainers(options); + auto containers_list_response = blob_service_client.ListBlobContainers(options); auto containers_list = containers_list_response.BlobContainers; - for (const auto & container : containers_list) - { - if (container_name == container.Name) - return true; - } - return false; + auto it = std::find_if( + containers_list.begin(), containers_list.end(), + [&](const auto & c) { return c.Name == container_name; }); + return it != containers_list.end(); } } @@ -76,19 +77,6 @@ void StorageAzureBlobConfiguration::check(ContextPtr context) const context->getGlobalContext()->getRemoteHostFilter().checkURL(url_to_check); } -StorageObjectStorageConfigurationPtr StorageAzureBlobConfiguration::clone() -{ - auto configuration = std::make_shared(); - configuration->connection_url = connection_url; - configuration->is_connection_string = is_connection_string; - configuration->account_name = account_name; - configuration->account_key = account_key; - configuration->container = container; - configuration->blob_path = blob_path; - configuration->blobs_paths = blobs_paths; - return configuration; -} - StorageAzureBlobConfiguration::StorageAzureBlobConfiguration(const StorageAzureBlobConfiguration & other) { connection_url = other.connection_url; @@ -98,6 +86,10 @@ StorageAzureBlobConfiguration::StorageAzureBlobConfiguration(const StorageAzureB container = other.container; blob_path = other.blob_path; blobs_paths = other.blobs_paths; + + format = other.format; + compression_method = other.compression_method; + structure = other.structure; } AzureObjectStorage::SettingsPtr StorageAzureBlobConfiguration::createSettings(ContextPtr context) @@ -127,7 +119,7 @@ AzureClientPtr StorageAzureBlobConfiguration::createClient(bool is_read_only) { auto blob_service_client = std::make_unique(BlobServiceClient::CreateFromConnectionString(connection_url)); result = std::make_unique(BlobContainerClient::CreateFromConnectionString(connection_url, container)); - bool container_exists = containerExists(blob_service_client, container); + bool container_exists = containerExists(*blob_service_client, container); if (!container_exists) { @@ -140,10 +132,11 @@ AzureClientPtr StorageAzureBlobConfiguration::createClient(bool is_read_only) try { result->CreateIfNotExists(); - } catch (const Azure::Storage::StorageException & e) + } + catch (const 
Azure::Storage::StorageException & e) { - if (!(e.StatusCode == Azure::Core::Http::HttpStatusCode::Conflict - && e.ReasonPhrase == "The specified container already exists.")) + if (e.StatusCode != Azure::Core::Http::HttpStatusCode::Conflict + || e.ReasonPhrase != "The specified container already exists.") { throw; } @@ -169,7 +162,7 @@ AzureClientPtr StorageAzureBlobConfiguration::createClient(bool is_read_only) blob_service_client = std::make_unique(connection_url); } - bool container_exists = containerExists(blob_service_client, container); + bool container_exists = containerExists(*blob_service_client, container); std::string final_url; size_t pos = connection_url.find('?'); @@ -460,3 +453,5 @@ void StorageAzureBlobConfiguration::addStructureToArgs(ASTs & args, const String } } + +#endif diff --git a/src/Storages/ObjectStorage/AzureConfiguration.h b/src/Storages/ObjectStorage/AzureBlob/Configuration.h similarity index 88% rename from src/Storages/ObjectStorage/AzureConfiguration.h rename to src/Storages/ObjectStorage/AzureBlob/Configuration.h index 4f285128241..deeb365d012 100644 --- a/src/Storages/ObjectStorage/AzureConfiguration.h +++ b/src/Storages/ObjectStorage/AzureBlob/Configuration.h @@ -1,6 +1,11 @@ #pragma once + +#include "config.h" + +#if USE_AZURE_BLOB_STORAGE + #include -#include +#include namespace DB { @@ -26,8 +31,8 @@ public: String getNamespace() const override { return container; } void check(ContextPtr context) const override; - StorageObjectStorageConfigurationPtr clone() override; ObjectStoragePtr createOrUpdateObjectStorage(ContextPtr context, bool is_readonly = true) override; /// NOLINT + StorageObjectStorageConfigurationPtr clone() override { return std::make_shared(*this); } void fromNamedCollection(const NamedCollection & collection) override; void fromAST(ASTs & args, ContextPtr context, bool with_structure) override; @@ -52,3 +57,5 @@ protected: }; } + +#endif diff --git a/src/Storages/ObjectStorage/DataLakes/Common.cpp b/src/Storages/ObjectStorage/DataLakes/Common.cpp new file mode 100644 index 00000000000..5f0138078d4 --- /dev/null +++ b/src/Storages/ObjectStorage/DataLakes/Common.cpp @@ -0,0 +1,28 @@ +#include "Common.h" +#include +#include +#include + +namespace DB +{ + +std::vector listFiles( + const IObjectStorage & object_storage, + const StorageObjectStorageConfiguration & configuration, + const String & prefix, const String & suffix) +{ + auto key = std::filesystem::path(configuration.getPath()) / prefix; + RelativePathsWithMetadata files_with_metadata; + object_storage.listObjects(key, files_with_metadata, 0); + Strings res; + for (const auto & file_with_metadata : files_with_metadata) + { + const auto & filename = file_with_metadata->relative_path; + if (filename.ends_with(suffix)) + res.push_back(filename); + } + LOG_TRACE(getLogger("DataLakeCommon"), "Listed {} files", res.size()); + return res; +} + +} diff --git a/src/Storages/ObjectStorage/DataLakes/Common.h b/src/Storages/ObjectStorage/DataLakes/Common.h new file mode 100644 index 00000000000..ae3767f2eec --- /dev/null +++ b/src/Storages/ObjectStorage/DataLakes/Common.h @@ -0,0 +1,15 @@ +#pragma once +#include + +namespace DB +{ + +class IObjectStorage; +class StorageObjectStorageConfiguration; + +std::vector listFiles( + const IObjectStorage & object_storage, + const StorageObjectStorageConfiguration & configuration, + const String & prefix, const String & suffix); + +} diff --git a/src/Storages/DataLakes/DeltaLakeMetadataParser.cpp 
b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.cpp similarity index 79% rename from src/Storages/DataLakes/DeltaLakeMetadataParser.cpp rename to src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.cpp index 55ff8fefdd5..903558b73ab 100644 --- a/src/Storages/DataLakes/DeltaLakeMetadataParser.cpp +++ b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include "config.h" #include @@ -15,8 +15,7 @@ #include #include #include - -namespace fs = std::filesystem; +#include namespace DB { @@ -27,12 +26,23 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; } -struct DeltaLakeMetadataParser::Impl +struct DeltaLakeMetadata::Impl final : private WithContext { + ObjectStoragePtr object_storage; + ConfigurationPtr configuration; + /** * Useful links: * - https://github.com/delta-io/delta/blob/master/PROTOCOL.md#data-files */ + Impl(ObjectStoragePtr object_storage_, + ConfigurationPtr configuration_, + ContextPtr context_) + : WithContext(context_) + , object_storage(object_storage_) + , configuration(configuration_) + { + } /** * DeltaLake tables store metadata files and data files. @@ -62,13 +72,10 @@ struct DeltaLakeMetadataParser::Impl * An action changes one aspect of the table's state, for example, adding or removing a file. * Note: it is not a valid json, but a list of json's, so we read it in a while cycle. */ - std::set processMetadataFiles( - ObjectStoragePtr object_storage, - const StorageObjectStorageConfiguration & configuration, - ContextPtr context) + std::set processMetadataFiles() { std::set result_files; - const auto checkpoint_version = getCheckpointIfExists(result_files, object_storage, configuration, context); + const auto checkpoint_version = getCheckpointIfExists(result_files); if (checkpoint_version) { @@ -76,12 +83,12 @@ struct DeltaLakeMetadataParser::Impl while (true) { const auto filename = withPadding(++current_version) + metadata_file_suffix; - const auto file_path = fs::path(configuration.getPath()) / deltalake_metadata_directory / filename; + const auto file_path = fs::path(configuration->getPath()) / deltalake_metadata_directory / filename; if (!object_storage->exists(StoredObject(file_path))) break; - processMetadataFile(file_path, result_files, object_storage, configuration, context); + processMetadataFile(file_path, result_files); } LOG_TRACE( @@ -90,33 +97,14 @@ struct DeltaLakeMetadataParser::Impl } else { - const auto keys = listFiles(object_storage, configuration, deltalake_metadata_directory, metadata_file_suffix); + const auto keys = listFiles(*object_storage, *configuration, deltalake_metadata_directory, metadata_file_suffix); for (const String & key : keys) - processMetadataFile(key, result_files, object_storage, configuration, context); + processMetadataFile(key, result_files); } return result_files; } - std::vector listFiles( - const ObjectStoragePtr & object_storage, - const StorageObjectStorageConfiguration & configuration, - const String & prefix, const String & suffix) - { - auto key = std::filesystem::path(configuration.getPath()) / prefix; - RelativePathsWithMetadata files_with_metadata; - object_storage->listObjects(key, files_with_metadata, 0); - Strings res; - for (const auto & file_with_metadata : files_with_metadata) - { - const auto & filename = file_with_metadata->relative_path; - if (filename.ends_with(suffix)) - res.push_back(filename); - } - LOG_TRACE(getLogger("DataLakeMetadataReadHelper"), "Listed {} files", res.size()); - return res; - } - /** * Example of content of a 
single .json metadata file: * " @@ -146,14 +134,9 @@ struct DeltaLakeMetadataParser::Impl * \"nullCount\":{\"col-6c990940-59bb-4709-8f2e-17083a82c01a\":0,\"col-763cd7e2-7627-4d8e-9fb7-9e85d0c8845b\":0}}"}} * " */ - void processMetadataFile( - const String & key, - std::set & result, - ObjectStoragePtr object_storage, - const StorageObjectStorageConfiguration & configuration, - ContextPtr context) + void processMetadataFile(const String & key, std::set & result) { - auto read_settings = context->getReadSettings(); + auto read_settings = getContext()->getReadSettings(); auto buf = object_storage->readObject(StoredObject(key), read_settings); char c; @@ -176,12 +159,12 @@ struct DeltaLakeMetadataParser::Impl if (json.has("add")) { const auto path = json["add"]["path"].getString(); - result.insert(fs::path(configuration.getPath()) / path); + result.insert(fs::path(configuration->getPath()) / path); } else if (json.has("remove")) { const auto path = json["remove"]["path"].getString(); - result.erase(fs::path(configuration.getPath()) / path); + result.erase(fs::path(configuration->getPath()) / path); } } } @@ -199,17 +182,14 @@ struct DeltaLakeMetadataParser::Impl * * We need to get "version", which is the version of the checkpoint we need to read. */ - size_t readLastCheckpointIfExists( - ObjectStoragePtr object_storage, - const StorageObjectStorageConfiguration & configuration, - ContextPtr context) const + size_t readLastCheckpointIfExists() { - const auto last_checkpoint_file = fs::path(configuration.getPath()) / deltalake_metadata_directory / "_last_checkpoint"; + const auto last_checkpoint_file = fs::path(configuration->getPath()) / deltalake_metadata_directory / "_last_checkpoint"; if (!object_storage->exists(StoredObject(last_checkpoint_file))) return 0; String json_str; - auto read_settings = context->getReadSettings(); + auto read_settings = getContext()->getReadSettings(); auto buf = object_storage->readObject(StoredObject(last_checkpoint_file), read_settings); readJSONObjectPossiblyInvalid(json_str, *buf); @@ -260,21 +240,18 @@ struct DeltaLakeMetadataParser::Impl throw Exception(ErrorCodes::BAD_ARGUMENTS, "Arrow error: {}", _s.ToString()); \ } while (false) - size_t getCheckpointIfExists( - std::set & result, - ObjectStoragePtr object_storage, - const StorageObjectStorageConfiguration & configuration, - ContextPtr context) + size_t getCheckpointIfExists(std::set & result) { - const auto version = readLastCheckpointIfExists(object_storage, configuration, context); + const auto version = readLastCheckpointIfExists(); if (!version) return 0; const auto checkpoint_filename = withPadding(version) + ".checkpoint.parquet"; - const auto checkpoint_path = fs::path(configuration.getPath()) / deltalake_metadata_directory / checkpoint_filename; + const auto checkpoint_path = fs::path(configuration->getPath()) / deltalake_metadata_directory / checkpoint_filename; LOG_TRACE(log, "Using checkpoint file: {}", checkpoint_path.string()); + auto context = getContext(); auto read_settings = context->getReadSettings(); auto buf = object_storage->readObject(StoredObject(checkpoint_path), read_settings); auto format_settings = getFormatSettings(context); @@ -334,7 +311,7 @@ struct DeltaLakeMetadataParser::Impl if (filename.empty()) continue; LOG_TEST(log, "Adding {}", filename); - const auto [_, inserted] = result.insert(fs::path(configuration.getPath()) / filename); + const auto [_, inserted] = result.insert(fs::path(configuration->getPath()) / filename); if (!inserted) throw 
Exception(ErrorCodes::INCORRECT_DATA, "File already exists {}", filename); } @@ -345,15 +322,22 @@ struct DeltaLakeMetadataParser::Impl LoggerPtr log = getLogger("DeltaLakeMetadataParser"); }; -DeltaLakeMetadataParser::DeltaLakeMetadataParser() : impl(std::make_unique()) {} - -Strings DeltaLakeMetadataParser::getFiles( - ObjectStoragePtr object_storage, - StorageObjectStorageConfigurationPtr configuration, - ContextPtr context) +DeltaLakeMetadata::DeltaLakeMetadata( + ObjectStoragePtr object_storage_, + ConfigurationPtr configuration_, + ContextPtr context_) + : impl(std::make_unique(object_storage_, configuration_, context_)) { - auto result = impl->processMetadataFiles(object_storage, *configuration, context); - return Strings(result.begin(), result.end()); +} + +Strings DeltaLakeMetadata::getDataFiles() const +{ + if (!data_files.empty()) + return data_files; + + auto result = impl->processMetadataFiles(); + data_files = Strings(result.begin(), result.end()); + return data_files; } } diff --git a/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.h b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.h new file mode 100644 index 00000000000..1a5bb85586a --- /dev/null +++ b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.h @@ -0,0 +1,48 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace DB +{ + +class DeltaLakeMetadata final : public IDataLakeMetadata, private WithContext +{ +public: + using ConfigurationPtr = StorageObjectStorageConfigurationPtr; + + static constexpr auto name = "DeltaLake"; + + DeltaLakeMetadata( + ObjectStoragePtr object_storage_, + ConfigurationPtr configuration_, + ContextPtr context_); + + Strings getDataFiles() const override; + + NamesAndTypesList getTableSchema() const override { return {}; } + + bool operator ==(const IDataLakeMetadata & other) const override + { + const auto * deltalake_metadata = dynamic_cast(&other); + return deltalake_metadata && getDataFiles() == deltalake_metadata->getDataFiles(); + } + + static DataLakeMetadataPtr create( + ObjectStoragePtr object_storage, + ConfigurationPtr configuration, + ContextPtr local_context) + { + return std::make_unique(object_storage, configuration, local_context); + } + +private: + struct Impl; + const std::shared_ptr impl; + mutable Strings data_files; +}; + +} diff --git a/src/Storages/DataLakes/HudiMetadataParser.cpp b/src/Storages/ObjectStorage/DataLakes/HudiMetadata.cpp similarity index 68% rename from src/Storages/DataLakes/HudiMetadataParser.cpp rename to src/Storages/ObjectStorage/DataLakes/HudiMetadata.cpp index 8571c035b32..91a586ccbf9 100644 --- a/src/Storages/DataLakes/HudiMetadataParser.cpp +++ b/src/Storages/ObjectStorage/DataLakes/HudiMetadata.cpp @@ -1,4 +1,5 @@ -#include +#include +#include #include #include #include @@ -40,33 +41,10 @@ namespace ErrorCodes * hoodie.parquet.max.file.size option. Once a single Parquet file is too large, Hudi creates a second file group. * Each file group is identified by File Id. 
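// A minimal, self-contained sketch (not part of the patch) of the file-group resolution that
// HudiMetadata::getDataFilesImpl() performs below. It assumes data file names of the form
// <file_id>_<write_token>_<commit_timestamp>.parquet, which is what the split on '_' and the
// use of file_parts[0] / file_parts[2] imply; the object keys and values here are hypothetical.
#include <cstdint>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

int main()
{
    // Hypothetical object keys for a single partition of a Hudi table.
    const std::vector<std::string> keys = {
        "part=1/fileid-a_0-1-0_20240101.parquet",
        "part=1/fileid-a_0-2-0_20240105.parquet", // newer commit in the same file group
        "part=1/fileid-b_0-1-0_20240103.parquet",
    };

    // Keep only the key with the greatest commit timestamp per file id,
    // mirroring the "file_info.timestamp < timestamp" update in the patch.
    std::map<std::string, std::pair<uint64_t, std::string>> latest;
    for (const auto & key : keys)
    {
        std::string stem = key.substr(key.find('/') + 1);
        stem = stem.substr(0, stem.rfind('.'));

        std::vector<std::string> parts;
        std::stringstream ss(stem);
        for (std::string part; std::getline(ss, part, '_');)
            parts.push_back(part);

        const std::string & file_id = parts.at(0);
        const uint64_t timestamp = std::stoull(parts.at(2));

        auto & entry = latest[file_id];
        if (entry.first < timestamp)
            entry = {timestamp, key};
    }

    for (const auto & [file_id, entry] : latest)
        std::cout << file_id << " -> " << entry.second << '\n'; // the surviving data file per group
}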
*/ -std::vector listFiles( - const ObjectStoragePtr & object_storage, - const StorageObjectStorageConfiguration & configuration, - const String & prefix, const String & suffix) +Strings HudiMetadata::getDataFilesImpl() const { - auto key = std::filesystem::path(configuration.getPath()) / prefix; - RelativePathsWithMetadata files_with_metadata; - object_storage->listObjects(key, files_with_metadata, 0); - Strings res; - for (const auto & file_with_metadata : files_with_metadata) - { - const auto & filename = file_with_metadata->relative_path; - if (filename.ends_with(suffix)) - res.push_back(filename); - } - LOG_TRACE(getLogger("DataLakeMetadataReadHelper"), "Listed {} files", res.size()); - return res; -} - -Strings HudiMetadataParser::getFiles( - ObjectStoragePtr object_storage, - StorageObjectStorageConfigurationPtr configuration, - ContextPtr) -{ - auto log = getLogger("HudiMetadataParser"); - - const auto keys = listFiles(object_storage, *configuration, "", Poco::toLower(configuration->format)); + auto log = getLogger("HudiMetadata"); + const auto keys = listFiles(*object_storage, *configuration, "", Poco::toLower(configuration->format)); using Partition = std::string; using FileID = std::string; @@ -75,7 +53,7 @@ Strings HudiMetadataParser::getFiles( String key; UInt64 timestamp = 0; }; - std::unordered_map> data_files; + std::unordered_map> files; for (const auto & key : keys) { @@ -90,7 +68,7 @@ Strings HudiMetadataParser::getFiles( const auto & file_id = file_parts[0]; const auto timestamp = parse(file_parts[2]); - auto & file_info = data_files[partition][file_id]; + auto & file_info = files[partition][file_id]; if (file_info.timestamp == 0 || file_info.timestamp < timestamp) { file_info.key = key; @@ -99,7 +77,7 @@ Strings HudiMetadataParser::getFiles( } Strings result; - for (auto & [partition, partition_data] : data_files) + for (auto & [partition, partition_data] : files) { LOG_TRACE(log, "Adding {} data files from partition {}", partition, partition_data.size()); for (auto & [file_id, file_data] : partition_data) @@ -108,4 +86,21 @@ Strings HudiMetadataParser::getFiles( return result; } +HudiMetadata::HudiMetadata( + ObjectStoragePtr object_storage_, + ConfigurationPtr configuration_, + ContextPtr context_) + : WithContext(context_) + , object_storage(object_storage_) + , configuration(configuration_) +{ +} + +Strings HudiMetadata::getDataFiles() const +{ + if (data_files.empty()) + data_files = getDataFilesImpl(); + return data_files; +} + } diff --git a/src/Storages/ObjectStorage/DataLakes/HudiMetadata.h b/src/Storages/ObjectStorage/DataLakes/HudiMetadata.h new file mode 100644 index 00000000000..ee8b1ea4978 --- /dev/null +++ b/src/Storages/ObjectStorage/DataLakes/HudiMetadata.h @@ -0,0 +1,51 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +class HudiMetadata final : public IDataLakeMetadata, private WithContext +{ +public: + using ConfigurationPtr = StorageObjectStorageConfigurationPtr; + + static constexpr auto name = "Hudi"; + + HudiMetadata( + ObjectStoragePtr object_storage_, + ConfigurationPtr configuration_, + ContextPtr context_); + + Strings getDataFiles() const override; + + NamesAndTypesList getTableSchema() const override { return {}; } + + bool operator ==(const IDataLakeMetadata & other) const override + { + const auto * hudi_metadata = dynamic_cast(&other); + return hudi_metadata && getDataFiles() == hudi_metadata->getDataFiles(); + } + + static DataLakeMetadataPtr create( + ObjectStoragePtr object_storage, + 
ConfigurationPtr configuration, + ContextPtr local_context) + { + return std::make_unique(object_storage, configuration, local_context); + } + +private: + const ObjectStoragePtr object_storage; + const ConfigurationPtr configuration; + mutable Strings data_files; + + Strings getDataFilesImpl() const; +}; + +} diff --git a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h new file mode 100644 index 00000000000..a2bd5adb947 --- /dev/null +++ b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h @@ -0,0 +1,19 @@ +#pragma once +#include +#include +#include + +namespace DB +{ + +class IDataLakeMetadata : boost::noncopyable +{ +public: + virtual ~IDataLakeMetadata() = default; + virtual Strings getDataFiles() const = 0; + virtual NamesAndTypesList getTableSchema() const = 0; + virtual bool operator==(const IDataLakeMetadata & other) const = 0; +}; +using DataLakeMetadataPtr = std::unique_ptr; + +} diff --git a/src/Storages/DataLakes/Iceberg/StorageIceberg.h b/src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h similarity index 61% rename from src/Storages/DataLakes/Iceberg/StorageIceberg.h rename to src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h index bca6e3c868f..95196cdd000 100644 --- a/src/Storages/DataLakes/Iceberg/StorageIceberg.h +++ b/src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h @@ -5,11 +5,13 @@ #if USE_AWS_S3 && USE_AVRO #include -#include #include #include #include -#include +#include +#include +#include +#include #include @@ -19,13 +21,10 @@ namespace DB /// Storage for read-only integration with Apache Iceberg tables in Amazon S3 (see https://iceberg.apache.org/) /// Right now it's implemented on top of StorageS3 and right now it doesn't support /// many Iceberg features like schema evolution, partitioning, positional and equality deletes. -/// TODO: Implement Iceberg as a separate storage using IObjectStorage -/// (to support all object storages, not only S3) and add support for missing Iceberg features. -template -class StorageIceberg : public StorageObjectStorage +template +class IStorageDataLake final : public StorageObjectStorage { public: - static constexpr auto name = "Iceberg"; using Storage = StorageObjectStorage; using ConfigurationPtr = Storage::ConfigurationPtr; @@ -41,12 +40,14 @@ public: bool attach) { auto object_storage = base_configuration->createOrUpdateObjectStorage(context); - std::unique_ptr metadata; + DataLakeMetadataPtr metadata; NamesAndTypesList schema_from_metadata; + ConfigurationPtr configuration = base_configuration->clone(); try { - metadata = parseIcebergMetadata(object_storage, base_configuration, context); + metadata = DataLakeMetadata::create(object_storage, base_configuration, context); schema_from_metadata = metadata->getTableSchema(); + configuration->getPaths() = metadata->getDataFiles(); } catch (...) { @@ -55,17 +56,14 @@ public: tryLogCurrentException(__PRETTY_FUNCTION__); } - auto configuration = base_configuration->clone(); - configuration->getPaths() = metadata->getDataFiles(); - - return std::make_shared>( + return std::make_shared>( base_configuration, std::move(metadata), configuration, object_storage, engine_name_, context, table_id_, columns_.empty() ? 
ColumnsDescription(schema_from_metadata) : columns_, constraints_, comment_, format_settings_); } - String getName() const override { return name; } + String getName() const override { return DataLakeMetadata::name; } static ColumnsDescription getTableStructureFromData( ObjectStoragePtr object_storage_, @@ -73,7 +71,7 @@ public: const std::optional &, ContextPtr local_context) { - auto metadata = parseIcebergMetadata(object_storage_, base_configuration, local_context); + auto metadata = DataLakeMetadata::create(object_storage_, base_configuration, local_context); return ColumnsDescription(metadata->getTableSchema()); } @@ -86,24 +84,25 @@ public: if (updated) Storage::object_storage = new_object_storage; - auto new_metadata = parseIcebergMetadata(Storage::object_storage, base_configuration, local_context); + auto new_metadata = DataLakeMetadata::create(Storage::object_storage, base_configuration, local_context); - if (!current_metadata || new_metadata->getVersion() != current_metadata->getVersion()) + if (!current_metadata || !(*current_metadata == *new_metadata)) current_metadata = std::move(new_metadata); - else if (updated) - { - auto updated_configuration = base_configuration->clone(); - /// If metadata wasn't changed, we won't list data files again. - updated_configuration->getPaths() = current_metadata->getDataFiles(); - Storage::configuration = updated_configuration; - } + else if (!updated) + return {Storage::configuration, Storage::object_storage}; + + auto updated_configuration = base_configuration->clone(); + /// If metadata wasn't changed, we won't list data files again. + updated_configuration->getPaths() = current_metadata->getDataFiles(); + Storage::configuration = updated_configuration; + return {Storage::configuration, Storage::object_storage}; } template - StorageIceberg( + IStorageDataLake( ConfigurationPtr base_configuration_, - std::unique_ptr metadata_, + DataLakeMetadataPtr metadata_, Args &&... args) : Storage(std::forward(args)...) 
, base_configuration(base_configuration_) @@ -113,8 +112,13 @@ public: private: ConfigurationPtr base_configuration; - std::unique_ptr current_metadata; + DataLakeMetadataPtr current_metadata; }; + +using StorageIceberg = IStorageDataLake; +using StorageDeltaLake = IStorageDataLake; +using StorageHudi = IStorageDataLake; + } #endif diff --git a/src/Storages/DataLakes/Iceberg/IcebergMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/IcebergMetadata.cpp similarity index 96% rename from src/Storages/DataLakes/Iceberg/IcebergMetadata.cpp rename to src/Storages/ObjectStorage/DataLakes/IcebergMetadata.cpp index 5543e60e7a7..8ee6f002ca6 100644 --- a/src/Storages/DataLakes/Iceberg/IcebergMetadata.cpp +++ b/src/Storages/ObjectStorage/DataLakes/IcebergMetadata.cpp @@ -24,7 +24,8 @@ #include #include #include -#include +#include +#include #include #include @@ -332,25 +333,6 @@ MutableColumns parseAvro( return columns; } -std::vector listFiles( - const ObjectStoragePtr & object_storage, - const StorageObjectStorageConfiguration & configuration, - const String & prefix, const String & suffix) -{ - auto key = std::filesystem::path(configuration.getPath()) / prefix; - RelativePathsWithMetadata files_with_metadata; - object_storage->listObjects(key, files_with_metadata, 0); - Strings res; - for (const auto & file_with_metadata : files_with_metadata) - { - const auto & filename = file_with_metadata->relative_path; - if (filename.ends_with(suffix)) - res.push_back(filename); - } - LOG_TRACE(getLogger("DataLakeMetadataReadHelper"), "Listed {} files", res.size()); - return res; -} - /** * Each version of table metadata is stored in a `metadata` directory and * has one of 2 formats: @@ -361,7 +343,7 @@ std::pair getMetadataFileAndVersion( ObjectStoragePtr object_storage, const StorageObjectStorageConfiguration & configuration) { - const auto metadata_files = listFiles(object_storage, configuration, "metadata", ".metadata.json"); + const auto metadata_files = listFiles(*object_storage, configuration, "metadata", ".metadata.json"); if (metadata_files.empty()) { throw Exception( @@ -394,14 +376,14 @@ std::pair getMetadataFileAndVersion( } -std::unique_ptr parseIcebergMetadata( +DataLakeMetadataPtr IcebergMetadata::create( ObjectStoragePtr object_storage, StorageObjectStorageConfigurationPtr configuration, - ContextPtr context_) + ContextPtr local_context) { const auto [metadata_version, metadata_file_path] = getMetadataFileAndVersion(object_storage, *configuration); LOG_DEBUG(getLogger("IcebergMetadata"), "Parse metadata {}", metadata_file_path); - auto read_settings = context_->getReadSettings(); + auto read_settings = local_context->getReadSettings(); auto buf = object_storage->readObject(StoredObject(metadata_file_path), read_settings); String json_str; readJSONObjectPossiblyInvalid(json_str, *buf); @@ -411,7 +393,7 @@ std::unique_ptr parseIcebergMetadata( Poco::JSON::Object::Ptr object = json.extract(); auto format_version = object->getValue("format-version"); - auto [schema, schema_id] = parseTableSchema(object, format_version, context_->getSettingsRef().iceberg_engine_ignore_schema_evolution); + auto [schema, schema_id] = parseTableSchema(object, format_version, local_context->getSettingsRef().iceberg_engine_ignore_schema_evolution); auto current_snapshot_id = object->getValue("current-snapshot-id"); auto snapshots = object->get("snapshots").extract(); @@ -428,7 +410,7 @@ std::unique_ptr parseIcebergMetadata( } } - return std::make_unique(object_storage, configuration, context_, metadata_version, 
format_version, manifest_list_file, schema_id, schema); + return std::make_unique(object_storage, configuration, local_context, metadata_version, format_version, manifest_list_file, schema_id, schema); } /** @@ -456,7 +438,7 @@ std::unique_ptr parseIcebergMetadata( * │ 1 │ 2252246380142525104 │ ('/iceberg_data/db/table_name/data/a=2/00000-1-c9535a00-2f4f-405c-bcfa-6d4f9f477235-00003.parquet','PARQUET',(2),1,631,67108864,[(1,46),(2,48)],[(1,1),(2,1)],[(1,0),(2,0)],[],[(1,'\0\0\0\0\0\0\0'),(2,'3')],[(1,'\0\0\0\0\0\0\0'),(2,'3')],NULL,[4],0) │ * └────────┴─────────────────────┴────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ */ -Strings IcebergMetadata::getDataFiles() +Strings IcebergMetadata::getDataFiles() const { if (!data_files.empty()) return data_files; diff --git a/src/Storages/DataLakes/Iceberg/IcebergMetadata.h b/src/Storages/ObjectStorage/DataLakes/IcebergMetadata.h similarity index 76% rename from src/Storages/DataLakes/Iceberg/IcebergMetadata.h rename to src/Storages/ObjectStorage/DataLakes/IcebergMetadata.h index a289715848f..f88e3eecc67 100644 --- a/src/Storages/DataLakes/Iceberg/IcebergMetadata.h +++ b/src/Storages/ObjectStorage/DataLakes/IcebergMetadata.h @@ -5,7 +5,8 @@ #include #include #include -#include +#include +#include namespace DB { @@ -57,12 +58,16 @@ namespace DB * "metadata-log" : [ ] * } */ -class IcebergMetadata : WithContext +class IcebergMetadata : public IDataLakeMetadata, private WithContext { public: + using ConfigurationPtr = StorageObjectStorageConfigurationPtr; + + static constexpr auto name = "Iceberg"; + IcebergMetadata( ObjectStoragePtr object_storage_, - StorageObjectStorageConfigurationPtr configuration_, + ConfigurationPtr configuration_, ContextPtr context_, Int32 metadata_version_, Int32 format_version_, @@ -72,31 +77,36 @@ public: /// Get data files. On first request it reads manifest_list file and iterates through manifest files to find all data files. /// All subsequent calls will return saved list of files (because it cannot be changed without changing metadata file) - Strings getDataFiles(); + Strings getDataFiles() const override; /// Get table schema parsed from metadata. 
- NamesAndTypesList getTableSchema() const { return schema; } + NamesAndTypesList getTableSchema() const override { return schema; } - size_t getVersion() const { return metadata_version; } + bool operator ==(const IDataLakeMetadata & other) const override + { + const auto * iceberg_metadata = dynamic_cast(&other); + return iceberg_metadata && getVersion() == iceberg_metadata->getVersion(); + } + + static DataLakeMetadataPtr create( + ObjectStoragePtr object_storage, + ConfigurationPtr configuration, + ContextPtr local_context); private: - ObjectStoragePtr object_storage; - StorageObjectStorageConfigurationPtr configuration; + size_t getVersion() const { return metadata_version; } + + const ObjectStoragePtr object_storage; + const ConfigurationPtr configuration; Int32 metadata_version; Int32 format_version; String manifest_list_file; Int32 current_schema_id; NamesAndTypesList schema; - Strings data_files; + mutable Strings data_files; LoggerPtr log; - }; -std::unique_ptr parseIcebergMetadata( - ObjectStoragePtr object_storage, - StorageObjectStorageConfigurationPtr configuration, - ContextPtr context); - } #endif diff --git a/src/Storages/ObjectStorage/DataLakes/registerDataLakeStorages.cpp b/src/Storages/ObjectStorage/DataLakes/registerDataLakeStorages.cpp new file mode 100644 index 00000000000..d93c14dfe32 --- /dev/null +++ b/src/Storages/ObjectStorage/DataLakes/registerDataLakeStorages.cpp @@ -0,0 +1,83 @@ +#include "config.h" + +#if USE_AWS_S3 + +#include +#include +#include +#include +#include + + +namespace DB +{ + +#if USE_AVRO /// StorageIceberg depending on Avro to parse metadata with Avro format. + +void registerStorageIceberg(StorageFactory & factory) +{ + factory.registerStorage( + "Iceberg", + [&](const StorageFactory::Arguments & args) + { + auto configuration = std::make_shared(); + StorageObjectStorageConfiguration::initialize(*configuration, args.engine_args, args.getLocalContext(), false); + + return StorageIceberg::create( + configuration, args.getContext(), "Iceberg", args.table_id, args.columns, + args.constraints, args.comment, std::nullopt, args.attach); + }, + { + .supports_settings = false, + .supports_schema_inference = true, + .source_access_type = AccessType::S3, + }); +} + +#endif + +#if USE_PARQUET +void registerStorageDeltaLake(StorageFactory & factory) +{ + factory.registerStorage( + "DeltaLake", + [&](const StorageFactory::Arguments & args) + { + auto configuration = std::make_shared(); + StorageObjectStorageConfiguration::initialize(*configuration, args.engine_args, args.getLocalContext(), false); + + return StorageDeltaLake::create( + configuration, args.getContext(), "DeltaLake", args.table_id, args.columns, + args.constraints, args.comment, std::nullopt, args.attach); + }, + { + .supports_settings = false, + .supports_schema_inference = true, + .source_access_type = AccessType::S3, + }); +} +#endif + +void registerStorageHudi(StorageFactory & factory) +{ + factory.registerStorage( + "Hudi", + [&](const StorageFactory::Arguments & args) + { + auto configuration = std::make_shared(); + StorageObjectStorageConfiguration::initialize(*configuration, args.engine_args, args.getLocalContext(), false); + + return StorageHudi::create( + configuration, args.getContext(), "Hudi", args.table_id, args.columns, + args.constraints, args.comment, std::nullopt, args.attach); + }, + { + .supports_settings = false, + .supports_schema_inference = true, + .source_access_type = AccessType::S3, + }); +} + +} + +#endif diff --git 
a/src/Storages/ObjectStorage/HDFS/Configuration.cpp b/src/Storages/ObjectStorage/HDFS/Configuration.cpp new file mode 100644 index 00000000000..c80237b3055 --- /dev/null +++ b/src/Storages/ObjectStorage/HDFS/Configuration.cpp @@ -0,0 +1,57 @@ +#include + +#if USE_HDFS +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +StorageHDFSConfiguration::StorageHDFSConfiguration(const StorageHDFSConfiguration & other) +{ + url = other.url; + path = other.path; + paths = other.paths; + format = other.format; + compression_method = other.compression_method; + structure = other.structure; +} + +void StorageHDFSConfiguration::check(ContextPtr context) const +{ + context->getRemoteHostFilter().checkURL(Poco::URI(url)); + checkHDFSURL(url); +} + +ObjectStoragePtr StorageHDFSConfiguration::createOrUpdateObjectStorage(ContextPtr context, bool is_readonly) /// NOLINT +{ + UNUSED(is_readonly); + auto settings = std::make_unique(); + return std::make_shared(url, std::move(settings), context->getConfigRef()); +} + +void StorageHDFSConfiguration::fromAST(ASTs & args, ContextPtr, bool /* with_structure */) +{ + url = checkAndGetLiteralArgument(args[0], "url"); + + String format_name = "auto"; + if (args.size() > 1) + format_name = checkAndGetLiteralArgument(args[1], "format_name"); + + if (format_name == "auto") + format_name = FormatFactory::instance().getFormatFromFileName(url, true); + + String compression_method; + if (args.size() == 3) + compression_method = checkAndGetLiteralArgument(args[2], "compression_method"); + else + compression_method = "auto"; + +} +} + +#endif diff --git a/src/Storages/ObjectStorage/HDFS/Configuration.h b/src/Storages/ObjectStorage/HDFS/Configuration.h new file mode 100644 index 00000000000..03fb0824123 --- /dev/null +++ b/src/Storages/ObjectStorage/HDFS/Configuration.h @@ -0,0 +1,45 @@ +#pragma once +#include "config.h" + +#if USE_HDFS +#include +#include +#include +#include + +namespace DB +{ + +class StorageHDFSConfiguration : public StorageObjectStorageConfiguration +{ +public: + StorageHDFSConfiguration() = default; + StorageHDFSConfiguration(const StorageHDFSConfiguration & other); + + Path getPath() const override { return path; } + void setPath(const Path & path_) override { path = path_; } + + const Paths & getPaths() const override { return paths; } + Paths & getPaths() override { return paths; } + + String getNamespace() const override { return ""; } + String getDataSourceDescription() override { return url; } + + void check(ContextPtr context) const override; + ObjectStoragePtr createOrUpdateObjectStorage(ContextPtr context, bool is_readonly = true) override; /// NOLINT + StorageObjectStorageConfigurationPtr clone() override { return std::make_shared(*this); } + + void fromNamedCollection(const NamedCollection &) override {} + void fromAST(ASTs & args, ContextPtr, bool /* with_structure */) override; + + static void addStructureToArgs(ASTs &, const String &, ContextPtr) {} + +private: + String url; + String path; + std::vector paths; +}; + +} + +#endif diff --git a/src/Storages/ObjectStorage/HDFSConfiguration.h b/src/Storages/ObjectStorage/HDFSConfiguration.h deleted file mode 100644 index aa45c634042..00000000000 --- a/src/Storages/ObjectStorage/HDFSConfiguration.h +++ /dev/null @@ -1,81 +0,0 @@ -#pragma once -#include "config.h" - -#if USE_HDFS - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace DB -{ -namespace ErrorCodes -{ - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; -} - -class 
StorageHDFSConfiguration : public StorageObjectStorageConfiguration -{ -public: - Path getPath() const override { return path; } - void setPath(const Path & path_) override { path = path_; } - - const Paths & getPaths() const override { return paths; } - Paths & getPaths() override { return paths; } - - String getNamespace() const override { return ""; } - String getDataSourceDescription() override { return url; } - - void check(ContextPtr context) const override - { - context->getRemoteHostFilter().checkURL(Poco::URI(url)); - checkHDFSURL(url); - } - StorageObjectStorageConfigurationPtr clone() override - { - auto configuration = std::make_shared(); - return configuration; - } - - ObjectStoragePtr createOrUpdateObjectStorage(ContextPtr context, bool is_readonly = true) override /// NOLINT - { - UNUSED(is_readonly); - auto settings = std::make_unique(); - return std::make_shared(url, std::move(settings), context->getConfigRef()); - } - - void fromNamedCollection(const NamedCollection &) override {} - void fromAST(ASTs & args, ContextPtr, bool /* with_structure */) override - { - url = checkAndGetLiteralArgument(args[0], "url"); - - String format_name = "auto"; - if (args.size() > 1) - format_name = checkAndGetLiteralArgument(args[1], "format_name"); - - if (format_name == "auto") - format_name = FormatFactory::instance().getFormatFromFileName(url, true); - - String compression_method; - if (args.size() == 3) - { - compression_method = checkAndGetLiteralArgument(args[2], "compression_method"); - } else compression_method = "auto"; - - } - static void addStructureToArgs(ASTs &, const String &, ContextPtr) {} - -private: - String url; - String path; - std::vector paths; -}; - -} - -#endif diff --git a/src/Storages/ObjectStorage/ReadBufferIterator.cpp b/src/Storages/ObjectStorage/ReadBufferIterator.cpp index dcdf36dbcf5..a3e19b907bc 100644 --- a/src/Storages/ObjectStorage/ReadBufferIterator.cpp +++ b/src/Storages/ObjectStorage/ReadBufferIterator.cpp @@ -67,11 +67,11 @@ std::optional ReadBufferIterator::tryGetColumnsFromCache( auto get_last_mod_time = [&] -> std::optional { if (object_info->metadata) - return object_info->metadata->last_modified->epochMicroseconds(); + return object_info->metadata->last_modified.epochMicroseconds(); else { object_info->metadata = object_storage->getObjectMetadata(object_info->relative_path); - return object_info->metadata->last_modified->epochMicroseconds(); + return object_info->metadata->last_modified.epochMicroseconds(); } }; diff --git a/src/Storages/ObjectStorage/ReadFromStorageObjectStorage.cpp b/src/Storages/ObjectStorage/ReadFromStorageObjectStorage.cpp index 2c27c816078..b33eea7d354 100644 --- a/src/Storages/ObjectStorage/ReadFromStorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/ReadFromStorageObjectStorage.cpp @@ -59,7 +59,6 @@ void ReadFromStorageObejctStorage::applyFilters() const ActionsDAG::Node * predicate = nullptr; if (filter_actions_dag) predicate = filter_actions_dag->getOutputs().at(0); - createIterator(predicate); } diff --git a/src/Storages/ObjectStorage/S3Configuration.cpp b/src/Storages/ObjectStorage/S3/Configuration.cpp similarity index 97% rename from src/Storages/ObjectStorage/S3Configuration.cpp rename to src/Storages/ObjectStorage/S3/Configuration.cpp index 5a5412019f5..f057745d669 100644 --- a/src/Storages/ObjectStorage/S3Configuration.cpp +++ b/src/Storages/ObjectStorage/S3/Configuration.cpp @@ -1,4 +1,7 @@ -#include +#include + +#if USE_AWS_S3 + #include #include #include @@ -14,6 +17,7 @@ namespace DB namespace 
ErrorCodes { extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int LOGICAL_ERROR; } static const std::unordered_set required_configuration_keys = { @@ -51,17 +55,19 @@ void StorageS3Configuration::check(ContextPtr context) const context->getGlobalContext()->getHTTPHeaderFilter().checkHeaders(headers_from_ast); } -StorageObjectStorageConfigurationPtr StorageS3Configuration::clone() +StorageS3Configuration::StorageS3Configuration(const StorageS3Configuration & other) { - auto configuration = std::make_shared(); - configuration->url = url; - configuration->auth_settings = auth_settings; - configuration->request_settings = request_settings; - configuration->static_configuration = static_configuration; - configuration->headers_from_ast = headers_from_ast; - configuration->keys = keys; - configuration->initialized = initialized; - return configuration; + url = other.url; + auth_settings = other.auth_settings; + request_settings = other.request_settings; + static_configuration = other.static_configuration; + headers_from_ast = other.headers_from_ast; + keys = other.keys; + initialized = other.initialized; + + format = other.format; + compression_method = other.compression_method; + structure = other.structure; } ObjectStoragePtr StorageS3Configuration::createOrUpdateObjectStorage(ContextPtr context, bool /* is_readonly */) /// NOLINT @@ -489,3 +495,5 @@ void StorageS3Configuration::addStructureToArgs(ASTs & args, const String & stru } } + +#endif diff --git a/src/Storages/ObjectStorage/S3Configuration.h b/src/Storages/ObjectStorage/S3/Configuration.h similarity index 81% rename from src/Storages/ObjectStorage/S3Configuration.h rename to src/Storages/ObjectStorage/S3/Configuration.h index c953bc25c4e..037cf2eae87 100644 --- a/src/Storages/ObjectStorage/S3Configuration.h +++ b/src/Storages/ObjectStorage/S3/Configuration.h @@ -1,7 +1,12 @@ #pragma once + +#include "config.h" + +#if USE_AWS_S3 + #include #include -#include +#include namespace DB { @@ -9,6 +14,9 @@ namespace DB class StorageS3Configuration : public StorageObjectStorageConfiguration { public: + StorageS3Configuration() = default; + StorageS3Configuration(const StorageS3Configuration & other); + Path getPath() const override { return url.key; } void setPath(const Path & path) override { url.key = path; } @@ -19,9 +27,8 @@ public: String getDataSourceDescription() override; void check(ContextPtr context) const override; - StorageObjectStorageConfigurationPtr clone() override; - ObjectStoragePtr createOrUpdateObjectStorage(ContextPtr context, bool is_readonly = true) override; /// NOLINT + StorageObjectStorageConfigurationPtr clone() override { return std::make_shared(*this); } void fromNamedCollection(const NamedCollection & collection) override; void fromAST(ASTs & args, ContextPtr context, bool with_structure) override; @@ -44,3 +51,5 @@ private: }; } + +#endif diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index 9a7260ea47c..08d7c9d0014 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include #include @@ -24,8 +24,6 @@ namespace ErrorCodes { extern const int BAD_ARGUMENTS; extern const int DATABASE_ACCESS_DENIED; - extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; - extern const int LOGICAL_ERROR; extern const int NOT_IMPLEMENTED; } @@ -59,7 +57,6 @@ std::unique_ptr getStorageMetadata( 
storage_metadata->setColumns(columns); } - storage_metadata->setConstraints(constraints); storage_metadata->setComment(comment); return storage_metadata; @@ -264,10 +261,7 @@ SinkToStoragePtr StorageObjectStorage::write( template void StorageObjectStorage::truncate( - const ASTPtr &, - const StorageMetadataPtr &, - ContextPtr, - TableExclusiveLockHolder &) + const ASTPtr &, const StorageMetadataPtr &, ContextPtr, TableExclusiveLockHolder &) { if (configuration->withGlobs()) { diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.h b/src/Storages/ObjectStorage/StorageObjectStorage.h index 0b29845ba5c..6f18153c7af 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.h +++ b/src/Storages/ObjectStorage/StorageObjectStorage.h @@ -95,8 +95,7 @@ public: ContextPtr context); protected: - virtual std::pair - updateConfigurationAndGetCopy(ContextPtr local_context); + virtual std::pair updateConfigurationAndGetCopy(ContextPtr local_context); const std::string engine_name; const NamesAndTypesList virtual_columns; @@ -110,7 +109,7 @@ protected: }; using StorageS3 = StorageObjectStorage; -using StorageAzureBlobStorage = StorageObjectStorage; +using StorageAzureBlob = StorageObjectStorage; using StorageHDFS = StorageObjectStorage; } diff --git a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp index 39cd5d8eca6..c03bbd1a45d 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/Storages/ObjectStorage/StorageObjectStorageCluster.h b/src/Storages/ObjectStorage/StorageObjectStorageCluster.h index aae8f704a73..507de20e888 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageCluster.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageCluster.h @@ -67,5 +67,8 @@ using StorageS3Cluster = StorageObjectStorageCluster; #endif +#if USE_HDFS +using StorageHDFSCluster = StorageObjectStorageCluster; +#endif } diff --git a/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.cpp b/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.cpp index 2d5760ed9d8..651f1d25ec1 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.cpp @@ -1,4 +1,4 @@ -#include +#include namespace DB diff --git a/src/Storages/ObjectStorage/StorageObejctStorageConfiguration.h b/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.h similarity index 99% rename from src/Storages/ObjectStorage/StorageObejctStorageConfiguration.h rename to src/Storages/ObjectStorage/StorageObjectStorageConfiguration.h index 427d6a8d453..04b2d8e8fd9 100644 --- a/src/Storages/ObjectStorage/StorageObejctStorageConfiguration.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.h @@ -39,9 +39,8 @@ public: std::string getPathWithoutGlob() const; virtual void check(ContextPtr context) const = 0; - virtual StorageObjectStorageConfigurationPtr clone() = 0; - virtual ObjectStoragePtr createOrUpdateObjectStorage(ContextPtr context, bool is_readonly = true) = 0; /// NOLINT + virtual StorageObjectStorageConfigurationPtr clone() = 0; String format = "auto"; String compression_method = "auto"; diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp new file mode 100644 index 00000000000..37f93a2b82f --- /dev/null 
+++ b/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp @@ -0,0 +1,127 @@ +#include "StorageObjectStorageSink.h" +#include +#include + +namespace DB +{ + +StorageObjectStorageSink::StorageObjectStorageSink( + ObjectStoragePtr object_storage, + StorageObjectStorageConfigurationPtr configuration, + std::optional format_settings_, + const Block & sample_block_, + ContextPtr context, + const std::string & blob_path) + : SinkToStorage(sample_block_) + , sample_block(sample_block_) + , format_settings(format_settings_) +{ + const auto & settings = context->getSettingsRef(); + const auto path = blob_path.empty() ? configuration->getPaths().back() : blob_path; + const auto chosen_compression_method = chooseCompressionMethod(path, configuration->compression_method); + + auto buffer = object_storage->writeObject( + StoredObject(path), WriteMode::Rewrite, std::nullopt, DBMS_DEFAULT_BUFFER_SIZE, context->getWriteSettings()); + + write_buf = wrapWriteBufferWithCompressionMethod( + std::move(buffer), + chosen_compression_method, + static_cast(settings.output_format_compression_level), + static_cast(settings.output_format_compression_zstd_window_log)); + + writer = FormatFactory::instance().getOutputFormatParallelIfPossible( + configuration->format, *write_buf, sample_block, context, format_settings); +} + +void StorageObjectStorageSink::consume(Chunk chunk) +{ + std::lock_guard lock(cancel_mutex); + if (cancelled) + return; + writer->write(getHeader().cloneWithColumns(chunk.detachColumns())); +} + +void StorageObjectStorageSink::onCancel() +{ + std::lock_guard lock(cancel_mutex); + finalize(); + cancelled = true; +} + +void StorageObjectStorageSink::onException(std::exception_ptr exception) +{ + std::lock_guard lock(cancel_mutex); + try + { + std::rethrow_exception(exception); + } + catch (...) + { + /// An exception context is needed to proper delete write buffers without finalization. + release(); + } +} + +void StorageObjectStorageSink::onFinish() +{ + std::lock_guard lock(cancel_mutex); + finalize(); +} + +void StorageObjectStorageSink::finalize() +{ + if (!writer) + return; + + try + { + writer->finalize(); + writer->flush(); + write_buf->finalize(); + } + catch (...) + { + /// Stop ParallelFormattingOutputFormat correctly. 
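+ /// release() resets the writer (stopping any parallel formatting threads) and finalizes write_buf before the exception propagates.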
+ release(); + throw; + } +} + +void StorageObjectStorageSink::release() +{ + writer.reset(); + write_buf->finalize(); +} + +PartitionedStorageObjectStorageSink::PartitionedStorageObjectStorageSink( + ObjectStoragePtr object_storage_, + StorageObjectStorageConfigurationPtr configuration_, + std::optional format_settings_, + const Block & sample_block_, + ContextPtr context_, + const ASTPtr & partition_by) + : PartitionedSink(partition_by, context_, sample_block_) + , object_storage(object_storage_) + , configuration(configuration_) + , format_settings(format_settings_) + , sample_block(sample_block_) + , context(context_) +{ +} + +SinkPtr PartitionedStorageObjectStorageSink::createSinkForPartition(const String & partition_id) +{ + auto blob = configuration->getPaths().back(); + auto partition_key = replaceWildcards(blob, partition_id); + validatePartitionKey(partition_key, true); + return std::make_shared( + object_storage, + configuration, + format_settings, + sample_block, + context, + partition_key + ); +} + +} diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSink.h b/src/Storages/ObjectStorage/StorageObjectStorageSink.h index a2d42d7fa9f..14298376d0e 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSink.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageSink.h @@ -1,9 +1,8 @@ #pragma once #include -#include -#include +#include #include -#include +#include namespace DB { @@ -16,64 +15,17 @@ public: std::optional format_settings_, const Block & sample_block_, ContextPtr context, - const std::string & blob_path = "") - : SinkToStorage(sample_block_) - , sample_block(sample_block_) - , format_settings(format_settings_) - { - const auto & settings = context->getSettingsRef(); - const auto path = blob_path.empty() ? configuration->getPaths().back() : blob_path; - const auto chosen_compression_method = chooseCompressionMethod(path, configuration->compression_method); - - auto buffer = object_storage->writeObject( - StoredObject(path), WriteMode::Rewrite, std::nullopt, DBMS_DEFAULT_BUFFER_SIZE, context->getWriteSettings()); - - write_buf = wrapWriteBufferWithCompressionMethod( - std::move(buffer), - chosen_compression_method, - static_cast(settings.output_format_compression_level), - static_cast(settings.output_format_compression_zstd_window_log)); - - writer = FormatFactory::instance().getOutputFormatParallelIfPossible( - configuration->format, *write_buf, sample_block, context, format_settings); - } + const std::string & blob_path = ""); String getName() const override { return "StorageObjectStorageSink"; } - void consume(Chunk chunk) override - { - std::lock_guard lock(cancel_mutex); - if (cancelled) - return; - writer->write(getHeader().cloneWithColumns(chunk.detachColumns())); - } + void consume(Chunk chunk) override; - void onCancel() override - { - std::lock_guard lock(cancel_mutex); - finalize(); - cancelled = true; - } + void onCancel() override; - void onException(std::exception_ptr exception) override - { - std::lock_guard lock(cancel_mutex); - try - { - std::rethrow_exception(exception); - } - catch (...) - { - /// An exception context is needed to proper delete write buffers without finalization. 
- release(); - } - } + void onException(std::exception_ptr exception) override; - void onFinish() override - { - std::lock_guard lock(cancel_mutex); - finalize(); - } + void onFinish() override; private: const Block sample_block; @@ -84,30 +36,8 @@ private: bool cancelled = false; std::mutex cancel_mutex; - void finalize() - { - if (!writer) - return; - - try - { - writer->finalize(); - writer->flush(); - write_buf->finalize(); - } - catch (...) - { - /// Stop ParallelFormattingOutputFormat correctly. - release(); - throw; - } - } - - void release() - { - writer.reset(); - write_buf->finalize(); - } + void finalize(); + void release(); }; class PartitionedStorageObjectStorageSink : public PartitionedSink @@ -119,30 +49,9 @@ public: std::optional format_settings_, const Block & sample_block_, ContextPtr context_, - const ASTPtr & partition_by) - : PartitionedSink(partition_by, context_, sample_block_) - , object_storage(object_storage_) - , configuration(configuration_) - , format_settings(format_settings_) - , sample_block(sample_block_) - , context(context_) - { - } + const ASTPtr & partition_by); - SinkPtr createSinkForPartition(const String & partition_id) override - { - auto blob = configuration->getPaths().back(); - auto partition_key = replaceWildcards(blob, partition_id); - validatePartitionKey(partition_key, true); - return std::make_shared( - object_storage, - configuration, - format_settings, - sample_block, - context, - partition_key - ); - } + SinkPtr createSinkForPartition(const String & partition_id) override; private: ObjectStoragePtr object_storage; diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index f170a46112f..1fda75897f9 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include #include @@ -26,6 +26,8 @@ namespace DB namespace ErrorCodes { extern const int CANNOT_COMPILE_REGEXP; + extern const int BAD_ARGUMENTS; + extern const int LOGICAL_ERROR; } StorageObjectStorageSource::StorageObjectStorageSource( @@ -182,8 +184,8 @@ std::optional StorageObjectStorageSource::tryGetNumRowsFromCache(const O auto get_last_mod_time = [&]() -> std::optional { - return object_info->metadata && object_info->metadata->last_modified - ? object_info->metadata->last_modified->epochMicroseconds() + return object_info->metadata + ? object_info->metadata->last_modified.epochMicroseconds() : 0; }; return schema_cache.tryGetNumRows(cache_key, get_last_mod_time); @@ -472,4 +474,29 @@ ObjectInfoPtr StorageObjectStorageSource::KeysIterator::next(size_t /* processor return std::make_shared(key, metadata); } +StorageObjectStorageSource::ReaderHolder::ReaderHolder( + ObjectInfoPtr object_info_, + std::unique_ptr read_buf_, + std::shared_ptr source_, + std::unique_ptr pipeline_, + std::unique_ptr reader_) + : object_info(std::move(object_info_)) + , read_buf(std::move(read_buf_)) + , source(std::move(source_)) + , pipeline(std::move(pipeline_)) + , reader(std::move(reader_)) +{ +} + +StorageObjectStorageSource::ReaderHolder & StorageObjectStorageSource::ReaderHolder::operator=(ReaderHolder && other) noexcept +{ + /// The order of destruction is important. + /// reader uses pipeline, pipeline uses read_buf. 
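+ /// Assign in this order so the old reader is destroyed before the old pipeline, and the old pipeline before the old read buffer.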
+ reader = std::move(other.reader); + pipeline = std::move(other.pipeline); + source = std::move(other.source); + read_buf = std::move(other.read_buf); + object_info = std::move(other.object_info); + return *this; +} } diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.h b/src/Storages/ObjectStorage/StorageObjectStorageSource.h index 0d6a6b71271..214a7de14d6 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.h @@ -75,32 +75,16 @@ protected: std::unique_ptr read_buf_, std::shared_ptr source_, std::unique_ptr pipeline_, - std::unique_ptr reader_) - : object_info(std::move(object_info_)) - , read_buf(std::move(read_buf_)) - , source(std::move(source_)) - , pipeline(std::move(pipeline_)) - , reader(std::move(reader_)) {} + std::unique_ptr reader_); ReaderHolder() = default; ReaderHolder(ReaderHolder && other) noexcept { *this = std::move(other); } + ReaderHolder & operator=(ReaderHolder && other) noexcept; explicit operator bool() const { return reader != nullptr; } PullingPipelineExecutor * operator->() { return reader.get(); } const PullingPipelineExecutor * operator->() const { return reader.get(); } - ReaderHolder & operator=(ReaderHolder && other) noexcept - { - /// The order of destruction is important. - /// reader uses pipeline, pipeline uses read_buf. - reader = std::move(other.reader); - pipeline = std::move(other.pipeline); - source = std::move(other.source); - read_buf = std::move(other.read_buf); - object_info = std::move(other.object_info); - return *this; - } - const String & getRelativePath() const { return object_info->relative_path; } const ObjectInfo & getObjectInfo() const { return *object_info; } const IInputFormat * getInputFormat() const { return dynamic_cast(source.get()); } @@ -143,7 +127,7 @@ public: size_t estimatedKeysCount() override { return 0; } /// TODO FIXME - ObjectInfoPtr next(size_t) override { return std::make_shared( callback(), ObjectMetadata{} ); } + ObjectInfoPtr next(size_t) override { return std::make_shared(callback(), ObjectMetadata{}); } private: ReadTaskCallback callback; diff --git a/src/Storages/ObjectStorage/StorageObjectStorage_fwd_internal.h b/src/Storages/ObjectStorage/StorageObjectStorage_fwd_internal.h index 51be7419e1c..241e2f20962 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage_fwd_internal.h +++ b/src/Storages/ObjectStorage/StorageObjectStorage_fwd_internal.h @@ -1,4 +1,5 @@ -#include +#pragma once +#include namespace DB { diff --git a/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp b/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp index f7ab37490e1..e23457c04e9 100644 --- a/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp @@ -1,6 +1,6 @@ -#include -#include -#include +#include +#include +#include #include #include #include @@ -8,6 +8,11 @@ namespace DB { +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; +} + template static std::shared_ptr> createStorageObjectStorage( const StorageFactory::Arguments & args, @@ -149,6 +154,7 @@ void registerStorageObjectStorage(StorageFactory & factory) #if USE_HDFS registerStorageHDFS(factory); #endif + UNUSED(factory); } } diff --git a/src/Storages/ObjectStorageConfiguration.h b/src/Storages/ObjectStorageConfiguration.h deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/src/Storages/S3Queue/S3QueueTableMetadata.h b/src/Storages/S3Queue/S3QueueTableMetadata.h index 
70dd8f27d71..9502a3c5e70 100644 --- a/src/Storages/S3Queue/S3QueueTableMetadata.h +++ b/src/Storages/S3Queue/S3QueueTableMetadata.h @@ -3,7 +3,7 @@ #if USE_AWS_S3 #include -#include +#include #include namespace DB diff --git a/src/Storages/S3Queue/StorageS3Queue.cpp b/src/Storages/S3Queue/StorageS3Queue.cpp index fc4ef77ebb9..b03224cedff 100644 --- a/src/Storages/S3Queue/StorageS3Queue.cpp +++ b/src/Storages/S3Queue/StorageS3Queue.cpp @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include @@ -36,6 +36,13 @@ namespace ProfileEvents extern const Event S3ListObjects; } +namespace CurrentMetrics +{ + extern const Metric ObjectStorageS3Threads; + extern const Metric ObjectStorageS3ThreadsActive; + extern const Metric ObjectStorageS3ThreadsScheduled; +} + namespace DB { diff --git a/src/Storages/System/StorageSystemSchemaInferenceCache.cpp b/src/Storages/System/StorageSystemSchemaInferenceCache.cpp index 77d5be3698c..a53ce440c3f 100644 --- a/src/Storages/System/StorageSystemSchemaInferenceCache.cpp +++ b/src/Storages/System/StorageSystemSchemaInferenceCache.cpp @@ -81,7 +81,7 @@ void StorageSystemSchemaInferenceCache::fillData(MutableColumns & res_columns, C #endif fillDataImpl(res_columns, StorageURL::getSchemaCache(context), "URL"); #if USE_AZURE_BLOB_STORAGE - fillDataImpl(res_columns, StorageAzureBlobStorage::getSchemaCache(context), "Azure"); /// FIXME + fillDataImpl(res_columns, StorageAzureBlob::getSchemaCache(context), "Azure"); /// FIXME #endif } diff --git a/src/TableFunctions/ITableFunctionDataLake.h b/src/TableFunctions/ITableFunctionDataLake.h index 0ffa1460d78..8edba4e6e4b 100644 --- a/src/TableFunctions/ITableFunctionDataLake.h +++ b/src/TableFunctions/ITableFunctionDataLake.h @@ -1,18 +1,17 @@ #pragma once #include "config.h" - -#if USE_AWS_S3 - -# include -# include -# include -# include -# include -# include -#include +#include +#include +#include +#include +#include +#include #include -#include +#include +#include +#include + namespace DB { @@ -26,18 +25,20 @@ public: protected: StoragePtr executeImpl( - const ASTPtr & /*ast_function*/, + const ASTPtr & /* ast_function */, ContextPtr context, const std::string & table_name, - ColumnsDescription /*cached_columns*/, + ColumnsDescription cached_columns, bool /*is_insert_query*/) const override { ColumnsDescription columns; - if (TableFunction::configuration->structure != "auto") - columns = parseColumnsListFromString(TableFunction::configuration->structure, context); + auto configuration = TableFunction::getConfiguration(); + if (configuration->structure != "auto") + columns = parseColumnsListFromString(configuration->structure, context); + else if (!cached_columns.empty()) + columns = cached_columns; - StorageObjectStorageConfigurationPtr configuration = TableFunction::configuration; - StoragePtr storage = StorageIceberg>::create( + StoragePtr storage = Storage::create( configuration, context, "", StorageID(TableFunction::getDatabaseName(), table_name), columns, ConstraintsDescription{}, String{}, std::nullopt, false); @@ -45,26 +46,53 @@ protected: return storage; } - const char * getStorageTypeName() const override { return Storage::name; } + const char * getStorageTypeName() const override { return name; } - ColumnsDescription getActualTableStructure(ContextPtr context, bool /*is_insert_query*/) const override + ColumnsDescription getActualTableStructure(ContextPtr context, bool is_insert_query) const override { - if (TableFunction::configuration->structure == "auto") + auto configuration = 
TableFunction::getConfiguration(); + if (configuration->structure == "auto") { context->checkAccess(TableFunction::getSourceAccessType()); - return Storage::getTableStructureFromData(TableFunction::object_storage, TableFunction::configuration, std::nullopt, context); + auto object_storage = TableFunction::getObjectStorage(context, !is_insert_query); + return Storage::getTableStructureFromData(object_storage, configuration, std::nullopt, context); } - return parseColumnsListFromString(TableFunction::configuration->structure, context); + return parseColumnsListFromString(configuration->structure, context); } void parseArguments(const ASTPtr & ast_function, ContextPtr context) override { + auto configuration = TableFunction::getConfiguration(); + configuration->format = "Parquet"; /// Set default format to Parquet if it's not specified in arguments. - TableFunction::configuration->format = "Parquet"; TableFunction::parseArguments(ast_function, context); } }; -} +struct TableFunctionIcebergName +{ + static constexpr auto name = "iceberg"; +}; + +struct TableFunctionDeltaLakeName +{ + static constexpr auto name = "deltaLake"; +}; + +struct TableFunctionHudiName +{ + static constexpr auto name = "hudi"; +}; + +#if USE_AWS_S3 +#if USE_AVRO +using TableFunctionIceberg = ITableFunctionDataLake; #endif +#if USE_PARQUET +using TableFunctionDeltaLake = ITableFunctionDataLake; +#endif +using TableFunctionHudi = ITableFunctionDataLake; +#endif + +} diff --git a/src/TableFunctions/TableFunctionDeltaLake.cpp b/src/TableFunctions/TableFunctionDeltaLake.cpp deleted file mode 100644 index 08b62ed2612..00000000000 --- a/src/TableFunctions/TableFunctionDeltaLake.cpp +++ /dev/null @@ -1,33 +0,0 @@ -#include "config.h" - -#if USE_AWS_S3 && USE_PARQUET - -#include -#include -#include -#include -#include "registerTableFunctions.h" - -namespace DB -{ - -struct TableFunctionDeltaLakeName -{ - static constexpr auto name = "deltaLake"; -}; - -// using TableFunctionDeltaLake = ITableFunctionDataLake; -// -// void registerTableFunctionDeltaLake(TableFunctionFactory & factory) -// { -// factory.registerFunction( -// {.documentation = { -// .description=R"(The table function can be used to read the DeltaLake table stored on object store.)", -// .examples{{"deltaLake", "SELECT * FROM deltaLake(url, access_key_id, secret_access_key)", ""}}, -// .categories{"DataLake"}}, -// .allow_readonly = false}); -// } - -} - -#endif diff --git a/src/TableFunctions/TableFunctionHudi.cpp b/src/TableFunctions/TableFunctionHudi.cpp deleted file mode 100644 index c6d84504c40..00000000000 --- a/src/TableFunctions/TableFunctionHudi.cpp +++ /dev/null @@ -1,31 +0,0 @@ -#include "config.h" - -#if USE_AWS_S3 - -#include -#include -#include -#include -#include "registerTableFunctions.h" - -namespace DB -{ - -struct TableFunctionHudiName -{ - static constexpr auto name = "hudi"; -}; -// using TableFunctionHudi = ITableFunctionDataLake; -// -// void registerTableFunctionHudi(TableFunctionFactory & factory) -// { -// factory.registerFunction( -// {.documentation -// = {.description=R"(The table function can be used to read the Hudi table stored on object store.)", -// .examples{{"hudi", "SELECT * FROM hudi(url, access_key_id, secret_access_key)", ""}}, -// .categories{"DataLake"}}, -// .allow_readonly = false}); -// } -} - -#endif diff --git a/src/TableFunctions/TableFunctionIceberg.cpp b/src/TableFunctions/TableFunctionIceberg.cpp deleted file mode 100644 index 1a28f9292d1..00000000000 --- a/src/TableFunctions/TableFunctionIceberg.cpp +++ /dev/null 
@@ -1,37 +0,0 @@ -#include "config.h" - -#if USE_AWS_S3 && USE_AVRO - -#include -#include -#include -#include -#include "registerTableFunctions.h" - - -namespace DB -{ - -struct TableFunctionIcebergName -{ - static constexpr auto name = "iceberg"; -}; - -using TableFunctionIceberg = ITableFunctionDataLake< - TableFunctionIcebergName, - StorageIceberg, - TableFunctionS3>; - -void registerTableFunctionIceberg(TableFunctionFactory & factory) -{ - factory.registerFunction( - {.documentation - = {.description=R"(The table function can be used to read the Iceberg table stored on object store.)", - .examples{{"iceberg", "SELECT * FROM iceberg(url, access_key_id, secret_access_key)", ""}}, - .categories{"DataLake"}}, - .allow_readonly = false}); -} - -} - -#endif diff --git a/src/TableFunctions/TableFunctionObjectStorage.cpp b/src/TableFunctions/TableFunctionObjectStorage.cpp index de46c13af37..a948102ac2b 100644 --- a/src/TableFunctions/TableFunctionObjectStorage.cpp +++ b/src/TableFunctions/TableFunctionObjectStorage.cpp @@ -7,10 +7,10 @@ #include #include #include -#include -#include -#include -#include +#include +#include +#include +#include #include #include #include @@ -24,7 +24,6 @@ namespace DB namespace ErrorCodes { extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; - extern const int BAD_ARGUMENTS; } template @@ -36,6 +35,15 @@ ObjectStoragePtr TableFunctionObjectStorage< return object_storage; } +template +StorageObjectStorageConfigurationPtr TableFunctionObjectStorage< + Definition, StorageSettings, Configuration>::getConfiguration() const +{ + if (!configuration) + configuration = std::make_shared(); + return configuration; +} + template std::vector TableFunctionObjectStorage< Definition, StorageSettings, Configuration>::skipAnalysisForArguments(const QueryTreeNodePtr & query_node_table_function, ContextPtr) const @@ -65,8 +73,7 @@ template void TableFunctionObjectStorage< Definition, StorageSettings, Configuration>::parseArgumentsImpl(ASTs & engine_args, const ContextPtr & local_context) { - configuration = std::make_shared(); - StorageObjectStorageConfiguration::initialize(*configuration, engine_args, local_context, true); + StorageObjectStorageConfiguration::initialize(*getConfiguration(), engine_args, local_context, true); } template @@ -147,6 +154,7 @@ StoragePtr TableFunctionObjectStorage>( { diff --git a/src/TableFunctions/TableFunctionObjectStorage.h b/src/TableFunctions/TableFunctionObjectStorage.h index 1df0ba2f843..5e180301862 100644 --- a/src/TableFunctions/TableFunctionObjectStorage.h +++ b/src/TableFunctions/TableFunctionObjectStorage.h @@ -2,10 +2,9 @@ #include "config.h" -#if USE_AZURE_BLOB_STORAGE - #include #include +#include #include @@ -114,6 +113,8 @@ public: static void addColumnsStructureToArguments(ASTs & args, const String & structure, const ContextPtr & context); protected: + using ConfigurationPtr = StorageObjectStorageConfigurationPtr; + StoragePtr executeImpl( const ASTPtr & ast_function, ContextPtr context, @@ -125,9 +126,11 @@ protected: ColumnsDescription getActualTableStructure(ContextPtr context, bool is_insert_query) const override; void parseArguments(const ASTPtr & ast_function, ContextPtr context) override; - ObjectStoragePtr getObjectStorage(const ContextPtr & context, bool create_readonly) const; - mutable typename StorageObjectStorage::ConfigurationPtr configuration; + ObjectStoragePtr getObjectStorage(const ContextPtr & context, bool create_readonly) const; + ConfigurationPtr getConfiguration() const; + + mutable ConfigurationPtr 
configuration; mutable ObjectStoragePtr object_storage; ColumnsDescription structure_hint; @@ -146,5 +149,3 @@ using TableFunctionAzureBlob = TableFunctionObjectStorage; #endif } - -#endif diff --git a/src/TableFunctions/TableFunctionObjectStorageCluster.cpp b/src/TableFunctions/TableFunctionObjectStorageCluster.cpp index 8e6c96a3f2a..c93d816dc07 100644 --- a/src/TableFunctions/TableFunctionObjectStorageCluster.cpp +++ b/src/TableFunctions/TableFunctionObjectStorageCluster.cpp @@ -6,9 +6,9 @@ #include #include #include -#include -#include -#include +#include +#include +#include namespace DB @@ -103,6 +103,8 @@ void registerTableFunctionObjectStorageCluster(TableFunctionFactory & factory) } ); #endif + + UNUSED(factory); } #if USE_AWS_S3 diff --git a/src/TableFunctions/registerDataLakeTableFunctions.cpp b/src/TableFunctions/registerDataLakeTableFunctions.cpp new file mode 100644 index 00000000000..15a6668f434 --- /dev/null +++ b/src/TableFunctions/registerDataLakeTableFunctions.cpp @@ -0,0 +1,69 @@ +#include +#include + +namespace DB +{ + +#if USE_AWS_S3 +#if USE_AVRO +void registerTableFunctionIceberg(TableFunctionFactory & factory) +{ + factory.registerFunction( + { + .documentation = + { + .description=R"(The table function can be used to read the Iceberg table stored on object store.)", + .examples{{"iceberg", "SELECT * FROM iceberg(url, access_key_id, secret_access_key)", ""}}, + .categories{"DataLake"} + }, + .allow_readonly = false + }); +} +#endif + +#if USE_PARQUET +void registerTableFunctionDeltaLake(TableFunctionFactory & factory) +{ + factory.registerFunction( + { + .documentation = + { + .description=R"(The table function can be used to read the DeltaLake table stored on object store.)", + .examples{{"deltaLake", "SELECT * FROM deltaLake(url, access_key_id, secret_access_key)", ""}}, + .categories{"DataLake"} + }, + .allow_readonly = false + }); +} +#endif + +void registerTableFunctionHudi(TableFunctionFactory & factory) +{ + factory.registerFunction( + { + .documentation = + { + .description=R"(The table function can be used to read the Hudi table stored on object store.)", + .examples{{"hudi", "SELECT * FROM hudi(url, access_key_id, secret_access_key)", ""}}, + .categories{"DataLake"} + }, + .allow_readonly = false + }); +} +#endif + +void registerDataLakeTableFunctions(TableFunctionFactory & factory) +{ + UNUSED(factory); +#if USE_AWS_S3 +#if USE_AVRO + registerTableFunctionIceberg(factory); +#endif +#if USE_PARQUET + registerTableFunctionDeltaLake(factory); +#endif + registerTableFunctionHudi(factory); +#endif +} + +} diff --git a/src/TableFunctions/registerTableFunctions.cpp b/src/TableFunctions/registerTableFunctions.cpp index 627d945fbf3..05fe147e076 100644 --- a/src/TableFunctions/registerTableFunctions.cpp +++ b/src/TableFunctions/registerTableFunctions.cpp @@ -68,8 +68,7 @@ void registerTableFunctions() registerTableFunctionObjectStorage(factory); registerTableFunctionObjectStorageCluster(factory); - - + registerDataLakeTableFunctions(factory); } } diff --git a/src/TableFunctions/registerTableFunctions.h b/src/TableFunctions/registerTableFunctions.h index cefb198273e..7998a4b49d9 100644 --- a/src/TableFunctions/registerTableFunctions.h +++ b/src/TableFunctions/registerTableFunctions.h @@ -30,13 +30,6 @@ void registerTableFunctionS3Cluster(TableFunctionFactory & factory); void registerTableFunctionCOS(TableFunctionFactory & factory); void registerTableFunctionOSS(TableFunctionFactory & factory); void registerTableFunctionGCS(TableFunctionFactory & factory); -void 
registerTableFunctionHudi(TableFunctionFactory & factory); -#if USE_PARQUET -void registerTableFunctionDeltaLake(TableFunctionFactory & factory); -#endif -#if USE_AVRO -void registerTableFunctionIceberg(TableFunctionFactory & factory); -#endif #endif #if USE_HIVE @@ -67,10 +60,9 @@ void registerTableFunctionFormat(TableFunctionFactory & factory); void registerTableFunctionExplain(TableFunctionFactory & factory); -#if USE_AZURE_BLOB_STORAGE void registerTableFunctionObjectStorage(TableFunctionFactory & factory); void registerTableFunctionObjectStorageCluster(TableFunctionFactory & factory); -#endif +void registerDataLakeTableFunctions(TableFunctionFactory & factory); void registerTableFunctions(); From 7577257df558fb3bd74e862e7da7b0f1b485ffeb Mon Sep 17 00:00:00 2001 From: kssenii Date: Wed, 14 Feb 2024 17:29:03 +0100 Subject: [PATCH 015/392] Fix cluster functions --- .../ReadFromStorageObjectStorage.cpp | 6 +- .../ObjectStorage/StorageObjectStorage.cpp | 3 +- .../StorageObjectStorageCluster.cpp | 9 ++- .../StorageObjectStorageSource.cpp | 63 ++++++++++++++++--- .../StorageObjectStorageSource.h | 25 ++++++-- src/Storages/S3Queue/StorageS3Queue.cpp | 5 +- .../TableFunctionObjectStorage.cpp | 3 + .../TableFunctionObjectStorageCluster.cpp | 27 ++++---- 8 files changed, 110 insertions(+), 31 deletions(-) diff --git a/src/Storages/ObjectStorage/ReadFromStorageObjectStorage.cpp b/src/Storages/ObjectStorage/ReadFromStorageObjectStorage.cpp index b33eea7d354..9c58fcdaa9a 100644 --- a/src/Storages/ObjectStorage/ReadFromStorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/ReadFromStorageObjectStorage.cpp @@ -49,7 +49,8 @@ void ReadFromStorageObejctStorage::createIterator(const ActionsDAG::Node * predi auto context = getContext(); iterator_wrapper = StorageObjectStorageSource::createFileIterator( configuration, object_storage, distributed_processing, context, predicate, - virtual_columns, nullptr, query_settings.list_object_keys_size, context->getFileProgressCallback()); + virtual_columns, nullptr, query_settings.list_object_keys_size, metric_threads_count, + metric_threads_active, metric_threads_scheduled, context->getFileProgressCallback()); } } @@ -75,7 +76,8 @@ void ReadFromStorageObejctStorage::initializePipeline(QueryPipelineBuilder & pip auto source = std::make_shared( getName(), object_storage, configuration, info, format_settings, query_settings, - context, max_block_size, iterator_wrapper, need_only_count, schema_cache, std::move(threadpool)); + context, max_block_size, iterator_wrapper, need_only_count, schema_cache, + std::move(threadpool), metric_threads_count, metric_threads_active, metric_threads_scheduled); pipes.emplace_back(std::move(source)); } diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index 08d7c9d0014..2e834da5529 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -289,7 +289,8 @@ ColumnsDescription StorageObjectStorage::getTableStructureFromD const auto settings = StorageSettings::create(context->getSettingsRef()); auto file_iterator = StorageObjectStorageSource::createFileIterator( configuration, object_storage, /* distributed_processing */false, - context, /* predicate */{}, /* virtual_columns */{}, &read_keys, settings.list_object_keys_size); + context, /* predicate */{}, /* virtual_columns */{}, &read_keys, settings.list_object_keys_size, + StorageSettings::ObjectStorageThreads(), 
StorageSettings::ObjectStorageThreadsActive(), StorageSettings::ObjectStorageThreadsScheduled()); ReadBufferIterator read_buffer_iterator( object_storage, configuration, file_iterator, diff --git a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp index c03bbd1a45d..f0d9ea400c4 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp @@ -88,7 +88,14 @@ StorageObjectStorageCluster::getTask auto iterator = std::make_shared( object_storage, configuration, predicate, virtual_columns, local_context, nullptr, settings.list_object_keys_size); - auto callback = std::make_shared>([iterator]() mutable -> String{ return iterator->next(0)->relative_path; }); + auto callback = std::make_shared>([iterator]() mutable -> String + { + auto object_info = iterator->next(0); + if (object_info) + return object_info->relative_path; + else + return ""; + }); return RemoteQueryExecutor::Extension{ .task_iterator = std::move(callback) }; } diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index 1fda75897f9..a8bde4cd56f 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -42,7 +42,10 @@ StorageObjectStorageSource::StorageObjectStorageSource( std::shared_ptr file_iterator_, bool need_only_count_, SchemaCache & schema_cache_, - std::shared_ptr reader_pool_) + std::shared_ptr reader_pool_, + CurrentMetrics::Metric metric_threads_, + CurrentMetrics::Metric metric_threads_active_, + CurrentMetrics::Metric metric_threads_scheduled_) : SourceWithKeyCondition(info.source_header, false) , WithContext(context_) , name(std::move(name_)) @@ -57,6 +60,9 @@ StorageObjectStorageSource::StorageObjectStorageSource( , columns_desc(info.columns_description) , file_iterator(file_iterator_) , schema_cache(schema_cache_) + , metric_threads(metric_threads_) + , metric_threads_active(metric_threads_active_) + , metric_threads_scheduled(metric_threads_scheduled_) , create_reader_scheduler(threadPoolCallbackRunner(*create_reader_pool, "Reader")) { } @@ -75,10 +81,16 @@ std::shared_ptr StorageObjectStorageSourc const NamesAndTypesList & virtual_columns, ObjectInfos * read_keys, size_t list_object_keys_size, + CurrentMetrics::Metric metric_threads_, + CurrentMetrics::Metric metric_threads_active_, + CurrentMetrics::Metric metric_threads_scheduled_, std::function file_progress_callback) { if (distributed_processing) - return std::make_shared(local_context->getReadTaskCallback()); + return std::make_shared( + local_context->getReadTaskCallback(), + local_context->getSettingsRef().max_threads, + metric_threads_, metric_threads_active_, metric_threads_scheduled_); if (configuration->isNamespaceWithGlobs()) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Expression can not have wildcards inside namespace name"); @@ -380,19 +392,16 @@ ObjectInfoPtr StorageObjectStorageSource::GlobIterator::next(size_t /* processor while (new_batch.empty()) { auto result = object_storage_iterator->getCurrentBatchAndScheduleNext(); - LOG_TEST(&Poco::Logger::get("kssenii"), "KSSENII: {}", result.has_value()); - if (result.has_value()) - { - new_batch = std::move(result.value()); - } - else + if (!result.has_value()) { is_finished = true; return {}; } + new_batch = std::move(result.value()); for (auto it = new_batch.begin(); it != new_batch.end();) { + 
chassert(*it); if (!recursive && !re2::RE2::FullMatch((*it)->relative_path, *matcher)) it = new_batch.erase(it); else @@ -406,8 +415,11 @@ ObjectInfoPtr StorageObjectStorageSource::GlobIterator::next(size_t /* processor { std::vector paths; paths.reserve(new_batch.size()); - for (auto & object_info : new_batch) + for (const auto & object_info : new_batch) + { + chassert(object_info); paths.push_back(fs::path(configuration->getNamespace()) / object_info->relative_path); + } VirtualColumnUtils::filterByPathOrFile(new_batch, paths, filter_dag, virtual_columns, getContext()); } @@ -416,6 +428,7 @@ ObjectInfoPtr StorageObjectStorageSource::GlobIterator::next(size_t /* processor read_keys->insert(read_keys->end(), new_batch.begin(), new_batch.end()); object_infos = std::move(new_batch); + if (file_progress_callback) { for (const auto & object_info : object_infos) @@ -499,4 +512,36 @@ StorageObjectStorageSource::ReaderHolder & StorageObjectStorageSource::ReaderHol object_info = std::move(other.object_info); return *this; } + +StorageObjectStorageSource::ReadTaskIterator::ReadTaskIterator( + const ReadTaskCallback & callback_, + size_t max_threads_count, + CurrentMetrics::Metric metric_threads_, + CurrentMetrics::Metric metric_threads_active_, + CurrentMetrics::Metric metric_threads_scheduled_) + : callback(callback_) +{ + ThreadPool pool(metric_threads_, metric_threads_active_, metric_threads_scheduled_, max_threads_count); + auto pool_scheduler = threadPoolCallbackRunner(pool, "ReadTaskIter"); + + std::vector> keys; + keys.reserve(max_threads_count); + for (size_t i = 0; i < max_threads_count; ++i) + keys.push_back(pool_scheduler([this] { return callback(); }, Priority{})); + + pool.wait(); + buffer.reserve(max_threads_count); + for (auto & key_future : keys) + buffer.emplace_back(std::make_shared(key_future.get(), std::nullopt)); +} + +ObjectInfoPtr StorageObjectStorageSource::ReadTaskIterator::next(size_t) +{ + size_t current_index = index.fetch_add(1, std::memory_order_relaxed); + if (current_index >= buffer.size()) + return std::make_shared(callback()); + + return buffer[current_index]; +} + } diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.h b/src/Storages/ObjectStorage/StorageObjectStorageSource.h index 214a7de14d6..14e59312c8c 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.h @@ -30,7 +30,10 @@ public: std::shared_ptr file_iterator_, bool need_only_count_, SchemaCache & schema_cache_, - std::shared_ptr reader_pool_); + std::shared_ptr reader_pool_, + CurrentMetrics::Metric metric_threads_, + CurrentMetrics::Metric metric_threads_active_, + CurrentMetrics::Metric metric_threads_scheduled_); ~StorageObjectStorageSource() override; @@ -47,6 +50,9 @@ public: const NamesAndTypesList & virtual_columns, ObjectInfos * read_keys, size_t list_object_keys_size, + CurrentMetrics::Metric metric_threads_, + CurrentMetrics::Metric metric_threads_active_, + CurrentMetrics::Metric metric_threads_scheduled_, std::function file_progress_callback = {}); protected: @@ -64,6 +70,10 @@ protected: SchemaCache & schema_cache; bool initialized = false; + const CurrentMetrics::Metric metric_threads; + const CurrentMetrics::Metric metric_threads_active; + const CurrentMetrics::Metric metric_threads_scheduled; + size_t total_rows_in_file = 0; LoggerPtr log = getLogger("StorageObjectStorageSource"); @@ -123,14 +133,21 @@ public: class StorageObjectStorageSource::ReadTaskIterator : public IIterator { public: - explicit 
ReadTaskIterator(const ReadTaskCallback & callback_) : callback(callback_) {} + ReadTaskIterator( + const ReadTaskCallback & callback_, + size_t max_threads_count, + CurrentMetrics::Metric metric_threads_, + CurrentMetrics::Metric metric_threads_active_, + CurrentMetrics::Metric metric_threads_scheduled_); - size_t estimatedKeysCount() override { return 0; } /// TODO FIXME + size_t estimatedKeysCount() override { return buffer.size(); } - ObjectInfoPtr next(size_t) override { return std::make_shared(callback(), ObjectMetadata{}); } + ObjectInfoPtr next(size_t) override; private: ReadTaskCallback callback; + ObjectInfos buffer; + std::atomic_size_t index = 0; }; class StorageObjectStorageSource::GlobIterator : public IIterator, WithContext diff --git a/src/Storages/S3Queue/StorageS3Queue.cpp b/src/Storages/S3Queue/StorageS3Queue.cpp index b03224cedff..b256f030da1 100644 --- a/src/Storages/S3Queue/StorageS3Queue.cpp +++ b/src/Storages/S3Queue/StorageS3Queue.cpp @@ -364,7 +364,10 @@ std::shared_ptr StorageS3Queue::createSource( file_iterator, false, Storage::getSchemaCache(local_context), - threadpool); + threadpool, + CurrentMetrics::ObjectStorageS3Threads, + CurrentMetrics::ObjectStorageS3ThreadsActive, + CurrentMetrics::ObjectStorageS3ThreadsScheduled); auto file_deleter = [=, this](const std::string & path) mutable { diff --git a/src/TableFunctions/TableFunctionObjectStorage.cpp b/src/TableFunctions/TableFunctionObjectStorage.cpp index a948102ac2b..a48c95469d0 100644 --- a/src/TableFunctions/TableFunctionObjectStorage.cpp +++ b/src/TableFunctions/TableFunctionObjectStorage.cpp @@ -93,6 +93,7 @@ template ColumnsDescription TableFunctionObjectStorage< Definition, StorageSettings, Configuration>::getActualTableStructure(ContextPtr context, bool is_insert_query) const { + chassert(configuration); if (configuration->structure == "auto") { context->checkAccess(getSourceAccessType()); @@ -107,6 +108,7 @@ template bool TableFunctionObjectStorage< Definition, StorageSettings, Configuration>::supportsReadingSubsetOfColumns(const ContextPtr & context) { + chassert(configuration); return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(configuration->format, context); } @@ -127,6 +129,7 @@ StoragePtr TableFunctionObjectStoragestructure != "auto") columns = parseColumnsListFromString(configuration->structure, context); else if (!structure_hint.empty()) diff --git a/src/TableFunctions/TableFunctionObjectStorageCluster.cpp b/src/TableFunctions/TableFunctionObjectStorageCluster.cpp index c93d816dc07..5a29a693431 100644 --- a/src/TableFunctions/TableFunctionObjectStorageCluster.cpp +++ b/src/TableFunctions/TableFunctionObjectStorageCluster.cpp @@ -21,25 +21,23 @@ StoragePtr TableFunctionObjectStorageCluster; - StoragePtr storage; + auto configuration = Base::getConfiguration(); + bool structure_argument_was_provided = configuration->structure != "auto"; + ColumnsDescription columns; - bool structure_argument_was_provided = Base::configuration->structure != "auto"; - if (structure_argument_was_provided) - { - columns = parseColumnsListFromString(Base::configuration->structure, context); - } + columns = parseColumnsListFromString(configuration->structure, context); else if (!Base::structure_hint.empty()) - { columns = Base::structure_hint; - } + auto object_storage = Base::getObjectStorage(context, !is_insert_query); + StoragePtr storage; if (context->getClientInfo().query_kind == ClientInfo::QueryKind::SECONDARY_QUERY) { /// On worker node this filename won't contains globs storage = 
std::make_shared>( - Base::configuration, - Base::configuration->createOrUpdateObjectStorage(context, !is_insert_query), + configuration, + object_storage, Definition::storage_type_name, context, StorageID(Base::getDatabaseName(), table_name), @@ -54,8 +52,8 @@ StoragePtr TableFunctionObjectStorageCluster>( ITableFunctionCluster::cluster_name, - Base::configuration, - Base::configuration->createOrUpdateObjectStorage(context, !is_insert_query), + configuration, + object_storage, Definition::storage_type_name, StorageID(Base::getDatabaseName(), table_name), columns, @@ -87,7 +85,10 @@ void registerTableFunctionObjectStorageCluster(TableFunctionFactory & factory) { .documentation = { .description=R"(The table function can be used to read the data stored on Azure Blob Storage in parallel for many nodes in a specified cluster.)", - .examples{{"azureBlobStorageCluster", "SELECT * FROM azureBlobStorageCluster(cluster, connection_string|storage_account_url, container_name, blobpath, [account_name, account_key, format, compression, structure])", ""}}}, + .examples{{ + "azureBlobStorageCluster", + "SELECT * FROM azureBlobStorageCluster(cluster, connection_string|storage_account_url, container_name, blobpath, " + "[account_name, account_key, format, compression, structure])", ""}}}, .allow_readonly = false } ); From ba0dc7bc54c8e621f63e3ba2f1bdbec15bdb9114 Mon Sep 17 00:00:00 2001 From: Blargian Date: Wed, 14 Feb 2024 10:32:29 +0100 Subject: [PATCH 016/392] fix failing style check and tests --- src/IO/ReadHelpers.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/IO/ReadHelpers.cpp b/src/IO/ReadHelpers.cpp index af66cbb4cb5..53a7229e7d5 100644 --- a/src/IO/ReadHelpers.cpp +++ b/src/IO/ReadHelpers.cpp @@ -539,7 +539,6 @@ void readEscapedStringIntoImpl(Vector & s, ReadBuffer & buf) if (*buf.position() == '\r') ++buf.position(); - } } From bf12c376b0dde30092f0588a5439d7c7cab5e08b Mon Sep 17 00:00:00 2001 From: Blargian Date: Fri, 16 Feb 2024 13:30:55 +0100 Subject: [PATCH 017/392] fix for fast tests failing on shell test --- tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.sh b/tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.sh index 1e8dee22d28..88448171516 100755 --- a/tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.sh +++ b/tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.sh @@ -5,7 +5,7 @@ CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) . 
"$CUR_DIR"/../shell_config.sh # Data preparation step -USER_FILES_PATH=$($CLICKHOUSE_CLIENT --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') +USER_FILES_PATH = $($CLICKHOUSE_CLIENT_BINARY --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') DATA_FILE_UNIX_ENDINGS=${USER_FILES_PATH:?}/data_without_crlf.tsv DATA_FILE_DOS_ENDINGS=${USER_FILES_PATH:?}/data_with_crlf.tsv From 6e6bc97a3e0d8618dc80f5a26bb59f73623d1ccb Mon Sep 17 00:00:00 2001 From: Blargian Date: Fri, 16 Feb 2024 13:42:58 +0100 Subject: [PATCH 018/392] fix failing style check --- tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.sh b/tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.sh index 88448171516..cb7472be418 100755 --- a/tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.sh +++ b/tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.sh @@ -5,7 +5,7 @@ CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) . "$CUR_DIR"/../shell_config.sh # Data preparation step -USER_FILES_PATH = $($CLICKHOUSE_CLIENT_BINARY --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') +USER_FILES_PATH=$($CLICKHOUSE_CLIENT_BINARY --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') DATA_FILE_UNIX_ENDINGS=${USER_FILES_PATH:?}/data_without_crlf.tsv DATA_FILE_DOS_ENDINGS=${USER_FILES_PATH:?}/data_with_crlf.tsv From 0552f44f70d76f25f268259a09cbbb10dc3781d7 Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 19 Feb 2024 10:45:56 +0100 Subject: [PATCH 019/392] Fixes after merge with master, move some part of code to object storage --- src/Backups/BackupIO_S3.cpp | 8 +- src/Disks/ObjectStorages/IObjectStorage.h | 3 +- .../ObjectStorages/ObjectStorageFactory.cpp | 4 +- .../ObjectStorages/S3/S3ObjectStorage.cpp | 57 ++++- src/Disks/ObjectStorages/S3/S3ObjectStorage.h | 12 +- src/Disks/ObjectStorages/S3/diskSettings.cpp | 110 +++++---- src/Disks/ObjectStorages/S3/diskSettings.h | 13 +- src/IO/S3Common.cpp | 7 +- src/Storages/Cache/SchemaCache.cpp | 2 + .../ObjectStorage/AzureBlob/Configuration.cpp | 7 +- .../ObjectStorage/AzureBlob/Configuration.h | 2 +- .../DataLakes/IStorageDataLake.h | 18 +- .../ObjectStorage/HDFS/Configuration.cpp | 7 +- .../ObjectStorage/HDFS/Configuration.h | 2 +- .../ObjectStorage/ReadBufferIterator.cpp | 210 +++++++++++++----- .../ObjectStorage/ReadBufferIterator.h | 12 +- .../ObjectStorage/S3/Configuration.cpp | 108 ++------- src/Storages/ObjectStorage/S3/Configuration.h | 18 +- .../ObjectStorage/StorageObjectStorage.cpp | 109 ++++++--- .../ObjectStorage/StorageObjectStorage.h | 20 +- .../StorageObjectStorageCluster.cpp | 30 ++- .../StorageObjectStorageCluster.h | 8 +- .../StorageObjectStorageConfiguration.cpp | 6 +- .../StorageObjectStorageConfiguration.h | 3 +- .../StorageObjectStorageSource.h | 4 + .../registerStorageObjectStorage.cpp | 2 +- src/Storages/S3Queue/StorageS3Queue.cpp | 8 +- src/Storages/StorageS3Settings.cpp | 11 +- src/Storages/StorageS3Settings.h | 8 +- .../TableFunctionObjectStorage.cpp | 6 +- .../TableFunctionObjectStorage.h | 6 +- .../TableFunctionObjectStorageCluster.cpp | 7 +- 32 files 
changed, 498 insertions(+), 330 deletions(-) diff --git a/src/Backups/BackupIO_S3.cpp b/src/Backups/BackupIO_S3.cpp index fa4c1af3698..6c7b3674fb7 100644 --- a/src/Backups/BackupIO_S3.cpp +++ b/src/Backups/BackupIO_S3.cpp @@ -127,10 +127,10 @@ BackupReaderS3::BackupReaderS3( : BackupReaderDefault(read_settings_, write_settings_, getLogger("BackupReaderS3")) , s3_uri(s3_uri_) , data_source_description{DataSourceType::ObjectStorage, ObjectStorageType::S3, MetadataStorageType::None, s3_uri.endpoint, false, false} - , s3_settings(context_->getStorageS3Settings().getSettings(s3_uri.uri.toString())) + , s3_settings(context_->getStorageS3Settings().getSettings(s3_uri.uri.toString()).value_or(S3Settings{})) { auto & request_settings = s3_settings.request_settings; - request_settings.updateFromSettings(context_->getSettingsRef()); + request_settings.updateFromSettingsIfChanged(context_->getSettingsRef()); request_settings.max_single_read_retries = context_->getSettingsRef().s3_max_single_read_retries; // FIXME: Avoid taking value for endpoint request_settings.allow_native_copy = allow_s3_native_copy; client = makeS3Client(s3_uri_, access_key_id_, secret_access_key_, s3_settings, context_); @@ -217,10 +217,10 @@ BackupWriterS3::BackupWriterS3( : BackupWriterDefault(read_settings_, write_settings_, getLogger("BackupWriterS3")) , s3_uri(s3_uri_) , data_source_description{DataSourceType::ObjectStorage, ObjectStorageType::S3, MetadataStorageType::None, s3_uri.endpoint, false, false} - , s3_settings(context_->getStorageS3Settings().getSettings(s3_uri.uri.toString())) + , s3_settings(context_->getStorageS3Settings().getSettings(s3_uri.uri.toString()).value_or(S3Settings{})) { auto & request_settings = s3_settings.request_settings; - request_settings.updateFromSettings(context_->getSettingsRef()); + request_settings.updateFromSettingsIfChanged(context_->getSettingsRef()); request_settings.max_single_read_retries = context_->getSettingsRef().s3_max_single_read_retries; // FIXME: Avoid taking value for endpoint request_settings.allow_native_copy = allow_s3_native_copy; request_settings.setStorageClassName(storage_class_name); diff --git a/src/Disks/ObjectStorages/IObjectStorage.h b/src/Disks/ObjectStorages/IObjectStorage.h index 8a5352e71ca..5ff618e08eb 100644 --- a/src/Disks/ObjectStorages/IObjectStorage.h +++ b/src/Disks/ObjectStorages/IObjectStorage.h @@ -193,8 +193,7 @@ public: virtual void applyNewSettings( const Poco::Util::AbstractConfiguration &, const std::string & /*config_prefix*/, - ContextPtr) - {} + ContextPtr) {} /// Sometimes object storages have something similar to chroot or namespace, for example /// buckets in S3. If object storage doesn't have any namepaces return empty string. 
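Note on the object storage hunks that follow: the point of this commit is that endpoint-level S3 auth settings are now carried next to the request settings inside S3ObjectStorageSettings, and applyNewSettings() rebuilds the AWS client only when those auth settings actually changed (disks keep the previous behaviour and always rebuild). A minimal, self-contained sketch of that "rebuild only on change" flow is shown below; AuthSettingsSketch, ObjectStorageSketch and client_generation are illustrative stand-ins for this note only, not the real ClickHouse classes.

#include <iostream>
#include <string>

struct AuthSettingsSketch
{
    std::string region;
    std::string access_key_id;
    std::string secret_access_key;

    /// Only non-empty fields of `from` override the current values
    /// (mirrors the updateFrom() change to S3Common.cpp in this commit).
    void updateFrom(const AuthSettingsSketch & from)
    {
        if (!from.region.empty())
            region = from.region;
        if (!from.access_key_id.empty())
            access_key_id = from.access_key_id;
        if (!from.secret_access_key.empty())
            secret_access_key = from.secret_access_key;
    }

    /// True if applying `other` on top of `*this` would change anything.
    bool hasUpdates(const AuthSettingsSketch & other) const
    {
        AuthSettingsSketch copy = *this;
        copy.updateFrom(other);
        return copy.region != region
            || copy.access_key_id != access_key_id
            || copy.secret_access_key != secret_access_key;
    }
};

struct ObjectStorageSketch
{
    AuthSettingsSketch auth_settings;
    bool for_disk_s3 = false;
    int client_generation = 0; /// stands in for the cached AWS client

    void applyNewSettings(const AuthSettingsSketch & new_settings)
    {
        /// Rebuild the client only if something changed (disks always rebuild).
        if (auth_settings.hasUpdates(new_settings) || for_disk_s3)
            ++client_generation;
        auth_settings.updateFrom(new_settings);
    }
};

int main()
{
    ObjectStorageSketch storage;
    storage.auth_settings = {"us-east-1", "key", "secret"};

    storage.applyNewSettings({"us-east-1", "", ""});   /// nothing new: client kept
    storage.applyNewSettings({"", "rotated-key", ""}); /// key changed: client rebuilt

    std::cout << "client rebuilt " << storage.client_generation << " time(s)\n";
}
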
diff --git a/src/Disks/ObjectStorages/ObjectStorageFactory.cpp b/src/Disks/ObjectStorages/ObjectStorageFactory.cpp index b3626135177..0855ba54d2f 100644 --- a/src/Disks/ObjectStorages/ObjectStorageFactory.cpp +++ b/src/Disks/ObjectStorages/ObjectStorageFactory.cpp @@ -126,7 +126,7 @@ void registerS3ObjectStorage(ObjectStorageFactory & factory) auto uri = getS3URI(config, config_prefix, context); auto s3_capabilities = getCapabilitiesFromConfig(config, config_prefix); auto settings = getSettings(config, config_prefix, context); - auto client = getClient(config, config_prefix, context, *settings); + auto client = getClient(config, config_prefix, context, *settings, true); auto key_generator = getKeyGenerator(disk_type, uri, config, config_prefix); auto object_storage = std::make_shared( @@ -162,7 +162,7 @@ void registerS3PlainObjectStorage(ObjectStorageFactory & factory) auto uri = getS3URI(config, config_prefix, context); auto s3_capabilities = getCapabilitiesFromConfig(config, config_prefix); auto settings = getSettings(config, config_prefix, context); - auto client = getClient(config, config_prefix, context, *settings); + auto client = getClient(config, config_prefix, context, *settings, true); auto key_generator = getKeyGenerator(disk_type, uri, config, config_prefix); auto object_storage = std::make_shared( diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index a9bd520e6e9..7e856b45aea 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -242,7 +242,12 @@ std::unique_ptr S3ObjectStorage::writeObject( /// NOLIN if (mode != WriteMode::Rewrite) throw Exception(ErrorCodes::BAD_ARGUMENTS, "S3 doesn't support append to files"); - auto settings_ptr = s3_settings.get(); + S3Settings::RequestSettings request_settings = s3_settings.get()->request_settings; + if (auto query_context = CurrentThread::getQueryContext()) + { + request_settings.updateFromSettingsIfChanged(query_context->getSettingsRef()); + } + ThreadPoolCallbackRunner scheduler; if (write_settings.s3_allow_parallel_part_upload) scheduler = threadPoolCallbackRunner(getThreadPoolWriter(), "VFSWrite"); @@ -256,7 +261,7 @@ std::unique_ptr S3ObjectStorage::writeObject( /// NOLIN uri.bucket, object.remote_path, buf_size, - settings_ptr->request_settings, + request_settings, std::move(blob_storage_log), attributes, std::move(scheduler), @@ -534,19 +539,57 @@ void S3ObjectStorage::startup() const_cast(*client.get()).EnableRequestProcessing(); } -void S3ObjectStorage::applyNewSettings(const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, ContextPtr context) +void S3ObjectStorage::applyNewSettings( + const Poco::Util::AbstractConfiguration & config, + const std::string & config_prefix, + ContextPtr context) { auto new_s3_settings = getSettings(config, config_prefix, context); - auto new_client = getClient(config, config_prefix, context, *new_s3_settings); + if (!static_headers.empty()) + { + new_s3_settings->auth_settings.headers.insert( + new_s3_settings->auth_settings.headers.end(), + static_headers.begin(), static_headers.end()); + } + + if (auto endpoint_settings = context->getStorageS3Settings().getSettings(uri.uri.toString())) + new_s3_settings->auth_settings.updateFrom(endpoint_settings->auth_settings); + + auto current_s3_settings = s3_settings.get(); + if (current_s3_settings->auth_settings.hasUpdates(new_s3_settings->auth_settings) || for_disk_s3) + { + auto new_client = 
getClient(config, config_prefix, context, *new_s3_settings, for_disk_s3, &uri); + client.set(std::move(new_client)); + } + s3_settings.set(std::move(new_s3_settings)); - client.set(std::move(new_client)); } +// void S3ObjectStorage::applyNewSettings(ContextPtr context) +// { +// auto settings = s3_settings.get(); +// if (!endpoint_settings || !settings->auth_settings.hasUpdates(endpoint_settings->auth_settings)) +// return; +// +// const auto & config = context->getConfigRef(); +// auto new_s3_settings = getSettings(uri, config, "s3.", context); +// +// new_s3_settings->auth_settings.updateFrom(endpoint_settings->auth_settings); +// +// auto new_client = getClient(config, "s3.", context, *new_s3_settings, false); +// +// s3_settings.set(std::move(new_s3_settings)); +// client.set(std::move(new_client)); +// } + std::unique_ptr S3ObjectStorage::cloneObjectStorage( - const std::string & new_namespace, const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, ContextPtr context) + const std::string & new_namespace, + const Poco::Util::AbstractConfiguration & config, + const std::string & config_prefix, + ContextPtr context) { auto new_s3_settings = getSettings(config, config_prefix, context); - auto new_client = getClient(config, config_prefix, context, *new_s3_settings); + auto new_client = getClient(config, config_prefix, context, *new_s3_settings, true); String endpoint = context->getMacros()->expand(config.getString(config_prefix + ".endpoint")); auto new_uri{uri}; diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.h b/src/Disks/ObjectStorages/S3/S3ObjectStorage.h index a6843a383e5..187cdb58447 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.h +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.h @@ -21,11 +21,13 @@ struct S3ObjectStorageSettings S3ObjectStorageSettings( const S3Settings::RequestSettings & request_settings_, + const S3::AuthSettings & auth_settings_, uint64_t min_bytes_for_seek_, int32_t list_object_keys_size_, int32_t objects_chunk_size_to_delete_, bool read_only_) : request_settings(request_settings_) + , auth_settings(auth_settings_) , min_bytes_for_seek(min_bytes_for_seek_) , list_object_keys_size(list_object_keys_size_) , objects_chunk_size_to_delete(objects_chunk_size_to_delete_) @@ -33,6 +35,7 @@ struct S3ObjectStorageSettings {} S3Settings::RequestSettings request_settings; + S3::AuthSettings auth_settings; uint64_t min_bytes_for_seek; int32_t list_object_keys_size; @@ -52,7 +55,9 @@ private: S3::URI uri_, const S3Capabilities & s3_capabilities_, ObjectStorageKeysGeneratorPtr key_generator_, - const String & disk_name_) + const String & disk_name_, + bool for_disk_s3_ = true, + const HTTPHeaderEntries & static_headers_ = {}) : uri(uri_) , key_generator(std::move(key_generator_)) , disk_name(disk_name_) @@ -60,6 +65,8 @@ private: , s3_settings(std::move(s3_settings_)) , s3_capabilities(s3_capabilities_) , log(getLogger(logger_name)) + , for_disk_s3(for_disk_s3_) + , static_headers(static_headers_) { } @@ -180,6 +187,9 @@ private: S3Capabilities s3_capabilities; LoggerPtr log; + + const bool for_disk_s3; + const HTTPHeaderEntries static_headers; }; /// Do not encode keys, store as-is, and do not require separate disk for metadata. 
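The diskSettings.cpp hunk below turns getClient() into a shared factory for both disk S3 and the S3 table engine: when called for a disk it reads the endpoint from the disk config and normalises the key to end with '/', while the table engine passes the already-parsed URL in through the new url_ parameter. A rough, self-contained sketch of that branching follows; UriSketch and resolveEndpoint are illustrative names rather than the real S3::URI and factory code, and the disk branch in the sketch returns the parsed URI, which the rest of getClient() relies on.

#include <iostream>
#include <stdexcept>
#include <string>

/// Illustrative stand-in for S3::URI: just the pieces this sketch needs.
struct UriSketch
{
    std::string endpoint; /// e.g. "https://storage.example.com/bucket"
    std::string key;      /// object key (prefix) inside the bucket
};

UriSketch resolveEndpoint(bool for_disk_s3, const std::string & endpoint_from_config, const UriSketch * url_from_caller)
{
    if (for_disk_s3)
    {
        /// Disk path: the endpoint comes from the <endpoint> entry of the disk config.
        UriSketch uri;
        uri.endpoint = endpoint_from_config;
        uri.key = "prefix/data"; /// in the real code this is extracted by parsing the endpoint
        if (!uri.key.ends_with('/'))
            uri.key.push_back('/');
        return uri;
    }

    /// Table-engine path: StorageS3Configuration already parsed the URL and passes it in.
    if (!url_from_caller)
        throw std::logic_error("URL not passed");
    return *url_from_caller;
}

int main()
{
    auto disk_uri = resolveEndpoint(true, "https://storage.example.com/bucket/prefix/data", nullptr);

    UriSketch table_url{"https://storage.example.com/bucket", "table_data/file.tsv"};
    auto table_uri = resolveEndpoint(false, "", &table_url);

    std::cout << disk_uri.key << " | " << table_uri.key << "\n";
}
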
diff --git a/src/Disks/ObjectStorages/S3/diskSettings.cpp b/src/Disks/ObjectStorages/S3/diskSettings.cpp index 4fd4b17aabe..cb2bb690292 100644 --- a/src/Disks/ObjectStorages/S3/diskSettings.cpp +++ b/src/Disks/ObjectStorages/S3/diskSettings.cpp @@ -10,8 +10,6 @@ #include #include #include -#include "Disks/DiskFactory.h" - #include #include #include @@ -25,13 +23,19 @@ namespace DB { -std::unique_ptr getSettings(const Poco::Util::AbstractConfiguration & config, const String & config_prefix, ContextPtr context) +std::unique_ptr getSettings( + const Poco::Util::AbstractConfiguration & config, const String & config_prefix, ContextPtr context) { const Settings & settings = context->getSettingsRef(); S3Settings::RequestSettings request_settings(config, config_prefix, settings, "s3_"); + /// TODO: add request settings prefix, becausse for StorageS3 it should be "s3." + + S3::AuthSettings auth_settings; + auth_settings.loadFromConfig(config_prefix, config); return std::make_unique( request_settings, + auth_settings, config.getUInt64(config_prefix + ".min_bytes_for_seek", 1024 * 1024), config.getInt(config_prefix + ".list_object_keys_size", 1000), config.getInt(config_prefix + ".objects_chunk_size_to_delete", 1000), @@ -42,78 +46,92 @@ std::unique_ptr getClient( const Poco::Util::AbstractConfiguration & config, const String & config_prefix, ContextPtr context, - const S3ObjectStorageSettings & settings) + const S3ObjectStorageSettings & settings, + bool for_disk_s3, + const S3::URI * url_) { const Settings & global_settings = context->getGlobalContext()->getSettingsRef(); const Settings & local_settings = context->getSettingsRef(); - String endpoint = context->getMacros()->expand(config.getString(config_prefix + ".endpoint")); - S3::URI uri(endpoint); - if (!uri.key.ends_with('/')) - uri.key.push_back('/'); + const auto & auth_settings = settings.auth_settings; + const auto & request_settings = settings.request_settings; + + S3::URI url; + if (for_disk_s3) + { + String endpoint = context->getMacros()->expand(config.getString(config_prefix + ".endpoint")); + S3::URI uri(endpoint); + if (!uri.key.ends_with('/')) + uri.key.push_back('/'); + } + else + { + if (!url_) + throw Exception(ErrorCodes::LOGICAL_ERROR, "URL not passed"); + url = *url_; + } S3::PocoHTTPClientConfiguration client_configuration = S3::ClientFactory::instance().createClientConfiguration( - config.getString(config_prefix + ".region", ""), + auth_settings.region, context->getRemoteHostFilter(), static_cast(global_settings.s3_max_redirects), static_cast(global_settings.s3_retry_attempts), global_settings.enable_s3_requests_logging, - /* for_disk_s3 = */ true, + for_disk_s3, settings.request_settings.get_request_throttler, settings.request_settings.put_request_throttler, - uri.uri.getScheme()); + url.uri.getScheme()); + client_configuration.endpointOverride = url.endpoint; + client_configuration.maxConnections = static_cast(request_settings.max_connections); client_configuration.connectTimeoutMs = config.getUInt(config_prefix + ".connect_timeout_ms", S3::DEFAULT_CONNECT_TIMEOUT_MS); client_configuration.requestTimeoutMs = config.getUInt(config_prefix + ".request_timeout_ms", S3::DEFAULT_REQUEST_TIMEOUT_MS); - client_configuration.maxConnections = config.getUInt(config_prefix + ".max_connections", S3::DEFAULT_MAX_CONNECTIONS); - client_configuration.endpointOverride = uri.endpoint; - client_configuration.http_keep_alive_timeout_ms = config.getUInt( - config_prefix + ".http_keep_alive_timeout_ms", DEFAULT_HTTP_KEEP_ALIVE_TIMEOUT * 1000); 
- client_configuration.http_connection_pool_size = config.getUInt(config_prefix + ".http_connection_pool_size", 1000); - client_configuration.wait_on_pool_size_limit = false; - client_configuration.s3_use_adaptive_timeouts = config.getBool( - config_prefix + ".use_adaptive_timeouts", client_configuration.s3_use_adaptive_timeouts); - /* - * Override proxy configuration for backwards compatibility with old configuration format. - * */ - auto proxy_config = DB::ProxyConfigurationResolverProvider::getFromOldSettingsFormat( - ProxyConfiguration::protocolFromString(uri.uri.getScheme()), - config_prefix, - config - ); - if (proxy_config) + client_configuration.http_keep_alive_timeout_ms = config.getUInt(config_prefix + ".http_keep_alive_timeout_ms", DEFAULT_HTTP_KEEP_ALIVE_TIMEOUT * 1000); + client_configuration.http_connection_pool_size = config.getUInt( + config_prefix + ".http_connection_pool_size", static_cast(global_settings.s3_http_connection_pool_size.value)); + client_configuration.s3_use_adaptive_timeouts = config.getBool(config_prefix + ".use_adaptive_timeouts", client_configuration.s3_use_adaptive_timeouts); + client_configuration.wait_on_pool_size_limit = for_disk_s3; + + if (for_disk_s3) { - client_configuration.per_request_configuration - = [proxy_config]() { return proxy_config->resolve(); }; - client_configuration.error_report - = [proxy_config](const auto & request_config) { proxy_config->errorReport(request_config); }; + /* + * Override proxy configuration for backwards compatibility with old configuration format. + * */ + if (auto proxy_config = DB::ProxyConfigurationResolverProvider::getFromOldSettingsFormat( + ProxyConfiguration::protocolFromString(url.uri.getScheme()), config_prefix, config)) + { + client_configuration.per_request_configuration + = [proxy_config]() { return proxy_config->resolve(); }; + client_configuration.error_report + = [proxy_config](const auto & request_config) { proxy_config->errorReport(request_config); }; + } } - HTTPHeaderEntries headers = S3::getHTTPHeaders(config_prefix, config); S3::ServerSideEncryptionKMSConfig sse_kms_config = S3::getSSEKMSConfig(config_prefix, config); - S3::ClientSettings client_settings{ - .use_virtual_addressing = uri.is_virtual_hosted_style, + .use_virtual_addressing = url.is_virtual_hosted_style, .disable_checksum = local_settings.s3_disable_checksum, .gcs_issue_compose_request = config.getBool("s3.gcs_issue_compose_request", false), }; + auto credentials_configuration = S3::CredentialsConfiguration + { + auth_settings.use_environment_credentials.value_or(context->getConfigRef().getBool("s3.use_environment_credentials", true)), + auth_settings.use_insecure_imds_request.value_or(context->getConfigRef().getBool("s3.use_insecure_imds_request", false)), + auth_settings.expiration_window_seconds.value_or(context->getConfigRef().getUInt64("s3.expiration_window_seconds", S3::DEFAULT_EXPIRATION_WINDOW_SECONDS)), + auth_settings.no_sign_request.value_or(context->getConfigRef().getBool("s3.no_sign_request", false)), + }; + return S3::ClientFactory::instance().create( client_configuration, client_settings, - config.getString(config_prefix + ".access_key_id", ""), - config.getString(config_prefix + ".secret_access_key", ""), - config.getString(config_prefix + ".server_side_encryption_customer_key_base64", ""), + auth_settings.access_key_id, + auth_settings.secret_access_key, + auth_settings.server_side_encryption_customer_key_base64, std::move(sse_kms_config), - std::move(headers), - S3::CredentialsConfiguration - { - 
config.getBool(config_prefix + ".use_environment_credentials", config.getBool("s3.use_environment_credentials", true)), - config.getBool(config_prefix + ".use_insecure_imds_request", config.getBool("s3.use_insecure_imds_request", false)), - config.getUInt64(config_prefix + ".expiration_window_seconds", config.getUInt64("s3.expiration_window_seconds", S3::DEFAULT_EXPIRATION_WINDOW_SECONDS)), - config.getBool(config_prefix + ".no_sign_request", config.getBool("s3.no_sign_request", false)) - }); + auth_settings.headers, + credentials_configuration); } } diff --git a/src/Disks/ObjectStorages/S3/diskSettings.h b/src/Disks/ObjectStorages/S3/diskSettings.h index 83bf7b179ef..194035365ea 100644 --- a/src/Disks/ObjectStorages/S3/diskSettings.h +++ b/src/Disks/ObjectStorages/S3/diskSettings.h @@ -22,9 +22,18 @@ namespace DB struct S3ObjectStorageSettings; -std::unique_ptr getSettings(const Poco::Util::AbstractConfiguration & config, const String & config_prefix, ContextPtr context); +std::unique_ptr getSettings( + const Poco::Util::AbstractConfiguration & config, + const String & config_prefix, + ContextPtr context); -std::unique_ptr getClient(const Poco::Util::AbstractConfiguration & config, const String & config_prefix, ContextPtr context, const S3ObjectStorageSettings & settings); +std::unique_ptr getClient( + const Poco::Util::AbstractConfiguration & config, + const String & config_prefix, + ContextPtr context, + const S3ObjectStorageSettings & settings, + bool for_disk_s3, + const S3::URI * url_ = nullptr); } diff --git a/src/IO/S3Common.cpp b/src/IO/S3Common.cpp index 5039059f522..d33d5284240 100644 --- a/src/IO/S3Common.cpp +++ b/src/IO/S3Common.cpp @@ -157,8 +157,11 @@ void AuthSettings::updateFrom(const AuthSettings & from) if (!from.session_token.empty()) session_token = from.session_token; - headers = from.headers; - region = from.region; + if (!from.headers.empty()) + headers = from.headers; + if (!from.region.empty()) + region = from.region; + server_side_encryption_customer_key_base64 = from.server_side_encryption_customer_key_base64; server_side_encryption_kms_config = from.server_side_encryption_kms_config; diff --git a/src/Storages/Cache/SchemaCache.cpp b/src/Storages/Cache/SchemaCache.cpp index 299dd292772..35fb8d348ef 100644 --- a/src/Storages/Cache/SchemaCache.cpp +++ b/src/Storages/Cache/SchemaCache.cpp @@ -1,5 +1,6 @@ #include #include +#include #include namespace ProfileEvents @@ -109,6 +110,7 @@ std::optional SchemaCache::tryGetImpl(const Key & key, } ProfileEvents::increment(ProfileEvents::SchemaInferenceCacheHits); + LOG_TEST(&Poco::Logger::get("kssenii"), "KSSENII: {}", StackTrace().toString()); auto & schema_info = it->second.schema_info; auto & queue_iterator = it->second.iterator; diff --git a/src/Storages/ObjectStorage/AzureBlob/Configuration.cpp b/src/Storages/ObjectStorage/AzureBlob/Configuration.cpp index 109918dfc8b..9d21541e7e2 100644 --- a/src/Storages/ObjectStorage/AzureBlob/Configuration.cpp +++ b/src/Storages/ObjectStorage/AzureBlob/Configuration.cpp @@ -102,7 +102,7 @@ AzureObjectStorage::SettingsPtr StorageAzureBlobConfiguration::createSettings(Co return settings_ptr; } -ObjectStoragePtr StorageAzureBlobConfiguration::createOrUpdateObjectStorage(ContextPtr context, bool is_readonly) /// NOLINT +ObjectStoragePtr StorageAzureBlobConfiguration::createObjectStorage(ContextPtr context, bool is_readonly) /// NOLINT { auto client = createClient(is_readonly); auto settings = createSettings(context); @@ -245,8 +245,6 @@ void 
StorageAzureBlobConfiguration::fromNamedCollection(const NamedCollection & compression_method = collection.getOrDefault("compression_method", collection.getOrDefault("compression", "auto")); blobs_paths = {blob_path}; - if (format == "auto") - format = FormatFactory::instance().getFormatFromFileName(blob_path, true); } void StorageAzureBlobConfiguration::fromAST(ASTs & engine_args, ContextPtr context, bool with_structure) @@ -367,9 +365,6 @@ void StorageAzureBlobConfiguration::fromAST(ASTs & engine_args, ContextPtr conte } blobs_paths = {blob_path}; - - if (format == "auto") - format = FormatFactory::instance().getFormatFromFileName(blob_path, true); } void StorageAzureBlobConfiguration::addStructureToArgs(ASTs & args, const String & structure_, ContextPtr context) diff --git a/src/Storages/ObjectStorage/AzureBlob/Configuration.h b/src/Storages/ObjectStorage/AzureBlob/Configuration.h index deeb365d012..3d701e72cb4 100644 --- a/src/Storages/ObjectStorage/AzureBlob/Configuration.h +++ b/src/Storages/ObjectStorage/AzureBlob/Configuration.h @@ -31,7 +31,7 @@ public: String getNamespace() const override { return container; } void check(ContextPtr context) const override; - ObjectStoragePtr createOrUpdateObjectStorage(ContextPtr context, bool is_readonly = true) override; /// NOLINT + ObjectStoragePtr createObjectStorage(ContextPtr context, bool is_readonly = true) override; /// NOLINT StorageObjectStorageConfigurationPtr clone() override { return std::make_shared(*this); } void fromNamedCollection(const NamedCollection & collection) override; diff --git a/src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h b/src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h index 95196cdd000..8a21fc1152f 100644 --- a/src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h +++ b/src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h @@ -39,7 +39,7 @@ public: std::optional format_settings_, bool attach) { - auto object_storage = base_configuration->createOrUpdateObjectStorage(context); + auto object_storage = base_configuration->createObjectStorage(context); DataLakeMetadataPtr metadata; NamesAndTypesList schema_from_metadata; ConfigurationPtr configuration = base_configuration->clone(); @@ -75,28 +75,22 @@ public: return ColumnsDescription(metadata->getTableSchema()); } - std::pair updateConfigurationAndGetCopy(ContextPtr local_context) override + void updateConfiguration(ContextPtr local_context) override { std::lock_guard lock(Storage::configuration_update_mutex); - auto new_object_storage = base_configuration->createOrUpdateObjectStorage(local_context); - bool updated = new_object_storage != nullptr; - if (updated) - Storage::object_storage = new_object_storage; + Storage::updateConfiguration(local_context); auto new_metadata = DataLakeMetadata::create(Storage::object_storage, base_configuration, local_context); - if (!current_metadata || !(*current_metadata == *new_metadata)) - current_metadata = std::move(new_metadata); - else if (!updated) - return {Storage::configuration, Storage::object_storage}; + if (current_metadata && *current_metadata == *new_metadata) + return; + current_metadata = std::move(new_metadata); auto updated_configuration = base_configuration->clone(); /// If metadata wasn't changed, we won't list data files again. 
updated_configuration->getPaths() = current_metadata->getDataFiles(); Storage::configuration = updated_configuration; - - return {Storage::configuration, Storage::object_storage}; } template diff --git a/src/Storages/ObjectStorage/HDFS/Configuration.cpp b/src/Storages/ObjectStorage/HDFS/Configuration.cpp index c80237b3055..731b05f4621 100644 --- a/src/Storages/ObjectStorage/HDFS/Configuration.cpp +++ b/src/Storages/ObjectStorage/HDFS/Configuration.cpp @@ -27,7 +27,7 @@ void StorageHDFSConfiguration::check(ContextPtr context) const checkHDFSURL(url); } -ObjectStoragePtr StorageHDFSConfiguration::createOrUpdateObjectStorage(ContextPtr context, bool is_readonly) /// NOLINT +ObjectStoragePtr StorageHDFSConfiguration::createObjectStorage(ContextPtr context, bool is_readonly) /// NOLINT { UNUSED(is_readonly); auto settings = std::make_unique(); @@ -42,16 +42,13 @@ void StorageHDFSConfiguration::fromAST(ASTs & args, ContextPtr, bool /* with_str if (args.size() > 1) format_name = checkAndGetLiteralArgument(args[1], "format_name"); - if (format_name == "auto") - format_name = FormatFactory::instance().getFormatFromFileName(url, true); - String compression_method; if (args.size() == 3) compression_method = checkAndGetLiteralArgument(args[2], "compression_method"); else compression_method = "auto"; - } + } #endif diff --git a/src/Storages/ObjectStorage/HDFS/Configuration.h b/src/Storages/ObjectStorage/HDFS/Configuration.h index 03fb0824123..1013c2e00c2 100644 --- a/src/Storages/ObjectStorage/HDFS/Configuration.h +++ b/src/Storages/ObjectStorage/HDFS/Configuration.h @@ -26,7 +26,7 @@ public: String getDataSourceDescription() override { return url; } void check(ContextPtr context) const override; - ObjectStoragePtr createOrUpdateObjectStorage(ContextPtr context, bool is_readonly = true) override; /// NOLINT + ObjectStoragePtr createObjectStorage(ContextPtr context, bool is_readonly = true) override; /// NOLINT StorageObjectStorageConfigurationPtr clone() override { return std::make_shared(*this); } void fromNamedCollection(const NamedCollection &) override {} diff --git a/src/Storages/ObjectStorage/ReadBufferIterator.cpp b/src/Storages/ObjectStorage/ReadBufferIterator.cpp index a3e19b907bc..a0e719878ac 100644 --- a/src/Storages/ObjectStorage/ReadBufferIterator.cpp +++ b/src/Storages/ObjectStorage/ReadBufferIterator.cpp @@ -10,6 +10,7 @@ namespace DB namespace ErrorCodes { extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; + extern const int CANNOT_DETECT_FORMAT; } @@ -30,14 +31,15 @@ ReadBufferIterator::ReadBufferIterator( , query_settings(query_settings_) , schema_cache(schema_cache_) , read_keys(read_keys_) + , format(configuration->format.empty() || configuration->format == "auto" ? 
std::nullopt : std::optional(configuration->format)) , prev_read_keys_size(read_keys_.size()) { } -SchemaCache::Key ReadBufferIterator::getKeyForSchemaCache(const String & path) const +SchemaCache::Key ReadBufferIterator::getKeyForSchemaCache(const String & path, const String & format_name) const { auto source = fs::path(configuration->getDataSourceDescription()) / path; - return DB::getKeyForSchemaCache(source, configuration->format, format_settings, getContext()); + return DB::getKeyForSchemaCache(source, format_name, format_settings, getContext()); } SchemaCache::Keys ReadBufferIterator::getPathsForSchemaCache() const @@ -51,7 +53,7 @@ SchemaCache::Keys ReadBufferIterator::getPathsForSchemaCache() const { return fs::path(configuration->getDataSourceDescription()) / elem->relative_path; }); - return DB::getKeysForSchemaCache(sources, configuration->format, format_settings, getContext()); + return DB::getKeysForSchemaCache(sources, *format, format_settings, getContext()); } std::optional ReadBufferIterator::tryGetColumnsFromCache( @@ -75,10 +77,29 @@ std::optional ReadBufferIterator::tryGetColumnsFromCache( } }; - auto cache_key = getKeyForSchemaCache(object_info->relative_path); - auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time); - if (columns) - return columns; + chassert(object_info); + if (format) + { + auto cache_key = getKeyForSchemaCache(object_info->relative_path, *format); + if (auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time)) + return columns; + } + else + { + /// If format is unknown, we can iterate through all possible input formats + /// and check if we have an entry with this format and this file in schema cache. + /// If we have such entry for some format, we can use this format to read the file. + for (const auto & format_name : FormatFactory::instance().getAllInputFormats()) + { + auto cache_key = getKeyForSchemaCache(object_info->relative_path, format_name); + if (auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time)) + { + /// Now format is known. It should be the same for all files. 
+ format = format_name; + return columns; + } + } + } } return std::nullopt; @@ -86,16 +107,18 @@ std::optional ReadBufferIterator::tryGetColumnsFromCache( void ReadBufferIterator::setNumRowsToLastFile(size_t num_rows) { + chassert(current_object_info); if (query_settings.schema_inference_use_cache) - schema_cache.addNumRows(getKeyForSchemaCache(current_object_info->relative_path), num_rows); + schema_cache.addNumRows(getKeyForSchemaCache(current_object_info->relative_path, *format), num_rows); } void ReadBufferIterator::setSchemaToLastFile(const ColumnsDescription & columns) { + chassert(current_object_info); if (query_settings.schema_inference_use_cache && query_settings.schema_inference_mode == SchemaInferenceMode::UNION) { - schema_cache.addColumns(getKeyForSchemaCache(current_object_info->relative_path), columns); + schema_cache.addColumns(getKeyForSchemaCache(current_object_info->relative_path, *format), columns); } } @@ -108,6 +131,11 @@ void ReadBufferIterator::setResultingSchema(const ColumnsDescription & columns) } } +void ReadBufferIterator::setFormatName(const String & format_name) +{ + format = format_name; +} + String ReadBufferIterator::getLastFileName() const { if (current_object_info) @@ -116,64 +144,128 @@ String ReadBufferIterator::getLastFileName() const return ""; } -std::pair, std::optional> ReadBufferIterator::next() +std::unique_ptr ReadBufferIterator::recreateLastReadBuffer() { - /// For default mode check cached columns for currently read keys on first iteration. - if (first && query_settings.schema_inference_mode == SchemaInferenceMode::DEFAULT) - { - if (auto cached_columns = tryGetColumnsFromCache(read_keys.begin(), read_keys.end())) - return {nullptr, cached_columns}; - } + chassert(current_object_info); - current_object_info = file_iterator->next(0); - if (!current_object_info || current_object_info->relative_path.empty()) + auto impl = object_storage->readObject( + StoredObject(current_object_info->relative_path), getContext()->getReadSettings()); + + int zstd_window_log_max = static_cast(getContext()->getSettingsRef().zstd_window_log_max); + return wrapReadBufferWithCompressionMethod( + std::move(impl), chooseCompressionMethod(current_object_info->relative_path, configuration->compression_method), + zstd_window_log_max); +} + +ReadBufferIterator::Data ReadBufferIterator::next() +{ + if (first) { - if (first) + /// If format is unknown we iterate through all currently read keys on first iteration and + /// try to determine format by file name. + if (!format) { - throw Exception( - ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "Cannot extract table structure from {} format file, " - "because there are no files with provided path. " - "You must specify table structure manually", - configuration->format); + for (const auto & object_info : read_keys) + { + if (auto format_from_file_name = FormatFactory::instance().tryGetFormatFromFileName(object_info->relative_path)) + { + format = format_from_file_name; + break; + } + } + } + + /// For default mode check cached columns for currently read keys on first iteration. + if (first && getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT) + { + if (auto cached_columns = tryGetColumnsFromCache(read_keys.begin(), read_keys.end())) + return {nullptr, cached_columns, format}; } - return {nullptr, std::nullopt}; } - first = false; - - /// File iterator could get new keys after new iteration, - /// check them in schema cache if schema inference mode is default. 
- if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT - && read_keys.size() > prev_read_keys_size) + while (true) { - auto columns_from_cache = tryGetColumnsFromCache(read_keys.begin() + prev_read_keys_size, read_keys.end()); - prev_read_keys_size = read_keys.size(); - if (columns_from_cache) - return {nullptr, columns_from_cache}; + current_object_info = file_iterator->next(0); + + if (!current_object_info || current_object_info->relative_path.empty()) + { + if (first) + { + if (format) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "The table structure cannot be extracted from a {} format file, because there are no files with provided path " + "in S3 or all files are empty. You can specify table structure manually", + *format); + + throw Exception( + ErrorCodes::CANNOT_DETECT_FORMAT, + "The data format cannot be detected by the contents of the files, because there are no files with provided path " + "in S3 or all files are empty. You can specify the format manually"); + } + + return {nullptr, std::nullopt, format}; + } + + /// S3 file iterator could get new keys after new iteration + if (read_keys.size() > prev_read_keys_size) + { + /// If format is unknown we can try to determine it by new file names. + if (!format) + { + for (auto it = read_keys.begin() + prev_read_keys_size; it != read_keys.end(); ++it) + { + if (auto format_from_file_name = FormatFactory::instance().tryGetFormatFromFileName((*it)->relative_path)) + { + format = format_from_file_name; + break; + } + } + } + + /// Check new files in schema cache if schema inference mode is default. + if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT) + { + auto columns_from_cache = tryGetColumnsFromCache(read_keys.begin() + prev_read_keys_size, read_keys.end()); + if (columns_from_cache) + return {nullptr, columns_from_cache, format}; + } + + prev_read_keys_size = read_keys.size(); + } + + if (getContext()->getSettingsRef().s3_skip_empty_files + && current_object_info->metadata && current_object_info->metadata->size_bytes == 0) + continue; + + /// In union mode, check cached columns only for current key. 
+ if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::UNION) + { + ObjectInfos objects{current_object_info}; + if (auto columns_from_cache = tryGetColumnsFromCache(objects.begin(), objects.end())) + { + first = false; + return {nullptr, columns_from_cache, format}; + } + } + + std::unique_ptr read_buffer = object_storage->readObject( + StoredObject(current_object_info->relative_path), + getContext()->getReadSettings(), + {}, + current_object_info->metadata->size_bytes); + + if (!getContext()->getSettingsRef().s3_skip_empty_files || !read_buffer->eof()) + { + first = false; + + read_buffer = wrapReadBufferWithCompressionMethod( + std::move(read_buffer), + chooseCompressionMethod(current_object_info->relative_path, configuration->compression_method), + static_cast(getContext()->getSettingsRef().zstd_window_log_max)); + + return {std::move(read_buffer), std::nullopt, format}; + } } - else if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::UNION) - { - ObjectInfos paths = {current_object_info}; - if (auto columns_from_cache = tryGetColumnsFromCache(paths.begin(), paths.end())) - return {nullptr, columns_from_cache}; - } - - first = false; - - chassert(current_object_info->metadata); - std::unique_ptr read_buffer = object_storage->readObject( - StoredObject(current_object_info->relative_path), - getContext()->getReadSettings(), - {}, - current_object_info->metadata->size_bytes); - - read_buffer = wrapReadBufferWithCompressionMethod( - std::move(read_buffer), - chooseCompressionMethod(current_object_info->relative_path, configuration->compression_method), - static_cast(getContext()->getSettingsRef().zstd_window_log_max)); - - return {std::move(read_buffer), std::nullopt}; } - } diff --git a/src/Storages/ObjectStorage/ReadBufferIterator.h b/src/Storages/ObjectStorage/ReadBufferIterator.h index 4e9b8cfcfca..053bcbf894f 100644 --- a/src/Storages/ObjectStorage/ReadBufferIterator.h +++ b/src/Storages/ObjectStorage/ReadBufferIterator.h @@ -2,6 +2,7 @@ #include #include #include +#include #include @@ -23,7 +24,7 @@ public: ObjectInfos & read_keys_, const ContextPtr & context_); - std::pair, std::optional> next() override; + Data next() override; void setNumRowsToLastFile(size_t num_rows) override; @@ -33,8 +34,14 @@ public: String getLastFileName() const override; + void setFormatName(const String & format_name) override; + + bool supportsLastReadBufferRecreation() const override { return true; } + + std::unique_ptr recreateLastReadBuffer() override; + private: - SchemaCache::Key getKeyForSchemaCache(const String & path) const; + SchemaCache::Key getKeyForSchemaCache(const String & path, const String & format_name) const; SchemaCache::Keys getPathsForSchemaCache() const; std::optional tryGetColumnsFromCache( const ObjectInfos::iterator & begin, const ObjectInfos::iterator & end); @@ -46,6 +53,7 @@ private: const StorageObjectStorageSettings query_settings; SchemaCache & schema_cache; ObjectInfos & read_keys; + std::optional format; size_t prev_read_keys_size; ObjectInfoPtr current_object_info; diff --git a/src/Storages/ObjectStorage/S3/Configuration.cpp b/src/Storages/ObjectStorage/S3/Configuration.cpp index f057745d669..896131e74d7 100644 --- a/src/Storages/ObjectStorage/S3/Configuration.cpp +++ b/src/Storages/ObjectStorage/S3/Configuration.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -58,106 +59,47 @@ void StorageS3Configuration::check(ContextPtr context) const 
StorageS3Configuration::StorageS3Configuration(const StorageS3Configuration & other) { url = other.url; - auth_settings = other.auth_settings; - request_settings = other.request_settings; static_configuration = other.static_configuration; headers_from_ast = other.headers_from_ast; keys = other.keys; - initialized = other.initialized; format = other.format; compression_method = other.compression_method; structure = other.structure; } -ObjectStoragePtr StorageS3Configuration::createOrUpdateObjectStorage(ContextPtr context, bool /* is_readonly */) /// NOLINT +ObjectStoragePtr StorageS3Configuration::createObjectStorage(ContextPtr context, bool /* is_readonly */) /// NOLINT { - auto s3_settings = context->getStorageS3Settings().getSettings(url.uri.toString()); - request_settings = s3_settings.request_settings; - request_settings.updateFromSettings(context->getSettings()); + const auto & config = context->getConfigRef(); + const std::string config_prefix = "s3."; - if (!initialized || (!static_configuration && auth_settings.hasUpdates(s3_settings.auth_settings))) + auto s3_settings = getSettings(config, config_prefix, context); + + auth_settings.updateFrom(s3_settings->auth_settings); + s3_settings->auth_settings = auth_settings; + s3_settings->request_settings = request_settings; + + if (!headers_from_ast.empty()) { - auth_settings.updateFrom(s3_settings.auth_settings); - keys[0] = url.key; - initialized = true; + s3_settings->auth_settings.headers.insert( + s3_settings->auth_settings.headers.end(), + headers_from_ast.begin(), headers_from_ast.end()); } - const auto & config = context->getConfigRef(); + if (auto endpoint_settings = context->getStorageS3Settings().getSettings(url.uri.toString())) + s3_settings->auth_settings.updateFrom(endpoint_settings->auth_settings); + + auto client = getClient(config, config_prefix, context, *s3_settings, false, &url); + auto key_generator = createObjectStorageKeysGeneratorAsIsWithPrefix(url.key); auto s3_capabilities = S3Capabilities { .support_batch_delete = config.getBool("s3.support_batch_delete", true), .support_proxy = config.getBool("s3.support_proxy", config.has("s3.proxy")), }; - auto s3_storage_settings = std::make_unique( - request_settings, - config.getUInt64("s3.min_bytes_for_seek", 1024 * 1024), - config.getInt("s3.list_object_keys_size", 1000), - config.getInt("s3.objects_chunk_size_to_delete", 1000), - config.getBool("s3.readonly", false)); - - auto key_generator = createObjectStorageKeysGeneratorAsIsWithPrefix(url.key); - auto client = createClient(context); - std::string disk_name = "StorageS3"; - return std::make_shared( - std::move(client), std::move(s3_storage_settings), url, s3_capabilities, key_generator, /*disk_name*/disk_name); -} - -std::unique_ptr StorageS3Configuration::createClient(ContextPtr context) -{ - const Settings & global_settings = context->getGlobalContext()->getSettingsRef(); - const Settings & local_settings = context->getSettingsRef(); - - auto client_configuration = S3::ClientFactory::instance().createClientConfiguration( - auth_settings.region, - context->getRemoteHostFilter(), - static_cast(global_settings.s3_max_redirects), - static_cast(global_settings.s3_retry_attempts), - global_settings.enable_s3_requests_logging, - /* for_disk_s3 = */ false, - request_settings.get_request_throttler, - request_settings.put_request_throttler, - url.uri.getScheme()); - - client_configuration.endpointOverride = url.endpoint; - client_configuration.maxConnections = static_cast(request_settings.max_connections); - 
client_configuration.http_connection_pool_size = global_settings.s3_http_connection_pool_size; - - auto headers = auth_settings.headers; - if (!headers_from_ast.empty()) - headers.insert(headers.end(), headers_from_ast.begin(), headers_from_ast.end()); - - client_configuration.requestTimeoutMs = request_settings.request_timeout_ms; - - S3::ClientSettings client_settings{ - .use_virtual_addressing = url.is_virtual_hosted_style, - .disable_checksum = local_settings.s3_disable_checksum, - .gcs_issue_compose_request = context->getConfigRef().getBool("s3.gcs_issue_compose_request", false), - }; - - auto credentials = Aws::Auth::AWSCredentials(auth_settings.access_key_id, - auth_settings.secret_access_key, - auth_settings.session_token); - - auto credentials_configuration = S3::CredentialsConfiguration - { - auth_settings.use_environment_credentials.value_or(context->getConfigRef().getBool("s3.use_environment_credentials", true)), - auth_settings.use_insecure_imds_request.value_or(context->getConfigRef().getBool("s3.use_insecure_imds_request", false)), - auth_settings.expiration_window_seconds.value_or(context->getConfigRef().getUInt64("s3.expiration_window_seconds", S3::DEFAULT_EXPIRATION_WINDOW_SECONDS)), - auth_settings.no_sign_request.value_or(context->getConfigRef().getBool("s3.no_sign_request", false)), - }; - - return S3::ClientFactory::instance().create( - client_configuration, - client_settings, - credentials.GetAWSAccessKeyId(), - credentials.GetAWSSecretKey(), - auth_settings.server_side_encryption_customer_key_base64, - auth_settings.server_side_encryption_kms_config, - std::move(headers), - credentials_configuration); + std::move(client), std::move(s3_settings), url, s3_capabilities, + key_generator, "StorageS3", false, headers_from_ast); } void StorageS3Configuration::fromNamedCollection(const NamedCollection & collection) @@ -185,10 +127,6 @@ void StorageS3Configuration::fromNamedCollection(const NamedCollection & collect static_configuration = !auth_settings.access_key_id.empty() || auth_settings.no_sign_request.has_value(); keys = {url.key}; - - //if (format == "auto" && get_format_from_file) - if (format == "auto") - format = FormatFactory::instance().getFormatFromFileName(url.key, true); } void StorageS3Configuration::fromAST(ASTs & args, ContextPtr context, bool with_structure) @@ -386,10 +324,6 @@ void StorageS3Configuration::fromAST(ASTs & args, ContextPtr context, bool with_ auth_settings.no_sign_request = no_sign_request; keys = {url.key}; - - // if (format == "auto" && get_format_from_file) - if (format == "auto") - format = FormatFactory::instance().getFormatFromFileName(url.key, true); } void StorageS3Configuration::addStructureToArgs(ASTs & args, const String & structure_, ContextPtr context) diff --git a/src/Storages/ObjectStorage/S3/Configuration.h b/src/Storages/ObjectStorage/S3/Configuration.h index 037cf2eae87..88a084f29b3 100644 --- a/src/Storages/ObjectStorage/S3/Configuration.h +++ b/src/Storages/ObjectStorage/S3/Configuration.h @@ -27,27 +27,25 @@ public: String getDataSourceDescription() override; void check(ContextPtr context) const override; - ObjectStoragePtr createOrUpdateObjectStorage(ContextPtr context, bool is_readonly = true) override; /// NOLINT StorageObjectStorageConfigurationPtr clone() override { return std::make_shared(*this); } + bool isStaticConfiguration() const override { return static_configuration; } - void fromNamedCollection(const NamedCollection & collection) override; - void fromAST(ASTs & args, ContextPtr context, bool 
with_structure) override; + ObjectStoragePtr createObjectStorage(ContextPtr context, bool is_readonly = true) override; /// NOLINT static void addStructureToArgs(ASTs & args, const String & structure, ContextPtr context); private: + void fromNamedCollection(const NamedCollection & collection) override; + void fromAST(ASTs & args, ContextPtr context, bool with_structure) override; + S3::URI url; + std::vector keys; + S3::AuthSettings auth_settings; S3Settings::RequestSettings request_settings; + HTTPHeaderEntries headers_from_ast; /// Headers from ast is a part of static configuration. /// If s3 configuration was passed from ast, then it is static. /// If from config - it can be changed with config reload. bool static_configuration = true; - /// Headers from ast is a part of static configuration. - HTTPHeaderEntries headers_from_ast; - std::vector keys; - - std::unique_ptr createClient(ContextPtr context); - - bool initialized = false; }; } diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index 2e834da5529..7337a528a76 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -13,8 +14,9 @@ #include #include #include -#include #include +#include +#include namespace DB @@ -39,21 +41,24 @@ std::unique_ptr getStorageMetadata( const std::string & engine_name, const ContextPtr & context) { + using Storage = StorageObjectStorage; + auto storage_metadata = std::make_unique(); if (columns.empty()) { - auto fetched_columns = StorageObjectStorage::getTableStructureFromData( - object_storage, configuration, format_settings, context); + auto fetched_columns = Storage::getTableStructureFromData(object_storage, configuration, format_settings, context); storage_metadata->setColumns(fetched_columns); } + else if (!columns.hasOnlyOrdinary()) + { + /// We don't allow special columns. + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Table engine {} doesn't support special columns " + "like MATERIALIZED, ALIAS or EPHEMERAL", engine_name); + } else { - /// We don't allow special columns. 
- if (!columns.hasOnlyOrdinary()) - throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Table engine {} doesn't support special columns " - "like MATERIALIZED, ALIAS or EPHEMERAL", - engine_name); + if (configuration->format == "auto") + Storage::setFormatFromData(object_storage, configuration, format_settings, context); storage_metadata->setColumns(columns); } @@ -120,14 +125,10 @@ bool StorageObjectStorage::parallelizeOutputAfterReading(Contex } template -std::pair -StorageObjectStorage::updateConfigurationAndGetCopy(ContextPtr local_context) +void StorageObjectStorage::updateConfiguration(ContextPtr context) { - std::lock_guard lock(configuration_update_mutex); - auto new_object_storage = configuration->createOrUpdateObjectStorage(local_context); - if (new_object_storage) - object_storage = new_object_storage; - return {configuration, object_storage}; + if (!configuration->isStaticConfiguration()) + object_storage->applyNewSettings(context->getConfigRef(), "s3.", context); } template @@ -151,8 +152,8 @@ void StorageObjectStorage::read( size_t max_block_size, size_t num_streams) { - auto [query_configuration, query_object_storage] = updateConfigurationAndGetCopy(local_context); - if (partition_by && query_configuration->withWildcard()) + updateConfiguration(local_context); + if (partition_by && configuration->withWildcard()) { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Reading from a partitioned {} storage is not implemented yet", @@ -165,8 +166,8 @@ void StorageObjectStorage::read( && local_context->getSettingsRef().optimize_count_from_files; auto read_step = std::make_unique( - query_object_storage, - query_configuration, + object_storage, + configuration, getName(), virtual_columns, format_settings, @@ -192,10 +193,10 @@ SinkToStoragePtr StorageObjectStorage::write( ContextPtr local_context, bool /* async_insert */) { - auto [query_configuration, query_object_storage] = updateConfigurationAndGetCopy(local_context); + updateConfiguration(local_context); const auto sample_block = metadata_snapshot->getSampleBlock(); - if (query_configuration->withWildcard()) + if (configuration->withWildcard()) { ASTPtr partition_by_ast = nullptr; if (auto insert_query = std::dynamic_pointer_cast(query)) @@ -209,24 +210,28 @@ SinkToStoragePtr StorageObjectStorage::write( if (partition_by_ast) { return std::make_shared( - object_storage, query_configuration, format_settings, sample_block, local_context, partition_by_ast); + object_storage, configuration, format_settings, sample_block, local_context, partition_by_ast); } } - if (query_configuration->withGlobs()) + if (configuration->withGlobs()) { throw Exception(ErrorCodes::DATABASE_ACCESS_DENIED, "{} key '{}' contains globs, so the table is in readonly mode", - getName(), query_configuration->getPath()); + getName(), configuration->getPath()); } const auto storage_settings = StorageSettings::create(local_context->getSettingsRef()); + + LOG_TEST(&Poco::Logger::get("KSSENII"), "KSSENII: {}", object_storage->exists(StoredObject(configuration->getPath()))); + auto configuration_copy = configuration->clone(); if (!storage_settings.truncate_on_insert - && object_storage->exists(StoredObject(query_configuration->getPath()))) + && object_storage->exists(StoredObject(configuration->getPath()))) { + LOG_TEST(&Poco::Logger::get("KSSENII"), "KSSENII 2: {}", storage_settings.create_new_file_on_insert); if (storage_settings.create_new_file_on_insert) { - auto & paths = query_configuration->getPaths(); + auto & paths = configuration_copy->getPaths(); size_t index = 
paths.size(); const auto & first_key = paths[0]; auto pos = first_key.find_first_of('.'); @@ -243,6 +248,7 @@ SinkToStoragePtr StorageObjectStorage::write( while (object_storage->exists(StoredObject(new_key))); paths.push_back(new_key); + configuration->getPaths().push_back(new_key); } else { @@ -251,12 +257,13 @@ SinkToStoragePtr StorageObjectStorage::write( "Object in bucket {} with key {} already exists. " "If you want to overwrite it, enable setting [engine_name]_truncate_on_insert, if you " "want to create a new file on each insert, enable setting [engine_name]_create_new_file_on_insert", - query_configuration->getNamespace(), query_configuration->getPaths().back()); + configuration_copy->getNamespace(), configuration_copy->getPaths().back()); } } + LOG_TEST(&Poco::Logger::get("KSSENII"), "KSSENII 3: {}", configuration_copy->getPaths().size()); return std::make_shared( - object_storage, query_configuration, format_settings, sample_block, local_context); + object_storage, configuration_copy, format_settings, sample_block, local_context); } template @@ -279,25 +286,55 @@ void StorageObjectStorage::truncate( } template -ColumnsDescription StorageObjectStorage::getTableStructureFromData( - ObjectStoragePtr object_storage, +std::unique_ptr StorageObjectStorage::createReadBufferIterator( + const ObjectStoragePtr & object_storage, const ConfigurationPtr & configuration, const std::optional & format_settings, - ContextPtr context) + ObjectInfos & read_keys, + const ContextPtr & context) { - ObjectInfos read_keys; const auto settings = StorageSettings::create(context->getSettingsRef()); auto file_iterator = StorageObjectStorageSource::createFileIterator( configuration, object_storage, /* distributed_processing */false, context, /* predicate */{}, /* virtual_columns */{}, &read_keys, settings.list_object_keys_size, StorageSettings::ObjectStorageThreads(), StorageSettings::ObjectStorageThreadsActive(), StorageSettings::ObjectStorageThreadsScheduled()); - ReadBufferIterator read_buffer_iterator( + return std::make_unique( object_storage, configuration, file_iterator, format_settings, StorageSettings::create(context->getSettingsRef()), getSchemaCache(context), read_keys, context); +} - const bool retry = configuration->withGlobs(); - return readSchemaFromFormat(configuration->format, format_settings, read_buffer_iterator, retry, context); +template +ColumnsDescription StorageObjectStorage::getTableStructureFromData( + const ObjectStoragePtr & object_storage, + const ConfigurationPtr & configuration, + const std::optional & format_settings, + const ContextPtr & context) +{ + ObjectInfos read_keys; + auto read_buffer_iterator = createReadBufferIterator(object_storage, configuration, format_settings, read_keys, context); + if (configuration->format == "auto") + { + auto [columns, format] = detectFormatAndReadSchema(format_settings, *read_buffer_iterator, context); + configuration->format = format; + return columns; + } + else + { + return readSchemaFromFormat(configuration->format, format_settings, *read_buffer_iterator, context); + } +} + +template +void StorageObjectStorage::setFormatFromData( + const ObjectStoragePtr & object_storage, + const ConfigurationPtr & configuration, + const std::optional & format_settings, + const ContextPtr & context) +{ + ObjectInfos read_keys; + auto read_buffer_iterator = createReadBufferIterator(object_storage, configuration, format_settings, read_keys, context); + configuration->format = detectFormatAndReadSchema(format_settings, *read_buffer_iterator, 
context).second; } template class StorageObjectStorage; diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.h b/src/Storages/ObjectStorage/StorageObjectStorage.h index 6f18153c7af..64c4c74ab22 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.h +++ b/src/Storages/ObjectStorage/StorageObjectStorage.h @@ -21,6 +21,7 @@ using ReadTaskCallback = std::function; class IOutputFormat; class IInputFormat; class SchemaCache; +class ReadBufferIterator; template @@ -89,13 +90,26 @@ public: static SchemaCache & getSchemaCache(const ContextPtr & context); static ColumnsDescription getTableStructureFromData( - ObjectStoragePtr object_storage, + const ObjectStoragePtr & object_storage, const ConfigurationPtr & configuration, const std::optional & format_settings, - ContextPtr context); + const ContextPtr & context); + + static void setFormatFromData( + const ObjectStoragePtr & object_storage, + const ConfigurationPtr & configuration, + const std::optional & format_settings, + const ContextPtr & context); protected: - virtual std::pair updateConfigurationAndGetCopy(ContextPtr local_context); + virtual void updateConfiguration(ContextPtr local_context); + + static std::unique_ptr createReadBufferIterator( + const ObjectStoragePtr & object_storage, + const ConfigurationPtr & configuration, + const std::optional & format_settings, + ObjectInfos & read_keys, + const ContextPtr & context); const std::string engine_name; const NamesAndTypesList virtual_columns; diff --git a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp index f0d9ea400c4..2bd2c022aa8 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp @@ -33,12 +33,10 @@ StorageObjectStorageCluster::Storage const StorageID & table_id_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, - ContextPtr context_, - bool structure_argument_was_provided_) + ContextPtr context_) : IStorageCluster(cluster_name_, table_id_, - getLogger(fmt::format("{}({})", engine_name_, table_id_.table_name)), - structure_argument_was_provided_) + getLogger(fmt::format("{}({})", engine_name_, table_id_.table_name))) , engine_name(engine_name_) , configuration{configuration_} , object_storage(object_storage_) @@ -48,13 +46,16 @@ StorageObjectStorageCluster::Storage if (columns_.empty()) { - /// `format_settings` is set to std::nullopt, because StorageObjectStorageCluster is used only as table function - auto columns = StorageObjectStorage::getTableStructureFromData( - object_storage, configuration, /*format_settings=*/std::nullopt, context_); + ColumnsDescription columns = Storage::getTableStructureFromData(object_storage, configuration, /*format_settings=*/std::nullopt, context_); storage_metadata.setColumns(columns); } else + { + if (configuration->format == "auto") + StorageS3::setFormatFromData(object_storage, configuration, /*format_settings=*/std::nullopt, context_); + storage_metadata.setColumns(columns_); + } storage_metadata.setConstraints(constraints_); setInMemoryMetadata(storage_metadata); @@ -64,9 +65,9 @@ StorageObjectStorageCluster::Storage } template -void StorageObjectStorageCluster::addColumnsStructureToQuery( +void StorageObjectStorageCluster::updateQueryToSendIfNeeded( ASTPtr & query, - const String & structure, + const DB::StorageSnapshotPtr & storage_snapshot, const ContextPtr & context) { ASTExpressionList * expression_list = 
extractTableFunctionArgumentsFromSelectQuery(query); @@ -76,13 +77,18 @@ void StorageObjectStorageCluster::ad "Expected SELECT query from table function {}, got '{}'", engine_name, queryToString(query)); } - using TableFunction = TableFunctionObjectStorageCluster; - TableFunction::addColumnsStructureToArguments(expression_list->children, structure, context); + + TableFunction::updateStructureAndFormatArgumentsIfNeeded( + expression_list->children, + storage_snapshot->metadata->getColumns().getAll().toNamesAndTypesDescription(), + configuration->format, + context); } template RemoteQueryExecutor::Extension -StorageObjectStorageCluster::getTaskIteratorExtension(const ActionsDAG::Node * predicate, const ContextPtr & local_context) const +StorageObjectStorageCluster::getTaskIteratorExtension( + const ActionsDAG::Node * predicate, const ContextPtr & local_context) const { const auto settings = StorageSettings::create(local_context->getSettingsRef()); auto iterator = std::make_shared( diff --git a/src/Storages/ObjectStorage/StorageObjectStorageCluster.h b/src/Storages/ObjectStorage/StorageObjectStorageCluster.h index 507de20e888..5d77d4ced60 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageCluster.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageCluster.h @@ -21,6 +21,7 @@ class StorageObjectStorageCluster : public IStorageCluster { public: using Storage = StorageObjectStorage; + using TableFunction = TableFunctionObjectStorageCluster; StorageObjectStorageCluster( const String & cluster_name_, @@ -30,8 +31,7 @@ public: const StorageID & table_id_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, - ContextPtr context_, - bool structure_argument_was_provided_); + ContextPtr context_); std::string getName() const override { return engine_name; } @@ -49,9 +49,9 @@ public: private: void updateBeforeRead(const ContextPtr & /* context */) override {} - void addColumnsStructureToQuery( + void updateQueryToSendIfNeeded( ASTPtr & query, - const String & structure, + const StorageSnapshotPtr & storage_snapshot, const ContextPtr & context) override; const String & engine_name; diff --git a/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.cpp b/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.cpp index 651f1d25ec1..a1c7d468fa6 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.cpp @@ -1,5 +1,5 @@ #include - +#include namespace DB { @@ -14,6 +14,10 @@ void StorageObjectStorageConfiguration::initialize( configuration.fromNamedCollection(*named_collection); else configuration.fromAST(engine_args, local_context, with_table_structure); + + // FIXME: it should be - if (format == "auto" && get_format_from_file) + if (configuration.format == "auto") + configuration.format = FormatFactory::instance().tryGetFormatFromFileName(configuration.getPath()).value_or("auto"); } bool StorageObjectStorageConfiguration::withWildcard() const diff --git a/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.h b/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.h index 04b2d8e8fd9..2da262eb55d 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.h @@ -39,8 +39,9 @@ public: std::string getPathWithoutGlob() const; virtual void check(ContextPtr context) const = 0; - virtual ObjectStoragePtr createOrUpdateObjectStorage(ContextPtr context, bool is_readonly = true) 
= 0; /// NOLINT + virtual ObjectStoragePtr createObjectStorage(ContextPtr context, bool is_readonly = true) = 0; /// NOLINT virtual StorageObjectStorageConfigurationPtr clone() = 0; + virtual bool isStaticConfiguration() const { return true; } String format = "auto"; String compression_method = "auto"; diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.h b/src/Storages/ObjectStorage/StorageObjectStorageSource.h index 14e59312c8c..3b503fd4f0c 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.h @@ -5,10 +5,14 @@ #include #include #include +#include namespace DB { + +class SchemaCache; + class StorageObjectStorageSource : public SourceWithKeyCondition, WithContext { friend class StorageS3QueueSource; diff --git a/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp b/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp index e23457c04e9..3271b766f68 100644 --- a/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp @@ -56,7 +56,7 @@ static std::shared_ptr> createStorageObjec return std::make_shared>( configuration, - configuration->createOrUpdateObjectStorage(context), + configuration->createObjectStorage(context), engine_name, args.getContext(), args.table_id, diff --git a/src/Storages/S3Queue/StorageS3Queue.cpp b/src/Storages/S3Queue/StorageS3Queue.cpp index 2673aa94347..bd526ad687b 100644 --- a/src/Storages/S3Queue/StorageS3Queue.cpp +++ b/src/Storages/S3Queue/StorageS3Queue.cpp @@ -134,7 +134,7 @@ StorageS3Queue::StorageS3Queue( checkAndAdjustSettings(*s3queue_settings, context_->getSettingsRef()); - object_storage = configuration->createOrUpdateObjectStorage(context_); + object_storage = configuration->createObjectStorage(context_); FormatFactory::instance().checkFormatName(configuration->format); configuration->check(context_); @@ -146,8 +146,10 @@ StorageS3Queue::StorageS3Queue( } else { - if (configuration.format == "auto") - configuration.format = StorageS3::getTableStructureAndFormatFromData(configuration, format_settings, context_).second; + if (configuration->format == "auto") + { + StorageObjectStorage::setFormatFromData(object_storage, configuration, format_settings, context_); + } storage_metadata.setColumns(columns_); } diff --git a/src/Storages/StorageS3Settings.cpp b/src/Storages/StorageS3Settings.cpp index b0c1160429a..8510a6e4bdd 100644 --- a/src/Storages/StorageS3Settings.cpp +++ b/src/Storages/StorageS3Settings.cpp @@ -21,7 +21,7 @@ namespace ErrorCodes S3Settings::RequestSettings::PartUploadSettings::PartUploadSettings(const Settings & settings) { - updateFromSettingsImpl(settings, false); + updateFromSettings(settings, false); validate(); } @@ -66,7 +66,7 @@ S3Settings::RequestSettings::PartUploadSettings::PartUploadSettings(const NamedC validate(); } -void S3Settings::RequestSettings::PartUploadSettings::updateFromSettingsImpl(const Settings & settings, bool if_changed) +void S3Settings::RequestSettings::PartUploadSettings::updateFromSettings(const Settings & settings, bool if_changed) { if (!if_changed || settings.s3_strict_upload_part_size.changed) strict_upload_part_size = settings.s3_strict_upload_part_size; @@ -263,13 +263,12 @@ void S3Settings::RequestSettings::updateFromSettingsImpl(const Settings & settin request_timeout_ms = settings.s3_request_timeout_ms; } -void S3Settings::RequestSettings::updateFromSettings(const Settings & settings) +void 
S3Settings::RequestSettings::updateFromSettingsIfChanged(const Settings & settings) { updateFromSettingsImpl(settings, true); - upload_settings.updateFromSettings(settings); + upload_settings.updateFromSettings(settings, true); } - void StorageS3Settings::loadFromConfig(const String & config_elem, const Poco::Util::AbstractConfiguration & config, const Settings & settings) { std::lock_guard lock(mutex); @@ -293,7 +292,7 @@ void StorageS3Settings::loadFromConfig(const String & config_elem, const Poco::U } } -S3Settings StorageS3Settings::getSettings(const String & endpoint) const +std::optional StorageS3Settings::getSettings(const String & endpoint) const { std::lock_guard lock(mutex); auto next_prefix_setting = s3_settings.upper_bound(endpoint); diff --git a/src/Storages/StorageS3Settings.h b/src/Storages/StorageS3Settings.h index 0e152bb2d31..a4bc9f0b5cf 100644 --- a/src/Storages/StorageS3Settings.h +++ b/src/Storages/StorageS3Settings.h @@ -39,7 +39,7 @@ struct S3Settings size_t max_single_operation_copy_size = 5ULL * 1024 * 1024 * 1024; String storage_class_name; - void updateFromSettings(const Settings & settings) { updateFromSettingsImpl(settings, true); } + void updateFromSettings(const Settings & settings, bool if_changed); void validate(); private: @@ -52,8 +52,6 @@ struct S3Settings const Settings & settings, String setting_name_prefix = {}); - void updateFromSettingsImpl(const Settings & settings, bool if_changed); - friend struct RequestSettings; }; @@ -96,7 +94,7 @@ struct S3Settings const Settings & settings, String setting_name_prefix = {}); - void updateFromSettings(const Settings & settings); + void updateFromSettingsIfChanged(const Settings & settings); private: void updateFromSettingsImpl(const Settings & settings, bool if_changed); @@ -112,7 +110,7 @@ class StorageS3Settings public: void loadFromConfig(const String & config_elem, const Poco::Util::AbstractConfiguration & config, const Settings & settings); - S3Settings getSettings(const String & endpoint) const; + std::optional getSettings(const String & endpoint) const; private: mutable std::mutex mutex; diff --git a/src/TableFunctions/TableFunctionObjectStorage.cpp b/src/TableFunctions/TableFunctionObjectStorage.cpp index a48c95469d0..b07b328eed9 100644 --- a/src/TableFunctions/TableFunctionObjectStorage.cpp +++ b/src/TableFunctions/TableFunctionObjectStorage.cpp @@ -31,7 +31,7 @@ ObjectStoragePtr TableFunctionObjectStorage< Definition, StorageSettings, Configuration>::getObjectStorage(const ContextPtr & context, bool create_readonly) const { if (!object_storage) - object_storage = configuration->createOrUpdateObjectStorage(context, create_readonly); + object_storage = configuration->createObjectStorage(context, create_readonly); return object_storage; } @@ -63,8 +63,8 @@ std::vector TableFunctionObjectStorage< } template -void TableFunctionObjectStorage< - Definition, StorageSettings, Configuration>::addColumnsStructureToArguments(ASTs & args, const String & structure, const ContextPtr & context) +void TableFunctionObjectStorage::updateStructureAndFormatArgumentsIfNeeded( + ASTs & args, const String & structure, const String & /* format */, const ContextPtr & context) { Configuration::addStructureToArgs(args, structure, context); } diff --git a/src/TableFunctions/TableFunctionObjectStorage.h b/src/TableFunctions/TableFunctionObjectStorage.h index 5e180301862..9022f6e577f 100644 --- a/src/TableFunctions/TableFunctionObjectStorage.h +++ b/src/TableFunctions/TableFunctionObjectStorage.h @@ -110,7 +110,11 @@ public: 
virtual void parseArgumentsImpl(ASTs & args, const ContextPtr & context); - static void addColumnsStructureToArguments(ASTs & args, const String & structure, const ContextPtr & context); + static void updateStructureAndFormatArgumentsIfNeeded( + ASTs & args, + const String & structure, + const String & format, + const ContextPtr & context); protected: using ConfigurationPtr = StorageObjectStorageConfigurationPtr; diff --git a/src/TableFunctions/TableFunctionObjectStorageCluster.cpp b/src/TableFunctions/TableFunctionObjectStorageCluster.cpp index 5a29a693431..55b41cf6ca8 100644 --- a/src/TableFunctions/TableFunctionObjectStorageCluster.cpp +++ b/src/TableFunctions/TableFunctionObjectStorageCluster.cpp @@ -20,12 +20,10 @@ StoragePtr TableFunctionObjectStorageCluster; - auto configuration = Base::getConfiguration(); - bool structure_argument_was_provided = configuration->structure != "auto"; ColumnsDescription columns; - if (structure_argument_was_provided) + if (configuration->structure != "auto") columns = parseColumnsListFromString(configuration->structure, context); else if (!Base::structure_hint.empty()) columns = Base::structure_hint; @@ -58,8 +56,7 @@ StoragePtr TableFunctionObjectStorageClusterstartup(); From 2e9b6545b6f060e1fa92970276116734f483f417 Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 19 Feb 2024 18:24:23 +0100 Subject: [PATCH 020/392] Fix --- src/Disks/ObjectStorages/S3/diskSettings.cpp | 16 ++++++------- src/Storages/Cache/SchemaCache.cpp | 1 - .../ObjectStorage/StorageObjectStorage.cpp | 3 --- .../StorageObjectStorageCluster.cpp | 3 ++- .../StorageObjectStorageSource.cpp | 24 ++++++++++--------- .../TableFunctionObjectStorageCluster.cpp | 2 +- 6 files changed, 23 insertions(+), 26 deletions(-) diff --git a/src/Disks/ObjectStorages/S3/diskSettings.cpp b/src/Disks/ObjectStorages/S3/diskSettings.cpp index cb2bb690292..43b1cffb3e6 100644 --- a/src/Disks/ObjectStorages/S3/diskSettings.cpp +++ b/src/Disks/ObjectStorages/S3/diskSettings.cpp @@ -27,12 +27,8 @@ std::unique_ptr getSettings( const Poco::Util::AbstractConfiguration & config, const String & config_prefix, ContextPtr context) { const Settings & settings = context->getSettingsRef(); - S3Settings::RequestSettings request_settings(config, config_prefix, settings, "s3_"); - /// TODO: add request settings prefix, becausse for StorageS3 it should be "s3." 
- - S3::AuthSettings auth_settings; - auth_settings.loadFromConfig(config_prefix, config); - + auto request_settings = S3Settings::RequestSettings(config, config_prefix, settings, "s3_"); + auto auth_settings = S3::AuthSettings::loadFromConfig(config_prefix, config); return std::make_unique( request_settings, auth_settings, @@ -60,9 +56,9 @@ std::unique_ptr getClient( if (for_disk_s3) { String endpoint = context->getMacros()->expand(config.getString(config_prefix + ".endpoint")); - S3::URI uri(endpoint); - if (!uri.key.ends_with('/')) - uri.key.push_back('/'); + url = S3::URI(endpoint); + if (!url.key.ends_with('/')) + url.key.push_back('/'); } else { @@ -123,6 +119,8 @@ std::unique_ptr getClient( auth_settings.no_sign_request.value_or(context->getConfigRef().getBool("s3.no_sign_request", false)), }; + LOG_TEST(&Poco::Logger::get("kssenii"), "KSSENII: {} - {}", auth_settings.access_key_id, auth_settings.secret_access_key); + return S3::ClientFactory::instance().create( client_configuration, client_settings, diff --git a/src/Storages/Cache/SchemaCache.cpp b/src/Storages/Cache/SchemaCache.cpp index 35fb8d348ef..5dc39f04ae0 100644 --- a/src/Storages/Cache/SchemaCache.cpp +++ b/src/Storages/Cache/SchemaCache.cpp @@ -110,7 +110,6 @@ std::optional SchemaCache::tryGetImpl(const Key & key, } ProfileEvents::increment(ProfileEvents::SchemaInferenceCacheHits); - LOG_TEST(&Poco::Logger::get("kssenii"), "KSSENII: {}", StackTrace().toString()); auto & schema_info = it->second.schema_info; auto & queue_iterator = it->second.iterator; diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index 7337a528a76..30f5c36879c 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -223,12 +223,10 @@ SinkToStoragePtr StorageObjectStorage::write( const auto storage_settings = StorageSettings::create(local_context->getSettingsRef()); - LOG_TEST(&Poco::Logger::get("KSSENII"), "KSSENII: {}", object_storage->exists(StoredObject(configuration->getPath()))); auto configuration_copy = configuration->clone(); if (!storage_settings.truncate_on_insert && object_storage->exists(StoredObject(configuration->getPath()))) { - LOG_TEST(&Poco::Logger::get("KSSENII"), "KSSENII 2: {}", storage_settings.create_new_file_on_insert); if (storage_settings.create_new_file_on_insert) { auto & paths = configuration_copy->getPaths(); @@ -260,7 +258,6 @@ SinkToStoragePtr StorageObjectStorage::write( configuration_copy->getNamespace(), configuration_copy->getPaths().back()); } } - LOG_TEST(&Poco::Logger::get("KSSENII"), "KSSENII 3: {}", configuration_copy->getPaths().size()); return std::make_shared( object_storage, configuration_copy, format_settings, sample_block, local_context); diff --git a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp index 2bd2c022aa8..9b98051086d 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp @@ -92,7 +92,8 @@ StorageObjectStorageCluster::getTask { const auto settings = StorageSettings::create(local_context->getSettingsRef()); auto iterator = std::make_shared( - object_storage, configuration, predicate, virtual_columns, local_context, nullptr, settings.list_object_keys_size); + object_storage, configuration, predicate, virtual_columns, local_context, + nullptr, settings.list_object_keys_size, 
local_context->getFileProgressCallback()); auto callback = std::make_shared>([iterator]() mutable -> String { diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index a8bde4cd56f..d91850bf99c 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -362,9 +362,9 @@ StorageObjectStorageSource::GlobIterator::GlobIterator( } else { - const auto key_with_globs = configuration_->getPath(); - auto object_metadata = object_storage->getObjectMetadata(key_with_globs); - auto object_info = std::make_shared(key_with_globs, object_metadata); + const auto object_key = configuration_->getPath(); + auto object_metadata = object_storage->getObjectMetadata(object_key); + auto object_info = std::make_shared(object_key, object_metadata); object_infos.emplace_back(object_info); if (read_keys) @@ -381,12 +381,11 @@ ObjectInfoPtr StorageObjectStorageSource::GlobIterator::next(size_t /* processor { std::lock_guard lock(next_mutex); - if (is_finished) + bool current_batch_processed = object_infos.empty() || index >= object_infos.size(); + if (is_finished && current_batch_processed) return {}; - bool need_new_batch = object_infos.empty() || index >= object_infos.size(); - - if (need_new_batch) + if (current_batch_processed) { ObjectInfos new_batch; while (new_batch.empty()) @@ -439,11 +438,10 @@ ObjectInfoPtr StorageObjectStorageSource::GlobIterator::next(size_t /* processor } } - size_t current_index = index++; - if (current_index >= object_infos.size()) + if (index >= object_infos.size()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Index out of bound for blob metadata"); - return object_infos[current_index]; + return object_infos[index++]; } StorageObjectStorageSource::KeysIterator::KeysIterator( @@ -532,7 +530,11 @@ StorageObjectStorageSource::ReadTaskIterator::ReadTaskIterator( pool.wait(); buffer.reserve(max_threads_count); for (auto & key_future : keys) - buffer.emplace_back(std::make_shared(key_future.get(), std::nullopt)); + { + auto key = key_future.get(); + if (!key.empty()) + buffer.emplace_back(std::make_shared(key, std::nullopt)); + } } ObjectInfoPtr StorageObjectStorageSource::ReadTaskIterator::next(size_t) diff --git a/src/TableFunctions/TableFunctionObjectStorageCluster.cpp b/src/TableFunctions/TableFunctionObjectStorageCluster.cpp index 55b41cf6ca8..4ec94cfaf7c 100644 --- a/src/TableFunctions/TableFunctionObjectStorageCluster.cpp +++ b/src/TableFunctions/TableFunctionObjectStorageCluster.cpp @@ -43,7 +43,7 @@ StoragePtr TableFunctionObjectStorageCluster Date: Mon, 19 Feb 2024 20:29:22 +0100 Subject: [PATCH 021/392] Fix style check --- src/Disks/ObjectStorages/S3/diskSettings.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/Disks/ObjectStorages/S3/diskSettings.cpp b/src/Disks/ObjectStorages/S3/diskSettings.cpp index 43b1cffb3e6..6fec4758456 100644 --- a/src/Disks/ObjectStorages/S3/diskSettings.cpp +++ b/src/Disks/ObjectStorages/S3/diskSettings.cpp @@ -22,6 +22,10 @@ namespace DB { +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} std::unique_ptr getSettings( const Poco::Util::AbstractConfiguration & config, const String & config_prefix, ContextPtr context) From d88f8646b180f0ca4fec7bab5c9c9c7cc7574c0c Mon Sep 17 00:00:00 2001 From: kssenii Date: Wed, 21 Feb 2024 11:03:12 +0100 Subject: [PATCH 022/392] Fix after merge with master --- src/Coordination/Standalone/Context.cpp | 15 +++++++++++++++ 
src/Coordination/Standalone/Context.h | 3 +++ src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp | 2 +- src/Storages/ObjectStorage/S3/Configuration.cpp | 2 +- 4 files changed, 20 insertions(+), 2 deletions(-) diff --git a/src/Coordination/Standalone/Context.cpp b/src/Coordination/Standalone/Context.cpp index 374610769c4..c16ecbfd5c3 100644 --- a/src/Coordination/Standalone/Context.cpp +++ b/src/Coordination/Standalone/Context.cpp @@ -77,6 +77,8 @@ struct ContextSharedPart : boost::noncopyable mutable ThrottlerPtr local_read_throttler; /// A server-wide throttler for local IO reads mutable ThrottlerPtr local_write_throttler; /// A server-wide throttler for local IO writes + + std::optional storage_s3_settings TSA_GUARDED_BY(mutex); /// Settings of S3 storage }; ContextData::ContextData() = default; @@ -382,4 +384,17 @@ std::shared_ptr Context::getZooKeeper() const throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Cannot connect to ZooKeeper from Keeper"); } +const StorageS3Settings & Context::getStorageS3Settings() const +{ + std::lock_guard lock(shared->mutex); + + if (!shared->storage_s3_settings) + { + const auto & config = shared->config ? *shared->config : Poco::Util::Application::instance().config(); + shared->storage_s3_settings.emplace().loadFromConfig("s3", config, getSettingsRef()); + } + + return *shared->storage_s3_settings; +} + } diff --git a/src/Coordination/Standalone/Context.h b/src/Coordination/Standalone/Context.h index 49ad2b568fe..3346a865f0f 100644 --- a/src/Coordination/Standalone/Context.h +++ b/src/Coordination/Standalone/Context.h @@ -36,6 +36,7 @@ class FilesystemCacheLog; class FilesystemReadPrefetchesLog; class BlobStorageLog; class IOUringReader; +class StorageS3Settings; /// A small class which owns ContextShared. /// We don't use something like unique_ptr directly to allow ContextShared type to be incomplete. 
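Note on the standalone Context change above: getStorageS3Settings() builds the StorageS3Settings object lazily, on first use, under the shared mutex, so the Keeper build only pays for it when S3 storage is actually touched. A minimal, self-contained sketch of that lazy-initialization pattern follows; the S3LikeSettings/ContextLikeHolder names are illustrative stand-ins, not types from the ClickHouse codebase.

#include <mutex>
#include <optional>
#include <string>

/// Minimal stand-in for a settings object that is expensive to construct.
struct S3LikeSettings
{
    std::string loaded_from;
};

class ContextLikeHolder
{
public:
    /// Construct the settings on first access only, guarded by the shared mutex,
    /// mirroring the std::optional + emplace() approach in the patch above.
    /// The returned reference stays valid because the object is never reset.
    const S3LikeSettings & getSettings(const std::string & config_name) const
    {
        std::lock_guard<std::mutex> lock(mutex);
        if (!settings)
            settings.emplace(S3LikeSettings{config_name});
        return *settings;
    }

private:
    mutable std::mutex mutex;
    mutable std::optional<S3LikeSettings> settings;
};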
@@ -160,6 +161,8 @@ public: void updateKeeperConfiguration(const Poco::Util::AbstractConfiguration & config); zkutil::ZooKeeperPtr getZooKeeper() const; + + const StorageS3Settings & getStorageS3Settings() const; }; } diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index a75a747f334..0869e2ebbd2 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -552,7 +552,7 @@ void S3ObjectStorage::applyNewSettings( static_headers.begin(), static_headers.end()); } - if (auto endpoint_settings = context->getStorageS3Settings().getSettings(uri.uri.toString())) + if (auto endpoint_settings = context->getStorageS3Settings().getSettings(uri.uri.toString(), context->getUserName())) new_s3_settings->auth_settings.updateFrom(endpoint_settings->auth_settings); auto current_s3_settings = s3_settings.get(); diff --git a/src/Storages/ObjectStorage/S3/Configuration.cpp b/src/Storages/ObjectStorage/S3/Configuration.cpp index 896131e74d7..47e7ebd53a6 100644 --- a/src/Storages/ObjectStorage/S3/Configuration.cpp +++ b/src/Storages/ObjectStorage/S3/Configuration.cpp @@ -86,7 +86,7 @@ ObjectStoragePtr StorageS3Configuration::createObjectStorage(ContextPtr context, headers_from_ast.begin(), headers_from_ast.end()); } - if (auto endpoint_settings = context->getStorageS3Settings().getSettings(url.uri.toString())) + if (auto endpoint_settings = context->getStorageS3Settings().getSettings(url.uri.toString(), context->getUserName())) s3_settings->auth_settings.updateFrom(endpoint_settings->auth_settings); auto client = getClient(config, config_prefix, context, *s3_settings, false, &url); From 94c44cefc89fbb471505aedd803600bc8ace7a49 Mon Sep 17 00:00:00 2001 From: kssenii Date: Wed, 21 Feb 2024 16:24:23 +0100 Subject: [PATCH 023/392] Fix clang tidy --- src/Storages/ObjectStorage/AzureBlob/Configuration.cpp | 5 +---- src/Storages/ObjectStorage/HDFS/Configuration.cpp | 4 +--- src/Storages/ObjectStorage/S3/Configuration.cpp | 5 +---- .../ObjectStorage/StorageObjectStorageConfiguration.cpp | 7 +++++++ .../ObjectStorage/StorageObjectStorageConfiguration.h | 1 + 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/Storages/ObjectStorage/AzureBlob/Configuration.cpp b/src/Storages/ObjectStorage/AzureBlob/Configuration.cpp index 9d21541e7e2..7a670441e72 100644 --- a/src/Storages/ObjectStorage/AzureBlob/Configuration.cpp +++ b/src/Storages/ObjectStorage/AzureBlob/Configuration.cpp @@ -78,6 +78,7 @@ void StorageAzureBlobConfiguration::check(ContextPtr context) const } StorageAzureBlobConfiguration::StorageAzureBlobConfiguration(const StorageAzureBlobConfiguration & other) + : StorageObjectStorageConfiguration(other) { connection_url = other.connection_url; is_connection_string = other.is_connection_string; @@ -86,10 +87,6 @@ StorageAzureBlobConfiguration::StorageAzureBlobConfiguration(const StorageAzureB container = other.container; blob_path = other.blob_path; blobs_paths = other.blobs_paths; - - format = other.format; - compression_method = other.compression_method; - structure = other.structure; } AzureObjectStorage::SettingsPtr StorageAzureBlobConfiguration::createSettings(ContextPtr context) diff --git a/src/Storages/ObjectStorage/HDFS/Configuration.cpp b/src/Storages/ObjectStorage/HDFS/Configuration.cpp index 731b05f4621..2f2427edb24 100644 --- a/src/Storages/ObjectStorage/HDFS/Configuration.cpp +++ b/src/Storages/ObjectStorage/HDFS/Configuration.cpp @@ -12,13 +12,11 @@ namespace DB { 
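The storage configuration copy constructors touched in this commit (the AzureBlob one above, the HDFS and S3 ones just below) stop copying format/compression_method/structure by hand and instead delegate to the new base-class copy constructor, which is what the clang-tidy fix amounts to. A reduced sketch of that delegation, with illustrative names and simplified members rather than the real classes:

#include <string>
#include <vector>

struct BaseConfiguration
{
    BaseConfiguration() = default;
    /// Copy only the members shared by every storage configuration.
    BaseConfiguration(const BaseConfiguration & other)
        : format(other.format)
        , compression_method(other.compression_method)
        , structure(other.structure)
    {
    }

    std::string format = "auto";
    std::string compression_method = "auto";
    std::string structure = "auto";
};

struct HDFSLikeConfiguration : BaseConfiguration
{
    HDFSLikeConfiguration() = default;
    /// Delegate the shared part to the base copy constructor and copy
    /// only the storage-specific members here.
    HDFSLikeConfiguration(const HDFSLikeConfiguration & other)
        : BaseConfiguration(other)
        , url(other.url)
        , path(other.path)
        , paths(other.paths)
    {
    }

    std::string url;
    std::string path;
    std::vector<std::string> paths;
};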
StorageHDFSConfiguration::StorageHDFSConfiguration(const StorageHDFSConfiguration & other) + : StorageObjectStorageConfiguration(other) { url = other.url; path = other.path; paths = other.paths; - format = other.format; - compression_method = other.compression_method; - structure = other.structure; } void StorageHDFSConfiguration::check(ContextPtr context) const diff --git a/src/Storages/ObjectStorage/S3/Configuration.cpp b/src/Storages/ObjectStorage/S3/Configuration.cpp index 47e7ebd53a6..1e14ccc4b31 100644 --- a/src/Storages/ObjectStorage/S3/Configuration.cpp +++ b/src/Storages/ObjectStorage/S3/Configuration.cpp @@ -57,15 +57,12 @@ void StorageS3Configuration::check(ContextPtr context) const } StorageS3Configuration::StorageS3Configuration(const StorageS3Configuration & other) + : StorageObjectStorageConfiguration(other) { url = other.url; static_configuration = other.static_configuration; headers_from_ast = other.headers_from_ast; keys = other.keys; - - format = other.format; - compression_method = other.compression_method; - structure = other.structure; } ObjectStoragePtr StorageS3Configuration::createObjectStorage(ContextPtr context, bool /* is_readonly */) /// NOLINT diff --git a/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.cpp b/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.cpp index a1c7d468fa6..8a4dee2c31b 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.cpp @@ -20,6 +20,13 @@ void StorageObjectStorageConfiguration::initialize( configuration.format = FormatFactory::instance().tryGetFormatFromFileName(configuration.getPath()).value_or("auto"); } +StorageObjectStorageConfiguration::StorageObjectStorageConfiguration(const StorageObjectStorageConfiguration & other) +{ + format = other.format; + compression_method = other.compression_method; + structure = other.structure; +} + bool StorageObjectStorageConfiguration::withWildcard() const { static const String PARTITION_ID_WILDCARD = "{_partition_id}"; diff --git a/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.h b/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.h index 2da262eb55d..47afbc5d0c6 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.h @@ -12,6 +12,7 @@ class StorageObjectStorageConfiguration { public: StorageObjectStorageConfiguration() = default; + StorageObjectStorageConfiguration(const StorageObjectStorageConfiguration & other); virtual ~StorageObjectStorageConfiguration() = default; using Path = std::string; From 6b5953859ec7fbd22728426e8110162b57b1b9aa Mon Sep 17 00:00:00 2001 From: Blargian Date: Wed, 21 Feb 2024 17:59:11 +0100 Subject: [PATCH 024/392] Check for deserializeTextEscaped in other serializations, fix review changes --- .../SerializationAggregateFunction.cpp | 4 +- .../SerializationCustomSimpleText.cpp | 2 +- .../Serializations/SerializationEnum.cpp | 2 +- .../Serializations/SerializationObject.cpp | 4 +- .../Serializations/SerializationVariant.cpp | 4 +- src/IO/ReadHelpers.cpp | 42 ++++++++++++++----- src/IO/ReadHelpers.h | 1 - .../Impl/TabSeparatedRowInputFormat.cpp | 14 +++---- 8 files changed, 45 insertions(+), 28 deletions(-) diff --git a/src/DataTypes/Serializations/SerializationAggregateFunction.cpp b/src/DataTypes/Serializations/SerializationAggregateFunction.cpp index c9af5d1f838..28a4fcf86da 100644 --- 
a/src/DataTypes/Serializations/SerializationAggregateFunction.cpp +++ b/src/DataTypes/Serializations/SerializationAggregateFunction.cpp @@ -150,10 +150,10 @@ void SerializationAggregateFunction::serializeTextEscaped(const IColumn & column } -void SerializationAggregateFunction::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +void SerializationAggregateFunction::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { String s; - readEscapedString(s, istr); + settings.tsv.crlf_end_of_line_input ? readEscapedStringCRLF(s, istr) : readEscapedString(s, istr); deserializeFromString(function, column, s, version); } diff --git a/src/DataTypes/Serializations/SerializationCustomSimpleText.cpp b/src/DataTypes/Serializations/SerializationCustomSimpleText.cpp index abe443cab1b..a3b0b088b17 100644 --- a/src/DataTypes/Serializations/SerializationCustomSimpleText.cpp +++ b/src/DataTypes/Serializations/SerializationCustomSimpleText.cpp @@ -75,7 +75,7 @@ void SerializationCustomSimpleText::serializeTextEscaped(const IColumn & column, void SerializationCustomSimpleText::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { String str; - readEscapedString(str, istr); + settings.tsv.crlf_end_of_line_input ? readEscapedStringCRLF(str, istr) : readEscapedString(str, istr); deserializeFromString(*this, column, str, settings); } diff --git a/src/DataTypes/Serializations/SerializationEnum.cpp b/src/DataTypes/Serializations/SerializationEnum.cpp index 14b1a33e2ce..f44ae2fd4f9 100644 --- a/src/DataTypes/Serializations/SerializationEnum.cpp +++ b/src/DataTypes/Serializations/SerializationEnum.cpp @@ -29,7 +29,7 @@ void SerializationEnum::deserializeTextEscaped(IColumn & column, ReadBuffe { /// NOTE It would be nice to do without creating a temporary object - at least extract std::string out. std::string field_name; - readEscapedString(field_name, istr); + settings.tsv.crlf_end_of_line_input ? readEscapedStringCRLF(field_name, istr) : readEscapedString(field_name, istr); assert_cast(column).getData().push_back(ref_enum_values.getValue(StringRef(field_name), true)); } } diff --git a/src/DataTypes/Serializations/SerializationObject.cpp b/src/DataTypes/Serializations/SerializationObject.cpp index e6dc16ef5a0..9d0ff5903b1 100644 --- a/src/DataTypes/Serializations/SerializationObject.cpp +++ b/src/DataTypes/Serializations/SerializationObject.cpp @@ -105,9 +105,9 @@ void SerializationObject::deserializeWholeText(IColumn & column, ReadBuf } template -void SerializationObject::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +void SerializationObject::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { - deserializeTextImpl(column, [&](String & s) { readEscapedString(s, istr); }); + deserializeTextImpl(column, [&](String & s) { settings.tsv.crlf_end_of_line_input ? 
readEscapedStringCRLF(s, istr) : readEscapedString(s, istr); }); } template diff --git a/src/DataTypes/Serializations/SerializationVariant.cpp b/src/DataTypes/Serializations/SerializationVariant.cpp index 5af94364167..a4e77b9c75f 100644 --- a/src/DataTypes/Serializations/SerializationVariant.cpp +++ b/src/DataTypes/Serializations/SerializationVariant.cpp @@ -604,14 +604,14 @@ void SerializationVariant::serializeTextEscaped(const IColumn & column, size_t r bool SerializationVariant::tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { String field; - readEscapedString(field, istr); + settings.tsv.crlf_end_of_line_input ? readEscapedStringCRLF(field, istr) : readEscapedString(field, istr); return tryDeserializeTextEscapedImpl(column, field, settings); } void SerializationVariant::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { String field; - readEscapedString(field, istr); + settings.tsv.crlf_end_of_line_input ? readEscapedStringCRLF(field, istr) : readEscapedString(field, istr); if (!tryDeserializeTextEscapedImpl(column, field, settings)) throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse escaped value of type {} here: {}", variant_name, field); } diff --git a/src/IO/ReadHelpers.cpp b/src/IO/ReadHelpers.cpp index 53a7229e7d5..e763d627f40 100644 --- a/src/IO/ReadHelpers.cpp +++ b/src/IO/ReadHelpers.cpp @@ -537,8 +537,19 @@ void readEscapedStringIntoImpl(Vector & s, ReadBuffer & buf) } } - if (*buf.position() == '\r') - ++buf.position(); + if constexpr (support_crlf) + { + if (*buf.position() == '\r') + { + ++buf.position(); + if (!buf.eof() && *buf.position() != '\n') + { + s.push_back('\r'); + continue; + } + return; + } + } } } @@ -555,11 +566,10 @@ void readEscapedString(String & s, ReadBuffer & buf) readEscapedStringInto(s, buf); } -template void readEscapedStringCRLF(String & s, ReadBuffer & buf) { s.clear(); - readEscapedStringInto(s, buf); + readEscapedStringInto(s, buf); } template void readEscapedStringInto,false>(PaddedPODArray & s, ReadBuffer & buf); @@ -567,9 +577,6 @@ template void readEscapedStringInto(NullOutput & s, ReadBuffer template void readEscapedStringInto,true>(PaddedPODArray & s, ReadBuffer & buf); template void readEscapedStringInto(NullOutput & s, ReadBuffer & buf); -template void readEscapedStringCRLF(String & s, ReadBuffer & buf); -template void readEscapedStringCRLF(String & s, ReadBuffer & buf); - /** If enable_sql_style_quoting == true, * strings like 'abc''def' will be parsed as abc'def. 
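The support_crlf branch added to readEscapedStringIntoImpl above stops a field at '\r' only when the next character is '\n'; a lone '\r' is kept as part of the value. A stand-alone sketch of that decision using std::string_view instead of the ReadBuffer API (the function name and types here are illustrative only):

#include <string>
#include <string_view>

/// Consume one escaped TSV field from `in`, honouring CRLF row endings.
/// A '\r' that is not followed by '\n' is treated as field data, mirroring
/// the behaviour of the support_crlf branch above.
std::string readFieldCRLF(std::string_view & in)
{
    std::string field;
    while (!in.empty())
    {
        const char c = in.front();
        if (c == '\t' || c == '\n')
            break;                          /// field or row terminator, leave it in the input
        if (c == '\r')
        {
            in.remove_prefix(1);            /// tentatively consume the '\r'
            if (!in.empty() && in.front() != '\n')
            {
                field.push_back('\r');      /// bare '\r' stays part of the value
                continue;
            }
            break;                          /// "\r\n" or trailing '\r': stop, '\n' is left for the row-end check
        }
        field.push_back(c);
        in.remove_prefix(1);
    }
    return field;
}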
* Please note, that even with SQL style quoting enabled, @@ -1975,13 +1982,26 @@ bool tryReadJSONField(String & s, ReadBuffer & buf) return readParsedValueInto(s, buf, parse_func); } -template +template +void readTSVFieldImpl(String & s, ReadBuffer & buf) +{ + if constexpr (supports_crlf) + readEscapedStringIntoImpl(s, buf); + else + readEscapedStringIntoImpl(s, buf); +} + void readTSVField(String & s, ReadBuffer & buf) { s.clear(); - readEscapedStringIntoImpl(s, buf); + readTSVFieldImpl(s, buf); } -template void readTSVField(String & s, ReadBuffer & buf); -template void readTSVField(String & s, ReadBuffer & buf); +void readTSVFieldCRLF(String & s, ReadBuffer & buf) +{ + s.clear(); + readTSVFieldImpl(s, buf); +} + + } diff --git a/src/IO/ReadHelpers.h b/src/IO/ReadHelpers.h index 79014666ce1..3a20d2480b8 100644 --- a/src/IO/ReadHelpers.h +++ b/src/IO/ReadHelpers.h @@ -581,7 +581,6 @@ void readString(String & s, ReadBuffer & buf); void readEscapedString(String & s, ReadBuffer & buf); -template void readEscapedStringCRLF(String & s, ReadBuffer & buf); void readQuotedString(String & s, ReadBuffer & buf); diff --git a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp index dbd939effe1..c92cd1c39a0 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp @@ -24,17 +24,14 @@ namespace ErrorCodes /** Check for a common error case - usage of Windows line feed. */ -template static void checkForCarriageReturn(ReadBuffer & in) { - bool crlf_escaped = false; - if constexpr (supports_crlf) - crlf_escaped = true; - if (!in.eof() && (in.position()[0] == '\r' || (crlf_escaped ? false : (in.position() != in.buffer().begin() && in.position()[-1] == '\r')))) + if (!in.eof() && (in.position()[0] == '\r' || (in.position() != in.buffer().begin() && in.position()[-1] == '\r'))) throw Exception(ErrorCodes::INCORRECT_DATA, "\nYou have carriage return (\\r, 0x0D, ASCII 13) at end of first row." "\nIt's like your input data has DOS/Windows style line separators, that are illegal in TabSeparated format." " You must transform your file to Unix format." - "\nBut if you really need carriage return at end of string value of last column, you need to escape it as \\r."); + "\nBut if you really need carriage return at end of string value of last column, you need to escape it as \\r" + "\nor else enable setting 'input_format_tsv_crlf_end_of_line'"); } TabSeparatedRowInputFormat::TabSeparatedRowInputFormat( @@ -104,7 +101,8 @@ void TabSeparatedFormatReader::skipRowEndDelimiter() } if (unlikely(first_row)) { - supports_crlf ? checkForCarriageReturn(*buf) : checkForCarriageReturn(*buf); + if (!supports_crlf) + checkForCarriageReturn(*buf); first_row = false; } assertChar('\n', *buf); @@ -120,7 +118,7 @@ String TabSeparatedFormatReader::readFieldIntoString() else { if constexpr (read_string) - support_crlf ? readEscapedStringCRLF(field, *buf) : readEscapedStringCRLF(field, *buf); + support_crlf ? readEscapedStringCRLF(field, *buf) : readEscapedString(field, *buf); else support_crlf ? 
readTSVField(field, *buf) : readTSVField(field, *buf); } From e60ead336ab14f0dfaba1a34022d0b9e0bbf82c2 Mon Sep 17 00:00:00 2001 From: Blargian Date: Wed, 21 Feb 2024 18:14:22 +0100 Subject: [PATCH 025/392] remove readEscapedStringInto function --- src/IO/ReadHelpers.cpp | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/src/IO/ReadHelpers.cpp b/src/IO/ReadHelpers.cpp index e763d627f40..dec8a14fae7 100644 --- a/src/IO/ReadHelpers.cpp +++ b/src/IO/ReadHelpers.cpp @@ -1982,25 +1982,16 @@ bool tryReadJSONField(String & s, ReadBuffer & buf) return readParsedValueInto(s, buf, parse_func); } -template -void readTSVFieldImpl(String & s, ReadBuffer & buf) -{ - if constexpr (supports_crlf) - readEscapedStringIntoImpl(s, buf); - else - readEscapedStringIntoImpl(s, buf); -} - void readTSVField(String & s, ReadBuffer & buf) { s.clear(); - readTSVFieldImpl(s, buf); + readEscapedStringIntoImpl(s, buf); } void readTSVFieldCRLF(String & s, ReadBuffer & buf) { s.clear(); - readTSVFieldImpl(s, buf); + readEscapedStringIntoImpl(s, buf); } From e46c3c63fae5aa1d6ae17b53aa03e5b07ba5220b Mon Sep 17 00:00:00 2001 From: Blargian Date: Wed, 21 Feb 2024 18:24:58 +0100 Subject: [PATCH 026/392] check for return in skipRowEndDelimiter --- .../Formats/Impl/TabSeparatedRowInputFormat.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp index c92cd1c39a0..93982526ddc 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp @@ -92,19 +92,20 @@ void TabSeparatedFormatReader::skipFieldDelimiter() void TabSeparatedFormatReader::skipRowEndDelimiter() { - bool supports_crlf = format_settings.tsv.crlf_end_of_line_input; if (buf->eof()) return; - if (supports_crlf && first_row==false) + + if (format_settings.tsv.crlf_end_of_line_input) { - ++buf->position(); + if (*buf->position() == '\r') + ++buf->position(); } - if (unlikely(first_row)) + else if (unlikely(first_row)) { - if (!supports_crlf) - checkForCarriageReturn(*buf); + checkForCarriageReturn(*buf); first_row = false; } + assertChar('\n', *buf); } From 4e3f2aae408fc8559304fe4f7c4a21db3d9202a6 Mon Sep 17 00:00:00 2001 From: kssenii Date: Wed, 21 Feb 2024 18:47:17 +0100 Subject: [PATCH 027/392] Fix keeper build --- src/Coordination/Standalone/Context.cpp | 1 + src/Coordination/Standalone/Context.h | 2 ++ 2 files changed, 3 insertions(+) diff --git a/src/Coordination/Standalone/Context.cpp b/src/Coordination/Standalone/Context.cpp index c16ecbfd5c3..7e8711c7910 100644 --- a/src/Coordination/Standalone/Context.cpp +++ b/src/Coordination/Standalone/Context.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include diff --git a/src/Coordination/Standalone/Context.h b/src/Coordination/Standalone/Context.h index 3346a865f0f..943fcd106df 100644 --- a/src/Coordination/Standalone/Context.h +++ b/src/Coordination/Standalone/Context.h @@ -163,6 +163,8 @@ public: zkutil::ZooKeeperPtr getZooKeeper() const; const StorageS3Settings & getStorageS3Settings() const; + + const String & getUserName() const { static std::string user; return user; } }; } From 5f06c72bfc86b20e1ed50a255a121b6a334fa229 Mon Sep 17 00:00:00 2001 From: Blargian Date: Wed, 21 Feb 2024 20:36:10 +0100 Subject: [PATCH 028/392] check for return in deserializeTextEscaped implementations for SerializeBool and SerializeNullable --- 
.../Serializations/SerializationBool.cpp | 7 +++- .../Serializations/SerializationNullable.cpp | 40 ++++++++++++++----- 2 files changed, 36 insertions(+), 11 deletions(-) diff --git a/src/DataTypes/Serializations/SerializationBool.cpp b/src/DataTypes/Serializations/SerializationBool.cpp index f745fac4d30..0cf9cb8be49 100644 --- a/src/DataTypes/Serializations/SerializationBool.cpp +++ b/src/DataTypes/Serializations/SerializationBool.cpp @@ -242,8 +242,11 @@ void SerializationBool::deserializeTextEscaped(IColumn & column, ReadBuffer & is { if (istr.eof()) throw Exception(ErrorCodes::CANNOT_PARSE_BOOL, "Expected boolean value but get EOF."); - - deserializeImpl(column, istr, settings, [](ReadBuffer & buf){ return buf.eof() || *buf.position() == '\t' || *buf.position() == '\n'; }); + + if (settings.tsv.crlf_end_of_line_input) + deserializeImpl(column, istr, settings, [](ReadBuffer & buf){ return buf.eof() || *buf.position() == '\t' || *buf.position() == '\n' || *buf.position() == '\r'; }); + else + deserializeImpl(column, istr, settings, [](ReadBuffer & buf){ return buf.eof() || *buf.position() == '\t' || *buf.position() == '\n'; }); } bool SerializationBool::tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const diff --git a/src/DataTypes/Serializations/SerializationNullable.cpp b/src/DataTypes/Serializations/SerializationNullable.cpp index 4d31451f92d..9e78b1285db 100644 --- a/src/DataTypes/Serializations/SerializationNullable.cpp +++ b/src/DataTypes/Serializations/SerializationNullable.cpp @@ -286,7 +286,7 @@ bool SerializationNullable::tryDeserializeNullRaw(DB::ReadBuffer & istr, const D } template -ReturnType deserializeTextEscapedAndRawImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested_serialization, bool & is_null) +ReturnType deserializeTextEscapedAndRawImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested_serialization, bool & is_null) { static constexpr bool throw_exception = std::is_same_v; @@ -319,13 +319,23 @@ ReturnType deserializeTextEscapedAndRawImpl(IColumn & column, ReadBuffer & istr, /// Check if we have enough data in buffer to check if it's a null. if (istr.available() > null_representation.size()) { - auto check_for_null = [&null_representation](ReadBuffer & buf) + auto check_for_null = [&null_representation, settings](ReadBuffer & buf) { auto * pos = buf.position(); - if (checkString(null_representation, buf) && (*buf.position() == '\t' || *buf.position() == '\n')) - return true; - buf.position() = pos; - return false; + if (settings.tsv.crlf_end_of_line_input) + { + if (checkString(null_representation, buf) && (*buf.position() == '\t' || *buf.position() == '\n'|| *buf.position() == '\r')) + return true; + buf.position() = pos; + return false; + } + else + { + if (checkString(null_representation, buf) && (*buf.position() == '\t' || *buf.position() == '\n')) + return true; + buf.position() = pos; + return false; + } }; return deserializeImpl(column, istr, check_for_null, deserialize_nested, is_null); } @@ -334,14 +344,22 @@ ReturnType deserializeTextEscapedAndRawImpl(IColumn & column, ReadBuffer & istr, /// Use PeekableReadBuffer to make a checkpoint before checking null /// representation and rollback if check was failed. 
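The check_for_null lambdas above all follow the same try-match-then-restore idea: remember the position, attempt to read the null representation, accept it only if the next character is a field/row terminator ('\t', '\n', or '\r' when input_format_tsv_crlf_end_of_line is enabled), and roll back otherwise. A simplified stand-alone illustration with std::string_view in place of ReadBuffer/PeekableReadBuffer (illustrative name, not the real API):

#include <string_view>

/// Returns true and advances `in` past `null_repr` when the input starts with the
/// null representation followed by a terminator; otherwise leaves `in` untouched.
/// `allow_cr` models the new input_format_tsv_crlf_end_of_line behaviour.
bool checkForNullToken(std::string_view & in, std::string_view null_repr, bool allow_cr)
{
    const std::string_view saved = in;      /// "checkpoint"
    if (in.substr(0, null_repr.size()) != null_repr)
        return false;
    in.remove_prefix(null_repr.size());
    if (in.empty() || in.front() == '\t' || in.front() == '\n'
        || (allow_cr && in.front() == '\r'))
        return true;
    in = saved;                             /// "rollback to checkpoint"
    return false;
}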
PeekableReadBuffer peekable_buf(istr, true); - auto check_for_null = [&null_representation](ReadBuffer & buf_) + auto check_for_null = [&null_representation, settings](ReadBuffer & buf_) { auto & buf = assert_cast(buf_); buf.setCheckpoint(); SCOPE_EXIT(buf.dropCheckpoint()); - if (checkString(null_representation, buf) && (buf.eof() || *buf.position() == '\t' || *buf.position() == '\n')) - return true; + if (settings.tsv.crlf_end_of_line_input) + { + if (checkString(null_representation, buf) && (buf.eof() || *buf.position() == '\t' || *buf.position() == '\n' || *buf.position() == '\r')) + return true; + } + else + { + if (checkString(null_representation, buf) && (buf.eof() || *buf.position() == '\t' || *buf.position() == '\n')) + return true; + } buf.rollbackToCheckpoint(); return false; }; @@ -372,6 +390,10 @@ ReturnType deserializeTextEscapedAndRawImpl(IColumn & column, ReadBuffer & istr, if (null_representation.find('\t') != std::string::npos || null_representation.find('\n') != std::string::npos) throw DB::Exception(ErrorCodes::CANNOT_READ_ALL_DATA, "TSV custom null representation " "containing '\\t' or '\\n' may not work correctly for large input."); + + if (settings.tsv.crlf_end_of_line_input && null_representation.find('\r') != std::string::npos) + throw DB::Exception(ErrorCodes::CANNOT_READ_ALL_DATA, "TSV custom null representation " + "containing '\\r' may not work correctly for large input."); WriteBufferFromOwnString parsed_value; if constexpr (escaped) From 80eb0c37826de63d9e2b595c62c37abbbb9c16ab Mon Sep 17 00:00:00 2001 From: kssenii Date: Wed, 21 Feb 2024 20:47:25 +0100 Subject: [PATCH 029/392] Fix for hdfs --- .../ObjectStorages/HDFS/HDFSObjectStorage.cpp | 28 +++++++++++++------ src/Storages/HDFS/WriteBufferFromHDFS.cpp | 7 +++-- .../ObjectStorage/HDFS/Configuration.cpp | 14 +++++++--- .../ObjectStorage/ReadBufferIterator.cpp | 12 ++++---- 4 files changed, 39 insertions(+), 22 deletions(-) diff --git a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp index fa5e227d853..360403b7f2d 100644 --- a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp +++ b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp @@ -36,10 +36,10 @@ ObjectStorageKey HDFSObjectStorage::generateObjectKeyForPath(const std::string & bool HDFSObjectStorage::exists(const StoredObject & object) const { - const auto & path = object.remote_path; - const size_t begin_of_path = path.find('/', path.find("//") + 2); - const String remote_fs_object_path = path.substr(begin_of_path); - return (0 == hdfsExists(hdfs_fs.get(), remote_fs_object_path.c_str())); + // const auto & path = object.remote_path; + // const size_t begin_of_path = path.find('/', path.find("//") + 2); + // const String remote_fs_object_path = path.substr(begin_of_path); + return (0 == hdfsExists(hdfs_fs.get(), object.remote_path.c_str())); } std::unique_ptr HDFSObjectStorage::readObject( /// NOLINT @@ -86,9 +86,12 @@ std::unique_ptr HDFSObjectStorage::writeObject( /// NOL ErrorCodes::UNSUPPORTED_METHOD, "HDFS API doesn't support custom attributes/metadata for stored objects"); + auto path = object.remote_path.starts_with('/') ? object.remote_path.substr(1) : object.remote_path; + path = fs::path(hdfs_root_path) / path; + /// Single O_WRONLY in libhdfs adds O_TRUNC return std::make_unique( - object.remote_path, config, settings->replication, patchSettings(write_settings), buf_size, + path, config, settings->replication, patchSettings(write_settings), buf_size, mode == WriteMode::Rewrite ? 
O_WRONLY : O_WRONLY | O_APPEND); } @@ -124,11 +127,18 @@ void HDFSObjectStorage::removeObjectsIfExist(const StoredObjects & objects) removeObjectIfExists(object); } -ObjectMetadata HDFSObjectStorage::getObjectMetadata(const std::string &) const +ObjectMetadata HDFSObjectStorage::getObjectMetadata(const std::string & path) const { - throw Exception( - ErrorCodes::UNSUPPORTED_METHOD, - "HDFS API doesn't support custom attributes/metadata for stored objects"); + auto * file_info = hdfsGetPathInfo(hdfs_fs.get(), path.data()); + if (!file_info) + throw Exception(ErrorCodes::HDFS_ERROR, "Cannot get file info for: {}. Error: {}", path, hdfsGetLastError()); + + ObjectMetadata metadata; + metadata.size_bytes = static_cast(file_info->mSize); + metadata.last_modified = file_info->mLastMod; + + hdfsFreeFileInfo(file_info, 1); + return metadata; } void HDFSObjectStorage::copyObject( /// NOLINT diff --git a/src/Storages/HDFS/WriteBufferFromHDFS.cpp b/src/Storages/HDFS/WriteBufferFromHDFS.cpp index 173dd899ada..9d383aa8245 100644 --- a/src/Storages/HDFS/WriteBufferFromHDFS.cpp +++ b/src/Storages/HDFS/WriteBufferFromHDFS.cpp @@ -48,12 +48,13 @@ struct WriteBufferFromHDFS::WriteBufferFromHDFSImpl const size_t begin_of_path = hdfs_uri.find('/', hdfs_uri.find("//") + 2); const String path = hdfs_uri.substr(begin_of_path); - fout = hdfsOpenFile(fs.get(), path.c_str(), flags, 0, replication_, 0); /// O_WRONLY meaning create or overwrite i.e., implies O_TRUNCAT here + /// O_WRONLY meaning create or overwrite i.e., implies O_TRUNCAT here + fout = hdfsOpenFile(fs.get(), path.c_str(), flags, 0, replication_, 0); if (fout == nullptr) { - throw Exception(ErrorCodes::CANNOT_OPEN_FILE, "Unable to open HDFS file: {} error: {}", - path, std::string(hdfsGetLastError())); + throw Exception(ErrorCodes::CANNOT_OPEN_FILE, "Unable to open HDFS file: {} ({}) error: {}", + path, hdfs_uri, std::string(hdfsGetLastError())); } } diff --git a/src/Storages/ObjectStorage/HDFS/Configuration.cpp b/src/Storages/ObjectStorage/HDFS/Configuration.cpp index 2f2427edb24..a64faafd53d 100644 --- a/src/Storages/ObjectStorage/HDFS/Configuration.cpp +++ b/src/Storages/ObjectStorage/HDFS/Configuration.cpp @@ -22,13 +22,14 @@ StorageHDFSConfiguration::StorageHDFSConfiguration(const StorageHDFSConfiguratio void StorageHDFSConfiguration::check(ContextPtr context) const { context->getRemoteHostFilter().checkURL(Poco::URI(url)); - checkHDFSURL(url); + checkHDFSURL(fs::path(url) / path); } ObjectStoragePtr StorageHDFSConfiguration::createObjectStorage(ContextPtr context, bool is_readonly) /// NOLINT { UNUSED(is_readonly); auto settings = std::make_unique(); + chassert(!url.empty()); return std::make_shared(url, std::move(settings), context->getConfigRef()); } @@ -36,15 +37,20 @@ void StorageHDFSConfiguration::fromAST(ASTs & args, ContextPtr, bool /* with_str { url = checkAndGetLiteralArgument(args[0], "url"); - String format_name = "auto"; if (args.size() > 1) - format_name = checkAndGetLiteralArgument(args[1], "format_name"); + format = checkAndGetLiteralArgument(args[1], "format_name"); + else + format = "auto"; - String compression_method; if (args.size() == 3) compression_method = checkAndGetLiteralArgument(args[2], "compression_method"); else compression_method = "auto"; + + const size_t begin_of_path = url.find('/', url.find("//") + 2); + path = url.substr(begin_of_path + 1); + url = url.substr(0, begin_of_path); + paths = {path}; } } diff --git a/src/Storages/ObjectStorage/ReadBufferIterator.cpp b/src/Storages/ObjectStorage/ReadBufferIterator.cpp 
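StorageHDFSConfiguration::fromAST above now splits the engine argument into the filesystem URL and the in-filesystem path at the first '/' after the scheme's "//". A tiny sketch of that split, assuming the argument always contains a path component (the function name below is illustrative):

#include <string>
#include <utility>

/// Split "hdfs://namenode:9000/dir/file.tsv" into
/// {"hdfs://namenode:9000", "dir/file.tsv"}, as fromAST does above.
/// Assumes the URL has the form scheme://host[:port]/path.
std::pair<std::string, std::string> splitHDFSUrl(const std::string & full_url)
{
    const size_t begin_of_path = full_url.find('/', full_url.find("//") + 2);
    return {full_url.substr(0, begin_of_path), full_url.substr(begin_of_path + 1)};
}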
index a0e719878ac..dd4bfe79b06 100644 --- a/src/Storages/ObjectStorage/ReadBufferIterator.cpp +++ b/src/Storages/ObjectStorage/ReadBufferIterator.cpp @@ -195,19 +195,19 @@ ReadBufferIterator::Data ReadBufferIterator::next() throw Exception( ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "The table structure cannot be extracted from a {} format file, because there are no files with provided path " - "in S3 or all files are empty. You can specify table structure manually", - *format); + "in {} or all files are empty. You can specify table structure manually", + *format, object_storage->getName()); throw Exception( ErrorCodes::CANNOT_DETECT_FORMAT, "The data format cannot be detected by the contents of the files, because there are no files with provided path " - "in S3 or all files are empty. You can specify the format manually"); + "in {} or all files are empty. You can specify the format manually", object_storage->getName()); } return {nullptr, std::nullopt, format}; } - /// S3 file iterator could get new keys after new iteration + /// file iterator could get new keys after new iteration if (read_keys.size() > prev_read_keys_size) { /// If format is unknown we can try to determine it by new file names. @@ -234,7 +234,7 @@ ReadBufferIterator::Data ReadBufferIterator::next() prev_read_keys_size = read_keys.size(); } - if (getContext()->getSettingsRef().s3_skip_empty_files + if (query_settings.skip_empty_files && current_object_info->metadata && current_object_info->metadata->size_bytes == 0) continue; @@ -255,7 +255,7 @@ ReadBufferIterator::Data ReadBufferIterator::next() {}, current_object_info->metadata->size_bytes); - if (!getContext()->getSettingsRef().s3_skip_empty_files || !read_buffer->eof()) + if (!query_settings.skip_empty_files || !read_buffer->eof()) { first = false; From f23ddec69f51481b8a7c3b923ae5e9dbb3891b41 Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 22 Feb 2024 11:50:36 +0100 Subject: [PATCH 030/392] Fix unit tests build --- src/IO/tests/gtest_writebuffer_s3.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/IO/tests/gtest_writebuffer_s3.cpp b/src/IO/tests/gtest_writebuffer_s3.cpp index ae00bb2e9e2..7856f22ab1a 100644 --- a/src/IO/tests/gtest_writebuffer_s3.cpp +++ b/src/IO/tests/gtest_writebuffer_s3.cpp @@ -546,7 +546,7 @@ public: std::unique_ptr getWriteBuffer(String file_name = "file") { S3Settings::RequestSettings request_settings; - request_settings.updateFromSettings(settings); + request_settings.updateFromSettingsIfChanged(settings); client->resetCounters(); From 26a2fcf65a1702f71cc8cb6167d5622d55c00ae6 Mon Sep 17 00:00:00 2001 From: Blargian Date: Thu, 22 Feb 2024 12:46:13 +0100 Subject: [PATCH 031/392] Fix style-check --- src/DataTypes/Serializations/SerializationBool.cpp | 1 - src/DataTypes/Serializations/SerializationNullable.cpp | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/DataTypes/Serializations/SerializationBool.cpp b/src/DataTypes/Serializations/SerializationBool.cpp index 0cf9cb8be49..94bc724fd5d 100644 --- a/src/DataTypes/Serializations/SerializationBool.cpp +++ b/src/DataTypes/Serializations/SerializationBool.cpp @@ -242,7 +242,6 @@ void SerializationBool::deserializeTextEscaped(IColumn & column, ReadBuffer & is { if (istr.eof()) throw Exception(ErrorCodes::CANNOT_PARSE_BOOL, "Expected boolean value but get EOF."); - if (settings.tsv.crlf_end_of_line_input) deserializeImpl(column, istr, settings, [](ReadBuffer & buf){ return buf.eof() || *buf.position() == '\t' || *buf.position() == '\n' || 
*buf.position() == '\r'; }); else diff --git a/src/DataTypes/Serializations/SerializationNullable.cpp b/src/DataTypes/Serializations/SerializationNullable.cpp index 9e78b1285db..bb6adf77b32 100644 --- a/src/DataTypes/Serializations/SerializationNullable.cpp +++ b/src/DataTypes/Serializations/SerializationNullable.cpp @@ -389,8 +389,7 @@ ReturnType deserializeTextEscapedAndRawImpl(IColumn & column, ReadBuffer & istr if (null_representation.find('\t') != std::string::npos || null_representation.find('\n') != std::string::npos) throw DB::Exception(ErrorCodes::CANNOT_READ_ALL_DATA, "TSV custom null representation " - "containing '\\t' or '\\n' may not work correctly for large input."); - + "containing '\\t' or '\\n' may not work correctly for large input."); if (settings.tsv.crlf_end_of_line_input && null_representation.find('\r') != std::string::npos) throw DB::Exception(ErrorCodes::CANNOT_READ_ALL_DATA, "TSV custom null representation " "containing '\\r' may not work correctly for large input."); From 7f452aa830501ec4d800866b69fab7d158d4896c Mon Sep 17 00:00:00 2001 From: Shaun Struwig <41984034+Blargian@users.noreply.github.com> Date: Thu, 22 Feb 2024 13:41:03 +0100 Subject: [PATCH 032/392] Update SerializationNullable.cpp Fix style-check --- src/DataTypes/Serializations/SerializationNullable.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/DataTypes/Serializations/SerializationNullable.cpp b/src/DataTypes/Serializations/SerializationNullable.cpp index bb6adf77b32..566221e2371 100644 --- a/src/DataTypes/Serializations/SerializationNullable.cpp +++ b/src/DataTypes/Serializations/SerializationNullable.cpp @@ -389,11 +389,11 @@ ReturnType deserializeTextEscapedAndRawImpl(IColumn & column, ReadBuffer & istr if (null_representation.find('\t') != std::string::npos || null_representation.find('\n') != std::string::npos) throw DB::Exception(ErrorCodes::CANNOT_READ_ALL_DATA, "TSV custom null representation " - "containing '\\t' or '\\n' may not work correctly for large input."); + "containing '\\t' or '\\n' may not work correctly for large input."); if (settings.tsv.crlf_end_of_line_input && null_representation.find('\r') != std::string::npos) throw DB::Exception(ErrorCodes::CANNOT_READ_ALL_DATA, "TSV custom null representation " - "containing '\\r' may not work correctly for large input."); - + "containing '\\r' may not work correctly for large input."); + WriteBufferFromOwnString parsed_value; if constexpr (escaped) nested_serialization->serializeTextEscaped(nested_column, nested_column.size() - 1, parsed_value, settings); From b548ed976d11309f8fb3b643ab71d9fd7d26ab31 Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 22 Feb 2024 14:45:29 +0100 Subject: [PATCH 033/392] Fxi --- src/Storages/ObjectStorage/StorageObjectStorageCluster.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/ObjectStorage/StorageObjectStorageCluster.h b/src/Storages/ObjectStorage/StorageObjectStorageCluster.h index 5d77d4ced60..d7940851b00 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageCluster.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageCluster.h @@ -54,7 +54,7 @@ private: const StorageSnapshotPtr & storage_snapshot, const ContextPtr & context) override; - const String & engine_name; + const String engine_name; const Storage::ConfigurationPtr configuration; const ObjectStoragePtr object_storage; NamesAndTypesList virtual_columns; From e78ab3e06377502068830bfe27f69777f3497cdd Mon Sep 17 00:00:00 2001 From: Shaun Struwig 
<41984034+Blargian@users.noreply.github.com> Date: Tue, 27 Feb 2024 20:20:54 +0100 Subject: [PATCH 034/392] Update src/DataTypes/Serializations/SerializationNullable.cpp Co-authored-by: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> --- .../Serializations/SerializationNullable.cpp | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/src/DataTypes/Serializations/SerializationNullable.cpp b/src/DataTypes/Serializations/SerializationNullable.cpp index 566221e2371..5aca15e46f0 100644 --- a/src/DataTypes/Serializations/SerializationNullable.cpp +++ b/src/DataTypes/Serializations/SerializationNullable.cpp @@ -350,16 +350,8 @@ ReturnType deserializeTextEscapedAndRawImpl(IColumn & column, ReadBuffer & istr buf.setCheckpoint(); SCOPE_EXIT(buf.dropCheckpoint()); - if (settings.tsv.crlf_end_of_line_input) - { - if (checkString(null_representation, buf) && (buf.eof() || *buf.position() == '\t' || *buf.position() == '\n' || *buf.position() == '\r')) - return true; - } - else - { - if (checkString(null_representation, buf) && (buf.eof() || *buf.position() == '\t' || *buf.position() == '\n')) - return true; - } + if (checkString(null_representation, buf) && (buf.eof() || *buf.position() == '\t' || *buf.position() == '\n' || (settings.tsv.crlf_end_of_line_input && *buf.position() == '\r'))) + return true; buf.rollbackToCheckpoint(); return false; }; From a458797015eb7d136edf878ac9464c8e6ffdad75 Mon Sep 17 00:00:00 2001 From: Shaun Struwig <41984034+Blargian@users.noreply.github.com> Date: Tue, 27 Feb 2024 20:21:13 +0100 Subject: [PATCH 035/392] Update src/DataTypes/Serializations/SerializationNullable.cpp Co-authored-by: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> --- src/DataTypes/Serializations/SerializationNullable.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/DataTypes/Serializations/SerializationNullable.cpp b/src/DataTypes/Serializations/SerializationNullable.cpp index 5aca15e46f0..e9acab7a2a3 100644 --- a/src/DataTypes/Serializations/SerializationNullable.cpp +++ b/src/DataTypes/Serializations/SerializationNullable.cpp @@ -344,7 +344,7 @@ ReturnType deserializeTextEscapedAndRawImpl(IColumn & column, ReadBuffer & istr /// Use PeekableReadBuffer to make a checkpoint before checking null /// representation and rollback if check was failed. PeekableReadBuffer peekable_buf(istr, true); - auto check_for_null = [&null_representation, settings](ReadBuffer & buf_) + auto check_for_null = [&null_representation, &settings](ReadBuffer & buf_) { auto & buf = assert_cast(buf_); buf.setCheckpoint(); From 03d0dd661feaf19d62a6969fc8d895200e410b38 Mon Sep 17 00:00:00 2001 From: Shaun Struwig <41984034+Blargian@users.noreply.github.com> Date: Tue, 27 Feb 2024 20:21:19 +0100 Subject: [PATCH 036/392] Update src/DataTypes/Serializations/SerializationNullable.cpp Co-authored-by: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> --- src/DataTypes/Serializations/SerializationNullable.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/DataTypes/Serializations/SerializationNullable.cpp b/src/DataTypes/Serializations/SerializationNullable.cpp index e9acab7a2a3..aef0a814f24 100644 --- a/src/DataTypes/Serializations/SerializationNullable.cpp +++ b/src/DataTypes/Serializations/SerializationNullable.cpp @@ -319,7 +319,7 @@ ReturnType deserializeTextEscapedAndRawImpl(IColumn & column, ReadBuffer & istr /// Check if we have enough data in buffer to check if it's a null. 
if (istr.available() > null_representation.size()) { - auto check_for_null = [&null_representation, settings](ReadBuffer & buf) + auto check_for_null = [&null_representation, &settings](ReadBuffer & buf) { auto * pos = buf.position(); if (settings.tsv.crlf_end_of_line_input) From c83179bc70b5363a839d71d6f34af54807ad1d82 Mon Sep 17 00:00:00 2001 From: Shaun Struwig <41984034+Blargian@users.noreply.github.com> Date: Tue, 27 Feb 2024 20:21:28 +0100 Subject: [PATCH 037/392] Update src/DataTypes/Serializations/SerializationNullable.cpp Co-authored-by: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> --- .../Serializations/SerializationNullable.cpp | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/src/DataTypes/Serializations/SerializationNullable.cpp b/src/DataTypes/Serializations/SerializationNullable.cpp index aef0a814f24..c796c147f1f 100644 --- a/src/DataTypes/Serializations/SerializationNullable.cpp +++ b/src/DataTypes/Serializations/SerializationNullable.cpp @@ -322,20 +322,10 @@ ReturnType deserializeTextEscapedAndRawImpl(IColumn & column, ReadBuffer & istr auto check_for_null = [&null_representation, &settings](ReadBuffer & buf) { auto * pos = buf.position(); - if (settings.tsv.crlf_end_of_line_input) - { - if (checkString(null_representation, buf) && (*buf.position() == '\t' || *buf.position() == '\n'|| *buf.position() == '\r')) - return true; - buf.position() = pos; - return false; - } - else - { - if (checkString(null_representation, buf) && (*buf.position() == '\t' || *buf.position() == '\n')) - return true; - buf.position() = pos; - return false; - } + if (checkString(null_representation, buf) && (*buf.position() == '\t' || *buf.position() == '\n' || (settings.tsv.crlf_end_of_line_input && *buf.position() == '\r'))) + return true; + buf.position() = pos; + return false; }; return deserializeImpl(column, istr, check_for_null, deserialize_nested, is_null); } From 230cc512f86ede5e989a8a41a8abaaf15cfaebdd Mon Sep 17 00:00:00 2001 From: Blargian Date: Tue, 27 Feb 2024 20:40:55 +0100 Subject: [PATCH 038/392] Handle CRLF in TabSeparatedRowInputFormat --- .../Formats/Impl/TabSeparatedRowInputFormat.cpp | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp index 93982526ddc..f60a64b18e0 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp @@ -164,7 +164,7 @@ bool TabSeparatedFormatReader::readField(IColumn & column, const DataTypePtr & t const SerializationPtr & serialization, bool is_last_file_column, const String & /*column_name*/) { const bool at_delimiter = !is_last_file_column && !buf->eof() && *buf->position() == '\t'; - const bool at_last_column_line_end = is_last_file_column && (buf->eof() || *buf->position() == '\n'); + const bool at_last_column_line_end = is_last_file_column && (buf->eof() || *buf->position() == '\n' || (format_settings.tsv.crlf_end_of_line_input && *buf->position() == '\r')); if (format_settings.tsv.empty_as_default && (at_delimiter || at_last_column_line_end)) { @@ -229,7 +229,10 @@ bool TabSeparatedFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer & out) try { - assertChar('\n', *buf); + if (!format_settings.tsv.crlf_end_of_line_input) + assertChar('\n', *buf); + else + assertChar('\r', *buf); } catch (const DB::Exception &) { @@ -242,7 +245,10 @@ bool 
TabSeparatedFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer & out) else if (*buf->position() == '\r') { out << "ERROR: Carriage return found where line feed is expected." - " It's like your file has DOS/Windows style line separators, that is illegal in TabSeparated format.\n"; + " It's like your file has DOS/Windows style line separators. \n" + "You must transform your file to Unix format. \n" + "But if you really need carriage return at end of string value of last column, you need to escape it as \\r \n" + "or else enable setting 'input_format_tsv_crlf_end_of_line'"; } else { @@ -357,7 +363,7 @@ void TabSeparatedFormatReader::skipRow() bool TabSeparatedFormatReader::checkForEndOfRow() { - return buf->eof() || *buf->position() == '\n'; + return buf->eof() || *buf->position() == '\n' || (format_settings.tsv.crlf_end_of_line_input && *buf->position() == '\r'); } TabSeparatedSchemaReader::TabSeparatedSchemaReader( From 69bb01e77a15cad1e022b7d8234b61373243070c Mon Sep 17 00:00:00 2001 From: Blargian Date: Thu, 29 Feb 2024 21:49:27 +0100 Subject: [PATCH 039/392] Fix style-check --- src/DataTypes/Serializations/SerializationNullable.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/DataTypes/Serializations/SerializationNullable.cpp b/src/DataTypes/Serializations/SerializationNullable.cpp index c796c147f1f..06361e24aa2 100644 --- a/src/DataTypes/Serializations/SerializationNullable.cpp +++ b/src/DataTypes/Serializations/SerializationNullable.cpp @@ -375,7 +375,7 @@ ReturnType deserializeTextEscapedAndRawImpl(IColumn & column, ReadBuffer & istr if (settings.tsv.crlf_end_of_line_input && null_representation.find('\r') != std::string::npos) throw DB::Exception(ErrorCodes::CANNOT_READ_ALL_DATA, "TSV custom null representation " "containing '\\r' may not work correctly for large input."); - + WriteBufferFromOwnString parsed_value; if constexpr (escaped) nested_serialization->serializeTextEscaped(nested_column, nested_column.size() - 1, parsed_value, settings); From 2ad8ab2a5719bbaeb8a1c3216cd93b760534c59a Mon Sep 17 00:00:00 2001 From: Blargian Date: Tue, 5 Mar 2024 19:12:49 +0100 Subject: [PATCH 040/392] Fix linker errors --- src/Formats/EscapingRuleUtils.cpp | 2 +- src/IO/ReadHelpers.h | 2 +- src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Formats/EscapingRuleUtils.cpp b/src/Formats/EscapingRuleUtils.cpp index 6b254102bdf..c7a6cb18625 100644 --- a/src/Formats/EscapingRuleUtils.cpp +++ b/src/Formats/EscapingRuleUtils.cpp @@ -236,7 +236,7 @@ String readByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escapin if constexpr (read_string) readEscapedString(result, buf); else - readTSVField(result, buf); + readTSVField(result, buf); break; default: throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot read value with {} escaping rule", escapingRuleToString(escaping_rule)); diff --git a/src/IO/ReadHelpers.h b/src/IO/ReadHelpers.h index 3a20d2480b8..f8e5887b82b 100644 --- a/src/IO/ReadHelpers.h +++ b/src/IO/ReadHelpers.h @@ -1897,8 +1897,8 @@ bool tryReadQuotedField(String & s, ReadBuffer & buf); void readJSONField(String & s, ReadBuffer & buf); bool tryReadJSONField(String & s, ReadBuffer & buf); -template void readTSVField(String & s, ReadBuffer & buf); +void readTSVFieldCRLF(String & s, ReadBuffer & buf); /** Parse the escape sequence, which can be simple (one character after backslash) or more complex (multiple characters). 
* It is assumed that the cursor is located on the `\` symbol diff --git a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp index f60a64b18e0..a800bf41ac9 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp @@ -121,7 +121,7 @@ String TabSeparatedFormatReader::readFieldIntoString() if constexpr (read_string) support_crlf ? readEscapedStringCRLF(field, *buf) : readEscapedString(field, *buf); else - support_crlf ? readTSVField(field, *buf) : readTSVField(field, *buf); + support_crlf ? readTSVFieldCRLF(field, *buf) : readTSVField(field, *buf); } return field; } From 2939ea07c8192aa9ed3bd8c75fe9ea42ded0a9cf Mon Sep 17 00:00:00 2001 From: Shaun Struwig <41984034+Blargian@users.noreply.github.com> Date: Wed, 6 Mar 2024 04:31:49 +0100 Subject: [PATCH 041/392] Update 02973_parse_crlf_with_tsv_files.sh Fix Fuzzer failing on parallel file creation/deletion issue --- .../0_stateless/02973_parse_crlf_with_tsv_files.sh | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.sh b/tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.sh index cb7472be418..df03da4d42b 100755 --- a/tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.sh +++ b/tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.sh @@ -6,8 +6,10 @@ CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # Data preparation step USER_FILES_PATH=$($CLICKHOUSE_CLIENT_BINARY --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') -DATA_FILE_UNIX_ENDINGS=${USER_FILES_PATH:?}/data_without_crlf.tsv -DATA_FILE_DOS_ENDINGS=${USER_FILES_PATH:?}/data_with_crlf.tsv +FILE_NAME_UNIX = "${CLICKHOUSE_TEST_UNIQUE_NAME}_data_without_crlf.tsv" +FILE_NAME_CRLF = "${CLICKHOUSE_TEST_UNIQUE_NAME}_data_with_crlf.tsv" +DATA_FILE_UNIX_ENDINGS=${USER_FILES_PATH:?}/FILE_NAME_UNIX +DATA_FILE_DOS_ENDINGS=${USER_FILES_PATH:?}/FILE_NAME_CRLF touch $DATA_FILE_UNIX_ENDINGS touch $DATA_FILE_DOS_ENDINGS @@ -16,11 +18,11 @@ echo -ne "Akiba_Hebrew_Academy\t2017-08-01\t241\nAegithina_tiphia\t2018-02-01\t3 echo -ne "Akiba_Hebrew_Academy\t2017-08-01\t241\r\nAegithina_tiphia\t2018-02-01\t34\r\n1971-72_Utah_Stars_season\t2016-10-01\t1\r\n" > $DATA_FILE_DOS_ENDINGS echo -e "<-- Read UNIX endings -->\n" -$CLICKHOUSE_CLIENT --query "SELECT * FROM file(data_without_crlf.tsv, 'TabSeparated', 'SearchTerm String, Date Date, Hits UInt32');" -$CLICKHOUSE_CLIENT --multiquery --query "SELECT * FROM file(data_with_crlf.tsv, 'TabSeparated', 'SearchTerm String, Date Date, Hits UInt32'); --{serverError 117}" +$CLICKHOUSE_CLIENT --query "SELECT * FROM file(${FILE_NAME_UNIX}, 'TabSeparated', 'SearchTerm String, Date Date, Hits UInt32');" +$CLICKHOUSE_CLIENT --multiquery --query "SELECT * FROM file(${FILE_NAME_CRLF}, 'TabSeparated', 'SearchTerm String, Date Date, Hits UInt32'); --{serverError 117}" echo -e "\n<-- Read DOS endings with setting input_format_tsv_crlf_end_of_line=1 -->\n" -$CLICKHOUSE_CLIENT --query "SELECT * FROM file(data_with_crlf.tsv, 'TabSeparated', 'SearchTerm String, Date Date, Hits UInt32') SETTINGS input_format_tsv_crlf_end_of_line = 1;" +$CLICKHOUSE_CLIENT --query "SELECT * FROM file(${FILE_NAME_CRLF}, 'TabSeparated', 'SearchTerm String, Date Date, Hits UInt32') SETTINGS input_format_tsv_crlf_end_of_line = 1;" # Test teardown rm 
$DATA_FILE_UNIX_ENDINGS From 7fa7d81c92007a8e6e8f4be2c3e348a20c3f3cd2 Mon Sep 17 00:00:00 2001 From: Shaun Struwig <41984034+Blargian@users.noreply.github.com> Date: Wed, 6 Mar 2024 06:03:24 +0100 Subject: [PATCH 042/392] Fix shell style check Remove space around = --- tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.sh b/tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.sh index df03da4d42b..345a01bab88 100755 --- a/tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.sh +++ b/tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.sh @@ -6,8 +6,8 @@ CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # Data preparation step USER_FILES_PATH=$($CLICKHOUSE_CLIENT_BINARY --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') -FILE_NAME_UNIX = "${CLICKHOUSE_TEST_UNIQUE_NAME}_data_without_crlf.tsv" -FILE_NAME_CRLF = "${CLICKHOUSE_TEST_UNIQUE_NAME}_data_with_crlf.tsv" +FILE_NAME_UNIX="${CLICKHOUSE_TEST_UNIQUE_NAME}_data_without_crlf.tsv" +FILE_NAME_CRLF="${CLICKHOUSE_TEST_UNIQUE_NAME}_data_with_crlf.tsv" DATA_FILE_UNIX_ENDINGS=${USER_FILES_PATH:?}/FILE_NAME_UNIX DATA_FILE_DOS_ENDINGS=${USER_FILES_PATH:?}/FILE_NAME_CRLF From 2f2139d53b4497e7fc192d53a3474392dac5ad00 Mon Sep 17 00:00:00 2001 From: Shaun Struwig <41984034+Blargian@users.noreply.github.com> Date: Wed, 6 Mar 2024 07:58:27 +0100 Subject: [PATCH 043/392] Update 02973_parse_crlf_with_tsv_files.sh --- .../queries/0_stateless/02973_parse_crlf_with_tsv_files.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.sh b/tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.sh index 345a01bab88..c36d65fa617 100755 --- a/tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.sh +++ b/tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.sh @@ -18,11 +18,11 @@ echo -ne "Akiba_Hebrew_Academy\t2017-08-01\t241\nAegithina_tiphia\t2018-02-01\t3 echo -ne "Akiba_Hebrew_Academy\t2017-08-01\t241\r\nAegithina_tiphia\t2018-02-01\t34\r\n1971-72_Utah_Stars_season\t2016-10-01\t1\r\n" > $DATA_FILE_DOS_ENDINGS echo -e "<-- Read UNIX endings -->\n" -$CLICKHOUSE_CLIENT --query "SELECT * FROM file(${FILE_NAME_UNIX}, 'TabSeparated', 'SearchTerm String, Date Date, Hits UInt32');" -$CLICKHOUSE_CLIENT --multiquery --query "SELECT * FROM file(${FILE_NAME_CRLF}, 'TabSeparated', 'SearchTerm String, Date Date, Hits UInt32'); --{serverError 117}" +$CLICKHOUSE_CLIENT --query "SELECT * FROM file(${DATA_FILE_UNIX_ENDINGS}, 'TabSeparated', 'SearchTerm String, Date Date, Hits UInt32');" +$CLICKHOUSE_CLIENT --multiquery --query "SELECT * FROM file(${DATA_FILE_DOS_ENDINGS}, 'TabSeparated', 'SearchTerm String, Date Date, Hits UInt32'); --{serverError 117}" echo -e "\n<-- Read DOS endings with setting input_format_tsv_crlf_end_of_line=1 -->\n" -$CLICKHOUSE_CLIENT --query "SELECT * FROM file(${FILE_NAME_CRLF}, 'TabSeparated', 'SearchTerm String, Date Date, Hits UInt32') SETTINGS input_format_tsv_crlf_end_of_line = 1;" +$CLICKHOUSE_CLIENT --query "SELECT * FROM file(${DATA_FILE_DOS_ENDINGS}, 'TabSeparated', 'SearchTerm String, Date Date, Hits UInt32') SETTINGS input_format_tsv_crlf_end_of_line = 1;" # Test teardown rm $DATA_FILE_UNIX_ENDINGS From 37a17172ccbe4c0f3aeee145ba569ef109ad9efd Mon Sep 17 00:00:00 2001 From: Shaun Struwig 
<41984034+Blargian@users.noreply.github.com> Date: Wed, 6 Mar 2024 08:18:13 +0100 Subject: [PATCH 044/392] Update 02973_parse_crlf_with_tsv_files.sh --- .../queries/0_stateless/02973_parse_crlf_with_tsv_files.sh | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.sh b/tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.sh index c36d65fa617..c521b936140 100755 --- a/tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.sh +++ b/tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.sh @@ -6,10 +6,8 @@ CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # Data preparation step USER_FILES_PATH=$($CLICKHOUSE_CLIENT_BINARY --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') -FILE_NAME_UNIX="${CLICKHOUSE_TEST_UNIQUE_NAME}_data_without_crlf.tsv" -FILE_NAME_CRLF="${CLICKHOUSE_TEST_UNIQUE_NAME}_data_with_crlf.tsv" -DATA_FILE_UNIX_ENDINGS=${USER_FILES_PATH:?}/FILE_NAME_UNIX -DATA_FILE_DOS_ENDINGS=${USER_FILES_PATH:?}/FILE_NAME_CRLF +DATA_FILE_UNIX_ENDINGS=${USER_FILES_PATH:?}/${CLICKHOUSE_TEST_UNIQUE_NAME}_data_without_crlf.tsv" +DATA_FILE_DOS_ENDINGS=${USER_FILES_PATH:?}/${CLICKHOUSE_TEST_UNIQUE_NAME}_data_with_crlf.tsv" touch $DATA_FILE_UNIX_ENDINGS touch $DATA_FILE_DOS_ENDINGS From 361b23c007e1099a9dea11d26c019b9b1b3fb251 Mon Sep 17 00:00:00 2001 From: Blargian Date: Wed, 6 Mar 2024 18:14:58 +0100 Subject: [PATCH 045/392] Use unique test name for parallel tests --- .../0_stateless/02973_parse_crlf_with_tsv_files.sh | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.sh b/tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.sh index c521b936140..c8a3d854d5a 100755 --- a/tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.sh +++ b/tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.sh @@ -6,9 +6,12 @@ CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # Data preparation step USER_FILES_PATH=$($CLICKHOUSE_CLIENT_BINARY --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') -DATA_FILE_UNIX_ENDINGS=${USER_FILES_PATH:?}/${CLICKHOUSE_TEST_UNIQUE_NAME}_data_without_crlf.tsv" -DATA_FILE_DOS_ENDINGS=${USER_FILES_PATH:?}/${CLICKHOUSE_TEST_UNIQUE_NAME}_data_with_crlf.tsv" +UNIX_ENDINGS="${CLICKHOUSE_TEST_UNIQUE_NAME}_data_without_crlf.tsv" +DOS_ENDINGS="${CLICKHOUSE_TEST_UNIQUE_NAME}_data_with_crlf.tsv" +DATA_FILE_UNIX_ENDINGS="${USER_FILES_PATH:?}/${UNIX_ENDINGS}" +DATA_FILE_DOS_ENDINGS="${USER_FILES_PATH:?}/${DOS_ENDINGS}" +echo $DATA_FILE_UNIX_ENDINGS touch $DATA_FILE_UNIX_ENDINGS touch $DATA_FILE_DOS_ENDINGS @@ -16,11 +19,11 @@ echo -ne "Akiba_Hebrew_Academy\t2017-08-01\t241\nAegithina_tiphia\t2018-02-01\t3 echo -ne "Akiba_Hebrew_Academy\t2017-08-01\t241\r\nAegithina_tiphia\t2018-02-01\t34\r\n1971-72_Utah_Stars_season\t2016-10-01\t1\r\n" > $DATA_FILE_DOS_ENDINGS echo -e "<-- Read UNIX endings -->\n" -$CLICKHOUSE_CLIENT --query "SELECT * FROM file(${DATA_FILE_UNIX_ENDINGS}, 'TabSeparated', 'SearchTerm String, Date Date, Hits UInt32');" -$CLICKHOUSE_CLIENT --multiquery --query "SELECT * FROM file(${DATA_FILE_DOS_ENDINGS}, 'TabSeparated', 'SearchTerm String, Date Date, Hits UInt32'); --{serverError 117}" +$CLICKHOUSE_CLIENT --query "SELECT * FROM file(${UNIX_ENDINGS}, 'TabSeparated', 'SearchTerm String, Date Date, Hits UInt32');" 
+$CLICKHOUSE_CLIENT --multiquery --query "SELECT * FROM file(${DOS_ENDINGS}, 'TabSeparated', 'SearchTerm String, Date Date, Hits UInt32'); --{serverError 117}" echo -e "\n<-- Read DOS endings with setting input_format_tsv_crlf_end_of_line=1 -->\n" -$CLICKHOUSE_CLIENT --query "SELECT * FROM file(${DATA_FILE_DOS_ENDINGS}, 'TabSeparated', 'SearchTerm String, Date Date, Hits UInt32') SETTINGS input_format_tsv_crlf_end_of_line = 1;" +$CLICKHOUSE_CLIENT --query "SELECT * FROM file(${DOS_ENDINGS}, 'TabSeparated', 'SearchTerm String, Date Date, Hits UInt32') SETTINGS input_format_tsv_crlf_end_of_line = 1;" # Test teardown rm $DATA_FILE_UNIX_ENDINGS From 0abed7aab1ed5d4aa160b03d64bb5846b5a982dc Mon Sep 17 00:00:00 2001 From: Blargian Date: Wed, 6 Mar 2024 22:10:24 +0100 Subject: [PATCH 046/392] Update reference file --- .../0_stateless/02973_parse_crlf_with_tsv_files.reference | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.reference b/tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.reference index 88d203bd723..14cf3a564e4 100644 --- a/tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.reference +++ b/tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.reference @@ -1,3 +1,4 @@ +/home/shaun/Desktop/ClickHouse/user_files/02973_parse_crlf_with_tsv_files_test_data_without_crlf.tsv <-- Read UNIX endings --> Akiba_Hebrew_Academy 2017-08-01 241 From 188fe4a93afa8db75afc9b75e6450424d1c4d542 Mon Sep 17 00:00:00 2001 From: Blargian Date: Wed, 6 Mar 2024 22:11:47 +0100 Subject: [PATCH 047/392] Update reference file --- .../0_stateless/02973_parse_crlf_with_tsv_files.reference | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.reference b/tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.reference index 14cf3a564e4..88d203bd723 100644 --- a/tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.reference +++ b/tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.reference @@ -1,4 +1,3 @@ -/home/shaun/Desktop/ClickHouse/user_files/02973_parse_crlf_with_tsv_files_test_data_without_crlf.tsv <-- Read UNIX endings --> Akiba_Hebrew_Academy 2017-08-01 241 From bb393890e8b85f33f7e08236d2bcc61029c5c449 Mon Sep 17 00:00:00 2001 From: Blargian Date: Fri, 8 Mar 2024 20:05:26 +0100 Subject: [PATCH 048/392] Test passes locally --- .../0.2973_parse_crlf_with_tsv_files.reference | 12 ++++++++++++ .../0_stateless/02973_parse_crlf_with_tsv_files.sh | 1 - 2 files changed, 12 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/0.2973_parse_crlf_with_tsv_files.reference diff --git a/tests/queries/0_stateless/0.2973_parse_crlf_with_tsv_files.reference b/tests/queries/0_stateless/0.2973_parse_crlf_with_tsv_files.reference new file mode 100644 index 00000000000..14cf3a564e4 --- /dev/null +++ b/tests/queries/0_stateless/0.2973_parse_crlf_with_tsv_files.reference @@ -0,0 +1,12 @@ +/home/shaun/Desktop/ClickHouse/user_files/02973_parse_crlf_with_tsv_files_test_data_without_crlf.tsv +<-- Read UNIX endings --> + +Akiba_Hebrew_Academy 2017-08-01 241 +Aegithina_tiphia 2018-02-01 34 +1971-72_Utah_Stars_season 2016-10-01 1 + +<-- Read DOS endings with setting input_format_tsv_crlf_end_of_line=1 --> + +Akiba_Hebrew_Academy 2017-08-01 241 +Aegithina_tiphia 2018-02-01 34 +1971-72_Utah_Stars_season 2016-10-01 1 diff --git a/tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.sh b/tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.sh index 
c8a3d854d5a..14f28f1ba4a 100755 --- a/tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.sh +++ b/tests/queries/0_stateless/02973_parse_crlf_with_tsv_files.sh @@ -11,7 +11,6 @@ DOS_ENDINGS="${CLICKHOUSE_TEST_UNIQUE_NAME}_data_with_crlf.tsv" DATA_FILE_UNIX_ENDINGS="${USER_FILES_PATH:?}/${UNIX_ENDINGS}" DATA_FILE_DOS_ENDINGS="${USER_FILES_PATH:?}/${DOS_ENDINGS}" -echo $DATA_FILE_UNIX_ENDINGS touch $DATA_FILE_UNIX_ENDINGS touch $DATA_FILE_DOS_ENDINGS From 70272d41744d9cc219d79c6dd5e3b6c9e523d447 Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 11 Mar 2024 10:55:01 +0100 Subject: [PATCH 049/392] Minor --- src/CMakeLists.txt | 2 +- src/Databases/DatabaseHDFS.cpp | 2 +- .../ObjectStorages/HDFS/HDFSObjectStorage.cpp | 6 +- .../ObjectStorages/HDFS/HDFSObjectStorage.h | 2 +- .../ObjectStorages/ObjectStorageFactory.cpp | 2 +- src/IO/examples/read_buffer_from_hdfs.cpp | 2 +- src/Storages/HDFS/StorageHDFS.cpp | 1200 ----------------- src/Storages/Hive/HiveCommon.h | 2 +- src/Storages/Hive/HiveFile.h | 2 +- src/Storages/Hive/StorageHive.cpp | 4 +- src/Storages/Hive/StorageHive.h | 2 +- .../HDFS/AsynchronousReadBufferFromHDFS.cpp | 2 +- .../HDFS/AsynchronousReadBufferFromHDFS.h | 2 +- .../ObjectStorage/HDFS/Configuration.cpp | 2 +- .../{ => ObjectStorage}/HDFS/HDFSCommon.cpp | 2 +- .../{ => ObjectStorage}/HDFS/HDFSCommon.h | 0 .../HDFS/ReadBufferFromHDFS.cpp | 2 +- .../HDFS/ReadBufferFromHDFS.h | 0 .../HDFS/WriteBufferFromHDFS.cpp | 4 +- .../HDFS/WriteBufferFromHDFS.h | 0 .../examples/async_read_buffer_from_hdfs.cpp | 2 +- 21 files changed, 21 insertions(+), 1221 deletions(-) delete mode 100644 src/Storages/HDFS/StorageHDFS.cpp rename src/Storages/{ => ObjectStorage}/HDFS/AsynchronousReadBufferFromHDFS.cpp (99%) rename src/Storages/{ => ObjectStorage}/HDFS/AsynchronousReadBufferFromHDFS.h (96%) rename src/Storages/{ => ObjectStorage}/HDFS/HDFSCommon.cpp (99%) rename src/Storages/{ => ObjectStorage}/HDFS/HDFSCommon.h (100%) rename src/Storages/{ => ObjectStorage}/HDFS/ReadBufferFromHDFS.cpp (99%) rename src/Storages/{ => ObjectStorage}/HDFS/ReadBufferFromHDFS.h (100%) rename src/Storages/{ => ObjectStorage}/HDFS/WriteBufferFromHDFS.cpp (97%) rename src/Storages/{ => ObjectStorage}/HDFS/WriteBufferFromHDFS.h (100%) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 1cf0e4e2b98..3cb64b56c46 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -150,7 +150,7 @@ if (TARGET ch_contrib::azure_sdk) endif() if (TARGET ch_contrib::hdfs) - add_headers_and_sources(dbms Storages/HDFS) + add_headers_and_sources(dbms Storages/ObjectStorage/HDFS) add_headers_and_sources(dbms Disks/ObjectStorages/HDFS) endif() diff --git a/src/Databases/DatabaseHDFS.cpp b/src/Databases/DatabaseHDFS.cpp index 3a1e6b16ccf..cda38a69c9a 100644 --- a/src/Databases/DatabaseHDFS.cpp +++ b/src/Databases/DatabaseHDFS.cpp @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp index 8bff687b915..2d03de60c3c 100644 --- a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp +++ b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp @@ -1,10 +1,10 @@ #include #include -#include -#include +#include +#include -#include +#include #include #include diff --git a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h index 66095eb9f8f..4072d21ed7c 100644 --- a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h +++ 
b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h @@ -6,7 +6,7 @@ #include #include -#include +#include #include #include #include diff --git a/src/Disks/ObjectStorages/ObjectStorageFactory.cpp b/src/Disks/ObjectStorages/ObjectStorageFactory.cpp index 02b6816d673..d1841c92a6b 100644 --- a/src/Disks/ObjectStorages/ObjectStorageFactory.cpp +++ b/src/Disks/ObjectStorages/ObjectStorageFactory.cpp @@ -7,7 +7,7 @@ #endif #if USE_HDFS && !defined(CLICKHOUSE_KEEPER_STANDALONE_BUILD) #include -#include +#include #endif #if USE_AZURE_BLOB_STORAGE && !defined(CLICKHOUSE_KEEPER_STANDALONE_BUILD) #include diff --git a/src/IO/examples/read_buffer_from_hdfs.cpp b/src/IO/examples/read_buffer_from_hdfs.cpp index 977dd2ae227..a5cf43b3e79 100644 --- a/src/IO/examples/read_buffer_from_hdfs.cpp +++ b/src/IO/examples/read_buffer_from_hdfs.cpp @@ -2,7 +2,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/Storages/HDFS/StorageHDFS.cpp b/src/Storages/HDFS/StorageHDFS.cpp deleted file mode 100644 index cd935fa3100..00000000000 --- a/src/Storages/HDFS/StorageHDFS.cpp +++ /dev/null @@ -1,1200 +0,0 @@ -#include "config.h" - -#if USE_HDFS - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include - -#include -#include - -#include - -namespace fs = std::filesystem; - -namespace ProfileEvents -{ - extern const Event EngineFileLikeReadFiles; -} - -namespace DB -{ -namespace ErrorCodes -{ - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; - extern const int ACCESS_DENIED; - extern const int DATABASE_ACCESS_DENIED; - extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; - extern const int BAD_ARGUMENTS; - extern const int LOGICAL_ERROR; - extern const int CANNOT_COMPILE_REGEXP; - extern const int CANNOT_DETECT_FORMAT; -} -namespace -{ - struct HDFSFileInfoDeleter - { - /// Can have only one entry (see hdfsGetPathInfo()) - void operator()(hdfsFileInfo * info) { hdfsFreeFileInfo(info, 1); } - }; - using HDFSFileInfoPtr = std::unique_ptr; - - /* Recursive directory listing with matched paths as a result. - * Have the same method in StorageFile. 
- */ - std::vector LSWithRegexpMatching( - const String & path_for_ls, - const HDFSFSPtr & fs, - const String & for_match) - { - std::vector result; - - const size_t first_glob_pos = for_match.find_first_of("*?{"); - - if (first_glob_pos == std::string::npos) - { - const String path = fs::path(path_for_ls + for_match.substr(1)).lexically_normal(); - HDFSFileInfoPtr hdfs_info(hdfsGetPathInfo(fs.get(), path.c_str())); - if (hdfs_info) // NOLINT - { - result.push_back(StorageHDFS::PathWithInfo{ - String(path), - StorageHDFS::PathInfo{hdfs_info->mLastMod, static_cast(hdfs_info->mSize)}}); - } - return result; - } - - const size_t end_of_path_without_globs = for_match.substr(0, first_glob_pos).rfind('/'); - const String suffix_with_globs = for_match.substr(end_of_path_without_globs); /// begin with '/' - const String prefix_without_globs = path_for_ls + for_match.substr(1, end_of_path_without_globs); /// ends with '/' - - const size_t next_slash_after_glob_pos = suffix_with_globs.find('/', 1); - - const std::string current_glob = suffix_with_globs.substr(0, next_slash_after_glob_pos); - - re2::RE2 matcher(makeRegexpPatternFromGlobs(current_glob)); - if (!matcher.ok()) - throw Exception(ErrorCodes::CANNOT_COMPILE_REGEXP, - "Cannot compile regex from glob ({}): {}", for_match, matcher.error()); - - HDFSFileInfo ls; - ls.file_info = hdfsListDirectory(fs.get(), prefix_without_globs.data(), &ls.length); - if (ls.file_info == nullptr && errno != ENOENT) // NOLINT - { - // ignore file not found exception, keep throw other exception, libhdfs3 doesn't have function to get exception type, so use errno. - throw Exception( - ErrorCodes::ACCESS_DENIED, "Cannot list directory {}: {}", prefix_without_globs, String(hdfsGetLastError())); - } - - if (!ls.file_info && ls.length > 0) - throw Exception(ErrorCodes::LOGICAL_ERROR, "file_info shouldn't be null"); - for (int i = 0; i < ls.length; ++i) - { - const String full_path = fs::path(ls.file_info[i].mName).lexically_normal(); - const size_t last_slash = full_path.rfind('/'); - const String file_name = full_path.substr(last_slash); - const bool looking_for_directory = next_slash_after_glob_pos != std::string::npos; - const bool is_directory = ls.file_info[i].mKind == 'D'; - /// Condition with type of current file_info means what kind of path is it in current iteration of ls - if (!is_directory && !looking_for_directory) - { - if (re2::RE2::FullMatch(file_name, matcher)) - result.push_back(StorageHDFS::PathWithInfo{ - String(full_path), - StorageHDFS::PathInfo{ls.file_info[i].mLastMod, static_cast(ls.file_info[i].mSize)}}); - } - else if (is_directory && looking_for_directory) - { - if (re2::RE2::FullMatch(file_name, matcher)) - { - std::vector result_part = LSWithRegexpMatching(fs::path(full_path) / "", fs, - suffix_with_globs.substr(next_slash_after_glob_pos)); - /// Recursion depth is limited by pattern. '*' works only for depth = 1, for depth = 2 pattern path is '*/*'. So we do not need additional check. 
- std::move(result_part.begin(), result_part.end(), std::back_inserter(result)); - } - } - } - - return result; - } - - std::pair getPathFromUriAndUriWithoutPath(const String & uri) - { - auto pos = uri.find("//"); - if (pos != std::string::npos && pos + 2 < uri.length()) - { - pos = uri.find('/', pos + 2); - if (pos != std::string::npos) - return {uri.substr(pos), uri.substr(0, pos)}; - } - - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Storage HDFS requires valid URL to be set"); - } - - std::vector getPathsList(const String & path_from_uri, const String & uri_without_path, ContextPtr context) - { - HDFSBuilderWrapper builder = createHDFSBuilder(uri_without_path + "/", context->getGlobalContext()->getConfigRef()); - HDFSFSPtr fs = createHDFSFS(builder.get()); - - Strings paths = expandSelectionGlob(path_from_uri); - - std::vector res; - - for (const auto & path : paths) - { - auto part_of_res = LSWithRegexpMatching("/", fs, path); - res.insert(res.end(), part_of_res.begin(), part_of_res.end()); - } - return res; - } -} - -StorageHDFS::StorageHDFS( - const String & uri_, - const StorageID & table_id_, - const String & format_name_, - const ColumnsDescription & columns_, - const ConstraintsDescription & constraints_, - const String & comment, - const ContextPtr & context_, - const String & compression_method_, - const bool distributed_processing_, - ASTPtr partition_by_) - : IStorage(table_id_) - , WithContext(context_) - , uris({uri_}) - , format_name(format_name_) - , compression_method(compression_method_) - , distributed_processing(distributed_processing_) - , partition_by(partition_by_) -{ - if (format_name != "auto") - FormatFactory::instance().checkFormatName(format_name); - context_->getRemoteHostFilter().checkURL(Poco::URI(uri_)); - checkHDFSURL(uri_); - - String path = uri_.substr(uri_.find('/', uri_.find("//") + 2)); - is_path_with_globs = path.find_first_of("*?{") != std::string::npos; - - StorageInMemoryMetadata storage_metadata; - - if (columns_.empty()) - { - ColumnsDescription columns; - if (format_name == "auto") - std::tie(columns, format_name) = getTableStructureAndFormatFromData(uri_, compression_method_, context_); - else - columns = getTableStructureFromData(format_name, uri_, compression_method, context_); - - storage_metadata.setColumns(columns); - } - else - { - if (format_name == "auto") - format_name = getTableStructureAndFormatFromData(uri_, compression_method_, context_).second; - - /// We don't allow special columns in HDFS storage. 
- if (!columns_.hasOnlyOrdinary()) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Table engine HDFS doesn't support special columns like MATERIALIZED, ALIAS or EPHEMERAL"); - storage_metadata.setColumns(columns_); - } - - storage_metadata.setConstraints(constraints_); - storage_metadata.setComment(comment); - setInMemoryMetadata(storage_metadata); - - virtual_columns = VirtualColumnUtils::getPathFileAndSizeVirtualsForStorage(storage_metadata.getSampleBlock().getNamesAndTypesList()); -} - -namespace -{ - class ReadBufferIterator : public IReadBufferIterator, WithContext - { - public: - ReadBufferIterator( - const std::vector & paths_with_info_, - const String & uri_without_path_, - std::optional format_, - const String & compression_method_, - const ContextPtr & context_) - : WithContext(context_) - , paths_with_info(paths_with_info_) - , uri_without_path(uri_without_path_) - , format(std::move(format_)) - , compression_method(compression_method_) - { - } - - Data next() override - { - bool is_first = current_index == 0; - /// For default mode check cached columns for all paths on first iteration. - if (is_first && getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT) - { - if (auto cached_columns = tryGetColumnsFromCache(paths_with_info)) - return {nullptr, cached_columns, format}; - } - - StorageHDFS::PathWithInfo path_with_info; - - while (true) - { - if (current_index == paths_with_info.size()) - { - if (is_first) - { - if (format) - throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "The table structure cannot be extracted from a {} format file, because all files are empty. " - "You can specify table structure manually", *format); - - throw Exception( - ErrorCodes::CANNOT_DETECT_FORMAT, - "The data format cannot be detected by the contents of the files, because all files are empty. 
You can specify table structure manually"); - } - return {nullptr, std::nullopt, format}; - } - - path_with_info = paths_with_info[current_index++]; - if (getContext()->getSettingsRef().hdfs_skip_empty_files && path_with_info.info && path_with_info.info->size == 0) - continue; - - if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::UNION) - { - std::vector paths = {path_with_info}; - if (auto cached_columns = tryGetColumnsFromCache(paths)) - return {nullptr, cached_columns, format}; - } - - auto compression = chooseCompressionMethod(path_with_info.path, compression_method); - auto impl = std::make_unique(uri_without_path, path_with_info.path, getContext()->getGlobalContext()->getConfigRef(), getContext()->getReadSettings()); - if (!getContext()->getSettingsRef().hdfs_skip_empty_files || !impl->eof()) - { - const Int64 zstd_window_log_max = getContext()->getSettingsRef().zstd_window_log_max; - return {wrapReadBufferWithCompressionMethod(std::move(impl), compression, static_cast(zstd_window_log_max)), std::nullopt, format}; - } - } - } - - void setNumRowsToLastFile(size_t num_rows) override - { - if (!getContext()->getSettingsRef().schema_inference_use_cache_for_hdfs) - return; - - String source = uri_without_path + paths_with_info[current_index - 1].path; - auto key = getKeyForSchemaCache(source, *format, std::nullopt, getContext()); - StorageHDFS::getSchemaCache(getContext()).addNumRows(key, num_rows); - } - - void setSchemaToLastFile(const ColumnsDescription & columns) override - { - if (!getContext()->getSettingsRef().schema_inference_use_cache_for_hdfs - || getContext()->getSettingsRef().schema_inference_mode != SchemaInferenceMode::UNION) - return; - - String source = uri_without_path + paths_with_info[current_index - 1].path; - auto key = getKeyForSchemaCache(source, *format, std::nullopt, getContext()); - StorageHDFS::getSchemaCache(getContext()).addColumns(key, columns); - } - - void setResultingSchema(const ColumnsDescription & columns) override - { - if (!getContext()->getSettingsRef().schema_inference_use_cache_for_hdfs - || getContext()->getSettingsRef().schema_inference_mode != SchemaInferenceMode::DEFAULT) - return; - - Strings sources; - sources.reserve(paths_with_info.size()); - std::transform(paths_with_info.begin(), paths_with_info.end(), std::back_inserter(sources), [&](const StorageHDFS::PathWithInfo & path_with_info){ return uri_without_path + path_with_info.path; }); - auto cache_keys = getKeysForSchemaCache(sources, *format, {}, getContext()); - StorageHDFS::getSchemaCache(getContext()).addManyColumns(cache_keys, columns); - } - - void setFormatName(const String & format_name) override - { - format = format_name; - } - - String getLastFileName() const override - { - if (current_index != 0) - return paths_with_info[current_index - 1].path; - - return ""; - } - - bool supportsLastReadBufferRecreation() const override { return true; } - - std::unique_ptr recreateLastReadBuffer() override - { - chassert(current_index > 0 && current_index <= paths_with_info.size()); - auto path_with_info = paths_with_info[current_index - 1]; - auto compression = chooseCompressionMethod(path_with_info.path, compression_method); - auto impl = std::make_unique(uri_without_path, path_with_info.path, getContext()->getGlobalContext()->getConfigRef(), getContext()->getReadSettings()); - const Int64 zstd_window_log_max = getContext()->getSettingsRef().zstd_window_log_max; - return wrapReadBufferWithCompressionMethod(std::move(impl), compression, 
static_cast(zstd_window_log_max)); - } - - private: - std::optional tryGetColumnsFromCache(const std::vector & paths_with_info_) - { - auto context = getContext(); - - if (!context->getSettingsRef().schema_inference_use_cache_for_hdfs) - return std::nullopt; - - auto & schema_cache = StorageHDFS::getSchemaCache(context); - for (const auto & path_with_info : paths_with_info_) - { - auto get_last_mod_time = [&]() -> std::optional - { - if (path_with_info.info) - return path_with_info.info->last_mod_time; - - auto builder = createHDFSBuilder(uri_without_path + "/", context->getGlobalContext()->getConfigRef()); - auto fs = createHDFSFS(builder.get()); - HDFSFileInfoPtr hdfs_info(hdfsGetPathInfo(fs.get(), path_with_info.path.c_str())); - if (hdfs_info) - return hdfs_info->mLastMod; - - return std::nullopt; - }; - - String url = uri_without_path + path_with_info.path; - if (format) - { - auto cache_key = getKeyForSchemaCache(url, *format, {}, context); - if (auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time)) - return columns; - } - else - { - /// If format is unknown, we can iterate through all possible input formats - /// and check if we have an entry with this format and this file in schema cache. - /// If we have such entry for some format, we can use this format to read the file. - for (const auto & format_name : FormatFactory::instance().getAllInputFormats()) - { - auto cache_key = getKeyForSchemaCache(url, format_name, {}, context); - if (auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time)) - { - /// Now format is known. It should be the same for all files. - format = format_name; - return columns; - } - } - } - } - - return std::nullopt; - } - - const std::vector & paths_with_info; - const String & uri_without_path; - std::optional format; - const String & compression_method; - size_t current_index = 0; - }; -} - -std::pair StorageHDFS::getTableStructureAndFormatFromDataImpl( - std::optional format, - const String & uri, - const String & compression_method, - const ContextPtr & ctx) -{ - const auto [path_from_uri, uri_without_path] = getPathFromUriAndUriWithoutPath(uri); - auto paths_with_info = getPathsList(path_from_uri, uri, ctx); - - if (paths_with_info.empty() && (!format || !FormatFactory::instance().checkIfFormatHasExternalSchemaReader(*format))) - { - if (format) - throw Exception( - ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "The table structure cannot be extracted from a {} format file, because there are no files in HDFS with provided path." - " You can specify table structure manually", *format); - - throw Exception( - ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "The data format cannot be detected by the contents of the files, because there are no files in HDFS with provided path." 
- " You can specify the format manually"); - } - - ReadBufferIterator read_buffer_iterator(paths_with_info, uri_without_path, format, compression_method, ctx); - if (format) - return {readSchemaFromFormat(*format, std::nullopt, read_buffer_iterator, ctx), *format}; - return detectFormatAndReadSchema(std::nullopt, read_buffer_iterator, ctx); -} - -std::pair StorageHDFS::getTableStructureAndFormatFromData(const String & uri, const String & compression_method, const ContextPtr & ctx) -{ - return getTableStructureAndFormatFromDataImpl(std::nullopt, uri, compression_method, ctx); -} - -ColumnsDescription StorageHDFS::getTableStructureFromData(const String & format, const String & uri, const String & compression_method, const DB::ContextPtr & ctx) -{ - return getTableStructureAndFormatFromDataImpl(format, uri, compression_method, ctx).first; -} - -class HDFSSource::DisclosedGlobIterator::Impl -{ -public: - Impl(const String & uri, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, const ContextPtr & context) - { - const auto [path_from_uri, uri_without_path] = getPathFromUriAndUriWithoutPath(uri); - uris = getPathsList(path_from_uri, uri_without_path, context); - ActionsDAGPtr filter_dag; - if (!uris.empty()) - filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns); - - if (filter_dag) - { - std::vector paths; - paths.reserve(uris.size()); - for (const auto & path_with_info : uris) - paths.push_back(path_with_info.path); - - VirtualColumnUtils::filterByPathOrFile(uris, paths, filter_dag, virtual_columns, context); - } - auto file_progress_callback = context->getFileProgressCallback(); - - for (auto & elem : uris) - { - elem.path = uri_without_path + elem.path; - if (file_progress_callback && elem.info) - file_progress_callback(FileProgress(0, elem.info->size)); - } - uris_iter = uris.begin(); - } - - StorageHDFS::PathWithInfo next() - { - std::lock_guard lock(mutex); - if (uris_iter != uris.end()) - { - auto answer = *uris_iter; - ++uris_iter; - return answer; - } - return {}; - } -private: - std::mutex mutex; - std::vector uris; - std::vector::iterator uris_iter; -}; - -class HDFSSource::URISIterator::Impl : WithContext -{ -public: - explicit Impl(const std::vector & uris_, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, const ContextPtr & context_) - : WithContext(context_), uris(uris_), file_progress_callback(context_->getFileProgressCallback()) - { - ActionsDAGPtr filter_dag; - if (!uris.empty()) - filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns); - - if (filter_dag) - { - std::vector paths; - paths.reserve(uris.size()); - for (const auto & uri : uris) - paths.push_back(getPathFromUriAndUriWithoutPath(uri).first); - - VirtualColumnUtils::filterByPathOrFile(uris, paths, filter_dag, virtual_columns, getContext()); - } - - if (!uris.empty()) - { - auto path_and_uri = getPathFromUriAndUriWithoutPath(uris[0]); - builder = createHDFSBuilder(path_and_uri.second + "/", getContext()->getGlobalContext()->getConfigRef()); - fs = createHDFSFS(builder.get()); - } - } - - StorageHDFS::PathWithInfo next() - { - String uri; - HDFSFileInfoPtr hdfs_info; - do - { - size_t current_index = index.fetch_add(1); - if (current_index >= uris.size()) - return {"", {}}; - - uri = uris[current_index]; - auto path_and_uri = getPathFromUriAndUriWithoutPath(uri); - hdfs_info.reset(hdfsGetPathInfo(fs.get(), path_and_uri.first.c_str())); - } - /// Skip non-existed files. 
- while (!hdfs_info && String(hdfsGetLastError()).find("FileNotFoundException") != std::string::npos); - - std::optional info; - if (hdfs_info) - { - info = StorageHDFS::PathInfo{hdfs_info->mLastMod, static_cast(hdfs_info->mSize)}; - if (file_progress_callback) - file_progress_callback(FileProgress(0, hdfs_info->mSize)); - } - - return {uri, info}; - } - -private: - std::atomic_size_t index = 0; - Strings uris; - HDFSBuilderWrapper builder; - HDFSFSPtr fs; - std::function file_progress_callback; -}; - -HDFSSource::DisclosedGlobIterator::DisclosedGlobIterator(const String & uri, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, const ContextPtr & context) - : pimpl(std::make_shared(uri, predicate, virtual_columns, context)) {} - -StorageHDFS::PathWithInfo HDFSSource::DisclosedGlobIterator::next() -{ - return pimpl->next(); -} - -HDFSSource::URISIterator::URISIterator(const std::vector & uris_, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, const ContextPtr & context) - : pimpl(std::make_shared(uris_, predicate, virtual_columns, context)) -{ -} - -StorageHDFS::PathWithInfo HDFSSource::URISIterator::next() -{ - return pimpl->next(); -} - -HDFSSource::HDFSSource( - const ReadFromFormatInfo & info, - StorageHDFSPtr storage_, - const ContextPtr & context_, - UInt64 max_block_size_, - std::shared_ptr file_iterator_, - bool need_only_count_) - : ISource(info.source_header, false) - , WithContext(context_) - , storage(std::move(storage_)) - , block_for_format(info.format_header) - , requested_columns(info.requested_columns) - , requested_virtual_columns(info.requested_virtual_columns) - , max_block_size(max_block_size_) - , file_iterator(file_iterator_) - , columns_description(info.columns_description) - , need_only_count(need_only_count_) -{ - initialize(); -} - -bool HDFSSource::initialize() -{ - bool skip_empty_files = getContext()->getSettingsRef().hdfs_skip_empty_files; - StorageHDFS::PathWithInfo path_with_info; - while (true) - { - path_with_info = (*file_iterator)(); - if (path_with_info.path.empty()) - return false; - - if (path_with_info.info && skip_empty_files && path_with_info.info->size == 0) - continue; - - current_path = path_with_info.path; - const auto [path_from_uri, uri_without_path] = getPathFromUriAndUriWithoutPath(current_path); - - std::optional file_size; - if (!path_with_info.info) - { - auto builder = createHDFSBuilder(uri_without_path + "/", getContext()->getGlobalContext()->getConfigRef()); - auto fs = createHDFSFS(builder.get()); - HDFSFileInfoPtr hdfs_info(hdfsGetPathInfo(fs.get(), path_from_uri.c_str())); - if (hdfs_info) - path_with_info.info = StorageHDFS::PathInfo{hdfs_info->mLastMod, static_cast(hdfs_info->mSize)}; - } - - if (path_with_info.info) - file_size = path_with_info.info->size; - - auto compression = chooseCompressionMethod(path_from_uri, storage->compression_method); - auto impl = std::make_unique( - uri_without_path, path_from_uri, getContext()->getGlobalContext()->getConfigRef(), getContext()->getReadSettings(), 0, false, file_size); - if (!skip_empty_files || !impl->eof()) - { - impl->setProgressCallback(getContext()); - const Int64 zstd_window_log_max = getContext()->getSettingsRef().zstd_window_log_max; - read_buf = wrapReadBufferWithCompressionMethod(std::move(impl), compression, static_cast(zstd_window_log_max)); - break; - } - } - - current_path = path_with_info.path; - current_file_size = path_with_info.info ? 
std::optional(path_with_info.info->size) : std::nullopt; - - QueryPipelineBuilder builder; - std::optional num_rows_from_cache = need_only_count && getContext()->getSettingsRef().use_cache_for_count_from_files ? tryGetNumRowsFromCache(path_with_info) : std::nullopt; - if (num_rows_from_cache) - { - /// We should not return single chunk with all number of rows, - /// because there is a chance that this chunk will be materialized later - /// (it can cause memory problems even with default values in columns or when virtual columns are requested). - /// Instead, we use a special ConstChunkGenerator that will generate chunks - /// with max_block_size rows until total number of rows is reached. - auto source = std::make_shared(block_for_format, *num_rows_from_cache, max_block_size); - builder.init(Pipe(source)); - } - else - { - std::optional max_parsing_threads; - if (need_only_count) - max_parsing_threads = 1; - - input_format = getContext()->getInputFormat(storage->format_name, *read_buf, block_for_format, max_block_size, std::nullopt, max_parsing_threads); - - if (need_only_count) - input_format->needOnlyCount(); - - builder.init(Pipe(input_format)); - if (columns_description.hasDefaults()) - { - builder.addSimpleTransform([&](const Block & header) - { - return std::make_shared(header, columns_description, *input_format, getContext()); - }); - } - } - - /// Add ExtractColumnsTransform to extract requested columns/subcolumns - /// from the chunk read by IInputFormat. - builder.addSimpleTransform([&](const Block & header) - { - return std::make_shared(header, requested_columns); - }); - - pipeline = std::make_unique(QueryPipelineBuilder::getPipeline(std::move(builder))); - reader = std::make_unique(*pipeline); - - ProfileEvents::increment(ProfileEvents::EngineFileLikeReadFiles); - return true; -} - -String HDFSSource::getName() const -{ - return "HDFSSource"; -} - -Chunk HDFSSource::generate() -{ - while (true) - { - if (isCancelled() || !reader) - { - if (reader) - reader->cancel(); - break; - } - - Chunk chunk; - if (reader->pull(chunk)) - { - UInt64 num_rows = chunk.getNumRows(); - total_rows_in_file += num_rows; - size_t chunk_size = 0; - if (input_format) - chunk_size = input_format->getApproxBytesReadForChunk(); - progress(num_rows, chunk_size ? 
chunk_size : chunk.bytes()); - VirtualColumnUtils::addRequestedPathFileAndSizeVirtualsToChunk(chunk, requested_virtual_columns, current_path, current_file_size); - return chunk; - } - - if (input_format && getContext()->getSettingsRef().use_cache_for_count_from_files) - addNumRowsToCache(current_path, total_rows_in_file); - - total_rows_in_file = 0; - - reader.reset(); - pipeline.reset(); - input_format.reset(); - read_buf.reset(); - - if (!initialize()) - break; - } - return {}; -} - -void HDFSSource::addNumRowsToCache(const String & path, size_t num_rows) -{ - auto cache_key = getKeyForSchemaCache(path, storage->format_name, std::nullopt, getContext()); - StorageHDFS::getSchemaCache(getContext()).addNumRows(cache_key, num_rows); -} - -std::optional HDFSSource::tryGetNumRowsFromCache(const StorageHDFS::PathWithInfo & path_with_info) -{ - auto cache_key = getKeyForSchemaCache(path_with_info.path, storage->format_name, std::nullopt, getContext()); - auto get_last_mod_time = [&]() -> std::optional - { - if (path_with_info.info) - return path_with_info.info->last_mod_time; - return std::nullopt; - }; - - return StorageHDFS::getSchemaCache(getContext()).tryGetNumRows(cache_key, get_last_mod_time); -} - -class HDFSSink : public SinkToStorage -{ -public: - HDFSSink(const String & uri, - const String & format, - const Block & sample_block, - const ContextPtr & context, - const CompressionMethod compression_method) - : SinkToStorage(sample_block) - { - const auto & settings = context->getSettingsRef(); - write_buf = wrapWriteBufferWithCompressionMethod( - std::make_unique( - uri, context->getGlobalContext()->getConfigRef(), context->getSettingsRef().hdfs_replication, context->getWriteSettings()), - compression_method, - static_cast(settings.output_format_compression_level), - static_cast(settings.output_format_compression_zstd_window_log)); - writer = FormatFactory::instance().getOutputFormatParallelIfPossible(format, *write_buf, sample_block, context); - } - - String getName() const override { return "HDFSSink"; } - - void consume(Chunk chunk) override - { - std::lock_guard lock(cancel_mutex); - if (cancelled) - return; - writer->write(getHeader().cloneWithColumns(chunk.detachColumns())); - } - - void onCancel() override - { - std::lock_guard lock(cancel_mutex); - finalize(); - cancelled = true; - } - - void onException(std::exception_ptr exception) override - { - std::lock_guard lock(cancel_mutex); - try - { - std::rethrow_exception(exception); - } - catch (...) - { - /// An exception context is needed to proper delete write buffers without finalization - release(); - } - } - - void onFinish() override - { - std::lock_guard lock(cancel_mutex); - finalize(); - } - -private: - void finalize() - { - if (!writer) - return; - - try - { - writer->finalize(); - writer->flush(); - write_buf->sync(); - write_buf->finalize(); - } - catch (...) - { - /// Stop ParallelFormattingOutputFormat correctly. 
- release(); - throw; - } - } - - void release() - { - writer.reset(); - write_buf->finalize(); - } - - std::unique_ptr write_buf; - OutputFormatPtr writer; - std::mutex cancel_mutex; - bool cancelled = false; -}; - -class PartitionedHDFSSink : public PartitionedSink -{ -public: - PartitionedHDFSSink( - const ASTPtr & partition_by, - const String & uri_, - const String & format_, - const Block & sample_block_, - ContextPtr context_, - const CompressionMethod compression_method_) - : PartitionedSink(partition_by, context_, sample_block_) - , uri(uri_) - , format(format_) - , sample_block(sample_block_) - , context(context_) - , compression_method(compression_method_) - { - } - - SinkPtr createSinkForPartition(const String & partition_id) override - { - auto path = PartitionedSink::replaceWildcards(uri, partition_id); - PartitionedSink::validatePartitionKey(path, true); - return std::make_shared(path, format, sample_block, context, compression_method); - } - -private: - const String uri; - const String format; - const Block sample_block; - ContextPtr context; - const CompressionMethod compression_method; -}; - - -bool StorageHDFS::supportsSubsetOfColumns(const ContextPtr & context_) const -{ - return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(format_name, context_); -} - -class ReadFromHDFS : public SourceStepWithFilter -{ -public: - std::string getName() const override { return "ReadFromHDFS"; } - void initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &) override; - void applyFilters(ActionDAGNodes added_filter_nodes) override; - - ReadFromHDFS( - const Names & column_names_, - const SelectQueryInfo & query_info_, - const StorageSnapshotPtr & storage_snapshot_, - const ContextPtr & context_, - Block sample_block, - ReadFromFormatInfo info_, - bool need_only_count_, - std::shared_ptr storage_, - size_t max_block_size_, - size_t num_streams_) - : SourceStepWithFilter( - DataStream{.header = std::move(sample_block)}, - column_names_, - query_info_, - storage_snapshot_, - context_) - , info(std::move(info_)) - , need_only_count(need_only_count_) - , storage(std::move(storage_)) - , max_block_size(max_block_size_) - , num_streams(num_streams_) - { - } - -private: - ReadFromFormatInfo info; - const bool need_only_count; - std::shared_ptr storage; - - size_t max_block_size; - size_t num_streams; - - std::shared_ptr iterator_wrapper; - - void createIterator(const ActionsDAG::Node * predicate); -}; - -void ReadFromHDFS::applyFilters(ActionDAGNodes added_filter_nodes) -{ - filter_actions_dag = ActionsDAG::buildFilterActionsDAG(added_filter_nodes.nodes); - const ActionsDAG::Node * predicate = nullptr; - if (filter_actions_dag) - predicate = filter_actions_dag->getOutputs().at(0); - - createIterator(predicate); -} - -void StorageHDFS::read( - QueryPlan & query_plan, - const Names & column_names, - const StorageSnapshotPtr & storage_snapshot, - SelectQueryInfo & query_info, - ContextPtr context_, - QueryProcessingStage::Enum /*processed_stage*/, - size_t max_block_size, - size_t num_streams) -{ - auto read_from_format_info = prepareReadingFromFormat(column_names, storage_snapshot, supportsSubsetOfColumns(context_), virtual_columns); - bool need_only_count = (query_info.optimize_trivial_count || read_from_format_info.requested_columns.empty()) - && context_->getSettingsRef().optimize_count_from_files; - - auto this_ptr = std::static_pointer_cast(shared_from_this()); - - auto reading = std::make_unique( - column_names, - query_info, - 
storage_snapshot, - context_, - read_from_format_info.source_header, - std::move(read_from_format_info), - need_only_count, - std::move(this_ptr), - max_block_size, - num_streams); - - query_plan.addStep(std::move(reading)); -} - -void ReadFromHDFS::createIterator(const ActionsDAG::Node * predicate) -{ - if (iterator_wrapper) - return; - - if (storage->distributed_processing) - { - iterator_wrapper = std::make_shared( - [callback = context->getReadTaskCallback()]() -> StorageHDFS::PathWithInfo { - return StorageHDFS::PathWithInfo{callback(), std::nullopt}; - }); - } - else if (storage->is_path_with_globs) - { - /// Iterate through disclosed globs and make a source for each file - auto glob_iterator = std::make_shared(storage->uris[0], predicate, storage->virtual_columns, context); - iterator_wrapper = std::make_shared([glob_iterator]() - { - return glob_iterator->next(); - }); - } - else - { - auto uris_iterator = std::make_shared(storage->uris, predicate, storage->virtual_columns, context); - iterator_wrapper = std::make_shared([uris_iterator]() - { - return uris_iterator->next(); - }); - } -} - -void ReadFromHDFS::initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &) -{ - createIterator(nullptr); - - Pipes pipes; - for (size_t i = 0; i < num_streams; ++i) - { - pipes.emplace_back(std::make_shared( - info, - storage, - context, - max_block_size, - iterator_wrapper, - need_only_count)); - } - - auto pipe = Pipe::unitePipes(std::move(pipes)); - if (pipe.empty()) - pipe = Pipe(std::make_shared(info.source_header)); - - for (const auto & processor : pipe.getProcessors()) - processors.emplace_back(processor); - - pipeline.init(std::move(pipe)); -} - -SinkToStoragePtr StorageHDFS::write(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr context_, bool /*async_insert*/) -{ - String current_uri = uris.back(); - - bool has_wildcards = current_uri.find(PartitionedSink::PARTITION_ID_WILDCARD) != String::npos; - const auto * insert_query = dynamic_cast(query.get()); - auto partition_by_ast = insert_query ? (insert_query->partition_by ? insert_query->partition_by : partition_by) : nullptr; - bool is_partitioned_implementation = partition_by_ast && has_wildcards; - - if (is_partitioned_implementation) - { - return std::make_shared( - partition_by_ast, - current_uri, - format_name, - metadata_snapshot->getSampleBlock(), - context_, - chooseCompressionMethod(current_uri, compression_method)); - } - else - { - if (is_path_with_globs) - throw Exception(ErrorCodes::DATABASE_ACCESS_DENIED, "URI '{}' contains globs, so the table is in readonly mode", uris.back()); - - const auto [path_from_uri, uri_without_path] = getPathFromUriAndUriWithoutPath(current_uri); - - HDFSBuilderWrapper builder = createHDFSBuilder(uri_without_path + "/", context_->getGlobalContext()->getConfigRef()); - HDFSFSPtr fs = createHDFSFS(builder.get()); - - bool truncate_on_insert = context_->getSettingsRef().hdfs_truncate_on_insert; - if (!truncate_on_insert && !hdfsExists(fs.get(), path_from_uri.c_str())) - { - if (context_->getSettingsRef().hdfs_create_new_file_on_insert) - { - auto pos = uris[0].find_first_of('.', uris[0].find_last_of('/')); - size_t index = uris.size(); - String new_uri; - do - { - new_uri = uris[0].substr(0, pos) + "." + std::to_string(index) + (pos == std::string::npos ? 
"" : uris[0].substr(pos)); - ++index; - } - while (!hdfsExists(fs.get(), new_uri.c_str())); - uris.push_back(new_uri); - current_uri = new_uri; - } - else - throw Exception( - ErrorCodes::BAD_ARGUMENTS, - "File with path {} already exists. If you want to overwrite it, enable setting hdfs_truncate_on_insert, " - "if you want to create new file on each insert, enable setting hdfs_create_new_file_on_insert", - path_from_uri); - } - - return std::make_shared(current_uri, - format_name, - metadata_snapshot->getSampleBlock(), - context_, - chooseCompressionMethod(current_uri, compression_method)); - } -} - -void StorageHDFS::truncate(const ASTPtr & /* query */, const StorageMetadataPtr &, ContextPtr local_context, TableExclusiveLockHolder &) -{ - const size_t begin_of_path = uris[0].find('/', uris[0].find("//") + 2); - const String url = uris[0].substr(0, begin_of_path); - - HDFSBuilderWrapper builder = createHDFSBuilder(url + "/", local_context->getGlobalContext()->getConfigRef()); - auto fs = createHDFSFS(builder.get()); - - for (const auto & uri : uris) - { - const String path = uri.substr(begin_of_path); - int ret = hdfsDelete(fs.get(), path.data(), 0); - if (ret) - throw Exception(ErrorCodes::ACCESS_DENIED, "Unable to truncate hdfs table: {}", std::string(hdfsGetLastError())); - } -} - - -void registerStorageHDFS(StorageFactory & factory) -{ - factory.registerStorage("HDFS", [](const StorageFactory::Arguments & args) - { - ASTs & engine_args = args.engine_args; - - if (engine_args.empty() || engine_args.size() > 3) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, - "Storage HDFS requires 1, 2 or 3 arguments: " - "url, name of used format (taken from file extension by default) and optional compression method."); - - engine_args[0] = evaluateConstantExpressionOrIdentifierAsLiteral(engine_args[0], args.getLocalContext()); - - String url = checkAndGetLiteralArgument(engine_args[0], "url"); - - String format_name = "auto"; - if (engine_args.size() > 1) - { - engine_args[1] = evaluateConstantExpressionOrIdentifierAsLiteral(engine_args[1], args.getLocalContext()); - format_name = checkAndGetLiteralArgument(engine_args[1], "format_name"); - } - - if (format_name == "auto") - format_name = FormatFactory::instance().tryGetFormatFromFileName(url).value_or("auto"); - - String compression_method; - if (engine_args.size() == 3) - { - engine_args[2] = evaluateConstantExpressionOrIdentifierAsLiteral(engine_args[2], args.getLocalContext()); - compression_method = checkAndGetLiteralArgument(engine_args[2], "compression_method"); - } else compression_method = "auto"; - - ASTPtr partition_by; - if (args.storage_def->partition_by) - partition_by = args.storage_def->partition_by->clone(); - - return std::make_shared( - url, args.table_id, format_name, args.columns, args.constraints, args.comment, args.getContext(), compression_method, false, partition_by); - }, - { - .supports_sort_order = true, // for partition by - .supports_schema_inference = true, - .source_access_type = AccessType::HDFS, - }); -} - -NamesAndTypesList StorageHDFS::getVirtuals() const -{ - return virtual_columns; -} - -Names StorageHDFS::getVirtualColumnNames() -{ - return VirtualColumnUtils::getPathFileAndSizeVirtualsForStorage({}).getNames(); -} - -SchemaCache & StorageHDFS::getSchemaCache(const ContextPtr & ctx) -{ - static SchemaCache schema_cache(ctx->getConfigRef().getUInt("schema_inference_cache_max_elements_for_hdfs", DEFAULT_SCHEMA_CACHE_ELEMENTS)); - return schema_cache; -} - -} - -#endif diff --git 
a/src/Storages/Hive/HiveCommon.h b/src/Storages/Hive/HiveCommon.h index 0f9d3364ffd..81c167165d3 100644 --- a/src/Storages/Hive/HiveCommon.h +++ b/src/Storages/Hive/HiveCommon.h @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include diff --git a/src/Storages/Hive/HiveFile.h b/src/Storages/Hive/HiveFile.h index 1f5e31f1d54..affb72fe09b 100644 --- a/src/Storages/Hive/HiveFile.h +++ b/src/Storages/Hive/HiveFile.h @@ -14,7 +14,7 @@ #include #include #include -#include +#include namespace orc { diff --git a/src/Storages/Hive/StorageHive.cpp b/src/Storages/Hive/StorageHive.cpp index 183a4532281..a76cef2d45d 100644 --- a/src/Storages/Hive/StorageHive.cpp +++ b/src/Storages/Hive/StorageHive.cpp @@ -38,8 +38,8 @@ #include #include #include -#include -#include +#include +#include #include #include #include diff --git a/src/Storages/Hive/StorageHive.h b/src/Storages/Hive/StorageHive.h index 07440097f7a..43a22a886a8 100644 --- a/src/Storages/Hive/StorageHive.h +++ b/src/Storages/Hive/StorageHive.h @@ -9,7 +9,7 @@ #include #include -#include +#include #include #include diff --git a/src/Storages/HDFS/AsynchronousReadBufferFromHDFS.cpp b/src/Storages/ObjectStorage/HDFS/AsynchronousReadBufferFromHDFS.cpp similarity index 99% rename from src/Storages/HDFS/AsynchronousReadBufferFromHDFS.cpp rename to src/Storages/ObjectStorage/HDFS/AsynchronousReadBufferFromHDFS.cpp index 6b6151f5474..21df7e35284 100644 --- a/src/Storages/HDFS/AsynchronousReadBufferFromHDFS.cpp +++ b/src/Storages/ObjectStorage/HDFS/AsynchronousReadBufferFromHDFS.cpp @@ -1,9 +1,9 @@ #include "AsynchronousReadBufferFromHDFS.h" #if USE_HDFS +#include "ReadBufferFromHDFS.h" #include #include -#include #include #include diff --git a/src/Storages/HDFS/AsynchronousReadBufferFromHDFS.h b/src/Storages/ObjectStorage/HDFS/AsynchronousReadBufferFromHDFS.h similarity index 96% rename from src/Storages/HDFS/AsynchronousReadBufferFromHDFS.h rename to src/Storages/ObjectStorage/HDFS/AsynchronousReadBufferFromHDFS.h index 10e2749fd4a..5aef92315a4 100644 --- a/src/Storages/HDFS/AsynchronousReadBufferFromHDFS.h +++ b/src/Storages/ObjectStorage/HDFS/AsynchronousReadBufferFromHDFS.h @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include namespace DB diff --git a/src/Storages/ObjectStorage/HDFS/Configuration.cpp b/src/Storages/ObjectStorage/HDFS/Configuration.cpp index a64faafd53d..6c7fe1cef7e 100644 --- a/src/Storages/ObjectStorage/HDFS/Configuration.cpp +++ b/src/Storages/ObjectStorage/HDFS/Configuration.cpp @@ -1,7 +1,7 @@ #include #if USE_HDFS -#include +#include #include #include #include diff --git a/src/Storages/HDFS/HDFSCommon.cpp b/src/Storages/ObjectStorage/HDFS/HDFSCommon.cpp similarity index 99% rename from src/Storages/HDFS/HDFSCommon.cpp rename to src/Storages/ObjectStorage/HDFS/HDFSCommon.cpp index f9a55a1285a..5d14cec14bd 100644 --- a/src/Storages/HDFS/HDFSCommon.cpp +++ b/src/Storages/ObjectStorage/HDFS/HDFSCommon.cpp @@ -1,4 +1,4 @@ -#include +#include "HDFSCommon.h" #include #include #include diff --git a/src/Storages/HDFS/HDFSCommon.h b/src/Storages/ObjectStorage/HDFS/HDFSCommon.h similarity index 100% rename from src/Storages/HDFS/HDFSCommon.h rename to src/Storages/ObjectStorage/HDFS/HDFSCommon.h diff --git a/src/Storages/HDFS/ReadBufferFromHDFS.cpp b/src/Storages/ObjectStorage/HDFS/ReadBufferFromHDFS.cpp similarity index 99% rename from src/Storages/HDFS/ReadBufferFromHDFS.cpp rename to src/Storages/ObjectStorage/HDFS/ReadBufferFromHDFS.cpp index 4df05d47003..18b22805dfc 100644 --- 
a/src/Storages/HDFS/ReadBufferFromHDFS.cpp +++ b/src/Storages/ObjectStorage/HDFS/ReadBufferFromHDFS.cpp @@ -1,7 +1,7 @@ #include "ReadBufferFromHDFS.h" #if USE_HDFS -#include +#include "HDFSCommon.h" #include #include #include diff --git a/src/Storages/HDFS/ReadBufferFromHDFS.h b/src/Storages/ObjectStorage/HDFS/ReadBufferFromHDFS.h similarity index 100% rename from src/Storages/HDFS/ReadBufferFromHDFS.h rename to src/Storages/ObjectStorage/HDFS/ReadBufferFromHDFS.h diff --git a/src/Storages/HDFS/WriteBufferFromHDFS.cpp b/src/Storages/ObjectStorage/HDFS/WriteBufferFromHDFS.cpp similarity index 97% rename from src/Storages/HDFS/WriteBufferFromHDFS.cpp rename to src/Storages/ObjectStorage/HDFS/WriteBufferFromHDFS.cpp index 9d383aa8245..2c14b38ce01 100644 --- a/src/Storages/HDFS/WriteBufferFromHDFS.cpp +++ b/src/Storages/ObjectStorage/HDFS/WriteBufferFromHDFS.cpp @@ -2,8 +2,8 @@ #if USE_HDFS -#include -#include +#include "WriteBufferFromHDFS.h" +#include "HDFSCommon.h" #include #include #include diff --git a/src/Storages/HDFS/WriteBufferFromHDFS.h b/src/Storages/ObjectStorage/HDFS/WriteBufferFromHDFS.h similarity index 100% rename from src/Storages/HDFS/WriteBufferFromHDFS.h rename to src/Storages/ObjectStorage/HDFS/WriteBufferFromHDFS.h diff --git a/src/Storages/examples/async_read_buffer_from_hdfs.cpp b/src/Storages/examples/async_read_buffer_from_hdfs.cpp index 4f6aed8ef65..1c47a07ba58 100644 --- a/src/Storages/examples/async_read_buffer_from_hdfs.cpp +++ b/src/Storages/examples/async_read_buffer_from_hdfs.cpp @@ -9,7 +9,7 @@ #include #include #include -#include +#include int main() { From cfb73dd30781c95261a02dfb3443f6a18273612b Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Mon, 18 Mar 2024 13:54:23 +0100 Subject: [PATCH 050/392] Move input_format_tsv_crlf_end_of_line to 24.3 settings changes --- src/Core/SettingsChangesHistory.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index b14953fd706..5ce98a92003 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -86,6 +86,7 @@ namespace SettingsChangesHistory static std::map settings_changes_history = { {"24.3", {{"s3_connect_timeout_ms", 1000, 1000, "Introduce new dedicated setting for s3 connection timeout"}, + {"input_format_tsv_crlf_end_of_line", false, false, "Enables reading of CRLF line endings with TSV formats"}, {"allow_experimental_shared_merge_tree", false, true, "The setting is obsolete"}, {"use_page_cache_for_disks_without_file_cache", false, false, "Added userspace page cache"}, {"read_from_page_cache_if_exists_otherwise_bypass_cache", false, false, "Added userspace page cache"}, @@ -109,7 +110,6 @@ static std::map sett {"async_insert_busy_timeout_max_ms", 200, 200, "The minimum value of the asynchronous insert timeout in milliseconds; async_insert_busy_timeout_ms is aliased to async_insert_busy_timeout_max_ms"}, {"async_insert_busy_timeout_increase_rate", 0.2, 0.2, "The exponential growth rate at which the adaptive asynchronous insert timeout increases"}, {"async_insert_busy_timeout_decrease_rate", 0.2, 0.2, "The exponential growth rate at which the adaptive asynchronous insert timeout decreases"}, - {"input_format_tsv_crlf_end_of_line", false, false, "Enables reading of CRLF line endings with TSV formats"}, {"format_template_row_format", "", "", "Template row format string can be set directly in query"}, {"format_template_resultset_format", "", "", 
"Template result set format string can be set in query"}, {"split_parts_ranges_into_intersecting_and_non_intersecting_final", true, true, "Allow to split parts ranges into intersecting and non intersecting during FINAL optimization"}, From e019b3a391bb8e3bbfa991e083e65e76438a2a9e Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 25 Mar 2024 16:12:39 +0100 Subject: [PATCH 051/392] Fix build after merge --- src/Backups/BackupIO_AzureBlobStorage.cpp | 2 +- .../IO/WriteBufferFromAzureBlobStorage.h | 2 +- src/Disks/ObjectStorages/S3/diskSettings.cpp | 12 +-- .../ObjectStorage/AzureBlob/Configuration.cpp | 73 +++++++++++-------- .../ObjectStorage/AzureBlob/Configuration.h | 2 +- .../DataLakes/DeltaLakeMetadata.cpp | 1 + .../ObjectStorage/HDFS/Configuration.cpp | 8 +- .../ObjectStorage/S3/Configuration.cpp | 2 + .../StorageObjectStorageConfiguration.cpp | 10 +++ .../StorageObjectStorageConfiguration.h | 4 + src/Storages/S3Queue/S3QueueTableMetadata.cpp | 1 - src/Storages/S3Queue/S3QueueTableMetadata.h | 1 + 12 files changed, 77 insertions(+), 41 deletions(-) diff --git a/src/Backups/BackupIO_AzureBlobStorage.cpp b/src/Backups/BackupIO_AzureBlobStorage.cpp index dc4a825189f..8a3ff1c3b5e 100644 --- a/src/Backups/BackupIO_AzureBlobStorage.cpp +++ b/src/Backups/BackupIO_AzureBlobStorage.cpp @@ -36,7 +36,7 @@ BackupReaderAzureBlobStorage::BackupReaderAzureBlobStorage( , data_source_description{DataSourceType::ObjectStorage, ObjectStorageType::Azure, MetadataStorageType::None, configuration_.container, false, false} , configuration(configuration_) { - auto client_ptr = configuration.createClient(/* is_read_only */ false); + auto client_ptr = configuration.createClient(/* is_read_only */ false, /* attempt_to_create_container */true); object_storage = std::make_unique("BackupReaderAzureBlobStorage", std::move(client_ptr), configuration.createSettings(context_), diff --git a/src/Disks/IO/WriteBufferFromAzureBlobStorage.h b/src/Disks/IO/WriteBufferFromAzureBlobStorage.h index 6e10c07b255..dbf0b2a3052 100644 --- a/src/Disks/IO/WriteBufferFromAzureBlobStorage.h +++ b/src/Disks/IO/WriteBufferFromAzureBlobStorage.h @@ -13,7 +13,7 @@ #include #include #include -#include +#include namespace Poco { diff --git a/src/Disks/ObjectStorages/S3/diskSettings.cpp b/src/Disks/ObjectStorages/S3/diskSettings.cpp index 13d4c2a551b..872f7eec07b 100644 --- a/src/Disks/ObjectStorages/S3/diskSettings.cpp +++ b/src/Disks/ObjectStorages/S3/diskSettings.cpp @@ -69,10 +69,6 @@ std::unique_ptr getClient( { String endpoint = context->getMacros()->expand(config.getString(config_prefix + ".endpoint")); - if (S3::isS3ExpressEndpoint(endpoint) && !config.has(config_prefix + ".region")) - throw Exception( - ErrorCodes::NO_ELEMENTS_IN_CONFIG, "Region should be explicitly specified for directory buckets ({})", config_prefix); - url = S3::URI(endpoint); if (!url.key.ends_with('/')) url.key.push_back('/'); @@ -83,6 +79,12 @@ std::unique_ptr getClient( throw Exception(ErrorCodes::LOGICAL_ERROR, "URL not passed"); url = *url_; } + const bool is_s3_express_bucket = S3::isS3ExpressEndpoint(url.endpoint); + if (is_s3_express_bucket && !config.has(config_prefix + ".region")) + { + throw Exception( + ErrorCodes::NO_ELEMENTS_IN_CONFIG, "Region should be explicitly specified for directory buckets ({})", config_prefix); + } S3::PocoHTTPClientConfiguration client_configuration = S3::ClientFactory::instance().createClientConfiguration( auth_settings.region, @@ -130,7 +132,7 @@ std::unique_ptr getClient( .use_virtual_addressing = 
url.is_virtual_hosted_style, .disable_checksum = local_settings.s3_disable_checksum, .gcs_issue_compose_request = config.getBool("s3.gcs_issue_compose_request", false), - .is_s3express_bucket = S3::isS3ExpressEndpoint(endpoint), + .is_s3express_bucket = is_s3_express_bucket, }; auto credentials_configuration = S3::CredentialsConfiguration diff --git a/src/Storages/ObjectStorage/AzureBlob/Configuration.cpp b/src/Storages/ObjectStorage/AzureBlob/Configuration.cpp index 7a670441e72..018cec51e7c 100644 --- a/src/Storages/ObjectStorage/AzureBlob/Configuration.cpp +++ b/src/Storages/ObjectStorage/AzureBlob/Configuration.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -47,7 +48,8 @@ namespace return !candidate.starts_with("http"); } - bool containerExists(Azure::Storage::Blobs::BlobServiceClient & blob_service_client, std::string container_name) + template + bool containerExists(T & blob_service_client, const std::string & container_name) { Azure::Storage::Blobs::ListBlobContainersOptions options; options.Prefix = container_name; @@ -101,12 +103,13 @@ AzureObjectStorage::SettingsPtr StorageAzureBlobConfiguration::createSettings(Co ObjectStoragePtr StorageAzureBlobConfiguration::createObjectStorage(ContextPtr context, bool is_readonly) /// NOLINT { - auto client = createClient(is_readonly); + assertInitialized(); + auto client = createClient(is_readonly, /* attempt_to_create_container */true); auto settings = createSettings(context); return std::make_unique("AzureBlobStorage", std::move(client), std::move(settings), container); } -AzureClientPtr StorageAzureBlobConfiguration::createClient(bool is_read_only) +AzureClientPtr StorageAzureBlobConfiguration::createClient(bool is_read_only, bool attempt_to_create_container) { using namespace Azure::Storage::Blobs; @@ -114,28 +117,32 @@ AzureClientPtr StorageAzureBlobConfiguration::createClient(bool is_read_only) if (is_connection_string) { - auto blob_service_client = std::make_unique(BlobServiceClient::CreateFromConnectionString(connection_url)); + std::shared_ptr managed_identity_credential = std::make_shared(); + std::unique_ptr blob_service_client = std::make_unique(BlobServiceClient::CreateFromConnectionString(connection_url)); result = std::make_unique(BlobContainerClient::CreateFromConnectionString(connection_url, container)); - bool container_exists = containerExists(*blob_service_client, container); - if (!container_exists) + if (attempt_to_create_container) { - if (is_read_only) - throw Exception( - ErrorCodes::BAD_ARGUMENTS, - "AzureBlobStorage container does not exist '{}'", - container); + bool container_exists = containerExists(*blob_service_client, container); + if (!container_exists) + { + if (is_read_only) + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "AzureBlobStorage container does not exist '{}'", + container); - try - { - result->CreateIfNotExists(); - } - catch (const Azure::Storage::StorageException & e) - { - if (e.StatusCode != Azure::Core::Http::HttpStatusCode::Conflict - || e.ReasonPhrase != "The specified container already exists.") + try { - throw; + result->CreateIfNotExists(); + } + catch (const Azure::Storage::StorageException & e) + { + if (!(e.StatusCode == Azure::Core::Http::HttpStatusCode::Conflict + && e.ReasonPhrase == "The specified container already exists.")) + { + throw; + } } } } @@ -145,22 +152,22 @@ AzureClientPtr StorageAzureBlobConfiguration::createClient(bool is_read_only) std::shared_ptr storage_shared_key_credential; if (account_name.has_value() && 
account_key.has_value()) { - storage_shared_key_credential = - std::make_shared(*account_name, *account_key); + storage_shared_key_credential + = std::make_shared(*account_name, *account_key); } std::unique_ptr blob_service_client; + std::shared_ptr managed_identity_credential; if (storage_shared_key_credential) { blob_service_client = std::make_unique(connection_url, storage_shared_key_credential); } else { - blob_service_client = std::make_unique(connection_url); + managed_identity_credential = std::make_shared(); + blob_service_client = std::make_unique(connection_url, managed_identity_credential); } - bool container_exists = containerExists(*blob_service_client, container); - std::string final_url; size_t pos = connection_url.find('?'); if (pos != std::string::npos) @@ -173,12 +180,21 @@ AzureClientPtr StorageAzureBlobConfiguration::createClient(bool is_read_only) final_url = connection_url + (connection_url.back() == '/' ? "" : "/") + container; + if (!attempt_to_create_container) + { + if (storage_shared_key_credential) + return std::make_unique(final_url, storage_shared_key_credential); + else + return std::make_unique(final_url, managed_identity_credential); + } + + bool container_exists = containerExists(*blob_service_client, container); if (container_exists) { if (storage_shared_key_credential) result = std::make_unique(final_url, storage_shared_key_credential); else - result = std::make_unique(final_url); + result = std::make_unique(final_url, managed_identity_credential); } else { @@ -190,8 +206,7 @@ AzureClientPtr StorageAzureBlobConfiguration::createClient(bool is_read_only) try { result = std::make_unique(blob_service_client->CreateBlobContainer(container).Value); - } - catch (const Azure::Storage::StorageException & e) + } catch (const Azure::Storage::StorageException & e) { if (e.StatusCode == Azure::Core::Http::HttpStatusCode::Conflict && e.ReasonPhrase == "The specified container already exists.") @@ -199,7 +214,7 @@ AzureClientPtr StorageAzureBlobConfiguration::createClient(bool is_read_only) if (storage_shared_key_credential) result = std::make_unique(final_url, storage_shared_key_credential); else - result = std::make_unique(final_url); + result = std::make_unique(final_url, managed_identity_credential); } else { diff --git a/src/Storages/ObjectStorage/AzureBlob/Configuration.h b/src/Storages/ObjectStorage/AzureBlob/Configuration.h index 3d701e72cb4..8040d433d99 100644 --- a/src/Storages/ObjectStorage/AzureBlob/Configuration.h +++ b/src/Storages/ObjectStorage/AzureBlob/Configuration.h @@ -52,7 +52,7 @@ protected: std::string blob_path; std::vector blobs_paths; - AzureClientPtr createClient(bool is_read_only); + AzureClientPtr createClient(bool is_read_only, bool attempt_to_create_container); AzureObjectStorage::SettingsPtr createSettings(ContextPtr local_context); }; diff --git a/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.cpp index 903558b73ab..1caa2c000d6 100644 --- a/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.cpp +++ b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/src/Storages/ObjectStorage/HDFS/Configuration.cpp b/src/Storages/ObjectStorage/HDFS/Configuration.cpp index 848fdb292e8..03a0a1a5e69 100644 --- a/src/Storages/ObjectStorage/HDFS/Configuration.cpp +++ b/src/Storages/ObjectStorage/HDFS/Configuration.cpp @@ -29,12 +29,14 @@ void 
StorageHDFSConfiguration::check(ContextPtr context) const checkHDFSURL(fs::path(url) / path); } -ObjectStoragePtr StorageHDFSConfiguration::createObjectStorage(ContextPtr context, bool is_readonly) /// NOLINT +ObjectStoragePtr StorageHDFSConfiguration::createObjectStorage(ContextPtr context, bool /* is_readonly */) /// NOLINT { - UNUSED(is_readonly); - auto settings = std::make_unique(); + assertInitialized(); + if (!url.empty()) throw Exception(ErrorCodes::BAD_ARGUMENTS, "HDFS url is empty"); + + auto settings = std::make_unique(); return std::make_shared(url, std::move(settings), context->getConfigRef()); } diff --git a/src/Storages/ObjectStorage/S3/Configuration.cpp b/src/Storages/ObjectStorage/S3/Configuration.cpp index 0c05f77541b..4e6d8980aa7 100644 --- a/src/Storages/ObjectStorage/S3/Configuration.cpp +++ b/src/Storages/ObjectStorage/S3/Configuration.cpp @@ -66,6 +66,8 @@ StorageS3Configuration::StorageS3Configuration(const StorageS3Configuration & ot ObjectStoragePtr StorageS3Configuration::createObjectStorage(ContextPtr context, bool /* is_readonly */) /// NOLINT { + assertInitialized(); + const auto & config = context->getConfigRef(); const std::string config_prefix = "s3."; diff --git a/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.cpp b/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.cpp index 8a4dee2c31b..6172f8934af 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.cpp @@ -18,6 +18,8 @@ void StorageObjectStorageConfiguration::initialize( // FIXME: it should be - if (format == "auto" && get_format_from_file) if (configuration.format == "auto") configuration.format = FormatFactory::instance().tryGetFormatFromFileName(configuration.getPath()).value_or("auto"); + + configuration.initialized = true; } StorageObjectStorageConfiguration::StorageObjectStorageConfiguration(const StorageObjectStorageConfiguration & other) @@ -48,4 +50,12 @@ std::string StorageObjectStorageConfiguration::getPathWithoutGlob() const return getPath().substr(0, getPath().find_first_of("*?{")); } +void StorageObjectStorageConfiguration::assertInitialized() const +{ + if (!initialized) + { + throw Exception(ErrorCodes::LOGICAL_ERROR, "Configuration was not initialized before usage"); + } +} + } diff --git a/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.h b/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.h index 8134bd07806..66fe6a68d76 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.h @@ -54,6 +54,10 @@ public: protected: virtual void fromNamedCollection(const NamedCollection & collection) = 0; virtual void fromAST(ASTs & args, ContextPtr context, bool with_structure) = 0; + + void assertInitialized() const; + + bool initialized = false; }; using StorageObjectStorageConfigurationPtr = std::shared_ptr; diff --git a/src/Storages/S3Queue/S3QueueTableMetadata.cpp b/src/Storages/S3Queue/S3QueueTableMetadata.cpp index e1978259230..8354e6aa2ae 100644 --- a/src/Storages/S3Queue/S3QueueTableMetadata.cpp +++ b/src/Storages/S3Queue/S3QueueTableMetadata.cpp @@ -7,7 +7,6 @@ #include #include #include -#include namespace DB diff --git a/src/Storages/S3Queue/S3QueueTableMetadata.h b/src/Storages/S3Queue/S3QueueTableMetadata.h index a649f211abc..2158b189070 100644 --- a/src/Storages/S3Queue/S3QueueTableMetadata.h +++ b/src/Storages/S3Queue/S3QueueTableMetadata.h @@ -3,6 +3,7 @@ #if 
USE_AWS_S3 #include +#include #include #include From f5982fdb1ff30280dfebd89afb9274fca33c56b6 Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 25 Mar 2024 19:19:54 +0100 Subject: [PATCH 052/392] Fix some tests --- .../ObjectStorages/HDFS/HDFSObjectStorage.h | 16 ++----- .../ObjectStorages/ObjectStorageFactory.cpp | 3 +- src/Disks/ObjectStorages/S3/diskSettings.cpp | 2 - .../ObjectStorage/HDFS/Configuration.cpp | 48 +++++++++++++------ .../ObjectStorage/HDFS/Configuration.h | 6 +-- .../ObjectStorage/ReadBufferIterator.cpp | 4 +- .../ObjectStorage/S3/Configuration.cpp | 6 +++ src/Storages/ObjectStorage/S3/Configuration.h | 2 + .../ObjectStorage/StorageObjectStorage.cpp | 2 + .../ObjectStorage/StorageObjectStorage.h | 2 + .../StorageObjectStorageConfiguration.cpp | 4 ++ .../StorageObjectStorageConfiguration.h | 2 + .../StorageObjectStorageSink.cpp | 40 ++++++++++++++-- .../ObjectStorage/StorageObjectStorageSink.h | 3 ++ src/Storages/StorageS3Settings.cpp | 2 +- .../queries/0_stateless/02114_hdfs_bad_url.sh | 1 - .../0_stateless/02700_s3_part_INT_MAX.sh | 2 +- ...ed_url_and_url_with_special_characters.sql | 3 +- 18 files changed, 104 insertions(+), 44 deletions(-) diff --git a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h index 4072d21ed7c..f92e160fd4d 100644 --- a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h +++ b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h @@ -16,21 +16,13 @@ namespace DB struct HDFSObjectStorageSettings { - - HDFSObjectStorageSettings() = default; - - size_t min_bytes_for_seek; - int objects_chunk_size_to_delete; - int replication; - - HDFSObjectStorageSettings( - int min_bytes_for_seek_, - int objects_chunk_size_to_delete_, - int replication_) + HDFSObjectStorageSettings(int min_bytes_for_seek_, int replication_) : min_bytes_for_seek(min_bytes_for_seek_) - , objects_chunk_size_to_delete(objects_chunk_size_to_delete_) , replication(replication_) {} + + size_t min_bytes_for_seek; + int replication; }; diff --git a/src/Disks/ObjectStorages/ObjectStorageFactory.cpp b/src/Disks/ObjectStorages/ObjectStorageFactory.cpp index f30a552f8dd..67e38d6389a 100644 --- a/src/Disks/ObjectStorages/ObjectStorageFactory.cpp +++ b/src/Disks/ObjectStorages/ObjectStorageFactory.cpp @@ -227,9 +227,8 @@ void registerHDFSObjectStorage(ObjectStorageFactory & factory) if (uri.back() != '/') throw Exception(ErrorCodes::BAD_ARGUMENTS, "HDFS path must ends with '/', but '{}' doesn't.", uri); - std::unique_ptr settings = std::make_unique( + auto settings = std::make_unique( config.getUInt64(config_prefix + ".min_bytes_for_seek", 1024 * 1024), - config.getInt(config_prefix + ".objects_chunk_size_to_delete", 1000), context->getSettingsRef().hdfs_replication ); diff --git a/src/Disks/ObjectStorages/S3/diskSettings.cpp b/src/Disks/ObjectStorages/S3/diskSettings.cpp index 872f7eec07b..1aecb590526 100644 --- a/src/Disks/ObjectStorages/S3/diskSettings.cpp +++ b/src/Disks/ObjectStorages/S3/diskSettings.cpp @@ -143,8 +143,6 @@ std::unique_ptr getClient( auth_settings.no_sign_request.value_or(context->getConfigRef().getBool("s3.no_sign_request", false)), }; - LOG_TEST(&Poco::Logger::get("kssenii"), "KSSENII: {} - {}", auth_settings.access_key_id, auth_settings.secret_access_key); - return S3::ClientFactory::instance().create( client_configuration, client_settings, diff --git a/src/Storages/ObjectStorage/HDFS/Configuration.cpp b/src/Storages/ObjectStorage/HDFS/Configuration.cpp index 03a0a1a5e69..5edc660d717 100644 --- 
a/src/Storages/ObjectStorage/HDFS/Configuration.cpp +++ b/src/Storages/ObjectStorage/HDFS/Configuration.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include namespace DB @@ -13,6 +14,7 @@ namespace DB namespace ErrorCodes { extern const int BAD_ARGUMENTS; + extern const int NOT_IMPLEMENTED; } StorageHDFSConfiguration::StorageHDFSConfiguration(const StorageHDFSConfiguration & other) @@ -29,37 +31,53 @@ void StorageHDFSConfiguration::check(ContextPtr context) const checkHDFSURL(fs::path(url) / path); } -ObjectStoragePtr StorageHDFSConfiguration::createObjectStorage(ContextPtr context, bool /* is_readonly */) /// NOLINT +ObjectStoragePtr StorageHDFSConfiguration::createObjectStorage( /// NOLINT + ContextPtr context, + bool /* is_readonly */) { assertInitialized(); - - if (!url.empty()) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "HDFS url is empty"); - - auto settings = std::make_unique(); - return std::make_shared(url, std::move(settings), context->getConfigRef()); + const auto & settings = context->getSettingsRef(); + auto hdfs_settings = std::make_unique( + settings.remote_read_min_bytes_for_seek, + settings.hdfs_replication + ); + return std::make_shared(url, std::move(hdfs_settings), context->getConfigRef()); } -void StorageHDFSConfiguration::fromAST(ASTs & args, ContextPtr, bool /* with_structure */) +void StorageHDFSConfiguration::fromAST(ASTs & args, ContextPtr context, bool /* with_structure */) { url = checkAndGetLiteralArgument(args[0], "url"); if (args.size() > 1) + { + args[1] = evaluateConstantExpressionOrIdentifierAsLiteral(args[1], context); format = checkAndGetLiteralArgument(args[1], "format_name"); - else - format = "auto"; + } if (args.size() == 3) + { + args[2] = evaluateConstantExpressionOrIdentifierAsLiteral(args[2], context); compression_method = checkAndGetLiteralArgument(args[2], "compression_method"); - else - compression_method = "auto"; + } - const size_t begin_of_path = url.find('/', url.find("//") + 2); - path = url.substr(begin_of_path + 1); - url = url.substr(0, begin_of_path); + auto pos = url.find("//"); + if (pos == std::string::npos) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Invalid url: {}", url); + + pos = url.find('/', pos + 2); + if (pos == std::string::npos) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Invalid url: {}", url); + + path = url.substr(pos + 1); + url = url.substr(0, pos); paths = {path}; } +void StorageHDFSConfiguration::fromNamedCollection(const NamedCollection &) +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method fromNamedColection() is not implemented"); +} + } #endif diff --git a/src/Storages/ObjectStorage/HDFS/Configuration.h b/src/Storages/ObjectStorage/HDFS/Configuration.h index 1013c2e00c2..5765edbf36c 100644 --- a/src/Storages/ObjectStorage/HDFS/Configuration.h +++ b/src/Storages/ObjectStorage/HDFS/Configuration.h @@ -29,12 +29,12 @@ public: ObjectStoragePtr createObjectStorage(ContextPtr context, bool is_readonly = true) override; /// NOLINT StorageObjectStorageConfigurationPtr clone() override { return std::make_shared(*this); } - void fromNamedCollection(const NamedCollection &) override {} - void fromAST(ASTs & args, ContextPtr, bool /* with_structure */) override; - static void addStructureToArgs(ASTs &, const String &, ContextPtr) {} private: + void fromNamedCollection(const NamedCollection &) override; + void fromAST(ASTs & args, ContextPtr, bool /* with_structure */) override; + String url; String path; std::vector paths; diff --git a/src/Storages/ObjectStorage/ReadBufferIterator.cpp 
b/src/Storages/ObjectStorage/ReadBufferIterator.cpp index dd4bfe79b06..0b6e34fb831 100644 --- a/src/Storages/ObjectStorage/ReadBufferIterator.cpp +++ b/src/Storages/ObjectStorage/ReadBufferIterator.cpp @@ -31,7 +31,7 @@ ReadBufferIterator::ReadBufferIterator( , query_settings(query_settings_) , schema_cache(schema_cache_) , read_keys(read_keys_) - , format(configuration->format.empty() || configuration->format == "auto" ? std::nullopt : std::optional(configuration->format)) + , format(configuration->format == "auto" ? std::nullopt : std::optional(configuration->format)) , prev_read_keys_size(read_keys_.size()) { } @@ -191,7 +191,7 @@ ReadBufferIterator::Data ReadBufferIterator::next() { if (first) { - if (format) + if (format.has_value()) throw Exception( ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "The table structure cannot be extracted from a {} format file, because there are no files with provided path " diff --git a/src/Storages/ObjectStorage/S3/Configuration.cpp b/src/Storages/ObjectStorage/S3/Configuration.cpp index 4e6d8980aa7..132a5045d8a 100644 --- a/src/Storages/ObjectStorage/S3/Configuration.cpp +++ b/src/Storages/ObjectStorage/S3/Configuration.cpp @@ -51,10 +51,16 @@ String StorageS3Configuration::getDataSourceDescription() void StorageS3Configuration::check(ContextPtr context) const { + validateNamespace(url.bucket); context->getGlobalContext()->getRemoteHostFilter().checkURL(url.uri); context->getGlobalContext()->getHTTPHeaderFilter().checkHeaders(headers_from_ast); } +void StorageS3Configuration::validateNamespace(const String & name) const +{ + S3::URI::validateBucket(name, {}); +} + StorageS3Configuration::StorageS3Configuration(const StorageS3Configuration & other) : StorageObjectStorageConfiguration(other) { diff --git a/src/Storages/ObjectStorage/S3/Configuration.h b/src/Storages/ObjectStorage/S3/Configuration.h index 88a084f29b3..f9614da4b95 100644 --- a/src/Storages/ObjectStorage/S3/Configuration.h +++ b/src/Storages/ObjectStorage/S3/Configuration.h @@ -27,6 +27,8 @@ public: String getDataSourceDescription() override; void check(ContextPtr context) const override; + void validateNamespace(const String & name) const override; + StorageObjectStorageConfigurationPtr clone() override { return std::make_shared(*this); } bool isStaticConfiguration() const override { return static_configuration; } diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index f1d3635514f..3a894af3e01 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -87,6 +87,7 @@ StorageObjectStorage::StorageObjectStorage( , format_settings(format_settings_) , partition_by(partition_by_) , distributed_processing(distributed_processing_) + , log(getLogger("Storage" + engine_name_)) , object_storage(object_storage_) , configuration(configuration_) { @@ -204,6 +205,7 @@ SinkToStoragePtr StorageObjectStorage::write( if (partition_by_ast) { + LOG_TEST(log, "Using PartitionedSink for {}", configuration->getPath()); return std::make_shared( object_storage, configuration, format_settings, sample_block, local_context, partition_by_ast); } diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.h b/src/Storages/ObjectStorage/StorageObjectStorage.h index 743b725a88a..ebaf504f532 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.h +++ b/src/Storages/ObjectStorage/StorageObjectStorage.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ 
-113,6 +114,7 @@ protected: const ASTPtr partition_by; const bool distributed_processing; + LoggerPtr log; ObjectStoragePtr object_storage; ConfigurationPtr configuration; std::mutex configuration_update_mutex; diff --git a/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.cpp b/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.cpp index 6172f8934af..9a8b8191907 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.cpp @@ -1,5 +1,6 @@ #include #include +#include namespace DB { @@ -18,7 +19,10 @@ void StorageObjectStorageConfiguration::initialize( // FIXME: it should be - if (format == "auto" && get_format_from_file) if (configuration.format == "auto") configuration.format = FormatFactory::instance().tryGetFormatFromFileName(configuration.getPath()).value_or("auto"); + else + FormatFactory::instance().checkFormatName(configuration.format); + configuration.check(local_context); configuration.initialized = true; } diff --git a/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.h b/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.h index 66fe6a68d76..0beed91b128 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.h @@ -43,6 +43,8 @@ public: std::string getPathWithoutGlob() const; virtual void check(ContextPtr context) const = 0; + virtual void validateNamespace(const String & /* name */) const {} + virtual ObjectStoragePtr createObjectStorage(ContextPtr context, bool is_readonly = true) = 0; /// NOLINT virtual StorageObjectStorageConfigurationPtr clone() = 0; virtual bool isStaticConfiguration() const { return true; } diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp index 37f93a2b82f..2dd8516ebe8 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp @@ -1,9 +1,14 @@ #include "StorageObjectStorageSink.h" #include #include +#include namespace DB { +namespace ErrorCodes +{ + extern const int CANNOT_PARSE_TEXT; +} StorageObjectStorageSink::StorageObjectStorageSink( ObjectStoragePtr object_storage, @@ -93,6 +98,7 @@ void StorageObjectStorageSink::release() write_buf->finalize(); } + PartitionedStorageObjectStorageSink::PartitionedStorageObjectStorageSink( ObjectStoragePtr object_storage_, StorageObjectStorageConfigurationPtr configuration_, @@ -111,9 +117,12 @@ PartitionedStorageObjectStorageSink::PartitionedStorageObjectStorageSink( SinkPtr PartitionedStorageObjectStorageSink::createSinkForPartition(const String & partition_id) { - auto blob = configuration->getPaths().back(); - auto partition_key = replaceWildcards(blob, partition_id); - validatePartitionKey(partition_key, true); + auto partition_bucket = replaceWildcards(configuration->getNamespace(), partition_id); + validateNamespace(partition_bucket); + + auto partition_key = replaceWildcards(configuration->getPath(), partition_id); + validateKey(partition_key); + return std::make_shared( object_storage, configuration, @@ -124,4 +133,29 @@ SinkPtr PartitionedStorageObjectStorageSink::createSinkForPartition(const String ); } +void PartitionedStorageObjectStorageSink::validateKey(const String & str) +{ + /// See: + /// - https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-keys.html + /// - https://cloud.ibm.com/apidocs/cos/cos-compatibility#putobject + + if 
(str.empty() || str.size() > 1024) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Incorrect key length (not empty, max 1023 characters), got: {}", str.size()); + + if (!UTF8::isValidUTF8(reinterpret_cast(str.data()), str.size())) + throw Exception(ErrorCodes::CANNOT_PARSE_TEXT, "Incorrect non-UTF8 sequence in key"); + + validatePartitionKey(str, true); +} + +void PartitionedStorageObjectStorageSink::validateNamespace(const String & str) +{ + configuration->validateNamespace(str); + + if (!UTF8::isValidUTF8(reinterpret_cast(str.data()), str.size())) + throw Exception(ErrorCodes::CANNOT_PARSE_TEXT, "Incorrect non-UTF8 sequence in bucket name"); + + validatePartitionKey(str, false); +} + } diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSink.h b/src/Storages/ObjectStorage/StorageObjectStorageSink.h index 14298376d0e..a352e2c66a3 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSink.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageSink.h @@ -54,6 +54,9 @@ public: SinkPtr createSinkForPartition(const String & partition_id) override; private: + void validateKey(const String & str); + void validateNamespace(const String & str); + ObjectStoragePtr object_storage; StorageObjectStorageConfigurationPtr configuration; const std::optional format_settings; diff --git a/src/Storages/StorageS3Settings.cpp b/src/Storages/StorageS3Settings.cpp index 3eff6e0f6c9..e8f32388b1b 100644 --- a/src/Storages/StorageS3Settings.cpp +++ b/src/Storages/StorageS3Settings.cpp @@ -108,7 +108,7 @@ void S3Settings::RequestSettings::PartUploadSettings::validate() if (max_upload_part_size > max_upload_part_size_limit) throw Exception( ErrorCodes::INVALID_SETTING_VALUE, - "Setting max_upload_part_size has invalid value {} which is grater than the s3 API limit {}", + "Setting max_upload_part_size has invalid value {} which is greater than the s3 API limit {}", ReadableSize(max_upload_part_size), ReadableSize(max_upload_part_size_limit)); if (max_single_part_upload_size > max_upload_part_size_limit) diff --git a/tests/queries/0_stateless/02114_hdfs_bad_url.sh b/tests/queries/0_stateless/02114_hdfs_bad_url.sh index 22975dddf6f..5bd5610a9f0 100755 --- a/tests/queries/0_stateless/02114_hdfs_bad_url.sh +++ b/tests/queries/0_stateless/02114_hdfs_bad_url.sh @@ -23,4 +23,3 @@ $CLICKHOUSE_CLIENT -q "SELECT * FROM hdfs('hdfs1:9000/data', 'CSV', 'x UInt32')" $CLICKHOUSE_CLIENT -q "SELECT * FROM hdfs('hdfs://hdfs1/data', 'CSV', 'x UInt32')" 2>&1 | grep -F -q "HDFS_ERROR" && echo 'OK' || echo 'FAIL'; $CLICKHOUSE_CLIENT -q "SELECT * FROM hdfs('http://hdfs1:9000/data', 'CSV', 'x UInt32')" 2>&1 | grep -F -q "BAD_ARGUMENTS" && echo 'OK' || echo 'FAIL'; $CLICKHOUSE_CLIENT -q "SELECT * FROM hdfs('hdfs://hdfs1@nameservice/abcd/data', 'CSV', 'x UInt32')" 2>&1 | grep -F -q "HDFS_ERROR" && echo 'OK' || echo 'FAIL'; - diff --git a/tests/queries/0_stateless/02700_s3_part_INT_MAX.sh b/tests/queries/0_stateless/02700_s3_part_INT_MAX.sh index d831c7d9806..a34a480a078 100755 --- a/tests/queries/0_stateless/02700_s3_part_INT_MAX.sh +++ b/tests/queries/0_stateless/02700_s3_part_INT_MAX.sh @@ -13,7 +13,7 @@ CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) $CLICKHOUSE_CLIENT -nm -q " INSERT INTO FUNCTION s3('http://localhost:11111/test/$CLICKHOUSE_DATABASE/test_INT_MAX.tsv', '', '', 'TSV') SELECT repeat('a', 1024) FROM numbers((pow(2, 30) * 2) / 1024) - SETTINGS s3_max_single_part_upload_size = '10Gi'; + SETTINGS s3_max_single_part_upload_size = '5Gi'; SELECT count() FROM 
s3('http://localhost:11111/test/$CLICKHOUSE_DATABASE/test_INT_MAX.tsv'); " diff --git a/tests/queries/0_stateless/02873_s3_presigned_url_and_url_with_special_characters.sql b/tests/queries/0_stateless/02873_s3_presigned_url_and_url_with_special_characters.sql index da76a5cb88f..1e99eb8b83d 100644 --- a/tests/queries/0_stateless/02873_s3_presigned_url_and_url_with_special_characters.sql +++ b/tests/queries/0_stateless/02873_s3_presigned_url_and_url_with_special_characters.sql @@ -2,5 +2,4 @@ select * from s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/MyPrefix/BU%20-%20UNIT%20-%201/*.parquet'); -- { serverError CANNOT_EXTRACT_TABLE_STRUCTURE } -select * from s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/MyPrefix/*.parquet?some_tocken=ABCD'); -- { serverError CANNOT_EXTRACT_TABLE_STRUCTURE } - +select * from s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/MyPrefix/*.parquet?some_tocken=ABCD'); -- { serverError CANNOT_DETECT_FORMAT } From cb97f8dab52aeaf492530d66a8553c422ffbcebd Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 25 Mar 2024 19:22:20 +0100 Subject: [PATCH 053/392] Fix style check --- src/Storages/ObjectStorage/StorageObjectStorage.cpp | 1 - .../ObjectStorage/StorageObjectStorageConfiguration.cpp | 4 ++++ src/Storages/ObjectStorage/StorageObjectStorageSink.cpp | 1 + 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index 3a894af3e01..8d85224cff0 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -27,7 +27,6 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; extern const int DATABASE_ACCESS_DENIED; extern const int NOT_IMPLEMENTED; - } template diff --git a/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.cpp b/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.cpp index 9a8b8191907..1d5c0cd3a39 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.cpp @@ -4,6 +4,10 @@ namespace DB { +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} void StorageObjectStorageConfiguration::initialize( StorageObjectStorageConfiguration & configuration, diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp index 2dd8516ebe8..cf1c583ca62 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp @@ -8,6 +8,7 @@ namespace DB namespace ErrorCodes { extern const int CANNOT_PARSE_TEXT; + extern const int BAD_ARGUMENTS; } StorageObjectStorageSink::StorageObjectStorageSink( From 7a991de488567a255086a14faa830e1ba1610924 Mon Sep 17 00:00:00 2001 From: kssenii Date: Wed, 27 Mar 2024 19:06:19 +0100 Subject: [PATCH 054/392] Fix tests --- .../ObjectStorages/HDFS/HDFSObjectStorage.cpp | 68 ++++++++++++++-- .../ObjectStorages/HDFS/HDFSObjectStorage.h | 2 + .../ObjectStorages/ObjectStorageIterator.h | 24 +++--- .../ObjectStorageIteratorAsync.cpp | 12 +++ .../ObjectStorageIteratorAsync.h | 15 ++-- .../ObjectStorages/S3/S3ObjectStorage.cpp | 23 ++++-- .../ObjectStorage/HDFS/Configuration.cpp | 81 +++++++++++++++---- .../ObjectStorage/HDFS/Configuration.h | 3 + .../ObjectStorage/HDFS/ReadBufferFromHDFS.cpp | 17 ++-- .../ReadFromStorageObjectStorage.cpp | 4 +- .../ObjectStorage/StorageObjectStorage.cpp | 4 +- 
.../StorageObjectStorageCluster.cpp | 3 +- .../StorageObjectStorageConfiguration.cpp | 3 +- .../StorageObjectStorageConfiguration.h | 2 +- .../StorageObjectStorageQuerySettings.h | 4 + .../StorageObjectStorageSource.cpp | 56 ++++++++++--- .../StorageObjectStorageSource.h | 31 ++++--- src/Storages/S3Queue/S3QueueSource.cpp | 5 +- src/Storages/S3Queue/S3QueueSource.h | 2 +- src/Storages/S3Queue/StorageS3Queue.cpp | 2 +- tests/integration/test_storage_hdfs/test.py | 4 +- 21 files changed, 279 insertions(+), 86 deletions(-) diff --git a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp index 2d03de60c3c..db79ff365aa 100644 --- a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp +++ b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #if USE_HDFS @@ -18,6 +19,7 @@ namespace ErrorCodes { extern const int UNSUPPORTED_METHOD; extern const int HDFS_ERROR; + extern const int ACCESS_DENIED; } void HDFSObjectStorage::shutdown() @@ -48,7 +50,7 @@ std::unique_ptr HDFSObjectStorage::readObject( /// NOLIN std::optional, std::optional) const { - return std::make_unique(object.remote_path, object.remote_path, config, patchSettings(read_settings)); + return std::make_unique(hdfs_root_path, object.remote_path, config, patchSettings(read_settings)); } std::unique_ptr HDFSObjectStorage::readObjects( /// NOLINT @@ -62,12 +64,12 @@ std::unique_ptr HDFSObjectStorage::readObjects( /// NOLI [this, disk_read_settings] (bool /* restricted_seek */, const std::string & path) -> std::unique_ptr { - size_t begin_of_path = path.find('/', path.find("//") + 2); - auto hdfs_path = path.substr(begin_of_path); - auto hdfs_uri = path.substr(0, begin_of_path); + // size_t begin_of_path = path.find('/', path.find("//") + 2); + // auto hdfs_path = path.substr(begin_of_path); + // auto hdfs_uri = path.substr(0, begin_of_path); return std::make_unique( - hdfs_uri, hdfs_path, config, disk_read_settings, /* read_until_position */0, /* use_external_buffer */true); + hdfs_root_path, path, config, disk_read_settings, /* read_until_position */0, /* use_external_buffer */true); }; return std::make_unique( @@ -131,7 +133,8 @@ ObjectMetadata HDFSObjectStorage::getObjectMetadata(const std::string & path) co { auto * file_info = hdfsGetPathInfo(hdfs_fs.get(), path.data()); if (!file_info) - throw Exception(ErrorCodes::HDFS_ERROR, "Cannot get file info for: {}. Error: {}", path, hdfsGetLastError()); + throw Exception(ErrorCodes::HDFS_ERROR, + "Cannot get file info for: {}. Error: {}", path, hdfsGetLastError()); ObjectMetadata metadata; metadata.size_bytes = static_cast(file_info->mSize); @@ -141,6 +144,54 @@ ObjectMetadata HDFSObjectStorage::getObjectMetadata(const std::string & path) co return metadata; } +void HDFSObjectStorage::listObjects(const std::string & path, RelativePathsWithMetadata & children, size_t max_keys) const +{ + auto * log = &Poco::Logger::get("HDFSObjectStorage"); + LOG_TRACE(log, "Trying to list files for {}", path); + + HDFSFileInfo ls; + ls.file_info = hdfsListDirectory(hdfs_fs.get(), path.data(), &ls.length); + + if (ls.file_info == nullptr && errno != ENOENT) // NOLINT + { + // ignore file not found exception, keep throw other exception, + // libhdfs3 doesn't have function to get exception type, so use errno. 
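        // In other words: a null listing together with errno == ENOENT simply means the
        // directory does not exist and is treated as an empty result, while any other errno
        // from hdfsListDirectory() is surfaced as the ACCESS_DENIED exception below.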
+ throw Exception(ErrorCodes::ACCESS_DENIED, "Cannot list directory {}: {}", + path, String(hdfsGetLastError())); + } + + if (!ls.file_info && ls.length > 0) + { + throw Exception(ErrorCodes::LOGICAL_ERROR, "file_info shouldn't be null"); + } + + LOG_TRACE(log, "Listed {} files for {}", ls.length, path); + + for (int i = 0; i < ls.length; ++i) + { + const String file_path = fs::path(ls.file_info[i].mName).lexically_normal(); + const size_t last_slash = file_path.rfind('/'); + const String file_name = file_path.substr(last_slash); + + const bool is_directory = ls.file_info[i].mKind == 'D'; + if (is_directory) + { + listObjects(fs::path(file_path) / "", children, max_keys); + } + else + { + LOG_TEST(log, "Found file: {}", file_path); + + children.emplace_back(std::make_shared( + String(file_path), + ObjectMetadata{ + static_cast(ls.file_info[i].mSize), + Poco::Timestamp::fromEpochTime(ls.file_info[i].mLastMod), + {}})); + } + } +} + void HDFSObjectStorage::copyObject( /// NOLINT const StoredObject & object_from, const StoredObject & object_to, @@ -160,7 +211,10 @@ void HDFSObjectStorage::copyObject( /// NOLINT } -std::unique_ptr HDFSObjectStorage::cloneObjectStorage(const std::string &, const Poco::Util::AbstractConfiguration &, const std::string &, ContextPtr) +std::unique_ptr HDFSObjectStorage::cloneObjectStorage( + const std::string &, + const Poco::Util::AbstractConfiguration &, + const std::string &, ContextPtr) { throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "HDFS object storage doesn't support cloning"); } diff --git a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h index f92e160fd4d..24642ec635a 100644 --- a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h +++ b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h @@ -92,6 +92,8 @@ public: const WriteSettings & write_settings, std::optional object_to_attributes = {}) override; + void listObjects(const std::string & path, RelativePathsWithMetadata & children, size_t max_keys) const override; + void shutdown() override; void startup() override; diff --git a/src/Disks/ObjectStorages/ObjectStorageIterator.h b/src/Disks/ObjectStorages/ObjectStorageIterator.h index e934fc2056d..26c3c690ba5 100644 --- a/src/Disks/ObjectStorages/ObjectStorageIterator.h +++ b/src/Disks/ObjectStorages/ObjectStorageIterator.h @@ -27,9 +27,7 @@ class ObjectStorageIteratorFromList : public IObjectStorageIterator public: explicit ObjectStorageIteratorFromList(RelativePathsWithMetadata && batch_) : batch(std::move(batch_)) - , batch_iterator(batch.begin()) - { - } + , batch_iterator(batch.begin()) {} void next() override { @@ -37,21 +35,23 @@ public: ++batch_iterator; } - void nextBatch() override - { - batch_iterator = batch.end(); - } + void nextBatch() override { batch_iterator = batch.end(); } - bool isValid() override - { - return batch_iterator != batch.end(); - } + bool isValid() override { return batch_iterator != batch.end(); } RelativePathWithMetadataPtr current() override; RelativePathsWithMetadata currentBatch() override { return batch; } - std::optional getCurrentBatchAndScheduleNext() override { return std::nullopt; } + std::optional getCurrentBatchAndScheduleNext() override + { + if (batch.empty()) + return {}; + + auto current_batch = std::move(batch); + batch = {}; + return current_batch; + } size_t getAccumulatedSize() const override { return batch.size(); } diff --git a/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.cpp b/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.cpp index 
f441b18d59d..94a0751dcc8 100644 --- a/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.cpp +++ b/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.cpp @@ -21,6 +21,18 @@ IObjectStorageIteratorAsync::IObjectStorageIteratorAsync( { } +IObjectStorageIteratorAsync::~IObjectStorageIteratorAsync() +{ + if (!deactivated) + deactivate(); +} + +void IObjectStorageIteratorAsync::deactivate() +{ + list_objects_pool.wait(); + deactivated = true; +} + void IObjectStorageIteratorAsync::nextBatch() { std::lock_guard lock(mutex); diff --git a/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.h b/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.h index c4bde91f415..3e3269fb550 100644 --- a/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.h +++ b/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.h @@ -19,18 +19,20 @@ public: CurrentMetrics::Metric threads_scheduled_metric, const std::string & thread_name); - void next() override; - void nextBatch() override; + ~IObjectStorageIteratorAsync() override; + bool isValid() override; + RelativePathWithMetadataPtr current() override; RelativePathsWithMetadata currentBatch() override; + + void next() override; + void nextBatch() override; + size_t getAccumulatedSize() const override; std::optional getCurrentBatchAndScheduleNext() override; - ~IObjectStorageIteratorAsync() override - { - list_objects_pool.wait(); - } + void deactivate(); protected: @@ -46,6 +48,7 @@ protected: bool is_initialized{false}; bool is_finished{false}; + bool deactivated{false}; mutable std::recursive_mutex mutex; ThreadPool list_objects_pool; diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index 33c0afda4c1..d902a33ae4a 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -110,10 +110,19 @@ public: CurrentMetrics::ObjectStorageS3ThreadsScheduled, "ListObjectS3") , client(client_) + , request(std::make_unique()) { - request.SetBucket(bucket_); - request.SetPrefix(path_prefix); - request.SetMaxKeys(static_cast(max_list_size)); + request->SetBucket(bucket_); + request->SetPrefix(path_prefix); + request->SetMaxKeys(static_cast(max_list_size)); + } + + ~S3IteratorAsync() override + { + /// Deactivate background threads before resetting the request to avoid data race. + deactivate(); + request.reset(); + client.reset(); } private: @@ -121,12 +130,12 @@ private: { ProfileEvents::increment(ProfileEvents::S3ListObjects); - auto outcome = client->ListObjectsV2(request); + auto outcome = client->ListObjectsV2(*request); /// Outcome failure will be handled on the caller side. 
         if (outcome.IsSuccess())
         {
-            request.SetContinuationToken(outcome.GetResult().GetNextContinuationToken());
+            request->SetContinuationToken(outcome.GetResult().GetNextContinuationToken());
 
             auto objects = outcome.GetResult().GetContents();
             for (const auto & object : objects)
@@ -141,12 +150,12 @@ private:
 
         throw S3Exception(outcome.GetError().GetErrorType(),
                           "Could not list objects in bucket {} with prefix {}, S3 exception: {}, message: {}",
-                          quoteString(request.GetBucket()), quoteString(request.GetPrefix()),
+                          quoteString(request->GetBucket()), quoteString(request->GetPrefix()),
                           backQuote(outcome.GetError().GetExceptionName()), quoteString(outcome.GetError().GetMessage()));
     }
 
     std::shared_ptr client;
-    S3::ListObjectsV2Request request;
+    std::unique_ptr request;
 };
 
 }
diff --git a/src/Storages/ObjectStorage/HDFS/Configuration.cpp b/src/Storages/ObjectStorage/HDFS/Configuration.cpp
index 5edc660d717..50e8918a12e 100644
--- a/src/Storages/ObjectStorage/HDFS/Configuration.cpp
+++ b/src/Storages/ObjectStorage/HDFS/Configuration.cpp
@@ -8,6 +8,8 @@
 #include
 #include
 #include
+#include
+
 namespace DB
 {
@@ -28,7 +30,7 @@ StorageHDFSConfiguration::StorageHDFSConfiguration(const StorageHDFSConfiguratio
 void StorageHDFSConfiguration::check(ContextPtr context) const
 {
     context->getRemoteHostFilter().checkURL(Poco::URI(url));
-    checkHDFSURL(fs::path(url) / path);
+    checkHDFSURL(fs::path(url) / path.substr(1));
 }
 
 ObjectStoragePtr StorageHDFSConfiguration::createObjectStorage( /// NOLINT
@@ -44,9 +46,22 @@ ObjectStoragePtr StorageHDFSConfiguration::createObjectStorage( /// NOLINT
     return std::make_shared(url, std::move(hdfs_settings), context->getConfigRef());
 }
 
-void StorageHDFSConfiguration::fromAST(ASTs & args, ContextPtr context, bool /* with_structure */)
+std::string StorageHDFSConfiguration::getPathWithoutGlob() const
 {
-    url = checkAndGetLiteralArgument(args[0], "url");
+    /// Unlike s3 and azure, which are object storages,
+    /// hdfs is a filesystem, so it cannot list files by partial prefix,
+    /// only by directory.
+ auto first_glob_pos = path.find_first_of("*?{"); + auto end_of_path_without_globs = path.substr(0, first_glob_pos).rfind('/'); + if (end_of_path_without_globs == std::string::npos || end_of_path_without_globs == 0) + return "/"; + return path.substr(0, end_of_path_without_globs); +} + +void StorageHDFSConfiguration::fromAST(ASTs & args, ContextPtr context, bool with_structure) +{ + std::string url_str; + url_str = checkAndGetLiteralArgument(args[0], "url"); if (args.size() > 1) { @@ -54,28 +69,60 @@ void StorageHDFSConfiguration::fromAST(ASTs & args, ContextPtr context, bool /* format = checkAndGetLiteralArgument(args[1], "format_name"); } - if (args.size() == 3) + if (with_structure) + { + if (args.size() > 2) + { + structure = checkAndGetLiteralArgument(args[2], "structure"); + } + if (args.size() > 3) + { + args[3] = evaluateConstantExpressionOrIdentifierAsLiteral(args[3], context); + compression_method = checkAndGetLiteralArgument(args[3], "compression_method"); + } + } + else if (args.size() > 2) { args[2] = evaluateConstantExpressionOrIdentifierAsLiteral(args[2], context); compression_method = checkAndGetLiteralArgument(args[2], "compression_method"); } - auto pos = url.find("//"); - if (pos == std::string::npos) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Invalid url: {}", url); - - pos = url.find('/', pos + 2); - if (pos == std::string::npos) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Invalid url: {}", url); - - path = url.substr(pos + 1); - url = url.substr(0, pos); - paths = {path}; + setURL(url_str); } -void StorageHDFSConfiguration::fromNamedCollection(const NamedCollection &) +void StorageHDFSConfiguration::fromNamedCollection(const NamedCollection & collection) { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method fromNamedColection() is not implemented"); + std::string url_str; + + auto filename = collection.getOrDefault("filename", ""); + if (!filename.empty()) + url_str = std::filesystem::path(collection.get("url")) / filename; + else + url_str = collection.get("url"); + + format = collection.getOrDefault("format", "auto"); + compression_method = collection.getOrDefault("compression_method", collection.getOrDefault("compression", "auto")); + structure = collection.getOrDefault("structure", "auto"); + + setURL(url_str); +} + +void StorageHDFSConfiguration::setURL(const std::string url_) +{ + auto pos = url_.find("//"); + if (pos == std::string::npos) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Bad hdfs url: {}", url_); + + pos = url_.find('/', pos + 2); + if (pos == std::string::npos) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Bad hdfs url: {}", url_); + + path = url_.substr(pos + 1); + url = url_.substr(0, pos); + path = '/' + path; + paths = {path}; + + LOG_TRACE(getLogger("StorageHDFSConfiguration"), "Using url: {}, path: {}", url, path); } } diff --git a/src/Storages/ObjectStorage/HDFS/Configuration.h b/src/Storages/ObjectStorage/HDFS/Configuration.h index 5765edbf36c..8506c7c9700 100644 --- a/src/Storages/ObjectStorage/HDFS/Configuration.h +++ b/src/Storages/ObjectStorage/HDFS/Configuration.h @@ -31,9 +31,12 @@ public: static void addStructureToArgs(ASTs &, const String &, ContextPtr) {} + std::string getPathWithoutGlob() const override; + private: void fromNamedCollection(const NamedCollection &) override; void fromAST(ASTs & args, ContextPtr, bool /* with_structure */) override; + void setURL(const std::string url_); String url; String path; diff --git a/src/Storages/ObjectStorage/HDFS/ReadBufferFromHDFS.cpp 
b/src/Storages/ObjectStorage/HDFS/ReadBufferFromHDFS.cpp index 18b22805dfc..c29189804e6 100644 --- a/src/Storages/ObjectStorage/HDFS/ReadBufferFromHDFS.cpp +++ b/src/Storages/ObjectStorage/HDFS/ReadBufferFromHDFS.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -55,10 +56,10 @@ struct ReadBufferFromHDFS::ReadBufferFromHDFSImpl : public BufferWithOwnMemory(use_external_buffer_ ? 0 : read_settings_.remote_fs_buffer_size) , hdfs_uri(hdfs_uri_) , hdfs_file_path(hdfs_file_path_) - , builder(createHDFSBuilder(hdfs_uri_, config_)) , read_settings(read_settings_) , read_until_position(read_until_position_) { + builder = createHDFSBuilder(hdfs_uri_, config_); fs = createHDFSFS(builder.get()); fin = hdfsOpenFile(fs.get(), hdfs_file_path.c_str(), O_RDONLY, 0, 0, 0); @@ -96,11 +97,14 @@ struct ReadBufferFromHDFS::ReadBufferFromHDFSImpl : public BufferWithOwnMemory {})", file_offset, read_until_position - 1); @@ -111,10 +115,11 @@ struct ReadBufferFromHDFS::ReadBufferFromHDFSImpl : public BufferWithOwnMemory= file_size) - { - return false; - } + // if (file_size != 0 && file_offset >= file_size) + // { + // LOG_TEST(log, "KSSENII 1 2"); + // return false; + // } ResourceGuard rlock(read_settings.resource_link, num_bytes_to_read); int bytes_read; @@ -145,6 +150,8 @@ struct ReadBufferFromHDFS::ReadBufferFromHDFSImpl : public BufferWithOwnMemoryadd(bytes_read, ProfileEvents::RemoteReadThrottlerBytes, ProfileEvents::RemoteReadThrottlerSleepMicroseconds); + + LOG_TEST(log, "KSSENII SIZE: {}", bytes_read); return true; } diff --git a/src/Storages/ObjectStorage/ReadFromStorageObjectStorage.cpp b/src/Storages/ObjectStorage/ReadFromStorageObjectStorage.cpp index ce157972161..f2595299430 100644 --- a/src/Storages/ObjectStorage/ReadFromStorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/ReadFromStorageObjectStorage.cpp @@ -49,8 +49,8 @@ void ReadFromStorageObejctStorage::createIterator(const ActionsDAG::Node * predi { auto context = getContext(); iterator_wrapper = StorageObjectStorageSource::createFileIterator( - configuration, object_storage, distributed_processing, context, predicate, - virtual_columns, nullptr, query_settings.list_object_keys_size, metric_threads_count, + configuration, object_storage, query_settings, distributed_processing, + context, predicate, virtual_columns, nullptr, metric_threads_count, metric_threads_active, metric_threads_scheduled, context->getFileProgressCallback()); } } diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index 8d85224cff0..0276ff62778 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -288,8 +288,8 @@ std::unique_ptr StorageObjectStorage::creat { const auto settings = StorageSettings::create(context->getSettingsRef()); auto file_iterator = StorageObjectStorageSource::createFileIterator( - configuration, object_storage, /* distributed_processing */false, - context, /* predicate */{}, /* virtual_columns */{}, &read_keys, settings.list_object_keys_size, + configuration, object_storage, settings, /* distributed_processing */false, + context, /* predicate */{}, /* virtual_columns */{}, &read_keys, StorageSettings::ObjectStorageThreads(), StorageSettings::ObjectStorageThreadsActive(), StorageSettings::ObjectStorageThreadsScheduled()); return std::make_unique( diff --git a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp index 
c5421f1d319..f023bb068d4 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp @@ -92,7 +92,8 @@ StorageObjectStorageCluster::getTask const auto settings = StorageSettings::create(local_context->getSettingsRef()); auto iterator = std::make_shared( object_storage, configuration, predicate, virtual_columns, local_context, - nullptr, settings.list_object_keys_size, local_context->getFileProgressCallback()); + nullptr, settings.list_object_keys_size, settings.throw_on_zero_files_match, + local_context->getFileProgressCallback()); auto callback = std::make_shared>([iterator]() mutable -> String { diff --git a/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.cpp b/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.cpp index 1d5c0cd3a39..61e569cee05 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.cpp @@ -40,7 +40,8 @@ StorageObjectStorageConfiguration::StorageObjectStorageConfiguration(const Stora bool StorageObjectStorageConfiguration::withWildcard() const { static const String PARTITION_ID_WILDCARD = "{_partition_id}"; - return getPath().find(PARTITION_ID_WILDCARD) != String::npos; + return getPath().find(PARTITION_ID_WILDCARD) != String::npos + || getNamespace().find(PARTITION_ID_WILDCARD) != String::npos; } bool StorageObjectStorageConfiguration::isPathWithGlobs() const diff --git a/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.h b/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.h index 0beed91b128..48825c6a012 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.h @@ -40,7 +40,7 @@ public: bool withGlobs() const { return isPathWithGlobs() || isNamespaceWithGlobs(); } bool isPathWithGlobs() const; bool isNamespaceWithGlobs() const; - std::string getPathWithoutGlob() const; + virtual std::string getPathWithoutGlob() const; virtual void check(ContextPtr context) const = 0; virtual void validateNamespace(const String & /* name */) const {} diff --git a/src/Storages/ObjectStorage/StorageObjectStorageQuerySettings.h b/src/Storages/ObjectStorage/StorageObjectStorageQuerySettings.h index 454da7c355f..8bcc2ad3b37 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageQuerySettings.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageQuerySettings.h @@ -25,6 +25,7 @@ struct StorageObjectStorageSettings SchemaInferenceMode schema_inference_mode; bool skip_empty_files; size_t list_object_keys_size; + bool throw_on_zero_files_match; }; struct S3StorageSettings @@ -38,6 +39,7 @@ struct S3StorageSettings .schema_inference_mode = settings.schema_inference_mode, .skip_empty_files = settings.s3_skip_empty_files, .list_object_keys_size = settings.s3_list_object_keys_size, + .throw_on_zero_files_match = settings.s3_throw_on_zero_files_match, }; } @@ -59,6 +61,7 @@ struct AzureStorageSettings .schema_inference_mode = settings.schema_inference_mode, .skip_empty_files = settings.s3_skip_empty_files, /// TODO: add setting for azure .list_object_keys_size = settings.azure_list_object_keys_size, + .throw_on_zero_files_match = settings.s3_throw_on_zero_files_match, }; } @@ -80,6 +83,7 @@ struct HDFSStorageSettings .schema_inference_mode = settings.schema_inference_mode, .skip_empty_files = settings.s3_skip_empty_files, /// TODO: add setting for hdfs .list_object_keys_size = 
settings.s3_list_object_keys_size, /// TODO: add a setting for hdfs + .throw_on_zero_files_match = settings.s3_throw_on_zero_files_match, }; } diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index 3c8484194c9..5a88f1436c1 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -28,6 +28,7 @@ namespace ErrorCodes extern const int CANNOT_COMPILE_REGEXP; extern const int BAD_ARGUMENTS; extern const int LOGICAL_ERROR; + extern const int FILE_DOESNT_EXIST; } StorageObjectStorageSource::StorageObjectStorageSource( @@ -75,12 +76,12 @@ StorageObjectStorageSource::~StorageObjectStorageSource() std::shared_ptr StorageObjectStorageSource::createFileIterator( ConfigurationPtr configuration, ObjectStoragePtr object_storage, + const StorageObjectStorageSettings & settings, bool distributed_processing, const ContextPtr & local_context, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, ObjectInfos * read_keys, - size_t list_object_keys_size, CurrentMetrics::Metric metric_threads_, CurrentMetrics::Metric metric_threads_active_, CurrentMetrics::Metric metric_threads_scheduled_, @@ -99,12 +100,14 @@ std::shared_ptr StorageObjectStorageSourc { /// Iterate through disclosed globs and make a source for each file return std::make_shared( - object_storage, configuration, predicate, virtual_columns, local_context, read_keys, list_object_keys_size, file_progress_callback); + object_storage, configuration, predicate, virtual_columns, local_context, + read_keys, settings.list_object_keys_size, settings.throw_on_zero_files_match, file_progress_callback); } else { return std::make_shared( - object_storage, configuration, virtual_columns, read_keys, file_progress_callback); + object_storage, configuration, virtual_columns, read_keys, + settings.throw_on_zero_files_match, file_progress_callback); } } @@ -209,6 +212,7 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade do { object_info = file_iterator->next(processor); + if (!object_info || object_info->relative_path.empty()) return {}; @@ -226,8 +230,11 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade ? tryGetNumRowsFromCache(object_info) : std::nullopt; + LOG_TRACE(&Poco::Logger::get("kssenii"), "HAS NUM ROWS FROM CACHE: {}", num_rows_from_cache.has_value()); if (num_rows_from_cache) { + LOG_TRACE(&Poco::Logger::get("kssenii"), "NUM ROWS FROM CACHE: {}", num_rows_from_cache.value()); + /// We should not return single chunk with all number of rows, /// because there is a chance that this chunk will be materialized later /// (it can cause memory problems even with default values in columns or when virtual columns are requested). 
@@ -324,6 +331,29 @@ std::unique_ptr StorageObjectStorageSource::createReadBuffer(const S } } +StorageObjectStorageSource::IIterator::IIterator(bool throw_on_zero_files_match_, const std::string & logger_name_) + : throw_on_zero_files_match(throw_on_zero_files_match_) + , logger(getLogger(logger_name_)) +{ +} + +ObjectInfoPtr StorageObjectStorageSource::IIterator::next(size_t processor) +{ + auto object_info = nextImpl(processor); + + if (object_info) + { + first_iteration = false; + LOG_TEST(&Poco::Logger::get("KeysIterator"), "Next key: {}", object_info->relative_path); + } + else if (first_iteration && throw_on_zero_files_match) + { + throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "Can not match any files"); + } + + return object_info; +} + StorageObjectStorageSource::GlobIterator::GlobIterator( ObjectStoragePtr object_storage_, ConfigurationPtr configuration_, @@ -332,8 +362,10 @@ StorageObjectStorageSource::GlobIterator::GlobIterator( ContextPtr context_, ObjectInfos * read_keys_, size_t list_object_keys_size, + bool throw_on_zero_files_match_, std::function file_progress_callback_) - : WithContext(context_) + : IIterator(throw_on_zero_files_match_, "GlobIterator") + , WithContext(context_) , object_storage(object_storage_) , configuration(configuration_) , virtual_columns(virtual_columns_) @@ -380,7 +412,7 @@ StorageObjectStorageSource::GlobIterator::GlobIterator( } } -ObjectInfoPtr StorageObjectStorageSource::GlobIterator::next(size_t /* processor */) +ObjectInfoPtr StorageObjectStorageSource::GlobIterator::nextImpl(size_t /* processor */) { std::lock_guard lock(next_mutex); @@ -401,9 +433,10 @@ ObjectInfoPtr StorageObjectStorageSource::GlobIterator::next(size_t /* processor } new_batch = std::move(result.value()); + LOG_TEST(logger, "Batch size: {}", new_batch.size()); + for (auto it = new_batch.begin(); it != new_batch.end();) { - chassert(*it); if (!recursive && !re2::RE2::FullMatch((*it)->relative_path, *matcher)) it = new_batch.erase(it); else @@ -452,8 +485,10 @@ StorageObjectStorageSource::KeysIterator::KeysIterator( ConfigurationPtr configuration_, const NamesAndTypesList & virtual_columns_, ObjectInfos * read_keys_, + bool throw_on_zero_files_match_, std::function file_progress_callback_) - : object_storage(object_storage_) + : IIterator(throw_on_zero_files_match_, "KeysIterator") + , object_storage(object_storage_) , configuration(configuration_) , virtual_columns(virtual_columns_) , file_progress_callback(file_progress_callback_) @@ -470,7 +505,7 @@ StorageObjectStorageSource::KeysIterator::KeysIterator( } } -ObjectInfoPtr StorageObjectStorageSource::KeysIterator::next(size_t /* processor */) +ObjectInfoPtr StorageObjectStorageSource::KeysIterator::nextImpl(size_t /* processor */) { size_t current_index = index.fetch_add(1, std::memory_order_relaxed); if (current_index >= keys.size()) @@ -520,7 +555,8 @@ StorageObjectStorageSource::ReadTaskIterator::ReadTaskIterator( CurrentMetrics::Metric metric_threads_, CurrentMetrics::Metric metric_threads_active_, CurrentMetrics::Metric metric_threads_scheduled_) - : callback(callback_) + : IIterator(false, "ReadTaskIterator") + , callback(callback_) { ThreadPool pool(metric_threads_, metric_threads_active_, metric_threads_scheduled_, max_threads_count); auto pool_scheduler = threadPoolCallbackRunner(pool, "ReadTaskIter"); @@ -540,7 +576,7 @@ StorageObjectStorageSource::ReadTaskIterator::ReadTaskIterator( } } -ObjectInfoPtr StorageObjectStorageSource::ReadTaskIterator::next(size_t) +ObjectInfoPtr 
StorageObjectStorageSource::ReadTaskIterator::nextImpl(size_t) { size_t current_index = index.fetch_add(1, std::memory_order_relaxed); if (current_index >= buffer.size()) diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.h b/src/Storages/ObjectStorage/StorageObjectStorageSource.h index d02cb4a3a90..7c5497a6eaa 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.h @@ -53,12 +53,12 @@ public: static std::shared_ptr createFileIterator( ConfigurationPtr configuration, ObjectStoragePtr object_storage, + const StorageObjectStorageSettings & settings, bool distributed_processing, const ContextPtr & local_context, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, ObjectInfos * read_keys, - size_t list_object_keys_size, CurrentMetrics::Metric metric_threads_, CurrentMetrics::Metric metric_threads_active_, CurrentMetrics::Metric metric_threads_scheduled_, @@ -133,10 +133,21 @@ protected: class StorageObjectStorageSource::IIterator { public: + IIterator(bool throw_on_zero_files_match_, const std::string & logger_name_); + virtual ~IIterator() = default; virtual size_t estimatedKeysCount() = 0; - virtual ObjectInfoPtr next(size_t processor) = 0; + + ObjectInfoPtr next(size_t processor); + +protected: + virtual ObjectInfoPtr nextImpl(size_t processor) = 0; + +protected: + const bool throw_on_zero_files_match; + bool first_iteration = true; + LoggerPtr logger; }; class StorageObjectStorageSource::ReadTaskIterator : public IIterator @@ -151,9 +162,9 @@ public: size_t estimatedKeysCount() override { return buffer.size(); } - ObjectInfoPtr next(size_t) override; - private: + ObjectInfoPtr nextImpl(size_t) override; + ReadTaskCallback callback; ObjectInfos buffer; std::atomic_size_t index = 0; @@ -170,15 +181,17 @@ public: ContextPtr context_, ObjectInfos * read_keys_, size_t list_object_keys_size, + bool throw_on_zero_files_match_, std::function file_progress_callback_ = {}); ~GlobIterator() override = default; size_t estimatedKeysCount() override { return object_infos.size(); } - ObjectInfoPtr next(size_t processor) override; - private: + ObjectInfoPtr nextImpl(size_t processor) override; + void createFilterAST(const String & any_key); + ObjectStoragePtr object_storage; ConfigurationPtr configuration; ActionsDAGPtr filter_dag; @@ -193,7 +206,6 @@ private: std::unique_ptr matcher; - void createFilterAST(const String & any_key); bool is_finished = false; std::mutex next_mutex; @@ -208,15 +220,16 @@ public: ConfigurationPtr configuration_, const NamesAndTypesList & virtual_columns_, ObjectInfos * read_keys_, + bool throw_on_zero_files_match_, std::function file_progress_callback = {}); ~KeysIterator() override = default; size_t estimatedKeysCount() override { return keys.size(); } - ObjectInfoPtr next(size_t processor) override; - private: + ObjectInfoPtr nextImpl(size_t processor) override; + const ObjectStoragePtr object_storage; const ConfigurationPtr configuration; const NamesAndTypesList virtual_columns; diff --git a/src/Storages/S3Queue/S3QueueSource.cpp b/src/Storages/S3Queue/S3QueueSource.cpp index 42cd210018a..ee3071ea71f 100644 --- a/src/Storages/S3Queue/S3QueueSource.cpp +++ b/src/Storages/S3Queue/S3QueueSource.cpp @@ -45,7 +45,8 @@ StorageS3QueueSource::FileIterator::FileIterator( std::unique_ptr glob_iterator_, size_t current_shard_, std::atomic & shutdown_called_) - : metadata(metadata_) + : StorageObjectStorageSource::IIterator(false, "S3QueueIterator") + , 
metadata(metadata_) , glob_iterator(std::move(glob_iterator_)) , shutdown_called(shutdown_called_) , log(&Poco::Logger::get("StorageS3QueueSource")) @@ -59,7 +60,7 @@ StorageS3QueueSource::FileIterator::FileIterator( } } -StorageS3QueueSource::ObjectInfoPtr StorageS3QueueSource::FileIterator::next(size_t processor) +StorageS3QueueSource::ObjectInfoPtr StorageS3QueueSource::FileIterator::nextImpl(size_t processor) { while (!shutdown_called) { diff --git a/src/Storages/S3Queue/S3QueueSource.h b/src/Storages/S3Queue/S3QueueSource.h index 2bdac7f2311..8c785e683c2 100644 --- a/src/Storages/S3Queue/S3QueueSource.h +++ b/src/Storages/S3Queue/S3QueueSource.h @@ -56,7 +56,7 @@ public: /// Note: /// List results in s3 are always returned in UTF-8 binary order. /// (https://docs.aws.amazon.com/AmazonS3/latest/userguide/ListingKeysUsingAPIs.html) - ObjectInfoPtr next(size_t processor) override; + ObjectInfoPtr nextImpl(size_t processor) override; size_t estimatedKeysCount() override; diff --git a/src/Storages/S3Queue/StorageS3Queue.cpp b/src/Storages/S3Queue/StorageS3Queue.cpp index aafcdc39f9e..c5799d23abd 100644 --- a/src/Storages/S3Queue/StorageS3Queue.cpp +++ b/src/Storages/S3Queue/StorageS3Queue.cpp @@ -598,7 +598,7 @@ std::shared_ptr StorageS3Queue::createFileIterator { auto settings = S3StorageSettings::create(local_context->getSettingsRef()); auto glob_iterator = std::make_unique( - object_storage, configuration, predicate, getVirtualsList(), local_context, nullptr, settings.list_object_keys_size); + object_storage, configuration, predicate, getVirtualsList(), local_context, nullptr, settings.list_object_keys_size, settings.throw_on_zero_files_match); return std::make_shared(files_metadata, std::move(glob_iterator), s3queue_settings->s3queue_current_shard_num, shutdown_called); } diff --git a/tests/integration/test_storage_hdfs/test.py b/tests/integration/test_storage_hdfs/test.py index 9dec1954406..5632c7ae060 100644 --- a/tests/integration/test_storage_hdfs/test.py +++ b/tests/integration/test_storage_hdfs/test.py @@ -61,7 +61,7 @@ def test_read_write_storage_with_globs(started_cluster): hdfs_api.write_data("/storage" + i, i + "\tMark\t72.53\n") assert hdfs_api.read_data("/storage" + i) == i + "\tMark\t72.53\n" - assert node1.query("select count(*) from HDFSStorageWithRange") == "3\n" + assert node1.query("select count(*) from HDFSStorageWithRange settings s3_throw_on_zero_files_match=1") == "3\n" assert node1.query("select count(*) from HDFSStorageWithEnum") == "3\n" assert node1.query("select count(*) from HDFSStorageWithQuestionMark") == "3\n" assert node1.query("select count(*) from HDFSStorageWithAsterisk") == "3\n" @@ -159,7 +159,7 @@ def test_bad_hdfs_uri(started_cluster): ) except Exception as ex: print(ex) - assert "Unable to create builder to connect to HDFS" in str(ex) + assert "Unable to connect to HDFS" in str(ex) try: node1.query( From 480251e5932f2d15891a403887b5afc96f40ee89 Mon Sep 17 00:00:00 2001 From: kssenii Date: Wed, 27 Mar 2024 19:28:11 +0100 Subject: [PATCH 055/392] Fix style check --- tests/integration/test_storage_hdfs/test.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_storage_hdfs/test.py b/tests/integration/test_storage_hdfs/test.py index 5632c7ae060..f6e486d6594 100644 --- a/tests/integration/test_storage_hdfs/test.py +++ b/tests/integration/test_storage_hdfs/test.py @@ -61,7 +61,12 @@ def test_read_write_storage_with_globs(started_cluster): hdfs_api.write_data("/storage" + i, i + "\tMark\t72.53\n") assert 
hdfs_api.read_data("/storage" + i) == i + "\tMark\t72.53\n" - assert node1.query("select count(*) from HDFSStorageWithRange settings s3_throw_on_zero_files_match=1") == "3\n" + assert ( + node1.query( + "select count(*) from HDFSStorageWithRange settings s3_throw_on_zero_files_match=1" + ) + == "3\n" + ) assert node1.query("select count(*) from HDFSStorageWithEnum") == "3\n" assert node1.query("select count(*) from HDFSStorageWithQuestionMark") == "3\n" assert node1.query("select count(*) from HDFSStorageWithAsterisk") == "3\n" From a2e210462d7d78212c32408ea3d276ef366b57c4 Mon Sep 17 00:00:00 2001 From: kssenii Date: Wed, 27 Mar 2024 22:31:22 +0100 Subject: [PATCH 056/392] Fix style check --- src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp | 1 + src/Storages/ObjectStorage/HDFS/Configuration.cpp | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp index db79ff365aa..9bc75b740e5 100644 --- a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp +++ b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp @@ -20,6 +20,7 @@ namespace ErrorCodes extern const int UNSUPPORTED_METHOD; extern const int HDFS_ERROR; extern const int ACCESS_DENIED; + extern const int LOGICAL_ERROR; } void HDFSObjectStorage::shutdown() diff --git a/src/Storages/ObjectStorage/HDFS/Configuration.cpp b/src/Storages/ObjectStorage/HDFS/Configuration.cpp index 50e8918a12e..3828afc0bea 100644 --- a/src/Storages/ObjectStorage/HDFS/Configuration.cpp +++ b/src/Storages/ObjectStorage/HDFS/Configuration.cpp @@ -16,7 +16,6 @@ namespace DB namespace ErrorCodes { extern const int BAD_ARGUMENTS; - extern const int NOT_IMPLEMENTED; } StorageHDFSConfiguration::StorageHDFSConfiguration(const StorageHDFSConfiguration & other) From 5c63d09c5bb91f7dc159befeb505a74e4c0257a5 Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 28 Mar 2024 14:15:14 +0100 Subject: [PATCH 057/392] More tests fixes --- src/Core/Settings.h | 3 + .../ObjectStorages/HDFS/HDFSObjectStorage.cpp | 4 +- .../ObjectStorages/S3/S3ObjectStorage.cpp | 8 +- .../ObjectStorage/HDFS/Configuration.cpp | 18 ++++- .../ObjectStorage/HDFS/Configuration.h | 2 +- .../StorageObjectStorageQuerySettings.h | 4 + .../StorageObjectStorageSource.cpp | 76 ++++++++++++------- .../StorageObjectStorageSource.h | 20 ++--- src/Storages/S3Queue/S3QueueSource.cpp | 2 +- tests/integration/test_storage_hdfs/test.py | 12 +-- .../0_stateless/02725_database_hdfs.sh | 3 +- 11 files changed, 98 insertions(+), 54 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index f8f3595094c..2fae390c35b 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -113,6 +113,9 @@ class IColumn; M(Bool, s3_check_objects_after_upload, false, "Check each uploaded object to s3 with head request to be sure that upload was successful", 0) \ M(Bool, s3_allow_parallel_part_upload, true, "Use multiple threads for s3 multipart upload. 
It may lead to slightly higher memory usage", 0) \ M(Bool, s3_throw_on_zero_files_match, false, "Throw an error, when ListObjects request cannot match any files", 0) \ + M(Bool, s3_ignore_file_doesnt_exist, false, "Ignore if files does not exits and return 0 zeros for StorageS3", 0) \ + M(Bool, hdfs_ignore_file_doesnt_exist, false, "Ignore if files does not exits and return 0 zeros for StorageHDFS", 0) \ + M(Bool, azure_ignore_file_doesnt_exist, false, "Ignore if files does not exits and return 0 zeros for StorageAzure", 0) \ M(Bool, s3_disable_checksum, false, "Do not calculate a checksum when sending a file to S3. This speeds up writes by avoiding excessive processing passes on a file. It is mostly safe as the data of MergeTree tables is checksummed by ClickHouse anyway, and when S3 is accessed with HTTPS, the TLS layer already provides integrity while transferring through the network. While additional checksums on S3 give defense in depth.", 0) \ M(UInt64, s3_retry_attempts, 100, "Setting for Aws::Client::RetryStrategy, Aws::Client does retries itself, 0 means no retries", 0) \ M(UInt64, s3_request_timeout_ms, 30000, "Idleness timeout for sending and receiving data to/from S3. Fail if a single TCP read or write call blocks for this long.", 0) \ diff --git a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp index 85d3e921f22..8bfba6fcfad 100644 --- a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp +++ b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp @@ -103,10 +103,10 @@ std::unique_ptr HDFSObjectStorage::writeObject( /// NOL void HDFSObjectStorage::removeObject(const StoredObject & object) { const auto & path = object.remote_path; - const size_t begin_of_path = path.find('/', path.find("//") + 2); + // const size_t begin_of_path = path.find('/', path.find("//") + 2); /// Add path from root to file name - int res = hdfsDelete(hdfs_fs.get(), path.substr(begin_of_path).c_str(), 0); + int res = hdfsDelete(hdfs_fs.get(), path.c_str(), 0); if (res == -1) throw Exception(ErrorCodes::HDFS_ERROR, "HDFSDelete failed with path: {}", path); diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index b9995620c0f..9085fddfd08 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -432,7 +432,9 @@ void S3ObjectStorage::removeObjectsIfExist(const StoredObjects & objects) std::optional S3ObjectStorage::tryGetObjectMetadata(const std::string & path) const { auto settings_ptr = s3_settings.get(); - auto object_info = S3::getObjectInfo(*client.get(), uri.bucket, path, {}, settings_ptr->request_settings, /* with_metadata= */ true, /* for_disk_s3= */ true, /* throw_on_error= */ false); + auto object_info = S3::getObjectInfo( + *client.get(), uri.bucket, path, {}, settings_ptr->request_settings, + /* with_metadata= */ true, /* for_disk_s3= */ true, /* throw_on_error= */ false); if (object_info.size == 0 && object_info.last_modification_time == 0 && object_info.metadata.empty()) return {}; @@ -448,7 +450,9 @@ std::optional S3ObjectStorage::tryGetObjectMetadata(const std::s ObjectMetadata S3ObjectStorage::getObjectMetadata(const std::string & path) const { auto settings_ptr = s3_settings.get(); - auto object_info = S3::getObjectInfo(*client.get(), uri.bucket, path, {}, settings_ptr->request_settings, /* with_metadata= */ true, /* for_disk_s3= */ true); + auto object_info = S3::getObjectInfo( + *client.get(), uri.bucket, path, {}, 
settings_ptr->request_settings, + /* with_metadata= */ true, /* for_disk_s3= */ true); ObjectMetadata result; result.size_bytes = object_info.size; diff --git a/src/Storages/ObjectStorage/HDFS/Configuration.cpp b/src/Storages/ObjectStorage/HDFS/Configuration.cpp index 3828afc0bea..594f0b89454 100644 --- a/src/Storages/ObjectStorage/HDFS/Configuration.cpp +++ b/src/Storages/ObjectStorage/HDFS/Configuration.cpp @@ -16,6 +16,7 @@ namespace DB namespace ErrorCodes { extern const int BAD_ARGUMENTS; + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; } StorageHDFSConfiguration::StorageHDFSConfiguration(const StorageHDFSConfiguration & other) @@ -62,6 +63,13 @@ void StorageHDFSConfiguration::fromAST(ASTs & args, ContextPtr context, bool wit std::string url_str; url_str = checkAndGetLiteralArgument(args[0], "url"); + const size_t max_args_num = with_structure ? 4 : 3; + if (args.size() > max_args_num) + { + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Expected not more than {} arguments", max_args_num); + } + if (args.size() > 1) { args[1] = evaluateConstantExpressionOrIdentifierAsLiteral(args[1], context); @@ -72,6 +80,7 @@ void StorageHDFSConfiguration::fromAST(ASTs & args, ContextPtr context, bool wit { if (args.size() > 2) { + args[2] = evaluateConstantExpressionOrIdentifierAsLiteral(args[2], context); structure = checkAndGetLiteralArgument(args[2], "structure"); } if (args.size() > 3) @@ -100,13 +109,14 @@ void StorageHDFSConfiguration::fromNamedCollection(const NamedCollection & colle url_str = collection.get("url"); format = collection.getOrDefault("format", "auto"); - compression_method = collection.getOrDefault("compression_method", collection.getOrDefault("compression", "auto")); + compression_method = collection.getOrDefault("compression_method", + collection.getOrDefault("compression", "auto")); structure = collection.getOrDefault("structure", "auto"); setURL(url_str); } -void StorageHDFSConfiguration::setURL(const std::string url_) +void StorageHDFSConfiguration::setURL(const std::string & url_) { auto pos = url_.find("//"); if (pos == std::string::npos) @@ -117,8 +127,10 @@ void StorageHDFSConfiguration::setURL(const std::string url_) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Bad hdfs url: {}", url_); path = url_.substr(pos + 1); + if (!path.starts_with('/')) + path = '/' + path; + url = url_.substr(0, pos); - path = '/' + path; paths = {path}; LOG_TRACE(getLogger("StorageHDFSConfiguration"), "Using url: {}, path: {}", url, path); diff --git a/src/Storages/ObjectStorage/HDFS/Configuration.h b/src/Storages/ObjectStorage/HDFS/Configuration.h index 8506c7c9700..7154f790665 100644 --- a/src/Storages/ObjectStorage/HDFS/Configuration.h +++ b/src/Storages/ObjectStorage/HDFS/Configuration.h @@ -36,7 +36,7 @@ public: private: void fromNamedCollection(const NamedCollection &) override; void fromAST(ASTs & args, ContextPtr, bool /* with_structure */) override; - void setURL(const std::string url_); + void setURL(const std::string & url_); String url; String path; diff --git a/src/Storages/ObjectStorage/StorageObjectStorageQuerySettings.h b/src/Storages/ObjectStorage/StorageObjectStorageQuerySettings.h index 8bcc2ad3b37..f0687776aa7 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageQuerySettings.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageQuerySettings.h @@ -26,6 +26,7 @@ struct StorageObjectStorageSettings bool skip_empty_files; size_t list_object_keys_size; bool throw_on_zero_files_match; + bool ignore_non_existent_file; }; struct S3StorageSettings 
@@ -40,6 +41,7 @@ struct S3StorageSettings .skip_empty_files = settings.s3_skip_empty_files, .list_object_keys_size = settings.s3_list_object_keys_size, .throw_on_zero_files_match = settings.s3_throw_on_zero_files_match, + .ignore_non_existent_file = settings.s3_ignore_file_doesnt_exist, }; } @@ -62,6 +64,7 @@ struct AzureStorageSettings .skip_empty_files = settings.s3_skip_empty_files, /// TODO: add setting for azure .list_object_keys_size = settings.azure_list_object_keys_size, .throw_on_zero_files_match = settings.s3_throw_on_zero_files_match, + .ignore_non_existent_file = settings.azure_ignore_file_doesnt_exist, }; } @@ -84,6 +87,7 @@ struct HDFSStorageSettings .skip_empty_files = settings.s3_skip_empty_files, /// TODO: add setting for hdfs .list_object_keys_size = settings.s3_list_object_keys_size, /// TODO: add a setting for hdfs .throw_on_zero_files_match = settings.s3_throw_on_zero_files_match, + .ignore_non_existent_file = settings.hdfs_ignore_file_doesnt_exist, }; } diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index 5a88f1436c1..80aa0c210e9 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -100,14 +100,15 @@ std::shared_ptr StorageObjectStorageSourc { /// Iterate through disclosed globs and make a source for each file return std::make_shared( - object_storage, configuration, predicate, virtual_columns, local_context, - read_keys, settings.list_object_keys_size, settings.throw_on_zero_files_match, file_progress_callback); + object_storage, configuration, predicate, virtual_columns, + local_context, read_keys, settings.list_object_keys_size, + settings.throw_on_zero_files_match, file_progress_callback); } else { return std::make_shared( object_storage, configuration, virtual_columns, read_keys, - settings.throw_on_zero_files_match, file_progress_callback); + settings.ignore_non_existent_file, file_progress_callback); } } @@ -331,9 +332,8 @@ std::unique_ptr StorageObjectStorageSource::createReadBuffer(const S } } -StorageObjectStorageSource::IIterator::IIterator(bool throw_on_zero_files_match_, const std::string & logger_name_) - : throw_on_zero_files_match(throw_on_zero_files_match_) - , logger(getLogger(logger_name_)) +StorageObjectStorageSource::IIterator::IIterator(const std::string & logger_name_) + : logger(getLogger(logger_name_)) { } @@ -343,13 +343,8 @@ ObjectInfoPtr StorageObjectStorageSource::IIterator::next(size_t processor) if (object_info) { - first_iteration = false; LOG_TEST(&Poco::Logger::get("KeysIterator"), "Next key: {}", object_info->relative_path); } - else if (first_iteration && throw_on_zero_files_match) - { - throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "Can not match any files"); - } return object_info; } @@ -364,11 +359,12 @@ StorageObjectStorageSource::GlobIterator::GlobIterator( size_t list_object_keys_size, bool throw_on_zero_files_match_, std::function file_progress_callback_) - : IIterator(throw_on_zero_files_match_, "GlobIterator") + : IIterator("GlobIterator") , WithContext(context_) , object_storage(object_storage_) , configuration(configuration_) , virtual_columns(virtual_columns_) + , throw_on_zero_files_match(throw_on_zero_files_match_) , read_keys(read_keys_) , file_progress_callback(file_progress_callback_) { @@ -412,10 +408,24 @@ StorageObjectStorageSource::GlobIterator::GlobIterator( } } -ObjectInfoPtr StorageObjectStorageSource::GlobIterator::nextImpl(size_t /* 
processor */) +ObjectInfoPtr StorageObjectStorageSource::GlobIterator::nextImpl(size_t processor) { std::lock_guard lock(next_mutex); + auto object_info = nextImplUnlocked(processor); + if (object_info) + { + if (first_iteration) + first_iteration = false; + } + else if (first_iteration && throw_on_zero_files_match) + { + throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "Can not match any files"); + } + return object_info; +} +ObjectInfoPtr StorageObjectStorageSource::GlobIterator::nextImplUnlocked(size_t /* processor */) +{ bool current_batch_processed = object_infos.empty() || index >= object_infos.size(); if (is_finished && current_batch_processed) return {}; @@ -485,14 +495,15 @@ StorageObjectStorageSource::KeysIterator::KeysIterator( ConfigurationPtr configuration_, const NamesAndTypesList & virtual_columns_, ObjectInfos * read_keys_, - bool throw_on_zero_files_match_, + bool ignore_non_existent_files_, std::function file_progress_callback_) - : IIterator(throw_on_zero_files_match_, "KeysIterator") + : IIterator("KeysIterator") , object_storage(object_storage_) , configuration(configuration_) , virtual_columns(virtual_columns_) , file_progress_callback(file_progress_callback_) , keys(configuration->getPaths()) + , ignore_non_existent_files(ignore_non_existent_files_) { if (read_keys_) { @@ -507,20 +518,29 @@ StorageObjectStorageSource::KeysIterator::KeysIterator( ObjectInfoPtr StorageObjectStorageSource::KeysIterator::nextImpl(size_t /* processor */) { - size_t current_index = index.fetch_add(1, std::memory_order_relaxed); - if (current_index >= keys.size()) - return {}; - - auto key = keys[current_index]; - - ObjectMetadata metadata{}; - if (file_progress_callback) + while (true) { - metadata = object_storage->getObjectMetadata(key); - file_progress_callback(FileProgress(0, metadata.size_bytes)); - } + size_t current_index = index.fetch_add(1, std::memory_order_relaxed); + if (current_index >= keys.size()) + return {}; - return std::make_shared(key, metadata); + auto key = keys[current_index]; + + ObjectMetadata object_metadata{}; + if (ignore_non_existent_files) + { + auto metadata = object_storage->tryGetObjectMetadata(key); + if (!metadata) + continue; + } + else + object_metadata = object_storage->getObjectMetadata(key); + + if (file_progress_callback) + file_progress_callback(FileProgress(0, object_metadata.size_bytes)); + + return std::make_shared(key, object_metadata); + } } StorageObjectStorageSource::ReaderHolder::ReaderHolder( @@ -555,7 +575,7 @@ StorageObjectStorageSource::ReadTaskIterator::ReadTaskIterator( CurrentMetrics::Metric metric_threads_, CurrentMetrics::Metric metric_threads_active_, CurrentMetrics::Metric metric_threads_scheduled_) - : IIterator(false, "ReadTaskIterator") + : IIterator("ReadTaskIterator") , callback(callback_) { ThreadPool pool(metric_threads_, metric_threads_active_, metric_threads_scheduled_, max_threads_count); diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.h b/src/Storages/ObjectStorage/StorageObjectStorageSource.h index 7c5497a6eaa..3d4cc4fbd20 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.h @@ -133,7 +133,7 @@ protected: class StorageObjectStorageSource::IIterator { public: - IIterator(bool throw_on_zero_files_match_, const std::string & logger_name_); + explicit IIterator(const std::string & logger_name_); virtual ~IIterator() = default; @@ -143,10 +143,6 @@ public: protected: virtual ObjectInfoPtr nextImpl(size_t processor) = 0; - 
-protected: - const bool throw_on_zero_files_match; - bool first_iteration = true; LoggerPtr logger; }; @@ -190,23 +186,26 @@ public: private: ObjectInfoPtr nextImpl(size_t processor) override; + ObjectInfoPtr nextImplUnlocked(size_t processor); void createFilterAST(const String & any_key); - ObjectStoragePtr object_storage; - ConfigurationPtr configuration; - ActionsDAGPtr filter_dag; - NamesAndTypesList virtual_columns; + const ObjectStoragePtr object_storage; + const ConfigurationPtr configuration; + const NamesAndTypesList virtual_columns; + const bool throw_on_zero_files_match; size_t index = 0; ObjectInfos object_infos; ObjectInfos * read_keys; + ActionsDAGPtr filter_dag; ObjectStorageIteratorPtr object_storage_iterator; bool recursive{false}; std::unique_ptr matcher; bool is_finished = false; + bool first_iteration = true; std::mutex next_mutex; std::function file_progress_callback; @@ -220,7 +219,7 @@ public: ConfigurationPtr configuration_, const NamesAndTypesList & virtual_columns_, ObjectInfos * read_keys_, - bool throw_on_zero_files_match_, + bool ignore_non_existent_files_, std::function file_progress_callback = {}); ~KeysIterator() override = default; @@ -236,5 +235,6 @@ private: const std::function file_progress_callback; const std::vector keys; std::atomic index = 0; + bool ignore_non_existent_files; }; } diff --git a/src/Storages/S3Queue/S3QueueSource.cpp b/src/Storages/S3Queue/S3QueueSource.cpp index ee3071ea71f..8e7155205c4 100644 --- a/src/Storages/S3Queue/S3QueueSource.cpp +++ b/src/Storages/S3Queue/S3QueueSource.cpp @@ -45,7 +45,7 @@ StorageS3QueueSource::FileIterator::FileIterator( std::unique_ptr glob_iterator_, size_t current_shard_, std::atomic & shutdown_called_) - : StorageObjectStorageSource::IIterator(false, "S3QueueIterator") + : StorageObjectStorageSource::IIterator("S3QueueIterator") , metadata(metadata_) , glob_iterator(std::move(glob_iterator_)) , shutdown_called(shutdown_called_) diff --git a/tests/integration/test_storage_hdfs/test.py b/tests/integration/test_storage_hdfs/test.py index f6e486d6594..fbf97adcee0 100644 --- a/tests/integration/test_storage_hdfs/test.py +++ b/tests/integration/test_storage_hdfs/test.py @@ -326,7 +326,7 @@ def test_virtual_columns(started_cluster): hdfs_api.write_data("/file1", "1\n") hdfs_api.write_data("/file2", "2\n") hdfs_api.write_data("/file3", "3\n") - expected = "1\tfile1\thdfs://hdfs1:9000/file1\n2\tfile2\thdfs://hdfs1:9000/file2\n3\tfile3\thdfs://hdfs1:9000/file3\n" + expected = "1\tfile1\t/file1\n2\tfile2\t/file2\n3\tfile3\t/file3\n" assert ( node1.query( "select id, _file as file_name, _path as file_path from virtual_cols order by id" @@ -365,7 +365,7 @@ def test_truncate_table(started_cluster): assert hdfs_api.read_data("/tr") == "1\tMark\t72.53\n" assert node1.query("select * from test_truncate") == "1\tMark\t72.53\n" node1.query("truncate table test_truncate") - assert node1.query("select * from test_truncate") == "" + assert node1.query("select * from test_truncate settings hdfs_ignore_file_doesnt_exist=1") == "" node1.query("drop table test_truncate") @@ -488,13 +488,13 @@ def test_hdfsCluster(started_cluster): actual = node1.query( "select id, _file as file_name, _path as file_path from hdfs('hdfs://hdfs1:9000/test_hdfsCluster/file*', 'TSV', 'id UInt32') order by id" ) - expected = "1\tfile1\thdfs://hdfs1:9000/test_hdfsCluster/file1\n2\tfile2\thdfs://hdfs1:9000/test_hdfsCluster/file2\n3\tfile3\thdfs://hdfs1:9000/test_hdfsCluster/file3\n" + expected = 
"1\tfile1\t/test_hdfsCluster/file1\n2\tfile2\t/test_hdfsCluster/file2\n3\tfile3\t/test_hdfsCluster/file3\n" assert actual == expected actual = node1.query( "select id, _file as file_name, _path as file_path from hdfsCluster('test_cluster_two_shards', 'hdfs://hdfs1:9000/test_hdfsCluster/file*', 'TSV', 'id UInt32') order by id" ) - expected = "1\tfile1\thdfs://hdfs1:9000/test_hdfsCluster/file1\n2\tfile2\thdfs://hdfs1:9000/test_hdfsCluster/file2\n3\tfile3\thdfs://hdfs1:9000/test_hdfsCluster/file3\n" + expected = "1\tfile1\t/test_hdfsCluster/file1\n2\tfile2\t/test_hdfsCluster/file2\n3\tfile3\t/test_hdfsCluster/file3\n" assert actual == expected fs.delete(dir, recursive=True) @@ -502,7 +502,7 @@ def test_hdfsCluster(started_cluster): def test_hdfs_directory_not_exist(started_cluster): ddl = "create table HDFSStorageWithNotExistDir (id UInt32, name String, weight Float64) ENGINE = HDFS('hdfs://hdfs1:9000/data/not_eixst', 'TSV')" node1.query(ddl) - assert "" == node1.query("select * from HDFSStorageWithNotExistDir") + assert "" == node1.query("select * from HDFSStorageWithNotExistDir settings hdfs_ignore_file_doesnt_exist=1") def test_overwrite(started_cluster): @@ -658,7 +658,7 @@ def test_virtual_columns_2(started_cluster): node1.query(f"insert into table function {table_function} SELECT 1, 'kek'") result = node1.query(f"SELECT _path FROM {table_function}") - assert result.strip() == "hdfs://hdfs1:9000/parquet_2" + assert result.strip() == "/parquet_2" table_function = ( f"hdfs('hdfs://hdfs1:9000/parquet_3', 'Parquet', 'a Int32, _path String')" diff --git a/tests/queries/0_stateless/02725_database_hdfs.sh b/tests/queries/0_stateless/02725_database_hdfs.sh index d62f928e947..623af707542 100755 --- a/tests/queries/0_stateless/02725_database_hdfs.sh +++ b/tests/queries/0_stateless/02725_database_hdfs.sh @@ -60,7 +60,8 @@ SELECT * FROM \"abacaba/file.tsv\" ${CLICKHOUSE_CLIENT} -q "SELECT * FROM test_hdfs_4.\`http://localhost:11111/test/a.tsv\`" 2>&1 | tr '\n' ' ' | grep -oF -e "UNKNOWN_TABLE" -e "BAD_ARGUMENTS" > /dev/null && echo "OK" || echo 'FAIL' ||: ${CLICKHOUSE_CLIENT} --query "SELECT * FROM test_hdfs_4.\`hdfs://localhost:12222/file.myext\`" 2>&1 | tr '\n' ' ' | grep -oF -e "UNKNOWN_TABLE" -e "CANNOT_EXTRACT_TABLE_STRUCTURE" > /dev/null && echo "OK" || echo 'FAIL' ||: ${CLICKHOUSE_CLIENT} --query "SELECT * FROM test_hdfs_4.\`hdfs://localhost:12222/test_02725_3.tsv\`" 2>&1 | tr '\n' ' ' | grep -oF -e "UNKNOWN_TABLE" -e "CANNOT_EXTRACT_TABLE_STRUCTURE" > /dev/null && echo "OK" || echo 'FAIL' ||: - +${CLICKHOUSE_CLIENT} --query "SELECT * FROM test_hdfs_4.\`hdfs://localhost:12222/file.myext\`" +${CLICKHOUSE_CLIENT} --query "SELECT * FROM test_hdfs_4.\`hdfs://localhost:12222/test_02725_3.tsv\`" ${CLICKHOUSE_CLIENT} --query "SELECT * FROM test_hdfs_4.\`hdfs://localhost:12222\`" 2>&1 | tr '\n' ' ' | grep -oF -e "UNKNOWN_TABLE" -e "BAD_ARGUMENTS" > /dev/null && echo "OK" || echo 'FAIL' ||: From 961704ba173bef199735c52e5296b371a5168f15 Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 28 Mar 2024 15:00:49 +0100 Subject: [PATCH 058/392] Style check --- tests/integration/test_storage_hdfs/test.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_storage_hdfs/test.py b/tests/integration/test_storage_hdfs/test.py index fbf97adcee0..77a55ced5c8 100644 --- a/tests/integration/test_storage_hdfs/test.py +++ b/tests/integration/test_storage_hdfs/test.py @@ -365,7 +365,12 @@ def test_truncate_table(started_cluster): assert hdfs_api.read_data("/tr") == 
"1\tMark\t72.53\n" assert node1.query("select * from test_truncate") == "1\tMark\t72.53\n" node1.query("truncate table test_truncate") - assert node1.query("select * from test_truncate settings hdfs_ignore_file_doesnt_exist=1") == "" + assert ( + node1.query( + "select * from test_truncate settings hdfs_ignore_file_doesnt_exist=1" + ) + == "" + ) node1.query("drop table test_truncate") @@ -502,7 +507,9 @@ def test_hdfsCluster(started_cluster): def test_hdfs_directory_not_exist(started_cluster): ddl = "create table HDFSStorageWithNotExistDir (id UInt32, name String, weight Float64) ENGINE = HDFS('hdfs://hdfs1:9000/data/not_eixst', 'TSV')" node1.query(ddl) - assert "" == node1.query("select * from HDFSStorageWithNotExistDir settings hdfs_ignore_file_doesnt_exist=1") + assert "" == node1.query( + "select * from HDFSStorageWithNotExistDir settings hdfs_ignore_file_doesnt_exist=1" + ) def test_overwrite(started_cluster): From 34a87666ebe932fbedef68ac7fef05f2a6e5880a Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 28 Mar 2024 16:55:39 +0100 Subject: [PATCH 059/392] Update settings changes history --- src/Core/SettingsChangesHistory.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index db6fb2f1c0e..8cde00fcc14 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -124,6 +124,9 @@ static std::map sett {"azure_max_upload_part_size", 5ull*1024*1024*1024, 5ull*1024*1024*1024, "The maximum size of part to upload during multipart upload to Azure blob storage."}, {"azure_upload_part_size_multiply_factor", 2, 2, "Multiply azure_min_upload_part_size by this factor each time azure_multiply_parts_count_threshold parts were uploaded from a single write to Azure blob storage."}, {"azure_upload_part_size_multiply_parts_count_threshold", 500, 500, "Each time this number of parts was uploaded to Azure blob storage, azure_min_upload_part_size is multiplied by azure_upload_part_size_multiply_factor."}, + {"hdfs_ignore_file_doesnt_exist", false, false, "Ignore if files does not exits and return 0 zeros for StorageHDFS"}, + {"azure_ignore_file_doesnt_exist", false, false, "Ignore if files does not exits and return 0 zeros for StorageAzureBlob"}, + {"s3_ignore_file_doesnt_exist", false, false, "Ignore if files does not exits and return 0 zeros for StorageS3"}, }}, {"24.2", {{"allow_suspicious_variant_types", true, false, "Don't allow creating Variant type with suspicious variants by default"}, {"validate_experimental_and_suspicious_types_inside_nested_types", false, true, "Validate usage of experimental and suspicious types inside nested types"}, From 422a3bd672d8c3f7f5bc050eaeca14415a013a60 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Wed, 3 Apr 2024 17:16:51 +0200 Subject: [PATCH 060/392] Update version in SettingsChangesHistory.h --- src/Core/SettingsChangesHistory.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index 9fa1a71f58e..0b90d0216bf 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -85,8 +85,8 @@ namespace SettingsChangesHistory /// It's used to implement `compatibility` setting (see https://github.com/ClickHouse/ClickHouse/issues/35972) static std::map settings_changes_history = { + {"24.4", {{"input_format_tsv_crlf_end_of_line", false, false, "Enables reading of CRLF line endings with TSV formats"}}}, {"24.3", 
{{"s3_connect_timeout_ms", 1000, 1000, "Introduce new dedicated setting for s3 connection timeout"}, - {"input_format_tsv_crlf_end_of_line", false, false, "Enables reading of CRLF line endings with TSV formats"}, {"allow_experimental_shared_merge_tree", false, true, "The setting is obsolete"}, {"use_page_cache_for_disks_without_file_cache", false, false, "Added userspace page cache"}, {"read_from_page_cache_if_exists_otherwise_bypass_cache", false, false, "Added userspace page cache"}, From b24a2afd5fb6c44fd1ecd2435963f3433c61f2af Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 4 Apr 2024 13:21:22 +0200 Subject: [PATCH 061/392] A few more test fixes --- src/TableFunctions/TableFunctionObjectStorageCluster.cpp | 5 +++-- src/TableFunctions/TableFunctionObjectStorageCluster.h | 8 ++++++++ tests/queries/0_stateless/02725_database_hdfs.sh | 6 ++---- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/src/TableFunctions/TableFunctionObjectStorageCluster.cpp b/src/TableFunctions/TableFunctionObjectStorageCluster.cpp index 4ec94cfaf7c..909ace788eb 100644 --- a/src/TableFunctions/TableFunctionObjectStorageCluster.cpp +++ b/src/TableFunctions/TableFunctionObjectStorageCluster.cpp @@ -17,9 +17,8 @@ namespace DB template StoragePtr TableFunctionObjectStorageCluster::executeImpl( const ASTPtr & /*function*/, ContextPtr context, - const std::string & table_name, ColumnsDescription /*cached_columns*/, bool is_insert_query) const + const std::string & table_name, ColumnsDescription cached_columns, bool is_insert_query) const { - using Base = TableFunctionObjectStorage; auto configuration = Base::getConfiguration(); ColumnsDescription columns; @@ -27,6 +26,8 @@ StoragePtr TableFunctionObjectStorageClusterstructure, context); else if (!Base::structure_hint.empty()) columns = Base::structure_hint; + else if (!cached_columns.empty()) + columns = cached_columns; auto object_storage = Base::getObjectStorage(context, !is_insert_query); StoragePtr storage; diff --git a/src/TableFunctions/TableFunctionObjectStorageCluster.h b/src/TableFunctions/TableFunctionObjectStorageCluster.h index 461456e37df..21c2f8995dc 100644 --- a/src/TableFunctions/TableFunctionObjectStorageCluster.h +++ b/src/TableFunctions/TableFunctionObjectStorageCluster.h @@ -67,6 +67,8 @@ public: String getSignature() const override { return signature; } protected: + using Base = TableFunctionObjectStorage; + StoragePtr executeImpl( const ASTPtr & ast_function, ContextPtr context, @@ -75,6 +77,12 @@ protected: bool is_insert_query) const override; const char * getStorageTypeName() const override { return Definition::storage_type_name; } + + bool hasStaticStructure() const override { return Base::getConfiguration()->structure != "auto"; } + + bool needStructureHint() const override { return Base::getConfiguration()->structure == "auto"; } + + void setStructureHint(const ColumnsDescription & structure_hint_) override { Base::structure_hint = structure_hint_; } }; #if USE_AWS_S3 diff --git a/tests/queries/0_stateless/02725_database_hdfs.sh b/tests/queries/0_stateless/02725_database_hdfs.sh index 623af707542..1eb22976b84 100755 --- a/tests/queries/0_stateless/02725_database_hdfs.sh +++ b/tests/queries/0_stateless/02725_database_hdfs.sh @@ -58,10 +58,8 @@ SELECT * FROM \"abacaba/file.tsv\" """ 2>&1 | tr '\n' ' ' | grep -oF "CANNOT_EXTRACT_TABLE_STRUCTURE" ${CLICKHOUSE_CLIENT} -q "SELECT * FROM test_hdfs_4.\`http://localhost:11111/test/a.tsv\`" 2>&1 | tr '\n' ' ' | grep -oF -e "UNKNOWN_TABLE" -e "BAD_ARGUMENTS" > /dev/null && echo "OK" || 
echo 'FAIL' ||: -${CLICKHOUSE_CLIENT} --query "SELECT * FROM test_hdfs_4.\`hdfs://localhost:12222/file.myext\`" 2>&1 | tr '\n' ' ' | grep -oF -e "UNKNOWN_TABLE" -e "CANNOT_EXTRACT_TABLE_STRUCTURE" > /dev/null && echo "OK" || echo 'FAIL' ||: -${CLICKHOUSE_CLIENT} --query "SELECT * FROM test_hdfs_4.\`hdfs://localhost:12222/test_02725_3.tsv\`" 2>&1 | tr '\n' ' ' | grep -oF -e "UNKNOWN_TABLE" -e "CANNOT_EXTRACT_TABLE_STRUCTURE" > /dev/null && echo "OK" || echo 'FAIL' ||: -${CLICKHOUSE_CLIENT} --query "SELECT * FROM test_hdfs_4.\`hdfs://localhost:12222/file.myext\`" -${CLICKHOUSE_CLIENT} --query "SELECT * FROM test_hdfs_4.\`hdfs://localhost:12222/test_02725_3.tsv\`" +${CLICKHOUSE_CLIENT} --query "SELECT * FROM test_hdfs_4.\`hdfs://localhost:12222/file.myext\`" 2>&1 | tr '\n' ' ' | grep -oF -e "UNKNOWN_TABLE" -e "The data format cannot be detected" > /dev/null && echo "OK" || echo 'FAIL' ||: +${CLICKHOUSE_CLIENT} --query "SELECT * FROM test_hdfs_4.\`hdfs://localhost:12222/test_02725_3.tsv\`" 2>&1 | tr '\n' ' ' | grep -oF -e "UNKNOWN_TABLE" -e "The table structure cannot be extracted" > /dev/null && echo "OK" || echo 'FAIL' ||: ${CLICKHOUSE_CLIENT} --query "SELECT * FROM test_hdfs_4.\`hdfs://localhost:12222\`" 2>&1 | tr '\n' ' ' | grep -oF -e "UNKNOWN_TABLE" -e "BAD_ARGUMENTS" > /dev/null && echo "OK" || echo 'FAIL' ||: From aa804e744b1f1c233ef7158431feb4c016d0026c Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 4 Apr 2024 14:05:50 +0200 Subject: [PATCH 062/392] Fix style check --- src/Storages/ObjectStorage/HDFS/Configuration.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/ObjectStorage/HDFS/Configuration.cpp b/src/Storages/ObjectStorage/HDFS/Configuration.cpp index 5a4fb322692..0a49ba5e251 100644 --- a/src/Storages/ObjectStorage/HDFS/Configuration.cpp +++ b/src/Storages/ObjectStorage/HDFS/Configuration.cpp @@ -153,7 +153,7 @@ void StorageHDFSConfiguration::addStructureToArgs(ASTs & args, const String & st { size_t count = args.size(); if (count == 0 || count > 3) - throw Exception(ErrorCodes::LOGICAL_ERROR, + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Expected 1 to 3 arguments in table function, got {}", count); auto structure_literal = std::make_shared(structure_); From e5ffe3cf8d7362335ef6150e7864d5deb74c9479 Mon Sep 17 00:00:00 2001 From: kssenii Date: Fri, 5 Apr 2024 16:15:11 +0200 Subject: [PATCH 063/392] More tests fixes --- src/Storages/MergeTree/KeyCondition.cpp | 7 +++++ .../ObjectStorage/AzureBlob/Configuration.cpp | 3 +- .../ObjectStorage/AzureBlob/Configuration.h | 4 ++- .../ObjectStorage/HDFS/Configuration.cpp | 28 +++++++++++++------ .../ObjectStorage/HDFS/Configuration.h | 4 ++- .../ReadFromStorageObjectStorage.cpp | 3 +- .../ReadFromStorageObjectStorage.h | 1 + .../ObjectStorage/S3/Configuration.cpp | 4 ++- src/Storages/ObjectStorage/S3/Configuration.h | 4 ++- .../ObjectStorage/StorageObjectStorage.cpp | 3 ++ .../StorageObjectStorageConfiguration.h | 1 + .../StorageObjectStorageSource.cpp | 16 ++++++++++- .../StorageObjectStorageSource.h | 2 +- .../TableFunctionObjectStorage.cpp | 4 +-- 14 files changed, 65 insertions(+), 19 deletions(-) diff --git a/src/Storages/MergeTree/KeyCondition.cpp b/src/Storages/MergeTree/KeyCondition.cpp index 2d57ea40c9c..a720e243fdb 100644 --- a/src/Storages/MergeTree/KeyCondition.cpp +++ b/src/Storages/MergeTree/KeyCondition.cpp @@ -2661,6 +2661,13 @@ BoolMask KeyCondition::checkInHyperrectangle( else if (element.function == RPNElement::FUNCTION_IN_RANGE || element.function == 
RPNElement::FUNCTION_NOT_IN_RANGE) { + if (element.key_column >= hyperrectangle.size()) + { + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Hyperrectangle size is {}, but requested element at posittion {} ({})", + hyperrectangle.size(), element.key_column, element.toString()); + } + const Range * key_range = &hyperrectangle[element.key_column]; /// The case when the column is wrapped in a chain of possibly monotonic functions. diff --git a/src/Storages/ObjectStorage/AzureBlob/Configuration.cpp b/src/Storages/ObjectStorage/AzureBlob/Configuration.cpp index 018cec51e7c..fe01251e58a 100644 --- a/src/Storages/ObjectStorage/AzureBlob/Configuration.cpp +++ b/src/Storages/ObjectStorage/AzureBlob/Configuration.cpp @@ -379,7 +379,8 @@ void StorageAzureBlobConfiguration::fromAST(ASTs & engine_args, ContextPtr conte blobs_paths = {blob_path}; } -void StorageAzureBlobConfiguration::addStructureToArgs(ASTs & args, const String & structure_, ContextPtr context) +void StorageAzureBlobConfiguration::addStructureAndFormatToArgs( + ASTs & args, const String & structure_, const String & /* format */, ContextPtr context) { if (tryGetNamedCollectionWithOverrides(args, context)) { diff --git a/src/Storages/ObjectStorage/AzureBlob/Configuration.h b/src/Storages/ObjectStorage/AzureBlob/Configuration.h index 8040d433d99..c12ff81197d 100644 --- a/src/Storages/ObjectStorage/AzureBlob/Configuration.h +++ b/src/Storages/ObjectStorage/AzureBlob/Configuration.h @@ -26,6 +26,7 @@ public: const Paths & getPaths() const override { return blobs_paths; } Paths & getPaths() override { return blobs_paths; } + void setPaths(const Paths & paths) override { blobs_paths = paths; } String getDataSourceDescription() override { return fs::path(connection_url) / container; } String getNamespace() const override { return container; } @@ -36,7 +37,8 @@ public: void fromNamedCollection(const NamedCollection & collection) override; void fromAST(ASTs & args, ContextPtr context, bool with_structure) override; - static void addStructureToArgs(ASTs & args, const String & structure, ContextPtr context); + static void addStructureAndFormatToArgs( + ASTs & args, const String & structure_, const String & format_, ContextPtr context); protected: using AzureClient = Azure::Storage::Blobs::BlobContainerClient; diff --git a/src/Storages/ObjectStorage/HDFS/Configuration.cpp b/src/Storages/ObjectStorage/HDFS/Configuration.cpp index 0a49ba5e251..220857fead6 100644 --- a/src/Storages/ObjectStorage/HDFS/Configuration.cpp +++ b/src/Storages/ObjectStorage/HDFS/Configuration.cpp @@ -139,7 +139,11 @@ void StorageHDFSConfiguration::setURL(const std::string & url_) LOG_TRACE(getLogger("StorageHDFSConfiguration"), "Using url: {}, path: {}", url, path); } -void StorageHDFSConfiguration::addStructureToArgs(ASTs & args, const String & structure_, ContextPtr context) +void StorageHDFSConfiguration::addStructureAndFormatToArgs( + ASTs & args, + const String & structure_, + const String & format_, + ContextPtr context) { if (tryGetNamedCollectionWithOverrides(args, context)) { @@ -152,10 +156,13 @@ void StorageHDFSConfiguration::addStructureToArgs(ASTs & args, const String & st else { size_t count = args.size(); - if (count == 0 || count > 3) + if (count == 0 || count > 4) + { throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, - "Expected 1 to 3 arguments in table function, got {}", count); + "Expected 1 to 4 arguments in table function, got {}", count); + } + auto format_literal = std::make_shared(format_); auto structure_literal = 
std::make_shared(structure_); /// hdfs(url) @@ -168,15 +175,18 @@ void StorageHDFSConfiguration::addStructureToArgs(ASTs & args, const String & st /// hdfs(url, format) else if (count == 2) { + if (checkAndGetLiteralArgument(args[1], "format") == "auto") + args.back() = format_literal; args.push_back(structure_literal); } - /// hdfs(url, format, compression_method) - else if (count == 3) + /// hdfs(url, format, structure) + /// hdfs(url, format, structure, compression_method) + else if (count >= 3) { - auto compression_method = args.back(); - args.pop_back(); - args.push_back(structure_literal); - args.push_back(compression_method); + if (checkAndGetLiteralArgument(args[1], "format") == "auto") + args[1] = format_literal; + if (checkAndGetLiteralArgument(args[2], "structure") == "auto") + args[2] = structure_literal; } } } diff --git a/src/Storages/ObjectStorage/HDFS/Configuration.h b/src/Storages/ObjectStorage/HDFS/Configuration.h index 7dc1f8073c1..23a7e8e4549 100644 --- a/src/Storages/ObjectStorage/HDFS/Configuration.h +++ b/src/Storages/ObjectStorage/HDFS/Configuration.h @@ -21,6 +21,7 @@ public: const Paths & getPaths() const override { return paths; } Paths & getPaths() override { return paths; } + void setPaths(const Paths & paths_) override { paths = paths_; } String getNamespace() const override { return ""; } String getDataSourceDescription() override { return url; } @@ -29,7 +30,8 @@ public: ObjectStoragePtr createObjectStorage(ContextPtr context, bool is_readonly = true) override; /// NOLINT StorageObjectStorageConfigurationPtr clone() override { return std::make_shared(*this); } - static void addStructureToArgs(ASTs &, const String &, ContextPtr); + static void addStructureAndFormatToArgs( + ASTs & args, const String & structure_, const String & format_, ContextPtr context); std::string getPathWithoutGlob() const override; diff --git a/src/Storages/ObjectStorage/ReadFromStorageObjectStorage.cpp b/src/Storages/ObjectStorage/ReadFromStorageObjectStorage.cpp index f2595299430..89d33191f41 100644 --- a/src/Storages/ObjectStorage/ReadFromStorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/ReadFromStorageObjectStorage.cpp @@ -9,6 +9,7 @@ ReadFromStorageObejctStorage::ReadFromStorageObejctStorage( ObjectStoragePtr object_storage_, ConfigurationPtr configuration_, const String & name_, + const Names & columns_to_read, const NamesAndTypesList & virtual_columns_, const SelectQueryInfo & query_info_, const StorageSnapshotPtr & storage_snapshot_, @@ -24,7 +25,7 @@ ReadFromStorageObejctStorage::ReadFromStorageObejctStorage( CurrentMetrics::Metric metric_threads_count_, CurrentMetrics::Metric metric_threads_active_, CurrentMetrics::Metric metric_threads_scheduled_) - : SourceStepWithFilter(DataStream{.header = info_.source_header}, info_.requested_columns.getNames(), query_info_, storage_snapshot_, context_) + : SourceStepWithFilter(DataStream{.header = info_.source_header}, columns_to_read, query_info_, storage_snapshot_, context_) , object_storage(object_storage_) , configuration(configuration_) , info(std::move(info_)) diff --git a/src/Storages/ObjectStorage/ReadFromStorageObjectStorage.h b/src/Storages/ObjectStorage/ReadFromStorageObjectStorage.h index 44b992f8c12..c0dd02d75f8 100644 --- a/src/Storages/ObjectStorage/ReadFromStorageObjectStorage.h +++ b/src/Storages/ObjectStorage/ReadFromStorageObjectStorage.h @@ -15,6 +15,7 @@ public: ObjectStoragePtr object_storage_, ConfigurationPtr configuration_, const String & name_, + const Names & columns_to_read, const NamesAndTypesList & 
virtual_columns_, const SelectQueryInfo & query_info_, const StorageSnapshotPtr & storage_snapshot_, diff --git a/src/Storages/ObjectStorage/S3/Configuration.cpp b/src/Storages/ObjectStorage/S3/Configuration.cpp index 132a5045d8a..f532af24017 100644 --- a/src/Storages/ObjectStorage/S3/Configuration.cpp +++ b/src/Storages/ObjectStorage/S3/Configuration.cpp @@ -330,7 +330,8 @@ void StorageS3Configuration::fromAST(ASTs & args, ContextPtr context, bool with_ keys = {url.key}; } -void StorageS3Configuration::addStructureToArgs(ASTs & args, const String & structure_, ContextPtr context) +void StorageS3Configuration::addStructureAndFormatToArgs( + ASTs & args, const String & structure_, const String & format_, ContextPtr context) { if (tryGetNamedCollectionWithOverrides(args, context)) { @@ -348,6 +349,7 @@ void StorageS3Configuration::addStructureToArgs(ASTs & args, const String & stru if (count == 0 || count > 6) throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected 1 to 6 arguments in table function, got {}", count); + auto format_literal = std::make_shared(format_); auto structure_literal = std::make_shared(structure_); /// s3(s3_url) diff --git a/src/Storages/ObjectStorage/S3/Configuration.h b/src/Storages/ObjectStorage/S3/Configuration.h index f9614da4b95..ff5e8680e66 100644 --- a/src/Storages/ObjectStorage/S3/Configuration.h +++ b/src/Storages/ObjectStorage/S3/Configuration.h @@ -22,6 +22,7 @@ public: const Paths & getPaths() const override { return keys; } Paths & getPaths() override { return keys; } + void setPaths(const Paths & paths) override { keys = paths; } String getNamespace() const override { return url.bucket; } String getDataSourceDescription() override; @@ -33,7 +34,8 @@ public: bool isStaticConfiguration() const override { return static_configuration; } ObjectStoragePtr createObjectStorage(ContextPtr context, bool is_readonly = true) override; /// NOLINT - static void addStructureToArgs(ASTs & args, const String & structure, ContextPtr context); + static void addStructureAndFormatToArgs( + ASTs & args, const String & structure, const String & format, ContextPtr context); private: void fromNamedCollection(const NamedCollection & collection) override; diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index 84810c117c9..8fc3de4de1b 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -158,10 +158,13 @@ void StorageObjectStorage::read( const bool need_only_count = (query_info.optimize_trivial_count || read_from_format_info.requested_columns.empty()) && local_context->getSettingsRef().optimize_count_from_files; + LOG_TEST(&Poco::Logger::get("KSSENII"), "KSSENII SOURCE HEADER: {}", read_from_format_info.source_header.dumpStructure()); + LOG_TEST(&Poco::Logger::get("KSSENII"), "KSSENII FORMAT HEADER: {}", read_from_format_info.format_header.dumpStructure()); auto read_step = std::make_unique( object_storage, configuration, getName(), + column_names, getVirtualsList(), query_info, storage_snapshot, diff --git a/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.h b/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.h index 48825c6a012..647575aaa90 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.h @@ -32,6 +32,7 @@ public: virtual const Paths & getPaths() const = 0; virtual Paths & getPaths() = 0; + virtual void setPaths(const Paths 
& paths) = 0; virtual String getDataSourceDescription() = 0; virtual String getNamespace() const = 0; diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index fd3ac58b1a2..30316af987c 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -106,8 +106,21 @@ std::shared_ptr StorageObjectStorageSourc } else { + ConfigurationPtr copy_configuration = configuration->clone(); + auto filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns); + if (filter_dag) + { + auto keys = configuration->getPaths(); + std::vector paths; + paths.reserve(keys.size()); + for (const auto & key : keys) + paths.push_back(fs::path(configuration->getNamespace()) / key); + VirtualColumnUtils::filterByPathOrFile(keys, paths, filter_dag, virtual_columns, local_context); + copy_configuration->setPaths(keys); + } + return std::make_shared( - object_storage, configuration, virtual_columns, read_keys, + object_storage, copy_configuration, virtual_columns, read_keys, settings.ignore_non_existent_file, file_progress_callback); } } @@ -247,6 +260,7 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade const auto max_parsing_threads = need_only_count ? std::optional(1) : std::nullopt; read_buf = createReadBuffer(object_info->relative_path, object_info->metadata->size_bytes); + LOG_TEST(&Poco::Logger::get("KSSENII"), "KSSENII HEADER: {}", read_from_format_info.format_header.dumpStructure()); auto input_format = FormatFactory::instance().getInput( configuration->format, *read_buf, read_from_format_info.format_header, getContext(), max_block_size, format_settings, max_parsing_threads, diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.h b/src/Storages/ObjectStorage/StorageObjectStorageSource.h index 3d4cc4fbd20..28962aadecd 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.h @@ -45,7 +45,7 @@ public: void setKeyCondition(const ActionsDAGPtr & filter_actions_dag, ContextPtr context_) override { - setKeyConditionImpl(filter_actions_dag, context_, read_from_format_info.source_header); + setKeyConditionImpl(filter_actions_dag, context_, read_from_format_info.format_header); } Chunk generate() override; diff --git a/src/TableFunctions/TableFunctionObjectStorage.cpp b/src/TableFunctions/TableFunctionObjectStorage.cpp index d407017d5f7..9223642a7e6 100644 --- a/src/TableFunctions/TableFunctionObjectStorage.cpp +++ b/src/TableFunctions/TableFunctionObjectStorage.cpp @@ -65,9 +65,9 @@ std::vector TableFunctionObjectStorage< template void TableFunctionObjectStorage::updateStructureAndFormatArgumentsIfNeeded( - ASTs & args, const String & structure, const String & /* format */, const ContextPtr & context) + ASTs & args, const String & structure, const String & format, const ContextPtr & context) { - Configuration::addStructureToArgs(args, structure, context); + Configuration::addStructureAndFormatToArgs(args, structure, format, context); } template From e8f02af78c418f7c0a521bd48d49fcfb91db455f Mon Sep 17 00:00:00 2001 From: Duc Canh Le Date: Tue, 9 Apr 2024 09:49:32 +0000 Subject: [PATCH 064/392] fix part splitter wrongly add ranges with undefined end mark value to non-intersect part Signed-off-by: Duc Canh Le --- src/Processors/QueryPlan/PartsSplitter.cpp | 11 ++++------ .../03033_final_undefined_last_mark.reference | 1 + 
.../03033_final_undefined_last_mark.sql | 21 +++++++++++++++++++ 3 files changed, 26 insertions(+), 7 deletions(-) create mode 100644 tests/queries/0_stateless/03033_final_undefined_last_mark.reference create mode 100644 tests/queries/0_stateless/03033_final_undefined_last_mark.sql diff --git a/src/Processors/QueryPlan/PartsSplitter.cpp b/src/Processors/QueryPlan/PartsSplitter.cpp index 2af1bcb0260..d3425bce2a3 100644 --- a/src/Processors/QueryPlan/PartsSplitter.cpp +++ b/src/Processors/QueryPlan/PartsSplitter.cpp @@ -609,14 +609,11 @@ SplitPartsRangesResult splitPartsRanges(RangesInDataParts ranges_in_data_parts, } /// Process parts ranges with undefined value at end mark - bool is_intersecting = part_index_start_to_range.size() > 1; + /// The last parts ranges could be non-intersect only if: (1) there is only one part range left, (2) it belongs to a non-L0 part, + /// and (3) the begin value of this range is larger than the largest end value of all previous ranges. This is too complicated + /// to check, so we just add the last part ranges to the intersecting ranges. for (const auto & [part_range_index, mark_range] : part_index_start_to_range) - { - if (is_intersecting) - add_intersecting_range(part_range_index.part_index, mark_range); - else - add_non_intersecting_range(part_range_index.part_index, mark_range); - } + add_intersecting_range(part_range_index.part_index, mark_range); auto && non_intersecting_ranges_in_data_parts = std::move(non_intersecting_ranges_in_data_parts_builder.getCurrentRangesInDataParts()); auto && intersecting_ranges_in_data_parts = std::move(intersecting_ranges_in_data_parts_builder.getCurrentRangesInDataParts()); diff --git a/tests/queries/0_stateless/03033_final_undefined_last_mark.reference b/tests/queries/0_stateless/03033_final_undefined_last_mark.reference new file mode 100644 index 00000000000..7b82946b108 --- /dev/null +++ b/tests/queries/0_stateless/03033_final_undefined_last_mark.reference @@ -0,0 +1 @@ +GOOD 11338881281426660955 14765404159170880511 diff --git a/tests/queries/0_stateless/03033_final_undefined_last_mark.sql b/tests/queries/0_stateless/03033_final_undefined_last_mark.sql new file mode 100644 index 00000000000..183406f803c --- /dev/null +++ b/tests/queries/0_stateless/03033_final_undefined_last_mark.sql @@ -0,0 +1,21 @@ +-- Tags: no-random-settings, no-random-merge-tree-settings + +CREATE TABLE account_test +( + `id` UInt64, + `row_ver` UInt64, +) +ENGINE = ReplacingMergeTree(row_ver) +PARTITION BY id % 64 +ORDER BY id +SETTINGS index_granularity = 512, index_granularity_bytes = 0, + min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0, + min_rows_for_compact_part = 0, min_bytes_for_compact_part = 0; + +INSERT INTO account_test + SELECT * FROM generateRandom('id UInt64, row_ver UInt64',1234) LIMIT 50000; + +INSERT INTO account_test + SELECT * FROM (SELECT * FROM generateRandom('id UInt64, row_ver UInt64',1234) LIMIT 1000) WHERE row_ver > 14098131981223776000; + +SELECT 'GOOD', * FROM account_test FINAL WHERE id = 11338881281426660955 SETTINGS split_parts_ranges_into_intersecting_and_non_intersecting_final = 1; From f41d88b990052e06ae7dd87826662d664c4f54e8 Mon Sep 17 00:00:00 2001 From: Duc Canh Le Date: Wed, 10 Apr 2024 05:43:50 +0000 Subject: [PATCH 065/392] add reference query to test Signed-off-by: Duc Canh Le --- .../0_stateless/03033_final_undefined_last_mark.reference | 1 + tests/queries/0_stateless/03033_final_undefined_last_mark.sql | 2 ++ 2 files changed, 3 insertions(+) diff --git 
a/tests/queries/0_stateless/03033_final_undefined_last_mark.reference b/tests/queries/0_stateless/03033_final_undefined_last_mark.reference index 7b82946b108..bf0a25f24e4 100644 --- a/tests/queries/0_stateless/03033_final_undefined_last_mark.reference +++ b/tests/queries/0_stateless/03033_final_undefined_last_mark.reference @@ -1 +1,2 @@ GOOD 11338881281426660955 14765404159170880511 +GOOD 11338881281426660955 14765404159170880511 diff --git a/tests/queries/0_stateless/03033_final_undefined_last_mark.sql b/tests/queries/0_stateless/03033_final_undefined_last_mark.sql index 183406f803c..2c13da42ca4 100644 --- a/tests/queries/0_stateless/03033_final_undefined_last_mark.sql +++ b/tests/queries/0_stateless/03033_final_undefined_last_mark.sql @@ -18,4 +18,6 @@ INSERT INTO account_test INSERT INTO account_test SELECT * FROM (SELECT * FROM generateRandom('id UInt64, row_ver UInt64',1234) LIMIT 1000) WHERE row_ver > 14098131981223776000; +SELECT 'GOOD', * FROM account_test FINAL WHERE id = 11338881281426660955 SETTINGS split_parts_ranges_into_intersecting_and_non_intersecting_final = 0; SELECT 'GOOD', * FROM account_test FINAL WHERE id = 11338881281426660955 SETTINGS split_parts_ranges_into_intersecting_and_non_intersecting_final = 1; + From 14c461338b12719daa1dc044148f914fd6a5fac6 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Wed, 10 Apr 2024 12:56:29 +0200 Subject: [PATCH 066/392] Replay ZK logs using keeper-bench --- src/Common/ZooKeeper/ZooKeeperImpl.cpp | 4 +- src/Common/ZooKeeper/ZooKeeperImpl.h | 3 +- utils/keeper-bench/CMakeLists.txt | 3 +- utils/keeper-bench/Generator.cpp | 194 +----- utils/keeper-bench/Generator.h | 18 - utils/keeper-bench/Runner.cpp | 821 ++++++++++++++++++++++++- utils/keeper-bench/Runner.h | 77 ++- utils/keeper-bench/main.cpp | 24 +- 8 files changed, 875 insertions(+), 269 deletions(-) diff --git a/src/Common/ZooKeeper/ZooKeeperImpl.cpp b/src/Common/ZooKeeper/ZooKeeperImpl.cpp index 2185d32e47a..ed7498b1ac9 100644 --- a/src/Common/ZooKeeper/ZooKeeperImpl.cpp +++ b/src/Common/ZooKeeper/ZooKeeperImpl.cpp @@ -1259,11 +1259,13 @@ void ZooKeeper::initFeatureFlags() void ZooKeeper::executeGenericRequest( const ZooKeeperRequestPtr & request, - ResponseCallback callback) + ResponseCallback callback, + WatchCallbackPtr watch) { RequestInfo request_info; request_info.request = request; request_info.callback = callback; + request_info.watch = watch; pushRequest(std::move(request_info)); } diff --git a/src/Common/ZooKeeper/ZooKeeperImpl.h b/src/Common/ZooKeeper/ZooKeeperImpl.h index cf331a03d06..8fdf0f97d9d 100644 --- a/src/Common/ZooKeeper/ZooKeeperImpl.h +++ b/src/Common/ZooKeeper/ZooKeeperImpl.h @@ -139,7 +139,8 @@ public: void executeGenericRequest( const ZooKeeperRequestPtr & request, - ResponseCallback callback); + ResponseCallback callback, + WatchCallbackPtr watch = nullptr); /// See the documentation about semantics of these methods in IKeeper class. 
diff --git a/utils/keeper-bench/CMakeLists.txt b/utils/keeper-bench/CMakeLists.txt index 5514c34f4ef..4fe0d852fd2 100644 --- a/utils/keeper-bench/CMakeLists.txt +++ b/utils/keeper-bench/CMakeLists.txt @@ -4,5 +4,4 @@ if (NOT TARGET ch_contrib::rapidjson) endif () clickhouse_add_executable(keeper-bench Generator.cpp Runner.cpp Stats.cpp main.cpp) -target_link_libraries(keeper-bench PRIVATE dbms) -target_link_libraries(keeper-bench PRIVATE ch_contrib::rapidjson) +target_link_libraries(keeper-bench PRIVATE dbms clickhouse_functions ch_contrib::rapidjson) diff --git a/utils/keeper-bench/Generator.cpp b/utils/keeper-bench/Generator.cpp index 2212f7158ae..cbf1bcdae23 100644 --- a/utils/keeper-bench/Generator.cpp +++ b/utils/keeper-bench/Generator.cpp @@ -40,54 +40,6 @@ std::string generateRandomString(size_t length) } } -void removeRecursive(Coordination::ZooKeeper & zookeeper, const std::string & path) -{ - namespace fs = std::filesystem; - - auto promise = std::make_shared>(); - auto future = promise->get_future(); - - Strings children; - auto list_callback = [promise, &children] (const ListResponse & response) - { - children = response.names; - - promise->set_value(); - }; - zookeeper.list(path, ListRequestType::ALL, list_callback, nullptr); - future.get(); - - while (!children.empty()) - { - Coordination::Requests ops; - for (size_t i = 0; i < MULTI_BATCH_SIZE && !children.empty(); ++i) - { - removeRecursive(zookeeper, fs::path(path) / children.back()); - ops.emplace_back(makeRemoveRequest(fs::path(path) / children.back(), -1)); - children.pop_back(); - } - auto multi_promise = std::make_shared>(); - auto multi_future = multi_promise->get_future(); - - auto multi_callback = [multi_promise] (const MultiResponse &) - { - multi_promise->set_value(); - }; - zookeeper.multi(ops, multi_callback); - multi_future.get(); - } - auto remove_promise = std::make_shared>(); - auto remove_future = remove_promise->get_future(); - - auto remove_callback = [remove_promise] (const RemoveResponse &) - { - remove_promise->set_value(); - }; - - zookeeper.remove(path, -1, remove_callback); - remove_future.get(); -} - NumberGetter NumberGetter::fromConfig(const std::string & key, const Poco::Util::AbstractConfiguration & config, std::optional default_value) { @@ -603,148 +555,16 @@ Generator::Generator(const Poco::Util::AbstractConfiguration & config) acl.id = "anyone"; default_acls.emplace_back(std::move(acl)); - static const std::string generator_key = "generator"; - - std::cerr << "---- Parsing setup ---- " << std::endl; - static const std::string setup_key = generator_key + ".setup"; - Poco::Util::AbstractConfiguration::Keys keys; - config.keys(setup_key, keys); - for (const auto & key : keys) - { - if (key.starts_with("node")) - { - auto node_key = setup_key + "." 
+ key; - auto parsed_root_node = parseNode(node_key, config); - const auto node = root_nodes.emplace_back(parsed_root_node); - - if (config.has(node_key + ".repeat")) - { - if (!node->name.isRandom()) - throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "Repeating node creation for key {}, but name is not randomly generated", node_key); - - auto repeat_count = config.getUInt64(node_key + ".repeat"); - node->repeat_count = repeat_count; - for (size_t i = 1; i < repeat_count; ++i) - root_nodes.emplace_back(node->clone()); - } - - std::cerr << "Tree to create:" << std::endl; - - node->dumpTree(); - std::cerr << std::endl; - } - } - std::cerr << "---- Done parsing data setup ----\n" << std::endl; - std::cerr << "---- Collecting request generators ----" << std::endl; - static const std::string requests_key = generator_key + ".requests"; + static const std::string requests_key = "generator.requests"; request_getter = RequestGetter::fromConfig(requests_key, config); std::cerr << request_getter.description() << std::endl; std::cerr << "---- Done collecting request generators ----\n" << std::endl; } -std::shared_ptr Generator::parseNode(const std::string & key, const Poco::Util::AbstractConfiguration & config) -{ - auto node = std::make_shared(); - node->name = StringGetter::fromConfig(key + ".name", config); - - if (config.has(key + ".data")) - node->data = StringGetter::fromConfig(key + ".data", config); - - Poco::Util::AbstractConfiguration::Keys node_keys; - config.keys(key, node_keys); - - for (const auto & node_key : node_keys) - { - if (!node_key.starts_with("node")) - continue; - - const auto node_key_string = key + "." + node_key; - auto child_node = parseNode(node_key_string, config); - node->children.push_back(child_node); - - if (config.has(node_key_string + ".repeat")) - { - if (!child_node->name.isRandom()) - throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "Repeating node creation for key {}, but name is not randomly generated", node_key_string); - - auto repeat_count = config.getUInt64(node_key_string + ".repeat"); - child_node->repeat_count = repeat_count; - for (size_t i = 1; i < repeat_count; ++i) - node->children.push_back(child_node); - } - } - - return node; -} - -void Generator::Node::dumpTree(int level) const -{ - std::string data_string - = data.has_value() ? fmt::format("{}", data->description()) : "no data"; - - std::string repeat_count_string = repeat_count != 0 ? fmt::format(", repeated {} times", repeat_count) : ""; - - std::cerr << fmt::format("{}name: {}, data: {}{}", std::string(level, '\t'), name.description(), data_string, repeat_count_string) << std::endl; - - for (auto it = children.begin(); it != children.end();) - { - const auto & child = *it; - child->dumpTree(level + 1); - std::advance(it, child->repeat_count != 0 ? 
child->repeat_count : 1); - } -} - -std::shared_ptr Generator::Node::clone() const -{ - auto new_node = std::make_shared(); - new_node->name = name; - new_node->data = data; - new_node->repeat_count = repeat_count; - - // don't do deep copy of children because we will do clone only for root nodes - new_node->children = children; - - return new_node; -} - -void Generator::Node::createNode(Coordination::ZooKeeper & zookeeper, const std::string & parent_path, const Coordination::ACLs & acls) const -{ - auto path = std::filesystem::path(parent_path) / name.getString(); - auto promise = std::make_shared>(); - auto future = promise->get_future(); - auto create_callback = [promise] (const CreateResponse & response) - { - if (response.error != Coordination::Error::ZOK) - promise->set_exception(std::make_exception_ptr(zkutil::KeeperException(response.error))); - else - promise->set_value(); - }; - zookeeper.create(path, data ? data->getString() : "", false, false, acls, create_callback); - future.get(); - - for (const auto & child : children) - child->createNode(zookeeper, path, acls); -} - void Generator::startup(Coordination::ZooKeeper & zookeeper) { - std::cerr << "---- Creating test data ----" << std::endl; - for (const auto & node : root_nodes) - { - auto node_name = node->name.getString(); - node->name.setString(node_name); - - std::string root_path = std::filesystem::path("/") / node_name; - std::cerr << "Cleaning up " << root_path << std::endl; - removeRecursive(zookeeper, root_path); - - node->createNode(zookeeper, "/", default_acls); - } - std::cerr << "---- Created test data ----\n" << std::endl; - std::cerr << "---- Initializing generators ----" << std::endl; - request_getter.startup(zookeeper); } @@ -752,15 +572,3 @@ Coordination::ZooKeeperRequestPtr Generator::generate() { return request_getter.getRequestGenerator()->generate(default_acls); } - -void Generator::cleanup(Coordination::ZooKeeper & zookeeper) -{ - std::cerr << "---- Cleaning up test data ----" << std::endl; - for (const auto & node : root_nodes) - { - auto node_name = node->name.getString(); - std::string root_path = std::filesystem::path("/") / node_name; - std::cerr << "Cleaning up " << root_path << std::endl; - removeRecursive(zookeeper, root_path); - } -} diff --git a/utils/keeper-bench/Generator.h b/utils/keeper-bench/Generator.h index 5b4c05b2d8b..35dce1a95d9 100644 --- a/utils/keeper-bench/Generator.h +++ b/utils/keeper-bench/Generator.h @@ -173,27 +173,9 @@ public: void startup(Coordination::ZooKeeper & zookeeper); Coordination::ZooKeeperRequestPtr generate(); - void cleanup(Coordination::ZooKeeper & zookeeper); private: - struct Node - { - StringGetter name; - std::optional data; - std::vector> children; - size_t repeat_count = 0; - - std::shared_ptr clone() const; - - void createNode(Coordination::ZooKeeper & zookeeper, const std::string & parent_path, const Coordination::ACLs & acls) const; - void dumpTree(int level = 0) const; - }; - - static std::shared_ptr parseNode(const std::string & key, const Poco::Util::AbstractConfiguration & config); std::uniform_int_distribution request_picker; - std::vector> root_nodes; RequestGetter request_getter; Coordination::ACLs default_acls; }; - -std::optional getGenerator(const std::string & name); diff --git a/utils/keeper-bench/Runner.cpp b/utils/keeper-bench/Runner.cpp index a4b579f1f7b..8b111f5adb9 100644 --- a/utils/keeper-bench/Runner.cpp +++ b/utils/keeper-bench/Runner.cpp @@ -1,14 +1,28 @@ #include "Runner.h" +#include +#include #include +#include 
"Common/ConcurrentBoundedQueue.h" +#include "Common/ZooKeeper/IKeeper.h" +#include "Common/ZooKeeper/ZooKeeperArgs.h" #include "Common/ZooKeeper/ZooKeeperCommon.h" #include "Common/ZooKeeper/ZooKeeperConstants.h" #include #include -#include "IO/ReadBufferFromString.h" +#include "Core/ColumnWithTypeAndName.h" +#include "Core/ColumnsWithTypeAndName.h" +#include "IO/ReadBuffer.h" +#include "IO/ReadBufferFromFile.h" +#include "base/Decimal.h" +#include "base/types.h" +#include #include #include #include +#include +#include +#include namespace CurrentMetrics @@ -22,23 +36,41 @@ namespace DB::ErrorCodes { extern const int CANNOT_BLOCK_SIGNAL; extern const int BAD_ARGUMENTS; + extern const int LOGICAL_ERROR; } Runner::Runner( std::optional concurrency_, const std::string & config_path, + const std::string & input_request_log_, const Strings & hosts_strings_, std::optional max_time_, std::optional delay_, std::optional continue_on_error_, std::optional max_iterations_) - : info(std::make_shared()) + : input_request_log(input_request_log_) + , info(std::make_shared()) { DB::ConfigProcessor config_processor(config_path, true, false); - auto config = config_processor.loadConfig().configuration; + DB::ConfigurationPtr config = nullptr; + + if (!config_path.empty()) + { + config = config_processor.loadConfig().configuration; + + if (config->has("generator")) + generator.emplace(*config); + } + else + { + if (input_request_log.empty()) + throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "Both --config and --input_request_log cannot be empty"); + + if (!std::filesystem::exists(input_request_log)) + throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "File on path {} does not exist", input_request_log); + } - generator.emplace(*config); if (!hosts_strings_.empty()) { @@ -57,6 +89,8 @@ Runner::Runner( static constexpr uint64_t DEFAULT_CONCURRENCY = 1; if (concurrency_) concurrency = *concurrency_; + else if (!config) + concurrency = DEFAULT_CONCURRENCY; else concurrency = config->getUInt64("concurrency", DEFAULT_CONCURRENCY); std::cerr << "Concurrency: " << concurrency << std::endl; @@ -64,6 +98,8 @@ Runner::Runner( static constexpr uint64_t DEFAULT_ITERATIONS = 0; if (max_iterations_) max_iterations = *max_iterations_; + else if (!config) + max_iterations = DEFAULT_ITERATIONS; else max_iterations = config->getUInt64("iterations", DEFAULT_ITERATIONS); std::cerr << "Iterations: " << max_iterations << std::endl; @@ -71,6 +107,8 @@ Runner::Runner( static constexpr double DEFAULT_DELAY = 1.0; if (delay_) delay = *delay_; + else if (!config) + delay = DEFAULT_DELAY; else delay = config->getDouble("report_delay", DEFAULT_DELAY); std::cerr << "Report delay: " << delay << std::endl; @@ -78,44 +116,48 @@ Runner::Runner( static constexpr double DEFAULT_TIME_LIMIT = 0.0; if (max_time_) max_time = *max_time_; + else if (!config) + max_time = DEFAULT_TIME_LIMIT; else max_time = config->getDouble("timelimit", DEFAULT_TIME_LIMIT); std::cerr << "Time limit: " << max_time << std::endl; if (continue_on_error_) continue_on_error = *continue_on_error_; + else if (!config) + continue_on_error_ = false; else continue_on_error = config->getBool("continue_on_error", false); std::cerr << "Continue on error: " << continue_on_error << std::endl; - static const std::string output_key = "output"; - print_to_stdout = config->getBool(output_key + ".stdout", false); - std::cerr << "Printing output to stdout: " << print_to_stdout << std::endl; - - static const std::string output_file_key = output_key + ".file"; - if 
(config->has(output_file_key)) + if (config) { - if (config->has(output_file_key + ".path")) - { - file_output = config->getString(output_file_key + ".path"); - output_file_with_timestamp = config->getBool(output_file_key + ".with_timestamp"); - } - else - file_output = config->getString(output_file_key); + benchmark_context.initializeFromConfig(*config); - std::cerr << "Result file path: " << file_output->string() << std::endl; + static const std::string output_key = "output"; + print_to_stdout = config->getBool(output_key + ".stdout", false); + std::cerr << "Printing output to stdout: " << print_to_stdout << std::endl; + + static const std::string output_file_key = output_key + ".file"; + if (config->has(output_file_key)) + { + if (config->has(output_file_key + ".path")) + { + file_output = config->getString(output_file_key + ".path"); + output_file_with_timestamp = config->getBool(output_file_key + ".with_timestamp"); + } + else + file_output = config->getString(output_file_key); + + std::cerr << "Result file path: " << file_output->string() << std::endl; + } } std::cerr << "---- Run options ----\n" << std::endl; - - pool.emplace(CurrentMetrics::LocalThread, CurrentMetrics::LocalThreadActive, CurrentMetrics::LocalThreadScheduled, concurrency); - queue.emplace(concurrency); } void Runner::parseHostsFromConfig(const Poco::Util::AbstractConfiguration & config) { - ConnectionInfo default_connection_info; - const auto fill_connection_details = [&](const std::string & key, auto & connection_info) { if (config.has(key + ".secure")) @@ -328,9 +370,519 @@ bool Runner::tryPushRequestInteractively(Coordination::ZooKeeperRequestPtr && re void Runner::runBenchmark() { + if (generator) + runBenchmarkWithGenerator(); + else + runBenchmarkFromLog(); +} + + +struct ZooKeeperRequestBlock +{ + explicit ZooKeeperRequestBlock(DB::Block block_) + : block(std::move(block_)) + , hostname_idx(block.getPositionByName("hostname")) // + , request_event_time_idx(block.getPositionByName("request_event_time")) // + , thread_id_idx(block.getPositionByName("thread_id")) // + , session_id_idx(block.getPositionByName("session_id")) // + , xid_idx(block.getPositionByName("xid")) // + , has_watch_idx(block.getPositionByName("has_watch")) + , op_num_idx(block.getPositionByName("op_num")) + , path_idx(block.getPositionByName("path")) + , data_idx(block.getPositionByName("data")) + , is_ephemeral_idx(block.getPositionByName("is_ephemeral")) + , is_sequential_idx(block.getPositionByName("is_sequential")) + , response_event_time_idx(block.getPositionByName("response_event_time")) // + , error_idx(block.getPositionByName("error")) + , requests_size_idx(block.getPositionByName("requests_size")) + , version_idx(block.getPositionByName("version")) + {} + + size_t rows() const + { + return block.rows(); + } + + UInt64 getExecutorId(size_t row) const + { + return getSessionId(row); + } + + std::string getHostname(size_t row) const + { + return getField(hostname_idx, row).safeGet(); + } + + UInt64 getThreadId(size_t row) const + { + return getField(thread_id_idx, row).safeGet(); + } + + DB::DateTime64 getRequestEventTime(size_t row) const + { + return getField(request_event_time_idx, row).safeGet(); + } + + DB::DateTime64 getResponseEventTime(size_t row) const + { + return getField(response_event_time_idx, row).safeGet(); + } + + Int64 getSessionId(size_t row) const + { + return getField(session_id_idx, row).safeGet(); + } + + Int64 getXid(size_t row) const + { + return getField(xid_idx, row).safeGet(); + } + + bool hasWatch(size_t 
row) const + { + return getField(has_watch_idx, row).safeGet(); + } + + Coordination::OpNum getOpNum(size_t row) const + { + return static_cast(getField(op_num_idx, row).safeGet()); + } + + bool isEphemeral(size_t row) const + { + return getField(is_ephemeral_idx, row).safeGet(); + } + + bool isSequential(size_t row) const + { + return getField(is_sequential_idx, row).safeGet(); + } + + std::string getPath(size_t row) const + { + return getField(path_idx, row).safeGet(); + } + + std::string getData(size_t row) const + { + return getField(data_idx, row).safeGet(); + } + + UInt64 getRequestsSize(size_t row) const + { + return getField(requests_size_idx, row).safeGet(); + } + + std::optional getVersion(size_t row) const + { + auto field = getField(version_idx, row); + if (field.isNull()) + return std::nullopt; + return static_cast(field.safeGet()); + } + + std::optional getError(size_t row) const + { + auto field = getField(error_idx, row); + if (field.isNull()) + return std::nullopt; + + return static_cast(field.safeGet()); + } +private: + DB::Field getField(size_t position, size_t row) const + { + DB::Field field; + block.getByPosition(position).column->get(row, field); + return field; + } + + DB::Block block; + size_t hostname_idx = 0; + size_t request_event_time_idx = 0; + size_t thread_id_idx = 0; + size_t session_id_idx = 0; + size_t xid_idx = 0; + size_t has_watch_idx = 0; + size_t op_num_idx = 0; + size_t path_idx = 0; + size_t data_idx = 0; + size_t is_ephemeral_idx = 0; + size_t is_sequential_idx = 0; + size_t response_event_time_idx = 0; + size_t error_idx = 0; + size_t requests_size_idx = 0; + size_t version_idx = 0; +}; + +struct RequestFromLog +{ + Coordination::ZooKeeperRequestPtr request; + std::optional expected_result; + int64_t session_id = 0; + size_t executor_id = 0; + bool has_watch = false; + DB::DateTime64 request_event_time; + DB::DateTime64 response_event_time; + std::shared_ptr connection; +}; + +struct ZooKeeperRequestFromLogReader +{ + ZooKeeperRequestFromLogReader(const std::string & input_request_log, DB::ContextPtr context) + { + std::optional format_settings; + + file_read_buf = std::make_unique(input_request_log); + auto compression_method = DB::chooseCompressionMethod(input_request_log, ""); + file_read_buf = DB::wrapReadBufferWithCompressionMethod(std::move(file_read_buf), compression_method); + + DB::SingleReadBufferIterator read_buffer_iterator(std::move(file_read_buf)); + auto [columns_description, format] = DB::detectFormatAndReadSchema(format_settings, read_buffer_iterator, context); + + DB::ColumnsWithTypeAndName columns; + columns.reserve(columns_description.size()); + + for (const auto & column_description : columns_description) + columns.push_back(DB::ColumnWithTypeAndName{column_description.type, column_description.name}); + + header_block = std::move(columns); + + file_read_buf + = DB::wrapReadBufferWithCompressionMethod(std::make_unique(input_request_log), compression_method); + + input_format = DB::FormatFactory::instance().getInput( + format, + *file_read_buf, + header_block, + context, + context->getSettingsRef().max_block_size, + format_settings, + 1, + std::nullopt, + /*is_remote_fs*/ false, + DB::CompressionMethod::None, + false); + + Coordination::ACL acl; + acl.permissions = Coordination::ACL::All; + acl.scheme = "world"; + acl.id = "anyone"; + default_acls.emplace_back(std::move(acl)); + } + + std::optional getNextRequest(bool for_multi = false) + { + RequestFromLog request_from_log; + + if (!current_block) + { + auto chunk = 
input_format->generate(); + + if (chunk.empty()) + return std::nullopt; + + current_block.emplace(header_block.cloneWithColumns(chunk.detachColumns())); + idx_in_block = 0; + } + + + request_from_log.expected_result = current_block->getError(idx_in_block); + request_from_log.session_id = current_block->getSessionId(idx_in_block); + request_from_log.has_watch = current_block->hasWatch(idx_in_block); + request_from_log.executor_id = current_block->getExecutorId(idx_in_block); + request_from_log.request_event_time = current_block->getRequestEventTime(idx_in_block); + request_from_log.response_event_time = current_block->getResponseEventTime(idx_in_block); + + const auto move_row_iterator = [&] + { + if (idx_in_block == current_block->rows() - 1) + current_block.reset(); + else + ++idx_in_block; + }; + + auto op_num = current_block->getOpNum(idx_in_block); + switch (op_num) + { + case Coordination::OpNum::Create: + { + auto create_request = std::make_shared(); + create_request->path = current_block->getPath(idx_in_block); + create_request->data = current_block->getData(idx_in_block); + create_request->is_ephemeral = current_block->isEphemeral(idx_in_block); + create_request->is_sequential = current_block->isSequential(idx_in_block); + request_from_log.request = create_request; + break; + } + case Coordination::OpNum::Set: + { + auto set_request = std::make_shared(); + set_request->path = current_block->getPath(idx_in_block); + set_request->data = current_block->getData(idx_in_block); + if (auto version = current_block->getVersion(idx_in_block)) + set_request->version = *version; + request_from_log.request = set_request; + break; + } + case Coordination::OpNum::Remove: + { + auto remove_request = std::make_shared(); + remove_request->path = current_block->getPath(idx_in_block); + if (auto version = current_block->getVersion(idx_in_block)) + remove_request->version = *version; + request_from_log.request = remove_request; + break; + } + case Coordination::OpNum::Check: + { + auto check_request = std::make_shared(); + check_request->path = current_block->getPath(idx_in_block); + if (auto version = current_block->getVersion(idx_in_block)) + check_request->version = *version; + request_from_log.request = check_request; + break; + } + case Coordination::OpNum::Sync: + { + auto sync_request = std::make_shared(); + sync_request->path = current_block->getPath(idx_in_block); + request_from_log.request = sync_request; + break; + } + case Coordination::OpNum::Get: + { + auto get_request = std::make_shared(); + get_request->path = current_block->getPath(idx_in_block); + request_from_log.request = get_request; + break; + } + case Coordination::OpNum::SimpleList: + case Coordination::OpNum::FilteredList: + { + auto list_request = std::make_shared(); + list_request->path = current_block->getPath(idx_in_block); + request_from_log.request = list_request; + break; + } + case Coordination::OpNum::Exists: + { + auto exists_request = std::make_shared(); + exists_request->path = current_block->getPath(idx_in_block); + request_from_log.request = exists_request; + break; + } + case Coordination::OpNum::Multi: + case Coordination::OpNum::MultiRead: + { + if (for_multi) + throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Nested multi requests are not allowed"); + + auto requests_size = current_block->getRequestsSize(idx_in_block); + + Coordination::Requests requests; + requests.reserve(requests_size); + move_row_iterator(); + + for (size_t i = 0; i < requests_size; ++i) + { + auto subrequest_from_log = 
getNextRequest(/*for_multi=*/true); + if (!subrequest_from_log) + throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "Failed to fetch subrequest for {}, subrequest index {}", op_num, i); + + requests.push_back(std::move(subrequest_from_log->request)); + + if (subrequest_from_log->session_id != request_from_log.session_id) + throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "Session id mismatch for subrequest in {}, subrequest index {}", op_num, i); + + if (subrequest_from_log->executor_id != request_from_log.executor_id) + throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "Executor id mismatch for subrequest in {}, subrequest index {}", op_num, i); + } + + request_from_log.request = std::make_shared(requests, default_acls); + + return request_from_log; + } + default: + throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "Unsupported operation {} ({})", op_num, static_cast(op_num)); + } + + move_row_iterator(); + + return request_from_log; + } + +private: + DB::Block header_block; + + std::unique_ptr file_read_buf; + DB::InputFormatPtr input_format; + + std::optional current_block; + size_t idx_in_block = 0; + + Coordination::ACLs default_acls; +}; + + +namespace +{ + + +struct RequestFromLogStats +{ + struct Stats + { + std::atomic total = 0; + std::atomic unexpected_results = 0; + }; + + Stats write_requests; + Stats read_requests; +}; + +void dumpStats(std::string_view type, const RequestFromLogStats::Stats & stats_for_type) +{ + std::cerr << fmt::format( + "{} requests: {} total, {} with unexpected results ({:.4}%)", + type, + stats_for_type.total, + stats_for_type.unexpected_results, + static_cast(stats_for_type.unexpected_results) / stats_for_type.total * 100) + << std::endl; +}; + +void requestFromLogExecutor(std::shared_ptr> queue, RequestFromLogStats & request_stats) +{ + RequestFromLog request_from_log; + std::optional> last_request; + while (queue->pop(request_from_log)) + { + auto request_promise = std::make_shared>(); + last_request = request_promise->get_future(); + Coordination::ResponseCallback callback + = [&, request_promise, request = request_from_log.request, expected_result = request_from_log.expected_result]( + const Coordination::Response & response) mutable + { + auto & stats = request->isReadRequest() ? 
request_stats.read_requests : request_stats.write_requests; + + stats.total.fetch_add(1, std::memory_order_relaxed); + + if (*expected_result != response.error) + stats.unexpected_results.fetch_add(1, std::memory_order_relaxed); + + //if (!expected_result) + // return; + + //if (*expected_result != response.error) + // std::cerr << fmt::format( + // "Unexpected result for {}, got {}, expected {}", request->getOpNum(), response.error, *expected_result) + // << std::endl; + + request_promise->set_value(); + }; + + Coordination::WatchCallbackPtr watch; + if (request_from_log.has_watch) + watch = std::make_shared([](const Coordination::WatchResponse &) {}); + + request_from_log.connection->executeGenericRequest(request_from_log.request, callback, watch); + } + + if (last_request) + last_request->wait(); +} + +} + +void Runner::runBenchmarkFromLog() +{ + std::cerr << fmt::format("Running benchmark using requests from {}", input_request_log) << std::endl; + + pool.emplace(CurrentMetrics::LocalThread, CurrentMetrics::LocalThreadActive, CurrentMetrics::LocalThreadScheduled, concurrency); + + shared_context = DB::Context::createShared(); + global_context = DB::Context::createGlobal(shared_context.get()); + global_context->makeGlobalContext(); + DB::registerFormats(); + + /// Randomly choosing connection index + pcg64 rng(randomSeed()); + std::uniform_int_distribution connection_distribution(0, connection_infos.size() - 1); + + std::unordered_map> zookeeper_connections; + auto get_zookeeper_connection = [&](int64_t session_id) + { + if (auto it = zookeeper_connections.find(session_id); it != zookeeper_connections.end() && !it->second->isExpired()) + return it->second; + + auto connection_idx = connection_distribution(rng); + auto zk_connection = getConnection(connection_infos[connection_idx], connection_idx); + zookeeper_connections.insert_or_assign(session_id, zk_connection); + return zk_connection; + }; + + RequestFromLogStats stats; + + + std::unordered_map>> executor_id_to_queue; + + SCOPE_EXIT({ + for (const auto & [executor_id, executor_queue] : executor_id_to_queue) + executor_queue->finish(); + + pool->wait(); + + dumpStats("Write", stats.write_requests); + dumpStats("Read", stats.read_requests); + }); + + auto push_request = [&](RequestFromLog request) + { + if (auto it = executor_id_to_queue.find(request.executor_id); it != executor_id_to_queue.end()) + { + auto success = it->second->push(std::move(request)); + if (!success) + throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Failed to push to the executor's queue"); + return; + } + + auto executor_queue = std::make_shared>(std::numeric_limits().max()); + executor_id_to_queue.emplace(request.executor_id, executor_queue); + auto scheduled = pool->trySchedule([&, executor_queue]() mutable + { + requestFromLogExecutor(std::move(executor_queue), stats); + }); + + if (!scheduled) + throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "Failed to schedule worker, try to increase concurrency parameter"); + + auto success = executor_queue->push(std::move(request)); + if (!success) + throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Failed to push to the executor's queue"); + }; + + { + auto setup_connection = getConnection(connection_infos[0], 0); + benchmark_context.startup(*setup_connection); + } + + ZooKeeperRequestFromLogReader request_reader(input_request_log, global_context); + while (auto request_from_log = request_reader.getNextRequest()) + { + request_from_log->connection = get_zookeeper_connection(request_from_log->session_id); + 
push_request(std::move(*request_from_log)); + } +} + +void Runner::runBenchmarkWithGenerator() +{ + pool.emplace(CurrentMetrics::LocalThread, CurrentMetrics::LocalThreadActive, CurrentMetrics::LocalThreadScheduled, concurrency); + queue.emplace(concurrency); createConnections(); std::cerr << "Preparing to run\n"; + benchmark_context.startup(*connections[0]); generator->startup(*connections[0]); std::cerr << "Prepared\n"; @@ -458,8 +1010,225 @@ std::vector> Runner::refreshConnections Runner::~Runner() { - queue->clearAndFinish(); + if (queue) + queue->clearAndFinish(); shutdown = true; - pool->wait(); - generator->cleanup(*connections[0]); + + if (pool) + pool->wait(); + + auto connection = getConnection(connection_infos[0], 0); + benchmark_context.cleanup(*connection); +} + +namespace +{ + +void removeRecursive(Coordination::ZooKeeper & zookeeper, const std::string & path) +{ + namespace fs = std::filesystem; + + auto promise = std::make_shared>(); + auto future = promise->get_future(); + + Strings children; + auto list_callback = [promise, &children] (const Coordination::ListResponse & response) + { + children = response.names; + promise->set_value(); + }; + zookeeper.list(path, Coordination::ListRequestType::ALL, list_callback, nullptr); + future.get(); + + std::span children_span(children); + while (!children_span.empty()) + { + Coordination::Requests ops; + for (size_t i = 0; i < 1000 && !children.empty(); ++i) + { + removeRecursive(zookeeper, fs::path(path) / children.back()); + ops.emplace_back(zkutil::makeRemoveRequest(fs::path(path) / children_span.back(), -1)); + children_span = children_span.subspan(0, children_span.size() - 1); + } + auto multi_promise = std::make_shared>(); + auto multi_future = multi_promise->get_future(); + + auto multi_callback = [multi_promise] (const Coordination::MultiResponse &) + { + multi_promise->set_value(); + }; + zookeeper.multi(ops, multi_callback); + multi_future.get(); + } + auto remove_promise = std::make_shared>(); + auto remove_future = remove_promise->get_future(); + + auto remove_callback = [remove_promise] (const Coordination::RemoveResponse &) + { + remove_promise->set_value(); + }; + + zookeeper.remove(path, -1, remove_callback); + remove_future.get(); +} + +} + +void BenchmarkContext::initializeFromConfig(const Poco::Util::AbstractConfiguration & config) +{ + Coordination::ACL acl; + acl.permissions = Coordination::ACL::All; + acl.scheme = "world"; + acl.id = "anyone"; + default_acls.emplace_back(std::move(acl)); + + std::cerr << "---- Parsing setup ---- " << std::endl; + static const std::string setup_key = "setup"; + Poco::Util::AbstractConfiguration::Keys keys; + config.keys(setup_key, keys); + for (const auto & key : keys) + { + if (key.starts_with("node")) + { + auto node_key = setup_key + "." 
+ key; + auto parsed_root_node = parseNode(node_key, config); + const auto node = root_nodes.emplace_back(parsed_root_node); + + if (config.has(node_key + ".repeat")) + { + if (!node->name.isRandom()) + throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "Repeating node creation for key {}, but name is not randomly generated", node_key); + + auto repeat_count = config.getUInt64(node_key + ".repeat"); + node->repeat_count = repeat_count; + for (size_t i = 1; i < repeat_count; ++i) + root_nodes.emplace_back(node->clone()); + } + + std::cerr << "Tree to create:" << std::endl; + + node->dumpTree(); + std::cerr << std::endl; + } + } + std::cerr << "---- Done parsing data setup ----\n" << std::endl; +} + +std::shared_ptr BenchmarkContext::parseNode(const std::string & key, const Poco::Util::AbstractConfiguration & config) +{ + auto node = std::make_shared(); + node->name = StringGetter::fromConfig(key + ".name", config); + + if (config.has(key + ".data")) + node->data = StringGetter::fromConfig(key + ".data", config); + + Poco::Util::AbstractConfiguration::Keys node_keys; + config.keys(key, node_keys); + + for (const auto & node_key : node_keys) + { + if (!node_key.starts_with("node")) + continue; + + const auto node_key_string = key + "." + node_key; + auto child_node = parseNode(node_key_string, config); + node->children.push_back(child_node); + + if (config.has(node_key_string + ".repeat")) + { + if (!child_node->name.isRandom()) + throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "Repeating node creation for key {}, but name is not randomly generated", node_key_string); + + auto repeat_count = config.getUInt64(node_key_string + ".repeat"); + child_node->repeat_count = repeat_count; + for (size_t i = 1; i < repeat_count; ++i) + node->children.push_back(child_node); + } + } + + return node; +} + +void BenchmarkContext::Node::dumpTree(int level) const +{ + std::string data_string + = data.has_value() ? fmt::format("{}", data->description()) : "no data"; + + std::string repeat_count_string = repeat_count != 0 ? fmt::format(", repeated {} times", repeat_count) : ""; + + std::cerr << fmt::format("{}name: {}, data: {}{}", std::string(level, '\t'), name.description(), data_string, repeat_count_string) << std::endl; + + for (auto it = children.begin(); it != children.end();) + { + const auto & child = *it; + child->dumpTree(level + 1); + std::advance(it, child->repeat_count != 0 ? child->repeat_count : 1); + } +} + +std::shared_ptr BenchmarkContext::Node::clone() const +{ + auto new_node = std::make_shared(); + new_node->name = name; + new_node->data = data; + new_node->repeat_count = repeat_count; + + // don't do deep copy of children because we will do clone only for root nodes + new_node->children = children; + + return new_node; +} + +void BenchmarkContext::Node::createNode(Coordination::ZooKeeper & zookeeper, const std::string & parent_path, const Coordination::ACLs & acls) const +{ + auto path = std::filesystem::path(parent_path) / name.getString(); + auto promise = std::make_shared>(); + auto future = promise->get_future(); + auto create_callback = [promise] (const Coordination::CreateResponse & response) + { + if (response.error != Coordination::Error::ZOK) + promise->set_exception(std::make_exception_ptr(zkutil::KeeperException(response.error))); + else + promise->set_value(); + }; + zookeeper.create(path, data ? 
data->getString() : "", false, false, acls, create_callback); + future.get(); + + for (const auto & child : children) + child->createNode(zookeeper, path, acls); +} + +void BenchmarkContext::startup(Coordination::ZooKeeper & zookeeper) +{ + if (root_nodes.empty()) + return; + + std::cerr << "---- Creating test data ----" << std::endl; + for (const auto & node : root_nodes) + { + auto node_name = node->name.getString(); + node->name.setString(node_name); + + std::string root_path = std::filesystem::path("/") / node_name; + std::cerr << "Cleaning up " << root_path << std::endl; + removeRecursive(zookeeper, root_path); + + node->createNode(zookeeper, "/", default_acls); + } + std::cerr << "---- Created test data ----\n" << std::endl; +} + +void BenchmarkContext::cleanup(Coordination::ZooKeeper & zookeeper) +{ + if (root_nodes.empty()) + return; + + std::cerr << "---- Cleaning up test data ----" << std::endl; + for (const auto & node : root_nodes) + { + auto node_name = node->name.getString(); + std::string root_path = std::filesystem::path("/") / node_name; + std::cerr << "Cleaning up " << root_path << std::endl; + removeRecursive(zookeeper, root_path); + } } diff --git a/utils/keeper-bench/Runner.h b/utils/keeper-bench/Runner.h index 4f4a75e6ecf..0c646eb2166 100644 --- a/utils/keeper-bench/Runner.h +++ b/utils/keeper-bench/Runner.h @@ -1,5 +1,5 @@ #pragma once -#include "Common/ZooKeeper/ZooKeeperConstants.h" +#include "Common/ZooKeeper/ZooKeeperArgs.h" #include #include "Generator.h" #include @@ -12,6 +12,7 @@ #include #include +#include "Interpreters/Context.h" #include "Stats.h" #include @@ -19,12 +20,40 @@ using Ports = std::vector; using Strings = std::vector; +struct BenchmarkContext +{ +public: + void initializeFromConfig(const Poco::Util::AbstractConfiguration & config); + + void startup(Coordination::ZooKeeper & zookeeper); + void cleanup(Coordination::ZooKeeper & zookeeper); +private: + struct Node + { + StringGetter name; + std::optional data; + std::vector> children; + size_t repeat_count = 0; + + std::shared_ptr clone() const; + + void createNode(Coordination::ZooKeeper & zookeeper, const std::string & parent_path, const Coordination::ACLs & acls) const; + void dumpTree(int level = 0) const; + }; + + static std::shared_ptr parseNode(const std::string & key, const Poco::Util::AbstractConfiguration & config); + + std::vector> root_nodes; + Coordination::ACLs default_acls; +}; + class Runner { public: Runner( std::optional concurrency_, const std::string & config_path, + const std::string & input_request_log_, const Strings & hosts_strings_, std::optional max_time_, std::optional delay_, @@ -44,8 +73,30 @@ public: ~Runner(); private: + struct ConnectionInfo + { + std::string host; + + bool secure = false; + int32_t session_timeout_ms = Coordination::DEFAULT_SESSION_TIMEOUT_MS; + int32_t connection_timeout_ms = Coordination::DEFAULT_CONNECTION_TIMEOUT_MS; + int32_t operation_timeout_ms = Coordination::DEFAULT_OPERATION_TIMEOUT_MS; + bool use_compression = false; + + size_t sessions = 1; + }; + void parseHostsFromConfig(const Poco::Util::AbstractConfiguration & config); + void runBenchmarkWithGenerator(); + void runBenchmarkFromLog(); + + void createConnections(); + std::vector> refreshConnections(); + std::shared_ptr getConnection(const ConnectionInfo & connection_info, size_t connection_info_idx); + + std::string input_request_log; + size_t concurrency = 1; std::optional pool; @@ -54,7 +105,8 @@ private: double max_time = 0; double delay = 1; bool continue_on_error = false; - 
std::atomic max_iterations = 0; + size_t max_iterations = 0; + std::atomic requests_executed = 0; std::atomic shutdown = false; @@ -71,25 +123,14 @@ private: using Queue = ConcurrentBoundedQueue; std::optional queue; - struct ConnectionInfo - { - std::string host; - - bool secure = false; - int32_t session_timeout_ms = Coordination::DEFAULT_SESSION_TIMEOUT_MS; - int32_t connection_timeout_ms = Coordination::DEFAULT_CONNECTION_TIMEOUT_MS; - int32_t operation_timeout_ms = Coordination::DEFAULT_OPERATION_TIMEOUT_MS; - bool use_compression = false; - - size_t sessions = 1; - }; - std::mutex connection_mutex; + ConnectionInfo default_connection_info; std::vector connection_infos; std::vector> connections; std::unordered_map connections_to_info_map; - void createConnections(); - std::shared_ptr getConnection(const ConnectionInfo & connection_info, size_t connection_info_idx); - std::vector> refreshConnections(); + DB::SharedContextHolder shared_context; + DB::ContextMutablePtr global_context; + + BenchmarkContext benchmark_context; }; diff --git a/utils/keeper-bench/main.cpp b/utils/keeper-bench/main.cpp index 0753d66850f..45fc28f3bca 100644 --- a/utils/keeper-bench/main.cpp +++ b/utils/keeper-bench/main.cpp @@ -1,8 +1,6 @@ #include #include #include "Runner.h" -#include "Stats.h" -#include "Generator.h" #include "Common/Exception.h" #include #include @@ -27,6 +25,10 @@ int main(int argc, char *argv[]) bool print_stacktrace = true; + //Poco::AutoPtr channel(new Poco::ConsoleChannel(std::cerr)); + //Poco::Logger::root().setChannel(channel); + //Poco::Logger::root().setLevel("trace"); + try { using boost::program_options::value; @@ -34,12 +36,13 @@ int main(int argc, char *argv[]) boost::program_options::options_description desc = createOptionsDescription("Allowed options", getTerminalWidth()); desc.add_options() ("help", "produce help message") - ("config", value()->default_value(""), "yaml/xml file containing configuration") - ("concurrency,c", value(), "number of parallel queries") - ("report-delay,d", value(), "delay between intermediate reports in seconds (set 0 to disable reports)") - ("iterations,i", value(), "amount of queries to be executed") - ("time-limit,t", value(), "stop launch of queries after specified time limit") - ("hosts,h", value()->multitoken()->default_value(Strings{}, ""), "") + ("config", value()->default_value(""), "yaml/xml file containing configuration") + ("input-request-log", value()->default_value(""), "log of requests that will be replayed") + ("concurrency,c", value(), "number of parallel queries") + ("report-delay,d", value(), "delay between intermediate reports in seconds (set 0 to disable reports)") + ("iterations,i", value(), "amount of queries to be executed") + ("time-limit,t", value(), "stop launch of queries after specified time limit") + ("hosts,h", value()->multitoken()->default_value(Strings{}, ""), "") ("continue_on_errors", "continue testing even if a query fails") ; @@ -56,6 +59,7 @@ int main(int argc, char *argv[]) Runner runner(valueToOptional(options["concurrency"]), options["config"].as(), + options["input-request-log"].as(), options["hosts"].as(), valueToOptional(options["time-limit"]), valueToOptional(options["report-delay"]), @@ -66,9 +70,9 @@ int main(int argc, char *argv[]) { runner.runBenchmark(); } - catch (const DB::Exception & e) + catch (...) 
{ - std::cout << "Got exception while trying to run benchmark: " << e.message() << std::endl; + std::cout << "Got exception while trying to run benchmark: " << DB::getCurrentExceptionMessage(true) << std::endl; } return 0; From 652796acd6a10515e862260d18e002bae27f3c85 Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 15 Apr 2024 16:37:38 +0100 Subject: [PATCH 067/392] Fix MergeTree with HDFS --- .../ObjectStorages/HDFS/HDFSObjectStorage.cpp | 38 +++++++++++++++---- .../ObjectStorages/HDFS/HDFSObjectStorage.h | 16 ++++++-- 2 files changed, 42 insertions(+), 12 deletions(-) diff --git a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp index 8bfba6fcfad..82c9a6c6c21 100644 --- a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp +++ b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp @@ -34,15 +34,21 @@ void HDFSObjectStorage::startup() ObjectStorageKey HDFSObjectStorage::generateObjectKeyForPath(const std::string & /* path */) const { /// what ever data_source_description.description value is, consider that key as relative key - return ObjectStorageKey::createAsRelative(hdfs_root_path, getRandomASCIIString(32)); + chassert(data_directory.starts_with("/")); + return ObjectStorageKey::createAsRelative( + fs::path(url_without_path) / data_directory.substr(1), getRandomASCIIString(32)); } bool HDFSObjectStorage::exists(const StoredObject & object) const { + std::string path = object.remote_path; + if (path.starts_with(url_without_path)) + path = path.substr(url_without_path.size()); + // const auto & path = object.remote_path; // const size_t begin_of_path = path.find('/', path.find("//") + 2); // const String remote_fs_object_path = path.substr(begin_of_path); - return (0 == hdfsExists(hdfs_fs.get(), object.remote_path.c_str())); + return (0 == hdfsExists(hdfs_fs.get(), path.c_str())); } std::unique_ptr HDFSObjectStorage::readObject( /// NOLINT @@ -51,7 +57,14 @@ std::unique_ptr HDFSObjectStorage::readObject( /// NOLIN std::optional, std::optional) const { - return std::make_unique(hdfs_root_path, object.remote_path, config, patchSettings(read_settings)); + std::string path = object.remote_path; + if (path.starts_with(url)) + path = path.substr(url.size()); + if (path.starts_with("/")) + path.substr(1); + + return std::make_unique( + fs::path(url_without_path) / "", fs::path(data_directory) / path, config, patchSettings(read_settings)); } std::unique_ptr HDFSObjectStorage::readObjects( /// NOLINT @@ -69,8 +82,13 @@ std::unique_ptr HDFSObjectStorage::readObjects( /// NOLI // auto hdfs_path = path.substr(begin_of_path); // auto hdfs_uri = path.substr(0, begin_of_path); + std::string path = object_.remote_path; + if (path.starts_with(url)) + path = path.substr(url.size()); + if (path.starts_with("/")) + path.substr(1); return std::make_unique( - hdfs_root_path, object_.remote_path, config, disk_read_settings, /* read_until_position */0, /* use_external_buffer */true); + fs::path(url_without_path) / "", fs::path(data_directory) / path, config, disk_read_settings, /* read_until_position */0, /* use_external_buffer */true); }; return std::make_unique( @@ -89,8 +107,11 @@ std::unique_ptr HDFSObjectStorage::writeObject( /// NOL ErrorCodes::UNSUPPORTED_METHOD, "HDFS API doesn't support custom attributes/metadata for stored objects"); - auto path = object.remote_path.starts_with('/') ? 
object.remote_path.substr(1) : object.remote_path; - path = fs::path(hdfs_root_path) / path; + std::string path = object.remote_path; + if (path.starts_with("/")) + path = path.substr(1); + if (!path.starts_with(url)) + path = fs::path(url) / path; /// Single O_WRONLY in libhdfs adds O_TRUNC return std::make_unique( @@ -102,8 +123,9 @@ std::unique_ptr HDFSObjectStorage::writeObject( /// NOL /// Remove file. Throws exception if file doesn't exists or it's a directory. void HDFSObjectStorage::removeObject(const StoredObject & object) { - const auto & path = object.remote_path; - // const size_t begin_of_path = path.find('/', path.find("//") + 2); + auto path = object.remote_path; + if (path.starts_with(url_without_path)) + path = path.substr(url_without_path.size()); /// Add path from root to file name int res = hdfsDelete(hdfs_fs.get(), path.c_str(), 0); diff --git a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h index 24642ec635a..8987fa5eaf1 100644 --- a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h +++ b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h @@ -40,15 +40,21 @@ public: , hdfs_builder(createHDFSBuilder(hdfs_root_path_, config)) , hdfs_fs(createHDFSFS(hdfs_builder.get())) , settings(std::move(settings_)) - , hdfs_root_path(hdfs_root_path_) { + const size_t begin_of_path = hdfs_root_path_.find('/', hdfs_root_path_.find("//") + 2); + url = hdfs_root_path_; + url_without_path = url.substr(0, begin_of_path); + if (begin_of_path < url.size()) + data_directory = url.substr(begin_of_path); + else + data_directory = "/"; } std::string getName() const override { return "HDFSObjectStorage"; } - std::string getCommonKeyPrefix() const override { return hdfs_root_path; } + std::string getCommonKeyPrefix() const override { return url; } - std::string getDescription() const override { return hdfs_root_path; } + std::string getDescription() const override { return url; } ObjectStorageType getType() const override { return ObjectStorageType::HDFS; } @@ -116,7 +122,9 @@ private: HDFSBuilderWrapper hdfs_builder; HDFSFSPtr hdfs_fs; SettingsPtr settings; - const std::string hdfs_root_path; + std::string url; + std::string url_without_path; + std::string data_directory; }; } From ccee2d668793370c3f947a4be24d1edbabba1724 Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 15 Apr 2024 23:28:14 +0100 Subject: [PATCH 068/392] Fix parsing --- src/Storages/ObjectStorage/HDFS/Configuration.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Storages/ObjectStorage/HDFS/Configuration.cpp b/src/Storages/ObjectStorage/HDFS/Configuration.cpp index 220857fead6..e12c2f15b28 100644 --- a/src/Storages/ObjectStorage/HDFS/Configuration.cpp +++ b/src/Storages/ObjectStorage/HDFS/Configuration.cpp @@ -63,9 +63,6 @@ std::string StorageHDFSConfiguration::getPathWithoutGlob() const void StorageHDFSConfiguration::fromAST(ASTs & args, ContextPtr context, bool with_structure) { - std::string url_str; - url_str = checkAndGetLiteralArgument(args[0], "url"); - const size_t max_args_num = with_structure ? 
4 : 3; if (!args.size() || args.size() > max_args_num) { @@ -73,6 +70,9 @@ void StorageHDFSConfiguration::fromAST(ASTs & args, ContextPtr context, bool wit "Expected not more than {} arguments", max_args_num); } + std::string url_str; + url_str = checkAndGetLiteralArgument(args[0], "url"); + if (args.size() > 1) { args[1] = evaluateConstantExpressionOrIdentifierAsLiteral(args[1], context); From 11be538ac870d231a13a2648038ea1b469f73a08 Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 16 Apr 2024 10:20:56 +0100 Subject: [PATCH 069/392] Fix several tests --- src/Disks/ObjectStorages/S3/diskSettings.cpp | 8 +++++-- src/Disks/ObjectStorages/S3/diskSettings.h | 3 ++- .../ObjectStorage/AzureBlob/Configuration.cpp | 7 +++--- .../ObjectStorage/HDFS/Configuration.cpp | 2 +- .../ObjectStorage/S3/Configuration.cpp | 2 +- .../StorageObjectStorageSink.cpp | 3 +-- src/Storages/S3Queue/S3QueueSource.cpp | 14 ++++++++++++ src/Storages/S3Queue/S3QueueSource.h | 1 + src/Storages/StorageS3Settings.cpp | 22 +++++++++++-------- src/Storages/StorageS3Settings.h | 10 +++++---- 10 files changed, 49 insertions(+), 23 deletions(-) diff --git a/src/Disks/ObjectStorages/S3/diskSettings.cpp b/src/Disks/ObjectStorages/S3/diskSettings.cpp index 9bd4bf699e8..2bca7df7db9 100644 --- a/src/Disks/ObjectStorages/S3/diskSettings.cpp +++ b/src/Disks/ObjectStorages/S3/diskSettings.cpp @@ -36,11 +36,15 @@ extern const int NO_ELEMENTS_IN_CONFIG; } std::unique_ptr getSettings( - const Poco::Util::AbstractConfiguration & config, const String & config_prefix, ContextPtr context) + const Poco::Util::AbstractConfiguration & config, + const String & config_prefix, + ContextPtr context, + bool validate_settings) { const Settings & settings = context->getSettingsRef(); - auto request_settings = S3Settings::RequestSettings(config, config_prefix, settings, "s3_"); + auto request_settings = S3Settings::RequestSettings(config, config_prefix, settings, "s3_", validate_settings); auto auth_settings = S3::AuthSettings::loadFromConfig(config_prefix, config); + return std::make_unique( request_settings, auth_settings, diff --git a/src/Disks/ObjectStorages/S3/diskSettings.h b/src/Disks/ObjectStorages/S3/diskSettings.h index 5b655f35508..11ac64ce913 100644 --- a/src/Disks/ObjectStorages/S3/diskSettings.h +++ b/src/Disks/ObjectStorages/S3/diskSettings.h @@ -17,7 +17,8 @@ struct S3ObjectStorageSettings; std::unique_ptr getSettings( const Poco::Util::AbstractConfiguration & config, const String & config_prefix, - ContextPtr context); + ContextPtr context, + bool validate_settings = true); std::unique_ptr getClient( const Poco::Util::AbstractConfiguration & config, diff --git a/src/Storages/ObjectStorage/AzureBlob/Configuration.cpp b/src/Storages/ObjectStorage/AzureBlob/Configuration.cpp index fe01251e58a..44ace9c3b65 100644 --- a/src/Storages/ObjectStorage/AzureBlob/Configuration.cpp +++ b/src/Storages/ObjectStorage/AzureBlob/Configuration.cpp @@ -282,12 +282,11 @@ void StorageAzureBlobConfiguration::fromAST(ASTs & engine_args, ContextPtr conte auto is_format_arg = [] (const std::string & s) -> bool { - return s == "auto" || FormatFactory::instance().getAllFormats().contains(s); + return s == "auto" || FormatFactory::instance().getAllFormats().contains(Poco::toLower(s)); }; if (engine_args.size() == 4) { - //'c1 UInt64, c2 UInt64 auto fourth_arg = checkAndGetLiteralArgument(engine_args[3], "format/account_name"); if (is_format_arg(fourth_arg)) { @@ -298,7 +297,9 @@ void StorageAzureBlobConfiguration::fromAST(ASTs & engine_args, ContextPtr conte if 
(with_structure) structure = fourth_arg; else - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown format or account name specified without account key"); + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Unknown format or account name specified without account key: {}", fourth_arg); } } else if (engine_args.size() == 5) diff --git a/src/Storages/ObjectStorage/HDFS/Configuration.cpp b/src/Storages/ObjectStorage/HDFS/Configuration.cpp index e12c2f15b28..af191070329 100644 --- a/src/Storages/ObjectStorage/HDFS/Configuration.cpp +++ b/src/Storages/ObjectStorage/HDFS/Configuration.cpp @@ -64,7 +64,7 @@ std::string StorageHDFSConfiguration::getPathWithoutGlob() const void StorageHDFSConfiguration::fromAST(ASTs & args, ContextPtr context, bool with_structure) { const size_t max_args_num = with_structure ? 4 : 3; - if (!args.size() || args.size() > max_args_num) + if (args.empty() || args.size() > max_args_num) { throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Expected not more than {} arguments", max_args_num); diff --git a/src/Storages/ObjectStorage/S3/Configuration.cpp b/src/Storages/ObjectStorage/S3/Configuration.cpp index f532af24017..46be0a01862 100644 --- a/src/Storages/ObjectStorage/S3/Configuration.cpp +++ b/src/Storages/ObjectStorage/S3/Configuration.cpp @@ -77,7 +77,7 @@ ObjectStoragePtr StorageS3Configuration::createObjectStorage(ContextPtr context, const auto & config = context->getConfigRef(); const std::string config_prefix = "s3."; - auto s3_settings = getSettings(config, config_prefix, context); + auto s3_settings = getSettings(config, config_prefix, context, false); /// FIXME: add a setting auth_settings.updateFrom(s3_settings->auth_settings); s3_settings->auth_settings = auth_settings; diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp index cf1c583ca62..8381737a4f5 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp @@ -96,10 +96,9 @@ void StorageObjectStorageSink::finalize() void StorageObjectStorageSink::release() { writer.reset(); - write_buf->finalize(); + write_buf.reset(); } - PartitionedStorageObjectStorageSink::PartitionedStorageObjectStorageSink( ObjectStoragePtr object_storage_, StorageObjectStorageConfigurationPtr configuration_, diff --git a/src/Storages/S3Queue/S3QueueSource.cpp b/src/Storages/S3Queue/S3QueueSource.cpp index 8e7155205c4..7c6d952d181 100644 --- a/src/Storages/S3Queue/S3QueueSource.cpp +++ b/src/Storages/S3Queue/S3QueueSource.cpp @@ -197,8 +197,22 @@ String StorageS3QueueSource::getName() const return name; } +void StorageS3QueueSource::lazyInitialize() +{ + if (initialized) + return; + + internal_source->lazyInitialize(processing_id); + reader = std::move(internal_source->reader); + if (reader) + reader_future = std::move(internal_source->reader_future); + initialized = true; +} + Chunk StorageS3QueueSource::generate() { + lazyInitialize(); + while (true) { if (!reader) diff --git a/src/Storages/S3Queue/S3QueueSource.h b/src/Storages/S3Queue/S3QueueSource.h index 8c785e683c2..c1b45108b36 100644 --- a/src/Storages/S3Queue/S3QueueSource.h +++ b/src/Storages/S3Queue/S3QueueSource.h @@ -117,6 +117,7 @@ private: void applyActionAfterProcessing(const String & path); void appendLogElement(const std::string & filename, S3QueueFilesMetadata::FileStatus & file_status_, size_t processed_rows, bool processed); + void lazyInitialize(); }; } diff --git a/src/Storages/StorageS3Settings.cpp 
b/src/Storages/StorageS3Settings.cpp index 2780249e3fd..b767805f637 100644 --- a/src/Storages/StorageS3Settings.cpp +++ b/src/Storages/StorageS3Settings.cpp @@ -18,18 +18,20 @@ namespace ErrorCodes extern const int INVALID_SETTING_VALUE; } -S3Settings::RequestSettings::PartUploadSettings::PartUploadSettings(const Settings & settings) +S3Settings::RequestSettings::PartUploadSettings::PartUploadSettings(const Settings & settings, bool validate_settings) { updateFromSettings(settings, false); - validate(); + if (validate_settings) + validate(); } S3Settings::RequestSettings::PartUploadSettings::PartUploadSettings( const Poco::Util::AbstractConfiguration & config, const String & config_prefix, const Settings & settings, - String setting_name_prefix) - : PartUploadSettings(settings) + String setting_name_prefix, + bool validate_settings) + : PartUploadSettings(settings, validate_settings) { String key = config_prefix + "." + setting_name_prefix; strict_upload_part_size = config.getUInt64(key + "strict_upload_part_size", strict_upload_part_size); @@ -46,7 +48,8 @@ S3Settings::RequestSettings::PartUploadSettings::PartUploadSettings( storage_class_name = config.getString(config_prefix + ".s3_storage_class", storage_class_name); storage_class_name = Poco::toUpperInPlace(storage_class_name); - validate(); + if (validate_settings) + validate(); } S3Settings::RequestSettings::PartUploadSettings::PartUploadSettings(const NamedCollection & collection) @@ -170,8 +173,8 @@ void S3Settings::RequestSettings::PartUploadSettings::validate() } -S3Settings::RequestSettings::RequestSettings(const Settings & settings) - : upload_settings(settings) +S3Settings::RequestSettings::RequestSettings(const Settings & settings, bool validate_settings) + : upload_settings(settings, validate_settings) { updateFromSettingsImpl(settings, false); } @@ -190,8 +193,9 @@ S3Settings::RequestSettings::RequestSettings( const Poco::Util::AbstractConfiguration & config, const String & config_prefix, const Settings & settings, - String setting_name_prefix) - : upload_settings(config, config_prefix, settings, setting_name_prefix) + String setting_name_prefix, + bool validate_settings) + : upload_settings(config, config_prefix, settings, setting_name_prefix, validate_settings) { String key = config_prefix + "." 
+ setting_name_prefix; max_single_read_retries = config.getUInt64(key + "max_single_read_retries", settings.s3_max_single_read_retries); diff --git a/src/Storages/StorageS3Settings.h b/src/Storages/StorageS3Settings.h index e09be8654e7..c3bc8aa6ed6 100644 --- a/src/Storages/StorageS3Settings.h +++ b/src/Storages/StorageS3Settings.h @@ -44,13 +44,14 @@ struct S3Settings private: PartUploadSettings() = default; - explicit PartUploadSettings(const Settings & settings); + explicit PartUploadSettings(const Settings & settings, bool validate_settings = true); explicit PartUploadSettings(const NamedCollection & collection); PartUploadSettings( const Poco::Util::AbstractConfiguration & config, const String & config_prefix, const Settings & settings, - String setting_name_prefix = {}); + String setting_name_prefix = {}, + bool validate_settings = true); friend struct RequestSettings; }; @@ -78,7 +79,7 @@ struct S3Settings void setStorageClassName(const String & storage_class_name) { upload_settings.storage_class_name = storage_class_name; } RequestSettings() = default; - explicit RequestSettings(const Settings & settings); + explicit RequestSettings(const Settings & settings, bool validate_settings = true); explicit RequestSettings(const NamedCollection & collection); /// What's the setting_name_prefix, and why do we need it? @@ -92,7 +93,8 @@ struct S3Settings const Poco::Util::AbstractConfiguration & config, const String & config_prefix, const Settings & settings, - String setting_name_prefix = {}); + String setting_name_prefix = {}, + bool validate_settings = true); void updateFromSettingsIfChanged(const Settings & settings); From 4e1005bc43fabce6baf28f5f91b8a6db0315cc7d Mon Sep 17 00:00:00 2001 From: kssenii Date: Wed, 17 Apr 2024 14:13:21 +0100 Subject: [PATCH 070/392] Fix s3 throttler --- src/Storages/ObjectStorage/S3/Configuration.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Storages/ObjectStorage/S3/Configuration.cpp b/src/Storages/ObjectStorage/S3/Configuration.cpp index 46be0a01862..4c9e49d0705 100644 --- a/src/Storages/ObjectStorage/S3/Configuration.cpp +++ b/src/Storages/ObjectStorage/S3/Configuration.cpp @@ -79,7 +79,9 @@ ObjectStoragePtr StorageS3Configuration::createObjectStorage(ContextPtr context, auto s3_settings = getSettings(config, config_prefix, context, false); /// FIXME: add a setting + request_settings.updateFromSettingsIfChanged(context->getSettingsRef()); auth_settings.updateFrom(s3_settings->auth_settings); + s3_settings->auth_settings = auth_settings; s3_settings->request_settings = request_settings; From 6bb3ad3133e3c7c767048bb32d85276bed726247 Mon Sep 17 00:00:00 2001 From: serxa Date: Wed, 17 Apr 2024 13:15:07 +0000 Subject: [PATCH 071/392] Save the stacktrace of thread waiting on failing AsyncLoader job into exception --- src/Common/AsyncLoader.cpp | 3 ++- src/Common/ErrorCodes.cpp | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/Common/AsyncLoader.cpp b/src/Common/AsyncLoader.cpp index 3bec30893b9..29ea59b82ed 100644 --- a/src/Common/AsyncLoader.cpp +++ b/src/Common/AsyncLoader.cpp @@ -30,6 +30,7 @@ namespace ErrorCodes extern const int ASYNC_LOAD_CYCLE; extern const int ASYNC_LOAD_FAILED; extern const int ASYNC_LOAD_CANCELED; + extern const int ASYNC_LOAD_WAIT_FAILED; extern const int LOGICAL_ERROR; } @@ -433,7 +434,7 @@ void AsyncLoader::wait(const LoadJobPtr & job, bool no_throw) std::unique_lock job_lock{job->mutex}; wait(job_lock, job); if (!no_throw && job->load_exception) - std::rethrow_exception(job->load_exception); + 
throw Exception(ErrorCodes::ASYNC_LOAD_WAIT_FAILED, "Waited job failed: {}", getExceptionMessage(job->load_exception, /* with_stacktrace = */ false)); } void AsyncLoader::remove(const LoadJobSet & jobs) diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index 97a339b2bac..9fad2f1ff02 100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -598,6 +598,7 @@ M(717, EXPERIMENTAL_FEATURE_ERROR) \ M(718, TOO_SLOW_PARSING) \ M(719, QUERY_CACHE_USED_WITH_SYSTEM_TABLE) \ + M(720, ASYNC_LOAD_WAIT_FAILED) \ \ M(900, DISTRIBUTED_CACHE_ERROR) \ M(901, CANNOT_USE_DISTRIBUTED_CACHE) \ From 51c8dd133888964b50c2fa3db5cf6069ccca0252 Mon Sep 17 00:00:00 2001 From: kssenii Date: Wed, 17 Apr 2024 16:17:57 +0100 Subject: [PATCH 072/392] Fix delta lake tests --- .../DataLakes/IStorageDataLake.h | 24 +++++++++++++++---- src/TableFunctions/ITableFunctionDataLake.h | 6 +++-- 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h b/src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h index 0e83bb70a2f..21ebc32c8ae 100644 --- a/src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h +++ b/src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h @@ -57,8 +57,8 @@ public: } return std::make_shared>( - base_configuration, std::move(metadata), configuration, object_storage, engine_name_, context, - table_id_, + base_configuration, std::move(metadata), configuration, object_storage, + engine_name_, context, table_id_, columns_.empty() ? ColumnsDescription(schema_from_metadata) : columns_, constraints_, comment_, format_settings_); } @@ -68,11 +68,23 @@ public: static ColumnsDescription getTableStructureFromData( ObjectStoragePtr object_storage_, ConfigurationPtr base_configuration, - const std::optional &, + const std::optional & format_settings_, ContextPtr local_context) { auto metadata = DataLakeMetadata::create(object_storage_, base_configuration, local_context); - return ColumnsDescription(metadata->getTableSchema()); + + auto schema_from_metadata = metadata->getTableSchema(); + if (schema_from_metadata != NamesAndTypesList{}) + { + return ColumnsDescription(std::move(schema_from_metadata)); + } + else + { + ConfigurationPtr configuration = base_configuration->clone(); + configuration->getPaths() = metadata->getDataFiles(); + return Storage::getTableStructureFromData( + object_storage_, configuration, format_settings_, local_context); + } } void updateConfiguration(ContextPtr local_context) override @@ -102,6 +114,10 @@ public: , base_configuration(base_configuration_) , current_metadata(std::move(metadata_)) { + if (base_configuration->format == "auto") + { + base_configuration->format = Storage::configuration->format; + } } private: diff --git a/src/TableFunctions/ITableFunctionDataLake.h b/src/TableFunctions/ITableFunctionDataLake.h index c86970307c0..8cbd855bb96 100644 --- a/src/TableFunctions/ITableFunctionDataLake.h +++ b/src/TableFunctions/ITableFunctionDataLake.h @@ -57,8 +57,10 @@ protected: auto object_storage = TableFunction::getObjectStorage(context, !is_insert_query); return Storage::getTableStructureFromData(object_storage, configuration, std::nullopt, context); } - - return parseColumnsListFromString(configuration->structure, context); + else + { + return parseColumnsListFromString(configuration->structure, context); + } } void parseArguments(const ASTPtr & ast_function, ContextPtr context) override From c8915a16a51719e6ba569806b377f01859971e87 Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 18 Apr 
2024 17:22:51 +0100 Subject: [PATCH 073/392] Fix a few mote tests --- src/Backups/BackupIO_AzureBlobStorage.cpp | 3 ++- .../registerBackupEngineAzureBlobStorage.cpp | 6 ++++-- src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp | 5 ++++- src/Disks/ObjectStorages/S3/diskSettings.cpp | 5 ++--- src/Storages/ObjectStorage/DataLakes/Common.cpp | 2 +- .../ObjectStorage/DataLakes/DeltaLakeMetadata.cpp | 12 ++++++------ .../ObjectStorage/DataLakes/DeltaLakeMetadata.h | 6 ++++-- .../ObjectStorage/DataLakes/HudiMetadata.h | 4 +++- .../ObjectStorage/DataLakes/IStorageDataLake.h | 14 +++++++++++--- 9 files changed, 37 insertions(+), 20 deletions(-) diff --git a/src/Backups/BackupIO_AzureBlobStorage.cpp b/src/Backups/BackupIO_AzureBlobStorage.cpp index 4dd54712e5e..673930b5976 100644 --- a/src/Backups/BackupIO_AzureBlobStorage.cpp +++ b/src/Backups/BackupIO_AzureBlobStorage.cpp @@ -193,7 +193,8 @@ void BackupWriterAzureBlobStorage::copyDataToFile( { copyDataToAzureBlobStorageFile( create_read_buffer, start_pos, length, client, configuration.container, - path_in_backup, settings, threadPoolCallbackRunnerUnsafe(getBackupsIOThreadPool().get(), "BackupWRAzure")); + fs::path(configuration.blob_path) / path_in_backup, settings, + threadPoolCallbackRunnerUnsafe(getBackupsIOThreadPool().get(), "BackupWRAzure")); } BackupWriterAzureBlobStorage::~BackupWriterAzureBlobStorage() = default; diff --git a/src/Backups/registerBackupEngineAzureBlobStorage.cpp b/src/Backups/registerBackupEngineAzureBlobStorage.cpp index 700c8cb222f..049a4b1a338 100644 --- a/src/Backups/registerBackupEngineAzureBlobStorage.cpp +++ b/src/Backups/registerBackupEngineAzureBlobStorage.cpp @@ -117,8 +117,10 @@ void registerBackupEngineAzureBlobStorage(BackupFactory & factory) throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Using archives with backups on clusters is disabled"); auto path = configuration.getPath(); - configuration.setPath(removeFileNameFromURL(path)); - archive_params.archive_name = configuration.getPath(); + auto filename = removeFileNameFromURL(path); + configuration.setPath(path); + + archive_params.archive_name = filename; archive_params.compression_method = params.compression_method; archive_params.compression_level = params.compression_level; archive_params.password = params.password; diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index f97d6f937ef..a2522212f90 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -60,7 +60,10 @@ void throwIfError(const Aws::Utils::Outcome & response) if (!response.IsSuccess()) { const auto & err = response.GetError(); - throw S3Exception(fmt::format("{} (Code: {})", err.GetMessage(), static_cast(err.GetErrorType())), err.GetErrorType()); + throw S3Exception( + fmt::format("{} (Code: {}, s3 exception: {})", + err.GetMessage(), static_cast(err.GetErrorType()), err.GetExceptionName()), + err.GetErrorType()); } } diff --git a/src/Disks/ObjectStorages/S3/diskSettings.cpp b/src/Disks/ObjectStorages/S3/diskSettings.cpp index 2bca7df7db9..66731e85d41 100644 --- a/src/Disks/ObjectStorages/S3/diskSettings.cpp +++ b/src/Disks/ObjectStorages/S3/diskSettings.cpp @@ -72,7 +72,6 @@ std::unique_ptr getClient( if (for_disk_s3) { String endpoint = context->getMacros()->expand(config.getString(config_prefix + ".endpoint")); - url = S3::URI(endpoint); if (!url.key.ends_with('/')) url.key.push_back('/'); @@ -103,8 +102,8 @@ std::unique_ptr getClient( 
client_configuration.endpointOverride = url.endpoint; client_configuration.maxConnections = static_cast(request_settings.max_connections); - client_configuration.connectTimeoutMs = config.getUInt(config_prefix + ".connect_timeout_ms", S3::DEFAULT_CONNECT_TIMEOUT_MS); - client_configuration.requestTimeoutMs = config.getUInt(config_prefix + ".request_timeout_ms", S3::DEFAULT_REQUEST_TIMEOUT_MS); + client_configuration.connectTimeoutMs = config.getUInt64(config_prefix + ".connect_timeout_ms", local_settings.s3_connect_timeout_ms.value); + client_configuration.requestTimeoutMs = config.getUInt64(config_prefix + ".request_timeout_ms", local_settings.s3_request_timeout_ms.value); client_configuration.maxConnections = config.getUInt(config_prefix + ".max_connections", S3::DEFAULT_MAX_CONNECTIONS); client_configuration.http_keep_alive_timeout = config.getUInt(config_prefix + ".http_keep_alive_timeout", S3::DEFAULT_KEEP_ALIVE_TIMEOUT); client_configuration.http_keep_alive_max_requests = config.getUInt(config_prefix + ".http_keep_alive_max_requests", S3::DEFAULT_KEEP_ALIVE_MAX_REQUESTS); diff --git a/src/Storages/ObjectStorage/DataLakes/Common.cpp b/src/Storages/ObjectStorage/DataLakes/Common.cpp index 5f0138078d4..0c9237127b9 100644 --- a/src/Storages/ObjectStorage/DataLakes/Common.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Common.cpp @@ -21,7 +21,7 @@ std::vector listFiles( if (filename.ends_with(suffix)) res.push_back(filename); } - LOG_TRACE(getLogger("DataLakeCommon"), "Listed {} files", res.size()); + LOG_TRACE(getLogger("DataLakeCommon"), "Listed {} files ({})", res.size(), fmt::join(res, ", ")); return res; } diff --git a/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.cpp index 123c63439b0..d0f203b32bd 100644 --- a/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.cpp +++ b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.cpp @@ -27,10 +27,11 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; } -struct DeltaLakeMetadata::Impl final : private WithContext +struct DeltaLakeMetadata::Impl { ObjectStoragePtr object_storage; ConfigurationPtr configuration; + ContextPtr context; /** * Useful links: @@ -39,9 +40,9 @@ struct DeltaLakeMetadata::Impl final : private WithContext Impl(ObjectStoragePtr object_storage_, ConfigurationPtr configuration_, ContextPtr context_) - : WithContext(context_) - , object_storage(object_storage_) + : object_storage(object_storage_) , configuration(configuration_) + , context(context_) { } @@ -137,7 +138,7 @@ struct DeltaLakeMetadata::Impl final : private WithContext */ void processMetadataFile(const String & key, std::set & result) { - auto read_settings = getContext()->getReadSettings(); + auto read_settings = context->getReadSettings(); auto buf = object_storage->readObject(StoredObject(key), read_settings); char c; @@ -190,7 +191,7 @@ struct DeltaLakeMetadata::Impl final : private WithContext return 0; String json_str; - auto read_settings = getContext()->getReadSettings(); + auto read_settings = context->getReadSettings(); auto buf = object_storage->readObject(StoredObject(last_checkpoint_file), read_settings); readJSONObjectPossiblyInvalid(json_str, *buf); @@ -252,7 +253,6 @@ struct DeltaLakeMetadata::Impl final : private WithContext LOG_TRACE(log, "Using checkpoint file: {}", checkpoint_path.string()); - auto context = getContext(); auto read_settings = context->getReadSettings(); auto buf = object_storage->readObject(StoredObject(checkpoint_path), read_settings); auto 
format_settings = getFormatSettings(context); diff --git a/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.h b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.h index 1a5bb85586a..5050b88d809 100644 --- a/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.h +++ b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.h @@ -9,7 +9,7 @@ namespace DB { -class DeltaLakeMetadata final : public IDataLakeMetadata, private WithContext +class DeltaLakeMetadata final : public IDataLakeMetadata { public: using ConfigurationPtr = StorageObjectStorageConfigurationPtr; @@ -28,7 +28,9 @@ public: bool operator ==(const IDataLakeMetadata & other) const override { const auto * deltalake_metadata = dynamic_cast(&other); - return deltalake_metadata && getDataFiles() == deltalake_metadata->getDataFiles(); + return deltalake_metadata + && !data_files.empty() && !deltalake_metadata->data_files.empty() + && data_files == deltalake_metadata->data_files; } static DataLakeMetadataPtr create( diff --git a/src/Storages/ObjectStorage/DataLakes/HudiMetadata.h b/src/Storages/ObjectStorage/DataLakes/HudiMetadata.h index ee8b1ea4978..6054c3f15d6 100644 --- a/src/Storages/ObjectStorage/DataLakes/HudiMetadata.h +++ b/src/Storages/ObjectStorage/DataLakes/HudiMetadata.h @@ -29,7 +29,9 @@ public: bool operator ==(const IDataLakeMetadata & other) const override { const auto * hudi_metadata = dynamic_cast(&other); - return hudi_metadata && getDataFiles() == hudi_metadata->getDataFiles(); + return hudi_metadata + && !data_files.empty() && !hudi_metadata->data_files.empty() + && data_files == hudi_metadata->data_files; } static DataLakeMetadataPtr create( diff --git a/src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h b/src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h index 21ebc32c8ae..64228e880f8 100644 --- a/src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h +++ b/src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h @@ -42,17 +42,25 @@ public: auto object_storage = base_configuration->createObjectStorage(context); DataLakeMetadataPtr metadata; NamesAndTypesList schema_from_metadata; + + if (base_configuration->format == "auto") + base_configuration->format = "Parquet"; + ConfigurationPtr configuration = base_configuration->clone(); + try { metadata = DataLakeMetadata::create(object_storage, base_configuration, context); schema_from_metadata = metadata->getTableSchema(); - configuration->getPaths() = metadata->getDataFiles(); + configuration->setPaths(metadata->getDataFiles()); } catch (...) { if (mode <= LoadingStrictnessLevel::CREATE) throw; + + metadata.reset(); + configuration->setPaths({}); tryLogCurrentException(__PRETTY_FUNCTION__); } @@ -100,8 +108,8 @@ public: current_metadata = std::move(new_metadata); auto updated_configuration = base_configuration->clone(); - /// If metadata wasn't changed, we won't list data files again. 
- updated_configuration->getPaths() = current_metadata->getDataFiles(); + updated_configuration->setPaths(current_metadata->getDataFiles()); + Storage::configuration = updated_configuration; } From e2e6071063b4ce09530746c9ef49d12a36ccec37 Mon Sep 17 00:00:00 2001 From: kssenii Date: Fri, 19 Apr 2024 13:43:43 +0100 Subject: [PATCH 074/392] Fix a few more tests --- .../ObjectStorages/HDFS/HDFSObjectStorage.cpp | 21 ++++ .../ObjectStorages/HDFS/HDFSObjectStorage.h | 8 +- src/Disks/ObjectStorages/S3/diskSettings.cpp | 3 +- .../ObjectStorage/AzureBlob/Configuration.cpp | 97 +++++++++++++++---- .../ObjectStorage/HDFS/Configuration.cpp | 10 +- .../ObjectStorage/StorageObjectStorage.cpp | 50 +++------- .../StorageObjectStorageQuerySettings.h | 2 +- .../StorageObjectStorageSink.cpp | 9 ++ .../ObjectStorage/StorageObjectStorageSink.h | 3 + src/Storages/ObjectStorage/Utils.cpp | 43 ++++++++ src/Storages/ObjectStorage/Utils.h | 17 ++++ tests/integration/test_storage_hdfs/test.py | 8 +- .../test_storage_kerberized_hdfs/test.py | 2 +- 13 files changed, 204 insertions(+), 69 deletions(-) create mode 100644 src/Storages/ObjectStorage/Utils.cpp create mode 100644 src/Storages/ObjectStorage/Utils.h diff --git a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp index 82c9a6c6c21..fc7d49324c7 100644 --- a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp +++ b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp @@ -31,8 +31,18 @@ void HDFSObjectStorage::startup() { } +void HDFSObjectStorage::initializeHDFS() const +{ + if (hdfs_fs) + return; + + hdfs_builder = createHDFSBuilder(url, config); + hdfs_fs = createHDFSFS(hdfs_builder.get()); +} + ObjectStorageKey HDFSObjectStorage::generateObjectKeyForPath(const std::string & /* path */) const { + initializeHDFS(); /// what ever data_source_description.description value is, consider that key as relative key chassert(data_directory.starts_with("/")); return ObjectStorageKey::createAsRelative( @@ -41,6 +51,7 @@ ObjectStorageKey HDFSObjectStorage::generateObjectKeyForPath(const std::string & bool HDFSObjectStorage::exists(const StoredObject & object) const { + initializeHDFS(); std::string path = object.remote_path; if (path.starts_with(url_without_path)) path = path.substr(url_without_path.size()); @@ -57,6 +68,7 @@ std::unique_ptr HDFSObjectStorage::readObject( /// NOLIN std::optional, std::optional) const { + initializeHDFS(); std::string path = object.remote_path; if (path.starts_with(url)) path = path.substr(url.size()); @@ -73,6 +85,7 @@ std::unique_ptr HDFSObjectStorage::readObjects( /// NOLI std::optional, std::optional) const { + initializeHDFS(); auto disk_read_settings = patchSettings(read_settings); auto read_buffer_creator = [this, disk_read_settings] @@ -102,6 +115,7 @@ std::unique_ptr HDFSObjectStorage::writeObject( /// NOL size_t buf_size, const WriteSettings & write_settings) { + initializeHDFS(); if (attributes.has_value()) throw Exception( ErrorCodes::UNSUPPORTED_METHOD, @@ -123,6 +137,7 @@ std::unique_ptr HDFSObjectStorage::writeObject( /// NOL /// Remove file. Throws exception if file doesn't exists or it's a directory. 
void HDFSObjectStorage::removeObject(const StoredObject & object) { + initializeHDFS(); auto path = object.remote_path; if (path.starts_with(url_without_path)) path = path.substr(url_without_path.size()); @@ -136,24 +151,28 @@ void HDFSObjectStorage::removeObject(const StoredObject & object) void HDFSObjectStorage::removeObjects(const StoredObjects & objects) { + initializeHDFS(); for (const auto & object : objects) removeObject(object); } void HDFSObjectStorage::removeObjectIfExists(const StoredObject & object) { + initializeHDFS(); if (exists(object)) removeObject(object); } void HDFSObjectStorage::removeObjectsIfExist(const StoredObjects & objects) { + initializeHDFS(); for (const auto & object : objects) removeObjectIfExists(object); } ObjectMetadata HDFSObjectStorage::getObjectMetadata(const std::string & path) const { + initializeHDFS(); auto * file_info = hdfsGetPathInfo(hdfs_fs.get(), path.data()); if (!file_info) throw Exception(ErrorCodes::HDFS_ERROR, @@ -169,6 +188,7 @@ ObjectMetadata HDFSObjectStorage::getObjectMetadata(const std::string & path) co void HDFSObjectStorage::listObjects(const std::string & path, RelativePathsWithMetadata & children, size_t max_keys) const { + initializeHDFS(); auto * log = &Poco::Logger::get("HDFSObjectStorage"); LOG_TRACE(log, "Trying to list files for {}", path); @@ -222,6 +242,7 @@ void HDFSObjectStorage::copyObject( /// NOLINT const WriteSettings & write_settings, std::optional object_to_attributes) { + initializeHDFS(); if (object_to_attributes.has_value()) throw Exception( ErrorCodes::UNSUPPORTED_METHOD, diff --git a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h index 8987fa5eaf1..f57b7e1fda8 100644 --- a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h +++ b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h @@ -37,8 +37,6 @@ public: SettingsPtr settings_, const Poco::Util::AbstractConfiguration & config_) : config(config_) - , hdfs_builder(createHDFSBuilder(hdfs_root_path_, config)) - , hdfs_fs(createHDFSFS(hdfs_builder.get())) , settings(std::move(settings_)) { const size_t begin_of_path = hdfs_root_path_.find('/', hdfs_root_path_.find("//") + 2); @@ -117,10 +115,12 @@ public: bool isRemote() const override { return true; } private: + void initializeHDFS() const; + const Poco::Util::AbstractConfiguration & config; - HDFSBuilderWrapper hdfs_builder; - HDFSFSPtr hdfs_fs; + mutable HDFSBuilderWrapper hdfs_builder; + mutable HDFSFSPtr hdfs_fs; SettingsPtr settings; std::string url; std::string url_without_path; diff --git a/src/Disks/ObjectStorages/S3/diskSettings.cpp b/src/Disks/ObjectStorages/S3/diskSettings.cpp index 66731e85d41..49300a9cd89 100644 --- a/src/Disks/ObjectStorages/S3/diskSettings.cpp +++ b/src/Disks/ObjectStorages/S3/diskSettings.cpp @@ -157,7 +157,8 @@ std::unique_ptr getClient( auth_settings.server_side_encryption_customer_key_base64, std::move(sse_kms_config), auth_settings.headers, - credentials_configuration); + credentials_configuration, + auth_settings.session_token); } } diff --git a/src/Storages/ObjectStorage/AzureBlob/Configuration.cpp b/src/Storages/ObjectStorage/AzureBlob/Configuration.cpp index 44ace9c3b65..4b826a0c721 100644 --- a/src/Storages/ObjectStorage/AzureBlob/Configuration.cpp +++ b/src/Storages/ObjectStorage/AzureBlob/Configuration.cpp @@ -381,7 +381,7 @@ void StorageAzureBlobConfiguration::fromAST(ASTs & engine_args, ContextPtr conte } void StorageAzureBlobConfiguration::addStructureAndFormatToArgs( - ASTs & args, const String & structure_, const 
String & /* format */, ContextPtr context) + ASTs & args, const String & structure_, const String & format_, ContextPtr context) { if (tryGetNamedCollectionWithOverrides(args, context)) { @@ -397,66 +397,129 @@ void StorageAzureBlobConfiguration::addStructureAndFormatToArgs( { throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Storage Azure requires 3 to 7 arguments: " - "StorageObjectStorage(connection_string|storage_account_url, container_name, blobpath, [account_name, account_key, format, compression, structure])"); + "StorageObjectStorage(connection_string|storage_account_url, container_name, " + "blobpath, [account_name, account_key, format, compression, structure])"); } + for (auto & arg : args) + arg = evaluateConstantExpressionOrIdentifierAsLiteral(arg, context); + auto structure_literal = std::make_shared(structure_); + auto format_literal = std::make_shared(format_); auto is_format_arg = [](const std::string & s) -> bool { return s == "auto" || FormatFactory::instance().getAllFormats().contains(s); }; + /// (connection_string, container_name, blobpath) if (args.size() == 3) { - /// Add format=auto & compression=auto before structure argument. - args.push_back(std::make_shared("auto")); + args.push_back(format_literal); + /// Add compression = "auto" before structure argument. args.push_back(std::make_shared("auto")); args.push_back(structure_literal); } + /// (connection_string, container_name, blobpath, structure) or + /// (connection_string, container_name, blobpath, format) + /// We can distinguish them by looking at the 4-th argument: check if it's format name or not. else if (args.size() == 4) { auto fourth_arg = checkAndGetLiteralArgument(args[3], "format/account_name/structure"); + /// (..., format) -> (..., format, compression, structure) if (is_format_arg(fourth_arg)) { + if (fourth_arg == "auto") + args[3] = format_literal; /// Add compression=auto before structure argument. args.push_back(std::make_shared("auto")); args.push_back(structure_literal); } + /// (..., structure) -> (..., format, compression, structure) else { - args.back() = structure_literal; + auto structure_arg = args.back(); + args[3] = format_literal; + /// Add compression=auto before structure argument. + args.push_back(std::make_shared("auto")); + if (fourth_arg == "auto") + args.push_back(structure_literal); + else + args.push_back(structure_arg); } } + /// (connection_string, container_name, blobpath, format, compression) or + /// (storage_account_url, container_name, blobpath, account_name, account_key) + /// We can distinguish them by looking at the 4-th argument: check if it's format name or not. else if (args.size() == 5) { auto fourth_arg = checkAndGetLiteralArgument(args[3], "format/account_name"); - if (!is_format_arg(fourth_arg)) + /// (..., format, compression) -> (..., format, compression, structure) + if (is_format_arg(fourth_arg)) { - /// Add format=auto & compression=auto before structure argument. - args.push_back(std::make_shared("auto")); - args.push_back(std::make_shared("auto")); + if (fourth_arg == "auto") + args[3] = format_literal; + args.push_back(structure_literal); } - args.push_back(structure_literal); - } - else if (args.size() == 6) - { - auto fourth_arg = checkAndGetLiteralArgument(args[3], "format/account_name"); - if (!is_format_arg(fourth_arg)) + /// (..., account_name, account_key) -> (..., account_name, account_key, format, compression, structure) + else { + args.push_back(format_literal); /// Add compression=auto before structure argument. 
args.push_back(std::make_shared("auto")); args.push_back(structure_literal); } + } + /// (connection_string, container_name, blobpath, format, compression, structure) or + /// (storage_account_url, container_name, blobpath, account_name, account_key, structure) or + /// (storage_account_url, container_name, blobpath, account_name, account_key, format) + else if (args.size() == 6) + { + auto fourth_arg = checkAndGetLiteralArgument(args[3], "format/account_name"); + auto sixth_arg = checkAndGetLiteralArgument(args[5], "format/structure"); + + /// (..., format, compression, structure) + if (is_format_arg(fourth_arg)) + { + if (fourth_arg == "auto") + args[3] = format_literal; + if (checkAndGetLiteralArgument(args[5], "structure") == "auto") + args[5] = structure_literal; + } + /// (..., account_name, account_key, format) -> (..., account_name, account_key, format, compression, structure) + else if (is_format_arg(sixth_arg)) + { + if (sixth_arg == "auto") + args[5] = format_literal; + /// Add compression=auto before structure argument. + args.push_back(std::make_shared("auto")); + args.push_back(structure_literal); + } + /// (..., account_name, account_key, structure) -> (..., account_name, account_key, format, compression, structure) else { - args.back() = structure_literal; + auto structure_arg = args.back(); + args[5] = format_literal; + /// Add compression=auto before structure argument. + args.push_back(std::make_shared("auto")); + if (sixth_arg == "auto") + args.push_back(structure_literal); + else + args.push_back(structure_arg); } } + /// (storage_account_url, container_name, blobpath, account_name, account_key, format, compression) else if (args.size() == 7) { + /// (..., format, compression) -> (..., format, compression, structure) + if (checkAndGetLiteralArgument(args[5], "format") == "auto") + args[5] = format_literal; args.push_back(structure_literal); } + /// (storage_account_url, container_name, blobpath, account_name, account_key, format, compression, structure) else if (args.size() == 8) { - args.back() = structure_literal; + if (checkAndGetLiteralArgument(args[5], "format") == "auto") + args[5] = format_literal; + if (checkAndGetLiteralArgument(args[7], "structure") == "auto") + args[7] = structure_literal; } } } diff --git a/src/Storages/ObjectStorage/HDFS/Configuration.cpp b/src/Storages/ObjectStorage/HDFS/Configuration.cpp index af191070329..84f0a7bfe9f 100644 --- a/src/Storages/ObjectStorage/HDFS/Configuration.cpp +++ b/src/Storages/ObjectStorage/HDFS/Configuration.cpp @@ -73,9 +73,11 @@ void StorageHDFSConfiguration::fromAST(ASTs & args, ContextPtr context, bool wit std::string url_str; url_str = checkAndGetLiteralArgument(args[0], "url"); + for (auto & arg : args) + arg = evaluateConstantExpressionOrIdentifierAsLiteral(arg, context); + if (args.size() > 1) { - args[1] = evaluateConstantExpressionOrIdentifierAsLiteral(args[1], context); format = checkAndGetLiteralArgument(args[1], "format_name"); } @@ -83,18 +85,15 @@ void StorageHDFSConfiguration::fromAST(ASTs & args, ContextPtr context, bool wit { if (args.size() > 2) { - args[2] = evaluateConstantExpressionOrIdentifierAsLiteral(args[2], context); structure = checkAndGetLiteralArgument(args[2], "structure"); } if (args.size() > 3) { - args[3] = evaluateConstantExpressionOrIdentifierAsLiteral(args[3], context); compression_method = checkAndGetLiteralArgument(args[3], "compression_method"); } } else if (args.size() > 2) { - args[2] = evaluateConstantExpressionOrIdentifierAsLiteral(args[2], context); compression_method = 
checkAndGetLiteralArgument(args[2], "compression_method"); } @@ -165,6 +164,9 @@ void StorageHDFSConfiguration::addStructureAndFormatToArgs( auto format_literal = std::make_shared(format_); auto structure_literal = std::make_shared(structure_); + for (auto & arg : args) + arg = evaluateConstantExpressionOrIdentifierAsLiteral(arg, context); + /// hdfs(url) if (count == 1) { diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index 8fc3de4de1b..13f3557d927 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include @@ -193,6 +194,7 @@ SinkToStoragePtr StorageObjectStorage::write( { updateConfiguration(local_context); const auto sample_block = metadata_snapshot->getSampleBlock(); + const auto & query_settings = StorageSettings::create(local_context->getSettingsRef()); if (configuration->withWildcard()) { @@ -209,7 +211,8 @@ SinkToStoragePtr StorageObjectStorage::write( { LOG_TEST(log, "Using PartitionedSink for {}", configuration->getPath()); return std::make_shared( - object_storage, configuration, format_settings, sample_block, local_context, partition_by_ast); + object_storage, configuration, query_settings, + format_settings, sample_block, local_context, partition_by_ast); } } @@ -220,46 +223,19 @@ SinkToStoragePtr StorageObjectStorage::write( getName(), configuration->getPath()); } - const auto storage_settings = StorageSettings::create(local_context->getSettingsRef()); - - auto configuration_copy = configuration->clone(); - if (!storage_settings.truncate_on_insert - && object_storage->exists(StoredObject(configuration->getPath()))) + auto & paths = configuration->getPaths(); + if (auto new_key = checkAndGetNewFileOnInsertIfNeeded( + *object_storage, *configuration, query_settings, paths.front(), paths.size())) { - if (storage_settings.create_new_file_on_insert) - { - auto & paths = configuration_copy->getPaths(); - size_t index = paths.size(); - const auto & first_key = paths[0]; - auto pos = first_key.find_first_of('.'); - String new_key; - - do - { - new_key = first_key.substr(0, pos) - + "." - + std::to_string(index) - + (pos == std::string::npos ? "" : first_key.substr(pos)); - ++index; - } - while (object_storage->exists(StoredObject(new_key))); - - paths.push_back(new_key); - configuration->getPaths().push_back(new_key); - } - else - { - throw Exception( - ErrorCodes::BAD_ARGUMENTS, - "Object in bucket {} with key {} already exists. 
" - "If you want to overwrite it, enable setting [engine_name]_truncate_on_insert, if you " - "want to create a new file on each insert, enable setting [engine_name]_create_new_file_on_insert", - configuration_copy->getNamespace(), configuration_copy->getPaths().back()); - } + paths.push_back(*new_key); } return std::make_shared( - object_storage, configuration_copy, format_settings, sample_block, local_context); + object_storage, + configuration->clone(), + format_settings, + sample_block, + local_context); } template diff --git a/src/Storages/ObjectStorage/StorageObjectStorageQuerySettings.h b/src/Storages/ObjectStorage/StorageObjectStorageQuerySettings.h index f0687776aa7..606456011c3 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageQuerySettings.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageQuerySettings.h @@ -84,7 +84,7 @@ struct HDFSStorageSettings .create_new_file_on_insert = settings.hdfs_create_new_file_on_insert, .schema_inference_use_cache = settings.schema_inference_use_cache_for_hdfs, .schema_inference_mode = settings.schema_inference_mode, - .skip_empty_files = settings.s3_skip_empty_files, /// TODO: add setting for hdfs + .skip_empty_files = settings.hdfs_skip_empty_files, /// TODO: add setting for hdfs .list_object_keys_size = settings.s3_list_object_keys_size, /// TODO: add a setting for hdfs .throw_on_zero_files_match = settings.s3_throw_on_zero_files_match, .ignore_non_existent_file = settings.hdfs_ignore_file_doesnt_exist, diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp index 8381737a4f5..42371764920 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp @@ -2,6 +2,7 @@ #include #include #include +#include namespace DB { @@ -102,6 +103,7 @@ void StorageObjectStorageSink::release() PartitionedStorageObjectStorageSink::PartitionedStorageObjectStorageSink( ObjectStoragePtr object_storage_, StorageObjectStorageConfigurationPtr configuration_, + const StorageObjectStorageSettings & query_settings_, std::optional format_settings_, const Block & sample_block_, ContextPtr context_, @@ -109,6 +111,7 @@ PartitionedStorageObjectStorageSink::PartitionedStorageObjectStorageSink( : PartitionedSink(partition_by, context_, sample_block_) , object_storage(object_storage_) , configuration(configuration_) + , query_settings(query_settings_) , format_settings(format_settings_) , sample_block(sample_block_) , context(context_) @@ -123,6 +126,12 @@ SinkPtr PartitionedStorageObjectStorageSink::createSinkForPartition(const String auto partition_key = replaceWildcards(configuration->getPath(), partition_id); validateKey(partition_key); + if (auto new_key = checkAndGetNewFileOnInsertIfNeeded( + *object_storage, *configuration, query_settings, partition_key, /* sequence_number */1)) + { + partition_key = *new_key; + } + return std::make_shared( object_storage, configuration, diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSink.h b/src/Storages/ObjectStorage/StorageObjectStorageSink.h index a352e2c66a3..38805332a35 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSink.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageSink.h @@ -1,6 +1,7 @@ #pragma once #include #include +#include #include #include @@ -46,6 +47,7 @@ public: PartitionedStorageObjectStorageSink( ObjectStoragePtr object_storage_, StorageObjectStorageConfigurationPtr configuration_, + const StorageObjectStorageSettings & 
query_settings_, std::optional format_settings_, const Block & sample_block_, ContextPtr context_, @@ -59,6 +61,7 @@ private: ObjectStoragePtr object_storage; StorageObjectStorageConfigurationPtr configuration; + const StorageObjectStorageSettings query_settings; const std::optional format_settings; const Block sample_block; const ContextPtr context; diff --git a/src/Storages/ObjectStorage/Utils.cpp b/src/Storages/ObjectStorage/Utils.cpp new file mode 100644 index 00000000000..6cc3962209f --- /dev/null +++ b/src/Storages/ObjectStorage/Utils.cpp @@ -0,0 +1,43 @@ +#include +#include +#include +#include + + +namespace DB +{ + +std::optional checkAndGetNewFileOnInsertIfNeeded( + const IObjectStorage & object_storage, + const StorageObjectStorageConfiguration & configuration, + const StorageObjectStorageSettings & query_settings, + const String & key, + size_t sequence_number) +{ + if (query_settings.truncate_on_insert + || !object_storage.exists(StoredObject(key))) + return std::nullopt; + + if (query_settings.create_new_file_on_insert) + { + auto pos = key.find_first_of('.'); + String new_key; + do + { + new_key = key.substr(0, pos) + "." + std::to_string(sequence_number) + (pos == std::string::npos ? "" : key.substr(pos)); + ++sequence_number; + } + while (object_storage.exists(StoredObject(new_key))); + + return new_key; + } + + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Object in bucket {} with key {} already exists. " + "If you want to overwrite it, enable setting s3_truncate_on_insert, if you " + "want to create a new file on each insert, enable setting s3_create_new_file_on_insert", + configuration.getNamespace(), key); +} + +} diff --git a/src/Storages/ObjectStorage/Utils.h b/src/Storages/ObjectStorage/Utils.h new file mode 100644 index 00000000000..9291bb72615 --- /dev/null +++ b/src/Storages/ObjectStorage/Utils.h @@ -0,0 +1,17 @@ +#include + +namespace DB +{ + +class IObjectStorage; +class StorageObjectStorageConfiguration; +struct StorageObjectStorageSettings; + +std::optional checkAndGetNewFileOnInsertIfNeeded( + const IObjectStorage & object_storage, + const StorageObjectStorageConfiguration & configuration, + const StorageObjectStorageSettings & query_settings, + const std::string & key, + size_t sequence_number); + +} diff --git a/tests/integration/test_storage_hdfs/test.py b/tests/integration/test_storage_hdfs/test.py index d8dab85ee6a..dc375b9ec36 100644 --- a/tests/integration/test_storage_hdfs/test.py +++ b/tests/integration/test_storage_hdfs/test.py @@ -980,7 +980,7 @@ def test_read_subcolumns(started_cluster): assert ( res - == "2\thdfs://hdfs1:9000/test_subcolumns.tsv\t(1,2)\ttest_subcolumns.tsv\t3\n" + == "2\t/test_subcolumns.tsv\t(1,2)\ttest_subcolumns.tsv\t3\n" ) res = node.query( @@ -989,7 +989,7 @@ def test_read_subcolumns(started_cluster): assert ( res - == "2\thdfs://hdfs1:9000/test_subcolumns.jsonl\t(1,2)\ttest_subcolumns.jsonl\t3\n" + == "2\t/test_subcolumns.jsonl\t(1,2)\ttest_subcolumns.jsonl\t3\n" ) res = node.query( @@ -998,7 +998,7 @@ def test_read_subcolumns(started_cluster): assert ( res - == "0\thdfs://hdfs1:9000/test_subcolumns.jsonl\t(0,0)\ttest_subcolumns.jsonl\t0\n" + == "0\t/test_subcolumns.jsonl\t(0,0)\ttest_subcolumns.jsonl\t0\n" ) res = node.query( @@ -1007,7 +1007,7 @@ def test_read_subcolumns(started_cluster): assert ( res - == "42\thdfs://hdfs1:9000/test_subcolumns.jsonl\t(42,42)\ttest_subcolumns.jsonl\t42\n" + == "42\t/test_subcolumns.jsonl\t(42,42)\ttest_subcolumns.jsonl\t42\n" ) diff --git 
a/tests/integration/test_storage_kerberized_hdfs/test.py b/tests/integration/test_storage_kerberized_hdfs/test.py index c72152fa376..ddfc1f6483d 100644 --- a/tests/integration/test_storage_kerberized_hdfs/test.py +++ b/tests/integration/test_storage_kerberized_hdfs/test.py @@ -130,7 +130,7 @@ def test_prohibited(started_cluster): assert False, "Exception have to be thrown" except Exception as ex: assert ( - "Unable to open HDFS file: /storage_user_two_prohibited error: Permission denied: user=specuser, access=WRITE" + "Unable to open HDFS file: /storage_user_two_prohibited (hdfs://suser@kerberizedhdfs1:9010/storage_user_two_prohibited) error: Permission denied: user=specuser, access=WRITE" in str(ex) ) From 191937c0c6c5e5a31c6045269026ca1a1e5171c7 Mon Sep 17 00:00:00 2001 From: kssenii Date: Sat, 20 Apr 2024 10:19:55 +0100 Subject: [PATCH 075/392] Fix style check --- tests/integration/test_storage_hdfs/test.py | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/tests/integration/test_storage_hdfs/test.py b/tests/integration/test_storage_hdfs/test.py index dc375b9ec36..820e3db6eb1 100644 --- a/tests/integration/test_storage_hdfs/test.py +++ b/tests/integration/test_storage_hdfs/test.py @@ -978,37 +978,25 @@ def test_read_subcolumns(started_cluster): f"select a.b.d, _path, a.b, _file, a.e from hdfs('hdfs://hdfs1:9000/test_subcolumns.tsv', auto, 'a Tuple(b Tuple(c UInt32, d UInt32), e UInt32)')" ) - assert ( - res - == "2\t/test_subcolumns.tsv\t(1,2)\ttest_subcolumns.tsv\t3\n" - ) + assert res == "2\t/test_subcolumns.tsv\t(1,2)\ttest_subcolumns.tsv\t3\n" res = node.query( f"select a.b.d, _path, a.b, _file, a.e from hdfs('hdfs://hdfs1:9000/test_subcolumns.jsonl', auto, 'a Tuple(b Tuple(c UInt32, d UInt32), e UInt32)')" ) - assert ( - res - == "2\t/test_subcolumns.jsonl\t(1,2)\ttest_subcolumns.jsonl\t3\n" - ) + assert res == "2\t/test_subcolumns.jsonl\t(1,2)\ttest_subcolumns.jsonl\t3\n" res = node.query( f"select x.b.d, _path, x.b, _file, x.e from hdfs('hdfs://hdfs1:9000/test_subcolumns.jsonl', auto, 'x Tuple(b Tuple(c UInt32, d UInt32), e UInt32)')" ) - assert ( - res - == "0\t/test_subcolumns.jsonl\t(0,0)\ttest_subcolumns.jsonl\t0\n" - ) + assert res == "0\t/test_subcolumns.jsonl\t(0,0)\ttest_subcolumns.jsonl\t0\n" res = node.query( f"select x.b.d, _path, x.b, _file, x.e from hdfs('hdfs://hdfs1:9000/test_subcolumns.jsonl', auto, 'x Tuple(b Tuple(c UInt32, d UInt32), e UInt32) default ((42, 42), 42)')" ) - assert ( - res - == "42\t/test_subcolumns.jsonl\t(42,42)\ttest_subcolumns.jsonl\t42\n" - ) + assert res == "42\t/test_subcolumns.jsonl\t(42,42)\ttest_subcolumns.jsonl\t42\n" def test_union_schema_inference_mode(started_cluster): From c7f0cfc4c2df850cf97c81febd61b3411c4e7869 Mon Sep 17 00:00:00 2001 From: kssenii Date: Sat, 20 Apr 2024 11:47:41 +0100 Subject: [PATCH 076/392] Fix style check --- src/Storages/ObjectStorage/Utils.cpp | 5 +++++ src/Storages/ObjectStorage/Utils.h | 1 + 2 files changed, 6 insertions(+) diff --git a/src/Storages/ObjectStorage/Utils.cpp b/src/Storages/ObjectStorage/Utils.cpp index 6cc3962209f..9caab709081 100644 --- a/src/Storages/ObjectStorage/Utils.cpp +++ b/src/Storages/ObjectStorage/Utils.cpp @@ -7,6 +7,11 @@ namespace DB { +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; +} + std::optional checkAndGetNewFileOnInsertIfNeeded( const IObjectStorage & object_storage, const StorageObjectStorageConfiguration & configuration, diff --git a/src/Storages/ObjectStorage/Utils.h b/src/Storages/ObjectStorage/Utils.h index 
9291bb72615..afc0f31a33f 100644
--- a/src/Storages/ObjectStorage/Utils.h
+++ b/src/Storages/ObjectStorage/Utils.h
@@ -1,3 +1,4 @@
+#pragma once
 #include
 
 namespace DB

From a4daf2b454c44e1891a61eaddf3a2fd965e5f880 Mon Sep 17 00:00:00 2001
From: kssenii
Date: Sat, 20 Apr 2024 14:46:32 +0100
Subject: [PATCH 077/392] Fix hdfs race

---
 src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp | 7 ++++++-
 src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h | 4 ++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp
index fc7d49324c7..ed63795cb05 100644
--- a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp
+++ b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp
@@ -33,11 +33,16 @@ void HDFSObjectStorage::startup()
 
 void HDFSObjectStorage::initializeHDFS() const
 {
-    if (hdfs_fs)
+    if (initialized)
+        return;
+
+    std::lock_guard lock(init_mutex);
+    if (initialized)
         return;
 
     hdfs_builder = createHDFSBuilder(url, config);
     hdfs_fs = createHDFSFS(hdfs_builder.get());
+    initialized = true;
 }
 
 ObjectStorageKey HDFSObjectStorage::generateObjectKeyForPath(const std::string & /* path */) const
diff --git a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h
index f57b7e1fda8..b626d3dc779 100644
--- a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h
+++ b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h
@@ -121,6 +121,10 @@ private:
 
     mutable HDFSBuilderWrapper hdfs_builder;
     mutable HDFSFSPtr hdfs_fs;
+
+    mutable std::mutex init_mutex;
+    mutable std::atomic_bool initialized{false};
+
     SettingsPtr settings;
     std::string url;
     std::string url_without_path;

From 399414bb40e517b315ab396669875af8e365ece0 Mon Sep 17 00:00:00 2001
From: kssenii
Date: Sat, 20 Apr 2024 17:27:54 +0100
Subject: [PATCH 078/392] Better

---
 src/Common/CurrentMetrics.cpp | 3 +
 src/Interpreters/InterpreterSystemQuery.cpp | 9 +-
 .../ObjectStorage/AzureBlob/Configuration.cpp | 15 ++
 .../ObjectStorage/AzureBlob/Configuration.h | 11 +-
 .../DataLakes/DeltaLakeMetadata.cpp | 2 +-
 .../DataLakes/IStorageDataLake.h | 17 +-
 .../DataLakes/registerDataLakeStorages.cpp | 7 +-
 .../ObjectStorage/HDFS/Configuration.cpp | 14 ++
 .../ObjectStorage/HDFS/Configuration.h | 11 +-
 .../ObjectStorage/ReadBufferIterator.cpp | 4 +-
 .../ObjectStorage/ReadBufferIterator.h | 4 +-
 ...rage.cpp => ReadFromObjectStorageStep.cpp} | 32 +---
 ...tStorage.h => ReadFromObjectStorageStep.h} | 18 +-
 .../ObjectStorage/S3/Configuration.cpp | 15 ++
 src/Storages/ObjectStorage/S3/Configuration.h | 11 +-
 .../ObjectStorage/StorageObjectStorage.cpp | 181 +++++++-----------
 .../ObjectStorage/StorageObjectStorage.h | 69 +++----
 .../StorageObjectStorageCluster.cpp | 89 ++++-----
 .../StorageObjectStorageCluster.h | 28 +--
 .../StorageObjectStorageConfiguration.h | 7 +
 .../StorageObjectStorageQuerySettings.h | 102 ----------
 .../StorageObjectStorageSink.cpp | 3 +-
 .../ObjectStorage/StorageObjectStorageSink.h | 4 +-
 .../StorageObjectStorageSource.cpp | 49 ++---
 .../StorageObjectStorageSource.h | 26 +--
 src/Storages/ObjectStorage/Utils.cpp | 42 +++-
 src/Storages/ObjectStorage/Utils.h | 14 +-
 .../registerStorageObjectStorage.cpp | 20 +-
 src/Storages/S3Queue/S3QueueSource.h | 3 +-
 src/Storages/S3Queue/StorageS3Queue.cpp | 25 +--
 src/Storages/S3Queue/StorageS3Queue.h | 3 +-
 .../StorageSystemSchemaInferenceCache.cpp | 9 +-
 src/TableFunctions/ITableFunctionDataLake.h | 2 +-
 .../TableFunctionObjectStorage.cpp | 78 ++++----
.../TableFunctionObjectStorage.h | 8 +- .../TableFunctionObjectStorageCluster.cpp | 16 +- .../TableFunctionObjectStorageCluster.h | 12 +- 37 files changed, 427 insertions(+), 536 deletions(-) rename src/Storages/ObjectStorage/{ReadFromStorageObjectStorage.cpp => ReadFromObjectStorageStep.cpp} (62%) rename src/Storages/ObjectStorage/{ReadFromStorageObjectStorage.h => ReadFromObjectStorageStep.h} (70%) delete mode 100644 src/Storages/ObjectStorage/StorageObjectStorageQuerySettings.h diff --git a/src/Common/CurrentMetrics.cpp b/src/Common/CurrentMetrics.cpp index 0f25397a961..983e737991c 100644 --- a/src/Common/CurrentMetrics.cpp +++ b/src/Common/CurrentMetrics.cpp @@ -168,6 +168,9 @@ M(ObjectStorageS3Threads, "Number of threads in the S3ObjectStorage thread pool.") \ M(ObjectStorageS3ThreadsActive, "Number of threads in the S3ObjectStorage thread pool running a task.") \ M(ObjectStorageS3ThreadsScheduled, "Number of queued or active jobs in the S3ObjectStorage thread pool.") \ + M(StorageObjectStorageThreads, "Number of threads in the remote table engines thread pools.") \ + M(StorageObjectStorageThreadsActive, "Number of threads in the remote table engines thread pool running a task.") \ + M(StorageObjectStorageThreadsScheduled, "Number of queued or active jobs in remote table engines thread pool.") \ M(ObjectStorageAzureThreads, "Number of threads in the AzureObjectStorage thread pool.") \ M(ObjectStorageAzureThreadsActive, "Number of threads in the AzureObjectStorage thread pool running a task.") \ M(ObjectStorageAzureThreadsScheduled, "Number of queued or active jobs in the AzureObjectStorage thread pool.") \ diff --git a/src/Interpreters/InterpreterSystemQuery.cpp b/src/Interpreters/InterpreterSystemQuery.cpp index 27b2a9460b7..af9dc08e8c7 100644 --- a/src/Interpreters/InterpreterSystemQuery.cpp +++ b/src/Interpreters/InterpreterSystemQuery.cpp @@ -53,6 +53,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -489,17 +492,17 @@ BlockIO InterpreterSystemQuery::execute() StorageFile::getSchemaCache(getContext()).clear(); #if USE_AWS_S3 if (caches_to_drop.contains("S3")) - StorageS3::getSchemaCache(getContext()).clear(); + StorageObjectStorage::getSchemaCache(getContext(), StorageS3Configuration::type_name).clear(); #endif #if USE_HDFS if (caches_to_drop.contains("HDFS")) - StorageHDFS::getSchemaCache(getContext()).clear(); + StorageObjectStorage::getSchemaCache(getContext(), StorageHDFSConfiguration::type_name).clear(); #endif if (caches_to_drop.contains("URL")) StorageURL::getSchemaCache(getContext()).clear(); #if USE_AZURE_BLOB_STORAGE if (caches_to_drop.contains("AZURE")) - StorageAzureBlob::getSchemaCache(getContext()).clear(); + StorageObjectStorage::getSchemaCache(getContext(), StorageAzureBlobConfiguration::type_name).clear(); #endif break; } diff --git a/src/Storages/ObjectStorage/AzureBlob/Configuration.cpp b/src/Storages/ObjectStorage/AzureBlob/Configuration.cpp index 4b826a0c721..c9bc59d62aa 100644 --- a/src/Storages/ObjectStorage/AzureBlob/Configuration.cpp +++ b/src/Storages/ObjectStorage/AzureBlob/Configuration.cpp @@ -101,6 +101,21 @@ AzureObjectStorage::SettingsPtr StorageAzureBlobConfiguration::createSettings(Co return settings_ptr; } +StorageObjectStorage::QuerySettings StorageAzureBlobConfiguration::getQuerySettings(const ContextPtr & context) const +{ + const auto & settings = context->getSettingsRef(); + return StorageObjectStorage::QuerySettings{ + .truncate_on_insert = settings.azure_truncate_on_insert, + 
.create_new_file_on_insert = settings.azure_create_new_file_on_insert, + .schema_inference_use_cache = settings.schema_inference_use_cache_for_azure, + .schema_inference_mode = settings.schema_inference_mode, + .skip_empty_files = settings.s3_skip_empty_files, /// TODO: add setting for azure + .list_object_keys_size = settings.azure_list_object_keys_size, + .throw_on_zero_files_match = settings.s3_throw_on_zero_files_match, + .ignore_non_existent_file = settings.azure_ignore_file_doesnt_exist, + }; +} + ObjectStoragePtr StorageAzureBlobConfiguration::createObjectStorage(ContextPtr context, bool is_readonly) /// NOLINT { assertInitialized(); diff --git a/src/Storages/ObjectStorage/AzureBlob/Configuration.h b/src/Storages/ObjectStorage/AzureBlob/Configuration.h index c12ff81197d..7e105ea82b5 100644 --- a/src/Storages/ObjectStorage/AzureBlob/Configuration.h +++ b/src/Storages/ObjectStorage/AzureBlob/Configuration.h @@ -18,9 +18,15 @@ class StorageAzureBlobConfiguration : public StorageObjectStorageConfiguration friend void registerBackupEngineAzureBlobStorage(BackupFactory & factory); public: + static constexpr auto type_name = "azure"; + static constexpr auto engine_name = "Azure"; + StorageAzureBlobConfiguration() = default; StorageAzureBlobConfiguration(const StorageAzureBlobConfiguration & other); + std::string getTypeName() const override { return type_name; } + std::string getEngineName() const override { return engine_name; } + Path getPath() const override { return blob_path; } void setPath(const Path & path) override { blob_path = path; } @@ -30,6 +36,7 @@ public: String getDataSourceDescription() override { return fs::path(connection_url) / container; } String getNamespace() const override { return container; } + StorageObjectStorage::QuerySettings getQuerySettings(const ContextPtr &) const override; void check(ContextPtr context) const override; ObjectStoragePtr createObjectStorage(ContextPtr context, bool is_readonly = true) override; /// NOLINT @@ -37,8 +44,8 @@ public: void fromNamedCollection(const NamedCollection & collection) override; void fromAST(ASTs & args, ContextPtr context, bool with_structure) override; - static void addStructureAndFormatToArgs( - ASTs & args, const String & structure_, const String & format_, ContextPtr context); + void addStructureAndFormatToArgs( + ASTs & args, const String & structure_, const String & format_, ContextPtr context) override; protected: using AzureClient = Azure::Storage::Blobs::BlobContainerClient; diff --git a/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.cpp index d0f203b32bd..c6590ba8d43 100644 --- a/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.cpp +++ b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.cpp @@ -184,7 +184,7 @@ struct DeltaLakeMetadata::Impl * * We need to get "version", which is the version of the checkpoint we need to read. 
*/ - size_t readLastCheckpointIfExists() + size_t readLastCheckpointIfExists() const { const auto last_checkpoint_file = fs::path(configuration->getPath()) / deltalake_metadata_directory / "_last_checkpoint"; if (!object_storage->exists(StoredObject(last_checkpoint_file))) diff --git a/src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h b/src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h index 64228e880f8..e1851775925 100644 --- a/src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h +++ b/src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h @@ -21,17 +21,16 @@ namespace DB /// Storage for read-only integration with Apache Iceberg tables in Amazon S3 (see https://iceberg.apache.org/) /// Right now it's implemented on top of StorageS3 and right now it doesn't support /// many Iceberg features like schema evolution, partitioning, positional and equality deletes. -template -class IStorageDataLake final : public StorageObjectStorage +template +class IStorageDataLake final : public StorageObjectStorage { public: - using Storage = StorageObjectStorage; + using Storage = StorageObjectStorage; using ConfigurationPtr = Storage::ConfigurationPtr; static StoragePtr create( ConfigurationPtr base_configuration, ContextPtr context, - const String & engine_name_, const StorageID & table_id_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, @@ -64,9 +63,9 @@ public: tryLogCurrentException(__PRETTY_FUNCTION__); } - return std::make_shared>( + return std::make_shared>( base_configuration, std::move(metadata), configuration, object_storage, - engine_name_, context, table_id_, + context, table_id_, columns_.empty() ? ColumnsDescription(schema_from_metadata) : columns_, constraints_, comment_, format_settings_); } @@ -133,9 +132,9 @@ private: DataLakeMetadataPtr current_metadata; }; -using StorageIceberg = IStorageDataLake; -using StorageDeltaLake = IStorageDataLake; -using StorageHudi = IStorageDataLake; +using StorageIceberg = IStorageDataLake; +using StorageDeltaLake = IStorageDataLake; +using StorageHudi = IStorageDataLake; } diff --git a/src/Storages/ObjectStorage/DataLakes/registerDataLakeStorages.cpp b/src/Storages/ObjectStorage/DataLakes/registerDataLakeStorages.cpp index d11dd1ca836..a5170e5ed6b 100644 --- a/src/Storages/ObjectStorage/DataLakes/registerDataLakeStorages.cpp +++ b/src/Storages/ObjectStorage/DataLakes/registerDataLakeStorages.cpp @@ -6,7 +6,6 @@ #include #include #include -#include namespace DB @@ -24,7 +23,7 @@ void registerStorageIceberg(StorageFactory & factory) StorageObjectStorageConfiguration::initialize(*configuration, args.engine_args, args.getLocalContext(), false); return StorageIceberg::create( - configuration, args.getContext(), "Iceberg", args.table_id, args.columns, + configuration, args.getContext(), args.table_id, args.columns, args.constraints, args.comment, std::nullopt, args.mode); }, { @@ -47,7 +46,7 @@ void registerStorageDeltaLake(StorageFactory & factory) StorageObjectStorageConfiguration::initialize(*configuration, args.engine_args, args.getLocalContext(), false); return StorageDeltaLake::create( - configuration, args.getContext(), "DeltaLake", args.table_id, args.columns, + configuration, args.getContext(), args.table_id, args.columns, args.constraints, args.comment, std::nullopt, args.mode); }, { @@ -68,7 +67,7 @@ void registerStorageHudi(StorageFactory & factory) StorageObjectStorageConfiguration::initialize(*configuration, args.engine_args, args.getLocalContext(), false); return StorageHudi::create( - 
configuration, args.getContext(), "Hudi", args.table_id, args.columns, + configuration, args.getContext(), args.table_id, args.columns, args.constraints, args.comment, std::nullopt, args.mode); }, { diff --git a/src/Storages/ObjectStorage/HDFS/Configuration.cpp b/src/Storages/ObjectStorage/HDFS/Configuration.cpp index 84f0a7bfe9f..0062ac969ac 100644 --- a/src/Storages/ObjectStorage/HDFS/Configuration.cpp +++ b/src/Storages/ObjectStorage/HDFS/Configuration.cpp @@ -60,6 +60,20 @@ std::string StorageHDFSConfiguration::getPathWithoutGlob() const return "/"; return path.substr(0, end_of_path_without_globs); } +StorageObjectStorage::QuerySettings StorageHDFSConfiguration::getQuerySettings(const ContextPtr & context) const +{ + const auto & settings = context->getSettingsRef(); + return StorageObjectStorage::QuerySettings{ + .truncate_on_insert = settings.hdfs_truncate_on_insert, + .create_new_file_on_insert = settings.hdfs_create_new_file_on_insert, + .schema_inference_use_cache = settings.schema_inference_use_cache_for_hdfs, + .schema_inference_mode = settings.schema_inference_mode, + .skip_empty_files = settings.hdfs_skip_empty_files, /// TODO: add setting for hdfs + .list_object_keys_size = settings.s3_list_object_keys_size, /// TODO: add a setting for hdfs + .throw_on_zero_files_match = settings.s3_throw_on_zero_files_match, + .ignore_non_existent_file = settings.hdfs_ignore_file_doesnt_exist, + }; +} void StorageHDFSConfiguration::fromAST(ASTs & args, ContextPtr context, bool with_structure) { diff --git a/src/Storages/ObjectStorage/HDFS/Configuration.h b/src/Storages/ObjectStorage/HDFS/Configuration.h index 23a7e8e4549..0a502857153 100644 --- a/src/Storages/ObjectStorage/HDFS/Configuration.h +++ b/src/Storages/ObjectStorage/HDFS/Configuration.h @@ -13,9 +13,15 @@ namespace DB class StorageHDFSConfiguration : public StorageObjectStorageConfiguration { public: + static constexpr auto type_name = "hdfs"; + static constexpr auto engine_name = "HDFS"; + StorageHDFSConfiguration() = default; StorageHDFSConfiguration(const StorageHDFSConfiguration & other); + std::string getTypeName() const override { return type_name; } + std::string getEngineName() const override { return engine_name; } + Path getPath() const override { return path; } void setPath(const Path & path_) override { path = path_; } @@ -25,13 +31,14 @@ public: String getNamespace() const override { return ""; } String getDataSourceDescription() override { return url; } + StorageObjectStorage::QuerySettings getQuerySettings(const ContextPtr &) const override; void check(ContextPtr context) const override; ObjectStoragePtr createObjectStorage(ContextPtr context, bool is_readonly = true) override; /// NOLINT StorageObjectStorageConfigurationPtr clone() override { return std::make_shared(*this); } - static void addStructureAndFormatToArgs( - ASTs & args, const String & structure_, const String & format_, ContextPtr context); + void addStructureAndFormatToArgs( + ASTs & args, const String & structure_, const String & format_, ContextPtr context) override; std::string getPathWithoutGlob() const override; diff --git a/src/Storages/ObjectStorage/ReadBufferIterator.cpp b/src/Storages/ObjectStorage/ReadBufferIterator.cpp index 0b6e34fb831..f8ce90a2b1f 100644 --- a/src/Storages/ObjectStorage/ReadBufferIterator.cpp +++ b/src/Storages/ObjectStorage/ReadBufferIterator.cpp @@ -1,5 +1,4 @@ #include -#include #include #include @@ -19,7 +18,6 @@ ReadBufferIterator::ReadBufferIterator( ConfigurationPtr configuration_, const FileIterator & 
file_iterator_, const std::optional & format_settings_, - const StorageObjectStorageSettings & query_settings_, SchemaCache & schema_cache_, ObjectInfos & read_keys_, const ContextPtr & context_) @@ -28,7 +26,7 @@ ReadBufferIterator::ReadBufferIterator( , configuration(configuration_) , file_iterator(file_iterator_) , format_settings(format_settings_) - , query_settings(query_settings_) + , query_settings(configuration->getQuerySettings(context_)) , schema_cache(schema_cache_) , read_keys(read_keys_) , format(configuration->format == "auto" ? std::nullopt : std::optional(configuration->format)) diff --git a/src/Storages/ObjectStorage/ReadBufferIterator.h b/src/Storages/ObjectStorage/ReadBufferIterator.h index 053bcbf894f..2d58e1c789e 100644 --- a/src/Storages/ObjectStorage/ReadBufferIterator.h +++ b/src/Storages/ObjectStorage/ReadBufferIterator.h @@ -2,7 +2,6 @@ #include #include #include -#include #include @@ -19,7 +18,6 @@ public: ConfigurationPtr configuration_, const FileIterator & file_iterator_, const std::optional & format_settings_, - const StorageObjectStorageSettings & query_settings_, SchemaCache & schema_cache_, ObjectInfos & read_keys_, const ContextPtr & context_); @@ -50,7 +48,7 @@ private: const ConfigurationPtr configuration; const FileIterator file_iterator; const std::optional & format_settings; - const StorageObjectStorageSettings query_settings; + const StorageObjectStorage::QuerySettings query_settings; SchemaCache & schema_cache; ObjectInfos & read_keys; std::optional format; diff --git a/src/Storages/ObjectStorage/ReadFromStorageObjectStorage.cpp b/src/Storages/ObjectStorage/ReadFromObjectStorageStep.cpp similarity index 62% rename from src/Storages/ObjectStorage/ReadFromStorageObjectStorage.cpp rename to src/Storages/ObjectStorage/ReadFromObjectStorageStep.cpp index 89d33191f41..f19e01cdc3e 100644 --- a/src/Storages/ObjectStorage/ReadFromStorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/ReadFromObjectStorageStep.cpp @@ -1,11 +1,11 @@ -#include +#include #include #include namespace DB { -ReadFromStorageObejctStorage::ReadFromStorageObejctStorage( +ReadFromObjectStorageStep::ReadFromObjectStorageStep( ObjectStoragePtr object_storage_, ConfigurationPtr configuration_, const String & name_, @@ -14,49 +14,41 @@ ReadFromStorageObejctStorage::ReadFromStorageObejctStorage( const SelectQueryInfo & query_info_, const StorageSnapshotPtr & storage_snapshot_, const std::optional & format_settings_, - const StorageObjectStorageSettings & query_settings_, bool distributed_processing_, ReadFromFormatInfo info_, SchemaCache & schema_cache_, const bool need_only_count_, ContextPtr context_, size_t max_block_size_, - size_t num_streams_, - CurrentMetrics::Metric metric_threads_count_, - CurrentMetrics::Metric metric_threads_active_, - CurrentMetrics::Metric metric_threads_scheduled_) + size_t num_streams_) : SourceStepWithFilter(DataStream{.header = info_.source_header}, columns_to_read, query_info_, storage_snapshot_, context_) , object_storage(object_storage_) , configuration(configuration_) , info(std::move(info_)) , virtual_columns(virtual_columns_) , format_settings(format_settings_) - , query_settings(query_settings_) + , query_settings(configuration->getQuerySettings(context_)) , schema_cache(schema_cache_) , name(name_ + "Source") , need_only_count(need_only_count_) , max_block_size(max_block_size_) , num_streams(num_streams_) , distributed_processing(distributed_processing_) - , metric_threads_count(metric_threads_count_) - , 
metric_threads_active(metric_threads_active_) - , metric_threads_scheduled(metric_threads_scheduled_) { } -void ReadFromStorageObejctStorage::createIterator(const ActionsDAG::Node * predicate) +void ReadFromObjectStorageStep::createIterator(const ActionsDAG::Node * predicate) { if (!iterator_wrapper) { auto context = getContext(); iterator_wrapper = StorageObjectStorageSource::createFileIterator( - configuration, object_storage, query_settings, distributed_processing, - context, predicate, virtual_columns, nullptr, metric_threads_count, - metric_threads_active, metric_threads_scheduled, context->getFileProgressCallback()); + configuration, object_storage, distributed_processing, + context, predicate, virtual_columns, nullptr, context->getFileProgressCallback()); } } -void ReadFromStorageObejctStorage::applyFilters(ActionDAGNodes added_filter_nodes) +void ReadFromObjectStorageStep::applyFilters(ActionDAGNodes added_filter_nodes) { filter_actions_dag = ActionsDAG::buildFilterActionsDAG(added_filter_nodes.nodes); const ActionsDAG::Node * predicate = nullptr; @@ -66,7 +58,7 @@ void ReadFromStorageObejctStorage::applyFilters(ActionDAGNodes added_filter_node createIterator(predicate); } -void ReadFromStorageObejctStorage::initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &) +void ReadFromObjectStorageStep::initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &) { createIterator(nullptr); auto context = getContext(); @@ -74,13 +66,9 @@ void ReadFromStorageObejctStorage::initializePipeline(QueryPipelineBuilder & pip Pipes pipes; for (size_t i = 0; i < num_streams; ++i) { - auto threadpool = std::make_shared( - metric_threads_count, metric_threads_active, metric_threads_scheduled, /* max_threads */1); - auto source = std::make_shared( getName(), object_storage, configuration, info, format_settings, query_settings, - context, max_block_size, iterator_wrapper, need_only_count, schema_cache, - std::move(threadpool), metric_threads_count, metric_threads_active, metric_threads_scheduled); + context, max_block_size, iterator_wrapper, need_only_count, schema_cache); source->setKeyCondition(filter_actions_dag, context); pipes.emplace_back(std::move(source)); diff --git a/src/Storages/ObjectStorage/ReadFromStorageObjectStorage.h b/src/Storages/ObjectStorage/ReadFromObjectStorageStep.h similarity index 70% rename from src/Storages/ObjectStorage/ReadFromStorageObjectStorage.h rename to src/Storages/ObjectStorage/ReadFromObjectStorageStep.h index c0dd02d75f8..d98ebfef1f2 100644 --- a/src/Storages/ObjectStorage/ReadFromStorageObjectStorage.h +++ b/src/Storages/ObjectStorage/ReadFromObjectStorageStep.h @@ -1,17 +1,16 @@ #pragma once -#include -#include #include +#include namespace DB { -class ReadFromStorageObejctStorage : public SourceStepWithFilter +class ReadFromObjectStorageStep : public SourceStepWithFilter { public: using ConfigurationPtr = StorageObjectStorageConfigurationPtr; - ReadFromStorageObejctStorage( + ReadFromObjectStorageStep( ObjectStoragePtr object_storage_, ConfigurationPtr configuration_, const String & name_, @@ -20,17 +19,13 @@ public: const SelectQueryInfo & query_info_, const StorageSnapshotPtr & storage_snapshot_, const std::optional & format_settings_, - const StorageObjectStorageSettings & query_settings_, bool distributed_processing_, ReadFromFormatInfo info_, SchemaCache & schema_cache_, bool need_only_count_, ContextPtr context_, size_t max_block_size_, - size_t num_streams_, - CurrentMetrics::Metric 
metric_threads_count_, - CurrentMetrics::Metric metric_threads_active_, - CurrentMetrics::Metric metric_threads_scheduled_); + size_t num_streams_); std::string getName() const override { return name; } @@ -46,16 +41,13 @@ private: const ReadFromFormatInfo info; const NamesAndTypesList virtual_columns; const std::optional format_settings; - const StorageObjectStorageSettings query_settings; + const StorageObjectStorage::QuerySettings query_settings; SchemaCache & schema_cache; const String name; const bool need_only_count; const size_t max_block_size; const size_t num_streams; const bool distributed_processing; - const CurrentMetrics::Metric metric_threads_count; - const CurrentMetrics::Metric metric_threads_active; - const CurrentMetrics::Metric metric_threads_scheduled; void createIterator(const ActionsDAG::Node * predicate); }; diff --git a/src/Storages/ObjectStorage/S3/Configuration.cpp b/src/Storages/ObjectStorage/S3/Configuration.cpp index 4c9e49d0705..139d9004f8e 100644 --- a/src/Storages/ObjectStorage/S3/Configuration.cpp +++ b/src/Storages/ObjectStorage/S3/Configuration.cpp @@ -70,6 +70,21 @@ StorageS3Configuration::StorageS3Configuration(const StorageS3Configuration & ot keys = other.keys; } +StorageObjectStorage::QuerySettings StorageS3Configuration::getQuerySettings(const ContextPtr & context) const +{ + const auto & settings = context->getSettingsRef(); + return StorageObjectStorage::QuerySettings{ + .truncate_on_insert = settings.s3_truncate_on_insert, + .create_new_file_on_insert = settings.s3_create_new_file_on_insert, + .schema_inference_use_cache = settings.schema_inference_use_cache_for_s3, + .schema_inference_mode = settings.schema_inference_mode, + .skip_empty_files = settings.s3_skip_empty_files, + .list_object_keys_size = settings.s3_list_object_keys_size, + .throw_on_zero_files_match = settings.s3_throw_on_zero_files_match, + .ignore_non_existent_file = settings.s3_ignore_file_doesnt_exist, + }; +} + ObjectStoragePtr StorageS3Configuration::createObjectStorage(ContextPtr context, bool /* is_readonly */) /// NOLINT { assertInitialized(); diff --git a/src/Storages/ObjectStorage/S3/Configuration.h b/src/Storages/ObjectStorage/S3/Configuration.h index ff5e8680e66..de4a6d17579 100644 --- a/src/Storages/ObjectStorage/S3/Configuration.h +++ b/src/Storages/ObjectStorage/S3/Configuration.h @@ -7,6 +7,7 @@ #include #include #include +#include namespace DB { @@ -14,9 +15,14 @@ namespace DB class StorageS3Configuration : public StorageObjectStorageConfiguration { public: + static constexpr auto type_name = "s3"; + StorageS3Configuration() = default; StorageS3Configuration(const StorageS3Configuration & other); + std::string getTypeName() const override { return type_name; } + std::string getEngineName() const override { return url.storage_name; } + Path getPath() const override { return url.key; } void setPath(const Path & path) override { url.key = path; } @@ -26,6 +32,7 @@ public: String getNamespace() const override { return url.bucket; } String getDataSourceDescription() override; + StorageObjectStorage::QuerySettings getQuerySettings(const ContextPtr &) const override; void check(ContextPtr context) const override; void validateNamespace(const String & name) const override; @@ -34,8 +41,8 @@ public: bool isStaticConfiguration() const override { return static_configuration; } ObjectStoragePtr createObjectStorage(ContextPtr context, bool is_readonly = true) override; /// NOLINT - static void addStructureAndFormatToArgs( - ASTs & args, const String & structure, const String & 
format, ContextPtr context); + void addStructureAndFormatToArgs( + ASTs & args, const String & structure, const String & format, ContextPtr context) override; private: void fromNamedCollection(const NamedCollection & collection) override; diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index 13f3557d927..441639629a3 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -11,10 +11,9 @@ #include #include #include -#include #include #include -#include +#include #include #include #include @@ -25,53 +24,13 @@ namespace DB namespace ErrorCodes { - extern const int BAD_ARGUMENTS; extern const int DATABASE_ACCESS_DENIED; extern const int NOT_IMPLEMENTED; } -template -std::unique_ptr getStorageMetadata( - ObjectStoragePtr object_storage, - const StorageObjectStorageConfigurationPtr & configuration, - const ColumnsDescription & columns, - const ConstraintsDescription & constraints, - std::optional format_settings, - const String & comment, - const std::string & engine_name, - const ContextPtr & context) -{ - using Storage = StorageObjectStorage; - - auto storage_metadata = std::make_unique(); - if (columns.empty()) - { - auto fetched_columns = Storage::getTableStructureFromData(object_storage, configuration, format_settings, context); - storage_metadata->setColumns(fetched_columns); - } - else if (!columns.hasOnlyOrdinary()) - { - /// We don't allow special columns. - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Table engine {} doesn't support special columns " - "like MATERIALIZED, ALIAS or EPHEMERAL", engine_name); - } - else - { - if (configuration->format == "auto") - Storage::setFormatFromData(object_storage, configuration, format_settings, context); - - storage_metadata->setColumns(columns); - } - storage_metadata->setConstraints(constraints); - storage_metadata->setComment(comment); - return storage_metadata; -} - -template -StorageObjectStorage::StorageObjectStorage( +StorageObjectStorage::StorageObjectStorage( ConfigurationPtr configuration_, ObjectStoragePtr object_storage_, - const String & engine_name_, ContextPtr context, const StorageID & table_id_, const ColumnsDescription & columns_, @@ -80,16 +39,13 @@ StorageObjectStorage::StorageObjectStorage( std::optional format_settings_, bool distributed_processing_, ASTPtr partition_by_) - : IStorage(table_id_, getStorageMetadata( - object_storage_, configuration_, columns_, constraints_, format_settings_, - comment, engine_name, context)) - , engine_name(engine_name_) + : IStorage(table_id_) + , configuration(configuration_) + , object_storage(object_storage_) , format_settings(format_settings_) , partition_by(partition_by_) , distributed_processing(distributed_processing_) - , log(getLogger("Storage" + engine_name_)) - , object_storage(object_storage_) - , configuration(configuration_) + , log(getLogger(fmt::format("Storage{}({})", configuration->getEngineName(), table_id_.getFullTableName()))) { FormatFactory::instance().checkFormatName(configuration->format); configuration->check(context); @@ -98,46 +54,41 @@ StorageObjectStorage::StorageObjectStorage( for (const auto & key : configuration->getPaths()) objects.emplace_back(key); - setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(getInMemoryMetadataPtr()->getColumns())); + auto metadata = getStorageMetadata( + object_storage_, configuration_, columns_, + constraints_, format_settings_, comment, context); + + 
setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(metadata.getColumns())); + setInMemoryMetadata(std::move(metadata)); } -template -bool StorageObjectStorage::prefersLargeBlocks() const +String StorageObjectStorage::getName() const +{ + return configuration->getEngineName(); +} + +bool StorageObjectStorage::prefersLargeBlocks() const { return FormatFactory::instance().checkIfOutputFormatPrefersLargeBlocks(configuration->format); } -template -bool StorageObjectStorage::parallelizeOutputAfterReading(ContextPtr context) const +bool StorageObjectStorage::parallelizeOutputAfterReading(ContextPtr context) const { return FormatFactory::instance().checkParallelizeOutputAfterReading(configuration->format, context); } -template -bool StorageObjectStorage::supportsSubsetOfColumns(const ContextPtr & context) const +bool StorageObjectStorage::supportsSubsetOfColumns(const ContextPtr & context) const { return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(configuration->format, context, format_settings); } -template -void StorageObjectStorage::updateConfiguration(ContextPtr context) +void StorageObjectStorage::updateConfiguration(ContextPtr context) { if (!configuration->isStaticConfiguration()) object_storage->applyNewSettings(context->getConfigRef(), "s3.", context); } -template -SchemaCache & StorageObjectStorage::getSchemaCache(const ContextPtr & context) -{ - static SchemaCache schema_cache( - context->getConfigRef().getUInt( - StorageSettings::SCHEMA_CACHE_MAX_ELEMENTS_CONFIG_SETTING, - DEFAULT_SCHEMA_CACHE_ELEMENTS)); - return schema_cache; -} - -template -void StorageObjectStorage::read( +void StorageObjectStorage::read( QueryPlan & query_plan, const Names & column_names, const StorageSnapshotPtr & storage_snapshot, @@ -155,13 +106,12 @@ void StorageObjectStorage::read( getName()); } - const auto read_from_format_info = prepareReadingFromFormat(column_names, storage_snapshot, supportsSubsetOfColumns(local_context)); + const auto read_from_format_info = prepareReadingFromFormat( + column_names, storage_snapshot, supportsSubsetOfColumns(local_context)); const bool need_only_count = (query_info.optimize_trivial_count || read_from_format_info.requested_columns.empty()) && local_context->getSettingsRef().optimize_count_from_files; - LOG_TEST(&Poco::Logger::get("KSSENII"), "KSSENII SOURCE HEADER: {}", read_from_format_info.source_header.dumpStructure()); - LOG_TEST(&Poco::Logger::get("KSSENII"), "KSSENII FORMAT HEADER: {}", read_from_format_info.format_header.dumpStructure()); - auto read_step = std::make_unique( + auto read_step = std::make_unique( object_storage, configuration, getName(), @@ -170,23 +120,18 @@ void StorageObjectStorage::read( query_info, storage_snapshot, format_settings, - StorageSettings::create(local_context->getSettingsRef()), distributed_processing, std::move(read_from_format_info), getSchemaCache(local_context), need_only_count, local_context, max_block_size, - num_streams, - StorageSettings::ObjectStorageThreads(), - StorageSettings::ObjectStorageThreadsActive(), - StorageSettings::ObjectStorageThreadsScheduled()); + num_streams); query_plan.addStep(std::move(read_step)); } -template -SinkToStoragePtr StorageObjectStorage::write( +SinkToStoragePtr StorageObjectStorage::write( const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context, @@ -194,7 +139,7 @@ SinkToStoragePtr StorageObjectStorage::write( { updateConfiguration(local_context); const auto sample_block = metadata_snapshot->getSampleBlock(); - const auto & 
query_settings = StorageSettings::create(local_context->getSettingsRef()); + const auto & settings = configuration->getQuerySettings(local_context); if (configuration->withWildcard()) { @@ -209,23 +154,22 @@ SinkToStoragePtr StorageObjectStorage::write( if (partition_by_ast) { - LOG_TEST(log, "Using PartitionedSink for {}", configuration->getPath()); return std::make_shared( - object_storage, configuration, query_settings, - format_settings, sample_block, local_context, partition_by_ast); + object_storage, configuration, format_settings, sample_block, local_context, partition_by_ast); } } if (configuration->withGlobs()) { - throw Exception(ErrorCodes::DATABASE_ACCESS_DENIED, - "{} key '{}' contains globs, so the table is in readonly mode", - getName(), configuration->getPath()); + throw Exception( + ErrorCodes::DATABASE_ACCESS_DENIED, + "{} key '{}' contains globs, so the table is in readonly mode", + getName(), configuration->getPath()); } auto & paths = configuration->getPaths(); if (auto new_key = checkAndGetNewFileOnInsertIfNeeded( - *object_storage, *configuration, query_settings, paths.front(), paths.size())) + *object_storage, *configuration, settings, paths.front(), paths.size())) { paths.push_back(*new_key); } @@ -238,9 +182,11 @@ SinkToStoragePtr StorageObjectStorage::write( local_context); } -template -void StorageObjectStorage::truncate( - const ASTPtr &, const StorageMetadataPtr &, ContextPtr, TableExclusiveLockHolder &) +void StorageObjectStorage::truncate( + const ASTPtr &, + const StorageMetadataPtr &, + ContextPtr, + TableExclusiveLockHolder &) { if (configuration->withGlobs()) { @@ -257,34 +203,37 @@ void StorageObjectStorage::truncate( object_storage->removeObjectsIfExist(objects); } -template -std::unique_ptr StorageObjectStorage::createReadBufferIterator( +std::unique_ptr StorageObjectStorage::createReadBufferIterator( const ObjectStoragePtr & object_storage, const ConfigurationPtr & configuration, const std::optional & format_settings, ObjectInfos & read_keys, const ContextPtr & context) { - const auto settings = StorageSettings::create(context->getSettingsRef()); auto file_iterator = StorageObjectStorageSource::createFileIterator( - configuration, object_storage, settings, /* distributed_processing */false, - context, /* predicate */{}, /* virtual_columns */{}, &read_keys, - StorageSettings::ObjectStorageThreads(), StorageSettings::ObjectStorageThreadsActive(), StorageSettings::ObjectStorageThreadsScheduled()); + configuration, + object_storage, + false/* distributed_processing */, + context, + {}/* predicate */, + {}/* virtual_columns */, + &read_keys); return std::make_unique( object_storage, configuration, file_iterator, - format_settings, StorageSettings::create(context->getSettingsRef()), getSchemaCache(context), read_keys, context); + format_settings, getSchemaCache(context, configuration->getTypeName()), read_keys, context); } -template -ColumnsDescription StorageObjectStorage::getTableStructureFromData( +ColumnsDescription StorageObjectStorage::getTableStructureFromData( const ObjectStoragePtr & object_storage, const ConfigurationPtr & configuration, const std::optional & format_settings, const ContextPtr & context) { ObjectInfos read_keys; - auto read_buffer_iterator = createReadBufferIterator(object_storage, configuration, format_settings, read_keys, context); + auto read_buffer_iterator = createReadBufferIterator( + object_storage, configuration, format_settings, read_keys, context); + if (configuration->format == "auto") { auto [columns, format] = 
detectFormatAndReadSchema(format_settings, *read_buffer_iterator, context); @@ -297,20 +246,34 @@ ColumnsDescription StorageObjectStorage::getTableStructureFromD } } -template -void StorageObjectStorage::setFormatFromData( +void StorageObjectStorage::setFormatFromData( const ObjectStoragePtr & object_storage, const ConfigurationPtr & configuration, const std::optional & format_settings, const ContextPtr & context) { ObjectInfos read_keys; - auto read_buffer_iterator = createReadBufferIterator(object_storage, configuration, format_settings, read_keys, context); + auto read_buffer_iterator = createReadBufferIterator( + object_storage, configuration, format_settings, read_keys, context); configuration->format = detectFormatAndReadSchema(format_settings, *read_buffer_iterator, context).second; } -template class StorageObjectStorage; -template class StorageObjectStorage; -template class StorageObjectStorage; +SchemaCache & StorageObjectStorage::getSchemaCache(const ContextPtr & context) +{ + static SchemaCache schema_cache( + context->getConfigRef().getUInt( + "schema_inference_cache_max_elements_for_" + configuration->getTypeName(), + DEFAULT_SCHEMA_CACHE_ELEMENTS)); + return schema_cache; +} + +SchemaCache & StorageObjectStorage::getSchemaCache(const ContextPtr & context, const std::string & storage_type_name) +{ + static SchemaCache schema_cache( + context->getConfigRef().getUInt( + "schema_inference_cache_max_elements_for_" + storage_type_name, + DEFAULT_SCHEMA_CACHE_ELEMENTS)); + return schema_cache; +} } diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.h b/src/Storages/ObjectStorage/StorageObjectStorage.h index a2112f7ed01..3dbe010e406 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.h +++ b/src/Storages/ObjectStorage/StorageObjectStorage.h @@ -1,31 +1,22 @@ #pragma once - -#include #include #include -#include #include #include #include - namespace DB { -struct SelectQueryInfo; class StorageObjectStorageConfiguration; -struct S3StorageSettings; -struct HDFSStorageSettings; -struct AzureStorageSettings; -class PullingPipelineExecutor; -using ReadTaskCallback = std::function; -class IOutputFormat; -class IInputFormat; -class SchemaCache; class ReadBufferIterator; +class SchemaCache; - -template +/** + * A general class containing implementation for external table engines + * such as StorageS3, StorageAzure, StorageHDFS. + * Works with an object of IObjectStorage class. 
+ */ class StorageObjectStorage : public IStorage { public: @@ -35,10 +26,26 @@ public: using ObjectInfoPtr = std::shared_ptr; using ObjectInfos = std::vector; + struct QuerySettings + { + /// Insert settings: + bool truncate_on_insert; + bool create_new_file_on_insert; + + /// Schema inference settings: + bool schema_inference_use_cache; + SchemaInferenceMode schema_inference_mode; + + /// List settings: + bool skip_empty_files; + size_t list_object_keys_size; + bool throw_on_zero_files_match; + bool ignore_non_existent_file; + }; + StorageObjectStorage( ConfigurationPtr configuration_, ObjectStoragePtr object_storage_, - const String & engine_name_, ContextPtr context_, const StorageID & table_id_, const ColumnsDescription & columns_, @@ -48,17 +55,17 @@ public: bool distributed_processing_ = false, ASTPtr partition_by_ = nullptr); - String getName() const override { return engine_name; } + String getName() const override; void read( QueryPlan & query_plan, - const Names &, - const StorageSnapshotPtr &, - SelectQueryInfo &, - ContextPtr, - QueryProcessingStage::Enum, - size_t, - size_t) override; + const Names & column_names, + const StorageSnapshotPtr & storage_snapshot, + SelectQueryInfo & query_info, + ContextPtr local_context, + QueryProcessingStage::Enum processed_stage, + size_t max_block_size, + size_t num_streams) override; SinkToStoragePtr write( const ASTPtr & query, @@ -84,7 +91,9 @@ public: bool parallelizeOutputAfterReading(ContextPtr context) const override; - static SchemaCache & getSchemaCache(const ContextPtr & context); + SchemaCache & getSchemaCache(const ContextPtr & context); + + static SchemaCache & getSchemaCache(const ContextPtr & context, const std::string & storage_type_name); static ColumnsDescription getTableStructureFromData( const ObjectStoragePtr & object_storage, @@ -108,19 +117,15 @@ protected: ObjectInfos & read_keys, const ContextPtr & context); + ConfigurationPtr configuration; + const ObjectStoragePtr object_storage; const std::string engine_name; - std::optional format_settings; + const std::optional format_settings; const ASTPtr partition_by; const bool distributed_processing; LoggerPtr log; - ObjectStoragePtr object_storage; - ConfigurationPtr configuration; std::mutex configuration_update_mutex; }; -using StorageS3 = StorageObjectStorage; -using StorageAzureBlob = StorageObjectStorage; -using StorageHDFS = StorageObjectStorage; - } diff --git a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp index f023bb068d4..72a35ae33eb 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp @@ -15,6 +15,7 @@ #include #include #include +#include namespace DB { @@ -24,47 +25,34 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -template -StorageObjectStorageCluster::StorageObjectStorageCluster( +StorageObjectStorageCluster::StorageObjectStorageCluster( const String & cluster_name_, - const Storage::ConfigurationPtr & configuration_, + ConfigurationPtr configuration_, ObjectStoragePtr object_storage_, - const String & engine_name_, const StorageID & table_id_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, ContextPtr context_) - : IStorageCluster(cluster_name_, - table_id_, - getLogger(fmt::format("{}({})", engine_name_, table_id_.table_name))) - , engine_name(engine_name_) + : IStorageCluster( + cluster_name_, table_id_, getLogger(fmt::format("{}({})", 
configuration_->getEngineName(), table_id_.table_name))) , configuration{configuration_} , object_storage(object_storage_) { configuration->check(context_); - StorageInMemoryMetadata storage_metadata; + auto metadata = getStorageMetadata( + object_storage, configuration, columns_, constraints_, + {}/* format_settings */, ""/* comment */, context_); - if (columns_.empty()) - { - ColumnsDescription columns = Storage::getTableStructureFromData(object_storage, configuration, /*format_settings=*/std::nullopt, context_); - storage_metadata.setColumns(columns); - } - else - { - if (configuration->format == "auto") - StorageS3::setFormatFromData(object_storage, configuration, /*format_settings=*/std::nullopt, context_); - - storage_metadata.setColumns(columns_); - } - - storage_metadata.setConstraints(constraints_); - setInMemoryMetadata(storage_metadata); - - setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns())); + setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(metadata.getColumns())); + setInMemoryMetadata(std::move(metadata)); } -template -void StorageObjectStorageCluster::updateQueryToSendIfNeeded( +std::string StorageObjectStorageCluster::getName() const +{ + return configuration->getEngineName(); +} + +void StorageObjectStorageCluster::updateQueryToSendIfNeeded( ASTPtr & query, const DB::StorageSnapshotPtr & storage_snapshot, const ContextPtr & context) @@ -72,24 +60,32 @@ void StorageObjectStorageCluster::up ASTExpressionList * expression_list = extractTableFunctionArgumentsFromSelectQuery(query); if (!expression_list) { - throw Exception(ErrorCodes::LOGICAL_ERROR, - "Expected SELECT query from table function {}, got '{}'", - engine_name, queryToString(query)); + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Expected SELECT query from table function {}, got '{}'", + configuration->getEngineName(), queryToString(query)); } - TableFunction::updateStructureAndFormatArgumentsIfNeeded( - expression_list->children, - storage_snapshot->metadata->getColumns().getAll().toNamesAndTypesDescription(), - configuration->format, - context); + ASTs & args = expression_list->children; + const auto & structure = storage_snapshot->metadata->getColumns().getAll().toNamesAndTypesDescription(); + if (args.empty()) + { + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Unexpected empty list of arguments for {}Cluster table function", + configuration->getEngineName()); + } + + ASTPtr cluster_name_arg = args.front(); + args.erase(args.begin()); + configuration->addStructureAndFormatToArgs(args, structure, configuration->format, context); + args.insert(args.begin(), cluster_name_arg); } -template -RemoteQueryExecutor::Extension -StorageObjectStorageCluster::getTaskIteratorExtension( +RemoteQueryExecutor::Extension StorageObjectStorageCluster::getTaskIteratorExtension( const ActionsDAG::Node * predicate, const ContextPtr & local_context) const { - const auto settings = StorageSettings::create(local_context->getSettingsRef()); + const auto settings = configuration->getQuerySettings(local_context); auto iterator = std::make_shared( object_storage, configuration, predicate, virtual_columns, local_context, nullptr, settings.list_object_keys_size, settings.throw_on_zero_files_match, @@ -106,17 +102,4 @@ StorageObjectStorageCluster::getTask return RemoteQueryExecutor::Extension{ .task_iterator = std::move(callback) }; } - -#if USE_AWS_S3 -template class StorageObjectStorageCluster; -#endif - -#if USE_AZURE_BLOB_STORAGE -template class StorageObjectStorageCluster; 
-#endif - -#if USE_HDFS -template class StorageObjectStorageCluster; -#endif - } diff --git a/src/Storages/ObjectStorage/StorageObjectStorageCluster.h b/src/Storages/ObjectStorage/StorageObjectStorageCluster.h index ac894e14f24..2db8f5c352e 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageCluster.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageCluster.h @@ -11,32 +11,25 @@ namespace DB { -class StorageS3Settings; -class StorageAzureBlobSettings; - class Context; -template class StorageObjectStorageCluster : public IStorageCluster { public: - using Storage = StorageObjectStorage; - using TableFunction = TableFunctionObjectStorageCluster; + using ConfigurationPtr = StorageObjectStorage::ConfigurationPtr; StorageObjectStorageCluster( const String & cluster_name_, - const Storage::ConfigurationPtr & configuration_, + ConfigurationPtr configuration_, ObjectStoragePtr object_storage_, - const String & engine_name_, const StorageID & table_id_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, ContextPtr context_); - std::string getName() const override { return engine_name; } + std::string getName() const override; - RemoteQueryExecutor::Extension - getTaskIteratorExtension( + RemoteQueryExecutor::Extension getTaskIteratorExtension( const ActionsDAG::Node * predicate, const ContextPtr & context) const override; @@ -53,20 +46,9 @@ private: const ContextPtr & context) override; const String engine_name; - const Storage::ConfigurationPtr configuration; + const StorageObjectStorage::ConfigurationPtr configuration; const ObjectStoragePtr object_storage; NamesAndTypesList virtual_columns; }; - -#if USE_AWS_S3 -using StorageS3Cluster = StorageObjectStorageCluster; -#endif -#if USE_AZURE_BLOB_STORAGE -using StorageAzureBlobCluster = StorageObjectStorageCluster; -#endif -#if USE_HDFS -using StorageHDFSCluster = StorageObjectStorageCluster; -#endif - } diff --git a/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.h b/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.h index 647575aaa90..34965174bf9 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.h @@ -1,6 +1,7 @@ #pragma once #include #include +#include "StorageObjectStorage.h" #include namespace fs = std::filesystem; @@ -27,6 +28,9 @@ public: ContextPtr local_context, bool with_table_structure); + virtual std::string getTypeName() const = 0; + virtual std::string getEngineName() const = 0; + virtual Path getPath() const = 0; virtual void setPath(const Path & path) = 0; @@ -36,6 +40,9 @@ public: virtual String getDataSourceDescription() = 0; virtual String getNamespace() const = 0; + virtual StorageObjectStorage::QuerySettings getQuerySettings(const ContextPtr &) const = 0; + virtual void addStructureAndFormatToArgs( + ASTs & args, const String & structure_, const String & format_, ContextPtr context) = 0; bool withWildcard() const; bool withGlobs() const { return isPathWithGlobs() || isNamespaceWithGlobs(); } diff --git a/src/Storages/ObjectStorage/StorageObjectStorageQuerySettings.h b/src/Storages/ObjectStorage/StorageObjectStorageQuerySettings.h deleted file mode 100644 index 606456011c3..00000000000 --- a/src/Storages/ObjectStorage/StorageObjectStorageQuerySettings.h +++ /dev/null @@ -1,102 +0,0 @@ -#pragma once -#include -#include -#include - -namespace CurrentMetrics -{ - extern const Metric ObjectStorageAzureThreads; - extern const Metric ObjectStorageAzureThreadsActive; - extern 
const Metric ObjectStorageAzureThreadsScheduled; - - extern const Metric ObjectStorageS3Threads; - extern const Metric ObjectStorageS3ThreadsActive; - extern const Metric ObjectStorageS3ThreadsScheduled; -} - -namespace DB -{ - -struct StorageObjectStorageSettings -{ - bool truncate_on_insert; - bool create_new_file_on_insert; - bool schema_inference_use_cache; - SchemaInferenceMode schema_inference_mode; - bool skip_empty_files; - size_t list_object_keys_size; - bool throw_on_zero_files_match; - bool ignore_non_existent_file; -}; - -struct S3StorageSettings -{ - static StorageObjectStorageSettings create(const Settings & settings) - { - return StorageObjectStorageSettings{ - .truncate_on_insert = settings.s3_truncate_on_insert, - .create_new_file_on_insert = settings.s3_create_new_file_on_insert, - .schema_inference_use_cache = settings.schema_inference_use_cache_for_s3, - .schema_inference_mode = settings.schema_inference_mode, - .skip_empty_files = settings.s3_skip_empty_files, - .list_object_keys_size = settings.s3_list_object_keys_size, - .throw_on_zero_files_match = settings.s3_throw_on_zero_files_match, - .ignore_non_existent_file = settings.s3_ignore_file_doesnt_exist, - }; - } - - static constexpr auto SCHEMA_CACHE_MAX_ELEMENTS_CONFIG_SETTING = "schema_inference_cache_max_elements_for_s3"; - - static CurrentMetrics::Metric ObjectStorageThreads() { return CurrentMetrics::ObjectStorageS3Threads; } /// NOLINT - static CurrentMetrics::Metric ObjectStorageThreadsActive() { return CurrentMetrics::ObjectStorageS3ThreadsActive; } /// NOLINT - static CurrentMetrics::Metric ObjectStorageThreadsScheduled() { return CurrentMetrics::ObjectStorageS3ThreadsScheduled; } /// NOLINT -}; - -struct AzureStorageSettings -{ - static StorageObjectStorageSettings create(const Settings & settings) - { - return StorageObjectStorageSettings{ - .truncate_on_insert = settings.azure_truncate_on_insert, - .create_new_file_on_insert = settings.azure_create_new_file_on_insert, - .schema_inference_use_cache = settings.schema_inference_use_cache_for_azure, - .schema_inference_mode = settings.schema_inference_mode, - .skip_empty_files = settings.s3_skip_empty_files, /// TODO: add setting for azure - .list_object_keys_size = settings.azure_list_object_keys_size, - .throw_on_zero_files_match = settings.s3_throw_on_zero_files_match, - .ignore_non_existent_file = settings.azure_ignore_file_doesnt_exist, - }; - } - - static constexpr auto SCHEMA_CACHE_MAX_ELEMENTS_CONFIG_SETTING = "schema_inference_cache_max_elements_for_azure"; - - static CurrentMetrics::Metric ObjectStorageThreads() { return CurrentMetrics::ObjectStorageAzureThreads; } /// NOLINT - static CurrentMetrics::Metric ObjectStorageThreadsActive() { return CurrentMetrics::ObjectStorageAzureThreadsActive; } /// NOLINT - static CurrentMetrics::Metric ObjectStorageThreadsScheduled() { return CurrentMetrics::ObjectStorageAzureThreadsScheduled; } /// NOLINT -}; - -struct HDFSStorageSettings -{ - static StorageObjectStorageSettings create(const Settings & settings) - { - return StorageObjectStorageSettings{ - .truncate_on_insert = settings.hdfs_truncate_on_insert, - .create_new_file_on_insert = settings.hdfs_create_new_file_on_insert, - .schema_inference_use_cache = settings.schema_inference_use_cache_for_hdfs, - .schema_inference_mode = settings.schema_inference_mode, - .skip_empty_files = settings.hdfs_skip_empty_files, /// TODO: add setting for hdfs - .list_object_keys_size = settings.s3_list_object_keys_size, /// TODO: add a setting for hdfs - 
.throw_on_zero_files_match = settings.s3_throw_on_zero_files_match, - .ignore_non_existent_file = settings.hdfs_ignore_file_doesnt_exist, - }; - } - - static constexpr auto SCHEMA_CACHE_MAX_ELEMENTS_CONFIG_SETTING = "schema_inference_cache_max_elements_for_hdfs"; - - /// TODO: s3 -> hdfs - static CurrentMetrics::Metric ObjectStorageThreads() { return CurrentMetrics::ObjectStorageS3Threads; } /// NOLINT - static CurrentMetrics::Metric ObjectStorageThreadsActive() { return CurrentMetrics::ObjectStorageS3ThreadsActive; } /// NOLINT - static CurrentMetrics::Metric ObjectStorageThreadsScheduled() { return CurrentMetrics::ObjectStorageS3ThreadsScheduled; } /// NOLINT -}; - -} diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp index 42371764920..62367a6b933 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp @@ -103,7 +103,6 @@ void StorageObjectStorageSink::release() PartitionedStorageObjectStorageSink::PartitionedStorageObjectStorageSink( ObjectStoragePtr object_storage_, StorageObjectStorageConfigurationPtr configuration_, - const StorageObjectStorageSettings & query_settings_, std::optional format_settings_, const Block & sample_block_, ContextPtr context_, @@ -111,7 +110,7 @@ PartitionedStorageObjectStorageSink::PartitionedStorageObjectStorageSink( : PartitionedSink(partition_by, context_, sample_block_) , object_storage(object_storage_) , configuration(configuration_) - , query_settings(query_settings_) + , query_settings(configuration_->getQuerySettings(context_)) , format_settings(format_settings_) , sample_block(sample_block_) , context(context_) diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSink.h b/src/Storages/ObjectStorage/StorageObjectStorageSink.h index 38805332a35..6c2f73e40e3 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSink.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageSink.h @@ -1,7 +1,6 @@ #pragma once #include #include -#include #include #include @@ -47,7 +46,6 @@ public: PartitionedStorageObjectStorageSink( ObjectStoragePtr object_storage_, StorageObjectStorageConfigurationPtr configuration_, - const StorageObjectStorageSettings & query_settings_, std::optional format_settings_, const Block & sample_block_, ContextPtr context_, @@ -61,7 +59,7 @@ private: ObjectStoragePtr object_storage; StorageObjectStorageConfigurationPtr configuration; - const StorageObjectStorageSettings query_settings; + const StorageObjectStorage::QuerySettings query_settings; const std::optional format_settings; const Block sample_block; const ContextPtr context; diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index 82824b0e7f7..3101a7ebf51 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -10,7 +10,6 @@ #include #include #include -#include #include #include @@ -20,6 +19,13 @@ namespace ProfileEvents extern const Event EngineFileLikeReadFiles; } +namespace CurrentMetrics +{ + extern const Metric StorageObjectStorageThreads; + extern const Metric StorageObjectStorageThreadsActive; + extern const Metric StorageObjectStorageThreadsScheduled; +} + namespace DB { @@ -37,16 +43,12 @@ StorageObjectStorageSource::StorageObjectStorageSource( ConfigurationPtr configuration_, const ReadFromFormatInfo & info, std::optional format_settings_, - const 
StorageObjectStorageSettings & query_settings_, + const StorageObjectStorage::QuerySettings & query_settings_, ContextPtr context_, UInt64 max_block_size_, std::shared_ptr file_iterator_, bool need_only_count_, - SchemaCache & schema_cache_, - std::shared_ptr reader_pool_, - CurrentMetrics::Metric metric_threads_, - CurrentMetrics::Metric metric_threads_active_, - CurrentMetrics::Metric metric_threads_scheduled_) + SchemaCache & schema_cache_) : SourceWithKeyCondition(info.source_header, false) , WithContext(context_) , name(std::move(name_)) @@ -57,13 +59,14 @@ StorageObjectStorageSource::StorageObjectStorageSource( , max_block_size(max_block_size_) , need_only_count(need_only_count_) , read_from_format_info(info) - , create_reader_pool(reader_pool_) + , create_reader_pool(std::make_shared( + CurrentMetrics::StorageObjectStorageThreads, + CurrentMetrics::StorageObjectStorageThreadsActive, + CurrentMetrics::StorageObjectStorageThreadsScheduled, + 1/* max_threads */)) , columns_desc(info.columns_description) , file_iterator(file_iterator_) , schema_cache(schema_cache_) - , metric_threads(metric_threads_) - , metric_threads_active(metric_threads_active_) - , metric_threads_scheduled(metric_threads_scheduled_) , create_reader_scheduler(threadPoolCallbackRunnerUnsafe(*create_reader_pool, "Reader")) { } @@ -76,26 +79,23 @@ StorageObjectStorageSource::~StorageObjectStorageSource() std::shared_ptr StorageObjectStorageSource::createFileIterator( ConfigurationPtr configuration, ObjectStoragePtr object_storage, - const StorageObjectStorageSettings & settings, bool distributed_processing, const ContextPtr & local_context, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, ObjectInfos * read_keys, - CurrentMetrics::Metric metric_threads_, - CurrentMetrics::Metric metric_threads_active_, - CurrentMetrics::Metric metric_threads_scheduled_, std::function file_progress_callback) { if (distributed_processing) return std::make_shared( local_context->getReadTaskCallback(), - local_context->getSettingsRef().max_threads, - metric_threads_, metric_threads_active_, metric_threads_scheduled_); + local_context->getSettingsRef().max_threads); if (configuration->isNamespaceWithGlobs()) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Expression can not have wildcards inside namespace name"); + auto settings = configuration->getQuerySettings(local_context); + if (configuration->isPathWithGlobs()) { /// Iterate through disclosed globs and make a source for each file @@ -568,7 +568,8 @@ StorageObjectStorageSource::ReaderHolder::ReaderHolder( { } -StorageObjectStorageSource::ReaderHolder & StorageObjectStorageSource::ReaderHolder::operator=(ReaderHolder && other) noexcept +StorageObjectStorageSource::ReaderHolder & +StorageObjectStorageSource::ReaderHolder::operator=(ReaderHolder && other) noexcept { /// The order of destruction is important. /// reader uses pipeline, pipeline uses read_buf. 
@@ -581,15 +582,15 @@ StorageObjectStorageSource::ReaderHolder & StorageObjectStorageSource::ReaderHol } StorageObjectStorageSource::ReadTaskIterator::ReadTaskIterator( - const ReadTaskCallback & callback_, - size_t max_threads_count, - CurrentMetrics::Metric metric_threads_, - CurrentMetrics::Metric metric_threads_active_, - CurrentMetrics::Metric metric_threads_scheduled_) + const ReadTaskCallback & callback_, size_t max_threads_count) : IIterator("ReadTaskIterator") , callback(callback_) { - ThreadPool pool(metric_threads_, metric_threads_active_, metric_threads_scheduled_, max_threads_count); + ThreadPool pool( + CurrentMetrics::StorageObjectStorageThreads, + CurrentMetrics::StorageObjectStorageThreadsActive, + CurrentMetrics::StorageObjectStorageThreadsScheduled, max_threads_count); + auto pool_scheduler = threadPoolCallbackRunnerUnsafe(pool, "ReadTaskIter"); std::vector> keys; diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.h b/src/Storages/ObjectStorage/StorageObjectStorageSource.h index f75bfc390bb..3c2cc3f80cd 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.h @@ -3,7 +3,6 @@ #include #include #include -#include #include #include @@ -28,16 +27,12 @@ public: ConfigurationPtr configuration, const ReadFromFormatInfo & info, std::optional format_settings_, - const StorageObjectStorageSettings & query_settings_, + const StorageObjectStorage::QuerySettings & query_settings_, ContextPtr context_, UInt64 max_block_size_, std::shared_ptr file_iterator_, bool need_only_count_, - SchemaCache & schema_cache_, - std::shared_ptr reader_pool_, - CurrentMetrics::Metric metric_threads_, - CurrentMetrics::Metric metric_threads_active_, - CurrentMetrics::Metric metric_threads_scheduled_); + SchemaCache & schema_cache_); ~StorageObjectStorageSource() override; @@ -53,15 +48,11 @@ public: static std::shared_ptr createFileIterator( ConfigurationPtr configuration, ObjectStoragePtr object_storage, - const StorageObjectStorageSettings & settings, bool distributed_processing, const ContextPtr & local_context, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, ObjectInfos * read_keys, - CurrentMetrics::Metric metric_threads_, - CurrentMetrics::Metric metric_threads_active_, - CurrentMetrics::Metric metric_threads_scheduled_, std::function file_progress_callback = {}); protected: @@ -69,7 +60,7 @@ protected: ObjectStoragePtr object_storage; const ConfigurationPtr configuration; const std::optional format_settings; - const StorageObjectStorageSettings query_settings; + const StorageObjectStorage::QuerySettings query_settings; const UInt64 max_block_size; const bool need_only_count; const ReadFromFormatInfo read_from_format_info; @@ -79,10 +70,6 @@ protected: SchemaCache & schema_cache; bool initialized = false; - const CurrentMetrics::Metric metric_threads; - const CurrentMetrics::Metric metric_threads_active; - const CurrentMetrics::Metric metric_threads_scheduled; - size_t total_rows_in_file = 0; LoggerPtr log = getLogger("StorageObjectStorageSource"); @@ -149,12 +136,7 @@ protected: class StorageObjectStorageSource::ReadTaskIterator : public IIterator { public: - ReadTaskIterator( - const ReadTaskCallback & callback_, - size_t max_threads_count, - CurrentMetrics::Metric metric_threads_, - CurrentMetrics::Metric metric_threads_active_, - CurrentMetrics::Metric metric_threads_scheduled_); + ReadTaskIterator(const ReadTaskCallback & callback_, size_t max_threads_count); size_t 
estimatedKeysCount() override { return buffer.size(); } diff --git a/src/Storages/ObjectStorage/Utils.cpp b/src/Storages/ObjectStorage/Utils.cpp index 9caab709081..94d6dadee3b 100644 --- a/src/Storages/ObjectStorage/Utils.cpp +++ b/src/Storages/ObjectStorage/Utils.cpp @@ -1,8 +1,6 @@ #include #include #include -#include - namespace DB { @@ -15,15 +13,15 @@ namespace ErrorCodes std::optional checkAndGetNewFileOnInsertIfNeeded( const IObjectStorage & object_storage, const StorageObjectStorageConfiguration & configuration, - const StorageObjectStorageSettings & query_settings, + const StorageObjectStorage::QuerySettings & settings, const String & key, size_t sequence_number) { - if (query_settings.truncate_on_insert + if (settings.truncate_on_insert || !object_storage.exists(StoredObject(key))) return std::nullopt; - if (query_settings.create_new_file_on_insert) + if (settings.create_new_file_on_insert) { auto pos = key.find_first_of('.'); String new_key; @@ -45,4 +43,38 @@ std::optional checkAndGetNewFileOnInsertIfNeeded( configuration.getNamespace(), key); } +StorageInMemoryMetadata getStorageMetadata( + ObjectStoragePtr object_storage, + const StorageObjectStorageConfigurationPtr & configuration, + const ColumnsDescription & columns, + const ConstraintsDescription & constraints, + std::optional format_settings, + const String & comment, + const ContextPtr & context) +{ + StorageInMemoryMetadata storage_metadata; + if (columns.empty()) + { + auto fetched_columns = StorageObjectStorage::getTableStructureFromData(object_storage, configuration, format_settings, context); + storage_metadata.setColumns(fetched_columns); + } + else if (!columns.hasOnlyOrdinary()) + { + /// We don't allow special columns. + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Special columns are not supported for {} storage" + "like MATERIALIZED, ALIAS or EPHEMERAL", configuration->getTypeName()); + } + else + { + if (configuration->format == "auto") + StorageObjectStorage::setFormatFromData(object_storage, configuration, format_settings, context); + + storage_metadata.setColumns(columns); + } + storage_metadata.setConstraints(constraints); + storage_metadata.setComment(comment); + return storage_metadata; +} + } diff --git a/src/Storages/ObjectStorage/Utils.h b/src/Storages/ObjectStorage/Utils.h index afc0f31a33f..37bd49a77c0 100644 --- a/src/Storages/ObjectStorage/Utils.h +++ b/src/Storages/ObjectStorage/Utils.h @@ -1,18 +1,30 @@ #pragma once #include +#include "StorageObjectStorage.h" namespace DB { class IObjectStorage; class StorageObjectStorageConfiguration; +using StorageObjectStorageConfigurationPtr = std::shared_ptr; struct StorageObjectStorageSettings; std::optional checkAndGetNewFileOnInsertIfNeeded( const IObjectStorage & object_storage, const StorageObjectStorageConfiguration & configuration, - const StorageObjectStorageSettings & query_settings, + const StorageObjectStorage::QuerySettings & settings, const std::string & key, size_t sequence_number); + +StorageInMemoryMetadata getStorageMetadata( + ObjectStoragePtr object_storage, + const StorageObjectStorageConfigurationPtr & configuration, + const ColumnsDescription & columns, + const ConstraintsDescription & constraints, + std::optional format_settings, + const String & comment, + const ContextPtr & context); + } diff --git a/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp b/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp index 3271b766f68..06b8aefb716 100644 --- a/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp +++ 
b/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp @@ -2,22 +2,23 @@ #include #include #include +#include #include #include namespace DB { +#if USE_AWS_S3 || USE_AZURE_BLOB_STORAGE || USE_HDFS + namespace ErrorCodes { extern const int BAD_ARGUMENTS; } -template -static std::shared_ptr> createStorageObjectStorage( +static std::shared_ptr createStorageObjectStorage( const StorageFactory::Arguments & args, - typename StorageObjectStorage::ConfigurationPtr configuration, - const String & engine_name, + typename StorageObjectStorage::ConfigurationPtr configuration, ContextPtr context) { auto & engine_args = args.engine_args; @@ -54,10 +55,9 @@ static std::shared_ptr> createStorageObjec if (args.storage_def->partition_by) partition_by = args.storage_def->partition_by->clone(); - return std::make_shared>( + return std::make_shared( configuration, configuration->createObjectStorage(context), - engine_name, args.getContext(), args.table_id, args.columns, @@ -68,6 +68,8 @@ static std::shared_ptr> createStorageObjec partition_by); } +#endif + #if USE_AZURE_BLOB_STORAGE void registerStorageAzure(StorageFactory & factory) { @@ -76,7 +78,7 @@ void registerStorageAzure(StorageFactory & factory) auto context = args.getLocalContext(); auto configuration = std::make_shared(); StorageObjectStorageConfiguration::initialize(*configuration, args.engine_args, context, false); - return createStorageObjectStorage(args, configuration, "Azure", context); + return createStorageObjectStorage(args, configuration, context); }, { .supports_settings = true, @@ -95,7 +97,7 @@ void registerStorageS3Impl(const String & name, StorageFactory & factory) auto context = args.getLocalContext(); auto configuration = std::make_shared(); StorageObjectStorageConfiguration::initialize(*configuration, args.engine_args, context, false); - return createStorageObjectStorage(args, configuration, name, context); + return createStorageObjectStorage(args, configuration, context); }, { .supports_settings = true, @@ -130,7 +132,7 @@ void registerStorageHDFS(StorageFactory & factory) auto context = args.getLocalContext(); auto configuration = std::make_shared(); StorageObjectStorageConfiguration::initialize(*configuration, args.engine_args, context, false); - return createStorageObjectStorage(args, configuration, "HDFS", context); + return createStorageObjectStorage(args, configuration, context); }, { .supports_settings = true, diff --git a/src/Storages/S3Queue/S3QueueSource.h b/src/Storages/S3Queue/S3QueueSource.h index c1b45108b36..5a1f0f6dd04 100644 --- a/src/Storages/S3Queue/S3QueueSource.h +++ b/src/Storages/S3Queue/S3QueueSource.h @@ -7,7 +7,6 @@ #include #include #include -#include #include @@ -21,7 +20,7 @@ struct ObjectMetadata; class StorageS3QueueSource : public ISource, WithContext { public: - using Storage = StorageObjectStorage; + using Storage = StorageObjectStorage; using ConfigurationPtr = Storage::ConfigurationPtr; using GlobIterator = StorageObjectStorageSource::GlobIterator; diff --git a/src/Storages/S3Queue/StorageS3Queue.cpp b/src/Storages/S3Queue/StorageS3Queue.cpp index c5799d23abd..6b504b0d986 100644 --- a/src/Storages/S3Queue/StorageS3Queue.cpp +++ b/src/Storages/S3Queue/StorageS3Queue.cpp @@ -37,13 +37,6 @@ namespace ProfileEvents extern const Event S3ListObjects; } -namespace CurrentMetrics -{ - extern const Metric ObjectStorageS3Threads; - extern const Metric ObjectStorageS3ThreadsActive; - extern const Metric ObjectStorageS3ThreadsScheduled; -} - namespace DB { @@ -151,14 +144,14 @@ 
StorageS3Queue::StorageS3Queue( StorageInMemoryMetadata storage_metadata; if (columns_.empty()) { - auto columns = Storage::getTableStructureFromData(object_storage, configuration, format_settings, context_); + auto columns = StorageObjectStorage::getTableStructureFromData(object_storage, configuration, format_settings, context_); storage_metadata.setColumns(columns); } else { if (configuration->format == "auto") { - StorageObjectStorage::setFormatFromData(object_storage, configuration, format_settings, context_); + StorageObjectStorage::setFormatFromData(object_storage, configuration, format_settings, context_); } storage_metadata.setColumns(columns_); } @@ -370,26 +363,18 @@ std::shared_ptr StorageS3Queue::createSource( size_t max_block_size, ContextPtr local_context) { - auto threadpool = std::make_shared(CurrentMetrics::ObjectStorageS3Threads, - CurrentMetrics::ObjectStorageS3ThreadsActive, - CurrentMetrics::ObjectStorageS3ThreadsScheduled, - /* max_threads */1); auto internal_source = std::make_unique( getName(), object_storage, configuration, info, format_settings, - S3StorageSettings::create(local_context->getSettingsRef()), + configuration->getQuerySettings(local_context), local_context, max_block_size, file_iterator, false, - Storage::getSchemaCache(local_context), - threadpool, - CurrentMetrics::ObjectStorageS3Threads, - CurrentMetrics::ObjectStorageS3ThreadsActive, - CurrentMetrics::ObjectStorageS3ThreadsScheduled); + StorageObjectStorage::getSchemaCache(local_context, configuration->getTypeName())); auto file_deleter = [=, this](const std::string & path) mutable { @@ -596,7 +581,7 @@ void StorageS3Queue::checkTableStructure(const String & zookeeper_prefix, const std::shared_ptr StorageS3Queue::createFileIterator(ContextPtr local_context, const ActionsDAG::Node * predicate) { - auto settings = S3StorageSettings::create(local_context->getSettingsRef()); + auto settings = configuration->getQuerySettings(local_context); auto glob_iterator = std::make_unique( object_storage, configuration, predicate, getVirtualsList(), local_context, nullptr, settings.list_object_keys_size, settings.throw_on_zero_files_match); diff --git a/src/Storages/S3Queue/StorageS3Queue.h b/src/Storages/S3Queue/StorageS3Queue.h index 72c41a6a694..1464e15ebf2 100644 --- a/src/Storages/S3Queue/StorageS3Queue.h +++ b/src/Storages/S3Queue/StorageS3Queue.h @@ -21,8 +21,7 @@ class S3QueueFilesMetadata; class StorageS3Queue : public IStorage, WithContext { public: - using Storage = StorageObjectStorage; - using ConfigurationPtr = Storage::ConfigurationPtr; + using ConfigurationPtr = StorageObjectStorage::ConfigurationPtr; StorageS3Queue( std::unique_ptr s3queue_settings_, diff --git a/src/Storages/System/StorageSystemSchemaInferenceCache.cpp b/src/Storages/System/StorageSystemSchemaInferenceCache.cpp index 9ef64f2b90d..a2d3f342a63 100644 --- a/src/Storages/System/StorageSystemSchemaInferenceCache.cpp +++ b/src/Storages/System/StorageSystemSchemaInferenceCache.cpp @@ -9,6 +9,9 @@ #include #include #include +#include +#include +#include namespace DB { @@ -74,14 +77,14 @@ void StorageSystemSchemaInferenceCache::fillData(MutableColumns & res_columns, C { fillDataImpl(res_columns, StorageFile::getSchemaCache(context), "File"); #if USE_AWS_S3 - fillDataImpl(res_columns, StorageS3::getSchemaCache(context), "S3"); + fillDataImpl(res_columns, StorageObjectStorage::getSchemaCache(context, StorageS3Configuration::type_name), "S3"); #endif #if USE_HDFS - fillDataImpl(res_columns, StorageHDFS::getSchemaCache(context), "HDFS"); + 
fillDataImpl(res_columns, StorageObjectStorage::getSchemaCache(context, StorageHDFSConfiguration::type_name), "HDFS"); #endif fillDataImpl(res_columns, StorageURL::getSchemaCache(context), "URL"); #if USE_AZURE_BLOB_STORAGE - fillDataImpl(res_columns, StorageAzureBlob::getSchemaCache(context), "Azure"); /// FIXME + fillDataImpl(res_columns, StorageObjectStorage::getSchemaCache(context, StorageAzureBlobConfiguration::type_name), "Azure"); #endif } diff --git a/src/TableFunctions/ITableFunctionDataLake.h b/src/TableFunctions/ITableFunctionDataLake.h index 8cbd855bb96..02c8c623e61 100644 --- a/src/TableFunctions/ITableFunctionDataLake.h +++ b/src/TableFunctions/ITableFunctionDataLake.h @@ -39,7 +39,7 @@ protected: columns = cached_columns; StoragePtr storage = Storage::create( - configuration, context, "", StorageID(TableFunction::getDatabaseName(), table_name), + configuration, context, StorageID(TableFunction::getDatabaseName(), table_name), columns, ConstraintsDescription{}, String{}, std::nullopt, LoadingStrictnessLevel::CREATE); storage->startup(); diff --git a/src/TableFunctions/TableFunctionObjectStorage.cpp b/src/TableFunctions/TableFunctionObjectStorage.cpp index 9223642a7e6..2b5c774ff78 100644 --- a/src/TableFunctions/TableFunctionObjectStorage.cpp +++ b/src/TableFunctions/TableFunctionObjectStorage.cpp @@ -27,27 +27,27 @@ namespace ErrorCodes extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; } -template +template ObjectStoragePtr TableFunctionObjectStorage< - Definition, StorageSettings, Configuration>::getObjectStorage(const ContextPtr & context, bool create_readonly) const + Definition, Configuration>::getObjectStorage(const ContextPtr & context, bool create_readonly) const { if (!object_storage) object_storage = configuration->createObjectStorage(context, create_readonly); return object_storage; } -template +template StorageObjectStorageConfigurationPtr TableFunctionObjectStorage< - Definition, StorageSettings, Configuration>::getConfiguration() const + Definition, Configuration>::getConfiguration() const { if (!configuration) configuration = std::make_shared(); return configuration; } -template +template std::vector TableFunctionObjectStorage< - Definition, StorageSettings, Configuration>::skipAnalysisForArguments(const QueryTreeNodePtr & query_node_table_function, ContextPtr) const + Definition, Configuration>::skipAnalysisForArguments(const QueryTreeNodePtr & query_node_table_function, ContextPtr) const { auto & table_function_node = query_node_table_function->as(); auto & table_function_arguments_nodes = table_function_node.getArguments().getNodes(); @@ -63,22 +63,21 @@ std::vector TableFunctionObjectStorage< return result; } -template -void TableFunctionObjectStorage::updateStructureAndFormatArgumentsIfNeeded( +template +void TableFunctionObjectStorage::updateStructureAndFormatArgumentsIfNeeded( ASTs & args, const String & structure, const String & format, const ContextPtr & context) { - Configuration::addStructureAndFormatToArgs(args, structure, format, context); + Configuration().addStructureAndFormatToArgs(args, structure, format, context); } -template -void TableFunctionObjectStorage< - Definition, StorageSettings, Configuration>::parseArgumentsImpl(ASTs & engine_args, const ContextPtr & local_context) +template +void TableFunctionObjectStorage::parseArgumentsImpl(ASTs & engine_args, const ContextPtr & local_context) { StorageObjectStorageConfiguration::initialize(*getConfiguration(), engine_args, local_context, true); } -template -void 
TableFunctionObjectStorage::parseArguments(const ASTPtr & ast_function, ContextPtr context) +template +void TableFunctionObjectStorage::parseArguments(const ASTPtr & ast_function, ContextPtr context) { /// Clone ast function, because we can modify its arguments like removing headers. auto ast_copy = ast_function->clone(); @@ -90,38 +89,38 @@ void TableFunctionObjectStorage::par parseArgumentsImpl(args, context); } -template +template ColumnsDescription TableFunctionObjectStorage< - Definition, StorageSettings, Configuration>::getActualTableStructure(ContextPtr context, bool is_insert_query) const + Definition, Configuration>::getActualTableStructure(ContextPtr context, bool is_insert_query) const { chassert(configuration); if (configuration->structure == "auto") { context->checkAccess(getSourceAccessType()); auto storage = getObjectStorage(context, !is_insert_query); - return StorageObjectStorage::getTableStructureFromData(storage, configuration, std::nullopt, context); + return StorageObjectStorage::getTableStructureFromData(storage, configuration, std::nullopt, context); } return parseColumnsListFromString(configuration->structure, context); } -template +template bool TableFunctionObjectStorage< - Definition, StorageSettings, Configuration>::supportsReadingSubsetOfColumns(const ContextPtr & context) + Definition, Configuration>::supportsReadingSubsetOfColumns(const ContextPtr & context) { chassert(configuration); return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(configuration->format, context); } -template +template std::unordered_set TableFunctionObjectStorage< - Definition, StorageSettings, Configuration>::getVirtualsToCheckBeforeUsingStructureHint() const + Definition, Configuration>::getVirtualsToCheckBeforeUsingStructureHint() const { return VirtualColumnUtils::getVirtualNamesForFileLikeStorage(); } -template -StoragePtr TableFunctionObjectStorage::executeImpl( +template +StoragePtr TableFunctionObjectStorage::executeImpl( const ASTPtr & /* ast_function */, ContextPtr context, const std::string & table_name, @@ -137,10 +136,9 @@ StoragePtr TableFunctionObjectStorage>( + StoragePtr storage = std::make_shared( configuration, getObjectStorage(context, !is_insert_query), - Definition::storage_type_name, context, StorageID(getDatabaseName(), table_name), columns, @@ -159,7 +157,7 @@ void registerTableFunctionObjectStorage(TableFunctionFactory & factory) { UNUSED(factory); #if USE_AWS_S3 - factory.registerFunction>( + factory.registerFunction>( { .documentation = { @@ -170,7 +168,7 @@ void registerTableFunctionObjectStorage(TableFunctionFactory & factory) .allow_readonly = false }); - factory.registerFunction>( + factory.registerFunction>( { .documentation = { @@ -181,7 +179,7 @@ void registerTableFunctionObjectStorage(TableFunctionFactory & factory) .allow_readonly = false }); - factory.registerFunction>( + factory.registerFunction>( { .documentation = { @@ -191,7 +189,7 @@ void registerTableFunctionObjectStorage(TableFunctionFactory & factory) .categories{"DataLake"}}, .allow_readonly = false }); - factory.registerFunction>( + factory.registerFunction>( { .documentation = { @@ -204,7 +202,7 @@ void registerTableFunctionObjectStorage(TableFunctionFactory & factory) #endif #if USE_AZURE_BLOB_STORAGE - factory.registerFunction>( + factory.registerFunction>( { .documentation = { @@ -220,7 +218,7 @@ void registerTableFunctionObjectStorage(TableFunctionFactory & factory) }); #endif #if USE_HDFS - factory.registerFunction>( + factory.registerFunction>( { .allow_readonly = 
false }); @@ -228,21 +226,21 @@ void registerTableFunctionObjectStorage(TableFunctionFactory & factory) } #if USE_AZURE_BLOB_STORAGE -template class TableFunctionObjectStorage; -template class TableFunctionObjectStorage; +template class TableFunctionObjectStorage; +template class TableFunctionObjectStorage; #endif #if USE_AWS_S3 -template class TableFunctionObjectStorage; -template class TableFunctionObjectStorage; -template class TableFunctionObjectStorage; -template class TableFunctionObjectStorage; -template class TableFunctionObjectStorage; +template class TableFunctionObjectStorage; +template class TableFunctionObjectStorage; +template class TableFunctionObjectStorage; +template class TableFunctionObjectStorage; +template class TableFunctionObjectStorage; #endif #if USE_HDFS -template class TableFunctionObjectStorage; -template class TableFunctionObjectStorage; +template class TableFunctionObjectStorage; +template class TableFunctionObjectStorage; #endif } diff --git a/src/TableFunctions/TableFunctionObjectStorage.h b/src/TableFunctions/TableFunctionObjectStorage.h index 9022f6e577f..bd43cae3697 100644 --- a/src/TableFunctions/TableFunctionObjectStorage.h +++ b/src/TableFunctions/TableFunctionObjectStorage.h @@ -85,7 +85,7 @@ struct HDFSDefinition " - uri, format, structure, compression_method\n"; }; -template +template class TableFunctionObjectStorage : public ITableFunction { public: @@ -142,14 +142,14 @@ protected: }; #if USE_AWS_S3 -using TableFunctionS3 = TableFunctionObjectStorage; +using TableFunctionS3 = TableFunctionObjectStorage; #endif #if USE_AZURE_BLOB_STORAGE -using TableFunctionAzureBlob = TableFunctionObjectStorage; +using TableFunctionAzureBlob = TableFunctionObjectStorage; #endif #if USE_HDFS -using TableFunctionHDFS = TableFunctionObjectStorage; +using TableFunctionHDFS = TableFunctionObjectStorage; #endif } diff --git a/src/TableFunctions/TableFunctionObjectStorageCluster.cpp b/src/TableFunctions/TableFunctionObjectStorageCluster.cpp index 909ace788eb..ce78076dd21 100644 --- a/src/TableFunctions/TableFunctionObjectStorageCluster.cpp +++ b/src/TableFunctions/TableFunctionObjectStorageCluster.cpp @@ -14,8 +14,8 @@ namespace DB { -template -StoragePtr TableFunctionObjectStorageCluster::executeImpl( +template +StoragePtr TableFunctionObjectStorageCluster::executeImpl( const ASTPtr & /*function*/, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns, bool is_insert_query) const { @@ -34,10 +34,9 @@ StoragePtr TableFunctionObjectStorageClustergetClientInfo().query_kind == ClientInfo::QueryKind::SECONDARY_QUERY) { /// On worker node this filename won't contains globs - storage = std::make_shared>( + storage = std::make_shared( configuration, object_storage, - Definition::storage_type_name, context, StorageID(Base::getDatabaseName(), table_name), columns, @@ -49,11 +48,10 @@ StoragePtr TableFunctionObjectStorageCluster>( + storage = std::make_shared( ITableFunctionCluster::cluster_name, configuration, object_storage, - Definition::storage_type_name, StorageID(Base::getDatabaseName(), table_name), columns, ConstraintsDescription{}, @@ -107,14 +105,14 @@ void registerTableFunctionObjectStorageCluster(TableFunctionFactory & factory) } #if USE_AWS_S3 -template class TableFunctionObjectStorageCluster; +template class TableFunctionObjectStorageCluster; #endif #if USE_AZURE_BLOB_STORAGE -template class TableFunctionObjectStorageCluster; +template class TableFunctionObjectStorageCluster; #endif #if USE_HDFS -template class 
TableFunctionObjectStorageCluster; +template class TableFunctionObjectStorageCluster; #endif } diff --git a/src/TableFunctions/TableFunctionObjectStorageCluster.h b/src/TableFunctions/TableFunctionObjectStorageCluster.h index 21c2f8995dc..a8bc11b5e40 100644 --- a/src/TableFunctions/TableFunctionObjectStorageCluster.h +++ b/src/TableFunctions/TableFunctionObjectStorageCluster.h @@ -56,8 +56,8 @@ struct HDFSClusterDefinition " - cluster_name, uri, format, structure, compression_method\n"; }; -template -class TableFunctionObjectStorageCluster : public ITableFunctionCluster> +template +class TableFunctionObjectStorageCluster : public ITableFunctionCluster> { public: static constexpr auto name = Definition::name; @@ -67,7 +67,7 @@ public: String getSignature() const override { return signature; } protected: - using Base = TableFunctionObjectStorage; + using Base = TableFunctionObjectStorage; StoragePtr executeImpl( const ASTPtr & ast_function, @@ -86,14 +86,14 @@ protected: }; #if USE_AWS_S3 -using TableFunctionS3Cluster = TableFunctionObjectStorageCluster; +using TableFunctionS3Cluster = TableFunctionObjectStorageCluster; #endif #if USE_AZURE_BLOB_STORAGE -using TableFunctionAzureBlobCluster = TableFunctionObjectStorageCluster; +using TableFunctionAzureBlobCluster = TableFunctionObjectStorageCluster; #endif #if USE_HDFS -using TableFunctionHDFSCluster = TableFunctionObjectStorageCluster; +using TableFunctionHDFSCluster = TableFunctionObjectStorageCluster; #endif } From 9eb9a76592dada103c40baa2c4acf5a3918b8e95 Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 22 Apr 2024 14:18:46 +0100 Subject: [PATCH 079/392] Fix --- .../ObjectStorage/AzureBlob/Configuration.cpp | 1 + .../DataLakes/IStorageDataLake.h | 2 +- .../ObjectStorage/HDFS/Configuration.cpp | 1 + .../ObjectStorage/S3/Configuration.cpp | 1 + .../ObjectStorage/StorageObjectStorage.cpp | 47 +++++++++++-------- .../ObjectStorage/StorageObjectStorage.h | 10 +++- .../StorageObjectStorageCluster.cpp | 9 ++-- .../StorageObjectStorageConfiguration.cpp | 5 ++ .../StorageObjectStorageConfiguration.h | 2 +- src/Storages/ObjectStorage/Utils.cpp | 33 ++++++------- src/Storages/ObjectStorage/Utils.h | 10 ++-- src/Storages/S3Queue/StorageS3Queue.cpp | 21 +++------ .../TableFunctionObjectStorage.cpp | 5 +- 13 files changed, 80 insertions(+), 67 deletions(-) diff --git a/src/Storages/ObjectStorage/AzureBlob/Configuration.cpp b/src/Storages/ObjectStorage/AzureBlob/Configuration.cpp index c9bc59d62aa..f268b812c03 100644 --- a/src/Storages/ObjectStorage/AzureBlob/Configuration.cpp +++ b/src/Storages/ObjectStorage/AzureBlob/Configuration.cpp @@ -77,6 +77,7 @@ void StorageAzureBlobConfiguration::check(ContextPtr context) const url_to_check = Poco::URI(connection_url); context->getGlobalContext()->getRemoteHostFilter().checkURL(url_to_check); + StorageObjectStorageConfiguration::check(context); } StorageAzureBlobConfiguration::StorageAzureBlobConfiguration(const StorageAzureBlobConfiguration & other) diff --git a/src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h b/src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h index e1851775925..144cc16939c 100644 --- a/src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h +++ b/src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h @@ -89,7 +89,7 @@ public: { ConfigurationPtr configuration = base_configuration->clone(); configuration->getPaths() = metadata->getDataFiles(); - return Storage::getTableStructureFromData( + return Storage::resolveSchemaFromData( object_storage_, configuration, format_settings_, 
local_context); } } diff --git a/src/Storages/ObjectStorage/HDFS/Configuration.cpp b/src/Storages/ObjectStorage/HDFS/Configuration.cpp index 0062ac969ac..12e3f3adb12 100644 --- a/src/Storages/ObjectStorage/HDFS/Configuration.cpp +++ b/src/Storages/ObjectStorage/HDFS/Configuration.cpp @@ -34,6 +34,7 @@ void StorageHDFSConfiguration::check(ContextPtr context) const { context->getRemoteHostFilter().checkURL(Poco::URI(url)); checkHDFSURL(fs::path(url) / path.substr(1)); + StorageObjectStorageConfiguration::check(context); } ObjectStoragePtr StorageHDFSConfiguration::createObjectStorage( /// NOLINT diff --git a/src/Storages/ObjectStorage/S3/Configuration.cpp b/src/Storages/ObjectStorage/S3/Configuration.cpp index 139d9004f8e..bfd61c647f8 100644 --- a/src/Storages/ObjectStorage/S3/Configuration.cpp +++ b/src/Storages/ObjectStorage/S3/Configuration.cpp @@ -54,6 +54,7 @@ void StorageS3Configuration::check(ContextPtr context) const validateNamespace(url.bucket); context->getGlobalContext()->getRemoteHostFilter().checkURL(url.uri); context->getGlobalContext()->getHTTPHeaderFilter().checkHeaders(headers_from_ast); + StorageObjectStorageConfiguration::check(context); } void StorageS3Configuration::validateNamespace(const String & name) const diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index 441639629a3..36a8beba41a 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -47,17 +47,19 @@ StorageObjectStorage::StorageObjectStorage( , distributed_processing(distributed_processing_) , log(getLogger(fmt::format("Storage{}({})", configuration->getEngineName(), table_id_.getFullTableName()))) { - FormatFactory::instance().checkFormatName(configuration->format); + ColumnsDescription columns{columns_}; + resolveSchemaAndFormat(columns, configuration->format, object_storage, configuration, format_settings, context); configuration->check(context); + StorageInMemoryMetadata metadata; + metadata.setColumns(columns); + metadata.setConstraints(constraints_); + metadata.setComment(comment); + StoredObjects objects; for (const auto & key : configuration->getPaths()) objects.emplace_back(key); - auto metadata = getStorageMetadata( - object_storage_, configuration_, columns_, - constraints_, format_settings_, comment, context); - setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(metadata.getColumns())); setInMemoryMetadata(std::move(metadata)); } @@ -224,7 +226,7 @@ std::unique_ptr StorageObjectStorage::createReadBufferIterat format_settings, getSchemaCache(context, configuration->getTypeName()), read_keys, context); } -ColumnsDescription StorageObjectStorage::getTableStructureFromData( +ColumnsDescription StorageObjectStorage::resolveSchemaFromData( const ObjectStoragePtr & object_storage, const ConfigurationPtr & configuration, const std::optional & format_settings, @@ -233,20 +235,11 @@ ColumnsDescription StorageObjectStorage::getTableStructureFromData( ObjectInfos read_keys; auto read_buffer_iterator = createReadBufferIterator( object_storage, configuration, format_settings, read_keys, context); - - if (configuration->format == "auto") - { - auto [columns, format] = detectFormatAndReadSchema(format_settings, *read_buffer_iterator, context); - configuration->format = format; - return columns; - } - else - { - return readSchemaFromFormat(configuration->format, format_settings, *read_buffer_iterator, context); - } + return readSchemaFromFormat( + 
configuration->format, format_settings, *read_buffer_iterator, context); } -void StorageObjectStorage::setFormatFromData( +std::string StorageObjectStorage::resolveFormatFromData( const ObjectStoragePtr & object_storage, const ConfigurationPtr & configuration, const std::optional & format_settings, @@ -255,7 +248,23 @@ void StorageObjectStorage::setFormatFromData( ObjectInfos read_keys; auto read_buffer_iterator = createReadBufferIterator( object_storage, configuration, format_settings, read_keys, context); - configuration->format = detectFormatAndReadSchema(format_settings, *read_buffer_iterator, context).second; + return detectFormatAndReadSchema( + format_settings, *read_buffer_iterator, context).second; +} + +std::pair StorageObjectStorage::resolveSchemaAndFormatFromData( + const ObjectStoragePtr & object_storage, + const ConfigurationPtr & configuration, + const std::optional & format_settings, + const ContextPtr & context) +{ + ObjectInfos read_keys; + auto read_buffer_iterator = createReadBufferIterator( + object_storage, configuration, format_settings, read_keys, context); + + auto [columns, format] = detectFormatAndReadSchema(format_settings, *read_buffer_iterator, context); + configuration->format = format; + return std::pair(columns, format); } SchemaCache & StorageObjectStorage::getSchemaCache(const ContextPtr & context) diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.h b/src/Storages/ObjectStorage/StorageObjectStorage.h index 3dbe010e406..d46a875bf42 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.h +++ b/src/Storages/ObjectStorage/StorageObjectStorage.h @@ -95,13 +95,19 @@ public: static SchemaCache & getSchemaCache(const ContextPtr & context, const std::string & storage_type_name); - static ColumnsDescription getTableStructureFromData( + static ColumnsDescription resolveSchemaFromData( const ObjectStoragePtr & object_storage, const ConfigurationPtr & configuration, const std::optional & format_settings, const ContextPtr & context); - static void setFormatFromData( + static std::string resolveFormatFromData( + const ObjectStoragePtr & object_storage, + const ConfigurationPtr & configuration, + const std::optional & format_settings, + const ContextPtr & context); + + static std::pair resolveSchemaAndFormatFromData( const ObjectStoragePtr & object_storage, const ConfigurationPtr & configuration, const std::optional & format_settings, diff --git a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp index 72a35ae33eb..f98fc32a3cc 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp @@ -38,10 +38,13 @@ StorageObjectStorageCluster::StorageObjectStorageCluster( , configuration{configuration_} , object_storage(object_storage_) { + ColumnsDescription columns{columns_}; + resolveSchemaAndFormat(columns, configuration->format, object_storage, configuration, {}, context_); configuration->check(context_); - auto metadata = getStorageMetadata( - object_storage, configuration, columns_, constraints_, - {}/* format_settings */, ""/* comment */, context_); + + StorageInMemoryMetadata metadata; + metadata.setColumns(columns); + metadata.setConstraints(constraints_); setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(metadata.getColumns())); setInMemoryMetadata(std::move(metadata)); diff --git a/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.cpp 
b/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.cpp index 61e569cee05..3635269db34 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.cpp @@ -30,6 +30,11 @@ void StorageObjectStorageConfiguration::initialize( configuration.initialized = true; } +void StorageObjectStorageConfiguration::check(ContextPtr) const +{ + FormatFactory::instance().checkFormatName(format); +} + StorageObjectStorageConfiguration::StorageObjectStorageConfiguration(const StorageObjectStorageConfiguration & other) { format = other.format; diff --git a/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.h b/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.h index 34965174bf9..c55362aa8bd 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.h @@ -50,7 +50,7 @@ public: bool isNamespaceWithGlobs() const; virtual std::string getPathWithoutGlob() const; - virtual void check(ContextPtr context) const = 0; + virtual void check(ContextPtr context) const; virtual void validateNamespace(const String & /* name */) const {} virtual ObjectStoragePtr createObjectStorage(ContextPtr context, bool is_readonly = true) = 0; /// NOLINT diff --git a/src/Storages/ObjectStorage/Utils.cpp b/src/Storages/ObjectStorage/Utils.cpp index 94d6dadee3b..2a7236ab196 100644 --- a/src/Storages/ObjectStorage/Utils.cpp +++ b/src/Storages/ObjectStorage/Utils.cpp @@ -12,7 +12,7 @@ namespace ErrorCodes std::optional checkAndGetNewFileOnInsertIfNeeded( const IObjectStorage & object_storage, - const StorageObjectStorageConfiguration & configuration, + const StorageObjectStorage::Configuration & configuration, const StorageObjectStorage::QuerySettings & settings, const String & key, size_t sequence_number) @@ -43,38 +43,33 @@ std::optional checkAndGetNewFileOnInsertIfNeeded( configuration.getNamespace(), key); } -StorageInMemoryMetadata getStorageMetadata( +void resolveSchemaAndFormat( + ColumnsDescription & columns, + std::string & format, ObjectStoragePtr object_storage, const StorageObjectStorageConfigurationPtr & configuration, - const ColumnsDescription & columns, - const ConstraintsDescription & constraints, std::optional format_settings, - const String & comment, const ContextPtr & context) { - StorageInMemoryMetadata storage_metadata; if (columns.empty()) { - auto fetched_columns = StorageObjectStorage::getTableStructureFromData(object_storage, configuration, format_settings, context); - storage_metadata.setColumns(fetched_columns); + if (format == "auto") + std::tie(columns, format) = StorageObjectStorage::resolveSchemaAndFormatFromData(object_storage, configuration, format_settings, context); + else + columns = StorageObjectStorage::resolveSchemaFromData(object_storage, configuration, format_settings, context); } - else if (!columns.hasOnlyOrdinary()) + else if (format == "auto") + { + format = StorageObjectStorage::resolveFormatFromData(object_storage, configuration, format_settings, context); + } + + if (!columns.hasOnlyOrdinary()) { /// We don't allow special columns. 
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Special columns are not supported for {} storage" "like MATERIALIZED, ALIAS or EPHEMERAL", configuration->getTypeName()); } - else - { - if (configuration->format == "auto") - StorageObjectStorage::setFormatFromData(object_storage, configuration, format_settings, context); - - storage_metadata.setColumns(columns); - } - storage_metadata.setConstraints(constraints); - storage_metadata.setComment(comment); - return storage_metadata; } } diff --git a/src/Storages/ObjectStorage/Utils.h b/src/Storages/ObjectStorage/Utils.h index 37bd49a77c0..3a752e6b8f0 100644 --- a/src/Storages/ObjectStorage/Utils.h +++ b/src/Storages/ObjectStorage/Utils.h @@ -12,19 +12,17 @@ struct StorageObjectStorageSettings; std::optional checkAndGetNewFileOnInsertIfNeeded( const IObjectStorage & object_storage, - const StorageObjectStorageConfiguration & configuration, + const StorageObjectStorage::Configuration & configuration, const StorageObjectStorage::QuerySettings & settings, const std::string & key, size_t sequence_number); - -StorageInMemoryMetadata getStorageMetadata( +void resolveSchemaAndFormat( + ColumnsDescription & columns, + std::string & format, ObjectStoragePtr object_storage, const StorageObjectStorageConfigurationPtr & configuration, - const ColumnsDescription & columns, - const ConstraintsDescription & constraints, std::optional format_settings, - const String & comment, const ContextPtr & context); } diff --git a/src/Storages/S3Queue/StorageS3Queue.cpp b/src/Storages/S3Queue/StorageS3Queue.cpp index 6b504b0d986..229c40396c5 100644 --- a/src/Storages/S3Queue/StorageS3Queue.cpp +++ b/src/Storages/S3Queue/StorageS3Queue.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -141,24 +142,14 @@ StorageS3Queue::StorageS3Queue( FormatFactory::instance().checkFormatName(configuration->format); configuration->check(context_); - StorageInMemoryMetadata storage_metadata; - if (columns_.empty()) - { - auto columns = StorageObjectStorage::getTableStructureFromData(object_storage, configuration, format_settings, context_); - storage_metadata.setColumns(columns); - } - else - { - if (configuration->format == "auto") - { - StorageObjectStorage::setFormatFromData(object_storage, configuration, format_settings, context_); - } - storage_metadata.setColumns(columns_); - } + ColumnsDescription columns{columns_}; + resolveSchemaAndFormat(columns, configuration->format, object_storage, configuration, format_settings, context_); + configuration->check(context_); + StorageInMemoryMetadata storage_metadata; + storage_metadata.setColumns(columns); storage_metadata.setConstraints(constraints_); storage_metadata.setComment(comment); - setInMemoryMetadata(storage_metadata); setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns())); LOG_INFO(log, "Using zookeeper path: {}", zk_path.string()); diff --git a/src/TableFunctions/TableFunctionObjectStorage.cpp b/src/TableFunctions/TableFunctionObjectStorage.cpp index 2b5c774ff78..06676a8adfa 100644 --- a/src/TableFunctions/TableFunctionObjectStorage.cpp +++ b/src/TableFunctions/TableFunctionObjectStorage.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -98,7 +99,9 @@ ColumnsDescription TableFunctionObjectStorage< { context->checkAccess(getSourceAccessType()); auto storage = getObjectStorage(context, !is_insert_query); - return StorageObjectStorage::getTableStructureFromData(storage, configuration, std::nullopt, context); + ColumnsDescription 
columns; + resolveSchemaAndFormat(columns, configuration->format, storage, configuration, std::nullopt, context); + return columns; } return parseColumnsListFromString(configuration->structure, context); From 16bc8aa0b1a68bd2422026ea7205a3746029e86b Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 25 Apr 2024 16:08:13 +0200 Subject: [PATCH 080/392] Fxi --- src/Storages/ObjectStorage/StorageObjectStorageConfiguration.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.cpp b/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.cpp index 3635269db34..89c15085274 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.cpp @@ -26,7 +26,6 @@ void StorageObjectStorageConfiguration::initialize( else FormatFactory::instance().checkFormatName(configuration.format); - configuration.check(local_context); configuration.initialized = true; } From 193ff63f87a2cef958983b2ef106a7c52f6db8be Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 25 Apr 2024 22:44:12 +0200 Subject: [PATCH 081/392] Fix --- .../ObjectStorage/StorageObjectStorage.cpp | 37 ++++++++++++++----- src/Storages/S3Queue/StorageS3Queue.cpp | 1 + 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index 36a8beba41a..f5bfb9d2a65 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -269,20 +269,37 @@ std::pair StorageObjectStorage::resolveSchemaAn SchemaCache & StorageObjectStorage::getSchemaCache(const ContextPtr & context) { - static SchemaCache schema_cache( - context->getConfigRef().getUInt( - "schema_inference_cache_max_elements_for_" + configuration->getTypeName(), - DEFAULT_SCHEMA_CACHE_ELEMENTS)); - return schema_cache; + return getSchemaCache(context, configuration->getTypeName()); } SchemaCache & StorageObjectStorage::getSchemaCache(const ContextPtr & context, const std::string & storage_type_name) { - static SchemaCache schema_cache( - context->getConfigRef().getUInt( - "schema_inference_cache_max_elements_for_" + storage_type_name, - DEFAULT_SCHEMA_CACHE_ELEMENTS)); - return schema_cache; + if (storage_type_name == "s3") + { + static SchemaCache schema_cache( + context->getConfigRef().getUInt( + "schema_inference_cache_max_elements_for_s3", + DEFAULT_SCHEMA_CACHE_ELEMENTS)); + return schema_cache; + } + else if (storage_type_name == "hdfs") + { + static SchemaCache schema_cache( + context->getConfigRef().getUInt( + "schema_inference_cache_max_elements_for_hdfs", + DEFAULT_SCHEMA_CACHE_ELEMENTS)); + return schema_cache; + } + else if (storage_type_name == "azure") + { + static SchemaCache schema_cache( + context->getConfigRef().getUInt( + "schema_inference_cache_max_elements_for_azure", + DEFAULT_SCHEMA_CACHE_ELEMENTS)); + return schema_cache; + } + else + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unsupported storage type: {}", storage_type_name); } } diff --git a/src/Storages/S3Queue/StorageS3Queue.cpp b/src/Storages/S3Queue/StorageS3Queue.cpp index 229c40396c5..e84dabecf3b 100644 --- a/src/Storages/S3Queue/StorageS3Queue.cpp +++ b/src/Storages/S3Queue/StorageS3Queue.cpp @@ -151,6 +151,7 @@ StorageS3Queue::StorageS3Queue( storage_metadata.setConstraints(constraints_); storage_metadata.setComment(comment); 
setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns())); + setInMemoryMetadata(storage_metadata); LOG_INFO(log, "Using zookeeper path: {}", zk_path.string()); task = getContext()->getSchedulePool().createTask("S3QueueStreamingTask", [this] { threadFunc(); }); From 69a3aa7bcf0e7a2d311a076493715cf3b1b3a418 Mon Sep 17 00:00:00 2001 From: avogar Date: Fri, 26 Apr 2024 11:01:32 +0000 Subject: [PATCH 082/392] Implement Dynamic data type --- docs/en/sql-reference/data-types/dynamic.md | 157 ++++ src/Columns/ColumnArray.cpp | 10 + src/Columns/ColumnArray.h | 3 + src/Columns/ColumnCompressed.h | 3 + src/Columns/ColumnConst.cpp | 9 + src/Columns/ColumnConst.h | 4 + src/Columns/ColumnDynamic.cpp | 785 ++++++++++++++++++ src/Columns/ColumnDynamic.h | 363 ++++++++ src/Columns/ColumnMap.cpp | 9 + src/Columns/ColumnMap.h | 3 + src/Columns/ColumnNullable.cpp | 9 + src/Columns/ColumnNullable.h | 3 + src/Columns/ColumnSparse.cpp | 9 + src/Columns/ColumnSparse.h | 3 + src/Columns/ColumnTuple.cpp | 28 + src/Columns/ColumnTuple.h | 3 + src/Columns/ColumnVariant.cpp | 185 ++++- src/Columns/ColumnVariant.h | 34 +- src/Columns/IColumn.cpp | 2 + src/Columns/IColumn.h | 2 + src/Columns/tests/gtest_column_dynamic.cpp | 652 +++++++++++++++ src/Core/Settings.h | 2 + src/Core/TypeId.h | 1 + src/DataTypes/DataTypeArray.cpp | 21 + src/DataTypes/DataTypeArray.h | 7 +- src/DataTypes/DataTypeDynamic.cpp | 144 ++++ src/DataTypes/DataTypeDynamic.h | 53 ++ src/DataTypes/DataTypeFactory.cpp | 1 + src/DataTypes/DataTypeFactory.h | 1 + src/DataTypes/DataTypeMap.h | 2 +- src/DataTypes/DataTypeObject.h | 2 +- src/DataTypes/DataTypeTuple.cpp | 4 +- src/DataTypes/DataTypeTuple.h | 2 +- src/DataTypes/DataTypeVariant.cpp | 23 +- src/DataTypes/DataTypeVariant.h | 4 +- src/DataTypes/IDataType.cpp | 71 +- src/DataTypes/IDataType.h | 30 +- src/DataTypes/ObjectUtils.cpp | 12 +- src/DataTypes/ObjectUtils.h | 4 +- .../Serializations/ISerialization.cpp | 19 + src/DataTypes/Serializations/ISerialization.h | 21 +- .../Serializations/SerializationArray.cpp | 5 +- .../Serializations/SerializationArray.h | 4 +- .../Serializations/SerializationDynamic.cpp | 645 ++++++++++++++ .../Serializations/SerializationDynamic.h | 116 +++ .../SerializationDynamicElement.cpp | 99 +++ .../SerializationDynamicElement.h | 58 ++ .../Serializations/SerializationInterval.cpp | 4 +- .../Serializations/SerializationInterval.h | 5 +- .../SerializationLowCardinality.cpp | 3 +- .../SerializationLowCardinality.h | 3 +- .../Serializations/SerializationMap.cpp | 5 +- .../Serializations/SerializationMap.h | 3 +- .../Serializations/SerializationNamed.cpp | 5 +- .../Serializations/SerializationNamed.h | 3 +- .../Serializations/SerializationNullable.cpp | 5 +- .../Serializations/SerializationNullable.h | 3 +- .../Serializations/SerializationObject.cpp | 5 +- .../Serializations/SerializationObject.h | 3 +- .../Serializations/SerializationSparse.cpp | 7 +- .../Serializations/SerializationSparse.h | 3 +- .../Serializations/SerializationTuple.cpp | 5 +- .../Serializations/SerializationTuple.h | 3 +- .../Serializations/SerializationVariant.cpp | 5 +- .../Serializations/SerializationVariant.h | 3 +- .../SerializationVariantElement.cpp | 28 +- .../SerializationVariantElement.h | 14 +- .../Serializations/SerializationWrapper.cpp | 5 +- .../Serializations/SerializationWrapper.h | 3 +- .../tests/gtest_object_serialization.cpp | 2 +- src/DataTypes/Utils.cpp | 1 + src/Databases/DatabaseReplicated.cpp | 1 + src/Formats/FormatSettings.h | 6 +- 
src/Formats/NativeReader.cpp | 2 +- src/Functions/FunctionsConversion.cpp | 356 +++++++- src/Functions/dynamicElement.cpp | 172 ++++ src/Functions/dynamicType.cpp | 104 +++ src/Functions/if.cpp | 11 + src/Functions/isNotNull.cpp | 6 +- src/Functions/isNull.cpp | 6 +- src/Functions/variantElement.cpp | 52 +- src/Interpreters/InterpreterCreateQuery.cpp | 2 +- src/Interpreters/InterpreterInsertQuery.cpp | 2 +- src/Interpreters/TreeRewriter.cpp | 34 +- src/Interpreters/convertFieldToType.cpp | 11 +- .../parseColumnsListForTableFunction.cpp | 14 +- .../parseColumnsListForTableFunction.h | 2 + src/Parsers/ParserDataType.cpp | 46 +- src/Processors/Formats/IOutputFormat.h | 3 +- .../Algorithms/AggregatingSortedAlgorithm.cpp | 39 +- .../Algorithms/AggregatingSortedAlgorithm.h | 3 +- .../Algorithms/CollapsingSortedAlgorithm.cpp | 19 +- .../Algorithms/CollapsingSortedAlgorithm.h | 2 - .../GraphiteRollupSortedAlgorithm.cpp | 28 +- .../GraphiteRollupSortedAlgorithm.h | 4 +- .../IMergingAlgorithmWithDelayedChunk.h | 2 +- .../IMergingAlgorithmWithSharedChunks.cpp | 5 +- .../IMergingAlgorithmWithSharedChunks.h | 6 +- src/Processors/Merges/Algorithms/MergedData.h | 42 +- .../Algorithms/MergingSortedAlgorithm.cpp | 3 +- .../Algorithms/ReplacingSortedAlgorithm.cpp | 17 +- .../Algorithms/ReplacingSortedAlgorithm.h | 2 - .../Algorithms/SummingSortedAlgorithm.cpp | 76 +- .../Algorithms/SummingSortedAlgorithm.h | 4 +- .../VersionedCollapsingAlgorithm.cpp | 15 +- .../Algorithms/VersionedCollapsingAlgorithm.h | 2 - .../Transforms/ColumnGathererTransform.cpp | 34 +- src/Storages/AlterCommands.cpp | 6 +- src/Storages/ColumnsDescription.cpp | 36 +- src/Storages/HDFS/StorageHDFS.h | 2 + src/Storages/HDFS/StorageHDFSCluster.h | 2 + src/Storages/IStorage.h | 4 +- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 30 + src/Storages/MergeTree/IMergeTreeDataPart.h | 2 + src/Storages/MergeTree/MergeTreeData.cpp | 2 +- src/Storages/MergeTree/MergeTreeData.h | 1 + .../MergeTreeDataPartWriterCompact.cpp | 17 +- .../MergeTreeDataPartWriterCompact.h | 4 +- .../MergeTree/MergeTreeDataPartWriterWide.cpp | 54 +- .../MergeTree/MergeTreeDataPartWriterWide.h | 14 +- .../MergeTree/MergeTreeDataWriter.cpp | 2 +- src/Storages/MergeTree/MergeTreeIndexSet.cpp | 2 +- .../MergeTree/MergeTreeReaderCompact.cpp | 5 +- .../MergeTree/MergeTreeReaderWide.cpp | 107 ++- src/Storages/MergeTree/MergeTreeReaderWide.h | 38 +- src/Storages/MergeTree/MergeTreeSettings.h | 1 + src/Storages/MergeTree/MutateTask.cpp | 16 +- .../MergeTree/StorageFromMergeTreeDataPart.h | 1 + src/Storages/S3Queue/StorageS3Queue.h | 1 + src/Storages/StorageAzureBlob.h | 2 + src/Storages/StorageAzureBlobCluster.h | 2 + src/Storages/StorageBuffer.h | 2 + src/Storages/StorageDistributed.cpp | 2 +- src/Storages/StorageDistributed.h | 1 + src/Storages/StorageDummy.h | 1 + src/Storages/StorageFile.h | 2 + src/Storages/StorageFileCluster.h | 2 + src/Storages/StorageInMemoryMetadata.cpp | 6 +- src/Storages/StorageLog.cpp | 2 +- src/Storages/StorageMaterializedView.h | 1 + src/Storages/StorageMemory.h | 1 + src/Storages/StorageMerge.h | 1 + src/Storages/StorageNull.h | 2 + src/Storages/StorageS3.h | 2 + src/Storages/StorageS3Cluster.h | 2 + src/Storages/StorageSnapshot.cpp | 2 +- src/Storages/StorageURL.h | 2 + src/Storages/StorageURLCluster.h | 2 + src/Storages/getStructureOfRemoteTable.cpp | 2 +- .../02943_variant_read_subcolumns.sh | 2 +- ...03033_dynamic_text_serialization.reference | 55 ++ .../03033_dynamic_text_serialization.sql | 74 ++ .../03034_dynamic_conversions.reference | 63 ++ 
.../0_stateless/03034_dynamic_conversions.sql | 24 + .../03035_dynamic_sorting.reference | 299 +++++++ .../0_stateless/03035_dynamic_sorting.sql | 80 ++ .../03036_dynamic_read_subcolumns.reference | 57 ++ .../03036_dynamic_read_subcolumns.sh | 62 ++ .../03037_dynamic_merges_1.reference | 120 +++ .../0_stateless/03037_dynamic_merges_1.sh | 61 ++ .../0_stateless/03037_dynamic_merges_2.sh | 45 + .../03038_nested_dynamic_merges.reference | 92 ++ .../03038_nested_dynamic_merges.sh | 53 ++ ...9_dynamic_all_merge_algorithms_1.reference | 88 ++ .../03039_dynamic_all_merge_algorithms_1.sh | 65 ++ ...9_dynamic_all_merge_algorithms_2.reference | 44 + .../03039_dynamic_all_merge_algorithms_2.sh | 50 ++ .../03040_dynamic_type_alters.reference | 526 ++++++++++++ .../0_stateless/03040_dynamic_type_alters.sh | 76 ++ 169 files changed, 6770 insertions(+), 438 deletions(-) create mode 100644 docs/en/sql-reference/data-types/dynamic.md create mode 100644 src/Columns/ColumnDynamic.cpp create mode 100644 src/Columns/ColumnDynamic.h create mode 100644 src/Columns/tests/gtest_column_dynamic.cpp create mode 100644 src/DataTypes/DataTypeDynamic.cpp create mode 100644 src/DataTypes/DataTypeDynamic.h create mode 100644 src/DataTypes/Serializations/SerializationDynamic.cpp create mode 100644 src/DataTypes/Serializations/SerializationDynamic.h create mode 100644 src/DataTypes/Serializations/SerializationDynamicElement.cpp create mode 100644 src/DataTypes/Serializations/SerializationDynamicElement.h create mode 100644 src/Functions/dynamicElement.cpp create mode 100644 src/Functions/dynamicType.cpp create mode 100644 tests/queries/0_stateless/03033_dynamic_text_serialization.reference create mode 100644 tests/queries/0_stateless/03033_dynamic_text_serialization.sql create mode 100644 tests/queries/0_stateless/03034_dynamic_conversions.reference create mode 100644 tests/queries/0_stateless/03034_dynamic_conversions.sql create mode 100644 tests/queries/0_stateless/03035_dynamic_sorting.reference create mode 100644 tests/queries/0_stateless/03035_dynamic_sorting.sql create mode 100644 tests/queries/0_stateless/03036_dynamic_read_subcolumns.reference create mode 100755 tests/queries/0_stateless/03036_dynamic_read_subcolumns.sh create mode 100644 tests/queries/0_stateless/03037_dynamic_merges_1.reference create mode 100755 tests/queries/0_stateless/03037_dynamic_merges_1.sh create mode 100755 tests/queries/0_stateless/03037_dynamic_merges_2.sh create mode 100644 tests/queries/0_stateless/03038_nested_dynamic_merges.reference create mode 100755 tests/queries/0_stateless/03038_nested_dynamic_merges.sh create mode 100644 tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_1.reference create mode 100755 tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_1.sh create mode 100644 tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_2.reference create mode 100755 tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_2.sh create mode 100644 tests/queries/0_stateless/03040_dynamic_type_alters.reference create mode 100755 tests/queries/0_stateless/03040_dynamic_type_alters.sh diff --git a/docs/en/sql-reference/data-types/dynamic.md b/docs/en/sql-reference/data-types/dynamic.md new file mode 100644 index 00000000000..e20bdad1e79 --- /dev/null +++ b/docs/en/sql-reference/data-types/dynamic.md @@ -0,0 +1,157 @@ +--- +slug: /en/sql-reference/data-types/dynamic +sidebar_position: 56 +sidebar_label: Dynamic +--- + +# Dynamic + +This type allows to store values of any type inside it without knowing all of 
them in advance. + +To declare a column of `Dynamic` type, use the following syntax: + +``` sql + Dynamic(max_types=N) +``` + +Where `N` is an optional parameter between `1` and `255` indicating how many different data types can be stored inside a column with type `Dynamic`. If this limit is exceeded, all new types will be converted to type `String`. Default value of `max_types` is `32`. + +:::note +The Dynamic data type is an experimental feature. To use it, set `allow_experimental_dynamic_type = 1`. +::: + +## Creating Dynamic + +Using `Dynamic` type in table column definition: + +```sql +CREATE TABLE test (d Dynamic) ENGINE = Memory; +INSERT INTO test VALUES (NULL), (42), ('Hello, World!'), ([1, 2, 3]); +SELECT d, dynamicType(d) FROM test; +``` + +```text +┌─d─────────────┬─dynamicType(d)─┐ +│ ᴺᵁᴸᴸ │ None │ +│ 42 │ Int64 │ +│ Hello, World! │ String │ +│ [1,2,3] │ Array(Int64) │ +└───────────────┴────────────────┘ +``` + +Using CAST from ordinary column: + +```sql +SELECT 'Hello, World!'::Dynamic as d, dynamicType(d); +``` + +```text +┌─d─────────────┬─dynamicType(d)─┐ +│ Hello, World! │ String │ +└───────────────┴────────────────┘ +``` + +Using CAST from `Variant` column: + +```sql +SET allow_experimental_variant_type = 1, use_variant_as_common_type = 1; +SELECT multiIf((number % 3) = 0, number, (number % 3) = 1, range(number + 1), NULL)::Dynamic AS d, dynamicType(d) FROM numbers(3) +``` + +```text +┌─d─────┬─dynamicType(d)─┐ +│ 0 │ UInt64 │ +│ [0,1] │ Array(UInt64) │ +│ ᴺᵁᴸᴸ │ None │ +└───────┴────────────────┘ +``` + + +## Reading Dynamic nested types as subcolumns + +`Dynamic` type supports reading a single nested type from a `Dynamic` column using the type name as a subcolumn. +So, if you have column `d Dynamic` you can read a subcolumn of any valid type `T` using syntax `d.T`, +this subcolumn will have type `Nullable(T)` if `T` can be inside `Nullable` and `T` otherwise. This subcolumn will +be the same size as original `Dynamic` column and will contain `NULL` values (or empty values if `T` cannot be inside `Nullable`) +in all rows in which original `Dynamic` column doesn't have type `T`. + +`Dynamic` subcolumns can be also read using function `dynamicElement(dynamic_column, type_name)`. + +Examples: + +```sql +CREATE TABLE test (d Dynamic) ENGINE = Memory; +INSERT INTO test VALUES (NULL), (42), ('Hello, World!'), ([1, 2, 3]); +SELECT d, dynamicType(d), d.String, d.Int64, d.`Array(Int64)`, d.Date, d.`Array(String)` FROM test; +``` + +```text +┌─d─────────────┬─dynamicType(d)─┬─d.String──────┬─d.Int64─┬─d.Array(Int64)─┬─d.Date─┬─d.Array(String)─┐ +│ ᴺᵁᴸᴸ │ None │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [] │ ᴺᵁᴸᴸ │ [] │ +│ 42 │ Int64 │ ᴺᵁᴸᴸ │ 42 │ [] │ ᴺᵁᴸᴸ │ [] │ +│ Hello, World! │ String │ Hello, World! 
│ ᴺᵁᴸᴸ │ [] │ ᴺᵁᴸᴸ │ [] │
+│ [1,2,3] │ Array(Int64) │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [1,2,3] │ ᴺᵁᴸᴸ │ [] │
+└───────────────┴────────────────┴───────────────┴─────────┴────────────────┴────────┴─────────────────┘
+```
+
+```sql
+SELECT toTypeName(d.String), toTypeName(d.Int64), toTypeName(d.`Array(Int64)`), toTypeName(d.Date), toTypeName(d.`Array(String)`) FROM test LIMIT 1;
+```
+
+```text
+┌─toTypeName(d.String)─┬─toTypeName(d.Int64)─┬─toTypeName(d.Array(Int64))─┬─toTypeName(d.Date)─┬─toTypeName(d.Array(String))─┐
+│ Nullable(String) │ Nullable(Int64) │ Array(Int64) │ Nullable(Date) │ Array(String) │
+└──────────────────────┴─────────────────────┴────────────────────────────┴────────────────────┴─────────────────────────────┘
+```
+
+```sql
+SELECT d, dynamicType(d), dynamicElement(d, 'String'), dynamicElement(d, 'Int64'), dynamicElement(d, 'Array(Int64)'), dynamicElement(d, 'Date'), dynamicElement(d, 'Array(String)') FROM test;
+```
+
+```text
+┌─d─────────────┬─dynamicType(d)─┬─dynamicElement(d, 'String')─┬─dynamicElement(d, 'Int64')─┬─dynamicElement(d, 'Array(Int64)')─┬─dynamicElement(d, 'Date')─┬─dynamicElement(d, 'Array(String)')─┐
+│ ᴺᵁᴸᴸ │ None │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [] │ ᴺᵁᴸᴸ │ [] │
+│ 42 │ Int64 │ ᴺᵁᴸᴸ │ 42 │ [] │ ᴺᵁᴸᴸ │ [] │
+│ Hello, World! │ String │ Hello, World! │ ᴺᵁᴸᴸ │ [] │ ᴺᵁᴸᴸ │ [] │
+│ [1,2,3] │ Array(Int64) │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [1,2,3] │ ᴺᵁᴸᴸ │ [] │
+└───────────────┴────────────────┴─────────────────────────────┴────────────────────────────┴───────────────────────────────────┴───────────────────────────┴────────────────────────────────────┘
+```
+
+To know what type is stored in each row, the function `dynamicType(dynamic_column)` can be used. It returns a `String` with the value type name for each row (or `'None'` if the row is `NULL`).
+
+Example:
+
+```sql
+CREATE TABLE test (d Dynamic) ENGINE = Memory;
+INSERT INTO test VALUES (NULL), (42), ('Hello, World!'), ([1, 2, 3]);
+SELECT dynamicType(d) FROM test;
+```
+
+```text
+┌─dynamicType(d)─┐
+│ None │
+│ Int64 │
+│ String │
+│ Array(Int64) │
+└────────────────┘
+```
+
+## Conversion between Dynamic column and other columns
+
+There are 4 possible conversions that can be performed with a `Dynamic` column.
+
+### Converting an ordinary column to a Dynamic column
+
+```sql
+SELECT 'Hello, World!'::Dynamic as d, dynamicType(d);
+```
+
+```text
+┌─d─────────────┬─dynamicType(d)─┐
+│ Hello, World! │ String │
+└───────────────┴────────────────┘
+```
+
+
+
+
+
diff --git a/src/Columns/ColumnArray.cpp b/src/Columns/ColumnArray.cpp
index 7b268b80116..29773492dc9 100644
--- a/src/Columns/ColumnArray.cpp
+++ b/src/Columns/ColumnArray.cpp
@@ -1289,4 +1289,14 @@ size_t ColumnArray::getNumberOfDimensions() const
 return 1 + nested_array->getNumberOfDimensions(); /// Every modern C++ compiler optimizes tail recursion.
} +void ColumnArray::takeDynamicStructureFromSourceColumns(const DB::Columns & source_columns) +{ + Columns nested_source_columns; + nested_source_columns.reserve(source_columns.size()); + for (const auto & source_column : source_columns) + nested_source_columns.push_back(assert_cast(*source_column).getDataPtr()); + + data->takeDynamicStructureFromSourceColumns(nested_source_columns); +} + } diff --git a/src/Columns/ColumnArray.h b/src/Columns/ColumnArray.h index 230d8830265..53eb5166df8 100644 --- a/src/Columns/ColumnArray.h +++ b/src/Columns/ColumnArray.h @@ -175,6 +175,9 @@ public: size_t getNumberOfDimensions() const; + bool hasDynamicStructure() const override { return getData().hasDynamicStructure(); } + void takeDynamicStructureFromSourceColumns(const Columns & source_columns) override; + private: WrappedPtr data; WrappedPtr offsets; diff --git a/src/Columns/ColumnCompressed.h b/src/Columns/ColumnCompressed.h index 6763410b46d..934adf07cf4 100644 --- a/src/Columns/ColumnCompressed.h +++ b/src/Columns/ColumnCompressed.h @@ -122,6 +122,9 @@ public: UInt64 getNumberOfDefaultRows() const override { throwMustBeDecompressed(); } void getIndicesOfNonDefaultRows(Offsets &, size_t, size_t) const override { throwMustBeDecompressed(); } + bool hasDynamicStructure() const override { throwMustBeDecompressed(); } + void takeDynamicStructureFromSourceColumns(const Columns &) override { throwMustBeDecompressed(); } + protected: size_t rows; size_t bytes; diff --git a/src/Columns/ColumnConst.cpp b/src/Columns/ColumnConst.cpp index f2cea83db0e..cf3f448516c 100644 --- a/src/Columns/ColumnConst.cpp +++ b/src/Columns/ColumnConst.cpp @@ -159,6 +159,15 @@ void ColumnConst::compareColumn( std::fill(compare_results.begin(), compare_results.end(), res); } +void ColumnConst::takeDynamicStructureFromSourceColumns(const DB::Columns & source_columns) +{ + Columns nested_source_columns; + nested_source_columns.reserve(source_columns.size()); + for (const auto & source_column : source_columns) + nested_source_columns.push_back(assert_cast(*source_column).getDataColumnPtr()); + data->takeDynamicStructureFromSourceColumns(nested_source_columns); +} + ColumnConst::Ptr createColumnConst(const ColumnPtr & column, Field value) { auto data = column->cloneEmpty(); diff --git a/src/Columns/ColumnConst.h b/src/Columns/ColumnConst.h index 4a3d40ca0d2..042468cbbcc 100644 --- a/src/Columns/ColumnConst.h +++ b/src/Columns/ColumnConst.h @@ -306,6 +306,10 @@ public: T getValue() const { return static_cast(getField().safeGet()); } bool isCollationSupported() const override { return data->isCollationSupported(); } + + bool hasDynamicStructure() const override { return data->hasDynamicStructure(); } + + void takeDynamicStructureFromSourceColumns(const Columns & source_columns) override; }; ColumnConst::Ptr createColumnConst(const ColumnPtr & column, Field value); diff --git a/src/Columns/ColumnDynamic.cpp b/src/Columns/ColumnDynamic.cpp new file mode 100644 index 00000000000..293055b43fc --- /dev/null +++ b/src/Columns/ColumnDynamic.cpp @@ -0,0 +1,785 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; + extern const int PARAMETER_OUT_OF_BOUND; +} + + +ColumnDynamic::ColumnDynamic(size_t max_dynamic_types_) : max_dynamic_types(max_dynamic_types_) +{ + /// Create empty Variant. 
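+ /// A newly created ColumnDynamic has no variants at all: the wrapped Variant column
+ /// starts empty and is extended lazily (see addNewVariant) as values of new types are inserted.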
+ variant_info.variant_type = std::make_shared(DataTypes{}); + variant_info.variant_name = variant_info.variant_type->getName(); + variant_column = variant_info.variant_type->createColumn(); +} + +ColumnDynamic::ColumnDynamic( + MutableColumnPtr variant_column_, const VariantInfo & variant_info_, size_t max_dynamic_types_, const Statistics & statistics_) + : variant_column(std::move(variant_column_)) + , variant_info(variant_info_) + , max_dynamic_types(max_dynamic_types_) + , statistics(statistics_) +{ +} + +ColumnDynamic::MutablePtr ColumnDynamic::create(MutableColumnPtr variant_column, const DataTypePtr & variant_type, size_t max_dynamic_types_, const Statistics & statistics_) +{ + VariantInfo variant_info; + variant_info.variant_type = variant_type; + variant_info.variant_name = variant_type->getName(); + const auto & variants = assert_cast(*variant_type).getVariants(); + variant_info.variant_names.reserve(variants.size()); + variant_info.variant_name_to_discriminator.reserve(variants.size()); + for (ColumnVariant::Discriminator discr = 0; discr != variants.size(); ++discr) + { + variant_info.variant_names.push_back(variants[discr]->getName()); + variant_info.variant_name_to_discriminator[variant_info.variant_names.back()] = discr; + } + + return create(std::move(variant_column), variant_info, max_dynamic_types_, statistics_); +} + +bool ColumnDynamic::addNewVariant(const DB::DataTypePtr & new_variant) +{ + /// Check if we already have such variant. + if (variant_info.variant_name_to_discriminator.contains(new_variant->getName())) + return true; + + /// Check if we reached maximum number of variants. + if (variant_info.variant_names.size() >= max_dynamic_types) + { + /// ColumnDynamic can have max_dynamic_types number of variants only when it has String as a variant. + /// Otherwise we won't be able to add cast new variants to Strings. + if (!variant_info.variant_name_to_discriminator.contains("String")) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Maximum number of variants reached, but no String variant exists"); + + return false; + } + + /// If we have max_dynamic_types - 1 number of variants and don't have String variant, we can add only String variant. 
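+ /// In other words, the last free slot is reserved for String, so that values of any further
+ /// new types can still be stored after being cast to String.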
+ if (variant_info.variant_names.size() == max_dynamic_types - 1 && new_variant->getName() != "String" && !variant_info.variant_name_to_discriminator.contains("String")) + return false; + + const DataTypes & current_variants = assert_cast(*variant_info.variant_type).getVariants(); + DataTypes all_variants = current_variants; + all_variants.push_back(new_variant); + auto new_variant_type = std::make_shared(all_variants); + const auto & new_variants = assert_cast(*new_variant_type).getVariants(); + + std::vector current_to_new_discriminators; + current_to_new_discriminators.resize(variant_info.variant_names.size()); + Names new_variant_names; + new_variant_names.reserve(new_variants.size()); + std::unordered_map new_variant_name_to_discriminator; + new_variant_name_to_discriminator.reserve(new_variants.size()); + std::vector> new_variant_columns_and_discriminators_to_add; + new_variant_columns_and_discriminators_to_add.reserve(new_variants.size() - current_variants.size()); + + for (ColumnVariant::Discriminator discr = 0; discr != new_variants.size(); ++discr) + { + String name = new_variants[discr]->getName(); + new_variant_names.push_back(name); + new_variant_name_to_discriminator[name] = discr; + auto it = variant_info.variant_name_to_discriminator.find(name); + if (it == variant_info.variant_name_to_discriminator.end()) + new_variant_columns_and_discriminators_to_add.emplace_back(new_variants[discr]->createColumn(), discr); + else + current_to_new_discriminators[it->second] = discr; + } + + variant_info.variant_type = new_variant_type; + variant_info.variant_name = new_variant_type->getName(); + variant_info.variant_names = new_variant_names; + variant_info.variant_name_to_discriminator = new_variant_name_to_discriminator; + assert_cast(*variant_column).extend(current_to_new_discriminators, std::move(new_variant_columns_and_discriminators_to_add)); + variant_mappings_cache.clear(); + return true; +} + +void ColumnDynamic::addStringVariant() +{ + addNewVariant(std::make_shared()); +} + +void ColumnDynamic::updateVariantInfoAndExpandVariantColumn(const DB::DataTypePtr & new_variant_type) +{ + const DataTypes & current_variants = assert_cast(variant_info.variant_type.get())->getVariants(); + const DataTypes & new_variants = assert_cast(new_variant_type.get())->getVariants(); + + Names new_variant_names; + new_variant_names.reserve(new_variants.size()); + std::unordered_map new_variant_name_to_discriminator; + new_variant_name_to_discriminator.reserve(new_variants.size()); + std::vector> new_variant_columns_and_discriminators_to_add; + new_variant_columns_and_discriminators_to_add.reserve(new_variants.size() - current_variants.size()); + std::vector current_to_new_discriminators; + current_to_new_discriminators.resize(current_variants.size()); + + for (ColumnVariant::Discriminator discr = 0; discr != new_variants.size(); ++discr) + { + String name = new_variants[discr]->getName(); + new_variant_names.push_back(name); + new_variant_name_to_discriminator[name] = discr; + + auto current_it = variant_info.variant_name_to_discriminator.find(name); + if (current_it == variant_info.variant_name_to_discriminator.end()) + new_variant_columns_and_discriminators_to_add.emplace_back(new_variants[discr]->createColumn(), discr); + else + current_to_new_discriminators[current_it->second] = discr; + } + + variant_info.variant_type = new_variant_type; + variant_info.variant_name = new_variant_type->getName(); + variant_info.variant_names = new_variant_names; + variant_info.variant_name_to_discriminator = 
new_variant_name_to_discriminator; + assert_cast(*variant_column).extend(current_to_new_discriminators, std::move(new_variant_columns_and_discriminators_to_add)); + /// Clear mappings cache because now with new Variant we will have new mappings. + variant_mappings_cache.clear(); +} + +std::vector * ColumnDynamic::combineVariants(const DB::ColumnDynamic::VariantInfo & other_variant_info) +{ + /// Check if we already have global discriminators mapping for other Variant in cache. + /// It's used to not calculate the same mapping each call of insertFrom with the same columns. + auto cache_it = variant_mappings_cache.find(other_variant_info.variant_name); + if (cache_it != variant_mappings_cache.end()) + return &cache_it->second; + + /// Check if we already tried to combine these variants but failed due to max_dynamic_types limit. + if (variants_with_failed_combination.contains(other_variant_info.variant_name)) + return nullptr; + + const DataTypes & other_variants = assert_cast(*other_variant_info.variant_type).getVariants(); + + size_t num_new_variants = 0; + for (size_t i = 0; i != other_variants.size(); ++i) + { + if (!variant_info.variant_name_to_discriminator.contains(other_variant_info.variant_names[i])) + ++num_new_variants; + } + + /// If we have new variants we need to update current variant info and extend Variant column + if (num_new_variants) + { + const DataTypes & current_variants = assert_cast(*variant_info.variant_type).getVariants(); + + /// We cannot combine Variants if total number of variants exceeds max_dynamic_types. + if (current_variants.size() + num_new_variants > max_dynamic_types) + { + /// Remember that we cannot combine our variant with this one, so we will not try to do it again. + variants_with_failed_combination.insert(other_variant_info.variant_name); + return nullptr; + } + + /// We cannot combine Variants if total number of variants reaches max_dynamic_types and we don't have String variant. + if (current_variants.size() + num_new_variants == max_dynamic_types && !variant_info.variant_name_to_discriminator.contains("String") && !other_variant_info.variant_name_to_discriminator.contains("String")) + { + variants_with_failed_combination.insert(other_variant_info.variant_name); + return nullptr; + } + + DataTypes all_variants = current_variants; + all_variants.insert(all_variants.end(), other_variants.begin(), other_variants.end()); + auto new_variant_type = std::make_shared(all_variants); + updateVariantInfoAndExpandVariantColumn(new_variant_type); + } + + /// Create a global discriminators mapping for other variant. + std::vector other_to_new_discriminators; + other_to_new_discriminators.reserve(other_variants.size()); + for (size_t i = 0; i != other_variants.size(); ++i) + other_to_new_discriminators.push_back(variant_info.variant_name_to_discriminator[other_variant_info.variant_names[i]]); + + /// Save mapping to cache to not calculate it again for the same Variants. + auto [it, _] = variant_mappings_cache.emplace(other_variant_info.variant_name, std::move(other_to_new_discriminators)); + return &it->second; +} + +void ColumnDynamic::insert(const DB::Field & x) +{ + /// Check if we can insert field without Variant extension. + if (variant_column->tryInsert(x)) + return; + + /// If we cannot insert field into current variant column, extend it with new variant for this field from its type. + if (likely(addNewVariant(applyVisitor(FieldToDataType(), x)))) + { + /// Now we should be able to insert this field into extended variant column. 
+ variant_column->insert(x); + } + else + { + /// We reached maximum number of variants and couldn't add new variant. + /// This case should be really rare in real use cases. + /// We should always be able to add String variant and cast inserted value to String. + addStringVariant(); + variant_column->insert(toString(x)); + } +} + +bool ColumnDynamic::tryInsert(const DB::Field & x) +{ + /// We can insert any value into Dynamic column. + insert(x); + return true; +} + + +void ColumnDynamic::insertFrom(const DB::IColumn & src_, size_t n) +{ + const auto & dynamic_src = assert_cast(src_); + + /// Check if we have the same variants in both columns. + if (variant_info.variant_name == dynamic_src.variant_info.variant_name) + { + variant_column->insertFrom(*dynamic_src.variant_column, n); + return; + } + + auto & variant_col = assert_cast(*variant_column); + + /// If variants are different, we need to extend our variant with new variants. + if (auto global_discriminators_mapping = combineVariants(dynamic_src.variant_info)) + { + variant_col.insertFrom(*dynamic_src.variant_column, n, *global_discriminators_mapping); + return; + } + + /// We cannot combine 2 Variant types as total number of variants exceeds the limit. + /// We need to insert single value, try to add only corresponding variant. + const auto & src_variant_col = assert_cast(*dynamic_src.variant_column); + auto src_global_discr = src_variant_col.globalDiscriminatorAt(n); + + /// NULL doesn't require Variant extension. + if (src_global_discr == ColumnVariant::NULL_DISCRIMINATOR) + { + insertDefault(); + return; + } + + auto variant_type = assert_cast(*dynamic_src.variant_info.variant_type).getVariants()[src_global_discr]; + if (addNewVariant(variant_type)) + { + auto discr = variant_info.variant_name_to_discriminator[dynamic_src.variant_info.variant_names[src_global_discr]]; + variant_col.insertIntoVariantFrom(discr, src_variant_col.getVariantByGlobalDiscriminator(src_global_discr), src_variant_col.offsetAt(n)); + return; + } + + /// We reached maximum number of variants and couldn't add new variant. + /// We should always be able to add String variant and cast inserted value to String. + addStringVariant(); + auto tmp_variant_column = src_variant_col.getVariantByGlobalDiscriminator(src_global_discr).cloneEmpty(); + tmp_variant_column->insertFrom(src_variant_col.getVariantByGlobalDiscriminator(src_global_discr), src_variant_col.offsetAt(n)); + auto tmp_string_column = castColumn(ColumnWithTypeAndName(tmp_variant_column->getPtr(), variant_type, ""), std::make_shared()); + auto string_variant_discr = variant_info.variant_name_to_discriminator["String"]; + variant_col.insertIntoVariantFrom(string_variant_discr, *tmp_string_column, 0); +} + +void ColumnDynamic::insertRangeFrom(const DB::IColumn & src_, size_t start, size_t length) +{ + if (start + length > src_.size()) + throw Exception(ErrorCodes::PARAMETER_OUT_OF_BOUND, "Parameter out of bound in ColumnDynamic::insertRangeFrom method. " + "[start({}) + length({}) > src.size()({})]", start, length, src_.size()); + + const auto & dynamic_src = assert_cast(src_); + + /// Check if we have the same variants in both columns. + if (variant_info.variant_names == dynamic_src.variant_info.variant_names) + { + variant_column->insertRangeFrom(*dynamic_src.variant_column, start, length); + return; + } + + auto & variant_col = assert_cast(*variant_column); + + /// If variants are different, we need to extend our variant with new variants. 
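+ /// combineVariants returns a mapping from the source column's global discriminators to ours,
+ /// or nullptr if the combined set of variants would not fit into max_dynamic_types.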
+ if (auto global_discriminators_mapping = combineVariants(dynamic_src.variant_info)) + { + variant_col.insertRangeFrom(*dynamic_src.variant_column, start, length, *global_discriminators_mapping); + return; + } + + /// We cannot combine 2 Variant types as total number of variants exceeds the limit. + /// In this case we will add most frequent variants from this range and insert them as usual, + /// all other variants will be converted to String. + const auto & src_variant_column = dynamic_src.getVariantColumn(); + + /// Calculate ranges for each variant in current range. + std::vector> variants_ranges(dynamic_src.variant_info.variant_names.size(), {0, 0}); + /// If we insert the whole column, no need to iterate through the range, we can just take variant sizes. + if (start == 0 && length == dynamic_src.size()) + { + for (size_t i = 0; i != dynamic_src.variant_info.variant_names.size(); ++i) + variants_ranges[i] = {0, src_variant_column.getVariantByGlobalDiscriminator(i).size()}; + } + /// Otherwise we need to iterate through discriminators and calculate the range for each variant. + else + { + const auto & local_discriminators = src_variant_column.getLocalDiscriminators(); + const auto & offsets = src_variant_column.getOffsets(); + size_t end = start + length; + for (size_t i = start; i != end; ++i) + { + auto discr = src_variant_column.globalDiscriminatorByLocal(local_discriminators[i]); + if (discr != ColumnVariant::NULL_DISCRIMINATOR) + { + if (!variants_ranges[discr].second) + variants_ranges[discr].first = offsets[i]; + ++variants_ranges[discr].second; + } + } + } + + const auto & src_variants = assert_cast(*dynamic_src.variant_info.variant_type).getVariants(); + /// List of variants that will be converted to String. + std::vector variants_to_convert_to_string; + /// Mapping from global discriminators of src_variant to the new variant we will create. + std::vector other_to_new_discriminators; + other_to_new_discriminators.reserve(dynamic_src.variant_info.variant_names.size()); + + /// Check if we cannot add any more new variants. In this case we will convert all new variants to String. + if (variant_info.variant_names.size() == max_dynamic_types || (variant_info.variant_names.size() == max_dynamic_types - 1 && !variant_info.variant_name_to_discriminator.contains("String"))) + { + addStringVariant(); + for (size_t i = 0; i != dynamic_src.variant_info.variant_names.size(); ++i) + { + auto it = variant_info.variant_name_to_discriminator.find(dynamic_src.variant_info.variant_names[i]); + if (it == variant_info.variant_name_to_discriminator.end()) + { + variants_to_convert_to_string.push_back(i); + other_to_new_discriminators.push_back(variant_info.variant_name_to_discriminator["String"]); + } + else + { + other_to_new_discriminators.push_back(it->second); + } + } + } + /// We still can add some new variants, but not all of them. Let's choose the most frequent variants in specified range. 
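+ /// New variants are ordered by their size within the inserted range (descending) and added
+ /// until max_dynamic_types is reached; the remaining ones are redirected to the String variant.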
+ else + { + std::vector> new_variants_with_sizes; + new_variants_with_sizes.reserve(dynamic_src.variant_info.variant_names.size()); + for (size_t i = 0; i != dynamic_src.variant_info.variant_names.size(); ++i) + { + const auto & variant_name = dynamic_src.variant_info.variant_names[i]; + if (variant_name != "String" && !variant_info.variant_name_to_discriminator.contains(variant_name)) + new_variants_with_sizes.emplace_back(variants_ranges[i].second, i); + } + + std::sort(new_variants_with_sizes.begin(), new_variants_with_sizes.end(), std::greater()); + DataTypes new_variants = assert_cast(*variant_info.variant_type).getVariants(); + if (!variant_info.variant_name_to_discriminator.contains("String")) + new_variants.push_back(std::make_shared()); + + for (const auto & [_, discr] : new_variants_with_sizes) + { + if (new_variants.size() != max_dynamic_types) + new_variants.push_back(src_variants[discr]); + else + variants_to_convert_to_string.push_back(discr); + } + + auto new_variant_type = std::make_shared(new_variants); + updateVariantInfoAndExpandVariantColumn(new_variant_type); + auto string_variant_discriminator = variant_info.variant_name_to_discriminator.at("String"); + for (const auto & variant_name : dynamic_src.variant_info.variant_names) + { + auto it = variant_info.variant_name_to_discriminator.find(variant_name); + if (it == variant_info.variant_name_to_discriminator.end()) + other_to_new_discriminators.push_back(string_variant_discriminator); + else + other_to_new_discriminators.push_back(it->second); + } + } + + /// Convert to String all variants that couldn't be added. + std::unordered_map variants_converted_to_string; + variants_converted_to_string.reserve(variants_to_convert_to_string.size()); + for (auto discr : variants_to_convert_to_string) + { + auto [variant_start, variant_length] = variants_ranges[discr]; + const auto & variant = src_variant_column.getVariantPtrByGlobalDiscriminator(discr); + if (variant_start == 0 && variant_length == variant->size()) + variants_converted_to_string[discr] = castColumn(ColumnWithTypeAndName(variant, src_variants[discr], ""), std::make_shared()); + else + variants_converted_to_string[discr] = castColumn(ColumnWithTypeAndName(variant->cut(variant_start, variant_length), src_variants[discr], ""), std::make_shared()); + } + + const auto & src_local_discriminators = src_variant_column.getLocalDiscriminators(); + const auto & src_offsets = src_variant_column.getOffsets(); + const auto & src_variant_columns = src_variant_column.getVariants(); + size_t end = start + length; + for (size_t i = start; i != end; ++i) + { + auto local_discr = src_local_discriminators[i]; + if (local_discr == ColumnVariant::NULL_DISCRIMINATOR) + { + variant_col.insertDefault(); + } + else + { + auto global_discr = src_variant_column.globalDiscriminatorByLocal(local_discr); + auto to_global_discr = other_to_new_discriminators[global_discr]; + auto it = variants_converted_to_string.find(global_discr); + if (it == variants_converted_to_string.end()) + { + variant_col.insertIntoVariantFrom(to_global_discr, *src_variant_columns[local_discr], src_offsets[i]); + } + else + { + variant_col.insertIntoVariantFrom(to_global_discr, *it->second, src_offsets[i] - variants_ranges[global_discr].first); + } + } + } +} + +void ColumnDynamic::insertManyFrom(const DB::IColumn & src_, size_t position, size_t length) +{ + const auto & dynamic_src = assert_cast(src_); + + /// Check if we have the same variants in both columns. 
+ if (variant_info.variant_names == dynamic_src.variant_info.variant_names) + { + variant_column->insertManyFrom(*dynamic_src.variant_column, position, length); + return; + } + + auto & variant_col = assert_cast(*variant_column); + + /// If variants are different, we need to extend our variant with new variants. + if (auto global_discriminators_mapping = combineVariants(dynamic_src.variant_info)) + { + variant_col.insertManyFrom(*dynamic_src.variant_column, position, length, *global_discriminators_mapping); + return; + } + + /// We cannot combine 2 Variant types as total number of variants exceeds the limit. + /// We need to insert single value, try to add only corresponding variant. + const auto & src_variant_col = assert_cast(*dynamic_src.variant_column); + auto src_global_discr = src_variant_col.globalDiscriminatorAt(position); + if (src_global_discr == ColumnVariant::NULL_DISCRIMINATOR) + { + insertDefault(); + return; + } + + auto variant_type = assert_cast(*dynamic_src.variant_info.variant_type).getVariants()[src_global_discr]; + if (addNewVariant(variant_type)) + { + auto discr = variant_info.variant_name_to_discriminator[dynamic_src.variant_info.variant_names[src_global_discr]]; + variant_col.insertManyIntoVariantFrom(discr, src_variant_col.getVariantByGlobalDiscriminator(src_global_discr), src_variant_col.offsetAt(position), length); + return; + } + + addStringVariant(); + auto tmp_variant_column = src_variant_col.getVariantByGlobalDiscriminator(src_global_discr).cloneEmpty(); + tmp_variant_column->insertFrom(src_variant_col.getVariantByGlobalDiscriminator(src_global_discr), src_variant_col.offsetAt(position)); + auto tmp_string_column = castColumn(ColumnWithTypeAndName(tmp_variant_column->getPtr(), variant_type, ""), std::make_shared()); + auto string_variant_discr = variant_info.variant_name_to_discriminator["String"]; + variant_col.insertManyIntoVariantFrom(string_variant_discr, *tmp_string_column, 0, length); +} + + +StringRef ColumnDynamic::serializeValueIntoArena(size_t n, DB::Arena & arena, const char *& begin) const +{ + /// We cannot use Variant serialization here as it serializes discriminator + value, + /// but Dynamic doesn't have fixed mapping discriminator <-> variant type + /// as different Dynamic column can have different Variants. + /// Instead, we serialize null bit + variant type name (size + bytes) + value. 
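+ /// Resulting layout in the arena:
+ ///   [null_bit = 1]                                          -- NULL value
+ ///   [null_bit = 0][name_size][name bytes][serialized value] -- non-NULL value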
+ const auto & variant_col = assert_cast(*variant_column); + auto discr = variant_col.globalDiscriminatorAt(n); + StringRef res; + UInt8 null_bit = discr == ColumnVariant::NULL_DISCRIMINATOR; + if (null_bit) + { + char * pos = arena.allocContinue(sizeof(UInt8), begin); + memcpy(pos, &null_bit, sizeof(UInt8)); + res.data = pos; + res.size = sizeof(UInt8); + return res; + } + + const auto & variant_name = variant_info.variant_names[discr]; + size_t variant_name_size = variant_name.size(); + char * pos = arena.allocContinue(sizeof(UInt8) + sizeof(size_t) + variant_name.size(), begin); + memcpy(pos, &null_bit, sizeof(UInt8)); + memcpy(pos + sizeof(UInt8), &variant_name_size, sizeof(size_t)); + memcpy(pos + sizeof(UInt8) + sizeof(size_t), variant_name.data(), variant_name.size()); + res.data = pos; + res.size = sizeof(UInt8) + sizeof(size_t) + variant_name.size(); + + auto value_ref = variant_col.getVariantByGlobalDiscriminator(discr).serializeValueIntoArena(variant_col.offsetAt(n), arena, begin); + res.data = value_ref.data - res.size; + res.size += value_ref.size; + return res; +} + +const char * ColumnDynamic::deserializeAndInsertFromArena(const char * pos) +{ + auto & variant_col = assert_cast(*variant_column); + UInt8 null_bit = unalignedLoad(pos); + pos += sizeof(UInt8); + if (null_bit) + { + insertDefault(); + return pos; + } + + /// Read variant type name. + const size_t variant_name_size = unalignedLoad(pos); + pos += sizeof(variant_name_size); + String variant_name; + variant_name.resize(variant_name_size); + memcpy(variant_name.data(), pos, variant_name_size); + pos += variant_name_size; + /// If we already have such variant, just deserialize it into corresponding variant column. + auto it = variant_info.variant_name_to_discriminator.find(variant_name); + if (it != variant_info.variant_name_to_discriminator.end()) + { + auto discr = it->second; + return variant_col.deserializeVariantAndInsertFromArena(discr, pos); + } + + /// If we don't have such variant, add it. + auto variant_type = DataTypeFactory::instance().get(variant_name); + if (likely(addNewVariant(variant_type))) + { + auto discr = variant_info.variant_name_to_discriminator[variant_name]; + return variant_col.deserializeVariantAndInsertFromArena(discr, pos); + } + + /// We reached maximum number of variants and couldn't add new variant. + /// This case should be really rare in real use cases. + /// We should always be able to add String variant and cast inserted value to String. + addStringVariant(); + /// Create temporary column of this variant type and deserialize value into it. + auto tmp_variant_column = variant_type->createColumn(); + pos = tmp_variant_column->deserializeAndInsertFromArena(pos); + /// Cast temporary column to String and insert this value into String variant. 
+ auto str_column = castColumn(ColumnWithTypeAndName(tmp_variant_column->getPtr(), variant_type, ""), std::make_shared()); + variant_col.insertIntoVariantFrom(variant_info.variant_name_to_discriminator["String"], *str_column, 0); + return pos; +} + +const char * ColumnDynamic::skipSerializedInArena(const char * pos) const +{ + UInt8 null_bit = unalignedLoad(pos); + pos += sizeof(UInt8); + if (null_bit) + return pos; + + const size_t variant_name_size = unalignedLoad(pos); + pos += sizeof(variant_name_size); + String variant_name; + variant_name.resize(variant_name_size); + memcpy(variant_name.data(), pos, variant_name_size); + pos += variant_name_size; + auto tmp_variant_column = DataTypeFactory::instance().get(variant_name)->createColumn(); + return tmp_variant_column->skipSerializedInArena(pos); +} + +void ColumnDynamic::updateHashWithValue(size_t n, SipHash & hash) const +{ + const auto & variant_col = assert_cast(*variant_column); + auto discr = variant_col.globalDiscriminatorAt(n); + if (discr == ColumnVariant::NULL_DISCRIMINATOR) + { + hash.update(discr); + return; + } + + hash.update(variant_info.variant_names[discr]); + variant_col.getVariantByGlobalDiscriminator(discr).updateHashWithValue(variant_col.offsetAt(n), hash); +} + +int ColumnDynamic::compareAt(size_t n, size_t m, const DB::IColumn & rhs, int nan_direction_hint) const +{ + const auto & left_variant = assert_cast(*variant_column); + const auto & right_dynamic = assert_cast(rhs); + const auto & right_variant = assert_cast(*right_dynamic.variant_column); + + auto left_discr = left_variant.globalDiscriminatorAt(n); + auto right_discr = right_variant.globalDiscriminatorAt(m); + + /// Check if we have NULLs and return result based on nan_direction_hint. + if (left_discr == ColumnVariant::NULL_DISCRIMINATOR && right_discr == ColumnVariant::NULL_DISCRIMINATOR) + return 0; + else if (left_discr == ColumnVariant::NULL_DISCRIMINATOR) + return nan_direction_hint; + else if (right_discr == ColumnVariant::NULL_DISCRIMINATOR) + return -nan_direction_hint; + + /// If rows have different types, we compare type names. + if (variant_info.variant_names[left_discr] != right_dynamic.variant_info.variant_names[right_discr]) + return variant_info.variant_names[left_discr] < right_dynamic.variant_info.variant_names[right_discr] ? -1 : 1; + + /// If rows have the same types, compare actual values from corresponding variants. + return left_variant.getVariantByGlobalDiscriminator(left_discr).compareAt(left_variant.offsetAt(n), right_variant.offsetAt(m), right_variant.getVariantByGlobalDiscriminator(right_discr), nan_direction_hint); +} + +ColumnPtr ColumnDynamic::compress() const +{ + ColumnPtr variant_compressed = variant_column->compress(); + size_t byte_size = variant_compressed->byteSize(); + return ColumnCompressed::create(size(), byte_size, + [my_variant_compressed = std::move(variant_compressed), my_variant_info = variant_info, my_max_dynamic_types = max_dynamic_types, my_statistics = statistics]() mutable + { + return ColumnDynamic::create(my_variant_compressed->decompress(), my_variant_info, my_max_dynamic_types, my_statistics); + }); +} + +void ColumnDynamic::takeDynamicStructureFromSourceColumns(const DB::Columns & source_columns) +{ + if (!empty()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "takeDynamicStructureFromSourceColumns should be called only on empty Dynamic column"); + + /// During serialization of Dynamic column in MergeTree all Dynamic columns + /// in single part must have the same structure (the same variants). 
During merge + /// resulting column is constructed by inserting from source columns, + /// but it may happen that resulting column doesn't have rows from all source parts + /// but only from subset of them, and as a result some variants could be missing + /// and structures of resulting column may differ. + /// To solve this problem, before merge we create empty resulting column and use this method + /// to take dynamic structure from all source column even if we won't insert + /// rows from some of them. + + /// We want to construct resulting variant with most frequent variants from source columns and convert the rarest + /// variants to single String variant if we exceed the limit of variants. + /// First, collect all variants from all source columns and calculate total sizes. + std::unordered_map total_sizes; + DataTypes all_variants; + + for (const auto & source_column : source_columns) + { + const auto & source_dynamic = assert_cast(*source_column); + const auto & source_variant_column = source_dynamic.getVariantColumn(); + const auto & source_variant_info = source_dynamic.getVariantInfo(); + const auto & source_variants = assert_cast(*source_variant_info.variant_type).getVariants(); + /// During deserialization from MergeTree we will have variant sizes statistics from the whole data part. + const auto & source_statistics = source_dynamic.getStatistics(); + for (size_t i = 0; i != source_variants.size(); ++i) + { + const auto & variant_name = source_variant_info.variant_names[i]; + auto it = total_sizes.find(variant_name); + /// Add this variant to the list of all variants if we didn't see it yet. + if (it == total_sizes.end()) + { + all_variants.push_back(source_variants[i]); + it = total_sizes.emplace(variant_name, 0).first; + } + + size_t size = source_statistics.data.empty() ? source_variant_column.getVariantByGlobalDiscriminator(i).size() : source_statistics.data.at(variant_name); +// LOG_DEBUG(getLogger("ColumnDynamic"), "Source variant: {}. Variant: {}. Size: {}", source_variant_info.variant_name, variant_name, size); + it->second += size; + } + } + + DataTypePtr result_variant_type; + /// Check if the number of all variants exceeds the limit. + if (all_variants.size() > max_dynamic_types || (all_variants.size() == max_dynamic_types && !total_sizes.contains("String"))) + { + /// Create list of variants with their sizes and sort it. + std::vector> variants_with_sizes; + variants_with_sizes.reserve(all_variants.size()); + for (const auto & variant : all_variants) + { +// LOG_DEBUG(getLogger("ColumnDynamic"), "Variant: {}. Size: {}", variant->getName(), total_sizes[variant->getName()]); + variants_with_sizes.emplace_back(total_sizes[variant->getName()], variant); + } + std::sort(variants_with_sizes.begin(), variants_with_sizes.end(), std::greater()); + + /// Take first max_dynamic_types variants from sorted list. + DataTypes result_variants; + result_variants.reserve(max_dynamic_types); + /// Add String variant in advance. + result_variants.push_back(std::make_shared()); + size_t i = 0; + while (result_variants.size() != max_dynamic_types && i < variants_with_sizes.size()) + { + const auto & variant = variants_with_sizes[i].second; + if (variant->getName() != "String") + result_variants.push_back(variant); + ++i; + } + + result_variant_type = std::make_shared(result_variants); + } + else + { + result_variant_type = std::make_shared(all_variants); + } + + /// Now we have resulting Variant and can fill variant info. 
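+ /// The per-variant sizes collected above are also stored in statistics with Source::MERGE,
+ /// so that later merges of the resulting column can reuse them instead of recounting rows.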
+ variant_info.variant_type = result_variant_type; + variant_info.variant_name = result_variant_type->getName(); + const auto & result_variants = assert_cast(*result_variant_type).getVariants(); + variant_info.variant_names.clear(); + variant_info.variant_names.reserve(result_variants.size()); + variant_info.variant_name_to_discriminator.clear(); + variant_info.variant_name_to_discriminator.reserve(result_variants.size()); + statistics.data.clear(); + statistics.data.reserve(result_variants.size()); + statistics.source = Statistics::Source::MERGE; + for (size_t i = 0; i != result_variants.size(); ++i) + { + auto variant_name = result_variants[i]->getName(); + variant_info.variant_names.push_back(variant_name); + variant_info.variant_name_to_discriminator[variant_name] = i; + statistics.data[variant_name] = total_sizes[variant_name]; + } + + variant_column = variant_info.variant_type->createColumn(); + + /// Now we have the resulting Variant that will be used in all merged columns. + /// Variants can also contain Dynamic columns inside, we should collect + /// all source variants that will be used in the resulting merged column + /// and call takeDynamicStructureFromSourceColumns on all resulting variants. + std::vector variants_source_columns; + variants_source_columns.resize(variant_info.variant_names.size()); + for (const auto & source_column : source_columns) + { + const auto & source_dynamic_column = assert_cast(*source_column); + const auto & source_variant_info = source_dynamic_column.getVariantInfo(); + for (size_t i = 0; i != variant_info.variant_names.size(); ++i) + { + /// Try to find this variant in current source column. + auto it = source_variant_info.variant_name_to_discriminator.find(variant_info.variant_names[i]); + if (it != source_variant_info.variant_name_to_discriminator.end()) + variants_source_columns[i].push_back(source_dynamic_column.getVariantColumn().getVariantPtrByGlobalDiscriminator(it->second)); + } + } + + auto & variant_col = getVariantColumn(); + for (size_t i = 0; i != variant_info.variant_names.size(); ++i) + variant_col.getVariantByGlobalDiscriminator(i).takeDynamicStructureFromSourceColumns(variants_source_columns[i]); +} + +void ColumnDynamic::applyNullMap(const ColumnVector::Container & null_map) +{ + assert_cast(*variant_column).applyNullMap(null_map); +} + +void ColumnDynamic::applyNegatedNullMap(const ColumnVector::Container & null_map) +{ + assert_cast(*variant_column).applyNegatedNullMap(null_map); +} + +} diff --git a/src/Columns/ColumnDynamic.h b/src/Columns/ColumnDynamic.h new file mode 100644 index 00000000000..7487a5aa0db --- /dev/null +++ b/src/Columns/ColumnDynamic.h @@ -0,0 +1,363 @@ +#pragma once + +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; +} + +/** + * Column for storing Dynamic type values. + * Dynamic column allows to insert and store values of any data types inside. + * Inside it stores: + * - Variant column with all inserted values of different types. + * - Information about currently stored variants. + * + * When new values are inserted into Dynamic column, the internal Variant + * type and column are extended if the inserted value has new type. 
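+ * The number of different variants is limited by max_dynamic_types; once this limit is reached,
+ * values of any further new types are converted to String and stored in the String variant.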
+ */ +class ColumnDynamic final : public COWHelper, ColumnDynamic> +{ +public: + struct Statistics + { + enum class Source + { + READ, + MERGE, + }; + + Source source; + std::unordered_map data; + }; + +private: + friend class COWHelper, ColumnDynamic>; + + struct VariantInfo + { + DataTypePtr variant_type; + /// Name of the whole variant to not call getName() every time. + String variant_name; + /// Store names of variants to not call getName() every time on variants. + Names variant_names; + /// Store mapping (variant name) -> (global discriminator). + /// It's used during variant extension. + std::unordered_map variant_name_to_discriminator; + }; + + ColumnDynamic(size_t max_dynamic_types_); + ColumnDynamic(MutableColumnPtr variant_column_, const VariantInfo & variant_info_, size_t max_dynamic_types_, const Statistics & statistics_ = {}); + +public: + /** Create immutable column using immutable arguments. This arguments may be shared with other columns. + * Use IColumn::mutate in order to make mutable column and mutate shared nested columns. + */ + using Base = COWHelper, ColumnDynamic>; + static Ptr create(const ColumnPtr & variant_column_, const VariantInfo & variant_info_, size_t max_dynamic_types_, const Statistics & statistics_ = {}) + { + return ColumnDynamic::create(variant_column_->assumeMutable(), variant_info_, max_dynamic_types_, statistics_); + } + + static MutablePtr create(MutableColumnPtr variant_column_, const VariantInfo & variant_info_, size_t max_dynamic_types_, const Statistics & statistics_ = {}) + { + return Base::create(std::move(variant_column_), variant_info_, max_dynamic_types_, statistics_); + } + + static MutablePtr create(MutableColumnPtr variant_column_, const DataTypePtr & variant_type, size_t max_dynamic_types_, const Statistics & statistics_ = {}); + + static ColumnPtr create(ColumnPtr variant_column_, const DataTypePtr & variant_type, size_t max_dynamic_types_, const Statistics & statistics_ = {}) + { + return create(variant_column_->assumeMutable(), variant_type, max_dynamic_types_, statistics_); + } + + static MutablePtr create(size_t max_dynamic_types_) + { + return Base::create(max_dynamic_types_); + } + + std::string getName() const override { return "Dynamic(max_types=" + std::to_string(max_dynamic_types) + ")"; } + + const char * getFamilyName() const override + { + return "Dynamic"; + } + + TypeIndex getDataType() const override + { + return TypeIndex::Dynamic; + } + + MutableColumnPtr cloneEmpty() const override + { + /// Keep current dynamic structure. 
+ return Base::create(variant_column->cloneEmpty(), variant_info, max_dynamic_types, statistics); + } + + MutableColumnPtr cloneResized(size_t size) const override + { + return Base::create(variant_column->cloneResized(size), variant_info, max_dynamic_types, statistics); + } + + size_t size() const override + { + return variant_column->size(); + } + + Field operator[](size_t n) const override + { + return (*variant_column)[n]; + } + + void get(size_t n, Field & res) const override + { + variant_column->get(n, res); + } + + bool isDefaultAt(size_t n) const override + { + return variant_column->isDefaultAt(n); + } + + bool isNullAt(size_t n) const override + { + return variant_column->isNullAt(n); + } + + StringRef getDataAt(size_t n) const override + { + return variant_column->getDataAt(n); + } + + void insertData(const char * pos, size_t length) override + { + return variant_column->insertData(pos, length); + } + + void insert(const Field & x) override; + bool tryInsert(const Field & x) override; + void insertFrom(const IColumn & src_, size_t n) override; + void insertRangeFrom(const IColumn & src, size_t start, size_t length) override; + void insertManyFrom(const IColumn & src, size_t position, size_t length) override; + + void insertDefault() override + { + variant_column->insertDefault(); + } + + void insertManyDefaults(size_t length) override + { + variant_column->insertManyDefaults(length); + } + + void popBack(size_t n) override + { + variant_column->popBack(n); + } + + StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const override; + const char * deserializeAndInsertFromArena(const char * pos) override; + const char * skipSerializedInArena(const char * pos) const override; + + void updateHashWithValue(size_t n, SipHash & hash) const override; + + void updateWeakHash32(WeakHash32 & hash) const override + { + variant_column->updateWeakHash32(hash); + } + + void updateHashFast(SipHash & hash) const override + { + variant_column->updateHashFast(hash); + } + + ColumnPtr filter(const Filter & filt, ssize_t result_size_hint) const override + { + return create(variant_column->filter(filt, result_size_hint), variant_info, max_dynamic_types); + } + + void expand(const Filter & mask, bool inverted) override + { + variant_column->expand(mask, inverted); + } + + ColumnPtr permute(const Permutation & perm, size_t limit) const override + { + return create(variant_column->permute(perm, limit), variant_info, max_dynamic_types); + } + + ColumnPtr index(const IColumn & indexes, size_t limit) const override + { + return create(variant_column->index(indexes, limit), variant_info, max_dynamic_types); + } + + ColumnPtr replicate(const Offsets & replicate_offsets) const override + { + return create(variant_column->replicate(replicate_offsets), variant_info, max_dynamic_types); + } + + MutableColumns scatter(ColumnIndex num_columns, const Selector & selector) const override + { + MutableColumns scattered_variant_columns = variant_column->scatter(num_columns, selector); + MutableColumns scattered_columns; + scattered_columns.reserve(num_columns); + for (auto & scattered_variant_column : scattered_variant_columns) + scattered_columns.emplace_back(create(std::move(scattered_variant_column), variant_info, max_dynamic_types)); + + return scattered_columns; + } + + int compareAt(size_t n, size_t m, const IColumn & rhs, int nan_direction_hint) const override; + + bool hasEqualValues() const override + { + return variant_column->hasEqualValues(); + } + + void getExtremes(Field & min, 
Field & max) const override + { + variant_column->getExtremes(min, max); + } + + void getPermutation(IColumn::PermutationSortDirection direction, IColumn::PermutationSortStability stability, + size_t limit, int nan_direction_hint, IColumn::Permutation & res) const override + { + variant_column->getPermutation(direction, stability, limit, nan_direction_hint, res); + } + + void updatePermutation(IColumn::PermutationSortDirection direction, IColumn::PermutationSortStability stability, + size_t limit, int nan_direction_hint, IColumn::Permutation & res, EqualRanges & equal_ranges) const override + { + variant_column->updatePermutation(direction, stability, limit, nan_direction_hint, res, equal_ranges); + } + + void reserve(size_t n) override + { + variant_column->reserve(n); + } + + void ensureOwnership() override + { + variant_column->ensureOwnership(); + } + + size_t byteSize() const override + { + return variant_column->byteSize(); + } + + size_t byteSizeAt(size_t n) const override + { + return variant_column->byteSizeAt(n); + } + + size_t allocatedBytes() const override + { + return variant_column->allocatedBytes(); + } + + void protect() override + { + variant_column->protect(); + } + + void forEachSubcolumn(MutableColumnCallback callback) override + { + callback(variant_column); + } + + void forEachSubcolumnRecursively(RecursiveMutableColumnCallback callback) override + { + callback(*variant_column); + variant_column->forEachSubcolumnRecursively(callback); + } + + bool structureEquals(const IColumn & rhs) const override + { + if (const auto * rhs_concrete = typeid_cast(&rhs)) + return max_dynamic_types == rhs_concrete->max_dynamic_types; + return false; + } + + ColumnPtr compress() const override; + + double getRatioOfDefaultRows(double sample_ratio) const override + { + return variant_column->getRatioOfDefaultRows(sample_ratio); + } + + UInt64 getNumberOfDefaultRows() const override + { + return variant_column->getNumberOfDefaultRows(); + } + + void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override + { + variant_column->getIndicesOfNonDefaultRows(indices, from, limit); + } + + void finalize() override + { + variant_column->finalize(); + } + + bool isFinalized() const override + { + return variant_column->isFinalized(); + } + + /// Apply null map to a nested Variant column. + void applyNullMap(const ColumnVector::Container & null_map); + void applyNegatedNullMap(const ColumnVector::Container & null_map); + + const VariantInfo & getVariantInfo() const { return variant_info; } + + const ColumnPtr & getVariantColumnPtr() const { return variant_column; } + ColumnPtr & getVariantColumnPtr() { return variant_column; } + + const ColumnVariant & getVariantColumn() const { return assert_cast(*variant_column); } + ColumnVariant & getVariantColumn() { return assert_cast(*variant_column); } + + bool addNewVariant(const DataTypePtr & new_variant); + void addStringVariant(); + + bool hasDynamicStructure() const override { return true; } + void takeDynamicStructureFromSourceColumns(const Columns & source_columns) override; + + const Statistics & getStatistics() const { return statistics; } + + size_t getMaxDynamicTypes() const { return max_dynamic_types; } + +private: + /// Combine current variant with the other variant and return global discriminators mapping + /// from other variant to the combined one. It's used for inserting from + /// different variants. + /// Returns nullptr if maximum number of Variants is reached and tne new Variant cannot be created. 
+ std::vector * combineVariants(const VariantInfo & other_variant_info); + + void updateVariantInfoAndExpandVariantColumn(const DataTypePtr & new_variant_type); + + WrappedPtr variant_column; + /// Store the type of current variant with some additional information. + VariantInfo variant_info; + /// Maximum number of different types that can be stored in Dynamic. + /// If exceeded, all new variants will be converted to String. + size_t max_dynamic_types; + + /// Size statistics of each variants from MergeTree data part. + /// Used in takeDynamicStructureFromSourceColumns and set during deserialization. + Statistics statistics; + + std::unordered_map> variant_mappings_cache; + std::unordered_set variants_with_failed_combination; +}; + +} diff --git a/src/Columns/ColumnMap.cpp b/src/Columns/ColumnMap.cpp index 57e8ba685b4..48e8bced23a 100644 --- a/src/Columns/ColumnMap.cpp +++ b/src/Columns/ColumnMap.cpp @@ -312,4 +312,13 @@ ColumnPtr ColumnMap::compress() const }); } +void ColumnMap::takeDynamicStructureFromSourceColumns(const DB::Columns & source_columns) +{ + Columns nested_source_columns; + nested_source_columns.reserve(source_columns.size()); + for (const auto & source_column : source_columns) + nested_source_columns.push_back(assert_cast(*source_column).getNestedColumnPtr()); + nested->takeDynamicStructureFromSourceColumns(nested_source_columns); +} + } diff --git a/src/Columns/ColumnMap.h b/src/Columns/ColumnMap.h index 60aa69e7bf6..52165d0d74e 100644 --- a/src/Columns/ColumnMap.h +++ b/src/Columns/ColumnMap.h @@ -104,6 +104,9 @@ public: ColumnTuple & getNestedData() { return assert_cast(getNestedColumn().getData()); } ColumnPtr compress() const override; + + bool hasDynamicStructure() const override { return nested->hasDynamicStructure(); } + void takeDynamicStructureFromSourceColumns(const Columns & source_columns) override; }; } diff --git a/src/Columns/ColumnNullable.cpp b/src/Columns/ColumnNullable.cpp index fa5fdfb8c21..4474816601e 100644 --- a/src/Columns/ColumnNullable.cpp +++ b/src/Columns/ColumnNullable.cpp @@ -835,6 +835,15 @@ ColumnPtr ColumnNullable::getNestedColumnWithDefaultOnNull() const return res; } +void ColumnNullable::takeDynamicStructureFromSourceColumns(const DB::Columns & source_columns) +{ + Columns nested_source_columns; + nested_source_columns.reserve(source_columns.size()); + for (const auto & source_column : source_columns) + nested_source_columns.push_back(assert_cast(*source_column).getNestedColumnPtr()); + nested_column->takeDynamicStructureFromSourceColumns(nested_source_columns); +} + ColumnPtr makeNullable(const ColumnPtr & column) { if (isColumnNullable(*column)) diff --git a/src/Columns/ColumnNullable.h b/src/Columns/ColumnNullable.h index ef4bf4fa41b..73bd75527f8 100644 --- a/src/Columns/ColumnNullable.h +++ b/src/Columns/ColumnNullable.h @@ -186,6 +186,9 @@ public: /// Check that size of null map equals to size of nested column. 
void checkConsistency() const; + bool hasDynamicStructure() const override { return nested_column->hasDynamicStructure(); } + void takeDynamicStructureFromSourceColumns(const Columns & source_columns) override; + private: WrappedPtr nested_column; WrappedPtr null_map; diff --git a/src/Columns/ColumnSparse.cpp b/src/Columns/ColumnSparse.cpp index b9a173fd92c..4acd162e52f 100644 --- a/src/Columns/ColumnSparse.cpp +++ b/src/Columns/ColumnSparse.cpp @@ -801,6 +801,15 @@ ColumnSparse::Iterator ColumnSparse::getIterator(size_t n) const return Iterator(offsets_data, _size, current_offset, n); } +void ColumnSparse::takeDynamicStructureFromSourceColumns(const DB::Columns & source_columns) +{ + Columns values_source_columns; + values_source_columns.reserve(source_columns.size()); + for (const auto & source_column : source_columns) + values_source_columns.push_back(assert_cast(*source_column).getValuesPtr()); + values->takeDynamicStructureFromSourceColumns(values_source_columns); +} + ColumnPtr recursiveRemoveSparse(const ColumnPtr & column) { if (!column) diff --git a/src/Columns/ColumnSparse.h b/src/Columns/ColumnSparse.h index c1bd614102c..7d3200da35f 100644 --- a/src/Columns/ColumnSparse.h +++ b/src/Columns/ColumnSparse.h @@ -148,6 +148,9 @@ public: size_t sizeOfValueIfFixed() const override { return values->sizeOfValueIfFixed() + values->sizeOfValueIfFixed(); } bool isCollationSupported() const override { return values->isCollationSupported(); } + bool hasDynamicStructure() const override { return values->hasDynamicStructure(); } + void takeDynamicStructureFromSourceColumns(const Columns & source_columns) override; + size_t getNumberOfTrailingDefaults() const { return offsets->empty() ? _size : _size - getOffsetsData().back() - 1; diff --git a/src/Columns/ColumnTuple.cpp b/src/Columns/ColumnTuple.cpp index 062bdadf9d2..4e8e4063157 100644 --- a/src/Columns/ColumnTuple.cpp +++ b/src/Columns/ColumnTuple.cpp @@ -572,6 +572,34 @@ bool ColumnTuple::isCollationSupported() const return false; } +bool ColumnTuple::hasDynamicStructure() const +{ + for (const auto & column : columns) + { + if (column->hasDynamicStructure()) + return true; + } + return false; +} + +void ColumnTuple::takeDynamicStructureFromSourceColumns(const DB::Columns & source_columns) +{ + std::vector nested_source_columns; + nested_source_columns.resize(columns.size()); + for (size_t i = 0; i != columns.size(); ++i) + nested_source_columns[i].reserve(source_columns.size()); + + for (const auto & source_column : source_columns) + { + const auto & nsource_columns = assert_cast(*source_column).getColumns(); + for (size_t i = 0; i != nsource_columns.size(); ++i) + nested_source_columns[i].push_back(nsource_columns[i]); + } + + for (size_t i = 0; i != columns.size(); ++i) + columns[i]->takeDynamicStructureFromSourceColumns(nested_source_columns[i]); +} + ColumnPtr ColumnTuple::compress() const { diff --git a/src/Columns/ColumnTuple.h b/src/Columns/ColumnTuple.h index 5b626155754..65103fa8c49 100644 --- a/src/Columns/ColumnTuple.h +++ b/src/Columns/ColumnTuple.h @@ -114,6 +114,9 @@ public: const ColumnPtr & getColumnPtr(size_t idx) const { return columns[idx]; } ColumnPtr & getColumnPtr(size_t idx) { return columns[idx]; } + bool hasDynamicStructure() const override; + void takeDynamicStructureFromSourceColumns(const Columns & source_columns) override; + private: int compareAtImpl(size_t n, size_t m, const IColumn & rhs, int nan_direction_hint, const Collator * collator=nullptr) const; diff --git a/src/Columns/ColumnVariant.cpp 
b/src/Columns/ColumnVariant.cpp index 31e9b0964f4..819491f7fd9 100644 --- a/src/Columns/ColumnVariant.cpp +++ b/src/Columns/ColumnVariant.cpp @@ -12,7 +12,6 @@ #include #include #include -#include #include @@ -452,16 +451,18 @@ bool ColumnVariant::tryInsert(const DB::Field & x) return false; } -void ColumnVariant::insertFrom(const IColumn & src_, size_t n) +void ColumnVariant::insertFromImpl(const DB::IColumn & src_, size_t n, const std::vector * global_discriminators_mapping) { + const size_t num_variants = variants.size(); const ColumnVariant & src = assert_cast(src_); - const size_t num_variants = variants.size(); - if (src.variants.size() != num_variants) + if (!global_discriminators_mapping && src.variants.size() != num_variants) throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot insert value of Variant type with different number of types"); - /// Remember that src column can have different local variants order. - Discriminator global_discr = src.globalDiscriminatorAt(n); + Discriminator src_global_discr = src.globalDiscriminatorAt(n); + Discriminator global_discr = src_global_discr; + if (global_discriminators_mapping && src_global_discr != NULL_DISCRIMINATOR) + global_discr = (*global_discriminators_mapping)[src_global_discr]; Discriminator local_discr = localDiscriminatorByGlobal(global_discr); getLocalDiscriminators().push_back(local_discr); if (local_discr == NULL_DISCRIMINATOR) @@ -471,25 +472,15 @@ void ColumnVariant::insertFrom(const IColumn & src_, size_t n) else { getOffsets().push_back(variants[local_discr]->size()); - variants[local_discr]->insertFrom(src.getVariantByGlobalDiscriminator(global_discr), src.offsetAt(n)); + variants[local_discr]->insertFrom(src.getVariantByGlobalDiscriminator(src_global_discr), src.offsetAt(n)); } } -void ColumnVariant::insertIntoVariant(const DB::Field & x, Discriminator global_discr) -{ - if (global_discr > variants.size()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Invalid global discriminator: {}. The number of variants is {}", size_t(global_discr), variants.size()); - auto & variant = getVariantByGlobalDiscriminator(global_discr); - variant.insert(x); - getLocalDiscriminators().push_back(localDiscriminatorByGlobal(global_discr)); - getOffsets().push_back(variant.size() - 1); -} - -void ColumnVariant::insertRangeFrom(const IColumn & src_, size_t start, size_t length) +void ColumnVariant::insertRangeFromImpl(const DB::IColumn & src_, size_t start, size_t length, const std::vector * global_discriminators_mapping) { const size_t num_variants = variants.size(); const auto & src = assert_cast(src_); - if (src.variants.size() != num_variants) + if (!global_discriminators_mapping && src.variants.size() != num_variants) throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot insert value of Variant type with different number of types"); if (start + length > src.getLocalDiscriminators().size()) @@ -507,7 +498,12 @@ void ColumnVariant::insertRangeFrom(const IColumn & src_, size_t start, size_t l /// In this case we can simply call insertRangeFrom on this single variant. 
if (auto non_empty_src_local_discr = src.getLocalDiscriminatorOfOneNoneEmptyVariantNoNulls()) { - auto local_discr = localDiscriminatorByGlobal(src.globalDiscriminatorByLocal(*non_empty_src_local_discr)); + Discriminator src_global_discr = src.globalDiscriminatorByLocal(*non_empty_src_local_discr); + Discriminator global_discr = src_global_discr; + if (global_discriminators_mapping && src_global_discr != NULL_DISCRIMINATOR) + global_discr = (*global_discriminators_mapping)[src_global_discr]; + + Discriminator local_discr = localDiscriminatorByGlobal(global_discr); size_t offset = variants[local_discr]->size(); variants[local_discr]->insertRangeFrom(*src.variants[*non_empty_src_local_discr], start, length); getLocalDiscriminators().resize_fill(local_discriminators->size() + length, local_discr); @@ -522,7 +518,7 @@ void ColumnVariant::insertRangeFrom(const IColumn & src_, size_t start, size_t l /// collect ranges we need to insert for all variants and update offsets. /// nested_ranges[i].first - offset in src.variants[i] /// nested_ranges[i].second - length in src.variants[i] - std::vector> nested_ranges(num_variants, {0, 0}); + std::vector> nested_ranges(src.variants.size(), {0, 0}); auto & offsets_data = getOffsets(); offsets_data.reserve(offsets_data.size() + length); auto & local_discriminators_data = getLocalDiscriminators(); @@ -533,7 +529,11 @@ void ColumnVariant::insertRangeFrom(const IColumn & src_, size_t start, size_t l { /// We insert from src.variants[src_local_discr] to variants[local_discr] Discriminator src_local_discr = src_local_discriminators_data[i]; - Discriminator local_discr = localDiscriminatorByGlobal(src.globalDiscriminatorByLocal(src_local_discr)); + Discriminator src_global_discr = src.globalDiscriminatorByLocal(src_local_discr); + Discriminator global_discr = src_global_discr; + if (global_discriminators_mapping && src_global_discr != NULL_DISCRIMINATOR) + global_discr = (*global_discriminators_mapping)[src_global_discr]; + Discriminator local_discr = localDiscriminatorByGlobal(global_discr); local_discriminators_data.push_back(local_discr); if (local_discr == NULL_DISCRIMINATOR) { @@ -553,22 +553,29 @@ void ColumnVariant::insertRangeFrom(const IColumn & src_, size_t start, size_t l for (size_t src_local_discr = 0; src_local_discr != nested_ranges.size(); ++src_local_discr) { auto [nested_start, nested_length] = nested_ranges[src_local_discr]; - auto local_discr = localDiscriminatorByGlobal(src.globalDiscriminatorByLocal(src_local_discr)); + Discriminator src_global_discr = src.globalDiscriminatorByLocal(src_local_discr); + Discriminator global_discr = src_global_discr; + if (global_discriminators_mapping && src_global_discr != NULL_DISCRIMINATOR) + global_discr = (*global_discriminators_mapping)[src_global_discr]; + Discriminator local_discr = localDiscriminatorByGlobal(global_discr); if (nested_length) variants[local_discr]->insertRangeFrom(*src.variants[src_local_discr], nested_start, nested_length); } } -void ColumnVariant::insertManyFrom(const DB::IColumn & src_, size_t position, size_t length) +void ColumnVariant::insertManyFromImpl(const DB::IColumn & src_, size_t position, size_t length, const std::vector * global_discriminators_mapping) { const size_t num_variants = variants.size(); const auto & src = assert_cast(src_); - if (src.variants.size() != num_variants) + if (!global_discriminators_mapping && src.variants.size() != num_variants) throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot insert value of Variant type with different number of types"); - 
/// Remember that src column can have different local variants order. Discriminator src_local_discr = src.localDiscriminatorAt(position); - Discriminator local_discr = localDiscriminatorByGlobal(src.globalDiscriminatorByLocal(src_local_discr)); + Discriminator src_global_discr = src.globalDiscriminatorByLocal(src_local_discr); + Discriminator global_discr = src_global_discr; + if (global_discriminators_mapping && src_global_discr != NULL_DISCRIMINATOR) + global_discr = (*global_discriminators_mapping)[src_global_discr]; + Discriminator local_discr = localDiscriminatorByGlobal(global_discr); auto & local_discriminators_data = getLocalDiscriminators(); local_discriminators_data.resize_fill(local_discriminators_data.size() + length, local_discr); @@ -588,6 +595,72 @@ void ColumnVariant::insertManyFrom(const DB::IColumn & src_, size_t position, si } } +void ColumnVariant::insertFrom(const IColumn & src_, size_t n) +{ + insertFromImpl(src_, n, nullptr); +} + +void ColumnVariant::insertRangeFrom(const IColumn & src_, size_t start, size_t length) +{ + insertRangeFromImpl(src_, start, length, nullptr); +} + +void ColumnVariant::insertManyFrom(const DB::IColumn & src_, size_t position, size_t length) +{ + insertManyFromImpl(src_, position, length, nullptr); +} + +void ColumnVariant::insertFrom(const DB::IColumn & src_, size_t n, const std::vector & global_discriminators_mapping) +{ + insertFromImpl(src_, n, &global_discriminators_mapping); +} + +void ColumnVariant::insertRangeFrom(const IColumn & src_, size_t start, size_t length, const std::vector & global_discriminators_mapping) +{ + insertRangeFromImpl(src_, start, length, &global_discriminators_mapping); +} + +void ColumnVariant::insertManyFrom(const DB::IColumn & src_, size_t position, size_t length, const std::vector & global_discriminators_mapping) +{ + insertManyFromImpl(src_, position, length, &global_discriminators_mapping); +} + +void ColumnVariant::insertIntoVariantFrom(DB::ColumnVariant::Discriminator global_discr, const DB::IColumn & src_, size_t n) +{ + Discriminator local_discr = localDiscriminatorByGlobal(global_discr); + getLocalDiscriminators().push_back(local_discr); + getOffsets().push_back(variants[local_discr]->size()); + variants[local_discr]->insertFrom(src_, n); +} + +void ColumnVariant::insertRangeIntoVariantFrom(DB::ColumnVariant::Discriminator global_discr, const DB::IColumn & src_, size_t start, size_t length) +{ + Discriminator local_discr = localDiscriminatorByGlobal(global_discr); + auto & local_discriminators_data = getLocalDiscriminators(); + local_discriminators_data.resize_fill(local_discriminators_data.size() + length, local_discr); + auto & offsets_data = getOffsets(); + size_t offset = variants[local_discr]->size(); + offsets_data.reserve(offsets_data.size() + length); + for (size_t i = 0; i != length; ++i) + offsets_data.push_back(offset + i); + + variants[local_discr]->insertRangeFrom(src_, start, length); +} + +void ColumnVariant::insertManyIntoVariantFrom(DB::ColumnVariant::Discriminator global_discr, const DB::IColumn & src_, size_t position, size_t length) +{ + Discriminator local_discr = localDiscriminatorByGlobal(global_discr); + auto & local_discriminators_data = getLocalDiscriminators(); + local_discriminators_data.resize_fill(local_discriminators_data.size() + length, local_discr); + auto & offsets_data = getOffsets(); + size_t offset = variants[local_discr]->size(); + offsets_data.reserve(offsets_data.size() + length); + for (size_t i = 0; i != length; ++i) + offsets_data.push_back(offset + i); + 
+ variants[local_discr]->insertManyFrom(src_, position, length); +} + void ColumnVariant::insertDefault() { getLocalDiscriminators().push_back(NULL_DISCRIMINATOR); @@ -678,6 +751,14 @@ const char * ColumnVariant::deserializeAndInsertFromArena(const char * pos) return variants[local_discr]->deserializeAndInsertFromArena(pos); } +const char * ColumnVariant::deserializeVariantAndInsertFromArena(DB::ColumnVariant::Discriminator global_discr, const char * pos) +{ + Discriminator local_discr = localDiscriminatorByGlobal(global_discr); + getLocalDiscriminators().push_back(local_discr); + getOffsets().push_back(variants[local_discr]->size()); + return variants[local_discr]->deserializeAndInsertFromArena(pos); +} + const char * ColumnVariant::skipSerializedInArena(const char * pos) const { Discriminator global_discr = unalignedLoad(pos); @@ -1426,4 +1507,54 @@ void ColumnVariant::applyNullMapImpl(const ColumnVector::Container & null } } +void ColumnVariant::extend(const std::vector & old_to_new_global_discriminators, std::vector> && new_variants_and_discriminators) +{ + /// Update global discriminators for current variants. + for (Discriminator & global_discr : local_to_global_discriminators) + global_discr = old_to_new_global_discriminators[global_discr]; + + /// Add new variants. + variants.reserve(variants.size() + new_variants_and_discriminators.size()); + local_to_global_discriminators.reserve(local_to_global_discriminators.size() + new_variants_and_discriminators.size()); + for (auto & new_variant_and_discriminator : new_variants_and_discriminators) + { + variants.emplace_back(std::move(new_variant_and_discriminator.first)); + local_to_global_discriminators.push_back(new_variant_and_discriminator.second); + } + + /// Update global -> local discriminators matching. 
+ global_to_local_discriminators.resize(local_to_global_discriminators.size()); + for (Discriminator local_discr = 0; local_discr != local_to_global_discriminators.size(); ++local_discr) + global_to_local_discriminators[local_to_global_discriminators[local_discr]] = local_discr; +} + +bool ColumnVariant::hasDynamicStructure() const +{ + for (const auto & variant : variants) + { + if (variant->hasDynamicStructure()) + return true; + } + + return false; +} + +void ColumnVariant::takeDynamicStructureFromSourceColumns(const DB::Columns & source_columns) +{ + std::vector variants_source_columns; + variants_source_columns.resize(variants.size()); + for (size_t i = 0; i != variants.size(); ++i) + variants_source_columns[i].reserve(source_columns.size()); + + for (const auto & source_column : source_columns) + { + const auto & source_variants = assert_cast(*source_column).variants; + for (size_t i = 0; i != source_variants.size(); ++i) + variants_source_columns[i].push_back(source_variants[i]); + } + + for (size_t i = 0; i != variants.size(); ++i) + variants[i]->takeDynamicStructureFromSourceColumns(variants_source_columns[i]); +} + } diff --git a/src/Columns/ColumnVariant.h b/src/Columns/ColumnVariant.h index 4aa2c9058cc..8f703ea17d9 100644 --- a/src/Columns/ColumnVariant.h +++ b/src/Columns/ColumnVariant.h @@ -175,18 +175,32 @@ public: bool isDefaultAt(size_t n) const override; bool isNullAt(size_t n) const override; StringRef getDataAt(size_t n) const override; + void insertData(const char * pos, size_t length) override; void insert(const Field & x) override; bool tryInsert(const Field & x) override; - void insertIntoVariant(const Field & x, Discriminator global_discr); + void insertFrom(const IColumn & src_, size_t n) override; - void insertRangeFrom(const IColumn & src, size_t start, size_t length) override; - void insertManyFrom(const IColumn & src, size_t position, size_t length) override; + void insertRangeFrom(const IColumn & src_, size_t start, size_t length) override; + void insertManyFrom(const IColumn & src_, size_t position, size_t length) override; + + /// Methods for insertion from another Variant but with known mapping between global discriminators. + void insertFrom(const IColumn & src_, size_t n, const std::vector & global_discriminators_mapping); + void insertRangeFrom(const IColumn & src_, size_t start, size_t length, const std::vector & global_discriminators_mapping); + void insertManyFrom(const IColumn & src_, size_t position, size_t length, const std::vector & global_discriminators_mapping); + + /// Methods for insertrion into a specific variant. 
+ void insertIntoVariantFrom(Discriminator global_discr, const IColumn & src_, size_t n); + void insertRangeIntoVariantFrom(Discriminator global_discr, const IColumn & src_, size_t start, size_t length); + void insertManyIntoVariantFrom(Discriminator global_discr, const IColumn & src_, size_t position, size_t length); + void insertDefault() override; void insertManyDefaults(size_t length) override; + void popBack(size_t n) override; StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const override; const char * deserializeAndInsertFromArena(const char * pos) override; + const char * deserializeVariantAndInsertFromArena(Discriminator global_discr, const char * pos); const char * skipSerializedInArena(const char * pos) const override; void updateHashWithValue(size_t n, SipHash & hash) const override; void updateWeakHash32(WeakHash32 & hash) const override; @@ -234,6 +248,8 @@ public: ColumnPtr & getVariantPtrByLocalDiscriminator(size_t discr) { return variants[discr]; } ColumnPtr & getVariantPtrByGlobalDiscriminator(size_t discr) { return variants[global_to_local_discriminators.at(discr)]; } + const NestedColumns & getVariants() const { return variants; } + const IColumn & getLocalDiscriminatorsColumn() const { return *local_discriminators; } IColumn & getLocalDiscriminatorsColumn() { return *local_discriminators; } @@ -282,7 +298,19 @@ public: void applyNullMap(const ColumnVector::Container & null_map); void applyNegatedNullMap(const ColumnVector::Container & null_map); + /// Extend current column with new variants. Change global discriminators of current variants to the new + /// according to the mapping and add new variants with new global discriminators. + /// This extension doesn't rewrite any data, just adds new empty variants and modifies global/local discriminators matching. + void extend(const std::vector & old_to_new_global_discriminators, std::vector> && new_variants_and_discriminators); + + bool hasDynamicStructure() const override; + void takeDynamicStructureFromSourceColumns(const Columns & source_columns) override; + private: + void insertFromImpl(const IColumn & src_, size_t n, const std::vector * global_discriminators_mapping); + void insertRangeFromImpl(const IColumn & src_, size_t start, size_t length, const std::vector * global_discriminators_mapping); + void insertManyFromImpl(const IColumn & src_, size_t position, size_t length, const std::vector * global_discriminators_mapping); + void initIdentityGlobalToLocalDiscriminatorsMapping(); template diff --git a/src/Columns/IColumn.cpp b/src/Columns/IColumn.cpp index 18974e49760..479fd7de1bc 100644 --- a/src/Columns/IColumn.cpp +++ b/src/Columns/IColumn.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -461,6 +462,7 @@ template class IColumnHelper; template class IColumnHelper; template class IColumnHelper; template class IColumnHelper; +template class IColumnHelper; template class IColumnHelper; diff --git a/src/Columns/IColumn.h b/src/Columns/IColumn.h index cea8d7c9f55..33f398474ed 100644 --- a/src/Columns/IColumn.h +++ b/src/Columns/IColumn.h @@ -534,6 +534,8 @@ public: return res; } + virtual bool hasDynamicStructure() const { return false; } + virtual void takeDynamicStructureFromSourceColumns(const std::vector & /*source_columns*/) {} /** Some columns can contain another columns inside. * So, we have a tree of columns. But not all combinations are possible. 
diff --git a/src/Columns/tests/gtest_column_dynamic.cpp b/src/Columns/tests/gtest_column_dynamic.cpp new file mode 100644 index 00000000000..4c209f7d8a9 --- /dev/null +++ b/src/Columns/tests/gtest_column_dynamic.cpp @@ -0,0 +1,652 @@ +#include +#include +#include +#include + +using namespace DB; + +TEST(ColumnDynamic, CreateEmpty) +{ + auto column = ColumnDynamic::create(255); + ASSERT_TRUE(column->empty()); + ASSERT_EQ(column->getVariantInfo().variant_type->getName(), "Variant()"); + ASSERT_TRUE(column->getVariantInfo().variant_names.empty()); + ASSERT_TRUE(column->getVariantInfo().variant_name_to_discriminator.empty()); +} + +TEST(ColumnDynamic, InsertDefault) +{ + auto column = ColumnDynamic::create(255); + column->insertDefault(); + ASSERT_TRUE(column->size() == 1); + ASSERT_EQ(column->getVariantInfo().variant_type->getName(), "Variant()"); + ASSERT_TRUE(column->getVariantInfo().variant_names.empty()); + ASSERT_TRUE(column->getVariantInfo().variant_name_to_discriminator.empty()); + ASSERT_TRUE(column->isNullAt(0)); + ASSERT_EQ((*column)[0], Field(Null())); +} + +TEST(ColumnDynamic, InsertFields) +{ + auto column = ColumnDynamic::create(255); + column->insert(Field(42)); + column->insert(Field(-42)); + column->insert(Field("str1")); + column->insert(Field(Null())); + column->insert(Field(42.42)); + column->insert(Field(43)); + column->insert(Field(-43)); + column->insert(Field("str2")); + column->insert(Field(Null())); + column->insert(Field(43.43)); + ASSERT_TRUE(column->size() == 10); + + ASSERT_EQ(column->getVariantInfo().variant_type->getName(), "Variant(Float64, Int8, String)"); + std::vector expected_names = {"Float64", "Int8", "String"}; + ASSERT_EQ(column->getVariantInfo().variant_names, expected_names); + std::unordered_map expected_variant_name_to_discriminator = {{"Float64", 0}, {"Int8", 1}, {"String", 2}}; + ASSERT_TRUE(column->getVariantInfo().variant_name_to_discriminator == expected_variant_name_to_discriminator); +} + +ColumnDynamic::MutablePtr getDynamicWithManyVariants(size_t num_variants, Field tuple_element = Field(42)) +{ + auto column = ColumnDynamic::create(255); + for (size_t i = 0; i != num_variants; ++i) + { + Tuple tuple; + for (size_t j = 0; j != i + 1; ++j) + tuple.push_back(tuple_element); + column->insert(tuple); + } + + return column; +} + +TEST(ColumnDynamic, InsertFieldsOverflow1) +{ + auto column = getDynamicWithManyVariants(253); + + ASSERT_EQ(column->getVariantInfo().variant_names.size(), 253); + + column->insert(Field(42.42)); + ASSERT_EQ(column->getVariantInfo().variant_names.size(), 254); + ASSERT_TRUE(column->getVariantInfo().variant_name_to_discriminator.contains("Float64")); + + column->insert(Field(42)); + ASSERT_EQ(column->getVariantInfo().variant_names.size(), 255); + ASSERT_FALSE(column->getVariantInfo().variant_name_to_discriminator.contains("Int8")); + ASSERT_TRUE(column->getVariantInfo().variant_name_to_discriminator.contains("String")); + Field field = (*column)[column->size() - 1]; + ASSERT_EQ(field, "42"); + + column->insert(Field(43)); + ASSERT_EQ(column->getVariantInfo().variant_names.size(), 255); + ASSERT_FALSE(column->getVariantInfo().variant_name_to_discriminator.contains("Int8")); + ASSERT_TRUE(column->getVariantInfo().variant_name_to_discriminator.contains("String")); + field = (*column)[column->size() - 1]; + ASSERT_EQ(field, "43"); + + column->insert(Field("str1")); + ASSERT_EQ(column->getVariantInfo().variant_names.size(), 255); + ASSERT_FALSE(column->getVariantInfo().variant_name_to_discriminator.contains("Int8")); + 
ASSERT_TRUE(column->getVariantInfo().variant_name_to_discriminator.contains("String")); + field = (*column)[column->size() - 1]; + ASSERT_EQ(field, "str1"); + + column->insert(Field(Array({Field(42), Field(43)}))); + ASSERT_EQ(column->getVariantInfo().variant_names.size(), 255); + ASSERT_FALSE(column->getVariantInfo().variant_name_to_discriminator.contains("Array(Int8)")); + ASSERT_TRUE(column->getVariantInfo().variant_name_to_discriminator.contains("String")); + field = (*column)[column->size() - 1]; + ASSERT_EQ(field, "[42, 43]"); +} + +TEST(ColumnDynamic, InsertFieldsOverflow2) +{ + auto column = getDynamicWithManyVariants(254); + ASSERT_EQ(column->getVariantInfo().variant_names.size(), 254); + + column->insert(Field("str1")); + ASSERT_EQ(column->getVariantInfo().variant_names.size(), 255); + ASSERT_TRUE(column->getVariantInfo().variant_name_to_discriminator.contains("String")); + + column->insert(Field(42)); + ASSERT_EQ(column->getVariantInfo().variant_names.size(), 255); + ASSERT_FALSE(column->getVariantInfo().variant_name_to_discriminator.contains("Int8")); + ASSERT_TRUE(column->getVariantInfo().variant_name_to_discriminator.contains("String")); + Field field = (*column)[column->size() - 1]; + ASSERT_EQ(field, "42"); +} + +ColumnDynamic::MutablePtr getInsertFromColumn(size_t num = 1) +{ + auto column_from = ColumnDynamic::create(255); + for (size_t i = 0; i != num; ++i) + { + column_from->insert(Field(42)); + column_from->insert(Field(42.42)); + column_from->insert(Field("str")); + } + return column_from; +} + +void checkInsertFrom(const ColumnDynamic::MutablePtr & column_from, ColumnDynamic::MutablePtr & column_to, const std::string & expected_variant, const std::vector & expected_names, const std::unordered_map & expected_variant_name_to_discriminator) +{ + column_to->insertFrom(*column_from, 0); + ASSERT_EQ(column_to->getVariantInfo().variant_type->getName(), expected_variant); + ASSERT_EQ(column_to->getVariantInfo().variant_names, expected_names); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator == expected_variant_name_to_discriminator); + auto field = (*column_to)[column_to->size() - 1]; + ASSERT_EQ(field, 42); + + column_to->insertFrom(*column_from, 1); + field = (*column_to)[column_to->size() - 1]; + ASSERT_EQ(field, 42.42); + + column_to->insertFrom(*column_from, 2); + field = (*column_to)[column_to->size() - 1]; + ASSERT_EQ(field, "str"); + + ASSERT_EQ(column_to->getVariantInfo().variant_type->getName(), expected_variant); + ASSERT_EQ(column_to->getVariantInfo().variant_names, expected_names); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator == expected_variant_name_to_discriminator); +} + +TEST(ColumnDynamic, InsertFrom1) +{ + auto column_to = ColumnDynamic::create(255); + checkInsertFrom(getInsertFromColumn(), column_to, "Variant(Float64, Int8, String)", {"Float64", "Int8", "String"}, {{"Float64", 0}, {"Int8", 1}, {"String", 2}}); +} + +TEST(ColumnDynamic, InsertFrom2) +{ + auto column_to = ColumnDynamic::create(255); + column_to->insert(Field(42)); + column_to->insert(Field(42.42)); + column_to->insert(Field("str")); + + checkInsertFrom(getInsertFromColumn(), column_to, "Variant(Float64, Int8, String)", {"Float64", "Int8", "String"}, {{"Float64", 0}, {"Int8", 1}, {"String", 2}}); +} + +TEST(ColumnDynamic, InsertFrom3) +{ + auto column_to = ColumnDynamic::create(255); + column_to->insert(Field(42)); + column_to->insert(Field(42.42)); + column_to->insert(Field("str")); + column_to->insert(Array({Field(42)})); + + 
checkInsertFrom(getInsertFromColumn(), column_to, "Variant(Array(Int8), Float64, Int8, String)", {"Array(Int8)", "Float64", "Int8", "String"}, {{"Array(Int8)", 0}, {"Float64", 1}, {"Int8", 2}, {"String", 3}}); +} + +TEST(ColumnDynamic, InsertFromOverflow1) +{ + auto column_from = ColumnDynamic::create(255); + column_from->insert(Field(42)); + column_from->insert(Field(42.42)); + column_from->insert(Field("str")); + + auto column_to = getDynamicWithManyVariants(253); + column_to->insertFrom(*column_from, 0); + ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 254); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8")); + auto field = (*column_to)[column_to->size() - 1]; + ASSERT_EQ(field, 42); + + column_to->insertFrom(*column_from, 1); + ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255); + ASSERT_TRUE(!column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64")); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String")); + field = (*column_to)[column_to->size() - 1]; + ASSERT_EQ(field, "42.42"); + + column_to->insertFrom(*column_from, 2); + ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String")); + field = (*column_to)[column_to->size() - 1]; + ASSERT_EQ(field, "str"); +} + +TEST(ColumnDynamic, InsertFromOverflow2) +{ + auto column_from = ColumnDynamic::create(255); + column_from->insert(Field(42)); + column_from->insert(Field(42.42)); + + auto column_to = getDynamicWithManyVariants(253); + column_to->insertFrom(*column_from, 0); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8")); + auto field = (*column_to)[column_to->size() - 1]; + ASSERT_EQ(field, 42); + + column_to->insertFrom(*column_from, 1); + ASSERT_TRUE(!column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64")); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String")); + field = (*column_to)[column_to->size() - 1]; + ASSERT_EQ(field, "42.42"); +} + +void checkInsertManyFrom(const ColumnDynamic::MutablePtr & column_from, ColumnDynamic::MutablePtr & column_to, const std::string & expected_variant, const std::vector & expected_names, const std::unordered_map & expected_variant_name_to_discriminator) +{ + column_to->insertManyFrom(*column_from, 0, 2); + ASSERT_EQ(column_to->getVariantInfo().variant_type->getName(), expected_variant); + ASSERT_EQ(column_to->getVariantInfo().variant_names, expected_names); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator == expected_variant_name_to_discriminator); + auto field = (*column_to)[column_to->size() - 2]; + ASSERT_EQ(field, 42); + field = (*column_to)[column_to->size() - 1]; + ASSERT_EQ(field, 42); + + column_to->insertManyFrom(*column_from, 1, 2); + field = (*column_to)[column_to->size() - 2]; + ASSERT_EQ(field, 42.42); + field = (*column_to)[column_to->size() - 1]; + ASSERT_EQ(field, 42.42); + + column_to->insertManyFrom(*column_from, 2, 2); + field = (*column_to)[column_to->size() - 2]; + ASSERT_EQ(field, "str"); + field = (*column_to)[column_to->size() - 1]; + ASSERT_EQ(field, "str"); + + ASSERT_EQ(column_to->getVariantInfo().variant_type->getName(), expected_variant); + ASSERT_EQ(column_to->getVariantInfo().variant_names, expected_names); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator == expected_variant_name_to_discriminator); +} + 
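Before the insertManyFrom tests below, here is a minimal usage sketch in the same gtest style. It is an illustrative test added for clarity, not part of the patch; it reuses this file's includes and only restates behaviour already asserted by the tests in this file.

TEST(ColumnDynamic, IllustrativeBasicUsage)
{
    /// Variants are created lazily as values of new types are inserted; the variant list
    /// is kept sorted by type name, and once the max_types limit is exhausted new types
    /// are stored as String (see the Overflow tests in this file).
    auto column = ColumnDynamic::create(255);
    column->insert(Field(42));
    column->insert(Field("str"));

    ASSERT_EQ(column->getVariantInfo().variant_type->getName(), "Variant(Int8, String)");
    ASSERT_EQ((*column)[0], Field(42));
    ASSERT_EQ((*column)[1], Field("str"));
}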
+TEST(ColumnDynamic, InsertManyFrom1) +{ + auto column_to = ColumnDynamic::create(255); + checkInsertManyFrom(getInsertFromColumn(), column_to, "Variant(Float64, Int8, String)", {"Float64", "Int8", "String"}, {{"Float64", 0}, {"Int8", 1}, {"String", 2}}); +} + +TEST(ColumnDynamic, InsertManyFrom2) +{ + auto column_to = ColumnDynamic::create(255); + column_to->insert(Field(42)); + column_to->insert(Field(42.42)); + column_to->insert(Field("str")); + + checkInsertManyFrom(getInsertFromColumn(), column_to, "Variant(Float64, Int8, String)", {"Float64", "Int8", "String"}, {{"Float64", 0}, {"Int8", 1}, {"String", 2}}); +} + +TEST(ColumnDynamic, InsertManyFrom3) +{ + auto column_to = ColumnDynamic::create(255); + column_to->insert(Field(42)); + column_to->insert(Field(42.42)); + column_to->insert(Field("str")); + column_to->insert(Array({Field(42)})); + + checkInsertManyFrom(getInsertFromColumn(), column_to, "Variant(Array(Int8), Float64, Int8, String)", {"Array(Int8)", "Float64", "Int8", "String"}, {{"Array(Int8)", 0}, {"Float64", 1}, {"Int8", 2}, {"String", 3}}); +} + +TEST(ColumnDynamic, InsertManyFromOverflow1) +{ + auto column_from = ColumnDynamic::create(255); + column_from->insert(Field(42)); + column_from->insert(Field(42.42)); + column_from->insert(Field("str")); + + auto column_to = getDynamicWithManyVariants(253); + column_to->insertManyFrom(*column_from, 0, 2); + ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 254); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8")); + auto field = (*column_to)[column_to->size() - 2]; + ASSERT_EQ(field, 42); + field = (*column_to)[column_to->size() - 1]; + ASSERT_EQ(field, 42); + + column_to->insertManyFrom(*column_from, 1, 2); + ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255); + ASSERT_TRUE(!column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64")); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String")); + field = (*column_to)[column_to->size() - 2]; + ASSERT_EQ(field, "42.42"); + field = (*column_to)[column_to->size() - 1]; + ASSERT_EQ(field, "42.42"); + + column_to->insertManyFrom(*column_from, 2, 2); + ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String")); + field = (*column_to)[column_to->size() - 1]; + ASSERT_EQ(field, "str"); + field = (*column_to)[column_to->size() - 2]; + ASSERT_EQ(field, "str"); +} + +TEST(ColumnDynamic, InsertManyFromOverflow2) +{ + auto column_from = ColumnDynamic::create(255); + column_from->insert(Field(42)); + column_from->insert(Field(42.42)); + + auto column_to = getDynamicWithManyVariants(253); + column_to->insertManyFrom(*column_from, 0, 2); + ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 254); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8")); + auto field = (*column_to)[column_to->size() - 2]; + ASSERT_EQ(field, 42); + field = (*column_to)[column_to->size() - 1]; + ASSERT_EQ(field, 42); + + column_to->insertManyFrom(*column_from, 1, 2); + ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255); + ASSERT_TRUE(!column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64")); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String")); + field = (*column_to)[column_to->size() - 2]; + ASSERT_EQ(field, "42.42"); + field = (*column_to)[column_to->size() - 1]; + ASSERT_EQ(field, 
"42.42"); +} + +void checkInsertRangeFrom(const ColumnDynamic::MutablePtr & column_from, ColumnDynamic::MutablePtr & column_to, const std::string & expected_variant, const std::vector & expected_names, const std::unordered_map & expected_variant_name_to_discriminator) +{ + column_to->insertRangeFrom(*column_from, 0, 3); + ASSERT_EQ(column_to->getVariantInfo().variant_type->getName(), expected_variant); + ASSERT_EQ(column_to->getVariantInfo().variant_names, expected_names); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator == expected_variant_name_to_discriminator); + auto field = (*column_to)[column_to->size() - 3]; + ASSERT_EQ(field, 42); + field = (*column_to)[column_to->size() - 2]; + ASSERT_EQ(field, 42.42); + field = (*column_to)[column_to->size() - 1]; + ASSERT_EQ(field, "str"); + + column_to->insertRangeFrom(*column_from, 3, 3); + field = (*column_to)[column_to->size() - 3]; + ASSERT_EQ(field, 42); + field = (*column_to)[column_to->size() - 2]; + ASSERT_EQ(field, 42.42); + field = (*column_to)[column_to->size() - 1]; + ASSERT_EQ(field, "str"); + + ASSERT_EQ(column_to->getVariantInfo().variant_type->getName(), expected_variant); + ASSERT_EQ(column_to->getVariantInfo().variant_names, expected_names); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator == expected_variant_name_to_discriminator); +} + +TEST(ColumnDynamic, InsertRangeFrom1) +{ + auto column_to = ColumnDynamic::create(255); + checkInsertRangeFrom(getInsertFromColumn(2), column_to, "Variant(Float64, Int8, String)", {"Float64", "Int8", "String"}, {{"Float64", 0}, {"Int8", 1}, {"String", 2}}); +} + +TEST(ColumnDynamic, InsertRangeFrom2) +{ + auto column_to = ColumnDynamic::create(255); + column_to->insert(Field(42)); + column_to->insert(Field(42.42)); + column_to->insert(Field("str1")); + + checkInsertRangeFrom(getInsertFromColumn(2), column_to, "Variant(Float64, Int8, String)", {"Float64", "Int8", "String"}, {{"Float64", 0}, {"Int8", 1}, {"String", 2}}); +} + +TEST(ColumnDynamic, InsertRangeFrom3) +{ + auto column_to = ColumnDynamic::create(255); + column_to->insert(Field(42)); + column_to->insert(Field(42.42)); + column_to->insert(Field("str1")); + column_to->insert(Array({Field(42)})); + + checkInsertRangeFrom(getInsertFromColumn(2), column_to, "Variant(Array(Int8), Float64, Int8, String)", {"Array(Int8)", "Float64", "Int8", "String"}, {{"Array(Int8)", 0}, {"Float64", 1}, {"Int8", 2}, {"String", 3}}); +} + +TEST(ColumnDynamic, InsertRangeFromOverflow1) +{ + auto column_from = ColumnDynamic::create(255); + column_from->insert(Field(42)); + column_from->insert(Field(43)); + column_from->insert(Field(42.42)); + column_from->insert(Field("str")); + + auto column_to = getDynamicWithManyVariants(253); + column_to->insertRangeFrom(*column_from, 0, 4); + ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8")); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String")); + ASSERT_TRUE(!column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64")); + auto field = (*column_to)[column_to->size() - 4]; + ASSERT_EQ(field, Field(42)); + field = (*column_to)[column_to->size() - 3]; + ASSERT_EQ(field, Field(43)); + field = (*column_to)[column_to->size() - 2]; + ASSERT_EQ(field, Field("42.42")); + field = (*column_to)[column_to->size() - 1]; + ASSERT_EQ(field, Field("str")); +} + +TEST(ColumnDynamic, InsertRangeFromOverflow2) +{ + auto column_from = 
ColumnDynamic::create(255); + column_from->insert(Field(42)); + column_from->insert(Field(43)); + column_from->insert(Field(42.42)); + + auto column_to = getDynamicWithManyVariants(253); + column_to->insertRangeFrom(*column_from, 0, 3); + ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8")); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String")); + ASSERT_TRUE(!column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64")); + auto field = (*column_to)[column_to->size() - 3]; + ASSERT_EQ(field, Field(42)); + field = (*column_to)[column_to->size() - 2]; + ASSERT_EQ(field, Field(43)); + field = (*column_to)[column_to->size() - 1]; + ASSERT_EQ(field, Field("42.42")); +} + +TEST(ColumnDynamic, InsertRangeFromOverflow3) +{ + auto column_from = ColumnDynamic::create(255); + column_from->insert(Field(42)); + column_from->insert(Field(43)); + column_from->insert(Field(42.42)); + + auto column_to = getDynamicWithManyVariants(253); + column_to->insert(Field("Str")); + column_to->insertRangeFrom(*column_from, 0, 3); + ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8")); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String")); + ASSERT_TRUE(!column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64")); + auto field = (*column_to)[column_to->size() - 3]; + ASSERT_EQ(field, Field(42)); + field = (*column_to)[column_to->size() - 2]; + ASSERT_EQ(field, Field(43)); + field = (*column_to)[column_to->size() - 1]; + ASSERT_EQ(field, Field("42.42")); +} + +TEST(ColumnDynamic, InsertRangeFromOverflow4) +{ + auto column_from = ColumnDynamic::create(255); + column_from->insert(Field(42)); + column_from->insert(Field(42.42)); + column_from->insert(Field("str")); + + auto column_to = getDynamicWithManyVariants(254); + column_to->insertRangeFrom(*column_from, 0, 3); + ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255); + ASSERT_TRUE(!column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8")); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String")); + ASSERT_TRUE(!column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64")); + auto field = (*column_to)[column_to->size() - 3]; + ASSERT_EQ(field, Field("42")); + field = (*column_to)[column_to->size() - 2]; + ASSERT_EQ(field, Field("42.42")); + field = (*column_to)[column_to->size() - 1]; + ASSERT_EQ(field, Field("str")); +} + +TEST(ColumnDynamic, InsertRangeFromOverflow5) +{ + auto column_from = ColumnDynamic::create(255); + column_from->insert(Field(42)); + column_from->insert(Field(43)); + column_from->insert(Field(42.42)); + column_from->insert(Field("str")); + + auto column_to = getDynamicWithManyVariants(253); + column_to->insert(Field("str")); + column_to->insertRangeFrom(*column_from, 0, 4); + ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8")); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String")); + ASSERT_TRUE(!column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64")); + auto field = (*column_to)[column_to->size() - 4]; + ASSERT_EQ(field, Field(42)); + field = (*column_to)[column_to->size() - 3]; + 
ASSERT_EQ(field, Field(43)); + field = (*column_to)[column_to->size() - 2]; + ASSERT_EQ(field, Field("42.42")); + field = (*column_to)[column_to->size() - 1]; + ASSERT_EQ(field, Field("str")); +} + +TEST(ColumnDynamic, InsertRangeFromOverflow6) +{ + auto column_from = ColumnDynamic::create(255); + column_from->insert(Field(42)); + column_from->insert(Field(43)); + column_from->insert(Field(44)); + column_from->insert(Field(42.42)); + column_from->insert(Field(43.43)); + column_from->insert(Field("str")); + column_from->insert(Field(Array({Field(42)}))); + + auto column_to = getDynamicWithManyVariants(253); + column_to->insertRangeFrom(*column_from, 2, 5); + ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64")); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String")); + ASSERT_TRUE(!column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8")); + ASSERT_TRUE(!column_to->getVariantInfo().variant_name_to_discriminator.contains("Array(Int8)")); + auto field = (*column_to)[column_to->size() - 5]; + + ASSERT_EQ(field, Field("44")); + field = (*column_to)[column_to->size() - 4]; + ASSERT_EQ(field, Field(42.42)); + field = (*column_to)[column_to->size() - 3]; + ASSERT_EQ(field, Field(43.43)); + field = (*column_to)[column_to->size() - 2]; + ASSERT_EQ(field, Field("str")); + field = (*column_to)[column_to->size() - 1]; + ASSERT_EQ(field, Field("[42]")); +} + +TEST(ColumnDynamic, SerializeDeserializeFromArena1) +{ + auto column = ColumnDynamic::create(255); + column->insert(Field(42)); + column->insert(Field(42.42)); + column->insert(Field("str")); + column->insert(Field(Null())); + + Arena arena; + const char * pos = nullptr; + auto ref1 = column->serializeValueIntoArena(0, arena, pos); + column->serializeValueIntoArena(1, arena, pos); + column->serializeValueIntoArena(2, arena, pos); + column->serializeValueIntoArena(3, arena, pos); + pos = column->deserializeAndInsertFromArena(ref1.data); + pos = column->deserializeAndInsertFromArena(pos); + pos = column->deserializeAndInsertFromArena(pos); + column->deserializeAndInsertFromArena(pos); + + ASSERT_EQ((*column)[column->size() - 4], 42); + ASSERT_EQ((*column)[column->size() - 3], 42.42); + ASSERT_EQ((*column)[column->size() - 2], "str"); + ASSERT_EQ((*column)[column->size() - 1], Null()); +} + +TEST(ColumnDynamic, SerializeDeserializeFromArena2) +{ + auto column_from = ColumnDynamic::create(255); + column_from->insert(Field(42)); + column_from->insert(Field(42.42)); + column_from->insert(Field("str")); + column_from->insert(Field(Null())); + + Arena arena; + const char * pos = nullptr; + auto ref1 = column_from->serializeValueIntoArena(0, arena, pos); + column_from->serializeValueIntoArena(1, arena, pos); + column_from->serializeValueIntoArena(2, arena, pos); + column_from->serializeValueIntoArena(3, arena, pos); + + auto column_to = ColumnDynamic::create(255); + pos = column_to->deserializeAndInsertFromArena(ref1.data); + pos = column_to->deserializeAndInsertFromArena(pos); + pos = column_to->deserializeAndInsertFromArena(pos); + column_to->deserializeAndInsertFromArena(pos); + + ASSERT_EQ((*column_from)[column_from->size() - 4], 42); + ASSERT_EQ((*column_from)[column_from->size() - 3], 42.42); + ASSERT_EQ((*column_from)[column_from->size() - 2], "str"); + ASSERT_EQ((*column_from)[column_from->size() - 1], Null()); + ASSERT_EQ(column_to->getVariantInfo().variant_type->getName(), "Variant(Float64, 
Int8, String)"); + std::vector expected_names = {"Float64", "Int8", "String"}; + ASSERT_EQ(column_to->getVariantInfo().variant_names, expected_names); + std::unordered_map expected_variant_name_to_discriminator = {{"Float64", 0}, {"Int8", 1}, {"String", 2}}; + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator == expected_variant_name_to_discriminator); +} + +TEST(ColumnDynamic, SerializeDeserializeFromArenaOverflow) +{ + auto column_from = ColumnDynamic::create(255); + column_from->insert(Field(42)); + column_from->insert(Field(42.42)); + column_from->insert(Field("str")); + column_from->insert(Field(Null())); + + Arena arena; + const char * pos = nullptr; + auto ref1 = column_from->serializeValueIntoArena(0, arena, pos); + column_from->serializeValueIntoArena(1, arena, pos); + column_from->serializeValueIntoArena(2, arena, pos); + column_from->serializeValueIntoArena(3, arena, pos); + + auto column_to = getDynamicWithManyVariants(253); + pos = column_to->deserializeAndInsertFromArena(ref1.data); + pos = column_to->deserializeAndInsertFromArena(pos); + pos = column_to->deserializeAndInsertFromArena(pos); + column_to->deserializeAndInsertFromArena(pos); + + ASSERT_EQ((*column_from)[column_from->size() - 4], 42); + ASSERT_EQ((*column_from)[column_from->size() - 3], 42.42); + ASSERT_EQ((*column_from)[column_from->size() - 2], "str"); + ASSERT_EQ((*column_from)[column_from->size() - 1], Null()); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8")); + ASSERT_TRUE(!column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64")); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String")); +} + +TEST(ColumnDynamic, skipSerializedInArena) +{ + auto column_from = ColumnDynamic::create(255); + column_from->insert(Field(42)); + column_from->insert(Field(42.42)); + column_from->insert(Field("str")); + column_from->insert(Field(Null())); + + Arena arena; + const char * pos = nullptr; + auto ref1 = column_from->serializeValueIntoArena(0, arena, pos); + column_from->serializeValueIntoArena(1, arena, pos); + column_from->serializeValueIntoArena(2, arena, pos); + auto ref4 = column_from->serializeValueIntoArena(3, arena, pos); + + const char * end = ref4.data + ref4.size; + auto column_to = ColumnDynamic::create(255); + pos = column_to->skipSerializedInArena(ref1.data); + pos = column_to->skipSerializedInArena(pos); + pos = column_to->skipSerializedInArena(pos); + pos = column_to->skipSerializedInArena(pos); + + ASSERT_EQ(pos, end); + ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.empty()); + ASSERT_TRUE(column_to->getVariantInfo().variant_names.empty()); +} diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 84e709294aa..7176c4d8850 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -871,6 +871,7 @@ class IColumn; M(Bool, traverse_shadow_remote_data_paths, false, "Traverse shadow directory when query system.remote_data_paths", 0) \ M(Bool, geo_distance_returns_float64_on_float64_arguments, true, "If all four arguments to `geoDistance`, `greatCircleDistance`, `greatCircleAngle` functions are Float64, return Float64 and use double precision for internal calculations. In previous ClickHouse versions, the functions always returned Float32.", 0) \ M(Bool, allow_get_client_http_header, false, "Allow to use the function `getClientHTTPHeader` which lets to obtain a value of an the current HTTP request's header. 
It is not enabled by default for security reasons, because some headers, such as `Cookie`, could contain sensitive info. Note that the `X-ClickHouse-*` and `Authentication` headers are always restricted and cannot be obtained with this function.", 0) \ + M(Bool, cast_string_to_dynamic_use_inference, false, "Use types inference during String to Dynamic conversion", 0) \ \ /** Experimental functions */ \ M(Bool, allow_experimental_materialized_postgresql_table, false, "Allows to use the MaterializedPostgreSQL table engine. Disabled by default, because this feature is experimental", 0) \ @@ -879,6 +880,7 @@ class IColumn; M(Bool, allow_experimental_hash_functions, false, "Enable experimental hash functions", 0) \ M(Bool, allow_experimental_object_type, false, "Allow Object and JSON data types", 0) \ M(Bool, allow_experimental_variant_type, false, "Allow Variant data type", 0) \ + M(Bool, allow_experimental_dynamic_type, false, "Allow Dynamic data type", 0) \ M(Bool, allow_experimental_annoy_index, false, "Allows to use Annoy index. Disabled by default because this feature is experimental", 0) \ M(Bool, allow_experimental_usearch_index, false, "Allows to use USearch index. Disabled by default because this feature is experimental", 0) \ M(UInt64, max_limit_for_ann_queries, 1'000'000, "SELECT queries with LIMIT bigger than this setting cannot use ANN indexes. Helps to prevent memory overflows in ANN search indexes.", 0) \ diff --git a/src/Core/TypeId.h b/src/Core/TypeId.h index 7003e880cd5..26d9ab8595b 100644 --- a/src/Core/TypeId.h +++ b/src/Core/TypeId.h @@ -50,6 +50,7 @@ enum class TypeIndex IPv6, JSONPaths, Variant, + Dynamic }; /** diff --git a/src/DataTypes/DataTypeArray.cpp b/src/DataTypes/DataTypeArray.cpp index 6e5760933eb..806a1577a21 100644 --- a/src/DataTypes/DataTypeArray.cpp +++ b/src/DataTypes/DataTypeArray.cpp @@ -75,6 +75,27 @@ void DataTypeArray::forEachChild(const ChildCallback & callback) const nested->forEachChild(callback); } +std::unique_ptr DataTypeArray::getDynamicSubcolumnData(std::string_view subcolumn_name, const DB::IDataType::SubstreamData & data, bool throw_if_null) const +{ + auto nested_type = assert_cast(*data.type).nested; + auto nested_data = std::make_unique(nested_type->getDefaultSerialization()); + nested_data->type = nested_type; + nested_data->column = data.column ? assert_cast(*data.column).getDataPtr() : nullptr; + + auto nested_subcolumn_data = nested_type->getSubcolumnData(subcolumn_name, *nested_data, throw_if_null); + if (!nested_subcolumn_data) + return nullptr; + + auto creator = SerializationArray::SubcolumnCreator(data.column ? 
assert_cast(*data.column).getOffsetsPtr() : nullptr); + auto res = std::make_unique(); + res->serialization = creator.create(nested_subcolumn_data->serialization); + res->type = creator.create(nested_subcolumn_data->type); + if (data.column) + res->column = creator.create(nested_subcolumn_data->column); + + return res; +} + static DataTypePtr create(const ASTPtr & arguments) { if (!arguments || arguments->children.size() != 1) diff --git a/src/DataTypes/DataTypeArray.h b/src/DataTypes/DataTypeArray.h index 4423f137e1a..b242d871c36 100644 --- a/src/DataTypes/DataTypeArray.h +++ b/src/DataTypes/DataTypeArray.h @@ -55,7 +55,12 @@ public: bool textCanContainOnlyValidUTF8() const override { return nested->textCanContainOnlyValidUTF8(); } bool isComparable() const override { return nested->isComparable(); } bool canBeComparedWithCollation() const override { return nested->canBeComparedWithCollation(); } - bool hasDynamicSubcolumns() const override { return nested->hasDynamicSubcolumns(); } + bool hasDynamicSubcolumnsDeprecated() const override { return nested->hasDynamicSubcolumnsDeprecated(); } + + /// Array column doesn't have subcolumns by itself but allows to read subcolumns of nested column. + /// If nested column has dynamic subcolumns, Array of this type should also be able to read these dynamic subcolumns. + bool hasDynamicSubcolumnsData() const override { return nested->hasDynamicSubcolumnsData(); } + std::unique_ptr getDynamicSubcolumnData(std::string_view subcolumn_name, const SubstreamData & data, bool throw_if_null) const override; bool isValueUnambiguouslyRepresentedInContiguousMemoryRegion() const override { diff --git a/src/DataTypes/DataTypeDynamic.cpp b/src/DataTypes/DataTypeDynamic.cpp new file mode 100644 index 00000000000..2c6b3eba906 --- /dev/null +++ b/src/DataTypes/DataTypeDynamic.cpp @@ -0,0 +1,144 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int ILLEGAL_COLUMN; + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int UNEXPECTED_AST_STRUCTURE; +} + +DataTypeDynamic::DataTypeDynamic(size_t max_dynamic_types_) : max_dynamic_types(max_dynamic_types_) +{ +} + +MutableColumnPtr DataTypeDynamic::createColumn() const +{ + return ColumnDynamic::create(max_dynamic_types); +} + +String DataTypeDynamic::doGetName() const +{ + if (max_dynamic_types == DEFAULT_MAX_DYNAMIC_TYPES) + return "Dynamic"; + return "Dynamic(max_types=" + toString(max_dynamic_types) + ")"; +} + +Field DataTypeDynamic::getDefault() const +{ + return Field(Null()); +} + +SerializationPtr DataTypeDynamic::doGetDefaultSerialization() const +{ + return std::make_shared(max_dynamic_types); +} + +static DataTypePtr create(const ASTPtr & arguments) +{ + if (!arguments || arguments->children.empty()) + return std::make_shared(); + + if (arguments->children.size() > 1) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Dynamic data type can have only one optional argument - the maximum number of dynamic types in a form 'Dynamic(max_types=N)"); + + + const auto * argument = arguments->children[0]->as(); + if (!argument || argument->name != "equals") + throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, "Dynamic data type argument should be in a form 'max_types=N'"); + + auto identifier_name = argument->arguments->children[0]->as()->name(); + if (identifier_name != "max_types") + throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, 
"Unexpected identifier: {}. Dynamic data type argument should be in a form 'max_types=N'", identifier_name); + + auto literal = argument->arguments->children[1]->as(); + + if (!literal || literal->value.getType() != Field::Types::UInt64 || literal->value.get() == 0 || literal->value.get() > 255) + throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, "'max_types' argument for Dynamic type should be a positive integer between 1 and 255"); + + return std::make_shared(literal->value.get()); +} + +void registerDataTypeDynamic(DataTypeFactory & factory) +{ + factory.registerDataType("Dynamic", create); +} + +std::unique_ptr DataTypeDynamic::getDynamicSubcolumnData(std::string_view subcolumn_name, const DB::IDataType::SubstreamData & data, bool throw_if_null) const +{ + auto [subcolumn_type_name, subcolumn_nested_name] = Nested::splitName(subcolumn_name); + /// Check if requested subcolumn is a valid data type. + auto subcolumn_type = DataTypeFactory::instance().tryGet(String(subcolumn_type_name)); + if (!subcolumn_type) + { + if (throw_if_null) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Dynamic type doesn't have subcolumn '{}'", subcolumn_type_name); + return nullptr; + } + + std::unique_ptr res = std::make_unique(subcolumn_type->getDefaultSerialization()); + res->type = subcolumn_type; + std::optional discriminator; + if (data.column) + { + /// If column was provided, we should extract subcolumn from Dynamic column. + const auto & dynamic_column = assert_cast(*data.column); + const auto & variant_info = dynamic_column.getVariantInfo(); + /// Check if provided Dynamic column has subcolumn of this type. + auto it = variant_info.variant_name_to_discriminator.find(subcolumn_type->getName()); + if (it != variant_info.variant_name_to_discriminator.end()) + { + discriminator = it->second; + res->column = dynamic_column.getVariantColumn().getVariantPtrByGlobalDiscriminator(*discriminator); + } + } + + /// Extract nested subcolumn of requested dynamic subcolumn if needed. + if (!subcolumn_nested_name.empty()) + { + res = getSubcolumnData(subcolumn_nested_name, *res, throw_if_null); + if (!res) + return nullptr; + } + + res->serialization = std::make_shared(res->serialization, subcolumn_type->getName()); + res->type = makeNullableOrLowCardinalityNullableSafe(res->type); + if (data.column) + { + if (discriminator) + { + /// Provided Dynamic column has subcolumn of this type, we should use VariantSubcolumnCreator to + /// create full subcolumn from variant according to discriminators. + const auto & variant_column = assert_cast(*data.column).getVariantColumn(); + auto creator = SerializationVariantElement::VariantSubcolumnCreator(variant_column.getLocalDiscriminatorsPtr(), "", *discriminator, variant_column.localDiscriminatorByGlobal(*discriminator)); + res->column = creator.create(res->column); + } + else + { + /// Provided Dynamic column doesn't have subcolumn of this type, just create column filled with default values. 
+ auto column = res->type->createColumn(); + column->insertManyDefaults(data.column->size()); + res->column = std::move(column); + } + } + + return res; +} + +} diff --git a/src/DataTypes/DataTypeDynamic.h b/src/DataTypes/DataTypeDynamic.h new file mode 100644 index 00000000000..452e05061a0 --- /dev/null +++ b/src/DataTypes/DataTypeDynamic.h @@ -0,0 +1,53 @@ +#pragma once + +#include + +#define DEFAULT_MAX_DYNAMIC_TYPES 32 + + +namespace DB +{ + +class DataTypeDynamic final : public IDataType +{ +public: + static constexpr bool is_parametric = true; + + DataTypeDynamic(size_t max_dynamic_types_ = DEFAULT_MAX_DYNAMIC_TYPES); + + TypeIndex getTypeId() const override { return TypeIndex::Dynamic; } + const char * getFamilyName() const override { return "Dynamic"; } + + bool isParametric() const override { return true; } + bool canBeInsideNullable() const override { return false; } + bool supportsSparseSerialization() const override { return false; } + bool canBeInsideSparseColumns() const override { return false; } + bool isComparable() const override { return true; } + + MutableColumnPtr createColumn() const override; + + Field getDefault() const override; + + bool equals(const IDataType & rhs) const override + { + if (const auto * rhs_dynamic_type = typeid_cast(&rhs)) + return max_dynamic_types == rhs_dynamic_type->max_dynamic_types; + return false; + } + + bool haveSubtypes() const override { return false; } + + bool hasDynamicSubcolumnsData() const override { return true; } + std::unique_ptr getDynamicSubcolumnData(std::string_view subcolumn_name, const SubstreamData & data, bool throw_if_null) const override; + + size_t getMaxDynamicTypes() const { return max_dynamic_types; } + +private: + SerializationPtr doGetDefaultSerialization() const override; + String doGetName() const override; + + size_t max_dynamic_types; +}; + +} + diff --git a/src/DataTypes/DataTypeFactory.cpp b/src/DataTypes/DataTypeFactory.cpp index 844384f3c95..a94526dce60 100644 --- a/src/DataTypes/DataTypeFactory.cpp +++ b/src/DataTypes/DataTypeFactory.cpp @@ -292,6 +292,7 @@ DataTypeFactory::DataTypeFactory() registerDataTypeMap(*this); registerDataTypeObject(*this); registerDataTypeVariant(*this); + registerDataTypeDynamic(*this); } DataTypeFactory & DataTypeFactory::instance() diff --git a/src/DataTypes/DataTypeFactory.h b/src/DataTypes/DataTypeFactory.h index 4727cb3ae5c..86e0203358d 100644 --- a/src/DataTypes/DataTypeFactory.h +++ b/src/DataTypes/DataTypeFactory.h @@ -100,5 +100,6 @@ void registerDataTypeDomainSimpleAggregateFunction(DataTypeFactory & factory); void registerDataTypeDomainGeo(DataTypeFactory & factory); void registerDataTypeObject(DataTypeFactory & factory); void registerDataTypeVariant(DataTypeFactory & factory); +void registerDataTypeDynamic(DataTypeFactory & factory); } diff --git a/src/DataTypes/DataTypeMap.h b/src/DataTypes/DataTypeMap.h index 7281cca1bb1..4866c3e78cc 100644 --- a/src/DataTypes/DataTypeMap.h +++ b/src/DataTypes/DataTypeMap.h @@ -42,7 +42,7 @@ public: bool isComparable() const override { return key_type->isComparable() && value_type->isComparable(); } bool isParametric() const override { return true; } bool haveSubtypes() const override { return true; } - bool hasDynamicSubcolumns() const override { return nested->hasDynamicSubcolumns(); } + bool hasDynamicSubcolumnsDeprecated() const override { return nested->hasDynamicSubcolumnsDeprecated(); } const DataTypePtr & getKeyType() const { return key_type; } const DataTypePtr & getValueType() const { return value_type; } diff --git 
a/src/DataTypes/DataTypeObject.h b/src/DataTypes/DataTypeObject.h index 937a9091371..c610a1a8ba4 100644 --- a/src/DataTypes/DataTypeObject.h +++ b/src/DataTypes/DataTypeObject.h @@ -36,7 +36,7 @@ public: bool haveSubtypes() const override { return false; } bool equals(const IDataType & rhs) const override; bool isParametric() const override { return true; } - bool hasDynamicSubcolumns() const override { return true; } + bool hasDynamicSubcolumnsDeprecated() const override { return true; } SerializationPtr doGetDefaultSerialization() const override; diff --git a/src/DataTypes/DataTypeTuple.cpp b/src/DataTypes/DataTypeTuple.cpp index 5bbd79160d4..71347011658 100644 --- a/src/DataTypes/DataTypeTuple.cpp +++ b/src/DataTypes/DataTypeTuple.cpp @@ -291,9 +291,9 @@ bool DataTypeTuple::haveMaximumSizeOfValue() const return std::all_of(elems.begin(), elems.end(), [](auto && elem) { return elem->haveMaximumSizeOfValue(); }); } -bool DataTypeTuple::hasDynamicSubcolumns() const +bool DataTypeTuple::hasDynamicSubcolumnsDeprecated() const { - return std::any_of(elems.begin(), elems.end(), [](auto && elem) { return elem->hasDynamicSubcolumns(); }); + return std::any_of(elems.begin(), elems.end(), [](auto && elem) { return elem->hasDynamicSubcolumnsDeprecated(); }); } bool DataTypeTuple::isComparable() const diff --git a/src/DataTypes/DataTypeTuple.h b/src/DataTypes/DataTypeTuple.h index 15561fe4286..fd00fce5a17 100644 --- a/src/DataTypes/DataTypeTuple.h +++ b/src/DataTypes/DataTypeTuple.h @@ -52,7 +52,7 @@ public: bool isComparable() const override; bool textCanContainOnlyValidUTF8() const override; bool haveMaximumSizeOfValue() const override; - bool hasDynamicSubcolumns() const override; + bool hasDynamicSubcolumnsDeprecated() const override; size_t getMaximumSizeOfValueInMemory() const override; size_t getSizeOfValueInMemory() const override; diff --git a/src/DataTypes/DataTypeVariant.cpp b/src/DataTypes/DataTypeVariant.cpp index db96972c00f..b918b79a2ed 100644 --- a/src/DataTypes/DataTypeVariant.cpp +++ b/src/DataTypes/DataTypeVariant.cpp @@ -33,6 +33,9 @@ DataTypeVariant::DataTypeVariant(const DataTypes & variants_) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Nullable/LowCardinality(Nullable) types are not allowed inside Variant type"); if (type->getTypeId() == TypeIndex::Variant) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Nested Variant types are not allowed"); + if (type->getTypeId() == TypeIndex::Dynamic) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Dynamic type is not allowed inside Variant type"); + /// Don't use Nothing type as a variant. if (!isNothing(type)) name_to_type[type->getName()] = type; @@ -42,9 +45,6 @@ DataTypeVariant::DataTypeVariant(const DataTypes & variants_) for (const auto & [_, type] : name_to_type) variants.push_back(type); - if (variants.empty()) - throw Exception(ErrorCodes::EMPTY_DATA_PASSED, "Variant cannot be empty"); - if (variants.size() > ColumnVariant::MAX_NESTED_COLUMNS) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Variant type with more than {} nested types is not allowed", ColumnVariant::MAX_NESTED_COLUMNS); } @@ -113,9 +113,16 @@ bool DataTypeVariant::equals(const IDataType & rhs) const return false; for (size_t i = 0; i < size; ++i) + { if (!variants[i]->equals(*rhs_variant.variants[i])) return false; + /// The same data types with different custom names considered different. + /// For example, UInt8 and Bool. 
+ if ((variants[i]->hasCustomName() || rhs_variant.variants[i]->hasCustomName()) && variants[i]->getName() != rhs_variant.variants[i]->getName()) + return false; + } + return true; } @@ -129,17 +136,15 @@ bool DataTypeVariant::haveMaximumSizeOfValue() const return std::all_of(variants.begin(), variants.end(), [](auto && elem) { return elem->haveMaximumSizeOfValue(); }); } -bool DataTypeVariant::hasDynamicSubcolumns() const +bool DataTypeVariant::hasDynamicSubcolumnsDeprecated() const { - return std::any_of(variants.begin(), variants.end(), [](auto && elem) { return elem->hasDynamicSubcolumns(); }); + return std::any_of(variants.begin(), variants.end(), [](auto && elem) { return elem->hasDynamicSubcolumnsDeprecated(); }); } -std::optional DataTypeVariant::tryGetVariantDiscriminator(const IDataType & type) const +std::optional DataTypeVariant::tryGetVariantDiscriminator(const String & type_name) const { - String type_name = type.getName(); for (size_t i = 0; i != variants.size(); ++i) { - /// We don't use equals here, because it doesn't respect custom type names. if (variants[i]->getName() == type_name) return i; } @@ -187,7 +192,7 @@ void DataTypeVariant::forEachChild(const DB::IDataType::ChildCallback & callback static DataTypePtr create(const ASTPtr & arguments) { if (!arguments || arguments->children.empty()) - throw Exception(ErrorCodes::EMPTY_DATA_PASSED, "Variant cannot be empty"); + return std::make_shared(DataTypes{}); DataTypes nested_types; nested_types.reserve(arguments->children.size()); diff --git a/src/DataTypes/DataTypeVariant.h b/src/DataTypes/DataTypeVariant.h index dadc85ac3b3..1b561a083b1 100644 --- a/src/DataTypes/DataTypeVariant.h +++ b/src/DataTypes/DataTypeVariant.h @@ -45,14 +45,14 @@ public: bool haveSubtypes() const override { return true; } bool textCanContainOnlyValidUTF8() const override; bool haveMaximumSizeOfValue() const override; - bool hasDynamicSubcolumns() const override; + bool hasDynamicSubcolumnsDeprecated() const override; size_t getMaximumSizeOfValueInMemory() const override; const DataTypePtr & getVariant(size_t i) const { return variants[i]; } const DataTypes & getVariants() const { return variants; } /// Check if Variant has provided type in the list of variants and return its discriminator. - std::optional tryGetVariantDiscriminator(const IDataType & type) const; + std::optional tryGetVariantDiscriminator(const String & type_name) const; void forEachChild(const ChildCallback & callback) const override; diff --git a/src/DataTypes/IDataType.cpp b/src/DataTypes/IDataType.cpp index 344b81be960..1c9715bbf53 100644 --- a/src/DataTypes/IDataType.cpp +++ b/src/DataTypes/IDataType.cpp @@ -101,14 +101,12 @@ void IDataType::forEachSubcolumn( data.serialization->enumerateStreams(settings, callback_with_data, data); } -template -Ptr IDataType::getForSubcolumn( +std::unique_ptr IDataType::getSubcolumnData( std::string_view subcolumn_name, const SubstreamData & data, - Ptr SubstreamData::*member, - bool throw_if_null) const + bool throw_if_null) { - Ptr res; + std::unique_ptr res; ISerialization::StreamCallback callback_with_data = [&](const auto & subpath) { @@ -120,7 +118,29 @@ Ptr IDataType::getForSubcolumn( auto name = ISerialization::getSubcolumnNameForStream(subpath, prefix_len); /// Create data from path only if it's requested subcolumn. 
if (name == subcolumn_name) - res = ISerialization::createFromPath(subpath, prefix_len).*member; + { + res = std::make_unique(ISerialization::createFromPath(subpath, prefix_len)); + } + /// Check if this subcolumn is a prefix of requested subcolumn and it can create dynamic subcolumns. + else if (subcolumn_name.starts_with(name + ".") && subpath[i].data.type && subpath[i].data.type->hasDynamicSubcolumnsData()) + { + auto dynamic_subcolumn_name = subcolumn_name.substr(name.size() + 1); + auto dynamic_subcolumn_data = subpath[i].data.type->getDynamicSubcolumnData(dynamic_subcolumn_name, subpath[i].data, false); + if (dynamic_subcolumn_data) + { + /// Create requested subcolumn using dynamic subcolumn data. + auto tmp_subpath = subpath; + if (tmp_subpath[i].creator) + { + dynamic_subcolumn_data->type = tmp_subpath[i].creator->create(dynamic_subcolumn_data->type); + dynamic_subcolumn_data->column = tmp_subpath[i].creator->create(dynamic_subcolumn_data->column); + dynamic_subcolumn_data->serialization = tmp_subpath[i].creator->create(dynamic_subcolumn_data->serialization); + } + + tmp_subpath[i].data = *dynamic_subcolumn_data; + res = std::make_unique(ISerialization::createFromPath(tmp_subpath, prefix_len)); + } + } } subpath[i].visited = true; } @@ -130,8 +150,11 @@ Ptr IDataType::getForSubcolumn( settings.position_independent_encoding = false; data.serialization->enumerateStreams(settings, callback_with_data, data); + if (!res && data.type->hasDynamicSubcolumnsData()) + return data.type->getDynamicSubcolumnData(subcolumn_name, data, throw_if_null); + if (!res && throw_if_null) - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in type {}", subcolumn_name, getName()); + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in type {}", subcolumn_name, data.type->getName()); return res; } @@ -141,34 +164,51 @@ bool IDataType::hasSubcolumn(std::string_view subcolumn_name) const return tryGetSubcolumnType(subcolumn_name) != nullptr; } +bool IDataType::hasDynamicSubcolumns() const +{ + if (hasDynamicSubcolumnsData()) + return true; + + bool has_dynamic_subcolumns = false; + auto data = SubstreamData(getDefaultSerialization()).withType(getPtr()); + auto callback = [&](const SubstreamPath &, const String &, const SubstreamData & subcolumn_data) + { + has_dynamic_subcolumns |= subcolumn_data.type->hasDynamicSubcolumnsData(); + }; + forEachSubcolumn(callback, data); + return has_dynamic_subcolumns; +} + DataTypePtr IDataType::tryGetSubcolumnType(std::string_view subcolumn_name) const { auto data = SubstreamData(getDefaultSerialization()).withType(getPtr()); - return getForSubcolumn(subcolumn_name, data, &SubstreamData::type, false); + auto subcolumn_data = getSubcolumnData(subcolumn_name, data, false); + return subcolumn_data ? 
subcolumn_data->type : nullptr; } DataTypePtr IDataType::getSubcolumnType(std::string_view subcolumn_name) const { auto data = SubstreamData(getDefaultSerialization()).withType(getPtr()); - return getForSubcolumn(subcolumn_name, data, &SubstreamData::type, true); + return getSubcolumnData(subcolumn_name, data, true)->type; } ColumnPtr IDataType::tryGetSubcolumn(std::string_view subcolumn_name, const ColumnPtr & column) const { - auto data = SubstreamData(getDefaultSerialization()).withColumn(column); - return getForSubcolumn(subcolumn_name, data, &SubstreamData::column, false); + auto data = SubstreamData(getDefaultSerialization()).withType(getPtr()).withColumn(column); + auto subcolumn_data = getSubcolumnData(subcolumn_name, data, false); + return subcolumn_data ? subcolumn_data->column : nullptr; } ColumnPtr IDataType::getSubcolumn(std::string_view subcolumn_name, const ColumnPtr & column) const { - auto data = SubstreamData(getDefaultSerialization()).withColumn(column); - return getForSubcolumn(subcolumn_name, data, &SubstreamData::column, true); + auto data = SubstreamData(getDefaultSerialization()).withType(getPtr()).withColumn(column); + return getSubcolumnData(subcolumn_name, data, true)->column; } SerializationPtr IDataType::getSubcolumnSerialization(std::string_view subcolumn_name, const SerializationPtr & serialization) const { - auto data = SubstreamData(serialization); - return getForSubcolumn(subcolumn_name, data, &SubstreamData::serialization, true); + auto data = SubstreamData(serialization).withType(getPtr()); + return getSubcolumnData(subcolumn_name, data, true)->serialization; } Names IDataType::getSubcolumnNames() const @@ -323,6 +363,7 @@ bool isMap(TYPE data_type) {return WhichDataType(data_type).isMap(); } \ bool isInterval(TYPE data_type) {return WhichDataType(data_type).isInterval(); } \ bool isObject(TYPE data_type) { return WhichDataType(data_type).isObject(); } \ bool isVariant(TYPE data_type) { return WhichDataType(data_type).isVariant(); } \ +bool isDynamic(TYPE data_type) { return WhichDataType(data_type).isDynamic(); } \ bool isNothing(TYPE data_type) { return WhichDataType(data_type).isNothing(); } \ \ bool isColumnedAsNumber(TYPE data_type) \ diff --git a/src/DataTypes/IDataType.h b/src/DataTypes/IDataType.h index eaf798a3017..dde61ca3a48 100644 --- a/src/DataTypes/IDataType.h +++ b/src/DataTypes/IDataType.h @@ -311,8 +311,13 @@ public: /// Strings, Numbers, Date, DateTime, Nullable virtual bool canBeInsideLowCardinality() const { return false; } - /// Object, Array(Object), Tuple(..., Object, ...) - virtual bool hasDynamicSubcolumns() const { return false; } + /// Checks for deprecated Object type usage recursively: Object, Array(Object), Tuple(..., Object, ...) + virtual bool hasDynamicSubcolumnsDeprecated() const { return false; } + + /// Checks if column has dynamic subcolumns. + virtual bool hasDynamicSubcolumns() const; + /// Checks if column can create dynamic subcolumns data and getDynamicSubcolumnData can be called. + virtual bool hasDynamicSubcolumnsData() const { return false; } /// Updates avg_value_size_hint for newly read column. Uses to optimize deserialization. Zero expected for first column. 
static void updateAvgValueSizeHint(const IColumn & column, double & avg_value_size_hint); @@ -329,16 +334,25 @@ protected: mutable SerializationPtr custom_serialization; public: + bool hasCustomName() const { return static_cast(custom_name.get()); } const IDataTypeCustomName * getCustomName() const { return custom_name.get(); } const ISerialization * getCustomSerialization() const { return custom_serialization.get(); } -private: - template - Ptr getForSubcolumn( +protected: + static std::unique_ptr getSubcolumnData( std::string_view subcolumn_name, const SubstreamData & data, - Ptr SubstreamData::*member, - bool throw_if_null) const; + bool throw_if_null); + + virtual std::unique_ptr getDynamicSubcolumnData( + std::string_view /*subcolumn_name*/, + const SubstreamData & /*data*/, + bool throw_if_null) const + { + if (throw_if_null) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getDynamicSubcolumnData() is not implemented for type {}", getName()); + return nullptr; + } }; @@ -423,6 +437,7 @@ struct WhichDataType constexpr bool isLowCardinality() const { return idx == TypeIndex::LowCardinality; } constexpr bool isVariant() const { return idx == TypeIndex::Variant; } + constexpr bool isDynamic() const { return idx == TypeIndex::Dynamic; } }; /// IDataType helpers (alternative for IDataType virtual methods with single point of truth) @@ -483,6 +498,7 @@ bool isMap(TYPE data_type); \ bool isInterval(TYPE data_type); \ bool isObject(TYPE data_type); \ bool isVariant(TYPE data_type); \ +bool isDynamic(TYPE data_type); \ bool isNothing(TYPE data_type); \ \ bool isColumnedAsNumber(TYPE data_type); \ diff --git a/src/DataTypes/ObjectUtils.cpp b/src/DataTypes/ObjectUtils.cpp index 99cf092e6cd..107e3a50025 100644 --- a/src/DataTypes/ObjectUtils.cpp +++ b/src/DataTypes/ObjectUtils.cpp @@ -177,7 +177,7 @@ static std::pair convertObjectColumnToTuple( static std::pair recursivlyConvertDynamicColumnToTuple( const ColumnPtr & column, const DataTypePtr & type) { - if (!type->hasDynamicSubcolumns()) + if (!type->hasDynamicSubcolumnsDeprecated()) return {column, type}; if (const auto * type_object = typeid_cast(type.get())) @@ -243,7 +243,7 @@ void convertDynamicColumnsToTuples(Block & block, const StorageSnapshotPtr & sto { for (auto & column : block) { - if (!column.type->hasDynamicSubcolumns()) + if (!column.type->hasDynamicSubcolumnsDeprecated()) continue; std::tie(column.column, column.type) @@ -417,7 +417,7 @@ static DataTypePtr getLeastCommonTypeForTuple( static DataTypePtr getLeastCommonTypeForDynamicColumnsImpl( const DataTypePtr & type_in_storage, const DataTypes & concrete_types, bool check_ambiguos_paths) { - if (!type_in_storage->hasDynamicSubcolumns()) + if (!type_in_storage->hasDynamicSubcolumnsDeprecated()) return type_in_storage; if (isObject(type_in_storage)) @@ -459,7 +459,7 @@ DataTypePtr getLeastCommonTypeForDynamicColumns( DataTypePtr createConcreteEmptyDynamicColumn(const DataTypePtr & type_in_storage) { - if (!type_in_storage->hasDynamicSubcolumns()) + if (!type_in_storage->hasDynamicSubcolumnsDeprecated()) return type_in_storage; if (isObject(type_in_storage)) @@ -494,7 +494,7 @@ bool hasDynamicSubcolumns(const ColumnsDescription & columns) return std::any_of(columns.begin(), columns.end(), [](const auto & column) { - return column.type->hasDynamicSubcolumns(); + return column.type->hasDynamicSubcolumnsDeprecated(); }); } @@ -1065,7 +1065,7 @@ Field FieldVisitorFoldDimension::operator()(const Null & x) const void setAllObjectsToDummyTupleType(NamesAndTypesList & columns) { for 
(auto & column : columns) - if (column.type->hasDynamicSubcolumns()) + if (column.type->hasDynamicSubcolumnsDeprecated()) column.type = createConcreteEmptyDynamicColumn(column.type); } diff --git a/src/DataTypes/ObjectUtils.h b/src/DataTypes/ObjectUtils.h index 3e3b1b96740..6599d8adef1 100644 --- a/src/DataTypes/ObjectUtils.h +++ b/src/DataTypes/ObjectUtils.h @@ -194,7 +194,7 @@ ColumnsDescription getConcreteObjectColumns( /// dummy column will be removed. for (const auto & column : storage_columns) { - if (column.type->hasDynamicSubcolumns()) + if (column.type->hasDynamicSubcolumnsDeprecated()) types_in_entries[column.name].push_back(createConcreteEmptyDynamicColumn(column.type)); } @@ -204,7 +204,7 @@ ColumnsDescription getConcreteObjectColumns( for (const auto & column : entry_columns) { auto storage_column = storage_columns.tryGetPhysical(column.name); - if (storage_column && storage_column->type->hasDynamicSubcolumns()) + if (storage_column && storage_column->type->hasDynamicSubcolumnsDeprecated()) types_in_entries[column.name].push_back(column.type); } } diff --git a/src/DataTypes/Serializations/ISerialization.cpp b/src/DataTypes/Serializations/ISerialization.cpp index a3a28f8091c..dbe27a5f3f6 100644 --- a/src/DataTypes/Serializations/ISerialization.cpp +++ b/src/DataTypes/Serializations/ISerialization.cpp @@ -196,6 +196,8 @@ String getNameForSubstreamPath( stream_name += ".variant_offsets"; else if (it->type == Substream::VariantElement) stream_name += "." + it->variant_element_name; + else if (it->type == SubstreamType::DynamicStructure) + stream_name += ".dynamic_structure"; } return stream_name; @@ -271,6 +273,23 @@ ColumnPtr ISerialization::getFromSubstreamsCache(SubstreamsCache * cache, const return it == cache->end() ? nullptr : it->second; } +void ISerialization::addToSubstreamsDeserializeStatesCache(SubstreamsDeserializeStatesCache * cache, const SubstreamPath & path, DeserializeBinaryBulkStatePtr state) +{ + if (!cache || path.empty()) + return; + + cache->emplace(getSubcolumnNameForStream(path), state); +} + +ISerialization::DeserializeBinaryBulkStatePtr ISerialization::getFromSubstreamsDeserializeStatesCache(SubstreamsDeserializeStatesCache * cache, const SubstreamPath & path) +{ + if (!cache || path.empty()) + return nullptr; + + auto it = cache->find(getSubcolumnNameForStream(path)); + return it == cache->end() ? nullptr : it->second; +} + bool ISerialization::isSpecialCompressionAllowed(const SubstreamPath & path) { for (const auto & elem : path) diff --git a/src/DataTypes/Serializations/ISerialization.h b/src/DataTypes/Serializations/ISerialization.h index ebaa26d19a6..65493cf6dda 100644 --- a/src/DataTypes/Serializations/ISerialization.h +++ b/src/DataTypes/Serializations/ISerialization.h @@ -160,6 +160,9 @@ public: VariantElements, VariantElement, + DynamicData, + DynamicStructure, + Regular, }; @@ -231,6 +234,8 @@ public: using SerializeBinaryBulkStatePtr = std::shared_ptr; using DeserializeBinaryBulkStatePtr = std::shared_ptr; + using SubstreamsDeserializeStatesCache = std::unordered_map; + struct SerializeBinaryBulkSettings { OutputStreamGetter getter; @@ -240,6 +245,14 @@ public: bool low_cardinality_use_single_dictionary_for_part = true; bool position_independent_encoding = true; + + enum class DynamicStatisticsMode + { + NONE, /// Don't write statistics. + PREFIX, /// Write statistics in prefix. + SUFFIX, /// Write statistics in suffix. 
+ }; + DynamicStatisticsMode dynamic_write_statistics = DynamicStatisticsMode::NONE; }; struct DeserializeBinaryBulkSettings @@ -256,6 +269,8 @@ public: /// If not zero, may be used to avoid reallocations while reading column of String type. double avg_value_size_hint = 0; + + bool dynamic_read_statistics = false; }; /// Call before serializeBinaryBulkWithMultipleStreams chain to write something before first mark. @@ -273,7 +288,8 @@ public: /// Call before before deserializeBinaryBulkWithMultipleStreams chain to get DeserializeBinaryBulkStatePtr. virtual void deserializeBinaryBulkStatePrefix( DeserializeBinaryBulkSettings & /*settings*/, - DeserializeBinaryBulkStatePtr & /*state*/) const {} + DeserializeBinaryBulkStatePtr & /*state*/, + SubstreamsDeserializeStatesCache * /*cache*/) const {} /** 'offset' and 'limit' are used to specify range. * limit = 0 - means no limit. @@ -393,6 +409,9 @@ public: static void addToSubstreamsCache(SubstreamsCache * cache, const SubstreamPath & path, ColumnPtr column); static ColumnPtr getFromSubstreamsCache(SubstreamsCache * cache, const SubstreamPath & path); + static void addToSubstreamsDeserializeStatesCache(SubstreamsDeserializeStatesCache * cache, const SubstreamPath & path, DeserializeBinaryBulkStatePtr state); + static DeserializeBinaryBulkStatePtr getFromSubstreamsDeserializeStatesCache(SubstreamsDeserializeStatesCache * cache, const SubstreamPath & path); + static bool isSpecialCompressionAllowed(const SubstreamPath & path); static size_t getArrayLevel(const SubstreamPath & path); diff --git a/src/DataTypes/Serializations/SerializationArray.cpp b/src/DataTypes/Serializations/SerializationArray.cpp index e8aab615849..d6546b338b5 100644 --- a/src/DataTypes/Serializations/SerializationArray.cpp +++ b/src/DataTypes/Serializations/SerializationArray.cpp @@ -284,10 +284,11 @@ void SerializationArray::serializeBinaryBulkStateSuffix( void SerializationArray::deserializeBinaryBulkStatePrefix( DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const + DeserializeBinaryBulkStatePtr & state, + SubstreamsDeserializeStatesCache * cache) const { settings.path.push_back(Substream::ArrayElements); - nested->deserializeBinaryBulkStatePrefix(settings, state); + nested->deserializeBinaryBulkStatePrefix(settings, state, cache); settings.path.pop_back(); } diff --git a/src/DataTypes/Serializations/SerializationArray.h b/src/DataTypes/Serializations/SerializationArray.h index 82f5e8bce45..c3353f0c251 100644 --- a/src/DataTypes/Serializations/SerializationArray.h +++ b/src/DataTypes/Serializations/SerializationArray.h @@ -55,7 +55,8 @@ public: void deserializeBinaryBulkStatePrefix( DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const override; + DeserializeBinaryBulkStatePtr & state, + SubstreamsDeserializeStatesCache * cache) const override; void serializeBinaryBulkWithMultipleStreams( const IColumn & column, @@ -71,7 +72,6 @@ public: DeserializeBinaryBulkStatePtr & state, SubstreamsCache * cache) const override; -private: struct SubcolumnCreator : public ISubcolumnCreator { const ColumnPtr offsets; diff --git a/src/DataTypes/Serializations/SerializationDynamic.cpp b/src/DataTypes/Serializations/SerializationDynamic.cpp new file mode 100644 index 00000000000..c9fe8dd6b29 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationDynamic.cpp @@ -0,0 +1,645 @@ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + 
+namespace ErrorCodes +{ + extern const int INCORRECT_DATA; + extern const int LOGICAL_ERROR; +} + +void SerializationDynamic::enumerateStreams( + EnumerateStreamsSettings & settings, + const StreamCallback & callback, + const SubstreamData & data) const +{ + settings.path.push_back(Substream::DynamicStructure); + callback(settings.path); + settings.path.pop_back(); + + const auto * column_dynamic = data.column ? &assert_cast(*data.column) : nullptr; + + /// If column is nullptr, nothing to enumerate as we don't have any variants. + if (!column_dynamic) + return; + + const auto & variant_info = column_dynamic->getVariantInfo(); + auto variant_serialization = variant_info.variant_type->getDefaultSerialization(); + + settings.path.push_back(Substream::DynamicData); + auto variant_data = SubstreamData(variant_serialization) + .withType(variant_info.variant_type) + .withColumn(column_dynamic->getVariantColumnPtr()) + .withSerializationInfo(data.serialization_info); + settings.path.back().data = variant_data; + variant_serialization->enumerateStreams(settings, callback, variant_data); + settings.path.pop_back(); +} + +SerializationDynamic::DynamicStructureSerializationVersion::DynamicStructureSerializationVersion(UInt64 version) : value(static_cast(version)) +{ + checkVersion(version); +} + +void SerializationDynamic::DynamicStructureSerializationVersion::checkVersion(UInt64 version) +{ + if (version != VariantTypeName) + throw Exception(ErrorCodes::INCORRECT_DATA, "Invalid version for Dynamic structure serialization."); +} + +struct SerializeBinaryBulkStateDynamic : public ISerialization::SerializeBinaryBulkState +{ + SerializationDynamic::DynamicStructureSerializationVersion structure_version; + DataTypePtr variant_type; + Names variant_names; + SerializationPtr variant_serialization; + ISerialization::SerializeBinaryBulkStatePtr variant_state; + + /// Pointer to currently serialized dynamic column. + /// Used to calculate statistics for the whole column and not for some range. + const ColumnDynamic * current_dynamic_column = nullptr; + + /// Variants statistics. Map (Variant name) -> (Variant size). + ColumnDynamic::Statistics statistics = { .source =ColumnDynamic::Statistics::Source::READ }; + + SerializeBinaryBulkStateDynamic(UInt64 structure_version_) : structure_version(structure_version_) {} + + void updateStatistics(const ColumnVariant & column_variant) + { + for (size_t i = 0; i != variant_names.size(); ++i) + statistics.data[variant_names[i]] += column_variant.getVariantPtrByGlobalDiscriminator(i)->size(); + } +}; + +struct DeserializeBinaryBulkStateDynamic : public ISerialization::DeserializeBinaryBulkState +{ + SerializationPtr variant_serialization; + ISerialization::DeserializeBinaryBulkStatePtr variant_state; + ISerialization::DeserializeBinaryBulkStatePtr structure_state; +}; + +void SerializationDynamic::serializeBinaryBulkStatePrefix( + const DB::IColumn & column, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + const auto & column_dynamic = assert_cast(column); + const auto & variant_info = column_dynamic.getVariantInfo(); + + settings.path.push_back(Substream::DynamicStructure); + auto * stream = settings.getter(settings.path); + settings.path.pop_back(); + + if (!stream) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Missing stream for Dynamic column structure during serialization of binary bulk state prefix"); + + /// Write structure serialization version. 
+ UInt64 structure_version = DynamicStructureSerializationVersion::Value::VariantTypeName; + writeBinaryLittleEndian(structure_version, *stream); + auto dynamic_state = std::make_shared(structure_version); + + dynamic_state->variant_type = variant_info.variant_type; + dynamic_state->variant_names = variant_info.variant_names; + const auto & variant_column = column_dynamic.getVariantColumn(); + + /// Write internal Variant type name. + writeStringBinary(dynamic_state->variant_type->getName(), *stream); + + /// Write statistics in prefix if needed. + if (settings.dynamic_write_statistics == SerializeBinaryBulkSettings::DynamicStatisticsMode::PREFIX) + { + const auto & statistics = column_dynamic.getStatistics(); + for (size_t i = 0; i != variant_info.variant_names.size(); ++i) + { + size_t size = 0; + /// Use statistics from column if it was created during merge. + if (statistics.data.empty() || statistics.source != ColumnDynamic::Statistics::Source::MERGE) + size = variant_column.getVariantByGlobalDiscriminator(i).size(); + /// Otherwise we can use only variant sizes from current column. + else + size = statistics.data.at(variant_info.variant_names[i]); + writeVarUInt(size, *stream); + } + } + + dynamic_state->variant_serialization = dynamic_state->variant_type->getDefaultSerialization(); + settings.path.push_back(Substream::DynamicData); + dynamic_state->variant_serialization->serializeBinaryBulkStatePrefix(variant_column, settings, dynamic_state->variant_state); + settings.path.pop_back(); + + state = std::move(dynamic_state); +} + +void SerializationDynamic::deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsDeserializeStatesCache * cache) const +{ + DeserializeBinaryBulkStatePtr structure_state = deserializeDynamicStructureStatePrefix(settings, cache); + if (!structure_state) + return; + + auto dynamic_state = std::make_shared(); + dynamic_state->structure_state = structure_state; + dynamic_state->variant_serialization = checkAndGetState(structure_state)->variant_type->getDefaultSerialization(); + + settings.path.push_back(Substream::DynamicData); + dynamic_state->variant_serialization->deserializeBinaryBulkStatePrefix(settings, dynamic_state->variant_state, cache); + settings.path.pop_back(); + + state = std::move(dynamic_state); +} + +ISerialization::DeserializeBinaryBulkStatePtr SerializationDynamic::deserializeDynamicStructureStatePrefix( + DeserializeBinaryBulkSettings & settings, SubstreamsDeserializeStatesCache * cache) +{ + settings.path.push_back(Substream::DynamicStructure); + + DeserializeBinaryBulkStatePtr state = nullptr; + if (auto cached_state = getFromSubstreamsDeserializeStatesCache(cache, settings.path)) + { + state = cached_state; + } + else if (auto * structure_stream = settings.getter(settings.path)) + { + /// Read structure serialization version. + UInt64 structure_version; + readBinaryLittleEndian(structure_version, *structure_stream); + auto structure_state = std::make_shared(structure_version); + /// Read internal Variant type name. + String data_type_name; + readStringBinary(data_type_name, *structure_stream); + structure_state->variant_type = DataTypeFactory::instance().get(data_type_name); + const auto * variant_type = typeid_cast(structure_state->variant_type.get()); + if (!variant_type) + throw Exception(ErrorCodes::INCORRECT_DATA, "Incorrect type of Dynamic nested column, expected Variant, got {}", structure_state->variant_type->getName()); + + /// Read statistics. 
+ if (settings.dynamic_read_statistics) + { + const auto & variants = variant_type->getVariants(); + size_t variant_size; + for (const auto & variant : variants) + { + readVarUInt(variant_size, *structure_stream); + structure_state->statistics.data[variant->getName()] = variant_size; + } + } + + state = structure_state; + addToSubstreamsDeserializeStatesCache(cache, settings.path, state); + } + + settings.path.pop_back(); + return state; +} + +void SerializationDynamic::serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, SerializeBinaryBulkStatePtr & state) const +{ + auto * dynamic_state = checkAndGetState(state); + settings.path.push_back(Substream::DynamicStructure); + auto * stream = settings.getter(settings.path); + settings.path.pop_back(); + + if (!stream) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Missing stream for Dynamic column structure during serialization of binary bulk state prefix"); + + /// Write statistics in suffix if needed. + if (settings.dynamic_write_statistics == SerializeBinaryBulkSettings::DynamicStatisticsMode::SUFFIX) + { + for (const auto & variant_name : dynamic_state->variant_names) + writeVarUInt(dynamic_state->statistics.data[variant_name], *stream); + } + + settings.path.push_back(Substream::DynamicData); + dynamic_state->variant_serialization->serializeBinaryBulkStateSuffix(settings, dynamic_state->variant_state); + settings.path.pop_back(); +} + +void SerializationDynamic::serializeBinaryBulkWithMultipleStreams( + const DB::IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + const auto & column_dynamic = assert_cast(column); + auto * dynamic_state = checkAndGetState(state); + const auto & variant_info = column_dynamic.getVariantInfo(); + const auto * variant_column = &column_dynamic.getVariantColumn(); + + if (!variant_info.variant_type->equals(*dynamic_state->variant_type)) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Mismatch of internal columns of Dynamic. Expected: {}, Got: {}", dynamic_state->variant_type->getName(), variant_info.variant_type->getName()); + + settings.path.push_back(Substream::DynamicData); + dynamic_state->variant_serialization->serializeBinaryBulkWithMultipleStreams(*variant_column, offset, limit, settings, dynamic_state->variant_state); + settings.path.pop_back(); +} + +void SerializationDynamic::deserializeBinaryBulkWithMultipleStreams( + DB::ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const +{ + if (!state) + return; + + auto mutable_column = column->assumeMutable(); + auto * dynamic_state = checkAndGetState(state); + auto * structure_state = checkAndGetState(dynamic_state->structure_state); + + if (mutable_column->empty()) + mutable_column = ColumnDynamic::create(structure_state->variant_type->createColumn(), structure_state->variant_type, max_dynamic_types, structure_state->statistics); + + auto & column_dynamic = assert_cast(*mutable_column); + const auto & variant_info = column_dynamic.getVariantInfo(); + if (!variant_info.variant_type->equals(*structure_state->variant_type)) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Mismatch of internal columns of Dynamic. 
Expected: {}, Got: {}", structure_state->variant_type->getName(), variant_info.variant_type->getName()); + + settings.path.push_back(Substream::DynamicData); + dynamic_state->variant_serialization->deserializeBinaryBulkWithMultipleStreams(column_dynamic.getVariantColumnPtr(), limit, settings, dynamic_state->variant_state, cache); + settings.path.pop_back(); + + column = std::move(mutable_column); +} + +void SerializationDynamic::serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings & settings) const +{ + UInt8 null_bit = field.isNull(); + writeBinary(null_bit, ostr); + if (null_bit) + return; + + auto field_type = applyVisitor(FieldToDataType(), field); + auto field_type_name = field_type->getName(); + writeVarUInt(field_type_name.size(), ostr); + writeString(field_type_name, ostr); + field_type->getDefaultSerialization()->serializeBinary(field, ostr, settings); +} + +void SerializationDynamic::deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings & settings) const +{ + UInt8 null_bit; + readBinary(null_bit, istr); + if (null_bit) + { + field = Null(); + return; + } + + size_t field_type_name_size; + readVarUInt(field_type_name_size, istr); + String field_type_name(field_type_name_size, 0); + istr.readStrict(field_type_name.data(), field_type_name_size); + auto field_type = DataTypeFactory::instance().get(field_type_name); + field_type->getDefaultSerialization()->deserializeBinary(field, istr, settings); +} + +void SerializationDynamic::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const auto & dynamic_column = assert_cast(column); + const auto & variant_info = dynamic_column.getVariantInfo(); + const auto & variant_column = dynamic_column.getVariantColumn(); + auto global_discr = variant_column.globalDiscriminatorAt(row_num); + + UInt8 null_bit = global_discr == ColumnVariant::NULL_DISCRIMINATOR; + writeBinary(null_bit, ostr); + if (null_bit) + return; + + const auto & variant_type = assert_cast(*variant_info.variant_type).getVariant(global_discr); + const auto & variant_type_name = variant_info.variant_names[global_discr]; + writeVarUInt(variant_type_name.size(), ostr); + writeString(variant_type_name, ostr); + variant_type->getDefaultSerialization()->serializeBinary(variant_column.getVariantByGlobalDiscriminator(global_discr), variant_column.offsetAt(row_num), ostr, settings); +} + +template +static void deserializeVariant( + ColumnVariant & variant_column, + const DataTypePtr & variant_type, + ColumnVariant::Discriminator global_discr, + ReadBuffer & istr, + DeserializeFunc deserialize) +{ + auto & variant = variant_column.getVariantByGlobalDiscriminator(global_discr); + deserialize(*variant_type->getDefaultSerialization(), variant, istr); + variant_column.getLocalDiscriminators().push_back(variant_column.localDiscriminatorByGlobal(global_discr)); + variant_column.getOffsets().push_back(variant.size() - 1); +} + +void SerializationDynamic::deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + auto & dynamic_column = assert_cast(column); + UInt8 null_bit; + readBinary(null_bit, istr); + if (null_bit) + { + dynamic_column.insertDefault(); + return; + } + + size_t variant_type_name_size; + readVarUInt(variant_type_name_size, istr); + String variant_type_name(variant_type_name_size, 0); + istr.readStrict(variant_type_name.data(), variant_type_name_size); + + const auto & variant_info = dynamic_column.getVariantInfo(); + auto it = 
variant_info.variant_name_to_discriminator.find(variant_type_name); + if (it != variant_info.variant_name_to_discriminator.end()) + { + const auto & variant_type = assert_cast(*variant_info.variant_type).getVariant(it->second); + deserializeVariant(dynamic_column.getVariantColumn(), variant_type, it->second, istr, [&settings](const ISerialization & serialization, IColumn & variant, ReadBuffer & buf){ serialization.deserializeBinary(variant, buf, settings); }); + return; + } + + /// We don't have this variant yet. Let's try to add it. + auto variant_type = DataTypeFactory::instance().get(variant_type_name); + if (dynamic_column.addNewVariant(variant_type)) + { + auto discr = variant_info.variant_name_to_discriminator.at(variant_type_name); + deserializeVariant(dynamic_column.getVariantColumn(), variant_type, discr, istr, [&settings](const ISerialization & serialization, IColumn & variant, ReadBuffer & buf){ serialization.deserializeBinary(variant, buf, settings); }); + return; + } + + /// We reached maximum number of variants and couldn't add new variant. + /// This case should be really rare in real use cases. + /// We should always be able to add String variant and insert value as String. + dynamic_column.addStringVariant(); + auto tmp_variant_column = variant_type->createColumn(); + variant_type->getDefaultSerialization()->deserializeBinary(*tmp_variant_column, istr, settings); + auto string_column = castColumn(ColumnWithTypeAndName(tmp_variant_column->getPtr(), variant_type, ""), std::make_shared()); + auto & variant_column = dynamic_column.getVariantColumn(); + variant_column.insertIntoVariantFrom(variant_info.variant_name_to_discriminator.at("String"), *string_column, 0); +} + +void SerializationDynamic::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const auto & dynamic_column = assert_cast(column); + dynamic_column.getVariantInfo().variant_type->getDefaultSerialization()->serializeTextCSV(dynamic_column.getVariantColumn(), row_num, ostr, settings); +} + +template +static void deserializeTextImpl( + IColumn & column, + ReadBuffer & istr, + const FormatSettings & settings, + ReadFieldFunc read_field, + FormatSettings::EscapingRule escaping_rule, + TryDeserializeVariantFunc try_deserialize_variant, + DeserializeVariant deserialize_variant) +{ + auto & dynamic_column = assert_cast(column); + auto & variant_column = dynamic_column.getVariantColumn(); + const auto & variant_info = dynamic_column.getVariantInfo(); + String field = read_field(istr); + auto field_buf = std::make_unique(field); + JSONInferenceInfo json_info; + auto variant_type = tryInferDataTypeByEscapingRule(field, settings, escaping_rule, &json_info); + if (escaping_rule == FormatSettings::EscapingRule::JSON) + transformFinalInferredJSONTypeIfNeeded(variant_type, settings, &json_info); + + if (checkIfTypeIsComplete(variant_type) && dynamic_column.addNewVariant(variant_type)) + { + auto discr = variant_info.variant_name_to_discriminator.at(variant_type->getName()); + deserializeVariant(dynamic_column.getVariantColumn(), variant_type, discr, *field_buf, deserialize_variant); + return; + } + + /// We couldn't infer type or add new variant. Try to insert field into current variants. + field_buf = std::make_unique(field); + if (try_deserialize_variant(*variant_info.variant_type->getDefaultSerialization(), variant_column, *field_buf)) + return; + + /// We couldn't insert field into any existing variant, add String variant and read value as String. 
+ dynamic_column.addStringVariant(); + + if (escaping_rule == FormatSettings::EscapingRule::Quoted && (field.size() < 2 || field.front() != '\'' || field.back() != '\'')) + field = "'" + field + "'"; + + field_buf = std::make_unique(field); + auto string_discr = variant_info.variant_name_to_discriminator.at("String"); + deserializeVariant(dynamic_column.getVariantColumn(), std::make_shared(), string_discr, *field_buf, deserialize_variant); +} + +void SerializationDynamic::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + auto read_field = [&settings](ReadBuffer & buf) + { + String field; + readCSVField(field, buf, settings.csv); + return field; + }; + + auto try_deserialize_variant = [&settings](const ISerialization & serialization, IColumn & col, ReadBuffer & buf) + { + return serialization.tryDeserializeTextCSV(col, buf, settings); + }; + + auto deserialize_variant = [&settings](const ISerialization & serialization, IColumn & col, ReadBuffer & buf) + { + serialization.deserializeTextCSV(col, buf, settings); + }; + + deserializeTextImpl(column, istr, settings, read_field, FormatSettings::EscapingRule::CSV, try_deserialize_variant, deserialize_variant); +} + +bool SerializationDynamic::tryDeserializeTextCSV(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const +{ + deserializeTextCSV(column, istr, settings); + return true; +} + +void SerializationDynamic::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const auto & dynamic_column = assert_cast(column); + dynamic_column.getVariantInfo().variant_type->getDefaultSerialization()->serializeTextEscaped(dynamic_column.getVariantColumn(), row_num, ostr, settings); +} + +void SerializationDynamic::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + auto read_field = [](ReadBuffer & buf) + { + String field; + readEscapedString(field, buf); + return field; + }; + + auto try_deserialize_variant = [&settings](const ISerialization & serialization, IColumn & col, ReadBuffer & buf) + { + return serialization.tryDeserializeTextEscaped(col, buf, settings); + }; + + auto deserialize_variant = [&settings](const ISerialization & serialization, IColumn & col, ReadBuffer & buf) + { + serialization.deserializeTextEscaped(col, buf, settings); + }; + + deserializeTextImpl(column, istr, settings, read_field, FormatSettings::EscapingRule::Escaped, try_deserialize_variant, deserialize_variant); +} + +bool SerializationDynamic::tryDeserializeTextEscaped(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const +{ + deserializeTextEscaped(column, istr, settings); + return true; +} + +void SerializationDynamic::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const auto & dynamic_column = assert_cast(column); + dynamic_column.getVariantInfo().variant_type->getDefaultSerialization()->serializeTextQuoted(dynamic_column.getVariantColumn(), row_num, ostr, settings); +} + +void SerializationDynamic::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + auto read_field = [](ReadBuffer & buf) + { + String field; + readQuotedField(field, buf); + return field; + }; + + auto try_deserialize_variant = [&settings](const ISerialization & serialization, IColumn & col, ReadBuffer & buf) + { + return 
serialization.tryDeserializeTextQuoted(col, buf, settings); + }; + + auto deserialize_variant = [&settings](const ISerialization & serialization, IColumn & col, ReadBuffer & buf) + { + serialization.deserializeTextQuoted(col, buf, settings); + }; + + deserializeTextImpl(column, istr, settings, read_field, FormatSettings::EscapingRule::Quoted, try_deserialize_variant, deserialize_variant); +} + +bool SerializationDynamic::tryDeserializeTextQuoted(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const +{ + deserializeTextQuoted(column, istr, settings); + return true; +} + +void SerializationDynamic::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const auto & dynamic_column = assert_cast(column); + dynamic_column.getVariantInfo().variant_type->getDefaultSerialization()->serializeTextJSON(dynamic_column.getVariantColumn(), row_num, ostr, settings); +} + +void SerializationDynamic::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + auto read_field = [&settings](ReadBuffer & buf) + { + String field; + readJSONField(field, buf, settings.json); + return field; + }; + + auto try_deserialize_variant = [&settings](const ISerialization & serialization, IColumn & col, ReadBuffer & buf) + { + return serialization.tryDeserializeTextJSON(col, buf, settings); + }; + + auto deserialize_variant = [&settings](const ISerialization & serialization, IColumn & col, ReadBuffer & buf) + { + serialization.deserializeTextJSON(col, buf, settings); + }; + + deserializeTextImpl(column, istr, settings, read_field, FormatSettings::EscapingRule::JSON, try_deserialize_variant, deserialize_variant); +} + +bool SerializationDynamic::tryDeserializeTextJSON(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const +{ + deserializeTextJSON(column, istr, settings); + return true; +} + +void SerializationDynamic::serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const auto & dynamic_column = assert_cast(column); + dynamic_column.getVariantInfo().variant_type->getDefaultSerialization()->serializeTextRaw(dynamic_column.getVariantColumn(), row_num, ostr, settings); +} + +void SerializationDynamic::deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + auto read_field = [](ReadBuffer & buf) + { + String field; + readString(field, buf); + return field; + }; + + auto try_deserialize_variant = [&settings](const ISerialization & serialization, IColumn & col, ReadBuffer & buf) + { + return serialization.tryDeserializeTextRaw(col, buf, settings); + }; + + auto deserialize_variant = [&settings](const ISerialization & serialization, IColumn & col, ReadBuffer & buf) + { + serialization.deserializeTextRaw(col, buf, settings); + }; + + deserializeTextImpl(column, istr, settings, read_field, FormatSettings::EscapingRule::Raw, try_deserialize_variant, deserialize_variant); +} + +bool SerializationDynamic::tryDeserializeTextRaw(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const +{ + deserializeTextRaw(column, istr, settings); + return true; +} + +void SerializationDynamic::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const auto & dynamic_column = assert_cast(column); + 
dynamic_column.getVariantInfo().variant_type->getDefaultSerialization()->serializeText(dynamic_column.getVariantColumn(), row_num, ostr, settings); +} + +void SerializationDynamic::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + auto read_field = [](ReadBuffer & buf) + { + String field; + readStringUntilEOF(field, buf); + return field; + }; + + auto try_deserialize_variant = [&settings](const ISerialization & serialization, IColumn & col, ReadBuffer & buf) + { + return serialization.tryDeserializeWholeText(col, buf, settings); + }; + + auto deserialize_variant = [&settings](const ISerialization & serialization, IColumn & col, ReadBuffer & buf) + { + serialization.deserializeWholeText(col, buf, settings); + }; + + deserializeTextImpl(column, istr, settings, read_field, FormatSettings::EscapingRule::Raw, try_deserialize_variant, deserialize_variant); +} + +bool SerializationDynamic::tryDeserializeWholeText(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const +{ + deserializeWholeText(column, istr, settings); + return true; +} + +void SerializationDynamic::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const auto & dynamic_column = assert_cast(column); + dynamic_column.getVariantInfo().variant_type->getDefaultSerialization()->serializeTextXML(dynamic_column.getVariantColumn(), row_num, ostr, settings); +} + +} diff --git a/src/DataTypes/Serializations/SerializationDynamic.h b/src/DataTypes/Serializations/SerializationDynamic.h new file mode 100644 index 00000000000..4803bc25d18 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationDynamic.h @@ -0,0 +1,116 @@ +#pragma once + +#include +#include + +namespace DB +{ + +class SerializationDynamicElement; + +class SerializationDynamic : public ISerialization +{ +public: + SerializationDynamic(size_t max_dynamic_types_) : max_dynamic_types(max_dynamic_types_) + { + } + + struct DynamicStructureSerializationVersion + { + enum Value + { + VariantTypeName = 1, + }; + + Value value; + + static void checkVersion(UInt64 version); + + explicit DynamicStructureSerializationVersion(UInt64 version); + }; + + void enumerateStreams( + EnumerateStreamsSettings & settings, + const StreamCallback & callback, + const SubstreamData & data) const override; + + void serializeBinaryBulkStatePrefix( + const IColumn & column, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsDeserializeStatesCache * cache) const override; + + static DeserializeBinaryBulkStatePtr deserializeDynamicStructureStatePrefix( + DeserializeBinaryBulkSettings & settings, + SubstreamsDeserializeStatesCache * cache); + + void serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const override; + + void serializeBinary(const Field & field, WriteBuffer & ostr, const 
FormatSettings & settings) const override; + void deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings & settings) const override; + + void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + + void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + + void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + + void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + + void serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + + void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + +private: + friend SerializationDynamicElement; + + struct DeserializeBinaryBulkStateDynamicStructure : public ISerialization::DeserializeBinaryBulkState + { + DynamicStructureSerializationVersion structure_version; + DataTypePtr variant_type; + ColumnDynamic::Statistics statistics = {.source = ColumnDynamic::Statistics::Source::READ}; + + explicit DeserializeBinaryBulkStateDynamicStructure(UInt64 structure_version_) : structure_version(structure_version_) {} + }; + + size_t max_dynamic_types; +}; + +} diff --git a/src/DataTypes/Serializations/SerializationDynamicElement.cpp b/src/DataTypes/Serializations/SerializationDynamicElement.cpp new file mode 100644 index 00000000000..386a6579519 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationDynamicElement.cpp @@ -0,0 +1,99 @@ +#include +#include +#include 
+#include
+#include
+#include
+#include
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int NOT_IMPLEMENTED;
+}
+
+void SerializationDynamicElement::enumerateStreams(
+    DB::ISerialization::EnumerateStreamsSettings & settings,
+    const DB::ISerialization::StreamCallback & callback,
+    const DB::ISerialization::SubstreamData &) const
+{
+    settings.path.push_back(Substream::DynamicStructure);
+    callback(settings.path);
+    settings.path.pop_back();
+
+    /// We don't know if we actually have this variant in the Dynamic column,
+    /// so we cannot enumerate variant streams.
+}
+
+void SerializationDynamicElement::serializeBinaryBulkStatePrefix(const IColumn &, SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const
+{
+    throw Exception(
+        ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinaryBulkStatePrefix is not implemented for SerializationDynamicElement");
+}
+
+void SerializationDynamicElement::serializeBinaryBulkStateSuffix(SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const
+{
+    throw Exception(
+        ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinaryBulkStateSuffix is not implemented for SerializationDynamicElement");
+}
+
+struct DeserializeBinaryBulkStateDynamicElement : public ISerialization::DeserializeBinaryBulkState
+{
+    ISerialization::DeserializeBinaryBulkStatePtr structure_state;
+    SerializationPtr variant_serialization;
+    ISerialization::DeserializeBinaryBulkStatePtr variant_element_state;
+};
+
+void SerializationDynamicElement::deserializeBinaryBulkStatePrefix(
+    DeserializeBinaryBulkSettings & settings, DeserializeBinaryBulkStatePtr & state, SubstreamsDeserializeStatesCache * cache) const
+{
+    DeserializeBinaryBulkStatePtr structure_state = SerializationDynamic::deserializeDynamicStructureStatePrefix(settings, cache);
+    if (!structure_state)
+        return;
+
+    auto dynamic_element_state = std::make_shared();
+    dynamic_element_state->structure_state = std::move(structure_state);
+    const auto & variant_type = checkAndGetState(dynamic_element_state->structure_state)->variant_type;
+    /// Check if we actually have the required element in the Variant.
+    if (auto global_discr = assert_cast(*variant_type).tryGetVariantDiscriminator(dynamic_element_name))
+    {
+        settings.path.push_back(Substream::DynamicData);
+        dynamic_element_state->variant_serialization = std::make_shared(nested_serialization, dynamic_element_name, *global_discr);
+        dynamic_element_state->variant_serialization->deserializeBinaryBulkStatePrefix(settings, dynamic_element_state->variant_element_state, cache);
+        settings.path.pop_back();
+    }
+
+    state = std::move(dynamic_element_state);
+}
+
+void SerializationDynamicElement::serializeBinaryBulkWithMultipleStreams(const IColumn &, size_t, size_t, SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const
+{
+    throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinaryBulkWithMultipleStreams is not implemented for SerializationDynamicElement");
+}
+
+void SerializationDynamicElement::deserializeBinaryBulkWithMultipleStreams(
+    ColumnPtr & result_column,
+    size_t limit,
+    DeserializeBinaryBulkSettings & settings,
+    DeserializeBinaryBulkStatePtr & state,
+    SubstreamsCache * cache) const
+{
+    auto * dynamic_element_state = checkAndGetState(state);
+
+    if (dynamic_element_state->variant_serialization)
+    {
+        settings.path.push_back(Substream::DynamicData);
+        dynamic_element_state->variant_serialization->deserializeBinaryBulkWithMultipleStreams(result_column, limit, settings, dynamic_element_state->variant_element_state, cache);
+        settings.path.pop_back();
+    }
+    else
+    {
+        auto mutable_column = result_column->assumeMutable();
+        mutable_column->insertManyDefaults(limit);
+        result_column = std::move(mutable_column);
+    }
+}
+
+}
diff --git a/src/DataTypes/Serializations/SerializationDynamicElement.h b/src/DataTypes/Serializations/SerializationDynamicElement.h
new file mode 100644
index 00000000000..9e4980e0a27
--- /dev/null
+++ b/src/DataTypes/Serializations/SerializationDynamicElement.h
@@ -0,0 +1,58 @@
+#pragma once
+
+#include
+
+namespace DB
+{
+
+
+/// Serialization for Dynamic element when we read it as a subcolumn.
+class SerializationDynamicElement final : public SerializationWrapper
+{
+private:
+    /// To be able to deserialize a Dynamic element as a subcolumn
+    /// we need its type name and global discriminator.
+ String dynamic_element_name; + +public: + SerializationDynamicElement(const SerializationPtr & nested_, const String & dynamic_element_name_) + : SerializationWrapper(nested_) + , dynamic_element_name(dynamic_element_name_) + { + } + + void enumerateStreams( + EnumerateStreamsSettings & settings, + const StreamCallback & callback, + const SubstreamData & data) const override; + + void serializeBinaryBulkStatePrefix( + const IColumn & column, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsDeserializeStatesCache * cache) const override; + + void serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const override; +}; + +} diff --git a/src/DataTypes/Serializations/SerializationInterval.cpp b/src/DataTypes/Serializations/SerializationInterval.cpp index 59086d8aef3..2157566895d 100644 --- a/src/DataTypes/Serializations/SerializationInterval.cpp +++ b/src/DataTypes/Serializations/SerializationInterval.cpp @@ -68,9 +68,9 @@ void SerializationInterval::deserializeBinaryBulk(IColumn & column, ReadBuffer & } void SerializationInterval::deserializeBinaryBulkStatePrefix( - DeserializeBinaryBulkSettings & settings, DeserializeBinaryBulkStatePtr & state) const + DeserializeBinaryBulkSettings & settings, DeserializeBinaryBulkStatePtr & state, SubstreamsDeserializeStatesCache * cache) const { - dispatch(&ISerialization::deserializeBinaryBulkStatePrefix, FormatSettings::IntervalOutputFormat::Numeric, settings, state); + dispatch(&ISerialization::deserializeBinaryBulkStatePrefix, FormatSettings::IntervalOutputFormat::Numeric, settings, state, cache); } diff --git a/src/DataTypes/Serializations/SerializationInterval.h b/src/DataTypes/Serializations/SerializationInterval.h index a4e6c204e4f..368aff4f0c3 100644 --- a/src/DataTypes/Serializations/SerializationInterval.h +++ b/src/DataTypes/Serializations/SerializationInterval.h @@ -34,7 +34,10 @@ public: void deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings & settings) const override; void deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const override; - void deserializeBinaryBulkStatePrefix(DeserializeBinaryBulkSettings & settings, DeserializeBinaryBulkStatePtr & state) const override; + void deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsDeserializeStatesCache * cache) const override; void deserializeBinaryBulkWithMultipleStreams( ColumnPtr & column, size_t limit, diff --git a/src/DataTypes/Serializations/SerializationLowCardinality.cpp b/src/DataTypes/Serializations/SerializationLowCardinality.cpp index 9efe05042ed..802da263d89 100644 --- a/src/DataTypes/Serializations/SerializationLowCardinality.cpp +++ 
b/src/DataTypes/Serializations/SerializationLowCardinality.cpp @@ -267,7 +267,8 @@ void SerializationLowCardinality::serializeBinaryBulkStateSuffix( void SerializationLowCardinality::deserializeBinaryBulkStatePrefix( DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const + DeserializeBinaryBulkStatePtr & state, + SubstreamsDeserializeStatesCache * /*cache*/) const { settings.path.push_back(Substream::DictionaryKeys); auto * stream = settings.getter(settings.path); diff --git a/src/DataTypes/Serializations/SerializationLowCardinality.h b/src/DataTypes/Serializations/SerializationLowCardinality.h index d2c3a95c702..aa64e956a64 100644 --- a/src/DataTypes/Serializations/SerializationLowCardinality.h +++ b/src/DataTypes/Serializations/SerializationLowCardinality.h @@ -33,7 +33,8 @@ public: void deserializeBinaryBulkStatePrefix( DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const override; + DeserializeBinaryBulkStatePtr & state, + SubstreamsDeserializeStatesCache * cache) const override; void serializeBinaryBulkWithMultipleStreams( const IColumn & column, diff --git a/src/DataTypes/Serializations/SerializationMap.cpp b/src/DataTypes/Serializations/SerializationMap.cpp index 7b6f87baf2e..dac4fbe88e0 100644 --- a/src/DataTypes/Serializations/SerializationMap.cpp +++ b/src/DataTypes/Serializations/SerializationMap.cpp @@ -420,9 +420,10 @@ void SerializationMap::serializeBinaryBulkStateSuffix( void SerializationMap::deserializeBinaryBulkStatePrefix( DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const + DeserializeBinaryBulkStatePtr & state, + SubstreamsDeserializeStatesCache * cache) const { - nested->deserializeBinaryBulkStatePrefix(settings, state); + nested->deserializeBinaryBulkStatePrefix(settings, state, cache); } diff --git a/src/DataTypes/Serializations/SerializationMap.h b/src/DataTypes/Serializations/SerializationMap.h index 3e27ef1b04a..cfcde445c1f 100644 --- a/src/DataTypes/Serializations/SerializationMap.h +++ b/src/DataTypes/Serializations/SerializationMap.h @@ -51,7 +51,8 @@ public: void deserializeBinaryBulkStatePrefix( DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const override; + DeserializeBinaryBulkStatePtr & state, + SubstreamsDeserializeStatesCache * cache) const override; void serializeBinaryBulkWithMultipleStreams( const IColumn & column, diff --git a/src/DataTypes/Serializations/SerializationNamed.cpp b/src/DataTypes/Serializations/SerializationNamed.cpp index 2792827e690..07f5f9ea7ed 100644 --- a/src/DataTypes/Serializations/SerializationNamed.cpp +++ b/src/DataTypes/Serializations/SerializationNamed.cpp @@ -54,10 +54,11 @@ void SerializationNamed::serializeBinaryBulkStateSuffix( void SerializationNamed::deserializeBinaryBulkStatePrefix( DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const + DeserializeBinaryBulkStatePtr & state, + SubstreamsDeserializeStatesCache * cache) const { addToPath(settings.path); - nested_serialization->deserializeBinaryBulkStatePrefix(settings, state); + nested_serialization->deserializeBinaryBulkStatePrefix(settings, state, cache); settings.path.pop_back(); } diff --git a/src/DataTypes/Serializations/SerializationNamed.h b/src/DataTypes/Serializations/SerializationNamed.h index 0633ba2ea6f..bb2161e40e6 100644 --- a/src/DataTypes/Serializations/SerializationNamed.h +++ b/src/DataTypes/Serializations/SerializationNamed.h @@ -36,7 +36,8 @@ public: void 
deserializeBinaryBulkStatePrefix( DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const override; + DeserializeBinaryBulkStatePtr & state, + SubstreamsDeserializeStatesCache * cache) const override; void serializeBinaryBulkWithMultipleStreams( const IColumn & column, diff --git a/src/DataTypes/Serializations/SerializationNullable.cpp b/src/DataTypes/Serializations/SerializationNullable.cpp index 4d31451f92d..477349f955d 100644 --- a/src/DataTypes/Serializations/SerializationNullable.cpp +++ b/src/DataTypes/Serializations/SerializationNullable.cpp @@ -95,10 +95,11 @@ void SerializationNullable::serializeBinaryBulkStateSuffix( void SerializationNullable::deserializeBinaryBulkStatePrefix( DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const + DeserializeBinaryBulkStatePtr & state, + SubstreamsDeserializeStatesCache * cache) const { settings.path.push_back(Substream::NullableElements); - nested->deserializeBinaryBulkStatePrefix(settings, state); + nested->deserializeBinaryBulkStatePrefix(settings, state, cache); settings.path.pop_back(); } diff --git a/src/DataTypes/Serializations/SerializationNullable.h b/src/DataTypes/Serializations/SerializationNullable.h index 37858ccdefd..f7d2d2eadf0 100644 --- a/src/DataTypes/Serializations/SerializationNullable.h +++ b/src/DataTypes/Serializations/SerializationNullable.h @@ -29,7 +29,8 @@ public: void deserializeBinaryBulkStatePrefix( DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const override; + DeserializeBinaryBulkStatePtr & state, + SubstreamsDeserializeStatesCache * cache) const override; void serializeBinaryBulkWithMultipleStreams( const IColumn & column, diff --git a/src/DataTypes/Serializations/SerializationObject.cpp b/src/DataTypes/Serializations/SerializationObject.cpp index 67bf7af7799..88244a89204 100644 --- a/src/DataTypes/Serializations/SerializationObject.cpp +++ b/src/DataTypes/Serializations/SerializationObject.cpp @@ -210,7 +210,8 @@ void SerializationObject::serializeBinaryBulkStateSuffix( template void SerializationObject::deserializeBinaryBulkStatePrefix( DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const + DeserializeBinaryBulkStatePtr & state, + SubstreamsDeserializeStatesCache * cache) const { checkSerializationIsSupported(settings); if (state) @@ -258,7 +259,7 @@ void SerializationObject::deserializeBinaryBulkStatePrefix( } settings.path.push_back(Substream::ObjectData); - state_object->nested_serialization->deserializeBinaryBulkStatePrefix(settings, state_object->nested_state); + state_object->nested_serialization->deserializeBinaryBulkStatePrefix(settings, state_object->nested_state, cache); settings.path.pop_back(); state = std::move(state_object); diff --git a/src/DataTypes/Serializations/SerializationObject.h b/src/DataTypes/Serializations/SerializationObject.h index 39e1c514640..4cb7d0ab6a8 100644 --- a/src/DataTypes/Serializations/SerializationObject.h +++ b/src/DataTypes/Serializations/SerializationObject.h @@ -41,7 +41,8 @@ public: void deserializeBinaryBulkStatePrefix( DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const override; + DeserializeBinaryBulkStatePtr & state, + SubstreamsDeserializeStatesCache * cache) const override; void serializeBinaryBulkWithMultipleStreams( const IColumn & column, diff --git a/src/DataTypes/Serializations/SerializationSparse.cpp b/src/DataTypes/Serializations/SerializationSparse.cpp index 
4d7514271ad..f9228069b90 100644 --- a/src/DataTypes/Serializations/SerializationSparse.cpp +++ b/src/DataTypes/Serializations/SerializationSparse.cpp @@ -152,7 +152,7 @@ void SerializationSparse::enumerateStreams( const StreamCallback & callback, const SubstreamData & data) const { - const auto * column_sparse = data.column ? &assert_cast(*data.column) : nullptr; + const auto * column_sparse = data.column ? typeid_cast(data.column.get()) : nullptr; size_t column_size = column_sparse ? column_sparse->size() : 0; settings.path.push_back(Substream::SparseOffsets); @@ -242,12 +242,13 @@ void SerializationSparse::serializeBinaryBulkStateSuffix( void SerializationSparse::deserializeBinaryBulkStatePrefix( DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const + DeserializeBinaryBulkStatePtr & state, + SubstreamsDeserializeStatesCache * cache) const { auto state_sparse = std::make_shared(); settings.path.push_back(Substream::SparseElements); - nested->deserializeBinaryBulkStatePrefix(settings, state_sparse->nested); + nested->deserializeBinaryBulkStatePrefix(settings, state_sparse->nested, cache); settings.path.pop_back(); state = std::move(state_sparse); diff --git a/src/DataTypes/Serializations/SerializationSparse.h b/src/DataTypes/Serializations/SerializationSparse.h index b1ed7b613f0..a55856bacf0 100644 --- a/src/DataTypes/Serializations/SerializationSparse.h +++ b/src/DataTypes/Serializations/SerializationSparse.h @@ -43,7 +43,8 @@ public: void deserializeBinaryBulkStatePrefix( DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const override; + DeserializeBinaryBulkStatePtr & state, + SubstreamsDeserializeStatesCache * cache) const override; /// Allows to write ColumnSparse and other columns in sparse serialization. 
void serializeBinaryBulkWithMultipleStreams( diff --git a/src/DataTypes/Serializations/SerializationTuple.cpp b/src/DataTypes/Serializations/SerializationTuple.cpp index 632a019d2d9..bb7c19aa78d 100644 --- a/src/DataTypes/Serializations/SerializationTuple.cpp +++ b/src/DataTypes/Serializations/SerializationTuple.cpp @@ -606,13 +606,14 @@ void SerializationTuple::serializeBinaryBulkStateSuffix( void SerializationTuple::deserializeBinaryBulkStatePrefix( DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const + DeserializeBinaryBulkStatePtr & state, + SubstreamsDeserializeStatesCache * cache) const { auto tuple_state = std::make_shared(); tuple_state->states.resize(elems.size()); for (size_t i = 0; i < elems.size(); ++i) - elems[i]->deserializeBinaryBulkStatePrefix(settings, tuple_state->states[i]); + elems[i]->deserializeBinaryBulkStatePrefix(settings, tuple_state->states[i], cache); state = std::move(tuple_state); } diff --git a/src/DataTypes/Serializations/SerializationTuple.h b/src/DataTypes/Serializations/SerializationTuple.h index d9c63a05217..810673d8b21 100644 --- a/src/DataTypes/Serializations/SerializationTuple.h +++ b/src/DataTypes/Serializations/SerializationTuple.h @@ -53,7 +53,8 @@ public: void deserializeBinaryBulkStatePrefix( DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const override; + DeserializeBinaryBulkStatePtr & state, + SubstreamsDeserializeStatesCache * cache) const override; void serializeBinaryBulkWithMultipleStreams( const IColumn & column, diff --git a/src/DataTypes/Serializations/SerializationVariant.cpp b/src/DataTypes/Serializations/SerializationVariant.cpp index 8ca86c63bf6..3fe26b773e3 100644 --- a/src/DataTypes/Serializations/SerializationVariant.cpp +++ b/src/DataTypes/Serializations/SerializationVariant.cpp @@ -123,7 +123,8 @@ void SerializationVariant::serializeBinaryBulkStateSuffix( void SerializationVariant::deserializeBinaryBulkStatePrefix( DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const + DeserializeBinaryBulkStatePtr & state, + SubstreamsDeserializeStatesCache * cache) const { auto variant_state = std::make_shared(); variant_state->states.resize(variants.size()); @@ -132,7 +133,7 @@ void SerializationVariant::deserializeBinaryBulkStatePrefix( for (size_t i = 0; i < variants.size(); ++i) { addVariantElementToPath(settings.path, i); - variants[i]->deserializeBinaryBulkStatePrefix(settings, variant_state->states[i]); + variants[i]->deserializeBinaryBulkStatePrefix(settings, variant_state->states[i], cache); settings.path.pop_back(); } diff --git a/src/DataTypes/Serializations/SerializationVariant.h b/src/DataTypes/Serializations/SerializationVariant.h index 3f53dcf1339..0de786f5561 100644 --- a/src/DataTypes/Serializations/SerializationVariant.h +++ b/src/DataTypes/Serializations/SerializationVariant.h @@ -59,7 +59,8 @@ public: void deserializeBinaryBulkStatePrefix( DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const override; + DeserializeBinaryBulkStatePtr & state, + SubstreamsDeserializeStatesCache * cache) const override; void serializeBinaryBulkWithMultipleStreams( const IColumn & column, diff --git a/src/DataTypes/Serializations/SerializationVariantElement.cpp b/src/DataTypes/Serializations/SerializationVariantElement.cpp index 7d4487fe6da..4f120ecac06 100644 --- a/src/DataTypes/Serializations/SerializationVariantElement.cpp +++ b/src/DataTypes/Serializations/SerializationVariantElement.cpp @@ -2,6 
+2,7 @@ #include #include #include +#include namespace DB { @@ -55,12 +56,13 @@ struct DeserializeBinaryBulkStateVariantElement : public ISerialization::Deseria ISerialization::DeserializeBinaryBulkStatePtr variant_element_state; }; -void SerializationVariantElement::deserializeBinaryBulkStatePrefix(DeserializeBinaryBulkSettings & settings, DeserializeBinaryBulkStatePtr & state) const +void SerializationVariantElement::deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, DeserializeBinaryBulkStatePtr & state, SubstreamsDeserializeStatesCache * cache) const { auto variant_element_state = std::make_shared(); addVariantToPath(settings.path); - nested_serialization->deserializeBinaryBulkStatePrefix(settings, variant_element_state->variant_element_state); + nested_serialization->deserializeBinaryBulkStatePrefix(settings, variant_element_state->variant_element_state, cache); removeVariantFromPath(settings.path); state = std::move(variant_element_state); @@ -80,6 +82,7 @@ void SerializationVariantElement::deserializeBinaryBulkWithMultipleStreams( { auto * variant_element_state = checkAndGetState(state); + size_t variant_limit = 0; /// First, deserialize discriminators from Variant column. settings.path.push_back(Substream::VariantDiscriminators); if (auto cached_discriminators = getFromSubstreamsCache(cache, settings.path)) @@ -96,17 +99,30 @@ void SerializationVariantElement::deserializeBinaryBulkWithMultipleStreams( if (!variant_element_state->discriminators || result_column->empty()) variant_element_state->discriminators = ColumnVariant::ColumnDiscriminators::create(); +// ColumnVariant::Discriminator discr; +// readBinaryLittleEndian(discr, *discriminators_stream); +// if (discr == ColumnVariant::NULL_DISCRIMINATOR) +// { SerializationNumber().deserializeBinaryBulk(*variant_element_state->discriminators->assumeMutable(), *discriminators_stream, limit, 0); +// } +// else +// { +// auto & discriminators_data = assert_cast(*variant_element_state->discriminators->assumeMutable()).getData(); +// discriminators_data.resize_fill(discriminators_data.size() + limit, discr); +// } + addToSubstreamsCache(cache, settings.path, variant_element_state->discriminators); } settings.path.pop_back(); - /// Iterate through new discriminators to calculate the limit for our variant. const auto & discriminators_data = assert_cast(*variant_element_state->discriminators).getData(); size_t discriminators_offset = variant_element_state->discriminators->size() - limit; - size_t variant_limit = 0; - for (size_t i = discriminators_offset; i != discriminators_data.size(); ++i) - variant_limit += (discriminators_data[i] == variant_discriminator); + /// Iterate through new discriminators to calculate the limit for our variant. + if (!variant_limit) + { + for (size_t i = discriminators_offset; i != discriminators_data.size(); ++i) + variant_limit += (discriminators_data[i] == variant_discriminator); + } /// Now we know the limit for our variant and can deserialize it. 
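An aside on the SerializationVariantElement.cpp hunk above (illustrative only, not part of the patch): after the discriminators are deserialized, the read limit for a single variant is derived by counting, among the rows appended by the current read, those whose discriminator equals that variant's global discriminator. A minimal self-contained sketch of that counting step follows; the Discriminator alias, the NULL_DISCRIMINATOR constant and the variantLimit helper are assumed names for illustration, not the ClickHouse API.

// Sketch: derive how many values must be read from one variant's data stream.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

using Discriminator = uint8_t;
constexpr Discriminator NULL_DISCRIMINATOR = 255; // stands in for ColumnVariant::NULL_DISCRIMINATOR

// Count how many of the last `limit` rows belong to the variant identified by `target`.
size_t variantLimit(const std::vector<Discriminator> & discriminators, size_t limit, Discriminator target)
{
    size_t offset = discriminators.size() - limit; // rows added by the current read
    size_t variant_limit = 0;
    for (size_t i = offset; i != discriminators.size(); ++i)
        variant_limit += (discriminators[i] == target);
    return variant_limit;
}

int main()
{
    // Five new rows: variant 0, NULL, variant 1, variant 0, variant 1.
    std::vector<Discriminator> discriminators = {0, NULL_DISCRIMINATOR, 1, 0, 1};
    std::cout << variantLimit(discriminators, 5, 1) << '\n'; // prints 2: only two values belong to variant 1
}

Rows carrying NULL_DISCRIMINATOR or another variant's discriminator contribute nothing, so the variant's own substream is advanced only by the returned count.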
diff --git a/src/DataTypes/Serializations/SerializationVariantElement.h b/src/DataTypes/Serializations/SerializationVariantElement.h index aafecf43d39..0ce0a72e250 100644 --- a/src/DataTypes/Serializations/SerializationVariantElement.h +++ b/src/DataTypes/Serializations/SerializationVariantElement.h @@ -43,7 +43,8 @@ public: void deserializeBinaryBulkStatePrefix( DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const override; + DeserializeBinaryBulkStatePtr & state, + SubstreamsDeserializeStatesCache * cache) const override; void serializeBinaryBulkWithMultipleStreams( const IColumn & column, @@ -59,12 +60,6 @@ public: DeserializeBinaryBulkStatePtr & state, SubstreamsCache * cache) const override; -private: - friend SerializationVariant; - - void addVariantToPath(SubstreamPath & path) const; - void removeVariantFromPath(SubstreamPath & path) const; - struct VariantSubcolumnCreator : public ISubcolumnCreator { const ColumnPtr local_discriminators; @@ -82,6 +77,11 @@ private: ColumnPtr create(const ColumnPtr & prev) const override; SerializationPtr create(const SerializationPtr & prev) const override; }; +private: + friend SerializationVariant; + + void addVariantToPath(SubstreamPath & path) const; + void removeVariantFromPath(SubstreamPath & path) const; }; } diff --git a/src/DataTypes/Serializations/SerializationWrapper.cpp b/src/DataTypes/Serializations/SerializationWrapper.cpp index bde52bb8096..ecef533d7e0 100644 --- a/src/DataTypes/Serializations/SerializationWrapper.cpp +++ b/src/DataTypes/Serializations/SerializationWrapper.cpp @@ -29,9 +29,10 @@ void SerializationWrapper::serializeBinaryBulkStateSuffix( void SerializationWrapper::deserializeBinaryBulkStatePrefix( DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const + DeserializeBinaryBulkStatePtr & state, + SubstreamsDeserializeStatesCache * cache) const { - nested_serialization->deserializeBinaryBulkStatePrefix(settings, state); + nested_serialization->deserializeBinaryBulkStatePrefix(settings, state, cache); } void SerializationWrapper::serializeBinaryBulkWithMultipleStreams( diff --git a/src/DataTypes/Serializations/SerializationWrapper.h b/src/DataTypes/Serializations/SerializationWrapper.h index 6c5e2046062..882f17bba0a 100644 --- a/src/DataTypes/Serializations/SerializationWrapper.h +++ b/src/DataTypes/Serializations/SerializationWrapper.h @@ -36,7 +36,8 @@ public: void deserializeBinaryBulkStatePrefix( DeserializeBinaryBulkSettings & settings, - DeserializeBinaryBulkStatePtr & state) const override; + DeserializeBinaryBulkStatePtr & state, + SubstreamsDeserializeStatesCache * cache) const override; void serializeBinaryBulkWithMultipleStreams( const IColumn & column, diff --git a/src/DataTypes/Serializations/tests/gtest_object_serialization.cpp b/src/DataTypes/Serializations/tests/gtest_object_serialization.cpp index fc7432d5bf6..c6337a31fce 100644 --- a/src/DataTypes/Serializations/tests/gtest_object_serialization.cpp +++ b/src/DataTypes/Serializations/tests/gtest_object_serialization.cpp @@ -49,7 +49,7 @@ TEST(SerializationObject, FromString) settings.position_independent_encoding = false; settings.getter = [&in](const auto &) { return ∈ }; - serialization->deserializeBinaryBulkStatePrefix(settings, state); + serialization->deserializeBinaryBulkStatePrefix(settings, state, nullptr); serialization->deserializeBinaryBulkWithMultipleStreams(result_column, column_string->size(), settings, state, nullptr); } diff --git a/src/DataTypes/Utils.cpp 
b/src/DataTypes/Utils.cpp index 2f29d57d454..e7e69e379af 100644 --- a/src/DataTypes/Utils.cpp +++ b/src/DataTypes/Utils.cpp @@ -224,6 +224,7 @@ bool canBeSafelyCasted(const DataTypePtr & from_type, const DataTypePtr & to_typ case TypeIndex::Nothing: case TypeIndex::JSONPaths: case TypeIndex::Variant: + case TypeIndex::Dynamic: return false; } diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 59b3e52e139..330bc28be61 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -929,6 +929,7 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep query_context->setSetting("allow_experimental_hash_functions", 1); query_context->setSetting("allow_experimental_object_type", 1); query_context->setSetting("allow_experimental_variant_type", 1); + query_context->setSetting("allow_experimental_dynamic_type", 1); query_context->setSetting("allow_experimental_annoy_index", 1); query_context->setSetting("allow_experimental_usearch_index", 1); query_context->setSetting("allow_experimental_bigint_types", 1); diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index 5b7995e0da2..deff44a0d9b 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -43,9 +43,9 @@ struct FormatSettings String column_names_for_schema_inference{}; String schema_inference_hints{}; - bool try_infer_integers = false; - bool try_infer_dates = false; - bool try_infer_datetimes = false; + bool try_infer_integers = true; + bool try_infer_dates = true; + bool try_infer_datetimes = true; bool try_infer_exponent_floats = false; enum class DateTimeInputFormat diff --git a/src/Formats/NativeReader.cpp b/src/Formats/NativeReader.cpp index 8286b24d0a6..39915b0735e 100644 --- a/src/Formats/NativeReader.cpp +++ b/src/Formats/NativeReader.cpp @@ -93,7 +93,7 @@ void NativeReader::readData(const ISerialization & serialization, ColumnPtr & co ISerialization::DeserializeBinaryBulkStatePtr state; - serialization.deserializeBinaryBulkStatePrefix(settings, state); + serialization.deserializeBinaryBulkStatePrefix(settings, state, nullptr); serialization.deserializeBinaryBulkWithMultipleStreams(column, rows, settings, state, nullptr); if (column->size() != rows) diff --git a/src/Functions/FunctionsConversion.cpp b/src/Functions/FunctionsConversion.cpp index 7049ca44110..75f8979e727 100644 --- a/src/Functions/FunctionsConversion.cpp +++ b/src/Functions/FunctionsConversion.cpp @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -39,6 +40,7 @@ #include #include #include +#include #include #include #include @@ -62,6 +64,7 @@ #include #include +#include namespace DB { @@ -1815,6 +1818,7 @@ struct ConvertImpl /// Generic conversion of any type from String. Used for complex types: Array and Tuple or types with custom serialization. 
+template struct ConvertImplGenericFromString { static ColumnPtr execute(ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable * column_nullable, size_t input_rows_count) @@ -1854,29 +1858,34 @@ struct ConvertImplGenericFromString { serialization_from.deserializeWholeText(column_to, read_buffer, format_settings); } - catch (const Exception & e) + catch (const Exception &) { - auto * nullable_column = typeid_cast(&column_to); - if (e.code() == ErrorCodes::CANNOT_PARSE_BOOL && nullable_column) - { - auto & col_nullmap = nullable_column->getNullMapData(); - if (col_nullmap.size() != nullable_column->size()) - col_nullmap.resize_fill(nullable_column->size()); - if (nullable_column->size() == (i + 1)) - nullable_column->popBack(1); - nullable_column->insertDefault(); - continue; - } - throw; + if constexpr (throw_on_error) + throw; + /// Check if exception happened after we inserted the value + /// (deserializeWholeText should not do it, but let's check anyway). + if (column_to.size() > i) + column_to.popBack(column_to.size() - i); + column_to.insertDefault(); } + /// Usually deserializeWholeText checks for eof after parsing, but let's check one more time just in case. if (!read_buffer.eof()) { - if (result_type) - throwExceptionForIncompletelyParsedValue(read_buffer, *result_type); + if constexpr (throw_on_error) + { + if (result_type) + throwExceptionForIncompletelyParsedValue(read_buffer, *result_type); + else + throw Exception( + ErrorCodes::CANNOT_PARSE_TEXT, "Cannot parse string to column {}. Expected eof", column_to.getName()); + } else - throw Exception(ErrorCodes::CANNOT_PARSE_TEXT, - "Cannot parse string to column {}. Expected eof", column_to.getName()); + { + if (column_to.size() > i) + column_to.popBack(column_to.size() - i); + column_to.insertDefault(); + } } } } @@ -3279,7 +3288,9 @@ private: { if (checkAndGetDataType(from_type.get())) { - return &ConvertImplGenericFromString::execute; + if (cast_type == CastType::accurateOrNull) + return &ConvertImplGenericFromString::execute; + return &ConvertImplGenericFromString::execute; } return createWrapper(from_type, to_type, requested_result_is_nullable); @@ -3442,7 +3453,7 @@ private: /// Conversion from String through parsing. if (checkAndGetDataType(from_type_untyped.get())) { - return &ConvertImplGenericFromString::execute; + return &ConvertImplGenericFromString::execute; } else if (const auto * agg_type = checkAndGetDataType(from_type_untyped.get())) { @@ -3485,7 +3496,7 @@ private: /// Conversion from String through parsing. if (checkAndGetDataType(from_type_untyped.get())) { - return &ConvertImplGenericFromString::execute; + return &ConvertImplGenericFromString::execute; } DataTypePtr from_type_holder; @@ -3576,7 +3587,7 @@ private: /// Conversion from String through parsing. 
if (checkAndGetDataType(from_type_untyped.get())) { - return &ConvertImplGenericFromString::execute; + return &ConvertImplGenericFromString::execute; } const auto * from_type = checkAndGetDataType(from_type_untyped.get()); @@ -3921,7 +3932,7 @@ private: { return [] (ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable * nullable_source, size_t input_rows_count) { - auto res = ConvertImplGenericFromString::execute(arguments, result_type, nullable_source, input_rows_count)->assumeMutable(); + auto res = ConvertImplGenericFromString::execute(arguments, result_type, nullable_source, input_rows_count)->assumeMutable(); res->finalize(); return res; }; @@ -4089,7 +4100,7 @@ private: }; } - auto variant_discr_opt = to_variant.tryGetVariantDiscriminator(*removeNullableOrLowCardinalityNullable(from_type)); + auto variant_discr_opt = to_variant.tryGetVariantDiscriminator(removeNullableOrLowCardinalityNullable(from_type)->getName()); if (!variant_discr_opt) throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "Cannot convert type {} to {}. Conversion to Variant allowed only for types from this Variant", from_type->getName(), to_variant.getName()); @@ -4197,6 +4208,293 @@ private: return createColumnToVariantWrapper(from_type, assert_cast(*to_type)); } + WrapperType createDynamicToColumnWrapper(const DataTypePtr & to_type) const + { + return [this, to_type] + (ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable * col_nullable, size_t input_rows_count) -> ColumnPtr + { + const auto & column_dynamic = assert_cast(*arguments.front().column.get()); + const auto & variant_info = column_dynamic.getVariantInfo(); + auto variant_wrapper = createVariantToColumnWrapper(assert_cast(*variant_info.variant_type), to_type); + ColumnsWithTypeAndName args = {ColumnWithTypeAndName(column_dynamic.getVariantColumnPtr(), variant_info.variant_type, "")}; + return variant_wrapper(args, result_type, col_nullable, input_rows_count); + }; + } + + WrapperType createStringToDynamicThroughParsingWrapper() const + { + return [&](ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable *, size_t input_rows_count) -> ColumnPtr + { + auto column = arguments[0].column->convertToFullColumnIfLowCardinality(); + auto args = arguments; + args[0].column = column; + + const ColumnNullable * column_nullable = nullptr; + if (isColumnNullable(*args[0].column)) + { + column_nullable = assert_cast(args[0].column.get()); + args[0].column = column_nullable->getNestedColumnPtr(); + } + + args[0].type = removeNullable(removeLowCardinality(args[0].type)); + + if (cast_type == CastType::accurateOrNull) + return ConvertImplGenericFromString::execute(args, result_type, column_nullable, input_rows_count); + return ConvertImplGenericFromString::execute(args, result_type, column_nullable, input_rows_count); + }; + } + + std::pair getReducedVariant( + const ColumnVariant & variant_column, + const DataTypePtr & variant_type, + const std::unordered_map & variant_name_to_discriminator, + size_t max_result_num_variants, + const ColumnDynamic::Statistics & statistics = {}) const + { + LOG_DEBUG(getLogger("FunctionsConversion"), "getReducedVariant for variant {} with size {}", variant_type->getName(), variant_column.size()); + + const auto & variant_types = assert_cast(*variant_type).getVariants(); + /// First check if we don't exceed the limit in current Variant column. 
+ if (variant_types.size() < max_result_num_variants || (variant_types.size() == max_result_num_variants && variant_name_to_discriminator.contains("String"))) + return {variant_column.getPtr(), variant_type}; + + /// We want to keep the most frequent variants and convert to string the rarest. + std::vector> variant_sizes; + variant_sizes.reserve(variant_types.size()); + std::optional old_string_discriminator; + /// List of variants that should be converted to a single String variant. + std::vector variants_to_convert_to_string; + for (size_t i = 0; i != variant_types.size(); ++i) + { + /// String variant won't be removed. + String variant_name = variant_types[i]->getName(); + LOG_DEBUG(getLogger("FunctionsConversion"), "Variant {}/{} size: {}, statistics: {}", variant_name, i, variant_column.getVariantByGlobalDiscriminator(i).size(), statistics.data.contains(variant_name) ? toString(statistics.data.at(variant_name)) : "none"); + + if (variant_name == "String") + { + old_string_discriminator = i; + /// For simplicity, add this variant to the list that will be converted string, + /// so we will process it with other variants when constructing the new String variant. + variants_to_convert_to_string.push_back(i); + } + else + { + size_t size = 0; + if (statistics.data.empty()) + size = variant_column.getVariantByGlobalDiscriminator(i).size(); + else + size = statistics.data.at(variant_name); + variant_sizes.emplace_back(size, i); + } + } + + /// Sort variants by sizes, so we will keep the most frequent. + std::sort(variant_sizes.begin(), variant_sizes.end(), std::greater()); + + DataTypes remaining_variants; + remaining_variants.reserve(max_result_num_variants); + /// Add String variant in advance. + remaining_variants.push_back(std::make_shared()); + for (auto [_, discr] : variant_sizes) + { + if (remaining_variants.size() != max_result_num_variants) + remaining_variants.push_back(variant_types[discr]); + else + variants_to_convert_to_string.push_back(discr); + } + + auto reduced_variant = std::make_shared(remaining_variants); + const auto & new_variants = reduced_variant->getVariants(); + /// To construct reduced variant column we will need mapping from old to new discriminators. + std::vector old_to_new_discriminators_mapping; + old_to_new_discriminators_mapping.resize(variant_types.size()); + ColumnVariant::Discriminator string_variant_discriminator = 0; + for (size_t i = 0; i != new_variants.size(); ++i) + { + String variant_name = new_variants[i]->getName(); + if (variant_name == "String") + { + string_variant_discriminator = i; + for (auto discr : variants_to_convert_to_string) + old_to_new_discriminators_mapping[discr] = i; + } + else + { + auto old_discr = variant_name_to_discriminator.at(variant_name); + old_to_new_discriminators_mapping[old_discr] = i; + } + } + + /// Convert all reduced variants to String. 
+ std::unordered_map variants_converted_to_string; + variants_converted_to_string.reserve(variants_to_convert_to_string.size()); + size_t string_variant_size = 0; + for (auto discr : variants_to_convert_to_string) + { + auto string_type = std::make_shared(); + auto string_wrapper = prepareUnpackDictionaries(variant_types[discr], string_type); + LOG_DEBUG(getLogger("FunctionsConversion"), "Convert variant {} with size {} to String", variant_types[discr]->getName(), variant_column.getVariantPtrByGlobalDiscriminator(discr)->size()); + auto column_to_convert = ColumnWithTypeAndName(variant_column.getVariantPtrByGlobalDiscriminator(discr), variant_types[discr], ""); + ColumnsWithTypeAndName args = {column_to_convert}; + auto variant_string_column = string_wrapper(args, string_type, nullptr, column_to_convert.column->size()); + LOG_DEBUG(getLogger("FunctionsConversion"), "Got String column with size {}", variant_string_column->size()); + string_variant_size += variant_string_column->size(); + variants_converted_to_string[discr] = variant_string_column; + } + + /// Create new discriminators and offsets and fill new String variant according to old discriminators. + auto string_variant = ColumnString::create(); + string_variant->reserve(string_variant_size); + auto new_discriminators_column = variant_column.getLocalDiscriminatorsPtr()->cloneEmpty(); + auto & new_discriminators_data = assert_cast(*new_discriminators_column).getData(); + new_discriminators_data.reserve(variant_column.size()); + auto new_offsets = variant_column.getOffsetsPtr()->cloneEmpty(); + auto & new_offsets_data = assert_cast(*new_offsets).getData(); + new_offsets_data.reserve(variant_column.size()); + const auto & old_local_discriminators = variant_column.getLocalDiscriminators(); + const auto & old_offsets = variant_column.getOffsets(); + LOG_DEBUG(getLogger("FunctionsConversion"), "Discriminators size: {}. Offsets size: {}", old_local_discriminators.size(), old_offsets.size()); + for (size_t i = 0; i != old_local_discriminators.size(); ++i) + { + auto old_discr = variant_column.globalDiscriminatorByLocal(old_local_discriminators[i]); + LOG_DEBUG(getLogger("FunctionsConversion"), "Row {}, discriminator {}", i, UInt64(old_discr)); + + if (old_discr == ColumnVariant::NULL_DISCRIMINATOR) + { + new_discriminators_data.push_back(ColumnVariant::NULL_DISCRIMINATOR); + new_offsets_data.push_back(0); + continue; + } + + auto new_discr = old_to_new_discriminators_mapping[old_discr]; + new_discriminators_data.push_back(new_discr); + if (new_discr != string_variant_discriminator) + { + LOG_DEBUG(getLogger("FunctionsConversion"), "Keep variant {}", UInt64(old_discr)); + new_offsets_data.push_back(old_offsets[i]); + } + else + { + LOG_DEBUG(getLogger("FunctionsConversion"), "Get string value of variant {} with String column with size {} at offset {}", UInt64(old_discr), variants_converted_to_string[old_discr]->size(), old_offsets[i]); + new_offsets_data.push_back(string_variant->size()); + string_variant->insertFrom(*variants_converted_to_string[old_discr], old_offsets[i]); + } + } + + /// Create new list of variant columns. 
+ Columns new_variant_columns; + new_variant_columns.resize(new_variants.size()); + for (size_t i = 0; i != variant_types.size(); ++i) + { + auto new_discr = old_to_new_discriminators_mapping[i]; + if (new_discr != string_variant_discriminator) + new_variant_columns[new_discr] = variant_column.getVariantPtrByGlobalDiscriminator(i); + } + new_variant_columns[string_variant_discriminator] = std::move(string_variant); + return {ColumnVariant::create(std::move(new_discriminators_column), std::move(new_offsets), new_variant_columns), reduced_variant}; + } + + WrapperType createVariantToDynamicWrapper(const DataTypePtr & from_type, const DataTypeDynamic & dynamic_type) const + { + const auto & from_variant_type = assert_cast(*from_type); + size_t max_dynamic_types = dynamic_type.getMaxDynamicTypes(); + const auto & variants = from_variant_type.getVariants(); + std::unordered_map variant_name_to_discriminator; + variant_name_to_discriminator.reserve(variants.size()); + for (size_t i = 0; i != variants.size(); ++i) + variant_name_to_discriminator[variants[i]->getName()] = i; + + return [from_type, max_dynamic_types, variant_name_to_discriminator, this] + (ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable *, size_t) -> ColumnPtr + { + const auto & variant_column = assert_cast(*arguments.front().column); + auto [reduced_variant_column, reduced_variant_type] = getReducedVariant(variant_column, from_type, variant_name_to_discriminator, max_dynamic_types); + return ColumnDynamic::create(reduced_variant_column, reduced_variant_type, max_dynamic_types); + }; + } + + WrapperType createColumnToDynamicWrapper(const DataTypePtr & from_type, const DataTypeDynamic & dynamic_type) const + { + if (const auto * variant_type = typeid_cast(from_type.get())) + return createVariantToDynamicWrapper(from_type, dynamic_type); + + if (dynamic_type.getMaxDynamicTypes() == 1) + { + DataTypePtr string_type = std::make_shared(); + if (from_type->isNullable()) + string_type = makeNullable(string_type); + auto string_wrapper = prepareUnpackDictionaries(from_type, string_type); + auto variant_type = std::make_shared(DataTypes{removeNullable(string_type)}); + auto variant_wrapper = createColumnToVariantWrapper(string_type, *variant_type); + return [string_wrapper, variant_wrapper, string_type, variant_type, max_dynamic_types=dynamic_type.getMaxDynamicTypes()] + (ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable * col_nullable, size_t input_rows_count) -> ColumnPtr + { + auto string_column = string_wrapper(arguments, string_type, col_nullable, input_rows_count); + auto column = ColumnWithTypeAndName(string_column, string_type, ""); + ColumnsWithTypeAndName args = {column}; + auto variant_column = variant_wrapper(args, variant_type, nullptr, string_column->size()); + return ColumnDynamic::create(variant_column, variant_type, max_dynamic_types); + }; + } + + if (context && context->getSettingsRef().cast_string_to_dynamic_use_inference && isStringOrFixedString(removeNullable(removeLowCardinality(from_type)))) + return createStringToDynamicThroughParsingWrapper(); + + auto variant_type = std::make_shared(DataTypes{removeNullableOrLowCardinalityNullable(from_type)}); + auto variant_wrapper = createColumnToVariantWrapper(from_type, *variant_type); + return [variant_wrapper, variant_type, max_dynamic_types=dynamic_type.getMaxDynamicTypes()] + (ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable * col_nullable, size_t input_rows_count) -> ColumnPtr + { + 
auto variant_res = variant_wrapper(arguments, variant_type, col_nullable, input_rows_count); + return ColumnDynamic::create(variant_res, variant_type, max_dynamic_types); + }; + } + + WrapperType createDynamicToDynamicWrapper(const DataTypeDynamic & from_dynamic, const DataTypeDynamic & to_dynamic) const + { + size_t from_max_types = from_dynamic.getMaxDynamicTypes(); + size_t to_max_types = to_dynamic.getMaxDynamicTypes(); + if (from_max_types == to_max_types) + return createIdentityWrapper(from_dynamic.getPtr()); + + if (to_max_types > from_max_types) + { + return [to_max_types] + (ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable *, size_t) -> ColumnPtr + { + const auto & column_dynamic = assert_cast(*arguments[0].column); + return ColumnDynamic::create(column_dynamic.getVariantColumnPtr(), column_dynamic.getVariantInfo(), to_max_types); + }; + } + + return [to_max_types, this] + (ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable *, size_t) -> ColumnPtr + { + const auto & column_dynamic = assert_cast(*arguments[0].column); + auto [reduced_variant_column, reduced_variant_type] = getReducedVariant( + column_dynamic.getVariantColumn(), + column_dynamic.getVariantInfo().variant_type, + column_dynamic.getVariantInfo().variant_name_to_discriminator, + to_max_types, + column_dynamic.getStatistics()); + return ColumnDynamic::create(reduced_variant_column, reduced_variant_type, to_max_types); + }; + } + + /// Wrapper for conversion to/from Dynamic type + WrapperType createDynamicWrapper(const DataTypePtr & from_type, const DataTypePtr & to_type) const + { + if (const auto * from_dynamic = checkAndGetDataType(from_type.get())) + { + if (const auto * to_dynamic = checkAndGetDataType(to_type.get())) + return createDynamicToDynamicWrapper(*from_dynamic, *to_dynamic); + + return createDynamicToColumnWrapper(to_type); + } + + return createColumnToDynamicWrapper(from_type, *checkAndGetDataType(to_type.get())); + } + template WrapperType createEnumWrapper(const DataTypePtr & from_type, const DataTypeEnum * to_type) const { @@ -4376,8 +4674,11 @@ private: WrapperType prepareUnpackDictionaries(const DataTypePtr & from_type, const DataTypePtr & to_type) const { - /// Conversion from/to Variant data type is processed in a special way. + /// Conversion from/to Variant/Dynamic data type is processed in a special way. /// We don't need to remove LowCardinality/Nullable. 
+ if (isDynamic(to_type) || isDynamic(from_type)) + return createDynamicWrapper(from_type, to_type); + if (isVariant(to_type) || isVariant(from_type)) return createVariantWrapper(from_type, to_type); @@ -4691,7 +4992,7 @@ private: if (to_type->getCustomSerialization() && to_type->getCustomName()) { - ret = [requested_result_is_nullable]( + ret = [requested_result_is_nullable, this]( ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable * column_nullable, @@ -4700,7 +5001,10 @@ private: auto wrapped_result_type = result_type; if (requested_result_is_nullable) wrapped_result_type = makeNullable(result_type); - return ConvertImplGenericFromString::execute( + if (this->cast_type == CastType::accurateOrNull) + return ConvertImplGenericFromString::execute( + arguments, wrapped_result_type, column_nullable, input_rows_count); + return ConvertImplGenericFromString::execute( arguments, wrapped_result_type, column_nullable, input_rows_count); }; return true; diff --git a/src/Functions/dynamicElement.cpp b/src/Functions/dynamicElement.cpp new file mode 100644 index 00000000000..964c058776e --- /dev/null +++ b/src/Functions/dynamicElement.cpp @@ -0,0 +1,172 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int ILLEGAL_TYPE_OF_ARGUMENT; + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int BAD_ARGUMENTS; +} + +namespace +{ + +/** Extract element of Dynamic by type name. + * Also the function looks through Arrays: you can get Array of Dynamic elements from Array of Dynamic. + */ +class FunctionDynamicElement : public IFunction +{ +public: + static constexpr auto name = "dynamicElement"; + + static FunctionPtr create(ContextPtr) { return std::make_shared(); } + String getName() const override { return name; } + size_t getNumberOfArguments() const override { return 2; } + bool useDefaultImplementationForConstants() const override { return true; } + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; } + bool useDefaultImplementationForNulls() const override { return false; } + bool useDefaultImplementationForLowCardinalityColumns() const override { return false; } + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } + + DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override + { + const size_t number_of_arguments = arguments.size(); + + if (number_of_arguments != 2) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Number of arguments for function {} doesn't match: passed {}, should be 2", + getName(), number_of_arguments); + + size_t count_arrays = 0; + const IDataType * input_type = arguments[0].type.get(); + while (const DataTypeArray * array = checkAndGetDataType(input_type)) + { + input_type = array->getNestedType().get(); + ++count_arrays; + } + + if (!isDynamic(*input_type)) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "First argument for function {} must be Variant or Array of Variant. 
Actual {}", + getName(), + arguments[0].type->getName()); + + auto return_type = makeNullableOrLowCardinalityNullableSafe(getRequestedElementType(arguments[1].column)); + + for (; count_arrays; --count_arrays) + return_type = std::make_shared(return_type); + + return return_type; + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override + { + const auto & input_arg = arguments[0]; + const IDataType * input_type = input_arg.type.get(); + const IColumn * input_col = input_arg.column.get(); + + bool input_arg_is_const = false; + if (typeid_cast(input_col)) + { + input_col = assert_cast(input_col)->getDataColumnPtr().get(); + input_arg_is_const = true; + } + + Columns array_offsets; + while (const DataTypeArray * array_type = checkAndGetDataType(input_type)) + { + const ColumnArray * array_col = assert_cast(input_col); + + input_type = array_type->getNestedType().get(); + input_col = &array_col->getData(); + array_offsets.push_back(array_col->getOffsetsPtr()); + } + + const ColumnDynamic * input_col_as_dynamic = checkAndGetColumn(input_col); + if (!input_col_as_dynamic) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "First argument for function {} must be Dynamic or array of Dynamics. Actual {}", getName(), input_arg.type->getName()); + + auto element_type = getRequestedElementType(arguments[1].column); + const auto & variant_info = input_col_as_dynamic->getVariantInfo(); + auto it = variant_info.variant_name_to_discriminator.find(element_type->getName()); + if (it == variant_info.variant_name_to_discriminator.end()) + { + auto result_type = makeNullableOrLowCardinalityNullableSafe(element_type); + auto result_column = result_type->createColumn(); + result_column->insertManyDefaults(input_rows_count); + return wrapInArraysAndConstIfNeeded(std::move(result_column), array_offsets, input_arg_is_const, input_rows_count); + } + + const auto & variant_column = input_col_as_dynamic->getVariantColumn(); + auto subcolumn_creator = SerializationVariantElement::VariantSubcolumnCreator(variant_column.getLocalDiscriminatorsPtr(), element_type->getName(), it->second, variant_column.localDiscriminatorByGlobal(it->second)); + auto result_column = subcolumn_creator.create(variant_column.getVariantPtrByGlobalDiscriminator(it->second)); + return wrapInArraysAndConstIfNeeded(std::move(result_column), array_offsets, input_arg_is_const, input_rows_count); + } + +private: + DataTypePtr getRequestedElementType(const ColumnPtr & type_name_column) const + { + const auto * name_col = checkAndGetColumnConst(type_name_column.get()); + if (!name_col) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Second argument of {} must be a constant String", getName()); + + String element_type_name = name_col->getValue(); + auto element_type = DataTypeFactory::instance().tryGet(element_type_name); + if (!element_type) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Second argument of {} must be a valid type name. 
Got: {}", getName(), element_type_name); + + return element_type; + } + + ColumnPtr wrapInArraysAndConstIfNeeded(ColumnPtr res, const Columns & array_offsets, bool input_arg_is_const, size_t input_rows_count) const + { + for (auto it = array_offsets.rbegin(); it != array_offsets.rend(); ++it) + res = ColumnArray::create(res, *it); + + if (input_arg_is_const) + res = ColumnConst::create(res, input_rows_count); + + return res; + } +}; + +} + +REGISTER_FUNCTION(DynamicElement) +{ +// factory.registerFunction(FunctionDocumentation{ +// .description = R"( +//Extracts a column with specified type from a `Dynamic` column. +//)", +// .syntax{"dynamicElement(dynamic, type_name)"}, +// .arguments{{ +// {"dynamic", "Dynamic column"}, +// {"type_name", "The name of the variant type to extract"}}}, +// .examples{{{ +// "Example", +// R"( +//)", +// R"( +//)"}}}, +// .categories{"Dynamic"}, +// }); + + factory.registerFunction(); +} + +} diff --git a/src/Functions/dynamicType.cpp b/src/Functions/dynamicType.cpp new file mode 100644 index 00000000000..8fb2974ceff --- /dev/null +++ b/src/Functions/dynamicType.cpp @@ -0,0 +1,104 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ +extern const int ILLEGAL_TYPE_OF_ARGUMENT; +extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; +} + +namespace +{ + +/// Return enum with type name for each row in Dynamic column. +class FunctionDynamicType : public IFunction +{ +public: + static constexpr auto name = "dynamicType"; + static constexpr auto name_for_null = "None"; + + static FunctionPtr create(ContextPtr) { return std::make_shared(); } + String getName() const override { return name; } + size_t getNumberOfArguments() const override { return 1; } + bool useDefaultImplementationForConstants() const override { return true; } + bool useDefaultImplementationForNulls() const override { return false; } + bool useDefaultImplementationForLowCardinalityColumns() const override { return false; } + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } + + DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override + { + if (arguments.empty() || arguments.size() > 1) + throw Exception( + ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Number of arguments for function {} doesn't match: passed {}, should be 1", + getName(), arguments.empty()); + + if (!isDynamic(arguments[0].type.get())) + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "First argument for function {} must be Dynamic, got {} instead", + getName(), arguments[0].type->getName()); + + return std::make_shared(std::make_shared()); + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override + { + const ColumnDynamic * dynamic_column = checkAndGetColumn(arguments[0].column.get()); + if (!dynamic_column) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "First argument for function {} must be Dynamic, got {} instead", + getName(), arguments[0].type->getName()); + + const auto & variant_info = dynamic_column->getVariantInfo(); + const auto & variant_column = dynamic_column->getVariantColumn(); + auto res = result_type->createColumn(); + String element_type; + for (size_t i = 0; i != input_rows_count; ++i) + { + auto global_discr = variant_column.globalDiscriminatorAt(i); + if (global_discr == ColumnVariant::NULL_DISCRIMINATOR) + 
element_type = name_for_null; + else + element_type = variant_info.variant_names[global_discr]; + + res->insertData(element_type.data(), element_type.size()); + } + + return res; + } +}; + +} + +REGISTER_FUNCTION(DynamicType) +{ + factory.registerFunction(FunctionDocumentation{ + .description = R"( +Returns the variant type name for each row of `Dynamic` column. If row contains NULL, it returns 'None' for it. +)", + .syntax = {"dynamicType(variant)"}, + .arguments = {{"variant", "Variant column"}}, + .examples = {{{ + "Example", + R"( +)", + R"( + +)"}}}, + .categories{"Variant"}, + }); +} + +} diff --git a/src/Functions/if.cpp b/src/Functions/if.cpp index 4f75042ad8d..d501fa28d4b 100644 --- a/src/Functions/if.cpp +++ b/src/Functions/if.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -1157,6 +1158,11 @@ private: variant_column->applyNullMap(assert_cast(*arg_cond.column).getData()); return result_column; } + else if (auto * dynamic_column = typeid_cast(result_column.get())) + { + dynamic_column->applyNullMap(assert_cast(*arg_cond.column).getData()); + return result_column; + } else return ColumnNullable::create(materializeColumnIfConst(result_column), arg_cond.column); } @@ -1200,6 +1206,11 @@ private: variant_column->applyNegatedNullMap(assert_cast(*arg_cond.column).getData()); return result_column; } + else if (auto * dynamic_column = typeid_cast(result_column.get())) + { + dynamic_column->applyNegatedNullMap(assert_cast(*arg_cond.column).getData()); + return result_column; + } else { size_t size = input_rows_count; diff --git a/src/Functions/isNotNull.cpp b/src/Functions/isNotNull.cpp index dd5182aeade..f0afc0d5ba3 100644 --- a/src/Functions/isNotNull.cpp +++ b/src/Functions/isNotNull.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -44,9 +45,10 @@ public: { const ColumnWithTypeAndName & elem = arguments[0]; - if (isVariant(elem.type)) + if (isVariant(elem.type) || isDynamic(elem.type)) { - const auto & discriminators = checkAndGetColumn(*elem.column)->getLocalDiscriminators(); + const auto & column_variant = isVariant(elem.type) ? assert_cast(*elem.column) : assert_cast(*elem.column).getVariantColumn(); + const auto & discriminators = column_variant.getLocalDiscriminators(); auto res = DataTypeUInt8().createColumn(); auto & data = typeid_cast(*res).getData(); data.resize(discriminators.size()); diff --git a/src/Functions/isNull.cpp b/src/Functions/isNull.cpp index 4bf4e44f866..7ed4fa7a813 100644 --- a/src/Functions/isNull.cpp +++ b/src/Functions/isNull.cpp @@ -6,6 +6,7 @@ #include #include #include +#include namespace DB @@ -46,9 +47,10 @@ public: { const ColumnWithTypeAndName & elem = arguments[0]; - if (isVariant(elem.type)) + if (isVariant(elem.type) || isDynamic(elem.type)) { - const auto & discriminators = checkAndGetColumn(*elem.column)->getLocalDiscriminators(); + const auto & column_variant = isVariant(elem.type) ? 
assert_cast(*elem.column) : assert_cast(*elem.column).getVariantColumn(); + const auto & discriminators = column_variant.getLocalDiscriminators(); auto res = DataTypeUInt8().createColumn(); auto & data = typeid_cast(*res).getData(); data.reserve(discriminators.size()); diff --git a/src/Functions/variantElement.cpp b/src/Functions/variantElement.cpp index 2744a0dabb8..b57ccb6fee1 100644 --- a/src/Functions/variantElement.cpp +++ b/src/Functions/variantElement.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -116,55 +117,12 @@ public: if (!variant_global_discr.has_value()) return arguments[2].column; + auto variant_local_discr = input_col_as_variant->localDiscriminatorByGlobal(*variant_global_discr); const auto & variant_type = input_type_as_variant->getVariant(*variant_global_discr); const auto & variant_column = input_col_as_variant->getVariantPtrByGlobalDiscriminator(*variant_global_discr); - - /// If Variant has only NULLs or our variant doesn't have any real values, - /// just create column with default values and create null mask with 1. - if (input_col_as_variant->hasOnlyNulls() || variant_column->empty()) - { - auto res = variant_type->createColumn(); - - if (variant_type->lowCardinality()) - assert_cast(*res).nestedToNullable(); - - res->insertManyDefaults(input_col_as_variant->size()); - if (!variant_type->canBeInsideNullable()) - return wrapInArraysAndConstIfNeeded(std::move(res), array_offsets, input_arg_is_const, input_rows_count); - - auto null_map = ColumnUInt8::create(); - auto & null_map_data = null_map->getData(); - null_map_data.resize_fill(input_col_as_variant->size(), 1); - return wrapInArraysAndConstIfNeeded(ColumnNullable::create(std::move(res), std::move(null_map)), array_offsets, input_arg_is_const, input_rows_count); - } - - /// If we extract single non-empty column and have no NULLs, then just return this variant. - if (auto non_empty_local_discr = input_col_as_variant->getLocalDiscriminatorOfOneNoneEmptyVariantNoNulls()) - { - /// If we were trying to extract some other variant, - /// it would be empty and we would already processed this case above. 
- chassert(input_col_as_variant->globalDiscriminatorByLocal(*non_empty_local_discr) == variant_global_discr); - return wrapInArraysAndConstIfNeeded(makeNullableOrLowCardinalityNullableSafe(variant_column), array_offsets, input_arg_is_const, input_rows_count); - } - - /// In general case we should calculate null-mask for variant - /// according to the discriminators column and expand - /// variant column by this mask to get a full column (with default values on NULLs) - const auto & local_discriminators = input_col_as_variant->getLocalDiscriminators(); - auto null_map = ColumnUInt8::create(); - auto & null_map_data = null_map->getData(); - null_map_data.reserve(local_discriminators.size()); - auto variant_local_discr = input_col_as_variant->localDiscriminatorByGlobal(*variant_global_discr); - for (auto local_discr : local_discriminators) - null_map_data.push_back(local_discr != variant_local_discr); - - auto expanded_variant_column = IColumn::mutate(variant_column); - if (variant_type->lowCardinality()) - expanded_variant_column = assert_cast(*expanded_variant_column).cloneNullable(); - expanded_variant_column->expand(null_map_data, /*inverted = */ true); - if (variant_type->canBeInsideNullable()) - return wrapInArraysAndConstIfNeeded(ColumnNullable::create(std::move(expanded_variant_column), std::move(null_map)), array_offsets, input_arg_is_const, input_rows_count); - return wrapInArraysAndConstIfNeeded(std::move(expanded_variant_column), array_offsets, input_arg_is_const, input_rows_count); + auto subcolumn_creator = SerializationVariantElement::VariantSubcolumnCreator(input_col_as_variant->getLocalDiscriminatorsPtr(), variant_type->getName(), *variant_global_discr, variant_local_discr); + auto res = subcolumn_creator.create(variant_column); + return wrapInArraysAndConstIfNeeded(std::move(res), array_offsets, input_arg_is_const, input_rows_count); } private: std::optional getVariantGlobalDiscriminator(const ColumnPtr & index_column, const DataTypeVariant & variant_type, size_t argument_size) const diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 7c3bed7388c..739d0f17078 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -1496,7 +1496,7 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, validateVirtualColumns(*res); - if (!res->supportsDynamicSubcolumns() && hasDynamicSubcolumns(res->getInMemoryMetadataPtr()->getColumns())) + if (!res->supportsDynamicSubcolumnsDeprecated() && hasDynamicSubcolumns(res->getInMemoryMetadataPtr()->getColumns())) { throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot create table with column of type Object, " diff --git a/src/Interpreters/InterpreterInsertQuery.cpp b/src/Interpreters/InterpreterInsertQuery.cpp index fc58f7b5098..a1cede5ae95 100644 --- a/src/Interpreters/InterpreterInsertQuery.cpp +++ b/src/Interpreters/InterpreterInsertQuery.cpp @@ -554,7 +554,7 @@ BlockIO InterpreterInsertQuery::execute() { /// Change query sample block columns to Nullable to allow inserting nullable columns, where NULL values will be substituted with /// default column values (in AddingDefaultsTransform), so all values will be cast correctly. 
- if (isNullableOrLowCardinalityNullable(input_columns[col_idx].type) && !isNullableOrLowCardinalityNullable(query_columns[col_idx].type) && !isVariant(query_columns[col_idx].type) && output_columns.has(query_columns[col_idx].name)) + if (isNullableOrLowCardinalityNullable(input_columns[col_idx].type) && !isNullableOrLowCardinalityNullable(query_columns[col_idx].type) && !isVariant(query_columns[col_idx].type) && !isDynamic(query_columns[col_idx].type) && output_columns.has(query_columns[col_idx].name)) query_sample_block.setColumn(col_idx, ColumnWithTypeAndName(makeNullableOrLowCardinalityNullable(query_columns[col_idx].column), makeNullableOrLowCardinalityNullable(query_columns[col_idx].type), query_columns[col_idx].name)); } } diff --git a/src/Interpreters/TreeRewriter.cpp b/src/Interpreters/TreeRewriter.cpp index 5588fc55a64..351189f70ae 100644 --- a/src/Interpreters/TreeRewriter.cpp +++ b/src/Interpreters/TreeRewriter.cpp @@ -2,7 +2,7 @@ #include #include -#include +//#include #include #include @@ -1188,6 +1188,38 @@ bool TreeRewriterResult::collectUsedColumns(const ASTPtr & query, bool is_select } } + if (!unknown_required_source_columns.empty()) + { + + for (const NameAndTypePair & pair : source_columns_ordinary) + { +// std::cerr << "Check ordinary column " << pair.name << "\n"; + if (!pair.type->hasDynamicSubcolumns()) + continue; + +// std::cerr << "Check dyamic subcolumns\n"; + + for (auto it = unknown_required_source_columns.begin(); it != unknown_required_source_columns.end();) + { + auto [column_name, dynamic_subcolumn_name] = Nested::splitName(*it); +// std::cerr << "Check dyamic subcolumn " << dynamic_subcolumn_name << "\n"; + + if (column_name == pair.name) + { + if (auto dynamic_subcolumn_type = pair.type->tryGetSubcolumnType(dynamic_subcolumn_name)) + { +// std::cerr << "Found\n"; + source_columns.emplace_back(*it, dynamic_subcolumn_type); + it = unknown_required_source_columns.erase(it); + continue; + } + } + + ++it; + } + } + } + if (!unknown_required_source_columns.empty()) { constexpr auto format_string = "Missing columns: {} while processing query: '{}', required columns:{}{}"; diff --git a/src/Interpreters/convertFieldToType.cpp b/src/Interpreters/convertFieldToType.cpp index 25085ff4823..30b7de409f1 100644 --- a/src/Interpreters/convertFieldToType.cpp +++ b/src/Interpreters/convertFieldToType.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include @@ -26,6 +27,7 @@ #include #include #include +#include namespace DB @@ -165,6 +167,8 @@ Field convertDecimalType(const Field & from, const To & type) Field convertFieldToTypeImpl(const Field & src, const IDataType & type, const IDataType * from_type_hint) { + checkStackSize(); + if (from_type_hint && from_type_hint->equals(type)) { return src; @@ -504,7 +508,7 @@ Field convertFieldToTypeImpl(const Field & src, const IDataType & type, const ID else if (const DataTypeVariant * type_variant = typeid_cast(&type)) { /// If we have type hint and Variant contains such type, no need to convert field. - if (from_type_hint && type_variant->tryGetVariantDiscriminator(*from_type_hint)) + if (from_type_hint && type_variant->tryGetVariantDiscriminator(from_type_hint->getName())) return src; /// Create temporary column and check if we can insert this field to the variant. @@ -513,6 +517,11 @@ Field convertFieldToTypeImpl(const Field & src, const IDataType & type, const ID if (col->tryInsert(src)) return src; } + else if (isDynamic(type)) + { + /// We can insert any field to Dynamic column. 
+ return src; + } /// Conversion from string by parsing. if (src.getType() == Field::Types::String) diff --git a/src/Interpreters/parseColumnsListForTableFunction.cpp b/src/Interpreters/parseColumnsListForTableFunction.cpp index 27c364073ae..3529863a623 100644 --- a/src/Interpreters/parseColumnsListForTableFunction.cpp +++ b/src/Interpreters/parseColumnsListForTableFunction.cpp @@ -40,7 +40,7 @@ void validateDataType(const DataTypePtr & type_to_check, const DataTypeValidatio if (!settings.allow_experimental_object_type) { - if (data_type.hasDynamicSubcolumns()) + if (data_type.hasDynamicSubcolumnsDeprecated()) { throw Exception( ErrorCodes::ILLEGAL_COLUMN, @@ -107,6 +107,18 @@ void validateDataType(const DataTypePtr & type_to_check, const DataTypeValidatio } } } + + if (!settings.allow_experimental_dynamic_type) + { + if (data_type.hasDynamicSubcolumns()) + { + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "Cannot create column with type '{}' because experimental Dynamic type is not allowed. " + "Set setting allow_experimental_dynamic_type = 1 in order to allow it", + data_type.getName()); + } + } }; validate_callback(*type_to_check); diff --git a/src/Interpreters/parseColumnsListForTableFunction.h b/src/Interpreters/parseColumnsListForTableFunction.h index ffb59bfa457..e2d2bc97ff7 100644 --- a/src/Interpreters/parseColumnsListForTableFunction.h +++ b/src/Interpreters/parseColumnsListForTableFunction.h @@ -21,6 +21,7 @@ struct DataTypeValidationSettings , allow_experimental_variant_type(settings.allow_experimental_variant_type) , allow_suspicious_variant_types(settings.allow_suspicious_variant_types) , validate_nested_types(settings.validate_experimental_and_suspicious_types_inside_nested_types) + , allow_experimental_dynamic_type(settings.allow_experimental_dynamic_type) { } @@ -30,6 +31,7 @@ struct DataTypeValidationSettings bool allow_experimental_variant_type = true; bool allow_suspicious_variant_types = true; bool validate_nested_types = true; + bool allow_experimental_dynamic_type = true; }; void validateDataType(const DataTypePtr & type, const DataTypeValidationSettings & settings); diff --git a/src/Parsers/ParserDataType.cpp b/src/Parsers/ParserDataType.cpp index fcf189e51f4..747a9a6f7ba 100644 --- a/src/Parsers/ParserDataType.cpp +++ b/src/Parsers/ParserDataType.cpp @@ -1,18 +1,47 @@ #include #include +#include #include #include #include #include - namespace DB { namespace { +class DynamicArgumentsParser : public IParserBase +{ +private: + const char * getName() const override { return "Dynamic data type optional argument"; } + bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override + { + ASTPtr identifier; + ParserIdentifier identifier_parser; + if (!identifier_parser.parse(pos, identifier, expected)) + return false; + + if (pos->type != TokenType::Equals) + { + expected.add(pos, "equals operator"); + return false; + } + + ++pos; + + ASTPtr number; + ParserNumber number_parser; + if (!number_parser.parse(pos, number, expected)) + return false; + + node = makeASTFunction("equals", identifier, number); + return true; + } +}; + /// Wrapper to allow mixed lists of nested and normal types. 
/// Parameters are either: /// - Nested table elements; @@ -21,10 +50,21 @@ namespace /// - another data type (or identifier) class ParserDataTypeArgument : public IParserBase { +public: + ParserDataTypeArgument(std::string_view type_name_) : type_name(type_name_) + { + } + private: const char * getName() const override { return "data type argument"; } bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override { + if (type_name == "Dynamic") + { + DynamicArgumentsParser parser; + return parser.parse(pos, node, expected); + } + ParserNestedTable nested_parser; ParserDataType data_type_parser; ParserAllCollectionsOfLiterals literal_parser(false); @@ -39,6 +79,8 @@ private: || literal_parser.parse(pos, node, expected) || data_type_parser.parse(pos, node, expected); } + + std::string_view type_name; }; } @@ -140,7 +182,7 @@ bool ParserDataType::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) ++pos; /// Parse optional parameters - ParserList args_parser(std::make_unique(), std::make_unique(TokenType::Comma)); + ParserList args_parser(std::make_unique(type_name), std::make_unique(TokenType::Comma)); ASTPtr expr_list_args; if (!args_parser.parse(pos, expr_list_args, expected)) diff --git a/src/Processors/Formats/IOutputFormat.h b/src/Processors/Formats/IOutputFormat.h index cae2ab7691e..9996bedb20e 100644 --- a/src/Processors/Formats/IOutputFormat.h +++ b/src/Processors/Formats/IOutputFormat.h @@ -105,6 +105,8 @@ public: } } + virtual void finalizeBuffers() {} + protected: friend class ParallelFormattingOutputFormat; @@ -122,7 +124,6 @@ protected: virtual void consumeTotals(Chunk) {} virtual void consumeExtremes(Chunk) {} virtual void finalizeImpl() {} - virtual void finalizeBuffers() {} virtual void writePrefix() {} virtual void writeSuffix() {} virtual void resetFormatterImpl() {} diff --git a/src/Processors/Merges/Algorithms/AggregatingSortedAlgorithm.cpp b/src/Processors/Merges/Algorithms/AggregatingSortedAlgorithm.cpp index 3bd0b532d90..857f5040b79 100644 --- a/src/Processors/Merges/Algorithms/AggregatingSortedAlgorithm.cpp +++ b/src/Processors/Merges/Algorithms/AggregatingSortedAlgorithm.cpp @@ -70,25 +70,6 @@ static AggregatingSortedAlgorithm::ColumnsDefinition defineColumns( return def; } -static MutableColumns getMergedColumns(const Block & header, const AggregatingSortedAlgorithm::ColumnsDefinition & def) -{ - MutableColumns columns; - columns.resize(header.columns()); - - for (const auto & desc : def.columns_to_simple_aggregate) - { - const auto & type = desc.nested_type ? 
desc.nested_type - : desc.real_type; - columns[desc.column_number] = type->createColumn(); - } - - for (size_t i = 0; i < columns.size(); ++i) - if (!columns[i]) - columns[i] = header.getByPosition(i).type->createColumn(); - - return columns; -} - /// Remove constants and LowCardinality for SimpleAggregateFunction static void preprocessChunk(Chunk & chunk, const AggregatingSortedAlgorithm::ColumnsDefinition & def) { @@ -159,12 +140,24 @@ AggregatingSortedAlgorithm::SimpleAggregateDescription::~SimpleAggregateDescript AggregatingSortedAlgorithm::AggregatingMergedData::AggregatingMergedData( - MutableColumns columns_, UInt64 max_block_size_rows_, UInt64 max_block_size_bytes_, ColumnsDefinition & def_) - : MergedData(std::move(columns_), false, max_block_size_rows_, max_block_size_bytes_), def(def_) + : MergedData(false, max_block_size_rows_, max_block_size_bytes_), def(def_) { +} + +void AggregatingSortedAlgorithm::AggregatingMergedData::initialize(const DB::Block & header, const IMergingAlgorithm::Inputs & inputs) +{ + MergedData::initialize(header, inputs); + + for (const auto & desc : def.columns_to_simple_aggregate) + { + const auto & type = desc.nested_type ? desc.nested_type + : desc.real_type; + columns[desc.column_number] = type->createColumn(); + } + initAggregateDescription(); /// Just to make startGroup() simpler. @@ -267,12 +260,14 @@ AggregatingSortedAlgorithm::AggregatingSortedAlgorithm( size_t max_block_size_bytes_) : IMergingAlgorithmWithDelayedChunk(header_, num_inputs, description_) , columns_definition(defineColumns(header_, description_)) - , merged_data(getMergedColumns(header_, columns_definition), max_block_size_rows_, max_block_size_bytes_, columns_definition) + , merged_data(max_block_size_rows_, max_block_size_bytes_, columns_definition) { } void AggregatingSortedAlgorithm::initialize(Inputs inputs) { + merged_data.initialize(header, inputs); + for (auto & input : inputs) if (input.chunk) preprocessChunk(input.chunk, columns_definition); diff --git a/src/Processors/Merges/Algorithms/AggregatingSortedAlgorithm.h b/src/Processors/Merges/Algorithms/AggregatingSortedAlgorithm.h index aa221573151..9ab800058b1 100644 --- a/src/Processors/Merges/Algorithms/AggregatingSortedAlgorithm.h +++ b/src/Processors/Merges/Algorithms/AggregatingSortedAlgorithm.h @@ -101,11 +101,12 @@ private: public: AggregatingMergedData( - MutableColumns columns_, UInt64 max_block_size_rows_, UInt64 max_block_size_bytes_, ColumnsDefinition & def_); + void initialize(const Block & header, const IMergingAlgorithm::Inputs & inputs) override; + /// Group is a group of rows with the same sorting key. It represents single row in result. /// Algorithm is: start group, add several rows, finish group. /// Then pull chunk when enough groups were added. 
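Usage sketch (not part of the patch): the Dynamic type, the allow_experimental_dynamic_type setting, and the dynamicType/dynamicElement functions introduced in the hunks above could be exercised roughly as follows. The table and column names (t, d) are illustrative assumptions, and the exact result types are as documented in the function registrations above.

    SET allow_experimental_dynamic_type = 1;

    CREATE TABLE t (d Dynamic) ENGINE = Memory;
    INSERT INTO t VALUES (42), ('Hello'), ([1, 2, 3]), (NULL);

    -- dynamicType() returns the stored type name for each row, 'None' for NULL.
    SELECT d, dynamicType(d) FROM t;

    -- dynamicElement() extracts values of one concrete type; rows stored with
    -- a different type come back as NULL.
    SELECT dynamicElement(d, 'String') FROM t;
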
diff --git a/src/Processors/Merges/Algorithms/CollapsingSortedAlgorithm.cpp b/src/Processors/Merges/Algorithms/CollapsingSortedAlgorithm.cpp index 8948cee217c..f5e4c88fcd0 100644 --- a/src/Processors/Merges/Algorithms/CollapsingSortedAlgorithm.cpp +++ b/src/Processors/Merges/Algorithms/CollapsingSortedAlgorithm.cpp @@ -31,8 +31,7 @@ CollapsingSortedAlgorithm::CollapsingSortedAlgorithm( LoggerPtr log_, WriteBuffer * out_row_sources_buf_, bool use_average_block_sizes) - : IMergingAlgorithmWithSharedChunks(header_, num_inputs, std::move(description_), out_row_sources_buf_, max_row_refs) - , merged_data(header_.cloneEmptyColumns(), use_average_block_sizes, max_block_size_rows_, max_block_size_bytes_) + : IMergingAlgorithmWithSharedChunks(header_, num_inputs, std::move(description_), out_row_sources_buf_, max_row_refs, std::make_unique(use_average_block_sizes, max_block_size_rows_, max_block_size_bytes_)) , sign_column_number(header_.getPositionByName(sign_column)) , only_positive_sign(only_positive_sign_) , log(log_) @@ -65,7 +64,7 @@ void CollapsingSortedAlgorithm::reportIncorrectData() void CollapsingSortedAlgorithm::insertRow(RowRef & row) { - merged_data.insertRow(*row.all_columns, row.row_num, row.owned_chunk->getNumRows()); + merged_data->insertRow(*row.all_columns, row.row_num, row.owned_chunk->getNumRows()); } std::optional CollapsingSortedAlgorithm::insertRows() @@ -90,8 +89,8 @@ std::optional CollapsingSortedAlgorithm::insertRows() if (count_positive >= count_negative) { - if (merged_data.hasEnoughRows()) - res = merged_data.pull(); + if (merged_data->hasEnoughRows()) + res = merged_data->pull(); insertRow(last_positive_row); @@ -121,8 +120,8 @@ std::optional CollapsingSortedAlgorithm::insertRows() IMergingAlgorithm::Status CollapsingSortedAlgorithm::merge() { /// Rare case, which may happen when index_granularity is 1, but we needed to insert 2 rows inside insertRows(). - if (merged_data.hasEnoughRows()) - return Status(merged_data.pull()); + if (merged_data->hasEnoughRows()) + return Status(merged_data->pull()); /// Take rows in required order and put them into `merged_data`, while the rows are no more than `max_block_size` while (queue.isValid()) @@ -148,8 +147,8 @@ IMergingAlgorithm::Status CollapsingSortedAlgorithm::merge() if (key_differs) { /// if there are enough rows and the last one is calculated completely - if (merged_data.hasEnoughRows()) - return Status(merged_data.pull()); + if (merged_data->hasEnoughRows()) + return Status(merged_data->pull()); /// We write data for the previous primary key. 
auto res = insertRows(); @@ -220,7 +219,7 @@ IMergingAlgorithm::Status CollapsingSortedAlgorithm::merge() return Status(std::move(*res)); } - return Status(merged_data.pull(), true); + return Status(merged_data->pull(), true); } } diff --git a/src/Processors/Merges/Algorithms/CollapsingSortedAlgorithm.h b/src/Processors/Merges/Algorithms/CollapsingSortedAlgorithm.h index be1a3a3bf33..99fd95d82d9 100644 --- a/src/Processors/Merges/Algorithms/CollapsingSortedAlgorithm.h +++ b/src/Processors/Merges/Algorithms/CollapsingSortedAlgorithm.h @@ -42,8 +42,6 @@ public: Status merge() override; private: - MergedData merged_data; - const size_t sign_column_number; const bool only_positive_sign; diff --git a/src/Processors/Merges/Algorithms/GraphiteRollupSortedAlgorithm.cpp b/src/Processors/Merges/Algorithms/GraphiteRollupSortedAlgorithm.cpp index 814625d7aee..2b891592b20 100644 --- a/src/Processors/Merges/Algorithms/GraphiteRollupSortedAlgorithm.cpp +++ b/src/Processors/Merges/Algorithms/GraphiteRollupSortedAlgorithm.cpp @@ -46,8 +46,8 @@ GraphiteRollupSortedAlgorithm::GraphiteRollupSortedAlgorithm( size_t max_block_size_bytes_, Graphite::Params params_, time_t time_of_merge_) - : IMergingAlgorithmWithSharedChunks(header_, num_inputs, std::move(description_), nullptr, max_row_refs) - , merged_data(header_.cloneEmptyColumns(), false, max_block_size_rows_, max_block_size_bytes_) + : IMergingAlgorithmWithSharedChunks(header_, num_inputs, std::move(description_), nullptr, max_row_refs, std::make_unique(false, max_block_size_rows_, max_block_size_bytes_)) + , graphite_rollup_merged_data(assert_cast(*merged_data)) , params(std::move(params_)) , time_of_merge(time_of_merge_) { @@ -63,7 +63,7 @@ GraphiteRollupSortedAlgorithm::GraphiteRollupSortedAlgorithm( } } - merged_data.allocMemForAggregates(max_size_of_aggregate_state, max_alignment_of_aggregate_state); + graphite_rollup_merged_data.allocMemForAggregates(max_size_of_aggregate_state, max_alignment_of_aggregate_state); columns_definition = defineColumns(header_, params); } @@ -113,7 +113,7 @@ IMergingAlgorithm::Status GraphiteRollupSortedAlgorithm::merge() const DateLUTImpl & date_lut = timezone ? timezone->getTimeZone() : DateLUT::instance(); - /// Take rows in needed order and put them into `merged_data` until we get `max_block_size` rows. + /// Take rows in needed order and put them into `graphite_rollup_merged_data` until we get `max_block_size` rows. /// /// Variables starting with current_* refer to the rows previously popped from the queue that will /// contribute towards current output row. @@ -142,10 +142,10 @@ IMergingAlgorithm::Status GraphiteRollupSortedAlgorithm::merge() if (is_new_key) { /// Accumulate the row that has maximum version in the previous group of rows with the same key: - if (merged_data.wasGroupStarted()) + if (graphite_rollup_merged_data.wasGroupStarted()) accumulateRow(current_subgroup_newest_row); - Graphite::RollupRule next_rule = merged_data.currentRule(); + Graphite::RollupRule next_rule = graphite_rollup_merged_data.currentRule(); if (new_path) next_rule = selectPatternForPath(this->params, next_path); @@ -167,15 +167,15 @@ IMergingAlgorithm::Status GraphiteRollupSortedAlgorithm::merge() if (will_be_new_key) { - if (merged_data.wasGroupStarted()) + if (graphite_rollup_merged_data.wasGroupStarted()) { finishCurrentGroup(); /// We have enough rows - return, but don't advance the loop. 
At the beginning of the /// next call to merge() the same next_cursor will be processed once more and /// the next output row will be created from it. - if (merged_data.hasEnoughRows()) - return Status(merged_data.pull()); + if (graphite_rollup_merged_data.hasEnoughRows()) + return Status(graphite_rollup_merged_data.pull()); } /// At this point previous row has been fully processed, so we can advance the loop @@ -218,28 +218,28 @@ IMergingAlgorithm::Status GraphiteRollupSortedAlgorithm::merge() } /// Write result row for the last group. - if (merged_data.wasGroupStarted()) + if (graphite_rollup_merged_data.wasGroupStarted()) { accumulateRow(current_subgroup_newest_row); finishCurrentGroup(); } - return Status(merged_data.pull(), true); + return Status(graphite_rollup_merged_data.pull(), true); } void GraphiteRollupSortedAlgorithm::startNextGroup(SortCursor & cursor, Graphite::RollupRule next_rule) { - merged_data.startNextGroup(cursor->all_columns, cursor->getRow(), next_rule, columns_definition); + graphite_rollup_merged_data.startNextGroup(cursor->all_columns, cursor->getRow(), next_rule, columns_definition); } void GraphiteRollupSortedAlgorithm::finishCurrentGroup() { - merged_data.insertRow(current_time_rounded, current_subgroup_newest_row, columns_definition); + graphite_rollup_merged_data.insertRow(current_time_rounded, current_subgroup_newest_row, columns_definition); } void GraphiteRollupSortedAlgorithm::accumulateRow(RowRef & row) { - merged_data.accumulateRow(row, columns_definition); + graphite_rollup_merged_data.accumulateRow(row, columns_definition); } void GraphiteRollupSortedAlgorithm::GraphiteRollupMergedData::startNextGroup( diff --git a/src/Processors/Merges/Algorithms/GraphiteRollupSortedAlgorithm.h b/src/Processors/Merges/Algorithms/GraphiteRollupSortedAlgorithm.h index a20a6eaf11f..aaa3859efb6 100644 --- a/src/Processors/Merges/Algorithms/GraphiteRollupSortedAlgorithm.h +++ b/src/Processors/Merges/Algorithms/GraphiteRollupSortedAlgorithm.h @@ -53,7 +53,7 @@ public: { public: using MergedData::MergedData; - ~GraphiteRollupMergedData(); + ~GraphiteRollupMergedData() override; void startNextGroup(const ColumnRawPtrs & raw_columns, size_t row, Graphite::RollupRule next_rule, ColumnsDefinition & def); @@ -72,7 +72,7 @@ public: }; private: - GraphiteRollupMergedData merged_data; + GraphiteRollupMergedData & graphite_rollup_merged_data; const Graphite::Params params; ColumnsDefinition columns_definition; diff --git a/src/Processors/Merges/Algorithms/IMergingAlgorithmWithDelayedChunk.h b/src/Processors/Merges/Algorithms/IMergingAlgorithmWithDelayedChunk.h index b8e73aec0dc..cf4b8589441 100644 --- a/src/Processors/Merges/Algorithms/IMergingAlgorithmWithDelayedChunk.h +++ b/src/Processors/Merges/Algorithms/IMergingAlgorithmWithDelayedChunk.h @@ -34,9 +34,9 @@ protected: return !lhs.hasEqualSortColumnsWith(rhs); } -private: Block header; +private: /// Inputs currently being merged. 
Inputs current_inputs; SortCursorImpls cursors; diff --git a/src/Processors/Merges/Algorithms/IMergingAlgorithmWithSharedChunks.cpp b/src/Processors/Merges/Algorithms/IMergingAlgorithmWithSharedChunks.cpp index c8b69382e89..fe5186736b5 100644 --- a/src/Processors/Merges/Algorithms/IMergingAlgorithmWithSharedChunks.cpp +++ b/src/Processors/Merges/Algorithms/IMergingAlgorithmWithSharedChunks.cpp @@ -5,7 +5,7 @@ namespace DB { IMergingAlgorithmWithSharedChunks::IMergingAlgorithmWithSharedChunks( - Block header_, size_t num_inputs, SortDescription description_, WriteBuffer * out_row_sources_buf_, size_t max_row_refs) + Block header_, size_t num_inputs, SortDescription description_, WriteBuffer * out_row_sources_buf_, size_t max_row_refs, std::unique_ptr merged_data_) : header(std::move(header_)) , description(std::move(description_)) , chunk_allocator(num_inputs + max_row_refs) @@ -13,6 +13,7 @@ IMergingAlgorithmWithSharedChunks::IMergingAlgorithmWithSharedChunks( , sources(num_inputs) , sources_origin_merge_tree_part_level(num_inputs) , out_row_sources_buf(out_row_sources_buf_) + , merged_data(std::move(merged_data_)) { } @@ -28,6 +29,8 @@ static void prepareChunk(Chunk & chunk) void IMergingAlgorithmWithSharedChunks::initialize(Inputs inputs) { + merged_data->initialize(header, inputs); + for (size_t source_num = 0; source_num < inputs.size(); ++source_num) { if (!inputs[source_num].chunk) diff --git a/src/Processors/Merges/Algorithms/IMergingAlgorithmWithSharedChunks.h b/src/Processors/Merges/Algorithms/IMergingAlgorithmWithSharedChunks.h index 3b4f9e92c5d..bc1aafe93f7 100644 --- a/src/Processors/Merges/Algorithms/IMergingAlgorithmWithSharedChunks.h +++ b/src/Processors/Merges/Algorithms/IMergingAlgorithmWithSharedChunks.h @@ -1,6 +1,7 @@ #pragma once #include #include +#include #include namespace DB @@ -10,7 +11,7 @@ class IMergingAlgorithmWithSharedChunks : public IMergingAlgorithm { public: IMergingAlgorithmWithSharedChunks( - Block header_, size_t num_inputs, SortDescription description_, WriteBuffer * out_row_sources_buf_, size_t max_row_refs); + Block header_, size_t num_inputs, SortDescription description_, WriteBuffer * out_row_sources_buf_, size_t max_row_refs, std::unique_ptr merged_data_); void initialize(Inputs inputs) override; void consume(Input & input, size_t source_num) override; @@ -25,7 +26,6 @@ private: SortCursorImpls cursors; protected: - struct Source { detail::SharedChunkPtr chunk; @@ -43,6 +43,8 @@ protected: /// If it is not nullptr then it should be populated during execution WriteBuffer * out_row_sources_buf = nullptr; + std::unique_ptr merged_data; + using RowRef = detail::RowRefWithOwnedChunk; void setRowRef(RowRef & row, SortCursor & cursor) { row.set(cursor, sources[cursor.impl->order].chunk); } bool skipLastRowFor(size_t input_number) const { return sources[input_number].skip_last_row; } diff --git a/src/Processors/Merges/Algorithms/MergedData.h b/src/Processors/Merges/Algorithms/MergedData.h index 7ffde835ad0..95f915e4478 100644 --- a/src/Processors/Merges/Algorithms/MergedData.h +++ b/src/Processors/Merges/Algorithms/MergedData.h @@ -1,7 +1,9 @@ #pragma once #include +#include #include +#include #include #include @@ -19,17 +21,40 @@ namespace ErrorCodes class MergedData { public: - explicit MergedData(MutableColumns columns_, bool use_average_block_size_, UInt64 max_block_size_, UInt64 max_block_size_bytes_) - : columns(std::move(columns_)), max_block_size(max_block_size_), max_block_size_bytes(max_block_size_bytes_), 
use_average_block_size(use_average_block_size_) + explicit MergedData(bool use_average_block_size_, UInt64 max_block_size_, UInt64 max_block_size_bytes_) + : max_block_size(max_block_size_), max_block_size_bytes(max_block_size_bytes_), use_average_block_size(use_average_block_size_) { } + virtual void initialize(const Block & header, const IMergingAlgorithm::Inputs & inputs) + { + columns = header.cloneEmptyColumns(); + std::vector source_columns; + source_columns.resize(columns.size()); + for (const auto & input : inputs) + { + if (!input.chunk) + continue; + + const auto & input_columns = input.chunk.getColumns(); + for (size_t i = 0; i != input_columns.size(); ++i) + source_columns[i].push_back(input_columns[i]); + } + + for (size_t i = 0; i != columns.size(); ++i) + { + if (columns[i]->hasDynamicStructure()) + columns[i]->takeDynamicStructureFromSourceColumns(source_columns[i]); + } + } + /// Pull will be called at next prepare call. void flush() { need_flush = true; } void insertRow(const ColumnRawPtrs & raw_columns, size_t row, size_t block_size) { size_t num_columns = raw_columns.size(); + chassert(columns.size() == num_columns); for (size_t i = 0; i < num_columns; ++i) columns[i]->insertFrom(*raw_columns[i], row); @@ -41,6 +66,7 @@ public: void insertRows(const ColumnRawPtrs & raw_columns, size_t start_index, size_t length, size_t block_size) { size_t num_columns = raw_columns.size(); + chassert(columns.size() == num_columns); for (size_t i = 0; i < num_columns; ++i) { if (length == 1) @@ -61,6 +87,7 @@ public: UInt64 num_rows = chunk.getNumRows(); UInt64 num_columns = chunk.getNumColumns(); + chassert(columns.size() == num_columns); auto chunk_columns = chunk.mutateColumns(); /// Here is a special code for constant columns. @@ -69,9 +96,18 @@ public: for (size_t i = 0; i < num_columns; ++i) { if (isColumnConst(*columns[i])) + { columns[i] = columns[i]->cloneResized(num_rows); + } + else if (columns[i]->hasDynamicStructure()) + { + columns[i] = columns[i]->cloneEmpty(); + columns[i]->insertRangeFrom(*chunk_columns[i], 0, num_rows); + } else + { columns[i] = std::move(chunk_columns[i]); + } } if (rows_size < num_rows) @@ -144,6 +180,8 @@ public: UInt64 totalAllocatedBytes() const { return total_allocated_bytes; } UInt64 maxBlockSize() const { return max_block_size; } + virtual ~MergedData() = default; + protected: MutableColumns columns; diff --git a/src/Processors/Merges/Algorithms/MergingSortedAlgorithm.cpp b/src/Processors/Merges/Algorithms/MergingSortedAlgorithm.cpp index 1debfcec8e0..75a6ddec682 100644 --- a/src/Processors/Merges/Algorithms/MergingSortedAlgorithm.cpp +++ b/src/Processors/Merges/Algorithms/MergingSortedAlgorithm.cpp @@ -18,7 +18,7 @@ MergingSortedAlgorithm::MergingSortedAlgorithm( WriteBuffer * out_row_sources_buf_, bool use_average_block_sizes) : header(std::move(header_)) - , merged_data(header.cloneEmptyColumns(), use_average_block_sizes, max_block_size_, max_block_size_bytes_) + , merged_data(use_average_block_sizes, max_block_size_, max_block_size_bytes_) , description(description_) , limit(limit_) , out_row_sources_buf(out_row_sources_buf_) @@ -59,6 +59,7 @@ static void prepareChunk(Chunk & chunk) void MergingSortedAlgorithm::initialize(Inputs inputs) { + merged_data.initialize(header, inputs); current_inputs = std::move(inputs); for (size_t source_num = 0; source_num < current_inputs.size(); ++source_num) diff --git a/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.cpp b/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.cpp index 
9e5c1249c4e..7b2c7d82a01 100644 --- a/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.cpp +++ b/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.cpp @@ -41,9 +41,8 @@ ReplacingSortedAlgorithm::ReplacingSortedAlgorithm( bool use_average_block_sizes, bool cleanup_, bool enable_vertical_final_) - : IMergingAlgorithmWithSharedChunks(header_, num_inputs, std::move(description_), out_row_sources_buf_, max_row_refs) - , merged_data(header_.cloneEmptyColumns(), use_average_block_sizes, max_block_size_rows, max_block_size_bytes), cleanup(cleanup_) - , enable_vertical_final(enable_vertical_final_) + : IMergingAlgorithmWithSharedChunks(header_, num_inputs, std::move(description_), out_row_sources_buf_, max_row_refs, std::make_unique(use_average_block_sizes, max_block_size_rows, max_block_size_bytes)) + , cleanup(cleanup_), enable_vertical_final(enable_vertical_final_) { if (!is_deleted_column.empty()) is_deleted_column_number = header_.getPositionByName(is_deleted_column); @@ -75,7 +74,7 @@ void ReplacingSortedAlgorithm::insertRow() to_be_emitted.push(std::move(selected_row.owned_chunk)); } else - merged_data.insertRow(*selected_row.all_columns, selected_row.row_num, selected_row.owned_chunk->getNumRows()); + merged_data->insertRow(*selected_row.all_columns, selected_row.row_num, selected_row.owned_chunk->getNumRows()); selected_row.clear(); } @@ -109,8 +108,8 @@ IMergingAlgorithm::Status ReplacingSortedAlgorithm::merge() if (key_differs) { /// If there are enough rows and the last one is calculated completely - if (merged_data.hasEnoughRows()) - return Status(merged_data.pull()); + if (merged_data->hasEnoughRows()) + return Status(merged_data->pull()); /// Write the data for the previous primary key. if (!selected_row.empty()) @@ -168,8 +167,8 @@ IMergingAlgorithm::Status ReplacingSortedAlgorithm::merge() } /// If have enough rows, return block, because it prohibited to overflow requested number of rows. - if (merged_data.hasEnoughRows()) - return Status(merged_data.pull()); + if (merged_data->hasEnoughRows()) + return Status(merged_data->pull()); /// We will write the data for the last primary key. 
if (!selected_row.empty()) @@ -193,7 +192,7 @@ IMergingAlgorithm::Status ReplacingSortedAlgorithm::merge() return emitChunk(chunk, to_be_emitted.empty()); } - return Status(merged_data.pull(), true); + return Status(merged_data->pull(), true); } void ReplacingSortedAlgorithm::saveChunkForSkippingFinalFromSelectedRow() diff --git a/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.h b/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.h index 2fbd73c9072..a3ccccf0845 100644 --- a/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.h +++ b/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.h @@ -44,8 +44,6 @@ public: Status merge() override; private: - MergedData merged_data; - ssize_t is_deleted_column_number = -1; ssize_t version_column_number = -1; bool cleanup = false; diff --git a/src/Processors/Merges/Algorithms/SummingSortedAlgorithm.cpp b/src/Processors/Merges/Algorithms/SummingSortedAlgorithm.cpp index 28160b18269..49a417e7df2 100644 --- a/src/Processors/Merges/Algorithms/SummingSortedAlgorithm.cpp +++ b/src/Processors/Merges/Algorithms/SummingSortedAlgorithm.cpp @@ -382,39 +382,6 @@ static SummingSortedAlgorithm::ColumnsDefinition defineColumns( return def; } -static MutableColumns getMergedDataColumns( - const Block & header, - const SummingSortedAlgorithm::ColumnsDefinition & def) -{ - MutableColumns columns; - size_t num_columns = def.column_numbers_not_to_aggregate.size() + def.columns_to_aggregate.size(); - columns.reserve(num_columns); - - for (const auto & desc : def.columns_to_aggregate) - { - // Wrap aggregated columns in a tuple to match function signature - if (!desc.is_agg_func_type && !desc.is_simple_agg_func_type && isTuple(desc.function->getResultType())) - { - size_t tuple_size = desc.column_numbers.size(); - MutableColumns tuple_columns(tuple_size); - for (size_t i = 0; i < tuple_size; ++i) - tuple_columns[i] = header.safeGetByPosition(desc.column_numbers[i]).column->cloneEmpty(); - - columns.emplace_back(ColumnTuple::create(std::move(tuple_columns))); - } - else - { - const auto & type = desc.nested_type ? 
desc.nested_type : desc.real_type; - columns.emplace_back(type->createColumn()); - } - } - - for (const auto & column_number : def.column_numbers_not_to_aggregate) - columns.emplace_back(header.safeGetByPosition(column_number).type->createColumn()); - - return columns; -} - static void preprocessChunk(Chunk & chunk, const SummingSortedAlgorithm::ColumnsDefinition & def) { auto num_rows = chunk.getNumRows(); @@ -504,11 +471,44 @@ static void setRow(Row & row, const ColumnRawPtrs & raw_columns, size_t row_num, } -SummingSortedAlgorithm::SummingMergedData::SummingMergedData( - MutableColumns columns_, UInt64 max_block_size_rows_, UInt64 max_block_size_bytes_, ColumnsDefinition & def_) - : MergedData(std::move(columns_), false, max_block_size_rows_, max_block_size_bytes_) +SummingSortedAlgorithm::SummingMergedData::SummingMergedData(UInt64 max_block_size_rows_, UInt64 max_block_size_bytes_, ColumnsDefinition & def_) + : MergedData(false, max_block_size_rows_, max_block_size_bytes_) , def(def_) { +} + +void SummingSortedAlgorithm::SummingMergedData::initialize(const DB::Block & header, const IMergingAlgorithm::Inputs & inputs) +{ + MergedData::initialize(header, inputs); + + MutableColumns new_columns; + size_t num_columns = def.column_numbers_not_to_aggregate.size() + def.columns_to_aggregate.size(); + new_columns.reserve(num_columns); + + for (const auto & desc : def.columns_to_aggregate) + { + // Wrap aggregated columns in a tuple to match function signature + if (!desc.is_agg_func_type && !desc.is_simple_agg_func_type && isTuple(desc.function->getResultType())) + { + size_t tuple_size = desc.column_numbers.size(); + MutableColumns tuple_columns(tuple_size); + for (size_t i = 0; i < tuple_size; ++i) + tuple_columns[i] = std::move(columns[desc.column_numbers[i]]); + + new_columns.emplace_back(ColumnTuple::create(std::move(tuple_columns))); + } + else + { + const auto & type = desc.nested_type ? 
desc.nested_type : desc.real_type; + new_columns.emplace_back(type->createColumn()); + } + } + + for (const auto & column_number : def.column_numbers_not_to_aggregate) + new_columns.emplace_back(std::move(columns[column_number])); + + columns = std::move(new_columns); + current_row.resize(def.column_names.size()); initAggregateDescription(); @@ -698,12 +698,14 @@ SummingSortedAlgorithm::SummingSortedAlgorithm( size_t max_block_size_bytes) : IMergingAlgorithmWithDelayedChunk(header_, num_inputs, std::move(description_)) , columns_definition(defineColumns(header_, description, column_names_to_sum, partition_key_columns)) - , merged_data(getMergedDataColumns(header_, columns_definition), max_block_size_rows, max_block_size_bytes, columns_definition) + , merged_data(max_block_size_rows, max_block_size_bytes, columns_definition) { } void SummingSortedAlgorithm::initialize(Inputs inputs) { + merged_data.initialize(header, inputs); + for (auto & input : inputs) if (input.chunk) preprocessChunk(input.chunk, columns_definition); diff --git a/src/Processors/Merges/Algorithms/SummingSortedAlgorithm.h b/src/Processors/Merges/Algorithms/SummingSortedAlgorithm.h index dbbe4e53a5f..664b171c4b9 100644 --- a/src/Processors/Merges/Algorithms/SummingSortedAlgorithm.h +++ b/src/Processors/Merges/Algorithms/SummingSortedAlgorithm.h @@ -65,7 +65,9 @@ public: using MergedData::insertRow; public: - SummingMergedData(MutableColumns columns_, UInt64 max_block_size_rows, UInt64 max_block_size_bytes_, ColumnsDefinition & def_); + SummingMergedData(UInt64 max_block_size_rows, UInt64 max_block_size_bytes_, ColumnsDefinition & def_); + + void initialize(const Block & header, const IMergingAlgorithm::Inputs & inputs) override; void startGroup(ColumnRawPtrs & raw_columns, size_t row); void finishGroup(); diff --git a/src/Processors/Merges/Algorithms/VersionedCollapsingAlgorithm.cpp b/src/Processors/Merges/Algorithms/VersionedCollapsingAlgorithm.cpp index e7a431dc1d0..9f124c6ba18 100644 --- a/src/Processors/Merges/Algorithms/VersionedCollapsingAlgorithm.cpp +++ b/src/Processors/Merges/Algorithms/VersionedCollapsingAlgorithm.cpp @@ -16,8 +16,7 @@ VersionedCollapsingAlgorithm::VersionedCollapsingAlgorithm( size_t max_block_size_bytes_, WriteBuffer * out_row_sources_buf_, bool use_average_block_sizes) - : IMergingAlgorithmWithSharedChunks(header_, num_inputs, std::move(description_), out_row_sources_buf_, MAX_ROWS_IN_MULTIVERSION_QUEUE) - , merged_data(header_.cloneEmptyColumns(), use_average_block_sizes, max_block_size_rows_, max_block_size_bytes_) + : IMergingAlgorithmWithSharedChunks(header_, num_inputs, std::move(description_), out_row_sources_buf_, MAX_ROWS_IN_MULTIVERSION_QUEUE, std::make_unique(use_average_block_sizes, max_block_size_rows_, max_block_size_bytes_)) /// -1 for +1 in FixedSizeDequeWithGaps's internal buffer. 3 is a reasonable minimum size to collapse anything. 
, max_rows_in_queue(std::min(std::max(3, max_block_size_rows_), MAX_ROWS_IN_MULTIVERSION_QUEUE) - 1) , current_keys(max_rows_in_queue) @@ -47,7 +46,7 @@ void VersionedCollapsingAlgorithm::insertGap(size_t gap_size) void VersionedCollapsingAlgorithm::insertRow(size_t skip_rows, const RowRef & row) { - merged_data.insertRow(*row.all_columns, row.row_num, row.owned_chunk->getNumRows()); + merged_data->insertRow(*row.all_columns, row.row_num, row.owned_chunk->getNumRows()); insertGap(skip_rows); @@ -104,8 +103,8 @@ IMergingAlgorithm::Status VersionedCollapsingAlgorithm::merge() --num_rows_to_insert; /// It's ok to return here, because we didn't affect queue. - if (merged_data.hasEnoughRows()) - return Status(merged_data.pull()); + if (merged_data->hasEnoughRows()) + return Status(merged_data->pull()); } if (current_keys.empty()) @@ -147,13 +146,13 @@ IMergingAlgorithm::Status VersionedCollapsingAlgorithm::merge() insertRow(gap, row); current_keys.popFront(); - if (merged_data.hasEnoughRows()) - return Status(merged_data.pull()); + if (merged_data->hasEnoughRows()) + return Status(merged_data->pull()); } /// Write information about last collapsed rows. insertGap(current_keys.frontGap()); - return Status(merged_data.pull(), true); + return Status(merged_data->pull(), true); } } diff --git a/src/Processors/Merges/Algorithms/VersionedCollapsingAlgorithm.h b/src/Processors/Merges/Algorithms/VersionedCollapsingAlgorithm.h index d98529b301c..e6d20ddac75 100644 --- a/src/Processors/Merges/Algorithms/VersionedCollapsingAlgorithm.h +++ b/src/Processors/Merges/Algorithms/VersionedCollapsingAlgorithm.h @@ -29,8 +29,6 @@ public: Status merge() override; private: - MergedData merged_data; - size_t sign_column_number = 0; const size_t max_rows_in_queue; diff --git a/src/Processors/Transforms/ColumnGathererTransform.cpp b/src/Processors/Transforms/ColumnGathererTransform.cpp index b2e8e9bc89e..6736cd59e83 100644 --- a/src/Processors/Transforms/ColumnGathererTransform.cpp +++ b/src/Processors/Transforms/ColumnGathererTransform.cpp @@ -32,15 +32,23 @@ ColumnGathererStream::ColumnGathererStream( void ColumnGathererStream::initialize(Inputs inputs) { + Columns source_columns; + source_columns.reserve(inputs.size()); for (size_t i = 0; i < inputs.size(); ++i) { if (inputs[i].chunk) { sources[i].update(inputs[i].chunk.detachColumns().at(0)); - if (!result_column) - result_column = sources[i].column->cloneEmpty(); + source_columns.push_back(sources[i].column); } } + + if (source_columns.empty()) + return; + + result_column = source_columns[0]->cloneEmpty(); + if (result_column->hasDynamicStructure()) + result_column->takeDynamicStructureFromSourceColumns(source_columns); } IMergingAlgorithm::Status ColumnGathererStream::merge() @@ -52,7 +60,16 @@ IMergingAlgorithm::Status ColumnGathererStream::merge() if (source_to_fully_copy) /// Was set on a previous iteration { Chunk res; - res.addColumn(source_to_fully_copy->column); + if (result_column->hasDynamicStructure()) + { + auto col = result_column->cloneEmpty(); + col->insertRangeFrom(*source_to_fully_copy->column, 0, source_to_fully_copy->column->size()); + res.addColumn(std::move(col)); + } + else + { + res.addColumn(source_to_fully_copy->column); + } merged_rows += source_to_fully_copy->size; source_to_fully_copy->pos = source_to_fully_copy->size; source_to_fully_copy = nullptr; @@ -96,7 +113,16 @@ IMergingAlgorithm::Status ColumnGathererStream::merge() Chunk res; merged_rows += source_to_fully_copy->column->size(); merged_bytes += 
source_to_fully_copy->column->allocatedBytes(); - res.addColumn(source_to_fully_copy->column); + if (result_column->hasDynamicStructure()) + { + auto col = result_column->cloneEmpty(); + col->insertRangeFrom(*source_to_fully_copy->column, 0, source_to_fully_copy->column->size()); + res.addColumn(std::move(col)); + } + else + { + res.addColumn(source_to_fully_copy->column); + } source_to_fully_copy->pos = source_to_fully_copy->size; source_to_fully_copy = nullptr; return Status(std::move(res)); diff --git a/src/Storages/AlterCommands.cpp b/src/Storages/AlterCommands.cpp index eae5e1a8a47..db6a4d9f06e 100644 --- a/src/Storages/AlterCommands.cpp +++ b/src/Storages/AlterCommands.cpp @@ -1288,7 +1288,7 @@ void AlterCommands::validate(const StoragePtr & table, ContextPtr context) const /// Looks like there is something around default expression for this column (method `getDefault` is not implemented for the data type Object). /// But after ALTER TABLE ADD COLUMN we need to fill existing rows with something (exactly the default value). /// So we don't allow to do it for now. - if (command.data_type->hasDynamicSubcolumns()) + if (command.data_type->hasDynamicSubcolumnsDeprecated()) throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Adding a new column of a type which has dynamic subcolumns to an existing table is not allowed. It has known bugs"); if (virtuals->tryGet(column_name, VirtualsKind::Persistent)) @@ -1366,8 +1366,8 @@ void AlterCommands::validate(const StoragePtr & table, ContextPtr context) const const GetColumnsOptions options(GetColumnsOptions::All); const auto old_data_type = all_columns.getColumn(options, column_name).type; - bool new_type_has_object = command.data_type->hasDynamicSubcolumns(); - bool old_type_has_object = old_data_type->hasDynamicSubcolumns(); + bool new_type_has_object = command.data_type->hasDynamicSubcolumnsDeprecated(); + bool old_type_has_object = old_data_type->hasDynamicSubcolumnsDeprecated(); if (new_type_has_object || old_type_has_object) throw Exception( diff --git a/src/Storages/ColumnsDescription.cpp b/src/Storages/ColumnsDescription.cpp index 16b89f24243..6f844e31970 100644 --- a/src/Storages/ColumnsDescription.cpp +++ b/src/Storages/ColumnsDescription.cpp @@ -547,7 +547,18 @@ bool ColumnsDescription::hasNested(const String & column_name) const bool ColumnsDescription::hasSubcolumn(const String & column_name) const { - return subcolumns.get<0>().count(column_name); + if (subcolumns.get<0>().count(column_name)) + return true; + + auto [ordinary_column_name, dynamic_subcolumn_name] = Nested::splitName(column_name); + auto it = columns.get<1>().find(ordinary_column_name); + if (it != columns.get<1>().end() && it->type->hasDynamicSubcolumns()) + { + if (auto dynamic_subcolumn_type = it->type->tryGetSubcolumnType(dynamic_subcolumn_name)) + return true; + } + + return false; } const ColumnDescription & ColumnsDescription::get(const String & column_name) const @@ -644,6 +655,14 @@ std::optional ColumnsDescription::tryGetColumn(const GetColumns return *jt; } + auto [ordinary_column_name, dynamic_subcolumn_name] = Nested::splitName(column_name); + it = columns.get<1>().find(ordinary_column_name); + if (it != columns.get<1>().end() && it->type->hasDynamicSubcolumns()) + { + if (auto dynamic_subcolumn_type = it->type->tryGetSubcolumnType(dynamic_subcolumn_name)) + return NameAndTypePair(ordinary_column_name, dynamic_subcolumn_name, it->type, dynamic_subcolumn_type); + } + return {}; } @@ -730,9 +749,18 @@ bool ColumnsDescription::hasAlias(const String & 
column_name) const bool ColumnsDescription::hasColumnOrSubcolumn(GetColumnsOptions::Kind kind, const String & column_name) const { auto it = columns.get<1>().find(column_name); - return (it != columns.get<1>().end() - && (defaultKindToGetKind(it->default_desc.kind) & kind)) - || hasSubcolumn(column_name); + if ((it != columns.get<1>().end() && (defaultKindToGetKind(it->default_desc.kind) & kind)) || hasSubcolumn(column_name)) + return true; + + auto [ordinary_column_name, dynamic_subcolumn_name] = Nested::splitName(column_name); + it = columns.get<1>().find(ordinary_column_name); + if (it != columns.get<1>().end() && it->type->hasDynamicSubcolumns()) + { + if (auto dynamic_subcolumn_type = it->type->hasSubcolumn(dynamic_subcolumn_name)) + return true; + } + + return false; } bool ColumnsDescription::hasColumnOrNested(GetColumnsOptions::Kind kind, const String & column_name) const diff --git a/src/Storages/HDFS/StorageHDFS.h b/src/Storages/HDFS/StorageHDFS.h index b14bb7f997b..785ddcd18f8 100644 --- a/src/Storages/HDFS/StorageHDFS.h +++ b/src/Storages/HDFS/StorageHDFS.h @@ -79,6 +79,8 @@ public: bool supportsSubcolumns() const override { return true; } + bool supportsDynamicSubcolumns() const override { return true; } + static ColumnsDescription getTableStructureFromData( const String & format, const String & uri, diff --git a/src/Storages/HDFS/StorageHDFSCluster.h b/src/Storages/HDFS/StorageHDFSCluster.h index 26ebc8601ee..448b4be6c96 100644 --- a/src/Storages/HDFS/StorageHDFSCluster.h +++ b/src/Storages/HDFS/StorageHDFSCluster.h @@ -36,6 +36,8 @@ public: bool supportsSubcolumns() const override { return true; } + bool supportsDynamicSubcolumns() const override { return true; } + bool supportsTrivialCountOptimization() const override { return true; } private: diff --git a/src/Storages/IStorage.h b/src/Storages/IStorage.h index 1108eafc6b6..5a23fcceeb9 100644 --- a/src/Storages/IStorage.h +++ b/src/Storages/IStorage.h @@ -172,8 +172,10 @@ public: /// This method can return true for readonly engines that return the same rows for reading (such as SystemNumbers) virtual bool supportsTransactions() const { return false; } + /// Returns true if the storage supports storing of data type Object. + virtual bool supportsDynamicSubcolumnsDeprecated() const { return false; } + /// Returns true if the storage supports storing of dynamic subcolumns. - /// For now it makes sense only for data type Object. virtual bool supportsDynamicSubcolumns() const { return false; } /// Requires squashing small blocks to large for optimal storage. 
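The ColumnsDescription and TreeRewriter changes above make dynamic subcolumns of a Dynamic column resolvable by name, so a typed subcolumn can be read directly. A minimal sketch, assuming a hypothetical table events with a Dynamic column payload (neither name is part of the patch):

    SET allow_experimental_dynamic_type = 1;

    CREATE TABLE events (id UInt64, payload Dynamic) ENGINE = MergeTree ORDER BY id;
    INSERT INTO events VALUES (1, 'click'), (2, 7), (3, NULL);

    -- payload.String is not a declared column; it is resolved as a dynamic
    -- subcolumn of payload and yields NULL for rows stored with another type.
    SELECT id, payload.String FROM events;
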
diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 570175f6614..2e2d1dbed4d 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -2392,6 +2392,36 @@ void IMergeTreeDataPart::setBrokenReason(const String & message, int code) const exception_code = code; } +ColumnPtr IMergeTreeDataPart::readColumnSample(const NameAndTypePair & column) const +{ + const size_t total_mark = getMarksCount(); + if (!total_mark) + return column.type->createColumn(); + + NamesAndTypesList cols; + cols.emplace_back(column); + + StorageMetadataPtr metadata_ptr = storage.getInMemoryMetadataPtr(); + StorageSnapshotPtr storage_snapshot_ptr = std::make_shared(storage, metadata_ptr); + + MergeTreeReaderPtr reader = getReader( + cols, + storage_snapshot_ptr, + MarkRanges{MarkRange(0, 1)}, + /*virtual_fields=*/ {}, + /*uncompressed_cache=*/{}, + storage.getContext()->getMarkCache().get(), + std::make_shared(), + MergeTreeReaderSettings{}, + ValueSizeMap{}, + ReadBufferFromFileBase::ProfileCallback{}); + + Columns result; + result.resize(1); + reader->readRows(0, 1, false, 0, result); + return result[0]; +} + bool isCompactPart(const MergeTreeDataPartPtr & data_part) { return (data_part && data_part->getType() == MergeTreeDataPartType::Compact); diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index 7519980a7a3..78619f216c0 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -166,6 +166,8 @@ public: NameAndTypePair getColumn(const String & name) const; std::optional tryGetColumn(const String & column_name) const; + ColumnPtr readColumnSample(const NameAndTypePair & column) const; + const SerializationInfoByName & getSerializationInfos() const { return serialization_infos; } SerializationPtr getSerialization(const String & column_name) const; diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 08a2ff89e7b..c47297be84d 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -3660,7 +3660,7 @@ void MergeTreeData::checkPartDynamicColumns(MutableDataPartPtr & part, DataParts continue; auto storage_column = columns.getPhysical(part_column.name); - if (!storage_column.type->hasDynamicSubcolumns()) + if (!storage_column.type->hasDynamicSubcolumnsDeprecated()) continue; auto concrete_storage_column = object_columns.getPhysical(part_column.name); diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index 046376be474..089793beab8 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -434,6 +434,7 @@ public: bool supportsTTL() const override { return true; } + bool supportsDynamicSubcolumnsDeprecated() const override { return true; } bool supportsDynamicSubcolumns() const override { return true; } bool supportsLightweightDelete() const override; diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp index 1605e5cdb9a..d0a685d95fc 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp @@ -44,21 +44,27 @@ MergeTreeDataPartWriterCompact::MergeTreeDataPartWriterCompact( marks_source_hashing = std::make_unique(*marks_compressor); } +} + +void 
MergeTreeDataPartWriterCompact::initStreamsIfNeeded(const Block & block) +{ + if (!compressed_streams.empty()) + return; auto storage_snapshot = std::make_shared(data_part->storage, metadata_snapshot); for (const auto & column : columns_list) { auto compression = storage_snapshot->getCodecDescOrDefault(column.name, default_codec); - addStreams(column, compression); + addStreams(column, block.getByName(column.name).column, compression); } } -void MergeTreeDataPartWriterCompact::addStreams(const NameAndTypePair & column, const ASTPtr & effective_codec_desc) +void MergeTreeDataPartWriterCompact::addStreams(const NameAndTypePair & name_and_type, const ColumnPtr & column, const ASTPtr & effective_codec_desc) { ISerialization::StreamCallback callback = [&](const auto & substream_path) { assert(!substream_path.empty()); - String stream_name = ISerialization::getFileNameForStream(column, substream_path); + String stream_name = ISerialization::getFileNameForStream(name_and_type, substream_path); /// Shared offsets for Nested type. if (compressed_streams.contains(stream_name)) @@ -81,7 +87,7 @@ void MergeTreeDataPartWriterCompact::addStreams(const NameAndTypePair & column, compressed_streams.emplace(stream_name, stream); }; - data_part->getSerialization(column.name)->enumerateStreams(callback, column.type); + data_part->getSerialization(name_and_type.name)->enumerateStreams(callback, name_and_type.type, column); } namespace @@ -138,6 +144,7 @@ void writeColumnSingleGranule( serialize_settings.getter = stream_getter; serialize_settings.position_independent_encoding = true; serialize_settings.low_cardinality_max_dictionary_size = 0; + serialize_settings.dynamic_write_statistics = ISerialization::SerializeBinaryBulkSettings::DynamicStatisticsMode::PREFIX; serialization->serializeBinaryBulkStatePrefix(*column.column, serialize_settings, state); serialization->serializeBinaryBulkWithMultipleStreams(*column.column, from_row, number_of_rows, serialize_settings, state); @@ -148,6 +155,8 @@ void writeColumnSingleGranule( void MergeTreeDataPartWriterCompact::write(const Block & block, const IColumn::Permutation * permutation) { + initStreamsIfNeeded(block); + /// Fill index granularity for this block /// if it's unknown (in case of insert data or horizontal merge, /// but not in case of vertical merge) diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.h b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.h index ddb6178dce6..1c748803c52 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.h +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.h @@ -42,7 +42,9 @@ private: void addToChecksums(MergeTreeDataPartChecksums & checksums); - void addStreams(const NameAndTypePair & column, const ASTPtr & effective_codec_desc); + void addStreams(const NameAndTypePair & name_and_type, const ColumnPtr & column, const ASTPtr & effective_codec_desc); + + void initStreamsIfNeeded(const Block & block); Block header; diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp index 6a3b08d4d65..c23a9a81cbc 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp @@ -89,16 +89,25 @@ MergeTreeDataPartWriterWide::MergeTreeDataPartWriterWide( indices_to_recalc_, stats_to_recalc_, marks_file_extension_, default_codec_, settings_, index_granularity_) { +} + +void MergeTreeDataPartWriterWide::initStreamsIfNeeded(const DB::Block & block) +{ + if 
(!column_streams.empty()) + return; + + block_sample = block.cloneEmpty(); auto storage_snapshot = std::make_shared(data_part->storage, metadata_snapshot); for (const auto & column : columns_list) { auto compression = storage_snapshot->getCodecDescOrDefault(column.name, default_codec); - addStreams(column, compression); + addStreams(column, block_sample.getByName(column.name).column, compression); } } void MergeTreeDataPartWriterWide::addStreams( - const NameAndTypePair & column, + const NameAndTypePair & name_and_type, + const ColumnPtr & column, const ASTPtr & effective_codec_desc) { ISerialization::StreamCallback callback = [&](const auto & substream_path) @@ -106,7 +115,7 @@ void MergeTreeDataPartWriterWide::addStreams( assert(!substream_path.empty()); auto storage_settings = storage.getSettings(); - auto full_stream_name = ISerialization::getFileNameForStream(column, substream_path); + auto full_stream_name = ISerialization::getFileNameForStream(name_and_type, substream_path); String stream_name; if (storage_settings->replace_long_file_name_to_hash && full_stream_name.size() > storage_settings->max_file_name_length) @@ -138,7 +147,7 @@ void MergeTreeDataPartWriterWide::addStreams( auto ast = parseQuery(codec_parser, "(" + Poco::toUpper(settings.marks_compression_codec) + ")", 0, DBMS_DEFAULT_MAX_PARSER_DEPTH, DBMS_DEFAULT_MAX_PARSER_BACKTRACKS); CompressionCodecPtr marks_compression_codec = CompressionCodecFactory::instance().get(ast, nullptr); - const auto column_desc = metadata_snapshot->columns.tryGetColumnDescription(GetColumnsOptions(GetColumnsOptions::AllPhysical), column.getNameInStorage()); + const auto column_desc = metadata_snapshot->columns.tryGetColumnDescription(GetColumnsOptions(GetColumnsOptions::AllPhysical), name_and_type.getNameInStorage()); UInt64 max_compress_block_size = 0; if (column_desc) @@ -163,7 +172,7 @@ void MergeTreeDataPartWriterWide::addStreams( }; ISerialization::SubstreamPath path; - data_part->getSerialization(column.name)->enumerateStreams(callback, column.type); + data_part->getSerialization(name_and_type.name)->enumerateStreams(callback, name_and_type.type, column); } const String & MergeTreeDataPartWriterWide::getStreamName( @@ -222,6 +231,8 @@ void MergeTreeDataPartWriterWide::shiftCurrentMark(const Granules & granules_wri void MergeTreeDataPartWriterWide::write(const Block & block, const IColumn::Permutation * permutation) { + initStreamsIfNeeded(block); + /// Fill index granularity for this block /// if it's unknown (in case of insert data or horizontal merge, /// but not in case of vertical part of vertical merge) @@ -302,11 +313,12 @@ void MergeTreeDataPartWriterWide::write(const Block & block, const IColumn::Perm } void MergeTreeDataPartWriterWide::writeSingleMark( - const NameAndTypePair & column, + const NameAndTypePair & name_and_type, WrittenOffsetColumns & offset_columns, size_t number_of_rows) { - StreamsWithMarks marks = getCurrentMarksForColumn(column, offset_columns); + auto * sample_column = block_sample.findByName(name_and_type.name); + StreamsWithMarks marks = getCurrentMarksForColumn(name_and_type, sample_column ? 
sample_column->column : nullptr, offset_columns); for (const auto & mark : marks) flushMarkToFile(mark, number_of_rows); } @@ -323,21 +335,22 @@ void MergeTreeDataPartWriterWide::flushMarkToFile(const StreamNameAndMark & stre } StreamsWithMarks MergeTreeDataPartWriterWide::getCurrentMarksForColumn( - const NameAndTypePair & column, + const NameAndTypePair & name_and_type, + const ColumnPtr & column_sample, WrittenOffsetColumns & offset_columns) { StreamsWithMarks result; - const auto column_desc = metadata_snapshot->columns.tryGetColumnDescription(GetColumnsOptions(GetColumnsOptions::AllPhysical), column.getNameInStorage()); + const auto column_desc = metadata_snapshot->columns.tryGetColumnDescription(GetColumnsOptions(GetColumnsOptions::AllPhysical), name_and_type.getNameInStorage()); UInt64 min_compress_block_size = 0; if (column_desc) if (const auto * value = column_desc->settings.tryGet("min_compress_block_size")) min_compress_block_size = value->safeGet(); if (!min_compress_block_size) min_compress_block_size = settings.min_compress_block_size; - data_part->getSerialization(column.name)->enumerateStreams([&] (const ISerialization::SubstreamPath & substream_path) + data_part->getSerialization(name_and_type.name)->enumerateStreams([&] (const ISerialization::SubstreamPath & substream_path) { bool is_offsets = !substream_path.empty() && substream_path.back().type == ISerialization::Substream::ArraySizes; - auto stream_name = getStreamName(column, substream_path); + auto stream_name = getStreamName(name_and_type, substream_path); /// Don't write offsets more than one time for Nested type. if (is_offsets && offset_columns.contains(stream_name)) @@ -355,7 +368,7 @@ StreamsWithMarks MergeTreeDataPartWriterWide::getCurrentMarksForColumn( stream_with_mark.mark.offset_in_decompressed_block = stream.compressed_hashing.offset(); result.push_back(stream_with_mark); - }); + }, name_and_type.type, column_sample); return result; } @@ -382,7 +395,7 @@ void MergeTreeDataPartWriterWide::writeSingleGranule( return; column_streams.at(stream_name)->compressed_hashing.nextIfAtEnd(); - }); + }, name_and_type.type, column.getPtr()); } /// Column must not be empty. (column.size() !== 0) @@ -424,7 +437,7 @@ void MergeTreeDataPartWriterWide::writeColumn( "We have to add new mark for column, but already have non written mark. " "Current mark {}, total marks {}, offset {}", getCurrentMark(), index_granularity.getMarksCount(), rows_written_in_last_mark); - last_non_written_marks[name] = getCurrentMarksForColumn(name_and_type, offset_columns); + last_non_written_marks[name] = getCurrentMarksForColumn(name_and_type, column.getPtr(), offset_columns); } writeSingleGranule( @@ -453,7 +466,7 @@ void MergeTreeDataPartWriterWide::writeColumn( bool is_offsets = !substream_path.empty() && substream_path.back().type == ISerialization::Substream::ArraySizes; if (is_offsets) offset_columns.insert(getStreamName(name_and_type, substream_path)); - }); + }, name_and_type.type, column.getPtr()); } @@ -622,6 +635,7 @@ void MergeTreeDataPartWriterWide::fillDataChecksums(IMergeTreeDataPart::Checksum if (!serialization_states.empty()) { serialize_settings.getter = createStreamGetter(*it, written_offset_columns ? 
*written_offset_columns : offset_columns); + serialize_settings.dynamic_write_statistics = ISerialization::SerializeBinaryBulkSettings::DynamicStatisticsMode::SUFFIX; data_part->getSerialization(it->name)->serializeBinaryBulkStateSuffix(serialize_settings, serialization_states[it->name]); } @@ -703,17 +717,17 @@ void MergeTreeDataPartWriterWide::finish(bool sync) } void MergeTreeDataPartWriterWide::writeFinalMark( - const NameAndTypePair & column, + const NameAndTypePair & name_and_type, WrittenOffsetColumns & offset_columns) { - writeSingleMark(column, offset_columns, 0); + writeSingleMark(name_and_type, offset_columns, 0); /// Memoize information about offsets - data_part->getSerialization(column.name)->enumerateStreams([&] (const ISerialization::SubstreamPath & substream_path) + data_part->getSerialization(name_and_type.name)->enumerateStreams([&] (const ISerialization::SubstreamPath & substream_path) { bool is_offsets = !substream_path.empty() && substream_path.back().type == ISerialization::Substream::ArraySizes; if (is_offsets) - offset_columns.insert(getStreamName(column, substream_path)); - }); + offset_columns.insert(getStreamName(name_and_type, substream_path)); + }, name_and_type.type, block_sample.getByName(name_and_type.name).column); } static void fillIndexGranularityImpl( diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h index f5ff323563d..ebdd907914f 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h @@ -63,7 +63,8 @@ private: /// Take offsets from column and return as MarkInCompressed file with stream name StreamsWithMarks getCurrentMarksForColumn( - const NameAndTypePair & column, + const NameAndTypePair & name_and_type, + const ColumnPtr & column_sample, WrittenOffsetColumns & offset_columns); /// Write mark to disk using stream and rows count @@ -73,18 +74,21 @@ private: /// Write mark for column taking offsets from column stream void writeSingleMark( - const NameAndTypePair & column, + const NameAndTypePair & name_and_type, WrittenOffsetColumns & offset_columns, size_t number_of_rows); void writeFinalMark( - const NameAndTypePair & column, + const NameAndTypePair & name_and_type, WrittenOffsetColumns & offset_columns); void addStreams( - const NameAndTypePair & column, + const NameAndTypePair & name_and_type, + const ColumnPtr & column, const ASTPtr & effective_codec_desc); + void initStreamsIfNeeded(const Block & block); + /// Method for self check (used in debug-build only). Checks that written /// data and corresponding marks are consistent. Otherwise throws logical /// errors. @@ -129,6 +133,8 @@ private: /// How many rows we have already written in the current mark. /// More than zero when incoming blocks are smaller then their granularity. 
size_t rows_written_in_last_mark = 0; + + Block block_sample; }; } diff --git a/src/Storages/MergeTree/MergeTreeDataWriter.cpp b/src/Storages/MergeTree/MergeTreeDataWriter.cpp index cadd94867ec..ad60e31dddc 100644 --- a/src/Storages/MergeTree/MergeTreeDataWriter.cpp +++ b/src/Storages/MergeTree/MergeTreeDataWriter.cpp @@ -422,7 +422,7 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeTempPartImpl( auto columns = metadata_snapshot->getColumns().getAllPhysical().filter(block.getNames()); for (auto & column : columns) - if (column.type->hasDynamicSubcolumns()) + if (column.type->hasDynamicSubcolumnsDeprecated()) column.type = block.getByName(column.name).type; auto minmax_idx = std::make_shared(); diff --git a/src/Storages/MergeTree/MergeTreeIndexSet.cpp b/src/Storages/MergeTree/MergeTreeIndexSet.cpp index dba2bc1e56c..02a3f1b1165 100644 --- a/src/Storages/MergeTree/MergeTreeIndexSet.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexSet.cpp @@ -116,7 +116,7 @@ void MergeTreeIndexGranuleSet::deserializeBinary(ReadBuffer & istr, MergeTreeInd ISerialization::DeserializeBinaryBulkStatePtr state; auto serialization = type->getDefaultSerialization(); - serialization->deserializeBinaryBulkStatePrefix(settings, state); + serialization->deserializeBinaryBulkStatePrefix(settings, state, nullptr); serialization->deserializeBinaryBulkWithMultipleStreams(new_column, rows_to_read, settings, state, nullptr); block.insert(ColumnWithTypeAndName(new_column, type, column.name)); diff --git a/src/Storages/MergeTree/MergeTreeReaderCompact.cpp b/src/Storages/MergeTree/MergeTreeReaderCompact.cpp index a22bff6b8d2..7504ce3cc5f 100644 --- a/src/Storages/MergeTree/MergeTreeReaderCompact.cpp +++ b/src/Storages/MergeTree/MergeTreeReaderCompact.cpp @@ -195,7 +195,7 @@ void MergeTreeReaderCompact::readPrefix( deserialize_settings.getter = buffer_getter_for_prefix; ISerialization::DeserializeBinaryBulkStatePtr state_for_prefix; - serialization_for_prefix->deserializeBinaryBulkStatePrefix(deserialize_settings, state_for_prefix); + serialization_for_prefix->deserializeBinaryBulkStatePrefix(deserialize_settings, state_for_prefix, nullptr); } SerializationPtr serialization; @@ -206,7 +206,8 @@ void MergeTreeReaderCompact::readPrefix( deserialize_settings.getter = buffer_getter; - serialization->deserializeBinaryBulkStatePrefix(deserialize_settings, deserialize_binary_bulk_state_map[name_and_type.name]); + deserialize_settings.dynamic_read_statistics = true; + serialization->deserializeBinaryBulkStatePrefix(deserialize_settings, deserialize_binary_bulk_state_map[name_and_type.name], nullptr); } catch (Exception & e) { diff --git a/src/Storages/MergeTree/MergeTreeReaderWide.cpp b/src/Storages/MergeTree/MergeTreeReaderWide.cpp index 394a22835f1..c8bf12436b0 100644 --- a/src/Storages/MergeTree/MergeTreeReaderWide.cpp +++ b/src/Storages/MergeTree/MergeTreeReaderWide.cpp @@ -1,4 +1,5 @@ #include +#include #include #include @@ -43,11 +44,13 @@ MergeTreeReaderWide::MergeTreeReaderWide( mark_ranges_, settings_, avg_value_size_hints_) + , profile_callback(profile_callback_) + , clock_type(clock_type_) { try { for (size_t i = 0; i < columns_to_read.size(); ++i) - addStreams(columns_to_read[i], serializations[i], profile_callback_, clock_type_); + addStreams(columns_to_read[i], serializations[i]); } catch (...) 
{ @@ -100,9 +103,10 @@ void MergeTreeReaderWide::prefetchForAllColumns( try { auto & cache = caches[columns_to_read[pos].getNameInStorage()]; + auto & deserialize_states_cache = deserialize_states_caches[columns_to_read[pos].getNameInStorage()]; prefetchForColumn( priority, columns_to_read[pos], serializations[pos], from_mark, continue_reading, - current_task_last_mark, cache); + current_task_last_mark, cache, deserialize_states_cache); } catch (Exception & e) { @@ -147,11 +151,12 @@ size_t MergeTreeReaderWide::readRows( { size_t column_size_before_reading = column->size(); auto & cache = caches[column_to_read.getNameInStorage()]; + auto & deserialize_states_cache = deserialize_states_caches[column_to_read.getNameInStorage()]; readData( column_to_read, serializations[pos], column, from_mark, continue_reading, current_task_last_mark, - max_rows_to_read, cache, /* was_prefetched =*/ !prefetched_streams.empty()); + max_rows_to_read, cache, deserialize_states_cache, /* was_prefetched =*/ !prefetched_streams.empty()); /// For elements of Nested, column_size_before_reading may be greater than column size /// if offsets are not empty and were already read, but elements are empty. @@ -199,9 +204,7 @@ size_t MergeTreeReaderWide::readRows( void MergeTreeReaderWide::addStreams( const NameAndTypePair & name_and_type, - const SerializationPtr & serialization, - const ReadBufferFromFileBase::ProfileCallback & profile_callback, - clockid_t clock_type) + const SerializationPtr & serialization) { bool has_any_stream = false; bool has_all_streams = true; @@ -225,29 +228,8 @@ void MergeTreeReaderWide::addStreams( return; } - auto context = data_part_info_for_read->getContext(); - auto * load_marks_threadpool = settings.read_settings.load_marks_asynchronously ? &context->getLoadMarksThreadpool() : nullptr; - - auto marks_loader = std::make_shared( - data_part_info_for_read, - mark_cache, - data_part_info_for_read->getIndexGranularityInfo().getMarksFilePath(*stream_name), - data_part_info_for_read->getMarksCount(), - data_part_info_for_read->getIndexGranularityInfo(), - settings.save_marks_in_cache, - settings.read_settings, - load_marks_threadpool, - /*num_columns_in_mark=*/ 1); - + addStream(substream_path, *stream_name); has_any_stream = true; - auto stream_settings = settings; - stream_settings.is_low_cardinality_dictionary = substream_path.size() > 1 && substream_path[substream_path.size() - 2].type == ISerialization::Substream::Type::DictionaryKeys; - - streams.emplace(*stream_name, std::make_unique( - data_part_info_for_read->getDataPartStorage(), *stream_name, DATA_FILE_EXTENSION, - data_part_info_for_read->getMarksCount(), all_mark_ranges, stream_settings, - uncompressed_cache, data_part_info_for_read->getFileSizeOrZero(*stream_name + DATA_FILE_EXTENSION), - std::move(marks_loader), profile_callback, clock_type)); }; serialization->enumerateStreams(callback); @@ -256,11 +238,36 @@ void MergeTreeReaderWide::addStreams( partially_read_columns.insert(name_and_type.name); } -static ReadBuffer * getStream( +MergeTreeReaderWide::FileStreams::iterator MergeTreeReaderWide::addStream(const ISerialization::SubstreamPath & substream_path, const String & stream_name) +{ + auto context = data_part_info_for_read->getContext(); + auto * load_marks_threadpool = settings.read_settings.load_marks_asynchronously ? 
&context->getLoadMarksThreadpool() : nullptr; + + auto marks_loader = std::make_shared( + data_part_info_for_read, + mark_cache, + data_part_info_for_read->getIndexGranularityInfo().getMarksFilePath(stream_name), + data_part_info_for_read->getMarksCount(), + data_part_info_for_read->getIndexGranularityInfo(), + settings.save_marks_in_cache, + settings.read_settings, + load_marks_threadpool, + /*num_columns_in_mark=*/ 1); + + auto stream_settings = settings; + stream_settings.is_low_cardinality_dictionary = substream_path.size() > 1 && substream_path[substream_path.size() - 2].type == ISerialization::Substream::Type::DictionaryKeys; + + return streams.emplace(stream_name, std::make_unique( + data_part_info_for_read->getDataPartStorage(), stream_name, DATA_FILE_EXTENSION, + data_part_info_for_read->getMarksCount(), all_mark_ranges, stream_settings, + uncompressed_cache, data_part_info_for_read->getFileSizeOrZero(stream_name + DATA_FILE_EXTENSION), + std::move(marks_loader), profile_callback, clock_type)).first; +} + +ReadBuffer * MergeTreeReaderWide::getStream( bool seek_to_start, const ISerialization::SubstreamPath & substream_path, const MergeTreeDataPartChecksums & checksums, - MergeTreeReaderWide::FileStreams & streams, const NameAndTypePair & name_and_type, size_t from_mark, bool seek_to_mark, @@ -277,7 +284,13 @@ static ReadBuffer * getStream( auto it = streams.find(*stream_name); if (it == streams.end()) - return nullptr; + { + /// If we didn't create requested stream, but file with this path exists, create a stream for it. + /// It may happen during reading of columns with dynamic subcolumns, because all streams are known + /// only after deserializing of binary bulk prefix. + + it = addStream(substream_path, *stream_name); + } MergeTreeReaderStream & stream = *it->second; stream.adjustRightMark(current_task_last_mark); @@ -294,17 +307,19 @@ void MergeTreeReaderWide::deserializePrefix( const SerializationPtr & serialization, const NameAndTypePair & name_and_type, size_t current_task_last_mark, - ISerialization::SubstreamsCache & cache) + ISerialization::SubstreamsCache & cache, + ISerialization::SubstreamsDeserializeStatesCache & deserialize_states_cache) { const auto & name = name_and_type.name; if (!deserialize_binary_bulk_state_map.contains(name)) { ISerialization::DeserializeBinaryBulkSettings deserialize_settings; + deserialize_settings.dynamic_read_statistics = true; deserialize_settings.getter = [&](const ISerialization::SubstreamPath & substream_path) { - return getStream(/* seek_to_start = */true, substream_path, data_part_info_for_read->getChecksums(), streams, name_and_type, 0, /* seek_to_mark = */false, current_task_last_mark, cache); + return getStream(/* seek_to_start = */true, substream_path, data_part_info_for_read->getChecksums(), name_and_type, 0, /* seek_to_mark = */false, current_task_last_mark, cache); }; - serialization->deserializeBinaryBulkStatePrefix(deserialize_settings, deserialize_binary_bulk_state_map[name]); + serialization->deserializeBinaryBulkStatePrefix(deserialize_settings, deserialize_binary_bulk_state_map[name], &deserialize_states_cache); } } @@ -315,9 +330,10 @@ void MergeTreeReaderWide::prefetchForColumn( size_t from_mark, bool continue_reading, size_t current_task_last_mark, - ISerialization::SubstreamsCache & cache) + ISerialization::SubstreamsCache & cache, + ISerialization::SubstreamsDeserializeStatesCache & deserialize_states_cache) { - deserializePrefix(serialization, name_and_type, current_task_last_mark, cache); + 
deserializePrefix(serialization, name_and_type, current_task_last_mark, cache, deserialize_states_cache); serialization->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path) { @@ -326,7 +342,7 @@ void MergeTreeReaderWide::prefetchForColumn( if (stream_name && !prefetched_streams.contains(*stream_name)) { bool seek_to_mark = !continue_reading; - if (ReadBuffer * buf = getStream(false, substream_path, data_part_info_for_read->getChecksums(), streams, name_and_type, from_mark, seek_to_mark, current_task_last_mark, cache)) + if (ReadBuffer * buf = getStream(false, substream_path, data_part_info_for_read->getChecksums(), name_and_type, from_mark, seek_to_mark, current_task_last_mark, cache)) { buf->prefetch(priority); prefetched_streams.insert(*stream_name); @@ -337,15 +353,22 @@ void MergeTreeReaderWide::prefetchForColumn( void MergeTreeReaderWide::readData( - const NameAndTypePair & name_and_type, const SerializationPtr & serialization, ColumnPtr & column, - size_t from_mark, bool continue_reading, size_t current_task_last_mark, - size_t max_rows_to_read, ISerialization::SubstreamsCache & cache, bool was_prefetched) + const NameAndTypePair & name_and_type, + const SerializationPtr & serialization, + ColumnPtr & column, + size_t from_mark, + bool continue_reading, + size_t current_task_last_mark, + size_t max_rows_to_read, + ISerialization::SubstreamsCache & cache, + ISerialization::SubstreamsDeserializeStatesCache & deserialize_states_cache, + bool was_prefetched) { double & avg_value_size_hint = avg_value_size_hints[name_and_type.name]; ISerialization::DeserializeBinaryBulkSettings deserialize_settings; deserialize_settings.avg_value_size_hint = avg_value_size_hint; - deserializePrefix(serialization, name_and_type, current_task_last_mark, cache); + deserializePrefix(serialization, name_and_type, current_task_last_mark, cache, deserialize_states_cache); deserialize_settings.getter = [&](const ISerialization::SubstreamPath & substream_path) { @@ -353,7 +376,7 @@ void MergeTreeReaderWide::readData( return getStream( /* seek_to_start = */false, substream_path, - data_part_info_for_read->getChecksums(), streams, + data_part_info_for_read->getChecksums(), name_and_type, from_mark, seek_to_mark, current_task_last_mark, cache); }; diff --git a/src/Storages/MergeTree/MergeTreeReaderWide.h b/src/Storages/MergeTree/MergeTreeReaderWide.h index a9a5526dd65..1eef21b455b 100644 --- a/src/Storages/MergeTree/MergeTreeReaderWide.h +++ b/src/Storages/MergeTree/MergeTreeReaderWide.h @@ -45,14 +45,31 @@ private: void addStreams( const NameAndTypePair & name_and_type, - const SerializationPtr & serialization, - const ReadBufferFromFileBase::ProfileCallback & profile_callback, - clockid_t clock_type); + const SerializationPtr & serialization); + + ReadBuffer * getStream( + bool seek_to_start, + const ISerialization::SubstreamPath & substream_path, + const MergeTreeDataPartChecksums & checksums, + const NameAndTypePair & name_and_type, + size_t from_mark, + bool seek_to_mark, + size_t current_task_last_mark, + ISerialization::SubstreamsCache & cache); + + FileStreams::iterator addStream(const ISerialization::SubstreamPath & substream_path, const String & stream_name); void readData( - const NameAndTypePair & name_and_type, const SerializationPtr & serialization, ColumnPtr & column, - size_t from_mark, bool continue_reading, size_t current_task_last_mark, size_t max_rows_to_read, - ISerialization::SubstreamsCache & cache, bool was_prefetched); + const NameAndTypePair & name_and_type, + const 
SerializationPtr & serialization, + ColumnPtr & column, + size_t from_mark, + bool continue_reading, + size_t current_task_last_mark, + size_t max_rows_to_read, + ISerialization::SubstreamsCache & cache, + ISerialization::SubstreamsDeserializeStatesCache & deserialize_states_cache, + bool was_prefetched); /// Make next readData more simple by calling 'prefetch' of all related ReadBuffers (column streams). void prefetchForColumn( @@ -62,17 +79,22 @@ private: size_t from_mark, bool continue_reading, size_t current_task_last_mark, - ISerialization::SubstreamsCache & cache); + ISerialization::SubstreamsCache & cache, + ISerialization::SubstreamsDeserializeStatesCache & deserialize_states_cache); void deserializePrefix( const SerializationPtr & serialization, const NameAndTypePair & name_and_type, size_t current_task_last_mark, - ISerialization::SubstreamsCache & cache); + ISerialization::SubstreamsCache & cache, + ISerialization::SubstreamsDeserializeStatesCache & deserialize_states_cache); std::unordered_map caches; + std::unordered_map deserialize_states_caches; std::unordered_set prefetched_streams; ssize_t prefetched_from_mark = -1; + ReadBufferFromFileBase::ProfileCallback profile_callback; + clockid_t clock_type; }; } diff --git a/src/Storages/MergeTree/MergeTreeSettings.h b/src/Storages/MergeTree/MergeTreeSettings.h index 9c67a86997b..3ddd6b21ffb 100644 --- a/src/Storages/MergeTree/MergeTreeSettings.h +++ b/src/Storages/MergeTree/MergeTreeSettings.h @@ -43,6 +43,7 @@ struct Settings; M(UInt64, compact_parts_max_granules_to_buffer, 128, "Only available in ClickHouse Cloud", 0) \ M(UInt64, compact_parts_merge_max_bytes_to_prefetch_part, 16 * 1024 * 1024, "Only available in ClickHouse Cloud", 0) \ M(Bool, load_existing_rows_count_for_old_parts, false, "Whether to load existing_rows_count for existing parts. If false, existing_rows_count will be equal to rows_count for existing parts.", 0) \ + /** M(UInt64, max_types_for_dynamic_serialization, 32, "The maximum number of different types in Dynamic column stored separately in MergeTree tables in wide format. If exceeded, new types will be converted to String", 0) */ \ \ /** Merge settings. */ \ M(UInt64, merge_max_block_size, 8192, "How many rows in blocks should be formed for merge operations. By default has the same value as `index_granularity`.", 0) \ diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index f67e9484598..b2817b386fa 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -777,7 +777,13 @@ static NameToNameVector collectFilesForRenames( }; if (auto serialization = source_part->tryGetSerialization(command.column_name)) - serialization->enumerateStreams(callback); + { + auto name_and_type = source_part->getColumn(command.column_name); + ColumnPtr column_sample; + if (name_and_type.type->hasDynamicSubcolumns()) + column_sample = source_part->readColumnSample(name_and_type); + serialization->enumerateStreams(callback, name_and_type.type, column_sample); + } /// if we drop a column with statistic, we should also drop the stat file. 
if (source_part->checksums.has(STAT_FILE_PREFIX + command.column_name + STAT_FILE_SUFFIX)) @@ -813,7 +819,13 @@ static NameToNameVector collectFilesForRenames( }; if (auto serialization = source_part->tryGetSerialization(command.column_name)) - serialization->enumerateStreams(callback); + { + auto name_and_type = source_part->getColumn(command.column_name); + ColumnPtr column_sample; + if (name_and_type.type->hasDynamicSubcolumns()) + column_sample = source_part->readColumnSample(name_and_type); + serialization->enumerateStreams(callback, name_and_type.type, column_sample); + } /// if we rename a column with statistic, we should also rename the stat file. if (source_part->checksums.has(STAT_FILE_PREFIX + command.column_name + STAT_FILE_SUFFIX)) diff --git a/src/Storages/MergeTree/StorageFromMergeTreeDataPart.h b/src/Storages/MergeTree/StorageFromMergeTreeDataPart.h index ca8ed9abdb5..a94508ad41f 100644 --- a/src/Storages/MergeTree/StorageFromMergeTreeDataPart.h +++ b/src/Storages/MergeTree/StorageFromMergeTreeDataPart.h @@ -87,6 +87,7 @@ public: bool supportsPrewhere() const override { return true; } + bool supportsDynamicSubcolumnsDeprecated() const override { return true; } bool supportsDynamicSubcolumns() const override { return true; } bool supportsSubcolumns() const override { return true; } diff --git a/src/Storages/S3Queue/StorageS3Queue.h b/src/Storages/S3Queue/StorageS3Queue.h index 1f735b47819..fce6736aa07 100644 --- a/src/Storages/S3Queue/StorageS3Queue.h +++ b/src/Storages/S3Queue/StorageS3Queue.h @@ -81,6 +81,7 @@ private: void drop() override; bool supportsSubsetOfColumns(const ContextPtr & context_) const; bool supportsSubcolumns() const override { return true; } + bool supportsDynamicSubcolumns() const override { return true; } std::shared_ptr createFileIterator(ContextPtr local_context, const ActionsDAG::Node * predicate); std::shared_ptr createSource( diff --git a/src/Storages/StorageAzureBlob.h b/src/Storages/StorageAzureBlob.h index 27ac7a5c368..be0e88b9b6d 100644 --- a/src/Storages/StorageAzureBlob.h +++ b/src/Storages/StorageAzureBlob.h @@ -98,6 +98,8 @@ public: bool supportsSubcolumns() const override { return true; } + bool supportsDynamicSubcolumns() const override { return true; } + bool supportsSubsetOfColumns(const ContextPtr & context) const; bool supportsTrivialCountOptimization() const override { return true; } diff --git a/src/Storages/StorageAzureBlobCluster.h b/src/Storages/StorageAzureBlobCluster.h index 545e568a772..9521ae4d24e 100644 --- a/src/Storages/StorageAzureBlobCluster.h +++ b/src/Storages/StorageAzureBlobCluster.h @@ -35,6 +35,8 @@ public: bool supportsSubcolumns() const override { return true; } + bool supportsDynamicSubcolumns() const override { return true; } + bool supportsTrivialCountOptimization() const override { return true; } private: diff --git a/src/Storages/StorageBuffer.h b/src/Storages/StorageBuffer.h index 6c15c7e0238..cd6dd7b933f 100644 --- a/src/Storages/StorageBuffer.h +++ b/src/Storages/StorageBuffer.h @@ -89,6 +89,8 @@ public: bool supportsSubcolumns() const override { return true; } + bool supportsDynamicSubcolumns() const override { return true; } + SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr context, bool /*async_insert*/) override; void startup() override; diff --git a/src/Storages/StorageDistributed.cpp b/src/Storages/StorageDistributed.cpp index 12c2ad331ad..5d499fb319b 100644 --- a/src/Storages/StorageDistributed.cpp +++ b/src/Storages/StorageDistributed.cpp 
@@ -712,7 +712,7 @@ static bool requiresObjectColumns(const ColumnsDescription & all_columns, ASTPtr auto name_in_storage = Nested::splitName(required_column).first; auto column_in_storage = all_columns.tryGetPhysical(name_in_storage); - if (column_in_storage && column_in_storage->type->hasDynamicSubcolumns()) + if (column_in_storage && column_in_storage->type->hasDynamicSubcolumnsDeprecated()) return true; } diff --git a/src/Storages/StorageDistributed.h b/src/Storages/StorageDistributed.h index 3a7e63aef50..85a8de86953 100644 --- a/src/Storages/StorageDistributed.h +++ b/src/Storages/StorageDistributed.h @@ -85,6 +85,7 @@ public: bool supportsFinal() const override { return true; } bool supportsPrewhere() const override { return true; } bool supportsSubcolumns() const override { return true; } + bool supportsDynamicSubcolumnsDeprecated() const override { return true; } bool supportsDynamicSubcolumns() const override { return true; } StoragePolicyPtr getStoragePolicy() const override; diff --git a/src/Storages/StorageDummy.h b/src/Storages/StorageDummy.h index e9d8f90f755..a07a5600870 100644 --- a/src/Storages/StorageDummy.h +++ b/src/Storages/StorageDummy.h @@ -20,6 +20,7 @@ public: bool supportsFinal() const override { return true; } bool supportsPrewhere() const override { return true; } bool supportsSubcolumns() const override { return true; } + bool supportsDynamicSubcolumnsDeprecated() const override { return true; } bool supportsDynamicSubcolumns() const override { return true; } bool canMoveConditionsToPrewhere() const override { diff --git a/src/Storages/StorageFile.h b/src/Storages/StorageFile.h index 93c263008a6..566c407a798 100644 --- a/src/Storages/StorageFile.h +++ b/src/Storages/StorageFile.h @@ -90,6 +90,8 @@ public: bool supportsSubcolumns() const override { return true; } + bool supportsDynamicSubcolumns() const override { return true; } + bool prefersLargeBlocks() const override; bool parallelizeOutputAfterReading(ContextPtr context) const override; diff --git a/src/Storages/StorageFileCluster.h b/src/Storages/StorageFileCluster.h index 3acbc71ba7e..b8bb3fd5ea1 100644 --- a/src/Storages/StorageFileCluster.h +++ b/src/Storages/StorageFileCluster.h @@ -32,6 +32,8 @@ public: bool supportsSubcolumns() const override { return true; } + bool supportsDynamicSubcolumns() const override { return true; } + bool supportsTrivialCountOptimization() const override { return true; } private: diff --git a/src/Storages/StorageInMemoryMetadata.cpp b/src/Storages/StorageInMemoryMetadata.cpp index 1ac739f03fd..fcd14fb8ec1 100644 --- a/src/Storages/StorageInMemoryMetadata.cpp +++ b/src/Storages/StorageInMemoryMetadata.cpp @@ -628,7 +628,7 @@ void StorageInMemoryMetadata::check(const NamesAndTypesList & provided_columns) const auto * available_type = it->getMapped(); - if (!available_type->hasDynamicSubcolumns() + if (!available_type->hasDynamicSubcolumnsDeprecated() && !column.type->equals(*available_type) && !isCompatibleEnumTypes(available_type, column.type.get())) throw Exception( @@ -676,7 +676,7 @@ void StorageInMemoryMetadata::check(const NamesAndTypesList & provided_columns, const auto * provided_column_type = it->getMapped(); const auto * available_column_type = jt->getMapped(); - if (!provided_column_type->hasDynamicSubcolumns() + if (!provided_column_type->hasDynamicSubcolumnsDeprecated() && !provided_column_type->equals(*available_column_type) && !isCompatibleEnumTypes(available_column_type, provided_column_type)) throw Exception( @@ -720,7 +720,7 @@ void 
StorageInMemoryMetadata::check(const Block & block, bool need_all) const listOfColumns(available_columns)); const auto * available_type = it->getMapped(); - if (!available_type->hasDynamicSubcolumns() + if (!available_type->hasDynamicSubcolumnsDeprecated() && !column.type->equals(*available_type) && !isCompatibleEnumTypes(available_type, column.type.get())) throw Exception( diff --git a/src/Storages/StorageLog.cpp b/src/Storages/StorageLog.cpp index 549cfca1b6c..7f09236454c 100644 --- a/src/Storages/StorageLog.cpp +++ b/src/Storages/StorageLog.cpp @@ -252,7 +252,7 @@ void LogSource::readData(const NameAndTypePair & name_and_type, ColumnPtr & colu if (!deserialize_states.contains(name)) { settings.getter = create_stream_getter(true); - serialization->deserializeBinaryBulkStatePrefix(settings, deserialize_states[name]); + serialization->deserializeBinaryBulkStatePrefix(settings, deserialize_states[name], nullptr); } settings.getter = create_stream_getter(false); diff --git a/src/Storages/StorageMaterializedView.h b/src/Storages/StorageMaterializedView.h index 198b7a642ee..0d906a933f7 100644 --- a/src/Storages/StorageMaterializedView.h +++ b/src/Storages/StorageMaterializedView.h @@ -32,6 +32,7 @@ public: bool supportsFinal() const override { return getTargetTable()->supportsFinal(); } bool supportsParallelInsert() const override { return getTargetTable()->supportsParallelInsert(); } bool supportsSubcolumns() const override { return getTargetTable()->supportsSubcolumns(); } + bool supportsDynamicSubcolumns() const override { return getTargetTable()->supportsDynamicSubcolumns(); } bool supportsTransactions() const override { return getTargetTable()->supportsTransactions(); } SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr context, bool async_insert) override; diff --git a/src/Storages/StorageMemory.h b/src/Storages/StorageMemory.h index 13f1c971d82..ef422a6c872 100644 --- a/src/Storages/StorageMemory.h +++ b/src/Storages/StorageMemory.h @@ -58,6 +58,7 @@ public: bool supportsParallelInsert() const override { return true; } bool supportsSubcolumns() const override { return true; } + bool supportsDynamicSubcolumnsDeprecated() const override { return true; } bool supportsDynamicSubcolumns() const override { return true; } /// Smaller blocks (e.g. 64K rows) are better for CPU cache. 
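Most of the storages touched above and below only need to advertise the new capability: once supportsDynamicSubcolumns() returns true, a Dynamic column can be used outside MergeTree and is read through the same subcolumn resolution. A minimal sketch with the Memory engine, which the tests in this series also use (assuming the experimental Dynamic type; the table name and the d.Int64 subcolumn access are illustrative):

set allow_experimental_dynamic_type = 1;
create table t_mem (d Dynamic) engine=Memory;
insert into t_mem values (42), ('str'), ([1, 2, 3]), (NULL);
-- dynamicType reports the type stored in each row; subcolumn reads behave the
-- same way as for MergeTree tables.
select d, dynamicType(d), d.Int64 from t_mem;
drop table t_mem;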
diff --git a/src/Storages/StorageMerge.h b/src/Storages/StorageMerge.h index c049d50f3b4..b08bef0a143 100644 --- a/src/Storages/StorageMerge.h +++ b/src/Storages/StorageMerge.h @@ -49,6 +49,7 @@ public: bool supportsSampling() const override { return true; } bool supportsFinal() const override { return true; } bool supportsSubcolumns() const override { return true; } + bool supportsDynamicSubcolumns() const override { return true; } bool supportsPrewhere() const override { return tableSupportsPrewhere(); } std::optional supportedPrewhereColumns() const override; diff --git a/src/Storages/StorageNull.h b/src/Storages/StorageNull.h index f7ee936db8d..74abf931f8f 100644 --- a/src/Storages/StorageNull.h +++ b/src/Storages/StorageNull.h @@ -48,6 +48,8 @@ public: bool supportsSubcolumns() const override { return true; } + bool supportsDynamicSubcolumns() const override { return true; } + SinkToStoragePtr write(const ASTPtr &, const StorageMetadataPtr & metadata_snapshot, ContextPtr, bool) override { return std::make_shared(metadata_snapshot->getSampleBlock()); diff --git a/src/Storages/StorageS3.h b/src/Storages/StorageS3.h index d1f15edfd6d..3a20872bbe4 100644 --- a/src/Storages/StorageS3.h +++ b/src/Storages/StorageS3.h @@ -385,6 +385,8 @@ private: bool supportsSubcolumns() const override { return true; } + bool supportsDynamicSubcolumns() const override { return true; } + bool supportsSubsetOfColumns(const ContextPtr & context) const; bool prefersLargeBlocks() const override; diff --git a/src/Storages/StorageS3Cluster.h b/src/Storages/StorageS3Cluster.h index 6a5b03e682f..3ec84b363fb 100644 --- a/src/Storages/StorageS3Cluster.h +++ b/src/Storages/StorageS3Cluster.h @@ -32,6 +32,8 @@ public: bool supportsSubcolumns() const override { return true; } + bool supportsDynamicSubcolumns() const override { return true; } + bool supportsTrivialCountOptimization() const override { return true; } protected: diff --git a/src/Storages/StorageSnapshot.cpp b/src/Storages/StorageSnapshot.cpp index 8b087a4a2bc..aada25168f8 100644 --- a/src/Storages/StorageSnapshot.cpp +++ b/src/Storages/StorageSnapshot.cpp @@ -115,7 +115,7 @@ std::optional StorageSnapshot::tryGetColumn(const GetColumnsOpt { const auto & columns = getMetadataForQuery()->getColumns(); auto column = columns.tryGetColumn(options, column_name); - if (column && (!column->type->hasDynamicSubcolumns() || !options.with_extended_objects)) + if (column && (!column->type->hasDynamicSubcolumnsDeprecated() || !options.with_extended_objects)) return column; if (options.with_extended_objects) diff --git a/src/Storages/StorageURL.h b/src/Storages/StorageURL.h index 842cfd5b627..3fd7a7f097f 100644 --- a/src/Storages/StorageURL.h +++ b/src/Storages/StorageURL.h @@ -295,6 +295,8 @@ public: bool supportsSubcolumns() const override { return true; } + bool supportsDynamicSubcolumns() const override { return true; } + static FormatSettings getFormatSettingsFromArgs(const StorageFactory::Arguments & args); struct Configuration : public StatelessTableEngineConfiguration diff --git a/src/Storages/StorageURLCluster.h b/src/Storages/StorageURLCluster.h index dce2e0106ea..ad8113517c5 100644 --- a/src/Storages/StorageURLCluster.h +++ b/src/Storages/StorageURLCluster.h @@ -35,6 +35,8 @@ public: bool supportsSubcolumns() const override { return true; } + bool supportsDynamicSubcolumns() const override { return true; } + bool supportsTrivialCountOptimization() const override { return true; } private: diff --git a/src/Storages/getStructureOfRemoteTable.cpp 
b/src/Storages/getStructureOfRemoteTable.cpp index 26e953c0578..6ea7bdc312d 100644 --- a/src/Storages/getStructureOfRemoteTable.cpp +++ b/src/Storages/getStructureOfRemoteTable.cpp @@ -210,7 +210,7 @@ ColumnsDescriptionByShardNum getExtendedObjectsOfRemoteTables( auto type_name = type_col[i].get(); auto storage_column = storage_columns.tryGetPhysical(name); - if (storage_column && storage_column->type->hasDynamicSubcolumns()) + if (storage_column && storage_column->type->hasDynamicSubcolumnsDeprecated()) res.add(ColumnDescription(std::move(name), DataTypeFactory::instance().get(type_name))); } } diff --git a/tests/queries/0_stateless/02943_variant_read_subcolumns.sh b/tests/queries/0_stateless/02943_variant_read_subcolumns.sh index b816a20c818..6bbd127d933 100755 --- a/tests/queries/0_stateless/02943_variant_read_subcolumns.sh +++ b/tests/queries/0_stateless/02943_variant_read_subcolumns.sh @@ -7,7 +7,7 @@ CLICKHOUSE_LOG_COMMENT= # shellcheck source=../shell_config.sh . "$CUR_DIR"/../shell_config.sh -CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_variant_type=1 --use_variant_as_common_type=1 --allow_suspicious_variant_types=1" +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_variant_type=1 --use_variant_as_common_type=1 --allow_suspicious_variant_types=1 --max_insert_threads 4 --group_by_two_level_threshold 752249 --group_by_two_level_threshold_bytes 15083870 --distributed_aggregation_memory_efficient 1 --fsync_metadata 1 --output_format_parallel_formatting 0 --input_format_parallel_parsing 0 --min_chunk_bytes_for_parallel_parsing 6583861 --max_read_buffer_size 640584 --prefer_localhost_replica 1 --max_block_size 38844 --max_threads 48 --optimize_append_index 0 --optimize_if_chain_to_multiif 1 --optimize_if_transform_strings_to_enum 0 --optimize_read_in_order 1 --optimize_or_like_chain 0 --optimize_substitute_columns 1 --enable_multiple_prewhere_read_steps 1 --read_in_order_two_level_merge_threshold 4 --optimize_aggregation_in_order 0 --aggregation_in_order_max_block_bytes 18284646 --use_uncompressed_cache 1 --min_bytes_to_use_direct_io 10737418240 --min_bytes_to_use_mmap_io 10737418240 --local_filesystem_read_method pread --remote_filesystem_read_method read --local_filesystem_read_prefetch 1 --filesystem_cache_segments_batch_size 0 --read_from_filesystem_cache_if_exists_otherwise_bypass_cache 0 --throw_on_error_from_cache_on_write_operations 1 --remote_filesystem_read_prefetch 0 --allow_prefetched_read_pool_for_remote_filesystem 0 --filesystem_prefetch_max_memory_usage 128Mi --filesystem_prefetches_limit 0 --filesystem_prefetch_min_bytes_for_single_read_task 16Mi --filesystem_prefetch_step_marks 50 --filesystem_prefetch_step_bytes 0 --compile_aggregate_expressions 1 --compile_sort_description 0 --merge_tree_coarse_index_granularity 31 --optimize_distinct_in_order 1 --max_bytes_before_external_sort 1 --max_bytes_before_external_group_by 1 --max_bytes_before_remerge_sort 2640239625 --min_compress_block_size 3114155 --max_compress_block_size 226550 --merge_tree_compact_parts_min_granules_to_multibuffer_read 118 --optimize_sorting_by_input_stream_properties 0 --http_response_buffer_size 543038 --http_wait_end_of_query False --enable_memory_bound_merging_of_aggregation_results 1 --min_count_to_compile_expression 3 --min_count_to_compile_aggregate_expression 3 --min_count_to_compile_sort_description 0 --session_timezone America/Mazatlan --prefer_warmed_unmerged_parts_seconds 8 --use_page_cache_for_disks_without_file_cache False --page_cache_inject_eviction True 
--merge_tree_read_split_ranges_into_intersecting_and_non_intersecting_injection_probability 0.82 " function test() diff --git a/tests/queries/0_stateless/03033_dynamic_text_serialization.reference b/tests/queries/0_stateless/03033_dynamic_text_serialization.reference new file mode 100644 index 00000000000..d965245266c --- /dev/null +++ b/tests/queries/0_stateless/03033_dynamic_text_serialization.reference @@ -0,0 +1,55 @@ +JSON +{"d":"42","dynamicType(d)":"Int64"} +{"d":42.42,"dynamicType(d)":"Float64"} +{"d":"str","dynamicType(d)":"String"} +{"d":["1","2","3"],"dynamicType(d)":"Array(Int64)"} +{"d":"2020-01-01","dynamicType(d)":"Date"} +{"d":"2020-01-01 10:00:00.000000000","dynamicType(d)":"DateTime64(9)"} +{"d":{"a":"42","b":"str"},"dynamicType(d)":"Tuple(a Int64, b String)"} +{"d":{"a":"43"},"dynamicType(d)":"Tuple(a Int64)"} +{"d":{"a":"44","c":["1","2","3"]},"dynamicType(d)":"Tuple(a Int64, c Array(Int64))"} +{"d":["1","str",["1","2","3"]],"dynamicType(d)":"Tuple(Int64, String, Array(Int64))"} +{"d":null,"dynamicType(d)":"None"} +{"d":true,"dynamicType(d)":"Bool"} +{"d":"42","dynamicType(d)":"Int64"} +{"d":"42.42","dynamicType(d)":"String"} +{"d":"str","dynamicType(d)":"String"} +{"d":null,"dynamicType(d)":"None"} +{"d":"1","dynamicType(d)":"Int64"} +CSV +42,"Int64" +42.42,"Float64" +"str","String" +"[1,2,3]","Array(Int64)" +"2020-01-01","Date" +"2020-01-01 10:00:00.000000000","DateTime64(9)" +"[1, 'str', [1, 2, 3]]","String" +\N,"None" +true,"Bool" +TSV +42 Int64 +42.42 Float64 +str String +[1,2,3] Array(Int64) +2020-01-01 Date +2020-01-01 10:00:00.000000000 DateTime64(9) +[1, \'str\', [1, 2, 3]] String +\N None +true Bool +Values +(42,'Int64'),(42.42,'Float64'),('str','String'),([1,2,3],'Array(Int64)'),('2020-01-01','Date'),('2020-01-01 10:00:00.000000000','DateTime64(9)'),(NULL,'None'),(true,'Bool') +Cast using parsing +42 Int64 +42.42 Float64 +[1,2,3] Array(Int64) +2020-01-01 Date +2020-01-01 10:00:00.000000000 DateTime64(9) +\N None +true Bool +42 Int64 +42.42 Float64 +[1, 2, 3] String +2020-01-01 String +2020-01-01 10:00:00 String +\N None +true String diff --git a/tests/queries/0_stateless/03033_dynamic_text_serialization.sql b/tests/queries/0_stateless/03033_dynamic_text_serialization.sql new file mode 100644 index 00000000000..d12d110fe28 --- /dev/null +++ b/tests/queries/0_stateless/03033_dynamic_text_serialization.sql @@ -0,0 +1,74 @@ +set allow_experimental_dynamic_type = 1; + +select 'JSON'; +select d, dynamicType(d) from format(JSONEachRow, 'd Dynamic', $$ +{"d" : 42} +{"d" : 42.42} +{"d" : "str"} +{"d" : [1, 2, 3]} +{"d" : "2020-01-01"} +{"d" : "2020-01-01 10:00:00"} +{"d" : {"a" : 42, "b" : "str"}} +{"d" : {"a" : 43}} +{"d" : {"a" : 44, "c" : [1, 2, 3]}} +{"d" : [1, "str", [1, 2, 3]]} +{"d" : null} +{"d" : true} +$$) format JSONEachRow; + +select d, dynamicType(d) from format(JSONEachRow, 'd Dynamic(max_types=2)', $$ +{"d" : 42} +{"d" : 42.42} +{"d" : "str"} +{"d" : null} +{"d" : true} +$$) format JSONEachRow; + +select 'CSV'; +select d, dynamicType(d) from format(CSV, 'd Dynamic', +$$42 +42.42 +"str" +"[1, 2, 3]" +"2020-01-01" +"2020-01-01 10:00:00" +"[1, 'str', [1, 2, 3]]" +\N +true +$$) format CSV; + +select 'TSV'; +select d, dynamicType(d) from format(TSV, 'd Dynamic', +$$42 +42.42 +str +[1, 2, 3] +2020-01-01 +2020-01-01 10:00:00 +[1, 'str', [1, 2, 3]] +\N +true +$$) format TSV; + +select 'Values'; +select d, dynamicType(d) from format(Values, 'd Dynamic', $$ +(42) +(42.42) +('str') +([1, 2, 3]) +('2020-01-01') +('2020-01-01 10:00:00') +(NULL) +(true) +$$) format 
Values; +select ''; + +select 'Cast using parsing'; +drop table if exists test; +create table test (s String) engine=Memory; +insert into test values ('42'), ('42.42'), ('[1, 2, 3]'), ('2020-01-01'), ('2020-01-01 10:00:00'), ('NULL'), ('true'); +set cast_string_to_dynamic_use_inference=1; +select s::Dynamic as d, dynamicType(d) from test; +select s::Dynamic(max_types=3) as d, dynamicType(d) from test; +drop table test; + diff --git a/tests/queries/0_stateless/03034_dynamic_conversions.reference b/tests/queries/0_stateless/03034_dynamic_conversions.reference new file mode 100644 index 00000000000..af91add9ddd --- /dev/null +++ b/tests/queries/0_stateless/03034_dynamic_conversions.reference @@ -0,0 +1,63 @@ +0 UInt64 +1 UInt64 +2 UInt64 +0 String +1 String +2 String +0 +1 +2 +0 +1 +2 +1970-01-01 +1970-01-02 +1970-01-03 +0 UInt64 +1 UInt64 +2 UInt64 +0 UInt64 +\N None +2 UInt64 +0 UInt64 +str_1 String +[0,1] Array(UInt64) +\N None +4 UInt64 +str_5 String +0 String +str_1 String +[0,1] String +\N None +4 String +str_5 String +0 UInt64 +str_1 String +[0,1] String +\N None +4 UInt64 +str_5 String +0 UInt64 +str_1 String +[0,1] Array(UInt64) +\N None +4 UInt64 +str_5 String +0 +1 +2 +0 +1 +2 +0 UInt64 +str_1 String +[0,1] String +\N None +4 UInt64 +str_5 String +0 UInt64 +1970-01-02 Date +[0,1] String +\N None +4 UInt64 +1970-01-06 Date diff --git a/tests/queries/0_stateless/03034_dynamic_conversions.sql b/tests/queries/0_stateless/03034_dynamic_conversions.sql new file mode 100644 index 00000000000..e9b4944f5d8 --- /dev/null +++ b/tests/queries/0_stateless/03034_dynamic_conversions.sql @@ -0,0 +1,24 @@ +set allow_experimental_dynamic_type=1; +set allow_experimental_variant_type=1; +set use_variant_as_common_type=1; + +select number::Dynamic as d, dynamicType(d) from numbers(3); +select number::Dynamic(max_types=1) as d, dynamicType(d) from numbers(3); +select number::Dynamic::UInt64 as v from numbers(3); +select number::Dynamic::String as v from numbers(3); +select number::Dynamic::Date as v from numbers(3); +select number::Dynamic::Array(UInt64) as v from numbers(3); -- {serverError TYPE_MISMATCH} +select number::Dynamic::Variant(UInt64, String) as v, variantType(v) from numbers(3); +select (number % 2 ? 
NULL : number)::Dynamic as d, dynamicType(d) from numbers(3); + +select multiIf(number % 4 == 0, number, number % 4 == 1, 'str_' || toString(number), number % 4 == 2, range(number), NULL)::Dynamic as d, dynamicType(d) from numbers(6); +select multiIf(number % 4 == 0, number, number % 4 == 1, 'str_' || toString(number), number % 4 == 2, range(number), NULL)::Dynamic(max_types=1) as d, dynamicType(d) from numbers(6); +select multiIf(number % 4 == 0, number, number % 4 == 1, 'str_' || toString(number), number % 4 == 2, range(number), NULL)::Dynamic(max_types=2) as d, dynamicType(d) from numbers(6); +select multiIf(number % 4 == 0, number, number % 4 == 1, 'str_' || toString(number), number % 4 == 2, range(number), NULL)::Dynamic(max_types=3) as d, dynamicType(d) from numbers(6); + +select number::Dynamic(max_types=2)::Dynamic(max_types=3) as d from numbers(3); +select number::Dynamic(max_types=2)::Dynamic(max_types=1) as d from numbers(3); +select multiIf(number % 4 == 0, number, number % 4 == 1, 'str_' || toString(number), number % 4 == 2, range(number), NULL)::Dynamic(max_types=3)::Dynamic(max_types=2) as d, dynamicType(d) from numbers(6); +select multiIf(number % 4 == 0, number, number % 4 == 1, toDate(number), number % 4 == 2, range(number), NULL)::Dynamic(max_types=4)::Dynamic(max_types=3) as d, dynamicType(d) from numbers(6); + + diff --git a/tests/queries/0_stateless/03035_dynamic_sorting.reference b/tests/queries/0_stateless/03035_dynamic_sorting.reference new file mode 100644 index 00000000000..9b8df11c7a9 --- /dev/null +++ b/tests/queries/0_stateless/03035_dynamic_sorting.reference @@ -0,0 +1,299 @@ +order by d1 nulls first +\N None +\N None +\N None +\N None +[1,2,3] Array(Int64) +[1,2,3] Array(Int64) +[1,2,3] Array(Int64) +[1,2,3] Array(Int64) +[1,2,3] Array(Int64) +[1,2,4] Array(Int64) +42 Int64 +42 Int64 +42 Int64 +42 Int64 +42 Int64 +43 Int64 +abc String +abc String +abc String +abc String +abc String +abd String +order by d1 nulls last +[1,2,3] Array(Int64) +[1,2,3] Array(Int64) +[1,2,3] Array(Int64) +[1,2,3] Array(Int64) +[1,2,3] Array(Int64) +[1,2,4] Array(Int64) +42 Int64 +42 Int64 +42 Int64 +42 Int64 +42 Int64 +43 Int64 +abc String +abc String +abc String +abc String +abc String +abd String +\N None +\N None +\N None +\N None +order by d2 nulls first +\N None +\N None +\N None +\N None +[1,2,3] Array(Int64) +[1,2,3] Array(Int64) +[1,2,3] Array(Int64) +[1,2,3] Array(Int64) +[1,2,3] Array(Int64) +[1,2,4] Array(Int64) +42 Int64 +42 Int64 +42 Int64 +42 Int64 +42 Int64 +43 Int64 +abc String +abc String +abc String +abc String +abc String +abd String +order by d2 nulls last +[1,2,3] Array(Int64) +[1,2,3] Array(Int64) +[1,2,3] Array(Int64) +[1,2,3] Array(Int64) +[1,2,3] Array(Int64) +[1,2,4] Array(Int64) +42 Int64 +42 Int64 +42 Int64 +42 Int64 +42 Int64 +43 Int64 +abc String +abc String +abc String +abc String +abc String +abd String +\N None +\N None +\N None +\N None +order by d1, d2 nulls first +[1,2,3] \N Array(Int64) None +[1,2,3] [1,2,3] Array(Int64) Array(Int64) +[1,2,3] [1,2,4] Array(Int64) Array(Int64) +[1,2,3] 42 Array(Int64) Int64 +[1,2,3] abc Array(Int64) String +[1,2,4] [1,2,3] Array(Int64) Array(Int64) +42 \N Int64 None +42 [1,2,3] Int64 Array(Int64) +42 42 Int64 Int64 +42 43 Int64 Int64 +42 abc Int64 String +43 42 Int64 Int64 +abc \N String None +abc [1,2,3] String Array(Int64) +abc 42 String Int64 +abc abc String String +abc abd String String +abd abc String String +\N \N None None +\N [1,2,3] None Array(Int64) +\N 42 None Int64 +\N abc None String +order by d1, 
d2 nulls last +[1,2,3] [1,2,3] Array(Int64) Array(Int64) +[1,2,3] [1,2,4] Array(Int64) Array(Int64) +[1,2,3] 42 Array(Int64) Int64 +[1,2,3] abc Array(Int64) String +[1,2,3] \N Array(Int64) None +[1,2,4] [1,2,3] Array(Int64) Array(Int64) +42 [1,2,3] Int64 Array(Int64) +42 42 Int64 Int64 +42 43 Int64 Int64 +42 abc Int64 String +42 \N Int64 None +43 42 Int64 Int64 +abc [1,2,3] String Array(Int64) +abc 42 String Int64 +abc abc String String +abc abd String String +abc \N String None +abd abc String String +\N [1,2,3] None Array(Int64) +\N 42 None Int64 +\N abc None String +\N \N None None +order by d2, d1 nulls first +\N [1,2,3] None Array(Int64) +[1,2,3] [1,2,3] Array(Int64) Array(Int64) +[1,2,4] [1,2,3] Array(Int64) Array(Int64) +42 [1,2,3] Int64 Array(Int64) +abc [1,2,3] String Array(Int64) +[1,2,3] [1,2,4] Array(Int64) Array(Int64) +\N 42 None Int64 +[1,2,3] 42 Array(Int64) Int64 +42 42 Int64 Int64 +43 42 Int64 Int64 +abc 42 String Int64 +42 43 Int64 Int64 +\N abc None String +[1,2,3] abc Array(Int64) String +42 abc Int64 String +abc abc String String +abd abc String String +abc abd String String +\N \N None None +[1,2,3] \N Array(Int64) None +42 \N Int64 None +abc \N String None +order by d2, d1 nulls last +[1,2,3] [1,2,3] Array(Int64) Array(Int64) +[1,2,4] [1,2,3] Array(Int64) Array(Int64) +42 [1,2,3] Int64 Array(Int64) +abc [1,2,3] String Array(Int64) +\N [1,2,3] None Array(Int64) +[1,2,3] [1,2,4] Array(Int64) Array(Int64) +[1,2,3] 42 Array(Int64) Int64 +42 42 Int64 Int64 +43 42 Int64 Int64 +abc 42 String Int64 +\N 42 None Int64 +42 43 Int64 Int64 +[1,2,3] abc Array(Int64) String +42 abc Int64 String +abc abc String String +abd abc String String +\N abc None String +abc abd String String +[1,2,3] \N Array(Int64) None +42 \N Int64 None +abc \N String None +\N \N None None +d1 = d2 +[1,2,3] [1,2,3] 1 Array(Int64) Array(Int64) +[1,2,3] [1,2,4] 0 Array(Int64) Array(Int64) +[1,2,3] 42 0 Array(Int64) Int64 +[1,2,3] abc 0 Array(Int64) String +[1,2,3] \N 0 Array(Int64) None +[1,2,4] [1,2,3] 0 Array(Int64) Array(Int64) +42 [1,2,3] 0 Int64 Array(Int64) +42 42 1 Int64 Int64 +42 43 0 Int64 Int64 +42 abc 0 Int64 String +42 \N 0 Int64 None +43 42 0 Int64 Int64 +abc [1,2,3] 0 String Array(Int64) +abc 42 0 String Int64 +abc abc 1 String String +abc abd 0 String String +abc \N 0 String None +abd abc 0 String String +\N [1,2,3] 0 None Array(Int64) +\N 42 0 None Int64 +\N abc 0 None String +\N \N 1 None None +d1 < d2 +[1,2,3] [1,2,3] 0 Array(Int64) Array(Int64) +[1,2,3] [1,2,4] 1 Array(Int64) Array(Int64) +[1,2,3] 42 1 Array(Int64) Int64 +[1,2,3] abc 1 Array(Int64) String +[1,2,3] \N 1 Array(Int64) None +[1,2,4] [1,2,3] 0 Array(Int64) Array(Int64) +42 [1,2,3] 0 Int64 Array(Int64) +42 42 0 Int64 Int64 +42 43 1 Int64 Int64 +42 abc 1 Int64 String +42 \N 1 Int64 None +43 42 0 Int64 Int64 +abc [1,2,3] 0 String Array(Int64) +abc 42 0 String Int64 +abc abc 0 String String +abc abd 1 String String +abc \N 1 String None +abd abc 0 String String +\N [1,2,3] 0 None Array(Int64) +\N 42 0 None Int64 +\N abc 0 None String +\N \N 0 None None +d1 <= d2 +[1,2,3] [1,2,3] 1 Array(Int64) Array(Int64) +[1,2,3] [1,2,4] 1 Array(Int64) Array(Int64) +[1,2,3] 42 1 Array(Int64) Int64 +[1,2,3] abc 1 Array(Int64) String +[1,2,3] \N 1 Array(Int64) None +[1,2,4] [1,2,3] 0 Array(Int64) Array(Int64) +42 [1,2,3] 0 Int64 Array(Int64) +42 42 1 Int64 Int64 +42 43 1 Int64 Int64 +42 abc 1 Int64 String +42 \N 1 Int64 None +43 42 0 Int64 Int64 +abc [1,2,3] 0 String Array(Int64) +abc 42 0 String Int64 +abc abc 1 String String +abc abd 1 String 
String +abc \N 1 String None +abd abc 0 String String +\N [1,2,3] 0 None Array(Int64) +\N 42 0 None Int64 +\N abc 0 None String +\N \N 1 None None +d1 > d2 +[1,2,3] [1,2,3] 0 Array(Int64) Array(Int64) +[1,2,3] [1,2,4] 0 Array(Int64) Array(Int64) +[1,2,3] 42 0 Array(Int64) Int64 +[1,2,3] abc 0 Array(Int64) String +[1,2,3] \N 0 Array(Int64) None +[1,2,4] [1,2,3] 1 Array(Int64) Array(Int64) +42 [1,2,3] 1 Int64 Array(Int64) +42 42 0 Int64 Int64 +42 43 0 Int64 Int64 +42 abc 0 Int64 String +42 \N 0 Int64 None +43 42 1 Int64 Int64 +abc [1,2,3] 1 String Array(Int64) +abc 42 1 String Int64 +abc abc 0 String String +abc abd 0 String String +abc \N 0 String None +abd abc 1 String String +\N [1,2,3] 1 None Array(Int64) +\N 42 1 None Int64 +\N abc 1 None String +\N \N 0 None None +d1 >= d2 +[1,2,3] [1,2,3] 1 Array(Int64) Array(Int64) +[1,2,3] [1,2,4] 1 Array(Int64) Array(Int64) +[1,2,3] 42 1 Array(Int64) Int64 +[1,2,3] abc 1 Array(Int64) String +[1,2,3] \N 1 Array(Int64) None +[1,2,4] [1,2,3] 1 Array(Int64) Array(Int64) +42 [1,2,3] 1 Int64 Array(Int64) +42 42 1 Int64 Int64 +42 43 1 Int64 Int64 +42 abc 1 Int64 String +42 \N 1 Int64 None +43 42 1 Int64 Int64 +abc [1,2,3] 1 String Array(Int64) +abc 42 1 String Int64 +abc abc 1 String String +abc abd 1 String String +abc \N 1 String None +abd abc 1 String String +\N [1,2,3] 1 None Array(Int64) +\N 42 1 None Int64 +\N abc 1 None String +\N \N 1 None None diff --git a/tests/queries/0_stateless/03035_dynamic_sorting.sql b/tests/queries/0_stateless/03035_dynamic_sorting.sql new file mode 100644 index 00000000000..0487fafc955 --- /dev/null +++ b/tests/queries/0_stateless/03035_dynamic_sorting.sql @@ -0,0 +1,80 @@ +set allow_experimental_dynamic_type = 1; + +drop table if exists test; +create table test (d1 Dynamic, d2 Dynamic) engine=Memory; + +insert into test values (42, 42); +insert into test values (42, 43); +insert into test values (43, 42); + +insert into test values ('abc', 'abc'); +insert into test values ('abc', 'abd'); +insert into test values ('abd', 'abc'); + +insert into test values ([1,2,3], [1,2,3]); +insert into test values ([1,2,3], [1,2,4]); +insert into test values ([1,2,4], [1,2,3]); + +insert into test values (NULL, NULL); + +insert into test values (42, 'abc'); +insert into test values ('abc', 42); + +insert into test values (42, [1,2,3]); +insert into test values ([1,2,3], 42); + +insert into test values (42, NULL); +insert into test values (NULL, 42); + +insert into test values ('abc', [1,2,3]); +insert into test values ([1,2,3], 'abc'); + +insert into test values ('abc', NULL); +insert into test values (NULL, 'abc'); + +insert into test values ([1,2,3], NULL); +insert into test values (NULL, [1,2,3]); + + +select 'order by d1 nulls first'; +select d1, dynamicType(d1) from test order by d1 nulls first; + +select 'order by d1 nulls last'; +select d1, dynamicType(d1) from test order by d1 nulls last; + +select 'order by d2 nulls first'; +select d2, dynamicType(d2) from test order by d2 nulls first; + +select 'order by d2 nulls last'; +select d2, dynamicType(d2) from test order by d2 nulls last; + + +select 'order by d1, d2 nulls first'; +select d1, d2, dynamicType(d1), dynamicType(d2) from test order by d1, d2 nulls first; + +select 'order by d1, d2 nulls last'; +select d1, d2, dynamicType(d1), dynamicType(d2) from test order by d1, d2 nulls last; + +select 'order by d2, d1 nulls first'; +select d1, d2, dynamicType(d1), dynamicType(d2) from test order by d2, d1 nulls first; + +select 'order by d2, d1 nulls last'; +select d1, d2, 
dynamicType(d1), dynamicType(d2) from test order by d2, d1 nulls last; + +select 'd1 = d2'; +select d1, d2, d1 = d2, dynamicType(d1), dynamicType(d2) from test order by d1, d2; + +select 'd1 < d2'; +select d1, d2, d1 < d2, dynamicType(d1), dynamicType(d2) from test order by d1, d2; + +select 'd1 <= d2'; +select d1, d2, d1 <= d2, dynamicType(d1), dynamicType(d2) from test order by d1, d2; + +select 'd1 > d2'; +select d1, d2, d1 > d2, dynamicType(d1), dynamicType(d2) from test order by d1, d2; + +select 'd1 >= d2'; +select d1, d2, d2 >= d2, dynamicType(d1), dynamicType(d2) from test order by d1, d2; + +drop table test; + diff --git a/tests/queries/0_stateless/03036_dynamic_read_subcolumns.reference b/tests/queries/0_stateless/03036_dynamic_read_subcolumns.reference new file mode 100644 index 00000000000..36984bc8b9b --- /dev/null +++ b/tests/queries/0_stateless/03036_dynamic_read_subcolumns.reference @@ -0,0 +1,57 @@ +Memory +test +Array(Array(Dynamic)) +Array(Variant(String, UInt64)) +None +String +UInt64 +200000 +200000 +200000 +200000 +0 +0 +200000 +200000 +100000 +100000 +200000 +0 +MergeTree compact +test +Array(Array(Dynamic)) +Array(Variant(String, UInt64)) +None +String +UInt64 +200000 +200000 +200000 +200000 +0 +0 +200000 +200000 +100000 +100000 +200000 +0 +MergeTree wide +test +Array(Array(Dynamic)) +Array(Variant(String, UInt64)) +None +String +UInt64 +200000 +200000 +200000 +200000 +0 +0 +200000 +200000 +100000 +100000 +200000 +0 diff --git a/tests/queries/0_stateless/03036_dynamic_read_subcolumns.sh b/tests/queries/0_stateless/03036_dynamic_read_subcolumns.sh new file mode 100755 index 00000000000..65517061b99 --- /dev/null +++ b/tests/queries/0_stateless/03036_dynamic_read_subcolumns.sh @@ -0,0 +1,62 @@ +#!/usr/bin/env bash +# Tags: long + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# reset --log_comment +CLICKHOUSE_LOG_COMMENT= +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_variant_type=1 --use_variant_as_common_type=1 --allow_experimental_dynamic_type=1" + + +function test() +{ + echo "test" + $CH_CLIENT -q "insert into test select number, number from numbers(100000) settings min_insert_block_size_rows=50000" + $CH_CLIENT -q "insert into test select number, 'str_' || toString(number) from numbers(100000, 100000) settings min_insert_block_size_rows=50000" + $CH_CLIENT -q "insert into test select number, arrayMap(x -> multiIf(number % 9 == 0, NULL, number % 9 == 3, 'str_' || toString(number), number), range(number % 10 + 1)) from numbers(200000, 100000) settings min_insert_block_size_rows=50000" + $CH_CLIENT -q "insert into test select number, NULL from numbers(300000, 100000) settings min_insert_block_size_rows=50000" + $CH_CLIENT -q "insert into test select number, multiIf(number % 4 == 3, 'str_' || toString(number), number % 4 == 2, NULL, number % 4 == 1, number, arrayMap(x -> multiIf(number % 9 == 0, NULL, number % 9 == 3, 'str_' || toString(number), number), range(number % 10 + 1))) from numbers(400000, 400000) settings min_insert_block_size_rows=50000" + $CH_CLIENT -q "insert into test select number, [range((number % 10 + 1)::UInt64)]::Array(Array(Dynamic)) from numbers(100000, 100000) settings min_insert_block_size_rows=50000" + + $CH_CLIENT -q "select distinct dynamicType(d) as type from test order by type" + $CH_CLIENT -q "select count() from test where dynamicType(d) == 'UInt64'" + $CH_CLIENT -q "select count() from test where d.UInt64 is not NULL" + $CH_CLIENT -q "select count() from test where dynamicType(d) == 'String'" + $CH_CLIENT -q "select count() from test where d.String is not NULL" + $CH_CLIENT -q "select count() from test where dynamicType(d) == 'Date'" + $CH_CLIENT -q "select count() from test where d.Date is not NULL" + $CH_CLIENT -q "select count() from test where dynamicType(d) == 'Array(Variant(String, UInt64))'" + $CH_CLIENT -q "select count() from test where not empty(d.\`Array(Variant(String, UInt64))\`)" + $CH_CLIENT -q "select count() from test where dynamicType(d) == 'Array(Array(Dynamic))'" + $CH_CLIENT -q "select count() from test where not empty(d.\`Array(Array(Dynamic))\`)" + $CH_CLIENT -q "select count() from test where d is NULL" + $CH_CLIENT -q "select count() from test where not empty(d.\`Tuple(a Array(Dynamic))\`.a.String)" + + $CH_CLIENT -q "select d, d.UInt64, d.String, d.\`Array(Variant(String, UInt64))\` from test format Null" + $CH_CLIENT -q "select d.UInt64, d.String, d.\`Array(Variant(String, UInt64))\` from test format Null" + $CH_CLIENT -q "select d.Int8, d.Date, d.\`Array(String)\` from test format Null" + $CH_CLIENT -q "select d, d.UInt64, d.Date, d.\`Array(Variant(String, UInt64))\`, d.\`Array(Variant(String, UInt64))\`.size0, d.\`Array(Variant(String, UInt64))\`.UInt64 from test format Null" + $CH_CLIENT -q "select d.UInt64, d.Date, d.\`Array(Variant(String, UInt64))\`, d.\`Array(Variant(String, UInt64))\`.size0, d.\`Array(Variant(String, UInt64))\`.UInt64, d.\`Array(Variant(String, UInt64))\`.String from test format Null" + $CH_CLIENT -q "select d, d.\`Tuple(a UInt64, b String)\`.a, d.\`Array(Dynamic)\`.\`Variant(String, UInt64)\`.UInt64, d.\`Array(Variant(String, UInt64))\`.UInt64 from test format Null" + $CH_CLIENT -q "select d.\`Array(Dynamic)\`.\`Variant(String, UInt64)\`.UInt64, d.\`Array(Dynamic)\`.size0, d.\`Array(Variant(String, UInt64))\`.UInt64 from test format Null" + $CH_CLIENT -q "select 
d.\`Array(Array(Dynamic))\`.size1, d.\`Array(Array(Dynamic))\`.UInt64, d.\`Array(Array(Dynamic))\`.\`Map(String, Tuple(a UInt64))\`.values.a from test format Null" +} + +$CH_CLIENT -q "drop table if exists test;" + +echo "Memory" +$CH_CLIENT -q "create table test (id UInt64, d Dynamic) engine=Memory" +test +$CH_CLIENT -q "drop table test;" + +echo "MergeTree compact" +$CH_CLIENT -q "create table test (id UInt64, d Dynamic) engine=MergeTree order by id settings min_rows_for_wide_part=1000000000, min_bytes_for_wide_part=10000000000;" +test +$CH_CLIENT -q "drop table test;" + +echo "MergeTree wide" +$CH_CLIENT -q "create table test (id UInt64, d Dynamic) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1;" +test +$CH_CLIENT -q "drop table test;" diff --git a/tests/queries/0_stateless/03037_dynamic_merges_1.reference b/tests/queries/0_stateless/03037_dynamic_merges_1.reference new file mode 100644 index 00000000000..fff812f0396 --- /dev/null +++ b/tests/queries/0_stateless/03037_dynamic_merges_1.reference @@ -0,0 +1,120 @@ +MergeTree compact + horizontal merge +test1 +50000 DateTime +60000 Date +70000 Array(UInt16) +80000 String +100000 None +100000 UInt64 +70000 Array(UInt16) +100000 None +100000 UInt64 +190000 String +70000 Array(UInt16) +100000 None +100000 UInt64 +190000 String +200000 Map(UInt64, UInt64) +100000 None +100000 UInt64 +200000 Map(UInt64, UInt64) +260000 String +10000 Tuple(UInt64, UInt64) +100000 UInt64 +100000 None +200000 Map(UInt64, UInt64) +260000 String +100000 None +100000 UInt64 +200000 Map(UInt64, UInt64) +270000 String +MergeTree wide + horizontal merge +test1 +50000 DateTime +60000 Date +70000 Array(UInt16) +80000 String +100000 None +100000 UInt64 +70000 Array(UInt16) +100000 None +100000 UInt64 +190000 String +70000 Array(UInt16) +100000 UInt64 +100000 None +190000 String +200000 Map(UInt64, UInt64) +100000 None +100000 UInt64 +200000 Map(UInt64, UInt64) +260000 String +10000 Tuple(UInt64, UInt64) +100000 UInt64 +100000 None +200000 Map(UInt64, UInt64) +260000 String +100000 None +100000 UInt64 +200000 Map(UInt64, UInt64) +270000 String +MergeTree compact + vertical merge +test1 +50000 DateTime +60000 Date +70000 Array(UInt16) +80000 String +100000 None +100000 UInt64 +70000 Array(UInt16) +100000 None +100000 UInt64 +190000 String +70000 Array(UInt16) +100000 UInt64 +100000 None +190000 String +200000 Map(UInt64, UInt64) +100000 None +100000 UInt64 +200000 Map(UInt64, UInt64) +260000 String +10000 Tuple(UInt64, UInt64) +100000 None +100000 UInt64 +200000 Map(UInt64, UInt64) +260000 String +100000 UInt64 +100000 None +200000 Map(UInt64, UInt64) +270000 String +MergeTree wide + vertical merge +test1 +50000 DateTime +60000 Date +70000 Array(UInt16) +80000 String +100000 None +100000 UInt64 +70000 Array(UInt16) +100000 None +100000 UInt64 +190000 String +70000 Array(UInt16) +100000 None +100000 UInt64 +190000 String +200000 Map(UInt64, UInt64) +100000 None +100000 UInt64 +200000 Map(UInt64, UInt64) +260000 String +10000 Tuple(UInt64, UInt64) +100000 None +100000 UInt64 +200000 Map(UInt64, UInt64) +260000 String +100000 None +100000 UInt64 +200000 Map(UInt64, UInt64) +270000 String diff --git a/tests/queries/0_stateless/03037_dynamic_merges_1.sh b/tests/queries/0_stateless/03037_dynamic_merges_1.sh new file mode 100755 index 00000000000..cf524fb9393 --- /dev/null +++ b/tests/queries/0_stateless/03037_dynamic_merges_1.sh @@ -0,0 +1,61 @@ +#!/usr/bin/env bash +# Tags: long + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# 
reset --log_comment +CLICKHOUSE_LOG_COMMENT= +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_dynamic_type=1" + + +function test() +{ + echo "test" + $CH_CLIENT -q "system stop merges test" + $CH_CLIENT -q "insert into test select number, number from numbers(100000)" + $CH_CLIENT -q "insert into test select number, 'str_' || toString(number) from numbers(80000)" + $CH_CLIENT -q "insert into test select number, range(number % 10 + 1) from numbers(70000)" + $CH_CLIENT -q "insert into test select number, toDate(number) from numbers(60000)" + $CH_CLIENT -q "insert into test select number, toDateTime(number) from numbers(50000)" + $CH_CLIENT -q "insert into test select number, NULL from numbers(100000)" + + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count()" + $CH_CLIENT -nm -q "system start merges test; optimize table test final;" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count()" + + $CH_CLIENT -q "system stop merges test" + $CH_CLIENT -q "insert into test select number, map(number, number) from numbers(200000)" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count()" + $CH_CLIENT -nm -q "system start merges test; optimize table test final;" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count()" + + $CH_CLIENT -q "system stop merges test" + $CH_CLIENT -q "insert into test select number, tuple(number, number) from numbers(10000)" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count()" + $CH_CLIENT -nm -q "system start merges test; optimize table test final;" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count()" +} + +$CH_CLIENT -q "drop table if exists test;" + +echo "MergeTree compact + horizontal merge" +$CH_CLIENT -q "create table test (id UInt64, d Dynamic(max_types=3)) engine=MergeTree order by id settings min_rows_for_wide_part=1000000000, min_bytes_for_wide_part=10000000000;" +test +$CH_CLIENT -q "drop table test;" + +echo "MergeTree wide + horizontal merge" +$CH_CLIENT -q "create table test (id UInt64, d Dynamic(max_types=3)) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1;" +test +$CH_CLIENT -q "drop table test;" + +echo "MergeTree compact + vertical merge" +$CH_CLIENT -q "create table test (id UInt64, d Dynamic(max_types=3)) engine=MergeTree order by id settings min_rows_for_wide_part=1000000000, min_bytes_for_wide_part=10000000000, vertical_merge_algorithm_min_rows_to_activate=1, vertical_merge_algorithm_min_columns_to_activate=1;" +test +$CH_CLIENT -q "drop table test;" + +echo "MergeTree wide + vertical merge" +$CH_CLIENT -q "create table test (id UInt64, d Dynamic(max_types=3)) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1, vertical_merge_algorithm_min_rows_to_activate=1, vertical_merge_algorithm_min_columns_to_activate=1;" +test +$CH_CLIENT -q "drop table test;" diff --git a/tests/queries/0_stateless/03037_dynamic_merges_2.sh b/tests/queries/0_stateless/03037_dynamic_merges_2.sh new file mode 100755 index 00000000000..e9d571c2104 --- /dev/null +++ b/tests/queries/0_stateless/03037_dynamic_merges_2.sh @@ -0,0 +1,45 @@ +#!/usr/bin/env bash +# Tags: long + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# reset --log_comment 
+CLICKHOUSE_LOG_COMMENT= +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_dynamic_type=1" + + +function test() +{ + echo "test" + $CH_CLIENT -q "system stop merges test" + $CH_CLIENT -q "insert into test select number, number from numbers(1000000)" + $CH_CLIENT -q "insert into test select number, 'str_' || toString(number) from numbers(1000000, 1000000)" + $CH_CLIENT -q "insert into test select number, range(number % 10 + 1) from numbers(2000000, 1000000)" + + $CH_CLIENT -nm -q "system start merges test; optimize table test final;" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count()" +} + +$CH_CLIENT -q "drop table if exists test;" + +echo "MergeTree compact + horizontal merge" +$CH_CLIENT -q "create table test (id UInt64, d Dynamic) engine=MergeTree order by id settings min_rows_for_wide_part=1000000000, min_bytes_for_wide_part=10000000000;" +test +$CH_CLIENT -q "drop table test;" + +echo "MergeTree wide + horizontal merge" +$CH_CLIENT -q "create table test (id UInt64, d Dynamic) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1;" +test +$CH_CLIENT -q "drop table test;" + +echo "MergeTree compact + vertical merge" +$CH_CLIENT -q "create table test (id UInt64, d Dynamic) engine=MergeTree order by id settings min_rows_for_wide_part=1000000000, min_bytes_for_wide_part=10000000000, vertical_merge_algorithm_min_rows_to_activate=1, vertical_merge_algorithm_min_columns_to_activate=1;" +test +$CH_CLIENT -q "drop table test;" + +echo "MergeTree wide + vertical merge" +$CH_CLIENT -q "create table test (id UInt64, d Dynamic) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1, vertical_merge_algorithm_min_rows_to_activate=1, vertical_merge_algorithm_min_columns_to_activate=1;" +test +$CH_CLIENT -q "drop table test;" diff --git a/tests/queries/0_stateless/03038_nested_dynamic_merges.reference b/tests/queries/0_stateless/03038_nested_dynamic_merges.reference new file mode 100644 index 00000000000..f8118ce8b95 --- /dev/null +++ b/tests/queries/0_stateless/03038_nested_dynamic_merges.reference @@ -0,0 +1,92 @@ +MergeTree compact + horizontal merge +test +16667 Tuple(a Dynamic(max_types=3)):Date +33333 Tuple(a Dynamic(max_types=3)):Array(UInt8) +50000 Tuple(a Dynamic(max_types=3)):UInt64 +50000 Tuple(a Dynamic(max_types=3)):String +100000 UInt64:None +33333 Tuple(a Dynamic(max_types=3)):Array(UInt8) +50000 Tuple(a Dynamic(max_types=3)):UInt64 +66667 Tuple(a Dynamic(max_types=3)):String +100000 UInt64:None +16667 Tuple(a Dynamic(max_types=3)):DateTime +33333 Tuple(a Dynamic(max_types=3)):Array(UInt8) +50000 Tuple(a Dynamic(max_types=3)):UInt64 +66667 Tuple(a Dynamic(max_types=3)):String +100000 Tuple(a Dynamic(max_types=3)):Tuple(UInt64) +100000 UInt64:None +133333 Tuple(a Dynamic(max_types=3)):None +50000 Tuple(a Dynamic(max_types=3)):UInt64 +100000 Tuple(a Dynamic(max_types=3)):Tuple(UInt64) +100000 UInt64:None +116667 Tuple(a Dynamic(max_types=3)):String +133333 Tuple(a Dynamic(max_types=3)):None +MergeTree wide + horizontal merge +test +16667 Tuple(a Dynamic(max_types=3)):Date +33333 Tuple(a Dynamic(max_types=3)):Array(UInt8) +50000 Tuple(a Dynamic(max_types=3)):UInt64 +50000 Tuple(a Dynamic(max_types=3)):String +100000 UInt64:None +33333 Tuple(a Dynamic(max_types=3)):Array(UInt8) +50000 Tuple(a Dynamic(max_types=3)):UInt64 +66667 Tuple(a Dynamic(max_types=3)):String +100000 UInt64:None +16667 Tuple(a 
Dynamic(max_types=3)):DateTime +33333 Tuple(a Dynamic(max_types=3)):Array(UInt8) +50000 Tuple(a Dynamic(max_types=3)):UInt64 +66667 Tuple(a Dynamic(max_types=3)):String +100000 Tuple(a Dynamic(max_types=3)):Tuple(UInt64) +100000 UInt64:None +133333 Tuple(a Dynamic(max_types=3)):None +50000 Tuple(a Dynamic(max_types=3)):UInt64 +100000 UInt64:None +100000 Tuple(a Dynamic(max_types=3)):Tuple(UInt64) +116667 Tuple(a Dynamic(max_types=3)):String +133333 Tuple(a Dynamic(max_types=3)):None +MergeTree compact + vertical merge +test +16667 Tuple(a Dynamic(max_types=3)):Date +33333 Tuple(a Dynamic(max_types=3)):Array(UInt8) +50000 Tuple(a Dynamic(max_types=3)):String +50000 Tuple(a Dynamic(max_types=3)):UInt64 +100000 UInt64:None +33333 Tuple(a Dynamic(max_types=3)):Array(UInt8) +50000 Tuple(a Dynamic(max_types=3)):UInt64 +66667 Tuple(a Dynamic(max_types=3)):String +100000 UInt64:None +16667 Tuple(a Dynamic(max_types=3)):DateTime +33333 Tuple(a Dynamic(max_types=3)):Array(UInt8) +50000 Tuple(a Dynamic(max_types=3)):UInt64 +66667 Tuple(a Dynamic(max_types=3)):String +100000 UInt64:None +100000 Tuple(a Dynamic(max_types=3)):Tuple(UInt64) +133333 Tuple(a Dynamic(max_types=3)):None +50000 Tuple(a Dynamic(max_types=3)):UInt64 +100000 Tuple(a Dynamic(max_types=3)):Tuple(UInt64) +100000 UInt64:None +116667 Tuple(a Dynamic(max_types=3)):String +133333 Tuple(a Dynamic(max_types=3)):None +MergeTree wide + vertical merge +test +16667 Tuple(a Dynamic(max_types=3)):Date +33333 Tuple(a Dynamic(max_types=3)):Array(UInt8) +50000 Tuple(a Dynamic(max_types=3)):String +50000 Tuple(a Dynamic(max_types=3)):UInt64 +100000 UInt64:None +33333 Tuple(a Dynamic(max_types=3)):Array(UInt8) +50000 Tuple(a Dynamic(max_types=3)):UInt64 +66667 Tuple(a Dynamic(max_types=3)):String +100000 UInt64:None +16667 Tuple(a Dynamic(max_types=3)):DateTime +33333 Tuple(a Dynamic(max_types=3)):Array(UInt8) +50000 Tuple(a Dynamic(max_types=3)):UInt64 +66667 Tuple(a Dynamic(max_types=3)):String +100000 Tuple(a Dynamic(max_types=3)):Tuple(UInt64) +100000 UInt64:None +133333 Tuple(a Dynamic(max_types=3)):None +50000 Tuple(a Dynamic(max_types=3)):UInt64 +100000 UInt64:None +100000 Tuple(a Dynamic(max_types=3)):Tuple(UInt64) +116667 Tuple(a Dynamic(max_types=3)):String +133333 Tuple(a Dynamic(max_types=3)):None diff --git a/tests/queries/0_stateless/03038_nested_dynamic_merges.sh b/tests/queries/0_stateless/03038_nested_dynamic_merges.sh new file mode 100755 index 00000000000..afb167ec20d --- /dev/null +++ b/tests/queries/0_stateless/03038_nested_dynamic_merges.sh @@ -0,0 +1,53 @@ +#!/usr/bin/env bash +# Tags: long + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# reset --log_comment +CLICKHOUSE_LOG_COMMENT= +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_variant_type=1 --use_variant_as_common_type=1 --allow_experimental_dynamic_type=1" + + +function test() +{ + echo "test" + $CH_CLIENT -q "system stop merges test" + $CH_CLIENT -q "insert into test select number, number from numbers(100000)" + $CH_CLIENT -q "insert into test select number, tuple(if(number % 2 == 0, number, 'str_' || toString(number)))::Tuple(a Dynamic(max_types=3)) from numbers(100000)" + $CH_CLIENT -q "insert into test select number, tuple(if(number % 3 == 0, toDate(number), range(number % 10)))::Tuple(a Dynamic(max_types=3)) from numbers(50000)" + + $CH_CLIENT -q "select count(), dynamicType(d) || ':' || dynamicType(d.\`Tuple(a Dynamic(max_types=3))\`.a) as type from test group by type order by count()" + $CH_CLIENT -nm -q "system start merges test; optimize table test final;" + $CH_CLIENT -q "select count(), dynamicType(d) || ':' || dynamicType(d.\`Tuple(a Dynamic(max_types=3))\`.a) as type from test group by type order by count()" + + $CH_CLIENT -q "insert into test select number, tuple(if(number % 3 == 0, toDateTime(number), NULL))::Tuple(a Dynamic(max_types=3)) from numbers(50000)" + $CH_CLIENT -q "insert into test select number, tuple(if(number % 2 == 0, tuple(number), NULL))::Tuple(a Dynamic(max_types=3)) from numbers(200000)" + + $CH_CLIENT -q "select count(), dynamicType(d) || ':' || dynamicType(d.\`Tuple(a Dynamic(max_types=3))\`.a) as type from test group by type order by count()" + $CH_CLIENT -nm -q "system start merges test; optimize table test final;" + $CH_CLIENT -q "select count(), dynamicType(d) || ':' || dynamicType(d.\`Tuple(a Dynamic(max_types=3))\`.a) as type from test group by type order by count()" +} + +$CH_CLIENT -q "drop table if exists test;" + +echo "MergeTree compact + horizontal merge" +$CH_CLIENT -q "create table test (id UInt64, d Dynamic(max_types=3)) engine=MergeTree order by id settings min_rows_for_wide_part=1000000000, min_bytes_for_wide_part=10000000000;" +test +$CH_CLIENT -q "drop table test;" + +echo "MergeTree wide + horizontal merge" +$CH_CLIENT -q "create table test (id UInt64, d Dynamic(max_types=3)) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1;" +test +$CH_CLIENT -q "drop table test;" + +echo "MergeTree compact + vertical merge" +$CH_CLIENT -q "create table test (id UInt64, d Dynamic(max_types=3)) engine=MergeTree order by id settings min_rows_for_wide_part=1000000000, min_bytes_for_wide_part=10000000000, vertical_merge_algorithm_min_rows_to_activate=1, vertical_merge_algorithm_min_columns_to_activate=1;" +test +$CH_CLIENT -q "drop table test;" + +echo "MergeTree wide + vertical merge" +$CH_CLIENT -q "create table test (id UInt64, d Dynamic(max_types=3)) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1, vertical_merge_algorithm_min_rows_to_activate=1, vertical_merge_algorithm_min_columns_to_activate=1;" +test +$CH_CLIENT -q "drop table test;" diff --git a/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_1.reference b/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_1.reference new file mode 100644 index 00000000000..a7fbbabcd46 --- /dev/null +++ b/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_1.reference @@ -0,0 +1,88 @@ +MergeTree compact + horizontal merge +ReplacingMergeTree +100000 UInt64 +100000 String +50000 UInt64 +100000 String +SummingMergeTree +100000 UInt64 +100000 String +200000 1 +50000 String +100000 UInt64 
+50000 2 +100000 1 +AggregatingMergeTree +100000 String +100000 UInt64 +200000 1 +50000 String +100000 UInt64 +50000 2 +100000 1 +MergeTree wide + horizontal merge +ReplacingMergeTree +100000 UInt64 +100000 String +50000 UInt64 +100000 String +SummingMergeTree +100000 String +100000 UInt64 +200000 1 +50000 String +100000 UInt64 +50000 2 +100000 1 +AggregatingMergeTree +100000 String +100000 UInt64 +200000 1 +50000 String +100000 UInt64 +50000 2 +100000 1 +MergeTree compact + vertical merge +ReplacingMergeTree +100000 String +100000 UInt64 +50000 UInt64 +100000 String +SummingMergeTree +100000 UInt64 +100000 String +200000 1 +50000 String +100000 UInt64 +50000 2 +100000 1 +AggregatingMergeTree +100000 UInt64 +100000 String +200000 1 +50000 String +100000 UInt64 +50000 2 +100000 1 +MergeTree wide + vertical merge +ReplacingMergeTree +100000 UInt64 +100000 String +50000 UInt64 +100000 String +SummingMergeTree +100000 String +100000 UInt64 +200000 1 +50000 String +100000 UInt64 +50000 2 +100000 1 +AggregatingMergeTree +100000 UInt64 +100000 String +200000 1 +50000 String +100000 UInt64 +50000 2 +100000 1 diff --git a/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_1.sh b/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_1.sh new file mode 100755 index 00000000000..3384a135307 --- /dev/null +++ b/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_1.sh @@ -0,0 +1,65 @@ +#!/usr/bin/env bash +# Tags: long + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# reset --log_comment +CLICKHOUSE_LOG_COMMENT= +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_dynamic_type=1" + + +function test() +{ + echo "ReplacingMergeTree" + $CH_CLIENT -q "create table test (id UInt64, d Dynamic) engine=ReplacingMergeTree order by id settings $1;" + $CH_CLIENT -q "system stop merges test" + $CH_CLIENT -q "insert into test select number, number from numbers(100000)" + $CH_CLIENT -q "insert into test select number, 'str_' || toString(number) from numbers(50000, 100000)" + + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count()" + $CH_CLIENT -nm -q "system start merges test; optimize table test final" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count()" + $CH_CLIENT -q "drop table test" + + echo "SummingMergeTree" + $CH_CLIENT -q "create table test (id UInt64, sum UInt64, d Dynamic) engine=SummingMergeTree(sum) order by id settings $1;" + $CH_CLIENT -q "system stop merges test" + $CH_CLIENT -q "insert into test select number, 1, number from numbers(100000)" + $CH_CLIENT -q "insert into test select number, 1, 'str_' || toString(number) from numbers(50000, 100000)" + + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count()" + $CH_CLIENT -q "select count(), sum from test group by sum" + $CH_CLIENT -nm -q "system start merges test; optimize table test final" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count()" + $CH_CLIENT -q "select count(), sum from test group by sum" + $CH_CLIENT -q "drop table test" + + echo "AggregatingMergeTree" + $CH_CLIENT -q "create table test (id UInt64, sum AggregateFunction(sum, UInt64), d Dynamic) engine=AggregatingMergeTree() order by id settings $1;" + $CH_CLIENT -q "system stop merges test" + $CH_CLIENT -q "insert into test select number, sumState(1::UInt64), number from numbers(100000) group by number" 
+ $CH_CLIENT -q "insert into test select number, sumState(1::UInt64), 'str_' || toString(number) from numbers(50000, 100000) group by number" + + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count()" + $CH_CLIENT -q "select count(), sum from (select sumMerge(sum) as sum from test group by id, _part) group by sum" + $CH_CLIENT -nm -q "system start merges test; optimize table test final" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count()" + $CH_CLIENT -q "select count(), sum from (select sumMerge(sum) as sum from test group by id, _part) group by sum" + $CH_CLIENT -q "drop table test" +} + +$CH_CLIENT -q "drop table if exists test;" + +echo "MergeTree compact + horizontal merge" +test "min_rows_for_wide_part=100000000000, min_bytes_for_wide_part=1000000000000" + +echo "MergeTree wide + horizontal merge" +test "min_rows_for_wide_part=1, min_bytes_for_wide_part=1" + +echo "MergeTree compact + vertical merge" +test "min_rows_for_wide_part=100000000000, min_bytes_for_wide_part=1000000000000, vertical_merge_algorithm_min_rows_to_activate=1, vertical_merge_algorithm_min_columns_to_activate=1" + +echo "MergeTree wide + vertical merge" +test "min_rows_for_wide_part=1, min_bytes_for_wide_part=1, vertical_merge_algorithm_min_rows_to_activate=1, vertical_merge_algorithm_min_columns_to_activate=1" diff --git a/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_2.reference b/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_2.reference new file mode 100644 index 00000000000..03c8b4564fa --- /dev/null +++ b/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_2.reference @@ -0,0 +1,44 @@ +MergeTree compact + horizontal merge +CollapsingMergeTree +100000 String +100000 UInt64 +50000 UInt64 +50000 String +VersionedCollapsingMergeTree +100000 String +100000 UInt64 +75000 String +75000 UInt64 +MergeTree wide + horizontal merge +CollapsingMergeTree +100000 UInt64 +100000 String +50000 String +50000 UInt64 +VersionedCollapsingMergeTree +100000 UInt64 +100000 String +75000 String +75000 UInt64 +MergeTree compact + vertical merge +CollapsingMergeTree +100000 UInt64 +100000 String +50000 UInt64 +50000 String +VersionedCollapsingMergeTree +100000 UInt64 +100000 String +75000 UInt64 +75000 String +MergeTree wide + vertical merge +CollapsingMergeTree +100000 UInt64 +100000 String +50000 String +50000 UInt64 +VersionedCollapsingMergeTree +100000 UInt64 +100000 String +75000 UInt64 +75000 String diff --git a/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_2.sh b/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_2.sh new file mode 100755 index 00000000000..5dae9228d0a --- /dev/null +++ b/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_2.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +# Tags: long + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# reset --log_comment +CLICKHOUSE_LOG_COMMENT= +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_dynamic_type=1" + + +function test() +{ + echo "CollapsingMergeTree" + $CH_CLIENT -q "create table test (id UInt64, sign Int8, d Dynamic) engine=CollapsingMergeTree(sign) order by id settings $1;" + $CH_CLIENT -q "system stop merges test" + $CH_CLIENT -q "insert into test select number, 1, number from numbers(100000)" + $CH_CLIENT -q "insert into test select number, -1, 'str_' || toString(number) from numbers(50000, 100000)" + + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count()" + $CH_CLIENT -nm -q "system start merges test; optimize table test final" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count()" + $CH_CLIENT -q "drop table test" + + echo "VersionedCollapsingMergeTree" + $CH_CLIENT -q "create table test (id UInt64, sign Int8, version UInt8, d Dynamic) engine=VersionedCollapsingMergeTree(sign, version) order by id settings $1;" + $CH_CLIENT -q "system stop merges test" + $CH_CLIENT -q "insert into test select number, 1, 1, number from numbers(100000)" + $CH_CLIENT -q "insert into test select number, -1, number >= 75000 ? 2 : 1, 'str_' || toString(number) from numbers(50000, 100000)" + + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count()" + $CH_CLIENT -nm -q "system start merges test; optimize table test final" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count()" + $CH_CLIENT -q "drop table test" +} + +$CH_CLIENT -q "drop table if exists test;" + +echo "MergeTree compact + horizontal merge" +test "min_rows_for_wide_part=100000000000, min_bytes_for_wide_part=1000000000000" + +echo "MergeTree wide + horizontal merge" +test "min_rows_for_wide_part=1, min_bytes_for_wide_part=1" + +echo "MergeTree compact + vertical merge" +test "min_rows_for_wide_part=100000000000, min_bytes_for_wide_part=1000000000000, vertical_merge_algorithm_min_rows_to_activate=1, vertical_merge_algorithm_min_columns_to_activate=1" + +echo "MergeTree wide + vertical merge" +test "min_rows_for_wide_part=1, min_bytes_for_wide_part=1, vertical_merge_algorithm_min_rows_to_activate=1, vertical_merge_algorithm_min_columns_to_activate=1" diff --git a/tests/queries/0_stateless/03040_dynamic_type_alters.reference b/tests/queries/0_stateless/03040_dynamic_type_alters.reference new file mode 100644 index 00000000000..ca98ec0963c --- /dev/null +++ b/tests/queries/0_stateless/03040_dynamic_type_alters.reference @@ -0,0 +1,526 @@ +Memory +initial insert +alter add column 1 +3 None +0 0 \N \N \N \N +1 1 \N \N \N \N +2 2 \N \N \N \N +insert after alter add column 1 +4 String +4 UInt64 +7 None +0 0 \N \N \N \N \N +1 1 \N \N \N \N \N +2 2 \N \N \N \N \N +3 3 3 \N 3 \N \N +4 4 4 \N 4 \N \N +5 5 5 \N 5 \N \N +6 6 str_6 str_6 \N \N \N +7 7 str_7 str_7 \N \N \N +8 8 str_8 str_8 \N \N \N +9 9 \N \N \N \N \N +10 10 \N \N \N \N \N +11 11 \N \N \N \N \N +12 12 12 \N 12 \N \N +13 13 str_13 str_13 \N \N \N +14 14 \N \N \N \N \N +alter modify column 1 +7 None +8 String +0 0 \N \N \N \N \N +1 1 \N \N \N \N \N +2 2 \N \N \N \N \N +3 3 3 3 \N \N \N +4 4 4 4 \N \N \N +5 5 5 5 \N \N \N +6 6 str_6 str_6 \N \N \N +7 7 str_7 str_7 \N \N \N +8 8 str_8 str_8 \N \N \N +9 9 \N \N \N \N \N +10 10 \N \N \N \N \N +11 11 \N \N \N \N \N +12 12 12 12 \N \N \N +13 13 str_13 str_13 \N \N \N +14 14 \N \N \N \N \N +insert after alter modify column 1 +8 None +11 String +0 0 \N \N \N \N \N +1 1 \N \N \N 
\N \N +2 2 \N \N \N \N \N +3 3 3 3 \N \N \N +4 4 4 4 \N \N \N +5 5 5 5 \N \N \N +6 6 str_6 str_6 \N \N \N +7 7 str_7 str_7 \N \N \N +8 8 str_8 str_8 \N \N \N +9 9 \N \N \N \N \N +10 10 \N \N \N \N \N +11 11 \N \N \N \N \N +12 12 12 12 \N \N \N +13 13 str_13 str_13 \N \N \N +14 14 \N \N \N \N \N +15 15 \N \N \N \N \N +16 16 16 16 \N \N \N +17 17 str_17 str_17 \N \N \N +18 18 1970-01-19 1970-01-19 \N \N \N +alter modify column 2 +4 UInt64 +7 String +8 None +0 0 \N \N \N \N \N +1 1 \N \N \N \N \N +2 2 \N \N \N \N \N +3 3 3 \N 3 \N \N +4 4 4 \N 4 \N \N +5 5 5 \N 5 \N \N +6 6 str_6 str_6 \N \N \N +7 7 str_7 str_7 \N \N \N +8 8 str_8 str_8 \N \N \N +9 9 \N \N \N \N \N +10 10 \N \N \N \N \N +11 11 \N \N \N \N \N +12 12 12 \N 12 \N \N +13 13 str_13 str_13 \N \N \N +14 14 \N \N \N \N \N +15 15 \N \N \N \N \N +16 16 16 16 \N \N \N +17 17 str_17 str_17 \N \N \N +18 18 1970-01-19 1970-01-19 \N \N \N +insert after alter modify column 2 +1 Date +5 UInt64 +8 String +9 None +0 0 \N \N \N \N \N +1 1 \N \N \N \N \N +2 2 \N \N \N \N \N +3 3 3 \N 3 \N \N +4 4 4 \N 4 \N \N +5 5 5 \N 5 \N \N +6 6 str_6 str_6 \N \N \N +7 7 str_7 str_7 \N \N \N +8 8 str_8 str_8 \N \N \N +9 9 \N \N \N \N \N +10 10 \N \N \N \N \N +11 11 \N \N \N \N \N +12 12 12 \N 12 \N \N +13 13 str_13 str_13 \N \N \N +14 14 \N \N \N \N \N +15 15 \N \N \N \N \N +16 16 16 16 \N \N \N +17 17 str_17 str_17 \N \N \N +18 18 1970-01-19 1970-01-19 \N \N \N +19 19 \N \N \N \N \N +20 20 20 \N 20 \N \N +21 21 str_21 str_21 \N \N \N +22 22 1970-01-23 \N \N 1970-01-23 \N +alter modify column 3 +1 Date +5 UInt64 +8 String +9 None +0 0 0 \N \N \N \N \N \N +1 1 1 \N \N \N \N \N \N +2 2 2 \N \N \N \N \N \N +3 3 3 \N \N \N 3 \N \N +4 4 4 \N \N \N 4 \N \N +5 5 5 \N \N \N 5 \N \N +6 6 6 \N \N str_6 \N \N \N +7 7 7 \N \N str_7 \N \N \N +8 8 8 \N \N str_8 \N \N \N +9 9 9 \N \N \N \N \N \N +10 10 10 \N \N \N \N \N \N +11 11 11 \N \N \N \N \N \N +12 12 12 \N \N \N 12 \N \N +13 13 13 \N \N str_13 \N \N \N +14 14 14 \N \N \N \N \N \N +15 15 15 \N \N \N \N \N \N +16 16 16 \N \N 16 \N \N \N +17 17 17 \N \N str_17 \N \N \N +18 18 18 \N \N 1970-01-19 \N \N \N +19 19 19 \N \N \N \N \N \N +20 20 20 \N \N \N 20 \N \N +21 21 21 \N \N str_21 \N \N \N +22 22 22 \N \N \N \N 1970-01-23 \N +insert after alter modify column 3 +1 Date +5 UInt64 +8 String +12 None +0 0 0 \N \N \N \N \N \N +1 1 1 \N \N \N \N \N \N +2 2 2 \N \N \N \N \N \N +3 3 3 \N \N \N 3 \N \N +4 4 4 \N \N \N 4 \N \N +5 5 5 \N \N \N 5 \N \N +6 6 6 \N \N str_6 \N \N \N +7 7 7 \N \N str_7 \N \N \N +8 8 8 \N \N str_8 \N \N \N +9 9 9 \N \N \N \N \N \N +10 10 10 \N \N \N \N \N \N +11 11 11 \N \N \N \N \N \N +12 12 12 \N \N \N 12 \N \N +13 13 13 \N \N str_13 \N \N \N +14 14 14 \N \N \N \N \N \N +15 15 15 \N \N \N \N \N \N +16 16 16 \N \N 16 \N \N \N +17 17 17 \N \N str_17 \N \N \N +18 18 18 \N \N 1970-01-19 \N \N \N +19 19 19 \N \N \N \N \N \N +20 20 20 \N \N \N 20 \N \N +21 21 21 \N \N str_21 \N \N \N +22 22 22 \N \N \N \N 1970-01-23 \N +23 \N \N \N \N \N \N \N \N +24 24 24 \N \N \N \N \N \N +25 str_25 \N str_25 \N \N \N \N \N +MergeTree compact +initial insert +alter add column 1 +3 None +0 0 \N \N \N \N +1 1 \N \N \N \N +2 2 \N \N \N \N +insert after alter add column 1 +4 String +4 UInt64 +7 None +0 0 \N \N \N \N \N +1 1 \N \N \N \N \N +2 2 \N \N \N \N \N +3 3 3 \N 3 \N \N +4 4 4 \N 4 \N \N +5 5 5 \N 5 \N \N +6 6 str_6 str_6 \N \N \N +7 7 str_7 str_7 \N \N \N +8 8 str_8 str_8 \N \N \N +9 9 \N \N \N \N \N +10 10 \N \N \N \N \N +11 11 \N \N \N \N \N +12 12 12 \N 12 \N \N +13 13 str_13 str_13 \N \N \N +14 14 \N \N \N \N \N 
+alter modify column 1 +7 None +8 String +0 0 \N \N \N \N \N +1 1 \N \N \N \N \N +2 2 \N \N \N \N \N +3 3 3 3 \N \N \N +4 4 4 4 \N \N \N +5 5 5 5 \N \N \N +6 6 str_6 str_6 \N \N \N +7 7 str_7 str_7 \N \N \N +8 8 str_8 str_8 \N \N \N +9 9 \N \N \N \N \N +10 10 \N \N \N \N \N +11 11 \N \N \N \N \N +12 12 12 12 \N \N \N +13 13 str_13 str_13 \N \N \N +14 14 \N \N \N \N \N +insert after alter modify column 1 +8 None +11 String +0 0 \N \N \N \N \N +1 1 \N \N \N \N \N +2 2 \N \N \N \N \N +3 3 3 3 \N \N \N +4 4 4 4 \N \N \N +5 5 5 5 \N \N \N +6 6 str_6 str_6 \N \N \N +7 7 str_7 str_7 \N \N \N +8 8 str_8 str_8 \N \N \N +9 9 \N \N \N \N \N +10 10 \N \N \N \N \N +11 11 \N \N \N \N \N +12 12 12 12 \N \N \N +13 13 str_13 str_13 \N \N \N +14 14 \N \N \N \N \N +15 15 \N \N \N \N \N +16 16 16 16 \N \N \N +17 17 str_17 str_17 \N \N \N +18 18 1970-01-19 1970-01-19 \N \N \N +alter modify column 2 +8 None +11 String +0 0 \N \N \N \N \N +1 1 \N \N \N \N \N +2 2 \N \N \N \N \N +3 3 3 3 \N \N \N +4 4 4 4 \N \N \N +5 5 5 5 \N \N \N +6 6 str_6 str_6 \N \N \N +7 7 str_7 str_7 \N \N \N +8 8 str_8 str_8 \N \N \N +9 9 \N \N \N \N \N +10 10 \N \N \N \N \N +11 11 \N \N \N \N \N +12 12 12 12 \N \N \N +13 13 str_13 str_13 \N \N \N +14 14 \N \N \N \N \N +15 15 \N \N \N \N \N +16 16 16 16 \N \N \N +17 17 str_17 str_17 \N \N \N +18 18 1970-01-19 1970-01-19 \N \N \N +insert after alter modify column 2 +1 Date +1 UInt64 +9 None +12 String +0 0 \N \N \N \N \N +1 1 \N \N \N \N \N +2 2 \N \N \N \N \N +3 3 3 3 \N \N \N +4 4 4 4 \N \N \N +5 5 5 5 \N \N \N +6 6 str_6 str_6 \N \N \N +7 7 str_7 str_7 \N \N \N +8 8 str_8 str_8 \N \N \N +9 9 \N \N \N \N \N +10 10 \N \N \N \N \N +11 11 \N \N \N \N \N +12 12 12 12 \N \N \N +13 13 str_13 str_13 \N \N \N +14 14 \N \N \N \N \N +15 15 \N \N \N \N \N +16 16 16 16 \N \N \N +17 17 str_17 str_17 \N \N \N +18 18 1970-01-19 1970-01-19 \N \N \N +19 19 \N \N \N \N \N +20 20 20 \N 20 \N \N +21 21 str_21 str_21 \N \N \N +22 22 1970-01-23 \N \N 1970-01-23 \N +alter modify column 3 +1 Date +1 UInt64 +9 None +12 String +0 0 0 \N \N \N \N \N \N +1 1 1 \N \N \N \N \N \N +2 2 2 \N \N \N \N \N \N +3 3 3 \N \N 3 \N \N \N +4 4 4 \N \N 4 \N \N \N +5 5 5 \N \N 5 \N \N \N +6 6 6 \N \N str_6 \N \N \N +7 7 7 \N \N str_7 \N \N \N +8 8 8 \N \N str_8 \N \N \N +9 9 9 \N \N \N \N \N \N +10 10 10 \N \N \N \N \N \N +11 11 11 \N \N \N \N \N \N +12 12 12 \N \N 12 \N \N \N +13 13 13 \N \N str_13 \N \N \N +14 14 14 \N \N \N \N \N \N +15 15 15 \N \N \N \N \N \N +16 16 16 \N \N 16 \N \N \N +17 17 17 \N \N str_17 \N \N \N +18 18 18 \N \N 1970-01-19 \N \N \N +19 19 19 \N \N \N \N \N \N +20 20 20 \N \N \N 20 \N \N +21 21 21 \N \N str_21 \N \N \N +22 22 22 \N \N \N \N 1970-01-23 \N +insert after alter modify column 3 +1 Date +1 UInt64 +12 None +12 String +0 0 0 \N \N \N \N \N \N +1 1 1 \N \N \N \N \N \N +2 2 2 \N \N \N \N \N \N +3 3 3 \N \N 3 \N \N \N +4 4 4 \N \N 4 \N \N \N +5 5 5 \N \N 5 \N \N \N +6 6 6 \N \N str_6 \N \N \N +7 7 7 \N \N str_7 \N \N \N +8 8 8 \N \N str_8 \N \N \N +9 9 9 \N \N \N \N \N \N +10 10 10 \N \N \N \N \N \N +11 11 11 \N \N \N \N \N \N +12 12 12 \N \N 12 \N \N \N +13 13 13 \N \N str_13 \N \N \N +14 14 14 \N \N \N \N \N \N +15 15 15 \N \N \N \N \N \N +16 16 16 \N \N 16 \N \N \N +17 17 17 \N \N str_17 \N \N \N +18 18 18 \N \N 1970-01-19 \N \N \N +19 19 19 \N \N \N \N \N \N +20 20 20 \N \N \N 20 \N \N +21 21 21 \N \N str_21 \N \N \N +22 22 22 \N \N \N \N 1970-01-23 \N +23 \N \N \N \N \N \N \N \N +24 24 24 \N \N \N \N \N \N +25 str_25 \N str_25 \N \N \N \N \N +MergeTree wide +initial insert +alter add column 1 
+3 None +0 0 \N \N \N \N +1 1 \N \N \N \N +2 2 \N \N \N \N +insert after alter add column 1 +4 String +4 UInt64 +7 None +0 0 \N \N \N \N \N +1 1 \N \N \N \N \N +2 2 \N \N \N \N \N +3 3 3 \N 3 \N \N +4 4 4 \N 4 \N \N +5 5 5 \N 5 \N \N +6 6 str_6 str_6 \N \N \N +7 7 str_7 str_7 \N \N \N +8 8 str_8 str_8 \N \N \N +9 9 \N \N \N \N \N +10 10 \N \N \N \N \N +11 11 \N \N \N \N \N +12 12 12 \N 12 \N \N +13 13 str_13 str_13 \N \N \N +14 14 \N \N \N \N \N +alter modify column 1 +7 None +8 String +0 0 \N \N \N \N \N +1 1 \N \N \N \N \N +2 2 \N \N \N \N \N +3 3 3 3 \N \N \N +4 4 4 4 \N \N \N +5 5 5 5 \N \N \N +6 6 str_6 str_6 \N \N \N +7 7 str_7 str_7 \N \N \N +8 8 str_8 str_8 \N \N \N +9 9 \N \N \N \N \N +10 10 \N \N \N \N \N +11 11 \N \N \N \N \N +12 12 12 12 \N \N \N +13 13 str_13 str_13 \N \N \N +14 14 \N \N \N \N \N +insert after alter modify column 1 +8 None +11 String +0 0 \N \N \N \N \N +1 1 \N \N \N \N \N +2 2 \N \N \N \N \N +3 3 3 3 \N \N \N +4 4 4 4 \N \N \N +5 5 5 5 \N \N \N +6 6 str_6 str_6 \N \N \N +7 7 str_7 str_7 \N \N \N +8 8 str_8 str_8 \N \N \N +9 9 \N \N \N \N \N +10 10 \N \N \N \N \N +11 11 \N \N \N \N \N +12 12 12 12 \N \N \N +13 13 str_13 str_13 \N \N \N +14 14 \N \N \N \N \N +15 15 \N \N \N \N \N +16 16 16 16 \N \N \N +17 17 str_17 str_17 \N \N \N +18 18 1970-01-19 1970-01-19 \N \N \N +alter modify column 2 +8 None +11 String +0 0 \N \N \N \N \N +1 1 \N \N \N \N \N +2 2 \N \N \N \N \N +3 3 3 3 \N \N \N +4 4 4 4 \N \N \N +5 5 5 5 \N \N \N +6 6 str_6 str_6 \N \N \N +7 7 str_7 str_7 \N \N \N +8 8 str_8 str_8 \N \N \N +9 9 \N \N \N \N \N +10 10 \N \N \N \N \N +11 11 \N \N \N \N \N +12 12 12 12 \N \N \N +13 13 str_13 str_13 \N \N \N +14 14 \N \N \N \N \N +15 15 \N \N \N \N \N +16 16 16 16 \N \N \N +17 17 str_17 str_17 \N \N \N +18 18 1970-01-19 1970-01-19 \N \N \N +insert after alter modify column 2 +1 Date +1 UInt64 +9 None +12 String +0 0 \N \N \N \N \N +1 1 \N \N \N \N \N +2 2 \N \N \N \N \N +3 3 3 3 \N \N \N +4 4 4 4 \N \N \N +5 5 5 5 \N \N \N +6 6 str_6 str_6 \N \N \N +7 7 str_7 str_7 \N \N \N +8 8 str_8 str_8 \N \N \N +9 9 \N \N \N \N \N +10 10 \N \N \N \N \N +11 11 \N \N \N \N \N +12 12 12 12 \N \N \N +13 13 str_13 str_13 \N \N \N +14 14 \N \N \N \N \N +15 15 \N \N \N \N \N +16 16 16 16 \N \N \N +17 17 str_17 str_17 \N \N \N +18 18 1970-01-19 1970-01-19 \N \N \N +19 19 \N \N \N \N \N +20 20 20 \N 20 \N \N +21 21 str_21 str_21 \N \N \N +22 22 1970-01-23 \N \N 1970-01-23 \N +alter modify column 3 +1 Date +1 UInt64 +9 None +12 String +0 0 0 \N \N \N \N \N \N +1 1 1 \N \N \N \N \N \N +2 2 2 \N \N \N \N \N \N +3 3 3 \N \N 3 \N \N \N +4 4 4 \N \N 4 \N \N \N +5 5 5 \N \N 5 \N \N \N +6 6 6 \N \N str_6 \N \N \N +7 7 7 \N \N str_7 \N \N \N +8 8 8 \N \N str_8 \N \N \N +9 9 9 \N \N \N \N \N \N +10 10 10 \N \N \N \N \N \N +11 11 11 \N \N \N \N \N \N +12 12 12 \N \N 12 \N \N \N +13 13 13 \N \N str_13 \N \N \N +14 14 14 \N \N \N \N \N \N +15 15 15 \N \N \N \N \N \N +16 16 16 \N \N 16 \N \N \N +17 17 17 \N \N str_17 \N \N \N +18 18 18 \N \N 1970-01-19 \N \N \N +19 19 19 \N \N \N \N \N \N +20 20 20 \N \N \N 20 \N \N +21 21 21 \N \N str_21 \N \N \N +22 22 22 \N \N \N \N 1970-01-23 \N +insert after alter modify column 3 +1 Date +1 UInt64 +12 None +12 String +0 0 0 \N \N \N \N \N \N +1 1 1 \N \N \N \N \N \N +2 2 2 \N \N \N \N \N \N +3 3 3 \N \N 3 \N \N \N +4 4 4 \N \N 4 \N \N \N +5 5 5 \N \N 5 \N \N \N +6 6 6 \N \N str_6 \N \N \N +7 7 7 \N \N str_7 \N \N \N +8 8 8 \N \N str_8 \N \N \N +9 9 9 \N \N \N \N \N \N +10 10 10 \N \N \N \N \N \N +11 11 11 \N \N \N \N \N \N +12 12 12 \N \N 12 \N \N \N +13 
13 13 \N \N str_13 \N \N \N +14 14 14 \N \N \N \N \N \N +15 15 15 \N \N \N \N \N \N +16 16 16 \N \N 16 \N \N \N +17 17 17 \N \N str_17 \N \N \N +18 18 18 \N \N 1970-01-19 \N \N \N +19 19 19 \N \N \N \N \N \N +20 20 20 \N \N \N 20 \N \N +21 21 21 \N \N str_21 \N \N \N +22 22 22 \N \N \N \N 1970-01-23 \N +23 \N \N \N \N \N \N \N \N +24 24 24 \N \N \N \N \N \N +25 str_25 \N str_25 \N \N \N \N \N diff --git a/tests/queries/0_stateless/03040_dynamic_type_alters.sh b/tests/queries/0_stateless/03040_dynamic_type_alters.sh new file mode 100755 index 00000000000..a20a92712e0 --- /dev/null +++ b/tests/queries/0_stateless/03040_dynamic_type_alters.sh @@ -0,0 +1,76 @@ +#!/usr/bin/env bash +# Tags: long + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# reset --log_comment +CLICKHOUSE_LOG_COMMENT= +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_merge_tree_settings --allow_experimental_dynamic_type=1 --allow_experimental_variant_type=1 --use_variant_as_common_type=1 --stacktrace --max_insert_threads 3 --group_by_two_level_threshold 1000000 --group_by_two_level_threshold_bytes 42526602 --distributed_aggregation_memory_efficient 1 --fsync_metadata 1 --output_format_parallel_formatting 0 --input_format_parallel_parsing 0 --min_chunk_bytes_for_parallel_parsing 8125230 --max_read_buffer_size 859505 --prefer_localhost_replica 1 --max_block_size 34577 --max_threads 41 --optimize_append_index 0 --optimize_if_chain_to_multiif 1 --optimize_if_transform_strings_to_enum 1 --optimize_read_in_order 1 --optimize_or_like_chain 0 --optimize_substitute_columns 1 --enable_multiple_prewhere_read_steps 1 --read_in_order_two_level_merge_threshold 99 --optimize_aggregation_in_order 1 --aggregation_in_order_max_block_bytes 27635208 --use_uncompressed_cache 0 --min_bytes_to_use_direct_io 10737418240 --min_bytes_to_use_mmap_io 6451111320 --local_filesystem_read_method pread --remote_filesystem_read_method read --local_filesystem_read_prefetch 1 --filesystem_cache_segments_batch_size 50 --read_from_filesystem_cache_if_exists_otherwise_bypass_cache 0 --throw_on_error_from_cache_on_write_operations 0 --remote_filesystem_read_prefetch 1 --allow_prefetched_read_pool_for_remote_filesystem 0 --filesystem_prefetch_max_memory_usage 64Mi --filesystem_prefetches_limit 10 --filesystem_prefetch_min_bytes_for_single_read_task 16Mi --filesystem_prefetch_step_marks 0 --filesystem_prefetch_step_bytes 100Mi --compile_aggregate_expressions 0 --compile_sort_description 1 --merge_tree_coarse_index_granularity 32 --optimize_distinct_in_order 0 --max_bytes_before_external_sort 10737418240 --max_bytes_before_external_group_by 10737418240 --max_bytes_before_remerge_sort 1374192967 --min_compress_block_size 2152247 --max_compress_block_size 1830907 --merge_tree_compact_parts_min_granules_to_multibuffer_read 79 --optimize_sorting_by_input_stream_properties 1 --http_response_buffer_size 106072 --http_wait_end_of_query True --enable_memory_bound_merging_of_aggregation_results 0 --min_count_to_compile_expression 0 --min_count_to_compile_aggregate_expression 3 --min_count_to_compile_sort_description 3 --session_timezone Africa/Khartoum --prefer_warmed_unmerged_parts_seconds 4 --use_page_cache_for_disks_without_file_cache False --page_cache_inject_eviction True --merge_tree_read_split_ranges_into_intersecting_and_non_intersecting_injection_probability 0.03 --ratio_of_defaults_for_sparse_serialization 0.9779014012142565 --prefer_fetch_merged_part_size_threshold 4254002758 
--vertical_merge_algorithm_min_rows_to_activate 1 --vertical_merge_algorithm_min_columns_to_activate 1 --allow_vertical_merges_from_compact_to_wide_parts 1 --min_merge_bytes_to_use_direct_io 1 --index_granularity_bytes 4982992 --merge_max_block_size 16662 --index_granularity 22872 --min_bytes_for_wide_part 1073741824 --compress_marks 0 --compress_primary_key 0 --marks_compress_block_size 86328 --primary_key_compress_block_size 64101 --replace_long_file_name_to_hash 0 --max_file_name_length 81 --min_bytes_for_full_part_storage 536870912 --compact_parts_max_bytes_to_buffer 480908080 --compact_parts_max_granules_to_buffer 1 --compact_parts_merge_max_bytes_to_prefetch_part 4535313 --cache_populated_by_fetch 0" + +function run() +{ + echo "initial insert" + $CH_CLIENT -q "insert into test select number, number from numbers(3)" + + echo "alter add column 1" + $CH_CLIENT -q "alter table test add column d Dynamic(max_types=3) settings mutations_sync=1" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -q "select x, y, d, d.String, d.UInt64, d.\`Tuple(a UInt64)\`.a from test order by x" + + echo "insert after alter add column 1" + $CH_CLIENT -q "insert into test select number, number, number from numbers(3, 3)" + $CH_CLIENT -q "insert into test select number, number, 'str_' || toString(number) from numbers(6, 3)" + $CH_CLIENT -q "insert into test select number, number, NULL from numbers(9, 3)" + $CH_CLIENT -q "insert into test select number, number, multiIf(number % 3 == 0, number, number % 3 == 1, 'str_' || toString(number), NULL) from numbers(12, 3)" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -q "select x, y, d, d.String, d.UInt64, d.Date, d.\`Tuple(a UInt64)\`.a from test order by x" + + echo "alter modify column 1" + $CH_CLIENT -q "alter table test modify column d Dynamic(max_types=1) settings mutations_sync=1" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -q "select x, y, d, d.String, d.UInt64, d.Date, d.\`Tuple(a UInt64)\`.a from test order by x" + + echo "insert after alter modify column 1" + $CH_CLIENT -q "insert into test select number, number, multiIf(number % 4 == 0, number, number % 4 == 1, 'str_' || toString(number), number % 4 == 2, toDate(number), NULL) from numbers(15, 4)" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -q "select x, y, d, d.String, d.UInt64, d.Date, d.\`Tuple(a UInt64)\`.a from test order by x" + + echo "alter modify column 2" + $CH_CLIENT -q "alter table test modify column d Dynamic(max_types=3) settings mutations_sync=1" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -q "select x, y, d, d.String, d.UInt64, d.Date, d.\`Tuple(a UInt64)\`.a from test order by x" + + echo "insert after alter modify column 2" + $CH_CLIENT -q "insert into test select number, number, multiIf(number % 4 == 0, number, number % 4 == 1, 'str_' || toString(number), number % 4 == 2, toDate(number), NULL) from numbers(19, 4)" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -q "select x, y, d, d.String, d.UInt64, d.Date, d.\`Tuple(a UInt64)\`.a from test order by x" + + echo "alter modify column 3" + $CH_CLIENT 
-q "alter table test modify column y Dynamic settings mutations_sync=1" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -q "select x, y, y.UInt64, y.String, y.\`Tuple(a UInt64)\`.a, d.String, d.UInt64, d.Date, d.\`Tuple(a UInt64)\`.a from test order by x" + + echo "insert after alter modify column 3" + $CH_CLIENT -q "insert into test select number, multiIf(number % 3 == 0, number, number % 3 == 1, 'str_' || toString(number), NULL), NULL from numbers(23, 3)" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -q "select x, y, y.UInt64, y.String, y.\`Tuple(a UInt64)\`.a, d.String, d.UInt64, d.Date, d.\`Tuple(a UInt64)\`.a from test order by x" +} + +$CH_CLIENT -q "drop table if exists test;" + +echo "Memory" +$CH_CLIENT -q "create table test (x UInt64, y UInt64) engine=Memory" +run +$CH_CLIENT -q "drop table test;" + +echo "MergeTree compact" +$CH_CLIENT -q "create table test (x UInt64, y UInt64) engine=MergeTree order by x settings min_rows_for_wide_part=100000000, min_bytes_for_wide_part=1000000000;" +run +$CH_CLIENT -q "drop table test;" + +echo "MergeTree wide" +$CH_CLIENT -q "create table test (x UInt64, y UInt64 ) engine=MergeTree order by x settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1;" +run +$CH_CLIENT -q "drop table test;" From 18e4c0f1da79fc458707c5557b9e611a1fe916bd Mon Sep 17 00:00:00 2001 From: kssenii Date: Fri, 26 Apr 2024 13:35:18 +0200 Subject: [PATCH 083/392] Fix remaining integration test --- src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp | 4 ++-- src/IO/S3/getObjectInfo.cpp | 2 +- .../ObjectStorage/HDFS/ReadBufferFromHDFS.cpp | 1 - .../ObjectStorage/ReadBufferIterator.cpp | 4 ++-- .../ObjectStorage/StorageObjectStorageSource.cpp | 16 +++++++++++----- .../ObjectStorage/StorageObjectStorageSource.h | 7 ++----- 6 files changed, 18 insertions(+), 16 deletions(-) diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index a2522212f90..507e9dbafcb 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -447,7 +447,7 @@ std::optional S3ObjectStorage::tryGetObjectMetadata(const std::s ObjectMetadata result; result.size_bytes = object_info.size; - result.last_modified = object_info.last_modification_time; + result.last_modified = Poco::Timestamp::fromEpochTime(object_info.last_modification_time); result.attributes = object_info.metadata; return result; @@ -462,7 +462,7 @@ ObjectMetadata S3ObjectStorage::getObjectMetadata(const std::string & path) cons ObjectMetadata result; result.size_bytes = object_info.size; - result.last_modified = object_info.last_modification_time; + result.last_modified = Poco::Timestamp::fromEpochTime(object_info.last_modification_time); result.attributes = object_info.metadata; return result; diff --git a/src/IO/S3/getObjectInfo.cpp b/src/IO/S3/getObjectInfo.cpp index 88f79f8d8d5..c294e7905bd 100644 --- a/src/IO/S3/getObjectInfo.cpp +++ b/src/IO/S3/getObjectInfo.cpp @@ -53,7 +53,7 @@ namespace const auto & result = outcome.GetResult(); ObjectInfo object_info; object_info.size = static_cast(result.GetContentLength()); - object_info.last_modification_time = result.GetLastModified().Millis() / 1000; + object_info.last_modification_time = result.GetLastModified().Seconds(); if (with_metadata) object_info.metadata = result.GetMetadata(); diff --git 
a/src/Storages/ObjectStorage/HDFS/ReadBufferFromHDFS.cpp b/src/Storages/ObjectStorage/HDFS/ReadBufferFromHDFS.cpp index eeb553e0d62..b37b9de746b 100644 --- a/src/Storages/ObjectStorage/HDFS/ReadBufferFromHDFS.cpp +++ b/src/Storages/ObjectStorage/HDFS/ReadBufferFromHDFS.cpp @@ -116,7 +116,6 @@ struct ReadBufferFromHDFS::ReadBufferFromHDFSImpl : public BufferWithOwnMemory= file_size) // { - // LOG_TEST(log, "KSSENII 1 2"); // return false; // } diff --git a/src/Storages/ObjectStorage/ReadBufferIterator.cpp b/src/Storages/ObjectStorage/ReadBufferIterator.cpp index f8ce90a2b1f..9c1d3f79c2b 100644 --- a/src/Storages/ObjectStorage/ReadBufferIterator.cpp +++ b/src/Storages/ObjectStorage/ReadBufferIterator.cpp @@ -67,11 +67,11 @@ std::optional ReadBufferIterator::tryGetColumnsFromCache( auto get_last_mod_time = [&] -> std::optional { if (object_info->metadata) - return object_info->metadata->last_modified.epochMicroseconds(); + return object_info->metadata->last_modified.epochTime(); else { object_info->metadata = object_storage->getObjectMetadata(object_info->relative_path); - return object_info->metadata->last_modified.epochMicroseconds(); + return object_info->metadata->last_modified.epochTime(); } }; diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index 3101a7ebf51..4551c2df7c3 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -76,6 +76,11 @@ StorageObjectStorageSource::~StorageObjectStorageSource() create_reader_pool->wait(); } +void StorageObjectStorageSource::setKeyCondition(const ActionsDAGPtr & filter_actions_dag, ContextPtr context_) +{ + setKeyConditionImpl(filter_actions_dag, context_, read_from_format_info.format_header); +} + std::shared_ptr StorageObjectStorageSource::createFileIterator( ConfigurationPtr configuration, ObjectStoragePtr object_storage, @@ -213,9 +218,11 @@ std::optional StorageObjectStorageSource::tryGetNumRowsFromCache(const O auto get_last_mod_time = [&]() -> std::optional { - return object_info->metadata - ? object_info->metadata->last_modified.epochMicroseconds() - : 0; + if (object_info->metadata) + { + return object_info->metadata->last_modified.epochTime(); + } + return std::nullopt; }; return schema_cache.tryGetNumRows(cache_key, get_last_mod_time); } @@ -260,7 +267,6 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade const auto max_parsing_threads = need_only_count ? 
std::optional(1) : std::nullopt; read_buf = createReadBuffer(object_info->relative_path, object_info->metadata->size_bytes); - LOG_TEST(&Poco::Logger::get("KSSENII"), "KSSENII HEADER: {}", read_from_format_info.format_header.dumpStructure()); auto input_format = FormatFactory::instance().getInput( configuration->format, *read_buf, read_from_format_info.format_header, getContext(), max_block_size, format_settings, max_parsing_threads, @@ -354,7 +360,7 @@ ObjectInfoPtr StorageObjectStorageSource::IIterator::next(size_t processor) if (object_info) { - LOG_TEST(&Poco::Logger::get("KeysIterator"), "Next key: {}", object_info->relative_path); + LOG_TEST(logger, "Next key: {}", object_info->relative_path); } return object_info; diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.h b/src/Storages/ObjectStorage/StorageObjectStorageSource.h index 3c2cc3f80cd..0afbf77db2b 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.h @@ -38,10 +38,7 @@ public: String getName() const override { return name; } - void setKeyCondition(const ActionsDAGPtr & filter_actions_dag, ContextPtr context_) override - { - setKeyConditionImpl(filter_actions_dag, context_, read_from_format_info.format_header); - } + void setKeyCondition(const ActionsDAGPtr & filter_actions_dag, ContextPtr context_) override; Chunk generate() override; @@ -65,11 +62,11 @@ protected: const bool need_only_count; const ReadFromFormatInfo read_from_format_info; const std::shared_ptr create_reader_pool; + ColumnsDescription columns_desc; std::shared_ptr file_iterator; SchemaCache & schema_cache; bool initialized = false; - size_t total_rows_in_file = 0; LoggerPtr log = getLogger("StorageObjectStorageSource"); From a4ed164074fcd96fc198000722563da70f6a31bf Mon Sep 17 00:00:00 2001 From: kssenii Date: Fri, 26 Apr 2024 13:38:38 +0200 Subject: [PATCH 084/392] Fix clang tidy --- src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.cpp | 2 +- src/Storages/ObjectStorage/StorageObjectStorage.cpp | 2 +- src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.cpp index c6590ba8d43..571e14325bb 100644 --- a/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.cpp +++ b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.cpp @@ -136,7 +136,7 @@ struct DeltaLakeMetadata::Impl * \"nullCount\":{\"col-6c990940-59bb-4709-8f2e-17083a82c01a\":0,\"col-763cd7e2-7627-4d8e-9fb7-9e85d0c8845b\":0}}"}} * " */ - void processMetadataFile(const String & key, std::set & result) + void processMetadataFile(const String & key, std::set & result) const { auto read_settings = context->getReadSettings(); auto buf = object_storage->readObject(StoredObject(key), read_settings); diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index f5bfb9d2a65..c5565d8b0e8 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -61,7 +61,7 @@ StorageObjectStorage::StorageObjectStorage( objects.emplace_back(key); setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(metadata.getColumns())); - setInMemoryMetadata(std::move(metadata)); + setInMemoryMetadata(metadata); } String StorageObjectStorage::getName() const diff --git 
a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp index f98fc32a3cc..1a1df399626 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp @@ -47,7 +47,7 @@ StorageObjectStorageCluster::StorageObjectStorageCluster( metadata.setConstraints(constraints_); setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(metadata.getColumns())); - setInMemoryMetadata(std::move(metadata)); + setInMemoryMetadata(metadata); } std::string StorageObjectStorageCluster::getName() const From 434d2d16f1056977dd80f47d0b687151ac9d16f2 Mon Sep 17 00:00:00 2001 From: kssenii Date: Fri, 26 Apr 2024 16:34:12 +0200 Subject: [PATCH 085/392] Cleanuo --- src/Backups/BackupIO_AzureBlobStorage.cpp | 4 +- src/Backups/BackupIO_AzureBlobStorage.h | 10 +- .../registerBackupEngineAzureBlobStorage.cpp | 4 +- src/CMakeLists.txt | 4 +- src/Core/Settings.h | 4 + src/Core/SettingsChangesHistory.h | 4 + .../ObjectStorages/HDFS/HDFSObjectStorage.cpp | 78 +++---- .../ObjectStorages/HDFS/HDFSObjectStorage.h | 17 +- .../ObjectStorages/ObjectStorageFactory.cpp | 3 +- .../ObjectStorages/S3/S3ObjectStorage.cpp | 18 -- src/Disks/ObjectStorages/S3/diskSettings.cpp | 10 +- src/Interpreters/InterpreterSystemQuery.cpp | 4 +- .../{AzureBlob => Azure}/Configuration.cpp | 33 +-- .../{AzureBlob => Azure}/Configuration.h | 16 +- .../ObjectStorage/DataLakes/Common.cpp | 4 +- src/Storages/ObjectStorage/DataLakes/Common.h | 4 +- .../DataLakes/DeltaLakeMetadata.cpp | 12 +- .../DataLakes/DeltaLakeMetadata.h | 5 +- .../ObjectStorage/DataLakes/HudiMetadata.h | 4 +- .../DataLakes/IStorageDataLake.h | 2 +- .../DataLakes/IcebergMetadata.cpp | 6 +- .../ObjectStorage/DataLakes/IcebergMetadata.h | 4 +- .../DataLakes/registerDataLakeStorages.cpp | 6 +- .../ObjectStorage/HDFS/Configuration.cpp | 32 +-- .../ObjectStorage/HDFS/Configuration.h | 12 +- .../ObjectStorage/HDFS/ReadBufferFromHDFS.cpp | 8 +- .../ObjectStorage/ReadBufferIterator.cpp | 53 ++--- .../ObjectStorage/ReadBufferIterator.h | 8 +- .../ReadFromObjectStorageStep.cpp | 87 ------- .../ObjectStorage/ReadFromObjectStorageStep.h | 55 ----- .../ObjectStorage/S3/Configuration.cpp | 21 +- src/Storages/ObjectStorage/S3/Configuration.h | 11 +- .../ObjectStorage/StorageObjectStorage.cpp | 213 ++++++++++++++++-- .../ObjectStorage/StorageObjectStorage.h | 62 ++++- .../StorageObjectStorageCluster.cpp | 20 +- .../StorageObjectStorageCluster.h | 15 +- .../StorageObjectStorageConfiguration.cpp | 74 ------ .../StorageObjectStorageConfiguration.h | 75 ------ .../StorageObjectStorageSink.cpp | 7 +- .../ObjectStorage/StorageObjectStorageSink.h | 16 +- .../StorageObjectStorageSource.cpp | 23 +- .../StorageObjectStorageSource.h | 7 +- .../StorageObjectStorage_fwd_internal.h | 12 - src/Storages/ObjectStorage/Utils.cpp | 7 +- src/Storages/ObjectStorage/Utils.h | 6 +- .../registerStorageObjectStorage.cpp | 22 +- src/Storages/S3Queue/S3QueueTableMetadata.cpp | 3 +- src/Storages/S3Queue/S3QueueTableMetadata.h | 4 +- src/Storages/S3Queue/StorageS3Queue.cpp | 2 +- .../StorageSystemSchemaInferenceCache.cpp | 4 +- src/TableFunctions/ITableFunctionDataLake.h | 2 +- .../TableFunctionObjectStorage.cpp | 73 ++---- .../TableFunctionObjectStorage.h | 33 ++- .../TableFunctionObjectStorageCluster.cpp | 4 +- .../TableFunctionObjectStorageCluster.h | 4 +- src/TableFunctions/registerTableFunctions.cpp | 12 - .../configs/inf_s3_retries.xml | 1 + .../configs/s3_retries.xml | 1 + 58 files 
changed, 555 insertions(+), 690 deletions(-) rename src/Storages/ObjectStorage/{AzureBlob => Azure}/Configuration.cpp (93%) rename src/Storages/ObjectStorage/{AzureBlob => Azure}/Configuration.h (78%) delete mode 100644 src/Storages/ObjectStorage/ReadFromObjectStorageStep.cpp delete mode 100644 src/Storages/ObjectStorage/ReadFromObjectStorageStep.h delete mode 100644 src/Storages/ObjectStorage/StorageObjectStorageConfiguration.cpp delete mode 100644 src/Storages/ObjectStorage/StorageObjectStorageConfiguration.h delete mode 100644 src/Storages/ObjectStorage/StorageObjectStorage_fwd_internal.h diff --git a/src/Backups/BackupIO_AzureBlobStorage.cpp b/src/Backups/BackupIO_AzureBlobStorage.cpp index 673930b5976..f00da686c18 100644 --- a/src/Backups/BackupIO_AzureBlobStorage.cpp +++ b/src/Backups/BackupIO_AzureBlobStorage.cpp @@ -28,7 +28,7 @@ namespace ErrorCodes } BackupReaderAzureBlobStorage::BackupReaderAzureBlobStorage( - const StorageAzureBlobConfiguration & configuration_, + const StorageAzureConfiguration & configuration_, const ReadSettings & read_settings_, const WriteSettings & write_settings_, const ContextPtr & context_) @@ -112,7 +112,7 @@ void BackupReaderAzureBlobStorage::copyFileToDisk(const String & path_in_backup, BackupWriterAzureBlobStorage::BackupWriterAzureBlobStorage( - const StorageAzureBlobConfiguration & configuration_, + const StorageAzureConfiguration & configuration_, const ReadSettings & read_settings_, const WriteSettings & write_settings_, const ContextPtr & context_, diff --git a/src/Backups/BackupIO_AzureBlobStorage.h b/src/Backups/BackupIO_AzureBlobStorage.h index 25c52f9b0d3..4643c103fd5 100644 --- a/src/Backups/BackupIO_AzureBlobStorage.h +++ b/src/Backups/BackupIO_AzureBlobStorage.h @@ -6,7 +6,7 @@ #include #include #include -#include +#include namespace DB @@ -17,7 +17,7 @@ class BackupReaderAzureBlobStorage : public BackupReaderDefault { public: BackupReaderAzureBlobStorage( - const StorageAzureBlobConfiguration & configuration_, + const StorageAzureConfiguration & configuration_, const ReadSettings & read_settings_, const WriteSettings & write_settings_, const ContextPtr & context_); @@ -39,7 +39,7 @@ public: private: const DataSourceDescription data_source_description; std::shared_ptr client; - StorageAzureBlobConfiguration configuration; + StorageAzureConfiguration configuration; std::unique_ptr object_storage; std::shared_ptr settings; }; @@ -48,7 +48,7 @@ class BackupWriterAzureBlobStorage : public BackupWriterDefault { public: BackupWriterAzureBlobStorage( - const StorageAzureBlobConfiguration & configuration_, + const StorageAzureConfiguration & configuration_, const ReadSettings & read_settings_, const WriteSettings & write_settings_, const ContextPtr & context_, @@ -85,7 +85,7 @@ private: const DataSourceDescription data_source_description; std::shared_ptr client; - StorageAzureBlobConfiguration configuration; + StorageAzureConfiguration configuration; std::unique_ptr object_storage; std::shared_ptr settings; }; diff --git a/src/Backups/registerBackupEngineAzureBlobStorage.cpp b/src/Backups/registerBackupEngineAzureBlobStorage.cpp index 049a4b1a338..1e3b3759257 100644 --- a/src/Backups/registerBackupEngineAzureBlobStorage.cpp +++ b/src/Backups/registerBackupEngineAzureBlobStorage.cpp @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #endif @@ -49,7 +49,7 @@ void registerBackupEngineAzureBlobStorage(BackupFactory & factory) const String & id_arg = params.backup_info.id_arg; const auto & args = params.backup_info.args; - 
StorageAzureBlobConfiguration configuration; + StorageAzureConfiguration configuration; if (!id_arg.empty()) { diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c26c40d4b87..d5d17f992dc 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -103,7 +103,6 @@ add_library(clickhouse_compression ${clickhouse_compression_headers} ${clickhous add_headers_and_sources(dbms Disks/IO) add_headers_and_sources(dbms Disks/ObjectStorages) -add_headers_and_sources(dbms Disks/ObjectStorages) if (TARGET ch_contrib::sqlite) add_headers_and_sources(dbms Databases/SQLite) endif() @@ -117,7 +116,7 @@ if (TARGET ch_contrib::nats_io) endif() add_headers_and_sources(dbms Storages/ObjectStorage) -add_headers_and_sources(dbms Storages/ObjectStorage/AzureBlob) +add_headers_and_sources(dbms Storages/ObjectStorage/Azure) add_headers_and_sources(dbms Storages/ObjectStorage/S3) add_headers_and_sources(dbms Storages/ObjectStorage/HDFS) add_headers_and_sources(dbms Storages/ObjectStorage/DataLakes) @@ -148,7 +147,6 @@ if (TARGET ch_contrib::azure_sdk) endif() if (TARGET ch_contrib::hdfs) - add_headers_and_sources(dbms Storages/ObjectStorage/HDFS) add_headers_and_sources(dbms Disks/ObjectStorages/HDFS) endif() diff --git a/src/Core/Settings.h b/src/Core/Settings.h index ff7a9089327..bf558d7b1ba 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -113,9 +113,12 @@ class IColumn; M(Bool, s3_check_objects_after_upload, false, "Check each uploaded object to s3 with head request to be sure that upload was successful", 0) \ M(Bool, s3_allow_parallel_part_upload, true, "Use multiple threads for s3 multipart upload. It may lead to slightly higher memory usage", 0) \ M(Bool, s3_throw_on_zero_files_match, false, "Throw an error, when ListObjects request cannot match any files", 0) \ + M(Bool, hdfs_throw_on_zero_files_match, false, "Throw an error, when ListObjects request cannot match any files", 0) \ + M(Bool, azure_throw_on_zero_files_match, false, "Throw an error, when ListObjects request cannot match any files", 0) \ M(Bool, s3_ignore_file_doesnt_exist, false, "Ignore if files does not exits and return 0 zeros for StorageS3", 0) \ M(Bool, hdfs_ignore_file_doesnt_exist, false, "Ignore if files does not exits and return 0 zeros for StorageHDFS", 0) \ M(Bool, azure_ignore_file_doesnt_exist, false, "Ignore if files does not exits and return 0 zeros for StorageAzure", 0) \ + M(Bool, s3_validate_request_settings, true, "Validate S3 request settings", 0) \ M(Bool, s3_disable_checksum, false, "Do not calculate a checksum when sending a file to S3. This speeds up writes by avoiding excessive processing passes on a file. It is mostly safe as the data of MergeTree tables is checksummed by ClickHouse anyway, and when S3 is accessed with HTTPS, the TLS layer already provides integrity while transferring through the network. While additional checksums on S3 give defense in depth.", 0) \ M(UInt64, s3_retry_attempts, 100, "Setting for Aws::Client::RetryStrategy, Aws::Client does retries itself, 0 means no retries", 0) \ M(UInt64, s3_request_timeout_ms, 30000, "Idleness timeout for sending and receiving data to/from S3. 
Fail if a single TCP read or write call blocks for this long.", 0) \ @@ -128,6 +131,7 @@ class IColumn; M(Bool, hdfs_truncate_on_insert, false, "Enables or disables truncate before insert in s3 engine tables", 0) \ M(Bool, hdfs_create_new_file_on_insert, false, "Enables or disables creating a new file on each insert in hdfs engine tables", 0) \ M(Bool, hdfs_skip_empty_files, false, "Allow to skip empty files in hdfs table engine", 0) \ + M(Bool, azure_skip_empty_files, false, "Allow to skip empty files in azure table engine", 0) \ M(UInt64, hsts_max_age, 0, "Expired time for hsts. 0 means disable HSTS.", 0) \ M(Bool, extremes, false, "Calculate minimums and maximums of the result columns. They can be output in JSON-formats.", IMPORTANT) \ M(Bool, use_uncompressed_cache, false, "Whether to use the cache of uncompressed blocks.", 0) \ diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index cfe3c290d83..4954fa5d996 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -90,6 +90,10 @@ static std::map sett {"lightweight_deletes_sync", 2, 2, "The same as 'mutation_sync', but controls only execution of lightweight deletes"}, {"query_cache_system_table_handling", "save", "throw", "The query cache no longer caches results of queries against system tables"}, {"input_format_hive_text_allow_variable_number_of_columns", false, true, "Ignore extra columns in Hive Text input (if file has more columns than expected) and treat missing fields in Hive Text input as default values."}, + {"hdfs_throw_on_zero_files_match", false, false, "Throw an error, when ListObjects request cannot match any files"}, + {"azure_throw_on_zero_files_match", false, false, "Throw an error, when ListObjects request cannot match any files"}, + {"s3_validate_request_settings", true, true, "Validate S3 request settings"}, + {"azure_skip_empty_files", false, false, "Allow to skip empty files in azure table engine"}, }}, {"24.3", {{"s3_connect_timeout_ms", 1000, 1000, "Introduce new dedicated setting for s3 connection timeout"}, {"allow_experimental_shared_merge_tree", false, true, "The setting is obsolete"}, diff --git a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp index ed63795cb05..6c2f310a7d1 100644 --- a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp +++ b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp @@ -23,15 +23,7 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -void HDFSObjectStorage::shutdown() -{ -} - -void HDFSObjectStorage::startup() -{ -} - -void HDFSObjectStorage::initializeHDFS() const +void HDFSObjectStorage::initializeHDFSFS() const { if (initialized) return; @@ -45,9 +37,25 @@ void HDFSObjectStorage::initializeHDFS() const initialized = true; } +std::string HDFSObjectStorage::extractObjectKeyFromURL(const StoredObject & object) const +{ + /// This is very unfortunate, but for disk HDFS we made a mistake + /// and now its behaviour is inconsistent with S3 and Azure disks. + /// The mistake is that for HDFS we write into metadata files whole URL + data directory + key, + /// while for S3 and Azure we write there only data_directory + key. + /// This leads us into ambiguity that for StorageHDFS we have just key in object.remote_path, + /// but for DiskHDFS we have there URL as well. 
+ auto path = object.remote_path; + if (path.starts_with(url)) + path = path.substr(url.size()); + if (path.starts_with("/")) + path.substr(1); + return path; +} + ObjectStorageKey HDFSObjectStorage::generateObjectKeyForPath(const std::string & /* path */) const { - initializeHDFS(); + initializeHDFSFS(); /// what ever data_source_description.description value is, consider that key as relative key chassert(data_directory.starts_with("/")); return ObjectStorageKey::createAsRelative( @@ -56,14 +64,11 @@ ObjectStorageKey HDFSObjectStorage::generateObjectKeyForPath(const std::string & bool HDFSObjectStorage::exists(const StoredObject & object) const { - initializeHDFS(); + initializeHDFSFS(); std::string path = object.remote_path; if (path.starts_with(url_without_path)) path = path.substr(url_without_path.size()); - // const auto & path = object.remote_path; - // const size_t begin_of_path = path.find('/', path.find("//") + 2); - // const String remote_fs_object_path = path.substr(begin_of_path); return (0 == hdfsExists(hdfs_fs.get(), path.c_str())); } @@ -73,13 +78,8 @@ std::unique_ptr HDFSObjectStorage::readObject( /// NOLIN std::optional, std::optional) const { - initializeHDFS(); - std::string path = object.remote_path; - if (path.starts_with(url)) - path = path.substr(url.size()); - if (path.starts_with("/")) - path.substr(1); - + initializeHDFSFS(); + auto path = extractObjectKeyFromURL(object); return std::make_unique( fs::path(url_without_path) / "", fs::path(data_directory) / path, config, patchSettings(read_settings)); } @@ -90,21 +90,13 @@ std::unique_ptr HDFSObjectStorage::readObjects( /// NOLI std::optional, std::optional) const { - initializeHDFS(); + initializeHDFSFS(); auto disk_read_settings = patchSettings(read_settings); auto read_buffer_creator = [this, disk_read_settings] (bool /* restricted_seek */, const StoredObject & object_) -> std::unique_ptr { - // size_t begin_of_path = path.find('/', path.find("//") + 2); - // auto hdfs_path = path.substr(begin_of_path); - // auto hdfs_uri = path.substr(0, begin_of_path); - - std::string path = object_.remote_path; - if (path.starts_with(url)) - path = path.substr(url.size()); - if (path.starts_with("/")) - path.substr(1); + auto path = extractObjectKeyFromURL(object_); return std::make_unique( fs::path(url_without_path) / "", fs::path(data_directory) / path, config, disk_read_settings, /* read_until_position */0, /* use_external_buffer */true); }; @@ -120,7 +112,7 @@ std::unique_ptr HDFSObjectStorage::writeObject( /// NOL size_t buf_size, const WriteSettings & write_settings) { - initializeHDFS(); + initializeHDFSFS(); if (attributes.has_value()) throw Exception( ErrorCodes::UNSUPPORTED_METHOD, @@ -142,7 +134,7 @@ std::unique_ptr HDFSObjectStorage::writeObject( /// NOL /// Remove file. Throws exception if file doesn't exists or it's a directory. 
void HDFSObjectStorage::removeObject(const StoredObject & object) { - initializeHDFS(); + initializeHDFSFS(); auto path = object.remote_path; if (path.starts_with(url_without_path)) path = path.substr(url_without_path.size()); @@ -156,28 +148,28 @@ void HDFSObjectStorage::removeObject(const StoredObject & object) void HDFSObjectStorage::removeObjects(const StoredObjects & objects) { - initializeHDFS(); + initializeHDFSFS(); for (const auto & object : objects) removeObject(object); } void HDFSObjectStorage::removeObjectIfExists(const StoredObject & object) { - initializeHDFS(); + initializeHDFSFS(); if (exists(object)) removeObject(object); } void HDFSObjectStorage::removeObjectsIfExist(const StoredObjects & objects) { - initializeHDFS(); + initializeHDFSFS(); for (const auto & object : objects) removeObjectIfExists(object); } ObjectMetadata HDFSObjectStorage::getObjectMetadata(const std::string & path) const { - initializeHDFS(); + initializeHDFSFS(); auto * file_info = hdfsGetPathInfo(hdfs_fs.get(), path.data()); if (!file_info) throw Exception(ErrorCodes::HDFS_ERROR, @@ -185,7 +177,7 @@ ObjectMetadata HDFSObjectStorage::getObjectMetadata(const std::string & path) co ObjectMetadata metadata; metadata.size_bytes = static_cast(file_info->mSize); - metadata.last_modified = file_info->mLastMod; + metadata.last_modified = Poco::Timestamp::fromEpochTime(file_info->mLastMod); hdfsFreeFileInfo(file_info, 1); return metadata; @@ -193,9 +185,9 @@ ObjectMetadata HDFSObjectStorage::getObjectMetadata(const std::string & path) co void HDFSObjectStorage::listObjects(const std::string & path, RelativePathsWithMetadata & children, size_t max_keys) const { - initializeHDFS(); + initializeHDFSFS(); auto * log = &Poco::Logger::get("HDFSObjectStorage"); - LOG_TRACE(log, "Trying to list files for {}", path); + LOG_TEST(log, "Trying to list files for {}", path); HDFSFileInfo ls; ls.file_info = hdfsListDirectory(hdfs_fs.get(), path.data(), &ls.length); @@ -213,7 +205,7 @@ void HDFSObjectStorage::listObjects(const std::string & path, RelativePathsWithM throw Exception(ErrorCodes::LOGICAL_ERROR, "file_info shouldn't be null"); } - LOG_TRACE(log, "Listed {} files for {}", ls.length, path); + LOG_TEST(log, "Listed {} files for {}", ls.length, path); for (int i = 0; i < ls.length; ++i) { @@ -228,8 +220,6 @@ void HDFSObjectStorage::listObjects(const std::string & path, RelativePathsWithM } else { - LOG_TEST(log, "Found file: {}", file_path); - children.emplace_back(std::make_shared( String(file_path), ObjectMetadata{ @@ -247,7 +237,7 @@ void HDFSObjectStorage::copyObject( /// NOLINT const WriteSettings & write_settings, std::optional object_to_attributes) { - initializeHDFS(); + initializeHDFSFS(); if (object_to_attributes.has_value()) throw Exception( ErrorCodes::UNSUPPORTED_METHOD, diff --git a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h index b626d3dc779..e747b283400 100644 --- a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h +++ b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h @@ -35,7 +35,8 @@ public: HDFSObjectStorage( const String & hdfs_root_path_, SettingsPtr settings_, - const Poco::Util::AbstractConfiguration & config_) + const Poco::Util::AbstractConfiguration & config_, + bool lazy_initialize) : config(config_) , settings(std::move(settings_)) { @@ -46,6 +47,9 @@ public: data_directory = url.substr(begin_of_path); else data_directory = "/"; + + if (!lazy_initialize) + initializeHDFSFS(); } std::string getName() const override { return 
"HDFSObjectStorage"; } @@ -98,10 +102,6 @@ public: void listObjects(const std::string & path, RelativePathsWithMetadata & children, size_t max_keys) const override; - void shutdown() override; - - void startup() override; - String getObjectsNamespace() const override { return ""; } std::unique_ptr cloneObjectStorage( @@ -114,8 +114,13 @@ public: bool isRemote() const override { return true; } + void startup() override { } + + void shutdown() override { } + private: - void initializeHDFS() const; + void initializeHDFSFS() const; + std::string extractObjectKeyFromURL(const StoredObject & object) const; const Poco::Util::AbstractConfiguration & config; diff --git a/src/Disks/ObjectStorages/ObjectStorageFactory.cpp b/src/Disks/ObjectStorages/ObjectStorageFactory.cpp index 67e38d6389a..1a2ea0c2593 100644 --- a/src/Disks/ObjectStorages/ObjectStorageFactory.cpp +++ b/src/Disks/ObjectStorages/ObjectStorageFactory.cpp @@ -232,7 +232,8 @@ void registerHDFSObjectStorage(ObjectStorageFactory & factory) context->getSettingsRef().hdfs_replication ); - return createObjectStorage(ObjectStorageType::HDFS, config, config_prefix, uri, std::move(settings), config); + return createObjectStorage( + ObjectStorageType::HDFS, config, config_prefix, uri, std::move(settings), config, /* lazy_initialize */false); }); } #endif diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index 507e9dbafcb..0801a84ce13 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -582,27 +582,9 @@ void S3ObjectStorage::applyNewSettings( auto new_client = getClient(config, config_prefix, context, *new_s3_settings, for_disk_s3, &uri); client.set(std::move(new_client)); } - s3_settings.set(std::move(new_s3_settings)); } -// void S3ObjectStorage::applyNewSettings(ContextPtr context) -// { -// auto settings = s3_settings.get(); -// if (!endpoint_settings || !settings->auth_settings.hasUpdates(endpoint_settings->auth_settings)) -// return; -// -// const auto & config = context->getConfigRef(); -// auto new_s3_settings = getSettings(uri, config, "s3.", context); -// -// new_s3_settings->auth_settings.updateFrom(endpoint_settings->auth_settings); -// -// auto new_client = getClient(config, "s3.", context, *new_s3_settings, false); -// -// s3_settings.set(std::move(new_s3_settings)); -// client.set(std::move(new_client)); -// } - std::unique_ptr S3ObjectStorage::cloneObjectStorage( const std::string & new_namespace, const Poco::Util::AbstractConfiguration & config, diff --git a/src/Disks/ObjectStorages/S3/diskSettings.cpp b/src/Disks/ObjectStorages/S3/diskSettings.cpp index 49300a9cd89..a38c0d3c85f 100644 --- a/src/Disks/ObjectStorages/S3/diskSettings.cpp +++ b/src/Disks/ObjectStorages/S3/diskSettings.cpp @@ -100,11 +100,9 @@ std::unique_ptr getClient( settings.request_settings.put_request_throttler, url.uri.getScheme()); - client_configuration.endpointOverride = url.endpoint; - client_configuration.maxConnections = static_cast(request_settings.max_connections); client_configuration.connectTimeoutMs = config.getUInt64(config_prefix + ".connect_timeout_ms", local_settings.s3_connect_timeout_ms.value); client_configuration.requestTimeoutMs = config.getUInt64(config_prefix + ".request_timeout_ms", local_settings.s3_request_timeout_ms.value); - client_configuration.maxConnections = config.getUInt(config_prefix + ".max_connections", S3::DEFAULT_MAX_CONNECTIONS); + client_configuration.maxConnections = config.getUInt(config_prefix 
+ ".max_connections", static_cast(request_settings.max_connections)); client_configuration.http_keep_alive_timeout = config.getUInt(config_prefix + ".http_keep_alive_timeout", S3::DEFAULT_KEEP_ALIVE_TIMEOUT); client_configuration.http_keep_alive_max_requests = config.getUInt(config_prefix + ".http_keep_alive_max_requests", S3::DEFAULT_KEEP_ALIVE_MAX_REQUESTS); @@ -112,12 +110,6 @@ std::unique_ptr getClient( client_configuration.s3_use_adaptive_timeouts = config.getBool( config_prefix + ".use_adaptive_timeouts", client_configuration.s3_use_adaptive_timeouts); - // client_configuration.http_keep_alive_timeout_ms = config.getUInt(config_prefix + ".http_keep_alive_timeout_ms", DEFAULT_HTTP_KEEP_ALIVE_TIMEOUT * 1000); - // client_configuration.http_connection_pool_size = config.getUInt( - // config_prefix + ".http_connection_pool_size", static_cast(global_settings.s3_http_connection_pool_size.value)); - // client_configuration.s3_use_adaptive_timeouts = config.getBool(config_prefix + ".use_adaptive_timeouts", client_configuration.s3_use_adaptive_timeouts); - // client_configuration.wait_on_pool_size_limit = for_disk_s3; - if (for_disk_s3) { /* diff --git a/src/Interpreters/InterpreterSystemQuery.cpp b/src/Interpreters/InterpreterSystemQuery.cpp index af9dc08e8c7..56b2904363e 100644 --- a/src/Interpreters/InterpreterSystemQuery.cpp +++ b/src/Interpreters/InterpreterSystemQuery.cpp @@ -55,7 +55,7 @@ #include #include #include -#include +#include #include #include #include @@ -502,7 +502,7 @@ BlockIO InterpreterSystemQuery::execute() StorageURL::getSchemaCache(getContext()).clear(); #if USE_AZURE_BLOB_STORAGE if (caches_to_drop.contains("AZURE")) - StorageObjectStorage::getSchemaCache(getContext(), StorageAzureBlobConfiguration::type_name).clear(); + StorageObjectStorage::getSchemaCache(getContext(), StorageAzureConfiguration::type_name).clear(); #endif break; } diff --git a/src/Storages/ObjectStorage/AzureBlob/Configuration.cpp b/src/Storages/ObjectStorage/Azure/Configuration.cpp similarity index 93% rename from src/Storages/ObjectStorage/AzureBlob/Configuration.cpp rename to src/Storages/ObjectStorage/Azure/Configuration.cpp index f268b812c03..43992a81eef 100644 --- a/src/Storages/ObjectStorage/AzureBlob/Configuration.cpp +++ b/src/Storages/ObjectStorage/Azure/Configuration.cpp @@ -1,8 +1,9 @@ -#include +#include #if USE_AZURE_BLOB_STORAGE #include +#include #include #include #include @@ -65,7 +66,7 @@ namespace } } -void StorageAzureBlobConfiguration::check(ContextPtr context) const +void StorageAzureConfiguration::check(ContextPtr context) const { Poco::URI url_to_check; if (is_connection_string) @@ -77,11 +78,11 @@ void StorageAzureBlobConfiguration::check(ContextPtr context) const url_to_check = Poco::URI(connection_url); context->getGlobalContext()->getRemoteHostFilter().checkURL(url_to_check); - StorageObjectStorageConfiguration::check(context); + Configuration::check(context); } -StorageAzureBlobConfiguration::StorageAzureBlobConfiguration(const StorageAzureBlobConfiguration & other) - : StorageObjectStorageConfiguration(other) +StorageAzureConfiguration::StorageAzureConfiguration(const StorageAzureConfiguration & other) + : Configuration(other) { connection_url = other.connection_url; is_connection_string = other.is_connection_string; @@ -92,7 +93,7 @@ StorageAzureBlobConfiguration::StorageAzureBlobConfiguration(const StorageAzureB blobs_paths = other.blobs_paths; } -AzureObjectStorage::SettingsPtr StorageAzureBlobConfiguration::createSettings(ContextPtr context) 
+AzureObjectStorage::SettingsPtr StorageAzureConfiguration::createSettings(ContextPtr context) { const auto & context_settings = context->getSettingsRef(); auto settings_ptr = std::make_unique(); @@ -102,7 +103,7 @@ AzureObjectStorage::SettingsPtr StorageAzureBlobConfiguration::createSettings(Co return settings_ptr; } -StorageObjectStorage::QuerySettings StorageAzureBlobConfiguration::getQuerySettings(const ContextPtr & context) const +StorageObjectStorage::QuerySettings StorageAzureConfiguration::getQuerySettings(const ContextPtr & context) const { const auto & settings = context->getSettingsRef(); return StorageObjectStorage::QuerySettings{ @@ -110,14 +111,14 @@ StorageObjectStorage::QuerySettings StorageAzureBlobConfiguration::getQuerySetti .create_new_file_on_insert = settings.azure_create_new_file_on_insert, .schema_inference_use_cache = settings.schema_inference_use_cache_for_azure, .schema_inference_mode = settings.schema_inference_mode, - .skip_empty_files = settings.s3_skip_empty_files, /// TODO: add setting for azure + .skip_empty_files = settings.azure_skip_empty_files, .list_object_keys_size = settings.azure_list_object_keys_size, - .throw_on_zero_files_match = settings.s3_throw_on_zero_files_match, + .throw_on_zero_files_match = settings.azure_throw_on_zero_files_match, .ignore_non_existent_file = settings.azure_ignore_file_doesnt_exist, }; } -ObjectStoragePtr StorageAzureBlobConfiguration::createObjectStorage(ContextPtr context, bool is_readonly) /// NOLINT +ObjectStoragePtr StorageAzureConfiguration::createObjectStorage(ContextPtr context, bool is_readonly) /// NOLINT { assertInitialized(); auto client = createClient(is_readonly, /* attempt_to_create_container */true); @@ -125,7 +126,7 @@ ObjectStoragePtr StorageAzureBlobConfiguration::createObjectStorage(ContextPtr c return std::make_unique("AzureBlobStorage", std::move(client), std::move(settings), container); } -AzureClientPtr StorageAzureBlobConfiguration::createClient(bool is_read_only, bool attempt_to_create_container) +AzureClientPtr StorageAzureConfiguration::createClient(bool is_read_only, bool attempt_to_create_container) { using namespace Azure::Storage::Blobs; @@ -133,8 +134,8 @@ AzureClientPtr StorageAzureBlobConfiguration::createClient(bool is_read_only, bo if (is_connection_string) { - std::shared_ptr managed_identity_credential = std::make_shared(); - std::unique_ptr blob_service_client = std::make_unique(BlobServiceClient::CreateFromConnectionString(connection_url)); + auto managed_identity_credential = std::make_shared(); + auto blob_service_client = std::make_unique(BlobServiceClient::CreateFromConnectionString(connection_url)); result = std::make_unique(BlobContainerClient::CreateFromConnectionString(connection_url, container)); if (attempt_to_create_container) @@ -243,7 +244,7 @@ AzureClientPtr StorageAzureBlobConfiguration::createClient(bool is_read_only, bo return result; } -void StorageAzureBlobConfiguration::fromNamedCollection(const NamedCollection & collection) +void StorageAzureConfiguration::fromNamedCollection(const NamedCollection & collection) { validateNamedCollection(collection, required_configuration_keys, optional_configuration_keys); @@ -275,7 +276,7 @@ void StorageAzureBlobConfiguration::fromNamedCollection(const NamedCollection & blobs_paths = {blob_path}; } -void StorageAzureBlobConfiguration::fromAST(ASTs & engine_args, ContextPtr context, bool with_structure) +void StorageAzureConfiguration::fromAST(ASTs & engine_args, ContextPtr context, bool with_structure) { if (engine_args.size() 
< 3 || engine_args.size() > (with_structure ? 8 : 7)) { @@ -396,7 +397,7 @@ void StorageAzureBlobConfiguration::fromAST(ASTs & engine_args, ContextPtr conte blobs_paths = {blob_path}; } -void StorageAzureBlobConfiguration::addStructureAndFormatToArgs( +void StorageAzureConfiguration::addStructureAndFormatToArgs( ASTs & args, const String & structure_, const String & format_, ContextPtr context) { if (tryGetNamedCollectionWithOverrides(args, context)) diff --git a/src/Storages/ObjectStorage/AzureBlob/Configuration.h b/src/Storages/ObjectStorage/Azure/Configuration.h similarity index 78% rename from src/Storages/ObjectStorage/AzureBlob/Configuration.h rename to src/Storages/ObjectStorage/Azure/Configuration.h index 7e105ea82b5..91a9a0bbbd5 100644 --- a/src/Storages/ObjectStorage/AzureBlob/Configuration.h +++ b/src/Storages/ObjectStorage/Azure/Configuration.h @@ -5,24 +5,27 @@ #if USE_AZURE_BLOB_STORAGE #include -#include +#include +#include namespace DB { class BackupFactory; -class StorageAzureBlobConfiguration : public StorageObjectStorageConfiguration +class StorageAzureConfiguration : public StorageObjectStorage::Configuration { friend class BackupReaderAzureBlobStorage; friend class BackupWriterAzureBlobStorage; friend void registerBackupEngineAzureBlobStorage(BackupFactory & factory); public: + using ConfigurationPtr = StorageObjectStorage::ConfigurationPtr; + static constexpr auto type_name = "azure"; static constexpr auto engine_name = "Azure"; - StorageAzureBlobConfiguration() = default; - StorageAzureBlobConfiguration(const StorageAzureBlobConfiguration & other); + StorageAzureConfiguration() = default; + StorageAzureConfiguration(const StorageAzureConfiguration & other); std::string getTypeName() const override { return type_name; } std::string getEngineName() const override { return engine_name; } @@ -31,16 +34,15 @@ public: void setPath(const Path & path) override { blob_path = path; } const Paths & getPaths() const override { return blobs_paths; } - Paths & getPaths() override { return blobs_paths; } void setPaths(const Paths & paths) override { blobs_paths = paths; } - String getDataSourceDescription() override { return fs::path(connection_url) / container; } + String getDataSourceDescription() override { return std::filesystem::path(connection_url) / container; } String getNamespace() const override { return container; } StorageObjectStorage::QuerySettings getQuerySettings(const ContextPtr &) const override; void check(ContextPtr context) const override; ObjectStoragePtr createObjectStorage(ContextPtr context, bool is_readonly = true) override; /// NOLINT - StorageObjectStorageConfigurationPtr clone() override { return std::make_shared(*this); } + ConfigurationPtr clone() override { return std::make_shared(*this); } void fromNamedCollection(const NamedCollection & collection) override; void fromAST(ASTs & args, ContextPtr context, bool with_structure) override; diff --git a/src/Storages/ObjectStorage/DataLakes/Common.cpp b/src/Storages/ObjectStorage/DataLakes/Common.cpp index 0c9237127b9..4830cc52a90 100644 --- a/src/Storages/ObjectStorage/DataLakes/Common.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Common.cpp @@ -1,6 +1,6 @@ #include "Common.h" #include -#include +#include #include namespace DB @@ -8,7 +8,7 @@ namespace DB std::vector listFiles( const IObjectStorage & object_storage, - const StorageObjectStorageConfiguration & configuration, + const StorageObjectStorage::Configuration & configuration, const String & prefix, const String & suffix) { auto key = 
std::filesystem::path(configuration.getPath()) / prefix; diff --git a/src/Storages/ObjectStorage/DataLakes/Common.h b/src/Storages/ObjectStorage/DataLakes/Common.h index ae3767f2eec..db3afa9e4a6 100644 --- a/src/Storages/ObjectStorage/DataLakes/Common.h +++ b/src/Storages/ObjectStorage/DataLakes/Common.h @@ -1,15 +1,15 @@ #pragma once #include +#include namespace DB { class IObjectStorage; -class StorageObjectStorageConfiguration; std::vector listFiles( const IObjectStorage & object_storage, - const StorageObjectStorageConfiguration & configuration, + const StorageObjectStorage::Configuration & configuration, const String & prefix, const String & suffix); } diff --git a/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.cpp index 571e14325bb..277d07d88ef 100644 --- a/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.cpp +++ b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.cpp @@ -85,7 +85,7 @@ struct DeltaLakeMetadata::Impl while (true) { const auto filename = withPadding(++current_version) + metadata_file_suffix; - const auto file_path = fs::path(configuration->getPath()) / deltalake_metadata_directory / filename; + const auto file_path = std::filesystem::path(configuration->getPath()) / deltalake_metadata_directory / filename; if (!object_storage->exists(StoredObject(file_path))) break; @@ -161,12 +161,12 @@ struct DeltaLakeMetadata::Impl if (json.has("add")) { const auto path = json["add"]["path"].getString(); - result.insert(fs::path(configuration->getPath()) / path); + result.insert(std::filesystem::path(configuration->getPath()) / path); } else if (json.has("remove")) { const auto path = json["remove"]["path"].getString(); - result.erase(fs::path(configuration->getPath()) / path); + result.erase(std::filesystem::path(configuration->getPath()) / path); } } } @@ -186,7 +186,7 @@ struct DeltaLakeMetadata::Impl */ size_t readLastCheckpointIfExists() const { - const auto last_checkpoint_file = fs::path(configuration->getPath()) / deltalake_metadata_directory / "_last_checkpoint"; + const auto last_checkpoint_file = std::filesystem::path(configuration->getPath()) / deltalake_metadata_directory / "_last_checkpoint"; if (!object_storage->exists(StoredObject(last_checkpoint_file))) return 0; @@ -249,7 +249,7 @@ struct DeltaLakeMetadata::Impl return 0; const auto checkpoint_filename = withPadding(version) + ".checkpoint.parquet"; - const auto checkpoint_path = fs::path(configuration->getPath()) / deltalake_metadata_directory / checkpoint_filename; + const auto checkpoint_path = std::filesystem::path(configuration->getPath()) / deltalake_metadata_directory / checkpoint_filename; LOG_TRACE(log, "Using checkpoint file: {}", checkpoint_path.string()); @@ -311,7 +311,7 @@ struct DeltaLakeMetadata::Impl if (filename.empty()) continue; LOG_TEST(log, "Adding {}", filename); - const auto [_, inserted] = result.insert(fs::path(configuration->getPath()) / filename); + const auto [_, inserted] = result.insert(std::filesystem::path(configuration->getPath()) / filename); if (!inserted) throw Exception(ErrorCodes::INCORRECT_DATA, "File already exists {}", filename); } diff --git a/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.h b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.h index 5050b88d809..e527721b29e 100644 --- a/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.h +++ b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.h @@ -2,7 +2,7 @@ #include #include -#include +#include #include #include @@ 
-12,8 +12,7 @@ namespace DB class DeltaLakeMetadata final : public IDataLakeMetadata { public: - using ConfigurationPtr = StorageObjectStorageConfigurationPtr; - + using ConfigurationPtr = StorageObjectStorage::ConfigurationPtr; static constexpr auto name = "DeltaLake"; DeltaLakeMetadata( diff --git a/src/Storages/ObjectStorage/DataLakes/HudiMetadata.h b/src/Storages/ObjectStorage/DataLakes/HudiMetadata.h index 6054c3f15d6..3ab274b1fbf 100644 --- a/src/Storages/ObjectStorage/DataLakes/HudiMetadata.h +++ b/src/Storages/ObjectStorage/DataLakes/HudiMetadata.h @@ -2,7 +2,7 @@ #include #include -#include +#include #include #include #include @@ -13,7 +13,7 @@ namespace DB class HudiMetadata final : public IDataLakeMetadata, private WithContext { public: - using ConfigurationPtr = StorageObjectStorageConfigurationPtr; + using ConfigurationPtr = StorageObjectStorage::ConfigurationPtr; static constexpr auto name = "Hudi"; diff --git a/src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h b/src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h index 144cc16939c..3119b844aaf 100644 --- a/src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h +++ b/src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h @@ -88,7 +88,7 @@ public: else { ConfigurationPtr configuration = base_configuration->clone(); - configuration->getPaths() = metadata->getDataFiles(); + configuration->setPaths(metadata->getDataFiles()); return Storage::resolveSchemaFromData( object_storage_, configuration, format_settings_, local_context); } diff --git a/src/Storages/ObjectStorage/DataLakes/IcebergMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/IcebergMetadata.cpp index 8ee6f002ca6..591e5ef03f6 100644 --- a/src/Storages/ObjectStorage/DataLakes/IcebergMetadata.cpp +++ b/src/Storages/ObjectStorage/DataLakes/IcebergMetadata.cpp @@ -45,7 +45,7 @@ namespace ErrorCodes IcebergMetadata::IcebergMetadata( ObjectStoragePtr object_storage_, - StorageObjectStorageConfigurationPtr configuration_, + ConfigurationPtr configuration_, DB::ContextPtr context_, Int32 metadata_version_, Int32 format_version_, @@ -341,7 +341,7 @@ MutableColumns parseAvro( */ std::pair getMetadataFileAndVersion( ObjectStoragePtr object_storage, - const StorageObjectStorageConfiguration & configuration) + const StorageObjectStorage::Configuration & configuration) { const auto metadata_files = listFiles(*object_storage, configuration, "metadata", ".metadata.json"); if (metadata_files.empty()) @@ -378,7 +378,7 @@ std::pair getMetadataFileAndVersion( DataLakeMetadataPtr IcebergMetadata::create( ObjectStoragePtr object_storage, - StorageObjectStorageConfigurationPtr configuration, + ConfigurationPtr configuration, ContextPtr local_context) { const auto [metadata_version, metadata_file_path] = getMetadataFileAndVersion(object_storage, *configuration); diff --git a/src/Storages/ObjectStorage/DataLakes/IcebergMetadata.h b/src/Storages/ObjectStorage/DataLakes/IcebergMetadata.h index f88e3eecc67..06dbd373bf9 100644 --- a/src/Storages/ObjectStorage/DataLakes/IcebergMetadata.h +++ b/src/Storages/ObjectStorage/DataLakes/IcebergMetadata.h @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include namespace DB @@ -61,7 +61,7 @@ namespace DB class IcebergMetadata : public IDataLakeMetadata, private WithContext { public: - using ConfigurationPtr = StorageObjectStorageConfigurationPtr; + using ConfigurationPtr = StorageObjectStorage::ConfigurationPtr; static constexpr auto name = "Iceberg"; diff --git a/src/Storages/ObjectStorage/DataLakes/registerDataLakeStorages.cpp 
b/src/Storages/ObjectStorage/DataLakes/registerDataLakeStorages.cpp index a5170e5ed6b..0fa6402e892 100644 --- a/src/Storages/ObjectStorage/DataLakes/registerDataLakeStorages.cpp +++ b/src/Storages/ObjectStorage/DataLakes/registerDataLakeStorages.cpp @@ -20,7 +20,7 @@ void registerStorageIceberg(StorageFactory & factory) [&](const StorageFactory::Arguments & args) { auto configuration = std::make_shared(); - StorageObjectStorageConfiguration::initialize(*configuration, args.engine_args, args.getLocalContext(), false); + StorageObjectStorage::Configuration::initialize(*configuration, args.engine_args, args.getLocalContext(), false); return StorageIceberg::create( configuration, args.getContext(), args.table_id, args.columns, @@ -43,7 +43,7 @@ void registerStorageDeltaLake(StorageFactory & factory) [&](const StorageFactory::Arguments & args) { auto configuration = std::make_shared(); - StorageObjectStorageConfiguration::initialize(*configuration, args.engine_args, args.getLocalContext(), false); + StorageObjectStorage::Configuration::initialize(*configuration, args.engine_args, args.getLocalContext(), false); return StorageDeltaLake::create( configuration, args.getContext(), args.table_id, args.columns, @@ -64,7 +64,7 @@ void registerStorageHudi(StorageFactory & factory) [&](const StorageFactory::Arguments & args) { auto configuration = std::make_shared(); - StorageObjectStorageConfiguration::initialize(*configuration, args.engine_args, args.getLocalContext(), false); + StorageObjectStorage::Configuration::initialize(*configuration, args.engine_args, args.getLocalContext(), false); return StorageHudi::create( configuration, args.getContext(), args.table_id, args.columns, diff --git a/src/Storages/ObjectStorage/HDFS/Configuration.cpp b/src/Storages/ObjectStorage/HDFS/Configuration.cpp index 12e3f3adb12..a8a9ab5b557 100644 --- a/src/Storages/ObjectStorage/HDFS/Configuration.cpp +++ b/src/Storages/ObjectStorage/HDFS/Configuration.cpp @@ -1,18 +1,21 @@ #include #if USE_HDFS -#include -#include -#include +#include #include -#include -#include #include +#include + +#include +#include + +#include +#include +#include + #include #include #include -#include - namespace DB { @@ -23,7 +26,7 @@ namespace ErrorCodes } StorageHDFSConfiguration::StorageHDFSConfiguration(const StorageHDFSConfiguration & other) - : StorageObjectStorageConfiguration(other) + : Configuration(other) { url = other.url; path = other.path; @@ -34,7 +37,7 @@ void StorageHDFSConfiguration::check(ContextPtr context) const { context->getRemoteHostFilter().checkURL(Poco::URI(url)); checkHDFSURL(fs::path(url) / path.substr(1)); - StorageObjectStorageConfiguration::check(context); + Configuration::check(context); } ObjectStoragePtr StorageHDFSConfiguration::createObjectStorage( /// NOLINT @@ -47,10 +50,11 @@ ObjectStoragePtr StorageHDFSConfiguration::createObjectStorage( /// NOLINT settings.remote_read_min_bytes_for_seek, settings.hdfs_replication ); - return std::make_shared(url, std::move(hdfs_settings), context->getConfigRef()); + return std::make_shared( + url, std::move(hdfs_settings), context->getConfigRef(), /* lazy_initialize */true); } -std::string StorageHDFSConfiguration::getPathWithoutGlob() const +std::string StorageHDFSConfiguration::getPathWithoutGlobs() const { /// Unlike s3 and azure, which are object storages, /// hdfs is a filesystem, so it cannot list files by partual prefix, @@ -69,9 +73,9 @@ StorageObjectStorage::QuerySettings StorageHDFSConfiguration::getQuerySettings(c .create_new_file_on_insert = 
settings.hdfs_create_new_file_on_insert, .schema_inference_use_cache = settings.schema_inference_use_cache_for_hdfs, .schema_inference_mode = settings.schema_inference_mode, - .skip_empty_files = settings.hdfs_skip_empty_files, /// TODO: add setting for hdfs - .list_object_keys_size = settings.s3_list_object_keys_size, /// TODO: add a setting for hdfs - .throw_on_zero_files_match = settings.s3_throw_on_zero_files_match, + .skip_empty_files = settings.hdfs_skip_empty_files, + .list_object_keys_size = 0, /// HDFS does not support listing in batches. + .throw_on_zero_files_match = settings.hdfs_throw_on_zero_files_match, .ignore_non_existent_file = settings.hdfs_ignore_file_doesnt_exist, }; } diff --git a/src/Storages/ObjectStorage/HDFS/Configuration.h b/src/Storages/ObjectStorage/HDFS/Configuration.h index 0a502857153..cac09ee1d92 100644 --- a/src/Storages/ObjectStorage/HDFS/Configuration.h +++ b/src/Storages/ObjectStorage/HDFS/Configuration.h @@ -2,17 +2,18 @@ #include "config.h" #if USE_HDFS -#include +#include #include -#include #include namespace DB { -class StorageHDFSConfiguration : public StorageObjectStorageConfiguration +class StorageHDFSConfiguration : public StorageObjectStorage::Configuration { public: + using ConfigurationPtr = StorageObjectStorage::ConfigurationPtr; + static constexpr auto type_name = "hdfs"; static constexpr auto engine_name = "HDFS"; @@ -26,7 +27,6 @@ public: void setPath(const Path & path_) override { path = path_; } const Paths & getPaths() const override { return paths; } - Paths & getPaths() override { return paths; } void setPaths(const Paths & paths_) override { paths = paths_; } String getNamespace() const override { return ""; } @@ -35,12 +35,12 @@ public: void check(ContextPtr context) const override; ObjectStoragePtr createObjectStorage(ContextPtr context, bool is_readonly = true) override; /// NOLINT - StorageObjectStorageConfigurationPtr clone() override { return std::make_shared(*this); } + ConfigurationPtr clone() override { return std::make_shared(*this); } void addStructureAndFormatToArgs( ASTs & args, const String & structure_, const String & format_, ContextPtr context) override; - std::string getPathWithoutGlob() const override; + std::string getPathWithoutGlobs() const override; private: void fromNamedCollection(const NamedCollection &) override; diff --git a/src/Storages/ObjectStorage/HDFS/ReadBufferFromHDFS.cpp b/src/Storages/ObjectStorage/HDFS/ReadBufferFromHDFS.cpp index b37b9de746b..be339d021dc 100644 --- a/src/Storages/ObjectStorage/HDFS/ReadBufferFromHDFS.cpp +++ b/src/Storages/ObjectStorage/HDFS/ReadBufferFromHDFS.cpp @@ -114,10 +114,10 @@ struct ReadBufferFromHDFS::ReadBufferFromHDFSImpl : public BufferWithOwnMemory= file_size) - // { - // return false; - // } + if (file_size != 0 && file_offset >= file_size) + { + return false; + } ResourceGuard rlock(read_settings.resource_link, num_bytes_to_read); int bytes_read; diff --git a/src/Storages/ObjectStorage/ReadBufferIterator.cpp b/src/Storages/ObjectStorage/ReadBufferIterator.cpp index 9c1d3f79c2b..3705725ffe1 100644 --- a/src/Storages/ObjectStorage/ReadBufferIterator.cpp +++ b/src/Storages/ObjectStorage/ReadBufferIterator.cpp @@ -10,7 +10,6 @@ namespace ErrorCodes { extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; extern const int CANNOT_DETECT_FORMAT; - } ReadBufferIterator::ReadBufferIterator( @@ -29,18 +28,19 @@ ReadBufferIterator::ReadBufferIterator( , query_settings(configuration->getQuerySettings(context_)) , schema_cache(schema_cache_) , read_keys(read_keys_) - , 
format(configuration->format == "auto" ? std::nullopt : std::optional(configuration->format)) , prev_read_keys_size(read_keys_.size()) { + if (configuration->format != "auto") + format = configuration->format; } SchemaCache::Key ReadBufferIterator::getKeyForSchemaCache(const String & path, const String & format_name) const { - auto source = fs::path(configuration->getDataSourceDescription()) / path; + auto source = std::filesystem::path(configuration->getDataSourceDescription()) / path; return DB::getKeyForSchemaCache(source, format_name, format_settings, getContext()); } -SchemaCache::Keys ReadBufferIterator::getPathsForSchemaCache() const +SchemaCache::Keys ReadBufferIterator::getKeysForSchemaCache() const { Strings sources; sources.reserve(read_keys.size()); @@ -49,7 +49,7 @@ SchemaCache::Keys ReadBufferIterator::getPathsForSchemaCache() const std::back_inserter(sources), [&](const auto & elem) { - return fs::path(configuration->getDataSourceDescription()) / elem->relative_path; + return std::filesystem::path(configuration->getDataSourceDescription()) / elem->relative_path; }); return DB::getKeysForSchemaCache(sources, *format, format_settings, getContext()); } @@ -66,16 +66,14 @@ std::optional ReadBufferIterator::tryGetColumnsFromCache( const auto & object_info = (*it); auto get_last_mod_time = [&] -> std::optional { - if (object_info->metadata) - return object_info->metadata->last_modified.epochTime(); - else - { - object_info->metadata = object_storage->getObjectMetadata(object_info->relative_path); - return object_info->metadata->last_modified.epochTime(); - } + if (!object_info->metadata) + object_info->metadata = object_storage->tryGetObjectMetadata(object_info->relative_path); + + return object_info->metadata + ? std::optional(object_info->metadata->last_modified.epochTime()) + : std::nullopt; }; - chassert(object_info); if (format) { auto cache_key = getKeyForSchemaCache(object_info->relative_path, *format); @@ -105,14 +103,12 @@ std::optional ReadBufferIterator::tryGetColumnsFromCache( void ReadBufferIterator::setNumRowsToLastFile(size_t num_rows) { - chassert(current_object_info); if (query_settings.schema_inference_use_cache) schema_cache.addNumRows(getKeyForSchemaCache(current_object_info->relative_path, *format), num_rows); } void ReadBufferIterator::setSchemaToLastFile(const ColumnsDescription & columns) { - chassert(current_object_info); if (query_settings.schema_inference_use_cache && query_settings.schema_inference_mode == SchemaInferenceMode::UNION) { @@ -125,7 +121,7 @@ void ReadBufferIterator::setResultingSchema(const ColumnsDescription & columns) if (query_settings.schema_inference_use_cache && query_settings.schema_inference_mode == SchemaInferenceMode::DEFAULT) { - schema_cache.addManyColumns(getPathsForSchemaCache(), columns); + schema_cache.addManyColumns(getKeysForSchemaCache(), columns); } } @@ -144,15 +140,11 @@ String ReadBufferIterator::getLastFileName() const std::unique_ptr ReadBufferIterator::recreateLastReadBuffer() { - chassert(current_object_info); - - auto impl = object_storage->readObject( - StoredObject(current_object_info->relative_path), getContext()->getReadSettings()); - - int zstd_window_log_max = static_cast(getContext()->getSettingsRef().zstd_window_log_max); - return wrapReadBufferWithCompressionMethod( - std::move(impl), chooseCompressionMethod(current_object_info->relative_path, configuration->compression_method), - zstd_window_log_max); + auto context = getContext(); + auto impl = 
object_storage->readObject(StoredObject(current_object_info->relative_path), context->getReadSettings()); + const auto compression_method = chooseCompressionMethod(current_object_info->relative_path, configuration->compression_method); + const auto zstd_window_log_max = static_cast(context->getSettingsRef().zstd_window_log_max); + return wrapReadBufferWithCompressionMethod(std::move(impl), compression_method, zstd_window_log_max); } ReadBufferIterator::Data ReadBufferIterator::next() @@ -190,16 +182,21 @@ ReadBufferIterator::Data ReadBufferIterator::next() if (first) { if (format.has_value()) + { throw Exception( ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "The table structure cannot be extracted from a {} format file, because there are no files with provided path " + "The table structure cannot be extracted from a {} format file, " + "because there are no files with provided path " "in {} or all files are empty. You can specify table structure manually", *format, object_storage->getName()); + } throw Exception( ErrorCodes::CANNOT_DETECT_FORMAT, - "The data format cannot be detected by the contents of the files, because there are no files with provided path " - "in {} or all files are empty. You can specify the format manually", object_storage->getName()); + "The data format cannot be detected by the contents of the files, " + "because there are no files with provided path " + "in {} or all files are empty. You can specify the format manually", + object_storage->getName()); } return {nullptr, std::nullopt, format}; diff --git a/src/Storages/ObjectStorage/ReadBufferIterator.h b/src/Storages/ObjectStorage/ReadBufferIterator.h index 2d58e1c789e..287e316e243 100644 --- a/src/Storages/ObjectStorage/ReadBufferIterator.h +++ b/src/Storages/ObjectStorage/ReadBufferIterator.h @@ -1,8 +1,7 @@ #pragma once #include -#include -#include #include +#include namespace DB @@ -12,6 +11,9 @@ class ReadBufferIterator : public IReadBufferIterator, WithContext { public: using FileIterator = std::shared_ptr; + using ConfigurationPtr = StorageObjectStorage::ConfigurationPtr; + using ObjectInfoPtr = StorageObjectStorage::ObjectInfoPtr; + using ObjectInfos = StorageObjectStorage::ObjectInfos; ReadBufferIterator( ObjectStoragePtr object_storage_, @@ -40,7 +42,7 @@ public: private: SchemaCache::Key getKeyForSchemaCache(const String & path, const String & format_name) const; - SchemaCache::Keys getPathsForSchemaCache() const; + SchemaCache::Keys getKeysForSchemaCache() const; std::optional tryGetColumnsFromCache( const ObjectInfos::iterator & begin, const ObjectInfos::iterator & end); diff --git a/src/Storages/ObjectStorage/ReadFromObjectStorageStep.cpp b/src/Storages/ObjectStorage/ReadFromObjectStorageStep.cpp deleted file mode 100644 index f19e01cdc3e..00000000000 --- a/src/Storages/ObjectStorage/ReadFromObjectStorageStep.cpp +++ /dev/null @@ -1,87 +0,0 @@ -#include -#include -#include - -namespace DB -{ - -ReadFromObjectStorageStep::ReadFromObjectStorageStep( - ObjectStoragePtr object_storage_, - ConfigurationPtr configuration_, - const String & name_, - const Names & columns_to_read, - const NamesAndTypesList & virtual_columns_, - const SelectQueryInfo & query_info_, - const StorageSnapshotPtr & storage_snapshot_, - const std::optional & format_settings_, - bool distributed_processing_, - ReadFromFormatInfo info_, - SchemaCache & schema_cache_, - const bool need_only_count_, - ContextPtr context_, - size_t max_block_size_, - size_t num_streams_) - : SourceStepWithFilter(DataStream{.header = info_.source_header}, 
columns_to_read, query_info_, storage_snapshot_, context_) - , object_storage(object_storage_) - , configuration(configuration_) - , info(std::move(info_)) - , virtual_columns(virtual_columns_) - , format_settings(format_settings_) - , query_settings(configuration->getQuerySettings(context_)) - , schema_cache(schema_cache_) - , name(name_ + "Source") - , need_only_count(need_only_count_) - , max_block_size(max_block_size_) - , num_streams(num_streams_) - , distributed_processing(distributed_processing_) -{ -} - -void ReadFromObjectStorageStep::createIterator(const ActionsDAG::Node * predicate) -{ - if (!iterator_wrapper) - { - auto context = getContext(); - iterator_wrapper = StorageObjectStorageSource::createFileIterator( - configuration, object_storage, distributed_processing, - context, predicate, virtual_columns, nullptr, context->getFileProgressCallback()); - } -} - -void ReadFromObjectStorageStep::applyFilters(ActionDAGNodes added_filter_nodes) -{ - filter_actions_dag = ActionsDAG::buildFilterActionsDAG(added_filter_nodes.nodes); - const ActionsDAG::Node * predicate = nullptr; - if (filter_actions_dag) - predicate = filter_actions_dag->getOutputs().at(0); - - createIterator(predicate); -} - -void ReadFromObjectStorageStep::initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &) -{ - createIterator(nullptr); - auto context = getContext(); - - Pipes pipes; - for (size_t i = 0; i < num_streams; ++i) - { - auto source = std::make_shared( - getName(), object_storage, configuration, info, format_settings, query_settings, - context, max_block_size, iterator_wrapper, need_only_count, schema_cache); - - source->setKeyCondition(filter_actions_dag, context); - pipes.emplace_back(std::move(source)); - } - - auto pipe = Pipe::unitePipes(std::move(pipes)); - if (pipe.empty()) - pipe = Pipe(std::make_shared(info.source_header)); - - for (const auto & processor : pipe.getProcessors()) - processors.emplace_back(processor); - - pipeline.init(std::move(pipe)); -} - -} diff --git a/src/Storages/ObjectStorage/ReadFromObjectStorageStep.h b/src/Storages/ObjectStorage/ReadFromObjectStorageStep.h deleted file mode 100644 index d98ebfef1f2..00000000000 --- a/src/Storages/ObjectStorage/ReadFromObjectStorageStep.h +++ /dev/null @@ -1,55 +0,0 @@ -#pragma once -#include -#include - -namespace DB -{ - -class ReadFromObjectStorageStep : public SourceStepWithFilter -{ -public: - using ConfigurationPtr = StorageObjectStorageConfigurationPtr; - - ReadFromObjectStorageStep( - ObjectStoragePtr object_storage_, - ConfigurationPtr configuration_, - const String & name_, - const Names & columns_to_read, - const NamesAndTypesList & virtual_columns_, - const SelectQueryInfo & query_info_, - const StorageSnapshotPtr & storage_snapshot_, - const std::optional & format_settings_, - bool distributed_processing_, - ReadFromFormatInfo info_, - SchemaCache & schema_cache_, - bool need_only_count_, - ContextPtr context_, - size_t max_block_size_, - size_t num_streams_); - - std::string getName() const override { return name; } - - void applyFilters(ActionDAGNodes added_filter_nodes) override; - - void initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &) override; - -private: - ObjectStoragePtr object_storage; - ConfigurationPtr configuration; - std::shared_ptr iterator_wrapper; - - const ReadFromFormatInfo info; - const NamesAndTypesList virtual_columns; - const std::optional format_settings; - const StorageObjectStorage::QuerySettings query_settings; - SchemaCache & 
schema_cache; - const String name; - const bool need_only_count; - const size_t max_block_size; - const size_t num_streams; - const bool distributed_processing; - - void createIterator(const ActionsDAG::Node * predicate); -}; - -} diff --git a/src/Storages/ObjectStorage/S3/Configuration.cpp b/src/Storages/ObjectStorage/S3/Configuration.cpp index bfd61c647f8..9fcbc6a6816 100644 --- a/src/Storages/ObjectStorage/S3/Configuration.cpp +++ b/src/Storages/ObjectStorage/S3/Configuration.cpp @@ -1,17 +1,23 @@ #include #if USE_AWS_S3 - #include +#include #include + +#include #include -#include + #include #include + #include #include #include +#include +#include + namespace DB { namespace ErrorCodes @@ -46,7 +52,7 @@ static const std::unordered_set optional_configuration_keys = String StorageS3Configuration::getDataSourceDescription() { - return fs::path(url.uri.getHost() + std::to_string(url.uri.getPort())) / url.bucket; + return std::filesystem::path(url.uri.getHost() + std::to_string(url.uri.getPort())) / url.bucket; } void StorageS3Configuration::check(ContextPtr context) const @@ -54,7 +60,7 @@ void StorageS3Configuration::check(ContextPtr context) const validateNamespace(url.bucket); context->getGlobalContext()->getRemoteHostFilter().checkURL(url.uri); context->getGlobalContext()->getHTTPHeaderFilter().checkHeaders(headers_from_ast); - StorageObjectStorageConfiguration::check(context); + Configuration::check(context); } void StorageS3Configuration::validateNamespace(const String & name) const @@ -63,7 +69,7 @@ void StorageS3Configuration::validateNamespace(const String & name) const } StorageS3Configuration::StorageS3Configuration(const StorageS3Configuration & other) - : StorageObjectStorageConfiguration(other) + : Configuration(other) { url = other.url; static_configuration = other.static_configuration; @@ -91,11 +97,12 @@ ObjectStoragePtr StorageS3Configuration::createObjectStorage(ContextPtr context, assertInitialized(); const auto & config = context->getConfigRef(); + const auto & settings = context->getSettingsRef(); const std::string config_prefix = "s3."; - auto s3_settings = getSettings(config, config_prefix, context, false); /// FIXME: add a setting + auto s3_settings = getSettings(config, config_prefix, context, settings.s3_validate_request_settings); - request_settings.updateFromSettingsIfChanged(context->getSettingsRef()); + request_settings.updateFromSettingsIfChanged(settings); auth_settings.updateFrom(s3_settings->auth_settings); s3_settings->auth_settings = auth_settings; diff --git a/src/Storages/ObjectStorage/S3/Configuration.h b/src/Storages/ObjectStorage/S3/Configuration.h index de4a6d17579..9eb724c4a64 100644 --- a/src/Storages/ObjectStorage/S3/Configuration.h +++ b/src/Storages/ObjectStorage/S3/Configuration.h @@ -4,17 +4,17 @@ #if USE_AWS_S3 -#include #include -#include -#include +#include namespace DB { -class StorageS3Configuration : public StorageObjectStorageConfiguration +class StorageS3Configuration : public StorageObjectStorage::Configuration { public: + using ConfigurationPtr = StorageObjectStorage::ConfigurationPtr; + static constexpr auto type_name = "s3"; StorageS3Configuration() = default; @@ -27,7 +27,6 @@ public: void setPath(const Path & path) override { url.key = path; } const Paths & getPaths() const override { return keys; } - Paths & getPaths() override { return keys; } void setPaths(const Paths & paths) override { keys = paths; } String getNamespace() const override { return url.bucket; } @@ -37,7 +36,7 @@ public: void check(ContextPtr context) const 
override; void validateNamespace(const String & name) const override; - StorageObjectStorageConfigurationPtr clone() override { return std::make_shared(*this); } + ConfigurationPtr clone() override { return std::make_shared(*this); } bool isStaticConfiguration() const override { return static_configuration; } ObjectStoragePtr createObjectStorage(ContextPtr context, bool is_readonly = true) override; /// NOLINT diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index c5565d8b0e8..2c9831f0d29 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -2,21 +2,25 @@ #include #include -#include #include +#include + +#include #include +#include +#include #include #include -#include + #include +#include #include -#include +#include +#include +#include #include #include -#include #include -#include -#include namespace DB @@ -26,6 +30,7 @@ namespace ErrorCodes { extern const int DATABASE_ACCESS_DENIED; extern const int NOT_IMPLEMENTED; + extern const int LOGICAL_ERROR; } StorageObjectStorage::StorageObjectStorage( @@ -90,6 +95,110 @@ void StorageObjectStorage::updateConfiguration(ContextPtr context) object_storage->applyNewSettings(context->getConfigRef(), "s3.", context); } +namespace +{ +class ReadFromObjectStorageStep : public SourceStepWithFilter +{ +public: + using ConfigurationPtr = StorageObjectStorage::ConfigurationPtr; + + ReadFromObjectStorageStep( + ObjectStoragePtr object_storage_, + ConfigurationPtr configuration_, + const String & name_, + const Names & columns_to_read, + const NamesAndTypesList & virtual_columns_, + const SelectQueryInfo & query_info_, + const StorageSnapshotPtr & storage_snapshot_, + const std::optional & format_settings_, + bool distributed_processing_, + ReadFromFormatInfo info_, + SchemaCache & schema_cache_, + const bool need_only_count_, + ContextPtr context_, + size_t max_block_size_, + size_t num_streams_) + : SourceStepWithFilter(DataStream{.header = info_.source_header}, columns_to_read, query_info_, storage_snapshot_, context_) + , object_storage(object_storage_) + , configuration(configuration_) + , schema_cache(schema_cache_) + , info(std::move(info_)) + , virtual_columns(virtual_columns_) + , format_settings(format_settings_) + , query_settings(configuration->getQuerySettings(context_)) + , name(name_ + "Source") + , need_only_count(need_only_count_) + , max_block_size(max_block_size_) + , num_streams(num_streams_) + , distributed_processing(distributed_processing_) + { + } + + std::string getName() const override { return name; } + + void applyFilters(ActionDAGNodes added_filter_nodes) override + { + filter_actions_dag = ActionsDAG::buildFilterActionsDAG(added_filter_nodes.nodes); + const ActionsDAG::Node * predicate = nullptr; + if (filter_actions_dag) + predicate = filter_actions_dag->getOutputs().at(0); + createIterator(predicate); + } + + void initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &) override + { + createIterator(nullptr); + Pipes pipes; + auto context = getContext(); + + for (size_t i = 0; i < num_streams; ++i) + { + auto source = std::make_shared( + getName(), object_storage, configuration, info, format_settings, query_settings, + context, max_block_size, iterator_wrapper, need_only_count, schema_cache); + + source->setKeyCondition(filter_actions_dag, context); + pipes.emplace_back(std::move(source)); + } + + auto pipe = Pipe::unitePipes(std::move(pipes)); + if 
(pipe.empty()) + pipe = Pipe(std::make_shared(info.source_header)); + + for (const auto & processor : pipe.getProcessors()) + processors.emplace_back(processor); + + pipeline.init(std::move(pipe)); + } + +private: + ObjectStoragePtr object_storage; + ConfigurationPtr configuration; + std::shared_ptr iterator_wrapper; + SchemaCache & schema_cache; + + const ReadFromFormatInfo info; + const NamesAndTypesList virtual_columns; + const std::optional format_settings; + const StorageObjectStorage::QuerySettings query_settings; + const String name; + const bool need_only_count; + const size_t max_block_size; + const size_t num_streams; + const bool distributed_processing; + + void createIterator(const ActionsDAG::Node * predicate) + { + if (iterator_wrapper) + return; + auto context = getContext(); + iterator_wrapper = StorageObjectStorageSource::createFileIterator( + configuration, object_storage, distributed_processing, + context, predicate, virtual_columns, nullptr, context->getFileProgressCallback()); + } +}; +} + void StorageObjectStorage::read( QueryPlan & query_plan, const Names & column_names, @@ -123,7 +232,7 @@ void StorageObjectStorage::read( storage_snapshot, format_settings, distributed_processing, - std::move(read_from_format_info), + read_from_format_info, getSchemaCache(local_context), need_only_count, local_context, @@ -169,12 +278,13 @@ SinkToStoragePtr StorageObjectStorage::write( getName(), configuration->getPath()); } - auto & paths = configuration->getPaths(); + auto paths = configuration->getPaths(); if (auto new_key = checkAndGetNewFileOnInsertIfNeeded( *object_storage, *configuration, settings, paths.front(), paths.size())) { paths.push_back(*new_key); } + configuration->setPaths(paths); return std::make_shared( object_storage, @@ -185,10 +295,10 @@ SinkToStoragePtr StorageObjectStorage::write( } void StorageObjectStorage::truncate( - const ASTPtr &, - const StorageMetadataPtr &, - ContextPtr, - TableExclusiveLockHolder &) + const ASTPtr & /* query */, + const StorageMetadataPtr & /* metadata_snapshot */, + ContextPtr /* context */, + TableExclusiveLockHolder & /* table_holder */) { if (configuration->withGlobs()) { @@ -233,10 +343,8 @@ ColumnsDescription StorageObjectStorage::resolveSchemaFromData( const ContextPtr & context) { ObjectInfos read_keys; - auto read_buffer_iterator = createReadBufferIterator( - object_storage, configuration, format_settings, read_keys, context); - return readSchemaFromFormat( - configuration->format, format_settings, *read_buffer_iterator, context); + auto iterator = createReadBufferIterator(object_storage, configuration, format_settings, read_keys, context); + return readSchemaFromFormat(configuration->format, format_settings, *iterator, context); } std::string StorageObjectStorage::resolveFormatFromData( @@ -246,10 +354,8 @@ std::string StorageObjectStorage::resolveFormatFromData( const ContextPtr & context) { ObjectInfos read_keys; - auto read_buffer_iterator = createReadBufferIterator( - object_storage, configuration, format_settings, read_keys, context); - return detectFormatAndReadSchema( - format_settings, *read_buffer_iterator, context).second; + auto iterator = createReadBufferIterator(object_storage, configuration, format_settings, read_keys, context); + return detectFormatAndReadSchema(format_settings, *iterator, context).second; } std::pair StorageObjectStorage::resolveSchemaAndFormatFromData( @@ -259,10 +365,8 @@ std::pair StorageObjectStorage::resolveSchemaAn const ContextPtr & context) { ObjectInfos read_keys; - auto 
read_buffer_iterator = createReadBufferIterator( - object_storage, configuration, format_settings, read_keys, context); - - auto [columns, format] = detectFormatAndReadSchema(format_settings, *read_buffer_iterator, context); + auto iterator = createReadBufferIterator(object_storage, configuration, format_settings, read_keys, context); + auto [columns, format] = detectFormatAndReadSchema(format_settings, *iterator, context); configuration->format = format; return std::pair(columns, format); } @@ -302,4 +406,65 @@ SchemaCache & StorageObjectStorage::getSchemaCache(const ContextPtr & context, c throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unsupported storage type: {}", storage_type_name); } +void StorageObjectStorage::Configuration::initialize( + Configuration & configuration, + ASTs & engine_args, + ContextPtr local_context, + bool with_table_structure) +{ + if (auto named_collection = tryGetNamedCollectionWithOverrides(engine_args, local_context)) + configuration.fromNamedCollection(*named_collection); + else + configuration.fromAST(engine_args, local_context, with_table_structure); + + // FIXME: it should be - if (format == "auto" && get_format_from_file) + if (configuration.format == "auto") + configuration.format = FormatFactory::instance().tryGetFormatFromFileName(configuration.getPath()).value_or("auto"); + else + FormatFactory::instance().checkFormatName(configuration.format); + + configuration.initialized = true; +} + +void StorageObjectStorage::Configuration::check(ContextPtr) const +{ + FormatFactory::instance().checkFormatName(format); +} + +StorageObjectStorage::Configuration::Configuration(const Configuration & other) +{ + format = other.format; + compression_method = other.compression_method; + structure = other.structure; +} + +bool StorageObjectStorage::Configuration::withWildcard() const +{ + static const String PARTITION_ID_WILDCARD = "{_partition_id}"; + return getPath().find(PARTITION_ID_WILDCARD) != String::npos + || getNamespace().find(PARTITION_ID_WILDCARD) != String::npos; +} + +bool StorageObjectStorage::Configuration::isPathWithGlobs() const +{ + return getPath().find_first_of("*?{") != std::string::npos; +} + +bool StorageObjectStorage::Configuration::isNamespaceWithGlobs() const +{ + return getNamespace().find_first_of("*?{") != std::string::npos; +} + +std::string StorageObjectStorage::Configuration::getPathWithoutGlobs() const +{ + return getPath().substr(0, getPath().find_first_of("*?{")); +} + +void StorageObjectStorage::Configuration::assertInitialized() const +{ + if (!initialized) + { + throw Exception(ErrorCodes::LOGICAL_ERROR, "Configuration was not initialized before usage"); + } +} } diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.h b/src/Storages/ObjectStorage/StorageObjectStorage.h index d46a875bf42..46d422b26c2 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.h +++ b/src/Storages/ObjectStorage/StorageObjectStorage.h @@ -2,15 +2,16 @@ #include #include #include +#include #include #include namespace DB { -class StorageObjectStorageConfiguration; class ReadBufferIterator; class SchemaCache; +class NamedCollection; /** * A general class containing implementation for external table engines @@ -20,7 +21,7 @@ class SchemaCache; class StorageObjectStorage : public IStorage { public: - using Configuration = StorageObjectStorageConfiguration; + class Configuration; using ConfigurationPtr = std::shared_ptr; using ObjectInfo = RelativePathWithMetadata; using ObjectInfoPtr = std::shared_ptr; @@ -134,4 +135,61 @@ protected: std::mutex 
configuration_update_mutex; }; +class StorageObjectStorage::Configuration +{ +public: + Configuration() = default; + Configuration(const Configuration & other); + virtual ~Configuration() = default; + + using Path = std::string; + using Paths = std::vector; + + static void initialize( + Configuration & configuration, + ASTs & engine_args, + ContextPtr local_context, + bool with_table_structure); + + virtual std::string getTypeName() const = 0; + virtual std::string getEngineName() const = 0; + + virtual Path getPath() const = 0; + virtual void setPath(const Path & path) = 0; + + virtual const Paths & getPaths() const = 0; + virtual void setPaths(const Paths & paths) = 0; + + virtual String getDataSourceDescription() = 0; + virtual String getNamespace() const = 0; + virtual StorageObjectStorage::QuerySettings getQuerySettings(const ContextPtr &) const = 0; + virtual void addStructureAndFormatToArgs( + ASTs & args, const String & structure_, const String & format_, ContextPtr context) = 0; + + bool withWildcard() const; + bool withGlobs() const { return isPathWithGlobs() || isNamespaceWithGlobs(); } + bool isPathWithGlobs() const; + bool isNamespaceWithGlobs() const; + virtual std::string getPathWithoutGlobs() const; + + virtual void check(ContextPtr context) const; + virtual void validateNamespace(const String & /* name */) const {} + + virtual ObjectStoragePtr createObjectStorage(ContextPtr context, bool is_readonly = true) = 0; /// NOLINT + virtual ConfigurationPtr clone() = 0; + virtual bool isStaticConfiguration() const { return true; } + + String format = "auto"; + String compression_method = "auto"; + String structure = "auto"; + +protected: + virtual void fromNamedCollection(const NamedCollection & collection) = 0; + virtual void fromAST(ASTs & args, ContextPtr context, bool with_structure) = 0; + + void assertInitialized() const; + + bool initialized = false; +}; + } diff --git a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp index 1a1df399626..193894a1d44 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp @@ -1,21 +1,15 @@ #include "Storages/ObjectStorage/StorageObjectStorageCluster.h" -#include "config.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include #include +#include +#include + +#include #include +#include +#include + namespace DB { diff --git a/src/Storages/ObjectStorage/StorageObjectStorageCluster.h b/src/Storages/ObjectStorage/StorageObjectStorageCluster.h index 2db8f5c352e..b38eb722df5 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageCluster.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageCluster.h @@ -1,12 +1,10 @@ #pragma once -#include "config.h" - -#include +// #include #include #include #include -#include +// #include namespace DB { @@ -29,17 +27,14 @@ public: std::string getName() const override; - RemoteQueryExecutor::Extension getTaskIteratorExtension( - const ActionsDAG::Node * predicate, - const ContextPtr & context) const override; - bool supportsSubcolumns() const override { return true; } bool supportsTrivialCountOptimization(const StorageSnapshotPtr &, ContextPtr) const override { return true; } -private: - void updateBeforeRead(const ContextPtr & /* context */) override {} + RemoteQueryExecutor::Extension getTaskIteratorExtension( + const ActionsDAG::Node * predicate, const ContextPtr & context) const 
override; +private: void updateQueryToSendIfNeeded( ASTPtr & query, const StorageSnapshotPtr & storage_snapshot, diff --git a/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.cpp b/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.cpp deleted file mode 100644 index 89c15085274..00000000000 --- a/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.cpp +++ /dev/null @@ -1,74 +0,0 @@ -#include -#include -#include - -namespace DB -{ -namespace ErrorCodes -{ - extern const int LOGICAL_ERROR; -} - -void StorageObjectStorageConfiguration::initialize( - StorageObjectStorageConfiguration & configuration, - ASTs & engine_args, - ContextPtr local_context, - bool with_table_structure) -{ - if (auto named_collection = tryGetNamedCollectionWithOverrides(engine_args, local_context)) - configuration.fromNamedCollection(*named_collection); - else - configuration.fromAST(engine_args, local_context, with_table_structure); - - // FIXME: it should be - if (format == "auto" && get_format_from_file) - if (configuration.format == "auto") - configuration.format = FormatFactory::instance().tryGetFormatFromFileName(configuration.getPath()).value_or("auto"); - else - FormatFactory::instance().checkFormatName(configuration.format); - - configuration.initialized = true; -} - -void StorageObjectStorageConfiguration::check(ContextPtr) const -{ - FormatFactory::instance().checkFormatName(format); -} - -StorageObjectStorageConfiguration::StorageObjectStorageConfiguration(const StorageObjectStorageConfiguration & other) -{ - format = other.format; - compression_method = other.compression_method; - structure = other.structure; -} - -bool StorageObjectStorageConfiguration::withWildcard() const -{ - static const String PARTITION_ID_WILDCARD = "{_partition_id}"; - return getPath().find(PARTITION_ID_WILDCARD) != String::npos - || getNamespace().find(PARTITION_ID_WILDCARD) != String::npos; -} - -bool StorageObjectStorageConfiguration::isPathWithGlobs() const -{ - return getPath().find_first_of("*?{") != std::string::npos; -} - -bool StorageObjectStorageConfiguration::isNamespaceWithGlobs() const -{ - return getNamespace().find_first_of("*?{") != std::string::npos; -} - -std::string StorageObjectStorageConfiguration::getPathWithoutGlob() const -{ - return getPath().substr(0, getPath().find_first_of("*?{")); -} - -void StorageObjectStorageConfiguration::assertInitialized() const -{ - if (!initialized) - { - throw Exception(ErrorCodes::LOGICAL_ERROR, "Configuration was not initialized before usage"); - } -} - -} diff --git a/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.h b/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.h deleted file mode 100644 index c55362aa8bd..00000000000 --- a/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.h +++ /dev/null @@ -1,75 +0,0 @@ -#pragma once -#include -#include -#include "StorageObjectStorage.h" -#include - -namespace fs = std::filesystem; - -namespace DB -{ - -class StorageObjectStorageConfiguration; -using StorageObjectStorageConfigurationPtr = std::shared_ptr; - -class StorageObjectStorageConfiguration -{ -public: - StorageObjectStorageConfiguration() = default; - StorageObjectStorageConfiguration(const StorageObjectStorageConfiguration & other); - virtual ~StorageObjectStorageConfiguration() = default; - - using Path = std::string; - using Paths = std::vector; - - static void initialize( - StorageObjectStorageConfiguration & configuration, - ASTs & engine_args, - ContextPtr local_context, - bool with_table_structure); - - 
virtual std::string getTypeName() const = 0; - virtual std::string getEngineName() const = 0; - - virtual Path getPath() const = 0; - virtual void setPath(const Path & path) = 0; - - virtual const Paths & getPaths() const = 0; - virtual Paths & getPaths() = 0; - virtual void setPaths(const Paths & paths) = 0; - - virtual String getDataSourceDescription() = 0; - virtual String getNamespace() const = 0; - virtual StorageObjectStorage::QuerySettings getQuerySettings(const ContextPtr &) const = 0; - virtual void addStructureAndFormatToArgs( - ASTs & args, const String & structure_, const String & format_, ContextPtr context) = 0; - - bool withWildcard() const; - bool withGlobs() const { return isPathWithGlobs() || isNamespaceWithGlobs(); } - bool isPathWithGlobs() const; - bool isNamespaceWithGlobs() const; - virtual std::string getPathWithoutGlob() const; - - virtual void check(ContextPtr context) const; - virtual void validateNamespace(const String & /* name */) const {} - - virtual ObjectStoragePtr createObjectStorage(ContextPtr context, bool is_readonly = true) = 0; /// NOLINT - virtual StorageObjectStorageConfigurationPtr clone() = 0; - virtual bool isStaticConfiguration() const { return true; } - - String format = "auto"; - String compression_method = "auto"; - String structure = "auto"; - -protected: - virtual void fromNamedCollection(const NamedCollection & collection) = 0; - virtual void fromAST(ASTs & args, ContextPtr context, bool with_structure) = 0; - - void assertInitialized() const; - - bool initialized = false; -}; - -using StorageObjectStorageConfigurationPtr = std::shared_ptr; - -} diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp index 62367a6b933..81bdeaa43a3 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp @@ -14,14 +14,13 @@ namespace ErrorCodes StorageObjectStorageSink::StorageObjectStorageSink( ObjectStoragePtr object_storage, - StorageObjectStorageConfigurationPtr configuration, + ConfigurationPtr configuration, std::optional format_settings_, const Block & sample_block_, ContextPtr context, const std::string & blob_path) : SinkToStorage(sample_block_) , sample_block(sample_block_) - , format_settings(format_settings_) { const auto & settings = context->getSettingsRef(); const auto path = blob_path.empty() ? 
configuration->getPaths().back() : blob_path; @@ -37,7 +36,7 @@ StorageObjectStorageSink::StorageObjectStorageSink( static_cast(settings.output_format_compression_zstd_window_log)); writer = FormatFactory::instance().getOutputFormatParallelIfPossible( - configuration->format, *write_buf, sample_block, context, format_settings); + configuration->format, *write_buf, sample_block, context, format_settings_); } void StorageObjectStorageSink::consume(Chunk chunk) @@ -102,7 +101,7 @@ void StorageObjectStorageSink::release() PartitionedStorageObjectStorageSink::PartitionedStorageObjectStorageSink( ObjectStoragePtr object_storage_, - StorageObjectStorageConfigurationPtr configuration_, + ConfigurationPtr configuration_, std::optional format_settings_, const Block & sample_block_, ContextPtr context_, diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSink.h b/src/Storages/ObjectStorage/StorageObjectStorageSink.h index 6c2f73e40e3..a3c8ef68cf0 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSink.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageSink.h @@ -1,17 +1,18 @@ #pragma once #include -#include #include -#include +#include namespace DB { class StorageObjectStorageSink : public SinkToStorage { public: + using ConfigurationPtr = StorageObjectStorage::ConfigurationPtr; + StorageObjectStorageSink( ObjectStoragePtr object_storage, - StorageObjectStorageConfigurationPtr configuration, + ConfigurationPtr configuration, std::optional format_settings_, const Block & sample_block_, ContextPtr context, @@ -29,8 +30,6 @@ public: private: const Block sample_block; - const std::optional format_settings; - std::unique_ptr write_buf; OutputFormatPtr writer; bool cancelled = false; @@ -43,9 +42,11 @@ private: class PartitionedStorageObjectStorageSink : public PartitionedSink { public: + using ConfigurationPtr = StorageObjectStorage::ConfigurationPtr; + PartitionedStorageObjectStorageSink( ObjectStoragePtr object_storage_, - StorageObjectStorageConfigurationPtr configuration_, + ConfigurationPtr configuration_, std::optional format_settings_, const Block & sample_block_, ContextPtr context_, @@ -58,7 +59,8 @@ private: void validateNamespace(const String & str); ObjectStoragePtr object_storage; - StorageObjectStorageConfigurationPtr configuration; + ConfigurationPtr configuration; + const StorageObjectStorage::QuerySettings query_settings; const std::optional format_settings; const Block sample_block; diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index 4551c2df7c3..b224afb7a58 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -9,10 +9,11 @@ #include #include #include -#include +#include #include #include +namespace fs = std::filesystem; namespace ProfileEvents { @@ -218,11 +219,9 @@ std::optional StorageObjectStorageSource::tryGetNumRowsFromCache(const O auto get_last_mod_time = [&]() -> std::optional { - if (object_info->metadata) - { - return object_info->metadata->last_modified.epochTime(); - } - return std::nullopt; + return object_info->metadata + ? 
std::optional(object_info->metadata->last_modified.epochTime()) + : std::nullopt; }; return schema_cache.tryGetNumRows(cache_key, get_last_mod_time); } @@ -354,7 +353,7 @@ StorageObjectStorageSource::IIterator::IIterator(const std::string & logger_name { } -ObjectInfoPtr StorageObjectStorageSource::IIterator::next(size_t processor) +StorageObjectStorage::ObjectInfoPtr StorageObjectStorageSource::IIterator::next(size_t processor) { auto object_info = nextImpl(processor); @@ -392,7 +391,7 @@ StorageObjectStorageSource::GlobIterator::GlobIterator( else if (configuration->isPathWithGlobs()) { const auto key_with_globs = configuration_->getPath(); - const auto key_prefix = configuration->getPathWithoutGlob(); + const auto key_prefix = configuration->getPathWithoutGlobs(); object_storage_iterator = object_storage->iterate(key_prefix, list_object_keys_size); matcher = std::make_unique(makeRegexpPatternFromGlobs(key_with_globs)); @@ -423,7 +422,7 @@ StorageObjectStorageSource::GlobIterator::GlobIterator( } } -ObjectInfoPtr StorageObjectStorageSource::GlobIterator::nextImpl(size_t processor) +StorageObjectStorage::ObjectInfoPtr StorageObjectStorageSource::GlobIterator::nextImpl(size_t processor) { std::lock_guard lock(next_mutex); auto object_info = nextImplUnlocked(processor); @@ -439,7 +438,7 @@ ObjectInfoPtr StorageObjectStorageSource::GlobIterator::nextImpl(size_t processo return object_info; } -ObjectInfoPtr StorageObjectStorageSource::GlobIterator::nextImplUnlocked(size_t /* processor */) +StorageObjectStorage::ObjectInfoPtr StorageObjectStorageSource::GlobIterator::nextImplUnlocked(size_t /* processor */) { bool current_batch_processed = object_infos.empty() || index >= object_infos.size(); if (is_finished && current_batch_processed) @@ -533,7 +532,7 @@ StorageObjectStorageSource::KeysIterator::KeysIterator( } } -ObjectInfoPtr StorageObjectStorageSource::KeysIterator::nextImpl(size_t /* processor */) +StorageObjectStorage::ObjectInfoPtr StorageObjectStorageSource::KeysIterator::nextImpl(size_t /* processor */) { while (true) { @@ -614,7 +613,7 @@ StorageObjectStorageSource::ReadTaskIterator::ReadTaskIterator( } } -ObjectInfoPtr StorageObjectStorageSource::ReadTaskIterator::nextImpl(size_t) +StorageObjectStorage::ObjectInfoPtr StorageObjectStorageSource::ReadTaskIterator::nextImpl(size_t) { size_t current_index = index.fetch_add(1, std::memory_order_relaxed); if (current_index >= buffer.size()) diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.h b/src/Storages/ObjectStorage/StorageObjectStorageSource.h index 0afbf77db2b..356478422bc 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.h @@ -3,8 +3,8 @@ #include #include #include -#include #include +#include namespace DB @@ -16,6 +16,11 @@ class StorageObjectStorageSource : public SourceWithKeyCondition, WithContext { friend class StorageS3QueueSource; public: + using ConfigurationPtr = StorageObjectStorage::ConfigurationPtr; + using ObjectInfo = StorageObjectStorage::ObjectInfo; + using ObjectInfos = StorageObjectStorage::ObjectInfos; + using ObjectInfoPtr = StorageObjectStorage::ObjectInfoPtr; + class IIterator; class ReadTaskIterator; class GlobIterator; diff --git a/src/Storages/ObjectStorage/StorageObjectStorage_fwd_internal.h b/src/Storages/ObjectStorage/StorageObjectStorage_fwd_internal.h deleted file mode 100644 index 241e2f20962..00000000000 --- a/src/Storages/ObjectStorage/StorageObjectStorage_fwd_internal.h +++ /dev/null @@ -1,12 +0,0 @@ 
-#pragma once -#include - -namespace DB -{ - -using ConfigurationPtr = StorageObjectStorageConfigurationPtr; -using ObjectInfo = RelativePathWithMetadata; -using ObjectInfoPtr = std::shared_ptr; -using ObjectInfos = std::vector; - -} diff --git a/src/Storages/ObjectStorage/Utils.cpp b/src/Storages/ObjectStorage/Utils.cpp index 2a7236ab196..bde3cb7e1cb 100644 --- a/src/Storages/ObjectStorage/Utils.cpp +++ b/src/Storages/ObjectStorage/Utils.cpp @@ -1,6 +1,6 @@ #include #include -#include +#include namespace DB { @@ -47,14 +47,15 @@ void resolveSchemaAndFormat( ColumnsDescription & columns, std::string & format, ObjectStoragePtr object_storage, - const StorageObjectStorageConfigurationPtr & configuration, + const StorageObjectStorage::ConfigurationPtr & configuration, std::optional format_settings, const ContextPtr & context) { if (columns.empty()) { if (format == "auto") - std::tie(columns, format) = StorageObjectStorage::resolveSchemaAndFormatFromData(object_storage, configuration, format_settings, context); + std::tie(columns, format) = + StorageObjectStorage::resolveSchemaAndFormatFromData(object_storage, configuration, format_settings, context); else columns = StorageObjectStorage::resolveSchemaFromData(object_storage, configuration, format_settings, context); } diff --git a/src/Storages/ObjectStorage/Utils.h b/src/Storages/ObjectStorage/Utils.h index 3a752e6b8f0..2077999df41 100644 --- a/src/Storages/ObjectStorage/Utils.h +++ b/src/Storages/ObjectStorage/Utils.h @@ -1,14 +1,10 @@ #pragma once -#include #include "StorageObjectStorage.h" namespace DB { class IObjectStorage; -class StorageObjectStorageConfiguration; -using StorageObjectStorageConfigurationPtr = std::shared_ptr; -struct StorageObjectStorageSettings; std::optional checkAndGetNewFileOnInsertIfNeeded( const IObjectStorage & object_storage, @@ -21,7 +17,7 @@ void resolveSchemaAndFormat( ColumnsDescription & columns, std::string & format, ObjectStoragePtr object_storage, - const StorageObjectStorageConfigurationPtr & configuration, + const StorageObjectStorage::ConfigurationPtr & configuration, std::optional format_settings, const ContextPtr & context); diff --git a/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp b/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp index 06b8aefb716..c23b180215e 100644 --- a/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp @@ -1,8 +1,8 @@ -#include +#include #include #include #include -#include +#include #include #include @@ -18,13 +18,15 @@ namespace ErrorCodes static std::shared_ptr createStorageObjectStorage( const StorageFactory::Arguments & args, - typename StorageObjectStorage::ConfigurationPtr configuration, + StorageObjectStorage::ConfigurationPtr configuration, ContextPtr context) { auto & engine_args = args.engine_args; if (engine_args.empty()) throw Exception(ErrorCodes::BAD_ARGUMENTS, "External data source must have arguments"); + StorageObjectStorage::Configuration::initialize(*configuration, args.engine_args, context, false); + // Use format settings from global server context + settings from // the SETTINGS clause of the create query. Settings from current // session and user are ignored. 
@@ -75,10 +77,8 @@ void registerStorageAzure(StorageFactory & factory) { factory.registerStorage("AzureBlobStorage", [](const StorageFactory::Arguments & args) { - auto context = args.getLocalContext(); - auto configuration = std::make_shared(); - StorageObjectStorageConfiguration::initialize(*configuration, args.engine_args, context, false); - return createStorageObjectStorage(args, configuration, context); + auto configuration = std::make_shared(); + return createStorageObjectStorage(args, configuration, args.getLocalContext()); }, { .supports_settings = true, @@ -94,10 +94,8 @@ void registerStorageS3Impl(const String & name, StorageFactory & factory) { factory.registerStorage(name, [=](const StorageFactory::Arguments & args) { - auto context = args.getLocalContext(); auto configuration = std::make_shared(); - StorageObjectStorageConfiguration::initialize(*configuration, args.engine_args, context, false); - return createStorageObjectStorage(args, configuration, context); + return createStorageObjectStorage(args, configuration, args.getLocalContext()); }, { .supports_settings = true, @@ -129,10 +127,8 @@ void registerStorageHDFS(StorageFactory & factory) { factory.registerStorage("HDFS", [=](const StorageFactory::Arguments & args) { - auto context = args.getLocalContext(); auto configuration = std::make_shared(); - StorageObjectStorageConfiguration::initialize(*configuration, args.engine_args, context, false); - return createStorageObjectStorage(args, configuration, context); + return createStorageObjectStorage(args, configuration, args.getLocalContext()); }, { .supports_settings = true, diff --git a/src/Storages/S3Queue/S3QueueTableMetadata.cpp b/src/Storages/S3Queue/S3QueueTableMetadata.cpp index 8354e6aa2ae..f0b7568ae7f 100644 --- a/src/Storages/S3Queue/S3QueueTableMetadata.cpp +++ b/src/Storages/S3Queue/S3QueueTableMetadata.cpp @@ -7,6 +7,7 @@ #include #include #include +#include namespace DB @@ -32,7 +33,7 @@ namespace S3QueueTableMetadata::S3QueueTableMetadata( - const StorageObjectStorageConfiguration & configuration, + const StorageObjectStorage::Configuration & configuration, const S3QueueSettings & engine_settings, const StorageInMemoryMetadata & storage_metadata) { diff --git a/src/Storages/S3Queue/S3QueueTableMetadata.h b/src/Storages/S3Queue/S3QueueTableMetadata.h index 2158b189070..bb8f8ccf2c4 100644 --- a/src/Storages/S3Queue/S3QueueTableMetadata.h +++ b/src/Storages/S3Queue/S3QueueTableMetadata.h @@ -4,7 +4,7 @@ #include #include -#include +#include #include namespace DB @@ -29,7 +29,7 @@ struct S3QueueTableMetadata S3QueueTableMetadata() = default; S3QueueTableMetadata( - const StorageObjectStorageConfiguration & configuration, + const StorageObjectStorage::Configuration & configuration, const S3QueueSettings & engine_settings, const StorageInMemoryMetadata & storage_metadata); diff --git a/src/Storages/S3Queue/StorageS3Queue.cpp b/src/Storages/S3Queue/StorageS3Queue.cpp index e84dabecf3b..38934a7895a 100644 --- a/src/Storages/S3Queue/StorageS3Queue.cpp +++ b/src/Storages/S3Queue/StorageS3Queue.cpp @@ -591,7 +591,7 @@ void registerStorageS3QueueImpl(const String & name, StorageFactory & factory) throw Exception(ErrorCodes::BAD_ARGUMENTS, "External data source must have arguments"); auto configuration = std::make_shared(); - StorageObjectStorageConfiguration::initialize(*configuration, args.engine_args, args.getContext(), false); + StorageObjectStorage::Configuration::initialize(*configuration, args.engine_args, args.getContext(), false); // Use format settings from global 
server context + settings from // the SETTINGS clause of the create query. Settings from current diff --git a/src/Storages/System/StorageSystemSchemaInferenceCache.cpp b/src/Storages/System/StorageSystemSchemaInferenceCache.cpp index a2d3f342a63..b67a8b23e9d 100644 --- a/src/Storages/System/StorageSystemSchemaInferenceCache.cpp +++ b/src/Storages/System/StorageSystemSchemaInferenceCache.cpp @@ -11,7 +11,7 @@ #include #include #include -#include +#include namespace DB { @@ -84,7 +84,7 @@ void StorageSystemSchemaInferenceCache::fillData(MutableColumns & res_columns, C #endif fillDataImpl(res_columns, StorageURL::getSchemaCache(context), "URL"); #if USE_AZURE_BLOB_STORAGE - fillDataImpl(res_columns, StorageObjectStorage::getSchemaCache(context, StorageAzureBlobConfiguration::type_name), "Azure"); + fillDataImpl(res_columns, StorageObjectStorage::getSchemaCache(context, StorageAzureConfiguration::type_name), "Azure"); #endif } diff --git a/src/TableFunctions/ITableFunctionDataLake.h b/src/TableFunctions/ITableFunctionDataLake.h index 02c8c623e61..6ad8689a9b4 100644 --- a/src/TableFunctions/ITableFunctionDataLake.h +++ b/src/TableFunctions/ITableFunctionDataLake.h @@ -6,7 +6,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/TableFunctions/TableFunctionObjectStorage.cpp b/src/TableFunctions/TableFunctionObjectStorage.cpp index 06676a8adfa..a997b34a75c 100644 --- a/src/TableFunctions/TableFunctionObjectStorage.cpp +++ b/src/TableFunctions/TableFunctionObjectStorage.cpp @@ -1,23 +1,23 @@ #include "config.h" +#include +#include +#include #include + #include +#include #include #include + #include -#include -#include -#include -#include -#include -#include + #include #include -#include -#include -#include -#include -#include "registerTableFunctions.h" +#include +#include +#include +#include namespace DB @@ -29,8 +29,7 @@ namespace ErrorCodes } template -ObjectStoragePtr TableFunctionObjectStorage< - Definition, Configuration>::getObjectStorage(const ContextPtr & context, bool create_readonly) const +ObjectStoragePtr TableFunctionObjectStorage::getObjectStorage(const ContextPtr & context, bool create_readonly) const { if (!object_storage) object_storage = configuration->createObjectStorage(context, create_readonly); @@ -38,8 +37,7 @@ ObjectStoragePtr TableFunctionObjectStorage< } template -StorageObjectStorageConfigurationPtr TableFunctionObjectStorage< - Definition, Configuration>::getConfiguration() const +StorageObjectStorage::ConfigurationPtr TableFunctionObjectStorage::getConfiguration() const { if (!configuration) configuration = std::make_shared(); @@ -47,8 +45,8 @@ StorageObjectStorageConfigurationPtr TableFunctionObjectStorage< } template -std::vector TableFunctionObjectStorage< - Definition, Configuration>::skipAnalysisForArguments(const QueryTreeNodePtr & query_node_table_function, ContextPtr) const +std::vector TableFunctionObjectStorage::skipAnalysisForArguments( + const QueryTreeNodePtr & query_node_table_function, ContextPtr) const { auto & table_function_node = query_node_table_function->as(); auto & table_function_arguments_nodes = table_function_node.getArguments().getNodes(); @@ -64,19 +62,6 @@ std::vector TableFunctionObjectStorage< return result; } -template -void TableFunctionObjectStorage::updateStructureAndFormatArgumentsIfNeeded( - ASTs & args, const String & structure, const String & format, const ContextPtr & context) -{ - Configuration().addStructureAndFormatToArgs(args, structure, format, context); -} - -template -void 
TableFunctionObjectStorage::parseArgumentsImpl(ASTs & engine_args, const ContextPtr & local_context) -{ - StorageObjectStorageConfiguration::initialize(*getConfiguration(), engine_args, local_context, true); -} - template void TableFunctionObjectStorage::parseArguments(const ASTPtr & ast_function, ContextPtr context) { @@ -94,32 +79,16 @@ template ColumnsDescription TableFunctionObjectStorage< Definition, Configuration>::getActualTableStructure(ContextPtr context, bool is_insert_query) const { - chassert(configuration); if (configuration->structure == "auto") { context->checkAccess(getSourceAccessType()); - auto storage = getObjectStorage(context, !is_insert_query); ColumnsDescription columns; + auto storage = getObjectStorage(context, !is_insert_query); resolveSchemaAndFormat(columns, configuration->format, storage, configuration, std::nullopt, context); return columns; } - - return parseColumnsListFromString(configuration->structure, context); -} - -template -bool TableFunctionObjectStorage< - Definition, Configuration>::supportsReadingSubsetOfColumns(const ContextPtr & context) -{ - chassert(configuration); - return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(configuration->format, context); -} - -template -std::unordered_set TableFunctionObjectStorage< - Definition, Configuration>::getVirtualsToCheckBeforeUsingStructureHint() const -{ - return VirtualColumnUtils::getVirtualNamesForFileLikeStorage(); + else + return parseColumnsListFromString(configuration->structure, context); } template @@ -205,7 +174,7 @@ void registerTableFunctionObjectStorage(TableFunctionFactory & factory) #endif #if USE_AZURE_BLOB_STORAGE - factory.registerFunction>( + factory.registerFunction>( { .documentation = { @@ -229,8 +198,8 @@ void registerTableFunctionObjectStorage(TableFunctionFactory & factory) } #if USE_AZURE_BLOB_STORAGE -template class TableFunctionObjectStorage; -template class TableFunctionObjectStorage; +template class TableFunctionObjectStorage; +template class TableFunctionObjectStorage; #endif #if USE_AWS_S3 diff --git a/src/TableFunctions/TableFunctionObjectStorage.h b/src/TableFunctions/TableFunctionObjectStorage.h index bd43cae3697..bbc40cc6191 100644 --- a/src/TableFunctions/TableFunctionObjectStorage.h +++ b/src/TableFunctions/TableFunctionObjectStorage.h @@ -1,19 +1,18 @@ #pragma once #include "config.h" - #include -#include -#include +#include #include - +#include +#include namespace DB { class Context; class StorageS3Configuration; -class StorageAzureBlobConfiguration; +class StorageAzureConfiguration; class StorageHDFSConfiguration; struct S3StorageSettings; struct AzureStorageSettings; @@ -104,20 +103,32 @@ public: void setStructureHint(const ColumnsDescription & structure_hint_) override { structure_hint = structure_hint_; } - bool supportsReadingSubsetOfColumns(const ContextPtr & context) override; + bool supportsReadingSubsetOfColumns(const ContextPtr & context) override + { + return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(configuration->format, context); + } - std::unordered_set getVirtualsToCheckBeforeUsingStructureHint() const override; + std::unordered_set getVirtualsToCheckBeforeUsingStructureHint() const override + { + return VirtualColumnUtils::getVirtualNamesForFileLikeStorage(); + } - virtual void parseArgumentsImpl(ASTs & args, const ContextPtr & context); + virtual void parseArgumentsImpl(ASTs & args, const ContextPtr & context) + { + StorageObjectStorage::Configuration::initialize(*getConfiguration(), args, context, true); + } 
static void updateStructureAndFormatArgumentsIfNeeded( ASTs & args, const String & structure, const String & format, - const ContextPtr & context); + const ContextPtr & context) + { + Configuration().addStructureAndFormatToArgs(args, structure, format, context); + } protected: - using ConfigurationPtr = StorageObjectStorageConfigurationPtr; + using ConfigurationPtr = StorageObjectStorage::ConfigurationPtr; StoragePtr executeImpl( const ASTPtr & ast_function, @@ -146,7 +157,7 @@ using TableFunctionS3 = TableFunctionObjectStorage; +using TableFunctionAzureBlob = TableFunctionObjectStorage; #endif #if USE_HDFS diff --git a/src/TableFunctions/TableFunctionObjectStorageCluster.cpp b/src/TableFunctions/TableFunctionObjectStorageCluster.cpp index ce78076dd21..449bd2c8c49 100644 --- a/src/TableFunctions/TableFunctionObjectStorageCluster.cpp +++ b/src/TableFunctions/TableFunctionObjectStorageCluster.cpp @@ -8,7 +8,7 @@ #include #include #include -#include +#include namespace DB @@ -109,7 +109,7 @@ template class TableFunctionObjectStorageCluster; +template class TableFunctionObjectStorageCluster; #endif #if USE_HDFS diff --git a/src/TableFunctions/TableFunctionObjectStorageCluster.h b/src/TableFunctions/TableFunctionObjectStorageCluster.h index a8bc11b5e40..76786fafe99 100644 --- a/src/TableFunctions/TableFunctionObjectStorageCluster.h +++ b/src/TableFunctions/TableFunctionObjectStorageCluster.h @@ -13,7 +13,7 @@ class Context; class StorageS3Settings; class StorageAzureBlobSettings; class StorageS3Configuration; -class StorageAzureBlobConfiguration; +class StorageAzureConfiguration; struct AzureClusterDefinition { @@ -90,7 +90,7 @@ using TableFunctionS3Cluster = TableFunctionObjectStorageCluster; +using TableFunctionAzureBlobCluster = TableFunctionObjectStorageCluster; #endif #if USE_HDFS diff --git a/src/TableFunctions/registerTableFunctions.cpp b/src/TableFunctions/registerTableFunctions.cpp index 5e0bc3267d8..26b9a771416 100644 --- a/src/TableFunctions/registerTableFunctions.cpp +++ b/src/TableFunctions/registerTableFunctions.cpp @@ -29,18 +29,6 @@ void registerTableFunctions() registerTableFunctionFuzzJSON(factory); #endif -#if USE_AWS_S3 - // registerTableFunctionS3Cluster(factory); - // registerTableFunctionHudi(factory); -#if USE_PARQUET - // registerTableFunctionDeltaLake(factory); -#endif -#if USE_AVRO - // registerTableFunctionIceberg(factory); -#endif - -#endif - #if USE_HIVE registerTableFunctionHive(factory); #endif diff --git a/tests/integration/test_checking_s3_blobs_paranoid/configs/inf_s3_retries.xml b/tests/integration/test_checking_s3_blobs_paranoid/configs/inf_s3_retries.xml index 4210c13b727..7df7b56b3b4 100644 --- a/tests/integration/test_checking_s3_blobs_paranoid/configs/inf_s3_retries.xml +++ b/tests/integration/test_checking_s3_blobs_paranoid/configs/inf_s3_retries.xml @@ -5,6 +5,7 @@ 1000000 1 + 0 diff --git a/tests/integration/test_checking_s3_blobs_paranoid/configs/s3_retries.xml b/tests/integration/test_checking_s3_blobs_paranoid/configs/s3_retries.xml index 95a313ea4f2..c1ca258f6c4 100644 --- a/tests/integration/test_checking_s3_blobs_paranoid/configs/s3_retries.xml +++ b/tests/integration/test_checking_s3_blobs_paranoid/configs/s3_retries.xml @@ -5,6 +5,7 @@ 5 0 + 0 From 0db76bf631475c6a7647096baf26bfdac35cc181 Mon Sep 17 00:00:00 2001 From: avogar Date: Fri, 26 Apr 2024 18:52:49 +0000 Subject: [PATCH 086/392] Add more tests and docs, fix collecting statistics, fix prefetching columns in wide parts --- src/Columns/ColumnDynamic.cpp | 4 +- 
src/Columns/ColumnNullable.cpp | 19 ++++ src/Columns/ColumnNullable.h | 3 + src/DataTypes/Serializations/ISerialization.h | 37 ++++---- .../Serializations/SerializationArray.cpp | 3 +- .../Serializations/SerializationDynamic.cpp | 88 ++++++++++--------- .../SerializationDynamicElement.cpp | 35 ++++++-- .../Serializations/SerializationMap.cpp | 3 +- .../Serializations/SerializationTuple.cpp | 41 ++++----- .../Serializations/SerializationVariant.cpp | 24 ++--- .../SerializationVariantElement.cpp | 84 ++++++++---------- .../MergeTree/MergeTreeReaderWide.cpp | 9 +- src/Storages/MergeTree/MutateTask.cpp | 16 +--- .../03034_dynamic_conversions.reference | 25 ++++++ .../0_stateless/03034_dynamic_conversions.sql | 10 +++ .../03037_dynamic_merges_1.reference | 18 ++-- .../0_stateless/03037_dynamic_merges_1.sh | 17 ++-- .../03037_dynamic_merges_2.reference | 20 +++++ .../0_stateless/03037_dynamic_merges_2.sh | 2 +- ... => 03040_dynamic_type_alters.sh.disabled} | 0 20 files changed, 275 insertions(+), 183 deletions(-) create mode 100644 tests/queries/0_stateless/03037_dynamic_merges_2.reference rename tests/queries/0_stateless/{03040_dynamic_type_alters.sh => 03040_dynamic_type_alters.sh.disabled} (100%) diff --git a/src/Columns/ColumnDynamic.cpp b/src/Columns/ColumnDynamic.cpp index 293055b43fc..3074504973a 100644 --- a/src/Columns/ColumnDynamic.cpp +++ b/src/Columns/ColumnDynamic.cpp @@ -687,7 +687,7 @@ void ColumnDynamic::takeDynamicStructureFromSourceColumns(const DB::Columns & so } size_t size = source_statistics.data.empty() ? source_variant_column.getVariantByGlobalDiscriminator(i).size() : source_statistics.data.at(variant_name); -// LOG_DEBUG(getLogger("ColumnDynamic"), "Source variant: {}. Variant: {}. Size: {}", source_variant_info.variant_name, variant_name, size); + LOG_DEBUG(getLogger("ColumnDynamic"), "Source variant: {}. Variant: {}. Size: {}", source_variant_info.variant_name, variant_name, size); it->second += size; } } @@ -701,7 +701,7 @@ void ColumnDynamic::takeDynamicStructureFromSourceColumns(const DB::Columns & so variants_with_sizes.reserve(all_variants.size()); for (const auto & variant : all_variants) { -// LOG_DEBUG(getLogger("ColumnDynamic"), "Variant: {}. Size: {}", variant->getName(), total_sizes[variant->getName()]); + LOG_DEBUG(getLogger("ColumnDynamic"), "Variant: {}. 
Size: {}", variant->getName(), total_sizes[variant->getName()]); variants_with_sizes.emplace_back(total_sizes[variant->getName()], variant); } std::sort(variants_with_sizes.begin(), variants_with_sizes.end(), std::greater()); diff --git a/src/Columns/ColumnNullable.cpp b/src/Columns/ColumnNullable.cpp index 4474816601e..011f3702bdf 100644 --- a/src/Columns/ColumnNullable.cpp +++ b/src/Columns/ColumnNullable.cpp @@ -900,4 +900,23 @@ ColumnPtr makeNullableOrLowCardinalityNullableSafe(const ColumnPtr & column) return column; } +ColumnPtr removeNullable(const ColumnPtr & column) +{ + if (const auto * column_nullable = typeid_cast(column.get())) + return column_nullable->getNestedColumnPtr(); + return column; +} + +ColumnPtr removeNullableOrLowCardinalityNullable(const ColumnPtr & column) +{ + if (const auto * column_low_cardinality = typeid_cast(column.get())) + { + if (!column_low_cardinality->nestedIsNullable()) + return column; + return column_low_cardinality->cloneWithDefaultOnNull(); + } + + return removeNullable(column); +} + } diff --git a/src/Columns/ColumnNullable.h b/src/Columns/ColumnNullable.h index 73bd75527f8..4e6f05b35ec 100644 --- a/src/Columns/ColumnNullable.h +++ b/src/Columns/ColumnNullable.h @@ -210,4 +210,7 @@ ColumnPtr makeNullableSafe(const ColumnPtr & column); ColumnPtr makeNullableOrLowCardinalityNullable(const ColumnPtr & column); ColumnPtr makeNullableOrLowCardinalityNullableSafe(const ColumnPtr & column); +ColumnPtr removeNullable(const ColumnPtr & column); +ColumnPtr removeNullableOrLowCardinalityNullable(const ColumnPtr & column); + } diff --git a/src/DataTypes/Serializations/ISerialization.h b/src/DataTypes/Serializations/ISerialization.h index 65493cf6dda..ddbed34f614 100644 --- a/src/DataTypes/Serializations/ISerialization.h +++ b/src/DataTypes/Serializations/ISerialization.h @@ -99,6 +99,19 @@ public: using SubcolumnCreatorPtr = std::shared_ptr; + struct SerializeBinaryBulkState + { + virtual ~SerializeBinaryBulkState() = default; + }; + + struct DeserializeBinaryBulkState + { + virtual ~DeserializeBinaryBulkState() = default; + }; + + using SerializeBinaryBulkStatePtr = std::shared_ptr; + using DeserializeBinaryBulkStatePtr = std::shared_ptr; + struct SubstreamData { SubstreamData() = default; @@ -125,10 +138,17 @@ public: return *this; } + SubstreamData & withDeserializePrefix(DeserializeBinaryBulkStatePtr deserialize_prefix_state_) + { + deserialize_prefix_state = std::move(deserialize_prefix_state_); + return *this; + } + SerializationPtr serialization; DataTypePtr type; ColumnPtr column; SerializationInfoPtr serialization_info; + DeserializeBinaryBulkStatePtr deserialize_prefix_state; }; struct Substream @@ -221,21 +241,6 @@ public: using OutputStreamGetter = std::function; using InputStreamGetter = std::function; - struct SerializeBinaryBulkState - { - virtual ~SerializeBinaryBulkState() = default; - }; - - struct DeserializeBinaryBulkState - { - virtual ~DeserializeBinaryBulkState() = default; - }; - - using SerializeBinaryBulkStatePtr = std::shared_ptr; - using DeserializeBinaryBulkStatePtr = std::shared_ptr; - - using SubstreamsDeserializeStatesCache = std::unordered_map; - struct SerializeBinaryBulkSettings { OutputStreamGetter getter; @@ -285,6 +290,8 @@ public: SerializeBinaryBulkSettings & /*settings*/, SerializeBinaryBulkStatePtr & /*state*/) const {} + using SubstreamsDeserializeStatesCache = std::unordered_map; + /// Call before before deserializeBinaryBulkWithMultipleStreams chain to get DeserializeBinaryBulkStatePtr. 
virtual void deserializeBinaryBulkStatePrefix( DeserializeBinaryBulkSettings & /*settings*/, diff --git a/src/DataTypes/Serializations/SerializationArray.cpp b/src/DataTypes/Serializations/SerializationArray.cpp index d6546b338b5..6a8555a3714 100644 --- a/src/DataTypes/Serializations/SerializationArray.cpp +++ b/src/DataTypes/Serializations/SerializationArray.cpp @@ -254,7 +254,8 @@ void SerializationArray::enumerateStreams( auto next_data = SubstreamData(nested) .withType(type_array ? type_array->getNestedType() : nullptr) .withColumn(column_array ? column_array->getDataPtr() : nullptr) - .withSerializationInfo(data.serialization_info); + .withSerializationInfo(data.serialization_info) + .withDeserializePrefix(data.deserialize_prefix_state); nested->enumerateStreams(settings, callback, next_data); settings.path.pop_back(); diff --git a/src/DataTypes/Serializations/SerializationDynamic.cpp b/src/DataTypes/Serializations/SerializationDynamic.cpp index c9fe8dd6b29..858445ed257 100644 --- a/src/DataTypes/Serializations/SerializationDynamic.cpp +++ b/src/DataTypes/Serializations/SerializationDynamic.cpp @@ -21,45 +21,6 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -void SerializationDynamic::enumerateStreams( - EnumerateStreamsSettings & settings, - const StreamCallback & callback, - const SubstreamData & data) const -{ - settings.path.push_back(Substream::DynamicStructure); - callback(settings.path); - settings.path.pop_back(); - - const auto * column_dynamic = data.column ? &assert_cast(*data.column) : nullptr; - - /// If column is nullptr, nothing to enumerate as we don't have any variants. - if (!column_dynamic) - return; - - const auto & variant_info = column_dynamic->getVariantInfo(); - auto variant_serialization = variant_info.variant_type->getDefaultSerialization(); - - settings.path.push_back(Substream::DynamicData); - auto variant_data = SubstreamData(variant_serialization) - .withType(variant_info.variant_type) - .withColumn(column_dynamic->getVariantColumnPtr()) - .withSerializationInfo(data.serialization_info); - settings.path.back().data = variant_data; - variant_serialization->enumerateStreams(settings, callback, variant_data); - settings.path.pop_back(); -} - -SerializationDynamic::DynamicStructureSerializationVersion::DynamicStructureSerializationVersion(UInt64 version) : value(static_cast(version)) -{ - checkVersion(version); -} - -void SerializationDynamic::DynamicStructureSerializationVersion::checkVersion(UInt64 version) -{ - if (version != VariantTypeName) - throw Exception(ErrorCodes::INCORRECT_DATA, "Invalid version for Dynamic structure serialization."); -} - struct SerializeBinaryBulkStateDynamic : public ISerialization::SerializeBinaryBulkState { SerializationDynamic::DynamicStructureSerializationVersion structure_version; @@ -68,10 +29,6 @@ struct SerializeBinaryBulkStateDynamic : public ISerialization::SerializeBinaryB SerializationPtr variant_serialization; ISerialization::SerializeBinaryBulkStatePtr variant_state; - /// Pointer to currently serialized dynamic column. - /// Used to calculate statistics for the whole column and not for some range. - const ColumnDynamic * current_dynamic_column = nullptr; - /// Variants statistics. Map (Variant name) -> (Variant size). 
ColumnDynamic::Statistics statistics = { .source =ColumnDynamic::Statistics::Source::READ }; @@ -91,6 +48,47 @@ struct DeserializeBinaryBulkStateDynamic : public ISerialization::DeserializeBin ISerialization::DeserializeBinaryBulkStatePtr structure_state; }; +void SerializationDynamic::enumerateStreams( + EnumerateStreamsSettings & settings, + const StreamCallback & callback, + const SubstreamData & data) const +{ + settings.path.push_back(Substream::DynamicStructure); + callback(settings.path); + settings.path.pop_back(); + + const auto * column_dynamic = data.column ? &assert_cast(*data.column) : nullptr; + const auto * deserialize_prefix_state = data.deserialize_prefix_state ? checkAndGetState(data.deserialize_prefix_state) : nullptr; + + /// If column is nullptr and we didn't deserizlize prefix yet, nothing to enumerate as we don't have any variants. + if (!column_dynamic && !deserialize_prefix_state) + return; + + const auto & variant_type = column_dynamic ? column_dynamic->getVariantInfo().variant_type : checkAndGetState(deserialize_prefix_state->structure_state)->variant_type; + auto variant_serialization = variant_type->getDefaultSerialization(); + + settings.path.push_back(Substream::DynamicData); + auto variant_data = SubstreamData(variant_serialization) + .withType(variant_type) + .withColumn(column_dynamic ? column_dynamic->getVariantColumnPtr() : nullptr) + .withSerializationInfo(data.serialization_info) + .withDeserializePrefix(deserialize_prefix_state ? deserialize_prefix_state->variant_state : nullptr); + settings.path.back().data = variant_data; + variant_serialization->enumerateStreams(settings, callback, variant_data); + settings.path.pop_back(); +} + +SerializationDynamic::DynamicStructureSerializationVersion::DynamicStructureSerializationVersion(UInt64 version) : value(static_cast(version)) +{ + checkVersion(version); +} + +void SerializationDynamic::DynamicStructureSerializationVersion::checkVersion(UInt64 version) +{ + if (version != VariantTypeName) + throw Exception(ErrorCodes::INCORRECT_DATA, "Invalid version for Dynamic structure serialization."); +} + void SerializationDynamic::serializeBinaryBulkStatePrefix( const DB::IColumn & column, SerializeBinaryBulkSettings & settings, @@ -245,6 +243,10 @@ void SerializationDynamic::serializeBinaryBulkWithMultipleStreams( if (!variant_info.variant_type->equals(*dynamic_state->variant_type)) throw Exception(ErrorCodes::LOGICAL_ERROR, "Mismatch of internal columns of Dynamic. Expected: {}, Got: {}", dynamic_state->variant_type->getName(), variant_info.variant_type->getName()); + /// Update statistics. 
+ if (offset == 0) + dynamic_state->updateStatistics(*variant_column); + settings.path.push_back(Substream::DynamicData); dynamic_state->variant_serialization->serializeBinaryBulkWithMultipleStreams(*variant_column, offset, limit, settings, dynamic_state->variant_state); settings.path.pop_back(); diff --git a/src/DataTypes/Serializations/SerializationDynamicElement.cpp b/src/DataTypes/Serializations/SerializationDynamicElement.cpp index 386a6579519..9be9802d926 100644 --- a/src/DataTypes/Serializations/SerializationDynamicElement.cpp +++ b/src/DataTypes/Serializations/SerializationDynamicElement.cpp @@ -14,17 +14,41 @@ namespace ErrorCodes extern const int NOT_IMPLEMENTED; } + +struct DeserializeBinaryBulkStateDynamicElement : public ISerialization::DeserializeBinaryBulkState +{ + ISerialization::DeserializeBinaryBulkStatePtr structure_state; + SerializationPtr variant_serialization; + ISerialization::DeserializeBinaryBulkStatePtr variant_element_state; +}; + void SerializationDynamicElement::enumerateStreams( DB::ISerialization::EnumerateStreamsSettings & settings, const DB::ISerialization::StreamCallback & callback, - const DB::ISerialization::SubstreamData &) const + const DB::ISerialization::SubstreamData & data) const { settings.path.push_back(Substream::DynamicStructure); callback(settings.path); settings.path.pop_back(); - /// We don't know if we have actually have this variant in Dynamic column, + /// If we didn't deserialize prefix yet, we don't know if we actually have this variant in Dynamic column, /// so we cannot enumerate variant streams. + if (!data.deserialize_prefix_state) + return; + + auto * deserialize_prefix_state = checkAndGetState(data.deserialize_prefix_state); + /// If we don't have this variant, no need to enumerate streams for it as we won't read from any stream. 
+ if (!deserialize_prefix_state->variant_serialization) + return; + + settings.path.push_back(Substream::DynamicData); + auto variant_data = SubstreamData(deserialize_prefix_state->variant_serialization) + .withType(data.type) + .withColumn(data.column) + .withSerializationInfo(data.serialization_info) + .withDeserializePrefix(deserialize_prefix_state->variant_element_state); + deserialize_prefix_state->variant_serialization->enumerateStreams(settings, callback, variant_data); + settings.path.pop_back(); } void SerializationDynamicElement::serializeBinaryBulkStatePrefix(const IColumn &, SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const @@ -39,13 +63,6 @@ void SerializationDynamicElement::serializeBinaryBulkStateSuffix(SerializeBinary ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinaryBulkStateSuffix is not implemented for SerializationDynamicElement"); } -struct DeserializeBinaryBulkStateDynamicElement : public ISerialization::DeserializeBinaryBulkState -{ - ISerialization::DeserializeBinaryBulkStatePtr structure_state; - SerializationPtr variant_serialization; - ISerialization::DeserializeBinaryBulkStatePtr variant_element_state; -}; - void SerializationDynamicElement::deserializeBinaryBulkStatePrefix( DeserializeBinaryBulkSettings & settings, DeserializeBinaryBulkStatePtr & state, SubstreamsDeserializeStatesCache * cache) const { diff --git a/src/DataTypes/Serializations/SerializationMap.cpp b/src/DataTypes/Serializations/SerializationMap.cpp index dac4fbe88e0..cda82f31820 100644 --- a/src/DataTypes/Serializations/SerializationMap.cpp +++ b/src/DataTypes/Serializations/SerializationMap.cpp @@ -398,7 +398,8 @@ void SerializationMap::enumerateStreams( auto next_data = SubstreamData(nested) .withType(data.type ? assert_cast(*data.type).getNestedType() : nullptr) .withColumn(data.column ? assert_cast(*data.column).getNestedColumnPtr() : nullptr) - .withSerializationInfo(data.serialization_info); + .withSerializationInfo(data.serialization_info) + .withDeserializePrefix(data.deserialize_prefix_state); nested->enumerateStreams(settings, callback, next_data); } diff --git a/src/DataTypes/Serializations/SerializationTuple.cpp b/src/DataTypes/Serializations/SerializationTuple.cpp index bb7c19aa78d..6e4b4c4c533 100644 --- a/src/DataTypes/Serializations/SerializationTuple.cpp +++ b/src/DataTypes/Serializations/SerializationTuple.cpp @@ -549,26 +549,6 @@ bool SerializationTuple::tryDeserializeTextCSV(IColumn & column, ReadBuffer & is return tryDeserializeText(column, rb, settings, true); } -void SerializationTuple::enumerateStreams( - EnumerateStreamsSettings & settings, - const StreamCallback & callback, - const SubstreamData & data) const -{ - const auto * type_tuple = data.type ? &assert_cast(*data.type) : nullptr; - const auto * column_tuple = data.column ? &assert_cast(*data.column) : nullptr; - const auto * info_tuple = data.serialization_info ? &assert_cast(*data.serialization_info) : nullptr; - - for (size_t i = 0; i < elems.size(); ++i) - { - auto next_data = SubstreamData(elems[i]) - .withType(type_tuple ? type_tuple->getElement(i) : nullptr) - .withColumn(column_tuple ? column_tuple->getColumnPtr(i) : nullptr) - .withSerializationInfo(info_tuple ? 
info_tuple->getElementInfo(i) : nullptr); - - elems[i]->enumerateStreams(settings, callback, next_data); - } -} - struct SerializeBinaryBulkStateTuple : public ISerialization::SerializeBinaryBulkState { std::vector states; @@ -579,6 +559,27 @@ struct DeserializeBinaryBulkStateTuple : public ISerialization::DeserializeBinar std::vector states; }; +void SerializationTuple::enumerateStreams( + EnumerateStreamsSettings & settings, + const StreamCallback & callback, + const SubstreamData & data) const +{ + const auto * type_tuple = data.type ? &assert_cast(*data.type) : nullptr; + const auto * column_tuple = data.column ? &assert_cast(*data.column) : nullptr; + const auto * info_tuple = data.serialization_info ? &assert_cast(*data.serialization_info) : nullptr; + const auto * tuple_deserialize_prefix_state = data.deserialize_prefix_state ? checkAndGetState(data.deserialize_prefix_state) : nullptr; + + for (size_t i = 0; i < elems.size(); ++i) + { + auto next_data = SubstreamData(elems[i]) + .withType(type_tuple ? type_tuple->getElement(i) : nullptr) + .withColumn(column_tuple ? column_tuple->getColumnPtr(i) : nullptr) + .withSerializationInfo(info_tuple ? info_tuple->getElementInfo(i) : nullptr) + .withDeserializePrefix(tuple_deserialize_prefix_state ? tuple_deserialize_prefix_state->states[i] : nullptr); + + elems[i]->enumerateStreams(settings, callback, next_data); + } +} void SerializationTuple::serializeBinaryBulkStatePrefix( const IColumn & column, diff --git a/src/DataTypes/Serializations/SerializationVariant.cpp b/src/DataTypes/Serializations/SerializationVariant.cpp index 3fe26b773e3..8e0ef112444 100644 --- a/src/DataTypes/Serializations/SerializationVariant.cpp +++ b/src/DataTypes/Serializations/SerializationVariant.cpp @@ -28,6 +28,16 @@ namespace ErrorCodes extern const int INCORRECT_DATA; } +struct SerializeBinaryBulkStateVariant : public ISerialization::SerializeBinaryBulkState +{ + std::vector states; +}; + +struct DeserializeBinaryBulkStateVariant : public ISerialization::DeserializeBinaryBulkState +{ + std::vector states; +}; + void SerializationVariant::enumerateStreams( EnumerateStreamsSettings & settings, const StreamCallback & callback, @@ -35,6 +45,7 @@ void SerializationVariant::enumerateStreams( { const auto * type_variant = data.type ? &assert_cast(*data.type) : nullptr; const auto * column_variant = data.column ? &assert_cast(*data.column) : nullptr; + const auto * variant_deserialize_prefix_state = data.deserialize_prefix_state ? checkAndGetState(data.deserialize_prefix_state) : nullptr; auto discriminators_serialization = std::make_shared(std::make_shared>(), "discr", SubstreamType::NamedVariantDiscriminators); auto local_discriminators = column_variant ? column_variant->getLocalDiscriminatorsPtr() : nullptr; @@ -59,7 +70,8 @@ void SerializationVariant::enumerateStreams( auto variant_data = SubstreamData(variants[i]) .withType(type_variant ? type_variant->getVariant(i) : nullptr) .withColumn(column_variant ? column_variant->getVariantPtrByGlobalDiscriminator(i) : nullptr) - .withSerializationInfo(data.serialization_info); + .withSerializationInfo(data.serialization_info) + .withDeserializePrefix(variant_deserialize_prefix_state ? 
variant_deserialize_prefix_state->states[i] : nullptr); addVariantElementToPath(settings.path, i); settings.path.back().data = variant_data; @@ -70,16 +82,6 @@ void SerializationVariant::enumerateStreams( settings.path.pop_back(); } -struct SerializeBinaryBulkStateVariant : public ISerialization::SerializeBinaryBulkState -{ - std::vector states; -}; - -struct DeserializeBinaryBulkStateVariant : public ISerialization::DeserializeBinaryBulkState -{ - std::vector states; -}; - void SerializationVariant::serializeBinaryBulkStatePrefix( const IColumn & column, SerializeBinaryBulkSettings & settings, diff --git a/src/DataTypes/Serializations/SerializationVariantElement.cpp b/src/DataTypes/Serializations/SerializationVariantElement.cpp index 4f120ecac06..0e1ad81ce5b 100644 --- a/src/DataTypes/Serializations/SerializationVariantElement.cpp +++ b/src/DataTypes/Serializations/SerializationVariantElement.cpp @@ -12,34 +12,6 @@ namespace ErrorCodes extern const int NOT_IMPLEMENTED; } -void SerializationVariantElement::enumerateStreams( - DB::ISerialization::EnumerateStreamsSettings & settings, - const DB::ISerialization::StreamCallback & callback, - const DB::ISerialization::SubstreamData & data) const -{ - /// We will need stream for discriminators during deserialization. - settings.path.push_back(Substream::VariantDiscriminators); - callback(settings.path); - settings.path.pop_back(); - - addVariantToPath(settings.path); - settings.path.back().data = data; - nested_serialization->enumerateStreams(settings, callback, data); - removeVariantFromPath(settings.path); -} - -void SerializationVariantElement::serializeBinaryBulkStatePrefix(const IColumn &, SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const -{ - throw Exception( - ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinaryBulkStatePrefix is not implemented for SerializationVariantElement"); -} - -void SerializationVariantElement::serializeBinaryBulkStateSuffix(SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const -{ - throw Exception( - ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinaryBulkStateSuffix is not implemented for SerializationVariantElement"); -} - struct DeserializeBinaryBulkStateVariantElement : public ISerialization::DeserializeBinaryBulkState { /// During deserialization discriminators and variant streams can be shared. @@ -56,6 +28,40 @@ struct DeserializeBinaryBulkStateVariantElement : public ISerialization::Deseria ISerialization::DeserializeBinaryBulkStatePtr variant_element_state; }; +void SerializationVariantElement::enumerateStreams( + DB::ISerialization::EnumerateStreamsSettings & settings, + const DB::ISerialization::StreamCallback & callback, + const DB::ISerialization::SubstreamData & data) const +{ + /// We will need stream for discriminators during deserialization. + settings.path.push_back(Substream::VariantDiscriminators); + callback(settings.path); + settings.path.pop_back(); + + const auto * deserialize_prefix_state = data.deserialize_prefix_state ? checkAndGetState(data.deserialize_prefix_state) : nullptr; + addVariantToPath(settings.path); + auto nested_data = SubstreamData(nested_serialization) + .withType(data.type ? removeNullableOrLowCardinalityNullable(data.type) : nullptr) + .withColumn(data.column ? removeNullableOrLowCardinalityNullable(data.column) : nullptr) + .withSerializationInfo(data.serialization_info) + .withDeserializePrefix(deserialize_prefix_state ? 
deserialize_prefix_state->variant_element_state : nullptr); + settings.path.back().data = data; + nested_serialization->enumerateStreams(settings, callback, data); + removeVariantFromPath(settings.path); +} + +void SerializationVariantElement::serializeBinaryBulkStatePrefix(const IColumn &, SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const +{ + throw Exception( + ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinaryBulkStatePrefix is not implemented for SerializationVariantElement"); +} + +void SerializationVariantElement::serializeBinaryBulkStateSuffix(SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const +{ + throw Exception( + ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinaryBulkStateSuffix is not implemented for SerializationVariantElement"); +} + void SerializationVariantElement::deserializeBinaryBulkStatePrefix( DeserializeBinaryBulkSettings & settings, DeserializeBinaryBulkStatePtr & state, SubstreamsDeserializeStatesCache * cache) const { @@ -82,7 +88,6 @@ void SerializationVariantElement::deserializeBinaryBulkWithMultipleStreams( { auto * variant_element_state = checkAndGetState(state); - size_t variant_limit = 0; /// First, deserialize discriminators from Variant column. settings.path.push_back(Substream::VariantDiscriminators); if (auto cached_discriminators = getFromSubstreamsCache(cache, settings.path)) @@ -99,30 +104,17 @@ void SerializationVariantElement::deserializeBinaryBulkWithMultipleStreams( if (!variant_element_state->discriminators || result_column->empty()) variant_element_state->discriminators = ColumnVariant::ColumnDiscriminators::create(); -// ColumnVariant::Discriminator discr; -// readBinaryLittleEndian(discr, *discriminators_stream); -// if (discr == ColumnVariant::NULL_DISCRIMINATOR) -// { SerializationNumber().deserializeBinaryBulk(*variant_element_state->discriminators->assumeMutable(), *discriminators_stream, limit, 0); -// } -// else -// { -// auto & discriminators_data = assert_cast(*variant_element_state->discriminators->assumeMutable()).getData(); -// discriminators_data.resize_fill(discriminators_data.size() + limit, discr); -// } - addToSubstreamsCache(cache, settings.path, variant_element_state->discriminators); } settings.path.pop_back(); + /// Iterate through new discriminators to calculate the limit for our variant. const auto & discriminators_data = assert_cast(*variant_element_state->discriminators).getData(); size_t discriminators_offset = variant_element_state->discriminators->size() - limit; - /// Iterate through new discriminators to calculate the limit for our variant. - if (!variant_limit) - { - for (size_t i = discriminators_offset; i != discriminators_data.size(); ++i) - variant_limit += (discriminators_data[i] == variant_discriminator); - } + size_t variant_limit = 0; + for (size_t i = discriminators_offset; i != discriminators_data.size(); ++i) + variant_limit += (discriminators_data[i] == variant_discriminator); /// Now we know the limit for our variant and can deserialize it. 
diff --git a/src/Storages/MergeTree/MergeTreeReaderWide.cpp b/src/Storages/MergeTree/MergeTreeReaderWide.cpp index c8bf12436b0..d18d5eec975 100644 --- a/src/Storages/MergeTree/MergeTreeReaderWide.cpp +++ b/src/Storages/MergeTree/MergeTreeReaderWide.cpp @@ -334,8 +334,7 @@ void MergeTreeReaderWide::prefetchForColumn( ISerialization::SubstreamsDeserializeStatesCache & deserialize_states_cache) { deserializePrefix(serialization, name_and_type, current_task_last_mark, cache, deserialize_states_cache); - - serialization->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path) + auto callback = [&](const ISerialization::SubstreamPath & substream_path) { auto stream_name = IMergeTreeDataPart::getStreamNameForColumn(name_and_type, substream_path, data_part_info_for_read->getChecksums()); @@ -348,7 +347,11 @@ void MergeTreeReaderWide::prefetchForColumn( prefetched_streams.insert(*stream_name); } } - }); + }; + + auto data = ISerialization::SubstreamData(serialization).withType(name_and_type.type).withDeserializePrefix(deserialize_binary_bulk_state_map[name_and_type.name]); + ISerialization::EnumerateStreamsSettings settings; + serialization->enumerateStreams(settings, callback, data); } diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index fb3e318687a..5e388d6a8ac 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -779,13 +779,7 @@ static NameToNameVector collectFilesForRenames( }; if (auto serialization = source_part->tryGetSerialization(command.column_name)) - { - auto name_and_type = source_part->getColumn(command.column_name); - ColumnPtr column_sample; - if (name_and_type.type->hasDynamicSubcolumns()) - column_sample = source_part->readColumnSample(name_and_type); - serialization->enumerateStreams(callback, name_and_type.type, column_sample); - } + serialization->enumerateStreams(callback); /// if we drop a column with statistic, we should also drop the stat file. if (source_part->checksums.has(STAT_FILE_PREFIX + command.column_name + STAT_FILE_SUFFIX)) @@ -821,13 +815,7 @@ static NameToNameVector collectFilesForRenames( }; if (auto serialization = source_part->tryGetSerialization(command.column_name)) - { - auto name_and_type = source_part->getColumn(command.column_name); - ColumnPtr column_sample; - if (name_and_type.type->hasDynamicSubcolumns()) - column_sample = source_part->readColumnSample(name_and_type); - serialization->enumerateStreams(callback, name_and_type.type, column_sample); - } + serialization->enumerateStreams(callback); /// if we rename a column with statistic, we should also rename the stat file. 
if (source_part->checksums.has(STAT_FILE_PREFIX + command.column_name + STAT_FILE_SUFFIX)) diff --git a/tests/queries/0_stateless/03034_dynamic_conversions.reference b/tests/queries/0_stateless/03034_dynamic_conversions.reference index af91add9ddd..45f94f7ecc4 100644 --- a/tests/queries/0_stateless/03034_dynamic_conversions.reference +++ b/tests/queries/0_stateless/03034_dynamic_conversions.reference @@ -61,3 +61,28 @@ str_5 String \N None 4 UInt64 1970-01-06 Date +0 +42 +42.42 +1 +0 +\N +42 +42.42 +1 +0 + +42 +42.42 +true +e10 +\N +42 +42.42 +true +e10 +\N +42 +\N +1 +\N diff --git a/tests/queries/0_stateless/03034_dynamic_conversions.sql b/tests/queries/0_stateless/03034_dynamic_conversions.sql index e9b4944f5d8..ed75fbf2377 100644 --- a/tests/queries/0_stateless/03034_dynamic_conversions.sql +++ b/tests/queries/0_stateless/03034_dynamic_conversions.sql @@ -22,3 +22,13 @@ select multiIf(number % 4 == 0, number, number % 4 == 1, 'str_' || toString(numb select multiIf(number % 4 == 0, number, number % 4 == 1, toDate(number), number % 4 == 2, range(number), NULL)::Dynamic(max_types=4)::Dynamic(max_types=3) as d, dynamicType(d) from numbers(6); +create table test (d Dynamic) engine = Memory; +insert into test values (NULL), (42), ('42.42'), (true), ('e10'); +select d::Float64 from test; +select d::Nullable(Float64) from test; +select d::String from test; +select d::Nullable(String) from test; +select d::UInt64 from test; -- {serverError CANNOT_PARSE_TEXT} +select d::Nullable(UInt64) from test; +select d::Date from test; -- {serverError CANNOT_PARSE_DATE} + diff --git a/tests/queries/0_stateless/03037_dynamic_merges_1.reference b/tests/queries/0_stateless/03037_dynamic_merges_1.reference index fff812f0396..0a647b41c4b 100644 --- a/tests/queries/0_stateless/03037_dynamic_merges_1.reference +++ b/tests/queries/0_stateless/03037_dynamic_merges_1.reference @@ -1,5 +1,5 @@ MergeTree compact + horizontal merge -test1 +test 50000 DateTime 60000 Date 70000 Array(UInt16) @@ -20,8 +20,8 @@ test1 200000 Map(UInt64, UInt64) 260000 String 10000 Tuple(UInt64, UInt64) -100000 UInt64 100000 None +100000 UInt64 200000 Map(UInt64, UInt64) 260000 String 100000 None @@ -29,7 +29,7 @@ test1 200000 Map(UInt64, UInt64) 270000 String MergeTree wide + horizontal merge -test1 +test 50000 DateTime 60000 Date 70000 Array(UInt16) @@ -41,8 +41,8 @@ test1 100000 UInt64 190000 String 70000 Array(UInt16) -100000 UInt64 100000 None +100000 UInt64 190000 String 200000 Map(UInt64, UInt64) 100000 None @@ -50,8 +50,8 @@ test1 200000 Map(UInt64, UInt64) 260000 String 10000 Tuple(UInt64, UInt64) -100000 UInt64 100000 None +100000 UInt64 200000 Map(UInt64, UInt64) 260000 String 100000 None @@ -59,7 +59,7 @@ test1 200000 Map(UInt64, UInt64) 270000 String MergeTree compact + vertical merge -test1 +test 50000 DateTime 60000 Date 70000 Array(UInt16) @@ -71,8 +71,8 @@ test1 100000 UInt64 190000 String 70000 Array(UInt16) -100000 UInt64 100000 None +100000 UInt64 190000 String 200000 Map(UInt64, UInt64) 100000 None @@ -84,12 +84,12 @@ test1 100000 UInt64 200000 Map(UInt64, UInt64) 260000 String -100000 UInt64 100000 None +100000 UInt64 200000 Map(UInt64, UInt64) 270000 String MergeTree wide + vertical merge -test1 +test 50000 DateTime 60000 Date 70000 Array(UInt16) diff --git a/tests/queries/0_stateless/03037_dynamic_merges_1.sh b/tests/queries/0_stateless/03037_dynamic_merges_1.sh index cf524fb9393..056f6702727 100755 --- a/tests/queries/0_stateless/03037_dynamic_merges_1.sh +++ b/tests/queries/0_stateless/03037_dynamic_merges_1.sh @@ 
-21,35 +21,36 @@ function test() $CH_CLIENT -q "insert into test select number, toDateTime(number) from numbers(50000)" $CH_CLIENT -q "insert into test select number, NULL from numbers(100000)" - $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count()" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" $CH_CLIENT -nm -q "system start merges test; optimize table test final;" - $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count()" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" $CH_CLIENT -q "system stop merges test" $CH_CLIENT -q "insert into test select number, map(number, number) from numbers(200000)" - $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count()" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" $CH_CLIENT -nm -q "system start merges test; optimize table test final;" - $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count()" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" $CH_CLIENT -q "system stop merges test" $CH_CLIENT -q "insert into test select number, tuple(number, number) from numbers(10000)" - $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count()" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" $CH_CLIENT -nm -q "system start merges test; optimize table test final;" - $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count()" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" } $CH_CLIENT -q "drop table if exists test;" echo "MergeTree compact + horizontal merge" -$CH_CLIENT -q "create table test (id UInt64, d Dynamic(max_types=3)) engine=MergeTree order by id settings min_rows_for_wide_part=1000000000, min_bytes_for_wide_part=10000000000;" +$CH_CLIENT -q "create table test (id UInt64, d Dynamic(max_types=3)) engine=MergeTree order by id settings min_rows_for_wide_part=1000000000, min_bytes_for_wide_part=10000000000, vertical_merge_algorithm_min_columns_to_activate=10;" test $CH_CLIENT -q "drop table test;" echo "MergeTree wide + horizontal merge" -$CH_CLIENT -q "create table test (id UInt64, d Dynamic(max_types=3)) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1;" +$CH_CLIENT -q "create table test (id UInt64, d Dynamic(max_types=3)) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1, vertical_merge_algorithm_min_columns_to_activate=10;" test $CH_CLIENT -q "drop table test;" + echo "MergeTree compact + vertical merge" $CH_CLIENT -q "create table test (id UInt64, d Dynamic(max_types=3)) engine=MergeTree order by id settings min_rows_for_wide_part=1000000000, min_bytes_for_wide_part=10000000000, vertical_merge_algorithm_min_rows_to_activate=1, vertical_merge_algorithm_min_columns_to_activate=1;" test diff --git a/tests/queries/0_stateless/03037_dynamic_merges_2.reference b/tests/queries/0_stateless/03037_dynamic_merges_2.reference new file mode 100644 index 00000000000..420b8185b16 --- /dev/null +++ 
b/tests/queries/0_stateless/03037_dynamic_merges_2.reference @@ -0,0 +1,20 @@ +MergeTree compact + horizontal merge +test +1000000 Array(UInt16) +1000000 String +1000000 UInt64 +MergeTree wide + horizontal merge +test +1000000 Array(UInt16) +1000000 String +1000000 UInt64 +MergeTree compact + vertical merge +test +1000000 Array(UInt16) +1000000 String +1000000 UInt64 +MergeTree wide + vertical merge +test +1000000 Array(UInt16) +1000000 String +1000000 UInt64 diff --git a/tests/queries/0_stateless/03037_dynamic_merges_2.sh b/tests/queries/0_stateless/03037_dynamic_merges_2.sh index e9d571c2104..40adbdd4262 100755 --- a/tests/queries/0_stateless/03037_dynamic_merges_2.sh +++ b/tests/queries/0_stateless/03037_dynamic_merges_2.sh @@ -19,7 +19,7 @@ function test() $CH_CLIENT -q "insert into test select number, range(number % 10 + 1) from numbers(2000000, 1000000)" $CH_CLIENT -nm -q "system start merges test; optimize table test final;" - $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count()" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" } $CH_CLIENT -q "drop table if exists test;" diff --git a/tests/queries/0_stateless/03040_dynamic_type_alters.sh b/tests/queries/0_stateless/03040_dynamic_type_alters.sh.disabled similarity index 100% rename from tests/queries/0_stateless/03040_dynamic_type_alters.sh rename to tests/queries/0_stateless/03040_dynamic_type_alters.sh.disabled From 671650bd2eaf2a07d5e6f517b40905c71ce798b6 Mon Sep 17 00:00:00 2001 From: kssenii Date: Sun, 28 Apr 2024 12:18:24 +0200 Subject: [PATCH 087/392] Cleanup --- src/Backups/BackupIO_AzureBlobStorage.cpp | 4 ++-- src/Storages/ObjectStorage/Azure/Configuration.h | 16 ++++++++++------ .../ObjectStorage/DataLakes/IStorageDataLake.h | 4 +--- src/Storages/ObjectStorage/HDFS/Configuration.h | 11 +++++++---- src/Storages/ObjectStorage/S3/Configuration.h | 10 ++++++---- .../ObjectStorage/StorageObjectStorage.cpp | 16 +++------------- .../ObjectStorage/StorageObjectStorage.h | 5 +---- .../ObjectStorage/StorageObjectStorageSource.cpp | 9 ++++----- .../ObjectStorage/StorageObjectStorageSource.h | 5 +---- .../registerStorageObjectStorage.cpp | 3 +-- src/Storages/S3Queue/StorageS3Queue.cpp | 6 ++---- 11 files changed, 38 insertions(+), 51 deletions(-) diff --git a/src/Backups/BackupIO_AzureBlobStorage.cpp b/src/Backups/BackupIO_AzureBlobStorage.cpp index f00da686c18..3af66e5470f 100644 --- a/src/Backups/BackupIO_AzureBlobStorage.cpp +++ b/src/Backups/BackupIO_AzureBlobStorage.cpp @@ -36,7 +36,7 @@ BackupReaderAzureBlobStorage::BackupReaderAzureBlobStorage( , data_source_description{DataSourceType::ObjectStorage, ObjectStorageType::Azure, MetadataStorageType::None, configuration_.container, false, false} , configuration(configuration_) { - auto client_ptr = configuration.createClient(/* is_read_only */ false, /* attempt_to_create_container */true); + auto client_ptr = configuration.createClient(/* is_readonly */false, /* attempt_to_create_container */true); object_storage = std::make_unique("BackupReaderAzureBlobStorage", std::move(client_ptr), configuration.createSettings(context_), @@ -121,7 +121,7 @@ BackupWriterAzureBlobStorage::BackupWriterAzureBlobStorage( , data_source_description{DataSourceType::ObjectStorage, ObjectStorageType::Azure, MetadataStorageType::None, configuration_.container, false, false} , configuration(configuration_) { - auto client_ptr = configuration.createClient(/* is_read_only */ false, 
attempt_to_create_container); + auto client_ptr = configuration.createClient(/* is_readonly */false, attempt_to_create_container); object_storage = std::make_unique("BackupWriterAzureBlobStorage", std::move(client_ptr), configuration.createSettings(context_), diff --git a/src/Storages/ObjectStorage/Azure/Configuration.h b/src/Storages/ObjectStorage/Azure/Configuration.h index 91a9a0bbbd5..1591cb42469 100644 --- a/src/Storages/ObjectStorage/Azure/Configuration.h +++ b/src/Storages/ObjectStorage/Azure/Configuration.h @@ -3,7 +3,6 @@ #include "config.h" #if USE_AZURE_BLOB_STORAGE - #include #include #include @@ -36,20 +35,25 @@ public: const Paths & getPaths() const override { return blobs_paths; } void setPaths(const Paths & paths) override { blobs_paths = paths; } - String getDataSourceDescription() override { return std::filesystem::path(connection_url) / container; } String getNamespace() const override { return container; } + String getDataSourceDescription() override { return std::filesystem::path(connection_url) / container; } StorageObjectStorage::QuerySettings getQuerySettings(const ContextPtr &) const override; void check(ContextPtr context) const override; - ObjectStoragePtr createObjectStorage(ContextPtr context, bool is_readonly = true) override; /// NOLINT ConfigurationPtr clone() override { return std::make_shared(*this); } - void fromNamedCollection(const NamedCollection & collection) override; - void fromAST(ASTs & args, ContextPtr context, bool with_structure) override; + ObjectStoragePtr createObjectStorage(ContextPtr context, bool is_readonly) override; + void addStructureAndFormatToArgs( - ASTs & args, const String & structure_, const String & format_, ContextPtr context) override; + ASTs & args, + const String & structure_, + const String & format_, + ContextPtr context) override; protected: + void fromNamedCollection(const NamedCollection & collection) override; + void fromAST(ASTs & args, ContextPtr context, bool with_structure) override; + using AzureClient = Azure::Storage::Blobs::BlobContainerClient; using AzureClientPtr = std::unique_ptr; diff --git a/src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h b/src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h index 3119b844aaf..83865c47eb8 100644 --- a/src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h +++ b/src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h @@ -38,7 +38,7 @@ public: std::optional format_settings_, LoadingStrictnessLevel mode) { - auto object_storage = base_configuration->createObjectStorage(context); + auto object_storage = base_configuration->createObjectStorage(context, /* is_readonly */true); DataLakeMetadataPtr metadata; NamesAndTypesList schema_from_metadata; @@ -96,8 +96,6 @@ public: void updateConfiguration(ContextPtr local_context) override { - std::lock_guard lock(Storage::configuration_update_mutex); - Storage::updateConfiguration(local_context); auto new_metadata = DataLakeMetadata::create(Storage::object_storage, base_configuration, local_context); diff --git a/src/Storages/ObjectStorage/HDFS/Configuration.h b/src/Storages/ObjectStorage/HDFS/Configuration.h index cac09ee1d92..dc06e754c44 100644 --- a/src/Storages/ObjectStorage/HDFS/Configuration.h +++ b/src/Storages/ObjectStorage/HDFS/Configuration.h @@ -28,19 +28,22 @@ public: const Paths & getPaths() const override { return paths; } void setPaths(const Paths & paths_) override { paths = paths_; } + std::string getPathWithoutGlobs() const override; String getNamespace() const override { return ""; } String 
getDataSourceDescription() override { return url; } StorageObjectStorage::QuerySettings getQuerySettings(const ContextPtr &) const override; void check(ContextPtr context) const override; - ObjectStoragePtr createObjectStorage(ContextPtr context, bool is_readonly = true) override; /// NOLINT ConfigurationPtr clone() override { return std::make_shared(*this); } - void addStructureAndFormatToArgs( - ASTs & args, const String & structure_, const String & format_, ContextPtr context) override; + ObjectStoragePtr createObjectStorage(ContextPtr context, bool is_readonly) override; - std::string getPathWithoutGlobs() const override; + void addStructureAndFormatToArgs( + ASTs & args, + const String & structure_, + const String & format_, + ContextPtr context) override; private: void fromNamedCollection(const NamedCollection &) override; diff --git a/src/Storages/ObjectStorage/S3/Configuration.h b/src/Storages/ObjectStorage/S3/Configuration.h index 9eb724c4a64..b28b1c226a7 100644 --- a/src/Storages/ObjectStorage/S3/Configuration.h +++ b/src/Storages/ObjectStorage/S3/Configuration.h @@ -3,7 +3,6 @@ #include "config.h" #if USE_AWS_S3 - #include #include @@ -35,13 +34,16 @@ public: void check(ContextPtr context) const override; void validateNamespace(const String & name) const override; - ConfigurationPtr clone() override { return std::make_shared(*this); } bool isStaticConfiguration() const override { return static_configuration; } - ObjectStoragePtr createObjectStorage(ContextPtr context, bool is_readonly = true) override; /// NOLINT + ObjectStoragePtr createObjectStorage(ContextPtr context, bool is_readonly) override; + void addStructureAndFormatToArgs( - ASTs & args, const String & structure, const String & format, ContextPtr context) override; + ASTs & args, + const String & structure, + const String & format, + ContextPtr context) override; private: void fromNamedCollection(const NamedCollection & collection) override; diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index 2c9831f0d29..a187a8fc54d 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -91,6 +91,7 @@ bool StorageObjectStorage::supportsSubsetOfColumns(const ContextPtr & context) c void StorageObjectStorage::updateConfiguration(ContextPtr context) { + /// FIXME: we should be able to update everything apart from client if static_configuration == true. 
if (!configuration->isStaticConfiguration()) object_storage->applyNewSettings(context->getConfigRef(), "s3.", context); } @@ -113,7 +114,6 @@ public: const std::optional & format_settings_, bool distributed_processing_, ReadFromFormatInfo info_, - SchemaCache & schema_cache_, const bool need_only_count_, ContextPtr context_, size_t max_block_size_, @@ -121,11 +121,9 @@ public: : SourceStepWithFilter(DataStream{.header = info_.source_header}, columns_to_read, query_info_, storage_snapshot_, context_) , object_storage(object_storage_) , configuration(configuration_) - , schema_cache(schema_cache_) , info(std::move(info_)) , virtual_columns(virtual_columns_) , format_settings(format_settings_) - , query_settings(configuration->getQuerySettings(context_)) , name(name_ + "Source") , need_only_count(need_only_count_) , max_block_size(max_block_size_) @@ -154,8 +152,8 @@ public: for (size_t i = 0; i < num_streams; ++i) { auto source = std::make_shared( - getName(), object_storage, configuration, info, format_settings, query_settings, - context, max_block_size, iterator_wrapper, need_only_count, schema_cache); + getName(), object_storage, configuration, info, format_settings, + context, max_block_size, iterator_wrapper, need_only_count); source->setKeyCondition(filter_actions_dag, context); pipes.emplace_back(std::move(source)); @@ -175,12 +173,10 @@ private: ObjectStoragePtr object_storage; ConfigurationPtr configuration; std::shared_ptr iterator_wrapper; - SchemaCache & schema_cache; const ReadFromFormatInfo info; const NamesAndTypesList virtual_columns; const std::optional format_settings; - const StorageObjectStorage::QuerySettings query_settings; const String name; const bool need_only_count; const size_t max_block_size; @@ -233,7 +229,6 @@ void StorageObjectStorage::read( format_settings, distributed_processing, read_from_format_info, - getSchemaCache(local_context), need_only_count, local_context, max_block_size, @@ -371,11 +366,6 @@ std::pair StorageObjectStorage::resolveSchemaAn return std::pair(columns, format); } -SchemaCache & StorageObjectStorage::getSchemaCache(const ContextPtr & context) -{ - return getSchemaCache(context, configuration->getTypeName()); -} - SchemaCache & StorageObjectStorage::getSchemaCache(const ContextPtr & context, const std::string & storage_type_name) { if (storage_type_name == "s3") diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.h b/src/Storages/ObjectStorage/StorageObjectStorage.h index 46d422b26c2..3f8ff79ad54 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.h +++ b/src/Storages/ObjectStorage/StorageObjectStorage.h @@ -92,8 +92,6 @@ public: bool parallelizeOutputAfterReading(ContextPtr context) const override; - SchemaCache & getSchemaCache(const ContextPtr & context); - static SchemaCache & getSchemaCache(const ContextPtr & context, const std::string & storage_type_name); static ColumnsDescription resolveSchemaFromData( @@ -132,7 +130,6 @@ protected: const bool distributed_processing; LoggerPtr log; - std::mutex configuration_update_mutex; }; class StorageObjectStorage::Configuration @@ -175,7 +172,7 @@ public: virtual void check(ContextPtr context) const; virtual void validateNamespace(const String & /* name */) const {} - virtual ObjectStoragePtr createObjectStorage(ContextPtr context, bool is_readonly = true) = 0; /// NOLINT + virtual ObjectStoragePtr createObjectStorage(ContextPtr context, bool is_readonly) = 0; virtual ConfigurationPtr clone() = 0; virtual bool isStaticConfiguration() const { return true; } diff --git 
a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index b224afb7a58..cb3f732ce83 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -44,19 +44,16 @@ StorageObjectStorageSource::StorageObjectStorageSource( ConfigurationPtr configuration_, const ReadFromFormatInfo & info, std::optional format_settings_, - const StorageObjectStorage::QuerySettings & query_settings_, ContextPtr context_, UInt64 max_block_size_, std::shared_ptr file_iterator_, - bool need_only_count_, - SchemaCache & schema_cache_) + bool need_only_count_) : SourceWithKeyCondition(info.source_header, false) , WithContext(context_) , name(std::move(name_)) , object_storage(object_storage_) , configuration(configuration_) , format_settings(format_settings_) - , query_settings(query_settings_) , max_block_size(max_block_size_) , need_only_count(need_only_count_) , read_from_format_info(info) @@ -67,7 +64,7 @@ StorageObjectStorageSource::StorageObjectStorageSource( 1/* max_threads */)) , columns_desc(info.columns_description) , file_iterator(file_iterator_) - , schema_cache(schema_cache_) + , schema_cache(StorageObjectStorage::getSchemaCache(context_, configuration->getTypeName())) , create_reader_scheduler(threadPoolCallbackRunnerUnsafe(*create_reader_pool, "Reader")) { } @@ -229,6 +226,8 @@ std::optional StorageObjectStorageSource::tryGetNumRowsFromCache(const O StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReader(size_t processor) { ObjectInfoPtr object_info; + auto query_settings = configuration->getQuerySettings(getContext()); + do { object_info = file_iterator->next(processor); diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.h b/src/Storages/ObjectStorage/StorageObjectStorageSource.h index 356478422bc..a8df00bc0ac 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.h @@ -32,12 +32,10 @@ public: ConfigurationPtr configuration, const ReadFromFormatInfo & info, std::optional format_settings_, - const StorageObjectStorage::QuerySettings & query_settings_, ContextPtr context_, UInt64 max_block_size_, std::shared_ptr file_iterator_, - bool need_only_count_, - SchemaCache & schema_cache_); + bool need_only_count_); ~StorageObjectStorageSource() override; @@ -62,7 +60,6 @@ protected: ObjectStoragePtr object_storage; const ConfigurationPtr configuration; const std::optional format_settings; - const StorageObjectStorage::QuerySettings query_settings; const UInt64 max_block_size; const bool need_only_count; const ReadFromFormatInfo read_from_format_info; diff --git a/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp b/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp index c23b180215e..74c8aeaad7d 100644 --- a/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp @@ -2,7 +2,6 @@ #include #include #include -#include #include #include @@ -59,7 +58,7 @@ static std::shared_ptr createStorageObjectStorage( return std::make_shared( configuration, - configuration->createObjectStorage(context), + configuration->createObjectStorage(context, /* is_readonly */false), args.getContext(), args.table_id, args.columns, diff --git a/src/Storages/S3Queue/StorageS3Queue.cpp b/src/Storages/S3Queue/StorageS3Queue.cpp index 38934a7895a..b9c67c7d801 100644 --- 
a/src/Storages/S3Queue/StorageS3Queue.cpp +++ b/src/Storages/S3Queue/StorageS3Queue.cpp @@ -138,7 +138,7 @@ StorageS3Queue::StorageS3Queue( checkAndAdjustSettings(*s3queue_settings, context_->getSettingsRef()); - object_storage = configuration->createObjectStorage(context_); + object_storage = configuration->createObjectStorage(context_, /* is_readonly */true); FormatFactory::instance().checkFormatName(configuration->format); configuration->check(context_); @@ -361,12 +361,10 @@ std::shared_ptr StorageS3Queue::createSource( configuration, info, format_settings, - configuration->getQuerySettings(local_context), local_context, max_block_size, file_iterator, - false, - StorageObjectStorage::getSchemaCache(local_context, configuration->getTypeName())); + false); auto file_deleter = [=, this](const std::string & path) mutable { From 1ccae23170f7668b56a44cb3063e86530f32ce10 Mon Sep 17 00:00:00 2001 From: avogar Date: Mon, 29 Apr 2024 17:05:31 +0000 Subject: [PATCH 088/392] Fix alter modify column for dynamic columns, make check part work for dynamic columns, fix style errors and tests --- src/Columns/ColumnDynamic.cpp | 5 --- src/Columns/ColumnDynamic.h | 7 +--- src/Core/SettingsChangesHistory.h | 2 + src/DataTypes/DataTypeVariant.cpp | 2 - src/DataTypes/IDataType.h | 6 +++ src/DataTypes/Serializations/ISerialization.h | 10 +++-- .../Serializations/SerializationArray.cpp | 2 +- .../Serializations/SerializationDynamic.cpp | 32 ++++++-------- .../SerializationDynamicElement.cpp | 12 +++--- .../Serializations/SerializationMap.cpp | 2 +- .../Serializations/SerializationTuple.cpp | 4 +- .../Serializations/SerializationVariant.cpp | 22 ++++++++-- .../Serializations/SerializationVariant.h | 8 ++++ .../SerializationVariantElement.cpp | 4 +- src/Functions/FunctionsConversion.cpp | 29 ++++--------- src/Functions/dynamicElement.cpp | 42 +++++++++++-------- src/Functions/variantElement.cpp | 4 +- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 6 ++- src/Storages/MergeTree/IMergeTreeDataPart.h | 4 +- .../MergeTreeDataPartWriterCompact.cpp | 28 +++++++++---- .../MergeTreeDataPartWriterCompact.h | 4 +- .../MergeTree/MergeTreeDataPartWriterWide.cpp | 40 +++++++++++------- .../MergeTree/MergeTreeDataPartWriterWide.h | 4 +- .../MergeTree/MergeTreeReaderWide.cpp | 2 +- src/Storages/MergeTree/MutateTask.cpp | 21 +++++++++- src/Storages/MergeTree/checkDataPart.cpp | 2 +- ....disabled => 03040_dynamic_type_alters.sh} | 0 27 files changed, 180 insertions(+), 124 deletions(-) rename tests/queries/0_stateless/{03040_dynamic_type_alters.sh.disabled => 03040_dynamic_type_alters.sh} (100%) diff --git a/src/Columns/ColumnDynamic.cpp b/src/Columns/ColumnDynamic.cpp index 3074504973a..f3dff01af25 100644 --- a/src/Columns/ColumnDynamic.cpp +++ b/src/Columns/ColumnDynamic.cpp @@ -10,7 +10,6 @@ #include #include -#include namespace DB { @@ -687,7 +686,6 @@ void ColumnDynamic::takeDynamicStructureFromSourceColumns(const DB::Columns & so } size_t size = source_statistics.data.empty() ? source_variant_column.getVariantByGlobalDiscriminator(i).size() : source_statistics.data.at(variant_name); - LOG_DEBUG(getLogger("ColumnDynamic"), "Source variant: {}. Variant: {}. 
Size: {}", source_variant_info.variant_name, variant_name, size); it->second += size; } } @@ -700,10 +698,7 @@ void ColumnDynamic::takeDynamicStructureFromSourceColumns(const DB::Columns & so std::vector> variants_with_sizes; variants_with_sizes.reserve(all_variants.size()); for (const auto & variant : all_variants) - { - LOG_DEBUG(getLogger("ColumnDynamic"), "Variant: {}. Size: {}", variant->getName(), total_sizes[variant->getName()]); variants_with_sizes.emplace_back(total_sizes[variant->getName()], variant); - } std::sort(variants_with_sizes.begin(), variants_with_sizes.end(), std::greater()); /// Take first max_dynamic_types variants from sorted list. diff --git a/src/Columns/ColumnDynamic.h b/src/Columns/ColumnDynamic.h index 7487a5aa0db..b5167f4b9d9 100644 --- a/src/Columns/ColumnDynamic.h +++ b/src/Columns/ColumnDynamic.h @@ -9,11 +9,6 @@ namespace DB { -namespace ErrorCodes -{ - extern const int NOT_IMPLEMENTED; -} - /** * Column for storing Dynamic type values. * Dynamic column allows to insert and store values of any data types inside. @@ -340,7 +335,7 @@ private: /// Combine current variant with the other variant and return global discriminators mapping /// from other variant to the combined one. It's used for inserting from /// different variants. - /// Returns nullptr if maximum number of Variants is reached and tne new Variant cannot be created. + /// Returns nullptr if maximum number of Variants is reached and the new Variant cannot be created. std::vector * combineVariants(const VariantInfo & other_variant_info); void updateVariantInfoAndExpandVariantColumn(const DataTypePtr & new_variant_type); diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index d3b5de06e70..42cda26d73c 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -85,6 +85,8 @@ namespace SettingsChangesHistory /// It's used to implement `compatibility` setting (see https://github.com/ClickHouse/ClickHouse/issues/35972) static std::map settings_changes_history = { + {"24.5", {{"cast_string_to_dynamic_use_inference", false, false, "Add setting to allow converting String to Dynamic through parsing"}, + {"allow_experimental_dynamic_type", false, false, "Add new experimental Dynamic type"}}}, {"24.4", {{"input_format_json_throw_on_bad_escape_sequence", true, true, "Allow to save JSON strings with bad escape sequences"}, {"ignore_drop_queries_probability", 0, 0, "Allow to ignore drop queries in server with specified probability for testing purposes"}, {"lightweight_deletes_sync", 2, 2, "The same as 'mutation_sync', but controls only execution of lightweight deletes"}, diff --git a/src/DataTypes/DataTypeVariant.cpp b/src/DataTypes/DataTypeVariant.cpp index b918b79a2ed..6478bd598f1 100644 --- a/src/DataTypes/DataTypeVariant.cpp +++ b/src/DataTypes/DataTypeVariant.cpp @@ -7,7 +7,6 @@ #include #include #include -#include #include #include #include @@ -18,7 +17,6 @@ namespace DB namespace ErrorCodes { extern const int BAD_ARGUMENTS; - extern const int EMPTY_DATA_PASSED; } diff --git a/src/DataTypes/IDataType.h b/src/DataTypes/IDataType.h index dde61ca3a48..46c30240ef8 100644 --- a/src/DataTypes/IDataType.h +++ b/src/DataTypes/IDataType.h @@ -11,6 +11,12 @@ namespace DB { +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; +} + + class ReadBuffer; class WriteBuffer; diff --git a/src/DataTypes/Serializations/ISerialization.h b/src/DataTypes/Serializations/ISerialization.h index ddbed34f614..b233230f9cc 100644 --- 
a/src/DataTypes/Serializations/ISerialization.h +++ b/src/DataTypes/Serializations/ISerialization.h @@ -138,9 +138,9 @@ public: return *this; } - SubstreamData & withDeserializePrefix(DeserializeBinaryBulkStatePtr deserialize_prefix_state_) + SubstreamData & withDeserializeState(DeserializeBinaryBulkStatePtr deserialize_state_) { - deserialize_prefix_state = std::move(deserialize_prefix_state_); + deserialize_state = std::move(deserialize_state_); return *this; } @@ -148,7 +148,11 @@ public: DataTypePtr type; ColumnPtr column; SerializationInfoPtr serialization_info; - DeserializeBinaryBulkStatePtr deserialize_prefix_state; + + /// For types with dynamic subcolumns deserialize state contains information + /// about current dynamic structure. And this information can be useful + /// when we call enumerateStreams to enumerate dynamic streams. + DeserializeBinaryBulkStatePtr deserialize_state; }; struct Substream diff --git a/src/DataTypes/Serializations/SerializationArray.cpp b/src/DataTypes/Serializations/SerializationArray.cpp index 6a8555a3714..ac7b8f4d084 100644 --- a/src/DataTypes/Serializations/SerializationArray.cpp +++ b/src/DataTypes/Serializations/SerializationArray.cpp @@ -255,7 +255,7 @@ void SerializationArray::enumerateStreams( .withType(type_array ? type_array->getNestedType() : nullptr) .withColumn(column_array ? column_array->getDataPtr() : nullptr) .withSerializationInfo(data.serialization_info) - .withDeserializePrefix(data.deserialize_prefix_state); + .withDeserializeState(data.deserialize_state); nested->enumerateStreams(settings, callback, next_data); settings.path.pop_back(); diff --git a/src/DataTypes/Serializations/SerializationDynamic.cpp b/src/DataTypes/Serializations/SerializationDynamic.cpp index 858445ed257..5e6106f560f 100644 --- a/src/DataTypes/Serializations/SerializationDynamic.cpp +++ b/src/DataTypes/Serializations/SerializationDynamic.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -30,15 +31,9 @@ struct SerializeBinaryBulkStateDynamic : public ISerialization::SerializeBinaryB ISerialization::SerializeBinaryBulkStatePtr variant_state; /// Variants statistics. Map (Variant name) -> (Variant size). - ColumnDynamic::Statistics statistics = { .source =ColumnDynamic::Statistics::Source::READ }; + ColumnDynamic::Statistics statistics = { .source = ColumnDynamic::Statistics::Source::READ }; SerializeBinaryBulkStateDynamic(UInt64 structure_version_) : structure_version(structure_version_) {} - - void updateStatistics(const ColumnVariant & column_variant) - { - for (size_t i = 0; i != variant_names.size(); ++i) - statistics.data[variant_names[i]] += column_variant.getVariantPtrByGlobalDiscriminator(i)->size(); - } }; struct DeserializeBinaryBulkStateDynamic : public ISerialization::DeserializeBinaryBulkState @@ -58,13 +53,13 @@ void SerializationDynamic::enumerateStreams( settings.path.pop_back(); const auto * column_dynamic = data.column ? &assert_cast(*data.column) : nullptr; - const auto * deserialize_prefix_state = data.deserialize_prefix_state ? checkAndGetState(data.deserialize_prefix_state) : nullptr; + const auto * deserialize_state = data.deserialize_state ? checkAndGetState(data.deserialize_state) : nullptr; - /// If column is nullptr and we didn't deserizlize prefix yet, nothing to enumerate as we don't have any variants. - if (!column_dynamic && !deserialize_prefix_state) + /// If column is nullptr and we don't have deserialize state yet, nothing to enumerate as we don't have any variants. 
+ if (!column_dynamic && !deserialize_state) return; - const auto & variant_type = column_dynamic ? column_dynamic->getVariantInfo().variant_type : checkAndGetState(deserialize_prefix_state->structure_state)->variant_type; + const auto & variant_type = column_dynamic ? column_dynamic->getVariantInfo().variant_type : checkAndGetState(deserialize_state->structure_state)->variant_type; auto variant_serialization = variant_type->getDefaultSerialization(); settings.path.push_back(Substream::DynamicData); @@ -72,7 +67,7 @@ void SerializationDynamic::enumerateStreams( .withType(variant_type) .withColumn(column_dynamic ? column_dynamic->getVariantColumnPtr() : nullptr) .withSerializationInfo(data.serialization_info) - .withDeserializePrefix(deserialize_prefix_state ? deserialize_prefix_state->variant_state : nullptr); + .withDeserializeState(deserialize_state ? deserialize_state->variant_state : nullptr); settings.path.back().data = variant_data; variant_serialization->enumerateStreams(settings, callback, variant_data); settings.path.pop_back(); @@ -124,11 +119,11 @@ void SerializationDynamic::serializeBinaryBulkStatePrefix( { size_t size = 0; /// Use statistics from column if it was created during merge. - if (statistics.data.empty() || statistics.source != ColumnDynamic::Statistics::Source::MERGE) - size = variant_column.getVariantByGlobalDiscriminator(i).size(); + if (!statistics.data.empty() && statistics.source == ColumnDynamic::Statistics::Source::MERGE) + size = statistics.data.at(variant_info.variant_names[i]); /// Otherwise we can use only variant sizes from current column. else - size = statistics.data.at(variant_info.variant_names[i]); + size = variant_column.getVariantByGlobalDiscriminator(i).size(); writeVarUInt(size, *stream); } } @@ -243,12 +238,9 @@ void SerializationDynamic::serializeBinaryBulkWithMultipleStreams( if (!variant_info.variant_type->equals(*dynamic_state->variant_type)) throw Exception(ErrorCodes::LOGICAL_ERROR, "Mismatch of internal columns of Dynamic. Expected: {}, Got: {}", dynamic_state->variant_type->getName(), variant_info.variant_type->getName()); - /// Update statistics. - if (offset == 0) - dynamic_state->updateStatistics(*variant_column); - settings.path.push_back(Substream::DynamicData); - dynamic_state->variant_serialization->serializeBinaryBulkWithMultipleStreams(*variant_column, offset, limit, settings, dynamic_state->variant_state); + assert_cast(*dynamic_state->variant_serialization) + .serializeBinaryBulkWithMultipleStreamsAndUpdateVariantStatistics(*variant_column, offset, limit, settings, dynamic_state->variant_state, dynamic_state->statistics.data); settings.path.pop_back(); } diff --git a/src/DataTypes/Serializations/SerializationDynamicElement.cpp b/src/DataTypes/Serializations/SerializationDynamicElement.cpp index 9be9802d926..059a7d57e4e 100644 --- a/src/DataTypes/Serializations/SerializationDynamicElement.cpp +++ b/src/DataTypes/Serializations/SerializationDynamicElement.cpp @@ -33,21 +33,21 @@ void SerializationDynamicElement::enumerateStreams( /// If we didn't deserialize prefix yet, we don't know if we actually have this variant in Dynamic column, /// so we cannot enumerate variant streams. - if (!data.deserialize_prefix_state) + if (!data.deserialize_state) return; - auto * deserialize_prefix_state = checkAndGetState(data.deserialize_prefix_state); + auto * deserialize_state = checkAndGetState(data.deserialize_state); /// If we don't have this variant, no need to enumerate streams for it as we won't read from any stream. 
- if (!deserialize_prefix_state->variant_serialization) + if (!deserialize_state->variant_serialization) return; settings.path.push_back(Substream::DynamicData); - auto variant_data = SubstreamData(deserialize_prefix_state->variant_serialization) + auto variant_data = SubstreamData(deserialize_state->variant_serialization) .withType(data.type) .withColumn(data.column) .withSerializationInfo(data.serialization_info) - .withDeserializePrefix(deserialize_prefix_state->variant_element_state); - deserialize_prefix_state->variant_serialization->enumerateStreams(settings, callback, variant_data); + .withDeserializeState(deserialize_state->variant_element_state); + deserialize_state->variant_serialization->enumerateStreams(settings, callback, variant_data); settings.path.pop_back(); } diff --git a/src/DataTypes/Serializations/SerializationMap.cpp b/src/DataTypes/Serializations/SerializationMap.cpp index cda82f31820..10635fb9142 100644 --- a/src/DataTypes/Serializations/SerializationMap.cpp +++ b/src/DataTypes/Serializations/SerializationMap.cpp @@ -399,7 +399,7 @@ void SerializationMap::enumerateStreams( .withType(data.type ? assert_cast(*data.type).getNestedType() : nullptr) .withColumn(data.column ? assert_cast(*data.column).getNestedColumnPtr() : nullptr) .withSerializationInfo(data.serialization_info) - .withDeserializePrefix(data.deserialize_prefix_state); + .withDeserializeState(data.deserialize_state); nested->enumerateStreams(settings, callback, next_data); } diff --git a/src/DataTypes/Serializations/SerializationTuple.cpp b/src/DataTypes/Serializations/SerializationTuple.cpp index 6e4b4c4c533..ef0a75fac40 100644 --- a/src/DataTypes/Serializations/SerializationTuple.cpp +++ b/src/DataTypes/Serializations/SerializationTuple.cpp @@ -567,7 +567,7 @@ void SerializationTuple::enumerateStreams( const auto * type_tuple = data.type ? &assert_cast(*data.type) : nullptr; const auto * column_tuple = data.column ? &assert_cast(*data.column) : nullptr; const auto * info_tuple = data.serialization_info ? &assert_cast(*data.serialization_info) : nullptr; - const auto * tuple_deserialize_prefix_state = data.deserialize_prefix_state ? checkAndGetState(data.deserialize_prefix_state) : nullptr; + const auto * tuple_deserialize_state = data.deserialize_state ? checkAndGetState(data.deserialize_state) : nullptr; for (size_t i = 0; i < elems.size(); ++i) { @@ -575,7 +575,7 @@ void SerializationTuple::enumerateStreams( .withType(type_tuple ? type_tuple->getElement(i) : nullptr) .withColumn(column_tuple ? column_tuple->getColumnPtr(i) : nullptr) .withSerializationInfo(info_tuple ? info_tuple->getElementInfo(i) : nullptr) - .withDeserializePrefix(tuple_deserialize_prefix_state ? tuple_deserialize_prefix_state->states[i] : nullptr); + .withDeserializeState(tuple_deserialize_state ? tuple_deserialize_state->states[i] : nullptr); elems[i]->enumerateStreams(settings, callback, next_data); } diff --git a/src/DataTypes/Serializations/SerializationVariant.cpp b/src/DataTypes/Serializations/SerializationVariant.cpp index 8e0ef112444..9456ffa3ad3 100644 --- a/src/DataTypes/Serializations/SerializationVariant.cpp +++ b/src/DataTypes/Serializations/SerializationVariant.cpp @@ -45,7 +45,7 @@ void SerializationVariant::enumerateStreams( { const auto * type_variant = data.type ? &assert_cast(*data.type) : nullptr; const auto * column_variant = data.column ? &assert_cast(*data.column) : nullptr; - const auto * variant_deserialize_prefix_state = data.deserialize_prefix_state ? 
checkAndGetState(data.deserialize_prefix_state) : nullptr; + const auto * variant_deserialize_state = data.deserialize_state ? checkAndGetState(data.deserialize_state) : nullptr; auto discriminators_serialization = std::make_shared(std::make_shared>(), "discr", SubstreamType::NamedVariantDiscriminators); auto local_discriminators = column_variant ? column_variant->getLocalDiscriminatorsPtr() : nullptr; @@ -71,7 +71,7 @@ void SerializationVariant::enumerateStreams( .withType(type_variant ? type_variant->getVariant(i) : nullptr) .withColumn(column_variant ? column_variant->getVariantPtrByGlobalDiscriminator(i) : nullptr) .withSerializationInfo(data.serialization_info) - .withDeserializePrefix(variant_deserialize_prefix_state ? variant_deserialize_prefix_state->states[i] : nullptr); + .withDeserializeState(variant_deserialize_state ? variant_deserialize_state->states[i] : nullptr); addVariantElementToPath(settings.path, i); settings.path.back().data = variant_data; @@ -144,12 +144,13 @@ void SerializationVariant::deserializeBinaryBulkStatePrefix( } -void SerializationVariant::serializeBinaryBulkWithMultipleStreams( +void SerializationVariant::serializeBinaryBulkWithMultipleStreamsAndUpdateVariantStatistics( const IColumn & column, size_t offset, size_t limit, SerializeBinaryBulkSettings & settings, - SerializeBinaryBulkStatePtr & state) const + SerializeBinaryBulkStatePtr & state, + std::unordered_map & variants_statistics) const { const ColumnVariant & col = assert_cast(column); if (const size_t size = col.size(); limit == 0 || offset + limit > size) @@ -188,6 +189,7 @@ void SerializationVariant::serializeBinaryBulkWithMultipleStreams( { addVariantElementToPath(settings.path, i); variants[i]->serializeBinaryBulkWithMultipleStreams(col.getVariantByGlobalDiscriminator(i), 0, 0, settings, variant_state->states[i]); + variants_statistics[variant_names[i]] += col.getVariantByGlobalDiscriminator(i).size(); settings.path.pop_back(); } settings.path.pop_back(); @@ -208,6 +210,7 @@ void SerializationVariant::serializeBinaryBulkWithMultipleStreams( addVariantElementToPath(settings.path, non_empty_global_discr); /// We can use the same offset/limit as for whole Variant column variants[non_empty_global_discr]->serializeBinaryBulkWithMultipleStreams(col.getVariantByGlobalDiscriminator(non_empty_global_discr), offset, limit, settings, variant_state->states[non_empty_global_discr]); + variants_statistics[variant_names[non_empty_global_discr]] += limit; settings.path.pop_back(); settings.path.pop_back(); return; @@ -247,12 +250,23 @@ void SerializationVariant::serializeBinaryBulkWithMultipleStreams( variant_offsets_and_limits[i].second, settings, variant_state->states[i]); + variants_statistics[variant_names[i]] += variant_offsets_and_limits[i].second; settings.path.pop_back(); } } settings.path.pop_back(); } +void SerializationVariant::serializeBinaryBulkWithMultipleStreams( + const DB::IColumn & column, + size_t offset, + size_t limit, + DB::ISerialization::SerializeBinaryBulkSettings & settings, + DB::ISerialization::SerializeBinaryBulkStatePtr & state) const +{ + std::unordered_map tmp_statistics; + serializeBinaryBulkWithMultipleStreamsAndUpdateVariantStatistics(column, offset, limit, settings, state, tmp_statistics); +} void SerializationVariant::deserializeBinaryBulkWithMultipleStreams( ColumnPtr & column, diff --git a/src/DataTypes/Serializations/SerializationVariant.h b/src/DataTypes/Serializations/SerializationVariant.h index 0de786f5561..b6aa1534538 100644 --- 
a/src/DataTypes/Serializations/SerializationVariant.h +++ b/src/DataTypes/Serializations/SerializationVariant.h @@ -69,6 +69,14 @@ public: SerializeBinaryBulkSettings & settings, SerializeBinaryBulkStatePtr & state) const override; + void serializeBinaryBulkWithMultipleStreamsAndUpdateVariantStatistics( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state, + std::unordered_map & variants_statistics) const; + void deserializeBinaryBulkWithMultipleStreams( ColumnPtr & column, size_t limit, diff --git a/src/DataTypes/Serializations/SerializationVariantElement.cpp b/src/DataTypes/Serializations/SerializationVariantElement.cpp index 0e1ad81ce5b..dc7fc3b9b35 100644 --- a/src/DataTypes/Serializations/SerializationVariantElement.cpp +++ b/src/DataTypes/Serializations/SerializationVariantElement.cpp @@ -38,13 +38,13 @@ void SerializationVariantElement::enumerateStreams( callback(settings.path); settings.path.pop_back(); - const auto * deserialize_prefix_state = data.deserialize_prefix_state ? checkAndGetState(data.deserialize_prefix_state) : nullptr; + const auto * deserialize_state = data.deserialize_state ? checkAndGetState(data.deserialize_state) : nullptr; addVariantToPath(settings.path); auto nested_data = SubstreamData(nested_serialization) .withType(data.type ? removeNullableOrLowCardinalityNullable(data.type) : nullptr) .withColumn(data.column ? removeNullableOrLowCardinalityNullable(data.column) : nullptr) .withSerializationInfo(data.serialization_info) - .withDeserializePrefix(deserialize_prefix_state ? deserialize_prefix_state->variant_element_state : nullptr); + .withDeserializeState(deserialize_state ? deserialize_state->variant_element_state : nullptr); settings.path.back().data = data; nested_serialization->enumerateStreams(settings, callback, data); removeVariantFromPath(settings.path); diff --git a/src/Functions/FunctionsConversion.cpp b/src/Functions/FunctionsConversion.cpp index 9a8ed03a81d..b01643a9532 100644 --- a/src/Functions/FunctionsConversion.cpp +++ b/src/Functions/FunctionsConversion.cpp @@ -66,8 +66,6 @@ #include #include -#include - namespace DB { @@ -4050,9 +4048,9 @@ private: casted_variant_columns.reserve(variant_types.size()); for (size_t i = 0; i != variant_types.size(); ++i) { - auto variant_col = column_variant.getVariantPtrByLocalDiscriminator(i); + auto variant_col = column_variant.getVariantPtrByGlobalDiscriminator(i); ColumnsWithTypeAndName variant = {{variant_col, variant_types[i], "" }}; - const auto & variant_wrapper = variant_wrappers[column_variant.globalDiscriminatorByLocal(i)]; + const auto & variant_wrapper = variant_wrappers[i]; casted_variant_columns.push_back(variant_wrapper(variant, result_type, nullptr, variant_col->size())); } @@ -4062,11 +4060,11 @@ private: res->reserve(input_rows_count); for (size_t i = 0; i != input_rows_count; ++i) { - auto local_discr = local_discriminators[i]; - if (local_discr == ColumnVariant::NULL_DISCRIMINATOR) + auto global_discr = column_variant.globalDiscriminatorByLocal(local_discriminators[i]); + if (global_discr == ColumnVariant::NULL_DISCRIMINATOR) res->insertDefault(); else - res->insertFrom(*casted_variant_columns[local_discr], column_variant.offsetAt(i)); + res->insertFrom(*casted_variant_columns[global_discr], column_variant.offsetAt(i)); } return res; @@ -4236,14 +4234,14 @@ private: return createColumnToVariantWrapper(from_type, assert_cast(*to_type)); } - WrapperType createDynamicToColumnWrapper(const DataTypePtr & 
to_type) const + WrapperType createDynamicToColumnWrapper(const DataTypePtr &) const { - return [this, to_type] + return [this] (ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable * col_nullable, size_t input_rows_count) -> ColumnPtr { const auto & column_dynamic = assert_cast(*arguments.front().column.get()); const auto & variant_info = column_dynamic.getVariantInfo(); - auto variant_wrapper = createVariantToColumnWrapper(assert_cast(*variant_info.variant_type), to_type); + auto variant_wrapper = createVariantToColumnWrapper(assert_cast(*variant_info.variant_type), result_type); ColumnsWithTypeAndName args = {ColumnWithTypeAndName(column_dynamic.getVariantColumnPtr(), variant_info.variant_type, "")}; return variant_wrapper(args, result_type, col_nullable, input_rows_count); }; @@ -4279,8 +4277,6 @@ private: size_t max_result_num_variants, const ColumnDynamic::Statistics & statistics = {}) const { - LOG_DEBUG(getLogger("FunctionsConversion"), "getReducedVariant for variant {} with size {}", variant_type->getName(), variant_column.size()); - const auto & variant_types = assert_cast(*variant_type).getVariants(); /// First check if we don't exceed the limit in current Variant column. if (variant_types.size() < max_result_num_variants || (variant_types.size() == max_result_num_variants && variant_name_to_discriminator.contains("String"))) @@ -4296,12 +4292,11 @@ private: { /// String variant won't be removed. String variant_name = variant_types[i]->getName(); - LOG_DEBUG(getLogger("FunctionsConversion"), "Variant {}/{} size: {}, statistics: {}", variant_name, i, variant_column.getVariantByGlobalDiscriminator(i).size(), statistics.data.contains(variant_name) ? toString(statistics.data.at(variant_name)) : "none"); if (variant_name == "String") { old_string_discriminator = i; - /// For simplicity, add this variant to the list that will be converted string, + /// For simplicity, add this variant to the list that will be converted to string, /// so we will process it with other variants when constructing the new String variant. variants_to_convert_to_string.push_back(i); } @@ -4361,11 +4356,9 @@ private: { auto string_type = std::make_shared(); auto string_wrapper = prepareUnpackDictionaries(variant_types[discr], string_type); - LOG_DEBUG(getLogger("FunctionsConversion"), "Convert variant {} with size {} to String", variant_types[discr]->getName(), variant_column.getVariantPtrByGlobalDiscriminator(discr)->size()); auto column_to_convert = ColumnWithTypeAndName(variant_column.getVariantPtrByGlobalDiscriminator(discr), variant_types[discr], ""); ColumnsWithTypeAndName args = {column_to_convert}; auto variant_string_column = string_wrapper(args, string_type, nullptr, column_to_convert.column->size()); - LOG_DEBUG(getLogger("FunctionsConversion"), "Got String column with size {}", variant_string_column->size()); string_variant_size += variant_string_column->size(); variants_converted_to_string[discr] = variant_string_column; } @@ -4381,11 +4374,9 @@ private: new_offsets_data.reserve(variant_column.size()); const auto & old_local_discriminators = variant_column.getLocalDiscriminators(); const auto & old_offsets = variant_column.getOffsets(); - LOG_DEBUG(getLogger("FunctionsConversion"), "Discriminators size: {}. 
Offsets size: {}", old_local_discriminators.size(), old_offsets.size()); for (size_t i = 0; i != old_local_discriminators.size(); ++i) { auto old_discr = variant_column.globalDiscriminatorByLocal(old_local_discriminators[i]); - LOG_DEBUG(getLogger("FunctionsConversion"), "Row {}, discriminator {}", i, UInt64(old_discr)); if (old_discr == ColumnVariant::NULL_DISCRIMINATOR) { @@ -4398,12 +4389,10 @@ private: new_discriminators_data.push_back(new_discr); if (new_discr != string_variant_discriminator) { - LOG_DEBUG(getLogger("FunctionsConversion"), "Keep variant {}", UInt64(old_discr)); new_offsets_data.push_back(old_offsets[i]); } else { - LOG_DEBUG(getLogger("FunctionsConversion"), "Get string value of variant {} with String column with size {} at offset {}", UInt64(old_discr), variants_converted_to_string[old_discr]->size(), old_offsets[i]); new_offsets_data.push_back(string_variant->size()); string_variant->insertFrom(*variants_converted_to_string[old_discr], old_offsets[i]); } diff --git a/src/Functions/dynamicElement.cpp b/src/Functions/dynamicElement.cpp index 964c058776e..6752a61b6c3 100644 --- a/src/Functions/dynamicElement.cpp +++ b/src/Functions/dynamicElement.cpp @@ -149,24 +149,30 @@ private: REGISTER_FUNCTION(DynamicElement) { -// factory.registerFunction(FunctionDocumentation{ -// .description = R"( -//Extracts a column with specified type from a `Dynamic` column. -//)", -// .syntax{"dynamicElement(dynamic, type_name)"}, -// .arguments{{ -// {"dynamic", "Dynamic column"}, -// {"type_name", "The name of the variant type to extract"}}}, -// .examples{{{ -// "Example", -// R"( -//)", -// R"( -//)"}}}, -// .categories{"Dynamic"}, -// }); - - factory.registerFunction(); + factory.registerFunction(FunctionDocumentation{ + .description = R"( +Extracts a column with specified type from a `Dynamic` column. +)", + .syntax{"dynamicElement(dynamic, type_name)"}, + .arguments{ + {"dynamic", "Dynamic column"}, + {"type_name", "The name of the variant type to extract"}}, + .examples{{{ + "Example", + R"( +CREATE TABLE test (d Dynamic) ENGINE = Memory; +INSERT INTO test VALUES (NULL), (42), ('Hello, World!'), ([1, 2, 3]); +SELECT d, dynamicType(d), dynamicElement(d, 'String'), dynamicElement(d, 'Int64'), dynamicElement(d, 'Array(Int64)'), dynamicElement(d, 'Date'), dynamicElement(d, 'Array(String)') FROM test;)", + R"( +┌─d─────────────┬─dynamicType(d)─┬─dynamicElement(d, 'String')─┬─dynamicElement(d, 'Int64')─┬─dynamicElement(d, 'Array(Int64)')─┬─dynamicElement(d, 'Date')─┬─dynamicElement(d, 'Array(String)')─┐ +│ ᴺᵁᴸᴸ │ None │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [] │ ᴺᵁᴸᴸ │ [] │ +│ 42 │ Int64 │ ᴺᵁᴸᴸ │ 42 │ [] │ ᴺᵁᴸᴸ │ [] │ +│ Hello, World! │ String │ Hello, World! │ ᴺᵁᴸᴸ │ [] │ ᴺᵁᴸᴸ │ [] │ +│ [1,2,3] │ Array(Int64) │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [1,2,3] │ ᴺᵁᴸᴸ │ [] │ +└───────────────┴────────────────┴─────────────────────────────┴────────────────────────────┴───────────────────────────────────┴───────────────────────────┴────────────────────────────────────┘ +)"}}}, + .categories{"Dynamic"}, + }); } } diff --git a/src/Functions/variantElement.cpp b/src/Functions/variantElement.cpp index b57ccb6fee1..e63afc68b34 100644 --- a/src/Functions/variantElement.cpp +++ b/src/Functions/variantElement.cpp @@ -171,10 +171,10 @@ REGISTER_FUNCTION(VariantElement) Extracts a column with specified type from a `Variant` column. 
)", .syntax{"variantElement(variant, type_name, [, default_value])"}, - .arguments{{ + .arguments{ {"variant", "Variant column"}, {"type_name", "The name of the variant type to extract"}, - {"default_value", "The default value that will be used if variant doesn't have variant with specified type. Can be any type. Optional"}}}, + {"default_value", "The default value that will be used if variant doesn't have variant with specified type. Can be any type. Optional"}}, .examples{{{ "Example", R"( diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 9107c67afdd..9ef5b58ff91 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -2392,12 +2392,14 @@ void IMergeTreeDataPart::setBrokenReason(const String & message, int code) const exception_code = code; } -ColumnPtr IMergeTreeDataPart::readColumnSample(const NameAndTypePair & column) const +ColumnPtr IMergeTreeDataPart::getColumnSample(const NameAndTypePair & column) const { const size_t total_mark = getMarksCount(); - if (!total_mark) + /// If column doesn't have dynamic subcolumns or part has no data, just create column using it's type. + if (!column.type->hasDynamicSubcolumns() || !total_mark) return column.type->createColumn(); + /// Otherwise, read sample column with 0 rows from the part, so it will load dynamic structure. NamesAndTypesList cols; cols.emplace_back(column); diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index 78619f216c0..ddfc66cc622 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -166,7 +166,9 @@ public: NameAndTypePair getColumn(const String & name) const; std::optional tryGetColumn(const String & column_name) const; - ColumnPtr readColumnSample(const NameAndTypePair & column) const; + /// Get sample column from part. For ordinary columns it just creates column using it's type. + /// For columns with dynamic structure it reads sample column with 0 rows from the part. 
+ ColumnPtr getColumnSample(const NameAndTypePair & column) const; const SerializationInfoByName & getSerializationInfos() const { return serialization_infos; } diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp index d0a685d95fc..e34822ce6df 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp @@ -44,18 +44,29 @@ MergeTreeDataPartWriterCompact::MergeTreeDataPartWriterCompact( marks_source_hashing = std::make_unique(*marks_compressor); } -} - -void MergeTreeDataPartWriterCompact::initStreamsIfNeeded(const Block & block) -{ - if (!compressed_streams.empty()) - return; auto storage_snapshot = std::make_shared(data_part->storage, metadata_snapshot); for (const auto & column : columns_list) { auto compression = storage_snapshot->getCodecDescOrDefault(column.name, default_codec); - addStreams(column, block.getByName(column.name).column, compression); + addStreams(column, nullptr, compression); + } +} + +void MergeTreeDataPartWriterCompact::initDynamicStreamsIfNeeded(const Block & block) +{ + if (is_dynamic_streams_initialized) + return; + + is_dynamic_streams_initialized = true; + auto storage_snapshot = std::make_shared(data_part->storage, metadata_snapshot); + for (const auto & column : columns_list) + { + if (column.type->hasDynamicSubcolumns()) + { + auto compression = storage_snapshot->getCodecDescOrDefault(column.name, default_codec); + addStreams(column, block.getByName(column.name).column, compression); + } } } @@ -155,7 +166,8 @@ void writeColumnSingleGranule( void MergeTreeDataPartWriterCompact::write(const Block & block, const IColumn::Permutation * permutation) { - initStreamsIfNeeded(block); + /// On first block of data initialize streams for dynamic subcolumns. + initDynamicStreamsIfNeeded(block); /// Fill index granularity for this block /// if it's unknown (in case of insert data or horizontal merge, diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.h b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.h index 1c748803c52..f35479387f6 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.h +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.h @@ -44,7 +44,7 @@ private: void addStreams(const NameAndTypePair & name_and_type, const ColumnPtr & column, const ASTPtr & effective_codec_desc); - void initStreamsIfNeeded(const Block & block); + void initDynamicStreamsIfNeeded(const Block & block); Block header; @@ -98,6 +98,8 @@ private: /// then finally to 'marks_file'. 
std::unique_ptr marks_compressor; std::unique_ptr marks_source_hashing; + + bool is_dynamic_streams_initialized = false; }; } diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp index c23a9a81cbc..fb7ee9f7fe8 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp @@ -89,19 +89,29 @@ MergeTreeDataPartWriterWide::MergeTreeDataPartWriterWide( indices_to_recalc_, stats_to_recalc_, marks_file_extension_, default_codec_, settings_, index_granularity_) { -} - -void MergeTreeDataPartWriterWide::initStreamsIfNeeded(const DB::Block & block) -{ - if (!column_streams.empty()) - return; - - block_sample = block.cloneEmpty(); auto storage_snapshot = std::make_shared(data_part->storage, metadata_snapshot); for (const auto & column : columns_list) { auto compression = storage_snapshot->getCodecDescOrDefault(column.name, default_codec); - addStreams(column, block_sample.getByName(column.name).column, compression); + addStreams(column, nullptr, compression); + } +} + +void MergeTreeDataPartWriterWide::initDynamicStreamsIfNeeded(const DB::Block & block) +{ + if (is_dynamic_streams_initialized) + return; + + is_dynamic_streams_initialized = true; + block_sample = block.cloneEmpty(); + auto storage_snapshot = std::make_shared(data_part->storage, metadata_snapshot); + for (const auto & column : columns_list) + { + if (column.type->hasDynamicSubcolumns()) + { + auto compression = storage_snapshot->getCodecDescOrDefault(column.name, default_codec); + addStreams(column, block_sample.getByName(column.name).column, compression); + } } } @@ -123,6 +133,10 @@ void MergeTreeDataPartWriterWide::addStreams( else stream_name = full_stream_name; + /// Shared offsets for Nested type. + if (column_streams.contains(stream_name)) + return; + auto it = stream_name_to_full_name.find(stream_name); if (it != stream_name_to_full_name.end() && it->second != full_stream_name) throw Exception(ErrorCodes::INCORRECT_FILE_NAME, @@ -130,10 +144,6 @@ void MergeTreeDataPartWriterWide::addStreams( " It is a collision between a filename for one column and a hash of filename for another column or a bug", stream_name, it->second, full_stream_name); - /// Shared offsets for Nested type. - if (column_streams.contains(stream_name)) - return; - const auto & subtype = substream_path.back().data.type; CompressionCodecPtr compression_codec; @@ -231,7 +241,8 @@ void MergeTreeDataPartWriterWide::shiftCurrentMark(const Granules & granules_wri void MergeTreeDataPartWriterWide::write(const Block & block, const IColumn::Permutation * permutation) { - initStreamsIfNeeded(block); + /// On first block of data initialize streams for dynamic subcolumns. 
+ initDynamicStreamsIfNeeded(block); /// Fill index granularity for this block /// if it's unknown (in case of insert data or horizontal merge, @@ -604,7 +615,6 @@ void MergeTreeDataPartWriterWide::validateColumnOfFixedSize(const NameAndTypePai " index granularity size {}, last rows {}", column->size(), mark_num, index_granularity.getMarksCount(), index_granularity_rows); } - } void MergeTreeDataPartWriterWide::fillDataChecksums(IMergeTreeDataPart::Checksums & checksums, NameSet & checksums_to_remove) diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h index ebdd907914f..8343144f2e1 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h @@ -87,7 +87,7 @@ private: const ColumnPtr & column, const ASTPtr & effective_codec_desc); - void initStreamsIfNeeded(const Block & block); + void initDynamicStreamsIfNeeded(const Block & block); /// Method for self check (used in debug-build only). Checks that written /// data and corresponding marks are consistent. Otherwise throws logical @@ -135,6 +135,8 @@ private: size_t rows_written_in_last_mark = 0; Block block_sample; + + bool is_dynamic_streams_initialized = false; }; } diff --git a/src/Storages/MergeTree/MergeTreeReaderWide.cpp b/src/Storages/MergeTree/MergeTreeReaderWide.cpp index d18d5eec975..64ca6132cc4 100644 --- a/src/Storages/MergeTree/MergeTreeReaderWide.cpp +++ b/src/Storages/MergeTree/MergeTreeReaderWide.cpp @@ -349,7 +349,7 @@ void MergeTreeReaderWide::prefetchForColumn( } }; - auto data = ISerialization::SubstreamData(serialization).withType(name_and_type.type).withDeserializePrefix(deserialize_binary_bulk_state_map[name_and_type.name]); + auto data = ISerialization::SubstreamData(serialization).withType(name_and_type.type).withDeserializeState(deserialize_binary_bulk_state_map[name_and_type.name]); ISerialization::EnumerateStreamsSettings settings; serialization->enumerateStreams(settings, callback, data); } diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index 5e388d6a8ac..2bbc5bdb3ae 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -60,6 +60,21 @@ static bool checkOperationIsNotCanceled(ActionBlocker & merges_blocker, MergeLis return true; } +static bool haveMutationsOfDynamicColumns(const MergeTreeData::DataPartPtr & data_part, const MutationCommands & commands) +{ + for (const auto & command : commands) + { + if (!command.column_name.empty()) + { + auto column = data_part->tryGetColumn(command.column_name); + if (column && column->type->hasDynamicSubcolumns()) + return true; + } + } + + return false; +} + static UInt64 getExistingRowsCount(const Block & block) { auto column = block.getByName(RowExistsColumn::name).column; @@ -95,7 +110,7 @@ static void splitAndModifyMutationCommands( auto part_columns = part->getColumnsDescription(); const auto & table_columns = metadata_snapshot->getColumns(); - if (!isWidePart(part) || !isFullPartStorage(part->getDataPartStorage())) + if (haveMutationsOfDynamicColumns(part, commands) || !isWidePart(part) || !isFullPartStorage(part->getDataPartStorage())) { NameSet mutated_columns; NameSet dropped_columns; @@ -2250,7 +2265,9 @@ bool MutateTask::prepare() /// All columns from part are changed and may be some more that were missing before in part /// TODO We can materialize compact part without copying data - if (!isWidePart(ctx->source_part) || 
!isFullPartStorage(ctx->source_part->getDataPartStorage()) + /// Also currently mutations of types with dynamic subcolumns in Wide part are possible only by + /// rewriting the whole part. + if (MutationHelpers::haveMutationsOfDynamicColumns(ctx->source_part, ctx->commands_for_part) || !isWidePart(ctx->source_part) || !isFullPartStorage(ctx->source_part->getDataPartStorage()) || (ctx->interpreter && ctx->interpreter->isAffectingAllColumns())) { /// In case of replicated merge tree with zero copy replication diff --git a/src/Storages/MergeTree/checkDataPart.cpp b/src/Storages/MergeTree/checkDataPart.cpp index b4d32e71d0d..fc06bcac823 100644 --- a/src/Storages/MergeTree/checkDataPart.cpp +++ b/src/Storages/MergeTree/checkDataPart.cpp @@ -219,7 +219,7 @@ static IMergeTreeDataPart::Checksums checkDataPart( auto file_name = *stream_name + ".bin"; checksums_data.files[file_name] = checksum_compressed_file(data_part_storage, file_name); - }); + }, column.type, data_part->getColumnSample(column)); } } else diff --git a/tests/queries/0_stateless/03040_dynamic_type_alters.sh.disabled b/tests/queries/0_stateless/03040_dynamic_type_alters.sh similarity index 100% rename from tests/queries/0_stateless/03040_dynamic_type_alters.sh.disabled rename to tests/queries/0_stateless/03040_dynamic_type_alters.sh From df92f422376173ba93228760d5c210dc21b4c128 Mon Sep 17 00:00:00 2001 From: avogar Date: Tue, 30 Apr 2024 18:45:19 +0000 Subject: [PATCH 089/392] Fix tests, improve dynamic/variantElement functions, add more comments --- src/Columns/ColumnArray.cpp | 2 +- src/Columns/ColumnConst.cpp | 9 ------- src/Columns/ColumnConst.h | 2 -- src/Columns/ColumnDynamic.cpp | 9 +++---- src/Columns/ColumnDynamic.h | 19 ++++++++----- src/Columns/ColumnMap.cpp | 2 +- src/Columns/ColumnNullable.cpp | 2 +- src/Columns/ColumnSparse.cpp | 2 +- src/Columns/ColumnTuple.cpp | 2 +- src/Columns/ColumnVariant.cpp | 2 +- src/Columns/IColumn.h | 3 +++ src/DataTypes/DataTypeDynamic.h | 3 +++ src/DataTypes/Serializations/ISerialization.h | 3 ++- .../SerializationDynamicElement.cpp | 3 +++ .../SerializationDynamicElement.h | 2 +- .../SerializationVariantElement.cpp | 4 +-- src/Functions/dynamicElement.cpp | 26 ++++++------------ src/Functions/dynamicType.cpp | 14 +++++++--- src/Functions/variantElement.cpp | 27 +++++++------------ src/Interpreters/TreeRewriter.cpp | 9 ++----- src/Interpreters/convertFieldToType.cpp | 3 --- src/Parsers/ParserDataType.cpp | 5 +++- src/Processors/Formats/IOutputFormat.h | 3 +-- src/Processors/Merges/Algorithms/MergedData.h | 3 +++ .../Transforms/ColumnGathererTransform.cpp | 3 +++ src/Storages/ColumnsDescription.cpp | 3 +++ .../MergeTree/MergeTreeReaderWide.cpp | 1 - src/Storages/MergeTree/MergeTreeSettings.h | 1 - .../0_stateless/02941_variant_type_4.sh | 2 +- .../03038_nested_dynamic_merges.reference | 10 +++---- .../03038_nested_dynamic_merges.sh | 8 +++--- .../03039_dynamic_all_merge_algorithms_1.sh | 12 ++++----- 32 files changed, 98 insertions(+), 101 deletions(-) diff --git a/src/Columns/ColumnArray.cpp b/src/Columns/ColumnArray.cpp index 29773492dc9..b8e2a541f5f 100644 --- a/src/Columns/ColumnArray.cpp +++ b/src/Columns/ColumnArray.cpp @@ -1289,7 +1289,7 @@ size_t ColumnArray::getNumberOfDimensions() const return 1 + nested_array->getNumberOfDimensions(); /// Every modern C++ compiler optimizes tail recursion. 
} -void ColumnArray::takeDynamicStructureFromSourceColumns(const DB::Columns & source_columns) +void ColumnArray::takeDynamicStructureFromSourceColumns(const Columns & source_columns) { Columns nested_source_columns; nested_source_columns.reserve(source_columns.size()); diff --git a/src/Columns/ColumnConst.cpp b/src/Columns/ColumnConst.cpp index cf3f448516c..f2cea83db0e 100644 --- a/src/Columns/ColumnConst.cpp +++ b/src/Columns/ColumnConst.cpp @@ -159,15 +159,6 @@ void ColumnConst::compareColumn( std::fill(compare_results.begin(), compare_results.end(), res); } -void ColumnConst::takeDynamicStructureFromSourceColumns(const DB::Columns & source_columns) -{ - Columns nested_source_columns; - nested_source_columns.reserve(source_columns.size()); - for (const auto & source_column : source_columns) - nested_source_columns.push_back(assert_cast(*source_column).getDataColumnPtr()); - data->takeDynamicStructureFromSourceColumns(nested_source_columns); -} - ColumnConst::Ptr createColumnConst(const ColumnPtr & column, Field value) { auto data = column->cloneEmpty(); diff --git a/src/Columns/ColumnConst.h b/src/Columns/ColumnConst.h index 042468cbbcc..c2c0fa3027c 100644 --- a/src/Columns/ColumnConst.h +++ b/src/Columns/ColumnConst.h @@ -308,8 +308,6 @@ public: bool isCollationSupported() const override { return data->isCollationSupported(); } bool hasDynamicStructure() const override { return data->hasDynamicStructure(); } - - void takeDynamicStructureFromSourceColumns(const Columns & source_columns) override; }; ColumnConst::Ptr createColumnConst(const ColumnPtr & column, Field value); diff --git a/src/Columns/ColumnDynamic.cpp b/src/Columns/ColumnDynamic.cpp index f3dff01af25..a1dd60f4748 100644 --- a/src/Columns/ColumnDynamic.cpp +++ b/src/Columns/ColumnDynamic.cpp @@ -65,14 +65,14 @@ bool ColumnDynamic::addNewVariant(const DB::DataTypePtr & new_variant) if (variant_info.variant_names.size() >= max_dynamic_types) { /// ColumnDynamic can have max_dynamic_types number of variants only when it has String as a variant. - /// Otherwise we won't be able to add cast new variants to Strings. + /// Otherwise we won't be able to cast new variants to Strings. if (!variant_info.variant_name_to_discriminator.contains("String")) throw Exception(ErrorCodes::LOGICAL_ERROR, "Maximum number of variants reached, but no String variant exists"); return false; } - /// If we have max_dynamic_types - 1 number of variants and don't have String variant, we can add only String variant. + /// If we have (max_dynamic_types - 1) number of variants and don't have String variant, we can add only String variant. if (variant_info.variant_names.size() == max_dynamic_types - 1 && new_variant->getName() != "String" && !variant_info.variant_name_to_discriminator.contains("String")) return false; @@ -218,7 +218,7 @@ void ColumnDynamic::insert(const DB::Field & x) return; /// If we cannot insert field into current variant column, extend it with new variant for this field from its type. - if (likely(addNewVariant(applyVisitor(FieldToDataType(), x)))) + if (addNewVariant(applyVisitor(FieldToDataType(), x))) { /// Now we should be able to insert this field into extended variant column. variant_column->insert(x); @@ -566,7 +566,6 @@ const char * ColumnDynamic::deserializeAndInsertFromArena(const char * pos) } /// We reached maximum number of variants and couldn't add new variant. - /// This case should be really rare in real use cases. /// We should always be able to add String variant and cast inserted value to String. 
addStringVariant(); /// Create temporary column of this variant type and deserialize value into it. @@ -645,7 +644,7 @@ ColumnPtr ColumnDynamic::compress() const }); } -void ColumnDynamic::takeDynamicStructureFromSourceColumns(const DB::Columns & source_columns) +void ColumnDynamic::takeDynamicStructureFromSourceColumns(const Columns & source_columns) { if (!empty()) throw Exception(ErrorCodes::LOGICAL_ERROR, "takeDynamicStructureFromSourceColumns should be called only on empty Dynamic column"); diff --git a/src/Columns/ColumnDynamic.h b/src/Columns/ColumnDynamic.h index b5167f4b9d9..4e9c7edd5f9 100644 --- a/src/Columns/ColumnDynamic.h +++ b/src/Columns/ColumnDynamic.h @@ -22,15 +22,18 @@ namespace DB class ColumnDynamic final : public COWHelper, ColumnDynamic> { public: + /// struct Statistics { enum class Source { - READ, - MERGE, + READ, /// Statistics were loaded into column during reading from MergeTree. + MERGE, /// Statistics were calculated during merge of several MergeTree parts. }; + /// Source of the statistics. Source source; + /// Statistics data: (variant name) -> (total variant size in data part). std::unordered_map data; }; @@ -42,9 +45,9 @@ private: DataTypePtr variant_type; /// Name of the whole variant to not call getName() every time. String variant_name; - /// Store names of variants to not call getName() every time on variants. + /// Names of variants to not call getName() every time on variants. Names variant_names; - /// Store mapping (variant name) -> (global discriminator). + /// Mapping (variant name) -> (global discriminator). /// It's used during variant extension. std::unordered_map variant_name_to_discriminator; }; @@ -335,7 +338,7 @@ private: /// Combine current variant with the other variant and return global discriminators mapping /// from other variant to the combined one. It's used for inserting from /// different variants. - /// Returns nullptr if maximum number of Variants is reached and the new Variant cannot be created. + /// Returns nullptr if maximum number of variants is reached and the new variant cannot be created. std::vector * combineVariants(const VariantInfo & other_variant_info); void updateVariantInfoAndExpandVariantColumn(const DataTypePtr & new_variant_type); @@ -343,7 +346,7 @@ private: WrappedPtr variant_column; /// Store the type of current variant with some additional information. VariantInfo variant_info; - /// Maximum number of different types that can be stored in Dynamic. + /// The maximum number of different types that can be stored in this Dynamic column. /// If exceeded, all new variants will be converted to String. size_t max_dynamic_types; @@ -351,7 +354,11 @@ private: /// Used in takeDynamicStructureFromSourceColumns and set during deserialization. Statistics statistics; + /// Cache (Variant name) -> (global discriminators mapping from this variant to current variant in Dynamic column). + /// Used to avoid mappings recalculation in combineVariants for the same Variant types. std::unordered_map> variant_mappings_cache; + /// Cache of Variant types that couldn't be combined with current variant in Dynamic column. + /// Used to avoid checking if combination is possible for the same Variant types. 
std::unordered_set variants_with_failed_combination; }; diff --git a/src/Columns/ColumnMap.cpp b/src/Columns/ColumnMap.cpp index 48e8bced23a..eecea1a273f 100644 --- a/src/Columns/ColumnMap.cpp +++ b/src/Columns/ColumnMap.cpp @@ -312,7 +312,7 @@ ColumnPtr ColumnMap::compress() const }); } -void ColumnMap::takeDynamicStructureFromSourceColumns(const DB::Columns & source_columns) +void ColumnMap::takeDynamicStructureFromSourceColumns(const Columns & source_columns) { Columns nested_source_columns; nested_source_columns.reserve(source_columns.size()); diff --git a/src/Columns/ColumnNullable.cpp b/src/Columns/ColumnNullable.cpp index 011f3702bdf..bb0e15d39ab 100644 --- a/src/Columns/ColumnNullable.cpp +++ b/src/Columns/ColumnNullable.cpp @@ -835,7 +835,7 @@ ColumnPtr ColumnNullable::getNestedColumnWithDefaultOnNull() const return res; } -void ColumnNullable::takeDynamicStructureFromSourceColumns(const DB::Columns & source_columns) +void ColumnNullable::takeDynamicStructureFromSourceColumns(const Columns & source_columns) { Columns nested_source_columns; nested_source_columns.reserve(source_columns.size()); diff --git a/src/Columns/ColumnSparse.cpp b/src/Columns/ColumnSparse.cpp index 80e20bb7631..d54801b6e07 100644 --- a/src/Columns/ColumnSparse.cpp +++ b/src/Columns/ColumnSparse.cpp @@ -801,7 +801,7 @@ ColumnSparse::Iterator ColumnSparse::getIterator(size_t n) const return Iterator(offsets_data, _size, current_offset, n); } -void ColumnSparse::takeDynamicStructureFromSourceColumns(const DB::Columns & source_columns) +void ColumnSparse::takeDynamicStructureFromSourceColumns(const Columns & source_columns) { Columns values_source_columns; values_source_columns.reserve(source_columns.size()); diff --git a/src/Columns/ColumnTuple.cpp b/src/Columns/ColumnTuple.cpp index 4e8e4063157..19f74048d84 100644 --- a/src/Columns/ColumnTuple.cpp +++ b/src/Columns/ColumnTuple.cpp @@ -582,7 +582,7 @@ bool ColumnTuple::hasDynamicStructure() const return false; } -void ColumnTuple::takeDynamicStructureFromSourceColumns(const DB::Columns & source_columns) +void ColumnTuple::takeDynamicStructureFromSourceColumns(const Columns & source_columns) { std::vector nested_source_columns; nested_source_columns.resize(columns.size()); diff --git a/src/Columns/ColumnVariant.cpp b/src/Columns/ColumnVariant.cpp index 819491f7fd9..ec47f5dfa74 100644 --- a/src/Columns/ColumnVariant.cpp +++ b/src/Columns/ColumnVariant.cpp @@ -1539,7 +1539,7 @@ bool ColumnVariant::hasDynamicStructure() const return false; } -void ColumnVariant::takeDynamicStructureFromSourceColumns(const DB::Columns & source_columns) +void ColumnVariant::takeDynamicStructureFromSourceColumns(const Columns & source_columns) { std::vector variants_source_columns; variants_source_columns.resize(variants.size()); diff --git a/src/Columns/IColumn.h b/src/Columns/IColumn.h index 33f398474ed..76f5af5bcd7 100644 --- a/src/Columns/IColumn.h +++ b/src/Columns/IColumn.h @@ -534,7 +534,10 @@ public: return res; } + /// Checks if column has dynamic subcolumns. virtual bool hasDynamicStructure() const { return false; } + /// For columns with dynamic subcolumns this method takes dynamic structure from source columns + /// and creates proper resulting dynamic structure in advance for merge of these source columns. virtual void takeDynamicStructureFromSourceColumns(const std::vector & /*source_columns*/) {} /** Some columns can contain another columns inside. 
diff --git a/src/DataTypes/DataTypeDynamic.h b/src/DataTypes/DataTypeDynamic.h index 452e05061a0..9fc727fd9c8 100644 --- a/src/DataTypes/DataTypeDynamic.h +++ b/src/DataTypes/DataTypeDynamic.h @@ -8,6 +8,8 @@ namespace DB { +/// Dynamic type allows to store values of any type inside it and to read +/// subcolumns with any type without knowing all of them in advance. class DataTypeDynamic final : public IDataType { public: @@ -28,6 +30,7 @@ public: Field getDefault() const override; + /// 2 Dynamic types with different max_dynamic_types parameters are considered as different. bool equals(const IDataType & rhs) const override { if (const auto * rhs_dynamic_type = typeid_cast(&rhs)) diff --git a/src/DataTypes/Serializations/ISerialization.h b/src/DataTypes/Serializations/ISerialization.h index b233230f9cc..914ff9cf4a2 100644 --- a/src/DataTypes/Serializations/ISerialization.h +++ b/src/DataTypes/Serializations/ISerialization.h @@ -151,7 +151,8 @@ public: /// For types with dynamic subcolumns deserialize state contains information /// about current dynamic structure. And this information can be useful - /// when we call enumerateStreams to enumerate dynamic streams. + /// when we call enumerateStreams after deserializeBinaryBulkStatePrefix + /// to enumerate dynamic streams. DeserializeBinaryBulkStatePtr deserialize_state; }; diff --git a/src/DataTypes/Serializations/SerializationDynamicElement.cpp b/src/DataTypes/Serializations/SerializationDynamicElement.cpp index 059a7d57e4e..b0a4e63d0a5 100644 --- a/src/DataTypes/Serializations/SerializationDynamicElement.cpp +++ b/src/DataTypes/Serializations/SerializationDynamicElement.cpp @@ -97,6 +97,9 @@ void SerializationDynamicElement::deserializeBinaryBulkWithMultipleStreams( DeserializeBinaryBulkStatePtr & state, SubstreamsCache * cache) const { + if (!state) + return; + auto * dynamic_element_state = checkAndGetState(state); if (dynamic_element_state->variant_serialization) diff --git a/src/DataTypes/Serializations/SerializationDynamicElement.h b/src/DataTypes/Serializations/SerializationDynamicElement.h index 9e4980e0a27..2ddc3324139 100644 --- a/src/DataTypes/Serializations/SerializationDynamicElement.h +++ b/src/DataTypes/Serializations/SerializationDynamicElement.h @@ -10,7 +10,7 @@ namespace DB class SerializationDynamicElement final : public SerializationWrapper { private: - /// To be able to deserialize Dyna,ic element as a subcolumn + /// To be able to deserialize Dynamic element as a subcolumn /// we need its type name and global discriminator. String dynamic_element_name; diff --git a/src/DataTypes/Serializations/SerializationVariantElement.cpp b/src/DataTypes/Serializations/SerializationVariantElement.cpp index dc7fc3b9b35..1f9a81ac671 100644 --- a/src/DataTypes/Serializations/SerializationVariantElement.cpp +++ b/src/DataTypes/Serializations/SerializationVariantElement.cpp @@ -45,8 +45,8 @@ void SerializationVariantElement::enumerateStreams( .withColumn(data.column ? removeNullableOrLowCardinalityNullable(data.column) : nullptr) .withSerializationInfo(data.serialization_info) .withDeserializeState(deserialize_state ? 
deserialize_state->variant_element_state : nullptr); - settings.path.back().data = data; - nested_serialization->enumerateStreams(settings, callback, data); + settings.path.back().data = nested_data; + nested_serialization->enumerateStreams(settings, callback, nested_data); removeVariantFromPath(settings.path); } diff --git a/src/Functions/dynamicElement.cpp b/src/Functions/dynamicElement.cpp index 6752a61b6c3..202533dc5c8 100644 --- a/src/Functions/dynamicElement.cpp +++ b/src/Functions/dynamicElement.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -65,7 +66,7 @@ public: getName(), arguments[0].type->getName()); - auto return_type = makeNullableOrLowCardinalityNullableSafe(getRequestedElementType(arguments[1].column)); + auto return_type = makeNullableOrLowCardinalityNullableSafe(getRequestedType(arguments[1].column)); for (; count_arrays; --count_arrays) return_type = std::make_shared(return_type); @@ -97,29 +98,18 @@ public: } const ColumnDynamic * input_col_as_dynamic = checkAndGetColumn(input_col); - if (!input_col_as_dynamic) + const DataTypeDynamic * input_type_as_dynamic = checkAndGetDataType(input_type); + if (!input_col_as_dynamic || !input_type_as_dynamic) throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "First argument for function {} must be Dynamic or array of Dynamics. Actual {}", getName(), input_arg.type->getName()); - auto element_type = getRequestedElementType(arguments[1].column); - const auto & variant_info = input_col_as_dynamic->getVariantInfo(); - auto it = variant_info.variant_name_to_discriminator.find(element_type->getName()); - if (it == variant_info.variant_name_to_discriminator.end()) - { - auto result_type = makeNullableOrLowCardinalityNullableSafe(element_type); - auto result_column = result_type->createColumn(); - result_column->insertManyDefaults(input_rows_count); - return wrapInArraysAndConstIfNeeded(std::move(result_column), array_offsets, input_arg_is_const, input_rows_count); - } - - const auto & variant_column = input_col_as_dynamic->getVariantColumn(); - auto subcolumn_creator = SerializationVariantElement::VariantSubcolumnCreator(variant_column.getLocalDiscriminatorsPtr(), element_type->getName(), it->second, variant_column.localDiscriminatorByGlobal(it->second)); - auto result_column = subcolumn_creator.create(variant_column.getVariantPtrByGlobalDiscriminator(it->second)); - return wrapInArraysAndConstIfNeeded(std::move(result_column), array_offsets, input_arg_is_const, input_rows_count); + auto type = getRequestedType(arguments[1].column); + auto subcolumn = input_type_as_dynamic->getSubcolumn(type->getName(), input_col_as_dynamic->getPtr()); + return wrapInArraysAndConstIfNeeded(std::move(subcolumn), array_offsets, input_arg_is_const, input_rows_count); } private: - DataTypePtr getRequestedElementType(const ColumnPtr & type_name_column) const + DataTypePtr getRequestedType(const ColumnPtr & type_name_column) const { const auto * name_col = checkAndGetColumnConst(type_name_column.get()); if (!name_col) diff --git a/src/Functions/dynamicType.cpp b/src/Functions/dynamicType.cpp index 8fb2974ceff..e8ca73597d6 100644 --- a/src/Functions/dynamicType.cpp +++ b/src/Functions/dynamicType.cpp @@ -21,7 +21,7 @@ extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; namespace { -/// Return enum with type name for each row in Dynamic column. +/// Return String with type name for each row in Dynamic column. 
class FunctionDynamicType : public IFunction { public: @@ -89,13 +89,21 @@ REGISTER_FUNCTION(DynamicType) Returns the variant type name for each row of `Dynamic` column. If row contains NULL, it returns 'None' for it. )", .syntax = {"dynamicType(variant)"}, - .arguments = {{"variant", "Variant column"}}, + .arguments = {{"dynamic", "Dynamic column"}}, .examples = {{{ "Example", R"( +CREATE TABLE test (d Dynamic) ENGINE = Memory; +INSERT INTO test VALUES (NULL), (42), ('Hello, World!'), ([1, 2, 3]); +SELECT d, dynamicType(d) FROM test; )", R"( - +┌─d─────────────┬─dynamicType(d)─┐ +│ ᴺᵁᴸᴸ │ None │ +│ 42 │ Int64 │ +│ Hello, World! │ String │ +│ [1,2,3] │ Array(Int64) │ +└───────────────┴────────────────┘ )"}}}, .categories{"Variant"}, }); diff --git a/src/Functions/variantElement.cpp b/src/Functions/variantElement.cpp index e63afc68b34..80d34083d9d 100644 --- a/src/Functions/variantElement.cpp +++ b/src/Functions/variantElement.cpp @@ -112,18 +112,15 @@ public: throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "First argument for function {} must be Variant or array of Variants. Actual {}", getName(), input_arg.type->getName()); - std::optional variant_global_discr = getVariantGlobalDiscriminator(arguments[1].column, *input_type_as_variant, arguments.size()); + auto variant_discr = getVariantGlobalDiscriminator(arguments[1].column, *input_type_as_variant, arguments.size()); - if (!variant_global_discr.has_value()) + if (!variant_discr) return arguments[2].column; - auto variant_local_discr = input_col_as_variant->localDiscriminatorByGlobal(*variant_global_discr); - const auto & variant_type = input_type_as_variant->getVariant(*variant_global_discr); - const auto & variant_column = input_col_as_variant->getVariantPtrByGlobalDiscriminator(*variant_global_discr); - auto subcolumn_creator = SerializationVariantElement::VariantSubcolumnCreator(input_col_as_variant->getLocalDiscriminatorsPtr(), variant_type->getName(), *variant_global_discr, variant_local_discr); - auto res = subcolumn_creator.create(variant_column); - return wrapInArraysAndConstIfNeeded(std::move(res), array_offsets, input_arg_is_const, input_rows_count); + auto variant_column = input_type_as_variant->getSubcolumn(input_type_as_variant->getVariant(*variant_discr)->getName(), input_col_as_variant->getPtr()); + return wrapInArraysAndConstIfNeeded(std::move(variant_column), array_offsets, input_arg_is_const, input_rows_count); } + private: std::optional getVariantGlobalDiscriminator(const ColumnPtr & index_column, const DataTypeVariant & variant_type, size_t argument_size) const { @@ -133,20 +130,16 @@ private: "Second argument to {} with Variant argument must be a constant String", getName()); - String variant_element_name = name_col->getValue(); - auto variant_element_type = DataTypeFactory::instance().tryGet(variant_element_name); - if (variant_element_type) + auto variant_element_name = name_col->getValue(); + if (auto variant_element_type = DataTypeFactory::instance().tryGet(variant_element_name)) { - const auto & variants = variant_type.getVariants(); - for (size_t i = 0; i != variants.size(); ++i) - { - if (variants[i]->getName() == variant_element_type->getName()) - return i; - } + if (auto discr = variant_type.tryGetVariantDiscriminator(variant_element_type->getName())) + return discr; } if (argument_size == 2) throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "{} doesn't contain variant with type {}", variant_type.getName(), variant_element_name); + return std::nullopt; } diff --git 
a/src/Interpreters/TreeRewriter.cpp b/src/Interpreters/TreeRewriter.cpp index a6cb378243a..a3c5a7ed3ed 100644 --- a/src/Interpreters/TreeRewriter.cpp +++ b/src/Interpreters/TreeRewriter.cpp @@ -2,7 +2,7 @@ #include #include -//#include +#include #include #include @@ -1188,27 +1188,22 @@ bool TreeRewriterResult::collectUsedColumns(const ASTPtr & query, bool is_select } } + /// Check for dynamic subcolums in unknown required columns. if (!unknown_required_source_columns.empty()) { - for (const NameAndTypePair & pair : source_columns_ordinary) { -// std::cerr << "Check ordinary column " << pair.name << "\n"; if (!pair.type->hasDynamicSubcolumns()) continue; -// std::cerr << "Check dyamic subcolumns\n"; - for (auto it = unknown_required_source_columns.begin(); it != unknown_required_source_columns.end();) { auto [column_name, dynamic_subcolumn_name] = Nested::splitName(*it); -// std::cerr << "Check dyamic subcolumn " << dynamic_subcolumn_name << "\n"; if (column_name == pair.name) { if (auto dynamic_subcolumn_type = pair.type->tryGetSubcolumnType(dynamic_subcolumn_name)) { -// std::cerr << "Found\n"; source_columns.emplace_back(*it, dynamic_subcolumn_type); it = unknown_required_source_columns.erase(it); continue; diff --git a/src/Interpreters/convertFieldToType.cpp b/src/Interpreters/convertFieldToType.cpp index 30b7de409f1..9363e3d83eb 100644 --- a/src/Interpreters/convertFieldToType.cpp +++ b/src/Interpreters/convertFieldToType.cpp @@ -27,7 +27,6 @@ #include #include #include -#include namespace DB @@ -167,8 +166,6 @@ Field convertDecimalType(const Field & from, const To & type) Field convertFieldToTypeImpl(const Field & src, const IDataType & type, const IDataType * from_type_hint) { - checkStackSize(); - if (from_type_hint && from_type_hint->equals(type)) { return src; diff --git a/src/Parsers/ParserDataType.cpp b/src/Parsers/ParserDataType.cpp index 747a9a6f7ba..573430ae9ab 100644 --- a/src/Parsers/ParserDataType.cpp +++ b/src/Parsers/ParserDataType.cpp @@ -7,12 +7,14 @@ #include #include + namespace DB { namespace { +/// Parser of Dynamic type arguments: Dynamic(max_types=N) class DynamicArgumentsParser : public IParserBase { private: @@ -47,7 +49,8 @@ private: /// - Nested table elements; /// - Enum element in form of 'a' = 1; /// - literal; -/// - another data type (or identifier) +/// - Dynamic type arguments; +/// - another data type (or identifier); class ParserDataTypeArgument : public IParserBase { public: diff --git a/src/Processors/Formats/IOutputFormat.h b/src/Processors/Formats/IOutputFormat.h index 9996bedb20e..cae2ab7691e 100644 --- a/src/Processors/Formats/IOutputFormat.h +++ b/src/Processors/Formats/IOutputFormat.h @@ -105,8 +105,6 @@ public: } } - virtual void finalizeBuffers() {} - protected: friend class ParallelFormattingOutputFormat; @@ -124,6 +122,7 @@ protected: virtual void consumeTotals(Chunk) {} virtual void consumeExtremes(Chunk) {} virtual void finalizeImpl() {} + virtual void finalizeBuffers() {} virtual void writePrefix() {} virtual void writeSuffix() {} virtual void resetFormatterImpl() {} diff --git a/src/Processors/Merges/Algorithms/MergedData.h b/src/Processors/Merges/Algorithms/MergedData.h index 95f915e4478..c5bb074bb0c 100644 --- a/src/Processors/Merges/Algorithms/MergedData.h +++ b/src/Processors/Merges/Algorithms/MergedData.h @@ -99,6 +99,9 @@ public: { columns[i] = columns[i]->cloneResized(num_rows); } + /// For columns with Dynamic structure we cannot just take column from input chunk because resulting column may have + /// different Dynamic 
structure (and have some merge statistics after calling takeDynamicStructureFromSourceColumns). + /// We should insert into data resulting column using insertRangeFrom. else if (columns[i]->hasDynamicStructure()) { columns[i] = columns[i]->cloneEmpty(); diff --git a/src/Processors/Transforms/ColumnGathererTransform.cpp b/src/Processors/Transforms/ColumnGathererTransform.cpp index 6736cd59e83..b6bcec26c0c 100644 --- a/src/Processors/Transforms/ColumnGathererTransform.cpp +++ b/src/Processors/Transforms/ColumnGathererTransform.cpp @@ -60,6 +60,9 @@ IMergingAlgorithm::Status ColumnGathererStream::merge() if (source_to_fully_copy) /// Was set on a previous iteration { Chunk res; + /// For columns with Dynamic structure we cannot just take column source_to_fully_copy because resulting column may have + /// different Dynamic structure (and have some merge statistics after calling takeDynamicStructureFromSourceColumns). + /// We should insert into data resulting column using insertRangeFrom. if (result_column->hasDynamicStructure()) { auto col = result_column->cloneEmpty(); diff --git a/src/Storages/ColumnsDescription.cpp b/src/Storages/ColumnsDescription.cpp index 6f844e31970..3a3ee0d1d14 100644 --- a/src/Storages/ColumnsDescription.cpp +++ b/src/Storages/ColumnsDescription.cpp @@ -550,6 +550,7 @@ bool ColumnsDescription::hasSubcolumn(const String & column_name) const if (subcolumns.get<0>().count(column_name)) return true; + /// Check for dynamic subcolumns auto [ordinary_column_name, dynamic_subcolumn_name] = Nested::splitName(column_name); auto it = columns.get<1>().find(ordinary_column_name); if (it != columns.get<1>().end() && it->type->hasDynamicSubcolumns()) @@ -655,6 +656,7 @@ std::optional ColumnsDescription::tryGetColumn(const GetColumns return *jt; } + /// Check for dynmaic subcolumns. auto [ordinary_column_name, dynamic_subcolumn_name] = Nested::splitName(column_name); it = columns.get<1>().find(ordinary_column_name); if (it != columns.get<1>().end() && it->type->hasDynamicSubcolumns()) @@ -752,6 +754,7 @@ bool ColumnsDescription::hasColumnOrSubcolumn(GetColumnsOptions::Kind kind, cons if ((it != columns.get<1>().end() && (defaultKindToGetKind(it->default_desc.kind) & kind)) || hasSubcolumn(column_name)) return true; + /// Check for dynamic subcolumns. auto [ordinary_column_name, dynamic_subcolumn_name] = Nested::splitName(column_name); it = columns.get<1>().find(ordinary_column_name); if (it != columns.get<1>().end() && it->type->hasDynamicSubcolumns()) diff --git a/src/Storages/MergeTree/MergeTreeReaderWide.cpp b/src/Storages/MergeTree/MergeTreeReaderWide.cpp index 64ca6132cc4..de6b742934f 100644 --- a/src/Storages/MergeTree/MergeTreeReaderWide.cpp +++ b/src/Storages/MergeTree/MergeTreeReaderWide.cpp @@ -1,5 +1,4 @@ #include -#include #include #include diff --git a/src/Storages/MergeTree/MergeTreeSettings.h b/src/Storages/MergeTree/MergeTreeSettings.h index 375c1e37bae..a00508fd1c1 100644 --- a/src/Storages/MergeTree/MergeTreeSettings.h +++ b/src/Storages/MergeTree/MergeTreeSettings.h @@ -43,7 +43,6 @@ struct Settings; M(UInt64, compact_parts_max_granules_to_buffer, 128, "Only available in ClickHouse Cloud", 0) \ M(UInt64, compact_parts_merge_max_bytes_to_prefetch_part, 16 * 1024 * 1024, "Only available in ClickHouse Cloud", 0) \ M(Bool, load_existing_rows_count_for_old_parts, false, "Whether to load existing_rows_count for existing parts. 
If false, existing_rows_count will be equal to rows_count for existing parts.", 0) \ - /** M(UInt64, max_types_for_dynamic_serialization, 32, "The maximum number of different types in Dynamic column stored separately in MergeTree tables in wide format. If exceeded, new types will be converted to String", 0) */ \ \ /** Merge settings. */ \ M(UInt64, merge_max_block_size, 8192, "How many rows in blocks should be formed for merge operations. By default has the same value as `index_granularity`.", 0) \ diff --git a/tests/queries/0_stateless/02941_variant_type_4.sh b/tests/queries/0_stateless/02941_variant_type_4.sh index f6eaf2fcc9a..ddff3852865 100755 --- a/tests/queries/0_stateless/02941_variant_type_4.sh +++ b/tests/queries/0_stateless/02941_variant_type_4.sh @@ -7,7 +7,7 @@ CLICKHOUSE_LOG_COMMENT= # shellcheck source=../shell_config.sh . "$CUR_DIR"/../shell_config.sh -CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_variant_type=1 --allow_suspicious_variant_types=1" +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_merge_tree_settings --allow_experimental_variant_type=1 --allow_suspicious_variant_types=1 --max_insert_threads 0 --group_by_two_level_threshold 454338 --group_by_two_level_threshold_bytes 50000000 --distributed_aggregation_memory_efficient 1 --fsync_metadata 0 --output_format_parallel_formatting 0 --input_format_parallel_parsing 1 --min_chunk_bytes_for_parallel_parsing 10898151 --max_read_buffer_size 730200 --prefer_localhost_replica 1 --max_block_size 77643 --max_threads 18 --optimize_append_index 0 --optimize_if_chain_to_multiif 0 --optimize_if_transform_strings_to_enum 0 --optimize_read_in_order 0 --optimize_or_like_chain 0 --optimize_substitute_columns 0 --enable_multiple_prewhere_read_steps 0 --read_in_order_two_level_merge_threshold 20 --optimize_aggregation_in_order 1 --aggregation_in_order_max_block_bytes 39857781 --use_uncompressed_cache 1 --min_bytes_to_use_direct_io 1 --min_bytes_to_use_mmap_io 10737418240 --local_filesystem_read_method io_uring --remote_filesystem_read_method threadpool --local_filesystem_read_prefetch 1 --filesystem_cache_segments_batch_size 10 --read_from_filesystem_cache_if_exists_otherwise_bypass_cache 1 --throw_on_error_from_cache_on_write_operations 1 --remote_filesystem_read_prefetch 0 --allow_prefetched_read_pool_for_remote_filesystem 0 --filesystem_prefetch_max_memory_usage 128Mi --filesystem_prefetches_limit 0 --filesystem_prefetch_min_bytes_for_single_read_task 8Mi --filesystem_prefetch_step_marks 0 --filesystem_prefetch_step_bytes 100Mi --compile_aggregate_expressions 0 --compile_sort_description 0 --merge_tree_coarse_index_granularity 30 --optimize_distinct_in_order 1 --max_bytes_before_external_sort 10737418240 --max_bytes_before_external_group_by 1 --max_bytes_before_remerge_sort 2279999838 --min_compress_block_size 56847 --max_compress_block_size 2399536 --merge_tree_compact_parts_min_granules_to_multibuffer_read 39 --optimize_sorting_by_input_stream_properties 1 --http_response_buffer_size 2739586 --http_wait_end_of_query False --enable_memory_bound_merging_of_aggregation_results 1 --min_count_to_compile_expression 3 --min_count_to_compile_aggregate_expression 0 --min_count_to_compile_sort_description 3 --session_timezone America/Mazatlan --prefer_warmed_unmerged_parts_seconds 7 --use_page_cache_for_disks_without_file_cache False --page_cache_inject_eviction True --merge_tree_read_split_ranges_into_intersecting_and_non_intersecting_injection_probability 0.19 --ratio_of_defaults_for_sparse_serialization 0.0 
--prefer_fetch_merged_part_size_threshold 1 --vertical_merge_algorithm_min_rows_to_activate 389696 --vertical_merge_algorithm_min_columns_to_activate 100 --allow_vertical_merges_from_compact_to_wide_parts 0 --min_merge_bytes_to_use_direct_io 10737418240 --index_granularity_bytes 16233524 --merge_max_block_size 6455 --index_granularity 16034 --min_bytes_for_wide_part 0 --compress_marks 0 --compress_primary_key 0 --marks_compress_block_size 15959 --primary_key_compress_block_size 70269 --replace_long_file_name_to_hash 1 --max_file_name_length 123 --min_bytes_for_full_part_storage 0 --compact_parts_max_bytes_to_buffer 511937149 --compact_parts_max_granules_to_buffer 142 --compact_parts_merge_max_bytes_to_prefetch_part 28443027 --cache_populated_by_fetch 0 --concurrent_part_removal_threshold 0 --old_parts_lifetime 480" function test6_insert() { diff --git a/tests/queries/0_stateless/03038_nested_dynamic_merges.reference b/tests/queries/0_stateless/03038_nested_dynamic_merges.reference index f8118ce8b95..65034647775 100644 --- a/tests/queries/0_stateless/03038_nested_dynamic_merges.reference +++ b/tests/queries/0_stateless/03038_nested_dynamic_merges.reference @@ -2,8 +2,8 @@ MergeTree compact + horizontal merge test 16667 Tuple(a Dynamic(max_types=3)):Date 33333 Tuple(a Dynamic(max_types=3)):Array(UInt8) -50000 Tuple(a Dynamic(max_types=3)):UInt64 50000 Tuple(a Dynamic(max_types=3)):String +50000 Tuple(a Dynamic(max_types=3)):UInt64 100000 UInt64:None 33333 Tuple(a Dynamic(max_types=3)):Array(UInt8) 50000 Tuple(a Dynamic(max_types=3)):UInt64 @@ -25,8 +25,8 @@ MergeTree wide + horizontal merge test 16667 Tuple(a Dynamic(max_types=3)):Date 33333 Tuple(a Dynamic(max_types=3)):Array(UInt8) -50000 Tuple(a Dynamic(max_types=3)):UInt64 50000 Tuple(a Dynamic(max_types=3)):String +50000 Tuple(a Dynamic(max_types=3)):UInt64 100000 UInt64:None 33333 Tuple(a Dynamic(max_types=3)):Array(UInt8) 50000 Tuple(a Dynamic(max_types=3)):UInt64 @@ -40,8 +40,8 @@ test 100000 UInt64:None 133333 Tuple(a Dynamic(max_types=3)):None 50000 Tuple(a Dynamic(max_types=3)):UInt64 -100000 UInt64:None 100000 Tuple(a Dynamic(max_types=3)):Tuple(UInt64) +100000 UInt64:None 116667 Tuple(a Dynamic(max_types=3)):String 133333 Tuple(a Dynamic(max_types=3)):None MergeTree compact + vertical merge @@ -59,8 +59,8 @@ test 33333 Tuple(a Dynamic(max_types=3)):Array(UInt8) 50000 Tuple(a Dynamic(max_types=3)):UInt64 66667 Tuple(a Dynamic(max_types=3)):String -100000 UInt64:None 100000 Tuple(a Dynamic(max_types=3)):Tuple(UInt64) +100000 UInt64:None 133333 Tuple(a Dynamic(max_types=3)):None 50000 Tuple(a Dynamic(max_types=3)):UInt64 100000 Tuple(a Dynamic(max_types=3)):Tuple(UInt64) @@ -86,7 +86,7 @@ test 100000 UInt64:None 133333 Tuple(a Dynamic(max_types=3)):None 50000 Tuple(a Dynamic(max_types=3)):UInt64 -100000 UInt64:None 100000 Tuple(a Dynamic(max_types=3)):Tuple(UInt64) +100000 UInt64:None 116667 Tuple(a Dynamic(max_types=3)):String 133333 Tuple(a Dynamic(max_types=3)):None diff --git a/tests/queries/0_stateless/03038_nested_dynamic_merges.sh b/tests/queries/0_stateless/03038_nested_dynamic_merges.sh index afb167ec20d..b82ddb3813e 100755 --- a/tests/queries/0_stateless/03038_nested_dynamic_merges.sh +++ b/tests/queries/0_stateless/03038_nested_dynamic_merges.sh @@ -18,16 +18,16 @@ function test() $CH_CLIENT -q "insert into test select number, tuple(if(number % 2 == 0, number, 'str_' || toString(number)))::Tuple(a Dynamic(max_types=3)) from numbers(100000)" $CH_CLIENT -q "insert into test select number, tuple(if(number % 3 == 0, 
toDate(number), range(number % 10)))::Tuple(a Dynamic(max_types=3)) from numbers(50000)" - $CH_CLIENT -q "select count(), dynamicType(d) || ':' || dynamicType(d.\`Tuple(a Dynamic(max_types=3))\`.a) as type from test group by type order by count()" + $CH_CLIENT -q "select count(), dynamicType(d) || ':' || dynamicType(d.\`Tuple(a Dynamic(max_types=3))\`.a) as type from test group by type order by count(), type" $CH_CLIENT -nm -q "system start merges test; optimize table test final;" - $CH_CLIENT -q "select count(), dynamicType(d) || ':' || dynamicType(d.\`Tuple(a Dynamic(max_types=3))\`.a) as type from test group by type order by count()" + $CH_CLIENT -q "select count(), dynamicType(d) || ':' || dynamicType(d.\`Tuple(a Dynamic(max_types=3))\`.a) as type from test group by type order by count(), type" $CH_CLIENT -q "insert into test select number, tuple(if(number % 3 == 0, toDateTime(number), NULL))::Tuple(a Dynamic(max_types=3)) from numbers(50000)" $CH_CLIENT -q "insert into test select number, tuple(if(number % 2 == 0, tuple(number), NULL))::Tuple(a Dynamic(max_types=3)) from numbers(200000)" - $CH_CLIENT -q "select count(), dynamicType(d) || ':' || dynamicType(d.\`Tuple(a Dynamic(max_types=3))\`.a) as type from test group by type order by count()" + $CH_CLIENT -q "select count(), dynamicType(d) || ':' || dynamicType(d.\`Tuple(a Dynamic(max_types=3))\`.a) as type from test group by type order by count(), type" $CH_CLIENT -nm -q "system start merges test; optimize table test final;" - $CH_CLIENT -q "select count(), dynamicType(d) || ':' || dynamicType(d.\`Tuple(a Dynamic(max_types=3))\`.a) as type from test group by type order by count()" + $CH_CLIENT -q "select count(), dynamicType(d) || ':' || dynamicType(d.\`Tuple(a Dynamic(max_types=3))\`.a) as type from test group by type order by count(), type" } $CH_CLIENT -q "drop table if exists test;" diff --git a/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_1.sh b/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_1.sh index 3384a135307..9298fe28fec 100755 --- a/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_1.sh +++ b/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_1.sh @@ -18,9 +18,9 @@ function test() $CH_CLIENT -q "insert into test select number, number from numbers(100000)" $CH_CLIENT -q "insert into test select number, 'str_' || toString(number) from numbers(50000, 100000)" - $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count()" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" $CH_CLIENT -nm -q "system start merges test; optimize table test final" - $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count()" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" $CH_CLIENT -q "drop table test" echo "SummingMergeTree" @@ -29,10 +29,10 @@ function test() $CH_CLIENT -q "insert into test select number, 1, number from numbers(100000)" $CH_CLIENT -q "insert into test select number, 1, 'str_' || toString(number) from numbers(50000, 100000)" - $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count()" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" $CH_CLIENT -q "select count(), sum from test group by sum" $CH_CLIENT -nm -q "system start merges test; optimize table test final" - 
$CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count()" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" $CH_CLIENT -q "select count(), sum from test group by sum" $CH_CLIENT -q "drop table test" @@ -42,10 +42,10 @@ function test() $CH_CLIENT -q "insert into test select number, sumState(1::UInt64), number from numbers(100000) group by number" $CH_CLIENT -q "insert into test select number, sumState(1::UInt64), 'str_' || toString(number) from numbers(50000, 100000) group by number" - $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count()" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" $CH_CLIENT -q "select count(), sum from (select sumMerge(sum) as sum from test group by id, _part) group by sum" $CH_CLIENT -nm -q "system start merges test; optimize table test final" - $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count()" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" $CH_CLIENT -q "select count(), sum from (select sumMerge(sum) as sum from test group by id, _part) group by sum" $CH_CLIENT -q "drop table test" } From c9b019d392c4fa3e2f25a2921383711fc2c93ce5 Mon Sep 17 00:00:00 2001 From: avogar Date: Tue, 30 Apr 2024 18:46:38 +0000 Subject: [PATCH 090/392] Mark ColumnDynamic constructor explicit --- src/Columns/ColumnDynamic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Columns/ColumnDynamic.h b/src/Columns/ColumnDynamic.h index 4e9c7edd5f9..c6626433877 100644 --- a/src/Columns/ColumnDynamic.h +++ b/src/Columns/ColumnDynamic.h @@ -52,7 +52,7 @@ private: std::unordered_map variant_name_to_discriminator; }; - ColumnDynamic(size_t max_dynamic_types_); + explicit ColumnDynamic(size_t max_dynamic_types_); ColumnDynamic(MutableColumnPtr variant_column_, const VariantInfo & variant_info_, size_t max_dynamic_types_, const Statistics & statistics_ = {}); public: From 3b9f593524ba27105864464f41d8b3e858d163f9 Mon Sep 17 00:00:00 2001 From: avogar Date: Tue, 30 Apr 2024 19:00:32 +0000 Subject: [PATCH 091/392] Fix type in code, add more docs --- docs/en/sql-reference/data-types/dynamic.md | 256 +++++++++++++++++++- src/Storages/ColumnsDescription.cpp | 2 +- 2 files changed, 256 insertions(+), 2 deletions(-) diff --git a/docs/en/sql-reference/data-types/dynamic.md b/docs/en/sql-reference/data-types/dynamic.md index e20bdad1e79..e3cade25b55 100644 --- a/docs/en/sql-reference/data-types/dynamic.md +++ b/docs/en/sql-reference/data-types/dynamic.md @@ -106,6 +106,7 @@ SELECT toTypeName(d.String), toTypeName(d.Int64), toTypeName(d.`Array(Int64)`), ```sql SELECT d, dynamicType(d), dynamicElement(d, 'String'), dynamicElement(d, 'Int64'), dynamicElement(d, 'Array(Int64)'), dynamicElement(d, 'Date'), dynamicElement(d, 'Array(String)') FROM test;``` +``` ```text ┌─d─────────────┬─dynamicType(d)─┬─dynamicElement(d, 'String')─┬─dynamicElement(d, 'Int64')─┬─dynamicElement(d, 'Array(Int64)')─┬─dynamicElement(d, 'Date')─┬─dynamicElement(d, 'Array(String)')─┐ @@ -139,7 +140,7 @@ SELECT dynamicType(d) from test; There are 4 possible conversions that can be performed with `Dynamic` column. 
-### Converting an ordinary column to a Variant column +### Converting an ordinary column to a Dynamic column ```sql SELECT 'Hello, World!'::Dynamic as d, dynamicType(d); @@ -151,7 +152,260 @@ SELECT 'Hello, World!'::Dynamic as d, dynamicType(d); └───────────────┴────────────────┘ ``` +### Converting a String column to a Dynamic column through parsing +To parse `Dynamic` type values from a `String` column you can enable the setting `cast_string_to_dynamic_use_inference`: +```sql +SET cast_string_to_dynamic_use_inference = 1; +SELECT CAST(materialize(map('key1', '42', 'key2', 'true', 'key3', '2020-01-01')), 'Map(String, Dynamic)') as map_of_dynamic, mapApply((k, v) -> (k, dynamicType(v)), map_of_dynamic) as map_of_dynamic_types; +``` +```text +┌─map_of_dynamic──────────────────────────────┬─map_of_dynamic_types─────────────────────────┐ +│ {'key1':42,'key2':true,'key3':'2020-01-01'} │ {'key1':'Int64','key2':'Bool','key3':'Date'} │ +└─────────────────────────────────────────────┴──────────────────────────────────────────────┘ +``` + +### Converting a Dynamic column to an ordinary column + +It is possible to convert a `Dynamic` column to an ordinary column. In this case all nested types will be converted to the destination type: + +```sql +CREATE TABLE test (d Dynamic) ENGINE = Memory; +INSERT INTO test VALUES (NULL), (42), ('42.42'), (true), ('e10'); +SELECT d::Nullable(Float64) FROM test; +``` + +```text +┌─CAST(d, 'Nullable(Float64)')─┐ +│ ᴺᵁᴸᴸ │ +│ 42 │ +│ 42.42 │ +│ 1 │ +│ 0 │ +└──────────────────────────────┘ +``` + +### Converting a Variant column to a Dynamic column + +```sql +CREATE TABLE test (v Variant(UInt64, String, Array(UInt64))) ENGINE = Memory; +INSERT INTO test VALUES (NULL), (42), ('String'), ([1, 2, 3]); +SELECT v::Dynamic as d, dynamicType(d) from test; +``` + +```text +┌─d───────┬─dynamicType(d)─┐ +│ ᴺᵁᴸᴸ │ None │ +│ 42 │ UInt64 │ +│ String │ String │ +│ [1,2,3] │ Array(UInt64) │ +└─────────┴────────────────┘ +``` + +### Converting a Dynamic(max_types=N) column to another Dynamic(max_types=K) + +If `K >= N`, the data doesn't change during the conversion: + +```sql +CREATE TABLE test (d Dynamic(max_types=3)) ENGINE = Memory; +INSERT INTO test VALUES (NULL), (42), (43), ('42.42'), (true); +SELECT d::Dynamic(max_types=5) as d2, dynamicType(d2) FROM test; +``` + +```text +┌─d2────┬─dynamicType(d2)─┐ +│ ᴺᵁᴸᴸ │ None │ +│ 42 │ Int64 │ +│ 43 │ Int64 │ +│ 42.42 │ String │ +│ true │ Bool │ +└───────┴─────────────────┘ +``` + +If `K < N`, then the values with the rarest types are converted to `String`: +```sql +CREATE TABLE test (d Dynamic(max_types=4)) ENGINE = Memory; +INSERT INTO test VALUES (NULL), (42), (43), ('42.42'), (true), ([1, 2, 3]); +SELECT d, dynamicType(d), d::Dynamic(max_types=2) as d2, dynamicType(d2) FROM test; +``` + +```text +┌─d───────┬─dynamicType(d)─┬─d2──────┬─dynamicType(d2)─┐ +│ ᴺᵁᴸᴸ │ None │ ᴺᵁᴸᴸ │ None │ +│ 42 │ Int64 │ 42 │ Int64 │ +│ 43 │ Int64 │ 43 │ Int64 │ +│ 42.42 │ String │ 42.42 │ String │ +│ true │ Bool │ true │ String │ +│ [1,2,3] │ Array(Int64) │ [1,2,3] │ String │ +└─────────┴────────────────┴─────────┴─────────────────┘ +``` + +If `K=1`, all types are converted to `String`: + +```sql +CREATE TABLE test (d Dynamic(max_types=4)) ENGINE = Memory; +INSERT INTO test VALUES (NULL), (42), (43), ('42.42'), (true), ([1, 2, 3]); +SELECT d, dynamicType(d), d::Dynamic(max_types=1) as d2, dynamicType(d2) FROM test; +``` + +```text +┌─d───────┬─dynamicType(d)─┬─d2──────┬─dynamicType(d2)─┐ +│ ᴺᵁᴸᴸ │ None │ ᴺᵁᴸᴸ │ None │ +│ 42 │ Int64 │ 42 │ String │ +│ 43 │ Int64
│ 43 │ String │ +│ 42.42 │ String │ 42.42 │ String │ +│ true │ Bool │ true │ String │ +│ [1,2,3] │ Array(Int64) │ [1,2,3] │ String │ +└─────────┴────────────────┴─────────┴─────────────────┘ +``` + +## Reading Dynamic type from the data + +All text formats (TSV, CSV, CustomSeparated, Values, JSONEachRow, etc) support reading the `Dynamic` type. During data parsing ClickHouse tries to infer the type of each value and uses it during insertion into the `Dynamic` column. + +Example: + +```sql +SELECT + d, + dynamicType(d), + dynamicElement(d, 'String') AS str, + dynamicElement(d, 'Int64') AS num, + dynamicElement(d, 'Float64') AS float, + dynamicElement(d, 'Date') AS date, + dynamicElement(d, 'Array(Int64)') AS arr +FROM format(JSONEachRow, 'd Dynamic', $$ +{"d" : "Hello, World!"}, +{"d" : 42}, +{"d" : 42.42}, +{"d" : "2020-01-01"}, +{"d" : [1, 2, 3]} +$$) +``` + +```text +┌─d─────────────┬─dynamicType(d)─┬─str───────────┬──num─┬─float─┬───────date─┬─arr─────┐ +│ Hello, World! │ String │ Hello, World! │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [] │ +│ 42 │ Int64 │ ᴺᵁᴸᴸ │ 42 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [] │ +│ 42.42 │ Float64 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 42.42 │ ᴺᵁᴸᴸ │ [] │ +│ 2020-01-01 │ Date │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 2020-01-01 │ [] │ +│ [1,2,3] │ Array(Int64) │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [1,2,3] │ +└───────────────┴────────────────┴───────────────┴──────┴───────┴────────────┴─────────┘ +``` + +## Comparing values of Dynamic type + +Values of the `Dynamic` type are compared similarly to values of the `Variant` type: +The result of operator `<` for values `d1` with underlying type `T1` and `d2` with underlying type `T2` of type `Dynamic` is defined as follows: +- If `T1 = T2 = T`, the result will be `d1.T < d2.T` (underlying values will be compared). +- If `T1 != T2`, the result will be `T1 < T2` (type names will be compared).
+ +Examples: +```sql +CREATE TABLE test (d1 Dynamic, d2 Dynamic) ENGINE=Memory; +INSERT INTO test VALUES (42, 42), (42, 43), (42, 'abc'), (42, [1, 2, 3]), (42, []), (42, NULL); +``` + +```sql +SELECT d2, dynamicType(d2) as d2_type from test order by d2; +``` + +```text +┌─d2──────┬─d2_type──────┐ +│ [] │ Array(Int64) │ +│ [1,2,3] │ Array(Int64) │ +│ 42 │ Int64 │ +│ 43 │ Int64 │ +│ abc │ String │ +│ ᴺᵁᴸᴸ │ None │ +└─────────┴──────────────┘ +``` + +```sql +SELECT d1, dynamicType(d1) as d1_type, d2, dynamicType(d2) as d2_type, d1 = d2, d1 < d2, d1 > d2 from test; +``` + +```text +┌─d1─┬─d1_type─┬─d2──────┬─d2_type──────┬─equals(d1, d2)─┬─less(d1, d2)─┬─greater(d1, d2)─┐ +│ 42 │ Int64 │ 42 │ Int64 │ 1 │ 0 │ 0 │ +│ 42 │ Int64 │ 43 │ Int64 │ 0 │ 1 │ 0 │ +│ 42 │ Int64 │ abc │ String │ 0 │ 1 │ 0 │ +│ 42 │ Int64 │ [1,2,3] │ Array(Int64) │ 0 │ 0 │ 1 │ +│ 42 │ Int64 │ [] │ Array(Int64) │ 0 │ 0 │ 1 │ +│ 42 │ Int64 │ ᴺᵁᴸᴸ │ None │ 0 │ 1 │ 0 │ +└────┴─────────┴─────────┴──────────────┴────────────────┴──────────────┴─────────────────┘ +``` + +If you need to find the row with a specific `Dynamic` value, you can do one of the following: + +- Cast the value to the `Dynamic` type: + +```sql +SELECT * FROM test WHERE d2 == [1,2,3]::Array(UInt32)::Dynamic; +``` + +```text +┌─d1─┬─d2──────┐ +│ 42 │ [1,2,3] │ +└────┴─────────┘ +``` + +- Compare the `Dynamic` subcolumn with the required type: + +```sql +SELECT * FROM test WHERE d2.`Array(Int64)` == [1,2,3] -- or using dynamicElement(d2, 'Array(Int64)') +``` + +```text +┌─d1─┬─d2──────┐ +│ 42 │ [1,2,3] │ +└────┴─────────┘ +``` + +Sometimes it can be useful to make an additional check on the dynamic type, because subcolumns with complex types like `Array/Map/Tuple` cannot be inside `Nullable` and will have default values instead of `NULL` on rows with different types: + +```sql +SELECT d2, d2.`Array(Int64)`, dynamicType(d2) FROM test WHERE d2.`Array(Int64)` == []; +``` + +```text +┌─d2───┬─d2.Array(Int64)──┬─dynamicType(d2)─┐ +│ 42 │ [] │ Int64 │ +│ 43 │ [] │ Int64 │ +│ abc │ [] │ String │ +│ [] │ [] │ Array(Int64) │ +│ ᴺᵁᴸᴸ │ [] │ None │ +└──────┴──────────────────┴─────────────────┘ +``` + +```sql +SELECT d2, d2.`Array(Int64)`, dynamicType(d2) FROM test WHERE dynamicType(d2) == 'Array(Int64)' AND d2.`Array(Int64)` == []; +``` + +```text +┌─d2─┬─d2.Array(Int64)──┬─dynamicType(d2)─┐ +│ [] │ [] │ Array(Int64) │ +└────┴──────────────────┴─────────────────┘ +``` + +**Note:** values of the `Dynamic` type with different numeric types are considered different values and are not compared with each other; their type names are compared instead. + +Example: + +```sql +CREATE TABLE test (d Dynamic) ENGINE=Memory; +INSERT INTO test VALUES (1::UInt32), (1::Int64), (100::UInt32), (100::Int64); +SELECT d, dynamicType(d) FROM test ORDER BY d; +``` + +```text +┌─d───┬─dynamicType(d)─┐ +│ 1 │ Int64 │ +│ 100 │ Int64 │ +│ 1 │ UInt32 │ +│ 100 │ UInt32 │ +└─────┴────────────────┘ +``` diff --git a/src/Storages/ColumnsDescription.cpp b/src/Storages/ColumnsDescription.cpp index 3a3ee0d1d14..4cf66649ad1 100644 --- a/src/Storages/ColumnsDescription.cpp +++ b/src/Storages/ColumnsDescription.cpp @@ -656,7 +656,7 @@ std::optional ColumnsDescription::tryGetColumn(const GetColumns return *jt; } - /// Check for dynmaic subcolumns. + /// Check for dynamic subcolumns.
auto [ordinary_column_name, dynamic_subcolumn_name] = Nested::splitName(column_name); it = columns.get<1>().find(ordinary_column_name); if (it != columns.get<1>().end() && it->type->hasDynamicSubcolumns()) From ff6fa4bf6e414caa7cd483a3155d38187ceaf3f5 Mon Sep 17 00:00:00 2001 From: serxa Date: Fri, 3 May 2024 17:03:16 +0000 Subject: [PATCH 092/392] fix unit tests for asyncloader --- src/Common/tests/gtest_async_loader.cpp | 36 ++++++++++++++++--------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/src/Common/tests/gtest_async_loader.cpp b/src/Common/tests/gtest_async_loader.cpp index 174997ddf14..304fa996934 100644 --- a/src/Common/tests/gtest_async_loader.cpp +++ b/src/Common/tests/gtest_async_loader.cpp @@ -262,7 +262,8 @@ TEST(AsyncLoader, CancelPendingJob) } catch (Exception & e) { - ASSERT_EQ(e.code(), ErrorCodes::ASYNC_LOAD_CANCELED); + ASSERT_EQ(e.code(), ErrorCodes::ASYNC_LOAD_WAIT_FAILED); + ASSERT_TRUE(e.message().contains("ASYNC_LOAD_CANCELED")); } } @@ -288,7 +289,8 @@ TEST(AsyncLoader, CancelPendingTask) } catch (Exception & e) { - ASSERT_TRUE(e.code() == ErrorCodes::ASYNC_LOAD_CANCELED); + ASSERT_EQ(e.code(), ErrorCodes::ASYNC_LOAD_WAIT_FAILED); + ASSERT_TRUE(e.message().contains("ASYNC_LOAD_CANCELED")); } try @@ -298,7 +300,8 @@ TEST(AsyncLoader, CancelPendingTask) } catch (Exception & e) { - ASSERT_TRUE(e.code() == ErrorCodes::ASYNC_LOAD_CANCELED); + ASSERT_EQ(e.code(), ErrorCodes::ASYNC_LOAD_WAIT_FAILED); + ASSERT_TRUE(e.message().contains("ASYNC_LOAD_CANCELED")); } } @@ -325,7 +328,8 @@ TEST(AsyncLoader, CancelPendingDependency) } catch (Exception & e) { - ASSERT_TRUE(e.code() == ErrorCodes::ASYNC_LOAD_CANCELED); + ASSERT_EQ(e.code(), ErrorCodes::ASYNC_LOAD_WAIT_FAILED); + ASSERT_TRUE(e.message().contains("ASYNC_LOAD_CANCELED")); } try @@ -335,7 +339,8 @@ TEST(AsyncLoader, CancelPendingDependency) } catch (Exception & e) { - ASSERT_TRUE(e.code() == ErrorCodes::ASYNC_LOAD_CANCELED); + ASSERT_EQ(e.code(), ErrorCodes::ASYNC_LOAD_WAIT_FAILED); + ASSERT_TRUE(e.message().contains("ASYNC_LOAD_CANCELED")); } } @@ -451,8 +456,9 @@ TEST(AsyncLoader, JobFailure) } catch (Exception & e) { - ASSERT_EQ(e.code(), ErrorCodes::ASYNC_LOAD_FAILED); - ASSERT_TRUE(e.message().find(error_message) != String::npos); + ASSERT_EQ(e.code(), ErrorCodes::ASYNC_LOAD_WAIT_FAILED); + ASSERT_TRUE(e.message().contains(error_message)); + ASSERT_TRUE(e.message().contains("ASYNC_LOAD_FAILED")); } } @@ -489,8 +495,9 @@ TEST(AsyncLoader, ScheduleJobWithFailedDependencies) } catch (Exception & e) { - ASSERT_EQ(e.code(), ErrorCodes::ASYNC_LOAD_CANCELED); - ASSERT_TRUE(e.message().find(error_message) != String::npos); + ASSERT_EQ(e.code(), ErrorCodes::ASYNC_LOAD_WAIT_FAILED); + ASSERT_TRUE(e.message().contains("ASYNC_LOAD_CANCELED")); + ASSERT_TRUE(e.message().contains(error_message)); } try { @@ -499,8 +506,9 @@ TEST(AsyncLoader, ScheduleJobWithFailedDependencies) } catch (Exception & e) { - ASSERT_EQ(e.code(), ErrorCodes::ASYNC_LOAD_CANCELED); - ASSERT_TRUE(e.message().find(error_message) != String::npos); + ASSERT_EQ(e.code(), ErrorCodes::ASYNC_LOAD_WAIT_FAILED); + ASSERT_TRUE(e.message().contains("ASYNC_LOAD_CANCELED")); + ASSERT_TRUE(e.message().contains(error_message)); } } @@ -531,7 +539,8 @@ TEST(AsyncLoader, ScheduleJobWithCanceledDependencies) } catch (Exception & e) { - ASSERT_EQ(e.code(), ErrorCodes::ASYNC_LOAD_CANCELED); + ASSERT_EQ(e.code(), ErrorCodes::ASYNC_LOAD_WAIT_FAILED); + ASSERT_TRUE(e.message().contains("ASYNC_LOAD_CANCELED")); } try { @@ -540,7 +549,8 @@ TEST(AsyncLoader, 
ScheduleJobWithCanceledDependencies) } catch (Exception & e) { - ASSERT_EQ(e.code(), ErrorCodes::ASYNC_LOAD_CANCELED); + ASSERT_EQ(e.code(), ErrorCodes::ASYNC_LOAD_WAIT_FAILED); + ASSERT_TRUE(e.message().contains("ASYNC_LOAD_CANCELED")); } } From c90e04ed4be9c6f8cf274eabf9f0d10c27102c83 Mon Sep 17 00:00:00 2001 From: serxa Date: Mon, 6 May 2024 11:40:45 +0000 Subject: [PATCH 093/392] fix tests build --- src/Common/tests/gtest_async_loader.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Common/tests/gtest_async_loader.cpp b/src/Common/tests/gtest_async_loader.cpp index 304fa996934..9fda58b9008 100644 --- a/src/Common/tests/gtest_async_loader.cpp +++ b/src/Common/tests/gtest_async_loader.cpp @@ -35,6 +35,7 @@ namespace DB::ErrorCodes extern const int ASYNC_LOAD_CYCLE; extern const int ASYNC_LOAD_FAILED; extern const int ASYNC_LOAD_CANCELED; + extern const int ASYNC_LOAD_WAIT_FAILED; } struct Initializer { From 936f94d286f50133cf12ba449245502769a22e40 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Tue, 7 May 2024 14:40:45 +0200 Subject: [PATCH 094/392] Add print --- utils/keeper-bench/Runner.cpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/utils/keeper-bench/Runner.cpp b/utils/keeper-bench/Runner.cpp index 8b111f5adb9..a893dac3851 100644 --- a/utils/keeper-bench/Runner.cpp +++ b/utils/keeper-bench/Runner.cpp @@ -635,11 +635,14 @@ struct ZooKeeperRequestFromLogReader break; } case Coordination::OpNum::Check: + case Coordination::OpNum::CheckNotExists: { auto check_request = std::make_shared(); check_request->path = current_block->getPath(idx_in_block); if (auto version = current_block->getVersion(idx_in_block)) check_request->version = *version; + if (op_num == Coordination::OpNum::CheckNotExists) + check_request->not_exists = true; request_from_log.request = check_request; break; } @@ -868,10 +871,20 @@ void Runner::runBenchmarkFromLog() } ZooKeeperRequestFromLogReader request_reader(input_request_log, global_context); + + delay_watch.restart(); while (auto request_from_log = request_reader.getNextRequest()) { request_from_log->connection = get_zookeeper_connection(request_from_log->session_id); push_request(std::move(*request_from_log)); + + if (delay > 0 && delay_watch.elapsedSeconds() > delay) + { + dumpStats("Write", stats.write_requests); + dumpStats("Read", stats.read_requests); + std::cerr << std::endl; + delay_watch.restart(); + } } } From 412805c99e0e789d7bc13dcb73fdf8199758ad2a Mon Sep 17 00:00:00 2001 From: Danila Puzov Date: Thu, 9 May 2024 19:38:19 +0300 Subject: [PATCH 095/392] Add serial, generateSnowflakeID, generateUUIDv7 functions --- src/Functions/generateSnowflakeID.cpp | 92 ++++++++++++++ src/Functions/generateUUIDv7.cpp | 113 +++++++++++++++++ src/Functions/serial.cpp | 171 ++++++++++++++++++++++++++ 3 files changed, 376 insertions(+) create mode 100644 src/Functions/generateSnowflakeID.cpp create mode 100644 src/Functions/generateUUIDv7.cpp create mode 100644 src/Functions/serial.cpp diff --git a/src/Functions/generateSnowflakeID.cpp b/src/Functions/generateSnowflakeID.cpp new file mode 100644 index 00000000000..e54b720ec98 --- /dev/null +++ b/src/Functions/generateSnowflakeID.cpp @@ -0,0 +1,92 @@ +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; +} + +class FunctionSnowflakeID : public IFunction +{ +private: + mutable std::atomic machine_sequence_number{0}; + mutable std::atomic last_timestamp{0}; + +public: + static constexpr auto name = 
"generateSnowflakeID"; + + static FunctionPtr create(ContextPtr /*context*/) + { + return std::make_shared(); + } + + String getName() const override { return name; } + size_t getNumberOfArguments() const override { return 0; } + + bool isDeterministicInScopeOfQuery() const override { return false; } + bool useDefaultImplementationForNulls() const override { return false; } + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } + bool isVariadic() const override { return true; } + + bool isStateful() const override { return true; } + bool isDeterministic() const override { return false; } + + DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override + { + if (arguments.size() > 1) { + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Number of arguments for function {} doesn't match: passed {}, should be 0 or 1.", + getName(), arguments.size()); + } + + return std::make_shared(); + } + + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & /*arguments*/, const DataTypePtr &, size_t input_rows_count) const override + { + auto col_res = ColumnVector::create(); + typename ColumnVector::Container & vec_to = col_res->getData(); + size_t size = input_rows_count; + vec_to.resize(size); + + auto serverUUID = ServerUUID::get(); + + // hash serverUUID into 32 bytes + Int64 h = UUIDHelpers::getHighBytes(serverUUID); + Int64 l = UUIDHelpers::getLowBytes(serverUUID); + Int64 machine_id = (h * 11) ^ (l * 17); + + for (Int64 & x : vec_to) { + const auto tm_point = std::chrono::system_clock::now(); + Int64 current_timestamp = std::chrono::duration_cast( + tm_point.time_since_epoch()).count(); + + Int64 local_machine_sequence_number = 0; + + if (current_timestamp != last_timestamp.load()) { + machine_sequence_number.store(0); + last_timestamp.store(current_timestamp); + } else { + local_machine_sequence_number = machine_sequence_number.fetch_add(1) + 1; + } + + x = (current_timestamp << 22) | (machine_id & 0x3ff000ull) | (local_machine_sequence_number & 0xfffull); + } + + return col_res; + } + +}; + +REGISTER_FUNCTION(GenerateSnowflakeID) +{ + factory.registerFunction(); +} + +} diff --git a/src/Functions/generateUUIDv7.cpp b/src/Functions/generateUUIDv7.cpp new file mode 100644 index 00000000000..61d742d2fda --- /dev/null +++ b/src/Functions/generateUUIDv7.cpp @@ -0,0 +1,113 @@ +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; +} + +#define DECLARE_SEVERAL_IMPLEMENTATIONS(...) 
\ +DECLARE_DEFAULT_CODE (__VA_ARGS__) \ +DECLARE_AVX2_SPECIFIC_CODE(__VA_ARGS__) + +DECLARE_SEVERAL_IMPLEMENTATIONS( + +class FunctionGenerateUUIDv7 : public IFunction +{ +public: + static constexpr auto name = "generateUUIDv7"; + + String getName() const override + { + return name; + } + + size_t getNumberOfArguments() const override { return 0; } + + bool isDeterministicInScopeOfQuery() const override { return false; } + bool useDefaultImplementationForNulls() const override { return false; } + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } + bool isVariadic() const override { return true; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + if (arguments.size() > 1) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Number of arguments for function {} doesn't match: passed {}, should be 0 or 1.", + getName(), arguments.size()); + + return std::make_shared(); + } + + bool isDeterministic() const override { return false; } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName &, const DataTypePtr &, size_t input_rows_count) const override + { + auto col_res = ColumnVector::create(); + typename ColumnVector::Container & vec_to = col_res->getData(); + + size_t size = input_rows_count; + vec_to.resize(size); + + /// RandImpl is target-dependent and is not the same in different TargetSpecific namespaces. + RandImpl::execute(reinterpret_cast(vec_to.data()), vec_to.size() * sizeof(UUID)); + + for (UUID & uuid : vec_to) + { + /// https://www.ietf.org/archive/id/draft-peabody-dispatch-new-uuid-format-04.html#section-5.2 + + const auto tm_point = std::chrono::system_clock::now(); + UInt64 unix_ts_ms = std::chrono::duration_cast( + tm_point.time_since_epoch()).count(); + + UUIDHelpers::getHighBytes(uuid) = (UUIDHelpers::getHighBytes(uuid) & 0x0000000000000fffull) | 0x0000000000007000ull | (unix_ts_ms << 16); + UUIDHelpers::getLowBytes(uuid) = (UUIDHelpers::getLowBytes(uuid) & 0x3fffffffffffffffull) | 0x8000000000000000ull; + } + + return col_res; + } +}; + +) // DECLARE_SEVERAL_IMPLEMENTATIONS +#undef DECLARE_SEVERAL_IMPLEMENTATIONS + +class FunctionGenerateUUIDv7 : public TargetSpecific::Default::FunctionGenerateUUIDv7 +{ +public: + explicit FunctionGenerateUUIDv7(ContextPtr context) : selector(context) + { + selector.registerImplementation(); + + #if USE_MULTITARGET_CODE + selector.registerImplementation(); + #endif + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override + { + return selector.selectAndExecute(arguments, result_type, input_rows_count); + } + + static FunctionPtr create(ContextPtr context) + { + return std::make_shared(context); + } + +private: + ImplementationSelector selector; +}; + +REGISTER_FUNCTION(GenerateUUIDv7) +{ + factory.registerFunction(); +} + +} + + diff --git a/src/Functions/serial.cpp b/src/Functions/serial.cpp new file mode 100644 index 00000000000..4f336013ca8 --- /dev/null +++ b/src/Functions/serial.cpp @@ -0,0 +1,171 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include "Common/Logger.h" +#include + +namespace DB { + +namespace ErrorCodes +{ + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int ILLEGAL_TYPE_OF_ARGUMENT; +} + +class FunctionSerial : public IFunction +{ +private: + mutable zkutil::ZooKeeperPtr zk{nullptr}; + ContextPtr context; + +public: + static constexpr auto name = 
"serial"; + + explicit FunctionSerial(ContextPtr ctx) : context(ctx) + { + if (ctx->hasZooKeeper()) { + zk = ctx->getZooKeeper(); + } + } + + static FunctionPtr create(ContextPtr context) + { + return std::make_shared(std::move(context)); + } + + String getName() const override { return name; } + size_t getNumberOfArguments() const override { return 1; } + + bool isStateful() const override { return true; } + bool isDeterministic() const override { return false; } + bool isDeterministicInScopeOfQuery() const override { return false; } + bool isSuitableForConstantFolding() const override { return false; } + bool useDefaultImplementationForNulls() const override { return false; } + bool useDefaultImplementationForNothing() const override { return false; } + bool canBeExecutedOnDefaultArguments() const override { return false; } + bool isInjective(const ColumnsWithTypeAndName & /*sample_columns*/) const override { return true; } + bool hasInformationAboutMonotonicity() const override { return true; } + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + if (arguments.size() != 1) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Number of arguments for function {} doesn't match: passed {}, should be 1.", + getName(), arguments.size()); + if (!isStringOrFixedString(arguments[0])) { + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Type of argument for function {} doesn't match: passed {}, should be string", + getName(), arguments[0]->getName()); + } + + return std::make_shared(); + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override + { + auto col_res = ColumnVector::create(); + typename ColumnVector::Container & vec_to = col_res->getData(); + size_t size = input_rows_count; + LOG_INFO(getLogger("Serial Function"), "Size = {}", size); + vec_to.resize(size); + + const auto & serial_path = "/serials/" + arguments[0].column->getDataAt(0).toString(); + + // if serial name used first time + zk->createAncestors(serial_path); + zk->createIfNotExists(serial_path, ""); + + Int64 counter; + + if (zk != nullptr) { + // Get Lock in ZooKeeper + // https://zookeeper.apache.org/doc/r3.2.2/recipes.html + + // 1. + if (zk->expired()) { + zk = context->getZooKeeper(); + } + + std::string lock_path = serial_path + "/lock-"; + std::string path_created = zk->create(lock_path, "", zkutil::CreateMode::EphemeralSequential); + Int64 created_sequence_number = std::stoll(path_created.substr(lock_path.size(), path_created.size() - lock_path.size())); + + while (true) { + // 2. + zkutil::Strings children = zk->getChildren(serial_path); + + // 3. + Int64 lowest_child_sequence_number = -1; + for (auto& child : children) { + if (child == "counter") { + continue; + } + std::string child_suffix = child.substr(5, 10); + Int64 seq_number = std::stoll(child_suffix); + + if (lowest_child_sequence_number == -1 || seq_number < lowest_child_sequence_number) { + lowest_child_sequence_number = seq_number; + } + } + + if (lowest_child_sequence_number == created_sequence_number) { + break; + // we have a lock in ZooKeeper, now can get the counter value + } + + // 4. and 5. 
+ Int64 prev_seq_number = created_sequence_number - 1; + std::string to_wait_key = std::to_string(prev_seq_number); + while (to_wait_key.size() != 10) { + to_wait_key = "0" + to_wait_key; + } + + zk->waitForDisappear(lock_path + to_wait_key); + } + + // Now we have a lock + // Update counter in ZooKeeper + std::string counter_path = serial_path + "/counter"; + if (zk->exists(counter_path)) { + std::string counter_string = zk->get(counter_path, nullptr); + counter = std::stoll(counter_string); + + LOG_INFO(getLogger("Serial Function"), "Got counter from Zookeeper = {}", counter); + } else { + counter = 1; + } + zk->createOrUpdate(counter_path, std::to_string(counter + input_rows_count), zkutil::CreateMode::Persistent); + + // Unlock = delete node created on step 1. + zk->deleteEphemeralNodeIfContentMatches(path_created, ""); + } else { + // ZooKeeper is not available + // What to do? + + counter = 1; + } + + // Make a result + for (auto& val : vec_to) { + val = counter; + ++counter; + } + + + return col_res; + } + +}; + +REGISTER_FUNCTION(Serial) +{ + factory.registerFunction(); +} + +} From a31ee9891f610a14513c622bc81dcb25eaf25eb5 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Fri, 10 May 2024 10:36:59 +0200 Subject: [PATCH 096/392] Move setting to 24.5 version in SettingsChangesHistory --- src/Core/SettingsChangesHistory.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index e8cf1e98d27..3c1249d29e5 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -87,13 +87,13 @@ static std::map sett { {"24.5", {{"allow_deprecated_functions", true, false, "Allow usage of deprecated functions"}, {"allow_experimental_join_condition", false, false, "Support join with inequal conditions which involve columns from both left and right table. e.g. t1.y < t2.y."}, + {"input_format_tsv_crlf_end_of_line", false, false, "Enables reading of CRLF line endings with TSV formats"}, {"output_format_parquet_use_custom_encoder", false, true, "Enable custom Parquet encoder."}, }}, {"24.4", {{"input_format_json_throw_on_bad_escape_sequence", true, true, "Allow to save JSON strings with bad escape sequences"}, {"max_parsing_threads", 0, 0, "Add a separate setting to control number of threads in parallel parsing from files"}, {"ignore_drop_queries_probability", 0, 0, "Allow to ignore drop queries in server with specified probability for testing purposes"}, {"lightweight_deletes_sync", 2, 2, "The same as 'mutation_sync', but controls only execution of lightweight deletes"}, - {"input_format_tsv_crlf_end_of_line", false, false, "Enables reading of CRLF line endings with TSV formats"}, {"query_cache_system_table_handling", "save", "throw", "The query cache no longer caches results of queries against system tables"}, {"input_format_json_ignore_unnecessary_fields", false, true, "Ignore unnecessary fields and not parse them. 
Enabling this may not throw exceptions on json strings of invalid format or with duplicated fields"}, {"input_format_hive_text_allow_variable_number_of_columns", false, true, "Ignore extra columns in Hive Text input (if file has more columns than expected) and treat missing fields in Hive Text input as default values."}, From fbf8dcb7feb480175f76f7fa9252cf80f3ca3cc4 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Fri, 10 May 2024 11:55:24 +0200 Subject: [PATCH 097/392] Apply suggestions from code review Co-authored-by: Antonio Andelic --- src/Columns/ColumnDynamic.cpp | 7 +++---- src/Columns/ColumnVariant.h | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/Columns/ColumnDynamic.cpp b/src/Columns/ColumnDynamic.cpp index a1dd60f4748..629df476591 100644 --- a/src/Columns/ColumnDynamic.cpp +++ b/src/Columns/ColumnDynamic.cpp @@ -48,8 +48,8 @@ ColumnDynamic::MutablePtr ColumnDynamic::create(MutableColumnPtr variant_column, variant_info.variant_name_to_discriminator.reserve(variants.size()); for (ColumnVariant::Discriminator discr = 0; discr != variants.size(); ++discr) { - variant_info.variant_names.push_back(variants[discr]->getName()); - variant_info.variant_name_to_discriminator[variant_info.variant_names.back()] = discr; + const auto & variant_name = variant_info.variant_names.emplace_back(variants[discr]->getName()); + variant_info.variant_name_to_discriminator[variant_name] = discr; } return create(std::move(variant_column), variant_info, max_dynamic_types_, statistics_); @@ -133,8 +133,7 @@ void ColumnDynamic::updateVariantInfoAndExpandVariantColumn(const DB::DataTypePt for (ColumnVariant::Discriminator discr = 0; discr != new_variants.size(); ++discr) { - String name = new_variants[discr]->getName(); - new_variant_names.push_back(name); + const auto & name = new_variant_names.emplace_back(new_variants[discr]->getName()); new_variant_name_to_discriminator[name] = discr; auto current_it = variant_info.variant_name_to_discriminator.find(name); diff --git a/src/Columns/ColumnVariant.h b/src/Columns/ColumnVariant.h index 8f703ea17d9..e5a4498f340 100644 --- a/src/Columns/ColumnVariant.h +++ b/src/Columns/ColumnVariant.h @@ -189,7 +189,7 @@ public: void insertRangeFrom(const IColumn & src_, size_t start, size_t length, const std::vector & global_discriminators_mapping); void insertManyFrom(const IColumn & src_, size_t position, size_t length, const std::vector & global_discriminators_mapping); - /// Methods for insertrion into a specific variant. + /// Methods for insertion into a specific variant. 
void insertIntoVariantFrom(Discriminator global_discr, const IColumn & src_, size_t n); void insertRangeIntoVariantFrom(Discriminator global_discr, const IColumn & src_, size_t start, size_t length); void insertManyIntoVariantFrom(Discriminator global_discr, const IColumn & src_, size_t position, size_t length); From e7c7eb159a44beb52cd3c7f2634fd8f13214ad71 Mon Sep 17 00:00:00 2001 From: avogar Date: Fri, 10 May 2024 11:32:27 +0000 Subject: [PATCH 098/392] Apply suggestions from the code review --- src/Columns/ColumnDynamic.cpp | 41 ++++--------------- src/Columns/tests/gtest_column_dynamic.cpp | 26 ++++++------ src/DataTypes/DataTypeDynamic.h | 5 +-- .../Serializations/SerializationDynamic.cpp | 7 +++- src/Functions/FunctionsConversion.cpp | 9 ++-- src/Interpreters/InterpreterInsertQuery.cpp | 6 ++- .../Algorithms/CollapsingSortedAlgorithm.cpp | 8 +++- 7 files changed, 44 insertions(+), 58 deletions(-) diff --git a/src/Columns/ColumnDynamic.cpp b/src/Columns/ColumnDynamic.cpp index 629df476591..76f536a3409 100644 --- a/src/Columns/ColumnDynamic.cpp +++ b/src/Columns/ColumnDynamic.cpp @@ -80,41 +80,14 @@ bool ColumnDynamic::addNewVariant(const DB::DataTypePtr & new_variant) DataTypes all_variants = current_variants; all_variants.push_back(new_variant); auto new_variant_type = std::make_shared(all_variants); - const auto & new_variants = assert_cast(*new_variant_type).getVariants(); - - std::vector current_to_new_discriminators; - current_to_new_discriminators.resize(variant_info.variant_names.size()); - Names new_variant_names; - new_variant_names.reserve(new_variants.size()); - std::unordered_map new_variant_name_to_discriminator; - new_variant_name_to_discriminator.reserve(new_variants.size()); - std::vector> new_variant_columns_and_discriminators_to_add; - new_variant_columns_and_discriminators_to_add.reserve(new_variants.size() - current_variants.size()); - - for (ColumnVariant::Discriminator discr = 0; discr != new_variants.size(); ++discr) - { - String name = new_variants[discr]->getName(); - new_variant_names.push_back(name); - new_variant_name_to_discriminator[name] = discr; - auto it = variant_info.variant_name_to_discriminator.find(name); - if (it == variant_info.variant_name_to_discriminator.end()) - new_variant_columns_and_discriminators_to_add.emplace_back(new_variants[discr]->createColumn(), discr); - else - current_to_new_discriminators[it->second] = discr; - } - - variant_info.variant_type = new_variant_type; - variant_info.variant_name = new_variant_type->getName(); - variant_info.variant_names = new_variant_names; - variant_info.variant_name_to_discriminator = new_variant_name_to_discriminator; - assert_cast(*variant_column).extend(current_to_new_discriminators, std::move(new_variant_columns_and_discriminators_to_add)); - variant_mappings_cache.clear(); + updateVariantInfoAndExpandVariantColumn(new_variant_type); return true; } void ColumnDynamic::addStringVariant() { - addNewVariant(std::make_shared()); + if (!addNewVariant(std::make_shared())) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot add String variant to Dynamic column, it's a bug"); } void ColumnDynamic::updateVariantInfoAndExpandVariantColumn(const DB::DataTypePtr & new_variant_type) @@ -704,13 +677,13 @@ void ColumnDynamic::takeDynamicStructureFromSourceColumns(const Columns & source result_variants.reserve(max_dynamic_types); /// Add String variant in advance. 
result_variants.push_back(std::make_shared()); - size_t i = 0; - while (result_variants.size() != max_dynamic_types && i < variants_with_sizes.size()) + for (const auto & [_, variant] : variants_with_sizes) { - const auto & variant = variants_with_sizes[i].second; + if (result_variants.size() == max_dynamic_types) + break; + if (variant->getName() != "String") result_variants.push_back(variant); - ++i; } result_variant_type = std::make_shared(result_variants); diff --git a/src/Columns/tests/gtest_column_dynamic.cpp b/src/Columns/tests/gtest_column_dynamic.cpp index 4c209f7d8a9..a2862b09de1 100644 --- a/src/Columns/tests/gtest_column_dynamic.cpp +++ b/src/Columns/tests/gtest_column_dynamic.cpp @@ -195,7 +195,7 @@ TEST(ColumnDynamic, InsertFromOverflow1) column_to->insertFrom(*column_from, 1); ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255); - ASSERT_TRUE(!column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64")); + ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64")); ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String")); field = (*column_to)[column_to->size() - 1]; ASSERT_EQ(field, "42.42"); @@ -220,7 +220,7 @@ TEST(ColumnDynamic, InsertFromOverflow2) ASSERT_EQ(field, 42); column_to->insertFrom(*column_from, 1); - ASSERT_TRUE(!column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64")); + ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64")); ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String")); field = (*column_to)[column_to->size() - 1]; ASSERT_EQ(field, "42.42"); @@ -299,7 +299,7 @@ TEST(ColumnDynamic, InsertManyFromOverflow1) column_to->insertManyFrom(*column_from, 1, 2); ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255); - ASSERT_TRUE(!column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64")); + ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64")); ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String")); field = (*column_to)[column_to->size() - 2]; ASSERT_EQ(field, "42.42"); @@ -332,7 +332,7 @@ TEST(ColumnDynamic, InsertManyFromOverflow2) column_to->insertManyFrom(*column_from, 1, 2); ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255); - ASSERT_TRUE(!column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64")); + ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64")); ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String")); field = (*column_to)[column_to->size() - 2]; ASSERT_EQ(field, "42.42"); @@ -406,7 +406,7 @@ TEST(ColumnDynamic, InsertRangeFromOverflow1) ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255); ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8")); ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String")); - ASSERT_TRUE(!column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64")); + ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64")); auto field = (*column_to)[column_to->size() - 4]; ASSERT_EQ(field, Field(42)); field = (*column_to)[column_to->size() - 3]; @@ -429,7 +429,7 @@ TEST(ColumnDynamic, InsertRangeFromOverflow2) ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255); 
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8")); ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String")); - ASSERT_TRUE(!column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64")); + ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64")); auto field = (*column_to)[column_to->size() - 3]; ASSERT_EQ(field, Field(42)); field = (*column_to)[column_to->size() - 2]; @@ -451,7 +451,7 @@ TEST(ColumnDynamic, InsertRangeFromOverflow3) ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255); ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8")); ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String")); - ASSERT_TRUE(!column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64")); + ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64")); auto field = (*column_to)[column_to->size() - 3]; ASSERT_EQ(field, Field(42)); field = (*column_to)[column_to->size() - 2]; @@ -470,9 +470,9 @@ TEST(ColumnDynamic, InsertRangeFromOverflow4) auto column_to = getDynamicWithManyVariants(254); column_to->insertRangeFrom(*column_from, 0, 3); ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255); - ASSERT_TRUE(!column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8")); + ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8")); ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String")); - ASSERT_TRUE(!column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64")); + ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64")); auto field = (*column_to)[column_to->size() - 3]; ASSERT_EQ(field, Field("42")); field = (*column_to)[column_to->size() - 2]; @@ -495,7 +495,7 @@ TEST(ColumnDynamic, InsertRangeFromOverflow5) ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255); ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8")); ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String")); - ASSERT_TRUE(!column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64")); + ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64")); auto field = (*column_to)[column_to->size() - 4]; ASSERT_EQ(field, Field(42)); field = (*column_to)[column_to->size() - 3]; @@ -522,8 +522,8 @@ TEST(ColumnDynamic, InsertRangeFromOverflow6) ASSERT_EQ(column_to->getVariantInfo().variant_names.size(), 255); ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64")); ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String")); - ASSERT_TRUE(!column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8")); - ASSERT_TRUE(!column_to->getVariantInfo().variant_name_to_discriminator.contains("Array(Int8)")); + ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8")); + ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Array(Int8)")); auto field = (*column_to)[column_to->size() - 5]; ASSERT_EQ(field, Field("44")); @@ -620,7 +620,7 @@ TEST(ColumnDynamic, SerializeDeserializeFromArenaOverflow) ASSERT_EQ((*column_from)[column_from->size() - 2], "str"); ASSERT_EQ((*column_from)[column_from->size() - 1], Null()); 
ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Int8")); - ASSERT_TRUE(!column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64")); + ASSERT_FALSE(column_to->getVariantInfo().variant_name_to_discriminator.contains("Float64")); ASSERT_TRUE(column_to->getVariantInfo().variant_name_to_discriminator.contains("String")); } diff --git a/src/DataTypes/DataTypeDynamic.h b/src/DataTypes/DataTypeDynamic.h index 9fc727fd9c8..bd3d822fbb6 100644 --- a/src/DataTypes/DataTypeDynamic.h +++ b/src/DataTypes/DataTypeDynamic.h @@ -2,9 +2,6 @@ #include -#define DEFAULT_MAX_DYNAMIC_TYPES 32 - - namespace DB { @@ -46,6 +43,8 @@ public: size_t getMaxDynamicTypes() const { return max_dynamic_types; } private: + static constexpr size_t DEFAULT_MAX_DYNAMIC_TYPES = 32; + SerializationPtr doGetDefaultSerialization() const override; String doGetName() const override; diff --git a/src/DataTypes/Serializations/SerializationDynamic.cpp b/src/DataTypes/Serializations/SerializationDynamic.cpp index 5e6106f560f..d0ecc3b80a2 100644 --- a/src/DataTypes/Serializations/SerializationDynamic.cpp +++ b/src/DataTypes/Serializations/SerializationDynamic.cpp @@ -118,7 +118,12 @@ void SerializationDynamic::serializeBinaryBulkStatePrefix( for (size_t i = 0; i != variant_info.variant_names.size(); ++i) { size_t size = 0; - /// Use statistics from column if it was created during merge. + /// Check if we can use statistics stored in the column. There are 2 possible sources + /// of this statistics: + /// - statistics calculated during merge of some data parts (Statistics::Source::MERGE) + /// - statistics read from the data part during deserialization of Dynamic column (Statistics::Source::READ). + /// We can rely only on statistics calculated during the merge, because column with statistics that was read + /// during deserialization from some data part could be filtered/limited/transformed/etc and so the statistics can be outdated. if (!statistics.data.empty() && statistics.source == ColumnDynamic::Statistics::Source::MERGE) size = statistics.data.at(variant_info.variant_names[i]); /// Otherwise we can use only variant sizes from current column. diff --git a/src/Functions/FunctionsConversion.cpp b/src/Functions/FunctionsConversion.cpp index b01643a9532..910168d8010 100644 --- a/src/Functions/FunctionsConversion.cpp +++ b/src/Functions/FunctionsConversion.cpp @@ -575,7 +575,7 @@ ColumnUInt8::MutablePtr copyNullMap(ColumnPtr col) template struct ConvertImplGenericToString { - static ColumnPtr execute(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t /*input_rows_count*/) + static ColumnPtr execute(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t /*input_rows_count*/, const FormatSettings & format_settings) { static_assert(std::is_same_v || std::is_same_v, "Can be used only to serialize to ColumnString or ColumnFixedString"); @@ -596,7 +596,6 @@ struct ConvertImplGenericToString auto & write_buffer = write_helper.getWriteBuffer(); - FormatSettings format_settings; auto serialization = type.getDefaultSerialization(); for (size_t row = 0; row < size; ++row) { @@ -2299,7 +2298,7 @@ private: if constexpr (std::is_same_v) { if (from_type->getCustomSerialization()) - return ConvertImplGenericToString::execute(arguments, result_type, input_rows_count); + return ConvertImplGenericToString::execute(arguments, result_type, input_rows_count, context ? 
getFormatSettings(context) : FormatSettings()); } bool done = false; @@ -2332,7 +2331,7 @@ private: /// Generic conversion of any type to String. if (std::is_same_v) { - return ConvertImplGenericToString::execute(arguments, result_type, input_rows_count); + return ConvertImplGenericToString::execute(arguments, result_type, input_rows_count, context ? getFormatSettings(context) : FormatSettings()); } else throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}", @@ -5060,7 +5059,7 @@ private: { ret = [](ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable *, size_t input_rows_count) -> ColumnPtr { - return ConvertImplGenericToString::execute(arguments, result_type, input_rows_count); + return ConvertImplGenericToString::execute(arguments, result_type, input_rows_count, context ? getFormatSettings(context) : FormatSettings()); }; return true; } diff --git a/src/Interpreters/InterpreterInsertQuery.cpp b/src/Interpreters/InterpreterInsertQuery.cpp index 6c8e662477d..128854e87ba 100644 --- a/src/Interpreters/InterpreterInsertQuery.cpp +++ b/src/Interpreters/InterpreterInsertQuery.cpp @@ -552,7 +552,11 @@ BlockIO InterpreterInsertQuery::execute() { /// Change query sample block columns to Nullable to allow inserting nullable columns, where NULL values will be substituted with /// default column values (in AddingDefaultsTransform), so all values will be cast correctly. - if (isNullableOrLowCardinalityNullable(input_columns[col_idx].type) && !isNullableOrLowCardinalityNullable(query_columns[col_idx].type) && !isVariant(query_columns[col_idx].type) && !isDynamic(query_columns[col_idx].type) && output_columns.has(query_columns[col_idx].name)) + if (isNullableOrLowCardinalityNullable(input_columns[col_idx].type) + && !isNullableOrLowCardinalityNullable(query_columns[col_idx].type) + && !isVariant(query_columns[col_idx].type) + && !isDynamic(query_columns[col_idx].type) + && output_columns.has(query_columns[col_idx].name)) query_sample_block.setColumn(col_idx, ColumnWithTypeAndName(makeNullableOrLowCardinalityNullable(query_columns[col_idx].column), makeNullableOrLowCardinalityNullable(query_columns[col_idx].type), query_columns[col_idx].name)); } } diff --git a/src/Processors/Merges/Algorithms/CollapsingSortedAlgorithm.cpp b/src/Processors/Merges/Algorithms/CollapsingSortedAlgorithm.cpp index f5e4c88fcd0..07ee8f4ddef 100644 --- a/src/Processors/Merges/Algorithms/CollapsingSortedAlgorithm.cpp +++ b/src/Processors/Merges/Algorithms/CollapsingSortedAlgorithm.cpp @@ -31,7 +31,13 @@ CollapsingSortedAlgorithm::CollapsingSortedAlgorithm( LoggerPtr log_, WriteBuffer * out_row_sources_buf_, bool use_average_block_sizes) - : IMergingAlgorithmWithSharedChunks(header_, num_inputs, std::move(description_), out_row_sources_buf_, max_row_refs, std::make_unique(use_average_block_sizes, max_block_size_rows_, max_block_size_bytes_)) + : IMergingAlgorithmWithSharedChunks( + header_, + num_inputs, + std::move(description_), + out_row_sources_buf_, + max_row_refs, + std::make_unique(use_average_block_sizes, max_block_size_rows_, max_block_size_bytes_)) , sign_column_number(header_.getPositionByName(sign_column)) , only_positive_sign(only_positive_sign_) , log(log_) From 4f1a97644ef6a6f462c01a0fb4046d07448d1d8c Mon Sep 17 00:00:00 2001 From: avogar Date: Fri, 10 May 2024 11:34:16 +0000 Subject: [PATCH 099/392] Use nested column properly in SerializationSparse::enumerateStreams --- src/DataTypes/Serializations/SerializationSparse.cpp | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/DataTypes/Serializations/SerializationSparse.cpp b/src/DataTypes/Serializations/SerializationSparse.cpp index f9228069b90..73488d308bb 100644 --- a/src/DataTypes/Serializations/SerializationSparse.cpp +++ b/src/DataTypes/Serializations/SerializationSparse.cpp @@ -170,7 +170,7 @@ void SerializationSparse::enumerateStreams( auto next_data = SubstreamData(nested) .withType(data.type) - .withColumn(column_sparse ? column_sparse->getValuesPtr() : nullptr) + .withColumn(column_sparse ? column_sparse->getValuesPtr() : data.column) .withSerializationInfo(data.serialization_info); nested->enumerateStreams(settings, callback, next_data); From fa5898a3cd5a9b4276eb75e39c4475dfdf722e3b Mon Sep 17 00:00:00 2001 From: Alexander Gololobov Date: Fri, 10 May 2024 13:46:56 +0200 Subject: [PATCH 100/392] Refactor data part writer --- src/Storages/MergeTree/IMergeTreeDataPart.h | 21 ++-- .../MergeTree/IMergeTreeDataPartWriter.cpp | 119 +++++++++++++++++- .../MergeTree/IMergeTreeDataPartWriter.h | 57 ++++++++- .../MergeTree/IMergedBlockOutputStream.cpp | 17 ++- .../MergeTree/IMergedBlockOutputStream.h | 15 ++- src/Storages/MergeTree/MergeTask.cpp | 3 +- src/Storages/MergeTree/MergeTreeData.cpp | 2 +- .../MergeTree/MergeTreeDataPartCompact.cpp | 48 ++++--- .../MergeTree/MergeTreeDataPartCompact.h | 17 +-- .../MergeTree/MergeTreeDataPartWide.cpp | 18 ++- .../MergeTree/MergeTreeDataPartWide.h | 17 +-- .../MergeTreeDataPartWriterCompact.cpp | 27 ++-- .../MergeTreeDataPartWriterCompact.h | 9 +- .../MergeTreeDataPartWriterOnDisk.cpp | 32 +++-- .../MergeTree/MergeTreeDataPartWriterOnDisk.h | 9 +- .../MergeTree/MergeTreeDataPartWriterWide.cpp | 69 +++++----- .../MergeTree/MergeTreeDataPartWriterWide.h | 9 +- .../MergeTree/MergeTreeDataWriter.cpp | 4 +- src/Storages/MergeTree/MergeTreePartition.cpp | 13 +- src/Storages/MergeTree/MergeTreePartition.h | 4 +- .../MergeTree/MergedBlockOutputStream.cpp | 29 +++-- .../MergeTree/MergedBlockOutputStream.h | 2 +- .../MergedColumnOnlyOutputStream.cpp | 11 +- src/Storages/MergeTree/MutateTask.cpp | 2 +- 24 files changed, 409 insertions(+), 145 deletions(-) diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index ba2ff2ed6fe..4ec5b3f5f8a 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -74,7 +74,7 @@ public: using VirtualFields = std::unordered_map; using MergeTreeReaderPtr = std::unique_ptr; - using MergeTreeWriterPtr = std::unique_ptr; +// using MergeTreeWriterPtr = std::unique_ptr; using ColumnSizeByName = std::unordered_map; using NameToNumber = std::unordered_map; @@ -106,15 +106,16 @@ public: const ValueSizeMap & avg_value_size_hints_, const ReadBufferFromFileBase::ProfileCallback & profile_callback_) const = 0; - virtual MergeTreeWriterPtr getWriter( - const NamesAndTypesList & columns_list, - const StorageMetadataPtr & metadata_snapshot, - const std::vector & indices_to_recalc, - const Statistics & stats_to_recalc_, - const CompressionCodecPtr & default_codec_, - const MergeTreeWriterSettings & writer_settings, - const MergeTreeIndexGranularity & computed_index_granularity) = 0; +//// virtual MergeTreeWriterPtr getWriter( +//// const NamesAndTypesList & columns_list, +//// const StorageMetadataPtr & metadata_snapshot, +//// const std::vector & indices_to_recalc, +//// const Statistics & stats_to_recalc_, +//// const CompressionCodecPtr & default_codec_, +//// const MergeTreeWriterSettings & 
writer_settings, +//// const MergeTreeIndexGranularity & computed_index_granularity) = 0; +// TODO: remove? virtual bool isStoredOnDisk() const = 0; virtual bool isStoredOnRemoteDisk() const = 0; @@ -168,6 +169,8 @@ public: const SerializationInfoByName & getSerializationInfos() const { return serialization_infos; } + const SerializationByName & getSerializations() const { return serializations; } + SerializationPtr getSerialization(const String & column_name) const; SerializationPtr tryGetSerialization(const String & column_name) const; diff --git a/src/Storages/MergeTree/IMergeTreeDataPartWriter.cpp b/src/Storages/MergeTree/IMergeTreeDataPartWriter.cpp index 2488c63e309..c67e148d011 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPartWriter.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPartWriter.cpp @@ -1,8 +1,15 @@ #include +#include "Storages/MergeTree/MergeTreeSettings.h" namespace DB { +namespace ErrorCodes +{ + extern const int NO_SUCH_COLUMN_IN_TABLE; +} + + Block getBlockAndPermute(const Block & block, const Names & names, const IColumn::Permutation * permutation) { Block result; @@ -38,13 +45,23 @@ Block permuteBlockIfNeeded(const Block & block, const IColumn::Permutation * per } IMergeTreeDataPartWriter::IMergeTreeDataPartWriter( - const MergeTreeMutableDataPartPtr & data_part_, +// const MergeTreeMutableDataPartPtr & data_part_, + const String & data_part_name_, + const SerializationByName & serializations_, + MutableDataPartStoragePtr data_part_storage_, + const MergeTreeIndexGranularityInfo & index_granularity_info_, + + const MergeTreeSettingsPtr & storage_settings_, const NamesAndTypesList & columns_list_, const StorageMetadataPtr & metadata_snapshot_, const MergeTreeWriterSettings & settings_, const MergeTreeIndexGranularity & index_granularity_) - : data_part(data_part_) - , storage(data_part_->storage) + : data_part_name(data_part_name_) + , serializations(serializations_) + , data_part_storage(data_part_storage_) + , index_granularity_info(index_granularity_info_) + + , storage_settings(storage_settings_) , metadata_snapshot(metadata_snapshot_) , columns_list(columns_list_) , settings(settings_) @@ -60,6 +77,102 @@ Columns IMergeTreeDataPartWriter::releaseIndexColumns() std::make_move_iterator(index_columns.end())); } +SerializationPtr IMergeTreeDataPartWriter::getSerialization(const String & column_name) const +{ + auto it = serializations.find(column_name); + if (it == serializations.end()) + throw Exception(ErrorCodes::NO_SUCH_COLUMN_IN_TABLE, + "There is no column or subcolumn {} in part {}", column_name, data_part_name); + + return it->second; +} + +ASTPtr IMergeTreeDataPartWriter::getCodecDescOrDefault(const String & column_name, CompressionCodecPtr default_codec) const +{ + auto get_codec_or_default = [&](const auto & column_desc) + { + return column_desc.codec ? column_desc.codec : default_codec->getFullCodecDesc(); + }; + + const auto & columns = metadata_snapshot->getColumns(); + if (const auto * column_desc = columns.tryGet(column_name)) + return get_codec_or_default(*column_desc); + +///// TODO: is this needed? 
+// if (const auto * virtual_desc = virtual_columns->tryGetDescription(column_name)) +// return get_codec_or_default(*virtual_desc); +// + return default_codec->getFullCodecDesc(); +} + + IMergeTreeDataPartWriter::~IMergeTreeDataPartWriter() = default; + +MergeTreeDataPartWriterPtr createMergeTreeDataPartCompactWriter( + const String & data_part_name_, + const String & logger_name_, + const SerializationByName & serializations_, + MutableDataPartStoragePtr data_part_storage_, + const MergeTreeIndexGranularityInfo & index_granularity_info_, + const MergeTreeSettingsPtr & storage_settings_, + + const NamesAndTypesList & columns_list, + const StorageMetadataPtr & metadata_snapshot, + const std::vector & indices_to_recalc, + const Statistics & stats_to_recalc_, + const String & marks_file_extension_, + const CompressionCodecPtr & default_codec_, + const MergeTreeWriterSettings & writer_settings, + const MergeTreeIndexGranularity & computed_index_granularity); + +MergeTreeDataPartWriterPtr createMergeTreeDataPartWideWriter( + const String & data_part_name_, + const String & logger_name_, + const SerializationByName & serializations_, + MutableDataPartStoragePtr data_part_storage_, + const MergeTreeIndexGranularityInfo & index_granularity_info_, + const MergeTreeSettingsPtr & storage_settings_, + + const NamesAndTypesList & columns_list, + const StorageMetadataPtr & metadata_snapshot, + const std::vector & indices_to_recalc, + const Statistics & stats_to_recalc_, + const String & marks_file_extension_, + const CompressionCodecPtr & default_codec_, + const MergeTreeWriterSettings & writer_settings, + const MergeTreeIndexGranularity & computed_index_granularity); + + + +MergeTreeDataPartWriterPtr createMergeTreeDataPartWriter( + MergeTreeDataPartType part_type, + const String & data_part_name_, + const String & logger_name_, + const SerializationByName & serializations_, + MutableDataPartStoragePtr data_part_storage_, + const MergeTreeIndexGranularityInfo & index_granularity_info_, + const MergeTreeSettingsPtr & storage_settings_, + + const NamesAndTypesList & columns_list, + const StorageMetadataPtr & metadata_snapshot, + const std::vector & indices_to_recalc, + const Statistics & stats_to_recalc_, + const String & marks_file_extension_, + const CompressionCodecPtr & default_codec_, + const MergeTreeWriterSettings & writer_settings, + const MergeTreeIndexGranularity & computed_index_granularity) +{ + if (part_type == MergeTreeDataPartType::Compact) + return createMergeTreeDataPartCompactWriter(data_part_name_, logger_name_, serializations_, data_part_storage_, + index_granularity_info_, storage_settings_, columns_list, metadata_snapshot, indices_to_recalc, stats_to_recalc_, + marks_file_extension_, default_codec_, writer_settings, computed_index_granularity); + else if (part_type == MergeTreeDataPartType::Wide) + return createMergeTreeDataPartWideWriter(data_part_name_, logger_name_, serializations_, data_part_storage_, + index_granularity_info_, storage_settings_, columns_list, metadata_snapshot, indices_to_recalc, stats_to_recalc_, + marks_file_extension_, default_codec_, writer_settings, computed_index_granularity); + else + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown part type: {}", part_type.toString()); +} + } diff --git a/src/Storages/MergeTree/IMergeTreeDataPartWriter.h b/src/Storages/MergeTree/IMergeTreeDataPartWriter.h index 3f359904ddd..ec04fd5f8a8 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPartWriter.h +++ b/src/Storages/MergeTree/IMergeTreeDataPartWriter.h @@ -7,6 +7,8 
@@ #include #include #include +#include "Storages/MergeTree/MergeTreeDataPartType.h" +#include "Storages/MergeTree/MergeTreeSettings.h" namespace DB @@ -22,7 +24,15 @@ class IMergeTreeDataPartWriter : private boost::noncopyable { public: IMergeTreeDataPartWriter( - const MergeTreeMutableDataPartPtr & data_part_, +// const MergeTreeMutableDataPartPtr & data_part_, + + const String & data_part_name_, + const SerializationByName & serializations_, + MutableDataPartStoragePtr data_part_storage_, + const MergeTreeIndexGranularityInfo & index_granularity_info_, + + const MergeTreeSettingsPtr & storage_settings_, + const NamesAndTypesList & columns_list_, const StorageMetadataPtr & metadata_snapshot_, const MergeTreeWriterSettings & settings_, @@ -39,10 +49,30 @@ public: Columns releaseIndexColumns(); const MergeTreeIndexGranularity & getIndexGranularity() const { return index_granularity; } + SerializationPtr getSerialization(const String & column_name) const; + + ASTPtr getCodecDescOrDefault(const String & column_name, CompressionCodecPtr default_codec) const; + + IDataPartStorage & getDataPartStorage() { return *data_part_storage; } + protected: - const MergeTreeMutableDataPartPtr data_part; - const MergeTreeData & storage; +// const MergeTreeMutableDataPartPtr data_part; // TODO: remove + + /// Serializations for every columns and subcolumns by their names. + String data_part_name; + SerializationByName serializations; + MutableDataPartStoragePtr data_part_storage; + MergeTreeIndexGranularityInfo index_granularity_info; + + +// const MergeTreeData & storage; // TODO: remove + + const MergeTreeSettingsPtr storage_settings; + const size_t low_cardinality_max_dictionary_size = 0; // TODO: pass it in ctor + const bool low_cardinality_use_single_dictionary_for_part = true; // TODO: pass it in ctor + + const StorageMetadataPtr metadata_snapshot; const NamesAndTypesList columns_list; const MergeTreeWriterSettings settings; @@ -52,4 +82,25 @@ protected: MutableColumns index_columns; }; +using MergeTreeDataPartWriterPtr = std::unique_ptr; + +MergeTreeDataPartWriterPtr createMergeTreeDataPartWriter( + MergeTreeDataPartType part_type, + const String & data_part_name_, + const String & logger_name_, + const SerializationByName & serializations_, + MutableDataPartStoragePtr data_part_storage_, + const MergeTreeIndexGranularityInfo & index_granularity_info_, + const MergeTreeSettingsPtr & storage_settings_, + + const NamesAndTypesList & columns_list, + const StorageMetadataPtr & metadata_snapshot, + const std::vector & indices_to_recalc, + const Statistics & stats_to_recalc_, + const String & marks_file_extension, + const CompressionCodecPtr & default_codec_, + const MergeTreeWriterSettings & writer_settings, + const MergeTreeIndexGranularity & computed_index_granularity); + + } diff --git a/src/Storages/MergeTree/IMergedBlockOutputStream.cpp b/src/Storages/MergeTree/IMergedBlockOutputStream.cpp index c8d6aa0ba65..f99adf7c4db 100644 --- a/src/Storages/MergeTree/IMergedBlockOutputStream.cpp +++ b/src/Storages/MergeTree/IMergedBlockOutputStream.cpp @@ -2,25 +2,30 @@ #include #include #include +#include "Storages/MergeTree/IDataPartStorage.h" +#include "Storages/StorageSet.h" namespace DB { IMergedBlockOutputStream::IMergedBlockOutputStream( - const MergeTreeMutableDataPartPtr & data_part, +// const MergeTreeMutableDataPartPtr & data_part, + const MergeTreeSettingsPtr & storage_settings_, + MutableDataPartStoragePtr data_part_storage_, const StorageMetadataPtr & metadata_snapshot_, const NamesAndTypesList & 
columns_list, bool reset_columns_) - : storage(data_part->storage) + //: storage(data_part->storage) + : storage_settings(storage_settings_) , metadata_snapshot(metadata_snapshot_) - , data_part_storage(data_part->getDataPartStoragePtr()) + , data_part_storage(data_part_storage_)//data_part->getDataPartStoragePtr()) , reset_columns(reset_columns_) { if (reset_columns) { SerializationInfo::Settings info_settings = { - .ratio_of_defaults_for_sparse = storage.getSettings()->ratio_of_defaults_for_sparse_serialization, + .ratio_of_defaults_for_sparse = storage_settings->ratio_of_defaults_for_sparse_serialization,//storage.getSettings()->ratio_of_defaults_for_sparse_serialization, .choose_kind = false, }; @@ -42,7 +47,7 @@ NameSet IMergedBlockOutputStream::removeEmptyColumnsFromPart( return {}; for (const auto & column : empty_columns) - LOG_TRACE(storage.log, "Skipping expired/empty column {} for part {}", column, data_part->name); + LOG_TRACE(data_part->storage.log, "Skipping expired/empty column {} for part {}", column, data_part->name); /// Collect counts for shared streams of different columns. As an example, Nested columns have shared stream with array sizes. std::map stream_counts; @@ -91,7 +96,7 @@ NameSet IMergedBlockOutputStream::removeEmptyColumnsFromPart( } else /// If we have no file in checksums it doesn't exist on disk { - LOG_TRACE(storage.log, "Files {} doesn't exist in checksums so it doesn't exist on disk, will not try to remove it", *itr); + LOG_TRACE(data_part->storage.log, "Files {} doesn't exist in checksums so it doesn't exist on disk, will not try to remove it", *itr); itr = remove_files.erase(itr); } } diff --git a/src/Storages/MergeTree/IMergedBlockOutputStream.h b/src/Storages/MergeTree/IMergedBlockOutputStream.h index ca4e3899b29..b6f279e6d58 100644 --- a/src/Storages/MergeTree/IMergedBlockOutputStream.h +++ b/src/Storages/MergeTree/IMergedBlockOutputStream.h @@ -1,10 +1,12 @@ #pragma once #include "Storages/MergeTree/IDataPartStorage.h" +#include "Storages/MergeTree/MergeTreeSettings.h" #include #include #include #include +#include "Common/Logger.h" namespace DB { @@ -13,7 +15,9 @@ class IMergedBlockOutputStream { public: IMergedBlockOutputStream( - const MergeTreeMutableDataPartPtr & data_part, +// const MergeTreeMutableDataPartPtr & data_part, + const MergeTreeSettingsPtr & storage_settings_, + MutableDataPartStoragePtr data_part_storage_, const StorageMetadataPtr & metadata_snapshot_, const NamesAndTypesList & columns_list, bool reset_columns_); @@ -39,11 +43,16 @@ protected: SerializationInfoByName & serialization_infos, MergeTreeData::DataPart::Checksums & checksums); - const MergeTreeData & storage; +// const MergeTreeData & storage; // TODO: remove +//// + MergeTreeSettingsPtr storage_settings; + LoggerPtr log; +//// + StorageMetadataPtr metadata_snapshot; MutableDataPartStoragePtr data_part_storage; - IMergeTreeDataPart::MergeTreeWriterPtr writer; + MergeTreeDataPartWriterPtr writer; bool reset_columns = false; SerializationInfoByName new_serialization_infos; diff --git a/src/Storages/MergeTree/MergeTask.cpp b/src/Storages/MergeTree/MergeTask.cpp index 34e17e40a74..1b5ad0d81a7 100644 --- a/src/Storages/MergeTree/MergeTask.cpp +++ b/src/Storages/MergeTree/MergeTask.cpp @@ -34,6 +34,7 @@ #include #include #include +#include #include namespace DB @@ -378,7 +379,7 @@ bool MergeTask::ExecuteAndFinalizeHorizontalPart::prepare() MergeTreeIndexFactory::instance().getMany(global_ctx->metadata_snapshot->getSecondaryIndices()), 
MergeTreeStatisticsFactory::instance().getMany(global_ctx->metadata_snapshot->getColumns()), ctx->compression_codec, - global_ctx->txn, + global_ctx->txn ? global_ctx->txn->tid : Tx::PrehistoricTID, /*reset_columns=*/ true, ctx->blocks_are_granules_size, global_ctx->context->getWriteSettings()); diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 440c62213a3..8a96e4c9f04 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -8423,7 +8423,7 @@ std::pair MergeTreeData::createE MergedBlockOutputStream out(new_data_part, metadata_snapshot, columns, index_factory.getMany(metadata_snapshot->getSecondaryIndices()), Statistics{}, - compression_codec, txn); + compression_codec, txn ? txn->tid : Tx::PrehistoricTID); bool sync_on_insert = settings->fsync_after_insert; diff --git a/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp b/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp index 418b2d8f81b..eebbe3110c0 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp @@ -47,27 +47,37 @@ IMergeTreeDataPart::MergeTreeReaderPtr MergeTreeDataPartCompact::getReader( avg_value_size_hints, profile_callback, CLOCK_MONOTONIC_COARSE); } -IMergeTreeDataPart::MergeTreeWriterPtr MergeTreeDataPartCompact::getWriter( - const NamesAndTypesList & columns_list, - const StorageMetadataPtr & metadata_snapshot, - const std::vector & indices_to_recalc, - const Statistics & stats_to_recalc_, - const CompressionCodecPtr & default_codec_, - const MergeTreeWriterSettings & writer_settings, - const MergeTreeIndexGranularity & computed_index_granularity) +MergeTreeDataPartWriterPtr createMergeTreeDataPartCompactWriter( + const String & data_part_name_, + const String & logger_name_, + const SerializationByName & serializations_, + MutableDataPartStoragePtr data_part_storage_, + const MergeTreeIndexGranularityInfo & index_granularity_info_, + const MergeTreeSettingsPtr & storage_settings_, + + const NamesAndTypesList & columns_list, + const StorageMetadataPtr & metadata_snapshot, + const std::vector & indices_to_recalc, + const Statistics & stats_to_recalc_, + const String & marks_file_extension_, + const CompressionCodecPtr & default_codec_, + const MergeTreeWriterSettings & writer_settings, + const MergeTreeIndexGranularity & computed_index_granularity) { - NamesAndTypesList ordered_columns_list; - std::copy_if(columns_list.begin(), columns_list.end(), std::back_inserter(ordered_columns_list), - [this](const auto & column) { return getColumnPosition(column.name) != std::nullopt; }); - - /// Order of writing is important in compact format - ordered_columns_list.sort([this](const auto & lhs, const auto & rhs) - { return *getColumnPosition(lhs.name) < *getColumnPosition(rhs.name); }); - +////// TODO: fix the order of columns +//// +//// NamesAndTypesList ordered_columns_list; +//// std::copy_if(columns_list.begin(), columns_list.end(), std::back_inserter(ordered_columns_list), +//// [this](const auto & column) { return getColumnPosition(column.name) != std::nullopt; }); +//// +//// /// Order of writing is important in compact format +//// ordered_columns_list.sort([this](const auto & lhs, const auto & rhs) +//// { return *getColumnPosition(lhs.name) < *getColumnPosition(rhs.name); }); +//// return std::make_unique( - shared_from_this(), ordered_columns_list, metadata_snapshot, - indices_to_recalc, stats_to_recalc_, getMarksFileExtension(), - default_codec_, 
writer_settings, computed_index_granularity); + data_part_name_, logger_name_, serializations_, data_part_storage_, + index_granularity_info_, storage_settings_, columns_list, metadata_snapshot, indices_to_recalc, stats_to_recalc_, + marks_file_extension_, default_codec_, writer_settings, computed_index_granularity); } diff --git a/src/Storages/MergeTree/MergeTreeDataPartCompact.h b/src/Storages/MergeTree/MergeTreeDataPartCompact.h index 3a4e7b95f33..5a57d778b7d 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartCompact.h +++ b/src/Storages/MergeTree/MergeTreeDataPartCompact.h @@ -40,15 +40,16 @@ public: const ValueSizeMap & avg_value_size_hints, const ReadBufferFromFileBase::ProfileCallback & profile_callback) const override; - MergeTreeWriterPtr getWriter( - const NamesAndTypesList & columns_list, - const StorageMetadataPtr & metadata_snapshot, - const std::vector & indices_to_recalc, - const Statistics & stats_to_recalc_, - const CompressionCodecPtr & default_codec_, - const MergeTreeWriterSettings & writer_settings, - const MergeTreeIndexGranularity & computed_index_granularity) override; +// MergeTreeWriterPtr getWriter( +// const NamesAndTypesList & columns_list, +// const StorageMetadataPtr & metadata_snapshot, +// const std::vector & indices_to_recalc, +// const Statistics & stats_to_recalc_, +// const CompressionCodecPtr & default_codec_, +// const MergeTreeWriterSettings & writer_settings, +// const MergeTreeIndexGranularity & computed_index_granularity) override; +// TODO: remove? bool isStoredOnDisk() const override { return true; } bool isStoredOnRemoteDisk() const override; diff --git a/src/Storages/MergeTree/MergeTreeDataPartWide.cpp b/src/Storages/MergeTree/MergeTreeDataPartWide.cpp index fc3108e522a..c99cff258e0 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWide.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWide.cpp @@ -53,20 +53,26 @@ IMergeTreeDataPart::MergeTreeReaderPtr MergeTreeDataPartWide::getReader( profile_callback); } -IMergeTreeDataPart::MergeTreeWriterPtr MergeTreeDataPartWide::getWriter( +MergeTreeDataPartWriterPtr createMergeTreeDataPartWideWriter( + const String & data_part_name_, + const String & logger_name_, + const SerializationByName & serializations_, + MutableDataPartStoragePtr data_part_storage_, + const MergeTreeIndexGranularityInfo & index_granularity_info_, + const MergeTreeSettingsPtr & storage_settings_, + const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot, const std::vector & indices_to_recalc, const Statistics & stats_to_recalc_, + const String & marks_file_extension_, const CompressionCodecPtr & default_codec_, const MergeTreeWriterSettings & writer_settings, const MergeTreeIndexGranularity & computed_index_granularity) { - return std::make_unique( - shared_from_this(), columns_list, - metadata_snapshot, indices_to_recalc, stats_to_recalc_, - getMarksFileExtension(), - default_codec_, writer_settings, computed_index_granularity); + return std::make_unique(data_part_name_, logger_name_, serializations_, data_part_storage_, + index_granularity_info_, storage_settings_, columns_list, metadata_snapshot, indices_to_recalc, stats_to_recalc_, + marks_file_extension_, default_codec_, writer_settings, computed_index_granularity); } diff --git a/src/Storages/MergeTree/MergeTreeDataPartWide.h b/src/Storages/MergeTree/MergeTreeDataPartWide.h index 84eeec4211b..45d0fbbebec 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWide.h +++ b/src/Storages/MergeTree/MergeTreeDataPartWide.h @@ -35,15 +35,16 @@ 
public: const ValueSizeMap & avg_value_size_hints, const ReadBufferFromFileBase::ProfileCallback & profile_callback) const override; - MergeTreeWriterPtr getWriter( - const NamesAndTypesList & columns_list, - const StorageMetadataPtr & metadata_snapshot, - const std::vector & indices_to_recalc, - const Statistics & stats_to_recalc_, - const CompressionCodecPtr & default_codec_, - const MergeTreeWriterSettings & writer_settings, - const MergeTreeIndexGranularity & computed_index_granularity) override; +// MergeTreeWriterPtr getWriter( +// const NamesAndTypesList & columns_list, +// const StorageMetadataPtr & metadata_snapshot, +// const std::vector & indices_to_recalc, +// const Statistics & stats_to_recalc_, +// const CompressionCodecPtr & default_codec_, +// const MergeTreeWriterSettings & writer_settings, +// const MergeTreeIndexGranularity & computed_index_granularity) override; +// TODO: remove? bool isStoredOnDisk() const override { return true; } bool isStoredOnRemoteDisk() const override; diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp index 1605e5cdb9a..6e8ea1a915b 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp @@ -10,7 +10,14 @@ namespace ErrorCodes } MergeTreeDataPartWriterCompact::MergeTreeDataPartWriterCompact( - const MergeTreeMutableDataPartPtr & data_part_, +// const MergeTreeMutableDataPartPtr & data_part_, + const String & data_part_name_, + const String & logger_name_, + const SerializationByName & serializations_, + MutableDataPartStoragePtr data_part_storage_, + const MergeTreeIndexGranularityInfo & index_granularity_info_, + const MergeTreeSettingsPtr & storage_settings_, + const NamesAndTypesList & columns_list_, const StorageMetadataPtr & metadata_snapshot_, const std::vector & indices_to_recalc_, @@ -19,23 +26,26 @@ MergeTreeDataPartWriterCompact::MergeTreeDataPartWriterCompact( const CompressionCodecPtr & default_codec_, const MergeTreeWriterSettings & settings_, const MergeTreeIndexGranularity & index_granularity_) - : MergeTreeDataPartWriterOnDisk(data_part_, columns_list_, metadata_snapshot_, + : MergeTreeDataPartWriterOnDisk( + data_part_name_, logger_name_, serializations_, + data_part_storage_, index_granularity_info_, storage_settings_, + columns_list_, metadata_snapshot_, indices_to_recalc_, stats_to_recalc, marks_file_extension_, default_codec_, settings_, index_granularity_) - , plain_file(data_part_->getDataPartStorage().writeFile( + , plain_file(getDataPartStorage().writeFile( MergeTreeDataPartCompact::DATA_FILE_NAME_WITH_EXTENSION, settings.max_compress_block_size, settings_.query_write_settings)) , plain_hashing(*plain_file) { - marks_file = data_part_->getDataPartStorage().writeFile( + marks_file = getDataPartStorage().writeFile( MergeTreeDataPartCompact::DATA_FILE_NAME + marks_file_extension_, 4096, settings_.query_write_settings); marks_file_hashing = std::make_unique(*marks_file); - if (data_part_->index_granularity_info.mark_type.compressed) + if (index_granularity_info.mark_type.compressed) { marks_compressor = std::make_unique( *marks_file_hashing, @@ -45,10 +55,9 @@ MergeTreeDataPartWriterCompact::MergeTreeDataPartWriterCompact( marks_source_hashing = std::make_unique(*marks_compressor); } - auto storage_snapshot = std::make_shared(data_part->storage, metadata_snapshot); for (const auto & column : columns_list) { - auto compression = 
storage_snapshot->getCodecDescOrDefault(column.name, default_codec); + auto compression = getCodecDescOrDefault(column.name, default_codec); addStreams(column, compression); } } @@ -81,7 +90,7 @@ void MergeTreeDataPartWriterCompact::addStreams(const NameAndTypePair & column, compressed_streams.emplace(stream_name, stream); }; - data_part->getSerialization(column.name)->enumerateStreams(callback, column.type); + getSerialization(column.name)->enumerateStreams(callback, column.type); } namespace @@ -230,7 +239,7 @@ void MergeTreeDataPartWriterCompact::writeDataBlock(const Block & block, const G writeBinaryLittleEndian(static_cast(0), marks_out); writeColumnSingleGranule( - block.getByName(name_and_type->name), data_part->getSerialization(name_and_type->name), + block.getByName(name_and_type->name), getSerialization(name_and_type->name), stream_getter, granule.start_row, granule.rows_to_write); /// Each type always have at least one substream diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.h b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.h index ddb6178dce6..3bec4c7e988 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.h +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.h @@ -11,7 +11,14 @@ class MergeTreeDataPartWriterCompact : public MergeTreeDataPartWriterOnDisk { public: MergeTreeDataPartWriterCompact( - const MergeTreeMutableDataPartPtr & data_part, +// const MergeTreeMutableDataPartPtr & data_part, + const String & data_part_name_, + const String & logger_name_, + const SerializationByName & serializations_, + MutableDataPartStoragePtr data_part_storage_, + const MergeTreeIndexGranularityInfo & index_granularity_info_, + const MergeTreeSettingsPtr & storage_settings_, + const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot_, const std::vector & indices_to_recalc, diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp index 491d2399b82..13892c17577 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp @@ -140,7 +140,13 @@ void MergeTreeDataPartWriterOnDisk::Stream::addToChecksums(Merg MergeTreeDataPartWriterOnDisk::MergeTreeDataPartWriterOnDisk( - const MergeTreeMutableDataPartPtr & data_part_, + const String & data_part_name_, + const String & logger_name_, + const SerializationByName & serializations_, + MutableDataPartStoragePtr data_part_storage_, + const MergeTreeIndexGranularityInfo & index_granularity_info_, + const MergeTreeSettingsPtr & storage_settings_, + const NamesAndTypesList & columns_list_, const StorageMetadataPtr & metadata_snapshot_, const MergeTreeIndices & indices_to_recalc_, @@ -149,7 +155,9 @@ MergeTreeDataPartWriterOnDisk::MergeTreeDataPartWriterOnDisk( const CompressionCodecPtr & default_codec_, const MergeTreeWriterSettings & settings_, const MergeTreeIndexGranularity & index_granularity_) - : IMergeTreeDataPartWriter(data_part_, columns_list_, metadata_snapshot_, settings_, index_granularity_) + : IMergeTreeDataPartWriter( + data_part_name_, serializations_, data_part_storage_, index_granularity_info_, + storage_settings_, columns_list_, metadata_snapshot_, settings_, index_granularity_) , skip_indices(indices_to_recalc_) , stats(stats_to_recalc_) , marks_file_extension(marks_file_extension_) @@ -157,14 +165,14 @@ MergeTreeDataPartWriterOnDisk::MergeTreeDataPartWriterOnDisk( , compute_granularity(index_granularity.empty()) , 
compress_primary_key(settings.compress_primary_key) , execution_stats(skip_indices.size(), stats.size()) - , log(getLogger(storage.getLogName() + " (DataPartWriter)")) + , log(getLogger(logger_name_ + " (DataPartWriter)")) { if (settings.blocks_are_granules_size && !index_granularity.empty()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Can't take information about index granularity from blocks, when non empty index_granularity array specified"); - if (!data_part->getDataPartStorage().exists()) - data_part->getDataPartStorage().createDirectories(); + if (!getDataPartStorage().exists()) + getDataPartStorage().createDirectories(); if (settings.rewrite_primary_key) initPrimaryIndex(); @@ -223,7 +231,7 @@ static size_t computeIndexGranularityImpl( size_t MergeTreeDataPartWriterOnDisk::computeIndexGranularity(const Block & block) const { - const auto storage_settings = storage.getSettings(); +// const auto storage_settings = storage.getSettings(); return computeIndexGranularityImpl( block, storage_settings->index_granularity_bytes, @@ -237,7 +245,7 @@ void MergeTreeDataPartWriterOnDisk::initPrimaryIndex() if (metadata_snapshot->hasPrimaryKey()) { String index_name = "primary" + getIndexExtension(compress_primary_key); - index_file_stream = data_part->getDataPartStorage().writeFile(index_name, DBMS_DEFAULT_BUFFER_SIZE, settings.query_write_settings); + index_file_stream = getDataPartStorage().writeFile(index_name, DBMS_DEFAULT_BUFFER_SIZE, settings.query_write_settings); index_file_hashing_stream = std::make_unique(*index_file_stream); if (compress_primary_key) @@ -256,7 +264,7 @@ void MergeTreeDataPartWriterOnDisk::initStatistics() String stats_name = stat_ptr->getFileName(); stats_streams.emplace_back(std::make_unique>( stats_name, - data_part->getDataPartStoragePtr(), + data_part_storage, stats_name, STAT_FILE_SUFFIX, default_codec, settings.max_compress_block_size, settings.query_write_settings)); @@ -275,7 +283,7 @@ void MergeTreeDataPartWriterOnDisk::initSkipIndices() skip_indices_streams.emplace_back( std::make_unique>( stream_name, - data_part->getDataPartStoragePtr(), + data_part_storage, stream_name, skip_index->getSerializedFileExtension(), stream_name, marks_file_extension, default_codec, settings.max_compress_block_size, @@ -285,7 +293,7 @@ void MergeTreeDataPartWriterOnDisk::initSkipIndices() GinIndexStorePtr store = nullptr; if (typeid_cast(&*skip_index) != nullptr) { - store = std::make_shared(stream_name, data_part->getDataPartStoragePtr(), data_part->getDataPartStoragePtr(), storage.getSettings()->max_digestion_size_per_segment); + store = std::make_shared(stream_name, data_part_storage, data_part_storage, /*storage.getSettings()*/storage_settings->max_digestion_size_per_segment); gin_index_stores[stream_name] = store; } skip_indices_aggregators.push_back(skip_index->createIndexAggregatorForPart(store, settings)); @@ -498,7 +506,7 @@ void MergeTreeDataPartWriterOnDisk::finishStatisticsSerialization(bool sync) } for (size_t i = 0; i < stats.size(); ++i) - LOG_DEBUG(log, "Spent {} ms calculating statistics {} for the part {}", execution_stats.statistics_build_us[i] / 1000, stats[i]->columnName(), data_part->name); + LOG_DEBUG(log, "Spent {} ms calculating statistics {} for the part {}", execution_stats.statistics_build_us[i] / 1000, stats[i]->columnName(), data_part_name); } void MergeTreeDataPartWriterOnDisk::fillStatisticsChecksums(MergeTreeData::DataPart::Checksums & checksums) @@ -524,7 +532,7 @@ void MergeTreeDataPartWriterOnDisk::finishSkipIndicesSerialization(bool sync) 
store.second->finalize(); for (size_t i = 0; i < skip_indices.size(); ++i) - LOG_DEBUG(log, "Spent {} ms calculating index {} for the part {}", execution_stats.skip_indices_build_us[i] / 1000, skip_indices[i]->index.name, data_part->name); + LOG_DEBUG(log, "Spent {} ms calculating index {} for the part {}", execution_stats.skip_indices_build_us[i] / 1000, skip_indices[i]->index.name, data_part_name); gin_index_stores.clear(); skip_indices_streams.clear(); diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.h b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.h index 9f2cc3970fa..39f33217b57 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.h +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.h @@ -104,7 +104,14 @@ public: using StatisticStreamPtr = std::unique_ptr>; MergeTreeDataPartWriterOnDisk( - const MergeTreeMutableDataPartPtr & data_part_, +// const MergeTreeMutableDataPartPtr & data_part_, + const String & data_part_name_, + const String & logger_name_, + const SerializationByName & serializations_, + MutableDataPartStoragePtr data_part_storage_, + const MergeTreeIndexGranularityInfo & index_granularity_info_, + const MergeTreeSettingsPtr & storage_settings_, + const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot_, const std::vector & indices_to_recalc, diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp index 6a3b08d4d65..1f68a9d31a1 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp @@ -76,7 +76,14 @@ Granules getGranulesToWrite(const MergeTreeIndexGranularity & index_granularity, } MergeTreeDataPartWriterWide::MergeTreeDataPartWriterWide( - const MergeTreeMutableDataPartPtr & data_part_, +// const MergeTreeMutableDataPartPtr & data_part_, + const String & data_part_name_, + const String & logger_name_, + const SerializationByName & serializations_, + MutableDataPartStoragePtr data_part_storage_, + const MergeTreeIndexGranularityInfo & index_granularity_info_, + const MergeTreeSettingsPtr & storage_settings_, + const NamesAndTypesList & columns_list_, const StorageMetadataPtr & metadata_snapshot_, const std::vector & indices_to_recalc_, @@ -85,14 +92,16 @@ MergeTreeDataPartWriterWide::MergeTreeDataPartWriterWide( const CompressionCodecPtr & default_codec_, const MergeTreeWriterSettings & settings_, const MergeTreeIndexGranularity & index_granularity_) - : MergeTreeDataPartWriterOnDisk(data_part_, columns_list_, metadata_snapshot_, - indices_to_recalc_, stats_to_recalc_, marks_file_extension_, - default_codec_, settings_, index_granularity_) + : MergeTreeDataPartWriterOnDisk( + data_part_name_, logger_name_, serializations_, + data_part_storage_, index_granularity_info_, storage_settings_, + columns_list_, metadata_snapshot_, + indices_to_recalc_, stats_to_recalc_, marks_file_extension_, + default_codec_, settings_, index_granularity_) { - auto storage_snapshot = std::make_shared(data_part->storage, metadata_snapshot); for (const auto & column : columns_list) { - auto compression = storage_snapshot->getCodecDescOrDefault(column.name, default_codec); + auto compression = getCodecDescOrDefault(column.name, default_codec); addStreams(column, compression); } } @@ -105,7 +114,7 @@ void MergeTreeDataPartWriterWide::addStreams( { assert(!substream_path.empty()); - auto storage_settings = storage.getSettings(); +// auto storage_settings = storage.getSettings(); auto 
full_stream_name = ISerialization::getFileNameForStream(column, substream_path); String stream_name; @@ -149,7 +158,7 @@ void MergeTreeDataPartWriterWide::addStreams( column_streams[stream_name] = std::make_unique>( stream_name, - data_part->getDataPartStoragePtr(), + data_part_storage, stream_name, DATA_FILE_EXTENSION, stream_name, marks_file_extension, compression_codec, @@ -163,7 +172,7 @@ void MergeTreeDataPartWriterWide::addStreams( }; ISerialization::SubstreamPath path; - data_part->getSerialization(column.name)->enumerateStreams(callback, column.type); + getSerialization(column.name)->enumerateStreams(callback, column.type); } const String & MergeTreeDataPartWriterWide::getStreamName( @@ -264,7 +273,7 @@ void MergeTreeDataPartWriterWide::write(const Block & block, const IColumn::Perm { auto & column = block_to_write.getByName(it->name); - if (data_part->getSerialization(it->name)->getKind() != ISerialization::Kind::SPARSE) + if (getSerialization(it->name)->getKind() != ISerialization::Kind::SPARSE) column.column = recursiveRemoveSparse(column.column); if (permutation) @@ -334,7 +343,7 @@ StreamsWithMarks MergeTreeDataPartWriterWide::getCurrentMarksForColumn( min_compress_block_size = value->safeGet(); if (!min_compress_block_size) min_compress_block_size = settings.min_compress_block_size; - data_part->getSerialization(column.name)->enumerateStreams([&] (const ISerialization::SubstreamPath & substream_path) + getSerialization(column.name)->enumerateStreams([&] (const ISerialization::SubstreamPath & substream_path) { bool is_offsets = !substream_path.empty() && substream_path.back().type == ISerialization::Substream::ArraySizes; auto stream_name = getStreamName(column, substream_path); @@ -368,7 +377,7 @@ void MergeTreeDataPartWriterWide::writeSingleGranule( ISerialization::SerializeBinaryBulkSettings & serialize_settings, const Granule & granule) { - const auto & serialization = data_part->getSerialization(name_and_type.name); + const auto & serialization = getSerialization(name_and_type.name); serialization->serializeBinaryBulkWithMultipleStreams(column, granule.start_row, granule.rows_to_write, serialize_settings, serialization_state); /// So that instead of the marks pointing to the end of the compressed block, there were marks pointing to the beginning of the next one. 
@@ -398,7 +407,7 @@ void MergeTreeDataPartWriterWide::writeColumn( const auto & [name, type] = name_and_type; auto [it, inserted] = serialization_states.emplace(name, nullptr); - auto serialization = data_part->getSerialization(name_and_type.name); + auto serialization = getSerialization(name_and_type.name); if (inserted) { @@ -407,11 +416,11 @@ void MergeTreeDataPartWriterWide::writeColumn( serialization->serializeBinaryBulkStatePrefix(column, serialize_settings, it->second); } - const auto & global_settings = storage.getContext()->getSettingsRef(); +// const auto & global_settings = storage.getContext()->getSettingsRef(); ISerialization::SerializeBinaryBulkSettings serialize_settings; serialize_settings.getter = createStreamGetter(name_and_type, offset_columns); - serialize_settings.low_cardinality_max_dictionary_size = global_settings.low_cardinality_max_dictionary_size; - serialize_settings.low_cardinality_use_single_dictionary_for_part = global_settings.low_cardinality_use_single_dictionary_for_part != 0; + serialize_settings.low_cardinality_max_dictionary_size = low_cardinality_max_dictionary_size;//global_settings.low_cardinality_max_dictionary_size; + serialize_settings.low_cardinality_use_single_dictionary_for_part = low_cardinality_use_single_dictionary_for_part;//global_settings.low_cardinality_use_single_dictionary_for_part != 0; for (const auto & granule : granules) { @@ -460,7 +469,7 @@ void MergeTreeDataPartWriterWide::writeColumn( void MergeTreeDataPartWriterWide::validateColumnOfFixedSize(const NameAndTypePair & name_type) { const auto & [name, type] = name_type; - const auto & serialization = data_part->getSerialization(name_type.name); + const auto & serialization = getSerialization(name_type.name); if (!type->isValueRepresentedByNumber() || type->haveSubtypes() || serialization->getKind() != ISerialization::Kind::DEFAULT) throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot validate column of non fixed type {}", type->getName()); @@ -470,21 +479,21 @@ void MergeTreeDataPartWriterWide::validateColumnOfFixedSize(const NameAndTypePai String bin_path = escaped_name + DATA_FILE_EXTENSION; /// Some columns may be removed because of ttl. Skip them. 
- if (!data_part->getDataPartStorage().exists(mrk_path)) + if (!getDataPartStorage().exists(mrk_path)) return; - auto mrk_file_in = data_part->getDataPartStorage().readFile(mrk_path, {}, std::nullopt, std::nullopt); + auto mrk_file_in = getDataPartStorage().readFile(mrk_path, {}, std::nullopt, std::nullopt); std::unique_ptr mrk_in; - if (data_part->index_granularity_info.mark_type.compressed) + if (index_granularity_info.mark_type.compressed) mrk_in = std::make_unique(std::move(mrk_file_in)); else mrk_in = std::move(mrk_file_in); - DB::CompressedReadBufferFromFile bin_in(data_part->getDataPartStorage().readFile(bin_path, {}, std::nullopt, std::nullopt)); + DB::CompressedReadBufferFromFile bin_in(getDataPartStorage().readFile(bin_path, {}, std::nullopt, std::nullopt)); bool must_be_last = false; UInt64 offset_in_compressed_file = 0; UInt64 offset_in_decompressed_block = 0; - UInt64 index_granularity_rows = data_part->index_granularity_info.fixed_index_granularity; + UInt64 index_granularity_rows = index_granularity_info.fixed_index_granularity; size_t mark_num; @@ -500,7 +509,7 @@ void MergeTreeDataPartWriterWide::validateColumnOfFixedSize(const NameAndTypePai if (settings.can_use_adaptive_granularity) readBinaryLittleEndian(index_granularity_rows, *mrk_in); else - index_granularity_rows = data_part->index_granularity_info.fixed_index_granularity; + index_granularity_rows = index_granularity_info.fixed_index_granularity; if (must_be_last) { @@ -533,7 +542,7 @@ void MergeTreeDataPartWriterWide::validateColumnOfFixedSize(const NameAndTypePai ErrorCodes::LOGICAL_ERROR, "Incorrect mark rows for part {} for mark #{}" " (compressed offset {}, decompressed offset {}), in-memory {}, on disk {}, total marks {}", - data_part->getDataPartStorage().getFullPath(), + getDataPartStorage().getFullPath(), mark_num, offset_in_compressed_file, offset_in_decompressed_block, index_granularity.getMarkRows(mark_num), index_granularity_rows, index_granularity.getMarksCount()); @@ -596,10 +605,10 @@ void MergeTreeDataPartWriterWide::validateColumnOfFixedSize(const NameAndTypePai void MergeTreeDataPartWriterWide::fillDataChecksums(IMergeTreeDataPart::Checksums & checksums, NameSet & checksums_to_remove) { - const auto & global_settings = storage.getContext()->getSettingsRef(); +// const auto & global_settings = storage.getContext()->getSettingsRef(); ISerialization::SerializeBinaryBulkSettings serialize_settings; - serialize_settings.low_cardinality_max_dictionary_size = global_settings.low_cardinality_max_dictionary_size; - serialize_settings.low_cardinality_use_single_dictionary_for_part = global_settings.low_cardinality_use_single_dictionary_for_part != 0; + serialize_settings.low_cardinality_max_dictionary_size = low_cardinality_max_dictionary_size;//global_settings.low_cardinality_max_dictionary_size; + serialize_settings.low_cardinality_use_single_dictionary_for_part = low_cardinality_use_single_dictionary_for_part;//global_settings.low_cardinality_use_single_dictionary_for_part != 0; WrittenOffsetColumns offset_columns; if (rows_written_in_last_mark > 0) { @@ -622,7 +631,7 @@ void MergeTreeDataPartWriterWide::fillDataChecksums(IMergeTreeDataPart::Checksum if (!serialization_states.empty()) { serialize_settings.getter = createStreamGetter(*it, written_offset_columns ? 
*written_offset_columns : offset_columns); - data_part->getSerialization(it->name)->serializeBinaryBulkStateSuffix(serialize_settings, serialization_states[it->name]); + getSerialization(it->name)->serializeBinaryBulkStateSuffix(serialize_settings, serialization_states[it->name]); } if (write_final_mark) @@ -665,7 +674,7 @@ void MergeTreeDataPartWriterWide::finishDataSerialization(bool sync) { if (column.type->isValueRepresentedByNumber() && !column.type->haveSubtypes() - && data_part->getSerialization(column.name)->getKind() == ISerialization::Kind::DEFAULT) + && getSerialization(column.name)->getKind() == ISerialization::Kind::DEFAULT) { validateColumnOfFixedSize(column); } @@ -708,7 +717,7 @@ void MergeTreeDataPartWriterWide::writeFinalMark( { writeSingleMark(column, offset_columns, 0); /// Memoize information about offsets - data_part->getSerialization(column.name)->enumerateStreams([&] (const ISerialization::SubstreamPath & substream_path) + getSerialization(column.name)->enumerateStreams([&] (const ISerialization::SubstreamPath & substream_path) { bool is_offsets = !substream_path.empty() && substream_path.back().type == ISerialization::Substream::ArraySizes; if (is_offsets) diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h index f5ff323563d..ef9c4ab17dc 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h @@ -21,7 +21,14 @@ class MergeTreeDataPartWriterWide : public MergeTreeDataPartWriterOnDisk { public: MergeTreeDataPartWriterWide( - const MergeTreeMutableDataPartPtr & data_part, +// const MergeTreeMutableDataPartPtr & data_part, + const String & data_part_name_, + const String & logger_name_, + const SerializationByName & serializations_, + MutableDataPartStoragePtr data_part_storage_, + const MergeTreeIndexGranularityInfo & index_granularity_info_, + const MergeTreeSettingsPtr & storage_settings_, + const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot, const std::vector & indices_to_recalc, diff --git a/src/Storages/MergeTree/MergeTreeDataWriter.cpp b/src/Storages/MergeTree/MergeTreeDataWriter.cpp index daa163d741c..0f05c171230 100644 --- a/src/Storages/MergeTree/MergeTreeDataWriter.cpp +++ b/src/Storages/MergeTree/MergeTreeDataWriter.cpp @@ -600,7 +600,7 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeTempPartImpl( indices, MergeTreeStatisticsFactory::instance().getMany(metadata_snapshot->getColumns()), compression_codec, - context->getCurrentTransaction(), + context->getCurrentTransaction() ? context->getCurrentTransaction()->tid : Tx::PrehistoricTID, false, false, context->getWriteSettings()); @@ -738,7 +738,7 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeProjectionPartImpl( MergeTreeIndices{}, Statistics{}, /// TODO(hanfei): It should be helpful to write statistics for projection result. 
compression_codec, - NO_TRANSACTION_PTR, + Tx::PrehistoricTID, false, false, data.getContext()->getWriteSettings()); out->writeWithPermutation(block, perm_ptr); diff --git a/src/Storages/MergeTree/MergeTreePartition.cpp b/src/Storages/MergeTree/MergeTreePartition.cpp index ddeaf69136a..c2ef7f98388 100644 --- a/src/Storages/MergeTree/MergeTreePartition.cpp +++ b/src/Storages/MergeTree/MergeTreePartition.cpp @@ -12,6 +12,7 @@ #include #include #include +#include "Interpreters/Context_fwd.h" #include #include @@ -413,12 +414,14 @@ void MergeTreePartition::load(const MergeTreeData & storage, const PartMetadataM partition_key_sample.getByPosition(i).type->getDefaultSerialization()->deserializeBinary(value[i], *file, {}); } -std::unique_ptr MergeTreePartition::store(const MergeTreeData & storage, IDataPartStorage & data_part_storage, MergeTreeDataPartChecksums & checksums) const +std::unique_ptr MergeTreePartition::store(/*const MergeTreeData & storage,*/ + StorageMetadataPtr metadata_snapshot, ContextPtr storage_context, + IDataPartStorage & data_part_storage, MergeTreeDataPartChecksums & checksums) const { - auto metadata_snapshot = storage.getInMemoryMetadataPtr(); - const auto & context = storage.getContext(); - const auto & partition_key_sample = adjustPartitionKey(metadata_snapshot, storage.getContext()).sample_block; - return store(partition_key_sample, data_part_storage, checksums, context->getWriteSettings()); +// auto metadata_snapshot = storage.getInMemoryMetadataPtr(); +// const auto & context = storage.getContext(); + const auto & partition_key_sample = adjustPartitionKey(metadata_snapshot, storage_context).sample_block; + return store(partition_key_sample, data_part_storage, checksums, storage_context->getWriteSettings()); } std::unique_ptr MergeTreePartition::store(const Block & partition_key_sample, IDataPartStorage & data_part_storage, MergeTreeDataPartChecksums & checksums, const WriteSettings & settings) const diff --git a/src/Storages/MergeTree/MergeTreePartition.h b/src/Storages/MergeTree/MergeTreePartition.h index 78b141f26ec..04175d6f927 100644 --- a/src/Storages/MergeTree/MergeTreePartition.h +++ b/src/Storages/MergeTree/MergeTreePartition.h @@ -44,7 +44,9 @@ public: /// Store functions return write buffer with written but not finalized data. /// User must call finish() for returned object. 
- [[nodiscard]] std::unique_ptr store(const MergeTreeData & storage, IDataPartStorage & data_part_storage, MergeTreeDataPartChecksums & checksums) const; + [[nodiscard]] std::unique_ptr store(//const MergeTreeData & storage, + StorageMetadataPtr metadata_snapshot, ContextPtr storage_context, + IDataPartStorage & data_part_storage, MergeTreeDataPartChecksums & checksums) const; [[nodiscard]] std::unique_ptr store(const Block & partition_key_sample, IDataPartStorage & data_part_storage, MergeTreeDataPartChecksums & checksums, const WriteSettings & settings) const; void assign(const MergeTreePartition & other) { value = other.value; } diff --git a/src/Storages/MergeTree/MergedBlockOutputStream.cpp b/src/Storages/MergeTree/MergedBlockOutputStream.cpp index 9f641fd8eb5..2441d941952 100644 --- a/src/Storages/MergeTree/MergedBlockOutputStream.cpp +++ b/src/Storages/MergeTree/MergedBlockOutputStream.cpp @@ -21,35 +21,40 @@ MergedBlockOutputStream::MergedBlockOutputStream( const MergeTreeIndices & skip_indices, const Statistics & statistics, CompressionCodecPtr default_codec_, - const MergeTreeTransactionPtr & txn, + TransactionID tid, bool reset_columns_, bool blocks_are_granules_size, const WriteSettings & write_settings_, const MergeTreeIndexGranularity & computed_index_granularity) - : IMergedBlockOutputStream(data_part, metadata_snapshot_, columns_list_, reset_columns_) + : IMergedBlockOutputStream(data_part->storage.getSettings(), data_part->getDataPartStoragePtr(), metadata_snapshot_, columns_list_, reset_columns_) , columns_list(columns_list_) , default_codec(default_codec_) , write_settings(write_settings_) { MergeTreeWriterSettings writer_settings( - storage.getContext()->getSettings(), + data_part->storage.getContext()->getSettings(), write_settings, - storage.getSettings(), + storage_settings, data_part->index_granularity_info.mark_type.adaptive, /* rewrite_primary_key = */ true, blocks_are_granules_size); +// TODO: looks like isStoredOnDisk() is always true for MergeTreeDataPart if (data_part->isStoredOnDisk()) data_part_storage->createDirectories(); - /// We should write version metadata on part creation to distinguish it from parts that were created without transaction. - TransactionID tid = txn ? txn->tid : Tx::PrehistoricTID; +// /// We should write version metadata on part creation to distinguish it from parts that were created without transaction. +// TransactionID tid = txn ? txn->tid : Tx::PrehistoricTID; /// NOTE do not pass context for writing to system.transactions_info_log, /// because part may have temporary name (with temporary block numbers). Will write it later. data_part->version.setCreationTID(tid, nullptr); data_part->storeVersionMetadata(); - writer = data_part->getWriter(columns_list, metadata_snapshot, skip_indices, statistics, default_codec, writer_settings, computed_index_granularity); + writer = createMergeTreeDataPartWriter(data_part->getType(), + data_part->name, data_part->storage.getLogName(), data_part->getSerializations(), + data_part_storage, data_part->index_granularity_info, + storage_settings, + columns_list, metadata_snapshot, skip_indices, statistics, data_part->getMarksFileExtension(), default_codec, writer_settings, computed_index_granularity); } /// If data is pre-sorted. 
@@ -208,7 +213,7 @@ MergedBlockOutputStream::WrittenFiles MergedBlockOutputStream::finalizePartOnDis if (new_part->isProjectionPart()) { - if (storage.format_version >= MERGE_TREE_DATA_MIN_FORMAT_VERSION_WITH_CUSTOM_PARTITIONING || isCompactPart(new_part)) + if (new_part->storage.format_version >= MERGE_TREE_DATA_MIN_FORMAT_VERSION_WITH_CUSTOM_PARTITIONING || isCompactPart(new_part)) { auto count_out = new_part->getDataPartStorage().writeFile("count.txt", 4096, write_settings); HashingWriteBuffer count_out_hashing(*count_out); @@ -234,14 +239,16 @@ MergedBlockOutputStream::WrittenFiles MergedBlockOutputStream::finalizePartOnDis written_files.emplace_back(std::move(out)); } - if (storage.format_version >= MERGE_TREE_DATA_MIN_FORMAT_VERSION_WITH_CUSTOM_PARTITIONING) + if (new_part->storage.format_version >= MERGE_TREE_DATA_MIN_FORMAT_VERSION_WITH_CUSTOM_PARTITIONING) { - if (auto file = new_part->partition.store(storage, new_part->getDataPartStorage(), checksums)) + if (auto file = new_part->partition.store(//storage, + new_part->storage.getInMemoryMetadataPtr(), new_part->storage.getContext(), + new_part->getDataPartStorage(), checksums)) written_files.emplace_back(std::move(file)); if (new_part->minmax_idx->initialized) { - auto files = new_part->minmax_idx->store(storage, new_part->getDataPartStorage(), checksums); + auto files = new_part->minmax_idx->store(new_part->storage, new_part->getDataPartStorage(), checksums); for (auto & file : files) written_files.emplace_back(std::move(file)); } diff --git a/src/Storages/MergeTree/MergedBlockOutputStream.h b/src/Storages/MergeTree/MergedBlockOutputStream.h index 540b3b3bffa..c1e3d75fefc 100644 --- a/src/Storages/MergeTree/MergedBlockOutputStream.h +++ b/src/Storages/MergeTree/MergedBlockOutputStream.h @@ -22,7 +22,7 @@ public: const MergeTreeIndices & skip_indices, const Statistics & statistics, CompressionCodecPtr default_codec_, - const MergeTreeTransactionPtr & txn, + TransactionID tid, bool reset_columns_ = false, bool blocks_are_granules_size = false, const WriteSettings & write_settings = {}, diff --git a/src/Storages/MergeTree/MergedColumnOnlyOutputStream.cpp b/src/Storages/MergeTree/MergedColumnOnlyOutputStream.cpp index 728b2e38833..51853384012 100644 --- a/src/Storages/MergeTree/MergedColumnOnlyOutputStream.cpp +++ b/src/Storages/MergeTree/MergedColumnOnlyOutputStream.cpp @@ -20,11 +20,11 @@ MergedColumnOnlyOutputStream::MergedColumnOnlyOutputStream( WrittenOffsetColumns * offset_columns_, const MergeTreeIndexGranularity & index_granularity, const MergeTreeIndexGranularityInfo * index_granularity_info) - : IMergedBlockOutputStream(data_part, metadata_snapshot_, header_.getNamesAndTypesList(), /*reset_columns=*/ true) + : IMergedBlockOutputStream(data_part->storage.getSettings(), data_part->getDataPartStoragePtr(), metadata_snapshot_, header_.getNamesAndTypesList(), /*reset_columns=*/ true) , header(header_) { const auto & global_settings = data_part->storage.getContext()->getSettings(); - const auto & storage_settings = data_part->storage.getSettings(); +// const auto & storage_settings = data_part->storage.getSettings(); MergeTreeWriterSettings writer_settings( global_settings, @@ -33,11 +33,16 @@ MergedColumnOnlyOutputStream::MergedColumnOnlyOutputStream( index_granularity_info ? 
index_granularity_info->mark_type.adaptive : data_part->storage.canUseAdaptiveGranularity(), /* rewrite_primary_key = */ false); - writer = data_part->getWriter( + writer = createMergeTreeDataPartWriter( + data_part->getType(), + data_part->name, data_part->storage.getLogName(), data_part->getSerializations(), + data_part_storage, data_part->index_granularity_info, + storage_settings, header.getNamesAndTypesList(), metadata_snapshot_, indices_to_recalc, stats_to_recalc_, + data_part->getMarksFileExtension(), default_codec, writer_settings, index_granularity); diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index 55d845dfbb9..54077055d96 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -1660,7 +1660,7 @@ private: skip_indices, stats_to_rewrite, ctx->compression_codec, - ctx->txn, + ctx->txn ? ctx->txn->tid : Tx::PrehistoricTID, /*reset_columns=*/ true, /*blocks_are_granules_size=*/ false, ctx->context->getWriteSettings(), From 32b8aba8ef1bf9a0b890065a5d719a002cee8bb5 Mon Sep 17 00:00:00 2001 From: Alexander Gololobov Date: Fri, 10 May 2024 14:12:34 +0200 Subject: [PATCH 101/392] Style --- src/Storages/MergeTree/IMergeTreeDataPartWriter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/IMergeTreeDataPartWriter.cpp b/src/Storages/MergeTree/IMergeTreeDataPartWriter.cpp index c67e148d011..b46fbc5fc9e 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPartWriter.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPartWriter.cpp @@ -6,6 +6,7 @@ namespace DB namespace ErrorCodes { + extern const int LOGICAL_ERROR; extern const int NO_SUCH_COLUMN_IN_TABLE; } @@ -144,7 +145,6 @@ MergeTreeDataPartWriterPtr createMergeTreeDataPartWideWriter( const MergeTreeIndexGranularity & computed_index_granularity); - MergeTreeDataPartWriterPtr createMergeTreeDataPartWriter( MergeTreeDataPartType part_type, const String & data_part_name_, From 60c721c21b645bad32dbe361b502e9132474793a Mon Sep 17 00:00:00 2001 From: avogar Date: Fri, 10 May 2024 12:20:27 +0000 Subject: [PATCH 102/392] Fix build after conflict resolution --- src/Functions/FunctionsConversion.cpp | 3 ++- src/Storages/MergeTree/MergeTreeReaderWide.cpp | 11 +++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/Functions/FunctionsConversion.cpp b/src/Functions/FunctionsConversion.cpp index 90703947182..8f5d11b05ee 100644 --- a/src/Functions/FunctionsConversion.cpp +++ b/src/Functions/FunctionsConversion.cpp @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include @@ -5057,7 +5058,7 @@ private: } else if (from_type->getCustomSerialization()) { - ret = [](ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable *, size_t input_rows_count) -> ColumnPtr + ret = [this](ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable *, size_t input_rows_count) -> ColumnPtr { return ConvertImplGenericToString::execute(arguments, result_type, input_rows_count, context ? 
getFormatSettings(context) : FormatSettings()); }; diff --git a/src/Storages/MergeTree/MergeTreeReaderWide.cpp b/src/Storages/MergeTree/MergeTreeReaderWide.cpp index 9468cffd25d..b7eefab112c 100644 --- a/src/Storages/MergeTree/MergeTreeReaderWide.cpp +++ b/src/Storages/MergeTree/MergeTreeReaderWide.cpp @@ -249,7 +249,7 @@ MergeTreeReaderWide::FileStreams::iterator MergeTreeReaderWide::addStream(const auto marks_loader = std::make_shared( data_part_info_for_read, mark_cache, - data_part_info_for_read->getIndexGranularityInfo().getMarksFilePath(*stream_name), + data_part_info_for_read->getIndexGranularityInfo().getMarksFilePath(stream_name), num_marks_in_part, data_part_info_for_read->getIndexGranularityInfo(), settings.save_marks_in_cache, @@ -257,24 +257,23 @@ MergeTreeReaderWide::FileStreams::iterator MergeTreeReaderWide::addStream(const load_marks_threadpool, /*num_columns_in_mark=*/ 1); - has_any_stream = true; auto stream_settings = settings; stream_settings.is_low_cardinality_dictionary = substream_path.size() > 1 && substream_path[substream_path.size() - 2].type == ISerialization::Substream::Type::DictionaryKeys; auto create_stream = [&]() { return std::make_unique( - data_part_info_for_read->getDataPartStorage(), *stream_name, DATA_FILE_EXTENSION, + data_part_info_for_read->getDataPartStorage(), stream_name, DATA_FILE_EXTENSION, num_marks_in_part, all_mark_ranges, stream_settings, - uncompressed_cache, data_part_info_for_read->getFileSizeOrZero(*stream_name + DATA_FILE_EXTENSION), + uncompressed_cache, data_part_info_for_read->getFileSizeOrZero(stream_name + DATA_FILE_EXTENSION), std::move(marks_loader), profile_callback, clock_type); }; if (read_without_marks) - return streams.emplace(*stream_name, create_stream.operator()()); + return streams.emplace(stream_name, create_stream.operator()()).first; marks_loader->startAsyncLoad(); - return streams.emplace(*stream_name, create_stream.operator()()); + return streams.emplace(stream_name, create_stream.operator()()).first; } ReadBuffer * MergeTreeReaderWide::getStream( From fb20e80db417f63ed7a12036488accb9f418f261 Mon Sep 17 00:00:00 2001 From: avogar Date: Fri, 10 May 2024 13:23:19 +0000 Subject: [PATCH 103/392] Better test, fix style --- src/Functions/FunctionsConversion.cpp | 62 ++++++++++++------- .../MergeTree/MergeTreeReaderWide.cpp | 2 +- ...9_dynamic_all_merge_algorithms_2.reference | 20 +++--- .../03039_dynamic_all_merge_algorithms_2.sh | 8 +-- 4 files changed, 56 insertions(+), 36 deletions(-) diff --git a/src/Functions/FunctionsConversion.cpp b/src/Functions/FunctionsConversion.cpp index 8f5d11b05ee..5bb6fa065de 100644 --- a/src/Functions/FunctionsConversion.cpp +++ b/src/Functions/FunctionsConversion.cpp @@ -576,7 +576,7 @@ ColumnUInt8::MutablePtr copyNullMap(ColumnPtr col) template struct ConvertImplGenericToString { - static ColumnPtr execute(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t /*input_rows_count*/, const FormatSettings & format_settings) + static ColumnPtr execute(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t /*input_rows_count*/, const ContextPtr & context) { static_assert(std::is_same_v || std::is_same_v, "Can be used only to serialize to ColumnString or ColumnFixedString"); @@ -597,6 +597,7 @@ struct ConvertImplGenericToString auto & write_buffer = write_helper.getWriteBuffer(); + FormatSettings format_settings = context ? 
getFormatSettings(context) : FormatSettings{}; auto serialization = type.getDefaultSerialization(); for (size_t row = 0; row < size; ++row) { @@ -1820,7 +1821,7 @@ struct ConvertImpl template struct ConvertImplGenericFromString { - static ColumnPtr execute(ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable * column_nullable, size_t input_rows_count) + static ColumnPtr execute(ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable * column_nullable, size_t input_rows_count, const ContextPtr & context) { const IColumn & column_from = *arguments[0].column; const IDataType & data_type_to = *result_type; @@ -1828,7 +1829,7 @@ struct ConvertImplGenericFromString auto serialization = data_type_to.getDefaultSerialization(); const auto * null_map = column_nullable ? &column_nullable->getNullMapData() : nullptr; - executeImpl(column_from, *res, *serialization, input_rows_count, null_map, result_type.get()); + executeImpl(column_from, *res, *serialization, input_rows_count, null_map, result_type.get(), context); return res; } @@ -1838,11 +1839,12 @@ struct ConvertImplGenericFromString const ISerialization & serialization_from, size_t input_rows_count, const PaddedPODArray * null_map, - const IDataType * result_type) + const IDataType * result_type, + const ContextPtr & context) { column_to.reserve(input_rows_count); - FormatSettings format_settings; + FormatSettings format_settings = context ? getFormatSettings(context) : FormatSettings{}; for (size_t i = 0; i < input_rows_count; ++i) { if (null_map && (*null_map)[i]) @@ -2299,7 +2301,7 @@ private: if constexpr (std::is_same_v) { if (from_type->getCustomSerialization()) - return ConvertImplGenericToString::execute(arguments, result_type, input_rows_count, context ? getFormatSettings(context) : FormatSettings()); + return ConvertImplGenericToString::execute(arguments, result_type, input_rows_count, context); } bool done = false; @@ -2332,7 +2334,7 @@ private: /// Generic conversion of any type to String. if (std::is_same_v) { - return ConvertImplGenericToString::execute(arguments, result_type, input_rows_count, context ? getFormatSettings(context) : FormatSettings()); + return ConvertImplGenericToString::execute(arguments, result_type, input_rows_count, context); } else throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}", @@ -3288,8 +3290,17 @@ private: if (checkAndGetDataType(from_type.get())) { if (cast_type == CastType::accurateOrNull) - return &ConvertImplGenericFromString::execute; - return &ConvertImplGenericFromString::execute; + { + return [this](ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable * column_nullable, size_t input_rows_count) -> ColumnPtr + { + return ConvertImplGenericFromString::execute(arguments, result_type, column_nullable, input_rows_count, context); + }; + } + + return [this](ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable * column_nullable, size_t input_rows_count) -> ColumnPtr + { + return ConvertImplGenericFromString::execute(arguments, result_type, column_nullable, input_rows_count, context); + }; } return createWrapper(from_type, to_type, requested_result_is_nullable); @@ -3452,7 +3463,10 @@ private: /// Conversion from String through parsing. 
if (checkAndGetDataType(from_type_untyped.get())) { - return &ConvertImplGenericFromString::execute; + return [this](ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable * column_nullable, size_t input_rows_count) -> ColumnPtr + { + return ConvertImplGenericFromString::execute(arguments, result_type, column_nullable, input_rows_count, context); + }; } else if (const auto * agg_type = checkAndGetDataType(from_type_untyped.get())) { @@ -3495,7 +3509,10 @@ private: /// Conversion from String through parsing. if (checkAndGetDataType(from_type_untyped.get())) { - return &ConvertImplGenericFromString::execute; + return [this](ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable * column_nullable, size_t input_rows_count) -> ColumnPtr + { + return ConvertImplGenericFromString::execute(arguments, result_type, column_nullable, input_rows_count, context); + }; } DataTypePtr from_type_holder; @@ -3586,7 +3603,10 @@ private: /// Conversion from String through parsing. if (checkAndGetDataType(from_type_untyped.get())) { - return &ConvertImplGenericFromString::execute; + return [this](ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable * column_nullable, size_t input_rows_count) -> ColumnPtr + { + return ConvertImplGenericFromString::execute(arguments, result_type, column_nullable, input_rows_count, context); + }; } const auto * from_type = checkAndGetDataType(from_type_untyped.get()); @@ -3929,9 +3949,9 @@ private: } else if (checkAndGetDataType(from_type.get())) { - return [] (ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable * nullable_source, size_t input_rows_count) + return [this](ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable * nullable_source, size_t input_rows_count) { - auto res = ConvertImplGenericFromString::execute(arguments, result_type, nullable_source, input_rows_count)->assumeMutable(); + auto res = ConvertImplGenericFromString::execute(arguments, result_type, nullable_source, input_rows_count, context)->assumeMutable(); res->finalize(); return res; }; @@ -4104,8 +4124,8 @@ private: args[0].type = removeNullable(removeLowCardinality(args[0].type)); if (cast_type == CastType::accurateOrNull) - return ConvertImplGenericFromString::execute(args, result_type, column_nullable, input_rows_count); - return ConvertImplGenericFromString::execute(args, result_type, column_nullable, input_rows_count); + return ConvertImplGenericFromString::execute(args, result_type, column_nullable, input_rows_count, context); + return ConvertImplGenericFromString::execute(args, result_type, column_nullable, input_rows_count, context); }; } @@ -4265,8 +4285,8 @@ private: args[0].type = removeNullable(removeLowCardinality(args[0].type)); if (cast_type == CastType::accurateOrNull) - return ConvertImplGenericFromString::execute(args, result_type, column_nullable, input_rows_count); - return ConvertImplGenericFromString::execute(args, result_type, column_nullable, input_rows_count); + return ConvertImplGenericFromString::execute(args, result_type, column_nullable, input_rows_count, context); + return ConvertImplGenericFromString::execute(args, result_type, column_nullable, input_rows_count, context); }; } @@ -5020,9 +5040,9 @@ private: wrapped_result_type = makeNullable(result_type); if (this->cast_type == CastType::accurateOrNull) return ConvertImplGenericFromString::execute( - arguments, wrapped_result_type, column_nullable, 
input_rows_count); + arguments, wrapped_result_type, column_nullable, input_rows_count, context); return ConvertImplGenericFromString::execute( - arguments, wrapped_result_type, column_nullable, input_rows_count); + arguments, wrapped_result_type, column_nullable, input_rows_count, context); }; return true; } @@ -5060,7 +5080,7 @@ private: { ret = [this](ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable *, size_t input_rows_count) -> ColumnPtr { - return ConvertImplGenericToString::execute(arguments, result_type, input_rows_count, context ? getFormatSettings(context) : FormatSettings()); + return ConvertImplGenericToString::execute(arguments, result_type, input_rows_count, context); }; return true; } diff --git a/src/Storages/MergeTree/MergeTreeReaderWide.cpp b/src/Storages/MergeTree/MergeTreeReaderWide.cpp index b7eefab112c..b6882fdced9 100644 --- a/src/Storages/MergeTree/MergeTreeReaderWide.cpp +++ b/src/Storages/MergeTree/MergeTreeReaderWide.cpp @@ -271,7 +271,7 @@ MergeTreeReaderWide::FileStreams::iterator MergeTreeReaderWide::addStream(const if (read_without_marks) return streams.emplace(stream_name, create_stream.operator()()).first; - + marks_loader->startAsyncLoad(); return streams.emplace(stream_name, create_stream.operator()()).first; } diff --git a/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_2.reference b/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_2.reference index 03c8b4564fa..af6c7d8d567 100644 --- a/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_2.reference +++ b/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_2.reference @@ -2,8 +2,8 @@ MergeTree compact + horizontal merge CollapsingMergeTree 100000 String 100000 UInt64 -50000 UInt64 50000 String +50000 UInt64 VersionedCollapsingMergeTree 100000 String 100000 UInt64 @@ -11,34 +11,34 @@ VersionedCollapsingMergeTree 75000 UInt64 MergeTree wide + horizontal merge CollapsingMergeTree -100000 UInt64 100000 String +100000 UInt64 50000 String 50000 UInt64 VersionedCollapsingMergeTree -100000 UInt64 100000 String +100000 UInt64 75000 String 75000 UInt64 MergeTree compact + vertical merge CollapsingMergeTree -100000 UInt64 100000 String -50000 UInt64 +100000 UInt64 50000 String +50000 UInt64 VersionedCollapsingMergeTree -100000 UInt64 100000 String -75000 UInt64 +100000 UInt64 75000 String +75000 UInt64 MergeTree wide + vertical merge CollapsingMergeTree -100000 UInt64 100000 String +100000 UInt64 50000 String 50000 UInt64 VersionedCollapsingMergeTree -100000 UInt64 100000 String -75000 UInt64 +100000 UInt64 75000 String +75000 UInt64 diff --git a/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_2.sh b/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_2.sh index 5dae9228d0a..f067a99ca19 100755 --- a/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_2.sh +++ b/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_2.sh @@ -18,9 +18,9 @@ function test() $CH_CLIENT -q "insert into test select number, 1, number from numbers(100000)" $CH_CLIENT -q "insert into test select number, -1, 'str_' || toString(number) from numbers(50000, 100000)" - $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count()" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" $CH_CLIENT -nm -q "system start merges test; optimize table test final" - $CH_CLIENT -q "select count(), dynamicType(d) from test group by 
dynamicType(d) order by count()" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" $CH_CLIENT -q "drop table test" echo "VersionedCollapsingMergeTree" @@ -29,9 +29,9 @@ function test() $CH_CLIENT -q "insert into test select number, 1, 1, number from numbers(100000)" $CH_CLIENT -q "insert into test select number, -1, number >= 75000 ? 2 : 1, 'str_' || toString(number) from numbers(50000, 100000)" - $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count()" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" $CH_CLIENT -nm -q "system start merges test; optimize table test final" - $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count()" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" $CH_CLIENT -q "drop table test" } From b20d60858f1286a5e406e2c74036e6ad244fda2b Mon Sep 17 00:00:00 2001 From: Alexander Gololobov Date: Fri, 10 May 2024 15:48:32 +0200 Subject: [PATCH 104/392] Pass low cardinality settings --- src/Storages/MergeTree/IMergeTreeDataPartWriter.h | 2 -- src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp | 8 ++++---- src/Storages/MergeTree/MergeTreeIOSettings.h | 5 +++++ 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/Storages/MergeTree/IMergeTreeDataPartWriter.h b/src/Storages/MergeTree/IMergeTreeDataPartWriter.h index ec04fd5f8a8..52e21bed2f2 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPartWriter.h +++ b/src/Storages/MergeTree/IMergeTreeDataPartWriter.h @@ -69,8 +69,6 @@ protected: // const MergeTreeData & storage; // TODO: remove const MergeTreeSettingsPtr storage_settings; - const size_t low_cardinality_max_dictionary_size = 0; // TODO: pass it in ctor - const bool low_cardinality_use_single_dictionary_for_part = true; // TODO: pass it in ctor const StorageMetadataPtr metadata_snapshot; diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp index 1f68a9d31a1..713dee87fa8 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp @@ -419,8 +419,8 @@ void MergeTreeDataPartWriterWide::writeColumn( // const auto & global_settings = storage.getContext()->getSettingsRef(); ISerialization::SerializeBinaryBulkSettings serialize_settings; serialize_settings.getter = createStreamGetter(name_and_type, offset_columns); - serialize_settings.low_cardinality_max_dictionary_size = low_cardinality_max_dictionary_size;//global_settings.low_cardinality_max_dictionary_size; - serialize_settings.low_cardinality_use_single_dictionary_for_part = low_cardinality_use_single_dictionary_for_part;//global_settings.low_cardinality_use_single_dictionary_for_part != 0; + serialize_settings.low_cardinality_max_dictionary_size = settings.low_cardinality_max_dictionary_size;//global_settings.low_cardinality_max_dictionary_size; + serialize_settings.low_cardinality_use_single_dictionary_for_part = settings.low_cardinality_use_single_dictionary_for_part;//global_settings.low_cardinality_use_single_dictionary_for_part != 0; for (const auto & granule : granules) { @@ -607,8 +607,8 @@ void MergeTreeDataPartWriterWide::fillDataChecksums(IMergeTreeDataPart::Checksum { // const auto & global_settings = storage.getContext()->getSettingsRef(); 
ISerialization::SerializeBinaryBulkSettings serialize_settings; - serialize_settings.low_cardinality_max_dictionary_size = low_cardinality_max_dictionary_size;//global_settings.low_cardinality_max_dictionary_size; - serialize_settings.low_cardinality_use_single_dictionary_for_part = low_cardinality_use_single_dictionary_for_part;//global_settings.low_cardinality_use_single_dictionary_for_part != 0; + serialize_settings.low_cardinality_max_dictionary_size = settings.low_cardinality_max_dictionary_size;//global_settings.low_cardinality_max_dictionary_size; + serialize_settings.low_cardinality_use_single_dictionary_for_part = settings.low_cardinality_use_single_dictionary_for_part;//global_settings.low_cardinality_use_single_dictionary_for_part != 0; WrittenOffsetColumns offset_columns; if (rows_written_in_last_mark > 0) { diff --git a/src/Storages/MergeTree/MergeTreeIOSettings.h b/src/Storages/MergeTree/MergeTreeIOSettings.h index 12a83703148..421c62887da 100644 --- a/src/Storages/MergeTree/MergeTreeIOSettings.h +++ b/src/Storages/MergeTree/MergeTreeIOSettings.h @@ -74,6 +74,8 @@ struct MergeTreeWriterSettings , blocks_are_granules_size(blocks_are_granules_size_) , query_write_settings(query_write_settings_) , max_threads_for_annoy_index_creation(global_settings.max_threads_for_annoy_index_creation) + , low_cardinality_max_dictionary_size(global_settings.low_cardinality_max_dictionary_size) + , low_cardinality_use_single_dictionary_for_part(global_settings.low_cardinality_use_single_dictionary_for_part) { } @@ -93,6 +95,9 @@ struct MergeTreeWriterSettings WriteSettings query_write_settings; size_t max_threads_for_annoy_index_creation; + + size_t low_cardinality_max_dictionary_size; + bool low_cardinality_use_single_dictionary_for_part; }; } From cd3604f23543cbd07f650c1446d54606d06a81cf Mon Sep 17 00:00:00 2001 From: avogar Date: Fri, 10 May 2024 14:14:17 +0000 Subject: [PATCH 105/392] Remove trailing whitespaces --- src/Functions/FunctionsConversion.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Functions/FunctionsConversion.cpp b/src/Functions/FunctionsConversion.cpp index 5bb6fa065de..09d0025860a 100644 --- a/src/Functions/FunctionsConversion.cpp +++ b/src/Functions/FunctionsConversion.cpp @@ -3296,7 +3296,7 @@ private: return ConvertImplGenericFromString::execute(arguments, result_type, column_nullable, input_rows_count, context); }; } - + return [this](ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable * column_nullable, size_t input_rows_count) -> ColumnPtr { return ConvertImplGenericFromString::execute(arguments, result_type, column_nullable, input_rows_count, context); From 5004c225831c1fa1cf8c213673148a1ca299d4e1 Mon Sep 17 00:00:00 2001 From: Salvatore Mesoraca Date: Fri, 10 May 2024 15:25:21 +0200 Subject: [PATCH 106/392] Fix Array and Map support with Keyed hashing When working with materialized key columns and rows containing Arrays or Maps (implemented as Tuple's Arrays) with multiple values, the keyed hash functions were erroneously refusing to proceed, because they misinterpreted the output vector size. Close #61497 which was reported as a security issue, but it didn't actually have any security impact. The usefulness of keyed hashing over Maps is also questionable, but we support it for completeness. 
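For context, the offsets-based key lookup this patch introduces can be illustrated with a standalone sketch (hypothetical helper names, not the actual SipHashKeyColumns code): with materialized key columns there is one key pair per row, while the nested column of an Array/Map holds one value per element, so each flattened element index has to be mapped back to its owning row through the ColumnArray offsets before the correct key can be chosen.

    // Standalone C++ sketch (assumed names, not the ClickHouse implementation):
    // map a flattened array-element index back to the row it belongs to, so the
    // per-row hash key can be applied to every element of that row's Array/Map.
    #include <algorithm>
    #include <cassert>
    #include <cstdint>
    #include <iostream>
    #include <utility>
    #include <vector>

    using Offsets = std::vector<uint64_t>; // cumulative element counts, one entry per row

    // Returns the row index owning flattened element i: the first offset greater than i.
    size_t rowForElement(const Offsets & offsets, uint64_t i)
    {
        auto upper = std::upper_bound(offsets.begin(), offsets.end(), i);
        assert(upper != offsets.end()); // i must be < offsets.back()
        return static_cast<size_t>(upper - offsets.begin());
    }

    int main()
    {
        // Two rows with two elements each -> offsets {2, 4}; one (k0, k1) key per row.
        Offsets offsets{2, 4};
        std::vector<std::pair<uint64_t, uint64_t>> keys{{1, 2}, {3, 4}};

        for (uint64_t i = 0; i < offsets.back(); ++i)
        {
            size_t row = rowForElement(offsets, i);
            std::cout << "element " << i << " hashed with key of row " << row
                      << " (" << keys[row].first << ", " << keys[row].second << ")\n";
        }
    }

The sketch only shows the index mapping; the real fix additionally makes the key-column size check use offsets->back() so the nested value count is no longer mistaken for the row count.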
--- src/Functions/FunctionsHashing.h | 24 ++++++++++++++++++- .../0_stateless/02534_keyed_siphash.reference | 3 +++ .../0_stateless/02534_keyed_siphash.sql | 7 ++++++ 3 files changed, 33 insertions(+), 1 deletion(-) diff --git a/src/Functions/FunctionsHashing.h b/src/Functions/FunctionsHashing.h index 79b33e2f75b..bccdba5ee69 100644 --- a/src/Functions/FunctionsHashing.h +++ b/src/Functions/FunctionsHashing.h @@ -49,6 +49,8 @@ #include #include +#include + namespace DB { @@ -75,17 +77,29 @@ namespace impl ColumnPtr key0; ColumnPtr key1; bool is_const; + const ColumnArray::Offsets * offsets{}; size_t size() const { assert(key0 && key1); assert(key0->size() == key1->size()); + assert(offsets == nullptr || offsets->size() == key0->size()); + if (offsets != nullptr) + return offsets->back(); return key0->size(); } SipHashKey getKey(size_t i) const { if (is_const) i = 0; + if (offsets != nullptr) + { + const auto begin = offsets->begin(); + auto upper = std::upper_bound(begin, offsets->end(), i); + if (upper == offsets->end()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "offset {} not found in function SipHashKeyColumns::getKey", i); + i = upper - begin; + } const auto & key0data = assert_cast(*key0).getData(); const auto & key1data = assert_cast(*key1).getData(); return {key0data[i], key1data[i]}; @@ -1112,7 +1126,15 @@ private: typename ColumnVector::Container vec_temp(nested_size); bool nested_is_first = true; - executeForArgument(key_cols, nested_type, nested_column, vec_temp, nested_is_first); + + if constexpr (Keyed) + { + KeyColumnsType key_cols_tmp{key_cols}; + key_cols_tmp.offsets = &offsets; + executeForArgument(key_cols_tmp, nested_type, nested_column, vec_temp, nested_is_first); + } + else + executeForArgument(key_cols, nested_type, nested_column, vec_temp, nested_is_first); const size_t size = offsets.size(); diff --git a/tests/queries/0_stateless/02534_keyed_siphash.reference b/tests/queries/0_stateless/02534_keyed_siphash.reference index e3fae07333a..3f478218ff1 100644 --- a/tests/queries/0_stateless/02534_keyed_siphash.reference +++ b/tests/queries/0_stateless/02534_keyed_siphash.reference @@ -236,3 +236,6 @@ Check asan bug 0 Check bug found fuzzing 9042C6691B1A75F0EA3314B6F55728BB +Check bug 2 found fuzzing +608E1FF030C9E206185B112C2A25F1A7 +ABB65AE97711A2E053E324ED88B1D08B diff --git a/tests/queries/0_stateless/02534_keyed_siphash.sql b/tests/queries/0_stateless/02534_keyed_siphash.sql index 112ae15bf46..fb707109c83 100644 --- a/tests/queries/0_stateless/02534_keyed_siphash.sql +++ b/tests/queries/0_stateless/02534_keyed_siphash.sql @@ -338,3 +338,10 @@ SELECT sipHash128((toUInt64(9223372036854775806), 1)) = sipHash128(1) GROUP BY s SELECT 'Check bug found fuzzing'; SELECT [(255, 1048575)], sipHash128ReferenceKeyed((toUInt64(2147483646), toUInt64(9223372036854775807)), ([(NULL, 100), (NULL, NULL), (1024, 10)], toUInt64(2), toUInt64(1024)), ''), hex(sipHash128ReferenceKeyed((-9223372036854775807, 1.), '-1', NULL)), ('', toUInt64(65535), [(9223372036854775807, 9223372036854775806)], toUInt64(65536)), arrayJoin((NULL, 65537, 255), [(NULL, NULL)]) GROUP BY tupleElement((NULL, NULL, NULL, -1), toUInt64(2), 2) = NULL; -- { serverError NOT_IMPLEMENTED } SELECT hex(sipHash128ReferenceKeyed((0::UInt64, 0::UInt64), ([1, 1]))); + +SELECT 'Check bug 2 found fuzzing'; +DROP TABLE IF EXISTS sipHashKeyed_keys; +CREATE TABLE sipHashKeyed_keys (`a` Map(String, String)) ENGINE = Memory; +INSERT INTO sipHashKeyed_keys FORMAT VALUES ({'a':'b', 'c':'d'}), ({'e':'f', 'g':'h'}); +SELECT 
hex(sipHash128ReferenceKeyed((0::UInt64, materialize(0::UInt64)), a)) FROM sipHashKeyed_keys ORDER BY a; +DROP TABLE sipHashKeyed_keys; From a3aff6939c0b3afeeb9e4ab9c6f2992a2c61b543 Mon Sep 17 00:00:00 2001 From: Alexander Gololobov Date: Fri, 10 May 2024 19:21:16 +0200 Subject: [PATCH 107/392] Protected methods --- src/Storages/MergeTree/IMergeTreeDataPartWriter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/IMergeTreeDataPartWriter.h b/src/Storages/MergeTree/IMergeTreeDataPartWriter.h index 52e21bed2f2..6854668a01e 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPartWriter.h +++ b/src/Storages/MergeTree/IMergeTreeDataPartWriter.h @@ -49,13 +49,13 @@ public: Columns releaseIndexColumns(); const MergeTreeIndexGranularity & getIndexGranularity() const { return index_granularity; } +protected: SerializationPtr getSerialization(const String & column_name) const; ASTPtr getCodecDescOrDefault(const String & column_name, CompressionCodecPtr default_codec) const; IDataPartStorage & getDataPartStorage() { return *data_part_storage; } -protected: // const MergeTreeMutableDataPartPtr data_part; // TODO: remove From 9d0ad7ba67b6855344512398b5f924bdad4ece9e Mon Sep 17 00:00:00 2001 From: copperybean Date: Sun, 14 Jan 2024 11:25:12 +0800 Subject: [PATCH 108/392] original parquet reader Change-Id: I83a8ec8271edefcd96cb5b3bcd12f6b545d9dec0 --- .../Impl/Parquet/ParquetColumnReader.h | 29 + .../Formats/Impl/Parquet/ParquetDataBuffer.h | 179 ++++++ .../Impl/Parquet/ParquetDataValuesReader.cpp | 553 ++++++++++++++++++ .../Impl/Parquet/ParquetDataValuesReader.h | 263 +++++++++ .../Impl/Parquet/ParquetLeafColReader.cpp | 506 ++++++++++++++++ .../Impl/Parquet/ParquetLeafColReader.h | 63 ++ .../Impl/Parquet/ParquetRecordReader.cpp | 225 +++++++ .../Impl/Parquet/ParquetRecordReader.h | 48 ++ 8 files changed, 1866 insertions(+) create mode 100644 src/Processors/Formats/Impl/Parquet/ParquetColumnReader.h create mode 100644 src/Processors/Formats/Impl/Parquet/ParquetDataBuffer.h create mode 100644 src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp create mode 100644 src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.h create mode 100644 src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.cpp create mode 100644 src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.h create mode 100644 src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp create mode 100644 src/Processors/Formats/Impl/Parquet/ParquetRecordReader.h diff --git a/src/Processors/Formats/Impl/Parquet/ParquetColumnReader.h b/src/Processors/Formats/Impl/Parquet/ParquetColumnReader.h new file mode 100644 index 00000000000..cfd9d3ba5bd --- /dev/null +++ b/src/Processors/Formats/Impl/Parquet/ParquetColumnReader.h @@ -0,0 +1,29 @@ +#pragma once + +#include + +namespace parquet +{ + +class PageReader; +class ColumnChunkMetaData; +class DataPageV1; +class DataPageV2; + +} + +namespace DB +{ + +class ParquetColumnReader +{ +public: + virtual ColumnWithTypeAndName readBatch(UInt32 rows_num, const String & name) = 0; + + virtual ~ParquetColumnReader() = default; +}; + +using ParquetColReaderPtr = std::unique_ptr; +using ParquetColReaders = std::vector; + +} diff --git a/src/Processors/Formats/Impl/Parquet/ParquetDataBuffer.h b/src/Processors/Formats/Impl/Parquet/ParquetDataBuffer.h new file mode 100644 index 00000000000..1f83c74f9ad --- /dev/null +++ b/src/Processors/Formats/Impl/Parquet/ParquetDataBuffer.h @@ -0,0 +1,179 @@ +#pragma once + +#include + +#include +#include +#include + 
+namespace DB +{ + +template struct ToArrowDecimal; + +template <> struct ToArrowDecimal>> +{ + using ArrowDecimal = arrow::Decimal128; +}; + +template <> struct ToArrowDecimal>> +{ + using ArrowDecimal = arrow::Decimal256; +}; + + +class ParquetDataBuffer +{ +private: + +public: + ParquetDataBuffer(const uint8_t * data_, UInt64 avaible_, UInt8 datetime64_scale_ = DataTypeDateTime64::default_scale) + : data(reinterpret_cast(data_)), avaible(avaible_), datetime64_scale(datetime64_scale_) {} + + template + void ALWAYS_INLINE readValue(TValue & dst) + { + checkAvaible(sizeof(TValue)); + dst = *reinterpret_cast(data); + consume(sizeof(TValue)); + } + + void ALWAYS_INLINE readBytes(void * dst, size_t bytes) + { + checkAvaible(bytes); + memcpy(dst, data, bytes); + consume(bytes); + } + + void ALWAYS_INLINE readDateTime64(DateTime64 & dst) + { + static const int max_scale_num = 9; + static const UInt64 pow10[max_scale_num + 1] + = {1000000000, 100000000, 10000000, 1000000, 100000, 10000, 1000, 100, 10, 1}; + static const UInt64 spd = 60 * 60 * 24; + static const UInt64 scaled_day[max_scale_num + 1] + = {spd, + 10 * spd, + 100 * spd, + 1000 * spd, + 10000 * spd, + 100000 * spd, + 1000000 * spd, + 10000000 * spd, + 100000000 * spd, + 1000000000 * spd}; + + checkAvaible(sizeof(parquet::Int96)); + auto decoded = parquet::DecodeInt96Timestamp(*reinterpret_cast(data)); + + uint64_t scaled_nano = decoded.nanoseconds / pow10[datetime64_scale]; + dst = static_cast(decoded.days_since_epoch * scaled_day[datetime64_scale] + scaled_nano); + + consume(sizeof(parquet::Int96)); + } + + /** + * This method should only be used to read string whose elements size is small. + * Because memcpySmallAllowReadWriteOverflow15 instead of memcpy is used according to ColumnString::indexImpl + */ + void ALWAYS_INLINE readString(ColumnString & column, size_t cursor) + { + // refer to: PlainByteArrayDecoder::DecodeArrowDense in encoding.cc + // deserializeBinarySSE2 in SerializationString.cpp + checkAvaible(4); + auto value_len = ::arrow::util::SafeLoadAs(getArrowData()); + if (unlikely(value_len < 0 || value_len > INT32_MAX - 4)) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Invalid or corrupted value_len '{}'", value_len); + } + consume(4); + checkAvaible(value_len); + + auto chars_cursor = column.getChars().size(); + column.getChars().resize(chars_cursor + value_len + 1); + + memcpySmallAllowReadWriteOverflow15(&column.getChars()[chars_cursor], data, value_len); + column.getChars().back() = 0; + + column.getOffsets().data()[cursor] = column.getChars().size(); + consume(value_len); + } + + template + void ALWAYS_INLINE readOverBigDecimal(TDecimal * out, Int32 elem_bytes_num) + { + using TArrowDecimal = typename ToArrowDecimal::ArrowDecimal; + + checkAvaible(elem_bytes_num); + + // refer to: RawBytesToDecimalBytes in reader_internal.cc, Decimal128::FromBigEndian in decimal.cc + auto status = TArrowDecimal::FromBigEndian(getArrowData(), elem_bytes_num); + if (unlikely(!status.ok())) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Read parquet decimal failed: {}", status.status().ToString()); + } + status.ValueUnsafe().ToBytes(reinterpret_cast(out)); + consume(elem_bytes_num); + } + +private: + const Int8 * data; + UInt64 avaible; + const UInt8 datetime64_scale; + + void ALWAYS_INLINE checkAvaible(UInt64 num) + { + if (unlikely(avaible < num)) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Consuming {} bytes while {} avaible", num, avaible); + } + } + + const uint8_t * ALWAYS_INLINE getArrowData() { return 
reinterpret_cast(data); } + + void ALWAYS_INLINE consume(UInt64 num) + { + data += num; + avaible -= num; + } +}; + + +class LazyNullMap +{ +public: + LazyNullMap(UInt32 size_) : size(size_), col_nullable(nullptr) {} + + void setNull(UInt32 cursor) + { + initialize(); + null_map[cursor] = 1; + } + + void setNull(UInt32 cursor, UInt32 count) + { + initialize(); + memset(null_map + cursor, 1, count); + } + + ColumnPtr getNullableCol() { return col_nullable; } + +private: + UInt32 size; + UInt8 * null_map; + ColumnPtr col_nullable; + + void initialize() + { + if (likely(col_nullable)) + { + return; + } + auto col = ColumnVector::create(size); + null_map = col->getData().data(); + col_nullable = std::move(col); + memset(null_map, 0, size); + } +}; + +} diff --git a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp new file mode 100644 index 00000000000..659a7a11969 --- /dev/null +++ b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp @@ -0,0 +1,553 @@ +#include "ParquetDataValuesReader.h" + +#include +#include + +#include + +namespace DB +{ + +void RleValuesReader::nextGroup() +{ + // refer to: + // RleDecoder::NextCounts in rle_encoding.h and VectorizedRleValuesReader::readNextGroup in Spark + UInt32 indicator_value = 0; + [[maybe_unused]] auto read_res = bit_reader->GetVlqInt(&indicator_value); + assert(read_res); + + cur_group_is_packed = indicator_value & 1; + cur_group_size = indicator_value >> 1; + + if (cur_group_is_packed) + { + cur_group_size *= 8; + cur_packed_bit_values.resize(cur_group_size); + bit_reader->GetBatch(bit_width, cur_packed_bit_values.data(), cur_group_size); + } + else + { + cur_value = 0; + read_res = bit_reader->GetAligned((bit_width + 7) / 8, &cur_value); + assert(read_res); + } + cur_group_cursor = 0; + +} + +template +void RleValuesReader::visitValues( + UInt32 num_values, IndividualVisitor && individual_visitor, RepeatedVisitor && repeated_visitor) +{ + // refer to: VisitNullBitmapInline in visitor_inline.h + while (num_values) + { + nextGroupIfNecessary(); + auto cur_count = std::min(num_values, curGroupLeft()); + + if (cur_group_is_packed) + { + for (auto i = cur_group_cursor; i < cur_group_cursor + cur_count; i++) + { + individual_visitor(cur_packed_bit_values[i]); + } + } + else + { + repeated_visitor(cur_count, cur_value); + } + cur_group_cursor += cur_count; + num_values -= cur_count; + } +} + +template +void RleValuesReader::visitNullableValues( + size_t cursor, + UInt32 num_values, + Int32 max_def_level, + LazyNullMap & null_map, + IndividualVisitor && individual_visitor, + RepeatedVisitor && repeated_visitor) +{ + while (num_values) + { + nextGroupIfNecessary(); + auto cur_count = std::min(num_values, curGroupLeft()); + + if (cur_group_is_packed) + { + for (auto i = cur_group_cursor; i < cur_group_cursor + cur_count; i++) + { + if (cur_packed_bit_values[i] == max_def_level) + { + individual_visitor(cursor); + } + else + { + null_map.setNull(cursor); + } + cursor++; + } + } + else + { + if (cur_value == max_def_level) + { + repeated_visitor(cursor, cur_count); + } + else + { + null_map.setNull(cursor, cur_count); + } + cursor += cur_count; + } + cur_group_cursor += cur_count; + num_values -= cur_count; + } +} + +template +void RleValuesReader::visitNullableBySteps( + size_t cursor, + UInt32 num_values, + Int32 max_def_level, + IndividualNullVisitor && individual_null_visitor, + SteppedValidVisitor && stepped_valid_visitor, + RepeatedVisitor && repeated_visitor) 
+{ + // refer to: + // RleDecoder::GetBatch in rle_encoding.h and TypedColumnReaderImpl::ReadBatchSpaced in column_reader.cc + // VectorizedRleValuesReader::readBatchInternal in Spark + while (num_values > 0) + { + nextGroupIfNecessary(); + auto cur_count = std::min(num_values, curGroupLeft()); + + if (cur_group_is_packed) + { + valid_index_steps.resize(cur_count + 1); + valid_index_steps[0] = 0; + auto step_idx = 0; + auto null_map_cursor = cursor; + + for (auto i = cur_group_cursor; i < cur_group_cursor + cur_count; i++) + { + if (cur_packed_bit_values[i] == max_def_level) + { + valid_index_steps[++step_idx] = 1; + } + else + { + individual_null_visitor(null_map_cursor); + if (unlikely(valid_index_steps[step_idx] == UINT8_MAX)) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "unsupported packed values number"); + } + valid_index_steps[step_idx]++; + } + null_map_cursor++; + } + valid_index_steps.resize(step_idx + 1); + stepped_valid_visitor(cursor, valid_index_steps); + } + else + { + repeated_visitor(cur_value == max_def_level, cursor, cur_count); + } + + cursor += cur_count; + cur_group_cursor += cur_count; + num_values -= cur_count; + } +} + +template +void RleValuesReader::setValues(TValue * res_values, UInt32 num_values, ValueGetter && val_getter) +{ + visitValues( + num_values, + /* individual_visitor */ [&](Int32 val) + { + *(res_values++) = val_getter(val); + }, + /* repeated_visitor */ [&](UInt32 count, Int32 val) + { + std::fill(res_values, res_values + count, val_getter(val)); + res_values += count; + } + ); +} + +template +void RleValuesReader::setValueBySteps( + TValue * res_values, + const std::vector & col_data_steps, + ValueGetter && val_getter) +{ + auto step_iterator = col_data_steps.begin(); + res_values += *(step_iterator++); + + visitValues( + col_data_steps.size() - 1, + /* individual_visitor */ [&](Int32 val) + { + *res_values = val_getter(val); + res_values += *(step_iterator++); + }, + /* repeated_visitor */ [&](UInt32 count, Int32 val) + { + auto getted_val = val_getter(val); + for (UInt32 i = 0; i < count; i++) + { + *res_values = getted_val; + res_values += *(step_iterator++); + } + } + ); +} + + +namespace +{ + +template +TValue * getResizedPrimitiveData(TColumn & column, size_t size) +{ + auto old_size = column.size(); + column.getData().resize(size); + memset(column.getData().data() + old_size, 0, sizeof(TValue) * (size - old_size)); + return column.getData().data(); +} + +} // anoynomous namespace + + +template <> +void ParquetPlainValuesReader::readBatch( + MutableColumnPtr & col_ptr, LazyNullMap & null_map, UInt32 num_values) +{ + auto & column = *assert_cast(col_ptr.get()); + auto cursor = column.size(); + + column.getOffsets().resize(cursor + num_values); + auto * offset_data = column.getOffsets().data(); + auto & chars = column.getChars(); + + def_level_reader->visitValues( + num_values, + /* individual_visitor */ [&](Int32 val) + { + if (val == max_def_level) + { + plain_data_buffer.readString(column, cursor); + } + else + { + chars.push_back(0); + offset_data[cursor] = chars.size(); + null_map.setNull(cursor); + } + cursor++; + }, + /* repeated_visitor */ [&](UInt32 count, Int32 val) + { + if (val == max_def_level) + { + for (UInt32 i = 0; i < count; i++) + { + plain_data_buffer.readString(column, cursor); + cursor++; + } + } + else + { + null_map.setNull(cursor, count); + + auto chars_size_bak = chars.size(); + chars.resize(chars_size_bak + count); + memset(&chars[chars_size_bak], 0, count); + + auto idx = cursor; + cursor += count; + // the type 
of offset_data is PaddedPODArray, which makes sure that the -1 index is avaible + for (auto val_offset = offset_data[idx - 1]; idx < cursor; idx++) + { + offset_data[idx] = ++val_offset; + } + } + } + ); +} + + +template <> +void ParquetPlainValuesReader>::readBatch( + MutableColumnPtr & col_ptr, LazyNullMap & null_map, UInt32 num_values) +{ + auto cursor = col_ptr->size(); + auto * column_data = getResizedPrimitiveData( + *assert_cast *>(col_ptr.get()), cursor + num_values); + + def_level_reader->visitNullableValues( + cursor, + num_values, + max_def_level, + null_map, + /* individual_visitor */ [&](size_t nest_cursor) + { + plain_data_buffer.readDateTime64(column_data[nest_cursor]); + }, + /* repeated_visitor */ [&](size_t nest_cursor, UInt32 count) + { + auto col_data_pos = column_data + nest_cursor; + for (UInt32 i = 0; i < count; i++) + { + plain_data_buffer.readDateTime64(col_data_pos[i]); + } + } + ); +} + +template +void ParquetPlainValuesReader::readBatch( + MutableColumnPtr & col_ptr, LazyNullMap & null_map, UInt32 num_values) +{ + auto cursor = col_ptr->size(); + auto * column_data = getResizedPrimitiveData(*assert_cast(col_ptr.get()), cursor + num_values); + using TValue = std::decay_t; + + def_level_reader->visitNullableValues( + cursor, + num_values, + max_def_level, + null_map, + /* individual_visitor */ [&](size_t nest_cursor) + { + plain_data_buffer.readValue(column_data[nest_cursor]); + }, + /* repeated_visitor */ [&](size_t nest_cursor, UInt32 count) + { + plain_data_buffer.readBytes(column_data + nest_cursor, count * sizeof(TValue)); + } + ); +} + + +template +void ParquetFixedLenPlainReader::readBatch( + MutableColumnPtr & col_ptr, LazyNullMap & null_map, UInt32 num_values) +{ + if constexpr (std::same_as> || std::same_as>) + { + readOverBigDecimal(col_ptr, null_map, num_values); + } + else + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "unsupported type"); + } +} + +template +void ParquetFixedLenPlainReader::readOverBigDecimal( + MutableColumnPtr & col_ptr, LazyNullMap & null_map, UInt32 num_values) +{ + auto cursor = col_ptr->size(); + auto * column_data = getResizedPrimitiveData( + *assert_cast(col_ptr.get()), cursor + num_values); + + def_level_reader->visitNullableValues( + cursor, + num_values, + max_def_level, + null_map, + /* individual_visitor */ [&](size_t nest_cursor) + { + plain_data_buffer.readOverBigDecimal(column_data + nest_cursor, elem_bytes_num); + }, + /* repeated_visitor */ [&](size_t nest_cursor, UInt32 count) + { + auto col_data_pos = column_data + nest_cursor; + for (UInt32 i = 0; i < count; i++) + { + plain_data_buffer.readOverBigDecimal(col_data_pos + i, elem_bytes_num); + } + } + ); +} + + +template +void ParquetRleLCReader::readBatch( + MutableColumnPtr & index_col, LazyNullMap & null_map, UInt32 num_values) +{ + auto cursor = index_col->size(); + auto * column_data = getResizedPrimitiveData(*assert_cast(index_col.get()), cursor + num_values); + + bool has_null = false; + + // in ColumnLowCardinality, first element in dictionary is null + // so we should increase each value by 1 in parquet index + auto val_getter = [&](Int32 val) { return val + 1; }; + + def_level_reader->visitNullableBySteps( + cursor, + num_values, + max_def_level, + /* individual_null_visitor */ [&](UInt32 nest_cursor) { + column_data[nest_cursor] = 0; + has_null = true; + }, + /* stepped_valid_visitor */ [&](UInt32 nest_cursor, const std::vector & valid_index_steps) { + rle_data_reader->setValueBySteps(column_data + nest_cursor, valid_index_steps, val_getter); + }, + /* 
repeated_visitor */ [&](bool is_valid, UInt32 nest_cursor, UInt32 count) { + if (is_valid) + { + rle_data_reader->setValues(column_data + nest_cursor, count, val_getter); + } + else + { + auto data_pos = column_data + nest_cursor; + std::fill(data_pos, data_pos + count, 0); + has_null = true; + } + } + ); + if (has_null) + { + null_map.setNull(0); + } +} + +template <> +void ParquetRleDictReader::readBatch( + MutableColumnPtr & col_ptr, LazyNullMap & null_map, UInt32 num_values) +{ + auto & column = *assert_cast(col_ptr.get()); + auto cursor = column.size(); + std::vector value_cache; + + const auto & dict_chars = static_cast(page_dictionary).getChars(); + const auto & dict_offsets = static_cast(page_dictionary).getOffsets(); + + column.getOffsets().resize(cursor + num_values); + auto * offset_data = column.getOffsets().data(); + auto & chars = column.getChars(); + + auto append_nulls = [&](UInt8 num) { + for (auto limit = cursor + num; cursor < limit; cursor++) + { + chars.push_back(0); + offset_data[cursor] = chars.size(); + null_map.setNull(cursor); + } + }; + + auto append_string = [&](Int32 dict_idx) { + auto dict_chars_cursor = dict_offsets[dict_idx - 1]; + auto value_len = dict_offsets[dict_idx] - dict_chars_cursor; + auto chars_cursor = chars.size(); + chars.resize(chars_cursor + value_len); + + memcpySmallAllowReadWriteOverflow15(&chars[chars_cursor], &dict_chars[dict_chars_cursor], value_len); + offset_data[cursor] = chars.size(); + cursor++; + }; + + auto val_getter = [&](Int32 val) { return val + 1; }; + + def_level_reader->visitNullableBySteps( + cursor, + num_values, + max_def_level, + /* individual_null_visitor */ [&](UInt32) {}, + /* stepped_valid_visitor */ [&](UInt32, const std::vector & valid_index_steps) { + value_cache.resize(valid_index_steps.size()); + rle_data_reader->setValues(value_cache.data() + 1, valid_index_steps.size() - 1, val_getter); + + append_nulls(valid_index_steps[0]); + for (size_t i = 1; i < valid_index_steps.size(); i++) + { + append_string(value_cache[i]); + append_nulls(valid_index_steps[i] - 1); + } + }, + /* repeated_visitor */ [&](bool is_valid, UInt32, UInt32 count) { + if (is_valid) + { + value_cache.resize(count); + rle_data_reader->setValues(value_cache.data(), count, val_getter); + for (UInt32 i = 0; i < count; i++) + { + append_string(value_cache[i]); + } + } + else + { + append_nulls(count); + } + } + ); +} + +template +void ParquetRleDictReader::readBatch( + MutableColumnPtr & col_ptr, LazyNullMap & null_map, UInt32 num_values) +{ + auto cursor = col_ptr->size(); + auto * column_data = getResizedPrimitiveData(*assert_cast(col_ptr.get()), cursor + num_values); + const auto & dictionary_array = static_cast(page_dictionary).getData(); + + auto val_getter = [&](Int32 val) { return dictionary_array[val]; }; + def_level_reader->visitNullableBySteps( + cursor, + num_values, + max_def_level, + /* individual_null_visitor */ [&](UInt32 nest_cursor) { + null_map.setNull(nest_cursor); + }, + /* stepped_valid_visitor */ [&](UInt32 nest_cursor, const std::vector & valid_index_steps) { + rle_data_reader->setValueBySteps(column_data + nest_cursor, valid_index_steps, val_getter); + }, + /* repeated_visitor */ [&](bool is_valid, UInt32 nest_cursor, UInt32 count) { + if (is_valid) + { + rle_data_reader->setValues(column_data + nest_cursor, count, val_getter); + } + else + { + null_map.setNull(nest_cursor, count); + } + } + ); +} + + +template class ParquetPlainValuesReader; +template class ParquetPlainValuesReader; +template class 
ParquetPlainValuesReader; +template class ParquetPlainValuesReader; +template class ParquetPlainValuesReader>; +template class ParquetPlainValuesReader>; +template class ParquetPlainValuesReader; + +template class ParquetFixedLenPlainReader>; +template class ParquetFixedLenPlainReader>; + +template class ParquetRleLCReader; +template class ParquetRleLCReader; +template class ParquetRleLCReader; + +template class ParquetRleDictReader; +template class ParquetRleDictReader; +template class ParquetRleDictReader; +template class ParquetRleDictReader; +template class ParquetRleDictReader>; +template class ParquetRleDictReader>; +template class ParquetRleDictReader>; +template class ParquetRleDictReader>; +template class ParquetRleDictReader>; +template class ParquetRleDictReader; + +} diff --git a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.h b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.h new file mode 100644 index 00000000000..2c95f495339 --- /dev/null +++ b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.h @@ -0,0 +1,263 @@ +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include "ParquetDataBuffer.h" + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; + extern const int PARQUET_EXCEPTION; +} + +class RleValuesReader +{ +public: + RleValuesReader(std::unique_ptr bit_reader_, Int32 bit_width_) + : bit_reader(std::move(bit_reader_)), bit_width(bit_width_) {} + + /** + * @brief Used when the bit_width is 0, so all elements have same value. + */ + RleValuesReader(UInt32 total_size, Int32 val = 0) + : bit_reader(nullptr), bit_width(0), cur_group_size(total_size), cur_value(val), cur_group_is_packed(false) + {} + + void nextGroup(); + + void nextGroupIfNecessary() { if (cur_group_cursor >= cur_group_size) nextGroup(); } + + UInt32 curGroupLeft() const { return cur_group_size - cur_group_cursor; } + + /** + * @brief Visit num_values elements. + * For RLE encoding, for same group, the value is same, so they can be visited repeatedly. + * For BitPacked encoding, the values may be different with each other, so they must be visited individual. + * + * @tparam IndividualVisitor A callback with signature: void(Int32 val) + * @tparam RepeatedVisitor A callback with signature: void(UInt32 count, Int32 val) + */ + template + void visitValues(UInt32 num_values, IndividualVisitor && individual_visitor, RepeatedVisitor && repeated_visitor); + + /** + * @brief Visit num_values elements by parsed nullability. + * If the parsed value is same as max_def_level, then it is processed as null value. + * + * @tparam IndividualVisitor A callback with signature: void(size_t cursor) + * @tparam RepeatedVisitor A callback with signature: void(size_t cursor, UInt32 count) + * + * Because the null map is processed, so only the callbacks only need to process the valid data. + */ + template + void visitNullableValues( + size_t cursor, + UInt32 num_values, + Int32 max_def_level, + LazyNullMap & null_map, + IndividualVisitor && individual_visitor, + RepeatedVisitor && repeated_visitor); + + /** + * @brief Visit num_values elements by parsed nullability. + * It may be inefficient to process the valid data individually like in visitNullableValues, + * so a valid_index_steps index array is generated first, in order to process valid data continuously. 
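For concreteness, here is a minimal sketch, not part of the patch itself, of how a caller scatters decoded values into a column according to valid_index_steps (the helper name is hypothetical and the types are simplified; in the patch this logic lives in RleValuesReader::setValueBySteps):

template <typename T>
void scatterBySteps(T * column_data, const std::vector<UInt8> & steps, const T * decoded_values)
{
    // e.g. steps = [1, 3, 2] with decoded_values = {a, b} fills: null, a, null, null, b, null
    size_t pos = steps[0];                         // number of elements before the first valid value
    for (size_t i = 1; i < steps.size(); ++i)
    {
        column_data[pos] = decoded_values[i - 1];  // write the next valid value
        pos += steps[i];                           // skip it and the nulls that follow it
    }
}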
+ * + * @tparam IndividualNullVisitor A callback with signature: void(size_t cursor), used to process null value + * @tparam SteppedValidVisitor A callback with signature: + * void(size_t cursor, const std::vector & valid_index_steps) + * for n valid elements with null value interleaved in a BitPacked group, + * i-th item in valid_index_steps describes how many elements in column there are after (i-1)-th valid element. + * + * take following BitPacked group with 2 valid elements for example: + * null valid null null valid null + * then the valid_index_steps has values [1, 3, 2]. + * Please note that the the sum of valid_index_steps is same as elements number in this group. + * + * @tparam RepeatedVisitor A callback with signature: void(bool is_valid, UInt32 cursor, UInt32 count) + */ + template + void visitNullableBySteps( + size_t cursor, + UInt32 num_values, + Int32 max_def_level, + IndividualNullVisitor && null_visitor, + SteppedValidVisitor && stepped_valid_visitor, + RepeatedVisitor && repeated_visitor); + + /** + * @brief Set the Values to column_data directly + * + * @tparam TValue The type of column data. + * @tparam ValueGetter A callback with signature: TValue(Int32 val) + */ + template + void setValues(TValue * column_data, UInt32 num_values, ValueGetter && val_getter); + + /** + * @brief Set the value by valid_index_steps generated in visitNullableBySteps. + * According to visitNullableBySteps, the elements number is valid_index_steps.size()-1, + * so valid_index_steps.size()-1 elements are read, and set to column_data with steps in valid_index_steps + */ + template + void setValueBySteps( + TValue * column_data, + const std::vector & col_data_steps, + ValueGetter && val_getter); + +private: + std::unique_ptr bit_reader; + + std::vector cur_packed_bit_values; + std::vector valid_index_steps; + + Int32 bit_width; + + UInt32 cur_group_size = 0; + UInt32 cur_group_cursor = 0; + Int32 cur_value; + bool cur_group_is_packed; +}; + +using RleValuesReaderPtr = std::unique_ptr; + + +class ParquetDataValuesReader +{ +public: + virtual void readBatch(MutableColumnPtr & column, LazyNullMap & null_map, UInt32 num_values) = 0; + + virtual ~ParquetDataValuesReader() = default; +}; + +using ParquetDataValuesReaderPtr = std::unique_ptr; + + +/** + * The definition level is RLE or BitPacked encoding, while data is read directly + */ +template +class ParquetPlainValuesReader : public ParquetDataValuesReader +{ +public: + + ParquetPlainValuesReader( + Int32 max_def_level_, + std::unique_ptr def_level_reader_, + ParquetDataBuffer data_buffer_) + : max_def_level(max_def_level_) + , def_level_reader(std::move(def_level_reader_)) + , plain_data_buffer(std::move(data_buffer_)) + {} + + void readBatch(MutableColumnPtr & col_ptr, LazyNullMap & null_map, UInt32 num_values) override; + +private: + Int32 max_def_level; + std::unique_ptr def_level_reader; + ParquetDataBuffer plain_data_buffer; +}; + +/** + * The data and definition level encoding are same as ParquetPlainValuesReader. + * But the element size is const and bigger than primitive data type. 
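For reference, Parquet stores FIXED_LEN_BYTE_ARRAY decimals as big-endian two's-complement bytes, so reading them amounts to reversing the byte order and sign-extending into the wider native value. A standalone sketch of that conversion, with a hypothetical helper name (in the patch the actual conversion is done by ParquetDataBuffer::readOverBigDecimal, whose body is not shown in this hunk):

#include <cstring>

void decodeBigEndianDecimal(UInt8 * dst, size_t dst_len, const UInt8 * src, size_t src_len)
{
    const UInt8 sign_fill = (src[0] & 0x80) ? 0xFF : 0x00;
    std::memset(dst, sign_fill, dst_len);          // sign-extend the unused high bytes
    for (size_t i = 0; i < src_len; ++i)
        dst[i] = src[src_len - 1 - i];             // big-endian source -> little-endian destination
}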
+ */ +template +class ParquetFixedLenPlainReader : public ParquetDataValuesReader +{ +public: + + ParquetFixedLenPlainReader( + Int32 max_def_level_, + Int32 elem_bytes_num_, + std::unique_ptr def_level_reader_, + ParquetDataBuffer data_buffer_) + : max_def_level(max_def_level_) + , elem_bytes_num(elem_bytes_num_) + , def_level_reader(std::move(def_level_reader_)) + , plain_data_buffer(std::move(data_buffer_)) + {} + + void readOverBigDecimal(MutableColumnPtr & col_ptr, LazyNullMap & null_map, UInt32 num_values); + + void readBatch(MutableColumnPtr & col_ptr, LazyNullMap & null_map, UInt32 num_values) override; + +private: + Int32 max_def_level; + Int32 elem_bytes_num; + std::unique_ptr def_level_reader; + ParquetDataBuffer plain_data_buffer; +}; + +/** + * Read data according to the format of ColumnLowCardinality format. + * + * Only index and null column are processed in this class. + * And all null value is mapped to first index in dictionary, + * so the result index valued is added by one. +*/ +template +class ParquetRleLCReader : public ParquetDataValuesReader +{ +public: + ParquetRleLCReader( + Int32 max_def_level_, + std::unique_ptr def_level_reader_, + std::unique_ptr rle_data_reader_) + : max_def_level(max_def_level_) + , def_level_reader(std::move(def_level_reader_)) + , rle_data_reader(std::move(rle_data_reader_)) + {} + + void readBatch(MutableColumnPtr & index_col, LazyNullMap & null_map, UInt32 num_values) override; + +private: + Int32 max_def_level; + std::unique_ptr def_level_reader; + std::unique_ptr rle_data_reader; +}; + +/** + * The definition level is RLE or BitPacked encoded, + * and the index of dictionary is also RLE or BitPacked encoded. + * + * while the result is not parsed as a low cardinality column, + * instead, a normal column is generated. 
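In the dictionary-encoded case the page dictionary therefore acts as a plain lookup table. A minimal sketch of the materialization step for numeric columns, using illustrative names and containers (in the patch the equivalent mapping is the val_getter lambda inside ParquetRleDictReader::readBatch):

#include <vector>

template <typename T>
void materializeFromDictionary(T * dst, const std::vector<Int32> & decoded_indices, const std::vector<T> & page_dictionary)
{
    for (size_t i = 0; i < decoded_indices.size(); ++i)
        dst[i] = page_dictionary[decoded_indices[i]];
}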
+ */ +template +class ParquetRleDictReader : public ParquetDataValuesReader +{ +public: + ParquetRleDictReader( + Int32 max_def_level_, + std::unique_ptr def_level_reader_, + std::unique_ptr rle_data_reader_, + const IColumn & page_dictionary_) + : max_def_level(max_def_level_) + , def_level_reader(std::move(def_level_reader_)) + , rle_data_reader(std::move(rle_data_reader_)) + , page_dictionary(page_dictionary_) + {} + + void readBatch(MutableColumnPtr & col_ptr, LazyNullMap & null_map, UInt32 num_values) override; + +private: + Int32 max_def_level; + std::unique_ptr def_level_reader; + std::unique_ptr rle_data_reader; + const IColumn & page_dictionary; +}; + +} diff --git a/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.cpp b/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.cpp new file mode 100644 index 00000000000..00dee9074fe --- /dev/null +++ b/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.cpp @@ -0,0 +1,506 @@ +#include "ParquetLeafColReader.h" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; + extern const int BAD_ARGUMENTS; + extern const int PARQUET_EXCEPTION; +} + +namespace +{ + +template +void visitColStrIndexType(size_t data_size, TypeVisitor && visitor) +{ + // refer to: DataTypeLowCardinality::createColumnUniqueImpl + if (data_size < (1ull << 8)) + { + visitor(static_cast(nullptr)); + } + else if (data_size < (1ull << 16)) + { + visitor(static_cast(nullptr)); + } + else if (data_size < (1ull << 32)) + { + visitor(static_cast(nullptr)); + } + else + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "unsupported data size {}", data_size); + } +} + +void reserveColumnStrRows(MutableColumnPtr & col, UInt32 rows_num) +{ + col->reserve(rows_num); + + /// Never reserve for too big size according to SerializationString::deserializeBinaryBulk + if (rows_num < 256 * 1024 * 1024) + { + try + { + static_cast(col.get())->getChars().reserve(rows_num); + } + catch (Exception & e) + { + e.addMessage("(limit = " + toString(rows_num) + ")"); + throw; + } + } +}; + + +template +ColumnPtr readDictPage( + const parquet::DictionaryPage & page, + const parquet::ColumnDescriptor & col_des, + const DataTypePtr & /* data_type */); + +template <> +ColumnPtr readDictPage( + const parquet::DictionaryPage & page, + const parquet::ColumnDescriptor & /* col_des */, + const DataTypePtr & /* data_type */) +{ + auto col = ColumnString::create(); + col->getOffsets().resize(page.num_values() + 1); + col->getChars().reserve(page.num_values()); + ParquetDataBuffer buffer(page.data(), page.size()); + + // will be read as low cardinality column + // in which case, the null key is set to first position, so the first string should be empty + col->getChars().push_back(0); + col->getOffsets()[0] = 1; + for (auto i = 1; i <= page.num_values(); i++) + { + buffer.readString(*col, i); + } + return col; +} + +template <> +ColumnPtr readDictPage>( + const parquet::DictionaryPage & page, + const parquet::ColumnDescriptor & /* col_des */, + const DataTypePtr & data_type) +{ + auto & datetime_type = assert_cast(*data_type); + auto dict_col = ColumnDecimal::create(page.num_values(), datetime_type.getScale()); + auto * col_data = dict_col->getData().data(); + ParquetDataBuffer buffer(page.data(), page.size(), datetime_type.getScale()); + for (auto i = 0; i < 
page.num_values(); i++) + { + buffer.readDateTime64(col_data[i]); + } + return dict_col; +} + +template +ColumnPtr readDictPage( + const parquet::DictionaryPage & page, + const parquet::ColumnDescriptor & col_des, + const DataTypePtr & /* data_type */) +{ + auto dict_col = TColumnDecimal::create(page.num_values(), col_des.type_scale()); + auto * col_data = dict_col->getData().data(); + ParquetDataBuffer buffer(page.data(), page.size()); + for (auto i = 0; i < page.num_values(); i++) + { + buffer.readOverBigDecimal(col_data + i, col_des.type_length()); + } + return dict_col; +} + +template requires (!std::is_same_v) +ColumnPtr readDictPage( + const parquet::DictionaryPage & page, + const parquet::ColumnDescriptor & col_des, + const DataTypePtr & /* data_type */) +{ + auto dict_col = TColumnDecimal::create(page.num_values(), col_des.type_scale()); + ParquetDataBuffer buffer(page.data(), page.size()); + buffer.readBytes(dict_col->getData().data(), page.num_values() * sizeof(typename TColumnDecimal::ValueType)); + return dict_col; +} + +template +ColumnPtr readDictPage( + const parquet::DictionaryPage & page, + const parquet::ColumnDescriptor & /* col_des */, + const DataTypePtr & /* data_type */) +{ + auto dict_col = TColumnVector::create(page.num_values()); + ParquetDataBuffer buffer(page.data(), page.size()); + buffer.readBytes(dict_col->getData().data(), page.num_values() * sizeof(typename TColumnVector::ValueType)); + return dict_col; +} + + +template +std::unique_ptr createPlainReader( + const parquet::ColumnDescriptor & col_des, + RleValuesReaderPtr def_level_reader, + ParquetDataBuffer buffer); + +template +std::unique_ptr createPlainReader( + const parquet::ColumnDescriptor & col_des, + RleValuesReaderPtr def_level_reader, + ParquetDataBuffer buffer) +{ + return std::make_unique>( + col_des.max_definition_level(), + col_des.type_length(), + std::move(def_level_reader), + std::move(buffer)); +} + +template +std::unique_ptr createPlainReader( + const parquet::ColumnDescriptor & col_des, + RleValuesReaderPtr def_level_reader, + ParquetDataBuffer buffer) +{ + return std::make_unique>( + col_des.max_definition_level(), std::move(def_level_reader), std::move(buffer)); +} + + +} // anonymous namespace + + +template +ParquetLeafColReader::ParquetLeafColReader( + const parquet::ColumnDescriptor & col_descriptor_, + DataTypePtr base_type_, + std::unique_ptr meta_, + std::unique_ptr reader_) + : col_descriptor(col_descriptor_) + , base_data_type(base_type_) + , col_chunk_meta(std::move(meta_)) + , parquet_page_reader(std::move(reader_)) + , log(&Poco::Logger::get("ParquetLeafColReader")) +{ +} + +template +ColumnWithTypeAndName ParquetLeafColReader::readBatch(UInt32 rows_num, const String & name) +{ + reading_rows_num = rows_num; + auto readPageIfEmpty = [&]() { + while (!cur_page_values) readPage(); + }; + + // make sure the dict page has been read, and the status is updated + readPageIfEmpty(); + resetColumn(rows_num); + + while (rows_num) + { + // if dictionary page encountered, another page should be read + readPageIfEmpty(); + + auto read_values = std::min(rows_num, cur_page_values); + data_values_reader->readBatch(column, *null_map, read_values); + + cur_page_values -= read_values; + rows_num -= read_values; + } + + return releaseColumn(name); +} + +template <> +void ParquetLeafColReader::resetColumn(UInt32 rows_num) +{ + if (reading_low_cardinality) + { + assert(dictionary); + visitColStrIndexType(dictionary->size(), [&](TColVec *) { + column = TColVec::create(); + }); + + // only first 
position is used + null_map = std::make_unique(1); + column->reserve(rows_num); + } + else + { + null_map = std::make_unique(rows_num); + column = ColumnString::create(); + reserveColumnStrRows(column, rows_num); + } +} + +template +void ParquetLeafColReader::resetColumn(UInt32 rows_num) +{ + assert(!reading_low_cardinality); + + column = base_data_type->createColumn(); + column->reserve(rows_num); + null_map = std::make_unique(rows_num); +} + +template +void ParquetLeafColReader::degradeDictionary() +{ + assert(dictionary && column->size()); + null_map = std::make_unique(reading_rows_num); + auto col_existing = std::move(column); + column = ColumnString::create(); + + ColumnString & col_dest = *static_cast(column.get()); + const ColumnString & col_dict_str = *static_cast(dictionary.get()); + + visitColStrIndexType(dictionary->size(), [&](TColVec *) { + const TColVec & col_src = *static_cast(col_existing.get()); + reserveColumnStrRows(column, reading_rows_num); + + col_dest.getOffsets().resize(col_src.size()); + for (size_t i = 0; i < col_src.size(); i++) + { + auto src_idx = col_src.getData()[i]; + if (0 == src_idx) + { + null_map->setNull(i); + } + auto dict_chars_cursor = col_dict_str.getOffsets()[src_idx - 1]; + auto str_len = col_dict_str.getOffsets()[src_idx] - dict_chars_cursor; + auto dst_chars_cursor = col_dest.getChars().size(); + col_dest.getChars().resize(dst_chars_cursor + str_len); + + memcpySmallAllowReadWriteOverflow15( + &col_dest.getChars()[dst_chars_cursor], &col_dict_str.getChars()[dict_chars_cursor], str_len); + col_dest.getOffsets()[i] = col_dest.getChars().size(); + } + }); + LOG_INFO(log, "degraded dictionary to normal column"); +} + +template +ColumnWithTypeAndName ParquetLeafColReader::releaseColumn(const String & name) +{ + DataTypePtr data_type = base_data_type; + if (reading_low_cardinality) + { + MutableColumnPtr col_unique; + if (null_map->getNullableCol()) + { + data_type = std::make_shared(data_type); + col_unique = ColumnUnique::create(dictionary->assumeMutable(), true); + } + else + { + col_unique = ColumnUnique::create(dictionary->assumeMutable(), false); + } + column = ColumnLowCardinality::create(std::move(col_unique), std::move(column), true); + data_type = std::make_shared(data_type); + } + else + { + if (null_map->getNullableCol()) + { + column = ColumnNullable::create(std::move(column), null_map->getNullableCol()->assumeMutable()); + data_type = std::make_shared(data_type); + } + } + ColumnWithTypeAndName res = {std::move(column), data_type, name}; + column = nullptr; + null_map = nullptr; + + return res; +} + +template +void ParquetLeafColReader::readPage() +{ + // refer to: ColumnReaderImplBase::ReadNewPage in column_reader.cc + auto cur_page = parquet_page_reader->NextPage(); + switch (cur_page->type()) + { + case parquet::PageType::DATA_PAGE: + readPageV1(*std::static_pointer_cast(cur_page)); + break; + case parquet::PageType::DATA_PAGE_V2: + readPageV2(*std::static_pointer_cast(cur_page)); + break; + case parquet::PageType::DICTIONARY_PAGE: + { + const parquet::DictionaryPage & dict_page = *std::static_pointer_cast(cur_page); + if (unlikely( + dict_page.encoding() != parquet::Encoding::PLAIN_DICTIONARY + && dict_page.encoding() != parquet::Encoding::PLAIN)) + { + throw new Exception( + ErrorCodes::NOT_IMPLEMENTED, "Unsupported dictionary page encoding {}", dict_page.encoding()); + } + LOG_INFO(log, "{} values in dictionary page of column {}", dict_page.num_values(), col_descriptor.name()); + + dictionary = readDictPage(dict_page, 
col_descriptor, base_data_type); + if (std::is_same_v) + { + reading_low_cardinality = true; + } + break; + } + default: + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unsupported page type: {}", cur_page->type()); + } +} + +template +void ParquetLeafColReader::readPageV1(const parquet::DataPageV1 & page) +{ + static parquet::LevelDecoder repetition_level_decoder; + + cur_page_values = page.num_values(); + + // refer to: VectorizedColumnReader::readPageV1 in Spark and LevelDecoder::SetData in column_reader.cc + if (page.definition_level_encoding() != parquet::Encoding::RLE && col_descriptor.max_definition_level() != 0) + { + throw Exception(ErrorCodes::PARQUET_EXCEPTION, "Unsupported encoding: {}", page.definition_level_encoding()); + } + const auto * buffer = page.data(); + auto max_size = page.size(); + + if (col_descriptor.max_repetition_level() > 0) + { + auto rep_levels_bytes = repetition_level_decoder.SetData( + page.repetition_level_encoding(), col_descriptor.max_repetition_level(), 0, buffer, max_size); + buffer += rep_levels_bytes; + max_size -= rep_levels_bytes; + } + + assert(col_descriptor.max_definition_level() >= 0); + std::unique_ptr def_level_reader; + if (col_descriptor.max_definition_level() > 0) { + auto bit_width = arrow::BitUtil::Log2(col_descriptor.max_definition_level() + 1); + auto num_bytes = ::arrow::util::SafeLoadAs(buffer); + auto bit_reader = std::make_unique(buffer + 4, num_bytes); + num_bytes += 4; + buffer += num_bytes; + max_size -= num_bytes; + def_level_reader = std::make_unique(std::move(bit_reader), bit_width); + } + else + { + def_level_reader = std::make_unique(page.num_values()); + } + + switch (page.encoding()) + { + case parquet::Encoding::PLAIN: + { + if (reading_low_cardinality) + { + reading_low_cardinality = false; + degradeDictionary(); + } + + ParquetDataBuffer parquet_buffer = [&]() { + if constexpr (!std::is_same_v, TColumn>) + return ParquetDataBuffer(buffer, max_size); + + auto scale = assert_cast(*base_data_type).getScale(); + return ParquetDataBuffer(buffer, max_size, scale); + }(); + data_values_reader = createPlainReader( + col_descriptor, std::move(def_level_reader), std::move(parquet_buffer)); + break; + } + case parquet::Encoding::RLE_DICTIONARY: + case parquet::Encoding::PLAIN_DICTIONARY: + { + if (unlikely(!dictionary)) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "dictionary should be existed"); + } + + // refer to: DictDecoderImpl::SetData in encoding.cc + auto bit_width = *buffer; + auto bit_reader = std::make_unique(++buffer, --max_size); + data_values_reader = createDictReader( + std::move(def_level_reader), std::make_unique(std::move(bit_reader), bit_width)); + break; + } + case parquet::Encoding::BYTE_STREAM_SPLIT: + case parquet::Encoding::DELTA_BINARY_PACKED: + case parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY: + case parquet::Encoding::DELTA_BYTE_ARRAY: + throw Exception(ErrorCodes::PARQUET_EXCEPTION, "Unsupported encoding: {}", page.encoding()); + + default: + throw Exception(ErrorCodes::PARQUET_EXCEPTION, "Unknown encoding type: {}", page.encoding()); + } +} + +template +void ParquetLeafColReader::readPageV2(const parquet::DataPageV2 & /*page*/) +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "read page V2 is not implemented yet"); +} + +template +std::unique_ptr ParquetLeafColReader::createDictReader( + std::unique_ptr def_level_reader, std::unique_ptr rle_data_reader) +{ + if (reading_low_cardinality && std::same_as) + { + std::unique_ptr res; + visitColStrIndexType(dictionary->size(), [&](TCol *) { + res = 
std::make_unique>( + col_descriptor.max_definition_level(), + std::move(def_level_reader), + std::move(rle_data_reader)); + }); + return res; + } + return std::make_unique>( + col_descriptor.max_definition_level(), + std::move(def_level_reader), + std::move(rle_data_reader), + *assert_cast(dictionary.get())); +} + + +template class ParquetLeafColReader; +template class ParquetLeafColReader; +template class ParquetLeafColReader; +template class ParquetLeafColReader; +template class ParquetLeafColReader; +template class ParquetLeafColReader>; +template class ParquetLeafColReader>; +template class ParquetLeafColReader>; +template class ParquetLeafColReader>; +template class ParquetLeafColReader>; + +} diff --git a/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.h b/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.h new file mode 100644 index 00000000000..f730afe40ed --- /dev/null +++ b/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.h @@ -0,0 +1,63 @@ +#pragma once + +#include +#include +#include + +#include "ParquetColumnReader.h" +#include "ParquetDataValuesReader.h" + +namespace parquet +{ + +class ColumnDescriptor; + +} + + +namespace DB +{ + +template +class ParquetLeafColReader : public ParquetColumnReader +{ +public: + ParquetLeafColReader( + const parquet::ColumnDescriptor & col_descriptor_, + DataTypePtr base_type_, + std::unique_ptr meta_, + std::unique_ptr reader_); + + ColumnWithTypeAndName readBatch(UInt32 rows_num, const String & name) override; + +private: + const parquet::ColumnDescriptor & col_descriptor; + DataTypePtr base_data_type; + std::unique_ptr col_chunk_meta; + std::unique_ptr parquet_page_reader; + std::unique_ptr data_values_reader; + + MutableColumnPtr column; + std::unique_ptr null_map; + + ColumnPtr dictionary; + + UInt32 cur_page_values = 0; + UInt32 reading_rows_num = 0; + bool reading_low_cardinality = false; + + Poco::Logger * log; + + void resetColumn(UInt32 rows_num); + void degradeDictionary(); + ColumnWithTypeAndName releaseColumn(const String & name); + + void readPage(); + void readPageV1(const parquet::DataPageV1 & page); + void readPageV2(const parquet::DataPageV2 & page); + + std::unique_ptr createDictReader( + std::unique_ptr def_level_reader, std::unique_ptr rle_data_reader); +}; + +} diff --git a/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp b/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp new file mode 100644 index 00000000000..a5744b85174 --- /dev/null +++ b/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp @@ -0,0 +1,225 @@ +#include "ParquetRecordReader.h" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "ParquetLeafColReader.h" + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; + extern const int PARQUET_EXCEPTION; +} + +// #define THROW_ARROW_NOT_OK(status) \ +// do \ +// { \ +// if (::arrow::Status _s = (status); !_s.ok()) \ +// throw Exception(_s.ToString(), ErrorCodes::BAD_ARGUMENTS); \ +// } while (false) + + +#define THROW_PARQUET_EXCEPTION(s) \ + do \ + { \ + try { (s); } \ + catch (const ::parquet::ParquetException & e) \ + { \ + throw Exception(e.what(), ErrorCodes::PARQUET_EXCEPTION); \ + } \ + } while (false) + +namespace +{ + +Int64 getTotalRows(const parquet::FileMetaData & meta_data) +{ + Int64 res = 0; + for (int i = 0; i < meta_data.num_row_groups(); i++) + { + res += meta_data.RowGroup(i)->num_rows(); + } + 
return res; +} + +std::unique_ptr createReader( + const parquet::ColumnDescriptor & col_descriptor, + DataTypePtr ch_type, + std::unique_ptr meta, + std::unique_ptr reader) +{ + if (col_descriptor.logical_type()->is_date() && parquet::Type::INT32 == col_descriptor.physical_type()) + { + return std::make_unique>( + col_descriptor, std::make_shared(), std::move(meta), std::move(reader)); + } + else if (col_descriptor.logical_type()->is_decimal()) + { + switch (col_descriptor.physical_type()) + { + case parquet::Type::INT32: + { + auto data_type = std::make_shared( + col_descriptor.type_precision(), col_descriptor.type_scale()); + return std::make_unique>>( + col_descriptor, data_type, std::move(meta), std::move(reader)); + } + case parquet::Type::INT64: + { + auto data_type = std::make_shared( + col_descriptor.type_precision(), col_descriptor.type_scale()); + return std::make_unique>>( + col_descriptor, data_type, std::move(meta), std::move(reader)); + } + case parquet::Type::FIXED_LEN_BYTE_ARRAY: + { + if (col_descriptor.type_length() <= static_cast(DecimalUtils::max_precision)) + { + auto data_type = std::make_shared( + col_descriptor.type_precision(), col_descriptor.type_scale()); + return std::make_unique>>( + col_descriptor, data_type, std::move(meta), std::move(reader)); + } + else + { + auto data_type = std::make_shared( + col_descriptor.type_precision(), col_descriptor.type_scale()); + return std::make_unique>>( + col_descriptor, data_type, std::move(meta), std::move(reader)); + } + } + default: + throw Exception( + ErrorCodes::PARQUET_EXCEPTION, + "Type not supported for decimal: {}", + col_descriptor.physical_type()); + } + } + else + { + switch (col_descriptor.physical_type()) + { + case parquet::Type::INT32: + return std::make_unique>( + col_descriptor, std::make_shared(), std::move(meta), std::move(reader)); + case parquet::Type::INT64: + return std::make_unique>( + col_descriptor, std::make_shared(), std::move(meta), std::move(reader)); + case parquet::Type::FLOAT: + return std::make_unique>( + col_descriptor, std::make_shared(), std::move(meta), std::move(reader)); + case parquet::Type::INT96: + { + DataTypePtr read_type = ch_type; + if (!isDateTime64(ch_type)) + { + read_type = std::make_shared(ParquetRecordReader::default_datetime64_scale); + } + return std::make_unique>>( + col_descriptor, read_type, std::move(meta), std::move(reader)); + } + case parquet::Type::DOUBLE: + return std::make_unique>( + col_descriptor, std::make_shared(), std::move(meta), std::move(reader)); + case parquet::Type::BYTE_ARRAY: + return std::make_unique>( + col_descriptor, std::make_shared(), std::move(meta), std::move(reader)); + default: + throw Exception( + ErrorCodes::PARQUET_EXCEPTION, "Type not supported: {}", col_descriptor.physical_type()); + } + } +} + +} // anonymouse namespace + +ParquetRecordReader::ParquetRecordReader( + Block header_, + std::shared_ptr<::arrow::io::RandomAccessFile> file, + const parquet::ReaderProperties& properties) + : header(std::move(header_)) +{ + // Only little endian system is supported currently + static_assert(std::endian::native == std::endian::little); + + log = &Poco::Logger::get("ParquetRecordReader"); + THROW_PARQUET_EXCEPTION(file_reader = parquet::ParquetFileReader::Open(std::move(file), properties)); + left_rows = getTotalRows(*file_reader->metadata()); + + parquet_col_indice.reserve(header.columns()); + column_readers.reserve(header.columns()); + for (const auto & col_with_name : header) + { + auto idx = 
file_reader->metadata()->schema()->ColumnIndex(col_with_name.name); + if (idx < 0) + { + throw Exception("can not find column with name: " + col_with_name.name, ErrorCodes::BAD_ARGUMENTS); + } + parquet_col_indice.push_back(idx); + } +} + +Chunk ParquetRecordReader::readChunk(UInt32 num_rows) +{ + if (!left_rows) + { + return Chunk{}; + } + if (!cur_row_group_left_rows) + { + loadNextRowGroup(); + } + + Columns columns(header.columns()); + auto num_rows_read = std::min(static_cast(num_rows), cur_row_group_left_rows); + for (size_t i = 0; i < header.columns(); i++) + { + columns[i] = castColumn( + column_readers[i]->readBatch(num_rows_read, header.getByPosition(i).name), + header.getByPosition(i).type); + } + left_rows -= num_rows_read; + cur_row_group_left_rows -= num_rows_read; + + return Chunk{std::move(columns), num_rows_read}; +} + +void ParquetRecordReader::loadNextRowGroup() +{ + Stopwatch watch(CLOCK_MONOTONIC); + cur_row_group_reader = file_reader->RowGroup(next_row_group_idx); + + column_readers.clear(); + for (size_t i = 0; i < parquet_col_indice.size(); i++) + { + column_readers.emplace_back(createReader( + *file_reader->metadata()->schema()->Column(parquet_col_indice[i]), + header.getByPosition(i).type, + cur_row_group_reader->metadata()->ColumnChunk(parquet_col_indice[i]), + cur_row_group_reader->GetColumnPageReader(parquet_col_indice[i]))); + } + LOG_DEBUG(log, "reading row group {} consumed {} ms", next_row_group_idx, watch.elapsedNanoseconds() / 1e6); + ++next_row_group_idx; + cur_row_group_left_rows = cur_row_group_reader->metadata()->num_rows(); +} + +} diff --git a/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.h b/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.h new file mode 100644 index 00000000000..d77cab6553b --- /dev/null +++ b/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.h @@ -0,0 +1,48 @@ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include + +#include "ParquetColumnReader.h" + +namespace DB +{ + +class ParquetRecordReader +{ +public: + ParquetRecordReader( + Block header_, + std::shared_ptr<::arrow::io::RandomAccessFile> file, + const parquet::ReaderProperties& properties); + + Chunk readChunk(UInt32 num_rows); + + // follow the scale generated by spark + static constexpr UInt8 default_datetime64_scale = 9; + +private: + std::unique_ptr file_reader; + + Block header; + + std::shared_ptr cur_row_group_reader; + ParquetColReaders column_readers; + + std::vector parquet_col_indice; + UInt64 left_rows; + UInt64 cur_row_group_left_rows = 0; + int next_row_group_idx = 0; + + Poco::Logger * log; + + void loadNextRowGroup(); +}; + +} From 8fb89cec9f28d6a12c2216ccd849fe0ead3ccd33 Mon Sep 17 00:00:00 2001 From: copperybean Date: Sun, 14 Jan 2024 12:01:23 +0800 Subject: [PATCH 109/392] fix build Change-Id: I57f025b17a04e2c5dded3f18e7f477841287a2c2 --- base/base/Decimal_fwd.h | 4 ++++ src/Columns/ColumnDecimal.h | 8 +++++++ src/Columns/ColumnVector.h | 3 +++ src/Common/ErrorCodes.cpp | 1 + .../Impl/Parquet/ParquetColumnReader.h | 3 ++- .../Formats/Impl/Parquet/ParquetDataBuffer.h | 12 ++++++---- .../Impl/Parquet/ParquetDataValuesReader.cpp | 23 ++++++++++--------- .../Impl/Parquet/ParquetDataValuesReader.h | 23 +++++++++---------- .../Impl/Parquet/ParquetLeafColReader.cpp | 17 +++++++------- .../Impl/Parquet/ParquetLeafColReader.h | 7 +++--- .../Impl/Parquet/ParquetRecordReader.cpp | 19 ++++++--------- .../Impl/Parquet/ParquetRecordReader.h | 7 +++--- 12 files changed, 71 insertions(+), 56 deletions(-) diff 
--git a/base/base/Decimal_fwd.h b/base/base/Decimal_fwd.h index beb228cea3c..a11e13a479b 100644 --- a/base/base/Decimal_fwd.h +++ b/base/base/Decimal_fwd.h @@ -44,6 +44,10 @@ concept is_over_big_int = || std::is_same_v || std::is_same_v || std::is_same_v; + +template +concept is_over_big_decimal = is_decimal && is_over_big_int; + } template <> struct is_signed { static constexpr bool value = true; }; diff --git a/src/Columns/ColumnDecimal.h b/src/Columns/ColumnDecimal.h index e0ea26744dc..e606aaaff0f 100644 --- a/src/Columns/ColumnDecimal.h +++ b/src/Columns/ColumnDecimal.h @@ -141,6 +141,14 @@ protected: UInt32 scale; }; +template +concept is_col_over_big_decimal = std::is_same_v> + && is_decimal && is_over_big_int; + +template +concept is_col_int_decimal = std::is_same_v> + && is_decimal && std::is_integral_v; + template class ColumnVector; template struct ColumnVectorOrDecimalT { using Col = ColumnVector; }; template struct ColumnVectorOrDecimalT { using Col = ColumnDecimal; }; diff --git a/src/Columns/ColumnVector.h b/src/Columns/ColumnVector.h index 39ee1d931bd..91bceaa4534 100644 --- a/src/Columns/ColumnVector.h +++ b/src/Columns/ColumnVector.h @@ -441,6 +441,9 @@ ColumnPtr ColumnVector::indexImpl(const PaddedPODArray & indexes, size_ return res; } +template +concept is_col_vector = std::is_same_v>; + /// Prevent implicit template instantiation of ColumnVector for common types extern template class ColumnVector; diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index 44c051401ef..106f443d532 100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -600,6 +600,7 @@ M(719, QUERY_CACHE_USED_WITH_SYSTEM_TABLE) \ M(720, USER_EXPIRED) \ M(721, DEPRECATED_FUNCTION) \ + M(722, PARQUET_EXCEPTION) \ \ M(900, DISTRIBUTED_CACHE_ERROR) \ M(901, CANNOT_USE_DISTRIBUTED_CACHE) \ diff --git a/src/Processors/Formats/Impl/Parquet/ParquetColumnReader.h b/src/Processors/Formats/Impl/Parquet/ParquetColumnReader.h index cfd9d3ba5bd..2c78949e8e1 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetColumnReader.h +++ b/src/Processors/Formats/Impl/Parquet/ParquetColumnReader.h @@ -1,6 +1,7 @@ #pragma once #include +#include namespace parquet { @@ -18,7 +19,7 @@ namespace DB class ParquetColumnReader { public: - virtual ColumnWithTypeAndName readBatch(UInt32 rows_num, const String & name) = 0; + virtual ColumnWithTypeAndName readBatch(UInt64 rows_num, const String & name) = 0; virtual ~ParquetColumnReader() = default; }; diff --git a/src/Processors/Formats/Impl/Parquet/ParquetDataBuffer.h b/src/Processors/Formats/Impl/Parquet/ParquetDataBuffer.h index 1f83c74f9ad..be9710e1726 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetDataBuffer.h +++ b/src/Processors/Formats/Impl/Parquet/ParquetDataBuffer.h @@ -142,15 +142,19 @@ private: class LazyNullMap { public: - LazyNullMap(UInt32 size_) : size(size_), col_nullable(nullptr) {} + LazyNullMap(UInt64 size_) : size(size_), col_nullable(nullptr) {} - void setNull(UInt32 cursor) + template + requires std::is_integral_v + void setNull(T cursor) { initialize(); null_map[cursor] = 1; } - void setNull(UInt32 cursor, UInt32 count) + template + requires std::is_integral_v + void setNull(T cursor, UInt32 count) { initialize(); memset(null_map + cursor, 1, count); @@ -159,7 +163,7 @@ public: ColumnPtr getNullableCol() { return col_nullable; } private: - UInt32 size; + UInt64 size; UInt8 * null_map; ColumnPtr col_nullable; diff --git a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp 
b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp index 659a7a11969..3afc66dcb36 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp +++ b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp @@ -189,7 +189,7 @@ void RleValuesReader::setValueBySteps( res_values += *(step_iterator++); visitValues( - col_data_steps.size() - 1, + static_cast(col_data_steps.size() - 1), /* individual_visitor */ [&](Int32 val) { *res_values = val_getter(val); @@ -394,14 +394,14 @@ void ParquetRleLCReader::readBatch( cursor, num_values, max_def_level, - /* individual_null_visitor */ [&](UInt32 nest_cursor) { + /* individual_null_visitor */ [&](size_t nest_cursor) { column_data[nest_cursor] = 0; has_null = true; }, - /* stepped_valid_visitor */ [&](UInt32 nest_cursor, const std::vector & valid_index_steps) { + /* stepped_valid_visitor */ [&](size_t nest_cursor, const std::vector & valid_index_steps) { rle_data_reader->setValueBySteps(column_data + nest_cursor, valid_index_steps, val_getter); }, - /* repeated_visitor */ [&](bool is_valid, UInt32 nest_cursor, UInt32 count) { + /* repeated_visitor */ [&](bool is_valid, size_t nest_cursor, UInt32 count) { if (is_valid) { rle_data_reader->setValues(column_data + nest_cursor, count, val_getter); @@ -461,10 +461,11 @@ void ParquetRleDictReader::readBatch( cursor, num_values, max_def_level, - /* individual_null_visitor */ [&](UInt32) {}, - /* stepped_valid_visitor */ [&](UInt32, const std::vector & valid_index_steps) { + /* individual_null_visitor */ [&](size_t) {}, + /* stepped_valid_visitor */ [&](size_t, const std::vector & valid_index_steps) { value_cache.resize(valid_index_steps.size()); - rle_data_reader->setValues(value_cache.data() + 1, valid_index_steps.size() - 1, val_getter); + rle_data_reader->setValues( + value_cache.data() + 1, static_cast(valid_index_steps.size() - 1), val_getter); append_nulls(valid_index_steps[0]); for (size_t i = 1; i < valid_index_steps.size(); i++) @@ -473,7 +474,7 @@ void ParquetRleDictReader::readBatch( append_nulls(valid_index_steps[i] - 1); } }, - /* repeated_visitor */ [&](bool is_valid, UInt32, UInt32 count) { + /* repeated_visitor */ [&](bool is_valid, size_t, UInt32 count) { if (is_valid) { value_cache.resize(count); @@ -504,13 +505,13 @@ void ParquetRleDictReader::readBatch( cursor, num_values, max_def_level, - /* individual_null_visitor */ [&](UInt32 nest_cursor) { + /* individual_null_visitor */ [&](size_t nest_cursor) { null_map.setNull(nest_cursor); }, - /* stepped_valid_visitor */ [&](UInt32 nest_cursor, const std::vector & valid_index_steps) { + /* stepped_valid_visitor */ [&](size_t nest_cursor, const std::vector & valid_index_steps) { rle_data_reader->setValueBySteps(column_data + nest_cursor, valid_index_steps, val_getter); }, - /* repeated_visitor */ [&](bool is_valid, UInt32 nest_cursor, UInt32 count) { + /* repeated_visitor */ [&](bool is_valid, size_t nest_cursor, UInt32 count) { if (is_valid) { rle_data_reader->setValues(column_data + nest_cursor, count, val_getter); diff --git a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.h b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.h index 2c95f495339..66a1f4877e4 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.h +++ b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.h @@ -3,7 +3,6 @@ #include #include -#include #include #include #include @@ -25,7 +24,7 @@ namespace ErrorCodes class RleValuesReader { public: - RleValuesReader(std::unique_ptr bit_reader_, 
Int32 bit_width_) + RleValuesReader(std::unique_ptr bit_reader_, Int32 bit_width_) : bit_reader(std::move(bit_reader_)), bit_width(bit_width_) {} /** @@ -45,7 +44,7 @@ public: * @brief Visit num_values elements. * For RLE encoding, for same group, the value is same, so they can be visited repeatedly. * For BitPacked encoding, the values may be different with each other, so they must be visited individual. - * + * * @tparam IndividualVisitor A callback with signature: void(Int32 val) * @tparam RepeatedVisitor A callback with signature: void(UInt32 count, Int32 val) */ @@ -55,10 +54,10 @@ public: /** * @brief Visit num_values elements by parsed nullability. * If the parsed value is same as max_def_level, then it is processed as null value. - * + * * @tparam IndividualVisitor A callback with signature: void(size_t cursor) * @tparam RepeatedVisitor A callback with signature: void(size_t cursor, UInt32 count) - * + * * Because the null map is processed, so only the callbacks only need to process the valid data. */ template @@ -74,18 +73,18 @@ public: * @brief Visit num_values elements by parsed nullability. * It may be inefficient to process the valid data individually like in visitNullableValues, * so a valid_index_steps index array is generated first, in order to process valid data continuously. - * + * * @tparam IndividualNullVisitor A callback with signature: void(size_t cursor), used to process null value * @tparam SteppedValidVisitor A callback with signature: * void(size_t cursor, const std::vector & valid_index_steps) * for n valid elements with null value interleaved in a BitPacked group, * i-th item in valid_index_steps describes how many elements in column there are after (i-1)-th valid element. - * + * * take following BitPacked group with 2 valid elements for example: * null valid null null valid null * then the valid_index_steps has values [1, 3, 2]. * Please note that the the sum of valid_index_steps is same as elements number in this group. - * + * * @tparam RepeatedVisitor A callback with signature: void(bool is_valid, UInt32 cursor, UInt32 count) */ template @@ -99,7 +98,7 @@ public: /** * @brief Set the Values to column_data directly - * + * * @tparam TValue The type of column data. * @tparam ValueGetter A callback with signature: TValue(Int32 val) */ @@ -118,7 +117,7 @@ public: ValueGetter && val_getter); private: - std::unique_ptr bit_reader; + std::unique_ptr bit_reader; std::vector cur_packed_bit_values; std::vector valid_index_steps; @@ -203,7 +202,7 @@ private: /** * Read data according to the format of ColumnLowCardinality format. - * + * * Only index and null column are processed in this class. * And all null value is mapped to first index in dictionary, * so the result index valued is added by one. @@ -232,7 +231,7 @@ private: /** * The definition level is RLE or BitPacked encoded, * and the index of dictionary is also RLE or BitPacked encoded. - * + * * while the result is not parsed as a low cardinality column, * instead, a normal column is generated. 
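For contrast with the ParquetRleLCReader path described a few lines above, here is a self-contained sketch, not part of the patch, of how parquet dictionary indices and nulls become ColumnLowCardinality indices (names and types are illustrative only):

#include <vector>

// parquet dictionary:           ["apple", "banana"]      -> indices 0, 1
// ColumnLowCardinality unique:  ["", "apple", "banana"]  -> index 0 is reserved for null/default
std::vector<UInt32> toLowCardinalityIndices(const std::vector<Int32> & parquet_indices, const std::vector<bool> & is_null)
{
    std::vector<UInt32> res(parquet_indices.size());
    for (size_t i = 0; i < res.size(); ++i)
        res[i] = is_null[i] ? 0 : static_cast<UInt32>(parquet_indices[i] + 1);
    return res;
}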
*/ diff --git a/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.cpp b/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.cpp index 00dee9074fe..2e3d329bcd2 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.cpp +++ b/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -58,7 +59,7 @@ void visitColStrIndexType(size_t data_size, TypeVisitor && visitor) } } -void reserveColumnStrRows(MutableColumnPtr & col, UInt32 rows_num) +void reserveColumnStrRows(MutableColumnPtr & col, UInt64 rows_num) { col->reserve(rows_num); @@ -212,7 +213,7 @@ ParquetLeafColReader::ParquetLeafColReader( } template -ColumnWithTypeAndName ParquetLeafColReader::readBatch(UInt32 rows_num, const String & name) +ColumnWithTypeAndName ParquetLeafColReader::readBatch(UInt64 rows_num, const String & name) { reading_rows_num = rows_num; auto readPageIfEmpty = [&]() { @@ -228,7 +229,7 @@ ColumnWithTypeAndName ParquetLeafColReader::readBatch(UInt32 rows_num, // if dictionary page encountered, another page should be read readPageIfEmpty(); - auto read_values = std::min(rows_num, cur_page_values); + auto read_values = static_cast(std::min(rows_num, static_cast(cur_page_values))); data_values_reader->readBatch(column, *null_map, read_values); cur_page_values -= read_values; @@ -239,7 +240,7 @@ ColumnWithTypeAndName ParquetLeafColReader::readBatch(UInt32 rows_num, } template <> -void ParquetLeafColReader::resetColumn(UInt32 rows_num) +void ParquetLeafColReader::resetColumn(UInt64 rows_num) { if (reading_low_cardinality) { @@ -261,7 +262,7 @@ void ParquetLeafColReader::resetColumn(UInt32 rows_num) } template -void ParquetLeafColReader::resetColumn(UInt32 rows_num) +void ParquetLeafColReader::resetColumn(UInt64 rows_num) { assert(!reading_low_cardinality); @@ -403,9 +404,9 @@ void ParquetLeafColReader::readPageV1(const parquet::DataPageV1 & page) assert(col_descriptor.max_definition_level() >= 0); std::unique_ptr def_level_reader; if (col_descriptor.max_definition_level() > 0) { - auto bit_width = arrow::BitUtil::Log2(col_descriptor.max_definition_level() + 1); + auto bit_width = arrow::bit_util::Log2(col_descriptor.max_definition_level() + 1); auto num_bytes = ::arrow::util::SafeLoadAs(buffer); - auto bit_reader = std::make_unique(buffer + 4, num_bytes); + auto bit_reader = std::make_unique(buffer + 4, num_bytes); num_bytes += 4; buffer += num_bytes; max_size -= num_bytes; @@ -447,7 +448,7 @@ void ParquetLeafColReader::readPageV1(const parquet::DataPageV1 & page) // refer to: DictDecoderImpl::SetData in encoding.cc auto bit_width = *buffer; - auto bit_reader = std::make_unique(++buffer, --max_size); + auto bit_reader = std::make_unique(++buffer, --max_size); data_values_reader = createDictReader( std::move(def_level_reader), std::make_unique(std::move(bit_reader), bit_width)); break; diff --git a/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.h b/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.h index f730afe40ed..c5b14132f17 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.h +++ b/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.h @@ -1,6 +1,5 @@ #pragma once -#include #include #include @@ -28,7 +27,7 @@ public: std::unique_ptr meta_, std::unique_ptr reader_); - ColumnWithTypeAndName readBatch(UInt32 rows_num, const String & name) override; + ColumnWithTypeAndName readBatch(UInt64 rows_num, const String & name) override; private: const parquet::ColumnDescriptor & 
col_descriptor; @@ -42,13 +41,13 @@ private: ColumnPtr dictionary; + UInt64 reading_rows_num = 0; UInt32 cur_page_values = 0; - UInt32 reading_rows_num = 0; bool reading_low_cardinality = false; Poco::Logger * log; - void resetColumn(UInt32 rows_num); + void resetColumn(UInt64 rows_num); void degradeDictionary(); ColumnWithTypeAndName releaseColumn(const String & name); diff --git a/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp b/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp index a5744b85174..9ff4a7a16aa 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp +++ b/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -30,21 +31,14 @@ namespace ErrorCodes extern const int PARQUET_EXCEPTION; } -// #define THROW_ARROW_NOT_OK(status) \ -// do \ -// { \ -// if (::arrow::Status _s = (status); !_s.ok()) \ -// throw Exception(_s.ToString(), ErrorCodes::BAD_ARGUMENTS); \ -// } while (false) - - #define THROW_PARQUET_EXCEPTION(s) \ do \ { \ try { (s); } \ catch (const ::parquet::ParquetException & e) \ { \ - throw Exception(e.what(), ErrorCodes::PARQUET_EXCEPTION); \ + auto msg = PreformattedMessage::create("Excepted when reading parquet: {}", e.what()); \ + throw Exception(std::move(msg), ErrorCodes::PARQUET_EXCEPTION); \ } \ } while (false) @@ -172,13 +166,14 @@ ParquetRecordReader::ParquetRecordReader( auto idx = file_reader->metadata()->schema()->ColumnIndex(col_with_name.name); if (idx < 0) { - throw Exception("can not find column with name: " + col_with_name.name, ErrorCodes::BAD_ARGUMENTS); + auto msg = PreformattedMessage::create("can not find column with name: {}", col_with_name.name); + throw Exception(std::move(msg), ErrorCodes::BAD_ARGUMENTS); } parquet_col_indice.push_back(idx); } } -Chunk ParquetRecordReader::readChunk(UInt32 num_rows) +Chunk ParquetRecordReader::readChunk(size_t num_rows) { if (!left_rows) { @@ -190,7 +185,7 @@ Chunk ParquetRecordReader::readChunk(UInt32 num_rows) } Columns columns(header.columns()); - auto num_rows_read = std::min(static_cast(num_rows), cur_row_group_left_rows); + auto num_rows_read = std::min(num_rows, cur_row_group_left_rows); for (size_t i = 0; i < header.columns(); i++) { columns[i] = castColumn( diff --git a/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.h b/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.h index d77cab6553b..69cdaa5ccb7 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.h +++ b/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.h @@ -1,9 +1,8 @@ #pragma once -#include #include #include -#include +#include #include #include @@ -22,8 +21,8 @@ public: std::shared_ptr<::arrow::io::RandomAccessFile> file, const parquet::ReaderProperties& properties); - Chunk readChunk(UInt32 num_rows); - + Chunk readChunk(size_t num_rows); + // follow the scale generated by spark static constexpr UInt8 default_datetime64_scale = 9; From dbdff6c038834f973d803f44ef096b6015d09e3b Mon Sep 17 00:00:00 2001 From: copperybean Date: Sun, 28 Jan 2024 09:56:36 +0800 Subject: [PATCH 110/392] support reading simple types by native parquet reader Change-Id: I38b8368b022263d9a71cb3f3e9fdad5d6ca26753 --- src/Core/Settings.h | 1 + src/Formats/FormatFactory.cpp | 1 + src/Formats/FormatSettings.h | 1 + .../Formats/Impl/Parquet/ParquetDataBuffer.h | 2 +- .../Impl/Parquet/ParquetLeafColReader.cpp | 12 +- .../Impl/Parquet/ParquetRecordReader.cpp | 73 +++++++---- 
.../Impl/Parquet/ParquetRecordReader.h | 14 ++- .../Formats/Impl/ParquetBlockInputFormat.cpp | 118 ++++++++++++------ .../Formats/Impl/ParquetBlockInputFormat.h | 4 + 9 files changed, 153 insertions(+), 73 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 4a0de354a03..2465164e912 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -1013,6 +1013,7 @@ class IColumn; M(Bool, input_format_parquet_case_insensitive_column_matching, false, "Ignore case when matching Parquet columns with CH columns.", 0) \ M(Bool, input_format_parquet_preserve_order, false, "Avoid reordering rows when reading from Parquet files. Usually makes it much slower.", 0) \ M(Bool, input_format_parquet_filter_push_down, true, "When reading Parquet files, skip whole row groups based on the WHERE/PREWHERE expressions and min/max statistics in the Parquet metadata.", 0) \ + M(Bool, input_format_parquet_use_native_reader, false, "When reading Parquet files, to use native reader instead of arrow reader.", 0) \ M(Bool, input_format_allow_seeks, true, "Allow seeks while reading in ORC/Parquet/Arrow input formats", 0) \ M(Bool, input_format_orc_allow_missing_columns, true, "Allow missing columns while reading ORC input formats", 0) \ M(Bool, input_format_orc_use_fast_decoder, true, "Use a faster ORC decoder implementation.", 0) \ diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 43ccee173f0..557b49d2a0a 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -154,6 +154,7 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se format_settings.parquet.case_insensitive_column_matching = settings.input_format_parquet_case_insensitive_column_matching; format_settings.parquet.preserve_order = settings.input_format_parquet_preserve_order; format_settings.parquet.filter_push_down = settings.input_format_parquet_filter_push_down; + format_settings.parquet.use_native_reader = settings.input_format_parquet_use_native_reader; format_settings.parquet.allow_missing_columns = settings.input_format_parquet_allow_missing_columns; format_settings.parquet.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference; format_settings.parquet.output_string_as_string = settings.output_format_parquet_string_as_string; diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index d5fedf99adb..0ac4ea5e0fb 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -258,6 +258,7 @@ struct FormatSettings bool skip_columns_with_unsupported_types_in_schema_inference = false; bool case_insensitive_column_matching = false; bool filter_push_down = true; + bool use_native_reader = false; std::unordered_set skip_row_groups = {}; bool output_string_as_string = false; bool output_fixed_string_as_fixed_byte_array = true; diff --git a/src/Processors/Formats/Impl/Parquet/ParquetDataBuffer.h b/src/Processors/Formats/Impl/Parquet/ParquetDataBuffer.h index be9710e1726..d4956f83092 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetDataBuffer.h +++ b/src/Processors/Formats/Impl/Parquet/ParquetDataBuffer.h @@ -34,7 +34,7 @@ public: void ALWAYS_INLINE readValue(TValue & dst) { checkAvaible(sizeof(TValue)); - dst = *reinterpret_cast(data); + dst = *(reinterpret_cast(data)); consume(sizeof(TValue)); } diff --git a/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.cpp b/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.cpp 
index 2e3d329bcd2..e2677d7cae3 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.cpp +++ b/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.cpp @@ -274,7 +274,14 @@ void ParquetLeafColReader::resetColumn(UInt64 rows_num) template void ParquetLeafColReader::degradeDictionary() { + // if last batch read all dictionary indices, then degrade is not needed this time + if (!column) + { + dictionary = nullptr; + return; + } assert(dictionary && column->size()); + null_map = std::make_unique(reading_rows_num); auto col_existing = std::move(column); column = ColumnString::create(); @@ -304,7 +311,8 @@ void ParquetLeafColReader::degradeDictionary() col_dest.getOffsets()[i] = col_dest.getChars().size(); } }); - LOG_INFO(log, "degraded dictionary to normal column"); + dictionary = nullptr; + LOG_DEBUG(log, "degraded dictionary to normal column"); } template @@ -364,7 +372,7 @@ void ParquetLeafColReader::readPage() throw new Exception( ErrorCodes::NOT_IMPLEMENTED, "Unsupported dictionary page encoding {}", dict_page.encoding()); } - LOG_INFO(log, "{} values in dictionary page of column {}", dict_page.num_values(), col_descriptor.name()); + LOG_DEBUG(log, "{} values in dictionary page of column {}", dict_page.num_values(), col_descriptor.name()); dictionary = readDictPage(dict_page, col_descriptor, base_data_type); if (std::is_same_v) diff --git a/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp b/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp index 9ff4a7a16aa..42f131ff794 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp +++ b/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp @@ -31,31 +31,29 @@ namespace ErrorCodes extern const int PARQUET_EXCEPTION; } -#define THROW_PARQUET_EXCEPTION(s) \ - do \ - { \ - try { (s); } \ - catch (const ::parquet::ParquetException & e) \ - { \ +#define THROW_PARQUET_EXCEPTION(s) \ + do \ + { \ + try { (s); } \ + catch (const ::parquet::ParquetException & e) \ + { \ auto msg = PreformattedMessage::create("Excepted when reading parquet: {}", e.what()); \ throw Exception(std::move(msg), ErrorCodes::PARQUET_EXCEPTION); \ - } \ + } \ } while (false) namespace { -Int64 getTotalRows(const parquet::FileMetaData & meta_data) +std::unique_ptr createFileReader( + std::shared_ptr<::arrow::io::RandomAccessFile> arrow_file) { - Int64 res = 0; - for (int i = 0; i < meta_data.num_row_groups(); i++) - { - res += meta_data.RowGroup(i)->num_rows(); - } + std::unique_ptr res; + THROW_PARQUET_EXCEPTION(res = parquet::ParquetFileReader::Open(std::move(arrow_file))); return res; } -std::unique_ptr createReader( +std::unique_ptr createColReader( const parquet::ColumnDescriptor & col_descriptor, DataTypePtr ch_type, std::unique_ptr meta, @@ -86,7 +84,7 @@ std::unique_ptr createReader( } case parquet::Type::FIXED_LEN_BYTE_ARRAY: { - if (col_descriptor.type_length() <= static_cast(DecimalUtils::max_precision)) + if (col_descriptor.type_length() <= static_cast(sizeof(Decimal128))) { auto data_type = std::make_shared( col_descriptor.type_precision(), col_descriptor.type_scale()); @@ -148,16 +146,21 @@ std::unique_ptr createReader( ParquetRecordReader::ParquetRecordReader( Block header_, - std::shared_ptr<::arrow::io::RandomAccessFile> file, - const parquet::ReaderProperties& properties) - : header(std::move(header_)) + parquet::ArrowReaderProperties reader_properties_, + std::shared_ptr<::arrow::io::RandomAccessFile> arrow_file, + const FormatSettings & format_settings, + std::vector row_groups_indices_) + : 
file_reader(createFileReader(std::move(arrow_file))) + , reader_properties(reader_properties_) + , header(std::move(header_)) + , max_block_size(format_settings.parquet.max_block_size) + , row_groups_indices(std::move(row_groups_indices_)) + , left_rows(getTotalRows(*file_reader->metadata())) { // Only little endian system is supported currently static_assert(std::endian::native == std::endian::little); log = &Poco::Logger::get("ParquetRecordReader"); - THROW_PARQUET_EXCEPTION(file_reader = parquet::ParquetFileReader::Open(std::move(file), properties)); - left_rows = getTotalRows(*file_reader->metadata()); parquet_col_indice.reserve(header.columns()); column_readers.reserve(header.columns()); @@ -167,13 +170,18 @@ ParquetRecordReader::ParquetRecordReader( if (idx < 0) { auto msg = PreformattedMessage::create("can not find column with name: {}", col_with_name.name); - throw Exception(std::move(msg), ErrorCodes::BAD_ARGUMENTS); + throw Exception(std::move(msg), ErrorCodes::PARQUET_EXCEPTION); } parquet_col_indice.push_back(idx); } + if (reader_properties.pre_buffer()) + { + THROW_PARQUET_EXCEPTION(file_reader->PreBuffer( + row_groups_indices, parquet_col_indice, reader_properties.io_context(), reader_properties.cache_options())); + } } -Chunk ParquetRecordReader::readChunk(size_t num_rows) +Chunk ParquetRecordReader::readChunk() { if (!left_rows) { @@ -185,7 +193,7 @@ Chunk ParquetRecordReader::readChunk(size_t num_rows) } Columns columns(header.columns()); - auto num_rows_read = std::min(num_rows, cur_row_group_left_rows); + auto num_rows_read = std::min(max_block_size, cur_row_group_left_rows); for (size_t i = 0; i < header.columns(); i++) { columns[i] = castColumn( @@ -201,20 +209,33 @@ Chunk ParquetRecordReader::readChunk(size_t num_rows) void ParquetRecordReader::loadNextRowGroup() { Stopwatch watch(CLOCK_MONOTONIC); - cur_row_group_reader = file_reader->RowGroup(next_row_group_idx); + cur_row_group_reader = file_reader->RowGroup(row_groups_indices[next_row_group_idx]); column_readers.clear(); for (size_t i = 0; i < parquet_col_indice.size(); i++) { - column_readers.emplace_back(createReader( + column_readers.emplace_back(createColReader( *file_reader->metadata()->schema()->Column(parquet_col_indice[i]), header.getByPosition(i).type, cur_row_group_reader->metadata()->ColumnChunk(parquet_col_indice[i]), cur_row_group_reader->GetColumnPageReader(parquet_col_indice[i]))); } - LOG_DEBUG(log, "reading row group {} consumed {} ms", next_row_group_idx, watch.elapsedNanoseconds() / 1e6); + + auto duration = watch.elapsedNanoseconds() / 1e6; + LOG_DEBUG(log, "reading row group {} consumed {} ms", row_groups_indices[next_row_group_idx], duration); + ++next_row_group_idx; cur_row_group_left_rows = cur_row_group_reader->metadata()->num_rows(); } +Int64 ParquetRecordReader::getTotalRows(const parquet::FileMetaData & meta_data) +{ + Int64 res = 0; + for (size_t i = 0; i < row_groups_indices.size(); i++) + { + res += meta_data.RowGroup(row_groups_indices[i])->num_rows(); + } + return res; +} + } diff --git a/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.h b/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.h index 69cdaa5ccb7..4789be59ec8 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.h +++ b/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include @@ -18,23 +19,29 @@ class ParquetRecordReader public: ParquetRecordReader( Block header_, - std::shared_ptr<::arrow::io::RandomAccessFile> file, - const 
parquet::ReaderProperties& properties); + parquet::ArrowReaderProperties reader_properties_, + std::shared_ptr<::arrow::io::RandomAccessFile> arrow_file, + const FormatSettings & format_settings, + std::vector row_groups_indices_); - Chunk readChunk(size_t num_rows); + Chunk readChunk(); // follow the scale generated by spark static constexpr UInt8 default_datetime64_scale = 9; private: std::unique_ptr file_reader; + parquet::ArrowReaderProperties reader_properties; Block header; std::shared_ptr cur_row_group_reader; ParquetColReaders column_readers; + UInt64 max_block_size; + std::vector parquet_col_indice; + std::vector row_groups_indices; UInt64 left_rows; UInt64 cur_row_group_left_rows = 0; int next_row_group_idx = 0; @@ -42,6 +49,7 @@ private: Poco::Logger * log; void loadNextRowGroup(); + Int64 getTotalRows(const parquet::FileMetaData & meta_data); }; } diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp index d41cb3447de..e35d53dc4f4 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp @@ -23,6 +23,7 @@ #include #include #include +#include namespace CurrentMetrics { @@ -392,6 +393,8 @@ void ParquetBlockInputFormat::initializeIfNeeded() { if (std::exchange(is_initialized, true)) return; + if (format_settings.parquet.use_native_reader) + LOG_INFO(&Poco::Logger::get("ParquetBlockInputFormat"), "using native parquet reader"); // Create arrow file adapter. // TODO: Make the adapter do prefetching on IO threads, based on the full set of ranges that @@ -479,23 +482,35 @@ void ParquetBlockInputFormat::initializeRowGroupBatchReader(size_t row_group_bat if (metadata->writer_version().VersionLt(parquet::ApplicationVersion::PARQUET_816_FIXED_VERSION())) properties.set_pre_buffer(false); - parquet::arrow::FileReaderBuilder builder; - THROW_ARROW_NOT_OK( - builder.Open(arrow_file, /* not to be confused with ArrowReaderProperties */ parquet::default_reader_properties(), metadata)); - builder.properties(properties); - // TODO: Pass custom memory_pool() to enable memory accounting with non-jemalloc allocators. - THROW_ARROW_NOT_OK(builder.Build(&row_group_batch.file_reader)); + if (format_settings.parquet.use_native_reader) + { + row_group_batch.native_record_reader = std::make_shared( + getPort().getHeader(), + std::move(properties), + arrow_file, + format_settings, + row_group_batch.row_groups_idxs); + } + else + { + parquet::arrow::FileReaderBuilder builder; + THROW_ARROW_NOT_OK( + builder.Open(arrow_file, /* not to be confused with ArrowReaderProperties */ parquet::default_reader_properties(), metadata)); + builder.properties(properties); + // TODO: Pass custom memory_pool() to enable memory accounting with non-jemalloc allocators. 
+ THROW_ARROW_NOT_OK(builder.Build(&row_group_batch.file_reader)); - THROW_ARROW_NOT_OK( - row_group_batch.file_reader->GetRecordBatchReader(row_group_batch.row_groups_idxs, column_indices, &row_group_batch.record_batch_reader)); + THROW_ARROW_NOT_OK( + row_group_batch.file_reader->GetRecordBatchReader(row_group_batch.row_groups_idxs, column_indices, &row_group_batch.record_batch_reader)); - row_group_batch.arrow_column_to_ch_column = std::make_unique( - getPort().getHeader(), - "Parquet", - format_settings.parquet.allow_missing_columns, - format_settings.null_as_default, - format_settings.date_time_overflow_behavior, - format_settings.parquet.case_insensitive_column_matching); + row_group_batch.arrow_column_to_ch_column = std::make_unique( + getPort().getHeader(), + "Parquet", + format_settings.parquet.allow_missing_columns, + format_settings.null_as_default, + format_settings.date_time_overflow_behavior, + format_settings.parquet.case_insensitive_column_matching); + } } void ParquetBlockInputFormat::scheduleRowGroup(size_t row_group_batch_idx) @@ -561,6 +576,7 @@ void ParquetBlockInputFormat::decodeOneChunk(size_t row_group_batch_idx, std::un lock.unlock(); auto end_of_row_group = [&] { + row_group_batch.native_record_reader.reset(); row_group_batch.arrow_column_to_ch_column.reset(); row_group_batch.record_batch_reader.reset(); row_group_batch.file_reader.reset(); @@ -573,35 +589,55 @@ void ParquetBlockInputFormat::decodeOneChunk(size_t row_group_batch_idx, std::un // reached. Wake up read() instead. condvar.notify_all(); }; - - if (!row_group_batch.record_batch_reader) - initializeRowGroupBatchReader(row_group_batch_idx); - - auto batch = row_group_batch.record_batch_reader->Next(); - if (!batch.ok()) - throw Exception(ErrorCodes::CANNOT_READ_ALL_DATA, "Error while reading Parquet data: {}", batch.status().ToString()); - - if (!*batch) + auto get_pending_chunk = [&](size_t num_rows, Chunk chunk = {}) { - end_of_row_group(); - return; - } - - auto tmp_table = arrow::Table::FromRecordBatches({*batch}); - - size_t approx_chunk_original_size = static_cast(std::ceil(static_cast(row_group_batch.total_bytes_compressed) / row_group_batch.total_rows * (*tmp_table)->num_rows())); - PendingChunk res = { - .chunk = {}, - .block_missing_values = {}, - .chunk_idx = row_group_batch.next_chunk_idx, - .row_group_batch_idx = row_group_batch_idx, - .approx_original_chunk_size = approx_chunk_original_size + size_t approx_chunk_original_size = static_cast(std::ceil( + static_cast(row_group_batch.total_bytes_compressed) / row_group_batch.total_rows * num_rows)); + return PendingChunk{ + .chunk = std::move(chunk), + .block_missing_values = {}, + .chunk_idx = row_group_batch.next_chunk_idx, + .row_group_batch_idx = row_group_batch_idx, + .approx_original_chunk_size = approx_chunk_original_size + }; }; - /// If defaults_for_omitted_fields is true, calculate the default values from default expression for omitted fields. - /// Otherwise fill the missing columns with zero values of its type. - BlockMissingValues * block_missing_values_ptr = format_settings.defaults_for_omitted_fields ? 
&res.block_missing_values : nullptr; - res.chunk = row_group_batch.arrow_column_to_ch_column->arrowTableToCHChunk(*tmp_table, (*tmp_table)->num_rows(), block_missing_values_ptr); + if (!row_group_batch.record_batch_reader && !row_group_batch.native_record_reader) + initializeRowGroupBatchReader(row_group_batch_idx); + + PendingChunk res; + if (format_settings.parquet.use_native_reader) + { + auto chunk = row_group_batch.native_record_reader->readChunk(); + if (!chunk) + { + end_of_row_group(); + return; + } + + auto num_rows = chunk.getNumRows(); + res = get_pending_chunk(num_rows, std::move(chunk)); + } + else + { + auto batch = row_group_batch.record_batch_reader->Next(); + if (!batch.ok()) + throw Exception(ErrorCodes::CANNOT_READ_ALL_DATA, "Error while reading Parquet data: {}", batch.status().ToString()); + + if (!*batch) + { + end_of_row_group(); + return; + } + + auto tmp_table = arrow::Table::FromRecordBatches({*batch}); + res = get_pending_chunk((*tmp_table)->num_rows()); + + /// If defaults_for_omitted_fields is true, calculate the default values from default expression for omitted fields. + /// Otherwise fill the missing columns with zero values of its type. + BlockMissingValues * block_missing_values_ptr = format_settings.defaults_for_omitted_fields ? &res.block_missing_values : nullptr; + res.chunk = row_group_batch.arrow_column_to_ch_column->arrowTableToCHChunk(*tmp_table, (*tmp_table)->num_rows(), block_missing_values_ptr); + } lock.lock(); diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.h b/src/Processors/Formats/Impl/ParquetBlockInputFormat.h index b5b884b5efa..a737c695fd6 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.h +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.h @@ -16,6 +16,7 @@ namespace DB { class ArrowColumnToCHColumn; +class ParquetRecordReader; // Parquet files contain a metadata block with the following information: // * list of columns, @@ -210,6 +211,9 @@ private: std::vector row_groups_idxs; // These are only used by the decoding thread, so don't require locking the mutex. + // If use_native_reader, only native_record_reader is used; + // otherwise, only native_record_reader is not used. 
+ std::shared_ptr native_record_reader; std::unique_ptr file_reader; std::shared_ptr record_batch_reader; std::unique_ptr arrow_column_to_ch_column; From 8172f6cec023df144ef20a7cfd49b43548cefd41 Mon Sep 17 00:00:00 2001 From: copperybean Date: Wed, 21 Feb 2024 00:17:30 +0800 Subject: [PATCH 111/392] log duration while reading parquet Change-Id: If79741b7456667a8dde3e355d9dc684c2dd84f4f --- .../Formats/Impl/ParquetBlockInputFormat.cpp | 11 +++++++++++ src/Processors/Formats/Impl/ParquetBlockInputFormat.h | 4 ++++ 2 files changed, 15 insertions(+) diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp index e35d53dc4f4..7faa7300416 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp @@ -673,6 +673,15 @@ void ParquetBlockInputFormat::scheduleMoreWorkIfNeeded(std::optional row } } +Chunk ParquetBlockInputFormat::generate() +{ + auto res = IInputFormat::generate(); + if (!res) + LOG_INFO(&Poco::Logger::get("ParquetBlockInputFormat"), "{} ms consumed by reading parquet file", consumed_nanosecs / 1e6); + + return res; +} + Chunk ParquetBlockInputFormat::read() { initializeIfNeeded(); @@ -683,6 +692,8 @@ Chunk ParquetBlockInputFormat::read() if (need_only_count) return getChunkForCount(row_group_batches[row_group_batches_completed++].total_rows); + Stopwatch watch(CLOCK_MONOTONIC); + SCOPE_EXIT({ consumed_nanosecs += watch.elapsedNanoseconds(); }); std::unique_lock lock(mutex); while (true) diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.h b/src/Processors/Formats/Impl/ParquetBlockInputFormat.h index a737c695fd6..a94637da942 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.h +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.h @@ -65,6 +65,8 @@ public: size_t getApproxBytesReadForChunk() const override { return previous_approx_bytes_read_for_chunk; } + Chunk generate() override; + private: Chunk read() override; @@ -286,6 +288,8 @@ private: std::exception_ptr background_exception = nullptr; std::atomic is_stopped{0}; bool is_initialized = false; + + UInt64 consumed_nanosecs = 0; }; class ParquetSchemaReader : public ISchemaReader From e0179150c1671f75f9480ebca17c4ea2595ae811 Mon Sep 17 00:00:00 2001 From: copperybean Date: Fri, 23 Feb 2024 01:09:02 +0800 Subject: [PATCH 112/392] Revert "log duration while reading parquet" This reverts commit 5df94b7f8955b541ae37e4bbdc13a1fec9ddbbd9. 
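A note on exercising the feature added earlier in this series (patch 110): the new input_format_parquet_use_native_reader setting can be toggled per query. The sketch below is hypothetical and illustrative only; the file name and column schema are invented, $CLICKHOUSE_CLIENT is assumed to be provided by the usual stateless-test harness, and only the setting name is taken from the patches themselves (file() is the standard ClickHouse table function for reading local files).

    #!/usr/bin/env bash
    # Illustrative sketch, not part of the patch: route Parquet reading through
    # the new native reader. The data file and schema below are placeholders.
    $CLICKHOUSE_CLIENT --query "
        SELECT *
        FROM file('native_reader_demo.parquet', 'Parquet', 'id Int32, name String')
        SETTINGS input_format_parquet_use_native_reader = 1"

Left at its default of false, the same query keeps going through the existing arrow-based FileReaderBuilder path in ParquetBlockInputFormat::initializeRowGroupBatchReader.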
--- .../Formats/Impl/ParquetBlockInputFormat.cpp | 11 ----------- src/Processors/Formats/Impl/ParquetBlockInputFormat.h | 4 ---- 2 files changed, 15 deletions(-) diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp index 7faa7300416..e35d53dc4f4 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp @@ -673,15 +673,6 @@ void ParquetBlockInputFormat::scheduleMoreWorkIfNeeded(std::optional row } } -Chunk ParquetBlockInputFormat::generate() -{ - auto res = IInputFormat::generate(); - if (!res) - LOG_INFO(&Poco::Logger::get("ParquetBlockInputFormat"), "{} ms consumed by reading parquet file", consumed_nanosecs / 1e6); - - return res; -} - Chunk ParquetBlockInputFormat::read() { initializeIfNeeded(); @@ -692,8 +683,6 @@ Chunk ParquetBlockInputFormat::read() if (need_only_count) return getChunkForCount(row_group_batches[row_group_batches_completed++].total_rows); - Stopwatch watch(CLOCK_MONOTONIC); - SCOPE_EXIT({ consumed_nanosecs += watch.elapsedNanoseconds(); }); std::unique_lock lock(mutex); while (true) diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.h b/src/Processors/Formats/Impl/ParquetBlockInputFormat.h index a94637da942..a737c695fd6 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.h +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.h @@ -65,8 +65,6 @@ public: size_t getApproxBytesReadForChunk() const override { return previous_approx_bytes_read_for_chunk; } - Chunk generate() override; - private: Chunk read() override; @@ -288,8 +286,6 @@ private: std::exception_ptr background_exception = nullptr; std::atomic is_stopped{0}; bool is_initialized = false; - - UInt64 consumed_nanosecs = 0; }; class ParquetSchemaReader : public ISchemaReader From 18b3ebcda363eb7e9b8f52c7170d8bc208bb9b07 Mon Sep 17 00:00:00 2001 From: copperybean Date: Fri, 23 Feb 2024 01:10:22 +0800 Subject: [PATCH 113/392] add test Change-Id: I53ade40ba24a742a21f9e09dbab7fff90b032b4b --- .../02998_native_parquet_reader.parquet | Bin 0 -> 76392 bytes .../02998_native_parquet_reader.reference | 2000 +++++++++++++++++ .../02998_native_parquet_reader.sh | 210 ++ 3 files changed, 2210 insertions(+) create mode 100644 tests/queries/0_stateless/02998_native_parquet_reader.parquet create mode 100644 tests/queries/0_stateless/02998_native_parquet_reader.reference create mode 100755 tests/queries/0_stateless/02998_native_parquet_reader.sh diff --git a/tests/queries/0_stateless/02998_native_parquet_reader.parquet b/tests/queries/0_stateless/02998_native_parquet_reader.parquet new file mode 100644 index 0000000000000000000000000000000000000000..c0d222342e31969fd5e6b4fb0fd8d0ecd4a822bc GIT binary patch literal 76392 zcmeFa2UHZ<*7jdjLRG1#7*NcZF=GH4V*>`vn7UfKN)ZzV6f^rubDC=l>mf^Z- zqw=2B(?(r=cDA9rxJhB;L3FaI0sDdNA@0=`ljj(2m|~TW>|lDI&YqoX=qYYiEMx_Dy%Hgw@r=N&ukx2)p2rw;f`sW@`W8j z$LpG~3k;s(c7>OR(sxZw*{^J0alfv(ywGsZv_tvEdeH}T&DceTe&SB0gglH+F*Rqu zv;D<`x{~r@!+len@`LrJ59u7(C58dwF2zb7PCqcUU@2~(n4l{qFEugLPZjg9P zS5^)-q?-0BmYffLT<64w7zT^`lydSI+H7)WSW)MY1 zsUVM|(@icc&q-pU&PEP5WSCT?I5(a?rE9}R7>0=PN=4b1&NQ`UOK?NQ)4ED>q~WP) zzfzK$K%ddIW0xDe!~=@0Jdu88YR_76!^E??%JK@sbJIbk6z4~u({*518ob3rN)>q$ zon`9Cmga_w=XF)(RfZR)1f>i&nZBUYva1av#KTH8c?z9va%HW#kz$grx*TQ5F&$CL za#QJxIyW}jFiJeC)R3pqxh8kE95-6Lq^l{%81hWVl=9qk`m(MQyT;%n9#`z-8T3n2 zXSM=2M!ce{C9gHSGM!LtIDh)8t_!=)FjhRN)Rt$``KGRHMQ)sUO;<-=Z+LA=R4Q?^ z=<7NUc7tKOcuJ`&&!*p)y0NyLub8Z>CvP;oHJw%}b93k$y6)^I!vyh+QeU1+zcck< 
zt8f#=o4N*atl_=stWuSmN8i%*WH%f9#B++hJfHqx>cv*$CW*Io4dpF{kEZiVbuNIu zqwCFXHB1&SD2?O=bb+Z4TZ5Y--qkghw;4W}l9Za_@XG~j~iRGpqR8s>;Ml-67u!jwS;$uZCN7JR1k!))&NX*r_%0~=klqZTK7enXiMzKc?!D6c7Cavd z7b3pYxy#24WfimH%&nzg>3rDZhEOq0=_IeC%PC`64HqWn>pIIP4CR${#f4i>zt)Xq zPa49-45f>_fv%v8W7}{M;u~F8IniLFWGZdBjr3dHc=nVbQhcg-$eZYjiZ9!aTQ0uS zb(2pUDk;yD_FOFeUN?a~V^|?RSGvoaX@wc#IL%(@+Ct}<)zY@i=)5krn8p~YsFVe zKY15zr_5lxaO=eHy8iMNLoFp=>B{YCWn?(y@kbKk7K>48b;!L!qFps@u*dl&Z2FnUAqPvGQ5z%k8I&3k%tMh8^M;Wr%!$ZlWw=`*AzP z62ee9#n4pws`Td$(j|q(?0rL=_)YPW57Et(CF}rhmuMvnlOGtGE8mrYTmoH6Sjs*$ z>=u70-tuAEL0QJ?xIJQNVYvLr&_bb9fjdH%5dzuA23e%l5%N*Gr4q#IIiqMTjFg`k zS}BY=h&x7?6@uAR!(OqNI!ZoHw^l;f!Q4KvoG@B88ypo2B+v=Eyb#K!8BC(3>LZ_| zos=+E8RFPB1%BN_J62T7P;>C)>I62ecqVVca?lfIV zh-9A{_KU^U@$wnEjk28e;tq(mg0K9{&{ip-4&%$YYH)Jp5d5SR-Gzerrne^tPgiwv=gSu zFAeTWIdu$og{~#6WnURih~?Gk@>RN%vW^|gofK;eGvs_jXQhHVj=M(J5!SP>4T++S z>MviXyC@sj@!Tn~t}s)6W9X_>RDHQ*x}LC+eQP)^R#IolH)s!K6FY%BBi0vY%kK=` z6kBy7cav@)#Io-VXT{3u9QhXAUD?d~apy#PVXpka&_k)BPU3FU4TUZ2N5grssya`; zL-$m+vXi+BVk2R`Twv&>R8yyLcj?B$HujStNvy60$oJ^p%64`tcTsF2ERa7N`Y1Kj zX|*`7tD=LjME-8*uhdp&a*ya1!fy75;hI=ST`E7O2Pk{kS=@E8rLat<#DPj(bvE~e zZY9VpEhdZg)Id3v)+t7I4tGOrEd{9Nb*g;q+TZzLI2Xz^jO?MO$*izynv4y%y&Y``P z!)ze;Skwxu<rHXDsv}`SoR9dSc+)LVB zIL4M0&7z|kBfp|YDaYARE=}wttdYx!qZKDLjLWAx3n$p}V!G(8u9aWYKFUcpoXZfq z2U#MtJytozMsiO@4`G8`Q5>hVQI~V?=x)MkwvzZv zY^!dR-_zrjGwcfPx!7IUB-@I!NR{ls&P|mTdxEEqiVY6IC zoTzkAS91k)FX22}Rm>JUs$1kww4ZW;jpA~|-ojS7nm9?(s?pqMx{r{=Ru^+cS9P2G zg`TWjWMjBI(Nowi*AS;DZt5EDE8SPP#MTsFitg$T`5Qe|xy-KRUWxsLowA)cP3fer zqeE>B?1hJ@;B1AncNBi!+oi>IRNt1`5~MI^r9#tGZjJ8Gq$E zyODb<>V!RVU2&%3p>E7erpj_LW|ne;jpg2pdci2y7iTNo)y8WnzSjHgSW*dqHVlUMsbId&D4!e!}BqEgM zM&f*>x4NC<8A-UyHWojNeNO};EksHhs3yo|n5D{NR_17Fgm74HDK1lVs*$s1MhZ{ZRw9O_9+As3fl4a7 zmn$ZX5{}BP#UMqm?&HcaqXjeTC|XE^)MIjaCRj;hO`N6VBOI5V#1Lh$s&EyUF+w`) zEV7b8Jt5mLp-Kj;a-1|)I4NtyFhx}3xr)p^fr{p$b zgfc`uz}Ygs!ZWt5SV9`Ao|Y>!k;-%SAXieFAe@ogiOUr)^$=HunJ8qj?L{kTn0i*O z%B)acunAl#$xk>ZcMw-9-s)kl8Z$}AW;=?drQzy%xjM5-$zhLhWu(c%1z9VuRz|2t zxf;wAA(wR(t)-D_l3bICQu5eiTv=(Va8Y&>qm@zWan6pJCcI?b#d6YU^^#nRiBVp$ zC%E#`bm6kxNnE4&s3*DF%nTu)?JQQ1#;8~1I?P(-HJixUNdCfAxr?|?8LOV+>M}D0 zSlCrAHc=Xzn2QgBN*6V?m#WzfhN+!@?A4bO*~KWTj$3=`-P=;TiM5Tgpox*KD2-FE z$@LgwWKVOIq*=msnHbse>KU#+Gg}}=)>iUWljQ~sF|ud5%F-O+hD?m?1oa$e&&(Bw zk*y+4RBy@+8DeD5b5*5z!Y!E?SwHmx*NB-f5F=Ynnxx*A8#BbnCUMoJ0O5{IjO=9f zBG-gjAP^&4Lz<%Am76le$X?=VN(+U1GBL7K)yrHnW|2UQterGXO_7^3#K>OZYDtTQ z`!X@I)77h-1G7XRMz*#zLwz8(V2F{u#?_IQ3J+ysWc}6aTuWw|K#Xi%X{P!}Zp9EI zo6Oad0)@viF|xDN8(eE9NFYYGzBF5XB0Dm~$ll}{NWns?OpNRt^%m#Egb2jQ+Dmg) zv+T?eBYT@`D1{1XGBL9A)H|Gp2@{BsZ6wWC(`6Ti7}>jAV<}w7kcp8EQ15YVm5Dt)(C}S9WKJku`IUQjCx%6C)d}rg5E^H3BiRPEv^aQtr$UBb&}S zOKXK!GBL8DY6jPZStk%9tC7Oge7P$_jBF<7BCQu*%f!frt4}!(W`jVCY#S*;eIs{c zh>?B9wUsssZ)IX+Bh}|zcV?47jBGn;x%y7-!4M;x#kH4Wh4(TsvMbaVTu)}RK#Xh$ zX{Gu>?!^!zo6U8Uwg?|(Vq{mTIb3gMt3ZsbR$8qV$bA@MWOF%JX`Ap#CPp?&&Eq_o z?E*2fZc?=RS?}F5F?w0A`m!jBFQao%&rK$Pgp@hU+Tr7JkUY$gWr4ayn*@K#Z)1v_Yke0z-`KJFc6A zH6$Z3vK!U+oSrcX#K?A+HmQtp5JQaY2d;;-S14vAMmARc$PH%p3B<_uls2mtMgv2P zYysCxG6|MOVq~|dpE!|G1Y%@+OIuaeC^5vye&+f}=-Z9N$Zk`=a6_1Qff!j&X}iiB zhcd*-e&zZ~`-S30Vq|xy-#9PkfIy6FKWV30!Z?f}M)o_`Upgq1G!i2lr~cr)nL`3G zvIC@Ds+Dm#LyRoN50nywQbuBAcdIl%f;lV@Bde44sHKe~8DeA^IF}WvRaHE z#T*rgk=08^)!I0kAx74MA0!$nKDC^23`2}8%NwKesTGXl7-D33%od4)jgc5xRV~hsXHE&k$PSU>)rv-6h8Wor{7~t% zP{~M)?0&T*KY=+T5F_g)9Z+qJ6B%M;t@vTmS)sC#7}=2a{OrNl2Fq~jO;PBJU^YeED$5>BOO=mj58QwWGnDvq$@%#BQdfkR2$x( zxhfDNJ61ZW);7*$h>@+xkCUzmb&SNwCaRVAS88-YNQ~@RwJJZ4xg`)I>nEL4?TzypVq~lF zlcd{1LnAS==hfY^2`!Ao$X-+H@qx@^ff(6Y(si|^F^C~Xwmv^wdLpzk5+j?e 
zHsFJqRDl@TInoWawK0StM%JF6E13mHBQdf!)rNd1lO_-&J5Rc$IvK+lVq_cf^QCmb z*+`7+ZM88U&SVJ0$OcGvRE;r$Ax5?dzd*_qT#UrX-c_6Ok<3$p7}>l{FO#x`jz(f+AE~YQC?-cBMmA7-tZI$X3^B5;`5-A*a5WMm`$To*W0*XF z7};PcRdqA2VTh4+;zOjDg1eCzS+nZQuVr2d#K?w9X=*3qI))fo4Id`u3!RO`$fm0< z{CeiKK#Xj-l%aMpZeWO!ZNo=MZ-lN!Vq`Pbw){rstw4-yr1VtvFm7Uqk!{B>m);58 zjKs)3Q`_^g%zJ?t*%i`rwYzaMLyT+(ex>w5=wT#AHcRcuZ(%+P#K^9aUZ_2dTNz?x zwft(SK8+|W8X00_yYrhQ25x~E*>`FW zelJr@PmFA=^j_5)_c6rC_T)E97WzR(Vq`z4y?7I2sV7Eui}X<)Y*ZLxWP9^lC01`R z5+hrn_Tg2A(-R}RP5PvY#(0JpSxBl}tH%kO83>xq%wA$?JY7!NSS$oAuR zN+t9|jl{@)Rr~V?nUZ>9WaFf7s+aK)LyYVIewSpWA7&&*_PaWePhd*viILqc{ZPG) zhZ$mIb^IQww0^je7+ES_;Eyn6^u);G`;7Pz#-j`|vU=VqS?fm{iIHXE2l2<4vU*}< z_e#a$M;VVZ#K;cj_etgSqm9JKTErXp6HIwMF|sDfGTz5{k|9P`jQSw4O!f10VJCq{O^R6KsX@eD(ZtQUVkveo+< ziIFW4Ka4-iRMrzCdr&GFKf!p8Ax74lKO|MrPc#xEYZX77KhIRv6C;}-m5TQ>USNok z9l;-#s_7>giIFWGKax*ks_Ti7JtCEfpKQFy5FgkD*JuOv=pJlwk5F@}%Q{8Hm%h8Wpd{B^0NewmRN*}Czw`6o;(Ju$M$QoZ;>U1v z)LI{8Bu2J={9NA5IO>U!y(u+_4>qPT#K_L$Z%I!25F;_N_VM%ibjDdvjO=ZxVSK1D zgCRyXfWISY^kGI~WE;gV;4>K)Ju$L(rN;5$#-|K1vJ3fpQX74Qkr>$~@r(FpOj|uM zvMEy2_(UrCH&pi}y=$_TALpQ8$TQ&4< z;=SNdeecDXKV;`U?a-Vv@A|Il@-y52Jn!HjM!VY8R9`5F~|&$7^rTKxQmHNrA~OeMI{gBEa~-^o|3`-u-- zVM)v=O;3vEOxWZOQcIOqt-Z7oY9$k4;>#Wl+n*7i;i-kv`!)$Exf?+ifw6 zRt%O8+tp~89~&1nenK!;qMH5MdMnrEE}*!eXGz=YkFyK>-h=x2&%qv)#oV#qo6`UI zXLnP&?ysBE6ommtmatjR*sT3xZdP=>ftLBhtjTywV1OUU_rl%r(+C6o1MGh>?thH) zNDqpd`;Rf=U@wOG+ecD3Ox4%sO&y!tiQ67o6*c+yoiybz%uF)OeDaB2p)KnM-~cCm zrc9tYe)0%^yw03D3a?Wqj>YT9nN#sPZL~LD{l-nl>#%8~@#-^t242U{n2guyKEr9w zVuZh?hVm~yfErV305uLj!{|Er^>H*ujW0F8!h1Sh+F}@vEJLZ`Q2CY`P&}0PpE8ne z#E%&<5h_3Y&L~{LpU{lP(WFAn@EK!h4e#qSh2~0`E&Rvg!qh^^I2=tYl#KTFr8O3l z##_Fp{Bf5FxPU)#mx(x?mu5^JL3gBl8j|0eG7=YYK;A^*?|D!6!Nu`~ z7iWz5^_v)e;l)wI#?u{p&BgC|`~Lbp47>2s*pYtty{U!2H)_(aix@GvXfa{TNT%>$ z?6hB3FxB4%fIj*{Qm4cP!s>R_x-cc{(dx+A9>inKleX<7YKkcySo1T z?Z2PrA3xK-pXdKPvi|sK{`E6|=jU%3{&#-<{3NLVtM#_-xapHfXM>3rom-r*O+a0n zP4}Ws6ffEm-FDJcg=Q^jG8dC}a}`~pLM&-!uDz6uxU6Z(>H0KhT+wF`Xs!Lnlv7-erSO>iy4#ACSfwChNDp#iJz%x zlSmuHQKRrTOzPxMv@V#|snLb4$z)%DG(e;c;;6CE%)}M^2@Pg=(rTfRnm%bbI-t?s zmhaIcOdf&r6ABgRNvMg13evZAv=}$4P%+(yv?FMgFj+t`rBFedwT>1O#}>9o!^RT@ zCKGfgq+k>_X+ImJk$%4}p*JbKM0&HIm&Q!}bqVjI@Y1C5GtkR;H{3uP1PfpEhlMQ? z-a_H!X@1_gJgTtOnS{2m@FGTEc#-s>KU<_}=mrZfVc3P2M);5YMVyH)u<#PbOD8p@+A+$nm@bi+a7v+-v|CS&#CC3{0lp%KU)&|xmYv-4Od~8 zRAf!4@Cr5gS1*J{v2fJGru}M?rvJKt{*YWi!TNh~=no5Me=jEdjE(=-umAs_D4aIX zrOf}`m#|>`#$S4&{8|#FZ=0qYioQ}~E;jhrb49QJr(%0E%zQ_OaasAty{rS84_s^*=7F$RU;_E*JO$@&A+E&}< zeo{*3qN_Xio$8V=yta__oJ87EbDQ)p+4vhy7G0m2IThbO{al~n$u~Oq!VBvXTw#;K z@l!^Ub)HFuUw9QR)bOJU*LgW-1 zJY9`1Y?%tb9>coI&p)9eUs&O*F|uYu`9b9~9N)%~bsGvxEaXpou|*mc@)g-+{7l7K z&-l^B-{X)h7-9Ja0Q&5r53F8)azEbT7<8~ARwzn9q}j% zSBjc;!|DvCfBe1h%e}%2WPzyg0;Yd*p>S=88jhlZFY$g|!aE@M7&D@CXKFOLLKcP! ze+i=|SA2bNWgM;`nAhM087#RO>CcW+6UhbN!V5@&8Teb#6!;krQ2#za{|6_i34fjo zBu@VrbwvP$hlMY{6_$UVDKfl&{00fue=7dMQIVSz9mqd!_}g3m&nJrh`p;v`kbhbM z{!v$oz>D1fpQHZQ>%Wzhe=7dMQIVVcJp6B;`17Y!^kzkmfA^q@uzvpz|M8&yt)gfo z{?~7_$Suer8s)-Q{L*iS|J8|~PwDr{qPO{X$N#uJSzja*a;aahzHR<_{PX-@IrG~c zs4+pm?)2M4|J!|kQ;@s=@s35W{{Gl^H@U-qekIe+e^&jKJ7IFiH^G06`q%FFb6lj= z|4(-%EyMrrUHp8W|GRtqbCkb5&!6V)zw>VYwVuYM*LFOBb$d@M^-V=b!{ zezR1XA5GR(sBySL`j*lb!)IV^#eXbWOc^%{i!kIHAwROdVK;UpVPGWame!ZBiDPWu zB^^pARdgRyxTU5dG|x!8ltNmiT-q{!U~L?nExPNf3O^xobm^Xr)cT9oWd;=s368t? 
zw7$Q0qId1?la=DX?YdfEW@kQ2+*W&B%Y<$lHKw@0wc><@4cI32x74k)?)ST{{#GhNQ{)dDA$L1^b zzu0_bSvcHPM`@dl_}L_9q!cUKjb4)8U2H~%ToGeDLe|p?_lhO5I0`OCJ2*X`u(TsZ zdFbj^vcdSSGgwAar15IG7dX&9W`O~dgidto;M`d-K z+o(h>iaOn;9;Hvce|`Xla>cqHd8e#WKU^tlAXu7= z0ehmc)K>{C>%1qAb&eD@60Dfed5^5!C~7rWdZ!A*jQDS#YzE6#JMZGcIuvystiaR< zBd>B4bswzA(A~{XLX!rTXH^$4tl)Q1*bN+3MJN(|j|%^8}PUj^80dJ9&5sO)Qk<_lP+sET)&K+_Pce5agjpKcU2 z2dtzFA&{<3Q60cac#(D_u^mN)f~5^waCjCp$H2;Rs??zpwh1PI6{lUWBexMneE`eW z{MgO848o!;o>7?5sv$JI3!ZyarD@-xaRSTEIX5T`krxD(wOQX_R$Yqf0an(F^g%9& zhc#gN1ufLSh2|nyw$8SXBVc{*fR&pQrgr+P<{PfKe#tVMS-O?4?6z_pRFD&+w6=21u?S&tk|GMAICv+7c39w%5!)d zib@BIcU_eExH&~xgXM2d4Y033QB}YS2;0}a3N*FB(neQ~e+7*vSOLzPoH{TRwGAvk z^Wc%=8Y1?<^2pANZiIM<2P-+q(b5x|OtAc&t5jWpt)Z{L(zq^Oq(fd*2P@8OuDKbh z)c`D0nCZm|XqtoN7hR>F4>S^3Cg<2Y2Q4T{0V~#Qu*htT*aOQy`>E4b#7rVsRIt;w zBxqiP73W;_$fwE_^%X21*Cn;Dx2C8@V5OMTPE^ME)?k^#l)#e6opxZwMpu1phGr~S z1+6wuZH~M+0amhE+$uIh?12@R{p?Ov#KTpv9D<$QU7`67R*G}AL6fUdRB;h;>$>z* zSLB5=SbSQ#v;rFxT*0yqR~sybrW;tv(bXacK{Ep^`_@~^#}%ij%V1GylG_~z#2#2F z+0Uo0LOeVG%PUxuei)jPt#MCH^@8jgs6t>Rxi0fcbfKtDU}@4azLvoG-e76NkKR zu)NZS2AoG-+5wg(XKk(5G)27!D>1l@-5zM_f#svA>GTLS#{n#Rx1fzm2Z|a3R!rK{ zHeZpOK48U$AFy}{jUQP4t7`h5fo3CE39Yy1mqlK@0V^`it7b{kcV__dqCHr8x8U}TkQd{?N=kcne=0t*X<%gqAKW?$nz>-bt+KOf z2hAR^vRdzm2rf@iKfp>%8+PIl>QW+DF*)mprn9J-VA+MV8?qjnj$kEeYE8Y0n$sPu zK(~-bTihwiAFP73=l*#OC~6^CwjqbyQlSY3D`i!!J^P_a0L#{KXR~6+3o8NRPxF=> z+9USBO3K;reK6vo8d#no?Y~ukrYBehn%d>NATI>465K*(S4Cbd0n0u;tI{acq6n}& zLK4yhXrjU5SJ&>|6q<8jc{s*hosBuGGFW!$!@tI%EyLsum&`#V8uD^F6oOos})!Q=_A{$MO}IamOeN3 z;aSvbcd*PM+6VKXnF3a!>lviz8rBsktjXP>arh#fKh~+CXy|tfbZVTF-{&Gg!Qn+-(T*q8C`^ z^wItys7s~6O32+(`5qZiD5O`tpVgYSCk`d_qsGYYfd(u0C&S3jvC78Akp%cy$Ri^0bP zmPg+94>b`Fi@{0`?fm{eH2cAdbZS(xB=X`USf1L>={=AaFTqMo%`ej!wdf;QNui1N zt3mSvtiY&;CJCBmU?n-3&S{Ys3&4uas5bmQ>XJKH{&_oA#~>b-gGGgPS=A4kQ(z@J zHQv1lHRlRg0opDL&Xz(C16F?O>%CjG6lGBk&nWbi|7vKg+TgiIJ?e88nzmpSI4L6z zH>0R&6iM@B(PHQ;%)^Y z9`=CcALMb<9h!$=**Q1Onu?n93@o$OHo#VIc#+(%gmP2Mut-BRs4=lUvU6UEaLjqWFLER=DgC+|sPv>UK@=$Z$ zf@SU6ZAdfZMQyMG%jP@{DI?EV0lERHqk=k4OXD@e%Ct4ivwW!o9$AM zqAtw<%QJg-+1ZGPb6}+eb+>ke<~>*e&duwOLd~Hh#I0-hZ#ycXhXKoEe&1+mcZ$-0 zl@NCB-3(~7VEIRzFDF1V5v&B~1Lk#@vrdB*XRbA`HRh?sU>v z2j^EnUR(z&#az3(5p`)fSf*^b&v?W`Dp;D}p1tcsQwA)vbNbEGs5w=^igoSTHl`YS z7_h9<;_jv6?e+)DHr#*nZDmy%2bVC2H$N1CguZciiItP|tj_Go5#6x+o3W7Z^ z@yH!_uwpgOdT&R~=>wLRo9Cff_0hwCm7FH`KUadHmVuQLKBxL2Xd=LhiFvjp6q@5; z<+nb%c4Alb|6nDhHPG;Oh&`}kbCj_cP^<00vJdGyW&t#wU?ppw=T%2uh+sv!_0_$# zM-KxQm2P~~9<^u{SbW6XW6hyi50-h=^SYMEol9WZIUaM#>W2Owtb#QAdjY6RsbD4N zsKvF2ho)fZL;CTbp^?C%T(Ux=P;TG%n@7A1jY4q2cn2Td|qv8!`ppF{HH$m)y6`6Z*No~Z#FtBVxxwl=RSpim}OD;bXHD?1@F&%Z2dLu6` zgT<$+rPjcI-2uxwbm6^VXdZ)=ygIj|7c?d7h611=7QSrAdvK%i?@GSlZA<(~BW@vcaOF z@}8tXV+)qnX^M9Po;Y1j`}n<=no|)C0@UX(}CSMNu=s^2liJ za20i_4On)0hrL64V_gWWxX=>G3N(ko@^O0gG#xePELis1LA?{&p@#u0CiOrzgY#d( ziVa<|{~eAk+TyuKy{dKznwDV2I!%kUMP4ig%Rj^6dS}d2UBU9q8|UYTc!&ZkCA8$k zM$nuGE5<2*u>pB;11!CEaGnG5;s;nssRx5**MYY!k7pFRw7)MjWx3SaBIGCY(TB8UR*6-uN;vEKoDS;=`;;Z-wR-SV>N=>)b}oc>-3T))2bO z6+KKhe4eR??BC(>R0N9(TlVHDG&R6VjCwuzC^S96qMTa1R;{Sa#8GV`OMXf#u~qv)UKr-4U=f znXP-+W1gA~mi5btx&06im%xe%E0be|-1!Wa-uWHX0eN9L1aa#+WK4PFMJuoZ%||SU zpcb_UD>5wjdLL*ygXJ0h&KX}KQ&Yf-be?r|3g)Z}VEJS^X2+r~1%jo|_6xaO8Z{Fv zbC7lL5@nHyuRHDF>E^>rg(l2YMK=63j=p-G%?^4pw4V$b=+l`hgV?{r>I- zXac}Wbe=sp7kPIZte8xvpvs-GCJ9zx_M}Fg5D%GPSqGPG_#K)GU?n(z=+GQ_Q4=hG zSFfEG$ctWJWtoq;_dzWZz{(E`E#(T$P_RtVAEs4=CJd~6=Q*E7V9rVd%WQVGUxB)` z1+0YZ$rq2IR=);I8(c1FIyAMw%5whrcqeL36R_f3%N@v~(Zhgcn|3_?0{qtqu%g+p%#C?m8~s06)@hmxCAwox60EH3DL#&fhwotd1(zTF3YunM z*=h>r*dZ@mz)Eo~KVTU0!Ursmv=a-os6~^&@(d5}(Gr@OU|GlPsyzjoIIui5^PF`E 
z{V!l?(_BU^LR~rrmTk^dZZBe{qz3sOT!Hn0rY%?=nom{NpyqS|i+8K=_CAXq1}y)y zlXi#Uzvh7z5FU}e8=9qHX=8Su%7NwpSTU{V-@VoV{r@10KP_j{C7x`7S`-47DLgW$4m2yl@{8Hy zH3FK`U?sHe4;1{Bx#v-h_Cl2No4lvC|-E27(o*`7$UNHOC7q zAGeAvE|)|P16E4fsUf@Jzhb~LhcCBZ4^1psu`%+GN6=gctDyCQ3W-h7|AUpBmZ#_~ z5qn_8<;=*egm`EPmP1ITj2qB+gO#HBTEHMLe8GxwtCZ}HyjTwwpMLsVZPcReU|C13 zxKIHaBUs5X#*G7^c?6cdi2%|x(x zmv7OtQFCU2mE>kS@t8Gw7_c)eD2KJ#UbqpR%A%!T4$h{1D3|+yVE<=oMm7YxK%Fgh`cxmmQVWG zwiR&xBv^hCt1Q{Bs1IOytlsyi3p5|V3Upljx*_tS8(3cH`QfKgmr7Q~J#%NBT7Y;M z1Xf~5mBhBt1cBw_^5f2U)SOjd*}GRU8g093o%#XMyTmQU_%ug?upGr`IasXBB&G*Mv1xKPucqULM{ zOYdH_XMN;FGFVCJ=Vw~t{C%*pLZaM1;y4YgxYbG(4K&tZWjQWwUKM#U9IV9jH&^0O zm+FBPlRM|fWW>W1u=n6wnbM}H2=w2;nV-@r;U=^fau$|Wn-WDv| z(CGAu(7XgIWwmIhar?!5LR5D#Hs1%%dUR|A^kVA;3XpLhf{=OS2?wnoFqn&@G` z@=Kj{?g9K4T>;N1w0p!AXo|PPbC25paWgc|VEH=*SuN3`{|C!6obVtsn&K}VpwWu^$aiKk$b%v%Q zSUyn)qN+gC39LA$;62~m(Eo!KknzDI3Uz4^So*wx*#i*|JHaxC+RZA1+_?u1E>#$mt1EI+TE7|GLhPud$e6S+5wb;+~(Zhg6ndffl zgj!?|79ZAYyc0AIU?oKzx?K{vGXyN&IrPQLuIT^4GG`PlU52_e1+0X;g@?N!9!`O! z4Xd3%A$MMbML8$5+kl$$4Xi|M?d_A17mdMkFwb*6UJBk8EKOMNlKY@(4^}}`!j#3( zi~~#K92PhlbJhv4tTR8=qtN38fR&ZENPiNw`Wje%VReL=(0m8W!TIn*8{|cap@>^; zolZH8(Zhh{Wu9+#L@jay%O|YQ?gr3w2g^SCaJg^L_=Dx+9Nxg(6a7C}+RV>q{W0_~ zuxwv0{^*Q&cnDT(Slth=p(%-Z)64nDtZ>wvN?_T!*3B4>yyy&8q&Z;zUihy*V8w)a zc8i5Z081ZzM41N760l;NwT@>gtZ$YrG$X(Yj6Ul55}FlYB{{p^Zil>h z23Bn5S7t2cshwc?zg+6r5b^LHEGqoomfq0R2P@I}Si*eNoEBgOxYn;V5P2~atbFsr z^1J?njtT!!5IJ3?^*-qsZ(I(X9rHHf_NwcmVfYlb4O^jVA*L-eDFif=>eA6 z)qZFfKCXvda$hB8h$^~20aW|0clIt7RUKWumZye zj`)J(7_dBIPF_id<~&$|n$Dgzkr!3K@=v4Y_ryH)5G>D}wste{b{m0}68x}63up#| z6`)CM?Twl<5-hD-qYci;ik?gXJHSIIbHsx4=r!ba@qx zIjaF!acOjobEr#MUVn0GJUYG$n$cjHG^cJoK+Ty9mY-YWcBYQ#VZbt{ zElc@~+}sUTR=BVw7aA3;xR_HVPD5h`i*oE5ZjHQX4OU7TbG$v~srO)+a@r5^K|FK< zOB3=~tPRanu*{m%llvnt=7SaM*5si*^1=v~b$Z~8DX3S6z_N|dyN-nBBv>gir+2l1 zCJ!tJM~|jKn6q5L;?s+X2T_-pD!6A(hi_&MH4`kKkSAZ(LbCuYYnL-+ub}3Hf|cyn zbk^oh=wZOprUzBbwTHI_%Ohft`7tzCz~a}O@rZ}!Ggw}Z-LBBci{4;q(k;F;!#q_6 zEbH8k>jxnohJY0llDf7$G~r-rUCx?1AurZ|MY%VtTm^Y?0W81t-~%I2i*A7BA2E2o z4x0O5Y1W*5-WVFr9(BO6`?y({vkYMQq+71tsYUF8rOnm0y=jG-306{w*(DU3bzu3q zoa=KOHD?D{4(`npR&+xT16FK$$biR9;cdZ+i!ju<4$U*LeAb*>x(%8NU?n>CDDf6~ z=L1$uI_qNRjx|ZJ{Bm6%bVoe+gH;fcmSTb2i32Ov<@`%W09eT`7sB?U<|Kj@>E7bQqTc9Xz@jq3R$PPsDgeta6#p|8XsGsh?yLP< zp$(y$gJtK`%Olec{XbX*>HLhcn5W#qO3rn+ZHIVR0hT^Aqv8i>PJ>0YNovvnd2to2 zME8thKFEu&U^!%jx9EymWNCxk3?1@C15GKgtfOY$oC!@kusof5KNpb~5n$P86t5VJ zy5tEKmDlOee$?tsU(nfYl?V~!Ab~yI^a1p zkHGS3b7`zC^5QvIcAcKO4nkg31S>Kla-s&cs3usZP%oE8&@=$cFKV{+IB0ZWnVdZ9 z^+sOo0xK}%*y(5ZY{!A+mDlC{cErqKu(Cp*y&Ve83$WsxE*D>pn)41UAMG>qjpFEG zz)DG7URr_wY66xybXdw3Xj+368#QNN1~kLLGCTD>d%h9+f3OlVjt?7J46z4RWM0=* zH4qQy!Lki|z9I#h4`8J@U5P8MMIY#exYa(N-yM1302XgvVQh$6mT(71w? z95uJM2#p_D*3SKgcSK&C0V^xx1eIcm*aIsu&!bf|;^7up9${H6`a)9-^Cs_nwd+FE zoYG(=X|rmaDUEOgOJiQya|`@e7qGNp!z-?WrZ-qr^t`;=(98u(>)fA7z?_u~mTl(A zO`ber53KyWZa2#y9?W3*hrLL?2#qyZ8s}@zKBMMT1*<^&;&@x+MR%}#%&T&$;Cz3u z{K7^Yuuml<(u)LxJ8lQ(I8mw67 zf!bY|v$DbR&ph?mwG?6xEYFuctZBrS_^sc5v;h(({qlXE*%Cd;APK! 
zGZ7D58$6@%Tw^O}oWV+RzHw#*YK|LNfv&l0x7(tJ0jt0qowT?Iye(K%_~;eWp;-u4 zV)Vi>hoDiwqBO$1wJzxY4H$psnX0WYPn`$L^s-m)H%(A8!P121^^~FM1Xh9bq8lE_ z3s0~TT=QINATQ>DWuF!^`Wtd{8CbUApYHfU<5wKdJ$e!E0L=-o95njs<&hUW)aqtnnSx&h zZ-8b9SbEKpGHopJ9RygmZm(vPL0-gw6`0ne!eG>*EnxYFe<|+`O)yxVF-yACf+hv5 zNX=k_1@fXXSOID03)Z79<$Is4$f~_5t?ye1!|V=RZ(+hfaT$qUoo&Y zdKj=0(t5_z?9R#AE5BaRj{(sdiCppT67Gol<;qT`$Mx0 zEK|&~#aP;=-hh>_5!bx$jQ$@ib6S#9IO-Bt75B{PH>MZjVIWxAkT;__7LCGSYvty%O`?rejJ+1U}@I`51Ru` zWw2r#hfW`gyciFbU;5?p1^8^6f#s1qu*)GW)`h@I4td{c5;QVc{w^UGOCc{3z|y$C zZ;{aqJq%cJ>3y#?MJ>t)D<*=z<_Jv+Sbl3lsQ1v+2P?_3>|%H1#VoL5)32z0s7r0Z z^3T;hZH{;d0*eaW*>4~;N5G163H4cunsW{;ANLQ*2IR$Cuu{_d`R=fRw*@OHg7Mu8 z%~P;q*Mx5Jg{Bo)1&-zF94m%=2P--KYTp<5Y(2n=%M}*KA!ee$atMukeHEGuV5PW( zaX(OVZh{r#{&C`M(!=z`=UHUGZb3vbs|LuOH(+J04(sp?8ZB5w_v=y_*YbxoMeKo< zlB=&%3-PcOtfKpM`8MG@W}xPzf|cZ6P`VfLq7+y~_v_k6TI|^b%@43_qrx8#fu~bv11wI_IP2gZb^e|u*-LLBxYT4Zu zxnorV&pj$)&OK-bgH?3Du2)8Kv;LT;CW58O8$7HW;^8=0MfdCaw2930LCyIDmVKws zJv$&T8h}-Fziwm_CZeqsG@trVF z1%VZlSE}?x#6t>LMfd9_wOLVnFha9DSh3o#!OgYkVZbW7UzZBw8!UvT6<8@zD+akj zvjD83`*rg(Zk0Tax)cRgQeJ7}b4$c6Sijq^>%8*RP1GDau#&ak8pgSyhXJeTeqD#K z;^#j@qXmnPURiJ$nsBg+?$@=;yuI5I^VC+b3i8VI^x_e>VEt~tuJbCd9>|NPU{S8$ zlk7U9hXD)!9s$|vz8*jMmkY9g_n+0eB-KT>QR~Sj_x-fxyQ9ryvN``d+3H@AF1pj# zE3CxGDY$ztj8YT5D!L`^z7C@-y3^M)^G>(DsAwjP(*C7&PD%+xJVyDuoxaYif1E+J zYJ*Waxc(TurYpK1jI!uX-^j3%7P-(22FoXUwbLGG;=n4p(>EaVZdQHFaL2*YzbqTv zw-h42EvkIcoxaXd8`~kHx`O59N^#|oQPaRGy3;o?%xdchXhwq-6CHJ@4m1bAD!S9x zlzDI2CU>l6f))6(T*FJK@|D5*-A-TU==KLttpVEt~VuV&5c z9jI0lz{+>s-|snt)ex|X?)3ExFB5PLnsBh}W4hP+1kGcxithB)K7BBvIJ(2vVA;O1 zVOyg^(t=fVr>~c0ZIznHsM%oIxgB^t)Do*9U=`iz`~Q*l-a%2bZ{IhFnk?Op$!^46 zDR!EQii*1y?5LZF9qT3p6*~$jmrAi?C3ftPpkl|8Y}SgMVnLeN5zLDHIk?|>?};TKB%|PZIu+*k6U0Ptb8YbEwUdk;FoTCtzm^U0a$9&H``}-s4MivR$xW0H2-8^eum#nEnzbT2o+UzjrI%15Tm+E(m&|_RCj>7 z1(w?ME%e=AzXWDZF|fG)m*2L6zNil@wdre3=1D4-Tgxz*3vO-pRLGRY0a5u+W{O z$DM}EBw(pcU!UTwnvSqKj0BeJ$7TWVU{+rTmfG|U^c!E;n_Kli{DG|U5**3=77BX{zr8a%Ti?^j6fVs2^Sb-ni3#Pyd={~U3rZ4R`mO5Vt z?p{oR_arD=N_2QG`y9gfTcEl(S&>4L&yvO zR^hhLM;qaP-V7|Y>04dABhCp<t9hlWtV5v=C8~^eCEuk;E04qD_#)0a# za2f(Ewdw1VaKDcMG9!WIx_x|vJ@gI-EVb#|u4HGcH87XX0L$i6t8oKhg=7~1vsz{P zdizhv-vu+L53ta_H~TTr7xRFnHhsera$eqm%p_n1Zl6#u1~Mmsr8a#-OLpD(1Mls1 zV0nLPZM_V$+7(z?YSXvp#KjGvFNOfir?2?>Z70|r0ZVQArX)OA9srrSz@oQLO#ca) z3&2vFzVRiyr!0cGbRSq@pW1Bg(HBmXfc1~*+tcWl05fMiu)_OF(Nmx=)&fgy`pO9p zwPzr+7+BnPeX~mT*jB)M`y5!wpW1di-5*Yrfc1~*+mk$32l`?Luu}S( zn-#zsW(%;?rf+q^ql+GpSp%$`?c}Eqka-L&wdq@4vX=|*2(<^S>`(0;wH*j2O2GQZ z^!1^}u7H`t0!!{IUl|2`u@6{k)3;sX;|Yf$vl&=6DOAFA$h-!Y+VpiRP3`j<=8_dy z6`$HiaKSJdfc1~*>oZBQ!pw;VR&`$sc?Z@or+}q4eM1wUd?%oHIAD3FOzQL!GM|8@ zHhqIjxtHU;q4spp3!gjGYYOX}5y1M#^z{kbVuG2I0IYWXZv6^^zPJc1wdot5_%vlZ zWKIGrEG6vrSjhYYmfG}+R3;DpvrJ%R^}CyQDiEX_V5v=Cx1_uUw$M9|fmM+*H7y%5 zEr6vqebKUm69V9Lz9+D%-@p1fqb}4fu+*k+JKt%pBcU&@0js>{J71wtx4=@HzM;O;Pv3%>a}QW< z{qHy72Eg4BV5v>tsH6g+5;C8F6|!S`Sq5YRfTcElql*rwc);oW2w;VNcl_YP zYSTB~cgC>(&=&>33htkC)(!eXGYsBOmFb(7^nBzT$ovFW^o|*E9UwCpSZdQZz352K z!!Va9V8wqZ%FS_tx&@Zn^v&{}`SBvmoN{1A^?xvCqaN;#084H9<|e)PQVbc?5#Ey> zGg}>i%vfNlP2a+zw5J4|&d&i>)`#LngJGR>0a$9&x7;^;LpSIP1z2hQAAZ$9U$g|4 z+Vn+}UnUzN(+F6FJHl_^keLcBwdw0xe00Hfm`hQ>D*sT@_&UsLGqBXAubbbjjwfN} z)S3!?*FUz_CmXoa4=lCm>y!M->j`990LyhJ_TVpM7+|SQ-@xKy4P6JrX(q7TK9-*G zgLTehV5v>tV87Y7T0>vB01FL@+cyLHqBpSArf+z1;k}`d@d8%hPMnT{%nD$sO<%hB zxDXF>X%n!5KbDOaU{=2dmfG}<@|!bjFU%Y*uzZ5n1iq>Rclv>)Hhoi)U(b^u696oF zr`F{QWYz;qZTfP>Cq~*1gVRi4MSXlz)e+V?pMa${ebfBrqD`PL+5;;*XziHGy? 
z#mD^p-2^i4fTcElQUCb^QRs{Qz>3PhRWuU*Zx02Q+Vl-fDA9XEW-73vJ!aoq0Vl2J zfTcEly-ULOK83kt23A&#&7Ne)d;^x+^!4#Kl}5wN2?17G{_Rz_VI4IASZdRkPAGkw z1{oSy=^k_DeS;di2`shg8&)#8*KiHg9~f+VtfT%Ho?t?<@yap~qZD8ctvz084H9CYMZkZicz^8dz?zwF*{2rXH}= zrf-Tr(=q~P&TL?z1$VDrf;Cbku+*k+PQshEJ0Y_USgxM)j+aA?y#$up^vy1rx-<~# z;S;ceW9^pwfD8ewe@x$=5!c_r%vl7ikZ1R(?l6m10ZVQA+9Z}s^`UpR11r#TzJ4;C zz`h5T+Vrg`nTDN*x%3lQQL(j~vXJQrtba^jA9nIom^rb)ihg$g_eoeIB?3!r`g$k6 zoxUD22Z2RVP287yF*|20!wZBx|Yp;yaDQAFtAeM>>mw;%wk}vOm6*6mpr8a%Td^@(sp)U%7m6Q8)?2mr1{s)%Y^o>s1=g}WBHcl{eycYIr1AQ?H zSZdQZzG!a!6;KaTf#tf|v7QfPHUmp-`X>8!k~hQk`XjJx@}4OXFt4n@Qk%Z%N&D|y zgp3ogD!dlG-V1kq!hoeVeY1+@Wmwz5%mh~8YNzyc$Z)_?o4(n;9y1-GFMa^aJ1=+3 z`$4e&_k^FRGJOk^4$SEW84OtN-civ_pfBbDOKtj=7tIebK|L%77QNbeczej41eV(L zt?>1%8wb~GWEy-%d3jx@!Mt(>mfG}nO+ILU5;ARp72>_PSu))9SpqDz>FZW(`uwIN z%uHZ$t6e_rgUkhBsZC#ZKd(c-o5I~tU?u0}KYTs}*8jj#o4$d`hmN*}-suLcXzwLg zQ0R-bz*3vO!Nqi97}UddVCAfKO~4>y2A10N4e|31S_pfq=D^C%+fbMdCD0LAYSWia zJ{-IkGJSz1cUnrWg1bIjfTcElqly`iJa3qpz_N*{*J(3k9s^5l`bPV8F8)No-B4gv z^*^{Hht5IBkqiVddL8lcTD};4$wQV zfu%Nm)BU=vp4lAkh62kyKat?;z{~`e+VssyPFs@%nTfz^=dpZO1oXuzV5v>t++udl zt?qC~8CYR44Q9te<`b~grf;EN*XD&S;ch6fLh_R?cYs+m5?E@}*Jk6<)+qE&IIu!J zR`|USg1slO)TS?55^3Kb>ftu9l4BZbenaLbu+*lntADrbQ0R*>z>3aKHXg8tnF%bl z>Fd4m*v)9jEC5!#$I85^&=)eW)TXab$%3O7{ou|Vu(D$sr7eIA3fE1w=^NG>Of;xLQm0!wZBMkO4d`W-T>ftBSE9rb)L>^*^{HhsfO76x~NdME`}MNH#C zpCHo+SZdRk_V4j$F!Y59ScUnUc5ZQjnF%bl>6?~tqE-ZCl7Ln2v8u@!=!+s?sZHOM zl0|P$K+RMF%RRP9*$l|E0G8VH<@|f@x$XgXLxJU5u-VrMX3-j8sZHP9gp>Q;L1s6w z+&sIshD#hgT>)Od%Jh{>qSmyAdiV>hkXTCt9Dknxf9Q+4zhKD7h%g?%8rewFFlu5`)G#!wGU zft4P6yK?|!^uYSZ^!4#u(h2(F0I;&2ZEL&;=G94HsZHPT#Pk)TA#(#*X`Vf59YDY` z2bS9O4J}yfc%O`M)BkVnar8a%DN|*189tL|HUs zpCll&0$Bf;zP`P!&d?XnfEAUybJKX3SFeGkHhtSAW&Y>^nQy=f_wrj22YV|6u+*ln zTbcK;7qww#0*i~w8L}NR>w)!;>Fe8P>)O6>Yz?fm++Cd?Lb-nemfG|UO**sVG-PT! z!_4vW*ZqODbSSXYrf+ar=Z|Bc9>Rf@6Zhb~3-nG3u>LW9eFMAMKwo?TR&MU@96ijU zpTJU^zVS(CecC~$9t6yLs& zq6We}4q%~qdrUVvz|8c5pQ$o^vy#p|;~+x-tJ@>Fbt! zzR@Pg_y8-^yU&h7SW9mJmfG}nFYZ3e2kK!ru+mmPnOO;$JHS$#zU};i(u;<|Jq}={ z11+OKtiFCtt|4hu#?gta$H0pYhNaJAkD&eM5?SpczmzM}d{Q`f1%ckjVp9 zy2|to^|M}9C0ZVQA#`}FU_JO_#1Xgw4{&fV*qQSsYo4#qu^Cr)POc=1rz56ZH z!$&0mOKtk57yAs{4>fZGSUxeiK}N__0ZVQAX8BeBJXaUaG=SAE|3LfgaB&(7EVb#I zn>^p95HjVP0>;kiBDzMb1FS^l`8VZ>uzzXvS%Jhf6xC<<`>FZkJ7rPni;RUc#V)9oHg^Uxh z)TXbS{}11z4sfOctoZyxGvi_ZMFUH1`uc37dq058T3{u63>cN!2`qD9sZHO&691Nt zP!I2bCC3yrmmq@yOKtiF`~S>s4t)^`tgQUQb!}l5EeDp`^bOz0yci6bEx^k57+Bc} z`r-|+)TS?85^y~h>fsx(s$-sK_l8VcV5v>tDF0uJcEa9j6|lwjRWP2a)<7WaqDDPXyK2743G7q)%j^{Y(ZoRZ!)3!xtB0V_22S8^t|AD18eFGC0jPC>)8CcPt z`uQ%<7dWugrmuHtVDFhw4;_J(6SH#fq(x&kb< z=}RXrtUd^tC%{VgG&sfkgJljZwdosH+V{mPPnem&Dvy0#umdvT!1~AZ^|9Toz~1UQ zuqvLBUV$)+?f^?|`f`bjw(FpGihxzGNqxvH z0MNV7`I1nszV5v>t?9%@D zgRZd00akEaantpXSq-dzOkW?n2W8FSOaoXUxs&E~hFMewEVb!tleGAWGxW|cU3Zdb3BH^%mkL& z^z}|!61f&K8W)&3Uc>*J4Sit*mfG}nFB>rax)1DefRz?kI&K+cb_44l)7Q6-OKuxD z(*Ra_?&RYwU>4bU!_QQizF|pA8`wbaGzJ#u71FMF5d5|wu+*k+NZG(@Kd6Vrz{-s) zvwnrlQDFUJ`uf&A6#{)>Hv>MS+$s89PB1fpr8a$&la}#OkZB359Ip}PiO?4dfu%Nm zqssPYWZTc3L4SsSEYK8+=_?n&lZb9Y&u+*lnjh|-y z1=w4402ZA$y=@(sMcsgY@SLtpF#mfG}neWQ=mK|P!VR{H8) zulqsfC9u?{ueYClhpx~UJ%PpL&5)MESx{eKsZHOI={kQoQ8WbeBC(K=w6150iC zh87zdY=e5Z0Ib5*yQ42c<~^{~rf-;^gXK8vt%87+lQ(m^71of$fTcElqmx(NeFT|l zz{>VUMznyw$O4wy^o=hb!aGAfn1SUQv!_`%$W#MMZTcqrIfl1_z8C>4oBZ(FCYVpgeY1*(j#vxz@EBNuF?+9^fJ_~@ZmLb+Y(FPt z5A3ZdV0q`yI`9VeU*W(~o4$p~F%BYRRshSrvnHtl^uAkz$3YG2pg-(^4|?5(1Jm7G84b#6mU5z2r30P`hH^kqyq%QPD9I&$U=SGi! 
zd6fh#wXaKWTvP4|nM1&mJsjsQhrQKzU~wv6H>#voz!L;!W^?$AV)y$cLBiVcwkj~I5}8hEnT-CyndChn^t01Fcj*c zF|fR259A@xI|G5G_I1$j*L2=_REO<4kJc>A~@~!kT0E!QOe*Bk`<1z3i);u!Vb(jDBmXP_M zM-dM=y6kHQIBWt|TF}ACJK#0cJTkfbOWE5<7?d@SB1Q)sPF)AhS@X!@z+D0Jcf;Rn z9xd!v`6m7&Z0c$rA?!2YNN+o6s+vay)4O^(HG;p_JQ6s#vb^mK7)&*f{srYf+P=}c zT8%EO-Tl!~{WSjQIu)TXWo_c^!SK(j1JB=gkNLa)_*p9&DR8k!k;bhr2*DYw$83J z{L3j@O~pv5<{J*`bwB^S_Hf70Awh#Zd-QdUX~>3*T;JXGx%+)>%IIlB>-MxXKpHn1 zvHAAUiks15(3X-Z4Qh7|?u@iP6KZ$Osq)UM!2_O7qc;aIVI$y2{qM{EzkX-H|NL+V zcu_U)YUX#_Mpxj^&)e6(BI@r9$Y1t#e9bRZ+Q>D(O7WrQyAB@L{1Qbl{K>Ypt?m-> zQnyC#9PvUICwoUcN3bV*+x8do4~G9DP0 zBds%XjLYneB4QG-iDF$7k4+HkIBk*pM7-B+mU~CM(`}M_Ij%z5XFN7WBkeLC8CN20 z6?@ZJj1o1bGZ-nNrp)So5ubG1yR!PFI=u+v&4p`Y%E-CV7iLUto2>DPaYCc zt>a`v#5Y~)mm*^#(j()wF#+kGQE1#?KUZvGx`E9R8=J0Uv&BYE2j$@rKXnJ>VG%!c z`{kjI8xh}(5@RyrlTmC;LV7BgNy6rfO-&*;PjoZgtR5NhM|W5r5%F7hNDhhkg#={0 zF>XfuGs=vc5WkF47YoLST9b^?B5pEcCOcc*QF%0Lqf3)VMf}wr`SQ-V4e6cn*0>ev zl~HcoVjn3sGu_5mkucrDB1E0j33(iAr#mi>Wozk<$zvRMAbm4F7*mkIjQ7UvNFSwz z=^nOFY;L-XEfC#Jcd93_b#?v?mr2Kc$l#2xMh*$is4}J^gOm=Yr`QUyz3Bxy6UpzsjQ3c z+?OB5!^p6V@5V#O(2Q#1LHk(I%T$2Hh@Pf=Y_;g&bV;7cHqc#^XR!5k7v$-VN0AX3 zzl>=}NXAd&5oEa1#q<(eBX%~uz~V%2)AQ=tY-8OOc^2D9cUcZ+8zQ4K{u+-XqcZ*& zk0B#7e!INJ{u8^I3bA;xtLYWC)~>1Unmm_v(`CzZ*e1HGm9>ngkg>e2@gy>aw=tfu z-yrrh6=Ung9;PB}o!H&!rfgzy-3@s@i|MY*^Bgmf@qBG#Iugp;8BZhQ6hBiLmMHp~ zO0fjd$5c`s!4f)2W>}q0lxbFr7N0(|c^Q*xU3D+a&gKx+5=QTk3Ah3)vRB zTk-}&dnZ50DeAF7wIZFKkK#cXTcT{((vg-qjJ zj2DrqytDBFGKF_?`GV~bgG?1zirC-u8QX5xUiUy=#_7>&bmA~mi5-<$}y~$?pY;fl#uy+Qy7EucsJus`-9?e(@*SxIL!0|+b<4v zdM>YJyXp$$HEb7Mz8vRhK^R_Zlo6W8jb_B8j5Phh4vQm9zp+DNi0N1LI<|-IrTib; zUH3wcXS*Shd^6*1gyjk2EhK{1+1O%7#nH44mL`re{l$*h`RZQF8(1G*p}d~$se4t~ z!gvo^$Tv6MMHcYx#yj>W#BsD8c3d1w*TRm8W1LFlB-US7EGM#lx*|Ej@d2`!Z)MCu zqWG4^`^X|?0$mq7C61@-U?;^;x_0#@wzsZK-pKaSmCDI%0J4m4YkY((<=Yq^B1`zz zHX68y8EF(t7bnsPcG|A5?ybCq4b+v(o7q0PHC(1bbaiCINj;1oXQ$>Rq|d|ud9^z zIKD>y4r?D7W z&-XAEA?x_=Hchc>;#}Gd%NFO*O|YwWBXr;8gKUVdT0X!I*L|z>GnOHVysxnoN#K2q zCH6N(6OCgx#Q8LaT^Hv${gRKcqjW#z!|X`i5BZSeTVx|2U@S+Hd4J;@BuR;&2}}|h zT8D`uO>3)vl9tB z`4l^Ws3o6t{DSP@gNzkO3g6%O8QHEZq1#~h#Km-L?5-F^x2on@il{4Ruq07OPG^nC zZhnxl3faXEG*%)z`2p}yoCo4Ex*e7yE~VRI_w6PV8aa~49F9B(jwLsFI1 zv?umdTt$0ePsC`tQ}qRQ2H`B9XQvZRauz!cImiz;{zMM&!;C+W{rpgy&RCupOM7Fv zVhrttJ+qrd)RQl<;e@Mvk)27nRE{+MK@RgHjK7gXe2DRv{c~|G-4!bk*U(+Cd@;_s zp?sB{Lo|@Du(OH!@?|Gm5oI0S1A8g{M|a0wi1Bo{>g()$qOp99 zokuj1v)Q@G34R=DhaBg}lC_Xy{1_Ww?6tUo_Q49p^>k0{l^sJgm2a{%;U?c;O+=H* z31nU56hEG{6noe2-m1w2<#Q)kDtn(@0k&i=RrmAm@~w^g!&BxPu;meH2sZpz4S03Zjkt zfCWuT&S94!m-v}v1LPt0qov+(i$>9#T&ST>U4>^~OMQ-vY5=U_xQzRYve8;MYeLNEuN+);&$RGdIDZcJn0-DzhO5M{&E?+iSUz4o!TQ0 z_+?}}B!^#0wngqMJWb(s#SEIn>xk*JvHBgmjp!}EWw#Q&oo`TzpXPkrN&+IOuzx;{aN%WIHI(0_!_*l{#$>n26FXWkWfu4aoi|6U-xRaPg zPphtE_Ywo;FYF#-fLy`uMxOI)$*xENzlQ9BCOd_39Bp`mz{o`W|Kuh6sc`r>8hA@Vo&0AY}=?0!No zD^9-1YkmXigB0@X$)3n75B4xIO#aRuB8JM<>_Mc2Pa^%1 zVm^`dLyGtW8wPJG-lS>VO}s&y@FsRgi4pQIHjN08f3io2;gy@n-bfk0k?e((^2ua? 
zgH|-tERKs39f89oz`Mdr@qKrehV3hl=GX(KFAy87QGNBL<_wD zuUQaAey_!zBF0*5*^|T=iw%1M`M_@{`y=o9ZDc>>9lzCPG2UFfLr3B6;%#~n-pnq8 z7;mY~rW2tSJN7g&u5u?i5c$OKAO|2H`4lq9p_O=_UW&IA@6k)}7UErJqXl6FVxpxk zEMX>C>NpKXzVN%rV5EZIMGitfD-Y=vcpLEny&P{X=FrQ&+p}j0%A#S<5TpfVGm)=+ zDrrEf_`ReaspR+Atis!ikLhToPfWHrvROo!#eqFXOsd>Z4nwT`K5{6c z@EkeBp_BNGj=?*MPwCZo2l0vXG>Z#+k(g?6W-ky^EKW`%knj9KG6bpS50Jx=Z%RJB z2KN;6=s4U%%mwqkK6{0jVX4PnCZ=0l*-OYT{s=h=`NYI$avbtkDWVhc9^z{{0q-sr(i^^GtVGPWG=&Y^ zJc}E96RpjkCMTkH{3&t*T8lqvvk~_dOXy_WM=Yk3@Sb)Sg0W~>nV>B=YbH#UJV~K- z`3#ao>+tEM(IG&5LvP0Y#WH#m?kAQyM_QV(w+Yrlu(yZ^i_U2>s^K%qFcjqlauSLt z@91rKZ}Ba?74IdM(_6l`VDAwNEzQ}x!~%;udk1yo&yiD62mUNM1-0kT*zCajiXZ3{ zJWza3Z^!%CJs=iaTCq7ql%*wmpIB6Rft-Ol^XJLws1u(>PICwnKhwML{^BQkC*Du| z=)BC*mVHDlwX|U$5=$(too1o+_)BCs>dIdvXQD1jCA}9PD1M>$-~+@8diVDZ>{DWe zr9JzESZ-;@K1LhzSIIeO1O5s*8?Db@w%Lad7QfOQ9xPVTsrVqfJYtol6Prs!TRO7O zh?SMs$@yqw{u((CZNz7ja~+0=-{=FlLA28QalNQG$6CDD0wTuZ$>tNQEgnt`+LXUZ z(x@AMgEXN{lppkAe3+MK^bMxpNfZE_LX%vy^+g^!hN>67>v$%a1h z-H$CJ5-q-LDUo3DVN1|f{C#pM+LFIVE!PIA$5+v;*zSe;`xPK>j_s9qnUn zLEpm{O3mrJ_yWnDzVm%TWL@HvWqf2E;-n=svNjsTewx;IrG7=@yEfXUV;&f#txfdPCe;!C8~ z&Y6};kq(4lp(5>T%HC-oI+*`Ta%eDLMW&*ItR3j5_zI~#{RCewwWA+@pAzXzoV84j zbRy1J!Xh2fA^bP;0BYc^ixfDyb8li$_Zx>1X)L+VzO@mT8f$M3!Z0qziGb z@&|bs9manr51~W(YVx2%tmH)(;4zXXosX}UJe)6CW=1w3E?Q5+`n0=0{@0b;~@b40OCun@mST1v~OII?n1xm*I($FI|c! zNIrDQ_lQV>kSt83uI8d2sYT)NhZN9>LR~n{nIP1$d5dq90_bu)S@Ngf;7Lle)LZ%m z3Xy8jM=H8hxyqhAi&BDyJcGi2MrJx}k^0c@@y$|i`W?PW>h<@oK1ykxdPl!VaZkOi zU+8omoh&$#S!kHxK%PSZ`A1)U+(BQK-V z1y}Nt!)|F1U4`$G2GW)IPHDj3C;Dimed=TVN~K-uBmD}eYv?SYA(@Sa3k}Gt=uE4g zw&HuG!L)+!k%H;3_AyGQ)MxtDO2^cv`c+B?bgs~Zyn)UU8k5)2+29a;$M;D?=xUsk z4D>fVRq;yA*T*TIsd@TX#UnMh3WG5=UuX)4+4BT9@}|QZOg|9;vVO8_j&ON7?8 znwnb~42q{q6Bz_Qt@KHKr{AjdPJOH2qV!5Fuj)WPMOO&z$tUP?p&j|yAyb;fIN*Xr zG4?eV!M`8%DN5he5Blv&VCsAQHm5vvmC%XIMWcm|hUc$PGkaly|igHkK?dz68x zU-Y|`0jU*LUC5W{8lf}!0*w>A$>$E2q?t?u{Gv32sgGZfrvFv+oH97|t3Fi;POZ}K zb$X5dCv+nV(RiUN`3hZYoy|1HuSm0)M)+kZoM~u(KpB$yO}}3;q+0d+6g|2@=t&l% z>xCX<5xP$3Zrc>UCe3Bs@N8)g(*(b&3{U-`Kcoyx{jNW#3{9=B@*~U8M8TIVMH2)c zvc%!0WMXjqhBTkS@axjNzrXcq%81lo`Xfq6>QDV)r?==vA%HALlLdeB4Vq+)UKx`b(i-;)+It?|236w}IHP~h+$2Ju9$E*!^C zKz9p+$SQP~Fp#W7cM1b++v5+UWlTFfM_S6X#qTSVID7pWh2k{&Oa;b%m7cVsdxgQI zg63~0!mRC6Gvy{o4qyC%{#yRNEI(JIe@U6nxmJxN|DcD35#(?5kPt%ta(FJSWxC=8(i)}#r%Z(G$Wr$__m)jHPO!$AmGqzW8ft1LK1iO6!@P_$$T4VXzFH&o$NGROWGR zRTHSX=qX`5RR=vOgi^H~N~9#lA1{^?89%&8N~qAmrj+5ddb0wHUcKb3K{Es+g`(-g zL<&JqTQ@Pi@iJ*6(+e+^l9>Se+e#$YOn*y(EwJ9AM4*|%B+3C51d6h+*#g-1#otO> zm_WQ-+RXI9-zbZ?7W%u&Law>~j>Qaz(R*sgw(P&bpHsh<}oHFaz+9QVJ7f|43QJ zwbegVmU3iQrkO{%7r31`x{F}0t>!N?5tl>KApDS^kcU1&Mppw8)x|#zJO6%}TI>L;? 
ze@cg$k@yekP(^orq4FQsP5(-X=ep`&IyXlxLL}v0a~w)FL(SG>%vk)7bd(u`|CZ92 z(e}m42Ck>RNLkPI(7#sJp?8EuR7>=>u#jqj-Vzqrj@Q~sCzw#JjdYwDhyPWQI6r-< zlF0e$OOyo8r)mk+2E8XNrdp$Sg(#|(V{Pd)Gf`_Nonj_vYe^?7dg;rRja-2Kjgrjy z>&u+mqYs2-R68_BSW2};?^}6>($F7U z={z%C>m+3{)9kC1-P|C3rLv0~sQ;quM4t<5sjg^&u!ib_<_mGQv$XZ3OH8=dRl3N` z)Ve6CoL;Xed%3~-ugV@SxN05M1AQs{M|DSE2=P=m$A;2XW{$RjbcLC%tuI}!7^<&U z_Hje>-xQ8B=&jDa=xbpE<%1Rq>#3gTE9-S;zP7P+jhUxyBxN&m?SCoHkEENwAM|!!I-p7lq1|o{cq(kH$wkQImCrjZK8Uk zWx__P7g{PLQvr@z$;`0YnoB^07H$Moj4{|KN4e4Zze*Z6O8>{XFZx#4LItAb!e*)u z`o?;TS*Wd95H8Tx+!l;<)qG@wcbXEY#6Iycc!*Lg7dMc7RRqZPs~Y7qL_`jAiroup?>jJBinlv%CqAU&y=Zg5r3bJGkiN)|WO;OsmC{Vp7& zLeOgA05u%_X3b~TXg#GoCQj=iW61q|%B726x56MH-sb+|Du( z&XYA7A(INz)La54X%Oo>W}CLR^p@GG?Io2nTO3*{ceq7{7RqgIp`p2QOXDb)IF_&oYRf;sNg!|M|O-tb(wM5fGxNDoC z9dEA9q-#UXcFbw*IOPqu$LIm4(?)ne!Gq9+9BP>(+{A_r%|vru zxUD_GT&H5IAy9eCZ87vw%DK&k-p;Eu?S;oww5FZ#h+3&>`^}y?tEJ2u=8P7$KTM`W zf8_(W-Ox{Y&uugGRo-bj3D2k)O-JD=wOZ3bcw&2AJK5~WWNE|94$L|2B;_-=(=b5! z#O*KyDId9%s(fmV##6|n;xryYE*0xI&FsQl)J`=!GZ(Z|%uW@148h75Znt5OQo-#q z40Qfa(^+^<#cRBU0&1sx*1igt#%9&=ec-R#O-axf@gxm1H*sp9q;1}l}CZo(^S zy{4=1l3J(fBD}D@rk!PO$Yg87%?+5V+L_8XZogruV&(Q3hA40es4AioH9dsaRD!0v zP)KcXoNI2v+|bT3H)gJDXPX;U95RF`-?@W^;Yu}kz%a~tqsCV#p^`N|LNS%3>G=(V zu1 zYPID~7{*y`xZ{Sg&O0=Hg%4DUCQx`!ZP)bq)`GdGU1)C3+|@2HyEAtjCR%HArwtRV zcHAk$cxx?9know>rRgtxqIPQf2_J1AXcwDXF*({Ob4%vFc9FH_Ml5Nq%Vijh);e5z zRVB4oGf?SnDyZF#%gk+=N7|+4Hq1lq5_9W{Glnp$hRZZeveqo&Dd&Bf!NOPA z4g?ETRH|msw+_rx?Fw^y=81N>xgGP^VXD=UJ7<_;b>Pk#CR^<_LxgYC0gXYhQu{S} zL9xx#t}=IGa<$Rsj?6RdN~;TZ!7$zG%$+w(vpR8ERX?c1nqk6s>X2rrP)!|lj5T{P z1=<+1CzG#TZT6_RYzVj3<1QIyT3xw|h8fOBH6w&yRGKD4_(>hn4FA@Jd8u7v?##T< z#+kjD=MHnM4Y{j^+13W!6~ioRea&d$FLhiqO87$^(~J~;+rHNRXYR%nYU9manOEAi z)+XF_!#rza?wVn)wGo&7wH9?sGd9zfI;k0xX+xcG++gm>6l>R;doV@Xb>{9BqJg$H zAC3nxT*xEufEz^a%sF|ASOkL1S$#k;) zqTOK*Vk)#L=KjoQ?RINh?xA71wGH>cu*}+;%lTTLx}uqpS&zD`nV#uNU2@!Q9>i2> zcbNw=mD-)=0ToXS(bo3dW5Y^oJMNKTh4VGdtjvZ~wkABY0d-X~^P8TrYWJE4Gm3VP zIhgtC5M%AcJu|GfcI2KKR#`h}=4LjbZfNFYHm0s?W@k3C{jS|-9>P@Dyot;=ZK~Ca z%QwVXJ-IwXtkr|d{ffb1_WaDIpft|Qbfa!M9yAYUergYxhcQ32`^`fuUKrx7ow?_R zwN`Jgz_7;IqG2+%l&qmMaTtT9ZzGvM+Qa4%%x~=>a|rXxVZF5*_sX!&+Le20_|Mu! z6PekJx~*X|3F?+6B2!n(R(sSuI>JVqW*)`-)gH0-Yip{ zW^?MUWlQ^Q%m1p+ZXU{tO-1|TG ze`b=$k+qhWU4iwj@BI{i)jdWonsUWGS}u}u**(hW%4PBH33uT#dv}LBbD7{}dfm;h z{Y&*an_u~t=(XMBdkI#y@q>k*lUWanF{kq&#-dGWv6g z-eci@T!Qy#xG$F*7Ikd1;TLqLnV;8NyXVU_Ql7i#$<ay?fe?;qjeT)}8s2v@)x);sYo|8Bh_AM4+xci>|p zjZ;3kSIUi2KDt-P4O2cK7s8{sBHr`ikz8T#x$ub8%>I3P7e14JuilxD^Y4jIb+3_| zrhIX)mYbw}cCRwVa>c!u!eh8%-izVUTv7Npy*r=P3CDNivp7kaUES;D7AfiOb#n8R zH1}G$nX8odYIr*0x9Y45f0gjCcY(itA{YdX!tel^~5^D>rVMo%FrhI^ASl`HSP6`sPC^WF?k z=E}l{^`5-Pe@GAT@&1E4>y~6P<*Qqii4@x{$hfPL_ilJPSJ8VXJdLa1y{-4*bNY|! 
zz4;veBYH1Bd!%j35BChi7t?z4yX1QWN|q^nQG9|8c!9 zpUZ#D6YJhCx2Iy<+vIjBKiyl6xmFH$TWo@AHu7+Cf@hqm0V-*yYPzC zV*V@o7`~|gvObzG;=km{;XWw$q_Vq1a)9!`}%agg8!aAjW6%Nt50>Gk%v&fxKGQ2seJBJ z@*r0muN9WKWUm<(InrzBv-ryXhx$yulK+7|gRdCjsQl3Y2<3~W!BAf0TX-wi&iggI zg=_1z!@<;Q{wMkzzN-JRKAW%Nf8;6T{zD#46^tguQ3c##V+YsK`y;%a>)`z!-o~|u zpX>Aa8vbYcJifaBsXo_zK^{dFai5n*Qia{;=HEFX8L@U+atc zI{sIlQtqqrc&eoPiad@g;l6Af;8J{9!uz=%zRcl$TzB}rzMOC1f2S|w>-*p8OWil* zNmLp4b$KFH+I>x);9`94u*}iEY~kNH%9mAN#W(hU)K~J2{2%lce8b38s=WJ_JcTOf zz9~hC^IWI9*@QxA3Rw>-gsWRDG@cfjo<<;=V7>q$<1b$unI2d^y5LxW2yZ z;lo@XpGV)sxAte~8~IlLh`xbu8JSB}cR!NnP}STI<=IqKG)9N<=gtd>s$CX{%?AaPmV05 z>bqabOQ?G8m-1q&F6s@3xnaIv!vEknU%v3~sZMz!eLLUY|5M+_xAXt-G;+U{ms1Vh zZ{%fE1NUp=95>RJKl}$b!siR0<%T11`Yyf`=F)fa9kEz_hx>!PifZD1FR!E;yWhzx zTw{EN!WX#FzJlTN+$diGeJ|e?%dGF=yI`60-F)ZBTB^DGle~s%=Kd(JrkbKf!~ST=n>-yO@U@8i2+Sv;-Wsq%WNrTdG#j%wllY+U0e`ih6Iaua;T z!dJNQ2&~IIg+b8}4(8SmxFhl=%I{8>H&U(LY4Qfw6kn@1F%F-7bsTVMRkT^=`0OMJD$FSy0Nn&Ic%B43T@ zp$qdx1K5~1n#LG8K&3!gWV#i$OMSJ&FH=Wg1@%Ap;aGua zQZts{!$8qf01ZVa0w^e}@s3;Rs~>*Lt?<8k`7qT7@)&8{Mql%A zD!0McEc}IAkCf4G@RP98`gMLHR!YAH<(7|A1E5^;F{(e5Q$FhAeXYU~ZnLjtIGx+% zYoXufr()&wTl^HPtbUW9963b|h7#qI)F3E9K0yscv9Q63KIinw2|i~MC3OZ?QNPPi z$13P|_-R;q&#zEk`3yA_N|H}gL!dl{&28}!VT%j;@UY2ANEQ77KMSj@-{)symGpa% z*ZBfG4En{H2IZi9^6#$gzBb`++%{iw_$#;7N9vFHxmY#*5kCj3sz2msN6t|rq5Mu# z#0bde(3s(9`|wY0r>|Z32e-r5HvBzx0ajCg%FoAY=uh~0Sar`BsE~Yt8Vwbc&r_qI z0;Y@G96=DSY7=kzc_M@ng|t_uTm4BV)7MgJnB#eXIi@3*(?rD z=z2DD>I$r({+3^kHPGMi%dq;MDNrf-1~nNfDPN~1K_yIxQ+>|m52yH?NpMa^n&=<+ zRaj&FJ--rbq`!m8%D1U$P#O6aH5Do?-*g@HF=r7jBz}a~2kdHl!rf$Gm>Z$yCtcCuCUxzjK%z-M& z52)EtMfpB83#wq|!h#W+1%U-U*EI2xZ}P)y1@$=u18M+nC?sjM4nL#q3ZHeY5`PDenQPh2cFHt zo%Rhlo5-E=^*@`C8pM)yi8-PXESl#oFp$`7Kx*-G=JQuc>8F9r+ct6sj%1bp7FTf^;JR8M|GEk}o+Ex?`kISGN>IiJ%kbvM@8h~;--os1ZM zC)Uxk25KaKpjJZ-<@eMosDW9SyW|^rwh(vGH{xtT?gG-y$i(l(x*Bo(9;}Puf||;o zsdZ2j`4hDkYAk%v8-2D2ciA_}h^FKBFtYIbvF=7@eqSVw+6Xn5Q>hJ5 zGx-a(9vy$S1b5vx?rd@Hns4mcVyOy78_}=ulo6fCP4VzhD>>p!3b&NgsZCG|vov?h zH}PyK?xt_T*^=B1Bw!%ChOq|Bs~BTI5GEUx2ssxIlmIzX$gaD-DQC-acYKr2mf>#u zCK(?75Z23x=MQ2%4U`W>Y-$Tc$QBiZaM`3Jbo$u}+WVJE3;+4{8V0 zR{jq9P9qP03L9i3@+Yx@Mgo5#!$s|ZIx4Z$Zm5G2L+t{6XX@|RP$MsY1{-1|@u#uD zp8Zf4B@?v|>a4_3d!bIC??i?f(EtW#MAKM)HS$5-m8{fnP&XwDbpYzBWCnewQGh>( zjWD7~ZNm+RHfBJS3OT_aN`agdAW8;(XX-_4v{8t^fQ>Q=^5?OUo`VphAXEsV6`0Z> z3iO@GIHM?k85?U9;V)rhjKWY)#X}v20!ln}2x1i!^qoct{u(yHD9&HS#v8@>D;YVd zV^D7;2Xz$crDUg$fW9;JCN|k9#oxdt872Aa*hJ4ssGpLXIsx@na#6>jKA`VJrWs}V z+t^g241WuoVw8pkDtV~W&;TWoItBGt5S0s57ANOnr#WHY)KCuvtb${ysL-^9SUFzNxd&P~{gY3=IK&Co<2d%0I^D8ddm5 z*c_uWG+Ze_ori`g`Kfb|bI=5Rr%{7{hAl9v^G~t)Mm7FPMq%m_G*T%MP5wEy&~p_UqZFmCK%MkV2}()o2IxCeKVqwm zM*IhCmC=xYkFE6Fg{CNFsXNeQr3`f&ngsezWS!BJ|BS6Qn(&{nHAZ7-x>A9<4^30b zQ}>{$N;%MX8ZG!VY=hC9PsP?7&G;`Fm8nP2Or;X_5SpP>q#l62Gd05r=d|J@PEt=x zJ{{ZWc?!)@s!~s&*-91aF*FPGorq{)ynzXZ6Z~-=w1(y@HK^y%Jf%AI44SJ{1AV7K z@-`MU2;RaZ1Lw_*+SDs(p;C)_2`x}+QZGQ?>CkiapY(i_@^bTEWlbK7WG%gIS;?Z#-@J9wi#{sZ`f8Nng5Dy@w|nWD0Qhf&|;+y z^%`0P(okfl(VqW_?J(N$Kd|jaTWGn`fO-!tQ|eRipruMZkcJwa1Q)j3=qSWuyNnJ( zOh#ks6SPukM16!-C=IC(APr5+jO{bJ2$`_GMrR=o+v7=v)+kM>FVJeG3H2FT1=3LD zH^VW!PCjQhA&aAWyF%-g7F0U4PH9f1L2H#}APqH~BR-}WPTb!)2RPZE8LcTD+NiXm zGN27gODY1=&@>ba8H|8nnn4RNrg|)hS1`(iHY< z|A=uz`<3o=HfW#Hjm`?~1!*X9-WVAlv|`v5V~kJ~yKIaWieQ&wazF=_o^*C7 zqy%UWq=7UPxo(UXN?_NFaYAwIsxcNiqV%D2L5G#zbWZ4y(hH=a#w4LMcGH+Bl)`Qp z6NHi({pm#Ln9`3D(XS1q;B|=^7wXs;JgT0C=1zlCf(GF=v= zp=r&qFUA_7DfZb|Ei}PC#Z-dsD%0tT&>dwOT>-ic(oiJbSTD4|(u{RNb1cG7yHVUnf4K^lt28M}l|xXainbi`wg9nc4572O1SudJjSL+_LoAPqJ43SIHc z#vY*yp2^rPbk0~yH-|ncYv^XsM`bnL6r`bPJ@9PCexW;_)z~L=!?VP+f>M?BbW7-q 
zvW{*6eFkYL3LCOO;gIp0kb=9715iZSMEjw1Wh31hN>es~G}O=p7Do+LU~t4x1Uf^Y z2`Cyv!l7s;5eCxGv|f02@O;K;VKAQ8I0a`? z_R`(pIAssr6?Q4RK^kg=1rGNa{}6t~y~gjt(2N6g3YxG7A|I7JVJ6P1(nAUHueK@S9JXxa?CqH$N4j#n`52-EQLF~7ojl{55E zI7vB84}tT5G!(63JP>B#m5uwtOuUkD501`&4uhk4@Eq(^LjY;0@mQFPS2G?7bMUIh zLt%EtIeH`5p zqIHZH!Xmu3@myGl*D{{LMU~6+IJk&%i5?3VRxW}x)Oal{#p@Zbge7=g^ZgI!Hs)HsCFdRAD{d!uTSr!<)y14?!Ag7=nP~hA!|pW@HGPGoH~4;p)m$dI4Nbc|y+zX=qvy zPc|$;!b!svM4X6O0@qSr(2L=k%5!=VTmz(`XglMZuoZ7>d=<9fZ44W(tGuR{!F7~Z z^isIC@)D$>#!q1f-of}GY{%Oh--T@%@933qedR5^0z*K zhMLiO3_Z*&!hXEFnOWGEkw$NXn=7gG2Dq8>gn5kbRQ6BbmQF(KHwc%VhtnSFLH2ONU{(ooYQ9Kw5<@xnp8r-=%o z44d8p6N*I#VO%k338bNENAbR94&eyi$ILDq#(T$XhubLM=xuPa@|E5SlOPR62bj5q z<9L5Fmv9X4XXb?4D?jO-a69D(y#sEmdA*lllVY0K{%ni=sj>pHJ08D zcTi*KT_6oj`yC%@<`vH1L(C-MG(I?HKioylMDK$;t8w&RxD!Z2(P3sZgM~Arj_Fr3 zAKYEdO8*9TQ?t+q;I3+BkcOHCgmd@^GpZVgn@(*j9imm($sTedK|R!Lv<%YFw2Sy? zvygBBA7vI4&f_Cv4#JFz&>`4qMr$wy(ol4qSyZ@;k2Q-3m+&!WVYsL2p%23WHJ&~M zvnmSGP_u+^4WD2Z7p~&t&0@k8Jtut(?ycsakHWpw?DP?khNj)bC!3{&8~7x%q;MUd z7;_Ttr{<hqq>r^%yq$bg4KpL9%5T9*U5+2~Q%!T>XtfA^ z86E}FP;{ADS9pytHR}kk@Fiw#c)VJIz7CI5i__QOv1&1phMEn8clZjkzVH@bZq^gt z=%wjf@IM;JKpKj! zGn)#Z@wH|X;S;{bYz$9VE714hX=-`;9z0bo2hvcpg^-4CFq;dh_O7<4f;7ePpwWrgXgN%KpJY2f{h1FLa=bjbeie4=~wVVwHEynUZB>bUw||; z?K{5BY$JTbx0=bqSA0v%TX>0Dmwp2;R_oBO;YA<~MR%I*g`fBivz_n*-)^>rm#Yov z_wX{cKK%|}s@4N(sM$$$;k(U_Vl2MP>>$SIjpbLWh1aVs=yZ6U+MG^< z*Q(7x8frRmf9F8#WPdu5e8&Lmt!W+JsJ5ar;0 z*-Ol>x21o;Thuo6cQ~jf)89ZEnx254F#CzQ@#AJ+F&BO;HWuElwr67CZE8FEC%hG; zq3CIIpqK|gWeyM%@snnMc$eCViGz2l9T^wAL+t?4P;-cwf;Sj>x`F$alBdRHb3 zyhrWAWQKREotaD^4Ndns6-u1kcV`~csepk0Bi0S?SGzOW;C*U0CM&!bq@n0}bGTRl zKW7dT^W%S*TyzeYfni4iFc5q|?E%tIbCg&Zzi5sW3*i^c5n@4|W#VB~Wf(Mi3BVvA z4NWhGUoppsMe)n#Xt4->DK-auQ0>WNheK+B@xU5LL(%Kzc(DY2%^W8d$FG`W;Uj7v zCKr5I?akzb52?LC8fs1wOXD}qiDD`IhB-kjsrP3R;bUq)CILRG_GNN|G&H>&e#e|5 zmc?(Ilf^Rlt=PQqNp&ET1fNg`FnQqPAPq(Do72S#_&sx)SRTJ?PKD2?LzrLS)9PR* zAACw3lmXCCbCy^ce`wAWE8!2!8Dd49WAaBG6yt+`SBElQkcOsL!=IRQ#H#pXbGBFo ze-v8?{zDzk6pYr!VhX@vkcOhq&G}*t{FymVtd2i5=fW4%QA`o|ygHI844+d+WB@eO zTqM@UUz!WWTKEfdfml-?%M^z%sbiR8@I`erQxv43>Gkk8<`S_k{@PqD*1=!JmV&RU zaCm}|tQ_-Avq*aZI+TM53aPG>5@ zchqT21^6~dL(z0|z1RXzGuMgD@lRDtiSGnvZpJ#_|1L(NTMYdpi;D7L~Q z<_58)K9{KuKT_u~)!>KfY^EwmL(_5GjLre$MpOdedTcHDsXCvj2|rQiF*V@FAPq&o znvzK3wke7PZkYo7LS4kvfuE}jncDC(bpc32&8=cv{JXhDY=eI@gJQD2l&KHDQkO9G z;Fs!RrY=ZB(>o9`=610?{?pthw!?qKHiF-(%bAAo8+94e0DcY9(D*oWm)MDLnLEXf zM69_3{-Ca6n!xYXl}uy!ow@>~q2^w(E0NjUBX%J&nY+c#`dX$r{7GHIG=o2?tC^-C z4NdPsWHa}R-HEK`KCv5-CAJlus;*~R!e7*NObhrkNJHb{XbuknnU3Tl+~xr|qHbdR zaJssYX$_~T8!`YIYHA`&pr$G^1QHG5=mJB)(M%#7&QLcq7)V3YdlA{qgJMs@V}`^4 z5g*$Iwp58phNE4OumRH0_*~`@u@8~cJS_Gma+rtUZ|YX29sE_@!nB2LH3-sB^SIcb zNHC9y{fOM=QL(SSgXswWP`5K3;P2`-raeeQ(+3er=1FlNk;gnC4j>X^yCAXZE~YaQ zqwZun!9PJ78vl!VMjS%qGf#_yiM-}1B$K+A>4wCqdzh|>OWh69&}arD;fp#}!W*q- zrXOHZkSywcrU#N)-N$qXX=wTgqM-SQIGiXDt!+r;k7W?IlL<;A*&Gi{Az7jKf2X1G zMa&E0D59`=UK~jjGS4B1sxbj1dJ13>XCeTkq2^_AEK$t7B#t49nis{<`XQz_60aU) zdLgJ9VtRr!G<^b5(!45;CrX%C#BoIN*nUV3^$61!$*vw|`XC;VhQ^mMZ-|qK(&lw> zB2mh`hU8X{GXs!Z>M^E2l2bhj(oplZIF%@8-V&z}WzC!7Wc?H~7)ex5GJ}u=^#n5z zq@n3Eh>GT2aXL}KydzE{%E$hSJ_Ko~ z84cv)W;7ENGaV1CKVudm)zzoW0;HPygqaW0(DWdYY+9m3kY@B0K*TOVYN;=n#Yj!{ zIkO0<0n*U;cIG#6E78{cDsCa#m^M;Zea$RG>Zq@nrATe{B}hZfpW+UpgZV?;PP8|_ zi`(>f%u1xb`j%ON)KlLu%Rw5NzMJT5#frO#PF9S#ljs<`25F>zU{)gy)%VOQqyb1n zTkq z)l_B!(oFrrtOsdmx<| z3qml}a4rY@s_sk*w^lPi8ftmOLqsntUOY(jv`{gm+sqb(P%S2i;Ht?;APr4FO7yjI zh)0M%R(A0)(K~iK(nkHpY(tXOugq421Zime04uk6oak@m5|0u6tei-D^(V6vX{Y{R zb|7ul?;s7e@`$I1K~|!8k{DLRz9S=mX-Mp>85324j^5%%peW5 z3W(>35mtVuw$*UUC!RGRC!FILFee|Vhn9_zK^mHVkr-_i5-$*=tb*csVr1+=gwYTt 
zggC(;=K%%M(D-pyQSmY{)+!=iBF0#Sk)E1|IgA9fc;*noYA8rUtrFrjVuDp%yh@C> ziiuZ@oXjz#x0ZuBiuBU5Ge`A1bmYX?&^wn}P$B{lD z4UL~>l@)IjQ>`-MEn(sX&q$M$DKpL9%5+4w=tcv1&VrJ|g$gi5$$%Y)N{oDJPl@?fHSvj2n7M?E)Cw^d zkr7%!<^o7V(_a#ctyVv$u-d`>Kky^4&{iZWM_(OMDaGBOIJq4CSCy5ehMsZ~dO zMJ%yuBjdFa%ynd(R-CzpjMa*PG}LM!z9Uvx^~JZua;u*B#wg9)LMCdZn48E1tt4{; zq@n2_iPcsk@dL5SYAC)ZR>s~%rf6lEJIG|M409Wq1k%v>byidHGqKicB7P#)SdEeC zS_S4lGEFPb+(V{ntrGJPnW0r=9)L77J;PC5 zt;C2^+oz?NPHc>Qip5i@K8!)Eb$u)nJ|@^R(*B zGi0t-4WywKDcVHPB1DUjEL=2=+RQ6tp;n7|i7e1+GA}?Hn*N>GX0;K&5nHWf@hh<< z_ARnRtINDW7Hf5w*T^D}hQ{x-+KWGl9acN>2eI91i!9d~Fz=CNT7BjnvQ(?57x+63 zwK_>IVz<>%iY0bg9i$keG4lyosWoCgA}h3p%m&CLLL)HD}V0wOTWfhFX*aIpO#e z$?fEG_K>m}tr;EJsI_7;kPTW(CIZsX2ug%3MnVY9q9vG6W36Z=DPtm=H9un@n?M>G zf7t3Nd5A++K#C_0S}Y=IB=a?T3SsQ%M1l^`P^*uWlQ?SimU0kBtX@)fqb>6T*`l>! zz9T^`nfV6N&`1Jt!s;jGCXQQurCh`@S1ht!YtP0Y+q8DfPh=}dL*q|d1EoB~DQkd~ zNSw6#BfGRtY#g#v>&Uv09a;yFhFU|Ud`@M`!BSr0j5SC~GP<%^kUd%#HZ!tY>&#{X zX=udf)T#Yd@;cecv zHCie{Tyo_=4r)Ew>_|upupUGMX=wa)YrIr~xMq!$iW66@vB(jv51R`)to3GdB8RkI zAPu!9Nu`OK)JYD7rI4%Ic(x>RMH|PK zKrX|-{hfx!zqgi44TyKvGO0fC)>?|(&?d2Ekn7q+wls20n*h>KYn9ZP_-L(^8WA6? z6;eZEDq9}8rA=YWAvd+jY*~z00l5v* z(D-y~z0`t8v(`z?iBxMX@<5x#RzdD-Gug_>J#7X^L#<6xYa+wiD77LY)&{AiF_*25 zJksW{)sTnUY_=*$LnAn0MkN4YSeqq3p}T4!Pqq1MP2`C-kF9|`25D&gS4)ye!j4`7 z5SAq%FSJE$9pt&Tkgbh8(-wd<)Y>YwCB9o*q&CDiD<~x!OWFF!D{To|4|%CAX6u49 zG}3{Lv9?R?iJ#UssU7jd)d+d3EoU1dZ?t7>1LQSGLp^cUE~yjgvUW-x$yjR#@L#@41S2DA;N9samvUW?IjkRoZsNJAsN$n4fZsVC{NLQ;T?ceO#HT_hvX zKoW@x49bOmuZYW3^puXEa9J$#z11f;80g zi*-gCLguqhOM}V0)+scTwwLXO#%X)luBc1f4bsr4VCK^p2QVqK6%k%g`E(nzw9 zbq+-|jSZl%rm`#wX$nX~t;^C_vY2&A8bcPfE=r?~Lu_v}UOUM4LQyTm_5^8YWCB^z zx+;w)OITN=ab$5C9bzPcBma?v)xwYf$05q3& zjO~x+)Q*BQ)VeKACCgd2q$y-s>!vi>IK>V|6Sb45yb2vtZF@!W*g_&k!XJH4|W9V)6TNPK^huaK-RRLO7qDY))Q$SS=}`T zEu>vwN23L`^Xw?J07yeUb*vZCBC@viTv|xhvYw$uwae@{w1{?z9g7y$E`l`FdMzy_ z>sha)C1hRerL@?%#!f_wYggF`Xff>yJ07H=kriY^>#ek$Y+${SmXY;cQ_xb{4R$hG zQoGJhLQ8-&)YHWJAgv-BTkoZnWFzYxT2{NwPD9IRx7ev@Y3*ioSN?YzYJHa0lFh77 z(i*a<^-)@F++$~=<+Z!)478kfhn)`6(8va|rIjkJCtFxwq;+I-*BrEx_JEy@R@CmZ zv(O444fXh~h_s1pZKX>a$yQbxT2*_@&O@tckJ!0rW$htIL#=2!CvHVOFlJ>)n~i7e zLbST}lwE*U)1I*NK^htflF3%|6hKBn`wdd?2v<65+J?*S-(pIvq z^;OzJwy|uquJ)Q;hSt$uu}jg~+DnjzT0f;7WC!bqw4H2keV4Ww@7R@SeeEr~0vS z+En|@u0xw>pV+l%W9=hIL+z~60kVgkMcPkxw=+xoj5Ky5+FVOzH=xb5FYJ1dhDH>U zwxjhNC_9_<8=2zb(N(%DUD3y_9-0(SH(JZncMav2*!9SFdRs9)1r0d1{i zfHc(hNQcN?cD!_u>}jJ?$gtThsB`ha22p1UnUz2q8aYb#wR1>E$Ub&<=`h*bwH|&B3^~M3l1`I@UHj24p-k*Pv~wtq z-HUbtX{cwI?RBczaP}`wP2*qfd}#MjR`xfvTPO>A0PPye4AM}$fOL)=Vdr-$oDa8s z(peL7l5(6%K#uBi>Or$INJAqR$j>*E-0NRN4gH8PVk2fp-u{r6V9QKCV!`) zo^f_j=`uOiE+Snb$Jm9@o*@r=7!8Et*+Zz)45XoU3F#U+!7eUcCCA&vq$_4l_88hb zl!HBr_6lWZkAO5Za+92Fmy&LflkAexb#kKXB-$^On>~T{4dr5wqkRzn-)X34nq5}9 zO-{ATNVmu-c4>5AC=YuY9S}-nPoe!o2_Ox%D@gaq8FqQ;E;-#UC*3jgIrZrWhw`$& zqk}?8>=}@TMjn#0?Ml)Ea+Y0Dx=+q@{ee1l&>Yk1RKs^Fn+-wQ{hfw-=Gj%H$K+hQ ziu8z_V^>CphYGOgQRm(m>^amCv>*+&Ye>(?1$K4mDLLP+COt6=vzO43p+f9MbVR5i zdjX`Ok(cCRyO#8VTx8djo|6k*SJ5${qU;rPbf^e>865@EP|q^EuJoE*YS)ookxT5_ z==e|x_BuK)RGhtrjtvz9X{g;mdPlCX>q~FR<#s*kjaiz#g-#5WVsD}oLM7Q7APtRt zBv;#wqz~jOyP@=+Td>phbpl5 z(P^Rb>^*d9s2oT`?G{oRxxsEOrIPFIX3`h4GW!Uf8LGrSL}!F5vJXHS8p$9x+pVOC zGmYO;N+&nEo}zO?RoN%#>`)cX8MK|?kMqDu0Wxc|uh4~|TI@@7L8vDC0;Hjl@8mYSjr5J&Y9~uy$t|w8 z=#o%f_6@o?REK?yE&^$&XQ$m>`bqAv+etsj?RHyqd8h&V9$gly&%Q&KhU$Sd)b12? 
zk-P1V!B}#a-60rbHfBGeD?^RgkLZd}L-qqmLo+gy`|K{kOypj>b1;tF<4Q%>gqpHn z(AA+P>}PZpNJBlp+1-O#$pdz`U>0(}-4$IQYQd(X>q5=hG<0pK8AwBI$MBMhof34D zvfU$?&1}u;=*CbhHUr%dYRN`G8k&KUA=?SwlbY?MaFVLaLirHJn&{?`pEb};$h5!H zP|snzXV60)vID_*@}SK+rvQ@u>P!R>tnH+O;vfyR`vh~6NA2Fh9OMzZS1`NTmi>Wl z3AJIrqrp%z`wgU_842VGyI(LjdED+B%taoHi$%AG+6Q9LZJ~DTPjo9tLp`VMfx$fF zDSJRLkvwVlM|Xuf1>(@1p^gC;x+By9q@nhZU_SD9dvGu>dBz?TOftI$vY>lHT>_cW z-J#BbOdt)-@R4WjUxQvUY!40oLjEJpjqVS14`f64g}Mc@qI*Fa>N#%@4;CQL*~5bQ z$vvEJ0qg#|4X%SM9Orkx-vN zF7$AycOWNvDAWt2q4uO;Y4WB$F<6ScVNVE_H2Vh<(PN>0fduqusBa)QNJBHqk$3DV z!LsCSdvdT0c`Gh2dNMRHkc6HH4G82xk0SsL_1w3o2P=^G>}kRBcV^Th-`Gonb;;NE;$R)} zRa`0bYG{0*Bzh$@E>Hr!4Ej#bdwY4X0r}2e7OYRcwU?qdLX!ez(CeXzfzs%;&;-zT z+N*+%$&dERU?cK_y&~AqoEj*P-U>|#ltXWZCI`xbzB8j4`NdunY)XE%R|lJrpW-T^ zcSF+y712APX@LspZP0gm((U!Z7G#>eF4&w*wb!B#LbC!@(EFj8fy(H;&_An}cV^(EY4bshH0;emKdHynLZ61_2Wp~E zLh}MO(8r+f^nA6YAW7P`7$iu`7SI==MS(i#^U%UTZS+}a0q8sJt--eBcY8~)4f)Lu z29wRDf%@pH(2_ts^kryqpf2b;Gdd*4*xQ5c$)EPNU_0_hTqE>tXnCL^`X;n2&;Wf6 z`p%d*dsnbivdi8X?3f&D??69OfP_cV_fR&Sviqc2CY~?+bQI&Jx!OO%1INv_!vz)&*LipF!Ul z13UG2sAS0gEtrz*why3@(58SNO%H7hv_{iH8$jRbgkacY)Jeo)k`aev(WVd}&}da4 z9F5k7#z5bh(JMK-eK6QF*<*)-f#mqOHfS^ul#E8xfk@NKA`$(`)a!&hjuy=9} z`w;pqv^CHU{TkX5Xp1^sfWFf{9_*i-U>^(iOU`W{4fZv61UjNWLfZo!(C?vbf%c&9 z%ovoMWS4<93KkmP*!>EPhxy!NU1 zOrgDjZt-!UJ%O(AuF!7KcjhhRa^)?VX-CJOJ2Q>#ba|Ka_(ax(9+_rlyWbL;Y8mWfQX{?5L9%6O(wL zPQtmGiB~)^F?q`-UKo+^%oSa_XLQ0P*MBYjHa1~P(dbfPQ*44<^uHG0IvtxZw@7sP zlUK0`w+jE)@+0$H2@n44W%WW=!lQq6qc3(P+&UZ^ZE)d2T*3|K;XhyY!2HY!$Np^b zWnt!oPycH1Xi4S--TJe|jd@xBd>C!vyovQKkzqp_ZU8kb(6`qhCLz1Cu$U_;=g@&n zzrj?m!GR$?`wi|x^{Z0(uP1W!>=zggEJyxBgJHwDVlkN4DLIf3eLDM~exv`p74Rg` z=f6Fh#c7%g7)sCK)vG#14Wh64_p$u-M0`TQ1XogCuu-o-zd_8vN|kFCgPej0SlRB; z-_Ga~Hy6n3%{#w%Hm~zxREx=$CGUTF?+Gqvy~NOn#K^GX+3F_!xmhRuxwR)gah3eH zKV$NGohl4537MRp56y8WlukT1!=2DNruc;sf4)GrESd5q0q@uOZ2#*2QDWlctj?On zzYX&y{kc~p{kfkc#{adW{PmZ!c_n0W{&W5LI)A>>3+I)}CN7`pPG}xe{M%S>0`kAS z73ZV<>!nWJk4?PS(pk4|_3VG{WJ%He?cYbgc&?GcVEdTr*{?j(}*Qi$;uEZUNtGKheGwgr=05;BZCsd6o?(9;3 z_Ved3|HBsdS2vsUCLVJ~*Dbzzc4lXs|6z33oc=1t6nAz-Z$kV($MwH&rhj#NFFh{t zLwa1aQS?LoA4j32|8ea4_jfRLzVi+em(O-5Y>ACYoc$PfHsa#h5>L%==Z+qvl9GmS zeabUK8E4Ng|K~xV{K#J1uzo`ZCuOcuzH<4BF-fKV#~QUITj$D_pzVxWa>(LD8px-O}0g zoz?!?tl!|_z54XxoCSkN3>?U?1Dys=Prb?({;wA)TzlMv31$D)+242n|GU0`fjMvC zzX#_0;T!}1&&LHG>;C`u^!Io8e>EuJ4Mq=^|2?kZ{RZ_KHk=tW^#A-#{(Wfw*WH1` z``7XMzX#d#uVp2x)GS-2)c@B&drXK)YW>d*wcH@C;6JzQf|bfws$9NO!IJEVegk_J ztjtskRIDDTR +# #include +# #include +# #include +# #include +# #include +# #include +# #include +# #include +# #include +# #include +# #include +# +# namespace +# { +# +# using namespace DB; +# +# const UInt32 ROW_NUM = 2000; +# const UInt32 MIN_STRING_LEN = 3; +# const UInt32 MAX_STRING_LEN = 5; +# +# const UInt32 PLAIN_ENCODING_CARDINALITY = ROW_NUM * 2; +# const UInt32 MIX_ENCODING_CARDINALITY = 800; +# const UInt32 DICT_ENCODING_CARDINALITY = 20; +# +# UInt16 nextNum() +# { +# static UInt16 idx = 0; +# static UInt16 nums[] = {0, 21845, 43690}; +# static size_t nums_len = sizeof(nums) / sizeof(nums[0]); +# return nums[(idx++) % nums_len]++; +# } +# +# template +# void generateValues(MutableColumnPtr & col, size_t num) +# { +# using FieldType = typename NumericDataType::FieldType; +# +# const size_t next_num_bytes = sizeof(nextNum()); +# char bytewise_val[sizeof(FieldType)]; +# +# while (col->size() < num) +# { +# for (auto bytes = 0; bytes < sizeof(FieldType); bytes += next_num_bytes) +# { +# auto tmp = nextNum(); +# memcpy(bytewise_val + bytes, &tmp, std::min(next_num_bytes, sizeof(FieldType) - bytes)); +# } +# if (is_decimal) 
+# { +# // clean highest 3 bits, make sure the result doest not exceed the limits of the decimal type +# if (bytewise_val[sizeof(FieldType) - 1] > 0) +# bytewise_val[sizeof(FieldType) - 1] &= 0x0f; +# else +# bytewise_val[sizeof(FieldType) - 1] |= 0xf0; +# } +# FieldType val; +# memcpy(&val, &bytewise_val, sizeof(FieldType)); +# col->insert(val); +# } +# } +# +# template <> +# void generateValues(MutableColumnPtr & col, size_t num) +# { +# std::string str; +# while (col->size() < num) +# { +# auto len = MIN_STRING_LEN + nextNum() % (MAX_STRING_LEN - MIN_STRING_LEN); +# str.clear(); +# for (size_t i = 0; i < len; i++) +# { +# str.push_back('a' + nextNum() % ('z' - 'a')); +# } +# col->insert(str); +# } +# } +# +# template +# ColumnWithTypeAndName generateColumn( +# std::shared_ptr ch_type, +# size_t cardinality, +# const std::string & col_name, +# const std::set & null_indice) +# { +# DataTypePtr col_type = ch_type; +# if (!null_indice.empty()) +# { +# col_type = std::make_shared(ch_type); +# } +# +# auto values = ch_type->createColumn(); +# values->reserve(cardinality); +# generateValues(values, cardinality); +# +# auto col = col_type->createColumn(); +# col->reserve(ROW_NUM); +# for (size_t i = 0; i < ROW_NUM; i++) +# { +# if (!null_indice.empty() && null_indice.contains(i)) +# { +# col->insert(Null()); +# } +# else +# { +# col->insert(values->operator[](nextNum() % cardinality)); +# } +# } +# return {std::move(col), col_type, col_name}; +# } +# +# Block generateBlock() +# { +# ColumnsWithTypeAndName cols; +# +# // test Int32 type +# std::set null_indice{512, 1001, 211, 392, 553, 1725}; +# // Nullability is expressed by definition level, and encoded by bit packed with smallest group size of 8 +# // when null value appeared. Here we make a big bit packed group with more than 1000 values. 
+# for (size_t i = 0; i < 170; i++) +# { +# null_indice.emplace(622 + i * 6); +# } +# cols.emplace_back(generateColumn( +# std::make_shared(), PLAIN_ENCODING_CARDINALITY, "plain_encoding_i32", null_indice)); +# null_indice = {917, 482, 283, 580, 1926, 1667, 1971}; +# cols.emplace_back(generateColumn( +# std::make_shared(), DICT_ENCODING_CARDINALITY, "dict_encoding_i32", null_indice)); +# +# // test string type +# null_indice = {818, 928, 1958, 1141, 1553, 1407, 690, 1769}; +# cols.emplace_back(generateColumn( +# std::make_shared(), PLAIN_ENCODING_CARDINALITY, "plain_encoding_str", null_indice)); +# null_indice = {1441, 1747, 216, 1209, 89, 52, 536, 625}; +# cols.emplace_back(generateColumn( +# std::make_shared(), MIX_ENCODING_CARDINALITY, "mix_encoding_str", null_indice)); +# null_indice = {1478, 1862, 894, 1314, 1844, 243, 869, 551}; +# cols.emplace_back(generateColumn( +# std::make_shared(), DICT_ENCODING_CARDINALITY, "dict_encoding_str", null_indice)); +# +# // test DateTime64 type +# auto dt_type = std::make_shared(ParquetRecordReader::default_datetime64_scale); +# null_indice = {1078, 112, 1981, 795, 371, 1176, 1526, 11}; +# cols.emplace_back(generateColumn(dt_type, PLAIN_ENCODING_CARDINALITY, "plain_encoding_dt64", null_indice)); +# null_indice = {1734, 1153, 1893, 1205, 644, 1670, 1482, 1479}; +# cols.emplace_back(generateColumn(dt_type, DICT_ENCODING_CARDINALITY, "dict_encoding_dt64", null_indice)); +# +# // test Decimal128 type +# auto d128_type = std::make_shared(DecimalUtils::max_precision, 3); +# null_indice = {852, 1448, 1569, 896, 1866, 1655, 100, 418}; +# cols.emplace_back(generateColumn(d128_type, PLAIN_ENCODING_CARDINALITY, "plain_encoding_decimal128", null_indice)); +# +# return {cols}; +# } +# +# void dumpBlock(const Block & block) +# { +# WriteBufferFromFile output_buf("/tmp/ut-out.csv"); +# auto out = getContext().context->getOutputFormat("CSVWithNames", output_buf, block); +# out->write(block); +# out->finalize(); +# std::cerr << block.dumpStructure() << std::endl << std::endl; +# } +# +# } +# +# EndOfCodes +# +# How to generate the parquet file: +# 1. Use above C++ codes. +# Put above codes in src/Common/tests/gtest_main.cpp, add following two inlines in main function: +# tryRegisterFormats(); +# dumpBlock(generateBlock()); +# 2. Genetate /tmp/ut-out.csv. +# After compiled, run any test, such as "./src/unit_tests_dbms --gtest_filter=IColumn.dumpStructure", +# 3. 
Generate the parquet file by following spark sql +# create temporary view tv using csv options('path' '/tmp/ut-out.csv', 'header' 'true', 'nullValue' '\\N'); +# insert overwrite directory "/tmp/test-parquet" using Parquet +# options('parquet.dictionary.page.size' '500') +# select /*+ COALESCE(1) */ cast(plain_encoding_i32 as int), cast(dict_encoding_i32 as int), +# plain_encoding_str, mix_encoding_str, dict_encoding_str, +# cast(plain_encoding_dt64 as timestamp), cast(dict_encoding_dt64 as timestamp), +# cast(plain_encoding_decimal128 as decimal(38, 3)) +# from tv; +# + +CH_SCHEMA="\ + plain_encoding_i32 Nullable(Int32), \ + dict_encoding_i32 Nullable(Int32), \ + plain_encoding_str Nullable(String), \ + mix_encoding_str Nullable(String), \ + dict_encoding_str LowCardinality(Nullable(String)), \ + plain_encoding_dt64 Nullable(DateTime64(9)), \ + dict_encoding_dt64 Nullable(DateTime64(9)), \ + plain_encoding_decimal128 Nullable(Decimal(38, 3))" +QUERY="SELECT * from file('$PAR_PATH', 'Parquet', '$CH_SCHEMA')" + +# there may be more than on group in parquet files, unstable results may generated by multithreads +$CLICKHOUSE_LOCAL --multiquery --max_threads 1 --input_format_parquet_use_native_reader true --query "$QUERY" From e1fcdba4dd51a4b4af500c1a09663820004a4a76 Mon Sep 17 00:00:00 2001 From: copperybean Date: Sat, 24 Feb 2024 22:47:53 +0800 Subject: [PATCH 114/392] fix style Change-Id: I8f7ebd173558b16d94d3161cb0b5300e7e78833d --- .../Formats/Impl/Parquet/ParquetDataBuffer.h | 21 ++++++---- .../Impl/Parquet/ParquetDataValuesReader.cpp | 40 +++++++++++++------ .../Impl/Parquet/ParquetDataValuesReader.h | 6 --- .../Impl/Parquet/ParquetLeafColReader.cpp | 18 ++++++--- .../Impl/Parquet/ParquetRecordReader.cpp | 3 +- 5 files changed, 54 insertions(+), 34 deletions(-) diff --git a/src/Processors/Formats/Impl/Parquet/ParquetDataBuffer.h b/src/Processors/Formats/Impl/Parquet/ParquetDataBuffer.h index d4956f83092..f21216d5b5d 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetDataBuffer.h +++ b/src/Processors/Formats/Impl/Parquet/ParquetDataBuffer.h @@ -9,6 +9,11 @@ namespace DB { +namespace ErrorCodes +{ + extern const int PARQUET_EXCEPTION; +} + template struct ToArrowDecimal; template <> struct ToArrowDecimal>> @@ -27,8 +32,8 @@ class ParquetDataBuffer private: public: - ParquetDataBuffer(const uint8_t * data_, UInt64 avaible_, UInt8 datetime64_scale_ = DataTypeDateTime64::default_scale) - : data(reinterpret_cast(data_)), avaible(avaible_), datetime64_scale(datetime64_scale_) {} + ParquetDataBuffer(const uint8_t * data_, UInt64 available_, UInt8 datetime64_scale_ = DataTypeDateTime64::default_scale) + : data(reinterpret_cast(data_)), available(available_), datetime64_scale(datetime64_scale_) {} template void ALWAYS_INLINE readValue(TValue & dst) @@ -84,7 +89,7 @@ public: auto value_len = ::arrow::util::SafeLoadAs(getArrowData()); if (unlikely(value_len < 0 || value_len > INT32_MAX - 4)) { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Invalid or corrupted value_len '{}'", value_len); + throw Exception(ErrorCodes::PARQUET_EXCEPTION, "Invalid or corrupted value_len '{}'", value_len); } consume(4); checkAvaible(value_len); @@ -110,7 +115,7 @@ public: auto status = TArrowDecimal::FromBigEndian(getArrowData(), elem_bytes_num); if (unlikely(!status.ok())) { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Read parquet decimal failed: {}", status.status().ToString()); + throw Exception(ErrorCodes::PARQUET_EXCEPTION, "Read parquet decimal failed: {}", status.status().ToString()); } 
status.ValueUnsafe().ToBytes(reinterpret_cast(out)); consume(elem_bytes_num); @@ -118,14 +123,14 @@ public: private: const Int8 * data; - UInt64 avaible; + UInt64 available; const UInt8 datetime64_scale; void ALWAYS_INLINE checkAvaible(UInt64 num) { - if (unlikely(avaible < num)) + if (unlikely(available < num)) { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Consuming {} bytes while {} avaible", num, avaible); + throw Exception(ErrorCodes::PARQUET_EXCEPTION, "Consuming {} bytes while {} available", num, available); } } @@ -134,7 +139,7 @@ private: void ALWAYS_INLINE consume(UInt64 num) { data += num; - avaible -= num; + available -= num; } }; diff --git a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp index 3afc66dcb36..4ebe3d6a636 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp +++ b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp @@ -8,6 +8,12 @@ namespace DB { +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; + extern const int PARQUET_EXCEPTION; +} + void RleValuesReader::nextGroup() { // refer to: @@ -142,7 +148,7 @@ void RleValuesReader::visitNullableBySteps( individual_null_visitor(null_map_cursor); if (unlikely(valid_index_steps[step_idx] == UINT8_MAX)) { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "unsupported packed values number"); + throw Exception(ErrorCodes::PARQUET_EXCEPTION, "unsupported packed values number"); } valid_index_steps[step_idx]++; } @@ -270,7 +276,7 @@ void ParquetPlainValuesReader::readBatch( auto idx = cursor; cursor += count; - // the type of offset_data is PaddedPODArray, which makes sure that the -1 index is avaible + // the type of offset_data is PaddedPODArray, which makes sure that the -1 index is available for (auto val_offset = offset_data[idx - 1]; idx < cursor; idx++) { offset_data[idx] = ++val_offset; @@ -394,14 +400,17 @@ void ParquetRleLCReader::readBatch( cursor, num_values, max_def_level, - /* individual_null_visitor */ [&](size_t nest_cursor) { + /* individual_null_visitor */ [&](size_t nest_cursor) + { column_data[nest_cursor] = 0; has_null = true; }, - /* stepped_valid_visitor */ [&](size_t nest_cursor, const std::vector & valid_index_steps) { + /* stepped_valid_visitor */ [&](size_t nest_cursor, const std::vector & valid_index_steps) + { rle_data_reader->setValueBySteps(column_data + nest_cursor, valid_index_steps, val_getter); }, - /* repeated_visitor */ [&](bool is_valid, size_t nest_cursor, UInt32 count) { + /* repeated_visitor */ [&](bool is_valid, size_t nest_cursor, UInt32 count) + { if (is_valid) { rle_data_reader->setValues(column_data + nest_cursor, count, val_getter); @@ -435,7 +444,8 @@ void ParquetRleDictReader::readBatch( auto * offset_data = column.getOffsets().data(); auto & chars = column.getChars(); - auto append_nulls = [&](UInt8 num) { + auto append_nulls = [&](UInt8 num) + { for (auto limit = cursor + num; cursor < limit; cursor++) { chars.push_back(0); @@ -444,7 +454,8 @@ void ParquetRleDictReader::readBatch( } }; - auto append_string = [&](Int32 dict_idx) { + auto append_string = [&](Int32 dict_idx) + { auto dict_chars_cursor = dict_offsets[dict_idx - 1]; auto value_len = dict_offsets[dict_idx] - dict_chars_cursor; auto chars_cursor = chars.size(); @@ -462,7 +473,8 @@ void ParquetRleDictReader::readBatch( num_values, max_def_level, /* individual_null_visitor */ [&](size_t) {}, - /* stepped_valid_visitor */ [&](size_t, const std::vector & valid_index_steps) { + /* 
stepped_valid_visitor */ [&](size_t, const std::vector & valid_index_steps) + { value_cache.resize(valid_index_steps.size()); rle_data_reader->setValues( value_cache.data() + 1, static_cast(valid_index_steps.size() - 1), val_getter); @@ -474,7 +486,8 @@ void ParquetRleDictReader::readBatch( append_nulls(valid_index_steps[i] - 1); } }, - /* repeated_visitor */ [&](bool is_valid, size_t, UInt32 count) { + /* repeated_visitor */ [&](bool is_valid, size_t, UInt32 count) + { if (is_valid) { value_cache.resize(count); @@ -505,13 +518,16 @@ void ParquetRleDictReader::readBatch( cursor, num_values, max_def_level, - /* individual_null_visitor */ [&](size_t nest_cursor) { + /* individual_null_visitor */ [&](size_t nest_cursor) + { null_map.setNull(nest_cursor); }, - /* stepped_valid_visitor */ [&](size_t nest_cursor, const std::vector & valid_index_steps) { + /* stepped_valid_visitor */ [&](size_t nest_cursor, const std::vector & valid_index_steps) + { rle_data_reader->setValueBySteps(column_data + nest_cursor, valid_index_steps, val_getter); }, - /* repeated_visitor */ [&](bool is_valid, size_t nest_cursor, UInt32 count) { + /* repeated_visitor */ [&](bool is_valid, size_t nest_cursor, UInt32 count) + { if (is_valid) { rle_data_reader->setValues(column_data + nest_cursor, count, val_getter); diff --git a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.h b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.h index 66a1f4877e4..8bc381aa8d2 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.h +++ b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.h @@ -15,12 +15,6 @@ namespace DB { -namespace ErrorCodes -{ - extern const int BAD_ARGUMENTS; - extern const int PARQUET_EXCEPTION; -} - class RleValuesReader { public: diff --git a/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.cpp b/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.cpp index e2677d7cae3..17feea80b9f 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.cpp +++ b/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.cpp @@ -216,7 +216,8 @@ template ColumnWithTypeAndName ParquetLeafColReader::readBatch(UInt64 rows_num, const String & name) { reading_rows_num = rows_num; - auto readPageIfEmpty = [&]() { + auto readPageIfEmpty = [&]() + { while (!cur_page_values) readPage(); }; @@ -245,7 +246,8 @@ void ParquetLeafColReader::resetColumn(UInt64 rows_num) if (reading_low_cardinality) { assert(dictionary); - visitColStrIndexType(dictionary->size(), [&](TColVec *) { + visitColStrIndexType(dictionary->size(), [&](TColVec *) + { column = TColVec::create(); }); @@ -289,7 +291,8 @@ void ParquetLeafColReader::degradeDictionary() ColumnString & col_dest = *static_cast(column.get()); const ColumnString & col_dict_str = *static_cast(dictionary.get()); - visitColStrIndexType(dictionary->size(), [&](TColVec *) { + visitColStrIndexType(dictionary->size(), [&](TColVec *) + { const TColVec & col_src = *static_cast(col_existing.get()); reserveColumnStrRows(column, reading_rows_num); @@ -411,7 +414,8 @@ void ParquetLeafColReader::readPageV1(const parquet::DataPageV1 & page) assert(col_descriptor.max_definition_level() >= 0); std::unique_ptr def_level_reader; - if (col_descriptor.max_definition_level() > 0) { + if (col_descriptor.max_definition_level() > 0) + { auto bit_width = arrow::bit_util::Log2(col_descriptor.max_definition_level() + 1); auto num_bytes = ::arrow::util::SafeLoadAs(buffer); auto bit_reader = std::make_unique(buffer + 4, num_bytes); @@ -435,7 +439,8 @@ void 
ParquetLeafColReader::readPageV1(const parquet::DataPageV1 & page) degradeDictionary(); } - ParquetDataBuffer parquet_buffer = [&]() { + ParquetDataBuffer parquet_buffer = [&]() + { if constexpr (!std::is_same_v, TColumn>) return ParquetDataBuffer(buffer, max_size); @@ -485,7 +490,8 @@ std::unique_ptr ParquetLeafColReader::createDi if (reading_low_cardinality && std::same_as) { std::unique_ptr res; - visitColStrIndexType(dictionary->size(), [&](TCol *) { + visitColStrIndexType(dictionary->size(), [&](TCol *) + { res = std::make_unique>( col_descriptor.max_definition_level(), std::move(def_level_reader), diff --git a/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp b/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp index 42f131ff794..69e694a340f 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp +++ b/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp @@ -27,7 +27,6 @@ namespace DB namespace ErrorCodes { - extern const int BAD_ARGUMENTS; extern const int PARQUET_EXCEPTION; } @@ -142,7 +141,7 @@ std::unique_ptr createColReader( } } -} // anonymouse namespace +} // anonymous namespace ParquetRecordReader::ParquetRecordReader( Block header_, From 471dff6589abff5d05ab8a9bb267e198f377c536 Mon Sep 17 00:00:00 2001 From: copperybean Date: Sun, 25 Feb 2024 14:26:53 +0800 Subject: [PATCH 115/392] fix test Change-Id: Ia7dbf1d762f7f054a9aa677caaaff6bfe1a42c38 --- src/Core/SettingsChangesHistory.h | 1 + .../Formats/Impl/Parquet/ParquetDataBuffer.h | 13 +++++-------- .../Impl/Parquet/ParquetDataValuesReader.cpp | 2 +- .../Formats/Impl/Parquet/ParquetDataValuesReader.h | 4 ++-- .../Formats/Impl/Parquet/ParquetLeafColReader.cpp | 6 +++--- .../Formats/Impl/Parquet/ParquetRecordReader.cpp | 7 ++----- .../Formats/Impl/ParquetBlockInputFormat.cpp | 8 ++++++++ .../0_stateless/02998_native_parquet_reader.sh | 5 +++-- 8 files changed, 25 insertions(+), 21 deletions(-) diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index ece48620618..6fb8fb9358c 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -176,6 +176,7 @@ static std::map sett {"default_normal_view_sql_security", "INVOKER", "INVOKER", "Allows to set default `SQL SECURITY` option while creating a normal view"}, {"mysql_map_string_to_text_in_show_columns", false, true, "Reduce the configuration effort to connect ClickHouse with BI tools."}, {"mysql_map_fixed_string_to_text_in_show_columns", false, true, "Reduce the configuration effort to connect ClickHouse with BI tools."}, + {"input_format_parquet_use_native_reader", false, false, "When reading Parquet files, to use native reader instead of arrow reader."}, }}, {"24.1", {{"print_pretty_type_names", false, true, "Better user experience."}, {"input_format_json_read_bools_as_strings", false, true, "Allow to read bools as strings in JSON formats by default"}, diff --git a/src/Processors/Formats/Impl/Parquet/ParquetDataBuffer.h b/src/Processors/Formats/Impl/Parquet/ParquetDataBuffer.h index f21216d5b5d..5c37375fa0c 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetDataBuffer.h +++ b/src/Processors/Formats/Impl/Parquet/ParquetDataBuffer.h @@ -38,15 +38,13 @@ public: template void ALWAYS_INLINE readValue(TValue & dst) { - checkAvaible(sizeof(TValue)); - dst = *(reinterpret_cast(data)); - consume(sizeof(TValue)); + readBytes(&dst, sizeof(TValue)); } void ALWAYS_INLINE readBytes(void * dst, size_t bytes) { checkAvaible(bytes); - memcpy(dst, data, bytes); + std::copy(data, data + 
bytes, reinterpret_cast(dst)); consume(bytes); } @@ -68,13 +66,12 @@ public: 100000000 * spd, 1000000000 * spd}; - checkAvaible(sizeof(parquet::Int96)); - auto decoded = parquet::DecodeInt96Timestamp(*reinterpret_cast(data)); + parquet::Int96 tmp; + readValue(tmp); + auto decoded = parquet::DecodeInt96Timestamp(tmp); uint64_t scaled_nano = decoded.nanoseconds / pow10[datetime64_scale]; dst = static_cast(decoded.days_since_epoch * scaled_day[datetime64_scale] + scaled_nano); - - consume(sizeof(parquet::Int96)); } /** diff --git a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp index 4ebe3d6a636..6743086e9e6 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp +++ b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp @@ -306,7 +306,7 @@ void ParquetPlainValuesReader>::readBatch( }, /* repeated_visitor */ [&](size_t nest_cursor, UInt32 count) { - auto col_data_pos = column_data + nest_cursor; + auto * col_data_pos = column_data + nest_cursor; for (UInt32 i = 0; i < count; i++) { plain_data_buffer.readDateTime64(col_data_pos[i]); diff --git a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.h b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.h index 8bc381aa8d2..688de4f52eb 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.h +++ b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.h @@ -97,7 +97,7 @@ public: * @tparam ValueGetter A callback with signature: TValue(Int32 val) */ template - void setValues(TValue * column_data, UInt32 num_values, ValueGetter && val_getter); + void setValues(TValue * res_values, UInt32 num_values, ValueGetter && val_getter); /** * @brief Set the value by valid_index_steps generated in visitNullableBySteps. 
@@ -106,7 +106,7 @@ public: */ template void setValueBySteps( - TValue * column_data, + TValue * res_values, const std::vector & col_data_steps, ValueGetter && val_getter); diff --git a/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.cpp b/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.cpp index 17feea80b9f..52dfad7606a 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.cpp +++ b/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.cpp @@ -113,7 +113,7 @@ ColumnPtr readDictPage>( const parquet::ColumnDescriptor & /* col_des */, const DataTypePtr & data_type) { - auto & datetime_type = assert_cast(*data_type); + const auto & datetime_type = assert_cast(*data_type); auto dict_col = ColumnDecimal::create(page.num_values(), datetime_type.getScale()); auto * col_data = dict_col->getData().data(); ParquetDataBuffer buffer(page.data(), page.size(), datetime_type.getScale()); @@ -282,7 +282,7 @@ void ParquetLeafColReader::degradeDictionary() dictionary = nullptr; return; } - assert(dictionary && column->size()); + assert(dictionary && !column->empty()); null_map = std::make_unique(reading_rows_num); auto col_existing = std::move(column); @@ -372,7 +372,7 @@ void ParquetLeafColReader::readPage() dict_page.encoding() != parquet::Encoding::PLAIN_DICTIONARY && dict_page.encoding() != parquet::Encoding::PLAIN)) { - throw new Exception( + throw Exception( ErrorCodes::NOT_IMPLEMENTED, "Unsupported dictionary page encoding {}", dict_page.encoding()); } LOG_DEBUG(log, "{} values in dictionary page of column {}", dict_page.num_values(), col_descriptor.name()); diff --git a/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp b/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp index 69e694a340f..9cde433b983 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp +++ b/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp @@ -156,9 +156,6 @@ ParquetRecordReader::ParquetRecordReader( , row_groups_indices(std::move(row_groups_indices_)) , left_rows(getTotalRows(*file_reader->metadata())) { - // Only little endian system is supported currently - static_assert(std::endian::native == std::endian::little); - log = &Poco::Logger::get("ParquetRecordReader"); parquet_col_indice.reserve(header.columns()); @@ -230,9 +227,9 @@ void ParquetRecordReader::loadNextRowGroup() Int64 ParquetRecordReader::getTotalRows(const parquet::FileMetaData & meta_data) { Int64 res = 0; - for (size_t i = 0; i < row_groups_indices.size(); i++) + for (auto idx : row_groups_indices) { - res += meta_data.RowGroup(row_groups_indices[i])->num_rows(); + res += meta_data.RowGroup(idx)->num_rows(); } return res; } diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp index e35d53dc4f4..2e849f09fda 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp @@ -484,6 +484,14 @@ void ParquetBlockInputFormat::initializeRowGroupBatchReader(size_t row_group_bat if (format_settings.parquet.use_native_reader) { +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunreachable-code" + if constexpr (std::endian::native != std::endian::little) + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "parquet native reader only supports little endian system currently"); +#pragma clang diagnostic pop + row_group_batch.native_record_reader = std::make_shared( getPort().getHeader(), std::move(properties), diff --git 
a/tests/queries/0_stateless/02998_native_parquet_reader.sh b/tests/queries/0_stateless/02998_native_parquet_reader.sh index 5c129e6c5ce..4e5169c4bf0 100755 --- a/tests/queries/0_stateless/02998_native_parquet_reader.sh +++ b/tests/queries/0_stateless/02998_native_parquet_reader.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash +# Tags: no-fasttest CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh @@ -201,8 +202,8 @@ CH_SCHEMA="\ plain_encoding_str Nullable(String), \ mix_encoding_str Nullable(String), \ dict_encoding_str LowCardinality(Nullable(String)), \ - plain_encoding_dt64 Nullable(DateTime64(9)), \ - dict_encoding_dt64 Nullable(DateTime64(9)), \ + plain_encoding_dt64 Nullable(DateTime64(9, \\'UTC\\')), \ + dict_encoding_dt64 Nullable(DateTime64(9, \\'UTC\\')), \ plain_encoding_decimal128 Nullable(Decimal(38, 3))" QUERY="SELECT * from file('$PAR_PATH', 'Parquet', '$CH_SCHEMA')" From f68b788f5900b66ab4623874c98ed1b4025b5fd0 Mon Sep 17 00:00:00 2001 From: Danila Puzov Date: Sat, 11 May 2024 15:34:13 +0300 Subject: [PATCH 116/392] Tests and docs for serial, some fixes for generateSnowflakeID --- src/Functions/generateSnowflakeID.cpp | 62 +++- src/Functions/generateUUIDv7.cpp | 284 ++++++++++++++---- src/Functions/serial.cpp | 134 ++++----- .../03129_serial_test_zookeeper.reference | 8 + .../03129_serial_test_zookeeper.sql | 20 ++ 5 files changed, 373 insertions(+), 135 deletions(-) create mode 100644 tests/queries/0_stateless/03129_serial_test_zookeeper.reference create mode 100644 tests/queries/0_stateless/03129_serial_test_zookeeper.sql diff --git a/src/Functions/generateSnowflakeID.cpp b/src/Functions/generateSnowflakeID.cpp index e54b720ec98..dd837a58325 100644 --- a/src/Functions/generateSnowflakeID.cpp +++ b/src/Functions/generateSnowflakeID.cpp @@ -11,11 +11,42 @@ namespace ErrorCodes extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; } +namespace +{ + +/* + Snowflake ID + https://en.wikipedia.org/wiki/Snowflake_ID + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +├─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┤ +|0| timestamp | +├─┼ ┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┤ +| | machine_id | machine_seq_num | +├─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┤ + +- The first 41 (+ 1 top zero bit) bits is timestamp in Unix time milliseconds +- The middle 10 bits are the machine ID. +- The last 12 bits decode to number of ids processed by the machine at the given millisecond. 
+*/ + +constexpr auto timestamp_size = 41; +constexpr auto machine_id_size = 10; +constexpr auto machine_seq_num_size = 12; + +constexpr int64_t timestamp_mask = ((1LL << timestamp_size) - 1) << (machine_id_size + machine_seq_num_size); +constexpr int64_t machine_id_mask = ((1LL << machine_id_size) - 1) << machine_seq_num_size; +constexpr int64_t machine_seq_num_mask = (1LL << machine_seq_num_size) - 1; + +} + class FunctionSnowflakeID : public IFunction { private: - mutable std::atomic machine_sequence_number{0}; - mutable std::atomic last_timestamp{0}; + mutable std::atomic state{0}; + // previous snowflake id + // state is 1 atomic value because we don't want use mutex public: static constexpr auto name = "generateSnowflakeID"; @@ -60,23 +91,28 @@ public: // hash serverUUID into 32 bytes Int64 h = UUIDHelpers::getHighBytes(serverUUID); Int64 l = UUIDHelpers::getLowBytes(serverUUID); - Int64 machine_id = (h * 11) ^ (l * 17); + Int64 machine_id = ((h * 11) ^ (l * 17)) & machine_id_mask; - for (Int64 & x : vec_to) { + for (Int64 & el : vec_to) { const auto tm_point = std::chrono::system_clock::now(); Int64 current_timestamp = std::chrono::duration_cast( - tm_point.time_since_epoch()).count(); + tm_point.time_since_epoch()).count() & ((1LL << timestamp_size) - 1); - Int64 local_machine_sequence_number = 0; + Int64 last_state, new_state; + do { + last_state = state.load(); + Int64 last_timestamp = (last_state & timestamp_mask) >> (machine_id_size + machine_seq_num_size); + Int64 machine_seq_num = last_state & machine_seq_num_mask; - if (current_timestamp != last_timestamp.load()) { - machine_sequence_number.store(0); - last_timestamp.store(current_timestamp); - } else { - local_machine_sequence_number = machine_sequence_number.fetch_add(1) + 1; - } + if (current_timestamp == last_timestamp) { + ++machine_seq_num; + } + new_state = (current_timestamp << (machine_id_size + machine_seq_num_size)) | machine_id | machine_seq_num; + } while (!state.compare_exchange_strong(last_state, new_state)); + // failed CAS => another thread updated state + // successful CAS => we have unique (timestamp, machine_seq_num) on this machine - x = (current_timestamp << 22) | (machine_id & 0x3ff000ull) | (local_machine_sequence_number & 0xfffull); + el = new_state; } return col_res; diff --git a/src/Functions/generateUUIDv7.cpp b/src/Functions/generateUUIDv7.cpp index 61d742d2fda..411a3a076ac 100644 --- a/src/Functions/generateUUIDv7.cpp +++ b/src/Functions/generateUUIDv7.cpp @@ -1,13 +1,178 @@ -#include -#include #include +#include +#include +#include namespace DB { -namespace ErrorCodes +namespace { - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + +/* Bit layouts of UUIDv7 + +without counter: + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +├─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┤ +| unix_ts_ms | +├─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┤ +| unix_ts_ms | ver | rand_a | +├─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┤ +|var| rand_b | +├─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┤ +| rand_b | +└─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┘ + +with counter: + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +├─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┤ +| unix_ts_ms | +├─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┤ +| unix_ts_ms | ver | counter_high_bits | 
+├─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┤ +|var| counter_low_bits | +├─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┤ +| rand_b | +└─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┘ +*/ + +/// bit counts +constexpr auto rand_a_bits_count = 12; +constexpr auto rand_b_bits_count = 62; +constexpr auto rand_b_low_bits_count = 32; +constexpr auto counter_high_bits_count = rand_a_bits_count; +constexpr auto counter_low_bits_count = 30; +constexpr auto bits_in_counter = counter_high_bits_count + counter_low_bits_count; +constexpr uint64_t counter_limit = (1ull << bits_in_counter); + +/// bit masks for UUIDv7 components +constexpr uint64_t variant_2_mask = (2ull << rand_b_bits_count); +constexpr uint64_t rand_a_bits_mask = (1ull << rand_a_bits_count) - 1; +constexpr uint64_t rand_b_bits_mask = (1ull << rand_b_bits_count) - 1; +constexpr uint64_t rand_b_with_counter_bits_mask = (1ull << rand_b_low_bits_count) - 1; +constexpr uint64_t counter_low_bits_mask = (1ull << counter_low_bits_count) - 1; +constexpr uint64_t counter_high_bits_mask = rand_a_bits_mask; + +uint64_t getTimestampMillisecond() +{ + timespec tp; + clock_gettime(CLOCK_REALTIME, &tp); + const uint64_t sec = tp.tv_sec; + return sec * 1000 + tp.tv_nsec / 1000000; +} + +void setTimestampAndVersion(UUID & uuid, uint64_t timestamp) +{ + UUIDHelpers::getHighBytes(uuid) = (UUIDHelpers::getHighBytes(uuid) & rand_a_bits_mask) | (timestamp << 16) | 0x7000; +} + +void setVariant(UUID & uuid) +{ + UUIDHelpers::getLowBytes(uuid) = (UUIDHelpers::getLowBytes(uuid) & rand_b_bits_mask) | variant_2_mask; +} + +struct FillAllRandomPolicy +{ + static constexpr auto name = "generateUUIDv7NonMonotonic"; + static constexpr auto doc_description = R"(Generates a UUID of version 7. The generated UUID contains the current Unix timestamp in milliseconds (48 bits), followed by version "7" (4 bits), and a random field (74 bit, including a 2-bit variant field "2") to distinguish UUIDs within a millisecond. 
This function is the fastest generateUUIDv7* function but it gives no monotonicity guarantees within a timestamp.)"; + struct Data + { + void generate(UUID & uuid, uint64_t ts) + { + setTimestampAndVersion(uuid, ts); + setVariant(uuid); + } + }; +}; + +struct CounterFields +{ + uint64_t last_timestamp = 0; + uint64_t counter = 0; + + void resetCounter(const UUID & uuid) + { + const uint64_t counter_low_bits = (UUIDHelpers::getLowBytes(uuid) >> rand_b_low_bits_count) & counter_low_bits_mask; + const uint64_t counter_high_bits = UUIDHelpers::getHighBytes(uuid) & counter_high_bits_mask; + counter = (counter_high_bits << 30) | counter_low_bits; + } + + void incrementCounter(UUID & uuid) + { + if (++counter == counter_limit) [[unlikely]] + { + ++last_timestamp; + resetCounter(uuid); + setTimestampAndVersion(uuid, last_timestamp); + setVariant(uuid); + } + else + { + UUIDHelpers::getHighBytes(uuid) = (last_timestamp << 16) | 0x7000 | (counter >> counter_low_bits_count); + UUIDHelpers::getLowBytes(uuid) = (UUIDHelpers::getLowBytes(uuid) & rand_b_with_counter_bits_mask) | variant_2_mask | ((counter & counter_low_bits_mask) << rand_b_low_bits_count); + } + } + + void generate(UUID & uuid, uint64_t timestamp) + { + const bool need_to_increment_counter = (last_timestamp == timestamp) || ((last_timestamp > timestamp) & (last_timestamp < timestamp + 10000)); + if (need_to_increment_counter) + { + incrementCounter(uuid); + } + else + { + last_timestamp = timestamp; + resetCounter(uuid); + setTimestampAndVersion(uuid, last_timestamp); + setVariant(uuid); + } + } +}; + + +struct GlobalCounterPolicy +{ + static constexpr auto name = "generateUUIDv7"; + static constexpr auto doc_description = R"(Generates a UUID of version 7. The generated UUID contains the current Unix timestamp in milliseconds (48 bits), followed by version "7" (4 bits), a counter (42 bit, including a variant field "2", 2 bit) to distinguish UUIDs within a millisecond, and a random field (32 bits). For any given timestamp (unix_ts_ms), the counter starts at a random value and is incremented by 1 for each new UUID until the timestamp changes. In case the counter overflows, the timestamp field is incremented by 1 and the counter is reset to a random new start value. Function generateUUIDv7 guarantees that the counter field within a timestamp increments monotonically across all function invocations in concurrently running threads and queries.)"; + + /// Guarantee counter monotonicity within one timestamp across all threads generating UUIDv7 simultaneously. + struct Data + { + static inline CounterFields fields; + static inline SharedMutex mutex; /// works a little bit faster than std::mutex here + std::lock_guard guard; + + Data() + : guard(mutex) + {} + + void generate(UUID & uuid, uint64_t timestamp) + { + fields.generate(uuid, timestamp); + } + }; +}; + +struct ThreadLocalCounterPolicy +{ + static constexpr auto name = "generateUUIDv7ThreadMonotonic"; + static constexpr auto doc_description = R"(Generates a UUID of version 7. The generated UUID contains the current Unix timestamp in milliseconds (48 bits), followed by version "7" (4 bits), a counter (42 bit, including a variant field "2", 2 bit) to distinguish UUIDs within a millisecond, and a random field (32 bits). For any given timestamp (unix_ts_ms), the counter starts at a random value and is incremented by 1 for each new UUID until the timestamp changes. In case the counter overflows, the timestamp field is incremented by 1 and the counter is reset to a random new start value. 
This function behaves like generateUUIDv7 but gives no guarantee on counter monotony across different simultaneous requests. Monotonicity within one timestamp is guaranteed only within the same thread calling this function to generate UUIDs.)"; + + /// Guarantee counter monotonicity within one timestamp within the same thread. Faster than GlobalCounterPolicy if a query uses multiple threads. + struct Data + { + static inline thread_local CounterFields fields; + + void generate(UUID & uuid, uint64_t timestamp) + { + fields.generate(uuid, timestamp); + } + }; +}; + } #define DECLARE_SEVERAL_IMPLEMENTATIONS(...) \ @@ -16,77 +181,72 @@ DECLARE_AVX2_SPECIFIC_CODE(__VA_ARGS__) DECLARE_SEVERAL_IMPLEMENTATIONS( -class FunctionGenerateUUIDv7 : public IFunction +template +class FunctionGenerateUUIDv7Base : public IFunction, public FillPolicy { public: - static constexpr auto name = "generateUUIDv7"; + String getName() const final { return FillPolicy::name; } - String getName() const override + size_t getNumberOfArguments() const final { return 0; } + bool isDeterministic() const override { return false; } + bool isDeterministicInScopeOfQuery() const final { return false; } + bool useDefaultImplementationForNulls() const final { return false; } + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const final { return false; } + bool isVariadic() const final { return true; } + + DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override { - return name; - } - - size_t getNumberOfArguments() const override { return 0; } - - bool isDeterministicInScopeOfQuery() const override { return false; } - bool useDefaultImplementationForNulls() const override { return false; } - bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } - bool isVariadic() const override { return true; } - - DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override - { - if (arguments.size() > 1) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, - "Number of arguments for function {} doesn't match: passed {}, should be 0 or 1.", - getName(), arguments.size()); + FunctionArgumentDescriptors mandatory_args; + FunctionArgumentDescriptors optional_args{ + {"expr", nullptr, nullptr, "Arbitrary Expression"} + }; + validateFunctionArgumentTypes(*this, arguments, mandatory_args, optional_args); return std::make_shared(); } - bool isDeterministic() const override { return false; } - ColumnPtr executeImpl(const ColumnsWithTypeAndName &, const DataTypePtr &, size_t input_rows_count) const override { auto col_res = ColumnVector::create(); typename ColumnVector::Container & vec_to = col_res->getData(); - size_t size = input_rows_count; - vec_to.resize(size); - - /// RandImpl is target-dependent and is not the same in different TargetSpecific namespaces. 
- RandImpl::execute(reinterpret_cast(vec_to.data()), vec_to.size() * sizeof(UUID)); - - for (UUID & uuid : vec_to) + if (input_rows_count) { - /// https://www.ietf.org/archive/id/draft-peabody-dispatch-new-uuid-format-04.html#section-5.2 + vec_to.resize(input_rows_count); - const auto tm_point = std::chrono::system_clock::now(); - UInt64 unix_ts_ms = std::chrono::duration_cast( - tm_point.time_since_epoch()).count(); + /// Not all random bytes produced here are required for the UUIDv7 but it's the simplest way to get the required number of them by using RandImpl + RandImpl::execute(reinterpret_cast(vec_to.data()), vec_to.size() * sizeof(UUID)); - UUIDHelpers::getHighBytes(uuid) = (UUIDHelpers::getHighBytes(uuid) & 0x0000000000000fffull) | 0x0000000000007000ull | (unix_ts_ms << 16); - UUIDHelpers::getLowBytes(uuid) = (UUIDHelpers::getLowBytes(uuid) & 0x3fffffffffffffffull) | 0x8000000000000000ull; + /// Note: For performance reasons, clock_gettime is called once per chunk instead of once per UUID. This reduces precision but + /// it still complies with the UUID standard. + uint64_t timestamp = getTimestampMillisecond(); + for (UUID & uuid : vec_to) + { + typename FillPolicy::Data data; + data.generate(uuid, timestamp); + } } - return col_res; } }; - ) // DECLARE_SEVERAL_IMPLEMENTATIONS #undef DECLARE_SEVERAL_IMPLEMENTATIONS -class FunctionGenerateUUIDv7 : public TargetSpecific::Default::FunctionGenerateUUIDv7 +template +class FunctionGenerateUUIDv7Base : public TargetSpecific::Default::FunctionGenerateUUIDv7Base { public: - explicit FunctionGenerateUUIDv7(ContextPtr context) : selector(context) - { - selector.registerImplementation(); + using Self = FunctionGenerateUUIDv7Base; + using Parent = TargetSpecific::Default::FunctionGenerateUUIDv7Base; - #if USE_MULTITARGET_CODE - selector.registerImplementation(); - #endif + explicit FunctionGenerateUUIDv7Base(ContextPtr context) : selector(context) + { + selector.registerImplementation(); + +#if USE_MULTITARGET_CODE + using ParentAVX2 = TargetSpecific::AVX2::FunctionGenerateUUIDv7Base; + selector.registerImplementation(); +#endif } ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override @@ -96,18 +256,34 @@ public: static FunctionPtr create(ContextPtr context) { - return std::make_shared(context); + return std::make_shared(context); } private: ImplementationSelector selector; }; +template +void registerUUIDv7Generator(auto& factory) +{ + static constexpr auto doc_syntax_format = "{}([expression])"; + static constexpr auto example_format = "SELECT {}()"; + static constexpr auto multiple_example_format = "SELECT {f}(1), {f}(2)"; + + FunctionDocumentation::Description doc_description = FillPolicy::doc_description; + FunctionDocumentation::Syntax doc_syntax = fmt::format(doc_syntax_format, FillPolicy::name); + FunctionDocumentation::Arguments doc_arguments = {{"expression", "The expression is used to bypass common subexpression elimination if the function is called multiple times in a query but otherwise ignored. 
Optional."}}; + FunctionDocumentation::ReturnedValue doc_returned_value = "A value of type UUID version 7."; + FunctionDocumentation::Examples doc_examples = {{"uuid", fmt::format(example_format, FillPolicy::name), ""}, {"multiple", fmt::format(multiple_example_format, fmt::arg("f", FillPolicy::name)), ""}}; + FunctionDocumentation::Categories doc_categories = {"UUID"}; + + factory.template registerFunction>({doc_description, doc_syntax, doc_arguments, doc_returned_value, doc_examples, doc_categories}, FunctionFactory::CaseInsensitive); +} + REGISTER_FUNCTION(GenerateUUIDv7) { - factory.registerFunction(); + registerUUIDv7Generator(factory); + registerUUIDv7Generator(factory); + registerUUIDv7Generator(factory); } - } - - diff --git a/src/Functions/serial.cpp b/src/Functions/serial.cpp index 4f336013ca8..1745e17b5e7 100644 --- a/src/Functions/serial.cpp +++ b/src/Functions/serial.cpp @@ -7,6 +7,9 @@ #include #include #include "Common/Logger.h" +#include "Common/ZooKeeper/IKeeper.h" +#include "Common/ZooKeeper/KeeperException.h" +#include "Common/ZooKeeper/Types.h" #include namespace DB { @@ -15,6 +18,7 @@ namespace ErrorCodes { extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int ILLEGAL_TYPE_OF_ARGUMENT; + extern const int KEEPER_EXCEPTION; } class FunctionSerial : public IFunction @@ -69,6 +73,15 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override { + if (zk == nullptr) { + throw Exception(ErrorCodes::KEEPER_EXCEPTION, + "ZooKeeper is not configured for function {}", + getName()); + } + if (zk->expired()) { + zk = context->getZooKeeper(); + } + auto col_res = ColumnVector::create(); typename ColumnVector::Container & vec_to = col_res->getData(); size_t size = input_rows_count; @@ -77,78 +90,32 @@ public: const auto & serial_path = "/serials/" + arguments[0].column->getDataAt(0).toString(); - // if serial name used first time - zk->createAncestors(serial_path); - zk->createIfNotExists(serial_path, ""); + // CAS in ZooKeeper + // `get` value and version, `trySet` new with version check + // I didn't get how to do it with `multi` Int64 counter; + std::string counter_path = serial_path + "/counter"; - if (zk != nullptr) { - // Get Lock in ZooKeeper - // https://zookeeper.apache.org/doc/r3.2.2/recipes.html + // if serial name used first time + zk->createAncestors(counter_path); + zk->createIfNotExists(counter_path, "1"); - // 1. - if (zk->expired()) { - zk = context->getZooKeeper(); + Coordination::Stat stat; + while (true) { + std::string counter_string = zk->get(counter_path, &stat); + counter = std::stoll(counter_string); + std::string updated_counter = std::to_string(counter + input_rows_count); + Coordination::Error err = zk->trySet(counter_path, updated_counter); + if (err == Coordination::Error::ZOK) { + // CAS is done + break; } - - std::string lock_path = serial_path + "/lock-"; - std::string path_created = zk->create(lock_path, "", zkutil::CreateMode::EphemeralSequential); - Int64 created_sequence_number = std::stoll(path_created.substr(lock_path.size(), path_created.size() - lock_path.size())); - - while (true) { - // 2. - zkutil::Strings children = zk->getChildren(serial_path); - - // 3. 
- Int64 lowest_child_sequence_number = -1; - for (auto& child : children) { - if (child == "counter") { - continue; - } - std::string child_suffix = child.substr(5, 10); - Int64 seq_number = std::stoll(child_suffix); - - if (lowest_child_sequence_number == -1 || seq_number < lowest_child_sequence_number) { - lowest_child_sequence_number = seq_number; - } - } - - if (lowest_child_sequence_number == created_sequence_number) { - break; - // we have a lock in ZooKeeper, now can get the counter value - } - - // 4. and 5. - Int64 prev_seq_number = created_sequence_number - 1; - std::string to_wait_key = std::to_string(prev_seq_number); - while (to_wait_key.size() != 10) { - to_wait_key = "0" + to_wait_key; - } - - zk->waitForDisappear(lock_path + to_wait_key); + if (err != Coordination::Error::ZBADVERSION) { + throw Exception(ErrorCodes::KEEPER_EXCEPTION, + "ZooKeeper trySet operation failed with unexpected error = {} in function {}", + err, getName()); } - - // Now we have a lock - // Update counter in ZooKeeper - std::string counter_path = serial_path + "/counter"; - if (zk->exists(counter_path)) { - std::string counter_string = zk->get(counter_path, nullptr); - counter = std::stoll(counter_string); - - LOG_INFO(getLogger("Serial Function"), "Got counter from Zookeeper = {}", counter); - } else { - counter = 1; - } - zk->createOrUpdate(counter_path, std::to_string(counter + input_rows_count), zkutil::CreateMode::Persistent); - - // Unlock = delete node created on step 1. - zk->deleteEphemeralNodeIfContentMatches(path_created, ""); - } else { - // ZooKeeper is not available - // What to do? - - counter = 1; } // Make a result @@ -157,7 +124,6 @@ public: ++counter; } - return col_res; } @@ -165,7 +131,39 @@ public: REGISTER_FUNCTION(Serial) { - factory.registerFunction(); + factory.registerFunction(FunctionDocumentation + { + .description=R"( +Generates and returns sequential numbers starting from the previous counter value. +This function takes a constant string argument - a series identifier. +The server should be configured with a ZooKeeper. 
+)", + .syntax = "serial(identifier)", + .arguments{ + {"series identifier", "Series identifier (String)"} + }, + .returned_value = "Sequential numbers of type Int64 starting from the previous counter value", + .examples{ + {"first call", "SELECT serial('name')", R"( +┌─serial('name')─┐ +│ 1 │ +└────────────────┘)"}, + {"second call", "SELECT serial('name')", R"( +┌─serial('name')─┐ +│ 2 │ +└────────────────┘)"}, + {"column call", "SELECT *, serial('name') FROM test_table", R"( +┌─CounterID─┬─UserID─┬─ver─┬─serial('name')─┐ +│ 1 │ 3 │ 3 │ 3 │ +│ 1 │ 1 │ 1 │ 4 │ +│ 1 │ 2 │ 2 │ 5 │ +│ 1 │ 5 │ 5 │ 6 │ +│ 1 │ 4 │ 4 │ 7 │ +└───────────┴────────┴─────┴────────────────┘ + )"}}, + .categories{"Unique identifiers"} + }); + } } diff --git a/tests/queries/0_stateless/03129_serial_test_zookeeper.reference b/tests/queries/0_stateless/03129_serial_test_zookeeper.reference new file mode 100644 index 00000000000..60714f4064f --- /dev/null +++ b/tests/queries/0_stateless/03129_serial_test_zookeeper.reference @@ -0,0 +1,8 @@ +1 +2 +1 3 3 3 +1 1 1 4 +1 2 2 5 +1 5 5 6 +1 4 4 7 +1 diff --git a/tests/queries/0_stateless/03129_serial_test_zookeeper.sql b/tests/queries/0_stateless/03129_serial_test_zookeeper.sql new file mode 100644 index 00000000000..3eacd1ae908 --- /dev/null +++ b/tests/queries/0_stateless/03129_serial_test_zookeeper.sql @@ -0,0 +1,20 @@ +SELECT serial('x'); +SELECT serial('x'); + +DROP TABLE IF EXISTS default.test_table; + +CREATE TABLE test_table +( + CounterID UInt32, + UserID UInt32, + ver UInt16 +) ENGINE = ReplicatedReplacingMergeTree('/clickhouse/tables/1-1/test_table', 'x', ver) +PARTITION BY CounterID +ORDER BY (CounterID, intHash32(UserID)) +SAMPLE BY intHash32(UserID); + +INSERT INTO test_table VALUES (1, 1, 1), (1, 2, 2), (1, 3, 3), (1, 4, 4), (1, 5, 5); + +SELECT *, serial('x') FROM test_table; + +SELECT serial('y'); \ No newline at end of file From 9789d130a6cad5da2941037d91c69d9d63aa2733 Mon Sep 17 00:00:00 2001 From: Danila Puzov Date: Mon, 13 May 2024 01:11:23 +0300 Subject: [PATCH 117/392] Tests and docs for generateSnowflakeID and fixes --- src/Functions/generateSnowflakeID.cpp | 144 +++++++++++++----- src/Functions/serial.cpp | 36 ++--- .../03129_serial_test_zookeeper.reference | 15 +- .../03129_serial_test_zookeeper.sql | 24 +-- .../03130_generate_snowflake_id.reference | 3 + .../03130_generate_snowflake_id.sql | 11 ++ 6 files changed, 154 insertions(+), 79 deletions(-) create mode 100644 tests/queries/0_stateless/03130_generate_snowflake_id.reference create mode 100644 tests/queries/0_stateless/03130_generate_snowflake_id.sql diff --git a/src/Functions/generateSnowflakeID.cpp b/src/Functions/generateSnowflakeID.cpp index dd837a58325..1decda0ab46 100644 --- a/src/Functions/generateSnowflakeID.cpp +++ b/src/Functions/generateSnowflakeID.cpp @@ -1,7 +1,11 @@ -#include #include +#include #include +#include #include +#include +#include + namespace DB { @@ -38,15 +42,32 @@ constexpr auto machine_seq_num_size = 12; constexpr int64_t timestamp_mask = ((1LL << timestamp_size) - 1) << (machine_id_size + machine_seq_num_size); constexpr int64_t machine_id_mask = ((1LL << machine_id_size) - 1) << machine_seq_num_size; constexpr int64_t machine_seq_num_mask = (1LL << machine_seq_num_size) - 1; +constexpr int64_t max_machine_seq_num = machine_seq_num_mask; + +Int64 getMachineID() +{ + auto serverUUID = ServerUUID::get(); + + // hash serverUUID into 64 bits + Int64 h = UUIDHelpers::getHighBytes(serverUUID); + Int64 l = UUIDHelpers::getLowBytes(serverUUID); + return ((h * 11) ^ (l * 17)) 
& machine_id_mask; +} + +Int64 getTimestamp() +{ + const auto tm_point = std::chrono::system_clock::now(); + return std::chrono::duration_cast( + tm_point.time_since_epoch()).count() & ((1LL << timestamp_size) - 1); +} } class FunctionSnowflakeID : public IFunction { private: - mutable std::atomic state{0}; - // previous snowflake id - // state is 1 atomic value because we don't want use mutex + mutable std::atomic lowest_available_snowflake_id{0}; + // 1 atomic value because we don't want to use mutex public: static constexpr auto name = "generateSnowflakeID"; @@ -58,23 +79,19 @@ public: String getName() const override { return name; } size_t getNumberOfArguments() const override { return 0; } - + bool isDeterministic() const override { return false; } bool isDeterministicInScopeOfQuery() const override { return false; } bool useDefaultImplementationForNulls() const override { return false; } bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } bool isVariadic() const override { return true; } - bool isStateful() const override { return true; } - bool isDeterministic() const override { return false; } - DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override { - if (arguments.size() > 1) { + if (!arguments.empty()) { throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, - "Number of arguments for function {} doesn't match: passed {}, should be 0 or 1.", + "Number of arguments for function {} doesn't match: passed {}, should be 0.", getName(), arguments.size()); } - return std::make_shared(); } @@ -83,36 +100,57 @@ public: { auto col_res = ColumnVector::create(); typename ColumnVector::Container & vec_to = col_res->getData(); - size_t size = input_rows_count; - vec_to.resize(size); + Int64 size64 = static_cast(input_rows_count); + vec_to.resize(input_rows_count); - auto serverUUID = ServerUUID::get(); + if (input_rows_count == 0) { + return col_res; + } - // hash serverUUID into 32 bytes - Int64 h = UUIDHelpers::getHighBytes(serverUUID); - Int64 l = UUIDHelpers::getLowBytes(serverUUID); - Int64 machine_id = ((h * 11) ^ (l * 17)) & machine_id_mask; + Int64 machine_id = getMachineID(); + Int64 current_timestamp = getTimestamp(); + Int64 current_machine_seq_num; - for (Int64 & el : vec_to) { - const auto tm_point = std::chrono::system_clock::now(); - Int64 current_timestamp = std::chrono::duration_cast( - tm_point.time_since_epoch()).count() & ((1LL << timestamp_size) - 1); + Int64 available_id, next_available_id; + do + { + available_id = lowest_available_snowflake_id.load(); + Int64 available_timestamp = (available_id & timestamp_mask) >> (machine_id_size + machine_seq_num_size); + Int64 available_machine_seq_num = available_id & machine_seq_num_mask; - Int64 last_state, new_state; - do { - last_state = state.load(); - Int64 last_timestamp = (last_state & timestamp_mask) >> (machine_id_size + machine_seq_num_size); - Int64 machine_seq_num = last_state & machine_seq_num_mask; + if (current_timestamp > available_timestamp) + { + current_machine_seq_num = 0; + } + else + { + current_timestamp = available_timestamp; + current_machine_seq_num = available_machine_seq_num; + } - if (current_timestamp == last_timestamp) { - ++machine_seq_num; - } - new_state = (current_timestamp << (machine_id_size + machine_seq_num_size)) | machine_id | machine_seq_num; - } while (!state.compare_exchange_strong(last_state, new_state)); - // failed CAS => another thread updated state - // successful CAS 
=> we have unique (timestamp, machine_seq_num) on this machine + // calculate new `lowest_available_snowflake_id` + Int64 new_timestamp; + Int64 seq_nums_in_current_timestamp_left = (max_machine_seq_num - current_machine_seq_num + 1); + if (size64 >= seq_nums_in_current_timestamp_left) { + new_timestamp = current_timestamp + 1 + (size64 - seq_nums_in_current_timestamp_left) / max_machine_seq_num; + } else { + new_timestamp = current_timestamp; + } + Int64 new_machine_seq_num = (current_machine_seq_num + size64) & machine_seq_num_mask; + next_available_id = (new_timestamp << (machine_id_size + machine_seq_num_size)) | machine_id | new_machine_seq_num; + } + while (!lowest_available_snowflake_id.compare_exchange_strong(available_id, next_available_id)); + // failed CAS => another thread updated `lowest_available_snowflake_id` + // successful CAS => we have our range of exclusive values - el = new_state; + for (Int64 & el : vec_to) + { + el = (current_timestamp << (machine_id_size + machine_seq_num_size)) | machine_id | current_machine_seq_num; + if (current_machine_seq_num++ == max_machine_seq_num) + { + current_machine_seq_num = 0; + ++current_timestamp; + } } return col_res; @@ -122,7 +160,41 @@ public: REGISTER_FUNCTION(GenerateSnowflakeID) { - factory.registerFunction(); + factory.registerFunction(FunctionDocumentation + { + .description=R"( +Generates Snowflake ID -- unique identificators contains: +- The first 41 (+ 1 top zero bit) bits is timestamp in Unix time milliseconds +- The middle 10 bits are the machine ID. +- The last 12 bits decode to number of ids processed by the machine at the given millisecond. + +In case the number of ids processed overflows, the timestamp field is incremented by 1 and the counter is reset to 0. +This function guarantees strict monotony on 1 machine and differences in values obtained on different machines. 
+)", + .syntax = "generateSnowflakeID()", + .arguments{}, + .returned_value = "Column of Int64", + .examples{ + {"single call", "SELECT generateSnowflakeID();", R"( +┌─generateSnowflakeID()─┐ +│ 7195510166884597760 │ +└───────────────────────┘)"}, + {"column call", "SELECT generateSnowflakeID() FROM numbers(10);", R"( +┌─generateSnowflakeID()─┐ +│ 7195516038159417344 │ +│ 7195516038159417345 │ +│ 7195516038159417346 │ +│ 7195516038159417347 │ +│ 7195516038159417348 │ +│ 7195516038159417349 │ +│ 7195516038159417350 │ +│ 7195516038159417351 │ +│ 7195516038159417352 │ +│ 7195516038159417353 │ +└───────────────────────┘)"}, + }, + .categories{"Unique identifiers", "Snowflake ID"} + }); } } diff --git a/src/Functions/serial.cpp b/src/Functions/serial.cpp index 1745e17b5e7..3da2f4ce218 100644 --- a/src/Functions/serial.cpp +++ b/src/Functions/serial.cpp @@ -1,18 +1,11 @@ -#include -#include -#include -#include +#include #include #include #include #include -#include "Common/Logger.h" -#include "Common/ZooKeeper/IKeeper.h" -#include "Common/ZooKeeper/KeeperException.h" -#include "Common/ZooKeeper/Types.h" -#include -namespace DB { +namespace DB +{ namespace ErrorCodes { @@ -62,30 +55,26 @@ public: throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Number of arguments for function {} doesn't match: passed {}, should be 1.", getName(), arguments.size()); - if (!isStringOrFixedString(arguments[0])) { + if (!isStringOrFixedString(arguments[0])) throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Type of argument for function {} doesn't match: passed {}, should be string", getName(), arguments[0]->getName()); - } return std::make_shared(); } ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override { - if (zk == nullptr) { + if (zk == nullptr) throw Exception(ErrorCodes::KEEPER_EXCEPTION, "ZooKeeper is not configured for function {}", getName()); - } - if (zk->expired()) { + if (zk->expired()) zk = context->getZooKeeper(); - } auto col_res = ColumnVector::create(); typename ColumnVector::Container & vec_to = col_res->getData(); size_t size = input_rows_count; - LOG_INFO(getLogger("Serial Function"), "Size = {}", size); vec_to.resize(size); const auto & serial_path = "/serials/" + arguments[0].column->getDataAt(0).toString(); @@ -102,16 +91,19 @@ public: zk->createIfNotExists(counter_path, "1"); Coordination::Stat stat; - while (true) { + while (true) + { std::string counter_string = zk->get(counter_path, &stat); counter = std::stoll(counter_string); std::string updated_counter = std::to_string(counter + input_rows_count); Coordination::Error err = zk->trySet(counter_path, updated_counter); - if (err == Coordination::Error::ZOK) { + if (err == Coordination::Error::ZOK) + { // CAS is done break; } - if (err != Coordination::Error::ZBADVERSION) { + if (err != Coordination::Error::ZBADVERSION) + { throw Exception(ErrorCodes::KEEPER_EXCEPTION, "ZooKeeper trySet operation failed with unexpected error = {} in function {}", err, getName()); @@ -119,7 +111,8 @@ public: } // Make a result - for (auto& val : vec_to) { + for (auto& val : vec_to) + { val = counter; ++counter; } @@ -163,7 +156,6 @@ The server should be configured with a ZooKeeper. 
)"}}, .categories{"Unique identifiers"} }); - } } diff --git a/tests/queries/0_stateless/03129_serial_test_zookeeper.reference b/tests/queries/0_stateless/03129_serial_test_zookeeper.reference index 60714f4064f..479030db4be 100644 --- a/tests/queries/0_stateless/03129_serial_test_zookeeper.reference +++ b/tests/queries/0_stateless/03129_serial_test_zookeeper.reference @@ -1,8 +1,13 @@ 1 2 -1 3 3 3 -1 1 1 4 -1 2 2 5 -1 5 5 6 -1 4 4 7 1 +3 +4 +5 +6 +7 +1 1 +2 2 +3 3 +4 4 +5 5 diff --git a/tests/queries/0_stateless/03129_serial_test_zookeeper.sql b/tests/queries/0_stateless/03129_serial_test_zookeeper.sql index 3eacd1ae908..c3395009477 100644 --- a/tests/queries/0_stateless/03129_serial_test_zookeeper.sql +++ b/tests/queries/0_stateless/03129_serial_test_zookeeper.sql @@ -1,20 +1,12 @@ +-- Tags: zookeeper + SELECT serial('x'); SELECT serial('x'); +SELECT serial('y'); +SELECT serial('x') FROM numbers(5); -DROP TABLE IF EXISTS default.test_table; +SELECT serial(); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } +SELECT serial('x', 'y'); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } +SELECT serial(1); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } -CREATE TABLE test_table -( - CounterID UInt32, - UserID UInt32, - ver UInt16 -) ENGINE = ReplicatedReplacingMergeTree('/clickhouse/tables/1-1/test_table', 'x', ver) -PARTITION BY CounterID -ORDER BY (CounterID, intHash32(UserID)) -SAMPLE BY intHash32(UserID); - -INSERT INTO test_table VALUES (1, 1, 1), (1, 2, 2), (1, 3, 3), (1, 4, 4), (1, 5, 5); - -SELECT *, serial('x') FROM test_table; - -SELECT serial('y'); \ No newline at end of file +SELECT serial('z'), serial('z') FROM numbers(5); diff --git a/tests/queries/0_stateless/03130_generate_snowflake_id.reference b/tests/queries/0_stateless/03130_generate_snowflake_id.reference new file mode 100644 index 00000000000..2049ba26379 --- /dev/null +++ b/tests/queries/0_stateless/03130_generate_snowflake_id.reference @@ -0,0 +1,3 @@ +1 +1 +10 diff --git a/tests/queries/0_stateless/03130_generate_snowflake_id.sql b/tests/queries/0_stateless/03130_generate_snowflake_id.sql new file mode 100644 index 00000000000..669814c9ecb --- /dev/null +++ b/tests/queries/0_stateless/03130_generate_snowflake_id.sql @@ -0,0 +1,11 @@ +SELECT bitShiftLeft(toUInt64(generateSnowflakeID()), 52) = 0; +SELECT bitAnd(bitShiftRight(toUInt64(generateSnowflakeID()), 63), 1) = 0; + +SELECT generateSnowflakeID(1); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } + +SELECT count(*) +FROM +( + SELECT DISTINCT generateSnowflakeID() + FROM numbers(10) +) \ No newline at end of file From 8e63d2f795d4e653ff4885212919725a7bb6a074 Mon Sep 17 00:00:00 2001 From: v01dxyz Date: Mon, 13 May 2024 09:21:01 +0200 Subject: [PATCH 118/392] Compress STDOUT if redirected to file with a compression extension * Add a new member to ClientBase: default_output_compression_method * Move the code to get file path from file descriptor to a separate Common function. The stateless test is almost a copy-paste of 02001_compress_output_file. 
Fixes https://github.com/ClickHouse/ClickHouse/issues/63496 --- programs/client/Client.cpp | 2 +- programs/local/LocalServer.cpp | 2 +- src/Client/ClientBase.cpp | 10 +++++- src/Client/ClientBase.h | 3 +- src/Common/tryGetFileNameByFileDescriptor.cpp | 33 +++++++++++++++++++ src/Common/tryGetFileNameByFileDescriptor.h | 10 ++++++ src/Formats/FormatFactory.cpp | 22 ++++--------- .../03144_compress_stdout.reference | 2 ++ .../0_stateless/03144_compress_stdout.sh | 23 +++++++++++++ 9 files changed, 88 insertions(+), 19 deletions(-) create mode 100644 src/Common/tryGetFileNameByFileDescriptor.cpp create mode 100644 src/Common/tryGetFileNameByFileDescriptor.h create mode 100644 tests/queries/0_stateless/03144_compress_stdout.reference create mode 100755 tests/queries/0_stateless/03144_compress_stdout.sh diff --git a/programs/client/Client.cpp b/programs/client/Client.cpp index 396cd3e646b..9ae5dd735ed 100644 --- a/programs/client/Client.cpp +++ b/programs/client/Client.cpp @@ -1178,7 +1178,7 @@ void Client::processConfig() pager = config().getString("pager", ""); - setDefaultFormatsFromConfiguration(); + setDefaultFormatsAndCompressionFromConfiguration(); global_context->setClientName(std::string(DEFAULT_CLIENT_NAME)); global_context->setQueryKindInitial(); diff --git a/programs/local/LocalServer.cpp b/programs/local/LocalServer.cpp index 5f2a51406e1..f18c0306254 100644 --- a/programs/local/LocalServer.cpp +++ b/programs/local/LocalServer.cpp @@ -611,7 +611,7 @@ void LocalServer::processConfig() if (config().has("macros")) global_context->setMacros(std::make_unique(config(), "macros", log)); - setDefaultFormatsFromConfiguration(); + setDefaultFormatsAndCompressionFromConfiguration(); /// Sets external authenticators config (LDAP, Kerberos). global_context->setExternalAuthenticatorsConfig(config()); diff --git a/src/Client/ClientBase.cpp b/src/Client/ClientBase.cpp index bd4430648c5..61d95e6eb4c 100644 --- a/src/Client/ClientBase.cpp +++ b/src/Client/ClientBase.cpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -643,6 +644,9 @@ try bool extras_into_stdout = need_render_progress || logs_into_stdout; bool select_only_into_file = select_into_file && !select_into_file_and_stdout; + if (!out_file_buf && default_output_compression_method != CompressionMethod::None) + out_file_buf = wrapWriteBufferWithCompressionMethod(out_buf, default_output_compression_method, 3, 0); + /// It is not clear how to write progress and logs /// intermixed with data with parallel formatting. /// It may increase code complexity significantly. @@ -735,7 +739,7 @@ bool ClientBase::isRegularFile(int fd) return fstat(fd, &file_stat) == 0 && S_ISREG(file_stat.st_mode); } -void ClientBase::setDefaultFormatsFromConfiguration() +void ClientBase::setDefaultFormatsAndCompressionFromConfiguration() { if (config().has("output-format")) { @@ -759,6 +763,10 @@ void ClientBase::setDefaultFormatsFromConfiguration() default_output_format = *format_from_file_name; else default_output_format = "TSV"; + + std::optional file_name = tryGetFileNameFromFileDescriptor(STDOUT_FILENO); + if (file_name) + default_output_compression_method = chooseCompressionMethod(*file_name, ""); } else if (is_interactive) { diff --git a/src/Client/ClientBase.h b/src/Client/ClientBase.h index 64cbdbe8989..7a0489641c8 100644 --- a/src/Client/ClientBase.h +++ b/src/Client/ClientBase.h @@ -190,7 +190,7 @@ protected: /// Adjust some settings after command line options and config had been processed. 
void adjustSettings(); - void setDefaultFormatsFromConfiguration(); + void setDefaultFormatsAndCompressionFromConfiguration(); void initTTYBuffer(ProgressOption progress); @@ -224,6 +224,7 @@ protected: String pager; String default_output_format; /// Query results output format. + CompressionMethod default_output_compression_method = CompressionMethod::None; String default_input_format; /// Tables' format for clickhouse-local. bool select_into_file = false; /// If writing result INTO OUTFILE. It affects progress rendering. diff --git a/src/Common/tryGetFileNameByFileDescriptor.cpp b/src/Common/tryGetFileNameByFileDescriptor.cpp new file mode 100644 index 00000000000..47e81050388 --- /dev/null +++ b/src/Common/tryGetFileNameByFileDescriptor.cpp @@ -0,0 +1,33 @@ +#include + +#ifdef OS_LINUX +# include +#elif defined(OS_DARWIN) +# include +#endif + +#include + + +namespace DB +{ +std::optional tryGetFileNameFromFileDescriptor(int fd) +{ +#ifdef OS_LINUX + std::string proc_path = fmt::format("/proc/self/fd/{}", fd); + char file_path[PATH_MAX] = {'\0'}; + if (readlink(proc_path.c_str(), file_path, sizeof(file_path) - 1) != -1) + return file_path; + return std::nullopt; +#elif defined(OS_DARWIN) + char file_path[PATH_MAX] = {'\0'}; + if (fcntl(fd, F_GETPATH, file_path) != -1) + return file_path; + return std::nullopt; +#else + (void)fd; + return std::nullopt; +#endif +} + +} diff --git a/src/Common/tryGetFileNameByFileDescriptor.h b/src/Common/tryGetFileNameByFileDescriptor.h new file mode 100644 index 00000000000..c38ccb4f851 --- /dev/null +++ b/src/Common/tryGetFileNameByFileDescriptor.h @@ -0,0 +1,10 @@ +#pragma once + +#include +#include + +namespace DB +{ +/// Supports only Linux/MacOS. On other platforms, returns nullopt. +std::optional tryGetFileNameFromFileDescriptor(int fd); +} diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index b7e9899da46..783daba44fd 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -1,6 +1,7 @@ #include #include +#include #include #include #include @@ -15,7 +16,7 @@ #include #include #include -#include +#include #include @@ -692,21 +693,12 @@ String FormatFactory::getFormatFromFileName(String file_name) std::optional FormatFactory::tryGetFormatFromFileDescriptor(int fd) { -#ifdef OS_LINUX - std::string proc_path = fmt::format("/proc/self/fd/{}", fd); - char file_path[PATH_MAX] = {'\0'}; - if (readlink(proc_path.c_str(), file_path, sizeof(file_path) - 1) != -1) - return tryGetFormatFromFileName(file_path); + std::optional file_name = tryGetFileNameFromFileDescriptor(fd); + + if (file_name) + return tryGetFormatFromFileName(*file_name); + return std::nullopt; -#elif defined(OS_DARWIN) - char file_path[PATH_MAX] = {'\0'}; - if (fcntl(fd, F_GETPATH, file_path) != -1) - return tryGetFormatFromFileName(file_path); - return std::nullopt; -#else - (void)fd; - return std::nullopt; -#endif } String FormatFactory::getFormatFromFileDescriptor(int fd) diff --git a/tests/queries/0_stateless/03144_compress_stdout.reference b/tests/queries/0_stateless/03144_compress_stdout.reference new file mode 100644 index 00000000000..6f51dfc24e1 --- /dev/null +++ b/tests/queries/0_stateless/03144_compress_stdout.reference @@ -0,0 +1,2 @@ +Hello, World! From client. +Hello, World! From local. 
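A rough usage sketch of the behavior this patch enables (file paths and queries are illustrative and not part of the patch):

```bash
# stdout is redirected to a file whose name ends in a known compression extension (.gz),
# so the client now writes its output through the matching compression codec
clickhouse-local --query "SELECT 'Hello, World!'" > /tmp/result.gz
gunzip -c /tmp/result.gz     # prints: Hello, World!

# without a recognized extension, default_output_compression_method stays None and the output is plain
clickhouse-local --query "SELECT 'Hello, World!'" > /tmp/result.txt
cat /tmp/result.txt          # prints: Hello, World!
```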
diff --git a/tests/queries/0_stateless/03144_compress_stdout.sh b/tests/queries/0_stateless/03144_compress_stdout.sh new file mode 100755 index 00000000000..569754303a7 --- /dev/null +++ b/tests/queries/0_stateless/03144_compress_stdout.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +set -e + +[ -e "${CLICKHOUSE_TMP}"/test_compression_of_output_file_from_client.gz ] && rm "${CLICKHOUSE_TMP}"/test_compression_of_output_file_from_client.gz + +${CLICKHOUSE_CLIENT} --query "SELECT * FROM (SELECT 'Hello, World! From client.')" > ${CLICKHOUSE_TMP}/test_compression_of_output_file_from_client.gz +gunzip ${CLICKHOUSE_TMP}/test_compression_of_output_file_from_client.gz +cat ${CLICKHOUSE_TMP}/test_compression_of_output_file_from_client + +rm -f "${CLICKHOUSE_TMP}/test_compression_of_output_file_from_client" + +[ -e "${CLICKHOUSE_TMP}"/test_compression_of_output_file_from_local.gz ] && rm "${CLICKHOUSE_TMP}"/test_compression_of_output_file_from_local.gz + +${CLICKHOUSE_LOCAL} --query "SELECT * FROM (SELECT 'Hello, World! From local.')" > ${CLICKHOUSE_TMP}/test_compression_of_output_file_from_local.gz +gunzip ${CLICKHOUSE_TMP}/test_compression_of_output_file_from_local.gz +cat ${CLICKHOUSE_TMP}/test_compression_of_output_file_from_local + +rm -f "${CLICKHOUSE_TMP}/test_compression_of_output_file_from_local" From 6a94ba370a3a294f7f2b1471214be6ecfd6eaa7b Mon Sep 17 00:00:00 2001 From: Salvatore Mesoraca Date: Mon, 13 May 2024 09:43:03 +0200 Subject: [PATCH 119/392] Fix clang-tidy errors --- src/Functions/FunctionsHashing.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Functions/FunctionsHashing.h b/src/Functions/FunctionsHashing.h index bccdba5ee69..1091ec6c86f 100644 --- a/src/Functions/FunctionsHashing.h +++ b/src/Functions/FunctionsHashing.h @@ -94,8 +94,8 @@ namespace impl i = 0; if (offsets != nullptr) { - const auto begin = offsets->begin(); - auto upper = std::upper_bound(begin, offsets->end(), i); + const auto *const begin = offsets->begin(); + const auto * upper = std::upper_bound(begin, offsets->end(), i); if (upper == offsets->end()) throw Exception(ErrorCodes::LOGICAL_ERROR, "offset {} not found in function SipHashKeyColumns::getKey", i); i = upper - begin; From f1f668e7df24190eaf4f1d67360b9e53099289d2 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Fri, 10 May 2024 14:15:01 +0200 Subject: [PATCH 120/392] Setup node generator initial --- utils/keeper-bench/Runner.cpp | 288 ++++++++++++++++++++++++++++++---- utils/keeper-bench/Runner.h | 3 + utils/keeper-bench/main.cpp | 2 + 3 files changed, 265 insertions(+), 28 deletions(-) diff --git a/utils/keeper-bench/Runner.cpp b/utils/keeper-bench/Runner.cpp index a893dac3851..0050230b6ec 100644 --- a/utils/keeper-bench/Runner.cpp +++ b/utils/keeper-bench/Runner.cpp @@ -1,17 +1,22 @@ #include "Runner.h" #include -#include #include +#include +#include +#include #include "Common/ConcurrentBoundedQueue.h" +#include "Common/Exception.h" #include "Common/ZooKeeper/IKeeper.h" #include "Common/ZooKeeper/ZooKeeperArgs.h" #include "Common/ZooKeeper/ZooKeeperCommon.h" #include "Common/ZooKeeper/ZooKeeperConstants.h" #include #include +#include "Coordination/KeeperSnapshotManager.h" #include "Core/ColumnWithTypeAndName.h" #include "Core/ColumnsWithTypeAndName.h" +#include #include "IO/ReadBuffer.h" #include "IO/ReadBufferFromFile.h" #include "base/Decimal.h" @@ -43,12 +48,14 @@ Runner::Runner( std::optional 
concurrency_, const std::string & config_path, const std::string & input_request_log_, + const std::string & setup_nodes_snapshot_path_, const Strings & hosts_strings_, std::optional max_time_, std::optional delay_, std::optional continue_on_error_, std::optional max_iterations_) : input_request_log(input_request_log_) + , setup_nodes_snapshot_path(setup_nodes_snapshot_path_) , info(std::make_shared()) { @@ -381,18 +388,18 @@ struct ZooKeeperRequestBlock { explicit ZooKeeperRequestBlock(DB::Block block_) : block(std::move(block_)) - , hostname_idx(block.getPositionByName("hostname")) // - , request_event_time_idx(block.getPositionByName("request_event_time")) // - , thread_id_idx(block.getPositionByName("thread_id")) // - , session_id_idx(block.getPositionByName("session_id")) // - , xid_idx(block.getPositionByName("xid")) // + , hostname_idx(block.getPositionByName("hostname")) + , request_event_time_idx(block.getPositionByName("request_event_time")) + , thread_id_idx(block.getPositionByName("thread_id")) + , session_id_idx(block.getPositionByName("session_id")) + , xid_idx(block.getPositionByName("xid")) , has_watch_idx(block.getPositionByName("has_watch")) , op_num_idx(block.getPositionByName("op_num")) , path_idx(block.getPositionByName("path")) , data_idx(block.getPositionByName("data")) , is_ephemeral_idx(block.getPositionByName("is_ephemeral")) , is_sequential_idx(block.getPositionByName("is_sequential")) - , response_event_time_idx(block.getPositionByName("response_event_time")) // + , response_event_time_idx(block.getPositionByName("response_event_time")) , error_idx(block.getPositionByName("error")) , requests_size_idx(block.getPositionByName("requests_size")) , version_idx(block.getPositionByName("version")) @@ -519,6 +526,7 @@ struct RequestFromLog { Coordination::ZooKeeperRequestPtr request; std::optional expected_result; + std::vector> subrequest_expected_results; int64_t session_id = 0; size_t executor_id = 0; bool has_watch = false; @@ -586,7 +594,6 @@ struct ZooKeeperRequestFromLogReader idx_in_block = 0; } - request_from_log.expected_result = current_block->getError(idx_in_block); request_from_log.session_id = current_block->getSessionId(idx_in_block); request_from_log.has_watch = current_block->hasWatch(idx_in_block); @@ -693,6 +700,12 @@ struct ZooKeeperRequestFromLogReader if (!subrequest_from_log) throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "Failed to fetch subrequest for {}, subrequest index {}", op_num, i); + if (!subrequest_from_log->expected_result && request_from_log.expected_result + && request_from_log.expected_result == Coordination::Error::ZOK) + { + subrequest_from_log->expected_result = Coordination::Error::ZOK; + } + requests.push_back(std::move(subrequest_from_log->request)); if (subrequest_from_log->session_id != request_from_log.session_id) @@ -700,6 +713,8 @@ struct ZooKeeperRequestFromLogReader if (subrequest_from_log->executor_id != request_from_log.executor_id) throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "Executor id mismatch for subrequest in {}, subrequest index {}", op_num, i); + + request_from_log.subrequest_expected_results.push_back(subrequest_from_log->expected_result); } request_from_log.request = std::make_shared(requests, default_acls); @@ -731,7 +746,6 @@ private: namespace { - struct RequestFromLogStats { struct Stats @@ -744,6 +758,192 @@ struct RequestFromLogStats Stats read_requests; }; +struct SetupNodeCollector +{ + explicit SetupNodeCollector(const std::string & setup_nodes_snapshot_path) + { + if 
(setup_nodes_snapshot_path.empty()) + return; + + keeper_context = std::make_shared(true, std::make_shared()); + keeper_context->setDigestEnabled(true); + keeper_context->setSnapshotDisk( + std::make_shared("Keeper-snapshots", setup_nodes_snapshot_path)); + + snapshot_manager.emplace(1, keeper_context); + auto snapshot_result = snapshot_manager->restoreFromLatestSnapshot(); + if (snapshot_result.storage == nullptr) + { + std::cerr << "No initial snapshot found" << std::endl; + initial_storage = std::make_unique( + /* tick_time_ms */ 500, /* superdigest */ "", keeper_context, /* initialize_system_nodes */ false); + initial_storage->initializeSystemNodes(); + } + else + { + std::cerr << "Loaded initial nodes from snapshot" << std::endl; + initial_storage = std::move(snapshot_result.storage); + } + } + + void processRequest(const RequestFromLog & request_from_log) + { + if (!request_from_log.expected_result.has_value()) + return; + + auto process_request = [&](const Coordination::ZooKeeperRequest & request, const auto expected_result) + { + const auto & path = request.getPath(); + if (processed_paths.contains(path)) + return; + + auto op_num = request.getOpNum(); + + if (op_num == Coordination::OpNum::Create) + { + if (expected_result == Coordination::Error::ZNODEEXISTS) + { + addExpectedNode(path); + processed_paths.insert(path); + } + else if (expected_result == Coordination::Error::ZOK) + { + /// we need to make sure ancestors exist + auto position = path.find_last_of('/'); + if (position != 0) + { + auto parent_path = path.substr(0, position); + if (!processed_paths.contains(parent_path)) + { + addExpectedNode(parent_path); + processed_paths.insert(parent_path); + } + } + + processed_paths.insert(path); + } + } + else if (op_num == Coordination::OpNum::Remove) + { + if (expected_result == Coordination::Error::ZOK) + { + addExpectedNode(path); + processed_paths.insert(path); + } + } + else if (op_num == Coordination::OpNum::Set) + { + if (expected_result == Coordination::Error::ZOK) + { + addExpectedNode(path); + processed_paths.insert(path); + } + } + else if (op_num == Coordination::OpNum::Check) + { + if (expected_result == Coordination::Error::ZOK) + { + addExpectedNode(path); + processed_paths.insert(path); + } + } + else if (op_num == Coordination::OpNum::CheckNotExists) + { + if (expected_result == Coordination::Error::ZNODEEXISTS) + { + addExpectedNode(path); + processed_paths.insert(path); + } + } + else if (request.isReadRequest()) + { + if (expected_result == Coordination::Error::ZOK) + { + addExpectedNode(path); + processed_paths.insert(path); + } + } + }; + + const auto & request = request_from_log.request; + if (request->getOpNum() == Coordination::OpNum::Multi || request->getOpNum() == Coordination::OpNum::MultiRead) + { + const auto & multi_request = dynamic_cast(*request); + const auto & subrequests = multi_request.requests; + + for (size_t i = 0; i < subrequests.size(); ++i) + { + const auto & zookeeper_request = dynamic_cast(*subrequests[i]); + const auto subrequest_expected_result = request_from_log.subrequest_expected_results[i]; + if (subrequest_expected_result.has_value()) + process_request(zookeeper_request, *subrequest_expected_result); + + } + } + else + process_request(*request, *request_from_log.expected_result); + } + + void addExpectedNode(const std::string & path) + { + std::lock_guard lock(nodes_mutex); + + if (initial_storage->container.contains(path)) + return; + + std::cerr << "Adding expected node " << path << std::endl; + + Coordination::Requests 
create_ops; + + size_t pos = 1; + while (true) + { + pos = path.find('/', pos); + if (pos == std::string::npos) + break; + + auto request = zkutil::makeCreateRequest(path.substr(0, pos), "", zkutil::CreateMode::Persistent, true); + create_ops.emplace_back(request); + ++pos; + } + + auto request = zkutil::makeCreateRequest(path, "", zkutil::CreateMode::Persistent, true); + create_ops.emplace_back(request); + + auto next_zxid = initial_storage->getNextZXID(); + + static Coordination::ACLs default_acls = [] + { + Coordination::ACL acl; + acl.permissions = Coordination::ACL::All; + acl.scheme = "world"; + acl.id = "anyone"; + return Coordination::ACLs{std::move(acl)}; + }(); + + auto multi_create_request = std::make_shared(create_ops, default_acls); + initial_storage->preprocessRequest(multi_create_request, 1, 0, next_zxid, /* check_acl = */ false); + auto responses = initial_storage->processRequest(multi_create_request, 1, next_zxid, /* check_acl = */ false); + if (responses.size() > 1 || responses[0].response->error != Coordination::Error::ZOK) + throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Invalid response after trying to create a node {}", responses[0].response->error); + } + + void generateSnapshot() + { + std::cerr << "Generating snapshot with starting data" << std::endl; + std::lock_guard lock(nodes_mutex); + DB::SnapshotMetadataPtr snapshot_meta = std::make_shared(initial_storage->getZXID(), 1, std::make_shared()); + DB::KeeperStorageSnapshot snapshot(initial_storage.get(), snapshot_meta); + snapshot_manager->serializeSnapshotToDisk(snapshot); + } + + std::mutex nodes_mutex; + DB::KeeperContextPtr keeper_context; + Coordination::KeeperStoragePtr initial_storage; + std::unordered_set processed_paths; + std::optional snapshot_manager; +}; + void dumpStats(std::string_view type, const RequestFromLogStats::Stats & stats_for_type) { std::cerr << fmt::format( @@ -751,7 +951,7 @@ void dumpStats(std::string_view type, const RequestFromLogStats::Stats & stats_f type, stats_for_type.total, stats_for_type.unexpected_results, - static_cast(stats_for_type.unexpected_results) / stats_for_type.total * 100) + stats_for_type.total != 0 ? static_cast(stats_for_type.unexpected_results) / stats_for_type.total * 100 : 0.0) << std::endl; }; @@ -763,24 +963,40 @@ void requestFromLogExecutor(std::shared_ptr>(); last_request = request_promise->get_future(); - Coordination::ResponseCallback callback - = [&, request_promise, request = request_from_log.request, expected_result = request_from_log.expected_result]( - const Coordination::Response & response) mutable + Coordination::ResponseCallback callback = [&, + request_promise, + request = request_from_log.request, + expected_result = request_from_log.expected_result, + subrequest_expected_results = std::move(request_from_log.subrequest_expected_results)]( + const Coordination::Response & response) mutable { auto & stats = request->isReadRequest() ? 
request_stats.read_requests : request_stats.write_requests; stats.total.fetch_add(1, std::memory_order_relaxed); - if (*expected_result != response.error) - stats.unexpected_results.fetch_add(1, std::memory_order_relaxed); + if (expected_result) + { + if (*expected_result != response.error) + stats.unexpected_results.fetch_add(1, std::memory_order_relaxed); - //if (!expected_result) - // return; + if (*expected_result != response.error) + { + std::cerr << fmt::format( + "Unexpected result for {}\ngot {}, expected {}\n", request->toString(), response.error, *expected_result) + << std::endl; - //if (*expected_result != response.error) - // std::cerr << fmt::format( - // "Unexpected result for {}, got {}, expected {}", request->getOpNum(), response.error, *expected_result) - // << std::endl; + if (const auto * multi_response = dynamic_cast(&response)) + { + std::string subresponses; + for (size_t i = 0; i < multi_response->responses.size(); ++i) + { + subresponses += fmt::format("{} = {}\n", i, multi_response->responses[i]->error); + } + + std::cerr << "Subresponses\n" << subresponses << std::endl; + } + } + } request_promise->set_value(); }; @@ -827,6 +1043,9 @@ void Runner::runBenchmarkFromLog() RequestFromLogStats stats; + std::optional setup_nodes_collector; + if (!setup_nodes_snapshot_path.empty()) + setup_nodes_collector.emplace(setup_nodes_snapshot_path); std::unordered_map>> executor_id_to_queue; @@ -850,7 +1069,7 @@ void Runner::runBenchmarkFromLog() return; } - auto executor_queue = std::make_shared>(std::numeric_limits().max()); + auto executor_queue = std::make_shared>(std::numeric_limits::max()); executor_id_to_queue.emplace(request.executor_id, executor_queue); auto scheduled = pool->trySchedule([&, executor_queue]() mutable { @@ -865,6 +1084,7 @@ void Runner::runBenchmarkFromLog() throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Failed to push to the executor's queue"); }; + if (!setup_nodes_collector) { auto setup_connection = getConnection(connection_infos[0], 0); benchmark_context.startup(*setup_connection); @@ -875,14 +1095,26 @@ void Runner::runBenchmarkFromLog() delay_watch.restart(); while (auto request_from_log = request_reader.getNextRequest()) { - request_from_log->connection = get_zookeeper_connection(request_from_log->session_id); - push_request(std::move(*request_from_log)); + if (setup_nodes_collector) + { + setup_nodes_collector->processRequest(*request_from_log); + } + else + { + request_from_log->connection = get_zookeeper_connection(request_from_log->session_id); + push_request(std::move(*request_from_log)); + } if (delay > 0 && delay_watch.elapsedSeconds() > delay) { - dumpStats("Write", stats.write_requests); - dumpStats("Read", stats.read_requests); - std::cerr << std::endl; + if (setup_nodes_collector) + setup_nodes_collector->generateSnapshot(); + else + { + dumpStats("Write", stats.write_requests); + dumpStats("Read", stats.read_requests); + std::cerr << std::endl; + } delay_watch.restart(); } } @@ -906,7 +1138,7 @@ void Runner::runBenchmarkWithGenerator() for (size_t i = 0; i < concurrency; ++i) { auto thread_connections = connections; - pool->scheduleOrThrowOnError([this, connections_ = std::move(thread_connections)]() mutable { thread(connections_); }); + pool->scheduleOrThrowOnError([this, my_connections = std::move(thread_connections)]() mutable { thread(my_connections); }); } } catch (...) 
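For orientation, a possible two-pass invocation using the new option (the binary name, file names, and the idea of starting the benchmarked Keeper from the generated snapshot are assumptions for illustration, not taken from this patch):

```bash
# pass 1: scan the request log, infer which nodes the logged requests expect to exist,
#         and serialize them as a Keeper snapshot into the given directory
./keeper-bench --input-request-log /path/to/request_log --setup-nodes-snapshot-path ./setup_snapshot

# pass 2: once the target Keeper has been seeded with that starting state,
#         replay the same log without the option to run the actual benchmark
./keeper-bench --config benchmark.yaml --input-request-log /path/to/request_log
```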
diff --git a/utils/keeper-bench/Runner.h b/utils/keeper-bench/Runner.h index 0c646eb2166..c19a4d82898 100644 --- a/utils/keeper-bench/Runner.h +++ b/utils/keeper-bench/Runner.h @@ -27,6 +27,7 @@ public: void startup(Coordination::ZooKeeper & zookeeper); void cleanup(Coordination::ZooKeeper & zookeeper); + private: struct Node { @@ -54,6 +55,7 @@ public: std::optional concurrency_, const std::string & config_path, const std::string & input_request_log_, + const std::string & setup_nodes_snapshot_path_, const Strings & hosts_strings_, std::optional max_time_, std::optional delay_, @@ -96,6 +98,7 @@ private: std::shared_ptr getConnection(const ConnectionInfo & connection_info, size_t connection_info_idx); std::string input_request_log; + std::string setup_nodes_snapshot_path; size_t concurrency = 1; diff --git a/utils/keeper-bench/main.cpp b/utils/keeper-bench/main.cpp index 45fc28f3bca..0b963abf406 100644 --- a/utils/keeper-bench/main.cpp +++ b/utils/keeper-bench/main.cpp @@ -38,6 +38,7 @@ int main(int argc, char *argv[]) ("help", "produce help message") ("config", value()->default_value(""), "yaml/xml file containing configuration") ("input-request-log", value()->default_value(""), "log of requests that will be replayed") + ("setup-nodes-snapshot-path", value()->default_value(""), "directory containing snapshots with starting state") ("concurrency,c", value(), "number of parallel queries") ("report-delay,d", value(), "delay between intermediate reports in seconds (set 0 to disable reports)") ("iterations,i", value(), "amount of queries to be executed") @@ -60,6 +61,7 @@ int main(int argc, char *argv[]) Runner runner(valueToOptional(options["concurrency"]), options["config"].as(), options["input-request-log"].as(), + options["setup-nodes-snapshot-path"].as(), options["hosts"].as(), valueToOptional(options["time-limit"]), valueToOptional(options["report-delay"]), From 4653ec618d117f840cec5ba8c6d95895f0bbf4af Mon Sep 17 00:00:00 2001 From: avogar Date: Mon, 13 May 2024 13:43:47 +0000 Subject: [PATCH 121/392] Add more tests and documentation, fix existing tests and special build --- docs/en/sql-reference/data-types/dynamic.md | 86 ++++++++- src/Columns/ColumnDynamic.cpp | 7 + src/DataTypes/DataTypeDynamic.h | 2 +- ...9_dynamic_all_merge_algorithms_1.reference | 14 +- ... 
=> 03040_dynamic_type_alters_1.reference} | 0 ...ters.sh => 03040_dynamic_type_alters_1.sh} | 3 +- .../03040_dynamic_type_alters_2.reference | 182 ++++++++++++++++++ .../03040_dynamic_type_alters_2.sh | 57 ++++++ .../03041_dynamic_type_check_table.reference | 56 ++++++ .../03041_dynamic_type_check_table.sh | 45 +++++ 10 files changed, 442 insertions(+), 10 deletions(-) rename tests/queries/0_stateless/{03040_dynamic_type_alters.reference => 03040_dynamic_type_alters_1.reference} (100%) rename tests/queries/0_stateless/{03040_dynamic_type_alters.sh => 03040_dynamic_type_alters_1.sh} (57%) create mode 100644 tests/queries/0_stateless/03040_dynamic_type_alters_2.reference create mode 100755 tests/queries/0_stateless/03040_dynamic_type_alters_2.sh create mode 100644 tests/queries/0_stateless/03041_dynamic_type_check_table.reference create mode 100755 tests/queries/0_stateless/03041_dynamic_type_check_table.sh diff --git a/docs/en/sql-reference/data-types/dynamic.md b/docs/en/sql-reference/data-types/dynamic.md index e3cade25b55..a2c8ba532ce 100644 --- a/docs/en/sql-reference/data-types/dynamic.md +++ b/docs/en/sql-reference/data-types/dynamic.md @@ -261,7 +261,7 @@ SELECT d, dynamicType(d), d::Dynamic(max_types=1) as d2, dynamicType(d2) FROM te └─────────┴────────────────┴─────────┴─────────────────┘ ``` -## Reading Variant type from the data +## Reading Dynamic type from the data All text formats (TSV, CSV, CustomSeparated, Values, JSONEachRow, etc) supports reading `Dynamic` type. During data parsing ClickHouse tries to infer the type of each value and use it during insertion to `Dynamic` column. @@ -409,3 +409,87 @@ SELECT d, dynamicType(d) FROM test ORDER by d; └─────┴────────────────┘ ``` +## Reaching the limit in number of different data types stored inside Dynamic + +`Dynamic` data type can store only limited number of different data types inside. By default, this limit is 32, but you can change it in type declaration using syntax `Dynamic(max_types=N)` where N is between 1 and 255 (due to implementation details, it's impossible to have more than 255 different data types inside Dynamic). +When the limit is reached, all new data types inserted to `Dynamic` column will be casted to `String` and stored as `String` values. + +Let's see what happens when the limit is reached in different scenarios. + +### Reaching the limit during data parsing + +During parsing of `Dynamic` values from the data, when the limit is reached for current block of data, all new values will be inserted as `String` values: + +```sql +SELECT d, dynamicType(d) FROM format(JSONEachRow, 'd Dynamic(max_types=3)', ' +{"d" : 42} +{"d" : [1, 2, 3]} +{"d" : "Hello, World!"} +{"d" : "2020-01-01"} +{"d" : ["str1", "str2", "str3"]} +{"d" : {"a" : 1, "b" : [1, 2, 3]}} +') +``` + +```text +┌─d──────────────────────────┬─dynamicType(d)─┐ +│ 42 │ Int64 │ +│ [1,2,3] │ Array(Int64) │ +│ Hello, World! │ String │ +│ 2020-01-01 │ String │ +│ ["str1", "str2", "str3"] │ String │ +│ {"a" : 1, "b" : [1, 2, 3]} │ String │ +└────────────────────────────┴────────────────┘ +``` + +As we can see, after inserting 3 different data types `Int64`, `Array(Int64)` and `String` all new types were converted to `String`. + +### During merges of data parts in MergeTree table engines + +During merge of several data parts in MergeTree table the `Dynamic` column in the resulting data part can reach the limit of different data types inside and won't be able to store all types from source parts. 
+In this case ClickHouse chooses what types will remain after merge and what types will be casted to `String`. In most cases ClickHouse tries to keep the most frequent types and cast the rarest types to `String`, but it depends on the implementation. + +Let's see an example of such merge. First, let's create a table with `Dynamic` column, set the limit of different data types to `3` and insert values with `5` different types: + +```sql +CREATE TABLE test (id UInt64, d Dynamic(max_types=3)) engine=MergeTree ORDER BY id; +SYSTEM STOP MERGES test; +INSERT INTO test SELECT number, number FROM numbers(5); +INSERT INTO test SELECT number, range(number) FROM numbers(4); +INSERT INTO test SELECT number, toDate(number) FROM numbers(3); +INSERT INTO test SELECT number, map(number, number) FROM numbers(2); +INSERT INTO test SELECT number, 'str_' || toString(number) FROM numbers(1); +``` + +Each insert will create a separate data pert with `Dynamic` column containing single type: +```sql +SELECT count(), dynamicType(d), _part FROM test GROUP BY _part, dynamicType(d) ORDER BY _part; +``` + +```text +┌─count()─┬─dynamicType(d)──────┬─_part─────┐ +│ 5 │ UInt64 │ all_1_1_0 │ +│ 4 │ Array(UInt64) │ all_2_2_0 │ +│ 3 │ Date │ all_3_3_0 │ +│ 2 │ Map(UInt64, UInt64) │ all_4_4_0 │ +│ 1 │ String │ all_5_5_0 │ +└─────────┴─────────────────────┴───────────┘ +``` + +Now, let's merge all parts into one and see what will happen: + +```sql +SYSTEM START MERGES test; +OPTIMIZE TABLE test FINAL; +SELECT count(), dynamicType(d), _part FROM test GROUP BY _part, dynamicType(d) ORDER BY _part; +``` + +```text +┌─count()─┬─dynamicType(d)─┬─_part─────┐ +│ 5 │ UInt64 │ all_1_5_2 │ +│ 6 │ String │ all_1_5_2 │ +│ 4 │ Array(UInt64) │ all_1_5_2 │ +└─────────┴────────────────┴───────────┘ +``` + +As we can see, ClickHouse kept the most frequent types `UInt64` and `Array(UInt64)` and casted all other types to `String`. diff --git a/src/Columns/ColumnDynamic.cpp b/src/Columns/ColumnDynamic.cpp index 76f536a3409..0f247638d92 100644 --- a/src/Columns/ColumnDynamic.cpp +++ b/src/Columns/ColumnDynamic.cpp @@ -290,6 +290,13 @@ void ColumnDynamic::insertRangeFrom(const DB::IColumn & src_, size_t start, size /// We cannot combine 2 Variant types as total number of variants exceeds the limit. /// In this case we will add most frequent variants from this range and insert them as usual, /// all other variants will be converted to String. + /// TODO: instead of keeping all current variants and just adding new most frequent variants + /// from source columns we can also try to replace rarest existing variants with frequent + /// variants from source column (so we will avoid casting new frequent variants to String + /// and keeping rare existing ones). It will require rewriting of existing data in Variant + /// column but will improve usability of Dynamic column for example during squashing blocks + /// during insert. + const auto & src_variant_column = dynamic_src.getVariantColumn(); /// Calculate ranges for each variant in current range. 
diff --git a/src/DataTypes/DataTypeDynamic.h b/src/DataTypes/DataTypeDynamic.h index bd3d822fbb6..d5e4c5261ce 100644 --- a/src/DataTypes/DataTypeDynamic.h +++ b/src/DataTypes/DataTypeDynamic.h @@ -12,7 +12,7 @@ class DataTypeDynamic final : public IDataType public: static constexpr bool is_parametric = true; - DataTypeDynamic(size_t max_dynamic_types_ = DEFAULT_MAX_DYNAMIC_TYPES); + explicit DataTypeDynamic(size_t max_dynamic_types_ = DEFAULT_MAX_DYNAMIC_TYPES); TypeIndex getTypeId() const override { return TypeIndex::Dynamic; } const char * getFamilyName() const override { return "Dynamic"; } diff --git a/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_1.reference b/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_1.reference index a7fbbabcd46..4b4a1e2ab51 100644 --- a/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_1.reference +++ b/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_1.reference @@ -1,12 +1,12 @@ MergeTree compact + horizontal merge ReplacingMergeTree -100000 UInt64 100000 String +100000 UInt64 50000 UInt64 100000 String SummingMergeTree -100000 UInt64 100000 String +100000 UInt64 200000 1 50000 String 100000 UInt64 @@ -22,8 +22,8 @@ AggregatingMergeTree 100000 1 MergeTree wide + horizontal merge ReplacingMergeTree -100000 UInt64 100000 String +100000 UInt64 50000 UInt64 100000 String SummingMergeTree @@ -49,16 +49,16 @@ ReplacingMergeTree 50000 UInt64 100000 String SummingMergeTree -100000 UInt64 100000 String +100000 UInt64 200000 1 50000 String 100000 UInt64 50000 2 100000 1 AggregatingMergeTree -100000 UInt64 100000 String +100000 UInt64 200000 1 50000 String 100000 UInt64 @@ -66,8 +66,8 @@ AggregatingMergeTree 100000 1 MergeTree wide + vertical merge ReplacingMergeTree -100000 UInt64 100000 String +100000 UInt64 50000 UInt64 100000 String SummingMergeTree @@ -79,8 +79,8 @@ SummingMergeTree 50000 2 100000 1 AggregatingMergeTree -100000 UInt64 100000 String +100000 UInt64 200000 1 50000 String 100000 UInt64 diff --git a/tests/queries/0_stateless/03040_dynamic_type_alters.reference b/tests/queries/0_stateless/03040_dynamic_type_alters_1.reference similarity index 100% rename from tests/queries/0_stateless/03040_dynamic_type_alters.reference rename to tests/queries/0_stateless/03040_dynamic_type_alters_1.reference diff --git a/tests/queries/0_stateless/03040_dynamic_type_alters.sh b/tests/queries/0_stateless/03040_dynamic_type_alters_1.sh similarity index 57% rename from tests/queries/0_stateless/03040_dynamic_type_alters.sh rename to tests/queries/0_stateless/03040_dynamic_type_alters_1.sh index a20a92712e0..1f2a6a31ad7 100755 --- a/tests/queries/0_stateless/03040_dynamic_type_alters.sh +++ b/tests/queries/0_stateless/03040_dynamic_type_alters_1.sh @@ -7,7 +7,7 @@ CLICKHOUSE_LOG_COMMENT= # shellcheck source=../shell_config.sh . 
"$CUR_DIR"/../shell_config.sh -CH_CLIENT="$CLICKHOUSE_CLIENT --allow_merge_tree_settings --allow_experimental_dynamic_type=1 --allow_experimental_variant_type=1 --use_variant_as_common_type=1 --stacktrace --max_insert_threads 3 --group_by_two_level_threshold 1000000 --group_by_two_level_threshold_bytes 42526602 --distributed_aggregation_memory_efficient 1 --fsync_metadata 1 --output_format_parallel_formatting 0 --input_format_parallel_parsing 0 --min_chunk_bytes_for_parallel_parsing 8125230 --max_read_buffer_size 859505 --prefer_localhost_replica 1 --max_block_size 34577 --max_threads 41 --optimize_append_index 0 --optimize_if_chain_to_multiif 1 --optimize_if_transform_strings_to_enum 1 --optimize_read_in_order 1 --optimize_or_like_chain 0 --optimize_substitute_columns 1 --enable_multiple_prewhere_read_steps 1 --read_in_order_two_level_merge_threshold 99 --optimize_aggregation_in_order 1 --aggregation_in_order_max_block_bytes 27635208 --use_uncompressed_cache 0 --min_bytes_to_use_direct_io 10737418240 --min_bytes_to_use_mmap_io 6451111320 --local_filesystem_read_method pread --remote_filesystem_read_method read --local_filesystem_read_prefetch 1 --filesystem_cache_segments_batch_size 50 --read_from_filesystem_cache_if_exists_otherwise_bypass_cache 0 --throw_on_error_from_cache_on_write_operations 0 --remote_filesystem_read_prefetch 1 --allow_prefetched_read_pool_for_remote_filesystem 0 --filesystem_prefetch_max_memory_usage 64Mi --filesystem_prefetches_limit 10 --filesystem_prefetch_min_bytes_for_single_read_task 16Mi --filesystem_prefetch_step_marks 0 --filesystem_prefetch_step_bytes 100Mi --compile_aggregate_expressions 0 --compile_sort_description 1 --merge_tree_coarse_index_granularity 32 --optimize_distinct_in_order 0 --max_bytes_before_external_sort 10737418240 --max_bytes_before_external_group_by 10737418240 --max_bytes_before_remerge_sort 1374192967 --min_compress_block_size 2152247 --max_compress_block_size 1830907 --merge_tree_compact_parts_min_granules_to_multibuffer_read 79 --optimize_sorting_by_input_stream_properties 1 --http_response_buffer_size 106072 --http_wait_end_of_query True --enable_memory_bound_merging_of_aggregation_results 0 --min_count_to_compile_expression 0 --min_count_to_compile_aggregate_expression 3 --min_count_to_compile_sort_description 3 --session_timezone Africa/Khartoum --prefer_warmed_unmerged_parts_seconds 4 --use_page_cache_for_disks_without_file_cache False --page_cache_inject_eviction True --merge_tree_read_split_ranges_into_intersecting_and_non_intersecting_injection_probability 0.03 --ratio_of_defaults_for_sparse_serialization 0.9779014012142565 --prefer_fetch_merged_part_size_threshold 4254002758 --vertical_merge_algorithm_min_rows_to_activate 1 --vertical_merge_algorithm_min_columns_to_activate 1 --allow_vertical_merges_from_compact_to_wide_parts 1 --min_merge_bytes_to_use_direct_io 1 --index_granularity_bytes 4982992 --merge_max_block_size 16662 --index_granularity 22872 --min_bytes_for_wide_part 1073741824 --compress_marks 0 --compress_primary_key 0 --marks_compress_block_size 86328 --primary_key_compress_block_size 64101 --replace_long_file_name_to_hash 0 --max_file_name_length 81 --min_bytes_for_full_part_storage 536870912 --compact_parts_max_bytes_to_buffer 480908080 --compact_parts_max_granules_to_buffer 1 --compact_parts_merge_max_bytes_to_prefetch_part 4535313 --cache_populated_by_fetch 0" +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_dynamic_type=1 --allow_experimental_variant_type=1 --use_variant_as_common_type=1" function run() 
{ @@ -74,3 +74,4 @@ echo "MergeTree wide" $CH_CLIENT -q "create table test (x UInt64, y UInt64 ) engine=MergeTree order by x settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1;" run $CH_CLIENT -q "drop table test;" + diff --git a/tests/queries/0_stateless/03040_dynamic_type_alters_2.reference b/tests/queries/0_stateless/03040_dynamic_type_alters_2.reference new file mode 100644 index 00000000000..18a181464e9 --- /dev/null +++ b/tests/queries/0_stateless/03040_dynamic_type_alters_2.reference @@ -0,0 +1,182 @@ +MergeTree compact +initial insert +alter add column +3 None +0 0 \N \N \N \N +1 1 \N \N \N \N +2 2 \N \N \N \N +insert after alter add column 1 +4 String +4 UInt64 +7 None +0 0 \N \N \N \N \N +1 1 \N \N \N \N \N +2 2 \N \N \N \N \N +3 3 3 \N 3 \N \N +4 4 4 \N 4 \N \N +5 5 5 \N 5 \N \N +6 6 str_6 str_6 \N \N \N +7 7 str_7 str_7 \N \N \N +8 8 str_8 str_8 \N \N \N +9 9 \N \N \N \N \N +10 10 \N \N \N \N \N +11 11 \N \N \N \N \N +12 12 12 \N 12 \N \N +13 13 str_13 str_13 \N \N \N +14 14 \N \N \N \N \N +alter rename column 1 +4 String +4 UInt64 +7 None +0 0 \N \N \N \N \N +1 1 \N \N \N \N \N +2 2 \N \N \N \N \N +3 3 3 \N 3 \N \N +4 4 4 \N 4 \N \N +5 5 5 \N 5 \N \N +6 6 str_6 str_6 \N \N \N +7 7 str_7 str_7 \N \N \N +8 8 str_8 str_8 \N \N \N +9 9 \N \N \N \N \N +10 10 \N \N \N \N \N +11 11 \N \N \N \N \N +12 12 12 \N 12 \N \N +13 13 str_13 str_13 \N \N \N +14 14 \N \N \N \N \N +insert nested dynamic +3 Array(Dynamic) +4 String +4 UInt64 +7 None +0 0 \N \N \N \N \N [] [] [] +1 1 \N \N \N \N \N [] [] [] +2 2 \N \N \N \N \N [] [] [] +3 3 3 \N 3 \N \N [] [] [] +4 4 4 \N 4 \N \N [] [] [] +5 5 5 \N 5 \N \N [] [] [] +6 6 str_6 str_6 \N \N \N [] [] [] +7 7 str_7 str_7 \N \N \N [] [] [] +8 8 str_8 str_8 \N \N \N [] [] [] +9 9 \N \N \N \N \N [] [] [] +10 10 \N \N \N \N \N [] [] [] +11 11 \N \N \N \N \N [] [] [] +12 12 12 \N 12 \N \N [] [] [] +13 13 str_13 str_13 \N \N \N [] [] [] +14 14 \N \N \N \N \N [] [] [] +15 15 [15] \N \N \N \N [15] [NULL] [NULL] +16 16 ['str_16'] \N \N \N \N [NULL] ['str_16'] [NULL] +17 17 [17] \N \N \N \N [17] [NULL] [NULL] +alter rename column 2 +3 Array(Dynamic) +4 String +4 UInt64 +7 None +0 0 \N \N \N \N \N [] [] [] +1 1 \N \N \N \N \N [] [] [] +2 2 \N \N \N \N \N [] [] [] +3 3 3 \N 3 \N \N [] [] [] +4 4 4 \N 4 \N \N [] [] [] +5 5 5 \N 5 \N \N [] [] [] +6 6 str_6 str_6 \N \N \N [] [] [] +7 7 str_7 str_7 \N \N \N [] [] [] +8 8 str_8 str_8 \N \N \N [] [] [] +9 9 \N \N \N \N \N [] [] [] +10 10 \N \N \N \N \N [] [] [] +11 11 \N \N \N \N \N [] [] [] +12 12 12 \N 12 \N \N [] [] [] +13 13 str_13 str_13 \N \N \N [] [] [] +14 14 \N \N \N \N \N [] [] [] +15 15 [15] \N \N \N \N [15] [NULL] [NULL] +16 16 ['str_16'] \N \N \N \N [NULL] ['str_16'] [NULL] +17 17 [17] \N \N \N \N [17] [NULL] [NULL] +MergeTree wide +initial insert +alter add column +3 None +0 0 \N \N \N \N +1 1 \N \N \N \N +2 2 \N \N \N \N +insert after alter add column 1 +4 String +4 UInt64 +7 None +0 0 \N \N \N \N \N +1 1 \N \N \N \N \N +2 2 \N \N \N \N \N +3 3 3 \N 3 \N \N +4 4 4 \N 4 \N \N +5 5 5 \N 5 \N \N +6 6 str_6 str_6 \N \N \N +7 7 str_7 str_7 \N \N \N +8 8 str_8 str_8 \N \N \N +9 9 \N \N \N \N \N +10 10 \N \N \N \N \N +11 11 \N \N \N \N \N +12 12 12 \N 12 \N \N +13 13 str_13 str_13 \N \N \N +14 14 \N \N \N \N \N +alter rename column 1 +4 String +4 UInt64 +7 None +0 0 \N \N \N \N \N +1 1 \N \N \N \N \N +2 2 \N \N \N \N \N +3 3 3 \N 3 \N \N +4 4 4 \N 4 \N \N +5 5 5 \N 5 \N \N +6 6 str_6 str_6 \N \N \N +7 7 str_7 str_7 \N \N \N +8 8 str_8 str_8 \N \N \N +9 9 \N \N \N \N \N +10 10 \N \N \N \N \N +11 11 \N 
\N \N \N \N +12 12 12 \N 12 \N \N +13 13 str_13 str_13 \N \N \N +14 14 \N \N \N \N \N +insert nested dynamic +3 Array(Dynamic) +4 String +4 UInt64 +7 None +0 0 \N \N \N \N \N [] [] [] +1 1 \N \N \N \N \N [] [] [] +2 2 \N \N \N \N \N [] [] [] +3 3 3 \N 3 \N \N [] [] [] +4 4 4 \N 4 \N \N [] [] [] +5 5 5 \N 5 \N \N [] [] [] +6 6 str_6 str_6 \N \N \N [] [] [] +7 7 str_7 str_7 \N \N \N [] [] [] +8 8 str_8 str_8 \N \N \N [] [] [] +9 9 \N \N \N \N \N [] [] [] +10 10 \N \N \N \N \N [] [] [] +11 11 \N \N \N \N \N [] [] [] +12 12 12 \N 12 \N \N [] [] [] +13 13 str_13 str_13 \N \N \N [] [] [] +14 14 \N \N \N \N \N [] [] [] +15 15 [15] \N \N \N \N [15] [NULL] [NULL] +16 16 ['str_16'] \N \N \N \N [NULL] ['str_16'] [NULL] +17 17 [17] \N \N \N \N [17] [NULL] [NULL] +alter rename column 2 +3 Array(Dynamic) +4 String +4 UInt64 +7 None +0 0 \N \N \N \N \N [] [] [] +1 1 \N \N \N \N \N [] [] [] +2 2 \N \N \N \N \N [] [] [] +3 3 3 \N 3 \N \N [] [] [] +4 4 4 \N 4 \N \N [] [] [] +5 5 5 \N 5 \N \N [] [] [] +6 6 str_6 str_6 \N \N \N [] [] [] +7 7 str_7 str_7 \N \N \N [] [] [] +8 8 str_8 str_8 \N \N \N [] [] [] +9 9 \N \N \N \N \N [] [] [] +10 10 \N \N \N \N \N [] [] [] +11 11 \N \N \N \N \N [] [] [] +12 12 12 \N 12 \N \N [] [] [] +13 13 str_13 str_13 \N \N \N [] [] [] +14 14 \N \N \N \N \N [] [] [] +15 15 [15] \N \N \N \N [15] [NULL] [NULL] +16 16 ['str_16'] \N \N \N \N [NULL] ['str_16'] [NULL] +17 17 [17] \N \N \N \N [17] [NULL] [NULL] diff --git a/tests/queries/0_stateless/03040_dynamic_type_alters_2.sh b/tests/queries/0_stateless/03040_dynamic_type_alters_2.sh new file mode 100755 index 00000000000..6491e64372f --- /dev/null +++ b/tests/queries/0_stateless/03040_dynamic_type_alters_2.sh @@ -0,0 +1,57 @@ +#!/usr/bin/env bash +# Tags: long + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# reset --log_comment +CLICKHOUSE_LOG_COMMENT= +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_dynamic_type=1 --allow_experimental_variant_type=1 --use_variant_as_common_type=1" + +function run() +{ + echo "initial insert" + $CH_CLIENT -q "insert into test select number, number from numbers(3)" + + echo "alter add column" + $CH_CLIENT -q "alter table test add column d Dynamic settings mutations_sync=1" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -q "select x, y, d, d.String, d.UInt64, d.\`Tuple(a UInt64)\`.a from test order by x" + + echo "insert after alter add column 1" + $CH_CLIENT -q "insert into test select number, number, number from numbers(3, 3)" + $CH_CLIENT -q "insert into test select number, number, 'str_' || toString(number) from numbers(6, 3)" + $CH_CLIENT -q "insert into test select number, number, NULL from numbers(9, 3)" + $CH_CLIENT -q "insert into test select number, number, multiIf(number % 3 == 0, number, number % 3 == 1, 'str_' || toString(number), NULL) from numbers(12, 3)" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -q "select x, y, d, d.String, d.UInt64, d.Date, d.\`Tuple(a UInt64)\`.a from test order by x" + + echo "alter rename column 1" + $CH_CLIENT -q "alter table test rename column d to d1 settings mutations_sync=1" + $CH_CLIENT -q "select count(), dynamicType(d1) from test group by dynamicType(d1) order by count(), dynamicType(d1)" + $CH_CLIENT -q "select x, y, d1, d1.String, d1.UInt64, d1.Date, d1.\`Tuple(a UInt64)\`.a from test order by x" + + echo "insert nested dynamic" + $CH_CLIENT -q "insert into test select number, number, [number % 2 ? number : 'str_' || toString(number)]::Array(Dynamic) from numbers(15, 3)" + $CH_CLIENT -q "select count(), dynamicType(d1) from test group by dynamicType(d1) order by count(), dynamicType(d1)" + $CH_CLIENT -q "select x, y, d1, d1.String, d1.UInt64, d1.Date, d1.\`Tuple(a UInt64)\`.a, d1.\`Array(Dynamic)\`.UInt64, d1.\`Array(Dynamic)\`.String, d1.\`Array(Dynamic)\`.Date from test order by x" + + echo "alter rename column 2" + $CH_CLIENT -q "alter table test rename column d1 to d2 settings mutations_sync=1" + $CH_CLIENT -q "select count(), dynamicType(d2) from test group by dynamicType(d2) order by count(), dynamicType(d2)" + $CH_CLIENT -q "select x, y, d2, d2.String, d2.UInt64, d2.Date, d2.\`Tuple(a UInt64)\`.a, d2.\`Array(Dynamic)\`.UInt64, d2.\`Array(Dynamic)\`.String, d2.\`Array(Dynamic)\`.Date, from test order by x" +} + +$CH_CLIENT -q "drop table if exists test;" + +echo "MergeTree compact" +$CH_CLIENT -q "create table test (x UInt64, y UInt64) engine=MergeTree order by x settings min_rows_for_wide_part=100000000, min_bytes_for_wide_part=1000000000;" +run +$CH_CLIENT -q "drop table test;" + +echo "MergeTree wide" +$CH_CLIENT -q "create table test (x UInt64, y UInt64 ) engine=MergeTree order by x settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1;" +run +$CH_CLIENT -q "drop table test;" + diff --git a/tests/queries/0_stateless/03041_dynamic_type_check_table.reference b/tests/queries/0_stateless/03041_dynamic_type_check_table.reference new file mode 100644 index 00000000000..b1ea186a917 --- /dev/null +++ b/tests/queries/0_stateless/03041_dynamic_type_check_table.reference @@ -0,0 +1,56 @@ +MergeTree compact +initial insert +alter add column +3 None +0 0 \N \N \N \N +1 1 \N \N \N \N +2 2 \N \N \N \N +insert after alter add column +4 String +4 UInt64 +7 
None +0 0 \N \N \N \N \N +1 1 \N \N \N \N \N +2 2 \N \N \N \N \N +3 3 3 \N 3 \N \N +4 4 4 \N 4 \N \N +5 5 5 \N 5 \N \N +6 6 str_6 str_6 \N \N \N +7 7 str_7 str_7 \N \N \N +8 8 str_8 str_8 \N \N \N +9 9 \N \N \N \N \N +10 10 \N \N \N \N \N +11 11 \N \N \N \N \N +12 12 12 \N 12 \N \N +13 13 str_13 str_13 \N \N \N +14 14 \N \N \N \N \N +check table +1 +MergeTree wide +initial insert +alter add column +3 None +0 0 \N \N \N \N +1 1 \N \N \N \N +2 2 \N \N \N \N +insert after alter add column +4 String +4 UInt64 +7 None +0 0 \N \N \N \N \N +1 1 \N \N \N \N \N +2 2 \N \N \N \N \N +3 3 3 \N 3 \N \N +4 4 4 \N 4 \N \N +5 5 5 \N 5 \N \N +6 6 str_6 str_6 \N \N \N +7 7 str_7 str_7 \N \N \N +8 8 str_8 str_8 \N \N \N +9 9 \N \N \N \N \N +10 10 \N \N \N \N \N +11 11 \N \N \N \N \N +12 12 12 \N 12 \N \N +13 13 str_13 str_13 \N \N \N +14 14 \N \N \N \N \N +check table +1 diff --git a/tests/queries/0_stateless/03041_dynamic_type_check_table.sh b/tests/queries/0_stateless/03041_dynamic_type_check_table.sh new file mode 100755 index 00000000000..3d802485be3 --- /dev/null +++ b/tests/queries/0_stateless/03041_dynamic_type_check_table.sh @@ -0,0 +1,45 @@ +#!/usr/bin/env bash +# Tags: long + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# reset --log_comment +CLICKHOUSE_LOG_COMMENT= +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_dynamic_type=1 --allow_experimental_variant_type=1 --use_variant_as_common_type=1" + +function run() +{ + echo "initial insert" + $CH_CLIENT -q "insert into test select number, number from numbers(3)" + + echo "alter add column" + $CH_CLIENT -q "alter table test add column d Dynamic(max_types=3) settings mutations_sync=1" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -q "select x, y, d, d.String, d.UInt64, d.\`Tuple(a UInt64)\`.a from test order by x" + + echo "insert after alter add column" + $CH_CLIENT -q "insert into test select number, number, number from numbers(3, 3)" + $CH_CLIENT -q "insert into test select number, number, 'str_' || toString(number) from numbers(6, 3)" + $CH_CLIENT -q "insert into test select number, number, NULL from numbers(9, 3)" + $CH_CLIENT -q "insert into test select number, number, multiIf(number % 3 == 0, number, number % 3 == 1, 'str_' || toString(number), NULL) from numbers(12, 3)" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -q "select x, y, d, d.String, d.UInt64, d.Date, d.\`Tuple(a UInt64)\`.a from test order by x" + + echo "check table" + $CH_CLIENT -q "check table test" +} + +$CH_CLIENT -q "drop table if exists test;" + +echo "MergeTree compact" +$CH_CLIENT -q "create table test (x UInt64, y UInt64) engine=MergeTree order by x settings min_rows_for_wide_part=100000000, min_bytes_for_wide_part=1000000000;" +run +$CH_CLIENT -q "drop table test;" + +echo "MergeTree wide" +$CH_CLIENT -q "create table test (x UInt64, y UInt64 ) engine=MergeTree order by x settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1;" +run +$CH_CLIENT -q "drop table test;" + From 86406c9ac15d4438f257e0aa6b2ca75ea0750add Mon Sep 17 00:00:00 2001 From: avogar Date: Mon, 13 May 2024 14:43:32 +0000 Subject: [PATCH 122/392] Fix build --- src/DataTypes/Serializations/SerializationDynamic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/DataTypes/Serializations/SerializationDynamic.h 
b/src/DataTypes/Serializations/SerializationDynamic.h index 4803bc25d18..7471ff54cf7 100644 --- a/src/DataTypes/Serializations/SerializationDynamic.h +++ b/src/DataTypes/Serializations/SerializationDynamic.h @@ -105,7 +105,7 @@ private: { DynamicStructureSerializationVersion structure_version; DataTypePtr variant_type; - ColumnDynamic::Statistics statistics = {.source = ColumnDynamic::Statistics::Source::READ}; + ColumnDynamic::Statistics statistics = {.source = ColumnDynamic::Statistics::Source::READ, .data = {}}; explicit DeserializeBinaryBulkStateDynamicStructure(UInt64 structure_version_) : structure_version(structure_version_) {} }; From 904800afc8e77bc5567ba2096258aec4802d8cee Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 13 May 2024 17:44:14 +0200 Subject: [PATCH 123/392] Apply recent changes to storages3/hdfs/azure --- .../ObjectStorages/S3/S3ObjectStorage.cpp | 2 +- .../ObjectStorage/StorageObjectStorage.cpp | 29 ++++++++++++------- .../ObjectStorage/StorageObjectStorage.h | 3 +- 3 files changed, 21 insertions(+), 13 deletions(-) diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index 74707b61238..c24874d0a94 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -592,7 +592,7 @@ std::unique_ptr S3ObjectStorage::cloneObjectStorage( ContextPtr context) { auto new_s3_settings = getSettings(config, config_prefix, context); - auto new_client = getClient(config, config_prefix, context, *new_s3_settings); + auto new_client = getClient(config, config_prefix, context, *new_s3_settings, true); auto new_uri{uri}; new_uri.bucket = new_namespace; diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index a187a8fc54d..01790760747 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -206,7 +206,7 @@ void StorageObjectStorage::read( size_t num_streams) { updateConfiguration(local_context); - if (partition_by && configuration->withWildcard()) + if (partition_by && configuration->withPartitionWildcard()) { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Reading from a partitioned {} storage is not implemented yet", @@ -247,7 +247,14 @@ SinkToStoragePtr StorageObjectStorage::write( const auto sample_block = metadata_snapshot->getSampleBlock(); const auto & settings = configuration->getQuerySettings(local_context); - if (configuration->withWildcard()) + if (configuration->withGlobsIgnorePartitionWildcard()) + { + throw Exception(ErrorCodes::DATABASE_ACCESS_DENIED, + "Path '{}' contains globs, so the table is in readonly mode", + configuration->getPath()); + } + + if (configuration->withPartitionWildcard()) { ASTPtr partition_by_ast = nullptr; if (auto insert_query = std::dynamic_pointer_cast(query)) @@ -265,14 +272,6 @@ SinkToStoragePtr StorageObjectStorage::write( } } - if (configuration->withGlobs()) - { - throw Exception( - ErrorCodes::DATABASE_ACCESS_DENIED, - "{} key '{}' contains globs, so the table is in readonly mode", - getName(), configuration->getPath()); - } - auto paths = configuration->getPaths(); if (auto new_key = checkAndGetNewFileOnInsertIfNeeded( *object_storage, *configuration, settings, paths.front(), paths.size())) @@ -428,13 +427,21 @@ StorageObjectStorage::Configuration::Configuration(const Configuration & other) structure = other.structure; } -bool StorageObjectStorage::Configuration::withWildcard() const +bool 
StorageObjectStorage::Configuration::withPartitionWildcard() const { static const String PARTITION_ID_WILDCARD = "{_partition_id}"; return getPath().find(PARTITION_ID_WILDCARD) != String::npos || getNamespace().find(PARTITION_ID_WILDCARD) != String::npos; } +bool StorageObjectStorage::Configuration::withGlobsIgnorePartitionWildcard() const +{ + if (!withPartitionWildcard()) + return withGlobs(); + else + return PartitionedSink::replaceWildcards(getPath(), "").find_first_of("*?{") != std::string::npos; +} + bool StorageObjectStorage::Configuration::isPathWithGlobs() const { return getPath().find_first_of("*?{") != std::string::npos; diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.h b/src/Storages/ObjectStorage/StorageObjectStorage.h index 3f8ff79ad54..a396bad9d6e 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.h +++ b/src/Storages/ObjectStorage/StorageObjectStorage.h @@ -163,8 +163,9 @@ public: virtual void addStructureAndFormatToArgs( ASTs & args, const String & structure_, const String & format_, ContextPtr context) = 0; - bool withWildcard() const; + bool withPartitionWildcard() const; bool withGlobs() const { return isPathWithGlobs() || isNamespaceWithGlobs(); } + bool withGlobsIgnorePartitionWildcard() const; bool isPathWithGlobs() const; bool isNamespaceWithGlobs() const; virtual std::string getPathWithoutGlobs() const; From 61f7b95e3d4ec7711df7fadb332eabf02913ba75 Mon Sep 17 00:00:00 2001 From: avogar Date: Mon, 13 May 2024 16:04:20 +0000 Subject: [PATCH 124/392] Fix build --- src/DataTypes/Serializations/SerializationDynamic.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/DataTypes/Serializations/SerializationDynamic.cpp b/src/DataTypes/Serializations/SerializationDynamic.cpp index d0ecc3b80a2..cb9d4a2f7bc 100644 --- a/src/DataTypes/Serializations/SerializationDynamic.cpp +++ b/src/DataTypes/Serializations/SerializationDynamic.cpp @@ -31,7 +31,7 @@ struct SerializeBinaryBulkStateDynamic : public ISerialization::SerializeBinaryB ISerialization::SerializeBinaryBulkStatePtr variant_state; /// Variants statistics. Map (Variant name) -> (Variant size). 
- ColumnDynamic::Statistics statistics = { .source = ColumnDynamic::Statistics::Source::READ }; + ColumnDynamic::Statistics statistics = { .source = ColumnDynamic::Statistics::Source::READ, .data = {} }; SerializeBinaryBulkStateDynamic(UInt64 structure_version_) : structure_version(structure_version_) {} }; From f3b9a326fede69769811dc9309bfb5d00aefd874 Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 13 May 2024 19:59:16 +0200 Subject: [PATCH 125/392] Fix build --- src/TableFunctions/TableFunctionObjectStorage.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/TableFunctions/TableFunctionObjectStorage.cpp b/src/TableFunctions/TableFunctionObjectStorage.cpp index a997b34a75c..9f16a9a0b25 100644 --- a/src/TableFunctions/TableFunctionObjectStorage.cpp +++ b/src/TableFunctions/TableFunctionObjectStorage.cpp @@ -192,6 +192,15 @@ void registerTableFunctionObjectStorage(TableFunctionFactory & factory) #if USE_HDFS factory.registerFunction>( { + .documentation = + { + .description=R"(The table function can be used to read the data stored on HDFS virtual filesystem.)", + .examples{ + { + "hdfs", + "SELECT * FROM hdfs(url, format, compression, structure])", "" + }} + }, .allow_readonly = false }); #endif From 007c9be4db352567ef9a414a3aaecd1380d9de0d Mon Sep 17 00:00:00 2001 From: Salvatore Mesoraca Date: Tue, 14 May 2024 10:14:23 +0200 Subject: [PATCH 126/392] Restart CI From 0abb2be5eb55183e83c218cf352c88c7fb497939 Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 14 May 2024 18:40:09 +0200 Subject: [PATCH 127/392] Review fixes --- docs/en/operations/settings/settings.md | 50 +++++++++++++++++++ .../ObjectStorages/HDFS/HDFSObjectStorage.cpp | 7 ++- .../ObjectStorages/HDFS/HDFSObjectStorage.h | 3 ++ src/Storages/Cache/SchemaCache.cpp | 1 - .../ObjectStorage/StorageObjectStorage.cpp | 6 +-- .../ObjectStorage/StorageObjectStorage.h | 6 ++- .../StorageObjectStorageSource.cpp | 2 +- .../StorageObjectStorageSource.h | 2 +- src/Storages/ObjectStorage/Utils.cpp | 6 +-- .../registerStorageObjectStorage.cpp | 6 +-- 10 files changed, 70 insertions(+), 19 deletions(-) diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 91b544c6a82..72bd1ca8e2c 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -3665,6 +3665,16 @@ Possible values: Default value: `0`. +## s3_ignore_file_doesnt_exist {#s3_ignore_file_doesnt_exist} + +Ignore ansense of file if it does not exist when reading certain keys. + +Possible values: +- 1 — `SELECT` returns empty result. +- 0 — `SELECT` throws an exception. + +Default value: `0`. + ## hdfs_truncate_on_insert {#hdfs_truncate_on_insert} Enables or disables truncation before an insert in hdfs engine tables. If disabled, an exception will be thrown on an attempt to insert if a file in HDFS already exists. @@ -3697,6 +3707,46 @@ Possible values: Default value: `0`. +## hdfs_throw_on_zero_files_match {#hdfs_throw_on_zero_files_match} + +Throw an error if matched zero files according to glob expansion rules. + +Possible values: +- 1 — `SELECT` throws an exception. +- 0 — `SELECT` returns empty result. + +Default value: `0`. + +## hdfs_ignore_file_doesnt_exist {#hdfs_ignore_file_doesnt_exist} + +Ignore ansense of file if it does not exist when reading certain keys. + +Possible values: +- 1 — `SELECT` returns empty result. +- 0 — `SELECT` throws an exception. + +Default value: `0`. 
+ +## azure_throw_on_zero_files_match {#azure_throw_on_zero_files_match} + +Throw an error if matched zero files according to glob expansion rules. + +Possible values: +- 1 — `SELECT` throws an exception. +- 0 — `SELECT` returns empty result. + +Default value: `0`. + +## azure_ignore_file_doesnt_exist {#azure_ignore_file_doesnt_exist} + +Ignore ansense of file if it does not exist when reading certain keys. + +Possible values: +- 1 — `SELECT` returns empty result. +- 0 — `SELECT` throws an exception. + +Default value: `0`. + ## engine_url_skip_empty_files {#engine_url_skip_empty_files} Enables or disables skipping empty files in [URL](../../engines/table-engines/special/url.md) engine tables. diff --git a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp index 6c2f310a7d1..1f3a4bdf6c7 100644 --- a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp +++ b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp @@ -186,7 +186,6 @@ ObjectMetadata HDFSObjectStorage::getObjectMetadata(const std::string & path) co void HDFSObjectStorage::listObjects(const std::string & path, RelativePathsWithMetadata & children, size_t max_keys) const { initializeHDFSFS(); - auto * log = &Poco::Logger::get("HDFSObjectStorage"); LOG_TEST(log, "Trying to list files for {}", path); HDFSFileInfo ls; @@ -210,9 +209,6 @@ void HDFSObjectStorage::listObjects(const std::string & path, RelativePathsWithM for (int i = 0; i < ls.length; ++i) { const String file_path = fs::path(ls.file_info[i].mName).lexically_normal(); - const size_t last_slash = file_path.rfind('/'); - const String file_name = file_path.substr(last_slash); - const bool is_directory = ls.file_info[i].mKind == 'D'; if (is_directory) { @@ -227,6 +223,9 @@ void HDFSObjectStorage::listObjects(const std::string & path, RelativePathsWithM Poco::Timestamp::fromEpochTime(ls.file_info[i].mLastMod), {}})); } + + if (children.size() >= max_keys) + break; } } diff --git a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h index e747b283400..8aae90d0721 100644 --- a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h +++ b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h @@ -39,6 +39,7 @@ public: bool lazy_initialize) : config(config_) , settings(std::move(settings_)) + , log(getLogger("HDFSObjectStorage(" + hdfs_root_path_ + ")")) { const size_t begin_of_path = hdfs_root_path_.find('/', hdfs_root_path_.find("//") + 2); url = hdfs_root_path_; @@ -134,6 +135,8 @@ private: std::string url; std::string url_without_path; std::string data_directory; + + LoggerPtr log; }; } diff --git a/src/Storages/Cache/SchemaCache.cpp b/src/Storages/Cache/SchemaCache.cpp index 5dc39f04ae0..299dd292772 100644 --- a/src/Storages/Cache/SchemaCache.cpp +++ b/src/Storages/Cache/SchemaCache.cpp @@ -1,6 +1,5 @@ #include #include -#include #include namespace ProfileEvents diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index 01790760747..c5affb7989f 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -61,10 +61,6 @@ StorageObjectStorage::StorageObjectStorage( metadata.setConstraints(constraints_); metadata.setComment(comment); - StoredObjects objects; - for (const auto & key : configuration->getPaths()) - objects.emplace_back(key); - setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(metadata.getColumns())); setInMemoryMetadata(metadata); } @@ 
-93,7 +89,7 @@ void StorageObjectStorage::updateConfiguration(ContextPtr context) { /// FIXME: we should be able to update everything apart from client if static_configuration == true. if (!configuration->isStaticConfiguration()) - object_storage->applyNewSettings(context->getConfigRef(), "s3.", context); + object_storage->applyNewSettings(context->getConfigRef(), configuration->getTypeName() + ".", context); } namespace diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.h b/src/Storages/ObjectStorage/StorageObjectStorage.h index a396bad9d6e..928d49f9604 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.h +++ b/src/Storages/ObjectStorage/StorageObjectStorage.h @@ -124,7 +124,6 @@ protected: ConfigurationPtr configuration; const ObjectStoragePtr object_storage; - const std::string engine_name; const std::optional format_settings; const ASTPtr partition_by; const bool distributed_processing; @@ -148,7 +147,9 @@ public: ContextPtr local_context, bool with_table_structure); + /// Storage type: s3, hdfs, azure. virtual std::string getTypeName() const = 0; + /// Engine name: S3, HDFS, Azure. virtual std::string getEngineName() const = 0; virtual Path getPath() const = 0; @@ -158,7 +159,10 @@ public: virtual void setPaths(const Paths & paths) = 0; virtual String getDataSourceDescription() = 0; + /// Sometimes object storages have something similar to chroot or namespace, for example + /// buckets in S3. If object storage doesn't have any namepaces return empty string. virtual String getNamespace() const = 0; + virtual StorageObjectStorage::QuerySettings getQuerySettings(const ContextPtr &) const = 0; virtual void addStructureAndFormatToArgs( ASTs & args, const String & structure_, const String & format_, ContextPtr context) = 0; diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index cb3f732ce83..e28924617e0 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -321,7 +321,7 @@ std::unique_ptr StorageObjectStorageSource::createReadBuffer(const S const bool object_too_small = object_size <= 2 * getContext()->getSettings().max_download_buffer_size; const bool use_prefetch = object_too_small && read_settings.remote_fs_method == RemoteFSReadMethod::threadpool; read_settings.remote_fs_method = use_prefetch ? RemoteFSReadMethod::threadpool : RemoteFSReadMethod::read; - /// User's S3 object may change, don't cache it. + /// User's object may change, don't cache it. read_settings.use_page_cache_for_disks_without_file_cache = false; // Create a read buffer that will prefetch the first ~1 MB of the file. 
diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.h b/src/Storages/ObjectStorage/StorageObjectStorageSource.h index a8df00bc0ac..08d545f9b85 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.h @@ -62,7 +62,7 @@ protected: const std::optional format_settings; const UInt64 max_block_size; const bool need_only_count; - const ReadFromFormatInfo read_from_format_info; + const ReadFromFormatInfo & read_from_format_info; const std::shared_ptr create_reader_pool; ColumnsDescription columns_desc; diff --git a/src/Storages/ObjectStorage/Utils.cpp b/src/Storages/ObjectStorage/Utils.cpp index bde3cb7e1cb..e49e14d2a0c 100644 --- a/src/Storages/ObjectStorage/Utils.cpp +++ b/src/Storages/ObjectStorage/Utils.cpp @@ -38,9 +38,9 @@ std::optional checkAndGetNewFileOnInsertIfNeeded( throw Exception( ErrorCodes::BAD_ARGUMENTS, "Object in bucket {} with key {} already exists. " - "If you want to overwrite it, enable setting s3_truncate_on_insert, if you " - "want to create a new file on each insert, enable setting s3_create_new_file_on_insert", - configuration.getNamespace(), key); + "If you want to overwrite it, enable setting {}_truncate_on_insert, if you " + "want to create a new file on each insert, enable setting {}_create_new_file_on_insert", + configuration.getNamespace(), key, configuration.getTypeName(), configuration.getTypeName()); } void resolveSchemaAndFormat( diff --git a/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp b/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp index 74c8aeaad7d..bf595b2f5d4 100644 --- a/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp @@ -106,17 +106,17 @@ void registerStorageS3Impl(const String & name, StorageFactory & factory) void registerStorageS3(StorageFactory & factory) { - return registerStorageS3Impl("S3", factory); + registerStorageS3Impl("S3", factory); } void registerStorageCOS(StorageFactory & factory) { - return registerStorageS3Impl("COSN", factory); + registerStorageS3Impl("COSN", factory); } void registerStorageOSS(StorageFactory & factory) { - return registerStorageS3Impl("OSS", factory); + registerStorageS3Impl("OSS", factory); } #endif From 3778cee49e1d6ac1f0f4f470ba5d63458c33df3b Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Tue, 14 May 2024 18:41:19 +0200 Subject: [PATCH 128/392] Update src/Core/Settings.h Co-authored-by: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> --- src/Core/Settings.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index aa20f68ac0d..066a551b37b 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -117,9 +117,9 @@ class IColumn; M(Bool, s3_throw_on_zero_files_match, false, "Throw an error, when ListObjects request cannot match any files", 0) \ M(Bool, hdfs_throw_on_zero_files_match, false, "Throw an error, when ListObjects request cannot match any files", 0) \ M(Bool, azure_throw_on_zero_files_match, false, "Throw an error, when ListObjects request cannot match any files", 0) \ - M(Bool, s3_ignore_file_doesnt_exist, false, "Ignore if files does not exits and return 0 zeros for StorageS3", 0) \ - M(Bool, hdfs_ignore_file_doesnt_exist, false, "Ignore if files does not exits and return 0 zeros for StorageHDFS", 0) \ - M(Bool, azure_ignore_file_doesnt_exist, false, "Ignore if files does not exits and 
return 0 zeros for StorageAzure", 0) \ + M(Bool, s3_ignore_file_doesnt_exist, false, "Return 0 rows when the reqested files don't exist, instead of throwing an exception in S3 table engine", 0) \ + M(Bool, hdfs_ignore_file_doesnt_exist, false, "Return 0 rows when the reqested files don't exist, instead of throwing an exception in HDFS table engine", 0) \ + M(Bool, azure_ignore_file_doesnt_exist, false, "Return 0 rows when the reqested files don't exist, instead of throwing an exception in AzureBlobStorage table engine", 0) \ M(Bool, s3_validate_request_settings, true, "Validate S3 request settings", 0) \ M(Bool, s3_disable_checksum, false, "Do not calculate a checksum when sending a file to S3. This speeds up writes by avoiding excessive processing passes on a file. It is mostly safe as the data of MergeTree tables is checksummed by ClickHouse anyway, and when S3 is accessed with HTTPS, the TLS layer already provides integrity while transferring through the network. While additional checksums on S3 give defense in depth.", 0) \ M(UInt64, s3_retry_attempts, 100, "Setting for Aws::Client::RetryStrategy, Aws::Client does retries itself, 0 means no retries", 0) \ From be693ceba7fa17e2c03c54197fb0d0f301640cc1 Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 14 May 2024 18:46:35 +0200 Subject: [PATCH 129/392] Minor --- src/Storages/ObjectStorage/StorageObjectStorage.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index c5affb7989f..bc5b347d1e0 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -402,7 +402,6 @@ void StorageObjectStorage::Configuration::initialize( else configuration.fromAST(engine_args, local_context, with_table_structure); - // FIXME: it should be - if (format == "auto" && get_format_from_file) if (configuration.format == "auto") configuration.format = FormatFactory::instance().tryGetFormatFromFileName(configuration.getPath()).value_or("auto"); else From 65f404c153fb96602ec07c4f3919af14468b8d7d Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 14 May 2024 21:28:40 +0200 Subject: [PATCH 130/392] Review fixes --- docs/en/operations/settings/settings.md | 2 +- src/Core/Settings.h | 6 +++--- .../ObjectStorages/HDFS/HDFSObjectStorage.cpp | 2 +- src/Storages/ObjectStorage/S3/Configuration.h | 2 ++ .../ObjectStorage/StorageObjectStorage.h | 5 +++-- .../StorageObjectStorageSource.cpp | 19 ++++++++----------- .../StorageObjectStorageSource.h | 2 +- 7 files changed, 19 insertions(+), 19 deletions(-) diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 72bd1ca8e2c..88e945a710c 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -3667,7 +3667,7 @@ Default value: `0`. ## s3_ignore_file_doesnt_exist {#s3_ignore_file_doesnt_exist} -Ignore ansense of file if it does not exist when reading certain keys. +Ignore absense of file if it does not exist when reading certain keys. Possible values: - 1 — `SELECT` returns empty result. 
diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 066a551b37b..afadaa88f6d 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -117,9 +117,9 @@ class IColumn; M(Bool, s3_throw_on_zero_files_match, false, "Throw an error, when ListObjects request cannot match any files", 0) \ M(Bool, hdfs_throw_on_zero_files_match, false, "Throw an error, when ListObjects request cannot match any files", 0) \ M(Bool, azure_throw_on_zero_files_match, false, "Throw an error, when ListObjects request cannot match any files", 0) \ - M(Bool, s3_ignore_file_doesnt_exist, false, "Return 0 rows when the reqested files don't exist, instead of throwing an exception in S3 table engine", 0) \ - M(Bool, hdfs_ignore_file_doesnt_exist, false, "Return 0 rows when the reqested files don't exist, instead of throwing an exception in HDFS table engine", 0) \ - M(Bool, azure_ignore_file_doesnt_exist, false, "Return 0 rows when the reqested files don't exist, instead of throwing an exception in AzureBlobStorage table engine", 0) \ + M(Bool, s3_ignore_file_doesnt_exist, false, "Return 0 rows when the requested files don't exist, instead of throwing an exception in S3 table engine", 0) \ + M(Bool, hdfs_ignore_file_doesnt_exist, false, "Return 0 rows when the requested files don't exist, instead of throwing an exception in HDFS table engine", 0) \ + M(Bool, azure_ignore_file_doesnt_exist, false, "Return 0 rows when the requested files don't exist, instead of throwing an exception in AzureBlobStorage table engine", 0) \ M(Bool, s3_validate_request_settings, true, "Validate S3 request settings", 0) \ M(Bool, s3_disable_checksum, false, "Do not calculate a checksum when sending a file to S3. This speeds up writes by avoiding excessive processing passes on a file. It is mostly safe as the data of MergeTree tables is checksummed by ClickHouse anyway, and when S3 is accessed with HTTPS, the TLS layer already provides integrity while transferring through the network. 
While additional checksums on S3 give defense in depth.", 0) \ M(UInt64, s3_retry_attempts, 100, "Setting for Aws::Client::RetryStrategy, Aws::Client does retries itself, 0 means no retries", 0) \ diff --git a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp index 1f3a4bdf6c7..dcb2af9d4d3 100644 --- a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp +++ b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp @@ -224,7 +224,7 @@ void HDFSObjectStorage::listObjects(const std::string & path, RelativePathsWithM {}})); } - if (children.size() >= max_keys) + if (max_keys && children.size() >= max_keys) break; } } diff --git a/src/Storages/ObjectStorage/S3/Configuration.h b/src/Storages/ObjectStorage/S3/Configuration.h index b28b1c226a7..0bd7f1ab108 100644 --- a/src/Storages/ObjectStorage/S3/Configuration.h +++ b/src/Storages/ObjectStorage/S3/Configuration.h @@ -15,12 +15,14 @@ public: using ConfigurationPtr = StorageObjectStorage::ConfigurationPtr; static constexpr auto type_name = "s3"; + static constexpr auto namespace_name = "bucket"; StorageS3Configuration() = default; StorageS3Configuration(const StorageS3Configuration & other); std::string getTypeName() const override { return type_name; } std::string getEngineName() const override { return url.storage_name; } + std::string getNamespaceType() const override { return namespace_name; } Path getPath() const override { return url.key; } void setPath(const Path & path) override { url.key = path; } diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.h b/src/Storages/ObjectStorage/StorageObjectStorage.h index 928d49f9604..26b153ca0db 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.h +++ b/src/Storages/ObjectStorage/StorageObjectStorage.h @@ -151,6 +151,9 @@ public: virtual std::string getTypeName() const = 0; /// Engine name: S3, HDFS, Azure. virtual std::string getEngineName() const = 0; + /// Sometimes object storages have something similar to chroot or namespace, for example + /// buckets in S3. If object storage doesn't have any namepaces return empty string. + virtual std::string getNamespaceType() const { return "namespace"; } virtual Path getPath() const = 0; virtual void setPath(const Path & path) = 0; @@ -159,8 +162,6 @@ public: virtual void setPaths(const Paths & paths) = 0; virtual String getDataSourceDescription() = 0; - /// Sometimes object storages have something similar to chroot or namespace, for example - /// buckets in S3. If object storage doesn't have any namepaces return empty string. 
virtual String getNamespace() const = 0; virtual StorageObjectStorage::QuerySettings getQuerySettings(const ContextPtr &) const = 0; diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index e28924617e0..737f733615f 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -43,7 +43,7 @@ StorageObjectStorageSource::StorageObjectStorageSource( ObjectStoragePtr object_storage_, ConfigurationPtr configuration_, const ReadFromFormatInfo & info, - std::optional format_settings_, + const std::optional & format_settings_, ContextPtr context_, UInt64 max_block_size_, std::shared_ptr file_iterator_, @@ -95,7 +95,8 @@ std::shared_ptr StorageObjectStorageSourc local_context->getSettingsRef().max_threads); if (configuration->isNamespaceWithGlobs()) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Expression can not have wildcards inside namespace name"); + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Expression can not have wildcards inside {} name", configuration->getNamespaceType()); auto settings = configuration->getQuerySettings(local_context); @@ -425,15 +426,13 @@ StorageObjectStorage::ObjectInfoPtr StorageObjectStorageSource::GlobIterator::ne { std::lock_guard lock(next_mutex); auto object_info = nextImplUnlocked(processor); - if (object_info) + if (first_iteration && !object_info && throw_on_zero_files_match) { - if (first_iteration) - first_iteration = false; - } - else if (first_iteration && throw_on_zero_files_match) - { - throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "Can not match any files"); + throw Exception(ErrorCodes::FILE_DOESNT_EXIST, + "Can not match any files with path {}", + configuration->getPath()); } + first_iteration = false; return object_info; } @@ -456,8 +455,6 @@ StorageObjectStorage::ObjectInfoPtr StorageObjectStorageSource::GlobIterator::ne } new_batch = std::move(result.value()); - LOG_TEST(logger, "Batch size: {}", new_batch.size()); - for (auto it = new_batch.begin(); it != new_batch.end();) { if (!recursive && !re2::RE2::FullMatch((*it)->relative_path, *matcher)) diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.h b/src/Storages/ObjectStorage/StorageObjectStorageSource.h index 08d545f9b85..9c67a125f5e 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.h @@ -31,7 +31,7 @@ public: ObjectStoragePtr object_storage_, ConfigurationPtr configuration, const ReadFromFormatInfo & info, - std::optional format_settings_, + const std::optional & format_settings_, ContextPtr context_, UInt64 max_block_size_, std::shared_ptr file_iterator_, From a7b135ea8b8962ec4db318305391881ec1ff4ff8 Mon Sep 17 00:00:00 2001 From: kssenii Date: Wed, 15 May 2024 12:42:38 +0200 Subject: [PATCH 131/392] Fix style check --- docs/en/operations/settings/settings.md | 2 +- utils/check-style/aspell-ignore/en/aspell-dict.txt | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 88e945a710c..131948eace9 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -3667,7 +3667,7 @@ Default value: `0`. ## s3_ignore_file_doesnt_exist {#s3_ignore_file_doesnt_exist} -Ignore absense of file if it does not exist when reading certain keys. +Ignore absence of file if it does not exist when reading certain keys. 
Possible values: - 1 — `SELECT` returns empty result. diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index 996f7da234a..3c72ef0f737 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -253,6 +253,7 @@ DockerHub DoubleDelta Doxygen Durre +doesnt ECMA Ecto EdgeAngle From 4c8bdad0e709b64ed045aed6092a429767370395 Mon Sep 17 00:00:00 2001 From: kssenii Date: Wed, 15 May 2024 12:54:59 +0200 Subject: [PATCH 132/392] Simplify glob iterator --- .../ObjectStorage/StorageObjectStorageCluster.cpp | 8 +++----- .../ObjectStorage/StorageObjectStorageSource.cpp | 15 +++------------ 2 files changed, 6 insertions(+), 17 deletions(-) diff --git a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp index 193894a1d44..a43d9da0fa3 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp @@ -82,11 +82,9 @@ void StorageObjectStorageCluster::updateQueryToSendIfNeeded( RemoteQueryExecutor::Extension StorageObjectStorageCluster::getTaskIteratorExtension( const ActionsDAG::Node * predicate, const ContextPtr & local_context) const { - const auto settings = configuration->getQuerySettings(local_context); - auto iterator = std::make_shared( - object_storage, configuration, predicate, virtual_columns, local_context, - nullptr, settings.list_object_keys_size, settings.throw_on_zero_files_match, - local_context->getFileProgressCallback()); + auto iterator = StorageObjectStorageSource::createFileIterator( + configuration, object_storage, /* distributed_processing */false, local_context, + predicate, virtual_columns, nullptr, local_context->getFileProgressCallback()); auto callback = std::make_shared>([iterator]() mutable -> String { diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index 737f733615f..8d5df96ca6e 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -407,18 +407,9 @@ StorageObjectStorageSource::GlobIterator::GlobIterator( } else { - const auto object_key = configuration_->getPath(); - auto object_metadata = object_storage->getObjectMetadata(object_key); - auto object_info = std::make_shared(object_key, object_metadata); - - object_infos.emplace_back(object_info); - if (read_keys) - read_keys->emplace_back(object_info); - - if (file_progress_callback) - file_progress_callback(FileProgress(0, object_metadata.size_bytes)); - - is_finished = true; + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Using glob iterator with path without globs is not allowed (used path: {})", + configuration->getPath()); } } From a09bb5f0b7e2134ec576c3f20b492515cf258432 Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 15 May 2024 11:42:11 +0000 Subject: [PATCH 133/392] Fix tests --- .../SerializationDynamicElement.cpp | 2 +- ...3039_dynamic_all_merge_algorithms_1.reference | 16 ++++++++-------- .../03039_dynamic_all_merge_algorithms_1.sh | 8 ++++---- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/DataTypes/Serializations/SerializationDynamicElement.cpp b/src/DataTypes/Serializations/SerializationDynamicElement.cpp index b0a4e63d0a5..dafd6d663b0 100644 --- a/src/DataTypes/Serializations/SerializationDynamicElement.cpp +++ 
b/src/DataTypes/Serializations/SerializationDynamicElement.cpp @@ -72,7 +72,7 @@ void SerializationDynamicElement::deserializeBinaryBulkStatePrefix( auto dynamic_element_state = std::make_shared(); dynamic_element_state->structure_state = std::move(structure_state); - const auto & variant_type = checkAndGetState(structure_state)->variant_type; + const auto & variant_type = checkAndGetState(dynamic_element_state->structure_state)->variant_type; /// Check if we actually have required element in the Variant. if (auto global_discr = assert_cast(*variant_type).tryGetVariantDiscriminator(dynamic_element_name)) { diff --git a/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_1.reference b/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_1.reference index 4b4a1e2ab51..6c69b81c183 100644 --- a/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_1.reference +++ b/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_1.reference @@ -10,16 +10,16 @@ SummingMergeTree 200000 1 50000 String 100000 UInt64 -50000 2 100000 1 +50000 2 AggregatingMergeTree 100000 String 100000 UInt64 200000 1 50000 String 100000 UInt64 -50000 2 100000 1 +50000 2 MergeTree wide + horizontal merge ReplacingMergeTree 100000 String @@ -32,16 +32,16 @@ SummingMergeTree 200000 1 50000 String 100000 UInt64 -50000 2 100000 1 +50000 2 AggregatingMergeTree 100000 String 100000 UInt64 200000 1 50000 String 100000 UInt64 -50000 2 100000 1 +50000 2 MergeTree compact + vertical merge ReplacingMergeTree 100000 String @@ -54,16 +54,16 @@ SummingMergeTree 200000 1 50000 String 100000 UInt64 -50000 2 100000 1 +50000 2 AggregatingMergeTree 100000 String 100000 UInt64 200000 1 50000 String 100000 UInt64 -50000 2 100000 1 +50000 2 MergeTree wide + vertical merge ReplacingMergeTree 100000 String @@ -76,13 +76,13 @@ SummingMergeTree 200000 1 50000 String 100000 UInt64 -50000 2 100000 1 +50000 2 AggregatingMergeTree 100000 String 100000 UInt64 200000 1 50000 String 100000 UInt64 -50000 2 100000 1 +50000 2 diff --git a/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_1.sh b/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_1.sh index 9298fe28fec..198c6ca93ff 100755 --- a/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_1.sh +++ b/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_1.sh @@ -30,10 +30,10 @@ function test() $CH_CLIENT -q "insert into test select number, 1, 'str_' || toString(number) from numbers(50000, 100000)" $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" - $CH_CLIENT -q "select count(), sum from test group by sum" + $CH_CLIENT -q "select count(), sum from test group by sum order by sum, count()" $CH_CLIENT -nm -q "system start merges test; optimize table test final" $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" - $CH_CLIENT -q "select count(), sum from test group by sum" + $CH_CLIENT -q "select count(), sum from test group by sum order by sum, count()" $CH_CLIENT -q "drop table test" echo "AggregatingMergeTree" @@ -43,10 +43,10 @@ function test() $CH_CLIENT -q "insert into test select number, sumState(1::UInt64), 'str_' || toString(number) from numbers(50000, 100000) group by number" $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" - $CH_CLIENT -q "select count(), sum from (select sumMerge(sum) as sum from test group by id, _part) group by sum" + 
$CH_CLIENT -q "select count(), sum from (select sumMerge(sum) as sum from test group by id, _part) group by sum order by sum, count()" $CH_CLIENT -nm -q "system start merges test; optimize table test final" $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" - $CH_CLIENT -q "select count(), sum from (select sumMerge(sum) as sum from test group by id, _part) group by sum" + $CH_CLIENT -q "select count(), sum from (select sumMerge(sum) as sum from test group by id, _part) group by sum order by sum, count()" $CH_CLIENT -q "drop table test" } From 12e512c70ddfe32f81f78ee7d58ae47c38d34ee9 Mon Sep 17 00:00:00 2001 From: Shaun Struwig <41984034+Blargian@users.noreply.github.com> Date: Wed, 15 May 2024 14:15:45 +0200 Subject: [PATCH 134/392] Delete tests/queries/0_stateless/0.2973_parse_crlf_with_tsv_files.reference Removed mispelled file --- .../0.2973_parse_crlf_with_tsv_files.reference | 12 ------------ 1 file changed, 12 deletions(-) delete mode 100644 tests/queries/0_stateless/0.2973_parse_crlf_with_tsv_files.reference diff --git a/tests/queries/0_stateless/0.2973_parse_crlf_with_tsv_files.reference b/tests/queries/0_stateless/0.2973_parse_crlf_with_tsv_files.reference deleted file mode 100644 index 14cf3a564e4..00000000000 --- a/tests/queries/0_stateless/0.2973_parse_crlf_with_tsv_files.reference +++ /dev/null @@ -1,12 +0,0 @@ -/home/shaun/Desktop/ClickHouse/user_files/02973_parse_crlf_with_tsv_files_test_data_without_crlf.tsv -<-- Read UNIX endings --> - -Akiba_Hebrew_Academy 2017-08-01 241 -Aegithina_tiphia 2018-02-01 34 -1971-72_Utah_Stars_season 2016-10-01 1 - -<-- Read DOS endings with setting input_format_tsv_crlf_end_of_line=1 --> - -Akiba_Hebrew_Academy 2017-08-01 241 -Aegithina_tiphia 2018-02-01 34 -1971-72_Utah_Stars_season 2016-10-01 1 From 53f5b958036d4ef3f69c3a22be96cf4c2e1b8c4a Mon Sep 17 00:00:00 2001 From: kssenii Date: Wed, 15 May 2024 13:25:44 +0200 Subject: [PATCH 135/392] Fix typo --- docs/en/operations/settings/settings.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 131948eace9..1772a3aa861 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -3719,7 +3719,7 @@ Default value: `0`. ## hdfs_ignore_file_doesnt_exist {#hdfs_ignore_file_doesnt_exist} -Ignore ansense of file if it does not exist when reading certain keys. +Ignore absence of file if it does not exist when reading certain keys. Possible values: - 1 — `SELECT` returns empty result. @@ -3739,7 +3739,7 @@ Default value: `0`. ## azure_ignore_file_doesnt_exist {#azure_ignore_file_doesnt_exist} -Ignore ansense of file if it does not exist when reading certain keys. +Ignore absence of file if it does not exist when reading certain keys. Possible values: - 1 — `SELECT` returns empty result. From ae10e7ded1080d5bd72372dc611cdcb7b96137ef Mon Sep 17 00:00:00 2001 From: MikhailBurdukov Date: Wed, 15 May 2024 13:09:00 +0000 Subject: [PATCH 136/392] Remove data from all disks after DROP with Lazy database. 
--- src/Databases/DatabaseOnDisk.cpp | 28 +++--- .../test_lazy_database/__init__.py | 0 .../configs/storage_policy.xml | 12 +++ tests/integration/test_lazy_database/test.py | 88 +++++++++++++++++++ 4 files changed, 117 insertions(+), 11 deletions(-) create mode 100644 tests/integration/test_lazy_database/__init__.py create mode 100644 tests/integration/test_lazy_database/configs/storage_policy.xml create mode 100644 tests/integration/test_lazy_database/test.py diff --git a/src/Databases/DatabaseOnDisk.cpp b/src/Databases/DatabaseOnDisk.cpp index 67b45c7d08d..72a9ba318b1 100644 --- a/src/Databases/DatabaseOnDisk.cpp +++ b/src/Databases/DatabaseOnDisk.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -324,31 +325,36 @@ void DatabaseOnDisk::dropTable(ContextPtr local_context, const String & table_na StoragePtr table = detachTable(local_context, table_name); - /// This is possible for Lazy database. - if (!table) - return; - bool renamed = false; try { fs::rename(table_metadata_path, table_metadata_path_drop); renamed = true; - table->drop(); - table->is_dropped = true; - - fs::path table_data_dir(local_context->getPath() + table_data_path_relative); - if (fs::exists(table_data_dir)) - (void)fs::remove_all(table_data_dir); + // The table might be not loaded for Lazy database engine. + if (table) + { + table->drop(); + table->is_dropped = true; + } } catch (...) { LOG_WARNING(log, getCurrentExceptionMessageAndPattern(/* with_stacktrace */ true)); - attachTable(local_context, table_name, table, table_data_path_relative); + if (table) + attachTable(local_context, table_name, table, table_data_path_relative); if (renamed) fs::rename(table_metadata_path_drop, table_metadata_path); throw; } + for (const auto & [disk_name, disk] : getContext()->getDisksMap()) + { + if (disk->isReadOnly() || !disk->exists(table_data_path_relative)) + continue; + + LOG_INFO(log, "Removing data directory from disk {} with path {} for dropped table {} ", disk_name, table_data_path_relative, table_name); + disk->removeRecursive(table_data_path_relative); + } (void)fs::remove(table_metadata_path_drop); } diff --git a/tests/integration/test_lazy_database/__init__.py b/tests/integration/test_lazy_database/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_lazy_database/configs/storage_policy.xml b/tests/integration/test_lazy_database/configs/storage_policy.xml new file mode 100644 index 00000000000..58771d6b284 --- /dev/null +++ b/tests/integration/test_lazy_database/configs/storage_policy.xml @@ -0,0 +1,12 @@ + + + + + s3 + http://minio1:9001/root/data/ + minio + minio123 + + + + diff --git a/tests/integration/test_lazy_database/test.py b/tests/integration/test_lazy_database/test.py new file mode 100644 index 00000000000..6890aa87374 --- /dev/null +++ b/tests/integration/test_lazy_database/test.py @@ -0,0 +1,88 @@ +import logging +import time +import pytest +import os +from helpers.cluster import ClickHouseCluster + + +@pytest.fixture(scope="module") +def cluster(): + try: + cluster = ClickHouseCluster(__file__) + cluster.add_instance( + "node", + main_configs=["configs/storage_policy.xml"], + with_minio=True, + ) + logging.info("Starting cluster...") + cluster.start() + logging.info("Cluster started") + + yield cluster + finally: + cluster.shutdown() + + +def assert_objects_count(cluster, objects_count, path="data/"): + minio = cluster.minio_client + s3_objects = list(minio.list_objects(cluster.minio_bucket, path, recursive=True)) + if 
objects_count != len(s3_objects): + for s3_object in s3_objects: + object_meta = minio.stat_object(cluster.minio_bucket, s3_object.object_name) + logging.info("Existing S3 object: %s", str(object_meta)) + assert objects_count == len(s3_objects) + + +def list_of_files_on_ch_disk(node, disk, path): + disk_path = node.query( + f"SELECT path FROM system.disks WHERE name='{disk}'" + ).splitlines()[0] + return node.exec_in_container( + ["bash", "-c", f"ls {os.path.join(disk_path, path)}"], user="root" + ) + + +@pytest.mark.parametrize( + "engine", + [ + pytest.param("Log"), + ], +) +@pytest.mark.parametrize( + "disk,check_s3", + [ + pytest.param("default", False), + pytest.param("s3", True), + ], +) +@pytest.mark.parametrize( + "delay", + [ + pytest.param(0), + pytest.param(4), + ], +) +def test_drop_table(cluster, engine, disk, check_s3, delay): + node = cluster.instances["node"] + + node.query("DROP DATABASE IF EXISTS lazy") + node.query("CREATE DATABASE lazy ENGINE=Lazy(2)") + node.query( + "CREATE TABLE lazy.table (id UInt64) ENGINE={} SETTINGS disk = '{}'".format( + engine, + disk, + ) + ) + + node.query("INSERT INTO lazy.table SELECT number FROM numbers(10)") + assert node.query("SELECT count(*) FROM lazy.table") == "10\n" + if delay: + time.sleep(delay) + node.query("DROP TABLE lazy.table SYNC") + + if check_s3: + # There mustn't be any orphaned data + assert_objects_count(cluster, 0) + + # Local data must be removed + assert list_of_files_on_ch_disk(node, disk, "data/lazy/") == "" From 47dfeaa487743d81c66bb280e8eeb8f31ef21507 Mon Sep 17 00:00:00 2001 From: copperybean Date: Sun, 12 May 2024 21:57:37 +0800 Subject: [PATCH 137/392] fix comments Change-Id: I2677dc20fc515bbbe91f54154fc4c081f164758e --- .../Formats/Impl/Parquet/ParquetDataBuffer.h | 9 +- .../Impl/Parquet/ParquetDataValuesReader.cpp | 18 +- .../Impl/Parquet/ParquetDataValuesReader.h | 13 +- .../Impl/Parquet/ParquetLeafColReader.cpp | 33 +- .../Impl/Parquet/ParquetRecordReader.cpp | 326 +++++++++++++----- .../Impl/Parquet/ParquetRecordReader.h | 6 +- .../Formats/Impl/ParquetBlockInputFormat.cpp | 2 + .../02998_native_parquet_reader.sh | 4 +- .../native_parquet_reader.parquet} | Bin 9 files changed, 296 insertions(+), 115 deletions(-) rename tests/queries/0_stateless/{02998_native_parquet_reader.parquet => data_parquet/native_parquet_reader.parquet} (100%) diff --git a/src/Processors/Formats/Impl/Parquet/ParquetDataBuffer.h b/src/Processors/Formats/Impl/Parquet/ParquetDataBuffer.h index 5c37375fa0c..57df6f59f72 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetDataBuffer.h +++ b/src/Processors/Formats/Impl/Parquet/ParquetDataBuffer.h @@ -48,7 +48,7 @@ public: consume(bytes); } - void ALWAYS_INLINE readDateTime64(DateTime64 & dst) + void ALWAYS_INLINE readDateTime64FromInt96(DateTime64 & dst) { static const int max_scale_num = 9; static const UInt64 pow10[max_scale_num + 1] @@ -110,10 +110,7 @@ public: // refer to: RawBytesToDecimalBytes in reader_internal.cc, Decimal128::FromBigEndian in decimal.cc auto status = TArrowDecimal::FromBigEndian(getArrowData(), elem_bytes_num); - if (unlikely(!status.ok())) - { - throw Exception(ErrorCodes::PARQUET_EXCEPTION, "Read parquet decimal failed: {}", status.status().ToString()); - } + assert(status.ok()); status.ValueUnsafe().ToBytes(reinterpret_cast(out)); consume(elem_bytes_num); } @@ -144,7 +141,7 @@ private: class LazyNullMap { public: - LazyNullMap(UInt64 size_) : size(size_), col_nullable(nullptr) {} + explicit LazyNullMap(UInt64 size_) : size(size_), col_nullable(nullptr) {} 
template requires std::is_integral_v diff --git a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp index 6743086e9e6..1f0c7105572 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp +++ b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp @@ -276,8 +276,7 @@ void ParquetPlainValuesReader::readBatch( auto idx = cursor; cursor += count; - // the type of offset_data is PaddedPODArray, which makes sure that the -1 index is available - for (auto val_offset = offset_data[idx - 1]; idx < cursor; idx++) + for (auto val_offset = chars_size_bak; idx < cursor; idx++) { offset_data[idx] = ++val_offset; } @@ -288,7 +287,7 @@ void ParquetPlainValuesReader::readBatch( template <> -void ParquetPlainValuesReader>::readBatch( +void ParquetPlainValuesReader, ParquetReaderTypes::TimestampInt96>::readBatch( MutableColumnPtr & col_ptr, LazyNullMap & null_map, UInt32 num_values) { auto cursor = col_ptr->size(); @@ -302,21 +301,21 @@ void ParquetPlainValuesReader>::readBatch( null_map, /* individual_visitor */ [&](size_t nest_cursor) { - plain_data_buffer.readDateTime64(column_data[nest_cursor]); + plain_data_buffer.readDateTime64FromInt96(column_data[nest_cursor]); }, /* repeated_visitor */ [&](size_t nest_cursor, UInt32 count) { auto * col_data_pos = column_data + nest_cursor; for (UInt32 i = 0; i < count; i++) { - plain_data_buffer.readDateTime64(col_data_pos[i]); + plain_data_buffer.readDateTime64FromInt96(col_data_pos[i]); } } ); } -template -void ParquetPlainValuesReader::readBatch( +template +void ParquetPlainValuesReader::readBatch( MutableColumnPtr & col_ptr, LazyNullMap & null_map, UInt32 num_values) { auto cursor = col_ptr->size(); @@ -542,11 +541,14 @@ void ParquetRleDictReader::readBatch( template class ParquetPlainValuesReader; +template class ParquetPlainValuesReader; template class ParquetPlainValuesReader; +template class ParquetPlainValuesReader; template class ParquetPlainValuesReader; template class ParquetPlainValuesReader; template class ParquetPlainValuesReader>; template class ParquetPlainValuesReader>; +template class ParquetPlainValuesReader>; template class ParquetPlainValuesReader; template class ParquetFixedLenPlainReader>; @@ -557,7 +559,9 @@ template class ParquetRleLCReader; template class ParquetRleLCReader; template class ParquetRleDictReader; +template class ParquetRleDictReader; template class ParquetRleDictReader; +template class ParquetRleDictReader; template class ParquetRleDictReader; template class ParquetRleDictReader; template class ParquetRleDictReader>; diff --git a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.h b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.h index 688de4f52eb..0f916ff862d 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.h +++ b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.h @@ -24,7 +24,7 @@ public: /** * @brief Used when the bit_width is 0, so all elements have same value. 
*/ - RleValuesReader(UInt32 total_size, Int32 val = 0) + explicit RleValuesReader(UInt32 total_size, Int32 val = 0) : bit_reader(nullptr), bit_width(0), cur_group_size(total_size), cur_value(val), cur_group_is_packed(false) {} @@ -72,7 +72,8 @@ public: * @tparam SteppedValidVisitor A callback with signature: * void(size_t cursor, const std::vector & valid_index_steps) * for n valid elements with null value interleaved in a BitPacked group, - * i-th item in valid_index_steps describes how many elements in column there are after (i-1)-th valid element. + * i-th item in valid_index_steps describes how many elements there are + * from i-th valid element (include) to (i+1)-th valid element (exclude). * * take following BitPacked group with 2 valid elements for example: * null valid null null valid null @@ -138,10 +139,16 @@ public: using ParquetDataValuesReaderPtr = std::unique_ptr; +enum class ParquetReaderTypes +{ + Normal, + TimestampInt96, +}; + /** * The definition level is RLE or BitPacked encoding, while data is read directly */ -template +template class ParquetPlainValuesReader : public ParquetDataValuesReader { public: diff --git a/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.cpp b/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.cpp index 52dfad7606a..9e1cae9bb65 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.cpp +++ b/src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.cpp @@ -110,16 +110,24 @@ ColumnPtr readDictPage( template <> ColumnPtr readDictPage>( const parquet::DictionaryPage & page, - const parquet::ColumnDescriptor & /* col_des */, + const parquet::ColumnDescriptor & col_des, const DataTypePtr & data_type) { + const auto & datetime_type = assert_cast(*data_type); auto dict_col = ColumnDecimal::create(page.num_values(), datetime_type.getScale()); auto * col_data = dict_col->getData().data(); ParquetDataBuffer buffer(page.data(), page.size(), datetime_type.getScale()); - for (auto i = 0; i < page.num_values(); i++) + if (col_des.physical_type() == parquet::Type::INT64) { - buffer.readDateTime64(col_data[i]); + buffer.readBytes(dict_col->getData().data(), page.num_values() * sizeof(Int64)); + } + else + { + for (auto i = 0; i < page.num_values(); i++) + { + buffer.readDateTime64FromInt96(col_data[i]); + } } return dict_col; } @@ -190,8 +198,12 @@ std::unique_ptr createPlainReader( RleValuesReaderPtr def_level_reader, ParquetDataBuffer buffer) { - return std::make_unique>( - col_des.max_definition_level(), std::move(def_level_reader), std::move(buffer)); + if (std::is_same_v> && col_des.physical_type() == parquet::Type::INT96) + return std::make_unique>( + col_des.max_definition_level(), std::move(def_level_reader), std::move(buffer)); + else + return std::make_unique>( + col_des.max_definition_level(), std::move(def_level_reader), std::move(buffer)); } @@ -287,6 +299,7 @@ void ParquetLeafColReader::degradeDictionary() null_map = std::make_unique(reading_rows_num); auto col_existing = std::move(column); column = ColumnString::create(); + reserveColumnStrRows(column, reading_rows_num); ColumnString & col_dest = *static_cast(column.get()); const ColumnString & col_dict_str = *static_cast(dictionary.get()); @@ -294,8 +307,9 @@ void ParquetLeafColReader::degradeDictionary() visitColStrIndexType(dictionary->size(), [&](TColVec *) { const TColVec & col_src = *static_cast(col_existing.get()); - reserveColumnStrRows(column, reading_rows_num); + // It will be easier to create a ColumnLowCardinality and call convertToFullColumn() on it, + 
// while the performance loss is ignorable, the implementation can be updated next time. col_dest.getOffsets().resize(col_src.size()); for (size_t i = 0; i < col_src.size(); i++) { @@ -378,6 +392,11 @@ void ParquetLeafColReader::readPage() LOG_DEBUG(log, "{} values in dictionary page of column {}", dict_page.num_values(), col_descriptor.name()); dictionary = readDictPage(dict_page, col_descriptor, base_data_type); + if (unlikely(dictionary->size() < 2)) + { + // must not small than ColumnUnique::numSpecialValues() + dictionary->assumeMutable()->insertManyDefaults(2); + } if (std::is_same_v) { reading_low_cardinality = true; @@ -508,7 +527,9 @@ std::unique_ptr ParquetLeafColReader::createDi template class ParquetLeafColReader; +template class ParquetLeafColReader; template class ParquetLeafColReader; +template class ParquetLeafColReader; template class ParquetLeafColReader; template class ParquetLeafColReader; template class ParquetLeafColReader; diff --git a/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp b/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp index 9cde433b983..fddd8059925 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp +++ b/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp @@ -36,8 +36,7 @@ namespace ErrorCodes try { (s); } \ catch (const ::parquet::ParquetException & e) \ { \ - auto msg = PreformattedMessage::create("Excepted when reading parquet: {}", e.what()); \ - throw Exception(std::move(msg), ErrorCodes::PARQUET_EXCEPTION); \ + throw Exception(ErrorCodes::PARQUET_EXCEPTION, "Parquet exception: {}", e.what()); \ } \ } while (false) @@ -45,102 +44,252 @@ namespace { std::unique_ptr createFileReader( - std::shared_ptr<::arrow::io::RandomAccessFile> arrow_file) + std::shared_ptr<::arrow::io::RandomAccessFile> arrow_file, + std::shared_ptr metadata = nullptr) { std::unique_ptr res; - THROW_PARQUET_EXCEPTION(res = parquet::ParquetFileReader::Open(std::move(arrow_file))); + THROW_PARQUET_EXCEPTION(res = parquet::ParquetFileReader::Open( + std::move(arrow_file), + parquet::default_reader_properties(), + metadata)); return res; } -std::unique_ptr createColReader( - const parquet::ColumnDescriptor & col_descriptor, - DataTypePtr ch_type, - std::unique_ptr meta, - std::unique_ptr reader) +class ColReaderFactory { - if (col_descriptor.logical_type()->is_date() && parquet::Type::INT32 == col_descriptor.physical_type()) +public: + ColReaderFactory( + const parquet::ArrowReaderProperties & reader_properties_, + const parquet::ColumnDescriptor & col_descriptor_, + DataTypePtr ch_type_, + std::unique_ptr meta_, + std::unique_ptr page_reader_) + : reader_properties(reader_properties_) + , col_descriptor(col_descriptor_) + , ch_type(std::move(ch_type_)) + , meta(std::move(meta_)) + , page_reader(std::move(page_reader_)) {} + + std::unique_ptr makeReader(); + +private: + const parquet::ArrowReaderProperties & reader_properties; + const parquet::ColumnDescriptor & col_descriptor; + DataTypePtr ch_type; + std::unique_ptr meta; + std::unique_ptr page_reader; + + + UInt32 getScaleFromLogicalTimestamp(parquet::LogicalType::TimeUnit::unit tm_unit); + UInt32 getScaleFromArrowTimeUnit(arrow::TimeUnit::type tm_unit); + + std::unique_ptr fromInt32(); + std::unique_ptr fromInt64(); + std::unique_ptr fromByteArray(); + std::unique_ptr fromFLBA(); + + std::unique_ptr fromInt32INT(const parquet::IntLogicalType & int_type); + std::unique_ptr fromInt64INT(const parquet::IntLogicalType & int_type); + + template + auto makeLeafReader() { - return 
std::make_unique>( - col_descriptor, std::make_shared(), std::move(meta), std::move(reader)); + return std::make_unique>( + col_descriptor, std::make_shared(), std::move(meta), std::move(page_reader)); } - else if (col_descriptor.logical_type()->is_decimal()) + + template + auto makeDecimalLeafReader() { - switch (col_descriptor.physical_type()) + auto data_type = std::make_shared>( + col_descriptor.type_precision(), col_descriptor.type_scale()); + return std::make_unique>>( + col_descriptor, std::move(data_type), std::move(meta), std::move(page_reader)); + } + + std::unique_ptr throwUnsupported(std::string msg = "") + { + throw Exception( + ErrorCodes::PARQUET_EXCEPTION, + "Unsupported logical type: {} and physical type: {} for field =={}=={}", + col_descriptor.logical_type()->ToString(), col_descriptor.physical_type(), col_descriptor.name(), msg); + } +}; + +UInt32 ColReaderFactory::getScaleFromLogicalTimestamp(parquet::LogicalType::TimeUnit::unit tm_unit) +{ + switch (tm_unit) + { + case parquet::LogicalType::TimeUnit::MILLIS: + return 3; + case parquet::LogicalType::TimeUnit::MICROS: + return 6; + case parquet::LogicalType::TimeUnit::NANOS: + return 9; + default: + throwUnsupported(PreformattedMessage::create(", invalid timestamp unit: {}", tm_unit)); + return 0; + } +} + +UInt32 ColReaderFactory::getScaleFromArrowTimeUnit(arrow::TimeUnit::type tm_unit) +{ + switch (tm_unit) + { + case arrow::TimeUnit::MILLI: + return 3; + case arrow::TimeUnit::MICRO: + return 6; + case arrow::TimeUnit::NANO: + return 9; + default: + throwUnsupported(PreformattedMessage::create(", invalid arrow time unit: {}", tm_unit)); + return 0; + } +} + +std::unique_ptr ColReaderFactory::fromInt32() +{ + switch (col_descriptor.logical_type()->type()) + { + case parquet::LogicalType::Type::INT: + return fromInt32INT(dynamic_cast(*col_descriptor.logical_type())); + case parquet::LogicalType::Type::NONE: + return makeLeafReader(); + case parquet::LogicalType::Type::DATE: + return makeLeafReader(); + case parquet::LogicalType::Type::DECIMAL: + return makeDecimalLeafReader(); + default: + return throwUnsupported(); + } +} + +std::unique_ptr ColReaderFactory::fromInt64() +{ + switch (col_descriptor.logical_type()->type()) + { + case parquet::LogicalType::Type::INT: + return fromInt64INT(dynamic_cast(*col_descriptor.logical_type())); + case parquet::LogicalType::Type::NONE: + return makeLeafReader(); + case parquet::LogicalType::Type::TIMESTAMP: { - case parquet::Type::INT32: - { - auto data_type = std::make_shared( - col_descriptor.type_precision(), col_descriptor.type_scale()); - return std::make_unique>>( - col_descriptor, data_type, std::move(meta), std::move(reader)); - } - case parquet::Type::INT64: - { - auto data_type = std::make_shared( - col_descriptor.type_precision(), col_descriptor.type_scale()); - return std::make_unique>>( - col_descriptor, data_type, std::move(meta), std::move(reader)); - } - case parquet::Type::FIXED_LEN_BYTE_ARRAY: - { - if (col_descriptor.type_length() <= static_cast(sizeof(Decimal128))) - { - auto data_type = std::make_shared( - col_descriptor.type_precision(), col_descriptor.type_scale()); - return std::make_unique>>( - col_descriptor, data_type, std::move(meta), std::move(reader)); - } - else - { - auto data_type = std::make_shared( - col_descriptor.type_precision(), col_descriptor.type_scale()); - return std::make_unique>>( - col_descriptor, data_type, std::move(meta), std::move(reader)); - } - } - default: - throw Exception( - ErrorCodes::PARQUET_EXCEPTION, - "Type not supported 
for decimal: {}", - col_descriptor.physical_type()); + const auto & tm_type = dynamic_cast(*col_descriptor.logical_type()); + auto read_type = std::make_shared(getScaleFromLogicalTimestamp(tm_type.time_unit())); + return std::make_unique>>( + col_descriptor, std::move(read_type), std::move(meta), std::move(page_reader)); } + case parquet::LogicalType::Type::DECIMAL: + return makeDecimalLeafReader(); + default: + return throwUnsupported(); } - else +} + +std::unique_ptr ColReaderFactory::fromByteArray() +{ + switch (col_descriptor.logical_type()->type()) { - switch (col_descriptor.physical_type()) - { - case parquet::Type::INT32: - return std::make_unique>( - col_descriptor, std::make_shared(), std::move(meta), std::move(reader)); - case parquet::Type::INT64: - return std::make_unique>( - col_descriptor, std::make_shared(), std::move(meta), std::move(reader)); - case parquet::Type::FLOAT: - return std::make_unique>( - col_descriptor, std::make_shared(), std::move(meta), std::move(reader)); - case parquet::Type::INT96: - { - DataTypePtr read_type = ch_type; - if (!isDateTime64(ch_type)) - { - read_type = std::make_shared(ParquetRecordReader::default_datetime64_scale); - } - return std::make_unique>>( - col_descriptor, read_type, std::move(meta), std::move(reader)); - } - case parquet::Type::DOUBLE: - return std::make_unique>( - col_descriptor, std::make_shared(), std::move(meta), std::move(reader)); - case parquet::Type::BYTE_ARRAY: - return std::make_unique>( - col_descriptor, std::make_shared(), std::move(meta), std::move(reader)); - default: - throw Exception( - ErrorCodes::PARQUET_EXCEPTION, "Type not supported: {}", col_descriptor.physical_type()); - } + case parquet::LogicalType::Type::STRING: + return makeLeafReader(); + default: + return throwUnsupported(); } } +std::unique_ptr ColReaderFactory::fromFLBA() +{ + switch (col_descriptor.logical_type()->type()) + { + case parquet::LogicalType::Type::DECIMAL: + { + if (col_descriptor.type_length() <= static_cast(sizeof(Decimal128))) + return makeDecimalLeafReader(); + else if (col_descriptor.type_length() <= static_cast(sizeof(Decimal256))) + return makeDecimalLeafReader(); + + return throwUnsupported(PreformattedMessage::create( + ", invalid type length: {}", col_descriptor.type_length())); + } + default: + return throwUnsupported(); + } +} + +std::unique_ptr ColReaderFactory::fromInt32INT(const parquet::IntLogicalType & int_type) +{ + switch (int_type.bit_width()) + { + case sizeof(Int32): + { + if (int_type.is_signed()) + return makeLeafReader(); + else + return makeLeafReader(); + } + default: + return throwUnsupported(PreformattedMessage::create(", bit width: {}", int_type.bit_width())); + } +} + +std::unique_ptr ColReaderFactory::fromInt64INT(const parquet::IntLogicalType & int_type) +{ + switch (int_type.bit_width()) + { + case sizeof(Int64): + { + if (int_type.is_signed()) + return makeLeafReader(); + else + return makeLeafReader(); + } + default: + return throwUnsupported(PreformattedMessage::create(", bit width: {}", int_type.bit_width())); + } +} + +// refer: GetArrowType method in schema_internal.cc of arrow +std::unique_ptr ColReaderFactory::makeReader() +{ + // this method should to be called only once for each instance + SCOPE_EXIT({ page_reader = nullptr; }); + assert(page_reader); + + switch (col_descriptor.physical_type()) + { + case parquet::Type::BOOLEAN: + break; + case parquet::Type::INT32: + return fromInt32(); + case parquet::Type::INT64: + return fromInt64(); + case parquet::Type::INT96: + { + DataTypePtr 
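// A self-contained illustration (not part of this patch) of what the INT96 branch
// ultimately has to compute: turning a 12-byte Parquet INT96 timestamp into the
// nanosecond count stored by a DateTime64(9) column. The layout assumed here,
// 8 bytes of nanoseconds within the day followed by a 4-byte Julian day number,
// both little-endian, is the convention used by writers such as Spark and Impala;
// the helper name int96ToNanoseconds is invented for this sketch and assumes a
// little-endian host.
#include <cstdint>
#include <cstring>

inline int64_t int96ToNanoseconds(const uint8_t * data)
{
    uint64_t nanos_of_day;  // first 8 bytes: nanoseconds elapsed since midnight
    uint32_t julian_day;    // last 4 bytes: Julian day number
    std::memcpy(&nanos_of_day, data, sizeof(nanos_of_day));
    std::memcpy(&julian_day, data + 8, sizeof(julian_day));

    // 2440588 is the Julian day of the Unix epoch (1970-01-01).
    const int64_t days_since_epoch = static_cast<int64_t>(julian_day) - 2440588;
    return days_since_epoch * 86400LL * 1'000'000'000LL + static_cast<int64_t>(nanos_of_day);
}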
read_type = ch_type; + if (!isDateTime64(ch_type)) + { + auto scale = getScaleFromArrowTimeUnit(reader_properties.coerce_int96_timestamp_unit()); + read_type = std::make_shared(scale); + } + return std::make_unique>>( + col_descriptor, read_type, std::move(meta), std::move(page_reader)); + } + case parquet::Type::FLOAT: + return makeLeafReader(); + case parquet::Type::DOUBLE: + return makeLeafReader(); + case parquet::Type::BYTE_ARRAY: + return fromByteArray(); + case parquet::Type::FIXED_LEN_BYTE_ARRAY: + return fromFLBA(); + default: + break; + } + + return throwUnsupported(); +} + } // anonymous namespace ParquetRecordReader::ParquetRecordReader( @@ -148,8 +297,9 @@ ParquetRecordReader::ParquetRecordReader( parquet::ArrowReaderProperties reader_properties_, std::shared_ptr<::arrow::io::RandomAccessFile> arrow_file, const FormatSettings & format_settings, - std::vector row_groups_indices_) - : file_reader(createFileReader(std::move(arrow_file))) + std::vector row_groups_indices_, + std::shared_ptr metadata) + : file_reader(createFileReader(std::move(arrow_file), std::move(metadata))) , reader_properties(reader_properties_) , header(std::move(header_)) , max_block_size(format_settings.parquet.max_block_size) @@ -210,15 +360,17 @@ void ParquetRecordReader::loadNextRowGroup() column_readers.clear(); for (size_t i = 0; i < parquet_col_indice.size(); i++) { - column_readers.emplace_back(createColReader( + ColReaderFactory factory( + reader_properties, *file_reader->metadata()->schema()->Column(parquet_col_indice[i]), header.getByPosition(i).type, cur_row_group_reader->metadata()->ColumnChunk(parquet_col_indice[i]), - cur_row_group_reader->GetColumnPageReader(parquet_col_indice[i]))); + cur_row_group_reader->GetColumnPageReader(parquet_col_indice[i])); + column_readers.emplace_back(factory.makeReader()); } auto duration = watch.elapsedNanoseconds() / 1e6; - LOG_DEBUG(log, "reading row group {} consumed {} ms", row_groups_indices[next_row_group_idx], duration); + LOG_DEBUG(log, "begin to read row group {} consumed {} ms", row_groups_indices[next_row_group_idx], duration); ++next_row_group_idx; cur_row_group_left_rows = cur_row_group_reader->metadata()->num_rows(); diff --git a/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.h b/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.h index 4789be59ec8..2f728a586a0 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.h +++ b/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.h @@ -22,13 +22,11 @@ public: parquet::ArrowReaderProperties reader_properties_, std::shared_ptr<::arrow::io::RandomAccessFile> arrow_file, const FormatSettings & format_settings, - std::vector row_groups_indices_); + std::vector row_groups_indices_, + std::shared_ptr metadata = nullptr); Chunk readChunk(); - // follow the scale generated by spark - static constexpr UInt8 default_datetime64_scale = 9; - private: std::unique_ptr file_reader; parquet::ArrowReaderProperties reader_properties; diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp index 2e849f09fda..7fc7b9c3cab 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp @@ -3,6 +3,7 @@ #if USE_PARQUET +#include #include #include #include @@ -623,6 +624,7 @@ void ParquetBlockInputFormat::decodeOneChunk(size_t row_group_batch_idx, std::un return; } + // TODO support defaults_for_omitted_fields feature when supporting nested columns auto 
num_rows = chunk.getNumRows(); res = get_pending_chunk(num_rows, std::move(chunk)); } diff --git a/tests/queries/0_stateless/02998_native_parquet_reader.sh b/tests/queries/0_stateless/02998_native_parquet_reader.sh index 4e5169c4bf0..d6369c4921b 100755 --- a/tests/queries/0_stateless/02998_native_parquet_reader.sh +++ b/tests/queries/0_stateless/02998_native_parquet_reader.sh @@ -5,7 +5,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . "$CURDIR"/../shell_config.sh -PAR_PATH="$CURDIR"/02998_native_parquet_reader.parquet +PAR_PATH="$CURDIR"/data_parquet/native_parquet_reader.parquet # the content of parquet file can be generated by following codes # < Date: Wed, 15 May 2024 19:27:15 +0200 Subject: [PATCH 138/392] Fix special build --- src/Columns/ColumnDynamic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Columns/ColumnDynamic.h b/src/Columns/ColumnDynamic.h index c6626433877..40e8e350733 100644 --- a/src/Columns/ColumnDynamic.h +++ b/src/Columns/ColumnDynamic.h @@ -137,7 +137,7 @@ public: void insertData(const char * pos, size_t length) override { - return variant_column->insertData(pos, length); + variant_column->insertData(pos, length); } void insert(const Field & x) override; From 4cfe2665de328a7a7921e0b0a76ddf9b4e2d5486 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Wed, 15 May 2024 20:28:17 +0200 Subject: [PATCH 139/392] Update src/Formats/FormatSettings.h --- src/Formats/FormatSettings.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index 004b16b6061..bf3269bd42d 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -363,7 +363,7 @@ struct FormatSettings bool skip_trailing_empty_lines = false; bool allow_variable_number_of_columns = false; bool crlf_end_of_line_input = false; - } tsv; + } tsv{}; struct { From 04fb84d4ade10df2a4fc9f6cb6f94ac4993d1ffd Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Wed, 15 May 2024 21:57:15 +0200 Subject: [PATCH 140/392] Update src/Core/SettingsChangesHistory.h Co-authored-by: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> --- src/Core/SettingsChangesHistory.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index 6edfcc129f8..e004e83355b 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -91,13 +91,13 @@ static std::map sett {"cross_join_min_rows_to_compress", 0, 10000000, "A new setting."}, {"cross_join_min_bytes_to_compress", 0, 1_GiB, "A new setting."}, {"prefer_external_sort_block_bytes", 0, DEFAULT_BLOCK_SIZE * 256, "Prefer maximum block bytes for external sort, reduce the memory usage during merging."}, - {"hdfs_throw_on_zero_files_match", false, false, "Throw an error, when ListObjects request cannot match any files"}, - {"azure_throw_on_zero_files_match", false, false, "Throw an error, when ListObjects request cannot match any files"}, - {"s3_validate_request_settings", true, true, "Validate S3 request settings"}, + {"hdfs_throw_on_zero_files_match", false, false, "Allow to throw an error when ListObjects request cannot match any files in HDFS engine instead of empty query result"}, + {"azure_throw_on_zero_files_match", false, false, "Allow to throw an error when ListObjects request cannot match any files in 
AzureBlobStorage engine instead of empty query result"}, + {"s3_validate_request_settings", true, true, "Allow to disable S3 request settings validation"}, {"azure_skip_empty_files", false, false, "Allow to skip empty files in azure table engine"}, - {"hdfs_ignore_file_doesnt_exist", false, false, "Ignore if files does not exits and return 0 zeros for StorageHDFS"}, - {"azure_ignore_file_doesnt_exist", false, false, "Ignore if files does not exits and return 0 zeros for StorageAzureBlob"}, - {"s3_ignore_file_doesnt_exist", false, false, "Ignore if files does not exits and return 0 zeros for StorageS3"}, + {"hdfs_ignore_file_doesnt_exist", false, false, "Allow to return 0 rows when the requested files don't exist instead of throwing an exception in HDFS table engine"}, + {"azure_ignore_file_doesnt_exist", false, false, "Allow to return 0 rows when the requested files don't exist instead of throwing an exception in AzureBlobStorage table engine"}, + {"s3_ignore_file_doesnt_exist", false, false, "Allow to return 0 rows when the requested files don't exist instead of throwing an exception in S3 table engine"}, {"input_format_force_null_for_omitted_fields", false, false, "Disable type-defaults for omitted fields when needed"}, {"output_format_pretty_preserve_border_for_multiline_string", 1, 1, "Applies better rendering for multiline strings."}, }}, From a63e846724f503607fe38b34fda37345ee8111c5 Mon Sep 17 00:00:00 2001 From: kssenii Date: Wed, 15 May 2024 22:13:48 +0200 Subject: [PATCH 141/392] Review fixes --- docs/en/operations/settings/settings.md | 20 +++++++++++++++++++ .../StorageObjectStorageSink.cpp | 2 +- .../ObjectStorage/StorageObjectStorageSink.h | 2 +- src/Storages/S3Queue/S3QueueSource.cpp | 4 ++-- .../TableFunctionObjectStorage.cpp | 5 ++--- .../TableFunctionObjectStorage.h | 10 ++++++++-- .../TableFunctionObjectStorageCluster.h | 19 ++++++++++-------- 7 files changed, 45 insertions(+), 17 deletions(-) diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 1772a3aa861..3a79eb64c67 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -3675,6 +3675,16 @@ Possible values: Default value: `0`. +## s3_validate_request_settings {#s3_validate_request_settings} + +Enables s3 request settings validation. + +Possible values: +- 1 — validate settings. +- 0 — do not validate settings. + +Default value: `1`. + ## hdfs_truncate_on_insert {#hdfs_truncate_on_insert} Enables or disables truncation before an insert in hdfs engine tables. If disabled, an exception will be thrown on an attempt to insert if a file in HDFS already exists. @@ -3747,6 +3757,16 @@ Possible values: Default value: `0`. +## azure_skip_empty_files {#azure_skip_empty_files} + +Enables or disables skipping empty files in S3 engine. + +Possible values: +- 0 — `SELECT` throws an exception if empty file is not compatible with requested format. +- 1 — `SELECT` returns empty result for empty file. + +Default value: `0`. + ## engine_url_skip_empty_files {#engine_url_skip_empty_files} Enables or disables skipping empty files in [URL](../../engines/table-engines/special/url.md) engine tables. 
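Below is a minimal sketch of the behaviour the skip-empty-files settings above toggle, assuming a plain local file listing; the helper name filterInputs and the use of std::filesystem are illustrative assumptions and not ClickHouse code. When the flag is enabled an empty file simply contributes zero rows, otherwise it is handed to the format parser, which may then throw.

#include <filesystem>
#include <string>
#include <vector>

std::vector<std::string> filterInputs(const std::vector<std::string> & paths, bool skip_empty_files)
{
    std::vector<std::string> kept;
    for (const auto & path : paths)
    {
        // With the setting enabled, a zero-byte file is dropped before parsing starts.
        if (skip_empty_files && std::filesystem::file_size(path) == 0)
            continue;
        kept.push_back(path);
    }
    return kept;
}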
diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp index 81bdeaa43a3..0a3cf19a590 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp @@ -15,7 +15,7 @@ namespace ErrorCodes StorageObjectStorageSink::StorageObjectStorageSink( ObjectStoragePtr object_storage, ConfigurationPtr configuration, - std::optional format_settings_, + const std::optional & format_settings_, const Block & sample_block_, ContextPtr context, const std::string & blob_path) diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSink.h b/src/Storages/ObjectStorage/StorageObjectStorageSink.h index a3c8ef68cf0..45cf83d606f 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSink.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageSink.h @@ -13,7 +13,7 @@ public: StorageObjectStorageSink( ObjectStoragePtr object_storage, ConfigurationPtr configuration, - std::optional format_settings_, + const std::optional & format_settings_, const Block & sample_block_, ContextPtr context, const std::string & blob_path = ""); diff --git a/src/Storages/S3Queue/S3QueueSource.cpp b/src/Storages/S3Queue/S3QueueSource.cpp index 0cee94769c4..458f681d7b5 100644 --- a/src/Storages/S3Queue/S3QueueSource.cpp +++ b/src/Storages/S3Queue/S3QueueSource.cpp @@ -235,7 +235,7 @@ Chunk StorageS3QueueSource::generate() catch (...) { LOG_ERROR(log, "Failed to set file {} as failed: {}", - key_with_info->key, getCurrentExceptionMessage(true)); + key_with_info->relative_path, getCurrentExceptionMessage(true)); } appendLogElement(reader.getRelativePath(), *file_status, processed_rows_from_file, false); @@ -262,7 +262,7 @@ Chunk StorageS3QueueSource::generate() catch (...) 
{ LOG_ERROR(log, "Failed to set file {} as failed: {}", - key_with_info->key, getCurrentExceptionMessage(true)); + key_with_info->relative_path, getCurrentExceptionMessage(true)); } appendLogElement(reader.getRelativePath(), *file_status, processed_rows_from_file, false); diff --git a/src/TableFunctions/TableFunctionObjectStorage.cpp b/src/TableFunctions/TableFunctionObjectStorage.cpp index 9f16a9a0b25..550d9cc799b 100644 --- a/src/TableFunctions/TableFunctionObjectStorage.cpp +++ b/src/TableFunctions/TableFunctionObjectStorage.cpp @@ -116,9 +116,8 @@ StoragePtr TableFunctionObjectStorage::executeImpl( columns, ConstraintsDescription{}, String{}, - /// No format_settings for table function Azure - std::nullopt, - /* distributed_processing */ false, + /* format_settings */std::nullopt, + /* distributed_processing */false, nullptr); storage->startup(); diff --git a/src/TableFunctions/TableFunctionObjectStorage.h b/src/TableFunctions/TableFunctionObjectStorage.h index bbc40cc6191..86b8f0d5e14 100644 --- a/src/TableFunctions/TableFunctionObjectStorage.h +++ b/src/TableFunctions/TableFunctionObjectStorage.h @@ -32,6 +32,7 @@ struct AzureDefinition " - storage_account_url, container_name, blobpath, account_name, account_key, format\n" " - storage_account_url, container_name, blobpath, account_name, account_key, format, compression\n" " - storage_account_url, container_name, blobpath, account_name, account_key, format, compression, structure\n"; + static constexpr auto max_number_of_arguments = 8; }; struct S3Definition @@ -51,6 +52,7 @@ struct S3Definition " - url, access_key_id, secret_access_key, format, structure, compression_method\n" " - url, access_key_id, secret_access_key, session_token, format, structure, compression_method\n" "All signatures supports optional headers (specified as `headers('name'='value', 'name2'='value2')`)"; + static constexpr auto max_number_of_arguments = 8; }; struct GCSDefinition @@ -58,6 +60,7 @@ struct GCSDefinition static constexpr auto name = "gcs"; static constexpr auto storage_type_name = "GCS"; static constexpr auto signature = S3Definition::signature; + static constexpr auto max_number_of_arguments = S3Definition::max_number_of_arguments; }; struct COSNDefinition @@ -65,6 +68,7 @@ struct COSNDefinition static constexpr auto name = "cosn"; static constexpr auto storage_type_name = "COSN"; static constexpr auto signature = S3Definition::signature; + static constexpr auto max_number_of_arguments = S3Definition::max_number_of_arguments; }; struct OSSDefinition @@ -72,6 +76,7 @@ struct OSSDefinition static constexpr auto name = "oss"; static constexpr auto storage_type_name = "OSS"; static constexpr auto signature = S3Definition::signature; + static constexpr auto max_number_of_arguments = S3Definition::max_number_of_arguments; }; struct HDFSDefinition @@ -82,6 +87,7 @@ struct HDFSDefinition " - uri, format\n" " - uri, format, structure\n" " - uri, format, structure, compression_method\n"; + static constexpr auto max_number_of_arguments = 4; }; template @@ -91,7 +97,7 @@ public: static constexpr auto name = Definition::name; static constexpr auto signature = Definition::signature; - static size_t getMaxNumberOfArguments() { return 8; } + static size_t getMaxNumberOfArguments() { return Definition::max_number_of_arguments; } String getName() const override { return name; } @@ -105,7 +111,7 @@ public: bool supportsReadingSubsetOfColumns(const ContextPtr & context) override { - return 
FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(configuration->format, context); + return configuration->format != "auto" && FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(configuration->format, context); } std::unordered_set getVirtualsToCheckBeforeUsingStructureHint() const override diff --git a/src/TableFunctions/TableFunctionObjectStorageCluster.h b/src/TableFunctions/TableFunctionObjectStorageCluster.h index 76786fafe99..296791b8bda 100644 --- a/src/TableFunctions/TableFunctionObjectStorageCluster.h +++ b/src/TableFunctions/TableFunctionObjectStorageCluster.h @@ -17,17 +17,10 @@ class StorageAzureConfiguration; struct AzureClusterDefinition { - /** - * azureBlobStorageCluster(cluster_name, source, [access_key_id, secret_access_key,] format, compression_method, structure) - * A table function, which allows to process many files from Azure Blob Storage on a specific cluster - * On initiator it creates a connection to _all_ nodes in cluster, discloses asterisks - * in Azure Blob Storage file path and dispatch each file dynamically. - * On worker node it asks initiator about next task to process, processes it. - * This is repeated until the tasks are finished. - */ static constexpr auto name = "azureBlobStorageCluster"; static constexpr auto storage_type_name = "AzureBlobStorageCluster"; static constexpr auto signature = " - cluster, connection_string|storage_account_url, container_name, blobpath, [account_name, account_key, format, compression, structure]"; + static constexpr auto max_number_of_arguments = AzureDefinition::max_number_of_arguments + 1; }; struct S3ClusterDefinition @@ -44,6 +37,7 @@ struct S3ClusterDefinition " - cluster, url, access_key_id, secret_access_key, format, structure, compression_method\n" " - cluster, url, access_key_id, secret_access_key, session_token, format, structure, compression_method\n" "All signatures supports optional headers (specified as `headers('name'='value', 'name2'='value2')`)"; + static constexpr auto max_number_of_arguments = S3Definition::max_number_of_arguments + 1; }; struct HDFSClusterDefinition @@ -54,8 +48,17 @@ struct HDFSClusterDefinition " - cluster_name, uri, format\n" " - cluster_name, uri, format, structure\n" " - cluster_name, uri, format, structure, compression_method\n"; + static constexpr auto max_number_of_arguments = HDFSDefinition::max_number_of_arguments + 1; }; +/** +* Class implementing s3/hdfs/azureBlobStorage)Cluster(...) table functions, +* which allow to process many files from S3/HDFS/Azure blob storage on a specific cluster. +* On initiator it creates a connection to _all_ nodes in cluster, discloses asterisks +* in file path and dispatch each file dynamically. +* On worker node it asks initiator about next task to process, processes it. +* This is repeated until the tasks are finished. 
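// A runnable, minimal sketch of the dispatch model described above; every name in it
// (TaskQueue, nextTask, worker) is invented for illustration and is not the actual
// implementation. The initiator expands the path pattern into concrete file names once,
// and each worker keeps asking for the next file until the shared queue is exhausted,
// which is the point at which processing finishes.
#include <iostream>
#include <mutex>
#include <optional>
#include <queue>
#include <string>
#include <thread>
#include <vector>

class TaskQueue
{
public:
    explicit TaskQueue(std::vector<std::string> files)
    {
        for (auto & file : files)
            tasks.push(std::move(file));
    }

    std::optional<std::string> nextTask()
    {
        std::lock_guard lock(mutex);
        if (tasks.empty())
            return std::nullopt;  // tells the worker there is nothing left to process
        auto task = std::move(tasks.front());
        tasks.pop();
        return task;
    }

private:
    std::mutex mutex;
    std::queue<std::string> tasks;
};

int main()
{
    // The "initiator" side: the glob has already been expanded into concrete files.
    TaskQueue initiator({"data/part-0.parquet", "data/part-1.parquet", "data/part-2.parquet"});

    // The "worker" side: ask for the next task until the initiator has nothing left.
    auto worker = [&](int id)
    {
        while (auto task = initiator.nextTask())
            std::cout << "worker " << id << " processes " << *task << '\n';
    };

    std::thread w1(worker, 1);
    std::thread w2(worker, 2);
    w1.join();
    w2.join();
    return 0;
}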
+*/ template class TableFunctionObjectStorageCluster : public ITableFunctionCluster> { From f19615788bf05be3440cddf552d0bf51e33cbc5c Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 15 May 2024 22:37:33 +0000 Subject: [PATCH 142/392] Fix special build --- src/Columns/ColumnDynamic.cpp | 6 +++--- src/DataTypes/DataTypeDynamic.cpp | 2 +- src/DataTypes/Serializations/SerializationDynamic.h | 2 +- src/Parsers/ParserDataType.cpp | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/Columns/ColumnDynamic.cpp b/src/Columns/ColumnDynamic.cpp index 0f247638d92..d63a03dbafd 100644 --- a/src/Columns/ColumnDynamic.cpp +++ b/src/Columns/ColumnDynamic.cpp @@ -227,7 +227,7 @@ void ColumnDynamic::insertFrom(const DB::IColumn & src_, size_t n) auto & variant_col = assert_cast(*variant_column); /// If variants are different, we need to extend our variant with new variants. - if (auto global_discriminators_mapping = combineVariants(dynamic_src.variant_info)) + if (auto * global_discriminators_mapping = combineVariants(dynamic_src.variant_info)) { variant_col.insertFrom(*dynamic_src.variant_column, n, *global_discriminators_mapping); return; @@ -281,7 +281,7 @@ void ColumnDynamic::insertRangeFrom(const DB::IColumn & src_, size_t start, size auto & variant_col = assert_cast(*variant_column); /// If variants are different, we need to extend our variant with new variants. - if (auto global_discriminators_mapping = combineVariants(dynamic_src.variant_info)) + if (auto * global_discriminators_mapping = combineVariants(dynamic_src.variant_info)) { variant_col.insertRangeFrom(*dynamic_src.variant_column, start, length, *global_discriminators_mapping); return; @@ -443,7 +443,7 @@ void ColumnDynamic::insertManyFrom(const DB::IColumn & src_, size_t position, si auto & variant_col = assert_cast(*variant_column); /// If variants are different, we need to extend our variant with new variants. - if (auto global_discriminators_mapping = combineVariants(dynamic_src.variant_info)) + if (auto * global_discriminators_mapping = combineVariants(dynamic_src.variant_info)) { variant_col.insertManyFrom(*dynamic_src.variant_column, position, length, *global_discriminators_mapping); return; diff --git a/src/DataTypes/DataTypeDynamic.cpp b/src/DataTypes/DataTypeDynamic.cpp index 2c6b3eba906..c920e69c13b 100644 --- a/src/DataTypes/DataTypeDynamic.cpp +++ b/src/DataTypes/DataTypeDynamic.cpp @@ -67,7 +67,7 @@ static DataTypePtr create(const ASTPtr & arguments) if (identifier_name != "max_types") throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, "Unexpected identifier: {}. 
Dynamic data type argument should be in a form 'max_types=N'", identifier_name); - auto literal = argument->arguments->children[1]->as(); + auto * literal = argument->arguments->children[1]->as(); if (!literal || literal->value.getType() != Field::Types::UInt64 || literal->value.get() == 0 || literal->value.get() > 255) throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, "'max_types' argument for Dynamic type should be a positive integer between 1 and 255"); diff --git a/src/DataTypes/Serializations/SerializationDynamic.h b/src/DataTypes/Serializations/SerializationDynamic.h index 7471ff54cf7..001a3cf87ce 100644 --- a/src/DataTypes/Serializations/SerializationDynamic.h +++ b/src/DataTypes/Serializations/SerializationDynamic.h @@ -11,7 +11,7 @@ class SerializationDynamicElement; class SerializationDynamic : public ISerialization { public: - SerializationDynamic(size_t max_dynamic_types_) : max_dynamic_types(max_dynamic_types_) + explicit SerializationDynamic(size_t max_dynamic_types_) : max_dynamic_types(max_dynamic_types_) { } diff --git a/src/Parsers/ParserDataType.cpp b/src/Parsers/ParserDataType.cpp index c88b5e0e3a2..78d62456fcf 100644 --- a/src/Parsers/ParserDataType.cpp +++ b/src/Parsers/ParserDataType.cpp @@ -55,7 +55,7 @@ private: class ParserDataTypeArgument : public IParserBase { public: - ParserDataTypeArgument(std::string_view type_name_) : type_name(type_name_) + explicit ParserDataTypeArgument(std::string_view type_name_) : type_name(type_name_) { } From d7f95ddfcf1c6b0f25c273615caf4be42986778c Mon Sep 17 00:00:00 2001 From: Max K Date: Thu, 16 May 2024 13:16:01 +0200 Subject: [PATCH 143/392] CI: Enable Arm integration tests job in CI --- .github/PULL_REQUEST_TEMPLATE.md | 9 +- tests/ci/ci_config.py | 139 +++++++++++++++++-------------- 2 files changed, 81 insertions(+), 67 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 3e0131a388a..64dc9049bc2 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -42,25 +42,25 @@ At a minimum, the following information should be added (but add more as needed) > Information about CI checks: https://clickhouse.com/docs/en/development/continuous-integration/
- Modify your CI run + CI Settings **NOTE:** If your merge the PR with modified CI you **MUST KNOW** what you are doing **NOTE:** Checked options will be applied if set before CI RunConfig/PrepareRunConfig step -#### Include tests (required builds will be added automatically): -- [ ] Fast test +#### Run these jobs only (required builds will be added automatically): - [ ] Integration Tests - [ ] Stateless tests - [ ] Stateful tests - [ ] Unit tests - [ ] Performance tests +- [ ] All with aarch64 - [ ] All with ASAN - [ ] All with TSAN - [ ] All with Analyzer - [ ] All with Azure - [ ] Add your option here -#### Exclude tests: +#### Deny these jobs: - [ ] Fast test - [ ] Integration Tests - [ ] Stateless tests @@ -72,7 +72,6 @@ At a minimum, the following information should be added (but add more as needed) - [ ] All with UBSAN - [ ] All with Coverage - [ ] All with Aarch64 -- [ ] Add your option here #### Extra options: - [ ] do not test (only style check) diff --git a/tests/ci/ci_config.py b/tests/ci/ci_config.py index 588f4934125..84041b8782f 100644 --- a/tests/ci/ci_config.py +++ b/tests/ci/ci_config.py @@ -448,9 +448,9 @@ bugfix_validate_check = DigestConfig( ], exclude_files=[".md"], docker=IMAGES.copy() - + [ - "clickhouse/stateless-test", - ], + + [ + "clickhouse/stateless-test", + ], ) # common test params docker_server_job_config = JobConfig( @@ -570,7 +570,7 @@ class CIConfig: if self.is_build_job(job_name): stage_type = CIStages.BUILDS_1 if job_name in CI_CONFIG.get_builds_for_report( - JobNames.BUILD_CHECK_SPECIAL + JobNames.BUILD_CHECK_SPECIAL ): # special builds go to Build_2 stage to not delay Builds_1/Test_1 stage_type = CIStages.BUILDS_2 @@ -584,7 +584,7 @@ class CIConfig: required_build = CI_CONFIG.test_configs[job_name].required_build assert required_build if required_build in CI_CONFIG.get_builds_for_report( - JobNames.BUILD_CHECK + JobNames.BUILD_CHECK ): stage_type = CIStages.TESTS_1 else: @@ -597,10 +597,10 @@ class CIConfig: def get_job_config(self, check_name: str) -> JobConfig: res = None for config in ( - self.build_config, - self.builds_report_config, - self.test_configs, - self.other_jobs_configs, + self.build_config, + self.builds_report_config, + self.test_configs, + self.other_jobs_configs, ): if check_name in config: # type: ignore res = config[check_name].job_config # type: ignore @@ -612,47 +612,47 @@ class CIConfig: if self.is_build_job(check_name) or check_name == JobNames.FAST_TEST: result = Runners.BUILDER elif any( - words in check_name.lower() - for words in [ - "install packages", - "compatibility check", - "docker", - "build check", - "jepsen", - "style check", - ] + words in check_name.lower() + for words in [ + "install packages", + "compatibility check", + "docker", + "build check", + "jepsen", + "style check", + ] ): result = Runners.STYLE_CHECKER elif check_name == JobNames.DOCS_CHECK: # docs job is demanding result = Runners.FUNC_TESTER_ARM elif any( - words in check_name.lower() - for words in [ - "stateless", - "stateful", - "clickbench", - "sqllogic test", - "libfuzzer", - "bugfix validation", - ] + words in check_name.lower() + for words in [ + "stateless", + "stateful", + "clickbench", + "sqllogic test", + "libfuzzer", + "bugfix validation", + ] ): result = Runners.FUNC_TESTER elif any( - words in check_name.lower() - for words in ["stress", "upgrade", "integration", "performance comparison"] + words in check_name.lower() + for words in ["stress", "upgrade", "integration", "performance comparison"] ): result = Runners.STRESS_TESTER elif 
any( - words in check_name.lower() - for words in ["ast fuzzer", "unit tests", "sqlancer", "sqltest"] + words in check_name.lower() + for words in ["ast fuzzer", "unit tests", "sqlancer", "sqltest"] ): result = Runners.FUZZER_UNIT_TESTER assert result, f"BUG, no runner for [{check_name}]" if ( - "aarch" in check_name.lower() or "arm64" in check_name.lower() + "aarch" in check_name.lower() or "arm64" in check_name.lower() ) and "aarch" not in result: if result == Runners.STRESS_TESTER: # FIXME: no arm stress tester group atm @@ -683,10 +683,10 @@ class CIConfig: check_name = self.normalize_string(check_name) for config in ( - self.build_config, - self.builds_report_config, - self.test_configs, - self.other_jobs_configs, + self.build_config, + self.builds_report_config, + self.test_configs, + self.other_jobs_configs, ): for job_name in config: # type: ignore if check_name == self.normalize_string(job_name): @@ -714,10 +714,10 @@ class CIConfig: def get_digest_config(self, check_name: str) -> DigestConfig: res = None for config in ( - self.other_jobs_configs, - self.build_config, - self.builds_report_config, - self.test_configs, + self.other_jobs_configs, + self.build_config, + self.builds_report_config, + self.test_configs, ): if check_name in config: # type: ignore res = config[check_name].job_config.digest # type: ignore @@ -732,15 +732,15 @@ class CIConfig: """ assert branch for config in ( - self.other_jobs_configs, - self.build_config, - self.builds_report_config, - self.test_configs, + self.other_jobs_configs, + self.build_config, + self.builds_report_config, + self.test_configs, ): yield from config # type: ignore def get_builds_for_report( - self, report_name: str, release: bool = False, backport: bool = False + self, report_name: str, release: bool = False, backport: bool = False ) -> List[str]: # hack to modify build list for release and bp wf assert not (release and backport), "Invalid input" @@ -1155,16 +1155,20 @@ CI_CONFIG = CIConfig( Build.PACKAGE_DEBUG, job_config=JobConfig(**stateful_test_common_params) # type: ignore ), JobNames.STATEFUL_TEST_PARALLEL_REPL_ASAN: TestConfig( - Build.PACKAGE_ASAN, job_config=JobConfig(random_bucket="parrepl_with_sanitizer", **stateful_test_common_params) # type: ignore + Build.PACKAGE_ASAN, + job_config=JobConfig(random_bucket="parrepl_with_sanitizer", **stateful_test_common_params) # type: ignore ), JobNames.STATEFUL_TEST_PARALLEL_REPL_MSAN: TestConfig( - Build.PACKAGE_MSAN, job_config=JobConfig(random_bucket="parrepl_with_sanitizer", **stateful_test_common_params) # type: ignore + Build.PACKAGE_MSAN, + job_config=JobConfig(random_bucket="parrepl_with_sanitizer", **stateful_test_common_params) # type: ignore ), JobNames.STATEFUL_TEST_PARALLEL_REPL_UBSAN: TestConfig( - Build.PACKAGE_UBSAN, job_config=JobConfig(random_bucket="parrepl_with_sanitizer", **stateful_test_common_params) # type: ignore + Build.PACKAGE_UBSAN, + job_config=JobConfig(random_bucket="parrepl_with_sanitizer", **stateful_test_common_params) # type: ignore ), JobNames.STATEFUL_TEST_PARALLEL_REPL_TSAN: TestConfig( - Build.PACKAGE_TSAN, job_config=JobConfig(random_bucket="parrepl_with_sanitizer", **stateful_test_common_params) # type: ignore + Build.PACKAGE_TSAN, + job_config=JobConfig(random_bucket="parrepl_with_sanitizer", **stateful_test_common_params) # type: ignore ), # End stateful tests for parallel replicas JobNames.STATELESS_TEST_ASAN: TestConfig( @@ -1207,7 +1211,8 @@ CI_CONFIG = CIConfig( ), JobNames.STATELESS_TEST_AZURE_ASAN: TestConfig( Build.PACKAGE_ASAN, - 
job_config=JobConfig(num_batches=4, **statless_test_common_params, release_only=True, run_by_ci_option=True), # type: ignore + job_config=JobConfig(num_batches=4, **statless_test_common_params, release_only=True, + run_by_ci_option=True), # type: ignore ), JobNames.STATELESS_TEST_S3_TSAN: TestConfig( Build.PACKAGE_TSAN, @@ -1220,28 +1225,39 @@ CI_CONFIG = CIConfig( Build.PACKAGE_TSAN, job_config=JobConfig(**stress_test_common_params) # type: ignore ), JobNames.STRESS_TEST_ASAN: TestConfig( - Build.PACKAGE_ASAN, job_config=JobConfig(random_bucket="stress_with_sanitizer", **stress_test_common_params) # type: ignore + Build.PACKAGE_ASAN, job_config=JobConfig(random_bucket="stress_with_sanitizer", **stress_test_common_params) + # type: ignore ), JobNames.STRESS_TEST_UBSAN: TestConfig( - Build.PACKAGE_UBSAN, job_config=JobConfig(random_bucket="stress_with_sanitizer", **stress_test_common_params) # type: ignore + Build.PACKAGE_UBSAN, + job_config=JobConfig(random_bucket="stress_with_sanitizer", **stress_test_common_params) # type: ignore ), JobNames.STRESS_TEST_MSAN: TestConfig( - Build.PACKAGE_MSAN, job_config=JobConfig(random_bucket="stress_with_sanitizer", **stress_test_common_params) # type: ignore + Build.PACKAGE_MSAN, job_config=JobConfig(random_bucket="stress_with_sanitizer", **stress_test_common_params) + # type: ignore ), JobNames.UPGRADE_TEST_ASAN: TestConfig( - Build.PACKAGE_ASAN, job_config=JobConfig(pr_only=True, random_bucket="upgrade_with_sanitizer", **upgrade_test_common_params) # type: ignore + Build.PACKAGE_ASAN, + job_config=JobConfig(pr_only=True, random_bucket="upgrade_with_sanitizer", **upgrade_test_common_params) + # type: ignore ), JobNames.STRESS_TEST_AZURE_TSAN: TestConfig( - Build.PACKAGE_TSAN, job_config=JobConfig(**stress_test_common_params, release_only=True, run_by_ci_option=True) # type: ignore + Build.PACKAGE_TSAN, + job_config=JobConfig(**stress_test_common_params, release_only=True, run_by_ci_option=True) # type: ignore ), JobNames.STRESS_TEST_AZURE_MSAN: TestConfig( - Build.PACKAGE_MSAN, job_config=JobConfig(**stress_test_common_params, release_only=True, run_by_ci_option=True) # type: ignore + Build.PACKAGE_MSAN, + job_config=JobConfig(**stress_test_common_params, release_only=True, run_by_ci_option=True) # type: ignore ), JobNames.UPGRADE_TEST_TSAN: TestConfig( - Build.PACKAGE_TSAN, job_config=JobConfig(pr_only=True, random_bucket="upgrade_with_sanitizer", **upgrade_test_common_params) # type: ignore + Build.PACKAGE_TSAN, + job_config=JobConfig(pr_only=True, random_bucket="upgrade_with_sanitizer", **upgrade_test_common_params) + # type: ignore ), JobNames.UPGRADE_TEST_MSAN: TestConfig( - Build.PACKAGE_MSAN, job_config=JobConfig(pr_only=True, random_bucket="upgrade_with_sanitizer", **upgrade_test_common_params) # type: ignore + Build.PACKAGE_MSAN, + job_config=JobConfig(pr_only=True, random_bucket="upgrade_with_sanitizer", **upgrade_test_common_params) + # type: ignore ), JobNames.UPGRADE_TEST_DEBUG: TestConfig( Build.PACKAGE_DEBUG, job_config=JobConfig(pr_only=True, **upgrade_test_common_params) # type: ignore @@ -1260,8 +1276,7 @@ CI_CONFIG = CIConfig( ), JobNames.INTEGRATION_TEST_ARM: TestConfig( Build.PACKAGE_AARCH64, - # add [run_by_label="test arm"] to not run in regular pr workflow by default - job_config=JobConfig(num_batches=6, **integration_test_common_params, run_by_label="test arm"), # type: ignore + job_config=JobConfig(num_batches=5, **integration_test_common_params), # type: ignore ), JobNames.INTEGRATION_TEST: TestConfig( 
Build.PACKAGE_RELEASE, @@ -1335,7 +1350,8 @@ CI_CONFIG = CIConfig( ), JobNames.PERFORMANCE_TEST_ARM64: TestConfig( Build.PACKAGE_AARCH64, - job_config=JobConfig(num_batches=4, run_by_label="pr-performance", **perf_test_common_params), # type: ignore + job_config=JobConfig(num_batches=4, run_by_label="pr-performance", **perf_test_common_params), + # type: ignore ), JobNames.SQLANCER: TestConfig( Build.PACKAGE_RELEASE, job_config=sqllancer_test_common_params @@ -1365,7 +1381,6 @@ CI_CONFIG = CIConfig( ) CI_CONFIG.validate() - # checks required by Mergeable Check REQUIRED_CHECKS = [ "PR Check", From 22573361de3c4cdbd105e47856f00d1411d081e8 Mon Sep 17 00:00:00 2001 From: Max K Date: Thu, 16 May 2024 13:58:19 +0200 Subject: [PATCH 144/392] fixing typos and var names --- tests/ci/ci.py | 34 +++---- tests/ci/ci_config.py | 195 +++++++++++++++++------------------- tests/ci/test_ci_options.py | 6 +- 3 files changed, 110 insertions(+), 125 deletions(-) diff --git a/tests/ci/ci.py b/tests/ci/ci.py index 08048564383..3ed584f5d93 100644 --- a/tests/ci/ci.py +++ b/tests/ci/ci.py @@ -71,12 +71,12 @@ class PendingState: class CiCache: """ CI cache is a bunch of records. Record is a file stored under special location on s3. - The file name has following format + The file name has the following format _[]--___.ci RECORD_TYPE: - SUCCESSFUL - for successfuly finished jobs + SUCCESSFUL - for successfully finished jobs PENDING - for pending jobs ATTRIBUTES: @@ -508,7 +508,7 @@ class CiCache: self, job: str, batch: int, num_batches: int, release_branch: bool ) -> bool: """ - checks if a given job have already been done successfuly + checks if a given job have already been done successfully """ return self.exist( self.RecordType.SUCCESSFUL, job, batch, num_batches, release_branch @@ -749,7 +749,7 @@ class CiOptions: # list of specified jobs to run ci_jobs: Optional[List[str]] = None - # btaches to run for all multi-batch jobs + # batches to run for all multi-batch jobs job_batches: Optional[List[int]] = None do_not_test: bool = False @@ -903,7 +903,7 @@ class CiOptions: if self.ci_sets: for tag in self.ci_sets: label_config = CI_CONFIG.get_label_config(tag) - assert label_config, f"Unknonwn tag [{tag}]" + assert label_config, f"Unknown tag [{tag}]" print( f"NOTE: CI Set's tag: [{tag}], add jobs: [{label_config.run_jobs}]" ) @@ -953,7 +953,7 @@ class CiOptions: jobs_params[job] = { "batches": list(range(num_batches)), "num_batches": num_batches, - "run_if_ci_option_include_set": job_config.run_by_ci_option + "run_by_ci_option": job_config.run_by_ci_option and pr_info.is_pr, } @@ -969,7 +969,7 @@ class CiOptions: for job in jobs_to_do[:]: job_param = jobs_params[job] if ( - job_param["run_if_ci_option_include_set"] + job_param["run_by_ci_option"] and job not in jobs_to_do_requested ): print( @@ -1010,7 +1010,7 @@ def parse_args(parser: argparse.ArgumentParser) -> argparse.Namespace: parser.add_argument( "--pre", action="store_true", - help="Action that executes prerequesetes for the job provided in --job-name", + help="Action that executes prerequisites for the job provided in --job-name", ) parser.add_argument( "--run", @@ -1080,7 +1080,7 @@ def parse_args(parser: argparse.ArgumentParser) -> argparse.Namespace: "--skip-jobs", action="store_true", default=False, - help="skip fetching data about job runs, used in --configure action (for debugging and nigthly ci)", + help="skip fetching data about job runs, used in --configure action (for debugging and nightly ci)", ) parser.add_argument( "--force", @@ -1298,7 
+1298,7 @@ def _configure_docker_jobs(docker_digest_or_latest: bool) -> Dict: missing_amd64 = [] missing_aarch64 = [] if not docker_digest_or_latest: - # look for missing arm and amd images only among missing multiarch manifests @missing_multi_dict + # look for missing arm and amd images only among missing multi-arch manifests @missing_multi_dict # to avoid extra dockerhub api calls missing_amd64 = list( check_missing_images_on_dockerhub(missing_multi_dict, "amd64") @@ -1396,7 +1396,7 @@ def _configure_jobs( ): continue - # fill job randomization buckets (for jobs with configured @random_bucket property)) + # fill job randomization buckets (for jobs with configured @random_bucket property) if job_config.random_bucket: if not job_config.random_bucket in randomization_buckets: randomization_buckets[job_config.random_bucket] = set() @@ -1445,7 +1445,7 @@ def _configure_jobs( jobs_params[job] = { "batches": batches_to_do, "num_batches": num_batches, - "run_if_ci_option_include_set": job_config.run_by_ci_option + "run_by_ci_option": job_config.run_by_ci_option and pr_info.is_pr, } elif add_to_skip: @@ -1490,8 +1490,8 @@ def _configure_jobs( def _generate_ci_stage_config(jobs_data: Dict[str, Any]) -> Dict[str, Dict[str, Any]]: """ populates GH Actions' workflow with real jobs - "Builds_1": [{"job_name": NAME, "runner_type": RUNER_TYPE}] - "Tests_1": [{"job_name": NAME, "runner_type": RUNER_TYPE}] + "Builds_1": [{"job_name": NAME, "runner_type": RUNNER_TYPE}] + "Tests_1": [{"job_name": NAME, "runner_type": RUNNER_TYPE}] ... """ result = {} # type: Dict[str, Any] @@ -1582,7 +1582,7 @@ def _fetch_commit_tokens(message: str, pr_info: PRInfo) -> List[str]: for match in matches if match in CILabels or match.startswith("job_") or match.startswith("batch_") ] - print(f"CI modifyers from commit message: [{res}]") + print(f"CI modifiers from commit message: [{res}]") res_2 = [] if pr_info.is_pr: matches = [match[-1] for match in re.findall(pattern, pr_info.body)] @@ -1593,7 +1593,7 @@ def _fetch_commit_tokens(message: str, pr_info: PRInfo) -> List[str]: or match.startswith("job_") or match.startswith("batch_") ] - print(f"CI modifyers from PR body: [{res_2}]") + print(f"CI modifiers from PR body: [{res_2}]") return list(set(res + res_2)) @@ -1659,7 +1659,7 @@ def _upload_build_artifacts( report_url = ci_cache.upload_build_report(build_result) print(f"Report file has been uploaded to [{report_url}]") - # Upload head master binaries + # Upload master head's binaries static_bin_name = CI_CONFIG.build_config[build_name].static_binary_name if pr_info.is_master and static_bin_name: # Full binary with debug info: diff --git a/tests/ci/ci_config.py b/tests/ci/ci_config.py index 84041b8782f..dc67e05455c 100644 --- a/tests/ci/ci_config.py +++ b/tests/ci/ci_config.py @@ -50,9 +50,9 @@ class CILabels(metaclass=WithIter): CI_SET_ARM = "ci_set_arm" CI_SET_INTEGRATION = "ci_set_integration" CI_SET_OLD_ANALYZER = "ci_set_old_analyzer" - CI_SET_STATLESS = "ci_set_stateless" + CI_SET_STATELESS = "ci_set_stateless" CI_SET_STATEFUL = "ci_set_stateful" - CI_SET_STATLESS_ASAN = "ci_set_stateless_asan" + CI_SET_STATELESS_ASAN = "ci_set_stateless_asan" CI_SET_STATEFUL_ASAN = "ci_set_stateful_asan" libFuzzer = "libFuzzer" @@ -203,7 +203,7 @@ class DigestConfig: include_paths: List[Union[str, Path]] = field(default_factory=list) # file suffixes to exclude from digest exclude_files: List[str] = field(default_factory=list) - # directories to exlude from digest + # directories to exclude from digest exclude_dirs: List[Union[str, 
Path]] = field(default_factory=list) # docker names to include into digest docker: List[str] = field(default_factory=list) @@ -214,7 +214,7 @@ class DigestConfig: @dataclass class LabelConfig: """ - configures different CI scenarious per GH label + configures different CI scenarios per GH label """ run_jobs: Iterable[str] = frozenset() @@ -228,7 +228,7 @@ class JobConfig: # configures digest calculation for the job digest: DigestConfig = field(default_factory=DigestConfig) - # will be triggered for the job if omited in CI workflow yml + # will be triggered for the job if omitted in CI workflow yml run_command: str = "" # job timeout, seconds timeout: Optional[int] = None @@ -239,7 +239,7 @@ class JobConfig: # to run always regardless of the job digest or/and label run_always: bool = False # if the job needs to be run on the release branch, including master (e.g. building packages, docker server). - # NOTE: Subsequent runs on the same branch with the similar digest are still considered skippable. + # NOTE: Subsequent runs on the same branch with the similar digest are still considered skip-able. required_on_release_branch: bool = False # job is for pr workflow only pr_only: bool = False @@ -448,9 +448,9 @@ bugfix_validate_check = DigestConfig( ], exclude_files=[".md"], docker=IMAGES.copy() - + [ - "clickhouse/stateless-test", - ], + + [ + "clickhouse/stateless-test", + ], ) # common test params docker_server_job_config = JobConfig( @@ -467,7 +467,7 @@ compatibility_test_common_params = { "digest": compatibility_check_digest, "run_command": "compatibility_check.py", } -statless_test_common_params = { +stateless_test_common_params = { "digest": stateless_check_digest, "run_command": 'functional_test_check.py "$CHECK_NAME" $KILL_TIMEOUT', "timeout": 10800, @@ -570,7 +570,7 @@ class CIConfig: if self.is_build_job(job_name): stage_type = CIStages.BUILDS_1 if job_name in CI_CONFIG.get_builds_for_report( - JobNames.BUILD_CHECK_SPECIAL + JobNames.BUILD_CHECK_SPECIAL ): # special builds go to Build_2 stage to not delay Builds_1/Test_1 stage_type = CIStages.BUILDS_2 @@ -584,7 +584,7 @@ class CIConfig: required_build = CI_CONFIG.test_configs[job_name].required_build assert required_build if required_build in CI_CONFIG.get_builds_for_report( - JobNames.BUILD_CHECK + JobNames.BUILD_CHECK ): stage_type = CIStages.TESTS_1 else: @@ -597,10 +597,10 @@ class CIConfig: def get_job_config(self, check_name: str) -> JobConfig: res = None for config in ( - self.build_config, - self.builds_report_config, - self.test_configs, - self.other_jobs_configs, + self.build_config, + self.builds_report_config, + self.test_configs, + self.other_jobs_configs, ): if check_name in config: # type: ignore res = config[check_name].job_config # type: ignore @@ -612,47 +612,47 @@ class CIConfig: if self.is_build_job(check_name) or check_name == JobNames.FAST_TEST: result = Runners.BUILDER elif any( - words in check_name.lower() - for words in [ - "install packages", - "compatibility check", - "docker", - "build check", - "jepsen", - "style check", - ] + words in check_name.lower() + for words in [ + "install packages", + "compatibility check", + "docker", + "build check", + "jepsen", + "style check", + ] ): result = Runners.STYLE_CHECKER elif check_name == JobNames.DOCS_CHECK: # docs job is demanding result = Runners.FUNC_TESTER_ARM elif any( - words in check_name.lower() - for words in [ - "stateless", - "stateful", - "clickbench", - "sqllogic test", - "libfuzzer", - "bugfix validation", - ] + words in check_name.lower() + for 
words in [ + "stateless", + "stateful", + "clickbench", + "sqllogic test", + "libfuzzer", + "bugfix validation", + ] ): result = Runners.FUNC_TESTER elif any( - words in check_name.lower() - for words in ["stress", "upgrade", "integration", "performance comparison"] + words in check_name.lower() + for words in ["stress", "upgrade", "integration", "performance comparison"] ): result = Runners.STRESS_TESTER elif any( - words in check_name.lower() - for words in ["ast fuzzer", "unit tests", "sqlancer", "sqltest"] + words in check_name.lower() + for words in ["ast fuzzer", "unit tests", "sqlancer", "sqltest"] ): result = Runners.FUZZER_UNIT_TESTER assert result, f"BUG, no runner for [{check_name}]" if ( - "aarch" in check_name.lower() or "arm64" in check_name.lower() + "aarch" in check_name.lower() or "arm64" in check_name.lower() ) and "aarch" not in result: if result == Runners.STRESS_TESTER: # FIXME: no arm stress tester group atm @@ -661,7 +661,7 @@ class CIConfig: # crosscompile - no arm required pass else: - # switch to aarch64 runnner + # switch to aarch64 runner result += "-aarch64" return result @@ -683,10 +683,10 @@ class CIConfig: check_name = self.normalize_string(check_name) for config in ( - self.build_config, - self.builds_report_config, - self.test_configs, - self.other_jobs_configs, + self.build_config, + self.builds_report_config, + self.test_configs, + self.other_jobs_configs, ): for job_name in config: # type: ignore if check_name == self.normalize_string(job_name): @@ -708,16 +708,16 @@ class CIConfig: break assert ( res - ), f"Error: Experimantal feature... Invlid request or not supported job [{check_name}]" + ), f"Error: Experimental feature... Invalid request or not supported job [{check_name}]" return res def get_digest_config(self, check_name: str) -> DigestConfig: res = None for config in ( - self.other_jobs_configs, - self.build_config, - self.builds_report_config, - self.test_configs, + self.other_jobs_configs, + self.build_config, + self.builds_report_config, + self.test_configs, ): if check_name in config: # type: ignore res = config[check_name].job_config.digest # type: ignore @@ -732,15 +732,15 @@ class CIConfig: """ assert branch for config in ( - self.other_jobs_configs, - self.build_config, - self.builds_report_config, - self.test_configs, + self.other_jobs_configs, + self.build_config, + self.builds_report_config, + self.test_configs, ): yield from config # type: ignore def get_builds_for_report( - self, report_name: str, release: bool = False, backport: bool = False + self, report_name: str, release: bool = False, backport: bool = False ) -> List[str]: # hack to modify build list for release and bp wf assert not (release and backport), "Invalid input" @@ -811,16 +811,16 @@ class CIConfig: f"The following names of the build report '{build_report_name}' " f"are missed in build_config: {missed_names}", ) - # And finally, all of tests' requirements must be in the builds + # And finally, all tests' requirements must be in the builds for test_name, test_config in self.test_configs.items(): if test_config.required_build not in self.build_config.keys(): logging.error( - "The requierment '%s' for '%s' is not found in builds", + "The requirement '%s' for '%s' is not found in builds", test_config, test_name, ) errors.append( - f"The requierment '{test_config}' for " + f"The requirement '{test_config}' for " f"'{test_name}' is not found in builds" ) @@ -861,7 +861,7 @@ CI_CONFIG = CIConfig( JobNames.INTEGRATION_TEST_ASAN_OLD_ANALYZER, ] ), - CILabels.CI_SET_STATLESS: 
LabelConfig( + CILabels.CI_SET_STATELESS: LabelConfig( run_jobs=[ JobNames.STYLE_CHECK, JobNames.FAST_TEST, @@ -869,7 +869,7 @@ CI_CONFIG = CIConfig( JobNames.STATELESS_TEST_RELEASE, ] ), - CILabels.CI_SET_STATLESS_ASAN: LabelConfig( + CILabels.CI_SET_STATELESS_ASAN: LabelConfig( run_jobs=[ JobNames.STYLE_CHECK, JobNames.FAST_TEST, @@ -1155,68 +1155,63 @@ CI_CONFIG = CIConfig( Build.PACKAGE_DEBUG, job_config=JobConfig(**stateful_test_common_params) # type: ignore ), JobNames.STATEFUL_TEST_PARALLEL_REPL_ASAN: TestConfig( - Build.PACKAGE_ASAN, - job_config=JobConfig(random_bucket="parrepl_with_sanitizer", **stateful_test_common_params) # type: ignore + Build.PACKAGE_ASAN, job_config=JobConfig(random_bucket="parrepl_with_sanitizer", **stateful_test_common_params) # type: ignore ), JobNames.STATEFUL_TEST_PARALLEL_REPL_MSAN: TestConfig( - Build.PACKAGE_MSAN, - job_config=JobConfig(random_bucket="parrepl_with_sanitizer", **stateful_test_common_params) # type: ignore + Build.PACKAGE_MSAN, job_config=JobConfig(random_bucket="parrepl_with_sanitizer", **stateful_test_common_params) # type: ignore ), JobNames.STATEFUL_TEST_PARALLEL_REPL_UBSAN: TestConfig( - Build.PACKAGE_UBSAN, - job_config=JobConfig(random_bucket="parrepl_with_sanitizer", **stateful_test_common_params) # type: ignore + Build.PACKAGE_UBSAN, job_config=JobConfig(random_bucket="parrepl_with_sanitizer", **stateful_test_common_params) # type: ignore ), JobNames.STATEFUL_TEST_PARALLEL_REPL_TSAN: TestConfig( - Build.PACKAGE_TSAN, - job_config=JobConfig(random_bucket="parrepl_with_sanitizer", **stateful_test_common_params) # type: ignore + Build.PACKAGE_TSAN, job_config=JobConfig(random_bucket="parrepl_with_sanitizer", **stateful_test_common_params) # type: ignore ), # End stateful tests for parallel replicas JobNames.STATELESS_TEST_ASAN: TestConfig( Build.PACKAGE_ASAN, - job_config=JobConfig(num_batches=4, **statless_test_common_params), # type: ignore + job_config=JobConfig(num_batches=4, **stateless_test_common_params), # type: ignore ), JobNames.STATELESS_TEST_TSAN: TestConfig( Build.PACKAGE_TSAN, - job_config=JobConfig(num_batches=5, **statless_test_common_params), # type: ignore + job_config=JobConfig(num_batches=5, **stateless_test_common_params), # type: ignore ), JobNames.STATELESS_TEST_MSAN: TestConfig( Build.PACKAGE_MSAN, - job_config=JobConfig(num_batches=6, **statless_test_common_params), # type: ignore + job_config=JobConfig(num_batches=6, **stateless_test_common_params), # type: ignore ), JobNames.STATELESS_TEST_UBSAN: TestConfig( Build.PACKAGE_UBSAN, - job_config=JobConfig(num_batches=2, **statless_test_common_params), # type: ignore + job_config=JobConfig(num_batches=2, **stateless_test_common_params), # type: ignore ), JobNames.STATELESS_TEST_DEBUG: TestConfig( Build.PACKAGE_DEBUG, - job_config=JobConfig(num_batches=5, **statless_test_common_params), # type: ignore + job_config=JobConfig(num_batches=5, **stateless_test_common_params), # type: ignore ), JobNames.STATELESS_TEST_RELEASE: TestConfig( - Build.PACKAGE_RELEASE, job_config=JobConfig(**statless_test_common_params) # type: ignore + Build.PACKAGE_RELEASE, job_config=JobConfig(**stateless_test_common_params) # type: ignore ), JobNames.STATELESS_TEST_RELEASE_COVERAGE: TestConfig( Build.PACKAGE_RELEASE_COVERAGE, - job_config=JobConfig(num_batches=6, **statless_test_common_params), # type: ignore + job_config=JobConfig(num_batches=6, **stateless_test_common_params), # type: ignore ), JobNames.STATELESS_TEST_AARCH64: TestConfig( - Build.PACKAGE_AARCH64, 
job_config=JobConfig(**statless_test_common_params) # type: ignore + Build.PACKAGE_AARCH64, job_config=JobConfig(**stateless_test_common_params) # type: ignore ), JobNames.STATELESS_TEST_OLD_ANALYZER_S3_REPLICATED_RELEASE: TestConfig( Build.PACKAGE_RELEASE, - job_config=JobConfig(num_batches=4, **statless_test_common_params), # type: ignore + job_config=JobConfig(num_batches=4, **stateless_test_common_params), # type: ignore ), JobNames.STATELESS_TEST_S3_DEBUG: TestConfig( Build.PACKAGE_DEBUG, - job_config=JobConfig(num_batches=6, **statless_test_common_params), # type: ignore + job_config=JobConfig(num_batches=6, **stateless_test_common_params), # type: ignore ), JobNames.STATELESS_TEST_AZURE_ASAN: TestConfig( Build.PACKAGE_ASAN, - job_config=JobConfig(num_batches=4, **statless_test_common_params, release_only=True, - run_by_ci_option=True), # type: ignore + job_config=JobConfig(num_batches=4, **stateless_test_common_params, release_only=True, run_by_ci_option=True), # type: ignore ), JobNames.STATELESS_TEST_S3_TSAN: TestConfig( Build.PACKAGE_TSAN, - job_config=JobConfig(num_batches=5, **statless_test_common_params), # type: ignore + job_config=JobConfig(num_batches=5, **stateless_test_common_params), # type: ignore ), JobNames.STRESS_TEST_DEBUG: TestConfig( Build.PACKAGE_DEBUG, job_config=JobConfig(**stress_test_common_params) # type: ignore @@ -1225,39 +1220,28 @@ CI_CONFIG = CIConfig( Build.PACKAGE_TSAN, job_config=JobConfig(**stress_test_common_params) # type: ignore ), JobNames.STRESS_TEST_ASAN: TestConfig( - Build.PACKAGE_ASAN, job_config=JobConfig(random_bucket="stress_with_sanitizer", **stress_test_common_params) - # type: ignore + Build.PACKAGE_ASAN, job_config=JobConfig(random_bucket="stress_with_sanitizer", **stress_test_common_params) # type: ignore ), JobNames.STRESS_TEST_UBSAN: TestConfig( - Build.PACKAGE_UBSAN, - job_config=JobConfig(random_bucket="stress_with_sanitizer", **stress_test_common_params) # type: ignore + Build.PACKAGE_UBSAN, job_config=JobConfig(random_bucket="stress_with_sanitizer", **stress_test_common_params) # type: ignore ), JobNames.STRESS_TEST_MSAN: TestConfig( - Build.PACKAGE_MSAN, job_config=JobConfig(random_bucket="stress_with_sanitizer", **stress_test_common_params) - # type: ignore + Build.PACKAGE_MSAN, job_config=JobConfig(random_bucket="stress_with_sanitizer", **stress_test_common_params) # type: ignore ), JobNames.UPGRADE_TEST_ASAN: TestConfig( - Build.PACKAGE_ASAN, - job_config=JobConfig(pr_only=True, random_bucket="upgrade_with_sanitizer", **upgrade_test_common_params) - # type: ignore + Build.PACKAGE_ASAN, job_config=JobConfig(pr_only=True, random_bucket="upgrade_with_sanitizer", **upgrade_test_common_params) # type: ignore ), JobNames.STRESS_TEST_AZURE_TSAN: TestConfig( - Build.PACKAGE_TSAN, - job_config=JobConfig(**stress_test_common_params, release_only=True, run_by_ci_option=True) # type: ignore + Build.PACKAGE_TSAN, job_config=JobConfig(**stress_test_common_params, release_only=True, run_by_ci_option=True) # type: ignore ), JobNames.STRESS_TEST_AZURE_MSAN: TestConfig( - Build.PACKAGE_MSAN, - job_config=JobConfig(**stress_test_common_params, release_only=True, run_by_ci_option=True) # type: ignore + Build.PACKAGE_MSAN, job_config=JobConfig(**stress_test_common_params, release_only=True, run_by_ci_option=True) # type: ignore ), JobNames.UPGRADE_TEST_TSAN: TestConfig( - Build.PACKAGE_TSAN, - job_config=JobConfig(pr_only=True, random_bucket="upgrade_with_sanitizer", **upgrade_test_common_params) - # type: ignore + Build.PACKAGE_TSAN, 
job_config=JobConfig(pr_only=True, random_bucket="upgrade_with_sanitizer", **upgrade_test_common_params) # type: ignore ), JobNames.UPGRADE_TEST_MSAN: TestConfig( - Build.PACKAGE_MSAN, - job_config=JobConfig(pr_only=True, random_bucket="upgrade_with_sanitizer", **upgrade_test_common_params) - # type: ignore + Build.PACKAGE_MSAN, job_config=JobConfig(pr_only=True, random_bucket="upgrade_with_sanitizer", **upgrade_test_common_params) # type: ignore ), JobNames.UPGRADE_TEST_DEBUG: TestConfig( Build.PACKAGE_DEBUG, job_config=JobConfig(pr_only=True, **upgrade_test_common_params) # type: ignore @@ -1276,7 +1260,8 @@ CI_CONFIG = CIConfig( ), JobNames.INTEGRATION_TEST_ARM: TestConfig( Build.PACKAGE_AARCH64, - job_config=JobConfig(num_batches=5, **integration_test_common_params), # type: ignore + # add [run_by_label="test arm"] to not run in regular pr workflow by default + job_config=JobConfig(num_batches=6, **integration_test_common_params, run_by_label="test arm"), # type: ignore ), JobNames.INTEGRATION_TEST: TestConfig( Build.PACKAGE_RELEASE, @@ -1330,7 +1315,7 @@ CI_CONFIG = CIConfig( JobNames.STATELESS_TEST_FLAKY_ASAN: TestConfig( # replace to non-default Build.PACKAGE_ASAN, - job_config=JobConfig(pr_only=True, **{**statless_test_common_params, "timeout": 3600}), # type: ignore + job_config=JobConfig(pr_only=True, **{**stateless_test_common_params, "timeout": 3600}), # type: ignore ), JobNames.JEPSEN_KEEPER: TestConfig( Build.BINARY_RELEASE, @@ -1350,8 +1335,7 @@ CI_CONFIG = CIConfig( ), JobNames.PERFORMANCE_TEST_ARM64: TestConfig( Build.PACKAGE_AARCH64, - job_config=JobConfig(num_batches=4, run_by_label="pr-performance", **perf_test_common_params), - # type: ignore + job_config=JobConfig(num_batches=4, run_by_label="pr-performance", **perf_test_common_params), # type: ignore ), JobNames.SQLANCER: TestConfig( Build.PACKAGE_RELEASE, job_config=sqllancer_test_common_params @@ -1381,6 +1365,7 @@ CI_CONFIG = CIConfig( ) CI_CONFIG.validate() + # checks required by Mergeable Check REQUIRED_CHECKS = [ "PR Check", @@ -1479,7 +1464,7 @@ CHECK_DESCRIPTIONS = [ "Checks if new added or modified tests are flaky by running them repeatedly, " "in parallel, with more randomization. Functional tests are run 100 times " "with address sanitizer, and additional randomization of thread scheduling. " - "Integrational tests are run up to 10 times. If at least once a new test has " + "Integration tests are run up to 10 times. If at least once a new test has " "failed, or was too long, this check will be red. 
We don't allow flaky tests, " 'read the doc', @@ -1569,7 +1554,7 @@ CHECK_DESCRIPTIONS = [ lambda x: x.startswith("ClickBench"), ), CheckDescription( - "Falback for unknown", + "Fallback for unknown", "There's no description for the check yet, please add it to " "tests/ci/ci_config.py:CHECK_DESCRIPTIONS", lambda x: True, diff --git a/tests/ci/test_ci_options.py b/tests/ci/test_ci_options.py index 0f10f7d4f85..c07c094d439 100644 --- a/tests/ci/test_ci_options.py +++ b/tests/ci/test_ci_options.py @@ -161,7 +161,7 @@ class TestCIOptions(unittest.TestCase): "Stateless tests (azure, asan)": { "batches": list(range(3)), "num_batches": 3, - "run_if_ci_option_include_set": True, + "run_by_ci_option": True, } } jobs_to_do, jobs_to_skip, job_params = ci_options.apply( @@ -226,10 +226,10 @@ class TestCIOptions(unittest.TestCase): job_params[job] = { "batches": list(range(3)), "num_batches": 3, - "run_if_ci_option_include_set": "azure" in job, + "run_by_ci_option": "azure" in job, } else: - job_params[job] = {"run_if_ci_option_include_set": False} + job_params[job] = {"run_by_ci_option": False} jobs_to_do, jobs_to_skip, job_params = ci_options.apply( jobs_to_do, jobs_to_skip, job_params, PRInfo() From 2fe684da0917dfca12bce6fa215bd566370d9db5 Mon Sep 17 00:00:00 2001 From: Nikita Fomichev Date: Thu, 16 May 2024 14:51:04 +0200 Subject: [PATCH 145/392] Add dynamic tests --- .../03150_dynamic_type_mv_insert.reference | 35 ++ .../03150_dynamic_type_mv_insert.sql | 34 ++ ...151_dynamic_type_scale_max_types.reference | 26 ++ .../03151_dynamic_type_scale_max_types.sql | 23 ++ .../03152_dynamic_type_simple.reference | 25 ++ .../0_stateless/03152_dynamic_type_simple.sql | 29 ++ .../03153_dynamic_type_empty.reference | 15 + .../0_stateless/03153_dynamic_type_empty.sql | 5 + ..._dynamic_type_concurrent_inserts.reference | 7 + .../03156_dynamic_type_concurrent_inserts.sh | 21 ++ .../03157_dynamic_type_json.reference | 5 + .../0_stateless/03157_dynamic_type_json.sql | 13 + .../03158_dynamic_type_from_variant.reference | 17 + .../03158_dynamic_type_from_variant.sql | 15 + .../03159_dynamic_type_all_types.reference | 300 ++++++++++++++++++ .../03159_dynamic_type_all_types.sql | 99 ++++++ .../03160_dynamic_type_agg.reference | 1 + .../0_stateless/03160_dynamic_type_agg.sql | 10 + .../03162_dynamic_type_nested.reference | 4 + .../0_stateless/03162_dynamic_type_nested.sql | 16 + 20 files changed, 700 insertions(+) create mode 100644 tests/queries/0_stateless/03150_dynamic_type_mv_insert.reference create mode 100644 tests/queries/0_stateless/03150_dynamic_type_mv_insert.sql create mode 100644 tests/queries/0_stateless/03151_dynamic_type_scale_max_types.reference create mode 100644 tests/queries/0_stateless/03151_dynamic_type_scale_max_types.sql create mode 100644 tests/queries/0_stateless/03152_dynamic_type_simple.reference create mode 100644 tests/queries/0_stateless/03152_dynamic_type_simple.sql create mode 100644 tests/queries/0_stateless/03153_dynamic_type_empty.reference create mode 100644 tests/queries/0_stateless/03153_dynamic_type_empty.sql create mode 100644 tests/queries/0_stateless/03156_dynamic_type_concurrent_inserts.reference create mode 100755 tests/queries/0_stateless/03156_dynamic_type_concurrent_inserts.sh create mode 100644 tests/queries/0_stateless/03157_dynamic_type_json.reference create mode 100644 tests/queries/0_stateless/03157_dynamic_type_json.sql create mode 100644 tests/queries/0_stateless/03158_dynamic_type_from_variant.reference create mode 100644 
tests/queries/0_stateless/03158_dynamic_type_from_variant.sql create mode 100644 tests/queries/0_stateless/03159_dynamic_type_all_types.reference create mode 100644 tests/queries/0_stateless/03159_dynamic_type_all_types.sql create mode 100644 tests/queries/0_stateless/03160_dynamic_type_agg.reference create mode 100644 tests/queries/0_stateless/03160_dynamic_type_agg.sql create mode 100644 tests/queries/0_stateless/03162_dynamic_type_nested.reference create mode 100644 tests/queries/0_stateless/03162_dynamic_type_nested.sql diff --git a/tests/queries/0_stateless/03150_dynamic_type_mv_insert.reference b/tests/queries/0_stateless/03150_dynamic_type_mv_insert.reference new file mode 100644 index 00000000000..0b76d30953e --- /dev/null +++ b/tests/queries/0_stateless/03150_dynamic_type_mv_insert.reference @@ -0,0 +1,35 @@ +1 2024-01-01 Date +2 1704056400 Decimal(18, 3) +3 1 String +4 2 String + +1 2024-01-01 Date +1 2024-01-01 Date +2 1704056400 Decimal(18, 3) +2 1704056400 Decimal(18, 3) +3 1 String +3 1 String +4 2 String +4 2 String + +1 2024-01-01 String +1 2024-01-01 String +2 1704056400 String +2 1704056400 String +3 1 String +3 1 String +4 2 String +4 2 String + +1 2024-01-01 Date +1 2024-01-01 String +1 2024-01-01 String +2 1704056400 Decimal(18, 3) +2 1704056400 String +2 1704056400 String +3 1 String +3 1 String +3 1 String +4 2 String +4 2 String +4 2 String diff --git a/tests/queries/0_stateless/03150_dynamic_type_mv_insert.sql b/tests/queries/0_stateless/03150_dynamic_type_mv_insert.sql new file mode 100644 index 00000000000..ad5ea9512c6 --- /dev/null +++ b/tests/queries/0_stateless/03150_dynamic_type_mv_insert.sql @@ -0,0 +1,34 @@ +SET allow_experimental_dynamic_type=1; + +CREATE TABLE null_table +( + n1 UInt8, + n2 Dynamic(max_types=3) +) +ENGINE = Null; + +CREATE MATERIALIZED VIEW dummy_rmv TO to_table +AS SELECT * FROM null_table; + +CREATE TABLE to_table +( + n1 UInt8, + n2 Dynamic(max_types=4) +) +ENGINE = MergeTree ORDER BY n1; + +INSERT INTO null_table ( n1, n2 ) VALUES (1, '2024-01-01'), (2, toDateTime64('2024-01-01', 3, 'Asia/Istanbul')), (3, toFloat32(1)), (4, toFloat64(2)); +SELECT *, dynamicType(n2) FROM to_table ORDER BY ALL; + +select ''; +INSERT INTO null_table ( n1, n2 ) VALUES (1, '2024-01-01'), (2, toDateTime64('2024-01-01', 3, 'Asia/Istanbul')), (3, toFloat32(1)), (4, toFloat64(2)); +SELECT *, dynamicType(n2) FROM to_table ORDER BY ALL; + +select ''; +ALTER TABLE to_table MODIFY COLUMN n2 Dynamic(max_types=1); +SELECT *, dynamicType(n2) FROM to_table ORDER BY ALL; + +select ''; +ALTER TABLE to_table MODIFY COLUMN n2 Dynamic(max_types=10); +INSERT INTO null_table ( n1, n2 ) VALUES (1, '2024-01-01'), (2, toDateTime64('2024-01-01', 3, 'Asia/Istanbul')), (3, toFloat32(1)), (4, toFloat64(2)); +SELECT *, dynamicType(n2) FROM to_table ORDER BY ALL; diff --git a/tests/queries/0_stateless/03151_dynamic_type_scale_max_types.reference b/tests/queries/0_stateless/03151_dynamic_type_scale_max_types.reference new file mode 100644 index 00000000000..d96fbf658d8 --- /dev/null +++ b/tests/queries/0_stateless/03151_dynamic_type_scale_max_types.reference @@ -0,0 +1,26 @@ +1 2024-01-01 Date +2 1704056400 String +3 1 String +4 2 String + +1 2024-01-01 Date +1 2024-01-01 Date +2 1704056400 Decimal(18, 3) +2 1704056400 String +3 1 Float32 +3 1 String +4 2 Float64 +4 2 String + +1 2024-01-01 String +1 2024-01-01 String +1 2024-01-01 String +2 1704056400 String +2 1704056400 String +2 1704056400 String +3 1 String +3 1 String +3 1 String +4 2 String +4 2 String +4 2 String diff --git 
a/tests/queries/0_stateless/03151_dynamic_type_scale_max_types.sql b/tests/queries/0_stateless/03151_dynamic_type_scale_max_types.sql new file mode 100644 index 00000000000..04322fc4f0c --- /dev/null +++ b/tests/queries/0_stateless/03151_dynamic_type_scale_max_types.sql @@ -0,0 +1,23 @@ +SET allow_experimental_dynamic_type=1; + +CREATE TABLE to_table +( + n1 UInt8, + n2 Dynamic(max_types=2) +) +ENGINE = MergeTree ORDER BY n1; + +INSERT INTO to_table ( n1, n2 ) VALUES (1, '2024-01-01'), (2, toDateTime64('2024-01-01', 3, 'Asia/Istanbul')), (3, toFloat32(1)), (4, toFloat64(2)); +SELECT *, dynamicType(n2) FROM to_table ORDER BY ALL; + +select ''; +ALTER TABLE to_table MODIFY COLUMN n2 Dynamic(max_types=5); +INSERT INTO to_table ( n1, n2 ) VALUES (1, '2024-01-01'), (2, toDateTime64('2024-01-01', 3, 'Asia/Istanbul')), (3, toFloat32(1)), (4, toFloat64(2)); +SELECT *, dynamicType(n2) FROM to_table ORDER BY ALL; + +select ''; +ALTER TABLE to_table MODIFY COLUMN n2 Dynamic(max_types=1); +INSERT INTO to_table ( n1, n2 ) VALUES (1, '2024-01-01'), (2, toDateTime64('2024-01-01', 3, 'Asia/Istanbul')), (3, toFloat32(1)), (4, toFloat64(2)); +SELECT *, dynamicType(n2) FROM to_table ORDER BY ALL; + +ALTER TABLE to_table MODIFY COLUMN n2 Dynamic(max_types=500); -- { serverError UNEXPECTED_AST_STRUCTURE } diff --git a/tests/queries/0_stateless/03152_dynamic_type_simple.reference b/tests/queries/0_stateless/03152_dynamic_type_simple.reference new file mode 100644 index 00000000000..5f243209ff3 --- /dev/null +++ b/tests/queries/0_stateless/03152_dynamic_type_simple.reference @@ -0,0 +1,25 @@ +string1 String +42 Int64 +3.14 Float64 +[1,2] Array(Int64) +2021-01-01 Date +string2 String + +\N None 42 Int64 +42 Int64 string String +string String [1, 2] String +[1,2] Array(Int64) \N None + ┌─d────────────────────────┬─dynamicType(d)─┬─d.Int64─┬─d.String─┬─────d.Date─┬─d.Float64─┬──────────d.DateTime─┬─d.Array(Int64)─┬─d.Array(String)──────────┐ + 1. │ 42 │ Int64 │ 42 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [] │ [] │ + 2. │ string1 │ String │ ᴺᵁᴸᴸ │ string1 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [] │ [] │ + 3. │ 2021-01-01 │ Date │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 2021-01-01 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [] │ [] │ + 4. │ [1,2,3] │ Array(Int64) │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [1,2,3] │ [] │ + 5. │ 3.14 │ Float64 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 3.14 │ ᴺᵁᴸᴸ │ [] │ [] │ + 6. │ string2 │ String │ ᴺᵁᴸᴸ │ string2 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [] │ [] │ + 7. │ 2021-01-01 12:00:00 │ DateTime │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 2021-01-01 12:00:00 │ [] │ [] │ + 8. │ ['array','of','strings'] │ Array(String) │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [] │ ['array','of','strings'] │ + 9. │ ᴺᵁᴸᴸ │ None │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [] │ [] │ +10. 
│ 42.42 │ Float64 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 42.42 │ ᴺᵁᴸᴸ │ [] │ [] │ + └──────────────────────────┴────────────────┴─────────┴──────────┴────────────┴───────────┴─────────────────────┴────────────────┴──────────────────────────┘ + +49995000 diff --git a/tests/queries/0_stateless/03152_dynamic_type_simple.sql b/tests/queries/0_stateless/03152_dynamic_type_simple.sql new file mode 100644 index 00000000000..fd5328faf15 --- /dev/null +++ b/tests/queries/0_stateless/03152_dynamic_type_simple.sql @@ -0,0 +1,29 @@ +SET allow_experimental_dynamic_type=1; + +CREATE TABLE test_max_types (d Dynamic(max_types=5)) ENGINE = Memory; +INSERT INTO test_max_types VALUES ('string1'), (42), (3.14), ([1, 2]), (toDate('2021-01-01')), ('string2'); +SELECT d, dynamicType(d) FROM test_max_types; + +SELECT ''; +CREATE TABLE test_nested_dynamic (d1 Dynamic, d2 Dynamic(max_types=2)) ENGINE = Memory; +INSERT INTO test_nested_dynamic VALUES (NULL, 42), (42, 'string'), ('string', [1, 2]), ([1, 2], NULL); +SELECT d1, dynamicType(d1), d2, dynamicType(d2) FROM test_nested_dynamic; + +CREATE TABLE test_rapid_schema (d Dynamic) ENGINE = Memory; +INSERT INTO test_rapid_schema VALUES (42), ('string1'), (toDate('2021-01-01')), ([1, 2, 3]), (3.14), ('string2'), (toDateTime('2021-01-01 12:00:00')), (['array', 'of', 'strings']), (NULL), (toFloat64(42.42)); + +SELECT d, dynamicType(d), d.Int64, d.String, d.Date, d.Float64, d.DateTime, d.`Array(Int64)`, d.`Array(String)` +FROM test_rapid_schema FORMAT PrettyCompactMonoBlock; + + +SELECT ''; +SELECT finalizeAggregation(CAST(dynamic_state, 'AggregateFunction(sum, UInt64)')) +FROM +( + SELECT CAST(state, 'Dynamic') AS dynamic_state + FROM + ( + SELECT sumState(number) AS state + FROM numbers(10000) + ) +); diff --git a/tests/queries/0_stateless/03153_dynamic_type_empty.reference b/tests/queries/0_stateless/03153_dynamic_type_empty.reference new file mode 100644 index 00000000000..f7c047dcd19 --- /dev/null +++ b/tests/queries/0_stateless/03153_dynamic_type_empty.reference @@ -0,0 +1,15 @@ +[] String +[1] Array(Int64) +[] Array(Int64) +['1'] Array(String) +[] Array(Int64) +() String +(1) Tuple(Int64) +(0) Tuple(Int64) +('1') Tuple(String) +(0) Tuple(Int64) +{} String +{1:2} Map(Int64, Int64) +{} Map(Int64, Int64) +{'1':'2'} Map(String, String) +{} Map(Int64, Int64) diff --git a/tests/queries/0_stateless/03153_dynamic_type_empty.sql b/tests/queries/0_stateless/03153_dynamic_type_empty.sql new file mode 100644 index 00000000000..8e942fe6f6e --- /dev/null +++ b/tests/queries/0_stateless/03153_dynamic_type_empty.sql @@ -0,0 +1,5 @@ +SET allow_experimental_dynamic_type=1; + +CREATE TABLE test_null_empty (d Dynamic) ENGINE = Memory; +INSERT INTO test_null_empty VALUES ([]), ([1]), ([]), (['1']), ([]), (()),((1)), (()), (('1')), (()), ({}), ({1:2}), ({}), ({'1':'2'}), ({}); +SELECT d, dynamicType(d) FROM test_null_empty; diff --git a/tests/queries/0_stateless/03156_dynamic_type_concurrent_inserts.reference b/tests/queries/0_stateless/03156_dynamic_type_concurrent_inserts.reference new file mode 100644 index 00000000000..e1c7b69b136 --- /dev/null +++ b/tests/queries/0_stateless/03156_dynamic_type_concurrent_inserts.reference @@ -0,0 +1,7 @@ +Array(UInt64) 12000 10000 +Date 12000 10001 +Float64 12000 10000 +Int64 10000 10000 +Map(UInt64, String) 10000 10000 +String 10000 10000 +UInt64 4000 4000 diff --git a/tests/queries/0_stateless/03156_dynamic_type_concurrent_inserts.sh b/tests/queries/0_stateless/03156_dynamic_type_concurrent_inserts.sh new file mode 100755 index 00000000000..d7709b722c9 --- 
/dev/null +++ b/tests/queries/0_stateless/03156_dynamic_type_concurrent_inserts.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT --allow_experimental_dynamic_type=1 -q "CREATE TABLE test_cc (d Dynamic) ENGINE = Memory" + + +$CLICKHOUSE_CLIENT --allow_experimental_dynamic_type=1 -q "INSERT INTO test_cc SELECT number::Int64 AS d FROM numbers(10000) SETTINGS max_threads=1,max_insert_threads=1" & +$CLICKHOUSE_CLIENT --allow_experimental_dynamic_type=1 -q "INSERT INTO test_cc SELECT toString(number) AS d FROM numbers(10000) SETTINGS max_threads=2,max_insert_threads=2" & +$CLICKHOUSE_CLIENT --allow_experimental_dynamic_type=1 -q "INSERT INTO test_cc SELECT toDate(number % 10000) AS d FROM numbers(10000) SETTINGS max_threads=3,max_insert_threads=3" & +$CLICKHOUSE_CLIENT --allow_experimental_dynamic_type=1 -q "INSERT INTO test_cc SELECT [number, number + 1] AS d FROM numbers(10000) SETTINGS max_threads=4,max_insert_threads=4" & +$CLICKHOUSE_CLIENT --allow_experimental_dynamic_type=1 -q "INSERT INTO test_cc SELECT toFloat64(number) AS d FROM numbers(10000) SETTINGS max_threads=5,max_insert_threads=5" & +$CLICKHOUSE_CLIENT --allow_experimental_dynamic_type=1 -q "INSERT INTO test_cc SELECT map(number, toString(number)) AS d FROM numbers(10000) SETTINGS max_threads=6,max_insert_threads=6" & + +$CLICKHOUSE_CLIENT --allow_experimental_dynamic_type=1 --use_variant_as_common_type=1 --allow_experimental_variant_type=1 -q "INSERT INTO test_cc SELECT CAST(multiIf(number % 5 = 0, toString(number), number % 5 = 1, number, number % 5 = 2, toFloat64(number), number % 5 = 3, toDate('2020-01-01'), [number, number + 1]), 'Dynamic') FROM numbers(10000) SETTINGS max_threads=6,max_insert_threads=6" & + +wait + +$CLICKHOUSE_CLIENT --allow_experimental_dynamic_type=1 -q "SELECT dynamicType(d) t, count(), uniqExact(d) FROM test_cc GROUP BY t ORDER BY t" diff --git a/tests/queries/0_stateless/03157_dynamic_type_json.reference b/tests/queries/0_stateless/03157_dynamic_type_json.reference new file mode 100644 index 00000000000..38bca12bb95 --- /dev/null +++ b/tests/queries/0_stateless/03157_dynamic_type_json.reference @@ -0,0 +1,5 @@ +1 (((((((((('deep_value')))))))))) +2 (((((((((('deep_array_value')))))))))) + +(((((((((('deep_value')))))))))) Tuple(level1 Tuple(level2 Tuple(level3 Tuple(level4 Tuple(level5 Tuple(level6 Tuple(level7 Tuple(level8 Tuple(level9 Tuple(level10 String)))))))))) +(((((((((('deep_array_value')))))))))) Tuple(level1 Tuple(level2 Tuple(level3 Tuple(level4 Tuple(level5 Tuple(level6 Tuple(level7 Tuple(level8 Tuple(level9 Tuple(level10 String)))))))))) diff --git a/tests/queries/0_stateless/03157_dynamic_type_json.sql b/tests/queries/0_stateless/03157_dynamic_type_json.sql new file mode 100644 index 00000000000..cb1a5987104 --- /dev/null +++ b/tests/queries/0_stateless/03157_dynamic_type_json.sql @@ -0,0 +1,13 @@ +SET allow_experimental_dynamic_type=1; +SET allow_experimental_object_type=1; +SET allow_experimental_variant_type=1; + +CREATE TABLE test_deep_nested_json (i UInt16, d JSON) ENGINE = Memory; + +INSERT INTO test_deep_nested_json VALUES (1, '{"level1": {"level2": {"level3": {"level4": {"level5": {"level6": {"level7": {"level8": {"level9": {"level10": "deep_value"}}}}}}}}}}'); +INSERT INTO test_deep_nested_json VALUES (2, '{"level1": {"level2": {"level3": {"level4": {"level5": {"level6": {"level7": {"level8": {"level9": {"level10": 
"deep_array_value"}}}}}}}}}}'); + +SELECT * FROM test_deep_nested_json ORDER BY i; + +SELECT ''; +SELECT d::Dynamic d1, dynamicType(d1) FROM test_deep_nested_json ORDER BY i; diff --git a/tests/queries/0_stateless/03158_dynamic_type_from_variant.reference b/tests/queries/0_stateless/03158_dynamic_type_from_variant.reference new file mode 100644 index 00000000000..2ede006cedc --- /dev/null +++ b/tests/queries/0_stateless/03158_dynamic_type_from_variant.reference @@ -0,0 +1,17 @@ +false Variant(Bool, DateTime64(3), IPv6, String, UInt32) +false Variant(Bool, DateTime64(3), IPv6, String, UInt32) +true Variant(Bool, DateTime64(3), IPv6, String, UInt32) +2001-01-01 01:01:01.111 Variant(Bool, DateTime64(3), IPv6, String, UInt32) +s Variant(Bool, DateTime64(3), IPv6, String, UInt32) +0 Variant(Bool, DateTime64(3), IPv6, String, UInt32) +1 Variant(Bool, DateTime64(3), IPv6, String, UInt32) +\N Variant(Bool, DateTime64(3), IPv6, String, UInt32) + +false Bool +false Bool +true Bool +2001-01-01 01:01:01.111 DateTime64(3) +s String +0 UInt32 +1 UInt32 +\N None diff --git a/tests/queries/0_stateless/03158_dynamic_type_from_variant.sql b/tests/queries/0_stateless/03158_dynamic_type_from_variant.sql new file mode 100644 index 00000000000..20a9e17a148 --- /dev/null +++ b/tests/queries/0_stateless/03158_dynamic_type_from_variant.sql @@ -0,0 +1,15 @@ +SET allow_experimental_dynamic_type=1; +SET allow_experimental_object_type=1; +SET allow_experimental_variant_type=1; + +CREATE TABLE test_variable (v Variant(String, UInt32, IPv6, Bool, DateTime64)) ENGINE = Memory; +CREATE TABLE test_dynamic (d Dynamic) ENGINE = Memory; + +INSERT INTO test_variable VALUES (1), ('s'), (0), ('0'), ('true'), ('false'), ('2001-01-01 01:01:01.111'), (NULL); + +SELECT v, toTypeName(v) FROM test_variable ORDER BY v; + +INSERT INTO test_dynamic SELECT * FROM test_variable; + +SELECT ''; +SELECT d, dynamicType(d) FROM test_dynamic ORDER BY d; diff --git a/tests/queries/0_stateless/03159_dynamic_type_all_types.reference b/tests/queries/0_stateless/03159_dynamic_type_all_types.reference new file mode 100644 index 00000000000..a162ec4f857 --- /dev/null +++ b/tests/queries/0_stateless/03159_dynamic_type_all_types.reference @@ -0,0 +1,300 @@ +Array(Dynamic) [] +Array(Array(Dynamic)) [[]] +Array(Array(Array(Dynamic))) [[[]]] +Bool false +Bool true +Date 2022-01-01 +Date32 2022-01-01 +DateTime 2022-01-01 01:01:01 +DateTime64(3) 2022-01-01 01:01:01.011 +Decimal(9, 1) -99999999.9 +Decimal(18, 2) -999999999.99 +Decimal(38, 3) -999999999.999 +Decimal(76, 4) -999999999.9999 +Float32 -inf +Float32 -inf +Float32 -inf +Float32 -3.4028233e38 +Float32 -1.1754942e-38 +Float32 -1e-45 +Float32 1e-45 +Float32 1.1754942e-38 +Float32 3.4028233e38 +Float32 inf +Float32 inf +Float32 inf +Float32 nan +Float32 nan +Float32 nan +Float64 -inf +Float64 -inf +Float64 -inf +Float64 -1.7976931348623157e308 +Float64 -3.40282347e38 +Float64 -1.1754943499999998e-38 +Float64 -1.3999999999999999e-45 +Float64 -2.2250738585072014e-308 +Float64 2.2250738585072014e-308 +Float64 1.3999999999999999e-45 +Float64 1.1754943499999998e-38 +Float64 3.40282347e38 +Float64 1.7976931348623157e308 +Float64 inf +Float64 inf +Float64 inf +Float64 nan +Float64 nan +Float64 nan +FixedString(1) 1 +FixedString(2) 1\0 +FixedString(10) 1\0\0\0\0\0\0\0\0\0 +IPv4 192.168.0.1 +IPv6 ::1 +Int8 -128 +Int8 -128 +Int8 -127 +Int8 -127 +Int8 -1 +Int8 -1 +Int8 0 +Int8 0 +Int8 1 +Int8 1 +Int8 126 +Int8 126 +Int8 127 +Int8 127 +Int16 -32768 +Int16 -32767 +Int16 -1 +Int16 0 +Int16 1 +Int16 32766 +Int16 32767 
+Int32 -2147483648 +Int32 -2147483647 +Int32 -1 +Int32 0 +Int32 1 +Int32 2147483646 +Int32 2147483647 +Int64 -9223372036854775808 +Int64 -9223372036854775807 +Int64 -1 +Int64 0 +Int64 1 +Int64 9223372036854775806 +Int64 9223372036854775807 +Int128 -170141183460469231731687303715884105728 +Int128 -170141183460469231731687303715884105727 +Int128 -1 +Int128 0 +Int128 1 +Int128 170141183460469231731687303715884105726 +Int128 170141183460469231731687303715884105727 +Int256 -57896044618658097711785492504343953926634992332820282019728792003956564819968 +Int256 -57896044618658097711785492504343953926634992332820282019728792003956564819967 +Int256 -1 +Int256 0 +Int256 1 +Int256 57896044618658097711785492504343953926634992332820282019728792003956564819966 +Int256 57896044618658097711785492504343953926634992332820282019728792003956564819967 +IntervalDay 1 +IntervalYear 3 +IntervalMonth 2 +LowCardinality(String) 1 +LowCardinality(String) 1 +LowCardinality(UInt16) 0 +MultiPolygon [[[(0,0),(10,0),(10,10),(0,10)]],[[(20,20),(50,20),(50,50),(20,50)],[(30,30),(50,50),(50,30)]]] +Map(Dynamic, Dynamic) {'11':'v1','22':'1'} +Nested(x UInt32, y String) [(1,'aa'),(2,'bb')] +Nested(x UInt32, y Tuple(y1 UInt32, y2 Array(String)), z Nested(z1 UInt32, z2 String)) [(1,(2,['aa','bb']),[(3,'cc'),(4,'dd')]),(5,(6,['ee','ff']),[(7,'gg'),(8,'hh')])] +Object(\'json\') {"1":"2"} +Object(Nullable(\'json\')) {"1":null,"2":null,"2020-10-10":null,"k1":1,"k2":2} +Object(Nullable(\'json\')) {"1":2,"2":3,"2020-10-10":null,"k1":null,"k2":null} +Object(Nullable(\'json\')) {"1":null,"2":null,"2020-10-10":"foo","k1":null,"k2":null} +Point (1.23,4.5600000000000005) +Ring [(1.23,4.5600000000000005),(2.34,5.67)] +String string +SimpleAggregateFunction(anyLast, Array(Int16)) [1,2] +Tuple(Dynamic) ('') +Tuple(Tuple(Dynamic)) (('')) +Tuple(Tuple(Tuple(Dynamic))) (((''))) +UUID 00000000-0000-0000-0000-000000000000 +UUID dededdb6-7835-4ce4-8d11-b5de6f2820e9 +UInt8 0 +UInt8 1 +UInt8 254 +UInt8 255 +UInt16 0 +UInt16 1 +UInt16 65534 +UInt16 65535 +UInt32 0 +UInt32 1 +UInt32 4294967294 +UInt32 4294967295 +UInt64 0 +UInt64 1 +UInt64 18446744073709551614 +UInt64 18446744073709551615 +UInt128 0 +UInt128 1 +UInt128 340282366920938463463374607431768211454 +UInt128 340282366920938463463374607431768211455 +UInt256 0 +UInt256 1 +UInt256 115792089237316195423570985008687907853269984665640564039457584007913129639934 +UInt256 115792089237316195423570985008687907853269984665640564039457584007913129639935 + +Array(Dynamic) [] +Array(Array(Dynamic)) [[]] +Array(Array(Array(Dynamic))) [[[]]] +Bool false +Bool true +Date 2022-01-01 +Date32 2022-01-01 +DateTime 2022-01-01 01:01:01 +DateTime64(3) 2022-01-01 01:01:01.011 +Decimal(9, 1) -99999999.9 +Decimal(18, 2) -999999999.99 +Decimal(38, 3) -999999999.999 +Decimal(76, 4) -999999999.9999 +Float32 -inf +Float32 -inf +Float32 -inf +Float32 -3.4028233e38 +Float32 -1.1754942e-38 +Float32 -1e-45 +Float32 1e-45 +Float32 1.1754942e-38 +Float32 3.4028233e38 +Float32 inf +Float32 inf +Float32 inf +Float32 nan +Float32 nan +Float32 nan +Float64 -inf +Float64 -inf +Float64 -inf +Float64 -1.7976931348623157e308 +Float64 -3.40282347e38 +Float64 -1.1754943499999998e-38 +Float64 -1.3999999999999999e-45 +Float64 -2.2250738585072014e-308 +Float64 2.2250738585072014e-308 +Float64 1.3999999999999999e-45 +Float64 1.1754943499999998e-38 +Float64 3.40282347e38 +Float64 1.7976931348623157e308 +Float64 inf +Float64 inf +Float64 inf +Float64 nan +Float64 nan +Float64 nan +FixedString(1) 1 +FixedString(2) 1\0 +FixedString(10) 
1\0\0\0\0\0\0\0\0\0 +IPv4 192.168.0.1 +IPv6 ::1 +Int8 -128 +Int8 -128 +Int8 -127 +Int8 -127 +Int8 -1 +Int8 -1 +Int8 0 +Int8 0 +Int8 1 +Int8 1 +Int8 126 +Int8 126 +Int8 127 +Int8 127 +Int16 -32768 +Int16 -32767 +Int16 -1 +Int16 0 +Int16 1 +Int16 32766 +Int16 32767 +Int32 -2147483648 +Int32 -2147483647 +Int32 -1 +Int32 0 +Int32 1 +Int32 2147483646 +Int32 2147483647 +Int64 -9223372036854775808 +Int64 -9223372036854775807 +Int64 -1 +Int64 0 +Int64 1 +Int64 9223372036854775806 +Int64 9223372036854775807 +Int128 -170141183460469231731687303715884105728 +Int128 -170141183460469231731687303715884105727 +Int128 -1 +Int128 0 +Int128 1 +Int128 170141183460469231731687303715884105726 +Int128 170141183460469231731687303715884105727 +Int256 -57896044618658097711785492504343953926634992332820282019728792003956564819968 +Int256 -57896044618658097711785492504343953926634992332820282019728792003956564819967 +Int256 -1 +Int256 0 +Int256 1 +Int256 57896044618658097711785492504343953926634992332820282019728792003956564819966 +Int256 57896044618658097711785492504343953926634992332820282019728792003956564819967 +IntervalDay 1 +IntervalYear 3 +IntervalMonth 2 +LowCardinality(String) 1 +LowCardinality(String) 1 +LowCardinality(UInt16) 0 +MultiPolygon [[[(0,0),(10,0),(10,10),(0,10)]],[[(20,20),(50,20),(50,50),(20,50)],[(30,30),(50,50),(50,30)]]] +Map(Dynamic, Dynamic) {'11':'v1','22':'1'} +Nested(x UInt32, y String) [(1,'aa'),(2,'bb')] +Nested(x UInt32, y Tuple(y1 UInt32, y2 Array(String)), z Nested(z1 UInt32, z2 String)) [(1,(2,['aa','bb']),[(3,'cc'),(4,'dd')]),(5,(6,['ee','ff']),[(7,'gg'),(8,'hh')])] +Object(\'json\') {"1":"2"} +Object(Nullable(\'json\')) {"1":null,"2":null,"2020-10-10":null,"k1":1,"k2":2} +Object(Nullable(\'json\')) {"1":null,"2":null,"2020-10-10":"foo","k1":null,"k2":null} +Object(Nullable(\'json\')) {"1":2,"2":3,"2020-10-10":null,"k1":null,"k2":null} +Point (1.23,4.5600000000000005) +Ring [(1.23,4.5600000000000005),(2.34,5.67)] +String string +SimpleAggregateFunction(anyLast, Array(Int16)) [1,2] +Tuple(Dynamic) ('') +Tuple(Tuple(Dynamic)) (('')) +Tuple(Tuple(Tuple(Dynamic))) (((''))) +UUID 00000000-0000-0000-0000-000000000000 +UUID dededdb6-7835-4ce4-8d11-b5de6f2820e9 +UInt8 0 +UInt8 1 +UInt8 254 +UInt8 255 +UInt16 0 +UInt16 1 +UInt16 65534 +UInt16 65535 +UInt32 0 +UInt32 1 +UInt32 4294967294 +UInt32 4294967295 +UInt64 0 +UInt64 1 +UInt64 18446744073709551614 +UInt64 18446744073709551615 +UInt128 0 +UInt128 1 +UInt128 340282366920938463463374607431768211454 +UInt128 340282366920938463463374607431768211455 +UInt256 0 +UInt256 1 +UInt256 115792089237316195423570985008687907853269984665640564039457584007913129639934 +UInt256 115792089237316195423570985008687907853269984665640564039457584007913129639935 + +50 +50 diff --git a/tests/queries/0_stateless/03159_dynamic_type_all_types.sql b/tests/queries/0_stateless/03159_dynamic_type_all_types.sql new file mode 100644 index 00000000000..38d70dee64e --- /dev/null +++ b/tests/queries/0_stateless/03159_dynamic_type_all_types.sql @@ -0,0 +1,99 @@ +-- Tags: no-random-settings + +SET allow_experimental_dynamic_type=1; +SET allow_experimental_object_type=1; +SET allow_experimental_variant_type=1; +SET allow_suspicious_low_cardinality_types=1; + + +CREATE TABLE t (d Dynamic(max_types=255)) ENGINE = Memory; +-- Integer types: signed and unsigned integers (UInt8, UInt16, UInt32, UInt64, UInt128, UInt256, Int8, Int16, Int32, Int64, Int128, Int256) +INSERT INTO t VALUES (-128::Int8), (-127::Int8), (-1::Int8), (0::Int8), (1::Int8), (126::Int8), (127::Int8); 
+INSERT INTO t VALUES (-128::Int8), (-127::Int8), (-1::Int8), (0::Int8), (1::Int8), (126::Int8), (127::Int8); +INSERT INTO t VALUES (-32768::Int16), (-32767::Int16), (-1::Int16), (0::Int16), (1::Int16), (32766::Int16), (32767::Int16); +INSERT INTO t VALUES (-2147483648::Int32), (-2147483647::Int32), (-1::Int32), (0::Int32), (1::Int32), (2147483646::Int32), (2147483647::Int32); +INSERT INTO t VALUES (-9223372036854775808::Int64), (-9223372036854775807::Int64), (-1::Int64), (0::Int64), (1::Int64), (9223372036854775806::Int64), (9223372036854775807::Int64); +INSERT INTO t VALUES (-170141183460469231731687303715884105728::Int128), (-170141183460469231731687303715884105727::Int128), (-1::Int128), (0::Int128), (1::Int128), (170141183460469231731687303715884105726::Int128), (170141183460469231731687303715884105727::Int128); +INSERT INTO t VALUES (-57896044618658097711785492504343953926634992332820282019728792003956564819968::Int256), (-57896044618658097711785492504343953926634992332820282019728792003956564819967::Int256), (-1::Int256), (0::Int256), (1::Int256), (57896044618658097711785492504343953926634992332820282019728792003956564819966::Int256), (57896044618658097711785492504343953926634992332820282019728792003956564819967::Int256); + +INSERT INTO t VALUES (0::UInt8), (1::UInt8), (254::UInt8), (255::UInt8); +INSERT INTO t VALUES (0::UInt16), (1::UInt16), (65534::UInt16), (65535::UInt16); +INSERT INTO t VALUES (0::UInt32), (1::UInt32), (4294967294::UInt32), (4294967295::UInt32); +INSERT INTO t VALUES (0::UInt64), (1::UInt64), (18446744073709551614::UInt64), (18446744073709551615::UInt64); +INSERT INTO t VALUES (0::UInt128), (1::UInt128), (340282366920938463463374607431768211454::UInt128), (340282366920938463463374607431768211455::UInt128); +INSERT INTO t VALUES (0::UInt256), (1::UInt256), (115792089237316195423570985008687907853269984665640564039457584007913129639934::UInt256), (115792089237316195423570985008687907853269984665640564039457584007913129639935::UInt256); + +-- Floating-point numbers: floats(Float32 and Float64) and Decimal values +INSERT INTO t VALUES (1.17549435e-38::Float32), (3.40282347e+38::Float32), (-3.40282347e+38::Float32), (-1.17549435e-38::Float32), (1.4e-45::Float32), (-1.4e-45::Float32); +INSERT INTO t VALUES (inf::Float32), (-inf::Float32), (nan::Float32); +INSERT INTO t VALUES (inf::FLOAT(12)), (-inf::FLOAT(12)), (nan::FLOAT(12)); +INSERT INTO t VALUES (inf::FLOAT(15,22)), (-inf::FLOAT(15,22)), (nan::FLOAT(15,22)); + +INSERT INTO t VALUES (1.17549435e-38::Float64), (3.40282347e+38::Float64), (-3.40282347e+38::Float64), (-1.17549435e-38::Float64), (1.4e-45::Float64), (-1.4e-45::Float64); +INSERT INTO t VALUES (2.2250738585072014e-308::Float64), (1.7976931348623157e+308::Float64), (-1.7976931348623157e+308::Float64), (-2.2250738585072014e-308::Float64); +INSERT INTO t VALUES (inf::Float64), (-inf::Float64), (nan::Float64); +INSERT INTO t VALUES (inf::DOUBLE(12)), (-inf::DOUBLE(12)), (nan::DOUBLE(12)); +INSERT INTO t VALUES (inf::DOUBLE(15,22)), (-inf::DOUBLE(15,22)), (nan::DOUBLE(15,22)); + +INSERT INTO t VALUES (-99999999.9::Decimal32(1)); +INSERT INTO t VALUES (-999999999.99::Decimal64(2)); +INSERT INTO t VALUES (-999999999.999::Decimal128(3)); +INSERT INTO t VALUES (-999999999.9999::Decimal256(4)); + +-- Strings: String and FixedString +INSERT INTO t VALUES ('string'::String), ('1'::FixedString(1)), ('1'::FixedString(2)), ('1'::FixedString(10)); --(''::String), + +-- Boolean +INSERT INTO t VALUES ('1'::Bool), (0::Bool); + +-- Dates: use Date and Date32 for days, and 
DateTime and DateTime64 for instances in time +INSERT INTO t VALUES ('2022-01-01'::Date), ('2022-01-01'::Date32), ('2022-01-01 01:01:01'::DateTime), ('2022-01-01 01:01:01.011'::DateTime64); + +-- JSON +INSERT INTO t VALUES ('{"1":"2"}'::JSON); +INSERT INTO t FORMAT JSONEachRow {"d" : {"k1" : 1, "k2" : 2}} {"d" : {"1" : 2, "2" : 3}} {"d" : {"2020-10-10" : "foo"}}; + +-- UUID +INSERT INTO t VALUES ('dededdb6-7835-4ce4-8d11-b5de6f2820e9'::UUID); +INSERT INTO t VALUES ('00000000-0000-0000-0000-000000000000'::UUID); + +-- LowCardinality +INSERT INTO t VALUES ('1'::LowCardinality(String)), ('1'::LowCardinality(String)), (0::LowCardinality(UInt16)); + +-- Arrays +INSERT INTO t VALUES ([]::Array(Dynamic)), ([[]]::Array(Array(Dynamic))), ([[[]]]::Array(Array(Array(Dynamic)))); + +-- Tuple +INSERT INTO t VALUES (()::Tuple(Dynamic)), ((())::Tuple(Tuple(Dynamic))), (((()))::Tuple(Tuple(Tuple(Dynamic)))); + +-- Map. +INSERT INTO t VALUES (map(11::Dynamic, 'v1'::Dynamic, '22'::Dynamic, 1::Dynamic)); + +-- SimpleAggregateFunction +INSERT INTO t VALUES ([1,2]::SimpleAggregateFunction(anyLast, Array(Int16))); + +-- IPs +INSERT INTO t VALUES (toIPv4('192.168.0.1')), (toIPv6('::1')); + +-- Geo +INSERT INTO t VALUES ((1.23, 4.56)::Point), (([(1.23, 4.56)::Point, (2.34, 5.67)::Point])::Ring); +INSERT INTO t VALUES ([[[(0, 0), (10, 0), (10, 10), (0, 10)]], [[(20, 20), (50, 20), (50, 50), (20, 50)],[(30, 30), (50, 50), (50, 30)]]]::MultiPolygon); + +-- Interval +INSERT INTO t VALUES (interval '1' day), (interval '2' month), (interval '3' year); + +-- Nested +INSERT INTO t VALUES ([(1, 'aa'), (2, 'bb')]::Nested(x UInt32, y String)); +INSERT INTO t VALUES ([(1, (2, ['aa', 'bb']), [(3, 'cc'), (4, 'dd')]), (5, (6, ['ee', 'ff']), [(7, 'gg'), (8, 'hh')])]::Nested(x UInt32, y Tuple(y1 UInt32, y2 Array(String)), z Nested(z1 UInt32, z2 String))); + +SELECT dynamicType(d), d FROM t ORDER BY substring(dynamicType(d),1,1), length(dynamicType(d)), d ; + +CREATE TABLE t2 (d Dynamic(max_types=255)) ENGINE = Memory; +INSERT INTO t2 SELECT * FROM t; + +SELECT ''; +SELECT dynamicType(d), d FROM t2 ORDER BY substring(dynamicType(d),1,1), length(dynamicType(d)), d; + +SELECT ''; +SELECT uniqExact(dynamicType(d)) t_ FROM t; +SELECT uniqExact(dynamicType(d)) t_ FROM t2; diff --git a/tests/queries/0_stateless/03160_dynamic_type_agg.reference b/tests/queries/0_stateless/03160_dynamic_type_agg.reference new file mode 100644 index 00000000000..54f6e428839 --- /dev/null +++ b/tests/queries/0_stateless/03160_dynamic_type_agg.reference @@ -0,0 +1 @@ +4950 4950 diff --git a/tests/queries/0_stateless/03160_dynamic_type_agg.sql b/tests/queries/0_stateless/03160_dynamic_type_agg.sql new file mode 100644 index 00000000000..f99232031a8 --- /dev/null +++ b/tests/queries/0_stateless/03160_dynamic_type_agg.sql @@ -0,0 +1,10 @@ +SET allow_experimental_dynamic_type=1; + +CREATE TABLE t (d Dynamic) ENGINE = Memory; + +INSERT INTO t SELECT sumState(number) AS d FROM numbers(100); + +SELECT finalizeAggregation(d.`AggregateFunction(sum, UInt64)`), + sumMerge(d.`AggregateFunction(sum, UInt64)`) +FROM t GROUP BY d.`AggregateFunction(sum, UInt64)`; + diff --git a/tests/queries/0_stateless/03162_dynamic_type_nested.reference b/tests/queries/0_stateless/03162_dynamic_type_nested.reference new file mode 100644 index 00000000000..8d5bcb5f85a --- /dev/null +++ b/tests/queries/0_stateless/03162_dynamic_type_nested.reference @@ -0,0 +1,4 @@ + ┌─dynamicType(d)──────────────┬─d─────────────────────────────────────────┬─d.Nested(x UInt32, y Dynamic).x─┬─d.Nested(x 
UInt32, y Dynamic).y───┬─dynamicType(arrayElement(d.Nested(x UInt32, y Dynamic).y, 1))─┬─d.Nested(x UInt32, y Dynamic).y.String─┬─d.Nested(x UInt32, y Dynamic).y.Tuple(Int64, Array(String))─┐ +1. │ Nested(x UInt32, y Dynamic) │ [(1,'aa'),(2,'bb')] │ [1,2] │ ['aa','bb'] │ String │ ['aa','bb'] │ [(0,[]),(0,[])] │ +2. │ Nested(x UInt32, y Dynamic) │ [(1,(2,['aa','bb'])),(5,(6,['ee','ff']))] │ [1,5] │ [(2,['aa','bb']),(6,['ee','ff'])] │ Tuple(Int64, Array(String)) │ [NULL,NULL] │ [(2,['aa','bb']),(6,['ee','ff'])] │ + └─────────────────────────────┴───────────────────────────────────────────┴─────────────────────────────────┴───────────────────────────────────┴───────────────────────────────────────────────────────────────┴────────────────────────────────────────┴─────────────────────────────────────────────────────────────┘ diff --git a/tests/queries/0_stateless/03162_dynamic_type_nested.sql b/tests/queries/0_stateless/03162_dynamic_type_nested.sql new file mode 100644 index 00000000000..94007459a9e --- /dev/null +++ b/tests/queries/0_stateless/03162_dynamic_type_nested.sql @@ -0,0 +1,16 @@ +SET allow_experimental_dynamic_type=1; + +CREATE TABLE t (d Dynamic) ENGINE = Memory; + +INSERT INTO t VALUES ([(1, 'aa'), (2, 'bb')]::Nested(x UInt32, y Dynamic)) ; +INSERT INTO t VALUES ([(1, (2, ['aa', 'bb'])), (5, (6, ['ee', 'ff']))]::Nested(x UInt32, y Dynamic)); + +SELECT dynamicType(d), + d, + d.`Nested(x UInt32, y Dynamic)`.x, + d.`Nested(x UInt32, y Dynamic)`.y, + dynamicType(d.`Nested(x UInt32, y Dynamic)`.y[1]), + d.`Nested(x UInt32, y Dynamic)`.y.`String`, + d.`Nested(x UInt32, y Dynamic)`.y.`Tuple(Int64, Array(String))` +FROM t ORDER BY d +FORMAT PrettyCompactMonoBlock; From 4829db4d9e80a02eca4b08779bd645bcd3ed5ba7 Mon Sep 17 00:00:00 2001 From: Nikita Fomichev Date: Thu, 16 May 2024 14:51:22 +0200 Subject: [PATCH 146/392] Add Dynamic type in fuzzer tests --- tests/fuzz/dictionaries/datatypes.dict | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/fuzz/dictionaries/datatypes.dict b/tests/fuzz/dictionaries/datatypes.dict index 232e89db0c0..a01a94fd3e3 100644 --- a/tests/fuzz/dictionaries/datatypes.dict +++ b/tests/fuzz/dictionaries/datatypes.dict @@ -132,3 +132,4 @@ "YEAR" "bool" "boolean" +"Dynamic" From 73504a048bdc8076b079fcbe93578229348ef761 Mon Sep 17 00:00:00 2001 From: Nikita Fomichev Date: Thu, 16 May 2024 14:51:57 +0200 Subject: [PATCH 147/392] Fix doc --- docs/en/sql-reference/data-types/dynamic.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/data-types/dynamic.md b/docs/en/sql-reference/data-types/dynamic.md index a2c8ba532ce..eabf032c52f 100644 --- a/docs/en/sql-reference/data-types/dynamic.md +++ b/docs/en/sql-reference/data-types/dynamic.md @@ -355,7 +355,7 @@ SELECT * FROM test WHERE d2 == [1,2,3]::Array(UInt32)::Dynamic; - Compare `Dynamic` subcolumn with required type: ```sql -SELECT * FROM test WHERE d2.`Array(Int65)` == [1,2,3] -- or using variantElement(d2, 'Array(UInt32)') +SELECT * FROM test WHERE d2.`Array(Int64)` == [1,2,3] -- or using variantElement(d2, 'Array(UInt32)') ``` ```text From bb130f429e09b20d74f4df550fc096bd68262a14 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Thu, 16 May 2024 12:40:44 +0000 Subject: [PATCH 148/392] fix reading of columns of type Tuple(Map(LowCardinality(...))) --- .../SerializationLowCardinality.cpp | 9 ++++- .../03156_tuple_map_low_cardinality.reference | 6 ++++ .../03156_tuple_map_low_cardinality.sql | 33 +++++++++++++++++++ 3 files changed, 47 insertions(+), 1 deletion(-) create mode 
100644 tests/queries/0_stateless/03156_tuple_map_low_cardinality.reference create mode 100644 tests/queries/0_stateless/03156_tuple_map_low_cardinality.sql diff --git a/src/DataTypes/Serializations/SerializationLowCardinality.cpp b/src/DataTypes/Serializations/SerializationLowCardinality.cpp index 2d2be195098..18d6e48623b 100644 --- a/src/DataTypes/Serializations/SerializationLowCardinality.cpp +++ b/src/DataTypes/Serializations/SerializationLowCardinality.cpp @@ -515,8 +515,14 @@ void SerializationLowCardinality::deserializeBinaryBulkWithMultipleStreams( size_t limit, DeserializeBinaryBulkSettings & settings, DeserializeBinaryBulkStatePtr & state, - SubstreamsCache * /* cache */) const + SubstreamsCache * cache) const { + if (auto cached_column = getFromSubstreamsCache(cache, settings.path)) + { + column = cached_column; + return; + } + auto mutable_column = column->assumeMutable(); ColumnLowCardinality & low_cardinality_column = typeid_cast(*mutable_column); @@ -670,6 +676,7 @@ void SerializationLowCardinality::deserializeBinaryBulkWithMultipleStreams( } column = std::move(mutable_column); + addToSubstreamsCache(cache, settings.path, column); } void SerializationLowCardinality::serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings & settings) const diff --git a/tests/queries/0_stateless/03156_tuple_map_low_cardinality.reference b/tests/queries/0_stateless/03156_tuple_map_low_cardinality.reference new file mode 100644 index 00000000000..5b2a36927ee --- /dev/null +++ b/tests/queries/0_stateless/03156_tuple_map_low_cardinality.reference @@ -0,0 +1,6 @@ +100000 +100000 +100000 +100000 +100000 +100000 diff --git a/tests/queries/0_stateless/03156_tuple_map_low_cardinality.sql b/tests/queries/0_stateless/03156_tuple_map_low_cardinality.sql new file mode 100644 index 00000000000..836b426a9a9 --- /dev/null +++ b/tests/queries/0_stateless/03156_tuple_map_low_cardinality.sql @@ -0,0 +1,33 @@ +DROP TABLE IF EXISTS t_map_lc; + +CREATE TABLE t_map_lc +( + id UInt64, + t Tuple(m Map(LowCardinality(String), LowCardinality(String))) +) +ENGINE = MergeTree ORDER BY id SETTINGS min_bytes_for_wide_part = 0; + +INSERT INTO t_map_lc SELECT * FROM generateRandom('id UInt64, t Tuple(m Map(LowCardinality(String), LowCardinality(String)))') LIMIT 100000; + +SELECT count(), FROM t_map_lc WHERE NOT ignore(*, mapKeys(t.m)); +SELECT count(), FROM t_map_lc WHERE NOT ignore(*, t.m.keys); +SELECT count(), FROM t_map_lc WHERE NOT ignore(*, t.m.values); +SELECT * FROM t_map_lc WHERE mapContains(t.m, 'not_existing_key_1337'); + +DROP TABLE t_map_lc; + +CREATE TABLE t_map_lc +( + id UInt64, + t Tuple(m Map(LowCardinality(String), LowCardinality(String))) +) +ENGINE = MergeTree ORDER BY id SETTINGS min_bytes_for_wide_part = '10G'; + +INSERT INTO t_map_lc SELECT * FROM generateRandom('id UInt64, t Tuple(m Map(LowCardinality(String), LowCardinality(String)))') LIMIT 100000; + +SELECT count(), FROM t_map_lc WHERE NOT ignore(*, mapKeys(t.m)); +SELECT count(), FROM t_map_lc WHERE NOT ignore(*, t.m.keys); +SELECT count(), FROM t_map_lc WHERE NOT ignore(*, t.m.values); +SELECT * FROM t_map_lc WHERE mapContains(t.m, 'not_existing_key_1337'); + +DROP TABLE t_map_lc; From d10bf725f030d9a2fd18b0dd87be409c22461eb5 Mon Sep 17 00:00:00 2001 From: Max K Date: Thu, 16 May 2024 14:57:22 +0200 Subject: [PATCH 149/392] ci config update to enable job --- tests/ci/ci_config.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/ci/ci_config.py b/tests/ci/ci_config.py index dc67e05455c..60ad6933afc 
100644 --- a/tests/ci/ci_config.py +++ b/tests/ci/ci_config.py @@ -1260,8 +1260,7 @@ CI_CONFIG = CIConfig( ), JobNames.INTEGRATION_TEST_ARM: TestConfig( Build.PACKAGE_AARCH64, - # add [run_by_label="test arm"] to not run in regular pr workflow by default - job_config=JobConfig(num_batches=6, **integration_test_common_params, run_by_label="test arm"), # type: ignore + job_config=JobConfig(num_batches=5, **integration_test_common_params), # type: ignore ), JobNames.INTEGRATION_TEST: TestConfig( Build.PACKAGE_RELEASE, From 20b0a208bfdddd68f04c18ff74b3e2d4c99e2e2d Mon Sep 17 00:00:00 2001 From: Blargian Date: Thu, 16 May 2024 15:04:13 +0200 Subject: [PATCH 150/392] Add proportionsZTest to docs --- .../functions/other-functions.md | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/docs/en/sql-reference/functions/other-functions.md b/docs/en/sql-reference/functions/other-functions.md index 2b0215115cb..64f823d0656 100644 --- a/docs/en/sql-reference/functions/other-functions.md +++ b/docs/en/sql-reference/functions/other-functions.md @@ -903,6 +903,52 @@ SELECT parseTimeDelta('1yr2mo') └──────────────────────────┘ ``` +## proportionsZTest + +Returns test statistics for the two proportion Z-test - a statistical test for comparing the proportions from two populations `x` and `y`. + +**Syntax** + +```sql +proportionsZTest(successes_x, successes_y, trials_x, trials_y, conf_level, pool_type) +``` + +**Arguments** + +- `successes_x`: Number of successes in population `x`. [UInt64](../data-types/int-uint.md). +- `successes_y`: Number of successes in population `y`. [UInt64](../data-types/int-uint.md). +- `trials_x`: Number of trials in population `x`. [UInt64](../data-types/int-uint.md). +- `trials_y`: NUmber of trials in population `y`. [UInt64](../data-types/int-uint.md). +- `conf_level`: Confidence level for the test. [Float64](../data-types/float.md). +- `pool_type`: Selection of pooling (way in which the standard error is estimated). can be either `unpooled` or `pooled`. [String](../data-types/string.md). + +:::note +For argument `pool_type`: In the pooled version, the two proportions are averaged, and only one proportion is used to estimate the standard error. In the unpooled version, the two proportions are used separately. +::: + +**Returned value** + +- `z_stat`: Z statistic. [Float64](../data-types/float.md). +- `p_val`: P value. [Float64](../data-types/float.md). +- `ci_low`: The lower confidence interval. [Float64](../data-types/float.md). +- `ci_high`: The upper confidence interval. [Float64](../data-types/float.md). + +**Example** + +Query: + +```sql +SELECT proportionsZTest(10, 11, 100, 101, 0.95, 'unpooled'); +``` + +Result: + +```response +┌─proportionsZTest(10, 11, 100, 101, 0.95, 'unpooled')───────────────────────────────┐ +│ (-0.20656724435948853,0.8363478437079654,-0.09345975390115283,0.07563797172293502) │ +└────────────────────────────────────────────────────────────────────────────────────┘ +``` + ## least(a, b) Returns the smaller value of a and b. 
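The documented example output above can be sanity-checked independently of ClickHouse. The sketch below is a minimal standalone reproduction of the `unpooled` two-proportion Z-test that the new documentation describes; the Python function name and structure are illustrative assumptions (not part of this patch or of the ClickHouse codebase), and only the standard library is used.

```python
from statistics import NormalDist

def proportions_ztest_unpooled(successes_x, successes_y, trials_x, trials_y, conf_level=0.95):
    """Two-proportion Z-test with an unpooled standard error (normal approximation)."""
    nd = NormalDist()
    p_x = successes_x / trials_x
    p_y = successes_y / trials_y
    diff = p_x - p_y
    # 'unpooled': each sample contributes its own variance estimate to the standard error.
    se = (p_x * (1 - p_x) / trials_x + p_y * (1 - p_y) / trials_y) ** 0.5
    z = diff / se
    p_val = 2 * (1 - nd.cdf(abs(z)))            # two-sided p-value
    z_crit = nd.inv_cdf(0.5 + conf_level / 2)   # ~1.96 for conf_level = 0.95
    return z, p_val, diff - z_crit * se, diff + z_crit * se

# Matches the documented example proportionsZTest(10, 11, 100, 101, 0.95, 'unpooled'):
# prints approximately (-0.20657, 0.83635, -0.09346, 0.07564).
print(proportions_ztest_unpooled(10, 11, 100, 101))
```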
From 3ff2ec0a7d8d3006ccf90250cb95b6ac7c1e872e Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 16 May 2024 15:58:27 +0200 Subject: [PATCH 151/392] Fix segfault --- src/Storages/ObjectStorage/StorageObjectStorageSource.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.h b/src/Storages/ObjectStorage/StorageObjectStorageSource.h index 9c67a125f5e..abaf51edc4e 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.h @@ -62,7 +62,7 @@ protected: const std::optional format_settings; const UInt64 max_block_size; const bool need_only_count; - const ReadFromFormatInfo & read_from_format_info; + const ReadFromFormatInfo read_from_format_info; const std::shared_ptr create_reader_pool; ColumnsDescription columns_desc; From 17aa7991016875df603bec8495e17d3c1dbb7d3a Mon Sep 17 00:00:00 2001 From: Max K Date: Thu, 16 May 2024 16:43:54 +0200 Subject: [PATCH 152/392] CI: aarh64: disable kerberos tests --- tests/ci/ci_config.py | 2 +- tests/integration/test_storage_kerberized_kafka/test.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/ci/ci_config.py b/tests/ci/ci_config.py index 60ad6933afc..4761b5b450f 100644 --- a/tests/ci/ci_config.py +++ b/tests/ci/ci_config.py @@ -1260,7 +1260,7 @@ CI_CONFIG = CIConfig( ), JobNames.INTEGRATION_TEST_ARM: TestConfig( Build.PACKAGE_AARCH64, - job_config=JobConfig(num_batches=5, **integration_test_common_params), # type: ignore + job_config=JobConfig(num_batches=6, **integration_test_common_params), # type: ignore ), JobNames.INTEGRATION_TEST: TestConfig( Build.PACKAGE_RELEASE, diff --git a/tests/integration/test_storage_kerberized_kafka/test.py b/tests/integration/test_storage_kerberized_kafka/test.py index 451e1ab2ccf..24d10d7ff83 100644 --- a/tests/integration/test_storage_kerberized_kafka/test.py +++ b/tests/integration/test_storage_kerberized_kafka/test.py @@ -5,7 +5,7 @@ import time import pytest import logging -from helpers.cluster import ClickHouseCluster +from helpers.cluster import ClickHouseCluster, is_arm from helpers.test_tools import TSV from helpers.client import QueryRuntimeException @@ -18,6 +18,10 @@ from kafka.protocol.admin import DescribeGroupsResponse_v1, DescribeGroupsReques from kafka.protocol.group import MemberAssignment import socket +if is_arm(): + # skip due to no arm support for clickhouse/kerberos-kdc docker image + pytestmark = pytest.mark.skip + cluster = ClickHouseCluster(__file__) instance = cluster.add_instance( "instance", From 93601066ea74a11da2dffedf6289e442997afaf9 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Thu, 16 May 2024 14:54:21 +0000 Subject: [PATCH 153/392] Automatic style fix --- tests/ci/ci.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/tests/ci/ci.py b/tests/ci/ci.py index 3ed584f5d93..9c2ded20cff 100644 --- a/tests/ci/ci.py +++ b/tests/ci/ci.py @@ -968,10 +968,7 @@ class CiOptions: for job in jobs_to_do[:]: job_param = jobs_params[job] - if ( - job_param["run_by_ci_option"] - and job not in jobs_to_do_requested - ): + if job_param["run_by_ci_option"] and job not in jobs_to_do_requested: print( f"Erasing job '{job}' from list because it's not in included set, but will run only by include" ) @@ -1445,8 +1442,7 @@ def _configure_jobs( jobs_params[job] = { "batches": batches_to_do, "num_batches": num_batches, - "run_by_ci_option": job_config.run_by_ci_option - and pr_info.is_pr, + "run_by_ci_option": 
job_config.run_by_ci_option and pr_info.is_pr, } elif add_to_skip: # treat job as being skipped only if it's controlled by digest From d8941873ec0fca6b4a2f6f27e2b095d46ac75753 Mon Sep 17 00:00:00 2001 From: Shaun Struwig <41984034+Blargian@users.noreply.github.com> Date: Thu, 16 May 2024 17:38:15 +0200 Subject: [PATCH 154/392] Fix typo --- docs/en/sql-reference/functions/other-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/functions/other-functions.md b/docs/en/sql-reference/functions/other-functions.md index 64f823d0656..288432167bb 100644 --- a/docs/en/sql-reference/functions/other-functions.md +++ b/docs/en/sql-reference/functions/other-functions.md @@ -918,7 +918,7 @@ proportionsZTest(successes_x, successes_y, trials_x, trials_y, conf_level, pool_ - `successes_x`: Number of successes in population `x`. [UInt64](../data-types/int-uint.md). - `successes_y`: Number of successes in population `y`. [UInt64](../data-types/int-uint.md). - `trials_x`: Number of trials in population `x`. [UInt64](../data-types/int-uint.md). -- `trials_y`: NUmber of trials in population `y`. [UInt64](../data-types/int-uint.md). +- `trials_y`: Number of trials in population `y`. [UInt64](../data-types/int-uint.md). - `conf_level`: Confidence level for the test. [Float64](../data-types/float.md). - `pool_type`: Selection of pooling (way in which the standard error is estimated). can be either `unpooled` or `pooled`. [String](../data-types/string.md). From 9f70cb7cbfea827dcd2458beb5545608d14a5f02 Mon Sep 17 00:00:00 2001 From: Shaun Struwig <41984034+Blargian@users.noreply.github.com> Date: Thu, 16 May 2024 17:39:18 +0200 Subject: [PATCH 155/392] Update aspell-dict.txt --- utils/check-style/aspell-ignore/en/aspell-dict.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index a69ca0fb644..bea838c1269 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -759,6 +759,7 @@ Promtail Protobuf ProtobufSingle ProxySQL +proportionsZTest Punycode PyArrow PyCharm @@ -2753,6 +2754,7 @@ unixODBC unixodbc unoptimized unparsed +unpooled unrealiable unreplicated unresolvable From e4eaf256b1746420ef359deef1af788eab02f0d8 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Thu, 16 May 2024 16:33:41 +0200 Subject: [PATCH 156/392] Analyzer: Fix COLUMNS resolve --- src/Analyzer/Passes/QueryAnalysisPass.cpp | 30 +++++++++++++++++-- .../03152_analyzer_columns_list.reference | 1 + .../03152_analyzer_columns_list.sql | 1 + 3 files changed, 30 insertions(+), 2 deletions(-) create mode 100644 tests/queries/0_stateless/03152_analyzer_columns_list.reference create mode 100644 tests/queries/0_stateless/03152_analyzer_columns_list.sql diff --git a/src/Analyzer/Passes/QueryAnalysisPass.cpp b/src/Analyzer/Passes/QueryAnalysisPass.cpp index f0a3a2c74b6..dad1b41c7af 100644 --- a/src/Analyzer/Passes/QueryAnalysisPass.cpp +++ b/src/Analyzer/Passes/QueryAnalysisPass.cpp @@ -4608,6 +4608,34 @@ QueryAnalyzer::QueryTreeNodesWithNames QueryAnalyzer::resolveUnqualifiedMatcher( std::unordered_set table_expression_column_names_to_skip; + QueryTreeNodesWithNames result; + + if (matcher_node_typed.getMatcherType() == MatcherNodeType::COLUMNS_LIST) + { + auto identifiers = matcher_node_typed.getColumnsIdentifiers(); + result.reserve(identifiers.size()); + + for (const auto & identifier : identifiers) + { + auto resolve_result = 
tryResolveIdentifier(IdentifierLookup{identifier, IdentifierLookupContext::EXPRESSION}, scope); + if (!resolve_result.isResolved()) + throw Exception(ErrorCodes::UNKNOWN_IDENTIFIER, + "Unknown identifier '{}' inside COLUMNS matcher. In scope {}", + identifier.getFullName(), scope.dump()); + + // TODO: Introduce IdentifierLookupContext::COLUMN and get read of this check + auto * resolved_column = resolve_result.resolved_identifier->as(); + if (!resolved_column) + throw Exception(ErrorCodes::UNKNOWN_IDENTIFIER, + "Identifier '{}' inside COLUMNS matcher must resolve into a column, but got {}. In scope {}", + identifier.getFullName(), resolve_result.resolved_identifier->getNodeTypeName(), scope.dump()); + result.emplace_back(resolve_result.resolved_identifier, resolved_column->getColumnName()); + } + return result; + } + + result.resize(matcher_node_typed.getColumnsIdentifiers().size()); + for (auto & table_expression : table_expressions_stack) { bool table_expression_in_resolve_process = nearest_query_scope->table_expressions_in_resolve_process.contains(table_expression.get()); @@ -4775,8 +4803,6 @@ QueryAnalyzer::QueryTreeNodesWithNames QueryAnalyzer::resolveUnqualifiedMatcher( table_expressions_column_nodes_with_names_stack.push_back(std::move(matched_column_nodes_with_names)); } - QueryTreeNodesWithNames result; - for (auto & table_expression_column_nodes_with_names : table_expressions_column_nodes_with_names_stack) { for (auto && table_expression_column_node_with_name : table_expression_column_nodes_with_names) diff --git a/tests/queries/0_stateless/03152_analyzer_columns_list.reference b/tests/queries/0_stateless/03152_analyzer_columns_list.reference new file mode 100644 index 00000000000..eefa8ebd513 --- /dev/null +++ b/tests/queries/0_stateless/03152_analyzer_columns_list.reference @@ -0,0 +1 @@ +11323 8 diff --git a/tests/queries/0_stateless/03152_analyzer_columns_list.sql b/tests/queries/0_stateless/03152_analyzer_columns_list.sql new file mode 100644 index 00000000000..5a7e3e9696e --- /dev/null +++ b/tests/queries/0_stateless/03152_analyzer_columns_list.sql @@ -0,0 +1 @@ +SELECT COLUMNS(license_text, library_name) APPLY (length) FROM system.licenses ORDER BY library_name LIMIT 1; From d5b690339309ba5082e20af294dcabf5ec306a7c Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Thu, 16 May 2024 16:49:28 +0200 Subject: [PATCH 157/392] Cleanup and add test --- src/Analyzer/Passes/QueryAnalysisPass.cpp | 6 ++++-- tests/queries/0_stateless/03152_analyzer_columns_list.sql | 2 ++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/Analyzer/Passes/QueryAnalysisPass.cpp b/src/Analyzer/Passes/QueryAnalysisPass.cpp index dad1b41c7af..dfc5ebb3532 100644 --- a/src/Analyzer/Passes/QueryAnalysisPass.cpp +++ b/src/Analyzer/Passes/QueryAnalysisPass.cpp @@ -4623,12 +4623,14 @@ QueryAnalyzer::QueryTreeNodesWithNames QueryAnalyzer::resolveUnqualifiedMatcher( "Unknown identifier '{}' inside COLUMNS matcher. In scope {}", identifier.getFullName(), scope.dump()); - // TODO: Introduce IdentifierLookupContext::COLUMN and get read of this check + // TODO: Introduce IdentifierLookupContext::COLUMN and get rid of this check auto * resolved_column = resolve_result.resolved_identifier->as(); if (!resolved_column) throw Exception(ErrorCodes::UNKNOWN_IDENTIFIER, "Identifier '{}' inside COLUMNS matcher must resolve into a column, but got {}. 
In scope {}", - identifier.getFullName(), resolve_result.resolved_identifier->getNodeTypeName(), scope.dump()); + identifier.getFullName(), + resolve_result.resolved_identifier->getNodeTypeName(), + scope.scope_node->formatASTForErrorMessage()); result.emplace_back(resolve_result.resolved_identifier, resolved_column->getColumnName()); } return result; diff --git a/tests/queries/0_stateless/03152_analyzer_columns_list.sql b/tests/queries/0_stateless/03152_analyzer_columns_list.sql index 5a7e3e9696e..2b19cdf37a2 100644 --- a/tests/queries/0_stateless/03152_analyzer_columns_list.sql +++ b/tests/queries/0_stateless/03152_analyzer_columns_list.sql @@ -1 +1,3 @@ SELECT COLUMNS(license_text, library_name) APPLY (length) FROM system.licenses ORDER BY library_name LIMIT 1; + +SELECT COLUMNS(license_text, library_name, xyz) APPLY (length) FROM system.licenses; -- { serverError UNKNOWN_IDENTIFIER } From b82eeeee88b521f5a4beb4a20006a452f0c0bb35 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Thu, 16 May 2024 17:43:59 +0000 Subject: [PATCH 158/392] Check what would be broken if do not add all the identifiers to functions map. --- src/Analyzer/Passes/QueryAnalysisPass.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/Analyzer/Passes/QueryAnalysisPass.cpp b/src/Analyzer/Passes/QueryAnalysisPass.cpp index 52efee03ae4..d83b1b847bf 100644 --- a/src/Analyzer/Passes/QueryAnalysisPass.cpp +++ b/src/Analyzer/Passes/QueryAnalysisPass.cpp @@ -1039,10 +1039,6 @@ private: auto [_, inserted] = scope.alias_name_to_expression_node.insert(std::make_pair(alias, node)); if (!inserted) scope.nodes_with_duplicated_aliases.insert(node); - - /// If node is identifier put it also in scope alias name to lambda node map - if (node->getNodeType() == QueryTreeNodeType::IDENTIFIER) - scope.alias_name_to_lambda_node.insert(std::make_pair(alias, node)); } IdentifierResolveScope & scope; From 3fe9255d74d3b274e530208b7f2a76927f6b5728 Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 16 May 2024 19:19:51 +0000 Subject: [PATCH 159/392] Fix tests flakiness --- .../Serializations/SerializationDynamic.cpp | 2 +- .../03037_dynamic_merges_1.reference | 120 ------------------ ...3037_dynamic_merges_1_horizontal.reference | 60 +++++++++ .../03037_dynamic_merges_1_horizontal.sh | 52 ++++++++ .../03037_dynamic_merges_1_vertical.reference | 60 +++++++++ ....sh => 03037_dynamic_merges_1_vertical.sh} | 17 +-- .../03039_dynamic_all_merge_algorithms_1.sh | 6 +- .../03040_dynamic_type_alters_1.sh | 2 +- 8 files changed, 180 insertions(+), 139 deletions(-) delete mode 100644 tests/queries/0_stateless/03037_dynamic_merges_1.reference create mode 100644 tests/queries/0_stateless/03037_dynamic_merges_1_horizontal.reference create mode 100755 tests/queries/0_stateless/03037_dynamic_merges_1_horizontal.sh create mode 100644 tests/queries/0_stateless/03037_dynamic_merges_1_vertical.reference rename tests/queries/0_stateless/{03037_dynamic_merges_1.sh => 03037_dynamic_merges_1_vertical.sh} (79%) diff --git a/src/DataTypes/Serializations/SerializationDynamic.cpp b/src/DataTypes/Serializations/SerializationDynamic.cpp index cb9d4a2f7bc..6351ff0ca0b 100644 --- a/src/DataTypes/Serializations/SerializationDynamic.cpp +++ b/src/DataTypes/Serializations/SerializationDynamic.cpp @@ -33,7 +33,7 @@ struct SerializeBinaryBulkStateDynamic : public ISerialization::SerializeBinaryB /// Variants statistics. Map (Variant name) -> (Variant size). 
ColumnDynamic::Statistics statistics = { .source = ColumnDynamic::Statistics::Source::READ, .data = {} }; - SerializeBinaryBulkStateDynamic(UInt64 structure_version_) : structure_version(structure_version_) {} + explicit SerializeBinaryBulkStateDynamic(UInt64 structure_version_) : structure_version(structure_version_) {} }; struct DeserializeBinaryBulkStateDynamic : public ISerialization::DeserializeBinaryBulkState diff --git a/tests/queries/0_stateless/03037_dynamic_merges_1.reference b/tests/queries/0_stateless/03037_dynamic_merges_1.reference deleted file mode 100644 index 0a647b41c4b..00000000000 --- a/tests/queries/0_stateless/03037_dynamic_merges_1.reference +++ /dev/null @@ -1,120 +0,0 @@ -MergeTree compact + horizontal merge -test -50000 DateTime -60000 Date -70000 Array(UInt16) -80000 String -100000 None -100000 UInt64 -70000 Array(UInt16) -100000 None -100000 UInt64 -190000 String -70000 Array(UInt16) -100000 None -100000 UInt64 -190000 String -200000 Map(UInt64, UInt64) -100000 None -100000 UInt64 -200000 Map(UInt64, UInt64) -260000 String -10000 Tuple(UInt64, UInt64) -100000 None -100000 UInt64 -200000 Map(UInt64, UInt64) -260000 String -100000 None -100000 UInt64 -200000 Map(UInt64, UInt64) -270000 String -MergeTree wide + horizontal merge -test -50000 DateTime -60000 Date -70000 Array(UInt16) -80000 String -100000 None -100000 UInt64 -70000 Array(UInt16) -100000 None -100000 UInt64 -190000 String -70000 Array(UInt16) -100000 None -100000 UInt64 -190000 String -200000 Map(UInt64, UInt64) -100000 None -100000 UInt64 -200000 Map(UInt64, UInt64) -260000 String -10000 Tuple(UInt64, UInt64) -100000 None -100000 UInt64 -200000 Map(UInt64, UInt64) -260000 String -100000 None -100000 UInt64 -200000 Map(UInt64, UInt64) -270000 String -MergeTree compact + vertical merge -test -50000 DateTime -60000 Date -70000 Array(UInt16) -80000 String -100000 None -100000 UInt64 -70000 Array(UInt16) -100000 None -100000 UInt64 -190000 String -70000 Array(UInt16) -100000 None -100000 UInt64 -190000 String -200000 Map(UInt64, UInt64) -100000 None -100000 UInt64 -200000 Map(UInt64, UInt64) -260000 String -10000 Tuple(UInt64, UInt64) -100000 None -100000 UInt64 -200000 Map(UInt64, UInt64) -260000 String -100000 None -100000 UInt64 -200000 Map(UInt64, UInt64) -270000 String -MergeTree wide + vertical merge -test -50000 DateTime -60000 Date -70000 Array(UInt16) -80000 String -100000 None -100000 UInt64 -70000 Array(UInt16) -100000 None -100000 UInt64 -190000 String -70000 Array(UInt16) -100000 None -100000 UInt64 -190000 String -200000 Map(UInt64, UInt64) -100000 None -100000 UInt64 -200000 Map(UInt64, UInt64) -260000 String -10000 Tuple(UInt64, UInt64) -100000 None -100000 UInt64 -200000 Map(UInt64, UInt64) -260000 String -100000 None -100000 UInt64 -200000 Map(UInt64, UInt64) -270000 String diff --git a/tests/queries/0_stateless/03037_dynamic_merges_1_horizontal.reference b/tests/queries/0_stateless/03037_dynamic_merges_1_horizontal.reference new file mode 100644 index 00000000000..59297e46330 --- /dev/null +++ b/tests/queries/0_stateless/03037_dynamic_merges_1_horizontal.reference @@ -0,0 +1,60 @@ +MergeTree compact +test +50000 DateTime +60000 Date +70000 Array(UInt16) +80000 String +100000 None +100000 UInt64 +70000 Array(UInt16) +100000 None +100000 UInt64 +190000 String +70000 Array(UInt16) +100000 None +100000 UInt64 +190000 String +200000 Map(UInt64, UInt64) +100000 None +100000 UInt64 +200000 Map(UInt64, UInt64) +260000 String +10000 Tuple(UInt64, UInt64) +100000 None +100000 UInt64 +200000 
Map(UInt64, UInt64) +260000 String +100000 None +100000 UInt64 +200000 Map(UInt64, UInt64) +270000 String +MergeTree wide +test +50000 DateTime +60000 Date +70000 Array(UInt16) +80000 String +100000 None +100000 UInt64 +70000 Array(UInt16) +100000 None +100000 UInt64 +190000 String +70000 Array(UInt16) +100000 None +100000 UInt64 +190000 String +200000 Map(UInt64, UInt64) +100000 None +100000 UInt64 +200000 Map(UInt64, UInt64) +260000 String +10000 Tuple(UInt64, UInt64) +100000 None +100000 UInt64 +200000 Map(UInt64, UInt64) +260000 String +100000 None +100000 UInt64 +200000 Map(UInt64, UInt64) +270000 String diff --git a/tests/queries/0_stateless/03037_dynamic_merges_1_horizontal.sh b/tests/queries/0_stateless/03037_dynamic_merges_1_horizontal.sh new file mode 100755 index 00000000000..0d3cd45666a --- /dev/null +++ b/tests/queries/0_stateless/03037_dynamic_merges_1_horizontal.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash +# Tags: long + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# reset --log_comment +CLICKHOUSE_LOG_COMMENT= +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + + +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_dynamic_type=1 " + +function test() +{ + echo "test" + $CH_CLIENT -q "system stop merges test" + $CH_CLIENT -q "insert into test select number, number from numbers(100000)" + $CH_CLIENT -q "insert into test select number, 'str_' || toString(number) from numbers(80000)" + $CH_CLIENT -q "insert into test select number, range(number % 10 + 1) from numbers(70000)" + $CH_CLIENT -q "insert into test select number, toDate(number) from numbers(60000)" + $CH_CLIENT -q "insert into test select number, toDateTime(number) from numbers(50000)" + $CH_CLIENT -q "insert into test select number, NULL from numbers(100000)" + + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -nm -q "system start merges test; optimize table test final;" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + + $CH_CLIENT -q "system stop merges test" + $CH_CLIENT -q "insert into test select number, map(number, number) from numbers(200000)" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -nm -q "system start merges test; optimize table test final;" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + + $CH_CLIENT -q "system stop merges test" + $CH_CLIENT -q "insert into test select number, tuple(number, number) from numbers(10000)" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -nm -q "system start merges test; optimize table test final;" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" +} + +$CH_CLIENT -q "drop table if exists test;" + +echo "MergeTree compact" +$CH_CLIENT -q "create table test (id UInt64, d Dynamic(max_types=3)) engine=MergeTree order by id settings min_rows_for_wide_part=1000000000, min_bytes_for_wide_part=10000000000, vertical_merge_algorithm_min_columns_to_activate=10;" +test +$CH_CLIENT -q "drop table test;" + +echo "MergeTree wide" +$CH_CLIENT -q "create table test (id UInt64, d Dynamic(max_types=3)) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1, 
vertical_merge_algorithm_min_columns_to_activate=10;" +test +$CH_CLIENT -q "drop table test;" + diff --git a/tests/queries/0_stateless/03037_dynamic_merges_1_vertical.reference b/tests/queries/0_stateless/03037_dynamic_merges_1_vertical.reference new file mode 100644 index 00000000000..59297e46330 --- /dev/null +++ b/tests/queries/0_stateless/03037_dynamic_merges_1_vertical.reference @@ -0,0 +1,60 @@ +MergeTree compact +test +50000 DateTime +60000 Date +70000 Array(UInt16) +80000 String +100000 None +100000 UInt64 +70000 Array(UInt16) +100000 None +100000 UInt64 +190000 String +70000 Array(UInt16) +100000 None +100000 UInt64 +190000 String +200000 Map(UInt64, UInt64) +100000 None +100000 UInt64 +200000 Map(UInt64, UInt64) +260000 String +10000 Tuple(UInt64, UInt64) +100000 None +100000 UInt64 +200000 Map(UInt64, UInt64) +260000 String +100000 None +100000 UInt64 +200000 Map(UInt64, UInt64) +270000 String +MergeTree wide +test +50000 DateTime +60000 Date +70000 Array(UInt16) +80000 String +100000 None +100000 UInt64 +70000 Array(UInt16) +100000 None +100000 UInt64 +190000 String +70000 Array(UInt16) +100000 None +100000 UInt64 +190000 String +200000 Map(UInt64, UInt64) +100000 None +100000 UInt64 +200000 Map(UInt64, UInt64) +260000 String +10000 Tuple(UInt64, UInt64) +100000 None +100000 UInt64 +200000 Map(UInt64, UInt64) +260000 String +100000 None +100000 UInt64 +200000 Map(UInt64, UInt64) +270000 String diff --git a/tests/queries/0_stateless/03037_dynamic_merges_1.sh b/tests/queries/0_stateless/03037_dynamic_merges_1_vertical.sh similarity index 79% rename from tests/queries/0_stateless/03037_dynamic_merges_1.sh rename to tests/queries/0_stateless/03037_dynamic_merges_1_vertical.sh index 056f6702727..b2c40668228 100755 --- a/tests/queries/0_stateless/03037_dynamic_merges_1.sh +++ b/tests/queries/0_stateless/03037_dynamic_merges_1_vertical.sh @@ -7,8 +7,8 @@ CLICKHOUSE_LOG_COMMENT= # shellcheck source=../shell_config.sh . 
"$CUR_DIR"/../shell_config.sh -CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_dynamic_type=1" +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_dynamic_type=1 " function test() { @@ -40,23 +40,12 @@ function test() $CH_CLIENT -q "drop table if exists test;" -echo "MergeTree compact + horizontal merge" -$CH_CLIENT -q "create table test (id UInt64, d Dynamic(max_types=3)) engine=MergeTree order by id settings min_rows_for_wide_part=1000000000, min_bytes_for_wide_part=10000000000, vertical_merge_algorithm_min_columns_to_activate=10;" -test -$CH_CLIENT -q "drop table test;" - -echo "MergeTree wide + horizontal merge" -$CH_CLIENT -q "create table test (id UInt64, d Dynamic(max_types=3)) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1, vertical_merge_algorithm_min_columns_to_activate=10;" -test -$CH_CLIENT -q "drop table test;" - - -echo "MergeTree compact + vertical merge" +echo "MergeTree compact" $CH_CLIENT -q "create table test (id UInt64, d Dynamic(max_types=3)) engine=MergeTree order by id settings min_rows_for_wide_part=1000000000, min_bytes_for_wide_part=10000000000, vertical_merge_algorithm_min_rows_to_activate=1, vertical_merge_algorithm_min_columns_to_activate=1;" test $CH_CLIENT -q "drop table test;" -echo "MergeTree wide + vertical merge" +echo "MergeTree wide" $CH_CLIENT -q "create table test (id UInt64, d Dynamic(max_types=3)) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1, vertical_merge_algorithm_min_rows_to_activate=1, vertical_merge_algorithm_min_columns_to_activate=1;" test $CH_CLIENT -q "drop table test;" diff --git a/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_1.sh b/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_1.sh index 198c6ca93ff..0941f2da369 100755 --- a/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_1.sh +++ b/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_1.sh @@ -7,7 +7,7 @@ CLICKHOUSE_LOG_COMMENT= # shellcheck source=../shell_config.sh . 
"$CUR_DIR"/../shell_config.sh -CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_dynamic_type=1" +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_merge_tree_settings --allow_experimental_dynamic_type=1 --optimize_aggregation_in_order 0" function test() @@ -53,10 +53,10 @@ function test() $CH_CLIENT -q "drop table if exists test;" echo "MergeTree compact + horizontal merge" -test "min_rows_for_wide_part=100000000000, min_bytes_for_wide_part=1000000000000" +test "min_rows_for_wide_part=100000000000, min_bytes_for_wide_part=1000000000000, vertical_merge_algorithm_min_rows_to_activate=10000000000, vertical_merge_algorithm_min_columns_to_activate=100000000000" echo "MergeTree wide + horizontal merge" -test "min_rows_for_wide_part=1, min_bytes_for_wide_part=1" +test "min_rows_for_wide_part=1, min_bytes_for_wide_part=1,vertical_merge_algorithm_min_rows_to_activate=1000000000, vertical_merge_algorithm_min_columns_to_activate=1000000000000" echo "MergeTree compact + vertical merge" test "min_rows_for_wide_part=100000000000, min_bytes_for_wide_part=1000000000000, vertical_merge_algorithm_min_rows_to_activate=1, vertical_merge_algorithm_min_columns_to_activate=1" diff --git a/tests/queries/0_stateless/03040_dynamic_type_alters_1.sh b/tests/queries/0_stateless/03040_dynamic_type_alters_1.sh index 1f2a6a31ad7..7a73be20a4d 100755 --- a/tests/queries/0_stateless/03040_dynamic_type_alters_1.sh +++ b/tests/queries/0_stateless/03040_dynamic_type_alters_1.sh @@ -7,7 +7,7 @@ CLICKHOUSE_LOG_COMMENT= # shellcheck source=../shell_config.sh . "$CUR_DIR"/../shell_config.sh -CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_dynamic_type=1 --allow_experimental_variant_type=1 --use_variant_as_common_type=1" +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_dynamic_type=1 --allow_experimental_variant_type=1 --use_variant_as_common_type=1 --allow_experimental_analyzer=1" function run() { From 9dbc9f038b6e316b4227a54b4a70e1e0eb8f7361 Mon Sep 17 00:00:00 2001 From: copperybean Date: Fri, 17 May 2024 11:11:53 +0800 Subject: [PATCH 160/392] fix comments second time Change-Id: I4b75367233f99ef432cdff78f724195673755a83 --- src/Core/SettingsChangesHistory.h | 2 +- .../Formats/Impl/Parquet/ParquetDataValuesReader.cpp | 3 +++ .../Formats/Impl/Parquet/ParquetRecordReader.cpp | 12 ++++++++---- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index 6fb8fb9358c..96ab7490c1f 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -91,6 +91,7 @@ static std::map sett {"cross_join_min_rows_to_compress", 0, 10000000, "A new setting."}, {"cross_join_min_bytes_to_compress", 0, 1_GiB, "A new setting."}, {"prefer_external_sort_block_bytes", 0, DEFAULT_BLOCK_SIZE * 256, "Prefer maximum block bytes for external sort, reduce the memory usage during merging."}, + {"input_format_parquet_use_native_reader", false, false, "When reading Parquet files, to use native reader instead of arrow reader."}, {"input_format_force_null_for_omitted_fields", false, false, "Disable type-defaults for omitted fields when needed"}, {"output_format_pretty_preserve_border_for_multiline_string", 1, 1, "Applies better rendering for multiline strings."}, }}, @@ -176,7 +177,6 @@ static std::map sett {"default_normal_view_sql_security", "INVOKER", "INVOKER", "Allows to set default `SQL SECURITY` option while creating a normal view"}, {"mysql_map_string_to_text_in_show_columns", false, true, "Reduce the configuration effort to connect ClickHouse with BI tools."}, 
{"mysql_map_fixed_string_to_text_in_show_columns", false, true, "Reduce the configuration effort to connect ClickHouse with BI tools."}, - {"input_format_parquet_use_native_reader", false, false, "When reading Parquet files, to use native reader instead of arrow reader."}, }}, {"24.1", {{"print_pretty_type_names", false, true, "Better user experience."}, {"input_format_json_read_bools_as_strings", false, true, "Allow to read bools as strings in JSON formats by default"}, diff --git a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp index 1f0c7105572..65f569ec264 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp +++ b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp @@ -29,6 +29,9 @@ void RleValuesReader::nextGroup() { cur_group_size *= 8; cur_packed_bit_values.resize(cur_group_size); + + // try to suppress clang tidy warnings by assertion + assert(bit_width < 64); bit_reader->GetBatch(bit_width, cur_packed_bit_values.data(), cur_group_size); } else diff --git a/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp b/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp index fddd8059925..0b797dd66ad 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp +++ b/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp @@ -192,6 +192,7 @@ std::unique_ptr ColReaderFactory::fromByteArray() switch (col_descriptor.logical_type()->type()) { case parquet::LogicalType::Type::STRING: + case parquet::LogicalType::Type::NONE: return makeLeafReader(); default: return throwUnsupported(); @@ -204,10 +205,13 @@ std::unique_ptr ColReaderFactory::fromFLBA() { case parquet::LogicalType::Type::DECIMAL: { - if (col_descriptor.type_length() <= static_cast(sizeof(Decimal128))) - return makeDecimalLeafReader(); - else if (col_descriptor.type_length() <= static_cast(sizeof(Decimal256))) - return makeDecimalLeafReader(); + if (col_descriptor.type_length() > 0) + { + if (col_descriptor.type_length() <= static_cast(sizeof(Decimal128))) + return makeDecimalLeafReader(); + else if (col_descriptor.type_length() <= static_cast(sizeof(Decimal256))) + return makeDecimalLeafReader(); + } return throwUnsupported(PreformattedMessage::create( ", invalid type length: {}", col_descriptor.type_length())); From cc583185bdfe7f336af795d95cd97ce65cbef10b Mon Sep 17 00:00:00 2001 From: Blargian Date: Fri, 17 May 2024 08:33:08 +0200 Subject: [PATCH 161/392] Add revision and make some formatting changes to other-functions page --- .../functions/other-functions.md | 437 ++++++++++++------ src/Functions/array/arrayUnion.cpp | 0 .../03155_function_array_clamp.sql | 11 + 3 files changed, 313 insertions(+), 135 deletions(-) create mode 100644 src/Functions/array/arrayUnion.cpp create mode 100755 tests/queries/0_stateless/03155_function_array_clamp.sql diff --git a/docs/en/sql-reference/functions/other-functions.md b/docs/en/sql-reference/functions/other-functions.md index 11ee471d709..5b77f16027b 100644 --- a/docs/en/sql-reference/functions/other-functions.md +++ b/docs/en/sql-reference/functions/other-functions.md @@ -6,11 +6,21 @@ sidebar_label: Other # Other Functions -## hostName() +## hostName Returns the name of the host on which this function was executed. If the function executes on a remote server (distributed processing), the remote server name is returned. 
If the function executes in the context of a distributed table, it generates a normal column with values relevant to each shard. Otherwise it produces a constant value. +**Syntax** + +```sql +hostName() +``` + +**Returned value** + +- Host name. [String](../data-types/string.md). + ## getMacro {#getMacro} Returns a named value from the [macros](../../operations/server-configuration-parameters/settings.md#macros) section of the server configuration. @@ -27,9 +37,7 @@ getMacro(name); **Returned value** -- Value of the specified macro. - -Type: [String](../../sql-reference/data-types/string.md). +- Value of the specified macro.[String](../../sql-reference/data-types/string.md). **Example** @@ -82,9 +90,7 @@ This function is case-insensitive. **Returned value** -- String with the fully qualified domain name. - -Type: `String`. +- String with the fully qualified domain name. [String](../data-types/string.md). **Example** @@ -163,34 +169,58 @@ Result: └────────────────┴────────────────────────────┘ ``` -## visibleWidth(x) +## visibleWidth Calculates the approximate width when outputting values to the console in text format (tab-separated). -This function is used by the system to implement Pretty formats. +This function is used by the system to implement [Pretty formats](../formats.mdx). `NULL` is represented as a string corresponding to `NULL` in `Pretty` formats. +**Syntax** + +```sql +visibleWidth(x) +``` + +**Example** + +Query: + ```sql SELECT visibleWidth(NULL) ``` +Result: + ```text ┌─visibleWidth(NULL)─┐ │ 4 │ └────────────────────┘ ``` -## toTypeName(x) +## toTypeName Returns the type name of the passed argument. If `NULL` is passed, then the function returns type `Nullable(Nothing)`, which corresponds to ClickHouse's internal `NULL` representation. -## blockSize() {#blockSize} +**Syntax** + +```sql +toTypeName(x) +``` + +## blockSize {#blockSize} In ClickHouse, queries are processed in blocks (chunks). This function returns the size (row count) of the block the function is called on. +**Syntax** + +```sql +blockSize() +``` + ## byteSize Returns an estimation of uncompressed byte size of its arguments in memory. @@ -207,9 +237,7 @@ byteSize(argument [, ...]) **Returned value** -- Estimation of byte size of the arguments in memory. - -Type: [UInt64](../../sql-reference/data-types/int-uint.md). +- Estimation of byte size of the arguments in memory. [UInt64](../../sql-reference/data-types/int-uint.md). **Examples** @@ -288,16 +316,28 @@ Result: └────────────────────────────┘ ``` -## materialize(x) +## materialize Turns a constant into a full column containing a single value. Full columns and constants are represented differently in memory. Functions usually execute different code for normal and constant arguments, although the result should typically be the same. This function can be used to debug this behavior. -## ignore(…) +**Syntax** + +```sql +materialize(x) +``` + +## ignore Accepts any arguments, including `NULL` and does nothing. Always returns 0. The argument is internally still evaluated. Useful e.g. for benchmarks. +**Syntax** + +```sql +ignore(…) +``` + ## sleep Used to introduce a delay or pause in the execution of a query. It is primarily used for testing and debugging purposes. 
@@ -392,27 +432,33 @@ The `sleepEachRow()` function is primarily used for testing and debugging purpos Like the [`sleep()` function](#sleep), it's important to use `sleepEachRow()` judiciously and only when necessary, as it can significantly impact the overall performance and responsiveness of your ClickHouse system, especially when dealing with large result sets. -## currentDatabase() +## currentDatabase Returns the name of the current database. Useful in table engine parameters of `CREATE TABLE` queries where you need to specify the database. -## currentUser() {#currentUser} +**Syntax** + +```sql +currentDatabase() +``` + +## currentUser {#currentUser} Returns the name of the current user. In case of a distributed query, the name of the user who initiated the query is returned. +**Syntax** + ```sql -SELECT currentUser(); +currentUser() ``` Aliases: `user()`, `USER()`, `current_user()`. Aliases are case insensitive. **Returned values** -- The name of the current user. -- In distributed queries, the login of the user who initiated the query. - -Type: `String`. +- The name of the current user. [String](../data-types/string.md). +- In distributed queries, the login of the user who initiated the query. [String](../data-types/string.md). **Example** @@ -448,10 +494,8 @@ isConstant(x) **Returned values** -- `1` if `x` is constant. -- `0` if `x` is non-constant. - -Type: [UInt8](../../sql-reference/data-types/int-uint.md). +- `1` if `x` is constant. [UInt8](../../sql-reference/data-types/int-uint.md). +- `0` if `x` is non-constant. [UInt8](../../sql-reference/data-types/int-uint.md). **Examples** @@ -497,14 +541,26 @@ Result: └────────────────────┘ ``` -## isFinite(x) +## isFinite Returns 1 if the Float32 or Float64 argument not infinite and not a NaN, otherwise this function returns 0. -## isInfinite(x) +**Syntax** + +```sql +isFinite(x) +``` + +## isInfinite Returns 1 if the Float32 or Float64 argument is infinite, otherwise this function returns 0. Note that 0 is returned for a NaN. +**Syntax** + +```sql +isInfinite(x) +``` + ## ifNotFinite Checks whether a floating point value is finite. @@ -517,8 +573,8 @@ ifNotFinite(x,y) **Arguments** -- `x` — Value to check for infinity. Type: [Float\*](../../sql-reference/data-types/float.md). -- `y` — Fallback value. Type: [Float\*](../../sql-reference/data-types/float.md). +- `x` — Value to check for infinity. [Float\*](../../sql-reference/data-types/float.md). +- `y` — Fallback value. [Float\*](../../sql-reference/data-types/float.md). **Returned value** @@ -539,10 +595,16 @@ Result: You can get similar result by using the [ternary operator](../../sql-reference/functions/conditional-functions.md#ternary-operator): `isFinite(x) ? x : y`. -## isNaN(x) +## isNaN Returns 1 if the Float32 and Float64 argument is NaN, otherwise this function 0. +**Syntax** + +```sql +isNaN(x) +``` + ## hasColumnInTable Given the database name, the table name, and the column name as constant strings, returns 1 if the given column exists, otherwise 0. @@ -733,11 +795,19 @@ LIMIT 10 └────────────────┴─────────┘ ``` -## formatReadableDecimalSize(x) +## formatReadableDecimalSize Given a size (number of bytes), this function returns a readable, rounded size with suffix (KB, MB, etc.) as string. 
-Example: +**Syntax** + +```sql +formatReadableDecimalSize(x) +``` + +**Example** + +Query: ```sql SELECT @@ -745,6 +815,8 @@ SELECT formatReadableDecimalSize(filesize_bytes) AS filesize ``` +Result: + ```text ┌─filesize_bytes─┬─filesize───┐ │ 1 │ 1.00 B │ @@ -754,11 +826,20 @@ SELECT └────────────────┴────────────┘ ``` -## formatReadableSize(x) +## formatReadableSize Given a size (number of bytes), this function returns a readable, rounded size with suffix (KiB, MiB, etc.) as string. -Example: +**Syntax** + +```sql +formatReadableSize(x) +``` +Alias: `FORMAT_BYTES`. + +**Example** + +Query: ```sql SELECT @@ -766,7 +847,7 @@ SELECT formatReadableSize(filesize_bytes) AS filesize ``` -Alias: `FORMAT_BYTES`. +Result: ```text ┌─filesize_bytes─┬─filesize───┐ @@ -777,11 +858,19 @@ Alias: `FORMAT_BYTES`. └────────────────┴────────────┘ ``` -## formatReadableQuantity(x) +## formatReadableQuantity Given a number, this function returns a rounded number with suffix (thousand, million, billion, etc.) as string. -Example: +**Syntax** + +```sql +formatReadableQuantity(x) +``` + +**Example** + +Query: ```sql SELECT @@ -789,6 +878,8 @@ SELECT formatReadableQuantity(number) AS number_for_humans ``` +Result: + ```text ┌─────────number─┬─number_for_humans─┐ │ 1024 │ 1.02 thousand │ @@ -903,15 +994,27 @@ SELECT parseTimeDelta('1yr2mo') └──────────────────────────┘ ``` -## least(a, b) +## least Returns the smaller value of a and b. -## greatest(a, b) +**Syntax** + +```sql +least(a, b) +``` + +## greatest Returns the larger value of a and b. -## uptime() +**Syntax** + +```sql +greatest(a, b) +``` + +## uptime Returns the server’s uptime in seconds. If executed in the context of a distributed table, this function generates a normal column with values relevant to each shard. Otherwise it produces a constant value. @@ -924,9 +1027,7 @@ uptime() **Returned value** -- Time value of seconds. - -Type: [UInt32](/docs/en/sql-reference/data-types/int-uint.md). +- Time value of seconds. [UInt32](/docs/en/sql-reference/data-types/int-uint.md). **Example** @@ -944,7 +1045,7 @@ Result: └────────┘ ``` -## version() +## version Returns the current version of ClickHouse as a string in the form of: @@ -971,7 +1072,7 @@ None. **Returned value** -Type: [String](../data-types/string) +- Current version of ClickHouse. [String](../data-types/string) **Implementation details** @@ -993,23 +1094,47 @@ SELECT version() └───────────┘ ``` -## buildId() +## buildId Returns the build ID generated by a compiler for the running ClickHouse server binary. If executed in the context of a distributed table, this function generates a normal column with values relevant to each shard. Otherwise it produces a constant value. -## blockNumber() +**Syntax** + +```sql +buildId() +``` + +## blockNumber Returns the sequence number of the data block where the row is located. -## rowNumberInBlock() {#rowNumberInBlock} +**Syntax** + +```sql +blockNumber() +``` + +## rowNumberInBlock {#rowNumberInBlock} Returns the ordinal number of the row in the data block. Different data blocks are always recalculated. -## rowNumberInAllBlocks() +**Syntax** + +```sql +rowNumberInBlock() +``` + +## rowNumberInAllBlocks Returns the ordinal number of the row in the data block. This function only considers the affected data blocks. +**Syntax** + +```sql +rowNumberInAllBlocks() +``` + ## neighbor The window function that provides access to a row at a specified offset before or after the current row of a given column. 
@@ -1128,7 +1253,7 @@ Result: └────────────┴───────┴───────────┴────────────────┘ ``` -## runningDifference(x) {#runningDifference} +## runningDifference {#runningDifference} Calculates the difference between two consecutive row values in the data block. Returns 0 for the first row, and for subsequent rows the difference to the previous row. @@ -1143,7 +1268,15 @@ The result of the function depends on the affected data blocks and the order of The order of rows during calculation of `runningDifference()` can differ from the order of rows returned to the user. To prevent that you can create a subquery with [ORDER BY](../../sql-reference/statements/select/order-by.md) and call the function from outside the subquery. -Example: +**Syntax** + +```sql +runningDifference(x) +``` + +**Example** + +Query: ```sql SELECT @@ -1162,6 +1295,8 @@ FROM ) ``` +Result: + ```text ┌─EventID─┬───────────EventTime─┬─delta─┐ │ 1106 │ 2016-11-24 00:00:04 │ 0 │ @@ -1174,6 +1309,8 @@ FROM Please note that the block size affects the result. The internal state of `runningDifference` state is reset for each new block. +Query: + ```sql SELECT number, @@ -1182,6 +1319,8 @@ FROM numbers(100000) WHERE diff != 1 ``` +Result: + ```text ┌─number─┬─diff─┐ │ 0 │ 0 │ @@ -1191,6 +1330,8 @@ WHERE diff != 1 └────────┴──────┘ ``` +Query: + ```sql set max_block_size=100000 -- default value is 65536! @@ -1201,6 +1342,8 @@ FROM numbers(100000) WHERE diff != 1 ``` +Result: + ```text ┌─number─┬─diff─┐ │ 0 │ 0 │ @@ -1238,9 +1381,7 @@ runningConcurrency(start, end) **Returned values** -- The number of concurrent events at each event start time. - -Type: [UInt32](../../sql-reference/data-types/int-uint.md) +- The number of concurrent events at each event start time. [UInt32](../../sql-reference/data-types/int-uint.md). **Example** @@ -1272,23 +1413,43 @@ Result: └────────────┴────────────────────────────────┘ ``` -## MACNumToString(num) +## MACNumToString Interprets a UInt64 number as a MAC address in big endian format. Returns the corresponding MAC address in format AA:BB:CC:DD:EE:FF (colon-separated numbers in hexadecimal form) as string. -## MACStringToNum(s) +**Syntax** + +```sql +MACNumToString(num) +``` + +## MACStringToNum The inverse function of MACNumToString. If the MAC address has an invalid format, it returns 0. -## MACStringToOUI(s) +**Syntax** + +```sql +MACStringToNum(s) +``` + +## MACStringToOUI Given a MAC address in format AA:BB:CC:DD:EE:FF (colon-separated numbers in hexadecimal form), returns the first three octets as a UInt64 number. If the MAC address has an invalid format, it returns 0. +**Syntax** + +```sql +MACStringToOUI(s) +``` + ## getSizeOfEnumType Returns the number of fields in [Enum](../../sql-reference/data-types/enum.md). An exception is thrown if the type is not `Enum`. +**Syntax** + ```sql getSizeOfEnumType(value) ``` @@ -1349,6 +1510,8 @@ Result: Returns the internal name of the data type that represents the value. +**Syntax** + ```sql toColumnTypeName(value) ``` @@ -1427,6 +1590,8 @@ Returns the default value for the given data type. Does not include default values for custom columns set by the user. +**Syntax** + ```sql defaultValueOfArgumentType(expression) ``` @@ -1625,29 +1790,31 @@ Result: Creates an array with a single value. -Used for the internal implementation of [arrayJoin](../../sql-reference/functions/array-join.md#functions_arrayjoin). +:::note +This function is used for the internal implementation of [arrayJoin](../../sql-reference/functions/array-join.md#functions_arrayjoin). 
+::: + +**Syntax** ```sql -SELECT replicate(x, arr); +replicate(x, arr) ``` -**Arguments:** +**Arguments** -- `arr` — An array. - `x` — The value to fill the result array with. +- `arr` — An array. [Array](../data-types/array.md). **Returned value** -An array of the lame length as `arr` filled with value `x`. - -Type: `Array`. +An array of the lame length as `arr` filled with value `x`. [Array](../data-types/array.md). **Example** Query: ```sql -SELECT replicate(1, ['a', 'b', 'c']) +SELECT replicate(1, ['a', 'b', 'c']); ``` Result: @@ -1658,6 +1825,36 @@ Result: └───────────────────────────────┘ ``` +## revision + +Returns the current ClickHouse [server revision](../../operations/system-tables/metrics#revision). + +**Syntax** + +```sql +revision() +``` + +**Returned value** + +- The current ClickHouse server revision. [UInt32](../data-types/int-uint.md). + +**Example** + +Query: + +```sql +SELECT revision(); +``` + +Result: + +```response +┌─revision()─┐ +│ 54485 │ +└────────────┘ +``` + ## filesystemAvailable Returns the amount of free space in the filesystem hosting the database persistence. The returned value is always smaller than total free space ([filesystemFree](#filesystemfree)) because some space is reserved for the operating system. @@ -1670,9 +1867,7 @@ filesystemAvailable() **Returned value** -- The amount of remaining space available in bytes. - -Type: [UInt64](../../sql-reference/data-types/int-uint.md). +- The amount of remaining space available in bytes. [UInt64](../../sql-reference/data-types/int-uint.md). **Example** @@ -1702,9 +1897,7 @@ filesystemFree() **Returned value** -- The amount of free space in bytes. - -Type: [UInt64](../../sql-reference/data-types/int-uint.md). +- The amount of free space in bytes. [UInt64](../../sql-reference/data-types/int-uint.md). **Example** @@ -1734,9 +1927,7 @@ filesystemCapacity() **Returned value** -- Capacity of the filesystem in bytes. - -Type: [UInt64](../../sql-reference/data-types/int-uint.md). +- Capacity of the filesystem in bytes. [UInt64](../../sql-reference/data-types/int-uint.md). **Example** @@ -2100,7 +2291,7 @@ Result: └──────────────────────────────────────────────────┘ ``` -## catboostEvaluate(path_to_model, feature_1, feature_2, …, feature_n) +## catboostEvaluate :::note This function is not available in ClickHouse Cloud. @@ -2109,6 +2300,14 @@ This function is not available in ClickHouse Cloud. Evaluate an external catboost model. [CatBoost](https://catboost.ai) is an open-source gradient boosting library developed by Yandex for machine learning. Accepts a path to a catboost model and model arguments (features). Returns Float64. +**Syntax** + +```sql +catboostEvaluate(path_to_model, feature_1, feature_2, …, feature_n) +``` + +**Example** + ```sql SELECT feat1, ..., feat_n, catboostEvaluate('/path/to/model.bin', feat_1, ..., feat_n) AS prediction FROM data_table @@ -2145,10 +2344,16 @@ communicate using a HTTP interface. By default, port `9012` is used. A different See [Training and applying models](https://catboost.ai/docs/features/training.html#training) for how to train catboost models from a training data set. -## throwIf(x\[, message\[, error_code\]\]) +## throwIf Throw an exception if argument `x` is true. +**Syntax** + +```sql +throwIf(x\[, message\[, error_code\]\]) +``` + **Arguments** - `x` - the condition to check. @@ -2284,9 +2489,7 @@ countDigits(x) **Returned value** -Number of digits. - -Type: [UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges). +- Number of digits. 
[UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges). :::note For `Decimal` values takes into account their scales: calculates result over underlying integer type which is `(value * scale)`. For example: `countDigits(42) = 2`, `countDigits(42.000) = 5`, `countDigits(0.04200) = 4`. I.e. you may check decimal overflow for `Decimal64` with `countDecimal(x) > 18`. It's a slow variant of [isDecimalOverflow](#is-decimal-overflow). @@ -2310,9 +2513,7 @@ Result: ## errorCodeToName -Returns the textual name of an error code. - -Type: [LowCardinality(String)](../../sql-reference/data-types/lowcardinality.md). +- Returns the textual name of an error code. [LowCardinality(String)](../../sql-reference/data-types/lowcardinality.md). **Syntax** @@ -2343,9 +2544,7 @@ tcpPort() **Returned value** -- The TCP port number. - -Type: [UInt16](../../sql-reference/data-types/int-uint.md). +- The TCP port number. [UInt16](../../sql-reference/data-types/int-uint.md). **Example** @@ -2381,9 +2580,7 @@ currentProfiles() **Returned value** -- List of the current user settings profiles. - -Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +- List of the current user settings profiles. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). ## enabledProfiles @@ -2397,9 +2594,7 @@ enabledProfiles() **Returned value** -- List of the enabled settings profiles. - -Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +- List of the enabled settings profiles. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). ## defaultProfiles @@ -2413,9 +2608,7 @@ defaultProfiles() **Returned value** -- List of the default settings profiles. - -Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +- List of the default settings profiles. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). ## currentRoles @@ -2429,9 +2622,7 @@ currentRoles() **Returned value** -- A list of the current roles for the current user. - -Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +- A list of the current roles for the current user. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). ## enabledRoles @@ -2445,9 +2636,7 @@ enabledRoles() **Returned value** -- List of the enabled roles for the current user. - -Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +- List of the enabled roles for the current user. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). ## defaultRoles @@ -2461,9 +2650,7 @@ defaultRoles() **Returned value** -- List of the default roles for the current user. - -Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +- List of the default roles for the current user. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). ## getServerPort @@ -2492,9 +2679,7 @@ getServerPort(port_name) **Returned value** -- The number of the server port. - -Type: [UInt16](../../sql-reference/data-types/int-uint.md). +- The number of the server port. [UInt16](../../sql-reference/data-types/int-uint.md). 
**Example** @@ -2526,9 +2711,7 @@ queryID() **Returned value** -- The ID of the current query. - -Type: [String](../../sql-reference/data-types/string.md) +- The ID of the current query. [String](../../sql-reference/data-types/string.md). **Example** @@ -2562,9 +2745,7 @@ initialQueryID() **Returned value** -- The ID of the initial current query. - -Type: [String](../../sql-reference/data-types/string.md) +- The ID of the initial current query. [String](../../sql-reference/data-types/string.md). **Example** @@ -2597,9 +2778,7 @@ shardNum() **Returned value** -- Shard index or constant `0`. - -Type: [UInt32](../../sql-reference/data-types/int-uint.md). +- Shard index or constant `0`. [UInt32](../../sql-reference/data-types/int-uint.md). **Example** @@ -2639,9 +2818,7 @@ shardCount() **Returned value** -- Total number of shards or `0`. - -Type: [UInt32](../../sql-reference/data-types/int-uint.md). +- Total number of shards or `0`. [UInt32](../../sql-reference/data-types/int-uint.md). **See Also** @@ -2663,9 +2840,7 @@ getOSKernelVersion() **Returned value** -- The current OS kernel version. - -Type: [String](../../sql-reference/data-types/string.md). +- The current OS kernel version. [String](../../sql-reference/data-types/string.md). **Example** @@ -2699,9 +2874,7 @@ zookeeperSessionUptime() **Returned value** -- Uptime of the current ZooKeeper session in seconds. - -Type: [UInt32](../../sql-reference/data-types/int-uint.md). +- Uptime of the current ZooKeeper session in seconds. [UInt32](../../sql-reference/data-types/int-uint.md). **Example** @@ -2738,9 +2911,7 @@ All arguments must be constant. **Returned value** -- Randomly generated table structure. - -Type: [String](../../sql-reference/data-types/string.md). +- Randomly generated table structure. [String](../../sql-reference/data-types/string.md). **Examples** @@ -2807,9 +2978,7 @@ structureToCapnProtoSchema(structure) **Returned value** -- CapnProto schema - -Type: [String](../../sql-reference/data-types/string.md). +- CapnProto schema. [String](../../sql-reference/data-types/string.md). **Examples** @@ -2908,9 +3077,7 @@ structureToProtobufSchema(structure) **Returned value** -- Protobuf schema - -Type: [String](../../sql-reference/data-types/string.md). +- Protobuf schema. [String](../../sql-reference/data-types/string.md). **Examples** diff --git a/src/Functions/array/arrayUnion.cpp b/src/Functions/array/arrayUnion.cpp new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/03155_function_array_clamp.sql b/tests/queries/0_stateless/03155_function_array_clamp.sql new file mode 100755 index 00000000000..4794dafda4b --- /dev/null +++ b/tests/queries/0_stateless/03155_function_array_clamp.sql @@ -0,0 +1,11 @@ +#!/usr/bin/env bash +# Tags: no-fasttest, no-parallel, no-ordinary-database, long + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +# In previous versions this command took longer than ten minutes. 
Now it takes less than a second in release mode: + +python3 -c 'import sys; import struct; sys.stdout.buffer.write(b"".join(struct.pack(" Date: Fri, 17 May 2024 08:44:25 +0200 Subject: [PATCH 162/392] Remove files which shouldn't be on this branch --- src/Functions/array/arrayUnion.cpp | 0 .../0_stateless/03155_function_array_clamp.sql | 11 ----------- 2 files changed, 11 deletions(-) delete mode 100644 src/Functions/array/arrayUnion.cpp delete mode 100755 tests/queries/0_stateless/03155_function_array_clamp.sql diff --git a/src/Functions/array/arrayUnion.cpp b/src/Functions/array/arrayUnion.cpp deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/tests/queries/0_stateless/03155_function_array_clamp.sql b/tests/queries/0_stateless/03155_function_array_clamp.sql deleted file mode 100755 index 4794dafda4b..00000000000 --- a/tests/queries/0_stateless/03155_function_array_clamp.sql +++ /dev/null @@ -1,11 +0,0 @@ -#!/usr/bin/env bash -# Tags: no-fasttest, no-parallel, no-ordinary-database, long - -CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) -# shellcheck source=../shell_config.sh -. "$CUR_DIR"/../shell_config.sh - -# In previous versions this command took longer than ten minutes. Now it takes less than a second in release mode: - -python3 -c 'import sys; import struct; sys.stdout.buffer.write(b"".join(struct.pack(" Date: Thu, 16 May 2024 18:17:46 +0200 Subject: [PATCH 163/392] Support for archives (unfinished) --- src/IO/S3/URI.h | 1 + .../ObjectStorage/ReadBufferIterator.cpp | 34 ++-- .../ObjectStorage/S3/Configuration.cpp | 8 + src/Storages/ObjectStorage/S3/Configuration.h | 3 + .../ObjectStorage/StorageObjectStorage.cpp | 10 ++ .../ObjectStorage/StorageObjectStorage.h | 4 + .../StorageObjectStorageSource.cpp | 146 +++++++++++++++++- .../StorageObjectStorageSource.h | 70 ++++++++- 8 files changed, 255 insertions(+), 21 deletions(-) diff --git a/src/IO/S3/URI.h b/src/IO/S3/URI.h index c52e6bc1441..363f98c46f5 100644 --- a/src/IO/S3/URI.h +++ b/src/IO/S3/URI.h @@ -29,6 +29,7 @@ struct URI std::string key; std::string version_id; std::string storage_name; + /// Path (or path pattern) in archive if uri is an archive. 
std::optional archive_pattern; std::string uri_str; diff --git a/src/Storages/ObjectStorage/ReadBufferIterator.cpp b/src/Storages/ObjectStorage/ReadBufferIterator.cpp index 3705725ffe1..61575b0115a 100644 --- a/src/Storages/ObjectStorage/ReadBufferIterator.cpp +++ b/src/Storages/ObjectStorage/ReadBufferIterator.cpp @@ -1,5 +1,6 @@ #include #include +#include #include @@ -244,22 +245,35 @@ ReadBufferIterator::Data ReadBufferIterator::next() } } - std::unique_ptr read_buffer = object_storage->readObject( - StoredObject(current_object_info->relative_path), - getContext()->getReadSettings(), - {}, - current_object_info->metadata->size_bytes); + std::unique_ptr read_buf; + CompressionMethod compression_method; + using ObjectInfoInArchive = StorageObjectStorageSource::ArchiveIterator::ObjectInfoInArchive; + if (auto object_info_in_archive = dynamic_cast(current_object_info.get())) + { + compression_method = chooseCompressionMethod(configuration->getPathInArchive(), configuration->compression_method); + auto & archive_reader = object_info_in_archive->archive_reader; + read_buf = archive_reader->readFile(object_info_in_archive->path_in_archive, /*throw_on_not_found=*/true); + } + else + { + compression_method = chooseCompressionMethod(current_object_info->relative_path, configuration->compression_method); + read_buf = object_storage->readObject( + StoredObject(current_object_info->relative_path), + getContext()->getReadSettings(), + {}, + current_object_info->metadata->size_bytes); + } - if (!query_settings.skip_empty_files || !read_buffer->eof()) + if (!query_settings.skip_empty_files || !read_buf->eof()) { first = false; - read_buffer = wrapReadBufferWithCompressionMethod( - std::move(read_buffer), - chooseCompressionMethod(current_object_info->relative_path, configuration->compression_method), + read_buf = wrapReadBufferWithCompressionMethod( + std::move(read_buf), + compression_method, static_cast(getContext()->getSettingsRef().zstd_window_log_max)); - return {std::move(read_buffer), std::nullopt, format}; + return {std::move(read_buf), std::nullopt, format}; } } } diff --git a/src/Storages/ObjectStorage/S3/Configuration.cpp b/src/Storages/ObjectStorage/S3/Configuration.cpp index 9fcbc6a6816..00d569fea9f 100644 --- a/src/Storages/ObjectStorage/S3/Configuration.cpp +++ b/src/Storages/ObjectStorage/S3/Configuration.cpp @@ -55,6 +55,14 @@ String StorageS3Configuration::getDataSourceDescription() return std::filesystem::path(url.uri.getHost() + std::to_string(url.uri.getPort())) / url.bucket; } +std::string StorageS3Configuration::getPathInArchive() const +{ + if (url.archive_pattern.has_value()) + return url.archive_pattern.value(); + + throw Exception(ErrorCodes::LOGICAL_ERROR, "Path {} is not an archive", getPath()); +} + void StorageS3Configuration::check(ContextPtr context) const { validateNamespace(url.bucket); diff --git a/src/Storages/ObjectStorage/S3/Configuration.h b/src/Storages/ObjectStorage/S3/Configuration.h index 0bd7f1ab108..de6c02d5020 100644 --- a/src/Storages/ObjectStorage/S3/Configuration.h +++ b/src/Storages/ObjectStorage/S3/Configuration.h @@ -34,6 +34,9 @@ public: String getDataSourceDescription() override; StorageObjectStorage::QuerySettings getQuerySettings(const ContextPtr &) const override; + bool isArchive() const override { return url.archive_pattern.has_value(); } + std::string getPathInArchive() const override; + void check(ContextPtr context) const override; void validateNamespace(const String & name) const override; ConfigurationPtr clone() override { return 
std::make_shared(*this); } diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index bc5b347d1e0..73e3d861cff 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -452,6 +452,16 @@ std::string StorageObjectStorage::Configuration::getPathWithoutGlobs() const return getPath().substr(0, getPath().find_first_of("*?{")); } +bool StorageObjectStorage::Configuration::isPathInArchiveWithGlobs() const +{ + return getPathInArchive().find_first_of("*?{") != std::string::npos; +} + +std::string StorageObjectStorage::Configuration::getPathInArchive() const +{ + throw Exception(ErrorCodes::LOGICAL_ERROR, "Path {} is not archive", getPath()); +} + void StorageObjectStorage::Configuration::assertInitialized() const { if (!initialized) diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.h b/src/Storages/ObjectStorage/StorageObjectStorage.h index 26b153ca0db..7b118cb7e6b 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.h +++ b/src/Storages/ObjectStorage/StorageObjectStorage.h @@ -175,6 +175,10 @@ public: bool isNamespaceWithGlobs() const; virtual std::string getPathWithoutGlobs() const; + virtual bool isArchive() const { return false; } + bool isPathInArchiveWithGlobs() const; + virtual std::string getPathInArchive() const; + virtual void check(ContextPtr context) const; virtual void validateNamespace(const String & /* name */) const {} diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index 8d5df96ca6e..56905e6c29b 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -100,10 +101,11 @@ std::shared_ptr StorageObjectStorageSourc auto settings = configuration->getQuerySettings(local_context); + std::unique_ptr iterator; if (configuration->isPathWithGlobs()) { /// Iterate through disclosed globs and make a source for each file - return std::make_shared( + iterator = std::make_unique( object_storage, configuration, predicate, virtual_columns, local_context, read_keys, settings.list_object_keys_size, settings.throw_on_zero_files_match, file_progress_callback); @@ -123,10 +125,17 @@ std::shared_ptr StorageObjectStorageSourc copy_configuration->setPaths(keys); } - return std::make_shared( + iterator = std::make_unique( object_storage, copy_configuration, virtual_columns, read_keys, settings.ignore_non_existent_file, file_progress_callback); } + + if (configuration->isArchive()) + { + return std::make_shared(object_storage, configuration, std::move(iterator), local_context, read_keys); + } + + return iterator; } void StorageObjectStorageSource::lazyInitialize(size_t processor) @@ -262,9 +271,20 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade } else { - const auto compression_method = chooseCompressionMethod(object_info->relative_path, configuration->compression_method); + CompressionMethod compression_method; const auto max_parsing_threads = need_only_count ? 
std::optional(1) : std::nullopt; - read_buf = createReadBuffer(object_info->relative_path, object_info->metadata->size_bytes); + + if (auto object_info_in_archive = dynamic_cast(object_info.get())) + { + compression_method = chooseCompressionMethod(configuration->getPathInArchive(), configuration->compression_method); + auto & archive_reader = object_info_in_archive->archive_reader; + read_buf = archive_reader->readFile(object_info_in_archive->path_in_archive, /*throw_on_not_found=*/true); + } + else + { + compression_method = chooseCompressionMethod(object_info->relative_path, configuration->compression_method); + read_buf = createReadBuffer(*object_info); + } auto input_format = FormatFactory::instance().getInput( configuration->format, *read_buf, read_from_format_info.format_header, @@ -312,8 +332,10 @@ std::future StorageObjectStorageSource return create_reader_scheduler([=, this] { return createReader(processor); }, Priority{}); } -std::unique_ptr StorageObjectStorageSource::createReadBuffer(const String & key, size_t object_size) +std::unique_ptr StorageObjectStorageSource::createReadBuffer(const ObjectInfo & object_info) { + const auto & object_size = object_info.metadata->size_bytes; + auto read_settings = getContext()->getReadSettings().adjustBufferSize(object_size); read_settings.enable_filesystem_cache = false; /// FIXME: Changing this setting to default value breaks something around parquet reading @@ -333,7 +355,7 @@ std::unique_ptr StorageObjectStorageSource::createReadBuffer(const S LOG_TRACE(log, "Downloading object of size {} with initial prefetch", object_size); auto async_reader = object_storage->readObjects( - StoredObjects{StoredObject{key, /* local_path */ "", object_size}}, read_settings); + StoredObjects{StoredObject{object_info.relative_path, /* local_path */ "", object_size}}, read_settings); async_reader->setReadUntilEnd(); if (read_settings.remote_fs_prefetch) @@ -344,7 +366,7 @@ std::unique_ptr StorageObjectStorageSource::createReadBuffer(const S else { /// FIXME: this is inconsistent that readObject always reads synchronously ignoring read_method setting. 
- return object_storage->readObject(StoredObject(key), read_settings); + return object_storage->readObject(StoredObject(object_info.relative_path, "", object_size), read_settings); } } @@ -609,4 +631,114 @@ StorageObjectStorage::ObjectInfoPtr StorageObjectStorageSource::ReadTaskIterator return buffer[current_index]; } +static IArchiveReader::NameFilter createArchivePathFilter(const std::string & archive_pattern) +{ + auto matcher = std::make_shared(makeRegexpPatternFromGlobs(archive_pattern)); + if (!matcher->ok()) + { + throw Exception(ErrorCodes::CANNOT_COMPILE_REGEXP, + "Cannot compile regex from glob ({}): {}", + archive_pattern, matcher->error()); + } + return [matcher](const std::string & p) mutable { return re2::RE2::FullMatch(p, *matcher); }; +} + +StorageObjectStorageSource::ArchiveIterator::ObjectInfoInArchive::ObjectInfoInArchive( + ObjectInfoPtr archive_object_, + const std::string & path_in_archive_, + std::shared_ptr archive_reader_) + : archive_object(archive_object_) + , path_in_archive(path_in_archive_) + , archive_reader(archive_reader_) +{ +} + +StorageObjectStorageSource::ArchiveIterator::ArchiveIterator( + ObjectStoragePtr object_storage_, + ConfigurationPtr configuration_, + std::unique_ptr archives_iterator_, + ContextPtr context_, + ObjectInfos * read_keys_) + : IIterator("ArchiveIterator") + , WithContext(context_) + , object_storage(object_storage_) + , is_path_in_archive_with_globs(configuration_->isPathInArchiveWithGlobs()) + , archives_iterator(std::move(archives_iterator_)) + , filter(is_path_in_archive_with_globs ? createArchivePathFilter(configuration_->getPathInArchive()) : IArchiveReader::NameFilter{}) + , path_in_archive(is_path_in_archive_with_globs ? "" : configuration_->getPathInArchive()) + , read_keys(read_keys_) +{ +} + +std::shared_ptr +StorageObjectStorageSource::ArchiveIterator::createArchiveReader(ObjectInfoPtr object_info) const +{ + const auto size = object_info->metadata->size_bytes; + return DB::createArchiveReader( + /* path_to_archive */object_info->relative_path, + /* archive_read_function */[=, this]() + { + StoredObject stored_object(object_info->relative_path, "", size); + return object_storage->readObject(stored_object, getContext()->getReadSettings()); + }, + /* archive_size */size); +} + +StorageObjectStorageSource::ObjectInfoPtr +StorageObjectStorageSource::ArchiveIterator::nextImpl(size_t processor) +{ + std::unique_lock lock{next_mutex}; + while (true) + { + if (filter) + { + if (!file_enumerator) + { + archive_object = archives_iterator->next(processor); + if (!archive_object) + return {}; + + archive_reader = createArchiveReader(archive_object); + file_enumerator = archive_reader->firstFile(); + if (!file_enumerator) + continue; + } + else if (!file_enumerator->nextFile()) + { + file_enumerator.reset(); + continue; + } + + path_in_archive = file_enumerator->getFileName(); + if (!filter(path_in_archive)) + continue; + } + else + { + archive_object = archives_iterator->next(processor); + if (!archive_object) + return {}; + + if (!archive_object->metadata) + archive_object->metadata = object_storage->getObjectMetadata(archive_object->relative_path); + + archive_reader = createArchiveReader(archive_object); + if (!archive_reader->fileExists(path_in_archive)) + continue; + } + + auto object_in_archive = std::make_shared(archive_object, path_in_archive, archive_reader); + + if (read_keys != nullptr) + read_keys->push_back(object_in_archive); + + return object_in_archive; + } +} + +size_t 
StorageObjectStorageSource::ArchiveIterator::estimatedKeysCount() +{ + return archives_iterator->estimatedKeysCount(); +} + } diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.h b/src/Storages/ObjectStorage/StorageObjectStorageSource.h index abaf51edc4e..664aad56928 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.h @@ -1,10 +1,11 @@ #pragma once +#include +#include +#include #include #include -#include -#include #include -#include +#include namespace DB @@ -25,6 +26,7 @@ public: class ReadTaskIterator; class GlobIterator; class KeysIterator; + class ArchiveIterator; StorageObjectStorageSource( String name_, @@ -109,7 +111,7 @@ protected: /// Recreate ReadBuffer and Pipeline for each file. ReaderHolder createReader(size_t processor = 0); std::future createReaderAsync(size_t processor = 0); - std::unique_ptr createReadBuffer(const String & key, size_t object_size); + std::unique_ptr createReadBuffer(const ObjectInfo & object_info); void addNumRowsToCache(const String & path, size_t num_rows); std::optional tryGetNumRowsFromCache(const ObjectInfoPtr & object_info); @@ -218,4 +220,64 @@ private: std::atomic index = 0; bool ignore_non_existent_files; }; + +/* + * An archives iterator. + * Allows to iterate files inside one or many archives. + * `archives_iterator` is an iterator which iterates over different archives. + * There are two ways to read files in archives: + * 1. When we want to read one concrete file in each archive. + * In this case we go through all archives, check if this certain file + * exists within this archive and read it if it exists. + * 2. When we have a certain pattern of files we want to read in each archive. + * For this purpose we create a filter defined as IArchiveReader::NameFilter. + */ +class StorageObjectStorageSource::ArchiveIterator : public IIterator, private WithContext +{ +public: + explicit ArchiveIterator( + ObjectStoragePtr object_storage_, + ConfigurationPtr configuration_, + std::unique_ptr archives_iterator_, + ContextPtr context_, + ObjectInfos * read_keys_); + + size_t estimatedKeysCount() override; + + struct ObjectInfoInArchive : public ObjectInfo + { + ObjectInfoInArchive( + ObjectInfoPtr archive_object_, + const std::string & path_in_archive_, + std::shared_ptr archive_reader_); + + const ObjectInfoPtr archive_object; + const std::string path_in_archive; + const std::shared_ptr archive_reader; + }; + +private: + ObjectInfoPtr nextImpl(size_t processor) override; + std::shared_ptr createArchiveReader(ObjectInfoPtr object_info) const; + + const ObjectStoragePtr object_storage; + const bool is_path_in_archive_with_globs; + /// Iterator which iterates through different archives. + const std::unique_ptr archives_iterator; + /// Used when files inside archive are defined with a glob + const IArchiveReader::NameFilter filter = {}; + /// Current file inside the archive. + std::string path_in_archive = {}; + /// Read keys of files inside archives. + ObjectInfos * read_keys; + /// Object pointing to archive (NOT path within archive). + ObjectInfoPtr archive_object; + /// Reader of the archive. + std::shared_ptr archive_reader; + /// File enumerator inside the archive.
+ std::unique_ptr file_enumerator; + + std::mutex next_mutex; +}; + } From f0a2b85f052e88703ce6255addabeb842a47e8fe Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Fri, 17 May 2024 11:37:51 +0200 Subject: [PATCH 164/392] Fix test query --- .../02271_fix_column_matcher_and_column_transformer.sql | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02271_fix_column_matcher_and_column_transformer.sql b/tests/queries/0_stateless/02271_fix_column_matcher_and_column_transformer.sql index 245b2cc97e3..b2a04788bbb 100644 --- a/tests/queries/0_stateless/02271_fix_column_matcher_and_column_transformer.sql +++ b/tests/queries/0_stateless/02271_fix_column_matcher_and_column_transformer.sql @@ -61,6 +61,11 @@ CREATE TABLE github_events ) ENGINE = MergeTree ORDER BY (event_type, repo_name, created_at); -with top_repos as ( select repo_name from github_events where event_type = 'WatchEvent' and toDate(created_at) = today() - 1 group by repo_name order by count() desc limit 100 union distinct select repo_name from github_events where event_type = 'WatchEvent' and toMonday(created_at) = toMonday(today() - interval 1 week) group by repo_name order by count() desc limit 100 union distinct select repo_name from github_events where event_type = 'WatchEvent' and toStartOfMonth(created_at) = toStartOfMonth(today()) - interval 1 month group by repo_name order by count() desc limit 100 union distinct select repo_name from github_events where event_type = 'WatchEvent' and toYear(created_at) = toYear(today()) - 1 group by repo_name order by count() desc limit 100 ), last_day as ( select repo_name, count() as count_last_day, rowNumberInAllBlocks() + 1 as position_last_day from github_events where repo_name in (select repo_name from top_repos) and toDate(created_at) = today() - 1 group by repo_name order by count_last_day desc ), last_week as ( select repo_name, count() as count_last_week, rowNumberInAllBlocks() + 1 as position_last_week from github_events where repo_name in (select repo_name from top_repos) and toMonday(created_at) = toMonday(today()) - interval 1 week group by repo_name order by count_last_week desc ), last_month as ( select repo_name, count() as count_last_month, rowNumberInAllBlocks() + 1 as position_last_month from github_events where repo_name in (select repo_name from top_repos) and toStartOfMonth(created_at) = toStartOfMonth(today()) - interval 1 month group by repo_name order by count_last_month desc ) select d.repo_name, columns(count) from last_day d join last_week w on d.repo_name = w.repo_name join last_month m on d.repo_name = m.repo_name; +with + top_repos as ( select repo_name from github_events where event_type = 'WatchEvent' and toDate(created_at) = today() - 1 group by repo_name order by count() desc limit 100 union distinct select repo_name from github_events where event_type = 'WatchEvent' and toMonday(created_at) = toMonday(today() - interval 1 week) group by repo_name order by count() desc limit 100 union distinct select repo_name from github_events where event_type = 'WatchEvent' and toStartOfMonth(created_at) = toStartOfMonth(today()) - interval 1 month group by repo_name order by count() desc limit 100 union distinct select repo_name from github_events where event_type = 'WatchEvent' and toYear(created_at) = toYear(today()) - 1 group by repo_name order by count() desc limit 100 ), + last_day as ( select repo_name, count() as count_last_day, rowNumberInAllBlocks() + 1 as position_last_day from github_events where repo_name in 
(select repo_name from top_repos) and toDate(created_at) = today() - 1 group by repo_name order by count_last_day desc ), + last_week as ( select repo_name, count() as count_last_week, rowNumberInAllBlocks() + 1 as position_last_week from github_events where repo_name in (select repo_name from top_repos) and toMonday(created_at) = toMonday(today()) - interval 1 week group by repo_name order by count_last_week desc ), + last_month as ( select repo_name, count() as count_last_month, rowNumberInAllBlocks() + 1 as position_last_month from github_events where repo_name in (select repo_name from top_repos) and toStartOfMonth(created_at) = toStartOfMonth(today()) - interval 1 month group by repo_name order by count_last_month desc ) +select d.repo_name, columns('count') from last_day d join last_week w on d.repo_name = w.repo_name join last_month m on d.repo_name = m.repo_name; DROP TABLE github_events; From 61ee5e46ad50fcedd86f6d62d4c2bda2f6fedade Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Fri, 17 May 2024 11:43:33 +0200 Subject: [PATCH 165/392] Update the test --- .../03152_analyzer_columns_list.reference | 2 +- .../0_stateless/03152_analyzer_columns_list.sql | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/tests/queries/0_stateless/03152_analyzer_columns_list.reference b/tests/queries/0_stateless/03152_analyzer_columns_list.reference index eefa8ebd513..4e9025b5baf 100644 --- a/tests/queries/0_stateless/03152_analyzer_columns_list.reference +++ b/tests/queries/0_stateless/03152_analyzer_columns_list.reference @@ -1 +1 @@ -11323 8 +4 3 diff --git a/tests/queries/0_stateless/03152_analyzer_columns_list.sql b/tests/queries/0_stateless/03152_analyzer_columns_list.sql index 2b19cdf37a2..baed3a4ff68 100644 --- a/tests/queries/0_stateless/03152_analyzer_columns_list.sql +++ b/tests/queries/0_stateless/03152_analyzer_columns_list.sql @@ -1,3 +1,13 @@ -SELECT COLUMNS(license_text, library_name) APPLY (length) FROM system.licenses ORDER BY library_name LIMIT 1; +CREATE TABLE test +( + foo String, + bar String, +) +ENGINE = MergeTree() +ORDER BY (foo, bar); -SELECT COLUMNS(license_text, library_name, xyz) APPLY (length) FROM system.licenses; -- { serverError UNKNOWN_IDENTIFIER } +INSERT INTO test VALUES ('foo', 'bar1'); + +SELECT COLUMNS(bar, foo) APPLY (length) FROM test; + +SELECT COLUMNS(bar, foo, xyz) APPLY (length) FROM test; -- { serverError UNKNOWN_IDENTIFIER } From 53e992af4ff6c2df33f46c597498baa38c327ee3 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 17 May 2024 11:42:28 +0000 Subject: [PATCH 166/392] Remove some unnecessary UNREACHABLEs --- programs/keeper-client/Commands.cpp | 3 ++- programs/main.cpp | 2 +- src/Access/AccessEntityIO.cpp | 3 +-- src/Access/AccessRights.cpp | 1 - src/Access/IAccessStorage.cpp | 9 +++------ .../AggregateFunctionGroupArray.cpp | 13 ++++++------- .../AggregateFunctionSequenceNextNode.cpp | 1 - src/AggregateFunctions/AggregateFunctionSum.h | 13 ++++++------- src/Common/DateLUTImpl.cpp | 1 - src/Common/IntervalKind.cpp | 10 ---------- src/Common/TargetSpecific.cpp | 2 -- src/Common/ThreadProfileEvents.cpp | 1 - src/Common/ZooKeeper/IKeeper.cpp | 2 -- src/Compression/CompressionCodecDeflateQpl.cpp | 1 - src/Compression/CompressionCodecDoubleDelta.cpp | 3 +-- src/Coordination/KeeperReconfiguration.cpp | 8 +++++++- src/Coordination/KeeperServer.cpp | 3 ++- src/Core/Field.cpp | 1 - src/Core/Field.h | 2 -- src/DataTypes/Serializations/ISerialization.cpp | 1 - src/Disks/IO/CachedOnDiskReadBufferFromFile.h | 1 - 
.../MetadataStorageTransactionState.cpp | 1 - src/Disks/VolumeJBOD.cpp | 2 -- src/Formats/EscapingRuleUtils.cpp | 1 - src/Functions/FunctionsRound.h | 4 ---- src/Functions/PolygonUtils.h | 2 -- .../UserDefinedSQLObjectsZooKeeperStorage.cpp | 1 - src/IO/CompressionMethod.cpp | 1 - src/IO/HadoopSnappyReadBuffer.h | 1 - src/Interpreters/AggregatedDataVariants.cpp | 8 -------- src/Interpreters/Cache/FileSegment.cpp | 1 - src/Interpreters/ComparisonGraph.cpp | 1 - src/Interpreters/FilesystemCacheLog.cpp | 1 - src/Interpreters/HashJoin.cpp | 3 --- .../InterpreterTransactionControlQuery.cpp | 1 - src/Interpreters/SetVariants.cpp | 4 ---- src/Parsers/ASTExplainQuery.h | 2 -- src/Parsers/Lexer.cpp | 4 ---- .../Formats/Impl/MsgPackRowInputFormat.cpp | 1 - src/Processors/IProcessor.cpp | 2 -- src/Processors/QueryPlan/ReadFromMergeTree.cpp | 6 ------ src/Processors/QueryPlan/TotalsHavingStep.cpp | 2 -- src/Processors/Transforms/FillingTransform.cpp | 1 - .../Transforms/buildPushingToViewsChain.cpp | 2 -- src/Storages/MergeTree/BackgroundJobsAssignee.cpp | 1 - src/Storages/MergeTree/KeyCondition.cpp | 2 -- src/Storages/MergeTree/MergeTreeData.cpp | 2 -- src/Storages/MergeTree/MergeTreeDataWriter.cpp | 2 -- src/Storages/WindowView/StorageWindowView.cpp | 1 - 49 files changed, 29 insertions(+), 112 deletions(-) diff --git a/programs/keeper-client/Commands.cpp b/programs/keeper-client/Commands.cpp index ec5eaf5070c..38c3d4356f6 100644 --- a/programs/keeper-client/Commands.cpp +++ b/programs/keeper-client/Commands.cpp @@ -11,6 +11,7 @@ namespace DB namespace ErrorCodes { extern const int KEEPER_EXCEPTION; + extern const int UNEXPECTED_ZOOKEEPER_ERROR; } bool LSCommand::parse(IParser::Pos & pos, std::shared_ptr & node, Expected & expected) const @@ -441,7 +442,7 @@ void ReconfigCommand::execute(const DB::ASTKeeperQuery * query, DB::KeeperClient new_members = query->args[1].safeGet(); break; default: - UNREACHABLE(); + throw Exception(ErrorCodes::UNEXPECTED_ZOOKEEPER_ERROR, "Unexpected operation: {}", operation); } auto response = client->zookeeper->reconfig(joining, leaving, new_members); diff --git a/programs/main.cpp b/programs/main.cpp index 4bb73399719..48985ea683f 100644 --- a/programs/main.cpp +++ b/programs/main.cpp @@ -155,8 +155,8 @@ auto instructionFailToString(InstructionFail fail) ret("AVX2"); case InstructionFail::AVX512: ret("AVX512"); +#undef ret } - UNREACHABLE(); } diff --git a/src/Access/AccessEntityIO.cpp b/src/Access/AccessEntityIO.cpp index b0dfd74c53b..1b073329296 100644 --- a/src/Access/AccessEntityIO.cpp +++ b/src/Access/AccessEntityIO.cpp @@ -144,8 +144,7 @@ AccessEntityPtr deserializeAccessEntity(const String & definition, const String catch (Exception & e) { e.addMessage("Could not parse " + file_path); - e.rethrow(); - UNREACHABLE(); + throw; } } diff --git a/src/Access/AccessRights.cpp b/src/Access/AccessRights.cpp index c10931f554c..dd25d3e4ac0 100644 --- a/src/Access/AccessRights.cpp +++ b/src/Access/AccessRights.cpp @@ -258,7 +258,6 @@ namespace case TABLE_LEVEL: return AccessFlags::allFlagsGrantableOnTableLevel(); case COLUMN_LEVEL: return AccessFlags::allFlagsGrantableOnColumnLevel(); } - UNREACHABLE(); } } diff --git a/src/Access/IAccessStorage.cpp b/src/Access/IAccessStorage.cpp index 8e51481e415..8d4e7d3073e 100644 --- a/src/Access/IAccessStorage.cpp +++ b/src/Access/IAccessStorage.cpp @@ -257,8 +257,7 @@ std::vector IAccessStorage::insert(const std::vector & mu } e.addMessage("After successfully inserting {}/{}: {}", successfully_inserted.size(), 
multiple_entities.size(), successfully_inserted_str); } - e.rethrow(); - UNREACHABLE(); + throw; } } @@ -361,8 +360,7 @@ std::vector IAccessStorage::remove(const std::vector & ids, bool thr } e.addMessage("After successfully removing {}/{}: {}", removed_names.size(), ids.size(), removed_names_str); } - e.rethrow(); - UNREACHABLE(); + throw; } } @@ -458,8 +456,7 @@ std::vector IAccessStorage::update(const std::vector & ids, const Up } e.addMessage("After successfully updating {}/{}: {}", names_of_updated.size(), ids.size(), names_of_updated_str); } - e.rethrow(); - UNREACHABLE(); + throw; } } diff --git a/src/AggregateFunctions/AggregateFunctionGroupArray.cpp b/src/AggregateFunctions/AggregateFunctionGroupArray.cpp index d4fb7afcb78..930b2c6ce73 100644 --- a/src/AggregateFunctions/AggregateFunctionGroupArray.cpp +++ b/src/AggregateFunctions/AggregateFunctionGroupArray.cpp @@ -60,14 +60,13 @@ struct GroupArrayTrait template constexpr const char * getNameByTrait() { - if (Trait::last) + if constexpr (Trait::last) return "groupArrayLast"; - if (Trait::sampler == Sampler::NONE) - return "groupArray"; - else if (Trait::sampler == Sampler::RNG) - return "groupArraySample"; - - UNREACHABLE(); + switch (Trait::sampler) + { + case Sampler::NONE: return "groupArray"; + case Sampler::RNG: return "groupArraySample"; + } } template diff --git a/src/AggregateFunctions/AggregateFunctionSequenceNextNode.cpp b/src/AggregateFunctions/AggregateFunctionSequenceNextNode.cpp index bed10333af0..a9dd53a75e8 100644 --- a/src/AggregateFunctions/AggregateFunctionSequenceNextNode.cpp +++ b/src/AggregateFunctions/AggregateFunctionSequenceNextNode.cpp @@ -414,7 +414,6 @@ public: break; return (i == events_size) ? base - i : unmatched_idx; } - UNREACHABLE(); } void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override diff --git a/src/AggregateFunctions/AggregateFunctionSum.h b/src/AggregateFunctions/AggregateFunctionSum.h index 58aaddf357a..2f23187d2ea 100644 --- a/src/AggregateFunctions/AggregateFunctionSum.h +++ b/src/AggregateFunctions/AggregateFunctionSum.h @@ -457,13 +457,12 @@ public: String getName() const override { - if constexpr (Type == AggregateFunctionTypeSum) - return "sum"; - else if constexpr (Type == AggregateFunctionTypeSumWithOverflow) - return "sumWithOverflow"; - else if constexpr (Type == AggregateFunctionTypeSumKahan) - return "sumKahan"; - UNREACHABLE(); + switch (Type) + { + case AggregateFunctionTypeSum: return "sum"; + case AggregateFunctionTypeSumWithOverflow: return "sumWithOverflow"; + case AggregateFunctionTypeSumKahan: return "sumKahan"; + } } explicit AggregateFunctionSum(const DataTypes & argument_types_) diff --git a/src/Common/DateLUTImpl.cpp b/src/Common/DateLUTImpl.cpp index 392ee64dcbf..c87d44a4b95 100644 --- a/src/Common/DateLUTImpl.cpp +++ b/src/Common/DateLUTImpl.cpp @@ -41,7 +41,6 @@ UInt8 getDayOfWeek(const cctz::civil_day & date) case cctz::weekday::saturday: return 6; case cctz::weekday::sunday: return 7; } - UNREACHABLE(); } inline cctz::time_point lookupTz(const cctz::time_zone & cctz_time_zone, const cctz::civil_day & date) diff --git a/src/Common/IntervalKind.cpp b/src/Common/IntervalKind.cpp index 22c7db504c3..1548d5cf9a5 100644 --- a/src/Common/IntervalKind.cpp +++ b/src/Common/IntervalKind.cpp @@ -34,8 +34,6 @@ Int64 IntervalKind::toAvgNanoseconds() const default: return toAvgSeconds() * NANOSECONDS_PER_SECOND; } - - UNREACHABLE(); } Int32 IntervalKind::toAvgSeconds() const @@ -54,7 +52,6 @@ Int32 IntervalKind::toAvgSeconds() const 
case IntervalKind::Kind::Quarter: return 7889238; /// Exactly 1/4 of a year. case IntervalKind::Kind::Year: return 31556952; /// The average length of a Gregorian year is equal to 365.2425 days } - UNREACHABLE(); } Float64 IntervalKind::toSeconds() const @@ -80,7 +77,6 @@ Float64 IntervalKind::toSeconds() const default: throw Exception(ErrorCodes::BAD_ARGUMENTS, "Not possible to get precise number of seconds in non-precise interval"); } - UNREACHABLE(); } bool IntervalKind::isFixedLength() const @@ -99,7 +95,6 @@ bool IntervalKind::isFixedLength() const case IntervalKind::Kind::Quarter: case IntervalKind::Kind::Year: return false; } - UNREACHABLE(); } IntervalKind IntervalKind::fromAvgSeconds(Int64 num_seconds) @@ -141,7 +136,6 @@ const char * IntervalKind::toKeyword() const case IntervalKind::Kind::Quarter: return "QUARTER"; case IntervalKind::Kind::Year: return "YEAR"; } - UNREACHABLE(); } @@ -161,7 +155,6 @@ const char * IntervalKind::toLowercasedKeyword() const case IntervalKind::Kind::Quarter: return "quarter"; case IntervalKind::Kind::Year: return "year"; } - UNREACHABLE(); } @@ -192,7 +185,6 @@ const char * IntervalKind::toDateDiffUnit() const case IntervalKind::Kind::Year: return "year"; } - UNREACHABLE(); } @@ -223,7 +215,6 @@ const char * IntervalKind::toNameOfFunctionToIntervalDataType() const case IntervalKind::Kind::Year: return "toIntervalYear"; } - UNREACHABLE(); } @@ -257,7 +248,6 @@ const char * IntervalKind::toNameOfFunctionExtractTimePart() const case IntervalKind::Kind::Year: return "toYear"; } - UNREACHABLE(); } diff --git a/src/Common/TargetSpecific.cpp b/src/Common/TargetSpecific.cpp index 49f396c0926..8540c9a9986 100644 --- a/src/Common/TargetSpecific.cpp +++ b/src/Common/TargetSpecific.cpp @@ -54,8 +54,6 @@ String toString(TargetArch arch) case TargetArch::AMXTILE: return "amxtile"; case TargetArch::AMXINT8: return "amxint8"; } - - UNREACHABLE(); } } diff --git a/src/Common/ThreadProfileEvents.cpp b/src/Common/ThreadProfileEvents.cpp index 6a63d484cd9..23b41f23bde 100644 --- a/src/Common/ThreadProfileEvents.cpp +++ b/src/Common/ThreadProfileEvents.cpp @@ -75,7 +75,6 @@ const char * TasksStatsCounters::metricsProviderString(MetricsProvider provider) case MetricsProvider::Netlink: return "netlink"; } - UNREACHABLE(); } bool TasksStatsCounters::checkIfAvailable() diff --git a/src/Common/ZooKeeper/IKeeper.cpp b/src/Common/ZooKeeper/IKeeper.cpp index 7d2602bde1e..7cca262baca 100644 --- a/src/Common/ZooKeeper/IKeeper.cpp +++ b/src/Common/ZooKeeper/IKeeper.cpp @@ -146,8 +146,6 @@ const char * errorMessage(Error code) case Error::ZSESSIONMOVED: return "Session moved to another server, so operation is ignored"; case Error::ZNOTREADONLY: return "State-changing request is passed to read-only server"; } - - UNREACHABLE(); } bool isHardwareError(Error zk_return_code) diff --git a/src/Compression/CompressionCodecDeflateQpl.cpp b/src/Compression/CompressionCodecDeflateQpl.cpp index 7e0653c69f8..f1b5b24e866 100644 --- a/src/Compression/CompressionCodecDeflateQpl.cpp +++ b/src/Compression/CompressionCodecDeflateQpl.cpp @@ -466,7 +466,6 @@ void CompressionCodecDeflateQpl::doDecompressData(const char * source, UInt32 so sw_codec->doDecompressData(source, source_size, dest, uncompressed_size); return; } - UNREACHABLE(); } void CompressionCodecDeflateQpl::flushAsynchronousDecompressRequests() diff --git a/src/Compression/CompressionCodecDoubleDelta.cpp b/src/Compression/CompressionCodecDoubleDelta.cpp index e6e8db4c699..78fdf5c627a 100644 --- 
a/src/Compression/CompressionCodecDoubleDelta.cpp +++ b/src/Compression/CompressionCodecDoubleDelta.cpp @@ -163,9 +163,8 @@ inline Int64 getMaxValueForByteSize(Int8 byte_size) case sizeof(UInt64): return std::numeric_limits::max(); default: - assert(false && "only 1, 2, 4 and 8 data sizes are supported"); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "only 1, 2, 4 and 8 data sizes are supported"); } - UNREACHABLE(); } struct WriteSpec diff --git a/src/Coordination/KeeperReconfiguration.cpp b/src/Coordination/KeeperReconfiguration.cpp index e3642913a7a..a2a06f92283 100644 --- a/src/Coordination/KeeperReconfiguration.cpp +++ b/src/Coordination/KeeperReconfiguration.cpp @@ -5,6 +5,12 @@ namespace DB { + +namespace ErrorCodes +{ + extern const int UNEXPECTED_ZOOKEEPER_ERROR; +} + ClusterUpdateActions joiningToClusterUpdates(const ClusterConfigPtr & cfg, std::string_view joining) { ClusterUpdateActions out; @@ -79,7 +85,7 @@ String serializeClusterConfig(const ClusterConfigPtr & cfg, const ClusterUpdateA new_config.emplace_back(RaftServerConfig{*cfg->get_server(priority->id)}); } else - UNREACHABLE(); + throw Exception(ErrorCodes::UNEXPECTED_ZOOKEEPER_ERROR, "Unexpected update"); } for (const auto & item : cfg->get_servers()) diff --git a/src/Coordination/KeeperServer.cpp b/src/Coordination/KeeperServer.cpp index 8d21ce2ab01..b132c898be6 100644 --- a/src/Coordination/KeeperServer.cpp +++ b/src/Coordination/KeeperServer.cpp @@ -45,6 +45,7 @@ namespace ErrorCodes extern const int SUPPORT_IS_DISABLED; extern const int LOGICAL_ERROR; extern const int INVALID_CONFIG_PARAMETER; + extern const int UNEXPECTED_ZOOKEEPER_ERROR; } using namespace std::chrono_literals; @@ -990,7 +991,7 @@ KeeperServer::ConfigUpdateState KeeperServer::applyConfigUpdate( raft_instance->set_priority(update->id, update->priority, /*broadcast on live leader*/true); return Accepted; } - UNREACHABLE(); + throw Exception(ErrorCodes::UNEXPECTED_ZOOKEEPER_ERROR, "Unexpected action"); } ClusterUpdateActions KeeperServer::getRaftConfigurationDiff(const Poco::Util::AbstractConfiguration & config) diff --git a/src/Core/Field.cpp b/src/Core/Field.cpp index 73f0703f21e..7207485c799 100644 --- a/src/Core/Field.cpp +++ b/src/Core/Field.cpp @@ -146,7 +146,6 @@ inline Field getBinaryValue(UInt8 type, ReadBuffer & buf) case Field::Types::CustomType: return Field(); } - UNREACHABLE(); } void readBinary(Array & x, ReadBuffer & buf) diff --git a/src/Core/Field.h b/src/Core/Field.h index 4424d669c4d..710614cd0a0 100644 --- a/src/Core/Field.h +++ b/src/Core/Field.h @@ -667,8 +667,6 @@ public: case Types::AggregateFunctionState: return f(field.template get()); case Types::CustomType: return f(field.template get()); } - - UNREACHABLE(); } String dump() const; diff --git a/src/DataTypes/Serializations/ISerialization.cpp b/src/DataTypes/Serializations/ISerialization.cpp index a3a28f8091c..cd605c93f0d 100644 --- a/src/DataTypes/Serializations/ISerialization.cpp +++ b/src/DataTypes/Serializations/ISerialization.cpp @@ -36,7 +36,6 @@ String ISerialization::kindToString(Kind kind) case Kind::SPARSE: return "Sparse"; } - UNREACHABLE(); } ISerialization::Kind ISerialization::stringToKind(const String & str) diff --git a/src/Disks/IO/CachedOnDiskReadBufferFromFile.h b/src/Disks/IO/CachedOnDiskReadBufferFromFile.h index 3433698a162..cb34f7932c3 100644 --- a/src/Disks/IO/CachedOnDiskReadBufferFromFile.h +++ b/src/Disks/IO/CachedOnDiskReadBufferFromFile.h @@ -140,7 +140,6 @@ private: case ReadType::REMOTE_FS_READ_AND_PUT_IN_CACHE: return 
"REMOTE_FS_READ_AND_PUT_IN_CACHE"; } - UNREACHABLE(); } size_t first_offset = 0; diff --git a/src/Disks/ObjectStorages/MetadataStorageTransactionState.cpp b/src/Disks/ObjectStorages/MetadataStorageTransactionState.cpp index 245578b5d9e..a37f4ce7e65 100644 --- a/src/Disks/ObjectStorages/MetadataStorageTransactionState.cpp +++ b/src/Disks/ObjectStorages/MetadataStorageTransactionState.cpp @@ -17,7 +17,6 @@ std::string toString(MetadataStorageTransactionState state) case MetadataStorageTransactionState::PARTIALLY_ROLLED_BACK: return "PARTIALLY_ROLLED_BACK"; } - UNREACHABLE(); } } diff --git a/src/Disks/VolumeJBOD.cpp b/src/Disks/VolumeJBOD.cpp index a0c71583a22..e796ad6cdd7 100644 --- a/src/Disks/VolumeJBOD.cpp +++ b/src/Disks/VolumeJBOD.cpp @@ -112,7 +112,6 @@ DiskPtr VolumeJBOD::getDisk(size_t /* index */) const return disks_by_size.top().disk; } } - UNREACHABLE(); } ReservationPtr VolumeJBOD::reserve(UInt64 bytes) @@ -164,7 +163,6 @@ ReservationPtr VolumeJBOD::reserve(UInt64 bytes) return reservation; } } - UNREACHABLE(); } bool VolumeJBOD::areMergesAvoided() const diff --git a/src/Formats/EscapingRuleUtils.cpp b/src/Formats/EscapingRuleUtils.cpp index 3edade639df..2fe29d8bebb 100644 --- a/src/Formats/EscapingRuleUtils.cpp +++ b/src/Formats/EscapingRuleUtils.cpp @@ -62,7 +62,6 @@ String escapingRuleToString(FormatSettings::EscapingRule escaping_rule) case FormatSettings::EscapingRule::Raw: return "Raw"; } - UNREACHABLE(); } void skipFieldByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings) diff --git a/src/Functions/FunctionsRound.h b/src/Functions/FunctionsRound.h index 99f3a14dfec..233d4058f11 100644 --- a/src/Functions/FunctionsRound.h +++ b/src/Functions/FunctionsRound.h @@ -149,8 +149,6 @@ struct IntegerRoundingComputation return x; } } - - UNREACHABLE(); } static ALWAYS_INLINE T compute(T x, T scale) @@ -163,8 +161,6 @@ struct IntegerRoundingComputation case ScaleMode::Negative: return computeImpl(x, scale); } - - UNREACHABLE(); } static ALWAYS_INLINE void compute(const T * __restrict in, size_t scale, T * __restrict out) requires std::integral diff --git a/src/Functions/PolygonUtils.h b/src/Functions/PolygonUtils.h index c4851718da6..57f1243537d 100644 --- a/src/Functions/PolygonUtils.h +++ b/src/Functions/PolygonUtils.h @@ -381,8 +381,6 @@ bool PointInPolygonWithGrid::contains(CoordinateType x, Coordina case CellType::complexPolygon: return boost::geometry::within(Point(x, y), polygons[cell.index_of_inner_polygon]); } - - UNREACHABLE(); } diff --git a/src/Functions/UserDefined/UserDefinedSQLObjectsZooKeeperStorage.cpp b/src/Functions/UserDefined/UserDefinedSQLObjectsZooKeeperStorage.cpp index 568e0b9b5d2..766d63eafb0 100644 --- a/src/Functions/UserDefined/UserDefinedSQLObjectsZooKeeperStorage.cpp +++ b/src/Functions/UserDefined/UserDefinedSQLObjectsZooKeeperStorage.cpp @@ -35,7 +35,6 @@ namespace case UserDefinedSQLObjectType::Function: return "function_"; } - UNREACHABLE(); } constexpr std::string_view sql_extension = ".sql"; diff --git a/src/IO/CompressionMethod.cpp b/src/IO/CompressionMethod.cpp index b8e1134d422..22913125e99 100644 --- a/src/IO/CompressionMethod.cpp +++ b/src/IO/CompressionMethod.cpp @@ -52,7 +52,6 @@ std::string toContentEncodingName(CompressionMethod method) case CompressionMethod::None: return ""; } - UNREACHABLE(); } CompressionMethod chooseHTTPCompressionMethod(const std::string & list) diff --git a/src/IO/HadoopSnappyReadBuffer.h b/src/IO/HadoopSnappyReadBuffer.h index 
73e52f2c503..bbbb84dd6dd 100644 --- a/src/IO/HadoopSnappyReadBuffer.h +++ b/src/IO/HadoopSnappyReadBuffer.h @@ -88,7 +88,6 @@ public: case Status::TOO_LARGE_COMPRESSED_BLOCK: return "TOO_LARGE_COMPRESSED_BLOCK"; } - UNREACHABLE(); } explicit HadoopSnappyReadBuffer( diff --git a/src/Interpreters/AggregatedDataVariants.cpp b/src/Interpreters/AggregatedDataVariants.cpp index 87cfdda5948..8f82f15248f 100644 --- a/src/Interpreters/AggregatedDataVariants.cpp +++ b/src/Interpreters/AggregatedDataVariants.cpp @@ -117,8 +117,6 @@ size_t AggregatedDataVariants::size() const APPLY_FOR_AGGREGATED_VARIANTS(M) #undef M } - - UNREACHABLE(); } size_t AggregatedDataVariants::sizeWithoutOverflowRow() const @@ -136,8 +134,6 @@ size_t AggregatedDataVariants::sizeWithoutOverflowRow() const APPLY_FOR_AGGREGATED_VARIANTS(M) #undef M } - - UNREACHABLE(); } const char * AggregatedDataVariants::getMethodName() const @@ -155,8 +151,6 @@ const char * AggregatedDataVariants::getMethodName() const APPLY_FOR_AGGREGATED_VARIANTS(M) #undef M } - - UNREACHABLE(); } bool AggregatedDataVariants::isTwoLevel() const @@ -174,8 +168,6 @@ bool AggregatedDataVariants::isTwoLevel() const APPLY_FOR_AGGREGATED_VARIANTS(M) #undef M } - - UNREACHABLE(); } bool AggregatedDataVariants::isConvertibleToTwoLevel() const diff --git a/src/Interpreters/Cache/FileSegment.cpp b/src/Interpreters/Cache/FileSegment.cpp index 9459029dc4c..61a356fa3c3 100644 --- a/src/Interpreters/Cache/FileSegment.cpp +++ b/src/Interpreters/Cache/FileSegment.cpp @@ -799,7 +799,6 @@ String FileSegment::stateToString(FileSegment::State state) case FileSegment::State::DETACHED: return "DETACHED"; } - UNREACHABLE(); } bool FileSegment::assertCorrectness() const diff --git a/src/Interpreters/ComparisonGraph.cpp b/src/Interpreters/ComparisonGraph.cpp index 4eacbae7a30..d53ff4b0227 100644 --- a/src/Interpreters/ComparisonGraph.cpp +++ b/src/Interpreters/ComparisonGraph.cpp @@ -309,7 +309,6 @@ ComparisonGraphCompareResult ComparisonGraph::pathToCompareResult(Path pat case Path::GREATER: return inverse ? ComparisonGraphCompareResult::LESS : ComparisonGraphCompareResult::GREATER; case Path::GREATER_OR_EQUAL: return inverse ? 
ComparisonGraphCompareResult::LESS_OR_EQUAL : ComparisonGraphCompareResult::GREATER_OR_EQUAL; } - UNREACHABLE(); } template diff --git a/src/Interpreters/FilesystemCacheLog.cpp b/src/Interpreters/FilesystemCacheLog.cpp index 80fe1c3a8ef..aa489351a98 100644 --- a/src/Interpreters/FilesystemCacheLog.cpp +++ b/src/Interpreters/FilesystemCacheLog.cpp @@ -26,7 +26,6 @@ static String typeToString(FilesystemCacheLogElement::CacheType type) case FilesystemCacheLogElement::CacheType::WRITE_THROUGH_CACHE: return "WRITE_THROUGH_CACHE"; } - UNREACHABLE(); } ColumnsDescription FilesystemCacheLogElement::getColumnsDescription() diff --git a/src/Interpreters/HashJoin.cpp b/src/Interpreters/HashJoin.cpp index 3a21c13db5e..75da8bbc3e7 100644 --- a/src/Interpreters/HashJoin.cpp +++ b/src/Interpreters/HashJoin.cpp @@ -705,7 +705,6 @@ namespace APPLY_FOR_JOIN_VARIANTS(M) #undef M } - UNREACHABLE(); } } @@ -2641,8 +2640,6 @@ private: default: throw Exception(ErrorCodes::UNSUPPORTED_JOIN_KEYS, "Unsupported JOIN keys (type: {})", parent.data->type); } - - UNREACHABLE(); } template diff --git a/src/Interpreters/InterpreterTransactionControlQuery.cpp b/src/Interpreters/InterpreterTransactionControlQuery.cpp index d31ace758c4..13872fbe3f5 100644 --- a/src/Interpreters/InterpreterTransactionControlQuery.cpp +++ b/src/Interpreters/InterpreterTransactionControlQuery.cpp @@ -33,7 +33,6 @@ BlockIO InterpreterTransactionControlQuery::execute() case ASTTransactionControl::SET_SNAPSHOT: return executeSetSnapshot(session_context, tcl.snapshot); } - UNREACHABLE(); } BlockIO InterpreterTransactionControlQuery::executeBegin(ContextMutablePtr session_context) diff --git a/src/Interpreters/SetVariants.cpp b/src/Interpreters/SetVariants.cpp index 64796a013f1..c600d096160 100644 --- a/src/Interpreters/SetVariants.cpp +++ b/src/Interpreters/SetVariants.cpp @@ -41,8 +41,6 @@ size_t SetVariantsTemplate::getTotalRowCount() const APPLY_FOR_SET_VARIANTS(M) #undef M } - - UNREACHABLE(); } template @@ -57,8 +55,6 @@ size_t SetVariantsTemplate::getTotalByteCount() const APPLY_FOR_SET_VARIANTS(M) #undef M } - - UNREACHABLE(); } template diff --git a/src/Parsers/ASTExplainQuery.h b/src/Parsers/ASTExplainQuery.h index 701bde8cebd..eb095b5dbbc 100644 --- a/src/Parsers/ASTExplainQuery.h +++ b/src/Parsers/ASTExplainQuery.h @@ -40,8 +40,6 @@ public: case TableOverride: return "EXPLAIN TABLE OVERRIDE"; case CurrentTransaction: return "EXPLAIN CURRENT TRANSACTION"; } - - UNREACHABLE(); } static ExplainKind fromString(const String & str) diff --git a/src/Parsers/Lexer.cpp b/src/Parsers/Lexer.cpp index 9ac6e623803..30717550713 100644 --- a/src/Parsers/Lexer.cpp +++ b/src/Parsers/Lexer.cpp @@ -41,8 +41,6 @@ Token quotedString(const char *& pos, const char * const token_begin, const char ++pos; continue; } - - UNREACHABLE(); } } @@ -538,8 +536,6 @@ const char * getTokenName(TokenType type) APPLY_FOR_TOKENS(M) #undef M } - - UNREACHABLE(); } diff --git a/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp b/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp index 98cbdeaaa4b..6b7f1f5206c 100644 --- a/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp @@ -657,7 +657,6 @@ DataTypePtr MsgPackSchemaReader::getDataType(const msgpack::object & object) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Msgpack extension type {:x} is not supported", object_ext.type()); } } - UNREACHABLE(); } std::optional MsgPackSchemaReader::readRowAndGetDataTypes() diff --git a/src/Processors/IProcessor.cpp 
b/src/Processors/IProcessor.cpp index 8b160153733..5ab5e5277aa 100644 --- a/src/Processors/IProcessor.cpp +++ b/src/Processors/IProcessor.cpp @@ -36,8 +36,6 @@ std::string IProcessor::statusToName(Status status) case Status::ExpandPipeline: return "ExpandPipeline"; } - - UNREACHABLE(); } } diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.cpp b/src/Processors/QueryPlan/ReadFromMergeTree.cpp index e523a2c243c..2f7927681aa 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp +++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp @@ -1136,8 +1136,6 @@ static void addMergingFinal( return std::make_shared(header, num_outputs, sort_description, max_block_size_rows, /*max_block_size_bytes=*/0, merging_params.graphite_params, now); } - - UNREACHABLE(); }; pipe.addTransform(get_merging_processor()); @@ -2143,8 +2141,6 @@ static const char * indexTypeToString(ReadFromMergeTree::IndexType type) case ReadFromMergeTree::IndexType::Skip: return "Skip"; } - - UNREACHABLE(); } static const char * readTypeToString(ReadFromMergeTree::ReadType type) @@ -2160,8 +2156,6 @@ static const char * readTypeToString(ReadFromMergeTree::ReadType type) case ReadFromMergeTree::ReadType::ParallelReplicas: return "Parallel"; } - - UNREACHABLE(); } void ReadFromMergeTree::describeActions(FormatSettings & format_settings) const diff --git a/src/Processors/QueryPlan/TotalsHavingStep.cpp b/src/Processors/QueryPlan/TotalsHavingStep.cpp index d1bd70fd0b2..ac5e144bf4a 100644 --- a/src/Processors/QueryPlan/TotalsHavingStep.cpp +++ b/src/Processors/QueryPlan/TotalsHavingStep.cpp @@ -86,8 +86,6 @@ static String totalsModeToString(TotalsMode totals_mode, double auto_include_thr case TotalsMode::AFTER_HAVING_AUTO: return "after_having_auto threshold " + std::to_string(auto_include_threshold); } - - UNREACHABLE(); } void TotalsHavingStep::describeActions(FormatSettings & settings) const diff --git a/src/Processors/Transforms/FillingTransform.cpp b/src/Processors/Transforms/FillingTransform.cpp index 05fd2a7254f..bb38c3e1dc5 100644 --- a/src/Processors/Transforms/FillingTransform.cpp +++ b/src/Processors/Transforms/FillingTransform.cpp @@ -67,7 +67,6 @@ static FillColumnDescription::StepFunction getStepFunction( FOR_EACH_INTERVAL_KIND(DECLARE_CASE) #undef DECLARE_CASE } - UNREACHABLE(); } static bool tryConvertFields(FillColumnDescription & descr, const DataTypePtr & type) diff --git a/src/Processors/Transforms/buildPushingToViewsChain.cpp b/src/Processors/Transforms/buildPushingToViewsChain.cpp index 5e8ecdca95e..20977b801d3 100644 --- a/src/Processors/Transforms/buildPushingToViewsChain.cpp +++ b/src/Processors/Transforms/buildPushingToViewsChain.cpp @@ -897,8 +897,6 @@ static std::exception_ptr addStorageToException(std::exception_ptr ptr, const St { return std::current_exception(); } - - UNREACHABLE(); } void FinalizingViewsTransform::work() diff --git a/src/Storages/MergeTree/BackgroundJobsAssignee.cpp b/src/Storages/MergeTree/BackgroundJobsAssignee.cpp index 56a4378cf9a..0a69bf1109f 100644 --- a/src/Storages/MergeTree/BackgroundJobsAssignee.cpp +++ b/src/Storages/MergeTree/BackgroundJobsAssignee.cpp @@ -93,7 +93,6 @@ String BackgroundJobsAssignee::toString(Type type) case Type::Moving: return "Moving"; } - UNREACHABLE(); } void BackgroundJobsAssignee::start() diff --git a/src/Storages/MergeTree/KeyCondition.cpp b/src/Storages/MergeTree/KeyCondition.cpp index 849240502e4..dbc98404569 100644 --- a/src/Storages/MergeTree/KeyCondition.cpp +++ b/src/Storages/MergeTree/KeyCondition.cpp @@ -2957,8 +2957,6 @@ String 
KeyCondition::RPNElement::toString(std::string_view column_name, bool pri case ALWAYS_TRUE: return "true"; } - - UNREACHABLE(); } diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index f448a9a820d..6b6adf56cd2 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -1175,8 +1175,6 @@ String MergeTreeData::MergingParams::getModeName() const case Graphite: return "Graphite"; case VersionedCollapsing: return "VersionedCollapsing"; } - - UNREACHABLE(); } Int64 MergeTreeData::getMaxBlockNumber() const diff --git a/src/Storages/MergeTree/MergeTreeDataWriter.cpp b/src/Storages/MergeTree/MergeTreeDataWriter.cpp index daa163d741c..395d27558f3 100644 --- a/src/Storages/MergeTree/MergeTreeDataWriter.cpp +++ b/src/Storages/MergeTree/MergeTreeDataWriter.cpp @@ -360,8 +360,6 @@ Block MergeTreeDataWriter::mergeBlock( return std::make_shared( block, 1, sort_description, block_size + 1, /*block_size_bytes=*/0, merging_params.graphite_params, time(nullptr)); } - - UNREACHABLE(); }; auto merging_algorithm = get_merging_algorithm(); diff --git a/src/Storages/WindowView/StorageWindowView.cpp b/src/Storages/WindowView/StorageWindowView.cpp index a9ec1f6c694..4e11787cecf 100644 --- a/src/Storages/WindowView/StorageWindowView.cpp +++ b/src/Storages/WindowView/StorageWindowView.cpp @@ -297,7 +297,6 @@ namespace CASE_WINDOW_KIND(Year) #undef CASE_WINDOW_KIND } - UNREACHABLE(); } class AddingAggregatedChunkInfoTransform : public ISimpleTransform From e560bd8a1a9c57640af1303a95f0a81d864c75e3 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 17 May 2024 14:37:47 +0000 Subject: [PATCH 167/392] Incorporate review feedback --- src/Access/AccessRights.cpp | 1 + src/AggregateFunctions/AggregateFunctionSum.h | 12 ++++++------ src/Compression/CompressionCodecDoubleDelta.cpp | 4 ++-- src/Coordination/KeeperReconfiguration.cpp | 4 ++-- src/Coordination/KeeperServer.cpp | 2 +- src/Core/Field.cpp | 1 + src/Functions/FunctionsTimeWindow.cpp | 2 -- src/Parsers/Lexer.cpp | 2 ++ 8 files changed, 15 insertions(+), 13 deletions(-) diff --git a/src/Access/AccessRights.cpp b/src/Access/AccessRights.cpp index dd25d3e4ac0..2127f4ada70 100644 --- a/src/Access/AccessRights.cpp +++ b/src/Access/AccessRights.cpp @@ -258,6 +258,7 @@ namespace case TABLE_LEVEL: return AccessFlags::allFlagsGrantableOnTableLevel(); case COLUMN_LEVEL: return AccessFlags::allFlagsGrantableOnColumnLevel(); } + chassert(false); } } diff --git a/src/AggregateFunctions/AggregateFunctionSum.h b/src/AggregateFunctions/AggregateFunctionSum.h index 2f23187d2ea..2ce03c530c2 100644 --- a/src/AggregateFunctions/AggregateFunctionSum.h +++ b/src/AggregateFunctions/AggregateFunctionSum.h @@ -457,12 +457,12 @@ public: String getName() const override { - switch (Type) - { - case AggregateFunctionTypeSum: return "sum"; - case AggregateFunctionTypeSumWithOverflow: return "sumWithOverflow"; - case AggregateFunctionTypeSumKahan: return "sumKahan"; - } + if constexpr (Type == AggregateFunctionTypeSum) + return "sum"; + else if constexpr (Type == AggregateFunctionTypeSumWithOverflow) + return "sumWithOverflow"; + else if constexpr (Type == AggregateFunctionTypeSumKahan) + return "sumKahan"; } explicit AggregateFunctionSum(const DataTypes & argument_types_) diff --git a/src/Compression/CompressionCodecDoubleDelta.cpp b/src/Compression/CompressionCodecDoubleDelta.cpp index 78fdf5c627a..443b9d33532 100644 --- a/src/Compression/CompressionCodecDoubleDelta.cpp +++ 
b/src/Compression/CompressionCodecDoubleDelta.cpp @@ -142,9 +142,9 @@ namespace ErrorCodes { extern const int CANNOT_COMPRESS; extern const int CANNOT_DECOMPRESS; - extern const int BAD_ARGUMENTS; extern const int ILLEGAL_SYNTAX_FOR_CODEC_TYPE; extern const int ILLEGAL_CODEC_PARAMETER; + extern const int LOGICAL_ERROR; } namespace @@ -163,7 +163,7 @@ inline Int64 getMaxValueForByteSize(Int8 byte_size) case sizeof(UInt64): return std::numeric_limits::max(); default: - throw Exception(ErrorCodes::BAD_ARGUMENTS, "only 1, 2, 4 and 8 data sizes are supported"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "only 1, 2, 4 and 8 data sizes are supported"); } } diff --git a/src/Coordination/KeeperReconfiguration.cpp b/src/Coordination/KeeperReconfiguration.cpp index a2a06f92283..05211af6704 100644 --- a/src/Coordination/KeeperReconfiguration.cpp +++ b/src/Coordination/KeeperReconfiguration.cpp @@ -8,7 +8,7 @@ namespace DB namespace ErrorCodes { - extern const int UNEXPECTED_ZOOKEEPER_ERROR; + extern const int LOGICAL_ERROR; } ClusterUpdateActions joiningToClusterUpdates(const ClusterConfigPtr & cfg, std::string_view joining) @@ -85,7 +85,7 @@ String serializeClusterConfig(const ClusterConfigPtr & cfg, const ClusterUpdateA new_config.emplace_back(RaftServerConfig{*cfg->get_server(priority->id)}); } else - throw Exception(ErrorCodes::UNEXPECTED_ZOOKEEPER_ERROR, "Unexpected update"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected update"); } for (const auto & item : cfg->get_servers()) diff --git a/src/Coordination/KeeperServer.cpp b/src/Coordination/KeeperServer.cpp index b132c898be6..953072c5b0e 100644 --- a/src/Coordination/KeeperServer.cpp +++ b/src/Coordination/KeeperServer.cpp @@ -991,7 +991,7 @@ KeeperServer::ConfigUpdateState KeeperServer::applyConfigUpdate( raft_instance->set_priority(update->id, update->priority, /*broadcast on live leader*/true); return Accepted; } - throw Exception(ErrorCodes::UNEXPECTED_ZOOKEEPER_ERROR, "Unexpected action"); + chassert(false); } ClusterUpdateActions KeeperServer::getRaftConfigurationDiff(const Poco::Util::AbstractConfiguration & config) diff --git a/src/Core/Field.cpp b/src/Core/Field.cpp index 7207485c799..73f0703f21e 100644 --- a/src/Core/Field.cpp +++ b/src/Core/Field.cpp @@ -146,6 +146,7 @@ inline Field getBinaryValue(UInt8 type, ReadBuffer & buf) case Field::Types::CustomType: return Field(); } + UNREACHABLE(); } void readBinary(Array & x, ReadBuffer & buf) diff --git a/src/Functions/FunctionsTimeWindow.cpp b/src/Functions/FunctionsTimeWindow.cpp index 1c9f28c9724..f93a885ee65 100644 --- a/src/Functions/FunctionsTimeWindow.cpp +++ b/src/Functions/FunctionsTimeWindow.cpp @@ -232,7 +232,6 @@ struct TimeWindowImpl default: throw Exception(ErrorCodes::SYNTAX_ERROR, "Fraction seconds are unsupported by windows yet"); } - UNREACHABLE(); } template @@ -422,7 +421,6 @@ struct TimeWindowImpl default: throw Exception(ErrorCodes::SYNTAX_ERROR, "Fraction seconds are unsupported by windows yet"); } - UNREACHABLE(); } template diff --git a/src/Parsers/Lexer.cpp b/src/Parsers/Lexer.cpp index 30717550713..d669c8a4690 100644 --- a/src/Parsers/Lexer.cpp +++ b/src/Parsers/Lexer.cpp @@ -41,6 +41,8 @@ Token quotedString(const char *& pos, const char * const token_begin, const char ++pos; continue; } + + chassert(false); } } From f266bdb88e1891e484add0431e9e5ca56c963635 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 17 May 2024 14:44:17 +0000 Subject: [PATCH 168/392] Fix more places --- src/Functions/FunctionsRound.h | 4 ---- src/Interpreters/HashJoin.h 
| 6 ------ .../MergeTree/PartMovesBetweenShardsOrchestrator.cpp | 2 -- src/Storages/WindowView/StorageWindowView.cpp | 2 -- 4 files changed, 14 deletions(-) diff --git a/src/Functions/FunctionsRound.h b/src/Functions/FunctionsRound.h index 233d4058f11..dde57e8320d 100644 --- a/src/Functions/FunctionsRound.h +++ b/src/Functions/FunctionsRound.h @@ -243,8 +243,6 @@ inline float roundWithMode(float x, RoundingMode mode) case RoundingMode::Ceil: return ceilf(x); case RoundingMode::Trunc: return truncf(x); } - - UNREACHABLE(); } inline double roundWithMode(double x, RoundingMode mode) @@ -256,8 +254,6 @@ inline double roundWithMode(double x, RoundingMode mode) case RoundingMode::Ceil: return ceil(x); case RoundingMode::Trunc: return trunc(x); } - - UNREACHABLE(); } template diff --git a/src/Interpreters/HashJoin.h b/src/Interpreters/HashJoin.h index 86db8943926..a0996556f9a 100644 --- a/src/Interpreters/HashJoin.h +++ b/src/Interpreters/HashJoin.h @@ -322,8 +322,6 @@ public: APPLY_FOR_JOIN_VARIANTS(M) #undef M } - - UNREACHABLE(); } size_t getTotalByteCountImpl(Type which) const @@ -338,8 +336,6 @@ public: APPLY_FOR_JOIN_VARIANTS(M) #undef M } - - UNREACHABLE(); } size_t getBufferSizeInCells(Type which) const @@ -354,8 +350,6 @@ public: APPLY_FOR_JOIN_VARIANTS(M) #undef M } - - UNREACHABLE(); } /// NOLINTEND(bugprone-macro-parentheses) }; diff --git a/src/Storages/MergeTree/PartMovesBetweenShardsOrchestrator.cpp b/src/Storages/MergeTree/PartMovesBetweenShardsOrchestrator.cpp index 78fcfabb704..4228d7b70b6 100644 --- a/src/Storages/MergeTree/PartMovesBetweenShardsOrchestrator.cpp +++ b/src/Storages/MergeTree/PartMovesBetweenShardsOrchestrator.cpp @@ -616,8 +616,6 @@ PartMovesBetweenShardsOrchestrator::Entry PartMovesBetweenShardsOrchestrator::st } } } - - UNREACHABLE(); } void PartMovesBetweenShardsOrchestrator::removePins(const Entry & entry, zkutil::ZooKeeperPtr zk) diff --git a/src/Storages/WindowView/StorageWindowView.cpp b/src/Storages/WindowView/StorageWindowView.cpp index 4e11787cecf..8bca1c97aad 100644 --- a/src/Storages/WindowView/StorageWindowView.cpp +++ b/src/Storages/WindowView/StorageWindowView.cpp @@ -919,7 +919,6 @@ UInt32 StorageWindowView::getWindowLowerBound(UInt32 time_sec) CASE_WINDOW_KIND(Year) #undef CASE_WINDOW_KIND } - UNREACHABLE(); } UInt32 StorageWindowView::getWindowUpperBound(UInt32 time_sec) @@ -947,7 +946,6 @@ UInt32 StorageWindowView::getWindowUpperBound(UInt32 time_sec) CASE_WINDOW_KIND(Year) #undef CASE_WINDOW_KIND } - UNREACHABLE(); } void StorageWindowView::addFireSignal(std::set & signals) From d964b4b78667a1437dd74836432828c5dda1be7e Mon Sep 17 00:00:00 2001 From: kssenii Date: Fri, 17 May 2024 16:50:38 +0200 Subject: [PATCH 169/392] Finish archives related changes --- src/Disks/ObjectStorages/IObjectStorage.h | 6 +++ .../ObjectStorages/S3/S3ObjectStorage.cpp | 11 ++++- .../ObjectStorage/ReadBufferIterator.cpp | 40 ++++++++++++------- .../ObjectStorage/StorageObjectStorage.cpp | 7 +++- .../StorageObjectStorageCluster.cpp | 2 +- .../StorageObjectStorageSource.cpp | 37 +++++++++-------- .../StorageObjectStorageSource.h | 19 ++++++++- src/Storages/S3Queue/S3QueueSource.h | 2 +- 8 files changed, 88 insertions(+), 36 deletions(-) diff --git a/src/Disks/ObjectStorages/IObjectStorage.h b/src/Disks/ObjectStorages/IObjectStorage.h index 43c7cf19adf..5724ae8929c 100644 --- a/src/Disks/ObjectStorages/IObjectStorage.h +++ b/src/Disks/ObjectStorages/IObjectStorage.h @@ -37,6 +37,7 @@ namespace DB namespace ErrorCodes { extern const int NOT_IMPLEMENTED; + extern const 
int LOGICAL_ERROR; } class ReadBufferFromFileBase; @@ -64,6 +65,11 @@ struct RelativePathWithMetadata {} virtual ~RelativePathWithMetadata() = default; + + virtual std::string getFileName() const { return std::filesystem::path(relative_path).filename(); } + virtual std::string getPath() const { return relative_path; } + virtual bool isArchive() const { return false; } + virtual std::string getPathToArchive() const { throw Exception(ErrorCodes::LOGICAL_ERROR, "Not an archive"); } }; struct ObjectKeyWithMetadata diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index c24874d0a94..983bb1834b8 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -457,7 +457,16 @@ std::optional S3ObjectStorage::tryGetObjectMetadata(const std::s ObjectMetadata S3ObjectStorage::getObjectMetadata(const std::string & path) const { auto settings_ptr = s3_settings.get(); - auto object_info = S3::getObjectInfo(*client.get(), uri.bucket, path, {}, settings_ptr->request_settings, /* with_metadata= */ true); + S3::ObjectInfo object_info; + try + { + object_info = S3::getObjectInfo(*client.get(), uri.bucket, path, {}, settings_ptr->request_settings, /* with_metadata= */ true); + } + catch (DB::Exception & e) + { + e.addMessage("while reading " + path); + throw; + } ObjectMetadata result; result.size_bytes = object_info.size; diff --git a/src/Storages/ObjectStorage/ReadBufferIterator.cpp b/src/Storages/ObjectStorage/ReadBufferIterator.cpp index 61575b0115a..e065de16e55 100644 --- a/src/Storages/ObjectStorage/ReadBufferIterator.cpp +++ b/src/Storages/ObjectStorage/ReadBufferIterator.cpp @@ -50,7 +50,7 @@ SchemaCache::Keys ReadBufferIterator::getKeysForSchemaCache() const std::back_inserter(sources), [&](const auto & elem) { - return std::filesystem::path(configuration->getDataSourceDescription()) / elem->relative_path; + return std::filesystem::path(configuration->getDataSourceDescription()) / elem->getPath(); }); return DB::getKeysForSchemaCache(sources, *format, format_settings, getContext()); } @@ -67,8 +67,9 @@ std::optional ReadBufferIterator::tryGetColumnsFromCache( const auto & object_info = (*it); auto get_last_mod_time = [&] -> std::optional { + const auto & path = object_info->isArchive() ? object_info->getPathToArchive() : object_info->getPath(); if (!object_info->metadata) - object_info->metadata = object_storage->tryGetObjectMetadata(object_info->relative_path); + object_info->metadata = object_storage->tryGetObjectMetadata(path); return object_info->metadata ? std::optional(object_info->metadata->last_modified.epochTime()) @@ -77,7 +78,7 @@ std::optional ReadBufferIterator::tryGetColumnsFromCache( if (format) { - auto cache_key = getKeyForSchemaCache(object_info->relative_path, *format); + auto cache_key = getKeyForSchemaCache(object_info->getPath(), *format); if (auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time)) return columns; } @@ -88,7 +89,7 @@ std::optional ReadBufferIterator::tryGetColumnsFromCache( /// If we have such entry for some format, we can use this format to read the file. for (const auto & format_name : FormatFactory::instance().getAllInputFormats()) { - auto cache_key = getKeyForSchemaCache(object_info->relative_path, format_name); + auto cache_key = getKeyForSchemaCache(object_info->getPath(), format_name); if (auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time)) { /// Now format is known. It should be the same for all files. 
@@ -105,7 +106,7 @@ std::optional ReadBufferIterator::tryGetColumnsFromCache( void ReadBufferIterator::setNumRowsToLastFile(size_t num_rows) { if (query_settings.schema_inference_use_cache) - schema_cache.addNumRows(getKeyForSchemaCache(current_object_info->relative_path, *format), num_rows); + schema_cache.addNumRows(getKeyForSchemaCache(current_object_info->getPath(), *format), num_rows); } void ReadBufferIterator::setSchemaToLastFile(const ColumnsDescription & columns) @@ -113,7 +114,7 @@ void ReadBufferIterator::setSchemaToLastFile(const ColumnsDescription & columns) if (query_settings.schema_inference_use_cache && query_settings.schema_inference_mode == SchemaInferenceMode::UNION) { - schema_cache.addColumns(getKeyForSchemaCache(current_object_info->relative_path, *format), columns); + schema_cache.addColumns(getKeyForSchemaCache(current_object_info->getPath(), *format), columns); } } @@ -134,7 +135,7 @@ void ReadBufferIterator::setFormatName(const String & format_name) String ReadBufferIterator::getLastFileName() const { if (current_object_info) - return current_object_info->relative_path; + return current_object_info->getFileName(); else return ""; } @@ -142,9 +143,13 @@ String ReadBufferIterator::getLastFileName() const std::unique_ptr ReadBufferIterator::recreateLastReadBuffer() { auto context = getContext(); - auto impl = object_storage->readObject(StoredObject(current_object_info->relative_path), context->getReadSettings()); - const auto compression_method = chooseCompressionMethod(current_object_info->relative_path, configuration->compression_method); + + const auto & path = current_object_info->isArchive() ? current_object_info->getPathToArchive() : current_object_info->getPath(); + auto impl = object_storage->readObject(StoredObject(), context->getReadSettings()); + + const auto compression_method = chooseCompressionMethod(current_object_info->getFileName(), configuration->compression_method); const auto zstd_window_log_max = static_cast(context->getSettingsRef().zstd_window_log_max); + return wrapReadBufferWithCompressionMethod(std::move(impl), compression_method, zstd_window_log_max); } @@ -158,7 +163,7 @@ ReadBufferIterator::Data ReadBufferIterator::next() { for (const auto & object_info : read_keys) { - if (auto format_from_file_name = FormatFactory::instance().tryGetFormatFromFileName(object_info->relative_path)) + if (auto format_from_file_name = FormatFactory::instance().tryGetFormatFromFileName(object_info->getFileName())) { format = format_from_file_name; break; @@ -170,7 +175,9 @@ ReadBufferIterator::Data ReadBufferIterator::next() if (first && getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT) { if (auto cached_columns = tryGetColumnsFromCache(read_keys.begin(), read_keys.end())) + { return {nullptr, cached_columns, format}; + } } } @@ -178,7 +185,7 @@ ReadBufferIterator::Data ReadBufferIterator::next() { current_object_info = file_iterator->next(0); - if (!current_object_info || current_object_info->relative_path.empty()) + if (!current_object_info) { if (first) { @@ -203,6 +210,9 @@ ReadBufferIterator::Data ReadBufferIterator::next() return {nullptr, std::nullopt, format}; } + const auto filename = current_object_info->getFileName(); + chassert(!filename.empty()); + /// file iterator could get new keys after new iteration if (read_keys.size() > prev_read_keys_size) { @@ -211,7 +221,7 @@ ReadBufferIterator::Data ReadBufferIterator::next() { for (auto it = read_keys.begin() + prev_read_keys_size; it != read_keys.end(); ++it) { - 
if (auto format_from_file_name = FormatFactory::instance().tryGetFormatFromFileName((*it)->relative_path)) + if (auto format_from_file_name = FormatFactory::instance().tryGetFormatFromFileName((*it)->getFileName())) { format = format_from_file_name; break; @@ -250,15 +260,15 @@ ReadBufferIterator::Data ReadBufferIterator::next() using ObjectInfoInArchive = StorageObjectStorageSource::ArchiveIterator::ObjectInfoInArchive; if (auto object_info_in_archive = dynamic_cast(current_object_info.get())) { - compression_method = chooseCompressionMethod(configuration->getPathInArchive(), configuration->compression_method); + compression_method = chooseCompressionMethod(current_object_info->getFileName(), configuration->compression_method); auto & archive_reader = object_info_in_archive->archive_reader; read_buf = archive_reader->readFile(object_info_in_archive->path_in_archive, /*throw_on_not_found=*/true); } else { - compression_method = chooseCompressionMethod(current_object_info->relative_path, configuration->compression_method); + compression_method = chooseCompressionMethod(filename, configuration->compression_method); read_buf = object_storage->readObject( - StoredObject(current_object_info->relative_path), + StoredObject(current_object_info->getPath()), getContext()->getReadSettings(), {}, current_object_info->metadata->size_bytes); diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index 73e3d861cff..c45752c10f5 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -403,7 +403,12 @@ void StorageObjectStorage::Configuration::initialize( configuration.fromAST(engine_args, local_context, with_table_structure); if (configuration.format == "auto") - configuration.format = FormatFactory::instance().tryGetFormatFromFileName(configuration.getPath()).value_or("auto"); + { + configuration.format = FormatFactory::instance().tryGetFormatFromFileName( + configuration.isArchive() + ? 
configuration.getPathInArchive() + : configuration.getPath()).value_or("auto"); + } else FormatFactory::instance().checkFormatName(configuration.format); diff --git a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp index a43d9da0fa3..78f568d8ae2 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp @@ -90,7 +90,7 @@ RemoteQueryExecutor::Extension StorageObjectStorageCluster::getTaskIteratorExten { auto object_info = iterator->next(0); if (object_info) - return object_info->relative_path; + return object_info->getPath(); else return ""; }); diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index 56905e6c29b..d3b67876224 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -100,6 +100,7 @@ std::shared_ptr StorageObjectStorageSourc "Expression can not have wildcards inside {} name", configuration->getNamespaceType()); auto settings = configuration->getQuerySettings(local_context); + const bool is_archive = configuration->isArchive(); std::unique_ptr iterator; if (configuration->isPathWithGlobs()) @@ -107,7 +108,7 @@ std::shared_ptr StorageObjectStorageSourc /// Iterate through disclosed globs and make a source for each file iterator = std::make_unique( object_storage, configuration, predicate, virtual_columns, - local_context, read_keys, settings.list_object_keys_size, + local_context, is_archive ? nullptr : read_keys, settings.list_object_keys_size, settings.throw_on_zero_files_match, file_progress_callback); } else @@ -126,11 +127,11 @@ std::shared_ptr StorageObjectStorageSourc } iterator = std::make_unique( - object_storage, copy_configuration, virtual_columns, read_keys, + object_storage, copy_configuration, virtual_columns, is_archive ? nullptr : read_keys, settings.ignore_non_existent_file, file_progress_callback); } - if (configuration->isArchive()) + if (is_archive) { return std::make_shared(object_storage, configuration, std::move(iterator), local_context, read_keys); } @@ -175,12 +176,13 @@ Chunk StorageObjectStorageSource::generate() progress(num_rows, chunk_size ? 
chunk_size : chunk.bytes()); const auto & object_info = reader.getObjectInfo(); + const auto & filename = object_info.getFileName(); chassert(object_info.metadata); VirtualColumnUtils::addRequestedPathFileAndSizeVirtualsToChunk( chunk, read_from_format_info.requested_virtual_columns, fs::path(configuration->getNamespace()) / reader.getRelativePath(), - object_info.metadata->size_bytes); + object_info.metadata->size_bytes, &filename); return chunk; } @@ -219,7 +221,7 @@ void StorageObjectStorageSource::addNumRowsToCache(const String & path, size_t n std::optional StorageObjectStorageSource::tryGetNumRowsFromCache(const ObjectInfoPtr & object_info) { const auto cache_key = getKeyForSchemaCache( - fs::path(configuration->getDataSourceDescription()) / object_info->relative_path, + fs::path(configuration->getDataSourceDescription()) / object_info->getPath(), configuration->format, format_settings, getContext()); @@ -242,11 +244,14 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade { object_info = file_iterator->next(processor); - if (!object_info || object_info->relative_path.empty()) + if (!object_info || object_info->getFileName().empty()) return {}; if (!object_info->metadata) - object_info->metadata = object_storage->getObjectMetadata(object_info->relative_path); + { + const auto & path = object_info->isArchive() ? object_info->getPathToArchive() : object_info->getPath(); + object_info->metadata = object_storage->getObjectMetadata(path); + } } while (query_settings.skip_empty_files && object_info->metadata->size_bytes == 0); @@ -282,7 +287,7 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade } else { - compression_method = chooseCompressionMethod(object_info->relative_path, configuration->compression_method); + compression_method = chooseCompressionMethod(object_info->getFileName(), configuration->compression_method); read_buf = createReadBuffer(*object_info); } @@ -355,7 +360,7 @@ std::unique_ptr StorageObjectStorageSource::createReadBuffer(const O LOG_TRACE(log, "Downloading object of size {} with initial prefetch", object_size); auto async_reader = object_storage->readObjects( - StoredObjects{StoredObject{object_info.relative_path, /* local_path */ "", object_size}}, read_settings); + StoredObjects{StoredObject{object_info.getPath(), /* local_path */ "", object_size}}, read_settings); async_reader->setReadUntilEnd(); if (read_settings.remote_fs_prefetch) @@ -366,7 +371,7 @@ std::unique_ptr StorageObjectStorageSource::createReadBuffer(const O else { /// FIXME: this is inconsistent that readObject always reads synchronously ignoring read_method setting. 
- return object_storage->readObject(StoredObject(object_info.relative_path, "", object_size), read_settings); + return object_storage->readObject(StoredObject(object_info.getPath(), "", object_size), read_settings); } } @@ -381,7 +386,7 @@ StorageObjectStorage::ObjectInfoPtr StorageObjectStorageSource::IIterator::next( if (object_info) { - LOG_TEST(logger, "Next key: {}", object_info->relative_path); + LOG_TEST(logger, "Next key: {}", object_info->getFileName()); } return object_info; @@ -470,7 +475,7 @@ StorageObjectStorage::ObjectInfoPtr StorageObjectStorageSource::GlobIterator::ne new_batch = std::move(result.value()); for (auto it = new_batch.begin(); it != new_batch.end();) { - if (!recursive && !re2::RE2::FullMatch((*it)->relative_path, *matcher)) + if (!recursive && !re2::RE2::FullMatch((*it)->getPath(), *matcher)) it = new_batch.erase(it); else ++it; @@ -487,7 +492,7 @@ StorageObjectStorage::ObjectInfoPtr StorageObjectStorageSource::GlobIterator::ne for (const auto & object_info : new_batch) { chassert(object_info); - paths.push_back(fs::path(configuration->getNamespace()) / object_info->relative_path); + paths.push_back(fs::path(configuration->getNamespace()) / object_info->getPath()); } VirtualColumnUtils::filterByPathOrFile(new_batch, paths, filter_dag, virtual_columns, getContext()); @@ -675,10 +680,10 @@ StorageObjectStorageSource::ArchiveIterator::createArchiveReader(ObjectInfoPtr o { const auto size = object_info->metadata->size_bytes; return DB::createArchiveReader( - /* path_to_archive */object_info->relative_path, + /* path_to_archive */object_info->getPath(), /* archive_read_function */[=, this]() { - StoredObject stored_object(object_info->relative_path, "", size); + StoredObject stored_object(object_info->getPath(), "", size); return object_storage->readObject(stored_object, getContext()->getReadSettings()); }, /* archive_size */size); @@ -720,7 +725,7 @@ StorageObjectStorageSource::ArchiveIterator::nextImpl(size_t processor) return {}; if (!archive_object->metadata) - archive_object->metadata = object_storage->getObjectMetadata(archive_object->relative_path); + archive_object->metadata = object_storage->getObjectMetadata(archive_object->getPath()); archive_reader = createArchiveReader(archive_object); if (!archive_reader->fileExists(path_in_archive)) diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.h b/src/Storages/ObjectStorage/StorageObjectStorageSource.h index 664aad56928..fb0ad3e32f1 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.h @@ -92,7 +92,7 @@ protected: PullingPipelineExecutor * operator->() { return reader.get(); } const PullingPipelineExecutor * operator->() const { return reader.get(); } - const String & getRelativePath() const { return object_info->relative_path; } + std::string getRelativePath() const { return object_info->getPath(); } const ObjectInfo & getObjectInfo() const { return *object_info; } const IInputFormat * getInputFormat() const { return dynamic_cast(source.get()); } @@ -251,6 +251,23 @@ public: const std::string & path_in_archive_, std::shared_ptr archive_reader_); + std::string getFileName() const override + { + return path_in_archive; + } + + std::string getPath() const override + { + return archive_object->getPath() + "::" + path_in_archive; + } + + std::string getPathToArchive() const override + { + return archive_object->getPath(); + } + + bool isArchive() const override { return true; } + const ObjectInfoPtr archive_object; const 
std::string path_in_archive; const std::shared_ptr archive_reader; diff --git a/src/Storages/S3Queue/S3QueueSource.h b/src/Storages/S3Queue/S3QueueSource.h index fdeed8d46d2..663577e055b 100644 --- a/src/Storages/S3Queue/S3QueueSource.h +++ b/src/Storages/S3Queue/S3QueueSource.h @@ -29,7 +29,7 @@ public: using FileStatusPtr = S3QueueFilesMetadata::FileStatusPtr; using ReaderHolder = StorageObjectStorageSource::ReaderHolder; using Metadata = S3QueueFilesMetadata; - using ObjectInfo = RelativePathWithMetadata; + using ObjectInfo = StorageObjectStorageSource::ObjectInfo; using ObjectInfoPtr = std::shared_ptr; using ObjectInfos = std::vector; From 4909c3ea2393c66226c23cd03847f1c5e5b05ff7 Mon Sep 17 00:00:00 2001 From: Alexander Gololobov Date: Fri, 17 May 2024 18:24:21 +0200 Subject: [PATCH 170/392] Cleanups --- src/Storages/MergeTree/IMergeTreeDataPart.h | 11 ------ .../MergeTree/IMergeTreeDataPartWriter.cpp | 7 ---- .../MergeTree/IMergeTreeDataPartWriter.h | 39 ++++++------------- .../MergeTree/IMergedBlockOutputStream.cpp | 8 +--- .../MergeTree/IMergedBlockOutputStream.h | 10 ++--- src/Storages/MergeTree/MergeTask.cpp | 2 +- .../MergeTree/MergeTreeDataPartCompact.cpp | 29 +++++++------- .../MergeTree/MergeTreeDataPartCompact.h | 9 ----- .../MergeTree/MergeTreeDataPartWide.cpp | 15 ++++--- .../MergeTree/MergeTreeDataPartWide.h | 9 ----- .../MergeTreeDataPartWriterCompact.cpp | 18 ++++----- .../MergeTreeDataPartWriterCompact.h | 6 +-- .../MergeTreeDataPartWriterOnDisk.cpp | 4 +- .../MergeTree/MergeTreeDataPartWriterOnDisk.h | 13 ++----- .../MergeTree/MergeTreeDataPartWriterWide.cpp | 29 ++++++-------- .../MergeTree/MergeTreeDataPartWriterWide.h | 6 +-- src/Storages/MergeTree/MergeTreeIOSettings.h | 2 +- src/Storages/MergeTree/MergeTreePartition.cpp | 5 +-- src/Storages/MergeTree/MergeTreePartition.h | 2 +- .../MergeTree/MergedBlockOutputStream.cpp | 1 + .../MergedColumnOnlyOutputStream.cpp | 9 ++--- src/Storages/MergeTree/MutateTask.cpp | 1 + 22 files changed, 76 insertions(+), 159 deletions(-) diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index 4ec5b3f5f8a..091a7ceb5bd 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -43,7 +43,6 @@ class IReservation; using ReservationPtr = std::unique_ptr; class IMergeTreeReader; -class IMergeTreeDataPartWriter; class MarkCache; class UncompressedCache; class MergeTreeTransaction; @@ -74,7 +73,6 @@ public: using VirtualFields = std::unordered_map; using MergeTreeReaderPtr = std::unique_ptr; -// using MergeTreeWriterPtr = std::unique_ptr; using ColumnSizeByName = std::unordered_map; using NameToNumber = std::unordered_map; @@ -106,15 +104,6 @@ public: const ValueSizeMap & avg_value_size_hints_, const ReadBufferFromFileBase::ProfileCallback & profile_callback_) const = 0; -//// virtual MergeTreeWriterPtr getWriter( -//// const NamesAndTypesList & columns_list, -//// const StorageMetadataPtr & metadata_snapshot, -//// const std::vector & indices_to_recalc, -//// const Statistics & stats_to_recalc_, -//// const CompressionCodecPtr & default_codec_, -//// const MergeTreeWriterSettings & writer_settings, -//// const MergeTreeIndexGranularity & computed_index_granularity) = 0; - // TODO: remove? 
virtual bool isStoredOnDisk() const = 0; diff --git a/src/Storages/MergeTree/IMergeTreeDataPartWriter.cpp b/src/Storages/MergeTree/IMergeTreeDataPartWriter.cpp index b46fbc5fc9e..e01572715d6 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPartWriter.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPartWriter.cpp @@ -1,5 +1,4 @@ #include -#include "Storages/MergeTree/MergeTreeSettings.h" namespace DB { @@ -46,12 +45,10 @@ Block permuteBlockIfNeeded(const Block & block, const IColumn::Permutation * per } IMergeTreeDataPartWriter::IMergeTreeDataPartWriter( -// const MergeTreeMutableDataPartPtr & data_part_, const String & data_part_name_, const SerializationByName & serializations_, MutableDataPartStoragePtr data_part_storage_, const MergeTreeIndexGranularityInfo & index_granularity_info_, - const MergeTreeSettingsPtr & storage_settings_, const NamesAndTypesList & columns_list_, const StorageMetadataPtr & metadata_snapshot_, @@ -61,7 +58,6 @@ IMergeTreeDataPartWriter::IMergeTreeDataPartWriter( , serializations(serializations_) , data_part_storage(data_part_storage_) , index_granularity_info(index_granularity_info_) - , storage_settings(storage_settings_) , metadata_snapshot(metadata_snapshot_) , columns_list(columns_list_) @@ -117,7 +113,6 @@ MergeTreeDataPartWriterPtr createMergeTreeDataPartCompactWriter( MutableDataPartStoragePtr data_part_storage_, const MergeTreeIndexGranularityInfo & index_granularity_info_, const MergeTreeSettingsPtr & storage_settings_, - const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot, const std::vector & indices_to_recalc, @@ -134,7 +129,6 @@ MergeTreeDataPartWriterPtr createMergeTreeDataPartWideWriter( MutableDataPartStoragePtr data_part_storage_, const MergeTreeIndexGranularityInfo & index_granularity_info_, const MergeTreeSettingsPtr & storage_settings_, - const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot, const std::vector & indices_to_recalc, @@ -153,7 +147,6 @@ MergeTreeDataPartWriterPtr createMergeTreeDataPartWriter( MutableDataPartStoragePtr data_part_storage_, const MergeTreeIndexGranularityInfo & index_granularity_info_, const MergeTreeSettingsPtr & storage_settings_, - const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot, const std::vector & indices_to_recalc, diff --git a/src/Storages/MergeTree/IMergeTreeDataPartWriter.h b/src/Storages/MergeTree/IMergeTreeDataPartWriter.h index 6854668a01e..3245a23339b 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPartWriter.h +++ b/src/Storages/MergeTree/IMergeTreeDataPartWriter.h @@ -1,14 +1,12 @@ #pragma once -#include -#include -#include -#include -#include -#include -#include -#include "Storages/MergeTree/MergeTreeDataPartType.h" -#include "Storages/MergeTree/MergeTreeSettings.h" +#include +#include +#include +#include +#include +#include +#include namespace DB @@ -24,15 +22,11 @@ class IMergeTreeDataPartWriter : private boost::noncopyable { public: IMergeTreeDataPartWriter( -// const MergeTreeMutableDataPartPtr & data_part_, - const String & data_part_name_, const SerializationByName & serializations_, MutableDataPartStoragePtr data_part_storage_, const MergeTreeIndexGranularityInfo & index_granularity_info_, - const MergeTreeSettingsPtr & storage_settings_, - const NamesAndTypesList & columns_list_, const StorageMetadataPtr & metadata_snapshot_, const MergeTreeWriterSettings & settings_, @@ -42,7 +36,7 @@ public: virtual void write(const Block & block, const IColumn::Permutation * permutation) = 0; - virtual 
void fillChecksums(IMergeTreeDataPart::Checksums & checksums, NameSet & checksums_to_remove) = 0; + virtual void fillChecksums(MergeTreeDataPartChecksums & checksums, NameSet & checksums_to_remove) = 0; virtual void finish(bool sync) = 0; @@ -56,21 +50,12 @@ protected: IDataPartStorage & getDataPartStorage() { return *data_part_storage; } - -// const MergeTreeMutableDataPartPtr data_part; // TODO: remove - /// Serializations for every columns and subcolumns by their names. - String data_part_name; - SerializationByName serializations; + const String data_part_name; + const SerializationByName serializations; MutableDataPartStoragePtr data_part_storage; - MergeTreeIndexGranularityInfo index_granularity_info; - - -// const MergeTreeData & storage; // TODO: remove - + const MergeTreeIndexGranularityInfo index_granularity_info; const MergeTreeSettingsPtr storage_settings; - - const StorageMetadataPtr metadata_snapshot; const NamesAndTypesList columns_list; const MergeTreeWriterSettings settings; @@ -90,7 +75,6 @@ MergeTreeDataPartWriterPtr createMergeTreeDataPartWriter( MutableDataPartStoragePtr data_part_storage_, const MergeTreeIndexGranularityInfo & index_granularity_info_, const MergeTreeSettingsPtr & storage_settings_, - const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot, const std::vector & indices_to_recalc, @@ -100,5 +84,4 @@ MergeTreeDataPartWriterPtr createMergeTreeDataPartWriter( const MergeTreeWriterSettings & writer_settings, const MergeTreeIndexGranularity & computed_index_granularity); - } diff --git a/src/Storages/MergeTree/IMergedBlockOutputStream.cpp b/src/Storages/MergeTree/IMergedBlockOutputStream.cpp index f99adf7c4db..89c813ab233 100644 --- a/src/Storages/MergeTree/IMergedBlockOutputStream.cpp +++ b/src/Storages/MergeTree/IMergedBlockOutputStream.cpp @@ -2,30 +2,26 @@ #include #include #include -#include "Storages/MergeTree/IDataPartStorage.h" -#include "Storages/StorageSet.h" namespace DB { IMergedBlockOutputStream::IMergedBlockOutputStream( -// const MergeTreeMutableDataPartPtr & data_part, const MergeTreeSettingsPtr & storage_settings_, MutableDataPartStoragePtr data_part_storage_, const StorageMetadataPtr & metadata_snapshot_, const NamesAndTypesList & columns_list, bool reset_columns_) - //: storage(data_part->storage) : storage_settings(storage_settings_) , metadata_snapshot(metadata_snapshot_) - , data_part_storage(data_part_storage_)//data_part->getDataPartStoragePtr()) + , data_part_storage(data_part_storage_) , reset_columns(reset_columns_) { if (reset_columns) { SerializationInfo::Settings info_settings = { - .ratio_of_defaults_for_sparse = storage_settings->ratio_of_defaults_for_sparse_serialization,//storage.getSettings()->ratio_of_defaults_for_sparse_serialization, + .ratio_of_defaults_for_sparse = storage_settings->ratio_of_defaults_for_sparse_serialization, .choose_kind = false, }; diff --git a/src/Storages/MergeTree/IMergedBlockOutputStream.h b/src/Storages/MergeTree/IMergedBlockOutputStream.h index b6f279e6d58..a9b058418ea 100644 --- a/src/Storages/MergeTree/IMergedBlockOutputStream.h +++ b/src/Storages/MergeTree/IMergedBlockOutputStream.h @@ -1,12 +1,12 @@ #pragma once -#include "Storages/MergeTree/IDataPartStorage.h" -#include "Storages/MergeTree/MergeTreeSettings.h" +#include +#include #include #include #include #include -#include "Common/Logger.h" +#include namespace DB { @@ -15,7 +15,6 @@ class IMergedBlockOutputStream { public: IMergedBlockOutputStream( -// const MergeTreeMutableDataPartPtr & data_part, const 
MergeTreeSettingsPtr & storage_settings_, MutableDataPartStoragePtr data_part_storage_, const StorageMetadataPtr & metadata_snapshot_, @@ -43,11 +42,8 @@ protected: SerializationInfoByName & serialization_infos, MergeTreeData::DataPart::Checksums & checksums); -// const MergeTreeData & storage; // TODO: remove -//// MergeTreeSettingsPtr storage_settings; LoggerPtr log; -//// StorageMetadataPtr metadata_snapshot; diff --git a/src/Storages/MergeTree/MergeTask.cpp b/src/Storages/MergeTree/MergeTask.cpp index 1b5ad0d81a7..2ce74bde1d5 100644 --- a/src/Storages/MergeTree/MergeTask.cpp +++ b/src/Storages/MergeTree/MergeTask.cpp @@ -9,7 +9,7 @@ #include #include #include - +#include #include #include #include diff --git a/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp b/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp index eebbe3110c0..373ad6c23ea 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp @@ -48,21 +48,20 @@ IMergeTreeDataPart::MergeTreeReaderPtr MergeTreeDataPartCompact::getReader( } MergeTreeDataPartWriterPtr createMergeTreeDataPartCompactWriter( - const String & data_part_name_, - const String & logger_name_, - const SerializationByName & serializations_, - MutableDataPartStoragePtr data_part_storage_, - const MergeTreeIndexGranularityInfo & index_granularity_info_, - const MergeTreeSettingsPtr & storage_settings_, - - const NamesAndTypesList & columns_list, - const StorageMetadataPtr & metadata_snapshot, - const std::vector & indices_to_recalc, - const Statistics & stats_to_recalc_, - const String & marks_file_extension_, - const CompressionCodecPtr & default_codec_, - const MergeTreeWriterSettings & writer_settings, - const MergeTreeIndexGranularity & computed_index_granularity) + const String & data_part_name_, + const String & logger_name_, + const SerializationByName & serializations_, + MutableDataPartStoragePtr data_part_storage_, + const MergeTreeIndexGranularityInfo & index_granularity_info_, + const MergeTreeSettingsPtr & storage_settings_, + const NamesAndTypesList & columns_list, + const StorageMetadataPtr & metadata_snapshot, + const std::vector & indices_to_recalc, + const Statistics & stats_to_recalc_, + const String & marks_file_extension_, + const CompressionCodecPtr & default_codec_, + const MergeTreeWriterSettings & writer_settings, + const MergeTreeIndexGranularity & computed_index_granularity) { ////// TODO: fix the order of columns //// diff --git a/src/Storages/MergeTree/MergeTreeDataPartCompact.h b/src/Storages/MergeTree/MergeTreeDataPartCompact.h index 5a57d778b7d..ca88edba7b3 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartCompact.h +++ b/src/Storages/MergeTree/MergeTreeDataPartCompact.h @@ -40,15 +40,6 @@ public: const ValueSizeMap & avg_value_size_hints, const ReadBufferFromFileBase::ProfileCallback & profile_callback) const override; -// MergeTreeWriterPtr getWriter( -// const NamesAndTypesList & columns_list, -// const StorageMetadataPtr & metadata_snapshot, -// const std::vector & indices_to_recalc, -// const Statistics & stats_to_recalc_, -// const CompressionCodecPtr & default_codec_, -// const MergeTreeWriterSettings & writer_settings, -// const MergeTreeIndexGranularity & computed_index_granularity) override; - // TODO: remove? 
bool isStoredOnDisk() const override { return true; } diff --git a/src/Storages/MergeTree/MergeTreeDataPartWide.cpp b/src/Storages/MergeTree/MergeTreeDataPartWide.cpp index c99cff258e0..34a3f30c4ba 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWide.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWide.cpp @@ -54,18 +54,17 @@ IMergeTreeDataPart::MergeTreeReaderPtr MergeTreeDataPartWide::getReader( } MergeTreeDataPartWriterPtr createMergeTreeDataPartWideWriter( - const String & data_part_name_, - const String & logger_name_, - const SerializationByName & serializations_, - MutableDataPartStoragePtr data_part_storage_, - const MergeTreeIndexGranularityInfo & index_granularity_info_, - const MergeTreeSettingsPtr & storage_settings_, - + const String & data_part_name_, + const String & logger_name_, + const SerializationByName & serializations_, + MutableDataPartStoragePtr data_part_storage_, + const MergeTreeIndexGranularityInfo & index_granularity_info_, + const MergeTreeSettingsPtr & storage_settings_, const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot, const std::vector & indices_to_recalc, const Statistics & stats_to_recalc_, - const String & marks_file_extension_, + const String & marks_file_extension_, const CompressionCodecPtr & default_codec_, const MergeTreeWriterSettings & writer_settings, const MergeTreeIndexGranularity & computed_index_granularity) diff --git a/src/Storages/MergeTree/MergeTreeDataPartWide.h b/src/Storages/MergeTree/MergeTreeDataPartWide.h index 45d0fbbebec..e3cb3f04335 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWide.h +++ b/src/Storages/MergeTree/MergeTreeDataPartWide.h @@ -35,15 +35,6 @@ public: const ValueSizeMap & avg_value_size_hints, const ReadBufferFromFileBase::ProfileCallback & profile_callback) const override; -// MergeTreeWriterPtr getWriter( -// const NamesAndTypesList & columns_list, -// const StorageMetadataPtr & metadata_snapshot, -// const std::vector & indices_to_recalc, -// const Statistics & stats_to_recalc_, -// const CompressionCodecPtr & default_codec_, -// const MergeTreeWriterSettings & writer_settings, -// const MergeTreeIndexGranularity & computed_index_granularity) override; - // TODO: remove? 
bool isStoredOnDisk() const override { return true; } diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp index 6e8ea1a915b..3f08d8eea21 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp @@ -10,14 +10,12 @@ namespace ErrorCodes } MergeTreeDataPartWriterCompact::MergeTreeDataPartWriterCompact( -// const MergeTreeMutableDataPartPtr & data_part_, - const String & data_part_name_, - const String & logger_name_, - const SerializationByName & serializations_, - MutableDataPartStoragePtr data_part_storage_, - const MergeTreeIndexGranularityInfo & index_granularity_info_, - const MergeTreeSettingsPtr & storage_settings_, - + const String & data_part_name_, + const String & logger_name_, + const SerializationByName & serializations_, + MutableDataPartStoragePtr data_part_storage_, + const MergeTreeIndexGranularityInfo & index_granularity_info_, + const MergeTreeSettingsPtr & storage_settings_, const NamesAndTypesList & columns_list_, const StorageMetadataPtr & metadata_snapshot_, const std::vector & indices_to_recalc_, @@ -250,7 +248,7 @@ void MergeTreeDataPartWriterCompact::writeDataBlock(const Block & block, const G } } -void MergeTreeDataPartWriterCompact::fillDataChecksums(IMergeTreeDataPart::Checksums & checksums) +void MergeTreeDataPartWriterCompact::fillDataChecksums(MergeTreeDataPartChecksums & checksums) { if (columns_buffer.size() != 0) { @@ -420,7 +418,7 @@ size_t MergeTreeDataPartWriterCompact::ColumnsBuffer::size() const return accumulated_columns.at(0)->size(); } -void MergeTreeDataPartWriterCompact::fillChecksums(IMergeTreeDataPart::Checksums & checksums, NameSet & /*checksums_to_remove*/) +void MergeTreeDataPartWriterCompact::fillChecksums(MergeTreeDataPartChecksums & checksums, NameSet & /*checksums_to_remove*/) { // If we don't have anything to write, skip finalization. if (!columns_list.empty()) diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.h b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.h index 3bec4c7e988..03804ff4966 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.h +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.h @@ -11,14 +11,12 @@ class MergeTreeDataPartWriterCompact : public MergeTreeDataPartWriterOnDisk { public: MergeTreeDataPartWriterCompact( -// const MergeTreeMutableDataPartPtr & data_part, const String & data_part_name_, const String & logger_name_, const SerializationByName & serializations_, MutableDataPartStoragePtr data_part_storage_, const MergeTreeIndexGranularityInfo & index_granularity_info_, const MergeTreeSettingsPtr & storage_settings_, - const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot_, const std::vector & indices_to_recalc, @@ -30,12 +28,12 @@ public: void write(const Block & block, const IColumn::Permutation * permutation) override; - void fillChecksums(IMergeTreeDataPart::Checksums & checksums, NameSet & checksums_to_remove) override; + void fillChecksums(MergeTreeDataPartChecksums & checksums, NameSet & checksums_to_remove) override; void finish(bool sync) override; private: /// Finish serialization of the data. Flush rows in buffer to disk, compute checksums. 
- void fillDataChecksums(IMergeTreeDataPart::Checksums & checksums); + void fillDataChecksums(MergeTreeDataPartChecksums & checksums); void finishDataSerialization(bool sync); void fillIndexGranularity(size_t index_granularity_for_block, size_t rows_in_block) override; diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp index 13892c17577..25eb83a82c0 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp @@ -146,7 +146,6 @@ MergeTreeDataPartWriterOnDisk::MergeTreeDataPartWriterOnDisk( MutableDataPartStoragePtr data_part_storage_, const MergeTreeIndexGranularityInfo & index_granularity_info_, const MergeTreeSettingsPtr & storage_settings_, - const NamesAndTypesList & columns_list_, const StorageMetadataPtr & metadata_snapshot_, const MergeTreeIndices & indices_to_recalc_, @@ -231,7 +230,6 @@ static size_t computeIndexGranularityImpl( size_t MergeTreeDataPartWriterOnDisk::computeIndexGranularity(const Block & block) const { -// const auto storage_settings = storage.getSettings(); return computeIndexGranularityImpl( block, storage_settings->index_granularity_bytes, @@ -293,7 +291,7 @@ void MergeTreeDataPartWriterOnDisk::initSkipIndices() GinIndexStorePtr store = nullptr; if (typeid_cast(&*skip_index) != nullptr) { - store = std::make_shared(stream_name, data_part_storage, data_part_storage, /*storage.getSettings()*/storage_settings->max_digestion_size_per_segment); + store = std::make_shared(stream_name, data_part_storage, data_part_storage, storage_settings->max_digestion_size_per_segment); gin_index_stores[stream_name] = store; } skip_indices_aggregators.push_back(skip_index->createIndexAggregatorForPart(store, settings)); diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.h b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.h index 39f33217b57..e17724fa1d0 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.h +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.h @@ -5,9 +5,6 @@ #include #include #include -#include -#include -#include #include #include #include @@ -97,21 +94,19 @@ public: void sync() const; - void addToChecksums(IMergeTreeDataPart::Checksums & checksums); + void addToChecksums(MergeTreeDataPartChecksums & checksums); }; using StreamPtr = std::unique_ptr>; using StatisticStreamPtr = std::unique_ptr>; MergeTreeDataPartWriterOnDisk( -// const MergeTreeMutableDataPartPtr & data_part_, const String & data_part_name_, const String & logger_name_, const SerializationByName & serializations_, MutableDataPartStoragePtr data_part_storage_, const MergeTreeIndexGranularityInfo & index_granularity_info_, const MergeTreeSettingsPtr & storage_settings_, - const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot_, const std::vector & indices_to_recalc, @@ -140,13 +135,13 @@ protected: void calculateAndSerializeStatistics(const Block & stats_block); /// Finishes primary index serialization: write final primary index row (if required) and compute checksums - void fillPrimaryIndexChecksums(MergeTreeData::DataPart::Checksums & checksums); + void fillPrimaryIndexChecksums(MergeTreeDataPartChecksums & checksums); void finishPrimaryIndexSerialization(bool sync); /// Finishes skip indices serialization: write all accumulated data to disk and compute checksums - void fillSkipIndicesChecksums(MergeTreeData::DataPart::Checksums & checksums); + void 
fillSkipIndicesChecksums(MergeTreeDataPartChecksums & checksums); void finishSkipIndicesSerialization(bool sync); - void fillStatisticsChecksums(MergeTreeData::DataPart::Checksums & checksums); + void fillStatisticsChecksums(MergeTreeDataPartChecksums & checksums); void finishStatisticsSerialization(bool sync); /// Get global number of the current which we are writing (or going to start to write) diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp index 713dee87fa8..a57bf7d2037 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp @@ -76,14 +76,12 @@ Granules getGranulesToWrite(const MergeTreeIndexGranularity & index_granularity, } MergeTreeDataPartWriterWide::MergeTreeDataPartWriterWide( -// const MergeTreeMutableDataPartPtr & data_part_, - const String & data_part_name_, - const String & logger_name_, - const SerializationByName & serializations_, - MutableDataPartStoragePtr data_part_storage_, - const MergeTreeIndexGranularityInfo & index_granularity_info_, - const MergeTreeSettingsPtr & storage_settings_, - + const String & data_part_name_, + const String & logger_name_, + const SerializationByName & serializations_, + MutableDataPartStoragePtr data_part_storage_, + const MergeTreeIndexGranularityInfo & index_granularity_info_, + const MergeTreeSettingsPtr & storage_settings_, const NamesAndTypesList & columns_list_, const StorageMetadataPtr & metadata_snapshot_, const std::vector & indices_to_recalc_, @@ -114,7 +112,6 @@ void MergeTreeDataPartWriterWide::addStreams( { assert(!substream_path.empty()); -// auto storage_settings = storage.getSettings(); auto full_stream_name = ISerialization::getFileNameForStream(column, substream_path); String stream_name; @@ -416,11 +413,10 @@ void MergeTreeDataPartWriterWide::writeColumn( serialization->serializeBinaryBulkStatePrefix(column, serialize_settings, it->second); } -// const auto & global_settings = storage.getContext()->getSettingsRef(); ISerialization::SerializeBinaryBulkSettings serialize_settings; serialize_settings.getter = createStreamGetter(name_and_type, offset_columns); - serialize_settings.low_cardinality_max_dictionary_size = settings.low_cardinality_max_dictionary_size;//global_settings.low_cardinality_max_dictionary_size; - serialize_settings.low_cardinality_use_single_dictionary_for_part = settings.low_cardinality_use_single_dictionary_for_part;//global_settings.low_cardinality_use_single_dictionary_for_part != 0; + serialize_settings.low_cardinality_max_dictionary_size = settings.low_cardinality_max_dictionary_size; + serialize_settings.low_cardinality_use_single_dictionary_for_part = settings.low_cardinality_use_single_dictionary_for_part; for (const auto & granule : granules) { @@ -603,12 +599,11 @@ void MergeTreeDataPartWriterWide::validateColumnOfFixedSize(const NameAndTypePai } -void MergeTreeDataPartWriterWide::fillDataChecksums(IMergeTreeDataPart::Checksums & checksums, NameSet & checksums_to_remove) +void MergeTreeDataPartWriterWide::fillDataChecksums(MergeTreeDataPartChecksums & checksums, NameSet & checksums_to_remove) { -// const auto & global_settings = storage.getContext()->getSettingsRef(); ISerialization::SerializeBinaryBulkSettings serialize_settings; - serialize_settings.low_cardinality_max_dictionary_size = settings.low_cardinality_max_dictionary_size;//global_settings.low_cardinality_max_dictionary_size; - 
serialize_settings.low_cardinality_use_single_dictionary_for_part = settings.low_cardinality_use_single_dictionary_for_part;//global_settings.low_cardinality_use_single_dictionary_for_part != 0; + serialize_settings.low_cardinality_max_dictionary_size = settings.low_cardinality_max_dictionary_size; + serialize_settings.low_cardinality_use_single_dictionary_for_part = settings.low_cardinality_use_single_dictionary_for_part; WrittenOffsetColumns offset_columns; if (rows_written_in_last_mark > 0) { @@ -683,7 +678,7 @@ void MergeTreeDataPartWriterWide::finishDataSerialization(bool sync) } -void MergeTreeDataPartWriterWide::fillChecksums(IMergeTreeDataPart::Checksums & checksums, NameSet & checksums_to_remove) +void MergeTreeDataPartWriterWide::fillChecksums(MergeTreeDataPartChecksums & checksums, NameSet & checksums_to_remove) { // If we don't have anything to write, skip finalization. if (!columns_list.empty()) diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h index ef9c4ab17dc..5789213c910 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h @@ -21,14 +21,12 @@ class MergeTreeDataPartWriterWide : public MergeTreeDataPartWriterOnDisk { public: MergeTreeDataPartWriterWide( -// const MergeTreeMutableDataPartPtr & data_part, const String & data_part_name_, const String & logger_name_, const SerializationByName & serializations_, MutableDataPartStoragePtr data_part_storage_, const MergeTreeIndexGranularityInfo & index_granularity_info_, const MergeTreeSettingsPtr & storage_settings_, - const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot, const std::vector & indices_to_recalc, @@ -40,14 +38,14 @@ public: void write(const Block & block, const IColumn::Permutation * permutation) override; - void fillChecksums(IMergeTreeDataPart::Checksums & checksums, NameSet & checksums_to_remove) final; + void fillChecksums(MergeTreeDataPartChecksums & checksums, NameSet & checksums_to_remove) final; void finish(bool sync) final; private: /// Finish serialization of data: write final mark if required and compute checksums /// Also validate written data in debug mode - void fillDataChecksums(IMergeTreeDataPart::Checksums & checksums, NameSet & checksums_to_remove); + void fillDataChecksums(MergeTreeDataPartChecksums & checksums, NameSet & checksums_to_remove); void finishDataSerialization(bool sync); /// Write data of one column. 
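For illustration only, a minimal stand-alone sketch of the dependency-narrowing this cleanup performs: the writers above no longer receive a mutable data part and instead are constructed from the concrete values they use (part name, serializations, storage, granularity info, storage settings). PartWriterSketch, GranularityInfo and StorageSettings are invented names; this is not ClickHouse code.

```cpp
#include <iostream>
#include <memory>
#include <string>
#include <utility>

struct GranularityInfo { size_t index_granularity = 8192; };
struct StorageSettings { bool low_cardinality_use_single_dictionary_for_part = true; };

/// Before the cleanup a writer took the whole part object and reached into it;
/// after the cleanup (as in the constructors above) every dependency is passed explicitly.
class PartWriterSketch
{
public:
    PartWriterSketch(std::string part_name, GranularityInfo granularity, StorageSettings settings)
        : part_name_(std::move(part_name)), granularity_(granularity), settings_(settings)
    {
    }

    /// Stands in for fillChecksums()/finish(): it can only touch the injected state.
    void finish() const
    {
        std::cout << "finished " << part_name_
                  << ", granularity " << granularity_.index_granularity
                  << ", single LowCardinality dictionary: "
                  << settings_.low_cardinality_use_single_dictionary_for_part << '\n';
    }

private:
    std::string part_name_;
    GranularityInfo granularity_;
    StorageSettings settings_;
};

int main()
{
    /// Mirrors the shape of a createMergeTreeDataPartWriter(...) call site:
    /// the inputs are visible at construction instead of hidden behind a data part pointer.
    auto writer = std::make_unique<PartWriterSketch>("all_1_1_0", GranularityInfo{}, StorageSettings{});
    writer->finish();
    return 0;
}
```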
diff --git a/src/Storages/MergeTree/MergeTreeIOSettings.h b/src/Storages/MergeTree/MergeTreeIOSettings.h index 421c62887da..2b7d5c366f2 100644 --- a/src/Storages/MergeTree/MergeTreeIOSettings.h +++ b/src/Storages/MergeTree/MergeTreeIOSettings.h @@ -75,7 +75,7 @@ struct MergeTreeWriterSettings , query_write_settings(query_write_settings_) , max_threads_for_annoy_index_creation(global_settings.max_threads_for_annoy_index_creation) , low_cardinality_max_dictionary_size(global_settings.low_cardinality_max_dictionary_size) - , low_cardinality_use_single_dictionary_for_part(global_settings.low_cardinality_use_single_dictionary_for_part) + , low_cardinality_use_single_dictionary_for_part(global_settings.low_cardinality_use_single_dictionary_for_part != 0) { } diff --git a/src/Storages/MergeTree/MergeTreePartition.cpp b/src/Storages/MergeTree/MergeTreePartition.cpp index c2ef7f98388..c7b7557fe52 100644 --- a/src/Storages/MergeTree/MergeTreePartition.cpp +++ b/src/Storages/MergeTree/MergeTreePartition.cpp @@ -12,7 +12,6 @@ #include #include #include -#include "Interpreters/Context_fwd.h" #include #include @@ -414,12 +413,10 @@ void MergeTreePartition::load(const MergeTreeData & storage, const PartMetadataM partition_key_sample.getByPosition(i).type->getDefaultSerialization()->deserializeBinary(value[i], *file, {}); } -std::unique_ptr MergeTreePartition::store(/*const MergeTreeData & storage,*/ +std::unique_ptr MergeTreePartition::store( StorageMetadataPtr metadata_snapshot, ContextPtr storage_context, IDataPartStorage & data_part_storage, MergeTreeDataPartChecksums & checksums) const { -// auto metadata_snapshot = storage.getInMemoryMetadataPtr(); -// const auto & context = storage.getContext(); const auto & partition_key_sample = adjustPartitionKey(metadata_snapshot, storage_context).sample_block; return store(partition_key_sample, data_part_storage, checksums, storage_context->getWriteSettings()); } diff --git a/src/Storages/MergeTree/MergeTreePartition.h b/src/Storages/MergeTree/MergeTreePartition.h index 04175d6f927..44def70bdd9 100644 --- a/src/Storages/MergeTree/MergeTreePartition.h +++ b/src/Storages/MergeTree/MergeTreePartition.h @@ -44,7 +44,7 @@ public: /// Store functions return write buffer with written but not finalized data. /// User must call finish() for returned object. 
- [[nodiscard]] std::unique_ptr store(//const MergeTreeData & storage, + [[nodiscard]] std::unique_ptr store( StorageMetadataPtr metadata_snapshot, ContextPtr storage_context, IDataPartStorage & data_part_storage, MergeTreeDataPartChecksums & checksums) const; [[nodiscard]] std::unique_ptr store(const Block & partition_key_sample, IDataPartStorage & data_part_storage, MergeTreeDataPartChecksums & checksums, const WriteSettings & settings) const; diff --git a/src/Storages/MergeTree/MergedBlockOutputStream.cpp b/src/Storages/MergeTree/MergedBlockOutputStream.cpp index 2441d941952..e0fb4f703a0 100644 --- a/src/Storages/MergeTree/MergedBlockOutputStream.cpp +++ b/src/Storages/MergeTree/MergedBlockOutputStream.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include diff --git a/src/Storages/MergeTree/MergedColumnOnlyOutputStream.cpp b/src/Storages/MergeTree/MergedColumnOnlyOutputStream.cpp index 51853384012..1c75d81eca5 100644 --- a/src/Storages/MergeTree/MergedColumnOnlyOutputStream.cpp +++ b/src/Storages/MergeTree/MergedColumnOnlyOutputStream.cpp @@ -24,7 +24,6 @@ MergedColumnOnlyOutputStream::MergedColumnOnlyOutputStream( , header(header_) { const auto & global_settings = data_part->storage.getContext()->getSettings(); -// const auto & storage_settings = data_part->storage.getSettings(); MergeTreeWriterSettings writer_settings( global_settings, @@ -34,10 +33,10 @@ MergedColumnOnlyOutputStream::MergedColumnOnlyOutputStream( /* rewrite_primary_key = */ false); writer = createMergeTreeDataPartWriter( - data_part->getType(), - data_part->name, data_part->storage.getLogName(), data_part->getSerializations(), - data_part_storage, data_part->index_granularity_info, - storage_settings, + data_part->getType(), + data_part->name, data_part->storage.getLogName(), data_part->getSerializations(), + data_part_storage, data_part->index_granularity_info, + storage_settings, header.getNamesAndTypesList(), metadata_snapshot_, indices_to_recalc, diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index 54077055d96..7d6b68c7359 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -1,5 +1,6 @@ #include +#include #include #include #include From 13c94806e5f5ff800620d502229ff17cbce379f2 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Fri, 17 May 2024 19:44:57 +0200 Subject: [PATCH 171/392] fix waiting for mutations with retriable errors --- src/Storages/StorageReplicatedMergeTree.cpp | 31 ++++++++++++++++----- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index cc6599f8cd1..d60fa6bc787 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -590,6 +590,9 @@ void StorageReplicatedMergeTree::waitMutationToFinishOnReplicas( LOG_DEBUG(log, "Waiting for {} to apply mutation {}", replica, mutation_id); zkutil::EventPtr wait_event = std::make_shared(); + constexpr size_t MAX_RETRIES_ON_FAILED_MUTATION = 30; + size_t retries_on_failed_mutation = 0; + while (!partial_shutdown_called) { /// Mutation maybe killed or whole replica was deleted. @@ -637,18 +640,32 @@ void StorageReplicatedMergeTree::waitMutationToFinishOnReplicas( } } - /// If mutation status is empty, than local replica may just not loaded it into memory. 
- if (mutation_status && !mutation_status->latest_fail_reason.empty()) - { - LOG_DEBUG(log, "Mutation {} is done {} or failed {} (status: '{}')", mutation_id, mutation_status->is_done, !mutation_status->latest_fail_reason.empty(), mutation_status->latest_fail_reason); - break; - } - /// Replica can become inactive, so wait with timeout, if nothing happened -> recheck it if (!wait_event->tryWait(1000)) { LOG_TRACE(log, "Failed to wait for mutation '{}', will recheck", mutation_id); } + + /// If mutation status is empty, than local replica may just not loaded it into memory. + if (mutation_status && !mutation_status->latest_fail_reason.empty()) + { + LOG_DEBUG(log, "Mutation {} is done {} or failed {} (status: '{}')", mutation_id, mutation_status->is_done, !mutation_status->latest_fail_reason.empty(), mutation_status->latest_fail_reason); + + /// In some cases latest_fail_reason may be retryable and there's a chance it will be cleared after the next attempt + if (++retries_on_failed_mutation <= MAX_RETRIES_ON_FAILED_MUTATION) + continue; + + if (mutation_status->is_done) + { + LOG_DEBUG(log, "Looks like mutation {} is done, rechecking", mutation_id); + continue; + } + + /// It's still possible that latest_fail_reason will be cleared just before queue.getIncompleteMutationsStatus(...) below, + /// but it's unlikely. Anyway, rethrow the exception here to avoid exiting with is_done=false + checkMutationStatus(mutation_status, {mutation_id}); + throw Exception(ErrorCodes::LOGICAL_ERROR, "checkMutationStatus didn't throw when checking status of {}: {}", mutation_id, mutation_status->latest_fail_reason); + } } /// This replica inactive, don't check anything From 077e6057f275a69a5fac48097b995572a5e07f06 Mon Sep 17 00:00:00 2001 From: Blargian Date: Fri, 17 May 2024 21:45:07 +0200 Subject: [PATCH 172/392] Update reinterpretAsDate and reinterpretAsDateTime functions, add a test --- .../functions/type-conversion-functions.md | 84 ++++++++++++++++++- 1 file changed, 83 insertions(+), 1 deletion(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index ea08ffa50e7..cf3483f27a4 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -1004,9 +1004,91 @@ Result: ## reinterpretAsDate +Accepts a string, fixed string or numeric value and interprets the bytes as a number in host order (little endian). It returns a date from the interpreted number as the number of days since the beginning of the Unix Epoch. + +**Syntax** + +```sql +reinterpretAsDate(x) +``` + +**Parameters** + +- `x`: number of days since the beginning of the Unix Epoch. + +:::note +Accepts types that can be interpreted as numeric such as [(U)Int*](../data-types/int-uint.md), [Float](../data-types/float.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), [UUID](../data-types/uuid.md). Accepts [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). +::: + +**Returned value** + +- Date. [Date](../data-types/date.md). + +**Implementation details** + +:::note +If the provided string isn’t long enough, the function works as if the string is padded with the necessary number of null bytes. If the string is longer than needed, the extra bytes are ignored. 
+::: + +**Example** + +Query: + +```sql +SELECT reinterpretAsDate(65), reinterpretAsDate('A'); +``` + +Result: + +```response +┌─reinterpretAsDate(65)─┬─reinterpretAsDate('A')─┐ +│ 1970-03-07 │ 1970-03-07 │ +└───────────────────────┴────────────────────────┘ +``` + ## reinterpretAsDateTime -These functions accept a string and interpret the bytes placed at the beginning of the string as a number in host order (little endian). If the string isn’t long enough, the functions work as if the string is padded with the necessary number of null bytes. If the string is longer than needed, the extra bytes are ignored. A date is interpreted as the number of days since the beginning of the Unix Epoch, and a date with time is interpreted as the number of seconds since the beginning of the Unix Epoch. +These functions accept a string and interpret the bytes placed at the beginning of the string as a number in host order (little endian). Returns a date with time interpreted as the number of seconds since the beginning of the Unix Epoch. + +**Syntax** + +```sql +reinterpretAsDateTime(x) +``` + +**Parameters** + +- `x`: number of seconds since the beginning of the Unix Epoch. + +:::note +Accepts types that can be interpreted as numeric such as [(U)Int*](../data-types/int-uint.md), [Float](../data-types/float.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), [UUID](../data-types/uuid.md). Accepts [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). +::: + +**Returned value** + +- Date and Time. [DateTime](../data-types/datetime.md). + +**Implementation details** + +:::note +If the provided string isn’t long enough, the function works as if the string is padded with the necessary number of null bytes. If the string is longer than needed, the extra bytes are ignored. +::: + +**Example** + +Query: + +```sql +SELECT reinterpretAsDateTime(65), reinterpretAsDateTime('A'); +``` + +Result: + +```response +┌─reinterpretAsDateTime(65)─┬─reinterpretAsDateTime('A')─┐ +│ 1970-01-01 01:01:05 │ 1970-01-01 01:01:05 │ +└───────────────────────────┴────────────────────────────┘ +``` ## reinterpretAsString From 764bf4d477c95cc3d27fe438a439956829997f9c Mon Sep 17 00:00:00 2001 From: Blargian Date: Fri, 17 May 2024 22:04:40 +0200 Subject: [PATCH 173/392] Update reinterpretAsFixedString documentation and add tests --- .../functions/type-conversion-functions.md | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index cf3483f27a4..14a12ab5d5d 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -1098,6 +1098,38 @@ This function accepts a number or date or date with time and returns a string co This function accepts a number or date or date with time and returns a FixedString containing bytes representing the corresponding value in host order (little endian). Null bytes are dropped from the end. For example, a UInt32 type value of 255 is a FixedString that is one byte long. +**Syntax** + +```sql +reinterpretAsFixedString(x) +``` + +**Parameters** + +- `x`: value to reinterpret to string. [(U)Int*](../data-types/int-uint.md), [Float](../data-types/float.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md). + +**Returned value** + +- Fixed string containing bytes representing `x`. [FixedString](../data-types/fixedstring.md). 
+ +**Example** + +Query: + +```sql +SELECT + reinterpretAsFixedString(toDateTime('1970-01-01 01:01:05')), + reinterpretAsFixedString(toDate('1970-03-07')); +``` + +Result: + +```response +┌─reinterpretAsFixedString(toDateTime('1970-01-01 01:01:05'))─┬─reinterpretAsFixedString(toDate('1970-03-07'))─┐ +│ A │ A │ +└─────────────────────────────────────────────────────────────┴────────────────────────────────────────────────┘ +``` + ## reinterpretAsUUID :::note From 2c8b303a2fc69365be39a91179365466c3ebc14a Mon Sep 17 00:00:00 2001 From: avogar Date: Fri, 17 May 2024 20:16:58 +0000 Subject: [PATCH 174/392] Use Dynamic as supertype, add more tests, fix tests flakiness, update docs --- docs/en/sql-reference/data-types/dynamic.md | 4 ++-- src/DataTypes/getLeastSupertype.cpp | 19 +++++++++++++++++++ .../03037_dynamic_merges_1_horizontal.sh | 2 +- .../03037_dynamic_merges_1_vertical.sh | 2 +- .../03159_dynamic_type_all_types.reference | 12 ++++++------ .../03159_dynamic_type_all_types.sql | 4 ++-- .../03163_dynamic_as_supertype.reference | 10 ++++++++++ .../03163_dynamic_as_supertype.sql | 8 ++++++++ 8 files changed, 49 insertions(+), 12 deletions(-) create mode 100644 tests/queries/0_stateless/03163_dynamic_as_supertype.reference create mode 100644 tests/queries/0_stateless/03163_dynamic_as_supertype.sql diff --git a/docs/en/sql-reference/data-types/dynamic.md b/docs/en/sql-reference/data-types/dynamic.md index eabf032c52f..955fd54e641 100644 --- a/docs/en/sql-reference/data-types/dynamic.md +++ b/docs/en/sql-reference/data-types/dynamic.md @@ -14,7 +14,7 @@ To declare a column of `Dynamic` type, use the following syntax: Dynamic(max_types=N) ``` -Where `N` is an optional parameter between `1` and `255` indicating how many different data types can be stored inside a column with type `Dynamic`. If this limit is exceeded, all new types will be converted to type `String`. Default value of `max_types` is `32`. +Where `N` is an optional parameter between `1` and `255` indicating how many different data types can be stored inside a column with type `Dynamic` across single block of data that is stored separately (for example across single data part for MergeTree table). If this limit is exceeded, all new types will be converted to type `String`. Default value of `max_types` is `32`. :::note The Dynamic data type is an experimental feature. To use it, set `allow_experimental_dynamic_type = 1`. 
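For illustration only, a small SQL sketch of what the clarified `max_types` wording above means in practice; the table name is invented and the expected behaviour is described in comments rather than shown as captured output.

```sql
SET allow_experimental_dynamic_type = 1;

CREATE TABLE dynamic_sketch (d Dynamic(max_types=2)) ENGINE = Memory;

-- Int64 and String are the first two distinct types in this block and keep their types;
-- the Array(Int64) value exceeds the limit and is stored as String ('[1,2,3]').
INSERT INTO dynamic_sketch VALUES (42), ('hello'), ([1, 2, 3]);

SELECT d, dynamicType(d) FROM dynamic_sketch;
```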
@@ -355,7 +355,7 @@ SELECT * FROM test WHERE d2 == [1,2,3]::Array(UInt32)::Dynamic; - Compare `Dynamic` subcolumn with required type: ```sql -SELECT * FROM test WHERE d2.`Array(Int64)` == [1,2,3] -- or using variantElement(d2, 'Array(UInt32)') +SELECT * FROM test WHERE d2.`Array(Int65)` == [1,2,3] -- or using variantElement(d2, 'Array(UInt32)') ``` ```text diff --git a/src/DataTypes/getLeastSupertype.cpp b/src/DataTypes/getLeastSupertype.cpp index 0977bea362c..a71b19d6c92 100644 --- a/src/DataTypes/getLeastSupertype.cpp +++ b/src/DataTypes/getLeastSupertype.cpp @@ -19,6 +19,7 @@ #include #include #include +#include namespace DB @@ -256,6 +257,24 @@ DataTypePtr getLeastSupertype(const DataTypes & types) return types[0]; } + /// If one of the types is Dynamic, the supertype is Dynamic + { + bool have_dynamic = false; + size_t max_dynamic_types = 0; + + for (const auto & type : types) + { + if (const auto & dynamic_type = typeid_cast(type.get())) + { + have_dynamic = true; + max_dynamic_types = std::max(max_dynamic_types, dynamic_type->getMaxDynamicTypes()); + } + } + + if (have_dynamic) + return std::make_shared(max_dynamic_types); + } + /// Recursive rules /// If there are Nothing types, skip them diff --git a/tests/queries/0_stateless/03037_dynamic_merges_1_horizontal.sh b/tests/queries/0_stateless/03037_dynamic_merges_1_horizontal.sh index 0d3cd45666a..7c1ac41cfdc 100755 --- a/tests/queries/0_stateless/03037_dynamic_merges_1_horizontal.sh +++ b/tests/queries/0_stateless/03037_dynamic_merges_1_horizontal.sh @@ -8,7 +8,7 @@ CLICKHOUSE_LOG_COMMENT= . "$CUR_DIR"/../shell_config.sh -CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_dynamic_type=1 " +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_dynamic_type=1 --index_granularity_bytes 10485760 --merge_max_block_size 8192 --merge_max_block_size_bytes=10485760 --index_granularity 8192" function test() { diff --git a/tests/queries/0_stateless/03037_dynamic_merges_1_vertical.sh b/tests/queries/0_stateless/03037_dynamic_merges_1_vertical.sh index b2c40668228..927ceac72b5 100755 --- a/tests/queries/0_stateless/03037_dynamic_merges_1_vertical.sh +++ b/tests/queries/0_stateless/03037_dynamic_merges_1_vertical.sh @@ -8,8 +8,8 @@ CLICKHOUSE_LOG_COMMENT= . 
"$CUR_DIR"/../shell_config.sh -CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_dynamic_type=1 " +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_dynamic_type=1 --index_granularity_bytes 10485760 --merge_max_block_size 8192 --merge_max_block_size_bytes=10485760 --index_granularity 8192" function test() { echo "test" diff --git a/tests/queries/0_stateless/03159_dynamic_type_all_types.reference b/tests/queries/0_stateless/03159_dynamic_type_all_types.reference index a162ec4f857..7dcaaa1f3ec 100644 --- a/tests/queries/0_stateless/03159_dynamic_type_all_types.reference +++ b/tests/queries/0_stateless/03159_dynamic_type_all_types.reference @@ -110,9 +110,9 @@ Map(Dynamic, Dynamic) {'11':'v1','22':'1'} Nested(x UInt32, y String) [(1,'aa'),(2,'bb')] Nested(x UInt32, y Tuple(y1 UInt32, y2 Array(String)), z Nested(z1 UInt32, z2 String)) [(1,(2,['aa','bb']),[(3,'cc'),(4,'dd')]),(5,(6,['ee','ff']),[(7,'gg'),(8,'hh')])] Object(\'json\') {"1":"2"} -Object(Nullable(\'json\')) {"1":null,"2":null,"2020-10-10":null,"k1":1,"k2":2} -Object(Nullable(\'json\')) {"1":2,"2":3,"2020-10-10":null,"k1":null,"k2":null} -Object(Nullable(\'json\')) {"1":null,"2":null,"2020-10-10":"foo","k1":null,"k2":null} +Object(Nullable(\'json\')) {"k1":null,"k2":null,"1":2,"2":3,"2020-10-10":null} +Object(Nullable(\'json\')) {"k1":null,"k2":null,"1":null,"2":null,"2020-10-10":"foo"} +Object(Nullable(\'json\')) {"k1":1,"k2":2,"1":null,"2":null,"2020-10-10":null} Point (1.23,4.5600000000000005) Ring [(1.23,4.5600000000000005),(2.34,5.67)] String string @@ -259,9 +259,9 @@ Map(Dynamic, Dynamic) {'11':'v1','22':'1'} Nested(x UInt32, y String) [(1,'aa'),(2,'bb')] Nested(x UInt32, y Tuple(y1 UInt32, y2 Array(String)), z Nested(z1 UInt32, z2 String)) [(1,(2,['aa','bb']),[(3,'cc'),(4,'dd')]),(5,(6,['ee','ff']),[(7,'gg'),(8,'hh')])] Object(\'json\') {"1":"2"} -Object(Nullable(\'json\')) {"1":null,"2":null,"2020-10-10":null,"k1":1,"k2":2} -Object(Nullable(\'json\')) {"1":null,"2":null,"2020-10-10":"foo","k1":null,"k2":null} -Object(Nullable(\'json\')) {"1":2,"2":3,"2020-10-10":null,"k1":null,"k2":null} +Object(Nullable(\'json\')) {"k1":1,"k2":2,"1":null,"2":null,"2020-10-10":null} +Object(Nullable(\'json\')) {"k1":null,"k2":null,"1":2,"2":3,"2020-10-10":null} +Object(Nullable(\'json\')) {"k1":null,"k2":null,"1":null,"2":null,"2020-10-10":"foo"} Point (1.23,4.5600000000000005) Ring [(1.23,4.5600000000000005),(2.34,5.67)] String string diff --git a/tests/queries/0_stateless/03159_dynamic_type_all_types.sql b/tests/queries/0_stateless/03159_dynamic_type_all_types.sql index 38d70dee64e..64fab07ed4f 100644 --- a/tests/queries/0_stateless/03159_dynamic_type_all_types.sql +++ b/tests/queries/0_stateless/03159_dynamic_type_all_types.sql @@ -86,13 +86,13 @@ INSERT INTO t VALUES (interval '1' day), (interval '2' month), (interval '3' yea INSERT INTO t VALUES ([(1, 'aa'), (2, 'bb')]::Nested(x UInt32, y String)); INSERT INTO t VALUES ([(1, (2, ['aa', 'bb']), [(3, 'cc'), (4, 'dd')]), (5, (6, ['ee', 'ff']), [(7, 'gg'), (8, 'hh')])]::Nested(x UInt32, y Tuple(y1 UInt32, y2 Array(String)), z Nested(z1 UInt32, z2 String))); -SELECT dynamicType(d), d FROM t ORDER BY substring(dynamicType(d),1,1), length(dynamicType(d)), d ; +SELECT dynamicType(d), d FROM t ORDER BY substring(dynamicType(d),1,1), length(dynamicType(d)), d, toString(d); CREATE TABLE t2 (d Dynamic(max_types=255)) ENGINE = Memory; INSERT INTO t2 SELECT * FROM t; SELECT ''; -SELECT dynamicType(d), d FROM t2 ORDER BY substring(dynamicType(d),1,1), length(dynamicType(d)), d; +SELECT 
dynamicType(d), d FROM t2 ORDER BY substring(dynamicType(d),1,1), length(dynamicType(d)), d, toString(d); SELECT ''; SELECT uniqExact(dynamicType(d)) t_ FROM t; diff --git a/tests/queries/0_stateless/03163_dynamic_as_supertype.reference b/tests/queries/0_stateless/03163_dynamic_as_supertype.reference new file mode 100644 index 00000000000..5f1a8613a77 --- /dev/null +++ b/tests/queries/0_stateless/03163_dynamic_as_supertype.reference @@ -0,0 +1,10 @@ +str_0 Dynamic(max_types=3) String +1 Dynamic(max_types=3) UInt64 +str_2 Dynamic(max_types=3) String +3 Dynamic(max_types=3) UInt64 +str_1 String +42 UInt64 +str_2 String +43 UInt64 +2020-01-01 Date +[1,2,3] Array(Int64) diff --git a/tests/queries/0_stateless/03163_dynamic_as_supertype.sql b/tests/queries/0_stateless/03163_dynamic_as_supertype.sql new file mode 100644 index 00000000000..fbb6aa74fab --- /dev/null +++ b/tests/queries/0_stateless/03163_dynamic_as_supertype.sql @@ -0,0 +1,8 @@ +SET allow_experimental_dynamic_type=1; +SELECT if(number % 2, number::Dynamic(max_types=3), ('str_' || toString(number))::Dynamic(max_types=2)) AS d, toTypeName(d), dynamicType(d) FROM numbers(4); +CREATE TABLE dynamic_test_1 (d Dynamic(max_types=3)) ENGINE = Memory; +INSERT INTO dynamic_test_1 VALUES ('str_1'), (42::UInt64); +CREATE TABLE dynamic_test_2 (d Dynamic(max_types=5)) ENGINE = Memory; +INSERT INTO dynamic_test_2 VALUES ('str_2'), (43::UInt64), ('2020-01-01'::Date), ([1, 2, 3]); +SELECT d, dynamicType(d) FROM dynamic_test_1 UNION ALL SELECT d, dynamicType(d) FROM dynamic_test_2; + From dd6c763492d032738c922cff19c8687e05c2f542 Mon Sep 17 00:00:00 2001 From: pufit Date: Fri, 17 May 2024 17:48:06 -0400 Subject: [PATCH 175/392] Use of the redefined context in process query pipline. --- .../Transforms/buildPushingToViewsChain.cpp | 5 +-- .../Transforms/buildPushingToViewsChain.h | 3 ++ ...te_view_with_sql_security_option.reference | 1 + ...84_create_view_with_sql_security_option.sh | 35 +++++++++++++++++++ 4 files changed, 42 insertions(+), 2 deletions(-) diff --git a/src/Processors/Transforms/buildPushingToViewsChain.cpp b/src/Processors/Transforms/buildPushingToViewsChain.cpp index 5e8ecdca95e..cdcfad4442c 100644 --- a/src/Processors/Transforms/buildPushingToViewsChain.cpp +++ b/src/Processors/Transforms/buildPushingToViewsChain.cpp @@ -414,7 +414,8 @@ std::optional generateViewChain( out.getInputHeader(), view_id, nullptr, - std::move(runtime_stats)}); + std::move(runtime_stats), + insert_context}); if (type == QueryViewsLogElement::ViewType::MATERIALIZED) { @@ -590,7 +591,7 @@ Chain buildPushingToViewsChain( static QueryPipeline process(Block block, ViewRuntimeData & view, const ViewsData & views_data) { - const auto & context = views_data.context; + const auto & context = view.context; /// We create a table with the same name as original table and the same alias columns, /// but it will contain single block (that is INSERT-ed into main table). diff --git a/src/Processors/Transforms/buildPushingToViewsChain.h b/src/Processors/Transforms/buildPushingToViewsChain.h index 53aceeda1cc..a1feed91b60 100644 --- a/src/Processors/Transforms/buildPushingToViewsChain.h +++ b/src/Processors/Transforms/buildPushingToViewsChain.h @@ -33,6 +33,9 @@ struct ViewRuntimeData /// Info which is needed for query views log. std::unique_ptr runtime_stats; + /// An overridden context bounded to this view with the correct SQL security grants. 
+ ContextPtr context; + void setException(std::exception_ptr e) { exception = e; diff --git a/tests/queries/0_stateless/02884_create_view_with_sql_security_option.reference b/tests/queries/0_stateless/02884_create_view_with_sql_security_option.reference index 9ba927fa201..931cf8ac19c 100644 --- a/tests/queries/0_stateless/02884_create_view_with_sql_security_option.reference +++ b/tests/queries/0_stateless/02884_create_view_with_sql_security_option.reference @@ -24,6 +24,7 @@ OK 2 OK OK +100 ===== TestGrants ===== OK OK diff --git a/tests/queries/0_stateless/02884_create_view_with_sql_security_option.sh b/tests/queries/0_stateless/02884_create_view_with_sql_security_option.sh index 9c9df120298..62b03b5d5ff 100755 --- a/tests/queries/0_stateless/02884_create_view_with_sql_security_option.sh +++ b/tests/queries/0_stateless/02884_create_view_with_sql_security_option.sh @@ -192,6 +192,41 @@ ${CLICKHOUSE_CLIENT} --user $user1 --query " ${CLICKHOUSE_CLIENT} --query "GRANT SET DEFINER ON $user2 TO $user1" +${CLICKHOUSE_CLIENT} --multiquery < Date: Sat, 18 May 2024 01:04:20 +0000 Subject: [PATCH 176/392] Fix tests --- .../0_stateless/03159_dynamic_type_all_types.reference | 6 +++--- .../0_stateless/03163_dynamic_as_supertype.reference | 10 +++++----- .../queries/0_stateless/03163_dynamic_as_supertype.sql | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/queries/0_stateless/03159_dynamic_type_all_types.reference b/tests/queries/0_stateless/03159_dynamic_type_all_types.reference index 7dcaaa1f3ec..abecca893f9 100644 --- a/tests/queries/0_stateless/03159_dynamic_type_all_types.reference +++ b/tests/queries/0_stateless/03159_dynamic_type_all_types.reference @@ -110,9 +110,9 @@ Map(Dynamic, Dynamic) {'11':'v1','22':'1'} Nested(x UInt32, y String) [(1,'aa'),(2,'bb')] Nested(x UInt32, y Tuple(y1 UInt32, y2 Array(String)), z Nested(z1 UInt32, z2 String)) [(1,(2,['aa','bb']),[(3,'cc'),(4,'dd')]),(5,(6,['ee','ff']),[(7,'gg'),(8,'hh')])] Object(\'json\') {"1":"2"} -Object(Nullable(\'json\')) {"k1":null,"k2":null,"1":2,"2":3,"2020-10-10":null} -Object(Nullable(\'json\')) {"k1":null,"k2":null,"1":null,"2":null,"2020-10-10":"foo"} Object(Nullable(\'json\')) {"k1":1,"k2":2,"1":null,"2":null,"2020-10-10":null} +Object(Nullable(\'json\')) {"k1":null,"k2":null,"1":null,"2":null,"2020-10-10":"foo"} +Object(Nullable(\'json\')) {"k1":null,"k2":null,"1":2,"2":3,"2020-10-10":null} Point (1.23,4.5600000000000005) Ring [(1.23,4.5600000000000005),(2.34,5.67)] String string @@ -260,8 +260,8 @@ Nested(x UInt32, y String) [(1,'aa'),(2,'bb')] Nested(x UInt32, y Tuple(y1 UInt32, y2 Array(String)), z Nested(z1 UInt32, z2 String)) [(1,(2,['aa','bb']),[(3,'cc'),(4,'dd')]),(5,(6,['ee','ff']),[(7,'gg'),(8,'hh')])] Object(\'json\') {"1":"2"} Object(Nullable(\'json\')) {"k1":1,"k2":2,"1":null,"2":null,"2020-10-10":null} -Object(Nullable(\'json\')) {"k1":null,"k2":null,"1":2,"2":3,"2020-10-10":null} Object(Nullable(\'json\')) {"k1":null,"k2":null,"1":null,"2":null,"2020-10-10":"foo"} +Object(Nullable(\'json\')) {"k1":null,"k2":null,"1":2,"2":3,"2020-10-10":null} Point (1.23,4.5600000000000005) Ring [(1.23,4.5600000000000005),(2.34,5.67)] String string diff --git a/tests/queries/0_stateless/03163_dynamic_as_supertype.reference b/tests/queries/0_stateless/03163_dynamic_as_supertype.reference index 5f1a8613a77..33e3a15c7fb 100644 --- a/tests/queries/0_stateless/03163_dynamic_as_supertype.reference +++ b/tests/queries/0_stateless/03163_dynamic_as_supertype.reference @@ -2,9 +2,9 @@ str_0 Dynamic(max_types=3) String 
1 Dynamic(max_types=3) UInt64 str_2 Dynamic(max_types=3) String 3 Dynamic(max_types=3) UInt64 -str_1 String -42 UInt64 -str_2 String -43 UInt64 -2020-01-01 Date [1,2,3] Array(Int64) +2020-01-01 Date +str_1 String +str_2 String +42 UInt64 +43 UInt64 diff --git a/tests/queries/0_stateless/03163_dynamic_as_supertype.sql b/tests/queries/0_stateless/03163_dynamic_as_supertype.sql index fbb6aa74fab..baba637eea4 100644 --- a/tests/queries/0_stateless/03163_dynamic_as_supertype.sql +++ b/tests/queries/0_stateless/03163_dynamic_as_supertype.sql @@ -4,5 +4,5 @@ CREATE TABLE dynamic_test_1 (d Dynamic(max_types=3)) ENGINE = Memory; INSERT INTO dynamic_test_1 VALUES ('str_1'), (42::UInt64); CREATE TABLE dynamic_test_2 (d Dynamic(max_types=5)) ENGINE = Memory; INSERT INTO dynamic_test_2 VALUES ('str_2'), (43::UInt64), ('2020-01-01'::Date), ([1, 2, 3]); -SELECT d, dynamicType(d) FROM dynamic_test_1 UNION ALL SELECT d, dynamicType(d) FROM dynamic_test_2; +SELECT * FROM (SELECT d, dynamicType(d) FROM dynamic_test_1 UNION ALL SELECT d, dynamicType(d) FROM dynamic_test_2) order by d; From 9ba21335e4b4d157f4b1de884e87ef84e917dc62 Mon Sep 17 00:00:00 2001 From: pufit Date: Sat, 18 May 2024 12:20:24 -0400 Subject: [PATCH 177/392] fix test --- .../0_stateless/02884_create_view_with_sql_security_option.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/queries/0_stateless/02884_create_view_with_sql_security_option.sh b/tests/queries/0_stateless/02884_create_view_with_sql_security_option.sh index 62b03b5d5ff..a9a306a9e27 100755 --- a/tests/queries/0_stateless/02884_create_view_with_sql_security_option.sh +++ b/tests/queries/0_stateless/02884_create_view_with_sql_security_option.sh @@ -197,18 +197,21 @@ CREATE TABLE $db.source ( a UInt64 ) +ENGINE = MergeTree ORDER BY a; CREATE TABLE $db.destination1 ( `a` UInt64 ) +ENGINE = MergeTree ORDER BY a; CREATE TABLE $db.destination2 ( `a` UInt64 ) +ENGINE = MergeTree ORDER BY a; CREATE MATERIALIZED VIEW $db.mv1 TO $db.destination1 From 3a79b1facc63aa9ae3a8deb986bd00cf51c14c1f Mon Sep 17 00:00:00 2001 From: pufit Date: Sat, 18 May 2024 17:15:01 -0400 Subject: [PATCH 178/392] fix test --- .../0_stateless/02884_create_view_with_sql_security_option.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/02884_create_view_with_sql_security_option.sh b/tests/queries/0_stateless/02884_create_view_with_sql_security_option.sh index a9a306a9e27..f1da343da36 100755 --- a/tests/queries/0_stateless/02884_create_view_with_sql_security_option.sh +++ b/tests/queries/0_stateless/02884_create_view_with_sql_security_option.sh @@ -202,14 +202,14 @@ ORDER BY a; CREATE TABLE $db.destination1 ( - `a` UInt64 + a UInt64 ) ENGINE = MergeTree ORDER BY a; CREATE TABLE $db.destination2 ( - `a` UInt64 + a UInt64 ) ENGINE = MergeTree ORDER BY a; From 79b3f52dc5189d6def125cf5ed9b1fb2e37267e4 Mon Sep 17 00:00:00 2001 From: Yakov Olkhovskiy Date: Sat, 18 May 2024 23:18:41 +0000 Subject: [PATCH 179/392] only interpolate expression should be used for DAG --- src/Planner/PlannerExpressionAnalysis.cpp | 7 +++---- .../03155_analyzer_interpolate.reference | 13 +++++++++++++ .../0_stateless/03155_analyzer_interpolate.sql | 7 +++++++ 3 files changed, 23 insertions(+), 4 deletions(-) create mode 100644 tests/queries/0_stateless/03155_analyzer_interpolate.reference create mode 100644 tests/queries/0_stateless/03155_analyzer_interpolate.sql diff --git a/src/Planner/PlannerExpressionAnalysis.cpp b/src/Planner/PlannerExpressionAnalysis.cpp index 
6e194b2c03e..6ff56f36933 100644 --- a/src/Planner/PlannerExpressionAnalysis.cpp +++ b/src/Planner/PlannerExpressionAnalysis.cpp @@ -439,20 +439,19 @@ SortAnalysisResult analyzeSort(const QueryNode & query_node, auto & interpolate_list_node = query_node.getInterpolate()->as(); PlannerActionsVisitor interpolate_actions_visitor(planner_context); - auto interpolate_actions_dag = std::make_shared(); + auto interpolate_expression_dag = std::make_shared(); for (auto & interpolate_node : interpolate_list_node.getNodes()) { auto & interpolate_node_typed = interpolate_node->as(); - interpolate_actions_visitor.visit(interpolate_actions_dag, interpolate_node_typed.getExpression()); - interpolate_actions_visitor.visit(interpolate_actions_dag, interpolate_node_typed.getInterpolateExpression()); + interpolate_actions_visitor.visit(interpolate_expression_dag, interpolate_node_typed.getInterpolateExpression()); } std::unordered_map before_sort_actions_inputs_name_to_node; for (const auto & node : before_sort_actions->getInputs()) before_sort_actions_inputs_name_to_node.emplace(node->result_name, node); - for (const auto & node : interpolate_actions_dag->getNodes()) + for (const auto & node : interpolate_expression_dag->getNodes()) { if (before_sort_actions_dag_output_node_names.contains(node.result_name) || node.type != ActionsDAG::ActionType::INPUT) diff --git a/tests/queries/0_stateless/03155_analyzer_interpolate.reference b/tests/queries/0_stateless/03155_analyzer_interpolate.reference new file mode 100644 index 00000000000..791aaa5b2a2 --- /dev/null +++ b/tests/queries/0_stateless/03155_analyzer_interpolate.reference @@ -0,0 +1,13 @@ +0 [5] +0.5 [5] +1 [1] +1.5 [5] +2 [5] +2.5 [5] +3 [5] +3.5 [5] +4 [4] +4.5 [5] +5 [5] +5.5 [5] +7 [7] diff --git a/tests/queries/0_stateless/03155_analyzer_interpolate.sql b/tests/queries/0_stateless/03155_analyzer_interpolate.sql new file mode 100644 index 00000000000..9b56106f2b4 --- /dev/null +++ b/tests/queries/0_stateless/03155_analyzer_interpolate.sql @@ -0,0 +1,7 @@ +-- https://github.com/ClickHouse/ClickHouse/issues/62464 +SET allow_experimental_analyzer = 1; + +SELECT n, [number] as inter FROM ( + SELECT toFloat32(number % 10) AS n, number + FROM numbers(10) WHERE number % 3 = 1 +) group by n, inter ORDER BY n WITH FILL FROM 0 TO 5.51 STEP 0.5 INTERPOLATE (inter AS [5]); From a67418bcc8abb685a1c0271f8f34d5434bb0a113 Mon Sep 17 00:00:00 2001 From: Yakov Olkhovskiy Date: Sun, 19 May 2024 07:14:37 +0000 Subject: [PATCH 180/392] add NOT_AN_AGGREGATE exception for interpolate expression columns --- src/Planner/PlannerExpressionAnalysis.cpp | 16 ++++++++++++++-- .../0_stateless/03155_analyzer_interpolate.sql | 9 +++++++-- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/src/Planner/PlannerExpressionAnalysis.cpp b/src/Planner/PlannerExpressionAnalysis.cpp index 6ff56f36933..e7d553af944 100644 --- a/src/Planner/PlannerExpressionAnalysis.cpp +++ b/src/Planner/PlannerExpressionAnalysis.cpp @@ -28,6 +28,7 @@ namespace DB namespace ErrorCodes { extern const int LOGICAL_ERROR; + extern const int NOT_AN_AGGREGATE; } namespace @@ -397,7 +398,8 @@ ProjectionAnalysisResult analyzeProjection(const QueryNode & query_node, SortAnalysisResult analyzeSort(const QueryNode & query_node, const ColumnsWithTypeAndName & input_columns, const PlannerContextPtr & planner_context, - ActionsChain & actions_chain) + ActionsChain & actions_chain, + std::optional aggregation_analysis_result_optional) { ActionsDAGPtr before_sort_actions = std::make_shared(input_columns); auto & 
before_sort_actions_outputs = before_sort_actions->getOutputs(); @@ -451,6 +453,10 @@ SortAnalysisResult analyzeSort(const QueryNode & query_node, for (const auto & node : before_sort_actions->getInputs()) before_sort_actions_inputs_name_to_node.emplace(node->result_name, node); + std::unordered_set aggregation_keys; + if (aggregation_analysis_result_optional) + aggregation_keys.insert(aggregation_analysis_result_optional->aggregation_keys.begin(), aggregation_analysis_result_optional->aggregation_keys.end()); + for (const auto & node : interpolate_expression_dag->getNodes()) { if (before_sort_actions_dag_output_node_names.contains(node.result_name) || @@ -466,6 +472,12 @@ SortAnalysisResult analyzeSort(const QueryNode & query_node, input_node_it = it; } + if (aggregation_analysis_result_optional) + if (!aggregation_keys.contains(node.result_name)) + throw Exception(ErrorCodes::NOT_AN_AGGREGATE, + "Column {} is not under aggregate function and not in GROUP BY keys. In query {}", + node.result_name, query_node.formatASTForErrorMessage()); + before_sort_actions_outputs.push_back(input_node_it->second); before_sort_actions_dag_output_node_names.insert(node.result_name); } @@ -567,7 +579,7 @@ PlannerExpressionsAnalysisResult buildExpressionAnalysisResult(const QueryTreeNo std::optional sort_analysis_result_optional; if (query_node.hasOrderBy()) { - sort_analysis_result_optional = analyzeSort(query_node, current_output_columns, planner_context, actions_chain); + sort_analysis_result_optional = analyzeSort(query_node, current_output_columns, planner_context, actions_chain, aggregation_analysis_result_optional); current_output_columns = actions_chain.getLastStepAvailableOutputColumns(); } diff --git a/tests/queries/0_stateless/03155_analyzer_interpolate.sql b/tests/queries/0_stateless/03155_analyzer_interpolate.sql index 9b56106f2b4..b3c1d233f47 100644 --- a/tests/queries/0_stateless/03155_analyzer_interpolate.sql +++ b/tests/queries/0_stateless/03155_analyzer_interpolate.sql @@ -1,7 +1,12 @@ -- https://github.com/ClickHouse/ClickHouse/issues/62464 SET allow_experimental_analyzer = 1; -SELECT n, [number] as inter FROM ( +SELECT n, [number] AS inter FROM ( SELECT toFloat32(number % 10) AS n, number FROM numbers(10) WHERE number % 3 = 1 -) group by n, inter ORDER BY n WITH FILL FROM 0 TO 5.51 STEP 0.5 INTERPOLATE (inter AS [5]); +) GROUP BY n, inter ORDER BY n WITH FILL FROM 0 TO 5.51 STEP 0.5 INTERPOLATE (inter AS [5]); + +SELECT n, number+5 AS inter FROM ( -- { serverError NOT_AN_AGGREGATE } + SELECT toFloat32(number % 10) AS n, number, number*2 AS mn + FROM numbers(10) WHERE number % 3 = 1 +) GROUP BY n, inter ORDER BY n WITH FILL FROM 0 TO 5.51 STEP 0.5 INTERPOLATE (inter AS mn * 2); From f065128ef2d67dfa4709f5d783d3c5a33b6f1e42 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Sun, 19 May 2024 07:16:07 +0000 Subject: [PATCH 181/392] Fix style --- src/Compression/CompressionCodecDoubleDelta.cpp | 5 +++++ src/Coordination/KeeperServer.cpp | 1 - 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/Compression/CompressionCodecDoubleDelta.cpp b/src/Compression/CompressionCodecDoubleDelta.cpp index 443b9d33532..cbd8cd57a62 100644 --- a/src/Compression/CompressionCodecDoubleDelta.cpp +++ b/src/Compression/CompressionCodecDoubleDelta.cpp @@ -21,6 +21,11 @@ namespace DB { +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; +} + /** NOTE DoubleDelta is surprisingly bad name. The only excuse is that it comes from an academic paper. 
* Most people will think that "double delta" is just applying delta transform twice. * But in fact it is something more than applying delta transform twice. diff --git a/src/Coordination/KeeperServer.cpp b/src/Coordination/KeeperServer.cpp index 953072c5b0e..b07c90b8660 100644 --- a/src/Coordination/KeeperServer.cpp +++ b/src/Coordination/KeeperServer.cpp @@ -45,7 +45,6 @@ namespace ErrorCodes extern const int SUPPORT_IS_DISABLED; extern const int LOGICAL_ERROR; extern const int INVALID_CONFIG_PARAMETER; - extern const int UNEXPECTED_ZOOKEEPER_ERROR; } using namespace std::chrono_literals; From 113bb0000510b30c0845593911baa6d72cd5fb20 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Sun, 19 May 2024 08:34:59 +0000 Subject: [PATCH 182/392] Fix clang-tidy "-readability-redundant-inline-specifier" --- .clang-tidy | 1 - base/base/BorrowedObjectPool.h | 14 ++--- .../library-bridge/LibraryBridgeHandlers.h | 2 +- programs/server/MetricsTransmitter.h | 8 +-- .../AggregateFunctionSequenceNextNode.cpp | 2 +- .../Combinators/AggregateFunctionIf.cpp | 4 +- src/AggregateFunctions/QuantileTDigest.h | 2 +- src/AggregateFunctions/QuantileTiming.h | 2 +- src/AggregateFunctions/ThetaSketchData.h | 4 +- src/AggregateFunctions/UniqVariadicHash.h | 8 +-- src/AggregateFunctions/UniquesHashSet.h | 10 ++-- ...egateFunctionsArithmericOperationsPass.cpp | 4 +- .../Passes/ComparisonTupleEliminationPass.cpp | 2 +- .../Passes/FunctionToSubcolumnsPass.cpp | 2 +- .../Passes/NormalizeCountVariantsPass.cpp | 2 +- .../RewriteAggregateFunctionWithIfPass.cpp | 2 +- .../RewriteSumFunctionWithSumAndCountPass.cpp | 2 +- src/Analyzer/Passes/SumIfToCountIfPass.cpp | 4 +- .../CatBoostLibraryBridgeHelper.h | 14 ++--- .../ExternalDictionaryLibraryBridgeHelper.h | 20 +++---- src/BridgeHelper/IBridgeHelper.h | 6 +- src/BridgeHelper/LibraryBridgeHelper.h | 2 +- src/BridgeHelper/XDBCBridgeHelper.h | 16 +++--- src/Common/CPUID.h | 4 +- src/Common/ColumnsHashingImpl.h | 2 +- src/Common/CombinedCardinalityEstimator.h | 6 +- src/Common/CompactArray.h | 2 +- src/Common/CounterInFile.h | 2 +- src/Common/CurrentThread.h | 4 +- src/Common/HashTable/FixedHashTable.h | 2 +- src/Common/HashTable/HashTable.h | 2 +- src/Common/HashTable/PackedHashMap.h | 2 +- src/Common/HashTable/SmallTable.h | 2 +- src/Common/HyperLogLogCounter.h | 20 +++---- src/Common/IntervalTree.h | 18 +++--- src/Common/JSONParsers/SimdJSONParser.h | 36 ++++++------ src/Common/PODArray.h | 2 +- src/Common/PoolBase.h | 2 +- src/Common/RadixSort.h | 4 +- src/Common/SpaceSaving.h | 4 +- src/Common/ThreadProfileEvents.h | 2 +- src/Common/Volnitsky.h | 18 +++--- src/Common/ZooKeeper/IKeeper.h | 6 +- src/Common/findExtreme.cpp | 4 +- src/Core/Field.h | 4 +- src/Core/Joins.h | 24 ++++---- src/Daemon/BaseDaemon.h | 2 +- src/DataTypes/DataTypeDecimalBase.h | 2 +- src/Dictionaries/CacheDictionaryStorage.h | 8 +-- src/Dictionaries/DictionaryHelpers.h | 8 +-- src/Dictionaries/Embedded/RegionsNames.h | 4 +- src/Dictionaries/ICacheDictionaryStorage.h | 16 +++--- src/Dictionaries/IPAddressDictionary.cpp | 2 +- src/Dictionaries/RegExpTreeDictionary.cpp | 4 +- src/Dictionaries/SSDCacheDictionaryStorage.h | 56 +++++++++---------- src/Disks/IO/IOUringReader.h | 4 +- src/Functions/DivisionUtils.h | 6 +- src/Functions/ExtractString.h | 6 +- src/Functions/FunctionBinaryArithmetic.h | 8 +-- src/Functions/FunctionSQLJSON.h | 20 +++---- src/Functions/FunctionsAES.h | 4 +- src/Functions/FunctionsBitToArray.cpp | 2 +- src/Functions/FunctionsCodingIP.cpp | 4 +- 
src/Functions/FunctionsConsistentHashing.h | 2 +- .../FunctionsLanguageClassification.cpp | 2 +- src/Functions/FunctionsLogical.cpp | 8 +-- src/Functions/FunctionsLogical.h | 42 +++++++------- .../FunctionsProgrammingClassification.cpp | 2 +- src/Functions/FunctionsRound.h | 2 +- src/Functions/FunctionsStringHash.cpp | 20 +++---- src/Functions/FunctionsStringSimilarity.cpp | 8 +-- src/Functions/FunctionsTimeWindow.h | 8 +-- .../FunctionsTonalityClassification.cpp | 2 +- src/Functions/GCDLCMImpl.h | 2 +- src/Functions/GregorianDate.cpp | 10 ++-- src/Functions/PolygonUtils.h | 2 +- src/Functions/TransformDateTime64.h | 8 +-- src/Functions/abs.cpp | 2 +- src/Functions/array/arrayIndex.h | 16 +++--- src/Functions/array/arrayNorm.cpp | 26 ++++----- src/Functions/bitAnd.cpp | 4 +- src/Functions/bitBoolMaskAnd.cpp | 2 +- src/Functions/bitBoolMaskOr.cpp | 2 +- src/Functions/bitCount.cpp | 2 +- src/Functions/bitHammingDistance.cpp | 2 +- src/Functions/bitNot.cpp | 4 +- src/Functions/bitOr.cpp | 4 +- src/Functions/bitRotateLeft.cpp | 4 +- src/Functions/bitRotateRight.cpp | 4 +- src/Functions/bitShiftLeft.cpp | 4 +- src/Functions/bitShiftRight.cpp | 6 +- src/Functions/bitSwapLastTwo.cpp | 4 +- src/Functions/bitTest.cpp | 2 +- src/Functions/bitTestAll.cpp | 2 +- src/Functions/bitTestAny.cpp | 2 +- src/Functions/bitWrapperFunc.cpp | 2 +- src/Functions/bitXor.cpp | 4 +- src/Functions/dateName.cpp | 18 +++--- src/Functions/divide.cpp | 4 +- src/Functions/divideDecimal.cpp | 2 +- src/Functions/factorial.cpp | 2 +- src/Functions/greatCircleDistance.cpp | 10 ++-- src/Functions/greatest.cpp | 6 +- src/Functions/h3GetUnidirectionalEdge.cpp | 2 +- src/Functions/initialQueryID.cpp | 6 +- src/Functions/intDiv.cpp | 2 +- src/Functions/intDivOrZero.cpp | 2 +- src/Functions/intExp10.cpp | 2 +- src/Functions/intExp2.cpp | 4 +- src/Functions/isValidUTF8.cpp | 4 +- src/Functions/jumpConsistentHash.cpp | 2 +- src/Functions/kostikConsistentHash.cpp | 2 +- src/Functions/least.cpp | 6 +- src/Functions/minus.cpp | 6 +- src/Functions/modulo.cpp | 2 +- src/Functions/moduloOrZero.cpp | 2 +- src/Functions/multiply.cpp | 6 +- src/Functions/multiplyDecimal.cpp | 2 +- src/Functions/negate.cpp | 4 +- src/Functions/plus.cpp | 6 +- src/Functions/queryID.cpp | 6 +- src/Functions/repeat.cpp | 4 +- src/Functions/roundAge.cpp | 2 +- src/Functions/roundDuration.cpp | 2 +- src/Functions/roundToExp2.cpp | 2 +- src/Functions/sign.cpp | 2 +- src/Functions/space.cpp | 2 +- src/Functions/tokenExtractors.cpp | 2 +- src/IO/BufferBase.h | 24 ++++---- src/IO/HTTPHeaderEntries.h | 2 +- src/IO/HadoopSnappyReadBuffer.h | 4 +- src/IO/IReadableWriteBuffer.h | 2 +- src/IO/PeekableReadBuffer.h | 6 +- src/IO/ReadBuffer.h | 2 +- src/IO/S3/Requests.h | 2 +- src/IO/WriteBuffer.h | 6 +- src/IO/ZstdDeflatingAppendableWriteBuffer.h | 2 +- src/Interpreters/DDLTask.h | 8 +-- src/Interpreters/DatabaseCatalog.h | 2 +- src/Interpreters/JIT/CHJIT.cpp | 14 ++--- src/Interpreters/JIT/CHJIT.h | 2 +- src/Interpreters/JIT/CompileDAG.h | 16 +++--- src/Interpreters/JoinUtils.h | 2 +- .../examples/hash_map_string_3.cpp | 2 +- .../Impl/CustomSeparatedRowInputFormat.h | 2 +- .../Formats/Impl/TemplateRowInputFormat.h | 2 +- src/Processors/Port.h | 6 +- src/Server/HTTPHandler.h | 6 +- src/Storages/Cache/ExternalDataSourceCache.h | 2 +- src/Storages/Cache/RemoteCacheController.h | 20 +++---- src/Storages/Hive/HiveFile.h | 4 +- src/Storages/Kafka/KafkaConsumer.h | 6 +- .../MergeTree/BackgroundProcessList.h | 2 +- src/Storages/MergeTree/IMergeTreeDataPart.h | 14 ++--- 
.../MergeTree/MergeTreeBlockReadUtils.h | 8 +-- .../MergeTree/MergeTreeIndexGranularityInfo.h | 4 +- src/Storages/StorageReplicatedMergeTree.h | 2 +- src/Storages/UVLoop.h | 4 +- src/TableFunctions/ITableFunction.h | 2 +- 159 files changed, 490 insertions(+), 491 deletions(-) diff --git a/.clang-tidy b/.clang-tidy index e2f318562ec..66417c41c46 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -129,7 +129,6 @@ Checks: [ '-readability-avoid-nested-conditional-operator', '-modernize-use-designated-initializers', '-performance-enum-size', - '-readability-redundant-inline-specifier', '-readability-redundant-member-init', '-bugprone-crtp-constructor-accessibility', '-bugprone-suspicious-stringview-data-usage', diff --git a/base/base/BorrowedObjectPool.h b/base/base/BorrowedObjectPool.h index 05a23d5835e..f5ef28582b2 100644 --- a/base/base/BorrowedObjectPool.h +++ b/base/base/BorrowedObjectPool.h @@ -86,7 +86,7 @@ public: } /// Return object into pool. Client must return same object that was borrowed. - inline void returnObject(T && object_to_return) + void returnObject(T && object_to_return) { { std::lock_guard lock(objects_mutex); @@ -99,20 +99,20 @@ public: } /// Max pool size - inline size_t maxSize() const + size_t maxSize() const { return max_size; } /// Allocated objects size by the pool. If allocatedObjectsSize == maxSize then pool is full. - inline size_t allocatedObjectsSize() const + size_t allocatedObjectsSize() const { std::lock_guard lock(objects_mutex); return allocated_objects_size; } /// Returns allocatedObjectsSize == maxSize - inline bool isFull() const + bool isFull() const { std::lock_guard lock(objects_mutex); return allocated_objects_size == max_size; @@ -120,7 +120,7 @@ public: /// Borrowed objects size. If borrowedObjectsSize == allocatedObjectsSize and pool is full. /// Then client will wait during borrowObject function call. 
- inline size_t borrowedObjectsSize() const + size_t borrowedObjectsSize() const { std::lock_guard lock(objects_mutex); return borrowed_objects_size; @@ -129,7 +129,7 @@ public: private: template - inline T allocateObjectForBorrowing(const std::unique_lock &, FactoryFunc && func) + T allocateObjectForBorrowing(const std::unique_lock &, FactoryFunc && func) { ++allocated_objects_size; ++borrowed_objects_size; @@ -137,7 +137,7 @@ private: return std::forward(func)(); } - inline T borrowFromObjects(const std::unique_lock &) + T borrowFromObjects(const std::unique_lock &) { T dst; detail::moveOrCopyIfThrow(std::move(objects.back()), dst); diff --git a/programs/library-bridge/LibraryBridgeHandlers.h b/programs/library-bridge/LibraryBridgeHandlers.h index 1db71eb24cb..62fbf2caede 100644 --- a/programs/library-bridge/LibraryBridgeHandlers.h +++ b/programs/library-bridge/LibraryBridgeHandlers.h @@ -23,7 +23,7 @@ public: void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event & write_event) override; private: - static constexpr inline auto FORMAT = "RowBinary"; + static constexpr auto FORMAT = "RowBinary"; const size_t keep_alive_timeout; LoggerPtr log; diff --git a/programs/server/MetricsTransmitter.h b/programs/server/MetricsTransmitter.h index 23420117b56..24069a60071 100644 --- a/programs/server/MetricsTransmitter.h +++ b/programs/server/MetricsTransmitter.h @@ -56,10 +56,10 @@ private: std::condition_variable cond; std::optional thread; - static inline constexpr auto profile_events_path_prefix = "ClickHouse.ProfileEvents."; - static inline constexpr auto profile_events_cumulative_path_prefix = "ClickHouse.ProfileEventsCumulative."; - static inline constexpr auto current_metrics_path_prefix = "ClickHouse.Metrics."; - static inline constexpr auto asynchronous_metrics_path_prefix = "ClickHouse.AsynchronousMetrics."; + static constexpr auto profile_events_path_prefix = "ClickHouse.ProfileEvents."; + static constexpr auto profile_events_cumulative_path_prefix = "ClickHouse.ProfileEventsCumulative."; + static constexpr auto current_metrics_path_prefix = "ClickHouse.Metrics."; + static constexpr auto asynchronous_metrics_path_prefix = "ClickHouse.AsynchronousMetrics."; }; } diff --git a/src/AggregateFunctions/AggregateFunctionSequenceNextNode.cpp b/src/AggregateFunctions/AggregateFunctionSequenceNextNode.cpp index bed10333af0..b3824720b04 100644 --- a/src/AggregateFunctions/AggregateFunctionSequenceNextNode.cpp +++ b/src/AggregateFunctions/AggregateFunctionSequenceNextNode.cpp @@ -341,7 +341,7 @@ public: value[i] = Node::read(buf, arena); } - inline std::optional getBaseIndex(Data & data) const + std::optional getBaseIndex(Data & data) const { if (data.value.size() == 0) return {}; diff --git a/src/AggregateFunctions/Combinators/AggregateFunctionIf.cpp b/src/AggregateFunctions/Combinators/AggregateFunctionIf.cpp index 9b5ee79a533..3e21ffa3418 100644 --- a/src/AggregateFunctions/Combinators/AggregateFunctionIf.cpp +++ b/src/AggregateFunctions/Combinators/AggregateFunctionIf.cpp @@ -73,7 +73,7 @@ private: using Base = AggregateFunctionNullBase>; - inline bool singleFilter(const IColumn ** columns, size_t row_num) const + bool singleFilter(const IColumn ** columns, size_t row_num) const { const IColumn * filter_column = columns[num_arguments - 1]; @@ -261,7 +261,7 @@ public: filter_is_only_null = arguments.back()->onlyNull(); } - static inline bool singleFilter(const IColumn ** columns, size_t row_num, size_t num_arguments) + static bool 
singleFilter(const IColumn ** columns, size_t row_num, size_t num_arguments) { return assert_cast(*columns[num_arguments - 1]).getData()[row_num]; } diff --git a/src/AggregateFunctions/QuantileTDigest.h b/src/AggregateFunctions/QuantileTDigest.h index 9d84f079daa..d5a4f6b576a 100644 --- a/src/AggregateFunctions/QuantileTDigest.h +++ b/src/AggregateFunctions/QuantileTDigest.h @@ -138,7 +138,7 @@ class QuantileTDigest compress(); } - inline bool canBeMerged(const BetterFloat & l_mean, const Value & r_mean) + bool canBeMerged(const BetterFloat & l_mean, const Value & r_mean) { return l_mean == r_mean || (!std::isinf(l_mean) && !std::isinf(r_mean)); } diff --git a/src/AggregateFunctions/QuantileTiming.h b/src/AggregateFunctions/QuantileTiming.h index 45fbf38258f..eef15828fc0 100644 --- a/src/AggregateFunctions/QuantileTiming.h +++ b/src/AggregateFunctions/QuantileTiming.h @@ -262,7 +262,7 @@ namespace detail UInt64 count_big[BIG_SIZE]; /// Get value of quantile by index in array `count_big`. - static inline UInt16 indexInBigToValue(size_t i) + static UInt16 indexInBigToValue(size_t i) { return (i * BIG_PRECISION) + SMALL_THRESHOLD + (intHash32<0>(i) % BIG_PRECISION - (BIG_PRECISION / 2)); /// A small randomization so that it is not noticeable that all the values are even. diff --git a/src/AggregateFunctions/ThetaSketchData.h b/src/AggregateFunctions/ThetaSketchData.h index f32386d945b..99dca27673d 100644 --- a/src/AggregateFunctions/ThetaSketchData.h +++ b/src/AggregateFunctions/ThetaSketchData.h @@ -24,14 +24,14 @@ private: std::unique_ptr sk_update; std::unique_ptr sk_union; - inline datasketches::update_theta_sketch * getSkUpdate() + datasketches::update_theta_sketch * getSkUpdate() { if (!sk_update) sk_update = std::make_unique(datasketches::update_theta_sketch::builder().build()); return sk_update.get(); } - inline datasketches::theta_union * getSkUnion() + datasketches::theta_union * getSkUnion() { if (!sk_union) sk_union = std::make_unique(datasketches::theta_union::builder().build()); diff --git a/src/AggregateFunctions/UniqVariadicHash.h b/src/AggregateFunctions/UniqVariadicHash.h index 840380e7f0f..5bb245397d4 100644 --- a/src/AggregateFunctions/UniqVariadicHash.h +++ b/src/AggregateFunctions/UniqVariadicHash.h @@ -38,7 +38,7 @@ bool isAllArgumentsContiguousInMemory(const DataTypes & argument_types); template <> struct UniqVariadicHash { - static inline UInt64 apply(size_t num_args, const IColumn ** columns, size_t row_num) + static UInt64 apply(size_t num_args, const IColumn ** columns, size_t row_num) { UInt64 hash; @@ -65,7 +65,7 @@ struct UniqVariadicHash template <> struct UniqVariadicHash { - static inline UInt64 apply(size_t num_args, const IColumn ** columns, size_t row_num) + static UInt64 apply(size_t num_args, const IColumn ** columns, size_t row_num) { UInt64 hash; @@ -94,7 +94,7 @@ struct UniqVariadicHash template <> struct UniqVariadicHash { - static inline UInt128 apply(size_t num_args, const IColumn ** columns, size_t row_num) + static UInt128 apply(size_t num_args, const IColumn ** columns, size_t row_num) { const IColumn ** column = columns; const IColumn ** columns_end = column + num_args; @@ -114,7 +114,7 @@ struct UniqVariadicHash template <> struct UniqVariadicHash { - static inline UInt128 apply(size_t num_args, const IColumn ** columns, size_t row_num) + static UInt128 apply(size_t num_args, const IColumn ** columns, size_t row_num) { const auto & tuple_columns = assert_cast(columns[0])->getColumns(); diff --git a/src/AggregateFunctions/UniquesHashSet.h 
b/src/AggregateFunctions/UniquesHashSet.h index d6fc2bb6634..d5241547711 100644 --- a/src/AggregateFunctions/UniquesHashSet.h +++ b/src/AggregateFunctions/UniquesHashSet.h @@ -105,14 +105,14 @@ private: } } - inline size_t buf_size() const { return 1ULL << size_degree; } /// NOLINT - inline size_t max_fill() const { return 1ULL << (size_degree - 1); } /// NOLINT - inline size_t mask() const { return buf_size() - 1; } + size_t buf_size() const { return 1ULL << size_degree; } /// NOLINT + size_t max_fill() const { return 1ULL << (size_degree - 1); } /// NOLINT + size_t mask() const { return buf_size() - 1; } - inline size_t place(HashValue x) const { return (x >> UNIQUES_HASH_BITS_FOR_SKIP) & mask(); } + size_t place(HashValue x) const { return (x >> UNIQUES_HASH_BITS_FOR_SKIP) & mask(); } /// The value is divided by 2 ^ skip_degree - inline bool good(HashValue hash) const { return hash == ((hash >> skip_degree) << skip_degree); } + bool good(HashValue hash) const { return hash == ((hash >> skip_degree) << skip_degree); } HashValue hash(Value key) const { return static_cast(Hash()(key)); } diff --git a/src/Analyzer/Passes/AggregateFunctionsArithmericOperationsPass.cpp b/src/Analyzer/Passes/AggregateFunctionsArithmericOperationsPass.cpp index f96ba22eb7a..9153bc4eca2 100644 --- a/src/Analyzer/Passes/AggregateFunctionsArithmericOperationsPass.cpp +++ b/src/Analyzer/Passes/AggregateFunctionsArithmericOperationsPass.cpp @@ -173,13 +173,13 @@ private: return arithmetic_function_clone; } - inline void resolveOrdinaryFunctionNode(FunctionNode & function_node, const String & function_name) const + void resolveOrdinaryFunctionNode(FunctionNode & function_node, const String & function_name) const { auto function = FunctionFactory::instance().get(function_name, getContext()); function_node.resolveAsFunction(function->build(function_node.getArgumentColumns())); } - static inline void resolveAggregateFunctionNode(FunctionNode & function_node, const QueryTreeNodePtr & argument, const String & aggregate_function_name) + static void resolveAggregateFunctionNode(FunctionNode & function_node, const QueryTreeNodePtr & argument, const String & aggregate_function_name) { auto function_aggregate_function = function_node.getAggregateFunction(); diff --git a/src/Analyzer/Passes/ComparisonTupleEliminationPass.cpp b/src/Analyzer/Passes/ComparisonTupleEliminationPass.cpp index f8233f473f8..ebefc12ae53 100644 --- a/src/Analyzer/Passes/ComparisonTupleEliminationPass.cpp +++ b/src/Analyzer/Passes/ComparisonTupleEliminationPass.cpp @@ -184,7 +184,7 @@ private: return result_function; } - inline QueryTreeNodePtr makeEqualsFunction(QueryTreeNodePtr lhs_argument, QueryTreeNodePtr rhs_argument) const + QueryTreeNodePtr makeEqualsFunction(QueryTreeNodePtr lhs_argument, QueryTreeNodePtr rhs_argument) const { return makeComparisonFunction(std::move(lhs_argument), std::move(rhs_argument), "equals"); } diff --git a/src/Analyzer/Passes/FunctionToSubcolumnsPass.cpp b/src/Analyzer/Passes/FunctionToSubcolumnsPass.cpp index 6248f462979..15ac8d642a4 100644 --- a/src/Analyzer/Passes/FunctionToSubcolumnsPass.cpp +++ b/src/Analyzer/Passes/FunctionToSubcolumnsPass.cpp @@ -215,7 +215,7 @@ public: } private: - inline void resolveOrdinaryFunctionNode(FunctionNode & function_node, const String & function_name) const + void resolveOrdinaryFunctionNode(FunctionNode & function_node, const String & function_name) const { auto function = FunctionFactory::instance().get(function_name, getContext()); 
function_node.resolveAsFunction(function->build(function_node.getArgumentColumns())); diff --git a/src/Analyzer/Passes/NormalizeCountVariantsPass.cpp b/src/Analyzer/Passes/NormalizeCountVariantsPass.cpp index 0d6f3fc2d87..e70e08e65f4 100644 --- a/src/Analyzer/Passes/NormalizeCountVariantsPass.cpp +++ b/src/Analyzer/Passes/NormalizeCountVariantsPass.cpp @@ -59,7 +59,7 @@ public: } } private: - static inline void resolveAsCountAggregateFunction(FunctionNode & function_node) + static void resolveAsCountAggregateFunction(FunctionNode & function_node) { AggregateFunctionProperties properties; auto aggregate_function = AggregateFunctionFactory::instance().get("count", NullsAction::EMPTY, {}, {}, properties); diff --git a/src/Analyzer/Passes/RewriteAggregateFunctionWithIfPass.cpp b/src/Analyzer/Passes/RewriteAggregateFunctionWithIfPass.cpp index 513dd0054d6..a82ad3dced1 100644 --- a/src/Analyzer/Passes/RewriteAggregateFunctionWithIfPass.cpp +++ b/src/Analyzer/Passes/RewriteAggregateFunctionWithIfPass.cpp @@ -108,7 +108,7 @@ public: } private: - static inline void resolveAsAggregateFunctionWithIf(FunctionNode & function_node, const DataTypes & argument_types) + static void resolveAsAggregateFunctionWithIf(FunctionNode & function_node, const DataTypes & argument_types) { auto result_type = function_node.getResultType(); diff --git a/src/Analyzer/Passes/RewriteSumFunctionWithSumAndCountPass.cpp b/src/Analyzer/Passes/RewriteSumFunctionWithSumAndCountPass.cpp index 917256bf4b1..5646d26f7f6 100644 --- a/src/Analyzer/Passes/RewriteSumFunctionWithSumAndCountPass.cpp +++ b/src/Analyzer/Passes/RewriteSumFunctionWithSumAndCountPass.cpp @@ -110,7 +110,7 @@ private: function_node.resolveAsFunction(function->build(function_node.getArgumentColumns())); } - static inline void resolveAsAggregateFunctionNode(FunctionNode & function_node, const DataTypePtr & argument_type) + static void resolveAsAggregateFunctionNode(FunctionNode & function_node, const DataTypePtr & argument_type) { AggregateFunctionProperties properties; const auto aggregate_function = AggregateFunctionFactory::instance().get(function_node.getFunctionName(), diff --git a/src/Analyzer/Passes/SumIfToCountIfPass.cpp b/src/Analyzer/Passes/SumIfToCountIfPass.cpp index 1a4712aa697..852cbe75c4a 100644 --- a/src/Analyzer/Passes/SumIfToCountIfPass.cpp +++ b/src/Analyzer/Passes/SumIfToCountIfPass.cpp @@ -156,7 +156,7 @@ public: } private: - static inline void resolveAsCountIfAggregateFunction(FunctionNode & function_node, const DataTypePtr & argument_type) + static void resolveAsCountIfAggregateFunction(FunctionNode & function_node, const DataTypePtr & argument_type) { AggregateFunctionProperties properties; auto aggregate_function = AggregateFunctionFactory::instance().get( @@ -165,7 +165,7 @@ private: function_node.resolveAsAggregateFunction(std::move(aggregate_function)); } - inline QueryTreeNodePtr getMultiplyFunction(QueryTreeNodePtr left, QueryTreeNodePtr right) + QueryTreeNodePtr getMultiplyFunction(QueryTreeNodePtr left, QueryTreeNodePtr right) { auto multiply_function_node = std::make_shared("multiply"); auto & multiply_arguments_nodes = multiply_function_node->getArguments().getNodes(); diff --git a/src/BridgeHelper/CatBoostLibraryBridgeHelper.h b/src/BridgeHelper/CatBoostLibraryBridgeHelper.h index 55dfd715f00..5d5c6d01705 100644 --- a/src/BridgeHelper/CatBoostLibraryBridgeHelper.h +++ b/src/BridgeHelper/CatBoostLibraryBridgeHelper.h @@ -14,8 +14,8 @@ namespace DB class CatBoostLibraryBridgeHelper final : public LibraryBridgeHelper { public: - 
static constexpr inline auto PING_HANDLER = "/catboost_ping"; - static constexpr inline auto MAIN_HANDLER = "/catboost_request"; + static constexpr auto PING_HANDLER = "/catboost_ping"; + static constexpr auto MAIN_HANDLER = "/catboost_request"; explicit CatBoostLibraryBridgeHelper( ContextPtr context_, @@ -38,11 +38,11 @@ protected: bool bridgeHandShake() override; private: - static constexpr inline auto CATBOOST_LIST_METHOD = "catboost_list"; - static constexpr inline auto CATBOOST_REMOVEMODEL_METHOD = "catboost_removeModel"; - static constexpr inline auto CATBOOST_REMOVEALLMODELS_METHOD = "catboost_removeAllModels"; - static constexpr inline auto CATBOOST_GETTREECOUNT_METHOD = "catboost_GetTreeCount"; - static constexpr inline auto CATBOOST_LIB_EVALUATE_METHOD = "catboost_libEvaluate"; + static constexpr auto CATBOOST_LIST_METHOD = "catboost_list"; + static constexpr auto CATBOOST_REMOVEMODEL_METHOD = "catboost_removeModel"; + static constexpr auto CATBOOST_REMOVEALLMODELS_METHOD = "catboost_removeAllModels"; + static constexpr auto CATBOOST_GETTREECOUNT_METHOD = "catboost_GetTreeCount"; + static constexpr auto CATBOOST_LIB_EVALUATE_METHOD = "catboost_libEvaluate"; Poco::URI createRequestURI(const String & method) const; diff --git a/src/BridgeHelper/ExternalDictionaryLibraryBridgeHelper.h b/src/BridgeHelper/ExternalDictionaryLibraryBridgeHelper.h index 5632fd2a28e..63816aa63ef 100644 --- a/src/BridgeHelper/ExternalDictionaryLibraryBridgeHelper.h +++ b/src/BridgeHelper/ExternalDictionaryLibraryBridgeHelper.h @@ -25,8 +25,8 @@ public: String dict_attributes; }; - static constexpr inline auto PING_HANDLER = "/extdict_ping"; - static constexpr inline auto MAIN_HANDLER = "/extdict_request"; + static constexpr auto PING_HANDLER = "/extdict_ping"; + static constexpr auto MAIN_HANDLER = "/extdict_request"; ExternalDictionaryLibraryBridgeHelper(ContextPtr context_, const Block & sample_block, const Field & dictionary_id_, const LibraryInitData & library_data_); @@ -62,14 +62,14 @@ protected: ReadWriteBufferFromHTTP::OutStreamCallback getInitLibraryCallback() const; private: - static constexpr inline auto EXT_DICT_LIB_NEW_METHOD = "extDict_libNew"; - static constexpr inline auto EXT_DICT_LIB_CLONE_METHOD = "extDict_libClone"; - static constexpr inline auto EXT_DICT_LIB_DELETE_METHOD = "extDict_libDelete"; - static constexpr inline auto EXT_DICT_LOAD_ALL_METHOD = "extDict_loadAll"; - static constexpr inline auto EXT_DICT_LOAD_IDS_METHOD = "extDict_loadIds"; - static constexpr inline auto EXT_DICT_LOAD_KEYS_METHOD = "extDict_loadKeys"; - static constexpr inline auto EXT_DICT_IS_MODIFIED_METHOD = "extDict_isModified"; - static constexpr inline auto EXT_DICT_SUPPORTS_SELECTIVE_LOAD_METHOD = "extDict_supportsSelectiveLoad"; + static constexpr auto EXT_DICT_LIB_NEW_METHOD = "extDict_libNew"; + static constexpr auto EXT_DICT_LIB_CLONE_METHOD = "extDict_libClone"; + static constexpr auto EXT_DICT_LIB_DELETE_METHOD = "extDict_libDelete"; + static constexpr auto EXT_DICT_LOAD_ALL_METHOD = "extDict_loadAll"; + static constexpr auto EXT_DICT_LOAD_IDS_METHOD = "extDict_loadIds"; + static constexpr auto EXT_DICT_LOAD_KEYS_METHOD = "extDict_loadKeys"; + static constexpr auto EXT_DICT_IS_MODIFIED_METHOD = "extDict_isModified"; + static constexpr auto EXT_DICT_SUPPORTS_SELECTIVE_LOAD_METHOD = "extDict_supportsSelectiveLoad"; Poco::URI createRequestURI(const String & method) const; diff --git a/src/BridgeHelper/IBridgeHelper.h b/src/BridgeHelper/IBridgeHelper.h index 6812bd04a03..8ce1c0e143a 100644 --- 
a/src/BridgeHelper/IBridgeHelper.h +++ b/src/BridgeHelper/IBridgeHelper.h @@ -16,9 +16,9 @@ class IBridgeHelper: protected WithContext { public: - static constexpr inline auto DEFAULT_HOST = "127.0.0.1"; - static constexpr inline auto DEFAULT_FORMAT = "RowBinary"; - static constexpr inline auto PING_OK_ANSWER = "Ok."; + static constexpr auto DEFAULT_HOST = "127.0.0.1"; + static constexpr auto DEFAULT_FORMAT = "RowBinary"; + static constexpr auto PING_OK_ANSWER = "Ok."; static const inline std::string PING_METHOD = Poco::Net::HTTPRequest::HTTP_GET; static const inline std::string MAIN_METHOD = Poco::Net::HTTPRequest::HTTP_POST; diff --git a/src/BridgeHelper/LibraryBridgeHelper.h b/src/BridgeHelper/LibraryBridgeHelper.h index 8940f9d1c9e..0c56fe7a221 100644 --- a/src/BridgeHelper/LibraryBridgeHelper.h +++ b/src/BridgeHelper/LibraryBridgeHelper.h @@ -37,7 +37,7 @@ protected: Poco::URI createBaseURI() const override; - static constexpr inline size_t DEFAULT_PORT = 9012; + static constexpr size_t DEFAULT_PORT = 9012; const Poco::Util::AbstractConfiguration & config; LoggerPtr log; diff --git a/src/BridgeHelper/XDBCBridgeHelper.h b/src/BridgeHelper/XDBCBridgeHelper.h index b557e12b85b..5f4c7fd8381 100644 --- a/src/BridgeHelper/XDBCBridgeHelper.h +++ b/src/BridgeHelper/XDBCBridgeHelper.h @@ -52,12 +52,12 @@ class XDBCBridgeHelper : public IXDBCBridgeHelper { public: - static constexpr inline auto DEFAULT_PORT = BridgeHelperMixin::DEFAULT_PORT; - static constexpr inline auto PING_HANDLER = "/ping"; - static constexpr inline auto MAIN_HANDLER = "/"; - static constexpr inline auto COL_INFO_HANDLER = "/columns_info"; - static constexpr inline auto IDENTIFIER_QUOTE_HANDLER = "/identifier_quote"; - static constexpr inline auto SCHEMA_ALLOWED_HANDLER = "/schema_allowed"; + static constexpr auto DEFAULT_PORT = BridgeHelperMixin::DEFAULT_PORT; + static constexpr auto PING_HANDLER = "/ping"; + static constexpr auto MAIN_HANDLER = "/"; + static constexpr auto COL_INFO_HANDLER = "/columns_info"; + static constexpr auto IDENTIFIER_QUOTE_HANDLER = "/identifier_quote"; + static constexpr auto SCHEMA_ALLOWED_HANDLER = "/schema_allowed"; XDBCBridgeHelper( ContextPtr context_, @@ -256,7 +256,7 @@ protected: struct JDBCBridgeMixin { - static constexpr inline auto DEFAULT_PORT = 9019; + static constexpr auto DEFAULT_PORT = 9019; static String configPrefix() { @@ -287,7 +287,7 @@ struct JDBCBridgeMixin struct ODBCBridgeMixin { - static constexpr inline auto DEFAULT_PORT = 9018; + static constexpr auto DEFAULT_PORT = 9018; static String configPrefix() { diff --git a/src/Common/CPUID.h b/src/Common/CPUID.h index d7a714ec5af..b49f7706904 100644 --- a/src/Common/CPUID.h +++ b/src/Common/CPUID.h @@ -69,9 +69,9 @@ union CPUInfo UInt32 edx; } registers; - inline explicit CPUInfo(UInt32 op) noexcept { cpuid(op, info); } + explicit CPUInfo(UInt32 op) noexcept { cpuid(op, info); } - inline CPUInfo(UInt32 op, UInt32 sub_op) noexcept { cpuid(op, sub_op, info); } + CPUInfo(UInt32 op, UInt32 sub_op) noexcept { cpuid(op, sub_op, info); } }; inline bool haveRDTSCP() noexcept diff --git a/src/Common/ColumnsHashingImpl.h b/src/Common/ColumnsHashingImpl.h index f74a56292ae..0e013decf1f 100644 --- a/src/Common/ColumnsHashingImpl.h +++ b/src/Common/ColumnsHashingImpl.h @@ -453,7 +453,7 @@ protected: /// Return the columns which actually contain the values of the keys. /// For a given key column, if it is nullable, we return its nested /// column. Otherwise we return the key column itself. 
- inline const ColumnRawPtrs & getActualColumns() const + const ColumnRawPtrs & getActualColumns() const { return actual_columns; } diff --git a/src/Common/CombinedCardinalityEstimator.h b/src/Common/CombinedCardinalityEstimator.h index 0e53755d773..132f00de8eb 100644 --- a/src/Common/CombinedCardinalityEstimator.h +++ b/src/Common/CombinedCardinalityEstimator.h @@ -292,13 +292,13 @@ private: } template - inline T & getContainer() + T & getContainer() { return *reinterpret_cast(address & mask); } template - inline const T & getContainer() const + const T & getContainer() const { return *reinterpret_cast(address & mask); } @@ -309,7 +309,7 @@ private: address |= static_cast(t); } - inline details::ContainerType getContainerType() const + details::ContainerType getContainerType() const { return static_cast(address & ~mask); } diff --git a/src/Common/CompactArray.h b/src/Common/CompactArray.h index 613dc3d0b90..7b2bd658d2e 100644 --- a/src/Common/CompactArray.h +++ b/src/Common/CompactArray.h @@ -116,7 +116,7 @@ public: /** Return the current cell number and the corresponding content. */ - inline std::pair get() const + std::pair get() const { if ((current_bucket_index == 0) || is_eof) throw Exception(ErrorCodes::NO_AVAILABLE_DATA, "No available data."); diff --git a/src/Common/CounterInFile.h b/src/Common/CounterInFile.h index 854bf7cc675..0a11e52be2c 100644 --- a/src/Common/CounterInFile.h +++ b/src/Common/CounterInFile.h @@ -37,7 +37,7 @@ namespace fs = std::filesystem; class CounterInFile { private: - static inline constexpr size_t SMALL_READ_WRITE_BUFFER_SIZE = 16; + static constexpr size_t SMALL_READ_WRITE_BUFFER_SIZE = 16; public: /// path - the name of the file, including the path diff --git a/src/Common/CurrentThread.h b/src/Common/CurrentThread.h index e2b627a7f29..8dade8c6fd5 100644 --- a/src/Common/CurrentThread.h +++ b/src/Common/CurrentThread.h @@ -62,9 +62,9 @@ public: static void updatePerformanceCountersIfNeeded(); static ProfileEvents::Counters & getProfileEvents(); - inline ALWAYS_INLINE static MemoryTracker * getMemoryTracker() + static MemoryTracker * getMemoryTracker() { - if (unlikely(!current_thread)) + if (!current_thread) [[unlikely]] return nullptr; return ¤t_thread->memory_tracker; } diff --git a/src/Common/HashTable/FixedHashTable.h b/src/Common/HashTable/FixedHashTable.h index 49675aaafbc..8f6ec1604ee 100644 --- a/src/Common/HashTable/FixedHashTable.h +++ b/src/Common/HashTable/FixedHashTable.h @@ -261,7 +261,7 @@ public: return true; } - inline const value_type & get() const + const value_type & get() const { if (!is_initialized || is_eof) throw DB::Exception(DB::ErrorCodes::NO_AVAILABLE_DATA, "No available data"); diff --git a/src/Common/HashTable/HashTable.h b/src/Common/HashTable/HashTable.h index 9050b7ef6d7..a600f57b06a 100644 --- a/src/Common/HashTable/HashTable.h +++ b/src/Common/HashTable/HashTable.h @@ -844,7 +844,7 @@ public: return true; } - inline const value_type & get() const + const value_type & get() const { if (!is_initialized || is_eof) throw DB::Exception(DB::ErrorCodes::NO_AVAILABLE_DATA, "No available data"); diff --git a/src/Common/HashTable/PackedHashMap.h b/src/Common/HashTable/PackedHashMap.h index 0d25addb58e..72eb721b274 100644 --- a/src/Common/HashTable/PackedHashMap.h +++ b/src/Common/HashTable/PackedHashMap.h @@ -69,7 +69,7 @@ struct PackedHashMapCell : public HashMapCellvalue.first, state); } static bool isZero(const Key key, const State & /*state*/) { return ZeroTraits::check(key); } - static inline bool bitEqualsByValue(key_type 
a, key_type b) { return a == b; } + static bool bitEqualsByValue(key_type a, key_type b) { return a == b; } template auto get() const diff --git a/src/Common/HashTable/SmallTable.h b/src/Common/HashTable/SmallTable.h index 3229e4748ea..63a6b932dd0 100644 --- a/src/Common/HashTable/SmallTable.h +++ b/src/Common/HashTable/SmallTable.h @@ -112,7 +112,7 @@ public: return true; } - inline const value_type & get() const + const value_type & get() const { if (!is_initialized || is_eof) throw DB::Exception(DB::ErrorCodes::NO_AVAILABLE_DATA, "No available data"); diff --git a/src/Common/HyperLogLogCounter.h b/src/Common/HyperLogLogCounter.h index bacd4cc7288..9b2b33dc918 100644 --- a/src/Common/HyperLogLogCounter.h +++ b/src/Common/HyperLogLogCounter.h @@ -128,13 +128,13 @@ public: { } - inline void update(UInt8 cur_rank, UInt8 new_rank) + void update(UInt8 cur_rank, UInt8 new_rank) { denominator -= static_cast(1.0) / (1ULL << cur_rank); denominator += static_cast(1.0) / (1ULL << new_rank); } - inline void update(UInt8 rank) + void update(UInt8 rank) { denominator += static_cast(1.0) / (1ULL << rank); } @@ -166,13 +166,13 @@ public: rank_count[0] = static_cast(initial_value); } - inline void update(UInt8 cur_rank, UInt8 new_rank) + void update(UInt8 cur_rank, UInt8 new_rank) { --rank_count[cur_rank]; ++rank_count[new_rank]; } - inline void update(UInt8 rank) + void update(UInt8 rank) { ++rank_count[rank]; } @@ -429,13 +429,13 @@ public: private: /// Extract subset of bits in [begin, end[ range. - inline HashValueType extractBitSequence(HashValueType val, UInt8 begin, UInt8 end) const + HashValueType extractBitSequence(HashValueType val, UInt8 begin, UInt8 end) const { return (val >> begin) & ((1ULL << (end - begin)) - 1); } /// Rank is number of trailing zeros. - inline UInt8 calculateRank(HashValueType val) const + UInt8 calculateRank(HashValueType val) const { if (unlikely(val == 0)) return max_rank; @@ -448,7 +448,7 @@ private: return zeros_plus_one; } - inline HashValueType getHash(Value key) const + HashValueType getHash(Value key) const { /// NOTE: this should be OK, since value is the same as key for HLL. return static_cast( @@ -496,7 +496,7 @@ private: throw Poco::Exception("Internal error", DB::ErrorCodes::LOGICAL_ERROR); } - inline double applyCorrection(double raw_estimate) const + double applyCorrection(double raw_estimate) const { double fixed_estimate; @@ -525,7 +525,7 @@ private: /// Correction used in HyperLogLog++ algorithm. /// Source: "HyperLogLog in Practice: Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm" /// (S. Heule et al., Proceedings of the EDBT 2013 Conference). - inline double applyBiasCorrection(double raw_estimate) const + double applyBiasCorrection(double raw_estimate) const { double fixed_estimate; @@ -540,7 +540,7 @@ private: /// Calculation of unique values using LinearCounting algorithm. /// Source: "A Linear-time Probabilistic Counting Algorithm for Database Applications" /// (Whang et al., ACM Trans. Database Syst., pp. 208-229, 1990). 
- inline double applyLinearCorrection(double raw_estimate) const + double applyLinearCorrection(double raw_estimate) const { double fixed_estimate; diff --git a/src/Common/IntervalTree.h b/src/Common/IntervalTree.h index fbd1de3197e..db7f5238921 100644 --- a/src/Common/IntervalTree.h +++ b/src/Common/IntervalTree.h @@ -23,7 +23,7 @@ struct Interval Interval(IntervalStorageType left_, IntervalStorageType right_) : left(left_), right(right_) { } - inline bool contains(IntervalStorageType point) const { return left <= point && point <= right; } + bool contains(IntervalStorageType point) const { return left <= point && point <= right; } }; template @@ -290,7 +290,7 @@ private: IntervalStorageType middle_element; - inline bool hasValue() const { return sorted_intervals_range_size != 0; } + bool hasValue() const { return sorted_intervals_range_size != 0; } }; using IntervalWithEmptyValue = Interval; @@ -585,7 +585,7 @@ private: } } - inline size_t findFirstIteratorNodeIndex() const + size_t findFirstIteratorNodeIndex() const { size_t nodes_size = nodes.size(); size_t result_index = 0; @@ -602,7 +602,7 @@ private: return result_index; } - inline size_t findLastIteratorNodeIndex() const + size_t findLastIteratorNodeIndex() const { if (unlikely(nodes.empty())) return 0; @@ -618,7 +618,7 @@ private: return result_index; } - inline void increaseIntervalsSize() + void increaseIntervalsSize() { /// Before tree is build we store all intervals size in our first node to allow tree iteration. ++intervals_size; @@ -630,7 +630,7 @@ private: size_t intervals_size = 0; bool tree_is_built = false; - static inline const Interval & getInterval(const IntervalWithValue & interval_with_value) + static const Interval & getInterval(const IntervalWithValue & interval_with_value) { if constexpr (is_empty_value) return interval_with_value; @@ -639,7 +639,7 @@ private: } template - static inline bool callCallback(const IntervalWithValue & interval, IntervalCallback && callback) + static bool callCallback(const IntervalWithValue & interval, IntervalCallback && callback) { if constexpr (is_empty_value) return callback(interval); @@ -647,7 +647,7 @@ private: return callback(interval.first, interval.second); } - static inline void + static void intervalsToPoints(const std::vector & intervals, std::vector & temporary_points_storage) { for (const auto & interval_with_value : intervals) @@ -658,7 +658,7 @@ private: } } - static inline IntervalStorageType pointsMedian(std::vector & points) + static IntervalStorageType pointsMedian(std::vector & points) { size_t size = points.size(); size_t middle_element_index = size / 2; diff --git a/src/Common/JSONParsers/SimdJSONParser.h b/src/Common/JSONParsers/SimdJSONParser.h index a8594710d20..827d142266a 100644 --- a/src/Common/JSONParsers/SimdJSONParser.h +++ b/src/Common/JSONParsers/SimdJSONParser.h @@ -26,62 +26,62 @@ class SimdJSONBasicFormatter { public: explicit SimdJSONBasicFormatter(PaddedPODArray & buffer_) : buffer(buffer_) {} - inline void comma() { oneChar(','); } + void comma() { oneChar(','); } /** Start an array, prints [ **/ - inline void startArray() { oneChar('['); } + void startArray() { oneChar('['); } /** End an array, prints ] **/ - inline void endArray() { oneChar(']'); } + void endArray() { oneChar(']'); } /** Start an array, prints { **/ - inline void startObject() { oneChar('{'); } + void startObject() { oneChar('{'); } /** Start an array, prints } **/ - inline void endObject() { oneChar('}'); } + void endObject() { oneChar('}'); } /** Prints a true **/ - inline 
void trueAtom() + void trueAtom() { const char * s = "true"; buffer.insert(s, s + 4); } /** Prints a false **/ - inline void falseAtom() + void falseAtom() { const char * s = "false"; buffer.insert(s, s + 5); } /** Prints a null **/ - inline void nullAtom() + void nullAtom() { const char * s = "null"; buffer.insert(s, s + 4); } /** Prints a number **/ - inline void number(int64_t x) + void number(int64_t x) { char number_buffer[24]; auto res = std::to_chars(number_buffer, number_buffer + sizeof(number_buffer), x); buffer.insert(number_buffer, res.ptr); } /** Prints a number **/ - inline void number(uint64_t x) + void number(uint64_t x) { char number_buffer[24]; auto res = std::to_chars(number_buffer, number_buffer + sizeof(number_buffer), x); buffer.insert(number_buffer, res.ptr); } /** Prints a number **/ - inline void number(double x) + void number(double x) { char number_buffer[24]; auto res = std::to_chars(number_buffer, number_buffer + sizeof(number_buffer), x); buffer.insert(number_buffer, res.ptr); } /** Prints a key (string + colon) **/ - inline void key(std::string_view unescaped) + void key(std::string_view unescaped) { string(unescaped); oneChar(':'); } /** Prints a string. The string is escaped as needed. **/ - inline void string(std::string_view unescaped) + void string(std::string_view unescaped) { oneChar('\"'); size_t i = 0; @@ -165,7 +165,7 @@ public: oneChar('\"'); } - inline void oneChar(char c) + void oneChar(char c) { buffer.push_back(c); } @@ -182,7 +182,7 @@ class SimdJSONElementFormatter public: explicit SimdJSONElementFormatter(PaddedPODArray & buffer_) : format(buffer_) {} /** Append an element to the builder (to be printed) **/ - inline void append(simdjson::dom::element value) + void append(simdjson::dom::element value) { switch (value.type()) { @@ -224,7 +224,7 @@ public: } } /** Append an array to the builder (to be printed) **/ - inline void append(simdjson::dom::array value) + void append(simdjson::dom::array value) { format.startArray(); auto iter = value.begin(); @@ -241,7 +241,7 @@ public: format.endArray(); } - inline void append(simdjson::dom::object value) + void append(simdjson::dom::object value) { format.startObject(); auto pair = value.begin(); @@ -258,7 +258,7 @@ public: format.endObject(); } - inline void append(simdjson::dom::key_value_pair kv) + void append(simdjson::dom::key_value_pair kv) { format.key(kv.key); append(kv.value); diff --git a/src/Common/PODArray.h b/src/Common/PODArray.h index b4069027ad1..ece5114a998 100644 --- a/src/Common/PODArray.h +++ b/src/Common/PODArray.h @@ -284,7 +284,7 @@ public: } template - inline void assertNotIntersects(It1 from_begin [[maybe_unused]], It2 from_end [[maybe_unused]]) + void assertNotIntersects(It1 from_begin [[maybe_unused]], It2 from_end [[maybe_unused]]) { #if !defined(NDEBUG) const char * ptr_begin = reinterpret_cast(&*from_begin); diff --git a/src/Common/PoolBase.h b/src/Common/PoolBase.h index d6fc1656eca..fb0c75e7c95 100644 --- a/src/Common/PoolBase.h +++ b/src/Common/PoolBase.h @@ -174,7 +174,7 @@ public: items.emplace_back(std::make_shared(allocObject(), *this)); } - inline size_t size() + size_t size() { std::lock_guard lock(mutex); return items.size(); diff --git a/src/Common/RadixSort.h b/src/Common/RadixSort.h index a30e19d8212..238321ec76e 100644 --- a/src/Common/RadixSort.h +++ b/src/Common/RadixSort.h @@ -385,7 +385,7 @@ private: * PASS is counted from least significant (0), so the first pass is NUM_PASSES - 1. 
*/ template - static inline void radixSortMSDInternal(Element * arr, size_t size, size_t limit) + static void radixSortMSDInternal(Element * arr, size_t size, size_t limit) { /// The beginning of every i-1-th bucket. 0th element will be equal to 1st. /// Last element will point to array end. @@ -528,7 +528,7 @@ private: // A helper to choose sorting algorithm based on array length template - static inline void radixSortMSDInternalHelper(Element * arr, size_t size, size_t limit) + static void radixSortMSDInternalHelper(Element * arr, size_t size, size_t limit) { if (size <= INSERTION_SORT_THRESHOLD) insertionSortInternal(arr, size); diff --git a/src/Common/SpaceSaving.h b/src/Common/SpaceSaving.h index 7a740ae6c9b..81ac4e71e8c 100644 --- a/src/Common/SpaceSaving.h +++ b/src/Common/SpaceSaving.h @@ -131,12 +131,12 @@ public: ~SpaceSaving() { destroyElements(); } - inline size_t size() const + size_t size() const { return counter_list.size(); } - inline size_t capacity() const + size_t capacity() const { return m_capacity; } diff --git a/src/Common/ThreadProfileEvents.h b/src/Common/ThreadProfileEvents.h index 26aeab08302..0af3ccb4c80 100644 --- a/src/Common/ThreadProfileEvents.h +++ b/src/Common/ThreadProfileEvents.h @@ -107,7 +107,7 @@ struct RUsageCounters } private: - static inline UInt64 getClockMonotonic() + static UInt64 getClockMonotonic() { struct timespec ts; if (0 != clock_gettime(CLOCK_MONOTONIC, &ts)) diff --git a/src/Common/Volnitsky.h b/src/Common/Volnitsky.h index 6513bdb8bc3..9c2852e4a10 100644 --- a/src/Common/Volnitsky.h +++ b/src/Common/Volnitsky.h @@ -54,16 +54,16 @@ namespace VolnitskyTraits /// min haystack size to use main algorithm instead of fallback static constexpr size_t min_haystack_size_for_algorithm = 20000; - static inline bool isFallbackNeedle(const size_t needle_size, size_t haystack_size_hint = 0) + static bool isFallbackNeedle(const size_t needle_size, size_t haystack_size_hint = 0) { return needle_size < 2 * sizeof(Ngram) || needle_size >= std::numeric_limits::max() || (haystack_size_hint && haystack_size_hint < min_haystack_size_for_algorithm); } - static inline Ngram toNGram(const UInt8 * const pos) { return unalignedLoad(pos); } + static Ngram toNGram(const UInt8 * const pos) { return unalignedLoad(pos); } template - static inline bool putNGramASCIICaseInsensitive(const UInt8 * pos, int offset, Callback && putNGramBase) + static bool putNGramASCIICaseInsensitive(const UInt8 * pos, int offset, Callback && putNGramBase) { struct Chars { @@ -115,7 +115,7 @@ namespace VolnitskyTraits } template - static inline bool putNGramUTF8CaseInsensitive( + static bool putNGramUTF8CaseInsensitive( const UInt8 * pos, int offset, const UInt8 * begin, size_t size, Callback && putNGramBase) { const UInt8 * end = begin + size; @@ -349,7 +349,7 @@ namespace VolnitskyTraits } template - static inline bool putNGram(const UInt8 * pos, int offset, [[maybe_unused]] const UInt8 * begin, size_t size, Callback && putNGramBase) + static bool putNGram(const UInt8 * pos, int offset, [[maybe_unused]] const UInt8 * begin, size_t size, Callback && putNGramBase) { if constexpr (CaseSensitive) { @@ -580,7 +580,7 @@ public: return true; } - inline bool searchOne(const UInt8 * haystack, const UInt8 * haystack_end) const + bool searchOne(const UInt8 * haystack, const UInt8 * haystack_end) const { const size_t fallback_size = fallback_needles.size(); for (size_t i = 0; i < fallback_size; ++i) @@ -609,7 +609,7 @@ public: return false; } - inline size_t searchOneFirstIndex(const UInt8 * 
haystack, const UInt8 * haystack_end) const + size_t searchOneFirstIndex(const UInt8 * haystack, const UInt8 * haystack_end) const { const size_t fallback_size = fallback_needles.size(); @@ -647,7 +647,7 @@ public: } template - inline UInt64 searchOneFirstPosition(const UInt8 * haystack, const UInt8 * haystack_end, const CountCharsCallback & count_chars) const + UInt64 searchOneFirstPosition(const UInt8 * haystack, const UInt8 * haystack_end, const CountCharsCallback & count_chars) const { const size_t fallback_size = fallback_needles.size(); @@ -682,7 +682,7 @@ public: } template - inline void searchOneAll(const UInt8 * haystack, const UInt8 * haystack_end, AnsType * answer, const CountCharsCallback & count_chars) const + void searchOneAll(const UInt8 * haystack, const UInt8 * haystack_end, AnsType * answer, const CountCharsCallback & count_chars) const { const size_t fallback_size = fallback_needles.size(); for (size_t i = 0; i < fallback_size; ++i) diff --git a/src/Common/ZooKeeper/IKeeper.h b/src/Common/ZooKeeper/IKeeper.h index ec49c94808e..ddd30c4eef2 100644 --- a/src/Common/ZooKeeper/IKeeper.h +++ b/src/Common/ZooKeeper/IKeeper.h @@ -491,12 +491,12 @@ public: incrementErrorMetrics(code); } - inline static Exception createDeprecated(const std::string & msg, Error code_) + static Exception createDeprecated(const std::string & msg, Error code_) { return Exception(msg, code_, 0); } - inline static Exception fromPath(Error code_, const std::string & path) + static Exception fromPath(Error code_, const std::string & path) { return Exception(code_, "Coordination error: {}, path {}", errorMessage(code_), path); } @@ -504,7 +504,7 @@ public: /// Message must be a compile-time constant template requires std::is_convertible_v - inline static Exception fromMessage(Error code_, T && message) + static Exception fromMessage(Error code_, T && message) { return Exception(std::forward(message), code_); } diff --git a/src/Common/findExtreme.cpp b/src/Common/findExtreme.cpp index ce3bbb86d7c..a99b1f2dd3d 100644 --- a/src/Common/findExtreme.cpp +++ b/src/Common/findExtreme.cpp @@ -11,13 +11,13 @@ namespace DB template struct MinComparator { - static ALWAYS_INLINE inline const T & cmp(const T & a, const T & b) { return std::min(a, b); } + static ALWAYS_INLINE const T & cmp(const T & a, const T & b) { return std::min(a, b); } }; template struct MaxComparator { - static ALWAYS_INLINE inline const T & cmp(const T & a, const T & b) { return std::max(a, b); } + static ALWAYS_INLINE const T & cmp(const T & a, const T & b) { return std::max(a, b); } }; MULTITARGET_FUNCTION_AVX2_SSE42( diff --git a/src/Core/Field.h b/src/Core/Field.h index 4424d669c4d..73d3f4ec44e 100644 --- a/src/Core/Field.h +++ b/src/Core/Field.h @@ -855,13 +855,13 @@ template <> struct Field::EnumToType { usi template <> struct Field::EnumToType { using Type = CustomType; }; template <> struct Field::EnumToType { using Type = UInt64; }; -inline constexpr bool isInt64OrUInt64FieldType(Field::Types::Which t) +constexpr bool isInt64OrUInt64FieldType(Field::Types::Which t) { return t == Field::Types::Int64 || t == Field::Types::UInt64; } -inline constexpr bool isInt64OrUInt64orBoolFieldType(Field::Types::Which t) +constexpr bool isInt64OrUInt64orBoolFieldType(Field::Types::Which t) { return t == Field::Types::Int64 || t == Field::Types::UInt64 diff --git a/src/Core/Joins.h b/src/Core/Joins.h index ccdd6eefab7..96d2b51325c 100644 --- a/src/Core/Joins.h +++ b/src/Core/Joins.h @@ -19,16 +19,16 @@ enum class JoinKind : uint8_t const char * 
toString(JoinKind kind); -inline constexpr bool isLeft(JoinKind kind) { return kind == JoinKind::Left; } -inline constexpr bool isRight(JoinKind kind) { return kind == JoinKind::Right; } -inline constexpr bool isInner(JoinKind kind) { return kind == JoinKind::Inner; } -inline constexpr bool isFull(JoinKind kind) { return kind == JoinKind::Full; } -inline constexpr bool isCrossOrComma(JoinKind kind) { return kind == JoinKind::Comma || kind == JoinKind::Cross; } -inline constexpr bool isRightOrFull(JoinKind kind) { return kind == JoinKind::Right || kind == JoinKind::Full; } -inline constexpr bool isLeftOrFull(JoinKind kind) { return kind == JoinKind::Left || kind == JoinKind::Full; } -inline constexpr bool isInnerOrRight(JoinKind kind) { return kind == JoinKind::Inner || kind == JoinKind::Right; } -inline constexpr bool isInnerOrLeft(JoinKind kind) { return kind == JoinKind::Inner || kind == JoinKind::Left; } -inline constexpr bool isPaste(JoinKind kind) { return kind == JoinKind::Paste; } +constexpr bool isLeft(JoinKind kind) { return kind == JoinKind::Left; } +constexpr bool isRight(JoinKind kind) { return kind == JoinKind::Right; } +constexpr bool isInner(JoinKind kind) { return kind == JoinKind::Inner; } +constexpr bool isFull(JoinKind kind) { return kind == JoinKind::Full; } +constexpr bool isCrossOrComma(JoinKind kind) { return kind == JoinKind::Comma || kind == JoinKind::Cross; } +constexpr bool isRightOrFull(JoinKind kind) { return kind == JoinKind::Right || kind == JoinKind::Full; } +constexpr bool isLeftOrFull(JoinKind kind) { return kind == JoinKind::Left || kind == JoinKind::Full; } +constexpr bool isInnerOrRight(JoinKind kind) { return kind == JoinKind::Inner || kind == JoinKind::Right; } +constexpr bool isInnerOrLeft(JoinKind kind) { return kind == JoinKind::Inner || kind == JoinKind::Left; } +constexpr bool isPaste(JoinKind kind) { return kind == JoinKind::Paste; } /// Allows more optimal JOIN for typical cases. 
enum class JoinStrictness : uint8_t @@ -66,7 +66,7 @@ enum class ASOFJoinInequality : uint8_t const char * toString(ASOFJoinInequality asof_join_inequality); -inline constexpr ASOFJoinInequality getASOFJoinInequality(std::string_view func_name) +constexpr ASOFJoinInequality getASOFJoinInequality(std::string_view func_name) { ASOFJoinInequality inequality = ASOFJoinInequality::None; @@ -82,7 +82,7 @@ inline constexpr ASOFJoinInequality getASOFJoinInequality(std::string_view func_ return inequality; } -inline constexpr ASOFJoinInequality reverseASOFJoinInequality(ASOFJoinInequality inequality) +constexpr ASOFJoinInequality reverseASOFJoinInequality(ASOFJoinInequality inequality) { if (inequality == ASOFJoinInequality::Less) return ASOFJoinInequality::Greater; diff --git a/src/Daemon/BaseDaemon.h b/src/Daemon/BaseDaemon.h index a0f47c44460..3d34d404595 100644 --- a/src/Daemon/BaseDaemon.h +++ b/src/Daemon/BaseDaemon.h @@ -40,7 +40,7 @@ class BaseDaemon : public Poco::Util::ServerApplication, public Loggers friend class SignalListener; public: - static inline constexpr char DEFAULT_GRAPHITE_CONFIG_NAME[] = "graphite"; + static constexpr char DEFAULT_GRAPHITE_CONFIG_NAME[] = "graphite"; BaseDaemon(); ~BaseDaemon() override; diff --git a/src/DataTypes/DataTypeDecimalBase.h b/src/DataTypes/DataTypeDecimalBase.h index 642d2de833f..997c554059b 100644 --- a/src/DataTypes/DataTypeDecimalBase.h +++ b/src/DataTypes/DataTypeDecimalBase.h @@ -147,7 +147,7 @@ public: static T getScaleMultiplier(UInt32 scale); - inline DecimalUtils::DataTypeDecimalTrait getTrait() const + DecimalUtils::DataTypeDecimalTrait getTrait() const { return {precision, scale}; } diff --git a/src/Dictionaries/CacheDictionaryStorage.h b/src/Dictionaries/CacheDictionaryStorage.h index 01217c58e31..a960a916027 100644 --- a/src/Dictionaries/CacheDictionaryStorage.h +++ b/src/Dictionaries/CacheDictionaryStorage.h @@ -754,7 +754,7 @@ private: std::vector attributes; - inline void setCellDeadline(Cell & cell, TimePoint now) + void setCellDeadline(Cell & cell, TimePoint now) { if (configuration.lifetime.min_sec == 0 && configuration.lifetime.max_sec == 0) { @@ -774,7 +774,7 @@ private: cell.deadline = std::chrono::system_clock::to_time_t(deadline); } - inline size_t getCellIndex(const KeyType key) const + size_t getCellIndex(const KeyType key) const { const size_t hash = DefaultHash()(key); const size_t index = hash & size_overlap_mask; @@ -783,7 +783,7 @@ private: using KeyStateAndCellIndex = std::pair; - inline KeyStateAndCellIndex getKeyStateAndCellIndex(const KeyType key, const time_t now) const + KeyStateAndCellIndex getKeyStateAndCellIndex(const KeyType key, const time_t now) const { size_t place_value = getCellIndex(key); const size_t place_value_end = place_value + max_collision_length; @@ -810,7 +810,7 @@ private: return std::make_pair(KeyState::not_found, place_value & size_overlap_mask); } - inline size_t getCellIndexForInsert(const KeyType & key) const + size_t getCellIndexForInsert(const KeyType & key) const { size_t place_value = getCellIndex(key); const size_t place_value_end = place_value + max_collision_length; diff --git a/src/Dictionaries/DictionaryHelpers.h b/src/Dictionaries/DictionaryHelpers.h index 8bf190d3edc..64fc05e99ab 100644 --- a/src/Dictionaries/DictionaryHelpers.h +++ b/src/Dictionaries/DictionaryHelpers.h @@ -44,7 +44,7 @@ public: { } - inline bool isConstant() const { return default_values_column == nullptr; } + bool isConstant() const { return default_values_column == nullptr; } Field getDefaultValue(size_t 
row) const { @@ -450,17 +450,17 @@ public: keys_size = key_columns.front()->size(); } - inline size_t getKeysSize() const + size_t getKeysSize() const { return keys_size; } - inline size_t getCurrentKeyIndex() const + size_t getCurrentKeyIndex() const { return current_key_index; } - inline KeyType extractCurrentKey() + KeyType extractCurrentKey() { assert(current_key_index < keys_size); diff --git a/src/Dictionaries/Embedded/RegionsNames.h b/src/Dictionaries/Embedded/RegionsNames.h index 0053c74745a..0e4c1fe8b88 100644 --- a/src/Dictionaries/Embedded/RegionsNames.h +++ b/src/Dictionaries/Embedded/RegionsNames.h @@ -48,14 +48,14 @@ public: }; private: - static inline constexpr const char * languages[] = + static constexpr const char * languages[] = { #define M(NAME, FALLBACK, NUM) #NAME, FOR_EACH_LANGUAGE(M) #undef M }; - static inline constexpr Language fallbacks[] = + static constexpr Language fallbacks[] = { #define M(NAME, FALLBACK, NUM) Language::FALLBACK, FOR_EACH_LANGUAGE(M) diff --git a/src/Dictionaries/ICacheDictionaryStorage.h b/src/Dictionaries/ICacheDictionaryStorage.h index dcd7434946f..532154cd190 100644 --- a/src/Dictionaries/ICacheDictionaryStorage.h +++ b/src/Dictionaries/ICacheDictionaryStorage.h @@ -26,15 +26,15 @@ struct KeyState : state(state_) {} - inline bool isFound() const { return state == State::found; } - inline bool isExpired() const { return state == State::expired; } - inline bool isNotFound() const { return state == State::not_found; } - inline bool isDefault() const { return is_default; } - inline void setDefault() { is_default = true; } - inline void setDefaultValue(bool is_default_value) { is_default = is_default_value; } + bool isFound() const { return state == State::found; } + bool isExpired() const { return state == State::expired; } + bool isNotFound() const { return state == State::not_found; } + bool isDefault() const { return is_default; } + void setDefault() { is_default = true; } + void setDefaultValue(bool is_default_value) { is_default = is_default_value; } /// Valid only if keyState is found or expired - inline size_t getFetchedColumnIndex() const { return fetched_column_index; } - inline void setFetchedColumnIndex(size_t fetched_column_index_value) { fetched_column_index = fetched_column_index_value; } + size_t getFetchedColumnIndex() const { return fetched_column_index; } + void setFetchedColumnIndex(size_t fetched_column_index_value) { fetched_column_index = fetched_column_index_value; } private: State state = not_found; size_t fetched_column_index = 0; diff --git a/src/Dictionaries/IPAddressDictionary.cpp b/src/Dictionaries/IPAddressDictionary.cpp index 1bc6d16c932..a67118caaf8 100644 --- a/src/Dictionaries/IPAddressDictionary.cpp +++ b/src/Dictionaries/IPAddressDictionary.cpp @@ -66,7 +66,7 @@ namespace return buf; } - inline UInt8 prefixIPv6() const + UInt8 prefixIPv6() const { return isv6 ? 
prefix : prefix + 96; } diff --git a/src/Dictionaries/RegExpTreeDictionary.cpp b/src/Dictionaries/RegExpTreeDictionary.cpp index 2e93a8e6001..ab999202e42 100644 --- a/src/Dictionaries/RegExpTreeDictionary.cpp +++ b/src/Dictionaries/RegExpTreeDictionary.cpp @@ -474,7 +474,7 @@ public: } // Checks if no more values can be added for a given attribute - inline bool full(const String & attr_name, std::unordered_set * const defaults = nullptr) const + bool full(const String & attr_name, std::unordered_set * const defaults = nullptr) const { if (collect_values_limit) { @@ -490,7 +490,7 @@ public: } // Returns the number of full attributes - inline size_t attributesFull() const { return n_full_attributes; } + size_t attributesFull() const { return n_full_attributes; } }; std::pair processBackRefs(const String & data, const re2::RE2 & searcher, const std::vector & pieces) diff --git a/src/Dictionaries/SSDCacheDictionaryStorage.h b/src/Dictionaries/SSDCacheDictionaryStorage.h index e3eea71cd9a..cb0ade9b899 100644 --- a/src/Dictionaries/SSDCacheDictionaryStorage.h +++ b/src/Dictionaries/SSDCacheDictionaryStorage.h @@ -134,7 +134,7 @@ public: /// Reset block with new block_data /// block_data must be filled with zeroes if it is new block - inline void reset(char * new_block_data) + void reset(char * new_block_data) { block_data = new_block_data; current_block_offset = block_header_size; @@ -142,13 +142,13 @@ public: } /// Check if it is enough place to write key in block - inline bool enoughtPlaceToWriteKey(const SSDCacheSimpleKey & cache_key) const + bool enoughtPlaceToWriteKey(const SSDCacheSimpleKey & cache_key) const { return (current_block_offset + (sizeof(cache_key.key) + sizeof(cache_key.size) + cache_key.size)) <= block_size; } /// Check if it is enough place to write key in block - inline bool enoughtPlaceToWriteKey(const SSDCacheComplexKey & cache_key) const + bool enoughtPlaceToWriteKey(const SSDCacheComplexKey & cache_key) const { const StringRef & key = cache_key.key; size_t complex_key_size = sizeof(key.size) + key.size; @@ -159,7 +159,7 @@ public: /// Write key and returns offset in ssd cache block where data is written /// It is client responsibility to check if there is enough place in block to write key /// Returns true if key was written and false if there was not enough place to write key - inline bool writeKey(const SSDCacheSimpleKey & cache_key, size_t & offset_in_block) + bool writeKey(const SSDCacheSimpleKey & cache_key, size_t & offset_in_block) { assert(cache_key.size > 0); @@ -188,7 +188,7 @@ public: return true; } - inline bool writeKey(const SSDCacheComplexKey & cache_key, size_t & offset_in_block) + bool writeKey(const SSDCacheComplexKey & cache_key, size_t & offset_in_block) { assert(cache_key.size > 0); @@ -223,20 +223,20 @@ public: return true; } - inline size_t getKeysSize() const { return keys_size; } + size_t getKeysSize() const { return keys_size; } /// Write keys size into block header - inline void writeKeysSize() + void writeKeysSize() { char * keys_size_offset_data = block_data + block_header_check_sum_size; std::memcpy(keys_size_offset_data, &keys_size, sizeof(size_t)); } /// Get check sum from block header - inline size_t getCheckSum() const { return unalignedLoad(block_data); } + size_t getCheckSum() const { return unalignedLoad(block_data); } /// Calculate check sum in block - inline size_t calculateCheckSum() const + size_t calculateCheckSum() const { size_t calculated_check_sum = static_cast(CityHash_v1_0_2::CityHash64(block_data + 
block_header_check_sum_size, block_size - block_header_check_sum_size)); @@ -244,7 +244,7 @@ public: } /// Check if check sum from block header matched calculated check sum in block - inline bool checkCheckSum() const + bool checkCheckSum() const { size_t calculated_check_sum = calculateCheckSum(); size_t check_sum = getCheckSum(); @@ -253,16 +253,16 @@ public: } /// Write check sum in block header - inline void writeCheckSum() + void writeCheckSum() { size_t check_sum = static_cast(CityHash_v1_0_2::CityHash64(block_data + block_header_check_sum_size, block_size - block_header_check_sum_size)); std::memcpy(block_data, &check_sum, sizeof(size_t)); } - inline size_t getBlockSize() const { return block_size; } + size_t getBlockSize() const { return block_size; } /// Returns block data - inline char * getBlockData() const { return block_data; } + char * getBlockData() const { return block_data; } /// Read keys that were serialized in block /// It is client responsibility to ensure that simple or complex keys were written in block @@ -405,16 +405,16 @@ public: current_write_block.writeCheckSum(); } - inline char * getPlace(SSDCacheIndex index) const + char * getPlace(SSDCacheIndex index) const { return buffer.m_data + index.block_index * block_size + index.offset_in_block; } - inline size_t getCurrentBlockIndex() const { return current_block_index; } + size_t getCurrentBlockIndex() const { return current_block_index; } - inline const char * getData() const { return buffer.m_data; } + const char * getData() const { return buffer.m_data; } - inline size_t getSizeInBytes() const { return block_size * partition_blocks_size; } + size_t getSizeInBytes() const { return block_size * partition_blocks_size; } void readKeys(PaddedPODArray & keys) const { @@ -431,7 +431,7 @@ public: } } - inline void reset() + void reset() { current_block_index = 0; current_write_block.reset(buffer.m_data); @@ -751,9 +751,9 @@ public: } } - inline size_t getCurrentBlockIndex() const { return current_block_index; } + size_t getCurrentBlockIndex() const { return current_block_index; } - inline void reset() + void reset() { current_block_index = 0; } @@ -789,7 +789,7 @@ private: int fd = -1; }; - inline static int preallocateDiskSpace(int fd, size_t offset, size_t len) + static int preallocateDiskSpace(int fd, size_t offset, size_t len) { #if defined(OS_FREEBSD) return posix_fallocate(fd, offset, len); @@ -798,7 +798,7 @@ private: #endif } - inline static char * getRequestBuffer(const iocb & request) + static char * getRequestBuffer(const iocb & request) { char * result = nullptr; @@ -811,7 +811,7 @@ private: return result; } - inline static ssize_t eventResult(io_event & event) + static ssize_t eventResult(io_event & event) { ssize_t bytes_written; @@ -986,9 +986,9 @@ private: size_t in_memory_partition_index; CellState state; - inline bool isInMemory() const { return state == in_memory; } - inline bool isOnDisk() const { return state == on_disk; } - inline bool isDefaultValue() const { return state == default_value; } + bool isInMemory() const { return state == in_memory; } + bool isOnDisk() const { return state == on_disk; } + bool isDefaultValue() const { return state == default_value; } }; struct KeyToBlockOffset @@ -1367,7 +1367,7 @@ private: } } - inline void setCellDeadline(Cell & cell, TimePoint now) + void setCellDeadline(Cell & cell, TimePoint now) { if (configuration.lifetime.min_sec == 0 && configuration.lifetime.max_sec == 0) { @@ -1384,7 +1384,7 @@ private: cell.deadline = 
std::chrono::system_clock::to_time_t(deadline); } - inline void eraseKeyFromIndex(KeyType key) + void eraseKeyFromIndex(KeyType key) { auto it = index.find(key); diff --git a/src/Disks/IO/IOUringReader.h b/src/Disks/IO/IOUringReader.h index 89e71e4b215..359b3badc45 100644 --- a/src/Disks/IO/IOUringReader.h +++ b/src/Disks/IO/IOUringReader.h @@ -61,12 +61,12 @@ private: void monitorRing(); - template inline void failPromise(std::promise & promise, const Exception & ex) + template void failPromise(std::promise & promise, const Exception & ex) { promise.set_exception(std::make_exception_ptr(ex)); } - inline std::future makeFailedResult(const Exception & ex) + std::future makeFailedResult(const Exception & ex) { auto promise = std::promise{}; failPromise(promise, ex); diff --git a/src/Functions/DivisionUtils.h b/src/Functions/DivisionUtils.h index ff07309e248..7fd5b7476e1 100644 --- a/src/Functions/DivisionUtils.h +++ b/src/Functions/DivisionUtils.h @@ -68,7 +68,7 @@ struct DivideIntegralImpl static const constexpr bool allow_string_integer = false; template - static inline Result apply(A a, B b) + static Result apply(A a, B b) { using CastA = std::conditional_t && std::is_same_v, uint8_t, A>; using CastB = std::conditional_t && std::is_same_v, uint8_t, B>; @@ -120,7 +120,7 @@ struct ModuloImpl static const constexpr bool allow_string_integer = false; template - static inline Result apply(A a, B b) + static Result apply(A a, B b) { if constexpr (std::is_floating_point_v) { @@ -175,7 +175,7 @@ struct PositiveModuloImpl : ModuloImpl using ResultType = typename NumberTraits::ResultOfPositiveModulo::Type; template - static inline Result apply(A a, B b) + static Result apply(A a, B b) { auto res = ModuloImpl::template apply(a, b); if constexpr (is_signed_v) diff --git a/src/Functions/ExtractString.h b/src/Functions/ExtractString.h index aa0e1b04835..5b8fa41958a 100644 --- a/src/Functions/ExtractString.h +++ b/src/Functions/ExtractString.h @@ -20,7 +20,7 @@ namespace DB // includes extracting ASCII ngram, UTF8 ngram, ASCII word and UTF8 word struct ExtractStringImpl { - static ALWAYS_INLINE inline const UInt8 * readOneWord(const UInt8 *& pos, const UInt8 * end) + static ALWAYS_INLINE const UInt8 * readOneWord(const UInt8 *& pos, const UInt8 * end) { // jump separators while (pos < end && isUTF8Sep(*pos)) @@ -35,10 +35,10 @@ struct ExtractStringImpl } // we use ASCII non-alphanum character as UTF8 separator - static ALWAYS_INLINE inline bool isUTF8Sep(const UInt8 c) { return c < 128 && !isAlphaNumericASCII(c); } + static ALWAYS_INLINE bool isUTF8Sep(const UInt8 c) { return c < 128 && !isAlphaNumericASCII(c); } // read one UTF8 character - static ALWAYS_INLINE inline void readOneUTF8Code(const UInt8 *& pos, const UInt8 * end) + static ALWAYS_INLINE void readOneUTF8Code(const UInt8 *& pos, const UInt8 * end) { size_t length = UTF8::seqLength(*pos); diff --git a/src/Functions/FunctionBinaryArithmetic.h b/src/Functions/FunctionBinaryArithmetic.h index 6203999fa37..5d19ba44d9b 100644 --- a/src/Functions/FunctionBinaryArithmetic.h +++ b/src/Functions/FunctionBinaryArithmetic.h @@ -284,7 +284,7 @@ struct BinaryOperation private: template - static inline void apply(const A * __restrict a, const B * __restrict b, ResultType * __restrict c, size_t i) + static void apply(const A * __restrict a, const B * __restrict b, ResultType * __restrict c, size_t i) { if constexpr (op_case == OpCase::Vector) c[i] = Op::template apply(a[i], b[i]); @@ -432,7 +432,7 @@ template struct FixedStringReduceOperationImpl { template 
- static void inline process(const UInt8 * __restrict a, const UInt8 * __restrict b, UInt16 * __restrict result, size_t size, size_t N) + static void process(const UInt8 * __restrict a, const UInt8 * __restrict b, UInt16 * __restrict result, size_t size, size_t N) { if constexpr (op_case == OpCase::Vector) vectorVector(a, b, result, size, N); @@ -503,7 +503,7 @@ struct StringReduceOperationImpl } } - static inline UInt64 constConst(std::string_view a, std::string_view b) + static UInt64 constConst(std::string_view a, std::string_view b) { return process( reinterpret_cast(a.data()), @@ -643,7 +643,7 @@ public: private: template - static inline void processWithRightNullmapImpl(const auto & a, const auto & b, ResultContainerType & c, size_t size, const NullMap * right_nullmap, ApplyFunc apply_func) + static void processWithRightNullmapImpl(const auto & a, const auto & b, ResultContainerType & c, size_t size, const NullMap * right_nullmap, ApplyFunc apply_func) { if (right_nullmap) { diff --git a/src/Functions/FunctionSQLJSON.h b/src/Functions/FunctionSQLJSON.h index 37db514fd1f..83ed874c47b 100644 --- a/src/Functions/FunctionSQLJSON.h +++ b/src/Functions/FunctionSQLJSON.h @@ -44,27 +44,27 @@ class DefaultJSONStringSerializer public: explicit DefaultJSONStringSerializer(ColumnString & col_str_) : col_str(col_str_) { } - inline void addRawData(const char * ptr, size_t len) + void addRawData(const char * ptr, size_t len) { out << std::string_view(ptr, len); } - inline void addRawString(std::string_view str) + void addRawString(std::string_view str) { out << str; } /// serialize the json element into stringstream - inline void addElement(const Element & element) + void addElement(const Element & element) { out << element.getElement(); } - inline void commit() + void commit() { auto out_str = out.str(); col_str.insertData(out_str.data(), out_str.size()); } - inline void rollback() {} + void rollback() {} private: ColumnString & col_str; std::stringstream out; // STYLE_CHECK_ALLOW_STD_STRING_STREAM @@ -82,27 +82,27 @@ public: prev_offset = offsets.empty() ? 0 : offsets.back(); } /// Put the data into column's buffer directly. 
- inline void addRawData(const char * ptr, size_t len) + void addRawData(const char * ptr, size_t len) { chars.insert(ptr, ptr + len); } - inline void addRawString(std::string_view str) + void addRawString(std::string_view str) { chars.insert(str.data(), str.data() + str.size()); } /// serialize the json element into column's buffer directly - inline void addElement(const Element & element) + void addElement(const Element & element) { formatter.append(element.getElement()); } - inline void commit() + void commit() { chars.push_back(0); offsets.push_back(chars.size()); } - inline void rollback() + void rollback() { chars.resize(prev_offset); } diff --git a/src/Functions/FunctionsAES.h b/src/Functions/FunctionsAES.h index 14745460658..524b4f82acd 100644 --- a/src/Functions/FunctionsAES.h +++ b/src/Functions/FunctionsAES.h @@ -59,7 +59,7 @@ enum class CipherMode : uint8_t template struct KeyHolder { - inline StringRef setKey(size_t cipher_key_size, StringRef key) const + StringRef setKey(size_t cipher_key_size, StringRef key) const { if (key.size != cipher_key_size) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Invalid key size: {} expected {}", key.size, cipher_key_size); @@ -71,7 +71,7 @@ struct KeyHolder template <> struct KeyHolder { - inline StringRef setKey(size_t cipher_key_size, StringRef key) + StringRef setKey(size_t cipher_key_size, StringRef key) { if (key.size < cipher_key_size) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Invalid key size: {} expected {}", key.size, cipher_key_size); diff --git a/src/Functions/FunctionsBitToArray.cpp b/src/Functions/FunctionsBitToArray.cpp index 566ce16d1a7..adabda1a7f8 100644 --- a/src/Functions/FunctionsBitToArray.cpp +++ b/src/Functions/FunctionsBitToArray.cpp @@ -79,7 +79,7 @@ public: private: template - inline static void writeBitmask(T x, WriteBuffer & out) + static void writeBitmask(T x, WriteBuffer & out) { using UnsignedT = make_unsigned_t; UnsignedT u_x = x; diff --git a/src/Functions/FunctionsCodingIP.cpp b/src/Functions/FunctionsCodingIP.cpp index 54f7b6dd1f4..e01967274f4 100644 --- a/src/Functions/FunctionsCodingIP.cpp +++ b/src/Functions/FunctionsCodingIP.cpp @@ -785,7 +785,7 @@ private: #include - static inline void applyCIDRMask(const char * __restrict src, char * __restrict dst_lower, char * __restrict dst_upper, UInt8 bits_to_keep) + static void applyCIDRMask(const char * __restrict src, char * __restrict dst_lower, char * __restrict dst_upper, UInt8 bits_to_keep) { __m128i mask = _mm_loadu_si128(reinterpret_cast(getCIDRMaskIPv6(bits_to_keep).data())); __m128i lower = _mm_and_si128(_mm_loadu_si128(reinterpret_cast(src)), mask); @@ -916,7 +916,7 @@ public: class FunctionIPv4CIDRToRange : public IFunction { private: - static inline std::pair applyCIDRMask(UInt32 src, UInt8 bits_to_keep) + static std::pair applyCIDRMask(UInt32 src, UInt8 bits_to_keep) { if (bits_to_keep >= 8 * sizeof(UInt32)) return { src, src }; diff --git a/src/Functions/FunctionsConsistentHashing.h b/src/Functions/FunctionsConsistentHashing.h index 6f2eec5be98..306b6395dc5 100644 --- a/src/Functions/FunctionsConsistentHashing.h +++ b/src/Functions/FunctionsConsistentHashing.h @@ -83,7 +83,7 @@ private: using BucketsType = typename Impl::BucketsType; template - inline BucketsType checkBucketsRange(T buckets) const + BucketsType checkBucketsRange(T buckets) const { if (unlikely(buckets <= 0)) throw Exception(ErrorCodes::BAD_ARGUMENTS, "The second argument of function {} (number of buckets) must be positive number", getName()); diff --git 
a/src/Functions/FunctionsLanguageClassification.cpp b/src/Functions/FunctionsLanguageClassification.cpp index 55485d41ce0..94391606762 100644 --- a/src/Functions/FunctionsLanguageClassification.cpp +++ b/src/Functions/FunctionsLanguageClassification.cpp @@ -31,7 +31,7 @@ extern const int SUPPORT_IS_DISABLED; struct FunctionDetectLanguageImpl { - static ALWAYS_INLINE inline std::string_view codeISO(std::string_view code_string) + static ALWAYS_INLINE std::string_view codeISO(std::string_view code_string) { if (code_string.ends_with("-Latn")) code_string.remove_suffix(code_string.size() - 5); diff --git a/src/Functions/FunctionsLogical.cpp b/src/Functions/FunctionsLogical.cpp index 7e7ae76d6eb..2f5ce6deebf 100644 --- a/src/Functions/FunctionsLogical.cpp +++ b/src/Functions/FunctionsLogical.cpp @@ -170,7 +170,7 @@ public: : vec(in[in.size() - N]->getData()), next(in) {} /// Returns a combination of values in the i-th row of all columns stored in the constructor. - inline ResultValueType apply(const size_t i) const + ResultValueType apply(const size_t i) const { const auto a = !!vec[i]; return Op::apply(a, next.apply(i)); @@ -190,7 +190,7 @@ public: explicit AssociativeApplierImpl(const UInt8ColumnPtrs & in) : vec(in[in.size() - 1]->getData()) {} - inline ResultValueType apply(const size_t i) const { return !!vec[i]; } + ResultValueType apply(const size_t i) const { return !!vec[i]; } private: const UInt8Container & vec; @@ -291,7 +291,7 @@ public: } /// Returns a combination of values in the i-th row of all columns stored in the constructor. - inline ResultValueType apply(const size_t i) const + ResultValueType apply(const size_t i) const { return Op::ternaryApply(vec[i], next.apply(i)); } @@ -315,7 +315,7 @@ public: TernaryValueBuilder::build(in[in.size() - 1], vec.data()); } - inline ResultValueType apply(const size_t i) const { return vec[i]; } + ResultValueType apply(const size_t i) const { return vec[i]; } private: UInt8Container vec; diff --git a/src/Functions/FunctionsLogical.h b/src/Functions/FunctionsLogical.h index 41464329f79..3c2eb3ee0b8 100644 --- a/src/Functions/FunctionsLogical.h +++ b/src/Functions/FunctionsLogical.h @@ -84,47 +84,47 @@ struct AndImpl { using ResultType = UInt8; - static inline constexpr bool isSaturable() { return true; } + static constexpr bool isSaturable() { return true; } /// Final value in two-valued logic (no further operations with True, False will change this value) - static inline constexpr bool isSaturatedValue(bool a) { return !a; } + static constexpr bool isSaturatedValue(bool a) { return !a; } /// Final value in three-valued logic (no further operations with True, False, Null will change this value) - static inline constexpr bool isSaturatedValueTernary(UInt8 a) { return a == Ternary::False; } + static constexpr bool isSaturatedValueTernary(UInt8 a) { return a == Ternary::False; } - static inline constexpr ResultType apply(UInt8 a, UInt8 b) { return a & b; } + static constexpr ResultType apply(UInt8 a, UInt8 b) { return a & b; } - static inline constexpr ResultType ternaryApply(UInt8 a, UInt8 b) { return std::min(a, b); } + static constexpr ResultType ternaryApply(UInt8 a, UInt8 b) { return std::min(a, b); } /// Will use three-valued logic for NULLs (see above) or default implementation (any operation with NULL returns NULL). 
- static inline constexpr bool specialImplementationForNulls() { return true; } + static constexpr bool specialImplementationForNulls() { return true; } }; struct OrImpl { using ResultType = UInt8; - static inline constexpr bool isSaturable() { return true; } - static inline constexpr bool isSaturatedValue(bool a) { return a; } - static inline constexpr bool isSaturatedValueTernary(UInt8 a) { return a == Ternary::True; } - static inline constexpr ResultType apply(UInt8 a, UInt8 b) { return a | b; } - static inline constexpr ResultType ternaryApply(UInt8 a, UInt8 b) { return std::max(a, b); } - static inline constexpr bool specialImplementationForNulls() { return true; } + static constexpr bool isSaturable() { return true; } + static constexpr bool isSaturatedValue(bool a) { return a; } + static constexpr bool isSaturatedValueTernary(UInt8 a) { return a == Ternary::True; } + static constexpr ResultType apply(UInt8 a, UInt8 b) { return a | b; } + static constexpr ResultType ternaryApply(UInt8 a, UInt8 b) { return std::max(a, b); } + static constexpr bool specialImplementationForNulls() { return true; } }; struct XorImpl { using ResultType = UInt8; - static inline constexpr bool isSaturable() { return false; } - static inline constexpr bool isSaturatedValue(bool) { return false; } - static inline constexpr bool isSaturatedValueTernary(UInt8) { return false; } - static inline constexpr ResultType apply(UInt8 a, UInt8 b) { return a != b; } - static inline constexpr ResultType ternaryApply(UInt8 a, UInt8 b) { return a != b; } - static inline constexpr bool specialImplementationForNulls() { return false; } + static constexpr bool isSaturable() { return false; } + static constexpr bool isSaturatedValue(bool) { return false; } + static constexpr bool isSaturatedValueTernary(UInt8) { return false; } + static constexpr ResultType apply(UInt8 a, UInt8 b) { return a != b; } + static constexpr ResultType ternaryApply(UInt8 a, UInt8 b) { return a != b; } + static constexpr bool specialImplementationForNulls() { return false; } #if USE_EMBEDDED_COMPILER - static inline llvm::Value * apply(llvm::IRBuilder<> & builder, llvm::Value * a, llvm::Value * b) + static llvm::Value * apply(llvm::IRBuilder<> & builder, llvm::Value * a, llvm::Value * b) { return builder.CreateXor(a, b); } @@ -136,13 +136,13 @@ struct NotImpl { using ResultType = UInt8; - static inline ResultType apply(A a) + static ResultType apply(A a) { return !static_cast(a); } #if USE_EMBEDDED_COMPILER - static inline llvm::Value * apply(llvm::IRBuilder<> & builder, llvm::Value * a) + static llvm::Value * apply(llvm::IRBuilder<> & builder, llvm::Value * a) { return builder.CreateNot(a); } diff --git a/src/Functions/FunctionsProgrammingClassification.cpp b/src/Functions/FunctionsProgrammingClassification.cpp index a93e1d9a87d..8e9eff50aab 100644 --- a/src/Functions/FunctionsProgrammingClassification.cpp +++ b/src/Functions/FunctionsProgrammingClassification.cpp @@ -21,7 +21,7 @@ namespace DB struct FunctionDetectProgrammingLanguageImpl { /// Calculate total weight - static ALWAYS_INLINE inline Float64 stateMachine( + static ALWAYS_INLINE Float64 stateMachine( const FrequencyHolder::Map & standard, const std::unordered_map & model) { diff --git a/src/Functions/FunctionsRound.h b/src/Functions/FunctionsRound.h index 99f3a14dfec..1f20fbff24e 100644 --- a/src/Functions/FunctionsRound.h +++ b/src/Functions/FunctionsRound.h @@ -296,7 +296,7 @@ class FloatRoundingComputation : public BaseFloatRoundingComputation using Base = BaseFloatRoundingComputation; 
public: - static inline void compute(const T * __restrict in, const typename Base::VectorType & scale, T * __restrict out) + static void compute(const T * __restrict in, const typename Base::VectorType & scale, T * __restrict out) { auto val = Base::load(in); diff --git a/src/Functions/FunctionsStringHash.cpp b/src/Functions/FunctionsStringHash.cpp index 0bf6e39e651..cd33564caf9 100644 --- a/src/Functions/FunctionsStringHash.cpp +++ b/src/Functions/FunctionsStringHash.cpp @@ -99,7 +99,7 @@ struct Hash } template - static ALWAYS_INLINE inline UInt64 shingleHash(UInt64 crc, const UInt8 * start, size_t size) + static ALWAYS_INLINE UInt64 shingleHash(UInt64 crc, const UInt8 * start, size_t size) { if (size & 1) { @@ -153,7 +153,7 @@ struct Hash } template - static ALWAYS_INLINE inline UInt64 shingleHash(const std::vector & shingle, size_t offset = 0) + static ALWAYS_INLINE UInt64 shingleHash(const std::vector & shingle, size_t offset = 0) { UInt64 crc = -1ULL; @@ -177,14 +177,14 @@ struct SimHashImpl static constexpr size_t min_word_size = 4; /// Update fingerprint according to hash_value bits. - static ALWAYS_INLINE inline void updateFingerVector(Int64 * finger_vec, UInt64 hash_value) + static ALWAYS_INLINE void updateFingerVector(Int64 * finger_vec, UInt64 hash_value) { for (size_t i = 0; i < 64; ++i) finger_vec[i] += (hash_value & (1ULL << i)) ? 1 : -1; } /// Return a 64 bit value according to finger_vec. - static ALWAYS_INLINE inline UInt64 getSimHash(const Int64 * finger_vec) + static ALWAYS_INLINE UInt64 getSimHash(const Int64 * finger_vec) { UInt64 res = 0; @@ -200,7 +200,7 @@ struct SimHashImpl // for each ngram, calculate a 64 bit hash value, and update the vector according the hash value // finally return a 64 bit value(UInt64), i'th bit is 1 means vector[i] > 0, otherwise, vector[i] < 0 - static ALWAYS_INLINE inline UInt64 ngramHashASCII(const UInt8 * data, size_t size, size_t shingle_size) + static ALWAYS_INLINE UInt64 ngramHashASCII(const UInt8 * data, size_t size, size_t shingle_size) { if (size < shingle_size) return Hash::shingleHash(-1ULL, data, size); @@ -217,7 +217,7 @@ struct SimHashImpl return getSimHash(finger_vec); } - static ALWAYS_INLINE inline UInt64 ngramHashUTF8(const UInt8 * data, size_t size, size_t shingle_size) + static ALWAYS_INLINE UInt64 ngramHashUTF8(const UInt8 * data, size_t size, size_t shingle_size) { const UInt8 * start = data; const UInt8 * end = data + size; @@ -259,7 +259,7 @@ struct SimHashImpl // 2. 
next, we extract one word each time, and calculate a new hash value of the new word,then use the latest N hash // values to calculate the next word shingle hash value - static ALWAYS_INLINE inline UInt64 wordShingleHash(const UInt8 * data, size_t size, size_t shingle_size) + static ALWAYS_INLINE UInt64 wordShingleHash(const UInt8 * data, size_t size, size_t shingle_size) { const UInt8 * start = data; const UInt8 * end = data + size; @@ -400,7 +400,7 @@ struct MinHashImpl using MaxHeap = Heap>; using MinHeap = Heap>; - static ALWAYS_INLINE inline void ngramHashASCII( + static ALWAYS_INLINE void ngramHashASCII( MinHeap & min_heap, MaxHeap & max_heap, const UInt8 * data, @@ -429,7 +429,7 @@ struct MinHashImpl } } - static ALWAYS_INLINE inline void ngramHashUTF8( + static ALWAYS_INLINE void ngramHashUTF8( MinHeap & min_heap, MaxHeap & max_heap, const UInt8 * data, @@ -472,7 +472,7 @@ struct MinHashImpl // MinHash word shingle hash value calculate function: String ->Tuple(UInt64, UInt64) // for each word shingle, we calculate a hash value, but in fact, we just maintain the // K minimum and K maximum hash value - static ALWAYS_INLINE inline void wordShingleHash( + static ALWAYS_INLINE void wordShingleHash( MinHeap & min_heap, MaxHeap & max_heap, const UInt8 * data, diff --git a/src/Functions/FunctionsStringSimilarity.cpp b/src/Functions/FunctionsStringSimilarity.cpp index aadf5c246fc..5224c76d7b0 100644 --- a/src/Functions/FunctionsStringSimilarity.cpp +++ b/src/Functions/FunctionsStringSimilarity.cpp @@ -85,7 +85,7 @@ struct NgramDistanceImpl } template - static ALWAYS_INLINE inline void unrollLowering(Container & cont, const std::index_sequence &) + static ALWAYS_INLINE void unrollLowering(Container & cont, const std::index_sequence &) { ((cont[Offset + I] = std::tolower(cont[Offset + I])), ...); } @@ -195,7 +195,7 @@ struct NgramDistanceImpl } template - static ALWAYS_INLINE inline size_t calculateNeedleStats( + static ALWAYS_INLINE size_t calculateNeedleStats( const char * data, const size_t size, NgramCount * ngram_stats, @@ -228,7 +228,7 @@ struct NgramDistanceImpl } template - static ALWAYS_INLINE inline UInt64 calculateHaystackStatsAndMetric( + static ALWAYS_INLINE UInt64 calculateHaystackStatsAndMetric( const char * data, const size_t size, NgramCount * ngram_stats, @@ -275,7 +275,7 @@ struct NgramDistanceImpl } template - static inline auto dispatchSearcher(Callback callback, Args &&... args) + static auto dispatchSearcher(Callback callback, Args &&... 
args) { if constexpr (!UTF8) return callback(std::forward(args)..., readASCIICodePoints, calculateASCIIHash); diff --git a/src/Functions/FunctionsTimeWindow.h b/src/Functions/FunctionsTimeWindow.h index 6183d25c8bd..7522bd374a2 100644 --- a/src/Functions/FunctionsTimeWindow.h +++ b/src/Functions/FunctionsTimeWindow.h @@ -97,7 +97,7 @@ template<> \ template <> \ struct AddTime \ { \ - static inline auto execute(UInt16 d, Int64 delta, const DateLUTImpl & time_zone) \ + static auto execute(UInt16 d, Int64 delta, const DateLUTImpl & time_zone) \ { \ return time_zone.add##INTERVAL_KIND##s(ExtendedDayNum(d), delta); \ } \ @@ -110,7 +110,7 @@ template<> \ template <> struct AddTime { - static inline NO_SANITIZE_UNDEFINED ExtendedDayNum execute(UInt16 d, UInt64 delta, const DateLUTImpl &) + static NO_SANITIZE_UNDEFINED ExtendedDayNum execute(UInt16 d, UInt64 delta, const DateLUTImpl &) { return ExtendedDayNum(static_cast(d + delta * 7)); } @@ -120,7 +120,7 @@ template<> \ template <> \ struct AddTime \ { \ - static inline NO_SANITIZE_UNDEFINED UInt32 execute(UInt32 t, Int64 delta, const DateLUTImpl &) \ + static NO_SANITIZE_UNDEFINED UInt32 execute(UInt32 t, Int64 delta, const DateLUTImpl &) \ { return static_cast(t + delta * (INTERVAL)); } \ }; ADD_TIME(Day, 86400) @@ -133,7 +133,7 @@ template<> \ template <> \ struct AddTime \ { \ - static inline NO_SANITIZE_UNDEFINED Int64 execute(Int64 t, UInt64 delta, const UInt32 scale) \ + static NO_SANITIZE_UNDEFINED Int64 execute(Int64 t, UInt64 delta, const UInt32 scale) \ { \ if (scale < (DEF_SCALE)) \ { \ diff --git a/src/Functions/FunctionsTonalityClassification.cpp b/src/Functions/FunctionsTonalityClassification.cpp index 3de38d99c88..a8cc09186f6 100644 --- a/src/Functions/FunctionsTonalityClassification.cpp +++ b/src/Functions/FunctionsTonalityClassification.cpp @@ -18,7 +18,7 @@ namespace DB */ struct FunctionDetectTonalityImpl { - static ALWAYS_INLINE inline Float32 detectTonality( + static ALWAYS_INLINE Float32 detectTonality( const UInt8 * str, const size_t str_len, const FrequencyHolder::Map & emotional_dict) diff --git a/src/Functions/GCDLCMImpl.h b/src/Functions/GCDLCMImpl.h index df531363c31..094c248497b 100644 --- a/src/Functions/GCDLCMImpl.h +++ b/src/Functions/GCDLCMImpl.h @@ -26,7 +26,7 @@ struct GCDLCMImpl static const constexpr bool allow_string_integer = false; template - static inline Result apply(A a, B b) + static Result apply(A a, B b) { throwIfDivisionLeadsToFPE(typename NumberTraits::ToInteger::Type(a), typename NumberTraits::ToInteger::Type(b)); throwIfDivisionLeadsToFPE(typename NumberTraits::ToInteger::Type(b), typename NumberTraits::ToInteger::Type(a)); diff --git a/src/Functions/GregorianDate.cpp b/src/Functions/GregorianDate.cpp index eb7ef4abe56..91861e8bbd2 100644 --- a/src/Functions/GregorianDate.cpp +++ b/src/Functions/GregorianDate.cpp @@ -20,12 +20,12 @@ namespace ErrorCodes namespace { - inline constexpr bool is_leap_year(int32_t year) + constexpr bool is_leap_year(int32_t year) { return (year % 4 == 0) && ((year % 400 == 0) || (year % 100 != 0)); } - inline constexpr uint8_t monthLength(bool is_leap_year, uint8_t month) + constexpr uint8_t monthLength(bool is_leap_year, uint8_t month) { switch (month) { @@ -49,7 +49,7 @@ namespace /** Integer division truncated toward negative infinity. */ template - inline constexpr I div(I x, J y) + constexpr I div(I x, J y) { const auto y_cast = static_cast(y); if (x > 0 && y_cast < 0) @@ -63,7 +63,7 @@ namespace /** Integer modulus, satisfying div(x, y)*y + mod(x, y) == x. 
*/ template - inline constexpr I mod(I x, J y) + constexpr I mod(I x, J y) { const auto y_cast = static_cast(y); const auto r = x % y_cast; @@ -76,7 +76,7 @@ namespace /** Like std::min(), but the type of operands may differ. */ template - inline constexpr I min(I x, J y) + constexpr I min(I x, J y) { const auto y_cast = static_cast(y); return x < y_cast ? x : y_cast; diff --git a/src/Functions/PolygonUtils.h b/src/Functions/PolygonUtils.h index c4851718da6..0c57fd7f0b5 100644 --- a/src/Functions/PolygonUtils.h +++ b/src/Functions/PolygonUtils.h @@ -124,7 +124,7 @@ public: bool hasEmptyBound() const { return has_empty_bound; } - inline bool ALWAYS_INLINE contains(CoordinateType x, CoordinateType y) const + bool ALWAYS_INLINE contains(CoordinateType x, CoordinateType y) const { Point point(x, y); diff --git a/src/Functions/TransformDateTime64.h b/src/Functions/TransformDateTime64.h index 896e9d8ca48..b52ccd3cce0 100644 --- a/src/Functions/TransformDateTime64.h +++ b/src/Functions/TransformDateTime64.h @@ -53,7 +53,7 @@ public: {} template - inline auto NO_SANITIZE_UNDEFINED execute(const DateTime64 & t, Args && ... args) const + auto NO_SANITIZE_UNDEFINED execute(const DateTime64 & t, Args && ... args) const { /// Type conversion from float to integer may be required. /// We are Ok with implementation specific result for out of range and denormals conversion. @@ -90,14 +90,14 @@ public: template requires(!std::same_as) - inline auto execute(const T & t, Args &&... args) const + auto execute(const T & t, Args &&... args) const { return wrapped_transform.execute(t, std::forward(args)...); } template - inline auto NO_SANITIZE_UNDEFINED executeExtendedResult(const DateTime64 & t, Args && ... args) const + auto NO_SANITIZE_UNDEFINED executeExtendedResult(const DateTime64 & t, Args && ... args) const { /// Type conversion from float to integer may be required. /// We are Ok with implementation specific result for out of range and denormals conversion. @@ -131,7 +131,7 @@ public: template requires (!std::same_as) - inline auto executeExtendedResult(const T & t, Args && ... args) const + auto executeExtendedResult(const T & t, Args && ... args) const { return wrapped_transform.executeExtendedResult(t, std::forward(args)...); } diff --git a/src/Functions/abs.cpp b/src/Functions/abs.cpp index 0cd313caf1e..9ac2363f765 100644 --- a/src/Functions/abs.cpp +++ b/src/Functions/abs.cpp @@ -12,7 +12,7 @@ struct AbsImpl using ResultType = std::conditional_t, A, typename NumberTraits::ResultOfAbs::Type>; static constexpr bool allow_string_or_fixed_string = false; - static inline NO_SANITIZE_UNDEFINED ResultType apply(A a) + static NO_SANITIZE_UNDEFINED ResultType apply(A a) { if constexpr (is_decimal) return a < A(0) ? 
A(-a) : a; diff --git a/src/Functions/array/arrayIndex.h b/src/Functions/array/arrayIndex.h index 395f96bbffb..fa9b3dc92dd 100644 --- a/src/Functions/array/arrayIndex.h +++ b/src/Functions/array/arrayIndex.h @@ -322,7 +322,7 @@ private: } template - static inline void invokeCheckNullMaps( + static void invokeCheckNullMaps( const ColumnString::Chars & data, const ColumnArray::Offsets & offsets, const ColumnString::Offsets & str_offsets, const ColumnString::Chars & values, OffsetT item_offsets, @@ -339,7 +339,7 @@ private: } public: - static inline void process( + static void process( const ColumnString::Chars & data, const ColumnArray::Offsets & offsets, const ColumnString::Offsets & string_offsets, const ColumnString::Chars & item_values, Offset item_offsets, PaddedPODArray & result, @@ -348,7 +348,7 @@ public: invokeCheckNullMaps(data, offsets, string_offsets, item_values, item_offsets, result, data_map, item_map); } - static inline void process( + static void process( const ColumnString::Chars & data, const ColumnArray::Offsets & offsets, const ColumnString::Offsets & string_offsets, const ColumnString::Chars & item_values, const ColumnString::Offsets & item_offsets, PaddedPODArray & result, @@ -467,10 +467,10 @@ private: NullMaps maps; ResultColumnPtr result { ResultColumnType::create() }; - inline void moveResult() { result_column = std::move(result); } + void moveResult() { result_column = std::move(result); } }; - static inline bool allowArguments(const DataTypePtr & inner_type, const DataTypePtr & arg) + static bool allowArguments(const DataTypePtr & inner_type, const DataTypePtr & arg) { auto inner_type_decayed = removeNullable(removeLowCardinality(inner_type)); auto arg_decayed = removeNullable(removeLowCardinality(arg)); @@ -633,7 +633,7 @@ private: * (s1, s1, s2, ...), (s2, s1, s2, ...), (s3, s1, s2, ...) */ template - static inline ColumnPtr executeIntegral(const ColumnsWithTypeAndName & arguments) + static ColumnPtr executeIntegral(const ColumnsWithTypeAndName & arguments) { const ColumnArray * const left = checkAndGetColumn(arguments[0].column.get()); @@ -658,14 +658,14 @@ private: } template - static inline bool executeIntegral(ExecutionData& data) + static bool executeIntegral(ExecutionData& data) { return (executeIntegralExpanded(data) || ...); } /// Invoke executeIntegralImpl with such parameters: (A, other1), (A, other2), ... 
template - static inline bool executeIntegralExpanded(ExecutionData& data) + static bool executeIntegralExpanded(ExecutionData& data) { return (executeIntegralImpl(data) || ...); } diff --git a/src/Functions/array/arrayNorm.cpp b/src/Functions/array/arrayNorm.cpp index e87eff6add1..ca1e8f21aee 100644 --- a/src/Functions/array/arrayNorm.cpp +++ b/src/Functions/array/arrayNorm.cpp @@ -25,19 +25,19 @@ struct L1Norm struct ConstParams {}; template - inline static ResultType accumulate(ResultType result, ResultType value, const ConstParams &) + static ResultType accumulate(ResultType result, ResultType value, const ConstParams &) { return result + fabs(value); } template - inline static ResultType combine(ResultType result, ResultType other_result, const ConstParams &) + static ResultType combine(ResultType result, ResultType other_result, const ConstParams &) { return result + other_result; } template - inline static ResultType finalize(ResultType result, const ConstParams &) + static ResultType finalize(ResultType result, const ConstParams &) { return result; } @@ -50,19 +50,19 @@ struct L2Norm struct ConstParams {}; template - inline static ResultType accumulate(ResultType result, ResultType value, const ConstParams &) + static ResultType accumulate(ResultType result, ResultType value, const ConstParams &) { return result + value * value; } template - inline static ResultType combine(ResultType result, ResultType other_result, const ConstParams &) + static ResultType combine(ResultType result, ResultType other_result, const ConstParams &) { return result + other_result; } template - inline static ResultType finalize(ResultType result, const ConstParams &) + static ResultType finalize(ResultType result, const ConstParams &) { return sqrt(result); } @@ -73,7 +73,7 @@ struct L2SquaredNorm : L2Norm static constexpr auto name = "L2Squared"; template - inline static ResultType finalize(ResultType result, const ConstParams &) + static ResultType finalize(ResultType result, const ConstParams &) { return result; } @@ -91,19 +91,19 @@ struct LpNorm }; template - inline static ResultType accumulate(ResultType result, ResultType value, const ConstParams & params) + static ResultType accumulate(ResultType result, ResultType value, const ConstParams & params) { return result + static_cast(std::pow(fabs(value), params.power)); } template - inline static ResultType combine(ResultType result, ResultType other_result, const ConstParams &) + static ResultType combine(ResultType result, ResultType other_result, const ConstParams &) { return result + other_result; } template - inline static ResultType finalize(ResultType result, const ConstParams & params) + static ResultType finalize(ResultType result, const ConstParams & params) { return static_cast(std::pow(result, params.inverted_power)); } @@ -116,19 +116,19 @@ struct LinfNorm struct ConstParams {}; template - inline static ResultType accumulate(ResultType result, ResultType value, const ConstParams &) + static ResultType accumulate(ResultType result, ResultType value, const ConstParams &) { return fmax(result, fabs(value)); } template - inline static ResultType combine(ResultType result, ResultType other_result, const ConstParams &) + static ResultType combine(ResultType result, ResultType other_result, const ConstParams &) { return fmax(result, other_result); } template - inline static ResultType finalize(ResultType result, const ConstParams &) + static ResultType finalize(ResultType result, const ConstParams &) { return result; } diff --git 
a/src/Functions/bitAnd.cpp b/src/Functions/bitAnd.cpp index 8efc5181919..c6ab9023142 100644 --- a/src/Functions/bitAnd.cpp +++ b/src/Functions/bitAnd.cpp @@ -20,7 +20,7 @@ struct BitAndImpl static constexpr bool allow_string_integer = false; template - static inline Result apply(A a, B b) + static Result apply(A a, B b) { return static_cast(a) & static_cast(b); } @@ -28,7 +28,7 @@ struct BitAndImpl #if USE_EMBEDDED_COMPILER static constexpr bool compilable = true; - static inline llvm::Value * compile(llvm::IRBuilder<> & b, llvm::Value * left, llvm::Value * right, bool) + static llvm::Value * compile(llvm::IRBuilder<> & b, llvm::Value * left, llvm::Value * right, bool) { if (!left->getType()->isIntegerTy()) throw Exception(ErrorCodes::LOGICAL_ERROR, "BitAndImpl expected an integral type"); diff --git a/src/Functions/bitBoolMaskAnd.cpp b/src/Functions/bitBoolMaskAnd.cpp index 11c0c1d1b7d..bd89b6eb69a 100644 --- a/src/Functions/bitBoolMaskAnd.cpp +++ b/src/Functions/bitBoolMaskAnd.cpp @@ -25,7 +25,7 @@ struct BitBoolMaskAndImpl static const constexpr bool allow_string_integer = false; template - static inline Result apply([[maybe_unused]] A left, [[maybe_unused]] B right) + static Result apply([[maybe_unused]] A left, [[maybe_unused]] B right) { // Should be a logical error, but this function is callable from SQL. // Need to investigate this. diff --git a/src/Functions/bitBoolMaskOr.cpp b/src/Functions/bitBoolMaskOr.cpp index 7940bf3e2ca..1ddf2d258f8 100644 --- a/src/Functions/bitBoolMaskOr.cpp +++ b/src/Functions/bitBoolMaskOr.cpp @@ -25,7 +25,7 @@ struct BitBoolMaskOrImpl static const constexpr bool allow_string_integer = false; template - static inline Result apply([[maybe_unused]] A left, [[maybe_unused]] B right) + static Result apply([[maybe_unused]] A left, [[maybe_unused]] B right) { if constexpr (!std::is_same_v || !std::is_same_v) // Should be a logical error, but this function is callable from SQL. diff --git a/src/Functions/bitCount.cpp b/src/Functions/bitCount.cpp index f1a3ac897c1..68555b1386c 100644 --- a/src/Functions/bitCount.cpp +++ b/src/Functions/bitCount.cpp @@ -13,7 +13,7 @@ struct BitCountImpl using ResultType = std::conditional_t<(sizeof(A) * 8 >= 256), UInt16, UInt8>; static constexpr bool allow_string_or_fixed_string = true; - static inline ResultType apply(A a) + static ResultType apply(A a) { /// We count bits in the value representation in memory. For example, we support floats. /// We need to avoid sign-extension when converting signed numbers to larger type. So, uint8_t(-1) has 8 bits. diff --git a/src/Functions/bitHammingDistance.cpp b/src/Functions/bitHammingDistance.cpp index f00f38b61af..f8a1a95ae14 100644 --- a/src/Functions/bitHammingDistance.cpp +++ b/src/Functions/bitHammingDistance.cpp @@ -19,7 +19,7 @@ struct BitHammingDistanceImpl static constexpr bool allow_string_integer = false; template - static inline NO_SANITIZE_UNDEFINED Result apply(A a, B b) + static NO_SANITIZE_UNDEFINED Result apply(A a, B b) { /// Note: it's unspecified if signed integers should be promoted with sign-extension or with zero-fill. /// This behavior can change in the future. 
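The BitCountImpl comment in the hunk above notes that bits are counted over the value's in-memory representation, so signed inputs must not be sign-extended before counting (uint8_t(-1) has exactly 8 set bits). A minimal standalone sketch of that idea, separate from the patch, with illustrative names and assuming a C++20 <bit> header:

#include <bit>
#include <cstdint>
#include <cstdio>

// Count set bits of a signed 8-bit value without sign-extension: go through
// the unsigned type of the same width, so int8_t(-1) yields exactly 8 bits.
static int popcountOfRepresentation(int8_t a)
{
    return std::popcount(static_cast<uint8_t>(a));
}

// Count set bits of a float's 32-bit IEEE-754 representation.
static int popcountOfRepresentation(float f)
{
    return std::popcount(std::bit_cast<uint32_t>(f));
}

int main()
{
    std::printf("%d\n", popcountOfRepresentation(int8_t{-1})); // 8
    std::printf("%d\n", popcountOfRepresentation(1.0f));       // 7 (1.0f is 0x3F800000)
    return 0;
}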
diff --git a/src/Functions/bitNot.cpp b/src/Functions/bitNot.cpp index 62ebdc7c52a..44dc77bb7bb 100644 --- a/src/Functions/bitNot.cpp +++ b/src/Functions/bitNot.cpp @@ -19,7 +19,7 @@ struct BitNotImpl using ResultType = typename NumberTraits::ResultOfBitNot::Type; static constexpr bool allow_string_or_fixed_string = true; - static inline ResultType NO_SANITIZE_UNDEFINED apply(A a) + static ResultType NO_SANITIZE_UNDEFINED apply(A a) { return ~static_cast(a); } @@ -27,7 +27,7 @@ struct BitNotImpl #if USE_EMBEDDED_COMPILER static constexpr bool compilable = true; - static inline llvm::Value * compile(llvm::IRBuilder<> & b, llvm::Value * arg, bool) + static llvm::Value * compile(llvm::IRBuilder<> & b, llvm::Value * arg, bool) { if (!arg->getType()->isIntegerTy()) throw Exception(ErrorCodes::LOGICAL_ERROR, "BitNotImpl expected an integral type"); diff --git a/src/Functions/bitOr.cpp b/src/Functions/bitOr.cpp index 9e19fc55219..22ce15d892d 100644 --- a/src/Functions/bitOr.cpp +++ b/src/Functions/bitOr.cpp @@ -19,7 +19,7 @@ struct BitOrImpl static constexpr bool allow_string_integer = false; template - static inline Result apply(A a, B b) + static Result apply(A a, B b) { return static_cast(a) | static_cast(b); } @@ -27,7 +27,7 @@ struct BitOrImpl #if USE_EMBEDDED_COMPILER static constexpr bool compilable = true; - static inline llvm::Value * compile(llvm::IRBuilder<> & b, llvm::Value * left, llvm::Value * right, bool) + static llvm::Value * compile(llvm::IRBuilder<> & b, llvm::Value * left, llvm::Value * right, bool) { if (!left->getType()->isIntegerTy()) throw Exception(ErrorCodes::LOGICAL_ERROR, "BitOrImpl expected an integral type"); diff --git a/src/Functions/bitRotateLeft.cpp b/src/Functions/bitRotateLeft.cpp index c72466b8d49..2fe2c4e0f1d 100644 --- a/src/Functions/bitRotateLeft.cpp +++ b/src/Functions/bitRotateLeft.cpp @@ -20,7 +20,7 @@ struct BitRotateLeftImpl static const constexpr bool allow_string_integer = false; template - static inline NO_SANITIZE_UNDEFINED Result apply(A a [[maybe_unused]], B b [[maybe_unused]]) + static NO_SANITIZE_UNDEFINED Result apply(A a [[maybe_unused]], B b [[maybe_unused]]) { if constexpr (is_big_int_v || is_big_int_v) throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Bit rotate is not implemented for big integers"); @@ -32,7 +32,7 @@ struct BitRotateLeftImpl #if USE_EMBEDDED_COMPILER static constexpr bool compilable = true; - static inline llvm::Value * compile(llvm::IRBuilder<> & b, llvm::Value * left, llvm::Value * right, bool) + static llvm::Value * compile(llvm::IRBuilder<> & b, llvm::Value * left, llvm::Value * right, bool) { if (!left->getType()->isIntegerTy()) throw Exception(ErrorCodes::LOGICAL_ERROR, "BitRotateLeftImpl expected an integral type"); diff --git a/src/Functions/bitRotateRight.cpp b/src/Functions/bitRotateRight.cpp index 045758f9a31..a2f0fe12324 100644 --- a/src/Functions/bitRotateRight.cpp +++ b/src/Functions/bitRotateRight.cpp @@ -20,7 +20,7 @@ struct BitRotateRightImpl static const constexpr bool allow_string_integer = false; template - static inline NO_SANITIZE_UNDEFINED Result apply(A a [[maybe_unused]], B b [[maybe_unused]]) + static NO_SANITIZE_UNDEFINED Result apply(A a [[maybe_unused]], B b [[maybe_unused]]) { if constexpr (is_big_int_v || is_big_int_v) throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Bit rotate is not implemented for big integers"); @@ -32,7 +32,7 @@ struct BitRotateRightImpl #if USE_EMBEDDED_COMPILER static constexpr bool compilable = true; - static inline llvm::Value * compile(llvm::IRBuilder<> & b, llvm::Value 
* left, llvm::Value * right, bool) + static llvm::Value * compile(llvm::IRBuilder<> & b, llvm::Value * left, llvm::Value * right, bool) { if (!left->getType()->isIntegerTy()) throw Exception(ErrorCodes::LOGICAL_ERROR, "BitRotateRightImpl expected an integral type"); diff --git a/src/Functions/bitShiftLeft.cpp b/src/Functions/bitShiftLeft.cpp index 7b3748edb5c..c366a1ecb44 100644 --- a/src/Functions/bitShiftLeft.cpp +++ b/src/Functions/bitShiftLeft.cpp @@ -20,7 +20,7 @@ struct BitShiftLeftImpl static const constexpr bool allow_string_integer = true; template - static inline NO_SANITIZE_UNDEFINED Result apply(A a [[maybe_unused]], B b [[maybe_unused]]) + static NO_SANITIZE_UNDEFINED Result apply(A a [[maybe_unused]], B b [[maybe_unused]]) { if constexpr (is_big_int_v) throw Exception(ErrorCodes::NOT_IMPLEMENTED, "BitShiftLeft is not implemented for big integers as second argument"); @@ -145,7 +145,7 @@ struct BitShiftLeftImpl #if USE_EMBEDDED_COMPILER static constexpr bool compilable = true; - static inline llvm::Value * compile(llvm::IRBuilder<> & b, llvm::Value * left, llvm::Value * right, bool) + static llvm::Value * compile(llvm::IRBuilder<> & b, llvm::Value * left, llvm::Value * right, bool) { if (!left->getType()->isIntegerTy()) throw Exception(ErrorCodes::LOGICAL_ERROR, "BitShiftLeftImpl expected an integral type"); diff --git a/src/Functions/bitShiftRight.cpp b/src/Functions/bitShiftRight.cpp index 21a0f7584aa..1c37cd3bf4c 100644 --- a/src/Functions/bitShiftRight.cpp +++ b/src/Functions/bitShiftRight.cpp @@ -21,7 +21,7 @@ struct BitShiftRightImpl static const constexpr bool allow_string_integer = true; template - static inline NO_SANITIZE_UNDEFINED Result apply(A a [[maybe_unused]], B b [[maybe_unused]]) + static NO_SANITIZE_UNDEFINED Result apply(A a [[maybe_unused]], B b [[maybe_unused]]) { if constexpr (is_big_int_v) throw Exception(ErrorCodes::NOT_IMPLEMENTED, "BitShiftRight is not implemented for big integers as second argument"); @@ -31,7 +31,7 @@ struct BitShiftRightImpl return static_cast(a) >> static_cast(b); } - static inline NO_SANITIZE_UNDEFINED void bitShiftRightForBytes(const UInt8 * op_pointer, const UInt8 * begin, UInt8 * out, const size_t shift_right_bits) + static NO_SANITIZE_UNDEFINED void bitShiftRightForBytes(const UInt8 * op_pointer, const UInt8 * begin, UInt8 * out, const size_t shift_right_bits) { while (op_pointer > begin) { @@ -123,7 +123,7 @@ struct BitShiftRightImpl #if USE_EMBEDDED_COMPILER static constexpr bool compilable = true; - static inline llvm::Value * compile(llvm::IRBuilder<> & b, llvm::Value * left, llvm::Value * right, bool is_signed) + static llvm::Value * compile(llvm::IRBuilder<> & b, llvm::Value * left, llvm::Value * right, bool is_signed) { if (!left->getType()->isIntegerTy()) throw Exception(ErrorCodes::LOGICAL_ERROR, "BitShiftRightImpl expected an integral type"); diff --git a/src/Functions/bitSwapLastTwo.cpp b/src/Functions/bitSwapLastTwo.cpp index d8957598c62..4ff436d5708 100644 --- a/src/Functions/bitSwapLastTwo.cpp +++ b/src/Functions/bitSwapLastTwo.cpp @@ -21,7 +21,7 @@ struct BitSwapLastTwoImpl using ResultType = UInt8; static constexpr const bool allow_string_or_fixed_string = false; - static inline ResultType NO_SANITIZE_UNDEFINED apply([[maybe_unused]] A a) + static ResultType NO_SANITIZE_UNDEFINED apply([[maybe_unused]] A a) { if constexpr (!std::is_same_v) // Should be a logical error, but this function is callable from SQL. 
@@ -35,7 +35,7 @@ struct BitSwapLastTwoImpl #if USE_EMBEDDED_COMPILER static constexpr bool compilable = true; -static inline llvm::Value * compile(llvm::IRBuilder<> & b, llvm::Value * arg, bool) +static llvm::Value * compile(llvm::IRBuilder<> & b, llvm::Value * arg, bool) { if (!arg->getType()->isIntegerTy()) throw Exception(ErrorCodes::LOGICAL_ERROR, "__bitSwapLastTwo expected an integral type"); diff --git a/src/Functions/bitTest.cpp b/src/Functions/bitTest.cpp index 4c9c6aa2dfb..78ec9c8b773 100644 --- a/src/Functions/bitTest.cpp +++ b/src/Functions/bitTest.cpp @@ -21,7 +21,7 @@ struct BitTestImpl static const constexpr bool allow_string_integer = false; template - NO_SANITIZE_UNDEFINED static inline Result apply(A a [[maybe_unused]], B b [[maybe_unused]]) + NO_SANITIZE_UNDEFINED static Result apply(A a [[maybe_unused]], B b [[maybe_unused]]) { if constexpr (is_big_int_v || is_big_int_v) throw Exception(ErrorCodes::NOT_IMPLEMENTED, "bitTest is not implemented for big integers as second argument"); diff --git a/src/Functions/bitTestAll.cpp b/src/Functions/bitTestAll.cpp index a2dcef3eb96..92f63bfa262 100644 --- a/src/Functions/bitTestAll.cpp +++ b/src/Functions/bitTestAll.cpp @@ -9,7 +9,7 @@ namespace struct BitTestAllImpl { template - static inline UInt8 apply(A a, B b) { return (a & b) == b; } + static UInt8 apply(A a, B b) { return (a & b) == b; } }; struct NameBitTestAll { static constexpr auto name = "bitTestAll"; }; diff --git a/src/Functions/bitTestAny.cpp b/src/Functions/bitTestAny.cpp index 6b20d6c184c..c8f445d524e 100644 --- a/src/Functions/bitTestAny.cpp +++ b/src/Functions/bitTestAny.cpp @@ -9,7 +9,7 @@ namespace struct BitTestAnyImpl { template - static inline UInt8 apply(A a, B b) { return (a & b) != 0; } + static UInt8 apply(A a, B b) { return (a & b) != 0; } }; struct NameBitTestAny { static constexpr auto name = "bitTestAny"; }; diff --git a/src/Functions/bitWrapperFunc.cpp b/src/Functions/bitWrapperFunc.cpp index 99c06172c30..d243a6724a8 100644 --- a/src/Functions/bitWrapperFunc.cpp +++ b/src/Functions/bitWrapperFunc.cpp @@ -21,7 +21,7 @@ struct BitWrapperFuncImpl using ResultType = UInt8; static constexpr const bool allow_string_or_fixed_string = false; - static inline ResultType NO_SANITIZE_UNDEFINED apply(A a [[maybe_unused]]) + static ResultType NO_SANITIZE_UNDEFINED apply(A a [[maybe_unused]]) { // Should be a logical error, but this function is callable from SQL. // Need to investigate this. 
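The hunks in this patch remove `inline` only where the keyword is already implied by the language: a member function defined inside the class body is implicitly inline, and a constexpr function is implicitly inline as well, so dropping the keyword changes neither linkage nor ODR behaviour. A minimal standalone sketch of those two rules, separate from the diff and using hypothetical names:

#include <cstdint>

struct Example
{
    // Implicitly inline: defined within the class definition, so an explicit
    // "inline" here would be redundant.
    static uint64_t apply(uint64_t a, uint64_t b) { return a & b; }
};

// Implicitly inline: constexpr functions are inline by definition.
constexpr bool is_leap_year_example(int32_t year)
{
    return (year % 4 == 0) && ((year % 400 == 0) || (year % 100 != 0));
}

int main()
{
    // Both calls behave identically with or without an explicit "inline".
    return (Example::apply(6, 3) == 2 && is_leap_year_example(2000)) ? 0 : 1;
}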
diff --git a/src/Functions/bitXor.cpp b/src/Functions/bitXor.cpp index 78c4c64d06e..43004c6f676 100644 --- a/src/Functions/bitXor.cpp +++ b/src/Functions/bitXor.cpp @@ -19,7 +19,7 @@ struct BitXorImpl static const constexpr bool allow_string_integer = false; template - static inline Result apply(A a, B b) + static Result apply(A a, B b) { return static_cast(a) ^ static_cast(b); } @@ -27,7 +27,7 @@ struct BitXorImpl #if USE_EMBEDDED_COMPILER static constexpr bool compilable = true; - static inline llvm::Value * compile(llvm::IRBuilder<> & b, llvm::Value * left, llvm::Value * right, bool) + static llvm::Value * compile(llvm::IRBuilder<> & b, llvm::Value * left, llvm::Value * right, bool) { if (!left->getType()->isIntegerTy()) throw Exception(ErrorCodes::LOGICAL_ERROR, "BitXorImpl expected an integral type"); diff --git a/src/Functions/dateName.cpp b/src/Functions/dateName.cpp index 4d7a4f0b53d..c06dfe15dc4 100644 --- a/src/Functions/dateName.cpp +++ b/src/Functions/dateName.cpp @@ -214,7 +214,7 @@ private: template struct QuarterWriter { - static inline void write(WriteBuffer & buffer, Time source, const DateLUTImpl & timezone) + static void write(WriteBuffer & buffer, Time source, const DateLUTImpl & timezone) { writeText(ToQuarterImpl::execute(source, timezone), buffer); } @@ -223,7 +223,7 @@ private: template struct MonthWriter { - static inline void write(WriteBuffer & buffer, Time source, const DateLUTImpl & timezone) + static void write(WriteBuffer & buffer, Time source, const DateLUTImpl & timezone) { const auto month = ToMonthImpl::execute(source, timezone); static constexpr std::string_view month_names[] = @@ -249,7 +249,7 @@ private: template struct WeekWriter { - static inline void write(WriteBuffer & buffer, Time source, const DateLUTImpl & timezone) + static void write(WriteBuffer & buffer, Time source, const DateLUTImpl & timezone) { writeText(ToISOWeekImpl::execute(source, timezone), buffer); } @@ -258,7 +258,7 @@ private: template struct DayOfYearWriter { - static inline void write(WriteBuffer & buffer, Time source, const DateLUTImpl & timezone) + static void write(WriteBuffer & buffer, Time source, const DateLUTImpl & timezone) { writeText(ToDayOfYearImpl::execute(source, timezone), buffer); } @@ -267,7 +267,7 @@ private: template struct DayWriter { - static inline void write(WriteBuffer & buffer, Time source, const DateLUTImpl & timezone) + static void write(WriteBuffer & buffer, Time source, const DateLUTImpl & timezone) { writeText(ToDayOfMonthImpl::execute(source, timezone), buffer); } @@ -276,7 +276,7 @@ private: template struct WeekDayWriter { - static inline void write(WriteBuffer & buffer, Time source, const DateLUTImpl & timezone) + static void write(WriteBuffer & buffer, Time source, const DateLUTImpl & timezone) { const auto day = ToDayOfWeekImpl::execute(source, 0, timezone); static constexpr std::string_view day_names[] = @@ -297,7 +297,7 @@ private: template struct HourWriter { - static inline void write(WriteBuffer & buffer, Time source, const DateLUTImpl & timezone) + static void write(WriteBuffer & buffer, Time source, const DateLUTImpl & timezone) { writeText(ToHourImpl::execute(source, timezone), buffer); } @@ -306,7 +306,7 @@ private: template struct MinuteWriter { - static inline void write(WriteBuffer & buffer, Time source, const DateLUTImpl & timezone) + static void write(WriteBuffer & buffer, Time source, const DateLUTImpl & timezone) { writeText(ToMinuteImpl::execute(source, timezone), buffer); } @@ -315,7 +315,7 @@ private: template struct 
SecondWriter { - static inline void write(WriteBuffer & buffer, Time source, const DateLUTImpl & timezone) + static void write(WriteBuffer & buffer, Time source, const DateLUTImpl & timezone) { writeText(ToSecondImpl::execute(source, timezone), buffer); } diff --git a/src/Functions/divide.cpp b/src/Functions/divide.cpp index ca552256cd1..7c67245c382 100644 --- a/src/Functions/divide.cpp +++ b/src/Functions/divide.cpp @@ -16,7 +16,7 @@ struct DivideFloatingImpl static const constexpr bool allow_string_integer = false; template - static inline NO_SANITIZE_UNDEFINED Result apply(A a [[maybe_unused]], B b [[maybe_unused]]) + static NO_SANITIZE_UNDEFINED Result apply(A a [[maybe_unused]], B b [[maybe_unused]]) { return static_cast(a) / b; } @@ -24,7 +24,7 @@ struct DivideFloatingImpl #if USE_EMBEDDED_COMPILER static constexpr bool compilable = true; - static inline llvm::Value * compile(llvm::IRBuilder<> & b, llvm::Value * left, llvm::Value * right, bool) + static llvm::Value * compile(llvm::IRBuilder<> & b, llvm::Value * left, llvm::Value * right, bool) { if (left->getType()->isIntegerTy()) throw Exception(ErrorCodes::LOGICAL_ERROR, "DivideFloatingImpl expected a floating-point type"); diff --git a/src/Functions/divideDecimal.cpp b/src/Functions/divideDecimal.cpp index 1d0db232062..c8d2c5edc8a 100644 --- a/src/Functions/divideDecimal.cpp +++ b/src/Functions/divideDecimal.cpp @@ -18,7 +18,7 @@ struct DivideDecimalsImpl static constexpr auto name = "divideDecimal"; template - static inline Decimal256 + static Decimal256 execute(FirstType a, SecondType b, UInt16 scale_a, UInt16 scale_b, UInt16 result_scale) { if (b.value == 0) diff --git a/src/Functions/factorial.cpp b/src/Functions/factorial.cpp index b814e8198e6..7ff9126c004 100644 --- a/src/Functions/factorial.cpp +++ b/src/Functions/factorial.cpp @@ -19,7 +19,7 @@ struct FactorialImpl static const constexpr bool allow_decimal = false; static const constexpr bool allow_string_or_fixed_string = false; - static inline NO_SANITIZE_UNDEFINED ResultType apply(A a) + static NO_SANITIZE_UNDEFINED ResultType apply(A a) { if constexpr (std::is_floating_point_v || is_over_big_int) throw Exception( diff --git a/src/Functions/greatCircleDistance.cpp b/src/Functions/greatCircleDistance.cpp index 1c12317f510..1bd71f19f76 100644 --- a/src/Functions/greatCircleDistance.cpp +++ b/src/Functions/greatCircleDistance.cpp @@ -94,13 +94,13 @@ struct Impl } } - static inline NO_SANITIZE_UNDEFINED size_t toIndex(T x) + static NO_SANITIZE_UNDEFINED size_t toIndex(T x) { /// Implementation specific behaviour on overflow or infinite value. 
return static_cast(x); } - static inline T degDiff(T f) + static T degDiff(T f) { f = std::abs(f); if (f > 180) @@ -108,7 +108,7 @@ struct Impl return f; } - inline T fastCos(T x) + T fastCos(T x) { T y = std::abs(x) * (T(COS_LUT_SIZE) / T(PI) / T(2.0)); size_t i = toIndex(y); @@ -117,7 +117,7 @@ struct Impl return cos_lut[i] + (cos_lut[i + 1] - cos_lut[i]) * y; } - inline T fastSin(T x) + T fastSin(T x) { T y = std::abs(x) * (T(COS_LUT_SIZE) / T(PI) / T(2.0)); size_t i = toIndex(y); @@ -128,7 +128,7 @@ struct Impl /// fast implementation of asin(sqrt(x)) /// max error in floats 0.00369%, in doubles 0.00072% - inline T fastAsinSqrt(T x) + T fastAsinSqrt(T x) { if (x < T(0.122)) { diff --git a/src/Functions/greatest.cpp b/src/Functions/greatest.cpp index 93fd7e24853..87a48c887b4 100644 --- a/src/Functions/greatest.cpp +++ b/src/Functions/greatest.cpp @@ -15,7 +15,7 @@ struct GreatestBaseImpl static const constexpr bool allow_string_integer = false; template - static inline Result apply(A a, B b) + static Result apply(A a, B b) { return static_cast(a) > static_cast(b) ? static_cast(a) : static_cast(b); @@ -24,7 +24,7 @@ struct GreatestBaseImpl #if USE_EMBEDDED_COMPILER static constexpr bool compilable = true; - static inline llvm::Value * compile(llvm::IRBuilder<> & b, llvm::Value * left, llvm::Value * right, bool is_signed) + static llvm::Value * compile(llvm::IRBuilder<> & b, llvm::Value * left, llvm::Value * right, bool is_signed) { if (!left->getType()->isIntegerTy()) { @@ -46,7 +46,7 @@ struct GreatestSpecialImpl static const constexpr bool allow_string_integer = false; template - static inline Result apply(A a, B b) + static Result apply(A a, B b) { static_assert(std::is_same_v, "ResultType != Result"); return accurate::greaterOp(a, b) ? static_cast(a) : static_cast(b); diff --git a/src/Functions/h3GetUnidirectionalEdge.cpp b/src/Functions/h3GetUnidirectionalEdge.cpp index 4e41cdbfef6..9e253e87104 100644 --- a/src/Functions/h3GetUnidirectionalEdge.cpp +++ b/src/Functions/h3GetUnidirectionalEdge.cpp @@ -108,7 +108,7 @@ public: /// suppress asan errors generated by the following: /// 'NEW_ADJUSTMENT_III' defined in '../contrib/h3/src/h3lib/lib/algos.c:142:24 /// 'NEW_DIGIT_III' defined in '../contrib/h3/src/h3lib/lib/algos.c:121:24 - __attribute__((no_sanitize_address)) static inline UInt64 getUnidirectionalEdge(const UInt64 origin, const UInt64 dest) + __attribute__((no_sanitize_address)) static UInt64 getUnidirectionalEdge(const UInt64 origin, const UInt64 dest) { const UInt64 res = cellsToDirectedEdge(origin, dest); return res; diff --git a/src/Functions/initialQueryID.cpp b/src/Functions/initialQueryID.cpp index 469f37cf614..9c9390d4e50 100644 --- a/src/Functions/initialQueryID.cpp +++ b/src/Functions/initialQueryID.cpp @@ -19,16 +19,16 @@ public: explicit FunctionInitialQueryID(const String & initial_query_id_) : initial_query_id(initial_query_id_) {} - inline String getName() const override { return name; } + String getName() const override { return name; } - inline size_t getNumberOfArguments() const override { return 0; } + size_t getNumberOfArguments() const override { return 0; } DataTypePtr getReturnTypeImpl(const DataTypes & /*arguments*/) const override { return std::make_shared(); } - inline bool isDeterministic() const override { return false; } + bool isDeterministic() const override { return false; } bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } diff --git a/src/Functions/intDiv.cpp 
b/src/Functions/intDiv.cpp index 38939556fa5..6b5bb00eacd 100644 --- a/src/Functions/intDiv.cpp +++ b/src/Functions/intDiv.cpp @@ -80,7 +80,7 @@ struct DivideIntegralByConstantImpl private: template - static inline void apply(const A * __restrict a, const B * __restrict b, ResultType * __restrict c, size_t i) + static void apply(const A * __restrict a, const B * __restrict b, ResultType * __restrict c, size_t i) { if constexpr (op_case == OpCase::Vector) c[i] = Op::template apply(a[i], b[i]); diff --git a/src/Functions/intDivOrZero.cpp b/src/Functions/intDivOrZero.cpp index 96ff6ea80fc..f32eac17127 100644 --- a/src/Functions/intDivOrZero.cpp +++ b/src/Functions/intDivOrZero.cpp @@ -13,7 +13,7 @@ struct DivideIntegralOrZeroImpl static const constexpr bool allow_string_integer = false; template - static inline Result apply(A a, B b) + static Result apply(A a, B b) { if (unlikely(divisionLeadsToFPE(a, b))) return 0; diff --git a/src/Functions/intExp10.cpp b/src/Functions/intExp10.cpp index 6944c4701bc..733f9d55702 100644 --- a/src/Functions/intExp10.cpp +++ b/src/Functions/intExp10.cpp @@ -19,7 +19,7 @@ struct IntExp10Impl using ResultType = UInt64; static constexpr const bool allow_string_or_fixed_string = false; - static inline ResultType apply([[maybe_unused]] A a) + static ResultType apply([[maybe_unused]] A a) { if constexpr (is_big_int_v || std::is_same_v) throw DB::Exception(ErrorCodes::NOT_IMPLEMENTED, "IntExp10 is not implemented for big integers"); diff --git a/src/Functions/intExp2.cpp b/src/Functions/intExp2.cpp index 4e5cc60a731..7e016a0dbd2 100644 --- a/src/Functions/intExp2.cpp +++ b/src/Functions/intExp2.cpp @@ -20,7 +20,7 @@ struct IntExp2Impl using ResultType = UInt64; static constexpr bool allow_string_or_fixed_string = false; - static inline ResultType apply([[maybe_unused]] A a) + static ResultType apply([[maybe_unused]] A a) { if constexpr (is_big_int_v) throw DB::Exception(ErrorCodes::NOT_IMPLEMENTED, "intExp2 not implemented for big integers"); @@ -31,7 +31,7 @@ struct IntExp2Impl #if USE_EMBEDDED_COMPILER static constexpr bool compilable = true; - static inline llvm::Value * compile(llvm::IRBuilder<> & b, llvm::Value * arg, bool) + static llvm::Value * compile(llvm::IRBuilder<> & b, llvm::Value * arg, bool) { if (!arg->getType()->isIntegerTy()) throw Exception(ErrorCodes::LOGICAL_ERROR, "IntExp2Impl expected an integral type"); diff --git a/src/Functions/isValidUTF8.cpp b/src/Functions/isValidUTF8.cpp index e7aba672356..d5f5e6a8986 100644 --- a/src/Functions/isValidUTF8.cpp +++ b/src/Functions/isValidUTF8.cpp @@ -65,9 +65,9 @@ SOFTWARE. 
*/ #ifndef __SSE4_1__ - static inline UInt8 isValidUTF8(const UInt8 * data, UInt64 len) { return DB::UTF8::isValidUTF8(data, len); } + static UInt8 isValidUTF8(const UInt8 * data, UInt64 len) { return DB::UTF8::isValidUTF8(data, len); } #else - static inline UInt8 isValidUTF8(const UInt8 * data, UInt64 len) + static UInt8 isValidUTF8(const UInt8 * data, UInt64 len) { /* * Map high nibble of "First Byte" to legal character length minus 1 diff --git a/src/Functions/jumpConsistentHash.cpp b/src/Functions/jumpConsistentHash.cpp index ffc21eb5cea..fbac5d4fdd5 100644 --- a/src/Functions/jumpConsistentHash.cpp +++ b/src/Functions/jumpConsistentHash.cpp @@ -29,7 +29,7 @@ struct JumpConsistentHashImpl using BucketsType = ResultType; static constexpr auto max_buckets = static_cast(std::numeric_limits::max()); - static inline ResultType apply(UInt64 hash, BucketsType n) + static ResultType apply(UInt64 hash, BucketsType n) { return JumpConsistentHash(hash, n); } diff --git a/src/Functions/kostikConsistentHash.cpp b/src/Functions/kostikConsistentHash.cpp index 47a9a928976..42004ed40d9 100644 --- a/src/Functions/kostikConsistentHash.cpp +++ b/src/Functions/kostikConsistentHash.cpp @@ -17,7 +17,7 @@ struct KostikConsistentHashImpl using BucketsType = ResultType; static constexpr auto max_buckets = 32768; - static inline ResultType apply(UInt64 hash, BucketsType n) + static ResultType apply(UInt64 hash, BucketsType n) { return ConsistentHashing(hash, n); } diff --git a/src/Functions/least.cpp b/src/Functions/least.cpp index f5680d4d468..babb8378d80 100644 --- a/src/Functions/least.cpp +++ b/src/Functions/least.cpp @@ -15,7 +15,7 @@ struct LeastBaseImpl static const constexpr bool allow_string_integer = false; template - static inline Result apply(A a, B b) + static Result apply(A a, B b) { /** gcc 4.9.2 successfully vectorizes a loop from this function. */ return static_cast(a) < static_cast(b) ? static_cast(a) : static_cast(b); @@ -24,7 +24,7 @@ struct LeastBaseImpl #if USE_EMBEDDED_COMPILER static constexpr bool compilable = true; - static inline llvm::Value * compile(llvm::IRBuilder<> & b, llvm::Value * left, llvm::Value * right, bool is_signed) + static llvm::Value * compile(llvm::IRBuilder<> & b, llvm::Value * left, llvm::Value * right, bool is_signed) { if (!left->getType()->isIntegerTy()) { @@ -46,7 +46,7 @@ struct LeastSpecialImpl static const constexpr bool allow_string_integer = false; template - static inline Result apply(A a, B b) + static Result apply(A a, B b) { static_assert(std::is_same_v, "ResultType != Result"); return accurate::lessOp(a, b) ? static_cast(a) : static_cast(b); diff --git a/src/Functions/minus.cpp b/src/Functions/minus.cpp index 04877a42b18..f3b9b8a7bcb 100644 --- a/src/Functions/minus.cpp +++ b/src/Functions/minus.cpp @@ -13,7 +13,7 @@ struct MinusImpl static const constexpr bool allow_string_integer = false; template - static inline NO_SANITIZE_UNDEFINED Result apply(A a, B b) + static NO_SANITIZE_UNDEFINED Result apply(A a, B b) { if constexpr (is_big_int_v || is_big_int_v) { @@ -28,7 +28,7 @@ struct MinusImpl /// Apply operation and check overflow. It's used for Deciamal operations. @returns true if overflowed, false otherwise. 
template - static inline bool apply(A a, B b, Result & c) + static bool apply(A a, B b, Result & c) { return common::subOverflow(static_cast(a), b, c); } @@ -36,7 +36,7 @@ struct MinusImpl #if USE_EMBEDDED_COMPILER static constexpr bool compilable = true; - static inline llvm::Value * compile(llvm::IRBuilder<> & b, llvm::Value * left, llvm::Value * right, bool) + static llvm::Value * compile(llvm::IRBuilder<> & b, llvm::Value * left, llvm::Value * right, bool) { return left->getType()->isIntegerTy() ? b.CreateSub(left, right) : b.CreateFSub(left, right); } diff --git a/src/Functions/modulo.cpp b/src/Functions/modulo.cpp index cbc2ec2cd0a..ebc1c4f5275 100644 --- a/src/Functions/modulo.cpp +++ b/src/Functions/modulo.cpp @@ -105,7 +105,7 @@ struct ModuloByConstantImpl private: template - static inline void apply(const A * __restrict a, const B * __restrict b, ResultType * __restrict c, size_t i) + static void apply(const A * __restrict a, const B * __restrict b, ResultType * __restrict c, size_t i) { if constexpr (op_case == OpCase::Vector) c[i] = Op::template apply(a[i], b[i]); diff --git a/src/Functions/moduloOrZero.cpp b/src/Functions/moduloOrZero.cpp index 3551ae74c5f..cd7873b3b9e 100644 --- a/src/Functions/moduloOrZero.cpp +++ b/src/Functions/moduloOrZero.cpp @@ -15,7 +15,7 @@ struct ModuloOrZeroImpl static const constexpr bool allow_string_integer = false; template - static inline Result apply(A a, B b) + static Result apply(A a, B b) { if constexpr (std::is_floating_point_v) { diff --git a/src/Functions/multiply.cpp b/src/Functions/multiply.cpp index 4dc8cd10f31..67b6fff6b58 100644 --- a/src/Functions/multiply.cpp +++ b/src/Functions/multiply.cpp @@ -14,7 +14,7 @@ struct MultiplyImpl static const constexpr bool allow_string_integer = false; template - static inline NO_SANITIZE_UNDEFINED Result apply(A a, B b) + static NO_SANITIZE_UNDEFINED Result apply(A a, B b) { if constexpr (is_big_int_v || is_big_int_v) { @@ -29,7 +29,7 @@ struct MultiplyImpl /// Apply operation and check overflow. It's used for Decimal operations. @returns true if overflowed, false otherwise. template - static inline bool apply(A a, B b, Result & c) + static bool apply(A a, B b, Result & c) { if constexpr (std::is_same_v || std::is_same_v) { @@ -43,7 +43,7 @@ struct MultiplyImpl #if USE_EMBEDDED_COMPILER static constexpr bool compilable = true; - static inline llvm::Value * compile(llvm::IRBuilder<> & b, llvm::Value * left, llvm::Value * right, bool) + static llvm::Value * compile(llvm::IRBuilder<> & b, llvm::Value * left, llvm::Value * right, bool) { return left->getType()->isIntegerTy() ? 
b.CreateMul(left, right) : b.CreateFMul(left, right); } diff --git a/src/Functions/multiplyDecimal.cpp b/src/Functions/multiplyDecimal.cpp index ed6487c6683..7e30a893d72 100644 --- a/src/Functions/multiplyDecimal.cpp +++ b/src/Functions/multiplyDecimal.cpp @@ -17,7 +17,7 @@ struct MultiplyDecimalsImpl static constexpr auto name = "multiplyDecimal"; template - static inline Decimal256 + static Decimal256 execute(FirstType a, SecondType b, UInt16 scale_a, UInt16 scale_b, UInt16 result_scale) { if (a.value == 0 || b.value == 0) diff --git a/src/Functions/negate.cpp b/src/Functions/negate.cpp index bd47780dea8..2c9b461274d 100644 --- a/src/Functions/negate.cpp +++ b/src/Functions/negate.cpp @@ -11,7 +11,7 @@ struct NegateImpl using ResultType = std::conditional_t, A, typename NumberTraits::ResultOfNegate::Type>; static constexpr const bool allow_string_or_fixed_string = false; - static inline NO_SANITIZE_UNDEFINED ResultType apply(A a) + static NO_SANITIZE_UNDEFINED ResultType apply(A a) { return -static_cast(a); } @@ -19,7 +19,7 @@ struct NegateImpl #if USE_EMBEDDED_COMPILER static constexpr bool compilable = true; - static inline llvm::Value * compile(llvm::IRBuilder<> & b, llvm::Value * arg, bool) + static llvm::Value * compile(llvm::IRBuilder<> & b, llvm::Value * arg, bool) { return arg->getType()->isIntegerTy() ? b.CreateNeg(arg) : b.CreateFNeg(arg); } diff --git a/src/Functions/plus.cpp b/src/Functions/plus.cpp index cd9cf6cec5c..ffb0fe2ade7 100644 --- a/src/Functions/plus.cpp +++ b/src/Functions/plus.cpp @@ -14,7 +14,7 @@ struct PlusImpl static const constexpr bool is_commutative = true; template - static inline NO_SANITIZE_UNDEFINED Result apply(A a, B b) + static NO_SANITIZE_UNDEFINED Result apply(A a, B b) { /// Next everywhere, static_cast - so that there is no wrong result in expressions of the form Int64 c = UInt32(a) * Int32(-1). if constexpr (is_big_int_v || is_big_int_v) @@ -30,7 +30,7 @@ struct PlusImpl /// Apply operation and check overflow. It's used for Deciamal operations. @returns true if overflowed, false otherwise. template - static inline bool apply(A a, B b, Result & c) + static bool apply(A a, B b, Result & c) { return common::addOverflow(static_cast(a), b, c); } @@ -38,7 +38,7 @@ struct PlusImpl #if USE_EMBEDDED_COMPILER static constexpr bool compilable = true; - static inline llvm::Value * compile(llvm::IRBuilder<> & b, llvm::Value * left, llvm::Value * right, bool) + static llvm::Value * compile(llvm::IRBuilder<> & b, llvm::Value * left, llvm::Value * right, bool) { return left->getType()->isIntegerTy() ? 
b.CreateAdd(left, right) : b.CreateFAdd(left, right); } diff --git a/src/Functions/queryID.cpp b/src/Functions/queryID.cpp index 704206e1de5..5d0ac719797 100644 --- a/src/Functions/queryID.cpp +++ b/src/Functions/queryID.cpp @@ -19,16 +19,16 @@ public: explicit FunctionQueryID(const String & query_id_) : query_id(query_id_) {} - inline String getName() const override { return name; } + String getName() const override { return name; } - inline size_t getNumberOfArguments() const override { return 0; } + size_t getNumberOfArguments() const override { return 0; } DataTypePtr getReturnTypeImpl(const DataTypes & /*arguments*/) const override { return std::make_shared(); } - inline bool isDeterministic() const override { return false; } + bool isDeterministic() const override { return false; } bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } diff --git a/src/Functions/repeat.cpp b/src/Functions/repeat.cpp index 84597f4eadc..7f2fe646062 100644 --- a/src/Functions/repeat.cpp +++ b/src/Functions/repeat.cpp @@ -22,14 +22,14 @@ namespace struct RepeatImpl { /// Safety threshold against DoS. - static inline void checkRepeatTime(UInt64 repeat_time) + static void checkRepeatTime(UInt64 repeat_time) { static constexpr UInt64 max_repeat_times = 1'000'000; if (repeat_time > max_repeat_times) throw Exception(ErrorCodes::TOO_LARGE_STRING_SIZE, "Too many times to repeat ({}), maximum is: {}", repeat_time, max_repeat_times); } - static inline void checkStringSize(UInt64 size) + static void checkStringSize(UInt64 size) { static constexpr UInt64 max_string_size = 1 << 30; if (size > max_string_size) diff --git a/src/Functions/roundAge.cpp b/src/Functions/roundAge.cpp index cca92c19b0c..38eda9f3383 100644 --- a/src/Functions/roundAge.cpp +++ b/src/Functions/roundAge.cpp @@ -12,7 +12,7 @@ struct RoundAgeImpl using ResultType = UInt8; static constexpr const bool allow_string_or_fixed_string = false; - static inline ResultType apply(A x) + static ResultType apply(A x) { return x < 1 ? 0 : (x < 18 ? 17 diff --git a/src/Functions/roundDuration.cpp b/src/Functions/roundDuration.cpp index 918f0b3425d..963080ba0d2 100644 --- a/src/Functions/roundDuration.cpp +++ b/src/Functions/roundDuration.cpp @@ -12,7 +12,7 @@ struct RoundDurationImpl using ResultType = UInt16; static constexpr bool allow_string_or_fixed_string = false; - static inline ResultType apply(A x) + static ResultType apply(A x) { return x < 1 ? 0 : (x < 10 ? 1 diff --git a/src/Functions/roundToExp2.cpp b/src/Functions/roundToExp2.cpp index 607c67b742e..eb0df8884c5 100644 --- a/src/Functions/roundToExp2.cpp +++ b/src/Functions/roundToExp2.cpp @@ -65,7 +65,7 @@ struct RoundToExp2Impl using ResultType = T; static constexpr const bool allow_string_or_fixed_string = false; - static inline T apply(T x) + static T apply(T x) { return roundDownToPowerOfTwo(x); } diff --git a/src/Functions/sign.cpp b/src/Functions/sign.cpp index 6c849760eed..3dd2ac8e3aa 100644 --- a/src/Functions/sign.cpp +++ b/src/Functions/sign.cpp @@ -11,7 +11,7 @@ struct SignImpl using ResultType = Int8; static constexpr bool allow_string_or_fixed_string = false; - static inline NO_SANITIZE_UNDEFINED ResultType apply(A a) + static NO_SANITIZE_UNDEFINED ResultType apply(A a) { if constexpr (is_decimal || std::is_floating_point_v) return a < A(0) ? -1 : a == A(0) ? 
0 : 1; diff --git a/src/Functions/space.cpp b/src/Functions/space.cpp index 4cfa629aa33..83183c991bc 100644 --- a/src/Functions/space.cpp +++ b/src/Functions/space.cpp @@ -27,7 +27,7 @@ private: static constexpr auto space = ' '; /// Safety threshold against DoS. - static inline void checkRepeatTime(size_t repeat_time) + static void checkRepeatTime(size_t repeat_time) { static constexpr auto max_repeat_times = 1'000'000uz; if (repeat_time > max_repeat_times) diff --git a/src/Functions/tokenExtractors.cpp b/src/Functions/tokenExtractors.cpp index a29d759d2ca..e7dcb5cced3 100644 --- a/src/Functions/tokenExtractors.cpp +++ b/src/Functions/tokenExtractors.cpp @@ -116,7 +116,7 @@ public: private: template - inline void executeImpl( + void executeImpl( const ExtractorType & extractor, StringColumnType & input_data_column, ResultStringColumnType & result_data_column, diff --git a/src/IO/BufferBase.h b/src/IO/BufferBase.h index e98f00270e2..62fe011c0b6 100644 --- a/src/IO/BufferBase.h +++ b/src/IO/BufferBase.h @@ -37,13 +37,13 @@ public: { Buffer(Position begin_pos_, Position end_pos_) : begin_pos(begin_pos_), end_pos(end_pos_) {} - inline Position begin() const { return begin_pos; } - inline Position end() const { return end_pos; } - inline size_t size() const { return size_t(end_pos - begin_pos); } - inline void resize(size_t size) { end_pos = begin_pos + size; } - inline bool empty() const { return size() == 0; } + Position begin() const { return begin_pos; } + Position end() const { return end_pos; } + size_t size() const { return size_t(end_pos - begin_pos); } + void resize(size_t size) { end_pos = begin_pos + size; } + bool empty() const { return size() == 0; } - inline void swap(Buffer & other) noexcept + void swap(Buffer & other) noexcept { std::swap(begin_pos, other.begin_pos); std::swap(end_pos, other.end_pos); @@ -71,21 +71,21 @@ public: } /// get buffer - inline Buffer & internalBuffer() { return internal_buffer; } + Buffer & internalBuffer() { return internal_buffer; } /// get the part of the buffer from which you can read / write data - inline Buffer & buffer() { return working_buffer; } + Buffer & buffer() { return working_buffer; } /// get (for reading and modifying) the position in the buffer - inline Position & position() { return pos; } + Position & position() { return pos; } /// offset in bytes of the cursor from the beginning of the buffer - inline size_t offset() const { return size_t(pos - working_buffer.begin()); } + size_t offset() const { return size_t(pos - working_buffer.begin()); } /// How many bytes are available for read/write - inline size_t available() const { return size_t(working_buffer.end() - pos); } + size_t available() const { return size_t(working_buffer.end() - pos); } - inline void swap(BufferBase & other) noexcept + void swap(BufferBase & other) noexcept { internal_buffer.swap(other.internal_buffer); working_buffer.swap(other.working_buffer); diff --git a/src/IO/HTTPHeaderEntries.h b/src/IO/HTTPHeaderEntries.h index 5862f1ead15..36b2ccc4ba5 100644 --- a/src/IO/HTTPHeaderEntries.h +++ b/src/IO/HTTPHeaderEntries.h @@ -10,7 +10,7 @@ struct HTTPHeaderEntry std::string value; HTTPHeaderEntry(const std::string & name_, const std::string & value_) : name(name_), value(value_) {} - inline bool operator==(const HTTPHeaderEntry & other) const { return name == other.name && value == other.value; } + bool operator==(const HTTPHeaderEntry & other) const { return name == other.name && value == other.value; } }; using HTTPHeaderEntries = std::vector; diff --git 
a/src/IO/HadoopSnappyReadBuffer.h b/src/IO/HadoopSnappyReadBuffer.h index 73e52f2c503..eba614d9d0a 100644 --- a/src/IO/HadoopSnappyReadBuffer.h +++ b/src/IO/HadoopSnappyReadBuffer.h @@ -37,7 +37,7 @@ public: Status readBlock(size_t * avail_in, const char ** next_in, size_t * avail_out, char ** next_out); - inline void reset() + void reset() { buffer_length = 0; block_length = -1; @@ -73,7 +73,7 @@ class HadoopSnappyReadBuffer : public CompressedReadBufferWrapper public: using Status = HadoopSnappyDecoder::Status; - inline static String statusToString(Status status) + static String statusToString(Status status) { switch (status) { diff --git a/src/IO/IReadableWriteBuffer.h b/src/IO/IReadableWriteBuffer.h index dda5fc07c8e..db379fef969 100644 --- a/src/IO/IReadableWriteBuffer.h +++ b/src/IO/IReadableWriteBuffer.h @@ -8,7 +8,7 @@ namespace DB struct IReadableWriteBuffer { /// At the first time returns getReadBufferImpl(). Next calls return nullptr. - inline std::unique_ptr tryGetReadBuffer() + std::unique_ptr tryGetReadBuffer() { if (!can_reread) return nullptr; diff --git a/src/IO/PeekableReadBuffer.h b/src/IO/PeekableReadBuffer.h index 2ee209ffd6c..e831956956f 100644 --- a/src/IO/PeekableReadBuffer.h +++ b/src/IO/PeekableReadBuffer.h @@ -83,9 +83,9 @@ private: bool peekNext(); - inline bool useSubbufferOnly() const { return !peeked_size; } - inline bool currentlyReadFromOwnMemory() const { return working_buffer.begin() != sub_buf->buffer().begin(); } - inline bool checkpointInOwnMemory() const { return checkpoint_in_own_memory; } + bool useSubbufferOnly() const { return !peeked_size; } + bool currentlyReadFromOwnMemory() const { return working_buffer.begin() != sub_buf->buffer().begin(); } + bool checkpointInOwnMemory() const { return checkpoint_in_own_memory; } void checkStateCorrect() const; diff --git a/src/IO/ReadBuffer.h b/src/IO/ReadBuffer.h index 056e25a5fbe..73f5335411f 100644 --- a/src/IO/ReadBuffer.h +++ b/src/IO/ReadBuffer.h @@ -85,7 +85,7 @@ public: } - inline void nextIfAtEnd() + void nextIfAtEnd() { if (!hasPendingData()) next(); diff --git a/src/IO/S3/Requests.h b/src/IO/S3/Requests.h index 424cf65caf2..3b03356a8fb 100644 --- a/src/IO/S3/Requests.h +++ b/src/IO/S3/Requests.h @@ -169,7 +169,7 @@ using DeleteObjectsRequest = ExtendedRequest; class ComposeObjectRequest : public ExtendedRequest { public: - inline const char * GetServiceRequestName() const override { return "ComposeObject"; } + const char * GetServiceRequestName() const override { return "ComposeObject"; } AWS_S3_API Aws::String SerializePayload() const override; diff --git a/src/IO/WriteBuffer.h b/src/IO/WriteBuffer.h index 1ceb938e454..ef4e0058ec3 100644 --- a/src/IO/WriteBuffer.h +++ b/src/IO/WriteBuffer.h @@ -41,7 +41,7 @@ public: * If direct write is performed into [position(), buffer().end()) and its length is not enough, * you need to fill it first (i.g with write call), after it the capacity is regained. */ - inline void next() + void next() { if (!offset()) return; @@ -69,7 +69,7 @@ public: /// Calling finalize() in the destructor of derived classes is a bad practice. 
virtual ~WriteBuffer(); - inline void nextIfAtEnd() + void nextIfAtEnd() { if (!hasPendingData()) next(); @@ -96,7 +96,7 @@ public: } } - inline void write(char x) + void write(char x) { if (finalized) throw Exception{ErrorCodes::LOGICAL_ERROR, "Cannot write to finalized buffer"}; diff --git a/src/IO/ZstdDeflatingAppendableWriteBuffer.h b/src/IO/ZstdDeflatingAppendableWriteBuffer.h index d9c4f32d6da..34cdf03df25 100644 --- a/src/IO/ZstdDeflatingAppendableWriteBuffer.h +++ b/src/IO/ZstdDeflatingAppendableWriteBuffer.h @@ -27,7 +27,7 @@ class ZstdDeflatingAppendableWriteBuffer : public BufferWithOwnMemory; /// Frame end block. If we read non-empty file and see no such flag we should add it. - static inline constexpr ZSTDLastBlock ZSTD_CORRECT_TERMINATION_LAST_BLOCK = {0x01, 0x00, 0x00}; + static constexpr ZSTDLastBlock ZSTD_CORRECT_TERMINATION_LAST_BLOCK = {0x01, 0x00, 0x00}; ZstdDeflatingAppendableWriteBuffer( std::unique_ptr out_, diff --git a/src/Interpreters/DDLTask.h b/src/Interpreters/DDLTask.h index 5a8a5bfb184..0b0460b26c8 100644 --- a/src/Interpreters/DDLTask.h +++ b/src/Interpreters/DDLTask.h @@ -133,10 +133,10 @@ struct DDLTaskBase virtual void createSyncedNodeIfNeed(const ZooKeeperPtr & /*zookeeper*/) {} - inline String getActiveNodePath() const { return fs::path(entry_path) / "active" / host_id_str; } - inline String getFinishedNodePath() const { return fs::path(entry_path) / "finished" / host_id_str; } - inline String getShardNodePath() const { return fs::path(entry_path) / "shards" / getShardID(); } - inline String getSyncedNodePath() const { return fs::path(entry_path) / "synced" / host_id_str; } + String getActiveNodePath() const { return fs::path(entry_path) / "active" / host_id_str; } + String getFinishedNodePath() const { return fs::path(entry_path) / "finished" / host_id_str; } + String getShardNodePath() const { return fs::path(entry_path) / "shards" / getShardID(); } + String getSyncedNodePath() const { return fs::path(entry_path) / "synced" / host_id_str; } static String getLogEntryName(UInt32 log_entry_number); static UInt32 getLogEntryNumber(const String & log_entry_name); diff --git a/src/Interpreters/DatabaseCatalog.h b/src/Interpreters/DatabaseCatalog.h index 5caa034e0e9..37125d9900c 100644 --- a/src/Interpreters/DatabaseCatalog.h +++ b/src/Interpreters/DatabaseCatalog.h @@ -284,7 +284,7 @@ private: static constexpr UInt64 bits_for_first_level = 4; using UUIDToStorageMap = std::array; - static inline size_t getFirstLevelIdx(const UUID & uuid) + static size_t getFirstLevelIdx(const UUID & uuid) { return UUIDHelpers::getHighBytes(uuid) >> (64 - bits_for_first_level); } diff --git a/src/Interpreters/JIT/CHJIT.cpp b/src/Interpreters/JIT/CHJIT.cpp index 046d0b4fc10..21c773ee1d7 100644 --- a/src/Interpreters/JIT/CHJIT.cpp +++ b/src/Interpreters/JIT/CHJIT.cpp @@ -119,9 +119,9 @@ public: return result; } - inline size_t getAllocatedSize() const { return allocated_size; } + size_t getAllocatedSize() const { return allocated_size; } - inline size_t getPageSize() const { return page_size; } + size_t getPageSize() const { return page_size; } ~PageArena() { @@ -177,10 +177,10 @@ private: { } - inline void * base() const { return pages_base; } - inline size_t pagesSize() const { return pages_size; } - inline size_t pageSize() const { return page_size; } - inline size_t blockSize() const { return pages_size * page_size; } + void * base() const { return pages_base; } + size_t pagesSize() const { return pages_size; } + size_t pageSize() const { return page_size; } + size_t 
blockSize() const { return pages_size * page_size; } private: void * pages_base; @@ -298,7 +298,7 @@ public: return true; } - inline size_t allocatedSize() const + size_t allocatedSize() const { size_t data_size = rw_page_arena.getAllocatedSize() + ro_page_arena.getAllocatedSize(); size_t code_size = ex_page_arena.getAllocatedSize(); diff --git a/src/Interpreters/JIT/CHJIT.h b/src/Interpreters/JIT/CHJIT.h index fc883802426..89d446fd3b3 100644 --- a/src/Interpreters/JIT/CHJIT.h +++ b/src/Interpreters/JIT/CHJIT.h @@ -85,7 +85,7 @@ public: /** Total compiled code size for module that are currently valid. */ - inline size_t getCompiledCodeSize() const { return compiled_code_size.load(std::memory_order_relaxed); } + size_t getCompiledCodeSize() const { return compiled_code_size.load(std::memory_order_relaxed); } private: diff --git a/src/Interpreters/JIT/CompileDAG.h b/src/Interpreters/JIT/CompileDAG.h index 13ec763b6fc..8db4ac5e110 100644 --- a/src/Interpreters/JIT/CompileDAG.h +++ b/src/Interpreters/JIT/CompileDAG.h @@ -65,17 +65,17 @@ public: nodes.emplace_back(std::move(node)); } - inline size_t getNodesCount() const { return nodes.size(); } - inline size_t getInputNodesCount() const { return input_nodes_count; } + size_t getNodesCount() const { return nodes.size(); } + size_t getInputNodesCount() const { return input_nodes_count; } - inline Node & operator[](size_t index) { return nodes[index]; } - inline const Node & operator[](size_t index) const { return nodes[index]; } + Node & operator[](size_t index) { return nodes[index]; } + const Node & operator[](size_t index) const { return nodes[index]; } - inline Node & front() { return nodes.front(); } - inline const Node & front() const { return nodes.front(); } + Node & front() { return nodes.front(); } + const Node & front() const { return nodes.front(); } - inline Node & back() { return nodes.back(); } - inline const Node & back() const { return nodes.back(); } + Node & back() { return nodes.back(); } + const Node & back() const { return nodes.back(); } private: std::vector nodes; diff --git a/src/Interpreters/JoinUtils.h b/src/Interpreters/JoinUtils.h index ff48f34d82c..f15ee2c2fb2 100644 --- a/src/Interpreters/JoinUtils.h +++ b/src/Interpreters/JoinUtils.h @@ -49,7 +49,7 @@ public: return nullptr; } - inline bool isRowFiltered(size_t row) const + bool isRowFiltered(size_t row) const { return !assert_cast(*column).getData()[row]; } diff --git a/src/Interpreters/examples/hash_map_string_3.cpp b/src/Interpreters/examples/hash_map_string_3.cpp index 57e36bed545..44ee3542bd9 100644 --- a/src/Interpreters/examples/hash_map_string_3.cpp +++ b/src/Interpreters/examples/hash_map_string_3.cpp @@ -96,7 +96,7 @@ inline bool operator==(StringRef_CompareAlwaysTrue, StringRef_CompareAlwaysTrue) struct FastHash64 { - static inline uint64_t mix(uint64_t h) + static uint64_t mix(uint64_t h) { h ^= h >> 23; h *= 0x2127599bf4325c37ULL; diff --git a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.h b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.h index ab16aaa56ad..58f78e5af42 100644 --- a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.h +++ b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.h @@ -80,7 +80,7 @@ public: bool allowVariableNumberOfColumns() const override { return format_settings.custom.allow_variable_number_of_columns; } bool checkForSuffixImpl(bool check_eof); - inline void skipSpaces() { if (ignore_spaces) skipWhitespaceIfAny(*buf, true); } + void skipSpaces() { if (ignore_spaces) 
skipWhitespaceIfAny(*buf, true); } EscapingRule getEscapingRule() const override { return format_settings.custom.escaping_rule; } diff --git a/src/Processors/Formats/Impl/TemplateRowInputFormat.h b/src/Processors/Formats/Impl/TemplateRowInputFormat.h index 38870473289..9a7bc03ea78 100644 --- a/src/Processors/Formats/Impl/TemplateRowInputFormat.h +++ b/src/Processors/Formats/Impl/TemplateRowInputFormat.h @@ -84,7 +84,7 @@ public: void readPrefix(); void skipField(EscapingRule escaping_rule); - inline void skipSpaces() { if (ignore_spaces) skipWhitespaceIfAny(*buf); } + void skipSpaces() { if (ignore_spaces) skipWhitespaceIfAny(*buf); } template ReturnType tryReadPrefixOrSuffix(size_t & input_part_beg, size_t input_part_end); diff --git a/src/Processors/Port.h b/src/Processors/Port.h index f3c7bbb5fee..2d39f2dd6be 100644 --- a/src/Processors/Port.h +++ b/src/Processors/Port.h @@ -38,7 +38,7 @@ public: UInt64 version = 0; UInt64 prev_version = 0; - void inline ALWAYS_INLINE update() + void ALWAYS_INLINE update() { if (version == prev_version && update_list) update_list->push_back(id); @@ -46,7 +46,7 @@ public: ++version; } - void inline ALWAYS_INLINE trigger() { prev_version = version; } + void ALWAYS_INLINE trigger() { prev_version = version; } }; protected: @@ -249,7 +249,7 @@ public: } protected: - void inline ALWAYS_INLINE updateVersion() + void ALWAYS_INLINE updateVersion() { if (likely(update_info)) update_info->update(); diff --git a/src/Server/HTTPHandler.h b/src/Server/HTTPHandler.h index ae4cf034276..a96402247a2 100644 --- a/src/Server/HTTPHandler.h +++ b/src/Server/HTTPHandler.h @@ -77,12 +77,12 @@ private: bool exception_is_written = false; std::function exception_writer; - inline bool hasDelayed() const + bool hasDelayed() const { return out_maybe_delayed_and_compressed != out_maybe_compressed.get(); } - inline void finalize() + void finalize() { if (finalized) return; @@ -94,7 +94,7 @@ private: out->finalize(); } - inline bool isFinalized() const + bool isFinalized() const { return finalized; } diff --git a/src/Storages/Cache/ExternalDataSourceCache.h b/src/Storages/Cache/ExternalDataSourceCache.h index a5dea2f63db..4c8c7974005 100644 --- a/src/Storages/Cache/ExternalDataSourceCache.h +++ b/src/Storages/Cache/ExternalDataSourceCache.h @@ -70,7 +70,7 @@ public: void initOnce(ContextPtr context, const String & root_dir_, size_t limit_size_, size_t bytes_read_before_flush_); - inline bool isInitialized() const { return initialized; } + bool isInitialized() const { return initialized; } std::pair, std::unique_ptr> createReader(ContextPtr context, IRemoteFileMetadataPtr remote_file_metadata, std::unique_ptr & read_buffer, bool is_random_accessed); diff --git a/src/Storages/Cache/RemoteCacheController.h b/src/Storages/Cache/RemoteCacheController.h index 782a6b89519..22b3d64b1db 100644 --- a/src/Storages/Cache/RemoteCacheController.h +++ b/src/Storages/Cache/RemoteCacheController.h @@ -45,41 +45,41 @@ public: */ void waitMoreData(size_t start_offset_, size_t end_offset_); - inline size_t size() const { return current_offset; } + size_t size() const { return current_offset; } - inline const std::filesystem::path & getLocalPath() { return local_path; } - inline String getRemotePath() const { return file_metadata_ptr->remote_path; } + const std::filesystem::path & getLocalPath() { return local_path; } + String getRemotePath() const { return file_metadata_ptr->remote_path; } - inline UInt64 getLastModificationTimestamp() const { return file_metadata_ptr->last_modification_timestamp; } + 
UInt64 getLastModificationTimestamp() const { return file_metadata_ptr->last_modification_timestamp; } bool isModified(IRemoteFileMetadataPtr file_metadata_); - inline void markInvalid() + void markInvalid() { std::lock_guard lock(mutex); valid = false; } - inline bool isValid() + bool isValid() { std::lock_guard lock(mutex); return valid; } - inline bool isEnable() + bool isEnable() { std::lock_guard lock(mutex); return is_enable; } - inline void disable() + void disable() { std::lock_guard lock(mutex); is_enable = false; } - inline void enable() + void enable() { std::lock_guard lock(mutex); is_enable = true; } IRemoteFileMetadataPtr getFileMetadata() { return file_metadata_ptr; } - inline size_t getFileSize() const { return file_metadata_ptr->file_size; } + size_t getFileSize() const { return file_metadata_ptr->file_size; } void startBackgroundDownload(std::unique_ptr in_readbuffer_, BackgroundSchedulePool & thread_pool); diff --git a/src/Storages/Hive/HiveFile.h b/src/Storages/Hive/HiveFile.h index 536214e159f..20d005c8038 100644 --- a/src/Storages/Hive/HiveFile.h +++ b/src/Storages/Hive/HiveFile.h @@ -65,8 +65,8 @@ public: {ORC_INPUT_FORMAT, FileFormat::ORC}, }; - static inline bool isFormatClass(const String & format_class) { return VALID_HDFS_FORMATS.contains(format_class); } - static inline FileFormat toFileFormat(const String & format_class) + static bool isFormatClass(const String & format_class) { return VALID_HDFS_FORMATS.contains(format_class); } + static FileFormat toFileFormat(const String & format_class) { if (isFormatClass(format_class)) { diff --git a/src/Storages/Kafka/KafkaConsumer.h b/src/Storages/Kafka/KafkaConsumer.h index f160d1c0855..a3bc97779b3 100644 --- a/src/Storages/Kafka/KafkaConsumer.h +++ b/src/Storages/Kafka/KafkaConsumer.h @@ -82,17 +82,17 @@ public: auto pollTimeout() const { return poll_timeout; } - inline bool hasMorePolledMessages() const + bool hasMorePolledMessages() const { return (stalled_status == NOT_STALLED) && (current != messages.end()); } - inline bool polledDataUnusable() const + bool polledDataUnusable() const { return (stalled_status != NOT_STALLED) && (stalled_status != NO_MESSAGES_RETURNED); } - inline bool isStalled() const { return stalled_status != NOT_STALLED; } + bool isStalled() const { return stalled_status != NOT_STALLED; } void storeLastReadMessageOffset(); void resetToLastCommitted(const char * msg); diff --git a/src/Storages/MergeTree/BackgroundProcessList.h b/src/Storages/MergeTree/BackgroundProcessList.h index c9a4887cca3..bf29aaf32d0 100644 --- a/src/Storages/MergeTree/BackgroundProcessList.h +++ b/src/Storages/MergeTree/BackgroundProcessList.h @@ -87,7 +87,7 @@ public: virtual void onEntryCreate(const Entry & /* entry */) {} virtual void onEntryDestroy(const Entry & /* entry */) {} - virtual inline ~BackgroundProcessList() = default; + virtual ~BackgroundProcessList() = default; }; } diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index c380f99060e..c63f811363a 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -456,23 +456,23 @@ public: /// File with compression codec name which was used to compress part columns /// by default. Some columns may have their own compression codecs, but /// default will be stored in this file. 
- static inline constexpr auto DEFAULT_COMPRESSION_CODEC_FILE_NAME = "default_compression_codec.txt"; + static constexpr auto DEFAULT_COMPRESSION_CODEC_FILE_NAME = "default_compression_codec.txt"; /// "delete-on-destroy.txt" is deprecated. It is no longer being created, only is removed. - static inline constexpr auto DELETE_ON_DESTROY_MARKER_FILE_NAME_DEPRECATED = "delete-on-destroy.txt"; + static constexpr auto DELETE_ON_DESTROY_MARKER_FILE_NAME_DEPRECATED = "delete-on-destroy.txt"; - static inline constexpr auto UUID_FILE_NAME = "uuid.txt"; + static constexpr auto UUID_FILE_NAME = "uuid.txt"; /// File that contains information about kinds of serialization of columns /// and information that helps to choose kind of serialization later during merging /// (number of rows, number of rows with default values, etc). - static inline constexpr auto SERIALIZATION_FILE_NAME = "serialization.json"; + static constexpr auto SERIALIZATION_FILE_NAME = "serialization.json"; /// Version used for transactions. - static inline constexpr auto TXN_VERSION_METADATA_FILE_NAME = "txn_version.txt"; + static constexpr auto TXN_VERSION_METADATA_FILE_NAME = "txn_version.txt"; - static inline constexpr auto METADATA_VERSION_FILE_NAME = "metadata_version.txt"; + static constexpr auto METADATA_VERSION_FILE_NAME = "metadata_version.txt"; /// One of part files which is used to check how many references (I'd like /// to say hardlinks, but it will confuse even more) we have for the part @@ -484,7 +484,7 @@ public: /// it was mutation without any change for source part. In this case we /// really don't need to remove data from remote FS and need only decrement /// reference counter locally. - static inline constexpr auto FILE_FOR_REFERENCES_CHECK = "checksums.txt"; + static constexpr auto FILE_FOR_REFERENCES_CHECK = "checksums.txt"; /// Checks that all TTLs (table min/max, column ttls, so on) for part /// calculated. Part without calculated TTL may exist if TTL was added after diff --git a/src/Storages/MergeTree/MergeTreeBlockReadUtils.h b/src/Storages/MergeTree/MergeTreeBlockReadUtils.h index b19c42c8db8..c1514416301 100644 --- a/src/Storages/MergeTree/MergeTreeBlockReadUtils.h +++ b/src/Storages/MergeTree/MergeTreeBlockReadUtils.h @@ -41,13 +41,13 @@ struct MergeTreeBlockSizePredictor void update(const Block & sample_block, const Columns & columns, size_t num_rows, double decay = calculateDecay()); /// Return current block size (after update()) - inline size_t getBlockSize() const + size_t getBlockSize() const { return block_size_bytes; } /// Predicts what number of rows should be read to exhaust byte quota per column - inline size_t estimateNumRowsForMaxSizeColumn(size_t bytes_quota) const + size_t estimateNumRowsForMaxSizeColumn(size_t bytes_quota) const { double max_size_per_row = std::max(std::max(max_size_per_row_fixed, 1), max_size_per_row_dynamic); return (bytes_quota > block_size_rows * max_size_per_row) @@ -56,14 +56,14 @@ struct MergeTreeBlockSizePredictor } /// Predicts what number of rows should be read to exhaust byte quota per block - inline size_t estimateNumRows(size_t bytes_quota) const + size_t estimateNumRows(size_t bytes_quota) const { return (bytes_quota > block_size_bytes) ? 
static_cast((bytes_quota - block_size_bytes) / std::max(1, static_cast(bytes_per_row_current))) : 0; } - inline void updateFilteredRowsRation(size_t rows_was_read, size_t rows_was_filtered, double decay = calculateDecay()) + void updateFilteredRowsRation(size_t rows_was_read, size_t rows_was_filtered, double decay = calculateDecay()) { double alpha = std::pow(1. - decay, rows_was_read); double current_ration = rows_was_filtered / std::max(1.0, static_cast(rows_was_read)); diff --git a/src/Storages/MergeTree/MergeTreeIndexGranularityInfo.h b/src/Storages/MergeTree/MergeTreeIndexGranularityInfo.h index 85006c3ffde..87445c99ade 100644 --- a/src/Storages/MergeTree/MergeTreeIndexGranularityInfo.h +++ b/src/Storages/MergeTree/MergeTreeIndexGranularityInfo.h @@ -64,8 +64,8 @@ public: std::string describe() const; }; -constexpr inline auto getNonAdaptiveMrkSizeWide() { return sizeof(UInt64) * 2; } -constexpr inline auto getAdaptiveMrkSizeWide() { return sizeof(UInt64) * 3; } +constexpr auto getNonAdaptiveMrkSizeWide() { return sizeof(UInt64) * 2; } +constexpr auto getAdaptiveMrkSizeWide() { return sizeof(UInt64) * 3; } inline size_t getAdaptiveMrkSizeCompact(size_t columns_num); } diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index 9d086e1dc37..f96206ce657 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -307,7 +307,7 @@ public: /// Get best replica having this partition on a same type remote disk String getSharedDataReplica(const IMergeTreeDataPart & part, const DataSourceDescription & data_source_description) const; - inline const String & getReplicaName() const { return replica_name; } + const String & getReplicaName() const { return replica_name; } /// Restores table metadata if ZooKeeper lost it. /// Used only on restarted readonly replicas (not checked). All active (Active) parts are moved to detached/ diff --git a/src/Storages/UVLoop.h b/src/Storages/UVLoop.h index dd1d64973d1..907a3fc0b13 100644 --- a/src/Storages/UVLoop.h +++ b/src/Storages/UVLoop.h @@ -57,9 +57,9 @@ public: } } - inline uv_loop_t * getLoop() { return loop_ptr.get(); } + uv_loop_t * getLoop() { return loop_ptr.get(); } - inline const uv_loop_t * getLoop() const { return loop_ptr.get(); } + const uv_loop_t * getLoop() const { return loop_ptr.get(); } private: std::unique_ptr loop_ptr; diff --git a/src/TableFunctions/ITableFunction.h b/src/TableFunctions/ITableFunction.h index 1946d8e8905..ed7f80e5df9 100644 --- a/src/TableFunctions/ITableFunction.h +++ b/src/TableFunctions/ITableFunction.h @@ -39,7 +39,7 @@ class Context; class ITableFunction : public std::enable_shared_from_this { public: - static inline std::string getDatabaseName() { return "_table_function"; } + static std::string getDatabaseName() { return "_table_function"; } /// Get the main function name. 
virtual std::string getName() const = 0; From 75d163da12b8c6b5671d40f33eaa12e0409f2566 Mon Sep 17 00:00:00 2001 From: avogar Date: Sun, 19 May 2024 12:17:01 +0000 Subject: [PATCH 183/392] Fix tests --- .../03159_dynamic_type_all_types.reference | 12 ++---------- .../0_stateless/03159_dynamic_type_all_types.sql | 8 ++------ 2 files changed, 4 insertions(+), 16 deletions(-) diff --git a/tests/queries/0_stateless/03159_dynamic_type_all_types.reference b/tests/queries/0_stateless/03159_dynamic_type_all_types.reference index abecca893f9..72c5b90dbba 100644 --- a/tests/queries/0_stateless/03159_dynamic_type_all_types.reference +++ b/tests/queries/0_stateless/03159_dynamic_type_all_types.reference @@ -109,10 +109,6 @@ MultiPolygon [[[(0,0),(10,0),(10,10),(0,10)]],[[(20,20),(50,20),(50,50),(20,50)] Map(Dynamic, Dynamic) {'11':'v1','22':'1'} Nested(x UInt32, y String) [(1,'aa'),(2,'bb')] Nested(x UInt32, y Tuple(y1 UInt32, y2 Array(String)), z Nested(z1 UInt32, z2 String)) [(1,(2,['aa','bb']),[(3,'cc'),(4,'dd')]),(5,(6,['ee','ff']),[(7,'gg'),(8,'hh')])] -Object(\'json\') {"1":"2"} -Object(Nullable(\'json\')) {"k1":1,"k2":2,"1":null,"2":null,"2020-10-10":null} -Object(Nullable(\'json\')) {"k1":null,"k2":null,"1":null,"2":null,"2020-10-10":"foo"} -Object(Nullable(\'json\')) {"k1":null,"k2":null,"1":2,"2":3,"2020-10-10":null} Point (1.23,4.5600000000000005) Ring [(1.23,4.5600000000000005),(2.34,5.67)] String string @@ -258,10 +254,6 @@ MultiPolygon [[[(0,0),(10,0),(10,10),(0,10)]],[[(20,20),(50,20),(50,50),(20,50)] Map(Dynamic, Dynamic) {'11':'v1','22':'1'} Nested(x UInt32, y String) [(1,'aa'),(2,'bb')] Nested(x UInt32, y Tuple(y1 UInt32, y2 Array(String)), z Nested(z1 UInt32, z2 String)) [(1,(2,['aa','bb']),[(3,'cc'),(4,'dd')]),(5,(6,['ee','ff']),[(7,'gg'),(8,'hh')])] -Object(\'json\') {"1":"2"} -Object(Nullable(\'json\')) {"k1":1,"k2":2,"1":null,"2":null,"2020-10-10":null} -Object(Nullable(\'json\')) {"k1":null,"k2":null,"1":null,"2":null,"2020-10-10":"foo"} -Object(Nullable(\'json\')) {"k1":null,"k2":null,"1":2,"2":3,"2020-10-10":null} Point (1.23,4.5600000000000005) Ring [(1.23,4.5600000000000005),(2.34,5.67)] String string @@ -296,5 +288,5 @@ UInt256 1 UInt256 115792089237316195423570985008687907853269984665640564039457584007913129639934 UInt256 115792089237316195423570985008687907853269984665640564039457584007913129639935 -50 -50 +48 +48 diff --git a/tests/queries/0_stateless/03159_dynamic_type_all_types.sql b/tests/queries/0_stateless/03159_dynamic_type_all_types.sql index 64fab07ed4f..d302205ca23 100644 --- a/tests/queries/0_stateless/03159_dynamic_type_all_types.sql +++ b/tests/queries/0_stateless/03159_dynamic_type_all_types.sql @@ -49,10 +49,6 @@ INSERT INTO t VALUES ('1'::Bool), (0::Bool); -- Dates: use Date and Date32 for days, and DateTime and DateTime64 for instances in time INSERT INTO t VALUES ('2022-01-01'::Date), ('2022-01-01'::Date32), ('2022-01-01 01:01:01'::DateTime), ('2022-01-01 01:01:01.011'::DateTime64); --- JSON -INSERT INTO t VALUES ('{"1":"2"}'::JSON); -INSERT INTO t FORMAT JSONEachRow {"d" : {"k1" : 1, "k2" : 2}} {"d" : {"1" : 2, "2" : 3}} {"d" : {"2020-10-10" : "foo"}}; - -- UUID INSERT INTO t VALUES ('dededdb6-7835-4ce4-8d11-b5de6f2820e9'::UUID); INSERT INTO t VALUES ('00000000-0000-0000-0000-000000000000'::UUID); @@ -86,13 +82,13 @@ INSERT INTO t VALUES (interval '1' day), (interval '2' month), (interval '3' yea INSERT INTO t VALUES ([(1, 'aa'), (2, 'bb')]::Nested(x UInt32, y String)); INSERT INTO t VALUES ([(1, (2, ['aa', 'bb']), [(3, 'cc'), (4, 'dd')]), (5, (6, 
['ee', 'ff']), [(7, 'gg'), (8, 'hh')])]::Nested(x UInt32, y Tuple(y1 UInt32, y2 Array(String)), z Nested(z1 UInt32, z2 String))); -SELECT dynamicType(d), d FROM t ORDER BY substring(dynamicType(d),1,1), length(dynamicType(d)), d, toString(d); +SELECT dynamicType(d), d FROM t ORDER BY substring(dynamicType(d),1,1), length(dynamicType(d)), d; CREATE TABLE t2 (d Dynamic(max_types=255)) ENGINE = Memory; INSERT INTO t2 SELECT * FROM t; SELECT ''; -SELECT dynamicType(d), d FROM t2 ORDER BY substring(dynamicType(d),1,1), length(dynamicType(d)), d, toString(d); +SELECT dynamicType(d), d FROM t2 ORDER BY substring(dynamicType(d),1,1), length(dynamicType(d)), d; SELECT ''; SELECT uniqExact(dynamicType(d)) t_ FROM t; From bb0fcc929695701ccde2ca49298e50792636fa1c Mon Sep 17 00:00:00 2001 From: pufit Date: Sun, 19 May 2024 08:33:37 -0400 Subject: [PATCH 184/392] better tests --- ...te_view_with_sql_security_option.reference | 2 + ...84_create_view_with_sql_security_option.sh | 78 +++++++++---------- 2 files changed, 41 insertions(+), 39 deletions(-) diff --git a/tests/queries/0_stateless/02884_create_view_with_sql_security_option.reference b/tests/queries/0_stateless/02884_create_view_with_sql_security_option.reference index 931cf8ac19c..0589fdeef04 100644 --- a/tests/queries/0_stateless/02884_create_view_with_sql_security_option.reference +++ b/tests/queries/0_stateless/02884_create_view_with_sql_security_option.reference @@ -24,6 +24,8 @@ OK 2 OK OK +OK +100 100 ===== TestGrants ===== OK diff --git a/tests/queries/0_stateless/02884_create_view_with_sql_security_option.sh b/tests/queries/0_stateless/02884_create_view_with_sql_security_option.sh index f1da343da36..f32aee44bee 100755 --- a/tests/queries/0_stateless/02884_create_view_with_sql_security_option.sh +++ b/tests/queries/0_stateless/02884_create_view_with_sql_security_option.sh @@ -159,6 +159,45 @@ ${CLICKHOUSE_CLIENT} --query "REVOKE SELECT ON $db.test_table FROM $user1" (( $(${CLICKHOUSE_CLIENT} --user $user2 --query "SELECT * FROM $db.test_mv_4" 2>&1 | grep -c "Not enough privileges") >= 1 )) && echo "OK" || echo "UNEXPECTED" (( $(${CLICKHOUSE_CLIENT} --query "INSERT INTO $db.test_table VALUES ('foo'), ('bar');" 2>&1 | grep -c "Not enough privileges") >= 1 )) && echo "OK" || echo "UNEXPECTED" +${CLICKHOUSE_CLIENT} --multiquery <&1 | grep -c "Not enough privileges") >= 1 )) && echo "OK" || echo "UNEXPECTED" +${CLICKHOUSE_CLIENT} --query "GRANT INSERT ON $db.source TO $user2" +${CLICKHOUSE_CLIENT} --user $user2 --query "INSERT INTO source SELECT * FROM generateRandom() LIMIT 100" + +${CLICKHOUSE_CLIENT} --query "SELECT count() FROM destination1" +${CLICKHOUSE_CLIENT} --query "SELECT count() FROM destination2" echo "===== TestGrants =====" ${CLICKHOUSE_CLIENT} --query "GRANT CREATE ON *.* TO $user1" @@ -192,45 +231,6 @@ ${CLICKHOUSE_CLIENT} --user $user1 --query " ${CLICKHOUSE_CLIENT} --query "GRANT SET DEFINER ON $user2 TO $user1" -${CLICKHOUSE_CLIENT} --multiquery < Date: Sun, 19 May 2024 12:51:14 +0000 Subject: [PATCH 185/392] Restore the warning --- .clang-tidy | 2 ++ src/Common/CurrentThread.h | 2 +- src/Common/findExtreme.cpp | 4 ++-- src/Functions/ExtractString.h | 6 +++--- .../FunctionsLanguageClassification.cpp | 2 +- .../FunctionsProgrammingClassification.cpp | 2 +- src/Functions/FunctionsStringHash.cpp | 20 +++++++++---------- src/Functions/FunctionsStringSimilarity.cpp | 6 +++--- .../FunctionsTonalityClassification.cpp | 2 +- src/Functions/PolygonUtils.h | 2 +- src/Processors/Port.h | 6 +++--- 11 files changed, 28 insertions(+), 26 
deletions(-) diff --git a/.clang-tidy b/.clang-tidy index 66417c41c46..7e8f604467b 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -118,6 +118,8 @@ Checks: [ '-readability-magic-numbers', '-readability-named-parameter', '-readability-redundant-declaration', + '-readability-redundant-inline-specifier', # generally useful but incompatible with __attribute((always_inline))__ (aka. ALWAYS_INLINE). + # it has an effect only if combined with `inline`: https://godbolt.org/z/Eefd74qdM '-readability-simplify-boolean-expr', '-readability-suspicious-call-argument', '-readability-uppercase-literal-suffix', diff --git a/src/Common/CurrentThread.h b/src/Common/CurrentThread.h index 8dade8c6fd5..e1eb926c951 100644 --- a/src/Common/CurrentThread.h +++ b/src/Common/CurrentThread.h @@ -62,7 +62,7 @@ public: static void updatePerformanceCountersIfNeeded(); static ProfileEvents::Counters & getProfileEvents(); - static MemoryTracker * getMemoryTracker() + ALWAYS_INLINE inline static MemoryTracker * getMemoryTracker() { if (!current_thread) [[unlikely]] return nullptr; diff --git a/src/Common/findExtreme.cpp b/src/Common/findExtreme.cpp index a99b1f2dd3d..ce3bbb86d7c 100644 --- a/src/Common/findExtreme.cpp +++ b/src/Common/findExtreme.cpp @@ -11,13 +11,13 @@ namespace DB template struct MinComparator { - static ALWAYS_INLINE const T & cmp(const T & a, const T & b) { return std::min(a, b); } + static ALWAYS_INLINE inline const T & cmp(const T & a, const T & b) { return std::min(a, b); } }; template struct MaxComparator { - static ALWAYS_INLINE const T & cmp(const T & a, const T & b) { return std::max(a, b); } + static ALWAYS_INLINE inline const T & cmp(const T & a, const T & b) { return std::max(a, b); } }; MULTITARGET_FUNCTION_AVX2_SSE42( diff --git a/src/Functions/ExtractString.h b/src/Functions/ExtractString.h index 5b8fa41958a..aa0e1b04835 100644 --- a/src/Functions/ExtractString.h +++ b/src/Functions/ExtractString.h @@ -20,7 +20,7 @@ namespace DB // includes extracting ASCII ngram, UTF8 ngram, ASCII word and UTF8 word struct ExtractStringImpl { - static ALWAYS_INLINE const UInt8 * readOneWord(const UInt8 *& pos, const UInt8 * end) + static ALWAYS_INLINE inline const UInt8 * readOneWord(const UInt8 *& pos, const UInt8 * end) { // jump separators while (pos < end && isUTF8Sep(*pos)) @@ -35,10 +35,10 @@ struct ExtractStringImpl } // we use ASCII non-alphanum character as UTF8 separator - static ALWAYS_INLINE bool isUTF8Sep(const UInt8 c) { return c < 128 && !isAlphaNumericASCII(c); } + static ALWAYS_INLINE inline bool isUTF8Sep(const UInt8 c) { return c < 128 && !isAlphaNumericASCII(c); } // read one UTF8 character - static ALWAYS_INLINE void readOneUTF8Code(const UInt8 *& pos, const UInt8 * end) + static ALWAYS_INLINE inline void readOneUTF8Code(const UInt8 *& pos, const UInt8 * end) { size_t length = UTF8::seqLength(*pos); diff --git a/src/Functions/FunctionsLanguageClassification.cpp b/src/Functions/FunctionsLanguageClassification.cpp index 94391606762..55485d41ce0 100644 --- a/src/Functions/FunctionsLanguageClassification.cpp +++ b/src/Functions/FunctionsLanguageClassification.cpp @@ -31,7 +31,7 @@ extern const int SUPPORT_IS_DISABLED; struct FunctionDetectLanguageImpl { - static ALWAYS_INLINE std::string_view codeISO(std::string_view code_string) + static ALWAYS_INLINE inline std::string_view codeISO(std::string_view code_string) { if (code_string.ends_with("-Latn")) code_string.remove_suffix(code_string.size() - 5); diff --git a/src/Functions/FunctionsProgrammingClassification.cpp 
b/src/Functions/FunctionsProgrammingClassification.cpp index 8e9eff50aab..a93e1d9a87d 100644 --- a/src/Functions/FunctionsProgrammingClassification.cpp +++ b/src/Functions/FunctionsProgrammingClassification.cpp @@ -21,7 +21,7 @@ namespace DB struct FunctionDetectProgrammingLanguageImpl { /// Calculate total weight - static ALWAYS_INLINE Float64 stateMachine( + static ALWAYS_INLINE inline Float64 stateMachine( const FrequencyHolder::Map & standard, const std::unordered_map & model) { diff --git a/src/Functions/FunctionsStringHash.cpp b/src/Functions/FunctionsStringHash.cpp index cd33564caf9..0bf6e39e651 100644 --- a/src/Functions/FunctionsStringHash.cpp +++ b/src/Functions/FunctionsStringHash.cpp @@ -99,7 +99,7 @@ struct Hash } template - static ALWAYS_INLINE UInt64 shingleHash(UInt64 crc, const UInt8 * start, size_t size) + static ALWAYS_INLINE inline UInt64 shingleHash(UInt64 crc, const UInt8 * start, size_t size) { if (size & 1) { @@ -153,7 +153,7 @@ struct Hash } template - static ALWAYS_INLINE UInt64 shingleHash(const std::vector & shingle, size_t offset = 0) + static ALWAYS_INLINE inline UInt64 shingleHash(const std::vector & shingle, size_t offset = 0) { UInt64 crc = -1ULL; @@ -177,14 +177,14 @@ struct SimHashImpl static constexpr size_t min_word_size = 4; /// Update fingerprint according to hash_value bits. - static ALWAYS_INLINE void updateFingerVector(Int64 * finger_vec, UInt64 hash_value) + static ALWAYS_INLINE inline void updateFingerVector(Int64 * finger_vec, UInt64 hash_value) { for (size_t i = 0; i < 64; ++i) finger_vec[i] += (hash_value & (1ULL << i)) ? 1 : -1; } /// Return a 64 bit value according to finger_vec. - static ALWAYS_INLINE UInt64 getSimHash(const Int64 * finger_vec) + static ALWAYS_INLINE inline UInt64 getSimHash(const Int64 * finger_vec) { UInt64 res = 0; @@ -200,7 +200,7 @@ struct SimHashImpl // for each ngram, calculate a 64 bit hash value, and update the vector according the hash value // finally return a 64 bit value(UInt64), i'th bit is 1 means vector[i] > 0, otherwise, vector[i] < 0 - static ALWAYS_INLINE UInt64 ngramHashASCII(const UInt8 * data, size_t size, size_t shingle_size) + static ALWAYS_INLINE inline UInt64 ngramHashASCII(const UInt8 * data, size_t size, size_t shingle_size) { if (size < shingle_size) return Hash::shingleHash(-1ULL, data, size); @@ -217,7 +217,7 @@ struct SimHashImpl return getSimHash(finger_vec); } - static ALWAYS_INLINE UInt64 ngramHashUTF8(const UInt8 * data, size_t size, size_t shingle_size) + static ALWAYS_INLINE inline UInt64 ngramHashUTF8(const UInt8 * data, size_t size, size_t shingle_size) { const UInt8 * start = data; const UInt8 * end = data + size; @@ -259,7 +259,7 @@ struct SimHashImpl // 2. 
next, we extract one word each time, and calculate a new hash value of the new word,then use the latest N hash // values to calculate the next word shingle hash value - static ALWAYS_INLINE UInt64 wordShingleHash(const UInt8 * data, size_t size, size_t shingle_size) + static ALWAYS_INLINE inline UInt64 wordShingleHash(const UInt8 * data, size_t size, size_t shingle_size) { const UInt8 * start = data; const UInt8 * end = data + size; @@ -400,7 +400,7 @@ struct MinHashImpl using MaxHeap = Heap>; using MinHeap = Heap>; - static ALWAYS_INLINE void ngramHashASCII( + static ALWAYS_INLINE inline void ngramHashASCII( MinHeap & min_heap, MaxHeap & max_heap, const UInt8 * data, @@ -429,7 +429,7 @@ struct MinHashImpl } } - static ALWAYS_INLINE void ngramHashUTF8( + static ALWAYS_INLINE inline void ngramHashUTF8( MinHeap & min_heap, MaxHeap & max_heap, const UInt8 * data, @@ -472,7 +472,7 @@ struct MinHashImpl // MinHash word shingle hash value calculate function: String ->Tuple(UInt64, UInt64) // for each word shingle, we calculate a hash value, but in fact, we just maintain the // K minimum and K maximum hash value - static ALWAYS_INLINE void wordShingleHash( + static ALWAYS_INLINE inline void wordShingleHash( MinHeap & min_heap, MaxHeap & max_heap, const UInt8 * data, diff --git a/src/Functions/FunctionsStringSimilarity.cpp b/src/Functions/FunctionsStringSimilarity.cpp index 5224c76d7b0..7b3f2337c89 100644 --- a/src/Functions/FunctionsStringSimilarity.cpp +++ b/src/Functions/FunctionsStringSimilarity.cpp @@ -85,7 +85,7 @@ struct NgramDistanceImpl } template - static ALWAYS_INLINE void unrollLowering(Container & cont, const std::index_sequence &) + static ALWAYS_INLINE inline void unrollLowering(Container & cont, const std::index_sequence &) { ((cont[Offset + I] = std::tolower(cont[Offset + I])), ...); } @@ -195,7 +195,7 @@ struct NgramDistanceImpl } template - static ALWAYS_INLINE size_t calculateNeedleStats( + static ALWAYS_INLINE inline size_t calculateNeedleStats( const char * data, const size_t size, NgramCount * ngram_stats, @@ -228,7 +228,7 @@ struct NgramDistanceImpl } template - static ALWAYS_INLINE UInt64 calculateHaystackStatsAndMetric( + static ALWAYS_INLINE inline UInt64 calculateHaystackStatsAndMetric( const char * data, const size_t size, NgramCount * ngram_stats, diff --git a/src/Functions/FunctionsTonalityClassification.cpp b/src/Functions/FunctionsTonalityClassification.cpp index a8cc09186f6..3de38d99c88 100644 --- a/src/Functions/FunctionsTonalityClassification.cpp +++ b/src/Functions/FunctionsTonalityClassification.cpp @@ -18,7 +18,7 @@ namespace DB */ struct FunctionDetectTonalityImpl { - static ALWAYS_INLINE Float32 detectTonality( + static ALWAYS_INLINE inline Float32 detectTonality( const UInt8 * str, const size_t str_len, const FrequencyHolder::Map & emotional_dict) diff --git a/src/Functions/PolygonUtils.h b/src/Functions/PolygonUtils.h index 0c57fd7f0b5..4ab146b085f 100644 --- a/src/Functions/PolygonUtils.h +++ b/src/Functions/PolygonUtils.h @@ -124,7 +124,7 @@ public: bool hasEmptyBound() const { return has_empty_bound; } - bool ALWAYS_INLINE contains(CoordinateType x, CoordinateType y) const + bool ALWAYS_INLINE inline contains(CoordinateType x, CoordinateType y) const { Point point(x, y); diff --git a/src/Processors/Port.h b/src/Processors/Port.h index 2d39f2dd6be..f3c7bbb5fee 100644 --- a/src/Processors/Port.h +++ b/src/Processors/Port.h @@ -38,7 +38,7 @@ public: UInt64 version = 0; UInt64 prev_version = 0; - void ALWAYS_INLINE update() + void inline ALWAYS_INLINE 
update() { if (version == prev_version && update_list) update_list->push_back(id); @@ -46,7 +46,7 @@ public: ++version; } - void ALWAYS_INLINE trigger() { prev_version = version; } + void inline ALWAYS_INLINE trigger() { prev_version = version; } }; protected: @@ -249,7 +249,7 @@ public: } protected: - void ALWAYS_INLINE updateVersion() + void inline ALWAYS_INLINE updateVersion() { if (likely(update_info)) update_info->update(); From 639f7f166f6ba1f4c078b30e66fd40605b9866f5 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Sun, 19 May 2024 12:53:17 +0000 Subject: [PATCH 186/392] Fix typo --- .clang-tidy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.clang-tidy b/.clang-tidy index 7e8f604467b..7dafaeb9e3f 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -119,7 +119,7 @@ Checks: [ '-readability-named-parameter', '-readability-redundant-declaration', '-readability-redundant-inline-specifier', # generally useful but incompatible with __attribute((always_inline))__ (aka. ALWAYS_INLINE). - # it has an effect only if combined with `inline`: https://godbolt.org/z/Eefd74qdM + # ALWAYS_INLINE has an effect only if combined with `inline`: https://godbolt.org/z/Eefd74qdM '-readability-simplify-boolean-expr', '-readability-suspicious-call-argument', '-readability-uppercase-literal-suffix', From ff392b0aeb668d34049dfaee0966fba91186227c Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Sun, 19 May 2024 13:00:30 +0000 Subject: [PATCH 187/392] Minor corrections --- src/Common/CurrentThread.h | 2 +- src/Functions/PolygonUtils.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Common/CurrentThread.h b/src/Common/CurrentThread.h index e1eb926c951..53b61ba315f 100644 --- a/src/Common/CurrentThread.h +++ b/src/Common/CurrentThread.h @@ -62,7 +62,7 @@ public: static void updatePerformanceCountersIfNeeded(); static ProfileEvents::Counters & getProfileEvents(); - ALWAYS_INLINE inline static MemoryTracker * getMemoryTracker() + inline ALWAYS_INLINE static MemoryTracker * getMemoryTracker() { if (!current_thread) [[unlikely]] return nullptr; diff --git a/src/Functions/PolygonUtils.h b/src/Functions/PolygonUtils.h index 4ab146b085f..c4851718da6 100644 --- a/src/Functions/PolygonUtils.h +++ b/src/Functions/PolygonUtils.h @@ -124,7 +124,7 @@ public: bool hasEmptyBound() const { return has_empty_bound; } - bool ALWAYS_INLINE inline contains(CoordinateType x, CoordinateType y) const + inline bool ALWAYS_INLINE contains(CoordinateType x, CoordinateType y) const { Point point(x, y); From f143ae6969c77b5ebe44ec4865251caaa18db7fa Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Sun, 19 May 2024 14:31:21 +0000 Subject: [PATCH 188/392] Fix build --- src/Coordination/KeeperServer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Coordination/KeeperServer.cpp b/src/Coordination/KeeperServer.cpp index b07c90b8660..736a01443ce 100644 --- a/src/Coordination/KeeperServer.cpp +++ b/src/Coordination/KeeperServer.cpp @@ -990,7 +990,7 @@ KeeperServer::ConfigUpdateState KeeperServer::applyConfigUpdate( raft_instance->set_priority(update->id, update->priority, /*broadcast on live leader*/true); return Accepted; } - chassert(false); + std::unreachable(); } ClusterUpdateActions KeeperServer::getRaftConfigurationDiff(const Poco::Util::AbstractConfiguration & config) From 513900cb524d7b3e96cfbe8b8b56d9b0b0eb6070 Mon Sep 17 00:00:00 2001 From: Yakov Olkhovskiy Date: Sun, 19 May 2024 15:44:19 +0000 Subject: [PATCH 189/392] assume columns from projection are aggregates --- 
src/Planner/PlannerExpressionAnalysis.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/Planner/PlannerExpressionAnalysis.cpp b/src/Planner/PlannerExpressionAnalysis.cpp index e7d553af944..399bbfc67cf 100644 --- a/src/Planner/PlannerExpressionAnalysis.cpp +++ b/src/Planner/PlannerExpressionAnalysis.cpp @@ -454,6 +454,13 @@ SortAnalysisResult analyzeSort(const QueryNode & query_node, before_sort_actions_inputs_name_to_node.emplace(node->result_name, node); std::unordered_set aggregation_keys; + + auto projection_expression_dag = std::make_shared(); + for (const auto & node : query_node.getProjection()) + actions_visitor.visit(projection_expression_dag, node); + for (const auto & node : projection_expression_dag->getNodes()) + aggregation_keys.insert(node.result_name); + if (aggregation_analysis_result_optional) aggregation_keys.insert(aggregation_analysis_result_optional->aggregation_keys.begin(), aggregation_analysis_result_optional->aggregation_keys.end()); From 1293a0f79572213f2cd90f5a6f09fbe39d8dbf9e Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Sun, 19 May 2024 18:47:58 +0000 Subject: [PATCH 190/392] Cosmetics, pt. I --- src/Functions/generateSnowflakeID.cpp | 95 +++++++++++++-------------- 1 file changed, 45 insertions(+), 50 deletions(-) diff --git a/src/Functions/generateSnowflakeID.cpp b/src/Functions/generateSnowflakeID.cpp index 1decda0ab46..28fc2eb6b05 100644 --- a/src/Functions/generateSnowflakeID.cpp +++ b/src/Functions/generateSnowflakeID.cpp @@ -18,8 +18,7 @@ namespace ErrorCodes namespace { -/* - Snowflake ID +/* Snowflake ID https://en.wikipedia.org/wiki/Snowflake_ID 0 1 2 3 @@ -30,35 +29,34 @@ namespace | | machine_id | machine_seq_num | ├─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┤ -- The first 41 (+ 1 top zero bit) bits is timestamp in Unix time milliseconds -- The middle 10 bits are the machine ID. -- The last 12 bits decode to number of ids processed by the machine at the given millisecond. 
+- The first 41 (+ 1 top zero bit) bits is the timestamp (millisecond since Unix epoch 1 Jan 1970) +- The middle 10 bits are the machine ID +- The last 12 bits are a counter to disambiguate multiple snowflakeIDs generated within the same millisecond by differen processes */ -constexpr auto timestamp_size = 41; -constexpr auto machine_id_size = 10; -constexpr auto machine_seq_num_size = 12; +constexpr auto timestamp_bits_count = 41; +constexpr auto machine_id_bits_count = 10; +constexpr auto machine_seq_num_bits_count = 12; -constexpr int64_t timestamp_mask = ((1LL << timestamp_size) - 1) << (machine_id_size + machine_seq_num_size); -constexpr int64_t machine_id_mask = ((1LL << machine_id_size) - 1) << machine_seq_num_size; -constexpr int64_t machine_seq_num_mask = (1LL << machine_seq_num_size) - 1; +constexpr int64_t timestamp_mask = ((1LL << timestamp_bits_count) - 1) << (machine_id_bits_count + machine_seq_num_bits_count); +constexpr int64_t machine_id_mask = ((1LL << machine_id_bits_count) - 1) << machine_seq_num_bits_count; +constexpr int64_t machine_seq_num_mask = (1LL << machine_seq_num_bits_count) - 1; constexpr int64_t max_machine_seq_num = machine_seq_num_mask; Int64 getMachineID() { - auto serverUUID = ServerUUID::get(); - - // hash serverUUID into 64 bits - Int64 h = UUIDHelpers::getHighBytes(serverUUID); - Int64 l = UUIDHelpers::getLowBytes(serverUUID); - return ((h * 11) ^ (l * 17)) & machine_id_mask; + UUID server_uuid = ServerUUID::get(); + /// hash into 64 bits + UInt64 hi = UUIDHelpers::getHighBytes(server_uuid); + UInt64 lo = UUIDHelpers::getLowBytes(server_uuid); + return ((hi * 11) ^ (lo * 17)) & machine_id_mask; } Int64 getTimestamp() { - const auto tm_point = std::chrono::system_clock::now(); - return std::chrono::duration_cast( - tm_point.time_since_epoch()).count() & ((1LL << timestamp_size) - 1); + auto now = std::chrono::system_clock::now(); + auto ticks_since_epoch = std::chrono::duration_cast(now.time_since_epoch()).count(); + return ticks_since_epoch & ((1LL << timestamp_bits_count) - 1); } } @@ -66,16 +64,11 @@ Int64 getTimestamp() class FunctionSnowflakeID : public IFunction { private: - mutable std::atomic lowest_available_snowflake_id{0}; - // 1 atomic value because we don't want to use mutex + mutable std::atomic lowest_available_snowflake_id = 0; /// atomic to avoid a mutex public: static constexpr auto name = "generateSnowflakeID"; - - static FunctionPtr create(ContextPtr /*context*/) - { - return std::make_shared(); - } + static FunctionPtr create(ContextPtr /*context*/) { return std::make_shared(); } String getName() const override { return name; } size_t getNumberOfArguments() const override { return 0; } @@ -95,31 +88,34 @@ public: return std::make_shared(); } - ColumnPtr executeImpl(const ColumnsWithTypeAndName & /*arguments*/, const DataTypePtr &, size_t input_rows_count) const override { auto col_res = ColumnVector::create(); typename ColumnVector::Container & vec_to = col_res->getData(); - Int64 size64 = static_cast(input_rows_count); + vec_to.resize(input_rows_count); if (input_rows_count == 0) { return col_res; } - Int64 machine_id = getMachineID(); + const Int64 machine_id = getMachineID(); Int64 current_timestamp = getTimestamp(); Int64 current_machine_seq_num; - Int64 available_id, next_available_id; + Int64 available_snowflake_id, next_available_snowflake_id; + + const Int64 size64 = static_cast(input_rows_count); + do { - available_id = lowest_available_snowflake_id.load(); - Int64 available_timestamp = (available_id & timestamp_mask) >> 
(machine_id_size + machine_seq_num_size); - Int64 available_machine_seq_num = available_id & machine_seq_num_mask; + available_snowflake_id = lowest_available_snowflake_id.load(); + const Int64 available_timestamp = (available_snowflake_id & timestamp_mask) >> (machine_id_bits_count + machine_seq_num_bits_count); + const Int64 available_machine_seq_num = available_snowflake_id & machine_seq_num_mask; if (current_timestamp > available_timestamp) { + /// handle overflow current_machine_seq_num = 0; } else @@ -128,24 +124,23 @@ public: current_machine_seq_num = available_machine_seq_num; } - // calculate new `lowest_available_snowflake_id` + /// calculate new lowest_available_snowflake_id + const Int64 seq_nums_in_current_timestamp_left = (max_machine_seq_num - current_machine_seq_num + 1); Int64 new_timestamp; - Int64 seq_nums_in_current_timestamp_left = (max_machine_seq_num - current_machine_seq_num + 1); - if (size64 >= seq_nums_in_current_timestamp_left) { + if (size64 >= seq_nums_in_current_timestamp_left) new_timestamp = current_timestamp + 1 + (size64 - seq_nums_in_current_timestamp_left) / max_machine_seq_num; - } else { + else new_timestamp = current_timestamp; - } - Int64 new_machine_seq_num = (current_machine_seq_num + size64) & machine_seq_num_mask; - next_available_id = (new_timestamp << (machine_id_size + machine_seq_num_size)) | machine_id | new_machine_seq_num; + const Int64 new_machine_seq_num = (current_machine_seq_num + size64) & machine_seq_num_mask; + next_available_snowflake_id = (new_timestamp << (machine_id_bits_count + machine_seq_num_bits_count)) | machine_id | new_machine_seq_num; } - while (!lowest_available_snowflake_id.compare_exchange_strong(available_id, next_available_id)); - // failed CAS => another thread updated `lowest_available_snowflake_id` - // successful CAS => we have our range of exclusive values + while (!lowest_available_snowflake_id.compare_exchange_strong(available_snowflake_id, next_available_snowflake_id)); + /// failed CAS => another thread updated `lowest_available_snowflake_id` + /// successful CAS => we have our range of exclusive values - for (Int64 & el : vec_to) + for (Int64 & to_row : vec_to) { - el = (current_timestamp << (machine_id_size + machine_seq_num_size)) | machine_id | current_machine_seq_num; + to_row = (current_timestamp << (machine_id_bits_count + machine_seq_num_bits_count)) | machine_id | current_machine_seq_num; if (current_machine_seq_num++ == max_machine_seq_num) { current_machine_seq_num = 0; @@ -163,10 +158,10 @@ REGISTER_FUNCTION(GenerateSnowflakeID) factory.registerFunction(FunctionDocumentation { .description=R"( -Generates Snowflake ID -- unique identificators contains: -- The first 41 (+ 1 top zero bit) bits is timestamp in Unix time milliseconds -- The middle 10 bits are the machine ID. -- The last 12 bits decode to number of ids processed by the machine at the given millisecond. +Generates a SnowflakeID -- unique identificators contains: +- The first 41 (+ 1 top zero bit) bits is the timestamp (millisecond since Unix epoch 1 Jan 1970) +- The middle 10 bits are the machine ID +- The last 12 bits are a counter to disambiguate multiple snowflakeIDs generated within the same millisecond by differen processes In case the number of ids processed overflows, the timestamp field is incremented by 1 and the counter is reset to 0. This function guarantees strict monotony on 1 machine and differences in values obtained on different machines. 
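[Editor's note] The comment block rewritten in the patch above describes the 41/10/12 bit split of a Snowflake ID. As a stand-alone illustration (not part of the patch; all names below are invented for the sketch), the following program packs and unpacks an ID under that split. Unlike the patch, the machine id here is passed unshifted and shifted inside the helper:

#include <cstdint>
#include <cstdio>

constexpr int machine_id_bits_count = 10;
constexpr int machine_seq_num_bits_count = 12;

// Pack a millisecond timestamp, a machine id and a per-millisecond counter
// into one signed 64-bit value: | 0 | 41 bits ts | 10 bits machine | 12 bits seq |.
constexpr int64_t packSnowflakeID(int64_t timestamp_ms, int64_t machine_id, int64_t seq_num)
{
    return (timestamp_ms << (machine_id_bits_count + machine_seq_num_bits_count))
         | (machine_id << machine_seq_num_bits_count)
         | seq_num;
}

int main()
{
    const int64_t id = packSnowflakeID(1716200000000 /* ms since epoch */, 42, 7);

    const int64_t seq     = id & ((1LL << machine_seq_num_bits_count) - 1);
    const int64_t machine = (id >> machine_seq_num_bits_count) & ((1LL << machine_id_bits_count) - 1);
    const int64_t ts      = id >> (machine_id_bits_count + machine_seq_num_bits_count);

    std::printf("ts=%lld machine=%lld seq=%lld\n",
                (long long) ts, (long long) machine, (long long) seq);
    return 0;
}

In the patch itself the machine id is kept pre-shifted (machine_id_mask already includes the 12-bit shift), so there it is OR-ed in directly.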
From 08a3c16a5aca95c73cc0ea1aaf2d57edb6acaef2 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Sun, 19 May 2024 18:53:51 +0000 Subject: [PATCH 191/392] Cosmetics, pt. II --- src/Functions/generateSnowflakeID.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Functions/generateSnowflakeID.cpp b/src/Functions/generateSnowflakeID.cpp index 28fc2eb6b05..d70b8349cd8 100644 --- a/src/Functions/generateSnowflakeID.cpp +++ b/src/Functions/generateSnowflakeID.cpp @@ -105,7 +105,7 @@ public: Int64 available_snowflake_id, next_available_snowflake_id; - const Int64 size64 = static_cast(input_rows_count); + const Int64 input_rows_count_signed = static_cast(input_rows_count); do { @@ -127,11 +127,11 @@ public: /// calculate new lowest_available_snowflake_id const Int64 seq_nums_in_current_timestamp_left = (max_machine_seq_num - current_machine_seq_num + 1); Int64 new_timestamp; - if (size64 >= seq_nums_in_current_timestamp_left) - new_timestamp = current_timestamp + 1 + (size64 - seq_nums_in_current_timestamp_left) / max_machine_seq_num; + if (input_rows_count_signed >= seq_nums_in_current_timestamp_left) + new_timestamp = current_timestamp + 1 + (input_rows_count_signed - seq_nums_in_current_timestamp_left) / max_machine_seq_num; else new_timestamp = current_timestamp; - const Int64 new_machine_seq_num = (current_machine_seq_num + size64) & machine_seq_num_mask; + const Int64 new_machine_seq_num = (current_machine_seq_num + input_rows_count_signed) & machine_seq_num_mask; next_available_snowflake_id = (new_timestamp << (machine_id_bits_count + machine_seq_num_bits_count)) | machine_id | new_machine_seq_num; } while (!lowest_available_snowflake_id.compare_exchange_strong(available_snowflake_id, next_available_snowflake_id)); From e8d66bf4d79d4ee1f3b18a4ccb1865f3f7ce7294 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Sun, 19 May 2024 19:16:24 +0000 Subject: [PATCH 192/392] Cosmetics, pt. 
III --- src/Functions/serial.cpp | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/src/Functions/serial.cpp b/src/Functions/serial.cpp index 3da2f4ce218..de3036ad242 100644 --- a/src/Functions/serial.cpp +++ b/src/Functions/serial.cpp @@ -17,16 +17,16 @@ namespace ErrorCodes class FunctionSerial : public IFunction { private: - mutable zkutil::ZooKeeperPtr zk{nullptr}; + mutable zkutil::ZooKeeperPtr zk; ContextPtr context; public: static constexpr auto name = "serial"; - explicit FunctionSerial(ContextPtr ctx) : context(ctx) + explicit FunctionSerial(ContextPtr context_) : context(context_) { - if (ctx->hasZooKeeper()) { - zk = ctx->getZooKeeper(); + if (context->hasZooKeeper()) { + zk = context->getZooKeeper(); } } @@ -37,7 +37,6 @@ public: String getName() const override { return name; } size_t getNumberOfArguments() const override { return 1; } - bool isStateful() const override { return true; } bool isDeterministic() const override { return false; } bool isDeterministicInScopeOfQuery() const override { return false; } @@ -74,14 +73,14 @@ public: auto col_res = ColumnVector::create(); typename ColumnVector::Container & vec_to = col_res->getData(); - size_t size = input_rows_count; - vec_to.resize(size); + + vec_to.resize(input_rows_count); const auto & serial_path = "/serials/" + arguments[0].column->getDataAt(0).toString(); - // CAS in ZooKeeper - // `get` value and version, `trySet` new with version check - // I didn't get how to do it with `multi` + /// CAS in ZooKeeper + /// `get` value and version, `trySet` new with version check + /// I didn't get how to do it with `multi` Int64 counter; std::string counter_path = serial_path + "/counter"; @@ -93,10 +92,10 @@ public: Coordination::Stat stat; while (true) { - std::string counter_string = zk->get(counter_path, &stat); + const String counter_string = zk->get(counter_path, &stat); counter = std::stoll(counter_string); - std::string updated_counter = std::to_string(counter + input_rows_count); - Coordination::Error err = zk->trySet(counter_path, updated_counter); + String updated_counter = std::to_string(counter + input_rows_count); + const Coordination::Error err = zk->trySet(counter_path, updated_counter); if (err == Coordination::Error::ZOK) { // CAS is done @@ -111,7 +110,7 @@ public: } // Make a result - for (auto& val : vec_to) + for (auto & val : vec_to) { val = counter; ++counter; @@ -137,16 +136,16 @@ The server should be configured with a ZooKeeper. 
}, .returned_value = "Sequential numbers of type Int64 starting from the previous counter value", .examples{ - {"first call", "SELECT serial('name')", R"( -┌─serial('name')─┐ + {"first call", "SELECT serial('id1')", R"( +┌─serial('id1')──┐ │ 1 │ └────────────────┘)"}, - {"second call", "SELECT serial('name')", R"( -┌─serial('name')─┐ + {"second call", "SELECT serial('id1')", R"( +┌─serial('id1')──┐ │ 2 │ └────────────────┘)"}, - {"column call", "SELECT *, serial('name') FROM test_table", R"( -┌─CounterID─┬─UserID─┬─ver─┬─serial('name')─┐ + {"column call", "SELECT *, serial('id1') FROM test_table", R"( +┌─CounterID─┬─UserID─┬─ver─┬─serial('id1')──┐ │ 1 │ 3 │ 3 │ 3 │ │ 1 │ 1 │ 1 │ 4 │ │ 1 │ 2 │ 2 │ 5 │ From 5d848aa32f1127098895cc29ad3200b5b325768a Mon Sep 17 00:00:00 2001 From: copperybean Date: Sun, 19 May 2024 23:20:40 +0800 Subject: [PATCH 193/392] update comment of method visitNullableBySteps, try to suppress clang-18 tidy warnings Change-Id: I3119c44dc764caed0dc471f52ac5e634c75c7b50 --- .../Impl/Parquet/ParquetDataValuesReader.cpp | 14 +++++++++++--- .../Formats/Impl/Parquet/ParquetDataValuesReader.h | 13 +++++++------ 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp index 65f569ec264..b8e4db8700c 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp +++ b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp @@ -14,6 +14,17 @@ namespace ErrorCodes extern const int PARQUET_EXCEPTION; } +RleValuesReader::RleValuesReader( + std::unique_ptr bit_reader_, Int32 bit_width_) + : bit_reader(std::move(bit_reader_)), bit_width(bit_width_) +{ + if (unlikely(bit_width >= 64)) + { + // e.g. in GetValue_ in bit_stream_utils.h, uint64 type is used to read bit values + throw Exception(ErrorCodes::PARQUET_EXCEPTION, "unsupported bit width {}", bit_width); + } +} + void RleValuesReader::nextGroup() { // refer to: @@ -29,9 +40,6 @@ void RleValuesReader::nextGroup() { cur_group_size *= 8; cur_packed_bit_values.resize(cur_group_size); - - // try to suppress clang tidy warnings by assertion - assert(bit_width < 64); bit_reader->GetBatch(bit_width, cur_packed_bit_values.data(), cur_group_size); } else diff --git a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.h b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.h index 0f916ff862d..75adb55df7e 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.h +++ b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.h @@ -18,8 +18,7 @@ namespace DB class RleValuesReader { public: - RleValuesReader(std::unique_ptr bit_reader_, Int32 bit_width_) - : bit_reader(std::move(bit_reader_)), bit_width(bit_width_) {} + RleValuesReader(std::unique_ptr bit_reader_, Int32 bit_width_); /** * @brief Used when the bit_width is 0, so all elements have same value. @@ -71,12 +70,14 @@ public: * @tparam IndividualNullVisitor A callback with signature: void(size_t cursor), used to process null value * @tparam SteppedValidVisitor A callback with signature: * void(size_t cursor, const std::vector & valid_index_steps) - * for n valid elements with null value interleaved in a BitPacked group, + * valid_index_steps records the gap size between two valid elements, * i-th item in valid_index_steps describes how many elements there are * from i-th valid element (include) to (i+1)-th valid element (exclude). 
* - * take following BitPacked group with 2 valid elements for example: - * null valid null null valid null + * take following BitPacked group values for example, and assuming max_def_level is 1: + * [1, 0, 1, 1, 0, 1 ] + * null valid null null valid null + * the second line shows the corresponding validation state, * then the valid_index_steps has values [1, 3, 2]. * Please note that the the sum of valid_index_steps is same as elements number in this group. * @@ -117,7 +118,7 @@ private: std::vector cur_packed_bit_values; std::vector valid_index_steps; - Int32 bit_width; + const Int32 bit_width; UInt32 cur_group_size = 0; UInt32 cur_group_cursor = 0; From ad5f6f27dff104f6229819be27fba3732226603e Mon Sep 17 00:00:00 2001 From: copperybean Date: Mon, 20 May 2024 16:28:21 +0800 Subject: [PATCH 194/392] fix reader type, update comment Change-Id: Iefec91bca223eedaabe302b7891808c6d94eed9d --- .../Impl/Parquet/ParquetDataValuesReader.h | 1 + .../Impl/Parquet/ParquetRecordReader.cpp | 29 ++++++++++++++----- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.h b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.h index 75adb55df7e..fbccb612b3c 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.h +++ b/src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.h @@ -80,6 +80,7 @@ public: * the second line shows the corresponding validation state, * then the valid_index_steps has values [1, 3, 2]. * Please note that the the sum of valid_index_steps is same as elements number in this group. + * TODO the definition of valid_index_steps should be updated when supporting nested types * * @tparam RepeatedVisitor A callback with signature: void(bool is_valid, UInt32 cursor, UInt32 count) */ diff --git a/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp b/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp index 0b797dd66ad..69da40b47e6 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp +++ b/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp @@ -27,6 +27,7 @@ namespace DB namespace ErrorCodes { + extern const int NOT_IMPLEMENTED; extern const int PARQUET_EXCEPTION; } @@ -225,7 +226,7 @@ std::unique_ptr ColReaderFactory::fromInt32INT(const parque { switch (int_type.bit_width()) { - case sizeof(Int32): + case 32: { if (int_type.is_signed()) return makeLeafReader(); @@ -241,7 +242,7 @@ std::unique_ptr ColReaderFactory::fromInt64INT(const parque { switch (int_type.bit_width()) { - case sizeof(Int64): + case 64: { if (int_type.is_signed()) return makeLeafReader(); @@ -312,16 +313,28 @@ ParquetRecordReader::ParquetRecordReader( { log = &Poco::Logger::get("ParquetRecordReader"); + std::unordered_map parquet_columns; + auto root = file_reader->metadata()->schema()->group_node(); + for (int i = 0; i < root->field_count(); ++i) + { + auto & node = root->field(i); + parquet_columns.emplace(node->name(), node); + } + parquet_col_indice.reserve(header.columns()); column_readers.reserve(header.columns()); for (const auto & col_with_name : header) { - auto idx = file_reader->metadata()->schema()->ColumnIndex(col_with_name.name); - if (idx < 0) - { - auto msg = PreformattedMessage::create("can not find column with name: {}", col_with_name.name); - throw Exception(std::move(msg), ErrorCodes::PARQUET_EXCEPTION); - } + auto it = parquet_columns.find(col_with_name.name); + if (it == parquet_columns.end()) + throw Exception(ErrorCodes::PARQUET_EXCEPTION, "no column with '{}' 
in parquet file", col_with_name.name); + + auto node = it->second; + if (!node->is_primitive()) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "arrays and maps are not implemented in native parquet reader"); + + auto idx = file_reader->metadata()->schema()->ColumnIndex(*node); + chassert(idx >= 0); parquet_col_indice.push_back(idx); } if (reader_properties.pre_buffer()) From 84459052b6cddd9a5e1ca4bcd00e5edfc6e49f12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Mon, 20 May 2024 21:27:24 +0200 Subject: [PATCH 195/392] Prevent LOGICAL_ERROR on CREATE TABLE as MaterializedView --- src/Interpreters/InterpreterCreateQuery.cpp | 7 +++++++ .../0_stateless/03161_create_table_as_mv.reference | 0 .../0_stateless/03161_create_table_as_mv.sql | 14 ++++++++++++++ 3 files changed, 21 insertions(+) create mode 100644 tests/queries/0_stateless/03161_create_table_as_mv.reference create mode 100644 tests/queries/0_stateless/03161_create_table_as_mv.sql diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 519cbde588f..711693f71b1 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -977,6 +977,13 @@ void InterpreterCreateQuery::setEngine(ASTCreateQuery & create) const if (as_create.is_ordinary_view) throw Exception(ErrorCodes::INCORRECT_QUERY, "Cannot CREATE a table AS {}, it is a View", qualified_name); + if (as_create.is_materialized_view && as_create.to_table_id) + throw Exception( + ErrorCodes::INCORRECT_QUERY, + "Cannot CREATE a table AS {}, it is a Materialized View without storage. Use \"AS `{}`\" instead", + qualified_name, + as_create.to_table_id.getQualifiedName()); + if (as_create.is_live_view) throw Exception(ErrorCodes::INCORRECT_QUERY, "Cannot CREATE a table AS {}, it is a Live View", qualified_name); diff --git a/tests/queries/0_stateless/03161_create_table_as_mv.reference b/tests/queries/0_stateless/03161_create_table_as_mv.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/03161_create_table_as_mv.sql b/tests/queries/0_stateless/03161_create_table_as_mv.sql new file mode 100644 index 00000000000..e80659ac923 --- /dev/null +++ b/tests/queries/0_stateless/03161_create_table_as_mv.sql @@ -0,0 +1,14 @@ +DROP TABLE IF EXISTS base_table; +DROP TABLE IF EXISTS target_table; +DROP TABLE IF EXISTS mv_from_base_to_target; +DROP TABLE IF EXISTS mv_with_storage; +DROP TABLE IF EXISTS other_table_1; +DROP TABLE IF EXISTS other_table_2; + +CREATE TABLE base_table (date DateTime, id String, cost Float64) ENGINE = MergeTree() ORDER BY date; +CREATE TABLE target_table (id String, total AggregateFunction(sum, Float64)) ENGINE = MergeTree() ORDER BY id; +CREATE MATERIALIZED VIEW mv_from_base_to_target TO target_table AS Select id, sumState(cost) FROM base_table GROUP BY id; +CREATE MATERIALIZED VIEW mv_with_storage ENGINE=MergeTree() ORDER BY id AS Select id, sumState(cost) FROM base_table GROUP BY id; + +CREATE TABLE other_table_1 AS mv_with_storage; +CREATE TABLE other_table_2 AS mv_from_base_to_target; -- { serverError INCORRECT_QUERY } From d66f0d6420e2d7972ce7eeb95188d394ed5a575f Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Tue, 21 May 2024 10:36:13 +0200 Subject: [PATCH 196/392] Properly fallback when native copy fails --- src/Backups/BackupIO_S3.cpp | 9 +- src/Disks/ObjectStorages/IObjectStorage.h | 12 ++- .../ObjectStorages/S3/S3ObjectStorage.cpp | 5 + src/Disks/ObjectStorages/S3/S3ObjectStorage.h | 1 + 
.../ObjectStorages/Web/WebObjectStorage.h | 2 + src/IO/S3/copyS3File.cpp | 95 ++++++++++++------- src/IO/S3/copyS3File.h | 5 +- 7 files changed, 89 insertions(+), 40 deletions(-) diff --git a/src/Backups/BackupIO_S3.cpp b/src/Backups/BackupIO_S3.cpp index 15860363615..eb6773b196e 100644 --- a/src/Backups/BackupIO_S3.cpp +++ b/src/Backups/BackupIO_S3.cpp @@ -195,7 +195,8 @@ void BackupReaderS3::copyFileToDisk(const String & path_in_backup, size_t file_s blob_storage_log, object_attributes, threadPoolCallbackRunnerUnsafe(getBackupsIOThreadPool().get(), "BackupReaderS3"), - /* for_disk_s3= */ true); + /* for_disk_s3= */ true, + destination_disk->getObjectStorage()->getS3StorageClient()); return file_size; }; @@ -252,7 +253,7 @@ void BackupWriterS3::copyFileFromDisk(const String & path_in_backup, DiskPtr src { LOG_TRACE(log, "Copying file {} from disk {} to S3", src_path, src_disk->getName()); copyS3File( - client, + src_disk->getObjectStorage()->getS3StorageClient(), /* src_bucket */ blob_path[1], /* src_key= */ blob_path[0], start_pos, @@ -263,7 +264,9 @@ void BackupWriterS3::copyFileFromDisk(const String & path_in_backup, DiskPtr src read_settings, blob_storage_log, {}, - threadPoolCallbackRunnerUnsafe(getBackupsIOThreadPool().get(), "BackupWriterS3")); + threadPoolCallbackRunnerUnsafe(getBackupsIOThreadPool().get(), "BackupWriterS3"), + /*for_disk_s3=*/false, + client); return; /// copied! } } diff --git a/src/Disks/ObjectStorages/IObjectStorage.h b/src/Disks/ObjectStorages/IObjectStorage.h index eae31af9d44..5ec318a1ca4 100644 --- a/src/Disks/ObjectStorages/IObjectStorage.h +++ b/src/Disks/ObjectStorages/IObjectStorage.h @@ -1,6 +1,5 @@ #pragma once -#include #include #include #include @@ -31,6 +30,10 @@ #include #endif +#if USE_AWS_S3 +#include +#endif + namespace DB { @@ -244,6 +247,13 @@ public: } #endif +#if USE_AWS_S3 + virtual std::shared_ptr getS3StorageClient() + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "This function is only implemented for S3ObjectStorage"); + } +#endif + private: mutable std::mutex throttlers_mutex; diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index 043e5b8ef8c..223f9d34a44 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -573,6 +573,11 @@ ObjectStorageKey S3ObjectStorage::generateObjectKeyForPath(const std::string & p return key_generator->generate(path, /* is_directory */ false); } +std::shared_ptr S3ObjectStorage::getS3StorageClient() +{ + return client.get(); +} + } #endif diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.h b/src/Disks/ObjectStorages/S3/S3ObjectStorage.h index 5eaab4b585c..b9fd2cbf4b2 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.h +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.h @@ -162,6 +162,7 @@ public: bool isReadOnly() const override { return s3_settings.get()->read_only; } + std::shared_ptr getS3StorageClient() override; private: void setNewSettings(std::unique_ptr && s3_settings_); diff --git a/src/Disks/ObjectStorages/Web/WebObjectStorage.h b/src/Disks/ObjectStorages/Web/WebObjectStorage.h index b8ab510a6fb..d57da588601 100644 --- a/src/Disks/ObjectStorages/Web/WebObjectStorage.h +++ b/src/Disks/ObjectStorages/Web/WebObjectStorage.h @@ -3,6 +3,8 @@ #include "config.h" #include + +#include #include namespace Poco diff --git a/src/IO/S3/copyS3File.cpp b/src/IO/S3/copyS3File.cpp index 549d0a569c6..46cadcef68c 100644 --- a/src/IO/S3/copyS3File.cpp +++ 
b/src/IO/S3/copyS3File.cpp @@ -652,7 +652,8 @@ namespace const std::optional> & object_metadata_, ThreadPoolCallbackRunnerUnsafe schedule_, bool for_disk_s3_, - BlobStorageLogWriterPtr blob_storage_log_) + BlobStorageLogWriterPtr blob_storage_log_, + std::function fallback_method_) : UploadHelper(client_ptr_, dest_bucket_, dest_key_, request_settings_, object_metadata_, schedule_, for_disk_s3_, blob_storage_log_, getLogger("copyS3File")) , src_bucket(src_bucket_) , src_key(src_key_) @@ -660,6 +661,7 @@ namespace , size(src_size_) , supports_multipart_copy(client_ptr_->supportsMultiPartCopy()) , read_settings(read_settings_) + , fallback_method(std::move(fallback_method_)) { } @@ -682,14 +684,7 @@ namespace size_t size; bool supports_multipart_copy; const ReadSettings read_settings; - - CreateReadBuffer getSourceObjectReadBuffer() - { - return [&] - { - return std::make_unique(client_ptr, src_bucket, src_key, "", request_settings, read_settings); - }; - } + std::function fallback_method; void performSingleOperationCopy() { @@ -754,18 +749,7 @@ namespace dest_bucket, dest_key, size); - copyDataToS3File( - getSourceObjectReadBuffer(), - offset, - size, - client_ptr, - dest_bucket, - dest_key, - request_settings, - blob_storage_log, - object_metadata, - schedule, - for_disk_s3); + fallback_method(); break; } else @@ -859,13 +843,24 @@ void copyDataToS3File( ThreadPoolCallbackRunnerUnsafe schedule, bool for_disk_s3) { - CopyDataToFileHelper helper{create_read_buffer, offset, size, dest_s3_client, dest_bucket, dest_key, settings, object_metadata, schedule, for_disk_s3, blob_storage_log}; + CopyDataToFileHelper helper{ + create_read_buffer, + offset, + size, + dest_s3_client, + dest_bucket, + dest_key, + settings, + object_metadata, + schedule, + for_disk_s3, + blob_storage_log}; helper.performCopy(); } void copyS3File( - const std::shared_ptr & s3_client, + const std::shared_ptr & src_s3_client, const String & src_bucket, const String & src_key, size_t src_offset, @@ -877,21 +872,53 @@ void copyS3File( BlobStorageLogWriterPtr blob_storage_log, const std::optional> & object_metadata, ThreadPoolCallbackRunnerUnsafe schedule, - bool for_disk_s3) + bool for_disk_s3, + std::shared_ptr dest_s3_client) { - if (settings.allow_native_copy) + if (!dest_s3_client) + dest_s3_client = src_s3_client; + + std::function fallback_method = [&] { - CopyFileHelper helper{s3_client, src_bucket, src_key, src_offset, src_size, dest_bucket, dest_key, settings, read_settings, object_metadata, schedule, for_disk_s3, blob_storage_log}; - helper.performCopy(); - } - else + auto create_read_buffer + = [&] { return std::make_unique(src_s3_client, src_bucket, src_key, "", settings, read_settings); }; + + copyDataToS3File( + create_read_buffer, + src_offset, + src_size, + dest_s3_client, + dest_bucket, + dest_key, + settings, + blob_storage_log, + object_metadata, + schedule, + for_disk_s3); + }; + + if (!settings.allow_native_copy) { - auto create_read_buffer = [&] - { - return std::make_unique(s3_client, src_bucket, src_key, "", settings, read_settings); - }; - copyDataToS3File(create_read_buffer, src_offset, src_size, s3_client, dest_bucket, dest_key, settings, blob_storage_log, object_metadata, schedule, for_disk_s3); + fallback_method(); + return; } + + CopyFileHelper helper{ + src_s3_client, + src_bucket, + src_key, + src_offset, + src_size, + dest_bucket, + dest_key, + settings, + read_settings, + object_metadata, + schedule, + for_disk_s3, + blob_storage_log, + std::move(fallback_method)}; + helper.performCopy(); } } 
diff --git a/src/IO/S3/copyS3File.h b/src/IO/S3/copyS3File.h index d5da4d260b1..cb1960cc368 100644 --- a/src/IO/S3/copyS3File.h +++ b/src/IO/S3/copyS3File.h @@ -31,7 +31,7 @@ using CreateReadBuffer = std::function()>; /// /// read_settings - is used for throttling in case of native copy is not possible void copyS3File( - const std::shared_ptr & s3_client, + const std::shared_ptr & src_s3_client, const String & src_bucket, const String & src_key, size_t src_offset, @@ -43,7 +43,8 @@ void copyS3File( BlobStorageLogWriterPtr blob_storage_log, const std::optional> & object_metadata = std::nullopt, ThreadPoolCallbackRunnerUnsafe schedule_ = {}, - bool for_disk_s3 = false); + bool for_disk_s3 = false, + std::shared_ptr dest_s3_client = nullptr); /// Copies data from any seekable source to S3. /// The same functionality can be done by using the function copyData() and the class WriteBufferFromS3 From b253ca36084ec50e8d06dfe50cb3561cd915a602 Mon Sep 17 00:00:00 2001 From: copperybean Date: Mon, 20 May 2024 23:12:07 +0800 Subject: [PATCH 197/392] fix clang-tidy warnings Change-Id: Iff9f5f894e815b184ac35f61b4cac87908c612b5 --- contrib/arrow | 2 +- src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/contrib/arrow b/contrib/arrow index 8f36d71d185..5cfccd8ea65 160000 --- a/contrib/arrow +++ b/contrib/arrow @@ -1 +1 @@ -Subproject commit 8f36d71d18587f1f315ec832f424183cb6519cbb +Subproject commit 5cfccd8ea65f33d4517e7409815d761c7650b45d diff --git a/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp b/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp index 69da40b47e6..a7e51f88b3c 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp +++ b/src/Processors/Formats/Impl/Parquet/ParquetRecordReader.cpp @@ -314,10 +314,10 @@ ParquetRecordReader::ParquetRecordReader( log = &Poco::Logger::get("ParquetRecordReader"); std::unordered_map parquet_columns; - auto root = file_reader->metadata()->schema()->group_node(); + const auto * root = file_reader->metadata()->schema()->group_node(); for (int i = 0; i < root->field_count(); ++i) { - auto & node = root->field(i); + const auto & node = root->field(i); parquet_columns.emplace(node->name(), node); } @@ -329,7 +329,7 @@ ParquetRecordReader::ParquetRecordReader( if (it == parquet_columns.end()) throw Exception(ErrorCodes::PARQUET_EXCEPTION, "no column with '{}' in parquet file", col_with_name.name); - auto node = it->second; + const auto & node = it->second; if (!node->is_primitive()) throw Exception(ErrorCodes::NOT_IMPLEMENTED, "arrays and maps are not implemented in native parquet reader"); From e1caea6ab51d032fcba5e4356d7a4b5869e2eb9c Mon Sep 17 00:00:00 2001 From: Francisco Javier Jurado Moreno <9376816+Beetelbrox@users.noreply.github.com> Date: Tue, 21 May 2024 11:29:07 +0200 Subject: [PATCH 198/392] Split attached table count into attached tables, views and dictionaries --- programs/server/Server.cpp | 2 ++ src/Common/CurrentMetrics.cpp | 2 ++ src/Core/ServerSettings.h | 2 ++ src/Databases/DatabaseLazy.cpp | 24 +++++++++++++++++++++-- src/Databases/DatabasesCommon.cpp | 22 +++++++++++++++++++-- src/Interpreters/Context.cpp | 20 +++++++++++++++++++ src/Interpreters/Context.h | 2 ++ tests/config/config.d/max_num_to_warn.xml | 2 ++ 8 files changed, 72 insertions(+), 4 deletions(-) diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 9c9476d1aa7..223bc1f77e7 100644 --- a/programs/server/Server.cpp +++ 
b/programs/server/Server.cpp @@ -1476,6 +1476,8 @@ try global_context->setMaxTableSizeToDrop(new_server_settings.max_table_size_to_drop); global_context->setMaxPartitionSizeToDrop(new_server_settings.max_partition_size_to_drop); global_context->setMaxTableNumToWarn(new_server_settings.max_table_num_to_warn); + global_context->setMaxViewNumToWarn(new_server_settings.max_view_num_to_warn); + global_context->setMaxDictionaryNumToWarn(new_server_settings.max_dictionary_num_to_warn); global_context->setMaxDatabaseNumToWarn(new_server_settings.max_database_num_to_warn); global_context->setMaxPartNumToWarn(new_server_settings.max_part_num_to_warn); diff --git a/src/Common/CurrentMetrics.cpp b/src/Common/CurrentMetrics.cpp index 21b4d114d79..b557edc3e12 100644 --- a/src/Common/CurrentMetrics.cpp +++ b/src/Common/CurrentMetrics.cpp @@ -224,6 +224,8 @@ M(PartsActive, "Active data part, used by current and upcoming SELECTs.") \ M(AttachedDatabase, "Active database, used by current and upcoming SELECTs.") \ M(AttachedTable, "Active table, used by current and upcoming SELECTs.") \ + M(AttachedView, "Active view, used by current and upcoming SELECTs.") \ + M(AttachedDictionary, "Active dictionary, used by current and upcoming SELECTs.") \ M(PartsOutdated, "Not active data part, but could be used by only current SELECTs, could be deleted after SELECTs finishes.") \ M(PartsDeleting, "Not active data part with identity refcounter, it is deleting right now by a cleaner.") \ M(PartsDeleteOnDestroy, "Part was moved to another disk and should be deleted in own destructor.") \ diff --git a/src/Core/ServerSettings.h b/src/Core/ServerSettings.h index 524d6ec07c2..af96ca3a557 100644 --- a/src/Core/ServerSettings.h +++ b/src/Core/ServerSettings.h @@ -97,6 +97,8 @@ namespace DB M(UInt64, max_table_size_to_drop, 50000000000lu, "If size of a table is greater than this value (in bytes) than table could not be dropped with any DROP query.", 0) \ M(UInt64, max_partition_size_to_drop, 50000000000lu, "Same as max_table_size_to_drop, but for the partitions.", 0) \ M(UInt64, max_table_num_to_warn, 5000lu, "If number of tables is greater than this value, server will create a warning that will displayed to user.", 0) \ + M(UInt64, max_view_num_to_warn, 5000lu, "If number of views is greater than this value, server will create a warning that will displayed to user.", 0) \ + M(UInt64, max_dictionary_num_to_warn, 5000lu, "If number of dictionaries is greater than this value, server will create a warning that will displayed to user.", 0) \ M(UInt64, max_database_num_to_warn, 1000lu, "If number of databases is greater than this value, server will create a warning that will displayed to user.", 0) \ M(UInt64, max_part_num_to_warn, 100000lu, "If number of databases is greater than this value, server will create a warning that will displayed to user.", 0) \ M(UInt64, concurrent_threads_soft_limit_num, 0, "Sets how many concurrent thread can be allocated before applying CPU pressure. 
Zero means unlimited.", 0) \ diff --git a/src/Databases/DatabaseLazy.cpp b/src/Databases/DatabaseLazy.cpp index fb1b3ee626b..ca985b5a7c8 100644 --- a/src/Databases/DatabaseLazy.cpp +++ b/src/Databases/DatabaseLazy.cpp @@ -10,6 +10,7 @@ #include #include #include +#include "Common/CurrentMetrics.h" #include #include @@ -24,6 +25,8 @@ namespace fs = std::filesystem; namespace CurrentMetrics { extern const Metric AttachedTable; + extern const Metric AttachedView; + extern const Metric AttachedDictionary; } @@ -184,7 +187,16 @@ void DatabaseLazy::attachTable(ContextPtr /* context_ */, const String & table_n throw Exception(ErrorCodes::TABLE_ALREADY_EXISTS, "Table {}.{} already exists.", backQuote(database_name), backQuote(table_name)); it->second.expiration_iterator = cache_expiration_queue.emplace(cache_expiration_queue.end(), current_time, table_name); - CurrentMetrics::add(CurrentMetrics::AttachedTable, 1); + CurrentMetrics::Metric metric; + if (table->isView()) { + metric = CurrentMetrics::AttachedView; + } else if (table->isDictionary()) { + metric = CurrentMetrics::AttachedDictionary; + } else { + metric = CurrentMetrics::AttachedTable; + } + CurrentMetrics::add(metric, 1); + } StoragePtr DatabaseLazy::detachTable(ContextPtr /* context */, const String & table_name) @@ -200,7 +212,15 @@ StoragePtr DatabaseLazy::detachTable(ContextPtr /* context */, const String & ta if (it->second.expiration_iterator != cache_expiration_queue.end()) cache_expiration_queue.erase(it->second.expiration_iterator); tables_cache.erase(it); - CurrentMetrics::sub(CurrentMetrics::AttachedTable, 1); + CurrentMetrics::Metric metric; + if (res->isView()) { + metric = CurrentMetrics::AttachedView; + } else if (res->isDictionary()) { + metric = CurrentMetrics::AttachedDictionary; + } else { + metric = CurrentMetrics::AttachedTable; + } + CurrentMetrics::sub(metric, 1); } return res; } diff --git a/src/Databases/DatabasesCommon.cpp b/src/Databases/DatabasesCommon.cpp index fc75f8e44b9..ab7f2fff5aa 100644 --- a/src/Databases/DatabasesCommon.cpp +++ b/src/Databases/DatabasesCommon.cpp @@ -18,6 +18,8 @@ namespace CurrentMetrics { extern const Metric AttachedTable; + extern const Metric AttachedView; + extern const Metric AttachedDictionary; } @@ -263,7 +265,15 @@ StoragePtr DatabaseWithOwnTablesBase::detachTableUnlocked(const String & table_n res = it->second; tables.erase(it); res->is_detached = true; - CurrentMetrics::sub(CurrentMetrics::AttachedTable, 1); + CurrentMetrics::Metric metric; + if (res->isView()) { + metric = CurrentMetrics::AttachedView; + } else if (res->isDictionary()) { + metric = CurrentMetrics::AttachedDictionary; + } else { + metric = CurrentMetrics::AttachedTable; + } + CurrentMetrics::sub(metric, 1); auto table_id = res->getStorageID(); if (table_id.hasUUID()) @@ -304,7 +314,15 @@ void DatabaseWithOwnTablesBase::attachTableUnlocked(const String & table_name, c /// It is important to reset is_detached here since in case of RENAME in /// non-Atomic database the is_detached is set to true before RENAME. 
table->is_detached = false; - CurrentMetrics::add(CurrentMetrics::AttachedTable, 1); + CurrentMetrics::Metric metric; + if (table->isView()) { + metric = CurrentMetrics::AttachedView; + } else if (table->isDictionary()) { + metric = CurrentMetrics::AttachedDictionary; + } else { + metric = CurrentMetrics::AttachedTable; + } + CurrentMetrics::add(metric, 1); } void DatabaseWithOwnTablesBase::shutdown() diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index 1bd9601dd7e..4c5df8ef4ea 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -160,6 +160,8 @@ namespace CurrentMetrics extern const Metric TablesLoaderForegroundThreadsScheduled; extern const Metric IOWriterThreadsScheduled; extern const Metric AttachedTable; + extern const Metric AttachedView; + extern const Metric AttachedDictionary; extern const Metric AttachedDatabase; extern const Metric PartsActive; } @@ -359,6 +361,8 @@ struct ContextSharedPart : boost::noncopyable /// No lock required for format_schema_path modified only during initialization std::atomic_size_t max_database_num_to_warn = 1000lu; std::atomic_size_t max_table_num_to_warn = 5000lu; + std::atomic_size_t max_view_num_to_warn = 5000lu; + std::atomic_size_t max_dictionary_num_to_warn = 5000lu; std::atomic_size_t max_part_num_to_warn = 100000lu; String format_schema_path; /// Path to a directory that contains schema files used by input formats. String google_protos_path; /// Path to a directory that contains the proto files for the well-known Protobuf types. @@ -935,6 +939,10 @@ Strings Context::getWarnings() const common_warnings = shared->warnings; if (CurrentMetrics::get(CurrentMetrics::AttachedTable) > static_cast(shared->max_table_num_to_warn)) common_warnings.emplace_back(fmt::format("The number of attached tables is more than {}", shared->max_table_num_to_warn)); + if (CurrentMetrics::get(CurrentMetrics::AttachedView) > static_cast(shared->max_view_num_to_warn)) + common_warnings.emplace_back(fmt::format("The number of attached views is more than {}", shared->max_view_num_to_warn)); + if (CurrentMetrics::get(CurrentMetrics::AttachedDictionary) > static_cast(shared->max_dictionary_num_to_warn)) + common_warnings.emplace_back(fmt::format("The number of attached dictionaries is more than {}", shared->max_dictionary_num_to_warn)); if (CurrentMetrics::get(CurrentMetrics::AttachedDatabase) > static_cast(shared->max_database_num_to_warn)) common_warnings.emplace_back(fmt::format("The number of attached databases is more than {}", shared->max_database_num_to_warn)); if (CurrentMetrics::get(CurrentMetrics::PartsActive) > static_cast(shared->max_part_num_to_warn)) @@ -3711,6 +3719,18 @@ void Context::setMaxTableNumToWarn(size_t max_table_to_warn) shared->max_table_num_to_warn= max_table_to_warn; } +void Context::setMaxViewNumToWarn(size_t max_view_to_warn) +{ + SharedLockGuard lock(shared->mutex); + shared->max_view_num_to_warn= max_view_to_warn; +} + +void Context::setMaxDictionaryNumToWarn(size_t max_dictionary_to_warn) +{ + SharedLockGuard lock(shared->mutex); + shared->max_dictionary_num_to_warn= max_dictionary_to_warn; +} + void Context::setMaxDatabaseNumToWarn(size_t max_database_to_warn) { SharedLockGuard lock(shared->mutex); diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index 7f663773e52..814534f7035 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -861,6 +861,8 @@ public: const HTTPHeaderFilter & getHTTPHeaderFilter() const; void setMaxTableNumToWarn(size_t 
max_table_to_warn); + void setMaxViewNumToWarn(size_t max_view_to_warn); + void setMaxDictionaryNumToWarn(size_t max_dictionary_to_warn); void setMaxDatabaseNumToWarn(size_t max_database_to_warn); void setMaxPartNumToWarn(size_t max_part_to_warn); /// The port that the server listens for executing SQL queries. diff --git a/tests/config/config.d/max_num_to_warn.xml b/tests/config/config.d/max_num_to_warn.xml index 776c270823d..1f55e6fd674 100644 --- a/tests/config/config.d/max_num_to_warn.xml +++ b/tests/config/config.d/max_num_to_warn.xml @@ -1,5 +1,7 @@ 5 + 5 + 5 2 10 From 311d6d6baa32ad0bdee1c58813c6d551aaeb53e0 Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Tue, 21 May 2024 09:38:36 +0000 Subject: [PATCH 199/392] Fix: 02124_insert_deduplication_token_multiple_blocks_replica --- .../02124_insert_deduplication_token_multiple_blocks_replica.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/queries/0_stateless/02124_insert_deduplication_token_multiple_blocks_replica.sh b/tests/queries/0_stateless/02124_insert_deduplication_token_multiple_blocks_replica.sh index 1c776263f78..0c95abb9867 100755 --- a/tests/queries/0_stateless/02124_insert_deduplication_token_multiple_blocks_replica.sh +++ b/tests/queries/0_stateless/02124_insert_deduplication_token_multiple_blocks_replica.sh @@ -9,6 +9,8 @@ INSERT_BLOCK_SETTINGS="max_insert_block_size=1&min_insert_block_size_rows=0&min_ $CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS block_dedup_token_replica SYNC" $CLICKHOUSE_CLIENT --query="CREATE TABLE block_dedup_token_replica (id Int32) ENGINE=ReplicatedMergeTree('/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/{table}', '{replica}') ORDER BY id" +# Need to stop merges due to randomization of old_parts_lifetime setting, so all initial parts are guaranteed to exist when we check them +$CLICKHOUSE_CLIENT --query="SYSTEM STOP MERGES block_dedup_token_replica" $CLICKHOUSE_CLIENT --query="SELECT 'insert 2 blocks with dedup token, 1 row per block'" DEDUP_TOKEN='dedup1' From e1fef7ecd77da0b1eaed4b0dbc7a73b36cd228ac Mon Sep 17 00:00:00 2001 From: Alexander Gololobov Date: Tue, 21 May 2024 12:54:46 +0200 Subject: [PATCH 200/392] Group const fields --- src/Storages/MergeTree/IMergeTreeDataPartWriter.cpp | 4 ++-- src/Storages/MergeTree/IMergeTreeDataPartWriter.h | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Storages/MergeTree/IMergeTreeDataPartWriter.cpp b/src/Storages/MergeTree/IMergeTreeDataPartWriter.cpp index e01572715d6..b3e33e94073 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPartWriter.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPartWriter.cpp @@ -56,14 +56,14 @@ IMergeTreeDataPartWriter::IMergeTreeDataPartWriter( const MergeTreeIndexGranularity & index_granularity_) : data_part_name(data_part_name_) , serializations(serializations_) - , data_part_storage(data_part_storage_) , index_granularity_info(index_granularity_info_) , storage_settings(storage_settings_) , metadata_snapshot(metadata_snapshot_) , columns_list(columns_list_) , settings(settings_) - , index_granularity(index_granularity_) , with_final_mark(settings.can_use_adaptive_granularity) + , data_part_storage(data_part_storage_) + , index_granularity(index_granularity_) { } diff --git a/src/Storages/MergeTree/IMergeTreeDataPartWriter.h b/src/Storages/MergeTree/IMergeTreeDataPartWriter.h index 3245a23339b..d2bf03483c9 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPartWriter.h +++ b/src/Storages/MergeTree/IMergeTreeDataPartWriter.h @@ -50,19 +50,19 @@ protected: IDataPartStorage & getDataPartStorage() { 
return *data_part_storage; } - /// Serializations for every columns and subcolumns by their names. const String data_part_name; + /// Serializations for every columns and subcolumns by their names. const SerializationByName serializations; - MutableDataPartStoragePtr data_part_storage; const MergeTreeIndexGranularityInfo index_granularity_info; const MergeTreeSettingsPtr storage_settings; const StorageMetadataPtr metadata_snapshot; const NamesAndTypesList columns_list; const MergeTreeWriterSettings settings; - MergeTreeIndexGranularity index_granularity; const bool with_final_mark; + MutableDataPartStoragePtr data_part_storage; MutableColumns index_columns; + MergeTreeIndexGranularity index_granularity; }; using MergeTreeDataPartWriterPtr = std::unique_ptr; From b80d878b4c7d20d6ba7ec0e820e01ae68f498c58 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Tue, 21 May 2024 13:21:53 +0200 Subject: [PATCH 201/392] Add test --- .../Cached/CachedObjectStorage.h | 7 + src/IO/S3/copyS3File.cpp | 14 +- tests/integration/helpers/cluster.py | 1 + .../configs/disk_s3_restricted_user.xml | 22 +++ .../test_backup_restore_s3/test.py | 132 ++++++++++++++++++ 5 files changed, 171 insertions(+), 5 deletions(-) create mode 100644 tests/integration/test_backup_restore_s3/configs/disk_s3_restricted_user.xml diff --git a/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h b/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h index 961c2709efc..fbb9a7e731e 100644 --- a/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h +++ b/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h @@ -126,6 +126,13 @@ public: } #endif +#if USE_AWS_S3 + std::shared_ptr getS3StorageClient() override + { + return object_storage->getS3StorageClient(); + } +#endif + private: FileCacheKey getCacheKey(const std::string & path) const; diff --git a/src/IO/S3/copyS3File.cpp b/src/IO/S3/copyS3File.cpp index 46cadcef68c..218bdf78907 100644 --- a/src/IO/S3/copyS3File.cpp +++ b/src/IO/S3/copyS3File.cpp @@ -739,16 +739,20 @@ namespace if (outcome.GetError().GetExceptionName() == "EntityTooLarge" || outcome.GetError().GetExceptionName() == "InvalidRequest" || outcome.GetError().GetExceptionName() == "InvalidArgument" || + outcome.GetError().GetExceptionName() == "AccessDenied" || (outcome.GetError().GetExceptionName() == "InternalError" && outcome.GetError().GetResponseCode() == Aws::Http::HttpResponseCode::GATEWAY_TIMEOUT && outcome.GetError().GetMessage().contains("use the Rewrite method in the JSON API"))) { - if (!supports_multipart_copy) + if (!supports_multipart_copy || outcome.GetError().GetExceptionName() == "AccessDenied") { - LOG_INFO(log, "Multipart upload using copy is not supported, will try regular upload for Bucket: {}, Key: {}, Object size: {}", - dest_bucket, - dest_key, - size); + LOG_INFO( + log, + "Multipart upload using copy is not supported, will try regular upload for Bucket: {}, Key: {}, Object size: " + "{}", + dest_bucket, + dest_key, + size); fallback_method(); break; } diff --git a/tests/integration/helpers/cluster.py b/tests/integration/helpers/cluster.py index c2bea3060aa..41c162217d2 100644 --- a/tests/integration/helpers/cluster.py +++ b/tests/integration/helpers/cluster.py @@ -513,6 +513,7 @@ class ClickHouseCluster: self.minio_redirect_host = "proxy1" self.minio_redirect_ip = None self.minio_redirect_port = 8080 + self.minio_docker_id = self.get_instance_docker_id(self.minio_host) self.spark_session = None diff --git a/tests/integration/test_backup_restore_s3/configs/disk_s3_restricted_user.xml 
b/tests/integration/test_backup_restore_s3/configs/disk_s3_restricted_user.xml new file mode 100644 index 00000000000..323e986f966
--- /dev/null
+++ b/tests/integration/test_backup_restore_s3/configs/disk_s3_restricted_user.xml
@@ -0,0 +1,22 @@
+<clickhouse>
+    <storage_configuration>
+        <disks>
+            <disk_s3_restricted_user>
+                <type>s3</type>
+                <endpoint>http://minio1:9001/root/data/disks/disk_s3_restricted_user/</endpoint>
+                <access_key_id>miniorestricted1</access_key_id>
+                <secret_access_key>minio123</secret_access_key>
+            </disk_s3_restricted_user>
+        </disks>
+
+        <policies>
+            <policy_s3_restricted_user>
+                <volumes>
+                    <main>
+                        <disk>disk_s3_restricted_user</disk>
+                    </main>
+                </volumes>
+            </policy_s3_restricted_user>
+        </policies>
+    </storage_configuration>
+</clickhouse>
diff --git a/tests/integration/test_backup_restore_s3/test.py b/tests/integration/test_backup_restore_s3/test.py index 05424887736..4ad2c133694 100644 --- a/tests/integration/test_backup_restore_s3/test.py +++ b/tests/integration/test_backup_restore_s3/test.py @@ -3,8 +3,11 @@ import pytest from helpers.cluster import ClickHouseCluster from helpers.test_tools import TSV import uuid +import os +CONFIG_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "configs") + cluster = ClickHouseCluster(__file__) node = cluster.add_instance( "node", @@ -20,13 +23,122 @@ node = cluster.add_instance( ], with_minio=True, with_zookeeper=True, + stay_alive=True, ) +def setup_minio_users(): + for user, bucket in [("miniorestricted1", "root"), ("miniorestricted2", "root2")]: + print( + cluster.exec_in_container( + cluster.minio_docker_id, + [ + "mc", + "alias", + "set", + "root", + "http://minio1:9001", + "minio", + "minio123", + ], + ) + ) + policy = f""" +{{ + "Version": "2012-10-17", + "Statement": [ + {{ + "Effect": "Allow", + "Principal": {{ + "AWS": [ + "*" + ] + }}, + "Action": [ + "s3:GetBucketLocation", + "s3:ListBucket", + "s3:ListBucketMultipartUploads" + ], + "Resource": [ + "arn:aws:s3:::{bucket}" + ] + }}, + {{ + "Effect": "Allow", + "Principal": {{ + "AWS": [ + "*" + ] + }}, + "Action": [ + "s3:AbortMultipartUpload", + "s3:DeleteObject", + "s3:GetObject", + "s3:ListMultipartUploadParts", + "s3:PutObject" + ], + "Resource": [ + "arn:aws:s3:::{bucket}/*" + ] + }} + ] +}}""" + + cluster.exec_in_container( + cluster.minio_docker_id, + ["bash", "-c", f"cat >/tmp/{bucket}_policy.json < Date: Tue, 21 May 2024 13:28:20 +0200 Subject: [PATCH 202/392] Cleanups --- src/Storages/MergeTree/IMergeTreeDataPart.h | 1 - src/Storages/MergeTree/MergeTreeDataPartCompact.cpp | 7 ++++--- src/Storages/MergeTree/MergeTreeDataPartCompact.h | 1 - src/Storages/MergeTree/MergeTreeDataPartWide.cpp | 9 ++++++--- src/Storages/MergeTree/MergeTreeDataPartWide.h | 1 - src/Storages/MergeTree/MergedBlockOutputStream.cpp | 2 +- 6 files changed, 11 insertions(+), 10 deletions(-) diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index 091a7ceb5bd..f4889d64179 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -104,7 +104,6 @@ public: const ValueSizeMap & avg_value_size_hints_, const ReadBufferFromFileBase::ProfileCallback & profile_callback_) const = 0; -// TODO: remove? 
virtual bool isStoredOnDisk() const = 0; virtual bool isStoredOnRemoteDisk() const = 0; diff --git a/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp b/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp index 373ad6c23ea..fb1c2fe35ed 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp @@ -74,9 +74,10 @@ MergeTreeDataPartWriterPtr createMergeTreeDataPartCompactWriter( //// { return *getColumnPosition(lhs.name) < *getColumnPosition(rhs.name); }); //// return std::make_unique( - data_part_name_, logger_name_, serializations_, data_part_storage_, - index_granularity_info_, storage_settings_, columns_list, metadata_snapshot, indices_to_recalc, stats_to_recalc_, - marks_file_extension_, default_codec_, writer_settings, computed_index_granularity); + data_part_name_, logger_name_, serializations_, data_part_storage_, + index_granularity_info_, storage_settings_, columns_list, metadata_snapshot, + indices_to_recalc, stats_to_recalc_, marks_file_extension_, + default_codec_, writer_settings, computed_index_granularity); } diff --git a/src/Storages/MergeTree/MergeTreeDataPartCompact.h b/src/Storages/MergeTree/MergeTreeDataPartCompact.h index ca88edba7b3..1fb84424774 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartCompact.h +++ b/src/Storages/MergeTree/MergeTreeDataPartCompact.h @@ -40,7 +40,6 @@ public: const ValueSizeMap & avg_value_size_hints, const ReadBufferFromFileBase::ProfileCallback & profile_callback) const override; -// TODO: remove? bool isStoredOnDisk() const override { return true; } bool isStoredOnRemoteDisk() const override; diff --git a/src/Storages/MergeTree/MergeTreeDataPartWide.cpp b/src/Storages/MergeTree/MergeTreeDataPartWide.cpp index 34a3f30c4ba..74cab30064a 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWide.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWide.cpp @@ -69,9 +69,12 @@ MergeTreeDataPartWriterPtr createMergeTreeDataPartWideWriter( const MergeTreeWriterSettings & writer_settings, const MergeTreeIndexGranularity & computed_index_granularity) { - return std::make_unique(data_part_name_, logger_name_, serializations_, data_part_storage_, - index_granularity_info_, storage_settings_, columns_list, metadata_snapshot, indices_to_recalc, stats_to_recalc_, - marks_file_extension_, default_codec_, writer_settings, computed_index_granularity); + return std::make_unique( + data_part_name_, logger_name_, serializations_, data_part_storage_, + index_granularity_info_, storage_settings_, columns_list, + metadata_snapshot, indices_to_recalc, stats_to_recalc_, + marks_file_extension_, + default_codec_, writer_settings, computed_index_granularity); } diff --git a/src/Storages/MergeTree/MergeTreeDataPartWide.h b/src/Storages/MergeTree/MergeTreeDataPartWide.h index e3cb3f04335..7465e08b7c4 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWide.h +++ b/src/Storages/MergeTree/MergeTreeDataPartWide.h @@ -35,7 +35,6 @@ public: const ValueSizeMap & avg_value_size_hints, const ReadBufferFromFileBase::ProfileCallback & profile_callback) const override; -// TODO: remove? 
bool isStoredOnDisk() const override { return true; } bool isStoredOnRemoteDisk() const override; diff --git a/src/Storages/MergeTree/MergedBlockOutputStream.cpp b/src/Storages/MergeTree/MergedBlockOutputStream.cpp index e0fb4f703a0..0fe3ee30a0d 100644 --- a/src/Storages/MergeTree/MergedBlockOutputStream.cpp +++ b/src/Storages/MergeTree/MergedBlockOutputStream.cpp @@ -40,7 +40,7 @@ MergedBlockOutputStream::MergedBlockOutputStream( /* rewrite_primary_key = */ true, blocks_are_granules_size); -// TODO: looks like isStoredOnDisk() is always true for MergeTreeDataPart + /// TODO: looks like isStoredOnDisk() is always true for MergeTreeDataPart if (data_part->isStoredOnDisk()) data_part_storage->createDirectories(); From 1e273f10e2056f25be2a616e8fa911a00dbb948e Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Tue, 21 May 2024 11:36:57 +0000 Subject: [PATCH 203/392] Automatic style fix --- tests/integration/test_backup_restore_s3/test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/test_backup_restore_s3/test.py b/tests/integration/test_backup_restore_s3/test.py index 4ad2c133694..a76b32bce39 100644 --- a/tests/integration/test_backup_restore_s3/test.py +++ b/tests/integration/test_backup_restore_s3/test.py @@ -134,6 +134,7 @@ def setup_minio_users(): ) node.start_clickhouse() + @pytest.fixture(scope="module", autouse=True) def start_cluster(): try: From 8fc1abf2ab06485d0c4c63d6a0a2484189f71f84 Mon Sep 17 00:00:00 2001 From: Francisco Javier Jurado Moreno <9376816+Beetelbrox@users.noreply.github.com> Date: Tue, 21 May 2024 15:51:24 +0200 Subject: [PATCH 204/392] Add documentation of new settings --- .../settings.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/docs/en/operations/server-configuration-parameters/settings.md b/docs/en/operations/server-configuration-parameters/settings.md index 28831404a1f..4d239309886 100644 --- a/docs/en/operations/server-configuration-parameters/settings.md +++ b/docs/en/operations/server-configuration-parameters/settings.md @@ -561,6 +561,25 @@ Default value: 5000 400 ``` +## max\_view\_num\_to\_warn {#max-view-num-to-warn} +If the number of attached views exceeds the specified value, clickhouse server will add warning messages to `system.warnings` table. +Default value: 5000 + +**Example** + +``` xml +400 +``` + +## max\_dictionary\_num\_to\_warn {#max-dictionary-num-to-warn} +If the number of attached dictionaries exceeds the specified value, clickhouse server will add warning messages to `system.warnings` table. +Default value: 5000 + +**Example** + +``` xml +400 +``` ## max\_part\_num\_to\_warn {#max-part-num-to-warn} If the number of active parts exceeds the specified value, clickhouse server will add warning messages to `system.warnings` table. 
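The view and dictionary thresholds documented above are backed by the `AttachedView` and `AttachedDictionary` current metrics introduced earlier in this series (the part threshold relies on `PartsActive`); once a counter crosses its `*_num_to_warn` setting, a message is added to `system.warnings`. A minimal sketch of how the behaviour can be checked on a running server, using only the tables and settings referenced in these patches:

```sql
-- Current number of attached objects of each kind.
SELECT metric, value
FROM system.metrics
WHERE metric IN ('AttachedTable', 'AttachedView', 'AttachedDictionary');

-- Warnings produced once max_table_num_to_warn, max_view_num_to_warn or
-- max_dictionary_num_to_warn is exceeded.
SELECT message FROM system.warnings;
```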
From 681de0145888b4dd30d75fd9b1fabe5e2e084b10 Mon Sep 17 00:00:00 2001 From: Francisco Javier Jurado Moreno <9376816+Beetelbrox@users.noreply.github.com> Date: Tue, 21 May 2024 16:00:51 +0200 Subject: [PATCH 205/392] Extract common counter logic to method --- src/Databases/DatabaseLazy.cpp | 31 ++++++++++++------------------- src/Databases/DatabasesCommon.cpp | 31 +++++++++++++------------------ 2 files changed, 25 insertions(+), 37 deletions(-) diff --git a/src/Databases/DatabaseLazy.cpp b/src/Databases/DatabaseLazy.cpp index ca985b5a7c8..a27e69c7e63 100644 --- a/src/Databases/DatabaseLazy.cpp +++ b/src/Databases/DatabaseLazy.cpp @@ -174,6 +174,16 @@ bool DatabaseLazy::empty() const return tables_cache.empty(); } +static CurrentMetrics::Metric get_attached_count_metric_for_storage(const StoragePtr & storage) { + if (storage->isView()) { + return CurrentMetrics::AttachedView; + } else if (storage->isDictionary()) { + return CurrentMetrics::AttachedDictionary; + } else { + return CurrentMetrics::AttachedTable; + } +} + void DatabaseLazy::attachTable(ContextPtr /* context_ */, const String & table_name, const StoragePtr & table, const String &) { LOG_DEBUG(log, "Attach table {}.", backQuote(table_name)); @@ -187,16 +197,7 @@ void DatabaseLazy::attachTable(ContextPtr /* context_ */, const String & table_n throw Exception(ErrorCodes::TABLE_ALREADY_EXISTS, "Table {}.{} already exists.", backQuote(database_name), backQuote(table_name)); it->second.expiration_iterator = cache_expiration_queue.emplace(cache_expiration_queue.end(), current_time, table_name); - CurrentMetrics::Metric metric; - if (table->isView()) { - metric = CurrentMetrics::AttachedView; - } else if (table->isDictionary()) { - metric = CurrentMetrics::AttachedDictionary; - } else { - metric = CurrentMetrics::AttachedTable; - } - CurrentMetrics::add(metric, 1); - + CurrentMetrics::add(get_attached_count_metric_for_storage(table), 1); } StoragePtr DatabaseLazy::detachTable(ContextPtr /* context */, const String & table_name) @@ -212,15 +213,7 @@ StoragePtr DatabaseLazy::detachTable(ContextPtr /* context */, const String & ta if (it->second.expiration_iterator != cache_expiration_queue.end()) cache_expiration_queue.erase(it->second.expiration_iterator); tables_cache.erase(it); - CurrentMetrics::Metric metric; - if (res->isView()) { - metric = CurrentMetrics::AttachedView; - } else if (res->isDictionary()) { - metric = CurrentMetrics::AttachedDictionary; - } else { - metric = CurrentMetrics::AttachedTable; - } - CurrentMetrics::sub(metric, 1); + CurrentMetrics::sub(get_attached_count_metric_for_storage(res), 1); } return res; } diff --git a/src/Databases/DatabasesCommon.cpp b/src/Databases/DatabasesCommon.cpp index ab7f2fff5aa..03a8feb845f 100644 --- a/src/Databases/DatabasesCommon.cpp +++ b/src/Databases/DatabasesCommon.cpp @@ -254,6 +254,17 @@ StoragePtr DatabaseWithOwnTablesBase::detachTable(ContextPtr /* context_ */, con return detachTableUnlocked(table_name); } + +static CurrentMetrics::Metric get_attached_count_metric_for_storage(const StoragePtr & storage) { + if (storage->isView()) { + return CurrentMetrics::AttachedView; + } else if (storage->isDictionary()) { + return CurrentMetrics::AttachedDictionary; + } else { + return CurrentMetrics::AttachedTable; + } +} + StoragePtr DatabaseWithOwnTablesBase::detachTableUnlocked(const String & table_name) { StoragePtr res; @@ -265,15 +276,7 @@ StoragePtr DatabaseWithOwnTablesBase::detachTableUnlocked(const String & table_n res = it->second; tables.erase(it); res->is_detached = true; - 
CurrentMetrics::Metric metric; - if (res->isView()) { - metric = CurrentMetrics::AttachedView; - } else if (res->isDictionary()) { - metric = CurrentMetrics::AttachedDictionary; - } else { - metric = CurrentMetrics::AttachedTable; - } - CurrentMetrics::sub(metric, 1); + CurrentMetrics::sub(get_attached_count_metric_for_storage(res), 1); auto table_id = res->getStorageID(); if (table_id.hasUUID()) @@ -314,15 +317,7 @@ void DatabaseWithOwnTablesBase::attachTableUnlocked(const String & table_name, c /// It is important to reset is_detached here since in case of RENAME in /// non-Atomic database the is_detached is set to true before RENAME. table->is_detached = false; - CurrentMetrics::Metric metric; - if (table->isView()) { - metric = CurrentMetrics::AttachedView; - } else if (table->isDictionary()) { - metric = CurrentMetrics::AttachedDictionary; - } else { - metric = CurrentMetrics::AttachedTable; - } - CurrentMetrics::add(metric, 1); + CurrentMetrics::add(get_attached_count_metric_for_storage(table), 1); } void DatabaseWithOwnTablesBase::shutdown() From 98b89323c8239ce71153f88f6232806993b1a411 Mon Sep 17 00:00:00 2001 From: Alexander Gololobov Date: Tue, 21 May 2024 16:14:48 +0200 Subject: [PATCH 206/392] Pass virtual columns descriptions to writer --- .../MergeTree/IMergeTreeDataPartWriter.cpp | 16 ++++++++++------ .../MergeTree/IMergeTreeDataPartWriter.h | 4 ++++ .../MergeTree/MergeTreeDataPartCompact.cpp | 3 ++- src/Storages/MergeTree/MergeTreeDataPartWide.cpp | 3 ++- .../MergeTree/MergeTreeDataPartWriterCompact.cpp | 3 ++- .../MergeTree/MergeTreeDataPartWriterCompact.h | 1 + .../MergeTree/MergeTreeDataPartWriterOnDisk.cpp | 3 ++- .../MergeTree/MergeTreeDataPartWriterOnDisk.h | 1 + .../MergeTree/MergeTreeDataPartWriterWide.cpp | 3 ++- .../MergeTree/MergeTreeDataPartWriterWide.h | 1 + .../MergeTree/MergedBlockOutputStream.cpp | 3 ++- .../MergeTree/MergedColumnOnlyOutputStream.cpp | 1 + 12 files changed, 30 insertions(+), 12 deletions(-) diff --git a/src/Storages/MergeTree/IMergeTreeDataPartWriter.cpp b/src/Storages/MergeTree/IMergeTreeDataPartWriter.cpp index b3e33e94073..27da53de9b0 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPartWriter.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPartWriter.cpp @@ -52,6 +52,7 @@ IMergeTreeDataPartWriter::IMergeTreeDataPartWriter( const MergeTreeSettingsPtr & storage_settings_, const NamesAndTypesList & columns_list_, const StorageMetadataPtr & metadata_snapshot_, + const VirtualsDescriptionPtr virtual_columns_, const MergeTreeWriterSettings & settings_, const MergeTreeIndexGranularity & index_granularity_) : data_part_name(data_part_name_) @@ -59,6 +60,7 @@ IMergeTreeDataPartWriter::IMergeTreeDataPartWriter( , index_granularity_info(index_granularity_info_) , storage_settings(storage_settings_) , metadata_snapshot(metadata_snapshot_) + , virtual_columns(virtual_columns_) , columns_list(columns_list_) , settings(settings_) , with_final_mark(settings.can_use_adaptive_granularity) @@ -95,10 +97,9 @@ ASTPtr IMergeTreeDataPartWriter::getCodecDescOrDefault(const String & column_nam if (const auto * column_desc = columns.tryGet(column_name)) return get_codec_or_default(*column_desc); -///// TODO: is this needed? 
-// if (const auto * virtual_desc = virtual_columns->tryGetDescription(column_name)) -// return get_codec_or_default(*virtual_desc); -// + if (const auto * virtual_desc = virtual_columns->tryGetDescription(column_name)) + return get_codec_or_default(*virtual_desc); + return default_codec->getFullCodecDesc(); } @@ -115,6 +116,7 @@ MergeTreeDataPartWriterPtr createMergeTreeDataPartCompactWriter( const MergeTreeSettingsPtr & storage_settings_, const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot, + const VirtualsDescriptionPtr virtual_columns, const std::vector & indices_to_recalc, const Statistics & stats_to_recalc_, const String & marks_file_extension_, @@ -131,6 +133,7 @@ MergeTreeDataPartWriterPtr createMergeTreeDataPartWideWriter( const MergeTreeSettingsPtr & storage_settings_, const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot, + const VirtualsDescriptionPtr virtual_columns, const std::vector & indices_to_recalc, const Statistics & stats_to_recalc_, const String & marks_file_extension_, @@ -149,6 +152,7 @@ MergeTreeDataPartWriterPtr createMergeTreeDataPartWriter( const MergeTreeSettingsPtr & storage_settings_, const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot, + const VirtualsDescriptionPtr virtual_columns, const std::vector & indices_to_recalc, const Statistics & stats_to_recalc_, const String & marks_file_extension_, @@ -158,11 +162,11 @@ MergeTreeDataPartWriterPtr createMergeTreeDataPartWriter( { if (part_type == MergeTreeDataPartType::Compact) return createMergeTreeDataPartCompactWriter(data_part_name_, logger_name_, serializations_, data_part_storage_, - index_granularity_info_, storage_settings_, columns_list, metadata_snapshot, indices_to_recalc, stats_to_recalc_, + index_granularity_info_, storage_settings_, columns_list, metadata_snapshot, virtual_columns, indices_to_recalc, stats_to_recalc_, marks_file_extension_, default_codec_, writer_settings, computed_index_granularity); else if (part_type == MergeTreeDataPartType::Wide) return createMergeTreeDataPartWideWriter(data_part_name_, logger_name_, serializations_, data_part_storage_, - index_granularity_info_, storage_settings_, columns_list, metadata_snapshot, indices_to_recalc, stats_to_recalc_, + index_granularity_info_, storage_settings_, columns_list, metadata_snapshot, virtual_columns, indices_to_recalc, stats_to_recalc_, marks_file_extension_, default_codec_, writer_settings, computed_index_granularity); else throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown part type: {}", part_type.toString()); diff --git a/src/Storages/MergeTree/IMergeTreeDataPartWriter.h b/src/Storages/MergeTree/IMergeTreeDataPartWriter.h index d2bf03483c9..5dcc7ddc599 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPartWriter.h +++ b/src/Storages/MergeTree/IMergeTreeDataPartWriter.h @@ -7,6 +7,7 @@ #include #include #include +#include namespace DB @@ -29,6 +30,7 @@ public: const MergeTreeSettingsPtr & storage_settings_, const NamesAndTypesList & columns_list_, const StorageMetadataPtr & metadata_snapshot_, + const VirtualsDescriptionPtr virtual_columns_, const MergeTreeWriterSettings & settings_, const MergeTreeIndexGranularity & index_granularity_ = {}); @@ -56,6 +58,7 @@ protected: const MergeTreeIndexGranularityInfo index_granularity_info; const MergeTreeSettingsPtr storage_settings; const StorageMetadataPtr metadata_snapshot; + const VirtualsDescriptionPtr virtual_columns; const NamesAndTypesList columns_list; const MergeTreeWriterSettings 
settings; const bool with_final_mark; @@ -77,6 +80,7 @@ MergeTreeDataPartWriterPtr createMergeTreeDataPartWriter( const MergeTreeSettingsPtr & storage_settings_, const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot, + const VirtualsDescriptionPtr virtual_columns_, const std::vector & indices_to_recalc, const Statistics & stats_to_recalc_, const String & marks_file_extension, diff --git a/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp b/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp index fb1c2fe35ed..332b7d04f7f 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp @@ -56,6 +56,7 @@ MergeTreeDataPartWriterPtr createMergeTreeDataPartCompactWriter( const MergeTreeSettingsPtr & storage_settings_, const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot, + const VirtualsDescriptionPtr virtual_columns, const std::vector & indices_to_recalc, const Statistics & stats_to_recalc_, const String & marks_file_extension_, @@ -75,7 +76,7 @@ MergeTreeDataPartWriterPtr createMergeTreeDataPartCompactWriter( //// return std::make_unique( data_part_name_, logger_name_, serializations_, data_part_storage_, - index_granularity_info_, storage_settings_, columns_list, metadata_snapshot, + index_granularity_info_, storage_settings_, columns_list, metadata_snapshot, virtual_columns, indices_to_recalc, stats_to_recalc_, marks_file_extension_, default_codec_, writer_settings, computed_index_granularity); } diff --git a/src/Storages/MergeTree/MergeTreeDataPartWide.cpp b/src/Storages/MergeTree/MergeTreeDataPartWide.cpp index 74cab30064a..d4630d3dd3f 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWide.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWide.cpp @@ -62,6 +62,7 @@ MergeTreeDataPartWriterPtr createMergeTreeDataPartWideWriter( const MergeTreeSettingsPtr & storage_settings_, const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot, + const VirtualsDescriptionPtr virtual_columns, const std::vector & indices_to_recalc, const Statistics & stats_to_recalc_, const String & marks_file_extension_, @@ -72,7 +73,7 @@ MergeTreeDataPartWriterPtr createMergeTreeDataPartWideWriter( return std::make_unique( data_part_name_, logger_name_, serializations_, data_part_storage_, index_granularity_info_, storage_settings_, columns_list, - metadata_snapshot, indices_to_recalc, stats_to_recalc_, + metadata_snapshot, virtual_columns, indices_to_recalc, stats_to_recalc_, marks_file_extension_, default_codec_, writer_settings, computed_index_granularity); } diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp index 3f08d8eea21..328e3118ba9 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp @@ -18,6 +18,7 @@ MergeTreeDataPartWriterCompact::MergeTreeDataPartWriterCompact( const MergeTreeSettingsPtr & storage_settings_, const NamesAndTypesList & columns_list_, const StorageMetadataPtr & metadata_snapshot_, + const VirtualsDescriptionPtr virtual_columns_, const std::vector & indices_to_recalc_, const Statistics & stats_to_recalc, const String & marks_file_extension_, @@ -27,7 +28,7 @@ MergeTreeDataPartWriterCompact::MergeTreeDataPartWriterCompact( : MergeTreeDataPartWriterOnDisk( data_part_name_, logger_name_, serializations_, data_part_storage_, index_granularity_info_, storage_settings_, - columns_list_, 
metadata_snapshot_, + columns_list_, metadata_snapshot_, virtual_columns_, indices_to_recalc_, stats_to_recalc, marks_file_extension_, default_codec_, settings_, index_granularity_) , plain_file(getDataPartStorage().writeFile( diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.h b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.h index 03804ff4966..f62f060fde2 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.h +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.h @@ -19,6 +19,7 @@ public: const MergeTreeSettingsPtr & storage_settings_, const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot_, + const VirtualsDescriptionPtr virtual_columns_, const std::vector & indices_to_recalc, const Statistics & stats_to_recalc, const String & marks_file_extension, diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp index 25eb83a82c0..30f01c1acd6 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp @@ -148,6 +148,7 @@ MergeTreeDataPartWriterOnDisk::MergeTreeDataPartWriterOnDisk( const MergeTreeSettingsPtr & storage_settings_, const NamesAndTypesList & columns_list_, const StorageMetadataPtr & metadata_snapshot_, + const VirtualsDescriptionPtr virtual_columns_, const MergeTreeIndices & indices_to_recalc_, const Statistics & stats_to_recalc_, const String & marks_file_extension_, @@ -156,7 +157,7 @@ MergeTreeDataPartWriterOnDisk::MergeTreeDataPartWriterOnDisk( const MergeTreeIndexGranularity & index_granularity_) : IMergeTreeDataPartWriter( data_part_name_, serializations_, data_part_storage_, index_granularity_info_, - storage_settings_, columns_list_, metadata_snapshot_, settings_, index_granularity_) + storage_settings_, columns_list_, metadata_snapshot_, virtual_columns_, settings_, index_granularity_) , skip_indices(indices_to_recalc_) , stats(stats_to_recalc_) , marks_file_extension(marks_file_extension_) diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.h b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.h index e17724fa1d0..a60fcd43a58 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.h +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.h @@ -109,6 +109,7 @@ public: const MergeTreeSettingsPtr & storage_settings_, const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot_, + const VirtualsDescriptionPtr virtual_columns_, const std::vector & indices_to_recalc, const Statistics & stats_to_recalc_, const String & marks_file_extension, diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp index a57bf7d2037..001f09b81b3 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp @@ -84,6 +84,7 @@ MergeTreeDataPartWriterWide::MergeTreeDataPartWriterWide( const MergeTreeSettingsPtr & storage_settings_, const NamesAndTypesList & columns_list_, const StorageMetadataPtr & metadata_snapshot_, + const VirtualsDescriptionPtr virtual_columns_, const std::vector & indices_to_recalc_, const Statistics & stats_to_recalc_, const String & marks_file_extension_, @@ -93,7 +94,7 @@ MergeTreeDataPartWriterWide::MergeTreeDataPartWriterWide( : MergeTreeDataPartWriterOnDisk( data_part_name_, logger_name_, serializations_, data_part_storage_, index_granularity_info_, storage_settings_, - 
columns_list_, metadata_snapshot_, + columns_list_, metadata_snapshot_, virtual_columns_, indices_to_recalc_, stats_to_recalc_, marks_file_extension_, default_codec_, settings_, index_granularity_) { diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h index 5789213c910..8dc488788c6 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h @@ -29,6 +29,7 @@ public: const MergeTreeSettingsPtr & storage_settings_, const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot, + const VirtualsDescriptionPtr virtual_columns_, const std::vector & indices_to_recalc, const Statistics & stats_to_recalc_, const String & marks_file_extension, diff --git a/src/Storages/MergeTree/MergedBlockOutputStream.cpp b/src/Storages/MergeTree/MergedBlockOutputStream.cpp index 0fe3ee30a0d..5ef967d930a 100644 --- a/src/Storages/MergeTree/MergedBlockOutputStream.cpp +++ b/src/Storages/MergeTree/MergedBlockOutputStream.cpp @@ -55,7 +55,8 @@ MergedBlockOutputStream::MergedBlockOutputStream( data_part->name, data_part->storage.getLogName(), data_part->getSerializations(), data_part_storage, data_part->index_granularity_info, storage_settings, - columns_list, metadata_snapshot, skip_indices, statistics, data_part->getMarksFileExtension(), default_codec, writer_settings, computed_index_granularity); + columns_list, metadata_snapshot, data_part->storage.getVirtualsPtr(), + skip_indices, statistics, data_part->getMarksFileExtension(), default_codec, writer_settings, computed_index_granularity); } /// If data is pre-sorted. diff --git a/src/Storages/MergeTree/MergedColumnOnlyOutputStream.cpp b/src/Storages/MergeTree/MergedColumnOnlyOutputStream.cpp index 1c75d81eca5..1d1783b1b43 100644 --- a/src/Storages/MergeTree/MergedColumnOnlyOutputStream.cpp +++ b/src/Storages/MergeTree/MergedColumnOnlyOutputStream.cpp @@ -39,6 +39,7 @@ MergedColumnOnlyOutputStream::MergedColumnOnlyOutputStream( storage_settings, header.getNamesAndTypesList(), metadata_snapshot_, + data_part->storage.getVirtualsPtr(), indices_to_recalc, stats_to_recalc_, data_part->getMarksFileExtension(), From 372acbd3fcbb06d9cd650b785b99da346d6ce5c9 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 21 May 2024 14:15:14 +0000 Subject: [PATCH 207/392] Refactor aliases a bit. 
--- src/Analyzer/Passes/QueryAnalysisPass.cpp | 314 +++++++++++------- .../02341_analyzer_aliases_basics.reference | 1 + .../02341_analyzer_aliases_basics.sql | 2 + .../0_stateless/02343_analyzer_lambdas.sql | 8 + 4 files changed, 204 insertions(+), 121 deletions(-) diff --git a/src/Analyzer/Passes/QueryAnalysisPass.cpp b/src/Analyzer/Passes/QueryAnalysisPass.cpp index 10f2290b34f..e50ad7911a0 100644 --- a/src/Analyzer/Passes/QueryAnalysisPass.cpp +++ b/src/Analyzer/Passes/QueryAnalysisPass.cpp @@ -471,7 +471,6 @@ struct TableExpressionData return buffer.str(); } }; - class ExpressionsStack { public: @@ -586,6 +585,82 @@ private: std::unordered_map alias_name_to_expressions; }; +struct ScopeAliases +{ + /// Alias name to query expression node + std::unordered_map alias_name_to_expression_node_before_group_by; + std::unordered_map alias_name_to_expression_node_after_group_by; + + std::unordered_map * alias_name_to_expression_node = nullptr; + + /// Alias name to lambda node + std::unordered_map alias_name_to_lambda_node; + + /// Alias name to table expression node + std::unordered_map alias_name_to_table_expression_node; + + /// Expressions like `x as y` where we can't say whether it's a function, expression or table. + std::unordered_map transitive_aliases; + + /// Nodes with duplicated aliases + std::unordered_set nodes_with_duplicated_aliases; + std::vector cloned_nodes_with_duplicated_aliases; + + std::unordered_map & getAliasMap(IdentifierLookupContext lookup_context) + { + switch (lookup_context) + { + case IdentifierLookupContext::EXPRESSION: return *alias_name_to_expression_node; + case IdentifierLookupContext::FUNCTION: return alias_name_to_lambda_node; + case IdentifierLookupContext::TABLE_EXPRESSION: return alias_name_to_table_expression_node; + } + + __builtin_unreachable(); + } + + enum class FindOption + { + FIRST_NAME, + FULL_NAME, + }; + + const std::string & getKey(const Identifier & identifier, FindOption find_option) + { + switch (find_option) + { + case FindOption::FIRST_NAME: return identifier.front(); + case FindOption::FULL_NAME: return identifier.getFullName(); + } + + __builtin_unreachable(); + } + + QueryTreeNodePtr * find(IdentifierLookup lookup, FindOption find_option) + { + auto & alias_map = getAliasMap(lookup.lookup_context); + const std::string * key = &getKey(lookup.identifier, find_option); + + auto it = alias_map.find(*key); + while (it == alias_map.end()) + { + auto jt = transitive_aliases.find(*key); + if (jt == transitive_aliases.end()) + return {}; + + key = &(getKey(jt->second, find_option)); + it = alias_map.find(*key); + } + + return &it->second; + } + + const QueryTreeNodePtr * find(IdentifierLookup lookup, FindOption find_option) const + { + return const_cast(this)->find(lookup, find_option); + } +}; + + /** Projection names is name of query tree node that is used in projection part of query node. 
* Example: SELECT id FROM test_table; * `id` is projection name of column node @@ -731,7 +806,7 @@ struct IdentifierResolveScope else if (parent_scope) join_use_nulls = parent_scope->join_use_nulls; - alias_name_to_expression_node = &alias_name_to_expression_node_before_group_by; + aliases.alias_name_to_expression_node = &aliases.alias_name_to_expression_node_before_group_by; } QueryTreeNodePtr scope_node; @@ -746,17 +821,7 @@ struct IdentifierResolveScope /// Argument can be expression like constant, column, function or table expression std::unordered_map expression_argument_name_to_node; - /// Alias name to query expression node - std::unordered_map alias_name_to_expression_node_before_group_by; - std::unordered_map alias_name_to_expression_node_after_group_by; - - std::unordered_map * alias_name_to_expression_node = nullptr; - - /// Alias name to lambda node - std::unordered_map alias_name_to_lambda_node; - - /// Alias name to table expression node - std::unordered_map alias_name_to_table_expression_node; + ScopeAliases aliases; /// Table column name to column node. Valid only during table ALIAS columns resolve. ColumnNameToColumnNodeMap column_name_to_column_node; @@ -767,10 +832,6 @@ struct IdentifierResolveScope /// Window name to window node std::unordered_map window_name_to_window_node; - /// Nodes with duplicated aliases - std::unordered_set nodes_with_duplicated_aliases; - std::vector cloned_nodes_with_duplicated_aliases; - /// Current scope expression in resolve process stack ExpressionsStack expressions_in_resolve_process_stack; @@ -889,7 +950,7 @@ struct IdentifierResolveScope bool had_aggregate_function = expressions_in_resolve_process_stack.hasAggregateFunction(); expressions_in_resolve_process_stack.push(node); if (group_by_use_nulls && had_aggregate_function != expressions_in_resolve_process_stack.hasAggregateFunction()) - alias_name_to_expression_node = &alias_name_to_expression_node_before_group_by; + aliases.alias_name_to_expression_node = &aliases.alias_name_to_expression_node_before_group_by; } void popExpressionNode() @@ -897,7 +958,7 @@ struct IdentifierResolveScope bool had_aggregate_function = expressions_in_resolve_process_stack.hasAggregateFunction(); expressions_in_resolve_process_stack.pop(); if (group_by_use_nulls && had_aggregate_function != expressions_in_resolve_process_stack.hasAggregateFunction()) - alias_name_to_expression_node = &alias_name_to_expression_node_after_group_by; + aliases.alias_name_to_expression_node = &aliases.alias_name_to_expression_node_after_group_by; } /// Dump identifier resolve scope @@ -916,16 +977,16 @@ struct IdentifierResolveScope for (const auto & [alias_name, node] : expression_argument_name_to_node) buffer << "Alias name " << alias_name << " node " << node->formatASTForErrorMessage() << '\n'; - buffer << "Alias name to expression node table size " << alias_name_to_expression_node->size() << '\n'; - for (const auto & [alias_name, node] : *alias_name_to_expression_node) + buffer << "Alias name to expression node table size " << aliases.alias_name_to_expression_node->size() << '\n'; + for (const auto & [alias_name, node] : *aliases.alias_name_to_expression_node) buffer << "Alias name " << alias_name << " expression node " << node->dumpTree() << '\n'; - buffer << "Alias name to function node table size " << alias_name_to_lambda_node.size() << '\n'; - for (const auto & [alias_name, node] : alias_name_to_lambda_node) + buffer << "Alias name to function node table size " << aliases.alias_name_to_lambda_node.size() << '\n'; + for 
(const auto & [alias_name, node] : aliases.alias_name_to_lambda_node) buffer << "Alias name " << alias_name << " lambda node " << node->formatASTForErrorMessage() << '\n'; - buffer << "Alias name to table expression node table size " << alias_name_to_table_expression_node.size() << '\n'; - for (const auto & [alias_name, node] : alias_name_to_table_expression_node) + buffer << "Alias name to table expression node table size " << aliases.alias_name_to_table_expression_node.size() << '\n'; + for (const auto & [alias_name, node] : aliases.alias_name_to_table_expression_node) buffer << "Alias name " << alias_name << " node " << node->formatASTForErrorMessage() << '\n'; buffer << "CTE name to query node table size " << cte_name_to_query_node.size() << '\n'; @@ -936,8 +997,8 @@ struct IdentifierResolveScope for (const auto & [window_name, node] : window_name_to_window_node) buffer << "CTE name " << window_name << " node " << node->formatASTForErrorMessage() << '\n'; - buffer << "Nodes with duplicated aliases size " << nodes_with_duplicated_aliases.size() << '\n'; - for (const auto & node : nodes_with_duplicated_aliases) + buffer << "Nodes with duplicated aliases size " << aliases.nodes_with_duplicated_aliases.size() << '\n'; + for (const auto & node : aliases.nodes_with_duplicated_aliases) buffer << "Alias name " << node->getAlias() << " node " << node->formatASTForErrorMessage() << '\n'; buffer << "Expression resolve process stack " << '\n'; @@ -996,8 +1057,8 @@ struct IdentifierResolveScope class QueryExpressionsAliasVisitor : public InDepthQueryTreeVisitor { public: - explicit QueryExpressionsAliasVisitor(IdentifierResolveScope & scope_) - : scope(scope_) + explicit QueryExpressionsAliasVisitor(ScopeAliases & aliases_) + : aliases(aliases_) {} void visitImpl(QueryTreeNodePtr & node) @@ -1034,10 +1095,10 @@ public: private: void addDuplicatingAlias(const QueryTreeNodePtr & node) { - scope.nodes_with_duplicated_aliases.emplace(node); + aliases.nodes_with_duplicated_aliases.emplace(node); auto cloned_node = node->clone(); - scope.cloned_nodes_with_duplicated_aliases.emplace_back(cloned_node); - scope.nodes_with_duplicated_aliases.emplace(cloned_node); + aliases.cloned_nodes_with_duplicated_aliases.emplace_back(cloned_node); + aliases.nodes_with_duplicated_aliases.emplace(cloned_node); } void updateAliasesIfNeeded(const QueryTreeNodePtr & node, bool is_lambda_node) @@ -1053,25 +1114,29 @@ private: if (is_lambda_node) { - if (scope.alias_name_to_expression_node->contains(alias)) + if (aliases.alias_name_to_expression_node->contains(alias)) addDuplicatingAlias(node); - auto [_, inserted] = scope.alias_name_to_lambda_node.insert(std::make_pair(alias, node)); + auto [_, inserted] = aliases.alias_name_to_lambda_node.insert(std::make_pair(alias, node)); if (!inserted) addDuplicatingAlias(node); return; } - if (scope.alias_name_to_lambda_node.contains(alias)) - addDuplicatingAlias(node); + if (aliases.alias_name_to_lambda_node.contains(alias)) + addDuplicatingAlias(node); - auto [_, inserted] = scope.alias_name_to_expression_node->insert(std::make_pair(alias, node)); + auto [_, inserted] = aliases.alias_name_to_expression_node->insert(std::make_pair(alias, node)); if (!inserted) - addDuplicatingAlias(node); + addDuplicatingAlias(node); + + /// If node is identifier put it into transitive aliases map. 
+ if (const auto * identifier = typeid_cast(node.get())) + aliases.transitive_aliases.insert(std::make_pair(alias, identifier->getIdentifier())); } - IdentifierResolveScope & scope; + ScopeAliases & aliases; }; class TableExpressionsAliasVisitor : public InDepthQueryTreeVisitor @@ -1118,7 +1183,7 @@ private: return; const auto & node_alias = node->getAlias(); - auto [_, inserted] = scope.alias_name_to_table_expression_node.emplace(node_alias, node); + auto [_, inserted] = scope.aliases.alias_name_to_table_expression_node.emplace(node_alias, node); if (!inserted) throw Exception(ErrorCodes::MULTIPLE_EXPRESSIONS_FOR_ALIAS, "Multiple table expressions with same alias {}. In scope {}", @@ -1189,7 +1254,7 @@ public: } case QueryTreeNodeType::TABLE_FUNCTION: { - QueryExpressionsAliasVisitor expressions_alias_visitor(scope); + QueryExpressionsAliasVisitor expressions_alias_visitor(scope.aliases); resolveTableFunction(node, scope, expressions_alias_visitor, false /*nested_table_function*/); break; } @@ -1864,7 +1929,7 @@ void QueryAnalyzer::collectScopeValidIdentifiersForTypoCorrection( if (allow_expression_identifiers) { - for (const auto & [name, expression] : *scope.alias_name_to_expression_node) + for (const auto & [name, expression] : *scope.aliases.alias_name_to_expression_node) { assert(expression); auto expression_identifier = Identifier(name); @@ -1894,13 +1959,13 @@ void QueryAnalyzer::collectScopeValidIdentifiersForTypoCorrection( { if (allow_function_identifiers) { - for (const auto & [name, _] : *scope.alias_name_to_expression_node) + for (const auto & [name, _] : *scope.aliases.alias_name_to_expression_node) valid_identifiers_result.insert(Identifier(name)); } if (allow_table_expression_identifiers) { - for (const auto & [name, _] : scope.alias_name_to_table_expression_node) + for (const auto & [name, _] : scope.aliases.alias_name_to_table_expression_node) valid_identifiers_result.insert(Identifier(name)); } } @@ -2789,21 +2854,22 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromExpressionArguments(cons bool QueryAnalyzer::tryBindIdentifierToAliases(const IdentifierLookup & identifier_lookup, const IdentifierResolveScope & scope) { - const auto & identifier_bind_part = identifier_lookup.identifier.front(); + //const auto & identifier_bind_part = identifier_lookup.identifier.front(); + return scope.aliases.find(identifier_lookup, ScopeAliases::FindOption::FIRST_NAME) != nullptr; - auto get_alias_name_to_node_map = [&]() -> const std::unordered_map & - { - if (identifier_lookup.isExpressionLookup()) - return *scope.alias_name_to_expression_node; - else if (identifier_lookup.isFunctionLookup()) - return scope.alias_name_to_lambda_node; + // auto get_alias_name_to_node_map = [&]() -> const std::unordered_map & + // { + // if (identifier_lookup.isExpressionLookup()) + // return *scope.alias_name_to_expression_node; + // else if (identifier_lookup.isFunctionLookup()) + // return scope.alias_name_to_lambda_node; - return scope.alias_name_to_table_expression_node; - }; + // return scope.alias_name_to_table_expression_node; + // }; - const auto & alias_name_to_node_map = get_alias_name_to_node_map(); + // const auto & alias_name_to_node_map = get_alias_name_to_node_map(); - return alias_name_to_node_map.contains(identifier_bind_part); + // return alias_name_to_node_map.contains(identifier_bind_part); } /** Resolve identifier from scope aliases. 
@@ -2853,23 +2919,29 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromAliases(const Identifier { const auto & identifier_bind_part = identifier_lookup.identifier.front(); - auto get_alias_name_to_node_map = [&]() -> std::unordered_map & - { - if (identifier_lookup.isExpressionLookup()) - return *scope.alias_name_to_expression_node; - else if (identifier_lookup.isFunctionLookup()) - return scope.alias_name_to_lambda_node; + // auto get_alias_name_to_node_map = [&]() -> std::unordered_map & + // { + // if (identifier_lookup.isExpressionLookup()) + // return *scope.alias_name_to_expression_node; + // else if (identifier_lookup.isFunctionLookup()) + // return scope.alias_name_to_lambda_node; - return scope.alias_name_to_table_expression_node; - }; + // return scope.alias_name_to_table_expression_node; + // }; - auto & alias_name_to_node_map = get_alias_name_to_node_map(); - auto it = alias_name_to_node_map.find(identifier_bind_part); + // auto & alias_name_to_node_map = get_alias_name_to_node_map(); + // auto it = alias_name_to_node_map.find(identifier_bind_part); - if (it == alias_name_to_node_map.end()) + // if (it == alias_name_to_node_map.end()) + // return {}; + + auto it = scope.aliases.find(identifier_lookup, ScopeAliases::FindOption::FIRST_NAME); + if (it == nullptr) return {}; - if (!it->second) + QueryTreeNodePtr & alias_node = *it; + + if (!alias_node) throw Exception(ErrorCodes::LOGICAL_ERROR, "Node with alias {} is not valid. In scope {}", identifier_bind_part, @@ -2889,14 +2961,14 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromAliases(const Identifier return {}; } - auto node_type = it->second->getNodeType(); + auto node_type = alias_node->getNodeType(); /// Resolve expression if necessary if (node_type == QueryTreeNodeType::IDENTIFIER) { - scope.pushExpressionNode(it->second); + scope.pushExpressionNode(alias_node); - auto & alias_identifier_node = it->second->as(); + auto & alias_identifier_node = alias_node->as(); auto identifier = alias_identifier_node.getIdentifier(); auto lookup_result = tryResolveIdentifier(IdentifierLookup{identifier, identifier_lookup.lookup_context}, scope, identifier_resolve_settings); if (!lookup_result.resolved_identifier) @@ -2912,7 +2984,7 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromAliases(const Identifier getHintsErrorMessageSuffix(hints)); } - it->second = lookup_result.resolved_identifier; + alias_node = lookup_result.resolved_identifier; /** During collection of aliases if node is identifier and has alias, we cannot say if it is * column or function node. Check QueryExpressionsAliasVisitor documentation for clarification. @@ -2922,33 +2994,31 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromAliases(const Identifier * If we resolved identifier node as function, we must remove identifier node alias from * expression alias map. 
*/ - if (identifier_lookup.isExpressionLookup()) - scope.alias_name_to_lambda_node.erase(identifier_bind_part); - else if (identifier_lookup.isFunctionLookup()) - scope.alias_name_to_expression_node->erase(identifier_bind_part); + // if (identifier_lookup.isExpressionLookup()) + // scope.alises.alias_name_to_lambda_node.erase(identifier_bind_part); + // else if (identifier_lookup.isFunctionLookup()) + // scope.aliases.alias_name_to_expression_node->erase(identifier_bind_part); scope.popExpressionNode(); } else if (node_type == QueryTreeNodeType::FUNCTION) { - resolveExpressionNode(it->second, scope, false /*allow_lambda_expression*/, false /*allow_table_expression*/); + resolveExpressionNode(alias_node, scope, false /*allow_lambda_expression*/, false /*allow_table_expression*/); } else if (node_type == QueryTreeNodeType::QUERY || node_type == QueryTreeNodeType::UNION) { if (identifier_resolve_settings.allow_to_resolve_subquery_during_identifier_resolution) - resolveExpressionNode(it->second, scope, false /*allow_lambda_expression*/, identifier_lookup.isTableExpressionLookup() /*allow_table_expression*/); + resolveExpressionNode(alias_node, scope, false /*allow_lambda_expression*/, identifier_lookup.isTableExpressionLookup() /*allow_table_expression*/); } - QueryTreeNodePtr result = it->second; - - if (identifier_lookup.identifier.isCompound() && result) + if (identifier_lookup.identifier.isCompound() && alias_node) { if (identifier_lookup.isExpressionLookup()) { return tryResolveIdentifierFromCompoundExpression( identifier_lookup.identifier, 1 /*identifier_bind_size*/, - it->second, + alias_node, {} /* compound_expression_source */, scope, identifier_resolve_settings.allow_to_check_join_tree /* can_be_not_found */); @@ -2963,7 +3033,7 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromAliases(const Identifier } } - return result; + return alias_node; } /** Resolve identifier from table columns. @@ -4124,10 +4194,12 @@ IdentifierResolveResult QueryAnalyzer::tryResolveIdentifier(const IdentifierLook * SELECT id FROM ( SELECT ... ) AS subquery ARRAY JOIN [0] AS id INNER JOIN second_table USING (id) * In the example, identifier `id` should be resolved into one from USING (id) column. */ - auto alias_it = scope.alias_name_to_expression_node->find(identifier_lookup.identifier.getFullName()); - if (alias_it != scope.alias_name_to_expression_node->end() && alias_it->second->getNodeType() == QueryTreeNodeType::COLUMN) + + auto alias_it = scope.aliases.find(identifier_lookup, ScopeAliases::FindOption::FULL_NAME); + //auto alias_it = scope.alias_name_to_expression_node->find(identifier_lookup.identifier.getFullName()); + if (alias_it && (*alias_it)->getNodeType() == QueryTreeNodeType::COLUMN) { - const auto & column_node = alias_it->second->as(); + const auto & column_node = (*alias_it)->as(); if (column_node.getColumnSource()->getNodeType() == QueryTreeNodeType::ARRAY_JOIN) prefer_column_name_to_alias = true; } @@ -5232,7 +5304,7 @@ ProjectionNames QueryAnalyzer::resolveLambda(const QueryTreeNodePtr & lambda_nod scope.scope_node->formatASTForErrorMessage()); /// Initialize aliases in lambda scope - QueryExpressionsAliasVisitor visitor(scope); + QueryExpressionsAliasVisitor visitor(scope.aliases); visitor.visit(lambda_to_resolve.getExpression()); /** Replace lambda arguments with new arguments. @@ -5252,8 +5324,8 @@ ProjectionNames QueryAnalyzer::resolveLambda(const QueryTreeNodePtr & lambda_nod const auto & lambda_argument_name = lambda_argument_identifier ? 
lambda_argument_identifier->getIdentifier().getFullName() : lambda_argument_column->getColumnName(); - bool has_expression_node = scope.alias_name_to_expression_node->contains(lambda_argument_name); - bool has_alias_node = scope.alias_name_to_lambda_node.contains(lambda_argument_name); + bool has_expression_node = scope.aliases.alias_name_to_expression_node->contains(lambda_argument_name); + bool has_alias_node = scope.aliases.alias_name_to_lambda_node.contains(lambda_argument_name); if (has_expression_node || has_alias_node) { @@ -5929,7 +6001,7 @@ ProjectionNames QueryAnalyzer::resolveFunction(QueryTreeNodePtr & node, Identifi function_names = AggregateFunctionFactory::instance().getAllRegisteredNames(); possible_function_names.insert(possible_function_names.end(), function_names.begin(), function_names.end()); - for (auto & [name, lambda_node] : scope.alias_name_to_lambda_node) + for (auto & [name, lambda_node] : scope.aliases.alias_name_to_lambda_node) { if (lambda_node->getNodeType() == QueryTreeNodeType::LAMBDA) possible_function_names.push_back(name); @@ -6263,7 +6335,7 @@ ProjectionNames QueryAnalyzer::resolveExpressionNode(QueryTreeNodePtr & node, Id result_projection_names.push_back(node_alias); } - bool is_duplicated_alias = scope.nodes_with_duplicated_aliases.contains(node); + bool is_duplicated_alias = scope.aliases.nodes_with_duplicated_aliases.contains(node); if (is_duplicated_alias) scope.non_cached_identifier_lookups_during_expression_resolve.insert({Identifier{node_alias}, IdentifierLookupContext::EXPRESSION}); @@ -6287,14 +6359,14 @@ ProjectionNames QueryAnalyzer::resolveExpressionNode(QueryTreeNodePtr & node, Id * * To resolve b we need to resolve a. */ - auto it = scope.alias_name_to_expression_node->find(node_alias); - if (it != scope.alias_name_to_expression_node->end()) + auto it = scope.aliases.alias_name_to_expression_node->find(node_alias); + if (it != scope.aliases.alias_name_to_expression_node->end()) node = it->second; if (allow_lambda_expression) { - it = scope.alias_name_to_lambda_node.find(node_alias); - if (it != scope.alias_name_to_lambda_node.end()) + it = scope.aliases.alias_name_to_lambda_node.find(node_alias); + if (it != scope.aliases.alias_name_to_lambda_node.end()) node = it->second; } } @@ -6320,15 +6392,15 @@ ProjectionNames QueryAnalyzer::resolveExpressionNode(QueryTreeNodePtr & node, Id result_projection_names.push_back(projection_name_it->second); } - if (resolved_identifier_node && !node_alias.empty()) - scope.alias_name_to_lambda_node.erase(node_alias); + // if (resolved_identifier_node && !node_alias.empty()) + // scope.alias_name_to_lambda_node.erase(node_alias); if (!resolved_identifier_node && allow_lambda_expression) { resolved_identifier_node = tryResolveIdentifier({unresolved_identifier, IdentifierLookupContext::FUNCTION}, scope).resolved_identifier; - if (resolved_identifier_node && !node_alias.empty()) - scope.alias_name_to_expression_node->erase(node_alias); + // if (resolved_identifier_node && !node_alias.empty()) + // scope.alias_name_to_expression_node->erase(node_alias); } if (!resolved_identifier_node && allow_table_expression) @@ -6569,14 +6641,14 @@ ProjectionNames QueryAnalyzer::resolveExpressionNode(QueryTreeNodePtr & node, Id */ if (!node_alias.empty() && use_alias_table && !scope.group_by_use_nulls) { - auto it = scope.alias_name_to_expression_node->find(node_alias); - if (it != scope.alias_name_to_expression_node->end()) + auto it = scope.aliases.alias_name_to_expression_node->find(node_alias); + if (it != 
scope.aliases.alias_name_to_expression_node->end()) it->second = node; if (allow_lambda_expression) { - it = scope.alias_name_to_lambda_node.find(node_alias); - if (it != scope.alias_name_to_lambda_node.end()) + it = scope.aliases.alias_name_to_lambda_node.find(node_alias); + if (it != scope.aliases.alias_name_to_lambda_node.end()) it->second = node; } } @@ -6949,8 +7021,8 @@ void QueryAnalyzer::initializeQueryJoinTreeNode(QueryTreeNodePtr & join_tree_nod resolved_identifier = resolved_identifier->clone(); /// Update alias name to table expression map - auto table_expression_it = scope.alias_name_to_table_expression_node.find(from_table_identifier_alias); - if (table_expression_it != scope.alias_name_to_table_expression_node.end()) + auto table_expression_it = scope.aliases.alias_name_to_table_expression_node.find(from_table_identifier_alias); + if (table_expression_it != scope.aliases.alias_name_to_table_expression_node.end()) table_expression_it->second = resolved_identifier; auto table_expression_modifiers = from_table_identifier.getTableExpressionModifiers(); @@ -7149,7 +7221,7 @@ void QueryAnalyzer::initializeTableExpressionData(const QueryTreeNodePtr & table alias_column_resolve_scope.context = scope.context; /// Initialize aliases in alias column scope - QueryExpressionsAliasVisitor visitor(alias_column_resolve_scope); + QueryExpressionsAliasVisitor visitor(alias_column_resolve_scope.aliases); visitor.visit(alias_column_to_resolve->getExpression()); resolveExpressionNode(alias_column_resolve_scope.scope_node, @@ -7519,7 +7591,7 @@ void QueryAnalyzer::resolveArrayJoin(QueryTreeNodePtr & array_join_node, Identif for (auto & array_join_expression : array_join_nodes) { auto array_join_expression_alias = array_join_expression->getAlias(); - if (!array_join_expression_alias.empty() && scope.alias_name_to_expression_node->contains(array_join_expression_alias)) + if (!array_join_expression_alias.empty() && scope.aliases.alias_name_to_expression_node->contains(array_join_expression_alias)) throw Exception(ErrorCodes::MULTIPLE_EXPRESSIONS_FOR_ALIAS, "ARRAY JOIN expression {} with duplicate alias {}. In scope {}", array_join_expression->formatASTForErrorMessage(), @@ -7613,8 +7685,8 @@ void QueryAnalyzer::resolveArrayJoin(QueryTreeNodePtr & array_join_node, Identif array_join_nodes = std::move(array_join_column_expressions); for (auto & array_join_column_expression : array_join_nodes) { - auto it = scope.alias_name_to_expression_node->find(array_join_column_expression->getAlias()); - if (it != scope.alias_name_to_expression_node->end()) + auto it = scope.aliases.alias_name_to_expression_node->find(array_join_column_expression->getAlias()); + if (it != scope.aliases.alias_name_to_expression_node->end()) { auto & array_join_column_expression_typed = array_join_column_expression->as(); auto array_join_column = std::make_shared(array_join_column_expression_typed.getColumn(), @@ -7911,7 +7983,7 @@ void QueryAnalyzer::resolveQueryJoinTreeNode(QueryTreeNodePtr & join_tree_node, if (alias_name.empty()) return; - auto [it, inserted] = scope.alias_name_to_table_expression_node.emplace(alias_name, table_expression_node); + auto [it, inserted] = scope.aliases.alias_name_to_table_expression_node.emplace(alias_name, table_expression_node); if (!inserted) throw Exception(ErrorCodes::MULTIPLE_EXPRESSIONS_FOR_ALIAS, "Duplicate aliases {} for table expressions in FROM section are not allowed. Try to register {}. 
Already registered {}.", @@ -7980,7 +8052,7 @@ void QueryAnalyzer::resolveQuery(const QueryTreeNodePtr & query_node, Identifier throw Exception(ErrorCodes::NOT_IMPLEMENTED, "WITH TOTALS and WITH ROLLUP or CUBE are not supported together in presence of QUALIFY"); /// Initialize aliases in query node scope - QueryExpressionsAliasVisitor visitor(scope); + QueryExpressionsAliasVisitor visitor(scope.aliases); if (query_node_typed.hasWith()) visitor.visit(query_node_typed.getWithNode()); @@ -8098,7 +8170,7 @@ void QueryAnalyzer::resolveQuery(const QueryTreeNodePtr & query_node, Identifier table_expressions_visitor.visit(query_node_typed.getJoinTree()); initializeQueryJoinTreeNode(query_node_typed.getJoinTree(), scope); - scope.alias_name_to_table_expression_node.clear(); + scope.aliases.alias_name_to_table_expression_node.clear(); resolveQueryJoinTreeNode(query_node_typed.getJoinTree(), scope, visitor); } @@ -8148,10 +8220,10 @@ void QueryAnalyzer::resolveQuery(const QueryTreeNodePtr & query_node, Identifier /// Clone is needed cause aliases share subtrees. /// If not clone, the same (shared) subtree could be resolved again with different (Nullable) type /// See 03023_group_by_use_nulls_analyzer_crashes - for (auto & [key, node] : scope.alias_name_to_expression_node_before_group_by) - scope.alias_name_to_expression_node_after_group_by[key] = node->clone(); + for (auto & [key, node] : scope.aliases.alias_name_to_expression_node_before_group_by) + scope.aliases.alias_name_to_expression_node_after_group_by[key] = node->clone(); - scope.alias_name_to_expression_node = &scope.alias_name_to_expression_node_after_group_by; + scope.aliases.alias_name_to_expression_node = &scope.aliases.alias_name_to_expression_node_after_group_by; } if (query_node_typed.hasHaving()) @@ -8223,7 +8295,7 @@ void QueryAnalyzer::resolveQuery(const QueryTreeNodePtr & query_node, Identifier * After scope nodes are resolved, we can compare node with duplicate alias with * node from scope alias table. 
*/ - for (const auto & node_with_duplicated_alias : scope.cloned_nodes_with_duplicated_aliases) + for (const auto & node_with_duplicated_alias : scope.aliases.cloned_nodes_with_duplicated_aliases) { auto node = node_with_duplicated_alias; auto node_alias = node->getAlias(); @@ -8234,8 +8306,8 @@ void QueryAnalyzer::resolveQuery(const QueryTreeNodePtr & query_node, Identifier bool has_node_in_alias_table = false; - auto it = scope.alias_name_to_expression_node->find(node_alias); - if (it != scope.alias_name_to_expression_node->end()) + auto it = scope.aliases.alias_name_to_expression_node->find(node_alias); + if (it != scope.aliases.alias_name_to_expression_node->end()) { has_node_in_alias_table = true; @@ -8248,8 +8320,8 @@ void QueryAnalyzer::resolveQuery(const QueryTreeNodePtr & query_node, Identifier scope.scope_node->formatASTForErrorMessage()); } - it = scope.alias_name_to_lambda_node.find(node_alias); - if (it != scope.alias_name_to_lambda_node.end()) + it = scope.aliases.alias_name_to_lambda_node.find(node_alias); + if (it != scope.aliases.alias_name_to_lambda_node.end()) { has_node_in_alias_table = true; @@ -8294,10 +8366,10 @@ void QueryAnalyzer::resolveQuery(const QueryTreeNodePtr & query_node, Identifier /// Remove aliases from expression and lambda nodes - for (auto & [_, node] : *scope.alias_name_to_expression_node) + for (auto & [_, node] : *scope.aliases.alias_name_to_expression_node) node->removeAlias(); - for (auto & [_, node] : scope.alias_name_to_lambda_node) + for (auto & [_, node] : scope.aliases.alias_name_to_lambda_node) node->removeAlias(); query_node_typed.resolveProjectionColumns(std::move(projection_columns)); diff --git a/tests/queries/0_stateless/02341_analyzer_aliases_basics.reference b/tests/queries/0_stateless/02341_analyzer_aliases_basics.reference index 3733d6b6084..e39cdce92b0 100644 --- a/tests/queries/0_stateless/02341_analyzer_aliases_basics.reference +++ b/tests/queries/0_stateless/02341_analyzer_aliases_basics.reference @@ -17,3 +17,4 @@ Alias conflict with identifier inside expression Alias setting prefer_column_name_to_alias 0 Value +/a/b/c diff --git a/tests/queries/0_stateless/02341_analyzer_aliases_basics.sql b/tests/queries/0_stateless/02341_analyzer_aliases_basics.sql index 52a1cd1dae8..467073fc4e8 100644 --- a/tests/queries/0_stateless/02341_analyzer_aliases_basics.sql +++ b/tests/queries/0_stateless/02341_analyzer_aliases_basics.sql @@ -48,3 +48,5 @@ WITH id AS value SELECT value FROM test_table; SET prefer_column_name_to_alias = 0; DROP TABLE test_table; + +WITH path('clickhouse.com/a/b/c') AS x SELECT x AS path; diff --git a/tests/queries/0_stateless/02343_analyzer_lambdas.sql b/tests/queries/0_stateless/02343_analyzer_lambdas.sql index 0c257cf6f18..25928acb2c3 100644 --- a/tests/queries/0_stateless/02343_analyzer_lambdas.sql +++ b/tests/queries/0_stateless/02343_analyzer_lambdas.sql @@ -93,3 +93,11 @@ SELECT arrayMap(lambda(tuple(x), x + 1), [1, 2, 3]), lambda2(tuple(x), x + 1), 1 DROP TABLE test_table_tuple; DROP TABLE test_table; + +WITH x -> (lambda(x) + 1) AS lambda +SELECT lambda(1); -- {serverError UNSUPPORTED_METHOD } + +WITH + x -> (lambda1(x) + 1) AS lambda, + lambda AS lambda1 +SELECT lambda(1); -- {serverError UNSUPPORTED_METHOD } From d4430b583c4e4531ad1372fd3e40ff6bad5a414d Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Tue, 21 May 2024 16:19:14 +0200 Subject: [PATCH 208/392] Create snapshot --- utils/keeper-bench/Runner.cpp | 100 +++++++++++++++++----------------- 1 file changed, 50 insertions(+), 50 deletions(-) diff 
--git a/utils/keeper-bench/Runner.cpp b/utils/keeper-bench/Runner.cpp index 0050230b6ec..a625a7f157d 100644 --- a/utils/keeper-bench/Runner.cpp +++ b/utils/keeper-bench/Runner.cpp @@ -628,7 +628,11 @@ struct ZooKeeperRequestFromLogReader set_request->path = current_block->getPath(idx_in_block); set_request->data = current_block->getData(idx_in_block); if (auto version = current_block->getVersion(idx_in_block)) - set_request->version = *version; + { + /// we just need to make sure that the request with version that need to fail, fail when replaying + if (request_from_log.expected_result == Coordination::Error::ZBADVERSION) + set_request->version = std::numeric_limits::max(); + } request_from_log.request = set_request; break; } @@ -637,7 +641,11 @@ struct ZooKeeperRequestFromLogReader auto remove_request = std::make_shared(); remove_request->path = current_block->getPath(idx_in_block); if (auto version = current_block->getVersion(idx_in_block)) - remove_request->version = *version; + { + /// we just need to make sure that the request with version that need to fail, fail when replaying + if (request_from_log.expected_result == Coordination::Error::ZBADVERSION) + remove_request->version = std::numeric_limits::max(); + } request_from_log.request = remove_request; break; } @@ -647,7 +655,11 @@ struct ZooKeeperRequestFromLogReader auto check_request = std::make_shared(); check_request->path = current_block->getPath(idx_in_block); if (auto version = current_block->getVersion(idx_in_block)) - check_request->version = *version; + { + /// we just need to make sure that the request with version that need to fail, fail when replaying + if (request_from_log.expected_result == Coordination::Error::ZBADVERSION) + check_request->version = std::numeric_limits::max(); + } if (op_num == Coordination::OpNum::CheckNotExists) check_request->not_exists = true; request_from_log.request = check_request; @@ -791,10 +803,12 @@ struct SetupNodeCollector if (!request_from_log.expected_result.has_value()) return; + auto process_request = [&](const Coordination::ZooKeeperRequest & request, const auto expected_result) { const auto & path = request.getPath(); - if (processed_paths.contains(path)) + + if (nodes_created_during_replay.contains(path)) return; auto op_num = request.getOpNum(); @@ -804,64 +818,43 @@ struct SetupNodeCollector if (expected_result == Coordination::Error::ZNODEEXISTS) { addExpectedNode(path); - processed_paths.insert(path); } else if (expected_result == Coordination::Error::ZOK) { + nodes_created_during_replay.insert(path); /// we need to make sure ancestors exist auto position = path.find_last_of('/'); if (position != 0) { auto parent_path = path.substr(0, position); - if (!processed_paths.contains(parent_path)) - { - addExpectedNode(parent_path); - processed_paths.insert(parent_path); - } + addExpectedNode(parent_path); } - - processed_paths.insert(path); } } else if (op_num == Coordination::OpNum::Remove) { - if (expected_result == Coordination::Error::ZOK) - { + if (expected_result == Coordination::Error::ZOK || expected_result == Coordination::Error::ZBADVERSION) addExpectedNode(path); - processed_paths.insert(path); - } } else if (op_num == Coordination::OpNum::Set) { - if (expected_result == Coordination::Error::ZOK) - { + if (expected_result == Coordination::Error::ZOK || expected_result == Coordination::Error::ZBADVERSION) addExpectedNode(path); - processed_paths.insert(path); - } } else if (op_num == Coordination::OpNum::Check) { - if (expected_result == Coordination::Error::ZOK) - { + if 
(expected_result == Coordination::Error::ZOK || expected_result == Coordination::Error::ZBADVERSION) addExpectedNode(path); - processed_paths.insert(path); - } } else if (op_num == Coordination::OpNum::CheckNotExists) { - if (expected_result == Coordination::Error::ZNODEEXISTS) - { + if (expected_result == Coordination::Error::ZNODEEXISTS || expected_result == Coordination::Error::ZBADVERSION) addExpectedNode(path); - processed_paths.insert(path); - } } else if (request.isReadRequest()) { if (expected_result == Coordination::Error::ZOK) - { addExpectedNode(path); - processed_paths.insert(path); - } } }; @@ -940,7 +933,7 @@ struct SetupNodeCollector std::mutex nodes_mutex; DB::KeeperContextPtr keeper_context; Coordination::KeeperStoragePtr initial_storage; - std::unordered_set processed_paths; + std::unordered_set nodes_created_during_replay; std::optional snapshot_manager; }; @@ -979,23 +972,23 @@ void requestFromLogExecutor(std::shared_ptrtoString(), response.error, *expected_result) - << std::endl; + //if (*expected_result != response.error) + //{ + // std::cerr << fmt::format( + // "Unexpected result for {}\ngot {}, expected {}\n", request->toString(), response.error, *expected_result) + // << std::endl; - if (const auto * multi_response = dynamic_cast(&response)) - { - std::string subresponses; - for (size_t i = 0; i < multi_response->responses.size(); ++i) - { - subresponses += fmt::format("{} = {}\n", i, multi_response->responses[i]->error); - } + // if (const auto * multi_response = dynamic_cast(&response)) + // { + // std::string subresponses; + // for (size_t i = 0; i < multi_response->responses.size(); ++i) + // { + // subresponses += fmt::format("{} = {}\n", i, multi_response->responses[i]->error); + // } - std::cerr << "Subresponses\n" << subresponses << std::endl; - } - } + // std::cerr << "Subresponses\n" << subresponses << std::endl; + // } + //} } request_promise->set_value(); @@ -1049,7 +1042,7 @@ void Runner::runBenchmarkFromLog() std::unordered_map>> executor_id_to_queue; - SCOPE_EXIT({ + SCOPE_EXIT_SAFE({ for (const auto & [executor_id, executor_queue] : executor_id_to_queue) executor_queue->finish(); @@ -1262,8 +1255,15 @@ Runner::~Runner() if (pool) pool->wait(); - auto connection = getConnection(connection_infos[0], 0); - benchmark_context.cleanup(*connection); + try + { + auto connection = getConnection(connection_infos[0], 0); + benchmark_context.cleanup(*connection); + } + catch (...) + { + DB::tryLogCurrentException("While trying to clean nodes"); + } } namespace From 23eaa0de40d92d61e453a86dfa7c1a38b5d67b75 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 21 May 2024 14:28:19 +0000 Subject: [PATCH 209/392] Fix style. 
--- src/Analyzer/Passes/QueryAnalysisPass.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Analyzer/Passes/QueryAnalysisPass.cpp b/src/Analyzer/Passes/QueryAnalysisPass.cpp index e50ad7911a0..7ecb91e7972 100644 --- a/src/Analyzer/Passes/QueryAnalysisPass.cpp +++ b/src/Analyzer/Passes/QueryAnalysisPass.cpp @@ -615,7 +615,7 @@ struct ScopeAliases case IdentifierLookupContext::TABLE_EXPRESSION: return alias_name_to_table_expression_node; } - __builtin_unreachable(); + UNREACHABLE(); } enum class FindOption @@ -632,7 +632,7 @@ struct ScopeAliases case FindOption::FULL_NAME: return identifier.getFullName(); } - __builtin_unreachable(); + UNREACHABLE(); } QueryTreeNodePtr * find(IdentifierLookup lookup, FindOption find_option) From dd9bb8fe9cc2d3187906cd65e0757ae29c67f032 Mon Sep 17 00:00:00 2001 From: Francisco Javier Jurado Moreno <9376816+Beetelbrox@users.noreply.github.com> Date: Tue, 21 May 2024 16:54:28 +0200 Subject: [PATCH 210/392] Add tests --- .../02931_max_num_to_warn.reference | 2 + .../0_stateless/02931_max_num_to_warn.sql | 43 ++++++++++++++++++- 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02931_max_num_to_warn.reference b/tests/queries/0_stateless/02931_max_num_to_warn.reference index 7de998eebfa..419149b0bd2 100644 --- a/tests/queries/0_stateless/02931_max_num_to_warn.reference +++ b/tests/queries/0_stateless/02931_max_num_to_warn.reference @@ -1,3 +1,5 @@ The number of attached tables is more than 5 +The number of attached views is more than 5 +The number of attached dictionaries is more than 5 The number of attached databases is more than 2 The number of active parts is more than 10 diff --git a/tests/queries/0_stateless/02931_max_num_to_warn.sql b/tests/queries/0_stateless/02931_max_num_to_warn.sql index 23f04816d5a..4087a536cd0 100644 --- a/tests/queries/0_stateless/02931_max_num_to_warn.sql +++ b/tests/queries/0_stateless/02931_max_num_to_warn.sql @@ -13,6 +13,41 @@ CREATE TABLE IF NOT EXISTS test_max_num_to_warn_02931.test_max_num_to_warn_9 (id CREATE TABLE IF NOT EXISTS test_max_num_to_warn_02931.test_max_num_to_warn_10 (id Int32, str String) Engine=Memory; CREATE TABLE IF NOT EXISTS test_max_num_to_warn_02931.test_max_num_to_warn_11 (id Int32, str String) Engine=Memory; +CREATE VIEW IF NOT EXISTS test_max_num_to_warn_02931.test_max_num_to_warn_view_1 AS SELECT * FROM test_max_num_to_warn_02931.test_max_num_to_warn_1; +CREATE VIEW IF NOT EXISTS test_max_num_to_warn_02931.test_max_num_to_warn_view_2 AS SELECT * FROM test_max_num_to_warn_02931.test_max_num_to_warn_2; +CREATE VIEW IF NOT EXISTS test_max_num_to_warn_02931.test_max_num_to_warn_view_3 AS SELECT * FROM test_max_num_to_warn_02931.test_max_num_to_warn_3; +CREATE VIEW IF NOT EXISTS test_max_num_to_warn_02931.test_max_num_to_warn_view_4 AS SELECT * FROM test_max_num_to_warn_02931.test_max_num_to_warn_4; +CREATE VIEW IF NOT EXISTS test_max_num_to_warn_02931.test_max_num_to_warn_view_5 AS SELECT * FROM test_max_num_to_warn_02931.test_max_num_to_warn_5; +CREATE VIEW IF NOT EXISTS test_max_num_to_warn_02931.test_max_num_to_warn_view_6 AS SELECT * FROM test_max_num_to_warn_02931.test_max_num_to_warn_6; +CREATE VIEW IF NOT EXISTS test_max_num_to_warn_02931.test_max_num_to_warn_view_7 AS SELECT * FROM test_max_num_to_warn_02931.test_max_num_to_warn_7; +CREATE VIEW IF NOT EXISTS test_max_num_to_warn_02931.test_max_num_to_warn_view_8 AS SELECT * FROM test_max_num_to_warn_02931.test_max_num_to_warn_8; +CREATE VIEW IF NOT EXISTS 
test_max_num_to_warn_02931.test_max_num_to_warn_view_9 AS SELECT * FROM test_max_num_to_warn_02931.test_max_num_to_warn_9; +CREATE VIEW IF NOT EXISTS test_max_num_to_warn_02931.test_max_num_to_warn_view_10 AS SELECT * FROM test_max_num_to_warn_02931.test_max_num_to_warn_10; +CREATE VIEW IF NOT EXISTS test_max_num_to_warn_02931.test_max_num_to_warn_view_11 AS SELECT * FROM test_max_num_to_warn_02931.test_max_num_to_warn_11; + +CREATE DICTIONARY IF NOT EXISTS test_max_num_to_warn_02931.test_max_num_to_warn_dict_1 (id Int32, str String) PRIMARY KEY id +SOURCE(CLICKHOUSE(DB 'test_max_num_to_warn_02931' TABLE 'test_max_num_to_warn_1'))LAYOUT(FLAT()) LIFETIME(MIN 0 MAX 1000); +CREATE DICTIONARY IF NOT EXISTS test_max_num_to_warn_02931.test_max_num_to_warn_dict_2 (id Int32, str String) PRIMARY KEY id +SOURCE(CLICKHOUSE(DB 'test_max_num_to_warn_02931' TABLE 'test_max_num_to_warn_2'))LAYOUT(FLAT()) LIFETIME(MIN 0 MAX 1000); +CREATE DICTIONARY IF NOT EXISTS test_max_num_to_warn_02931.test_max_num_to_warn_dict_3 (id Int32, str String) PRIMARY KEY id +SOURCE(CLICKHOUSE(DB 'test_max_num_to_warn_02931' TABLE 'test_max_num_to_warn_3'))LAYOUT(FLAT()) LIFETIME(MIN 0 MAX 1000); +CREATE DICTIONARY IF NOT EXISTS test_max_num_to_warn_02931.test_max_num_to_warn_dict_4 (id Int32, str String) PRIMARY KEY id +SOURCE(CLICKHOUSE(DB 'test_max_num_to_warn_02931' TABLE 'test_max_num_to_warn_4'))LAYOUT(FLAT()) LIFETIME(MIN 0 MAX 1000); +CREATE DICTIONARY IF NOT EXISTS test_max_num_to_warn_02931.test_max_num_to_warn_dict_5 (id Int32, str String) PRIMARY KEY id +SOURCE(CLICKHOUSE(DB 'test_max_num_to_warn_02931' TABLE 'test_max_num_to_warn_5'))LAYOUT(FLAT()) LIFETIME(MIN 0 MAX 1000); +CREATE DICTIONARY IF NOT EXISTS test_max_num_to_warn_02931.test_max_num_to_warn_dict_6 (id Int32, str String) PRIMARY KEY id +SOURCE(CLICKHOUSE(DB 'test_max_num_to_warn_02931' TABLE 'test_max_num_to_warn_6'))LAYOUT(FLAT()) LIFETIME(MIN 0 MAX 1000); +CREATE DICTIONARY IF NOT EXISTS test_max_num_to_warn_02931.test_max_num_to_warn_dict_7 (id Int32, str String) PRIMARY KEY id +SOURCE(CLICKHOUSE(DB 'test_max_num_to_warn_02931' TABLE 'test_max_num_to_warn_7'))LAYOUT(FLAT()) LIFETIME(MIN 0 MAX 1000); +CREATE DICTIONARY IF NOT EXISTS test_max_num_to_warn_02931.test_max_num_to_warn_dict_8 (id Int32, str String) PRIMARY KEY id +SOURCE(CLICKHOUSE(DB 'test_max_num_to_warn_02931' TABLE 'test_max_num_to_warn_8'))LAYOUT(FLAT()) LIFETIME(MIN 0 MAX 1000); +CREATE DICTIONARY IF NOT EXISTS test_max_num_to_warn_02931.test_max_num_to_warn_dict_9 (id Int32, str String) PRIMARY KEY id +SOURCE(CLICKHOUSE(DB 'test_max_num_to_warn_02931' TABLE 'test_max_num_to_warn_9'))LAYOUT(FLAT()) LIFETIME(MIN 0 MAX 1000); +CREATE DICTIONARY IF NOT EXISTS test_max_num_to_warn_02931.test_max_num_to_warn_dict_10 (id Int32, str String) PRIMARY KEY id +SOURCE(CLICKHOUSE(DB 'test_max_num_to_warn_02931' TABLE 'test_max_num_to_warn_10'))LAYOUT(FLAT()) LIFETIME(MIN 0 MAX 1000); +CREATE DICTIONARY IF NOT EXISTS test_max_num_to_warn_02931.test_max_num_to_warn_dict_11 (id Int32, str String) PRIMARY KEY id +SOURCE(CLICKHOUSE(DB 'test_max_num_to_warn_02931' TABLE 'test_max_num_to_warn_11'))LAYOUT(FLAT()) LIFETIME(MIN 0 MAX 1000); + CREATE DATABASE IF NOT EXISTS test_max_num_to_warn_1; CREATE DATABASE IF NOT EXISTS test_max_num_to_warn_2; CREATE DATABASE IF NOT EXISTS test_max_num_to_warn_3; @@ -37,7 +72,13 @@ INSERT INTO test_max_num_to_warn_02931.test_max_num_to_warn_9 VALUES (1, 'Hello' INSERT INTO test_max_num_to_warn_02931.test_max_num_to_warn_10 VALUES (1, 'Hello'); INSERT INTO 
test_max_num_to_warn_02931.test_max_num_to_warn_11 VALUES (1, 'Hello'); -SELECT * FROM system.warnings where message in ('The number of attached tables is more than 5', 'The number of attached databases is more than 2', 'The number of active parts is more than 10'); +SELECT * FROM system.warnings where message in ( + 'The number of attached tables is more than 5', + 'The number of attached views is more than 5', + 'The number of attached dictionaries is more than 5', + 'The number of attached databases is more than 2', + 'The number of active parts is more than 10' +); DROP DATABASE IF EXISTS test_max_num_to_warn_02931; DROP DATABASE IF EXISTS test_max_num_to_warn_1; From f1f8a35bab0e9dc46aa46faa4c3be7609b77a509 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Tue, 21 May 2024 15:03:16 +0000 Subject: [PATCH 211/392] Fix #64136 --- src/Interpreters/Cache/QueryCache.cpp | 26 ++++++++++++---- src/Interpreters/Cache/QueryCache.h | 3 +- src/Interpreters/executeQuery.cpp | 4 +-- .../02494_query_cache_use_database.reference | 2 ++ .../02494_query_cache_use_database.sql | 30 +++++++++++++++++++ 5 files changed, 56 insertions(+), 9 deletions(-) create mode 100644 tests/queries/0_stateless/02494_query_cache_use_database.reference create mode 100644 tests/queries/0_stateless/02494_query_cache_use_database.sql diff --git a/src/Interpreters/Cache/QueryCache.cpp b/src/Interpreters/Cache/QueryCache.cpp index fafe50c170f..2fddbc0b044 100644 --- a/src/Interpreters/Cache/QueryCache.cpp +++ b/src/Interpreters/Cache/QueryCache.cpp @@ -177,6 +177,22 @@ ASTPtr removeQueryCacheSettings(ASTPtr ast) return transformed_ast; } +IAST::Hash calculateAstHash(ASTPtr ast, const String & current_database) +{ + ast = removeQueryCacheSettings(ast); + + /// Hash the AST, it must consider aliases (issue #56258) + constexpr bool ignore_aliases = false; + IAST::Hash ast_hash = ast->getTreeHash(ignore_aliases); + + /// Also hash the database specified via SQL `USE db`, otherwise identifiers in same query (AST) may mean different columns in different tables (issue #64136) + IAST::Hash cur_database_hash = CityHash_v1_0_2::CityHash128(current_database.data(), current_database.size()); + UInt64 low_combined = ast_hash.low64 ^ cur_database_hash.low64; + UInt64 high_combined = ast_hash.high64 ^ cur_database_hash.high64; + + return {low_combined, high_combined}; +} + String queryStringFromAST(ASTPtr ast) { WriteBufferFromOwnString buf; @@ -186,17 +202,15 @@ String queryStringFromAST(ASTPtr ast) } -/// Hashing of ASTs must consider aliases (issue #56258) -static constexpr bool ignore_aliases = false; - QueryCache::Key::Key( ASTPtr ast_, + String current_database, Block header_, std::optional user_id_, const std::vector & current_user_roles_, bool is_shared_, std::chrono::time_point expires_at_, bool is_compressed_) - : ast_hash(removeQueryCacheSettings(ast_)->getTreeHash(ignore_aliases)) + : ast_hash(calculateAstHash(ast_, current_database)) , header(header_) , user_id(user_id_) , current_user_roles(current_user_roles_) @@ -207,8 +221,8 @@ QueryCache::Key::Key( { } -QueryCache::Key::Key(ASTPtr ast_, std::optional user_id_, const std::vector & current_user_roles_) - : QueryCache::Key(ast_, {}, user_id_, current_user_roles_, false, std::chrono::system_clock::from_time_t(1), false) /// dummy values for everything != AST or user name +QueryCache::Key::Key(ASTPtr ast_, String current_database, std::optional user_id_, const std::vector & current_user_roles_) + : QueryCache::Key(ast_, current_database, {}, user_id_, current_user_roles_, 
false, std::chrono::system_clock::from_time_t(1), false) /// dummy values for everything != AST, current database, user name/roles { } diff --git a/src/Interpreters/Cache/QueryCache.h b/src/Interpreters/Cache/QueryCache.h index 814cad37f82..c234ea3d464 100644 --- a/src/Interpreters/Cache/QueryCache.h +++ b/src/Interpreters/Cache/QueryCache.h @@ -88,6 +88,7 @@ public: /// Ctor to construct a Key for writing into query cache. Key(ASTPtr ast_, + String current_database, Block header_, std::optional user_id_, const std::vector & current_user_roles_, bool is_shared_, @@ -95,7 +96,7 @@ public: bool is_compressed); /// Ctor to construct a Key for reading from query cache (this operation only needs the AST + user name). - Key(ASTPtr ast_, std::optional user_id_, const std::vector & current_user_roles_); + Key(ASTPtr ast_, String current_database, std::optional user_id_, const std::vector & current_user_roles_); bool operator==(const Key & other) const; }; diff --git a/src/Interpreters/executeQuery.cpp b/src/Interpreters/executeQuery.cpp index f1f72a4ea4a..90e6406c792 100644 --- a/src/Interpreters/executeQuery.cpp +++ b/src/Interpreters/executeQuery.cpp @@ -1102,7 +1102,7 @@ static std::tuple executeQueryImpl( { if (can_use_query_cache && settings.enable_reads_from_query_cache) { - QueryCache::Key key(ast, context->getUserID(), context->getCurrentRoles()); + QueryCache::Key key(ast, context->getCurrentDatabase(), context->getUserID(), context->getCurrentRoles()); QueryCache::Reader reader = query_cache->createReader(key); if (reader.hasCacheEntryForKey()) { @@ -1225,7 +1225,7 @@ static std::tuple executeQueryImpl( && (!ast_contains_system_tables || system_table_handling == QueryCacheSystemTableHandling::Save)) { QueryCache::Key key( - ast, res.pipeline.getHeader(), + ast, context->getCurrentDatabase(), res.pipeline.getHeader(), context->getUserID(), context->getCurrentRoles(), settings.query_cache_share_between_users, std::chrono::system_clock::now() + std::chrono::seconds(settings.query_cache_ttl), diff --git a/tests/queries/0_stateless/02494_query_cache_use_database.reference b/tests/queries/0_stateless/02494_query_cache_use_database.reference new file mode 100644 index 00000000000..1191247b6d9 --- /dev/null +++ b/tests/queries/0_stateless/02494_query_cache_use_database.reference @@ -0,0 +1,2 @@ +1 +2 diff --git a/tests/queries/0_stateless/02494_query_cache_use_database.sql b/tests/queries/0_stateless/02494_query_cache_use_database.sql new file mode 100644 index 00000000000..df560f82ebb --- /dev/null +++ b/tests/queries/0_stateless/02494_query_cache_use_database.sql @@ -0,0 +1,30 @@ +-- Tags: no-parallel, no-fasttest +-- Tag no-fasttest: Depends on OpenSSL +-- Tag no-parallel: Messes with internal cache + +-- Test for issue #64136 + +SYSTEM DROP QUERY CACHE; + +DROP DATABASE IF EXISTS db1; +DROP DATABASE IF EXISTS db2; + +CREATE DATABASE db1; +CREATE DATABASE db2; + +CREATE TABLE db1.tab(a UInt64, PRIMARY KEY a); +CREATE TABLE db2.tab(a UInt64, PRIMARY KEY a); + +INSERT INTO db1.tab values(1); +INSERT INTO db2.tab values(2); + +USE db1; +SELECT * FROM tab SETTINGS use_query_cache=1; + +USE db2; +SELECT * FROM tab SETTINGS use_query_cache=1; + +DROP DATABASE db1; +DROP DATABASE db2; + +SYSTEM DROP QUERY CACHE; From 3dbf32a558458b50bafb017d45b83446ef0ec2e8 Mon Sep 17 00:00:00 2001 From: Francisco Javier Jurado Moreno <9376816+Beetelbrox@users.noreply.github.com> Date: Tue, 21 May 2024 17:03:43 +0200 Subject: [PATCH 212/392] Remove dict creation --- 
tests/queries/0_stateless/02931_max_num_to_warn.sql | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/queries/0_stateless/02931_max_num_to_warn.sql b/tests/queries/0_stateless/02931_max_num_to_warn.sql index 4087a536cd0..1c96e017646 100644 --- a/tests/queries/0_stateless/02931_max_num_to_warn.sql +++ b/tests/queries/0_stateless/02931_max_num_to_warn.sql @@ -45,8 +45,6 @@ CREATE DICTIONARY IF NOT EXISTS test_max_num_to_warn_02931.test_max_num_to_warn_ SOURCE(CLICKHOUSE(DB 'test_max_num_to_warn_02931' TABLE 'test_max_num_to_warn_9'))LAYOUT(FLAT()) LIFETIME(MIN 0 MAX 1000); CREATE DICTIONARY IF NOT EXISTS test_max_num_to_warn_02931.test_max_num_to_warn_dict_10 (id Int32, str String) PRIMARY KEY id SOURCE(CLICKHOUSE(DB 'test_max_num_to_warn_02931' TABLE 'test_max_num_to_warn_10'))LAYOUT(FLAT()) LIFETIME(MIN 0 MAX 1000); -CREATE DICTIONARY IF NOT EXISTS test_max_num_to_warn_02931.test_max_num_to_warn_dict_11 (id Int32, str String) PRIMARY KEY id -SOURCE(CLICKHOUSE(DB 'test_max_num_to_warn_02931' TABLE 'test_max_num_to_warn_11'))LAYOUT(FLAT()) LIFETIME(MIN 0 MAX 1000); CREATE DATABASE IF NOT EXISTS test_max_num_to_warn_1; CREATE DATABASE IF NOT EXISTS test_max_num_to_warn_2; From ac7da1cc388edf03fd189bd24376c4a571c7b12a Mon Sep 17 00:00:00 2001 From: Max K Date: Tue, 21 May 2024 12:57:05 +0200 Subject: [PATCH 213/392] CI: cancel running PR wf after adding to MQ --- .github/workflows/merge_queue.yml | 3 + tests/ci/ci.py | 45 ++++++++++-- tests/ci/ci_metadata.py | 112 ++++++++++++++++++++++++++++++ tests/ci/github_helper.py | 19 +++++ 4 files changed, 172 insertions(+), 7 deletions(-) create mode 100644 tests/ci/ci_metadata.py diff --git a/.github/workflows/merge_queue.yml b/.github/workflows/merge_queue.yml index 1b6cc320ec4..97aa0db4cdb 100644 --- a/.github/workflows/merge_queue.yml +++ b/.github/workflows/merge_queue.yml @@ -22,6 +22,9 @@ jobs: clear-repository: true # to ensure correct digests fetch-depth: 0 # to get version filter: tree:0 + - name: Cancel PR workflow + run: | + python3 "$GITHUB_WORKSPACE/tests/ci/ci.py" --cancel-previous-run - name: Python unit tests run: | cd "$GITHUB_WORKSPACE/tests/ci" diff --git a/tests/ci/ci.py b/tests/ci/ci.py index 3a616c8aad6..046550c62f8 100644 --- a/tests/ci/ci.py +++ b/tests/ci/ci.py @@ -44,6 +44,7 @@ from env_helper import ( REPORT_PATH, S3_BUILDS_BUCKET, TEMP_PATH, + GITHUB_RUN_ID, ) from get_robot_token import get_best_robot_token from git_helper import GIT_PREFIX, Git @@ -52,6 +53,7 @@ from github_helper import GitHub from pr_info import PRInfo from report import ERROR, SUCCESS, BuildResult, JobReport from s3_helper import S3Helper +from ci_metadata import CiMetadata from version_helper import get_version_from_repo # pylint: disable=too-many-lines @@ -66,12 +68,12 @@ class PendingState: class CiCache: """ CI cache is a bunch of records. Record is a file stored under special location on s3. 
- The file name has following format + The file name has a format: _[]--___.ci RECORD_TYPE: - SUCCESSFUL - for successfuly finished jobs + SUCCESSFUL - for successful jobs PENDING - for pending jobs ATTRIBUTES: @@ -991,7 +993,11 @@ def normalize_check_name(check_name: str) -> str: def parse_args(parser: argparse.ArgumentParser) -> argparse.Namespace: - # FIXME: consider switching to sub_parser for configure, pre, run, post actions + parser.add_argument( + "--cancel-previous-run", + action="store_true", + help="Action that cancels previous running PR workflow if PR added into the Merge Queue", + ) parser.add_argument( "--configure", action="store_true", @@ -1000,17 +1006,19 @@ def parse_args(parser: argparse.ArgumentParser) -> argparse.Namespace: parser.add_argument( "--update-gh-statuses", action="store_true", - help="Action that recreate success GH statuses for jobs that finished successfully in past and will be skipped this time", + help="Action that recreate success GH statuses for jobs that finished successfully in past and will be " + "skipped this time", ) parser.add_argument( "--pre", action="store_true", - help="Action that executes prerequesetes for the job provided in --job-name", + help="Action that executes prerequisites for the job provided in --job-name", ) parser.add_argument( "--run", action="store_true", - help="Action that executes run action for specified --job-name. run_command must be configured for a given job name.", + help="Action that executes run action for specified --job-name. run_command must be configured for a given " + "job name.", ) parser.add_argument( "--post", @@ -1088,7 +1096,8 @@ def parse_args(parser: argparse.ArgumentParser) -> argparse.Namespace: "--rebuild-all-binaries", action="store_true", default=False, - help="[DEPRECATED. to be removed, once no wf use it] will create run config without skipping build jobs in any case, used in --configure action (for release branches)", + help="[DEPRECATED. 
to be removed, once no wf use it] will create run config without skipping build jobs in " + "any case, used in --configure action (for release branches)", ) parser.add_argument( "--commit-message", @@ -1902,6 +1911,15 @@ def _get_ext_check_name(check_name: str) -> str: return check_name_with_group +def _cancel_pr_wf(s3: S3Helper, pr_number: int) -> None: + run_id = CiMetadata(s3, pr_number).run_id + if not run_id: + print("ERROR: FIX IT: Run id has not been found!") + else: + print(f"Canceling PR workflow run_id: [{run_id}], pr: [{pr_number}]") + GitHub.cancel_wf(run_id) + + def main() -> int: logging.basicConfig(level=logging.INFO) exit_code = 0 @@ -1930,6 +1948,12 @@ def main() -> int: ### CONFIGURE action: start if args.configure: + if CI and pr_info.is_pr: + # store meta on s3 (now we need it only for PRs) + meta = CiMetadata(s3, pr_info.number) + meta.run_id = int(GITHUB_RUN_ID) + meta.push_meta() + ci_options = CiOptions.create_from_pr_message( args.commit_message or None, update_from_api=True ) @@ -2222,6 +2246,13 @@ def main() -> int: assert indata, "Run config must be provided via --infile" _update_gh_statuses_action(indata=indata, s3=s3) + ### CANCEL PREVIOUS WORKFLOW RUN + elif args.cancel_previous_run: + assert ( + pr_info.is_merge_queue + ), "Currently it's supposed to be used in MQ wf to cancel running PR wf if any" + _cancel_pr_wf(s3, pr_info.merged_pr) + ### print results _print_results(result, args.outfile, args.pretty) diff --git a/tests/ci/ci_metadata.py b/tests/ci/ci_metadata.py new file mode 100644 index 00000000000..5856e9a8501 --- /dev/null +++ b/tests/ci/ci_metadata.py @@ -0,0 +1,112 @@ +from pathlib import Path +from typing import Optional + +from env_helper import ( + S3_BUILDS_BUCKET, + TEMP_PATH, +) +from s3_helper import S3Helper + + +# pylint: disable=too-many-lines + + +class CiMetadata: + """ + CI Metadata class owns data like workflow run_id for a given pr, etc. 
+ Goal is to have everything we need to manage workflows on S3 and rely on GH api as little as possible + """ + + _S3_PREFIX = "CI_meta_v1" + _LOCAL_PATH = Path(TEMP_PATH) / "ci_meta" + _FILE_SUFFIX = ".cimd" + _FILENAME_RUN_ID = "run_id" + _FILE_SUFFIX + + def __init__( + self, + s3: S3Helper, + pr_number: Optional[int] = None, + sha: Optional[str] = None, + git_ref: Optional[str] = None, + ): + assert pr_number or (sha and git_ref) + + self.sha = sha + self.pr_number = pr_number + self.git_ref = git_ref + self.s3 = s3 + self.run_id = 0 + + if self.pr_number: + self.s3_path = f"{self._S3_PREFIX}/PRs/{self.pr_number}/" + else: + self.s3_path = f"{self._S3_PREFIX}/{self.git_ref}/{self.sha}/" + + self._updated = False + + if not self._LOCAL_PATH.exists(): + self._LOCAL_PATH.mkdir(parents=True, exist_ok=True) + + def fetch_meta(self): + """ + Fetches meta from s3 + """ + + # clean up + for file in self._LOCAL_PATH.glob("*" + self._FILE_SUFFIX): + file.unlink() + + _ = self.s3.download_files( + bucket=S3_BUILDS_BUCKET, + s3_path=self.s3_path, + file_suffix=self._FILE_SUFFIX, + local_directory=self._LOCAL_PATH, + ) + + meta_files = Path(self._LOCAL_PATH).rglob("*" + self._FILE_SUFFIX) + for file_name in meta_files: + path_in_str = str(file_name) + with open(path_in_str, "r", encoding="utf-8") as f: + # Read all lines in the file + lines = f.readlines() + assert len(lines) == 1 + if file_name.name == self._FILENAME_RUN_ID: + self.run_id = int(lines[0]) + + self._updated = True + return self + + def push_meta( + self, + ) -> None: + """ + Uploads meta on s3 + """ + assert self.run_id + print("Storing workflow meta on s3") + + local_file = self._LOCAL_PATH / self._FILENAME_RUN_ID + with open(local_file, "w", encoding="utf-8") as file: + file.write(f"{self.run_id}\n") + + _ = self.s3.upload_file( + bucket=S3_BUILDS_BUCKET, + file_path=local_file, + s3_path=self.s3_path + local_file.name, + ) + + +if __name__ == "__main__": + # TEST: + s3 = S3Helper() + a = CiMetadata(s3, 12345, "deadbeaf", "test_branch") + a.run_id = 111 + a.push_meta() + b = CiMetadata(s3, 12345, "deadbeaf", "test_branch") + assert b.fetch_meta().run_id == a.run_id + + a = CiMetadata(s3, 0, "deadbeaf", "test_branch") + a.run_id = 112 + a.push_meta() + b = CiMetadata(s3, 0, "deadbeaf", "test_branch") + assert b.fetch_meta().run_id == a.run_id diff --git a/tests/ci/github_helper.py b/tests/ci/github_helper.py index ae1eaf4c06a..81603c66bae 100644 --- a/tests/ci/github_helper.py +++ b/tests/ci/github_helper.py @@ -9,6 +9,7 @@ from time import sleep from typing import List, Optional, Tuple, Union import github +import requests # explicit reimport # pylint: disable=useless-import-alias @@ -21,6 +22,9 @@ from github.NamedUser import NamedUser as NamedUser from github.PullRequest import PullRequest as PullRequest from github.Repository import Repository as Repository +from env_helper import GITHUB_REPOSITORY +from get_robot_token import get_best_robot_token + # pylint: enable=useless-import-alias CACHE_PATH = p.join(p.dirname(p.realpath(__file__)), "gh_cache") @@ -260,3 +264,18 @@ class GitHub(github.Github): def retries(self, value: int) -> None: assert isinstance(value, int) self._retries = value + + # minimalistic static methods not using pygithub + @staticmethod + def cancel_wf(run_id, strict=False): + token = get_best_robot_token() + headers = {"Authorization": f"token {token}"} + url = f"https://api.github.com/repos/{GITHUB_REPOSITORY}/actions/runs/{run_id}/cancel" + try: + response = requests.post(url, headers=headers, timeout=10) 
+ response.raise_for_status() + print(f"NOTE: Workflow [{run_id}] has been cancelled") + except Exception as ex: + print("ERROR: Got exception executing wf cancel request", ex) + if strict: + raise ex From f815b4e037bb1ecd938ad659660f4d05326d0b7d Mon Sep 17 00:00:00 2001 From: Francisco Javier Jurado Moreno <9376816+Beetelbrox@users.noreply.github.com> Date: Tue, 21 May 2024 17:15:55 +0200 Subject: [PATCH 214/392] Fix style --- src/Databases/DatabaseLazy.cpp | 14 ++++++++++---- src/Databases/DatabasesCommon.cpp | 14 ++++++++++---- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/src/Databases/DatabaseLazy.cpp b/src/Databases/DatabaseLazy.cpp index a27e69c7e63..c95d690f331 100644 --- a/src/Databases/DatabaseLazy.cpp +++ b/src/Databases/DatabaseLazy.cpp @@ -174,12 +174,18 @@ bool DatabaseLazy::empty() const return tables_cache.empty(); } -static CurrentMetrics::Metric get_attached_count_metric_for_storage(const StoragePtr & storage) { - if (storage->isView()) { +static CurrentMetrics::Metric get_attached_count_metric_for_storage(const StoragePtr & storage) +{ + if (storage->isView()) + { return CurrentMetrics::AttachedView; - } else if (storage->isDictionary()) { + } + else if (storage->isDictionary()) + { return CurrentMetrics::AttachedDictionary; - } else { + } + else + { return CurrentMetrics::AttachedTable; } } diff --git a/src/Databases/DatabasesCommon.cpp b/src/Databases/DatabasesCommon.cpp index 03a8feb845f..ff721e8e5c4 100644 --- a/src/Databases/DatabasesCommon.cpp +++ b/src/Databases/DatabasesCommon.cpp @@ -255,12 +255,18 @@ StoragePtr DatabaseWithOwnTablesBase::detachTable(ContextPtr /* context_ */, con } -static CurrentMetrics::Metric get_attached_count_metric_for_storage(const StoragePtr & storage) { - if (storage->isView()) { +static CurrentMetrics::Metric get_attached_count_metric_for_storage(const StoragePtr & storage) +{ + if (storage->isView()) + { return CurrentMetrics::AttachedView; - } else if (storage->isDictionary()) { + } + else if (storage->isDictionary()) + { return CurrentMetrics::AttachedDictionary; - } else { + } + else + { return CurrentMetrics::AttachedTable; } } From 0106f558fb9040c97fcb7691dc5d72a144ad637b Mon Sep 17 00:00:00 2001 From: Francisco Javier Jurado Moreno <9376816+Beetelbrox@users.noreply.github.com> Date: Tue, 21 May 2024 17:19:52 +0200 Subject: [PATCH 215/392] Update limits --- .../en/operations/server-configuration-parameters/settings.md | 4 ++-- src/Core/ServerSettings.h | 4 ++-- src/Interpreters/Context.cpp | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/en/operations/server-configuration-parameters/settings.md b/docs/en/operations/server-configuration-parameters/settings.md index 4d239309886..a5fe74fd0c6 100644 --- a/docs/en/operations/server-configuration-parameters/settings.md +++ b/docs/en/operations/server-configuration-parameters/settings.md @@ -563,7 +563,7 @@ Default value: 5000 ## max\_view\_num\_to\_warn {#max-view-num-to-warn} If the number of attached views exceeds the specified value, clickhouse server will add warning messages to `system.warnings` table. -Default value: 5000 +Default value: 10000 **Example** @@ -573,7 +573,7 @@ Default value: 5000 ## max\_dictionary\_num\_to\_warn {#max-dictionary-num-to-warn} If the number of attached dictionaries exceeds the specified value, clickhouse server will add warning messages to `system.warnings` table. 
-Default value: 5000 +Default value: 1000 **Example** diff --git a/src/Core/ServerSettings.h b/src/Core/ServerSettings.h index af96ca3a557..ea0b155b22d 100644 --- a/src/Core/ServerSettings.h +++ b/src/Core/ServerSettings.h @@ -97,8 +97,8 @@ namespace DB M(UInt64, max_table_size_to_drop, 50000000000lu, "If size of a table is greater than this value (in bytes) than table could not be dropped with any DROP query.", 0) \ M(UInt64, max_partition_size_to_drop, 50000000000lu, "Same as max_table_size_to_drop, but for the partitions.", 0) \ M(UInt64, max_table_num_to_warn, 5000lu, "If number of tables is greater than this value, server will create a warning that will displayed to user.", 0) \ - M(UInt64, max_view_num_to_warn, 5000lu, "If number of views is greater than this value, server will create a warning that will displayed to user.", 0) \ - M(UInt64, max_dictionary_num_to_warn, 5000lu, "If number of dictionaries is greater than this value, server will create a warning that will displayed to user.", 0) \ + M(UInt64, max_view_num_to_warn, 10000lu, "If number of views is greater than this value, server will create a warning that will displayed to user.", 0) \ + M(UInt64, max_dictionary_num_to_warn, 1000lu, "If number of dictionaries is greater than this value, server will create a warning that will displayed to user.", 0) \ M(UInt64, max_database_num_to_warn, 1000lu, "If number of databases is greater than this value, server will create a warning that will displayed to user.", 0) \ M(UInt64, max_part_num_to_warn, 100000lu, "If number of databases is greater than this value, server will create a warning that will displayed to user.", 0) \ M(UInt64, concurrent_threads_soft_limit_num, 0, "Sets how many concurrent thread can be allocated before applying CPU pressure. Zero means unlimited.", 0) \ diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index 4c5df8ef4ea..e1d82a8f604 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -361,8 +361,8 @@ struct ContextSharedPart : boost::noncopyable /// No lock required for format_schema_path modified only during initialization std::atomic_size_t max_database_num_to_warn = 1000lu; std::atomic_size_t max_table_num_to_warn = 5000lu; - std::atomic_size_t max_view_num_to_warn = 5000lu; - std::atomic_size_t max_dictionary_num_to_warn = 5000lu; + std::atomic_size_t max_view_num_to_warn = 10000lu; + std::atomic_size_t max_dictionary_num_to_warn = 1000lu; std::atomic_size_t max_part_num_to_warn = 100000lu; String format_schema_path; /// Path to a directory that contains schema files used by input formats. String google_protos_path; /// Path to a directory that contains the proto files for the well-known Protobuf types. 
From 828885c66c8a06d24c34b0d92c6cddda3525b30f Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 21 May 2024 17:20:52 +0200 Subject: [PATCH 216/392] Fix applyNewSettings --- .../AzureBlobStorage/AzureObjectStorage.cpp | 4 +++- .../ObjectStorages/AzureBlobStorage/AzureObjectStorage.h | 3 ++- src/Disks/ObjectStorages/Cached/CachedObjectStorage.cpp | 5 +++-- src/Disks/ObjectStorages/Cached/CachedObjectStorage.h | 3 ++- src/Disks/ObjectStorages/DiskObjectStorage.cpp | 2 +- src/Disks/ObjectStorages/IObjectStorage.h | 9 +++++++-- src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp | 5 ----- src/Disks/ObjectStorages/Local/LocalObjectStorage.h | 5 ----- src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp | 5 +++-- src/Disks/ObjectStorages/S3/S3ObjectStorage.h | 3 ++- src/Disks/ObjectStorages/Web/WebObjectStorage.cpp | 5 ----- src/Disks/ObjectStorages/Web/WebObjectStorage.h | 5 ----- src/Storages/ObjectStorage/StorageObjectStorage.cpp | 5 ++--- 13 files changed, 25 insertions(+), 34 deletions(-) diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp index c09cb5e24e1..e7ecf7cd515 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp @@ -398,7 +398,9 @@ void AzureObjectStorage::copyObject( /// NOLINT dest_blob_client.CopyFromUri(source_blob_client.GetUrl(), copy_options); } -void AzureObjectStorage::applyNewSettings(const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, ContextPtr context) +void AzureObjectStorage::applyNewSettings( + const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, + ContextPtr context, const ApplyNewSettingsOptions &) { auto new_settings = getAzureBlobStorageSettings(config, config_prefix, context); settings.set(std::move(new_settings)); diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h index c38b5906f4e..e09f5e6753d 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h @@ -143,7 +143,8 @@ public: void applyNewSettings( const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, - ContextPtr context) override; + ContextPtr context, + const ApplyNewSettingsOptions & options) override; String getObjectsNamespace() const override { return object_namespace ; } diff --git a/src/Disks/ObjectStorages/Cached/CachedObjectStorage.cpp b/src/Disks/ObjectStorages/Cached/CachedObjectStorage.cpp index c834ef56644..f2f33684fde 100644 --- a/src/Disks/ObjectStorages/Cached/CachedObjectStorage.cpp +++ b/src/Disks/ObjectStorages/Cached/CachedObjectStorage.cpp @@ -192,9 +192,10 @@ void CachedObjectStorage::shutdown() } void CachedObjectStorage::applyNewSettings( - const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, ContextPtr context) + const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, + ContextPtr context, const ApplyNewSettingsOptions & options) { - object_storage->applyNewSettings(config, config_prefix, context); + object_storage->applyNewSettings(config, config_prefix, context, options); } String CachedObjectStorage::getObjectsNamespace() const diff --git a/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h b/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h index 
ed78eb90ef4..a4d263e92eb 100644 --- a/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h +++ b/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h @@ -91,7 +91,8 @@ public: void applyNewSettings( const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, - ContextPtr context) override; + ContextPtr context, + const ApplyNewSettingsOptions & options) override; String getObjectsNamespace() const override; diff --git a/src/Disks/ObjectStorages/DiskObjectStorage.cpp b/src/Disks/ObjectStorages/DiskObjectStorage.cpp index f6980d1e8f1..27e0cc78a38 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorage.cpp +++ b/src/Disks/ObjectStorages/DiskObjectStorage.cpp @@ -536,7 +536,7 @@ void DiskObjectStorage::applyNewSettings( { /// FIXME we cannot use config_prefix that was passed through arguments because the disk may be wrapped with cache and we need another name const auto config_prefix = "storage_configuration.disks." + name; - object_storage->applyNewSettings(config, config_prefix, context_); + object_storage->applyNewSettings(config, config_prefix, context_, IObjectStorage::ApplyNewSettingsOptions{ .allow_client_change = true }); { std::unique_lock lock(resource_mutex); diff --git a/src/Disks/ObjectStorages/IObjectStorage.h b/src/Disks/ObjectStorages/IObjectStorage.h index 5724ae8929c..d4ac6ea0239 100644 --- a/src/Disks/ObjectStorages/IObjectStorage.h +++ b/src/Disks/ObjectStorages/IObjectStorage.h @@ -199,10 +199,15 @@ public: virtual void startup() = 0; /// Apply new settings, in most cases reiniatilize client and some other staff + struct ApplyNewSettingsOptions + { + bool allow_client_change = true; + }; virtual void applyNewSettings( - const Poco::Util::AbstractConfiguration &, + const Poco::Util::AbstractConfiguration & /* config */, const std::string & /*config_prefix*/, - ContextPtr) {} + ContextPtr /* context */, + const ApplyNewSettingsOptions & /* options */) {} /// Sometimes object storages have something similar to chroot or namespace, for example /// buckets in S3. If object storage doesn't have any namepaces return empty string. 
diff --git a/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp b/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp index fa27e08f404..a247d86ddce 100644 --- a/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp +++ b/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp @@ -222,11 +222,6 @@ std::unique_ptr LocalObjectStorage::cloneObjectStorage( throw Exception(ErrorCodes::NOT_IMPLEMENTED, "cloneObjectStorage() is not implemented for LocalObjectStorage"); } -void LocalObjectStorage::applyNewSettings( - const Poco::Util::AbstractConfiguration & /* config */, const std::string & /* config_prefix */, ContextPtr /* context */) -{ -} - ObjectStorageKey LocalObjectStorage::generateObjectKeyForPath(const std::string & /* path */) const { constexpr size_t key_name_total_size = 32; diff --git a/src/Disks/ObjectStorages/Local/LocalObjectStorage.h b/src/Disks/ObjectStorages/Local/LocalObjectStorage.h index 4c667818c88..371cd37f8b2 100644 --- a/src/Disks/ObjectStorages/Local/LocalObjectStorage.h +++ b/src/Disks/ObjectStorages/Local/LocalObjectStorage.h @@ -73,11 +73,6 @@ public: void startup() override; - void applyNewSettings( - const Poco::Util::AbstractConfiguration & config, - const std::string & config_prefix, - ContextPtr context) override; - String getObjectsNamespace() const override { return ""; } std::unique_ptr cloneObjectStorage( diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index 7891be64b06..d18468411ea 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -572,7 +572,8 @@ void S3ObjectStorage::startup() void S3ObjectStorage::applyNewSettings( const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, - ContextPtr context) + ContextPtr context, + const ApplyNewSettingsOptions & options) { auto new_s3_settings = getSettings(config, config_prefix, context); if (!static_headers.empty()) @@ -586,7 +587,7 @@ void S3ObjectStorage::applyNewSettings( new_s3_settings->auth_settings.updateFrom(endpoint_settings->auth_settings); auto current_s3_settings = s3_settings.get(); - if (current_s3_settings->auth_settings.hasUpdates(new_s3_settings->auth_settings) || for_disk_s3) + if (options.allow_client_change && (current_s3_settings->auth_settings.hasUpdates(new_s3_settings->auth_settings) || for_disk_s3)) { auto new_client = getClient(config, config_prefix, context, *new_s3_settings, for_disk_s3, &uri); client.set(std::move(new_client)); diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.h b/src/Disks/ObjectStorages/S3/S3ObjectStorage.h index 74bc5bef3c7..1fff6d67e23 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.h +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.h @@ -149,7 +149,8 @@ public: void applyNewSettings( const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, - ContextPtr context) override; + ContextPtr context, + const ApplyNewSettingsOptions & options) override; std::string getObjectsNamespace() const override { return uri.bucket; } diff --git a/src/Disks/ObjectStorages/Web/WebObjectStorage.cpp b/src/Disks/ObjectStorages/Web/WebObjectStorage.cpp index 69f6137cd2d..e837e056acc 100644 --- a/src/Disks/ObjectStorages/Web/WebObjectStorage.cpp +++ b/src/Disks/ObjectStorages/Web/WebObjectStorage.cpp @@ -344,11 +344,6 @@ void WebObjectStorage::startup() { } -void WebObjectStorage::applyNewSettings( - const Poco::Util::AbstractConfiguration & /* config */, const std::string & /* 
config_prefix */, ContextPtr /* context */) -{ -} - ObjectMetadata WebObjectStorage::getObjectMetadata(const std::string & /* path */) const { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Metadata is not supported for {}", getName()); diff --git a/src/Disks/ObjectStorages/Web/WebObjectStorage.h b/src/Disks/ObjectStorages/Web/WebObjectStorage.h index b8ab510a6fb..9d3b9a3a8f0 100644 --- a/src/Disks/ObjectStorages/Web/WebObjectStorage.h +++ b/src/Disks/ObjectStorages/Web/WebObjectStorage.h @@ -72,11 +72,6 @@ public: void startup() override; - void applyNewSettings( - const Poco::Util::AbstractConfiguration & config, - const std::string & config_prefix, - ContextPtr context) override; - String getObjectsNamespace() const override { return ""; } std::unique_ptr cloneObjectStorage( diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index c45752c10f5..ba91f3038b6 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -87,9 +87,8 @@ bool StorageObjectStorage::supportsSubsetOfColumns(const ContextPtr & context) c void StorageObjectStorage::updateConfiguration(ContextPtr context) { - /// FIXME: we should be able to update everything apart from client if static_configuration == true. - if (!configuration->isStaticConfiguration()) - object_storage->applyNewSettings(context->getConfigRef(), configuration->getTypeName() + ".", context); + IObjectStorage::ApplyNewSettingsOptions options{ .allow_client_change = !configuration->isStaticConfiguration() }; + object_storage->applyNewSettings(context->getConfigRef(), configuration->getTypeName() + ".", context, options); } namespace From a38bb095d800686c27cdf45275af7dc7a5dde149 Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 21 May 2024 18:12:22 +0200 Subject: [PATCH 217/392] Disallow write and truncate if archive --- .../ObjectStorage/StorageObjectStorage.cpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index ba91f3038b6..b38636e9144 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -242,6 +242,13 @@ SinkToStoragePtr StorageObjectStorage::write( const auto sample_block = metadata_snapshot->getSampleBlock(); const auto & settings = configuration->getQuerySettings(local_context); + if (configuration->isArchive()) + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, + "Path '{}' contains archive. Write into archive is not supported", + configuration->getPath()); + } + if (configuration->withGlobsIgnorePartitionWildcard()) { throw Exception(ErrorCodes::DATABASE_ACCESS_DENIED, @@ -289,6 +296,13 @@ void StorageObjectStorage::truncate( ContextPtr /* context */, TableExclusiveLockHolder & /* table_holder */) { + if (configuration->isArchive()) + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, + "Path '{}' contains archive. Table cannot be truncated", + configuration->getPath()); + } + if (configuration->withGlobs()) { throw Exception( From 2bf5f0e0fdb6e4ccffad95964622b5da9107ba5b Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 21 May 2024 16:13:29 +0000 Subject: [PATCH 218/392] Fix style. 
--- src/Analyzer/Passes/QueryAnalysisPass.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/Analyzer/Passes/QueryAnalysisPass.cpp b/src/Analyzer/Passes/QueryAnalysisPass.cpp index 7ecb91e7972..52cd6207dde 100644 --- a/src/Analyzer/Passes/QueryAnalysisPass.cpp +++ b/src/Analyzer/Passes/QueryAnalysisPass.cpp @@ -614,8 +614,6 @@ struct ScopeAliases case IdentifierLookupContext::FUNCTION: return alias_name_to_lambda_node; case IdentifierLookupContext::TABLE_EXPRESSION: return alias_name_to_table_expression_node; } - - UNREACHABLE(); } enum class FindOption @@ -631,8 +629,6 @@ struct ScopeAliases case FindOption::FIRST_NAME: return identifier.front(); case FindOption::FULL_NAME: return identifier.getFullName(); } - - UNREACHABLE(); } QueryTreeNodePtr * find(IdentifierLookup lookup, FindOption find_option) From 3c4fb4f3b632ed4480e730536cb3fe976ca831d0 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Tue, 21 May 2024 16:22:13 +0000 Subject: [PATCH 219/392] Incorporate review feedback --- src/Interpreters/Cache/QueryCache.cpp | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/Interpreters/Cache/QueryCache.cpp b/src/Interpreters/Cache/QueryCache.cpp index 2fddbc0b044..e30da7f233d 100644 --- a/src/Interpreters/Cache/QueryCache.cpp +++ b/src/Interpreters/Cache/QueryCache.cpp @@ -182,15 +182,14 @@ IAST::Hash calculateAstHash(ASTPtr ast, const String & current_database) ast = removeQueryCacheSettings(ast); /// Hash the AST, it must consider aliases (issue #56258) - constexpr bool ignore_aliases = false; - IAST::Hash ast_hash = ast->getTreeHash(ignore_aliases); + SipHash hash; + ast->updateTreeHash(hash, /*ignore_aliases=*/ false); - /// Also hash the database specified via SQL `USE db`, otherwise identifiers in same query (AST) may mean different columns in different tables (issue #64136) - IAST::Hash cur_database_hash = CityHash_v1_0_2::CityHash128(current_database.data(), current_database.size()); - UInt64 low_combined = ast_hash.low64 ^ cur_database_hash.low64; - UInt64 high_combined = ast_hash.high64 ^ cur_database_hash.high64; + /// Also hash the database specified via SQL `USE db`, otherwise identifiers in same query (AST) may mean different columns in different + /// tables (issue #64136) + hash.update(current_database); - return {low_combined, high_combined}; + return getSipHash128AsPair(hash); } String queryStringFromAST(ASTPtr ast) From 532fe901293968b8dc4fa49299ff09079a9b3cd2 Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 21 May 2024 18:32:19 +0200 Subject: [PATCH 220/392] Remove redundant includes --- src/Storages/ObjectStorage/StorageObjectStorageCluster.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/Storages/ObjectStorage/StorageObjectStorageCluster.h b/src/Storages/ObjectStorage/StorageObjectStorageCluster.h index b38eb722df5..1c244b1ca36 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageCluster.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageCluster.h @@ -1,10 +1,7 @@ #pragma once - -// #include #include #include #include -// #include namespace DB { From 96715f611bd54127f43f29123b9a06757d3d7daa Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 21 May 2024 18:43:53 +0200 Subject: [PATCH 221/392] Apply change from PR #63642 (https://github.com/ClickHouse/ClickHouse/pull/63642) --- src/Storages/ObjectStorage/StorageObjectStorage.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index 
b38636e9144..dba4aedf7b7 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -131,7 +131,7 @@ public: void applyFilters(ActionDAGNodes added_filter_nodes) override { - filter_actions_dag = ActionsDAG::buildFilterActionsDAG(added_filter_nodes.nodes); + SourceStepWithFilter::applyFilters(std::move(added_filter_nodes)); const ActionsDAG::Node * predicate = nullptr; if (filter_actions_dag) predicate = filter_actions_dag->getOutputs().at(0); From c1920130bb308e2d329117113ddf6ada3da2b908 Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 21 May 2024 19:28:49 +0200 Subject: [PATCH 222/392] Apply changes from PR #62120 --- .../ObjectStorageIteratorAsync.cpp | 1 - .../ObjectStorage/StorageObjectStorage.cpp | 18 +++++++++-- .../StorageObjectStorageSource.cpp | 31 ++++++++++++++++--- .../StorageObjectStorageSource.h | 7 ++++- src/Storages/S3Queue/StorageS3Queue.cpp | 1 + 5 files changed, 49 insertions(+), 9 deletions(-) diff --git a/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.cpp b/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.cpp index 3fb615b2a5c..0420de0f8dd 100644 --- a/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.cpp +++ b/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.cpp @@ -93,7 +93,6 @@ std::future IObjectStorageIterator }, Priority{}); } - bool IObjectStorageIteratorAsync::isValid() { if (!is_initialized) diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index dba4aedf7b7..5de7f41b4f7 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -141,14 +141,28 @@ public: void initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &) override { createIterator(nullptr); + Pipes pipes; auto context = getContext(); + const size_t max_threads = context->getSettingsRef().max_threads; + size_t estimated_keys_count = iterator_wrapper->estimatedKeysCount(); + + if (estimated_keys_count > 1) + num_streams = std::min(num_streams, estimated_keys_count); + else + { + /// The amount of keys (zero) was probably underestimated. + /// We will keep one stream for this particular case. + num_streams = 1; + } + + const size_t max_parsing_threads = num_streams >= max_threads ? 
1 : (max_threads / std::max(num_streams, 1ul));
 for (size_t i = 0; i < num_streams; ++i)
 {
 auto source = std::make_shared<StorageObjectStorageSource>(
 getName(), object_storage, configuration, info, format_settings,
- context, max_block_size, iterator_wrapper, need_only_count);
+ context, max_block_size, iterator_wrapper, max_parsing_threads, need_only_count);
 source->setKeyCondition(filter_actions_dag, context);
 pipes.emplace_back(std::move(source));
@@ -175,7 +189,7 @@ private:
 const String name;
 const bool need_only_count;
 const size_t max_block_size;
- const size_t num_streams;
+ size_t num_streams;
 const bool distributed_processing;
 void createIterator(const ActionsDAG::Node * predicate)
diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp
index d3b67876224..8d946f515a3 100644
--- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp
+++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp
@@ -48,6 +48,7 @@ StorageObjectStorageSource::StorageObjectStorageSource(
 ContextPtr context_,
 UInt64 max_block_size_,
 std::shared_ptr<IIterator> file_iterator_,
+ size_t max_parsing_threads_,
 bool need_only_count_)
 : SourceWithKeyCondition(info.source_header, false)
 , WithContext(context_)
@@ -57,6 +58,7 @@ StorageObjectStorageSource::StorageObjectStorageSource(
 , format_settings(format_settings_)
 , max_block_size(max_block_size_)
 , need_only_count(need_only_count_)
+ , max_parsing_threads(max_parsing_threads_)
 , read_from_format_info(info)
 , create_reader_pool(std::make_shared<ThreadPool>(
 CurrentMetrics::StorageObjectStorageThreads,
@@ -277,8 +279,6 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade
 else
 {
 CompressionMethod compression_method;
- const auto max_parsing_threads = need_only_count ? std::optional<size_t>(1) : std::nullopt;
-
 if (auto object_info_in_archive = dynamic_cast(object_info.get()))
 {
 compression_method = chooseCompressionMethod(configuration->getPathInArchive(), configuration->compression_method);
@@ -292,9 +292,17 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade
 }
 auto input_format = FormatFactory::instance().getInput(
- configuration->format, *read_buf, read_from_format_info.format_header,
- getContext(), max_block_size, format_settings, max_parsing_threads,
- std::nullopt, /* is_remote_fs */ true, compression_method);
+ configuration->format,
+ *read_buf,
+ read_from_format_info.format_header,
+ getContext(),
+ max_block_size,
+ format_settings,
+ need_only_count ? 1 : max_parsing_threads,
+ std::nullopt,
+ true/* is_remote_fs */,
+ compression_method,
+ need_only_count);
 if (key_condition)
 input_format->setKeyCondition(key_condition);
@@ -440,6 +448,19 @@ StorageObjectStorageSource::GlobIterator::GlobIterator(
 }
 }
+size_t StorageObjectStorageSource::GlobIterator::estimatedKeysCount()
+{
+ if (object_infos.empty() && !is_finished && object_storage_iterator->isValid())
+ {
+ /// 1000 files were listed, and we cannot make any estimation of _how many more_ there are (because we list bucket lazily);
+ /// If there are more objects in the bucket, limiting the number of streams is the last thing we may want to do
+ /// as it would lead to serious slow down of the execution, since objects are going
+ /// to be fetched sequentially rather than in-parallel with up to <max_threads> times.
+ return std::numeric_limits<size_t>::max();
+ }
+ return object_infos.size();
+}
+
 StorageObjectStorage::ObjectInfoPtr StorageObjectStorageSource::GlobIterator::nextImpl(size_t processor)
 {
 std::lock_guard lock(next_mutex);
diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.h b/src/Storages/ObjectStorage/StorageObjectStorageSource.h
index fb0ad3e32f1..8dbb31fdfba 100644
--- a/src/Storages/ObjectStorage/StorageObjectStorageSource.h
+++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.h
@@ -37,6 +37,7 @@ public:
 ContextPtr context_,
 UInt64 max_block_size_,
 std::shared_ptr<IIterator> file_iterator_,
+ size_t max_parsing_threads_,
 bool need_only_count_);
 ~StorageObjectStorageSource() override;
@@ -64,6 +65,7 @@ protected:
 const std::optional<FormatSettings> format_settings;
 const UInt64 max_block_size;
 const bool need_only_count;
+ const size_t max_parsing_threads;
 const ReadFromFormatInfo read_from_format_info;
 const std::shared_ptr<ThreadPool> create_reader_pool;
@@ -165,12 +167,13 @@ public:
 ~GlobIterator() override = default;
- size_t estimatedKeysCount() override { return object_infos.size(); }
+ size_t estimatedKeysCount() override;
 private:
 ObjectInfoPtr nextImpl(size_t processor) override;
 ObjectInfoPtr nextImplUnlocked(size_t processor);
 void createFilterAST(const String & any_key);
+ void fillBufferForKey(const std::string & uri_key);
 const ObjectStoragePtr object_storage;
 const ConfigurationPtr configuration;
@@ -184,6 +187,8 @@ private:
 ActionsDAGPtr filter_dag;
 ObjectStorageIteratorPtr object_storage_iterator;
 bool recursive{false};
+ std::vector<String> expanded_keys;
+ std::vector<String>::iterator expanded_keys_iter;
 std::unique_ptr<re2::RE2> matcher;
diff --git a/src/Storages/S3Queue/StorageS3Queue.cpp b/src/Storages/S3Queue/StorageS3Queue.cpp
index 867f22ef5fe..f8eb288921c 100644
--- a/src/Storages/S3Queue/StorageS3Queue.cpp
+++ b/src/Storages/S3Queue/StorageS3Queue.cpp
@@ -359,6 +359,7 @@ std::shared_ptr<StorageObjectStorageSource> StorageS3Queue::createSource(
 local_context,
 max_block_size,
 file_iterator,
+ local_context->getSettingsRef().max_download_threads,
 false);
 auto file_deleter = [=, this](const std::string & path) mutable
From dc749325df1fa7f4d686beddd7551c30b881a0fc Mon Sep 17 00:00:00 2001
From: Robert Schulze
Date: Tue, 21 May 2024 17:31:13 +0000
Subject: [PATCH 223/392] Faaaaaaaaaster
---
 src/Interpreters/Cache/QueryCache.cpp | 4 ++--
 src/Interpreters/Cache/QueryCache.h | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/src/Interpreters/Cache/QueryCache.cpp b/src/Interpreters/Cache/QueryCache.cpp
index e30da7f233d..4b10bfd3dcd 100644
--- a/src/Interpreters/Cache/QueryCache.cpp
+++ b/src/Interpreters/Cache/QueryCache.cpp
@@ -203,7 +203,7 @@ String queryStringFromAST(ASTPtr ast)
 QueryCache::Key::Key(
 ASTPtr ast_,
- String current_database,
+ const String & current_database,
 Block header_,
 std::optional<UUID> user_id_, const std::vector<UUID> & current_user_roles_,
 bool is_shared_,
@@ -220,7 +220,7 @@ QueryCache::Key::Key(
 {
 }
-QueryCache::Key::Key(ASTPtr ast_, String current_database, std::optional<UUID> user_id_, const std::vector<UUID> & current_user_roles_)
+QueryCache::Key::Key(ASTPtr ast_, const String & current_database, std::optional<UUID> user_id_, const std::vector<UUID> & current_user_roles_)
 : QueryCache::Key(ast_, current_database, {}, user_id_, current_user_roles_, false, std::chrono::system_clock::from_time_t(1), false) /// dummy values for everything != AST, current database, user name/roles
 {
 }
diff --git a/src/Interpreters/Cache/QueryCache.h b/src/Interpreters/Cache/QueryCache.h
index c234ea3d464..b5b6f477137 100644
---
a/src/Interpreters/Cache/QueryCache.h +++ b/src/Interpreters/Cache/QueryCache.h @@ -88,7 +88,7 @@ public: /// Ctor to construct a Key for writing into query cache. Key(ASTPtr ast_, - String current_database, + const String & current_database, Block header_, std::optional user_id_, const std::vector & current_user_roles_, bool is_shared_, @@ -96,7 +96,7 @@ public: bool is_compressed); /// Ctor to construct a Key for reading from query cache (this operation only needs the AST + user name). - Key(ASTPtr ast_, String current_database, std::optional user_id_, const std::vector & current_user_roles_); + Key(ASTPtr ast_, const String & current_database, std::optional user_id_, const std::vector & current_user_roles_); bool operator==(const Key & other) const; }; From 3ff53b8a0f5b62c7d64aaff263211ec060cd3ba7 Mon Sep 17 00:00:00 2001 From: Francisco Javier Jurado Moreno <9376816+Beetelbrox@users.noreply.github.com> Date: Tue, 21 May 2024 19:38:30 +0200 Subject: [PATCH 224/392] Change double quotes in import --- src/Databases/DatabaseLazy.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Databases/DatabaseLazy.cpp b/src/Databases/DatabaseLazy.cpp index c95d690f331..b5535ff2a74 100644 --- a/src/Databases/DatabaseLazy.cpp +++ b/src/Databases/DatabaseLazy.cpp @@ -10,7 +10,7 @@ #include #include #include -#include "Common/CurrentMetrics.h" +#include #include #include From 24805423544afd3e5c47a736f0da3e47dedac293 Mon Sep 17 00:00:00 2001 From: Francisco Javier Jurado Moreno <9376816+Beetelbrox@users.noreply.github.com> Date: Tue, 21 May 2024 19:42:03 +0200 Subject: [PATCH 225/392] Order imports --- src/Databases/DatabaseLazy.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/Databases/DatabaseLazy.cpp b/src/Databases/DatabaseLazy.cpp index b5535ff2a74..7b47a1a2423 100644 --- a/src/Databases/DatabaseLazy.cpp +++ b/src/Databases/DatabaseLazy.cpp @@ -1,3 +1,10 @@ +#include +#include +#include +#include +#include +#include +#include #include #include #include @@ -10,14 +17,7 @@ #include #include #include -#include -#include -#include -#include -#include -#include -#include namespace fs = std::filesystem; From 9f71988f01aa70acccac5e1c178f1cbcb8dc74ae Mon Sep 17 00:00:00 2001 From: avogar Date: Tue, 21 May 2024 17:44:40 +0000 Subject: [PATCH 226/392] Fix tests --- src/Columns/ColumnDynamic.h | 6 +++--- .../0_stateless/03039_dynamic_all_merge_algorithms_1.sh | 2 +- .../0_stateless/03039_dynamic_all_merge_algorithms_2.sh | 2 +- .../0_stateless/03151_dynamic_type_scale_max_types.sql | 3 +++ 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/Columns/ColumnDynamic.h b/src/Columns/ColumnDynamic.h index 40e8e350733..8aece765308 100644 --- a/src/Columns/ColumnDynamic.h +++ b/src/Columns/ColumnDynamic.h @@ -96,13 +96,13 @@ public: MutableColumnPtr cloneEmpty() const override { - /// Keep current dynamic structure. - return Base::create(variant_column->cloneEmpty(), variant_info, max_dynamic_types, statistics); + /// Keep current dynamic structure but not statistics. 
+ return Base::create(variant_column->cloneEmpty(), variant_info, max_dynamic_types); } MutableColumnPtr cloneResized(size_t size) const override { - return Base::create(variant_column->cloneResized(size), variant_info, max_dynamic_types, statistics); + return Base::create(variant_column->cloneResized(size), variant_info, max_dynamic_types); } size_t size() const override diff --git a/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_1.sh b/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_1.sh index 0941f2da369..9cfd2294c8d 100755 --- a/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_1.sh +++ b/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_1.sh @@ -7,7 +7,7 @@ CLICKHOUSE_LOG_COMMENT= # shellcheck source=../shell_config.sh . "$CUR_DIR"/../shell_config.sh -CH_CLIENT="$CLICKHOUSE_CLIENT --allow_merge_tree_settings --allow_experimental_dynamic_type=1 --optimize_aggregation_in_order 0" +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_merge_tree_settings --allow_experimental_dynamic_type=1 --optimize_aggregation_in_order 0 --index_granularity_bytes 10485760 --index_granularity 8128 --merge_max_block_size 8128" function test() diff --git a/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_2.sh b/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_2.sh index f067a99ca19..02362012960 100755 --- a/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_2.sh +++ b/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_2.sh @@ -7,7 +7,7 @@ CLICKHOUSE_LOG_COMMENT= # shellcheck source=../shell_config.sh . "$CUR_DIR"/../shell_config.sh -CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_dynamic_type=1" +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_dynamic_type=1 --index_granularity_bytes 10485760 --index_granularity 8128 --merge_max_block_size 8128" function test() diff --git a/tests/queries/0_stateless/03151_dynamic_type_scale_max_types.sql b/tests/queries/0_stateless/03151_dynamic_type_scale_max_types.sql index 04322fc4f0c..632f3504fdb 100644 --- a/tests/queries/0_stateless/03151_dynamic_type_scale_max_types.sql +++ b/tests/queries/0_stateless/03151_dynamic_type_scale_max_types.sql @@ -1,4 +1,7 @@ SET allow_experimental_dynamic_type=1; +set min_compress_block_size = 585572, max_compress_block_size = 373374, max_block_size = 60768, max_joined_block_size_rows = 18966, max_insert_threads = 5, max_threads = 50, max_read_buffer_size = 708232, connect_timeout_with_failover_ms = 2000, connect_timeout_with_failover_secure_ms = 3000, idle_connection_timeout = 36000, use_uncompressed_cache = true, stream_like_engine_allow_direct_select = true, replication_wait_for_inactive_replica_timeout = 30, compile_aggregate_expressions = false, min_count_to_compile_aggregate_expression = 0, compile_sort_description = false, group_by_two_level_threshold = 1000000, group_by_two_level_threshold_bytes = 12610083, enable_memory_bound_merging_of_aggregation_results = false, min_chunk_bytes_for_parallel_parsing = 18769830, merge_tree_coarse_index_granularity = 12, min_bytes_to_use_direct_io = 10737418240, min_bytes_to_use_mmap_io = 10737418240, log_queries = true, insert_quorum_timeout = 60000, merge_tree_read_split_ranges_into_intersecting_and_non_intersecting_injection_probability = 0.05000000074505806, http_response_buffer_size = 294986, fsync_metadata = true, http_send_timeout = 60., http_receive_timeout = 60., opentelemetry_start_trace_probability = 0.10000000149011612, max_bytes_before_external_group_by = 1, max_bytes_before_external_sort = 
10737418240, max_bytes_before_remerge_sort = 1326536545, max_untracked_memory = 1048576, memory_profiler_step = 1048576, log_comment = '03151_dynamic_type_scale_max_types.sql', send_logs_level = 'fatal', prefer_localhost_replica = false, optimize_read_in_order = false, optimize_aggregation_in_order = true, aggregation_in_order_max_block_bytes = 27069500, read_in_order_two_level_merge_threshold = 75, allow_introspection_functions = true, database_atomic_wait_for_drop_and_detach_synchronously = true, remote_filesystem_read_method = 'read', local_filesystem_read_prefetch = true, remote_filesystem_read_prefetch = false, merge_tree_compact_parts_min_granules_to_multibuffer_read = 119, async_insert_busy_timeout_max_ms = 5000, read_from_filesystem_cache_if_exists_otherwise_bypass_cache = true, filesystem_cache_segments_batch_size = 10, use_page_cache_for_disks_without_file_cache = true, page_cache_inject_eviction = true, allow_prefetched_read_pool_for_remote_filesystem = false, filesystem_prefetch_step_marks = 50, filesystem_prefetch_min_bytes_for_single_read_task = 16777216, filesystem_prefetch_max_memory_usage = 134217728, filesystem_prefetches_limit = 10, optimize_sorting_by_input_stream_properties = false, allow_experimental_dynamic_type = true, session_timezone = 'Africa/Khartoum', prefer_warmed_unmerged_parts_seconds = 2; + +drop table if exists to_table; CREATE TABLE to_table ( From 51afec49107864e97eb36f9e5760efd1e11bfea8 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 21 May 2024 17:59:26 +0000 Subject: [PATCH 227/392] Fixing test. --- src/Analyzer/Passes/QueryAnalysisPass.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/Analyzer/Passes/QueryAnalysisPass.cpp b/src/Analyzer/Passes/QueryAnalysisPass.cpp index 52cd6207dde..cfea45732db 100644 --- a/src/Analyzer/Passes/QueryAnalysisPass.cpp +++ b/src/Analyzer/Passes/QueryAnalysisPass.cpp @@ -637,6 +637,10 @@ struct ScopeAliases const std::string * key = &getKey(lookup.identifier, find_option); auto it = alias_map.find(*key); + + if (it == alias_map.end() && lookup.lookup_context == IdentifierLookupContext::TABLE_EXPRESSION) + return {}; + while (it == alias_map.end()) { auto jt = transitive_aliases.find(*key); @@ -4191,7 +4195,7 @@ IdentifierResolveResult QueryAnalyzer::tryResolveIdentifier(const IdentifierLook * In the example, identifier `id` should be resolved into one from USING (id) column. 
*/ - auto alias_it = scope.aliases.find(identifier_lookup, ScopeAliases::FindOption::FULL_NAME); + auto * alias_it = scope.aliases.find(identifier_lookup, ScopeAliases::FindOption::FULL_NAME); //auto alias_it = scope.alias_name_to_expression_node->find(identifier_lookup.identifier.getFullName()); if (alias_it && (*alias_it)->getNodeType() == QueryTreeNodeType::COLUMN) { From c9d29213d8e6af3569fef6be235f0074888a0261 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Tue, 21 May 2024 21:04:28 +0200 Subject: [PATCH 228/392] Update InterpreterCreateQuery.cpp --- src/Interpreters/InterpreterCreateQuery.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 4fdd804452d..541717f1c04 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -1493,7 +1493,7 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, validateVirtualColumns(*res); - if (!res->supportsDynamicSubcolumns() && hasDynamicSubcolumns(res->getInMemoryMetadataPtr()->getColumns())) + if (!res->supportsDynamicSubcolumns() && hasDynamicSubcolumns(res->getInMemoryMetadataPtr()->getColumns()) && mode <= LoadingStrictnessLevel::CREATE) { throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot create table with column of type Object, " From 42efc4e2f641b1abec484a36aa32b2cc97e6b49d Mon Sep 17 00:00:00 2001 From: Alexander Gololobov Date: Tue, 21 May 2024 21:31:52 +0200 Subject: [PATCH 229/392] Pass column position to compact part writer --- src/Storages/MergeTree/IMergeTreeDataPart.h | 1 + .../MergeTree/IMergeTreeDataPartWriter.cpp | 4 +++- .../MergeTree/IMergeTreeDataPartWriter.h | 2 ++ .../MergeTree/MergeTreeDataPartCompact.cpp | 21 +++++++++---------- .../MergeTree/MergedBlockOutputStream.cpp | 8 +++---- .../MergedColumnOnlyOutputStream.cpp | 1 + 6 files changed, 20 insertions(+), 17 deletions(-) diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index f4889d64179..15c8760141a 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -186,6 +186,7 @@ public: /// take place, you must take original name of column for this part from /// storage and pass it to this method. std::optional getColumnPosition(const String & column_name) const; + const NameToNumber & getColumnPositions() const { return column_name_to_position; } /// Returns the name of a column with minimum compressed size (as returned by getColumnSize()). /// If no checksums are present returns the name of the first physically existing column. 
diff --git a/src/Storages/MergeTree/IMergeTreeDataPartWriter.cpp b/src/Storages/MergeTree/IMergeTreeDataPartWriter.cpp index 27da53de9b0..e8792be6293 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPartWriter.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPartWriter.cpp @@ -115,6 +115,7 @@ MergeTreeDataPartWriterPtr createMergeTreeDataPartCompactWriter( const MergeTreeIndexGranularityInfo & index_granularity_info_, const MergeTreeSettingsPtr & storage_settings_, const NamesAndTypesList & columns_list, + const ColumnPositions & column_positions, const StorageMetadataPtr & metadata_snapshot, const VirtualsDescriptionPtr virtual_columns, const std::vector & indices_to_recalc, @@ -151,6 +152,7 @@ MergeTreeDataPartWriterPtr createMergeTreeDataPartWriter( const MergeTreeIndexGranularityInfo & index_granularity_info_, const MergeTreeSettingsPtr & storage_settings_, const NamesAndTypesList & columns_list, + const ColumnPositions & column_positions, const StorageMetadataPtr & metadata_snapshot, const VirtualsDescriptionPtr virtual_columns, const std::vector & indices_to_recalc, @@ -162,7 +164,7 @@ MergeTreeDataPartWriterPtr createMergeTreeDataPartWriter( { if (part_type == MergeTreeDataPartType::Compact) return createMergeTreeDataPartCompactWriter(data_part_name_, logger_name_, serializations_, data_part_storage_, - index_granularity_info_, storage_settings_, columns_list, metadata_snapshot, virtual_columns, indices_to_recalc, stats_to_recalc_, + index_granularity_info_, storage_settings_, columns_list, column_positions, metadata_snapshot, virtual_columns, indices_to_recalc, stats_to_recalc_, marks_file_extension_, default_codec_, writer_settings, computed_index_granularity); else if (part_type == MergeTreeDataPartType::Wide) return createMergeTreeDataPartWideWriter(data_part_name_, logger_name_, serializations_, data_part_storage_, diff --git a/src/Storages/MergeTree/IMergeTreeDataPartWriter.h b/src/Storages/MergeTree/IMergeTreeDataPartWriter.h index 5dcc7ddc599..8eb546c4f2c 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPartWriter.h +++ b/src/Storages/MergeTree/IMergeTreeDataPartWriter.h @@ -69,6 +69,7 @@ protected: }; using MergeTreeDataPartWriterPtr = std::unique_ptr; +using ColumnPositions = std::unordered_map; MergeTreeDataPartWriterPtr createMergeTreeDataPartWriter( MergeTreeDataPartType part_type, @@ -79,6 +80,7 @@ MergeTreeDataPartWriterPtr createMergeTreeDataPartWriter( const MergeTreeIndexGranularityInfo & index_granularity_info_, const MergeTreeSettingsPtr & storage_settings_, const NamesAndTypesList & columns_list, + const ColumnPositions & column_positions, const StorageMetadataPtr & metadata_snapshot, const VirtualsDescriptionPtr virtual_columns_, const std::vector & indices_to_recalc, diff --git a/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp b/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp index 332b7d04f7f..98eda5573ce 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp @@ -55,6 +55,7 @@ MergeTreeDataPartWriterPtr createMergeTreeDataPartCompactWriter( const MergeTreeIndexGranularityInfo & index_granularity_info_, const MergeTreeSettingsPtr & storage_settings_, const NamesAndTypesList & columns_list, + const ColumnPositions & column_positions, const StorageMetadataPtr & metadata_snapshot, const VirtualsDescriptionPtr virtual_columns, const std::vector & indices_to_recalc, @@ -64,19 +65,17 @@ MergeTreeDataPartWriterPtr createMergeTreeDataPartCompactWriter( const MergeTreeWriterSettings & 
writer_settings, const MergeTreeIndexGranularity & computed_index_granularity) { -////// TODO: fix the order of columns -//// -//// NamesAndTypesList ordered_columns_list; -//// std::copy_if(columns_list.begin(), columns_list.end(), std::back_inserter(ordered_columns_list), -//// [this](const auto & column) { return getColumnPosition(column.name) != std::nullopt; }); -//// -//// /// Order of writing is important in compact format -//// ordered_columns_list.sort([this](const auto & lhs, const auto & rhs) -//// { return *getColumnPosition(lhs.name) < *getColumnPosition(rhs.name); }); -//// + NamesAndTypesList ordered_columns_list; + std::copy_if(columns_list.begin(), columns_list.end(), std::back_inserter(ordered_columns_list), + [&column_positions](const auto & column) { return column_positions.contains(column.name); }); + + /// Order of writing is important in compact format + ordered_columns_list.sort([&column_positions](const auto & lhs, const auto & rhs) + { return column_positions.at(lhs.name) < column_positions.at(rhs.name); }); + return std::make_unique( data_part_name_, logger_name_, serializations_, data_part_storage_, - index_granularity_info_, storage_settings_, columns_list, metadata_snapshot, virtual_columns, + index_granularity_info_, storage_settings_, ordered_columns_list, metadata_snapshot, virtual_columns, indices_to_recalc, stats_to_recalc_, marks_file_extension_, default_codec_, writer_settings, computed_index_granularity); } diff --git a/src/Storages/MergeTree/MergedBlockOutputStream.cpp b/src/Storages/MergeTree/MergedBlockOutputStream.cpp index 5ef967d930a..ee5c197336d 100644 --- a/src/Storages/MergeTree/MergedBlockOutputStream.cpp +++ b/src/Storages/MergeTree/MergedBlockOutputStream.cpp @@ -44,8 +44,6 @@ MergedBlockOutputStream::MergedBlockOutputStream( if (data_part->isStoredOnDisk()) data_part_storage->createDirectories(); -// /// We should write version metadata on part creation to distinguish it from parts that were created without transaction. -// TransactionID tid = txn ? txn->tid : Tx::PrehistoricTID; /// NOTE do not pass context for writing to system.transactions_info_log, /// because part may have temporary name (with temporary block numbers). Will write it later. 
data_part->version.setCreationTID(tid, nullptr); @@ -55,7 +53,7 @@ MergedBlockOutputStream::MergedBlockOutputStream( data_part->name, data_part->storage.getLogName(), data_part->getSerializations(), data_part_storage, data_part->index_granularity_info, storage_settings, - columns_list, metadata_snapshot, data_part->storage.getVirtualsPtr(), + columns_list, data_part->getColumnPositions(), metadata_snapshot, data_part->storage.getVirtualsPtr(), skip_indices, statistics, data_part->getMarksFileExtension(), default_codec, writer_settings, computed_index_granularity); } @@ -243,9 +241,9 @@ MergedBlockOutputStream::WrittenFiles MergedBlockOutputStream::finalizePartOnDis if (new_part->storage.format_version >= MERGE_TREE_DATA_MIN_FORMAT_VERSION_WITH_CUSTOM_PARTITIONING) { - if (auto file = new_part->partition.store(//storage, + if (auto file = new_part->partition.store( new_part->storage.getInMemoryMetadataPtr(), new_part->storage.getContext(), - new_part->getDataPartStorage(), checksums)) + new_part->getDataPartStorage(), checksums)) written_files.emplace_back(std::move(file)); if (new_part->minmax_idx->initialized) diff --git a/src/Storages/MergeTree/MergedColumnOnlyOutputStream.cpp b/src/Storages/MergeTree/MergedColumnOnlyOutputStream.cpp index 1d1783b1b43..674a9bd498f 100644 --- a/src/Storages/MergeTree/MergedColumnOnlyOutputStream.cpp +++ b/src/Storages/MergeTree/MergedColumnOnlyOutputStream.cpp @@ -38,6 +38,7 @@ MergedColumnOnlyOutputStream::MergedColumnOnlyOutputStream( data_part_storage, data_part->index_granularity_info, storage_settings, header.getNamesAndTypesList(), + data_part->getColumnPositions(), metadata_snapshot_, data_part->storage.getVirtualsPtr(), indices_to_recalc, From bb0b135c3642d2972fddc9c4e4a584dd5e246f9f Mon Sep 17 00:00:00 2001 From: Francisco Javier Jurado Moreno <9376816+Beetelbrox@users.noreply.github.com> Date: Wed, 22 May 2024 06:18:43 +0200 Subject: [PATCH 230/392] Do not decrement counter if table pointer is nut in lazy database detachtable --- src/Databases/DatabaseLazy.cpp | 34 ++++++-------------------- src/Databases/DatabasesCommon.cpp | 40 +++++++------------------------ src/Storages/Utils.cpp | 28 ++++++++++++++++++++++ src/Storages/Utils.h | 7 ++++++ 4 files changed, 51 insertions(+), 58 deletions(-) create mode 100644 src/Storages/Utils.cpp create mode 100644 src/Storages/Utils.h diff --git a/src/Databases/DatabaseLazy.cpp b/src/Databases/DatabaseLazy.cpp index 7b47a1a2423..c2fd184f8bc 100644 --- a/src/Databases/DatabaseLazy.cpp +++ b/src/Databases/DatabaseLazy.cpp @@ -1,3 +1,5 @@ +#include + #include #include #include @@ -7,7 +9,6 @@ #include #include #include -#include #include #include #include @@ -17,19 +18,12 @@ #include #include #include +#include namespace fs = std::filesystem; -namespace CurrentMetrics -{ - extern const Metric AttachedTable; - extern const Metric AttachedView; - extern const Metric AttachedDictionary; -} - - namespace DB { @@ -174,22 +168,6 @@ bool DatabaseLazy::empty() const return tables_cache.empty(); } -static CurrentMetrics::Metric get_attached_count_metric_for_storage(const StoragePtr & storage) -{ - if (storage->isView()) - { - return CurrentMetrics::AttachedView; - } - else if (storage->isDictionary()) - { - return CurrentMetrics::AttachedDictionary; - } - else - { - return CurrentMetrics::AttachedTable; - } -} - void DatabaseLazy::attachTable(ContextPtr /* context_ */, const String & table_name, const StoragePtr & table, const String &) { LOG_DEBUG(log, "Attach table {}.", backQuote(table_name)); @@ -203,7 +181,7 @@ 
void DatabaseLazy::attachTable(ContextPtr /* context_ */, const String & table_n throw Exception(ErrorCodes::TABLE_ALREADY_EXISTS, "Table {}.{} already exists.", backQuote(database_name), backQuote(table_name)); it->second.expiration_iterator = cache_expiration_queue.emplace(cache_expiration_queue.end(), current_time, table_name); - CurrentMetrics::add(get_attached_count_metric_for_storage(table), 1); + CurrentMetrics::add(getAttachedCounterForStorage(table), 1); } StoragePtr DatabaseLazy::detachTable(ContextPtr /* context */, const String & table_name) @@ -219,7 +197,9 @@ StoragePtr DatabaseLazy::detachTable(ContextPtr /* context */, const String & ta if (it->second.expiration_iterator != cache_expiration_queue.end()) cache_expiration_queue.erase(it->second.expiration_iterator); tables_cache.erase(it); - CurrentMetrics::sub(get_attached_count_metric_for_storage(res), 1); + if (res != nullptr) { + CurrentMetrics::sub(getAttachedCounterForStorage(res), 1); + } } return res; } diff --git a/src/Databases/DatabasesCommon.cpp b/src/Databases/DatabasesCommon.cpp index ff721e8e5c4..5fee14ecc2a 100644 --- a/src/Databases/DatabasesCommon.cpp +++ b/src/Databases/DatabasesCommon.cpp @@ -1,4 +1,10 @@ #include + +#include +#include +#include +#include +#include #include #include #include @@ -8,19 +14,8 @@ #include #include #include -#include -#include -#include +#include #include -#include -#include - -namespace CurrentMetrics -{ - extern const Metric AttachedTable; - extern const Metric AttachedView; - extern const Metric AttachedDictionary; -} namespace DB @@ -254,23 +249,6 @@ StoragePtr DatabaseWithOwnTablesBase::detachTable(ContextPtr /* context_ */, con return detachTableUnlocked(table_name); } - -static CurrentMetrics::Metric get_attached_count_metric_for_storage(const StoragePtr & storage) -{ - if (storage->isView()) - { - return CurrentMetrics::AttachedView; - } - else if (storage->isDictionary()) - { - return CurrentMetrics::AttachedDictionary; - } - else - { - return CurrentMetrics::AttachedTable; - } -} - StoragePtr DatabaseWithOwnTablesBase::detachTableUnlocked(const String & table_name) { StoragePtr res; @@ -282,7 +260,7 @@ StoragePtr DatabaseWithOwnTablesBase::detachTableUnlocked(const String & table_n res = it->second; tables.erase(it); res->is_detached = true; - CurrentMetrics::sub(get_attached_count_metric_for_storage(res), 1); + CurrentMetrics::sub(getAttachedCounterForStorage(res), 1); auto table_id = res->getStorageID(); if (table_id.hasUUID()) @@ -323,7 +301,7 @@ void DatabaseWithOwnTablesBase::attachTableUnlocked(const String & table_name, c /// It is important to reset is_detached here since in case of RENAME in /// non-Atomic database the is_detached is set to true before RENAME. 
table->is_detached = false; - CurrentMetrics::add(get_attached_count_metric_for_storage(table), 1); + CurrentMetrics::add(getAttachedCounterForStorage(table), 1); } void DatabaseWithOwnTablesBase::shutdown() diff --git a/src/Storages/Utils.cpp b/src/Storages/Utils.cpp new file mode 100644 index 00000000000..670d6a242e8 --- /dev/null +++ b/src/Storages/Utils.cpp @@ -0,0 +1,28 @@ +#include +#include + +namespace CurrentMetrics +{ + extern const Metric AttachedTable; + extern const Metric AttachedView; + extern const Metric AttachedDictionary; +} + +namespace DB { + + CurrentMetrics::Metric getAttachedCounterForStorage(const StoragePtr & storage) + { + if (storage->isView()) + { + return CurrentMetrics::AttachedView; + } + else if (storage->isDictionary()) + { + return CurrentMetrics::AttachedDictionary; + } + else + { + return CurrentMetrics::AttachedTable; + } + } +} diff --git a/src/Storages/Utils.h b/src/Storages/Utils.h new file mode 100644 index 00000000000..ffb8479d633 --- /dev/null +++ b/src/Storages/Utils.h @@ -0,0 +1,7 @@ +#include +#include + +namespace DB +{ + CurrentMetrics::Metric getAttachedCounterForStorage(const StoragePtr & storage); +} From 3f46e4e4305693c9542001fb9e718f2fb098a137 Mon Sep 17 00:00:00 2001 From: jsc0218 Date: Wed, 22 May 2024 04:35:06 +0000 Subject: [PATCH 231/392] better exception message in delete table with projection --- src/Interpreters/InterpreterDeleteQuery.cpp | 15 ++++++++++++++- src/Storages/IStorage.h | 3 +++ src/Storages/MergeTree/IMergeTreeDataPart.h | 2 ++ src/Storages/MergeTree/MergeTreeData.cpp | 15 +++++++++++++++ src/Storages/MergeTree/MergeTreeData.h | 2 ++ .../03161_lightweight_delete_projection.reference | 0 .../03161_lightweight_delete_projection.sql | 15 +++++++++++++++ 7 files changed, 51 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/03161_lightweight_delete_projection.reference create mode 100644 tests/queries/0_stateless/03161_lightweight_delete_projection.sql diff --git a/src/Interpreters/InterpreterDeleteQuery.cpp b/src/Interpreters/InterpreterDeleteQuery.cpp index ee774994145..9cfb8e486cb 100644 --- a/src/Interpreters/InterpreterDeleteQuery.cpp +++ b/src/Interpreters/InterpreterDeleteQuery.cpp @@ -25,6 +25,7 @@ namespace ErrorCodes extern const int TABLE_IS_READ_ONLY; extern const int SUPPORT_IS_DISABLED; extern const int BAD_ARGUMENTS; + extern const int NOT_IMPLEMENTED; } @@ -107,7 +108,19 @@ BlockIO InterpreterDeleteQuery::execute() } else { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "DELETE query is not supported for table {}", table->getStorageID().getFullTableName()); + /// Currently just better exception for the case of a table with projection, + /// can act differently according to the setting. + if (table->hasProjection()) + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, + "DELETE query is not supported for table {} as it has projections. " + "User should drop all the projections manually before running the query", + table->getStorageID().getFullTableName()); + } + + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "DELETE query is not supported for table {}", + table->getStorageID().getFullTableName()); } } diff --git a/src/Storages/IStorage.h b/src/Storages/IStorage.h index 87a04c3fcc6..37613704c6a 100644 --- a/src/Storages/IStorage.h +++ b/src/Storages/IStorage.h @@ -259,6 +259,9 @@ public: /// Return true if storage can execute lightweight delete mutations. virtual bool supportsLightweightDelete() const { return false; } + /// Return true if storage has any projection. 
+ virtual bool hasProjection() const { return false; } + /// Return true if storage can execute 'DELETE FROM' mutations. This is different from lightweight delete /// because those are internally translated into 'ALTER UDPATE' mutations. virtual bool supportsDelete() const { return false; } diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index c380f99060e..f38a80455c4 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -442,6 +442,8 @@ public: bool hasProjection(const String & projection_name) const { return projection_parts.contains(projection_name); } + bool hasProjection() const { return !projection_parts.empty(); } + bool hasBrokenProjection(const String & projection_name) const; /// Return true, if all projections were loaded successfully and none was marked as broken. diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 167160db317..1f7e0a19b3a 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -6133,6 +6133,21 @@ bool MergeTreeData::supportsLightweightDelete() const return true; } +bool MergeTreeData::hasProjection() const +{ + auto lock = lockParts(); + for (const auto & part : data_parts_by_info) + { + if (part->getState() == MergeTreeDataPartState::Outdated + || part->getState() == MergeTreeDataPartState::Deleting) + continue; + + if (part->hasProjection()) + return true; + } + return false; +} + MergeTreeData::ProjectionPartsVector MergeTreeData::getAllProjectionPartsVector(MergeTreeData::DataPartStateVector * out_states) const { ProjectionPartsVector res; diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index 2f9283659e3..ff93c7c5ae4 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -438,6 +438,8 @@ public: bool supportsLightweightDelete() const override; + bool hasProjection() const override; + bool areAsynchronousInsertsEnabled() const override { return getSettings()->async_insert; } bool supportsTrivialCountOptimization(const StorageSnapshotPtr &, ContextPtr) const override; diff --git a/tests/queries/0_stateless/03161_lightweight_delete_projection.reference b/tests/queries/0_stateless/03161_lightweight_delete_projection.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/03161_lightweight_delete_projection.sql b/tests/queries/0_stateless/03161_lightweight_delete_projection.sql new file mode 100644 index 00000000000..cd29fae8fd7 --- /dev/null +++ b/tests/queries/0_stateless/03161_lightweight_delete_projection.sql @@ -0,0 +1,15 @@ + +DROP TABLE IF EXISTS users; + +CREATE TABLE users ( + uid Int16, + name String, + age Int16, + projection p1 (select count(), age group by age) +) ENGINE = MergeTree order by uid; + +INSERT INTO users VALUES (1231, 'John', 33); +INSERT INTO users VALUES (6666, 'Ksenia', 48); +INSERT INTO users VALUES (8888, 'Alice', 50); + +DELETE FROM users WHERE 1; -- { serverError NOT_IMPLEMENTED } From 04de82e96524b88f168b5be18195863e1cf4b18b Mon Sep 17 00:00:00 2001 From: Francisco Javier Jurado Moreno <9376816+Beetelbrox@users.noreply.github.com> Date: Wed, 22 May 2024 06:35:25 +0200 Subject: [PATCH 232/392] Fix style --- src/Databases/DatabaseLazy.cpp | 3 ++- src/Storages/Utils.cpp | 2 +- src/Storages/Utils.h | 2 ++ 3 files changed, 5 insertions(+), 2 deletions(-) diff --git 
a/src/Databases/DatabaseLazy.cpp b/src/Databases/DatabaseLazy.cpp index c2fd184f8bc..b9c61400eb3 100644 --- a/src/Databases/DatabaseLazy.cpp +++ b/src/Databases/DatabaseLazy.cpp @@ -197,7 +197,8 @@ StoragePtr DatabaseLazy::detachTable(ContextPtr /* context */, const String & ta if (it->second.expiration_iterator != cache_expiration_queue.end()) cache_expiration_queue.erase(it->second.expiration_iterator); tables_cache.erase(it); - if (res != nullptr) { + if (res != nullptr) + { CurrentMetrics::sub(getAttachedCounterForStorage(res), 1); } } diff --git a/src/Storages/Utils.cpp b/src/Storages/Utils.cpp index 670d6a242e8..df86ef15cff 100644 --- a/src/Storages/Utils.cpp +++ b/src/Storages/Utils.cpp @@ -25,4 +25,4 @@ namespace DB { return CurrentMetrics::AttachedTable; } } -} +} diff --git a/src/Storages/Utils.h b/src/Storages/Utils.h index ffb8479d633..3e92f6247c6 100644 --- a/src/Storages/Utils.h +++ b/src/Storages/Utils.h @@ -1,3 +1,5 @@ +#pragma once + #include #include From a8fe7294d2e39b00f24fce5077b2a3a6ae63bf01 Mon Sep 17 00:00:00 2001 From: Francisco Javier Jurado Moreno <9376816+Beetelbrox@users.noreply.github.com> Date: Wed, 22 May 2024 07:06:19 +0200 Subject: [PATCH 233/392] Do not distinguish resource types for lazy database --- src/Databases/DatabaseLazy.cpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/Databases/DatabaseLazy.cpp b/src/Databases/DatabaseLazy.cpp index b9c61400eb3..003943fbbe4 100644 --- a/src/Databases/DatabaseLazy.cpp +++ b/src/Databases/DatabaseLazy.cpp @@ -18,11 +18,15 @@ #include #include #include -#include namespace fs = std::filesystem; +namespace CurrentMetrics +{ + extern const Metric AttachedTable; +} + namespace DB { @@ -181,7 +185,8 @@ void DatabaseLazy::attachTable(ContextPtr /* context_ */, const String & table_n throw Exception(ErrorCodes::TABLE_ALREADY_EXISTS, "Table {}.{} already exists.", backQuote(database_name), backQuote(table_name)); it->second.expiration_iterator = cache_expiration_queue.emplace(cache_expiration_queue.end(), current_time, table_name); - CurrentMetrics::add(getAttachedCounterForStorage(table), 1); + CurrentMetrics::add(CurrentMetrics::AttachedTable, 1); + } StoragePtr DatabaseLazy::detachTable(ContextPtr /* context */, const String & table_name) @@ -197,10 +202,7 @@ StoragePtr DatabaseLazy::detachTable(ContextPtr /* context */, const String & ta if (it->second.expiration_iterator != cache_expiration_queue.end()) cache_expiration_queue.erase(it->second.expiration_iterator); tables_cache.erase(it); - if (res != nullptr) - { - CurrentMetrics::sub(getAttachedCounterForStorage(res), 1); - } + CurrentMetrics::sub(CurrentMetrics::AttachedTable, 1); } return res; } From 49529a1af9e15c1f3b6cda267034b93a48ce7e8a Mon Sep 17 00:00:00 2001 From: Francisco Javier Jurado Moreno <9376816+Beetelbrox@users.noreply.github.com> Date: Wed, 22 May 2024 07:18:17 +0200 Subject: [PATCH 234/392] Remove trailing whitespace --- src/Databases/DatabaseLazy.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Databases/DatabaseLazy.cpp b/src/Databases/DatabaseLazy.cpp index 003943fbbe4..f0a56a0243d 100644 --- a/src/Databases/DatabaseLazy.cpp +++ b/src/Databases/DatabaseLazy.cpp @@ -186,7 +186,6 @@ void DatabaseLazy::attachTable(ContextPtr /* context_ */, const String & table_n it->second.expiration_iterator = cache_expiration_queue.emplace(cache_expiration_queue.end(), current_time, table_name); CurrentMetrics::add(CurrentMetrics::AttachedTable, 1); - } StoragePtr DatabaseLazy::detachTable(ContextPtr /* context */, 
const String & table_name) From 7be50ee90d688567a88152a324dc783369acde48 Mon Sep 17 00:00:00 2001 From: Francisco Javier Jurado Moreno <9376816+Beetelbrox@users.noreply.github.com> Date: Wed, 22 May 2024 07:26:24 +0200 Subject: [PATCH 235/392] Add missing newline~ --- src/Databases/DatabaseLazy.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Databases/DatabaseLazy.cpp b/src/Databases/DatabaseLazy.cpp index f0a56a0243d..e72834eddbe 100644 --- a/src/Databases/DatabaseLazy.cpp +++ b/src/Databases/DatabaseLazy.cpp @@ -22,6 +22,7 @@ namespace fs = std::filesystem; + namespace CurrentMetrics { extern const Metric AttachedTable; From a0ad4a96c72525b0fb2e9ac9a8b70c88d847b56b Mon Sep 17 00:00:00 2001 From: Francisco Javier Jurado Moreno <9376816+Beetelbrox@users.noreply.github.com> Date: Wed, 22 May 2024 07:34:38 +0200 Subject: [PATCH 236/392] Add yet more missing newlines --- src/Storages/Utils.cpp | 2 ++ src/Storages/Utils.h | 1 + 2 files changed, 3 insertions(+) diff --git a/src/Storages/Utils.cpp b/src/Storages/Utils.cpp index df86ef15cff..b0c06f5ccf6 100644 --- a/src/Storages/Utils.cpp +++ b/src/Storages/Utils.cpp @@ -1,6 +1,7 @@ #include #include + namespace CurrentMetrics { extern const Metric AttachedTable; @@ -8,6 +9,7 @@ namespace CurrentMetrics extern const Metric AttachedDictionary; } + namespace DB { CurrentMetrics::Metric getAttachedCounterForStorage(const StoragePtr & storage) diff --git a/src/Storages/Utils.h b/src/Storages/Utils.h index 3e92f6247c6..c86c2a4c341 100644 --- a/src/Storages/Utils.h +++ b/src/Storages/Utils.h @@ -3,6 +3,7 @@ #include #include + namespace DB { CurrentMetrics::Metric getAttachedCounterForStorage(const StoragePtr & storage); From 8869094c9986906034f3368a2cdeee179a7976b1 Mon Sep 17 00:00:00 2001 From: Francisco Javier Jurado Moreno <9376816+Beetelbrox@users.noreply.github.com> Date: Wed, 22 May 2024 07:42:05 +0200 Subject: [PATCH 237/392] Move opening brackets to its own line --- src/Storages/Utils.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Storages/Utils.cpp b/src/Storages/Utils.cpp index b0c06f5ccf6..ff73888e19d 100644 --- a/src/Storages/Utils.cpp +++ b/src/Storages/Utils.cpp @@ -10,8 +10,8 @@ namespace CurrentMetrics } -namespace DB { - +namespace DB +{ CurrentMetrics::Metric getAttachedCounterForStorage(const StoragePtr & storage) { if (storage->isView()) From 12ce276b8af09da46cb89ed9e2e15bb9ceff758a Mon Sep 17 00:00:00 2001 From: Alexander Gololobov Date: Wed, 22 May 2024 08:51:41 +0200 Subject: [PATCH 238/392] clang-tidy fix --- src/Storages/MergeTree/IMergeTreeDataPartWriter.cpp | 8 ++++---- src/Storages/MergeTree/IMergeTreeDataPartWriter.h | 4 ++-- src/Storages/MergeTree/MergeTreeDataPartCompact.cpp | 2 +- src/Storages/MergeTree/MergeTreeDataPartWide.cpp | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/Storages/MergeTree/IMergeTreeDataPartWriter.cpp b/src/Storages/MergeTree/IMergeTreeDataPartWriter.cpp index e8792be6293..891ba1b9660 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPartWriter.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPartWriter.cpp @@ -52,7 +52,7 @@ IMergeTreeDataPartWriter::IMergeTreeDataPartWriter( const MergeTreeSettingsPtr & storage_settings_, const NamesAndTypesList & columns_list_, const StorageMetadataPtr & metadata_snapshot_, - const VirtualsDescriptionPtr virtual_columns_, + const VirtualsDescriptionPtr & virtual_columns_, const MergeTreeWriterSettings & settings_, const MergeTreeIndexGranularity & index_granularity_) : data_part_name(data_part_name_) 
@@ -117,7 +117,7 @@ MergeTreeDataPartWriterPtr createMergeTreeDataPartCompactWriter( const NamesAndTypesList & columns_list, const ColumnPositions & column_positions, const StorageMetadataPtr & metadata_snapshot, - const VirtualsDescriptionPtr virtual_columns, + const VirtualsDescriptionPtr & virtual_columns, const std::vector & indices_to_recalc, const Statistics & stats_to_recalc_, const String & marks_file_extension_, @@ -134,7 +134,7 @@ MergeTreeDataPartWriterPtr createMergeTreeDataPartWideWriter( const MergeTreeSettingsPtr & storage_settings_, const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot, - const VirtualsDescriptionPtr virtual_columns, + const VirtualsDescriptionPtr & virtual_columns, const std::vector & indices_to_recalc, const Statistics & stats_to_recalc_, const String & marks_file_extension_, @@ -154,7 +154,7 @@ MergeTreeDataPartWriterPtr createMergeTreeDataPartWriter( const NamesAndTypesList & columns_list, const ColumnPositions & column_positions, const StorageMetadataPtr & metadata_snapshot, - const VirtualsDescriptionPtr virtual_columns, + const VirtualsDescriptionPtr & virtual_columns, const std::vector & indices_to_recalc, const Statistics & stats_to_recalc_, const String & marks_file_extension_, diff --git a/src/Storages/MergeTree/IMergeTreeDataPartWriter.h b/src/Storages/MergeTree/IMergeTreeDataPartWriter.h index 8eb546c4f2c..f04beb37ebb 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPartWriter.h +++ b/src/Storages/MergeTree/IMergeTreeDataPartWriter.h @@ -30,7 +30,7 @@ public: const MergeTreeSettingsPtr & storage_settings_, const NamesAndTypesList & columns_list_, const StorageMetadataPtr & metadata_snapshot_, - const VirtualsDescriptionPtr virtual_columns_, + const VirtualsDescriptionPtr & virtual_columns_, const MergeTreeWriterSettings & settings_, const MergeTreeIndexGranularity & index_granularity_ = {}); @@ -82,7 +82,7 @@ MergeTreeDataPartWriterPtr createMergeTreeDataPartWriter( const NamesAndTypesList & columns_list, const ColumnPositions & column_positions, const StorageMetadataPtr & metadata_snapshot, - const VirtualsDescriptionPtr virtual_columns_, + const VirtualsDescriptionPtr & virtual_columns_, const std::vector & indices_to_recalc, const Statistics & stats_to_recalc_, const String & marks_file_extension, diff --git a/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp b/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp index 98eda5573ce..4a160e5e229 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp @@ -57,7 +57,7 @@ MergeTreeDataPartWriterPtr createMergeTreeDataPartCompactWriter( const NamesAndTypesList & columns_list, const ColumnPositions & column_positions, const StorageMetadataPtr & metadata_snapshot, - const VirtualsDescriptionPtr virtual_columns, + const VirtualsDescriptionPtr & virtual_columns, const std::vector & indices_to_recalc, const Statistics & stats_to_recalc_, const String & marks_file_extension_, diff --git a/src/Storages/MergeTree/MergeTreeDataPartWide.cpp b/src/Storages/MergeTree/MergeTreeDataPartWide.cpp index d4630d3dd3f..149f86cef00 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWide.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWide.cpp @@ -62,7 +62,7 @@ MergeTreeDataPartWriterPtr createMergeTreeDataPartWideWriter( const MergeTreeSettingsPtr & storage_settings_, const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot, - const VirtualsDescriptionPtr virtual_columns, + const 
VirtualsDescriptionPtr & virtual_columns, const std::vector & indices_to_recalc, const Statistics & stats_to_recalc_, const String & marks_file_extension_, From 58e655e07b128c4dfd26ffe60ad9d9ee285b3fa9 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 22 May 2024 07:24:42 +0000 Subject: [PATCH 239/392] Incorporate review feedback --- programs/keeper-client/Commands.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/programs/keeper-client/Commands.cpp b/programs/keeper-client/Commands.cpp index 3c649cad0d3..860840a2d06 100644 --- a/programs/keeper-client/Commands.cpp +++ b/programs/keeper-client/Commands.cpp @@ -10,8 +10,8 @@ namespace DB namespace ErrorCodes { + extern const int LOGICAL_ERROR; extern const int KEEPER_EXCEPTION; - extern const int UNEXPECTED_ZOOKEEPER_ERROR; } bool LSCommand::parse(IParser::Pos & pos, std::shared_ptr & node, Expected & expected) const @@ -442,7 +442,7 @@ void ReconfigCommand::execute(const DB::ASTKeeperQuery * query, DB::KeeperClient new_members = query->args[1].safeGet(); break; default: - throw Exception(ErrorCodes::UNEXPECTED_ZOOKEEPER_ERROR, "Unexpected operation: {}", operation); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected operation: {}", operation); } auto response = client->zookeeper->reconfig(joining, leaving, new_members); From 376282dd6dce879008f0f0295402bc197d2b1e39 Mon Sep 17 00:00:00 2001 From: Sergei Trifonov Date: Wed, 22 May 2024 09:58:31 +0200 Subject: [PATCH 240/392] Revert "Prevent conversion to Replicated if zookeeper path already exists" --- src/Databases/DatabaseOrdinary.cpp | 14 ---- .../configs/config.d/clusters.xml | 2 +- ...sters_zk_path.xml => clusters_unusual.xml} | 2 +- .../test_unusual_path.py | 6 +- .../test_zk_path.py | 69 ------------------- 5 files changed, 5 insertions(+), 88 deletions(-) rename tests/integration/test_modify_engine_on_restart/configs/config.d/{clusters_zk_path.xml => clusters_unusual.xml} (80%) delete mode 100644 tests/integration/test_modify_engine_on_restart/test_zk_path.py diff --git a/src/Databases/DatabaseOrdinary.cpp b/src/Databases/DatabaseOrdinary.cpp index 58fa7f01947..5d36f1cc3d6 100644 --- a/src/Databases/DatabaseOrdinary.cpp +++ b/src/Databases/DatabaseOrdinary.cpp @@ -76,20 +76,6 @@ static void setReplicatedEngine(ASTCreateQuery * create_query, ContextPtr contex String replica_path = server_settings.default_replica_path; String replica_name = server_settings.default_replica_name; - /// Check that replica path doesn't exist - Macros::MacroExpansionInfo info; - StorageID table_id = StorageID(create_query->getDatabase(), create_query->getTable(), create_query->uuid); - info.table_id = table_id; - info.expand_special_macros_only = false; - - String zookeeper_path = context->getMacros()->expand(replica_path, info); - if (context->getZooKeeper()->exists(zookeeper_path)) - throw Exception( - ErrorCodes::LOGICAL_ERROR, - "Found existing ZooKeeper path {} while trying to convert table {} to replicated. 
Table will not be converted.", - zookeeper_path, backQuote(table_id.getFullTableName()) - ); - auto args = std::make_shared(); args->children.push_back(std::make_shared(replica_path)); args->children.push_back(std::make_shared(replica_name)); diff --git a/tests/integration/test_modify_engine_on_restart/configs/config.d/clusters.xml b/tests/integration/test_modify_engine_on_restart/configs/config.d/clusters.xml index c8bbb7f3530..d3a9d4fb8f0 100644 --- a/tests/integration/test_modify_engine_on_restart/configs/config.d/clusters.xml +++ b/tests/integration/test_modify_engine_on_restart/configs/config.d/clusters.xml @@ -19,4 +19,4 @@ 01 - + \ No newline at end of file diff --git a/tests/integration/test_modify_engine_on_restart/configs/config.d/clusters_zk_path.xml b/tests/integration/test_modify_engine_on_restart/configs/config.d/clusters_unusual.xml similarity index 80% rename from tests/integration/test_modify_engine_on_restart/configs/config.d/clusters_zk_path.xml rename to tests/integration/test_modify_engine_on_restart/configs/config.d/clusters_unusual.xml index ba13cd87031..812291335b8 100644 --- a/tests/integration/test_modify_engine_on_restart/configs/config.d/clusters_zk_path.xml +++ b/tests/integration/test_modify_engine_on_restart/configs/config.d/clusters_unusual.xml @@ -15,6 +15,6 @@ 01 -/clickhouse/'/{database}/{table}/{uuid} +/lol/kek/'/{uuid} diff --git a/tests/integration/test_modify_engine_on_restart/test_unusual_path.py b/tests/integration/test_modify_engine_on_restart/test_unusual_path.py index 20d2c29257b..e82f48e8b34 100644 --- a/tests/integration/test_modify_engine_on_restart/test_unusual_path.py +++ b/tests/integration/test_modify_engine_on_restart/test_unusual_path.py @@ -6,7 +6,7 @@ cluster = ClickHouseCluster(__file__) ch1 = cluster.add_instance( "ch1", main_configs=[ - "configs/config.d/clusters_zk_path.xml", + "configs/config.d/clusters_unusual.xml", "configs/config.d/distributed_ddl.xml", ], with_zookeeper=True, @@ -63,7 +63,7 @@ def check_tables(): ) .strip() .startswith( - "ReplicatedReplacingMergeTree(\\'/clickhouse/\\\\\\'/{database}/{table}/{uuid}\\', \\'{replica}\\', D)" + "ReplicatedReplacingMergeTree(\\'/lol/kek/\\\\\\'/{uuid}\\', \\'{replica}\\', D)" ) ) assert ( @@ -73,7 +73,7 @@ def check_tables(): ) .strip() .startswith( - "ReplicatedVersionedCollapsingMergeTree(\\'/clickhouse/\\\\\\'/{database}/{table}/{uuid}\\', \\'{replica}\\', Sign, Version)" + "ReplicatedVersionedCollapsingMergeTree(\\'/lol/kek/\\\\\\'/{uuid}\\', \\'{replica}\\', Sign, Version)" ) ) diff --git a/tests/integration/test_modify_engine_on_restart/test_zk_path.py b/tests/integration/test_modify_engine_on_restart/test_zk_path.py deleted file mode 100644 index dd633ad0810..00000000000 --- a/tests/integration/test_modify_engine_on_restart/test_zk_path.py +++ /dev/null @@ -1,69 +0,0 @@ -import pytest -from test_modify_engine_on_restart.common import ( - get_table_path, - set_convert_flags, -) -from helpers.cluster import ClickHouseCluster - -cluster = ClickHouseCluster(__file__) -ch1 = cluster.add_instance( - "ch1", - main_configs=[ - "configs/config.d/clusters_zk_path.xml", - "configs/config.d/distributed_ddl.xml", - ], - with_zookeeper=True, - macros={"replica": "node1"}, - stay_alive=True, -) - -database_name = "modify_engine_zk_path" - - -@pytest.fixture(scope="module") -def started_cluster(): - try: - cluster.start() - yield cluster - - finally: - cluster.shutdown() - - -def q(node, query): - return node.query(database=database_name, sql=query) - - -def 
test_modify_engine_fails_if_zk_path_exists(started_cluster): - ch1.query("CREATE DATABASE " + database_name) - - q( - ch1, - "CREATE TABLE already_exists_1 ( A Int64, D Date, S String ) ENGINE MergeTree() PARTITION BY toYYYYMM(D) ORDER BY A;", - ) - uuid = q( - ch1, - f"SELECT uuid FROM system.tables WHERE table = 'already_exists_1' and database = '{database_name}'", - ).strip("'[]\n") - - q( - ch1, - f"CREATE TABLE already_exists_2 ( A Int64, D Date, S String ) ENGINE ReplicatedMergeTree('/clickhouse/\\'/{database_name}/already_exists_1/{uuid}', 'r2') PARTITION BY toYYYYMM(D) ORDER BY A;", - ) - - set_convert_flags(ch1, database_name, ["already_exists_1"]) - - table_data_path = get_table_path(ch1, "already_exists_1", database_name) - - ch1.stop_clickhouse() - ch1.start_clickhouse(retry_start=False, expected_to_fail=True) - - # Check if we can cancel convertation - ch1.exec_in_container( - [ - "bash", - "-c", - f"rm {table_data_path}convert_to_replicated", - ] - ) - ch1.start_clickhouse() From 7f46eae7b4961b3d58e2d592bc42ba5a32297f7c Mon Sep 17 00:00:00 2001 From: Alexander Gololobov Date: Wed, 22 May 2024 11:31:01 +0200 Subject: [PATCH 241/392] clang-tidy fix --- src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp | 2 +- src/Storages/MergeTree/MergeTreeDataPartWriterCompact.h | 2 +- src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp | 2 +- src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.h | 2 +- src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp | 2 +- src/Storages/MergeTree/MergeTreeDataPartWriterWide.h | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp index 328e3118ba9..2d86e0f0770 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp @@ -18,7 +18,7 @@ MergeTreeDataPartWriterCompact::MergeTreeDataPartWriterCompact( const MergeTreeSettingsPtr & storage_settings_, const NamesAndTypesList & columns_list_, const StorageMetadataPtr & metadata_snapshot_, - const VirtualsDescriptionPtr virtual_columns_, + const VirtualsDescriptionPtr & virtual_columns_, const std::vector & indices_to_recalc_, const Statistics & stats_to_recalc, const String & marks_file_extension_, diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.h b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.h index f62f060fde2..ebf96c1ebb2 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.h +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.h @@ -19,7 +19,7 @@ public: const MergeTreeSettingsPtr & storage_settings_, const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot_, - const VirtualsDescriptionPtr virtual_columns_, + const VirtualsDescriptionPtr & virtual_columns_, const std::vector & indices_to_recalc, const Statistics & stats_to_recalc, const String & marks_file_extension, diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp index 30f01c1acd6..0a8920790e0 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp @@ -148,7 +148,7 @@ MergeTreeDataPartWriterOnDisk::MergeTreeDataPartWriterOnDisk( const MergeTreeSettingsPtr & storage_settings_, const NamesAndTypesList & columns_list_, const StorageMetadataPtr & metadata_snapshot_, - const VirtualsDescriptionPtr 
virtual_columns_, + const VirtualsDescriptionPtr & virtual_columns_, const MergeTreeIndices & indices_to_recalc_, const Statistics & stats_to_recalc_, const String & marks_file_extension_, diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.h b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.h index a60fcd43a58..0c31cabc8c4 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.h +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.h @@ -109,7 +109,7 @@ public: const MergeTreeSettingsPtr & storage_settings_, const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot_, - const VirtualsDescriptionPtr virtual_columns_, + const VirtualsDescriptionPtr & virtual_columns_, const std::vector & indices_to_recalc, const Statistics & stats_to_recalc_, const String & marks_file_extension, diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp index 001f09b81b3..9df6cc5e2f7 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp @@ -84,7 +84,7 @@ MergeTreeDataPartWriterWide::MergeTreeDataPartWriterWide( const MergeTreeSettingsPtr & storage_settings_, const NamesAndTypesList & columns_list_, const StorageMetadataPtr & metadata_snapshot_, - const VirtualsDescriptionPtr virtual_columns_, + const VirtualsDescriptionPtr & virtual_columns_, const std::vector & indices_to_recalc_, const Statistics & stats_to_recalc_, const String & marks_file_extension_, diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h index 8dc488788c6..63205775c58 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h @@ -29,7 +29,7 @@ public: const MergeTreeSettingsPtr & storage_settings_, const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot, - const VirtualsDescriptionPtr virtual_columns_, + const VirtualsDescriptionPtr & virtual_columns_, const std::vector & indices_to_recalc, const Statistics & stats_to_recalc_, const String & marks_file_extension, From 8dd52a26257a9dc11723e5a87507f6815f4fb818 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Tue, 21 May 2024 18:42:14 +0200 Subject: [PATCH 242/392] Ignore allow_suspicious_primary_key on ATTACH and verify on ALTER Signed-off-by: Azat Khuzhin Co-authored-by: Alexander Tokmakov --- src/Storages/MergeTree/MergeTreeData.cpp | 12 +++++++++++ src/Storages/MergeTree/MergeTreeData.h | 2 ++ .../MergeTree/registerStorageMergeTree.cpp | 20 ++++-------------- src/Storages/StorageMergeTree.cpp | 8 +++++-- src/Storages/StorageReplicatedMergeTree.cpp | 12 +++++++++-- ...03020_order_by_SimpleAggregateFunction.sql | 21 ++++++++++++++++--- 6 files changed, 52 insertions(+), 23 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 527dac01b71..13d59d671ea 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -188,6 +189,7 @@ namespace ErrorCodes extern const int CANNOT_SCHEDULE_TASK; extern const int LIMIT_EXCEEDED; extern const int CANNOT_FORGET_PARTITION; + extern const int DATA_TYPE_CANNOT_BE_USED_IN_KEY; } static void checkSuspiciousIndices(const ASTFunction * index_function) @@ -8538,6 +8540,16 @@ void MergeTreeData::unloadPrimaryKeys() } 
} +void MergeTreeData::verifySortingKey(const KeyDescription & sorting_key) +{ + /// Aggregate functions already forbidden, but SimpleAggregateFunction are not + for (const auto & data_type : sorting_key.data_types) + { + if (dynamic_cast(data_type->getCustomName())) + throw Exception(ErrorCodes::DATA_TYPE_CANNOT_BE_USED_IN_KEY, "Column with type {} is not allowed in key expression", data_type->getCustomName()->getName()); + } +} + bool updateAlterConversionsMutations(const MutationCommands & commands, std::atomic & alter_conversions_mutations, bool remove) { for (const auto & command : commands) diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index 2f9283659e3..062f967bb93 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -736,6 +736,8 @@ public: const ASTPtr & new_settings, AlterLockHolder & table_lock_holder); + static void verifySortingKey(const KeyDescription & sorting_key); + /// Should be called if part data is suspected to be corrupted. /// Has the ability to check all other parts /// which reside on the same disk of the suspicious part. diff --git a/src/Storages/MergeTree/registerStorageMergeTree.cpp b/src/Storages/MergeTree/registerStorageMergeTree.cpp index 4244ccccfe0..d234103e52b 100644 --- a/src/Storages/MergeTree/registerStorageMergeTree.cpp +++ b/src/Storages/MergeTree/registerStorageMergeTree.cpp @@ -14,7 +14,6 @@ #include #include -#include #include #include @@ -32,7 +31,6 @@ namespace ErrorCodes extern const int UNKNOWN_STORAGE; extern const int NO_REPLICA_NAME_GIVEN; extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; - extern const int DATA_TYPE_CANNOT_BE_USED_IN_KEY; } @@ -113,16 +111,6 @@ static ColumnsDescription getColumnsDescriptionFromZookeeper(const String & raw_ return ColumnsDescription::parse(zookeeper->get(fs::path(zookeeper_path) / "columns", &columns_stat)); } -static void verifySortingKey(const KeyDescription & sorting_key) -{ - /// Aggregate functions already forbidden, but SimpleAggregateFunction are not - for (const auto & data_type : sorting_key.data_types) - { - if (dynamic_cast(data_type->getCustomName())) - throw Exception(ErrorCodes::DATA_TYPE_CANNOT_BE_USED_IN_KEY, "Column with type {} is not allowed in key expression", data_type->getCustomName()->getName()); - } -} - /// Returns whether a new syntax is used to define a table engine, i.e. MergeTree() PRIMARY KEY ... PARTITION BY ... SETTINGS ... /// instead of MergeTree(MergeTree(date, [sample_key], primary_key). static bool isExtendedStorageDef(const ASTCreateQuery & query) @@ -678,8 +666,8 @@ static StoragePtr create(const StorageFactory::Arguments & args) /// column if sorting key will be changed. metadata.sorting_key = KeyDescription::getSortingKeyFromAST( args.storage_def->order_by->ptr(), metadata.columns, context, merging_param_key_arg); - if (!local_settings.allow_suspicious_primary_key) - verifySortingKey(metadata.sorting_key); + if (!local_settings.allow_suspicious_primary_key && args.mode <= LoadingStrictnessLevel::CREATE) + MergeTreeData::verifySortingKey(metadata.sorting_key); /// If primary key explicitly defined, than get it from AST if (args.storage_def->primary_key) @@ -792,8 +780,8 @@ static StoragePtr create(const StorageFactory::Arguments & args) /// column if sorting key will be changed. 
metadata.sorting_key = KeyDescription::getSortingKeyFromAST(engine_args[arg_num], metadata.columns, context, merging_param_key_arg); - if (!local_settings.allow_suspicious_primary_key) - verifySortingKey(metadata.sorting_key); + if (!local_settings.allow_suspicious_primary_key && args.mode <= LoadingStrictnessLevel::CREATE) + MergeTreeData::verifySortingKey(metadata.sorting_key); /// In old syntax primary_key always equals to sorting key. metadata.primary_key = KeyDescription::getKeyFromAST(engine_args[arg_num], metadata.columns, context); diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index 9144ef7c0f7..ea698775298 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -333,17 +333,21 @@ void StorageMergeTree::alter( auto table_id = getStorageID(); auto old_storage_settings = getSettings(); + const auto & query_settings = local_context->getSettingsRef(); StorageInMemoryMetadata new_metadata = getInMemoryMetadata(); StorageInMemoryMetadata old_metadata = getInMemoryMetadata(); - auto maybe_mutation_commands = commands.getMutationCommands(new_metadata, local_context->getSettingsRef().materialize_ttl_after_modify, local_context); + auto maybe_mutation_commands = commands.getMutationCommands(new_metadata, query_settings.materialize_ttl_after_modify, local_context); if (!maybe_mutation_commands.empty()) delayMutationOrThrowIfNeeded(nullptr, local_context); Int64 mutation_version = -1; commands.apply(new_metadata, local_context); + if (!query_settings.allow_suspicious_primary_key) + MergeTreeData::verifySortingKey(new_metadata.sorting_key); + /// This alter can be performed at new_metadata level only if (commands.isSettingsAlter()) { @@ -396,7 +400,7 @@ void StorageMergeTree::alter( resetObjectColumnsFromActiveParts(parts_lock); } - if (!maybe_mutation_commands.empty() && local_context->getSettingsRef().alter_sync > 0) + if (!maybe_mutation_commands.empty() && query_settings.alter_sync > 0) waitForMutation(mutation_version, false); } diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 378b81c6d18..e0a24ceac4d 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -6027,6 +6027,7 @@ void StorageReplicatedMergeTree::alter( assertNotReadonly(); auto table_id = getStorageID(); + const auto & query_settings = query_context->getSettingsRef(); if (commands.isSettingsAlter()) { @@ -6054,6 +6055,13 @@ void StorageReplicatedMergeTree::alter( return; } + if (!query_settings.allow_suspicious_primary_key) + { + StorageInMemoryMetadata future_metadata = getInMemoryMetadata(); + commands.apply(future_metadata, query_context); + + MergeTreeData::verifySortingKey(future_metadata.sorting_key); + } auto ast_to_str = [](ASTPtr query) -> String { @@ -6186,7 +6194,7 @@ void StorageReplicatedMergeTree::alter( auto maybe_mutation_commands = commands.getMutationCommands( *current_metadata, - query_context->getSettingsRef().materialize_ttl_after_modify, + query_settings.materialize_ttl_after_modify, query_context); bool have_mutation = !maybe_mutation_commands.empty(); @@ -6309,7 +6317,7 @@ void StorageReplicatedMergeTree::alter( { LOG_DEBUG(log, "Metadata changes applied. 
Will wait for data changes."); merge_selecting_task->schedule(); - waitMutation(*mutation_znode, query_context->getSettingsRef().alter_sync); + waitMutation(*mutation_znode, query_settings.alter_sync); LOG_DEBUG(log, "Data changes applied."); } } diff --git a/tests/queries/0_stateless/03020_order_by_SimpleAggregateFunction.sql b/tests/queries/0_stateless/03020_order_by_SimpleAggregateFunction.sql index f1727cb9e5c..fee42d1abc6 100644 --- a/tests/queries/0_stateless/03020_order_by_SimpleAggregateFunction.sql +++ b/tests/queries/0_stateless/03020_order_by_SimpleAggregateFunction.sql @@ -1,6 +1,6 @@ set allow_suspicious_primary_key = 0; -DROP TABLE IF EXISTS data; +drop table if exists data; create table data (key Int, value AggregateFunction(sum, UInt64)) engine=AggregatingMergeTree() order by (key, value); -- { serverError DATA_TYPE_CANNOT_BE_USED_IN_KEY } create table data (key Int, value SimpleAggregateFunction(sum, UInt64)) engine=AggregatingMergeTree() order by (key, value); -- { serverError DATA_TYPE_CANNOT_BE_USED_IN_KEY } @@ -12,7 +12,22 @@ create table data (key Int, value AggregateFunction(sum, UInt64)) engine=Aggrega create table data (key Int, value SimpleAggregateFunction(sum, UInt64)) engine=AggregatingMergeTree() primary key value order by (value, key); -- { serverError DATA_TYPE_CANNOT_BE_USED_IN_KEY } set allow_suspicious_primary_key = 1; - create table data (key Int, value SimpleAggregateFunction(sum, UInt64)) engine=AggregatingMergeTree() primary key value order by (value, key); -DROP TABLE data; +-- ATTACH should work regardless allow_suspicious_primary_key +set allow_suspicious_primary_key = 0; +detach table data; +attach table data; +drop table data; + +-- ALTER AggregatingMergeTree +create table data (key Int) engine=AggregatingMergeTree() order by (key); +alter table data add column value SimpleAggregateFunction(sum, UInt64), modify order by (key, value); -- { serverError DATA_TYPE_CANNOT_BE_USED_IN_KEY } +alter table data add column value SimpleAggregateFunction(sum, UInt64), modify order by (key, value) settings allow_suspicious_primary_key=1; +drop table data; + +-- ALTER ReplicatedAggregatingMergeTree +create table data_rep (key Int) engine=ReplicatedAggregatingMergeTree('/tables/{database}', 'r1') order by (key); +alter table data_rep add column value SimpleAggregateFunction(sum, UInt64), modify order by (key, value); -- { serverError DATA_TYPE_CANNOT_BE_USED_IN_KEY } +alter table data_rep add column value SimpleAggregateFunction(sum, UInt64), modify order by (key, value) settings allow_suspicious_primary_key=1; +drop table data_rep; From d5d8d689748fbc125c37381fd9680c32468e07d0 Mon Sep 17 00:00:00 2001 From: Alexander Gololobov Date: Wed, 22 May 2024 13:06:56 +0200 Subject: [PATCH 243/392] Remove unused storage_snapshot field --- src/Processors/QueryPlan/ReadFromMergeTree.cpp | 6 +++--- src/Storages/MergeTree/MergeTreeSelectProcessor.cpp | 2 -- src/Storages/MergeTree/MergeTreeSelectProcessor.h | 2 -- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.cpp b/src/Processors/QueryPlan/ReadFromMergeTree.cpp index 6f0fa55c349..503031eb04b 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp +++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp @@ -381,7 +381,7 @@ Pipe ReadFromMergeTree::readFromPoolParallelReplicas( auto algorithm = std::make_unique(i); auto processor = std::make_unique( - pool, std::move(algorithm), storage_snapshot, prewhere_info, + pool, std::move(algorithm), prewhere_info, 
actions_settings, block_size_copy, reader_settings); auto source = std::make_shared(std::move(processor)); @@ -480,7 +480,7 @@ Pipe ReadFromMergeTree::readFromPool( auto algorithm = std::make_unique(i); auto processor = std::make_unique( - pool, std::move(algorithm), storage_snapshot, prewhere_info, + pool, std::move(algorithm), prewhere_info, actions_settings, block_size_copy, reader_settings); auto source = std::make_shared(std::move(processor)); @@ -592,7 +592,7 @@ Pipe ReadFromMergeTree::readInOrder( algorithm = std::make_unique(i); auto processor = std::make_unique( - pool, std::move(algorithm), storage_snapshot, prewhere_info, + pool, std::move(algorithm), prewhere_info, actions_settings, block_size, reader_settings); processor->addPartLevelToChunk(isQueryWithFinal()); diff --git a/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp index fce733d47b7..78b67de1a7e 100644 --- a/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp @@ -26,14 +26,12 @@ namespace ErrorCodes MergeTreeSelectProcessor::MergeTreeSelectProcessor( MergeTreeReadPoolPtr pool_, MergeTreeSelectAlgorithmPtr algorithm_, - const StorageSnapshotPtr & storage_snapshot_, const PrewhereInfoPtr & prewhere_info_, const ExpressionActionsSettings & actions_settings_, const MergeTreeReadTask::BlockSizeParams & block_size_params_, const MergeTreeReaderSettings & reader_settings_) : pool(std::move(pool_)) , algorithm(std::move(algorithm_)) - , storage_snapshot(storage_snapshot_) , prewhere_info(prewhere_info_) , actions_settings(actions_settings_) , prewhere_actions(getPrewhereActions(prewhere_info, actions_settings, reader_settings_.enable_multiple_prewhere_read_steps)) diff --git a/src/Storages/MergeTree/MergeTreeSelectProcessor.h b/src/Storages/MergeTree/MergeTreeSelectProcessor.h index 6b663e0fd36..8f41f5deacb 100644 --- a/src/Storages/MergeTree/MergeTreeSelectProcessor.h +++ b/src/Storages/MergeTree/MergeTreeSelectProcessor.h @@ -41,7 +41,6 @@ public: MergeTreeSelectProcessor( MergeTreeReadPoolPtr pool_, MergeTreeSelectAlgorithmPtr algorithm_, - const StorageSnapshotPtr & storage_snapshot_, const PrewhereInfoPtr & prewhere_info_, const ExpressionActionsSettings & actions_settings_, const MergeTreeReadTask::BlockSizeParams & block_size_params_, @@ -71,7 +70,6 @@ private: const MergeTreeReadPoolPtr pool; const MergeTreeSelectAlgorithmPtr algorithm; - const StorageSnapshotPtr storage_snapshot; const PrewhereInfoPtr prewhere_info; const ExpressionActionsSettings actions_settings; From 5f01b14e0dc2f9a96d1c06cd2f9fb0112209ab59 Mon Sep 17 00:00:00 2001 From: Max K Date: Wed, 22 May 2024 12:00:29 +0200 Subject: [PATCH 244/392] add prints --- tests/ci/ci.py | 4 ++-- tests/ci/ci_metadata.py | 6 +++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/ci/ci.py b/tests/ci/ci.py index 046550c62f8..40f5617f165 100644 --- a/tests/ci/ci.py +++ b/tests/ci/ci.py @@ -1912,9 +1912,9 @@ def _get_ext_check_name(check_name: str) -> str: def _cancel_pr_wf(s3: S3Helper, pr_number: int) -> None: - run_id = CiMetadata(s3, pr_number).run_id + run_id = CiMetadata(s3, pr_number).fetch_meta().run_id if not run_id: - print("ERROR: FIX IT: Run id has not been found!") + print(f"ERROR: FIX IT: Run id has not been found PR [{pr_number}]!") else: print(f"Canceling PR workflow run_id: [{run_id}], pr: [{pr_number}]") GitHub.cancel_wf(run_id) diff --git a/tests/ci/ci_metadata.py b/tests/ci/ci_metadata.py index 5856e9a8501..82d44cf1adc 
100644 --- a/tests/ci/ci_metadata.py +++ b/tests/ci/ci_metadata.py @@ -6,6 +6,7 @@ from env_helper import ( TEMP_PATH, ) from s3_helper import S3Helper +from ci_utils import GHActions # pylint: disable=too-many-lines @@ -83,7 +84,10 @@ class CiMetadata: Uploads meta on s3 """ assert self.run_id - print("Storing workflow meta on s3") + GHActions.print_in_group( + f"Storing workflow metadata: PR [{self.pr_number}]", + [f"run_id: {self.run_id}"], + ) local_file = self._LOCAL_PATH / self._FILENAME_RUN_ID with open(local_file, "w", encoding="utf-8") as file: From 5c47b091144e24ee1fbd6627186e7965c9ad233e Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Wed, 22 May 2024 13:18:51 +0200 Subject: [PATCH 245/392] Ignore text_log for Keeper --- programs/keeper/Keeper.cpp | 5 +++++ programs/keeper/Keeper.h | 2 ++ src/Loggers/Loggers.cpp | 2 +- src/Loggers/Loggers.h | 4 ++++ src/Loggers/OwnSplitChannel.cpp | 9 +++++---- src/Loggers/OwnSplitChannel.h | 1 - 6 files changed, 17 insertions(+), 6 deletions(-) diff --git a/programs/keeper/Keeper.cpp b/programs/keeper/Keeper.cpp index 267b725b02b..dba5c2b7d2a 100644 --- a/programs/keeper/Keeper.cpp +++ b/programs/keeper/Keeper.cpp @@ -182,6 +182,11 @@ std::string Keeper::getDefaultConfigFileName() const return "keeper_config.xml"; } +bool Keeper::allowTextLog() const +{ + return false; +} + void Keeper::handleCustomArguments(const std::string & arg, [[maybe_unused]] const std::string & value) // NOLINT { if (arg == "force-recovery") diff --git a/programs/keeper/Keeper.h b/programs/keeper/Keeper.h index f889ffa595b..c449c40b610 100644 --- a/programs/keeper/Keeper.h +++ b/programs/keeper/Keeper.h @@ -65,6 +65,8 @@ protected: std::string getDefaultConfigFileName() const override; + bool allowTextLog() const override; + private: Poco::Net::SocketAddress socketBindListen(Poco::Net::ServerSocket & socket, const std::string & host, UInt16 port, [[maybe_unused]] bool secure = false) const; diff --git a/src/Loggers/Loggers.cpp b/src/Loggers/Loggers.cpp index 4b17469f4d7..0bd4b94d999 100644 --- a/src/Loggers/Loggers.cpp +++ b/src/Loggers/Loggers.cpp @@ -263,7 +263,7 @@ void Loggers::buildLoggers(Poco::Util::AbstractConfiguration & config, Poco::Log } } #ifndef WITHOUT_TEXT_LOG - if (config.has("text_log")) + if (allowTextLog() && config.has("text_log")) { String text_log_level_str = config.getString("text_log.level", "trace"); int text_log_level = Poco::Logger::parseLevel(text_log_level_str); diff --git a/src/Loggers/Loggers.h b/src/Loggers/Loggers.h index 9eff731a4c5..9923d66ebcb 100644 --- a/src/Loggers/Loggers.h +++ b/src/Loggers/Loggers.h @@ -23,6 +23,10 @@ public: /// Close log files. On next log write files will be reopened. 
void closeLogs(Poco::Logger & logger); + virtual ~Loggers() = default; + +protected: + virtual bool allowTextLog() const { return true; } private: Poco::AutoPtr log_file; diff --git a/src/Loggers/OwnSplitChannel.cpp b/src/Loggers/OwnSplitChannel.cpp index fee33781c27..dc51a13e01f 100644 --- a/src/Loggers/OwnSplitChannel.cpp +++ b/src/Loggers/OwnSplitChannel.cpp @@ -107,6 +107,10 @@ void OwnSplitChannel::logSplit(const Poco::Message & msg) [[maybe_unused]] bool push_result = logs_queue->emplace(std::move(columns)); } + auto text_log_locked = text_log.lock(); + if (!text_log_locked) + return; + /// Also log to system.text_log table, if message is not too noisy auto text_log_max_priority_loaded = text_log_max_priority.load(std::memory_order_relaxed); if (text_log_max_priority_loaded && msg.getPriority() <= text_log_max_priority_loaded) @@ -146,10 +150,7 @@ void OwnSplitChannel::logSplit(const Poco::Message & msg) #undef SET_VALUE_IF_EXISTS - std::shared_ptr> text_log_locked{}; - text_log_locked = text_log.lock(); - if (text_log_locked) - text_log_locked->push(std::move(elem)); + text_log_locked->push(std::move(elem)); } #endif } diff --git a/src/Loggers/OwnSplitChannel.h b/src/Loggers/OwnSplitChannel.h index b75554eefc4..7ca27cf6584 100644 --- a/src/Loggers/OwnSplitChannel.h +++ b/src/Loggers/OwnSplitChannel.h @@ -1,7 +1,6 @@ #pragma once #include -#include #include #include #include From 03fc077be7d8576c4e3e550842f2fd7c6d06a78f Mon Sep 17 00:00:00 2001 From: kssenii Date: Wed, 22 May 2024 14:12:37 +0200 Subject: [PATCH 246/392] Fxi --- src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp | 2 +- src/Storages/ObjectStorage/ReadBufferIterator.cpp | 6 +++--- src/Storages/ObjectStorage/StorageObjectStorage.cpp | 1 - src/Storages/ObjectStorage/StorageObjectStorageSource.cpp | 4 ++-- 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index d18468411ea..c07313b52db 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -575,7 +575,7 @@ void S3ObjectStorage::applyNewSettings( ContextPtr context, const ApplyNewSettingsOptions & options) { - auto new_s3_settings = getSettings(config, config_prefix, context); + auto new_s3_settings = getSettings(config, config_prefix, context, context->getSettingsRef().s3_validate_request_settings); if (!static_headers.empty()) { new_s3_settings->auth_settings.headers.insert( diff --git a/src/Storages/ObjectStorage/ReadBufferIterator.cpp b/src/Storages/ObjectStorage/ReadBufferIterator.cpp index e065de16e55..5a8a4735fe1 100644 --- a/src/Storages/ObjectStorage/ReadBufferIterator.cpp +++ b/src/Storages/ObjectStorage/ReadBufferIterator.cpp @@ -145,7 +145,7 @@ std::unique_ptr ReadBufferIterator::recreateLastReadBuffer() auto context = getContext(); const auto & path = current_object_info->isArchive() ? 
current_object_info->getPathToArchive() : current_object_info->getPath(); - auto impl = object_storage->readObject(StoredObject(), context->getReadSettings()); + auto impl = object_storage->readObject(StoredObject(path), context->getReadSettings()); const auto compression_method = chooseCompressionMethod(current_object_info->getFileName(), configuration->compression_method); const auto zstd_window_log_max = static_cast(context->getSettingsRef().zstd_window_log_max); @@ -258,10 +258,10 @@ ReadBufferIterator::Data ReadBufferIterator::next() std::unique_ptr read_buf; CompressionMethod compression_method; using ObjectInfoInArchive = StorageObjectStorageSource::ArchiveIterator::ObjectInfoInArchive; - if (auto object_info_in_archive = dynamic_cast(current_object_info.get())) + if (const auto * object_info_in_archive = dynamic_cast(current_object_info.get())) { compression_method = chooseCompressionMethod(current_object_info->getFileName(), configuration->compression_method); - auto & archive_reader = object_info_in_archive->archive_reader; + const auto & archive_reader = object_info_in_archive->archive_reader; read_buf = archive_reader->readFile(object_info_in_archive->path_in_archive, /*throw_on_not_found=*/true); } else diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index 5de7f41b4f7..2c8e60b49d0 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index 8d946f515a3..a2b3ca5b69e 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -279,10 +279,10 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade else { CompressionMethod compression_method; - if (auto object_info_in_archive = dynamic_cast(object_info.get())) + if (const auto * object_info_in_archive = dynamic_cast(object_info.get())) { compression_method = chooseCompressionMethod(configuration->getPathInArchive(), configuration->compression_method); - auto & archive_reader = object_info_in_archive->archive_reader; + const auto & archive_reader = object_info_in_archive->archive_reader; read_buf = archive_reader->readFile(object_info_in_archive->path_in_archive, /*throw_on_not_found=*/true); } else From 6942ae0c1e6204d8ee91b8e69e88be85ec289620 Mon Sep 17 00:00:00 2001 From: vdimir Date: Mon, 13 May 2024 12:00:52 +0000 Subject: [PATCH 247/392] Fix SimpleSquashingChunksTransform (02115_rewrite_local_join_right_distribute_table) --- src/Processors/IInflatingTransform.cpp | 22 +++++--- src/Processors/IInflatingTransform.h | 8 ++- .../Transforms/ArrayJoinTransform.cpp | 4 +- .../Transforms/ArrayJoinTransform.h | 2 +- .../Transforms/SquashingChunksTransform.cpp | 52 +++++++------------ .../Transforms/SquashingChunksTransform.h | 12 ++--- ...rite_local_join_right_distribute_table.sql | 4 -- 7 files changed, 48 insertions(+), 56 deletions(-) diff --git a/src/Processors/IInflatingTransform.cpp b/src/Processors/IInflatingTransform.cpp index ffa5b55dc76..bc0b3e8459e 100644 --- a/src/Processors/IInflatingTransform.cpp +++ b/src/Processors/IInflatingTransform.cpp @@ -45,8 +45,13 @@ IInflatingTransform::Status IInflatingTransform::prepare() { if (input.isFinished()) { - output.finish(); - return 
Status::Finished; + if (is_finished) + { + output.finish(); + return Status::Finished; + } + is_finished = true; + return Status::Ready; } input.setNeeded(); @@ -71,16 +76,17 @@ void IInflatingTransform::work() current_chunk = generate(); generated = true; - can_generate = canGenerate(); + can_generate = canGenerate(is_finished); } else { - if (!has_input) - throw Exception(ErrorCodes::LOGICAL_ERROR, "IInflatingTransform cannot consume chunk because it wasn't read"); + if (has_input) + { + consume(std::move(current_chunk)); + has_input = false; + } - consume(std::move(current_chunk)); - has_input = false; - can_generate = canGenerate(); + can_generate = canGenerate(is_finished); } } diff --git a/src/Processors/IInflatingTransform.h b/src/Processors/IInflatingTransform.h index 0ad12f6cd65..3f832b0e5bc 100644 --- a/src/Processors/IInflatingTransform.h +++ b/src/Processors/IInflatingTransform.h @@ -10,13 +10,14 @@ namespace DB /// for (chunk : input_chunks) /// { /// transform.consume(chunk); -/// /// while (transform.canGenerate()) /// { /// transformed_chunk = transform.generate(); /// ... (process transformed chunk) /// } /// } +/// while (transform.canGenerate(true)) +/// ... (process remaining data) /// class IInflatingTransform : public IProcessor { @@ -30,7 +31,7 @@ protected: bool can_generate = false; virtual void consume(Chunk chunk) = 0; - virtual bool canGenerate() = 0; + virtual bool canGenerate(bool is_read_finished) = 0; virtual Chunk generate() = 0; public: @@ -41,6 +42,9 @@ public: InputPort & getInputPort() { return input; } OutputPort & getOutputPort() { return output; } + + /// canGenerate can flush data when input is finished. + bool is_finished = false; }; } diff --git a/src/Processors/Transforms/ArrayJoinTransform.cpp b/src/Processors/Transforms/ArrayJoinTransform.cpp index 1304434d74e..b7a6ba85963 100644 --- a/src/Processors/Transforms/ArrayJoinTransform.cpp +++ b/src/Processors/Transforms/ArrayJoinTransform.cpp @@ -38,14 +38,14 @@ void ArrayJoinTransform::consume(Chunk chunk) } -bool ArrayJoinTransform::canGenerate() +bool ArrayJoinTransform::canGenerate(bool) { return result_iterator && result_iterator->hasNext(); } Chunk ArrayJoinTransform::generate() { - if (!canGenerate()) + if (!canGenerate(false)) throw Exception(ErrorCodes::LOGICAL_ERROR, "Can't generate chunk in ArrayJoinTransform"); auto block = result_iterator->next(); diff --git a/src/Processors/Transforms/ArrayJoinTransform.h b/src/Processors/Transforms/ArrayJoinTransform.h index 4219135982d..de291a0422f 100644 --- a/src/Processors/Transforms/ArrayJoinTransform.h +++ b/src/Processors/Transforms/ArrayJoinTransform.h @@ -26,7 +26,7 @@ public: protected: void consume(Chunk chunk) override; - bool canGenerate() override; + bool canGenerate(bool is_read_finished) override; Chunk generate() override; private: diff --git a/src/Processors/Transforms/SquashingChunksTransform.cpp b/src/Processors/Transforms/SquashingChunksTransform.cpp index 0d69b6e0a8d..b79987161fd 100644 --- a/src/Processors/Transforms/SquashingChunksTransform.cpp +++ b/src/Processors/Transforms/SquashingChunksTransform.cpp @@ -56,49 +56,35 @@ void SquashingChunksTransform::work() SimpleSquashingChunksTransform::SimpleSquashingChunksTransform( const Block & header, size_t min_block_size_rows, size_t min_block_size_bytes) - : ISimpleTransform(header, header, true), squashing(min_block_size_rows, min_block_size_bytes) + : IInflatingTransform(header, header), squashing(min_block_size_rows, min_block_size_bytes) { } -void 
SimpleSquashingChunksTransform::transform(Chunk & chunk) +void SimpleSquashingChunksTransform::consume(Chunk chunk) { - if (!finished) - { - if (auto block = squashing.add(getInputPort().getHeader().cloneWithColumns(chunk.detachColumns()))) - chunk.setColumns(block.getColumns(), block.rows()); - } - else - { - if (chunk.hasRows()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Chunk expected to be empty, otherwise it will be lost"); - - auto block = squashing.add({}); - chunk.setColumns(block.getColumns(), block.rows()); - } + current_block = squashing.add(getInputPort().getHeader().cloneWithColumns(chunk.detachColumns())); } -IProcessor::Status SimpleSquashingChunksTransform::prepare() +Chunk SimpleSquashingChunksTransform::generate() { - if (!finished && input.isFinished()) - { - if (output.isFinished()) - return Status::Finished; + if (!current_block) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Can't generate chunk in SimpleSquashingChunksTransform"); - if (!output.canPush()) - return Status::PortFull; + Chunk result(current_block.getColumns(), current_block.rows()); + current_block.clear(); + return result; +} - if (has_output) - { - output.pushData(std::move(output_data)); - has_output = false; - return Status::PortFull; - } - finished = true; - /// On the next call to transform() we will return all data buffered in `squashing` (if any) - return Status::Ready; - } - return ISimpleTransform::prepare(); +bool SimpleSquashingChunksTransform::canGenerate(bool is_read_finished) +{ + if (current_block) + return true; + + if (is_read_finished) + current_block = squashing.add({}); + + return bool(current_block); } } diff --git a/src/Processors/Transforms/SquashingChunksTransform.h b/src/Processors/Transforms/SquashingChunksTransform.h index f82e9e46a61..d0316c39a43 100644 --- a/src/Processors/Transforms/SquashingChunksTransform.h +++ b/src/Processors/Transforms/SquashingChunksTransform.h @@ -2,6 +2,7 @@ #include #include +#include #include namespace DB @@ -29,7 +30,7 @@ private: }; /// Doesn't care about propagating exceptions and thus doesn't throw LOGICAL_ERROR if the following transform closes its input port. -class SimpleSquashingChunksTransform : public ISimpleTransform +class SimpleSquashingChunksTransform : public IInflatingTransform { public: explicit SimpleSquashingChunksTransform(const Block & header, size_t min_block_size_rows, size_t min_block_size_bytes); @@ -37,14 +38,13 @@ public: String getName() const override { return "SimpleSquashingTransform"; } protected: - void transform(Chunk &) override; - - IProcessor::Status prepare() override; + void consume(Chunk chunk) override; + bool canGenerate(bool is_read_finished) override; + Chunk generate() override; private: SquashingTransform squashing; - /// When consumption is finished we need to release the final chunk regardless of its size. 
- bool finished = false; + Block current_block; }; } diff --git a/tests/queries/0_stateless/02115_rewrite_local_join_right_distribute_table.sql b/tests/queries/0_stateless/02115_rewrite_local_join_right_distribute_table.sql index 2ab324df787..d5ab82ba064 100644 --- a/tests/queries/0_stateless/02115_rewrite_local_join_right_distribute_table.sql +++ b/tests/queries/0_stateless/02115_rewrite_local_join_right_distribute_table.sql @@ -23,10 +23,6 @@ select t1.* from t1_all t1 join t2_all t2 on t1.a = t2.a ORDER BY t1.a; SELECT '-'; --- make sure data is fully written when reading from distributed -optimize table t1_local final; -optimize table t2_local final; - set distributed_product_mode = 'global'; select * from t1_all t1 where t1.a in (select t2.a from t2_all t2); explain syntax select t1.* from t1_all t1 join t2_all t2 on t1.a = t2.a; From 3f4f253c39b7118aab95b20af900d79cf1065cad Mon Sep 17 00:00:00 2001 From: MikhailBurdukov Date: Mon, 20 May 2024 08:09:55 +0000 Subject: [PATCH 248/392] Enable keep_free_space_bytes for metadata storage --- .../ObjectStorages/MetadataStorageFactory.cpp | 4 ++- ...02963_test_flexible_disk_configuration.sql | 26 +++++++++++++++++-- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/src/Disks/ObjectStorages/MetadataStorageFactory.cpp b/src/Disks/ObjectStorages/MetadataStorageFactory.cpp index 4a3e8a37d28..ab7c2069b43 100644 --- a/src/Disks/ObjectStorages/MetadataStorageFactory.cpp +++ b/src/Disks/ObjectStorages/MetadataStorageFactory.cpp @@ -99,8 +99,10 @@ void registerMetadataStorageFromDisk(MetadataStorageFactory & factory) { auto metadata_path = config.getString(config_prefix + ".metadata_path", fs::path(Context::getGlobalContextInstance()->getPath()) / "disks" / name / ""); + auto metadata_keep_free_space_bytes = config.getUInt64(config_prefix + ".metadata_keep_free_space_bytes", 0); + fs::create_directories(metadata_path); - auto metadata_disk = std::make_shared(name + "-metadata", metadata_path, 0, config, config_prefix); + auto metadata_disk = std::make_shared(name + "-metadata", metadata_path, metadata_keep_free_space_bytes, config, config_prefix); auto key_compatibility_prefix = getObjectKeyCompatiblePrefix(*object_storage, config, config_prefix); return std::make_shared(metadata_disk, key_compatibility_prefix); }); diff --git a/tests/queries/0_stateless/02963_test_flexible_disk_configuration.sql b/tests/queries/0_stateless/02963_test_flexible_disk_configuration.sql index 552291b2f83..8f67cd7e030 100644 --- a/tests/queries/0_stateless/02963_test_flexible_disk_configuration.sql +++ b/tests/queries/0_stateless/02963_test_flexible_disk_configuration.sql @@ -30,6 +30,28 @@ settings disk=disk(name='test2', drop table test; create table test (a Int32) engine = MergeTree() order by tuple() settings disk=disk(name='test3', + type = object_storage, + object_storage_type = s3, + metadata_storage_type = local, + metadata_keep_free_space_bytes = 1024, + endpoint = 'http://localhost:11111/test/common/', + access_key_id = clickhouse, + secret_access_key = clickhouse); +drop table test; + +create table test (a Int32) engine = MergeTree() order by tuple() +settings disk=disk(name='test4', + type = object_storage, + object_storage_type = s3, + metadata_storage_type = local, + metadata_keep_free_space_bytes = 0, + endpoint = 'http://localhost:11111/test/common/', + access_key_id = clickhouse, + secret_access_key = clickhouse); +drop table test; + +create table test (a Int32) engine = MergeTree() order by tuple() +settings disk=disk(name='test5', type = 
object_storage, object_storage_type = s3, metadata_type = lll, @@ -38,7 +60,7 @@ settings disk=disk(name='test3', secret_access_key = clickhouse); -- { serverError UNKNOWN_ELEMENT_IN_CONFIG } create table test (a Int32) engine = MergeTree() order by tuple() -settings disk=disk(name='test4', +settings disk=disk(name='test6', type = object_storage, object_storage_type = kkk, metadata_type = local, @@ -47,7 +69,7 @@ settings disk=disk(name='test4', secret_access_key = clickhouse); -- { serverError UNKNOWN_ELEMENT_IN_CONFIG } create table test (a Int32) engine = MergeTree() order by tuple() -settings disk=disk(name='test5', +settings disk=disk(name='test7', type = kkk, object_storage_type = s3, metadata_type = local, From e055de32bedb80dff96bd0f8809e967dafe1c0cb Mon Sep 17 00:00:00 2001 From: MikhailBurdukov Date: Mon, 20 May 2024 08:11:48 +0000 Subject: [PATCH 249/392] Add docs --- docs/en/operations/storing-data.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/en/operations/storing-data.md b/docs/en/operations/storing-data.md index 9b316960750..53ecd66396d 100644 --- a/docs/en/operations/storing-data.md +++ b/docs/en/operations/storing-data.md @@ -421,6 +421,7 @@ Other parameters: * `skip_access_check` - If true, disk access checks will not be performed on disk start-up. Default value is `false`. * `read_resource` — Resource name to be used for [scheduling](/docs/en/operations/workload-scheduling.md) of read requests to this disk. Default value is empty string (IO scheduling is not enabled for this disk). * `write_resource` — Resource name to be used for [scheduling](/docs/en/operations/workload-scheduling.md) of write requests to this disk. Default value is empty string (IO scheduling is not enabled for this disk). +* `metadata_keep_free_space_bytes` - the amount of free metadata disk space to be reserved. Examples of working configurations can be found in integration tests directory (see e.g. [test_merge_tree_azure_blob_storage](https://github.com/ClickHouse/ClickHouse/blob/master/tests/integration/test_merge_tree_azure_blob_storage/configs/config.d/storage_conf.xml) or [test_azure_blob_storage_zero_copy_replication](https://github.com/ClickHouse/ClickHouse/blob/master/tests/integration/test_azure_blob_storage_zero_copy_replication/configs/config.d/storage_conf.xml)). 
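
For reference, a minimal sketch of how the new `metadata_keep_free_space_bytes` parameter is exercised through the dynamic disk syntax used by the test above; the disk name, endpoint and credentials are placeholders matching the local test mock, not a recommended production configuration:

```sql
-- reserve 1 KiB of free space on the local metadata disk of a dynamically defined S3 disk
CREATE TABLE t (a Int32) ENGINE = MergeTree() ORDER BY tuple()
SETTINGS disk = disk(
    name = 'example_disk',
    type = object_storage,
    object_storage_type = s3,
    metadata_storage_type = local,
    metadata_keep_free_space_bytes = 1024,
    endpoint = 'http://localhost:11111/test/common/',
    access_key_id = clickhouse,
    secret_access_key = clickhouse);
```
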
From 6e605030d14d1ddba62d97d42a47067d08a78d8b Mon Sep 17 00:00:00 2001 From: MikhailBurdukov Date: Tue, 21 May 2024 11:55:39 +0000 Subject: [PATCH 250/392] Trigger Ci From b899bd07cfdee3a2919583482c0da2354bbb348a Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Wed, 22 May 2024 16:12:33 +0200 Subject: [PATCH 251/392] Better --- utils/keeper-bench/Runner.cpp | 90 +++++++++++++++++++++-------------- 1 file changed, 54 insertions(+), 36 deletions(-) diff --git a/utils/keeper-bench/Runner.cpp b/utils/keeper-bench/Runner.cpp index a625a7f157d..ed7e09685f0 100644 --- a/utils/keeper-bench/Runner.cpp +++ b/utils/keeper-bench/Runner.cpp @@ -4,30 +4,28 @@ #include #include +#include #include -#include "Common/ConcurrentBoundedQueue.h" -#include "Common/Exception.h" -#include "Common/ZooKeeper/IKeeper.h" -#include "Common/ZooKeeper/ZooKeeperArgs.h" -#include "Common/ZooKeeper/ZooKeeperCommon.h" -#include "Common/ZooKeeper/ZooKeeperConstants.h" -#include -#include -#include "Coordination/KeeperSnapshotManager.h" -#include "Core/ColumnWithTypeAndName.h" -#include "Core/ColumnsWithTypeAndName.h" +#include +#include #include -#include "IO/ReadBuffer.h" -#include "IO/ReadBufferFromFile.h" -#include "base/Decimal.h" -#include "base/types.h" -#include +#include +#include +#include +#include #include #include #include -#include -#include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include namespace CurrentMetrics @@ -884,6 +882,7 @@ struct SetupNodeCollector if (initial_storage->container.contains(path)) return; + new_nodes = true; std::cerr << "Adding expected node " << path << std::endl; Coordination::Requests create_ops; @@ -923,11 +922,19 @@ struct SetupNodeCollector void generateSnapshot() { - std::cerr << "Generating snapshot with starting data" << std::endl; std::lock_guard lock(nodes_mutex); + if (!new_nodes) + { + std::cerr << "No new nodes added" << std::endl; + return; + } + + std::cerr << "Generating snapshot with starting data" << std::endl; DB::SnapshotMetadataPtr snapshot_meta = std::make_shared(initial_storage->getZXID(), 1, std::make_shared()); DB::KeeperStorageSnapshot snapshot(initial_storage.get(), snapshot_meta); snapshot_manager->serializeSnapshotToDisk(snapshot); + + new_nodes = false; } std::mutex nodes_mutex; @@ -935,6 +942,7 @@ struct SetupNodeCollector Coordination::KeeperStoragePtr initial_storage; std::unordered_set nodes_created_during_replay; std::optional snapshot_manager; + bool new_nodes = false; }; void dumpStats(std::string_view type, const RequestFromLogStats::Stats & stats_for_type) @@ -972,23 +980,25 @@ void requestFromLogExecutor(std::shared_ptrtoString(), response.error, *expected_result) - // << std::endl; +#if 0 + if (*expected_result != response.error) + { + std::cerr << fmt::format( + "Unexpected result for {}\ngot {}, expected {}\n", request->toString(), response.error, *expected_result) + << std::endl; - // if (const auto * multi_response = dynamic_cast(&response)) - // { - // std::string subresponses; - // for (size_t i = 0; i < multi_response->responses.size(); ++i) - // { - // subresponses += fmt::format("{} = {}\n", i, multi_response->responses[i]->error); - // } + if (const auto * multi_response = dynamic_cast(&response)) + { + std::string subresponses; + for (size_t i = 0; i < multi_response->responses.size(); ++i) + { + subresponses += fmt::format("{} = {}\n", i, multi_response->responses[i]->error); + } - // std::cerr << "Subresponses\n" << subresponses << std::endl; - // } - //} + std::cerr << 
"Subresponses\n" << subresponses << std::endl; + } + } +#endif } request_promise->set_value(); @@ -1048,8 +1058,16 @@ void Runner::runBenchmarkFromLog() pool->wait(); - dumpStats("Write", stats.write_requests); - dumpStats("Read", stats.read_requests); + + if (setup_nodes_collector) + { + setup_nodes_collector->generateSnapshot(); + } + else + { + dumpStats("Write", stats.write_requests); + dumpStats("Read", stats.read_requests); + } }); auto push_request = [&](RequestFromLog request) From e05305692eaf0a5a6cab6d72196b9575ccf56fa6 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Wed, 22 May 2024 16:33:01 +0200 Subject: [PATCH 252/392] Fix encrypted --- src/Disks/DiskEncrypted.h | 5 +++++ src/Disks/IDisk.h | 9 +++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/Disks/DiskEncrypted.h b/src/Disks/DiskEncrypted.h index 27000dcc8af..27cf3096344 100644 --- a/src/Disks/DiskEncrypted.h +++ b/src/Disks/DiskEncrypted.h @@ -350,6 +350,11 @@ public: return delegate; } + ObjectStoragePtr getObjectStorage() override + { + return delegate->getObjectStorage(); + } + private: String wrappedPath(const String & path) const { diff --git a/src/Disks/IDisk.h b/src/Disks/IDisk.h index 614fe413503..b59e5b7f558 100644 --- a/src/Disks/IDisk.h +++ b/src/Disks/IDisk.h @@ -116,13 +116,18 @@ public: /// Default constructor. IDisk(const String & name_, const Poco::Util::AbstractConfiguration & config, const String & config_prefix) : name(name_) - , copying_thread_pool(CurrentMetrics::IDiskCopierThreads, CurrentMetrics::IDiskCopierThreadsActive, CurrentMetrics::IDiskCopierThreadsScheduled, config.getUInt(config_prefix + ".thread_pool_size", 16)) + , copying_thread_pool( + CurrentMetrics::IDiskCopierThreads, + CurrentMetrics::IDiskCopierThreadsActive, + CurrentMetrics::IDiskCopierThreadsScheduled, + config.getUInt(config_prefix + ".thread_pool_size", 16)) { } explicit IDisk(const String & name_) : name(name_) - , copying_thread_pool(CurrentMetrics::IDiskCopierThreads, CurrentMetrics::IDiskCopierThreadsActive, CurrentMetrics::IDiskCopierThreadsScheduled, 16) + , copying_thread_pool( + CurrentMetrics::IDiskCopierThreads, CurrentMetrics::IDiskCopierThreadsActive, CurrentMetrics::IDiskCopierThreadsScheduled, 16) { } From 39eef359dbc142c53d9f0162a36f0fee74e5edcc Mon Sep 17 00:00:00 2001 From: vdimir Date: Wed, 22 May 2024 14:39:13 +0000 Subject: [PATCH 253/392] Add IInflatingTransform::getRemaining instead of flag in canGenerate --- src/Processors/IInflatingTransform.cpp | 21 ++++++++++------ src/Processors/IInflatingTransform.h | 7 +++--- .../Transforms/ArrayJoinTransform.cpp | 4 +-- .../Transforms/ArrayJoinTransform.h | 2 +- .../Transforms/SquashingChunksTransform.cpp | 25 +++++++++---------- .../Transforms/SquashingChunksTransform.h | 7 +++--- 6 files changed, 37 insertions(+), 29 deletions(-) diff --git a/src/Processors/IInflatingTransform.cpp b/src/Processors/IInflatingTransform.cpp index bc0b3e8459e..a59eda0feb2 100644 --- a/src/Processors/IInflatingTransform.cpp +++ b/src/Processors/IInflatingTransform.cpp @@ -76,17 +76,24 @@ void IInflatingTransform::work() current_chunk = generate(); generated = true; - can_generate = canGenerate(is_finished); + can_generate = canGenerate(); + } + else if (is_finished) + { + if (can_generate || generated || has_input) + throw Exception(ErrorCodes::LOGICAL_ERROR, "IInflatingTransform cannot finish work because it has generated data or has input data"); + + current_chunk = getRemaining(); + generated = !current_chunk.empty(); } else { - if (has_input) - { - 
consume(std::move(current_chunk)); - has_input = false; - } + if (!has_input) + throw Exception(ErrorCodes::LOGICAL_ERROR, "IInflatingTransform cannot consume chunk because it wasn't read"); - can_generate = canGenerate(is_finished); + consume(std::move(current_chunk)); + has_input = false; + can_generate = canGenerate(); } } diff --git a/src/Processors/IInflatingTransform.h b/src/Processors/IInflatingTransform.h index 3f832b0e5bc..0cb7fc06cc4 100644 --- a/src/Processors/IInflatingTransform.h +++ b/src/Processors/IInflatingTransform.h @@ -16,8 +16,8 @@ namespace DB /// ... (process transformed chunk) /// } /// } -/// while (transform.canGenerate(true)) -/// ... (process remaining data) +/// transformed_chunk = transform.getRemaining(); +/// ... (process remaining data) /// class IInflatingTransform : public IProcessor { @@ -31,8 +31,9 @@ protected: bool can_generate = false; virtual void consume(Chunk chunk) = 0; - virtual bool canGenerate(bool is_read_finished) = 0; + virtual bool canGenerate() = 0; virtual Chunk generate() = 0; + virtual Chunk getRemaining() { return {}; } public: IInflatingTransform(Block input_header, Block output_header); diff --git a/src/Processors/Transforms/ArrayJoinTransform.cpp b/src/Processors/Transforms/ArrayJoinTransform.cpp index b7a6ba85963..1304434d74e 100644 --- a/src/Processors/Transforms/ArrayJoinTransform.cpp +++ b/src/Processors/Transforms/ArrayJoinTransform.cpp @@ -38,14 +38,14 @@ void ArrayJoinTransform::consume(Chunk chunk) } -bool ArrayJoinTransform::canGenerate(bool) +bool ArrayJoinTransform::canGenerate() { return result_iterator && result_iterator->hasNext(); } Chunk ArrayJoinTransform::generate() { - if (!canGenerate(false)) + if (!canGenerate()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Can't generate chunk in ArrayJoinTransform"); auto block = result_iterator->next(); diff --git a/src/Processors/Transforms/ArrayJoinTransform.h b/src/Processors/Transforms/ArrayJoinTransform.h index de291a0422f..4219135982d 100644 --- a/src/Processors/Transforms/ArrayJoinTransform.h +++ b/src/Processors/Transforms/ArrayJoinTransform.h @@ -26,7 +26,7 @@ public: protected: void consume(Chunk chunk) override; - bool canGenerate(bool is_read_finished) override; + bool canGenerate() override; Chunk generate() override; private: diff --git a/src/Processors/Transforms/SquashingChunksTransform.cpp b/src/Processors/Transforms/SquashingChunksTransform.cpp index b79987161fd..267490dc89e 100644 --- a/src/Processors/Transforms/SquashingChunksTransform.cpp +++ b/src/Processors/Transforms/SquashingChunksTransform.cpp @@ -62,29 +62,28 @@ SimpleSquashingChunksTransform::SimpleSquashingChunksTransform( void SimpleSquashingChunksTransform::consume(Chunk chunk) { - current_block = squashing.add(getInputPort().getHeader().cloneWithColumns(chunk.detachColumns())); + Block current_block = squashing.add(getInputPort().getHeader().cloneWithColumns(chunk.detachColumns())); + squashed_chunk.setColumns(current_block.getColumns(), current_block.rows()); } Chunk SimpleSquashingChunksTransform::generate() { - if (!current_block) + if (squashed_chunk.empty()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Can't generate chunk in SimpleSquashingChunksTransform"); - Chunk result(current_block.getColumns(), current_block.rows()); - current_block.clear(); - return result; + return std::move(squashed_chunk); } - -bool SimpleSquashingChunksTransform::canGenerate(bool is_read_finished) +bool SimpleSquashingChunksTransform::canGenerate() { - if (current_block) - return true; + return 
!squashed_chunk.empty(); +} - if (is_read_finished) - current_block = squashing.add({}); - - return bool(current_block); +Chunk SimpleSquashingChunksTransform::getRemaining() +{ + Block current_block = squashing.add({}); + squashed_chunk.setColumns(current_block.getColumns(), current_block.rows()); + return std::move(squashed_chunk); } } diff --git a/src/Processors/Transforms/SquashingChunksTransform.h b/src/Processors/Transforms/SquashingChunksTransform.h index d0316c39a43..8c30a6032e4 100644 --- a/src/Processors/Transforms/SquashingChunksTransform.h +++ b/src/Processors/Transforms/SquashingChunksTransform.h @@ -39,12 +39,13 @@ public: protected: void consume(Chunk chunk) override; - bool canGenerate(bool is_read_finished) override; + bool canGenerate() override; Chunk generate() override; + Chunk getRemaining() override; private: SquashingTransform squashing; - - Block current_block; + Chunk squashed_chunk; }; + } From 7e0e953ec9913435505d75285d1e5244c869a797 Mon Sep 17 00:00:00 2001 From: vdimir Date: Tue, 21 May 2024 17:01:16 +0000 Subject: [PATCH 254/392] Add debug logging to EmbeddedRocksDBBulkSink --- src/Storages/RocksDB/EmbeddedRocksDBBulkSink.cpp | 13 ++++++++----- src/Storages/RocksDB/EmbeddedRocksDBBulkSink.h | 2 +- src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp | 16 ++++++++-------- 3 files changed, 17 insertions(+), 14 deletions(-) diff --git a/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.cpp b/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.cpp index 7094578a9cc..0baa234e7a3 100644 --- a/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.cpp +++ b/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.cpp @@ -155,7 +155,7 @@ std::vector EmbeddedRocksDBBulkSink::squash(Chunk chunk) return {}; } -std::pair EmbeddedRocksDBBulkSink::serializeChunks(const std::vector & input_chunks) const +std::pair EmbeddedRocksDBBulkSink::serializeChunks(std::vector && input_chunks) const { auto serialized_key_column = ColumnString::create(); auto serialized_value_column = ColumnString::create(); @@ -168,7 +168,7 @@ std::pair EmbeddedRocksDBBulkSink::seriali WriteBufferFromVector writer_key(serialized_key_data); WriteBufferFromVector writer_value(serialized_value_data); - for (const auto & chunk : input_chunks) + for (auto && chunk : input_chunks) { const auto & columns = chunk.getColumns(); auto rows = chunk.getNumRows(); @@ -193,13 +193,14 @@ std::pair EmbeddedRocksDBBulkSink::seriali void EmbeddedRocksDBBulkSink::consume(Chunk chunk_) { - std::vector to_written = squash(std::move(chunk_)); + std::vector chunks_to_write = squash(std::move(chunk_)); - if (to_written.empty()) + if (chunks_to_write.empty()) return; - auto [serialized_key_column, serialized_value_column] = serializeChunks(to_written); + auto [serialized_key_column, serialized_value_column] = serializeChunks(std::move(chunks_to_write)); auto sst_file_path = getTemporarySSTFilePath(); + LOG_DEBUG(getLogger("EmbeddedRocksDBBulkSink"), "Writing {} rows to SST file {}", serialized_key_column->size(), sst_file_path); if (auto status = buildSSTFile(sst_file_path, *serialized_key_column, *serialized_value_column); !status.ok()) throw Exception(ErrorCodes::ROCKSDB_ERROR, "RocksDB write error: {}", status.ToString()); @@ -209,6 +210,7 @@ void EmbeddedRocksDBBulkSink::consume(Chunk chunk_) if (auto status = storage.rocksdb_ptr->IngestExternalFile({sst_file_path}, ingest_options); !status.ok()) throw Exception(ErrorCodes::ROCKSDB_ERROR, "RocksDB write error: {}", status.ToString()); + LOG_DEBUG(getLogger("EmbeddedRocksDBBulkSink"), "SST file {} has been 
ingested", sst_file_path); if (fs::exists(sst_file_path)) (void)fs::remove(sst_file_path); } @@ -237,4 +239,5 @@ bool EmbeddedRocksDBBulkSink::isEnoughSize(const Chunk & chunk) const { return chunk.getNumRows() >= min_block_size_rows; } + } diff --git a/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.h b/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.h index 19ce1e3b83e..46193b152ca 100644 --- a/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.h +++ b/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.h @@ -49,7 +49,7 @@ private: bool isEnoughSize(const std::vector & input_chunks) const; bool isEnoughSize(const Chunk & chunk) const; /// Serialize chunks to rocksdb key-value pairs - std::pair serializeChunks(const std::vector & input_chunks) const; + std::pair serializeChunks(std::vector && input_chunks) const; StorageEmbeddedRocksDB & storage; StorageMetadataPtr metadata_snapshot; diff --git a/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp b/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp index 1a9aa6d0f41..e00cea27c49 100644 --- a/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp +++ b/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp @@ -316,6 +316,7 @@ void StorageEmbeddedRocksDB::mutate(const MutationCommands & commands, ContextPt void StorageEmbeddedRocksDB::drop() { + std::lock_guard lock(rocksdb_ptr_mx); rocksdb_ptr->Close(); rocksdb_ptr = nullptr; } @@ -463,18 +464,13 @@ void StorageEmbeddedRocksDB::initDB() { rocksdb::DB * db; if (read_only) - { status = rocksdb::DB::OpenForReadOnly(merged, rocksdb_dir, &db); - } else - { status = rocksdb::DB::Open(merged, rocksdb_dir, &db); - } + if (!status.ok()) - { - throw Exception(ErrorCodes::ROCKSDB_ERROR, "Failed to open rocksdb path at: {}: {}", - rocksdb_dir, status.ToString()); - } + throw Exception(ErrorCodes::ROCKSDB_ERROR, "Failed to open rocksdb path at: {}: {}", rocksdb_dir, status.ToString()); + rocksdb_ptr = std::unique_ptr(db); } } @@ -589,8 +585,12 @@ SinkToStoragePtr StorageEmbeddedRocksDB::write( const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, ContextPtr query_context, bool /*async_insert*/) { if (getSettings().optimize_for_bulk_insert) + { + LOG_DEBUG(getLogger("StorageEmbeddedRocksDB"), "Using bulk insert"); return std::make_shared(query_context, *this, metadata_snapshot); + } + LOG_DEBUG(getLogger("StorageEmbeddedRocksDB"), "Using regular insert"); return std::make_shared(*this, metadata_snapshot); } From 7314689712549c1c2bf528fc8ef7638a2eb77ddf Mon Sep 17 00:00:00 2001 From: vdimir Date: Wed, 22 May 2024 11:04:17 +0000 Subject: [PATCH 255/392] Store logger in StorageEmbeddedRocksDB --- src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp | 5 +++-- src/Storages/RocksDB/StorageEmbeddedRocksDB.h | 2 ++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp b/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp index e00cea27c49..c3b7ae64c7e 100644 --- a/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp +++ b/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp @@ -189,6 +189,7 @@ StorageEmbeddedRocksDB::StorageEmbeddedRocksDB(const StorageID & table_id_, , rocksdb_dir(std::move(rocksdb_dir_)) , ttl(ttl_) , read_only(read_only_) + , log(getLogger(fmt::format("StorageEmbeddedRocksDB ({})", getStorageID().getNameForLogs()))) { setInMemoryMetadata(metadata_); setSettings(std::move(settings_)); @@ -586,11 +587,11 @@ SinkToStoragePtr StorageEmbeddedRocksDB::write( { if (getSettings().optimize_for_bulk_insert) { - LOG_DEBUG(getLogger("StorageEmbeddedRocksDB"), "Using bulk insert"); + 
LOG_DEBUG(log, "Using bulk insert"); return std::make_shared(query_context, *this, metadata_snapshot); } - LOG_DEBUG(getLogger("StorageEmbeddedRocksDB"), "Using regular insert"); + LOG_DEBUG(log, "Using regular insert"); return std::make_shared(*this, metadata_snapshot); } diff --git a/src/Storages/RocksDB/StorageEmbeddedRocksDB.h b/src/Storages/RocksDB/StorageEmbeddedRocksDB.h index 9fc58ea6b38..61592398954 100644 --- a/src/Storages/RocksDB/StorageEmbeddedRocksDB.h +++ b/src/Storages/RocksDB/StorageEmbeddedRocksDB.h @@ -124,5 +124,7 @@ private: bool read_only; void initDB(); + + LoggerPtr log; }; } From 6f4a8bf2ea5bff2afd619f1bad8b034b325bcbfa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Wed, 22 May 2024 17:32:01 +0200 Subject: [PATCH 256/392] Simplify test --- .../03033_final_undefined_last_mark.reference | 4 ++-- .../03033_final_undefined_last_mark.sql | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/queries/0_stateless/03033_final_undefined_last_mark.reference b/tests/queries/0_stateless/03033_final_undefined_last_mark.reference index bf0a25f24e4..a30b755709b 100644 --- a/tests/queries/0_stateless/03033_final_undefined_last_mark.reference +++ b/tests/queries/0_stateless/03033_final_undefined_last_mark.reference @@ -1,2 +1,2 @@ -GOOD 11338881281426660955 14765404159170880511 -GOOD 11338881281426660955 14765404159170880511 +Disabled 11338881281426660955 14765404159170880511 +Enabled 11338881281426660955 14765404159170880511 diff --git a/tests/queries/0_stateless/03033_final_undefined_last_mark.sql b/tests/queries/0_stateless/03033_final_undefined_last_mark.sql index 2c13da42ca4..25a30a365a5 100644 --- a/tests/queries/0_stateless/03033_final_undefined_last_mark.sql +++ b/tests/queries/0_stateless/03033_final_undefined_last_mark.sql @@ -1,23 +1,23 @@ -- Tags: no-random-settings, no-random-merge-tree-settings +DROP TABLE IF EXISTS account_test; + CREATE TABLE account_test ( `id` UInt64, `row_ver` UInt64, ) ENGINE = ReplacingMergeTree(row_ver) -PARTITION BY id % 64 ORDER BY id -SETTINGS index_granularity = 512, index_granularity_bytes = 0, +SETTINGS index_granularity = 16, index_granularity_bytes = 0, min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0, min_rows_for_compact_part = 0, min_bytes_for_compact_part = 0; -INSERT INTO account_test - SELECT * FROM generateRandom('id UInt64, row_ver UInt64',1234) LIMIT 50000; +SYSTEM STOP MERGES account_test; -INSERT INTO account_test - SELECT * FROM (SELECT * FROM generateRandom('id UInt64, row_ver UInt64',1234) LIMIT 1000) WHERE row_ver > 14098131981223776000; +INSERT INTO account_test VALUES (11338881281426660955,717769962224129342),(12484100559155738267,7950971667203174918),(7603729260199571867,3255798127676911942),(7023543111808724827,911615979861855126),(10293135086416484571,3264379259750736572),(15561193439904316763,8419819469587131454),(17632407413882870235,7252071832370181502),(17009726455991851227,7525297506591593939),(12392078953873778779,8473049173389293961),(15283366022689446555,11692491360262171467),(9087459014730986523,2783662960221838603),(293823584550906267,4847630088179732782),(15693186194430465755,8163804880526285623),(7353080168325584795,17315892478487497859),(5980311238303466523,6943353798059390089),(14242621660019578011,8684624667957352769),(8241843507567433563,15731952080102886438); +INSERT INTO account_test VALUES (11338881281426660955, 14765404159170880511); -SELECT 'GOOD', * FROM account_test FINAL WHERE id = 11338881281426660955 SETTINGS 
split_parts_ranges_into_intersecting_and_non_intersecting_final = 0; -SELECT 'GOOD', * FROM account_test FINAL WHERE id = 11338881281426660955 SETTINGS split_parts_ranges_into_intersecting_and_non_intersecting_final = 1; +SELECT 'Disabled', * FROM account_test FINAL WHERE id = 11338881281426660955 SETTINGS split_parts_ranges_into_intersecting_and_non_intersecting_final = 0; +SELECT 'Enabled', * FROM account_test FINAL WHERE id = 11338881281426660955 SETTINGS split_parts_ranges_into_intersecting_and_non_intersecting_final = 1; From 48cab9e9dbeb16d1be33bdcce9206c472445cd9f Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 22 May 2024 15:53:32 +0000 Subject: [PATCH 257/392] Fix tests --- src/Columns/ColumnDynamic.cpp | 6 +++--- src/Columns/ColumnDynamic.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/Columns/ColumnDynamic.cpp b/src/Columns/ColumnDynamic.cpp index d63a03dbafd..3c147b6f123 100644 --- a/src/Columns/ColumnDynamic.cpp +++ b/src/Columns/ColumnDynamic.cpp @@ -9,7 +9,7 @@ #include #include #include - +#include namespace DB { @@ -662,8 +662,8 @@ void ColumnDynamic::takeDynamicStructureFromSourceColumns(const Columns & source all_variants.push_back(source_variants[i]); it = total_sizes.emplace(variant_name, 0).first; } - - size_t size = source_statistics.data.empty() ? source_variant_column.getVariantByGlobalDiscriminator(i).size() : source_statistics.data.at(variant_name); + auto statistics_it = source_statistics.data.find(variant_name); + size_t size = statistics_it == source_statistics.data.end() ? source_variant_column.getVariantByGlobalDiscriminator(i).size() : statistics_it->second; it->second += size; } } diff --git a/src/Columns/ColumnDynamic.h b/src/Columns/ColumnDynamic.h index 8aece765308..27ad0dd583f 100644 --- a/src/Columns/ColumnDynamic.h +++ b/src/Columns/ColumnDynamic.h @@ -96,13 +96,13 @@ public: MutableColumnPtr cloneEmpty() const override { - /// Keep current dynamic structure but not statistics. 
- return Base::create(variant_column->cloneEmpty(), variant_info, max_dynamic_types); + /// Keep current dynamic structure + return Base::create(variant_column->cloneEmpty(), variant_info, max_dynamic_types, statistics); } MutableColumnPtr cloneResized(size_t size) const override { - return Base::create(variant_column->cloneResized(size), variant_info, max_dynamic_types); + return Base::create(variant_column->cloneResized(size), variant_info, max_dynamic_types, statistics); } size_t size() const override From 332f449a0cec30616180266d4a43a4e658794b1f Mon Sep 17 00:00:00 2001 From: Danila Puzov Date: Wed, 22 May 2024 18:59:39 +0300 Subject: [PATCH 258/392] Issues --- src/Functions/generateSnowflakeID.cpp | 272 +++++++++++------- src/Functions/serial.cpp | 67 +++-- .../03129_serial_test_zookeeper.sql | 16 +- .../03130_generateSnowflakeId.reference | 11 + .../0_stateless/03130_generateSnowflakeId.sql | 29 ++ .../03130_generate_snowflake_id.reference | 3 - .../03130_generate_snowflake_id.sql | 11 - 7 files changed, 252 insertions(+), 157 deletions(-) create mode 100644 tests/queries/0_stateless/03130_generateSnowflakeId.reference create mode 100644 tests/queries/0_stateless/03130_generateSnowflakeId.sql delete mode 100644 tests/queries/0_stateless/03130_generate_snowflake_id.reference delete mode 100644 tests/queries/0_stateless/03130_generate_snowflake_id.sql diff --git a/src/Functions/generateSnowflakeID.cpp b/src/Functions/generateSnowflakeID.cpp index d70b8349cd8..6ae5dc13af0 100644 --- a/src/Functions/generateSnowflakeID.cpp +++ b/src/Functions/generateSnowflakeID.cpp @@ -5,6 +5,7 @@ #include #include #include +#include "base/types.h" namespace DB @@ -34,43 +35,153 @@ namespace - The last 12 bits are a counter to disambiguate multiple snowflakeIDs generated within the same millisecond by differen processes */ +/// bit counts constexpr auto timestamp_bits_count = 41; constexpr auto machine_id_bits_count = 10; constexpr auto machine_seq_num_bits_count = 12; -constexpr int64_t timestamp_mask = ((1LL << timestamp_bits_count) - 1) << (machine_id_bits_count + machine_seq_num_bits_count); -constexpr int64_t machine_id_mask = ((1LL << machine_id_bits_count) - 1) << machine_seq_num_bits_count; -constexpr int64_t machine_seq_num_mask = (1LL << machine_seq_num_bits_count) - 1; -constexpr int64_t max_machine_seq_num = machine_seq_num_mask; +/// bits masks for Snowflake ID components +// constexpr uint64_t timestamp_mask = ((1ULL << timestamp_bits_count) - 1) << (machine_id_bits_count + machine_seq_num_bits_count); // unused +constexpr uint64_t machine_id_mask = ((1ULL << machine_id_bits_count) - 1) << machine_seq_num_bits_count; +constexpr uint64_t machine_seq_num_mask = (1ULL << machine_seq_num_bits_count) - 1; -Int64 getMachineID() +/// max values +constexpr uint64_t max_machine_seq_num = machine_seq_num_mask; + +uint64_t getMachineID() { UUID server_uuid = ServerUUID::get(); /// hash into 64 bits - UInt64 hi = UUIDHelpers::getHighBytes(server_uuid); - UInt64 lo = UUIDHelpers::getLowBytes(server_uuid); - return ((hi * 11) ^ (lo * 17)) & machine_id_mask; + uint64_t hi = UUIDHelpers::getHighBytes(server_uuid); + uint64_t lo = UUIDHelpers::getLowBytes(server_uuid); + /// return only 10 bits + return (((hi * 11) ^ (lo * 17)) & machine_id_mask) >> machine_seq_num_bits_count; } -Int64 getTimestamp() +uint64_t getTimestamp() { auto now = std::chrono::system_clock::now(); auto ticks_since_epoch = std::chrono::duration_cast(now.time_since_epoch()).count(); - return ticks_since_epoch & ((1LL << 
timestamp_bits_count) - 1); + return static_cast(ticks_since_epoch) & ((1ULL << timestamp_bits_count) - 1); } +struct SnowflakeComponents { + uint64_t timestamp; + uint64_t machind_id; + uint64_t machine_seq_num; +}; + +SnowflakeComponents toComponents(uint64_t snowflake) { + return { + .timestamp = (snowflake >> (machine_id_bits_count + machine_seq_num_bits_count)), + .machind_id = ((snowflake & machine_id_mask) >> machine_seq_num_bits_count), + .machine_seq_num = (snowflake & machine_seq_num_mask) + }; } -class FunctionSnowflakeID : public IFunction +uint64_t toSnowflakeID(SnowflakeComponents components) { + return (components.timestamp << (machine_id_bits_count + machine_seq_num_bits_count) | + components.machind_id << (machine_seq_num_bits_count) | + components.machine_seq_num); +} + +struct RangeOfSnowflakeIDs { + /// [begin, end) + SnowflakeComponents begin, end; +}; + +/* Get range of `input_rows_count` Snowflake IDs from `max(available, now)` + +1. Calculate Snowflake ID by current timestamp (`now`) +2. `begin = max(available, now)` +3. Calculate `end = begin + input_rows_count` handling `machine_seq_num` overflow +*/ +RangeOfSnowflakeIDs getRangeOfAvailableIDs(const SnowflakeComponents& available, size_t input_rows_count) { -private: - mutable std::atomic lowest_available_snowflake_id = 0; /// atomic to avoid a mutex + /// 1. `now` + SnowflakeComponents begin = { + .timestamp = getTimestamp(), + .machind_id = getMachineID(), + .machine_seq_num = 0 + }; -public: + /// 2. `begin` + if (begin.timestamp <= available.timestamp) + { + begin.timestamp = available.timestamp; + begin.machine_seq_num = available.machine_seq_num; + } + + /// 3. `end = begin + input_rows_count` + SnowflakeComponents end; + const uint64_t seq_nums_in_current_timestamp_left = (max_machine_seq_num - begin.machine_seq_num + 1); + if (input_rows_count >= seq_nums_in_current_timestamp_left) + /// if sequence numbers in current timestamp is not enough for rows => update timestamp + end.timestamp = begin.timestamp + 1 + (input_rows_count - seq_nums_in_current_timestamp_left) / (max_machine_seq_num + 1); + else + end.timestamp = begin.timestamp; + + end.machind_id = begin.machind_id; + end.machine_seq_num = (begin.machine_seq_num + input_rows_count) & machine_seq_num_mask; + + return {begin, end}; +} + +struct GlobalCounterPolicy +{ static constexpr auto name = "generateSnowflakeID"; - static FunctionPtr create(ContextPtr /*context*/) { return std::make_shared(); } + static constexpr auto doc_description = R"(Generates a Snowflake ID. The generated Snowflake ID contains the current Unix timestamp in milliseconds 41 (+ 1 top zero bit) bits, followed by machine id (10 bits), a counter (12 bits) to distinguish IDs within a millisecond. For any given timestamp (unix_ts_ms), the counter starts at 0 and is incremented by 1 for each new Snowflake ID until the timestamp changes. In case the counter overflows, the timestamp field is incremented by 1 and the counter is reset to 0. Function generateSnowflakeID guarantees that the counter field within a timestamp increments monotonically across all function invocations in concurrently running threads and queries.)"; - String getName() const override { return name; } + /// Guarantee counter monotonicity within one timestamp across all threads generating Snowflake IDs simultaneously. 
+ struct Data + { + static inline std::atomic lowest_available_snowflake_id = 0; + + SnowflakeComponents reserveRange(size_t input_rows_count) + { + uint64_t available_snowflake_id = lowest_available_snowflake_id.load(); + RangeOfSnowflakeIDs range; + do + { + range = getRangeOfAvailableIDs(toComponents(available_snowflake_id), input_rows_count); + } + while (!lowest_available_snowflake_id.compare_exchange_weak(available_snowflake_id, toSnowflakeID(range.end))); + /// if `compare_exhange` failed => another thread updated `lowest_available_snowflake_id` and we should try again + /// completed => range of IDs [begin, end) is reserved, can return the beginning of the range + + return range.begin; + } + }; +}; + +struct ThreadLocalCounterPolicy +{ + static constexpr auto name = "generateSnowflakeIDThreadMonotonic"; + static constexpr auto doc_description = R"(Generates a Snowflake ID. The generated Snowflake ID contains the current Unix timestamp in milliseconds 41 (+ 1 top zero bit) bits, followed by machine id (10 bits), a counter (12 bits) to distinguish IDs within a millisecond. For any given timestamp (unix_ts_ms), the counter starts at 0 and is incremented by 1 for each new Snowflake ID until the timestamp changes. In case the counter overflows, the timestamp field is incremented by 1 and the counter is reset to 0. This function behaves like generateSnowflakeID but gives no guarantee on counter monotony across different simultaneous requests. Monotonicity within one timestamp is guaranteed only within the same thread calling this function to generate Snowflake IDs.)"; + + /// Guarantee counter monotonicity within one timestamp within the same thread. Faster than GlobalCounterPolicy if a query uses multiple threads. + struct Data + { + static inline thread_local uint64_t lowest_available_snowflake_id = 0; + + SnowflakeComponents reserveRange(size_t input_rows_count) + { + RangeOfSnowflakeIDs range = getRangeOfAvailableIDs(toComponents(lowest_available_snowflake_id), input_rows_count); + lowest_available_snowflake_id = toSnowflakeID(range.end); + return range.begin; + } + }; +}; + +} + +template +class FunctionGenerateSnowflakeID : public IFunction, public FillPolicy +{ +public: + static FunctionPtr create(ContextPtr /*context*/) { return std::make_shared(); } + + String getName() const override { return FillPolicy::name; } size_t getNumberOfArguments() const override { return 0; } bool isDeterministic() const override { return false; } bool isDeterministicInScopeOfQuery() const override { return false; } @@ -80,71 +191,36 @@ public: DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override { - if (!arguments.empty()) { - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, - "Number of arguments for function {} doesn't match: passed {}, should be 0.", - getName(), arguments.size()); - } - return std::make_shared(); + FunctionArgumentDescriptors mandatory_args; + FunctionArgumentDescriptors optional_args{ + {"expr", nullptr, nullptr, "Arbitrary Expression"} + }; + validateFunctionArgumentTypes(*this, arguments, mandatory_args, optional_args); + + return std::make_shared(); } ColumnPtr executeImpl(const ColumnsWithTypeAndName & /*arguments*/, const DataTypePtr &, size_t input_rows_count) const override { - auto col_res = ColumnVector::create(); - typename ColumnVector::Container & vec_to = col_res->getData(); + auto col_res = ColumnVector::create(); + typename ColumnVector::Container & vec_to = col_res->getData(); vec_to.resize(input_rows_count); - if 
(input_rows_count == 0) { - return col_res; - } - - const Int64 machine_id = getMachineID(); - Int64 current_timestamp = getTimestamp(); - Int64 current_machine_seq_num; - - Int64 available_snowflake_id, next_available_snowflake_id; - - const Int64 input_rows_count_signed = static_cast(input_rows_count); - - do + if (input_rows_count != 0) { - available_snowflake_id = lowest_available_snowflake_id.load(); - const Int64 available_timestamp = (available_snowflake_id & timestamp_mask) >> (machine_id_bits_count + machine_seq_num_bits_count); - const Int64 available_machine_seq_num = available_snowflake_id & machine_seq_num_mask; + typename FillPolicy::Data data; + /// get the begin of available snowflake ids range + SnowflakeComponents snowflake_id = data.reserveRange(input_rows_count); - if (current_timestamp > available_timestamp) + for (UInt64 & to_row : vec_to) { - /// handle overflow - current_machine_seq_num = 0; - } - else - { - current_timestamp = available_timestamp; - current_machine_seq_num = available_machine_seq_num; - } - - /// calculate new lowest_available_snowflake_id - const Int64 seq_nums_in_current_timestamp_left = (max_machine_seq_num - current_machine_seq_num + 1); - Int64 new_timestamp; - if (input_rows_count_signed >= seq_nums_in_current_timestamp_left) - new_timestamp = current_timestamp + 1 + (input_rows_count_signed - seq_nums_in_current_timestamp_left) / max_machine_seq_num; - else - new_timestamp = current_timestamp; - const Int64 new_machine_seq_num = (current_machine_seq_num + input_rows_count_signed) & machine_seq_num_mask; - next_available_snowflake_id = (new_timestamp << (machine_id_bits_count + machine_seq_num_bits_count)) | machine_id | new_machine_seq_num; - } - while (!lowest_available_snowflake_id.compare_exchange_strong(available_snowflake_id, next_available_snowflake_id)); - /// failed CAS => another thread updated `lowest_available_snowflake_id` - /// successful CAS => we have our range of exclusive values - - for (Int64 & to_row : vec_to) - { - to_row = (current_timestamp << (machine_id_bits_count + machine_seq_num_bits_count)) | machine_id | current_machine_seq_num; - if (current_machine_seq_num++ == max_machine_seq_num) - { - current_machine_seq_num = 0; - ++current_timestamp; + to_row = toSnowflakeID(snowflake_id); + if (snowflake_id.machine_seq_num++ == max_machine_seq_num) + { + snowflake_id.machine_seq_num = 0; + ++snowflake_id.timestamp; + } } } @@ -153,43 +229,27 @@ public: }; +template +void registerSnowflakeIDGenerator(auto& factory) +{ + static constexpr auto doc_syntax_format = "{}([expression])"; + static constexpr auto example_format = "SELECT {}()"; + static constexpr auto multiple_example_format = "SELECT {f}(1), {f}(2)"; + + FunctionDocumentation::Description doc_description = FillPolicy::doc_description; + FunctionDocumentation::Syntax doc_syntax = fmt::format(doc_syntax_format, FillPolicy::name); + FunctionDocumentation::Arguments doc_arguments = {{"expression", "The expression is used to bypass common subexpression elimination if the function is called multiple times in a query but otherwise ignored. 
Optional."}}; + FunctionDocumentation::ReturnedValue doc_returned_value = "A value of type UInt64"; + FunctionDocumentation::Examples doc_examples = {{"uuid", fmt::format(example_format, FillPolicy::name), ""}, {"multiple", fmt::format(multiple_example_format, fmt::arg("f", FillPolicy::name)), ""}}; + FunctionDocumentation::Categories doc_categories = {"Snowflake ID"}; + + factory.template registerFunction>({doc_description, doc_syntax, doc_arguments, doc_returned_value, doc_examples, doc_categories}, FunctionFactory::CaseInsensitive); +} + REGISTER_FUNCTION(GenerateSnowflakeID) { - factory.registerFunction(FunctionDocumentation - { - .description=R"( -Generates a SnowflakeID -- unique identificators contains: -- The first 41 (+ 1 top zero bit) bits is the timestamp (millisecond since Unix epoch 1 Jan 1970) -- The middle 10 bits are the machine ID -- The last 12 bits are a counter to disambiguate multiple snowflakeIDs generated within the same millisecond by differen processes - -In case the number of ids processed overflows, the timestamp field is incremented by 1 and the counter is reset to 0. -This function guarantees strict monotony on 1 machine and differences in values obtained on different machines. -)", - .syntax = "generateSnowflakeID()", - .arguments{}, - .returned_value = "Column of Int64", - .examples{ - {"single call", "SELECT generateSnowflakeID();", R"( -┌─generateSnowflakeID()─┐ -│ 7195510166884597760 │ -└───────────────────────┘)"}, - {"column call", "SELECT generateSnowflakeID() FROM numbers(10);", R"( -┌─generateSnowflakeID()─┐ -│ 7195516038159417344 │ -│ 7195516038159417345 │ -│ 7195516038159417346 │ -│ 7195516038159417347 │ -│ 7195516038159417348 │ -│ 7195516038159417349 │ -│ 7195516038159417350 │ -│ 7195516038159417351 │ -│ 7195516038159417352 │ -│ 7195516038159417353 │ -└───────────────────────┘)"}, - }, - .categories{"Unique identifiers", "Snowflake ID"} - }); + registerSnowflakeIDGenerator(factory); + registerSnowflakeIDGenerator(factory); } } diff --git a/src/Functions/serial.cpp b/src/Functions/serial.cpp index de3036ad242..d65df83c9f9 100644 --- a/src/Functions/serial.cpp +++ b/src/Functions/serial.cpp @@ -1,9 +1,12 @@ +#include "Common/Exception.h" #include #include #include #include +#include #include + namespace DB { @@ -14,6 +17,9 @@ namespace ErrorCodes extern const int KEEPER_EXCEPTION; } +constexpr auto function_node_name = "/serial_ids/"; +constexpr size_t MAX_SERIES_NUMBER = 1000; // ? 
+ class FunctionSerial : public IFunction { private: @@ -21,7 +27,7 @@ private: ContextPtr context; public: - static constexpr auto name = "serial"; + static constexpr auto name = "generateSerialID"; explicit FunctionSerial(ContextPtr context_) : context(context_) { @@ -48,16 +54,12 @@ public: bool hasInformationAboutMonotonicity() const override { return true; } bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } - DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override { - if (arguments.size() != 1) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, - "Number of arguments for function {} doesn't match: passed {}, should be 1.", - getName(), arguments.size()); - if (!isStringOrFixedString(arguments[0])) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Type of argument for function {} doesn't match: passed {}, should be string", - getName(), arguments[0]->getName()); + FunctionArgumentDescriptors mandatory_args{ + {"series identifier", static_cast(&isStringOrFixedString), nullptr, "String or FixedString"} + }; + validateFunctionArgumentTypes(*this, arguments, mandatory_args); return std::make_shared(); } @@ -71,12 +73,19 @@ public: if (zk->expired()) zk = context->getZooKeeper(); + // slow? + if (zk->exists(function_node_name) && zk->getChildren(function_node_name).size() == MAX_SERIES_NUMBER) { + throw Exception(ErrorCodes::KEEPER_EXCEPTION, + "At most {} serial nodes can be created", + MAX_SERIES_NUMBER); + } + auto col_res = ColumnVector::create(); typename ColumnVector::Container & vec_to = col_res->getData(); vec_to.resize(input_rows_count); - const auto & serial_path = "/serials/" + arguments[0].column->getDataAt(0).toString(); + const auto & serial_path = function_node_name + arguments[0].column->getDataAt(0).toString(); /// CAS in ZooKeeper /// `get` value and version, `trySet` new with version check @@ -130,28 +139,28 @@ Generates and returns sequential numbers starting from the previous counter valu This function takes a constant string argument - a series identifier. The server should be configured with a ZooKeeper. 
)", - .syntax = "serial(identifier)", + .syntax = "generateSerialID(identifier)", .arguments{ - {"series identifier", "Series identifier (String)"} + {"series identifier", "Series identifier (String or FixedString)"} }, .returned_value = "Sequential numbers of type Int64 starting from the previous counter value", .examples{ - {"first call", "SELECT serial('id1')", R"( -┌─serial('id1')──┐ -│ 1 │ -└────────────────┘)"}, - {"second call", "SELECT serial('id1')", R"( -┌─serial('id1')──┐ -│ 2 │ -└────────────────┘)"}, - {"column call", "SELECT *, serial('id1') FROM test_table", R"( -┌─CounterID─┬─UserID─┬─ver─┬─serial('id1')──┐ -│ 1 │ 3 │ 3 │ 3 │ -│ 1 │ 1 │ 1 │ 4 │ -│ 1 │ 2 │ 2 │ 5 │ -│ 1 │ 5 │ 5 │ 6 │ -│ 1 │ 4 │ 4 │ 7 │ -└───────────┴────────┴─────┴────────────────┘ + {"first call", "SELECT generateSerialID('id1')", R"( +┌─generateSerialID('id1')──┐ +│ 1 │ +└──────────────────────────┘)"}, + {"second call", "SELECT generateSerialID('id1')", R"( +┌─generateSerialID('id1')──┐ +│ 2 │ +└──────────────────────────┘)"}, + {"column call", "SELECT *, generateSerialID('id1') FROM test_table", R"( +┌─CounterID─┬─UserID─┬─ver─┬─generateSerialID('id1')──┐ +│ 1 │ 3 │ 3 │ 3 │ +│ 1 │ 1 │ 1 │ 4 │ +│ 1 │ 2 │ 2 │ 5 │ +│ 1 │ 5 │ 5 │ 6 │ +│ 1 │ 4 │ 4 │ 7 │ +└───────────┴────────┴─────┴──────────────────────────┘ )"}}, .categories{"Unique identifiers"} }); diff --git a/tests/queries/0_stateless/03129_serial_test_zookeeper.sql b/tests/queries/0_stateless/03129_serial_test_zookeeper.sql index c3395009477..2bd60656259 100644 --- a/tests/queries/0_stateless/03129_serial_test_zookeeper.sql +++ b/tests/queries/0_stateless/03129_serial_test_zookeeper.sql @@ -1,12 +1,12 @@ -- Tags: zookeeper -SELECT serial('x'); -SELECT serial('x'); -SELECT serial('y'); -SELECT serial('x') FROM numbers(5); +SELECT generateSerialID('x'); +SELECT generateSerialID('x'); +SELECT generateSerialID('y'); +SELECT generateSerialID('x') FROM numbers(5); -SELECT serial(); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } -SELECT serial('x', 'y'); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } -SELECT serial(1); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } +SELECT generateSerialID(); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } +SELECT generateSerialID('x', 'y'); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } +SELECT generateSerialID(1); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } -SELECT serial('z'), serial('z') FROM numbers(5); +SELECT generateSerialID('z'), generateSerialID('z') FROM numbers(5); diff --git a/tests/queries/0_stateless/03130_generateSnowflakeId.reference b/tests/queries/0_stateless/03130_generateSnowflakeId.reference new file mode 100644 index 00000000000..8cdced96770 --- /dev/null +++ b/tests/queries/0_stateless/03130_generateSnowflakeId.reference @@ -0,0 +1,11 @@ +-- generateSnowflakeID -- +1 +1 +0 +0 +1 +100 +-- generateSnowflakeIDThreadMonotonic -- +1 +1 +100 diff --git a/tests/queries/0_stateless/03130_generateSnowflakeId.sql b/tests/queries/0_stateless/03130_generateSnowflakeId.sql new file mode 100644 index 00000000000..3e994149d2b --- /dev/null +++ b/tests/queries/0_stateless/03130_generateSnowflakeId.sql @@ -0,0 +1,29 @@ +SELECT '-- generateSnowflakeID --'; +SELECT bitShiftLeft(toUInt64(generateSnowflakeID()), 52) = 0; -- check machine sequence number is zero +SELECT bitAnd(bitShiftRight(toUInt64(generateSnowflakeID()), 63), 1) = 0; -- check first bit is zero + +SELECT generateSnowflakeID(1) = generateSnowflakeID(2); +SELECT generateSnowflakeID() = generateSnowflakeID(1); +SELECT generateSnowflakeID(1) = 
generateSnowflakeID(1); + +SELECT generateSnowflakeID(1, 2); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } + +SELECT count(*) +FROM +( + SELECT DISTINCT generateSnowflakeID() + FROM numbers(100) +); + +SELECT '-- generateSnowflakeIDThreadMonotonic --'; +SELECT bitShiftLeft(toUInt64(generateSnowflakeIDThreadMonotonic()), 52) = 0; -- check machine sequence number is zero +SELECT bitAnd(bitShiftRight(toUInt64(generateSnowflakeIDThreadMonotonic()), 63), 1) = 0; -- check first bit is zero + +SELECT generateSnowflakeIDThreadMonotonic(1, 2); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } + +SELECT count(*) +FROM +( + SELECT DISTINCT generateSnowflakeIDThreadMonotonic() + FROM numbers(100) +); \ No newline at end of file diff --git a/tests/queries/0_stateless/03130_generate_snowflake_id.reference b/tests/queries/0_stateless/03130_generate_snowflake_id.reference deleted file mode 100644 index 2049ba26379..00000000000 --- a/tests/queries/0_stateless/03130_generate_snowflake_id.reference +++ /dev/null @@ -1,3 +0,0 @@ -1 -1 -10 diff --git a/tests/queries/0_stateless/03130_generate_snowflake_id.sql b/tests/queries/0_stateless/03130_generate_snowflake_id.sql deleted file mode 100644 index 669814c9ecb..00000000000 --- a/tests/queries/0_stateless/03130_generate_snowflake_id.sql +++ /dev/null @@ -1,11 +0,0 @@ -SELECT bitShiftLeft(toUInt64(generateSnowflakeID()), 52) = 0; -SELECT bitAnd(bitShiftRight(toUInt64(generateSnowflakeID()), 63), 1) = 0; - -SELECT generateSnowflakeID(1); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } - -SELECT count(*) -FROM -( - SELECT DISTINCT generateSnowflakeID() - FROM numbers(10) -) \ No newline at end of file From b6aa841e575a6594d159be2cc2a5fbc1391190ce Mon Sep 17 00:00:00 2001 From: Danila Puzov Date: Wed, 22 May 2024 19:26:48 +0300 Subject: [PATCH 259/392] Docs for generateSnowflakeID --- .../sql-reference/functions/uuid-functions.md | 126 ++++++++++++++++++ src/Functions/generateSnowflakeID.cpp | 2 +- 2 files changed, 127 insertions(+), 1 deletion(-) diff --git a/docs/en/sql-reference/functions/uuid-functions.md b/docs/en/sql-reference/functions/uuid-functions.md index d1b833c2439..80d7215b9ef 100644 --- a/docs/en/sql-reference/functions/uuid-functions.md +++ b/docs/en/sql-reference/functions/uuid-functions.md @@ -690,6 +690,132 @@ serverUUID() Type: [UUID](../data-types/uuid.md). +## generateSnowflakeID + +Generates a [Snowflake ID](https://github.com/twitter-archive/snowflake/tree/b3f6a3c6ca8e1b6847baa6ff42bf72201e2c2231). + +Generates a Snowflake ID. The generated Snowflake ID contains the current Unix timestamp in milliseconds 41 (+ 1 top zero bit) bits, followed by machine id (10 bits), a counter (12 bits) to distinguish IDs within a millisecond. +For any given timestamp (unix_ts_ms), the counter starts at 0 and is incremented by 1 for each new Snowflake ID until the timestamp changes. +In case the counter overflows, the timestamp field is incremented by 1 and the counter is reset to 0. + +Function `generateSnowflakeID` guarantees that the counter field within a timestamp increments monotonically across all function invocations in concurrently running threads and queries. 
+ +``` + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +├─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┤ +|0| timestamp | +├─┼ ┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┤ +| | machine_id | machine_seq_num | +└─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┘ +``` + +**Syntax** + +``` sql +generateSnowflakeID([expr]) +``` + +**Arguments** + +- `expr` — An arbitrary [expression](../../sql-reference/syntax.md#syntax-expressions) used to bypass [common subexpression elimination](../../sql-reference/functions/index.md#common-subexpression-elimination) if the function is called multiple times in a query. The value of the expression has no effect on the returned Snowflake ID. Optional. + +**Returned value** + +A value of type UInt64. + +**Example** + +First, create a table with a column of type UInt64, then insert a generated Snowflake ID into the table. + +``` sql +CREATE TABLE tab (id UInt64) ENGINE = Memory; + +INSERT INTO tab SELECT generateSnowflakeID(); + +SELECT * FROM tab; +``` + +Result: + +```response +┌──────────────────id─┐ +│ 7199081390080409600 │ +└─────────────────────┘ +``` + +**Example with multiple Snowflake IDs generated per row** + +```sql +SELECT generateSnowflakeID(1), generateSnowflakeID(2); + +┌─generateSnowflakeID(1)─┬─generateSnowflakeID(2)─┐ +│ 7199081609652224000 │ 7199081609652224001 │ +└────────────────────────┴────────────────────────┘ +``` + +## generateSnowflakeIDThreadMonotonic + +Generates a [Snowflake ID](https://github.com/twitter-archive/snowflake/tree/b3f6a3c6ca8e1b6847baa6ff42bf72201e2c2231). + +Generates a Snowflake ID. The generated Snowflake ID contains the current Unix timestamp in milliseconds 41 (+ 1 top zero bit) bits, followed by machine id (10 bits), a counter (12 bits) to distinguish IDs within a millisecond. + +This function behaves like `generateSnowflakeID` but gives no guarantee on counter monotony across different simultaneous requests. Monotonicity within one timestamp is guaranteed only within the same thread calling this function to generate Snowflake IDs. + +``` + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +├─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┤ +|0| timestamp | +├─┼ ┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┤ +| | machine_id | machine_seq_num | +└─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┘ +``` + +**Syntax** + +``` sql +generateSnowflakeIDThreadMonotonic([expr]) +``` + +**Arguments** + +- `expr` — An arbitrary [expression](../../sql-reference/syntax.md#syntax-expressions) used to bypass [common subexpression elimination](../../sql-reference/functions/index.md#common-subexpression-elimination) if the function is called multiple times in a query. The value of the expression has no effect on the returned Snowflake ID. Optional. + +**Returned value** + +A value of type UInt64. + +**Example** + +First, create a table with a column of type UInt64, then insert a generated Snowflake ID into the table. 
+ +``` sql +CREATE TABLE tab (id UInt64) ENGINE = Memory; + +INSERT INTO tab SELECT generateSnowflakeIDThreadMonotonic(); + +SELECT * FROM tab; +``` + +Result: + +```response +┌──────────────────id─┐ +│ 7199082832006627328 │ +└─────────────────────┘ +``` + +**Example with multiple Snowflake IDs generated per row** + +```sql +SELECT generateSnowflakeIDThreadMonotonic(1), generateSnowflakeIDThreadMonotonic(2); + +┌─generateSnowflakeIDThreadMonotonic(1)─┬─generateSnowflakeIDThreadMonotonic(2)─┐ +│ 7199082940311945216 │ 7199082940316139520 │ +└───────────────────────────────────────┴───────────────────────────────────────┘ +``` + ## See also - [dictGetUUID](../../sql-reference/functions/ext-dict-functions.md#ext_dict_functions-other) diff --git a/src/Functions/generateSnowflakeID.cpp b/src/Functions/generateSnowflakeID.cpp index 6ae5dc13af0..1b26bf44adb 100644 --- a/src/Functions/generateSnowflakeID.cpp +++ b/src/Functions/generateSnowflakeID.cpp @@ -28,7 +28,7 @@ namespace |0| timestamp | ├─┼ ┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┤ | | machine_id | machine_seq_num | -├─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┤ +└─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┘ - The first 41 (+ 1 top zero bit) bits is the timestamp (millisecond since Unix epoch 1 Jan 1970) - The middle 10 bits are the machine ID From a73d60bae5b49bf6b09e4acc05f59cecd528a007 Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Wed, 22 May 2024 18:35:28 +0200 Subject: [PATCH 260/392] tests for qps_limit_exceeded --- contrib/aws | 2 +- .../integration/helpers/s3_mocks/broken_s3.py | 40 +++- .../test_checking_s3_blobs_paranoid/test.py | 206 +++++++++--------- 3 files changed, 143 insertions(+), 105 deletions(-) diff --git a/contrib/aws b/contrib/aws index 2e12d7c6daf..b7ae6e5bf48 160000 --- a/contrib/aws +++ b/contrib/aws @@ -1 +1 @@ -Subproject commit 2e12d7c6dafa81311ee3d73ac6a178550ffa75be +Subproject commit b7ae6e5bf48fb4981f24476bdd187cd35df1e2c6 diff --git a/tests/integration/helpers/s3_mocks/broken_s3.py b/tests/integration/helpers/s3_mocks/broken_s3.py index 206f960293f..238b8aac112 100644 --- a/tests/integration/helpers/s3_mocks/broken_s3.py +++ b/tests/integration/helpers/s3_mocks/broken_s3.py @@ -165,11 +165,35 @@ class _ServerRuntime: '' "" "ExpectedError" - "mock s3 injected error" + "mock s3 injected unretryable error" "txfbd566d03042474888193-00608d7537" "" ) - request_handler.write_error(data) + request_handler.write_error(500, data) + + class SlowDownAction: + def inject_error(self, request_handler): + data = ( + '' + "" + "SlowDown" + "Slow Down." + "txfbd566d03042474888193-00608d7537" + "" + ) + request_handler.write_error(429, data) + + class QpsLimitExceededAction: + def inject_error(self, request_handler): + data = ( + '' + "" + "QpsLimitExceeded" + "Please reduce your request rate." 
+ "txfbd566d03042474888193-00608d7537" + "" + ) + request_handler.write_error(429, data) class RedirectAction: def __init__(self, host="localhost", port=1): @@ -239,6 +263,10 @@ class _ServerRuntime: self.error_handler = _ServerRuntime.BrokenPipeAction() elif self.action == "redirect_to": self.error_handler = _ServerRuntime.RedirectAction(*self.action_args) + elif self.action == "slow_down": + self.error_handler = _ServerRuntime.SlowDownAction(*self.action_args) + elif self.action == "qps_limit_exceeded": + self.error_handler = _ServerRuntime.QpsLimitExceededAction(*self.action_args) else: self.error_handler = _ServerRuntime.Expected500ErrorAction() @@ -344,12 +372,12 @@ class RequestHandler(http.server.BaseHTTPRequestHandler): self.end_headers() self.wfile.write(b"Redirected") - def write_error(self, data, content_length=None): + def write_error(self, http_code, data, content_length=None): if content_length is None: content_length = len(data) self.log_message("write_error %s", data) self.read_all_input() - self.send_response(500) + self.send_response(http_code) self.send_header("Content-Type", "text/xml") self.send_header("Content-Length", str(content_length)) self.end_headers() @@ -418,7 +446,7 @@ class RequestHandler(http.server.BaseHTTPRequestHandler): path = [x for x in parts.path.split("/") if x] assert path[0] == "mock_settings", path if len(path) < 2: - return self.write_error("_mock_settings: wrong command") + return self.write_error(400, "_mock_settings: wrong command") if path[1] == "at_part_upload": params = urllib.parse.parse_qs(parts.query, keep_blank_values=False) @@ -477,7 +505,7 @@ class RequestHandler(http.server.BaseHTTPRequestHandler): self.log_message("reset") return self._ok() - return self.write_error("_mock_settings: wrong command") + return self.write_error(400, "_mock_settings: wrong command") def do_GET(self): if self.path == "/": diff --git a/tests/integration/test_checking_s3_blobs_paranoid/test.py b/tests/integration/test_checking_s3_blobs_paranoid/test.py index 22d6d263d23..97fc5de65e7 100644 --- a/tests/integration/test_checking_s3_blobs_paranoid/test.py +++ b/tests/integration/test_checking_s3_blobs_paranoid/test.py @@ -91,7 +91,7 @@ def get_multipart_counters(node, query_id, log_type="ExceptionWhileProcessing"): SELECT ProfileEvents['S3CreateMultipartUpload'], ProfileEvents['S3UploadPart'], - ProfileEvents['S3WriteRequestsErrors'], + ProfileEvents['S3WriteRequestsErrors'] + ProfileEvents['S3WriteRequestsThrottling'], FROM system.query_log WHERE query_id='{query_id}' AND type='{log_type}' @@ -148,7 +148,7 @@ def test_upload_s3_fail_create_multi_part_upload(cluster, broken_s3, compression ) assert "Code: 499" in error, error - assert "mock s3 injected error" in error, error + assert "mock s3 injected unretryable error" in error, error create_multipart, upload_parts, s3_errors = get_multipart_counters( node, insert_query_id @@ -190,7 +190,7 @@ def test_upload_s3_fail_upload_part_when_multi_part_upload( ) assert "Code: 499" in error, error - assert "mock s3 injected error" in error, error + assert "mock s3 injected unretryable error" in error, error create_multipart, upload_parts, s3_errors = get_multipart_counters( node, insert_query_id @@ -200,18 +200,28 @@ def test_upload_s3_fail_upload_part_when_multi_part_upload( assert s3_errors >= 2 -def test_when_s3_connection_refused_is_retried(cluster, broken_s3): +@pytest.mark.parametrize( + "action_and_message", [ + ("slow_down", "DB::Exception: Slow Down."), + ("qps_limit_exceeded", "DB::Exception: Please reduce 
your request rate."), + ("connection_refused", "Poco::Exception. Code: 1000, e.code() = 111, Connection refused"), + ], + ids=lambda x: x[0] +) +def test_when_error_is_retried(cluster, broken_s3, action_and_message): node = cluster.instances["node"] - broken_s3.setup_fake_multpartuploads() - broken_s3.setup_at_part_upload(count=3, after=2, action="connection_refused") + action, message = action_and_message - insert_query_id = f"INSERT_INTO_TABLE_FUNCTION_CONNECTION_REFUSED_RETRIED" + broken_s3.setup_fake_multpartuploads() + broken_s3.setup_at_part_upload(count=3, after=2, action=action) + + insert_query_id = f"INSERT_INTO_TABLE_{action}_RETRIED" node.query( f""" INSERT INTO TABLE FUNCTION s3( - 'http://resolver:8083/root/data/test_when_s3_connection_refused_at_write_retried', + 'http://resolver:8083/root/data/test_when_{action}_retried', 'minio', 'minio123', 'CSV', auto, 'none' ) @@ -234,13 +244,13 @@ def test_when_s3_connection_refused_is_retried(cluster, broken_s3): assert upload_parts == 39 assert s3_errors == 3 - broken_s3.setup_at_part_upload(count=1000, after=2, action="connection_refused") - insert_query_id = f"INSERT_INTO_TABLE_FUNCTION_CONNECTION_REFUSED_RETRIED_1" + broken_s3.setup_at_part_upload(count=1000, after=2, action=action) + insert_query_id = f"INSERT_INTO_TABLE_{action}_RETRIED_1" error = node.query_and_get_error( f""" INSERT INTO TABLE FUNCTION s3( - 'http://resolver:8083/root/data/test_when_s3_connection_refused_at_write_retried', + 'http://resolver:8083/root/data/test_when_{action}_retried', 'minio', 'minio123', 'CSV', auto, 'none' ) @@ -258,7 +268,79 @@ def test_when_s3_connection_refused_is_retried(cluster, broken_s3): assert "Code: 499" in error, error assert ( - "Poco::Exception. Code: 1000, e.code() = 111, Connection refused" in error + message in error + ), error + + +def test_when_s3_broken_pipe_at_upload_is_retried(cluster, broken_s3): + node = cluster.instances["node"] + + broken_s3.setup_fake_multpartuploads() + broken_s3.setup_at_part_upload( + count=3, + after=2, + action="broken_pipe", + ) + + insert_query_id = f"TEST_WHEN_S3_BROKEN_PIPE_AT_UPLOAD" + node.query( + f""" + INSERT INTO + TABLE FUNCTION s3( + 'http://resolver:8083/root/data/test_when_s3_broken_pipe_at_upload_is_retried', + 'minio', 'minio123', + 'CSV', auto, 'none' + ) + SELECT + * + FROM system.numbers + LIMIT 1000000 + SETTINGS + s3_max_single_part_upload_size=100, + s3_min_upload_part_size=1000000, + s3_check_objects_after_upload=0 + """, + query_id=insert_query_id, + ) + + create_multipart, upload_parts, s3_errors = get_multipart_counters( + node, insert_query_id, log_type="QueryFinish" + ) + + assert create_multipart == 1 + assert upload_parts == 7 + assert s3_errors == 3 + + broken_s3.setup_at_part_upload( + count=1000, + after=2, + action="broken_pipe", + ) + insert_query_id = f"TEST_WHEN_S3_BROKEN_PIPE_AT_UPLOAD_1" + error = node.query_and_get_error( + f""" + INSERT INTO + TABLE FUNCTION s3( + 'http://resolver:8083/root/data/test_when_s3_broken_pipe_at_upload_is_retried', + 'minio', 'minio123', + 'CSV', auto, 'none' + ) + SELECT + * + FROM system.numbers + LIMIT 1000000 + SETTINGS + s3_max_single_part_upload_size=100, + s3_min_upload_part_size=1000000, + s3_check_objects_after_upload=0 + """, + query_id=insert_query_id, + ) + + assert "Code: 1000" in error, error + assert ( + "DB::Exception: Poco::Exception. 
Code: 1000, e.code() = 32, I/O error: Broken pipe" + in error ), error @@ -401,20 +483,20 @@ def test_when_s3_connection_reset_by_peer_at_create_mpu_retried( ) error = node.query_and_get_error( f""" - INSERT INTO - TABLE FUNCTION s3( - 'http://resolver:8083/root/data/test_when_s3_connection_reset_by_peer_at_create_mpu_retried', - 'minio', 'minio123', - 'CSV', auto, 'none' - ) - SELECT - * - FROM system.numbers - LIMIT 1000 - SETTINGS - s3_max_single_part_upload_size=100, - s3_min_upload_part_size=100, - s3_check_objects_after_upload=0 + INSERT INTO + TABLE FUNCTION s3( + 'http://resolver:8083/root/data/test_when_s3_connection_reset_by_peer_at_create_mpu_retried', + 'minio', 'minio123', + 'CSV', auto, 'none' + ) + SELECT + * + FROM system.numbers + LIMIT 1000 + SETTINGS + s3_max_single_part_upload_size=100, + s3_min_upload_part_size=100, + s3_check_objects_after_upload=0 """, query_id=insert_query_id, ) @@ -427,78 +509,6 @@ def test_when_s3_connection_reset_by_peer_at_create_mpu_retried( ), error -def test_when_s3_broken_pipe_at_upload_is_retried(cluster, broken_s3): - node = cluster.instances["node"] - - broken_s3.setup_fake_multpartuploads() - broken_s3.setup_at_part_upload( - count=3, - after=2, - action="broken_pipe", - ) - - insert_query_id = f"TEST_WHEN_S3_BROKEN_PIPE_AT_UPLOAD" - node.query( - f""" - INSERT INTO - TABLE FUNCTION s3( - 'http://resolver:8083/root/data/test_when_s3_broken_pipe_at_upload_is_retried', - 'minio', 'minio123', - 'CSV', auto, 'none' - ) - SELECT - * - FROM system.numbers - LIMIT 1000000 - SETTINGS - s3_max_single_part_upload_size=100, - s3_min_upload_part_size=1000000, - s3_check_objects_after_upload=0 - """, - query_id=insert_query_id, - ) - - create_multipart, upload_parts, s3_errors = get_multipart_counters( - node, insert_query_id, log_type="QueryFinish" - ) - - assert create_multipart == 1 - assert upload_parts == 7 - assert s3_errors == 3 - - broken_s3.setup_at_part_upload( - count=1000, - after=2, - action="broken_pipe", - ) - insert_query_id = f"TEST_WHEN_S3_BROKEN_PIPE_AT_UPLOAD_1" - error = node.query_and_get_error( - f""" - INSERT INTO - TABLE FUNCTION s3( - 'http://resolver:8083/root/data/test_when_s3_broken_pipe_at_upload_is_retried', - 'minio', 'minio123', - 'CSV', auto, 'none' - ) - SELECT - * - FROM system.numbers - LIMIT 1000000 - SETTINGS - s3_max_single_part_upload_size=100, - s3_min_upload_part_size=1000000, - s3_check_objects_after_upload=0 - """, - query_id=insert_query_id, - ) - - assert "Code: 1000" in error, error - assert ( - "DB::Exception: Poco::Exception. 
Code: 1000, e.code() = 32, I/O error: Broken pipe" - in error - ), error - - def test_query_is_canceled_with_inf_retries(cluster, broken_s3): node = cluster.instances["node_with_inf_s3_retries"] From 52fe1fab97a5f39c99c33deb1054bf319fbbf230 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Wed, 22 May 2024 16:46:02 +0000 Subject: [PATCH 261/392] Automatic style fix --- tests/integration/helpers/s3_mocks/broken_s3.py | 4 +++- .../test_checking_s3_blobs_paranoid/test.py | 14 ++++++++------ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/tests/integration/helpers/s3_mocks/broken_s3.py b/tests/integration/helpers/s3_mocks/broken_s3.py index 238b8aac112..7d0127bc1c4 100644 --- a/tests/integration/helpers/s3_mocks/broken_s3.py +++ b/tests/integration/helpers/s3_mocks/broken_s3.py @@ -266,7 +266,9 @@ class _ServerRuntime: elif self.action == "slow_down": self.error_handler = _ServerRuntime.SlowDownAction(*self.action_args) elif self.action == "qps_limit_exceeded": - self.error_handler = _ServerRuntime.QpsLimitExceededAction(*self.action_args) + self.error_handler = _ServerRuntime.QpsLimitExceededAction( + *self.action_args + ) else: self.error_handler = _ServerRuntime.Expected500ErrorAction() diff --git a/tests/integration/test_checking_s3_blobs_paranoid/test.py b/tests/integration/test_checking_s3_blobs_paranoid/test.py index 97fc5de65e7..a7fe02b16de 100644 --- a/tests/integration/test_checking_s3_blobs_paranoid/test.py +++ b/tests/integration/test_checking_s3_blobs_paranoid/test.py @@ -201,12 +201,16 @@ def test_upload_s3_fail_upload_part_when_multi_part_upload( @pytest.mark.parametrize( - "action_and_message", [ + "action_and_message", + [ ("slow_down", "DB::Exception: Slow Down."), ("qps_limit_exceeded", "DB::Exception: Please reduce your request rate."), - ("connection_refused", "Poco::Exception. Code: 1000, e.code() = 111, Connection refused"), + ( + "connection_refused", + "Poco::Exception. 
Code: 1000, e.code() = 111, Connection refused", + ), ], - ids=lambda x: x[0] + ids=lambda x: x[0], ) def test_when_error_is_retried(cluster, broken_s3, action_and_message): node = cluster.instances["node"] @@ -267,9 +271,7 @@ def test_when_error_is_retried(cluster, broken_s3, action_and_message): ) assert "Code: 499" in error, error - assert ( - message in error - ), error + assert message in error, error def test_when_s3_broken_pipe_at_upload_is_retried(cluster, broken_s3): From 1e5069b5dc6f07d7b29b3a94eaad1c15c9842635 Mon Sep 17 00:00:00 2001 From: kssenii Date: Wed, 22 May 2024 19:21:27 +0200 Subject: [PATCH 262/392] Fix duplicate include --- src/TableFunctions/ITableFunctionDataLake.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/TableFunctions/ITableFunctionDataLake.h b/src/TableFunctions/ITableFunctionDataLake.h index 6ad8689a9b4..fe6e5b3e593 100644 --- a/src/TableFunctions/ITableFunctionDataLake.h +++ b/src/TableFunctions/ITableFunctionDataLake.h @@ -7,7 +7,6 @@ #include #include #include -#include #include #include #include From 7c9f36ad1ea1e6cc1d480c44a94c9e473f3a27e0 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Wed, 22 May 2024 19:46:08 +0200 Subject: [PATCH 263/392] Add gh to style-check dockerfile --- docker/test/style/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/test/style/Dockerfile b/docker/test/style/Dockerfile index 5d53d03606f..172fbce6406 100644 --- a/docker/test/style/Dockerfile +++ b/docker/test/style/Dockerfile @@ -11,6 +11,7 @@ RUN apt-get update && env DEBIAN_FRONTEND=noninteractive apt-get install --yes \ aspell \ curl \ git \ + gh \ file \ libxml2-utils \ moreutils \ From 6be79a35b6a55e88103056058ce9833ac62be77e Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Wed, 22 May 2024 20:30:19 +0200 Subject: [PATCH 264/392] update contrib/aws to the last head --- contrib/aws | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/aws b/contrib/aws index b7ae6e5bf48..eb96e740453 160000 --- a/contrib/aws +++ b/contrib/aws @@ -1 +1 @@ -Subproject commit b7ae6e5bf48fb4981f24476bdd187cd35df1e2c6 +Subproject commit eb96e740453ae27afa1f367ba19f99bdcb38484d From 7ecfdbb3aaf4b7f4a68d6a332138dd90612e6120 Mon Sep 17 00:00:00 2001 From: Mikhail Artemenko Date: Wed, 22 May 2024 23:05:27 +0000 Subject: [PATCH 265/392] fix test_hdfsCluster_unset_skip_unavailable_shards --- tests/integration/test_storage_hdfs/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_storage_hdfs/test.py b/tests/integration/test_storage_hdfs/test.py index bb72574c6e5..3c43918d8c0 100644 --- a/tests/integration/test_storage_hdfs/test.py +++ b/tests/integration/test_storage_hdfs/test.py @@ -895,7 +895,7 @@ def test_hdfsCluster_unset_skip_unavailable_shards(started_cluster): assert ( node1.query( - "select * from hdfsCluster('cluster_non_existent_port', 'hdfs://hdfs1:9000/skip_unavailable_shards', 'TSV', 'id UInt64, text String, number Float64')" + "select * from hdfsCluster('cluster_non_existent_port', 'hdfs://hdfs1:9000/unskip_unavailable_shards', 'TSV', 'id UInt64, text String, number Float64')" ) == data ) From c07c9d4c87efa2d4823526127bd52566773a2cd3 Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Wed, 22 May 2024 21:57:43 -0300 Subject: [PATCH 266/392] test for #45804 --- ...l_and_prewhere_condition_ver_column.reference | 2 ++ ...1_final_and_prewhere_condition_ver_column.sql | 16 ++++++++++++++++ 2 files changed, 18 insertions(+) create mode 100644 
tests/queries/0_stateless/00331_final_and_prewhere_condition_ver_column.reference create mode 100644 tests/queries/0_stateless/00331_final_and_prewhere_condition_ver_column.sql diff --git a/tests/queries/0_stateless/00331_final_and_prewhere_condition_ver_column.reference b/tests/queries/0_stateless/00331_final_and_prewhere_condition_ver_column.reference new file mode 100644 index 00000000000..6ed281c757a --- /dev/null +++ b/tests/queries/0_stateless/00331_final_and_prewhere_condition_ver_column.reference @@ -0,0 +1,2 @@ +1 +1 diff --git a/tests/queries/0_stateless/00331_final_and_prewhere_condition_ver_column.sql b/tests/queries/0_stateless/00331_final_and_prewhere_condition_ver_column.sql new file mode 100644 index 00000000000..78a58a979d1 --- /dev/null +++ b/tests/queries/0_stateless/00331_final_and_prewhere_condition_ver_column.sql @@ -0,0 +1,16 @@ +SET allow_experimental_analyzer = 1; + +-- https://github.com/ClickHouse/ClickHouse/issues/45804 + +CREATE TABLE myRMT( + key Int64, + someCol String, + ver DateTime +) ENGINE = ReplacingMergeTree(ver) +ORDER BY key as SELECT 1, 'test', '2020-01-01'; + +SELECT count(ver) FROM myRMT FINAL PREWHERE ver > '2000-01-01'; + +SELECT count() FROM myRMT FINAL PREWHERE ver > '2000-01-01'; + +DROP TABLE myRMT; From 88ae74f6fdd3d859674a588b8b6fba320d214950 Mon Sep 17 00:00:00 2001 From: Blargian Date: Thu, 23 May 2024 09:28:38 +0200 Subject: [PATCH 267/392] Add test for reinterpretXYZ --- .../functions/type-conversion-functions.md | 3 +- .../03156_reinterpret_functions.sql | 36 +++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/03156_reinterpret_functions.sql diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 14a12ab5d5d..1030d92c76b 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -1000,7 +1000,8 @@ Result: ## reinterpretAsInt(8\|16\|32\|64) -## reinterpretAsFloat(32\|64) +## reinterpretAsFloat* + ## reinterpretAsDate diff --git a/tests/queries/0_stateless/03156_reinterpret_functions.sql b/tests/queries/0_stateless/03156_reinterpret_functions.sql new file mode 100644 index 00000000000..4acaaf47cef --- /dev/null +++ b/tests/queries/0_stateless/03156_reinterpret_functions.sql @@ -0,0 +1,36 @@ +-- Date and DateTime + +SELECT reinterpretAsDate(); -- { clientError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } +SELECT reinterpretAsDate('A',''); -- { clientError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } +SELECT reinterpretAsDate([0, 1, 2]); -- { clientError ILLEGAL_TYPE_OF_ARGUMENT} +SELECT reinterpretAsDateTime(); -- { clientError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } +SELECT reinterpretAsDateTime('A',''); -- { clientError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } +SELECT reinterpretAsDateTime([0, 1, 2]); -- { clientError ILLEGAL_TYPE_OF_ARGUMENT} + +SELECT reinterpretAsDate(65); +SELECT reinterpretAsDate('A'); +SELECT reinterpretAsDateTime(65); +SELECT reinterpretAsDate('A'); + +-- Fixed String + +SELECT reinterpretAsFixedString(); -- { clientError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } +SELECT reinterpretAsFixedString(toDate('1970-01-01'),''); -- { clientError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } +SELECT reinterpretAsFixedString([0, 1, 2]); -- { clientError ILLEGAL_TYPE_OF_ARGUMENT} + +SELECT reinterpretAsFixedString(toDate('1970-03-07')); +SELECT reinterpretAsFixedString(toDateTime('1970-01-01 01:01:05')); +SELECT reinterpretAsFixedString(65); 
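The positive cases in this new test all revolve around the byte value 65: it is the ASCII code of 'A' and also the day number of 1970-03-07 (65 days after the Unix epoch), so the integer, the one-character string and the date all reduce to the same leading byte 0x41. A small sketch of that round trip, with expected values hand-derived rather than taken from a reference file:

```sql
SELECT
    reinterpretAsDate(65)        AS from_int,      -- expected: 1970-03-07
    reinterpretAsDate('A')       AS from_string,   -- same single byte 0x41, so the same date
    reinterpretAsFixedString(65) AS back_to_bytes; -- expected: 'A'
```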
+ +-- Float32, Float64 + +SELECT reinterpretAsFloat32(); -- { clientError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } +SELECT reinterpretAsFloat64(); -- { clientError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } +SELECT reinterpretAsFloat32('1970-01-01', ''); -- { clientError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } +SELECT reinterpretAsFloat64('1970-01-01', ''); -- { clientError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } +SELECT reinterpretAsFloat32([0, 1, 2]); -- { clientError ILLEGAL_TYPE_OF_ARGUMENT} +SELECT reinterpretAsFloat64([0, 1, 2]); -- { clientError4 ILLEGAL_TYPE_OF_ARGUMENT} + + + + From 9234beaff8ef19ed758984fb70c82b4edb3762f0 Mon Sep 17 00:00:00 2001 From: Blargian Date: Thu, 23 May 2024 09:32:43 +0200 Subject: [PATCH 268/392] Fix typo and move from other-functions to math-functions --- .../sql-reference/functions/math-functions.md | 46 +++++++++++++++++++ .../functions/other-functions.md | 46 ------------------- 2 files changed, 46 insertions(+), 46 deletions(-) diff --git a/docs/en/sql-reference/functions/math-functions.md b/docs/en/sql-reference/functions/math-functions.md index 945166056af..324adbfb4b3 100644 --- a/docs/en/sql-reference/functions/math-functions.md +++ b/docs/en/sql-reference/functions/math-functions.md @@ -947,3 +947,49 @@ Result: │ 11 │ └──────────────────────────────────┘ ``` + +## proportionsZTest + +Returns test statistics for the two proportion Z-test - a statistical test for comparing the proportions from two populations `x` and `y`. + +**Syntax** + +```sql +proportionsZTest(successes_x, successes_y, trials_x, trials_y, conf_level, pool_type) +``` + +**Arguments** + +- `successes_x`: Number of successes in population `x`. [UInt64](../data-types/int-uint.md). +- `successes_y`: Number of successes in population `y`. [UInt64](../data-types/int-uint.md). +- `trials_x`: Number of trials in population `x`. [UInt64](../data-types/int-uint.md). +- `trials_y`: Number of trials in population `y`. [UInt64](../data-types/int-uint.md). +- `conf_level`: Confidence level for the test. [Float64](../data-types/float.md). +- `pool_type`: Selection of pooling (way in which the standard error is estimated). Can be either `unpooled` or `pooled`. [String](../data-types/string.md). + +:::note +For argument `pool_type`: In the pooled version, the two proportions are averaged, and only one proportion is used to estimate the standard error. In the unpooled version, the two proportions are used separately. +::: + +**Returned value** + +- `z_stat`: Z statistic. [Float64](../data-types/float.md). +- `p_val`: P value. [Float64](../data-types/float.md). +- `ci_low`: The lower confidence interval. [Float64](../data-types/float.md). +- `ci_high`: The upper confidence interval. [Float64](../data-types/float.md). 
+ +**Example** + +Query: + +```sql +SELECT proportionsZTest(10, 11, 100, 101, 0.95, 'unpooled'); +``` + +Result: + +```response +┌─proportionsZTest(10, 11, 100, 101, 0.95, 'unpooled')───────────────────────────────┐ +│ (-0.20656724435948853,0.8363478437079654,-0.09345975390115283,0.07563797172293502) │ +└────────────────────────────────────────────────────────────────────────────────────┘ +``` \ No newline at end of file diff --git a/docs/en/sql-reference/functions/other-functions.md b/docs/en/sql-reference/functions/other-functions.md index 288432167bb..2b0215115cb 100644 --- a/docs/en/sql-reference/functions/other-functions.md +++ b/docs/en/sql-reference/functions/other-functions.md @@ -903,52 +903,6 @@ SELECT parseTimeDelta('1yr2mo') └──────────────────────────┘ ``` -## proportionsZTest - -Returns test statistics for the two proportion Z-test - a statistical test for comparing the proportions from two populations `x` and `y`. - -**Syntax** - -```sql -proportionsZTest(successes_x, successes_y, trials_x, trials_y, conf_level, pool_type) -``` - -**Arguments** - -- `successes_x`: Number of successes in population `x`. [UInt64](../data-types/int-uint.md). -- `successes_y`: Number of successes in population `y`. [UInt64](../data-types/int-uint.md). -- `trials_x`: Number of trials in population `x`. [UInt64](../data-types/int-uint.md). -- `trials_y`: Number of trials in population `y`. [UInt64](../data-types/int-uint.md). -- `conf_level`: Confidence level for the test. [Float64](../data-types/float.md). -- `pool_type`: Selection of pooling (way in which the standard error is estimated). can be either `unpooled` or `pooled`. [String](../data-types/string.md). - -:::note -For argument `pool_type`: In the pooled version, the two proportions are averaged, and only one proportion is used to estimate the standard error. In the unpooled version, the two proportions are used separately. -::: - -**Returned value** - -- `z_stat`: Z statistic. [Float64](../data-types/float.md). -- `p_val`: P value. [Float64](../data-types/float.md). -- `ci_low`: The lower confidence interval. [Float64](../data-types/float.md). -- `ci_high`: The upper confidence interval. [Float64](../data-types/float.md). - -**Example** - -Query: - -```sql -SELECT proportionsZTest(10, 11, 100, 101, 0.95, 'unpooled'); -``` - -Result: - -```response -┌─proportionsZTest(10, 11, 100, 101, 0.95, 'unpooled')───────────────────────────────┐ -│ (-0.20656724435948853,0.8363478437079654,-0.09345975390115283,0.07563797172293502) │ -└────────────────────────────────────────────────────────────────────────────────────┘ -``` - ## least(a, b) Returns the smaller value of a and b. 
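Since this commit only relocates the `proportionsZTest` section, the example numbers it carries along are easy to sanity-check: for the unpooled variant the statistic is the difference of the two sample proportions divided by the unpooled standard error. The check below reuses the 10/100 and 11/101 figures from the documented example; the formula itself is the textbook unpooled two-proportion z statistic, assumed here rather than taken from the patch:

```sql
WITH
    10 / 100. AS px,  -- sample proportion of population x
    11 / 101. AS py   -- sample proportion of population y
SELECT (px - py) / sqrt(px * (1 - px) / 100 + py * (1 - py) / 101) AS z_stat_unpooled;
-- roughly -0.2066, matching the first element of the documented result tuple
```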
From 45492baf440418267c8187607650a6ceddc061d3 Mon Sep 17 00:00:00 2001 From: MikhailBurdukov Date: Thu, 23 May 2024 08:20:16 +0000 Subject: [PATCH 269/392] Restart Ci From a21377cf5131de31e2109c117774fdb8058e8bc9 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Thu, 23 May 2024 11:51:34 +0200 Subject: [PATCH 270/392] Update src/Analyzer/Passes/QueryAnalysisPass.cpp Co-authored-by: Dmitry Novik --- src/Analyzer/Passes/QueryAnalysisPass.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/Analyzer/Passes/QueryAnalysisPass.cpp b/src/Analyzer/Passes/QueryAnalysisPass.cpp index cfea45732db..3ccecac951d 100644 --- a/src/Analyzer/Passes/QueryAnalysisPass.cpp +++ b/src/Analyzer/Passes/QueryAnalysisPass.cpp @@ -638,7 +638,10 @@ struct ScopeAliases auto it = alias_map.find(*key); - if (it == alias_map.end() && lookup.lookup_context == IdentifierLookupContext::TABLE_EXPRESSION) + if (it != alias_map.end()) + return &it->second; + + if (lookup.lookup_context == IdentifierLookupContext::TABLE_EXPRESSION) return {}; while (it == alias_map.end()) From 9d63095db9445f4963da914ddbc819b0a57bc7e2 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Tue, 16 Apr 2024 12:55:50 +0000 Subject: [PATCH 271/392] Revert "Revert "Speed up `splitByRegexp`"" This reverts commit 08e5c2ba4d9620551b0de5791876d35888d2c81a. --- src/Functions/splitByRegexp.cpp | 66 ++++++++++++++++++- tests/performance/function_tokens.xml | 2 + .../01866_split_by_regexp.reference | 12 ++++ .../0_stateless/01866_split_by_regexp.sql | 17 +++++ 4 files changed, 94 insertions(+), 3 deletions(-) diff --git a/src/Functions/splitByRegexp.cpp b/src/Functions/splitByRegexp.cpp index 32afb813a04..e28fe9c38bb 100644 --- a/src/Functions/splitByRegexp.cpp +++ b/src/Functions/splitByRegexp.cpp @@ -1,9 +1,11 @@ #include +#include +#include #include #include -#include #include #include +#include #include @@ -102,7 +104,7 @@ public: return false; } - pos += 1; + ++pos; token_end = pos; ++splits; } @@ -148,11 +150,69 @@ public: using FunctionSplitByRegexp = FunctionTokens; +/// Fallback splitByRegexp to splitByChar when its 1st argument is a trivial char for better performance +class SplitByRegexpOverloadResolver : public IFunctionOverloadResolver +{ +public: + static constexpr auto name = "splitByRegexp"; + static FunctionOverloadResolverPtr create(ContextPtr context) { return std::make_unique(context); } + + explicit SplitByRegexpOverloadResolver(ContextPtr context_) + : context(context_) + , split_by_regexp(FunctionSplitByRegexp::create(context)) {} + + String getName() const override { return name; } + size_t getNumberOfArguments() const override { return SplitByRegexpImpl::getNumberOfArguments(); } + bool isVariadic() const override { return SplitByRegexpImpl::isVariadic(); } + + FunctionBasePtr buildImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & return_type) const override + { + if (patternIsTrivialChar(arguments)) + return FunctionFactory::instance().getImpl("splitByChar", context)->build(arguments); + else + return std::make_unique( + split_by_regexp, collections::map(arguments, [](const auto & elem) { return elem.type; }), return_type); + } + + DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override + { + return split_by_regexp->getReturnTypeImpl(arguments); + } + +private: + bool patternIsTrivialChar(const ColumnsWithTypeAndName & arguments) const + { + const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[0].column.get()); + if (!col) + throw 
Exception( + ErrorCodes::ILLEGAL_COLUMN, + "Illegal column {} of first argument of function {}. " + "Must be constant string.", + arguments[0].column->getName(), + getName()); + + String pattern = col->getValue(); + if (pattern.size() == 1) + { + OptimizedRegularExpression re = Regexps::createRegexp(pattern); + + std::string required_substring; + bool is_trivial; + bool required_substring_is_prefix; + re.getAnalyzeResult(required_substring, is_trivial, required_substring_is_prefix); + return is_trivial && required_substring == pattern; + } + return false; + } + + ContextPtr context; + FunctionPtr split_by_regexp; +}; } REGISTER_FUNCTION(SplitByRegexp) { - factory.registerFunction(); + factory.registerFunction(); } } diff --git a/tests/performance/function_tokens.xml b/tests/performance/function_tokens.xml index 63b72f83df3..1ff56323d62 100644 --- a/tests/performance/function_tokens.xml +++ b/tests/performance/function_tokens.xml @@ -1,3 +1,5 @@ with 'Many years later as he faced the firing squad, Colonel Aureliano Buendia was to remember that distant afternoon when his father took him to discover ice.' as s select splitByChar(' ', materialize(s)) as w from numbers(1000000) + with 'Many years later as he faced the firing squad, Colonel Aureliano Buendia was to remember that distant afternoon when his father took him to discover ice.' as s select splitByRegexp(' ', materialize(s)) as w from numbers(1000000) + with 'Many years later as he faced the firing squad, Colonel Aureliano Buendia was to remember that distant afternoon when his father took him to discover ice.' as s select splitByRegexp('\s+', materialize(s)) as w from numbers(100000) diff --git a/tests/queries/0_stateless/01866_split_by_regexp.reference b/tests/queries/0_stateless/01866_split_by_regexp.reference index a3ae2f35a5f..62939940545 100644 --- a/tests/queries/0_stateless/01866_split_by_regexp.reference +++ b/tests/queries/0_stateless/01866_split_by_regexp.reference @@ -5,3 +5,15 @@ ['gbye','bug'] [''] [] +Test fallback of splitByRegexp to splitByChar if regexp is trivial +['a','b','c'] +['a','b','c'] +['','','','','',''] +['a^b^c'] +['a$b$c'] +['a)b)c'] +['a','b','c'] +['a','b','c'] +['a','b','c'] +['a|b|c'] +['a\\b\\c'] diff --git a/tests/queries/0_stateless/01866_split_by_regexp.sql b/tests/queries/0_stateless/01866_split_by_regexp.sql index e472fb68d94..570bd1ba5c0 100644 --- a/tests/queries/0_stateless/01866_split_by_regexp.sql +++ b/tests/queries/0_stateless/01866_split_by_regexp.sql @@ -3,3 +3,20 @@ select splitByRegexp('', 'abcde'); select splitByRegexp('<[^<>]*>', x) from (select arrayJoin(['

hello

world

', 'gbyebug']) x); select splitByRegexp('ab', ''); select splitByRegexp('', ''); + +SELECT 'Test fallback of splitByRegexp to splitByChar if regexp is trivial'; +select splitByRegexp(' ', 'a b c'); +select splitByRegexp('-', 'a-b-c'); +select splitByRegexp('.', 'a.b.c'); +select splitByRegexp('^', 'a^b^c'); +select splitByRegexp('$', 'a$b$c'); +select splitByRegexp('+', 'a+b+c'); -- { serverError CANNOT_COMPILE_REGEXP } +select splitByRegexp('?', 'a?b?c'); -- { serverError CANNOT_COMPILE_REGEXP } +select splitByRegexp('(', 'a(b(c'); -- { serverError CANNOT_COMPILE_REGEXP } +select splitByRegexp(')', 'a)b)c'); +select splitByRegexp('[', 'a[b[c'); -- { serverError CANNOT_COMPILE_REGEXP } +select splitByRegexp(']', 'a]b]c'); +select splitByRegexp('{', 'a{b{c'); +select splitByRegexp('}', 'a}b}c'); +select splitByRegexp('|', 'a|b|c'); +select splitByRegexp('\\', 'a\\b\\c'); From 00bbffa6f056348a9252ca178edfee580a1939d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Thu, 23 May 2024 11:04:29 +0000 Subject: [PATCH 272/392] Update autogenerated version to 24.6.1.1 and contributors --- cmake/autogenerated_versions.txt | 10 +++---- .../StorageSystemContributors.generated.cpp | 30 +++++++++++++++++++ 2 files changed, 35 insertions(+), 5 deletions(-) diff --git a/cmake/autogenerated_versions.txt b/cmake/autogenerated_versions.txt index f8ff71876c6..dfbbb66a1e9 100644 --- a/cmake/autogenerated_versions.txt +++ b/cmake/autogenerated_versions.txt @@ -2,11 +2,11 @@ # NOTE: has nothing common with DBMS_TCP_PROTOCOL_VERSION, # only DBMS_TCP_PROTOCOL_VERSION should be incremented on protocol changes. -SET(VERSION_REVISION 54486) +SET(VERSION_REVISION 54487) SET(VERSION_MAJOR 24) -SET(VERSION_MINOR 5) +SET(VERSION_MINOR 6) SET(VERSION_PATCH 1) -SET(VERSION_GITHASH 6d4b31322d168356c8b10c43b4cef157c82337ff) -SET(VERSION_DESCRIBE v24.5.1.1-testing) -SET(VERSION_STRING 24.5.1.1) +SET(VERSION_GITHASH 70a1d3a63d47f0be077d67b8deb907230fc7cfb0) +SET(VERSION_DESCRIBE v24.6.1.1-testing) +SET(VERSION_STRING 24.6.1.1) # end of autochange diff --git a/src/Storages/System/StorageSystemContributors.generated.cpp b/src/Storages/System/StorageSystemContributors.generated.cpp index 909599c00af..b42b070d518 100644 --- a/src/Storages/System/StorageSystemContributors.generated.cpp +++ b/src/Storages/System/StorageSystemContributors.generated.cpp @@ -48,6 +48,7 @@ const char * auto_contributors[] { "Alex Cao", "Alex Cheng", "Alex Karo", + "Alex Katsman", "Alex Krash", "Alex Ryndin", "Alex Zatelepin", @@ -101,6 +102,7 @@ const char * auto_contributors[] { "Alexey Korepanov", "Alexey Milovidov", "Alexey Perevyshin", + "Alexey Petrunyaka", "Alexey Tronov", "Alexey Vasiliev", "Alexey Zatelepin", @@ -109,6 +111,7 @@ const char * auto_contributors[] { "AlfVII", "Alfonso Martinez", "Alfred Xu", + "Ali", "Ali Demirci", "Aliaksandr Pliutau", "Aliaksandr Shylau", @@ -250,6 +253,7 @@ const char * auto_contributors[] { "Brian Hunter", "Brokenice0415", "Bulat Gaifullin", + "Caio Ricciuti", "Camden Cheek", "Camilo Sierra", "Carbyn", @@ -384,6 +388,7 @@ const char * auto_contributors[] { "Evgenii Pravda", "Evgeniia Sudarikova", "Evgeniy Gatov", + "Evgeniy Leko", "Evgeniy Udodov", "Evgeny", "Evgeny Konkov", @@ -413,6 +418,7 @@ const char * auto_contributors[] { "Fille", "Flowyi", "Francisco Barón", + "Francisco Javier Jurado Moreno", "Frank Chen", "Frank Zhao", "François Violette", @@ -425,6 +431,7 @@ const char * auto_contributors[] { "G5.Qin", "Gabriel", "Gabriel Archer", + "Gabriel Martinez", "Gagan Arneja", "Gagan 
Goel", "Gao Qiang", @@ -446,6 +453,7 @@ const char * auto_contributors[] { "Grigory Buteyko", "Grigory Pervakov", "GruffGemini", + "Grégoire Pineau", "Guillaume Tassery", "Guo Wangyang", "Guo Wei (William)", @@ -587,6 +595,7 @@ const char * auto_contributors[] { "Keiji Yoshida", "Ken Chen", "Ken MacInnis", + "KenL", "Kenji Noguchi", "Kerry Clendinning", "Kevin Chiang", @@ -640,6 +649,7 @@ const char * auto_contributors[] { "Leonardo Maciel", "Leonid Krylov", "Leopold Schabel", + "Leticia Webb", "Lev Borodin", "Lewinma", "Li Shuai", @@ -701,6 +711,7 @@ const char * auto_contributors[] { "Masha", "Mathieu Rey", "Matthew Peveler", + "Mattias Naarttijärvi", "Matwey V. Kornilov", "Max", "Max Akhmedov", @@ -711,6 +722,7 @@ const char * auto_contributors[] { "MaxTheHuman", "MaxWk", "Maxim Akhmedov", + "Maxim Alexeev", "Maxim Babenko", "Maxim Fedotov", "Maxim Fridental", @@ -739,6 +751,7 @@ const char * auto_contributors[] { "Michael Razuvaev", "Michael Schnerring", "Michael Smitasin", + "Michael Stetsyuk", "Michail Safronov", "Michal Lisowski", "MicrochipQ", @@ -879,6 +892,7 @@ const char * auto_contributors[] { "Pavlo Bashynskiy", "Pawel Rog", "Paweł Kudzia", + "Pazitiff9", "Peignon Melvyn", "Peng Jian", "Peng Liu", @@ -1084,6 +1098,7 @@ const char * auto_contributors[] { "Tom Bombadil", "Tom Risse", "Tomas Barton", + "Tomer Shafir", "Tomáš Hromada", "Tristan", "Tsarkova Anastasia", @@ -1123,6 +1138,7 @@ const char * auto_contributors[] { "Victor Krasnov", "Victor Tarnavsky", "Viktor Taranenko", + "Vinay Suryadevara", "Vincent", "Vincent Bernat", "Vitalii S", @@ -1162,6 +1178,9 @@ const char * auto_contributors[] { "Vladislav Smirnov", "Vladislav V", "Vojtech Splichal", + "Volodya", + "Volodya Giro", + "Volodyachan", "Volodymyr Kuznetsov", "Vsevolod Orlov", "Vxider", @@ -1179,6 +1198,7 @@ const char * auto_contributors[] { "XenoAmess", "Xianda Ke", "Xiang Zhou", + "Xiaofei Hu", "Xin Wang", "Xoel Lopez Barata", "Xudong Zhang", @@ -1224,6 +1244,7 @@ const char * auto_contributors[] { "Zhipeng", "Zhuo Qiu", "Zijie Lu", + "Zimu Li", "Ziy1-Tan", "Zoran Pandovski", "[데이터플랫폼팀] 이호선", @@ -1490,6 +1511,7 @@ const char * auto_contributors[] { "jiyoungyoooo", "jktng", "jkuklis", + "joe09@foxmail.com", "joelynch", "johanngan", "johnnymatthews", @@ -1658,6 +1680,7 @@ const char * auto_contributors[] { "ongkong", "orantius", "p0ny", + "p1rattttt", "palasonicq", "palegre-tiny", "pawelsz-rb", @@ -1667,6 +1690,7 @@ const char * auto_contributors[] { "pedro.riera", "pengxiangcai", "peshkurov", + "pet74alex", "peter279k", "philip.han", "pingyu", @@ -1680,6 +1704,7 @@ const char * auto_contributors[] { "pyos", "pzhdfy", "qaziqarta", + "qiangxuhui", "qianlixiang", "qianmoQ", "qieqieplus", @@ -1793,6 +1818,7 @@ const char * auto_contributors[] { "unknown", "urgordeadbeef", "usurai", + "v01dxyz", "vahid-sohrabloo", "vdimir", "velavokr", @@ -1802,6 +1828,7 @@ const char * auto_contributors[] { "vic", "vicdashkov", "vicgao", + "vinay92-ch", "vinity", "vitac", "vitstn", @@ -1818,6 +1845,7 @@ const char * auto_contributors[] { "weeds085490", "whysage", "wineternity", + "woodlzm", "wuxiaobai24", "wxybear", "wzl", @@ -1877,6 +1905,7 @@ const char * auto_contributors[] { "zhenjial", "zhifeng", "zhongyuankai", + "zhou", "zhoubintao", "zhukai", "zimv", @@ -1891,6 +1920,7 @@ const char * auto_contributors[] { "zxealous", "zy-kkk", "zzsmdfj", + "zzyReal666", "Šimon Podlipský", "Александр", "Александр Нам", From 299f0886bfda27e375be3edf9042af513cbf99c8 Mon Sep 17 00:00:00 2001 From: vdimir Date: Thu, 23 May 2024 13:48:17 +0200 
Subject: [PATCH 273/392] Followup for #63691 --- src/Processors/Transforms/SquashingChunksTransform.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/Processors/Transforms/SquashingChunksTransform.cpp b/src/Processors/Transforms/SquashingChunksTransform.cpp index 267490dc89e..ed67dd508f3 100644 --- a/src/Processors/Transforms/SquashingChunksTransform.cpp +++ b/src/Processors/Transforms/SquashingChunksTransform.cpp @@ -71,7 +71,9 @@ Chunk SimpleSquashingChunksTransform::generate() if (squashed_chunk.empty()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Can't generate chunk in SimpleSquashingChunksTransform"); - return std::move(squashed_chunk); + Chunk result_chunk; + result_chunk.swap(squashed_chunk); + return result_chunk; } bool SimpleSquashingChunksTransform::canGenerate() @@ -83,7 +85,10 @@ Chunk SimpleSquashingChunksTransform::getRemaining() { Block current_block = squashing.add({}); squashed_chunk.setColumns(current_block.getColumns(), current_block.rows()); - return std::move(squashed_chunk); + + Chunk result_chunk; + result_chunk.swap(squashed_chunk); + return result_chunk; } } From f1c191a3cb2d2037de4346683fbc90a58a98a8a6 Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 23 May 2024 13:48:23 +0200 Subject: [PATCH 274/392] Better --- .../ObjectStorage/Azure/Configuration.cpp | 4 ++++ .../ObjectStorage/ReadBufferIterator.cpp | 23 +++++++++++------- .../ObjectStorage/ReadBufferIterator.h | 3 ++- .../StorageObjectStorageSource.cpp | 20 +++++++--------- .../StorageObjectStorageSource.h | 5 ++-- src/Storages/S3Queue/S3QueueSource.cpp | 24 ++++++++++--------- 6 files changed, 44 insertions(+), 35 deletions(-) diff --git a/src/Storages/ObjectStorage/Azure/Configuration.cpp b/src/Storages/ObjectStorage/Azure/Configuration.cpp index cca94488a30..ada3e2e9323 100644 --- a/src/Storages/ObjectStorage/Azure/Configuration.cpp +++ b/src/Storages/ObjectStorage/Azure/Configuration.cpp @@ -100,6 +100,10 @@ AzureObjectStorage::SettingsPtr StorageAzureConfiguration::createSettings(Contex settings_ptr->max_single_part_upload_size = context_settings.azure_max_single_part_upload_size; settings_ptr->max_single_read_retries = context_settings.azure_max_single_read_retries; settings_ptr->list_object_keys_size = static_cast(context_settings.azure_list_object_keys_size); + settings_ptr->strict_upload_part_size = context_settings.azure_strict_upload_part_size; + settings_ptr->max_upload_part_size = context_settings.azure_max_upload_part_size; + settings_ptr->max_blocks_in_multipart_upload = context_settings.azure_max_blocks_in_multipart_upload; + settings_ptr->min_upload_part_size = context_settings.azure_min_upload_part_size; return settings_ptr; } diff --git a/src/Storages/ObjectStorage/ReadBufferIterator.cpp b/src/Storages/ObjectStorage/ReadBufferIterator.cpp index 5a8a4735fe1..50d69129883 100644 --- a/src/Storages/ObjectStorage/ReadBufferIterator.cpp +++ b/src/Storages/ObjectStorage/ReadBufferIterator.cpp @@ -35,9 +35,10 @@ ReadBufferIterator::ReadBufferIterator( format = configuration->format; } -SchemaCache::Key ReadBufferIterator::getKeyForSchemaCache(const String & path, const String & format_name) const +SchemaCache::Key ReadBufferIterator::getKeyForSchemaCache(const ObjectInfo & object_info, const String & format_name) const { - auto source = std::filesystem::path(configuration->getDataSourceDescription()) / path; + chassert(!object_info.getPath().starts_with("/")); + auto source = std::filesystem::path(configuration->getDataSourceDescription()) / object_info.getPath(); 
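The `chassert(!object_info.getPath().starts_with("/"))` added just above protects the `std::filesystem::path` concatenation that builds `source`: when the right-hand side of `operator/` is an absolute path, the standard library replaces the whole result with it, so a key with a leading slash would silently lose the data-source-description prefix of the schema-cache key. A minimal standalone sketch of that behaviour, with bucket and file names invented for the example:

```cpp
#include <filesystem>
#include <iostream>

int main()
{
    namespace fs = std::filesystem;

    // Relative right-hand side: the prefix on the left is kept.
    std::cout << (fs::path("s3://bucket/data") / "dir/file.csv") << '\n';  // prints "s3://bucket/data/dir/file.csv"

    // Absolute right-hand side (POSIX): operator/ replaces the whole path,
    // so the prefix on the left is silently dropped.
    std::cout << (fs::path("s3://bucket/data") / "/dir/file.csv") << '\n'; // prints "/dir/file.csv"
}
```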
return DB::getKeyForSchemaCache(source, format_name, format_settings, getContext()); } @@ -50,6 +51,7 @@ SchemaCache::Keys ReadBufferIterator::getKeysForSchemaCache() const std::back_inserter(sources), [&](const auto & elem) { + chassert(!elem->getPath().starts_with("/")); return std::filesystem::path(configuration->getDataSourceDescription()) / elem->getPath(); }); return DB::getKeysForSchemaCache(sources, *format, format_settings, getContext()); @@ -78,7 +80,7 @@ std::optional ReadBufferIterator::tryGetColumnsFromCache( if (format) { - auto cache_key = getKeyForSchemaCache(object_info->getPath(), *format); + const auto cache_key = getKeyForSchemaCache(*object_info, *format); if (auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time)) return columns; } @@ -89,7 +91,7 @@ std::optional ReadBufferIterator::tryGetColumnsFromCache( /// If we have such entry for some format, we can use this format to read the file. for (const auto & format_name : FormatFactory::instance().getAllInputFormats()) { - auto cache_key = getKeyForSchemaCache(object_info->getPath(), format_name); + const auto cache_key = getKeyForSchemaCache(*object_info, format_name); if (auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time)) { /// Now format is known. It should be the same for all files. @@ -99,14 +101,13 @@ std::optional ReadBufferIterator::tryGetColumnsFromCache( } } } - return std::nullopt; } void ReadBufferIterator::setNumRowsToLastFile(size_t num_rows) { if (query_settings.schema_inference_use_cache) - schema_cache.addNumRows(getKeyForSchemaCache(current_object_info->getPath(), *format), num_rows); + schema_cache.addNumRows(getKeyForSchemaCache(*current_object_info, *format), num_rows); } void ReadBufferIterator::setSchemaToLastFile(const ColumnsDescription & columns) @@ -114,7 +115,7 @@ void ReadBufferIterator::setSchemaToLastFile(const ColumnsDescription & columns) if (query_settings.schema_inference_use_cache && query_settings.schema_inference_mode == SchemaInferenceMode::UNION) { - schema_cache.addColumns(getKeyForSchemaCache(current_object_info->getPath(), *format), columns); + schema_cache.addColumns(getKeyForSchemaCache(*current_object_info, *format), columns); } } @@ -135,7 +136,7 @@ void ReadBufferIterator::setFormatName(const String & format_name) String ReadBufferIterator::getLastFileName() const { if (current_object_info) - return current_object_info->getFileName(); + return current_object_info->getPath(); else return ""; } @@ -255,17 +256,21 @@ ReadBufferIterator::Data ReadBufferIterator::next() } } + LOG_TEST(getLogger("KSSENII"), "Will read columns from {}", current_object_info->getPath()); + std::unique_ptr read_buf; CompressionMethod compression_method; using ObjectInfoInArchive = StorageObjectStorageSource::ArchiveIterator::ObjectInfoInArchive; if (const auto * object_info_in_archive = dynamic_cast(current_object_info.get())) { - compression_method = chooseCompressionMethod(current_object_info->getFileName(), configuration->compression_method); + LOG_TEST(getLogger("KSSENII"), "Will read columns from {} from archive", current_object_info->getPath()); + compression_method = chooseCompressionMethod(filename, configuration->compression_method); const auto & archive_reader = object_info_in_archive->archive_reader; read_buf = archive_reader->readFile(object_info_in_archive->path_in_archive, /*throw_on_not_found=*/true); } else { + LOG_TEST(getLogger("KSSENII"), "Will read columns from {} from s3", current_object_info->getPath()); compression_method = 
chooseCompressionMethod(filename, configuration->compression_method); read_buf = object_storage->readObject( StoredObject(current_object_info->getPath()), diff --git a/src/Storages/ObjectStorage/ReadBufferIterator.h b/src/Storages/ObjectStorage/ReadBufferIterator.h index 287e316e243..6eeb52ec2ed 100644 --- a/src/Storages/ObjectStorage/ReadBufferIterator.h +++ b/src/Storages/ObjectStorage/ReadBufferIterator.h @@ -13,6 +13,7 @@ public: using FileIterator = std::shared_ptr; using ConfigurationPtr = StorageObjectStorage::ConfigurationPtr; using ObjectInfoPtr = StorageObjectStorage::ObjectInfoPtr; + using ObjectInfo = StorageObjectStorage::ObjectInfo; using ObjectInfos = StorageObjectStorage::ObjectInfos; ReadBufferIterator( @@ -41,7 +42,7 @@ public: std::unique_ptr recreateLastReadBuffer() override; private: - SchemaCache::Key getKeyForSchemaCache(const String & path, const String & format_name) const; + SchemaCache::Key getKeyForSchemaCache(const ObjectInfo & object_info, const String & format_name) const; SchemaCache::Keys getKeysForSchemaCache() const; std::optional tryGetColumnsFromCache( const ObjectInfos::iterator & begin, const ObjectInfos::iterator & end); diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index a2b3ca5b69e..7332574b246 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -183,14 +183,14 @@ Chunk StorageObjectStorageSource::generate() VirtualColumnUtils::addRequestedPathFileAndSizeVirtualsToChunk( chunk, read_from_format_info.requested_virtual_columns, - fs::path(configuration->getNamespace()) / reader.getRelativePath(), + fs::path(configuration->getNamespace()) / reader.getObjectInfo().getPath(), object_info.metadata->size_bytes, &filename); return chunk; } if (reader.getInputFormat() && getContext()->getSettingsRef().use_cache_for_count_from_files) - addNumRowsToCache(reader.getRelativePath(), total_rows_in_file); + addNumRowsToCache(reader.getObjectInfo(), total_rows_in_file); total_rows_in_file = 0; @@ -209,29 +209,28 @@ Chunk StorageObjectStorageSource::generate() return {}; } -void StorageObjectStorageSource::addNumRowsToCache(const String & path, size_t num_rows) +void StorageObjectStorageSource::addNumRowsToCache(const ObjectInfo & object_info, size_t num_rows) { const auto cache_key = getKeyForSchemaCache( - fs::path(configuration->getDataSourceDescription()) / path, + fs::path(configuration->getDataSourceDescription()) / object_info.getPath(), configuration->format, format_settings, getContext()); - schema_cache.addNumRows(cache_key, num_rows); } -std::optional StorageObjectStorageSource::tryGetNumRowsFromCache(const ObjectInfoPtr & object_info) +std::optional StorageObjectStorageSource::tryGetNumRowsFromCache(const ObjectInfo & object_info) { const auto cache_key = getKeyForSchemaCache( - fs::path(configuration->getDataSourceDescription()) / object_info->getPath(), + fs::path(configuration->getDataSourceDescription()) / object_info.getPath(), configuration->format, format_settings, getContext()); auto get_last_mod_time = [&]() -> std::optional { - return object_info->metadata - ? std::optional(object_info->metadata->last_modified.epochTime()) + return object_info.metadata + ? 
std::optional(object_info.metadata->last_modified.epochTime()) : std::nullopt; }; return schema_cache.tryGetNumRows(cache_key, get_last_mod_time); @@ -263,7 +262,7 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade std::optional num_rows_from_cache = need_only_count && getContext()->getSettingsRef().use_cache_for_count_from_files - ? tryGetNumRowsFromCache(object_info) + ? tryGetNumRowsFromCache(*object_info) : std::nullopt; if (num_rows_from_cache) @@ -505,7 +504,6 @@ StorageObjectStorage::ObjectInfoPtr StorageObjectStorageSource::GlobIterator::ne index = 0; - LOG_TEST(logger, "Filter: {}", filter_dag != nullptr); if (filter_dag) { std::vector paths; diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.h b/src/Storages/ObjectStorage/StorageObjectStorageSource.h index 8dbb31fdfba..e9635ff4dce 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.h @@ -94,7 +94,6 @@ protected: PullingPipelineExecutor * operator->() { return reader.get(); } const PullingPipelineExecutor * operator->() const { return reader.get(); } - std::string getRelativePath() const { return object_info->getPath(); } const ObjectInfo & getObjectInfo() const { return *object_info; } const IInputFormat * getInputFormat() const { return dynamic_cast(source.get()); } @@ -115,8 +114,8 @@ protected: std::future createReaderAsync(size_t processor = 0); std::unique_ptr createReadBuffer(const ObjectInfo & object_info); - void addNumRowsToCache(const String & path, size_t num_rows); - std::optional tryGetNumRowsFromCache(const ObjectInfoPtr & object_info); + void addNumRowsToCache(const ObjectInfo & object_info, size_t num_rows); + std::optional tryGetNumRowsFromCache(const ObjectInfo & object_info); void lazyInitialize(size_t processor); }; diff --git a/src/Storages/S3Queue/S3QueueSource.cpp b/src/Storages/S3Queue/S3QueueSource.cpp index 458f681d7b5..c8aaece0711 100644 --- a/src/Storages/S3Queue/S3QueueSource.cpp +++ b/src/Storages/S3Queue/S3QueueSource.cpp @@ -238,12 +238,14 @@ Chunk StorageS3QueueSource::generate() key_with_info->relative_path, getCurrentExceptionMessage(true)); } - appendLogElement(reader.getRelativePath(), *file_status, processed_rows_from_file, false); + appendLogElement(reader.getObjectInfo().getPath(), *file_status, processed_rows_from_file, false); } break; } + const auto & path = reader.getObjectInfo().getPath(); + if (shutdown_called) { if (processed_rows_from_file == 0) @@ -253,7 +255,7 @@ Chunk StorageS3QueueSource::generate() { LOG_DEBUG( log, "Table is being dropped, {} rows are already processed from {}, but file is not fully processed", - processed_rows_from_file, reader.getRelativePath()); + processed_rows_from_file, path); try { @@ -265,7 +267,7 @@ Chunk StorageS3QueueSource::generate() key_with_info->relative_path, getCurrentExceptionMessage(true)); } - appendLogElement(reader.getRelativePath(), *file_status, processed_rows_from_file, false); + appendLogElement(path, *file_status, processed_rows_from_file, false); /// Leave the file half processed. Table is being dropped, so we do not care. break; @@ -273,7 +275,7 @@ Chunk StorageS3QueueSource::generate() LOG_DEBUG(log, "Shutdown called, but file {} is partially processed ({} rows). 
" "Will process the file fully and then shutdown", - reader.getRelativePath(), processed_rows_from_file); + path, processed_rows_from_file); } auto * prev_scope = CurrentThread::get().attachProfileCountersScope(&file_status->profile_counters); @@ -287,31 +289,31 @@ Chunk StorageS3QueueSource::generate() Chunk chunk; if (reader->pull(chunk)) { - LOG_TEST(log, "Read {} rows from file: {}", chunk.getNumRows(), reader.getRelativePath()); + LOG_TEST(log, "Read {} rows from file: {}", chunk.getNumRows(), path); file_status->processed_rows += chunk.getNumRows(); processed_rows_from_file += chunk.getNumRows(); VirtualColumnUtils::addRequestedPathFileAndSizeVirtualsToChunk( - chunk, requested_virtual_columns, reader.getRelativePath(), reader.getObjectInfo().metadata->size_bytes); + chunk, requested_virtual_columns, path, reader.getObjectInfo().metadata->size_bytes); return chunk; } } catch (...) { const auto message = getCurrentExceptionMessage(true); - LOG_ERROR(log, "Got an error while pulling chunk. Will set file {} as failed. Error: {} ", reader.getRelativePath(), message); + LOG_ERROR(log, "Got an error while pulling chunk. Will set file {} as failed. Error: {} ", path, message); files_metadata->setFileFailed(key_with_info->processing_holder, message); - appendLogElement(reader.getRelativePath(), *file_status, processed_rows_from_file, false); + appendLogElement(path, *file_status, processed_rows_from_file, false); throw; } files_metadata->setFileProcessed(key_with_info->processing_holder); - applyActionAfterProcessing(reader.getRelativePath()); + applyActionAfterProcessing(path); - appendLogElement(reader.getRelativePath(), *file_status, processed_rows_from_file, true); + appendLogElement(path, *file_status, processed_rows_from_file, true); file_status.reset(); processed_rows_from_file = 0; @@ -327,7 +329,7 @@ Chunk StorageS3QueueSource::generate() if (!reader) break; - file_status = files_metadata->getFileStatus(reader.getRelativePath()); + file_status = files_metadata->getFileStatus(reader.getObjectInfo().getPath()); /// Even if task is finished the thread may be not freed in pool. /// So wait until it will be freed before scheduling a new task. 
From c150c20512afef6ae816606f197b1ab0a2160712 Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Thu, 23 May 2024 13:53:36 +0200 Subject: [PATCH 275/392] adjust tests in test_merge_tree_s3 --- tests/integration/test_merge_tree_s3/test.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/integration/test_merge_tree_s3/test.py b/tests/integration/test_merge_tree_s3/test.py index 9216b08f942..0bf81e81383 100644 --- a/tests/integration/test_merge_tree_s3/test.py +++ b/tests/integration/test_merge_tree_s3/test.py @@ -857,9 +857,9 @@ def test_merge_canceled_by_s3_errors(cluster, broken_s3, node_name, storage_poli error = node.query_and_get_error( "OPTIMIZE TABLE test_merge_canceled_by_s3_errors FINAL", ) - assert "ExpectedError Message: mock s3 injected error" in error, error + assert "ExpectedError Message: mock s3 injected unretryable error" in error, error - node.wait_for_log_line("ExpectedError Message: mock s3 injected error") + node.wait_for_log_line("ExpectedError Message: mock s3 injected unretryable error") table_uuid = node.query( "SELECT uuid FROM system.tables WHERE database = 'default' AND name = 'test_merge_canceled_by_s3_errors' LIMIT 1" @@ -867,7 +867,7 @@ def test_merge_canceled_by_s3_errors(cluster, broken_s3, node_name, storage_poli node.query("SYSTEM FLUSH LOGS") error_count_in_blob_log = node.query( - f"SELECT count() FROM system.blob_storage_log WHERE query_id like '{table_uuid}::%' AND error like '%mock s3 injected error%'" + f"SELECT count() FROM system.blob_storage_log WHERE query_id like '{table_uuid}::%' AND error like '%mock s3 injected unretryable error%'" ).strip() assert int(error_count_in_blob_log) > 0, node.query( f"SELECT * FROM system.blob_storage_log WHERE query_id like '{table_uuid}::%' FORMAT PrettyCompactMonoBlock" @@ -911,7 +911,7 @@ def test_merge_canceled_by_s3_errors_when_move(cluster, broken_s3, node_name): node.query("OPTIMIZE TABLE merge_canceled_by_s3_errors_when_move FINAL") - node.wait_for_log_line("ExpectedError Message: mock s3 injected error") + node.wait_for_log_line("ExpectedError Message: mock s3 injected unretryable error") count = node.query("SELECT count() FROM merge_canceled_by_s3_errors_when_move") assert int(count) == 2000, count From ce26c4f65746ec3058f1639f83b675feef4fda1c Mon Sep 17 00:00:00 2001 From: Blargian Date: Thu, 23 May 2024 13:54:45 +0200 Subject: [PATCH 276/392] =?UTF-8?q?Review=20changes=20and=20replace=20?= =?UTF-8?q?=E2=80=A6=20with=20...?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../template-setting.md | 2 +- docs/changelogs/v20.7.1.4310-prestable.md | 2 +- docs/changelogs/v21.12.1.9017-prestable.md | 2 +- docs/changelogs/v21.3.3.14-lts.md | 2 +- docs/changelogs/v21.4.1.6422-prestable.md | 2 +- docs/changelogs/v21.4.2.10-prestable.md | 2 +- docs/changelogs/v22.6.1.1985-stable.md | 4 +- docs/changelogs/v22.7.1.2484-stable.md | 2 +- docs/changelogs/v22.8.13.20-lts.md | 2 +- docs/changelogs/v23.11.1.2711-stable.md | 2 +- docs/changelogs/v23.12.1.1368-stable.md | 2 +- docs/changelogs/v23.3.1.2823-lts.md | 2 +- docs/changelogs/v23.5.1.3174-stable.md | 2 +- docs/changelogs/v23.8.1.2992-lts.md | 2 +- docs/changelogs/v24.1.3.31-stable.md | 2 +- docs/changelogs/v24.2.1.2248-stable.md | 2 +- docs/changelogs/v24.3.1.2672-lts.md | 2 +- docs/en/development/style.md | 6 +- .../table-engines/integrations/hdfs.md | 2 +- .../engines/table-engines/integrations/s3.md | 2 +- .../custom-partitioning-key.md | 2 +- .../mergetree-family/mergetree.md | 4 +- 
.../table-engines/special/external-data.md | 2 +- .../operations/settings/query-complexity.md | 4 +- docs/en/operations/settings/settings.md | 2 +- .../parametric-functions.md | 4 +- .../reference/quantiles.md | 2 +- .../data-types/aggregatefunction.md | 4 +- .../sql-reference/data-types/fixedstring.md | 4 +- .../nested-data-structures/index.md | 2 +- .../data-types/simpleaggregatefunction.md | 2 +- .../functions/arithmetic-functions.md | 54 ++++++++++++ .../functions/array-functions.md | 84 +++++++++---------- .../functions/date-time-functions.md | 2 +- .../sql-reference/functions/json-functions.md | 24 +++--- .../functions/other-functions.md | 62 +------------- .../functions/string-replace-functions.md | 2 +- .../functions/string-search-functions.md | 12 +-- .../functions/tuple-functions.md | 6 +- .../functions/tuple-map-functions.md | 4 +- .../sql-reference/functions/url-functions.md | 2 +- .../sql-reference/statements/alter/comment.md | 2 +- .../sql-reference/statements/alter/delete.md | 2 +- .../sql-reference/statements/alter/index.md | 2 +- .../sql-reference/statements/alter/update.md | 2 +- .../en/sql-reference/statements/alter/view.md | 6 +- .../sql-reference/statements/create/view.md | 2 +- .../sql-reference/statements/insert-into.md | 2 +- .../sql-reference/statements/select/limit.md | 4 +- .../statements/select/order-by.md | 2 +- docs/en/sql-reference/table-functions/file.md | 2 +- docs/en/sql-reference/table-functions/gcs.md | 2 +- docs/en/sql-reference/table-functions/hdfs.md | 2 +- docs/en/sql-reference/table-functions/s3.md | 2 +- docs/ru/development/style.md | 8 +- .../table-engines/integrations/hdfs.md | 2 +- .../engines/table-engines/integrations/s3.md | 2 +- .../custom-partitioning-key.md | 2 +- .../mergetree-family/mergetree.md | 4 +- .../table-engines/special/external-data.md | 2 +- docs/ru/faq/general/olap.md | 6 +- .../example-datasets/nyc-taxi.md | 2 +- docs/ru/index.md | 12 +-- .../operations/settings/query-complexity.md | 4 +- docs/ru/operations/settings/settings.md | 2 +- .../parametric-functions.md | 4 +- .../reference/quantiles.md | 2 +- .../data-types/aggregatefunction.md | 4 +- .../sql-reference/data-types/fixedstring.md | 4 +- .../nested-data-structures/nested.md | 2 +- docs/ru/sql-reference/data-types/tuple.md | 2 +- .../functions/array-functions.md | 40 ++++----- .../functions/date-time-functions.md | 2 +- .../sql-reference/functions/json-functions.md | 24 +++--- .../functions/other-functions.md | 2 +- .../functions/string-functions.md | 2 +- .../functions/string-search-functions.md | 18 ++-- .../functions/tuple-functions.md | 6 +- .../sql-reference/functions/url-functions.md | 2 +- .../sql-reference/statements/alter/comment.md | 2 +- .../sql-reference/statements/alter/delete.md | 2 +- .../sql-reference/statements/alter/index.md | 2 +- .../sql-reference/statements/alter/update.md | 2 +- .../ru/sql-reference/statements/alter/view.md | 4 +- .../sql-reference/statements/create/view.md | 2 +- .../sql-reference/statements/insert-into.md | 2 +- docs/ru/sql-reference/table-functions/file.md | 2 +- docs/ru/sql-reference/table-functions/s3.md | 2 +- docs/zh/changelog/index.md | 4 +- docs/zh/development/style.md | 8 +- .../table-engines/integrations/hdfs.md | 2 +- .../engines/table-engines/integrations/s3.md | 4 +- .../custom-partitioning-key.md | 2 +- .../mergetree-family/mergetree.md | 4 +- .../table-engines/special/external-data.md | 2 +- docs/zh/faq/general/olap.md | 6 +- .../example-datasets/nyc-taxi.md | 2 +- .../example-datasets/uk-price-paid.mdx | 2 +- 
.../sparse-primary-indexes.md | 2 +- docs/zh/index.md | 12 +-- .../operations/settings/query-complexity.md | 4 +- docs/zh/operations/settings/settings.md | 2 +- .../operations/system-tables/dictionaries.md | 2 +- .../parametric-functions.md | 4 +- .../reference/quantiles.md | 2 +- .../data-types/aggregatefunction.md | 2 +- .../sql-reference/data-types/domains/index.md | 4 +- .../sql-reference/data-types/fixedstring.md | 4 +- .../nested-data-structures/nested.md | 2 +- .../data-types/simpleaggregatefunction.md | 2 +- docs/zh/sql-reference/data-types/tuple.md | 2 +- .../functions/array-functions.md | 40 ++++----- .../functions/date-time-functions.md | 2 +- .../functions/higher-order-functions.md | 22 ++--- .../sql-reference/functions/in-functions.md | 4 +- .../sql-reference/functions/json-functions.md | 24 +++--- .../functions/other-functions.md | 2 +- .../functions/string-functions.md | 6 +- .../functions/string-search-functions.md | 18 ++-- .../sql-reference/functions/url-functions.md | 2 +- .../sql-reference/statements/alter/delete.md | 2 +- .../sql-reference/statements/alter/index.md | 2 +- .../sql-reference/statements/alter/update.md | 2 +- .../zh/sql-reference/statements/alter/view.md | 4 +- .../sql-reference/statements/create/view.md | 2 +- .../sql-reference/statements/insert-into.md | 2 +- .../sql-reference/statements/select/limit.md | 4 +- .../statements/select/order-by.md | 2 +- docs/zh/sql-reference/table-functions/file.md | 2 +- docs/zh/sql-reference/table-functions/hdfs.md | 2 +- docs/zh/sql-reference/table-functions/s3.md | 2 +- 131 files changed, 384 insertions(+), 384 deletions(-) diff --git a/docs/_description_templates/template-setting.md b/docs/_description_templates/template-setting.md index fc912aba3e1..f4525d872df 100644 --- a/docs/_description_templates/template-setting.md +++ b/docs/_description_templates/template-setting.md @@ -2,7 +2,7 @@ Description. -For the switch setting, use the typical phrase: “Enables or disables something …”. +For the switch setting, use the typical phrase: “Enables or disables something ...”. Possible values: diff --git a/docs/changelogs/v20.7.1.4310-prestable.md b/docs/changelogs/v20.7.1.4310-prestable.md index f47c7334228..aa1d993b263 100644 --- a/docs/changelogs/v20.7.1.4310-prestable.md +++ b/docs/changelogs/v20.7.1.4310-prestable.md @@ -166,4 +166,4 @@ * NO CL ENTRY: 'Revert "Abort on std::out_of_range in debug builds"'. [#12752](https://github.com/ClickHouse/ClickHouse/pull/12752) ([Alexey Milovidov](https://github.com/alexey-milovidov)). * NO CL ENTRY: 'Bump protobuf from 3.12.2 to 3.12.4 in /docs/tools'. [#13102](https://github.com/ClickHouse/ClickHouse/pull/13102) ([dependabot-preview[bot]](https://github.com/apps/dependabot-preview)). * NO CL ENTRY: 'Merge [#12574](https://github.com/ClickHouse/ClickHouse/issues/12574)'. [#13158](https://github.com/ClickHouse/ClickHouse/pull/13158) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* NO CL ENTRY: 'Revert "Add QueryTimeMicroseconds, SelectQueryTimeMicroseconds and InsertQuer…"'. [#13303](https://github.com/ClickHouse/ClickHouse/pull/13303) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* NO CL ENTRY: 'Revert "Add QueryTimeMicroseconds, SelectQueryTimeMicroseconds and InsertQuer..."'. [#13303](https://github.com/ClickHouse/ClickHouse/pull/13303) ([Alexey Milovidov](https://github.com/alexey-milovidov)). 
diff --git a/docs/changelogs/v21.12.1.9017-prestable.md b/docs/changelogs/v21.12.1.9017-prestable.md index 88b8260e312..bd84873e67a 100644 --- a/docs/changelogs/v21.12.1.9017-prestable.md +++ b/docs/changelogs/v21.12.1.9017-prestable.md @@ -421,5 +421,5 @@ sidebar_label: 2022 * Fix possible crash in DataTypeAggregateFunction [#32287](https://github.com/ClickHouse/ClickHouse/pull/32287) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). * Update backport.py [#32323](https://github.com/ClickHouse/ClickHouse/pull/32323) ([Kseniia Sumarokova](https://github.com/kssenii)). * Fix graphite-bench build [#32351](https://github.com/ClickHouse/ClickHouse/pull/32351) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). -* Revert "graphite: split tagged/plain rollup rules (for merges perfoma… [#32376](https://github.com/ClickHouse/ClickHouse/pull/32376) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Revert "graphite: split tagged/plain rollup rules (for merges perfoma... [#32376](https://github.com/ClickHouse/ClickHouse/pull/32376) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). * Another attempt to fix unit test Executor::RemoveTasksStress [#32390](https://github.com/ClickHouse/ClickHouse/pull/32390) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). diff --git a/docs/changelogs/v21.3.3.14-lts.md b/docs/changelogs/v21.3.3.14-lts.md index 57bde602f21..91d99deaa6b 100644 --- a/docs/changelogs/v21.3.3.14-lts.md +++ b/docs/changelogs/v21.3.3.14-lts.md @@ -18,4 +18,4 @@ sidebar_label: 2022 #### NOT FOR CHANGELOG / INSIGNIFICANT -* fix incorrect number of rows for Chunks with no columns in PartialSor… [#21761](https://github.com/ClickHouse/ClickHouse/pull/21761) ([Alexander Kuzmenkov](https://github.com/akuzm)). +* fix incorrect number of rows for Chunks with no columns in PartialSor... [#21761](https://github.com/ClickHouse/ClickHouse/pull/21761) ([Alexander Kuzmenkov](https://github.com/akuzm)). diff --git a/docs/changelogs/v21.4.1.6422-prestable.md b/docs/changelogs/v21.4.1.6422-prestable.md index 2eadb0d4754..66937c3be15 100644 --- a/docs/changelogs/v21.4.1.6422-prestable.md +++ b/docs/changelogs/v21.4.1.6422-prestable.md @@ -223,7 +223,7 @@ sidebar_label: 2022 * Do not overlap zookeeper path for ReplicatedMergeTree in stateless *.sh tests [#21724](https://github.com/ClickHouse/ClickHouse/pull/21724) ([Azat Khuzhin](https://github.com/azat)). * make the fuzzer use sources from the CI [#21754](https://github.com/ClickHouse/ClickHouse/pull/21754) ([Alexander Kuzmenkov](https://github.com/akuzm)). * Add one more variant to memcpy benchmark [#21759](https://github.com/ClickHouse/ClickHouse/pull/21759) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* fix incorrect number of rows for Chunks with no columns in PartialSor… [#21761](https://github.com/ClickHouse/ClickHouse/pull/21761) ([Alexander Kuzmenkov](https://github.com/akuzm)). +* fix incorrect number of rows for Chunks with no columns in PartialSor... [#21761](https://github.com/ClickHouse/ClickHouse/pull/21761) ([Alexander Kuzmenkov](https://github.com/akuzm)). * docs(fix): typo [#21775](https://github.com/ClickHouse/ClickHouse/pull/21775) ([Ali Demirci](https://github.com/depyronick)). * DDLWorker.cpp: fixed exceeded amount of tries typo [#21807](https://github.com/ClickHouse/ClickHouse/pull/21807) ([Eldar Nasyrov](https://github.com/3ldar-nasyrov)). * fix integration MaterializeMySQL test [#21819](https://github.com/ClickHouse/ClickHouse/pull/21819) ([TCeason](https://github.com/TCeason)). 
diff --git a/docs/changelogs/v21.4.2.10-prestable.md b/docs/changelogs/v21.4.2.10-prestable.md index 3db17ddfcf3..b9bdbd80c0c 100644 --- a/docs/changelogs/v21.4.2.10-prestable.md +++ b/docs/changelogs/v21.4.2.10-prestable.md @@ -226,7 +226,7 @@ sidebar_label: 2022 * Do not overlap zookeeper path for ReplicatedMergeTree in stateless *.sh tests [#21724](https://github.com/ClickHouse/ClickHouse/pull/21724) ([Azat Khuzhin](https://github.com/azat)). * make the fuzzer use sources from the CI [#21754](https://github.com/ClickHouse/ClickHouse/pull/21754) ([Alexander Kuzmenkov](https://github.com/akuzm)). * Add one more variant to memcpy benchmark [#21759](https://github.com/ClickHouse/ClickHouse/pull/21759) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* fix incorrect number of rows for Chunks with no columns in PartialSor… [#21761](https://github.com/ClickHouse/ClickHouse/pull/21761) ([Alexander Kuzmenkov](https://github.com/akuzm)). +* fix incorrect number of rows for Chunks with no columns in PartialSor... [#21761](https://github.com/ClickHouse/ClickHouse/pull/21761) ([Alexander Kuzmenkov](https://github.com/akuzm)). * docs(fix): typo [#21775](https://github.com/ClickHouse/ClickHouse/pull/21775) ([Ali Demirci](https://github.com/depyronick)). * DDLWorker.cpp: fixed exceeded amount of tries typo [#21807](https://github.com/ClickHouse/ClickHouse/pull/21807) ([Eldar Nasyrov](https://github.com/3ldar-nasyrov)). * fix integration MaterializeMySQL test [#21819](https://github.com/ClickHouse/ClickHouse/pull/21819) ([TCeason](https://github.com/TCeason)). diff --git a/docs/changelogs/v22.6.1.1985-stable.md b/docs/changelogs/v22.6.1.1985-stable.md index c915d24fe00..7bd7038377a 100644 --- a/docs/changelogs/v22.6.1.1985-stable.md +++ b/docs/changelogs/v22.6.1.1985-stable.md @@ -160,7 +160,7 @@ sidebar_label: 2022 * fix toString error on DatatypeDate32. [#37775](https://github.com/ClickHouse/ClickHouse/pull/37775) ([LiuNeng](https://github.com/liuneng1994)). * The clickhouse-keeper setting `dead_session_check_period_ms` was transformed into microseconds (multiplied by 1000), which lead to dead sessions only being cleaned up after several minutes (instead of 500ms). [#37824](https://github.com/ClickHouse/ClickHouse/pull/37824) ([Michael Lex](https://github.com/mlex)). * Fix possible "No more packets are available" for distributed queries (in case of `async_socket_for_remote`/`use_hedged_requests` is disabled). [#37826](https://github.com/ClickHouse/ClickHouse/pull/37826) ([Azat Khuzhin](https://github.com/azat)). -* Do not drop the inner target table when executing `ALTER TABLE … MODIFY QUERY` in WindowView. [#37879](https://github.com/ClickHouse/ClickHouse/pull/37879) ([vxider](https://github.com/Vxider)). +* Do not drop the inner target table when executing `ALTER TABLE ... MODIFY QUERY` in WindowView. [#37879](https://github.com/ClickHouse/ClickHouse/pull/37879) ([vxider](https://github.com/Vxider)). * Fix directory ownership of coordination dir in clickhouse-keeper Docker image. Fixes [#37914](https://github.com/ClickHouse/ClickHouse/issues/37914). [#37915](https://github.com/ClickHouse/ClickHouse/pull/37915) ([James Maidment](https://github.com/jamesmaidment)). * Dictionaries fix custom query with update field and `{condition}`. Closes [#33746](https://github.com/ClickHouse/ClickHouse/issues/33746). [#37947](https://github.com/ClickHouse/ClickHouse/pull/37947) ([Maksim Kita](https://github.com/kitaisreal)). * Fix possible incorrect result of `SELECT ... 
WITH FILL` in the case when `ORDER BY` should be applied after `WITH FILL` result (e.g. for outer query). Incorrect result was caused by optimization for `ORDER BY` expressions ([#35623](https://github.com/ClickHouse/ClickHouse/issues/35623)). Closes [#37904](https://github.com/ClickHouse/ClickHouse/issues/37904). [#37959](https://github.com/ClickHouse/ClickHouse/pull/37959) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). @@ -180,7 +180,7 @@ sidebar_label: 2022 #### NO CL ENTRY * NO CL ENTRY: 'Revert "Fix mutations in tables with columns of type `Object`"'. [#37355](https://github.com/ClickHouse/ClickHouse/pull/37355) ([Alexander Tokmakov](https://github.com/tavplubix)). -* NO CL ENTRY: 'Revert "Remove height restrictions from the query div in play web tool, and m…"'. [#37501](https://github.com/ClickHouse/ClickHouse/pull/37501) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* NO CL ENTRY: 'Revert "Remove height restrictions from the query div in play web tool, and m..."'. [#37501](https://github.com/ClickHouse/ClickHouse/pull/37501) ([Alexey Milovidov](https://github.com/alexey-milovidov)). * NO CL ENTRY: 'Revert "Add support for preprocessing ZooKeeper operations in `clickhouse-keeper`"'. [#37534](https://github.com/ClickHouse/ClickHouse/pull/37534) ([Antonio Andelic](https://github.com/antonio2368)). * NO CL ENTRY: 'Revert "(only with zero-copy replication, non-production experimental feature not recommended to use) fix possible deadlock during fetching part"'. [#37545](https://github.com/ClickHouse/ClickHouse/pull/37545) ([Alexander Tokmakov](https://github.com/tavplubix)). * NO CL ENTRY: 'Revert "RFC: Fix converting types for UNION queries (may produce LOGICAL_ERROR)"'. [#37582](https://github.com/ClickHouse/ClickHouse/pull/37582) ([Dmitry Novik](https://github.com/novikd)). diff --git a/docs/changelogs/v22.7.1.2484-stable.md b/docs/changelogs/v22.7.1.2484-stable.md index 7464b0449ee..c4a76c66e0c 100644 --- a/docs/changelogs/v22.7.1.2484-stable.md +++ b/docs/changelogs/v22.7.1.2484-stable.md @@ -410,7 +410,7 @@ sidebar_label: 2022 * Add test for [#39132](https://github.com/ClickHouse/ClickHouse/issues/39132) [#39173](https://github.com/ClickHouse/ClickHouse/pull/39173) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). * Suppression for BC check (`Cannot parse string 'Hello' as UInt64`) [#39176](https://github.com/ClickHouse/ClickHouse/pull/39176) ([Alexander Tokmakov](https://github.com/tavplubix)). * Fix 01961_roaring_memory_tracking test [#39187](https://github.com/ClickHouse/ClickHouse/pull/39187) ([Dmitry Novik](https://github.com/novikd)). -* Cleanup: done during [#38719](https://github.com/ClickHouse/ClickHouse/issues/38719) (SortingStep: deduce way to sort based on … [#39191](https://github.com/ClickHouse/ClickHouse/pull/39191) ([Igor Nikonov](https://github.com/devcrafter)). +* Cleanup: done during [#38719](https://github.com/ClickHouse/ClickHouse/issues/38719) (SortingStep: deduce way to sort based on ... [#39191](https://github.com/ClickHouse/ClickHouse/pull/39191) ([Igor Nikonov](https://github.com/devcrafter)). * Fix exception in AsynchronousMetrics for s390x [#39193](https://github.com/ClickHouse/ClickHouse/pull/39193) ([Harry Lee](https://github.com/HarryLeeIBM)). * Optimize accesses to system.stack_trace (filter by name before sending signal) [#39212](https://github.com/ClickHouse/ClickHouse/pull/39212) ([Azat Khuzhin](https://github.com/azat)). 
* Enable warning "-Wdeprecated-dynamic-exception-spec" [#39213](https://github.com/ClickHouse/ClickHouse/pull/39213) ([Robert Schulze](https://github.com/rschu1ze)). diff --git a/docs/changelogs/v22.8.13.20-lts.md b/docs/changelogs/v22.8.13.20-lts.md index 0734f40bf3e..ad44fbfc5d6 100644 --- a/docs/changelogs/v22.8.13.20-lts.md +++ b/docs/changelogs/v22.8.13.20-lts.md @@ -20,4 +20,4 @@ sidebar_label: 2023 * Fix wrong approved_at, simplify conditions [#45302](https://github.com/ClickHouse/ClickHouse/pull/45302) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). * Get rid of artifactory in favor of r2 + ch-repos-manager [#45421](https://github.com/ClickHouse/ClickHouse/pull/45421) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). * Trim refs/tags/ from GITHUB_TAG in release workflow [#45636](https://github.com/ClickHouse/ClickHouse/pull/45636) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). -* Merge pull request [#38262](https://github.com/ClickHouse/ClickHouse/issues/38262) from PolyProgrammist/fix-ordinary-system-un… [#45650](https://github.com/ClickHouse/ClickHouse/pull/45650) ([alesapin](https://github.com/alesapin)). +* Merge pull request [#38262](https://github.com/ClickHouse/ClickHouse/issues/38262) from PolyProgrammist/fix-ordinary-system-un... [#45650](https://github.com/ClickHouse/ClickHouse/pull/45650) ([alesapin](https://github.com/alesapin)). diff --git a/docs/changelogs/v23.11.1.2711-stable.md b/docs/changelogs/v23.11.1.2711-stable.md index e32dee41dc7..0bdee08f5c9 100644 --- a/docs/changelogs/v23.11.1.2711-stable.md +++ b/docs/changelogs/v23.11.1.2711-stable.md @@ -217,7 +217,7 @@ sidebar_label: 2023 * S3Queue minor fix [#56999](https://github.com/ClickHouse/ClickHouse/pull/56999) ([Kseniia Sumarokova](https://github.com/kssenii)). * Fix file path validation for DatabaseFileSystem [#57029](https://github.com/ClickHouse/ClickHouse/pull/57029) ([San](https://github.com/santrancisco)). * Fix `fuzzBits` with `ARRAY JOIN` [#57033](https://github.com/ClickHouse/ClickHouse/pull/57033) ([Antonio Andelic](https://github.com/antonio2368)). -* Fix Nullptr dereference in partial merge join with joined_subquery_re… [#57048](https://github.com/ClickHouse/ClickHouse/pull/57048) ([vdimir](https://github.com/vdimir)). +* Fix Nullptr dereference in partial merge join with joined_subquery_re... [#57048](https://github.com/ClickHouse/ClickHouse/pull/57048) ([vdimir](https://github.com/vdimir)). * Fix race condition in RemoteSource [#57052](https://github.com/ClickHouse/ClickHouse/pull/57052) ([Raúl Marín](https://github.com/Algunenano)). * Implement `bitHammingDistance` for big integers [#57073](https://github.com/ClickHouse/ClickHouse/pull/57073) ([Alexey Milovidov](https://github.com/alexey-milovidov)). * S3-style links bug fix [#57075](https://github.com/ClickHouse/ClickHouse/pull/57075) ([Yarik Briukhovetskyi](https://github.com/yariks5s)). diff --git a/docs/changelogs/v23.12.1.1368-stable.md b/docs/changelogs/v23.12.1.1368-stable.md index 1a322ae9c0f..cb8ba57100e 100644 --- a/docs/changelogs/v23.12.1.1368-stable.md +++ b/docs/changelogs/v23.12.1.1368-stable.md @@ -272,7 +272,7 @@ sidebar_label: 2023 * Bump Azure to v1.6.0 [#58052](https://github.com/ClickHouse/ClickHouse/pull/58052) ([Robert Schulze](https://github.com/rschu1ze)). * Correct values for randomization [#58058](https://github.com/ClickHouse/ClickHouse/pull/58058) ([Anton Popov](https://github.com/CurtizJ)). 
* Non post request should be readonly [#58060](https://github.com/ClickHouse/ClickHouse/pull/58060) ([San](https://github.com/santrancisco)). -* Revert "Merge pull request [#55710](https://github.com/ClickHouse/ClickHouse/issues/55710) from guoxiaolongzte/clickhouse-test… [#58066](https://github.com/ClickHouse/ClickHouse/pull/58066) ([Raúl Marín](https://github.com/Algunenano)). +* Revert "Merge pull request [#55710](https://github.com/ClickHouse/ClickHouse/issues/55710) from guoxiaolongzte/clickhouse-test... [#58066](https://github.com/ClickHouse/ClickHouse/pull/58066) ([Raúl Marín](https://github.com/Algunenano)). * fix typo in the test 02479 [#58072](https://github.com/ClickHouse/ClickHouse/pull/58072) ([Sema Checherinda](https://github.com/CheSema)). * Bump Azure to 1.7.2 [#58075](https://github.com/ClickHouse/ClickHouse/pull/58075) ([Robert Schulze](https://github.com/rschu1ze)). * Fix flaky test `02567_and_consistency` [#58076](https://github.com/ClickHouse/ClickHouse/pull/58076) ([Anton Popov](https://github.com/CurtizJ)). diff --git a/docs/changelogs/v23.3.1.2823-lts.md b/docs/changelogs/v23.3.1.2823-lts.md index 0c9be3601da..f81aba53ebe 100644 --- a/docs/changelogs/v23.3.1.2823-lts.md +++ b/docs/changelogs/v23.3.1.2823-lts.md @@ -520,7 +520,7 @@ sidebar_label: 2023 * Improve script for updating clickhouse-docs [#48135](https://github.com/ClickHouse/ClickHouse/pull/48135) ([Alexander Tokmakov](https://github.com/tavplubix)). * Fix stdlib compatibility issues [#48150](https://github.com/ClickHouse/ClickHouse/pull/48150) ([DimasKovas](https://github.com/DimasKovas)). * Make test test_disallow_concurrency less flaky [#48152](https://github.com/ClickHouse/ClickHouse/pull/48152) ([Vitaly Baranov](https://github.com/vitlibar)). -* Remove unused mockSystemDatabase from gtest_transform_query_for_exter… [#48162](https://github.com/ClickHouse/ClickHouse/pull/48162) ([Vladimir C](https://github.com/vdimir)). +* Remove unused mockSystemDatabase from gtest_transform_query_for_exter... [#48162](https://github.com/ClickHouse/ClickHouse/pull/48162) ([Vladimir C](https://github.com/vdimir)). * Update environmental-sensors.md [#48166](https://github.com/ClickHouse/ClickHouse/pull/48166) ([Alexey Milovidov](https://github.com/alexey-milovidov)). * Correctly handle NULL constants in logical optimizer for new analyzer [#48168](https://github.com/ClickHouse/ClickHouse/pull/48168) ([Antonio Andelic](https://github.com/antonio2368)). * Try making KeeperMap test more stable [#48170](https://github.com/ClickHouse/ClickHouse/pull/48170) ([Antonio Andelic](https://github.com/antonio2368)). diff --git a/docs/changelogs/v23.5.1.3174-stable.md b/docs/changelogs/v23.5.1.3174-stable.md index 2212eb6e893..4bdd4139afc 100644 --- a/docs/changelogs/v23.5.1.3174-stable.md +++ b/docs/changelogs/v23.5.1.3174-stable.md @@ -474,7 +474,7 @@ sidebar_label: 2023 * Fix flakiness of test_distributed_load_balancing test [#49921](https://github.com/ClickHouse/ClickHouse/pull/49921) ([Azat Khuzhin](https://github.com/azat)). * Add some logging [#49925](https://github.com/ClickHouse/ClickHouse/pull/49925) ([Kseniia Sumarokova](https://github.com/kssenii)). * Support hardlinking parts transactionally [#49931](https://github.com/ClickHouse/ClickHouse/pull/49931) ([Michael Kolupaev](https://github.com/al13n321)). -* Fix for analyzer: 02377_ optimize_sorting_by_input_stream_properties_e… [#49943](https://github.com/ClickHouse/ClickHouse/pull/49943) ([Igor Nikonov](https://github.com/devcrafter)). 
+* Fix for analyzer: 02377_ optimize_sorting_by_input_stream_properties_e... [#49943](https://github.com/ClickHouse/ClickHouse/pull/49943) ([Igor Nikonov](https://github.com/devcrafter)). * Follow up to [#49429](https://github.com/ClickHouse/ClickHouse/issues/49429) [#49964](https://github.com/ClickHouse/ClickHouse/pull/49964) ([Kseniia Sumarokova](https://github.com/kssenii)). * Fix flaky test_ssl_cert_authentication to use urllib3 [#49982](https://github.com/ClickHouse/ClickHouse/pull/49982) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). * Fix woboq codebrowser build with -Wno-poison-system-directories [#49992](https://github.com/ClickHouse/ClickHouse/pull/49992) ([Azat Khuzhin](https://github.com/azat)). diff --git a/docs/changelogs/v23.8.1.2992-lts.md b/docs/changelogs/v23.8.1.2992-lts.md index 7c224b19350..05385d9c52b 100644 --- a/docs/changelogs/v23.8.1.2992-lts.md +++ b/docs/changelogs/v23.8.1.2992-lts.md @@ -272,7 +272,7 @@ sidebar_label: 2023 * Add more checks into ThreadStatus ctor. [#42019](https://github.com/ClickHouse/ClickHouse/pull/42019) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). * Refactor Query Tree visitor [#46740](https://github.com/ClickHouse/ClickHouse/pull/46740) ([Dmitry Novik](https://github.com/novikd)). * Revert "Revert "Randomize JIT settings in tests"" [#48282](https://github.com/ClickHouse/ClickHouse/pull/48282) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Fix outdated cache configuration in s3 tests: s3_storage_policy_by_defau… [#48424](https://github.com/ClickHouse/ClickHouse/pull/48424) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix outdated cache configuration in s3 tests: s3_storage_policy_by_defau... [#48424](https://github.com/ClickHouse/ClickHouse/pull/48424) ([Kseniia Sumarokova](https://github.com/kssenii)). * Fix IN with decimal in analyzer [#48754](https://github.com/ClickHouse/ClickHouse/pull/48754) ([vdimir](https://github.com/vdimir)). * Some unclear change in StorageBuffer::reschedule() for something [#49723](https://github.com/ClickHouse/ClickHouse/pull/49723) ([DimasKovas](https://github.com/DimasKovas)). * MergeTree & SipHash checksum big-endian support [#50276](https://github.com/ClickHouse/ClickHouse/pull/50276) ([ltrk2](https://github.com/ltrk2)). diff --git a/docs/changelogs/v24.1.3.31-stable.md b/docs/changelogs/v24.1.3.31-stable.md index 046ca451fbc..e898fba5c87 100644 --- a/docs/changelogs/v24.1.3.31-stable.md +++ b/docs/changelogs/v24.1.3.31-stable.md @@ -13,7 +13,7 @@ sidebar_label: 2024 #### Bug Fix (user-visible misbehavior in an official stable release) -* Fix `ASTAlterCommand::formatImpl` in case of column specific settings… [#59445](https://github.com/ClickHouse/ClickHouse/pull/59445) ([János Benjamin Antal](https://github.com/antaljanosbenjamin)). +* Fix `ASTAlterCommand::formatImpl` in case of column specific settings... [#59445](https://github.com/ClickHouse/ClickHouse/pull/59445) ([János Benjamin Antal](https://github.com/antaljanosbenjamin)). * Make MAX use the same rules as permutation for complex types [#59498](https://github.com/ClickHouse/ClickHouse/pull/59498) ([Raúl Marín](https://github.com/Algunenano)). * Fix corner case when passing `update_insert_deduplication_token_in_dependent_materialized_views` [#59544](https://github.com/ClickHouse/ClickHouse/pull/59544) ([Jordi Villar](https://github.com/jrdi)). 
* Fix incorrect result of arrayElement / map[] on empty value [#59594](https://github.com/ClickHouse/ClickHouse/pull/59594) ([Raúl Marín](https://github.com/Algunenano)). diff --git a/docs/changelogs/v24.2.1.2248-stable.md b/docs/changelogs/v24.2.1.2248-stable.md index 6113dd51ab1..02affe12c43 100644 --- a/docs/changelogs/v24.2.1.2248-stable.md +++ b/docs/changelogs/v24.2.1.2248-stable.md @@ -130,7 +130,7 @@ sidebar_label: 2024 * Fix translate() with FixedString input [#59356](https://github.com/ClickHouse/ClickHouse/pull/59356) ([Raúl Marín](https://github.com/Algunenano)). * Fix digest calculation in Keeper [#59439](https://github.com/ClickHouse/ClickHouse/pull/59439) ([Antonio Andelic](https://github.com/antonio2368)). * Fix stacktraces for binaries without debug symbols [#59444](https://github.com/ClickHouse/ClickHouse/pull/59444) ([Azat Khuzhin](https://github.com/azat)). -* Fix `ASTAlterCommand::formatImpl` in case of column specific settings… [#59445](https://github.com/ClickHouse/ClickHouse/pull/59445) ([János Benjamin Antal](https://github.com/antaljanosbenjamin)). +* Fix `ASTAlterCommand::formatImpl` in case of column specific settings... [#59445](https://github.com/ClickHouse/ClickHouse/pull/59445) ([János Benjamin Antal](https://github.com/antaljanosbenjamin)). * Fix `SELECT * FROM [...] ORDER BY ALL` with Analyzer [#59462](https://github.com/ClickHouse/ClickHouse/pull/59462) ([zhongyuankai](https://github.com/zhongyuankai)). * Fix possible uncaught exception during distributed query cancellation [#59487](https://github.com/ClickHouse/ClickHouse/pull/59487) ([Azat Khuzhin](https://github.com/azat)). * Make MAX use the same rules as permutation for complex types [#59498](https://github.com/ClickHouse/ClickHouse/pull/59498) ([Raúl Marín](https://github.com/Algunenano)). diff --git a/docs/changelogs/v24.3.1.2672-lts.md b/docs/changelogs/v24.3.1.2672-lts.md index e5d008680a8..006ab941203 100644 --- a/docs/changelogs/v24.3.1.2672-lts.md +++ b/docs/changelogs/v24.3.1.2672-lts.md @@ -526,7 +526,7 @@ sidebar_label: 2024 * No "please" [#61916](https://github.com/ClickHouse/ClickHouse/pull/61916) ([Alexey Milovidov](https://github.com/alexey-milovidov)). * Update version_date.tsv and changelogs after v23.12.6.19-stable [#61917](https://github.com/ClickHouse/ClickHouse/pull/61917) ([robot-clickhouse](https://github.com/robot-clickhouse)). * Update version_date.tsv and changelogs after v24.1.8.22-stable [#61918](https://github.com/ClickHouse/ClickHouse/pull/61918) ([robot-clickhouse](https://github.com/robot-clickhouse)). -* Fix flaky test_broken_projestions/test.py::test_broken_ignored_replic… [#61932](https://github.com/ClickHouse/ClickHouse/pull/61932) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix flaky test_broken_projestions/test.py::test_broken_ignored_replic... [#61932](https://github.com/ClickHouse/ClickHouse/pull/61932) ([Kseniia Sumarokova](https://github.com/kssenii)). * Check is Rust avaiable for build, if not, suggest a way to disable Rust support [#61938](https://github.com/ClickHouse/ClickHouse/pull/61938) ([Azat Khuzhin](https://github.com/azat)). * CI: new ci menu in PR body [#61948](https://github.com/ClickHouse/ClickHouse/pull/61948) ([Max K.](https://github.com/maxknv)). * Remove flaky test `01193_metadata_loading` [#61961](https://github.com/ClickHouse/ClickHouse/pull/61961) ([Nikita Taranov](https://github.com/nickitat)). 
diff --git a/docs/en/development/style.md b/docs/en/development/style.md index 77a550f2a0e..1444bc0e452 100644 --- a/docs/en/development/style.md +++ b/docs/en/development/style.md @@ -57,7 +57,7 @@ memcpy(&buf[place_value], &x, sizeof(x)); for (size_t i = 0; i < rows; i += storage.index_granularity) ``` -**7.** Add spaces around binary operators (`+`, `-`, `*`, `/`, `%`, …) and the ternary operator `?:`. +**7.** Add spaces around binary operators (`+`, `-`, `*`, `/`, `%`, ...) and the ternary operator `?:`. ``` cpp UInt16 year = (s[0] - '0') * 1000 + (s[1] - '0') * 100 + (s[2] - '0') * 10 + (s[3] - '0'); @@ -86,7 +86,7 @@ dst.ClickGoodEvent = click.GoodEvent; If necessary, the operator can be wrapped to the next line. In this case, the offset in front of it is increased. -**11.** Do not use a space to separate unary operators (`--`, `++`, `*`, `&`, …) from the argument. +**11.** Do not use a space to separate unary operators (`--`, `++`, `*`, `&`, ...) from the argument. **12.** Put a space after a comma, but not before it. The same rule goes for a semicolon inside a `for` expression. @@ -115,7 +115,7 @@ public: **16.** If the same `namespace` is used for the entire file, and there isn’t anything else significant, an offset is not necessary inside `namespace`. -**17.** If the block for an `if`, `for`, `while`, or other expression consists of a single `statement`, the curly brackets are optional. Place the `statement` on a separate line, instead. This rule is also valid for nested `if`, `for`, `while`, … +**17.** If the block for an `if`, `for`, `while`, or other expression consists of a single `statement`, the curly brackets are optional. Place the `statement` on a separate line, instead. This rule is also valid for nested `if`, `for`, `while`, ... But if the inner `statement` contains curly brackets or `else`, the external block should be written in curly brackets. diff --git a/docs/en/engines/table-engines/integrations/hdfs.md b/docs/en/engines/table-engines/integrations/hdfs.md index dbd1c270a4a..2749fa7e479 100644 --- a/docs/en/engines/table-engines/integrations/hdfs.md +++ b/docs/en/engines/table-engines/integrations/hdfs.md @@ -118,7 +118,7 @@ If the listing of files contains number ranges with leading zeros, use the const **Example** -Create table with files named `file000`, `file001`, … , `file999`: +Create table with files named `file000`, `file001`, ... , `file999`: ``` sql CREATE TABLE big_table (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/big_dir/file{0..9}{0..9}{0..9}', 'CSV') diff --git a/docs/en/engines/table-engines/integrations/s3.md b/docs/en/engines/table-engines/integrations/s3.md index dfa06801d04..cb1da1c8e68 100644 --- a/docs/en/engines/table-engines/integrations/s3.md +++ b/docs/en/engines/table-engines/integrations/s3.md @@ -178,7 +178,7 @@ If the listing of files contains number ranges with leading zeros, use the const **Example with wildcards 1** -Create table with files named `file-000.csv`, `file-001.csv`, … , `file-999.csv`: +Create table with files named `file-000.csv`, `file-001.csv`, ... 
, `file-999.csv`: ``` sql CREATE TABLE big_table (name String, value UInt32) diff --git a/docs/en/engines/table-engines/mergetree-family/custom-partitioning-key.md b/docs/en/engines/table-engines/mergetree-family/custom-partitioning-key.md index 23d98d4b20e..eda87fd06c1 100644 --- a/docs/en/engines/table-engines/mergetree-family/custom-partitioning-key.md +++ b/docs/en/engines/table-engines/mergetree-family/custom-partitioning-key.md @@ -71,7 +71,7 @@ WHERE table = 'visits' └───────────┴───────────────────┴────────┘ ``` -The `partition` column contains the names of the partitions. There are two partitions in this example: `201901` and `201902`. You can use this column value to specify the partition name in [ALTER … PARTITION](../../../sql-reference/statements/alter/partition.md) queries. +The `partition` column contains the names of the partitions. There are two partitions in this example: `201901` and `201902`. You can use this column value to specify the partition name in [ALTER ... PARTITION](../../../sql-reference/statements/alter/partition.md) queries. The `name` column contains the names of the partition data parts. You can use this column to specify the name of the part in the [ALTER ATTACH PART](../../../sql-reference/statements/alter/partition.md#alter_attach-partition) query. diff --git a/docs/en/engines/table-engines/mergetree-family/mergetree.md b/docs/en/engines/table-engines/mergetree-family/mergetree.md index 7862eef69f8..a009c4a32f3 100644 --- a/docs/en/engines/table-engines/mergetree-family/mergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/mergetree.md @@ -954,7 +954,7 @@ In the case of `MergeTree` tables, data is getting to disk in different ways: - As a result of an insert (`INSERT` query). - During background merges and [mutations](/docs/en/sql-reference/statements/alter/index.md#alter-mutations). - When downloading from another replica. -- As a result of partition freezing [ALTER TABLE … FREEZE PARTITION](/docs/en/sql-reference/statements/alter/partition.md/#alter_freeze-partition). +- As a result of partition freezing [ALTER TABLE ... FREEZE PARTITION](/docs/en/sql-reference/statements/alter/partition.md/#alter_freeze-partition). In all these cases except for mutations and partition freezing, a part is stored on a volume and a disk according to the given storage policy: @@ -966,7 +966,7 @@ Under the hood, mutations and partition freezing make use of [hard links](https: In the background, parts are moved between volumes on the basis of the amount of free space (`move_factor` parameter) according to the order the volumes are declared in the configuration file. Data is never transferred from the last one and into the first one. One may use system tables [system.part_log](/docs/en/operations/system-tables/part_log.md/#system_tables-part-log) (field `type = MOVE_PART`) and [system.parts](/docs/en/operations/system-tables/parts.md/#system_tables-parts) (fields `path` and `disk`) to monitor background moves. Also, the detailed information can be found in server logs. -User can force moving a part or a partition from one volume to another using the query [ALTER TABLE … MOVE PART\|PARTITION … TO VOLUME\|DISK …](/docs/en/sql-reference/statements/alter/partition.md/#alter_move-partition), all the restrictions for background operations are taken into account. The query initiates a move on its own and does not wait for background operations to be completed. 
User will get an error message if not enough free space is available or if any of the required conditions are not met. +User can force moving a part or a partition from one volume to another using the query [ALTER TABLE ... MOVE PART\|PARTITION ... TO VOLUME\|DISK ...](/docs/en/sql-reference/statements/alter/partition.md/#alter_move-partition), all the restrictions for background operations are taken into account. The query initiates a move on its own and does not wait for background operations to be completed. User will get an error message if not enough free space is available or if any of the required conditions are not met. Moving data does not interfere with data replication. Therefore, different storage policies can be specified for the same table on different replicas. diff --git a/docs/en/engines/table-engines/special/external-data.md b/docs/en/engines/table-engines/special/external-data.md index 7ea3f3e30d6..f6d6dae7eb6 100644 --- a/docs/en/engines/table-engines/special/external-data.md +++ b/docs/en/engines/table-engines/special/external-data.md @@ -29,7 +29,7 @@ Only a single table can be retrieved from stdin. The following parameters are optional: **–name**– Name of the table. If omitted, _data is used. **–format** – Data format in the file. If omitted, TabSeparated is used. -One of the following parameters is required:**–types** – A list of comma-separated column types. For example: `UInt64,String`. The columns will be named _1, _2, … +One of the following parameters is required:**–types** – A list of comma-separated column types. For example: `UInt64,String`. The columns will be named _1, _2, ... **–structure**– The table structure in the format`UserID UInt64`, `URL String`. Defines the column names and types. The files specified in ‘file’ will be parsed by the format specified in ‘format’, using the data types specified in ‘types’ or ‘structure’. The table will be uploaded to the server and accessible there as a temporary table with the name in ‘name’. diff --git a/docs/en/operations/settings/query-complexity.md b/docs/en/operations/settings/query-complexity.md index d86f18ff982..2a20e74e20f 100644 --- a/docs/en/operations/settings/query-complexity.md +++ b/docs/en/operations/settings/query-complexity.md @@ -303,7 +303,7 @@ What to do when the amount of data exceeds one of the limits: ‘throw’ or ‘ Limits the number of rows in the hash table that is used when joining tables. -This settings applies to [SELECT … JOIN](../../sql-reference/statements/select/join.md#select-join) operations and the [Join](../../engines/table-engines/special/join.md) table engine. +This settings applies to [SELECT ... JOIN](../../sql-reference/statements/select/join.md#select-join) operations and the [Join](../../engines/table-engines/special/join.md) table engine. If a query contains multiple joins, ClickHouse checks this setting for every intermediate result. @@ -320,7 +320,7 @@ Default value: 0. Limits the size in bytes of the hash table used when joining tables. -This setting applies to [SELECT … JOIN](../../sql-reference/statements/select/join.md#select-join) operations and [Join table engine](../../engines/table-engines/special/join.md). +This setting applies to [SELECT ... JOIN](../../sql-reference/statements/select/join.md#select-join) operations and [Join table engine](../../engines/table-engines/special/join.md). If the query contains joins, ClickHouse checks this setting for every intermediate result. 
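The query-complexity hunk above touches the two join-limit settings; as a minimal illustrative sketch of how they are typically applied per query (the table names below are hypothetical, only the setting names and the throw/break overflow behaviour come from the surrounding text):

```sql
-- Illustrative only: cap the hash table built for the right-hand side of the join
-- and return a partial result instead of an exception when a cap is reached.
SELECT l.id, r.value
FROM left_table AS l
INNER JOIN right_table AS r ON l.id = r.id
SETTINGS
    max_rows_in_join = 1000000,
    max_bytes_in_join = 104857600,  -- 100 MiB
    join_overflow_mode = 'break';
```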
diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 91b544c6a82..2b5cd11819a 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -2248,7 +2248,7 @@ Default value: 0. ## count_distinct_implementation {#count_distinct_implementation} -Specifies which of the `uniq*` functions should be used to perform the [COUNT(DISTINCT …)](../../sql-reference/aggregate-functions/reference/count.md/#agg_function-count) construction. +Specifies which of the `uniq*` functions should be used to perform the [COUNT(DISTINCT ...)](../../sql-reference/aggregate-functions/reference/count.md/#agg_function-count) construction. Possible values: diff --git a/docs/en/sql-reference/aggregate-functions/parametric-functions.md b/docs/en/sql-reference/aggregate-functions/parametric-functions.md index 8981ac1f752..1dc89b8dcf9 100644 --- a/docs/en/sql-reference/aggregate-functions/parametric-functions.md +++ b/docs/en/sql-reference/aggregate-functions/parametric-functions.md @@ -82,7 +82,7 @@ FROM In this case, you should remember that you do not know the histogram bin borders. -## sequenceMatch(pattern)(timestamp, cond1, cond2, …) +## sequenceMatch(pattern)(timestamp, cond1, cond2, ...) Checks whether the sequence contains an event chain that matches the pattern. @@ -172,7 +172,7 @@ SELECT sequenceMatch('(?1)(?2)')(time, number = 1, number = 2, number = 4) FROM - [sequenceCount](#function-sequencecount) -## sequenceCount(pattern)(time, cond1, cond2, …) +## sequenceCount(pattern)(time, cond1, cond2, ...) Counts the number of event chains that matched the pattern. The function searches event chains that do not overlap. It starts to search for the next chain after the current chain is matched. diff --git a/docs/en/sql-reference/aggregate-functions/reference/quantiles.md b/docs/en/sql-reference/aggregate-functions/reference/quantiles.md index e2a5bc53e32..856d447ac13 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/quantiles.md +++ b/docs/en/sql-reference/aggregate-functions/reference/quantiles.md @@ -7,7 +7,7 @@ sidebar_position: 201 ## quantiles -Syntax: `quantiles(level1, level2, …)(x)` +Syntax: `quantiles(level1, level2, ...)(x)` All the quantile functions also have corresponding quantiles functions: `quantiles`, `quantilesDeterministic`, `quantilesTiming`, `quantilesTimingWeighted`, `quantilesExact`, `quantilesExactWeighted`, `quantileInterpolatedWeighted`, `quantilesTDigest`, `quantilesBFloat16`, `quantilesDD`. These functions calculate all the quantiles of the listed levels in one pass, and return an array of the resulting values. diff --git a/docs/en/sql-reference/data-types/aggregatefunction.md b/docs/en/sql-reference/data-types/aggregatefunction.md index 87511a505dc..37f0d0e50ae 100644 --- a/docs/en/sql-reference/data-types/aggregatefunction.md +++ b/docs/en/sql-reference/data-types/aggregatefunction.md @@ -6,9 +6,9 @@ sidebar_label: AggregateFunction # AggregateFunction -Aggregate functions can have an implementation-defined intermediate state that can be serialized to an `AggregateFunction(…)` data type and stored in a table, usually, by means of [a materialized view](../../sql-reference/statements/create/view.md). The common way to produce an aggregate function state is by calling the aggregate function with the `-State` suffix. To get the final result of aggregation in the future, you must use the same aggregate function with the `-Merge`suffix. 
+Aggregate functions can have an implementation-defined intermediate state that can be serialized to an `AggregateFunction(...)` data type and stored in a table, usually, by means of [a materialized view](../../sql-reference/statements/create/view.md). The common way to produce an aggregate function state is by calling the aggregate function with the `-State` suffix. To get the final result of aggregation in the future, you must use the same aggregate function with the `-Merge`suffix. -`AggregateFunction(name, types_of_arguments…)` — parametric data type. +`AggregateFunction(name, types_of_arguments...)` — parametric data type. **Parameters** diff --git a/docs/en/sql-reference/data-types/fixedstring.md b/docs/en/sql-reference/data-types/fixedstring.md index 0316df7fe34..0c021b28f74 100644 --- a/docs/en/sql-reference/data-types/fixedstring.md +++ b/docs/en/sql-reference/data-types/fixedstring.md @@ -21,8 +21,8 @@ The `FixedString` type is efficient when data has the length of precisely `N` by Examples of the values that can be efficiently stored in `FixedString`-typed columns: - The binary representation of IP addresses (`FixedString(16)` for IPv6). -- Language codes (ru_RU, en_US … ). -- Currency codes (USD, RUB … ). +- Language codes (ru_RU, en_US ... ). +- Currency codes (USD, RUB ... ). - Binary representation of hashes (`FixedString(16)` for MD5, `FixedString(32)` for SHA256). To store UUID values, use the [UUID](../../sql-reference/data-types/uuid.md) data type. diff --git a/docs/en/sql-reference/data-types/nested-data-structures/index.md b/docs/en/sql-reference/data-types/nested-data-structures/index.md index d118170cd39..579ee9bfa8b 100644 --- a/docs/en/sql-reference/data-types/nested-data-structures/index.md +++ b/docs/en/sql-reference/data-types/nested-data-structures/index.md @@ -6,7 +6,7 @@ sidebar_label: Nested(Name1 Type1, Name2 Type2, ...) # Nested -## Nested(name1 Type1, Name2 Type2, …) +## Nested(name1 Type1, Name2 Type2, ...) A nested data structure is like a table inside a cell. The parameters of a nested data structure – the column names and types – are specified the same way as in a [CREATE TABLE](../../../sql-reference/statements/create/table.md) query. Each table row can correspond to any number of rows in a nested data structure. diff --git a/docs/en/sql-reference/data-types/simpleaggregatefunction.md b/docs/en/sql-reference/data-types/simpleaggregatefunction.md index 39f8409c1e1..4fb74ac30e4 100644 --- a/docs/en/sql-reference/data-types/simpleaggregatefunction.md +++ b/docs/en/sql-reference/data-types/simpleaggregatefunction.md @@ -5,7 +5,7 @@ sidebar_label: SimpleAggregateFunction --- # SimpleAggregateFunction -`SimpleAggregateFunction(name, types_of_arguments…)` data type stores current value of the aggregate function, and does not store its full state as [`AggregateFunction`](../../sql-reference/data-types/aggregatefunction.md) does. This optimization can be applied to functions for which the following property holds: the result of applying a function `f` to a row set `S1 UNION ALL S2` can be obtained by applying `f` to parts of the row set separately, and then again applying `f` to the results: `f(S1 UNION ALL S2) = f(f(S1) UNION ALL f(S2))`. This property guarantees that partial aggregation results are enough to compute the combined one, so we do not have to store and process any extra data. 
+`SimpleAggregateFunction(name, types_of_arguments...)` data type stores current value of the aggregate function, and does not store its full state as [`AggregateFunction`](../../sql-reference/data-types/aggregatefunction.md) does. This optimization can be applied to functions for which the following property holds: the result of applying a function `f` to a row set `S1 UNION ALL S2` can be obtained by applying `f` to parts of the row set separately, and then again applying `f` to the results: `f(S1 UNION ALL S2) = f(f(S1) UNION ALL f(S2))`. This property guarantees that partial aggregation results are enough to compute the combined one, so we do not have to store and process any extra data. The common way to produce an aggregate function value is by calling the aggregate function with the [-SimpleState](../../sql-reference/aggregate-functions/combinators.md#agg-functions-combinator-simplestate) suffix. diff --git a/docs/en/sql-reference/functions/arithmetic-functions.md b/docs/en/sql-reference/functions/arithmetic-functions.md index 6d95f3dc358..8b8527acfdf 100644 --- a/docs/en/sql-reference/functions/arithmetic-functions.md +++ b/docs/en/sql-reference/functions/arithmetic-functions.md @@ -140,6 +140,60 @@ Same as `intDiv` but returns zero when dividing by zero or when dividing a minim intDivOrZero(a, b) ``` +## isFinite + +Returns 1 if the Float32 or Float64 argument not infinite and not a NaN, otherwise this function returns 0. + +**Syntax** + +```sql +isFinite(x) +``` + +## isInfinite + +Returns 1 if the Float32 or Float64 argument is infinite, otherwise this function returns 0. Note that 0 is returned for a NaN. + +**Syntax** + +```sql +isInfinite(x) +``` + +## ifNotFinite + +Checks whether a floating point value is finite. + +**Syntax** + +```sql +ifNotFinite(x,y) +``` + +**Arguments** + +- `x` — Value to check for infinity. [Float\*](../../sql-reference/data-types/float.md). +- `y` — Fallback value. [Float\*](../../sql-reference/data-types/float.md). + +**Returned value** + +- `x` if `x` is finite. +- `y` if `x` is not finite. + +**Example** + +Query: + + SELECT 1/0 as infimum, ifNotFinite(infimum,42) + +Result: + + ┌─infimum─┬─ifNotFinite(divide(1, 0), 42)─┐ + │ inf │ 42 │ + └─────────┴───────────────────────────────┘ + +You can get similar result by using the [ternary operator](../../sql-reference/functions/conditional-functions.md#ternary-operator): `isFinite(x) ? x : y`. + ## modulo Calculates the remainder of the division of two values `a` by `b`. diff --git a/docs/en/sql-reference/functions/array-functions.md b/docs/en/sql-reference/functions/array-functions.md index 87e733a4b0c..f929ea00b8b 100644 --- a/docs/en/sql-reference/functions/array-functions.md +++ b/docs/en/sql-reference/functions/array-functions.md @@ -561,7 +561,7 @@ Result: └─────────────┴─────────────┴────────────────┴─────────────────┘ ``` -## array(x1, …), operator \[x1, …\] +## array(x1, ...), operator \[x1, ...\] Creates an array from the function arguments. The arguments must be constants and have types that have the smallest common type. At least one argument must be passed, because otherwise it isn’t clear which type of array to create. That is, you can’t use this function to create an empty array (to do that, use the ‘emptyArray\*’ function described above). @@ -768,9 +768,9 @@ SELECT indexOf([1, 3, NULL, NULL], NULL) Elements set to `NULL` are handled as normal values. -## arrayCount(\[func,\] arr1, …) +## arrayCount(\[func,\] arr1, ...) 
-Returns the number of elements for which `func(arr1[i], …, arrN[i])` returns something other than 0. If `func` is not specified, it returns the number of non-zero elements in the array. +Returns the number of elements for which `func(arr1[i], ..., arrN[i])` returns something other than 0. If `func` is not specified, it returns the number of non-zero elements in the array. Note that the `arrayCount` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You can pass a lambda function to it as the first argument. @@ -847,7 +847,7 @@ SELECT countEqual([1, 2, NULL, NULL], NULL) ## arrayEnumerate(arr) -Returns the array \[1, 2, 3, …, length (arr) \] +Returns the array \[1, 2, 3, ..., length (arr) \] This function is normally used with ARRAY JOIN. It allows counting something just once for each array after applying ARRAY JOIN. Example: @@ -887,7 +887,7 @@ WHERE (CounterID = 160656) AND notEmpty(GoalsReached) This function can also be used in higher-order functions. For example, you can use it to get array indexes for elements that match a condition. -## arrayEnumerateUniq(arr, …) +## arrayEnumerateUniq(arr, ...) Returns an array the same size as the source array, indicating for each element what its position is among elements with the same value. For example: arrayEnumerateUniq(\[10, 20, 10, 30\]) = \[1, 1, 2, 1\]. @@ -1206,7 +1206,7 @@ Result: └───────────────────┘ ``` -## arraySort(\[func,\] arr, …) {#sort} +## arraySort(\[func,\] arr, ...) {#sort} Sorts the elements of the `arr` array in ascending order. If the `func` function is specified, sorting order is determined by the result of the `func` function applied to the elements of the array. If `func` accepts multiple arguments, the `arraySort` function is passed several arrays that the arguments of `func` will correspond to. Detailed examples are shown at the end of `arraySort` description. @@ -1307,11 +1307,11 @@ SELECT arraySort((x, y) -> -y, [0, 1, 2], [1, 2, 3]) as res; To improve sorting efficiency, the [Schwartzian transform](https://en.wikipedia.org/wiki/Schwartzian_transform) is used. ::: -## arrayPartialSort(\[func,\] limit, arr, …) +## arrayPartialSort(\[func,\] limit, arr, ...) Same as `arraySort` with additional `limit` argument allowing partial sorting. Returns an array of the same size as the original array where elements in range `[1..limit]` are sorted in ascending order. Remaining elements `(limit..N]` shall contain elements in unspecified order. -## arrayReverseSort(\[func,\] arr, …) {#reverse-sort} +## arrayReverseSort(\[func,\] arr, ...) {#reverse-sort} Sorts the elements of the `arr` array in descending order. If the `func` function is specified, `arr` is sorted according to the result of the `func` function applied to the elements of the array, and then the sorted array is reversed. If `func` accepts multiple arguments, the `arrayReverseSort` function is passed several arrays that the arguments of `func` will correspond to. Detailed examples are shown at the end of `arrayReverseSort` description. @@ -1412,7 +1412,7 @@ SELECT arrayReverseSort((x, y) -> -y, [4, 3, 5], [1, 2, 3]) AS res; └─────────┘ ``` -## arrayPartialReverseSort(\[func,\] limit, arr, …) +## arrayPartialReverseSort(\[func,\] limit, arr, ...) Same as `arrayReverseSort` with additional `limit` argument allowing partial sorting. Returns an array of the same size as the original array where elements in range `[1..limit]` are sorted in descending order. 
Remaining elements `(limit..N]` shall contain elements in unspecified order. @@ -1535,7 +1535,7 @@ Result: [3,9,1,4,5,6,7,8,2,10] ``` -## arrayUniq(arr, …) +## arrayUniq(arr, ...) If one argument is passed, it counts the number of different elements in the array. If multiple arguments are passed, it counts the number of different tuples of elements at corresponding positions in multiple arrays. @@ -2079,9 +2079,9 @@ Result: └───────────────────────────────────────────────┘ ``` -## arrayMap(func, arr1, …) +## arrayMap(func, arr1, ...) -Returns an array obtained from the original arrays by application of `func(arr1[i], …, arrN[i])` for each element. Arrays `arr1` … `arrN` must have the same number of elements. +Returns an array obtained from the original arrays by application of `func(arr1[i], ..., arrN[i])` for each element. Arrays `arr1` ... `arrN` must have the same number of elements. Examples: @@ -2109,9 +2109,9 @@ SELECT arrayMap((x, y) -> (x, y), [1, 2, 3], [4, 5, 6]) AS res Note that the `arrayMap` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You must pass a lambda function to it as the first argument, and it can’t be omitted. -## arrayFilter(func, arr1, …) +## arrayFilter(func, arr1, ...) -Returns an array containing only the elements in `arr1` for which `func(arr1[i], …, arrN[i])` returns something other than 0. +Returns an array containing only the elements in `arr1` for which `func(arr1[i], ..., arrN[i])` returns something other than 0. Examples: @@ -2142,9 +2142,9 @@ SELECT Note that the `arrayFilter` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You must pass a lambda function to it as the first argument, and it can’t be omitted. -## arrayFill(func, arr1, …) +## arrayFill(func, arr1, ...) -Scan through `arr1` from the first element to the last element and replace `arr1[i]` by `arr1[i - 1]` if `func(arr1[i], …, arrN[i])` returns 0. The first element of `arr1` will not be replaced. +Scan through `arr1` from the first element to the last element and replace `arr1[i]` by `arr1[i - 1]` if `func(arr1[i], ..., arrN[i])` returns 0. The first element of `arr1` will not be replaced. Examples: @@ -2160,9 +2160,9 @@ SELECT arrayFill(x -> not isNull(x), [1, null, 3, 11, 12, null, null, 5, 6, 14, Note that the `arrayFill` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You must pass a lambda function to it as the first argument, and it can’t be omitted. -## arrayReverseFill(func, arr1, …) +## arrayReverseFill(func, arr1, ...) -Scan through `arr1` from the last element to the first element and replace `arr1[i]` by `arr1[i + 1]` if `func(arr1[i], …, arrN[i])` returns 0. The last element of `arr1` will not be replaced. +Scan through `arr1` from the last element to the first element and replace `arr1[i]` by `arr1[i + 1]` if `func(arr1[i], ..., arrN[i])` returns 0. The last element of `arr1` will not be replaced. Examples: @@ -2178,9 +2178,9 @@ SELECT arrayReverseFill(x -> not isNull(x), [1, null, 3, 11, 12, null, null, 5, Note that the `arrayReverseFill` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You must pass a lambda function to it as the first argument, and it can’t be omitted. -## arraySplit(func, arr1, …) +## arraySplit(func, arr1, ...) -Split `arr1` into multiple arrays. When `func(arr1[i], …, arrN[i])` returns something other than 0, the array will be split on the left hand side of the element. 
The array will not be split before the first element. +Split `arr1` into multiple arrays. When `func(arr1[i], ..., arrN[i])` returns something other than 0, the array will be split on the left hand side of the element. The array will not be split before the first element. Examples: @@ -2196,9 +2196,9 @@ SELECT arraySplit((x, y) -> y, [1, 2, 3, 4, 5], [1, 0, 0, 1, 0]) AS res Note that the `arraySplit` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You must pass a lambda function to it as the first argument, and it can’t be omitted. -## arrayReverseSplit(func, arr1, …) +## arrayReverseSplit(func, arr1, ...) -Split `arr1` into multiple arrays. When `func(arr1[i], …, arrN[i])` returns something other than 0, the array will be split on the right hand side of the element. The array will not be split after the last element. +Split `arr1` into multiple arrays. When `func(arr1[i], ..., arrN[i])` returns something other than 0, the array will be split on the right hand side of the element. The array will not be split after the last element. Examples: @@ -2214,30 +2214,30 @@ SELECT arrayReverseSplit((x, y) -> y, [1, 2, 3, 4, 5], [1, 0, 0, 1, 0]) AS res Note that the `arrayReverseSplit` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You must pass a lambda function to it as the first argument, and it can’t be omitted. -## arrayExists(\[func,\] arr1, …) +## arrayExists(\[func,\] arr1, ...) -Returns 1 if there is at least one element in `arr` for which `func(arr1[i], …, arrN[i])` returns something other than 0. Otherwise, it returns 0. +Returns 1 if there is at least one element in `arr` for which `func(arr1[i], ..., arrN[i])` returns something other than 0. Otherwise, it returns 0. Note that the `arrayExists` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You can pass a lambda function to it as the first argument. -## arrayAll(\[func,\] arr1, …) +## arrayAll(\[func,\] arr1, ...) -Returns 1 if `func(arr1[i], …, arrN[i])` returns something other than 0 for all the elements in arrays. Otherwise, it returns 0. +Returns 1 if `func(arr1[i], ..., arrN[i])` returns something other than 0 for all the elements in arrays. Otherwise, it returns 0. Note that the `arrayAll` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You can pass a lambda function to it as the first argument. -## arrayFirst(func, arr1, …) +## arrayFirst(func, arr1, ...) -Returns the first element in the `arr1` array for which `func(arr1[i], …, arrN[i])` returns something other than 0. +Returns the first element in the `arr1` array for which `func(arr1[i], ..., arrN[i])` returns something other than 0. ## arrayFirstOrNull -Returns the first element in the `arr1` array for which `func(arr1[i], …, arrN[i])` returns something other than 0, otherwise it returns `NULL`. +Returns the first element in the `arr1` array for which `func(arr1[i], ..., arrN[i])` returns something other than 0, otherwise it returns `NULL`. **Syntax** ```sql -arrayFirstOrNull(func, arr1, …) +arrayFirstOrNull(func, arr1, ...) ``` **Parameters** @@ -2292,20 +2292,20 @@ Result: \N ``` -## arrayLast(func, arr1, …) +## arrayLast(func, arr1, ...) -Returns the last element in the `arr1` array for which `func(arr1[i], …, arrN[i])` returns something other than 0. +Returns the last element in the `arr1` array for which `func(arr1[i], ..., arrN[i])` returns something other than 0. 
Note that the `arrayLast` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You must pass a lambda function to it as the first argument, and it can’t be omitted. ## arrayLastOrNull -Returns the last element in the `arr1` array for which `func(arr1[i], …, arrN[i])` returns something other than 0, otherwise returns `NULL`. +Returns the last element in the `arr1` array for which `func(arr1[i], ..., arrN[i])` returns something other than 0, otherwise returns `NULL`. **Syntax** ```sql -arrayLastOrNull(func, arr1, …) +arrayLastOrNull(func, arr1, ...) ``` **Parameters** @@ -2348,15 +2348,15 @@ Result: \N ``` -## arrayFirstIndex(func, arr1, …) +## arrayFirstIndex(func, arr1, ...) -Returns the index of the first element in the `arr1` array for which `func(arr1[i], …, arrN[i])` returns something other than 0. +Returns the index of the first element in the `arr1` array for which `func(arr1[i], ..., arrN[i])` returns something other than 0. Note that the `arrayFirstIndex` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You must pass a lambda function to it as the first argument, and it can’t be omitted. -## arrayLastIndex(func, arr1, …) +## arrayLastIndex(func, arr1, ...) -Returns the index of the last element in the `arr1` array for which `func(arr1[i], …, arrN[i])` returns something other than 0. +Returns the index of the last element in the `arr1` array for which `func(arr1[i], ..., arrN[i])` returns something other than 0. Note that the `arrayLastIndex` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You must pass a lambda function to it as the first argument, and it can’t be omitted. @@ -2580,9 +2580,9 @@ Result: └─────┘ ``` -## arrayCumSum(\[func,\] arr1, …) +## arrayCumSum(\[func,\] arr1, ...) -Returns an array of the partial (running) sums of the elements in the source array `arr1`. If `func` is specified, then the sum is computed from applying `func` to `arr1`, `arr2`, ..., `arrN`, i.e. `func(arr1[i], …, arrN[i])`. +Returns an array of the partial (running) sums of the elements in the source array `arr1`. If `func` is specified, then the sum is computed from applying `func` to `arr1`, `arr2`, ..., `arrN`, i.e. `func(arr1[i], ..., arrN[i])`. **Syntax** @@ -2614,9 +2614,9 @@ SELECT arrayCumSum([1, 1, 1, 1]) AS res Note that the `arrayCumSum` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You can pass a lambda function to it as the first argument. -## arrayCumSumNonNegative(\[func,\] arr1, …) +## arrayCumSumNonNegative(\[func,\] arr1, ...) -Same as `arrayCumSum`, returns an array of the partial (running) sums of the elements in the source array. If `func` is specified, then the sum is computed from applying `func` to `arr1`, `arr2`, ..., `arrN`, i.e. `func(arr1[i], …, arrN[i])`. Unlike `arrayCumSum`, if the current running sum is smaller than `0`, it is replaced by `0`. +Same as `arrayCumSum`, returns an array of the partial (running) sums of the elements in the source array. If `func` is specified, then the sum is computed from applying `func` to `arr1`, `arr2`, ..., `arrN`, i.e. `func(arr1[i], ..., arrN[i])`. Unlike `arrayCumSum`, if the current running sum is smaller than `0`, it is replaced by `0`. 
**Syntax** diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index 843f22e5a6f..1a56691ffc0 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -1499,7 +1499,7 @@ This function returns the week number for date or datetime. The two-argument for The following table describes how the mode argument works. -| Mode | First day of week | Range | Week 1 is the first week … | +| Mode | First day of week | Range | Week 1 is the first week ... | |------|-------------------|-------|-------------------------------| | 0 | Sunday | 0-53 | with a Sunday in this year | | 1 | Monday | 0-53 | with 4 or more days this year | diff --git a/docs/en/sql-reference/functions/json-functions.md b/docs/en/sql-reference/functions/json-functions.md index e920ab82988..ba72b3cc6ed 100644 --- a/docs/en/sql-reference/functions/json-functions.md +++ b/docs/en/sql-reference/functions/json-functions.md @@ -386,7 +386,7 @@ SELECT isValidJSON('{"a": "hello", "b": [-100, 200.0, 300]}') = 1 SELECT isValidJSON('not a json') = 0 ``` -## JSONHas(json\[, indices_or_keys\]…) +## JSONHas(json\[, indices_or_keys\]...) If the value exists in the JSON document, `1` will be returned. @@ -419,7 +419,7 @@ SELECT JSONExtractKey('{"a": "hello", "b": [-100, 200.0, 300]}', -2) = 'a' SELECT JSONExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', 1) = 'hello' ``` -## JSONLength(json\[, indices_or_keys\]…) +## JSONLength(json\[, indices_or_keys\]...) Return the length of a JSON array or a JSON object. @@ -432,7 +432,7 @@ SELECT JSONLength('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 3 SELECT JSONLength('{"a": "hello", "b": [-100, 200.0, 300]}') = 2 ``` -## JSONType(json\[, indices_or_keys\]…) +## JSONType(json\[, indices_or_keys\]...) Return the type of a JSON value. @@ -446,13 +446,13 @@ SELECT JSONType('{"a": "hello", "b": [-100, 200.0, 300]}', 'a') = 'String' SELECT JSONType('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 'Array' ``` -## JSONExtractUInt(json\[, indices_or_keys\]…) +## JSONExtractUInt(json\[, indices_or_keys\]...) -## JSONExtractInt(json\[, indices_or_keys\]…) +## JSONExtractInt(json\[, indices_or_keys\]...) -## JSONExtractFloat(json\[, indices_or_keys\]…) +## JSONExtractFloat(json\[, indices_or_keys\]...) -## JSONExtractBool(json\[, indices_or_keys\]…) +## JSONExtractBool(json\[, indices_or_keys\]...) Parses a JSON and extract a value. These functions are similar to `visitParam` functions. @@ -466,7 +466,7 @@ SELECT JSONExtractFloat('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 2) = 200 SELECT JSONExtractUInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', -1) = 300 ``` -## JSONExtractString(json\[, indices_or_keys\]…) +## JSONExtractString(json\[, indices_or_keys\]...) Parses a JSON and extract a string. This function is similar to `visitParamExtractString` functions. @@ -484,7 +484,7 @@ SELECT JSONExtractString('{"abc":"\\u263"}', 'abc') = '' SELECT JSONExtractString('{"abc":"hello}', 'abc') = '' ``` -## JSONExtract(json\[, indices_or_keys…\], Return_type) +## JSONExtract(json\[, indices_or_keys...\], Return_type) Parses a JSON and extract a value of the given ClickHouse data type. 
@@ -506,7 +506,7 @@ SELECT JSONExtract('{"day": "Thursday"}', 'day', 'Enum8(\'Sunday\' = 0, \'Monday SELECT JSONExtract('{"day": 5}', 'day', 'Enum8(\'Sunday\' = 0, \'Monday\' = 1, \'Tuesday\' = 2, \'Wednesday\' = 3, \'Thursday\' = 4, \'Friday\' = 5, \'Saturday\' = 6)') = 'Friday' ``` -## JSONExtractKeysAndValues(json\[, indices_or_keys…\], Value_type) +## JSONExtractKeysAndValues(json\[, indices_or_keys...\], Value_type) Parses key-value pairs from a JSON where the values are of the given ClickHouse data type. @@ -554,7 +554,7 @@ text └────────────────────────────────────────────────────────────┘ ``` -## JSONExtractRaw(json\[, indices_or_keys\]…) +## JSONExtractRaw(json\[, indices_or_keys\]...) Returns a part of JSON as unparsed string. @@ -566,7 +566,7 @@ Example: SELECT JSONExtractRaw('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = '[-100, 200.0, 300]'; ``` -## JSONExtractArrayRaw(json\[, indices_or_keys…\]) +## JSONExtractArrayRaw(json\[, indices_or_keys...\]) Returns an array with elements of JSON array, each represented as unparsed string. diff --git a/docs/en/sql-reference/functions/other-functions.md b/docs/en/sql-reference/functions/other-functions.md index 5b77f16027b..4501d1f43d3 100644 --- a/docs/en/sql-reference/functions/other-functions.md +++ b/docs/en/sql-reference/functions/other-functions.md @@ -172,7 +172,7 @@ Result: ## visibleWidth Calculates the approximate width when outputting values to the console in text format (tab-separated). -This function is used by the system to implement [Pretty formats](../formats.mdx). +This function is used by the system to implement [Pretty formats](../../interfaces/formats.md). `NULL` is represented as a string corresponding to `NULL` in `Pretty` formats. @@ -335,7 +335,7 @@ The argument is internally still evaluated. Useful e.g. for benchmarks. **Syntax** ```sql -ignore(…) +ignore(x) ``` ## sleep @@ -541,60 +541,6 @@ Result: └────────────────────┘ ``` -## isFinite - -Returns 1 if the Float32 or Float64 argument not infinite and not a NaN, otherwise this function returns 0. - -**Syntax** - -```sql -isFinite(x) -``` - -## isInfinite - -Returns 1 if the Float32 or Float64 argument is infinite, otherwise this function returns 0. Note that 0 is returned for a NaN. - -**Syntax** - -```sql -isInfinite(x) -``` - -## ifNotFinite - -Checks whether a floating point value is finite. - -**Syntax** - -```sql -ifNotFinite(x,y) -``` - -**Arguments** - -- `x` — Value to check for infinity. [Float\*](../../sql-reference/data-types/float.md). -- `y` — Fallback value. [Float\*](../../sql-reference/data-types/float.md). - -**Returned value** - -- `x` if `x` is finite. -- `y` if `x` is not finite. - -**Example** - -Query: - - SELECT 1/0 as infimum, ifNotFinite(infimum,42) - -Result: - - ┌─infimum─┬─ifNotFinite(divide(1, 0), 42)─┐ - │ inf │ 42 │ - └─────────┴───────────────────────────────┘ - -You can get similar result by using the [ternary operator](../../sql-reference/functions/conditional-functions.md#ternary-operator): `isFinite(x) ? x : y`. - ## isNaN Returns 1 if the Float32 and Float64 argument is NaN, otherwise this function 0. @@ -2303,7 +2249,7 @@ Accepts a path to a catboost model and model arguments (features). Returns Float **Syntax** ```sql -catboostEvaluate(path_to_model, feature_1, feature_2, …, feature_n) +catboostEvaluate(path_to_model, feature_1, feature_2, ..., feature_n) ``` **Example** @@ -2351,7 +2297,7 @@ Throw an exception if argument `x` is true. 
**Syntax** ```sql -throwIf(x\[, message\[, error_code\]\]) +throwIf(x[, message[, error_code]]) ``` **Arguments** diff --git a/docs/en/sql-reference/functions/string-replace-functions.md b/docs/en/sql-reference/functions/string-replace-functions.md index 0b761b62006..0e183626555 100644 --- a/docs/en/sql-reference/functions/string-replace-functions.md +++ b/docs/en/sql-reference/functions/string-replace-functions.md @@ -139,7 +139,7 @@ Format the `pattern` string with the values (strings, integers, etc.) listed in **Syntax** ```sql -format(pattern, s0, s1, …) +format(pattern, s0, s1, ...) ``` **Example** diff --git a/docs/en/sql-reference/functions/string-search-functions.md b/docs/en/sql-reference/functions/string-search-functions.md index 9738c19bf3c..a6eb4a4ceff 100644 --- a/docs/en/sql-reference/functions/string-search-functions.md +++ b/docs/en/sql-reference/functions/string-search-functions.md @@ -799,7 +799,7 @@ If you only want to search multiple substrings in a string, you can use function **Syntax** ```sql -multiMatchAny(haystack, \[pattern1, pattern2, …, patternn\]) +multiMatchAny(haystack, \[pattern1, pattern2, ..., patternn\]) ``` ## multiMatchAnyIndex @@ -809,7 +809,7 @@ Like `multiMatchAny` but returns any index that matches the haystack. **Syntax** ```sql -multiMatchAnyIndex(haystack, \[pattern1, pattern2, …, patternn\]) +multiMatchAnyIndex(haystack, \[pattern1, pattern2, ..., patternn\]) ``` ## multiMatchAllIndices @@ -819,7 +819,7 @@ Like `multiMatchAny` but returns the array of all indices that match the haystac **Syntax** ```sql -multiMatchAllIndices(haystack, \[pattern1, pattern2, …, patternn\]) +multiMatchAllIndices(haystack, \[pattern1, pattern2, ..., patternn\]) ``` ## multiFuzzyMatchAny @@ -833,7 +833,7 @@ Like `multiMatchAny` but returns 1 if any pattern matches the haystack within a **Syntax** ```sql -multiFuzzyMatchAny(haystack, distance, \[pattern1, pattern2, …, patternn\]) +multiFuzzyMatchAny(haystack, distance, \[pattern1, pattern2, ..., patternn\]) ``` ## multiFuzzyMatchAnyIndex @@ -843,7 +843,7 @@ Like `multiFuzzyMatchAny` but returns any index that matches the haystack within **Syntax** ```sql -multiFuzzyMatchAnyIndex(haystack, distance, \[pattern1, pattern2, …, patternn\]) +multiFuzzyMatchAnyIndex(haystack, distance, \[pattern1, pattern2, ..., patternn\]) ``` ## multiFuzzyMatchAllIndices @@ -853,7 +853,7 @@ Like `multiFuzzyMatchAny` but returns the array of all indices in any order that **Syntax** ```sql -multiFuzzyMatchAllIndices(haystack, distance, \[pattern1, pattern2, …, patternn\]) +multiFuzzyMatchAllIndices(haystack, distance, \[pattern1, pattern2, ..., patternn\]) ``` ## extract diff --git a/docs/en/sql-reference/functions/tuple-functions.md b/docs/en/sql-reference/functions/tuple-functions.md index 64b1732597f..c2219bb3f90 100644 --- a/docs/en/sql-reference/functions/tuple-functions.md +++ b/docs/en/sql-reference/functions/tuple-functions.md @@ -7,15 +7,15 @@ sidebar_label: Tuples ## tuple A function that allows grouping multiple columns. -For columns with the types T1, T2, …, it returns a Tuple(T1, T2, …) type tuple containing these columns. There is no cost to execute the function. +For columns with the types T1, T2, ..., it returns a Tuple(T1, T2, ...) type tuple containing these columns. There is no cost to execute the function. Tuples are normally used as intermediate values for an argument of IN operators, or for creating a list of formal parameters of lambda functions. Tuples can’t be written to a table. 
-The function implements the operator `(x, y, …)`. +The function implements the operator `(x, y, ...)`. **Syntax** ``` sql -tuple(x, y, …) +tuple(x, y, ...) ``` ## tupleElement diff --git a/docs/en/sql-reference/functions/tuple-map-functions.md b/docs/en/sql-reference/functions/tuple-map-functions.md index 377283bc006..6386b4d5b1d 100644 --- a/docs/en/sql-reference/functions/tuple-map-functions.md +++ b/docs/en/sql-reference/functions/tuple-map-functions.md @@ -589,7 +589,7 @@ mapApply(func, map) **Returned value** -- Returns a map obtained from the original map by application of `func(map1[i], …, mapN[i])` for each element. +- Returns a map obtained from the original map by application of `func(map1[i], ..., mapN[i])` for each element. **Example** @@ -629,7 +629,7 @@ mapFilter(func, map) **Returned value** -- Returns a map containing only the elements in `map` for which `func(map1[i], …, mapN[i])` returns something other than 0. +- Returns a map containing only the elements in `map` for which `func(map1[i], ..., mapN[i])` returns something other than 0. **Example** diff --git a/docs/en/sql-reference/functions/url-functions.md b/docs/en/sql-reference/functions/url-functions.md index a0b0170721c..6da82e689a9 100644 --- a/docs/en/sql-reference/functions/url-functions.md +++ b/docs/en/sql-reference/functions/url-functions.md @@ -16,7 +16,7 @@ If the relevant part isn’t present in a URL, an empty string is returned. Extracts the protocol from a URL. -Examples of typical returned values: http, https, ftp, mailto, tel, magnet… +Examples of typical returned values: http, https, ftp, mailto, tel, magnet... ### domain diff --git a/docs/en/sql-reference/statements/alter/comment.md b/docs/en/sql-reference/statements/alter/comment.md index f6fb179d969..320828f0de9 100644 --- a/docs/en/sql-reference/statements/alter/comment.md +++ b/docs/en/sql-reference/statements/alter/comment.md @@ -4,7 +4,7 @@ sidebar_position: 51 sidebar_label: COMMENT --- -# ALTER TABLE … MODIFY COMMENT +# ALTER TABLE ... MODIFY COMMENT Adds, modifies, or removes comment to the table, regardless if it was set before or not. Comment change is reflected in both [system.tables](../../../operations/system-tables/tables.md) and `SHOW CREATE TABLE` query. diff --git a/docs/en/sql-reference/statements/alter/delete.md b/docs/en/sql-reference/statements/alter/delete.md index b6f45b67d52..af56bec7a11 100644 --- a/docs/en/sql-reference/statements/alter/delete.md +++ b/docs/en/sql-reference/statements/alter/delete.md @@ -4,7 +4,7 @@ sidebar_position: 39 sidebar_label: DELETE --- -# ALTER TABLE … DELETE Statement +# ALTER TABLE ... DELETE Statement ``` sql ALTER TABLE [db.]table [ON CLUSTER cluster] DELETE WHERE filter_expr diff --git a/docs/en/sql-reference/statements/alter/index.md b/docs/en/sql-reference/statements/alter/index.md index 7961315c193..3cfb99cff83 100644 --- a/docs/en/sql-reference/statements/alter/index.md +++ b/docs/en/sql-reference/statements/alter/index.md @@ -42,7 +42,7 @@ These `ALTER` statements modify entities related to role-based access control: ## Mutations -`ALTER` queries that are intended to manipulate table data are implemented with a mechanism called “mutations”, most notably [ALTER TABLE … DELETE](/docs/en/sql-reference/statements/alter/delete.md) and [ALTER TABLE … UPDATE](/docs/en/sql-reference/statements/alter/update.md). 
They are asynchronous background processes similar to merges in [MergeTree](/docs/en/engines/table-engines/mergetree-family/index.md) tables that to produce new “mutated” versions of parts.
+`ALTER` queries that are intended to manipulate table data are implemented with a mechanism called “mutations”, most notably [ALTER TABLE ... DELETE](/docs/en/sql-reference/statements/alter/delete.md) and [ALTER TABLE ... UPDATE](/docs/en/sql-reference/statements/alter/update.md). They are asynchronous background processes similar to merges in [MergeTree](/docs/en/engines/table-engines/mergetree-family/index.md) tables that produce new “mutated” versions of parts.
 
 For `*MergeTree` tables mutations execute by **rewriting whole data parts**. There is no atomicity - parts are substituted for mutated parts as soon as they are ready and a `SELECT` query that started executing during a mutation will see data from parts that have already been mutated along with data from parts that have not been mutated yet.
 
diff --git a/docs/en/sql-reference/statements/alter/update.md b/docs/en/sql-reference/statements/alter/update.md
index ab7d0ca7378..0b300e5849a 100644
--- a/docs/en/sql-reference/statements/alter/update.md
+++ b/docs/en/sql-reference/statements/alter/update.md
@@ -4,7 +4,7 @@ sidebar_position: 40
 sidebar_label: UPDATE
 ---
 
-# ALTER TABLE … UPDATE Statements
+# ALTER TABLE ... UPDATE Statements
 
 ``` sql
 ALTER TABLE [db.]table [ON CLUSTER cluster] UPDATE column1 = expr1 [, ...] [IN PARTITION partition_id] WHERE filter_expr
diff --git a/docs/en/sql-reference/statements/alter/view.md b/docs/en/sql-reference/statements/alter/view.md
index e063b27424e..83e8e9311b4 100644
--- a/docs/en/sql-reference/statements/alter/view.md
+++ b/docs/en/sql-reference/statements/alter/view.md
@@ -4,9 +4,9 @@ sidebar_position: 50
 sidebar_label: VIEW
 ---
 
-# ALTER TABLE … MODIFY QUERY Statement
+# ALTER TABLE ... MODIFY QUERY Statement
 
-You can modify `SELECT` query that was specified when a [materialized view](../create/view.md#materialized) was created with the `ALTER TABLE … MODIFY QUERY` statement without interrupting ingestion process.
+You can modify `SELECT` query that was specified when a [materialized view](../create/view.md#materialized) was created with the `ALTER TABLE ... MODIFY QUERY` statement without interrupting ingestion process.
 
 This command is created to change materialized view created with `TO [db.]name` clause. It does not change the structure of the underlying storage table and it does not change the columns' definition of the materialized view, because of this the application of this command is very limited for materialized views are created without `TO [db.]name` clause.
 
@@ -198,6 +198,6 @@ SELECT * FROM mv;
 
 `ALTER LIVE VIEW ... REFRESH` statement refreshes a [Live view](../create/view.md#live-view). See [Force Live View Refresh](../create/view.md#live-view-alter-refresh).
 
-## ALTER TABLE … MODIFY REFRESH Statement
+## ALTER TABLE ... MODIFY REFRESH Statement
 
 `ALTER TABLE ... MODIFY REFRESH` statement changes refresh parameters of a [Refreshable Materialized View](../create/view.md#refreshable-materialized-view). See [Changing Refresh Parameters](../create/view.md#changing-refresh-parameters).
diff --git a/docs/en/sql-reference/statements/create/view.md b/docs/en/sql-reference/statements/create/view.md
index 073a3c0d246..b526c94e508 100644
--- a/docs/en/sql-reference/statements/create/view.md
+++ b/docs/en/sql-reference/statements/create/view.md
@@ -306,7 +306,7 @@ CREATE WINDOW VIEW test.wv TO test.dst WATERMARK=ASCENDING ALLOWED_LATENESS=INTE
 
 Note that elements emitted by a late firing should be treated as updated results of a previous computation. Instead of firing at the end of windows, the window view will fire immediately when the late event arrives. Thus, it will result in multiple outputs for the same window. Users need to take these duplicated results into account or deduplicate them.
 
-You can modify `SELECT` query that was specified in the window view by using `ALTER TABLE … MODIFY QUERY` statement. The data structure resulting in a new `SELECT` query should be the same as the original `SELECT` query when with or without `TO [db.]name` clause. Note that the data in the current window will be lost because the intermediate state cannot be reused.
+You can modify `SELECT` query that was specified in the window view by using `ALTER TABLE ... MODIFY QUERY` statement. The data structure resulting in a new `SELECT` query should be the same as the original `SELECT` query, with or without the `TO [db.]name` clause. Note that the data in the current window will be lost because the intermediate state cannot be reused.
 
 ### Monitoring New Windows
 
diff --git a/docs/en/sql-reference/statements/insert-into.md b/docs/en/sql-reference/statements/insert-into.md
index a76692cf291..f3dadabd25f 100644
--- a/docs/en/sql-reference/statements/insert-into.md
+++ b/docs/en/sql-reference/statements/insert-into.md
@@ -73,7 +73,7 @@ Data can be passed to the INSERT in any [format](../../interfaces/formats.md#for
 INSERT INTO [db.]table [(c1, c2, c3)] FORMAT format_name data_set
 ```
 
-For example, the following query format is identical to the basic version of INSERT … VALUES:
+For example, the following query format is identical to the basic version of INSERT ... VALUES:
 
 ``` sql
 INSERT INTO [db.]table [(c1, c2, c3)] FORMAT Values (v11, v12, v13), (v21, v22, v23), ...
diff --git a/docs/en/sql-reference/statements/select/limit.md b/docs/en/sql-reference/statements/select/limit.md
index d61a5a44b58..58fdf988bf3 100644
--- a/docs/en/sql-reference/statements/select/limit.md
+++ b/docs/en/sql-reference/statements/select/limit.md
@@ -17,11 +17,11 @@ If there is no [ORDER BY](../../../sql-reference/statements/select/order-by.md)
 The number of rows in the result set can also depend on the [limit](../../../operations/settings/settings.md#limit) setting.
 :::
 
-## LIMIT … WITH TIES Modifier
+## LIMIT ... WITH TIES Modifier
 
 When you set `WITH TIES` modifier for `LIMIT n[,m]` and specify `ORDER BY expr_list`, you will get in result first `n` or `n,m` rows and all rows with same `ORDER BY` fields values equal to row at position `n` for `LIMIT n` and `m` for `LIMIT n,m`.
 
-This modifier also can be combined with [ORDER BY … WITH FILL modifier](../../../sql-reference/statements/select/order-by.md#orderby-with-fill).
+This modifier also can be combined with [ORDER BY ... WITH FILL modifier](../../../sql-reference/statements/select/order-by.md#orderby-with-fill).
For example, the following query diff --git a/docs/en/sql-reference/statements/select/order-by.md b/docs/en/sql-reference/statements/select/order-by.md index d6432a7b4f8..512a58d7cd9 100644 --- a/docs/en/sql-reference/statements/select/order-by.md +++ b/docs/en/sql-reference/statements/select/order-by.md @@ -283,7 +283,7 @@ In `MaterializedView`-engine tables the optimization works with views like `SELE ## ORDER BY Expr WITH FILL Modifier -This modifier also can be combined with [LIMIT … WITH TIES modifier](../../../sql-reference/statements/select/limit.md#limit-with-ties). +This modifier also can be combined with [LIMIT ... WITH TIES modifier](../../../sql-reference/statements/select/limit.md#limit-with-ties). `WITH FILL` modifier can be set after `ORDER BY expr` with optional `FROM expr`, `TO expr` and `STEP expr` parameters. All missed values of `expr` column will be filled sequentially and other columns will be filled as defaults. diff --git a/docs/en/sql-reference/table-functions/file.md b/docs/en/sql-reference/table-functions/file.md index 3a63811add6..f66178afbb2 100644 --- a/docs/en/sql-reference/table-functions/file.md +++ b/docs/en/sql-reference/table-functions/file.md @@ -169,7 +169,7 @@ If your listing of files contains number ranges with leading zeros, use the cons **Example** -Query the total number of rows in files named `file000`, `file001`, … , `file999`: +Query the total number of rows in files named `file000`, `file001`, ... , `file999`: ``` sql SELECT count(*) FROM file('big_dir/file{0..9}{0..9}{0..9}', 'CSV', 'name String, value UInt32'); diff --git a/docs/en/sql-reference/table-functions/gcs.md b/docs/en/sql-reference/table-functions/gcs.md index 80077ecdb33..b891d88df31 100644 --- a/docs/en/sql-reference/table-functions/gcs.md +++ b/docs/en/sql-reference/table-functions/gcs.md @@ -130,7 +130,7 @@ FROM gcs('https://storage.googleapis.com/my-test-bucket-768/{some,another}_prefi If your listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`. ::: -Count the total amount of rows in files named `file-000.csv`, `file-001.csv`, … , `file-999.csv`: +Count the total amount of rows in files named `file-000.csv`, `file-001.csv`, ... , `file-999.csv`: ``` sql SELECT count(*) diff --git a/docs/en/sql-reference/table-functions/hdfs.md b/docs/en/sql-reference/table-functions/hdfs.md index 92f904b8841..d65615e7588 100644 --- a/docs/en/sql-reference/table-functions/hdfs.md +++ b/docs/en/sql-reference/table-functions/hdfs.md @@ -85,7 +85,7 @@ If your listing of files contains number ranges with leading zeros, use the cons **Example** -Query the data from files named `file000`, `file001`, … , `file999`: +Query the data from files named `file000`, `file001`, ... , `file999`: ``` sql SELECT count(*) diff --git a/docs/en/sql-reference/table-functions/s3.md b/docs/en/sql-reference/table-functions/s3.md index 38d77a98749..cbef80371a3 100644 --- a/docs/en/sql-reference/table-functions/s3.md +++ b/docs/en/sql-reference/table-functions/s3.md @@ -137,7 +137,7 @@ FROM s3('https://clickhouse-public-datasets.s3.amazonaws.com/my-test-bucket-768/ If your listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`. ::: -Count the total amount of rows in files named `file-000.csv`, `file-001.csv`, … , `file-999.csv`: +Count the total amount of rows in files named `file-000.csv`, `file-001.csv`, ... 
, `file-999.csv`: ``` sql SELECT count(*) diff --git a/docs/ru/development/style.md b/docs/ru/development/style.md index cd1297504af..08fa7a1e603 100644 --- a/docs/ru/development/style.md +++ b/docs/ru/development/style.md @@ -57,7 +57,7 @@ memcpy(&buf[place_value], &x, sizeof(x)); for (size_t i = 0; i < rows; i += storage.index_granularity) ``` -**7.** Вокруг бинарных операторов (`+`, `-`, `*`, `/`, `%`, …), а также тернарного оператора `?:` ставятся пробелы. +**7.** Вокруг бинарных операторов (`+`, `-`, `*`, `/`, `%`, ...), а также тернарного оператора `?:` ставятся пробелы. ``` cpp UInt16 year = (s[0] - '0') * 1000 + (s[1] - '0') * 100 + (s[2] - '0') * 10 + (s[3] - '0'); @@ -86,7 +86,7 @@ dst.ClickGoodEvent = click.GoodEvent; При необходимости, оператор может быть перенесён на новую строку. В этом случае, перед ним увеличивается отступ. -**11.** Унарные операторы `--`, `++`, `*`, `&`, … не отделяются от аргумента пробелом. +**11.** Унарные операторы `--`, `++`, `*`, `&`, ... не отделяются от аргумента пробелом. **12.** После запятой ставится пробел, а перед — нет. Аналогично для точки с запятой внутри выражения `for`. @@ -115,7 +115,7 @@ public: **16.** Если на весь файл один `namespace` и кроме него ничего существенного нет, то отступ внутри `namespace` не нужен. -**17.** Если блок для выражения `if`, `for`, `while`, … состоит из одного `statement`, то фигурные скобки не обязательны. Вместо этого поместите `statement` на отдельную строку. Это правило справедливо и для вложенных `if`, `for`, `while`, … +**17.** Если блок для выражения `if`, `for`, `while`, ... состоит из одного `statement`, то фигурные скобки не обязательны. Вместо этого поместите `statement` на отдельную строку. Это правило справедливо и для вложенных `if`, `for`, `while`, ... Если внутренний `statement` содержит фигурные скобки или `else`, то внешний блок следует писать в фигурных скобках. @@ -266,7 +266,7 @@ void executeQuery( Пример взят с ресурса http://home.tamk.fi/~jaalto/course/coding-style/doc/unmaintainable-code/. -**7.** Нельзя писать мусорные комментарии (автор, дата создания…) в начале каждого файла. +**7.** Нельзя писать мусорные комментарии (автор, дата создания...) в начале каждого файла. **8.** Однострочные комментарии начинаются с трёх слешей: `///` , многострочные с `/**`. Такие комментарии считаются «документирующими». diff --git a/docs/ru/engines/table-engines/integrations/hdfs.md b/docs/ru/engines/table-engines/integrations/hdfs.md index 72087b56652..cf43eef73e3 100644 --- a/docs/ru/engines/table-engines/integrations/hdfs.md +++ b/docs/ru/engines/table-engines/integrations/hdfs.md @@ -103,7 +103,7 @@ CREATE TABLE table_with_asterisk (name String, value UInt32) ENGINE = HDFS('hdfs **Example** -Создадим таблицу с именами `file000`, `file001`, … , `file999`: +Создадим таблицу с именами `file000`, `file001`, ... , `file999`: ``` sql CREATE TABLE big_table (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/big_dir/file{0..9}{0..9}{0..9}', 'CSV') diff --git a/docs/ru/engines/table-engines/integrations/s3.md b/docs/ru/engines/table-engines/integrations/s3.md index 720aa589122..a1c69df4d0a 100644 --- a/docs/ru/engines/table-engines/integrations/s3.md +++ b/docs/ru/engines/table-engines/integrations/s3.md @@ -73,7 +73,7 @@ SELECT * FROM s3_engine_table LIMIT 2; **Пример подстановки 1** -Таблица содержит данные из файлов с именами `file-000.csv`, `file-001.csv`, … , `file-999.csv`: +Таблица содержит данные из файлов с именами `file-000.csv`, `file-001.csv`, ... 
, `file-999.csv`: ``` sql CREATE TABLE big_table (name String, value UInt32) diff --git a/docs/ru/engines/table-engines/mergetree-family/custom-partitioning-key.md b/docs/ru/engines/table-engines/mergetree-family/custom-partitioning-key.md index 46597c94370..c3203804211 100644 --- a/docs/ru/engines/table-engines/mergetree-family/custom-partitioning-key.md +++ b/docs/ru/engines/table-engines/mergetree-family/custom-partitioning-key.md @@ -66,7 +66,7 @@ WHERE table = 'visits' └───────────┴───────────────────┴────────┘ ``` -Столбец `partition` содержит имена всех партиций таблицы. Таблица `visits` из нашего примера содержит две партиции: `201901` и `201902`. Используйте значения из этого столбца в запросах [ALTER … PARTITION](../../../sql-reference/statements/alter/partition.md). +Столбец `partition` содержит имена всех партиций таблицы. Таблица `visits` из нашего примера содержит две партиции: `201901` и `201902`. Используйте значения из этого столбца в запросах [ALTER ... PARTITION](../../../sql-reference/statements/alter/partition.md). Столбец `name` содержит названия кусков партиций. Значения из этого столбца можно использовать в запросах [ALTER ATTACH PART](../../../sql-reference/statements/alter/partition.md#alter_attach-partition). diff --git a/docs/ru/engines/table-engines/mergetree-family/mergetree.md b/docs/ru/engines/table-engines/mergetree-family/mergetree.md index faa492d4d85..49ba229b1d5 100644 --- a/docs/ru/engines/table-engines/mergetree-family/mergetree.md +++ b/docs/ru/engines/table-engines/mergetree-family/mergetree.md @@ -771,7 +771,7 @@ SETTINGS storage_policy = 'moving_from_ssd_to_hdd' - В результате вставки (запрос `INSERT`). - В фоновых операциях слияний и [мутаций](../../../sql-reference/statements/alter/index.md#mutations). - При скачивании данных с другой реплики. -- В результате заморозки партиций [ALTER TABLE … FREEZE PARTITION](../../../engines/table-engines/mergetree-family/mergetree.md#alter_freeze-partition). +- В результате заморозки партиций [ALTER TABLE ... FREEZE PARTITION](../../../engines/table-engines/mergetree-family/mergetree.md#alter_freeze-partition). Во всех случаях, кроме мутаций и заморозки партиций, при записи куска выбирается том и диск в соответствии с указанной конфигурацией хранилища: @@ -781,7 +781,7 @@ SETTINGS storage_policy = 'moving_from_ssd_to_hdd' Мутации и запросы заморозки партиций в реализации используют [жесткие ссылки](https://ru.wikipedia.org/wiki/%D0%96%D1%91%D1%81%D1%82%D0%BA%D0%B0%D1%8F_%D1%81%D1%81%D1%8B%D0%BB%D0%BA%D0%B0). Жесткие ссылки между различными дисками не поддерживаются, поэтому в случае таких операций куски размещаются на тех же дисках, что и исходные. В фоне куски перемещаются между томами на основе информации о занятом месте (настройка `move_factor`) по порядку, в котором указаны тома в конфигурации. Данные никогда не перемещаются с последнего тома и на первый том. Следить за фоновыми перемещениями можно с помощью системных таблиц [system.part_log](../../../engines/table-engines/mergetree-family/mergetree.md#system_tables-part-log) (поле `type = MOVE_PART`) и [system.parts](../../../engines/table-engines/mergetree-family/mergetree.md#system_tables-parts) (поля `path` и `disk`). Также подробная информация о перемещениях доступна в логах сервера. -С помощью запроса [ALTER TABLE … MOVE PART\|PARTITION … TO VOLUME\|DISK …](../../../engines/table-engines/mergetree-family/mergetree.md#alter_move-partition) пользователь может принудительно перенести кусок или партицию с одного раздела на другой. 
При этом учитываются все ограничения, указанные для фоновых операций. Запрос самостоятельно инициирует процесс перемещения не дожидаясь фоновых операций. В случае недостатка места или неудовлетворения ограничениям пользователь получит сообщение об ошибке. +С помощью запроса [ALTER TABLE ... MOVE PART\|PARTITION ... TO VOLUME\|DISK ...](../../../engines/table-engines/mergetree-family/mergetree.md#alter_move-partition) пользователь может принудительно перенести кусок или партицию с одного раздела на другой. При этом учитываются все ограничения, указанные для фоновых операций. Запрос самостоятельно инициирует процесс перемещения не дожидаясь фоновых операций. В случае недостатка места или неудовлетворения ограничениям пользователь получит сообщение об ошибке. Перемещения данных не взаимодействуют с репликацией данных, поэтому на разных репликах одной и той же таблицы могут быть указаны разные политики хранения. diff --git a/docs/ru/engines/table-engines/special/external-data.md b/docs/ru/engines/table-engines/special/external-data.md index 881566e5f34..3d9737096f5 100644 --- a/docs/ru/engines/table-engines/special/external-data.md +++ b/docs/ru/engines/table-engines/special/external-data.md @@ -31,7 +31,7 @@ ClickHouse позволяет отправить на сервер данные, - **--format** - формат данных в файле. Если не указано - используется TabSeparated. Должен быть указан один из следующих параметров: -- **--types** - список типов столбцов через запятую. Например, `UInt64,String`. Столбцы будут названы _1, _2, … +- **--types** - список типов столбцов через запятую. Например, `UInt64,String`. Столбцы будут названы _1, _2, ... - **--structure** - структура таблицы, в форме `UserID UInt64`, `URL String`. Определяет имена и типы столбцов. Файлы, указанные в file, будут разобраны форматом, указанным в format, с использованием типов данных, указанных в types или structure. Таблица будет загружена на сервер, и доступна там в качестве временной таблицы с именем name. diff --git a/docs/ru/faq/general/olap.md b/docs/ru/faq/general/olap.md index c9021f7c92e..bcfe9663381 100644 --- a/docs/ru/faq/general/olap.md +++ b/docs/ru/faq/general/olap.md @@ -9,13 +9,13 @@ sidebar_position: 100 [OLAP](https://ru.wikipedia.org/wiki/OLAP) (OnLine Analytical Processing) переводится как обработка данных в реальном времени. Это широкий термин, который можно рассмотреть с двух сторон: с технической и с точки зрения бизнеса. Для самого общего понимания можно просто прочитать его с конца: **Processing** - Обрабатываются некие исходные данные… + Обрабатываются некие исходные данные... **Analytical** -: … чтобы получить какие-то аналитические отчеты или новые знания… +: ... чтобы получить какие-то аналитические отчеты или новые знания... **OnLine** -: … в реальном времени, практически без задержек на обработку. +: ... в реальном времени, практически без задержек на обработку. ## OLAP с точки зрения бизнеса {#olap-from-the-business-perspective} diff --git a/docs/ru/getting-started/example-datasets/nyc-taxi.md b/docs/ru/getting-started/example-datasets/nyc-taxi.md index 12d0c18c3a1..a42033e7d41 100644 --- a/docs/ru/getting-started/example-datasets/nyc-taxi.md +++ b/docs/ru/getting-started/example-datasets/nyc-taxi.md @@ -196,7 +196,7 @@ real 75m56.214s (Импорт данных напрямую из Postgres также возможен с использованием `COPY ... TO PROGRAM`.) -К сожалению, все поля, связанные с погодой (precipitation…average_wind_speed) заполнены NULL. Из-за этого мы исключим их из финального набора данных. 
+К сожалению, все поля, связанные с погодой (precipitation...average_wind_speed) заполнены NULL. Из-за этого мы исключим их из финального набора данных. Для начала мы создадим таблицу на одном сервере. Позже мы сделаем таблицу распределенной. diff --git a/docs/ru/index.md b/docs/ru/index.md index 29f2bbe07fb..d551d492af5 100644 --- a/docs/ru/index.md +++ b/docs/ru/index.md @@ -15,7 +15,7 @@ ClickHouse — столбцовая система управления база | #0 | 89354350662 | 1 | Investor Relations | 1 | 2016-05-18 05:19:20 | | #1 | 90329509958 | 0 | Contact us | 1 | 2016-05-18 08:10:20 | | #2 | 89953706054 | 1 | Mission | 1 | 2016-05-18 07:38:00 | -| #N | … | … | … | … | … | +| #N | ... | ... | ... | ... | ... | То есть, значения, относящиеся к одной строке, физически хранятся рядом. @@ -26,11 +26,11 @@ ClickHouse — столбцовая система управления база | Строка: | #0 | #1 | #2 | #N | |-------------|---------------------|---------------------|---------------------|-----| -| WatchID: | 89354350662 | 90329509958 | 89953706054 | … | -| JavaEnable: | 1 | 0 | 1 | … | -| Title: | Investor Relations | Contact us | Mission | … | -| GoodEvent: | 1 | 1 | 1 | … | -| EventTime: | 2016-05-18 05:19:20 | 2016-05-18 08:10:20 | 2016-05-18 07:38:00 | … | +| WatchID: | 89354350662 | 90329509958 | 89953706054 | ... | +| JavaEnable: | 1 | 0 | 1 | ... | +| Title: | Investor Relations | Contact us | Mission | ... | +| GoodEvent: | 1 | 1 | 1 | ... | +| EventTime: | 2016-05-18 05:19:20 | 2016-05-18 08:10:20 | 2016-05-18 07:38:00 | ... | В примерах изображён только порядок расположения данных. То есть значения из разных столбцов хранятся отдельно, а данные одного столбца — вместе. diff --git a/docs/ru/operations/settings/query-complexity.md b/docs/ru/operations/settings/query-complexity.md index d1d38a587c6..e82a5a008eb 100644 --- a/docs/ru/operations/settings/query-complexity.md +++ b/docs/ru/operations/settings/query-complexity.md @@ -260,7 +260,7 @@ FORMAT Null; Ограничивает количество строк в хэш-таблице, используемой при соединении таблиц. -Параметр применяется к операциям [SELECT… JOIN](../../sql-reference/statements/select/join.md#select-join) и к движку таблиц [Join](../../engines/table-engines/special/join.md). +Параметр применяется к операциям [SELECT... JOIN](../../sql-reference/statements/select/join.md#select-join) и к движку таблиц [Join](../../engines/table-engines/special/join.md). Если запрос содержит несколько `JOIN`, то ClickHouse проверяет значение настройки для каждого промежуточного результата. @@ -277,7 +277,7 @@ FORMAT Null; Ограничивает размер (в байтах) хэш-таблицы, используемой при объединении таблиц. -Параметр применяется к операциям [SELECT… JOIN](../../sql-reference/statements/select/join.md#select-join) и к движку таблиц [Join](../../engines/table-engines/special/join.md). +Параметр применяется к операциям [SELECT... JOIN](../../sql-reference/statements/select/join.md#select-join) и к движку таблиц [Join](../../engines/table-engines/special/join.md). Если запрос содержит несколько `JOIN`, то ClickHouse проверяет значение настройки для каждого промежуточного результата. 
diff --git a/docs/ru/operations/settings/settings.md b/docs/ru/operations/settings/settings.md index 2b3607dcf08..3a70a0bac12 100644 --- a/docs/ru/operations/settings/settings.md +++ b/docs/ru/operations/settings/settings.md @@ -1859,7 +1859,7 @@ SELECT * FROM test_table ## count_distinct_implementation {#settings-count_distinct_implementation} -Задаёт, какая из функций `uniq*` используется при выполнении конструкции [COUNT(DISTINCT …)](../../sql-reference/aggregate-functions/reference/count.md#agg_function-count). +Задаёт, какая из функций `uniq*` используется при выполнении конструкции [COUNT(DISTINCT ...)](../../sql-reference/aggregate-functions/reference/count.md#agg_function-count). Возможные значения: diff --git a/docs/ru/sql-reference/aggregate-functions/parametric-functions.md b/docs/ru/sql-reference/aggregate-functions/parametric-functions.md index 6463f6bd95d..e6a61d9b381 100644 --- a/docs/ru/sql-reference/aggregate-functions/parametric-functions.md +++ b/docs/ru/sql-reference/aggregate-functions/parametric-functions.md @@ -82,7 +82,7 @@ FROM В этом случае необходимо помнить, что границы корзин гистограммы не известны. -## sequenceMatch(pattern)(timestamp, cond1, cond2, …) {#function-sequencematch} +## sequenceMatch(pattern)(timestamp, cond1, cond2, ...) {#function-sequencematch} Проверяет, содержит ли последовательность событий цепочку, которая соответствует указанному шаблону. @@ -172,7 +172,7 @@ SELECT sequenceMatch('(?1)(?2)')(time, number = 1, number = 2, number = 4) FROM - [sequenceCount](#function-sequencecount) -## sequenceCount(pattern)(time, cond1, cond2, …) {#function-sequencecount} +## sequenceCount(pattern)(time, cond1, cond2, ...) {#function-sequencecount} Вычисляет количество цепочек событий, соответствующих шаблону. Функция обнаруживает только непересекающиеся цепочки событий. Она начинает искать следующую цепочку только после того, как полностью совпала текущая цепочка событий. diff --git a/docs/ru/sql-reference/aggregate-functions/reference/quantiles.md b/docs/ru/sql-reference/aggregate-functions/reference/quantiles.md index fed0f8b328b..a0a430f7a68 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/quantiles.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/quantiles.md @@ -7,7 +7,7 @@ sidebar_position: 201 ## quantiles {#quantiles} -Синтаксис: `quantiles(level1, level2, …)(x)` +Синтаксис: `quantiles(level1, level2, ...)(x)` Все функции для вычисления квантилей имеют соответствующие функции для вычисления нескольких квантилей: `quantiles`, `quantilesDeterministic`, `quantilesTiming`, `quantilesTimingWeighted`, `quantilesExact`, `quantilesExactWeighted`, `quantilesTDigest`, `quantilesBFloat16`. Эти функции вычисляют все квантили указанных уровней в один проход и возвращают массив с вычисленными значениями. diff --git a/docs/ru/sql-reference/data-types/aggregatefunction.md b/docs/ru/sql-reference/data-types/aggregatefunction.md index e42b467e4af..0481151c7e4 100644 --- a/docs/ru/sql-reference/data-types/aggregatefunction.md +++ b/docs/ru/sql-reference/data-types/aggregatefunction.md @@ -6,9 +6,9 @@ sidebar_label: AggregateFunction # AggregateFunction {#data-type-aggregatefunction} -Агрегатные функции могут обладать определяемым реализацией промежуточным состоянием, которое может быть сериализовано в тип данных, соответствующий AggregateFunction(…), и быть записано в таблицу обычно посредством [материализованного представления](../../sql-reference/statements/create/view.md). 
Чтобы получить промежуточное состояние, обычно используются агрегатные функции с суффиксом `-State`. Чтобы в дальнейшем получить агрегированные данные необходимо использовать те же агрегатные функции с суффиксом `-Merge`. +Агрегатные функции могут обладать определяемым реализацией промежуточным состоянием, которое может быть сериализовано в тип данных, соответствующий AggregateFunction(...), и быть записано в таблицу обычно посредством [материализованного представления](../../sql-reference/statements/create/view.md). Чтобы получить промежуточное состояние, обычно используются агрегатные функции с суффиксом `-State`. Чтобы в дальнейшем получить агрегированные данные необходимо использовать те же агрегатные функции с суффиксом `-Merge`. -`AggregateFunction(name, types_of_arguments…)` — параметрический тип данных. +`AggregateFunction(name, types_of_arguments...)` — параметрический тип данных. **Параметры** diff --git a/docs/ru/sql-reference/data-types/fixedstring.md b/docs/ru/sql-reference/data-types/fixedstring.md index d7a4e865903..56a5632f88d 100644 --- a/docs/ru/sql-reference/data-types/fixedstring.md +++ b/docs/ru/sql-reference/data-types/fixedstring.md @@ -21,8 +21,8 @@ sidebar_label: FixedString(N) Примеры значений, которые можно эффективно хранить в столбцах типа `FixedString`: - Двоичное представление IP-адреса (`FixedString(16)` для IPv6). -- Коды языков (ru_RU, en_US … ). -- Коды валют (USD, RUB … ). +- Коды языков (ru_RU, en_US ... ). +- Коды валют (USD, RUB ... ). - Двоичное представление хэшей (`FixedString(16)` для MD5, `FixedString(32)` для SHA256). Для хранения значений UUID используйте тип данных [UUID](uuid.md). diff --git a/docs/ru/sql-reference/data-types/nested-data-structures/nested.md b/docs/ru/sql-reference/data-types/nested-data-structures/nested.md index 4ec8333d563..8fd293a0415 100644 --- a/docs/ru/sql-reference/data-types/nested-data-structures/nested.md +++ b/docs/ru/sql-reference/data-types/nested-data-structures/nested.md @@ -3,7 +3,7 @@ slug: /ru/sql-reference/data-types/nested-data-structures/nested --- # Nested {#nested} -## Nested(Name1 Type1, Name2 Type2, …) {#nestedname1-type1-name2-type2} +## Nested(Name1 Type1, Name2 Type2, ...) {#nestedname1-type1-name2-type2} Вложенная структура данных - это как будто вложенная таблица. Параметры вложенной структуры данных - имена и типы столбцов, указываются так же, как у запроса CREATE. Каждой строке таблицы может соответствовать произвольное количество строк вложенной структуры данных. diff --git a/docs/ru/sql-reference/data-types/tuple.md b/docs/ru/sql-reference/data-types/tuple.md index 8953134d154..9d86c26c563 100644 --- a/docs/ru/sql-reference/data-types/tuple.md +++ b/docs/ru/sql-reference/data-types/tuple.md @@ -4,7 +4,7 @@ sidebar_position: 54 sidebar_label: Tuple(T1, T2, ...) --- -# Tuple(T1, T2, …) {#tuplet1-t2} +# Tuple(T1, T2, ...) {#tuplet1-t2} Кортеж из элементов любого [типа](index.md#data_types). Элементы кортежа могут быть одного или разных типов. diff --git a/docs/ru/sql-reference/functions/array-functions.md b/docs/ru/sql-reference/functions/array-functions.md index 1f06bdf264a..825e3f06be2 100644 --- a/docs/ru/sql-reference/functions/array-functions.md +++ b/docs/ru/sql-reference/functions/array-functions.md @@ -161,7 +161,7 @@ SELECT range(5), range(1, 5), range(1, 5, 2); ``` -## array(x1, …), оператор \[x1, …\] {#arrayx1-operator-x1} +## array(x1, ...), оператор \[x1, ...\] {#arrayx1-operator-x1} Создаёт массив из аргументов функции. 
Аргументы должны быть константами и иметь типы, для которых есть наименьший общий тип. Должен быть передан хотя бы один аргумент, так как иначе непонятно, какого типа создавать массив. То есть, с помощью этой функции невозможно создать пустой массив (для этого используйте функции emptyArray\*, описанные выше). @@ -308,7 +308,7 @@ SELECT indexOf([1, 3, NULL, NULL], NULL) Элементы, равные `NULL`, обрабатываются как обычные значения. -## arrayCount(\[func,\] arr1, …) {#array-count} +## arrayCount(\[func,\] arr1, ...) {#array-count} Возвращает количество элементов массива `arr`, для которых функция `func` возвращает не 0. Если `func` не указана - возвращает количество ненулевых элементов массива. @@ -335,7 +335,7 @@ SELECT countEqual([1, 2, NULL, NULL], NULL) ## arrayEnumerate(arr) {#array_functions-arrayenumerate} -Возвращает массив \[1, 2, 3, …, length(arr)\] +Возвращает массив \[1, 2, 3, ..., length(arr)\] Эта функция обычно используется совместно с ARRAY JOIN. Она позволяет, после применения ARRAY JOIN, посчитать что-либо только один раз для каждого массива. Пример: @@ -375,7 +375,7 @@ WHERE (CounterID = 160656) AND notEmpty(GoalsReached) Также эта функция может быть использована в функциях высшего порядка. Например, с её помощью можно достать индексы массива для элементов, удовлетворяющих некоторому условию. -## arrayEnumerateUniq(arr, …) {#arrayenumerateuniqarr} +## arrayEnumerateUniq(arr, ...) {#arrayenumerateuniqarr} Возвращает массив, такого же размера, как исходный, где для каждого элемента указано, какой он по счету среди элементов с таким же значением. Например: arrayEnumerateUniq(\[10, 20, 10, 30\]) = \[1, 1, 2, 1\]. @@ -597,7 +597,7 @@ SELECT arraySlice([1, 2, NULL, 4, 5], 2, 3) AS res; Элементы массива равные `NULL` обрабатываются как обычные значения. -## arraySort(\[func,\] arr, …) {#array_functions-sort} +## arraySort(\[func,\] arr, ...) {#array_functions-sort} Возвращает массив `arr`, отсортированный в восходящем порядке. Если задана функция `func`, то порядок сортировки определяется результатом применения этой функции на элементы массива `arr`. Если `func` принимает несколько аргументов, то в функцию `arraySort` нужно передавать несколько массивов, которые будут соответствовать аргументам функции `func`. Подробные примеры рассмотрены в конце описания `arraySort`. @@ -698,11 +698,11 @@ SELECT arraySort((x, y) -> -y, [0, 1, 2], [1, 2, 3]) as res; Для улучшения эффективности сортировки применяется [преобразование Шварца](https://ru.wikipedia.org/wiki/%D0%9F%D1%80%D0%B5%D0%BE%D0%B1%D1%80%D0%B0%D0%B7%D0%BE%D0%B2%D0%B0%D0%BD%D0%B8%D0%B5_%D0%A8%D0%B2%D0%B0%D1%80%D1%86%D0%B0). ::: -## arrayPartialSort(\[func,\] limit, arr, …) {#array_functions-sort} +## arrayPartialSort(\[func,\] limit, arr, ...) {#array_functions-sort} То же, что и `arraySort` с дополнительным аргументом `limit`, позволяющим частичную сортировку. Возвращает массив того же размера, как и исходный, в котором элементы `[1..limit]` отсортированы в возрастающем порядке. Остальные элементы `(limit..N]` остаются в неспецифицированном порядке. -## arrayReverseSort(\[func,\] arr, …) {#array_functions-reverse-sort} +## arrayReverseSort(\[func,\] arr, ...) {#array_functions-reverse-sort} Возвращает массив `arr`, отсортированный в нисходящем порядке. Если указана функция `func`, то массив `arr` сначала сортируется в порядке, который определяется функцией `func`, а затем отсортированный массив переворачивается. 
Если функция `func` принимает несколько аргументов, то в функцию `arrayReverseSort` необходимо передавать несколько массивов, которые будут соответствовать аргументам функции `func`. Подробные примеры рассмотрены в конце описания функции `arrayReverseSort`. @@ -803,11 +803,11 @@ SELECT arrayReverseSort((x, y) -> -y, [4, 3, 5], [1, 2, 3]) AS res; └─────────┘ ``` -## arrayPartialReverseSort(\[func,\] limit, arr, …) {#array_functions-sort} +## arrayPartialReverseSort(\[func,\] limit, arr, ...) {#array_functions-sort} То же, что и `arrayReverseSort` с дополнительным аргументом `limit`, позволяющим частичную сортировку. Возвращает массив того же размера, как и исходный, в котором элементы `[1..limit]` отсортированы в убывающем порядке. Остальные элементы `(limit..N]` остаются в неспецифицированном порядке. -## arrayUniq(arr, …) {#array-functions-arrayuniq} +## arrayUniq(arr, ...) {#array-functions-arrayuniq} Если передан один аргумент, считает количество разных элементов в массиве. Если передано несколько аргументов, считает количество разных кортежей из элементов на соответствующих позициях в нескольких массивах. @@ -1174,7 +1174,7 @@ SELECT arrayZip(['a', 'b', 'c'], [5, 2, 1]); └──────────────────────────────────────┘ ``` -## arrayMap(func, arr1, …) {#array-map} +## arrayMap(func, arr1, ...) {#array-map} Возвращает массив, полученный на основе результатов применения функции `func` к каждому элементу массива `arr`. @@ -1204,7 +1204,7 @@ SELECT arrayMap((x, y) -> (x, y), [1, 2, 3], [4, 5, 6]) AS res; Функция `arrayMap` является [функцией высшего порядка](../../sql-reference/functions/index.md#higher-order-functions) — в качестве первого аргумента ей нужно передать лямбда-функцию, и этот аргумент не может быть опущен. -## arrayFilter(func, arr1, …) {#array-filter} +## arrayFilter(func, arr1, ...) {#array-filter} Возвращает массив, содержащий только те элементы массива `arr1`, для которых функция `func` возвращает не 0. @@ -1237,7 +1237,7 @@ SELECT Функция `arrayFilter` является [функцией высшего порядка](../../sql-reference/functions/index.md#higher-order-functions) — в качестве первого аргумента ей нужно передать лямбда-функцию, и этот аргумент не может быть опущен. -## arrayFill(func, arr1, …) {#array-fill} +## arrayFill(func, arr1, ...) {#array-fill} Перебирает `arr1` от первого элемента к последнему и заменяет `arr1[i]` на `arr1[i - 1]`, если `func` вернула 0. Первый элемент `arr1` остаётся неизменным. @@ -1255,7 +1255,7 @@ SELECT arrayFill(x -> not isNull(x), [1, null, 3, 11, 12, null, null, 5, 6, 14, Функция `arrayFill` является [функцией высшего порядка](../../sql-reference/functions/index.md#higher-order-functions) — в качестве первого аргумента ей нужно передать лямбда-функцию, и этот аргумент не может быть опущен. -## arrayReverseFill(func, arr1, …) {#array-reverse-fill} +## arrayReverseFill(func, arr1, ...) {#array-reverse-fill} Перебирает `arr1` от последнего элемента к первому и заменяет `arr1[i]` на `arr1[i + 1]`, если `func` вернула 0. Последний элемент `arr1` остаётся неизменным. @@ -1273,7 +1273,7 @@ SELECT arrayReverseFill(x -> not isNull(x), [1, null, 3, 11, 12, null, null, 5, Функция `arrayReverseFill` является [функцией высшего порядка](../../sql-reference/functions/index.md#higher-order-functions) — в качестве первого аргумента ей нужно передать лямбда-функцию, и этот аргумент не может быть опущен. -## arraySplit(func, arr1, …) {#array-split} +## arraySplit(func, arr1, ...) {#array-split} Разделяет массив `arr1` на несколько. 
Если `func` возвращает не 0, то массив разделяется, а элемент помещается в левую часть. Массив не разбивается по первому элементу. @@ -1291,7 +1291,7 @@ SELECT arraySplit((x, y) -> y, [1, 2, 3, 4, 5], [1, 0, 0, 1, 0]) AS res Функция `arraySplit` является [функцией высшего порядка](../../sql-reference/functions/index.md#higher-order-functions) — в качестве первого аргумента ей нужно передать лямбда-функцию, и этот аргумент не может быть опущен. -## arrayReverseSplit(func, arr1, …) {#array-reverse-split} +## arrayReverseSplit(func, arr1, ...) {#array-reverse-split} Разделяет массив `arr1` на несколько. Если `func` возвращает не 0, то массив разделяется, а элемент помещается в правую часть. Массив не разбивается по последнему элементу. @@ -1309,25 +1309,25 @@ SELECT arrayReverseSplit((x, y) -> y, [1, 2, 3, 4, 5], [1, 0, 0, 1, 0]) AS res Функция `arrayReverseSplit` является [функцией высшего порядка](../../sql-reference/functions/index.md#higher-order-functions) — в качестве первого аргумента ей нужно передать лямбда-функцию, и этот аргумент не может быть опущен. -## arrayExists(\[func,\] arr1, …) {#arrayexistsfunc-arr1} +## arrayExists(\[func,\] arr1, ...) {#arrayexistsfunc-arr1} Возвращает 1, если существует хотя бы один элемент массива `arr`, для которого функция func возвращает не 0. Иначе возвращает 0. Функция `arrayExists` является [функцией высшего порядка](../../sql-reference/functions/index.md#higher-order-functions) - в качестве первого аргумента ей можно передать лямбда-функцию. -## arrayAll(\[func,\] arr1, …) {#arrayallfunc-arr1} +## arrayAll(\[func,\] arr1, ...) {#arrayallfunc-arr1} Возвращает 1, если для всех элементов массива `arr`, функция `func` возвращает не 0. Иначе возвращает 0. Функция `arrayAll` является [функцией высшего порядка](../../sql-reference/functions/index.md#higher-order-functions) - в качестве первого аргумента ей можно передать лямбда-функцию. -## arrayFirst(func, arr1, …) {#array-first} +## arrayFirst(func, arr1, ...) {#array-first} Возвращает первый элемент массива `arr1`, для которого функция func возвращает не 0. Функция `arrayFirst` является [функцией высшего порядка](../../sql-reference/functions/index.md#higher-order-functions) — в качестве первого аргумента ей нужно передать лямбда-функцию, и этот аргумент не может быть опущен. -## arrayFirstIndex(func, arr1, …) {#array-first-index} +## arrayFirstIndex(func, arr1, ...) {#array-first-index} Возвращает индекс первого элемента массива `arr1`, для которого функция func возвращает не 0. @@ -1599,7 +1599,7 @@ SELECT arraySum(x -> x*x, [2, 3]) AS res; └─────┘ ``` -## arrayCumSum(\[func,\] arr1, …) {#arraycumsumfunc-arr1} +## arrayCumSum(\[func,\] arr1, ...) {#arraycumsumfunc-arr1} Возвращает массив из частичных сумм элементов исходного массива (сумма с накоплением). Если указана функция `func`, то значения элементов массива преобразуются этой функцией перед суммированием. diff --git a/docs/ru/sql-reference/functions/date-time-functions.md b/docs/ru/sql-reference/functions/date-time-functions.md index 56ae4359bf1..bcc5f807c32 100644 --- a/docs/ru/sql-reference/functions/date-time-functions.md +++ b/docs/ru/sql-reference/functions/date-time-functions.md @@ -559,7 +559,7 @@ SELECT Описание режимов (mode): -| Mode | Первый день недели | Диапазон | Неделя 1 это первая неделя … | +| Mode | Первый день недели | Диапазон | Неделя 1 это первая неделя ... 
| | ----------- | -------- | -------- | ------------------ | |0|Воскресенье|0-53|с воскресеньем в этом году |1|Понедельник|0-53|с 4-мя или более днями в этом году diff --git a/docs/ru/sql-reference/functions/json-functions.md b/docs/ru/sql-reference/functions/json-functions.md index 123f40ce05d..18f625bf80f 100644 --- a/docs/ru/sql-reference/functions/json-functions.md +++ b/docs/ru/sql-reference/functions/json-functions.md @@ -88,7 +88,7 @@ SELECT isValidJSON('{"a": "hello", "b": [-100, 200.0, 300]}') = 1 SELECT isValidJSON('not a json') = 0 ``` -## JSONHas(json\[, indices_or_keys\]…) {#jsonhasjson-indices-or-keys} +## JSONHas(json\[, indices_or_keys\]...) {#jsonhasjson-indices-or-keys} Если значение существует в документе JSON, то возвращается `1`. @@ -121,7 +121,7 @@ SELECT JSONExtractKey('{"a": "hello", "b": [-100, 200.0, 300]}', -2) = 'a' SELECT JSONExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', 1) = 'hello' ``` -## JSONLength(json\[, indices_or_keys\]…) {#jsonlengthjson-indices-or-keys} +## JSONLength(json\[, indices_or_keys\]...) {#jsonlengthjson-indices-or-keys} Возвращает длину массива JSON или объекта JSON. @@ -134,7 +134,7 @@ SELECT JSONLength('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 3 SELECT JSONLength('{"a": "hello", "b": [-100, 200.0, 300]}') = 2 ``` -## JSONType(json\[, indices_or_keys\]…) {#jsontypejson-indices-or-keys} +## JSONType(json\[, indices_or_keys\]...) {#jsontypejson-indices-or-keys} Возвращает тип значения JSON. @@ -148,13 +148,13 @@ SELECT JSONType('{"a": "hello", "b": [-100, 200.0, 300]}', 'a') = 'String' SELECT JSONType('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 'Array' ``` -## JSONExtractUInt(json\[, indices_or_keys\]…) {#jsonextractuintjson-indices-or-keys} +## JSONExtractUInt(json\[, indices_or_keys\]...) {#jsonextractuintjson-indices-or-keys} -## JSONExtractInt(json\[, indices_or_keys\]…) {#jsonextractintjson-indices-or-keys} +## JSONExtractInt(json\[, indices_or_keys\]...) {#jsonextractintjson-indices-or-keys} -## JSONExtractFloat(json\[, indices_or_keys\]…) {#jsonextractfloatjson-indices-or-keys} +## JSONExtractFloat(json\[, indices_or_keys\]...) {#jsonextractfloatjson-indices-or-keys} -## JSONExtractBool(json\[, indices_or_keys\]…) {#jsonextractbooljson-indices-or-keys} +## JSONExtractBool(json\[, indices_or_keys\]...) {#jsonextractbooljson-indices-or-keys} Парсит JSON и извлекает значение. Эти функции аналогичны функциям `visitParam`. @@ -168,7 +168,7 @@ SELECT JSONExtractFloat('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 2) = 200 SELECT JSONExtractUInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', -1) = 300 ``` -## JSONExtractString(json\[, indices_or_keys\]…) {#jsonextractstringjson-indices-or-keys} +## JSONExtractString(json\[, indices_or_keys\]...) {#jsonextractstringjson-indices-or-keys} Парсит JSON и извлекает строку. Эта функция аналогична функции `visitParamExtractString`. @@ -186,7 +186,7 @@ SELECT JSONExtractString('{"abc":"\\u263"}', 'abc') = '' SELECT JSONExtractString('{"abc":"hello}', 'abc') = '' ``` -## JSONExtract(json\[, indices_or_keys…\], Return_type) {#jsonextractjson-indices-or-keys-return-type} +## JSONExtract(json\[, indices_or_keys...\], Return_type) {#jsonextractjson-indices-or-keys-return-type} Парсит JSON и извлекает значение с заданным типом данных. 
@@ -207,7 +207,7 @@ SELECT JSONExtract('{"day": "Thursday"}', 'day', 'Enum8(\'Sunday\' = 0, \'Monday SELECT JSONExtract('{"day": 5}', 'day', 'Enum8(\'Sunday\' = 0, \'Monday\' = 1, \'Tuesday\' = 2, \'Wednesday\' = 3, \'Thursday\' = 4, \'Friday\' = 5, \'Saturday\' = 6)') = 'Friday' ``` -## JSONExtractKeysAndValues(json\[, indices_or_keys…\], Value_type) {#jsonextractkeysandvaluesjson-indices-or-keys-value-type} +## JSONExtractKeysAndValues(json\[, indices_or_keys...\], Value_type) {#jsonextractkeysandvaluesjson-indices-or-keys-value-type} Разбор пар ключ-значение из JSON, где значение имеет тип данных ClickHouse. @@ -255,7 +255,7 @@ text └────────────────────────────────────────────────────────────┘ ``` -## JSONExtractRaw(json\[, indices_or_keys\]…) {#jsonextractrawjson-indices-or-keys} +## JSONExtractRaw(json\[, indices_or_keys\]...) {#jsonextractrawjson-indices-or-keys} Возвращает часть JSON в виде строки, содержащей неразобранную подстроку. @@ -267,7 +267,7 @@ text SELECT JSONExtractRaw('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = '[-100, 200.0, 300]'; ``` -## JSONExtractArrayRaw(json\[, indices_or_keys\]…) {#jsonextractarrayrawjson-indices-or-keys} +## JSONExtractArrayRaw(json\[, indices_or_keys\]...) {#jsonextractarrayrawjson-indices-or-keys} Возвращает массив из элементов JSON массива, каждый из которых представлен в виде строки с неразобранными подстроками из JSON. diff --git a/docs/ru/sql-reference/functions/other-functions.md b/docs/ru/sql-reference/functions/other-functions.md index 835aed934d5..f7637cfa3f7 100644 --- a/docs/ru/sql-reference/functions/other-functions.md +++ b/docs/ru/sql-reference/functions/other-functions.md @@ -286,7 +286,7 @@ SELECT byteSize(NULL, 1, 0.3, ''); Превращает константу в полноценный столбец, содержащий только одно значение. В ClickHouse полноценные столбцы и константы представлены в памяти по-разному. Функции по-разному работают для аргументов-констант и обычных аргументов (выполняется разный код), хотя результат почти всегда должен быть одинаковым. Эта функция предназначена для отладки такого поведения. -## ignore(…) {#ignore} +## ignore(...) {#ignore} Принимает любые аргументы, в т.ч. `NULL`, всегда возвращает 0. При этом, аргумент всё равно вычисляется. Это может использоваться для бенчмарков. diff --git a/docs/ru/sql-reference/functions/string-functions.md b/docs/ru/sql-reference/functions/string-functions.md index eeb5752c626..fc258f7b4cf 100644 --- a/docs/ru/sql-reference/functions/string-functions.md +++ b/docs/ru/sql-reference/functions/string-functions.md @@ -358,7 +358,7 @@ SELECT repeat('abc', 10); Разворачивает последовательность кодовых точек Unicode, при допущении, что строка содержит набор байтов, представляющий текст в кодировке UTF-8. Иначе — что-то делает (не кидает исключение). -## format(pattern, s0, s1, …) {#format} +## format(pattern, s0, s1, ...) {#format} Форматирует константный шаблон со строками, перечисленными в аргументах. `pattern` — упрощенная версия шаблона в языке Python. Шаблон содержит «заменяющие поля», которые окружены фигурными скобками `{}`. Всё, что не содержится в скобках, интерпретируется как обычный текст и просто копируется. Если нужно использовать символ фигурной скобки, можно экранировать двойной скобкой `{{ '{{' }}` или `{{ '}}' }}`. Имя полей могут быть числами (нумерация с нуля) или пустыми (тогда они интерпретируются как последовательные числа). 
diff --git a/docs/ru/sql-reference/functions/string-search-functions.md b/docs/ru/sql-reference/functions/string-search-functions.md index 4f9ae4428a4..53da9a6e791 100644 --- a/docs/ru/sql-reference/functions/string-search-functions.md +++ b/docs/ru/sql-reference/functions/string-search-functions.md @@ -311,19 +311,19 @@ Result: Смотрите `multiSearchAllPositions`. -## multiSearchFirstPosition(haystack, \[needle1, needle2, …, needlen\]) {#multisearchfirstpositionhaystack-needle1-needle2-needlen} +## multiSearchFirstPosition(haystack, \[needle1, needle2, ..., needlen\]) {#multisearchfirstpositionhaystack-needle1-needle2-needlen} Так же, как и `position`, только возвращает оффсет первого вхождения любого из needles. Для поиска без учета регистра и/или в кодировке UTF-8 используйте функции `multiSearchFirstPositionCaseInsensitive, multiSearchFirstPositionUTF8, multiSearchFirstPositionCaseInsensitiveUTF8`. -## multiSearchFirstIndex(haystack, \[needle1, needle2, …, needlen\]) {#multisearchfirstindexhaystack-needle1-needle2-needlen} +## multiSearchFirstIndex(haystack, \[needle1, needle2, ..., needlen\]) {#multisearchfirstindexhaystack-needle1-needle2-needlen} Возвращает индекс `i` (нумерация с единицы) первой найденной строки needlei в строке `haystack` и 0 иначе. Для поиска без учета регистра и/или в кодировке UTF-8 используйте функции `multiSearchFirstIndexCaseInsensitive, multiSearchFirstIndexUTF8, multiSearchFirstIndexCaseInsensitiveUTF8`. -## multiSearchAny(haystack, \[needle1, needle2, …, needlen\]) {#function-multisearchany} +## multiSearchAny(haystack, \[needle1, needle2, ..., needlen\]) {#function-multisearchany} Возвращает 1, если хотя бы одна подстрока needlei нашлась в строке `haystack` и 0 иначе. @@ -343,30 +343,30 @@ Result: Регулярное выражение работает со строкой как с набором байт. Регулярное выражение не может содержать нулевые байты. Для шаблонов на поиск подстроки в строке, лучше используйте LIKE или position, так как они работают существенно быстрее. -## multiMatchAny(haystack, \[pattern1, pattern2, …, patternn\]) {#multimatchanyhaystack-pattern1-pattern2-patternn} +## multiMatchAny(haystack, \[pattern1, pattern2, ..., patternn\]) {#multimatchanyhaystack-pattern1-pattern2-patternn} То же, что и `match`, но возвращает ноль, если ни одно регулярное выражение не подошло и один, если хотя бы одно. Используется библиотека [hyperscan](https://github.com/intel/hyperscan) для соответствия регулярных выражений. Для шаблонов на поиск многих подстрок в строке, лучше используйте `multiSearchAny`, так как она работает существенно быстрее. :::note Примечание Длина любой строки из `haystack` должна быть меньше 232 байт, иначе бросается исключение. Это ограничение связано с ограничением hyperscan API. ::: -## multiMatchAnyIndex(haystack, \[pattern1, pattern2, …, patternn\]) {#multimatchanyindexhaystack-pattern1-pattern2-patternn} +## multiMatchAnyIndex(haystack, \[pattern1, pattern2, ..., patternn\]) {#multimatchanyindexhaystack-pattern1-pattern2-patternn} То же, что и `multiMatchAny`, только возвращает любой индекс подходящего регулярного выражения. -## multiMatchAllIndices(haystack, \[pattern1, pattern2, …, patternn\]) {#multimatchallindiceshaystack-pattern1-pattern2-patternn} +## multiMatchAllIndices(haystack, \[pattern1, pattern2, ..., patternn\]) {#multimatchallindiceshaystack-pattern1-pattern2-patternn} То же, что и `multiMatchAny`, только возвращает массив всех индексов всех подходящих регулярных выражений в любом порядке. 
-## multiFuzzyMatchAny(haystack, distance, \[pattern1, pattern2, …, patternn\]) {#multifuzzymatchanyhaystack-distance-pattern1-pattern2-patternn} +## multiFuzzyMatchAny(haystack, distance, \[pattern1, pattern2, ..., patternn\]) {#multifuzzymatchanyhaystack-distance-pattern1-pattern2-patternn} То же, что и `multiMatchAny`, но возвращает 1 если любой шаблон соответствует haystack в пределах константного [редакционного расстояния](https://en.wikipedia.org/wiki/Edit_distance). Эта функция основана на экспериментальной библиотеке [hyperscan](https://intel.github.io/hyperscan/dev-reference/compilation.html#approximate-matching) и может быть медленной для некоторых частных случаев. Производительность зависит от значения редакционного расстояния и используемых шаблонов, но всегда медленнее по сравнению с non-fuzzy вариантами. -## multiFuzzyMatchAnyIndex(haystack, distance, \[pattern1, pattern2, …, patternn\]) {#multifuzzymatchanyindexhaystack-distance-pattern1-pattern2-patternn} +## multiFuzzyMatchAnyIndex(haystack, distance, \[pattern1, pattern2, ..., patternn\]) {#multifuzzymatchanyindexhaystack-distance-pattern1-pattern2-patternn} То же, что и `multiFuzzyMatchAny`, только возвращает любой индекс подходящего регулярного выражения в пределах константного редакционного расстояния. -## multiFuzzyMatchAllIndices(haystack, distance, \[pattern1, pattern2, …, patternn\]) {#multifuzzymatchallindiceshaystack-distance-pattern1-pattern2-patternn} +## multiFuzzyMatchAllIndices(haystack, distance, \[pattern1, pattern2, ..., patternn\]) {#multifuzzymatchallindiceshaystack-distance-pattern1-pattern2-patternn} То же, что и `multiFuzzyMatchAny`, только возвращает массив всех индексов всех подходящих регулярных выражений в любом порядке в пределах константного редакционного расстояния. diff --git a/docs/ru/sql-reference/functions/tuple-functions.md b/docs/ru/sql-reference/functions/tuple-functions.md index c702e5d00b1..70ae44aa627 100644 --- a/docs/ru/sql-reference/functions/tuple-functions.md +++ b/docs/ru/sql-reference/functions/tuple-functions.md @@ -9,15 +9,15 @@ sidebar_label: Функции для работы с кортежами ## tuple {#tuple} Функция, позволяющая сгруппировать несколько столбцов. -Для столбцов, имеющих типы T1, T2, … возвращает кортеж типа Tuple(T1, T2, …), содержащий эти столбцы. Выполнение функции ничего не стоит. +Для столбцов, имеющих типы T1, T2, ... возвращает кортеж типа Tuple(T1, T2, ...), содержащий эти столбцы. Выполнение функции ничего не стоит. Кортежи обычно используются как промежуточное значение в качестве аргумента операторов IN, или для создания списка формальных параметров лямбда-функций. Кортежи не могут быть записаны в таблицу. -С помощью функции реализуется оператор `(x, y, …)`. +С помощью функции реализуется оператор `(x, y, ...)`. **Синтаксис** ``` sql -tuple(x, y, …) +tuple(x, y, ...) ``` ## tupleElement {#tupleelement} diff --git a/docs/ru/sql-reference/functions/url-functions.md b/docs/ru/sql-reference/functions/url-functions.md index 3c6e6151ef8..087891f4347 100644 --- a/docs/ru/sql-reference/functions/url-functions.md +++ b/docs/ru/sql-reference/functions/url-functions.md @@ -14,7 +14,7 @@ sidebar_label: "Функции для работы с URL" ### protocol {#protocol} -Возвращает протокол. Примеры: http, ftp, mailto, magnet… +Возвращает протокол. Примеры: http, ftp, mailto, magnet... 
### domain {#domain} diff --git a/docs/ru/sql-reference/statements/alter/comment.md b/docs/ru/sql-reference/statements/alter/comment.md index 727af15d03e..f841c8540f3 100644 --- a/docs/ru/sql-reference/statements/alter/comment.md +++ b/docs/ru/sql-reference/statements/alter/comment.md @@ -4,7 +4,7 @@ sidebar_position: 51 sidebar_label: COMMENT --- -# ALTER TABLE … MODIFY COMMENT {#alter-modify-comment} +# ALTER TABLE ... MODIFY COMMENT {#alter-modify-comment} Добавляет, изменяет или удаляет комментарий к таблице, независимо от того, был ли он установлен раньше или нет. Изменение комментария отражается как в системной таблице [system.tables](../../../operations/system-tables/tables.md), так и в результате выполнения запроса `SHOW CREATE TABLE`. diff --git a/docs/ru/sql-reference/statements/alter/delete.md b/docs/ru/sql-reference/statements/alter/delete.md index dc968a17349..c91a79f5cdd 100644 --- a/docs/ru/sql-reference/statements/alter/delete.md +++ b/docs/ru/sql-reference/statements/alter/delete.md @@ -4,7 +4,7 @@ sidebar_position: 39 sidebar_label: DELETE --- -# ALTER TABLE … DELETE {#alter-mutations} +# ALTER TABLE ... DELETE {#alter-mutations} ``` sql ALTER TABLE [db.]table [ON CLUSTER cluster] DELETE WHERE filter_expr diff --git a/docs/ru/sql-reference/statements/alter/index.md b/docs/ru/sql-reference/statements/alter/index.md index 07f5ff0a298..e8b8af39e11 100644 --- a/docs/ru/sql-reference/statements/alter/index.md +++ b/docs/ru/sql-reference/statements/alter/index.md @@ -46,7 +46,7 @@ ALTER TABLE [db].name [ON CLUSTER cluster] ADD|DROP|CLEAR|COMMENT|MODIFY COLUMN ### Мутации {#mutations} -Мутации - разновидность запроса ALTER, позволяющая изменять или удалять данные в таблице. В отличие от стандартных запросов [ALTER TABLE … DELETE](../../../sql-reference/statements/alter/delete.md) и [ALTER TABLE … UPDATE](../../../sql-reference/statements/alter/update.md), рассчитанных на точечное изменение данных, область применения мутаций - достаточно тяжёлые изменения, затрагивающие много строк в таблице. Поддержана для движков таблиц семейства [MergeTree](../../../engines/table-engines/mergetree-family/mergetree.md), в том числе для движков с репликацией. +Мутации - разновидность запроса ALTER, позволяющая изменять или удалять данные в таблице. В отличие от стандартных запросов [ALTER TABLE ... DELETE](../../../sql-reference/statements/alter/delete.md) и [ALTER TABLE ... UPDATE](../../../sql-reference/statements/alter/update.md), рассчитанных на точечное изменение данных, область применения мутаций - достаточно тяжёлые изменения, затрагивающие много строк в таблице. Поддержана для движков таблиц семейства [MergeTree](../../../engines/table-engines/mergetree-family/mergetree.md), в том числе для движков с репликацией. Конвертировать существующие таблицы для работы с мутациями не нужно. Но после применения первой мутации формат данных таблицы становится несовместимым с предыдущими версиями и откатиться на предыдущую версию уже не получится. diff --git a/docs/ru/sql-reference/statements/alter/update.md b/docs/ru/sql-reference/statements/alter/update.md index b2032ac77d1..01574a8a9b7 100644 --- a/docs/ru/sql-reference/statements/alter/update.md +++ b/docs/ru/sql-reference/statements/alter/update.md @@ -4,7 +4,7 @@ sidebar_position: 40 sidebar_label: UPDATE --- -# ALTER TABLE … UPDATE {#alter-table-update-statements} +# ALTER TABLE ... UPDATE {#alter-table-update-statements} ``` sql ALTER TABLE [db.]table [ON CLUSTER cluster] UPDATE column1 = expr1 [, ...] 
WHERE filter_expr diff --git a/docs/ru/sql-reference/statements/alter/view.md b/docs/ru/sql-reference/statements/alter/view.md index e6f6730ff99..53e295f6bbe 100644 --- a/docs/ru/sql-reference/statements/alter/view.md +++ b/docs/ru/sql-reference/statements/alter/view.md @@ -4,9 +4,9 @@ sidebar_position: 50 sidebar_label: VIEW --- -# Выражение ALTER TABLE … MODIFY QUERY {#alter-modify-query} +# Выражение ALTER TABLE ... MODIFY QUERY {#alter-modify-query} -Вы можете изменить запрос `SELECT`, который был задан при создании [материализованного представления](../create/view.md#materialized), с помощью запроса 'ALTER TABLE … MODIFY QUERY'. Используйте его если при создании материализованного представления не использовалась секция `TO [db.]name`. Настройка `allow_experimental_alter_materialized_view_structure` должна быть включена. +Вы можете изменить запрос `SELECT`, который был задан при создании [материализованного представления](../create/view.md#materialized), с помощью запроса 'ALTER TABLE ... MODIFY QUERY'. Используйте его если при создании материализованного представления не использовалась секция `TO [db.]name`. Настройка `allow_experimental_alter_materialized_view_structure` должна быть включена. Если при создании материализованного представления использовалась конструкция `TO [db.]name`, то для изменения отсоедините представление с помощью [DETACH](../detach.md), измените таблицу с помощью [ALTER TABLE](index.md), а затем снова присоедините запрос с помощью [ATTACH](../attach.md). diff --git a/docs/ru/sql-reference/statements/create/view.md b/docs/ru/sql-reference/statements/create/view.md index 032bdc6e6d4..8fa30446bb3 100644 --- a/docs/ru/sql-reference/statements/create/view.md +++ b/docs/ru/sql-reference/statements/create/view.md @@ -60,7 +60,7 @@ AS SELECT ... Если указано `POPULATE`, то при создании представления в него будут добавлены данные, уже содержащиеся в исходной таблице, как если бы был сделан запрос `CREATE TABLE ... AS SELECT ...` . Если `POPULATE` не указано, представление будет содержать только данные, добавленные в таблицу после создания представления. Использовать `POPULATE` не рекомендуется, так как в представление не попадут данные, добавляемые в таблицу во время создания представления. -Запрос `SELECT` может содержать `DISTINCT`, `GROUP BY`, `ORDER BY`, `LIMIT`… Следует иметь ввиду, что соответствующие преобразования будут выполняться независимо, на каждый блок вставляемых данных. Например, при наличии `GROUP BY`, данные будут агрегироваться при вставке, но только в рамках одной пачки вставляемых данных. Далее, данные не будут доагрегированы. Исключение - использование ENGINE, производящего агрегацию данных самостоятельно, например, `SummingMergeTree`. +Запрос `SELECT` может содержать `DISTINCT`, `GROUP BY`, `ORDER BY`, `LIMIT`... Следует иметь ввиду, что соответствующие преобразования будут выполняться независимо, на каждый блок вставляемых данных. Например, при наличии `GROUP BY`, данные будут агрегироваться при вставке, но только в рамках одной пачки вставляемых данных. Далее, данные не будут доагрегированы. Исключение - использование ENGINE, производящего агрегацию данных самостоятельно, например, `SummingMergeTree`. Выполнение запросов [ALTER](../../../sql-reference/statements/alter/view.md) над материализованными представлениями имеет свои особенности, поэтому эти запросы могут быть неудобными для использования. 
Если материализованное представление использует конструкцию `TO [db.]name`, то можно выполнить `DETACH` представления, `ALTER` для целевой таблицы и последующий `ATTACH` ранее отсоединенного (`DETACH`) представления. diff --git a/docs/ru/sql-reference/statements/insert-into.md b/docs/ru/sql-reference/statements/insert-into.md index 747e36b8809..309d4852b11 100644 --- a/docs/ru/sql-reference/statements/insert-into.md +++ b/docs/ru/sql-reference/statements/insert-into.md @@ -73,7 +73,7 @@ INSERT INTO insert_select_testtable VALUES (1, DEFAULT, 1) ; INSERT INTO [db.]table [(c1, c2, c3)] FORMAT format_name data_set ``` -Например, следующий формат запроса идентичен базовому варианту INSERT … VALUES: +Например, следующий формат запроса идентичен базовому варианту INSERT ... VALUES: ``` sql INSERT INTO [db.]table [(c1, c2, c3)] FORMAT Values (v11, v12, v13), (v21, v22, v23), ... diff --git a/docs/ru/sql-reference/table-functions/file.md b/docs/ru/sql-reference/table-functions/file.md index 5331cf00728..546a674d41a 100644 --- a/docs/ru/sql-reference/table-functions/file.md +++ b/docs/ru/sql-reference/table-functions/file.md @@ -116,7 +116,7 @@ SELECT count(*) FROM file('{some,another}_dir/*', 'TSV', 'name String, value UIn **Пример** -Запрос данных из файлов с именами `file000`, `file001`, … , `file999`: +Запрос данных из файлов с именами `file000`, `file001`, ... , `file999`: ``` sql SELECT count(*) FROM file('big_dir/file{0..9}{0..9}{0..9}', 'CSV', 'name String, value UInt32'); diff --git a/docs/ru/sql-reference/table-functions/s3.md b/docs/ru/sql-reference/table-functions/s3.md index fe40cb0c507..2847a95bf19 100644 --- a/docs/ru/sql-reference/table-functions/s3.md +++ b/docs/ru/sql-reference/table-functions/s3.md @@ -108,7 +108,7 @@ FROM s3('https://storage.yandexcloud.net/my-test-bucket-768/{some,another}_prefi Если список файлов содержит диапазоны чисел с ведущими нулями, используйте конструкцию с фигурными скобками для каждой цифры отдельно или используйте `?`. ::: -Подсчитаем общее количество строк в файлах с именами `file-000.csv`, `file-001.csv`, … , `file-999.csv`: +Подсчитаем общее количество строк в файлах с именами `file-000.csv`, `file-001.csv`, ... , `file-999.csv`: ``` sql SELECT count(*) diff --git a/docs/zh/changelog/index.md b/docs/zh/changelog/index.md index 7afcc07c6fb..c91d8bcf4d1 100644 --- a/docs/zh/changelog/index.md +++ b/docs/zh/changelog/index.md @@ -190,7 +190,7 @@ sidebar_label: "\u53D8\u66F4\u65E5\u5FD7" - 如果在获取系统数据时发生了zookeeper异常。副本,将其显示在单独的列中。 这实现了 [#9137](https://github.com/ClickHouse/ClickHouse/issues/9137) [#9138](https://github.com/ClickHouse/ClickHouse/pull/9138) ([阿列克谢-米洛维多夫](https://github.com/alexey-milovidov)) - 原子删除destroy上的MergeTree数据部分。 [#8402](https://github.com/ClickHouse/ClickHouse/pull/8402) ([Vladimir Chebotarev](https://github.com/excitoon)) - 支持分布式表的行级安全性。 [#8926](https://github.com/ClickHouse/ClickHouse/pull/8926) ([伊万](https://github.com/abyss7)) -- Now we recognize suffix (like KB, KiB…) in settings values. [#8072](https://github.com/ClickHouse/ClickHouse/pull/8072) ([米哈伊尔\*科罗托夫](https://github.com/millb)) +- Now we recognize suffix (like KB, KiB...) in settings values. [#8072](https://github.com/ClickHouse/ClickHouse/pull/8072) ([米哈伊尔\*科罗托夫](https://github.com/millb)) - 在构建大型连接的结果时防止内存不足。 [#8637](https://github.com/ClickHouse/ClickHouse/pull/8637) ([Artem Zuikov](https://github.com/4ertus2)) - 在交互模式下为建议添加群集名称 `clickhouse-client`. 
[#8709](https://github.com/ClickHouse/ClickHouse/pull/8709) ([阿列克谢-米洛维多夫](https://github.com/alexey-milovidov)) - Initialize query profiler for all threads in a group, e.g. it allows to fully profile insert-queries [#8820](https://github.com/ClickHouse/ClickHouse/pull/8820) ([伊万](https://github.com/abyss7)) @@ -523,7 +523,7 @@ sidebar_label: "\u53D8\u66F4\u65E5\u5FD7" - 现在后台在磁盘之间移动,运行它的seprate线程池。 [#7670](https://github.com/ClickHouse/ClickHouse/pull/7670) ([Vladimir Chebotarev](https://github.com/excitoon)) - `SYSTEM RELOAD DICTIONARY` 现在同步执行。 [#8240](https://github.com/ClickHouse/ClickHouse/pull/8240) ([维塔利\*巴拉诺夫](https://github.com/vitlibar)) - 堆栈跟踪现在显示物理地址(对象文件中的偏移量),而不是虚拟内存地址(加载对象文件的位置)。 这允许使用 `addr2line` 当二进制独立于位置并且ASLR处于活动状态时。 这修复 [#8360](https://github.com/ClickHouse/ClickHouse/issues/8360). [#8387](https://github.com/ClickHouse/ClickHouse/pull/8387) ([阿列克谢-米洛维多夫](https://github.com/alexey-milovidov)) -- 支持行级安全筛选器的新语法: `…
`. 修复 [#5779](https://github.com/ClickHouse/ClickHouse/issues/5779). [#8381](https://github.com/ClickHouse/ClickHouse/pull/8381) ([伊万](https://github.com/abyss7)) +- 支持行级安全筛选器的新语法: `...
`. 修复 [#5779](https://github.com/ClickHouse/ClickHouse/issues/5779). [#8381](https://github.com/ClickHouse/ClickHouse/pull/8381) ([伊万](https://github.com/abyss7)) - 现在 `cityHash` 功能可以与工作 `Decimal` 和 `UUID` 类型。 修复 [#5184](https://github.com/ClickHouse/ClickHouse/issues/5184). [#7693](https://github.com/ClickHouse/ClickHouse/pull/7693) ([米哈伊尔\*科罗托夫](https://github.com/millb)) - 从系统日志中删除了固定的索引粒度(它是1024),因为它在实现自适应粒度之后已经过时。 [#7698](https://github.com/ClickHouse/ClickHouse/pull/7698) ([阿列克谢-米洛维多夫](https://github.com/alexey-milovidov)) - 当ClickHouse在没有SSL的情况下编译时,启用MySQL兼容服务器。 [#7852](https://github.com/ClickHouse/ClickHouse/pull/7852) ([尤里\*巴拉诺夫](https://github.com/yurriy)) diff --git a/docs/zh/development/style.md b/docs/zh/development/style.md index c0a08291e02..724b22ad461 100644 --- a/docs/zh/development/style.md +++ b/docs/zh/development/style.md @@ -53,7 +53,7 @@ memcpy(&buf[place_value], &x, sizeof(x)); for (size_t i = 0; i < rows; i += storage.index_granularity) ``` -**7.** 在二元运算符(`+`,`-`,`*`,`/`,`%`,…)和三元运算符 `?:` 周围添加空格。 +**7.** 在二元运算符(`+`,`-`,`*`,`/`,`%`,...)和三元运算符 `?:` 周围添加空格。 ``` cpp UInt16 year = (s[0] - '0') * 1000 + (s[1] - '0') * 100 + (s[2] - '0') * 10 + (s[3] - '0'); @@ -82,7 +82,7 @@ dst.ClickGoodEvent = click.GoodEvent; 如有必要,运算符可以包裹到下一行。 在这种情况下,它前面的偏移量增加。 -**11.** 不要使用空格来分开一元运算符 (`--`, `++`, `*`, `&`, …) 和参数。 +**11.** 不要使用空格来分开一元运算符 (`--`, `++`, `*`, `&`, ...) 和参数。 **12.** 在逗号后面加一个空格,而不是在之前。同样的规则也适合 `for` 循环中的分号。 @@ -111,7 +111,7 @@ public: **16.** 如果对整个文件使用相同的 `namespace`,并且没有其他重要的东西,则 `namespace` 中不需要偏移量。 -**17.** 在 `if`, `for`, `while` 中包裹的代码块中,若代码是一个单行的 `statement`,那么大括号是可选的。 可以将 `statement` 放到一行中。这个规则同样适用于嵌套的 `if`, `for`, `while`, … +**17.** 在 `if`, `for`, `while` 中包裹的代码块中,若代码是一个单行的 `statement`,那么大括号是可选的。 可以将 `statement` 放到一行中。这个规则同样适用于嵌套的 `if`, `for`, `while`, ... 但是如果内部 `statement` 包含大括号或 `else`,则外部块应该用大括号括起来。 @@ -262,7 +262,7 @@ void executeQuery( 这个示例来源于 http://home.tamk.fi/~jaalto/course/coding-style/doc/unmaintainable-code/。 -**7.** 不要在每个文件的开头写入垃圾注释(作者,创建日期…)。 +**7.** 不要在每个文件的开头写入垃圾注释(作者,创建日期...)。 **8.** 单行注释用三个斜杆: `///` ,多行注释以 `/**`开始。 这些注释会当做文档。 diff --git a/docs/zh/engines/table-engines/integrations/hdfs.md b/docs/zh/engines/table-engines/integrations/hdfs.md index 55648afe407..be673b6ce92 100644 --- a/docs/zh/engines/table-engines/integrations/hdfs.md +++ b/docs/zh/engines/table-engines/integrations/hdfs.md @@ -103,7 +103,7 @@ CREATE TABLE table_with_asterisk (name String, value UInt32) ENGINE = HDFS('hdfs **示例** -创建具有名为文件的表 `file000`, `file001`, … , `file999`: +创建具有名为文件的表 `file000`, `file001`, ... , `file999`: ``` sql CREARE TABLE big_table (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/big_dir/file{0..9}{0..9}{0..9}', 'CSV') diff --git a/docs/zh/engines/table-engines/integrations/s3.md b/docs/zh/engines/table-engines/integrations/s3.md index f2585decabf..f18814675c3 100644 --- a/docs/zh/engines/table-engines/integrations/s3.md +++ b/docs/zh/engines/table-engines/integrations/s3.md @@ -109,7 +109,7 @@ CREATE TABLE table_with_asterisk (name String, value UInt32) ENGINE = S3('https: **示例** -使用文件`file-000.csv`, `file-001.csv`, … , `file-999.csv`来创建表: +使用文件`file-000.csv`, `file-001.csv`, ... , `file-999.csv`来创建表: ``` sql CREATE TABLE big_table (name String, value UInt32) ENGINE = S3('https://storage.yandexcloud.net/my-test-bucket-768/big_prefix/file-{000..999}.csv', 'CSV'); @@ -202,7 +202,7 @@ ENGINE = S3('https://storage.yandexcloud.net/my-test-bucket-768/{some,another}_p !!! warning "Warning" 如果文件列表中包含有从0开头的数字范围,请对每个数字分别使用带括号的结构,或者使用`?`. -4. 
从文件`file-000.csv`, `file-001.csv`, … , `file-999.csv`创建表: +4. 从文件`file-000.csv`, `file-001.csv`, ... , `file-999.csv`创建表: ``` sql CREATE TABLE big_table (name String, value UInt32) diff --git a/docs/zh/engines/table-engines/mergetree-family/custom-partitioning-key.md b/docs/zh/engines/table-engines/mergetree-family/custom-partitioning-key.md index 4fecf4e5669..e283a4c7510 100644 --- a/docs/zh/engines/table-engines/mergetree-family/custom-partitioning-key.md +++ b/docs/zh/engines/table-engines/mergetree-family/custom-partitioning-key.md @@ -59,7 +59,7 @@ WHERE table = 'visits' └───────────┴────────────────┴────────┘ ``` -`partition` 列存储分区的名称。此示例中有两个分区:`201901` 和 `201902`。在 [ALTER … PARTITION](#alter_manipulations-with-partitions) 语句中你可以使用该列值来指定分区名称。 +`partition` 列存储分区的名称。此示例中有两个分区:`201901` 和 `201902`。在 [ALTER ... PARTITION](#alter_manipulations-with-partitions) 语句中你可以使用该列值来指定分区名称。 `name` 列为分区中数据片段的名称。在 [ALTER ATTACH PART](#alter_attach-partition) 语句中你可以使用此列值中来指定片段名称。 diff --git a/docs/zh/engines/table-engines/mergetree-family/mergetree.md b/docs/zh/engines/table-engines/mergetree-family/mergetree.md index bfa69338657..67bd681269b 100644 --- a/docs/zh/engines/table-engines/mergetree-family/mergetree.md +++ b/docs/zh/engines/table-engines/mergetree-family/mergetree.md @@ -702,7 +702,7 @@ SETTINGS storage_policy = 'moving_from_ssd_to_hdd' - 插入(`INSERT`查询) - 后台合并和[数据变异](../../../sql-reference/statements/alter.md#alter-mutations) - 从另一个副本下载 -- [ALTER TABLE … FREEZE PARTITION](../../../sql-reference/statements/alter.md#alter_freeze-partition) 冻结分区 +- [ALTER TABLE ... FREEZE PARTITION](../../../sql-reference/statements/alter.md#alter_freeze-partition) 冻结分区 除了数据变异和冻结分区以外的情况下,数据按照以下逻辑存储到卷或磁盘上: @@ -713,7 +713,7 @@ SETTINGS storage_policy = 'moving_from_ssd_to_hdd' 在后台,数据片段基于剩余空间(`move_factor`参数)根据卷在配置文件中定义的顺序进行转移。数据永远不会从最后一个移出也不会从第一个移入。可以通过系统表 [system.part_log](../../../operations/system-tables/part_log.md#system_tables-part-log) (字段 `type = MOVE_PART`) 和 [system.parts](../../../operations/system-tables/parts.md#system_tables-parts) (字段 `path` 和 `disk`) 来监控后台的移动情况。具体细节可以通过服务器日志查看。 -用户可以通过 [ALTER TABLE … MOVE PART\|PARTITION … TO VOLUME\|DISK …](../../../sql-reference/statements/alter.md#alter_move-partition) 强制移动一个数据片段或分区到另外一个卷,所有后台移动的限制都会被考虑在内。这个查询会自行启动,无需等待后台操作完成。如果没有足够的可用空间或任何必须条件没有被满足,用户会收到报错信息。 +用户可以通过 [ALTER TABLE ... MOVE PART\|PARTITION ... TO VOLUME\|DISK ...](../../../sql-reference/statements/alter.md#alter_move-partition) 强制移动一个数据片段或分区到另外一个卷,所有后台移动的限制都会被考虑在内。这个查询会自行启动,无需等待后台操作完成。如果没有足够的可用空间或任何必须条件没有被满足,用户会收到报错信息。 数据移动不会妨碍到数据复制。也就是说,同一张表的不同副本可以指定不同的存储策略。 diff --git a/docs/zh/engines/table-engines/special/external-data.md b/docs/zh/engines/table-engines/special/external-data.md index 688e25402ab..06c6331b4f3 100644 --- a/docs/zh/engines/table-engines/special/external-data.md +++ b/docs/zh/engines/table-engines/special/external-data.md @@ -26,7 +26,7 @@ ClickHouse 允许向服务器发送处理查询所需的数据以及 SELECT 查 以下的参数是可选的:**–name** – 表的名称,如果省略,则采用 _data。 **–format** – 文件中的数据格式。 如果省略,则使用 TabSeparated。 -以下的参数必选一个:**–types** – 逗号分隔列类型的列表。例如:`UInt64,String`。列将被命名为 _1,_2,… +以下的参数必选一个:**–types** – 逗号分隔列类型的列表。例如:`UInt64,String`。列将被命名为 _1,_2,... 
**–structure**– 表结构的格式 `UserID UInt64`,`URL String`。定义列的名字以及类型。 在 «file» 中指定的文件将由 «format» 中指定的格式解析,使用在 «types» 或 «structure» 中指定的数据类型。该表将被上传到服务器,并在作为名称为 «name»临时表。 diff --git a/docs/zh/faq/general/olap.md b/docs/zh/faq/general/olap.md index b014419578b..c4b36b138fa 100644 --- a/docs/zh/faq/general/olap.md +++ b/docs/zh/faq/general/olap.md @@ -10,13 +10,13 @@ sidebar_position: 100 [OLAP](https://en.wikipedia.org/wiki/Online_analytical_processing) stands for Online Analytical Processing. It is a broad term that can be looked at from two perspectives: technical and business. But at the very high level, you can just read these words backward: Processing -: Some source data is processed… +: Some source data is processed... Analytical -: …to produce some analytical reports and insights… +: ...to produce some analytical reports and insights... Online -: …in real-time. +: ...in real-time. ## OLAP from the Business Perspective {#olap-from-the-business-perspective} diff --git a/docs/zh/getting-started/example-datasets/nyc-taxi.md b/docs/zh/getting-started/example-datasets/nyc-taxi.md index 9c487140df3..ceeb6fbb9e0 100644 --- a/docs/zh/getting-started/example-datasets/nyc-taxi.md +++ b/docs/zh/getting-started/example-datasets/nyc-taxi.md @@ -196,7 +196,7 @@ real 75m56.214s (也可以直接使用`COPY ... TO PROGRAM`从Postgres中导入数据) -数据中所有与天气相关的字段(precipitation……average_wind_speed)都填充了NULL。 所以,我们将从最终数据集中删除它们 +数据中所有与天气相关的字段(precipitation...average_wind_speed)都填充了NULL。 所以,我们将从最终数据集中删除它们 首先,我们使用单台服务器创建表,后面我们将在多台节点上创建这些表。 diff --git a/docs/zh/getting-started/example-datasets/uk-price-paid.mdx b/docs/zh/getting-started/example-datasets/uk-price-paid.mdx index ecfdcddbbe2..7d4c299b919 100644 --- a/docs/zh/getting-started/example-datasets/uk-price-paid.mdx +++ b/docs/zh/getting-started/example-datasets/uk-price-paid.mdx @@ -212,7 +212,7 @@ ORDER BY year └──────┴─────────┴───────────────────────────────────────────────────────┘ ``` -2020 年房价出事了!但这并不令人意外…… +2020 年房价出事了!但这并不令人意外... ### 查询 3. 最昂贵的社区 {#most-expensive-neighborhoods} diff --git a/docs/zh/guides/improving-query-performance/sparse-primary-indexes.md b/docs/zh/guides/improving-query-performance/sparse-primary-indexes.md index 758992e4084..975d5eb764c 100644 --- a/docs/zh/guides/improving-query-performance/sparse-primary-indexes.md +++ b/docs/zh/guides/improving-query-performance/sparse-primary-indexes.md @@ -371,7 +371,7 @@ UserID.bin,URL.bin,和EventTime.bin是UserID :::note - 最后一个索引条目(上图中的“mark 1082”)存储了上图中颗粒1082的主键列的最大值。 -- 索引条目(索引标记)不是基于表中的特定行,而是基于颗粒。例如,对于上图中的索引条目‘mark 0’,在我们的表中没有UserID为240.923且URL为“goal://metry=10000467796a411…”的行,相反,对于该表,有一个颗粒0,在该颗粒中,最小UserID值是240.923,最小URL值是“goal://metry=10000467796a411…”,这两个值来自不同的行。 +- 索引条目(索引标记)不是基于表中的特定行,而是基于颗粒。例如,对于上图中的索引条目‘mark 0’,在我们的表中没有UserID为240.923且URL为“goal://metry=10000467796a411...”的行,相反,对于该表,有一个颗粒0,在该颗粒中,最小UserID值是240.923,最小URL值是“goal://metry=10000467796a411...”,这两个值来自不同的行。 - 主索引文件完全加载到主内存中。如果文件大于可用的空闲内存空间,则ClickHouse将发生错误。 ::: diff --git a/docs/zh/index.md b/docs/zh/index.md index fab00dbcd1b..ec4b6dce1f8 100644 --- a/docs/zh/index.md +++ b/docs/zh/index.md @@ -16,7 +16,7 @@ ClickHouse是一个用于联机分析(OLAP)的列式数据库管理系统(DBMS) | #0 | 89354350662 | 1 | Investor Relations | 1 | 2016-05-18 05:19:20 | | #1 | 90329509958 | 0 | Contact us | 1 | 2016-05-18 08:10:20 | | #2 | 89953706054 | 1 | Mission | 1 | 2016-05-18 07:38:00 | -| #N | … | … | … | … | … | +| #N | ... | ... | ... | ... | ... 
| 处于同一行中的数据总是被物理的存储在一起。 @@ -26,11 +26,11 @@ ClickHouse是一个用于联机分析(OLAP)的列式数据库管理系统(DBMS) | Row: | #0 | #1 | #2 | #N | |-------------|---------------------|---------------------|---------------------|-----| -| WatchID: | 89354350662 | 90329509958 | 89953706054 | … | -| JavaEnable: | 1 | 0 | 1 | … | -| Title: | Investor Relations | Contact us | Mission | … | -| GoodEvent: | 1 | 1 | 1 | … | -| EventTime: | 2016-05-18 05:19:20 | 2016-05-18 08:10:20 | 2016-05-18 07:38:00 | … | +| WatchID: | 89354350662 | 90329509958 | 89953706054 | ... | +| JavaEnable: | 1 | 0 | 1 | ... | +| Title: | Investor Relations | Contact us | Mission | ... | +| GoodEvent: | 1 | 1 | 1 | ... | +| EventTime: | 2016-05-18 05:19:20 | 2016-05-18 08:10:20 | 2016-05-18 07:38:00 | ... | 这些示例只显示了数据的排列顺序。来自不同列的值被单独存储,来自同一列的数据被存储在一起。 diff --git a/docs/zh/operations/settings/query-complexity.md b/docs/zh/operations/settings/query-complexity.md index 124d5fa5d1a..b1b5ca75018 100644 --- a/docs/zh/operations/settings/query-complexity.md +++ b/docs/zh/operations/settings/query-complexity.md @@ -196,7 +196,7 @@ Restrictions on the «maximum amount of something» can take the value 0, which Limits the number of rows in the hash table that is used when joining tables. -This settings applies to [SELECT … JOIN](../../sql-reference/statements/select/join.md#select-join) operations and the [Join](../../engines/table-engines/special/join.md) table engine. +This settings applies to [SELECT ... JOIN](../../sql-reference/statements/select/join.md#select-join) operations and the [Join](../../engines/table-engines/special/join.md) table engine. If a query contains multiple joins, ClickHouse checks this setting for every intermediate result. @@ -213,7 +213,7 @@ Default value: 0. Limits the size in bytes of the hash table used when joining tables. -This settings applies to [SELECT … JOIN](../../sql-reference/statements/select/join.md#select-join) operations and [Join table engine](../../engines/table-engines/special/join.md). +This settings applies to [SELECT ... JOIN](../../sql-reference/statements/select/join.md#select-join) operations and [Join table engine](../../engines/table-engines/special/join.md). If the query contains joins, ClickHouse checks this setting for every intermediate result. diff --git a/docs/zh/operations/settings/settings.md b/docs/zh/operations/settings/settings.md index c3b4194ed44..5e59196f56c 100644 --- a/docs/zh/operations/settings/settings.md +++ b/docs/zh/operations/settings/settings.md @@ -1002,7 +1002,7 @@ ClickHouse生成异常 ## count_distinct_implementation {#settings-count_distinct_implementation} -指定其中的 `uniq*` 函数应用于执行 [COUNT(DISTINCT …)](../../sql-reference/aggregate-functions/reference/count.md#agg_function-count) 建筑。 +指定其中的 `uniq*` 函数应用于执行 [COUNT(DISTINCT ...)](../../sql-reference/aggregate-functions/reference/count.md#agg_function-count) 建筑。 可能的值: diff --git a/docs/zh/operations/system-tables/dictionaries.md b/docs/zh/operations/system-tables/dictionaries.md index 0cf91e45e86..c7b1bdd04be 100644 --- a/docs/zh/operations/system-tables/dictionaries.md +++ b/docs/zh/operations/system-tables/dictionaries.md @@ -21,7 +21,7 @@ machine_translated_rev: 5decc73b5dc60054f19087d3690c4eb99446a6c3 - `FAILED_AND_RELOADING` — Could not load the dictionary as a result of an error and is loading now. - `origin` ([字符串](../../sql-reference/data-types/string.md)) — Path to the configuration file that describes the dictionary. - `type` ([字符串](../../sql-reference/data-types/string.md)) — Type of dictionary allocation. 
[在内存中存储字典](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md). -- `key` — [密钥类型](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md#ext_dict_structure-key):数字键 ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) or Сomposite key ([字符串](../../sql-reference/data-types/string.md)) — form “(type 1, type 2, …, type n)”. +- `key` — [密钥类型](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md#ext_dict_structure-key):数字键 ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) or Сomposite key ([字符串](../../sql-reference/data-types/string.md)) — form “(type 1, type 2, ..., type n)”. - `attribute.names` ([阵列](../../sql-reference/data-types/array.md)([字符串](../../sql-reference/data-types/string.md))) — Array of [属性名称](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md#ext_dict_structure-attributes) 由字典提供。 - `attribute.types` ([阵列](../../sql-reference/data-types/array.md)([字符串](../../sql-reference/data-types/string.md))) — Corresponding array of [属性类型](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md#ext_dict_structure-attributes) 这是由字典提供。 - `bytes_allocated` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Amount of RAM allocated for the dictionary. diff --git a/docs/zh/sql-reference/aggregate-functions/parametric-functions.md b/docs/zh/sql-reference/aggregate-functions/parametric-functions.md index cb1dcc35f5c..27d3375aebb 100644 --- a/docs/zh/sql-reference/aggregate-functions/parametric-functions.md +++ b/docs/zh/sql-reference/aggregate-functions/parametric-functions.md @@ -80,7 +80,7 @@ FROM 在这种情况下,您应该记住您不知道直方图bin边界。 -## sequenceMatch(pattern)(timestamp, cond1, cond2, …) {#function-sequencematch} +## sequenceMatch(pattern)(timestamp, cond1, cond2, ...) {#function-sequencematch} 检查序列是否包含与模式匹配的事件链。 @@ -167,7 +167,7 @@ SELECT sequenceMatch('(?1)(?2)')(time, number = 1, number = 2, number = 4) FROM - [sequenceCount](#function-sequencecount) -## sequenceCount(pattern)(time, cond1, cond2, …) {#function-sequencecount} +## sequenceCount(pattern)(time, cond1, cond2, ...) {#function-sequencecount} 计算与模式匹配的事件链的数量。该函数搜索不重叠的事件链。当前链匹配后,它开始搜索下一个链。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/quantiles.md b/docs/zh/sql-reference/aggregate-functions/reference/quantiles.md index 4dce65af1ed..253eb9ef82d 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/quantiles.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/quantiles.md @@ -7,7 +7,7 @@ sidebar_position: 201 **语法** ``` sql -quantiles(level1, level2, …)(x) +quantiles(level1, level2, ...)(x) ``` 所有分位数函数(quantile)也有相应的分位数(quantiles)函数: `quantiles`, `quantilesDeterministic`, `quantilesTiming`, `quantilesTimingWeighted`, `quantilesExact`, `quantilesExactWeighted`, `quantilesTDigest`。 这些函数一次计算所列的级别的所有分位数, 并返回结果值的数组。 diff --git a/docs/zh/sql-reference/data-types/aggregatefunction.md b/docs/zh/sql-reference/data-types/aggregatefunction.md index e8f28b367a5..80648eb165b 100644 --- a/docs/zh/sql-reference/data-types/aggregatefunction.md +++ b/docs/zh/sql-reference/data-types/aggregatefunction.md @@ -1,7 +1,7 @@ --- slug: /zh/sql-reference/data-types/aggregatefunction --- -# AggregateFunction(name, types_of_arguments…) {#data-type-aggregatefunction} +# AggregateFunction(name, types_of_arguments...) 
{#data-type-aggregatefunction} 聚合函数的中间状态,可以通过聚合函数名称加`-State`后缀的形式得到它。与此同时,当您需要访问该类型的最终状态数据时,您需要以相同的聚合函数名加`-Merge`后缀的形式来得到最终状态数据。 diff --git a/docs/zh/sql-reference/data-types/domains/index.md b/docs/zh/sql-reference/data-types/domains/index.md index c123b10f6fe..9f12018732b 100644 --- a/docs/zh/sql-reference/data-types/domains/index.md +++ b/docs/zh/sql-reference/data-types/domains/index.md @@ -19,9 +19,9 @@ Domain类型是特定实现的类型,它总是与某个现存的基础类型 ### Domains的额外特性 {#domainsde-e-wai-te-xing} - 在执行SHOW CREATE TABLE 或 DESCRIBE TABLE时,其对应的列总是展示为Domain类型的名称 -- 在INSERT INTO domain_table(domain_column) VALUES(…)中输入数据总是以更人性化的格式进行输入 +- 在INSERT INTO domain_table(domain_column) VALUES(...)中输入数据总是以更人性化的格式进行输入 - 在SELECT domain_column FROM domain_table中数据总是以更人性化的格式输出 -- 在INSERT INTO domain_table FORMAT CSV …中,实现外部源数据以更人性化的格式载入 +- 在INSERT INTO domain_table FORMAT CSV ...中,实现外部源数据以更人性化的格式载入 ### Domains类型的限制 {#domainslei-xing-de-xian-zhi} diff --git a/docs/zh/sql-reference/data-types/fixedstring.md b/docs/zh/sql-reference/data-types/fixedstring.md index 633307938a9..d454e935fe7 100644 --- a/docs/zh/sql-reference/data-types/fixedstring.md +++ b/docs/zh/sql-reference/data-types/fixedstring.md @@ -18,8 +18,8 @@ slug: /zh/sql-reference/data-types/fixedstring 可以有效存储在`FixedString`类型的列中的值的示例: - 二进制表示的IP地址(IPv6使用`FixedString(16)`) -- 语言代码(ru_RU, en_US … ) -- 货币代码(USD, RUB … ) +- 语言代码(ru_RU, en_US ... ) +- 货币代码(USD, RUB ... ) - 二进制表示的哈希值(MD5使用`FixedString(16)`,SHA256使用`FixedString(32)`) 请使用[UUID](uuid.md)数据类型来存储UUID值,。 diff --git a/docs/zh/sql-reference/data-types/nested-data-structures/nested.md b/docs/zh/sql-reference/data-types/nested-data-structures/nested.md index 5ef8256b483..57b30de0881 100644 --- a/docs/zh/sql-reference/data-types/nested-data-structures/nested.md +++ b/docs/zh/sql-reference/data-types/nested-data-structures/nested.md @@ -1,7 +1,7 @@ --- slug: /zh/sql-reference/data-types/nested-data-structures/nested --- -# Nested(Name1 Type1, Name2 Type2, …) {#nestedname1-type1-name2-type2} +# Nested(Name1 Type1, Name2 Type2, ...) 
{#nestedname1-type1-name2-type2} 嵌套数据结构类似于嵌套表。嵌套数据结构的参数(列名和类型)与 CREATE 查询类似。每个表可以包含任意多行嵌套数据结构。 diff --git a/docs/zh/sql-reference/data-types/simpleaggregatefunction.md b/docs/zh/sql-reference/data-types/simpleaggregatefunction.md index 601cb602a78..fbaa76365ec 100644 --- a/docs/zh/sql-reference/data-types/simpleaggregatefunction.md +++ b/docs/zh/sql-reference/data-types/simpleaggregatefunction.md @@ -3,7 +3,7 @@ slug: /zh/sql-reference/data-types/simpleaggregatefunction --- # SimpleAggregateFunction {#data-type-simpleaggregatefunction} -`SimpleAggregateFunction(name, types_of_arguments…)` 数据类型存储聚合函数的当前值, 并不像 [`AggregateFunction`](../../sql-reference/data-types/aggregatefunction.md) 那样存储其全部状态。这种优化可以应用于具有以下属性函数: 将函数 `f` 应用于行集合 `S1 UNION ALL S2` 的结果,可以通过将 `f` 分别应用于行集合的部分, 然后再将 `f` 应用于结果来获得: `f(S1 UNION ALL S2) = f(f(S1) UNION ALL f(S2))`。 这个属性保证了部分聚合结果足以计算出合并的结果,所以我们不必存储和处理任何额外的数据。 +`SimpleAggregateFunction(name, types_of_arguments...)` 数据类型存储聚合函数的当前值, 并不像 [`AggregateFunction`](../../sql-reference/data-types/aggregatefunction.md) 那样存储其全部状态。这种优化可以应用于具有以下属性函数: 将函数 `f` 应用于行集合 `S1 UNION ALL S2` 的结果,可以通过将 `f` 分别应用于行集合的部分, 然后再将 `f` 应用于结果来获得: `f(S1 UNION ALL S2) = f(f(S1) UNION ALL f(S2))`。 这个属性保证了部分聚合结果足以计算出合并的结果,所以我们不必存储和处理任何额外的数据。 支持以下聚合函数: diff --git a/docs/zh/sql-reference/data-types/tuple.md b/docs/zh/sql-reference/data-types/tuple.md index 004c80ff916..38813701c70 100644 --- a/docs/zh/sql-reference/data-types/tuple.md +++ b/docs/zh/sql-reference/data-types/tuple.md @@ -1,7 +1,7 @@ --- slug: /zh/sql-reference/data-types/tuple --- -# Tuple(T1, T2, …) {#tuplet1-t2} +# Tuple(T1, T2, ...) {#tuplet1-t2} 元组,其中每个元素都有单独的 [类型](index.md#data_types)。 diff --git a/docs/zh/sql-reference/functions/array-functions.md b/docs/zh/sql-reference/functions/array-functions.md index d150b94b8af..69db34e4a36 100644 --- a/docs/zh/sql-reference/functions/array-functions.md +++ b/docs/zh/sql-reference/functions/array-functions.md @@ -152,7 +152,7 @@ SELECT range(5), range(1, 5), range(1, 5, 2), range(-1, 5, 2); └─────────────┴─────────────┴────────────────┴─────────────────┘ ``` -## array(x1, …), operator \[x1, …\] {#arrayx1-operator-x1} +## array(x1, ...), operator \[x1, ...\] {#arrayx1-operator-x1} 使用函数的参数作为数组元素创建一个数组。 参数必须是常量,并且具有最小公共类型的类型。必须至少传递一个参数,否则将不清楚要创建哪种类型的数组。也就是说,你不能使用这个函数来创建一个空数组(为此,使用上面描述的’emptyArray  \*’函数)。 @@ -337,7 +337,7 @@ SELECT indexOf([1, 3, NULL, NULL], NULL) 设置为«NULL»的元素将作为普通的元素值处理。 -## arrayCount(\[func,\] arr1, …) {#array-count} +## arrayCount(\[func,\] arr1, ...) {#array-count} `func`将arr数组作为参数,其返回结果为非零值的数量。如果未指定“func”,则返回数组中非零元素的数量。 @@ -363,7 +363,7 @@ SELECT countEqual([1, 2, NULL, NULL], NULL) ## arrayEnumerate(arr) {#array_functions-arrayenumerate} -返回 Array \[1, 2, 3, …, length (arr) \] +返回 Array \[1, 2, 3, ..., length (arr) \] 此功能通常与ARRAY JOIN一起使用。它允许在应用ARRAY JOIN后为每个数组计算一次。例如: @@ -403,7 +403,7 @@ WHERE (CounterID = 160656) AND notEmpty(GoalsReached) 此功能也可用于高阶函数。例如,您可以使用它来获取与条件匹配的元素的数组索引。 -## arrayEnumerateUniq(arr, …) {#arrayenumerateuniqarr} +## arrayEnumerateUniq(arr, ...) {#arrayenumerateuniqarr} 返回与源数组大小相同的数组,其中每个元素表示与其下标对应的源数组元素在源数组中出现的次数。 例如:arrayEnumerateUniq( \[10,20,10,30 \])=  \[1,1,2,1 \]。 @@ -621,7 +621,7 @@ SELECT arraySlice([1, 2, NULL, 4, 5], 2, 3) AS res 设置为«NULL»的数组元素作为普通的数组元素值处理。 -## arraySort(\[func,\] arr, …) {#array_functions-reverse-sort} +## arraySort(\[func,\] arr, ...) 
{#array_functions-reverse-sort} 以升序对`arr`数组的元素进行排序。如果指定了`func`函数,则排序顺序由`func`函数的调用结果决定。如果`func`接受多个参数,那么`arraySort`函数也将解析与`func`函数参数相同数量的数组参数。更详细的示例在`arraySort`的末尾。 @@ -721,7 +721,7 @@ SELECT arraySort((x, y) -> -y, [0, 1, 2], [1, 2, 3]) as res; !!! 注意 "注意" 为了提高排序效率, 使用了[施瓦茨变换](https://en.wikipedia.org/wiki/Schwartzian_transform)。 -## arrayReverseSort(\[func,\] arr, …) {#array_functions-reverse-sort} +## arrayReverseSort(\[func,\] arr, ...) {#array_functions-reverse-sort} 以降序对`arr`数组的元素进行排序。如果指定了`func`函数,则排序顺序由`func`函数的调用结果决定。如果`func`接受多个参数,那么`arrayReverseSort`函数也将解析与`func`函数参数相同数量的数组作为参数。更详细的示例在`arrayReverseSort`的末尾。 @@ -822,7 +822,7 @@ SELECT arrayReverseSort((x, y) -> -y, [4, 3, 5], [1, 2, 3]) AS res; └─────────┘ ``` -## arrayUniq(arr, …) {#arrayuniqarr} +## arrayUniq(arr, ...) {#arrayuniqarr} 如果传递一个参数,则计算数组中不同元素的数量。 如果传递了多个参数,则它计算多个数组中相应位置的不同元素元组的数量。 @@ -1221,7 +1221,7 @@ select arrayAUC([0.1, 0.4, 0.35, 0.8], [0, 0, 1, 1]); └───────────────────────────────────────────────┘ ``` -## arrayMap(func, arr1, …) {#array-map} +## arrayMap(func, arr1, ...) {#array-map} 将从 `func` 函数的原始应用中获得的数组返回给 `arr` 数组中的每个元素。 @@ -1251,7 +1251,7 @@ SELECT arrayMap((x, y) -> (x, y), [1, 2, 3], [4, 5, 6]) AS res 请注意,`arrayMap` 是一个[高阶函数](../../sql-reference/functions/index.md#higher-order-functions)。 您必须将 lambda 函数作为第一个参数传递给它,并且不能省略。 -## arrayFilter(func, arr1, …) {#array-filter} +## arrayFilter(func, arr1, ...) {#array-filter} 返回一个仅包含 `arr1` 中的元素的数组,其中 `func` 返回的值不是 0。 @@ -1284,7 +1284,7 @@ SELECT 请注意,`arrayFilter`是一个[高阶函数](../../sql-reference/functions/index.md#higher-order-functions)。 您必须将 lambda 函数作为第一个参数传递给它,并且不能省略。 -## arrayFill(func, arr1, …) {#array-fill} +## arrayFill(func, arr1, ...) {#array-fill} 从第一个元素到最后一个元素扫描`arr1`,如果`func`返回0,则用`arr1[i - 1]`替换`arr1[i]`。`arr1`的第一个元素不会被替换。 @@ -1302,7 +1302,7 @@ SELECT arrayFill(x -> not isNull(x), [1, null, 3, 11, 12, null, null, 5, 6, 14, 请注意,`arrayFill` 是一个[高阶函数](../../sql-reference/functions/index.md#higher-order-functions)。 您必须将 lambda 函数作为第一个参数传递给它,并且不能省略。 -## arrayReverseFill(func, arr1, …) {#array-reverse-fill} +## arrayReverseFill(func, arr1, ...) {#array-reverse-fill} 从最后一个元素到第一个元素扫描`arr1`,如果`func`返回0,则用`arr1[i + 1]`替换`arr1[i]`。`arr1`的最后一个元素不会被替换。 @@ -1320,7 +1320,7 @@ SELECT arrayReverseFill(x -> not isNull(x), [1, null, 3, 11, 12, null, null, 5, 请注意,`arrayReverseFill`是一个[高阶函数](../../sql-reference/functions/index.md#higher-order-functions)。 您必须将 lambda 函数作为第一个参数传递给它,并且不能省略。 -## arraySplit(func, arr1, …) {#array-split} +## arraySplit(func, arr1, ...) {#array-split} 将 `arr1` 拆分为多个数组。当 `func` 返回 0 以外的值时,数组将在元素的左侧拆分。数组不会在第一个元素之前被拆分。 @@ -1338,7 +1338,7 @@ SELECT arraySplit((x, y) -> y, [1, 2, 3, 4, 5], [1, 0, 0, 1, 0]) AS res 请注意,`arraySplit`是一个[高阶函数](../../sql-reference/functions/index.md#higher-order-functions)。 您必须将 lambda 函数作为第一个参数传递给它,并且不能省略。 -## arrayReverseSplit(func, arr1, …) {#array-reverse-split} +## arrayReverseSplit(func, arr1, ...) {#array-reverse-split} 将 `arr1` 拆分为多个数组。当 `func` 返回 0 以外的值时,数组将在元素的右侧拆分。数组不会在最后一个元素之后被拆分。 @@ -1356,37 +1356,37 @@ SELECT arrayReverseSplit((x, y) -> y, [1, 2, 3, 4, 5], [1, 0, 0, 1, 0]) AS res 请注意,`arrayReverseSplit`是一个[高阶函数](../../sql-reference/functions/index.md#higher-order-functions)。 您必须将 lambda 函数作为第一个参数传递给它,并且不能省略。 -## arrayExists(\[func,\] arr1, …) {#arrayexistsfunc-arr1} +## arrayExists(\[func,\] arr1, ...) 
{#arrayexistsfunc-arr1} 如果 `arr` 中至少有一个元素 `func` 返回 0 以外的值,则返回 1。否则,它返回 0。 请注意,`arrayExists`是一个[高阶函数](../../sql-reference/functions/index.md#higher-order-functions)。您可以将 lambda 函数作为第一个参数传递给它,并且不能省略。 -## arrayAll(\[func,\] arr1, …) {#arrayallfunc-arr1} +## arrayAll(\[func,\] arr1, ...) {#arrayallfunc-arr1} 如果 `func` 为 `arr` 中的所有元素返回 0 以外的值,则返回 1。否则,它返回 0。 请注意,`arrayAll`是一个[高阶函数](../../sql-reference/functions/index.md#higher-order-functions)。您可以将 lambda 函数作为第一个参数传递给它,并且不能省略。 -## arrayFirst(func, arr1, …) {#array-first} +## arrayFirst(func, arr1, ...) {#array-first} 返回 `arr1` 数组中 `func` 返回非 0 的值的第一个元素。 请注意,`arrayFirst`是一个[高阶函数](../../sql-reference/functions/index.md#higher-order-functions)。您必须将 lambda 函数作为第一个参数传递给它,并且不能省略。 -## arrayLast(func, arr1, …) {#array-last} +## arrayLast(func, arr1, ...) {#array-last} 返回 `arr1` 数组中的最后一个元素,其中 `func` 返回的值不是 0。 请注意,`arrayLast`是一个[高阶函数](../../sql-reference/functions/index.md#higher-order-functions)。您必须将 lambda 函数作为第一个参数传递给它,并且不能省略。 -## arrayFirstIndex(func, arr1, …) {#array-first-index} +## arrayFirstIndex(func, arr1, ...) {#array-first-index} 返回 `arr1` 数组中第一个元素的索引,其中 `func` 返回的值不是 0。 请注意,`arrayFirstIndex`是一个[高阶函数](../../sql-reference/functions/index.md#higher-order-functions)。您必须将 lambda 函数作为第一个参数传递给它,并且不能省略。 -## arrayLastIndex(func, arr1, …) {#array-last-index} +## arrayLastIndex(func, arr1, ...) {#array-last-index} 返回 `arr1` 数组中最后一个元素的索引,其中 `func` 返回的值不是 0。 @@ -1612,7 +1612,7 @@ SELECT arrayAvg(x -> (x * x), [2, 4]) AS res; └─────┘ ``` -## arrayCumSum(\[func,\] arr1, …) {#arraycumsumfunc-arr1} +## arrayCumSum(\[func,\] arr1, ...) {#arraycumsumfunc-arr1} 返回源数组中元素的部分和的数组(运行总和)。如果指定了 func 函数,则数组元素的值在求和之前由该函数转换。 diff --git a/docs/zh/sql-reference/functions/date-time-functions.md b/docs/zh/sql-reference/functions/date-time-functions.md index d6493ffe605..18b9f3495c0 100644 --- a/docs/zh/sql-reference/functions/date-time-functions.md +++ b/docs/zh/sql-reference/functions/date-time-functions.md @@ -443,7 +443,7 @@ SELECT toStartOfSecond(dt64, 'Asia/Istanbul'); `toISOWeek()`是一个兼容函数,等效于`toWeek(date,3)`。 下表描述了mode参数的工作方式。 -| Mode | First day of week | Range | Week 1 is the first week … | +| Mode | First day of week | Range | Week 1 is the first week ... | |------|-------------------|-------|-------------------------------| | 0 | Sunday | 0-53 | with a Sunday in this year | | 1 | Monday | 0-53 | with 4 or more days this year | diff --git a/docs/zh/sql-reference/functions/higher-order-functions.md b/docs/zh/sql-reference/functions/higher-order-functions.md index 929dc6f3ea7..0e08f88bba1 100644 --- a/docs/zh/sql-reference/functions/higher-order-functions.md +++ b/docs/zh/sql-reference/functions/higher-order-functions.md @@ -15,13 +15,13 @@ slug: /zh/sql-reference/functions/higher-order-functions 除了’arrayMap’和’arrayFilter’以外的所有其他函数,都可以省略第一个参数(lambda函数)。在这种情况下,默认返回数组元素本身。 -### arrayMap(func, arr1, …) {#higher_order_functions-array-map} +### arrayMap(func, arr1, ...) {#higher_order_functions-array-map} 将arr 将从’func’函数的原始应用程序获得的数组返回到’arr’数组中的每个元素。 返回从原始应用程序获得的数组 ‘func’ 函数中的每个元素 ‘arr’ 阵列。 -### arrayFilter(func, arr1, …) {#arrayfilterfunc-arr1} +### arrayFilter(func, arr1, ...) {#arrayfilterfunc-arr1} 返回一个仅包含以下元素的数组 ‘arr1’ 对于哪个 ‘func’ 返回0以外的内容。 @@ -48,31 +48,31 @@ SELECT │ [2] │ └─────┘ -### arrayCount(\[func,\] arr1, …) {#arraycountfunc-arr1} +### arrayCount(\[func,\] arr1, ...) {#arraycountfunc-arr1} 返回数组arr中非零元素的数量,如果指定了’func’,则通过’func’的返回值确定元素是否为非零元素。 -### arrayExists(\[func,\] arr1, …) {#arrayexistsfunc-arr1} +### arrayExists(\[func,\] arr1, ...) 
{#arrayexistsfunc-arr1} 返回数组’arr’中是否存在非零元素,如果指定了’func’,则使用’func’的返回值确定元素是否为非零元素。 -### arrayAll(\[func,\] arr1, …) {#arrayallfunc-arr1} +### arrayAll(\[func,\] arr1, ...) {#arrayallfunc-arr1} 返回数组’arr’中是否存在为零的元素,如果指定了’func’,则使用’func’的返回值确定元素是否为零元素。 -### arraySum(\[func,\] arr1, …) {#arraysumfunc-arr1} +### arraySum(\[func,\] arr1, ...) {#arraysumfunc-arr1} 计算arr数组的总和,如果指定了’func’,则通过’func’的返回值计算数组的总和。 -### arrayFirst(func, arr1, …) {#arrayfirstfunc-arr1} +### arrayFirst(func, arr1, ...) {#arrayfirstfunc-arr1} 返回数组中第一个匹配的元素,函数使用’func’匹配所有元素,直到找到第一个匹配的元素。 -### arrayFirstIndex(func, arr1, …) {#arrayfirstindexfunc-arr1} +### arrayFirstIndex(func, arr1, ...) {#arrayfirstindexfunc-arr1} 返回数组中第一个匹配的元素的下标索引,函数使用’func’匹配所有元素,直到找到第一个匹配的元素。 -### arrayCumSum(\[func,\] arr1, …) {#arraycumsumfunc-arr1} +### arrayCumSum(\[func,\] arr1, ...) {#arraycumsumfunc-arr1} 返回源数组部分数据的总和,如果指定了`func`函数,则使用`func`的返回值计算总和。 @@ -98,7 +98,7 @@ SELECT arrayCumSumNonNegative([1, 1, -4, 1]) AS res │ [1,2,0,1] │ └───────────┘ -### arraySort(\[func,\] arr1, …) {#arraysortfunc-arr1} +### arraySort(\[func,\] arr1, ...) {#arraysortfunc-arr1} 返回升序排序`arr1`的结果。如果指定了`func`函数,则排序顺序由`func`的结果决定。 @@ -124,7 +124,7 @@ SELECT arraySort([1, nan, 2, NULL, 3, nan, 4, NULL]) │ [1,2,3,4,nan,nan,NULL,NULL] │ └───────────────────────────────────────────────┘ -### arrayReverseSort(\[func,\] arr1, …) {#arrayreversesortfunc-arr1} +### arrayReverseSort(\[func,\] arr1, ...) {#arrayreversesortfunc-arr1} 返回降序排序`arr1`的结果。如果指定了`func`函数,则排序顺序由`func`的结果决定。 diff --git a/docs/zh/sql-reference/functions/in-functions.md b/docs/zh/sql-reference/functions/in-functions.md index 346e076310e..9858159a495 100644 --- a/docs/zh/sql-reference/functions/in-functions.md +++ b/docs/zh/sql-reference/functions/in-functions.md @@ -10,10 +10,10 @@ sidebar_label: IN 运算符 请参阅[IN 运算符](../../sql-reference/operators/in.md#select-in-operators)部分。 -## tuple(x, y, …), 运算符 (x, y, …) {#tuplex-y-operator-x-y} +## tuple(x, y, ...), 运算符 (x, y, ...) {#tuplex-y-operator-x-y} 函数用于对多个列进行分组。 -对于具有类型T1,T2,…的列,它返回包含这些列的元组(T1,T2,…)。 执行该函数没有任何成本。 +对于具有类型T1,T2,...的列,它返回包含这些列的元组(T1,T2,...)。 执行该函数没有任何成本。 元组通常用作IN运算符的中间参数值,或用于创建lambda函数的形参列表。 元组不能写入表。 ## tupleElement(tuple, n), 运算符 x.N {#tupleelementtuple-n-operator-x-n} diff --git a/docs/zh/sql-reference/functions/json-functions.md b/docs/zh/sql-reference/functions/json-functions.md index 52ec0ed1535..f07de564847 100644 --- a/docs/zh/sql-reference/functions/json-functions.md +++ b/docs/zh/sql-reference/functions/json-functions.md @@ -56,7 +56,7 @@ slug: /zh/sql-reference/functions/json-functions 以下函数基于[simdjson](https://github.com/lemire/simdjson),专为更复杂的JSON解析要求而设计。但上述假设2仍然适用。 -## JSONHas(json\[, indices_or_keys\]…) {#jsonhasjson-indices-or-keys} +## JSONHas(json\[, indices_or_keys\]...) {#jsonhasjson-indices-or-keys} 如果JSON中存在该值,则返回`1`。 @@ -83,7 +83,7 @@ slug: /zh/sql-reference/functions/json-functions select JSONExtractKey('{"a": "hello", "b": [-100, 200.0, 300]}', -2) = 'a' select JSONExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', 1) = 'hello' -## JSONLength(json\[, indices_or_keys\]…) {#jsonlengthjson-indices-or-keys} +## JSONLength(json\[, indices_or_keys\]...) {#jsonlengthjson-indices-or-keys} 返回JSON数组或JSON对象的长度。 @@ -94,7 +94,7 @@ slug: /zh/sql-reference/functions/json-functions select JSONLength('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 3 select JSONLength('{"a": "hello", "b": [-100, 200.0, 300]}') = 2 -## JSONType(json\[, indices_or_keys\]…) {#jsontypejson-indices-or-keys} +## JSONType(json\[, indices_or_keys\]...) 
{#jsontypejson-indices-or-keys} 返回JSON值的类型。 @@ -106,13 +106,13 @@ slug: /zh/sql-reference/functions/json-functions select JSONType('{"a": "hello", "b": [-100, 200.0, 300]}', 'a') = 'String' select JSONType('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 'Array' -## JSONExtractUInt(json\[, indices_or_keys\]…) {#jsonextractuintjson-indices-or-keys} +## JSONExtractUInt(json\[, indices_or_keys\]...) {#jsonextractuintjson-indices-or-keys} -## JSONExtractInt(json\[, indices_or_keys\]…) {#jsonextractintjson-indices-or-keys} +## JSONExtractInt(json\[, indices_or_keys\]...) {#jsonextractintjson-indices-or-keys} -## JSONExtractFloat(json\[, indices_or_keys\]…) {#jsonextractfloatjson-indices-or-keys} +## JSONExtractFloat(json\[, indices_or_keys\]...) {#jsonextractfloatjson-indices-or-keys} -## JSONExtractBool(json\[, indices_or_keys\]…) {#jsonextractbooljson-indices-or-keys} +## JSONExtractBool(json\[, indices_or_keys\]...) {#jsonextractbooljson-indices-or-keys} 解析JSON并提取值。这些函数类似于`visitParam*`函数。 @@ -124,7 +124,7 @@ slug: /zh/sql-reference/functions/json-functions select JSONExtractFloat('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 2) = 200.0 select JSONExtractUInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', -1) = 300 -## JSONExtractString(json\[, indices_or_keys\]…) {#jsonextractstringjson-indices-or-keys} +## JSONExtractString(json\[, indices_or_keys\]...) {#jsonextractstringjson-indices-or-keys} 解析JSON并提取字符串。此函数类似于`visitParamExtractString`函数。 @@ -140,11 +140,11 @@ slug: /zh/sql-reference/functions/json-functions select JSONExtractString('{"abc":"\\u263"}', 'abc') = '' select JSONExtractString('{"abc":"hello}', 'abc') = '' -## JSONExtract(json\[, indices_or_keys…\], Return_type) {#jsonextractjson-indices-or-keys-return-type} +## JSONExtract(json\[, indices_or_keys...\], Return_type) {#jsonextractjson-indices-or-keys-return-type} 解析JSON并提取给定ClickHouse数据类型的值。 -这是以前的`JSONExtract函数的变体。 这意味着`JSONExtract(…, ‘String’)`返回与`JSONExtractString()`返回完全相同。`JSONExtract(…, ‘Float64’)`返回于`JSONExtractFloat()\`返回完全相同。 +这是以前的`JSONExtract函数的变体。 这意味着`JSONExtract(..., ‘String’)`返回与`JSONExtractString()`返回完全相同。`JSONExtract(..., ‘Float64’)`返回于`JSONExtractFloat()\`返回完全相同。 示例: @@ -156,7 +156,7 @@ slug: /zh/sql-reference/functions/json-functions SELECT JSONExtract('{"day": "Thursday"}', 'day', 'Enum8(\'Sunday\' = 0, \'Monday\' = 1, \'Tuesday\' = 2, \'Wednesday\' = 3, \'Thursday\' = 4, \'Friday\' = 5, \'Saturday\' = 6)') = 'Thursday' SELECT JSONExtract('{"day": 5}', 'day', 'Enum8(\'Sunday\' = 0, \'Monday\' = 1, \'Tuesday\' = 2, \'Wednesday\' = 3, \'Thursday\' = 4, \'Friday\' = 5, \'Saturday\' = 6)') = 'Friday' -## JSONExtractKeysAndValues(json\[, indices_or_keys…\], Value_type) {#jsonextractkeysandvaluesjson-indices-or-keys-value-type} +## JSONExtractKeysAndValues(json\[, indices_or_keys...\], Value_type) {#jsonextractkeysandvaluesjson-indices-or-keys-value-type} 从JSON中解析键值对,其中值是给定的ClickHouse数据类型。 @@ -164,7 +164,7 @@ slug: /zh/sql-reference/functions/json-functions SELECT JSONExtractKeysAndValues('{"x": {"a": 5, "b": 7, "c": 11}}', 'x', 'Int8') = [('a',5),('b',7),('c',11)]; -## JSONExtractRaw(json\[, indices_or_keys\]…) {#jsonextractrawjson-indices-or-keys} +## JSONExtractRaw(json\[, indices_or_keys\]...) 
{#jsonextractrawjson-indices-or-keys} 返回JSON的部分。 diff --git a/docs/zh/sql-reference/functions/other-functions.md b/docs/zh/sql-reference/functions/other-functions.md index 2eeaad63694..9c28ff867c5 100644 --- a/docs/zh/sql-reference/functions/other-functions.md +++ b/docs/zh/sql-reference/functions/other-functions.md @@ -90,7 +90,7 @@ SELECT 'some-file-name' AS a, basename(a) 将一个常量列变为一个非常量列。 在ClickHouse中,非常量列和常量列在内存中的表示方式不同。尽管函数对于常量列和非常量总是返回相同的结果,但它们的工作方式可能完全不同(执行不同的代码)。此函数用于调试这种行为。 -## ignore(…) {#ignore} +## ignore(...) {#ignore} 接受任何参数,包括`NULL`。始终返回0。 但是,函数的参数总是被计算的。该函数可以用于基准测试。 diff --git a/docs/zh/sql-reference/functions/string-functions.md b/docs/zh/sql-reference/functions/string-functions.md index d1914839d7c..c28735c7dc7 100644 --- a/docs/zh/sql-reference/functions/string-functions.md +++ b/docs/zh/sql-reference/functions/string-functions.md @@ -95,7 +95,7 @@ SELECT toValidUTF8('\x61\xF0\x80\x80\x80b') 以Unicode字符为单位反转UTF-8编码的字符串。如果字符串不是UTF-8编码,则可能获取到一个非预期的结果(不会抛出异常)。 -## format(pattern, s0, s1, …) {#formatpattern-s0-s1} +## format(pattern, s0, s1, ...) {#formatpattern-s0-s1} 使用常量字符串`pattern`格式化其他参数。`pattern`字符串中包含由大括号`{}`包围的«替换字段»。 未被包含在大括号中的任何内容都被视为文本内容,它将原样保留在返回值中。 如果你需要在文本内容中包含一个大括号字符,它可以通过加倍来转义:`{{ '{{' }}`和`{{ '{{' }} '}}' }}`。 字段名称可以是数字(从零开始)或空(然后将它们视为连续数字) @@ -113,11 +113,11 @@ SELECT format('{} {}', 'Hello', 'World') └───────────────────────────────────┘ ``` -## concat(s1, s2, …) {#concat-s1-s2} +## concat(s1, s2, ...) {#concat-s1-s2} 将参数中的多个字符串拼接,不带分隔符。 -## concatAssumeInjective(s1, s2, …) {#concatassumeinjectives1-s2} +## concatAssumeInjective(s1, s2, ...) {#concatassumeinjectives1-s2} 与[concat](#concat-s1-s2)相同,区别在于,你需要保证concat(s1, s2, s3) -\> s4是单射的,它将用于GROUP BY的优化。 diff --git a/docs/zh/sql-reference/functions/string-search-functions.md b/docs/zh/sql-reference/functions/string-search-functions.md index 972fd84e2a1..8ada76eeeda 100644 --- a/docs/zh/sql-reference/functions/string-search-functions.md +++ b/docs/zh/sql-reference/functions/string-search-functions.md @@ -204,7 +204,7 @@ SELECT multiSearchAllPositions('Hello, World!', ['hello', '!', 'world']); **语法** ```sql -multiSearchFirstPosition(haystack, [needle1, needle2, …, needleN]) +multiSearchFirstPosition(haystack, [needle1, needle2, ..., needleN]) ``` ## multiSearchFirstIndex @@ -216,7 +216,7 @@ multiSearchFirstPosition(haystack, [needle1, needle2, …, needleN]) **语法** ```sql -multiSearchFirstIndex(haystack, \[needle1, needle2, …, needlen\]) +multiSearchFirstIndex(haystack, \[needle1, needle2, ..., needlen\]) ``` ## multiSearchAny {#multisearchany} @@ -229,7 +229,7 @@ multiSearchFirstIndex(haystack, \[needle1, needle2, …, n **语法** ```sql -multiSearchAny(haystack, [needle1, needle2, …, needleN]) +multiSearchAny(haystack, [needle1, needle2, ..., needleN]) ``` ## match {#match} @@ -273,7 +273,7 @@ Hyperscan 通常容易受到正则表达式拒绝服务 (ReDoS) 攻击。有关 **语法** ```sql -multiMatchAny(haystack, \[pattern1, pattern2, …, patternn\]) +multiMatchAny(haystack, \[pattern1, pattern2, ..., patternn\]) ``` ## multiMatchAnyIndex @@ -283,7 +283,7 @@ multiMatchAny(haystack, \[pattern1, pattern2, …, pattern **语法** ```sql -multiMatchAnyIndex(haystack, \[pattern1, pattern2, …, patternn\]) +multiMatchAnyIndex(haystack, \[pattern1, pattern2, ..., patternn\]) ``` ## multiMatchAllIndices @@ -293,7 +293,7 @@ multiMatchAnyIndex(haystack, \[pattern1, pattern2, …, pa **语法** ```sql -multiMatchAllIndices(haystack, \[pattern1, pattern2, …, patternn\]) +multiMatchAllIndices(haystack, \[pattern1, pattern2, ..., patternn\]) ``` ## multiFuzzyMatchAny @@ -307,7 +307,7 
@@ multiMatchAllIndices(haystack, \[pattern1, pattern2, …, **语法** ```sql -multiFuzzyMatchAny(haystack, distance, \[pattern1, pattern2, …, patternn\]) +multiFuzzyMatchAny(haystack, distance, \[pattern1, pattern2, ..., patternn\]) ``` ## multiFuzzyMatchAnyIndex @@ -317,7 +317,7 @@ multiFuzzyMatchAny(haystack, distance, \[pattern1, pattern21, pattern2, …, patternn\]) +multiFuzzyMatchAnyIndex(haystack, distance, \[pattern1, pattern2, ..., patternn\]) ``` ## multiFuzzyMatchAllIndices @@ -327,7 +327,7 @@ multiFuzzyMatchAnyIndex(haystack, distance, \[pattern1, pattern2 **语法** ```sql -multiFuzzyMatchAllIndices(haystack, distance, \[pattern1, pattern2, …, patternn\]) +multiFuzzyMatchAllIndices(haystack, distance, \[pattern1, pattern2, ..., patternn\]) ``` ## extract diff --git a/docs/zh/sql-reference/functions/url-functions.md b/docs/zh/sql-reference/functions/url-functions.md index 44880b6ca1a..e7a0354c0bf 100644 --- a/docs/zh/sql-reference/functions/url-functions.md +++ b/docs/zh/sql-reference/functions/url-functions.md @@ -11,7 +11,7 @@ slug: /zh/sql-reference/functions/url-functions ### 协议 {#protocol} -返回URL的协议。例如: http、ftp、mailto、magnet… +返回URL的协议。例如: http、ftp、mailto、magnet... ### 域 {#domain} diff --git a/docs/zh/sql-reference/statements/alter/delete.md b/docs/zh/sql-reference/statements/alter/delete.md index 5eb77c35a93..f0b41c4e214 100644 --- a/docs/zh/sql-reference/statements/alter/delete.md +++ b/docs/zh/sql-reference/statements/alter/delete.md @@ -4,7 +4,7 @@ sidebar_position: 39 sidebar_label: DELETE --- -# ALTER TABLE … DELETE 语句 {#alter-mutations} +# ALTER TABLE ... DELETE 语句 {#alter-mutations} ``` sql ALTER TABLE [db.]table [ON CLUSTER cluster] DELETE WHERE filter_expr diff --git a/docs/zh/sql-reference/statements/alter/index.md b/docs/zh/sql-reference/statements/alter/index.md index e173837a16c..2286dcccd13 100644 --- a/docs/zh/sql-reference/statements/alter/index.md +++ b/docs/zh/sql-reference/statements/alter/index.md @@ -38,7 +38,7 @@ sidebar_label: ALTER ## Mutations 突变 {#mutations} -用来操作表数据的ALTER查询是通过一种叫做“突变”的机制来实现的,最明显的是[ALTER TABLE … DELETE](../../../sql-reference/statements/alter/delete.md)和[ALTER TABLE … UPDATE](../../../sql-reference/statements/alter/update.md)。它们是异步的后台进程,类似于[MergeTree](../../../engines/table-engines/mergetree-family/index.md)表的合并,产生新的“突变”版本的部件。 +用来操作表数据的ALTER查询是通过一种叫做“突变”的机制来实现的,最明显的是[ALTER TABLE ... DELETE](../../../sql-reference/statements/alter/delete.md)和[ALTER TABLE ... UPDATE](../../../sql-reference/statements/alter/update.md)。它们是异步的后台进程,类似于[MergeTree](../../../engines/table-engines/mergetree-family/index.md)表的合并,产生新的“突变”版本的部件。 diff --git a/docs/zh/sql-reference/statements/alter/update.md b/docs/zh/sql-reference/statements/alter/update.md index 97b2b43d889..7cf37401dc5 100644 --- a/docs/zh/sql-reference/statements/alter/update.md +++ b/docs/zh/sql-reference/statements/alter/update.md @@ -4,7 +4,7 @@ sidebar_position: 40 sidebar_label: UPDATE --- -# ALTER TABLE … UPDATE 语句 {#alter-table-update-statements} +# ALTER TABLE ... UPDATE 语句 {#alter-table-update-statements} ``` sql ALTER TABLE [db.]table UPDATE column1 = expr1 [, ...] WHERE filter_expr diff --git a/docs/zh/sql-reference/statements/alter/view.md b/docs/zh/sql-reference/statements/alter/view.md index 34a612803c1..a19d918612a 100644 --- a/docs/zh/sql-reference/statements/alter/view.md +++ b/docs/zh/sql-reference/statements/alter/view.md @@ -4,9 +4,9 @@ sidebar_position: 50 sidebar_label: VIEW --- -# ALTER TABLE … MODIFY QUERY 语句 {#alter-modify-query} +# ALTER TABLE ... 
MODIFY QUERY 语句 {#alter-modify-query} -当使用`ALTER TABLE … MODIFY QUERY`语句创建一个[物化视图](../create/view.md#materialized)时,可以修改`SELECT`查询。当物化视图在没有 `TO [db.]name` 的情况下创建时使用它。必须启用 `allow_experimental_alter_materialized_view_structure`设置。 +当使用`ALTER TABLE ... MODIFY QUERY`语句创建一个[物化视图](../create/view.md#materialized)时,可以修改`SELECT`查询。当物化视图在没有 `TO [db.]name` 的情况下创建时使用它。必须启用 `allow_experimental_alter_materialized_view_structure`设置。 如果一个物化视图使用`TO [db.]name`,你必须先 [DETACH](../detach.mdx) 视图。用[ALTER TABLE](index.md)修改目标表,然后 [ATTACH](../attach.mdx)之前分离的(`DETACH`)视图。 diff --git a/docs/zh/sql-reference/statements/create/view.md b/docs/zh/sql-reference/statements/create/view.md index bce0994ecd2..49a1d66bdf1 100644 --- a/docs/zh/sql-reference/statements/create/view.md +++ b/docs/zh/sql-reference/statements/create/view.md @@ -55,7 +55,7 @@ ClickHouse 中的物化视图更像是插入触发器。 如果视图查询中 如果指定`POPULATE`,则在创建视图时将现有表数据插入到视图中,就像创建一个`CREATE TABLE ... AS SELECT ...`一样。 否则,查询仅包含创建视图后插入表中的数据。 我们**不建议**使用POPULATE,因为在创建视图期间插入表中的数据不会插入其中。 -`SELECT` 查询可以包含`DISTINCT`、`GROUP BY`、`ORDER BY`、`LIMIT`……请注意,相应的转换是在每个插入数据块上独立执行的。 例如,如果设置了`GROUP BY`,则在插入期间聚合数据,但仅在插入数据的单个数据包内。 数据不会被进一步聚合。 例外情况是使用独立执行数据聚合的`ENGINE`,例如`SummingMergeTree`。 +`SELECT` 查询可以包含`DISTINCT`、`GROUP BY`、`ORDER BY`、`LIMIT`...请注意,相应的转换是在每个插入数据块上独立执行的。 例如,如果设置了`GROUP BY`,则在插入期间聚合数据,但仅在插入数据的单个数据包内。 数据不会被进一步聚合。 例外情况是使用独立执行数据聚合的`ENGINE`,例如`SummingMergeTree`。 在物化视图上执行[ALTER](../../../sql-reference/statements/alter/index.md)查询有局限性,因此可能不方便。 如果物化视图使用构造`TO [db.]name`,你可以`DETACH`视图,为目标表运行`ALTER`,然后`ATTACH`先前分离的(`DETACH`)视图。 diff --git a/docs/zh/sql-reference/statements/insert-into.md b/docs/zh/sql-reference/statements/insert-into.md index f80c0a8a8ea..a08a78b6f1d 100644 --- a/docs/zh/sql-reference/statements/insert-into.md +++ b/docs/zh/sql-reference/statements/insert-into.md @@ -68,7 +68,7 @@ SELECT * FROM insert_select_testtable; INSERT INTO [db.]table [(c1, c2, c3)] FORMAT format_name data_set ``` -例如,下面的查询所使用的输入格式就与上面INSERT … VALUES的中使用的输入格式相同: +例如,下面的查询所使用的输入格式就与上面INSERT ... VALUES的中使用的输入格式相同: ``` sql INSERT INTO [TABLE] [db.]table [(c1, c2, c3)] FORMAT Values (v11, v12, v13), (v21, v22, v23), ... diff --git a/docs/zh/sql-reference/statements/select/limit.md b/docs/zh/sql-reference/statements/select/limit.md index 2bbf2949707..795f3f4ecd1 100644 --- a/docs/zh/sql-reference/statements/select/limit.md +++ b/docs/zh/sql-reference/statements/select/limit.md @@ -13,11 +13,11 @@ sidebar_label: LIMIT 如果没有 [ORDER BY](../../../sql-reference/statements/select/order-by.md) 子句显式排序结果,结果的行选择可能是任意的和非确定性的。 -## LIMIT … WITH TIES 修饰符 {#limit-with-ties} +## LIMIT ... WITH TIES 修饰符 {#limit-with-ties} 如果为 `LIMIT n[,m]` 设置了 `WITH TIES` ,并且声明了 `ORDER BY expr_list`, 除了得到无修饰符的结果(正常情况下的 `limit n`, 前n行数据), 还会返回与第`n`行具有相同排序字段的行(即如果第n+1行的字段与第n行 拥有相同的排序字段,同样返回该结果. -此修饰符可以与: [ORDER BY … WITH FILL modifier](../../../sql-reference/statements/select/order-by.md#orderby-with-fill) 组合使用. +此修饰符可以与: [ORDER BY ... WITH FILL modifier](../../../sql-reference/statements/select/order-by.md#orderby-with-fill) 组合使用. 例如以下查询: diff --git a/docs/zh/sql-reference/statements/select/order-by.md b/docs/zh/sql-reference/statements/select/order-by.md index 3286fc9f9e7..2f2d9a4959c 100644 --- a/docs/zh/sql-reference/statements/select/order-by.md +++ b/docs/zh/sql-reference/statements/select/order-by.md @@ -89,7 +89,7 @@ SELECT a, b, c FROM t ORDER BY a, b, c ## ORDER BY Expr WITH FILL Modifier {#orderby-with-fill} -此修饰符可以与 [LIMIT … WITH TIES modifier](../../../sql-reference/statements/select/limit.md#limit-with-ties) 进行组合使用. 
+此修饰符可以与 [LIMIT ... WITH TIES modifier](../../../sql-reference/statements/select/limit.md#limit-with-ties) 进行组合使用. 可以在`ORDER BY expr`之后用可选的`FROM expr`,`TO expr`和`STEP expr`参数来设置`WITH FILL`修饰符。 所有`expr`列的缺失值将被顺序填充,而其他列将被填充为默认值。 diff --git a/docs/zh/sql-reference/table-functions/file.md b/docs/zh/sql-reference/table-functions/file.md index 28682255738..fa1ec12f7df 100644 --- a/docs/zh/sql-reference/table-functions/file.md +++ b/docs/zh/sql-reference/table-functions/file.md @@ -114,7 +114,7 @@ FROM file('{some,another}_dir/*', 'TSV', 'name String, value UInt32') **示例** -从名为 `file000`, `file001`, … , `file999`的文件中查询数据: +从名为 `file000`, `file001`, ... , `file999`的文件中查询数据: ``` sql SELECT count(*) diff --git a/docs/zh/sql-reference/table-functions/hdfs.md b/docs/zh/sql-reference/table-functions/hdfs.md index b10b10ae2d2..f8320d8d0bb 100644 --- a/docs/zh/sql-reference/table-functions/hdfs.md +++ b/docs/zh/sql-reference/table-functions/hdfs.md @@ -84,7 +84,7 @@ FROM hdfs('hdfs://hdfs1:9000/{some,another}_dir/*', 'TSV', 'name String, value U **示例** -从名为 `file000`, `file001`, … , `file999`的文件中查询数据: +从名为 `file000`, `file001`, ... , `file999`的文件中查询数据: ``` sql SELECT count(*) diff --git a/docs/zh/sql-reference/table-functions/s3.md b/docs/zh/sql-reference/table-functions/s3.md index f7384a7526e..4f2c7299d95 100644 --- a/docs/zh/sql-reference/table-functions/s3.md +++ b/docs/zh/sql-reference/table-functions/s3.md @@ -99,7 +99,7 @@ FROM s3('https://storage.yandexcloud.net/my-test-bucket-768/{some,another}_prefi !!! warning "Warning" 如果文件列表中包含有从零开头的数字范围,请对每个数字分别使用带括号的结构,或者使用`?`。 -计算名为 `file-000.csv`, `file-001.csv`, … , `file-999.csv` 文件的总行数: +计算名为 `file-000.csv`, `file-001.csv`, ... , `file-999.csv` 文件的总行数: ``` sql SELECT count(*) From 713764f62fa92db1fab04dcb426682b4859d6de1 Mon Sep 17 00:00:00 2001 From: Blargian Date: Thu, 23 May 2024 14:01:00 +0200 Subject: [PATCH 277/392] Add missing space before link --- docs/en/sql-reference/functions/other-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/functions/other-functions.md b/docs/en/sql-reference/functions/other-functions.md index 4501d1f43d3..829d46df9fa 100644 --- a/docs/en/sql-reference/functions/other-functions.md +++ b/docs/en/sql-reference/functions/other-functions.md @@ -37,7 +37,7 @@ getMacro(name); **Returned value** -- Value of the specified macro.[String](../../sql-reference/data-types/string.md). +- Value of the specified macro. [String](../../sql-reference/data-types/string.md). 
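+
+For instance, assuming a macro named `test` is defined in the server's `<macros>` configuration, its value could be read like this:
+
+```sql
+SELECT getMacro('test') AS macro_value;
+```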
**Example** From dd7f3d1ba23bf2e18545ece2675f9836d84d7f69 Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 23 May 2024 14:11:30 +0200 Subject: [PATCH 278/392] Fix test --- tests/integration/test_storage_s3/test.py | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/tests/integration/test_storage_s3/test.py b/tests/integration/test_storage_s3/test.py index dc929b7db46..09b27fff1e8 100644 --- a/tests/integration/test_storage_s3/test.py +++ b/tests/integration/test_storage_s3/test.py @@ -1816,27 +1816,13 @@ def test_schema_inference_cache(started_cluster): check_cache(instance, []) run_describe_query(instance, files, storage_name, started_cluster, bucket) - check_cache_misses( - instance, - files, - storage_name, - started_cluster, - bucket, - 4 if storage_name == "url" else 1, - ) + check_cache_misses(instance, files, storage_name, started_cluster, bucket, 4) instance.query("system drop schema cache") check_cache(instance, []) run_describe_query(instance, files, storage_name, started_cluster, bucket) - check_cache_misses( - instance, - files, - storage_name, - started_cluster, - bucket, - 4 if storage_name == "url" else 1, - ) + check_cache_misses(instance, files, storage_name, started_cluster, bucket, 4) instance.query("system drop schema cache") From 147516f1626f656da5fc4dcc0d9254202a8de860 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Tue, 16 Apr 2024 13:05:07 +0000 Subject: [PATCH 279/392] Fix AST fuzzer failure --- src/Functions/FunctionHelpers.cpp | 2 ++ src/Functions/splitByRegexp.cpp | 10 ++++------ .../0_stateless/01866_split_by_regexp.reference | 1 + tests/queries/0_stateless/01866_split_by_regexp.sql | 3 +++ 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/src/Functions/FunctionHelpers.cpp b/src/Functions/FunctionHelpers.cpp index d85bb0e7060..3b057779ffe 100644 --- a/src/Functions/FunctionHelpers.cpp +++ b/src/Functions/FunctionHelpers.cpp @@ -21,6 +21,8 @@ namespace ErrorCodes const ColumnConst * checkAndGetColumnConstStringOrFixedString(const IColumn * column) { + if (!column) + return {}; if (!isColumnConst(*column)) return {}; diff --git a/src/Functions/splitByRegexp.cpp b/src/Functions/splitByRegexp.cpp index e28fe9c38bb..042db97794d 100644 --- a/src/Functions/splitByRegexp.cpp +++ b/src/Functions/splitByRegexp.cpp @@ -164,6 +164,7 @@ public: String getName() const override { return name; } size_t getNumberOfArguments() const override { return SplitByRegexpImpl::getNumberOfArguments(); } bool isVariadic() const override { return SplitByRegexpImpl::isVariadic(); } + /// ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return SplitByRegexpImpl::getArgumentsThatAreAlwaysConstant(); } FunctionBasePtr buildImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & return_type) const override { @@ -182,14 +183,11 @@ public: private: bool patternIsTrivialChar(const ColumnsWithTypeAndName & arguments) const { + if (!arguments[0].column.get()) + return false; const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[0].column.get()); if (!col) - throw Exception( - ErrorCodes::ILLEGAL_COLUMN, - "Illegal column {} of first argument of function {}. 
" - "Must be constant string.", - arguments[0].column->getName(), - getName()); + return false; String pattern = col->getValue(); if (pattern.size() == 1) diff --git a/tests/queries/0_stateless/01866_split_by_regexp.reference b/tests/queries/0_stateless/01866_split_by_regexp.reference index 62939940545..552d4d1f96a 100644 --- a/tests/queries/0_stateless/01866_split_by_regexp.reference +++ b/tests/queries/0_stateless/01866_split_by_regexp.reference @@ -17,3 +17,4 @@ Test fallback of splitByRegexp to splitByChar if regexp is trivial ['a','b','c'] ['a|b|c'] ['a\\b\\c'] +AST Fuzzer failure diff --git a/tests/queries/0_stateless/01866_split_by_regexp.sql b/tests/queries/0_stateless/01866_split_by_regexp.sql index 570bd1ba5c0..bc25d3e1093 100644 --- a/tests/queries/0_stateless/01866_split_by_regexp.sql +++ b/tests/queries/0_stateless/01866_split_by_regexp.sql @@ -20,3 +20,6 @@ select splitByRegexp('{', 'a{b{c'); select splitByRegexp('}', 'a}b}c'); select splitByRegexp('|', 'a|b|c'); select splitByRegexp('\\', 'a\\b\\c'); + +SELECT 'AST Fuzzer failure'; +SELECT splitByRegexp(materialize(1), NULL, 3) -- { serverError ILLEGAL_COLUMN } From b1fe9ab5f0aa24408321382e9651517f7808a478 Mon Sep 17 00:00:00 2001 From: Max K Date: Thu, 23 May 2024 15:33:21 +0200 Subject: [PATCH 280/392] CI: dependency fix for changelog.py #do_not_test --- tests/ci/ci.py | 3 ++- tests/ci/github_helper.py | 10 +++------- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/tests/ci/ci.py b/tests/ci/ci.py index be922a306e1..99555b06bbf 100644 --- a/tests/ci/ci.py +++ b/tests/ci/ci.py @@ -45,6 +45,7 @@ from env_helper import ( S3_BUILDS_BUCKET, TEMP_PATH, GITHUB_RUN_ID, + GITHUB_REPOSITORY, ) from get_robot_token import get_best_robot_token from git_helper import GIT_PREFIX, Git @@ -1913,7 +1914,7 @@ def _cancel_pr_wf(s3: S3Helper, pr_number: int) -> None: print(f"ERROR: FIX IT: Run id has not been found PR [{pr_number}]!") else: print(f"Canceling PR workflow run_id: [{run_id}], pr: [{pr_number}]") - GitHub.cancel_wf(run_id) + GitHub.cancel_wf(GITHUB_REPOSITORY, get_best_robot_token(), run_id) def main() -> int: diff --git a/tests/ci/github_helper.py b/tests/ci/github_helper.py index 81603c66bae..eb0f6c24527 100644 --- a/tests/ci/github_helper.py +++ b/tests/ci/github_helper.py @@ -22,9 +22,6 @@ from github.NamedUser import NamedUser as NamedUser from github.PullRequest import PullRequest as PullRequest from github.Repository import Repository as Repository -from env_helper import GITHUB_REPOSITORY -from get_robot_token import get_best_robot_token - # pylint: enable=useless-import-alias CACHE_PATH = p.join(p.dirname(p.realpath(__file__)), "gh_cache") @@ -265,12 +262,11 @@ class GitHub(github.Github): assert isinstance(value, int) self._retries = value - # minimalistic static methods not using pygithub + # static methods not using pygithub @staticmethod - def cancel_wf(run_id, strict=False): - token = get_best_robot_token() + def cancel_wf(repo, run_id, token, strict=False): headers = {"Authorization": f"token {token}"} - url = f"https://api.github.com/repos/{GITHUB_REPOSITORY}/actions/runs/{run_id}/cancel" + url = f"https://api.github.com/repos/{repo}/actions/runs/{run_id}/cancel" try: response = requests.post(url, headers=headers, timeout=10) response.raise_for_status() From 6e3a609907192d7cc378fb209d0e2431b8859eb0 Mon Sep 17 00:00:00 2001 From: Shaun Struwig <41984034+Blargian@users.noreply.github.com> Date: Thu, 23 May 2024 15:43:17 +0200 Subject: [PATCH 281/392] Fix formatting in ru/index.md --- docs/ru/index.md | 
20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/ru/index.md b/docs/ru/index.md index d551d492af5..02be8912b94 100644 --- a/docs/ru/index.md +++ b/docs/ru/index.md @@ -12,10 +12,10 @@ ClickHouse — столбцовая система управления база | Строка | WatchID | JavaEnable | Title | GoodEvent | EventTime | |--------|-------------|------------|--------------------|-----------|---------------------| -| #0 | 89354350662 | 1 | Investor Relations | 1 | 2016-05-18 05:19:20 | -| #1 | 90329509958 | 0 | Contact us | 1 | 2016-05-18 08:10:20 | -| #2 | 89953706054 | 1 | Mission | 1 | 2016-05-18 07:38:00 | -| #N | ... | ... | ... | ... | ... | +| #0 | 89354350662 | 1 | Investor Relations | 1 | 2016-05-18 05:19:20 | +| #1 | 90329509958 | 0 | Contact us | 1 | 2016-05-18 08:10:20 | +| #2 | 89953706054 | 1 | Mission | 1 | 2016-05-18 07:38:00 | +| #N | ... | ... | ... | ... | ... | То есть, значения, относящиеся к одной строке, физически хранятся рядом. @@ -24,13 +24,13 @@ ClickHouse — столбцовая система управления база В столбцовых СУБД данные хранятся в таком порядке: -| Строка: | #0 | #1 | #2 | #N | +| Строка: | #0 | #1 | #2 | #N | |-------------|---------------------|---------------------|---------------------|-----| -| WatchID: | 89354350662 | 90329509958 | 89953706054 | ... | -| JavaEnable: | 1 | 0 | 1 | ... | -| Title: | Investor Relations | Contact us | Mission | ... | -| GoodEvent: | 1 | 1 | 1 | ... | -| EventTime: | 2016-05-18 05:19:20 | 2016-05-18 08:10:20 | 2016-05-18 07:38:00 | ... | +| WatchID: | 89354350662 | 90329509958 | 89953706054 | ... | +| JavaEnable: | 1 | 0 | 1 | ... | +| Title: | Investor Relations | Contact us | Mission | ... | +| GoodEvent: | 1 | 1 | 1 | ... | +| EventTime: | 2016-05-18 05:19:20 | 2016-05-18 08:10:20 | 2016-05-18 07:38:00 | ... | В примерах изображён только порядок расположения данных. То есть значения из разных столбцов хранятся отдельно, а данные одного столбца — вместе. From e24253c097ed2f0325c9be77fc87ebbe8f086a5c Mon Sep 17 00:00:00 2001 From: Shaun Struwig <41984034+Blargian@users.noreply.github.com> Date: Thu, 23 May 2024 15:45:26 +0200 Subject: [PATCH 282/392] Fix formatting in zh/index.md --- docs/zh/index.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/zh/index.md b/docs/zh/index.md index ec4b6dce1f8..c092f296722 100644 --- a/docs/zh/index.md +++ b/docs/zh/index.md @@ -13,10 +13,10 @@ ClickHouse是一个用于联机分析(OLAP)的列式数据库管理系统(DBMS) | Row | WatchID | JavaEnable | Title | GoodEvent | EventTime | |-----|-------------|------------|--------------------|-----------|---------------------| -| #0 | 89354350662 | 1 | Investor Relations | 1 | 2016-05-18 05:19:20 | -| #1 | 90329509958 | 0 | Contact us | 1 | 2016-05-18 08:10:20 | -| #2 | 89953706054 | 1 | Mission | 1 | 2016-05-18 07:38:00 | -| #N | ... | ... | ... | ... | ... | +| #0 | 89354350662 | 1 | Investor Relations | 1 | 2016-05-18 05:19:20 | +| #1 | 90329509958 | 0 | Contact us | 1 | 2016-05-18 08:10:20 | +| #2 | 89953706054 | 1 | Mission | 1 | 2016-05-18 07:38:00 | +| #N | ... | ... | ... | ... | ... | 处于同一行中的数据总是被物理的存储在一起。 @@ -24,13 +24,13 @@ ClickHouse是一个用于联机分析(OLAP)的列式数据库管理系统(DBMS) 在列式数据库系统中,数据按如下的顺序存储: -| Row: | #0 | #1 | #2 | #N | +| Row: | #0 | #1 | #2 | #N | |-------------|---------------------|---------------------|---------------------|-----| -| WatchID: | 89354350662 | 90329509958 | 89953706054 | ... | -| JavaEnable: | 1 | 0 | 1 | ... | -| Title: | Investor Relations | Contact us | Mission | ... 
| -| GoodEvent: | 1 | 1 | 1 | ... | -| EventTime: | 2016-05-18 05:19:20 | 2016-05-18 08:10:20 | 2016-05-18 07:38:00 | ... | +| WatchID: | 89354350662 | 90329509958 | 89953706054 | ... | +| JavaEnable: | 1 | 0 | 1 | ... | +| Title: | Investor Relations | Contact us | Mission | ... | +| GoodEvent: | 1 | 1 | 1 | ... | +| EventTime: | 2016-05-18 05:19:20 | 2016-05-18 08:10:20 | 2016-05-18 07:38:00 | ... | 这些示例只显示了数据的排列顺序。来自不同列的值被单独存储,来自同一列的数据被存储在一起。 From 87b4d43a3f93864c122f7fe2451c696720207809 Mon Sep 17 00:00:00 2001 From: Blargian Date: Thu, 23 May 2024 15:48:20 +0200 Subject: [PATCH 283/392] Update return type formatting --- .../functions/arithmetic-functions.md | 8 +- .../functions/array-functions.md | 86 +++---- .../sql-reference/functions/bit-functions.md | 24 +- .../functions/bitmap-functions.md | 22 +- .../functions/date-time-functions.md | 222 +++++------------- .../functions/distance-functions.md | 58 ++--- .../functions/encoding-functions.md | 38 +-- .../functions/ext-dict-functions.md | 24 +- .../sql-reference/functions/hash-functions.md | 134 +++-------- .../sql-reference/functions/introspection.md | 29 +-- .../functions/ip-address-functions.md | 20 +- .../sql-reference/functions/json-functions.md | 22 +- .../sql-reference/functions/math-functions.md | 4 +- .../functions/other-functions.md | 140 ++++------- .../functions/random-functions.md | 56 ++--- .../functions/rounding-functions.md | 2 +- .../functions/splitting-merging-functions.md | 57 +++-- .../functions/string-functions.md | 100 ++------ .../functions/string-search-functions.md | 64 ++--- .../functions/time-series-functions.md | 14 +- .../functions/time-window-functions.md | 8 +- .../functions/tuple-functions.md | 36 +-- .../functions/tuple-map-functions.md | 16 +- .../functions/type-conversion-functions.md | 8 +- .../sql-reference/functions/ulid-functions.md | 4 +- .../sql-reference/functions/url-functions.md | 36 +-- .../sql-reference/functions/uuid-functions.md | 8 +- 27 files changed, 369 insertions(+), 871 deletions(-) diff --git a/docs/en/sql-reference/functions/arithmetic-functions.md b/docs/en/sql-reference/functions/arithmetic-functions.md index 6d95f3dc358..aef4150ff50 100644 --- a/docs/en/sql-reference/functions/arithmetic-functions.md +++ b/docs/en/sql-reference/functions/arithmetic-functions.md @@ -320,9 +320,7 @@ multiplyDecimal(a, b[, result_scale]) **Returned value** -- The result of multiplication with given scale. - -Type: [Decimal256](../../sql-reference/data-types/decimal.md). +- The result of multiplication with given scale. [Decimal256](../../sql-reference/data-types/decimal.md). **Example** @@ -396,9 +394,7 @@ divideDecimal(a, b[, result_scale]) **Returned value** -- The result of division with given scale. - -Type: [Decimal256](../../sql-reference/data-types/decimal.md). +- The result of division with given scale. [Decimal256](../../sql-reference/data-types/decimal.md). **Example** diff --git a/docs/en/sql-reference/functions/array-functions.md b/docs/en/sql-reference/functions/array-functions.md index 87e733a4b0c..512874d20b7 100644 --- a/docs/en/sql-reference/functions/array-functions.md +++ b/docs/en/sql-reference/functions/array-functions.md @@ -30,9 +30,7 @@ The function also works for [strings](string-functions.md#empty) or [UUID](uuid- **Returned value** -- Returns `1` for an empty array or `0` for a non-empty array. - -Type: [UInt8](../data-types/int-uint.md). +- Returns `1` for an empty array or `0` for a non-empty array. [UInt8](../data-types/int-uint.md). 
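+
+For instance, checking an empty and a non-empty array side by side might look like this:
+
+```sql
+SELECT empty([]) AS is_empty, empty([1, 2, 3]) AS is_not_empty;
+```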
**Example** @@ -74,9 +72,7 @@ The function also works for [strings](string-functions.md#notempty) or [UUID](uu **Returned value** -- Returns `1` for a non-empty array or `0` for an empty array. - -Type: [UInt8](../data-types/int-uint.md). +- Returns `1` for a non-empty array or `0` for an empty array. [UInt8](../data-types/int-uint.md). **Example** @@ -797,9 +793,11 @@ The sizes of the two vectors must be equal. Arrays and Tuples may also contain m **Returned value** -- The dot product of the two vectors. +- The dot product of the two vectors. [Numeric](https://clickhouse.com/docs/en/native-protocol/columns#numeric-types). -Type: numeric - determined by the type of the arguments. If Arrays or Tuples contain mixed element types then the result type is the supertype. +:::note +The return type is determined by the type of the arguments. If Arrays or Tuples contain mixed element types then the result type is the supertype. +::: **Examples** @@ -1186,9 +1184,7 @@ arrayShingles(array, length) **Returned value** -- An array of generated shingles. - -Type: [Array](../../sql-reference/data-types/array.md). +- An array of generated shingles. [Array](../../sql-reference/data-types/array.md). **Examples** @@ -1562,9 +1558,7 @@ arrayDifference(array) **Returned values** -Returns an array of differences between adjacent array elements. - -Type: [UInt\*](https://clickhouse.com/docs/en/data_types/int_uint/#uint-ranges), [Int\*](https://clickhouse.com/docs/en/data_types/int_uint/#int-ranges), [Float\*](https://clickhouse.com/docs/en/data_types/float/). +Returns an array of differences between adjacent array elements. [UInt\*](https://clickhouse.com/docs/en/data_types/int_uint/#uint-ranges), [Int\*](https://clickhouse.com/docs/en/data_types/int_uint/#int-ranges), [Float\*](https://clickhouse.com/docs/en/data_types/float/). **Example** @@ -1841,9 +1835,7 @@ arrayReduceInRanges(agg_func, ranges, arr1, arr2, ..., arrN) **Returned value** -- Array containing results of the aggregate function over specified ranges. - -Type: [Array](../../sql-reference/data-types/array.md). +- Array containing results of the aggregate function over specified ranges. [Array](../../sql-reference/data-types/array.md). **Example** @@ -1986,9 +1978,7 @@ arrayCompact(arr) **Returned value** -The array without duplicate. - -Type: `Array`. +The array without duplicate. [Array](../data-types/array.md). **Example** @@ -2024,9 +2014,7 @@ The function can take any number of arrays of different types. All the input arr **Returned value** -- Array with elements from the source arrays grouped into [tuples](../../sql-reference/data-types/tuple.md). Data types in the tuple are the same as types of the input arrays and in the same order as arrays are passed. - -Type: [Array](../../sql-reference/data-types/array.md). +- Array with elements from the source arrays grouped into [tuples](../../sql-reference/data-types/tuple.md). Data types in the tuple are the same as types of the input arrays and in the same order as arrays are passed. [Array](../../sql-reference/data-types/array.md). **Example** @@ -2383,7 +2371,8 @@ arrayMin([func,] arr) - The minimum of function values (or the array minimum). -Type: if `func` is specified, matches `func` return value type, else matches the array elements type. +:::note +If `func` is specified, then the return type matches the return value type of `func`, otherwise it matches the type of the array elements. 
+:::
**Examples** @@ -2438,7 +2427,9 @@ arrayMax([func,] arr) - The maximum of function values (or the array maximum). -Type: if `func` is specified, matches `func` return value type, else matches the array elements type. +:::note +if `func` is specified then the return type matches the return value type of `func`, otherwise it matches the type of the array elements. +::: **Examples** @@ -2493,7 +2484,14 @@ arraySum([func,] arr) - The sum of the function values (or the array sum). -Type: for decimal numbers in source array (or for converted values, if `func` is specified) — [Decimal128](../../sql-reference/data-types/decimal.md), for floating point numbers — [Float64](../../sql-reference/data-types/float.md), for numeric unsigned — [UInt64](../../sql-reference/data-types/int-uint.md), and for numeric signed — [Int64](../../sql-reference/data-types/int-uint.md). +:::note +Return type: + +- For decimal numbers in the source array (or for converted values, if `func` is specified) — [Decimal128](../../sql-reference/data-types/decimal.md). +- For floating point numbers — [Float64](../../sql-reference/data-types/float.md). +- For numeric unsigned — [UInt64](../../sql-reference/data-types/int-uint.md). +- For numeric signed — [Int64](../../sql-reference/data-types/int-uint.md). +::: **Examples** @@ -2546,9 +2544,7 @@ arrayAvg([func,] arr) **Returned value** -- The average of function values (or the array average). - -Type: [Float64](../../sql-reference/data-types/float.md). +- The average of function values (or the array average). [Float64](../../sql-reference/data-types/float.md). **Examples** @@ -2596,9 +2592,7 @@ arrayCumSum(arr) **Returned value** -- Returns an array of the partial sums of the elements in the source array. - -Type: [UInt\*](https://clickhouse.com/docs/en/data_types/int_uint/#uint-ranges), [Int\*](https://clickhouse.com/docs/en/data_types/int_uint/#int-ranges), [Float\*](https://clickhouse.com/docs/en/data_types/float/). +- Returns an array of the partial sums of the elements in the source array. [UInt\*](https://clickhouse.com/docs/en/data_types/int_uint/#uint-ranges), [Int\*](https://clickhouse.com/docs/en/data_types/int_uint/#int-ranges), [Float\*](https://clickhouse.com/docs/en/data_types/float/). Example: @@ -2630,9 +2624,7 @@ arrayCumSumNonNegative(arr) **Returned value** -- Returns an array of non-negative partial sums of elements in the source array. - -Type: [UInt\*](https://clickhouse.com/docs/en/data_types/int_uint/#uint-ranges), [Int\*](https://clickhouse.com/docs/en/data_types/int_uint/#int-ranges), [Float\*](https://clickhouse.com/docs/en/data_types/float/). +- Returns an array of non-negative partial sums of elements in the source array. [UInt\*](https://clickhouse.com/docs/en/data_types/int_uint/#uint-ranges), [Int\*](https://clickhouse.com/docs/en/data_types/int_uint/#int-ranges), [Float\*](https://clickhouse.com/docs/en/data_types/float/). ``` sql SELECT arrayCumSumNonNegative([1, 1, -4, 1]) AS res @@ -2662,9 +2654,7 @@ arrayProduct(arr) **Returned value** -- A product of array's elements. - -Type: [Float64](../../sql-reference/data-types/float.md). +- A product of array's elements. [Float64](../../sql-reference/data-types/float.md). **Examples** @@ -2714,9 +2704,7 @@ arrayRotateLeft(arr, n) **Returned value** -- An array rotated to the left by the specified number of elements. - -Type: [Array](../../sql-reference/data-types/array.md). +- An array rotated to the left by the specified number of elements. [Array](../../sql-reference/data-types/array.md). 
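+
+As a quick illustration, rotating a five-element array two positions to the left moves the first two elements to the end:
+
+```sql
+SELECT arrayRotateLeft([1, 2, 3, 4, 5], 2) AS res; -- expected: [3, 4, 5, 1, 2]
+```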
**Examples** @@ -2780,9 +2768,7 @@ arrayRotateRight(arr, n) **Returned value** -- An array rotated to the right by the specified number of elements. - -Type: [Array](../../sql-reference/data-types/array.md). +- An array rotated to the right by the specified number of elements. [Array](../../sql-reference/data-types/array.md). **Examples** @@ -2848,9 +2834,7 @@ arrayShiftLeft(arr, n[, default]) **Returned value** -- An array shifted to the left by the specified number of elements. - -Type: [Array](../../sql-reference/data-types/array.md). +- An array shifted to the left by the specified number of elements. [Array](../../sql-reference/data-types/array.md). **Examples** @@ -2944,9 +2928,7 @@ arrayShiftRight(arr, n[, default]) **Returned value** -- An array shifted to the right by the specified number of elements. - -Type: [Array](../../sql-reference/data-types/array.md). +- An array shifted to the right by the specified number of elements. [Array](../../sql-reference/data-types/array.md). **Examples** @@ -3038,9 +3020,7 @@ arrayRandomSample(arr, samples) **Returned Value** -- An array containing a random sample of elements from the input array. - -Type: [Array](../data-types/array.md). +- An array containing a random sample of elements from the input array. [Array](../data-types/array.md). **Examples** diff --git a/docs/en/sql-reference/functions/bit-functions.md b/docs/en/sql-reference/functions/bit-functions.md index 0951c783aae..709f438d67f 100644 --- a/docs/en/sql-reference/functions/bit-functions.md +++ b/docs/en/sql-reference/functions/bit-functions.md @@ -188,9 +188,7 @@ SELECT bitTest(number, index) **Returned values** -Returns a value of bit at specified position. - -Type: `UInt8`. +Returns a value of bit at specified position. [UInt8](../data-types/int-uint.md). **Example** @@ -253,9 +251,7 @@ SELECT bitTestAll(number, index1, index2, index3, index4, ...) **Returned values** -Returns result of logical conjuction. - -Type: `UInt8`. +Returns result of logical conjuction. [UInt8](../data-types/int-uint.md). **Example** @@ -318,9 +314,7 @@ SELECT bitTestAny(number, index1, index2, index3, index4, ...) **Returned values** -Returns result of logical disjunction. - -Type: `UInt8`. +Returns result of logical disjunction. [UInt8](../data-types/int-uint.md). **Example** @@ -372,11 +366,11 @@ bitCount(x) **Returned value** -- Number of bits set to one in the input number. +- Number of bits set to one in the input number. [UInt8](../data-types/int-uint.md). -The function does not convert input value to a larger type ([sign extension](https://en.wikipedia.org/wiki/Sign_extension)). So, for example, `bitCount(toUInt8(-1)) = 8`. - -Type: `UInt8`. +:::note +The function does not convert the input value to a larger type ([sign extension](https://en.wikipedia.org/wiki/Sign_extension)). So, for example, `bitCount(toUInt8(-1)) = 8`. +::: **Example** @@ -413,9 +407,7 @@ bitHammingDistance(int1, int2) **Returned value** -- The Hamming distance. - -Type: [UInt8](../../sql-reference/data-types/int-uint.md). +- The Hamming distance. [UInt8](../../sql-reference/data-types/int-uint.md). **Examples** diff --git a/docs/en/sql-reference/functions/bitmap-functions.md b/docs/en/sql-reference/functions/bitmap-functions.md index 379be302881..e546de039da 100644 --- a/docs/en/sql-reference/functions/bitmap-functions.md +++ b/docs/en/sql-reference/functions/bitmap-functions.md @@ -75,8 +75,8 @@ bitmapSubsetInRange(bitmap, range_start, range_end) **Arguments** - `bitmap` – [Bitmap object](#bitmap_functions-bitmapbuild). 
-- `range_start` – Start of the range (inclusive). Type: [UInt32](../../sql-reference/data-types/int-uint.md). -- `range_end` – End of the range (exclusive). Type: [UInt32](../../sql-reference/data-types/int-uint.md). +- `range_start` – Start of the range (inclusive). [UInt32](../../sql-reference/data-types/int-uint.md). +- `range_end` – End of the range (exclusive). [UInt32](../../sql-reference/data-types/int-uint.md). **Example** @@ -105,8 +105,8 @@ bitmapSubsetLimit(bitmap, range_start, cardinality_limit) **Arguments** - `bitmap` – [Bitmap object](#bitmap_functions-bitmapbuild). -- `range_start` – Start of the range (inclusive). Type: [UInt32](../../sql-reference/data-types/int-uint.md). -- `cardinality_limit` – Maximum cardinality of the subset. Type: [UInt32](../../sql-reference/data-types/int-uint.md). +- `range_start` – Start of the range (inclusive). [UInt32](../../sql-reference/data-types/int-uint.md). +- `cardinality_limit` – Maximum cardinality of the subset. [UInt32](../../sql-reference/data-types/int-uint.md). **Example** @@ -134,9 +134,9 @@ subBitmap(bitmap, offset, cardinality_limit) **Arguments** -- `bitmap` – The bitmap. Type: [Bitmap object](#bitmap_functions-bitmapbuild). -- `offset` – The position of the first element of the subset. Type: [UInt32](../../sql-reference/data-types/int-uint.md). -- `cardinality_limit` – The maximum number of elements in the subset. Type: [UInt32](../../sql-reference/data-types/int-uint.md). +- `bitmap` – The bitmap. [Bitmap object](#bitmap_functions-bitmapbuild). +- `offset` – The position of the first element of the subset. [UInt32](../../sql-reference/data-types/int-uint.md). +- `cardinality_limit` – The maximum number of elements in the subset. [UInt32](../../sql-reference/data-types/int-uint.md). **Example** @@ -163,14 +163,12 @@ bitmapContains(bitmap, needle) **Arguments** - `bitmap` – [Bitmap object](#bitmap_functions-bitmapbuild). -- `needle` – Searched bit value. Type: [UInt32](../../sql-reference/data-types/int-uint.md). +- `needle` – Searched bit value. [UInt32](../../sql-reference/data-types/int-uint.md). **Returned values** -- 0 — If `bitmap` does not contain `needle`. -- 1 — If `bitmap` contains `needle`. - -Type: `UInt8`. +- 0 — If `bitmap` does not contain `needle`. [UInt8](../data-types/int-uint.md). +- 1 — If `bitmap` contains `needle`. [UInt8](../data-types/int-uint.md). **Example** diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index 843f22e5a6f..7de402d2349 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -50,9 +50,7 @@ Alias: **Returned value** -- A date created from the arguments. - -Type: [Date](../../sql-reference/data-types/date.md). +- A date created from the arguments. [Date](../../sql-reference/data-types/date.md). **Example** @@ -109,9 +107,7 @@ makeDateTime(year, month, day, hour, minute, second[, timezone]) **Returned value** -- A date with time created from the arguments. - -Type: [DateTime](../../sql-reference/data-types/datetime.md). +- A date with time created from the arguments. [DateTime](../../sql-reference/data-types/datetime.md). **Example** @@ -152,7 +148,7 @@ Alias: `TIMESTAMP` **Arguments** -- `expr` - Date or date with time. Type: [String](../../sql-reference/data-types/string.md). +- `expr` - Date or date with time. [String](../../sql-reference/data-types/string.md). - `expr_time` - Optional parameter. Time to add. 
[String](../../sql-reference/data-types/string.md). **Examples** @@ -200,9 +196,7 @@ Alias: `timezone`. **Returned value** -- Timezone. - -Type: [String](../../sql-reference/data-types/string.md). +- Timezone. [String](../../sql-reference/data-types/string.md). **Example** @@ -237,9 +231,7 @@ Alias: `serverTimezone`. **Returned value** -- Timezone. - -Type: [String](../../sql-reference/data-types/string.md). +- Timezone. [String](../../sql-reference/data-types/string.md). **Example** @@ -278,9 +270,7 @@ Alias: `toTimezone`. **Returned value** -- Date and time. - -Type: [DateTime](../../sql-reference/data-types/datetime.md). +- Date and time. [DateTime](../../sql-reference/data-types/datetime.md). **Example** @@ -336,9 +326,7 @@ Alias: `timezoneOf`. **Returned value** -- Timezone name. - -Type: [String](../../sql-reference/data-types/string.md). +- Timezone name. [String](../../sql-reference/data-types/string.md). **Example** @@ -373,9 +361,7 @@ Alias: `timezoneOffset`. **Returned value** -- Offset from UTC in seconds. - -Type: [Int32](../../sql-reference/data-types/int-uint.md). +- Offset from UTC in seconds. [Int32](../../sql-reference/data-types/int-uint.md). **Example** @@ -410,9 +396,7 @@ Alias: `YEAR` **Returned value** -- The year of the given date/time - -Type: `UInt16` +- The year of the given date/time. [UInt16](../data-types/int-uint.md). **Example** @@ -446,9 +430,7 @@ Alias: `QUARTER` **Returned value** -- The quarter of the year (1, 2, 3 or 4) of the given date/time - -Type: `UInt8` +- The quarter of the year (1, 2, 3 or 4) of the given date/time. [UInt8](../data-types/int-uint.md). **Example** @@ -482,9 +464,7 @@ Alias: `MONTH` **Returned value** -- The month of the year (1 - 12) of the given date/time - -Type: `UInt8` +- The month of the year (1 - 12) of the given date/time. [UInt8](../data-types/int-uint.md). **Example** @@ -518,9 +498,7 @@ Alias: `DAYOFYEAR` **Returned value** -- The day of the year (1 - 366) of the given date/time - -Type: `UInt16` +- The day of the year (1 - 366) of the given date/time. [UInt16](../data-types/int-uint.md). **Example** @@ -554,9 +532,7 @@ Aliases: `DAYOFMONTH`, `DAY` **Returned value** -- The day of the month (1 - 31) of the given date/time - -Type: `UInt8` +- The day of the month (1 - 31) of the given date/time. [UInt8](../data-types/int-uint.md). **Example** @@ -643,9 +619,7 @@ Alias: `HOUR` **Returned value** -- The hour of the day (0 - 23) of the given date/time - -Type: `UInt8` +- The hour of the day (0 - 23) of the given date/time. [UInt8](../data-types/int-uint.md). **Example** @@ -679,9 +653,7 @@ Alias: `MINUTE` **Returned value** -- The minute of the hour (0 - 59) of the given date/time - -Type: `UInt8` +- The minute of the hour (0 - 59) of the given date/time. [UInt8](../data-types/int-uint.md). **Example** @@ -715,9 +687,7 @@ Alias: `SECOND` **Returned value** -- The second in the minute (0 - 59) of the given date/time - -Type: `UInt8` +- The second in the minute (0 - 59) of the given date/time. [UInt8](../data-types/int-uint.md). **Example** @@ -763,9 +733,7 @@ Result: **Returned value** -- The millisecond in the minute (0 - 59) of the given date/time - -Type: `UInt16` +- The millisecond in the minute (0 - 59) of the given date/time. [UInt16](../data-types/int-uint.md). ## toUnixTimestamp @@ -782,9 +750,7 @@ toUnixTimestamp(str, [timezone]) **Returned value** -- Returns the unix timestamp. - -Type: `UInt32`. +- Returns the unix timestamp. [UInt32](../data-types/int-uint.md). 
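+
+For example, converting a date-time string in an explicit time zone (the exact value depends on the time zone used):
+
+```sql
+SELECT toUnixTimestamp('2017-11-05 08:07:47', 'UTC') AS unix_ts;
+```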
**Example** @@ -842,9 +808,7 @@ toStartOfYear(value) **Returned value** -- The first day of the year of the input date/time - -Type: `Date` +- The first day of the year of the input date/time. [Date](../data-types/date.md). **Example** @@ -876,9 +840,7 @@ toStartOfISOYear(value) **Returned value** -- The first day of the year of the input date/time - -Type: `Date` +- The first day of the year of the input date/time. [Date](../data-types/date.md). **Example** @@ -911,9 +873,7 @@ toStartOfQuarter(value) **Returned value** -- The first day of the quarter of the given date/time - -Type: `Date` +- The first day of the quarter of the given date/time. [Date](../data-types/date.md). **Example** @@ -945,9 +905,7 @@ toStartOfMonth(value) **Returned value** -- The first day of the month of the given date/time - -Type: `Date` +- The first day of the month of the given date/time. [Date](../data-types/date.md). **Example** @@ -985,9 +943,7 @@ Alias: `LAST_DAY` **Returned value** -- The last day of the month of the given date/time - -Type: `Date` +- The last day of the month of the given date/time=. [Date](../data-types/date.md). **Example** @@ -1019,9 +975,7 @@ toMonday(value) **Returned value** -- The date of the nearest Monday on or prior to the given date - -Type: `Date` +- The date of the nearest Monday on or prior to the given date. [Date](../data-types/date.md). **Example** @@ -1057,9 +1011,7 @@ toStartOfWeek(t[, mode[, timezone]]) **Returned value** -- The date of the nearest Sunday or Monday on or prior to the given date, depending on the mode - -Type: `Date` +- The date of the nearest Sunday or Monday on or prior to the given date, depending on the mode. [Date](../data-types/date.md). **Example** @@ -1102,9 +1054,7 @@ toLastDayOfWeek(t[, mode[, timezone]]) **Returned value** -- The date of the nearest Sunday or Monday on or after the given date, depending on the mode - -Type: `Date` +- The date of the nearest Sunday or Monday on or after the given date, depending on the mode. [Date](../data-types/date.md). **Example** @@ -1144,9 +1094,7 @@ toStartOfDay(value) **Returned value** -- The start of the day of the given date/time - -Type: `DateTime` +- The start of the day of the given date/time. [DateTime](../data-types/datetime.md). **Example** @@ -1178,9 +1126,7 @@ toStartOfHour(value) **Returned value** -- The start of the hour of the given date/time - -Type: `DateTime` +- The start of the hour of the given date/time. [DateTime](../data-types/datetime.md). **Example** @@ -1214,9 +1160,7 @@ toStartOfMinute(value) **Returned value** -- The start of the minute of the given date/time - -Type: `DateTime` +- The start of the minute of the given date/time. [DateTime](../data-types/datetime.md). **Example** @@ -1253,9 +1197,7 @@ toStartOfSecond(value, [timezone]) **Returned value** -- Input value without sub-seconds. - -Type: [DateTime64](../../sql-reference/data-types/datetime64.md). +- Input value without sub-seconds. [DateTime64](../../sql-reference/data-types/datetime64.md). **Examples** @@ -1309,9 +1251,7 @@ toStartOfFiveMinutes(value) **Returned value** -- The start of the five-minute interval of the given date/time - -Type: `DateTime` +- The start of the five-minute interval of the given date/time. [DateTime](../data-types/datetime.md). **Example** @@ -1349,9 +1289,7 @@ toStartOfTenMinutes(value) **Returned value** -- The start of the ten-minute interval of the given date/time - -Type: `DateTime` +- The start of the ten-minute interval of the given date/time. [DateTime](../data-types/datetime.md). 
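+
+For instance, any time between 10:20:00 and 10:29:59 is rounded down to 10:20:00:
+
+```sql
+SELECT toStartOfTenMinutes(toDateTime('2023-04-21 10:23:00')) AS res; -- expected: 2023-04-21 10:20:00
+```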
**Example** @@ -1389,9 +1327,7 @@ toStartOfFifteenMinutes(value) **Returned value** -- The start of the fifteen-minute interval of the given date/time - -Type: `DateTime` +- The start of the fifteen-minute interval of the given date/time. [DateTime](../data-types/datetime.md). **Example** @@ -1603,9 +1539,7 @@ Alias: `TO_DAYS` **Returned value** -The number of days passed since date 0000-01-01. - -Type: [UInt32](../../sql-reference/data-types/int-uint.md). +The number of days passed since date 0000-01-01. [UInt32](../../sql-reference/data-types/int-uint.md). **Example** @@ -1645,9 +1579,7 @@ Alias: `FROM_DAYS` **Returned value** -The date corresponding to the number of days passed since year zero. - -Type: [Date](../../sql-reference/data-types/date.md). +The date corresponding to the number of days passed since year zero. [Date](../../sql-reference/data-types/date.md). **Example** @@ -1709,9 +1641,7 @@ age('unit', startdate, enddate, [timezone]) **Returned value** -Difference between `enddate` and `startdate` expressed in `unit`. - -Type: [Int](../../sql-reference/data-types/int-uint.md). +Difference between `enddate` and `startdate` expressed in `unit`. [Int](../../sql-reference/data-types/int-uint.md). **Example** @@ -1787,9 +1717,7 @@ Aliases: `dateDiff`, `DATE_DIFF`, `timestampDiff`, `timestamp_diff`, `TIMESTAMP_ **Returned value** -Difference between `enddate` and `startdate` expressed in `unit`. - -Type: [Int](../../sql-reference/data-types/int-uint.md). +Difference between `enddate` and `startdate` expressed in `unit`. [Int](../../sql-reference/data-types/int-uint.md). **Example** @@ -1858,9 +1786,7 @@ Alias: `dateTrunc`. **Returned value** -- Value, truncated to the specified part of date. - -Type: [DateTime](../../sql-reference/data-types/datetime.md). +- Value, truncated to the specified part of date. [DateTime](../../sql-reference/data-types/datetime.md). **Example** @@ -1935,9 +1861,7 @@ Aliases: `dateAdd`, `DATE_ADD`. **Returned value** -Date or date with time obtained by adding `value`, expressed in `unit`, to `date`. - -Type: [Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md) or [DateTime64](../../sql-reference/data-types/datetime64.md). +Date or date with time obtained by adding `value`, expressed in `unit`, to `date`. [Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md) or [DateTime64](../../sql-reference/data-types/datetime64.md). **Example** @@ -2012,9 +1936,7 @@ Aliases: `dateSub`, `DATE_SUB`. **Returned value** -Date or date with time obtained by subtracting `value`, expressed in `unit`, from `date`. - -Type: [Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md) or [DateTime64](../../sql-reference/data-types/datetime64.md). +Date or date with time obtained by subtracting `value`, expressed in `unit`, from `date`. [Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md) or [DateTime64](../../sql-reference/data-types/datetime64.md). **Example** @@ -2079,9 +2001,7 @@ Aliases: `timeStampAdd`, `TIMESTAMP_ADD`. **Returned value** -Date or date with time with the specified `value` expressed in `unit` added to `date`. 
- -Type: [Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md) or [DateTime64](../../sql-reference/data-types/datetime64.md). +Date or date with time with the specified `value` expressed in `unit` added to `date`. [Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md) or [DateTime64](../../sql-reference/data-types/datetime64.md). **Example** @@ -2130,9 +2050,7 @@ Aliases: `timeStampSub`, `TIMESTAMP_SUB`. **Returned value** -Date or date with time obtained by subtracting `value`, expressed in `unit`, from `date`. - -Type: [Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md) or [DateTime64](../../sql-reference/data-types/datetime64.md). +Date or date with time obtained by subtracting `value`, expressed in `unit`, from `date`. [Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md) or [DateTime64](../../sql-reference/data-types/datetime64.md). **Example** @@ -2167,9 +2085,7 @@ addDate(date, interval) **Returned value** -Date or date with time obtained by adding `interval` to `date`. - -Type: [Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md) or [DateTime64](../../sql-reference/data-types/datetime64.md). +Date or date with time obtained by adding `interval` to `date`. [Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md) or [DateTime64](../../sql-reference/data-types/datetime64.md). **Example** @@ -2210,9 +2126,7 @@ subDate(date, interval) **Returned value** -Date or date with time obtained by subtracting `interval` from `date`. - -Type: [Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md) or [DateTime64](../../sql-reference/data-types/datetime64.md). +Date or date with time obtained by subtracting `interval` from `date`. [Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md) or [DateTime64](../../sql-reference/data-types/datetime64.md). **Example** @@ -2252,9 +2166,7 @@ now([timezone]) **Returned value** -- Current date and time. - -Type: [DateTime](../../sql-reference/data-types/datetime.md). +- Current date and time. [DateTime](../../sql-reference/data-types/datetime.md). **Example** @@ -2303,9 +2215,7 @@ now64([scale], [timezone]) **Returned value** -- Current date and time with sub-second precision. - -Type: [DateTime64](../../sql-reference/data-types/datetime64.md). +- Current date and time with sub-second precision. [DateTime64](../../sql-reference/data-types/datetime64.md). **Example** @@ -2339,9 +2249,7 @@ nowInBlock([timezone]) **Returned value** -- Current date and time at the moment of processing of each block of data. - -Type: [DateTime](../../sql-reference/data-types/datetime.md). +- Current date and time at the moment of processing of each block of data. [DateTime](../../sql-reference/data-types/datetime.md). 
**Example** @@ -2381,9 +2289,7 @@ today() **Returned value** -- Current date - -Type: [DateTime](../../sql-reference/data-types/datetime.md). +- Current date. [DateTime](../../sql-reference/data-types/datetime.md). **Example** @@ -2491,9 +2397,7 @@ YYYYMMDDToDate(yyyymmdd); **Returned value** -- a date created from the arguments. - -Type: [Date](../../sql-reference/data-types/date.md). +- a date created from the arguments. [Date](../../sql-reference/data-types/date.md). **Example** @@ -2534,9 +2438,7 @@ YYYYMMDDhhmmssToDateTime(yyyymmddhhmmss[, timezone]); **Returned value** -- a date with time created from the arguments. - -Type: [DateTime](../../sql-reference/data-types/datetime.md). +- a date with time created from the arguments. [DateTime](../../sql-reference/data-types/datetime.md). **Example** @@ -3743,9 +3645,7 @@ dateName(date_part, date) **Returned value** -- The specified part of date. - -Type: [String](../../sql-reference/data-types/string.md#string) +- The specified part of date. [String](../../sql-reference/data-types/string.md#string) **Example** @@ -3781,9 +3681,7 @@ monthName(date) **Returned value** -- The name of the month. - -Type: [String](../../sql-reference/data-types/string.md#string) +- The name of the month. [String](../../sql-reference/data-types/string.md#string) **Example** @@ -3878,9 +3776,7 @@ toModifiedJulianDay(date) **Returned value** -- Modified Julian Day number. - -Type: [Int32](../../sql-reference/data-types/int-uint.md). +- Modified Julian Day number. [Int32](../../sql-reference/data-types/int-uint.md). **Example** @@ -3912,9 +3808,7 @@ toModifiedJulianDayOrNull(date) **Returned value** -- Modified Julian Day number. - -Type: [Nullable(Int32)](../../sql-reference/data-types/int-uint.md). +- Modified Julian Day number. [Nullable(Int32)](../../sql-reference/data-types/int-uint.md). **Example** @@ -3946,9 +3840,7 @@ fromModifiedJulianDay(day) **Returned value** -- Date in text form. - -Type: [String](../../sql-reference/data-types/string.md) +- Date in text form. [String](../../sql-reference/data-types/string.md) **Example** @@ -3980,9 +3872,7 @@ fromModifiedJulianDayOrNull(day) **Returned value** -- Date in text form. - -Type: [Nullable(String)](../../sql-reference/data-types/string.md) +- Date in text form. [Nullable(String)](../../sql-reference/data-types/string.md) **Example** diff --git a/docs/en/sql-reference/functions/distance-functions.md b/docs/en/sql-reference/functions/distance-functions.md index 5f3514049c7..9fda491ac50 100644 --- a/docs/en/sql-reference/functions/distance-functions.md +++ b/docs/en/sql-reference/functions/distance-functions.md @@ -24,9 +24,7 @@ Alias: `normL1`. **Returned value** -- L1-norm or [taxicab geometry](https://en.wikipedia.org/wiki/Taxicab_geometry) distance. - -Type: [UInt](../../sql-reference/data-types/int-uint.md), [Float](../../sql-reference/data-types/float.md) or [Decimal](../../sql-reference/data-types/decimal.md). +- L1-norm or [taxicab geometry](https://en.wikipedia.org/wiki/Taxicab_geometry) distance. [UInt](../../sql-reference/data-types/int-uint.md), [Float](../../sql-reference/data-types/float.md) or [Decimal](../../sql-reference/data-types/decimal.md). **Examples** @@ -62,9 +60,7 @@ Alias: `normL2`. **Returned value** -- L2-norm or [Euclidean distance](https://en.wikipedia.org/wiki/Euclidean_distance). - -Type: [Float](../../sql-reference/data-types/float.md). +- L2-norm or [Euclidean distance](https://en.wikipedia.org/wiki/Euclidean_distance). [Float](../../sql-reference/data-types/float.md). 
**Example** @@ -99,9 +95,7 @@ Alias: `normL2Squared`. **Returned value** -- L2-norm squared. - -Type: [Float](../../sql-reference/data-types/float.md). +- L2-norm squared. [Float](../../sql-reference/data-types/float.md). **Example** @@ -137,9 +131,7 @@ Alias: `normLinf`. **Returned value** -- Linf-norm or the maximum absolute value. - -Type: [Float](../../sql-reference/data-types/float.md). +- Linf-norm or the maximum absolute value. [Float](../../sql-reference/data-types/float.md). **Example** @@ -176,9 +168,7 @@ Alias: `normLp`. **Returned value** -- [Lp-norm](https://en.wikipedia.org/wiki/Norm_(mathematics)#p-norm) - -Type: [Float](../../sql-reference/data-types/float.md). +- [Lp-norm](https://en.wikipedia.org/wiki/Norm_(mathematics)#p-norm). [Float](../../sql-reference/data-types/float.md). **Example** @@ -215,9 +205,7 @@ Alias: `distanceL1`. **Returned value** -- 1-norm distance. - -Type: [Float](../../sql-reference/data-types/float.md). +- 1-norm distance. [Float](../../sql-reference/data-types/float.md). **Example** @@ -254,9 +242,7 @@ Alias: `distanceL2`. **Returned value** -- 2-norm distance. - -Type: [Float](../../sql-reference/data-types/float.md). +- 2-norm distance. [Float](../../sql-reference/data-types/float.md). **Example** @@ -293,7 +279,7 @@ Alias: `distanceL2Squared`. **Returned value** -Type: [Float](../../sql-reference/data-types/float.md). +- Sum of the squares of the difference between the corresponding elements of two vectors. [Float](../../sql-reference/data-types/float.md). **Example** @@ -330,9 +316,7 @@ Alias: `distanceLinf`. **Returned value** -- Infinity-norm distance. - -Type: [Float](../../sql-reference/data-types/float.md). +- Infinity-norm distance. [Float](../../sql-reference/data-types/float.md). **Example** @@ -370,9 +354,7 @@ Alias: `distanceLp`. **Returned value** -- p-norm distance. - -Type: [Float](../../sql-reference/data-types/float.md). +- p-norm distance. [Float](../../sql-reference/data-types/float.md). **Example** @@ -409,9 +391,7 @@ Alias: `normalizeL1`. **Returned value** -- Unit vector. - -Type: [Tuple](../../sql-reference/data-types/tuple.md) of [Float](../../sql-reference/data-types/float.md). +- Unit vector. [Tuple](../../sql-reference/data-types/tuple.md) of [Float](../../sql-reference/data-types/float.md). **Example** @@ -447,9 +427,7 @@ Alias: `normalizeL1`. **Returned value** -- Unit vector. - -Type: [Tuple](../../sql-reference/data-types/tuple.md) of [Float](../../sql-reference/data-types/float.md). +- Unit vector. [Tuple](../../sql-reference/data-types/tuple.md) of [Float](../../sql-reference/data-types/float.md). **Example** @@ -485,9 +463,7 @@ Alias: `normalizeLinf `. **Returned value** -- Unit vector. - -Type: [Tuple](../../sql-reference/data-types/tuple.md) of [Float](../../sql-reference/data-types/float.md). +- Unit vector. [Tuple](../../sql-reference/data-types/tuple.md) of [Float](../../sql-reference/data-types/float.md). **Example** @@ -524,9 +500,7 @@ Alias: `normalizeLp `. **Returned value** -- Unit vector. - -Type: [Tuple](../../sql-reference/data-types/tuple.md) of [Float](../../sql-reference/data-types/float.md). +- Unit vector. [Tuple](../../sql-reference/data-types/tuple.md) of [Float](../../sql-reference/data-types/float.md). **Example** @@ -561,9 +535,7 @@ cosineDistance(vector1, vector2) **Returned value** -- Cosine of the angle between two vectors subtracted from one. - -Type: [Float](../../sql-reference/data-types/float.md). +- Cosine of the angle between two vectors subtracted from one. 
[Float](../../sql-reference/data-types/float.md). **Examples** diff --git a/docs/en/sql-reference/functions/encoding-functions.md b/docs/en/sql-reference/functions/encoding-functions.md index 4f6da764b3c..bc64fdea427 100644 --- a/docs/en/sql-reference/functions/encoding-functions.md +++ b/docs/en/sql-reference/functions/encoding-functions.md @@ -22,9 +22,7 @@ char(number_1, [number_2, ..., number_n]); **Returned value** -- a string of given bytes. - -Type: `String`. +- a string of given bytes. [String](../data-types/string.md). **Example** @@ -102,9 +100,7 @@ Values of [UUID](../data-types/uuid.md) type are encoded as big-endian order str **Returned value** -- A string with the hexadecimal representation of the argument. - -Type: [String](../../sql-reference/data-types/string.md). +- A string with the hexadecimal representation of the argument. [String](../../sql-reference/data-types/string.md). **Examples** @@ -185,15 +181,13 @@ unhex(arg) **Arguments** -- `arg` — A string containing any number of hexadecimal digits. Type: [String](../../sql-reference/data-types/string.md), [FixedString](../../sql-reference/data-types/fixedstring.md). +- `arg` — A string containing any number of hexadecimal digits. [String](../../sql-reference/data-types/string.md), [FixedString](../../sql-reference/data-types/fixedstring.md). Supports both uppercase and lowercase letters `A-F`. The number of hexadecimal digits does not have to be even. If it is odd, the last digit is interpreted as the least significant half of the `00-0F` byte. If the argument string contains anything other than hexadecimal digits, some implementation-defined result is returned (an exception isn’t thrown). For a numeric argument the inverse of hex(N) is not performed by unhex(). **Returned value** -- A binary string (BLOB). - -Type: [String](../../sql-reference/data-types/string.md). +- A binary string (BLOB). [String](../../sql-reference/data-types/string.md). **Example** @@ -251,9 +245,7 @@ Values of [UUID](../data-types/uuid.md) type are encoded as big-endian order str **Returned value** -- A string with the binary representation of the argument. - -Type: [String](../../sql-reference/data-types/string.md). +- A string with the binary representation of the argument. [String](../../sql-reference/data-types/string.md). **Examples** @@ -342,9 +334,7 @@ Supports binary digits `0` and `1`. The number of binary digits does not have to **Returned value** -- A binary string (BLOB). - -Type: [String](../../sql-reference/data-types/string.md). +- A binary string (BLOB). [String](../../sql-reference/data-types/string.md). **Examples** @@ -400,9 +390,7 @@ bitPositionsToArray(arg) **Returned value** -- An array containing a list of positions of bits that equal `1`, in ascending order. - -Type: [Array](../../sql-reference/data-types/array.md)([UInt64](../../sql-reference/data-types/int-uint.md)). +- An array containing a list of positions of bits that equal `1`, in ascending order. [Array](../../sql-reference/data-types/array.md)([UInt64](../../sql-reference/data-types/int-uint.md)). **Example** @@ -458,9 +446,7 @@ mortonEncode(args) **Returned value** -- A UInt64 code - -Type: [UInt64](../../sql-reference/data-types/int-uint.md) +- A UInt64 code. [UInt64](../../sql-reference/data-types/int-uint.md) **Example** @@ -500,9 +486,7 @@ Note: when using columns for `args` the provided `range_mask` tuple should still **Returned value** -- A UInt64 code - -Type: [UInt64](../../sql-reference/data-types/int-uint.md) +- A UInt64 code. 
[UInt64](../../sql-reference/data-types/int-uint.md) **Example** @@ -621,9 +605,7 @@ mortonDecode(tuple_size, code) **Returned value** -- [tuple](../../sql-reference/data-types/tuple.md) of the specified size. - -Type: [UInt64](../../sql-reference/data-types/int-uint.md) +- [tuple](../../sql-reference/data-types/tuple.md) of the specified size. [UInt64](../../sql-reference/data-types/int-uint.md) **Example** diff --git a/docs/en/sql-reference/functions/ext-dict-functions.md b/docs/en/sql-reference/functions/ext-dict-functions.md index 4149afce044..41657aafbbe 100644 --- a/docs/en/sql-reference/functions/ext-dict-functions.md +++ b/docs/en/sql-reference/functions/ext-dict-functions.md @@ -243,10 +243,8 @@ dictHas('dict_name', id_expr) **Returned value** -- 0, if there is no key. -- 1, if there is a key. - -Type: `UInt8`. +- 0, if there is no key. [UInt8](../data-types/int-uint.md). +- 1, if there is a key. [UInt8](../data-types/int-uint.md). ## dictGetHierarchy @@ -265,9 +263,7 @@ dictGetHierarchy('dict_name', key) **Returned value** -- Parents for the key. - -Type: [Array(UInt64)](../../sql-reference/data-types/array.md). +- Parents for the key. [Array(UInt64)](../../sql-reference/data-types/array.md). ## dictIsIn @@ -285,10 +281,8 @@ dictIsIn('dict_name', child_id_expr, ancestor_id_expr) **Returned value** -- 0, if `child_id_expr` is not a child of `ancestor_id_expr`. -- 1, if `child_id_expr` is a child of `ancestor_id_expr` or if `child_id_expr` is an `ancestor_id_expr`. - -Type: `UInt8`. +- 0, if `child_id_expr` is not a child of `ancestor_id_expr`. [UInt8](../data-types/int-uint.md). +- 1, if `child_id_expr` is a child of `ancestor_id_expr` or if `child_id_expr` is an `ancestor_id_expr`. [UInt8](../data-types/int-uint.md). ## dictGetChildren @@ -307,9 +301,7 @@ dictGetChildren(dict_name, key) **Returned values** -- First-level descendants for the key. - -Type: [Array](../../sql-reference/data-types/array.md)([UInt64](../../sql-reference/data-types/int-uint.md)). +- First-level descendants for the key. [Array](../../sql-reference/data-types/array.md)([UInt64](../../sql-reference/data-types/int-uint.md)). **Example** @@ -357,9 +349,7 @@ dictGetDescendants(dict_name, key, level) **Returned values** -- Descendants for the key. - -Type: [Array](../../sql-reference/data-types/array.md)([UInt64](../../sql-reference/data-types/int-uint.md)). +- Descendants for the key. [Array](../../sql-reference/data-types/array.md)([UInt64](../../sql-reference/data-types/int-uint.md)). **Example** diff --git a/docs/en/sql-reference/functions/hash-functions.md b/docs/en/sql-reference/functions/hash-functions.md index 1cd7eeb7c83..89b95888f85 100644 --- a/docs/en/sql-reference/functions/hash-functions.md +++ b/docs/en/sql-reference/functions/hash-functions.md @@ -341,9 +341,7 @@ Even in these cases, we recommend applying the function offline and pre-calculat **Returned value** -- SHA hash as a hex-unencoded FixedString. SHA-1 returns as FixedString(20), SHA-224 as FixedString(28), SHA-256 — FixedString(32), SHA-512 — FixedString(64). - -Type: [FixedString](/docs/en/sql-reference/data-types/fixedstring.md). +- SHA hash as a hex-unencoded FixedString. SHA-1 returns as FixedString(20), SHA-224 as FixedString(28), SHA-256 — FixedString(32), SHA-512 — FixedString(64). [FixedString](/docs/en/sql-reference/data-types/fixedstring.md). **Example** @@ -381,9 +379,7 @@ This cryptographic hash-function is integrated into ClickHouse with BLAKE3 Rust **Return value** -- BLAKE3 hash as a byte array with type FixedString(32). 
-
-Type: [FixedString](/docs/en/sql-reference/data-types/fixedstring.md).
+- BLAKE3 hash as a byte array with type FixedString(32). [FixedString](/docs/en/sql-reference/data-types/fixedstring.md).

**Example**

@@ -540,9 +536,7 @@ This is just [JavaHash](#javahash) with zeroed out sign bit. This function is us

**Returned value**

-A `Int32` data type hash value.
-
-Type: `hiveHash`.
+- `hiveHash` hash value. [Int32](../data-types/int-uint.md).

**Example**

@@ -679,9 +673,7 @@ gccMurmurHash(par1, ...)

**Returned value**

-- Calculated hash value.
-
-Type: [UInt64](/docs/en/sql-reference/data-types/int-uint.md).
+- Calculated hash value. [UInt64](/docs/en/sql-reference/data-types/int-uint.md).

**Example**

@@ -718,9 +710,7 @@ MurmurHash(par1, ...)

**Returned value**

-- Calculated hash value.
-
-Type: [UInt32](/docs/en/sql-reference/data-types/int-uint.md).
+- Calculated hash value. [UInt32](/docs/en/sql-reference/data-types/int-uint.md).

**Example**

@@ -786,9 +776,7 @@ murmurHash3_128(expr)

**Returned value**

-A 128-bit `MurmurHash3` hash value.
-
-Type: [FixedString(16)](/docs/en/sql-reference/data-types/fixedstring.md).
+A 128-bit `MurmurHash3` hash value. [FixedString(16)](/docs/en/sql-reference/data-types/fixedstring.md).

**Example**

@@ -822,9 +810,7 @@ xxh3(expr)

**Returned value**

-A 64-bit `xxh3` hash value.
-
-Type: [UInt64](/docs/en/sql-reference/data-types/int-uint.md).
+A 64-bit `xxh3` hash value. [UInt64](/docs/en/sql-reference/data-types/int-uint.md).

**Example**

@@ -856,9 +842,11 @@ SELECT xxHash64('')

**Returned value**

-A `UInt32` or `UInt64` data type hash value.
+- Hash value. [UInt32/64](../data-types/int-uint.md).

-Type: `UInt32` for `xxHash32` and `UInt64` for `xxHash64`.
+:::note
+The return type will be `UInt32` for `xxHash32` and `UInt64` for `xxHash64`.
+:::

**Example**

@@ -899,9 +887,7 @@ ngramSimHash(string[, ngramsize])

**Returned value**

-- Hash value.
-
-Type: [UInt64](/docs/en/sql-reference/data-types/int-uint.md).
+- Hash value. [UInt64](/docs/en/sql-reference/data-types/int-uint.md).

**Example**

@@ -938,9 +924,7 @@ ngramSimHashCaseInsensitive(string[, ngramsize])

**Returned value**

-- Hash value.
-
-Type: [UInt64](/docs/en/sql-reference/data-types/int-uint.md).
+- Hash value. [UInt64](/docs/en/sql-reference/data-types/int-uint.md).

**Example**

@@ -977,9 +961,7 @@ ngramSimHashUTF8(string[, ngramsize])

**Returned value**

-- Hash value.
-
-Type: [UInt64](/docs/en/sql-reference/data-types/int-uint.md).
+- Hash value. [UInt64](/docs/en/sql-reference/data-types/int-uint.md).

**Example**

@@ -1016,9 +998,7 @@ ngramSimHashCaseInsensitiveUTF8(string[, ngramsize])

**Returned value**

-- Hash value.
-
-Type: [UInt64](/docs/en/sql-reference/data-types/int-uint.md).
+- Hash value. [UInt64](/docs/en/sql-reference/data-types/int-uint.md).

**Example**

@@ -1055,9 +1035,7 @@ wordShingleSimHash(string[, shinglesize])

**Returned value**

-- Hash value.
-
-Type: [UInt64](/docs/en/sql-reference/data-types/int-uint.md).
+- Hash value. [UInt64](/docs/en/sql-reference/data-types/int-uint.md).

**Example**

@@ -1094,9 +1072,7 @@ wordShingleSimHashCaseInsensitive(string[, shinglesize])

**Returned value**

-- Hash value.
-
-Type: [UInt64](/docs/en/sql-reference/data-types/int-uint.md).
+- Hash value. [UInt64](/docs/en/sql-reference/data-types/int-uint.md).

**Example**

@@ -1133,9 +1109,7 @@ wordShingleSimHashUTF8(string[, shinglesize])

**Returned value**

-- Hash value.
-
-Type: [UInt64](/docs/en/sql-reference/data-types/int-uint.md).
+- Hash value. 
[UInt64](/docs/en/sql-reference/data-types/int-uint.md). **Example** @@ -1172,9 +1146,7 @@ wordShingleSimHashCaseInsensitiveUTF8(string[, shinglesize]) **Returned value** -- Hash value. - -Type: [UInt64](/docs/en/sql-reference/data-types/int-uint.md). +- Hash value. [UInt64](/docs/en/sql-reference/data-types/int-uint.md). **Example** @@ -1208,9 +1180,7 @@ wyHash64(string) **Returned value** -- Hash value. - -Type: [UInt64](/docs/en/sql-reference/data-types/int-uint.md). +- Hash value. [UInt64](/docs/en/sql-reference/data-types/int-uint.md). **Example** @@ -1248,9 +1218,7 @@ ngramMinHash(string[, ngramsize, hashnum]) **Returned value** -- Tuple with two hashes — the minimum and the maximum. - -Type: [Tuple](/docs/en/sql-reference/data-types/tuple.md)([UInt64](/docs/en/sql-reference/data-types/int-uint.md), [UInt64](/docs/en/sql-reference/data-types/int-uint.md)). +- Tuple with two hashes — the minimum and the maximum. [Tuple](/docs/en/sql-reference/data-types/tuple.md)([UInt64](/docs/en/sql-reference/data-types/int-uint.md), [UInt64](/docs/en/sql-reference/data-types/int-uint.md)). **Example** @@ -1288,9 +1256,7 @@ ngramMinHashCaseInsensitive(string[, ngramsize, hashnum]) **Returned value** -- Tuple with two hashes — the minimum and the maximum. - -Type: [Tuple](/docs/en/sql-reference/data-types/tuple.md)([UInt64](/docs/en/sql-reference/data-types/int-uint.md), [UInt64](/docs/en/sql-reference/data-types/int-uint.md)). +- Tuple with two hashes — the minimum and the maximum. [Tuple](/docs/en/sql-reference/data-types/tuple.md)([UInt64](/docs/en/sql-reference/data-types/int-uint.md), [UInt64](/docs/en/sql-reference/data-types/int-uint.md)). **Example** @@ -1328,9 +1294,7 @@ ngramMinHashUTF8(string[, ngramsize, hashnum]) **Returned value** -- Tuple with two hashes — the minimum and the maximum. - -Type: [Tuple](/docs/en/sql-reference/data-types/tuple.md)([UInt64](/docs/en/sql-reference/data-types/int-uint.md), [UInt64](/docs/en/sql-reference/data-types/int-uint.md)). +- Tuple with two hashes — the minimum and the maximum. [Tuple](/docs/en/sql-reference/data-types/tuple.md)([UInt64](/docs/en/sql-reference/data-types/int-uint.md), [UInt64](/docs/en/sql-reference/data-types/int-uint.md)). **Example** @@ -1368,9 +1332,7 @@ ngramMinHashCaseInsensitiveUTF8(string [, ngramsize, hashnum]) **Returned value** -- Tuple with two hashes — the minimum and the maximum. - -Type: [Tuple](/docs/en/sql-reference/data-types/tuple.md)([UInt64](/docs/en/sql-reference/data-types/int-uint.md), [UInt64](/docs/en/sql-reference/data-types/int-uint.md)). +- Tuple with two hashes — the minimum and the maximum. [Tuple](/docs/en/sql-reference/data-types/tuple.md)([UInt64](/docs/en/sql-reference/data-types/int-uint.md), [UInt64](/docs/en/sql-reference/data-types/int-uint.md)). **Example** @@ -1406,9 +1368,7 @@ ngramMinHashArg(string[, ngramsize, hashnum]) **Returned value** -- Tuple with two tuples with `hashnum` n-grams each. - -Type: [Tuple](/docs/en/sql-reference/data-types/tuple.md)([Tuple](/docs/en/sql-reference/data-types/tuple.md)([String](/docs/en/sql-reference/data-types/string.md)), [Tuple](/docs/en/sql-reference/data-types/tuple.md)([String](/docs/en/sql-reference/data-types/string.md))). +- Tuple with two tuples with `hashnum` n-grams each. [Tuple](/docs/en/sql-reference/data-types/tuple.md)([Tuple](/docs/en/sql-reference/data-types/tuple.md)([String](/docs/en/sql-reference/data-types/string.md)), [Tuple](/docs/en/sql-reference/data-types/tuple.md)([String](/docs/en/sql-reference/data-types/string.md))). 
**Example** @@ -1444,9 +1404,7 @@ ngramMinHashArgCaseInsensitive(string[, ngramsize, hashnum]) **Returned value** -- Tuple with two tuples with `hashnum` n-grams each. - -Type: [Tuple](/docs/en/sql-reference/data-types/tuple.md)([Tuple](/docs/en/sql-reference/data-types/tuple.md)([String](/docs/en/sql-reference/data-types/string.md)), [Tuple](/docs/en/sql-reference/data-types/tuple.md)([String](/docs/en/sql-reference/data-types/string.md))). +- Tuple with two tuples with `hashnum` n-grams each. [Tuple](/docs/en/sql-reference/data-types/tuple.md)([Tuple](/docs/en/sql-reference/data-types/tuple.md)([String](/docs/en/sql-reference/data-types/string.md)), [Tuple](/docs/en/sql-reference/data-types/tuple.md)([String](/docs/en/sql-reference/data-types/string.md))). **Example** @@ -1482,9 +1440,7 @@ ngramMinHashArgUTF8(string[, ngramsize, hashnum]) **Returned value** -- Tuple with two tuples with `hashnum` n-grams each. - -Type: [Tuple](/docs/en/sql-reference/data-types/tuple.md)([Tuple](/docs/en/sql-reference/data-types/tuple.md)([String](/docs/en/sql-reference/data-types/string.md)), [Tuple](/docs/en/sql-reference/data-types/tuple.md)([String](/docs/en/sql-reference/data-types/string.md))). +- Tuple with two tuples with `hashnum` n-grams each. [Tuple](/docs/en/sql-reference/data-types/tuple.md)([Tuple](/docs/en/sql-reference/data-types/tuple.md)([String](/docs/en/sql-reference/data-types/string.md)), [Tuple](/docs/en/sql-reference/data-types/tuple.md)([String](/docs/en/sql-reference/data-types/string.md))). **Example** @@ -1520,9 +1476,7 @@ ngramMinHashArgCaseInsensitiveUTF8(string[, ngramsize, hashnum]) **Returned value** -- Tuple with two tuples with `hashnum` n-grams each. - -Type: [Tuple](/docs/en/sql-reference/data-types/tuple.md)([Tuple](/docs/en/sql-reference/data-types/tuple.md)([String](/docs/en/sql-reference/data-types/string.md)), [Tuple](/docs/en/sql-reference/data-types/tuple.md)([String](/docs/en/sql-reference/data-types/string.md))). +- Tuple with two tuples with `hashnum` n-grams each. [Tuple](/docs/en/sql-reference/data-types/tuple.md)([Tuple](/docs/en/sql-reference/data-types/tuple.md)([String](/docs/en/sql-reference/data-types/string.md)), [Tuple](/docs/en/sql-reference/data-types/tuple.md)([String](/docs/en/sql-reference/data-types/string.md))). **Example** @@ -1560,9 +1514,7 @@ wordShingleMinHash(string[, shinglesize, hashnum]) **Returned value** -- Tuple with two hashes — the minimum and the maximum. - -Type: [Tuple](/docs/en/sql-reference/data-types/tuple.md)([UInt64](/docs/en/sql-reference/data-types/int-uint.md), [UInt64](/docs/en/sql-reference/data-types/int-uint.md)). +- Tuple with two hashes — the minimum and the maximum. [Tuple](/docs/en/sql-reference/data-types/tuple.md)([UInt64](/docs/en/sql-reference/data-types/int-uint.md), [UInt64](/docs/en/sql-reference/data-types/int-uint.md)). **Example** @@ -1600,9 +1552,7 @@ wordShingleMinHashCaseInsensitive(string[, shinglesize, hashnum]) **Returned value** -- Tuple with two hashes — the minimum and the maximum. - -Type: [Tuple](/docs/en/sql-reference/data-types/tuple.md)([UInt64](/docs/en/sql-reference/data-types/int-uint.md), [UInt64](/docs/en/sql-reference/data-types/int-uint.md)). +- Tuple with two hashes — the minimum and the maximum. [Tuple](/docs/en/sql-reference/data-types/tuple.md)([UInt64](/docs/en/sql-reference/data-types/int-uint.md), [UInt64](/docs/en/sql-reference/data-types/int-uint.md)). 
**Example** @@ -1640,9 +1590,7 @@ wordShingleMinHashUTF8(string[, shinglesize, hashnum]) **Returned value** -- Tuple with two hashes — the minimum and the maximum. - -Type: [Tuple](/docs/en/sql-reference/data-types/tuple.md)([UInt64](/docs/en/sql-reference/data-types/int-uint.md), [UInt64](/docs/en/sql-reference/data-types/int-uint.md)). +- Tuple with two hashes — the minimum and the maximum. [Tuple](/docs/en/sql-reference/data-types/tuple.md)([UInt64](/docs/en/sql-reference/data-types/int-uint.md), [UInt64](/docs/en/sql-reference/data-types/int-uint.md)). **Example** @@ -1680,9 +1628,7 @@ wordShingleMinHashCaseInsensitiveUTF8(string[, shinglesize, hashnum]) **Returned value** -- Tuple with two hashes — the minimum and the maximum. - -Type: [Tuple](/docs/en/sql-reference/data-types/tuple.md)([UInt64](/docs/en/sql-reference/data-types/int-uint.md), [UInt64](/docs/en/sql-reference/data-types/int-uint.md)). +- Tuple with two hashes — the minimum and the maximum. [Tuple](/docs/en/sql-reference/data-types/tuple.md)([UInt64](/docs/en/sql-reference/data-types/int-uint.md), [UInt64](/docs/en/sql-reference/data-types/int-uint.md)). **Example** @@ -1718,9 +1664,7 @@ wordShingleMinHashArg(string[, shinglesize, hashnum]) **Returned value** -- Tuple with two tuples with `hashnum` word shingles each. - -Type: [Tuple](/docs/en/sql-reference/data-types/tuple.md)([Tuple](/docs/en/sql-reference/data-types/tuple.md)([String](/docs/en/sql-reference/data-types/string.md)), [Tuple](/docs/en/sql-reference/data-types/tuple.md)([String](/docs/en/sql-reference/data-types/string.md))). +- Tuple with two tuples with `hashnum` word shingles each. [Tuple](/docs/en/sql-reference/data-types/tuple.md)([Tuple](/docs/en/sql-reference/data-types/tuple.md)([String](/docs/en/sql-reference/data-types/string.md)), [Tuple](/docs/en/sql-reference/data-types/tuple.md)([String](/docs/en/sql-reference/data-types/string.md))). **Example** @@ -1756,9 +1700,7 @@ wordShingleMinHashArgCaseInsensitive(string[, shinglesize, hashnum]) **Returned value** -- Tuple with two tuples with `hashnum` word shingles each. - -Type: [Tuple](/docs/en/sql-reference/data-types/tuple.md)([Tuple](/docs/en/sql-reference/data-types/tuple.md)([String](/docs/en/sql-reference/data-types/string.md)), [Tuple](/docs/en/sql-reference/data-types/tuple.md)([String](/docs/en/sql-reference/data-types/string.md))). +- Tuple with two tuples with `hashnum` word shingles each. [Tuple](/docs/en/sql-reference/data-types/tuple.md)([Tuple](/docs/en/sql-reference/data-types/tuple.md)([String](/docs/en/sql-reference/data-types/string.md)), [Tuple](/docs/en/sql-reference/data-types/tuple.md)([String](/docs/en/sql-reference/data-types/string.md))). **Example** @@ -1794,9 +1736,7 @@ wordShingleMinHashArgUTF8(string[, shinglesize, hashnum]) **Returned value** -- Tuple with two tuples with `hashnum` word shingles each. - -Type: [Tuple](/docs/en/sql-reference/data-types/tuple.md)([Tuple](/docs/en/sql-reference/data-types/tuple.md)([String](/docs/en/sql-reference/data-types/string.md)), [Tuple](/docs/en/sql-reference/data-types/tuple.md)([String](/docs/en/sql-reference/data-types/string.md))). +- Tuple with two tuples with `hashnum` word shingles each. [Tuple](/docs/en/sql-reference/data-types/tuple.md)([Tuple](/docs/en/sql-reference/data-types/tuple.md)([String](/docs/en/sql-reference/data-types/string.md)), [Tuple](/docs/en/sql-reference/data-types/tuple.md)([String](/docs/en/sql-reference/data-types/string.md))). 
**Example** @@ -1832,9 +1772,7 @@ wordShingleMinHashArgCaseInsensitiveUTF8(string[, shinglesize, hashnum]) **Returned value** -- Tuple with two tuples with `hashnum` word shingles each. - -Type: [Tuple](/docs/en/sql-reference/data-types/tuple.md)([Tuple](/docs/en/sql-reference/data-types/tuple.md)([String](/docs/en/sql-reference/data-types/string.md)), [Tuple](/docs/en/sql-reference/data-types/tuple.md)([String](/docs/en/sql-reference/data-types/string.md))). +- Tuple with two tuples with `hashnum` word shingles each. [Tuple](/docs/en/sql-reference/data-types/tuple.md)([Tuple](/docs/en/sql-reference/data-types/tuple.md)([String](/docs/en/sql-reference/data-types/string.md)), [Tuple](/docs/en/sql-reference/data-types/tuple.md)([String](/docs/en/sql-reference/data-types/string.md))). **Example** diff --git a/docs/en/sql-reference/functions/introspection.md b/docs/en/sql-reference/functions/introspection.md index 1025b8bdc3d..be8a2956d41 100644 --- a/docs/en/sql-reference/functions/introspection.md +++ b/docs/en/sql-reference/functions/introspection.md @@ -40,15 +40,10 @@ addressToLine(address_of_binary_instruction) **Returned value** -- Source code filename and the line number in this file delimited by colon. - - For example, `/build/obj-x86_64-linux-gnu/../src/Common/ThreadPool.cpp:199`, where `199` is a line number. - -- Name of a binary, if the function couldn’t find the debug information. - -- Empty string, if the address is not valid. - -Type: [String](../../sql-reference/data-types/string.md). +- Source code filename and the line number in this file delimited by colon. [String](../../sql-reference/data-types/string.md). + - For example, `/build/obj-x86_64-linux-gnu/../src/Common/ThreadPool.cpp:199`, where `199` is a line number. +- Name of a binary, if the function couldn’t find the debug information. [String](../../sql-reference/data-types/string.md). +- Empty string, if the address is not valid. [String](../../sql-reference/data-types/string.md). **Example** @@ -137,9 +132,7 @@ addressToLineWithInlines(address_of_binary_instruction) - Array with single element which is name of a binary, if the function couldn’t find the debug information. -- Empty array, if the address is not valid. - -Type: [Array(String)](../../sql-reference/data-types/array.md). +- Empty array, if the address is not valid. [Array(String)](../../sql-reference/data-types/array.md). **Example** @@ -236,10 +229,8 @@ addressToSymbol(address_of_binary_instruction) **Returned value** -- Symbol from ClickHouse object files. -- Empty string, if the address is not valid. - -Type: [String](../../sql-reference/data-types/string.md). +- Symbol from ClickHouse object files. [String](../../sql-reference/data-types/string.md). +- Empty string, if the address is not valid. [String](../../sql-reference/data-types/string.md). **Example** @@ -333,10 +324,8 @@ demangle(symbol) **Returned value** -- Name of the C++ function. -- Empty string if a symbol is not valid. - -Type: [String](../../sql-reference/data-types/string.md). +- Name of the C++ function. [String](../../sql-reference/data-types/string.md). +- Empty string if a symbol is not valid. [String](../../sql-reference/data-types/string.md). 
**Example** diff --git a/docs/en/sql-reference/functions/ip-address-functions.md b/docs/en/sql-reference/functions/ip-address-functions.md index be20e02d77e..21beffbd0a8 100644 --- a/docs/en/sql-reference/functions/ip-address-functions.md +++ b/docs/en/sql-reference/functions/ip-address-functions.md @@ -151,9 +151,7 @@ IPv6StringToNum(string) **Returned value** -- IPv6 address in binary format. - -Type: [FixedString(16)](../../sql-reference/data-types/fixedstring.md). +- IPv6 address in binary format. [FixedString(16)](../../sql-reference/data-types/fixedstring.md). **Example** @@ -313,9 +311,7 @@ toIPv6(string) **Returned value** -- IP address. - -Type: [IPv6](../../sql-reference/data-types/ipv6.md). +- IP address. [IPv6](../../sql-reference/data-types/ipv6.md). **Examples** @@ -374,9 +370,7 @@ isIPv4String(string) **Returned value** -- `1` if `string` is IPv4 address, `0` otherwise. - -Type: [UInt8](../../sql-reference/data-types/int-uint.md). +- `1` if `string` is IPv4 address, `0` otherwise. [UInt8](../../sql-reference/data-types/int-uint.md). **Examples** @@ -412,9 +406,7 @@ isIPv6String(string) **Returned value** -- `1` if `string` is IPv6 address, `0` otherwise. - -Type: [UInt8](../../sql-reference/data-types/int-uint.md). +- `1` if `string` is IPv6 address, `0` otherwise. [UInt8](../../sql-reference/data-types/int-uint.md). **Examples** @@ -454,9 +446,7 @@ This function accepts both IPv4 and IPv6 addresses (and networks) represented as **Returned value** -- `1` or `0`. - -Type: [UInt8](../../sql-reference/data-types/int-uint.md). +- `1` or `0`. [UInt8](../../sql-reference/data-types/int-uint.md). **Example** diff --git a/docs/en/sql-reference/functions/json-functions.md b/docs/en/sql-reference/functions/json-functions.md index e920ab82988..fa02dca07db 100644 --- a/docs/en/sql-reference/functions/json-functions.md +++ b/docs/en/sql-reference/functions/json-functions.md @@ -533,9 +533,7 @@ JSONExtractKeys(json[, a, b, c...]) **Returned value** -Array with the keys of the JSON. - -Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +Array with the keys of the JSON. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). **Example** @@ -595,10 +593,8 @@ JSONExtractKeysAndValuesRaw(json[, p, a, t, h]) **Returned values** -- Array with `('key', 'value')` tuples. Both tuple members are strings. -- Empty array if the requested object does not exist, or input JSON is invalid. - -Type: [Array](../../sql-reference/data-types/array.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md), [String](../../sql-reference/data-types/string.md)). +- Array with `('key', 'value')` tuples. Both tuple members are strings. [Array](../../sql-reference/data-types/array.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md), [String](../../sql-reference/data-types/string.md)). +- Empty array if the requested object does not exist, or input JSON is invalid. [Array](../../sql-reference/data-types/array.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md), [String](../../sql-reference/data-types/string.md)). **Examples** @@ -739,9 +735,7 @@ toJSONString(value) **Returned value** -- JSON representation of the value. - -Type: [String](../../sql-reference/data-types/string.md). +- JSON representation of the value. [String](../../sql-reference/data-types/string.md). 
**Example** @@ -786,9 +780,7 @@ Alias: `JSON_ARRAY_LENGTH(json)`. **Returned value** -- If `json` is a valid JSON array string, returns the number of array elements, otherwise returns NULL. - -Type: [Nullable(UInt64)](../../sql-reference/data-types/int-uint.md). +- If `json` is a valid JSON array string, returns the number of array elements, otherwise returns NULL. [Nullable(UInt64)](../../sql-reference/data-types/int-uint.md). **Example** @@ -819,9 +811,7 @@ jsonMergePatch(json1, json2, ...) **Returned value** -- If JSON object strings are valid, return the merged JSON object string. - -Type: [String](../../sql-reference/data-types/string.md). +- If JSON object strings are valid, return the merged JSON object string. [String](../../sql-reference/data-types/string.md). **Example** diff --git a/docs/en/sql-reference/functions/math-functions.md b/docs/en/sql-reference/functions/math-functions.md index 945166056af..eb0de410f28 100644 --- a/docs/en/sql-reference/functions/math-functions.md +++ b/docs/en/sql-reference/functions/math-functions.md @@ -842,9 +842,7 @@ degrees(x) **Returned value** -- Value in degrees. - -Type: [Float64](../../sql-reference/data-types/float.md#float32-float64). +- Value in degrees. [Float64](../../sql-reference/data-types/float.md#float32-float64). **Example** diff --git a/docs/en/sql-reference/functions/other-functions.md b/docs/en/sql-reference/functions/other-functions.md index 11ee471d709..2b4f888d06f 100644 --- a/docs/en/sql-reference/functions/other-functions.md +++ b/docs/en/sql-reference/functions/other-functions.md @@ -27,9 +27,7 @@ getMacro(name); **Returned value** -- Value of the specified macro. - -Type: [String](../../sql-reference/data-types/string.md). +- Value of the specified macro. [String](../../sql-reference/data-types/string.md). **Example** @@ -82,9 +80,7 @@ This function is case-insensitive. **Returned value** -- String with the fully qualified domain name. - -Type: `String`. +- String with the fully qualified domain name. [String](../data-types/string.md). **Example** @@ -207,9 +203,7 @@ byteSize(argument [, ...]) **Returned value** -- Estimation of byte size of the arguments in memory. - -Type: [UInt64](../../sql-reference/data-types/int-uint.md). +- Estimation of byte size of the arguments in memory. [UInt64](../../sql-reference/data-types/int-uint.md). **Examples** @@ -409,10 +403,8 @@ Aliases: `user()`, `USER()`, `current_user()`. Aliases are case insensitive. **Returned values** -- The name of the current user. -- In distributed queries, the login of the user who initiated the query. - -Type: `String`. +- The name of the current user. [String](../data-types/string.md). +- In distributed queries, the login of the user who initiated the query. [String](../data-types/string.md). **Example** @@ -448,10 +440,8 @@ isConstant(x) **Returned values** -- `1` if `x` is constant. -- `0` if `x` is non-constant. - -Type: [UInt8](../../sql-reference/data-types/int-uint.md). +- `1` if `x` is constant. [UInt8](../../sql-reference/data-types/int-uint.md). +- `0` if `x` is non-constant. [UInt8](../../sql-reference/data-types/int-uint.md). **Examples** @@ -517,8 +507,8 @@ ifNotFinite(x,y) **Arguments** -- `x` — Value to check for infinity. Type: [Float\*](../../sql-reference/data-types/float.md). -- `y` — Fallback value. Type: [Float\*](../../sql-reference/data-types/float.md). +- `x` — Value to check for infinity. [Float\*](../../sql-reference/data-types/float.md). +- `y` — Fallback value. [Float\*](../../sql-reference/data-types/float.md). 
**Returned value**

@@ -924,9 +914,7 @@ uptime()

**Returned value**

-- Time value of seconds.
-
-Type: [UInt32](/docs/en/sql-reference/data-types/int-uint.md).
+- Time value of seconds. [UInt32](/docs/en/sql-reference/data-types/int-uint.md).

**Example**

@@ -971,7 +959,7 @@ None.

**Returned value**

-Type: [String](../data-types/string)
+- Current version of ClickHouse. [String](../data-types/string.md).

**Implementation details**

@@ -1041,7 +1029,9 @@ To prevent that you can create a subquery with [ORDER BY](../../sql-reference/st

 - Value of `column` with `offset` distance from current row, if `offset` is not outside the block boundaries.
 - The default value of `column` or `default_value` (if given), if `offset` is outside the block boundaries.

-Type: type of data blocks affected or default value type.
+:::note
+The return type will be that of the data blocks affected or the default value type.
+:::

**Example**

@@ -1238,9 +1228,7 @@ runningConcurrency(start, end)

**Returned values**

-- The number of concurrent events at each event start time.
-
-Type: [UInt32](../../sql-reference/data-types/int-uint.md)
+- The number of concurrent events at each event start time. [UInt32](../../sql-reference/data-types/int-uint.md)

**Example**

@@ -1535,7 +1523,7 @@ SELECT * FROM table WHERE indexHint()

**Returned value**

-Type: [Uint8](https://clickhouse.com/docs/en/data_types/int_uint/#diapazony-uint).
+- `1`. [UInt8](../data-types/int-uint.md).

**Example**

@@ -1638,9 +1626,7 @@ SELECT replicate(x, arr);

**Returned value**

-An array of the lame length as `arr` filled with value `x`.
-
-Type: `Array`.
+An array of the same length as `arr` filled with value `x`. [Array](../data-types/array.md).

**Example**

@@ -1670,9 +1656,7 @@ filesystemAvailable()

**Returned value**

-- The amount of remaining space available in bytes.
-
-Type: [UInt64](../../sql-reference/data-types/int-uint.md).
+- The amount of remaining space available in bytes. [UInt64](../../sql-reference/data-types/int-uint.md).

**Example**

@@ -1702,9 +1686,7 @@ filesystemFree()

**Returned value**

-- The amount of free space in bytes.
-
-Type: [UInt64](../../sql-reference/data-types/int-uint.md).
+- The amount of free space in bytes. [UInt64](../../sql-reference/data-types/int-uint.md).

**Example**

@@ -1734,9 +1716,7 @@ filesystemCapacity()

**Returned value**

-- Capacity of the filesystem in bytes.
-
-Type: [UInt64](../../sql-reference/data-types/int-uint.md).
+- Capacity of the filesystem in bytes. [UInt64](../../sql-reference/data-types/int-uint.md).

**Example**

@@ -1847,7 +1827,9 @@ finalizeAggregation(state)

 - Value/values that was aggregated.

-Type: Value of any types that was aggregated.
+:::note
+The return type is equal to that of any types which were aggregated.
+:::

**Examples**

@@ -2284,9 +2266,7 @@ countDigits(x)

**Returned value**

-Number of digits.
-
-Type: [UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges).
+Number of digits. [UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges).

:::note
For `Decimal` values takes into account their scales: calculates result over underlying integer type which is `(value * scale)`. For example: `countDigits(42) = 2`, `countDigits(42.000) = 5`, `countDigits(0.04200) = 4`. I.e. you may check decimal overflow for `Decimal64` with `countDecimal(x) > 18`. It's a slow variant of [isDecimalOverflow](#is-decimal-overflow).
:::

@@ -2310,9 +2290,7 @@ Result:

## errorCodeToName

-Returns the textual name of an error code. 
- -Type: [LowCardinality(String)](../../sql-reference/data-types/lowcardinality.md). +Returns the textual name of an error code. [LowCardinality(String)](../../sql-reference/data-types/lowcardinality.md). **Syntax** @@ -2343,9 +2321,7 @@ tcpPort() **Returned value** -- The TCP port number. - -Type: [UInt16](../../sql-reference/data-types/int-uint.md). +- The TCP port number. [UInt16](../../sql-reference/data-types/int-uint.md). **Example** @@ -2381,9 +2357,7 @@ currentProfiles() **Returned value** -- List of the current user settings profiles. - -Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +- List of the current user settings profiles. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). ## enabledProfiles @@ -2397,9 +2371,7 @@ enabledProfiles() **Returned value** -- List of the enabled settings profiles. - -Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +- List of the enabled settings profiles. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). ## defaultProfiles @@ -2413,9 +2385,7 @@ defaultProfiles() **Returned value** -- List of the default settings profiles. - -Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +- List of the default settings profiles. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). ## currentRoles @@ -2429,9 +2399,7 @@ currentRoles() **Returned value** -- A list of the current roles for the current user. - -Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +- A list of the current roles for the current user. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). ## enabledRoles @@ -2445,9 +2413,7 @@ enabledRoles() **Returned value** -- List of the enabled roles for the current user. - -Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +- List of the enabled roles for the current user. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). ## defaultRoles @@ -2461,9 +2427,7 @@ defaultRoles() **Returned value** -- List of the default roles for the current user. - -Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +- List of the default roles for the current user. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). ## getServerPort @@ -2492,9 +2456,7 @@ getServerPort(port_name) **Returned value** -- The number of the server port. - -Type: [UInt16](../../sql-reference/data-types/int-uint.md). +- The number of the server port. [UInt16](../../sql-reference/data-types/int-uint.md). **Example** @@ -2526,9 +2488,7 @@ queryID() **Returned value** -- The ID of the current query. - -Type: [String](../../sql-reference/data-types/string.md) +- The ID of the current query. [String](../../sql-reference/data-types/string.md) **Example** @@ -2562,9 +2522,7 @@ initialQueryID() **Returned value** -- The ID of the initial current query. - -Type: [String](../../sql-reference/data-types/string.md) +- The ID of the initial current query. 
[String](../../sql-reference/data-types/string.md) **Example** @@ -2597,9 +2555,7 @@ shardNum() **Returned value** -- Shard index or constant `0`. - -Type: [UInt32](../../sql-reference/data-types/int-uint.md). +- Shard index or constant `0`. [UInt32](../../sql-reference/data-types/int-uint.md). **Example** @@ -2639,9 +2595,7 @@ shardCount() **Returned value** -- Total number of shards or `0`. - -Type: [UInt32](../../sql-reference/data-types/int-uint.md). +- Total number of shards or `0`. [UInt32](../../sql-reference/data-types/int-uint.md). **See Also** @@ -2663,9 +2617,7 @@ getOSKernelVersion() **Returned value** -- The current OS kernel version. - -Type: [String](../../sql-reference/data-types/string.md). +- The current OS kernel version. [String](../../sql-reference/data-types/string.md). **Example** @@ -2699,9 +2651,7 @@ zookeeperSessionUptime() **Returned value** -- Uptime of the current ZooKeeper session in seconds. - -Type: [UInt32](../../sql-reference/data-types/int-uint.md). +- Uptime of the current ZooKeeper session in seconds. [UInt32](../../sql-reference/data-types/int-uint.md). **Example** @@ -2738,9 +2688,7 @@ All arguments must be constant. **Returned value** -- Randomly generated table structure. - -Type: [String](../../sql-reference/data-types/string.md). +- Randomly generated table structure. [String](../../sql-reference/data-types/string.md). **Examples** @@ -2807,9 +2755,7 @@ structureToCapnProtoSchema(structure) **Returned value** -- CapnProto schema - -Type: [String](../../sql-reference/data-types/string.md). +- CapnProto schema. [String](../../sql-reference/data-types/string.md). **Examples** @@ -2908,9 +2854,7 @@ structureToProtobufSchema(structure) **Returned value** -- Protobuf schema - -Type: [String](../../sql-reference/data-types/string.md). +- Protobuf schema. [String](../../sql-reference/data-types/string.md). **Examples** diff --git a/docs/en/sql-reference/functions/random-functions.md b/docs/en/sql-reference/functions/random-functions.md index 2d7752ed022..a7866c6d12e 100644 --- a/docs/en/sql-reference/functions/random-functions.md +++ b/docs/en/sql-reference/functions/random-functions.md @@ -204,9 +204,7 @@ randNormal(mean, variance) **Returned value** -- Random number. - -Type: [Float64](/docs/en/sql-reference/data-types/float.md). +- Random number. [Float64](/docs/en/sql-reference/data-types/float.md). **Example** @@ -243,9 +241,7 @@ randLogNormal(mean, variance) **Returned value** -- Random number. - -Type: [Float64](/docs/en/sql-reference/data-types/float.md). +- Random number. [Float64](/docs/en/sql-reference/data-types/float.md). **Example** @@ -282,9 +278,7 @@ randBinomial(experiments, probability) **Returned value** -- Random number. - -Type: [UInt64](/docs/en/sql-reference/data-types/int-uint.md). +- Random number. [UInt64](/docs/en/sql-reference/data-types/int-uint.md). **Example** @@ -321,9 +315,7 @@ randNegativeBinomial(experiments, probability) **Returned value** -- Random number. - -Type: [UInt64](/docs/en/sql-reference/data-types/int-uint.md). +- Random number. [UInt64](/docs/en/sql-reference/data-types/int-uint.md). **Example** @@ -359,9 +351,7 @@ randPoisson(n) **Returned value** -- Random number. - -Type: [UInt64](/docs/en/sql-reference/data-types/int-uint.md). +- Random number. [UInt64](/docs/en/sql-reference/data-types/int-uint.md). **Example** @@ -397,9 +387,7 @@ randBernoulli(probability) **Returned value** -- Random number. - -Type: [UInt64](/docs/en/sql-reference/data-types/int-uint.md). +- Random number. 
[UInt64](/docs/en/sql-reference/data-types/int-uint.md). **Example** @@ -435,9 +423,7 @@ randExponential(lambda) **Returned value** -- Random number. - -Type: [Float64](/docs/en/sql-reference/data-types/float.md). +- Random number. [Float64](/docs/en/sql-reference/data-types/float.md). **Example** @@ -473,9 +459,7 @@ randChiSquared(degree_of_freedom) **Returned value** -- Random number. - -Type: [Float64](/docs/en/sql-reference/data-types/float.md). +- Random number. [Float64](/docs/en/sql-reference/data-types/float.md). **Example** @@ -511,9 +495,7 @@ randStudentT(degree_of_freedom) **Returned value** -- Random number. - -Type: [Float64](/docs/en/sql-reference/data-types/float.md). +- Random number. [Float64](/docs/en/sql-reference/data-types/float.md). **Example** @@ -550,9 +532,7 @@ randFisherF(d1, d2) **Returned value** -- Random number. - -Type: [Float64](/docs/en/sql-reference/data-types/float.md). +- Random number. [Float64](/docs/en/sql-reference/data-types/float.md). **Example** @@ -588,9 +568,7 @@ randomString(length) **Returned value** -- String filled with random bytes. - -Type: [String](../../sql-reference/data-types/string.md). +- String filled with random bytes. [String](../../sql-reference/data-types/string.md). **Example** @@ -630,9 +608,7 @@ randomFixedString(length); **Returned value(s)** -- String filled with random bytes. - -Type: [FixedString](../../sql-reference/data-types/fixedstring.md). +- String filled with random bytes. [FixedString](../../sql-reference/data-types/fixedstring.md). **Example** @@ -667,9 +643,7 @@ randomPrintableASCII(length) **Returned value** -- String with a random set of [ASCII](https://en.wikipedia.org/wiki/ASCII#Printable_characters) printable characters. - -Type: [String](../../sql-reference/data-types/string.md) +- String with a random set of [ASCII](https://en.wikipedia.org/wiki/ASCII#Printable_characters) printable characters. [String](../../sql-reference/data-types/string.md) **Example** @@ -701,9 +675,7 @@ randomStringUTF8(length); **Returned value(s)** -- UTF-8 random string. - -Type: [String](../../sql-reference/data-types/string.md). +- UTF-8 random string. [String](../../sql-reference/data-types/string.md). **Example** diff --git a/docs/en/sql-reference/functions/rounding-functions.md b/docs/en/sql-reference/functions/rounding-functions.md index afec43cd6f4..6cbcc4e4ef3 100644 --- a/docs/en/sql-reference/functions/rounding-functions.md +++ b/docs/en/sql-reference/functions/rounding-functions.md @@ -336,7 +336,7 @@ roundAge(num) - Returns `45`, for $45 \leq age \leq 54$. - Returns `55`, for $age \geq 55$. -Type: [UInt8](../data-types/int-uint.md). +Type: [UInt8](../data-types/int-uint.md) in all cases. **Example** diff --git a/docs/en/sql-reference/functions/splitting-merging-functions.md b/docs/en/sql-reference/functions/splitting-merging-functions.md index 8e50637cf30..77563713605 100644 --- a/docs/en/sql-reference/functions/splitting-merging-functions.md +++ b/docs/en/sql-reference/functions/splitting-merging-functions.md @@ -25,13 +25,15 @@ splitByChar(separator, s[, max_substrings])) **Returned value(s)** -Returns an array of selected substrings. Empty substrings may be selected when: +Returns an array of selected substrings. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). + +:::note + Empty substrings may be selected when: - A separator occurs at the beginning or end of the string; - There are multiple consecutive separators; - The original string `s` is empty. 
- -Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +::: :::note The behavior of parameter `max_substrings` changed starting with ClickHouse v22.11. In versions older than that, `max_substrings > 0` meant that `max_substring`-many splits were performed and that the remainder of the string was returned as the final element of the list. @@ -76,15 +78,17 @@ splitByString(separator, s[, max_substrings])) **Returned value(s)** -Returns an array of selected substrings. Empty substrings may be selected when: +Returns an array of selected substrings. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). -Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +:::note +Empty substrings may be selected when: - A non-empty separator occurs at the beginning or end of the string; - There are multiple consecutive non-empty separators; - The original string `s` is empty while the separator is not empty. Setting [splitby_max_substrings_includes_remaining_string](../../operations/settings/settings.md#splitby_max_substrings_includes_remaining_string) (default: 0) controls if the remaining string is included in the last element of the result array when argument `max_substrings` > 0. +::: **Example** @@ -131,15 +135,17 @@ splitByRegexp(regexp, s[, max_substrings])) **Returned value(s)** -Returns an array of selected substrings. Empty substrings may be selected when: +Returns an array of selected substrings. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). + +:::note +Empty substrings may be selected when: - A non-empty regular expression match occurs at the beginning or end of the string; - There are multiple consecutive non-empty regular expression matches; - The original string `s` is empty while the regular expression is not empty. -Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). - Setting [splitby_max_substrings_includes_remaining_string](../../operations/settings/settings.md#splitby_max_substrings_includes_remaining_string) (default: 0) controls if the remaining string is included in the last element of the result array when argument `max_substrings` > 0. +::: **Example** @@ -186,11 +192,11 @@ splitByWhitespace(s[, max_substrings])) **Returned value(s)** -Returns an array of selected substrings. - -Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). - +Returns an array of selected substrings. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). + +:::note Setting [splitby_max_substrings_includes_remaining_string](../../operations/settings/settings.md#splitby_max_substrings_includes_remaining_string) (default: 0) controls if the remaining string is included in the last element of the result array when argument `max_substrings` > 0. +::: **Example** @@ -225,11 +231,11 @@ splitByNonAlpha(s[, max_substrings])) **Returned value(s)** -Returns an array of selected substrings. - -Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +Returns an array of selected substrings. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). 
+:::note Setting [splitby_max_substrings_includes_remaining_string](../../operations/settings/settings.md#splitby_max_substrings_includes_remaining_string) (default: 0) controls if the remaining string is included in the last element of the result array when argument `max_substrings` > 0. +::: **Example** @@ -287,11 +293,11 @@ Alias: `splitByAlpha` **Returned value(s)** -Returns an array of selected substrings. - -Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +Returns an array of selected substrings. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +:::note Setting [splitby_max_substrings_includes_remaining_string](../../operations/settings/settings.md#splitby_max_substrings_includes_remaining_string) (default: 0) controls if the remaining string is included in the last element of the result array when argument `max_substrings` > 0. +::: **Example** @@ -322,11 +328,8 @@ extractAllGroups(text, regexp) **Returned values** -- If the function finds at least one matching group, it returns `Array(Array(String))` column, clustered by group_id (1 to N, where N is number of capturing groups in `regexp`). - -- If there is no matching group, returns an empty array. - -Type: [Array](../data-types/array.md). +- If the function finds at least one matching group, it returns `Array(Array(String))` column, clustered by group_id (1 to N, where N is number of capturing groups in `regexp`). [Array](../data-types/array.md). +- If there is no matching group, returns an empty array. [Array](../data-types/array.md). **Example** @@ -359,9 +362,7 @@ ngrams(string, ngramsize) **Returned values** -- Array with n-grams. - -Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +- Array with n-grams. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). **Example** @@ -387,9 +388,7 @@ Splits a string into tokens using non-alphanumeric ASCII characters as separator **Returned value** -- The resulting array of tokens from input string. - -Type: [Array](../data-types/array.md). +- The resulting array of tokens from input string. [Array](../data-types/array.md). **Example** diff --git a/docs/en/sql-reference/functions/string-functions.md b/docs/en/sql-reference/functions/string-functions.md index ba23870a584..f45ceb99617 100644 --- a/docs/en/sql-reference/functions/string-functions.md +++ b/docs/en/sql-reference/functions/string-functions.md @@ -30,9 +30,7 @@ empty(x) **Returned value** -- Returns `1` for an empty string or `0` for a non-empty string. - -Type: [UInt8](../data-types/int-uint.md). +- Returns `1` for an empty string or `0` for a non-empty string. [UInt8](../data-types/int-uint.md). **Example** @@ -68,9 +66,7 @@ notEmpty(x) **Returned value** -- Returns `1` for a non-empty string or `0` for an empty string string. - -Type: [UInt8](../data-types/int-uint.md). +- Returns `1` for a non-empty string or `0` for an empty string string. [UInt8](../data-types/int-uint.md). **Example** @@ -289,9 +285,7 @@ Alias: `LPAD` **Returned value** -- A left-padded string of the given length. - -Type: [String](../data-types/string.md). +- A left-padded string of the given length. [String](../data-types/string.md). **Example** @@ -325,9 +319,7 @@ leftPadUTF8(string, length[, pad_string]) **Returned value** -- A left-padded string of the given length. - -Type: [String](../data-types/string.md). 
+- A left-padded string of the given length. [String](../data-types/string.md). **Example** @@ -457,9 +449,7 @@ Alias: `RPAD` **Returned value** -- A left-padded string of the given length. - -Type: [String](../data-types/string.md). +- A left-padded string of the given length. [String](../data-types/string.md). **Example** @@ -493,9 +483,7 @@ rightPadUTF8(string, length[, pad_string]) **Returned value** -- A right-padded string of the given length. - -Type: [String](../data-types/string.md). +- A right-padded string of the given length. [String](../data-types/string.md). **Example** @@ -676,9 +664,7 @@ Alias: `REPEAT` **Returned value** -A string containing string `s` repeated `n` times. If `n` <= 0, the function returns the empty string. - -Type: `String`. +A string containing string `s` repeated `n` times. If `n` <= 0, the function returns the empty string. [String](../data-types/string.md). **Example** @@ -712,9 +698,7 @@ Alias: `SPACE`. **Returned value** -The string containing string ` ` repeated `n` times. If `n` <= 0, the function returns the empty string. - -Type: `String`. +The string containing string ` ` repeated `n` times. If `n` <= 0, the function returns the empty string. [String](../data-types/string.md). **Example** @@ -913,9 +897,7 @@ Alias: **Returned value** -A substring of `s` with `length` many bytes, starting at index `offset`. - -Type: `String`. +A substring of `s` with `length` many bytes, starting at index `offset`. [String](../data-types/string.md). **Example** @@ -1072,9 +1054,7 @@ base58Encode(plaintext) **Returned value** -- A string containing the encoded value of the argument. - -Type: [String](../../sql-reference/data-types/string.md). +- A string containing the encoded value of the argument. [String](../../sql-reference/data-types/string.md). **Example** @@ -1106,9 +1086,7 @@ base58Decode(encoded) **Returned value** -- A string containing the decoded value of the argument. - -Type: [String](../../sql-reference/data-types/string.md). +- A string containing the decoded value of the argument. [String](../data-types/string.md). **Example** @@ -1284,9 +1262,7 @@ trim([[LEADING|TRAILING|BOTH] trim_character FROM] input_string) **Returned value** -A string without leading and/or trailing specified characters. - -Type: `String`. +A string without leading and/or trailing specified characters. [String](../data-types/string.md). **Example** @@ -1320,9 +1296,7 @@ Alias: `ltrim(input_string)`. **Returned value** -A string without leading common whitespaces. - -Type: `String`. +A string without leading common whitespaces. [String](../data-types/string.md). **Example** @@ -1356,9 +1330,7 @@ Alias: `rtrim(input_string)`. **Returned value** -A string without trailing common whitespaces. - -Type: `String`. +A string without trailing common whitespaces. [String](../data-types/string.md). **Example** @@ -1392,9 +1364,7 @@ Alias: `trim(input_string)`. **Returned value** -A string without leading and trailing common whitespaces. - -Type: `String`. +A string without leading and trailing common whitespaces. [String](../data-types/string.md). **Example** @@ -1444,9 +1414,7 @@ normalizeQuery(x) **Returned value** -- Sequence of characters with placeholders. - -Type: [String](../../sql-reference/data-types/string.md). +- Sequence of characters with placeholders. [String](../../sql-reference/data-types/string.md). **Example** @@ -1478,9 +1446,7 @@ normalizedQueryHash(x) **Returned value** -- Hash value. - -Type: [UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges). 
+- Hash value. [UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges). **Example** @@ -1512,9 +1478,7 @@ normalizeUTF8NFC(words) **Returned value** -- String transformed to NFC normalization form. - -Type: [String](../../sql-reference/data-types/string.md). +- String transformed to NFC normalization form. [String](../../sql-reference/data-types/string.md). **Example** @@ -1546,9 +1510,7 @@ normalizeUTF8NFD(words) **Returned value** -- String transformed to NFD normalization form. - -Type: [String](../../sql-reference/data-types/string.md). +- String transformed to NFD normalization form. [String](../../sql-reference/data-types/string.md). **Example** @@ -1580,9 +1542,7 @@ normalizeUTF8NFKC(words) **Returned value** -- String transformed to NFKC normalization form. - -Type: [String](../../sql-reference/data-types/string.md). +- String transformed to NFKC normalization form. [String](../../sql-reference/data-types/string.md). **Example** @@ -1614,9 +1574,7 @@ normalizeUTF8NFKD(words) **Returned value** -- String transformed to NFKD normalization form. - -Type: [String](../../sql-reference/data-types/string.md). +- String transformed to NFKD normalization form. [String](../../sql-reference/data-types/string.md). **Example** @@ -1651,9 +1609,7 @@ encodeXMLComponent(x) **Returned value** -- The escaped string. - -Type: [String](../../sql-reference/data-types/string.md). +- The escaped string. [String](../../sql-reference/data-types/string.md). **Example** @@ -1691,9 +1647,7 @@ decodeXMLComponent(x) **Returned value** -- The un-escaped string. - -Type: [String](../../sql-reference/data-types/string.md). +- The un-escaped string. [String](../../sql-reference/data-types/string.md). **Example** @@ -1727,9 +1681,7 @@ decodeHTMLComponent(x) **Returned value** -- The un-escaped string. - -Type: [String](../../sql-reference/data-types/string.md). +- The un-escaped string. [String](../../sql-reference/data-types/string.md). **Example** @@ -1782,9 +1734,7 @@ extractTextFromHTML(x) **Returned value** -- Extracted text. - -Type: [String](../../sql-reference/data-types/string.md). +- Extracted text. [String](../../sql-reference/data-types/string.md). **Example** diff --git a/docs/en/sql-reference/functions/string-search-functions.md b/docs/en/sql-reference/functions/string-search-functions.md index 9738c19bf3c..327eb8994db 100644 --- a/docs/en/sql-reference/functions/string-search-functions.md +++ b/docs/en/sql-reference/functions/string-search-functions.md @@ -896,14 +896,16 @@ extractAllGroupsHorizontal(haystack, pattern) **Arguments** -- `haystack` — Input string. Type: [String](../../sql-reference/data-types/string.md). -- `pattern` — Regular expression with [re2 syntax](https://github.com/google/re2/wiki/Syntax). Must contain groups, each group enclosed in parentheses. If `pattern` contains no groups, an exception is thrown. Type: [String](../../sql-reference/data-types/string.md). +- `haystack` — Input string. [String](../../sql-reference/data-types/string.md). +- `pattern` — Regular expression with [re2 syntax](https://github.com/google/re2/wiki/Syntax). Must contain groups, each group enclosed in parentheses. If `pattern` contains no groups, an exception is thrown. [String](../../sql-reference/data-types/string.md). **Returned value** -- Type: [Array](../../sql-reference/data-types/array.md). +- Array of arrays of matches. [Array](../../sql-reference/data-types/array.md). +:::note If `haystack` does not match the `pattern` regex, an array of empty arrays is returned. 
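+
+A rough illustration of the no-match case (sample values are arbitrary):
+
+```sql
+-- '([0-9]+)' contains one capturing group and matches nothing in 'abc',
+-- so this should return an array holding one empty array: [[]]
+SELECT extractAllGroupsHorizontal('abc', '([0-9]+)');
+```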
+::: **Example** @@ -931,14 +933,16 @@ extractAllGroupsVertical(haystack, pattern) **Arguments** -- `haystack` — Input string. Type: [String](../../sql-reference/data-types/string.md). -- `pattern` — Regular expression with [re2 syntax](https://github.com/google/re2/wiki/Syntax). Must contain groups, each group enclosed in parentheses. If `pattern` contains no groups, an exception is thrown. Type: [String](../../sql-reference/data-types/string.md). +- `haystack` — Input string. [String](../../sql-reference/data-types/string.md). +- `pattern` — Regular expression with [re2 syntax](https://github.com/google/re2/wiki/Syntax). Must contain groups, each group enclosed in parentheses. If `pattern` contains no groups, an exception is thrown. [String](../../sql-reference/data-types/string.md). **Returned value** -- Type: [Array](../../sql-reference/data-types/array.md). +- Array of arrays of matches. [Array](../../sql-reference/data-types/array.md). +:::note If `haystack` does not match the `pattern` regex, an empty array is returned. +::: **Example** @@ -1340,9 +1344,7 @@ countSubstrings(haystack, needle[, start_pos]) **Returned values** -- The number of occurrences. - -Type: [UInt64](../../sql-reference/data-types/int-uint.md). +- The number of occurrences. [UInt64](../../sql-reference/data-types/int-uint.md). **Examples** @@ -1389,9 +1391,7 @@ countSubstringsCaseInsensitive(haystack, needle[, start_pos]) **Returned values** -- The number of occurrences. - -Type: [UInt64](../../sql-reference/data-types/int-uint.md). +- The number of occurrences. [UInt64](../../sql-reference/data-types/int-uint.md). **Examples** @@ -1443,9 +1443,7 @@ countSubstringsCaseInsensitiveUTF8(haystack, needle[, start_pos]) **Returned values** -- The number of occurrences. - -Type: [UInt64](../../sql-reference/data-types/int-uint.md). +- The number of occurrences. [UInt64](../../sql-reference/data-types/int-uint.md). **Examples** @@ -1496,9 +1494,7 @@ countMatches(haystack, pattern) **Returned value** -- The number of matches. - -Type: [UInt64](../../sql-reference/data-types/int-uint.md). +- The number of matches. [UInt64](../../sql-reference/data-types/int-uint.md). **Examples** @@ -1543,9 +1539,7 @@ countMatchesCaseInsensitive(haystack, pattern) **Returned value** -- The number of matches. - -Type: [UInt64](../../sql-reference/data-types/int-uint.md). +- The number of matches. [UInt64](../../sql-reference/data-types/int-uint.md). **Examples** @@ -1583,9 +1577,7 @@ Alias: `REGEXP_EXTRACT(haystack, pattern[, index])`. **Returned values** -`pattern` may contain multiple regexp groups, `index` indicates which regex group to extract. An index of 0 means matching the entire regular expression. - -Type: `String`. +`pattern` may contain multiple regexp groups, `index` indicates which regex group to extract. An index of 0 means matching the entire regular expression. [String](../data-types/string.md). **Examples** @@ -1624,10 +1616,8 @@ hasSubsequence(haystack, needle) **Returned values** -- 1, if needle is a subsequence of haystack. -- 0, otherwise. - -Type: `UInt8`. +- 1, if needle is a subsequence of haystack. [UInt8](../data-types/int-uint.md). +- 0, otherwise. [UInt8](../data-types/int-uint.md). **Examples** @@ -1662,10 +1652,8 @@ hasSubsequenceCaseInsensitive(haystack, needle) **Returned values** -- 1, if needle is a subsequence of haystack. -- 0, otherwise. - -Type: `UInt8`. +- 1, if needle is a subsequence of haystack. [UInt8](../data-types/int-uint.md). +- 0, otherwise. [UInt8](../data-types/int-uint.md). 
**Examples** @@ -1700,10 +1688,8 @@ hasSubsequenceUTF8(haystack, needle) **Returned values** -- 1, if needle is a subsequence of haystack. -- 0, otherwise. - -Type: `UInt8`. +- 1, if needle is a subsequence of haystack. [UInt8](../data-types/int-uint.md). +- 0, otherwise. [UInt8](../data-types/int-uint.md). Query: @@ -1738,10 +1724,8 @@ hasSubsequenceCaseInsensitiveUTF8(haystack, needle) **Returned values** -- 1, if needle is a subsequence of haystack. -- 0, otherwise. - -Type: `UInt8`. +- 1, if needle is a subsequence of haystack. [UInt8](../data-types/int-uint.md). +- 0, otherwise. [UInt8](../data-types/int-uint.md). **Examples** diff --git a/docs/en/sql-reference/functions/time-series-functions.md b/docs/en/sql-reference/functions/time-series-functions.md index e80a3fa9860..beb7a0503b9 100644 --- a/docs/en/sql-reference/functions/time-series-functions.md +++ b/docs/en/sql-reference/functions/time-series-functions.md @@ -30,9 +30,7 @@ At least four data points are required in `series` to detect outliers. **Returned value** -- Returns an array of the same length as the input array where each value represents score of possible anomaly of corresponding element in the series. A non-zero score indicates a possible anomaly. - -Type: [Array](../../sql-reference/data-types/array.md). +- Returns an array of the same length as the input array where each value represents score of possible anomaly of corresponding element in the series. A non-zero score indicates a possible anomaly. [Array](../../sql-reference/data-types/array.md). **Examples** @@ -81,10 +79,8 @@ seriesPeriodDetectFFT(series); **Returned value** -- A real value equal to the period of series data -- Returns NAN when number of data points are less than four. - -Type: [Float64](../../sql-reference/data-types/float.md). +- A real value equal to the period of series data. [Float64](../../sql-reference/data-types/float.md). +- Returns NAN when number of data points are less than four. [nan](../../sql-reference/data-types/float.md/#nan-and-inf). **Examples** @@ -134,9 +130,7 @@ The number of data points in `series` should be at least twice the value of `per **Returned value** - An array of four arrays where the first array include seasonal components, the second array - trend, -the third array - residue component, and the fourth array - baseline(seasonal + trend) component. - -Type: [Array](../../sql-reference/data-types/array.md). +the third array - residue component, and the fourth array - baseline(seasonal + trend) component. [Array](../../sql-reference/data-types/array.md). **Examples** diff --git a/docs/en/sql-reference/functions/time-window-functions.md b/docs/en/sql-reference/functions/time-window-functions.md index d8f23c92e61..2b5f093c149 100644 --- a/docs/en/sql-reference/functions/time-window-functions.md +++ b/docs/en/sql-reference/functions/time-window-functions.md @@ -23,9 +23,7 @@ tumble(time_attr, interval [, timezone]) **Returned values** -- The inclusive lower and exclusive upper bound of the corresponding tumbling window. - -Type: `Tuple(DateTime, DateTime)` +- The inclusive lower and exclusive upper bound of the corresponding tumbling window. [Tuple](../data-types/tuple.md)([DateTime](../data-types/datetime.md), [DateTime](../data-types/datetime.md))`. **Example** @@ -60,9 +58,7 @@ hop(time_attr, hop_interval, window_interval [, timezone]) **Returned values** -- The inclusive lower and exclusive upper bound of the corresponding hopping window. 
Since one record can be assigned to multiple hop windows, the function only returns the bound of the **first** window when hop function is used **without** `WINDOW VIEW`. - -Type: `Tuple(DateTime, DateTime)` +- The inclusive lower and exclusive upper bound of the corresponding hopping window. Since one record can be assigned to multiple hop windows, the function only returns the bound of the **first** window when hop function is used **without** `WINDOW VIEW`. [Tuple](../data-types/tuple.md)([DateTime](../data-types/datetime.md), [DateTime](../data-types/datetime.md))`. **Example** diff --git a/docs/en/sql-reference/functions/tuple-functions.md b/docs/en/sql-reference/functions/tuple-functions.md index 64b1732597f..cfedc01ce8f 100644 --- a/docs/en/sql-reference/functions/tuple-functions.md +++ b/docs/en/sql-reference/functions/tuple-functions.md @@ -134,7 +134,9 @@ Tuples should have the same type of the elements. - The Hamming distance. -Type: The result type is calculated the same way it is for [Arithmetic functions](../../sql-reference/functions/arithmetic-functions.md), based on the number of elements in the input tuples. +:::note +The result type is calculated the same way it is for [Arithmetic functions](../../sql-reference/functions/arithmetic-functions.md), based on the number of elements in the input tuples. +::: ``` sql SELECT @@ -200,9 +202,7 @@ tupleToNameValuePairs(tuple) **Returned value** -- An array with (name, value) pairs. - -Type: [Array](../../sql-reference/data-types/array.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md), ...)). +- An array with (name, value) pairs. [Array](../../sql-reference/data-types/array.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md), ...)). **Example** @@ -278,9 +278,7 @@ Alias: `vectorSum`. **Returned value** -- Tuple with the sum. - -Type: [Tuple](../../sql-reference/data-types/tuple.md). +- Tuple with the sum. [Tuple](../../sql-reference/data-types/tuple.md). **Example** @@ -317,9 +315,7 @@ Alias: `vectorDifference`. **Returned value** -- Tuple with the result of subtraction. - -Type: [Tuple](../../sql-reference/data-types/tuple.md). +- Tuple with the result of subtraction. [Tuple](../../sql-reference/data-types/tuple.md). **Example** @@ -354,9 +350,7 @@ tupleMultiply(tuple1, tuple2) **Returned value** -- Tuple with the multiplication. - -Type: [Tuple](../../sql-reference/data-types/tuple.md). +- Tuple with the multiplication. [Tuple](../../sql-reference/data-types/tuple.md). **Example** @@ -391,9 +385,7 @@ tupleDivide(tuple1, tuple2) **Returned value** -- Tuple with the result of division. - -Type: [Tuple](../../sql-reference/data-types/tuple.md). +- Tuple with the result of division. [Tuple](../../sql-reference/data-types/tuple.md). **Example** @@ -427,9 +419,7 @@ tupleNegate(tuple) **Returned value** -- Tuple with the result of negation. - -Type: [Tuple](../../sql-reference/data-types/tuple.md). +- Tuple with the result of negation. [Tuple](../../sql-reference/data-types/tuple.md). **Example** @@ -464,9 +454,7 @@ tupleMultiplyByNumber(tuple, number) **Returned value** -- Tuple with multiplied values. - -Type: [Tuple](../../sql-reference/data-types/tuple.md). +- Tuple with multiplied values. [Tuple](../../sql-reference/data-types/tuple.md). **Example** @@ -501,9 +489,7 @@ tupleDivideByNumber(tuple, number) **Returned value** -- Tuple with divided values. - -Type: [Tuple](../../sql-reference/data-types/tuple.md). 
+- Tuple with divided values. [Tuple](../../sql-reference/data-types/tuple.md). **Example** diff --git a/docs/en/sql-reference/functions/tuple-map-functions.md b/docs/en/sql-reference/functions/tuple-map-functions.md index 377283bc006..9468228c737 100644 --- a/docs/en/sql-reference/functions/tuple-map-functions.md +++ b/docs/en/sql-reference/functions/tuple-map-functions.md @@ -21,9 +21,7 @@ map(key1, value1[, key2, value2, ...]) **Returned value** -- Data structure as `key:value` pairs. - -Type: [Map(key, value)](../../sql-reference/data-types/map.md). +- Data structure as `key:value` pairs. [Map(key, value)](../../sql-reference/data-types/map.md). **Examples** @@ -387,9 +385,7 @@ mapContains(map, key) **Returned value** -- `1` if `map` contains `key`, `0` if not. - -Type: [UInt8](../../sql-reference/data-types/int-uint.md). +- `1` if `map` contains `key`, `0` if not. [UInt8](../../sql-reference/data-types/int-uint.md). **Example** @@ -431,9 +427,7 @@ mapKeys(map) **Returned value** -- Array containing all keys from the `map`. - -Type: [Array](../../sql-reference/data-types/array.md). +- Array containing all keys from the `map`. [Array](../../sql-reference/data-types/array.md). **Example** @@ -474,9 +468,7 @@ mapValues(map) **Returned value** -- Array containing all the values from `map`. - -Type: [Array](../../sql-reference/data-types/array.md). +- Array containing all the values from `map`. [Array](../../sql-reference/data-types/array.md). **Example** diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index ea08ffa50e7..f1c2e92f201 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -631,9 +631,7 @@ toDateTime64(expr, scale, [timezone]) **Returned value** -- A calendar date and time of day, with sub-second precision. - -Type: [DateTime64](/docs/en/sql-reference/data-types/datetime64.md). +- A calendar date and time of day, with sub-second precision. [DateTime64](/docs/en/sql-reference/data-types/datetime64.md). **Example** @@ -1749,9 +1747,7 @@ toLowCardinality(expr) **Returned values** -- Result of `expr`. - -Type: `LowCardinality(expr_result_type)` +- Result of `expr`. [LowCardinality](../data-types/lowcardinality.md) of the type of `expr`. **Example** diff --git a/docs/en/sql-reference/functions/ulid-functions.md b/docs/en/sql-reference/functions/ulid-functions.md index eb69b1779ae..b4e3fc2d164 100644 --- a/docs/en/sql-reference/functions/ulid-functions.md +++ b/docs/en/sql-reference/functions/ulid-functions.md @@ -65,9 +65,7 @@ ULIDStringToDateTime(ulid[, timezone]) **Returned value** -- Timestamp with milliseconds precision. - -Type: [DateTime64(3)](/docs/en/sql-reference/data-types/datetime64.md). +- Timestamp with milliseconds precision. [DateTime64(3)](/docs/en/sql-reference/data-types/datetime64.md). **Usage example** diff --git a/docs/en/sql-reference/functions/url-functions.md b/docs/en/sql-reference/functions/url-functions.md index a0b0170721c..52eeb539ef4 100644 --- a/docs/en/sql-reference/functions/url-functions.md +++ b/docs/en/sql-reference/functions/url-functions.md @@ -28,7 +28,7 @@ domain(url) **Arguments** -- `url` — URL. Type: [String](../../sql-reference/data-types/string.md). +- `url` — URL. [String](../../sql-reference/data-types/string.md). The URL can be specified with or without a scheme. Examples: @@ -48,10 +48,8 @@ clickhouse.com **Returned values** -- Host name. 
If ClickHouse can parse the input string as a URL. -- Empty string. If ClickHouse can’t parse the input string as a URL. - -Type: `String`. +- Host name. If ClickHouse can parse the input string as a URL. [String](../data-types/string.md). +- Empty string. If ClickHouse can’t parse the input string as a URL. [String](../data-types/string.md). **Example** @@ -79,7 +77,7 @@ topLevelDomain(url) **Arguments** -- `url` — URL. Type: [String](../../sql-reference/data-types/string.md). +- `url` — URL. [String](../../sql-reference/data-types/string.md). The URL can be specified with or without a scheme. Examples: @@ -91,10 +89,8 @@ https://clickhouse.com/time/ **Returned values** -- Domain name. If ClickHouse can parse the input string as a URL. -- Empty string. If ClickHouse cannot parse the input string as a URL. - -Type: `String`. +- Domain name. If ClickHouse can parse the input string as a URL. [String](../../sql-reference/data-types/string.md). +- Empty string. If ClickHouse cannot parse the input string as a URL. [String](../../sql-reference/data-types/string.md). **Example** @@ -162,9 +158,7 @@ cutToFirstSignificantSubdomain(URL, TLD) **Returned value** -- Part of the domain that includes top-level subdomains up to the first significant subdomain. - -Type: [String](../../sql-reference/data-types/string.md). +- Part of the domain that includes top-level subdomains up to the first significant subdomain. [String](../../sql-reference/data-types/string.md). **Example** @@ -216,9 +210,7 @@ cutToFirstSignificantSubdomainCustomWithWWW(URL, TLD) **Returned value** -- Part of the domain that includes top-level subdomains up to the first significant subdomain without stripping `www`. - -Type: [String](../../sql-reference/data-types/string.md). +- Part of the domain that includes top-level subdomains up to the first significant subdomain without stripping `www`. [String](../../sql-reference/data-types/string.md). **Example** @@ -270,9 +262,7 @@ firstSignificantSubdomainCustom(URL, TLD) **Returned value** -- First significant subdomain. - -Type: [String](../../sql-reference/data-types/string.md). +- First significant subdomain. [String](../../sql-reference/data-types/string.md). **Example** @@ -422,9 +412,7 @@ netloc(URL) **Returned value** -- `username:password@host:port`. - -Type: `String`. +- `username:password@host:port`. [String](../data-types/string.md). **Example** @@ -479,9 +467,7 @@ cutURLParameter(URL, name) **Returned value** -- URL with `name` URL parameter removed. - -Type: `String`. +- URL with `name` URL parameter removed. [String](../data-types/string.md). **Example** diff --git a/docs/en/sql-reference/functions/uuid-functions.md b/docs/en/sql-reference/functions/uuid-functions.md index d1b833c2439..0c1da88913d 100644 --- a/docs/en/sql-reference/functions/uuid-functions.md +++ b/docs/en/sql-reference/functions/uuid-functions.md @@ -289,9 +289,7 @@ The function also works for [Arrays](array-functions.md#function-empty) and [Str **Returned value** -- Returns `1` for an empty UUID or `0` for a non-empty UUID. - -Type: [UInt8](../data-types/int-uint.md). +- Returns `1` for an empty UUID or `0` for a non-empty UUID. [UInt8](../data-types/int-uint.md). **Example** @@ -331,9 +329,7 @@ The function also works for [Arrays](array-functions.md#function-notempty) or [S **Returned value** -- Returns `1` for a non-empty UUID or `0` for an empty UUID. - -Type: [UInt8](../data-types/int-uint.md). +- Returns `1` for a non-empty UUID or `0` for an empty UUID. [UInt8](../data-types/int-uint.md). 
**Example** From 508b0356543fc3a49e069166093147b3089ed29a Mon Sep 17 00:00:00 2001 From: Shaun Struwig <41984034+Blargian@users.noreply.github.com> Date: Thu, 23 May 2024 14:08:48 +0000 Subject: [PATCH 284/392] Move is NaN from other-functions to arithmetic functions --- .../en/sql-reference/functions/arithmetic-functions.md | 10 ++++++++++ docs/en/sql-reference/functions/other-functions.md | 10 ---------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/en/sql-reference/functions/arithmetic-functions.md b/docs/en/sql-reference/functions/arithmetic-functions.md index 8b8527acfdf..7b079152907 100644 --- a/docs/en/sql-reference/functions/arithmetic-functions.md +++ b/docs/en/sql-reference/functions/arithmetic-functions.md @@ -194,6 +194,16 @@ Result: You can get similar result by using the [ternary operator](../../sql-reference/functions/conditional-functions.md#ternary-operator): `isFinite(x) ? x : y`. +## isNaN + +Returns 1 if the Float32 and Float64 argument is NaN, otherwise this function 0. + +**Syntax** + +```sql +isNaN(x) +``` + ## modulo Calculates the remainder of the division of two values `a` by `b`. diff --git a/docs/en/sql-reference/functions/other-functions.md b/docs/en/sql-reference/functions/other-functions.md index 79c0148d704..c16e8af1ef0 100644 --- a/docs/en/sql-reference/functions/other-functions.md +++ b/docs/en/sql-reference/functions/other-functions.md @@ -541,16 +541,6 @@ Result: └────────────────────┘ ``` -## isNaN - -Returns 1 if the Float32 and Float64 argument is NaN, otherwise this function 0. - -**Syntax** - -```sql -isNaN(x) -``` - ## hasColumnInTable Given the database name, the table name, and the column name as constant strings, returns 1 if the given column exists, otherwise 0. From 8df4da5efaa014f7866288e1aac799f40f52a8c2 Mon Sep 17 00:00:00 2001 From: vdimir Date: Thu, 23 May 2024 14:21:38 +0000 Subject: [PATCH 285/392] Print query in explain plan with parallel replicas --- src/Interpreters/ClusterProxy/executeQuery.cpp | 4 ++++ src/Processors/QueryPlan/ReadFromRemote.cpp | 2 ++ 2 files changed, 6 insertions(+) diff --git a/src/Interpreters/ClusterProxy/executeQuery.cpp b/src/Interpreters/ClusterProxy/executeQuery.cpp index 4bbda982f5b..13e6fa87051 100644 --- a/src/Interpreters/ClusterProxy/executeQuery.cpp +++ b/src/Interpreters/ClusterProxy/executeQuery.cpp @@ -403,6 +403,10 @@ void executeQueryWithParallelReplicas( ContextPtr context, std::shared_ptr storage_limits) { + auto logger = getLogger("executeQueryWithParallelReplicas"); + LOG_DEBUG(logger, "Executing read from {}, header {}, query ({}), stage {} with parallel replicas", + storage_id.getNameForLogs(), header.dumpStructure(), query_ast->formatForLogging(), processed_stage); + const auto & settings = context->getSettingsRef(); /// check cluster for parallel replicas diff --git a/src/Processors/QueryPlan/ReadFromRemote.cpp b/src/Processors/QueryPlan/ReadFromRemote.cpp index b4e35af85d6..84c2515e8ca 100644 --- a/src/Processors/QueryPlan/ReadFromRemote.cpp +++ b/src/Processors/QueryPlan/ReadFromRemote.cpp @@ -386,6 +386,8 @@ ReadFromParallelRemoteReplicasStep::ReadFromParallelRemoteReplicasStep( chassert(cluster->getShardCount() == 1); std::vector description; + description.push_back(fmt::format("query: {}", formattedAST(query_ast))); + for (const auto & pool : cluster->getShardsInfo().front().per_replica_pools) description.push_back(fmt::format("Replica: {}", pool->getHost())); From 71ce01404ddb4bf26f88d910452e70bb4a27a842 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Thu, 
23 May 2024 16:34:52 +0200 Subject: [PATCH 286/392] Fix validation --- src/Analyzer/ValidationUtils.cpp | 3 +++ src/Planner/PlannerExpressionAnalysis.cpp | 24 ++++------------------- 2 files changed, 7 insertions(+), 20 deletions(-) diff --git a/src/Analyzer/ValidationUtils.cpp b/src/Analyzer/ValidationUtils.cpp index 9e977964755..59157838edf 100644 --- a/src/Analyzer/ValidationUtils.cpp +++ b/src/Analyzer/ValidationUtils.cpp @@ -276,6 +276,9 @@ void validateAggregates(const QueryTreeNodePtr & query_node, AggregatesValidatio if (query_node_typed.hasOrderBy()) validate_group_by_columns_visitor.visit(query_node_typed.getOrderByNode()); + if (query_node_typed.hasInterpolate()) + validate_group_by_columns_visitor.visit(query_node_typed.getInterpolate()); + validate_group_by_columns_visitor.visit(query_node_typed.getProjectionNode()); } diff --git a/src/Planner/PlannerExpressionAnalysis.cpp b/src/Planner/PlannerExpressionAnalysis.cpp index 399bbfc67cf..1cdff0a26aa 100644 --- a/src/Planner/PlannerExpressionAnalysis.cpp +++ b/src/Planner/PlannerExpressionAnalysis.cpp @@ -441,30 +441,20 @@ SortAnalysisResult analyzeSort(const QueryNode & query_node, auto & interpolate_list_node = query_node.getInterpolate()->as(); PlannerActionsVisitor interpolate_actions_visitor(planner_context); - auto interpolate_expression_dag = std::make_shared(); + auto interpolate_actions_dag = std::make_shared(); for (auto & interpolate_node : interpolate_list_node.getNodes()) { auto & interpolate_node_typed = interpolate_node->as(); - interpolate_actions_visitor.visit(interpolate_expression_dag, interpolate_node_typed.getInterpolateExpression()); + interpolate_actions_visitor.visit(interpolate_actions_dag, interpolate_node_typed.getExpression()); + interpolate_actions_visitor.visit(interpolate_actions_dag, interpolate_node_typed.getInterpolateExpression()); } std::unordered_map before_sort_actions_inputs_name_to_node; for (const auto & node : before_sort_actions->getInputs()) before_sort_actions_inputs_name_to_node.emplace(node->result_name, node); - std::unordered_set aggregation_keys; - - auto projection_expression_dag = std::make_shared(); - for (const auto & node : query_node.getProjection()) - actions_visitor.visit(projection_expression_dag, node); - for (const auto & node : projection_expression_dag->getNodes()) - aggregation_keys.insert(node.result_name); - - if (aggregation_analysis_result_optional) - aggregation_keys.insert(aggregation_analysis_result_optional->aggregation_keys.begin(), aggregation_analysis_result_optional->aggregation_keys.end()); - - for (const auto & node : interpolate_expression_dag->getNodes()) + for (const auto & node : interpolate_actions_dag->getNodes()) { if (before_sort_actions_dag_output_node_names.contains(node.result_name) || node.type != ActionsDAG::ActionType::INPUT) @@ -479,12 +469,6 @@ SortAnalysisResult analyzeSort(const QueryNode & query_node, input_node_it = it; } - if (aggregation_analysis_result_optional) - if (!aggregation_keys.contains(node.result_name)) - throw Exception(ErrorCodes::NOT_AN_AGGREGATE, - "Column {} is not under aggregate function and not in GROUP BY keys. 
In query {}", - node.result_name, query_node.formatASTForErrorMessage()); - before_sort_actions_outputs.push_back(input_node_it->second); before_sort_actions_dag_output_node_names.insert(node.result_name); } From 21f831da0d823b9f00b02100bedb847d7af6720e Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Thu, 23 May 2024 16:36:11 +0200 Subject: [PATCH 287/392] Remove unneeded changes --- src/Planner/PlannerExpressionAnalysis.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/Planner/PlannerExpressionAnalysis.cpp b/src/Planner/PlannerExpressionAnalysis.cpp index 1cdff0a26aa..6e194b2c03e 100644 --- a/src/Planner/PlannerExpressionAnalysis.cpp +++ b/src/Planner/PlannerExpressionAnalysis.cpp @@ -28,7 +28,6 @@ namespace DB namespace ErrorCodes { extern const int LOGICAL_ERROR; - extern const int NOT_AN_AGGREGATE; } namespace @@ -398,8 +397,7 @@ ProjectionAnalysisResult analyzeProjection(const QueryNode & query_node, SortAnalysisResult analyzeSort(const QueryNode & query_node, const ColumnsWithTypeAndName & input_columns, const PlannerContextPtr & planner_context, - ActionsChain & actions_chain, - std::optional aggregation_analysis_result_optional) + ActionsChain & actions_chain) { ActionsDAGPtr before_sort_actions = std::make_shared(input_columns); auto & before_sort_actions_outputs = before_sort_actions->getOutputs(); @@ -570,7 +568,7 @@ PlannerExpressionsAnalysisResult buildExpressionAnalysisResult(const QueryTreeNo std::optional sort_analysis_result_optional; if (query_node.hasOrderBy()) { - sort_analysis_result_optional = analyzeSort(query_node, current_output_columns, planner_context, actions_chain, aggregation_analysis_result_optional); + sort_analysis_result_optional = analyzeSort(query_node, current_output_columns, planner_context, actions_chain); current_output_columns = actions_chain.getLastStepAvailableOutputColumns(); } From 47578772e4558ec044b676e13f5be6ae89d6c49f Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 23 May 2024 16:39:16 +0200 Subject: [PATCH 288/392] Fix hdfs assertion --- .../ObjectStorage/Azure/Configuration.h | 2 +- .../ObjectStorage/HDFS/Configuration.h | 2 +- .../ObjectStorage/ReadBufferIterator.cpp | 6 ++--- .../ObjectStorage/S3/Configuration.cpp | 2 +- src/Storages/ObjectStorage/S3/Configuration.h | 2 +- .../ObjectStorage/StorageObjectStorage.h | 2 +- .../StorageObjectStorageSource.cpp | 23 +++++++++++++++---- .../StorageObjectStorageSource.h | 6 +++++ 8 files changed, 32 insertions(+), 13 deletions(-) diff --git a/src/Storages/ObjectStorage/Azure/Configuration.h b/src/Storages/ObjectStorage/Azure/Configuration.h index 19b9cf56f93..35b19079ca9 100644 --- a/src/Storages/ObjectStorage/Azure/Configuration.h +++ b/src/Storages/ObjectStorage/Azure/Configuration.h @@ -36,7 +36,7 @@ public: void setPaths(const Paths & paths) override { blobs_paths = paths; } String getNamespace() const override { return container; } - String getDataSourceDescription() override { return std::filesystem::path(connection_url) / container; } + String getDataSourceDescription() const override { return std::filesystem::path(connection_url) / container; } StorageObjectStorage::QuerySettings getQuerySettings(const ContextPtr &) const override; void check(ContextPtr context) const override; diff --git a/src/Storages/ObjectStorage/HDFS/Configuration.h b/src/Storages/ObjectStorage/HDFS/Configuration.h index dc06e754c44..01a8b9c5e3b 100644 --- a/src/Storages/ObjectStorage/HDFS/Configuration.h +++ b/src/Storages/ObjectStorage/HDFS/Configuration.h @@ -31,7 +31,7 @@ public: 
std::string getPathWithoutGlobs() const override; String getNamespace() const override { return ""; } - String getDataSourceDescription() override { return url; } + String getDataSourceDescription() const override { return url; } StorageObjectStorage::QuerySettings getQuerySettings(const ContextPtr &) const override; void check(ContextPtr context) const override; diff --git a/src/Storages/ObjectStorage/ReadBufferIterator.cpp b/src/Storages/ObjectStorage/ReadBufferIterator.cpp index 50d69129883..5e89a0a1b9d 100644 --- a/src/Storages/ObjectStorage/ReadBufferIterator.cpp +++ b/src/Storages/ObjectStorage/ReadBufferIterator.cpp @@ -37,8 +37,7 @@ ReadBufferIterator::ReadBufferIterator( SchemaCache::Key ReadBufferIterator::getKeyForSchemaCache(const ObjectInfo & object_info, const String & format_name) const { - chassert(!object_info.getPath().starts_with("/")); - auto source = std::filesystem::path(configuration->getDataSourceDescription()) / object_info.getPath(); + auto source = StorageObjectStorageSource::getUniqueStoragePathIdentifier(*configuration, object_info); return DB::getKeyForSchemaCache(source, format_name, format_settings, getContext()); } @@ -51,8 +50,7 @@ SchemaCache::Keys ReadBufferIterator::getKeysForSchemaCache() const std::back_inserter(sources), [&](const auto & elem) { - chassert(!elem->getPath().starts_with("/")); - return std::filesystem::path(configuration->getDataSourceDescription()) / elem->getPath(); + return StorageObjectStorageSource::getUniqueStoragePathIdentifier(*configuration, *elem); }); return DB::getKeysForSchemaCache(sources, *format, format_settings, getContext()); } diff --git a/src/Storages/ObjectStorage/S3/Configuration.cpp b/src/Storages/ObjectStorage/S3/Configuration.cpp index 00d569fea9f..6b6cde0c431 100644 --- a/src/Storages/ObjectStorage/S3/Configuration.cpp +++ b/src/Storages/ObjectStorage/S3/Configuration.cpp @@ -50,7 +50,7 @@ static const std::unordered_set optional_configuration_keys = "no_sign_request" }; -String StorageS3Configuration::getDataSourceDescription() +String StorageS3Configuration::getDataSourceDescription() const { return std::filesystem::path(url.uri.getHost() + std::to_string(url.uri.getPort())) / url.bucket; } diff --git a/src/Storages/ObjectStorage/S3/Configuration.h b/src/Storages/ObjectStorage/S3/Configuration.h index de6c02d5020..906d10a1a9a 100644 --- a/src/Storages/ObjectStorage/S3/Configuration.h +++ b/src/Storages/ObjectStorage/S3/Configuration.h @@ -31,7 +31,7 @@ public: void setPaths(const Paths & paths) override { keys = paths; } String getNamespace() const override { return url.bucket; } - String getDataSourceDescription() override; + String getDataSourceDescription() const override; StorageObjectStorage::QuerySettings getQuerySettings(const ContextPtr &) const override; bool isArchive() const override { return url.archive_pattern.has_value(); } diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.h b/src/Storages/ObjectStorage/StorageObjectStorage.h index 7b118cb7e6b..de75af5035b 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.h +++ b/src/Storages/ObjectStorage/StorageObjectStorage.h @@ -161,7 +161,7 @@ public: virtual const Paths & getPaths() const = 0; virtual void setPaths(const Paths & paths) = 0; - virtual String getDataSourceDescription() = 0; + virtual String getDataSourceDescription() const = 0; virtual String getNamespace() const = 0; virtual StorageObjectStorage::QuerySettings getQuerySettings(const ContextPtr &) const = 0; diff --git 
a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index 7332574b246..b31d0f8a92e 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -82,6 +82,21 @@ void StorageObjectStorageSource::setKeyCondition(const ActionsDAGPtr & filter_ac setKeyConditionImpl(filter_actions_dag, context_, read_from_format_info.format_header); } +std::string StorageObjectStorageSource::getUniqueStoragePathIdentifier( + const Configuration & configuration, + const ObjectInfo & object_info, + bool include_connection_info) +{ + auto path = object_info.getPath(); + if (path.starts_with("/")) + path = path.substr(1); + + if (include_connection_info) + return fs::path(configuration.getDataSourceDescription()) / path; + else + return fs::path(configuration.getNamespace()) / path; +} + std::shared_ptr StorageObjectStorageSource::createFileIterator( ConfigurationPtr configuration, ObjectStoragePtr object_storage, @@ -183,7 +198,7 @@ Chunk StorageObjectStorageSource::generate() VirtualColumnUtils::addRequestedPathFileAndSizeVirtualsToChunk( chunk, read_from_format_info.requested_virtual_columns, - fs::path(configuration->getNamespace()) / reader.getObjectInfo().getPath(), + getUniqueStoragePathIdentifier(*configuration, reader.getObjectInfo(), false), object_info.metadata->size_bytes, &filename); return chunk; @@ -212,7 +227,7 @@ Chunk StorageObjectStorageSource::generate() void StorageObjectStorageSource::addNumRowsToCache(const ObjectInfo & object_info, size_t num_rows) { const auto cache_key = getKeyForSchemaCache( - fs::path(configuration->getDataSourceDescription()) / object_info.getPath(), + getUniqueStoragePathIdentifier(*configuration, object_info), configuration->format, format_settings, getContext()); @@ -222,7 +237,7 @@ void StorageObjectStorageSource::addNumRowsToCache(const ObjectInfo & object_inf std::optional StorageObjectStorageSource::tryGetNumRowsFromCache(const ObjectInfo & object_info) { const auto cache_key = getKeyForSchemaCache( - fs::path(configuration->getDataSourceDescription()) / object_info.getPath(), + getUniqueStoragePathIdentifier(*configuration, object_info), configuration->format, format_settings, getContext()); @@ -511,7 +526,7 @@ StorageObjectStorage::ObjectInfoPtr StorageObjectStorageSource::GlobIterator::ne for (const auto & object_info : new_batch) { chassert(object_info); - paths.push_back(fs::path(configuration->getNamespace()) / object_info->getPath()); + paths.push_back(getUniqueStoragePathIdentifier(*configuration, *object_info, false)); } VirtualColumnUtils::filterByPathOrFile(new_batch, paths, filter_dag, virtual_columns, getContext()); diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.h b/src/Storages/ObjectStorage/StorageObjectStorageSource.h index e9635ff4dce..fd7c7aa7102 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.h @@ -17,6 +17,7 @@ class StorageObjectStorageSource : public SourceWithKeyCondition, WithContext { friend class StorageS3QueueSource; public: + using Configuration = StorageObjectStorage::Configuration; using ConfigurationPtr = StorageObjectStorage::ConfigurationPtr; using ObjectInfo = StorageObjectStorage::ObjectInfo; using ObjectInfos = StorageObjectStorage::ObjectInfos; @@ -58,6 +59,11 @@ public: ObjectInfos * read_keys, std::function file_progress_callback = {}); + static std::string 
getUniqueStoragePathIdentifier( + const Configuration & configuration, + const ObjectInfo & object_info, + bool include_connection_info = true); + protected: const String name; ObjectStoragePtr object_storage; From 9911f13c77588e089832c05aebfe0aff5b8241cd Mon Sep 17 00:00:00 2001 From: Blargian Date: Thu, 23 May 2024 16:39:53 +0200 Subject: [PATCH 289/392] Update function return type for consistency --- .../en/sql-reference/functions/geo/geohash.md | 14 +- docs/en/sql-reference/functions/geo/h3.md | 276 +++++++----------- docs/en/sql-reference/functions/geo/s2.md | 42 ++- docs/en/sql-reference/functions/geo/svg.md | 4 +- .../functions/rounding-functions.md | 16 +- .../functions/string-search-functions.md | 6 +- .../sql-reference/functions/uuid-functions.md | 8 +- 7 files changed, 138 insertions(+), 228 deletions(-) diff --git a/docs/en/sql-reference/functions/geo/geohash.md b/docs/en/sql-reference/functions/geo/geohash.md index ce16af44e90..80c55650b9c 100644 --- a/docs/en/sql-reference/functions/geo/geohash.md +++ b/docs/en/sql-reference/functions/geo/geohash.md @@ -74,11 +74,11 @@ geohashesInBox(longitude_min, latitude_min, longitude_max, latitude_max, precisi **Arguments** -- `longitude_min` — Minimum longitude. Range: `[-180°, 180°]`. Type: [Float](../../../sql-reference/data-types/float.md). -- `latitude_min` — Minimum latitude. Range: `[-90°, 90°]`. Type: [Float](../../../sql-reference/data-types/float.md). -- `longitude_max` — Maximum longitude. Range: `[-180°, 180°]`. Type: [Float](../../../sql-reference/data-types/float.md). -- `latitude_max` — Maximum latitude. Range: `[-90°, 90°]`. Type: [Float](../../../sql-reference/data-types/float.md). -- `precision` — Geohash precision. Range: `[1, 12]`. Type: [UInt8](../../../sql-reference/data-types/int-uint.md). +- `longitude_min` — Minimum longitude. Range: `[-180°, 180°]`. [Float](../../../sql-reference/data-types/float.md). +- `latitude_min` — Minimum latitude. Range: `[-90°, 90°]`. [Float](../../../sql-reference/data-types/float.md). +- `longitude_max` — Maximum longitude. Range: `[-180°, 180°]`. [Float](../../../sql-reference/data-types/float.md). +- `latitude_max` — Maximum latitude. Range: `[-90°, 90°]`. [Float](../../../sql-reference/data-types/float.md). +- `precision` — Geohash precision. Range: `[1, 12]`. [UInt8](../../../sql-reference/data-types/int-uint.md). :::note All coordinate parameters must be of the same type: either `Float32` or `Float64`. @@ -86,11 +86,9 @@ All coordinate parameters must be of the same type: either `Float32` or `Float64 **Returned values** -- Array of precision-long strings of geohash-boxes covering provided area, you should not rely on order of items. +- Array of precision-long strings of geohash-boxes covering provided area, you should not rely on order of items. [Array](../../../sql-reference/data-types/array.md)([String](../../../sql-reference/data-types/string.md)). - `[]` - Empty array if minimum latitude and longitude values aren’t less than corresponding maximum values. -Type: [Array](../../../sql-reference/data-types/array.md)([String](../../../sql-reference/data-types/string.md)). - :::note Function throws an exception if resulting array is over 10’000’000 items long. ::: diff --git a/docs/en/sql-reference/functions/geo/h3.md b/docs/en/sql-reference/functions/geo/h3.md index 29486c58e6a..7faff8288b3 100644 --- a/docs/en/sql-reference/functions/geo/h3.md +++ b/docs/en/sql-reference/functions/geo/h3.md @@ -26,14 +26,12 @@ h3IsValid(h3index) **Parameter** -- `h3index` — Hexagon index number. 
Type: [UInt64](../../../sql-reference/data-types/int-uint.md). +- `h3index` — Hexagon index number. [UInt64](../../../sql-reference/data-types/int-uint.md). **Returned values** -- 1 — The number is a valid H3 index. -- 0 — The number is not a valid H3 index. - -Type: [UInt8](../../../sql-reference/data-types/int-uint.md). +- 1 — The number is a valid H3 index. [UInt8](../../../sql-reference/data-types/int-uint.md). +- 0 — The number is not a valid H3 index. [UInt8](../../../sql-reference/data-types/int-uint.md). **Example** @@ -63,14 +61,12 @@ h3GetResolution(h3index) **Parameter** -- `h3index` — Hexagon index number. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). +- `h3index` — Hexagon index number. [UInt64](../../../sql-reference/data-types/int-uint.md). **Returned values** -- Index resolution. Range: `[0, 15]`. -- If the index is not valid, the function returns a random value. Use [h3IsValid](#h3isvalid) to verify the index. - -Type: [UInt8](../../../sql-reference/data-types/int-uint.md). +- Index resolution. Range: `[0, 15]`. [UInt8](../../../sql-reference/data-types/int-uint.md). +- If the index is not valid, the function returns a random value. Use [h3IsValid](#h3isvalid) to verify the index. [UInt8](../../../sql-reference/data-types/int-uint.md). **Example** @@ -100,11 +96,11 @@ h3EdgeAngle(resolution) **Parameter** -- `resolution` — Index resolution. Type: [UInt8](../../../sql-reference/data-types/int-uint.md). Range: `[0, 15]`. +- `resolution` — Index resolution. [UInt8](../../../sql-reference/data-types/int-uint.md). Range: `[0, 15]`. **Returned values** -- The average length of the [H3](#h3index) hexagon edge in grades. Type: [Float64](../../../sql-reference/data-types/float.md). +- The average length of the [H3](#h3index) hexagon edge in grades. [Float64](../../../sql-reference/data-types/float.md). **Example** @@ -134,11 +130,11 @@ h3EdgeLengthM(resolution) **Parameter** -- `resolution` — Index resolution. Type: [UInt8](../../../sql-reference/data-types/int-uint.md). Range: `[0, 15]`. +- `resolution` — Index resolution. [UInt8](../../../sql-reference/data-types/int-uint.md). Range: `[0, 15]`. **Returned values** -- The average length of the [H3](#h3index) hexagon edge in meters. Type: [Float64](../../../sql-reference/data-types/float.md). +- The average length of the [H3](#h3index) hexagon edge in meters. [Float64](../../../sql-reference/data-types/float.md). **Example** @@ -168,11 +164,11 @@ h3EdgeLengthKm(resolution) **Parameter** -- `resolution` — Index resolution. Type: [UInt8](../../../sql-reference/data-types/int-uint.md). Range: `[0, 15]`. +- `resolution` — Index resolution. [UInt8](../../../sql-reference/data-types/int-uint.md). Range: `[0, 15]`. **Returned values** -- The average length of the [H3](#h3index) hexagon edge in kilometers. Type: [Float64](../../../sql-reference/data-types/float.md). +- The average length of the [H3](#h3index) hexagon edge in kilometers. [Float64](../../../sql-reference/data-types/float.md). **Example** @@ -202,16 +198,14 @@ geoToH3(lon, lat, resolution) **Arguments** -- `lon` — Longitude. Type: [Float64](../../../sql-reference/data-types/float.md). -- `lat` — Latitude. Type: [Float64](../../../sql-reference/data-types/float.md). -- `resolution` — Index resolution. Range: `[0, 15]`. Type: [UInt8](../../../sql-reference/data-types/int-uint.md). +- `lon` — Longitude. [Float64](../../../sql-reference/data-types/float.md). +- `lat` — Latitude. [Float64](../../../sql-reference/data-types/float.md). 
+- `resolution` — Index resolution. Range: `[0, 15]`. [UInt8](../../../sql-reference/data-types/int-uint.md). **Returned values** -- Hexagon index number. -- 0 in case of error. - -Type: [UInt64](../../../sql-reference/data-types/int-uint.md). +- Hexagon index number. [UInt64](../../../sql-reference/data-types/int-uint.md). +- 0 in case of error. [UInt64](../../../sql-reference/data-types/int-uint.md). **Example** @@ -275,12 +269,11 @@ h3ToGeoBoundary(h3Index) **Arguments** -- `h3Index` — H3 Index. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). +- `h3Index` — H3 Index. [UInt64](../../../sql-reference/data-types/int-uint.md). **Returned values** -- Array of pairs '(lon, lat)'. -Type: [Array](../../../sql-reference/data-types/array.md)([Float64](../../../sql-reference/data-types/float.md), [Float64](../../../sql-reference/data-types/float.md)). +- Array of pairs '(lon, lat)'. [Array](../../../sql-reference/data-types/array.md)([Float64](../../../sql-reference/data-types/float.md), [Float64](../../../sql-reference/data-types/float.md)). **Example** @@ -311,14 +304,12 @@ h3kRing(h3index, k) **Arguments** -- `h3index` — Hexagon index number. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). -- `k` — Radius. Type: [integer](../../../sql-reference/data-types/int-uint.md) +- `h3index` — Hexagon index number. [UInt64](../../../sql-reference/data-types/int-uint.md). +- `k` — Radius. [integer](../../../sql-reference/data-types/int-uint.md) **Returned values** -- Array of H3 indexes. - -Type: [Array](../../../sql-reference/data-types/array.md)([UInt64](../../../sql-reference/data-types/int-uint.md)). +- Array of H3 indexes. [Array](../../../sql-reference/data-types/array.md)([UInt64](../../../sql-reference/data-types/int-uint.md)). **Example** @@ -354,13 +345,11 @@ h3GetBaseCell(index) **Parameter** -- `index` — Hexagon index number. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). +- `index` — Hexagon index number. [UInt64](../../../sql-reference/data-types/int-uint.md). **Returned value** -- Hexagon base cell number. - -Type: [UInt8](../../../sql-reference/data-types/int-uint.md). +- Hexagon base cell number. [UInt8](../../../sql-reference/data-types/int-uint.md). **Example** @@ -390,13 +379,11 @@ h3HexAreaM2(resolution) **Parameter** -- `resolution` — Index resolution. Range: `[0, 15]`. Type: [UInt8](../../../sql-reference/data-types/int-uint.md). +- `resolution` — Index resolution. Range: `[0, 15]`. [UInt8](../../../sql-reference/data-types/int-uint.md). **Returned value** -- Area in square meters. - -Type: [Float64](../../../sql-reference/data-types/float.md). +- Area in square meters. [Float64](../../../sql-reference/data-types/float.md). **Example** @@ -426,13 +413,11 @@ h3HexAreaKm2(resolution) **Parameter** -- `resolution` — Index resolution. Range: `[0, 15]`. Type: [UInt8](../../../sql-reference/data-types/int-uint.md). +- `resolution` — Index resolution. Range: `[0, 15]`. [UInt8](../../../sql-reference/data-types/int-uint.md). **Returned value** -- Area in square kilometers. - -Type: [Float64](../../../sql-reference/data-types/float.md). +- Area in square kilometers. [Float64](../../../sql-reference/data-types/float.md). **Example** @@ -462,15 +447,13 @@ h3IndexesAreNeighbors(index1, index2) **Arguments** -- `index1` — Hexagon index number. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). -- `index2` — Hexagon index number. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). +- `index1` — Hexagon index number. 
[UInt64](../../../sql-reference/data-types/int-uint.md). +- `index2` — Hexagon index number. [UInt64](../../../sql-reference/data-types/int-uint.md). **Returned value** -- `1` — Indexes are neighbours. -- `0` — Indexes are not neighbours. - -Type: [UInt8](../../../sql-reference/data-types/int-uint.md). +- `1` — Indexes are neighbours. [UInt8](../../../sql-reference/data-types/int-uint.md). +- `0` — Indexes are not neighbours. [UInt8](../../../sql-reference/data-types/int-uint.md). **Example** @@ -500,14 +483,12 @@ h3ToChildren(index, resolution) **Arguments** -- `index` — Hexagon index number. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). -- `resolution` — Index resolution. Range: `[0, 15]`. Type: [UInt8](../../../sql-reference/data-types/int-uint.md). +- `index` — Hexagon index number. [UInt64](../../../sql-reference/data-types/int-uint.md). +- `resolution` — Index resolution. Range: `[0, 15]`. [UInt8](../../../sql-reference/data-types/int-uint.md). **Returned values** -- Array of the child H3-indexes. - -Type: [Array](../../../sql-reference/data-types/array.md)([UInt64](../../../sql-reference/data-types/int-uint.md)). +- Array of the child H3-indexes. [Array](../../../sql-reference/data-types/array.md)([UInt64](../../../sql-reference/data-types/int-uint.md)). **Example** @@ -537,14 +518,12 @@ h3ToParent(index, resolution) **Arguments** -- `index` — Hexagon index number. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). -- `resolution` — Index resolution. Range: `[0, 15]`. Type: [UInt8](../../../sql-reference/data-types/int-uint.md). +- `index` — Hexagon index number. [UInt64](../../../sql-reference/data-types/int-uint.md). +- `resolution` — Index resolution. Range: `[0, 15]`. [UInt8](../../../sql-reference/data-types/int-uint.md). **Returned value** -- Parent H3 index. - -Type: [UInt64](../../../sql-reference/data-types/int-uint.md). +- Parent H3 index. [UInt64](../../../sql-reference/data-types/int-uint.md). **Example** @@ -572,13 +551,11 @@ h3ToString(index) **Parameter** -- `index` — Hexagon index number. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). +- `index` — Hexagon index number. [UInt64](../../../sql-reference/data-types/int-uint.md). **Returned value** -- String representation of the H3 index. - -Type: [String](../../../sql-reference/data-types/string.md). +- String representation of the H3 index. [String](../../../sql-reference/data-types/string.md). **Example** @@ -608,11 +585,11 @@ stringToH3(index_str) **Parameter** -- `index_str` — String representation of the H3 index. Type: [String](../../../sql-reference/data-types/string.md). +- `index_str` — String representation of the H3 index. [String](../../../sql-reference/data-types/string.md). **Returned value** -- Hexagon index number. Returns 0 on error. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). +- Hexagon index number. Returns 0 on error. [UInt64](../../../sql-reference/data-types/int-uint.md). **Example** @@ -642,11 +619,11 @@ h3GetResolution(index) **Parameter** -- `index` — Hexagon index number. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). +- `index` — Hexagon index number. [UInt64](../../../sql-reference/data-types/int-uint.md). **Returned value** -- Index resolution. Range: `[0, 15]`. Type: [UInt8](../../../sql-reference/data-types/int-uint.md). +- Index resolution. Range: `[0, 15]`. [UInt8](../../../sql-reference/data-types/int-uint.md). 
**Example** @@ -676,14 +653,12 @@ h3IsResClassIII(index) **Parameter** -- `index` — Hexagon index number. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). +- `index` — Hexagon index number. [UInt64](../../../sql-reference/data-types/int-uint.md). **Returned value** -- `1` — Index has a resolution with Class III orientation. -- `0` — Index doesn't have a resolution with Class III orientation. - -Type: [UInt8](../../../sql-reference/data-types/int-uint.md). +- `1` — Index has a resolution with Class III orientation. [UInt8](../../../sql-reference/data-types/int-uint.md). +- `0` — Index doesn't have a resolution with Class III orientation. [UInt8](../../../sql-reference/data-types/int-uint.md). **Example** @@ -713,14 +688,12 @@ h3IsPentagon(index) **Parameter** -- `index` — Hexagon index number. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). +- `index` — Hexagon index number. [UInt64](../../../sql-reference/data-types/int-uint.md). **Returned value** -- `1` — Index represents a pentagonal cell. -- `0` — Index doesn't represent a pentagonal cell. - -Type: [UInt8](../../../sql-reference/data-types/int-uint.md). +- `1` — Index represents a pentagonal cell. [UInt8](../../../sql-reference/data-types/int-uint.md). +- `0` — Index doesn't represent a pentagonal cell. [UInt8](../../../sql-reference/data-types/int-uint.md). **Example** @@ -750,13 +723,11 @@ h3GetFaces(index) **Parameter** -- `index` — Hexagon index number. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). +- `index` — Hexagon index number. [UInt64](../../../sql-reference/data-types/int-uint.md). **Returned values** -- Array containing icosahedron faces intersected by a given H3 index. - -Type: [Array](../../../sql-reference/data-types/array.md)([UInt64](../../../sql-reference/data-types/int-uint.md)). +- Array containing icosahedron faces intersected by a given H3 index. [Array](../../../sql-reference/data-types/array.md)([UInt64](../../../sql-reference/data-types/int-uint.md)). **Example** @@ -786,13 +757,11 @@ h3CellAreaM2(index) **Parameter** -- `index` — Hexagon index number. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). +- `index` — Hexagon index number. [UInt64](../../../sql-reference/data-types/int-uint.md). **Returned value** -- Cell area in square meters. - -Type: [Float64](../../../sql-reference/data-types/float.md). +- Cell area in square meters. [Float64](../../../sql-reference/data-types/float.md). **Example** @@ -822,13 +791,11 @@ h3CellAreaRads2(index) **Parameter** -- `index` — Hexagon index number. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). +- `index` — Hexagon index number. [UInt64](../../../sql-reference/data-types/int-uint.md). **Returned value** -- Cell area in square radians. - -Type: [Float64](../../../sql-reference/data-types/float.md). +- Cell area in square radians. [Float64](../../../sql-reference/data-types/float.md). **Example** @@ -858,14 +825,12 @@ h3ToCenterChild(index, resolution) **Parameter** -- `index` — Hexagon index number. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). -- `resolution` — Index resolution. Range: `[0, 15]`. Type: [UInt8](../../../sql-reference/data-types/int-uint.md). +- `index` — Hexagon index number. [UInt64](../../../sql-reference/data-types/int-uint.md). +- `resolution` — Index resolution. Range: `[0, 15]`. [UInt8](../../../sql-reference/data-types/int-uint.md). **Returned values** -- [H3](#h3index) index of the center child contained by given [H3](#h3index) at the given resolution. 
- -Type: [UInt64](../../../sql-reference/data-types/int-uint.md). +- [H3](#h3index) index of the center child contained by given [H3](#h3index) at the given resolution. [UInt64](../../../sql-reference/data-types/int-uint.md). **Example** @@ -895,13 +860,11 @@ h3ExactEdgeLengthM(index) **Parameter** -- `index` — Hexagon index number. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). +- `index` — Hexagon index number. [UInt64](../../../sql-reference/data-types/int-uint.md). **Returned value** -- Exact edge length in meters. - -Type: [Float64](../../../sql-reference/data-types/float.md). +- Exact edge length in meters. [Float64](../../../sql-reference/data-types/float.md). **Example** @@ -931,13 +894,11 @@ h3ExactEdgeLengthKm(index) **Parameter** -- `index` — Hexagon index number. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). +- `index` — Hexagon index number. [UInt64](../../../sql-reference/data-types/int-uint.md). **Returned value** -- Exact edge length in kilometers. - -Type: [Float64](../../../sql-reference/data-types/float.md). +- Exact edge length in kilometers. [Float64](../../../sql-reference/data-types/float.md). **Example** @@ -967,13 +928,11 @@ h3ExactEdgeLengthRads(index) **Parameter** -- `index` — Hexagon index number. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). +- `index` — Hexagon index number. [UInt64](../../../sql-reference/data-types/int-uint.md). **Returned value** -- Exact edge length in radians. - -Type: [Float64](../../../sql-reference/data-types/float.md). +- Exact edge length in radians. [Float64](../../../sql-reference/data-types/float.md). **Example** @@ -1003,13 +962,11 @@ h3NumHexagons(resolution) **Parameter** -- `resolution` — Index resolution. Range: `[0, 15]`. Type: [UInt8](../../../sql-reference/data-types/int-uint.md). +- `resolution` — Index resolution. Range: `[0, 15]`. [UInt8](../../../sql-reference/data-types/int-uint.md). **Returned value** -- Number of H3 indices. - -Type: [Int64](../../../sql-reference/data-types/int-uint.md). +- Number of H3 indices. [Int64](../../../sql-reference/data-types/int-uint.md). **Example** @@ -1039,14 +996,12 @@ h3PointDistM(lat1, lon1, lat2, lon2) **Arguments** -- `lat1`, `lon1` — Latitude and Longitude of point1 in degrees. Type: [Float64](../../../sql-reference/data-types/float.md). -- `lat2`, `lon2` — Latitude and Longitude of point2 in degrees. Type: [Float64](../../../sql-reference/data-types/float.md). +- `lat1`, `lon1` — Latitude and Longitude of point1 in degrees. [Float64](../../../sql-reference/data-types/float.md). +- `lat2`, `lon2` — Latitude and Longitude of point2 in degrees. [Float64](../../../sql-reference/data-types/float.md). **Returned values** -- Haversine or great circle distance in meters. - -Type: [Float64](../../../sql-reference/data-types/float.md). +- Haversine or great circle distance in meters.[Float64](../../../sql-reference/data-types/float.md). **Example** @@ -1076,14 +1031,12 @@ h3PointDistKm(lat1, lon1, lat2, lon2) **Arguments** -- `lat1`, `lon1` — Latitude and Longitude of point1 in degrees. Type: [Float64](../../../sql-reference/data-types/float.md). -- `lat2`, `lon2` — Latitude and Longitude of point2 in degrees. Type: [Float64](../../../sql-reference/data-types/float.md). +- `lat1`, `lon1` — Latitude and Longitude of point1 in degrees. [Float64](../../../sql-reference/data-types/float.md). +- `lat2`, `lon2` — Latitude and Longitude of point2 in degrees. [Float64](../../../sql-reference/data-types/float.md). 
**Returned values** -- Haversine or great circle distance in kilometers. - -Type: [Float64](../../../sql-reference/data-types/float.md). +- Haversine or great circle distance in kilometers. [Float64](../../../sql-reference/data-types/float.md). **Example** @@ -1113,14 +1066,12 @@ h3PointDistRads(lat1, lon1, lat2, lon2) **Arguments** -- `lat1`, `lon1` — Latitude and Longitude of point1 in degrees. Type: [Float64](../../../sql-reference/data-types/float.md). -- `lat2`, `lon2` — Latitude and Longitude of point2 in degrees. Type: [Float64](../../../sql-reference/data-types/float.md). +- `lat1`, `lon1` — Latitude and Longitude of point1 in degrees. [Float64](../../../sql-reference/data-types/float.md). +- `lat2`, `lon2` — Latitude and Longitude of point2 in degrees. [Float64](../../../sql-reference/data-types/float.md). **Returned values** -- Haversine or great circle distance in radians. - -Type: [Float64](../../../sql-reference/data-types/float.md). +- Haversine or great circle distance in radians. [Float64](../../../sql-reference/data-types/float.md). **Example** @@ -1150,9 +1101,7 @@ h3GetRes0Indexes() **Returned values** -- Array of all the resolution 0 H3 indexes. - -Type: [Array](../../../sql-reference/data-types/array.md)([UInt64](../../../sql-reference/data-types/int-uint.md)). +- Array of all the resolution 0 H3 indexes. [Array](../../../sql-reference/data-types/array.md)([UInt64](../../../sql-reference/data-types/int-uint.md)). **Example** @@ -1183,13 +1132,11 @@ h3GetPentagonIndexes(resolution) **Parameter** -- `resolution` — Index resolution. Range: `[0, 15]`. Type: [UInt8](../../../sql-reference/data-types/int-uint.md). +- `resolution` — Index resolution. Range: `[0, 15]`. [UInt8](../../../sql-reference/data-types/int-uint.md). **Returned value** -- Array of all pentagon H3 indexes. - -Type: [Array](../../../sql-reference/data-types/array.md)([UInt64](../../../sql-reference/data-types/int-uint.md)). +- Array of all pentagon H3 indexes. [Array](../../../sql-reference/data-types/array.md)([UInt64](../../../sql-reference/data-types/int-uint.md)). **Example** @@ -1219,14 +1166,12 @@ h3Line(start,end) **Parameter** -- `start` — Hexagon index number that represents a starting point. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). -- `end` — Hexagon index number that represents an ending point. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). +- `start` — Hexagon index number that represents a starting point. [UInt64](../../../sql-reference/data-types/int-uint.md). +- `end` — Hexagon index number that represents an ending point. [UInt64](../../../sql-reference/data-types/int-uint.md). **Returned value** -Array of h3 indexes representing the line of indices between the two provided indices: - -Type: [Array](../../../sql-reference/data-types/array.md)([UInt64](../../../sql-reference/data-types/int-uint.md)). +Array of h3 indexes representing the line of indices between the two provided indices. [Array](../../../sql-reference/data-types/array.md)([UInt64](../../../sql-reference/data-types/int-uint.md)). **Example** @@ -1256,14 +1201,12 @@ h3Distance(start,end) **Parameter** -- `start` — Hexagon index number that represents a starting point. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). -- `end` — Hexagon index number that represents an ending point. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). +- `start` — Hexagon index number that represents a starting point. [UInt64](../../../sql-reference/data-types/int-uint.md). 
+- `end` — Hexagon index number that represents an ending point. [UInt64](../../../sql-reference/data-types/int-uint.md). **Returned value** -- Number of grid cells. - -Type: [Int64](../../../sql-reference/data-types/int-uint.md). +- Number of grid cells. [Int64](../../../sql-reference/data-types/int-uint.md). Returns a negative number if finding the distance fails. @@ -1297,14 +1240,12 @@ h3HexRing(index, k) **Parameter** -- `index` — Hexagon index number that represents the origin. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). -- `k` — Distance. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). +- `index` — Hexagon index number that represents the origin. [UInt64](../../../sql-reference/data-types/int-uint.md). +- `k` — Distance. [UInt64](../../../sql-reference/data-types/int-uint.md). **Returned values** -- Array of H3 indexes. - -Type: [Array](../../../sql-reference/data-types/array.md)([UInt64](../../../sql-reference/data-types/int-uint.md)). +- Array of H3 indexes. [Array](../../../sql-reference/data-types/array.md)([UInt64](../../../sql-reference/data-types/int-uint.md)). **Example** @@ -1334,14 +1275,12 @@ h3GetUnidirectionalEdge(originIndex, destinationIndex) **Parameter** -- `originIndex` — Origin Hexagon index number. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). -- `destinationIndex` — Destination Hexagon index number. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). +- `originIndex` — Origin Hexagon index number. [UInt64](../../../sql-reference/data-types/int-uint.md). +- `destinationIndex` — Destination Hexagon index number. [UInt64](../../../sql-reference/data-types/int-uint.md). **Returned value** -- Unidirectional Edge Hexagon Index number. - -Type: [UInt64](../../../sql-reference/data-types/int-uint.md). +- Unidirectional Edge Hexagon Index number. [UInt64](../../../sql-reference/data-types/int-uint.md). **Example** @@ -1371,14 +1310,12 @@ h3UnidirectionalEdgeisValid(index) **Parameter** -- `index` — Hexagon index number. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). +- `index` — Hexagon index number. [UInt64](../../../sql-reference/data-types/int-uint.md). **Returned value** -- 1 — The H3 index is a valid unidirectional edge. -- 0 — The H3 index is not a valid unidirectional edge. - -Type: [UInt8](../../../sql-reference/data-types/int-uint.md). +- 1 — The H3 index is a valid unidirectional edge. [UInt8](../../../sql-reference/data-types/int-uint.md). +- 0 — The H3 index is not a valid unidirectional edge. [UInt8](../../../sql-reference/data-types/int-uint.md). **Example** @@ -1408,13 +1345,11 @@ h3GetOriginIndexFromUnidirectionalEdge(edge) **Parameter** -- `edge` — Hexagon index number that represents a unidirectional edge. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). +- `edge` — Hexagon index number that represents a unidirectional edge. [UInt64](../../../sql-reference/data-types/int-uint.md). **Returned value** -- Origin Hexagon Index number. - -Type: [UInt64](../../../sql-reference/data-types/int-uint.md). +- Origin Hexagon Index number. [UInt64](../../../sql-reference/data-types/int-uint.md). **Example** @@ -1444,13 +1379,11 @@ h3GetDestinationIndexFromUnidirectionalEdge(edge) **Parameter** -- `edge` — Hexagon index number that represents a unidirectional edge. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). +- `edge` — Hexagon index number that represents a unidirectional edge. [UInt64](../../../sql-reference/data-types/int-uint.md). 
**Returned value** -- Destination Hexagon Index number. - -Type: [UInt64](../../../sql-reference/data-types/int-uint.md). +- Destination Hexagon Index number. [UInt64](../../../sql-reference/data-types/int-uint.md). **Example** @@ -1480,7 +1413,7 @@ h3GetIndexesFromUnidirectionalEdge(edge) **Parameter** -- `edge` — Hexagon index number that represents a unidirectional edge. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). +- `edge` — Hexagon index number that represents a unidirectional edge. [UInt64](../../../sql-reference/data-types/int-uint.md). **Returned value** @@ -1519,13 +1452,11 @@ h3GetUnidirectionalEdgesFromHexagon(index) **Parameter** -- `index` — Hexagon index number that represents a unidirectional edge. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). +- `index` — Hexagon index number that represents a unidirectional edge. [UInt64](../../../sql-reference/data-types/int-uint.md). **Returned value** -Array of h3 indexes representing each unidirectional edge: - -Type: [Array](../../../sql-reference/data-types/array.md)([UInt64](../../../sql-reference/data-types/int-uint.md)). +Array of h3 indexes representing each unidirectional edge. [Array](../../../sql-reference/data-types/array.md)([UInt64](../../../sql-reference/data-types/int-uint.md)). **Example** @@ -1555,12 +1486,11 @@ h3GetUnidirectionalEdgeBoundary(index) **Parameter** -- `index` — Hexagon index number that represents a unidirectional edge. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). +- `index` — Hexagon index number that represents a unidirectional edge. [UInt64](../../../sql-reference/data-types/int-uint.md). **Returned value** -- Array of pairs '(lon, lat)'. - Type: [Array](../../../sql-reference/data-types/array.md)([Float64](../../../sql-reference/data-types/float.md), [Float64](../../../sql-reference/data-types/float.md)). +- Array of pairs '(lon, lat)'. [Array](../../../sql-reference/data-types/array.md)([Float64](../../../sql-reference/data-types/float.md), [Float64](../../../sql-reference/data-types/float.md)). **Example** diff --git a/docs/en/sql-reference/functions/geo/s2.md b/docs/en/sql-reference/functions/geo/s2.md index f4702eff44b..424b547753d 100644 --- a/docs/en/sql-reference/functions/geo/s2.md +++ b/docs/en/sql-reference/functions/geo/s2.md @@ -26,9 +26,7 @@ geoToS2(lon, lat) **Returned values** -- S2 point index. - -Type: [UInt64](../../../sql-reference/data-types/int-uint.md). +- S2 point index. [UInt64](../../../sql-reference/data-types/int-uint.md). **Example** @@ -62,9 +60,9 @@ s2ToGeo(s2index) **Returned values** -- A tuple consisting of two values: `tuple(lon,lat)`. - -Type: `lon` — [Float64](../../../sql-reference/data-types/float.md). `lat` — [Float64](../../../sql-reference/data-types/float.md). +- A [tuple](../../data-types/tuple.md) consisting of two values: + - `lon`. [Float64](../../../sql-reference/data-types/float.md). + - `lat`. [Float64](../../../sql-reference/data-types/float.md). **Example** @@ -98,9 +96,7 @@ s2GetNeighbors(s2index) **Returned values** -- An array consisting of 4 neighbor indexes: `array[s2index1, s2index3, s2index2, s2index4]`. - -Type: [UInt64](../../../sql-reference/data-types/int-uint.md). +- An array consisting of 4 neighbor indexes: `array[s2index1, s2index3, s2index2, s2index4]`. [Array](../../data-types/array.md)([UInt64](../../../sql-reference/data-types/int-uint.md)). **Example** @@ -134,10 +130,8 @@ s2CellsIntersect(s2index1, s2index2) **Returned values** -- 1 — If the cells intersect. 
-- 0 — If the cells don't intersect. - -Type: [UInt8](../../../sql-reference/data-types/int-uint.md). +- 1 — If the cells intersect. [UInt8](../../../sql-reference/data-types/int-uint.md). +- 0 — If the cells don't intersect. [UInt8](../../../sql-reference/data-types/int-uint.md). **Example** @@ -173,10 +167,8 @@ s2CapContains(center, degrees, point) **Returned values** -- 1 — If the cap contains the S2 point index. -- 0 — If the cap doesn't contain the S2 point index. - -Type: [UInt8](../../../sql-reference/data-types/int-uint.md). +- 1 — If the cap contains the S2 point index. [UInt8](../../../sql-reference/data-types/int-uint.md). +- 0 — If the cap doesn't contain the S2 point index. [UInt8](../../../sql-reference/data-types/int-uint.md). **Example** @@ -211,8 +203,8 @@ s2CapUnion(center1, radius1, center2, radius2) **Returned values** -- `center` — S2 point index corresponding the center of the smallest cap containing the two input caps. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). -- `radius` — Radius of the smallest cap containing the two input caps. Type: [Float64](../../../sql-reference/data-types/float.md). +- `center` — S2 point index corresponding the center of the smallest cap containing the two input caps. [UInt64](../../../sql-reference/data-types/int-uint.md). +- `radius` — Radius of the smallest cap containing the two input caps. [Float64](../../../sql-reference/data-types/float.md). **Example** @@ -248,8 +240,8 @@ s2RectAdd(s2pointLow, s2pointHigh, s2Point) **Returned values** -- `s2PointLow` — Low S2 cell id corresponding to the grown rectangle. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). -- `s2PointHigh` — Height S2 cell id corresponding to the grown rectangle. Type: [UInt64](../../../sql-reference/data-types/float.md). +- `s2PointLow` — Low S2 cell id corresponding to the grown rectangle. [UInt64](../../../sql-reference/data-types/int-uint.md). +- `s2PointHigh` — Height S2 cell id corresponding to the grown rectangle. [UInt64](../../../sql-reference/data-types/float.md). **Example** @@ -321,8 +313,8 @@ s2RectUnion(s2Rect1PointLow, s2Rect1PointHi, s2Rect2PointLow, s2Rect2PointHi) **Returned values** -- `s2UnionRect2PointLow` — Low S2 cell id corresponding to the union rectangle. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). -- `s2UnionRect2PointHi` — High S2 cell id corresponding to the union rectangle. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). +- `s2UnionRect2PointLow` — Low S2 cell id corresponding to the union rectangle. [UInt64](../../../sql-reference/data-types/int-uint.md). +- `s2UnionRect2PointHi` — High S2 cell id corresponding to the union rectangle. [UInt64](../../../sql-reference/data-types/int-uint.md). **Example** @@ -357,8 +349,8 @@ s2RectIntersection(s2Rect1PointLow, s2Rect1PointHi, s2Rect2PointLow, s2Rect2Poin **Returned values** -- `s2UnionRect2PointLow` — Low S2 cell id corresponding to the rectangle containing the intersection of the given rectangles. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). -- `s2UnionRect2PointHi` — High S2 cell id corresponding to the rectangle containing the intersection of the given rectangles. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). +- `s2UnionRect2PointLow` — Low S2 cell id corresponding to the rectangle containing the intersection of the given rectangles. [UInt64](../../../sql-reference/data-types/int-uint.md). 
+- `s2UnionRect2PointHi` — High S2 cell id corresponding to the rectangle containing the intersection of the given rectangles. [UInt64](../../../sql-reference/data-types/int-uint.md). **Example** diff --git a/docs/en/sql-reference/functions/geo/svg.md b/docs/en/sql-reference/functions/geo/svg.md index c565d1f9de7..320d4542fee 100644 --- a/docs/en/sql-reference/functions/geo/svg.md +++ b/docs/en/sql-reference/functions/geo/svg.md @@ -23,13 +23,11 @@ Aliases: `SVG`, `svg` **Returned value** -- The SVG representation of the geometry: +- The SVG representation of the geometry. [String](../../data-types/string). - SVG circle - SVG polygon - SVG path -Type: [String](../../data-types/string) - **Examples** **Circle** diff --git a/docs/en/sql-reference/functions/rounding-functions.md b/docs/en/sql-reference/functions/rounding-functions.md index 6cbcc4e4ef3..20f73de4410 100644 --- a/docs/en/sql-reference/functions/rounding-functions.md +++ b/docs/en/sql-reference/functions/rounding-functions.md @@ -328,15 +328,13 @@ roundAge(num) **Returned value** -- Returns `0`, for $age \lt 1$. -- Returns `17`, for $1 \leq age \leq 17$. -- Returns `18`, for $18 \leq age \leq 24$. -- Returns `25`, for $25 \leq age \leq 34$. -- Returns `35`, for $35 \leq age \leq 44$. -- Returns `45`, for $45 \leq age \leq 54$. -- Returns `55`, for $age \geq 55$. - -Type: [UInt8](../data-types/int-uint.md) in all cases. +- Returns `0`, for $age \lt 1$. [UInt8](../data-types/int-uint.md). +- Returns `17`, for $1 \leq age \leq 17$. [UInt8](../data-types/int-uint.md). +- Returns `18`, for $18 \leq age \leq 24$. [UInt8](../data-types/int-uint.md). +- Returns `25`, for $25 \leq age \leq 34$. [UInt8](../data-types/int-uint.md). +- Returns `35`, for $35 \leq age \leq 44$. [UInt8](../data-types/int-uint.md). +- Returns `45`, for $45 \leq age \leq 54$. [UInt8](../data-types/int-uint.md). +- Returns `55`, for $age \geq 55$. [UInt8](../data-types/int-uint.md). **Example** diff --git a/docs/en/sql-reference/functions/string-search-functions.md b/docs/en/sql-reference/functions/string-search-functions.md index 327eb8994db..f02c8f15aa9 100644 --- a/docs/en/sql-reference/functions/string-search-functions.md +++ b/docs/en/sql-reference/functions/string-search-functions.md @@ -42,8 +42,8 @@ Alias: **Returned values** -- Starting position in bytes and counting from 1, if the substring was found. -- 0, if the substring was not found. +- Starting position in bytes and counting from 1, if the substring was found. [UInt64](../../sql-reference/data-types/int-uint.md). +- 0, if the substring was not found. [UInt64](../../sql-reference/data-types/int-uint.md). If substring `needle` is empty, these rules apply: - if no `start_pos` was specified: return `1` @@ -53,8 +53,6 @@ If substring `needle` is empty, these rules apply: The same rules also apply to functions `locate`, `positionCaseInsensitive`, `positionUTF8` and `positionCaseInsensitiveUTF8`. -Type: `Integer`. - **Examples** Query: diff --git a/docs/en/sql-reference/functions/uuid-functions.md b/docs/en/sql-reference/functions/uuid-functions.md index 0c1da88913d..a16663afc5b 100644 --- a/docs/en/sql-reference/functions/uuid-functions.md +++ b/docs/en/sql-reference/functions/uuid-functions.md @@ -640,9 +640,7 @@ UUIDv7ToDateTime(uuid[, timezone]) **Returned value** -- Timestamp with milliseconds precision. If the UUID is not a valid version 7 UUID, it returns 1970-01-01 00:00:00.000. - -Type: [DateTime64(3)](/docs/en/sql-reference/data-types/datetime64.md). +- Timestamp with milliseconds precision. 
If the UUID is not a valid version 7 UUID, it returns 1970-01-01 00:00:00.000. [DateTime64(3)](/docs/en/sql-reference/data-types/datetime64.md). **Usage examples** @@ -682,9 +680,7 @@ serverUUID() **Returned value** -- The UUID of the server. - -Type: [UUID](../data-types/uuid.md). +- The UUID of the server. [UUID](../data-types/uuid.md). ## See also From 45e4e30cfd13f35bda29629d42f881c69bbf5250 Mon Sep 17 00:00:00 2001 From: Blargian Date: Thu, 23 May 2024 16:51:17 +0200 Subject: [PATCH 290/392] Update retuurn type of logical functions --- .../functions/logical-functions.md | 24 +++++++------------ 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/docs/en/sql-reference/functions/logical-functions.md b/docs/en/sql-reference/functions/logical-functions.md index 138b804a575..1977c5c2a7e 100644 --- a/docs/en/sql-reference/functions/logical-functions.md +++ b/docs/en/sql-reference/functions/logical-functions.md @@ -30,11 +30,9 @@ Alias: The [AND operator](../../sql-reference/operators/index.md#logical-and-ope **Returned value** -- `0`, if at least one argument evaluates to `false`, -- `NULL`, if no argument evaluates to `false` and at least one argument is `NULL`, -- `1`, otherwise. - -Type: [UInt8](../../sql-reference/data-types/int-uint.md) or [Nullable](../../sql-reference/data-types/nullable.md)([UInt8](../../sql-reference/data-types/int-uint.md)). +- `0`, if at least one argument evaluates to `false`. [UInt8](../../sql-reference/data-types/int-uint.md) or [Nullable](../../sql-reference/data-types/nullable.md)([UInt8](../../sql-reference/data-types/int-uint.md)). +- `NULL`, if no argument evaluates to `false` and at least one argument is `NULL`. [NULL](../../sql-reference/syntax.md/#null). +- `1`, otherwise. [UInt8](../../sql-reference/data-types/int-uint.md) or [Nullable](../../sql-reference/data-types/nullable.md)([UInt8](../../sql-reference/data-types/int-uint.md)). **Example** @@ -136,11 +134,9 @@ Alias: The [Negation operator](../../sql-reference/operators/index.md#logical-ne **Returned value** -- `1`, if `val` evaluates to `false`, -- `0`, if `val` evaluates to `true`, -- `NULL`, if `val` is `NULL`. - -Type: [UInt8](../../sql-reference/data-types/int-uint.md) or [Nullable](../../sql-reference/data-types/nullable.md)([UInt8](../../sql-reference/data-types/int-uint.md)). +- `1`, if `val` evaluates to `false`. [UInt8](../../sql-reference/data-types/int-uint.md) or [Nullable](../../sql-reference/data-types/nullable.md)([UInt8](../../sql-reference/data-types/int-uint.md)). +- `0`, if `val` evaluates to `true`. [UInt8](../../sql-reference/data-types/int-uint.md) or [Nullable](../../sql-reference/data-types/nullable.md)([UInt8](../../sql-reference/data-types/int-uint.md)). +- `NULL`, if `val` is `NULL`. [NULL](../../sql-reference/syntax.md/#null). **Example** @@ -172,11 +168,9 @@ xor(val1, val2...) **Returned value** -- `1`, for two values: if one of the values evaluates to `false` and other does not, -- `0`, for two values: if both values evaluate to `false` or to both `true`, -- `NULL`, if at least one of the inputs is `NULL` - -Type: [UInt8](../../sql-reference/data-types/int-uint.md) or [Nullable](../../sql-reference/data-types/nullable.md)([UInt8](../../sql-reference/data-types/int-uint.md)). +- `1`, for two values: if one of the values evaluates to `false` and other does not. [UInt8](../../sql-reference/data-types/int-uint.md) or [Nullable](../../sql-reference/data-types/nullable.md)([UInt8](../../sql-reference/data-types/int-uint.md)). 
+- `0`, for two values: if both values evaluate to `false` or to both `true`. [UInt8](../../sql-reference/data-types/int-uint.md) or [Nullable](../../sql-reference/data-types/nullable.md)([UInt8](../../sql-reference/data-types/int-uint.md)). +- `NULL`, if at least one of the inputs is `NULL`. [NULL](../../sql-reference/syntax.md/#null). **Example** From 60e94af1ecd1e2b3e5b3f3194901d001653b7991 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Thu, 23 May 2024 16:55:02 +0200 Subject: [PATCH 291/392] Return one line change --- src/Planner/PlannerExpressionAnalysis.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Planner/PlannerExpressionAnalysis.cpp b/src/Planner/PlannerExpressionAnalysis.cpp index 6e194b2c03e..7984d97a1ea 100644 --- a/src/Planner/PlannerExpressionAnalysis.cpp +++ b/src/Planner/PlannerExpressionAnalysis.cpp @@ -444,7 +444,6 @@ SortAnalysisResult analyzeSort(const QueryNode & query_node, for (auto & interpolate_node : interpolate_list_node.getNodes()) { auto & interpolate_node_typed = interpolate_node->as(); - interpolate_actions_visitor.visit(interpolate_actions_dag, interpolate_node_typed.getExpression()); interpolate_actions_visitor.visit(interpolate_actions_dag, interpolate_node_typed.getInterpolateExpression()); } From a4903e6b5583b172496be8fa0dbf6cead2b51d86 Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 23 May 2024 16:55:48 +0200 Subject: [PATCH 292/392] Add supportsDynamicSubcolumns() --- src/Storages/ObjectStorage/StorageObjectStorage.h | 2 ++ src/Storages/ObjectStorage/StorageObjectStorageCluster.h | 2 ++ 2 files changed, 4 insertions(+) diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.h b/src/Storages/ObjectStorage/StorageObjectStorage.h index de75af5035b..f45d8c1f01a 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.h +++ b/src/Storages/ObjectStorage/StorageObjectStorage.h @@ -84,6 +84,8 @@ public: bool supportsSubcolumns() const override { return true; } + bool supportsDynamicSubcolumns() const override { return true; } + bool supportsTrivialCountOptimization(const StorageSnapshotPtr &, ContextPtr) const override { return true; } bool supportsSubsetOfColumns(const ContextPtr & context) const; diff --git a/src/Storages/ObjectStorage/StorageObjectStorageCluster.h b/src/Storages/ObjectStorage/StorageObjectStorageCluster.h index 1c244b1ca36..69fec2b3c77 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageCluster.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageCluster.h @@ -26,6 +26,8 @@ public: bool supportsSubcolumns() const override { return true; } + bool supportsDynamicSubcolumns() const override { return true; } + bool supportsTrivialCountOptimization(const StorageSnapshotPtr &, ContextPtr) const override { return true; } RemoteQueryExecutor::Extension getTaskIteratorExtension( From 9481f2f32535630694b9c328384b69116f3b535b Mon Sep 17 00:00:00 2001 From: Shaun Struwig <41984034+Blargian@users.noreply.github.com> Date: Thu, 23 May 2024 17:07:55 +0200 Subject: [PATCH 293/392] Update array-functions.md Add missing ::: for note --- docs/en/sql-reference/functions/array-functions.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/en/sql-reference/functions/array-functions.md b/docs/en/sql-reference/functions/array-functions.md index 512874d20b7..458adb276fd 100644 --- a/docs/en/sql-reference/functions/array-functions.md +++ b/docs/en/sql-reference/functions/array-functions.md @@ -2373,6 +2373,7 @@ arrayMin([func,] arr) :::note If `func` is specified, then the return type matches the return value type of `func`, 
otherwise it matches the type of the array elements. +::: **Examples** From 9cfd2322d717fc6d2208683b224ee6969932de79 Mon Sep 17 00:00:00 2001 From: Shaun Struwig <41984034+Blargian@users.noreply.github.com> Date: Thu, 23 May 2024 17:14:56 +0200 Subject: [PATCH 294/392] Small edits to bit-functions.md --- docs/en/sql-reference/functions/bit-functions.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/en/sql-reference/functions/bit-functions.md b/docs/en/sql-reference/functions/bit-functions.md index 709f438d67f..2538ad32022 100644 --- a/docs/en/sql-reference/functions/bit-functions.md +++ b/docs/en/sql-reference/functions/bit-functions.md @@ -186,9 +186,9 @@ SELECT bitTest(number, index) - `number` – Integer number. - `index` – Position of bit. -**Returned values** +**Returned value** -Returns a value of bit at specified position. [UInt8](../data-types/int-uint.md). +- Value of the bit at the specified position. [UInt8](../data-types/int-uint.md). **Example** @@ -249,9 +249,9 @@ SELECT bitTestAll(number, index1, index2, index3, index4, ...) - `number` – Integer number. - `index1`, `index2`, `index3`, `index4` – Positions of bit. For example, for set of positions (`index1`, `index2`, `index3`, `index4`) is true if and only if all of its positions are true (`index1` ⋀ `index2`, ⋀ `index3` ⋀ `index4`). -**Returned values** +**Returned value** -Returns result of logical conjuction. [UInt8](../data-types/int-uint.md). +- Result of the logical conjuction. [UInt8](../data-types/int-uint.md). **Example** @@ -312,9 +312,9 @@ SELECT bitTestAny(number, index1, index2, index3, index4, ...) - `number` – Integer number. - `index1`, `index2`, `index3`, `index4` – Positions of bit. -**Returned values** +**Returned value** -Returns result of logical disjunction. [UInt8](../data-types/int-uint.md). +- Result of the logical disjunction. [UInt8](../data-types/int-uint.md). **Example** From a01b6e8e8278b531a72463eb6f1920fe8d682c0e Mon Sep 17 00:00:00 2001 From: Shaun Struwig <41984034+Blargian@users.noreply.github.com> Date: Thu, 23 May 2024 17:19:03 +0200 Subject: [PATCH 295/392] Numbers in return type should be in `` --- docs/en/sql-reference/functions/geo/s2.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/en/sql-reference/functions/geo/s2.md b/docs/en/sql-reference/functions/geo/s2.md index 424b547753d..2158ef2d57d 100644 --- a/docs/en/sql-reference/functions/geo/s2.md +++ b/docs/en/sql-reference/functions/geo/s2.md @@ -94,7 +94,7 @@ s2GetNeighbors(s2index) - `s2index` — S2 Index. [UInt64](../../../sql-reference/data-types/int-uint.md). -**Returned values** +**Returned value** - An array consisting of 4 neighbor indexes: `array[s2index1, s2index3, s2index2, s2index4]`. [Array](../../data-types/array.md)([UInt64](../../../sql-reference/data-types/int-uint.md)). @@ -128,10 +128,10 @@ s2CellsIntersect(s2index1, s2index2) - `siIndex1`, `s2index2` — S2 Index. [UInt64](../../../sql-reference/data-types/int-uint.md). -**Returned values** +**Returned value** -- 1 — If the cells intersect. [UInt8](../../../sql-reference/data-types/int-uint.md). -- 0 — If the cells don't intersect. [UInt8](../../../sql-reference/data-types/int-uint.md). +- `1` — If the cells intersect. [UInt8](../../../sql-reference/data-types/int-uint.md). +- `0` — If the cells don't intersect. [UInt8](../../../sql-reference/data-types/int-uint.md). **Example** @@ -165,10 +165,10 @@ s2CapContains(center, degrees, point) - `degrees` — Radius of the cap in degrees. 
[Float64](../../../sql-reference/data-types/float.md). - `point` — S2 point index. [UInt64](../../../sql-reference/data-types/int-uint.md). -**Returned values** +**Returned value** -- 1 — If the cap contains the S2 point index. [UInt8](../../../sql-reference/data-types/int-uint.md). -- 0 — If the cap doesn't contain the S2 point index. [UInt8](../../../sql-reference/data-types/int-uint.md). +- `1` — If the cap contains the S2 point index. [UInt8](../../../sql-reference/data-types/int-uint.md). +- `0` — If the cap doesn't contain the S2 point index. [UInt8](../../../sql-reference/data-types/int-uint.md). **Example** @@ -275,10 +275,10 @@ s2RectContains(s2PointLow, s2PointHi, s2Point) - `s2PointHigh` — High S2 point index corresponding to the rectangle. [UInt64](../../../sql-reference/data-types/int-uint.md). - `s2Point` — Target S2 point index. [UInt64](../../../sql-reference/data-types/int-uint.md). -**Returned values** +**Returned value** -- 1 — If the rectangle contains the given S2 point. -- 0 — If the rectangle doesn't contain the given S2 point. +- `1` — If the rectangle contains the given S2 point. +- `0` — If the rectangle doesn't contain the given S2 point. **Example** From 732b6d1ecc5df7360e0290e950904b7512711777 Mon Sep 17 00:00:00 2001 From: Shaun Struwig <41984034+Blargian@users.noreply.github.com> Date: Thu, 23 May 2024 17:22:02 +0200 Subject: [PATCH 296/392] Add hyphens to return values --- .../functions/splitting-merging-functions.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/en/sql-reference/functions/splitting-merging-functions.md b/docs/en/sql-reference/functions/splitting-merging-functions.md index 77563713605..8aa171949a3 100644 --- a/docs/en/sql-reference/functions/splitting-merging-functions.md +++ b/docs/en/sql-reference/functions/splitting-merging-functions.md @@ -25,7 +25,7 @@ splitByChar(separator, s[, max_substrings])) **Returned value(s)** -Returns an array of selected substrings. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +- An array of selected substrings. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). :::note Empty substrings may be selected when: @@ -78,7 +78,7 @@ splitByString(separator, s[, max_substrings])) **Returned value(s)** -Returns an array of selected substrings. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +- An array of selected substrings. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). :::note Empty substrings may be selected when: @@ -135,7 +135,7 @@ splitByRegexp(regexp, s[, max_substrings])) **Returned value(s)** -Returns an array of selected substrings. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +- An array of selected substrings. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). :::note Empty substrings may be selected when: @@ -192,7 +192,7 @@ splitByWhitespace(s[, max_substrings])) **Returned value(s)** -Returns an array of selected substrings. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +- An array of selected substrings. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). 
:::note Setting [splitby_max_substrings_includes_remaining_string](../../operations/settings/settings.md#splitby_max_substrings_includes_remaining_string) (default: 0) controls if the remaining string is included in the last element of the result array when argument `max_substrings` > 0. @@ -231,7 +231,7 @@ splitByNonAlpha(s[, max_substrings])) **Returned value(s)** -Returns an array of selected substrings. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +- An array of selected substrings. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). :::note Setting [splitby_max_substrings_includes_remaining_string](../../operations/settings/settings.md#splitby_max_substrings_includes_remaining_string) (default: 0) controls if the remaining string is included in the last element of the result array when argument `max_substrings` > 0. @@ -293,7 +293,7 @@ Alias: `splitByAlpha` **Returned value(s)** -Returns an array of selected substrings. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +- An array of selected substrings. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). :::note Setting [splitby_max_substrings_includes_remaining_string](../../operations/settings/settings.md#splitby_max_substrings_includes_remaining_string) (default: 0) controls if the remaining string is included in the last element of the result array when argument `max_substrings` > 0. From bab94ac56aa0ef568d34dd1e230e29190e8eaec9 Mon Sep 17 00:00:00 2001 From: Shaun Struwig <41984034+Blargian@users.noreply.github.com> Date: Thu, 23 May 2024 17:24:07 +0200 Subject: [PATCH 297/392] Correct "note:::" to ":::note" --- docs/en/sql-reference/functions/hash-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/functions/hash-functions.md b/docs/en/sql-reference/functions/hash-functions.md index 89b95888f85..e3968a691a8 100644 --- a/docs/en/sql-reference/functions/hash-functions.md +++ b/docs/en/sql-reference/functions/hash-functions.md @@ -844,7 +844,7 @@ SELECT xxHash64('') - Hash value. [UInt32/64](../data-types/int-uint.md). -note::: +:::note The return type will be `UInt32` for `xxHash32` and `UInt64` for `xxHash64`. ::: From c1950236ced0b110e679c4042d1fab2c7df26f2f Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 23 May 2024 15:24:18 +0000 Subject: [PATCH 298/392] Cosmetics, pt. IV --- src/Functions/{serial.cpp => generateSerialID.cpp} | 2 -- 1 file changed, 2 deletions(-) rename src/Functions/{serial.cpp => generateSerialID.cpp} (98%) diff --git a/src/Functions/serial.cpp b/src/Functions/generateSerialID.cpp similarity index 98% rename from src/Functions/serial.cpp rename to src/Functions/generateSerialID.cpp index d65df83c9f9..db26d0d684b 100644 --- a/src/Functions/serial.cpp +++ b/src/Functions/generateSerialID.cpp @@ -12,8 +12,6 @@ namespace DB namespace ErrorCodes { - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; - extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int KEEPER_EXCEPTION; } From e6f135089f300a6e5cc0d1276e748750f2b59454 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 23 May 2024 15:25:38 +0000 Subject: [PATCH 299/392] Cosmetics, pt. 
V --- src/Functions/generateSnowflakeID.cpp | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/Functions/generateSnowflakeID.cpp b/src/Functions/generateSnowflakeID.cpp index 1b26bf44adb..bbae41e4f49 100644 --- a/src/Functions/generateSnowflakeID.cpp +++ b/src/Functions/generateSnowflakeID.cpp @@ -11,11 +11,6 @@ namespace DB { -namespace ErrorCodes -{ - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; -} - namespace { @@ -81,7 +76,7 @@ SnowflakeComponents toComponents(uint64_t snowflake) { uint64_t toSnowflakeID(SnowflakeComponents components) { return (components.timestamp << (machine_id_bits_count + machine_seq_num_bits_count) | - components.machind_id << (machine_seq_num_bits_count) | + components.machind_id << (machine_seq_num_bits_count) | components.machine_seq_num); } @@ -120,7 +115,7 @@ RangeOfSnowflakeIDs getRangeOfAvailableIDs(const SnowflakeComponents& available, end.timestamp = begin.timestamp + 1 + (input_rows_count - seq_nums_in_current_timestamp_left) / (max_machine_seq_num + 1); else end.timestamp = begin.timestamp; - + end.machind_id = begin.machind_id; end.machine_seq_num = (begin.machine_seq_num + input_rows_count) & machine_seq_num_mask; From 4611a44c1f76873482fff498f7e7f8414f24e375 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 23 May 2024 15:53:14 +0000 Subject: [PATCH 300/392] Cosmetics, pt. VI --- src/Functions/generateSnowflakeID.cpp | 100 +++++++++++++------------- src/Functions/generateUUIDv7.cpp | 25 ++++--- 2 files changed, 60 insertions(+), 65 deletions(-) diff --git a/src/Functions/generateSnowflakeID.cpp b/src/Functions/generateSnowflakeID.cpp index bbae41e4f49..4e61bd9fb1c 100644 --- a/src/Functions/generateSnowflakeID.cpp +++ b/src/Functions/generateSnowflakeID.cpp @@ -27,7 +27,7 @@ namespace - The first 41 (+ 1 top zero bit) bits is the timestamp (millisecond since Unix epoch 1 Jan 1970) - The middle 10 bits are the machine ID -- The last 12 bits are a counter to disambiguate multiple snowflakeIDs generated within the same millisecond by differen processes +- The last 12 bits are a counter to disambiguate multiple snowflakeIDs generated within the same millisecond by different processes */ /// bit counts @@ -36,14 +36,13 @@ constexpr auto machine_id_bits_count = 10; constexpr auto machine_seq_num_bits_count = 12; /// bits masks for Snowflake ID components -// constexpr uint64_t timestamp_mask = ((1ULL << timestamp_bits_count) - 1) << (machine_id_bits_count + machine_seq_num_bits_count); // unused -constexpr uint64_t machine_id_mask = ((1ULL << machine_id_bits_count) - 1) << machine_seq_num_bits_count; -constexpr uint64_t machine_seq_num_mask = (1ULL << machine_seq_num_bits_count) - 1; +constexpr uint64_t machine_id_mask = ((1ull << machine_id_bits_count) - 1) << machine_seq_num_bits_count; +constexpr uint64_t machine_seq_num_mask = (1ull << machine_seq_num_bits_count) - 1; /// max values constexpr uint64_t max_machine_seq_num = machine_seq_num_mask; -uint64_t getMachineID() +uint64_t getMachineId() { UUID server_uuid = ServerUUID::get(); /// hash into 64 bits @@ -57,48 +56,44 @@ uint64_t getTimestamp() { auto now = std::chrono::system_clock::now(); auto ticks_since_epoch = std::chrono::duration_cast(now.time_since_epoch()).count(); - return static_cast(ticks_since_epoch) & ((1ULL << timestamp_bits_count) - 1); + return static_cast(ticks_since_epoch) & ((1ull << timestamp_bits_count) - 1); } -struct SnowflakeComponents { +struct SnowflakeId +{ uint64_t timestamp; uint64_t machind_id; uint64_t machine_seq_num; }; 
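+/// Packed layout implied by the bit counts above:
+///     snowflake = (timestamp << 22) | (machine id << 12) | sequence number
+/// toSnowflakeId() and fromSnowflakeId() below convert between this packed UInt64
+/// representation and SnowflakeId, and are exact inverses of each other.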
-SnowflakeComponents toComponents(uint64_t snowflake) { - return { - .timestamp = (snowflake >> (machine_id_bits_count + machine_seq_num_bits_count)), - .machind_id = ((snowflake & machine_id_mask) >> machine_seq_num_bits_count), - .machine_seq_num = (snowflake & machine_seq_num_mask) - }; +SnowflakeId toSnowflakeId(uint64_t snowflake) +{ + return {.timestamp = (snowflake >> (machine_id_bits_count + machine_seq_num_bits_count)), + .machind_id = ((snowflake & machine_id_mask) >> machine_seq_num_bits_count), + .machine_seq_num = (snowflake & machine_seq_num_mask)}; } -uint64_t toSnowflakeID(SnowflakeComponents components) { +uint64_t fromSnowflakeId(SnowflakeId components) +{ return (components.timestamp << (machine_id_bits_count + machine_seq_num_bits_count) | components.machind_id << (machine_seq_num_bits_count) | components.machine_seq_num); } -struct RangeOfSnowflakeIDs { - /// [begin, end) - SnowflakeComponents begin, end; +struct SnowflakeIdRange +{ + SnowflakeId begin; /// inclusive + SnowflakeId end; /// exclusive }; -/* Get range of `input_rows_count` Snowflake IDs from `max(available, now)` - -1. Calculate Snowflake ID by current timestamp (`now`) -2. `begin = max(available, now)` -3. Calculate `end = begin + input_rows_count` handling `machine_seq_num` overflow -*/ -RangeOfSnowflakeIDs getRangeOfAvailableIDs(const SnowflakeComponents& available, size_t input_rows_count) +/// To get the range of `input_rows_count` Snowflake IDs from `max(available, now)`: +/// 1. calculate Snowflake ID by current timestamp (`now`) +/// 2. `begin = max(available, now)` +/// 3. Calculate `end = begin + input_rows_count` handling `machine_seq_num` overflow +SnowflakeIdRange getRangeOfAvailableIds(const SnowflakeId & available, size_t input_rows_count) { /// 1. `now` - SnowflakeComponents begin = { - .timestamp = getTimestamp(), - .machind_id = getMachineID(), - .machine_seq_num = 0 - }; + SnowflakeId begin = {.timestamp = getTimestamp(), .machind_id = getMachineId(), .machine_seq_num = 0}; /// 2. `begin` if (begin.timestamp <= available.timestamp) @@ -108,7 +103,7 @@ RangeOfSnowflakeIDs getRangeOfAvailableIDs(const SnowflakeComponents& available, } /// 3. `end = begin + input_rows_count` - SnowflakeComponents end; + SnowflakeId end; const uint64_t seq_nums_in_current_timestamp_left = (max_machine_seq_num - begin.machine_seq_num + 1); if (input_rows_count >= seq_nums_in_current_timestamp_left) /// if sequence numbers in current timestamp is not enough for rows => update timestamp @@ -125,22 +120,22 @@ RangeOfSnowflakeIDs getRangeOfAvailableIDs(const SnowflakeComponents& available, struct GlobalCounterPolicy { static constexpr auto name = "generateSnowflakeID"; - static constexpr auto doc_description = R"(Generates a Snowflake ID. The generated Snowflake ID contains the current Unix timestamp in milliseconds 41 (+ 1 top zero bit) bits, followed by machine id (10 bits), a counter (12 bits) to distinguish IDs within a millisecond. For any given timestamp (unix_ts_ms), the counter starts at 0 and is incremented by 1 for each new Snowflake ID until the timestamp changes. In case the counter overflows, the timestamp field is incremented by 1 and the counter is reset to 0. Function generateSnowflakeID guarantees that the counter field within a timestamp increments monotonically across all function invocations in concurrently running threads and queries.)"; + static constexpr auto description = R"(Generates a Snowflake ID. 
The generated Snowflake ID contains the current Unix timestamp in milliseconds 41 (+ 1 top zero bit) bits, followed by machine id (10 bits), a counter (12 bits) to distinguish IDs within a millisecond. For any given timestamp (unix_ts_ms), the counter starts at 0 and is incremented by 1 for each new Snowflake ID until the timestamp changes. In case the counter overflows, the timestamp field is incremented by 1 and the counter is reset to 0. Function generateSnowflakeID guarantees that the counter field within a timestamp increments monotonically across all function invocations in concurrently running threads and queries.)"; /// Guarantee counter monotonicity within one timestamp across all threads generating Snowflake IDs simultaneously. struct Data { static inline std::atomic lowest_available_snowflake_id = 0; - SnowflakeComponents reserveRange(size_t input_rows_count) + SnowflakeId reserveRange(size_t input_rows_count) { uint64_t available_snowflake_id = lowest_available_snowflake_id.load(); - RangeOfSnowflakeIDs range; + SnowflakeIdRange range; do { - range = getRangeOfAvailableIDs(toComponents(available_snowflake_id), input_rows_count); + range = getRangeOfAvailableIds(toSnowflakeId(available_snowflake_id), input_rows_count); } - while (!lowest_available_snowflake_id.compare_exchange_weak(available_snowflake_id, toSnowflakeID(range.end))); + while (!lowest_available_snowflake_id.compare_exchange_weak(available_snowflake_id, fromSnowflakeId(range.end))); /// if `compare_exhange` failed => another thread updated `lowest_available_snowflake_id` and we should try again /// completed => range of IDs [begin, end) is reserved, can return the beginning of the range @@ -152,17 +147,17 @@ struct GlobalCounterPolicy struct ThreadLocalCounterPolicy { static constexpr auto name = "generateSnowflakeIDThreadMonotonic"; - static constexpr auto doc_description = R"(Generates a Snowflake ID. The generated Snowflake ID contains the current Unix timestamp in milliseconds 41 (+ 1 top zero bit) bits, followed by machine id (10 bits), a counter (12 bits) to distinguish IDs within a millisecond. For any given timestamp (unix_ts_ms), the counter starts at 0 and is incremented by 1 for each new Snowflake ID until the timestamp changes. In case the counter overflows, the timestamp field is incremented by 1 and the counter is reset to 0. This function behaves like generateSnowflakeID but gives no guarantee on counter monotony across different simultaneous requests. Monotonicity within one timestamp is guaranteed only within the same thread calling this function to generate Snowflake IDs.)"; + static constexpr auto description = R"(Generates a Snowflake ID. The generated Snowflake ID contains the current Unix timestamp in milliseconds 41 (+ 1 top zero bit) bits, followed by machine id (10 bits), a counter (12 bits) to distinguish IDs within a millisecond. For any given timestamp (unix_ts_ms), the counter starts at 0 and is incremented by 1 for each new Snowflake ID until the timestamp changes. In case the counter overflows, the timestamp field is incremented by 1 and the counter is reset to 0. This function behaves like generateSnowflakeID but gives no guarantee on counter monotony across different simultaneous requests. Monotonicity within one timestamp is guaranteed only within the same thread calling this function to generate Snowflake IDs.)"; /// Guarantee counter monotonicity within one timestamp within the same thread. Faster than GlobalCounterPolicy if a query uses multiple threads. 
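+    /// Note: lowest_available_snowflake_id below is thread_local, so each thread reserves
+    /// ID ranges independently, without the atomic compare-exchange used by GlobalCounterPolicy.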
struct Data { static inline thread_local uint64_t lowest_available_snowflake_id = 0; - SnowflakeComponents reserveRange(size_t input_rows_count) + SnowflakeId reserveRange(size_t input_rows_count) { - RangeOfSnowflakeIDs range = getRangeOfAvailableIDs(toComponents(lowest_available_snowflake_id), input_rows_count); - lowest_available_snowflake_id = toSnowflakeID(range.end); + SnowflakeIdRange range = getRangeOfAvailableIds(toSnowflakeId(lowest_available_snowflake_id), input_rows_count); + lowest_available_snowflake_id = fromSnowflakeId(range.end); return range.begin; } }; @@ -188,7 +183,7 @@ public: { FunctionArgumentDescriptors mandatory_args; FunctionArgumentDescriptors optional_args{ - {"expr", nullptr, nullptr, "Arbitrary Expression"} + {"expr", nullptr, nullptr, "Arbitrary expression"} }; validateFunctionArgumentTypes(*this, arguments, mandatory_args, optional_args); @@ -200,17 +195,18 @@ public: auto col_res = ColumnVector::create(); typename ColumnVector::Container & vec_to = col_res->getData(); - vec_to.resize(input_rows_count); - if (input_rows_count != 0) { + vec_to.resize(input_rows_count); + typename FillPolicy::Data data; + /// get the begin of available snowflake ids range - SnowflakeComponents snowflake_id = data.reserveRange(input_rows_count); + SnowflakeId snowflake_id = data.reserveRange(input_rows_count); for (UInt64 & to_row : vec_to) { - to_row = toSnowflakeID(snowflake_id); + to_row = fromSnowflakeId(snowflake_id); if (snowflake_id.machine_seq_num++ == max_machine_seq_num) { snowflake_id.machine_seq_num = 0; @@ -225,20 +221,20 @@ public: }; template -void registerSnowflakeIDGenerator(auto& factory) +void registerSnowflakeIDGenerator(auto & factory) { static constexpr auto doc_syntax_format = "{}([expression])"; static constexpr auto example_format = "SELECT {}()"; static constexpr auto multiple_example_format = "SELECT {f}(1), {f}(2)"; - FunctionDocumentation::Description doc_description = FillPolicy::doc_description; - FunctionDocumentation::Syntax doc_syntax = fmt::format(doc_syntax_format, FillPolicy::name); - FunctionDocumentation::Arguments doc_arguments = {{"expression", "The expression is used to bypass common subexpression elimination if the function is called multiple times in a query but otherwise ignored. Optional."}}; - FunctionDocumentation::ReturnedValue doc_returned_value = "A value of type UInt64"; - FunctionDocumentation::Examples doc_examples = {{"uuid", fmt::format(example_format, FillPolicy::name), ""}, {"multiple", fmt::format(multiple_example_format, fmt::arg("f", FillPolicy::name)), ""}}; - FunctionDocumentation::Categories doc_categories = {"Snowflake ID"}; + FunctionDocumentation::Description description = FillPolicy::description; + FunctionDocumentation::Syntax syntax = fmt::format(doc_syntax_format, FillPolicy::name); + FunctionDocumentation::Arguments arguments = {{"expression", "The expression is used to bypass common subexpression elimination if the function is called multiple times in a query but otherwise ignored. 
Optional."}}; + FunctionDocumentation::ReturnedValue returned_value = "A value of type UInt64"; + FunctionDocumentation::Examples examples = {{"single", fmt::format(example_format, FillPolicy::name), ""}, {"multiple", fmt::format(multiple_example_format, fmt::arg("f", FillPolicy::name)), ""}}; + FunctionDocumentation::Categories categories = {"Snowflake ID"}; - factory.template registerFunction>({doc_description, doc_syntax, doc_arguments, doc_returned_value, doc_examples, doc_categories}, FunctionFactory::CaseInsensitive); + factory.template registerFunction>({description, syntax, arguments, returned_value, examples, categories}, FunctionFactory::CaseInsensitive); } REGISTER_FUNCTION(GenerateSnowflakeID) diff --git a/src/Functions/generateUUIDv7.cpp b/src/Functions/generateUUIDv7.cpp index 411a3a076ac..f2a82431c0a 100644 --- a/src/Functions/generateUUIDv7.cpp +++ b/src/Functions/generateUUIDv7.cpp @@ -76,7 +76,7 @@ void setVariant(UUID & uuid) struct FillAllRandomPolicy { static constexpr auto name = "generateUUIDv7NonMonotonic"; - static constexpr auto doc_description = R"(Generates a UUID of version 7. The generated UUID contains the current Unix timestamp in milliseconds (48 bits), followed by version "7" (4 bits), and a random field (74 bit, including a 2-bit variant field "2") to distinguish UUIDs within a millisecond. This function is the fastest generateUUIDv7* function but it gives no monotonicity guarantees within a timestamp.)"; + static constexpr auto description = R"(Generates a UUID of version 7. The generated UUID contains the current Unix timestamp in milliseconds (48 bits), followed by version "7" (4 bits), and a random field (74 bit, including a 2-bit variant field "2") to distinguish UUIDs within a millisecond. This function is the fastest generateUUIDv7* function but it gives no monotonicity guarantees within a timestamp.)"; struct Data { void generate(UUID & uuid, uint64_t ts) @@ -136,7 +136,7 @@ struct CounterFields struct GlobalCounterPolicy { static constexpr auto name = "generateUUIDv7"; - static constexpr auto doc_description = R"(Generates a UUID of version 7. The generated UUID contains the current Unix timestamp in milliseconds (48 bits), followed by version "7" (4 bits), a counter (42 bit, including a variant field "2", 2 bit) to distinguish UUIDs within a millisecond, and a random field (32 bits). For any given timestamp (unix_ts_ms), the counter starts at a random value and is incremented by 1 for each new UUID until the timestamp changes. In case the counter overflows, the timestamp field is incremented by 1 and the counter is reset to a random new start value. Function generateUUIDv7 guarantees that the counter field within a timestamp increments monotonically across all function invocations in concurrently running threads and queries.)"; + static constexpr auto description = R"(Generates a UUID of version 7. The generated UUID contains the current Unix timestamp in milliseconds (48 bits), followed by version "7" (4 bits), a counter (42 bit, including a variant field "2", 2 bit) to distinguish UUIDs within a millisecond, and a random field (32 bits). For any given timestamp (unix_ts_ms), the counter starts at a random value and is incremented by 1 for each new UUID until the timestamp changes. In case the counter overflows, the timestamp field is incremented by 1 and the counter is reset to a random new start value. 
Function generateUUIDv7 guarantees that the counter field within a timestamp increments monotonically across all function invocations in concurrently running threads and queries.)"; /// Guarantee counter monotonicity within one timestamp across all threads generating UUIDv7 simultaneously. struct Data @@ -159,7 +159,7 @@ struct GlobalCounterPolicy struct ThreadLocalCounterPolicy { static constexpr auto name = "generateUUIDv7ThreadMonotonic"; - static constexpr auto doc_description = R"(Generates a UUID of version 7. The generated UUID contains the current Unix timestamp in milliseconds (48 bits), followed by version "7" (4 bits), a counter (42 bit, including a variant field "2", 2 bit) to distinguish UUIDs within a millisecond, and a random field (32 bits). For any given timestamp (unix_ts_ms), the counter starts at a random value and is incremented by 1 for each new UUID until the timestamp changes. In case the counter overflows, the timestamp field is incremented by 1 and the counter is reset to a random new start value. This function behaves like generateUUIDv7 but gives no guarantee on counter monotony across different simultaneous requests. Monotonicity within one timestamp is guaranteed only within the same thread calling this function to generate UUIDs.)"; + static constexpr auto description = R"(Generates a UUID of version 7. The generated UUID contains the current Unix timestamp in milliseconds (48 bits), followed by version "7" (4 bits), a counter (42 bit, including a variant field "2", 2 bit) to distinguish UUIDs within a millisecond, and a random field (32 bits). For any given timestamp (unix_ts_ms), the counter starts at a random value and is incremented by 1 for each new UUID until the timestamp changes. In case the counter overflows, the timestamp field is incremented by 1 and the counter is reset to a random new start value. This function behaves like generateUUIDv7 but gives no guarantee on counter monotony across different simultaneous requests. Monotonicity within one timestamp is guaranteed only within the same thread calling this function to generate UUIDs.)"; /// Guarantee counter monotonicity within one timestamp within the same thread. Faster than GlobalCounterPolicy if a query uses multiple threads. 
struct Data @@ -186,7 +186,6 @@ class FunctionGenerateUUIDv7Base : public IFunction, public FillPolicy { public: String getName() const final { return FillPolicy::name; } - size_t getNumberOfArguments() const final { return 0; } bool isDeterministic() const override { return false; } bool isDeterministicInScopeOfQuery() const final { return false; } @@ -198,7 +197,7 @@ public: { FunctionArgumentDescriptors mandatory_args; FunctionArgumentDescriptors optional_args{ - {"expr", nullptr, nullptr, "Arbitrary Expression"} + {"expr", nullptr, nullptr, "Arbitrary expression"} }; validateFunctionArgumentTypes(*this, arguments, mandatory_args, optional_args); @@ -264,20 +263,20 @@ private: }; template -void registerUUIDv7Generator(auto& factory) +void registerUUIDv7Generator(auto & factory) { static constexpr auto doc_syntax_format = "{}([expression])"; static constexpr auto example_format = "SELECT {}()"; static constexpr auto multiple_example_format = "SELECT {f}(1), {f}(2)"; - FunctionDocumentation::Description doc_description = FillPolicy::doc_description; - FunctionDocumentation::Syntax doc_syntax = fmt::format(doc_syntax_format, FillPolicy::name); - FunctionDocumentation::Arguments doc_arguments = {{"expression", "The expression is used to bypass common subexpression elimination if the function is called multiple times in a query but otherwise ignored. Optional."}}; - FunctionDocumentation::ReturnedValue doc_returned_value = "A value of type UUID version 7."; - FunctionDocumentation::Examples doc_examples = {{"uuid", fmt::format(example_format, FillPolicy::name), ""}, {"multiple", fmt::format(multiple_example_format, fmt::arg("f", FillPolicy::name)), ""}}; - FunctionDocumentation::Categories doc_categories = {"UUID"}; + FunctionDocumentation::Description description = FillPolicy::description; + FunctionDocumentation::Syntax syntax = fmt::format(doc_syntax_format, FillPolicy::name); + FunctionDocumentation::Arguments arguments = {{"expression", "The expression is used to bypass common subexpression elimination if the function is called multiple times in a query but otherwise ignored. 
Optional."}}; + FunctionDocumentation::ReturnedValue returned_value = "A value of type UUID version 7."; + FunctionDocumentation::Examples examples = {{"single", fmt::format(example_format, FillPolicy::name), ""}, {"multiple", fmt::format(multiple_example_format, fmt::arg("f", FillPolicy::name)), ""}}; + FunctionDocumentation::Categories categories = {"UUID"}; - factory.template registerFunction>({doc_description, doc_syntax, doc_arguments, doc_returned_value, doc_examples, doc_categories}, FunctionFactory::CaseInsensitive); + factory.template registerFunction>({description, syntax, arguments, returned_value, examples, categories}, FunctionFactory::CaseInsensitive); } REGISTER_FUNCTION(GenerateUUIDv7) From 91c1456141f2783234d1a7fd6a749e9e0493c46e Mon Sep 17 00:00:00 2001 From: Eduard Karacharov Date: Wed, 22 May 2024 22:11:46 +0300 Subject: [PATCH 301/392] CNF with mutually exclusive atoms reduction fix --- src/Analyzer/Passes/ConvertQueryToCNFPass.cpp | 20 +++++- src/Interpreters/TreeCNFConverter.h | 21 +++++- .../WhereConstraintsOptimizer.cpp | 19 ++++- .../0_stateless/03161_cnf_reduction.reference | 23 ++++++ .../0_stateless/03161_cnf_reduction.sql | 72 +++++++++++++++++++ 5 files changed, 152 insertions(+), 3 deletions(-) create mode 100644 tests/queries/0_stateless/03161_cnf_reduction.reference create mode 100644 tests/queries/0_stateless/03161_cnf_reduction.sql diff --git a/src/Analyzer/Passes/ConvertQueryToCNFPass.cpp b/src/Analyzer/Passes/ConvertQueryToCNFPass.cpp index 96bc62212fd..5951e8fc5ea 100644 --- a/src/Analyzer/Passes/ConvertQueryToCNFPass.cpp +++ b/src/Analyzer/Passes/ConvertQueryToCNFPass.cpp @@ -99,6 +99,23 @@ bool checkIfGroupAlwaysTrueGraph(const Analyzer::CNF::OrGroup & group, const Com return false; } +bool checkIfGroupAlwaysTrueAtoms(const Analyzer::CNF::OrGroup & group) +{ + /// Filters out groups containing mutually exclusive atoms, + /// since these groups are always True + + for (const auto & atom : group) + { + auto negated(atom); + negated.negative = !atom.negative; + if (group.contains(negated)) + { + return true; + } + } + return false; +} + bool checkIfAtomAlwaysFalseFullMatch(const Analyzer::CNF::AtomicFormula & atom, const ConstraintsDescription::QueryTreeData & query_tree_constraints) { const auto constraint_atom_ids = query_tree_constraints.getAtomIds(atom.node_with_hash); @@ -644,7 +661,8 @@ void optimizeWithConstraints(Analyzer::CNF & cnf, const QueryTreeNodes & table_e cnf.filterAlwaysTrueGroups([&](const auto & group) { /// remove always true groups from CNF - return !checkIfGroupAlwaysTrueFullMatch(group, query_tree_constraints) && !checkIfGroupAlwaysTrueGraph(group, compare_graph); + return !checkIfGroupAlwaysTrueFullMatch(group, query_tree_constraints) + && !checkIfGroupAlwaysTrueGraph(group, compare_graph) && !checkIfGroupAlwaysTrueAtoms(group); }) .filterAlwaysFalseAtoms([&](const Analyzer::CNF::AtomicFormula & atom) { diff --git a/src/Interpreters/TreeCNFConverter.h b/src/Interpreters/TreeCNFConverter.h index 8258412f1a6..ae1551cd9c2 100644 --- a/src/Interpreters/TreeCNFConverter.h +++ b/src/Interpreters/TreeCNFConverter.h @@ -164,6 +164,12 @@ public: void pushNotIn(CNFQuery::AtomicFormula & atom); +/// Reduces CNF groups by removing mutually exclusive atoms +/// found across groups, in case other atoms are identical. +/// Might require multiple passes to complete reduction. 
+/// +/// Example: +/// (x OR y) AND (x OR !y) -> x template TAndGroup reduceOnceCNFStatements(const TAndGroup & groups) { @@ -175,10 +181,19 @@ TAndGroup reduceOnceCNFStatements(const TAndGroup & groups) bool inserted = false; for (const auto & atom : group) { - copy.erase(atom); using AtomType = std::decay_t; AtomType negative_atom(atom); negative_atom.negative = !atom.negative; + + // Sikpping erase-insert for mutually exclusive atoms within + // signle group, since it won't insert negative atom, which + // will break the logic of this rule + if (copy.contains(negative_atom)) + { + continue; + } + + copy.erase(atom); copy.insert(negative_atom); if (groups.contains(copy)) @@ -209,6 +224,10 @@ bool isCNFGroupSubset(const TOrGroup & left, const TOrGroup & right) return true; } +/// Removes CNF groups if subset group is found in CNF. +/// +/// Example: +/// (x OR y) AND (x) -> x template TAndGroup filterCNFSubsets(const TAndGroup & groups) { diff --git a/src/Interpreters/WhereConstraintsOptimizer.cpp b/src/Interpreters/WhereConstraintsOptimizer.cpp index 979a4f4dbf5..456cf76b987 100644 --- a/src/Interpreters/WhereConstraintsOptimizer.cpp +++ b/src/Interpreters/WhereConstraintsOptimizer.cpp @@ -91,6 +91,22 @@ bool checkIfGroupAlwaysTrueGraph(const CNFQuery::OrGroup & group, const Comparis return false; } +bool checkIfGroupAlwaysTrueAtoms(const CNFQuery::OrGroup & group) +{ + /// Filters out groups containing mutually exclusive atoms, + /// since these groups are always True + + for (const auto & atom : group) + { + auto negated(atom); + negated.negative = !atom.negative; + if (group.contains(negated)) + { + return true; + } + } + return false; +} bool checkIfAtomAlwaysFalseFullMatch(const CNFQuery::AtomicFormula & atom, const ConstraintsDescription & constraints_description) { @@ -158,7 +174,8 @@ void WhereConstraintsOptimizer::perform() .filterAlwaysTrueGroups([&compare_graph, this](const auto & group) { /// remove always true groups from CNF - return !checkIfGroupAlwaysTrueFullMatch(group, metadata_snapshot->getConstraints()) && !checkIfGroupAlwaysTrueGraph(group, compare_graph); + return !checkIfGroupAlwaysTrueFullMatch(group, metadata_snapshot->getConstraints()) + && !checkIfGroupAlwaysTrueGraph(group, compare_graph) && !checkIfGroupAlwaysTrueAtoms(group); }) .filterAlwaysFalseAtoms([&compare_graph, this](const auto & atom) { diff --git a/tests/queries/0_stateless/03161_cnf_reduction.reference b/tests/queries/0_stateless/03161_cnf_reduction.reference new file mode 100644 index 00000000000..5e39c0f3223 --- /dev/null +++ b/tests/queries/0_stateless/03161_cnf_reduction.reference @@ -0,0 +1,23 @@ +-- Expected plan with analyzer: +SELECT id +FROM `03161_table` +WHERE f +SETTINGS convert_query_to_cnf = 1, optimize_using_constraints = 1, allow_experimental_analyzer = 1 + +-- Expected result with analyzer: +1 + +-- Expected plan w/o analyzer: +SELECT id +FROM `03161_table` +WHERE f +SETTINGS convert_query_to_cnf = 1, optimize_using_constraints = 1, allow_experimental_analyzer = 0 + +-- Expected result w/o analyzer: +1 + +-- Reproducer from the issue with analyzer +2 + +-- Reproducer from the issue w/o analyzer +2 diff --git a/tests/queries/0_stateless/03161_cnf_reduction.sql b/tests/queries/0_stateless/03161_cnf_reduction.sql new file mode 100644 index 00000000000..b34e9171d45 --- /dev/null +++ b/tests/queries/0_stateless/03161_cnf_reduction.sql @@ -0,0 +1,72 @@ +DROP TABLE IF EXISTS 03161_table; + +CREATE TABLE 03161_table (id UInt32, f UInt8) ENGINE = Memory; + +INSERT INTO 03161_table VALUES 
(0, 0), (1, 1), (2, 0); + +SELECT '-- Expected plan with analyzer:'; + +EXPLAIN SYNTAX +SELECT id +FROM 03161_table +WHERE f AND (NOT(f) OR f) +SETTINGS convert_query_to_cnf = 1, optimize_using_constraints = 1, allow_experimental_analyzer = 1; + +SELECT ''; + +SELECT '-- Expected result with analyzer:'; + +SELECT id +FROM 03161_table +WHERE f AND (NOT(f) OR f) +SETTINGS convert_query_to_cnf = 1, optimize_using_constraints = 1, allow_experimental_analyzer = 1; + +SELECT ''; + +SELECT '-- Expected plan w/o analyzer:'; + +EXPLAIN SYNTAX +SELECT id +FROM 03161_table +WHERE f AND (NOT(f) OR f) +SETTINGS convert_query_to_cnf = 1, optimize_using_constraints = 1, allow_experimental_analyzer = 0; + +SELECT ''; + +SELECT '-- Expected result w/o analyzer:'; + +SELECT id +FROM 03161_table +WHERE f AND (NOT(f) OR f) +SETTINGS convert_query_to_cnf = 1, optimize_using_constraints = 1, allow_experimental_analyzer = 0; + +DROP TABLE IF EXISTS 03161_table; + +-- Checking reproducer from GitHub issue +-- https://github.com/ClickHouse/ClickHouse/issues/57400 + +DROP TABLE IF EXISTS 03161_reproducer; + +CREATE TABLE 03161_reproducer (c0 UInt8, c1 UInt8, c2 UInt8, c3 UInt8, c4 UInt8, c5 UInt8, c6 UInt8, c7 UInt8, c8 UInt8, c9 UInt8) ENGINE = Memory; + +INSERT INTO 03161_reproducer VALUES (0, 0, 0, 0, 0, 0, 0, 0, 0, 0), (0, 0, 0, 0, 0, 0, 0, 0, 0, 1), (0, 0, 0, 0, 0, 0, 0, 0, 1, 0), (0, 0, 0, 0, 0, 0, 0, 0, 1, 1), (0, 0, 0, 0, 0, 0, 0, 1, 0, 0), (0, 0, 0, 0, 0, 0, 0, 1, 0, 1), (0, 0, 0, 0, 0, 0, 0, 1, 1, 0), (0, 0, 0, 0, 0, 0, 0, 1, 1, 1); + +SELECT ''; + +SELECT '-- Reproducer from the issue with analyzer'; + +SELECT count() +FROM 03161_reproducer +WHERE ((NOT c2) AND c2 AND (NOT c1)) OR ((NOT c2) AND c3 AND (NOT c5)) OR ((NOT c7) AND (NOT c8)) OR (c9 AND c6 AND c8 AND (NOT c8) AND (NOT c7)) +SETTINGS convert_query_to_cnf = 1, optimize_using_constraints = 1, allow_experimental_analyzer = 1; + +SELECT ''; + +SELECT '-- Reproducer from the issue w/o analyzer'; + +SELECT count() +FROM 03161_reproducer +WHERE ((NOT c2) AND c2 AND (NOT c1)) OR ((NOT c2) AND c3 AND (NOT c5)) OR ((NOT c7) AND (NOT c8)) OR (c9 AND c6 AND c8 AND (NOT c8) AND (NOT c7)) +SETTINGS convert_query_to_cnf = 1, optimize_using_constraints = 1, allow_experimental_analyzer = 0; + +DROP TABLE IF EXISTS 03161_reproducer; From c7aa283b7a418f6372e67b386342815629e26f39 Mon Sep 17 00:00:00 2001 From: Eduard Karacharov <13005055+korowa@users.noreply.github.com> Date: Thu, 23 May 2024 14:20:15 +0300 Subject: [PATCH 302/392] Update src/Interpreters/TreeCNFConverter.h Co-authored-by: Antonio Andelic --- src/Interpreters/TreeCNFConverter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Interpreters/TreeCNFConverter.h b/src/Interpreters/TreeCNFConverter.h index ae1551cd9c2..ec4b029eee9 100644 --- a/src/Interpreters/TreeCNFConverter.h +++ b/src/Interpreters/TreeCNFConverter.h @@ -186,7 +186,7 @@ TAndGroup reduceOnceCNFStatements(const TAndGroup & groups) negative_atom.negative = !atom.negative; // Sikpping erase-insert for mutually exclusive atoms within - // signle group, since it won't insert negative atom, which + // single group, since it won't insert negative atom, which // will break the logic of this rule if (copy.contains(negative_atom)) { From 2315991504b1e95d7bb2594e54e3c6f749897d79 Mon Sep 17 00:00:00 2001 From: Alexander Gololobov Date: Thu, 23 May 2024 18:41:14 +0200 Subject: [PATCH 303/392] Build fix --- src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff 
--git a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp index df8fb6f6656..fb0f0ba9154 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp @@ -67,12 +67,11 @@ void MergeTreeDataPartWriterCompact::initDynamicStreamsIfNeeded(const Block & bl return; is_dynamic_streams_initialized = true; - auto storage_snapshot = std::make_shared(data_part->storage, metadata_snapshot); for (const auto & column : columns_list) { if (column.type->hasDynamicSubcolumns()) { - auto compression = storage_snapshot->getCodecDescOrDefault(column.name, default_codec); + auto compression = getCodecDescOrDefault(column.name, default_codec); addStreams(column, block.getByName(column.name).column, compression); } } From 8d697123dac574e727101d241e4d16eae2bce8da Mon Sep 17 00:00:00 2001 From: Max K Date: Thu, 23 May 2024 16:36:24 +0200 Subject: [PATCH 304/392] CI: Cancel sync wf on new push --- .github/workflows/pull_request.yml | 3 +++ tests/ci/ci.py | 37 +++++++++++++++++++-------- tests/ci/ci_metadata.py | 41 +++++++++++++++++++++++++++--- tests/ci/env_helper.py | 1 + 4 files changed, 68 insertions(+), 14 deletions(-) diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index f20e987db97..48b4a558580 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -33,6 +33,9 @@ jobs: clear-repository: true # to ensure correct digests fetch-depth: 0 # to get version filter: tree:0 + - name: Cancel Sync PR workflow + run: | + python3 "$GITHUB_WORKSPACE/tests/ci/ci.py" --cancel-previous-run - name: Labels check run: | cd "$GITHUB_WORKSPACE/tests/ci" diff --git a/tests/ci/ci.py b/tests/ci/ci.py index 99555b06bbf..68db08fbe96 100644 --- a/tests/ci/ci.py +++ b/tests/ci/ci.py @@ -1908,13 +1908,26 @@ def _get_ext_check_name(check_name: str) -> str: return check_name_with_group -def _cancel_pr_wf(s3: S3Helper, pr_number: int) -> None: - run_id = CiMetadata(s3, pr_number).fetch_meta().run_id - if not run_id: - print(f"ERROR: FIX IT: Run id has not been found PR [{pr_number}]!") +def _cancel_pr_wf(s3: S3Helper, pr_number: int, cancel_sync: bool = False) -> None: + wf_data = CiMetadata(s3, pr_number).fetch_meta() + if not cancel_sync: + if not wf_data.run_id: + print(f"ERROR: FIX IT: Run id has not been found PR [{pr_number}]!") + else: + print( + f"Canceling PR workflow run_id: [{wf_data.run_id}], pr: [{pr_number}]" + ) + GitHub.cancel_wf(GITHUB_REPOSITORY, get_best_robot_token(), wf_data.run_id) else: - print(f"Canceling PR workflow run_id: [{run_id}], pr: [{pr_number}]") - GitHub.cancel_wf(GITHUB_REPOSITORY, get_best_robot_token(), run_id) + if not wf_data.sync_pr_run_id: + print("WARNING: Sync PR run id has not been found") + else: + print(f"Canceling sync PR workflow run_id: [{wf_data.sync_pr_run_id}]") + GitHub.cancel_wf( + "ClickHouse/clickhouse-private", + get_best_robot_token(), + wf_data.sync_pr_run_id, + ) def main() -> int: @@ -1947,7 +1960,7 @@ def main() -> int: if args.configure: if CI and pr_info.is_pr: # store meta on s3 (now we need it only for PRs) - meta = CiMetadata(s3, pr_info.number) + meta = CiMetadata(s3, pr_info.number, pr_info.head_ref) meta.run_id = int(GITHUB_RUN_ID) meta.push_meta() @@ -2245,10 +2258,12 @@ def main() -> int: ### CANCEL PREVIOUS WORKFLOW RUN elif args.cancel_previous_run: - assert ( - pr_info.is_merge_queue - ), "Currently it's supposed to be used in MQ wf to cancel running PR 
wf if any" - _cancel_pr_wf(s3, pr_info.merged_pr) + if pr_info.is_merge_queue: + _cancel_pr_wf(s3, pr_info.merged_pr) + elif pr_info.is_pr: + _cancel_pr_wf(s3, pr_info.number, cancel_sync=True) + else: + assert False, "BUG! Not supported scenario" ### print results _print_results(result, args.outfile, args.pretty) diff --git a/tests/ci/ci_metadata.py b/tests/ci/ci_metadata.py index 82d44cf1adc..a767d102811 100644 --- a/tests/ci/ci_metadata.py +++ b/tests/ci/ci_metadata.py @@ -4,9 +4,13 @@ from typing import Optional from env_helper import ( S3_BUILDS_BUCKET, TEMP_PATH, + GITHUB_UPSTREAM_REPOSITORY, + GITHUB_REPOSITORY, + S3_BUILDS_BUCKET_PUBLIC, ) from s3_helper import S3Helper from ci_utils import GHActions +from synchronizer_utils import SYNC_BRANCH_PREFIX # pylint: disable=too-many-lines @@ -22,13 +26,14 @@ class CiMetadata: _LOCAL_PATH = Path(TEMP_PATH) / "ci_meta" _FILE_SUFFIX = ".cimd" _FILENAME_RUN_ID = "run_id" + _FILE_SUFFIX + _FILENAME_SYNC_PR_RUN_ID = "sync_pr_run_id" + _FILE_SUFFIX def __init__( self, s3: S3Helper, pr_number: Optional[int] = None, - sha: Optional[str] = None, git_ref: Optional[str] = None, + sha: Optional[str] = None, ): assert pr_number or (sha and git_ref) @@ -37,12 +42,25 @@ class CiMetadata: self.git_ref = git_ref self.s3 = s3 self.run_id = 0 + self.upstream_pr_number = 0 + self.sync_pr_run_id = 0 if self.pr_number: self.s3_path = f"{self._S3_PREFIX}/PRs/{self.pr_number}/" else: self.s3_path = f"{self._S3_PREFIX}/{self.git_ref}/{self.sha}/" + # Process upstream StatusNames.SYNC: + # metadata path for upstream pr + self.s3_path_upstream = "" + if ( + self.git_ref + and self.git_ref.startswith(f"{SYNC_BRANCH_PREFIX}/pr/") + and GITHUB_REPOSITORY != GITHUB_UPSTREAM_REPOSITORY + ): + self.upstream_pr_number = int(self.git_ref.split("/pr/", maxsplit=1)[1]) + self.s3_path_upstream = f"{self._S3_PREFIX}/PRs/{self.upstream_pr_number}/" + self._updated = False if not self._LOCAL_PATH.exists(): @@ -73,6 +91,8 @@ class CiMetadata: assert len(lines) == 1 if file_name.name == self._FILENAME_RUN_ID: self.run_id = int(lines[0]) + elif file_name.name == self._FILENAME_SYNC_PR_RUN_ID: + self.sync_pr_run_id = int(lines[0]) self._updated = True return self @@ -84,8 +104,15 @@ class CiMetadata: Uploads meta on s3 """ assert self.run_id + assert self.git_ref, "Push meta only with full info" + + if not self.upstream_pr_number: + log_title = f"Storing workflow metadata: PR [{self.pr_number}]" + else: + log_title = f"Storing workflow metadata: PR [{self.pr_number}], upstream PR [{self.upstream_pr_number}]" + GHActions.print_in_group( - f"Storing workflow metadata: PR [{self.pr_number}]", + log_title, [f"run_id: {self.run_id}"], ) @@ -96,9 +123,17 @@ class CiMetadata: _ = self.s3.upload_file( bucket=S3_BUILDS_BUCKET, file_path=local_file, - s3_path=self.s3_path + local_file.name, + s3_path=self.s3_path + self._FILENAME_RUN_ID, ) + if self.upstream_pr_number: + # store run id in upstream pr meta as well + _ = self.s3.upload_file( + bucket=S3_BUILDS_BUCKET_PUBLIC, + file_path=local_file, + s3_path=self.s3_path_upstream + self._FILENAME_SYNC_PR_RUN_ID, + ) + if __name__ == "__main__": # TEST: diff --git a/tests/ci/env_helper.py b/tests/ci/env_helper.py index 9b9652d5bd3..64614ffa611 100644 --- a/tests/ci/env_helper.py +++ b/tests/ci/env_helper.py @@ -31,6 +31,7 @@ IMAGES_PATH = os.getenv("IMAGES_PATH", TEMP_PATH) REPO_COPY = os.getenv("REPO_COPY", GITHUB_WORKSPACE) RUNNER_TEMP = os.getenv("RUNNER_TEMP", p.abspath(p.join(module_dir, "./tmp"))) S3_BUILDS_BUCKET = 
os.getenv("S3_BUILDS_BUCKET", "clickhouse-builds") +S3_BUILDS_BUCKET_PUBLIC = "clickhouse-builds" S3_TEST_REPORTS_BUCKET = os.getenv("S3_TEST_REPORTS_BUCKET", "clickhouse-test-reports") S3_URL = os.getenv("S3_URL", "https://s3.amazonaws.com") S3_DOWNLOAD = os.getenv("S3_DOWNLOAD", S3_URL) From 741e0aedab78a009840f6346e582c905bb80be17 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Thu, 23 May 2024 16:53:11 +0000 Subject: [PATCH 305/392] Remove commented code. --- src/Analyzer/Passes/QueryAnalysisPass.cpp | 57 +---------------------- 1 file changed, 2 insertions(+), 55 deletions(-) diff --git a/src/Analyzer/Passes/QueryAnalysisPass.cpp b/src/Analyzer/Passes/QueryAnalysisPass.cpp index 3ccecac951d..2d34f1024d5 100644 --- a/src/Analyzer/Passes/QueryAnalysisPass.cpp +++ b/src/Analyzer/Passes/QueryAnalysisPass.cpp @@ -471,6 +471,7 @@ struct TableExpressionData return buffer.str(); } }; + class ExpressionsStack { public: @@ -2857,22 +2858,7 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromExpressionArguments(cons bool QueryAnalyzer::tryBindIdentifierToAliases(const IdentifierLookup & identifier_lookup, const IdentifierResolveScope & scope) { - //const auto & identifier_bind_part = identifier_lookup.identifier.front(); return scope.aliases.find(identifier_lookup, ScopeAliases::FindOption::FIRST_NAME) != nullptr; - - // auto get_alias_name_to_node_map = [&]() -> const std::unordered_map & - // { - // if (identifier_lookup.isExpressionLookup()) - // return *scope.alias_name_to_expression_node; - // else if (identifier_lookup.isFunctionLookup()) - // return scope.alias_name_to_lambda_node; - - // return scope.alias_name_to_table_expression_node; - // }; - - // const auto & alias_name_to_node_map = get_alias_name_to_node_map(); - - // return alias_name_to_node_map.contains(identifier_bind_part); } /** Resolve identifier from scope aliases. @@ -2922,23 +2908,7 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromAliases(const Identifier { const auto & identifier_bind_part = identifier_lookup.identifier.front(); - // auto get_alias_name_to_node_map = [&]() -> std::unordered_map & - // { - // if (identifier_lookup.isExpressionLookup()) - // return *scope.alias_name_to_expression_node; - // else if (identifier_lookup.isFunctionLookup()) - // return scope.alias_name_to_lambda_node; - - // return scope.alias_name_to_table_expression_node; - // }; - - // auto & alias_name_to_node_map = get_alias_name_to_node_map(); - // auto it = alias_name_to_node_map.find(identifier_bind_part); - - // if (it == alias_name_to_node_map.end()) - // return {}; - - auto it = scope.aliases.find(identifier_lookup, ScopeAliases::FindOption::FIRST_NAME); + auto * it = scope.aliases.find(identifier_lookup, ScopeAliases::FindOption::FIRST_NAME); if (it == nullptr) return {}; @@ -2988,20 +2958,6 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromAliases(const Identifier } alias_node = lookup_result.resolved_identifier; - - /** During collection of aliases if node is identifier and has alias, we cannot say if it is - * column or function node. Check QueryExpressionsAliasVisitor documentation for clarification. - * - * If we resolved identifier node as expression, we must remove identifier node alias from - * function alias map. - * If we resolved identifier node as function, we must remove identifier node alias from - * expression alias map. 
- */ - // if (identifier_lookup.isExpressionLookup()) - // scope.alises.alias_name_to_lambda_node.erase(identifier_bind_part); - // else if (identifier_lookup.isFunctionLookup()) - // scope.aliases.alias_name_to_expression_node->erase(identifier_bind_part); - scope.popExpressionNode(); } else if (node_type == QueryTreeNodeType::FUNCTION) @@ -4199,7 +4155,6 @@ IdentifierResolveResult QueryAnalyzer::tryResolveIdentifier(const IdentifierLook */ auto * alias_it = scope.aliases.find(identifier_lookup, ScopeAliases::FindOption::FULL_NAME); - //auto alias_it = scope.alias_name_to_expression_node->find(identifier_lookup.identifier.getFullName()); if (alias_it && (*alias_it)->getNodeType() == QueryTreeNodeType::COLUMN) { const auto & column_node = (*alias_it)->as(); @@ -6395,17 +6350,9 @@ ProjectionNames QueryAnalyzer::resolveExpressionNode(QueryTreeNodePtr & node, Id result_projection_names.push_back(projection_name_it->second); } - // if (resolved_identifier_node && !node_alias.empty()) - // scope.alias_name_to_lambda_node.erase(node_alias); - if (!resolved_identifier_node && allow_lambda_expression) - { resolved_identifier_node = tryResolveIdentifier({unresolved_identifier, IdentifierLookupContext::FUNCTION}, scope).resolved_identifier; - // if (resolved_identifier_node && !node_alias.empty()) - // scope.alias_name_to_expression_node->erase(node_alias); - } - if (!resolved_identifier_node && allow_table_expression) { resolved_identifier_node = tryResolveIdentifier({unresolved_identifier, IdentifierLookupContext::TABLE_EXPRESSION}, scope).resolved_identifier; From dab090e629afd3730457599d84e147bb512a1e81 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 23 May 2024 17:14:06 +0000 Subject: [PATCH 306/392] Cosmetics, pt. VII (includes a move of all snowflake-related functions in one document) --- .../functions/type-conversion-functions.md | 140 ---------------- .../sql-reference/functions/uuid-functions.md | 155 +++++++++++++++++- 2 files changed, 149 insertions(+), 146 deletions(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index ea08ffa50e7..bab92ff1e67 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -1979,143 +1979,3 @@ Result: │ 2,"good" │ └───────────────────────────────────────────┘ ``` - -## snowflakeToDateTime - -Extracts the timestamp component of a [Snowflake ID](https://en.wikipedia.org/wiki/Snowflake_ID) in [DateTime](/docs/en/sql-reference/data-types/datetime.md) format. - -**Syntax** - -``` sql -snowflakeToDateTime(value[, time_zone]) -``` - -**Arguments** - -- `value` — Snowflake ID. [Int64](/docs/en/sql-reference/data-types/int-uint.md). -- `time_zone` — [Timezone](/docs/en/operations/server-configuration-parameters/settings.md/#server_configuration_parameters-timezone). The function parses `time_string` according to the timezone. Optional. [String](/docs/en/sql-reference/data-types/string.md). - -**Returned value** - -- The timestamp component of `value` as a [DateTime](/docs/en/sql-reference/data-types/datetime.md) value. 
- -**Example** - -Query: - -``` sql -SELECT snowflakeToDateTime(CAST('1426860702823350272', 'Int64'), 'UTC'); -``` - -Result: - -```response - -┌─snowflakeToDateTime(CAST('1426860702823350272', 'Int64'), 'UTC')─┐ -│ 2021-08-15 10:57:56 │ -└──────────────────────────────────────────────────────────────────┘ -``` - -## snowflakeToDateTime64 - -Extracts the timestamp component of a [Snowflake ID](https://en.wikipedia.org/wiki/Snowflake_ID) in [DateTime64](/docs/en/sql-reference/data-types/datetime64.md) format. - -**Syntax** - -``` sql -snowflakeToDateTime64(value[, time_zone]) -``` - -**Arguments** - -- `value` — Snowflake ID. [Int64](/docs/en/sql-reference/data-types/int-uint.md). -- `time_zone` — [Timezone](/docs/en/operations/server-configuration-parameters/settings.md/#server_configuration_parameters-timezone). The function parses `time_string` according to the timezone. Optional. [String](/docs/en/sql-reference/data-types/string.md). - -**Returned value** - -- The timestamp component of `value` as a [DateTime64](/docs/en/sql-reference/data-types/datetime64.md) with scale = 3, i.e. millisecond precision. - -**Example** - -Query: - -``` sql -SELECT snowflakeToDateTime64(CAST('1426860802823350272', 'Int64'), 'UTC'); -``` - -Result: - -```response - -┌─snowflakeToDateTime64(CAST('1426860802823350272', 'Int64'), 'UTC')─┐ -│ 2021-08-15 10:58:19.841 │ -└────────────────────────────────────────────────────────────────────┘ -``` - -## dateTimeToSnowflake - -Converts a [DateTime](/docs/en/sql-reference/data-types/datetime.md) value to the first [Snowflake ID](https://en.wikipedia.org/wiki/Snowflake_ID) at the giving time. - -**Syntax** - -``` sql -dateTimeToSnowflake(value) -``` - -**Arguments** - -- `value` — Date with time. [DateTime](/docs/en/sql-reference/data-types/datetime.md). - -**Returned value** - -- Input value converted to the [Int64](/docs/en/sql-reference/data-types/int-uint.md) data type as the first Snowflake ID at that time. - -**Example** - -Query: - -``` sql -WITH toDateTime('2021-08-15 18:57:56', 'Asia/Shanghai') AS dt SELECT dateTimeToSnowflake(dt); -``` - -Result: - -```response -┌─dateTimeToSnowflake(dt)─┐ -│ 1426860702823350272 │ -└─────────────────────────┘ -``` - -## dateTime64ToSnowflake - -Convert a [DateTime64](/docs/en/sql-reference/data-types/datetime64.md) to the first [Snowflake ID](https://en.wikipedia.org/wiki/Snowflake_ID) at the giving time. - -**Syntax** - -``` sql -dateTime64ToSnowflake(value) -``` - -**Arguments** - -- `value` — Date with time. [DateTime64](/docs/en/sql-reference/data-types/datetime64.md). - -**Returned value** - -- Input value converted to the [Int64](/docs/en/sql-reference/data-types/int-uint.md) data type as the first Snowflake ID at that time. - -**Example** - -Query: - -``` sql -WITH toDateTime64('2021-08-15 18:57:56.492', 3, 'Asia/Shanghai') AS dt64 SELECT dateTime64ToSnowflake(dt64); -``` - -Result: - -```response -┌─dateTime64ToSnowflake(dt64)─┐ -│ 1426860704886947840 │ -└─────────────────────────────┘ -``` diff --git a/docs/en/sql-reference/functions/uuid-functions.md b/docs/en/sql-reference/functions/uuid-functions.md index 80d7215b9ef..7c264450ef0 100644 --- a/docs/en/sql-reference/functions/uuid-functions.md +++ b/docs/en/sql-reference/functions/uuid-functions.md @@ -674,7 +674,7 @@ Result: └──────────────────────────────────────────────────────────────────────────────────────┘ ``` -## serverUUID() +## serverUUID Returns the random UUID generated during the first start of the ClickHouse server. 
The UUID is stored in file `uuid` in the ClickHouse server directory (e.g. `/var/lib/clickhouse/`) and retained between server restarts. @@ -692,9 +692,9 @@ Type: [UUID](../data-types/uuid.md). ## generateSnowflakeID -Generates a [Snowflake ID](https://github.com/twitter-archive/snowflake/tree/b3f6a3c6ca8e1b6847baa6ff42bf72201e2c2231). +Generates a [Snowflake ID](https://en.wikipedia.org/wiki/Snowflake_ID). -Generates a Snowflake ID. The generated Snowflake ID contains the current Unix timestamp in milliseconds 41 (+ 1 top zero bit) bits, followed by machine id (10 bits), a counter (12 bits) to distinguish IDs within a millisecond. +The generated Snowflake ID contains the current Unix timestamp in milliseconds 41 (+ 1 top zero bit) bits, followed by machine id (10 bits), a counter (12 bits) to distinguish IDs within a millisecond. For any given timestamp (unix_ts_ms), the counter starts at 0 and is incremented by 1 for each new Snowflake ID until the timestamp changes. In case the counter overflows, the timestamp field is incremented by 1 and the counter is reset to 0. @@ -756,11 +756,14 @@ SELECT generateSnowflakeID(1), generateSnowflakeID(2); ## generateSnowflakeIDThreadMonotonic -Generates a [Snowflake ID](https://github.com/twitter-archive/snowflake/tree/b3f6a3c6ca8e1b6847baa6ff42bf72201e2c2231). +Generates a [Snowflake ID](https://en.wikipedia.org/wiki/Snowflake_ID). -Generates a Snowflake ID. The generated Snowflake ID contains the current Unix timestamp in milliseconds 41 (+ 1 top zero bit) bits, followed by machine id (10 bits), a counter (12 bits) to distinguish IDs within a millisecond. +The generated Snowflake ID contains the current Unix timestamp in milliseconds 41 (+ 1 top zero bit) bits, followed by machine id (10 bits), a counter (12 bits) to distinguish IDs within a millisecond. +For any given timestamp (unix_ts_ms), the counter starts at 0 and is incremented by 1 for each new Snowflake ID until the timestamp changes. +In case the counter overflows, the timestamp field is incremented by 1 and the counter is reset to 0. -This function behaves like `generateSnowflakeID` but gives no guarantee on counter monotony across different simultaneous requests. Monotonicity within one timestamp is guaranteed only within the same thread calling this function to generate Snowflake IDs. +This function behaves like `generateSnowflakeID` but gives no guarantee on counter monotony across different simultaneous requests. +Monotonicity within one timestamp is guaranteed only within the same thread calling this function to generate Snowflake IDs. ``` 0 1 2 3 @@ -816,6 +819,146 @@ SELECT generateSnowflakeIDThreadMonotonic(1), generateSnowflakeIDThreadMonotonic └───────────────────────────────────────┴───────────────────────────────────────┘ ``` +## snowflakeToDateTime + +Extracts the timestamp component of a [Snowflake ID](https://en.wikipedia.org/wiki/Snowflake_ID) in [DateTime](/docs/en/sql-reference/data-types/datetime.md) format. + +**Syntax** + +``` sql +snowflakeToDateTime(value[, time_zone]) +``` + +**Arguments** + +- `value` — Snowflake ID. [Int64](/docs/en/sql-reference/data-types/int-uint.md). +- `time_zone` — [Timezone](/docs/en/operations/server-configuration-parameters/settings.md/#server_configuration_parameters-timezone). The function parses `time_string` according to the timezone. Optional. [String](/docs/en/sql-reference/data-types/string.md). + +**Returned value** + +- The timestamp component of `value` as a [DateTime](/docs/en/sql-reference/data-types/datetime.md) value. 
+ +**Example** + +Query: + +``` sql +SELECT snowflakeToDateTime(CAST('1426860702823350272', 'Int64'), 'UTC'); +``` + +Result: + +```response + +┌─snowflakeToDateTime(CAST('1426860702823350272', 'Int64'), 'UTC')─┐ +│ 2021-08-15 10:57:56 │ +└──────────────────────────────────────────────────────────────────┘ +``` + +## snowflakeToDateTime64 + +Extracts the timestamp component of a [Snowflake ID](https://en.wikipedia.org/wiki/Snowflake_ID) in [DateTime64](/docs/en/sql-reference/data-types/datetime64.md) format. + +**Syntax** + +``` sql +snowflakeToDateTime64(value[, time_zone]) +``` + +**Arguments** + +- `value` — Snowflake ID. [Int64](/docs/en/sql-reference/data-types/int-uint.md). +- `time_zone` — [Timezone](/docs/en/operations/server-configuration-parameters/settings.md/#server_configuration_parameters-timezone). The function parses `time_string` according to the timezone. Optional. [String](/docs/en/sql-reference/data-types/string.md). + +**Returned value** + +- The timestamp component of `value` as a [DateTime64](/docs/en/sql-reference/data-types/datetime64.md) with scale = 3, i.e. millisecond precision. + +**Example** + +Query: + +``` sql +SELECT snowflakeToDateTime64(CAST('1426860802823350272', 'Int64'), 'UTC'); +``` + +Result: + +```response + +┌─snowflakeToDateTime64(CAST('1426860802823350272', 'Int64'), 'UTC')─┐ +│ 2021-08-15 10:58:19.841 │ +└────────────────────────────────────────────────────────────────────┘ +``` + +## dateTimeToSnowflake + +Converts a [DateTime](/docs/en/sql-reference/data-types/datetime.md) value to the first [Snowflake ID](https://en.wikipedia.org/wiki/Snowflake_ID) at the giving time. + +**Syntax** + +``` sql +dateTimeToSnowflake(value) +``` + +**Arguments** + +- `value` — Date with time. [DateTime](/docs/en/sql-reference/data-types/datetime.md). + +**Returned value** + +- Input value converted to the [Int64](/docs/en/sql-reference/data-types/int-uint.md) data type as the first Snowflake ID at that time. + +**Example** + +Query: + +``` sql +WITH toDateTime('2021-08-15 18:57:56', 'Asia/Shanghai') AS dt SELECT dateTimeToSnowflake(dt); +``` + +Result: + +```response +┌─dateTimeToSnowflake(dt)─┐ +│ 1426860702823350272 │ +└─────────────────────────┘ +``` + +## dateTime64ToSnowflake + +Convert a [DateTime64](/docs/en/sql-reference/data-types/datetime64.md) to the first [Snowflake ID](https://en.wikipedia.org/wiki/Snowflake_ID) at the giving time. + +**Syntax** + +``` sql +dateTime64ToSnowflake(value) +``` + +**Arguments** + +- `value` — Date with time. [DateTime64](/docs/en/sql-reference/data-types/datetime64.md). + +**Returned value** + +- Input value converted to the [Int64](/docs/en/sql-reference/data-types/int-uint.md) data type as the first Snowflake ID at that time. 
+ +**Example** + +Query: + +``` sql +WITH toDateTime64('2021-08-15 18:57:56.492', 3, 'Asia/Shanghai') AS dt64 SELECT dateTime64ToSnowflake(dt64); +``` + +Result: + +```response +┌─dateTime64ToSnowflake(dt64)─┐ +│ 1426860704886947840 │ +└─────────────────────────────┘ +``` + ## See also - [dictGetUUID](../../sql-reference/functions/ext-dict-functions.md#ext_dict_functions-other) From 5d82a94615ef8a9fb7c39787d0e2b191641cbcb8 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 23 May 2024 17:22:59 +0000 Subject: [PATCH 307/392] Revert generateSerialID --- src/Functions/generateSerialID.cpp | 167 ------------------ .../03129_serial_test_zookeeper.reference | 13 -- .../03129_serial_test_zookeeper.sql | 12 -- 3 files changed, 192 deletions(-) delete mode 100644 src/Functions/generateSerialID.cpp delete mode 100644 tests/queries/0_stateless/03129_serial_test_zookeeper.reference delete mode 100644 tests/queries/0_stateless/03129_serial_test_zookeeper.sql diff --git a/src/Functions/generateSerialID.cpp b/src/Functions/generateSerialID.cpp deleted file mode 100644 index db26d0d684b..00000000000 --- a/src/Functions/generateSerialID.cpp +++ /dev/null @@ -1,167 +0,0 @@ -#include "Common/Exception.h" -#include -#include -#include -#include -#include -#include - - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int KEEPER_EXCEPTION; -} - -constexpr auto function_node_name = "/serial_ids/"; -constexpr size_t MAX_SERIES_NUMBER = 1000; // ? - -class FunctionSerial : public IFunction -{ -private: - mutable zkutil::ZooKeeperPtr zk; - ContextPtr context; - -public: - static constexpr auto name = "generateSerialID"; - - explicit FunctionSerial(ContextPtr context_) : context(context_) - { - if (context->hasZooKeeper()) { - zk = context->getZooKeeper(); - } - } - - static FunctionPtr create(ContextPtr context) - { - return std::make_shared(std::move(context)); - } - - String getName() const override { return name; } - size_t getNumberOfArguments() const override { return 1; } - bool isStateful() const override { return true; } - bool isDeterministic() const override { return false; } - bool isDeterministicInScopeOfQuery() const override { return false; } - bool isSuitableForConstantFolding() const override { return false; } - bool useDefaultImplementationForNulls() const override { return false; } - bool useDefaultImplementationForNothing() const override { return false; } - bool canBeExecutedOnDefaultArguments() const override { return false; } - bool isInjective(const ColumnsWithTypeAndName & /*sample_columns*/) const override { return true; } - bool hasInformationAboutMonotonicity() const override { return true; } - bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } - - DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override - { - FunctionArgumentDescriptors mandatory_args{ - {"series identifier", static_cast(&isStringOrFixedString), nullptr, "String or FixedString"} - }; - validateFunctionArgumentTypes(*this, arguments, mandatory_args); - - return std::make_shared(); - } - - ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override - { - if (zk == nullptr) - throw Exception(ErrorCodes::KEEPER_EXCEPTION, - "ZooKeeper is not configured for function {}", - getName()); - if (zk->expired()) - zk = context->getZooKeeper(); - - // slow? 
- if (zk->exists(function_node_name) && zk->getChildren(function_node_name).size() == MAX_SERIES_NUMBER) { - throw Exception(ErrorCodes::KEEPER_EXCEPTION, - "At most {} serial nodes can be created", - MAX_SERIES_NUMBER); - } - - auto col_res = ColumnVector::create(); - typename ColumnVector::Container & vec_to = col_res->getData(); - - vec_to.resize(input_rows_count); - - const auto & serial_path = function_node_name + arguments[0].column->getDataAt(0).toString(); - - /// CAS in ZooKeeper - /// `get` value and version, `trySet` new with version check - /// I didn't get how to do it with `multi` - - Int64 counter; - std::string counter_path = serial_path + "/counter"; - - // if serial name used first time - zk->createAncestors(counter_path); - zk->createIfNotExists(counter_path, "1"); - - Coordination::Stat stat; - while (true) - { - const String counter_string = zk->get(counter_path, &stat); - counter = std::stoll(counter_string); - String updated_counter = std::to_string(counter + input_rows_count); - const Coordination::Error err = zk->trySet(counter_path, updated_counter); - if (err == Coordination::Error::ZOK) - { - // CAS is done - break; - } - if (err != Coordination::Error::ZBADVERSION) - { - throw Exception(ErrorCodes::KEEPER_EXCEPTION, - "ZooKeeper trySet operation failed with unexpected error = {} in function {}", - err, getName()); - } - } - - // Make a result - for (auto & val : vec_to) - { - val = counter; - ++counter; - } - - return col_res; - } - -}; - -REGISTER_FUNCTION(Serial) -{ - factory.registerFunction(FunctionDocumentation - { - .description=R"( -Generates and returns sequential numbers starting from the previous counter value. -This function takes a constant string argument - a series identifier. -The server should be configured with a ZooKeeper. 
-)", - .syntax = "generateSerialID(identifier)", - .arguments{ - {"series identifier", "Series identifier (String or FixedString)"} - }, - .returned_value = "Sequential numbers of type Int64 starting from the previous counter value", - .examples{ - {"first call", "SELECT generateSerialID('id1')", R"( -┌─generateSerialID('id1')──┐ -│ 1 │ -└──────────────────────────┘)"}, - {"second call", "SELECT generateSerialID('id1')", R"( -┌─generateSerialID('id1')──┐ -│ 2 │ -└──────────────────────────┘)"}, - {"column call", "SELECT *, generateSerialID('id1') FROM test_table", R"( -┌─CounterID─┬─UserID─┬─ver─┬─generateSerialID('id1')──┐ -│ 1 │ 3 │ 3 │ 3 │ -│ 1 │ 1 │ 1 │ 4 │ -│ 1 │ 2 │ 2 │ 5 │ -│ 1 │ 5 │ 5 │ 6 │ -│ 1 │ 4 │ 4 │ 7 │ -└───────────┴────────┴─────┴──────────────────────────┘ - )"}}, - .categories{"Unique identifiers"} - }); -} - -} diff --git a/tests/queries/0_stateless/03129_serial_test_zookeeper.reference b/tests/queries/0_stateless/03129_serial_test_zookeeper.reference deleted file mode 100644 index 479030db4be..00000000000 --- a/tests/queries/0_stateless/03129_serial_test_zookeeper.reference +++ /dev/null @@ -1,13 +0,0 @@ -1 -2 -1 -3 -4 -5 -6 -7 -1 1 -2 2 -3 3 -4 4 -5 5 diff --git a/tests/queries/0_stateless/03129_serial_test_zookeeper.sql b/tests/queries/0_stateless/03129_serial_test_zookeeper.sql deleted file mode 100644 index 2bd60656259..00000000000 --- a/tests/queries/0_stateless/03129_serial_test_zookeeper.sql +++ /dev/null @@ -1,12 +0,0 @@ --- Tags: zookeeper - -SELECT generateSerialID('x'); -SELECT generateSerialID('x'); -SELECT generateSerialID('y'); -SELECT generateSerialID('x') FROM numbers(5); - -SELECT generateSerialID(); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } -SELECT generateSerialID('x', 'y'); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } -SELECT generateSerialID(1); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } - -SELECT generateSerialID('z'), generateSerialID('z') FROM numbers(5); From 12f60a4969acda49422aef5d5d6fc431a71109f7 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 23 May 2024 18:00:53 +0000 Subject: [PATCH 308/392] Cosmetics, pt. 
VIII --- src/Functions/generateSnowflakeID.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/Functions/generateSnowflakeID.cpp b/src/Functions/generateSnowflakeID.cpp index 4e61bd9fb1c..617693f017c 100644 --- a/src/Functions/generateSnowflakeID.cpp +++ b/src/Functions/generateSnowflakeID.cpp @@ -42,6 +42,13 @@ constexpr uint64_t machine_seq_num_mask = (1ull << machine_seq_num_bits_count) - /// max values constexpr uint64_t max_machine_seq_num = machine_seq_num_mask; +uint64_t getTimestamp() +{ + auto now = std::chrono::system_clock::now(); + auto ticks_since_epoch = std::chrono::duration_cast(now.time_since_epoch()).count(); + return static_cast(ticks_since_epoch) & ((1ull << timestamp_bits_count) - 1); +} + uint64_t getMachineId() { UUID server_uuid = ServerUUID::get(); @@ -52,31 +59,24 @@ uint64_t getMachineId() return (((hi * 11) ^ (lo * 17)) & machine_id_mask) >> machine_seq_num_bits_count; } -uint64_t getTimestamp() -{ - auto now = std::chrono::system_clock::now(); - auto ticks_since_epoch = std::chrono::duration_cast(now.time_since_epoch()).count(); - return static_cast(ticks_since_epoch) & ((1ull << timestamp_bits_count) - 1); -} - struct SnowflakeId { uint64_t timestamp; - uint64_t machind_id; + uint64_t machine_id; uint64_t machine_seq_num; }; SnowflakeId toSnowflakeId(uint64_t snowflake) { return {.timestamp = (snowflake >> (machine_id_bits_count + machine_seq_num_bits_count)), - .machind_id = ((snowflake & machine_id_mask) >> machine_seq_num_bits_count), + .machine_id = ((snowflake & machine_id_mask) >> machine_seq_num_bits_count), .machine_seq_num = (snowflake & machine_seq_num_mask)}; } uint64_t fromSnowflakeId(SnowflakeId components) { return (components.timestamp << (machine_id_bits_count + machine_seq_num_bits_count) | - components.machind_id << (machine_seq_num_bits_count) | + components.machine_id << (machine_seq_num_bits_count) | components.machine_seq_num); } @@ -93,7 +93,7 @@ struct SnowflakeIdRange SnowflakeIdRange getRangeOfAvailableIds(const SnowflakeId & available, size_t input_rows_count) { /// 1. `now` - SnowflakeId begin = {.timestamp = getTimestamp(), .machind_id = getMachineId(), .machine_seq_num = 0}; + SnowflakeId begin = {.timestamp = getTimestamp(), .machine_id = getMachineId(), .machine_seq_num = 0}; /// 2. `begin` if (begin.timestamp <= available.timestamp) @@ -111,7 +111,7 @@ SnowflakeIdRange getRangeOfAvailableIds(const SnowflakeId & available, size_t in else end.timestamp = begin.timestamp; - end.machind_id = begin.machind_id; + end.machine_id = begin.machine_id; end.machine_seq_num = (begin.machine_seq_num + input_rows_count) & machine_seq_num_mask; return {begin, end}; From ae8ceaa35e0cb6804774881e05bccf07ab23aa19 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 23 May 2024 18:38:30 +0000 Subject: [PATCH 309/392] Cosmetics, pt. 
IX and cached machineId computation --- src/Functions/generateSnowflakeID.cpp | 25 +++++++++++++------ .../03130_generateSnowflakeId.reference | 4 +-- .../0_stateless/03130_generateSnowflakeId.sql | 14 ++++++----- 3 files changed, 27 insertions(+), 16 deletions(-) diff --git a/src/Functions/generateSnowflakeID.cpp b/src/Functions/generateSnowflakeID.cpp index 617693f017c..c3f7701a05a 100644 --- a/src/Functions/generateSnowflakeID.cpp +++ b/src/Functions/generateSnowflakeID.cpp @@ -49,7 +49,7 @@ uint64_t getTimestamp() return static_cast(ticks_since_epoch) & ((1ull << timestamp_bits_count) - 1); } -uint64_t getMachineId() +uint64_t getMachineIdImpl() { UUID server_uuid = ServerUUID::get(); /// hash into 64 bits @@ -59,6 +59,12 @@ uint64_t getMachineId() return (((hi * 11) ^ (lo * 17)) & machine_id_mask) >> machine_seq_num_bits_count; } +uint64_t getMachineId() +{ + static uint64_t machine_id = getMachineIdImpl(); + return machine_id; +} + struct SnowflakeId { uint64_t timestamp; @@ -106,7 +112,7 @@ SnowflakeIdRange getRangeOfAvailableIds(const SnowflakeId & available, size_t in SnowflakeId end; const uint64_t seq_nums_in_current_timestamp_left = (max_machine_seq_num - begin.machine_seq_num + 1); if (input_rows_count >= seq_nums_in_current_timestamp_left) - /// if sequence numbers in current timestamp is not enough for rows => update timestamp + /// if sequence numbers in current timestamp is not enough for rows --> depending on how many elements input_rows_count overflows, forward timestamp by at least 1 tick end.timestamp = begin.timestamp + 1 + (input_rows_count - seq_nums_in_current_timestamp_left) / (max_machine_seq_num + 1); else end.timestamp = begin.timestamp; @@ -136,8 +142,8 @@ struct GlobalCounterPolicy range = getRangeOfAvailableIds(toSnowflakeId(available_snowflake_id), input_rows_count); } while (!lowest_available_snowflake_id.compare_exchange_weak(available_snowflake_id, fromSnowflakeId(range.end))); - /// if `compare_exhange` failed => another thread updated `lowest_available_snowflake_id` and we should try again - /// completed => range of IDs [begin, end) is reserved, can return the beginning of the range + /// if CAS failed --> another thread updated `lowest_available_snowflake_id` and we re-try + /// else --> our thread reserved ID range [begin, end) and return the beginning of the range return range.begin; } @@ -200,18 +206,21 @@ public: vec_to.resize(input_rows_count); typename FillPolicy::Data data; - - /// get the begin of available snowflake ids range - SnowflakeId snowflake_id = data.reserveRange(input_rows_count); + SnowflakeId snowflake_id = data.reserveRange(input_rows_count); /// returns begin of available snowflake ids range for (UInt64 & to_row : vec_to) { to_row = fromSnowflakeId(snowflake_id); - if (snowflake_id.machine_seq_num++ == max_machine_seq_num) + if (snowflake_id.machine_seq_num == max_machine_seq_num) { + /// handle overflow snowflake_id.machine_seq_num = 0; ++snowflake_id.timestamp; } + else + { + ++snowflake_id.machine_seq_num; + } } } diff --git a/tests/queries/0_stateless/03130_generateSnowflakeId.reference b/tests/queries/0_stateless/03130_generateSnowflakeId.reference index 8cdced96770..6ec0cafab16 100644 --- a/tests/queries/0_stateless/03130_generateSnowflakeId.reference +++ b/tests/queries/0_stateless/03130_generateSnowflakeId.reference @@ -1,11 +1,11 @@ --- generateSnowflakeID -- +-- generateSnowflakeID 1 1 0 0 1 100 --- generateSnowflakeIDThreadMonotonic -- +-- generateSnowflakeIDThreadMonotonic 1 1 100 diff --git 
a/tests/queries/0_stateless/03130_generateSnowflakeId.sql b/tests/queries/0_stateless/03130_generateSnowflakeId.sql index 3e994149d2b..903be5b786c 100644 --- a/tests/queries/0_stateless/03130_generateSnowflakeId.sql +++ b/tests/queries/0_stateless/03130_generateSnowflakeId.sql @@ -1,10 +1,11 @@ -SELECT '-- generateSnowflakeID --'; +SELECT '-- generateSnowflakeID'; + SELECT bitShiftLeft(toUInt64(generateSnowflakeID()), 52) = 0; -- check machine sequence number is zero SELECT bitAnd(bitShiftRight(toUInt64(generateSnowflakeID()), 63), 1) = 0; -- check first bit is zero -SELECT generateSnowflakeID(1) = generateSnowflakeID(2); -SELECT generateSnowflakeID() = generateSnowflakeID(1); -SELECT generateSnowflakeID(1) = generateSnowflakeID(1); +SELECT generateSnowflakeID(1) = generateSnowflakeID(2); -- disabled common subexpression elimination --> lhs != rhs +SELECT generateSnowflakeID() = generateSnowflakeID(1); -- same as ^^ +SELECT generateSnowflakeID(1) = generateSnowflakeID(1); -- enabled common subexpression elimination SELECT generateSnowflakeID(1, 2); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } @@ -15,7 +16,8 @@ FROM FROM numbers(100) ); -SELECT '-- generateSnowflakeIDThreadMonotonic --'; +SELECT '-- generateSnowflakeIDThreadMonotonic'; + SELECT bitShiftLeft(toUInt64(generateSnowflakeIDThreadMonotonic()), 52) = 0; -- check machine sequence number is zero SELECT bitAnd(bitShiftRight(toUInt64(generateSnowflakeIDThreadMonotonic()), 63), 1) = 0; -- check first bit is zero @@ -26,4 +28,4 @@ FROM ( SELECT DISTINCT generateSnowflakeIDThreadMonotonic() FROM numbers(100) -); \ No newline at end of file +); From 0383fa5164cb07fdec7c5fc036137122545acd6a Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Thu, 23 May 2024 18:30:49 +0000 Subject: [PATCH 310/392] do not convert sparse columns to full on vertical merge --- src/Columns/ColumnSparse.cpp | 1 - .../Algorithms/AggregatingSortedAlgorithm.cpp | 5 ++- .../FinishAggregatingInOrderAlgorithm.cpp | 2 ++ .../Merges/Algorithms/IMergingAlgorithm.h | 13 +++++++- .../IMergingAlgorithmWithSharedChunks.cpp | 15 ++------- .../Algorithms/MergingSortedAlgorithm.cpp | 5 ++- .../Algorithms/SummingSortedAlgorithm.cpp | 5 ++- .../Transforms/ColumnGathererTransform.cpp | 31 ++++++++++++++----- .../Transforms/ColumnGathererTransform.h | 8 +++-- src/Storages/MergeTree/MergeTask.cpp | 6 ++-- 10 files changed, 54 insertions(+), 37 deletions(-) diff --git a/src/Columns/ColumnSparse.cpp b/src/Columns/ColumnSparse.cpp index 49947be312d..2e75a2fd4ab 100644 --- a/src/Columns/ColumnSparse.cpp +++ b/src/Columns/ColumnSparse.cpp @@ -8,7 +8,6 @@ #include #include #include -#include #include #include diff --git a/src/Processors/Merges/Algorithms/AggregatingSortedAlgorithm.cpp b/src/Processors/Merges/Algorithms/AggregatingSortedAlgorithm.cpp index 857f5040b79..a77bb0dabfc 100644 --- a/src/Processors/Merges/Algorithms/AggregatingSortedAlgorithm.cpp +++ b/src/Processors/Merges/Algorithms/AggregatingSortedAlgorithm.cpp @@ -76,9 +76,6 @@ static void preprocessChunk(Chunk & chunk, const AggregatingSortedAlgorithm::Col auto num_rows = chunk.getNumRows(); auto columns = chunk.detachColumns(); - for (auto & column : columns) - column = column->convertToFullColumnIfConst(); - for (const auto & desc : def.columns_to_simple_aggregate) if (desc.nested_type) columns[desc.column_number] = recursiveRemoveLowCardinality(columns[desc.column_number]); @@ -266,6 +263,7 @@ AggregatingSortedAlgorithm::AggregatingSortedAlgorithm( void AggregatingSortedAlgorithm::initialize(Inputs inputs) { + 
removeConstAndSparse(inputs); merged_data.initialize(header, inputs); for (auto & input : inputs) @@ -277,6 +275,7 @@ void AggregatingSortedAlgorithm::initialize(Inputs inputs) void AggregatingSortedAlgorithm::consume(Input & input, size_t source_num) { + removeConstAndSparse(input); preprocessChunk(input.chunk, columns_definition); updateCursor(input, source_num); } diff --git a/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.cpp b/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.cpp index a5befca7233..466adf93538 100644 --- a/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.cpp +++ b/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.cpp @@ -40,6 +40,7 @@ FinishAggregatingInOrderAlgorithm::FinishAggregatingInOrderAlgorithm( void FinishAggregatingInOrderAlgorithm::initialize(Inputs inputs) { + removeConstAndSparse(inputs); current_inputs = std::move(inputs); states.resize(num_inputs); for (size_t i = 0; i < num_inputs; ++i) @@ -48,6 +49,7 @@ void FinishAggregatingInOrderAlgorithm::initialize(Inputs inputs) void FinishAggregatingInOrderAlgorithm::consume(Input & input, size_t source_num) { + removeConstAndSparse(input); if (!input.chunk.hasRows()) return; diff --git a/src/Processors/Merges/Algorithms/IMergingAlgorithm.h b/src/Processors/Merges/Algorithms/IMergingAlgorithm.h index 6e352c3f104..9a1c7c24270 100644 --- a/src/Processors/Merges/Algorithms/IMergingAlgorithm.h +++ b/src/Processors/Merges/Algorithms/IMergingAlgorithm.h @@ -39,7 +39,6 @@ public: void set(Chunk chunk_) { - convertToFullIfSparse(chunk_); chunk = std::move(chunk_); skip_last_row = false; } @@ -47,6 +46,18 @@ public: using Inputs = std::vector; + static void removeConstAndSparse(Input & input) + { + convertToFullIfConst(input.chunk); + convertToFullIfSparse(input.chunk); + } + + static void removeConstAndSparse(Inputs & inputs) + { + for (auto & input : inputs) + removeConstAndSparse(input); + } + virtual const char * getName() const = 0; virtual void initialize(Inputs inputs) = 0; virtual void consume(Input & input, size_t source_num) = 0; diff --git a/src/Processors/Merges/Algorithms/IMergingAlgorithmWithSharedChunks.cpp b/src/Processors/Merges/Algorithms/IMergingAlgorithmWithSharedChunks.cpp index fe5186736b5..47b7ddf38dc 100644 --- a/src/Processors/Merges/Algorithms/IMergingAlgorithmWithSharedChunks.cpp +++ b/src/Processors/Merges/Algorithms/IMergingAlgorithmWithSharedChunks.cpp @@ -17,18 +17,9 @@ IMergingAlgorithmWithSharedChunks::IMergingAlgorithmWithSharedChunks( { } -static void prepareChunk(Chunk & chunk) -{ - auto num_rows = chunk.getNumRows(); - auto columns = chunk.detachColumns(); - for (auto & column : columns) - column = column->convertToFullColumnIfConst(); - - chunk.setColumns(std::move(columns), num_rows); -} - void IMergingAlgorithmWithSharedChunks::initialize(Inputs inputs) { + removeConstAndSparse(inputs); merged_data->initialize(header, inputs); for (size_t source_num = 0; source_num < inputs.size(); ++source_num) @@ -36,8 +27,6 @@ void IMergingAlgorithmWithSharedChunks::initialize(Inputs inputs) if (!inputs[source_num].chunk) continue; - prepareChunk(inputs[source_num].chunk); - auto & source = sources[source_num]; source.skip_last_row = inputs[source_num].skip_last_row; @@ -55,7 +44,7 @@ void IMergingAlgorithmWithSharedChunks::initialize(Inputs inputs) void IMergingAlgorithmWithSharedChunks::consume(Input & input, size_t source_num) { - prepareChunk(input.chunk); + removeConstAndSparse(input); auto & source = 
sources[source_num]; source.skip_last_row = input.skip_last_row; diff --git a/src/Processors/Merges/Algorithms/MergingSortedAlgorithm.cpp b/src/Processors/Merges/Algorithms/MergingSortedAlgorithm.cpp index d17a4d859ee..3a9cf7ee141 100644 --- a/src/Processors/Merges/Algorithms/MergingSortedAlgorithm.cpp +++ b/src/Processors/Merges/Algorithms/MergingSortedAlgorithm.cpp @@ -49,17 +49,16 @@ void MergingSortedAlgorithm::addInput() void MergingSortedAlgorithm::initialize(Inputs inputs) { + removeConstAndSparse(inputs); merged_data.initialize(header, inputs); current_inputs = std::move(inputs); for (size_t source_num = 0; source_num < current_inputs.size(); ++source_num) { auto & chunk = current_inputs[source_num].chunk; - if (!chunk) continue; - convertToFullIfConst(chunk); cursors[source_num] = SortCursorImpl(header, chunk.getColumns(), description, source_num); } @@ -83,7 +82,7 @@ void MergingSortedAlgorithm::initialize(Inputs inputs) void MergingSortedAlgorithm::consume(Input & input, size_t source_num) { - convertToFullIfConst(input.chunk); + removeConstAndSparse(input); current_inputs[source_num].swap(input); cursors[source_num].reset(current_inputs[source_num].chunk.getColumns(), header); diff --git a/src/Processors/Merges/Algorithms/SummingSortedAlgorithm.cpp b/src/Processors/Merges/Algorithms/SummingSortedAlgorithm.cpp index 7329821cf97..e2c6371c44f 100644 --- a/src/Processors/Merges/Algorithms/SummingSortedAlgorithm.cpp +++ b/src/Processors/Merges/Algorithms/SummingSortedAlgorithm.cpp @@ -387,9 +387,6 @@ static void preprocessChunk(Chunk & chunk, const SummingSortedAlgorithm::Columns auto num_rows = chunk.getNumRows(); auto columns = chunk.detachColumns(); - for (auto & column : columns) - column = column->convertToFullColumnIfConst(); - for (const auto & desc : def.columns_to_aggregate) { if (desc.nested_type) @@ -704,6 +701,7 @@ SummingSortedAlgorithm::SummingSortedAlgorithm( void SummingSortedAlgorithm::initialize(Inputs inputs) { + removeConstAndSparse(inputs); merged_data.initialize(header, inputs); for (auto & input : inputs) @@ -715,6 +713,7 @@ void SummingSortedAlgorithm::initialize(Inputs inputs) void SummingSortedAlgorithm::consume(Input & input, size_t source_num) { + removeConstAndSparse(input); preprocessChunk(input.chunk, columns_definition); updateCursor(input, source_num); } diff --git a/src/Processors/Transforms/ColumnGathererTransform.cpp b/src/Processors/Transforms/ColumnGathererTransform.cpp index b6bcec26c0c..15f8355bdc7 100644 --- a/src/Processors/Transforms/ColumnGathererTransform.cpp +++ b/src/Processors/Transforms/ColumnGathererTransform.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include @@ -20,11 +21,13 @@ ColumnGathererStream::ColumnGathererStream( size_t num_inputs, ReadBuffer & row_sources_buf_, size_t block_preferred_size_rows_, - size_t block_preferred_size_bytes_) + size_t block_preferred_size_bytes_, + bool is_result_sparse_) : sources(num_inputs) , row_sources_buf(row_sources_buf_) , block_preferred_size_rows(block_preferred_size_rows_) , block_preferred_size_bytes(block_preferred_size_bytes_) + , is_result_sparse(is_result_sparse_) { if (num_inputs == 0) throw Exception(ErrorCodes::EMPTY_DATA_PASSED, "There are no streams to gather"); @@ -36,17 +39,23 @@ void ColumnGathererStream::initialize(Inputs inputs) source_columns.reserve(inputs.size()); for (size_t i = 0; i < inputs.size(); ++i) { - if (inputs[i].chunk) - { - sources[i].update(inputs[i].chunk.detachColumns().at(0)); - source_columns.push_back(sources[i].column); - } + if 
(!inputs[i].chunk) + continue; + + if (!is_result_sparse) + convertToFullIfSparse(inputs[i].chunk); + + sources[i].update(inputs[i].chunk.detachColumns().at(0)); + source_columns.push_back(sources[i].column); } if (source_columns.empty()) return; result_column = source_columns[0]->cloneEmpty(); + if (is_result_sparse && !result_column->isSparse()) + result_column = ColumnSparse::create(std::move(result_column)); + if (result_column->hasDynamicStructure()) result_column->takeDynamicStructureFromSourceColumns(source_columns); } @@ -146,7 +155,12 @@ void ColumnGathererStream::consume(Input & input, size_t source_num) { auto & source = sources[source_num]; if (input.chunk) + { + if (!is_result_sparse) + convertToFullIfSparse(input.chunk); + source.update(input.chunk.getColumns().at(0)); + } if (0 == source.size) { @@ -159,10 +173,11 @@ ColumnGathererTransform::ColumnGathererTransform( size_t num_inputs, ReadBuffer & row_sources_buf_, size_t block_preferred_size_rows_, - size_t block_preferred_size_bytes_) + size_t block_preferred_size_bytes_, + bool is_result_sparse_) : IMergingTransform( num_inputs, header, header, /*have_all_inputs_=*/ true, /*limit_hint_=*/ 0, /*always_read_till_end_=*/ false, - num_inputs, row_sources_buf_, block_preferred_size_rows_, block_preferred_size_bytes_) + num_inputs, row_sources_buf_, block_preferred_size_rows_, block_preferred_size_bytes_, is_result_sparse_) , log(getLogger("ColumnGathererStream")) { if (header.columns() != 1) diff --git a/src/Processors/Transforms/ColumnGathererTransform.h b/src/Processors/Transforms/ColumnGathererTransform.h index 4e56cffa46a..ec5691316ce 100644 --- a/src/Processors/Transforms/ColumnGathererTransform.h +++ b/src/Processors/Transforms/ColumnGathererTransform.h @@ -60,7 +60,8 @@ public: size_t num_inputs, ReadBuffer & row_sources_buf_, size_t block_preferred_size_rows_, - size_t block_preferred_size_bytes_); + size_t block_preferred_size_bytes_, + bool is_result_sparse_); const char * getName() const override { return "ColumnGathererStream"; } void initialize(Inputs inputs) override; @@ -97,6 +98,7 @@ private: const size_t block_preferred_size_rows; const size_t block_preferred_size_bytes; + const bool is_result_sparse; Source * source_to_fully_copy = nullptr; @@ -113,7 +115,8 @@ public: size_t num_inputs, ReadBuffer & row_sources_buf_, size_t block_preferred_size_rows_, - size_t block_preferred_size_bytes_); + size_t block_preferred_size_bytes_, + bool is_result_sparse_); String getName() const override { return "ColumnGathererTransform"; } @@ -145,7 +148,6 @@ void ColumnGathererStream::gather(Column & column_res) next_required_source = -1; - /// We use do ... while here to ensure there will be at least one iteration of this loop. /// Because the column_res.byteSize() could be bigger than block_preferred_size_bytes already at this point. 
do diff --git a/src/Storages/MergeTree/MergeTask.cpp b/src/Storages/MergeTree/MergeTask.cpp index a9109832521..888042454a9 100644 --- a/src/Storages/MergeTree/MergeTask.cpp +++ b/src/Storages/MergeTree/MergeTask.cpp @@ -596,8 +596,9 @@ void MergeTask::VerticalMergeStage::prepareVerticalMergeForOneColumn() const pipes.emplace_back(std::move(pipe)); } - auto pipe = Pipe::unitePipes(std::move(pipes)); + bool is_result_sparse = global_ctx->new_data_part->getSerialization(column_name)->getKind() == ISerialization::Kind::SPARSE; + auto pipe = Pipe::unitePipes(std::move(pipes)); ctx->rows_sources_read_buf->seek(0, 0); const auto data_settings = global_ctx->data->getSettings(); @@ -606,7 +607,8 @@ void MergeTask::VerticalMergeStage::prepareVerticalMergeForOneColumn() const pipe.numOutputPorts(), *ctx->rows_sources_read_buf, data_settings->merge_max_block_size, - data_settings->merge_max_block_size_bytes); + data_settings->merge_max_block_size_bytes, + is_result_sparse); pipe.addTransform(std::move(transform)); From 40753ddefb0324d50bb8d455615da74828c7be76 Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 23 May 2024 21:10:40 +0200 Subject: [PATCH 311/392] Update hdfs test --- tests/integration/test_storage_hdfs/test.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/integration/test_storage_hdfs/test.py b/tests/integration/test_storage_hdfs/test.py index 6ee12a87ebf..eeffa8ed00b 100644 --- a/tests/integration/test_storage_hdfs/test.py +++ b/tests/integration/test_storage_hdfs/test.py @@ -326,7 +326,7 @@ def test_virtual_columns(started_cluster): hdfs_api.write_data("/file1", "1\n") hdfs_api.write_data("/file2", "2\n") hdfs_api.write_data("/file3", "3\n") - expected = "1\tfile1\t/file1\n2\tfile2\t/file2\n3\tfile3\t/file3\n" + expected = "1\tfile1\tfile1\n2\tfile2\tfile2\n3\tfile3\tfile3\n" assert ( node1.query( "select id, _file as file_name, _path as file_path from virtual_cols order by id" @@ -493,13 +493,13 @@ def test_hdfsCluster(started_cluster): actual = node1.query( "select id, _file as file_name, _path as file_path from hdfs('hdfs://hdfs1:9000/test_hdfsCluster/file*', 'TSV', 'id UInt32') order by id" ) - expected = "1\tfile1\t/test_hdfsCluster/file1\n2\tfile2\t/test_hdfsCluster/file2\n3\tfile3\t/test_hdfsCluster/file3\n" + expected = "1\tfile1\ttest_hdfsCluster/file1\n2\tfile2\ttest_hdfsCluster/file2\n3\tfile3\ttest_hdfsCluster/file3\n" assert actual == expected actual = node1.query( "select id, _file as file_name, _path as file_path from hdfsCluster('test_cluster_two_shards', 'hdfs://hdfs1:9000/test_hdfsCluster/file*', 'TSV', 'id UInt32') order by id" ) - expected = "1\tfile1\t/test_hdfsCluster/file1\n2\tfile2\t/test_hdfsCluster/file2\n3\tfile3\t/test_hdfsCluster/file3\n" + expected = "1\tfile1\ttest_hdfsCluster/file1\n2\tfile2\ttest_hdfsCluster/file2\n3\tfile3\ttest_hdfsCluster/file3\n" assert actual == expected fs.delete(dir, recursive=True) @@ -665,7 +665,7 @@ def test_virtual_columns_2(started_cluster): node1.query(f"insert into table function {table_function} SELECT 1, 'kek'") result = node1.query(f"SELECT _path FROM {table_function}") - assert result.strip() == "/parquet_2" + assert result.strip() == "parquet_2" table_function = ( f"hdfs('hdfs://hdfs1:9000/parquet_3', 'Parquet', 'a Int32, _path String')" @@ -978,25 +978,25 @@ def test_read_subcolumns(started_cluster): f"select a.b.d, _path, a.b, _file, a.e from hdfs('hdfs://hdfs1:9000/test_subcolumns.tsv', auto, 'a Tuple(b Tuple(c UInt32, d UInt32), e UInt32)')" ) - assert res == 
"2\t/test_subcolumns.tsv\t(1,2)\ttest_subcolumns.tsv\t3\n" + assert res == "2\ttest_subcolumns.tsv\t(1,2)\ttest_subcolumns.tsv\t3\n" res = node.query( f"select a.b.d, _path, a.b, _file, a.e from hdfs('hdfs://hdfs1:9000/test_subcolumns.jsonl', auto, 'a Tuple(b Tuple(c UInt32, d UInt32), e UInt32)')" ) - assert res == "2\t/test_subcolumns.jsonl\t(1,2)\ttest_subcolumns.jsonl\t3\n" + assert res == "2\ttest_subcolumns.jsonl\t(1,2)\ttest_subcolumns.jsonl\t3\n" res = node.query( f"select x.b.d, _path, x.b, _file, x.e from hdfs('hdfs://hdfs1:9000/test_subcolumns.jsonl', auto, 'x Tuple(b Tuple(c UInt32, d UInt32), e UInt32)')" ) - assert res == "0\t/test_subcolumns.jsonl\t(0,0)\ttest_subcolumns.jsonl\t0\n" + assert res == "0\ttest_subcolumns.jsonl\t(0,0)\ttest_subcolumns.jsonl\t0\n" res = node.query( f"select x.b.d, _path, x.b, _file, x.e from hdfs('hdfs://hdfs1:9000/test_subcolumns.jsonl', auto, 'x Tuple(b Tuple(c UInt32, d UInt32), e UInt32) default ((42, 42), 42)')" ) - assert res == "42\t/test_subcolumns.jsonl\t(42,42)\ttest_subcolumns.jsonl\t42\n" + assert res == "42\ttest_subcolumns.jsonl\t(42,42)\ttest_subcolumns.jsonl\t42\n" def test_union_schema_inference_mode(started_cluster): From bd15e1311a949753a234cfed9571600af78eb906 Mon Sep 17 00:00:00 2001 From: Max K Date: Thu, 23 May 2024 22:35:21 +0200 Subject: [PATCH 312/392] CI: fix --- tests/ci/ci.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/ci/ci.py b/tests/ci/ci.py index 68db08fbe96..4afd3f46f9d 100644 --- a/tests/ci/ci.py +++ b/tests/ci/ci.py @@ -1917,7 +1917,7 @@ def _cancel_pr_wf(s3: S3Helper, pr_number: int, cancel_sync: bool = False) -> No print( f"Canceling PR workflow run_id: [{wf_data.run_id}], pr: [{pr_number}]" ) - GitHub.cancel_wf(GITHUB_REPOSITORY, get_best_robot_token(), wf_data.run_id) + GitHub.cancel_wf(GITHUB_REPOSITORY, wf_data.run_id, get_best_robot_token()) else: if not wf_data.sync_pr_run_id: print("WARNING: Sync PR run id has not been found") @@ -1925,8 +1925,8 @@ def _cancel_pr_wf(s3: S3Helper, pr_number: int, cancel_sync: bool = False) -> No print(f"Canceling sync PR workflow run_id: [{wf_data.sync_pr_run_id}]") GitHub.cancel_wf( "ClickHouse/clickhouse-private", - get_best_robot_token(), wf_data.sync_pr_run_id, + get_best_robot_token(), ) From dac31fb92a80982ec0a98472485fa02c4b917c07 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Tue, 21 May 2024 17:29:00 +0000 Subject: [PATCH 313/392] Include settings into query cache key --- src/Interpreters/Cache/QueryCache.cpp | 37 ++++++++-- src/Interpreters/Cache/QueryCache.h | 5 +- src/Interpreters/executeQuery.cpp | 4 +- .../02494_query_cache_key.reference | 6 ++ .../0_stateless/02494_query_cache_key.sql | 70 +++++++++++++++++++ .../02494_query_cache_use_database.reference | 2 - .../02494_query_cache_use_database.sql | 30 -------- 7 files changed, 113 insertions(+), 41 deletions(-) create mode 100644 tests/queries/0_stateless/02494_query_cache_key.reference create mode 100644 tests/queries/0_stateless/02494_query_cache_key.sql delete mode 100644 tests/queries/0_stateless/02494_query_cache_use_database.reference delete mode 100644 tests/queries/0_stateless/02494_query_cache_use_database.sql diff --git a/src/Interpreters/Cache/QueryCache.cpp b/src/Interpreters/Cache/QueryCache.cpp index 4b10bfd3dcd..a3fe8c2e779 100644 --- a/src/Interpreters/Cache/QueryCache.cpp +++ b/src/Interpreters/Cache/QueryCache.cpp @@ -126,6 +126,11 @@ bool astContainsSystemTables(ASTPtr ast, ContextPtr context) namespace { +bool isQueryCacheRelatedSetting(const 
String & setting_name) +{ + return setting_name.starts_with("query_cache_") || setting_name.ends_with("_query_cache"); +} + class RemoveQueryCacheSettingsMatcher { public: @@ -141,7 +146,7 @@ public: auto is_query_cache_related_setting = [](const auto & change) { - return change.name.starts_with("query_cache_") || change.name.ends_with("_query_cache"); + return isQueryCacheRelatedSetting(change.name); }; std::erase_if(set_clause->changes, is_query_cache_related_setting); @@ -177,11 +182,11 @@ ASTPtr removeQueryCacheSettings(ASTPtr ast) return transformed_ast; } -IAST::Hash calculateAstHash(ASTPtr ast, const String & current_database) +IAST::Hash calculateAstHash(ASTPtr ast, const String & current_database, const Settings & settings) { ast = removeQueryCacheSettings(ast); - /// Hash the AST, it must consider aliases (issue #56258) + /// Hash the AST, we must consider aliases (issue #56258) SipHash hash; ast->updateTreeHash(hash, /*ignore_aliases=*/ false); @@ -189,6 +194,25 @@ IAST::Hash calculateAstHash(ASTPtr ast, const String & current_database) /// tables (issue #64136) hash.update(current_database); + /// Finally, hash the (changed) settings as they might affect the query result (e.g. think of settings `additional_table_filters` and `limit`). + /// Note: allChanged() returns the settings in random order. Also, update()-s of the composite hash must be done in deterministic order. + /// Therefore, collect and sort the settings first, then hash them. + Settings::Range changed_settings = settings.allChanged(); + std::vector> changed_settings_sorted; /// (name, value) + for (const auto & setting : changed_settings) + { + const String & name = setting.getName(); + const String & value = setting.getValueString(); + if (!isQueryCacheRelatedSetting(name)) /// see removeQueryCacheSettings() why this is a good idea + changed_settings_sorted.push_back({name, value}); + } + std::sort(changed_settings_sorted.begin(), changed_settings_sorted.end(), [](auto & lhs, auto & rhs) { return lhs.first < rhs.first; }); + for (const auto & setting : changed_settings_sorted) + { + hash.update(setting.first); + hash.update(setting.second); + } + return getSipHash128AsPair(hash); } @@ -204,12 +228,13 @@ String queryStringFromAST(ASTPtr ast) QueryCache::Key::Key( ASTPtr ast_, const String & current_database, + const Settings & settings, Block header_, std::optional user_id_, const std::vector & current_user_roles_, bool is_shared_, std::chrono::time_point expires_at_, bool is_compressed_) - : ast_hash(calculateAstHash(ast_, current_database)) + : ast_hash(calculateAstHash(ast_, current_database, settings)) , header(header_) , user_id(user_id_) , current_user_roles(current_user_roles_) @@ -220,8 +245,8 @@ QueryCache::Key::Key( { } -QueryCache::Key::Key(ASTPtr ast_, const String & current_database, std::optional user_id_, const std::vector & current_user_roles_) - : QueryCache::Key(ast_, current_database, {}, user_id_, current_user_roles_, false, std::chrono::system_clock::from_time_t(1), false) /// dummy values for everything != AST, current database, user name/roles +QueryCache::Key::Key(ASTPtr ast_, const String & current_database, const Settings & settings, std::optional user_id_, const std::vector & current_user_roles_) + : QueryCache::Key(ast_, current_database, settings, {}, user_id_, current_user_roles_, false, std::chrono::system_clock::from_time_t(1), false) /// dummy values for everything != AST, current database, user name/roles { } diff --git a/src/Interpreters/Cache/QueryCache.h 
b/src/Interpreters/Cache/QueryCache.h index b5b6f477137..461197cac32 100644 --- a/src/Interpreters/Cache/QueryCache.h +++ b/src/Interpreters/Cache/QueryCache.h @@ -14,6 +14,8 @@ namespace DB { +struct Settings; + /// Does AST contain non-deterministic functions like rand() and now()? bool astContainsNonDeterministicFunctions(ASTPtr ast, ContextPtr context); @@ -89,6 +91,7 @@ public: /// Ctor to construct a Key for writing into query cache. Key(ASTPtr ast_, const String & current_database, + const Settings & settings, Block header_, std::optional user_id_, const std::vector & current_user_roles_, bool is_shared_, @@ -96,7 +99,7 @@ public: bool is_compressed); /// Ctor to construct a Key for reading from query cache (this operation only needs the AST + user name). - Key(ASTPtr ast_, const String & current_database, std::optional user_id_, const std::vector & current_user_roles_); + Key(ASTPtr ast_, const String & current_database, const Settings & settings, std::optional user_id_, const std::vector & current_user_roles_); bool operator==(const Key & other) const; }; diff --git a/src/Interpreters/executeQuery.cpp b/src/Interpreters/executeQuery.cpp index 56f08dbb902..0b5f68f27f6 100644 --- a/src/Interpreters/executeQuery.cpp +++ b/src/Interpreters/executeQuery.cpp @@ -1101,7 +1101,7 @@ static std::tuple executeQueryImpl( { if (can_use_query_cache && settings.enable_reads_from_query_cache) { - QueryCache::Key key(ast, context->getCurrentDatabase(), context->getUserID(), context->getCurrentRoles()); + QueryCache::Key key(ast, context->getCurrentDatabase(), settings, context->getUserID(), context->getCurrentRoles()); QueryCache::Reader reader = query_cache->createReader(key); if (reader.hasCacheEntryForKey()) { @@ -1224,7 +1224,7 @@ static std::tuple executeQueryImpl( && (!ast_contains_system_tables || system_table_handling == QueryCacheSystemTableHandling::Save)) { QueryCache::Key key( - ast, context->getCurrentDatabase(), res.pipeline.getHeader(), + ast, context->getCurrentDatabase(), settings, res.pipeline.getHeader(), context->getUserID(), context->getCurrentRoles(), settings.query_cache_share_between_users, std::chrono::system_clock::now() + std::chrono::seconds(settings.query_cache_ttl), diff --git a/tests/queries/0_stateless/02494_query_cache_key.reference b/tests/queries/0_stateless/02494_query_cache_key.reference new file mode 100644 index 00000000000..8f5b61192d5 --- /dev/null +++ b/tests/queries/0_stateless/02494_query_cache_key.reference @@ -0,0 +1,6 @@ +Test (1) +1 +2 +Test (2) +4 +4 diff --git a/tests/queries/0_stateless/02494_query_cache_key.sql b/tests/queries/0_stateless/02494_query_cache_key.sql new file mode 100644 index 00000000000..d8c68e0d267 --- /dev/null +++ b/tests/queries/0_stateless/02494_query_cache_key.sql @@ -0,0 +1,70 @@ +-- Tags: no-parallel +-- Tag no-parallel: Messes with internal cache + +-- Tests that the key of the query cache is not only formed by the query AST but also by +-- (1) the current database (`USE db`, issue #64136), +-- (2) the query settings + + +SELECT 'Test (1)'; + +SYSTEM DROP QUERY CACHE; + +DROP DATABASE IF EXISTS db1; +DROP DATABASE IF EXISTS db2; + +CREATE DATABASE db1; +CREATE DATABASE db2; + +CREATE TABLE db1.tab(a UInt64, PRIMARY KEY a); +CREATE TABLE db2.tab(a UInt64, PRIMARY KEY a); + +INSERT INTO db1.tab values(1); +INSERT INTO db2.tab values(2); + +USE db1; +SELECT * FROM tab SETTINGS use_query_cache=1; + +USE db2; +SELECT * FROM tab SETTINGS use_query_cache=1; + +DROP DATABASE db1; +DROP DATABASE db2; + +SYSTEM DROP QUERY CACHE; + 
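+-- The next section exercises the behaviour added by this patch: every changed setting that is not
+-- query-cache-related is hashed, in sorted order, into the query cache key, so identical query text
+-- run with different settings produces separate cache entries. A minimal sketch of that expectation,
+-- assuming the query cache starts empty:
+--     SELECT 1 SETTINGS use_query_cache = 1, max_block_size = 1;
+--     SELECT 1 SETTINGS use_query_cache = 1, max_block_size = 2;
+--     SELECT count(query) FROM system.query_cache; -- expected result: 2 (one entry per settings combination)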
+ +SELECT 'Test (2)'; + +-- test with query-level settings +SELECT 1 SETTINGS use_query_cache = 1, limit = 1, use_skip_indexes = 0 Format Null; +SELECT 1 SETTINGS use_query_cache = 1, use_skip_indexes = 0 Format Null; +SELECT 1 SETTINGS use_query_cache = 1, use_skip_indexes = 1 Format Null; +SELECT 1 SETTINGS use_query_cache = 1, max_block_size = 1 Format Null; + +-- 4x the same query but with different settings each. There should yield four entries in the query cache. +SELECT count(query) FROM system.query_cache; + +SYSTEM DROP QUERY CACHE; + +-- test with mixed session-level/query-level settings +SET use_query_cache = 1; +SET limit = 1; +SELECT 1 SETTINGS use_skip_indexes = 0 Format Null; +SET limit = default; +SET use_skip_indexes = 0; +SELECT 1 Format Null; +SET use_skip_indexes = 1; +SELECT 1 SETTINGS use_skip_indexes = 1 Format Null; +SET use_skip_indexes = default; +SET max_block_size = 1; +SELECT 1 Format Null; +SET max_block_size = default; + +SET use_query_cache = default; + +-- 4x the same query but with different settings each. There should yield four entries in the query cache. +SELECT count(query) FROM system.query_cache; + +SYSTEM DROP QUERY CACHE; + diff --git a/tests/queries/0_stateless/02494_query_cache_use_database.reference b/tests/queries/0_stateless/02494_query_cache_use_database.reference deleted file mode 100644 index 1191247b6d9..00000000000 --- a/tests/queries/0_stateless/02494_query_cache_use_database.reference +++ /dev/null @@ -1,2 +0,0 @@ -1 -2 diff --git a/tests/queries/0_stateless/02494_query_cache_use_database.sql b/tests/queries/0_stateless/02494_query_cache_use_database.sql deleted file mode 100644 index df560f82ebb..00000000000 --- a/tests/queries/0_stateless/02494_query_cache_use_database.sql +++ /dev/null @@ -1,30 +0,0 @@ --- Tags: no-parallel, no-fasttest --- Tag no-fasttest: Depends on OpenSSL --- Tag no-parallel: Messes with internal cache - --- Test for issue #64136 - -SYSTEM DROP QUERY CACHE; - -DROP DATABASE IF EXISTS db1; -DROP DATABASE IF EXISTS db2; - -CREATE DATABASE db1; -CREATE DATABASE db2; - -CREATE TABLE db1.tab(a UInt64, PRIMARY KEY a); -CREATE TABLE db2.tab(a UInt64, PRIMARY KEY a); - -INSERT INTO db1.tab values(1); -INSERT INTO db2.tab values(2); - -USE db1; -SELECT * FROM tab SETTINGS use_query_cache=1; - -USE db2; -SELECT * FROM tab SETTINGS use_query_cache=1; - -DROP DATABASE db1; -DROP DATABASE db2; - -SYSTEM DROP QUERY CACHE; From 6e6e2944b56245cd5eefd14deb7dba7b8459b935 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 23 May 2024 21:26:33 +0000 Subject: [PATCH 314/392] Fix glitch in #62696 --- src/Functions/FunctionHelpers.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/Functions/FunctionHelpers.cpp b/src/Functions/FunctionHelpers.cpp index 3b057779ffe..d85bb0e7060 100644 --- a/src/Functions/FunctionHelpers.cpp +++ b/src/Functions/FunctionHelpers.cpp @@ -21,8 +21,6 @@ namespace ErrorCodes const ColumnConst * checkAndGetColumnConstStringOrFixedString(const IColumn * column) { - if (!column) - return {}; if (!isColumnConst(*column)) return {}; From 5710b5852f9e067fbcd8809196c9c403a8de43dc Mon Sep 17 00:00:00 2001 From: Nataly Merezhuk Date: Thu, 23 May 2024 17:45:58 -0400 Subject: [PATCH 315/392] Adds note - file engine unavailable in ClickHouse Cloud. 
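The File engine works against the local filesystem of the server, which is not accessible on
ClickHouse Cloud, so the documentation now points readers at the S3 table function instead. A minimal
sketch of the suggested alternative, assuming a publicly readable bucket (the bucket name and path are
illustrative):

    SELECT * FROM s3('https://my-bucket.s3.amazonaws.com/data/example.csv', NOSIGN, 'CSVWithNames');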
--- docs/en/engines/table-engines/special/file.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/en/engines/table-engines/special/file.md b/docs/en/engines/table-engines/special/file.md index fdf5242ba3b..0d422f64762 100644 --- a/docs/en/engines/table-engines/special/file.md +++ b/docs/en/engines/table-engines/special/file.md @@ -14,6 +14,10 @@ Usage scenarios: - Convert data from one format to another. - Updating data in ClickHouse via editing a file on a disk. +:::note +This engine is not currently available in ClickHouse Cloud, please [use the S3 table function instead](/docs/en/sql-reference/table-functions/s3.md). +::: + ## Usage in ClickHouse Server {#usage-in-clickhouse-server} ``` sql From 251010f109a538c770f830bc254e031924486c46 Mon Sep 17 00:00:00 2001 From: TTPO100AJIEX Date: Fri, 24 May 2024 02:14:26 +0300 Subject: [PATCH 316/392] Move protocol-server and inter-server management into separate classes Co-authored-by: Alex Koledaev --- programs/server/Server.cpp | 987 +----------------- programs/server/Server.h | 95 +- src/CMakeLists.txt | 1 + src/Server/ServersManager/IServersManager.cpp | 268 +++++ src/Server/ServersManager/IServersManager.h | 74 ++ .../ServersManager/InterServersManager.cpp | 327 ++++++ .../ServersManager/InterServersManager.h | 45 + .../ServersManager/ProtocolServersManager.cpp | 523 ++++++++++ .../ServersManager/ProtocolServersManager.h | 37 + 9 files changed, 1325 insertions(+), 1032 deletions(-) create mode 100644 src/Server/ServersManager/IServersManager.cpp create mode 100644 src/Server/ServersManager/IServersManager.h create mode 100644 src/Server/ServersManager/InterServersManager.cpp create mode 100644 src/Server/ServersManager/InterServersManager.h create mode 100644 src/Server/ServersManager/ProtocolServersManager.cpp create mode 100644 src/Server/ServersManager/ProtocolServersManager.h diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 223bc1f77e7..b62ae40924c 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -6,8 +6,6 @@ #include #include #include -#include -#include #include #include #include @@ -44,11 +42,9 @@ #include #include #include -#include #include #include #include -#include #include #include #include @@ -83,29 +79,19 @@ #include #include #include -#include #include "MetricsTransmitter.h" #include -#include -#include #include #include #include #include #include -#include -#include -#include -#include -#include -#include -#include -#include #include +#include +#include #include #include #include -#include #include "config.h" #include @@ -119,19 +105,9 @@ #endif #if USE_SSL -# include # include #endif -#if USE_GRPC -# include -#endif - -#if USE_NURAFT -# include -# include -#endif - #if USE_JEMALLOC # include #endif @@ -159,18 +135,6 @@ namespace ProfileEvents { extern const Event MainConfigLoads; extern const Event ServerStartupMilliseconds; - extern const Event InterfaceNativeSendBytes; - extern const Event InterfaceNativeReceiveBytes; - extern const Event InterfaceHTTPSendBytes; - extern const Event InterfaceHTTPReceiveBytes; - extern const Event InterfacePrometheusSendBytes; - extern const Event InterfacePrometheusReceiveBytes; - extern const Event InterfaceInterserverSendBytes; - extern const Event InterfaceInterserverReceiveBytes; - extern const Event InterfaceMySQLSendBytes; - extern const Event InterfaceMySQLReceiveBytes; - extern const Event InterfacePostgreSQLSendBytes; - extern const Event InterfacePostgreSQLReceiveBytes; } namespace fs = std::filesystem; @@ -238,11 
+202,9 @@ namespace DB namespace ErrorCodes { extern const int NO_ELEMENTS_IN_CONFIG; - extern const int SUPPORT_IS_DISABLED; extern const int ARGUMENT_OUT_OF_BOUND; extern const int EXCESSIVE_ELEMENT_IN_CONFIG; extern const int INVALID_CONFIG_PARAMETER; - extern const int NETWORK_ERROR; extern const int CORRUPTED_DATA; } @@ -257,115 +219,6 @@ static std::string getCanonicalPath(std::string && path) return std::move(path); } -Poco::Net::SocketAddress Server::socketBindListen( - const Poco::Util::AbstractConfiguration & config, - Poco::Net::ServerSocket & socket, - const std::string & host, - UInt16 port, - [[maybe_unused]] bool secure) const -{ - auto address = makeSocketAddress(host, port, &logger()); - socket.bind(address, /* reuseAddress = */ true, /* reusePort = */ config.getBool("listen_reuse_port", false)); - /// If caller requests any available port from the OS, discover it after binding. - if (port == 0) - { - address = socket.address(); - LOG_DEBUG(&logger(), "Requested any available port (port == 0), actual port is {:d}", address.port()); - } - - socket.listen(/* backlog = */ config.getUInt("listen_backlog", 4096)); - - return address; -} - -Strings getListenHosts(const Poco::Util::AbstractConfiguration & config) -{ - auto listen_hosts = DB::getMultipleValuesFromConfig(config, "", "listen_host"); - if (listen_hosts.empty()) - { - listen_hosts.emplace_back("::1"); - listen_hosts.emplace_back("127.0.0.1"); - } - return listen_hosts; -} - -Strings getInterserverListenHosts(const Poco::Util::AbstractConfiguration & config) -{ - auto interserver_listen_hosts = DB::getMultipleValuesFromConfig(config, "", "interserver_listen_host"); - if (!interserver_listen_hosts.empty()) - return interserver_listen_hosts; - - /// Use more general restriction in case of emptiness - return getListenHosts(config); -} - -bool getListenTry(const Poco::Util::AbstractConfiguration & config) -{ - bool listen_try = config.getBool("listen_try", false); - if (!listen_try) - { - Poco::Util::AbstractConfiguration::Keys protocols; - config.keys("protocols", protocols); - listen_try = - DB::getMultipleValuesFromConfig(config, "", "listen_host").empty() && - std::none_of(protocols.begin(), protocols.end(), [&](const auto & protocol) - { - return config.has("protocols." + protocol + ".host") && config.has("protocols." + protocol + ".port"); - }); - } - return listen_try; -} - - -void Server::createServer( - Poco::Util::AbstractConfiguration & config, - const std::string & listen_host, - const char * port_name, - bool listen_try, - bool start_server, - std::vector & servers, - CreateServerFunc && func) const -{ - /// For testing purposes, user may omit tcp_port or http_port or https_port in configuration file. - if (config.getString(port_name, "").empty()) - return; - - /// If we already have an active server for this listen_host/port_name, don't create it again - for (const auto & server : servers) - { - if (!server.isStopping() && server.getListenHost() == listen_host && server.getPortName() == port_name) - return; - } - - auto port = config.getInt(port_name); - try - { - servers.push_back(func(port)); - if (start_server) - { - servers.back().start(); - LOG_INFO(&logger(), "Listening for {}", servers.back().getDescription()); - } - global_context->registerServerPort(port_name, port); - } - catch (const Poco::Exception &) - { - if (listen_try) - { - LOG_WARNING(&logger(), "Listen [{}]:{} failed: {}. 
If it is an IPv6 or IPv4 address and your host has disabled IPv6 or IPv4, " - "then consider to " - "specify not disabled IPv4 or IPv6 address to listen in element of configuration " - "file. Example for disabled IPv6: 0.0.0.0 ." - " Example for disabled IPv4: ::", - listen_host, port, getCurrentExceptionMessage(false)); - } - else - { - throw Exception(ErrorCodes::NETWORK_ERROR, "Listen [{}]:{} failed: {}", listen_host, port, getCurrentExceptionMessage(false)); - } - } -} - #if defined(OS_LINUX) namespace @@ -665,6 +518,7 @@ try ServerSettings server_settings; server_settings.loadSettingsFromConfig(config()); + Poco::ThreadPool server_pool(3, server_settings.max_connections); ASTAlterCommand::setFormatAlterCommandsWithParentheses(server_settings.format_alter_operations_with_parentheses); @@ -721,11 +575,6 @@ try CurrentMetrics::set(CurrentMetrics::Revision, ClickHouseRevision::getVersionRevision()); CurrentMetrics::set(CurrentMetrics::VersionInteger, ClickHouseRevision::getVersionInteger()); - Poco::ThreadPool server_pool(3, server_settings.max_connections); - std::mutex servers_lock; - std::vector servers; - std::vector servers_to_start_before_tables; - /** Context contains all that query execution is dependent: * settings, available functions, data types, aggregate functions, databases, ... */ @@ -775,6 +624,10 @@ try bool will_have_trace_collector = hasPHDRCache() && config().has("trace_log"); + std::mutex servers_lock; + ProtocolServersManager servers(context(), &logger()); + InterServersManager servers_to_start_before_tables(context(), &logger()); + // Initialize global thread pool. Do it before we fetch configs from zookeeper // nodes (`from_zk`), because ZooKeeper interface uses the pool. We will // ignore `max_thread_pool_size` in configs we fetch from ZK, but oh well. @@ -806,32 +659,7 @@ try LOG_DEBUG(log, "Shut down storages."); - if (!servers_to_start_before_tables.empty()) - { - LOG_DEBUG(log, "Waiting for current connections to servers for tables to finish."); - size_t current_connections = 0; - { - std::lock_guard lock(servers_lock); - for (auto & server : servers_to_start_before_tables) - { - server.stop(); - current_connections += server.currentConnections(); - } - } - - if (current_connections) - LOG_INFO(log, "Closed all listening sockets. Waiting for {} outstanding connections.", current_connections); - else - LOG_INFO(log, "Closed all listening sockets."); - - if (current_connections > 0) - current_connections = waitServersToFinish(servers_to_start_before_tables, servers_lock, server_settings.shutdown_wait_unfinished); - - if (current_connections) - LOG_INFO(log, "Closed connections to servers for tables. But {} remain. 
Probably some tables of other users cannot finish their connections after context shutdown.", current_connections); - else - LOG_INFO(log, "Closed connections to servers for tables."); - } + servers_to_start_before_tables.stopServers(server_settings, servers_lock); global_context->shutdownKeeperDispatcher(); @@ -928,19 +756,13 @@ try server_settings.asynchronous_heavy_metrics_update_period_s, [&]() -> std::vector { - std::vector metrics; - std::lock_guard lock(servers_lock); - metrics.reserve(servers_to_start_before_tables.size() + servers.size()); - - for (const auto & server : servers_to_start_before_tables) - metrics.emplace_back(ProtocolServerMetrics{server.getPortName(), server.currentThreads()}); - - for (const auto & server : servers) - metrics.emplace_back(ProtocolServerMetrics{server.getPortName(), server.currentThreads()}); - return metrics; - } - ); + std::vector metrics1 = servers_to_start_before_tables.getMetrics(); + std::vector metrics2 = servers.getMetrics(); + metrics1.reserve(metrics1.size() + metrics2.size()); + metrics1.insert(metrics1.end(), std::make_move_iterator(metrics2.begin()), std::make_move_iterator(metrics2.end())); + return metrics1; + }); zkutil::validateZooKeeperConfig(config()); bool has_zookeeper = zkutil::hasZooKeeperConfig(config()); @@ -1588,7 +1410,8 @@ try if (global_context->isServerCompletelyStarted()) { std::lock_guard lock(servers_lock); - updateServers(*config, server_pool, async_metrics, servers, servers_to_start_before_tables); + servers.updateServers(*config, *this, servers_lock, server_pool, async_metrics, latest_config); + servers_to_start_before_tables.updateServers(*config, *this, servers_lock, server_pool, async_metrics, latest_config); } } @@ -1635,141 +1458,17 @@ try /// Must be the last. latest_config = config; }, - /* already_loaded = */ false); /// Reload it right now (initial loading) + /* already_loaded = */ false); /// Reload it right now (initial loading) - const auto listen_hosts = getListenHosts(config()); - const auto interserver_listen_hosts = getInterserverListenHosts(config()); - const auto listen_try = getListenTry(config()); - - if (config().has("keeper_server.server_id")) - { -#if USE_NURAFT - //// If we don't have configured connection probably someone trying to use clickhouse-server instead - //// of clickhouse-keeper, so start synchronously. - bool can_initialize_keeper_async = false; - - if (has_zookeeper) /// We have configured connection to some zookeeper cluster - { - /// If we cannot connect to some other node from our cluster then we have to wait our Keeper start - /// synchronously. - can_initialize_keeper_async = global_context->tryCheckClientConnectionToMyKeeperCluster(); - } - /// Initialize keeper RAFT. 
- global_context->initializeKeeperDispatcher(can_initialize_keeper_async); - FourLetterCommandFactory::registerCommands(*global_context->getKeeperDispatcher()); - - auto config_getter = [this] () -> const Poco::Util::AbstractConfiguration & - { - return global_context->getConfigRef(); - }; - - for (const auto & listen_host : listen_hosts) - { - /// TCP Keeper - const char * port_name = "keeper_server.tcp_port"; - createServer( - config(), listen_host, port_name, listen_try, /* start_server: */ false, - servers_to_start_before_tables, - [&](UInt16 port) -> ProtocolServerAdapter - { - Poco::Net::ServerSocket socket; - auto address = socketBindListen(config(), socket, listen_host, port); - socket.setReceiveTimeout(Poco::Timespan(config().getUInt64("keeper_server.socket_receive_timeout_sec", DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC), 0)); - socket.setSendTimeout(Poco::Timespan(config().getUInt64("keeper_server.socket_send_timeout_sec", DBMS_DEFAULT_SEND_TIMEOUT_SEC), 0)); - return ProtocolServerAdapter( - listen_host, - port_name, - "Keeper (tcp): " + address.toString(), - std::make_unique( - new KeeperTCPHandlerFactory( - config_getter, global_context->getKeeperDispatcher(), - global_context->getSettingsRef().receive_timeout.totalSeconds(), - global_context->getSettingsRef().send_timeout.totalSeconds(), - false), server_pool, socket)); - }); - - const char * secure_port_name = "keeper_server.tcp_port_secure"; - createServer( - config(), listen_host, secure_port_name, listen_try, /* start_server: */ false, - servers_to_start_before_tables, - [&](UInt16 port) -> ProtocolServerAdapter - { -#if USE_SSL - Poco::Net::SecureServerSocket socket; - auto address = socketBindListen(config(), socket, listen_host, port, /* secure = */ true); - socket.setReceiveTimeout(Poco::Timespan(config().getUInt64("keeper_server.socket_receive_timeout_sec", DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC), 0)); - socket.setSendTimeout(Poco::Timespan(config().getUInt64("keeper_server.socket_send_timeout_sec", DBMS_DEFAULT_SEND_TIMEOUT_SEC), 0)); - return ProtocolServerAdapter( - listen_host, - secure_port_name, - "Keeper with secure protocol (tcp_secure): " + address.toString(), - std::make_unique( - new KeeperTCPHandlerFactory( - config_getter, global_context->getKeeperDispatcher(), - global_context->getSettingsRef().receive_timeout.totalSeconds(), - global_context->getSettingsRef().send_timeout.totalSeconds(), true), server_pool, socket)); -#else - UNUSED(port); - throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "SSL support for TCP protocol is disabled because Poco library was built without NetSSL support."); -#endif - }); - - /// HTTP control endpoints - port_name = "keeper_server.http_control.port"; - createServer(config(), listen_host, port_name, listen_try, /* start_server: */ false, - servers_to_start_before_tables, - [&](UInt16 port) -> ProtocolServerAdapter - { - auto http_context = httpContext(); - Poco::Timespan keep_alive_timeout(config().getUInt("keep_alive_timeout", 10), 0); - Poco::Net::HTTPServerParams::Ptr http_params = new Poco::Net::HTTPServerParams; - http_params->setTimeout(http_context->getReceiveTimeout()); - http_params->setKeepAliveTimeout(keep_alive_timeout); - - Poco::Net::ServerSocket socket; - auto address = socketBindListen(config(), socket, listen_host, port); - socket.setReceiveTimeout(http_context->getReceiveTimeout()); - socket.setSendTimeout(http_context->getSendTimeout()); - return ProtocolServerAdapter( - listen_host, - port_name, - "HTTP Control: http://" + address.toString(), - std::make_unique( - 
std::move(http_context), - createKeeperHTTPControlMainHandlerFactory( - config_getter(), - global_context->getKeeperDispatcher(), - "KeeperHTTPControlHandler-factory"), server_pool, socket, http_params)); - }); - } -#else - throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "ClickHouse server built without NuRaft library. Cannot use internal coordination."); -#endif - - } - - { - std::lock_guard lock(servers_lock); - /// We should start interserver communications before (and more important shutdown after) tables. - /// Because server can wait for a long-running queries (for example in tcp_handler) after interserver handler was already shut down. - /// In this case we will have replicated tables which are unable to send any parts to other replicas, but still can - /// communicate with zookeeper, execute merges, etc. - createInterserverServers( - config(), - interserver_listen_hosts, - listen_try, - server_pool, - async_metrics, - servers_to_start_before_tables, - /* start_servers= */ false); - - - for (auto & server : servers_to_start_before_tables) - { - server.start(); - LOG_INFO(log, "Listening for {}", server.getDescription()); - } - } + servers_to_start_before_tables.createServers( + config(), + *this, + servers_lock, + server_pool, + async_metrics, + /* start_servers= */ false, + ServerType(ServerType::Type::QUERIES_ALL) + ); /// Initialize access storages. auto & access_control = global_context->getAccessControl(); @@ -1799,19 +1498,18 @@ try global_context->setStopServersCallback([&](const ServerType & server_type) { std::lock_guard lock(servers_lock); - stopServers(servers, server_type); + servers.stopServers(server_type); }); global_context->setStartServersCallback([&](const ServerType & server_type) { std::lock_guard lock(servers_lock); - createServers( + servers.createServers( config(), - listen_hosts, - listen_try, + *this, + servers_lock, server_pool, async_metrics, - servers, /* start_servers= */ true, server_type); }); @@ -2024,18 +1722,21 @@ try { std::lock_guard lock(servers_lock); - createServers(config(), listen_hosts, listen_try, server_pool, async_metrics, servers); + servers.createServers( + config(), + *this, + servers_lock, + server_pool, + async_metrics, + false, + ServerType(ServerType::Type::QUERIES_ALL)); if (servers.empty()) - throw Exception(ErrorCodes::NO_ELEMENTS_IN_CONFIG, - "No servers started (add valid listen_host and 'tcp_port' or 'http_port' " - "to configuration file.)"); + throw Exception( + ErrorCodes::NO_ELEMENTS_IN_CONFIG, + "No servers started (add valid listen_host and 'tcp_port' " + "or 'http_port' to configuration file.)"); } - if (servers.empty()) - throw Exception(ErrorCodes::NO_ELEMENTS_IN_CONFIG, - "No servers started (add valid listen_host and 'tcp_port' or 'http_port' " - "to configuration file.)"); - #if USE_SSL CertificateReloader::instance().tryLoad(config()); #endif @@ -2107,12 +1808,7 @@ try { std::lock_guard lock(servers_lock); - for (auto & server : servers) - { - server.start(); - LOG_INFO(log, "Listening for {}", server.getDescription()); - } - + servers.startServers(); global_context->setServerCompletelyStarted(); LOG_INFO(log, "Ready for connections."); } @@ -2148,46 +1844,10 @@ try access_control.stopPeriodicReloading(); is_cancelled = true; - - LOG_DEBUG(log, "Waiting for current connections to close."); - - size_t current_connections = 0; - { - std::lock_guard lock(servers_lock); - for (auto & server : servers) - { - server.stop(); - current_connections += server.currentConnections(); - } - } - - if (current_connections) - 
LOG_WARNING(log, "Closed all listening sockets. Waiting for {} outstanding connections.", current_connections); - else - LOG_INFO(log, "Closed all listening sockets."); - - /// Wait for unfinished backups and restores. - /// This must be done after closing listening sockets (no more backups/restores) but before ProcessList::killAllQueries - /// (because killAllQueries() will cancel all running backups/restores). - if (server_settings.shutdown_wait_backups_and_restores) - global_context->waitAllBackupsAndRestores(); - - /// Killing remaining queries. - if (!server_settings.shutdown_wait_unfinished_queries) - global_context->getProcessList().killAllQueries(); - - if (current_connections) - current_connections = waitServersToFinish(servers, servers_lock, server_settings.shutdown_wait_unfinished); - - if (current_connections) - LOG_WARNING(log, "Closed connections. But {} remain." - " Tip: To increase wait time add to config: 60", current_connections); - else - LOG_INFO(log, "Closed connections."); - + const auto remaining_connections = servers.stopServers(server_settings, servers_lock); dns_cache_updater.reset(); - if (current_connections) + if (remaining_connections) { /// There is no better way to force connections to close in Poco. /// Otherwise connection handlers will continue to live @@ -2221,561 +1881,4 @@ catch (...) return code ? code : -1; } -std::unique_ptr Server::buildProtocolStackFromConfig( - const Poco::Util::AbstractConfiguration & config, - const std::string & protocol, - Poco::Net::HTTPServerParams::Ptr http_params, - AsynchronousMetrics & async_metrics, - bool & is_secure) -{ - auto create_factory = [&](const std::string & type, const std::string & conf_name) -> TCPServerConnectionFactory::Ptr - { - if (type == "tcp") - return TCPServerConnectionFactory::Ptr(new TCPHandlerFactory(*this, false, false, ProfileEvents::InterfaceNativeReceiveBytes, ProfileEvents::InterfaceNativeSendBytes)); - - if (type == "tls") -#if USE_SSL - return TCPServerConnectionFactory::Ptr(new TLSHandlerFactory(*this, conf_name)); -#else - throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "SSL support for TCP protocol is disabled because Poco library was built without NetSSL support."); -#endif - - if (type == "proxy1") - return TCPServerConnectionFactory::Ptr(new ProxyV1HandlerFactory(*this, conf_name)); - if (type == "mysql") - return TCPServerConnectionFactory::Ptr(new MySQLHandlerFactory(*this, ProfileEvents::InterfaceMySQLReceiveBytes, ProfileEvents::InterfaceMySQLSendBytes)); - if (type == "postgres") - return TCPServerConnectionFactory::Ptr(new PostgreSQLHandlerFactory(*this, ProfileEvents::InterfacePostgreSQLReceiveBytes, ProfileEvents::InterfacePostgreSQLSendBytes)); - if (type == "http") - return TCPServerConnectionFactory::Ptr( - new HTTPServerConnectionFactory(httpContext(), http_params, createHandlerFactory(*this, config, async_metrics, "HTTPHandler-factory"), ProfileEvents::InterfaceHTTPReceiveBytes, ProfileEvents::InterfaceHTTPSendBytes) - ); - if (type == "prometheus") - return TCPServerConnectionFactory::Ptr( - new HTTPServerConnectionFactory(httpContext(), http_params, createHandlerFactory(*this, config, async_metrics, "PrometheusHandler-factory"), ProfileEvents::InterfacePrometheusReceiveBytes, ProfileEvents::InterfacePrometheusSendBytes) - ); - if (type == "interserver") - return TCPServerConnectionFactory::Ptr( - new HTTPServerConnectionFactory(httpContext(), http_params, createHandlerFactory(*this, config, async_metrics, "InterserverIOHTTPHandler-factory"), 
ProfileEvents::InterfaceInterserverReceiveBytes, ProfileEvents::InterfaceInterserverSendBytes) - ); - - throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "Protocol configuration error, unknown protocol name '{}'", type); - }; - - std::string conf_name = "protocols." + protocol; - std::string prefix = conf_name + "."; - std::unordered_set pset {conf_name}; - - auto stack = std::make_unique(*this, conf_name); - - while (true) - { - // if there is no "type" - it's a reference to another protocol and this is just an endpoint - if (config.has(prefix + "type")) - { - std::string type = config.getString(prefix + "type"); - if (type == "tls") - { - if (is_secure) - throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "Protocol '{}' contains more than one TLS layer", protocol); - is_secure = true; - } - - stack->append(create_factory(type, conf_name)); - } - - if (!config.has(prefix + "impl")) - break; - - conf_name = "protocols." + config.getString(prefix + "impl"); - prefix = conf_name + "."; - - if (!pset.insert(conf_name).second) - throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "Protocol '{}' configuration contains a loop on '{}'", protocol, conf_name); - } - - return stack; -} - -HTTPContextPtr Server::httpContext() const -{ - return std::make_shared(context()); -} - -void Server::createServers( - Poco::Util::AbstractConfiguration & config, - const Strings & listen_hosts, - bool listen_try, - Poco::ThreadPool & server_pool, - AsynchronousMetrics & async_metrics, - std::vector & servers, - bool start_servers, - const ServerType & server_type) -{ - const Settings & settings = global_context->getSettingsRef(); - - Poco::Net::HTTPServerParams::Ptr http_params = new Poco::Net::HTTPServerParams; - http_params->setTimeout(settings.http_receive_timeout); - http_params->setKeepAliveTimeout(global_context->getServerSettings().keep_alive_timeout); - - Poco::Util::AbstractConfiguration::Keys protocols; - config.keys("protocols", protocols); - - for (const auto & protocol : protocols) - { - if (!server_type.shouldStart(ServerType::Type::CUSTOM, protocol)) - continue; - - std::string prefix = "protocols." 
+ protocol + "."; - std::string port_name = prefix + "port"; - std::string description {" protocol"}; - if (config.has(prefix + "description")) - description = config.getString(prefix + "description"); - - if (!config.has(prefix + "port")) - continue; - - std::vector hosts; - if (config.has(prefix + "host")) - hosts.push_back(config.getString(prefix + "host")); - else - hosts = listen_hosts; - - for (const auto & host : hosts) - { - bool is_secure = false; - auto stack = buildProtocolStackFromConfig(config, protocol, http_params, async_metrics, is_secure); - - if (stack->empty()) - throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "Protocol '{}' stack empty", protocol); - - createServer(config, host, port_name.c_str(), listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter - { - Poco::Net::ServerSocket socket; - auto address = socketBindListen(config, socket, host, port, is_secure); - socket.setReceiveTimeout(settings.receive_timeout); - socket.setSendTimeout(settings.send_timeout); - - return ProtocolServerAdapter( - host, - port_name.c_str(), - description + ": " + address.toString(), - std::make_unique( - stack.release(), - server_pool, - socket, - new Poco::Net::TCPServerParams)); - }); - } - } - - for (const auto & listen_host : listen_hosts) - { - const char * port_name; - - if (server_type.shouldStart(ServerType::Type::HTTP)) - { - /// HTTP - port_name = "http_port"; - createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter - { - Poco::Net::ServerSocket socket; - auto address = socketBindListen(config, socket, listen_host, port); - socket.setReceiveTimeout(settings.http_receive_timeout); - socket.setSendTimeout(settings.http_send_timeout); - - return ProtocolServerAdapter( - listen_host, - port_name, - "http://" + address.toString(), - std::make_unique( - httpContext(), createHandlerFactory(*this, config, async_metrics, "HTTPHandler-factory"), server_pool, socket, http_params, ProfileEvents::InterfaceHTTPReceiveBytes, ProfileEvents::InterfaceHTTPSendBytes)); - }); - } - - if (server_type.shouldStart(ServerType::Type::HTTPS)) - { - /// HTTPS - port_name = "https_port"; - createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter - { -#if USE_SSL - Poco::Net::SecureServerSocket socket; - auto address = socketBindListen(config, socket, listen_host, port, /* secure = */ true); - socket.setReceiveTimeout(settings.http_receive_timeout); - socket.setSendTimeout(settings.http_send_timeout); - return ProtocolServerAdapter( - listen_host, - port_name, - "https://" + address.toString(), - std::make_unique( - httpContext(), createHandlerFactory(*this, config, async_metrics, "HTTPSHandler-factory"), server_pool, socket, http_params, ProfileEvents::InterfaceHTTPReceiveBytes, ProfileEvents::InterfaceHTTPSendBytes)); -#else - UNUSED(port); - throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "HTTPS protocol is disabled because Poco library was built without NetSSL support."); -#endif - }); - } - - if (server_type.shouldStart(ServerType::Type::TCP)) - { - /// TCP - port_name = "tcp_port"; - createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter - { - Poco::Net::ServerSocket socket; - auto address = socketBindListen(config, socket, listen_host, port); - socket.setReceiveTimeout(settings.receive_timeout); - socket.setSendTimeout(settings.send_timeout); - return 
ProtocolServerAdapter( - listen_host, - port_name, - "native protocol (tcp): " + address.toString(), - std::make_unique( - new TCPHandlerFactory(*this, /* secure */ false, /* proxy protocol */ false, ProfileEvents::InterfaceNativeReceiveBytes, ProfileEvents::InterfaceNativeSendBytes), - server_pool, - socket, - new Poco::Net::TCPServerParams)); - }); - } - - if (server_type.shouldStart(ServerType::Type::TCP_WITH_PROXY)) - { - /// TCP with PROXY protocol, see https://github.com/wolfeidau/proxyv2/blob/master/docs/proxy-protocol.txt - port_name = "tcp_with_proxy_port"; - createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter - { - Poco::Net::ServerSocket socket; - auto address = socketBindListen(config, socket, listen_host, port); - socket.setReceiveTimeout(settings.receive_timeout); - socket.setSendTimeout(settings.send_timeout); - return ProtocolServerAdapter( - listen_host, - port_name, - "native protocol (tcp) with PROXY: " + address.toString(), - std::make_unique( - new TCPHandlerFactory(*this, /* secure */ false, /* proxy protocol */ true, ProfileEvents::InterfaceNativeReceiveBytes, ProfileEvents::InterfaceNativeSendBytes), - server_pool, - socket, - new Poco::Net::TCPServerParams)); - }); - } - - if (server_type.shouldStart(ServerType::Type::TCP_SECURE)) - { - /// TCP with SSL - port_name = "tcp_port_secure"; - createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter - { - #if USE_SSL - Poco::Net::SecureServerSocket socket; - auto address = socketBindListen(config, socket, listen_host, port, /* secure = */ true); - socket.setReceiveTimeout(settings.receive_timeout); - socket.setSendTimeout(settings.send_timeout); - return ProtocolServerAdapter( - listen_host, - port_name, - "secure native protocol (tcp_secure): " + address.toString(), - std::make_unique( - new TCPHandlerFactory(*this, /* secure */ true, /* proxy protocol */ false, ProfileEvents::InterfaceNativeReceiveBytes, ProfileEvents::InterfaceNativeSendBytes), - server_pool, - socket, - new Poco::Net::TCPServerParams)); - #else - UNUSED(port); - throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "SSL support for TCP protocol is disabled because Poco library was built without NetSSL support."); - #endif - }); - } - - if (server_type.shouldStart(ServerType::Type::MYSQL)) - { - port_name = "mysql_port"; - createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter - { - Poco::Net::ServerSocket socket; - auto address = socketBindListen(config, socket, listen_host, port, /* secure = */ true); - socket.setReceiveTimeout(Poco::Timespan()); - socket.setSendTimeout(settings.send_timeout); - return ProtocolServerAdapter( - listen_host, - port_name, - "MySQL compatibility protocol: " + address.toString(), - std::make_unique(new MySQLHandlerFactory(*this, ProfileEvents::InterfaceMySQLReceiveBytes, ProfileEvents::InterfaceMySQLSendBytes), server_pool, socket, new Poco::Net::TCPServerParams)); - }); - } - - if (server_type.shouldStart(ServerType::Type::POSTGRESQL)) - { - port_name = "postgresql_port"; - createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter - { - Poco::Net::ServerSocket socket; - auto address = socketBindListen(config, socket, listen_host, port, /* secure = */ true); - socket.setReceiveTimeout(Poco::Timespan()); - socket.setSendTimeout(settings.send_timeout); - return 
ProtocolServerAdapter( - listen_host, - port_name, - "PostgreSQL compatibility protocol: " + address.toString(), - std::make_unique(new PostgreSQLHandlerFactory(*this, ProfileEvents::InterfacePostgreSQLReceiveBytes, ProfileEvents::InterfacePostgreSQLSendBytes), server_pool, socket, new Poco::Net::TCPServerParams)); - }); - } - -#if USE_GRPC - if (server_type.shouldStart(ServerType::Type::GRPC)) - { - port_name = "grpc_port"; - createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter - { - Poco::Net::SocketAddress server_address(listen_host, port); - return ProtocolServerAdapter( - listen_host, - port_name, - "gRPC protocol: " + server_address.toString(), - std::make_unique(*this, makeSocketAddress(listen_host, port, &logger()))); - }); - } -#endif - if (server_type.shouldStart(ServerType::Type::PROMETHEUS)) - { - /// Prometheus (if defined and not setup yet with http_port) - port_name = "prometheus.port"; - createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter - { - Poco::Net::ServerSocket socket; - auto address = socketBindListen(config, socket, listen_host, port); - socket.setReceiveTimeout(settings.http_receive_timeout); - socket.setSendTimeout(settings.http_send_timeout); - return ProtocolServerAdapter( - listen_host, - port_name, - "Prometheus: http://" + address.toString(), - std::make_unique( - httpContext(), createHandlerFactory(*this, config, async_metrics, "PrometheusHandler-factory"), server_pool, socket, http_params, ProfileEvents::InterfacePrometheusReceiveBytes, ProfileEvents::InterfacePrometheusSendBytes)); - }); - } - } -} - -void Server::createInterserverServers( - Poco::Util::AbstractConfiguration & config, - const Strings & interserver_listen_hosts, - bool listen_try, - Poco::ThreadPool & server_pool, - AsynchronousMetrics & async_metrics, - std::vector & servers, - bool start_servers, - const ServerType & server_type) -{ - const Settings & settings = global_context->getSettingsRef(); - - Poco::Net::HTTPServerParams::Ptr http_params = new Poco::Net::HTTPServerParams; - http_params->setTimeout(settings.http_receive_timeout); - http_params->setKeepAliveTimeout(global_context->getServerSettings().keep_alive_timeout); - - /// Now iterate over interserver_listen_hosts - for (const auto & interserver_listen_host : interserver_listen_hosts) - { - const char * port_name; - - if (server_type.shouldStart(ServerType::Type::INTERSERVER_HTTP)) - { - /// Interserver IO HTTP - port_name = "interserver_http_port"; - createServer(config, interserver_listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter - { - Poco::Net::ServerSocket socket; - auto address = socketBindListen(config, socket, interserver_listen_host, port); - socket.setReceiveTimeout(settings.http_receive_timeout); - socket.setSendTimeout(settings.http_send_timeout); - return ProtocolServerAdapter( - interserver_listen_host, - port_name, - "replica communication (interserver): http://" + address.toString(), - std::make_unique( - httpContext(), - createHandlerFactory(*this, config, async_metrics, "InterserverIOHTTPHandler-factory"), - server_pool, - socket, - http_params, - ProfileEvents::InterfaceInterserverReceiveBytes, - ProfileEvents::InterfaceInterserverSendBytes)); - }); - } - - if (server_type.shouldStart(ServerType::Type::INTERSERVER_HTTPS)) - { - port_name = "interserver_https_port"; - createServer(config, interserver_listen_host, 
port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter - { -#if USE_SSL - Poco::Net::SecureServerSocket socket; - auto address = socketBindListen(config, socket, interserver_listen_host, port, /* secure = */ true); - socket.setReceiveTimeout(settings.http_receive_timeout); - socket.setSendTimeout(settings.http_send_timeout); - return ProtocolServerAdapter( - interserver_listen_host, - port_name, - "secure replica communication (interserver): https://" + address.toString(), - std::make_unique( - httpContext(), - createHandlerFactory(*this, config, async_metrics, "InterserverIOHTTPSHandler-factory"), - server_pool, - socket, - http_params, - ProfileEvents::InterfaceInterserverReceiveBytes, - ProfileEvents::InterfaceInterserverSendBytes)); -#else - UNUSED(port); - throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "SSL support for TCP protocol is disabled because Poco library was built without NetSSL support."); -#endif - }); - } - } -} - -void Server::stopServers( - std::vector & servers, - const ServerType & server_type -) const -{ - LoggerRawPtr log = &logger(); - - /// Remove servers once all their connections are closed - auto check_server = [&log](const char prefix[], auto & server) - { - if (!server.isStopping()) - return false; - size_t current_connections = server.currentConnections(); - LOG_DEBUG(log, "Server {}{}: {} ({} connections)", - server.getDescription(), - prefix, - !current_connections ? "finished" : "waiting", - current_connections); - return !current_connections; - }; - - std::erase_if(servers, std::bind_front(check_server, " (from one of previous remove)")); - - for (auto & server : servers) - { - if (!server.isStopping()) - { - const std::string server_port_name = server.getPortName(); - - if (server_type.shouldStop(server_port_name)) - server.stop(); - } - } - - std::erase_if(servers, std::bind_front(check_server, "")); -} - -void Server::updateServers( - Poco::Util::AbstractConfiguration & config, - Poco::ThreadPool & server_pool, - AsynchronousMetrics & async_metrics, - std::vector & servers, - std::vector & servers_to_start_before_tables) -{ - LoggerRawPtr log = &logger(); - - const auto listen_hosts = getListenHosts(config); - const auto interserver_listen_hosts = getInterserverListenHosts(config); - const auto listen_try = getListenTry(config); - - /// Remove servers once all their connections are closed - auto check_server = [&log](const char prefix[], auto & server) - { - if (!server.isStopping()) - return false; - size_t current_connections = server.currentConnections(); - LOG_DEBUG(log, "Server {}{}: {} ({} connections)", - server.getDescription(), - prefix, - !current_connections ? "finished" : "waiting", - current_connections); - return !current_connections; - }; - - std::erase_if(servers, std::bind_front(check_server, " (from one of previous reload)")); - - Poco::Util::AbstractConfiguration & previous_config = latest_config ? 
*latest_config : this->config(); - - std::vector all_servers; - all_servers.reserve(servers.size() + servers_to_start_before_tables.size()); - for (auto & server : servers) - all_servers.push_back(&server); - - for (auto & server : servers_to_start_before_tables) - all_servers.push_back(&server); - - for (auto * server : all_servers) - { - if (!server->isStopping()) - { - std::string port_name = server->getPortName(); - bool has_host = false; - bool is_http = false; - if (port_name.starts_with("protocols.")) - { - std::string protocol = port_name.substr(0, port_name.find_last_of('.')); - has_host = config.has(protocol + ".host"); - - std::string conf_name = protocol; - std::string prefix = protocol + "."; - std::unordered_set pset {conf_name}; - while (true) - { - if (config.has(prefix + "type")) - { - std::string type = config.getString(prefix + "type"); - if (type == "http") - { - is_http = true; - break; - } - } - - if (!config.has(prefix + "impl")) - break; - - conf_name = "protocols." + config.getString(prefix + "impl"); - prefix = conf_name + "."; - - if (!pset.insert(conf_name).second) - throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "Protocol '{}' configuration contains a loop on '{}'", protocol, conf_name); - } - } - else - { - /// NOTE: better to compare using getPortName() over using - /// dynamic_cast<> since HTTPServer is also used for prometheus and - /// internal replication communications. - is_http = server->getPortName() == "http_port" || server->getPortName() == "https_port"; - } - - if (!has_host) - has_host = std::find(listen_hosts.begin(), listen_hosts.end(), server->getListenHost()) != listen_hosts.end(); - bool has_port = !config.getString(port_name, "").empty(); - bool force_restart = is_http && !isSameConfiguration(previous_config, config, "http_handlers"); - if (force_restart) - LOG_TRACE(log, " had been changed, will reload {}", server->getDescription()); - - if (!has_host || !has_port || config.getInt(server->getPortName()) != server->portNumber() || force_restart) - { - server->stop(); - LOG_INFO(log, "Stopped listening for {}", server->getDescription()); - } - } - } - - createServers(config, listen_hosts, listen_try, server_pool, async_metrics, servers, /* start_servers= */ true); - createInterserverServers(config, interserver_listen_hosts, listen_try, server_pool, async_metrics, servers_to_start_before_tables, /* start_servers= */ true); - - std::erase_if(servers, std::bind_front(check_server, "")); - std::erase_if(servers_to_start_before_tables, std::bind_front(check_server, "")); -} - } diff --git a/programs/server/Server.h b/programs/server/Server.h index 3f03dd137ef..b4931ce53d1 100644 --- a/programs/server/Server.h +++ b/programs/server/Server.h @@ -1,15 +1,10 @@ #pragma once #include - #include -#include -#include -#include -#include /** Server provides three interfaces: - * 1. HTTP - simple interface for any applications. + * 1. HTTP, GRPC - simple interfaces for any applications. * 2. TCP - interface for native clickhouse-client and for server to server internal communications. * More rich and efficient, but less compatible * - data is transferred by columns; @@ -18,43 +13,21 @@ * 3. Interserver HTTP - for replication. 
*/ -namespace Poco -{ - namespace Net - { - class ServerSocket; - } -} - namespace DB { -class AsynchronousMetrics; -class ProtocolServerAdapter; class Server : public BaseDaemon, public IServer { public: using ServerApplication::run; - Poco::Util::LayeredConfiguration & config() const override - { - return BaseDaemon::config(); - } + Poco::Util::LayeredConfiguration & config() const override { return BaseDaemon::config(); } - Poco::Logger & logger() const override - { - return BaseDaemon::logger(); - } + Poco::Logger & logger() const override { return BaseDaemon::logger(); } - ContextMutablePtr context() const override - { - return global_context; - } + ContextMutablePtr context() const override { return global_context; } - bool isCancelled() const override - { - return BaseDaemon::isCancelled(); - } + bool isCancelled() const override { return BaseDaemon::isCancelled(); } void defineOptions(Poco::Util::OptionSet & _options) override; @@ -73,64 +46,6 @@ private: ContextMutablePtr global_context; /// Updated/recent config, to compare http_handlers ConfigurationPtr latest_config; - - HTTPContextPtr httpContext() const; - - Poco::Net::SocketAddress socketBindListen( - const Poco::Util::AbstractConfiguration & config, - Poco::Net::ServerSocket & socket, - const std::string & host, - UInt16 port, - [[maybe_unused]] bool secure = false) const; - - std::unique_ptr buildProtocolStackFromConfig( - const Poco::Util::AbstractConfiguration & config, - const std::string & protocol, - Poco::Net::HTTPServerParams::Ptr http_params, - AsynchronousMetrics & async_metrics, - bool & is_secure); - - using CreateServerFunc = std::function; - void createServer( - Poco::Util::AbstractConfiguration & config, - const std::string & listen_host, - const char * port_name, - bool listen_try, - bool start_server, - std::vector & servers, - CreateServerFunc && func) const; - - void createServers( - Poco::Util::AbstractConfiguration & config, - const Strings & listen_hosts, - bool listen_try, - Poco::ThreadPool & server_pool, - AsynchronousMetrics & async_metrics, - std::vector & servers, - bool start_servers = false, - const ServerType & server_type = ServerType(ServerType::Type::QUERIES_ALL)); - - void createInterserverServers( - Poco::Util::AbstractConfiguration & config, - const Strings & interserver_listen_hosts, - bool listen_try, - Poco::ThreadPool & server_pool, - AsynchronousMetrics & async_metrics, - std::vector & servers, - bool start_servers = false, - const ServerType & server_type = ServerType(ServerType::Type::QUERIES_ALL)); - - void updateServers( - Poco::Util::AbstractConfiguration & config, - Poco::ThreadPool & server_pool, - AsynchronousMetrics & async_metrics, - std::vector & servers, - std::vector & servers_to_start_before_tables); - - void stopServers( - std::vector & servers, - const ServerType & server_type - ) const; }; } diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 4e8946facda..826204111a0 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -234,6 +234,7 @@ add_object_library(clickhouse_client Client) add_object_library(clickhouse_bridge BridgeHelper) add_object_library(clickhouse_server Server) add_object_library(clickhouse_server_http Server/HTTP) +add_object_library(clickhouse_server_manager Server/ServersManager) add_object_library(clickhouse_formats Formats) add_object_library(clickhouse_processors Processors) add_object_library(clickhouse_processors_executors Processors/Executors) diff --git a/src/Server/ServersManager/IServersManager.cpp 
b/src/Server/ServersManager/IServersManager.cpp new file mode 100644 index 00000000000..c903d90f766 --- /dev/null +++ b/src/Server/ServersManager/IServersManager.cpp @@ -0,0 +1,268 @@ +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ +extern const int NETWORK_ERROR; +extern const int INVALID_CONFIG_PARAMETER; +} + +IServersManager::IServersManager(ContextMutablePtr l_global_context, Poco::Logger * l_logger) + : global_context(l_global_context), logger(l_logger) +{ +} + + +bool IServersManager::empty() const +{ + return servers.empty(); +} + +std::vector IServersManager::getMetrics() const +{ + std::vector metrics; + metrics.reserve(servers.size()); + for (const auto & server : servers) + metrics.emplace_back(ProtocolServerMetrics{server.getPortName(), server.currentThreads()}); + return metrics; +} + +void IServersManager::startServers() +{ + for (auto & server : servers) + { + server.start(); + LOG_INFO(logger, "Listening for {}", server.getDescription()); + } +} + +void IServersManager::stopServers(const ServerType & server_type) +{ + /// Remove servers once all their connections are closed + auto check_server = [&](const char prefix[], auto & server) + { + if (!server.isStopping()) + return false; + size_t current_connections = server.currentConnections(); + LOG_DEBUG( + logger, + "Server {}{}: {} ({} connections)", + server.getDescription(), + prefix, + !current_connections ? "finished" : "waiting", + current_connections); + return !current_connections; + }; + + std::erase_if(servers, std::bind_front(check_server, " (from one of previous remove)")); + + for (auto & server : servers) + { + if (!server.isStopping() && server_type.shouldStop(server.getPortName())) + server.stop(); + } + + std::erase_if(servers, std::bind_front(check_server, "")); +} + +void IServersManager::updateServers( + const Poco::Util::AbstractConfiguration & config, + IServer & iserver, + std::mutex & servers_lock, + Poco::ThreadPool & server_pool, + AsynchronousMetrics & async_metrics, + ConfigurationPtr latest_config) +{ + stopServersForUpdate(config, latest_config); + createServers(config, iserver, servers_lock, server_pool, async_metrics, true, ServerType(ServerType::Type::QUERIES_ALL)); +} + +Poco::Net::SocketAddress IServersManager::socketBindListen( + const Poco::Util::AbstractConfiguration & config, Poco::Net::ServerSocket & socket, const std::string & host, UInt16 port) const +{ + auto address = makeSocketAddress(host, port, logger); + socket.bind(address, /* reuseAddress = */ true, /* reusePort = */ config.getBool("listen_reuse_port", false)); + /// If caller requests any available port from the OS, discover it after binding. + if (port == 0) + { + address = socket.address(); + LOG_DEBUG(logger, "Requested any available port (port == 0), actual port is {:d}", address.port()); + } + + socket.listen(/* backlog = */ config.getUInt("listen_backlog", 4096)); + return address; +} + +void IServersManager::createServer( + const Poco::Util::AbstractConfiguration & config, + const std::string & listen_host, + const char * port_name, + CreateServerFunc && func, + bool start_server) +{ + /// For testing purposes, user may omit tcp_port or http_port or https_port in configuration file. 
+ if (config.getString(port_name, "").empty()) + return; + + /// If we already have an active server for this listen_host/port_name, don't create it again + for (const auto & server : servers) + { + if (!server.isStopping() && server.getListenHost() == listen_host && server.getPortName() == port_name) + return; + } + + auto port = config.getInt(port_name); + try + { + servers.push_back(func(port)); + if (start_server) + { + servers.back().start(); + LOG_INFO(logger, "Listening for {}", servers.back().getDescription()); + } + global_context->registerServerPort(port_name, port); + } + catch (const Poco::Exception &) + { + if (!getListenTry(config)) + { + throw Exception(ErrorCodes::NETWORK_ERROR, "Listen [{}]:{} failed: {}", listen_host, port, getCurrentExceptionMessage(false)); + } + LOG_WARNING( + logger, + "Listen [{}]:{} failed: {}. If it is an IPv6 or IPv4 address and your host has disabled IPv6 or IPv4, " + "then consider to " + "specify not disabled IPv4 or IPv6 address to listen in element of configuration " + "file. Example for disabled IPv6: 0.0.0.0 ." + " Example for disabled IPv4: ::", + listen_host, + port, + getCurrentExceptionMessage(false)); + } +} + +void IServersManager::stopServersForUpdate(const Poco::Util::AbstractConfiguration & config, ConfigurationPtr latest_config) +{ + /// Remove servers once all their connections are closed + auto check_server = [&](const char prefix[], auto & server) + { + if (!server.isStopping()) + return false; + size_t current_connections = server.currentConnections(); + LOG_DEBUG( + logger, + "Server {}{}: {} ({} connections)", + server.getDescription(), + prefix, + !current_connections ? "finished" : "waiting", + current_connections); + return !current_connections; + }; + + std::erase_if(servers, std::bind_front(check_server, " (from one of previous reload)")); + + const auto listen_hosts = getListenHosts(config); + const Poco::Util::AbstractConfiguration & previous_config = latest_config ? *latest_config : config; + + for (auto & server : servers) + { + if (server.isStopping()) + return; + std::string port_name = server.getPortName(); + bool has_host = false; + bool is_http = false; + if (port_name.starts_with("protocols.")) + { + std::string protocol = port_name.substr(0, port_name.find_last_of('.')); + has_host = config.has(protocol + ".host"); + + std::string conf_name = protocol; + std::string prefix = protocol + "."; + std::unordered_set pset{conf_name}; + while (true) + { + if (config.has(prefix + "type")) + { + std::string type = config.getString(prefix + "type"); + if (type == "http") + { + is_http = true; + break; + } + } + + if (!config.has(prefix + "impl")) + break; + + conf_name = "protocols." + config.getString(prefix + "impl"); + prefix = conf_name + "."; + + if (!pset.insert(conf_name).second) + throw Exception( + ErrorCodes::INVALID_CONFIG_PARAMETER, "Protocol '{}' configuration contains a loop on '{}'", protocol, conf_name); + } + } + else + { + /// NOTE: better to compare using getPortName() over using + /// dynamic_cast<> since HTTPServer is also used for prometheus and + /// internal replication communications. 
+ is_http = server.getPortName() == "http_port" || server.getPortName() == "https_port"; + } + + if (!has_host) + has_host = std::find(listen_hosts.begin(), listen_hosts.end(), server.getListenHost()) != listen_hosts.end(); + bool has_port = !config.getString(port_name, "").empty(); + bool force_restart = is_http && !isSameConfiguration(previous_config, config, "http_handlers"); + if (force_restart) + LOG_TRACE(logger, " had been changed, will reload {}", server.getDescription()); + + if (!has_host || !has_port || config.getInt(server.getPortName()) != server.portNumber() || force_restart) + { + server.stop(); + LOG_INFO(logger, "Stopped listening for {}", server.getDescription()); + } + } + + std::erase_if(servers, std::bind_front(check_server, "")); +} + +Strings IServersManager::getListenHosts(const Poco::Util::AbstractConfiguration & config) const +{ + auto listen_hosts = DB::getMultipleValuesFromConfig(config, "", "listen_host"); + if (listen_hosts.empty()) + { + listen_hosts.emplace_back("::1"); + listen_hosts.emplace_back("127.0.0.1"); + } + return listen_hosts; +} + +bool IServersManager::getListenTry(const Poco::Util::AbstractConfiguration & config) const +{ + bool listen_try = config.getBool("listen_try", false); + if (!listen_try) + { + Poco::Util::AbstractConfiguration::Keys protocols; + config.keys("protocols", protocols); + listen_try = DB::getMultipleValuesFromConfig(config, "", "listen_host").empty() + && std::none_of( + protocols.begin(), + protocols.end(), + [&](const auto & protocol) + { return config.has("protocols." + protocol + ".host") && config.has("protocols." + protocol + ".port"); }); + } + return listen_try; +} + +} diff --git a/src/Server/ServersManager/IServersManager.h b/src/Server/ServersManager/IServersManager.h new file mode 100644 index 00000000000..5218ab63554 --- /dev/null +++ b/src/Server/ServersManager/IServersManager.h @@ -0,0 +1,74 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +class IServersManager +{ +public: + IServersManager(ContextMutablePtr global_context, Poco::Logger * logger); + virtual ~IServersManager() = default; + + bool empty() const; + std::vector getMetrics() const; + + virtual void createServers( + const Poco::Util::AbstractConfiguration & config, + IServer & server, + std::mutex & servers_lock, + Poco::ThreadPool & server_pool, + AsynchronousMetrics & async_metrics, + bool start_servers, + const ServerType & server_type) + = 0; + + virtual void startServers(); + + virtual void stopServers(const ServerType & server_type); + virtual size_t stopServers(const ServerSettings & server_settings, std::mutex & servers_lock) = 0; + + virtual void updateServers( + const Poco::Util::AbstractConfiguration & config, + IServer & server, + std::mutex & servers_lock, + Poco::ThreadPool & server_pool, + AsynchronousMetrics & async_metrics, + ConfigurationPtr latest_config); + +protected: + ContextMutablePtr global_context; + Poco::Logger * logger; + + std::vector servers; + + Poco::Net::SocketAddress socketBindListen( + const Poco::Util::AbstractConfiguration & config, Poco::Net::ServerSocket & socket, const std::string & host, UInt16 port) const; + + using CreateServerFunc = std::function; + virtual void createServer( + const Poco::Util::AbstractConfiguration & config, + const std::string & listen_host, + const char * port_name, + CreateServerFunc && func, + bool start_server); + + virtual void stopServersForUpdate(const 
Poco::Util::AbstractConfiguration & config, ConfigurationPtr latest_config); + + Strings getListenHosts(const Poco::Util::AbstractConfiguration & config) const; + bool getListenTry(const Poco::Util::AbstractConfiguration & config) const; +}; + +} diff --git a/src/Server/ServersManager/InterServersManager.cpp b/src/Server/ServersManager/InterServersManager.cpp new file mode 100644 index 00000000000..28491a4f4f4 --- /dev/null +++ b/src/Server/ServersManager/InterServersManager.cpp @@ -0,0 +1,327 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if USE_SSL +# include +#endif + +#if USE_NURAFT +# include +# include +#endif + +namespace ProfileEvents +{ +extern const Event InterfaceInterserverSendBytes; +extern const Event InterfaceInterserverReceiveBytes; +} + +namespace DB +{ + +namespace ErrorCodes +{ +extern const int SUPPORT_IS_DISABLED; +} + +void InterServersManager::createServers( + const Poco::Util::AbstractConfiguration & config, + IServer & server, + std::mutex & servers_lock, + Poco::ThreadPool & server_pool, + AsynchronousMetrics & async_metrics, + bool start_servers, + const ServerType & server_type) +{ + if (config.has("keeper_server.server_id")) + { +#if USE_NURAFT + //// If we don't have configured connection probably someone trying to use clickhouse-server instead + //// of clickhouse-keeper, so start synchronously. + bool can_initialize_keeper_async = false; + + if (zkutil::hasZooKeeperConfig(config)) /// We have configured connection to some zookeeper cluster + { + /// If we cannot connect to some other node from our cluster then we have to wait our Keeper start + /// synchronously. + can_initialize_keeper_async = global_context->tryCheckClientConnectionToMyKeeperCluster(); + } + /// Initialize keeper RAFT. 
+ global_context->initializeKeeperDispatcher(can_initialize_keeper_async); + FourLetterCommandFactory::registerCommands(*global_context->getKeeperDispatcher()); + + auto config_getter = [this]() -> const Poco::Util::AbstractConfiguration & { return global_context->getConfigRef(); }; + + for (const auto & listen_host : getListenHosts(config)) + { + /// TCP Keeper + constexpr auto port_name = "keeper_server.tcp_port"; + createServer( + config, + listen_host, + port_name, + [&](UInt16 port) -> ProtocolServerAdapter + { + Poco::Net::ServerSocket socket; + auto address = socketBindListen(config, socket, listen_host, port); + socket.setReceiveTimeout( + Poco::Timespan(config.getUInt64("keeper_server.socket_receive_timeout_sec", DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC), 0)); + socket.setSendTimeout( + Poco::Timespan(config.getUInt64("keeper_server.socket_send_timeout_sec", DBMS_DEFAULT_SEND_TIMEOUT_SEC), 0)); + return ProtocolServerAdapter( + listen_host, + port_name, + "Keeper (tcp): " + address.toString(), + std::make_unique( + new KeeperTCPHandlerFactory( + config_getter, + global_context->getKeeperDispatcher(), + global_context->getSettingsRef().receive_timeout.totalSeconds(), + global_context->getSettingsRef().send_timeout.totalSeconds(), + false), + server_pool, + socket)); + }, + /* start_server = */ false); + + constexpr auto secure_port_name = "keeper_server.tcp_port_secure"; + createServer( + config, + listen_host, + secure_port_name, + [&](UInt16 port) -> ProtocolServerAdapter + { +# if USE_SSL + Poco::Net::SecureServerSocket socket; + auto address = socketBindListen(config, socket, listen_host, port); + socket.setReceiveTimeout( + Poco::Timespan(config.getUInt64("keeper_server.socket_receive_timeout_sec", DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC), 0)); + socket.setSendTimeout( + Poco::Timespan(config.getUInt64("keeper_server.socket_send_timeout_sec", DBMS_DEFAULT_SEND_TIMEOUT_SEC), 0)); + return ProtocolServerAdapter( + listen_host, + secure_port_name, + "Keeper with secure protocol (tcp_secure): " + address.toString(), + std::make_unique( + new KeeperTCPHandlerFactory( + config_getter, + global_context->getKeeperDispatcher(), + global_context->getSettingsRef().receive_timeout.totalSeconds(), + global_context->getSettingsRef().send_timeout.totalSeconds(), + true), + server_pool, + socket)); +# else + UNUSED(port); + throw Exception( + ErrorCodes::SUPPORT_IS_DISABLED, + "SSL support for TCP protocol is disabled because Poco library was built without NetSSL support."); +# endif + }, + /* start_server: */ false); + + /// HTTP control endpoints + createServer( + config, + listen_host, + /* port_name = */ "keeper_server.http_control.port", + [&](UInt16 port) -> ProtocolServerAdapter + { + auto http_context = std::make_shared(global_context); + Poco::Timespan keep_alive_timeout(config.getUInt("keep_alive_timeout", 10), 0); + Poco::Net::HTTPServerParams::Ptr http_params = new Poco::Net::HTTPServerParams; + http_params->setTimeout(http_context->getReceiveTimeout()); + http_params->setKeepAliveTimeout(keep_alive_timeout); + + Poco::Net::ServerSocket socket; + auto address = socketBindListen(config, socket, listen_host, port); + socket.setReceiveTimeout(http_context->getReceiveTimeout()); + socket.setSendTimeout(http_context->getSendTimeout()); + return ProtocolServerAdapter( + listen_host, + port_name, + "HTTP Control: http://" + address.toString(), + std::make_unique( + std::move(http_context), + createKeeperHTTPControlMainHandlerFactory( + config_getter(), global_context->getKeeperDispatcher(), 
"KeeperHTTPControlHandler-factory"), + server_pool, + socket, + http_params)); + }, + /* start_server: */ false); + } +#else + throw Exception( + ErrorCodes::SUPPORT_IS_DISABLED, "ClickHouse server built without NuRaft library. Cannot use internal coordination."); +#endif + } + + { + std::lock_guard lock(servers_lock); + /// We should start interserver communications before (and more important shutdown after) tables. + /// Because server can wait for a long-running queries (for example in tcp_handler) after interserver handler was already shut down. + /// In this case we will have replicated tables which are unable to send any parts to other replicas, but still can + /// communicate with zookeeper, execute merges, etc. + createInterserverServers(config, server, server_pool, async_metrics, start_servers, server_type); + startServers(); + } +} + +size_t InterServersManager::stopServers(const ServerSettings & server_settings, std::mutex & servers_lock) +{ + if (servers.empty()) + { + return 0; + } + + LOG_DEBUG(logger, "Waiting for current connections to servers for tables to finish."); + + size_t current_connections = 0; + { + std::lock_guard lock(servers_lock); + for (auto & server : servers) + { + server.stop(); + current_connections += server.currentConnections(); + } + } + + if (current_connections) + LOG_INFO(logger, "Closed all listening sockets. Waiting for {} outstanding connections.", current_connections); + else + LOG_INFO(logger, "Closed all listening sockets."); + + if (current_connections > 0) + current_connections = waitServersToFinish(servers, servers_lock, server_settings.shutdown_wait_unfinished); + + if (current_connections) + LOG_INFO( + logger, + "Closed connections to servers for tables. But {} remain. Probably some tables of other users cannot finish their connections " + "after context shutdown.", + current_connections); + else + LOG_INFO(logger, "Closed connections to servers for tables."); + return current_connections; +} + +void InterServersManager::updateServers( + const Poco::Util::AbstractConfiguration & config, + IServer & iserver, + std::mutex & /*servers_lock*/, + Poco::ThreadPool & server_pool, + AsynchronousMetrics & async_metrics, + ConfigurationPtr latest_config) +{ + stopServersForUpdate(config, latest_config); + createInterserverServers(config, iserver, server_pool, async_metrics, true, ServerType(ServerType::Type::QUERIES_ALL)); +} + +Strings InterServersManager::getInterserverListenHosts(const Poco::Util::AbstractConfiguration & config) const +{ + auto interserver_listen_hosts = DB::getMultipleValuesFromConfig(config, "", "interserver_listen_host"); + if (!interserver_listen_hosts.empty()) + return interserver_listen_hosts; + + /// Use more general restriction in case of emptiness + return getListenHosts(config); +} + +void InterServersManager::createInterserverServers( + const Poco::Util::AbstractConfiguration & config, + IServer & server, + Poco::ThreadPool & server_pool, + AsynchronousMetrics & async_metrics, + bool start_servers, + const ServerType & server_type) +{ + const Settings & settings = global_context->getSettingsRef(); + + Poco::Net::HTTPServerParams::Ptr http_params = new Poco::Net::HTTPServerParams; + http_params->setTimeout(settings.http_receive_timeout); + http_params->setKeepAliveTimeout(global_context->getServerSettings().keep_alive_timeout); + + /// Now iterate over interserver_listen_hosts + for (const auto & interserver_listen_host : getInterserverListenHosts(config)) + { + if 
(server_type.shouldStart(ServerType::Type::INTERSERVER_HTTP)) + { + /// Interserver IO HTTP + constexpr auto port_name = "interserver_http_port"; + createServer( + config, + interserver_listen_host, + port_name, + [&](UInt16 port) -> ProtocolServerAdapter + { + Poco::Net::ServerSocket socket; + auto address = socketBindListen(config, socket, interserver_listen_host, port); + socket.setReceiveTimeout(settings.http_receive_timeout); + socket.setSendTimeout(settings.http_send_timeout); + return ProtocolServerAdapter( + interserver_listen_host, + port_name, + "replica communication (interserver): http://" + address.toString(), + std::make_unique( + std::make_shared(global_context), + createHandlerFactory(server, config, async_metrics, "InterserverIOHTTPHandler-factory"), + server_pool, + socket, + http_params, + ProfileEvents::InterfaceInterserverReceiveBytes, + ProfileEvents::InterfaceInterserverSendBytes)); + }, + start_servers); + } + + if (server_type.shouldStart(ServerType::Type::INTERSERVER_HTTPS)) + { + constexpr auto port_name = "interserver_https_port"; + createServer( + config, + interserver_listen_host, + port_name, + [&](UInt16 port) -> ProtocolServerAdapter + { +#if USE_SSL + Poco::Net::SecureServerSocket socket; + auto address = socketBindListen(config, socket, interserver_listen_host, port); + socket.setReceiveTimeout(settings.http_receive_timeout); + socket.setSendTimeout(settings.http_send_timeout); + return ProtocolServerAdapter( + interserver_listen_host, + port_name, + "secure replica communication (interserver): https://" + address.toString(), + std::make_unique( + std::make_shared(global_context), + createHandlerFactory(server, config, async_metrics, "InterserverIOHTTPSHandler-factory"), + server_pool, + socket, + http_params, + ProfileEvents::InterfaceInterserverReceiveBytes, + ProfileEvents::InterfaceInterserverSendBytes)); +#else + UNUSED(port); + throw Exception( + ErrorCodes::SUPPORT_IS_DISABLED, + "SSL support for TCP protocol is disabled because Poco library was built without NetSSL support."); +#endif + }, + start_servers); + } + } +} + +} diff --git a/src/Server/ServersManager/InterServersManager.h b/src/Server/ServersManager/InterServersManager.h new file mode 100644 index 00000000000..2a389e28c22 --- /dev/null +++ b/src/Server/ServersManager/InterServersManager.h @@ -0,0 +1,45 @@ +#pragma once + +#include + +namespace DB +{ + +class InterServersManager : public IServersManager +{ +public: + using IServersManager::IServersManager; + + void createServers( + const Poco::Util::AbstractConfiguration & config, + IServer & server, + std::mutex & servers_lock, + Poco::ThreadPool & server_pool, + AsynchronousMetrics & async_metrics, + bool start_servers, + const ServerType & server_type) override; + + using IServersManager::stopServers; + size_t stopServers(const ServerSettings & server_settings, std::mutex & servers_lock) override; + + void updateServers( + const Poco::Util::AbstractConfiguration & config, + IServer & iserver, + std::mutex & servers_lock, + Poco::ThreadPool & server_pool, + AsynchronousMetrics & async_metrics, + ConfigurationPtr latest_config) override; + +private: + Strings getInterserverListenHosts(const Poco::Util::AbstractConfiguration & config) const; + + void createInterserverServers( + const Poco::Util::AbstractConfiguration & config, + IServer & server, + Poco::ThreadPool & server_pool, + AsynchronousMetrics & async_metrics, + bool start_servers, + const ServerType & server_type); +}; + +} diff --git 
a/src/Server/ServersManager/ProtocolServersManager.cpp b/src/Server/ServersManager/ProtocolServersManager.cpp new file mode 100644 index 00000000000..17b028eddbb --- /dev/null +++ b/src/Server/ServersManager/ProtocolServersManager.cpp @@ -0,0 +1,523 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if USE_SSL +# include +#endif + +#if USE_GRPC +# include +#endif + +namespace ProfileEvents +{ +extern const Event InterfaceNativeSendBytes; +extern const Event InterfaceNativeReceiveBytes; +extern const Event InterfaceHTTPSendBytes; +extern const Event InterfaceHTTPReceiveBytes; +extern const Event InterfacePrometheusSendBytes; +extern const Event InterfacePrometheusReceiveBytes; +extern const Event InterfaceMySQLSendBytes; +extern const Event InterfaceMySQLReceiveBytes; +extern const Event InterfacePostgreSQLSendBytes; +extern const Event InterfacePostgreSQLReceiveBytes; +extern const Event InterfaceInterserverSendBytes; +extern const Event InterfaceInterserverReceiveBytes; +} + +namespace DB +{ + +namespace ErrorCodes +{ +extern const int SUPPORT_IS_DISABLED; +extern const int INVALID_CONFIG_PARAMETER; +} + +void ProtocolServersManager::createServers( + const Poco::Util::AbstractConfiguration & config, + IServer & server, + std::mutex & /*servers_lock*/, + Poco::ThreadPool & server_pool, + AsynchronousMetrics & async_metrics, + bool start_servers, + const ServerType & server_type) +{ + auto listen_hosts = getListenHosts(config); + const Settings & settings = global_context->getSettingsRef(); + + Poco::Net::HTTPServerParams::Ptr http_params = new Poco::Net::HTTPServerParams; + http_params->setTimeout(settings.http_receive_timeout); + http_params->setKeepAliveTimeout(global_context->getServerSettings().keep_alive_timeout); + + Poco::Util::AbstractConfiguration::Keys protocols; + config.keys("protocols", protocols); + + for (const auto & protocol : protocols) + { + if (!server_type.shouldStart(ServerType::Type::CUSTOM, protocol)) + continue; + + std::string prefix = "protocols." 
+ protocol + "."; + std::string port_name = prefix + "port"; + std::string description{" protocol"}; + if (config.has(prefix + "description")) + description = config.getString(prefix + "description"); + + if (!config.has(prefix + "port")) + continue; + + std::vector hosts; + if (config.has(prefix + "host")) + hosts.push_back(config.getString(prefix + "host")); + else + hosts = listen_hosts; + + for (const auto & host : hosts) + { + bool is_secure = false; + auto stack = buildProtocolStackFromConfig(config, server, protocol, http_params, async_metrics, is_secure); + + if (stack->empty()) + throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "Protocol '{}' stack empty", protocol); + + createServer( + config, + host, + port_name.c_str(), + [&](UInt16 port) -> ProtocolServerAdapter + { + Poco::Net::ServerSocket socket; + auto address = socketBindListen(config, socket, host, port); + socket.setReceiveTimeout(settings.receive_timeout); + socket.setSendTimeout(settings.send_timeout); + return ProtocolServerAdapter( + host, + port_name.c_str(), + description + ": " + address.toString(), + std::make_unique(stack.release(), server_pool, socket, new Poco::Net::TCPServerParams)); + }, + start_servers); + } + } + + for (const auto & listen_host : listen_hosts) + { + if (server_type.shouldStart(ServerType::Type::HTTP)) + { + /// HTTP + constexpr auto port_name = "http_port"; + createServer( + config, + listen_host, + port_name, + [&](UInt16 port) -> ProtocolServerAdapter + { + Poco::Net::ServerSocket socket; + auto address = socketBindListen(config, socket, listen_host, port); + socket.setReceiveTimeout(settings.http_receive_timeout); + socket.setSendTimeout(settings.http_send_timeout); + return ProtocolServerAdapter( + listen_host, + port_name, + "http://" + address.toString(), + std::make_unique( + std::make_shared(global_context), + createHandlerFactory(server, config, async_metrics, "HTTPHandler-factory"), + server_pool, + socket, + http_params, + ProfileEvents::InterfaceHTTPReceiveBytes, + ProfileEvents::InterfaceHTTPSendBytes)); + }, + start_servers); + } + + if (server_type.shouldStart(ServerType::Type::HTTPS)) + { + /// HTTPS + constexpr auto port_name = "https_port"; + createServer( + config, + listen_host, + port_name, + [&](UInt16 port) -> ProtocolServerAdapter + { +#if USE_SSL + Poco::Net::SecureServerSocket socket; + auto address = socketBindListen(config, socket, listen_host, port); + socket.setReceiveTimeout(settings.http_receive_timeout); + socket.setSendTimeout(settings.http_send_timeout); + return ProtocolServerAdapter( + listen_host, + port_name, + "https://" + address.toString(), + std::make_unique( + std::make_shared(global_context), + createHandlerFactory(server, config, async_metrics, "HTTPSHandler-factory"), + server_pool, + socket, + http_params, + ProfileEvents::InterfaceHTTPReceiveBytes, + ProfileEvents::InterfaceHTTPSendBytes)); +#else + UNUSED(port); + throw Exception( + ErrorCodes::SUPPORT_IS_DISABLED, + "HTTPS protocol is disabled because Poco library was built without NetSSL support."); +#endif + }, + start_servers); + } + + if (server_type.shouldStart(ServerType::Type::TCP)) + { + /// TCP + constexpr auto port_name = "tcp_port"; + createServer( + config, + listen_host, + port_name, + [&](UInt16 port) -> ProtocolServerAdapter + { + Poco::Net::ServerSocket socket; + auto address = socketBindListen(config, socket, listen_host, port); + socket.setReceiveTimeout(settings.receive_timeout); + socket.setSendTimeout(settings.send_timeout); + return ProtocolServerAdapter( + 
listen_host, + port_name, + "native protocol (tcp): " + address.toString(), + std::make_unique( + new TCPHandlerFactory( + server, false, false, ProfileEvents::InterfaceNativeReceiveBytes, ProfileEvents::InterfaceNativeSendBytes), + server_pool, + socket, + new Poco::Net::TCPServerParams)); + }, + start_servers); + } + + if (server_type.shouldStart(ServerType::Type::TCP_WITH_PROXY)) + { + /// TCP with PROXY protocol, see https://github.com/wolfeidau/proxyv2/blob/master/docs/proxy-protocol.txt + constexpr auto port_name = "tcp_with_proxy_port"; + createServer( + config, + listen_host, + port_name, + [&](UInt16 port) -> ProtocolServerAdapter + { + Poco::Net::ServerSocket socket; + auto address = socketBindListen(config, socket, listen_host, port); + socket.setReceiveTimeout(settings.receive_timeout); + socket.setSendTimeout(settings.send_timeout); + return ProtocolServerAdapter( + listen_host, + port_name, + "native protocol (tcp) with PROXY: " + address.toString(), + std::make_unique( + new TCPHandlerFactory( + server, false, true, ProfileEvents::InterfaceNativeReceiveBytes, ProfileEvents::InterfaceNativeSendBytes), + server_pool, + socket, + new Poco::Net::TCPServerParams)); + }, + start_servers); + } + + if (server_type.shouldStart(ServerType::Type::TCP_SECURE)) + { + /// TCP with SSL + constexpr auto port_name = "tcp_port_secure"; + createServer( + config, + listen_host, + port_name, + [&](UInt16 port) -> ProtocolServerAdapter + { +#if USE_SSL + Poco::Net::SecureServerSocket socket; + auto address = socketBindListen(config, socket, listen_host, port); + socket.setReceiveTimeout(settings.receive_timeout); + socket.setSendTimeout(settings.send_timeout); + return ProtocolServerAdapter( + listen_host, + port_name, + "secure native protocol (tcp_secure): " + address.toString(), + std::make_unique( + new TCPHandlerFactory( + server, true, false, ProfileEvents::InterfaceNativeReceiveBytes, ProfileEvents::InterfaceNativeSendBytes), + server_pool, + socket, + new Poco::Net::TCPServerParams)); +#else + UNUSED(port); + throw Exception( + ErrorCodes::SUPPORT_IS_DISABLED, + "SSL support for TCP protocol is disabled because Poco library was built without NetSSL support."); +#endif + }, + start_servers); + } + + if (server_type.shouldStart(ServerType::Type::MYSQL)) + { + constexpr auto port_name = "mysql_port"; + createServer( + config, + listen_host, + port_name, + [&](UInt16 port) -> ProtocolServerAdapter + { + Poco::Net::ServerSocket socket; + auto address = socketBindListen(config, socket, listen_host, port); + socket.setReceiveTimeout(Poco::Timespan()); + socket.setSendTimeout(settings.send_timeout); + return ProtocolServerAdapter( + listen_host, + port_name, + "MySQL compatibility protocol: " + address.toString(), + std::make_unique( + new MySQLHandlerFactory( + server, ProfileEvents::InterfaceMySQLReceiveBytes, ProfileEvents::InterfaceMySQLSendBytes), + server_pool, + socket, + new Poco::Net::TCPServerParams)); + }, + start_servers); + } + + if (server_type.shouldStart(ServerType::Type::POSTGRESQL)) + { + constexpr auto port_name = "postgresql_port"; + createServer( + config, + listen_host, + port_name, + [&](UInt16 port) -> ProtocolServerAdapter + { + Poco::Net::ServerSocket socket; + auto address = socketBindListen(config, socket, listen_host, port); + socket.setReceiveTimeout(Poco::Timespan()); + socket.setSendTimeout(settings.send_timeout); + return ProtocolServerAdapter( + listen_host, + port_name, + "PostgreSQL compatibility protocol: " + address.toString(), + std::make_unique( + new 
PostgreSQLHandlerFactory( + server, ProfileEvents::InterfacePostgreSQLReceiveBytes, ProfileEvents::InterfacePostgreSQLSendBytes), + server_pool, + socket, + new Poco::Net::TCPServerParams)); + }, + start_servers); + } + +#if USE_GRPC + if (server_type.shouldStart(ServerType::Type::GRPC)) + { + constexpr auto port_name = "grpc_port"; + createServer( + config, + listen_host, + port_name, + [&](UInt16 port) -> ProtocolServerAdapter + { + Poco::Net::SocketAddress server_address(listen_host, port); + return ProtocolServerAdapter( + listen_host, + port_name, + "gRPC protocol: " + server_address.toString(), + std::make_unique(server, makeSocketAddress(listen_host, port, logger))); + }, + start_servers); + } +#endif + if (server_type.shouldStart(ServerType::Type::PROMETHEUS)) + { + /// Prometheus (if defined and not setup yet with http_port) + constexpr auto port_name = "prometheus.port"; + createServer( + config, + listen_host, + port_name, + [&](UInt16 port) -> ProtocolServerAdapter + { + Poco::Net::ServerSocket socket; + auto address = socketBindListen(config, socket, listen_host, port); + socket.setReceiveTimeout(settings.http_receive_timeout); + socket.setSendTimeout(settings.http_send_timeout); + return ProtocolServerAdapter( + listen_host, + port_name, + "Prometheus: http://" + address.toString(), + std::make_unique( + std::make_shared(global_context), + createHandlerFactory(server, config, async_metrics, "PrometheusHandler-factory"), + server_pool, + socket, + http_params, + ProfileEvents::InterfacePrometheusReceiveBytes, + ProfileEvents::InterfacePrometheusSendBytes)); + }, + start_servers); + } + } +} + +size_t ProtocolServersManager::stopServers(const ServerSettings & server_settings, std::mutex & servers_lock) +{ + if (servers.empty()) + { + return 0; + } + + LOG_DEBUG(logger, "Waiting for current connections to close."); + + size_t current_connections = 0; + { + std::lock_guard lock(servers_lock); + for (auto & server : servers) + { + server.stop(); + current_connections += server.currentConnections(); + } + } + + if (current_connections) + LOG_WARNING(logger, "Closed all listening sockets. Waiting for {} outstanding connections.", current_connections); + else + LOG_INFO(logger, "Closed all listening sockets."); + + /// Wait for unfinished backups and restores. + /// This must be done after closing listening sockets (no more backups/restores) but before ProcessList::killAllQueries + /// (because killAllQueries() will cancel all running backups/restores). + if (server_settings.shutdown_wait_backups_and_restores) + global_context->waitAllBackupsAndRestores(); + /// Killing remaining queries. + if (!server_settings.shutdown_wait_unfinished_queries) + global_context->getProcessList().killAllQueries(); + + if (current_connections) + current_connections = waitServersToFinish(servers, servers_lock, server_settings.shutdown_wait_unfinished); + + if (current_connections) + LOG_WARNING( + logger, + "Closed connections. But {} remain." 
+ " Tip: To increase wait time add to config: 60", + current_connections); + else + LOG_INFO(logger, "Closed connections."); + return current_connections; +} + +std::unique_ptr ProtocolServersManager::buildProtocolStackFromConfig( + const Poco::Util::AbstractConfiguration & config, + IServer & server, + const std::string & protocol, + Poco::Net::HTTPServerParams::Ptr http_params, + AsynchronousMetrics & async_metrics, + bool & is_secure) const +{ + auto create_factory = [&](const std::string & type, const std::string & conf_name) -> TCPServerConnectionFactory::Ptr + { + if (type == "tcp") + return TCPServerConnectionFactory::Ptr(new TCPHandlerFactory( + server, false, false, ProfileEvents::InterfaceNativeReceiveBytes, ProfileEvents::InterfaceNativeSendBytes)); + + if (type == "tls") +#if USE_SSL + return TCPServerConnectionFactory::Ptr(new TLSHandlerFactory(server, conf_name)); +#else + throw Exception( + ErrorCodes::SUPPORT_IS_DISABLED, + "SSL support for TCP protocol is disabled because Poco library was built without NetSSL support."); +#endif + + if (type == "proxy1") + return TCPServerConnectionFactory::Ptr(new ProxyV1HandlerFactory(server, conf_name)); + if (type == "mysql") + return TCPServerConnectionFactory::Ptr( + new MySQLHandlerFactory(server, ProfileEvents::InterfaceMySQLReceiveBytes, ProfileEvents::InterfaceMySQLSendBytes)); + if (type == "postgres") + return TCPServerConnectionFactory::Ptr(new PostgreSQLHandlerFactory( + server, ProfileEvents::InterfacePostgreSQLReceiveBytes, ProfileEvents::InterfacePostgreSQLSendBytes)); + if (type == "http") + return TCPServerConnectionFactory::Ptr(new HTTPServerConnectionFactory( + std::make_shared(global_context), + http_params, + createHandlerFactory(server, config, async_metrics, "HTTPHandler-factory"), + ProfileEvents::InterfaceHTTPReceiveBytes, + ProfileEvents::InterfaceHTTPSendBytes)); + if (type == "prometheus") + return TCPServerConnectionFactory::Ptr(new HTTPServerConnectionFactory( + std::make_shared(global_context), + http_params, + createHandlerFactory(server, config, async_metrics, "PrometheusHandler-factory"), + ProfileEvents::InterfacePrometheusReceiveBytes, + ProfileEvents::InterfacePrometheusSendBytes)); + if (type == "interserver") + return TCPServerConnectionFactory::Ptr(new HTTPServerConnectionFactory( + std::make_shared(global_context), + http_params, + createHandlerFactory(server, config, async_metrics, "InterserverIOHTTPHandler-factory"), + ProfileEvents::InterfaceInterserverReceiveBytes, + ProfileEvents::InterfaceInterserverSendBytes)); + + throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "Protocol configuration error, unknown protocol name '{}'", type); + }; + + std::string conf_name = "protocols." + protocol; + std::string prefix = conf_name + "."; + std::unordered_set pset{conf_name}; + + auto stack = std::make_unique(server, conf_name); + + while (true) + { + // if there is no "type" - it's a reference to another protocol and this is just an endpoint + if (config.has(prefix + "type")) + { + std::string type = config.getString(prefix + "type"); + if (type == "tls") + { + if (is_secure) + throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "Protocol '{}' contains more than one TLS layer", protocol); + is_secure = true; + } + + stack->append(create_factory(type, conf_name)); + } + + if (!config.has(prefix + "impl")) + break; + + conf_name = "protocols." 
+ config.getString(prefix + "impl"); + prefix = conf_name + "."; + + if (!pset.insert(conf_name).second) + throw Exception( + ErrorCodes::INVALID_CONFIG_PARAMETER, "Protocol '{}' configuration contains a loop on '{}'", protocol, conf_name); + } + + return stack; +} + +} diff --git a/src/Server/ServersManager/ProtocolServersManager.h b/src/Server/ServersManager/ProtocolServersManager.h new file mode 100644 index 00000000000..e9eaaeb2184 --- /dev/null +++ b/src/Server/ServersManager/ProtocolServersManager.h @@ -0,0 +1,37 @@ +#pragma once + +#include +#include +#include + +namespace DB +{ + +class ProtocolServersManager : public IServersManager +{ +public: + using IServersManager::IServersManager; + + void createServers( + const Poco::Util::AbstractConfiguration & config, + IServer & server, + std::mutex & servers_lock, + Poco::ThreadPool & server_pool, + AsynchronousMetrics & async_metrics, + bool start_servers, + const ServerType & server_type) override; + + using IServersManager::stopServers; + size_t stopServers(const ServerSettings & server_settings, std::mutex & servers_lock) override; + +private: + std::unique_ptr buildProtocolStackFromConfig( + const Poco::Util::AbstractConfiguration & config, + IServer & server, + const std::string & protocol, + Poco::Net::HTTPServerParams::Ptr http_params, + AsynchronousMetrics & async_metrics, + bool & is_secure) const; +}; + +} From 27627f603fcfcd6df06bfb5210463c1fff8763c6 Mon Sep 17 00:00:00 2001 From: jsc0218 Date: Fri, 24 May 2024 03:04:36 +0000 Subject: [PATCH 317/392] fix --- .../0_stateless/02319_lightweight_delete_on_merge_tree.sql | 2 +- tests/queries/0_stateless/02792_drop_projection_lwd.sql | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.sql b/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.sql index 050b8e37722..f82f79dbe44 100644 --- a/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.sql +++ b/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.sql @@ -102,7 +102,7 @@ ALTER TABLE t_proj ADD PROJECTION p_1 (SELECT avg(a), avg(b), count()) SETTINGS INSERT INTO t_proj SELECT number + 1, number + 1 FROM numbers(1000); -DELETE FROM t_proj WHERE a < 100; -- { serverError BAD_ARGUMENTS } +DELETE FROM t_proj WHERE a < 100; -- { serverError NOT_IMPLEMENTED } SELECT avg(a), avg(b), count() FROM t_proj; diff --git a/tests/queries/0_stateless/02792_drop_projection_lwd.sql b/tests/queries/0_stateless/02792_drop_projection_lwd.sql index a1d8a9c90f3..dcde7dcc600 100644 --- a/tests/queries/0_stateless/02792_drop_projection_lwd.sql +++ b/tests/queries/0_stateless/02792_drop_projection_lwd.sql @@ -7,7 +7,7 @@ CREATE TABLE t_projections_lwd (a UInt32, b UInt32, PROJECTION p (SELECT * ORDER INSERT INTO t_projections_lwd SELECT number, number FROM numbers(100); -- LWD does not work, as expected -DELETE FROM t_projections_lwd WHERE a = 1; -- { serverError BAD_ARGUMENTS } +DELETE FROM t_projections_lwd WHERE a = 1; -- { serverError NOT_IMPLEMENTED } KILL MUTATION WHERE database = currentDatabase() AND table = 't_projections_lwd' SYNC FORMAT Null; -- drop projection From 029e2ea22624f067d546317faab02f189b143df8 Mon Sep 17 00:00:00 2001 From: Blargian Date: Fri, 24 May 2024 05:54:16 +0200 Subject: [PATCH 318/392] Standardize references to data type docs --- .../functions/arithmetic-functions.md | 32 +- .../functions/array-functions.md | 98 +++--- .../sql-reference/functions/bit-functions.md | 20 +- 
 .../functions/bitmap-functions.md             |  14 +-
 .../functions/date-time-functions.md          | 226 ++++++-------
 .../functions/distance-functions.md           |  78 ++---
 .../functions/encoding-functions.md           |  50 +--
 .../functions/encryption-functions.md         |  44 +--
 .../functions/ext-dict-functions.md           |  32 +-
 docs/en/sql-reference/functions/files.md      |   2 +-
 .../functions/functions-for-nulls.md          |   2 +-
 .../functions/geo/coordinates.md              |   4 +-
 .../en/sql-reference/functions/geo/geohash.md |  12 +-
 docs/en/sql-reference/functions/geo/h3.md     | 214 ++++++-------
 docs/en/sql-reference/functions/geo/s2.md     |  72 ++---
 .../sql-reference/functions/hash-functions.md | 302 +++++++++---------
 docs/en/sql-reference/functions/index.md      |   4 +-
 .../sql-reference/functions/introspection.md  |  28 +-
 .../functions/ip-address-functions.md         |  26 +-
 .../sql-reference/functions/json-functions.md |  50 +--
 .../functions/logical-functions.md            |  24 +-
 .../sql-reference/functions/math-functions.md | 136 ++++----
 .../sql-reference/functions/nlp-functions.md  |  18 +-
 .../functions/other-functions.md              | 116 +++----
 .../functions/random-functions.md             |  34 +-
 .../functions/rounding-functions.md           |   8 +-
 .../functions/splitting-merging-functions.md  |  36 +--
 .../functions/string-functions.md             | 116 +++----
 .../functions/string-replace-functions.md     |   8 +-
 .../functions/string-search-functions.md      |  96 +++---
 .../functions/time-series-functions.md        |   8 +-
 .../functions/time-window-functions.md        |  10 +-
 .../functions/tuple-functions.md              |  56 ++--
 .../functions/tuple-map-functions.md          |  76 ++---
 .../functions/type-conversion-functions.md    | 168 +++++----
 .../sql-reference/functions/ulid-functions.md |   8 +-
 .../sql-reference/functions/url-functions.md  |  32 +-
 .../sql-reference/functions/uuid-functions.md |  26 +-
 .../functions/ym-dict-functions.md            |   6 +-
 39 files changed, 1146 insertions(+), 1146 deletions(-)

diff --git a/docs/en/sql-reference/functions/arithmetic-functions.md b/docs/en/sql-reference/functions/arithmetic-functions.md
index 6515ab6d702..e3fb1d91c05 100644
--- a/docs/en/sql-reference/functions/arithmetic-functions.md
+++ b/docs/en/sql-reference/functions/arithmetic-functions.md
@@ -77,7 +77,7 @@ Alias: `a * b` (operator)
 
 ## divide
 
-Calculates the quotient of two values `a` and `b`. The result type is always [Float64](../../sql-reference/data-types/float.md). Integer division is provided by the `intDiv` function.
+Calculates the quotient of two values `a` and `b`. The result type is always [Float64](../data-types/float.md). Integer division is provided by the `intDiv` function.
 
 Division by 0 returns `inf`, `-inf`, or `nan`.
 
@@ -172,8 +172,8 @@ ifNotFinite(x,y)
 
 **Arguments**
 
-- `x` — Value to check for infinity. [Float\*](../../sql-reference/data-types/float.md).
-- `y` — Fallback value. [Float\*](../../sql-reference/data-types/float.md).
+- `x` — Value to check for infinity. [Float\*](../data-types/float.md).
+- `y` — Fallback value. [Float\*](../data-types/float.md).
 
 **Returned value**
 
@@ -208,7 +208,7 @@ isNaN(x)
 
 Calculates the remainder of the division of two values `a` by `b`.
 
-The result type is an integer if both inputs are integers. If one of the inputs is a floating-point number, the result type is [Float64](../../sql-reference/data-types/float.md).
+The result type is an integer if both inputs are integers. If one of the inputs is a floating-point number, the result type is [Float64](../data-types/float.md).
 
 The remainder is computed like in C++. Truncated division is used for negative numbers.
 
@@ -312,7 +312,7 @@ lcm(a, b)
 
 ## max2
 
-Returns the bigger of two values `a` and `b`. The returned value is of type [Float64](../../sql-reference/data-types/float.md).
+Returns the bigger of two values `a` and `b`. The returned value is of type [Float64](../data-types/float.md).
 
 **Syntax**
 
@@ -338,7 +338,7 @@ Result:
 
 ## min2
 
-Returns the smaller of two values `a` and `b`. The returned value is of type [Float64](../../sql-reference/data-types/float.md).
+Returns the smaller of two values `a` and `b`. The returned value is of type [Float64](../data-types/float.md).
 
 **Syntax**
 
@@ -364,7 +364,7 @@ Result:
 
 ## multiplyDecimal
 
-Multiplies two decimals `a` and `b`. The result value will be of type [Decimal256](../../sql-reference/data-types/decimal.md).
+Multiplies two decimals `a` and `b`. The result value will be of type [Decimal256](../data-types/decimal.md).
 
 The scale of the result can be explicitly specified by `result_scale`. If `result_scale` is not specified, it is assumed to be the maximum scale of the input values.
 
@@ -378,13 +378,13 @@ multiplyDecimal(a, b[, result_scale])
 
 **Arguments**
 
-- `a` — First value: [Decimal](../../sql-reference/data-types/decimal.md).
-- `b` — Second value: [Decimal](../../sql-reference/data-types/decimal.md).
-- `result_scale` — Scale of result: [Int/UInt](../../sql-reference/data-types/int-uint.md).
+- `a` — First value. [Decimal](../data-types/decimal.md).
+- `b` — Second value. [Decimal](../data-types/decimal.md).
+- `result_scale` — Scale of result. [Int/UInt](../data-types/int-uint.md).
 
 **Returned value**
 
-- The result of multiplication with given scale. [Decimal256](../../sql-reference/data-types/decimal.md).
+- The result of multiplication with given scale. [Decimal256](../data-types/decimal.md).
 
 **Example**
 
@@ -438,7 +438,7 @@ Code: 407. DB::Exception: Received from localhost:9000. DB::Exception: Decimal m
 
 ## divideDecimal
 
-Divides two decimals `a` and `b`. The result value will be of type [Decimal256](../../sql-reference/data-types/decimal.md).
+Divides two decimals `a` and `b`. The result value will be of type [Decimal256](../data-types/decimal.md).
 
 The scale of the result can be explicitly specified by `result_scale`. If `result_scale` is not specified, it is assumed to be the maximum scale of the input values.
 
@@ -452,13 +452,13 @@ divideDecimal(a, b[, result_scale])
 
 **Arguments**
 
-- `a` — First value: [Decimal](../../sql-reference/data-types/decimal.md).
-- `b` — Second value: [Decimal](../../sql-reference/data-types/decimal.md).
-- `result_scale` — Scale of result: [Int/UInt](../../sql-reference/data-types/int-uint.md).
+- `a` — First value: [Decimal](../data-types/decimal.md).
+- `b` — Second value: [Decimal](../data-types/decimal.md).
+- `result_scale` — Scale of result: [Int/UInt](../data-types/int-uint.md).
 
 **Returned value**
 
-- The result of division with given scale. [Decimal256](../../sql-reference/data-types/decimal.md).
+- The result of division with given scale. [Decimal256](../data-types/decimal.md).
 
 **Example**
 
diff --git a/docs/en/sql-reference/functions/array-functions.md b/docs/en/sql-reference/functions/array-functions.md
index ff716804d97..7b52fbff714 100644
--- a/docs/en/sql-reference/functions/array-functions.md
+++ b/docs/en/sql-reference/functions/array-functions.md
@@ -19,7 +19,7 @@ empty([x])
 An array is considered empty if it does not contain any elements.
 
 :::note
-Can be optimized by enabling the [`optimize_functions_to_subcolumns` setting](../../operations/settings/settings.md#optimize-functions-to-subcolumns).
With `optimize_functions_to_subcolumns = 1` the function reads only [size0](../../sql-reference/data-types/array.md#array-size) subcolumn instead of reading and processing the whole array column. The query `SELECT empty(arr) FROM TABLE;` transforms to `SELECT arr.size0 = 0 FROM TABLE;`. +Can be optimized by enabling the [`optimize_functions_to_subcolumns` setting](../../operations/settings/settings.md#optimize-functions-to-subcolumns). With `optimize_functions_to_subcolumns = 1` the function reads only [size0](../data-types/array.md#array-size) subcolumn instead of reading and processing the whole array column. The query `SELECT empty(arr) FROM TABLE;` transforms to `SELECT arr.size0 = 0 FROM TABLE;`. ::: The function also works for [strings](string-functions.md#empty) or [UUID](uuid-functions.md#empty). @@ -61,7 +61,7 @@ notEmpty([x]) An array is considered non-empty if it contains at least one element. :::note -Can be optimized by enabling the [optimize_functions_to_subcolumns](../../operations/settings/settings.md#optimize-functions-to-subcolumns) setting. With `optimize_functions_to_subcolumns = 1` the function reads only [size0](../../sql-reference/data-types/array.md#array-size) subcolumn instead of reading and processing the whole array column. The query `SELECT notEmpty(arr) FROM table` transforms to `SELECT arr.size0 != 0 FROM TABLE`. +Can be optimized by enabling the [optimize_functions_to_subcolumns](../../operations/settings/settings.md#optimize-functions-to-subcolumns) setting. With `optimize_functions_to_subcolumns = 1` the function reads only [size0](../data-types/array.md#array-size) subcolumn instead of reading and processing the whole array column. The query `SELECT notEmpty(arr) FROM table` transforms to `SELECT arr.size0 != 0 FROM TABLE`. ::: The function also works for [strings](string-functions.md#notempty) or [UUID](uuid-functions.md#notempty). @@ -96,7 +96,7 @@ Returns the number of items in the array. The result type is UInt64. The function also works for strings. -Can be optimized by enabling the [optimize_functions_to_subcolumns](../../operations/settings/settings.md#optimize-functions-to-subcolumns) setting. With `optimize_functions_to_subcolumns = 1` the function reads only [size0](../../sql-reference/data-types/array.md#array-size) subcolumn instead of reading and processing the whole array column. The query `SELECT length(arr) FROM table` transforms to `SELECT arr.size0 FROM TABLE`. +Can be optimized by enabling the [optimize_functions_to_subcolumns](../../operations/settings/settings.md#optimize-functions-to-subcolumns) setting. With `optimize_functions_to_subcolumns = 1` the function reads only [size0](../data-types/array.md#array-size) subcolumn instead of reading and processing the whole array column. The query `SELECT length(arr) FROM table` transforms to `SELECT arr.size0 FROM TABLE`. Alias: `OCTET_LENGTH` @@ -577,7 +577,7 @@ arrayConcat(arrays) **Arguments** -- `arrays` – Arbitrary number of arguments of [Array](../../sql-reference/data-types/array.md) type. +- `arrays` – Arbitrary number of arguments of [Array](../data-types/array.md) type. **Example** @@ -1058,7 +1058,7 @@ arrayPushBack(array, single_value) **Arguments** - `array` – Array. -- `single_value` – A single value. Only numbers can be added to an array with numbers, and only strings can be added to an array of strings. When adding numbers, ClickHouse automatically sets the `single_value` type for the data type of the array. 
For more information about the types of data in ClickHouse, see “[Data types](../../sql-reference/data-types/index.md#data_types)”. Can be `NULL`. The function adds a `NULL` element to an array, and the type of array elements converts to `Nullable`. +- `single_value` – A single value. Only numbers can be added to an array with numbers, and only strings can be added to an array of strings. When adding numbers, ClickHouse automatically sets the `single_value` type for the data type of the array. For more information about the types of data in ClickHouse, see “[Data types](../data-types/index.md#data_types)”. Can be `NULL`. The function adds a `NULL` element to an array, and the type of array elements converts to `Nullable`. **Example** @@ -1083,7 +1083,7 @@ arrayPushFront(array, single_value) **Arguments** - `array` – Array. -- `single_value` – A single value. Only numbers can be added to an array with numbers, and only strings can be added to an array of strings. When adding numbers, ClickHouse automatically sets the `single_value` type for the data type of the array. For more information about the types of data in ClickHouse, see “[Data types](../../sql-reference/data-types/index.md#data_types)”. Can be `NULL`. The function adds a `NULL` element to an array, and the type of array elements converts to `Nullable`. +- `single_value` – A single value. Only numbers can be added to an array with numbers, and only strings can be added to an array of strings. When adding numbers, ClickHouse automatically sets the `single_value` type for the data type of the array. For more information about the types of data in ClickHouse, see “[Data types](../data-types/index.md#data_types)”. Can be `NULL`. The function adds a `NULL` element to an array, and the type of array elements converts to `Nullable`. **Example** @@ -1179,12 +1179,12 @@ arrayShingles(array, length) **Arguments** -- `array` — Input array [Array](../../sql-reference/data-types/array.md). +- `array` — Input array [Array](../data-types/array.md). - `length` — The length of each shingle. **Returned value** -- An array of generated shingles. [Array](../../sql-reference/data-types/array.md). +- An array of generated shingles. [Array](../data-types/array.md). **Examples** @@ -1760,8 +1760,8 @@ arrayReduce(agg_func, arr1, arr2, ..., arrN) **Arguments** -- `agg_func` — The name of an aggregate function which should be a constant [string](../../sql-reference/data-types/string.md). -- `arr` — Any number of [array](../../sql-reference/data-types/array.md) type columns as the parameters of the aggregation function. +- `agg_func` — The name of an aggregate function which should be a constant [string](../data-types/string.md). +- `arr` — Any number of [array](../data-types/array.md) type columns as the parameters of the aggregation function. **Returned value** @@ -1829,13 +1829,13 @@ arrayReduceInRanges(agg_func, ranges, arr1, arr2, ..., arrN) **Arguments** -- `agg_func` — The name of an aggregate function which should be a constant [string](../../sql-reference/data-types/string.md). -- `ranges` — The ranges to aggretate which should be an [array](../../sql-reference/data-types/array.md) of [tuples](../../sql-reference/data-types/tuple.md) which containing the index and the length of each range. -- `arr` — Any number of [Array](../../sql-reference/data-types/array.md) type columns as the parameters of the aggregation function. +- `agg_func` — The name of an aggregate function which should be a constant [string](../data-types/string.md). 
+- `ranges` — The ranges to aggretate which should be an [array](../data-types/array.md) of [tuples](../data-types/tuple.md) which containing the index and the length of each range. +- `arr` — Any number of [Array](../data-types/array.md) type columns as the parameters of the aggregation function. **Returned value** -- Array containing results of the aggregate function over specified ranges. [Array](../../sql-reference/data-types/array.md). +- Array containing results of the aggregate function over specified ranges. [Array](../data-types/array.md). **Example** @@ -1948,7 +1948,7 @@ Alias: `flatten`. **Parameters** -- `array_of_arrays` — [Array](../../sql-reference/data-types/array.md) of arrays. For example, `[[1,2,3], [4,5]]`. +- `array_of_arrays` — [Array](../data-types/array.md) of arrays. For example, `[[1,2,3], [4,5]]`. **Examples** @@ -1974,7 +1974,7 @@ arrayCompact(arr) **Arguments** -`arr` — The [array](../../sql-reference/data-types/array.md) to inspect. +`arr` — The [array](../data-types/array.md) to inspect. **Returned value** @@ -2008,13 +2008,13 @@ arrayZip(arr1, arr2, ..., arrN) **Arguments** -- `arrN` — [Array](../../sql-reference/data-types/array.md). +- `arrN` — [Array](../data-types/array.md). The function can take any number of arrays of different types. All the input arrays must be of equal size. **Returned value** -- Array with elements from the source arrays grouped into [tuples](../../sql-reference/data-types/tuple.md). Data types in the tuple are the same as types of the input arrays and in the same order as arrays are passed. [Array](../../sql-reference/data-types/array.md). +- Array with elements from the source arrays grouped into [tuples](../data-types/tuple.md). Data types in the tuple are the same as types of the input arrays and in the same order as arrays are passed. [Array](../data-types/array.md). **Example** @@ -2364,8 +2364,8 @@ arrayMin([func,] arr) **Arguments** -- `func` — Function. [Expression](../../sql-reference/data-types/special-data-types/expression.md). -- `arr` — Array. [Array](../../sql-reference/data-types/array.md). +- `func` — Function. [Expression](../data-types/special-data-types/expression.md). +- `arr` — Array. [Array](../data-types/array.md). **Returned value** @@ -2421,8 +2421,8 @@ arrayMax([func,] arr) **Arguments** -- `func` — Function. [Expression](../../sql-reference/data-types/special-data-types/expression.md). -- `arr` — Array. [Array](../../sql-reference/data-types/array.md). +- `func` — Function. [Expression](../data-types/special-data-types/expression.md). +- `arr` — Array. [Array](../data-types/array.md). **Returned value** @@ -2478,8 +2478,8 @@ arraySum([func,] arr) **Arguments** -- `func` — Function. [Expression](../../sql-reference/data-types/special-data-types/expression.md). -- `arr` — Array. [Array](../../sql-reference/data-types/array.md). +- `func` — Function. [Expression](../data-types/special-data-types/expression.md). +- `arr` — Array. [Array](../data-types/array.md). **Returned value** @@ -2488,10 +2488,10 @@ arraySum([func,] arr) :::note Return type: -- For decimal numbers in the source array (or for converted values, if `func` is specified) — [Decimal128](../../sql-reference/data-types/decimal.md). -- For floating point numbers — [Float64](../../sql-reference/data-types/float.md). -- For numeric unsigned — [UInt64](../../sql-reference/data-types/int-uint.md). -- For numeric signed — [Int64](../../sql-reference/data-types/int-uint.md). 
+- For decimal numbers in the source array (or for converted values, if `func` is specified) — [Decimal128](../data-types/decimal.md). +- For floating point numbers — [Float64](../data-types/float.md). +- For numeric unsigned — [UInt64](../data-types/int-uint.md). +- For numeric signed — [Int64](../data-types/int-uint.md). ::: **Examples** @@ -2540,12 +2540,12 @@ arrayAvg([func,] arr) **Arguments** -- `func` — Function. [Expression](../../sql-reference/data-types/special-data-types/expression.md). -- `arr` — Array. [Array](../../sql-reference/data-types/array.md). +- `func` — Function. [Expression](../data-types/special-data-types/expression.md). +- `arr` — Array. [Array](../data-types/array.md). **Returned value** -- The average of function values (or the array average). [Float64](../../sql-reference/data-types/float.md). +- The average of function values (or the array average). [Float64](../data-types/float.md). **Examples** @@ -2589,7 +2589,7 @@ arrayCumSum(arr) **Arguments** -- `arr` — [Array](../../sql-reference/data-types/array.md) of numeric values. +- `arr` — [Array](../data-types/array.md) of numeric values. **Returned value** @@ -2621,7 +2621,7 @@ arrayCumSumNonNegative(arr) **Arguments** -- `arr` — [Array](../../sql-reference/data-types/array.md) of numeric values. +- `arr` — [Array](../data-types/array.md) of numeric values. **Returned value** @@ -2641,7 +2641,7 @@ Note that the `arraySumNonNegative` is a [higher-order function](../../sql-refer ## arrayProduct -Multiplies elements of an [array](../../sql-reference/data-types/array.md). +Multiplies elements of an [array](../data-types/array.md). **Syntax** @@ -2651,11 +2651,11 @@ arrayProduct(arr) **Arguments** -- `arr` — [Array](../../sql-reference/data-types/array.md) of numeric values. +- `arr` — [Array](../data-types/array.md) of numeric values. **Returned value** -- A product of array's elements. [Float64](../../sql-reference/data-types/float.md). +- A product of array's elements. [Float64](../data-types/float.md). **Examples** @@ -2679,7 +2679,7 @@ Query: SELECT arrayProduct([toDecimal64(1,8), toDecimal64(2,8), toDecimal64(3,8)]) as res, toTypeName(res); ``` -Return value type is always [Float64](../../sql-reference/data-types/float.md). Result: +Return value type is always [Float64](../data-types/float.md). Result: ``` text ┌─res─┬─toTypeName(arrayProduct(array(toDecimal64(1, 8), toDecimal64(2, 8), toDecimal64(3, 8))))─┐ @@ -2689,7 +2689,7 @@ Return value type is always [Float64](../../sql-reference/data-types/float.md). ## arrayRotateLeft -Rotates an [array](../../sql-reference/data-types/array.md) to the left by the specified number of elements. +Rotates an [array](../data-types/array.md) to the left by the specified number of elements. If the number of elements is negative, the array is rotated to the right. **Syntax** @@ -2700,12 +2700,12 @@ arrayRotateLeft(arr, n) **Arguments** -- `arr` — [Array](../../sql-reference/data-types/array.md). +- `arr` — [Array](../data-types/array.md). - `n` — Number of elements to rotate. **Returned value** -- An array rotated to the left by the specified number of elements. [Array](../../sql-reference/data-types/array.md). +- An array rotated to the left by the specified number of elements. [Array](../data-types/array.md). **Examples** @@ -2753,7 +2753,7 @@ Result: ## arrayRotateRight -Rotates an [array](../../sql-reference/data-types/array.md) to the right by the specified number of elements. +Rotates an [array](../data-types/array.md) to the right by the specified number of elements. 
If the number of elements is negative, the array is rotated to the left. **Syntax** @@ -2764,12 +2764,12 @@ arrayRotateRight(arr, n) **Arguments** -- `arr` — [Array](../../sql-reference/data-types/array.md). +- `arr` — [Array](../data-types/array.md). - `n` — Number of elements to rotate. **Returned value** -- An array rotated to the right by the specified number of elements. [Array](../../sql-reference/data-types/array.md). +- An array rotated to the right by the specified number of elements. [Array](../data-types/array.md). **Examples** @@ -2817,7 +2817,7 @@ Result: ## arrayShiftLeft -Shifts an [array](../../sql-reference/data-types/array.md) to the left by the specified number of elements. +Shifts an [array](../data-types/array.md) to the left by the specified number of elements. New elements are filled with the provided argument or the default value of the array element type. If the number of elements is negative, the array is shifted to the right. @@ -2829,13 +2829,13 @@ arrayShiftLeft(arr, n[, default]) **Arguments** -- `arr` — [Array](../../sql-reference/data-types/array.md). +- `arr` — [Array](../data-types/array.md). - `n` — Number of elements to shift. - `default` — Optional. Default value for new elements. **Returned value** -- An array shifted to the left by the specified number of elements. [Array](../../sql-reference/data-types/array.md). +- An array shifted to the left by the specified number of elements. [Array](../data-types/array.md). **Examples** @@ -2911,7 +2911,7 @@ Result: ## arrayShiftRight -Shifts an [array](../../sql-reference/data-types/array.md) to the right by the specified number of elements. +Shifts an [array](../data-types/array.md) to the right by the specified number of elements. New elements are filled with the provided argument or the default value of the array element type. If the number of elements is negative, the array is shifted to the left. @@ -2923,13 +2923,13 @@ arrayShiftRight(arr, n[, default]) **Arguments** -- `arr` — [Array](../../sql-reference/data-types/array.md). +- `arr` — [Array](../data-types/array.md). - `n` — Number of elements to shift. - `default` — Optional. Default value for new elements. **Returned value** -- An array shifted to the right by the specified number of elements. [Array](../../sql-reference/data-types/array.md). +- An array shifted to the right by the specified number of elements. [Array](../data-types/array.md). **Examples** diff --git a/docs/en/sql-reference/functions/bit-functions.md b/docs/en/sql-reference/functions/bit-functions.md index 2538ad32022..a48893b93bf 100644 --- a/docs/en/sql-reference/functions/bit-functions.md +++ b/docs/en/sql-reference/functions/bit-functions.md @@ -34,8 +34,8 @@ bitShiftLeft(a, b) **Arguments** -- `a` — A value to shift. [Integer types](../../sql-reference/data-types/int-uint.md), [String](../../sql-reference/data-types/string.md) or [FixedString](../../sql-reference/data-types/fixedstring.md). -- `b` — The number of shift positions. [Unsigned integer types](../../sql-reference/data-types/int-uint.md), 64 bit types or less are allowed. +- `a` — A value to shift. [Integer types](../data-types/int-uint.md), [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). +- `b` — The number of shift positions. [Unsigned integer types](../data-types/int-uint.md), 64 bit types or less are allowed. **Returned value** @@ -81,8 +81,8 @@ bitShiftRight(a, b) **Arguments** -- `a` — A value to shift. 
[Integer types](../../sql-reference/data-types/int-uint.md), [String](../../sql-reference/data-types/string.md) or [FixedString](../../sql-reference/data-types/fixedstring.md). -- `b` — The number of shift positions. [Unsigned integer types](../../sql-reference/data-types/int-uint.md), 64 bit types or less are allowed. +- `a` — A value to shift. [Integer types](../data-types/int-uint.md), [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). +- `b` — The number of shift positions. [Unsigned integer types](../data-types/int-uint.md), 64 bit types or less are allowed. **Returned value** @@ -131,13 +131,13 @@ bitSlice(s, offset[, length]) **Arguments** -- `s` — s is [String](../../sql-reference/data-types/string.md) or [FixedString](../../sql-reference/data-types/fixedstring.md). +- `s` — s is [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). - `offset` — The start index with bit, A positive value indicates an offset on the left, and a negative value is an indent on the right. Numbering of the bits begins with 1. - `length` — The length of substring with bit. If you specify a negative value, the function returns an open substring \[offset, array_length - length\]. If you omit the value, the function returns the substring \[offset, the_end_string\]. If length exceeds s, it will be truncate.If length isn't multiple of 8, will fill 0 on the right. **Returned value** -- The substring. [String](../../sql-reference/data-types/string.md) +- The substring. [String](../data-types/string.md) **Example** @@ -362,7 +362,7 @@ bitCount(x) **Arguments** -- `x` — [Integer](../../sql-reference/data-types/int-uint.md) or [floating-point](../../sql-reference/data-types/float.md) number. The function uses the value representation in memory. It allows supporting floating-point numbers. +- `x` — [Integer](../data-types/int-uint.md) or [floating-point](../data-types/float.md) number. The function uses the value representation in memory. It allows supporting floating-point numbers. **Returned value** @@ -402,12 +402,12 @@ bitHammingDistance(int1, int2) **Arguments** -- `int1` — First integer value. [Int64](../../sql-reference/data-types/int-uint.md). -- `int2` — Second integer value. [Int64](../../sql-reference/data-types/int-uint.md). +- `int1` — First integer value. [Int64](../data-types/int-uint.md). +- `int2` — Second integer value. [Int64](../data-types/int-uint.md). **Returned value** -- The Hamming distance. [UInt8](../../sql-reference/data-types/int-uint.md). +- The Hamming distance. [UInt8](../data-types/int-uint.md). **Examples** diff --git a/docs/en/sql-reference/functions/bitmap-functions.md b/docs/en/sql-reference/functions/bitmap-functions.md index e546de039da..a5c8a663b71 100644 --- a/docs/en/sql-reference/functions/bitmap-functions.md +++ b/docs/en/sql-reference/functions/bitmap-functions.md @@ -75,8 +75,8 @@ bitmapSubsetInRange(bitmap, range_start, range_end) **Arguments** - `bitmap` – [Bitmap object](#bitmap_functions-bitmapbuild). -- `range_start` – Start of the range (inclusive). [UInt32](../../sql-reference/data-types/int-uint.md). -- `range_end` – End of the range (exclusive). [UInt32](../../sql-reference/data-types/int-uint.md). +- `range_start` – Start of the range (inclusive). [UInt32](../data-types/int-uint.md). +- `range_end` – End of the range (exclusive). [UInt32](../data-types/int-uint.md). 
**Example** @@ -105,8 +105,8 @@ bitmapSubsetLimit(bitmap, range_start, cardinality_limit) **Arguments** - `bitmap` – [Bitmap object](#bitmap_functions-bitmapbuild). -- `range_start` – Start of the range (inclusive). [UInt32](../../sql-reference/data-types/int-uint.md). -- `cardinality_limit` – Maximum cardinality of the subset. [UInt32](../../sql-reference/data-types/int-uint.md). +- `range_start` – Start of the range (inclusive). [UInt32](../data-types/int-uint.md). +- `cardinality_limit` – Maximum cardinality of the subset. [UInt32](../data-types/int-uint.md). **Example** @@ -135,8 +135,8 @@ subBitmap(bitmap, offset, cardinality_limit) **Arguments** - `bitmap` – The bitmap. [Bitmap object](#bitmap_functions-bitmapbuild). -- `offset` – The position of the first element of the subset. [UInt32](../../sql-reference/data-types/int-uint.md). -- `cardinality_limit` – The maximum number of elements in the subset. [UInt32](../../sql-reference/data-types/int-uint.md). +- `offset` – The position of the first element of the subset. [UInt32](../data-types/int-uint.md). +- `cardinality_limit` – The maximum number of elements in the subset. [UInt32](../data-types/int-uint.md). **Example** @@ -163,7 +163,7 @@ bitmapContains(bitmap, needle) **Arguments** - `bitmap` – [Bitmap object](#bitmap_functions-bitmapbuild). -- `needle` – Searched bit value. [UInt32](../../sql-reference/data-types/int-uint.md). +- `needle` – Searched bit value. [UInt32](../data-types/int-uint.md). **Returned values** diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index a1d6dbb5930..6ad26f452ad 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -26,7 +26,7 @@ SELECT ## makeDate -Creates a [Date](../../sql-reference/data-types/date.md) +Creates a [Date](../data-types/date.md) - from a year, month and day argument, or - from a year and day of year argument. @@ -43,14 +43,14 @@ Alias: **Arguments** -- `year` — Year. [Integer](../../sql-reference/data-types/int-uint.md), [Float](../../sql-reference/data-types/float.md) or [Decimal](../../sql-reference/data-types/decimal.md). -- `month` — Month. [Integer](../../sql-reference/data-types/int-uint.md), [Float](../../sql-reference/data-types/float.md) or [Decimal](../../sql-reference/data-types/decimal.md). -- `day` — Day. [Integer](../../sql-reference/data-types/int-uint.md), [Float](../../sql-reference/data-types/float.md) or [Decimal](../../sql-reference/data-types/decimal.md). -- `day_of_year` — Day of the year. [Integer](../../sql-reference/data-types/int-uint.md), [Float](../../sql-reference/data-types/float.md) or [Decimal](../../sql-reference/data-types/decimal.md). +- `year` — Year. [Integer](../data-types/int-uint.md), [Float](../data-types/float.md) or [Decimal](../data-types/decimal.md). +- `month` — Month. [Integer](../data-types/int-uint.md), [Float](../data-types/float.md) or [Decimal](../data-types/decimal.md). +- `day` — Day. [Integer](../data-types/int-uint.md), [Float](../data-types/float.md) or [Decimal](../data-types/decimal.md). +- `day_of_year` — Day of the year. [Integer](../data-types/int-uint.md), [Float](../data-types/float.md) or [Decimal](../data-types/decimal.md). **Returned value** -- A date created from the arguments. [Date](../../sql-reference/data-types/date.md). +- A date created from the arguments. [Date](../data-types/date.md). 
**Example** @@ -83,11 +83,11 @@ Result: ``` ## makeDate32 -Like [makeDate](#makeDate) but produces a [Date32](../../sql-reference/data-types/date32.md). +Like [makeDate](#makeDate) but produces a [Date32](../data-types/date32.md). ## makeDateTime -Creates a [DateTime](../../sql-reference/data-types/datetime.md) from a year, month, day, hour, minute and second argument. +Creates a [DateTime](../data-types/datetime.md) from a year, month, day, hour, minute and second argument. **Syntax** @@ -97,17 +97,17 @@ makeDateTime(year, month, day, hour, minute, second[, timezone]) **Arguments** -- `year` — Year. [Integer](../../sql-reference/data-types/int-uint.md), [Float](../../sql-reference/data-types/float.md) or [Decimal](../../sql-reference/data-types/decimal.md). -- `month` — Month. [Integer](../../sql-reference/data-types/int-uint.md), [Float](../../sql-reference/data-types/float.md) or [Decimal](../../sql-reference/data-types/decimal.md). -- `day` — Day. [Integer](../../sql-reference/data-types/int-uint.md), [Float](../../sql-reference/data-types/float.md) or [Decimal](../../sql-reference/data-types/decimal.md). -- `hour` — Hour. [Integer](../../sql-reference/data-types/int-uint.md), [Float](../../sql-reference/data-types/float.md) or [Decimal](../../sql-reference/data-types/decimal.md). -- `minute` — Minute. [Integer](../../sql-reference/data-types/int-uint.md), [Float](../../sql-reference/data-types/float.md) or [Decimal](../../sql-reference/data-types/decimal.md). -- `second` — Second. [Integer](../../sql-reference/data-types/int-uint.md), [Float](../../sql-reference/data-types/float.md) or [Decimal](../../sql-reference/data-types/decimal.md). +- `year` — Year. [Integer](../data-types/int-uint.md), [Float](../data-types/float.md) or [Decimal](../data-types/decimal.md). +- `month` — Month. [Integer](../data-types/int-uint.md), [Float](../data-types/float.md) or [Decimal](../data-types/decimal.md). +- `day` — Day. [Integer](../data-types/int-uint.md), [Float](../data-types/float.md) or [Decimal](../data-types/decimal.md). +- `hour` — Hour. [Integer](../data-types/int-uint.md), [Float](../data-types/float.md) or [Decimal](../data-types/decimal.md). +- `minute` — Minute. [Integer](../data-types/int-uint.md), [Float](../data-types/float.md) or [Decimal](../data-types/decimal.md). +- `second` — Second. [Integer](../data-types/int-uint.md), [Float](../data-types/float.md) or [Decimal](../data-types/decimal.md). - `timezone` — [Timezone](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) for the returned value (optional). **Returned value** -- A date with time created from the arguments. [DateTime](../../sql-reference/data-types/datetime.md). +- A date with time created from the arguments. [DateTime](../data-types/datetime.md). **Example** @@ -125,7 +125,7 @@ Result: ## makeDateTime64 -Like [makeDateTime](#makedatetime) but produces a [DateTime64](../../sql-reference/data-types/datetime64.md). +Like [makeDateTime](#makedatetime) but produces a [DateTime64](../data-types/datetime64.md). **Syntax** @@ -135,7 +135,7 @@ makeDateTime64(year, month, day, hour, minute, second[, fraction[, precision[, t ## timestamp -Converts the first argument 'expr' to type [DateTime64(6)](../../sql-reference/data-types/datetime64.md). +Converts the first argument 'expr' to type [DateTime64(6)](../data-types/datetime64.md). If a second argument 'expr_time' is provided, it adds the specified time to the converted value. 
**Syntax** @@ -148,8 +148,8 @@ Alias: `TIMESTAMP` **Arguments** -- `expr` - Date or date with time. [String](../../sql-reference/data-types/string.md). -- `expr_time` - Optional parameter. Time to add. [String](../../sql-reference/data-types/string.md). +- `expr` - Date or date with time. [String](../data-types/string.md). +- `expr_time` - Optional parameter. Time to add. [String](../data-types/string.md). **Examples** @@ -179,7 +179,7 @@ Result: **Returned value** -- [DateTime64](../../sql-reference/data-types/datetime64.md)(6) +- [DateTime64](../data-types/datetime64.md)(6) ## timeZone @@ -196,7 +196,7 @@ Alias: `timezone`. **Returned value** -- Timezone. [String](../../sql-reference/data-types/string.md). +- Timezone. [String](../data-types/string.md). **Example** @@ -231,7 +231,7 @@ Alias: `serverTimezone`. **Returned value** -- Timezone. [String](../../sql-reference/data-types/string.md). +- Timezone. [String](../data-types/string.md). **Example** @@ -265,12 +265,12 @@ Alias: `toTimezone`. **Arguments** -- `value` — Time or date and time. [DateTime64](../../sql-reference/data-types/datetime64.md). -- `timezone` — Timezone for the returned value. [String](../../sql-reference/data-types/string.md). This argument is a constant, because `toTimezone` changes the timezone of a column (timezone is an attribute of `DateTime*` types). +- `value` — Time or date and time. [DateTime64](../data-types/datetime64.md). +- `timezone` — Timezone for the returned value. [String](../data-types/string.md). This argument is a constant, because `toTimezone` changes the timezone of a column (timezone is an attribute of `DateTime*` types). **Returned value** -- Date and time. [DateTime](../../sql-reference/data-types/datetime.md). +- Date and time. [DateTime](../data-types/datetime.md). **Example** @@ -310,7 +310,7 @@ int32samoa: 1546300800 ## timeZoneOf -Returns the timezone name of [DateTime](../../sql-reference/data-types/datetime.md) or [DateTime64](../../sql-reference/data-types/datetime64.md) data types. +Returns the timezone name of [DateTime](../data-types/datetime.md) or [DateTime64](../data-types/datetime64.md) data types. **Syntax** @@ -322,11 +322,11 @@ Alias: `timezoneOf`. **Arguments** -- `value` — Date and time. [DateTime](../../sql-reference/data-types/datetime.md) or [DateTime64](../../sql-reference/data-types/datetime64.md). +- `value` — Date and time. [DateTime](../data-types/datetime.md) or [DateTime64](../data-types/datetime64.md). **Returned value** -- Timezone name. [String](../../sql-reference/data-types/string.md). +- Timezone name. [String](../data-types/string.md). **Example** @@ -357,11 +357,11 @@ Alias: `timezoneOffset`. **Arguments** -- `value` — Date and time. [DateTime](../../sql-reference/data-types/datetime.md) or [DateTime64](../../sql-reference/data-types/datetime64.md). +- `value` — Date and time. [DateTime](../data-types/datetime.md) or [DateTime64](../data-types/datetime64.md). **Returned value** -- Offset from UTC in seconds. [Int32](../../sql-reference/data-types/int-uint.md). +- Offset from UTC in seconds. [Int32](../data-types/int-uint.md). **Example** @@ -1192,12 +1192,12 @@ toStartOfSecond(value, [timezone]) **Arguments** -- `value` — Date and time. [DateTime64](../../sql-reference/data-types/datetime64.md). -- `timezone` — [Timezone](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) for the returned value (optional). If not specified, the function uses the timezone of the `value` parameter. 
[String](../../sql-reference/data-types/string.md). +- `value` — Date and time. [DateTime64](../data-types/datetime64.md). +- `timezone` — [Timezone](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) for the returned value (optional). If not specified, the function uses the timezone of the `value` parameter. [String](../data-types/string.md). **Returned value** -- Input value without sub-seconds. [DateTime64](../../sql-reference/data-types/datetime64.md). +- Input value without sub-seconds. [DateTime64](../data-types/datetime64.md). **Examples** @@ -1534,12 +1534,12 @@ Alias: `TO_DAYS` **Arguments** -- `date` — The date to calculate the number of days passed since year zero from. [Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md) or [DateTime64](../../sql-reference/data-types/datetime64.md). -- `time_zone` — A String type const value or an expression represent the time zone. [String types](../../sql-reference/data-types/string.md) +- `date` — The date to calculate the number of days passed since year zero from. [Date](../data-types/date.md), [Date32](../data-types/date32.md), [DateTime](../data-types/datetime.md) or [DateTime64](../data-types/datetime64.md). +- `time_zone` — A String type const value or an expression represent the time zone. [String types](../data-types/string.md) **Returned value** -The number of days passed since date 0000-01-01. [UInt32](../../sql-reference/data-types/int-uint.md). +The number of days passed since date 0000-01-01. [UInt32](../data-types/int-uint.md). **Example** @@ -1563,7 +1563,7 @@ Result: Returns for a given number of days passed since [1 January 0000](https://en.wikipedia.org/wiki/Year_zero) the corresponding date in the [proleptic Gregorian calendar defined by ISO 8601](https://en.wikipedia.org/wiki/Gregorian_calendar#Proleptic_Gregorian_calendar). The calculation is the same as in MySQL's [`FROM_DAYS()`](https://dev.mysql.com/doc/refman/8.0/en/date-and-time-functions.html#function_from-days) function. -The result is undefined if it cannot be represented within the bounds of the [Date](../../sql-reference/data-types/date.md) type. +The result is undefined if it cannot be represented within the bounds of the [Date](../data-types/date.md) type. **Syntax** @@ -1579,7 +1579,7 @@ Alias: `FROM_DAYS` **Returned value** -The date corresponding to the number of days passed since year zero. [Date](../../sql-reference/data-types/date.md). +The date corresponding to the number of days passed since year zero. [Date](../data-types/date.md). **Example** @@ -1601,7 +1601,7 @@ Result: ## fromDaysSinceYearZero32 -Like [fromDaysSinceYearZero](#fromDaysSinceYearZero) but returns a [Date32](../../sql-reference/data-types/date32.md). +Like [fromDaysSinceYearZero](#fromDaysSinceYearZero) but returns a [Date32](../data-types/date32.md). ## age @@ -1618,7 +1618,7 @@ age('unit', startdate, enddate, [timezone]) **Arguments** -- `unit` — The type of interval for result. [String](../../sql-reference/data-types/string.md). +- `unit` — The type of interval for result. [String](../data-types/string.md). Possible values: - `nanosecond`, `nanoseconds`, `ns` @@ -1633,15 +1633,15 @@ age('unit', startdate, enddate, [timezone]) - `quarter`, `quarters`, `qq`, `q` - `year`, `years`, `yyyy`, `yy` -- `startdate` — The first time value to subtract (the subtrahend). 
[Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md) or [DateTime64](../../sql-reference/data-types/datetime64.md). +- `startdate` — The first time value to subtract (the subtrahend). [Date](../data-types/date.md), [Date32](../data-types/date32.md), [DateTime](../data-types/datetime.md) or [DateTime64](../data-types/datetime64.md). -- `enddate` — The second time value to subtract from (the minuend). [Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md) or [DateTime64](../../sql-reference/data-types/datetime64.md). +- `enddate` — The second time value to subtract from (the minuend). [Date](../data-types/date.md), [Date32](../data-types/date32.md), [DateTime](../data-types/datetime.md) or [DateTime64](../data-types/datetime64.md). -- `timezone` — [Timezone name](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) (optional). If specified, it is applied to both `startdate` and `enddate`. If not specified, timezones of `startdate` and `enddate` are used. If they are not the same, the result is unspecified. [String](../../sql-reference/data-types/string.md). +- `timezone` — [Timezone name](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) (optional). If specified, it is applied to both `startdate` and `enddate`. If not specified, timezones of `startdate` and `enddate` are used. If they are not the same, the result is unspecified. [String](../data-types/string.md). **Returned value** -Difference between `enddate` and `startdate` expressed in `unit`. [Int](../../sql-reference/data-types/int-uint.md). +Difference between `enddate` and `startdate` expressed in `unit`. [Int](../data-types/int-uint.md). **Example** @@ -1694,7 +1694,7 @@ Aliases: `dateDiff`, `DATE_DIFF`, `timestampDiff`, `timestamp_diff`, `TIMESTAMP_ **Arguments** -- `unit` — The type of interval for result. [String](../../sql-reference/data-types/string.md). +- `unit` — The type of interval for result. [String](../data-types/string.md). Possible values: - `nanosecond`, `nanoseconds`, `ns` @@ -1709,15 +1709,15 @@ Aliases: `dateDiff`, `DATE_DIFF`, `timestampDiff`, `timestamp_diff`, `TIMESTAMP_ - `quarter`, `quarters`, `qq`, `q` - `year`, `years`, `yyyy`, `yy` -- `startdate` — The first time value to subtract (the subtrahend). [Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md) or [DateTime64](../../sql-reference/data-types/datetime64.md). +- `startdate` — The first time value to subtract (the subtrahend). [Date](../data-types/date.md), [Date32](../data-types/date32.md), [DateTime](../data-types/datetime.md) or [DateTime64](../data-types/datetime64.md). -- `enddate` — The second time value to subtract from (the minuend). [Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md) or [DateTime64](../../sql-reference/data-types/datetime64.md). +- `enddate` — The second time value to subtract from (the minuend). [Date](../data-types/date.md), [Date32](../data-types/date32.md), [DateTime](../data-types/datetime.md) or [DateTime64](../data-types/datetime64.md). 
-- `timezone` — [Timezone name](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) (optional). If specified, it is applied to both `startdate` and `enddate`. If not specified, timezones of `startdate` and `enddate` are used. If they are not the same, the result is unspecified. [String](../../sql-reference/data-types/string.md). +- `timezone` — [Timezone name](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) (optional). If specified, it is applied to both `startdate` and `enddate`. If not specified, timezones of `startdate` and `enddate` are used. If they are not the same, the result is unspecified. [String](../data-types/string.md). **Returned value** -Difference between `enddate` and `startdate` expressed in `unit`. [Int](../../sql-reference/data-types/int-uint.md). +Difference between `enddate` and `startdate` expressed in `unit`. [Int](../data-types/int-uint.md). **Example** @@ -1781,12 +1781,12 @@ Alias: `dateTrunc`. `unit` argument is case-insensitive. -- `value` — Date and time. [DateTime](../../sql-reference/data-types/datetime.md) or [DateTime64](../../sql-reference/data-types/datetime64.md). -- `timezone` — [Timezone name](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) for the returned value (optional). If not specified, the function uses the timezone of the `value` parameter. [String](../../sql-reference/data-types/string.md). +- `value` — Date and time. [DateTime](../data-types/datetime.md) or [DateTime64](../data-types/datetime64.md). +- `timezone` — [Timezone name](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) for the returned value (optional). If not specified, the function uses the timezone of the `value` parameter. [String](../data-types/string.md). **Returned value** -- Value, truncated to the specified part of date. [DateTime](../../sql-reference/data-types/datetime.md). +- Value, truncated to the specified part of date. [DateTime](../data-types/datetime.md). **Example** @@ -1844,7 +1844,7 @@ Aliases: `dateAdd`, `DATE_ADD`. **Arguments** -- `unit` — The type of interval to add. Note: This is not a [String](../../sql-reference/data-types/string.md) and must therefore not be quoted. +- `unit` — The type of interval to add. Note: This is not a [String](../data-types/string.md) and must therefore not be quoted. Possible values: - `second` @@ -1856,12 +1856,12 @@ Aliases: `dateAdd`, `DATE_ADD`. - `quarter` - `year` -- `value` — Value of interval to add. [Int](../../sql-reference/data-types/int-uint.md). -- `date` — The date or date with time to which `value` is added. [Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md) or [DateTime64](../../sql-reference/data-types/datetime64.md). +- `value` — Value of interval to add. [Int](../data-types/int-uint.md). +- `date` — The date or date with time to which `value` is added. [Date](../data-types/date.md), [Date32](../data-types/date32.md), [DateTime](../data-types/datetime.md) or [DateTime64](../data-types/datetime64.md). **Returned value** -Date or date with time obtained by adding `value`, expressed in `unit`, to `date`. 
[Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md) or [DateTime64](../../sql-reference/data-types/datetime64.md). +Date or date with time obtained by adding `value`, expressed in `unit`, to `date`. [Date](../data-types/date.md), [Date32](../data-types/date32.md), [DateTime](../data-types/datetime.md) or [DateTime64](../data-types/datetime64.md). **Example** @@ -1918,7 +1918,7 @@ Aliases: `dateSub`, `DATE_SUB`. **Arguments** -- `unit` — The type of interval to subtract. Note: This is not a [String](../../sql-reference/data-types/string.md) and must therefore not be quoted. +- `unit` — The type of interval to subtract. Note: This is not a [String](../data-types/string.md) and must therefore not be quoted. Possible values: @@ -1931,12 +1931,12 @@ Aliases: `dateSub`, `DATE_SUB`. - `quarter` - `year` -- `value` — Value of interval to subtract. [Int](../../sql-reference/data-types/int-uint.md). -- `date` — The date or date with time from which `value` is subtracted. [Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md) or [DateTime64](../../sql-reference/data-types/datetime64.md). +- `value` — Value of interval to subtract. [Int](../data-types/int-uint.md). +- `date` — The date or date with time from which `value` is subtracted. [Date](../data-types/date.md), [Date32](../data-types/date32.md), [DateTime](../data-types/datetime.md) or [DateTime64](../data-types/datetime64.md). **Returned value** -Date or date with time obtained by subtracting `value`, expressed in `unit`, from `date`. [Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md) or [DateTime64](../../sql-reference/data-types/datetime64.md). +Date or date with time obtained by subtracting `value`, expressed in `unit`, from `date`. [Date](../data-types/date.md), [Date32](../data-types/date32.md), [DateTime](../data-types/datetime.md) or [DateTime64](../data-types/datetime64.md). **Example** @@ -1985,9 +1985,9 @@ Aliases: `timeStampAdd`, `TIMESTAMP_ADD`. **Arguments** -- `date` — Date or date with time. [Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md) or [DateTime64](../../sql-reference/data-types/datetime64.md). -- `value` — Value of interval to add. [Int](../../sql-reference/data-types/int-uint.md). -- `unit` — The type of interval to add. [String](../../sql-reference/data-types/string.md). +- `date` — Date or date with time. [Date](../data-types/date.md), [Date32](../data-types/date32.md), [DateTime](../data-types/datetime.md) or [DateTime64](../data-types/datetime64.md). +- `value` — Value of interval to add. [Int](../data-types/int-uint.md). +- `unit` — The type of interval to add. [String](../data-types/string.md). Possible values: - `second` @@ -2001,7 +2001,7 @@ Aliases: `timeStampAdd`, `TIMESTAMP_ADD`. **Returned value** -Date or date with time with the specified `value` expressed in `unit` added to `date`. [Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md) or [DateTime64](../../sql-reference/data-types/datetime64.md). +Date or date with time with the specified `value` expressed in `unit` added to `date`. 
[Date](../data-types/date.md), [Date32](../data-types/date32.md), [DateTime](../data-types/datetime.md) or [DateTime64](../data-types/datetime64.md). **Example** @@ -2033,7 +2033,7 @@ Aliases: `timeStampSub`, `TIMESTAMP_SUB`. **Arguments** -- `unit` — The type of interval to subtract. [String](../../sql-reference/data-types/string.md). +- `unit` — The type of interval to subtract. [String](../data-types/string.md). Possible values: - `second` @@ -2045,12 +2045,12 @@ Aliases: `timeStampSub`, `TIMESTAMP_SUB`. - `quarter` - `year` -- `value` — Value of interval to subtract. [Int](../../sql-reference/data-types/int-uint.md). -- `date` — Date or date with time. [Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md) or [DateTime64](../../sql-reference/data-types/datetime64.md). +- `value` — Value of interval to subtract. [Int](../data-types/int-uint.md). +- `date` — Date or date with time. [Date](../data-types/date.md), [Date32](../data-types/date32.md), [DateTime](../data-types/datetime.md) or [DateTime64](../data-types/datetime64.md). **Returned value** -Date or date with time obtained by subtracting `value`, expressed in `unit`, from `date`. [Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md) or [DateTime64](../../sql-reference/data-types/datetime64.md). +Date or date with time obtained by subtracting `value`, expressed in `unit`, from `date`. [Date](../data-types/date.md), [Date32](../data-types/date32.md), [DateTime](../data-types/datetime.md) or [DateTime64](../data-types/datetime64.md). **Example** @@ -2080,12 +2080,12 @@ addDate(date, interval) **Arguments** -- `date` — The date or date with time to which `interval` is added. [Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md), [DateTime64](../../sql-reference/data-types/datetime64.md), or [String](../../sql-reference/data-types/string.md) -- `interval` — Interval to add. [Interval](../../sql-reference/data-types/special-data-types/interval.md). +- `date` — The date or date with time to which `interval` is added. [Date](../data-types/date.md), [Date32](../data-types/date32.md), [DateTime](../data-types/datetime.md), [DateTime64](../data-types/datetime64.md), or [String](../data-types/string.md) +- `interval` — Interval to add. [Interval](../data-types/special-data-types/interval.md). **Returned value** -Date or date with time obtained by adding `interval` to `date`. [Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md) or [DateTime64](../../sql-reference/data-types/datetime64.md). +Date or date with time obtained by adding `interval` to `date`. [Date](../data-types/date.md), [Date32](../data-types/date32.md), [DateTime](../data-types/datetime.md) or [DateTime64](../data-types/datetime64.md). **Example** @@ -2121,12 +2121,12 @@ subDate(date, interval) **Arguments** -- `date` — The date or date with time from which `interval` is subtracted. 
[Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md), [DateTime64](../../sql-reference/data-types/datetime64.md), or [String](../../sql-reference/data-types/string.md) -- `interval` — Interval to subtract. [Interval](../../sql-reference/data-types/special-data-types/interval.md). +- `date` — The date or date with time from which `interval` is subtracted. [Date](../data-types/date.md), [Date32](../data-types/date32.md), [DateTime](../data-types/datetime.md), [DateTime64](../data-types/datetime64.md), or [String](../data-types/string.md) +- `interval` — Interval to subtract. [Interval](../data-types/special-data-types/interval.md). **Returned value** -Date or date with time obtained by subtracting `interval` from `date`. [Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md) or [DateTime64](../../sql-reference/data-types/datetime64.md). +Date or date with time obtained by subtracting `interval` from `date`. [Date](../data-types/date.md), [Date32](../data-types/date32.md), [DateTime](../data-types/datetime.md) or [DateTime64](../data-types/datetime64.md). **Example** @@ -2162,11 +2162,11 @@ now([timezone]) **Arguments** -- `timezone` — [Timezone name](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) for the returned value (optional). [String](../../sql-reference/data-types/string.md). +- `timezone` — [Timezone name](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) for the returned value (optional). [String](../data-types/string.md). **Returned value** -- Current date and time. [DateTime](../../sql-reference/data-types/datetime.md). +- Current date and time. [DateTime](../data-types/datetime.md). **Example** @@ -2211,11 +2211,11 @@ now64([scale], [timezone]) **Arguments** - `scale` - Tick size (precision): 10-precision seconds. Valid range: [ 0 : 9 ]. Typically, are used - 3 (default) (milliseconds), 6 (microseconds), 9 (nanoseconds). -- `timezone` — [Timezone name](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) for the returned value (optional). [String](../../sql-reference/data-types/string.md). +- `timezone` — [Timezone name](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) for the returned value (optional). [String](../data-types/string.md). **Returned value** -- Current date and time with sub-second precision. [DateTime64](../../sql-reference/data-types/datetime64.md). +- Current date and time with sub-second precision. [DateTime64](../data-types/datetime64.md). **Example** @@ -2245,11 +2245,11 @@ nowInBlock([timezone]) **Arguments** -- `timezone` — [Timezone name](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) for the returned value (optional). [String](../../sql-reference/data-types/string.md). +- `timezone` — [Timezone name](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) for the returned value (optional). [String](../data-types/string.md). **Returned value** -- Current date and time at the moment of processing of each block of data. [DateTime](../../sql-reference/data-types/datetime.md). +- Current date and time at the moment of processing of each block of data. 
[DateTime](../data-types/datetime.md). **Example** @@ -2289,7 +2289,7 @@ today() **Returned value** -- Current date. [DateTime](../../sql-reference/data-types/datetime.md). +- Current date. [DateTime](../data-types/datetime.md). **Example** @@ -2379,7 +2379,7 @@ Result: ## YYYYMMDDToDate -Converts a number containing the year, month and day number to a [Date](../../sql-reference/data-types/date.md). +Converts a number containing the year, month and day number to a [Date](../data-types/date.md). This function is the opposite of function `toYYYYMMDD()`. @@ -2393,11 +2393,11 @@ YYYYMMDDToDate(yyyymmdd); **Arguments** -- `yyyymmdd` - A number representing the year, month and day. [Integer](../../sql-reference/data-types/int-uint.md), [Float](../../sql-reference/data-types/float.md) or [Decimal](../../sql-reference/data-types/decimal.md). +- `yyyymmdd` - A number representing the year, month and day. [Integer](../data-types/int-uint.md), [Float](../data-types/float.md) or [Decimal](../data-types/decimal.md). **Returned value** -- a date created from the arguments. [Date](../../sql-reference/data-types/date.md). +- a date created from the arguments. [Date](../data-types/date.md). **Example** @@ -2415,11 +2415,11 @@ Result: ## YYYYMMDDToDate32 -Like function `YYYYMMDDToDate()` but produces a [Date32](../../sql-reference/data-types/date32.md). +Like function `YYYYMMDDToDate()` but produces a [Date32](../data-types/date32.md). ## YYYYMMDDhhmmssToDateTime -Converts a number containing the year, month, day, hours, minute and second number to a [DateTime](../../sql-reference/data-types/datetime.md). +Converts a number containing the year, month, day, hours, minute and second number to a [DateTime](../data-types/datetime.md). The output is undefined if the input does not encode a valid DateTime value. @@ -2433,12 +2433,12 @@ YYYYMMDDhhmmssToDateTime(yyyymmddhhmmss[, timezone]); **Arguments** -- `yyyymmddhhmmss` - A number representing the year, month and day. [Integer](../../sql-reference/data-types/int-uint.md), [Float](../../sql-reference/data-types/float.md) or [Decimal](../../sql-reference/data-types/decimal.md). +- `yyyymmddhhmmss` - A number representing the year, month and day. [Integer](../data-types/int-uint.md), [Float](../data-types/float.md) or [Decimal](../data-types/decimal.md). - `timezone` - [Timezone](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) for the returned value (optional). **Returned value** -- a date with time created from the arguments. [DateTime](../../sql-reference/data-types/datetime.md). +- a date with time created from the arguments. [DateTime](../data-types/datetime.md). **Example** @@ -2456,7 +2456,7 @@ Result: ## YYYYMMDDhhmmssToDateTime64 -Like function `YYYYMMDDhhmmssToDate()` but produces a [DateTime64](../../sql-reference/data-types/datetime64.md). +Like function `YYYYMMDDhhmmssToDate()` but produces a [DateTime64](../data-types/datetime64.md). Accepts an additional, optional `precision` parameter after the `timezone` parameter. @@ -3453,7 +3453,7 @@ Formats a Time according to the given Format string. Format is a constant expres formatDateTime uses MySQL datetime format style, refer to https://dev.mysql.com/doc/refman/8.0/en/date-and-time-functions.html#function_date-format. -The opposite operation of this function is [parseDateTime](/docs/en/sql-reference/functions/type-conversion-functions.md#type_conversion_functions-parseDateTime). 
+The opposite operation of this function is [parseDateTime](../functions/type-conversion-functions.md#type_conversion_functions-parseDateTime). Alias: `DATE_FORMAT`. @@ -3579,7 +3579,7 @@ LIMIT 10 Similar to formatDateTime, except that it formats datetime in Joda style instead of MySQL style. Refer to https://joda-time.sourceforge.net/apidocs/org/joda/time/format/DateTimeFormat.html. -The opposite operation of this function is [parseDateTimeInJodaSyntax](/docs/en/sql-reference/functions/type-conversion-functions.md#type_conversion_functions-parseDateTimeInJodaSyntax). +The opposite operation of this function is [parseDateTimeInJodaSyntax](../functions/type-conversion-functions.md#type_conversion_functions-parseDateTimeInJodaSyntax). **Replacement fields** @@ -3639,13 +3639,13 @@ dateName(date_part, date) **Arguments** -- `date_part` — Date part. Possible values: 'year', 'quarter', 'month', 'week', 'dayofyear', 'day', 'weekday', 'hour', 'minute', 'second'. [String](../../sql-reference/data-types/string.md). -- `date` — Date. [Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md) or [DateTime64](../../sql-reference/data-types/datetime64.md). -- `timezone` — Timezone. Optional. [String](../../sql-reference/data-types/string.md). +- `date_part` — Date part. Possible values: 'year', 'quarter', 'month', 'week', 'dayofyear', 'day', 'weekday', 'hour', 'minute', 'second'. [String](../data-types/string.md). +- `date` — Date. [Date](../data-types/date.md), [Date32](../data-types/date32.md), [DateTime](../data-types/datetime.md) or [DateTime64](../data-types/datetime64.md). +- `timezone` — Timezone. Optional. [String](../data-types/string.md). **Returned value** -- The specified part of date. [String](../../sql-reference/data-types/string.md#string) +- The specified part of date. [String](../data-types/string.md#string) **Example** @@ -3677,11 +3677,11 @@ monthName(date) **Arguments** -- `date` — Date or date with time. [Date](../../sql-reference/data-types/date.md), [DateTime](../../sql-reference/data-types/datetime.md) or [DateTime64](../../sql-reference/data-types/datetime64.md). +- `date` — Date or date with time. [Date](../data-types/date.md), [DateTime](../data-types/datetime.md) or [DateTime64](../data-types/datetime64.md). **Returned value** -- The name of the month. [String](../../sql-reference/data-types/string.md#string) +- The name of the month. [String](../data-types/string.md#string) **Example** @@ -3704,7 +3704,7 @@ This function converts a Unix timestamp to a calendar date and a time of a day. It can be called in two ways: -When given a single argument of type [Integer](../../sql-reference/data-types/int-uint.md), it returns a value of type [DateTime](../../sql-reference/data-types/datetime.md), i.e. behaves like [toDateTime](../../sql-reference/functions/type-conversion-functions.md#todatetime). +When given a single argument of type [Integer](../data-types/int-uint.md), it returns a value of type [DateTime](../data-types/datetime.md), i.e. behaves like [toDateTime](../../sql-reference/functions/type-conversion-functions.md#todatetime). Alias: `FROM_UNIXTIME`. 
@@ -3722,7 +3722,7 @@ Result: └──────────────────────────────┘ ``` -When given two or three arguments where the first argument is a value of type [Integer](../../sql-reference/data-types/int-uint.md), [Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md) or [DateTime64](../../sql-reference/data-types/datetime64.md), the second argument is a constant format string and the third argument is an optional constant time zone string, the function returns a value of type [String](../../sql-reference/data-types/string.md#string), i.e. it behaves like [formatDateTime](#formatdatetime). In this case, [MySQL's datetime format style](https://dev.mysql.com/doc/refman/8.0/en/date-and-time-functions.html#function_date-format) is used. +When given two or three arguments where the first argument is a value of type [Integer](../data-types/int-uint.md), [Date](../data-types/date.md), [Date32](../data-types/date32.md), [DateTime](../data-types/datetime.md) or [DateTime64](../data-types/datetime64.md), the second argument is a constant format string and the third argument is an optional constant time zone string, the function returns a value of type [String](../data-types/string.md#string), i.e. it behaves like [formatDateTime](#formatdatetime). In this case, [MySQL's datetime format style](https://dev.mysql.com/doc/refman/8.0/en/date-and-time-functions.html#function_date-format) is used. **Example:** @@ -3772,11 +3772,11 @@ toModifiedJulianDay(date) **Arguments** -- `date` — Date in text form. [String](../../sql-reference/data-types/string.md) or [FixedString](../../sql-reference/data-types/fixedstring.md). +- `date` — Date in text form. [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). **Returned value** -- Modified Julian Day number. [Int32](../../sql-reference/data-types/int-uint.md). +- Modified Julian Day number. [Int32](../data-types/int-uint.md). **Example** @@ -3804,11 +3804,11 @@ toModifiedJulianDayOrNull(date) **Arguments** -- `date` — Date in text form. [String](../../sql-reference/data-types/string.md) or [FixedString](../../sql-reference/data-types/fixedstring.md). +- `date` — Date in text form. [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). **Returned value** -- Modified Julian Day number. [Nullable(Int32)](../../sql-reference/data-types/int-uint.md). +- Modified Julian Day number. [Nullable(Int32)](../data-types/int-uint.md). **Example** @@ -3836,11 +3836,11 @@ fromModifiedJulianDay(day) **Arguments** -- `day` — Modified Julian Day number. [Any integral types](../../sql-reference/data-types/int-uint.md). +- `day` — Modified Julian Day number. [Any integral types](../data-types/int-uint.md). **Returned value** -- Date in text form. [String](../../sql-reference/data-types/string.md) +- Date in text form. [String](../data-types/string.md) **Example** @@ -3868,11 +3868,11 @@ fromModifiedJulianDayOrNull(day) **Arguments** -- `day` — Modified Julian Day number. [Any integral types](../../sql-reference/data-types/int-uint.md). +- `day` — Modified Julian Day number. [Any integral types](../data-types/int-uint.md). **Returned value** -- Date in text form. [Nullable(String)](../../sql-reference/data-types/string.md) +- Date in text form. [Nullable(String)](../data-types/string.md) **Example** @@ -3900,8 +3900,8 @@ toUTCTimestamp(time_val, time_zone) **Arguments** -- `time_val` — A DateTime/DateTime64 type const value or an expression . 
[DateTime/DateTime64 types](../../sql-reference/data-types/datetime.md)
-- `time_zone` — A String type const value or an expression represent the time zone. [String types](../../sql-reference/data-types/string.md)
+- `time_val` — A DateTime/DateTime64 type const value or an expression. [DateTime/DateTime64 types](../data-types/datetime.md)
+- `time_zone` — A String type const value or an expression representing the time zone. [String types](../data-types/string.md)

**Returned value**

@@ -3933,8 +3933,8 @@ fromUTCTimestamp(time_val, time_zone)

**Arguments**

-- `time_val` — A DateTime/DateTime64 type const value or an expression . [DateTime/DateTime64 types](../../sql-reference/data-types/datetime.md)
-- `time_zone` — A String type const value or an expression represent the time zone. [String types](../../sql-reference/data-types/string.md)
+- `time_val` — A DateTime/DateTime64 type const value or an expression. [DateTime/DateTime64 types](../data-types/datetime.md)
+- `time_zone` — A String type const value or an expression representing the time zone. [String types](../data-types/string.md)

**Returned value**

@@ -3965,8 +3965,8 @@ timeDiff(first_datetime, second_datetime)

*Arguments**

-- `first_datetime` — A DateTime/DateTime64 type const value or an expression . [DateTime/DateTime64 types](../../sql-reference/data-types/datetime.md)
-- `second_datetime` — A DateTime/DateTime64 type const value or an expression . [DateTime/DateTime64 types](../../sql-reference/data-types/datetime.md)
+- `first_datetime` — A DateTime/DateTime64 type const value or an expression. [DateTime/DateTime64 types](../data-types/datetime.md)
+- `second_datetime` — A DateTime/DateTime64 type const value or an expression. [DateTime/DateTime64 types](../data-types/datetime.md)

**Returned value**

diff --git a/docs/en/sql-reference/functions/distance-functions.md b/docs/en/sql-reference/functions/distance-functions.md
index 9fda491ac50..a455d0af91b 100644
--- a/docs/en/sql-reference/functions/distance-functions.md
+++ b/docs/en/sql-reference/functions/distance-functions.md
@@ -20,11 +20,11 @@ Alias: `normL1`.

**Arguments**

-- `vector` — [Tuple](../../sql-reference/data-types/tuple.md) or [Array](../../sql-reference/data-types/array.md).
+- `vector` — [Tuple](../data-types/tuple.md) or [Array](../data-types/array.md).

**Returned value**

-- L1-norm or [taxicab geometry](https://en.wikipedia.org/wiki/Taxicab_geometry) distance. [UInt](../../sql-reference/data-types/int-uint.md), [Float](../../sql-reference/data-types/float.md) or [Decimal](../../sql-reference/data-types/decimal.md).
+- L1-norm or [taxicab geometry](https://en.wikipedia.org/wiki/Taxicab_geometry) distance. [UInt](../data-types/int-uint.md), [Float](../data-types/float.md) or [Decimal](../data-types/decimal.md).

**Examples**

@@ -56,11 +56,11 @@ Alias: `normL2`.

**Arguments**

-- `vector` — [Tuple](../../sql-reference/data-types/tuple.md) or [Array](../../sql-reference/data-types/array.md).
+- `vector` — [Tuple](../data-types/tuple.md) or [Array](../data-types/array.md).

**Returned value**

-- L2-norm or [Euclidean distance](https://en.wikipedia.org/wiki/Euclidean_distance). [Float](../../sql-reference/data-types/float.md).
+- L2-norm or [Euclidean distance](https://en.wikipedia.org/wiki/Euclidean_distance). [Float](../data-types/float.md).

**Example**

@@ -91,11 +91,11 @@ Alias: `normL2Squared`.

***Arguments**

-- `vector` — [Tuple](../../sql-reference/data-types/tuple.md) or [Array](../../sql-reference/data-types/array.md). 
+- `vector` — [Tuple](../data-types/tuple.md) or [Array](../data-types/array.md). **Returned value** -- L2-norm squared. [Float](../../sql-reference/data-types/float.md). +- L2-norm squared. [Float](../data-types/float.md). **Example** @@ -127,11 +127,11 @@ Alias: `normLinf`. **Arguments** -- `vector` — [Tuple](../../sql-reference/data-types/tuple.md) or [Array](../../sql-reference/data-types/array.md). +- `vector` — [Tuple](../data-types/tuple.md) or [Array](../data-types/array.md). **Returned value** -- Linf-norm or the maximum absolute value. [Float](../../sql-reference/data-types/float.md). +- Linf-norm or the maximum absolute value. [Float](../data-types/float.md). **Example** @@ -163,12 +163,12 @@ Alias: `normLp`. **Arguments** -- `vector` — [Tuple](../../sql-reference/data-types/tuple.md) or [Array](../../sql-reference/data-types/array.md). -- `p` — The power. Possible values: real number in `[1; inf)`. [UInt](../../sql-reference/data-types/int-uint.md) or [Float](../../sql-reference/data-types/float.md). +- `vector` — [Tuple](../data-types/tuple.md) or [Array](../data-types/array.md). +- `p` — The power. Possible values: real number in `[1; inf)`. [UInt](../data-types/int-uint.md) or [Float](../data-types/float.md). **Returned value** -- [Lp-norm](https://en.wikipedia.org/wiki/Norm_(mathematics)#p-norm). [Float](../../sql-reference/data-types/float.md). +- [Lp-norm](https://en.wikipedia.org/wiki/Norm_(mathematics)#p-norm). [Float](../data-types/float.md). **Example** @@ -200,12 +200,12 @@ Alias: `distanceL1`. **Arguments** -- `vector1` — First vector. [Tuple](../../sql-reference/data-types/tuple.md) or [Array](../../sql-reference/data-types/array.md). -- `vector2` — Second vector. [Tuple](../../sql-reference/data-types/tuple.md) or [Array](../../sql-reference/data-types/array.md). +- `vector1` — First vector. [Tuple](../data-types/tuple.md) or [Array](../data-types/array.md). +- `vector2` — Second vector. [Tuple](../data-types/tuple.md) or [Array](../data-types/array.md). **Returned value** -- 1-norm distance. [Float](../../sql-reference/data-types/float.md). +- 1-norm distance. [Float](../data-types/float.md). **Example** @@ -237,12 +237,12 @@ Alias: `distanceL2`. **Arguments** -- `vector1` — First vector. [Tuple](../../sql-reference/data-types/tuple.md) or [Array](../../sql-reference/data-types/array.md). -- `vector2` — Second vector. [Tuple](../../sql-reference/data-types/tuple.md) or [Array](../../sql-reference/data-types/array.md). +- `vector1` — First vector. [Tuple](../data-types/tuple.md) or [Array](../data-types/array.md). +- `vector2` — Second vector. [Tuple](../data-types/tuple.md) or [Array](../data-types/array.md). **Returned value** -- 2-norm distance. [Float](../../sql-reference/data-types/float.md). +- 2-norm distance. [Float](../data-types/float.md). **Example** @@ -274,12 +274,12 @@ Alias: `distanceL2Squared`. **Arguments** -- `vector1` — First vector. [Tuple](../../sql-reference/data-types/tuple.md) or [Array](../../sql-reference/data-types/array.md). -- `vector2` — Second vector. [Tuple](../../sql-reference/data-types/tuple.md) or [Array](../../sql-reference/data-types/array.md). +- `vector1` — First vector. [Tuple](../data-types/tuple.md) or [Array](../data-types/array.md). +- `vector2` — Second vector. [Tuple](../data-types/tuple.md) or [Array](../data-types/array.md). **Returned value** -- Sum of the squares of the difference between the corresponding elements of two vectors. [Float](../../sql-reference/data-types/float.md). 
+- Sum of the squares of the difference between the corresponding elements of two vectors. [Float](../data-types/float.md).

**Example**

@@ -311,12 +311,12 @@ Alias: `distanceLinf`.

**Arguments**

-- `vector1` — First vector. [Tuple](../../sql-reference/data-types/tuple.md) or [Array](../../sql-reference/data-types/array.md).
-- `vector1` — Second vector. [Tuple](../../sql-reference/data-types/tuple.md) or [Array](../../sql-reference/data-types/array.md).
+- `vector1` — First vector. [Tuple](../data-types/tuple.md) or [Array](../data-types/array.md).
+- `vector2` — Second vector. [Tuple](../data-types/tuple.md) or [Array](../data-types/array.md).

**Returned value**

-- Infinity-norm distance. [Float](../../sql-reference/data-types/float.md).
+- Infinity-norm distance. [Float](../data-types/float.md).

**Example**

@@ -348,13 +348,13 @@ Alias: `distanceLp`.

**Arguments**

-- `vector1` — First vector. [Tuple](../../sql-reference/data-types/tuple.md) or [Array](../../sql-reference/data-types/array.md).
-- `vector2` — Second vector. [Tuple](../../sql-reference/data-types/tuple.md) or [Array](../../sql-reference/data-types/array.md).
-- `p` — The power. Possible values: real number from `[1; inf)`. [UInt](../../sql-reference/data-types/int-uint.md) or [Float](../../sql-reference/data-types/float.md).
+- `vector1` — First vector. [Tuple](../data-types/tuple.md) or [Array](../data-types/array.md).
+- `vector2` — Second vector. [Tuple](../data-types/tuple.md) or [Array](../data-types/array.md).
+- `p` — The power. Possible values: real number from `[1; inf)`. [UInt](../data-types/int-uint.md) or [Float](../data-types/float.md).

**Returned value**

-- p-norm distance. [Float](../../sql-reference/data-types/float.md).
+- p-norm distance. [Float](../data-types/float.md).

**Example**

@@ -387,11 +387,11 @@ Alias: `normalizeL1`.

**Arguments**

-- `tuple` — [Tuple](../../sql-reference/data-types/tuple.md).
+- `tuple` — [Tuple](../data-types/tuple.md).

**Returned value**

-- Unit vector. [Tuple](../../sql-reference/data-types/tuple.md) of [Float](../../sql-reference/data-types/float.md).
+- Unit vector. [Tuple](../data-types/tuple.md) of [Float](../data-types/float.md).

**Example**

@@ -423,11 +423,11 @@ Alias: `normalizeL1`.

**Arguments**

-- `tuple` — [Tuple](../../sql-reference/data-types/tuple.md).
+- `tuple` — [Tuple](../data-types/tuple.md).

**Returned value**

-- Unit vector. [Tuple](../../sql-reference/data-types/tuple.md) of [Float](../../sql-reference/data-types/float.md).
+- Unit vector. [Tuple](../data-types/tuple.md) of [Float](../data-types/float.md).

**Example**

@@ -459,11 +459,11 @@ Alias: `normalizeLinf `.

**Arguments**

-- `tuple` — [Tuple](../../sql-reference/data-types/tuple.md).
+- `tuple` — [Tuple](../data-types/tuple.md).

**Returned value**

-- Unit vector. [Tuple](../../sql-reference/data-types/tuple.md) of [Float](../../sql-reference/data-types/float.md).
+- Unit vector. [Tuple](../data-types/tuple.md) of [Float](../data-types/float.md).

**Example**

@@ -495,12 +495,12 @@ Alias: `normalizeLp `.

**Arguments**

-- `tuple` — [Tuple](../../sql-reference/data-types/tuple.md).
-- `p` — The power. Possible values: any number from [1;inf). [UInt](../../sql-reference/data-types/int-uint.md) or [Float](../../sql-reference/data-types/float.md).
+- `tuple` — [Tuple](../data-types/tuple.md).
+- `p` — The power. Possible values: any number from [1;inf). [UInt](../data-types/int-uint.md) or [Float](../data-types/float.md).

**Returned value**

-- Unit vector. 
[Tuple](../../sql-reference/data-types/tuple.md) of [Float](../../sql-reference/data-types/float.md). +- Unit vector. [Tuple](../data-types/tuple.md) of [Float](../data-types/float.md). **Example** @@ -530,12 +530,12 @@ cosineDistance(vector1, vector2) **Arguments** -- `vector1` — First tuple. [Tuple](../../sql-reference/data-types/tuple.md) or [Array](../../sql-reference/data-types/array.md). -- `vector2` — Second tuple. [Tuple](../../sql-reference/data-types/tuple.md) or [Array](../../sql-reference/data-types/array.md). +- `vector1` — First tuple. [Tuple](../data-types/tuple.md) or [Array](../data-types/array.md). +- `vector2` — Second tuple. [Tuple](../data-types/tuple.md) or [Array](../data-types/array.md). **Returned value** -- Cosine of the angle between two vectors subtracted from one. [Float](../../sql-reference/data-types/float.md). +- Cosine of the angle between two vectors subtracted from one. [Float](../data-types/float.md). **Examples** diff --git a/docs/en/sql-reference/functions/encoding-functions.md b/docs/en/sql-reference/functions/encoding-functions.md index bc64fdea427..408b605727d 100644 --- a/docs/en/sql-reference/functions/encoding-functions.md +++ b/docs/en/sql-reference/functions/encoding-functions.md @@ -18,7 +18,7 @@ char(number_1, [number_2, ..., number_n]); **Arguments** -- `number_1, number_2, ..., number_n` — Numerical arguments interpreted as integers. Types: [Int](../../sql-reference/data-types/int-uint.md), [Float](../../sql-reference/data-types/float.md). +- `number_1, number_2, ..., number_n` — Numerical arguments interpreted as integers. Types: [Int](../data-types/int-uint.md), [Float](../data-types/float.md). **Returned value** @@ -86,21 +86,21 @@ The function is using uppercase letters `A-F` and not using any prefixes (like ` For integer arguments, it prints hex digits (“nibbles”) from the most significant to least significant (big-endian or “human-readable” order). It starts with the most significant non-zero byte (leading zero bytes are omitted) but always prints both digits of every byte even if the leading digit is zero. -Values of type [Date](../../sql-reference/data-types/date.md) and [DateTime](../../sql-reference/data-types/datetime.md) are formatted as corresponding integers (the number of days since Epoch for Date and the value of Unix Timestamp for DateTime). +Values of type [Date](../data-types/date.md) and [DateTime](../data-types/datetime.md) are formatted as corresponding integers (the number of days since Epoch for Date and the value of Unix Timestamp for DateTime). -For [String](../../sql-reference/data-types/string.md) and [FixedString](../../sql-reference/data-types/fixedstring.md), all bytes are simply encoded as two hexadecimal numbers. Zero bytes are not omitted. +For [String](../data-types/string.md) and [FixedString](../data-types/fixedstring.md), all bytes are simply encoded as two hexadecimal numbers. Zero bytes are not omitted. -Values of [Float](../../sql-reference/data-types/float.md) and [Decimal](../../sql-reference/data-types/decimal.md) types are encoded as their representation in memory. As we support little-endian architecture, they are encoded in little-endian. Zero leading/trailing bytes are not omitted. +Values of [Float](../data-types/float.md) and [Decimal](../data-types/decimal.md) types are encoded as their representation in memory. As we support little-endian architecture, they are encoded in little-endian. Zero leading/trailing bytes are not omitted. 
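To make these encoding rules concrete, a small illustrative query (the literals are arbitrary examples; the commented outputs are what the rules above imply):

```sql
-- Integer: leading zero bytes are dropped, both digits of each remaining byte are kept
SELECT hex(1);              -- 01
-- String: every byte becomes two hexadecimal digits, nothing is omitted
SELECT hex('abc');          -- 616263
-- Float32: the little-endian in-memory bytes are dumped as-is
SELECT hex(toFloat32(1.5)); -- 0000C03F
```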
Values of [UUID](../data-types/uuid.md) type are encoded as big-endian order string. **Arguments** -- `arg` — A value to convert to hexadecimal. Types: [String](../../sql-reference/data-types/string.md), [UInt](../../sql-reference/data-types/int-uint.md), [Float](../../sql-reference/data-types/float.md), [Decimal](../../sql-reference/data-types/decimal.md), [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md). +- `arg` — A value to convert to hexadecimal. Types: [String](../data-types/string.md), [UInt](../data-types/int-uint.md), [Float](../data-types/float.md), [Decimal](../data-types/decimal.md), [Date](../data-types/date.md) or [DateTime](../data-types/datetime.md). **Returned value** -- A string with the hexadecimal representation of the argument. [String](../../sql-reference/data-types/string.md). +- A string with the hexadecimal representation of the argument. [String](../data-types/string.md). **Examples** @@ -181,13 +181,13 @@ unhex(arg) **Arguments** -- `arg` — A string containing any number of hexadecimal digits. [String](../../sql-reference/data-types/string.md), [FixedString](../../sql-reference/data-types/fixedstring.md). +- `arg` — A string containing any number of hexadecimal digits. [String](../data-types/string.md), [FixedString](../data-types/fixedstring.md). Supports both uppercase and lowercase letters `A-F`. The number of hexadecimal digits does not have to be even. If it is odd, the last digit is interpreted as the least significant half of the `00-0F` byte. If the argument string contains anything other than hexadecimal digits, some implementation-defined result is returned (an exception isn’t thrown). For a numeric argument the inverse of hex(N) is not performed by unhex(). **Returned value** -- A binary string (BLOB). [String](../../sql-reference/data-types/string.md). +- A binary string (BLOB). [String](../data-types/string.md). **Example** @@ -231,21 +231,21 @@ Alias: `BIN`. For integer arguments, it prints bin digits from the most significant to least significant (big-endian or “human-readable” order). It starts with the most significant non-zero byte (leading zero bytes are omitted) but always prints eight digits of every byte if the leading digit is zero. -Values of type [Date](../../sql-reference/data-types/date.md) and [DateTime](../../sql-reference/data-types/datetime.md) are formatted as corresponding integers (the number of days since Epoch for `Date` and the value of Unix Timestamp for `DateTime`). +Values of type [Date](../data-types/date.md) and [DateTime](../data-types/datetime.md) are formatted as corresponding integers (the number of days since Epoch for `Date` and the value of Unix Timestamp for `DateTime`). -For [String](../../sql-reference/data-types/string.md) and [FixedString](../../sql-reference/data-types/fixedstring.md), all bytes are simply encoded as eight binary numbers. Zero bytes are not omitted. +For [String](../data-types/string.md) and [FixedString](../data-types/fixedstring.md), all bytes are simply encoded as eight binary numbers. Zero bytes are not omitted. -Values of [Float](../../sql-reference/data-types/float.md) and [Decimal](../../sql-reference/data-types/decimal.md) types are encoded as their representation in memory. As we support little-endian architecture, they are encoded in little-endian. Zero leading/trailing bytes are not omitted. +Values of [Float](../data-types/float.md) and [Decimal](../data-types/decimal.md) types are encoded as their representation in memory. 
As we support little-endian architecture, they are encoded in little-endian. Zero leading/trailing bytes are not omitted. Values of [UUID](../data-types/uuid.md) type are encoded as big-endian order string. **Arguments** -- `arg` — A value to convert to binary. [String](../../sql-reference/data-types/string.md), [FixedString](../../sql-reference/data-types/fixedstring.md), [UInt](../../sql-reference/data-types/int-uint.md), [Float](../../sql-reference/data-types/float.md), [Decimal](../../sql-reference/data-types/decimal.md), [Date](../../sql-reference/data-types/date.md), or [DateTime](../../sql-reference/data-types/datetime.md). +- `arg` — A value to convert to binary. [String](../data-types/string.md), [FixedString](../data-types/fixedstring.md), [UInt](../data-types/int-uint.md), [Float](../data-types/float.md), [Decimal](../data-types/decimal.md), [Date](../data-types/date.md), or [DateTime](../data-types/datetime.md). **Returned value** -- A string with the binary representation of the argument. [String](../../sql-reference/data-types/string.md). +- A string with the binary representation of the argument. [String](../data-types/string.md). **Examples** @@ -330,11 +330,11 @@ Supports binary digits `0` and `1`. The number of binary digits does not have to **Arguments** -- `arg` — A string containing any number of binary digits. [String](../../sql-reference/data-types/string.md). +- `arg` — A string containing any number of binary digits. [String](../data-types/string.md). **Returned value** -- A binary string (BLOB). [String](../../sql-reference/data-types/string.md). +- A binary string (BLOB). [String](../data-types/string.md). **Examples** @@ -386,11 +386,11 @@ bitPositionsToArray(arg) **Arguments** -- `arg` — Integer value. [Int/UInt](../../sql-reference/data-types/int-uint.md). +- `arg` — Integer value. [Int/UInt](../data-types/int-uint.md). **Returned value** -- An array containing a list of positions of bits that equal `1`, in ascending order. [Array](../../sql-reference/data-types/array.md)([UInt64](../../sql-reference/data-types/int-uint.md)). +- An array containing a list of positions of bits that equal `1`, in ascending order. [Array](../data-types/array.md)([UInt64](../data-types/int-uint.md)). **Example** @@ -442,11 +442,11 @@ mortonEncode(args) **Parameters** -- `args`: up to 8 [unsigned integers](../../sql-reference/data-types/int-uint.md) or columns of the aforementioned type. +- `args`: up to 8 [unsigned integers](../data-types/int-uint.md) or columns of the aforementioned type. **Returned value** -- A UInt64 code. [UInt64](../../sql-reference/data-types/int-uint.md) +- A UInt64 code. [UInt64](../data-types/int-uint.md) **Example** @@ -463,7 +463,7 @@ Result: ### Expanded mode -Accepts a range mask ([tuple](../../sql-reference/data-types/tuple.md)) as a first argument and up to 8 [unsigned integers](../../sql-reference/data-types/int-uint.md) as other arguments. +Accepts a range mask ([tuple](../data-types/tuple.md)) as a first argument and up to 8 [unsigned integers](../data-types/int-uint.md) as other arguments. Each number in the mask configures the amount of range expansion:
1 - no expansion
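A rough usage sketch of the expanded mode described above (the mask and argument values are arbitrary placeholders):

```sql
-- The tuple (1, 2) keeps the first argument's range as-is and expands the
-- second argument's range 2x before the bits are interleaved into one code.
SELECT mortonEncode((1, 2), 1024, 16) AS morton_code;
```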
@@ -480,13 +480,13 @@ mortonEncode(range_mask, args) **Parameters** - `range_mask`: 1-8. -- `args`: up to 8 [unsigned integers](../../sql-reference/data-types/int-uint.md) or columns of the aforementioned type. +- `args`: up to 8 [unsigned integers](../data-types/int-uint.md) or columns of the aforementioned type. Note: when using columns for `args` the provided `range_mask` tuple should still be a constant. **Returned value** -- A UInt64 code. [UInt64](../../sql-reference/data-types/int-uint.md) +- A UInt64 code. [UInt64](../data-types/int-uint.md) **Example** @@ -579,7 +579,7 @@ Result: **implementation details** -Please note that you can fit only so many bits of information into Morton code as [UInt64](../../sql-reference/data-types/int-uint.md) has. Two arguments will have a range of maximum 2^32 (64/2) each, three arguments a range of max 2^21 (64/3) each and so on. All overflow will be clamped to zero. +Please note that you can fit only so many bits of information into Morton code as [UInt64](../data-types/int-uint.md) has. Two arguments will have a range of maximum 2^32 (64/2) each, three arguments a range of max 2^21 (64/3) each and so on. All overflow will be clamped to zero. ## mortonDecode @@ -601,11 +601,11 @@ mortonDecode(tuple_size, code) **Parameters** - `tuple_size`: integer value no more than 8. -- `code`: [UInt64](../../sql-reference/data-types/int-uint.md) code. +- `code`: [UInt64](../data-types/int-uint.md) code. **Returned value** -- [tuple](../../sql-reference/data-types/tuple.md) of the specified size. [UInt64](../../sql-reference/data-types/int-uint.md) +- [tuple](../data-types/tuple.md) of the specified size. [UInt64](../data-types/int-uint.md) **Example** diff --git a/docs/en/sql-reference/functions/encryption-functions.md b/docs/en/sql-reference/functions/encryption-functions.md index 00c9ef376d3..5d82e26eb32 100644 --- a/docs/en/sql-reference/functions/encryption-functions.md +++ b/docs/en/sql-reference/functions/encryption-functions.md @@ -30,15 +30,15 @@ encrypt('mode', 'plaintext', 'key' [, iv, aad]) **Arguments** -- `mode` — Encryption mode. [String](../../sql-reference/data-types/string.md#string). -- `plaintext` — Text that need to be encrypted. [String](../../sql-reference/data-types/string.md#string). -- `key` — Encryption key. [String](../../sql-reference/data-types/string.md#string). -- `iv` — Initialization vector. Required for `-gcm` modes, optional for others. [String](../../sql-reference/data-types/string.md#string). -- `aad` — Additional authenticated data. It isn't encrypted, but it affects decryption. Works only in `-gcm` modes, for others would throw an exception. [String](../../sql-reference/data-types/string.md#string). +- `mode` — Encryption mode. [String](../data-types/string.md#string). +- `plaintext` — Text that need to be encrypted. [String](../data-types/string.md#string). +- `key` — Encryption key. [String](../data-types/string.md#string). +- `iv` — Initialization vector. Required for `-gcm` modes, optional for others. [String](../data-types/string.md#string). +- `aad` — Additional authenticated data. It isn't encrypted, but it affects decryption. Works only in `-gcm` modes, for others would throw an exception. [String](../data-types/string.md#string). **Returned value** -- Ciphertext binary string. [String](../../sql-reference/data-types/string.md#string). +- Ciphertext binary string. [String](../data-types/string.md#string). 
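A minimal sketch of the call shape (the key and IV literals are placeholders of the lengths required by the `aes-256-ofb` mode, assuming that mode is available; they are not recommended secrets):

```sql
-- 32-byte key and 16-byte IV; hex() makes the binary ciphertext readable
SELECT hex(encrypt('aes-256-ofb', 'Secret', 'keykeykeykeykeykeykeykeykeykey12', 'iviviviviviviviv')) AS ciphertext_hex;
```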
**Examples** @@ -123,14 +123,14 @@ aes_encrypt_mysql('mode', 'plaintext', 'key' [, iv]) **Arguments** -- `mode` — Encryption mode. [String](../../sql-reference/data-types/string.md#string). -- `plaintext` — Text that needs to be encrypted. [String](../../sql-reference/data-types/string.md#string). -- `key` — Encryption key. If key is longer than required by mode, MySQL-specific key folding is performed. [String](../../sql-reference/data-types/string.md#string). -- `iv` — Initialization vector. Optional, only first 16 bytes are taken into account [String](../../sql-reference/data-types/string.md#string). +- `mode` — Encryption mode. [String](../data-types/string.md#string). +- `plaintext` — Text that needs to be encrypted. [String](../data-types/string.md#string). +- `key` — Encryption key. If key is longer than required by mode, MySQL-specific key folding is performed. [String](../data-types/string.md#string). +- `iv` — Initialization vector. Optional, only first 16 bytes are taken into account [String](../data-types/string.md#string). **Returned value** -- Ciphertext binary string. [String](../../sql-reference/data-types/string.md#string). +- Ciphertext binary string. [String](../data-types/string.md#string). **Examples** @@ -230,15 +230,15 @@ decrypt('mode', 'ciphertext', 'key' [, iv, aad]) **Arguments** -- `mode` — Decryption mode. [String](../../sql-reference/data-types/string.md#string). -- `ciphertext` — Encrypted text that needs to be decrypted. [String](../../sql-reference/data-types/string.md#string). -- `key` — Decryption key. [String](../../sql-reference/data-types/string.md#string). -- `iv` — Initialization vector. Required for `-gcm` modes, Optional for others. [String](../../sql-reference/data-types/string.md#string). -- `aad` — Additional authenticated data. Won't decrypt if this value is incorrect. Works only in `-gcm` modes, for others would throw an exception. [String](../../sql-reference/data-types/string.md#string). +- `mode` — Decryption mode. [String](../data-types/string.md#string). +- `ciphertext` — Encrypted text that needs to be decrypted. [String](../data-types/string.md#string). +- `key` — Decryption key. [String](../data-types/string.md#string). +- `iv` — Initialization vector. Required for `-gcm` modes, Optional for others. [String](../data-types/string.md#string). +- `aad` — Additional authenticated data. Won't decrypt if this value is incorrect. Works only in `-gcm` modes, for others would throw an exception. [String](../data-types/string.md#string). **Returned value** -- Decrypted String. [String](../../sql-reference/data-types/string.md#string). +- Decrypted String. [String](../data-types/string.md#string). **Examples** @@ -361,14 +361,14 @@ aes_decrypt_mysql('mode', 'ciphertext', 'key' [, iv]) **Arguments** -- `mode` — Decryption mode. [String](../../sql-reference/data-types/string.md#string). -- `ciphertext` — Encrypted text that needs to be decrypted. [String](../../sql-reference/data-types/string.md#string). -- `key` — Decryption key. [String](../../sql-reference/data-types/string.md#string). -- `iv` — Initialization vector. Optional. [String](../../sql-reference/data-types/string.md#string). +- `mode` — Decryption mode. [String](../data-types/string.md#string). +- `ciphertext` — Encrypted text that needs to be decrypted. [String](../data-types/string.md#string). +- `key` — Decryption key. [String](../data-types/string.md#string). +- `iv` — Initialization vector. Optional. [String](../data-types/string.md#string). 
**Returned value** -- Decrypted String. [String](../../sql-reference/data-types/string.md#string). +- Decrypted String. [String](../data-types/string.md#string). **Examples** diff --git a/docs/en/sql-reference/functions/ext-dict-functions.md b/docs/en/sql-reference/functions/ext-dict-functions.md index 41657aafbbe..82c21ce40c8 100644 --- a/docs/en/sql-reference/functions/ext-dict-functions.md +++ b/docs/en/sql-reference/functions/ext-dict-functions.md @@ -25,9 +25,9 @@ dictGetOrNull('dict_name', attr_name, id_expr) **Arguments** - `dict_name` — Name of the dictionary. [String literal](../../sql-reference/syntax.md#syntax-string-literal). -- `attr_names` — Name of the column of the dictionary, [String literal](../../sql-reference/syntax.md#syntax-string-literal), or tuple of column names, [Tuple](../../sql-reference/data-types/tuple.md)([String literal](../../sql-reference/syntax.md#syntax-string-literal)). -- `id_expr` — Key value. [Expression](../../sql-reference/syntax.md#syntax-expressions) returning dictionary key-type value or [Tuple](../../sql-reference/data-types/tuple.md)-type value depending on the dictionary configuration. -- `default_value_expr` — Values returned if the dictionary does not contain a row with the `id_expr` key. [Expression](../../sql-reference/syntax.md#syntax-expressions) or [Tuple](../../sql-reference/data-types/tuple.md)([Expression](../../sql-reference/syntax.md#syntax-expressions)), returning the value (or values) in the data types configured for the `attr_names` attribute. +- `attr_names` — Name of the column of the dictionary, [String literal](../../sql-reference/syntax.md#syntax-string-literal), or tuple of column names, [Tuple](../data-types/tuple.md)([String literal](../../sql-reference/syntax.md#syntax-string-literal)). +- `id_expr` — Key value. [Expression](../../sql-reference/syntax.md#syntax-expressions) returning dictionary key-type value or [Tuple](../data-types/tuple.md)-type value depending on the dictionary configuration. +- `default_value_expr` — Values returned if the dictionary does not contain a row with the `id_expr` key. [Expression](../../sql-reference/syntax.md#syntax-expressions) or [Tuple](../data-types/tuple.md)([Expression](../../sql-reference/syntax.md#syntax-expressions)), returning the value (or values) in the data types configured for the `attr_names` attribute. **Returned value** @@ -239,7 +239,7 @@ dictHas('dict_name', id_expr) **Arguments** - `dict_name` — Name of the dictionary. [String literal](../../sql-reference/syntax.md#syntax-string-literal). -- `id_expr` — Key value. [Expression](../../sql-reference/syntax.md#syntax-expressions) returning dictionary key-type value or [Tuple](../../sql-reference/data-types/tuple.md)-type value depending on the dictionary configuration. +- `id_expr` — Key value. [Expression](../../sql-reference/syntax.md#syntax-expressions) returning dictionary key-type value or [Tuple](../data-types/tuple.md)-type value depending on the dictionary configuration. **Returned value** @@ -259,11 +259,11 @@ dictGetHierarchy('dict_name', key) **Arguments** - `dict_name` — Name of the dictionary. [String literal](../../sql-reference/syntax.md#syntax-string-literal). -- `key` — Key value. [Expression](../../sql-reference/syntax.md#syntax-expressions) returning a [UInt64](../../sql-reference/data-types/int-uint.md)-type value. +- `key` — Key value. [Expression](../../sql-reference/syntax.md#syntax-expressions) returning a [UInt64](../data-types/int-uint.md)-type value. **Returned value** -- Parents for the key. 
[Array(UInt64)](../../sql-reference/data-types/array.md). +- Parents for the key. [Array(UInt64)](../data-types/array.md). ## dictIsIn @@ -276,8 +276,8 @@ dictIsIn('dict_name', child_id_expr, ancestor_id_expr) **Arguments** - `dict_name` — Name of the dictionary. [String literal](../../sql-reference/syntax.md#syntax-string-literal). -- `child_id_expr` — Key to be checked. [Expression](../../sql-reference/syntax.md#syntax-expressions) returning a [UInt64](../../sql-reference/data-types/int-uint.md)-type value. -- `ancestor_id_expr` — Alleged ancestor of the `child_id_expr` key. [Expression](../../sql-reference/syntax.md#syntax-expressions) returning a [UInt64](../../sql-reference/data-types/int-uint.md)-type value. +- `child_id_expr` — Key to be checked. [Expression](../../sql-reference/syntax.md#syntax-expressions) returning a [UInt64](../data-types/int-uint.md)-type value. +- `ancestor_id_expr` — Alleged ancestor of the `child_id_expr` key. [Expression](../../sql-reference/syntax.md#syntax-expressions) returning a [UInt64](../data-types/int-uint.md)-type value. **Returned value** @@ -297,11 +297,11 @@ dictGetChildren(dict_name, key) **Arguments** - `dict_name` — Name of the dictionary. [String literal](../../sql-reference/syntax.md#syntax-string-literal). -- `key` — Key value. [Expression](../../sql-reference/syntax.md#syntax-expressions) returning a [UInt64](../../sql-reference/data-types/int-uint.md)-type value. +- `key` — Key value. [Expression](../../sql-reference/syntax.md#syntax-expressions) returning a [UInt64](../data-types/int-uint.md)-type value. **Returned values** -- First-level descendants for the key. [Array](../../sql-reference/data-types/array.md)([UInt64](../../sql-reference/data-types/int-uint.md)). +- First-level descendants for the key. [Array](../data-types/array.md)([UInt64](../data-types/int-uint.md)). **Example** @@ -344,12 +344,12 @@ dictGetDescendants(dict_name, key, level) **Arguments** - `dict_name` — Name of the dictionary. [String literal](../../sql-reference/syntax.md#syntax-string-literal). -- `key` — Key value. [Expression](../../sql-reference/syntax.md#syntax-expressions) returning a [UInt64](../../sql-reference/data-types/int-uint.md)-type value. -- `level` — Hierarchy level. If `level = 0` returns all descendants to the end. [UInt8](../../sql-reference/data-types/int-uint.md). +- `key` — Key value. [Expression](../../sql-reference/syntax.md#syntax-expressions) returning a [UInt64](../data-types/int-uint.md)-type value. +- `level` — Hierarchy level. If `level = 0` returns all descendants to the end. [UInt8](../data-types/int-uint.md). **Returned values** -- Descendants for the key. [Array](../../sql-reference/data-types/array.md)([UInt64](../../sql-reference/data-types/int-uint.md)). +- Descendants for the key. [Array](../data-types/array.md)([UInt64](../data-types/int-uint.md)). **Example** @@ -409,8 +409,8 @@ dictGetAll('dict_name', attr_names, id_expr[, limit]) **Arguments** - `dict_name` — Name of the dictionary. [String literal](../../sql-reference/syntax.md#syntax-string-literal). -- `attr_names` — Name of the column of the dictionary, [String literal](../../sql-reference/syntax.md#syntax-string-literal), or tuple of column names, [Tuple](../../sql-reference/data-types/tuple.md)([String literal](../../sql-reference/syntax.md#syntax-string-literal)). -- `id_expr` — Key value. 
[Expression](../../sql-reference/syntax.md#syntax-expressions) returning array of dictionary key-type value or [Tuple](../../sql-reference/data-types/tuple.md)-type value depending on the dictionary configuration. +- `attr_names` — Name of the column of the dictionary, [String literal](../../sql-reference/syntax.md#syntax-string-literal), or tuple of column names, [Tuple](../data-types/tuple.md)([String literal](../../sql-reference/syntax.md#syntax-string-literal)). +- `id_expr` — Key value. [Expression](../../sql-reference/syntax.md#syntax-expressions) returning array of dictionary key-type value or [Tuple](../data-types/tuple.md)-type value depending on the dictionary configuration. - `limit` - Maximum length for each value array returned. When truncating, child nodes are given precedence over parent nodes, and otherwise the defined list order for the regexp tree dictionary is respected. If unspecified, array length is unlimited. **Returned value** @@ -499,7 +499,7 @@ dictGet[Type]OrDefault('dict_name', 'attr_name', id_expr, default_value_expr) - `dict_name` — Name of the dictionary. [String literal](../../sql-reference/syntax.md#syntax-string-literal). - `attr_name` — Name of the column of the dictionary. [String literal](../../sql-reference/syntax.md#syntax-string-literal). -- `id_expr` — Key value. [Expression](../../sql-reference/syntax.md#syntax-expressions) returning a [UInt64](../../sql-reference/data-types/int-uint.md) or [Tuple](../../sql-reference/data-types/tuple.md)-type value depending on the dictionary configuration. +- `id_expr` — Key value. [Expression](../../sql-reference/syntax.md#syntax-expressions) returning a [UInt64](../data-types/int-uint.md) or [Tuple](../data-types/tuple.md)-type value depending on the dictionary configuration. - `default_value_expr` — Value returned if the dictionary does not contain a row with the `id_expr` key. [Expression](../../sql-reference/syntax.md#syntax-expressions) returning the value in the data type configured for the `attr_name` attribute. **Returned value** diff --git a/docs/en/sql-reference/functions/files.md b/docs/en/sql-reference/functions/files.md index d62cd1db88d..ac9e21cd416 100644 --- a/docs/en/sql-reference/functions/files.md +++ b/docs/en/sql-reference/functions/files.md @@ -19,7 +19,7 @@ file(path[, default]) **Arguments** - `path` — The path of the file relative to [user_files_path](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-user_files_path). Supports wildcards `*`, `**`, `?`, `{abc,def}` and `{N..M}` where `N`, `M` are numbers and `'abc', 'def'` are strings. -- `default` — The value returned if the file does not exist or cannot be accessed. Supported data types: [String](../../sql-reference/data-types/string.md) and [NULL](../../sql-reference/syntax.md#null-literal). +- `default` — The value returned if the file does not exist or cannot be accessed. Supported data types: [String](../data-types/string.md) and [NULL](../../sql-reference/syntax.md#null-literal). **Example** diff --git a/docs/en/sql-reference/functions/functions-for-nulls.md b/docs/en/sql-reference/functions/functions-for-nulls.md index 90520145b9d..a0dfbebc8ae 100644 --- a/docs/en/sql-reference/functions/functions-for-nulls.md +++ b/docs/en/sql-reference/functions/functions-for-nulls.md @@ -351,7 +351,7 @@ Result: ## assumeNotNull -Returns the corresponding non-`Nullable` value for a value of [Nullable](../../sql-reference/data-types/nullable.md) type. 
If the original value is `NULL`, an arbitrary result can be returned. See also functions `ifNull` and `coalesce`. +Returns the corresponding non-`Nullable` value for a value of [Nullable](../data-types/nullable.md) type. If the original value is `NULL`, an arbitrary result can be returned. See also functions `ifNull` and `coalesce`. ``` sql assumeNotNull(x) diff --git a/docs/en/sql-reference/functions/geo/coordinates.md b/docs/en/sql-reference/functions/geo/coordinates.md index 1cbc1933206..d10573b8995 100644 --- a/docs/en/sql-reference/functions/geo/coordinates.md +++ b/docs/en/sql-reference/functions/geo/coordinates.md @@ -152,8 +152,8 @@ pointInPolygon((x, y), [(a, b), (c, d) ...], ...) **Input values** -- `(x, y)` — Coordinates of a point on the plane. Data type — [Tuple](../../../sql-reference/data-types/tuple.md) — A tuple of two numbers. -- `[(a, b), (c, d) ...]` — Polygon vertices. Data type — [Array](../../../sql-reference/data-types/array.md). Each vertex is represented by a pair of coordinates `(a, b)`. Vertices should be specified in a clockwise or counterclockwise order. The minimum number of vertices is 3. The polygon must be constant. +- `(x, y)` — Coordinates of a point on the plane. Data type — [Tuple](../../data-types/tuple.md) — A tuple of two numbers. +- `[(a, b), (c, d) ...]` — Polygon vertices. Data type — [Array](../../data-types/array.md). Each vertex is represented by a pair of coordinates `(a, b)`. Vertices should be specified in a clockwise or counterclockwise order. The minimum number of vertices is 3. The polygon must be constant. - The function also supports polygons with holes (cut out sections). In this case, add polygons that define the cut out sections using additional arguments of the function. The function does not support non-simply-connected polygons. **Returned values** diff --git a/docs/en/sql-reference/functions/geo/geohash.md b/docs/en/sql-reference/functions/geo/geohash.md index 80c55650b9c..8abc8006e5d 100644 --- a/docs/en/sql-reference/functions/geo/geohash.md +++ b/docs/en/sql-reference/functions/geo/geohash.md @@ -74,11 +74,11 @@ geohashesInBox(longitude_min, latitude_min, longitude_max, latitude_max, precisi **Arguments** -- `longitude_min` — Minimum longitude. Range: `[-180°, 180°]`. [Float](../../../sql-reference/data-types/float.md). -- `latitude_min` — Minimum latitude. Range: `[-90°, 90°]`. [Float](../../../sql-reference/data-types/float.md). -- `longitude_max` — Maximum longitude. Range: `[-180°, 180°]`. [Float](../../../sql-reference/data-types/float.md). -- `latitude_max` — Maximum latitude. Range: `[-90°, 90°]`. [Float](../../../sql-reference/data-types/float.md). -- `precision` — Geohash precision. Range: `[1, 12]`. [UInt8](../../../sql-reference/data-types/int-uint.md). +- `longitude_min` — Minimum longitude. Range: `[-180°, 180°]`. [Float](../../data-types/float.md). +- `latitude_min` — Minimum latitude. Range: `[-90°, 90°]`. [Float](../../data-types/float.md). +- `longitude_max` — Maximum longitude. Range: `[-180°, 180°]`. [Float](../../data-types/float.md). +- `latitude_max` — Maximum latitude. Range: `[-90°, 90°]`. [Float](../../data-types/float.md). +- `precision` — Geohash precision. Range: `[1, 12]`. [UInt8](../../data-types/int-uint.md). :::note All coordinate parameters must be of the same type: either `Float32` or `Float64`. 
@@ -86,7 +86,7 @@ All coordinate parameters must be of the same type: either `Float32` or `Float64 **Returned values** -- Array of precision-long strings of geohash-boxes covering provided area, you should not rely on order of items. [Array](../../../sql-reference/data-types/array.md)([String](../../../sql-reference/data-types/string.md)). +- Array of precision-long strings of geohash-boxes covering provided area, you should not rely on order of items. [Array](../../data-types/array.md)([String](../../data-types/string.md)). - `[]` - Empty array if minimum latitude and longitude values aren’t less than corresponding maximum values. :::note diff --git a/docs/en/sql-reference/functions/geo/h3.md b/docs/en/sql-reference/functions/geo/h3.md index 7faff8288b3..bcdd457964a 100644 --- a/docs/en/sql-reference/functions/geo/h3.md +++ b/docs/en/sql-reference/functions/geo/h3.md @@ -26,12 +26,12 @@ h3IsValid(h3index) **Parameter** -- `h3index` — Hexagon index number. [UInt64](../../../sql-reference/data-types/int-uint.md). +- `h3index` — Hexagon index number. [UInt64](../../data-types/int-uint.md). **Returned values** -- 1 — The number is a valid H3 index. [UInt8](../../../sql-reference/data-types/int-uint.md). -- 0 — The number is not a valid H3 index. [UInt8](../../../sql-reference/data-types/int-uint.md). +- 1 — The number is a valid H3 index. [UInt8](../../data-types/int-uint.md). +- 0 — The number is not a valid H3 index. [UInt8](../../data-types/int-uint.md). **Example** @@ -61,12 +61,12 @@ h3GetResolution(h3index) **Parameter** -- `h3index` — Hexagon index number. [UInt64](../../../sql-reference/data-types/int-uint.md). +- `h3index` — Hexagon index number. [UInt64](../../data-types/int-uint.md). **Returned values** -- Index resolution. Range: `[0, 15]`. [UInt8](../../../sql-reference/data-types/int-uint.md). -- If the index is not valid, the function returns a random value. Use [h3IsValid](#h3isvalid) to verify the index. [UInt8](../../../sql-reference/data-types/int-uint.md). +- Index resolution. Range: `[0, 15]`. [UInt8](../../data-types/int-uint.md). +- If the index is not valid, the function returns a random value. Use [h3IsValid](#h3isvalid) to verify the index. [UInt8](../../data-types/int-uint.md). **Example** @@ -96,11 +96,11 @@ h3EdgeAngle(resolution) **Parameter** -- `resolution` — Index resolution. [UInt8](../../../sql-reference/data-types/int-uint.md). Range: `[0, 15]`. +- `resolution` — Index resolution. [UInt8](../../data-types/int-uint.md). Range: `[0, 15]`. **Returned values** -- The average length of the [H3](#h3index) hexagon edge in grades. [Float64](../../../sql-reference/data-types/float.md). +- The average length of the [H3](#h3index) hexagon edge in grades. [Float64](../../data-types/float.md). **Example** @@ -130,11 +130,11 @@ h3EdgeLengthM(resolution) **Parameter** -- `resolution` — Index resolution. [UInt8](../../../sql-reference/data-types/int-uint.md). Range: `[0, 15]`. +- `resolution` — Index resolution. [UInt8](../../data-types/int-uint.md). Range: `[0, 15]`. **Returned values** -- The average length of the [H3](#h3index) hexagon edge in meters. [Float64](../../../sql-reference/data-types/float.md). +- The average length of the [H3](#h3index) hexagon edge in meters. [Float64](../../data-types/float.md). **Example** @@ -164,11 +164,11 @@ h3EdgeLengthKm(resolution) **Parameter** -- `resolution` — Index resolution. [UInt8](../../../sql-reference/data-types/int-uint.md). Range: `[0, 15]`. +- `resolution` — Index resolution. [UInt8](../../data-types/int-uint.md). 
Range: `[0, 15]`. **Returned values** -- The average length of the [H3](#h3index) hexagon edge in kilometers. [Float64](../../../sql-reference/data-types/float.md). +- The average length of the [H3](#h3index) hexagon edge in kilometers. [Float64](../../data-types/float.md). **Example** @@ -198,14 +198,14 @@ geoToH3(lon, lat, resolution) **Arguments** -- `lon` — Longitude. [Float64](../../../sql-reference/data-types/float.md). -- `lat` — Latitude. [Float64](../../../sql-reference/data-types/float.md). -- `resolution` — Index resolution. Range: `[0, 15]`. [UInt8](../../../sql-reference/data-types/int-uint.md). +- `lon` — Longitude. [Float64](../../data-types/float.md). +- `lat` — Latitude. [Float64](../../data-types/float.md). +- `resolution` — Index resolution. Range: `[0, 15]`. [UInt8](../../data-types/int-uint.md). **Returned values** -- Hexagon index number. [UInt64](../../../sql-reference/data-types/int-uint.md). -- 0 in case of error. [UInt64](../../../sql-reference/data-types/int-uint.md). +- Hexagon index number. [UInt64](../../data-types/int-uint.md). +- 0 in case of error. [UInt64](../../data-types/int-uint.md). **Example** @@ -235,11 +235,11 @@ h3ToGeo(h3Index) **Arguments** -- `h3Index` — H3 Index. [UInt64](../../../sql-reference/data-types/int-uint.md). +- `h3Index` — H3 Index. [UInt64](../../data-types/int-uint.md). **Returned values** -- A tuple consisting of two values: `tuple(lon,lat)`. `lon` — Longitude. [Float64](../../../sql-reference/data-types/float.md). `lat` — Latitude. [Float64](../../../sql-reference/data-types/float.md). +- A tuple consisting of two values: `tuple(lon,lat)`. `lon` — Longitude. [Float64](../../data-types/float.md). `lat` — Latitude. [Float64](../../data-types/float.md). **Example** @@ -269,11 +269,11 @@ h3ToGeoBoundary(h3Index) **Arguments** -- `h3Index` — H3 Index. [UInt64](../../../sql-reference/data-types/int-uint.md). +- `h3Index` — H3 Index. [UInt64](../../data-types/int-uint.md). **Returned values** -- Array of pairs '(lon, lat)'. [Array](../../../sql-reference/data-types/array.md)([Float64](../../../sql-reference/data-types/float.md), [Float64](../../../sql-reference/data-types/float.md)). +- Array of pairs '(lon, lat)'. [Array](../../data-types/array.md)([Float64](../../data-types/float.md), [Float64](../../data-types/float.md)). **Example** @@ -304,12 +304,12 @@ h3kRing(h3index, k) **Arguments** -- `h3index` — Hexagon index number. [UInt64](../../../sql-reference/data-types/int-uint.md). -- `k` — Radius. [integer](../../../sql-reference/data-types/int-uint.md) +- `h3index` — Hexagon index number. [UInt64](../../data-types/int-uint.md). +- `k` — Radius. [integer](../../data-types/int-uint.md) **Returned values** -- Array of H3 indexes. [Array](../../../sql-reference/data-types/array.md)([UInt64](../../../sql-reference/data-types/int-uint.md)). +- Array of H3 indexes. [Array](../../data-types/array.md)([UInt64](../../data-types/int-uint.md)). **Example** @@ -345,11 +345,11 @@ h3GetBaseCell(index) **Parameter** -- `index` — Hexagon index number. [UInt64](../../../sql-reference/data-types/int-uint.md). +- `index` — Hexagon index number. [UInt64](../../data-types/int-uint.md). **Returned value** -- Hexagon base cell number. [UInt8](../../../sql-reference/data-types/int-uint.md). +- Hexagon base cell number. [UInt8](../../data-types/int-uint.md). **Example** @@ -379,11 +379,11 @@ h3HexAreaM2(resolution) **Parameter** -- `resolution` — Index resolution. Range: `[0, 15]`. [UInt8](../../../sql-reference/data-types/int-uint.md). 
+- `resolution` — Index resolution. Range: `[0, 15]`. [UInt8](../../data-types/int-uint.md). **Returned value** -- Area in square meters. [Float64](../../../sql-reference/data-types/float.md). +- Area in square meters. [Float64](../../data-types/float.md). **Example** @@ -413,11 +413,11 @@ h3HexAreaKm2(resolution) **Parameter** -- `resolution` — Index resolution. Range: `[0, 15]`. [UInt8](../../../sql-reference/data-types/int-uint.md). +- `resolution` — Index resolution. Range: `[0, 15]`. [UInt8](../../data-types/int-uint.md). **Returned value** -- Area in square kilometers. [Float64](../../../sql-reference/data-types/float.md). +- Area in square kilometers. [Float64](../../data-types/float.md). **Example** @@ -447,13 +447,13 @@ h3IndexesAreNeighbors(index1, index2) **Arguments** -- `index1` — Hexagon index number. [UInt64](../../../sql-reference/data-types/int-uint.md). -- `index2` — Hexagon index number. [UInt64](../../../sql-reference/data-types/int-uint.md). +- `index1` — Hexagon index number. [UInt64](../../data-types/int-uint.md). +- `index2` — Hexagon index number. [UInt64](../../data-types/int-uint.md). **Returned value** -- `1` — Indexes are neighbours. [UInt8](../../../sql-reference/data-types/int-uint.md). -- `0` — Indexes are not neighbours. [UInt8](../../../sql-reference/data-types/int-uint.md). +- `1` — Indexes are neighbours. [UInt8](../../data-types/int-uint.md). +- `0` — Indexes are not neighbours. [UInt8](../../data-types/int-uint.md). **Example** @@ -483,12 +483,12 @@ h3ToChildren(index, resolution) **Arguments** -- `index` — Hexagon index number. [UInt64](../../../sql-reference/data-types/int-uint.md). -- `resolution` — Index resolution. Range: `[0, 15]`. [UInt8](../../../sql-reference/data-types/int-uint.md). +- `index` — Hexagon index number. [UInt64](../../data-types/int-uint.md). +- `resolution` — Index resolution. Range: `[0, 15]`. [UInt8](../../data-types/int-uint.md). **Returned values** -- Array of the child H3-indexes. [Array](../../../sql-reference/data-types/array.md)([UInt64](../../../sql-reference/data-types/int-uint.md)). +- Array of the child H3-indexes. [Array](../../data-types/array.md)([UInt64](../../data-types/int-uint.md)). **Example** @@ -518,12 +518,12 @@ h3ToParent(index, resolution) **Arguments** -- `index` — Hexagon index number. [UInt64](../../../sql-reference/data-types/int-uint.md). -- `resolution` — Index resolution. Range: `[0, 15]`. [UInt8](../../../sql-reference/data-types/int-uint.md). +- `index` — Hexagon index number. [UInt64](../../data-types/int-uint.md). +- `resolution` — Index resolution. Range: `[0, 15]`. [UInt8](../../data-types/int-uint.md). **Returned value** -- Parent H3 index. [UInt64](../../../sql-reference/data-types/int-uint.md). +- Parent H3 index. [UInt64](../../data-types/int-uint.md). **Example** @@ -551,11 +551,11 @@ h3ToString(index) **Parameter** -- `index` — Hexagon index number. [UInt64](../../../sql-reference/data-types/int-uint.md). +- `index` — Hexagon index number. [UInt64](../../data-types/int-uint.md). **Returned value** -- String representation of the H3 index. [String](../../../sql-reference/data-types/string.md). +- String representation of the H3 index. [String](../../data-types/string.md). **Example** @@ -585,11 +585,11 @@ stringToH3(index_str) **Parameter** -- `index_str` — String representation of the H3 index. [String](../../../sql-reference/data-types/string.md). +- `index_str` — String representation of the H3 index. [String](../../data-types/string.md). **Returned value** -- Hexagon index number. 
Returns 0 on error. [UInt64](../../../sql-reference/data-types/int-uint.md). +- Hexagon index number. Returns 0 on error. [UInt64](../../data-types/int-uint.md). **Example** @@ -619,11 +619,11 @@ h3GetResolution(index) **Parameter** -- `index` — Hexagon index number. [UInt64](../../../sql-reference/data-types/int-uint.md). +- `index` — Hexagon index number. [UInt64](../../data-types/int-uint.md). **Returned value** -- Index resolution. Range: `[0, 15]`. [UInt8](../../../sql-reference/data-types/int-uint.md). +- Index resolution. Range: `[0, 15]`. [UInt8](../../data-types/int-uint.md). **Example** @@ -653,12 +653,12 @@ h3IsResClassIII(index) **Parameter** -- `index` — Hexagon index number. [UInt64](../../../sql-reference/data-types/int-uint.md). +- `index` — Hexagon index number. [UInt64](../../data-types/int-uint.md). **Returned value** -- `1` — Index has a resolution with Class III orientation. [UInt8](../../../sql-reference/data-types/int-uint.md). -- `0` — Index doesn't have a resolution with Class III orientation. [UInt8](../../../sql-reference/data-types/int-uint.md). +- `1` — Index has a resolution with Class III orientation. [UInt8](../../data-types/int-uint.md). +- `0` — Index doesn't have a resolution with Class III orientation. [UInt8](../../data-types/int-uint.md). **Example** @@ -688,12 +688,12 @@ h3IsPentagon(index) **Parameter** -- `index` — Hexagon index number. [UInt64](../../../sql-reference/data-types/int-uint.md). +- `index` — Hexagon index number. [UInt64](../../data-types/int-uint.md). **Returned value** -- `1` — Index represents a pentagonal cell. [UInt8](../../../sql-reference/data-types/int-uint.md). -- `0` — Index doesn't represent a pentagonal cell. [UInt8](../../../sql-reference/data-types/int-uint.md). +- `1` — Index represents a pentagonal cell. [UInt8](../../data-types/int-uint.md). +- `0` — Index doesn't represent a pentagonal cell. [UInt8](../../data-types/int-uint.md). **Example** @@ -723,11 +723,11 @@ h3GetFaces(index) **Parameter** -- `index` — Hexagon index number. [UInt64](../../../sql-reference/data-types/int-uint.md). +- `index` — Hexagon index number. [UInt64](../../data-types/int-uint.md). **Returned values** -- Array containing icosahedron faces intersected by a given H3 index. [Array](../../../sql-reference/data-types/array.md)([UInt64](../../../sql-reference/data-types/int-uint.md)). +- Array containing icosahedron faces intersected by a given H3 index. [Array](../../data-types/array.md)([UInt64](../../data-types/int-uint.md)). **Example** @@ -757,11 +757,11 @@ h3CellAreaM2(index) **Parameter** -- `index` — Hexagon index number. [UInt64](../../../sql-reference/data-types/int-uint.md). +- `index` — Hexagon index number. [UInt64](../../data-types/int-uint.md). **Returned value** -- Cell area in square meters. [Float64](../../../sql-reference/data-types/float.md). +- Cell area in square meters. [Float64](../../data-types/float.md). **Example** @@ -791,11 +791,11 @@ h3CellAreaRads2(index) **Parameter** -- `index` — Hexagon index number. [UInt64](../../../sql-reference/data-types/int-uint.md). +- `index` — Hexagon index number. [UInt64](../../data-types/int-uint.md). **Returned value** -- Cell area in square radians. [Float64](../../../sql-reference/data-types/float.md). +- Cell area in square radians. [Float64](../../data-types/float.md). **Example** @@ -825,12 +825,12 @@ h3ToCenterChild(index, resolution) **Parameter** -- `index` — Hexagon index number. [UInt64](../../../sql-reference/data-types/int-uint.md). -- `resolution` — Index resolution. 
Range: `[0, 15]`. [UInt8](../../../sql-reference/data-types/int-uint.md). +- `index` — Hexagon index number. [UInt64](../../data-types/int-uint.md). +- `resolution` — Index resolution. Range: `[0, 15]`. [UInt8](../../data-types/int-uint.md). **Returned values** -- [H3](#h3index) index of the center child contained by given [H3](#h3index) at the given resolution. [UInt64](../../../sql-reference/data-types/int-uint.md). +- [H3](#h3index) index of the center child contained by given [H3](#h3index) at the given resolution. [UInt64](../../data-types/int-uint.md). **Example** @@ -860,11 +860,11 @@ h3ExactEdgeLengthM(index) **Parameter** -- `index` — Hexagon index number. [UInt64](../../../sql-reference/data-types/int-uint.md). +- `index` — Hexagon index number. [UInt64](../../data-types/int-uint.md). **Returned value** -- Exact edge length in meters. [Float64](../../../sql-reference/data-types/float.md). +- Exact edge length in meters. [Float64](../../data-types/float.md). **Example** @@ -894,11 +894,11 @@ h3ExactEdgeLengthKm(index) **Parameter** -- `index` — Hexagon index number. [UInt64](../../../sql-reference/data-types/int-uint.md). +- `index` — Hexagon index number. [UInt64](../../data-types/int-uint.md). **Returned value** -- Exact edge length in kilometers. [Float64](../../../sql-reference/data-types/float.md). +- Exact edge length in kilometers. [Float64](../../data-types/float.md). **Example** @@ -928,11 +928,11 @@ h3ExactEdgeLengthRads(index) **Parameter** -- `index` — Hexagon index number. [UInt64](../../../sql-reference/data-types/int-uint.md). +- `index` — Hexagon index number. [UInt64](../../data-types/int-uint.md). **Returned value** -- Exact edge length in radians. [Float64](../../../sql-reference/data-types/float.md). +- Exact edge length in radians. [Float64](../../data-types/float.md). **Example** @@ -962,11 +962,11 @@ h3NumHexagons(resolution) **Parameter** -- `resolution` — Index resolution. Range: `[0, 15]`. [UInt8](../../../sql-reference/data-types/int-uint.md). +- `resolution` — Index resolution. Range: `[0, 15]`. [UInt8](../../data-types/int-uint.md). **Returned value** -- Number of H3 indices. [Int64](../../../sql-reference/data-types/int-uint.md). +- Number of H3 indices. [Int64](../../data-types/int-uint.md). **Example** @@ -996,12 +996,12 @@ h3PointDistM(lat1, lon1, lat2, lon2) **Arguments** -- `lat1`, `lon1` — Latitude and Longitude of point1 in degrees. [Float64](../../../sql-reference/data-types/float.md). -- `lat2`, `lon2` — Latitude and Longitude of point2 in degrees. [Float64](../../../sql-reference/data-types/float.md). +- `lat1`, `lon1` — Latitude and Longitude of point1 in degrees. [Float64](../../data-types/float.md). +- `lat2`, `lon2` — Latitude and Longitude of point2 in degrees. [Float64](../../data-types/float.md). **Returned values** -- Haversine or great circle distance in meters.[Float64](../../../sql-reference/data-types/float.md). +- Haversine or great circle distance in meters.[Float64](../../data-types/float.md). **Example** @@ -1031,12 +1031,12 @@ h3PointDistKm(lat1, lon1, lat2, lon2) **Arguments** -- `lat1`, `lon1` — Latitude and Longitude of point1 in degrees. [Float64](../../../sql-reference/data-types/float.md). -- `lat2`, `lon2` — Latitude and Longitude of point2 in degrees. [Float64](../../../sql-reference/data-types/float.md). +- `lat1`, `lon1` — Latitude and Longitude of point1 in degrees. [Float64](../../data-types/float.md). +- `lat2`, `lon2` — Latitude and Longitude of point2 in degrees. [Float64](../../data-types/float.md). 
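For orientation, a minimal `h3PointDistKm` call might look like the sketch below; the coordinates are arbitrary sample values, and the latitude, longitude argument order follows the signature shown above.

```sql
-- Illustrative only: Haversine / great-circle distance in kilometers between
-- two arbitrary sample points given as (lat, lon) pairs in degrees.
SELECT h3PointDistKm(55.7512, 37.6184, 59.9386, 30.3141) AS distance_km;
```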
**Returned values** -- Haversine or great circle distance in kilometers. [Float64](../../../sql-reference/data-types/float.md). +- Haversine or great circle distance in kilometers. [Float64](../../data-types/float.md). **Example** @@ -1066,12 +1066,12 @@ h3PointDistRads(lat1, lon1, lat2, lon2) **Arguments** -- `lat1`, `lon1` — Latitude and Longitude of point1 in degrees. [Float64](../../../sql-reference/data-types/float.md). -- `lat2`, `lon2` — Latitude and Longitude of point2 in degrees. [Float64](../../../sql-reference/data-types/float.md). +- `lat1`, `lon1` — Latitude and Longitude of point1 in degrees. [Float64](../../data-types/float.md). +- `lat2`, `lon2` — Latitude and Longitude of point2 in degrees. [Float64](../../data-types/float.md). **Returned values** -- Haversine or great circle distance in radians. [Float64](../../../sql-reference/data-types/float.md). +- Haversine or great circle distance in radians. [Float64](../../data-types/float.md). **Example** @@ -1101,7 +1101,7 @@ h3GetRes0Indexes() **Returned values** -- Array of all the resolution 0 H3 indexes. [Array](../../../sql-reference/data-types/array.md)([UInt64](../../../sql-reference/data-types/int-uint.md)). +- Array of all the resolution 0 H3 indexes. [Array](../../data-types/array.md)([UInt64](../../data-types/int-uint.md)). **Example** @@ -1132,11 +1132,11 @@ h3GetPentagonIndexes(resolution) **Parameter** -- `resolution` — Index resolution. Range: `[0, 15]`. [UInt8](../../../sql-reference/data-types/int-uint.md). +- `resolution` — Index resolution. Range: `[0, 15]`. [UInt8](../../data-types/int-uint.md). **Returned value** -- Array of all pentagon H3 indexes. [Array](../../../sql-reference/data-types/array.md)([UInt64](../../../sql-reference/data-types/int-uint.md)). +- Array of all pentagon H3 indexes. [Array](../../data-types/array.md)([UInt64](../../data-types/int-uint.md)). **Example** @@ -1166,12 +1166,12 @@ h3Line(start,end) **Parameter** -- `start` — Hexagon index number that represents a starting point. [UInt64](../../../sql-reference/data-types/int-uint.md). -- `end` — Hexagon index number that represents an ending point. [UInt64](../../../sql-reference/data-types/int-uint.md). +- `start` — Hexagon index number that represents a starting point. [UInt64](../../data-types/int-uint.md). +- `end` — Hexagon index number that represents an ending point. [UInt64](../../data-types/int-uint.md). **Returned value** -Array of h3 indexes representing the line of indices between the two provided indices. [Array](../../../sql-reference/data-types/array.md)([UInt64](../../../sql-reference/data-types/int-uint.md)). +Array of h3 indexes representing the line of indices between the two provided indices. [Array](../../data-types/array.md)([UInt64](../../data-types/int-uint.md)). **Example** @@ -1201,12 +1201,12 @@ h3Distance(start,end) **Parameter** -- `start` — Hexagon index number that represents a starting point. [UInt64](../../../sql-reference/data-types/int-uint.md). -- `end` — Hexagon index number that represents an ending point. [UInt64](../../../sql-reference/data-types/int-uint.md). +- `start` — Hexagon index number that represents a starting point. [UInt64](../../data-types/int-uint.md). +- `end` — Hexagon index number that represents an ending point. [UInt64](../../data-types/int-uint.md). **Returned value** -- Number of grid cells. [Int64](../../../sql-reference/data-types/int-uint.md). +- Number of grid cells. [Int64](../../data-types/int-uint.md). Returns a negative number if finding the distance fails. 
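Because the negative return value on failure is easy to miss, a usage sketch for `h3Distance` follows; both indexes are built from arbitrary sample coordinates with `geoToH3` so that they share the same resolution, which is an assumption of this illustration.

```sql
-- Illustrative only: grid distance between two cells indexed at resolution 9.
-- A negative result would indicate that the distance could not be computed.
SELECT h3Distance(
    geoToH3(37.6184, 55.7512, 9),
    geoToH3(37.6300, 55.7600, 9)
) AS grid_distance;
```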
@@ -1240,12 +1240,12 @@ h3HexRing(index, k) **Parameter** -- `index` — Hexagon index number that represents the origin. [UInt64](../../../sql-reference/data-types/int-uint.md). -- `k` — Distance. [UInt64](../../../sql-reference/data-types/int-uint.md). +- `index` — Hexagon index number that represents the origin. [UInt64](../../data-types/int-uint.md). +- `k` — Distance. [UInt64](../../data-types/int-uint.md). **Returned values** -- Array of H3 indexes. [Array](../../../sql-reference/data-types/array.md)([UInt64](../../../sql-reference/data-types/int-uint.md)). +- Array of H3 indexes. [Array](../../data-types/array.md)([UInt64](../../data-types/int-uint.md)). **Example** @@ -1275,12 +1275,12 @@ h3GetUnidirectionalEdge(originIndex, destinationIndex) **Parameter** -- `originIndex` — Origin Hexagon index number. [UInt64](../../../sql-reference/data-types/int-uint.md). -- `destinationIndex` — Destination Hexagon index number. [UInt64](../../../sql-reference/data-types/int-uint.md). +- `originIndex` — Origin Hexagon index number. [UInt64](../../data-types/int-uint.md). +- `destinationIndex` — Destination Hexagon index number. [UInt64](../../data-types/int-uint.md). **Returned value** -- Unidirectional Edge Hexagon Index number. [UInt64](../../../sql-reference/data-types/int-uint.md). +- Unidirectional Edge Hexagon Index number. [UInt64](../../data-types/int-uint.md). **Example** @@ -1310,12 +1310,12 @@ h3UnidirectionalEdgeisValid(index) **Parameter** -- `index` — Hexagon index number. [UInt64](../../../sql-reference/data-types/int-uint.md). +- `index` — Hexagon index number. [UInt64](../../data-types/int-uint.md). **Returned value** -- 1 — The H3 index is a valid unidirectional edge. [UInt8](../../../sql-reference/data-types/int-uint.md). -- 0 — The H3 index is not a valid unidirectional edge. [UInt8](../../../sql-reference/data-types/int-uint.md). +- 1 — The H3 index is a valid unidirectional edge. [UInt8](../../data-types/int-uint.md). +- 0 — The H3 index is not a valid unidirectional edge. [UInt8](../../data-types/int-uint.md). **Example** @@ -1345,11 +1345,11 @@ h3GetOriginIndexFromUnidirectionalEdge(edge) **Parameter** -- `edge` — Hexagon index number that represents a unidirectional edge. [UInt64](../../../sql-reference/data-types/int-uint.md). +- `edge` — Hexagon index number that represents a unidirectional edge. [UInt64](../../data-types/int-uint.md). **Returned value** -- Origin Hexagon Index number. [UInt64](../../../sql-reference/data-types/int-uint.md). +- Origin Hexagon Index number. [UInt64](../../data-types/int-uint.md). **Example** @@ -1379,11 +1379,11 @@ h3GetDestinationIndexFromUnidirectionalEdge(edge) **Parameter** -- `edge` — Hexagon index number that represents a unidirectional edge. [UInt64](../../../sql-reference/data-types/int-uint.md). +- `edge` — Hexagon index number that represents a unidirectional edge. [UInt64](../../data-types/int-uint.md). **Returned value** -- Destination Hexagon Index number. [UInt64](../../../sql-reference/data-types/int-uint.md). +- Destination Hexagon Index number. [UInt64](../../data-types/int-uint.md). **Example** @@ -1413,14 +1413,14 @@ h3GetIndexesFromUnidirectionalEdge(edge) **Parameter** -- `edge` — Hexagon index number that represents a unidirectional edge. [UInt64](../../../sql-reference/data-types/int-uint.md). +- `edge` — Hexagon index number that represents a unidirectional edge. [UInt64](../../data-types/int-uint.md). 
**Returned value** A tuple consisting of two values `tuple(origin,destination)`: -- `origin` — Origin Hexagon index number. [UInt64](../../../sql-reference/data-types/int-uint.md). -- `destination` — Destination Hexagon index number. [UInt64](../../../sql-reference/data-types/int-uint.md). +- `origin` — Origin Hexagon index number. [UInt64](../../data-types/int-uint.md). +- `destination` — Destination Hexagon index number. [UInt64](../../data-types/int-uint.md). Returns `(0,0)` if the provided input is not valid. @@ -1452,11 +1452,11 @@ h3GetUnidirectionalEdgesFromHexagon(index) **Parameter** -- `index` — Hexagon index number that represents a unidirectional edge. [UInt64](../../../sql-reference/data-types/int-uint.md). +- `index` — Hexagon index number that represents a unidirectional edge. [UInt64](../../data-types/int-uint.md). **Returned value** -Array of h3 indexes representing each unidirectional edge. [Array](../../../sql-reference/data-types/array.md)([UInt64](../../../sql-reference/data-types/int-uint.md)). +Array of h3 indexes representing each unidirectional edge. [Array](../../data-types/array.md)([UInt64](../../data-types/int-uint.md)). **Example** @@ -1486,11 +1486,11 @@ h3GetUnidirectionalEdgeBoundary(index) **Parameter** -- `index` — Hexagon index number that represents a unidirectional edge. [UInt64](../../../sql-reference/data-types/int-uint.md). +- `index` — Hexagon index number that represents a unidirectional edge. [UInt64](../../data-types/int-uint.md). **Returned value** -- Array of pairs '(lon, lat)'. [Array](../../../sql-reference/data-types/array.md)([Float64](../../../sql-reference/data-types/float.md), [Float64](../../../sql-reference/data-types/float.md)). +- Array of pairs '(lon, lat)'. [Array](../../data-types/array.md)([Float64](../../data-types/float.md), [Float64](../../data-types/float.md)). **Example** diff --git a/docs/en/sql-reference/functions/geo/s2.md b/docs/en/sql-reference/functions/geo/s2.md index 2158ef2d57d..3165b21318b 100644 --- a/docs/en/sql-reference/functions/geo/s2.md +++ b/docs/en/sql-reference/functions/geo/s2.md @@ -21,12 +21,12 @@ geoToS2(lon, lat) **Arguments** -- `lon` — Longitude. [Float64](../../../sql-reference/data-types/float.md). -- `lat` — Latitude. [Float64](../../../sql-reference/data-types/float.md). +- `lon` — Longitude. [Float64](../../data-types/float.md). +- `lat` — Latitude. [Float64](../../data-types/float.md). **Returned values** -- S2 point index. [UInt64](../../../sql-reference/data-types/int-uint.md). +- S2 point index. [UInt64](../../data-types/int-uint.md). **Example** @@ -56,13 +56,13 @@ s2ToGeo(s2index) **Arguments** -- `s2index` — S2 Index. [UInt64](../../../sql-reference/data-types/int-uint.md). +- `s2index` — S2 Index. [UInt64](../../data-types/int-uint.md). **Returned values** - A [tuple](../../data-types/tuple.md) consisting of two values: - - `lon`. [Float64](../../../sql-reference/data-types/float.md). - - `lat`. [Float64](../../../sql-reference/data-types/float.md). + - `lon`. [Float64](../../data-types/float.md). + - `lat`. [Float64](../../data-types/float.md). **Example** @@ -92,11 +92,11 @@ s2GetNeighbors(s2index) **Arguments** -- `s2index` — S2 Index. [UInt64](../../../sql-reference/data-types/int-uint.md). +- `s2index` — S2 Index. [UInt64](../../data-types/int-uint.md). **Returned value** -- An array consisting of 4 neighbor indexes: `array[s2index1, s2index3, s2index2, s2index4]`. [Array](../../data-types/array.md)([UInt64](../../../sql-reference/data-types/int-uint.md)). 
+- An array consisting of 4 neighbor indexes: `array[s2index1, s2index3, s2index2, s2index4]`. [Array](../../data-types/array.md)([UInt64](../../data-types/int-uint.md)). **Example** @@ -126,12 +126,12 @@ s2CellsIntersect(s2index1, s2index2) **Arguments** -- `siIndex1`, `s2index2` — S2 Index. [UInt64](../../../sql-reference/data-types/int-uint.md). +- `siIndex1`, `s2index2` — S2 Index. [UInt64](../../data-types/int-uint.md). **Returned value** -- `1` — If the cells intersect. [UInt8](../../../sql-reference/data-types/int-uint.md). -- `0` — If the cells don't intersect. [UInt8](../../../sql-reference/data-types/int-uint.md). +- `1` — If the cells intersect. [UInt8](../../data-types/int-uint.md). +- `0` — If the cells don't intersect. [UInt8](../../data-types/int-uint.md). **Example** @@ -161,14 +161,14 @@ s2CapContains(center, degrees, point) **Arguments** -- `center` — S2 point index corresponding to the cap. [UInt64](../../../sql-reference/data-types/int-uint.md). -- `degrees` — Radius of the cap in degrees. [Float64](../../../sql-reference/data-types/float.md). -- `point` — S2 point index. [UInt64](../../../sql-reference/data-types/int-uint.md). +- `center` — S2 point index corresponding to the cap. [UInt64](../../data-types/int-uint.md). +- `degrees` — Radius of the cap in degrees. [Float64](../../data-types/float.md). +- `point` — S2 point index. [UInt64](../../data-types/int-uint.md). **Returned value** -- `1` — If the cap contains the S2 point index. [UInt8](../../../sql-reference/data-types/int-uint.md). -- `0` — If the cap doesn't contain the S2 point index. [UInt8](../../../sql-reference/data-types/int-uint.md). +- `1` — If the cap contains the S2 point index. [UInt8](../../data-types/int-uint.md). +- `0` — If the cap doesn't contain the S2 point index. [UInt8](../../data-types/int-uint.md). **Example** @@ -198,13 +198,13 @@ s2CapUnion(center1, radius1, center2, radius2) **Arguments** -- `center1`, `center2` — S2 point indexes corresponding to the two input caps. [UInt64](../../../sql-reference/data-types/int-uint.md). -- `radius1`, `radius2` — Radius of the two input caps in degrees. [Float64](../../../sql-reference/data-types/float.md). +- `center1`, `center2` — S2 point indexes corresponding to the two input caps. [UInt64](../../data-types/int-uint.md). +- `radius1`, `radius2` — Radius of the two input caps in degrees. [Float64](../../data-types/float.md). **Returned values** -- `center` — S2 point index corresponding the center of the smallest cap containing the two input caps. [UInt64](../../../sql-reference/data-types/int-uint.md). -- `radius` — Radius of the smallest cap containing the two input caps. [Float64](../../../sql-reference/data-types/float.md). +- `center` — S2 point index corresponding the center of the smallest cap containing the two input caps. [UInt64](../../data-types/int-uint.md). +- `radius` — Radius of the smallest cap containing the two input caps. [Float64](../../data-types/float.md). **Example** @@ -234,14 +234,14 @@ s2RectAdd(s2pointLow, s2pointHigh, s2Point) **Arguments** -- `s2PointLow` — Low S2 point index corresponding to the rectangle. [UInt64](../../../sql-reference/data-types/int-uint.md). -- `s2PointHigh` — High S2 point index corresponding to the rectangle. [UInt64](../../../sql-reference/data-types/int-uint.md). -- `s2Point` — Target S2 point index that the bound rectangle should be grown to include. [UInt64](../../../sql-reference/data-types/int-uint.md). +- `s2PointLow` — Low S2 point index corresponding to the rectangle. 
[UInt64](../../data-types/int-uint.md). +- `s2PointHigh` — High S2 point index corresponding to the rectangle. [UInt64](../../data-types/int-uint.md). +- `s2Point` — Target S2 point index that the bound rectangle should be grown to include. [UInt64](../../data-types/int-uint.md). **Returned values** -- `s2PointLow` — Low S2 cell id corresponding to the grown rectangle. [UInt64](../../../sql-reference/data-types/int-uint.md). -- `s2PointHigh` — Height S2 cell id corresponding to the grown rectangle. [UInt64](../../../sql-reference/data-types/float.md). +- `s2PointLow` — Low S2 cell id corresponding to the grown rectangle. [UInt64](../../data-types/int-uint.md). +- `s2PointHigh` — Height S2 cell id corresponding to the grown rectangle. [UInt64](../../data-types/float.md). **Example** @@ -271,9 +271,9 @@ s2RectContains(s2PointLow, s2PointHi, s2Point) **Arguments** -- `s2PointLow` — Low S2 point index corresponding to the rectangle. [UInt64](../../../sql-reference/data-types/int-uint.md). -- `s2PointHigh` — High S2 point index corresponding to the rectangle. [UInt64](../../../sql-reference/data-types/int-uint.md). -- `s2Point` — Target S2 point index. [UInt64](../../../sql-reference/data-types/int-uint.md). +- `s2PointLow` — Low S2 point index corresponding to the rectangle. [UInt64](../../data-types/int-uint.md). +- `s2PointHigh` — High S2 point index corresponding to the rectangle. [UInt64](../../data-types/int-uint.md). +- `s2Point` — Target S2 point index. [UInt64](../../data-types/int-uint.md). **Returned value** @@ -308,13 +308,13 @@ s2RectUnion(s2Rect1PointLow, s2Rect1PointHi, s2Rect2PointLow, s2Rect2PointHi) **Arguments** -- `s2Rect1PointLow`, `s2Rect1PointHi` — Low and High S2 point indexes corresponding to the first rectangle. [UInt64](../../../sql-reference/data-types/int-uint.md). -- `s2Rect2PointLow`, `s2Rect2PointHi` — Low and High S2 point indexes corresponding to the second rectangle. [UInt64](../../../sql-reference/data-types/int-uint.md). +- `s2Rect1PointLow`, `s2Rect1PointHi` — Low and High S2 point indexes corresponding to the first rectangle. [UInt64](../../data-types/int-uint.md). +- `s2Rect2PointLow`, `s2Rect2PointHi` — Low and High S2 point indexes corresponding to the second rectangle. [UInt64](../../data-types/int-uint.md). **Returned values** -- `s2UnionRect2PointLow` — Low S2 cell id corresponding to the union rectangle. [UInt64](../../../sql-reference/data-types/int-uint.md). -- `s2UnionRect2PointHi` — High S2 cell id corresponding to the union rectangle. [UInt64](../../../sql-reference/data-types/int-uint.md). +- `s2UnionRect2PointLow` — Low S2 cell id corresponding to the union rectangle. [UInt64](../../data-types/int-uint.md). +- `s2UnionRect2PointHi` — High S2 cell id corresponding to the union rectangle. [UInt64](../../data-types/int-uint.md). **Example** @@ -344,13 +344,13 @@ s2RectIntersection(s2Rect1PointLow, s2Rect1PointHi, s2Rect2PointLow, s2Rect2Poin **Arguments** -- `s2Rect1PointLow`, `s2Rect1PointHi` — Low and High S2 point indexes corresponding to the first rectangle. [UInt64](../../../sql-reference/data-types/int-uint.md). -- `s2Rect2PointLow`, `s2Rect2PointHi` — Low and High S2 point indexes corresponding to the second rectangle. [UInt64](../../../sql-reference/data-types/int-uint.md). +- `s2Rect1PointLow`, `s2Rect1PointHi` — Low and High S2 point indexes corresponding to the first rectangle. [UInt64](../../data-types/int-uint.md). +- `s2Rect2PointLow`, `s2Rect2PointHi` — Low and High S2 point indexes corresponding to the second rectangle. 
[UInt64](../../data-types/int-uint.md). **Returned values** -- `s2UnionRect2PointLow` — Low S2 cell id corresponding to the rectangle containing the intersection of the given rectangles. [UInt64](../../../sql-reference/data-types/int-uint.md). -- `s2UnionRect2PointHi` — High S2 cell id corresponding to the rectangle containing the intersection of the given rectangles. [UInt64](../../../sql-reference/data-types/int-uint.md). +- `s2UnionRect2PointLow` — Low S2 cell id corresponding to the rectangle containing the intersection of the given rectangles. [UInt64](../../data-types/int-uint.md). +- `s2UnionRect2PointHi` — High S2 cell id corresponding to the rectangle containing the intersection of the given rectangles. [UInt64](../../data-types/int-uint.md). **Example** diff --git a/docs/en/sql-reference/functions/hash-functions.md b/docs/en/sql-reference/functions/hash-functions.md index e3968a691a8..506114038f7 100644 --- a/docs/en/sql-reference/functions/hash-functions.md +++ b/docs/en/sql-reference/functions/hash-functions.md @@ -12,7 +12,7 @@ Simhash is a hash function, which returns close hash values for close (similar) ## halfMD5 -[Interprets](/docs/en/sql-reference/functions/type-conversion-functions.md/#type_conversion_functions-reinterpretAsString) all the input parameters as strings and calculates the [MD5](https://en.wikipedia.org/wiki/MD5) hash value for each of them. Then combines hashes, takes the first 8 bytes of the hash of the resulting string, and interprets them as `UInt64` in big-endian byte order. +[Interprets](../functions/type-conversion-functions.md/#type_conversion_functions-reinterpretAsString) all the input parameters as strings and calculates the [MD5](https://en.wikipedia.org/wiki/MD5) hash value for each of them. Then combines hashes, takes the first 8 bytes of the hash of the resulting string, and interprets them as `UInt64` in big-endian byte order. ```sql halfMD5(par1, ...) @@ -23,11 +23,11 @@ Consider using the [sipHash64](#siphash64) function instead. **Arguments** -The function takes a variable number of input parameters. Arguments can be any of the [supported data types](/docs/en/sql-reference/data-types/index.md). For some data types calculated value of hash function may be the same for the same values even if types of arguments differ (integers of different size, named and unnamed `Tuple` with the same data, `Map` and the corresponding `Array(Tuple(key, value))` type with the same data). +The function takes a variable number of input parameters. Arguments can be any of the [supported data types](../data-types/index.md). For some data types calculated value of hash function may be the same for the same values even if types of arguments differ (integers of different size, named and unnamed `Tuple` with the same data, `Map` and the corresponding `Array(Tuple(key, value))` type with the same data). **Returned Value** -A [UInt64](/docs/en/sql-reference/data-types/int-uint.md) data type hash value. +A [UInt64](../data-types/int-uint.md) data type hash value. **Example** @@ -61,7 +61,7 @@ sipHash64(par1,...) This is a cryptographic hash function. It works at least three times faster than the [MD5](#md5) hash function. -The function [interprets](/docs/en/sql-reference/functions/type-conversion-functions.md/#type_conversion_functions-reinterpretAsString) all the input parameters as strings and calculates the hash value for each of them. 
It then combines the hashes by the following algorithm: +The function [interprets](../functions/type-conversion-functions.md/#type_conversion_functions-reinterpretAsString) all the input parameters as strings and calculates the hash value for each of them. It then combines the hashes by the following algorithm: 1. The first and the second hash value are concatenated to an array which is hashed. 2. The previously calculated hash value and the hash of the third input parameter are hashed in a similar way. @@ -69,11 +69,11 @@ The function [interprets](/docs/en/sql-reference/functions/type-conversion-funct **Arguments** -The function takes a variable number of input parameters of any of the [supported data types](/docs/en/sql-reference/data-types/index.md). +The function takes a variable number of input parameters of any of the [supported data types](../data-types/index.md). **Returned Value** -A [UInt64](/docs/en/sql-reference/data-types/int-uint.md) data type hash value. +A [UInt64](../data-types/int-uint.md) data type hash value. Note that the calculated hash values may be equal for the same input values of different argument types. This affects for example integer types of different size, named and unnamed `Tuple` with the same data, `Map` and the corresponding `Array(Tuple(key, value))` type with the same data. @@ -105,7 +105,7 @@ Same as [sipHash64](#siphash64), but the first argument is a tuple of two UInt64 **Returned value** -A [UInt64](/docs/en/sql-reference/data-types/int-uint.md) data type hash value. +A [UInt64](../data-types/int-uint.md) data type hash value. **Example** @@ -143,7 +143,7 @@ Same as for [sipHash64](#siphash64). **Returned value** -A 128-bit `SipHash` hash value of type [FixedString(16)](/docs/en/sql-reference/data-types/fixedstring.md). +A 128-bit `SipHash` hash value of type [FixedString(16)](../data-types/fixedstring.md). **Example** @@ -183,7 +183,7 @@ Same as [sipHash128](#siphash128), but the first argument is a tuple of two UInt **Returned value** -A 128-bit `SipHash` hash value of type [FixedString(16)](/docs/en/sql-reference/data-types/fixedstring.md). +A 128-bit `SipHash` hash value of type [FixedString(16)](../data-types/fixedstring.md). **Example** @@ -217,7 +217,7 @@ Same as for [sipHash128](#siphash128). **Returned value** -A 128-bit `SipHash` hash value of type [FixedString(16)](/docs/en/sql-reference/data-types/fixedstring.md). +A 128-bit `SipHash` hash value of type [FixedString(16)](../data-types/fixedstring.md). **Example** @@ -251,7 +251,7 @@ Same as [sipHash128Reference](#siphash128reference), but the first argument is a **Returned value** -A 128-bit `SipHash` hash value of type [FixedString(16)](/docs/en/sql-reference/data-types/fixedstring.md). +A 128-bit `SipHash` hash value of type [FixedString(16)](../data-types/fixedstring.md). **Example** @@ -283,11 +283,11 @@ Note that Google changed the algorithm of CityHash after it has been added to Cl **Arguments** -The function takes a variable number of input parameters. Arguments can be any of the [supported data types](/docs/en/sql-reference/data-types/index.md). For some data types calculated value of hash function may be the same for the same values even if types of arguments differ (integers of different size, named and unnamed `Tuple` with the same data, `Map` and the corresponding `Array(Tuple(key, value))` type with the same data). +The function takes a variable number of input parameters. Arguments can be any of the [supported data types](../data-types/index.md). 
For some data types calculated value of hash function may be the same for the same values even if types of arguments differ (integers of different size, named and unnamed `Tuple` with the same data, `Map` and the corresponding `Array(Tuple(key, value))` type with the same data). **Returned Value** -A [UInt64](/docs/en/sql-reference/data-types/int-uint.md) data type hash value. +A [UInt64](../data-types/int-uint.md) data type hash value. **Examples** @@ -321,7 +321,7 @@ It works faster than intHash32. Average quality. ## SHA1, SHA224, SHA256, SHA512, SHA512_256 -Calculates SHA-1, SHA-224, SHA-256, SHA-512, SHA-512-256 hash from a string and returns the resulting set of bytes as [FixedString](/docs/en/sql-reference/data-types/fixedstring.md). +Calculates SHA-1, SHA-224, SHA-256, SHA-512, SHA-512-256 hash from a string and returns the resulting set of bytes as [FixedString](../data-types/fixedstring.md). **Syntax** @@ -337,15 +337,15 @@ Even in these cases, we recommend applying the function offline and pre-calculat **Arguments** -- `s` — Input string for SHA hash calculation. [String](/docs/en/sql-reference/data-types/string.md). +- `s` — Input string for SHA hash calculation. [String](../data-types/string.md). **Returned value** -- SHA hash as a hex-unencoded FixedString. SHA-1 returns as FixedString(20), SHA-224 as FixedString(28), SHA-256 — FixedString(32), SHA-512 — FixedString(64). [FixedString](/docs/en/sql-reference/data-types/fixedstring.md). +- SHA hash as a hex-unencoded FixedString. SHA-1 returns as FixedString(20), SHA-224 as FixedString(28), SHA-256 — FixedString(32), SHA-512 — FixedString(64). [FixedString](../data-types/fixedstring.md). **Example** -Use the [hex](/docs/en/sql-reference/functions/encoding-functions.md/#hex) function to represent the result as a hex-encoded string. +Use the [hex](../functions/encoding-functions.md/#hex) function to represent the result as a hex-encoded string. Query: @@ -363,7 +363,7 @@ Result: ## BLAKE3 -Calculates BLAKE3 hash string and returns the resulting set of bytes as [FixedString](/docs/en/sql-reference/data-types/fixedstring.md). +Calculates BLAKE3 hash string and returns the resulting set of bytes as [FixedString](../data-types/fixedstring.md). **Syntax** @@ -375,15 +375,15 @@ This cryptographic hash-function is integrated into ClickHouse with BLAKE3 Rust **Arguments** -- s - input string for BLAKE3 hash calculation. [String](/docs/en/sql-reference/data-types/string.md). +- s - input string for BLAKE3 hash calculation. [String](../data-types/string.md). **Return value** -- BLAKE3 hash as a byte array with type FixedString(32). [FixedString](/docs/en/sql-reference/data-types/fixedstring.md). +- BLAKE3 hash as a byte array with type FixedString(32). [FixedString](../data-types/fixedstring.md). **Example** -Use function [hex](/docs/en/sql-reference/functions/encoding-functions.md/#hex) to represent the result as a hex-encoded string. +Use function [hex](../functions/encoding-functions.md/#hex) to represent the result as a hex-encoded string. Query: ```sql @@ -419,11 +419,11 @@ These functions use the `Fingerprint64` and `Hash64` methods respectively from a **Arguments** -The function takes a variable number of input parameters. Arguments can be any of the [supported data types](/docs/en/sql-reference/data-types/index.md). 
For some data types calculated value of hash function may be the same for the same values even if types of arguments differ (integers of different size, named and unnamed `Tuple` with the same data, `Map` and the corresponding `Array(Tuple(key, value))` type with the same data). +The function takes a variable number of input parameters. Arguments can be any of the [supported data types](../data-types/index.md). For some data types calculated value of hash function may be the same for the same values even if types of arguments differ (integers of different size, named and unnamed `Tuple` with the same data, `Map` and the corresponding `Array(Tuple(key, value))` type with the same data). **Returned Value** -A [UInt64](/docs/en/sql-reference/data-types/int-uint.md) data type hash value. +A [UInt64](../data-types/int-uint.md) data type hash value. **Example** @@ -564,11 +564,11 @@ metroHash64(par1, ...) **Arguments** -The function takes a variable number of input parameters. Arguments can be any of the [supported data types](/docs/en/sql-reference/data-types/index.md). For some data types calculated value of hash function may be the same for the same values even if types of arguments differ (integers of different size, named and unnamed `Tuple` with the same data, `Map` and the corresponding `Array(Tuple(key, value))` type with the same data). +The function takes a variable number of input parameters. Arguments can be any of the [supported data types](../data-types/index.md). For some data types calculated value of hash function may be the same for the same values even if types of arguments differ (integers of different size, named and unnamed `Tuple` with the same data, `Map` and the corresponding `Array(Tuple(key, value))` type with the same data). **Returned Value** -A [UInt64](/docs/en/sql-reference/data-types/int-uint.md) data type hash value. +A [UInt64](../data-types/int-uint.md) data type hash value. **Example** @@ -602,12 +602,12 @@ Alias: `yandexConsistentHash` (left for backwards compatibility sake). **Parameters** -- `input`: A UInt64-type key [UInt64](/docs/en/sql-reference/data-types/int-uint.md). -- `n`: Number of buckets. [UInt16](/docs/en/sql-reference/data-types/int-uint.md). +- `input`: A UInt64-type key [UInt64](../data-types/int-uint.md). +- `n`: Number of buckets. [UInt16](../data-types/int-uint.md). **Returned value** -- A [UInt16](/docs/en/sql-reference/data-types/int-uint.md) data type hash value. +- A [UInt16](../data-types/int-uint.md) data type hash value. **Implementation details** @@ -638,12 +638,12 @@ murmurHash2_64(par1, ...) **Arguments** -Both functions take a variable number of input parameters. Arguments can be any of the [supported data types](/docs/en/sql-reference/data-types/index.md). For some data types calculated value of hash function may be the same for the same values even if types of arguments differ (integers of different size, named and unnamed `Tuple` with the same data, `Map` and the corresponding `Array(Tuple(key, value))` type with the same data). +Both functions take a variable number of input parameters. Arguments can be any of the [supported data types](../data-types/index.md). For some data types calculated value of hash function may be the same for the same values even if types of arguments differ (integers of different size, named and unnamed `Tuple` with the same data, `Map` and the corresponding `Array(Tuple(key, value))` type with the same data). 
**Returned Value** -- The `murmurHash2_32` function returns hash value having the [UInt32](/docs/en/sql-reference/data-types/int-uint.md) data type. -- The `murmurHash2_64` function returns hash value having the [UInt64](/docs/en/sql-reference/data-types/int-uint.md) data type. +- The `murmurHash2_32` function returns hash value having the [UInt32](../data-types/int-uint.md) data type. +- The `murmurHash2_64` function returns hash value having the [UInt64](../data-types/int-uint.md) data type. **Example** @@ -669,11 +669,11 @@ gccMurmurHash(par1, ...) **Arguments** -- `par1, ...` — A variable number of parameters that can be any of the [supported data types](/docs/en/sql-reference/data-types/index.md/#data_types). +- `par1, ...` — A variable number of parameters that can be any of the [supported data types](../data-types/index.md/#data_types). **Returned value** -- Calculated hash value. [UInt64](/docs/en/sql-reference/data-types/int-uint.md). +- Calculated hash value. [UInt64](../data-types/int-uint.md). **Example** @@ -706,11 +706,11 @@ MurmurHash(par1, ...) **Arguments** -- `par1, ...` — A variable number of parameters that can be any of the [supported data types](/docs/en/sql-reference/data-types/index.md/#data_types). +- `par1, ...` — A variable number of parameters that can be any of the [supported data types](../data-types/index.md/#data_types). **Returned value** -- Calculated hash value. [UInt32](/docs/en/sql-reference/data-types/int-uint.md). +- Calculated hash value. [UInt32](../data-types/int-uint.md). **Example** @@ -741,12 +741,12 @@ murmurHash3_64(par1, ...) **Arguments** -Both functions take a variable number of input parameters. Arguments can be any of the [supported data types](/docs/en/sql-reference/data-types/index.md). For some data types calculated value of hash function may be the same for the same values even if types of arguments differ (integers of different size, named and unnamed `Tuple` with the same data, `Map` and the corresponding `Array(Tuple(key, value))` type with the same data). +Both functions take a variable number of input parameters. Arguments can be any of the [supported data types](../data-types/index.md). For some data types calculated value of hash function may be the same for the same values even if types of arguments differ (integers of different size, named and unnamed `Tuple` with the same data, `Map` and the corresponding `Array(Tuple(key, value))` type with the same data). **Returned Value** -- The `murmurHash3_32` function returns a [UInt32](/docs/en/sql-reference/data-types/int-uint.md) data type hash value. -- The `murmurHash3_64` function returns a [UInt64](/docs/en/sql-reference/data-types/int-uint.md) data type hash value. +- The `murmurHash3_32` function returns a [UInt32](../data-types/int-uint.md) data type hash value. +- The `murmurHash3_64` function returns a [UInt64](../data-types/int-uint.md) data type hash value. **Example** @@ -772,11 +772,11 @@ murmurHash3_128(expr) **Arguments** -- `expr` — A list of [expressions](/docs/en/sql-reference/syntax.md/#syntax-expressions). [String](/docs/en/sql-reference/data-types/string.md). +- `expr` — A list of [expressions](../syntax.md/#syntax-expressions). [String](../data-types/string.md). **Returned value** -A 128-bit `MurmurHash3` hash value. [FixedString(16)](/docs/en/sql-reference/data-types/fixedstring.md). +A 128-bit `MurmurHash3` hash value. [FixedString(16)](../data-types/fixedstring.md). 
**Example** @@ -806,11 +806,11 @@ xxh3(expr) **Arguments** -- `expr` — A list of [expressions](/docs/en/sql-reference/syntax.md/#syntax-expressions) of any data type. +- `expr` — A list of [expressions](../syntax.md/#syntax-expressions) of any data type. **Returned value** -A 64-bit `xxh3` hash value. [UInt64](/docs/en/sql-reference/data-types/int-uint.md). +A 64-bit `xxh3` hash value. [UInt64](../data-types/int-uint.md). **Example** @@ -872,7 +872,7 @@ Result: Splits a ASCII string into n-grams of `ngramsize` symbols and returns the n-gram `simhash`. Is case sensitive. -Can be used for detection of semi-duplicate strings with [bitHammingDistance](/docs/en/sql-reference/functions/bit-functions.md/#bithammingdistance). The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same. +Can be used for detection of semi-duplicate strings with [bitHammingDistance](../functions/bit-functions.md/#bithammingdistance). The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same. **Syntax** @@ -882,12 +882,12 @@ ngramSimHash(string[, ngramsize]) **Arguments** -- `string` — String. [String](/docs/en/sql-reference/data-types/string.md). -- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](/docs/en/sql-reference/data-types/int-uint.md). +- `string` — String. [String](../data-types/string.md). +- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../data-types/int-uint.md). **Returned value** -- Hash value. [UInt64](/docs/en/sql-reference/data-types/int-uint.md). +- Hash value. [UInt64](../data-types/int-uint.md). **Example** @@ -909,7 +909,7 @@ Result: Splits a ASCII string into n-grams of `ngramsize` symbols and returns the n-gram `simhash`. Is case insensitive. -Can be used for detection of semi-duplicate strings with [bitHammingDistance](/docs/en/sql-reference/functions/bit-functions.md/#bithammingdistance). The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same. +Can be used for detection of semi-duplicate strings with [bitHammingDistance](../functions/bit-functions.md/#bithammingdistance). The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same. **Syntax** @@ -919,12 +919,12 @@ ngramSimHashCaseInsensitive(string[, ngramsize]) **Arguments** -- `string` — String. [String](/docs/en/sql-reference/data-types/string.md). -- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](/docs/en/sql-reference/data-types/int-uint.md). +- `string` — String. [String](../data-types/string.md). +- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../data-types/int-uint.md). **Returned value** -- Hash value. [UInt64](/docs/en/sql-reference/data-types/int-uint.md). +- Hash value. [UInt64](../data-types/int-uint.md). **Example** @@ -946,7 +946,7 @@ Result: Splits a UTF-8 string into n-grams of `ngramsize` symbols and returns the n-gram `simhash`. Is case sensitive. 
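The near-duplicate detection workflow described for the `simhash` family can be illustrated with a short sketch; the input strings are made-up samples, and pairing a `simhash` function with `bitHammingDistance` follows the usage pattern stated above.

```sql
-- Illustrative only: a small Hamming distance between the two simhashes
-- suggests that the strings are near-duplicates.
SELECT bitHammingDistance(
    ngramSimHash('ClickHouse is a fast OLAP database'),
    ngramSimHash('ClickHouse is a very fast OLAP database')
) AS hamming_distance;
```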
-Can be used for detection of semi-duplicate strings with [bitHammingDistance](/docs/en/sql-reference/functions/bit-functions.md/#bithammingdistance). The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same. +Can be used for detection of semi-duplicate strings with [bitHammingDistance](../functions/bit-functions.md/#bithammingdistance). The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same. **Syntax** @@ -956,12 +956,12 @@ ngramSimHashUTF8(string[, ngramsize]) **Arguments** -- `string` — String. [String](/docs/en/sql-reference/data-types/string.md). -- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](/docs/en/sql-reference/data-types/int-uint.md). +- `string` — String. [String](../data-types/string.md). +- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../data-types/int-uint.md). **Returned value** -- Hash value. [UInt64](/docs/en/sql-reference/data-types/int-uint.md). +- Hash value. [UInt64](../data-types/int-uint.md). **Example** @@ -983,7 +983,7 @@ Result: Splits a UTF-8 string into n-grams of `ngramsize` symbols and returns the n-gram `simhash`. Is case insensitive. -Can be used for detection of semi-duplicate strings with [bitHammingDistance](/docs/en/sql-reference/functions/bit-functions.md/#bithammingdistance). The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same. +Can be used for detection of semi-duplicate strings with [bitHammingDistance](../functions/bit-functions.md/#bithammingdistance). The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same. **Syntax** @@ -993,12 +993,12 @@ ngramSimHashCaseInsensitiveUTF8(string[, ngramsize]) **Arguments** -- `string` — String. [String](/docs/en/sql-reference/data-types/string.md). -- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](/docs/en/sql-reference/data-types/int-uint.md). +- `string` — String. [String](../data-types/string.md). +- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../data-types/int-uint.md). **Returned value** -- Hash value. [UInt64](/docs/en/sql-reference/data-types/int-uint.md). +- Hash value. [UInt64](../data-types/int-uint.md). **Example** @@ -1020,7 +1020,7 @@ Result: Splits a ASCII string into parts (shingles) of `shinglesize` words and returns the word shingle `simhash`. Is case sensitive. -Can be used for detection of semi-duplicate strings with [bitHammingDistance](/docs/en/sql-reference/functions/bit-functions.md/#bithammingdistance). The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same. +Can be used for detection of semi-duplicate strings with [bitHammingDistance](../functions/bit-functions.md/#bithammingdistance). 
The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same. **Syntax** @@ -1030,12 +1030,12 @@ wordShingleSimHash(string[, shinglesize]) **Arguments** -- `string` — String. [String](/docs/en/sql-reference/data-types/string.md). -- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](/docs/en/sql-reference/data-types/int-uint.md). +- `string` — String. [String](../data-types/string.md). +- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../data-types/int-uint.md). **Returned value** -- Hash value. [UInt64](/docs/en/sql-reference/data-types/int-uint.md). +- Hash value. [UInt64](../data-types/int-uint.md). **Example** @@ -1057,7 +1057,7 @@ Result: Splits a ASCII string into parts (shingles) of `shinglesize` words and returns the word shingle `simhash`. Is case insensitive. -Can be used for detection of semi-duplicate strings with [bitHammingDistance](/docs/en/sql-reference/functions/bit-functions.md/#bithammingdistance). The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same. +Can be used for detection of semi-duplicate strings with [bitHammingDistance](../functions/bit-functions.md/#bithammingdistance). The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same. **Syntax** @@ -1067,12 +1067,12 @@ wordShingleSimHashCaseInsensitive(string[, shinglesize]) **Arguments** -- `string` — String. [String](/docs/en/sql-reference/data-types/string.md). -- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](/docs/en/sql-reference/data-types/int-uint.md). +- `string` — String. [String](../data-types/string.md). +- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../data-types/int-uint.md). **Returned value** -- Hash value. [UInt64](/docs/en/sql-reference/data-types/int-uint.md). +- Hash value. [UInt64](../data-types/int-uint.md). **Example** @@ -1094,7 +1094,7 @@ Result: Splits a UTF-8 string into parts (shingles) of `shinglesize` words and returns the word shingle `simhash`. Is case sensitive. -Can be used for detection of semi-duplicate strings with [bitHammingDistance](/docs/en/sql-reference/functions/bit-functions.md/#bithammingdistance). The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same. +Can be used for detection of semi-duplicate strings with [bitHammingDistance](../functions/bit-functions.md/#bithammingdistance). The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same. **Syntax** @@ -1104,12 +1104,12 @@ wordShingleSimHashUTF8(string[, shinglesize]) **Arguments** -- `string` — String. [String](/docs/en/sql-reference/data-types/string.md). -- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. 
[UInt8](/docs/en/sql-reference/data-types/int-uint.md). +- `string` — String. [String](../data-types/string.md). +- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../data-types/int-uint.md). **Returned value** -- Hash value. [UInt64](/docs/en/sql-reference/data-types/int-uint.md). +- Hash value. [UInt64](../data-types/int-uint.md). **Example** @@ -1131,7 +1131,7 @@ Result: Splits a UTF-8 string into parts (shingles) of `shinglesize` words and returns the word shingle `simhash`. Is case insensitive. -Can be used for detection of semi-duplicate strings with [bitHammingDistance](/docs/en/sql-reference/functions/bit-functions.md/#bithammingdistance). The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same. +Can be used for detection of semi-duplicate strings with [bitHammingDistance](../functions/bit-functions.md/#bithammingdistance). The smaller is the [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) of the calculated `simhashes` of two strings, the more likely these strings are the same. **Syntax** @@ -1141,12 +1141,12 @@ wordShingleSimHashCaseInsensitiveUTF8(string[, shinglesize]) **Arguments** -- `string` — String. [String](/docs/en/sql-reference/data-types/string.md). -- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](/docs/en/sql-reference/data-types/int-uint.md). +- `string` — String. [String](../data-types/string.md). +- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../data-types/int-uint.md). **Returned value** -- Hash value. [UInt64](/docs/en/sql-reference/data-types/int-uint.md). +- Hash value. [UInt64](../data-types/int-uint.md). **Example** @@ -1176,11 +1176,11 @@ wyHash64(string) **Arguments** -- `string` — String. [String](/docs/en/sql-reference/data-types/string.md). +- `string` — String. [String](../data-types/string.md). **Returned value** -- Hash value. [UInt64](/docs/en/sql-reference/data-types/int-uint.md). +- Hash value. [UInt64](../data-types/int-uint.md). **Example** @@ -1202,7 +1202,7 @@ Result: Splits a ASCII string into n-grams of `ngramsize` symbols and calculates hash values for each n-gram. Uses `hashnum` minimum hashes to calculate the minimum hash and `hashnum` maximum hashes to calculate the maximum hash. Returns a tuple with these hashes. Is case sensitive. -Can be used for detection of semi-duplicate strings with [tupleHammingDistance](/docs/en/sql-reference/functions/tuple-functions.md/#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same. +Can be used for detection of semi-duplicate strings with [tupleHammingDistance](../functions/tuple-functions.md/#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same. **Syntax** @@ -1212,13 +1212,13 @@ ngramMinHash(string[, ngramsize, hashnum]) **Arguments** -- `string` — String. [String](/docs/en/sql-reference/data-types/string.md). -- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](/docs/en/sql-reference/data-types/int-uint.md). 
-- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](/docs/en/sql-reference/data-types/int-uint.md). +- `string` — String. [String](../data-types/string.md). +- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../data-types/int-uint.md). **Returned value** -- Tuple with two hashes — the minimum and the maximum. [Tuple](/docs/en/sql-reference/data-types/tuple.md)([UInt64](/docs/en/sql-reference/data-types/int-uint.md), [UInt64](/docs/en/sql-reference/data-types/int-uint.md)). +- Tuple with two hashes — the minimum and the maximum. [Tuple](../data-types/tuple.md)([UInt64](../data-types/int-uint.md), [UInt64](../data-types/int-uint.md)). **Example** @@ -1240,7 +1240,7 @@ Result: Splits a ASCII string into n-grams of `ngramsize` symbols and calculates hash values for each n-gram. Uses `hashnum` minimum hashes to calculate the minimum hash and `hashnum` maximum hashes to calculate the maximum hash. Returns a tuple with these hashes. Is case insensitive. -Can be used for detection of semi-duplicate strings with [tupleHammingDistance](/docs/en/sql-reference/functions/tuple-functions.md/#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same. +Can be used for detection of semi-duplicate strings with [tupleHammingDistance](../functions/tuple-functions.md/#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same. **Syntax** @@ -1250,13 +1250,13 @@ ngramMinHashCaseInsensitive(string[, ngramsize, hashnum]) **Arguments** -- `string` — String. [String](/docs/en/sql-reference/data-types/string.md). -- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](/docs/en/sql-reference/data-types/int-uint.md). -- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](/docs/en/sql-reference/data-types/int-uint.md). +- `string` — String. [String](../data-types/string.md). +- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../data-types/int-uint.md). **Returned value** -- Tuple with two hashes — the minimum and the maximum. [Tuple](/docs/en/sql-reference/data-types/tuple.md)([UInt64](/docs/en/sql-reference/data-types/int-uint.md), [UInt64](/docs/en/sql-reference/data-types/int-uint.md)). +- Tuple with two hashes — the minimum and the maximum. [Tuple](../data-types/tuple.md)([UInt64](../data-types/int-uint.md), [UInt64](../data-types/int-uint.md)). **Example** @@ -1278,7 +1278,7 @@ Result: Splits a UTF-8 string into n-grams of `ngramsize` symbols and calculates hash values for each n-gram. Uses `hashnum` minimum hashes to calculate the minimum hash and `hashnum` maximum hashes to calculate the maximum hash. 
Returns a tuple with these hashes. Is case sensitive. -Can be used for detection of semi-duplicate strings with [tupleHammingDistance](/docs/en/sql-reference/functions/tuple-functions.md/#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same. +Can be used for detection of semi-duplicate strings with [tupleHammingDistance](../functions/tuple-functions.md/#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same. **Syntax** @@ -1288,13 +1288,13 @@ ngramMinHashUTF8(string[, ngramsize, hashnum]) **Arguments** -- `string` — String. [String](/docs/en/sql-reference/data-types/string.md). -- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](/docs/en/sql-reference/data-types/int-uint.md). -- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](/docs/en/sql-reference/data-types/int-uint.md). +- `string` — String. [String](../data-types/string.md). +- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../data-types/int-uint.md). **Returned value** -- Tuple with two hashes — the minimum and the maximum. [Tuple](/docs/en/sql-reference/data-types/tuple.md)([UInt64](/docs/en/sql-reference/data-types/int-uint.md), [UInt64](/docs/en/sql-reference/data-types/int-uint.md)). +- Tuple with two hashes — the minimum and the maximum. [Tuple](../data-types/tuple.md)([UInt64](../data-types/int-uint.md), [UInt64](../data-types/int-uint.md)). **Example** @@ -1316,7 +1316,7 @@ Result: Splits a UTF-8 string into n-grams of `ngramsize` symbols and calculates hash values for each n-gram. Uses `hashnum` minimum hashes to calculate the minimum hash and `hashnum` maximum hashes to calculate the maximum hash. Returns a tuple with these hashes. Is case insensitive. -Can be used for detection of semi-duplicate strings with [tupleHammingDistance](/docs/en/sql-reference/functions/tuple-functions.md/#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same. +Can be used for detection of semi-duplicate strings with [tupleHammingDistance](../functions/tuple-functions.md/#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same. **Syntax** @@ -1326,13 +1326,13 @@ ngramMinHashCaseInsensitiveUTF8(string [, ngramsize, hashnum]) **Arguments** -- `string` — String. [String](/docs/en/sql-reference/data-types/string.md). -- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](/docs/en/sql-reference/data-types/int-uint.md). -- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](/docs/en/sql-reference/data-types/int-uint.md). +- `string` — String. [String](../data-types/string.md). +- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. 
Default value: `3`. [UInt8](../data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../data-types/int-uint.md). **Returned value** -- Tuple with two hashes — the minimum and the maximum. [Tuple](/docs/en/sql-reference/data-types/tuple.md)([UInt64](/docs/en/sql-reference/data-types/int-uint.md), [UInt64](/docs/en/sql-reference/data-types/int-uint.md)). +- Tuple with two hashes — the minimum and the maximum. [Tuple](../data-types/tuple.md)([UInt64](../data-types/int-uint.md), [UInt64](../data-types/int-uint.md)). **Example** @@ -1362,13 +1362,13 @@ ngramMinHashArg(string[, ngramsize, hashnum]) **Arguments** -- `string` — String. [String](/docs/en/sql-reference/data-types/string.md). -- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](/docs/en/sql-reference/data-types/int-uint.md). -- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](/docs/en/sql-reference/data-types/int-uint.md). +- `string` — String. [String](../data-types/string.md). +- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../data-types/int-uint.md). **Returned value** -- Tuple with two tuples with `hashnum` n-grams each. [Tuple](/docs/en/sql-reference/data-types/tuple.md)([Tuple](/docs/en/sql-reference/data-types/tuple.md)([String](/docs/en/sql-reference/data-types/string.md)), [Tuple](/docs/en/sql-reference/data-types/tuple.md)([String](/docs/en/sql-reference/data-types/string.md))). +- Tuple with two tuples with `hashnum` n-grams each. [Tuple](../data-types/tuple.md)([Tuple](../data-types/tuple.md)([String](../data-types/string.md)), [Tuple](../data-types/tuple.md)([String](../data-types/string.md))). **Example** @@ -1398,13 +1398,13 @@ ngramMinHashArgCaseInsensitive(string[, ngramsize, hashnum]) **Arguments** -- `string` — String. [String](/docs/en/sql-reference/data-types/string.md). -- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](/docs/en/sql-reference/data-types/int-uint.md). -- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](/docs/en/sql-reference/data-types/int-uint.md). +- `string` — String. [String](../data-types/string.md). +- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../data-types/int-uint.md). **Returned value** -- Tuple with two tuples with `hashnum` n-grams each. [Tuple](/docs/en/sql-reference/data-types/tuple.md)([Tuple](/docs/en/sql-reference/data-types/tuple.md)([String](/docs/en/sql-reference/data-types/string.md)), [Tuple](/docs/en/sql-reference/data-types/tuple.md)([String](/docs/en/sql-reference/data-types/string.md))). 
+- Tuple with two tuples with `hashnum` n-grams each. [Tuple](../data-types/tuple.md)([Tuple](../data-types/tuple.md)([String](../data-types/string.md)), [Tuple](../data-types/tuple.md)([String](../data-types/string.md))). **Example** @@ -1434,13 +1434,13 @@ ngramMinHashArgUTF8(string[, ngramsize, hashnum]) **Arguments** -- `string` — String. [String](/docs/en/sql-reference/data-types/string.md). -- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](/docs/en/sql-reference/data-types/int-uint.md). -- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](/docs/en/sql-reference/data-types/int-uint.md). +- `string` — String. [String](../data-types/string.md). +- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../data-types/int-uint.md). **Returned value** -- Tuple with two tuples with `hashnum` n-grams each. [Tuple](/docs/en/sql-reference/data-types/tuple.md)([Tuple](/docs/en/sql-reference/data-types/tuple.md)([String](/docs/en/sql-reference/data-types/string.md)), [Tuple](/docs/en/sql-reference/data-types/tuple.md)([String](/docs/en/sql-reference/data-types/string.md))). +- Tuple with two tuples with `hashnum` n-grams each. [Tuple](../data-types/tuple.md)([Tuple](../data-types/tuple.md)([String](../data-types/string.md)), [Tuple](../data-types/tuple.md)([String](../data-types/string.md))). **Example** @@ -1470,13 +1470,13 @@ ngramMinHashArgCaseInsensitiveUTF8(string[, ngramsize, hashnum]) **Arguments** -- `string` — String. [String](/docs/en/sql-reference/data-types/string.md). -- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](/docs/en/sql-reference/data-types/int-uint.md). -- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](/docs/en/sql-reference/data-types/int-uint.md). +- `string` — String. [String](../data-types/string.md). +- `ngramsize` — The size of an n-gram. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../data-types/int-uint.md). **Returned value** -- Tuple with two tuples with `hashnum` n-grams each. [Tuple](/docs/en/sql-reference/data-types/tuple.md)([Tuple](/docs/en/sql-reference/data-types/tuple.md)([String](/docs/en/sql-reference/data-types/string.md)), [Tuple](/docs/en/sql-reference/data-types/tuple.md)([String](/docs/en/sql-reference/data-types/string.md))). +- Tuple with two tuples with `hashnum` n-grams each. [Tuple](../data-types/tuple.md)([Tuple](../data-types/tuple.md)([String](../data-types/string.md)), [Tuple](../data-types/tuple.md)([String](../data-types/string.md))). **Example** @@ -1498,7 +1498,7 @@ Result: Splits a ASCII string into parts (shingles) of `shinglesize` words and calculates hash values for each word shingle. 
Uses `hashnum` minimum hashes to calculate the minimum hash and `hashnum` maximum hashes to calculate the maximum hash. Returns a tuple with these hashes. Is case sensitive. -Can be used for detection of semi-duplicate strings with [tupleHammingDistance](/docs/en/sql-reference/functions/tuple-functions.md/#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same. +Can be used for detection of semi-duplicate strings with [tupleHammingDistance](../functions/tuple-functions.md/#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same. **Syntax** @@ -1508,13 +1508,13 @@ wordShingleMinHash(string[, shinglesize, hashnum]) **Arguments** -- `string` — String. [String](/docs/en/sql-reference/data-types/string.md). -- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](/docs/en/sql-reference/data-types/int-uint.md). -- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](/docs/en/sql-reference/data-types/int-uint.md). +- `string` — String. [String](../data-types/string.md). +- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../data-types/int-uint.md). **Returned value** -- Tuple with two hashes — the minimum and the maximum. [Tuple](/docs/en/sql-reference/data-types/tuple.md)([UInt64](/docs/en/sql-reference/data-types/int-uint.md), [UInt64](/docs/en/sql-reference/data-types/int-uint.md)). +- Tuple with two hashes — the minimum and the maximum. [Tuple](../data-types/tuple.md)([UInt64](../data-types/int-uint.md), [UInt64](../data-types/int-uint.md)). **Example** @@ -1536,7 +1536,7 @@ Result: Splits a ASCII string into parts (shingles) of `shinglesize` words and calculates hash values for each word shingle. Uses `hashnum` minimum hashes to calculate the minimum hash and `hashnum` maximum hashes to calculate the maximum hash. Returns a tuple with these hashes. Is case insensitive. -Can be used for detection of semi-duplicate strings with [tupleHammingDistance](/docs/en/sql-reference/functions/tuple-functions.md/#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same. +Can be used for detection of semi-duplicate strings with [tupleHammingDistance](../functions/tuple-functions.md/#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same. **Syntax** @@ -1546,13 +1546,13 @@ wordShingleMinHashCaseInsensitive(string[, shinglesize, hashnum]) **Arguments** -- `string` — String. [String](/docs/en/sql-reference/data-types/string.md). -- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](/docs/en/sql-reference/data-types/int-uint.md). -- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](/docs/en/sql-reference/data-types/int-uint.md). 
+- `string` — String. [String](../data-types/string.md). +- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../data-types/int-uint.md). **Returned value** -- Tuple with two hashes — the minimum and the maximum. [Tuple](/docs/en/sql-reference/data-types/tuple.md)([UInt64](/docs/en/sql-reference/data-types/int-uint.md), [UInt64](/docs/en/sql-reference/data-types/int-uint.md)). +- Tuple with two hashes — the minimum and the maximum. [Tuple](../data-types/tuple.md)([UInt64](../data-types/int-uint.md), [UInt64](../data-types/int-uint.md)). **Example** @@ -1574,7 +1574,7 @@ Result: Splits a UTF-8 string into parts (shingles) of `shinglesize` words and calculates hash values for each word shingle. Uses `hashnum` minimum hashes to calculate the minimum hash and `hashnum` maximum hashes to calculate the maximum hash. Returns a tuple with these hashes. Is case sensitive. -Can be used for detection of semi-duplicate strings with [tupleHammingDistance](/docs/en/sql-reference/functions/tuple-functions.md/#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same. +Can be used for detection of semi-duplicate strings with [tupleHammingDistance](../functions/tuple-functions.md/#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same. **Syntax** @@ -1584,13 +1584,13 @@ wordShingleMinHashUTF8(string[, shinglesize, hashnum]) **Arguments** -- `string` — String. [String](/docs/en/sql-reference/data-types/string.md). -- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](/docs/en/sql-reference/data-types/int-uint.md). -- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](/docs/en/sql-reference/data-types/int-uint.md). +- `string` — String. [String](../data-types/string.md). +- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../data-types/int-uint.md). **Returned value** -- Tuple with two hashes — the minimum and the maximum. [Tuple](/docs/en/sql-reference/data-types/tuple.md)([UInt64](/docs/en/sql-reference/data-types/int-uint.md), [UInt64](/docs/en/sql-reference/data-types/int-uint.md)). +- Tuple with two hashes — the minimum and the maximum. [Tuple](../data-types/tuple.md)([UInt64](../data-types/int-uint.md), [UInt64](../data-types/int-uint.md)). **Example** @@ -1612,7 +1612,7 @@ Result: Splits a UTF-8 string into parts (shingles) of `shinglesize` words and calculates hash values for each word shingle. Uses `hashnum` minimum hashes to calculate the minimum hash and `hashnum` maximum hashes to calculate the maximum hash. Returns a tuple with these hashes. Is case insensitive. 
-Can be used for detection of semi-duplicate strings with [tupleHammingDistance](/docs/en/sql-reference/functions/tuple-functions.md/#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same. +Can be used for detection of semi-duplicate strings with [tupleHammingDistance](../functions/tuple-functions.md/#tuplehammingdistance). For two strings: if one of the returned hashes is the same for both strings, we think that those strings are the same. **Syntax** @@ -1622,13 +1622,13 @@ wordShingleMinHashCaseInsensitiveUTF8(string[, shinglesize, hashnum]) **Arguments** -- `string` — String. [String](/docs/en/sql-reference/data-types/string.md). -- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](/docs/en/sql-reference/data-types/int-uint.md). -- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](/docs/en/sql-reference/data-types/int-uint.md). +- `string` — String. [String](../data-types/string.md). +- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../data-types/int-uint.md). **Returned value** -- Tuple with two hashes — the minimum and the maximum. [Tuple](/docs/en/sql-reference/data-types/tuple.md)([UInt64](/docs/en/sql-reference/data-types/int-uint.md), [UInt64](/docs/en/sql-reference/data-types/int-uint.md)). +- Tuple with two hashes — the minimum and the maximum. [Tuple](../data-types/tuple.md)([UInt64](../data-types/int-uint.md), [UInt64](../data-types/int-uint.md)). **Example** @@ -1658,13 +1658,13 @@ wordShingleMinHashArg(string[, shinglesize, hashnum]) **Arguments** -- `string` — String. [String](/docs/en/sql-reference/data-types/string.md). -- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](/docs/en/sql-reference/data-types/int-uint.md). -- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](/docs/en/sql-reference/data-types/int-uint.md). +- `string` — String. [String](../data-types/string.md). +- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../data-types/int-uint.md). **Returned value** -- Tuple with two tuples with `hashnum` word shingles each. [Tuple](/docs/en/sql-reference/data-types/tuple.md)([Tuple](/docs/en/sql-reference/data-types/tuple.md)([String](/docs/en/sql-reference/data-types/string.md)), [Tuple](/docs/en/sql-reference/data-types/tuple.md)([String](/docs/en/sql-reference/data-types/string.md))). +- Tuple with two tuples with `hashnum` word shingles each. [Tuple](../data-types/tuple.md)([Tuple](../data-types/tuple.md)([String](../data-types/string.md)), [Tuple](../data-types/tuple.md)([String](../data-types/string.md))). 
**Example** @@ -1694,13 +1694,13 @@ wordShingleMinHashArgCaseInsensitive(string[, shinglesize, hashnum]) **Arguments** -- `string` — String. [String](/docs/en/sql-reference/data-types/string.md). -- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](/docs/en/sql-reference/data-types/int-uint.md). -- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](/docs/en/sql-reference/data-types/int-uint.md). +- `string` — String. [String](../data-types/string.md). +- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../data-types/int-uint.md). **Returned value** -- Tuple with two tuples with `hashnum` word shingles each. [Tuple](/docs/en/sql-reference/data-types/tuple.md)([Tuple](/docs/en/sql-reference/data-types/tuple.md)([String](/docs/en/sql-reference/data-types/string.md)), [Tuple](/docs/en/sql-reference/data-types/tuple.md)([String](/docs/en/sql-reference/data-types/string.md))). +- Tuple with two tuples with `hashnum` word shingles each. [Tuple](../data-types/tuple.md)([Tuple](../data-types/tuple.md)([String](../data-types/string.md)), [Tuple](../data-types/tuple.md)([String](../data-types/string.md))). **Example** @@ -1730,13 +1730,13 @@ wordShingleMinHashArgUTF8(string[, shinglesize, hashnum]) **Arguments** -- `string` — String. [String](/docs/en/sql-reference/data-types/string.md). -- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](/docs/en/sql-reference/data-types/int-uint.md). -- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](/docs/en/sql-reference/data-types/int-uint.md). +- `string` — String. [String](../data-types/string.md). +- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../data-types/int-uint.md). **Returned value** -- Tuple with two tuples with `hashnum` word shingles each. [Tuple](/docs/en/sql-reference/data-types/tuple.md)([Tuple](/docs/en/sql-reference/data-types/tuple.md)([String](/docs/en/sql-reference/data-types/string.md)), [Tuple](/docs/en/sql-reference/data-types/tuple.md)([String](/docs/en/sql-reference/data-types/string.md))). +- Tuple with two tuples with `hashnum` word shingles each. [Tuple](../data-types/tuple.md)([Tuple](../data-types/tuple.md)([String](../data-types/string.md)), [Tuple](../data-types/tuple.md)([String](../data-types/string.md))). **Example** @@ -1766,13 +1766,13 @@ wordShingleMinHashArgCaseInsensitiveUTF8(string[, shinglesize, hashnum]) **Arguments** -- `string` — String. [String](/docs/en/sql-reference/data-types/string.md). -- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](/docs/en/sql-reference/data-types/int-uint.md). 
-- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](/docs/en/sql-reference/data-types/int-uint.md). +- `string` — String. [String](../data-types/string.md). +- `shinglesize` — The size of a word shingle. Optional. Possible values: any number from `1` to `25`. Default value: `3`. [UInt8](../data-types/int-uint.md). +- `hashnum` — The number of minimum and maximum hashes used to calculate the result. Optional. Possible values: any number from `1` to `25`. Default value: `6`. [UInt8](../data-types/int-uint.md). **Returned value** -- Tuple with two tuples with `hashnum` word shingles each. [Tuple](/docs/en/sql-reference/data-types/tuple.md)([Tuple](/docs/en/sql-reference/data-types/tuple.md)([String](/docs/en/sql-reference/data-types/string.md)), [Tuple](/docs/en/sql-reference/data-types/tuple.md)([String](/docs/en/sql-reference/data-types/string.md))). +- Tuple with two tuples with `hashnum` word shingles each. [Tuple](../data-types/tuple.md)([Tuple](../data-types/tuple.md)([String](../data-types/string.md)), [Tuple](../data-types/tuple.md)([String](../data-types/string.md))). **Example** @@ -1810,7 +1810,7 @@ Alias: `sqid` **Returned Value** -A sqid [String](/docs/en/sql-reference/data-types/string.md). +A sqid [String](../data-types/string.md). **Example** @@ -1837,11 +1837,11 @@ sqidDecode(sqid) **Arguments** -- A sqid - [String](/docs/en/sql-reference/data-types/string.md) +- A sqid - [String](../data-types/string.md) **Returned Value** -The sqid transformed to numbers [Array(UInt64)](/docs/en/sql-reference/data-types/array.md). +The sqid transformed to numbers [Array(UInt64)](../data-types/array.md). **Example** diff --git a/docs/en/sql-reference/functions/index.md b/docs/en/sql-reference/functions/index.md index d07a5292431..c0256ba4735 100644 --- a/docs/en/sql-reference/functions/index.md +++ b/docs/en/sql-reference/functions/index.md @@ -11,7 +11,7 @@ There are at least\* two types of functions - regular functions (they are just c In this section we discuss regular functions. For aggregate functions, see the section “Aggregate functions”. :::note -There is a third type of function that the [‘arrayJoin’ function](/docs/en/sql-reference/functions/array-join.md) belongs to. And [table functions](/docs/en/sql-reference/table-functions/index.md) can also be mentioned separately. +There is a third type of function that the [‘arrayJoin’ function](../functions/array-join.md) belongs to. And [table functions](../table-functions/index.md) can also be mentioned separately. ::: ## Strong Typing @@ -63,4 +63,4 @@ For some functions the first argument (the lambda function) can be omitted. In t ## User Defined Functions (UDFs) -ClickHouse supports user-defined functions. See [UDFs](/docs/en/sql-reference/functions/udf.md). +ClickHouse supports user-defined functions. See [UDFs](../functions/udf.md). diff --git a/docs/en/sql-reference/functions/introspection.md b/docs/en/sql-reference/functions/introspection.md index be8a2956d41..540e148e3f1 100644 --- a/docs/en/sql-reference/functions/introspection.md +++ b/docs/en/sql-reference/functions/introspection.md @@ -36,14 +36,14 @@ addressToLine(address_of_binary_instruction) **Arguments** -- `address_of_binary_instruction` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Address of instruction in a running process. 
+- `address_of_binary_instruction` ([UInt64](../data-types/int-uint.md)) — Address of instruction in a running process. **Returned value** -- Source code filename and the line number in this file delimited by colon. [String](../../sql-reference/data-types/string.md). +- Source code filename and the line number in this file delimited by colon. [String](../data-types/string.md). - For example, `/build/obj-x86_64-linux-gnu/../src/Common/ThreadPool.cpp:199`, where `199` is a line number. -- Name of a binary, if the function couldn’t find the debug information. [String](../../sql-reference/data-types/string.md). -- Empty string, if the address is not valid. [String](../../sql-reference/data-types/string.md). +- Name of a binary, if the function couldn’t find the debug information. [String](../data-types/string.md). +- Empty string, if the address is not valid. [String](../data-types/string.md). **Example** @@ -124,7 +124,7 @@ addressToLineWithInlines(address_of_binary_instruction) **Arguments** -- `address_of_binary_instruction` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Address of instruction in a running process. +- `address_of_binary_instruction` ([UInt64](../data-types/int-uint.md)) — Address of instruction in a running process. **Returned value** @@ -132,7 +132,7 @@ addressToLineWithInlines(address_of_binary_instruction) - Array with single element which is name of a binary, if the function couldn’t find the debug information. -- Empty array, if the address is not valid. [Array(String)](../../sql-reference/data-types/array.md). +- Empty array, if the address is not valid. [Array(String)](../data-types/array.md). **Example** @@ -225,12 +225,12 @@ addressToSymbol(address_of_binary_instruction) **Arguments** -- `address_of_binary_instruction` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Address of instruction in a running process. +- `address_of_binary_instruction` ([UInt64](../data-types/int-uint.md)) — Address of instruction in a running process. **Returned value** -- Symbol from ClickHouse object files. [String](../../sql-reference/data-types/string.md). -- Empty string, if the address is not valid. [String](../../sql-reference/data-types/string.md). +- Symbol from ClickHouse object files. [String](../data-types/string.md). +- Empty string, if the address is not valid. [String](../data-types/string.md). **Example** @@ -320,12 +320,12 @@ demangle(symbol) **Arguments** -- `symbol` ([String](../../sql-reference/data-types/string.md)) — Symbol from an object file. +- `symbol` ([String](../data-types/string.md)) — Symbol from an object file. **Returned value** -- Name of the C++ function. [String](../../sql-reference/data-types/string.md). -- Empty string if a symbol is not valid. [String](../../sql-reference/data-types/string.md). +- Name of the C++ function. [String](../data-types/string.md). +- Empty string if a symbol is not valid. [String](../data-types/string.md). **Example** @@ -414,7 +414,7 @@ tid() **Returned value** -- Current thread id. [Uint64](../../sql-reference/data-types/int-uint.md#uint-ranges). +- Current thread id. [Uint64](../data-types/int-uint.md#uint-ranges). **Example** @@ -444,7 +444,7 @@ logTrace('message') **Arguments** -- `message` — Message that is emitted to server log. [String](../../sql-reference/data-types/string.md#string). +- `message` — Message that is emitted to server log. [String](../data-types/string.md#string). 
**Returned value** diff --git a/docs/en/sql-reference/functions/ip-address-functions.md b/docs/en/sql-reference/functions/ip-address-functions.md index 21beffbd0a8..5b6a3aef2c8 100644 --- a/docs/en/sql-reference/functions/ip-address-functions.md +++ b/docs/en/sql-reference/functions/ip-address-functions.md @@ -147,11 +147,11 @@ IPv6StringToNum(string) **Argument** -- `string` — IP address. [String](../../sql-reference/data-types/string.md). +- `string` — IP address. [String](../data-types/string.md). **Returned value** -- IPv6 address in binary format. [FixedString(16)](../../sql-reference/data-types/fixedstring.md). +- IPv6 address in binary format. [FixedString(16)](../data-types/fixedstring.md). **Example** @@ -246,7 +246,7 @@ SELECT IPv6CIDRToRange(toIPv6('2001:0db8:0000:85a3:0000:0000:ac1f:8001'), 32); ## toIPv4(string) -An alias to `IPv4StringToNum()` that takes a string form of IPv4 address and returns value of [IPv4](../../sql-reference/data-types/ipv4.md) type, which is binary equal to value returned by `IPv4StringToNum()`. +An alias to `IPv4StringToNum()` that takes a string form of IPv4 address and returns value of [IPv4](../data-types/ipv4.md) type, which is binary equal to value returned by `IPv4StringToNum()`. ``` sql WITH @@ -294,7 +294,7 @@ Same as `toIPv6`, but if the IPv6 address has an invalid format, it returns null ## toIPv6 -Converts a string form of IPv6 address to [IPv6](../../sql-reference/data-types/ipv6.md) type. If the IPv6 address has an invalid format, returns an empty value. +Converts a string form of IPv6 address to [IPv6](../data-types/ipv6.md) type. If the IPv6 address has an invalid format, returns an empty value. Similar to [IPv6StringToNum](#ipv6stringtonums) function, which converts IPv6 address to binary format. If the input string contains a valid IPv4 address, then the IPv6 equivalent of the IPv4 address is returned. @@ -307,11 +307,11 @@ toIPv6(string) **Argument** -- `string` — IP address. [String](../../sql-reference/data-types/string.md) +- `string` — IP address. [String](../data-types/string.md) **Returned value** -- IP address. [IPv6](../../sql-reference/data-types/ipv6.md). +- IP address. [IPv6](../data-types/ipv6.md). **Examples** @@ -366,11 +366,11 @@ isIPv4String(string) **Arguments** -- `string` — IP address. [String](../../sql-reference/data-types/string.md). +- `string` — IP address. [String](../data-types/string.md). **Returned value** -- `1` if `string` is IPv4 address, `0` otherwise. [UInt8](../../sql-reference/data-types/int-uint.md). +- `1` if `string` is IPv4 address, `0` otherwise. [UInt8](../data-types/int-uint.md). **Examples** @@ -402,11 +402,11 @@ isIPv6String(string) **Arguments** -- `string` — IP address. [String](../../sql-reference/data-types/string.md). +- `string` — IP address. [String](../data-types/string.md). **Returned value** -- `1` if `string` is IPv6 address, `0` otherwise. [UInt8](../../sql-reference/data-types/int-uint.md). +- `1` if `string` is IPv6 address, `0` otherwise. [UInt8](../data-types/int-uint.md). **Examples** @@ -441,12 +441,12 @@ This function accepts both IPv4 and IPv6 addresses (and networks) represented as **Arguments** -- `address` — An IPv4 or IPv6 address. [String](../../sql-reference/data-types/string.md). -- `prefix` — An IPv4 or IPv6 network prefix in CIDR. [String](../../sql-reference/data-types/string.md). +- `address` — An IPv4 or IPv6 address. [String](../data-types/string.md). +- `prefix` — An IPv4 or IPv6 network prefix in CIDR. [String](../data-types/string.md). 
**Returned value** -- `1` or `0`. [UInt8](../../sql-reference/data-types/int-uint.md). +- `1` or `0`. [UInt8](../data-types/int-uint.md). **Example** diff --git a/docs/en/sql-reference/functions/json-functions.md b/docs/en/sql-reference/functions/json-functions.md index dc4a3d871e7..8359d5f9fbc 100644 --- a/docs/en/sql-reference/functions/json-functions.md +++ b/docs/en/sql-reference/functions/json-functions.md @@ -31,7 +31,7 @@ simpleJSONHas(json, field_name) **Parameters** -- `json`: The JSON in which the field is searched for. [String](../../sql-reference/data-types/string.md#string) +- `json`: The JSON in which the field is searched for. [String](../data-types/string.md#string) - `field_name`: The name of the field to search for. [String literal](../syntax#string) **Returned value** @@ -71,7 +71,7 @@ simpleJSONExtractUInt(json, field_name) **Parameters** -- `json`: The JSON in which the field is searched for. [String](../../sql-reference/data-types/string.md#string) +- `json`: The JSON in which the field is searched for. [String](../data-types/string.md#string) - `field_name`: The name of the field to search for. [String literal](../syntax#string) **Returned value** @@ -118,7 +118,7 @@ simpleJSONExtractInt(json, field_name) **Parameters** -- `json`: The JSON in which the field is searched for. [String](../../sql-reference/data-types/string.md#string) +- `json`: The JSON in which the field is searched for. [String](../data-types/string.md#string) - `field_name`: The name of the field to search for. [String literal](../syntax#string) **Returned value** @@ -165,7 +165,7 @@ simpleJSONExtractFloat(json, field_name) **Parameters** -- `json`: The JSON in which the field is searched for. [String](../../sql-reference/data-types/string.md#string) +- `json`: The JSON in which the field is searched for. [String](../data-types/string.md#string) - `field_name`: The name of the field to search for. [String literal](../syntax#string) **Returned value** @@ -212,7 +212,7 @@ simpleJSONExtractBool(json, field_name) **Parameters** -- `json`: The JSON in which the field is searched for. [String](../../sql-reference/data-types/string.md#string) +- `json`: The JSON in which the field is searched for. [String](../data-types/string.md#string) - `field_name`: The name of the field to search for. [String literal](../syntax#string) **Returned value** @@ -259,12 +259,12 @@ simpleJSONExtractRaw(json, field_name) **Parameters** -- `json`: The JSON in which the field is searched for. [String](../../sql-reference/data-types/string.md#string) +- `json`: The JSON in which the field is searched for. [String](../data-types/string.md#string) - `field_name`: The name of the field to search for. [String literal](../syntax#string) **Returned value** -It returns the value of the field as a [`String`](../../sql-reference/data-types/string.md#string), including separators if the field exists, or an empty `String` otherwise. +It returns the value of the field as a [`String`](../data-types/string.md#string), including separators if the field exists, or an empty `String` otherwise. **Example** @@ -306,12 +306,12 @@ simpleJSONExtractString(json, field_name) **Parameters** -- `json`: The JSON in which the field is searched for. [String](../../sql-reference/data-types/string.md#string) +- `json`: The JSON in which the field is searched for. [String](../data-types/string.md#string) - `field_name`: The name of the field to search for. 
[String literal](../syntax#string) **Returned value** -It returns the value of a field as a [`String`](../../sql-reference/data-types/string.md#string), including separators. The value is unescaped. It returns an empty `String`: if the field doesn't contain a double quoted string, if unescaping fails or if the field doesn't exist. +It returns the value of a field as a [`String`](../data-types/string.md#string), including separators. The value is unescaped. It returns an empty `String`: if the field doesn't contain a double quoted string, if unescaping fails or if the field doesn't exist. **Implementation details** @@ -528,12 +528,12 @@ JSONExtractKeys(json[, a, b, c...]) **Arguments** -- `json` — [String](../../sql-reference/data-types/string.md) with valid JSON. -- `a, b, c...` — Comma-separated indices or keys that specify the path to the inner field in a nested JSON object. Each argument can be either a [String](../../sql-reference/data-types/string.md) to get the field by the key or an [Integer](../../sql-reference/data-types/int-uint.md) to get the N-th field (indexed from 1, negative integers count from the end). If not set, the whole JSON is parsed as the top-level object. Optional parameter. +- `json` — [String](../data-types/string.md) with valid JSON. +- `a, b, c...` — Comma-separated indices or keys that specify the path to the inner field in a nested JSON object. Each argument can be either a [String](../data-types/string.md) to get the field by the key or an [Integer](../data-types/int-uint.md) to get the N-th field (indexed from 1, negative integers count from the end). If not set, the whole JSON is parsed as the top-level object. Optional parameter. **Returned value** -Array with the keys of the JSON. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +Array with the keys of the JSON. [Array](../data-types/array.md)([String](../data-types/string.md)). **Example** @@ -588,13 +588,13 @@ JSONExtractKeysAndValuesRaw(json[, p, a, t, h]) **Arguments** -- `json` — [String](../../sql-reference/data-types/string.md) with valid JSON. -- `p, a, t, h` — Comma-separated indices or keys that specify the path to the inner field in a nested JSON object. Each argument can be either a [string](../../sql-reference/data-types/string.md) to get the field by the key or an [integer](../../sql-reference/data-types/int-uint.md) to get the N-th field (indexed from 1, negative integers count from the end). If not set, the whole JSON is parsed as the top-level object. Optional parameter. +- `json` — [String](../data-types/string.md) with valid JSON. +- `p, a, t, h` — Comma-separated indices or keys that specify the path to the inner field in a nested JSON object. Each argument can be either a [string](../data-types/string.md) to get the field by the key or an [integer](../data-types/int-uint.md) to get the N-th field (indexed from 1, negative integers count from the end). If not set, the whole JSON is parsed as the top-level object. Optional parameter. **Returned values** -- Array with `('key', 'value')` tuples. Both tuple members are strings. [Array](../../sql-reference/data-types/array.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md), [String](../../sql-reference/data-types/string.md)). -- Empty array if the requested object does not exist, or input JSON is invalid. 
[Array](../../sql-reference/data-types/array.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md), [String](../../sql-reference/data-types/string.md)). +- Array with `('key', 'value')` tuples. Both tuple members are strings. [Array](../data-types/array.md)([Tuple](../data-types/tuple.md)([String](../data-types/string.md), [String](../data-types/string.md)). +- Empty array if the requested object does not exist, or input JSON is invalid. [Array](../data-types/array.md)([Tuple](../data-types/tuple.md)([String](../data-types/string.md), [String](../data-types/string.md)). **Examples** @@ -719,9 +719,9 @@ Before version 21.11 the order of arguments was wrong, i.e. JSON_VALUE(path, jso ## toJSONString Serializes a value to its JSON representation. Various data types and nested structures are supported. -64-bit [integers](../../sql-reference/data-types/int-uint.md) or bigger (like `UInt64` or `Int128`) are enclosed in quotes by default. [output_format_json_quote_64bit_integers](../../operations/settings/settings.md#session_settings-output_format_json_quote_64bit_integers) controls this behavior. +64-bit [integers](../data-types/int-uint.md) or bigger (like `UInt64` or `Int128`) are enclosed in quotes by default. [output_format_json_quote_64bit_integers](../../operations/settings/settings.md#session_settings-output_format_json_quote_64bit_integers) controls this behavior. Special values `NaN` and `inf` are replaced with `null`. Enable [output_format_json_quote_denormals](../../operations/settings/settings.md#settings-output_format_json_quote_denormals) setting to show them. -When serializing an [Enum](../../sql-reference/data-types/enum.md) value, the function outputs its name. +When serializing an [Enum](../data-types/enum.md) value, the function outputs its name. **Syntax** @@ -735,12 +735,12 @@ toJSONString(value) **Returned value** -- JSON representation of the value. [String](../../sql-reference/data-types/string.md). +- JSON representation of the value. [String](../data-types/string.md). **Example** -The first example shows serialization of a [Map](../../sql-reference/data-types/map.md). -The second example shows some special values wrapped into a [Tuple](../../sql-reference/data-types/tuple.md). +The first example shows serialization of a [Map](../data-types/map.md). +The second example shows some special values wrapped into a [Tuple](../data-types/tuple.md). Query: @@ -776,11 +776,11 @@ Alias: `JSON_ARRAY_LENGTH(json)`. **Arguments** -- `json` — [String](../../sql-reference/data-types/string.md) with valid JSON. +- `json` — [String](../data-types/string.md) with valid JSON. **Returned value** -- If `json` is a valid JSON array string, returns the number of array elements, otherwise returns NULL. [Nullable(UInt64)](../../sql-reference/data-types/int-uint.md). +- If `json` is a valid JSON array string, returns the number of array elements, otherwise returns NULL. [Nullable(UInt64)](../data-types/int-uint.md). **Example** @@ -807,11 +807,11 @@ jsonMergePatch(json1, json2, ...) **Arguments** -- `json` — [String](../../sql-reference/data-types/string.md) with valid JSON. +- `json` — [String](../data-types/string.md) with valid JSON. **Returned value** -- If JSON object strings are valid, return the merged JSON object string. [String](../../sql-reference/data-types/string.md). +- If JSON object strings are valid, return the merged JSON object string. [String](../data-types/string.md). 
**Example** diff --git a/docs/en/sql-reference/functions/logical-functions.md b/docs/en/sql-reference/functions/logical-functions.md index 1977c5c2a7e..8448dd4ff12 100644 --- a/docs/en/sql-reference/functions/logical-functions.md +++ b/docs/en/sql-reference/functions/logical-functions.md @@ -6,7 +6,7 @@ sidebar_label: Logical # Logical Functions -Below functions perform logical operations on arguments of arbitrary numeric types. They return either 0 or 1 as [UInt8](../../sql-reference/data-types/int-uint.md) or in some cases `NULL`. +Below functions perform logical operations on arguments of arbitrary numeric types. They return either 0 or 1 as [UInt8](../data-types/int-uint.md) or in some cases `NULL`. Zero as an argument is considered `false`, non-zero values are considered `true`. @@ -26,13 +26,13 @@ Alias: The [AND operator](../../sql-reference/operators/index.md#logical-and-ope **Arguments** -- `val1, val2, ...` — List of at least two values. [Int](../../sql-reference/data-types/int-uint.md), [UInt](../../sql-reference/data-types/int-uint.md), [Float](../../sql-reference/data-types/float.md) or [Nullable](../../sql-reference/data-types/nullable.md). +- `val1, val2, ...` — List of at least two values. [Int](../data-types/int-uint.md), [UInt](../data-types/int-uint.md), [Float](../data-types/float.md) or [Nullable](../data-types/nullable.md). **Returned value** -- `0`, if at least one argument evaluates to `false`. [UInt8](../../sql-reference/data-types/int-uint.md) or [Nullable](../../sql-reference/data-types/nullable.md)([UInt8](../../sql-reference/data-types/int-uint.md)). +- `0`, if at least one argument evaluates to `false`. [UInt8](../data-types/int-uint.md) or [Nullable](../data-types/nullable.md)([UInt8](../data-types/int-uint.md)). - `NULL`, if no argument evaluates to `false` and at least one argument is `NULL`. [NULL](../../sql-reference/syntax.md/#null). -- `1`, otherwise. [UInt8](../../sql-reference/data-types/int-uint.md) or [Nullable](../../sql-reference/data-types/nullable.md)([UInt8](../../sql-reference/data-types/int-uint.md)). +- `1`, otherwise. [UInt8](../data-types/int-uint.md) or [Nullable](../data-types/nullable.md)([UInt8](../data-types/int-uint.md)). **Example** @@ -78,7 +78,7 @@ Alias: The [OR operator](../../sql-reference/operators/index.md#logical-or-opera **Arguments** -- `val1, val2, ...` — List of at least two values. [Int](../../sql-reference/data-types/int-uint.md), [UInt](../../sql-reference/data-types/int-uint.md), [Float](../../sql-reference/data-types/float.md) or [Nullable](../../sql-reference/data-types/nullable.md). +- `val1, val2, ...` — List of at least two values. [Int](../data-types/int-uint.md), [UInt](../data-types/int-uint.md), [Float](../data-types/float.md) or [Nullable](../data-types/nullable.md). **Returned value** @@ -86,7 +86,7 @@ Alias: The [OR operator](../../sql-reference/operators/index.md#logical-or-opera - `0`, if all arguments evaluate to `false`, - `NULL`, if all arguments evaluate to `false` and at least one argument is `NULL`. -Type: [UInt8](../../sql-reference/data-types/int-uint.md) or [Nullable](../../sql-reference/data-types/nullable.md)([UInt8](../../sql-reference/data-types/int-uint.md)). +Type: [UInt8](../data-types/int-uint.md) or [Nullable](../data-types/nullable.md)([UInt8](../data-types/int-uint.md)). **Example** @@ -130,12 +130,12 @@ Alias: The [Negation operator](../../sql-reference/operators/index.md#logical-ne **Arguments** -- `val` — The value. 
[Int](../../sql-reference/data-types/int-uint.md), [UInt](../../sql-reference/data-types/int-uint.md), [Float](../../sql-reference/data-types/float.md) or [Nullable](../../sql-reference/data-types/nullable.md). +- `val` — The value. [Int](../data-types/int-uint.md), [UInt](../data-types/int-uint.md), [Float](../data-types/float.md) or [Nullable](../data-types/nullable.md). **Returned value** -- `1`, if `val` evaluates to `false`. [UInt8](../../sql-reference/data-types/int-uint.md) or [Nullable](../../sql-reference/data-types/nullable.md)([UInt8](../../sql-reference/data-types/int-uint.md)). -- `0`, if `val` evaluates to `true`. [UInt8](../../sql-reference/data-types/int-uint.md) or [Nullable](../../sql-reference/data-types/nullable.md)([UInt8](../../sql-reference/data-types/int-uint.md)). +- `1`, if `val` evaluates to `false`. [UInt8](../data-types/int-uint.md) or [Nullable](../data-types/nullable.md)([UInt8](../data-types/int-uint.md)). +- `0`, if `val` evaluates to `true`. [UInt8](../data-types/int-uint.md) or [Nullable](../data-types/nullable.md)([UInt8](../data-types/int-uint.md)). - `NULL`, if `val` is `NULL`. [NULL](../../sql-reference/syntax.md/#null). **Example** @@ -164,12 +164,12 @@ xor(val1, val2...) **Arguments** -- `val1, val2, ...` — List of at least two values. [Int](../../sql-reference/data-types/int-uint.md), [UInt](../../sql-reference/data-types/int-uint.md), [Float](../../sql-reference/data-types/float.md) or [Nullable](../../sql-reference/data-types/nullable.md). +- `val1, val2, ...` — List of at least two values. [Int](../data-types/int-uint.md), [UInt](../data-types/int-uint.md), [Float](../data-types/float.md) or [Nullable](../data-types/nullable.md). **Returned value** -- `1`, for two values: if one of the values evaluates to `false` and other does not. [UInt8](../../sql-reference/data-types/int-uint.md) or [Nullable](../../sql-reference/data-types/nullable.md)([UInt8](../../sql-reference/data-types/int-uint.md)). -- `0`, for two values: if both values evaluate to `false` or to both `true`. [UInt8](../../sql-reference/data-types/int-uint.md) or [Nullable](../../sql-reference/data-types/nullable.md)([UInt8](../../sql-reference/data-types/int-uint.md)). +- `1`, for two values: if one of the values evaluates to `false` and other does not. [UInt8](../data-types/int-uint.md) or [Nullable](../data-types/nullable.md)([UInt8](../data-types/int-uint.md)). +- `0`, for two values: if both values evaluate to `false` or to both `true`. [UInt8](../data-types/int-uint.md) or [Nullable](../data-types/nullable.md)([UInt8](../data-types/int-uint.md)). - `NULL`, if at least one of the inputs is `NULL`. [NULL](../../sql-reference/syntax.md/#null). **Example** diff --git a/docs/en/sql-reference/functions/math-functions.md b/docs/en/sql-reference/functions/math-functions.md index 03ddc38ef50..7f50fa933b6 100644 --- a/docs/en/sql-reference/functions/math-functions.md +++ b/docs/en/sql-reference/functions/math-functions.md @@ -18,7 +18,7 @@ e() **Returned value** -Type: [Float64](../../sql-reference/data-types/float.md). +Type: [Float64](../data-types/float.md). ## pi @@ -31,7 +31,7 @@ pi() ``` **Returned value** -Type: [Float64](../../sql-reference/data-types/float.md). +Type: [Float64](../data-types/float.md). ## exp @@ -45,11 +45,11 @@ exp(x) **Arguments** -- `x` - [(U)Int*](../../sql-reference/data-types/int-uint.md), [Float*](../../sql-reference/data-types/float.md) or [Decimal*](../../sql-reference/data-types/decimal.md). 
+- `x` - [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md) or [Decimal*](../data-types/decimal.md). **Returned value** -Type: [Float*](../../sql-reference/data-types/float.md). +Type: [Float*](../data-types/float.md). ## log @@ -65,11 +65,11 @@ Alias: `ln(x)` **Arguments** -- `x` - [(U)Int*](../../sql-reference/data-types/int-uint.md), [Float*](../../sql-reference/data-types/float.md) or [Decimal*](../../sql-reference/data-types/decimal.md). +- `x` - [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md) or [Decimal*](../data-types/decimal.md). **Returned value** -Type: [Float*](../../sql-reference/data-types/float.md). +Type: [Float*](../data-types/float.md). ## exp2 @@ -83,11 +83,11 @@ exp2(x) **Arguments** -- `x` - [(U)Int*](../../sql-reference/data-types/int-uint.md), [Float*](../../sql-reference/data-types/float.md) or [Decimal*](../../sql-reference/data-types/decimal.md). +- `x` - [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md) or [Decimal*](../data-types/decimal.md). **Returned value** -Type: [Float*](../../sql-reference/data-types/float.md). +Type: [Float*](../data-types/float.md). ## intExp2 @@ -111,11 +111,11 @@ log2(x) **Arguments** -- `x` - [(U)Int*](../../sql-reference/data-types/int-uint.md), [Float*](../../sql-reference/data-types/float.md) or [Decimal*](../../sql-reference/data-types/decimal.md). +- `x` - [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md) or [Decimal*](../data-types/decimal.md). **Returned value** -Type: [Float*](../../sql-reference/data-types/float.md). +Type: [Float*](../data-types/float.md). ## exp10 @@ -129,11 +129,11 @@ exp10(x) **Arguments** -- `x` - [(U)Int*](../../sql-reference/data-types/int-uint.md), [Float*](../../sql-reference/data-types/float.md) or [Decimal*](../../sql-reference/data-types/decimal.md). +- `x` - [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md) or [Decimal*](../data-types/decimal.md). **Returned value** -Type: [Float*](../../sql-reference/data-types/float.md). +Type: [Float*](../data-types/float.md). ## intExp10 @@ -157,11 +157,11 @@ log10(x) **Arguments** -- `x` - [(U)Int*](../../sql-reference/data-types/int-uint.md), [Float*](../../sql-reference/data-types/float.md) or [Decimal*](../../sql-reference/data-types/decimal.md). +- `x` - [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md) or [Decimal*](../data-types/decimal.md). **Returned value** -Type: [Float*](../../sql-reference/data-types/float.md). +Type: [Float*](../data-types/float.md). ## sqrt @@ -173,11 +173,11 @@ sqrt(x) **Arguments** -- `x` - [(U)Int*](../../sql-reference/data-types/int-uint.md), [Float*](../../sql-reference/data-types/float.md) or [Decimal*](../../sql-reference/data-types/decimal.md). +- `x` - [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md) or [Decimal*](../data-types/decimal.md). **Returned value** -Type: [Float*](../../sql-reference/data-types/float.md). +Type: [Float*](../data-types/float.md). ## cbrt @@ -189,11 +189,11 @@ cbrt(x) **Arguments** -- `x` - [(U)Int*](../../sql-reference/data-types/int-uint.md), [Float*](../../sql-reference/data-types/float.md) or [Decimal*](../../sql-reference/data-types/decimal.md). +- `x` - [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md) or [Decimal*](../data-types/decimal.md). **Returned value** -Type: [Float*](../../sql-reference/data-types/float.md). +Type: [Float*](../data-types/float.md). 
## erf @@ -207,11 +207,11 @@ erf(x) **Arguments** -- `x` - [(U)Int*](../../sql-reference/data-types/int-uint.md), [Float*](../../sql-reference/data-types/float.md) or [Decimal*](../../sql-reference/data-types/decimal.md). +- `x` - [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md) or [Decimal*](../data-types/decimal.md). **Returned value** -Type: [Float*](../../sql-reference/data-types/float.md). +Type: [Float*](../data-types/float.md). **Example** @@ -239,11 +239,11 @@ erfc(x) **Arguments** -- `x` - [(U)Int*](../../sql-reference/data-types/int-uint.md), [Float*](../../sql-reference/data-types/float.md) or [Decimal*](../../sql-reference/data-types/decimal.md). +- `x` - [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md) or [Decimal*](../data-types/decimal.md). **Returned value** -Type: [Float*](../../sql-reference/data-types/float.md). +Type: [Float*](../data-types/float.md). ## lgamma @@ -257,11 +257,11 @@ lgamma(x) **Arguments** -- `x` - [(U)Int*](../../sql-reference/data-types/int-uint.md), [Float*](../../sql-reference/data-types/float.md) or [Decimal*](../../sql-reference/data-types/decimal.md). +- `x` - [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md) or [Decimal*](../data-types/decimal.md). **Returned value** -Type: [Float*](../../sql-reference/data-types/float.md). +Type: [Float*](../data-types/float.md). ## tgamma @@ -275,11 +275,11 @@ gamma(x) **Arguments** -- `x` - [(U)Int*](../../sql-reference/data-types/int-uint.md), [Float*](../../sql-reference/data-types/float.md) or [Decimal*](../../sql-reference/data-types/decimal.md). +- `x` - [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md) or [Decimal*](../data-types/decimal.md). **Returned value** -Type: [Float*](../../sql-reference/data-types/float.md). +Type: [Float*](../data-types/float.md). ## sin @@ -293,11 +293,11 @@ sin(x) **Arguments** -- `x` - [(U)Int*](../../sql-reference/data-types/int-uint.md), [Float*](../../sql-reference/data-types/float.md) or [Decimal*](../../sql-reference/data-types/decimal.md). +- `x` - [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md) or [Decimal*](../data-types/decimal.md). **Returned value** -Type: [Float*](../../sql-reference/data-types/float.md). +Type: [Float*](../data-types/float.md). **Example** @@ -323,11 +323,11 @@ cos(x) **Arguments** -- `x` - [(U)Int*](../../sql-reference/data-types/int-uint.md), [Float*](../../sql-reference/data-types/float.md) or [Decimal*](../../sql-reference/data-types/decimal.md). +- `x` - [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md) or [Decimal*](../data-types/decimal.md). **Returned value** -Type: [Float*](../../sql-reference/data-types/float.md). +Type: [Float*](../data-types/float.md). ## tan @@ -341,11 +341,11 @@ tan(x) **Arguments** -- `x` - [(U)Int*](../../sql-reference/data-types/int-uint.md), [Float*](../../sql-reference/data-types/float.md) or [Decimal*](../../sql-reference/data-types/decimal.md). +- `x` - [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md) or [Decimal*](../data-types/decimal.md). **Returned value** -Type: [Float*](../../sql-reference/data-types/float.md). +Type: [Float*](../data-types/float.md). ## asin @@ -359,11 +359,11 @@ asin(x) **Arguments** -- `x` - [(U)Int*](../../sql-reference/data-types/int-uint.md), [Float*](../../sql-reference/data-types/float.md) or [Decimal*](../../sql-reference/data-types/decimal.md). 
+- `x` - [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md) or [Decimal*](../data-types/decimal.md). **Returned value** -Type: [Float*](../../sql-reference/data-types/float.md). +Type: [Float*](../data-types/float.md). ## acos @@ -377,11 +377,11 @@ acos(x) **Arguments** -- `x` - [(U)Int*](../../sql-reference/data-types/int-uint.md), [Float*](../../sql-reference/data-types/float.md) or [Decimal*](../../sql-reference/data-types/decimal.md). +- `x` - [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md) or [Decimal*](../data-types/decimal.md). **Returned value** -Type: [Float*](../../sql-reference/data-types/float.md). +Type: [Float*](../data-types/float.md). ## atan @@ -395,11 +395,11 @@ atan(x) **Arguments** -- `x` - [(U)Int*](../../sql-reference/data-types/int-uint.md), [Float*](../../sql-reference/data-types/float.md) or [Decimal*](../../sql-reference/data-types/decimal.md). +- `x` - [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md) or [Decimal*](../data-types/decimal.md). **Returned value** -Type: [Float*](../../sql-reference/data-types/float.md). +Type: [Float*](../data-types/float.md). ## pow @@ -415,12 +415,12 @@ Alias: `power(x, y)` **Arguments** -- `x` - [(U)Int8/16/32/64](../../sql-reference/data-types/int-uint.md) or [Float*](../../sql-reference/data-types/float.md) -- `y` - [(U)Int8/16/32/64](../../sql-reference/data-types/int-uint.md) or [Float*](../../sql-reference/data-types/float.md) +- `x` - [(U)Int8/16/32/64](../data-types/int-uint.md) or [Float*](../data-types/float.md) +- `y` - [(U)Int8/16/32/64](../data-types/int-uint.md) or [Float*](../data-types/float.md) **Returned value** -Type: [Float64](../../sql-reference/data-types/float.md). +Type: [Float64](../data-types/float.md). ## cosh @@ -434,13 +434,13 @@ cosh(x) **Arguments** -- `x` — The angle, in radians. Values from the interval: `-∞ < x < +∞`. [(U)Int*](../../sql-reference/data-types/int-uint.md), [Float*](../../sql-reference/data-types/float.md) or [Decimal*](../../sql-reference/data-types/decimal.md). +- `x` — The angle, in radians. Values from the interval: `-∞ < x < +∞`. [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md) or [Decimal*](../data-types/decimal.md). **Returned value** - Values from the interval: `1 <= cosh(x) < +∞`. -Type: [Float64](../../sql-reference/data-types/float.md#float32-float64). +Type: [Float64](../data-types/float.md#float32-float64). **Example** @@ -468,13 +468,13 @@ acosh(x) **Arguments** -- `x` — Hyperbolic cosine of angle. Values from the interval: `1 <= x < +∞`. [(U)Int*](../../sql-reference/data-types/int-uint.md), [Float*](../../sql-reference/data-types/float.md) or [Decimal*](../../sql-reference/data-types/decimal.md). +- `x` — Hyperbolic cosine of angle. Values from the interval: `1 <= x < +∞`. [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md) or [Decimal*](../data-types/decimal.md). **Returned value** - The angle, in radians. Values from the interval: `0 <= acosh(x) < +∞`. -Type: [Float64](../../sql-reference/data-types/float.md#float32-float64). +Type: [Float64](../data-types/float.md#float32-float64). **Example** @@ -502,13 +502,13 @@ sinh(x) **Arguments** -- `x` — The angle, in radians. Values from the interval: `-∞ < x < +∞`. [(U)Int*](../../sql-reference/data-types/int-uint.md), [Float*](../../sql-reference/data-types/float.md) or [Decimal*](../../sql-reference/data-types/decimal.md). +- `x` — The angle, in radians. Values from the interval: `-∞ < x < +∞`. 
[(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md) or [Decimal*](../data-types/decimal.md). **Returned value** - Values from the interval: `-∞ < sinh(x) < +∞`. -Type: [Float64](../../sql-reference/data-types/float.md#float32-float64). +Type: [Float64](../data-types/float.md#float32-float64). **Example** @@ -536,13 +536,13 @@ asinh(x) **Arguments** -- `x` — Hyperbolic sine of angle. Values from the interval: `-∞ < x < +∞`. [(U)Int*](../../sql-reference/data-types/int-uint.md), [Float*](../../sql-reference/data-types/float.md) or [Decimal*](../../sql-reference/data-types/decimal.md). +- `x` — Hyperbolic sine of angle. Values from the interval: `-∞ < x < +∞`. [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md) or [Decimal*](../data-types/decimal.md). **Returned value** - The angle, in radians. Values from the interval: `-∞ < asinh(x) < +∞`. -Type: [Float64](../../sql-reference/data-types/float.md#float32-float64). +Type: [Float64](../data-types/float.md#float32-float64). **Example** @@ -569,13 +569,13 @@ tanh(x) **Arguments** -- `x` — The angle, in radians. Values from the interval: `-∞ < x < +∞`. [(U)Int*](../../sql-reference/data-types/int-uint.md), [Float*](../../sql-reference/data-types/float.md) or [Decimal*](../../sql-reference/data-types/decimal.md). +- `x` — The angle, in radians. Values from the interval: `-∞ < x < +∞`. [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md) or [Decimal*](../data-types/decimal.md). **Returned value** - Values from the interval: `-1 < tanh(x) < 1`. -Type: [Float*](../../sql-reference/data-types/float.md#float32-float64). +Type: [Float*](../data-types/float.md#float32-float64). **Example** @@ -601,13 +601,13 @@ atanh(x) **Arguments** -- `x` — Hyperbolic tangent of angle. Values from the interval: `–1 < x < 1`. [(U)Int*](../../sql-reference/data-types/int-uint.md), [Float*](../../sql-reference/data-types/float.md) or [Decimal*](../../sql-reference/data-types/decimal.md). +- `x` — Hyperbolic tangent of angle. Values from the interval: `–1 < x < 1`. [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md) or [Decimal*](../data-types/decimal.md). **Returned value** - The angle, in radians. Values from the interval: `-∞ < atanh(x) < +∞`. -Type: [Float64](../../sql-reference/data-types/float.md#float32-float64). +Type: [Float64](../data-types/float.md#float32-float64). **Example** @@ -635,14 +635,14 @@ atan2(y, x) **Arguments** -- `y` — y-coordinate of the point through which the ray passes. [(U)Int*](../../sql-reference/data-types/int-uint.md), [Float*](../../sql-reference/data-types/float.md). -- `x` — x-coordinate of the point through which the ray passes. [(U)Int*](../../sql-reference/data-types/int-uint.md), [Float*](../../sql-reference/data-types/float.md). +- `y` — y-coordinate of the point through which the ray passes. [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md). +- `x` — x-coordinate of the point through which the ray passes. [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md). **Returned value** - The angle `θ` such that `−π < θ ≤ π`, in radians. -Type: [Float64](../../sql-reference/data-types/float.md#float32-float64). +Type: [Float64](../data-types/float.md#float32-float64). **Example** @@ -670,14 +670,14 @@ hypot(x, y) **Arguments** -- `x` — The first cathetus of a right-angle triangle. [(U)Int*](../../sql-reference/data-types/int-uint.md), [Float*](../../sql-reference/data-types/float.md). 
-- `y` — The second cathetus of a right-angle triangle. [(U)Int*](../../sql-reference/data-types/int-uint.md), [Float*](../../sql-reference/data-types/float.md). +- `x` — The first cathetus of a right-angle triangle. [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md). +- `y` — The second cathetus of a right-angle triangle. [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md). **Returned value** - The length of the hypotenuse of a right-angle triangle. -Type: [Float64](../../sql-reference/data-types/float.md#float32-float64). +Type: [Float64](../data-types/float.md#float32-float64). **Example** @@ -705,13 +705,13 @@ log1p(x) **Arguments** -- `x` — Values from the interval: `-1 < x < +∞`. [(U)Int*](../../sql-reference/data-types/int-uint.md), [Float*](../../sql-reference/data-types/float.md) or [Decimal*](../../sql-reference/data-types/decimal.md). +- `x` — Values from the interval: `-1 < x < +∞`. [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md) or [Decimal*](../data-types/decimal.md). **Returned value** - Values from the interval: `-∞ < log1p(x) < +∞`. -Type: [Float64](../../sql-reference/data-types/float.md#float32-float64). +Type: [Float64](../data-types/float.md#float32-float64). **Example** @@ -747,7 +747,7 @@ sign(x) - 0 for `x = 0` - 1 for `x > 0` -Type: [Int8](../../sql-reference/data-types/int-uint.md). +Type: [Int8](../data-types/int-uint.md). **Examples** @@ -804,11 +804,11 @@ sigmoid(x) **Parameters** -- `x` — input value. Values from the interval: `-∞ < x < +∞`. [(U)Int*](../../sql-reference/data-types/int-uint.md), [Float*](../../sql-reference/data-types/float.md) or [Decimal*](../../sql-reference/data-types/decimal.md). +- `x` — input value. Values from the interval: `-∞ < x < +∞`. [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md) or [Decimal*](../data-types/decimal.md). **Returned value** -- Corresponding value along the sigmoid curve between 0 and 1. [Float64](../../sql-reference/data-types/float.md). +- Corresponding value along the sigmoid curve between 0 and 1. [Float64](../data-types/float.md). **Example** @@ -838,11 +838,11 @@ degrees(x) **Arguments** -- `x` — Input in radians. [(U)Int*](../../sql-reference/data-types/int-uint.md), [Float*](../../sql-reference/data-types/float.md) or [Decimal*](../../sql-reference/data-types/decimal.md). +- `x` — Input in radians. [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md) or [Decimal*](../data-types/decimal.md). **Returned value** -- Value in degrees. [Float64](../../sql-reference/data-types/float.md#float32-float64). +- Value in degrees. [Float64](../data-types/float.md#float32-float64). **Example** @@ -870,13 +870,13 @@ radians(x) **Arguments** -- `x` — Input in degrees. [(U)Int*](../../sql-reference/data-types/int-uint.md), [Float*](../../sql-reference/data-types/float.md) or [Decimal*](../../sql-reference/data-types/decimal.md). +- `x` — Input in degrees. [(U)Int*](../data-types/int-uint.md), [Float*](../data-types/float.md) or [Decimal*](../data-types/decimal.md). **Returned value** - Value in radians. -Type: [Float64](../../sql-reference/data-types/float.md#float32-float64). +Type: [Float64](../data-types/float.md#float32-float64). 
**Example** diff --git a/docs/en/sql-reference/functions/nlp-functions.md b/docs/en/sql-reference/functions/nlp-functions.md index 3e0458d226d..4bfa181a35f 100644 --- a/docs/en/sql-reference/functions/nlp-functions.md +++ b/docs/en/sql-reference/functions/nlp-functions.md @@ -23,7 +23,7 @@ stem('language', word) ### Arguments - `language` — Language which rules will be applied. Use the two letter [ISO 639-1 code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). -- `word` — word that needs to be stemmed. Must be in lowercase. [String](../../sql-reference/data-types/string.md#string). +- `word` — word that needs to be stemmed. Must be in lowercase. [String](../data-types/string.md#string). ### Examples @@ -88,8 +88,8 @@ lemmatize('language', word) ### Arguments -- `language` — Language which rules will be applied. [String](../../sql-reference/data-types/string.md#string). -- `word` — Word that needs to be lemmatized. Must be lowercase. [String](../../sql-reference/data-types/string.md#string). +- `language` — Language which rules will be applied. [String](../data-types/string.md#string). +- `word` — Word that needs to be lemmatized. Must be lowercase. [String](../data-types/string.md#string). ### Examples @@ -139,8 +139,8 @@ synonyms('extension_name', word) ### Arguments -- `extension_name` — Name of the extension in which search will be performed. [String](../../sql-reference/data-types/string.md#string). -- `word` — Word that will be searched in extension. [String](../../sql-reference/data-types/string.md#string). +- `extension_name` — Name of the extension in which search will be performed. [String](../data-types/string.md#string). +- `word` — Word that will be searched in extension. [String](../data-types/string.md#string). ### Examples @@ -188,7 +188,7 @@ detectLanguage('text_to_be_analyzed') ### Arguments -- `text_to_be_analyzed` — A collection (or sentences) of strings to analyze. [String](../../sql-reference/data-types/string.md#string). +- `text_to_be_analyzed` — A collection (or sentences) of strings to analyze. [String](../data-types/string.md#string). ### Returned value @@ -226,7 +226,7 @@ detectLanguageMixed('text_to_be_analyzed') ### Arguments -- `text_to_be_analyzed` — A collection (or sentences) of strings to analyze. [String](../../sql-reference/data-types/string.md#string). +- `text_to_be_analyzed` — A collection (or sentences) of strings to analyze. [String](../data-types/string.md#string). ### Returned value @@ -262,7 +262,7 @@ detectLanguageUnknown('text_to_be_analyzed') ### Arguments -- `text_to_be_analyzed` — A collection (or sentences) of strings to analyze. [String](../../sql-reference/data-types/string.md#string). +- `text_to_be_analyzed` — A collection (or sentences) of strings to analyze. [String](../data-types/string.md#string). ### Returned value @@ -302,7 +302,7 @@ detectCharset('text_to_be_analyzed') ### Arguments -- `text_to_be_analyzed` — A collection (or sentences) of strings to analyze. [String](../../sql-reference/data-types/string.md#string). +- `text_to_be_analyzed` — A collection (or sentences) of strings to analyze. [String](../data-types/string.md#string). ### Returned value diff --git a/docs/en/sql-reference/functions/other-functions.md b/docs/en/sql-reference/functions/other-functions.md index 45fc12388fe..dfe1224f7b8 100644 --- a/docs/en/sql-reference/functions/other-functions.md +++ b/docs/en/sql-reference/functions/other-functions.md @@ -33,11 +33,11 @@ getMacro(name); **Arguments** -- `name` — Macro name to retrieve from the `` section. 
[String](../../sql-reference/data-types/string.md#string). +- `name` — Macro name to retrieve from the `` section. [String](../data-types/string.md#string). **Returned value** -- Value of the specified macro. [String](../../sql-reference/data-types/string.md). +- Value of the specified macro. [String](../data-types/string.md). **Example** @@ -116,7 +116,7 @@ basename(expr) **Arguments** -- `expr` — A value of type [String](../../sql-reference/data-types/string.md). Backslashes must be escaped. +- `expr` — A value of type [String](../data-types/string.md). Backslashes must be escaped. **Returned Value** @@ -237,11 +237,11 @@ byteSize(argument [, ...]) **Returned value** -- Estimation of byte size of the arguments in memory. [UInt64](../../sql-reference/data-types/int-uint.md). +- Estimation of byte size of the arguments in memory. [UInt64](../data-types/int-uint.md). **Examples** -For [String](../../sql-reference/data-types/string.md) arguments, the function returns the string length + 9 (terminating zero + length). +For [String](../data-types/string.md) arguments, the function returns the string length + 9 (terminating zero + length). Query: @@ -350,7 +350,7 @@ sleep(seconds) **Arguments** -- `seconds`: [UInt*](../../sql-reference/data-types/int-uint.md) or [Float](../../sql-reference/data-types/float.md) The number of seconds to pause the query execution to a maximum of 3 seconds. It can be a floating-point value to specify fractional seconds. +- `seconds`: [UInt*](../data-types/int-uint.md) or [Float](../data-types/float.md) The number of seconds to pause the query execution to a maximum of 3 seconds. It can be a floating-point value to specify fractional seconds. **Returned value** @@ -400,7 +400,7 @@ sleepEachRow(seconds) **Arguments** -- `seconds`: [UInt*](../../sql-reference/data-types/int-uint.md) or [Float*](../../sql-reference/data-types/float.md) The number of seconds to pause the query execution for each row in the result set to a maximum of 3 seconds. It can be a floating-point value to specify fractional seconds. +- `seconds`: [UInt*](../data-types/int-uint.md) or [Float*](../data-types/float.md) The number of seconds to pause the query execution for each row in the result set to a maximum of 3 seconds. It can be a floating-point value to specify fractional seconds. **Returned value** @@ -494,8 +494,8 @@ isConstant(x) **Returned values** -- `1` if `x` is constant. [UInt8](../../sql-reference/data-types/int-uint.md). -- `0` if `x` is non-constant. [UInt8](../../sql-reference/data-types/int-uint.md). +- `1` if `x` is constant. [UInt8](../data-types/int-uint.md). +- `0` if `x` is non-constant. [UInt8](../data-types/int-uint.md). **Examples** @@ -963,7 +963,7 @@ uptime() **Returned value** -- Time value of seconds. [UInt32](/docs/en/sql-reference/data-types/int-uint.md). +- Time value of seconds. [UInt32](../data-types/int-uint.md). **Example** @@ -1226,7 +1226,7 @@ To prevent that you can create a subquery with [ORDER BY](../../sql-reference/st **Arguments** - `column` — A column name or scalar expression. -- `offset` — The number of rows to look before or ahead of the current row in `column`. [Int64](../../sql-reference/data-types/int-uint.md). +- `offset` — The number of rows to look before or ahead of the current row in `column`. [Int64](../data-types/int-uint.md). - `default_value` — Optional. The returned value if offset is beyond the block boundaries. Type of data blocks affected. 
**Returned values** @@ -1446,12 +1446,12 @@ runningConcurrency(start, end) **Arguments** -- `start` — A column with the start time of events. [Date](../../sql-reference/data-types/date.md), [DateTime](../../sql-reference/data-types/datetime.md), or [DateTime64](../../sql-reference/data-types/datetime64.md). -- `end` — A column with the end time of events. [Date](../../sql-reference/data-types/date.md), [DateTime](../../sql-reference/data-types/datetime.md), or [DateTime64](../../sql-reference/data-types/datetime64.md). +- `start` — A column with the start time of events. [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), or [DateTime64](../data-types/datetime64.md). +- `end` — A column with the end time of events. [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), or [DateTime64](../data-types/datetime64.md). **Returned values** -- The number of concurrent events at each event start time. [UInt32](../../sql-reference/data-types/int-uint.md) +- The number of concurrent events at each event start time. [UInt32](../data-types/int-uint.md) **Example** @@ -1515,7 +1515,7 @@ MACStringToOUI(s) ## getSizeOfEnumType -Returns the number of fields in [Enum](../../sql-reference/data-types/enum.md). +Returns the number of fields in [Enum](../data-types/enum.md). An exception is thrown if the type is not `Enum`. **Syntax** @@ -1674,7 +1674,7 @@ defaultValueOfArgumentType(expression) - `0` for numbers. - Empty string for strings. -- `ᴺᵁᴸᴸ` for [Nullable](../../sql-reference/data-types/nullable.md). +- `ᴺᵁᴸᴸ` for [Nullable](../data-types/nullable.md). **Example** @@ -1724,7 +1724,7 @@ defaultValueOfTypeName(type) - `0` for numbers. - Empty string for strings. -- `ᴺᵁᴸᴸ` for [Nullable](../../sql-reference/data-types/nullable.md). +- `ᴺᵁᴸᴸ` for [Nullable](../data-types/nullable.md). **Example** @@ -1937,7 +1937,7 @@ filesystemAvailable() **Returned value** -- The amount of remaining space available in bytes. [UInt64](../../sql-reference/data-types/int-uint.md). +- The amount of remaining space available in bytes. [UInt64](../data-types/int-uint.md). **Example** @@ -1967,7 +1967,7 @@ filesystemFree() **Returned value** -- The amount of free space in bytes. [UInt64](../../sql-reference/data-types/int-uint.md). +- The amount of free space in bytes. [UInt64](../data-types/int-uint.md). **Example** @@ -1997,7 +1997,7 @@ filesystemCapacity() **Returned value** -- Capacity of the filesystem in bytes. [UInt64](../../sql-reference/data-types/int-uint.md). +- Capacity of the filesystem in bytes. [UInt64](../data-types/int-uint.md). **Example** @@ -2017,7 +2017,7 @@ Result: ## initializeAggregation -Calculates the result of an aggregate function based on a single value. This function can be used to initialize aggregate functions with combinator [-State](../../sql-reference/aggregate-functions/combinators.md#agg-functions-combinator-state). You can create states of aggregate functions and insert them to columns of type [AggregateFunction](../../sql-reference/data-types/aggregatefunction.md#data-type-aggregatefunction) or use initialized aggregates as default values. +Calculates the result of an aggregate function based on a single value. This function can be used to initialize aggregate functions with combinator [-State](../../sql-reference/aggregate-functions/combinators.md#agg-functions-combinator-state). 
You can create states of aggregate functions and insert them to columns of type [AggregateFunction](../data-types/aggregatefunction.md#data-type-aggregatefunction) or use initialized aggregates as default values. **Syntax** @@ -2027,7 +2027,7 @@ initializeAggregation (aggregate_function, arg1, arg2, ..., argN) **Arguments** -- `aggregate_function` — Name of the aggregation function to initialize. [String](../../sql-reference/data-types/string.md). +- `aggregate_function` — Name of the aggregation function to initialize. [String](../data-types/string.md). - `arg` — Arguments of aggregate function. **Returned value(s)** @@ -2102,7 +2102,7 @@ finalizeAggregation(state) **Arguments** -- `state` — State of aggregation. [AggregateFunction](../../sql-reference/data-types/aggregatefunction.md#data-type-aggregatefunction). +- `state` — State of aggregation. [AggregateFunction](../data-types/aggregatefunction.md#data-type-aggregatefunction). **Returned value(s)** @@ -2210,8 +2210,8 @@ runningAccumulate(agg_state[, grouping]); **Arguments** -- `agg_state` — State of the aggregate function. [AggregateFunction](../../sql-reference/data-types/aggregatefunction.md#data-type-aggregatefunction). -- `grouping` — Grouping key. Optional. The state of the function is reset if the `grouping` value is changed. It can be any of the [supported data types](../../sql-reference/data-types/index.md) for which the equality operator is defined. +- `agg_state` — State of the aggregate function. [AggregateFunction](../data-types/aggregatefunction.md#data-type-aggregatefunction). +- `grouping` — Grouping key. Optional. The state of the function is reset if the `grouping` value is changed. It can be any of the [supported data types](../data-types/index.md) for which the equality operator is defined. **Returned value** @@ -2485,7 +2485,7 @@ getSetting('custom_setting'); **Parameter** -- `custom_setting` — The setting name. [String](../../sql-reference/data-types/string.md). +- `custom_setting` — The setting name. [String](../data-types/string.md). **Returned value** @@ -2510,7 +2510,7 @@ Result: ## isDecimalOverflow -Checks whether the [Decimal](../../sql-reference/data-types/decimal.md) value is outside its precision or outside the specified precision. +Checks whether the [Decimal](../data-types/decimal.md) value is outside its precision or outside the specified precision. **Syntax** @@ -2520,8 +2520,8 @@ isDecimalOverflow(d, [p]) **Arguments** -- `d` — value. [Decimal](../../sql-reference/data-types/decimal.md). -- `p` — precision. Optional. If omitted, the initial precision of the first argument is used. This parameter can be helpful to migrate data from/to another database or file. [UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges). +- `d` — value. [Decimal](../data-types/decimal.md). +- `p` — precision. Optional. If omitted, the initial precision of the first argument is used. This parameter can be helpful to migrate data from/to another database or file. [UInt8](../data-types/int-uint.md#uint-ranges). **Returned values** @@ -2557,11 +2557,11 @@ countDigits(x) **Arguments** -- `x` — [Int](../../sql-reference/data-types/int-uint.md) or [Decimal](../../sql-reference/data-types/decimal.md) value. +- `x` — [Int](../data-types/int-uint.md) or [Decimal](../data-types/decimal.md) value. **Returned value** -- Number of digits. [UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges). +- Number of digits. [UInt8](../data-types/int-uint.md#uint-ranges). 
:::note For `Decimal` values takes into account their scales: calculates result over underlying integer type which is `(value * scale)`. For example: `countDigits(42) = 2`, `countDigits(42.000) = 5`, `countDigits(0.04200) = 4`. I.e. you may check decimal overflow for `Decimal64` with `countDecimal(x) > 18`. It's a slow variant of [isDecimalOverflow](#is-decimal-overflow). @@ -2585,7 +2585,7 @@ Result: ## errorCodeToName -- The textual name of an error code. [LowCardinality(String)](../../sql-reference/data-types/lowcardinality.md). +- The textual name of an error code. [LowCardinality(String)](../data-types/lowcardinality.md). **Syntax** @@ -2616,7 +2616,7 @@ tcpPort() **Returned value** -- The TCP port number. [UInt16](../../sql-reference/data-types/int-uint.md). +- The TCP port number. [UInt16](../data-types/int-uint.md). **Example** @@ -2652,7 +2652,7 @@ currentProfiles() **Returned value** -- List of the current user settings profiles. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +- List of the current user settings profiles. [Array](../data-types/array.md)([String](../data-types/string.md)). ## enabledProfiles @@ -2666,7 +2666,7 @@ enabledProfiles() **Returned value** -- List of the enabled settings profiles. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +- List of the enabled settings profiles. [Array](../data-types/array.md)([String](../data-types/string.md)). ## defaultProfiles @@ -2680,7 +2680,7 @@ defaultProfiles() **Returned value** -- List of the default settings profiles. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +- List of the default settings profiles. [Array](../data-types/array.md)([String](../data-types/string.md)). ## currentRoles @@ -2694,7 +2694,7 @@ currentRoles() **Returned value** -- A list of the current roles for the current user. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +- A list of the current roles for the current user. [Array](../data-types/array.md)([String](../data-types/string.md)). ## enabledRoles @@ -2708,7 +2708,7 @@ enabledRoles() **Returned value** -- List of the enabled roles for the current user. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +- List of the enabled roles for the current user. [Array](../data-types/array.md)([String](../data-types/string.md)). ## defaultRoles @@ -2722,7 +2722,7 @@ defaultRoles() **Returned value** -- List of the default roles for the current user. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +- List of the default roles for the current user. [Array](../data-types/array.md)([String](../data-types/string.md)). ## getServerPort @@ -2736,7 +2736,7 @@ getServerPort(port_name) **Arguments** -- `port_name` — The name of the server port. [String](../../sql-reference/data-types/string.md#string). Possible values: +- `port_name` — The name of the server port. [String](../data-types/string.md#string). Possible values: - 'tcp_port' - 'tcp_port_secure' @@ -2751,7 +2751,7 @@ getServerPort(port_name) **Returned value** -- The number of the server port. [UInt16](../../sql-reference/data-types/int-uint.md). +- The number of the server port. [UInt16](../data-types/int-uint.md). **Example** @@ -2783,7 +2783,7 @@ queryID() **Returned value** -- The ID of the current query. 
[String](../../sql-reference/data-types/string.md) +- The ID of the current query. [String](../data-types/string.md) **Example** @@ -2817,7 +2817,7 @@ initialQueryID() **Returned value** -- The ID of the initial current query. [String](../../sql-reference/data-types/string.md) +- The ID of the initial current query. [String](../data-types/string.md) **Example** @@ -2850,7 +2850,7 @@ shardNum() **Returned value** -- Shard index or constant `0`. [UInt32](../../sql-reference/data-types/int-uint.md). +- Shard index or constant `0`. [UInt32](../data-types/int-uint.md). **Example** @@ -2890,7 +2890,7 @@ shardCount() **Returned value** -- Total number of shards or `0`. [UInt32](../../sql-reference/data-types/int-uint.md). +- Total number of shards or `0`. [UInt32](../data-types/int-uint.md). **See Also** @@ -2912,7 +2912,7 @@ getOSKernelVersion() **Returned value** -- The current OS kernel version. [String](../../sql-reference/data-types/string.md). +- The current OS kernel version. [String](../data-types/string.md). **Example** @@ -2946,7 +2946,7 @@ zookeeperSessionUptime() **Returned value** -- Uptime of the current ZooKeeper session in seconds. [UInt32](../../sql-reference/data-types/int-uint.md). +- Uptime of the current ZooKeeper session in seconds. [UInt32](../data-types/int-uint.md). **Example** @@ -2983,7 +2983,7 @@ All arguments must be constant. **Returned value** -- Randomly generated table structure. [String](../../sql-reference/data-types/string.md). +- Randomly generated table structure. [String](../data-types/string.md). **Examples** @@ -3050,7 +3050,7 @@ structureToCapnProtoSchema(structure) **Returned value** -- CapnProto schema. [String](../../sql-reference/data-types/string.md). +- CapnProto schema. [String](../data-types/string.md). **Examples** @@ -3149,7 +3149,7 @@ structureToProtobufSchema(structure) **Returned value** -- Protobuf schema. [String](../../sql-reference/data-types/string.md). +- Protobuf schema. [String](../data-types/string.md). **Examples** @@ -3229,11 +3229,11 @@ formatQueryOrNull(query) **Arguments** -- `query` - The SQL query to be formatted. [String](../../sql-reference/data-types/string.md) +- `query` - The SQL query to be formatted. [String](../data-types/string.md) **Returned value** -- The formatted query. [String](../../sql-reference/data-types/string.md). +- The formatted query. [String](../data-types/string.md). **Example** @@ -3268,11 +3268,11 @@ formatQuerySingleLineOrNull(query) **Arguments** -- `query` - The SQL query to be formatted. [String](../../sql-reference/data-types/string.md) +- `query` - The SQL query to be formatted. [String](../data-types/string.md) **Returned value** -- The formatted query. [String](../../sql-reference/data-types/string.md). +- The formatted query. [String](../data-types/string.md). **Example** @@ -3300,8 +3300,8 @@ variantElement(variant, type_name, [, default_value]) **Arguments** -- `variant` — Variant column. [Variant](../../sql-reference/data-types/variant.md). -- `type_name` — The name of the variant type to extract. [String](../../sql-reference/data-types/string.md). +- `variant` — Variant column. [Variant](../data-types/variant.md). +- `type_name` — The name of the variant type to extract. [String](../data-types/string.md). - `default_value` - The default value that will be used if variant doesn't have variant with specified type. Can be any type. Optional. **Returned value** @@ -3337,7 +3337,7 @@ variantType(variant) **Arguments** -- `variant` — Variant column. 
[Variant](../../sql-reference/data-types/variant.md). +- `variant` — Variant column. [Variant](../data-types/variant.md). **Returned value** @@ -3553,7 +3553,7 @@ showCertificate() **Returned value** -- Map of key-value pairs relating to the configured SSL certificate. [Map](../../sql-reference/data-types/map.md)([String](../../sql-reference/data-types/string.md), [String](../../sql-reference/data-types/string.md)). +- Map of key-value pairs relating to the configured SSL certificate. [Map](../data-types/map.md)([String](../data-types/string.md), [String](../data-types/string.md)). **Example** diff --git a/docs/en/sql-reference/functions/random-functions.md b/docs/en/sql-reference/functions/random-functions.md index a7866c6d12e..a9b483aa0e5 100644 --- a/docs/en/sql-reference/functions/random-functions.md +++ b/docs/en/sql-reference/functions/random-functions.md @@ -169,7 +169,7 @@ randUniform(min, max) ### Returned value -A random number of type [Float64](/docs/en/sql-reference/data-types/float.md). +A random number of type [Float64](../data-types/float.md). ### Example @@ -204,7 +204,7 @@ randNormal(mean, variance) **Returned value** -- Random number. [Float64](/docs/en/sql-reference/data-types/float.md). +- Random number. [Float64](../data-types/float.md). **Example** @@ -241,7 +241,7 @@ randLogNormal(mean, variance) **Returned value** -- Random number. [Float64](/docs/en/sql-reference/data-types/float.md). +- Random number. [Float64](../data-types/float.md). **Example** @@ -278,7 +278,7 @@ randBinomial(experiments, probability) **Returned value** -- Random number. [UInt64](/docs/en/sql-reference/data-types/int-uint.md). +- Random number. [UInt64](../data-types/int-uint.md). **Example** @@ -315,7 +315,7 @@ randNegativeBinomial(experiments, probability) **Returned value** -- Random number. [UInt64](/docs/en/sql-reference/data-types/int-uint.md). +- Random number. [UInt64](../data-types/int-uint.md). **Example** @@ -351,7 +351,7 @@ randPoisson(n) **Returned value** -- Random number. [UInt64](/docs/en/sql-reference/data-types/int-uint.md). +- Random number. [UInt64](../data-types/int-uint.md). **Example** @@ -387,7 +387,7 @@ randBernoulli(probability) **Returned value** -- Random number. [UInt64](/docs/en/sql-reference/data-types/int-uint.md). +- Random number. [UInt64](../data-types/int-uint.md). **Example** @@ -423,7 +423,7 @@ randExponential(lambda) **Returned value** -- Random number. [Float64](/docs/en/sql-reference/data-types/float.md). +- Random number. [Float64](../data-types/float.md). **Example** @@ -459,7 +459,7 @@ randChiSquared(degree_of_freedom) **Returned value** -- Random number. [Float64](/docs/en/sql-reference/data-types/float.md). +- Random number. [Float64](../data-types/float.md). **Example** @@ -495,7 +495,7 @@ randStudentT(degree_of_freedom) **Returned value** -- Random number. [Float64](/docs/en/sql-reference/data-types/float.md). +- Random number. [Float64](../data-types/float.md). **Example** @@ -532,7 +532,7 @@ randFisherF(d1, d2) **Returned value** -- Random number. [Float64](/docs/en/sql-reference/data-types/float.md). +- Random number. [Float64](../data-types/float.md). **Example** @@ -568,7 +568,7 @@ randomString(length) **Returned value** -- String filled with random bytes. [String](../../sql-reference/data-types/string.md). +- String filled with random bytes. [String](../data-types/string.md). **Example** @@ -604,11 +604,11 @@ randomFixedString(length); **Arguments** -- `length` — String length in bytes. [UInt64](../../sql-reference/data-types/int-uint.md). 
+- `length` — String length in bytes. [UInt64](../data-types/int-uint.md). **Returned value(s)** -- String filled with random bytes. [FixedString](../../sql-reference/data-types/fixedstring.md). +- String filled with random bytes. [FixedString](../data-types/fixedstring.md). **Example** @@ -643,7 +643,7 @@ randomPrintableASCII(length) **Returned value** -- String with a random set of [ASCII](https://en.wikipedia.org/wiki/ASCII#Printable_characters) printable characters. [String](../../sql-reference/data-types/string.md) +- String with a random set of [ASCII](https://en.wikipedia.org/wiki/ASCII#Printable_characters) printable characters. [String](../data-types/string.md) **Example** @@ -671,11 +671,11 @@ randomStringUTF8(length); **Arguments** -- `length` — Length of the string in code points. [UInt64](../../sql-reference/data-types/int-uint.md). +- `length` — Length of the string in code points. [UInt64](../data-types/int-uint.md). **Returned value(s)** -- UTF-8 random string. [String](../../sql-reference/data-types/string.md). +- UTF-8 random string. [String](../data-types/string.md). **Example** diff --git a/docs/en/sql-reference/functions/rounding-functions.md b/docs/en/sql-reference/functions/rounding-functions.md index 20f73de4410..ab344f664fd 100644 --- a/docs/en/sql-reference/functions/rounding-functions.md +++ b/docs/en/sql-reference/functions/rounding-functions.md @@ -36,8 +36,8 @@ Alias: `truncate`. **Parameters** -- `input`: A numeric type ([Float](/docs/en/sql-reference/data-types/float.md), [Decimal](/docs/en/sql-reference/data-types/decimal.md) or [Integer](/docs/en/sql-reference/data-types/int-uint.md)). -- `precision`: An [Integer](/docs/en/sql-reference/data-types/int-uint.md) type. +- `input`: A numeric type ([Float](../data-types/float.md), [Decimal](../data-types/decimal.md) or [Integer](../data-types/int-uint.md)). +- `precision`: An [Integer](../data-types/int-uint.md) type. **Returned value** @@ -69,7 +69,7 @@ round(expression [, decimal_places]) **Arguments** -- `expression` — A number to be rounded. Can be any [expression](../../sql-reference/syntax.md#syntax-expressions) returning the numeric [data type](../../sql-reference/data-types/index.md#data_types). +- `expression` — A number to be rounded. Can be any [expression](../../sql-reference/syntax.md#syntax-expressions) returning the numeric [data type](../data-types/index.md#data_types). - `decimal-places` — An integer value. - If `decimal-places > 0` then the function rounds the value to the right of the decimal point. - If `decimal-places < 0` then the function rounds the value to the left of the decimal point. @@ -171,7 +171,7 @@ roundBankers(expression [, decimal_places]) **Arguments** -- `expression` — A number to be rounded. Can be any [expression](../../sql-reference/syntax.md#syntax-expressions) returning the numeric [data type](../../sql-reference/data-types/index.md#data_types). +- `expression` — A number to be rounded. Can be any [expression](../../sql-reference/syntax.md#syntax-expressions) returning the numeric [data type](../data-types/index.md#data_types). - `decimal-places` — Decimal places. An integer number. - `decimal-places > 0` — The function rounds the number to the given position right of the decimal point. Example: `roundBankers(3.55, 1) = 3.6`. - `decimal-places < 0` — The function rounds the number to the given position left of the decimal point. Example: `roundBankers(24.55, -1) = 20`. 
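For illustration, the `decimal-places` behaviour of `roundBankers` described in the hunk above can be sanity-checked with a short query. This is a sketch only; the expected results are the ones the documentation itself quotes (`roundBankers(3.55, 1) = 3.6`, `roundBankers(24.55, -1) = 20`):

```sql
-- Positive decimal-places rounds to the right of the decimal point,
-- negative decimal-places rounds to the left of it.
-- Expected output, per the documentation's own examples: 3.6 and 20.
SELECT
    roundBankers(3.55, 1)   AS rounded_right,
    roundBankers(24.55, -1) AS rounded_left;
```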
diff --git a/docs/en/sql-reference/functions/splitting-merging-functions.md b/docs/en/sql-reference/functions/splitting-merging-functions.md index 8aa171949a3..9ec4ee974c4 100644 --- a/docs/en/sql-reference/functions/splitting-merging-functions.md +++ b/docs/en/sql-reference/functions/splitting-merging-functions.md @@ -19,13 +19,13 @@ splitByChar(separator, s[, max_substrings])) **Arguments** -- `separator` — The separator which should contain exactly one character. [String](../../sql-reference/data-types/string.md). -- `s` — The string to split. [String](../../sql-reference/data-types/string.md). +- `separator` — The separator which should contain exactly one character. [String](../data-types/string.md). +- `s` — The string to split. [String](../data-types/string.md). - `max_substrings` — An optional `Int64` defaulting to 0. If `max_substrings` > 0, the returned array will contain at most `max_substrings` substrings, otherwise the function will return as many substrings as possible. **Returned value(s)** -- An array of selected substrings. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +- An array of selected substrings. [Array](../data-types/array.md)([String](../data-types/string.md)). :::note Empty substrings may be selected when: @@ -72,13 +72,13 @@ splitByString(separator, s[, max_substrings])) **Arguments** -- `separator` — The separator. [String](../../sql-reference/data-types/string.md). -- `s` — The string to split. [String](../../sql-reference/data-types/string.md). +- `separator` — The separator. [String](../data-types/string.md). +- `s` — The string to split. [String](../data-types/string.md). - `max_substrings` — An optional `Int64` defaulting to 0. When `max_substrings` > 0, the returned substrings will be no more than `max_substrings`, otherwise the function will return as many substrings as possible. **Returned value(s)** -- An array of selected substrings. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +- An array of selected substrings. [Array](../data-types/array.md)([String](../data-types/string.md)). :::note Empty substrings may be selected when: @@ -129,13 +129,13 @@ splitByRegexp(regexp, s[, max_substrings])) **Arguments** - `regexp` — Regular expression. Constant. [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). -- `s` — The string to split. [String](../../sql-reference/data-types/string.md). +- `s` — The string to split. [String](../data-types/string.md). - `max_substrings` — An optional `Int64` defaulting to 0. When `max_substrings` > 0, the returned substrings will be no more than `max_substrings`, otherwise the function will return as many substrings as possible. **Returned value(s)** -- An array of selected substrings. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +- An array of selected substrings. [Array](../data-types/array.md)([String](../data-types/string.md)). :::note Empty substrings may be selected when: @@ -186,13 +186,13 @@ splitByWhitespace(s[, max_substrings])) **Arguments** -- `s` — The string to split. [String](../../sql-reference/data-types/string.md). +- `s` — The string to split. [String](../data-types/string.md). - `max_substrings` — An optional `Int64` defaulting to 0. When `max_substrings` > 0, the returned substrings will be no more than `max_substrings`, otherwise the function will return as many substrings as possible. 
**Returned value(s)** -- An array of selected substrings. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +- An array of selected substrings. [Array](../data-types/array.md)([String](../data-types/string.md)). :::note Setting [splitby_max_substrings_includes_remaining_string](../../operations/settings/settings.md#splitby_max_substrings_includes_remaining_string) (default: 0) controls if the remaining string is included in the last element of the result array when argument `max_substrings` > 0. @@ -225,13 +225,13 @@ splitByNonAlpha(s[, max_substrings])) **Arguments** -- `s` — The string to split. [String](../../sql-reference/data-types/string.md). +- `s` — The string to split. [String](../data-types/string.md). - `max_substrings` — An optional `Int64` defaulting to 0. When `max_substrings` > 0, the returned substrings will be no more than `max_substrings`, otherwise the function will return as many substrings as possible. **Returned value(s)** -- An array of selected substrings. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +- An array of selected substrings. [Array](../data-types/array.md)([String](../data-types/string.md)). :::note Setting [splitby_max_substrings_includes_remaining_string](../../operations/settings/settings.md#splitby_max_substrings_includes_remaining_string) (default: 0) controls if the remaining string is included in the last element of the result array when argument `max_substrings` > 0. @@ -288,12 +288,12 @@ Alias: `splitByAlpha` **Arguments** -- `s` — The string to split. [String](../../sql-reference/data-types/string.md). +- `s` — The string to split. [String](../data-types/string.md). - `max_substrings` — An optional `Int64` defaulting to 0. When `max_substrings` > 0, the returned substrings will be no more than `max_substrings`, otherwise the function will return as many substrings as possible. **Returned value(s)** -- An array of selected substrings. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +- An array of selected substrings. [Array](../data-types/array.md)([String](../data-types/string.md)). :::note Setting [splitby_max_substrings_includes_remaining_string](../../operations/settings/settings.md#splitby_max_substrings_includes_remaining_string) (default: 0) controls if the remaining string is included in the last element of the result array when argument `max_substrings` > 0. @@ -357,12 +357,12 @@ ngrams(string, ngramsize) **Arguments** -- `string` — String. [String](../../sql-reference/data-types/string.md) or [FixedString](../../sql-reference/data-types/fixedstring.md). -- `ngramsize` — The size of an n-gram. [UInt](../../sql-reference/data-types/int-uint.md). +- `string` — String. [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). +- `ngramsize` — The size of an n-gram. [UInt](../data-types/int-uint.md). **Returned values** -- Array with n-grams. [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). +- Array with n-grams. [Array](../data-types/array.md)([String](../data-types/string.md)). **Example** @@ -384,7 +384,7 @@ Splits a string into tokens using non-alphanumeric ASCII characters as separator **Arguments** -- `input_string` — Any set of bytes represented as the [String](../../sql-reference/data-types/string.md) data type object. 
+- `input_string` — Any set of bytes represented as the [String](../data-types/string.md) data type object. **Returned value** diff --git a/docs/en/sql-reference/functions/string-functions.md b/docs/en/sql-reference/functions/string-functions.md index f45ceb99617..342ca2b9f03 100644 --- a/docs/en/sql-reference/functions/string-functions.md +++ b/docs/en/sql-reference/functions/string-functions.md @@ -183,7 +183,7 @@ left(s, offset) **Parameters** -- `s`: The string to calculate a substring from. [String](../../sql-reference/data-types/string.md) or [FixedString](../../sql-reference/data-types/fixedstring.md). +- `s`: The string to calculate a substring from. [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). - `offset`: The number of bytes of the offset. [UInt*](../data-types/int-uint). **Returned value** @@ -230,7 +230,7 @@ leftUTF8(s, offset) **Parameters** -- `s`: The UTF-8 encoded string to calculate a substring from. [String](../../sql-reference/data-types/string.md) or [FixedString](../../sql-reference/data-types/fixedstring.md). +- `s`: The UTF-8 encoded string to calculate a substring from. [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). - `offset`: The number of bytes of the offset. [UInt*](../data-types/int-uint). **Returned value** @@ -347,7 +347,7 @@ right(s, offset) **Parameters** -- `s`: The string to calculate a substring from. [String](../../sql-reference/data-types/string.md) or [FixedString](../../sql-reference/data-types/fixedstring.md). +- `s`: The string to calculate a substring from. [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). - `offset`: The number of bytes of the offset. [UInt*](../data-types/int-uint). **Returned value** @@ -394,7 +394,7 @@ rightUTF8(s, offset) **Parameters** -- `s`: The UTF-8 encoded string to calculate a substring from. [String](../../sql-reference/data-types/string.md) or [FixedString](../../sql-reference/data-types/fixedstring.md). +- `s`: The UTF-8 encoded string to calculate a substring from. [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). - `offset`: The number of bytes of the offset. [UInt*](../data-types/int-uint). **Returned value** @@ -513,11 +513,11 @@ Alias: `lcase` **Parameters** -- `input`: A string type [String](/docs/en/sql-reference/data-types/string.md). +- `input`: A string type [String](../data-types/string.md). **Returned value** -- A [String](/docs/en/sql-reference/data-types/string.md) data type value. +- A [String](../data-types/string.md) data type value. **Example** @@ -547,11 +547,11 @@ Alias: `ucase` **Parameters** -- `input`: A string type [String](/docs/en/sql-reference/data-types/string.md). +- `input`: A string type [String](../data-types/string.md). **Returned value** -- A [String](/docs/en/sql-reference/data-types/string.md) data type value. +- A [String](../data-types/string.md) data type value. **Examples** @@ -591,11 +591,11 @@ upperUTF8(input) **Parameters** -- `input`: A string type [String](/docs/en/sql-reference/data-types/string.md). +- `input`: A string type [String](../data-types/string.md). **Returned value** -- A [String](/docs/en/sql-reference/data-types/string.md) data type value. +- A [String](../data-types/string.md) data type value. **Example** @@ -627,7 +627,7 @@ toValidUTF8(input_string) **Arguments** -- `input_string` — Any set of bytes represented as the [String](../../sql-reference/data-types/string.md) data type object. 
+- `input_string` — Any set of bytes represented as the [String](../data-types/string.md) data type object. **Returned value** @@ -659,8 +659,8 @@ Alias: `REPEAT` **Arguments** -- `s` — The string to repeat. [String](../../sql-reference/data-types/string.md). -- `n` — The number of times to repeat the string. [UInt* or Int*](../../sql-reference/data-types/int-uint.md). +- `s` — The string to repeat. [String](../data-types/string.md). +- `n` — The number of times to repeat the string. [UInt* or Int*](../data-types/int-uint.md). **Returned value** @@ -694,7 +694,7 @@ Alias: `SPACE`. **Arguments** -- `n` — The number of times to repeat the space. [UInt* or Int*](../../sql-reference/data-types/int-uint.md). +- `n` — The number of times to repeat the space. [UInt* or Int*](../data-types/int-uint.md). **Returned value** @@ -738,7 +738,7 @@ concat(s1, s2, ...) At least one value of arbitrary type. -Arguments which are not of types [String](../../sql-reference/data-types/string.md) or [FixedString](../../sql-reference/data-types/fixedstring.md) are converted to strings using their default serialization. As this decreases performance, it is not recommended to use non-String/FixedString arguments. +Arguments which are not of types [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md) are converted to strings using their default serialization. As this decreases performance, it is not recommended to use non-String/FixedString arguments. **Returned values** @@ -845,8 +845,8 @@ Alias: `concat_ws` **Arguments** -- sep — separator. Const [String](../../sql-reference/data-types/string.md) or [FixedString](../../sql-reference/data-types/fixedstring.md). -- exprN — expression to be concatenated. Arguments which are not of types [String](../../sql-reference/data-types/string.md) or [FixedString](../../sql-reference/data-types/fixedstring.md) are converted to strings using their default serialization. As this decreases performance, it is not recommended to use non-String/FixedString arguments. +- sep — separator. Const [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). +- exprN — expression to be concatenated. Arguments which are not of types [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md) are converted to strings using their default serialization. As this decreases performance, it is not recommended to use non-String/FixedString arguments. **Returned values** @@ -891,9 +891,9 @@ Alias: **Arguments** -- `s` — The string to calculate a substring from. [String](../../sql-reference/data-types/string.md), [FixedString](../../sql-reference/data-types/fixedstring.md) or [Enum](../../sql-reference/data-types/enum.md) -- `offset` — The starting position of the substring in `s` . [(U)Int*](../../sql-reference/data-types/int-uint.md). -- `length` — The maximum length of the substring. [(U)Int*](../../sql-reference/data-types/int-uint.md). Optional. +- `s` — The string to calculate a substring from. [String](../data-types/string.md), [FixedString](../data-types/fixedstring.md) or [Enum](../data-types/enum.md) +- `offset` — The starting position of the substring in `s` . [(U)Int*](../data-types/int-uint.md). +- `length` — The maximum length of the substring. [(U)Int*](../data-types/int-uint.md). Optional. **Returned value** @@ -927,9 +927,9 @@ substringUTF8(s, offset[, length]) **Arguments** -- `s`: The string to calculate a substring from. 
[String](../../sql-reference/data-types/string.md), [FixedString](../../sql-reference/data-types/fixedstring.md) or [Enum](../../sql-reference/data-types/enum.md) -- `offset`: The starting position of the substring in `s` . [(U)Int*](../../sql-reference/data-types/int-uint.md). -- `length`: The maximum length of the substring. [(U)Int*](../../sql-reference/data-types/int-uint.md). Optional. +- `s`: The string to calculate a substring from. [String](../data-types/string.md), [FixedString](../data-types/fixedstring.md) or [Enum](../data-types/enum.md) +- `offset`: The starting position of the substring in `s` . [(U)Int*](../data-types/int-uint.md). +- `length`: The maximum length of the substring. [(U)Int*](../data-types/int-uint.md). Optional. **Returned value** @@ -965,8 +965,8 @@ Alias: `SUBSTRING_INDEX` **Arguments** -- s: The string to extract substring from. [String](../../sql-reference/data-types/string.md). -- delim: The character to split. [String](../../sql-reference/data-types/string.md). +- s: The string to extract substring from. [String](../data-types/string.md). +- delim: The character to split. [String](../data-types/string.md). - count: The number of occurrences of the delimiter to count before extracting the substring. If count is positive, everything to the left of the final delimiter (counting from the left) is returned. If count is negative, everything to the right of the final delimiter (counting from the right) is returned. [UInt or Int](../data-types/int-uint.md) **Example** @@ -996,13 +996,13 @@ substringIndexUTF8(s, delim, count) **Arguments** -- `s`: The string to extract substring from. [String](../../sql-reference/data-types/string.md). -- `delim`: The character to split. [String](../../sql-reference/data-types/string.md). +- `s`: The string to extract substring from. [String](../data-types/string.md). +- `delim`: The character to split. [String](../data-types/string.md). - `count`: The number of occurrences of the delimiter to count before extracting the substring. If count is positive, everything to the left of the final delimiter (counting from the left) is returned. If count is negative, everything to the right of the final delimiter (counting from the right) is returned. [UInt or Int](../data-types/int-uint.md) **Returned value** -A substring [String](../../sql-reference/data-types/string.md) of `s` before `count` occurrences of `delim`. +A substring [String](../data-types/string.md) of `s` before `count` occurrences of `delim`. **Implementation details** @@ -1050,11 +1050,11 @@ base58Encode(plaintext) **Arguments** -- `plaintext` — [String](../../sql-reference/data-types/string.md) column or constant. +- `plaintext` — [String](../data-types/string.md) column or constant. **Returned value** -- A string containing the encoded value of the argument. [String](../../sql-reference/data-types/string.md). +- A string containing the encoded value of the argument. [String](../data-types/string.md). **Example** @@ -1082,7 +1082,7 @@ base58Decode(encoded) **Arguments** -- `encoded` — [String](../../sql-reference/data-types/string.md) column or constant. If the string is not a valid Base58-encoded value, an exception is thrown. +- `encoded` — [String](../data-types/string.md) column or constant. If the string is not a valid Base58-encoded value, an exception is thrown. **Returned value** @@ -1114,7 +1114,7 @@ tryBase58Decode(encoded) **Parameters** -- `encoded`: [String](../../sql-reference/data-types/string.md) column or constant. 
If the string is not a valid Base58-encoded value, returns an empty string in case of error. +- `encoded`: [String](../data-types/string.md) column or constant. If the string is not a valid Base58-encoded value, returns an empty string in case of error. **Returned value** @@ -1158,7 +1158,7 @@ tryBase64Decode(encoded) **Parameters** -- `encoded`: [String](../../sql-reference/data-types/string.md) column or constant. If the string is not a valid Base58-encoded value, returns an empty string in case of error. +- `encoded`: [String](../data-types/string.md) column or constant. If the string is not a valid Base58-encoded value, returns an empty string in case of error. **Examples** @@ -1257,8 +1257,8 @@ trim([[LEADING|TRAILING|BOTH] trim_character FROM] input_string) **Arguments** -- `trim_character` — Specified characters for trim. [String](../../sql-reference/data-types/string.md). -- `input_string` — String for trim. [String](../../sql-reference/data-types/string.md). +- `trim_character` — Specified characters for trim. [String](../data-types/string.md). +- `input_string` — String for trim. [String](../data-types/string.md). **Returned value** @@ -1292,7 +1292,7 @@ Alias: `ltrim(input_string)`. **Arguments** -- `input_string` — string to trim. [String](../../sql-reference/data-types/string.md). +- `input_string` — string to trim. [String](../data-types/string.md). **Returned value** @@ -1326,7 +1326,7 @@ Alias: `rtrim(input_string)`. **Arguments** -- `input_string` — string to trim. [String](../../sql-reference/data-types/string.md). +- `input_string` — string to trim. [String](../data-types/string.md). **Returned value** @@ -1360,7 +1360,7 @@ Alias: `trim(input_string)`. **Arguments** -- `input_string` — string to trim. [String](../../sql-reference/data-types/string.md). +- `input_string` — string to trim. [String](../data-types/string.md). **Returned value** @@ -1410,11 +1410,11 @@ normalizeQuery(x) **Arguments** -- `x` — Sequence of characters. [String](../../sql-reference/data-types/string.md). +- `x` — Sequence of characters. [String](../data-types/string.md). **Returned value** -- Sequence of characters with placeholders. [String](../../sql-reference/data-types/string.md). +- Sequence of characters with placeholders. [String](../data-types/string.md). **Example** @@ -1442,11 +1442,11 @@ normalizedQueryHash(x) **Arguments** -- `x` — Sequence of characters. [String](../../sql-reference/data-types/string.md). +- `x` — Sequence of characters. [String](../data-types/string.md). **Returned value** -- Hash value. [UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges). +- Hash value. [UInt64](../data-types/int-uint.md#uint-ranges). **Example** @@ -1474,11 +1474,11 @@ normalizeUTF8NFC(words) **Arguments** -- `words` — UTF8-encoded input string. [String](../../sql-reference/data-types/string.md). +- `words` — UTF8-encoded input string. [String](../data-types/string.md). **Returned value** -- String transformed to NFC normalization form. [String](../../sql-reference/data-types/string.md). +- String transformed to NFC normalization form. [String](../data-types/string.md). **Example** @@ -1506,11 +1506,11 @@ normalizeUTF8NFD(words) **Arguments** -- `words` — UTF8-encoded input string. [String](../../sql-reference/data-types/string.md). +- `words` — UTF8-encoded input string. [String](../data-types/string.md). **Returned value** -- String transformed to NFD normalization form. [String](../../sql-reference/data-types/string.md). +- String transformed to NFD normalization form. 
[String](../data-types/string.md). **Example** @@ -1538,11 +1538,11 @@ normalizeUTF8NFKC(words) **Arguments** -- `words` — UTF8-encoded input string. [String](../../sql-reference/data-types/string.md). +- `words` — UTF8-encoded input string. [String](../data-types/string.md). **Returned value** -- String transformed to NFKC normalization form. [String](../../sql-reference/data-types/string.md). +- String transformed to NFKC normalization form. [String](../data-types/string.md). **Example** @@ -1570,11 +1570,11 @@ normalizeUTF8NFKD(words) **Arguments** -- `words` — UTF8-encoded input string. [String](../../sql-reference/data-types/string.md). +- `words` — UTF8-encoded input string. [String](../data-types/string.md). **Returned value** -- String transformed to NFKD normalization form. [String](../../sql-reference/data-types/string.md). +- String transformed to NFKD normalization form. [String](../data-types/string.md). **Example** @@ -1605,11 +1605,11 @@ encodeXMLComponent(x) **Arguments** -- `x` — An input string. [String](../../sql-reference/data-types/string.md). +- `x` — An input string. [String](../data-types/string.md). **Returned value** -- The escaped string. [String](../../sql-reference/data-types/string.md). +- The escaped string. [String](../data-types/string.md). **Example** @@ -1643,11 +1643,11 @@ decodeXMLComponent(x) **Arguments** -- `x` — An input string. [String](../../sql-reference/data-types/string.md). +- `x` — An input string. [String](../data-types/string.md). **Returned value** -- The un-escaped string. [String](../../sql-reference/data-types/string.md). +- The un-escaped string. [String](../data-types/string.md). **Example** @@ -1677,11 +1677,11 @@ decodeHTMLComponent(x) **Arguments** -- `x` — An input string. [String](../../sql-reference/data-types/string.md). +- `x` — An input string. [String](../data-types/string.md). **Returned value** -- The un-escaped string. [String](../../sql-reference/data-types/string.md). +- The un-escaped string. [String](../data-types/string.md). **Example** @@ -1730,11 +1730,11 @@ extractTextFromHTML(x) **Arguments** -- `x` — input text. [String](../../sql-reference/data-types/string.md). +- `x` — input text. [String](../data-types/string.md). **Returned value** -- Extracted text. [String](../../sql-reference/data-types/string.md). +- Extracted text. [String](../data-types/string.md). **Example** diff --git a/docs/en/sql-reference/functions/string-replace-functions.md b/docs/en/sql-reference/functions/string-replace-functions.md index 0e183626555..7aeb1f5b2a7 100644 --- a/docs/en/sql-reference/functions/string-replace-functions.md +++ b/docs/en/sql-reference/functions/string-replace-functions.md @@ -202,13 +202,13 @@ translateUTF8(s, from, to) **Parameters** -- `s`: A string type [String](/docs/en/sql-reference/data-types/string.md). -- `from`: A string type [String](/docs/en/sql-reference/data-types/string.md). -- `to`: A string type [String](/docs/en/sql-reference/data-types/string.md). +- `s`: A string type [String](../data-types/string.md). +- `from`: A string type [String](../data-types/string.md). +- `to`: A string type [String](../data-types/string.md). **Returned value** -- A [String](/docs/en/sql-reference/data-types/string.md) data type value. +- A [String](../data-types/string.md) data type value. 
**Examples**

diff --git a/docs/en/sql-reference/functions/string-search-functions.md b/docs/en/sql-reference/functions/string-search-functions.md
index 43b9e621bc0..07f776906e6 100644
--- a/docs/en/sql-reference/functions/string-search-functions.md
+++ b/docs/en/sql-reference/functions/string-search-functions.md
@@ -17,7 +17,7 @@ Functions in this section also assume that the searched string (referred to in t
violated, no exception is thrown and results are undefined. Search with UTF-8 encoded strings is usually provided by separate function
variants. Likewise, if a UTF-8 function variant is used and the input strings are not UTF-8 encoded text, no exception is thrown and the
results are undefined. Note that no automatic Unicode normalization is performed, however you can use the
-[normalizeUTF8*()](https://clickhouse.com/docs/en/sql-reference/functions/string-functions/) functions for that.
+[normalizeUTF8*()](string-functions.md) functions for that.

[General strings functions](string-functions.md) and [functions for replacing in strings](string-replace-functions.md) are described separately.

@@ -38,12 +38,12 @@ Alias:

- `haystack` — String in which the search is performed. [String](../../sql-reference/syntax.md#syntax-string-literal).
- `needle` — Substring to be searched. [String](../../sql-reference/syntax.md#syntax-string-literal).
-- `start_pos` – Position (1-based) in `haystack` at which the search starts. [UInt](../../sql-reference/data-types/int-uint.md). Optional.
+- `start_pos` – Position (1-based) in `haystack` at which the search starts. [UInt](../data-types/int-uint.md). Optional.

**Returned values**

-- Starting position in bytes and counting from 1, if the substring was found. [UInt64](../../sql-reference/data-types/int-uint.md).
-- 0, if the substring was not found. [UInt64](../../sql-reference/data-types/int-uint.md).
+- Starting position in bytes and counting from 1, if the substring was found. [UInt64](../data-types/int-uint.md).
+- 0, if the substring was not found. [UInt64](../data-types/int-uint.md).

If substring `needle` is empty, these rules apply:
- if no `start_pos` was specified: return `1`
@@ -204,7 +204,7 @@ multiSearchAllPositions(haystack, [needle1, needle2, ..., needleN])
**Arguments**

- `haystack` — String in which the search is performed. [String](../../sql-reference/syntax.md#syntax-string-literal).
-- `needle` — Substrings to be searched. [Array](../../sql-reference/data-types/array.md).
+- `needle` — Substrings to be searched. [Array](../data-types/array.md).

**Returned values**

@@ -239,7 +239,7 @@ multiSearchAllPositionsCaseInsensitive(haystack, [needle1, needle2, ..., needleN
**Parameters**

- `haystack` — String in which the search is performed. [String](../../sql-reference/syntax.md#syntax-string-literal).
-- `needle` — Substrings to be searched. [Array](../../sql-reference/data-types/array.md).
+- `needle` — Substrings to be searched. [Array](../data-types/array.md).

**Returned value**

@@ -273,7 +273,7 @@ multiSearchAllPositionsUTF8(haystack, [needle1, needle2, ..., needleN])
**Parameters**

- `haystack` — UTF-8 encoded string in which the search is performed. [String](../../sql-reference/syntax.md#syntax-string-literal).
-- `needle` — UTF-8 encoded substrings to be searched. [Array](../../sql-reference/data-types/array.md).
+- `needle` — UTF-8 encoded substrings to be searched. [Array](../data-types/array.md).
**Returned value** @@ -309,7 +309,7 @@ multiSearchAllPositionsCaseInsensitiveUTF8(haystack, [needle1, needle2, ..., nee **Parameters** - `haystack` — UTF-8 encoded string in which the search is performed. [String](../../sql-reference/syntax.md#syntax-string-literal). -- `needle` — UTF-8 encoded substrings to be searched. [Array](../../sql-reference/data-types/array.md). +- `needle` — UTF-8 encoded substrings to be searched. [Array](../data-types/array.md). **Returned value** @@ -347,7 +347,7 @@ multiSearchFirstPosition(haystack, [needle1, needle2, ..., needleN]) **Parameters** - `haystack` — String in which the search is performed. [String](../../sql-reference/syntax.md#syntax-string-literal). -- `needle` — Substrings to be searched. [Array](../../sql-reference/data-types/array.md). +- `needle` — Substrings to be searched. [Array](../data-types/array.md). **Returned value** @@ -381,7 +381,7 @@ multiSearchFirstPositionCaseInsensitive(haystack, [needle1, needle2, ..., needle **Parameters** - `haystack` — String in which the search is performed. [String](../../sql-reference/syntax.md#syntax-string-literal). -- `needle` — Array of substrings to be searched. [Array](../../sql-reference/data-types/array.md). +- `needle` — Array of substrings to be searched. [Array](../data-types/array.md). **Returned value** @@ -415,7 +415,7 @@ multiSearchFirstPositionUTF8(haystack, [needle1, needle2, ..., needleN]) **Parameters** - `haystack` — UTF-8 string in which the search is performed. [String](../../sql-reference/syntax.md#syntax-string-literal). -- `needle` — Array of UTF-8 substrings to be searched. [Array](../../sql-reference/data-types/array.md). +- `needle` — Array of UTF-8 substrings to be searched. [Array](../data-types/array.md). **Returned value** @@ -451,7 +451,7 @@ multiSearchFirstPositionCaseInsensitiveUTF8(haystack, [needle1, needle2, ..., ne **Parameters** - `haystack` — UTF-8 string in which the search is performed. [String](../../sql-reference/syntax.md#syntax-string-literal). -- `needle` — Array of UTF-8 substrings to be searched. [Array](../../sql-reference/data-types/array.md) +- `needle` — Array of UTF-8 substrings to be searched. [Array](../data-types/array.md) **Returned value** @@ -488,7 +488,7 @@ multiSearchFirstIndex(haystack, [needle1, needle2, ..., needleN]) **Parameters** - `haystack` — String in which the search is performed. [String](../../sql-reference/syntax.md#syntax-string-literal). -- `needle` — Substrings to be searched. [Array](../../sql-reference/data-types/array.md). +- `needle` — Substrings to be searched. [Array](../data-types/array.md). **Returned value** @@ -522,7 +522,7 @@ multiSearchFirstIndexCaseInsensitive(haystack, [needle1, needle2, ..., needleN]) **Parameters** - `haystack` — String in which the search is performed. [String](../../sql-reference/syntax.md#syntax-string-literal). -- `needle` — Substrings to be searched. [Array](../../sql-reference/data-types/array.md). +- `needle` — Substrings to be searched. [Array](../data-types/array.md). **Returned value** @@ -556,7 +556,7 @@ multiSearchFirstIndexUTF8(haystack, [needle1, needle2, ..., needleN]) **Parameters** - `haystack` — UTF-8 string in which the search is performed. [String](../../sql-reference/syntax.md#syntax-string-literal). -- `needle` — Array of UTF-8 substrings to be searched. [Array](../../sql-reference/data-types/array.md) +- `needle` — Array of UTF-8 substrings to be searched. 
[Array](../data-types/array.md) **Returned value** @@ -592,7 +592,7 @@ multiSearchFirstIndexCaseInsensitiveUTF8(haystack, [needle1, needle2, ..., needl **Parameters** - `haystack` — UTF-8 string in which the search is performed. [String](../../sql-reference/syntax.md#syntax-string-literal). -- `needle` — Array of UTF-8 substrings to be searched. [Array](../../sql-reference/data-types/array.md). +- `needle` — Array of UTF-8 substrings to be searched. [Array](../data-types/array.md). **Returned value** @@ -630,7 +630,7 @@ multiSearchAny(haystack, [needle1, needle2, ..., needleN]) **Parameters** - `haystack` — String in which the search is performed. [String](../../sql-reference/syntax.md#syntax-string-literal). -- `needle` — Substrings to be searched. [Array](../../sql-reference/data-types/array.md). +- `needle` — Substrings to be searched. [Array](../data-types/array.md). **Returned value** @@ -664,7 +664,7 @@ multiSearchAnyCaseInsensitive(haystack, [needle1, needle2, ..., needleN]) **Parameters** - `haystack` — String in which the search is performed. [String](../../sql-reference/syntax.md#syntax-string-literal). -- `needle` — Substrings to be searched. [Array](../../sql-reference/data-types/array.md) +- `needle` — Substrings to be searched. [Array](../data-types/array.md) **Returned value** @@ -698,7 +698,7 @@ multiSearchAnyUTF8(haystack, [needle1, needle2, ..., needleN]) **Parameters** - `haystack` — UTF-8 string in which the search is performed. [String](../../sql-reference/syntax.md#syntax-string-literal). -- `needle` — UTF-8 substrings to be searched. [Array](../../sql-reference/data-types/array.md). +- `needle` — UTF-8 substrings to be searched. [Array](../data-types/array.md). **Returned value** @@ -734,7 +734,7 @@ multiSearchAnyCaseInsensitiveUTF8(haystack, [needle1, needle2, ..., needleN]) **Parameters** - `haystack` — UTF-8 string in which the search is performed. [String](../../sql-reference/syntax.md#syntax-string-literal). -- `needle` — UTF-8 substrings to be searched. [Array](../../sql-reference/data-types/array.md) +- `needle` — UTF-8 substrings to be searched. [Array](../data-types/array.md) **Returned value** @@ -894,12 +894,12 @@ extractAllGroupsHorizontal(haystack, pattern) **Arguments** -- `haystack` — Input string. [String](../../sql-reference/data-types/string.md). -- `pattern` — Regular expression with [re2 syntax](https://github.com/google/re2/wiki/Syntax). Must contain groups, each group enclosed in parentheses. If `pattern` contains no groups, an exception is thrown. [String](../../sql-reference/data-types/string.md). +- `haystack` — Input string. [String](../data-types/string.md). +- `pattern` — Regular expression with [re2 syntax](https://github.com/google/re2/wiki/Syntax). Must contain groups, each group enclosed in parentheses. If `pattern` contains no groups, an exception is thrown. [String](../data-types/string.md). **Returned value** -- Array of arrays of matches. [Array](../../sql-reference/data-types/array.md). +- Array of arrays of matches. [Array](../data-types/array.md). :::note If `haystack` does not match the `pattern` regex, an array of empty arrays is returned. @@ -931,12 +931,12 @@ extractAllGroupsVertical(haystack, pattern) **Arguments** -- `haystack` — Input string. [String](../../sql-reference/data-types/string.md). -- `pattern` — Regular expression with [re2 syntax](https://github.com/google/re2/wiki/Syntax). Must contain groups, each group enclosed in parentheses. If `pattern` contains no groups, an exception is thrown. 
[String](../../sql-reference/data-types/string.md).
+- `haystack` — Input string. [String](../data-types/string.md).
+- `pattern` — Regular expression with [re2 syntax](https://github.com/google/re2/wiki/Syntax). Must contain groups, each group enclosed in parentheses. If `pattern` contains no groups, an exception is thrown. [String](../data-types/string.md).

**Returned value**

-- Array of arrays of matches. [Array](../../sql-reference/data-types/array.md).
+- Array of arrays of matches. [Array](../data-types/array.md).

:::note
If `haystack` does not match the `pattern` regex, an empty array is returned.
@@ -970,7 +970,7 @@ Matching is based on UTF-8, e.g. `_` matches the Unicode code point `¥` which i

If the haystack or the LIKE expression are not valid UTF-8, the behavior is undefined.

-No automatic Unicode normalization is performed, you can use the [normalizeUTF8*()](https://clickhouse.com/docs/en/sql-reference/functions/string-functions/) functions for that.
+No automatic Unicode normalization is performed, you can use the [normalizeUTF8*()](string-functions.md) functions for that.

To match against literal `%`, `_` and `\` (which are LIKE metacharacters), prepend them with a backslash: `\%`, `\_` and `\\`.
The backslash loses its special meaning (i.e. is interpreted literally) if it prepends a character different than `%`, `_` or `\`.
@@ -1007,7 +1007,7 @@ Alias: `haystack NOT ILIKE pattern` (operator)

## ngramDistance

-Calculates the 4-gram distance between a `haystack` string and a `needle` string. For this, it counts the symmetric difference between two multisets of 4-grams and normalizes it by the sum of their cardinalities. Returns a [Float32](../../sql-reference/data-types/float.md/#float32-float64) between 0 and 1. The smaller the result is, the more similar the strings are to each other.
+Calculates the 4-gram distance between a `haystack` string and a `needle` string. For this, it counts the symmetric difference between two multisets of 4-grams and normalizes it by the sum of their cardinalities. Returns a [Float32](../data-types/float.md/#float32-float64) between 0 and 1. The smaller the result is, the more similar the strings are to each other.

Functions [`ngramDistanceCaseInsensitive`](#ngramdistancecaseinsensitive), [`ngramDistanceUTF8`](#ngramdistanceutf8), [`ngramDistanceCaseInsensitiveUTF8`](#ngramdistancecaseinsensitiveutf8) provide case-insensitive and/or UTF-8 variants of this function.

@@ -1024,7 +1024,7 @@ ngramDistance(haystack, needle)

**Returned value**

-- Value between 0 and 1 representing the similarity between the two strings. [Float32](../../sql-reference/data-types/float.md/#float32-float64)
+- Value between 0 and 1 representing the similarity between the two strings. [Float32](../data-types/float.md/#float32-float64)

**Implementation details**

@@ -1078,7 +1078,7 @@ ngramDistanceCaseInsensitive(haystack, needle)

**Returned value**

-- Value between 0 and 1 representing the similarity between the two strings. [Float32](../../sql-reference/data-types/float.md/#float32-float64)
+- Value between 0 and 1 representing the similarity between the two strings. [Float32](../data-types/float.md/#float32-float64)

**Examples**

@@ -1127,7 +1127,7 @@ ngramDistanceUTF8(haystack, needle)

**Returned value**

-- Value between 0 and 1 representing the similarity between the two strings. [Float32](../../sql-reference/data-types/float.md/#float32-float64)
+- Value between 0 and 1 representing the similarity between the two strings. 
[Float32](../data-types/float.md/#float32-float64) **Example** @@ -1160,7 +1160,7 @@ ngramDistanceCaseInsensitiveUTF8(haystack, needle) **Returned value** -- Value between 0 and 1 representing the similarity between the two strings. [Float32](../../sql-reference/data-types/float.md/#float32-float64) +- Value between 0 and 1 representing the similarity between the two strings. [Float32](../data-types/float.md/#float32-float64) **Example** @@ -1178,7 +1178,7 @@ Result: ## ngramSearch -Like `ngramDistance` but calculates the non-symmetric difference between a `needle` string and a `haystack` string, i.e. the number of n-grams from the needle minus the common number of n-grams normalized by the number of `needle` n-grams. Returns a [Float32](../../sql-reference/data-types/float.md/#float32-float64) between 0 and 1. The bigger the result is, the more likely `needle` is in the `haystack`. This function is useful for fuzzy string search. Also see function [`soundex`](../../sql-reference/functions/string-functions#soundex). +Like `ngramDistance` but calculates the non-symmetric difference between a `needle` string and a `haystack` string, i.e. the number of n-grams from the needle minus the common number of n-grams normalized by the number of `needle` n-grams. Returns a [Float32](../data-types/float.md/#float32-float64) between 0 and 1. The bigger the result is, the more likely `needle` is in the `haystack`. This function is useful for fuzzy string search. Also see function [`soundex`](../../sql-reference/functions/string-functions#soundex). Functions [`ngramSearchCaseInsensitive`](#ngramsearchcaseinsensitive), [`ngramSearchUTF8`](#ngramsearchutf8), [`ngramSearchCaseInsensitiveUTF8`](#ngramsearchcaseinsensitiveutf8) provide case-insensitive and/or UTF-8 variants of this function. @@ -1195,7 +1195,7 @@ ngramSearch(haystack, needle) **Returned value** -- Value between 0 and 1 representing the likelihood of the `needle` being in the `haystack`. [Float32](../../sql-reference/data-types/float.md/#float32-float64) +- Value between 0 and 1 representing the likelihood of the `needle` being in the `haystack`. [Float32](../data-types/float.md/#float32-float64) **Implementation details** @@ -1234,7 +1234,7 @@ ngramSearchCaseInsensitive(haystack, needle) **Returned value** -- Value between 0 and 1 representing the likelihood of the `needle` being in the `haystack`. [Float32](../../sql-reference/data-types/float.md/#float32-float64) +- Value between 0 and 1 representing the likelihood of the `needle` being in the `haystack`. [Float32](../data-types/float.md/#float32-float64) The bigger the result is, the more likely `needle` is in the `haystack`. @@ -1269,7 +1269,7 @@ ngramSearchUTF8(haystack, needle) **Returned value** -- Value between 0 and 1 representing the likelihood of the `needle` being in the `haystack`. [Float32](../../sql-reference/data-types/float.md/#float32-float64) +- Value between 0 and 1 representing the likelihood of the `needle` being in the `haystack`. [Float32](../data-types/float.md/#float32-float64) The bigger the result is, the more likely `needle` is in the `haystack`. @@ -1304,7 +1304,7 @@ ngramSearchCaseInsensitiveUTF8(haystack, needle) **Returned value** -- Value between 0 and 1 representing the likelihood of the `needle` being in the `haystack`. [Float32](../../sql-reference/data-types/float.md/#float32-float64) +- Value between 0 and 1 representing the likelihood of the `needle` being in the `haystack`. 
[Float32](../data-types/float.md/#float32-float64) The bigger the result is, the more likely `needle` is in the `haystack`. @@ -1338,11 +1338,11 @@ countSubstrings(haystack, needle[, start_pos]) - `haystack` — String in which the search is performed. [String](../../sql-reference/syntax.md#syntax-string-literal). - `needle` — Substring to be searched. [String](../../sql-reference/syntax.md#syntax-string-literal). -- `start_pos` – Position (1-based) in `haystack` at which the search starts. [UInt](../../sql-reference/data-types/int-uint.md). Optional. +- `start_pos` – Position (1-based) in `haystack` at which the search starts. [UInt](../data-types/int-uint.md). Optional. **Returned values** -- The number of occurrences. [UInt64](../../sql-reference/data-types/int-uint.md). +- The number of occurrences. [UInt64](../data-types/int-uint.md). **Examples** @@ -1385,11 +1385,11 @@ countSubstringsCaseInsensitive(haystack, needle[, start_pos]) - `haystack` — String in which the search is performed. [String](../../sql-reference/syntax.md#syntax-string-literal). - `needle` — Substring to be searched. [String](../../sql-reference/syntax.md#syntax-string-literal). -- `start_pos` – Position (1-based) in `haystack` at which the search starts. [UInt](../../sql-reference/data-types/int-uint.md). Optional. +- `start_pos` – Position (1-based) in `haystack` at which the search starts. [UInt](../data-types/int-uint.md). Optional. **Returned values** -- The number of occurrences. [UInt64](../../sql-reference/data-types/int-uint.md). +- The number of occurrences. [UInt64](../data-types/int-uint.md). **Examples** @@ -1437,11 +1437,11 @@ countSubstringsCaseInsensitiveUTF8(haystack, needle[, start_pos]) - `haystack` — UTF-8 string in which the search is performed. [String](../../sql-reference/syntax.md#syntax-string-literal). - `needle` — Substring to be searched. [String](../../sql-reference/syntax.md#syntax-string-literal). -- `start_pos` – Position (1-based) in `haystack` at which the search starts. [UInt](../../sql-reference/data-types/int-uint.md). Optional. +- `start_pos` – Position (1-based) in `haystack` at which the search starts. [UInt](../data-types/int-uint.md). Optional. **Returned values** -- The number of occurrences. [UInt64](../../sql-reference/data-types/int-uint.md). +- The number of occurrences. [UInt64](../data-types/int-uint.md). **Examples** @@ -1488,11 +1488,11 @@ countMatches(haystack, pattern) **Arguments** - `haystack` — The string to search in. [String](../../sql-reference/syntax.md#syntax-string-literal). -- `pattern` — The regular expression with [re2 syntax](https://github.com/google/re2/wiki/Syntax). [String](../../sql-reference/data-types/string.md). +- `pattern` — The regular expression with [re2 syntax](https://github.com/google/re2/wiki/Syntax). [String](../data-types/string.md). **Returned value** -- The number of matches. [UInt64](../../sql-reference/data-types/int-uint.md). +- The number of matches. [UInt64](../data-types/int-uint.md). **Examples** @@ -1533,11 +1533,11 @@ countMatchesCaseInsensitive(haystack, pattern) **Arguments** - `haystack` — The string to search in. [String](../../sql-reference/syntax.md#syntax-string-literal). -- `pattern` — The regular expression with [re2 syntax](https://github.com/google/re2/wiki/Syntax). [String](../../sql-reference/data-types/string.md). +- `pattern` — The regular expression with [re2 syntax](https://github.com/google/re2/wiki/Syntax). [String](../data-types/string.md). **Returned value** -- The number of matches. 
[UInt64](../../sql-reference/data-types/int-uint.md).
+- The number of matches. [UInt64](../data-types/int-uint.md).

**Examples**

@@ -1571,7 +1571,7 @@ Alias: `REGEXP_EXTRACT(haystack, pattern[, index])`.

- `haystack` — String, in which regexp pattern will to be matched. [String](../../sql-reference/syntax.md#syntax-string-literal).
- `pattern` — String, regexp expression, must be constant. [String](../../sql-reference/syntax.md#syntax-string-literal).
-- `index` – An integer number greater or equal 0 with default 1. It represents which regex group to extract. [UInt or Int](../../sql-reference/data-types/int-uint.md). Optional.
+- `index` – An integer number greater than or equal to 0, with default 1. It represents which regex group to extract. [UInt or Int](../data-types/int-uint.md). Optional.

**Returned values**

diff --git a/docs/en/sql-reference/functions/time-series-functions.md b/docs/en/sql-reference/functions/time-series-functions.md
index beb7a0503b9..da8ed1f51ba 100644
--- a/docs/en/sql-reference/functions/time-series-functions.md
+++ b/docs/en/sql-reference/functions/time-series-functions.md
@@ -30,7 +30,7 @@ At least four data points are required in `series` to detect outliers.

**Returned value**

-- Returns an array of the same length as the input array where each value represents score of possible anomaly of corresponding element in the series. A non-zero score indicates a possible anomaly. [Array](../../sql-reference/data-types/array.md).
+- Returns an array of the same length as the input array, where each value represents the anomaly score of the corresponding element in the series. A non-zero score indicates a possible anomaly. [Array](../data-types/array.md).

**Examples**

@@ -79,8 +79,8 @@ seriesPeriodDetectFFT(series);

**Returned value**

-- A real value equal to the period of series data. [Float64](../../sql-reference/data-types/float.md).
-- Returns NAN when number of data points are less than four. [nan](../../sql-reference/data-types/float.md/#nan-and-inf).
+- A real value equal to the period of series data. [Float64](../data-types/float.md).
+- Returns NAN when the number of data points is less than four. [nan](../data-types/float.md/#nan-and-inf).

**Examples**

@@ -130,7 +130,7 @@ The number of data points in `series` should be at least twice the value of `per
**Returned value**

- An array of four arrays where the first array include seasonal components, the second array - trend,
-the third array - residue component, and the fourth array - baseline(seasonal + trend) component. [Array](../../sql-reference/data-types/array.md).
+the third array - residue component, and the fourth array - baseline (seasonal + trend) component. [Array](../data-types/array.md).

**Examples**

diff --git a/docs/en/sql-reference/functions/time-window-functions.md b/docs/en/sql-reference/functions/time-window-functions.md
index 2b5f093c149..2cec1987c20 100644
--- a/docs/en/sql-reference/functions/time-window-functions.md
+++ b/docs/en/sql-reference/functions/time-window-functions.md
@@ -17,8 +17,8 @@ tumble(time_attr, interval [, timezone])
```

**Arguments**
-- `time_attr` - Date and time. [DateTime](../../sql-reference/data-types/datetime.md) data type.
-- `interval` - Window interval in [Interval](../../sql-reference/data-types/special-data-types/interval.md) data type.
+- `time_attr` - Date and time. [DateTime](../data-types/datetime.md) data type.
+- `interval` - Window interval in [Interval](../data-types/special-data-types/interval.md) data type.
- `timezone` — [Timezone name](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) (optional). **Returned values** @@ -51,9 +51,9 @@ hop(time_attr, hop_interval, window_interval [, timezone]) **Arguments** -- `time_attr` - Date and time. [DateTime](../../sql-reference/data-types/datetime.md) data type. -- `hop_interval` - Hop interval in [Interval](../../sql-reference/data-types/special-data-types/interval.md) data type. Should be a positive number. -- `window_interval` - Window interval in [Interval](../../sql-reference/data-types/special-data-types/interval.md) data type. Should be a positive number. +- `time_attr` - Date and time. [DateTime](../data-types/datetime.md) data type. +- `hop_interval` - Hop interval in [Interval](../data-types/special-data-types/interval.md) data type. Should be a positive number. +- `window_interval` - Window interval in [Interval](../data-types/special-data-types/interval.md) data type. Should be a positive number. - `timezone` — [Timezone name](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) (optional). **Returned values** diff --git a/docs/en/sql-reference/functions/tuple-functions.md b/docs/en/sql-reference/functions/tuple-functions.md index b4fa442a637..0663be08240 100644 --- a/docs/en/sql-reference/functions/tuple-functions.md +++ b/docs/en/sql-reference/functions/tuple-functions.md @@ -35,7 +35,7 @@ tupleElement(tuple, name, [, default_value]) ## untuple -Performs syntactic substitution of [tuple](../../sql-reference/data-types/tuple.md#tuplet1-t2) elements in the call location. +Performs syntactic substitution of [tuple](../data-types/tuple.md#tuplet1-t2) elements in the call location. The names of the result columns are implementation-specific and subject to change. Do not assume specific column names after `untuple`. @@ -49,7 +49,7 @@ You can use the `EXCEPT` expression to skip columns as a result of the query. **Arguments** -- `x` — A `tuple` function, column, or tuple of elements. [Tuple](../../sql-reference/data-types/tuple.md). +- `x` — A `tuple` function, column, or tuple of elements. [Tuple](../data-types/tuple.md). **Returned value** @@ -111,7 +111,7 @@ Result: **See Also** -- [Tuple](../../sql-reference/data-types/tuple.md) +- [Tuple](../data-types/tuple.md) ## tupleHammingDistance @@ -125,8 +125,8 @@ tupleHammingDistance(tuple1, tuple2) **Arguments** -- `tuple1` — First tuple. [Tuple](../../sql-reference/data-types/tuple.md). -- `tuple2` — Second tuple. [Tuple](../../sql-reference/data-types/tuple.md). +- `tuple1` — First tuple. [Tuple](../data-types/tuple.md). +- `tuple2` — Second tuple. [Tuple](../data-types/tuple.md). Tuples should have the same type of the elements. @@ -198,11 +198,11 @@ tupleToNameValuePairs(tuple) **Arguments** -- `tuple` — Named tuple. [Tuple](../../sql-reference/data-types/tuple.md) with any types of values. +- `tuple` — Named tuple. [Tuple](../data-types/tuple.md) with any types of values. **Returned value** -- An array with (name, value) pairs. [Array](../../sql-reference/data-types/array.md)([Tuple](../../sql-reference/data-types/tuple.md)([String](../../sql-reference/data-types/string.md), ...)). +- An array with (name, value) pairs. [Array](../data-types/array.md)([Tuple](../data-types/tuple.md)([String](../data-types/string.md), ...)). **Example** @@ -273,12 +273,12 @@ Alias: `vectorSum`. **Arguments** -- `tuple1` — First tuple. [Tuple](../../sql-reference/data-types/tuple.md). 
-- `tuple2` — Second tuple. [Tuple](../../sql-reference/data-types/tuple.md). +- `tuple1` — First tuple. [Tuple](../data-types/tuple.md). +- `tuple2` — Second tuple. [Tuple](../data-types/tuple.md). **Returned value** -- Tuple with the sum. [Tuple](../../sql-reference/data-types/tuple.md). +- Tuple with the sum. [Tuple](../data-types/tuple.md). **Example** @@ -310,12 +310,12 @@ Alias: `vectorDifference`. **Arguments** -- `tuple1` — First tuple. [Tuple](../../sql-reference/data-types/tuple.md). -- `tuple2` — Second tuple. [Tuple](../../sql-reference/data-types/tuple.md). +- `tuple1` — First tuple. [Tuple](../data-types/tuple.md). +- `tuple2` — Second tuple. [Tuple](../data-types/tuple.md). **Returned value** -- Tuple with the result of subtraction. [Tuple](../../sql-reference/data-types/tuple.md). +- Tuple with the result of subtraction. [Tuple](../data-types/tuple.md). **Example** @@ -345,12 +345,12 @@ tupleMultiply(tuple1, tuple2) **Arguments** -- `tuple1` — First tuple. [Tuple](../../sql-reference/data-types/tuple.md). -- `tuple2` — Second tuple. [Tuple](../../sql-reference/data-types/tuple.md). +- `tuple1` — First tuple. [Tuple](../data-types/tuple.md). +- `tuple2` — Second tuple. [Tuple](../data-types/tuple.md). **Returned value** -- Tuple with the multiplication. [Tuple](../../sql-reference/data-types/tuple.md). +- Tuple with the multiplication. [Tuple](../data-types/tuple.md). **Example** @@ -380,12 +380,12 @@ tupleDivide(tuple1, tuple2) **Arguments** -- `tuple1` — First tuple. [Tuple](../../sql-reference/data-types/tuple.md). -- `tuple2` — Second tuple. [Tuple](../../sql-reference/data-types/tuple.md). +- `tuple1` — First tuple. [Tuple](../data-types/tuple.md). +- `tuple2` — Second tuple. [Tuple](../data-types/tuple.md). **Returned value** -- Tuple with the result of division. [Tuple](../../sql-reference/data-types/tuple.md). +- Tuple with the result of division. [Tuple](../data-types/tuple.md). **Example** @@ -415,11 +415,11 @@ tupleNegate(tuple) **Arguments** -- `tuple` — [Tuple](../../sql-reference/data-types/tuple.md). +- `tuple` — [Tuple](../data-types/tuple.md). **Returned value** -- Tuple with the result of negation. [Tuple](../../sql-reference/data-types/tuple.md). +- Tuple with the result of negation. [Tuple](../data-types/tuple.md). **Example** @@ -449,12 +449,12 @@ tupleMultiplyByNumber(tuple, number) **Arguments** -- `tuple` — [Tuple](../../sql-reference/data-types/tuple.md). -- `number` — Multiplier. [Int/UInt](../../sql-reference/data-types/int-uint.md), [Float](../../sql-reference/data-types/float.md) or [Decimal](../../sql-reference/data-types/decimal.md). +- `tuple` — [Tuple](../data-types/tuple.md). +- `number` — Multiplier. [Int/UInt](../data-types/int-uint.md), [Float](../data-types/float.md) or [Decimal](../data-types/decimal.md). **Returned value** -- Tuple with multiplied values. [Tuple](../../sql-reference/data-types/tuple.md). +- Tuple with multiplied values. [Tuple](../data-types/tuple.md). **Example** @@ -484,12 +484,12 @@ tupleDivideByNumber(tuple, number) **Arguments** -- `tuple` — [Tuple](../../sql-reference/data-types/tuple.md). -- `number` — Divider. [Int/UInt](../../sql-reference/data-types/int-uint.md), [Float](../../sql-reference/data-types/float.md) or [Decimal](../../sql-reference/data-types/decimal.md). +- `tuple` — [Tuple](../data-types/tuple.md). +- `number` — Divider. [Int/UInt](../data-types/int-uint.md), [Float](../data-types/float.md) or [Decimal](../data-types/decimal.md). **Returned value** -- Tuple with divided values. 
[Tuple](../../sql-reference/data-types/tuple.md). +- Tuple with divided values. [Tuple](../data-types/tuple.md). **Example** @@ -517,7 +517,7 @@ tupleConcat(tuples) **Arguments** -- `tuples` – Arbitrary number of arguments of [Tuple](../../sql-reference/data-types/tuple.md) type. +- `tuples` – Arbitrary number of arguments of [Tuple](../data-types/tuple.md) type. **Example** diff --git a/docs/en/sql-reference/functions/tuple-map-functions.md b/docs/en/sql-reference/functions/tuple-map-functions.md index f02c8fde06c..d9c18e2a0a2 100644 --- a/docs/en/sql-reference/functions/tuple-map-functions.md +++ b/docs/en/sql-reference/functions/tuple-map-functions.md @@ -6,7 +6,7 @@ sidebar_label: Maps ## map -Arranges `key:value` pairs into [Map(key, value)](../../sql-reference/data-types/map.md) data type. +Arranges `key:value` pairs into [Map(key, value)](../data-types/map.md) data type. **Syntax** @@ -16,12 +16,12 @@ map(key1, value1[, key2, value2, ...]) **Arguments** -- `key` — The key part of the pair. Arbitrary type, except [Nullable](../../sql-reference/data-types/nullable.md) and [LowCardinality](../../sql-reference/data-types/lowcardinality.md) nested with [Nullable](../../sql-reference/data-types/nullable.md). -- `value` — The value part of the pair. Arbitrary type, including [Map](../../sql-reference/data-types/map.md) and [Array](../../sql-reference/data-types/array.md). +- `key` — The key part of the pair. Arbitrary type, except [Nullable](../data-types/nullable.md) and [LowCardinality](../data-types/lowcardinality.md) nested with [Nullable](../data-types/nullable.md). +- `value` — The value part of the pair. Arbitrary type, including [Map](../data-types/map.md) and [Array](../data-types/array.md). **Returned value** -- Data structure as `key:value` pairs. [Map(key, value)](../../sql-reference/data-types/map.md). +- Data structure as `key:value` pairs. [Map(key, value)](../data-types/map.md). **Examples** @@ -61,11 +61,11 @@ Result: **See Also** -- [Map(key, value)](../../sql-reference/data-types/map.md) data type +- [Map(key, value)](../data-types/map.md) data type ## mapFromArrays -Merges an [Array](../../sql-reference/data-types/array.md) of keys and an [Array](../../sql-reference/data-types/array.md) of values into a [Map(key, value)](../../sql-reference/data-types/map.md). Notice that the second argument could also be a [Map](../../sql-reference/data-types/map.md), thus it is casted to an Array when executing. +Merges an [Array](../data-types/array.md) of keys and an [Array](../data-types/array.md) of values into a [Map(key, value)](../data-types/map.md). Notice that the second argument could also be a [Map](../data-types/map.md), thus it is casted to an Array when executing. The function is a more convenient alternative to `CAST((key_array, value_array_or_map), 'Map(key_type, value_type)')`. For example, instead of writing `CAST((['aa', 'bb'], [4, 5]), 'Map(String, UInt32)')`, you can write `mapFromArrays(['aa', 'bb'], [4, 5])`. @@ -81,7 +81,7 @@ Alias: `MAP_FROM_ARRAYS(keys, values)` **Arguments** -- `keys` — Given key array to create a map from. 
The nested type of array must be: [String](../../sql-reference/data-types/string.md), [Integer](../../sql-reference/data-types/int-uint.md), [LowCardinality](../../sql-reference/data-types/lowcardinality.md), [FixedString](../../sql-reference/data-types/fixedstring.md), [UUID](../../sql-reference/data-types/uuid.md), [Date](../../sql-reference/data-types/date.md), [DateTime](../../sql-reference/data-types/datetime.md), [Date32](../../sql-reference/data-types/date32.md), [Enum](../../sql-reference/data-types/enum.md)
+- `keys` — Given key array to create a map from. The nested type of array must be: [String](../data-types/string.md), [Integer](../data-types/int-uint.md), [LowCardinality](../data-types/lowcardinality.md), [FixedString](../data-types/fixedstring.md), [UUID](../data-types/uuid.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), [Date32](../data-types/date32.md), [Enum](../data-types/enum.md)
- `values` - Given value array or map to create a map from.

**Returned value**
@@ -109,7 +109,7 @@ SELECT mapFromArrays([1, 2, 3], map('a', 1, 'b', 2, 'c', 3))

## extractKeyValuePairs

-Extracts key-value pairs, i.e. a [Map(String, String)](../../sql-reference/data-types/map.md), from a string. Parsing is robust towards noise (e.g. log files).
+Extracts key-value pairs, i.e. a [Map(String, String)](../data-types/map.md), from a string. Parsing is robust towards noise (e.g. log files).

A key-value pair consists of a key, followed by a `key_value_delimiter` and a value. Key value pairs must be separated by `pair_delimiter`. Quoted keys and values are also supported.

@@ -125,14 +125,14 @@ Alias:

**Arguments**

-- `data` - String to extract key-value pairs from. [String](../../sql-reference/data-types/string.md) or [FixedString](../../sql-reference/data-types/fixedstring.md).
-- `key_value_delimiter` - Character to be used as delimiter between the key and the value. Defaults to `:`. [String](../../sql-reference/data-types/string.md) or [FixedString](../../sql-reference/data-types/fixedstring.md).
-- `pair_delimiters` - Set of character to be used as delimiters between pairs. Defaults to ` `, `,` and `;`. [String](../../sql-reference/data-types/string.md) or [FixedString](../../sql-reference/data-types/fixedstring.md).
-- `quoting_character` - Character to be used as quoting character. Defaults to `"`. [String](../../sql-reference/data-types/string.md) or [FixedString](../../sql-reference/data-types/fixedstring.md).
+- `data` - String to extract key-value pairs from. [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md).
+- `key_value_delimiter` - Character to be used as delimiter between the key and the value. Defaults to `:`. [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md).
+- `pair_delimiters` - Set of characters to be used as delimiters between pairs. Defaults to ` `, `,` and `;`. [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md).
+- `quoting_character` - Character to be used as quoting character. Defaults to `"`. [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md).

**Returned values**

-- A [Map(String, String)](../../sql-reference/data-types/map.md) of key-value pairs.
+- A [Map(String, String)](../data-types/map.md) of key-value pairs.
**Examples**

@@ -221,11 +221,11 @@ mapAdd(arg1, arg2 [, ...])

**Arguments**

-Arguments are [maps](../../sql-reference/data-types/map.md) or [tuples](../../sql-reference/data-types/tuple.md#tuplet1-t2) of two [arrays](../../sql-reference/data-types/array.md#data-type-array), where items in the first array represent keys, and the second array contains values for the each key. All key arrays should have same type, and all value arrays should contain items which are promoted to the one type ([Int64](../../sql-reference/data-types/int-uint.md#int-ranges), [UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges) or [Float64](../../sql-reference/data-types/float.md#float32-float64)). The common promoted type is used as a type for the result array.
+Arguments are [maps](../data-types/map.md) or [tuples](../data-types/tuple.md#tuplet1-t2) of two [arrays](../data-types/array.md#data-type-array), where items in the first array represent keys, and the second array contains values for each key. All key arrays should have the same type, and all value arrays should contain items which are promoted to one type ([Int64](../data-types/int-uint.md#int-ranges), [UInt64](../data-types/int-uint.md#uint-ranges) or [Float64](../data-types/float.md#float32-float64)). The common promoted type is used as the type for the result array.

**Returned value**

-- Depending on the arguments returns one [map](../../sql-reference/data-types/map.md) or [tuple](../../sql-reference/data-types/tuple.md#tuplet1-t2), where the first array contains the sorted keys and the second array contains values.
+- Depending on the arguments returns one [map](../data-types/map.md) or [tuple](../data-types/tuple.md#tuplet1-t2), where the first array contains the sorted keys and the second array contains values.

**Example**

@@ -269,11 +269,11 @@ mapSubtract(Tuple(Array, Array), Tuple(Array, Array) [, ...])

**Arguments**

-Arguments are [maps](../../sql-reference/data-types/map.md) or [tuples](../../sql-reference/data-types/tuple.md#tuplet1-t2) of two [arrays](../../sql-reference/data-types/array.md#data-type-array), where items in the first array represent keys, and the second array contains values for the each key. All key arrays should have same type, and all value arrays should contain items which are promote to the one type ([Int64](../../sql-reference/data-types/int-uint.md#int-ranges), [UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges) or [Float64](../../sql-reference/data-types/float.md#float32-float64)). The common promoted type is used as a type for the result array.
+Arguments are [maps](../data-types/map.md) or [tuples](../data-types/tuple.md#tuplet1-t2) of two [arrays](../data-types/array.md#data-type-array), where items in the first array represent keys, and the second array contains values for each key. All key arrays should have the same type, and all value arrays should contain items which are promoted to one type ([Int64](../data-types/int-uint.md#int-ranges), [UInt64](../data-types/int-uint.md#uint-ranges) or [Float64](../data-types/float.md#float32-float64)). The common promoted type is used as the type for the result array.

**Returned value**

-- Depending on the arguments returns one [map](../../sql-reference/data-types/map.md) or [tuple](../../sql-reference/data-types/tuple.md#tuplet1-t2), where the first array contains the sorted keys and the second array contains values.
+- Depending on the arguments returns one [map](../data-types/map.md) or [tuple](../data-types/tuple.md#tuplet1-t2), where the first array contains the sorted keys and the second array contains values. **Example** @@ -322,21 +322,21 @@ For array arguments the number of elements in `keys` and `values` must be the sa **Arguments** -Arguments are [maps](../../sql-reference/data-types/map.md) or two [arrays](../../sql-reference/data-types/array.md#data-type-array), where the first array represent keys, and the second array contains values for the each key. +Arguments are [maps](../data-types/map.md) or two [arrays](../data-types/array.md#data-type-array), where the first array represent keys, and the second array contains values for the each key. Mapped arrays: -- `keys` — Array of keys. [Array](../../sql-reference/data-types/array.md#data-type-array)([Int](../../sql-reference/data-types/int-uint.md#uint-ranges)). -- `values` — Array of values. [Array](../../sql-reference/data-types/array.md#data-type-array)([Int](../../sql-reference/data-types/int-uint.md#uint-ranges)). -- `max` — Maximum key value. Optional. [Int8, Int16, Int32, Int64, Int128, Int256](../../sql-reference/data-types/int-uint.md#int-ranges). +- `keys` — Array of keys. [Array](../data-types/array.md#data-type-array)([Int](../data-types/int-uint.md#uint-ranges)). +- `values` — Array of values. [Array](../data-types/array.md#data-type-array)([Int](../data-types/int-uint.md#uint-ranges)). +- `max` — Maximum key value. Optional. [Int8, Int16, Int32, Int64, Int128, Int256](../data-types/int-uint.md#int-ranges). or -- `map` — Map with integer keys. [Map](../../sql-reference/data-types/map.md). +- `map` — Map with integer keys. [Map](../data-types/map.md). **Returned value** -- Depending on the arguments returns a [map](../../sql-reference/data-types/map.md) or a [tuple](../../sql-reference/data-types/tuple.md#tuplet1-t2) of two [arrays](../../sql-reference/data-types/array.md#data-type-array): keys in sorted order, and values the corresponding keys. +- Depending on the arguments returns a [map](../data-types/map.md) or a [tuple](../data-types/tuple.md#tuplet1-t2) of two [arrays](../data-types/array.md#data-type-array): keys in sorted order, and values the corresponding keys. **Example** @@ -380,12 +380,12 @@ mapContains(map, key) **Arguments** -- `map` — Map. [Map](../../sql-reference/data-types/map.md). +- `map` — Map. [Map](../data-types/map.md). - `key` — Key. Type matches the type of keys of `map` parameter. **Returned value** -- `1` if `map` contains `key`, `0` if not. [UInt8](../../sql-reference/data-types/int-uint.md). +- `1` if `map` contains `key`, `0` if not. [UInt8](../data-types/int-uint.md). **Example** @@ -413,7 +413,7 @@ Result: Returns all keys from the `map` parameter. -Can be optimized by enabling the [optimize_functions_to_subcolumns](../../operations/settings/settings.md#optimize-functions-to-subcolumns) setting. With `optimize_functions_to_subcolumns = 1` the function reads only [keys](../../sql-reference/data-types/map.md#map-subcolumns) subcolumn instead of reading and processing the whole column data. The query `SELECT mapKeys(m) FROM table` transforms to `SELECT m.keys FROM table`. +Can be optimized by enabling the [optimize_functions_to_subcolumns](../../operations/settings/settings.md#optimize-functions-to-subcolumns) setting. With `optimize_functions_to_subcolumns = 1` the function reads only [keys](../data-types/map.md#map-subcolumns) subcolumn instead of reading and processing the whole column data. 
The query `SELECT mapKeys(m) FROM table` transforms to `SELECT m.keys FROM table`. **Syntax** @@ -423,11 +423,11 @@ mapKeys(map) **Arguments** -- `map` — Map. [Map](../../sql-reference/data-types/map.md). +- `map` — Map. [Map](../data-types/map.md). **Returned value** -- Array containing all keys from the `map`. [Array](../../sql-reference/data-types/array.md). +- Array containing all keys from the `map`. [Array](../data-types/array.md). **Example** @@ -454,7 +454,7 @@ Result: Returns all values from the `map` parameter. -Can be optimized by enabling the [optimize_functions_to_subcolumns](../../operations/settings/settings.md#optimize-functions-to-subcolumns) setting. With `optimize_functions_to_subcolumns = 1` the function reads only [values](../../sql-reference/data-types/map.md#map-subcolumns) subcolumn instead of reading and processing the whole column data. The query `SELECT mapValues(m) FROM table` transforms to `SELECT m.values FROM table`. +Can be optimized by enabling the [optimize_functions_to_subcolumns](../../operations/settings/settings.md#optimize-functions-to-subcolumns) setting. With `optimize_functions_to_subcolumns = 1` the function reads only [values](../data-types/map.md#map-subcolumns) subcolumn instead of reading and processing the whole column data. The query `SELECT mapValues(m) FROM table` transforms to `SELECT m.values FROM table`. **Syntax** @@ -464,11 +464,11 @@ mapValues(map) **Arguments** -- `map` — Map. [Map](../../sql-reference/data-types/map.md). +- `map` — Map. [Map](../data-types/map.md). **Returned value** -- Array containing all the values from `map`. [Array](../../sql-reference/data-types/array.md). +- Array containing all the values from `map`. [Array](../data-types/array.md). **Example** @@ -500,7 +500,7 @@ mapContainsKeyLike(map, pattern) ``` **Arguments** -- `map` — Map. [Map](../../sql-reference/data-types/map.md). +- `map` — Map. [Map](../data-types/map.md). - `pattern` - String pattern to match. **Returned value** @@ -538,7 +538,7 @@ mapExtractKeyLike(map, pattern) **Arguments** -- `map` — Map. [Map](../../sql-reference/data-types/map.md). +- `map` — Map. [Map](../data-types/map.md). - `pattern` - String pattern to match. **Returned value** @@ -577,7 +577,7 @@ mapApply(func, map) **Arguments** - `func` - [Lambda function](../../sql-reference/functions/index.md#higher-order-functions---operator-and-lambdaparams-expr-function). -- `map` — [Map](../../sql-reference/data-types/map.md). +- `map` — [Map](../data-types/map.md). **Returned value** @@ -617,7 +617,7 @@ mapFilter(func, map) **Arguments** - `func` - [Lambda function](../../sql-reference/functions/index.md#higher-order-functions---operator-and-lambdaparams-expr-function). -- `map` — [Map](../../sql-reference/data-types/map.md). +- `map` — [Map](../data-types/map.md). **Returned value** @@ -658,8 +658,8 @@ mapUpdate(map1, map2) **Arguments** -- `map1` [Map](../../sql-reference/data-types/map.md). -- `map2` [Map](../../sql-reference/data-types/map.md). +- `map1` [Map](../data-types/map.md). +- `map2` [Map](../data-types/map.md). **Returned value** @@ -691,7 +691,7 @@ mapConcat(maps) **Arguments** -- `maps` – Arbitrary number of arguments of [Map](../../sql-reference/data-types/map.md) type. +- `maps` – Arbitrary number of arguments of [Map](../data-types/map.md) type. 
**Returned value** diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index f1c2e92f201..d123f317dc6 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -51,7 +51,7 @@ SETTINGS cast_keep_nullable = 1 ## toInt(8\|16\|32\|64\|128\|256) -Converts an input value to a value the [Int](/docs/en/sql-reference/data-types/int-uint.md) data type. This function family includes: +Converts an input value to a value the [Int](../data-types/int-uint.md) data type. This function family includes: - `toInt8(expr)` — Converts to a value of data type `Int8`. - `toInt16(expr)` — Converts to a value of data type `Int16`. @@ -62,7 +62,7 @@ Converts an input value to a value the [Int](/docs/en/sql-reference/data-types/i **Arguments** -- `expr` — [Expression](/docs/en/sql-reference/syntax.md/#syntax-expressions) returning a number or a string with the decimal representation of a number. Binary, octal, and hexadecimal representations of numbers are not supported. Leading zeroes are stripped. +- `expr` — [Expression](../syntax.md/#syntax-expressions) returning a number or a string with the decimal representation of a number. Binary, octal, and hexadecimal representations of numbers are not supported. Leading zeroes are stripped. **Returned value** @@ -70,7 +70,7 @@ Integer value in the `Int8`, `Int16`, `Int32`, `Int64`, `Int128` or `Int256` dat Functions use [rounding towards zero](https://en.wikipedia.org/wiki/Rounding#Rounding_towards_zero), meaning they truncate fractional digits of numbers. -The behavior of functions for the [NaN and Inf](/docs/en/sql-reference/data-types/float.md/#data_type-float-nan-inf) arguments is undefined. Remember about [numeric conversions issues](#numeric-conversion-issues), when using the functions. +The behavior of functions for the [NaN and Inf](../data-types/float.md/#data_type-float-nan-inf) arguments is undefined. Remember about [numeric conversions issues](#numeric-conversion-issues), when using the functions. **Example** @@ -90,7 +90,7 @@ Result: ## toInt(8\|16\|32\|64\|128\|256)OrZero -Takes an argument of type [String](/docs/en/sql-reference/data-types/string.md) and tries to parse it into an Int (8 \| 16 \| 32 \| 64 \| 128 \| 256). If unsuccessful, returns `0`. +Takes an argument of type [String](../data-types/string.md) and tries to parse it into an Int (8 \| 16 \| 32 \| 64 \| 128 \| 256). If unsuccessful, returns `0`. **Example** @@ -151,7 +151,7 @@ Result: ## toUInt(8\|16\|32\|64\|256) -Converts an input value to the [UInt](/docs/en/sql-reference/data-types/int-uint.md) data type. This function family includes: +Converts an input value to the [UInt](../data-types/int-uint.md) data type. This function family includes: - `toUInt8(expr)` — Converts to a value of data type `UInt8`. - `toUInt16(expr)` — Converts to a value of data type `UInt16`. @@ -161,7 +161,7 @@ Converts an input value to the [UInt](/docs/en/sql-reference/data-types/int-uint **Arguments** -- `expr` — [Expression](/docs/en/sql-reference/syntax.md/#syntax-expressions) returning a number or a string with the decimal representation of a number. Binary, octal, and hexadecimal representations of numbers are not supported. Leading zeroes are stripped. +- `expr` — [Expression](../syntax.md/#syntax-expressions) returning a number or a string with the decimal representation of a number. 
Binary, octal, and hexadecimal representations of numbers are not supported. Leading zeroes are stripped. **Returned value** @@ -169,7 +169,7 @@ Converts an input value to the [UInt](/docs/en/sql-reference/data-types/int-uint Functions use [rounding towards zero](https://en.wikipedia.org/wiki/Rounding#Rounding_towards_zero), meaning they truncate fractional digits of numbers. -The behavior of functions for negative arguments and for the [NaN and Inf](/docs/en/sql-reference/data-types/float.md/#data_type-float-nan-inf) arguments is undefined. If you pass a string with a negative number, for example `'-32'`, ClickHouse raises an exception. Remember about [numeric conversions issues](#numeric-conversion-issues), when using the functions. +The behavior of functions for negative arguments and for the [NaN and Inf](../data-types/float.md/#data_type-float-nan-inf) arguments is undefined. If you pass a string with a negative number, for example `'-32'`, ClickHouse raises an exception. Remember about [numeric conversions issues](#numeric-conversion-issues), when using the functions. **Example** @@ -203,9 +203,9 @@ Result: ## toDate -Converts the argument to [Date](/docs/en/sql-reference/data-types/date.md) data type. +Converts the argument to [Date](../data-types/date.md) data type. -If the argument is [DateTime](/docs/en/sql-reference/data-types/datetime.md) or [DateTime64](/docs/en/sql-reference/data-types/datetime64.md), it truncates it and leaves the date component of the DateTime: +If the argument is [DateTime](../data-types/datetime.md) or [DateTime64](../data-types/datetime64.md), it truncates it and leaves the date component of the DateTime: ```sql SELECT @@ -219,7 +219,7 @@ SELECT └─────────────────────┴───────────────┘ ``` -If the argument is a [String](/docs/en/sql-reference/data-types/string.md), it is parsed as [Date](/docs/en/sql-reference/data-types/date.md) or [DateTime](/docs/en/sql-reference/data-types/datetime.md). If it was parsed as [DateTime](/docs/en/sql-reference/data-types/datetime.md), the date component is being used: +If the argument is a [String](../data-types/string.md), it is parsed as [Date](../data-types/date.md) or [DateTime](../data-types/datetime.md). If it was parsed as [DateTime](../data-types/datetime.md), the date component is being used: ```sql SELECT @@ -247,7 +247,7 @@ SELECT └────────────┴───────────────────────────────────────────┘ ``` -If the argument is a number and looks like a UNIX timestamp (is greater than 65535), it is interpreted as a [DateTime](/docs/en/sql-reference/data-types/datetime.md), then truncated to [Date](/docs/en/sql-reference/data-types/date.md) in the current timezone. The timezone argument can be specified as a second argument of the function. The truncation to [Date](/docs/en/sql-reference/data-types/date.md) depends on the timezone: +If the argument is a number and looks like a UNIX timestamp (is greater than 65535), it is interpreted as a [DateTime](../data-types/datetime.md), then truncated to [Date](../data-types/date.md) in the current timezone. The timezone argument can be specified as a second argument of the function. The truncation to [Date](../data-types/date.md) depends on the timezone: ```sql SELECT @@ -276,7 +276,7 @@ date_Samoa_2: 2022-12-31 The example above demonstrates how the same UNIX timestamp can be interpreted as different dates in different time zones. 
-If the argument is a number and it is smaller than 65536, it is interpreted as the number of days since 1970-01-01 (the first UNIX day) and converted to [Date](/docs/en/sql-reference/data-types/date.md). It corresponds to the internal numeric representation of the `Date` data type. Example: +If the argument is a number and it is smaller than 65536, it is interpreted as the number of days since 1970-01-01 (the first UNIX day) and converted to [Date](../data-types/date.md). It corresponds to the internal numeric representation of the `Date` data type. Example: ```sql SELECT toDate(12345) @@ -317,7 +317,7 @@ SELECT ## toDateOrZero -The same as [toDate](#todate) but returns lower boundary of [Date](/docs/en/sql-reference/data-types/date.md) if an invalid argument is received. Only [String](/docs/en/sql-reference/data-types/string.md) argument is supported. +The same as [toDate](#todate) but returns lower boundary of [Date](../data-types/date.md) if an invalid argument is received. Only [String](../data-types/string.md) argument is supported. **Example** @@ -338,7 +338,7 @@ Result: ## toDateOrNull -The same as [toDate](#todate) but returns `NULL` if an invalid argument is received. Only [String](/docs/en/sql-reference/data-types/string.md) argument is supported. +The same as [toDate](#todate) but returns `NULL` if an invalid argument is received. Only [String](../data-types/string.md) argument is supported. **Example** @@ -359,7 +359,7 @@ Result: ## toDateOrDefault -Like [toDate](#todate) but if unsuccessful, returns a default value which is either the second argument (if specified), or otherwise the lower boundary of [Date](/docs/en/sql-reference/data-types/date.md). +Like [toDate](#todate) but if unsuccessful, returns a default value which is either the second argument (if specified), or otherwise the lower boundary of [Date](../data-types/date.md). **Syntax** @@ -386,7 +386,7 @@ Result: ## toDateTime -Converts an input value to [DateTime](/docs/en/sql-reference/data-types/datetime.md). +Converts an input value to [DateTime](../data-types/datetime.md). **Syntax** @@ -396,18 +396,18 @@ toDateTime(expr[, time_zone ]) **Arguments** -- `expr` — The value. [String](/docs/en/sql-reference/data-types/string.md), [Int](/docs/en/sql-reference/data-types/int-uint.md), [Date](/docs/en/sql-reference/data-types/date.md) or [DateTime](/docs/en/sql-reference/data-types/datetime.md). -- `time_zone` — Time zone. [String](/docs/en/sql-reference/data-types/string.md). +- `expr` — The value. [String](../data-types/string.md), [Int](../data-types/int-uint.md), [Date](../data-types/date.md) or [DateTime](../data-types/datetime.md). +- `time_zone` — Time zone. [String](../data-types/string.md). :::note If `expr` is a number, it is interpreted as the number of seconds since the beginning of the Unix Epoch (as Unix timestamp). -If `expr` is a [String](/docs/en/sql-reference/data-types/string.md), it may be interpreted as a Unix timestamp or as a string representation of date / date with time. +If `expr` is a [String](../data-types/string.md), it may be interpreted as a Unix timestamp or as a string representation of date / date with time. Thus, parsing of short numbers' string representations (up to 4 digits) is explicitly disabled due to ambiguity, e.g. a string `'1999'` may be both a year (an incomplete string representation of Date / DateTime) or a unix timestamp. Longer numeric strings are allowed. ::: **Returned value** -- A date time. [DateTime](/docs/en/sql-reference/data-types/datetime.md) +- A date time. 
[DateTime](../data-types/datetime.md) **Example** @@ -428,7 +428,7 @@ Result: ## toDateTimeOrZero -The same as [toDateTime](#todatetime) but returns lower boundary of [DateTime](/docs/en/sql-reference/data-types/datetime.md) if an invalid argument is received. Only [String](/docs/en/sql-reference/data-types/string.md) argument is supported. +The same as [toDateTime](#todatetime) but returns lower boundary of [DateTime](../data-types/datetime.md) if an invalid argument is received. Only [String](../data-types/string.md) argument is supported. **Example** @@ -449,7 +449,7 @@ Result: ## toDateTimeOrNull -The same as [toDateTime](#todatetime) but returns `NULL` if an invalid argument is received. Only [String](/docs/en/sql-reference/data-types/string.md) argument is supported. +The same as [toDateTime](#todatetime) but returns `NULL` if an invalid argument is received. Only [String](../data-types/string.md) argument is supported. **Example** @@ -470,7 +470,7 @@ Result: ## toDateTimeOrDefault -Like [toDateTime](#todatetime) but if unsuccessful, returns a default value which is either the third argument (if specified), or otherwise the lower boundary of [DateTime](/docs/en/sql-reference/data-types/datetime.md). +Like [toDateTime](#todatetime) but if unsuccessful, returns a default value which is either the third argument (if specified), or otherwise the lower boundary of [DateTime](../data-types/datetime.md). **Syntax** @@ -497,7 +497,7 @@ Result: ## toDate32 -Converts the argument to the [Date32](/docs/en/sql-reference/data-types/date32.md) data type. If the value is outside the range, `toDate32` returns the border values supported by [Date32](/docs/en/sql-reference/data-types/date32.md). If the argument has [Date](/docs/en/sql-reference/data-types/date.md) type, it's borders are taken into account. +Converts the argument to the [Date32](../data-types/date32.md) data type. If the value is outside the range, `toDate32` returns the border values supported by [Date32](../data-types/date32.md). If the argument has [Date](../data-types/date.md) type, it's borders are taken into account. **Syntax** @@ -507,11 +507,11 @@ toDate32(expr) **Arguments** -- `expr` — The value. [String](/docs/en/sql-reference/data-types/string.md), [UInt32](/docs/en/sql-reference/data-types/int-uint.md) or [Date](/docs/en/sql-reference/data-types/date.md). +- `expr` — The value. [String](../data-types/string.md), [UInt32](../data-types/int-uint.md) or [Date](../data-types/date.md). **Returned value** -- A calendar date. Type [Date32](/docs/en/sql-reference/data-types/date32.md). +- A calendar date. Type [Date32](../data-types/date32.md). **Example** @@ -539,7 +539,7 @@ SELECT toDate32('1899-01-01') AS value, toTypeName(value); └────────────┴────────────────────────────────────┘ ``` -3. With [Date](/docs/en/sql-reference/data-types/date.md) argument: +3. With [Date](../data-types/date.md) argument: ``` sql SELECT toDate32(toDate('1899-01-01')) AS value, toTypeName(value); @@ -553,7 +553,7 @@ SELECT toDate32(toDate('1899-01-01')) AS value, toTypeName(value); ## toDate32OrZero -The same as [toDate32](#todate32) but returns the min value of [Date32](/docs/en/sql-reference/data-types/date32.md) if an invalid argument is received. +The same as [toDate32](#todate32) but returns the min value of [Date32](../data-types/date32.md) if an invalid argument is received. **Example** @@ -593,7 +593,7 @@ Result: ## toDate32OrDefault -Converts the argument to the [Date32](/docs/en/sql-reference/data-types/date32.md) data type. 
If the value is outside the range, `toDate32OrDefault` returns the lower border value supported by [Date32](/docs/en/sql-reference/data-types/date32.md). If the argument has [Date](/docs/en/sql-reference/data-types/date.md) type, it's borders are taken into account. Returns default value if an invalid argument is received. +Converts the argument to the [Date32](../data-types/date32.md) data type. If the value is outside the range, `toDate32OrDefault` returns the lower border value supported by [Date32](../data-types/date32.md). If the argument has [Date](../data-types/date.md) type, it's borders are taken into account. Returns default value if an invalid argument is received. **Example** @@ -615,7 +615,7 @@ Result: ## toDateTime64 -Converts the argument to the [DateTime64](/docs/en/sql-reference/data-types/datetime64.md) data type. +Converts the argument to the [DateTime64](../data-types/datetime64.md) data type. **Syntax** @@ -625,13 +625,13 @@ toDateTime64(expr, scale, [timezone]) **Arguments** -- `expr` — The value. [String](/docs/en/sql-reference/data-types/string.md), [UInt32](/docs/en/sql-reference/data-types/int-uint.md), [Float](/docs/en/sql-reference/data-types/float.md) or [DateTime](/docs/en/sql-reference/data-types/datetime.md). +- `expr` — The value. [String](../data-types/string.md), [UInt32](../data-types/int-uint.md), [Float](../data-types/float.md) or [DateTime](../data-types/datetime.md). - `scale` - Tick size (precision): 10-precision seconds. Valid range: [ 0 : 9 ]. - `timezone` - Time zone of the specified datetime64 object. **Returned value** -- A calendar date and time of day, with sub-second precision. [DateTime64](/docs/en/sql-reference/data-types/datetime64.md). +- A calendar date and time of day, with sub-second precision. [DateTime64](../data-types/datetime64.md). **Example** @@ -692,7 +692,7 @@ SELECT toDateTime64('2019-01-01 00:00:00', 3, 'Asia/Istanbul') AS value, toTypeN ## toDecimal(32\|64\|128\|256) -Converts `value` to the [Decimal](/docs/en/sql-reference/data-types/decimal.md) data type with precision of `S`. The `value` can be a number or a string. The `S` (scale) parameter specifies the number of decimal places. +Converts `value` to the [Decimal](../data-types/decimal.md) data type with precision of `S`. The `value` can be a number or a string. The `S` (scale) parameter specifies the number of decimal places. - `toDecimal32(value, S)` - `toDecimal64(value, S)` @@ -701,7 +701,7 @@ Converts `value` to the [Decimal](/docs/en/sql-reference/data-types/decimal.md) ## toDecimal(32\|64\|128\|256)OrNull -Converts an input string to a [Nullable(Decimal(P,S))](/docs/en/sql-reference/data-types/decimal.md) data type value. This family of functions includes: +Converts an input string to a [Nullable(Decimal(P,S))](../data-types/decimal.md) data type value. This family of functions includes: - `toDecimal32OrNull(expr, S)` — Results in `Nullable(Decimal32(S))` data type. - `toDecimal64OrNull(expr, S)` — Results in `Nullable(Decimal64(S))` data type. @@ -712,7 +712,7 @@ These functions should be used instead of `toDecimal*()` functions, if you prefe **Arguments** -- `expr` — [Expression](/docs/en/sql-reference/syntax.md/#syntax-expressions), returns a value in the [String](/docs/en/sql-reference/data-types/string.md) data type. ClickHouse expects the textual representation of the decimal number. For example, `'1.111'`. +- `expr` — [Expression](../syntax.md/#syntax-expressions), returns a value in the [String](../data-types/string.md) data type. 
ClickHouse expects the textual representation of the decimal number. For example, `'1.111'`. - `S` — Scale, the number of decimal places in the resulting value. **Returned value** @@ -755,7 +755,7 @@ Result: ## toDecimal(32\|64\|128\|256)OrDefault -Converts an input string to a [Decimal(P,S)](/docs/en/sql-reference/data-types/decimal.md) data type value. This family of functions includes: +Converts an input string to a [Decimal(P,S)](../data-types/decimal.md) data type value. This family of functions includes: - `toDecimal32OrDefault(expr, S)` — Results in `Decimal32(S)` data type. - `toDecimal64OrDefault(expr, S)` — Results in `Decimal64(S)` data type. @@ -766,7 +766,7 @@ These functions should be used instead of `toDecimal*()` functions, if you prefe **Arguments** -- `expr` — [Expression](/docs/en/sql-reference/syntax.md/#syntax-expressions), returns a value in the [String](/docs/en/sql-reference/data-types/string.md) data type. ClickHouse expects the textual representation of the decimal number. For example, `'1.111'`. +- `expr` — [Expression](../syntax.md/#syntax-expressions), returns a value in the [String](../data-types/string.md) data type. ClickHouse expects the textual representation of the decimal number. For example, `'1.111'`. - `S` — Scale, the number of decimal places in the resulting value. **Returned value** @@ -808,7 +808,7 @@ Result: ## toDecimal(32\|64\|128\|256)OrZero -Converts an input value to the [Decimal(P,S)](/docs/en/sql-reference/data-types/decimal.md) data type. This family of functions includes: +Converts an input value to the [Decimal(P,S)](../data-types/decimal.md) data type. This family of functions includes: - `toDecimal32OrZero( expr, S)` — Results in `Decimal32(S)` data type. - `toDecimal64OrZero( expr, S)` — Results in `Decimal64(S)` data type. @@ -819,7 +819,7 @@ These functions should be used instead of `toDecimal*()` functions, if you prefe **Arguments** -- `expr` — [Expression](/docs/en/sql-reference/syntax.md/#syntax-expressions), returns a value in the [String](/docs/en/sql-reference/data-types/string.md) data type. ClickHouse expects the textual representation of the decimal number. For example, `'1.111'`. +- `expr` — [Expression](../syntax.md/#syntax-expressions), returns a value in the [String](../data-types/string.md) data type. ClickHouse expects the textual representation of the decimal number. For example, `'1.111'`. - `S` — Scale, the number of decimal places in the resulting value. **Returned value** @@ -919,7 +919,7 @@ Also see the `toUnixTimestamp` function. ## toFixedString(s, N) -Converts a [String](/docs/en/sql-reference/data-types/string.md) type argument to a [FixedString(N)](/docs/en/sql-reference/data-types/fixedstring.md) type (a string of fixed length N). +Converts a [String](../data-types/string.md) type argument to a [FixedString(N)](../data-types/fixedstring.md) type (a string of fixed length N). If the string has fewer bytes than N, it is padded with null bytes to the right. If the string has more bytes than N, an exception is thrown. ## toStringCutToZero(s) @@ -968,14 +968,14 @@ toDecimalString(number, scale) **Arguments** -- `number` — Value to be represented as String, [Int, UInt](/docs/en/sql-reference/data-types/int-uint.md), [Float](/docs/en/sql-reference/data-types/float.md), [Decimal](/docs/en/sql-reference/data-types/decimal.md), -- `scale` — Number of fractional digits, [UInt8](/docs/en/sql-reference/data-types/int-uint.md). 
- * Maximum scale for [Decimal](/docs/en/sql-reference/data-types/decimal.md) and [Int, UInt](/docs/en/sql-reference/data-types/int-uint.md) types is 77 (it is the maximum possible number of significant digits for Decimal), - * Maximum scale for [Float](/docs/en/sql-reference/data-types/float.md) is 60. +- `number` — Value to be represented as String, [Int, UInt](../data-types/int-uint.md), [Float](../data-types/float.md), [Decimal](../data-types/decimal.md), +- `scale` — Number of fractional digits, [UInt8](../data-types/int-uint.md). + * Maximum scale for [Decimal](../data-types/decimal.md) and [Int, UInt](../data-types/int-uint.md) types is 77 (it is the maximum possible number of significant digits for Decimal), + * Maximum scale for [Float](../data-types/float.md) is 60. **Returned value** -- Input value represented as [String](/docs/en/sql-reference/data-types/string.md) with given number of fractional digits (scale). +- Input value represented as [String](../data-types/string.md) with given number of fractional digits (scale). The number is rounded up or down according to common arithmetic in case requested scale is smaller than original number's scale. **Example** @@ -1017,7 +1017,7 @@ This function accepts a number or date or date with time and returns a FixedStri ## reinterpretAsUUID :::note -In addition to the UUID functions listed here, there is dedicated [UUID function documentation](/docs/en/sql-reference/functions/uuid-functions.md). +In addition to the UUID functions listed here, there is dedicated [UUID function documentation](../functions/uuid-functions.md). ::: Accepts 16 bytes string and returns UUID containing bytes representing the corresponding value in network byte order (big-endian). If the string isn't long enough, the function works as if the string is padded with the necessary number of null bytes to the end. If the string is longer than 16 bytes, the extra bytes at the end are ignored. @@ -1030,11 +1030,11 @@ reinterpretAsUUID(fixed_string) **Arguments** -- `fixed_string` — Big-endian byte string. [FixedString](/docs/en/sql-reference/data-types/fixedstring.md/#fixedstring). +- `fixed_string` — Big-endian byte string. [FixedString](../data-types/fixedstring.md/#fixedstring). **Returned value** -- The UUID type value. [UUID](/docs/en/sql-reference/data-types/uuid.md/#uuid-data-type). +- The UUID type value. [UUID](../data-types/uuid.md/#uuid-data-type). **Examples** @@ -1087,7 +1087,7 @@ reinterpret(x, type) **Arguments** - `x` — Any type. -- `type` — Destination type. [String](/docs/en/sql-reference/data-types/string.md). +- `type` — Destination type. [String](../data-types/string.md). **Returned value** @@ -1126,7 +1126,7 @@ x::t **Arguments** - `x` — A value to convert. May be of any type. -- `T` — The name of the target data type. [String](/docs/en/sql-reference/data-types/string.md). +- `T` — The name of the target data type. [String](../data-types/string.md). - `t` — The target data type. **Returned value** @@ -1175,9 +1175,9 @@ Result: └─────────────────────┴─────────────────────┴────────────┴─────────────────────┴───────────────────────────┘ ``` -Conversion to [FixedString (N)](/docs/en/sql-reference/data-types/fixedstring.md) only works for arguments of type [String](/docs/en/sql-reference/data-types/string.md) or [FixedString](/docs/en/sql-reference/data-types/fixedstring.md). +Conversion to [FixedString (N)](../data-types/fixedstring.md) only works for arguments of type [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). 
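As a small illustration of the rule above, a `String` value can be cast to `FixedString(N)` — padded with null bytes up to length N, assuming the same padding behaviour as `toFixedString` described earlier — while other argument types are not accepted. A minimal sketch:

```sql
SELECT
    CAST('abc' AS FixedString(5)) AS fs,
    toTypeName(fs) AS type,     -- FixedString(5)
    length(fs) AS byte_length;  -- 5: the value is padded with null bytes to the fixed size
```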
-Type conversion to [Nullable](/docs/en/sql-reference/data-types/nullable.md) and back is supported. +Type conversion to [Nullable](../data-types/nullable.md) and back is supported. **Example** @@ -1251,7 +1251,7 @@ Code: 70. DB::Exception: Received from localhost:9000. DB::Exception: Value in c ## accurateCastOrNull(x, T) -Converts input value `x` to the specified data type `T`. Always returns [Nullable](/docs/en/sql-reference/data-types/nullable.md) type and returns [NULL](/docs/en/sql-reference/syntax.md/#null-literal) if the casted value is not representable in the target type. +Converts input value `x` to the specified data type `T`. Always returns [Nullable](../data-types/nullable.md) type and returns [NULL](../syntax.md/#null-literal) if the casted value is not representable in the target type. **Syntax** @@ -1360,7 +1360,7 @@ Result: ## toInterval(Year\|Quarter\|Month\|Week\|Day\|Hour\|Minute\|Second) -Converts a Number type argument to an [Interval](/docs/en/sql-reference/data-types/special-data-types/interval.md) data type. +Converts a Number type argument to an [Interval](../data-types/special-data-types/interval.md) data type. **Syntax** @@ -1407,9 +1407,9 @@ Result: ## parseDateTime {#type_conversion_functions-parseDateTime} -Converts a [String](/docs/en/sql-reference/data-types/string.md) to [DateTime](/docs/en/sql-reference/data-types/datetime.md) according to a [MySQL format string](https://dev.mysql.com/doc/refman/8.0/en/date-and-time-functions.html#function_date-format). +Converts a [String](../data-types/string.md) to [DateTime](../data-types/datetime.md) according to a [MySQL format string](https://dev.mysql.com/doc/refman/8.0/en/date-and-time-functions.html#function_date-format). -This function is the opposite operation of function [formatDateTime](/docs/en/sql-reference/functions/date-time-functions.md#date_time_functions-formatDateTime). +This function is the opposite operation of function [formatDateTime](../functions/date-time-functions.md#date_time_functions-formatDateTime). **Syntax** @@ -1429,7 +1429,7 @@ Returns DateTime values parsed from input string according to a MySQL style form **Supported format specifiers** -All format specifiers listed in [formatDateTime](/docs/en/sql-reference/functions/date-time-functions.md#date_time_functions-formatDateTime) except: +All format specifiers listed in [formatDateTime](../functions/date-time-functions.md#date_time_functions-formatDateTime) except: - %Q: Quarter (1-4) **Example** @@ -1458,7 +1458,7 @@ Alias: `str_to_date`. Similar to [parseDateTime](#parsedatetime), except that the format string is in [Joda](https://joda-time.sourceforge.net/apidocs/org/joda/time/format/DateTimeFormat.html) instead of MySQL syntax. -This function is the opposite operation of function [formatDateTimeInJodaSyntax](/docs/en/sql-reference/functions/date-time-functions.md#date_time_functions-formatDateTimeInJodaSyntax). +This function is the opposite operation of function [formatDateTimeInJodaSyntax](../functions/date-time-functions.md#date_time_functions-formatDateTimeInJodaSyntax). 
**Syntax** @@ -1478,7 +1478,7 @@ Returns DateTime values parsed from input string according to a Joda style forma **Supported format specifiers** -All format specifiers listed in [formatDateTimeInJoda](/docs/en/sql-reference/functions/date-time-functions.md#date_time_functions-formatDateTime) are supported, except: +All format specifiers listed in [formatDateTimeInJoda](../functions/date-time-functions.md#date_time_functions-formatDateTime) are supported, except: - S: fraction of second - z: time zone - Z: time zone offset/id @@ -1504,7 +1504,7 @@ Same as for [parseDateTimeInJodaSyntax](#type_conversion_functions-parseDateTime ## parseDateTimeBestEffort ## parseDateTime32BestEffort -Converts a date and time in the [String](/docs/en/sql-reference/data-types/string.md) representation to [DateTime](/docs/en/sql-reference/data-types/datetime.md/#data_type-datetime) data type. +Converts a date and time in the [String](../data-types/string.md) representation to [DateTime](../data-types/datetime.md/#data_type-datetime) data type. The function parses [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601), [RFC 1123 - 5.2.14 RFC-822 Date and Time Specification](https://tools.ietf.org/html/rfc1123#page-55), ClickHouse’s and some other date and time formats. @@ -1516,8 +1516,8 @@ parseDateTimeBestEffort(time_string [, time_zone]) **Arguments** -- `time_string` — String containing a date and time to convert. [String](/docs/en/sql-reference/data-types/string.md). -- `time_zone` — Time zone. The function parses `time_string` according to the time zone. [String](/docs/en/sql-reference/data-types/string.md). +- `time_string` — String containing a date and time to convert. [String](../data-types/string.md). +- `time_zone` — Time zone. The function parses `time_string` according to the time zone. [String](../data-types/string.md). **Supported non-standard formats** @@ -1533,7 +1533,7 @@ If the year is not specified, it is considered to be equal to the current year. **Returned value** -- `time_string` converted to the [DateTime](/docs/en/sql-reference/data-types/datetime.md) data type. +- `time_string` converted to the [DateTime](../data-types/datetime.md) data type. **Examples** @@ -1665,7 +1665,7 @@ Same as [parseDateTimeBestEffortUS](#parsedatetimebesteffortUS) function except ## parseDateTime64BestEffort -Same as [parseDateTimeBestEffort](#parsedatetimebesteffort) function but also parse milliseconds and microseconds and returns [DateTime](/docs/en/sql-reference/functions/type-conversion-functions.md/#data_type-datetime) data type. +Same as [parseDateTimeBestEffort](#parsedatetimebesteffort) function but also parse milliseconds and microseconds and returns [DateTime](../functions/type-conversion-functions.md/#data_type-datetime) data type. **Syntax** @@ -1675,13 +1675,13 @@ parseDateTime64BestEffort(time_string [, precision [, time_zone]]) **Arguments** -- `time_string` — String containing a date or date with time to convert. [String](/docs/en/sql-reference/data-types/string.md). -- `precision` — Required precision. `3` — for milliseconds, `6` — for microseconds. Default — `3`. Optional. [UInt8](/docs/en/sql-reference/data-types/int-uint.md). -- `time_zone` — [Timezone](/docs/en/operations/server-configuration-parameters/settings.md/#server_configuration_parameters-timezone). The function parses `time_string` according to the timezone. Optional. [String](/docs/en/sql-reference/data-types/string.md). +- `time_string` — String containing a date or date with time to convert. [String](../data-types/string.md). 
+- `precision` — Required precision. `3` — for milliseconds, `6` — for microseconds. Default — `3`. Optional. [UInt8](../data-types/int-uint.md). +- `time_zone` — [Timezone](/docs/en/operations/server-configuration-parameters/settings.md/#server_configuration_parameters-timezone). The function parses `time_string` according to the timezone. Optional. [String](../data-types/string.md). **Returned value** -- `time_string` converted to the [DateTime](/docs/en/sql-reference/data-types/datetime.md) data type. +- `time_string` converted to the [DateTime](../data-types/datetime.md) data type. **Examples** @@ -1731,7 +1731,7 @@ Same as for [parseDateTime64BestEffort](#parsedatetime64besteffort), except that ## toLowCardinality -Converts input parameter to the [LowCardinality](/docs/en/sql-reference/data-types/lowcardinality.md) version of same data type. +Converts input parameter to the [LowCardinality](../data-types/lowcardinality.md) version of same data type. To convert data from the `LowCardinality` data type use the [CAST](#type_conversion_function-cast) function. For example, `CAST(x as String)`. @@ -1743,7 +1743,7 @@ toLowCardinality(expr) **Arguments** -- `expr` — [Expression](/docs/en/sql-reference/syntax.md/#syntax-expressions) resulting in one of the [supported data types](/docs/en/sql-reference/data-types/index.md/#data_types). +- `expr` — [Expression](../syntax.md/#syntax-expressions) resulting in one of the [supported data types](../data-types/index.md/#data_types). **Returned values** @@ -1978,7 +1978,7 @@ Result: ## snowflakeToDateTime -Extracts the timestamp component of a [Snowflake ID](https://en.wikipedia.org/wiki/Snowflake_ID) in [DateTime](/docs/en/sql-reference/data-types/datetime.md) format. +Extracts the timestamp component of a [Snowflake ID](https://en.wikipedia.org/wiki/Snowflake_ID) in [DateTime](../data-types/datetime.md) format. **Syntax** @@ -1988,12 +1988,12 @@ snowflakeToDateTime(value[, time_zone]) **Arguments** -- `value` — Snowflake ID. [Int64](/docs/en/sql-reference/data-types/int-uint.md). -- `time_zone` — [Timezone](/docs/en/operations/server-configuration-parameters/settings.md/#server_configuration_parameters-timezone). The function parses `time_string` according to the timezone. Optional. [String](/docs/en/sql-reference/data-types/string.md). +- `value` — Snowflake ID. [Int64](../data-types/int-uint.md). +- `time_zone` — [Timezone](/docs/en/operations/server-configuration-parameters/settings.md/#server_configuration_parameters-timezone). The function parses `time_string` according to the timezone. Optional. [String](../data-types/string.md). **Returned value** -- The timestamp component of `value` as a [DateTime](/docs/en/sql-reference/data-types/datetime.md) value. +- The timestamp component of `value` as a [DateTime](../data-types/datetime.md) value. **Example** @@ -2014,7 +2014,7 @@ Result: ## snowflakeToDateTime64 -Extracts the timestamp component of a [Snowflake ID](https://en.wikipedia.org/wiki/Snowflake_ID) in [DateTime64](/docs/en/sql-reference/data-types/datetime64.md) format. +Extracts the timestamp component of a [Snowflake ID](https://en.wikipedia.org/wiki/Snowflake_ID) in [DateTime64](../data-types/datetime64.md) format. **Syntax** @@ -2024,12 +2024,12 @@ snowflakeToDateTime64(value[, time_zone]) **Arguments** -- `value` — Snowflake ID. [Int64](/docs/en/sql-reference/data-types/int-uint.md). -- `time_zone` — [Timezone](/docs/en/operations/server-configuration-parameters/settings.md/#server_configuration_parameters-timezone). 
The function parses `time_string` according to the timezone. Optional. [String](/docs/en/sql-reference/data-types/string.md). +- `value` — Snowflake ID. [Int64](../data-types/int-uint.md). +- `time_zone` — [Timezone](/docs/en/operations/server-configuration-parameters/settings.md/#server_configuration_parameters-timezone). The function parses `time_string` according to the timezone. Optional. [String](../data-types/string.md). **Returned value** -- The timestamp component of `value` as a [DateTime64](/docs/en/sql-reference/data-types/datetime64.md) with scale = 3, i.e. millisecond precision. +- The timestamp component of `value` as a [DateTime64](../data-types/datetime64.md) with scale = 3, i.e. millisecond precision. **Example** @@ -2050,7 +2050,7 @@ Result: ## dateTimeToSnowflake -Converts a [DateTime](/docs/en/sql-reference/data-types/datetime.md) value to the first [Snowflake ID](https://en.wikipedia.org/wiki/Snowflake_ID) at the giving time. +Converts a [DateTime](../data-types/datetime.md) value to the first [Snowflake ID](https://en.wikipedia.org/wiki/Snowflake_ID) at the giving time. **Syntax** @@ -2060,11 +2060,11 @@ dateTimeToSnowflake(value) **Arguments** -- `value` — Date with time. [DateTime](/docs/en/sql-reference/data-types/datetime.md). +- `value` — Date with time. [DateTime](../data-types/datetime.md). **Returned value** -- Input value converted to the [Int64](/docs/en/sql-reference/data-types/int-uint.md) data type as the first Snowflake ID at that time. +- Input value converted to the [Int64](../data-types/int-uint.md) data type as the first Snowflake ID at that time. **Example** @@ -2084,7 +2084,7 @@ Result: ## dateTime64ToSnowflake -Convert a [DateTime64](/docs/en/sql-reference/data-types/datetime64.md) to the first [Snowflake ID](https://en.wikipedia.org/wiki/Snowflake_ID) at the giving time. +Convert a [DateTime64](../data-types/datetime64.md) to the first [Snowflake ID](https://en.wikipedia.org/wiki/Snowflake_ID) at the giving time. **Syntax** @@ -2094,11 +2094,11 @@ dateTime64ToSnowflake(value) **Arguments** -- `value` — Date with time. [DateTime64](/docs/en/sql-reference/data-types/datetime64.md). +- `value` — Date with time. [DateTime64](../data-types/datetime64.md). **Returned value** -- Input value converted to the [Int64](/docs/en/sql-reference/data-types/int-uint.md) data type as the first Snowflake ID at that time. +- Input value converted to the [Int64](../data-types/int-uint.md) data type as the first Snowflake ID at that time. **Example** diff --git a/docs/en/sql-reference/functions/ulid-functions.md b/docs/en/sql-reference/functions/ulid-functions.md index b4e3fc2d164..dc6a803d638 100644 --- a/docs/en/sql-reference/functions/ulid-functions.md +++ b/docs/en/sql-reference/functions/ulid-functions.md @@ -18,7 +18,7 @@ generateULID([x]) **Arguments** -- `x` — [Expression](../../sql-reference/syntax.md#syntax-expressions) resulting in any of the [supported data types](../../sql-reference/data-types/index.md#data_types). The resulting value is discarded, but the expression itself if used for bypassing [common subexpression elimination](../../sql-reference/functions/index.md#common-subexpression-elimination) if the function is called multiple times in one query. Optional parameter. +- `x` — [Expression](../../sql-reference/syntax.md#syntax-expressions) resulting in any of the [supported data types](../data-types/index.md#data_types). 
The resulting value is discarded, but the expression itself if used for bypassing [common subexpression elimination](../../sql-reference/functions/index.md#common-subexpression-elimination) if the function is called multiple times in one query. Optional parameter. **Returned value** @@ -60,12 +60,12 @@ ULIDStringToDateTime(ulid[, timezone]) **Arguments** -- `ulid` — Input ULID. [String](/docs/en/sql-reference/data-types/string.md) or [FixedString(26)](/docs/en/sql-reference/data-types/fixedstring.md). -- `timezone` — [Timezone name](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) for the returned value (optional). [String](../../sql-reference/data-types/string.md). +- `ulid` — Input ULID. [String](../data-types/string.md) or [FixedString(26)](../data-types/fixedstring.md). +- `timezone` — [Timezone name](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) for the returned value (optional). [String](../data-types/string.md). **Returned value** -- Timestamp with milliseconds precision. [DateTime64(3)](/docs/en/sql-reference/data-types/datetime64.md). +- Timestamp with milliseconds precision. [DateTime64(3)](../data-types/datetime64.md). **Usage example** diff --git a/docs/en/sql-reference/functions/url-functions.md b/docs/en/sql-reference/functions/url-functions.md index cc826b0bba4..130f0147ca1 100644 --- a/docs/en/sql-reference/functions/url-functions.md +++ b/docs/en/sql-reference/functions/url-functions.md @@ -28,7 +28,7 @@ domain(url) **Arguments** -- `url` — URL. [String](../../sql-reference/data-types/string.md). +- `url` — URL. [String](../data-types/string.md). The URL can be specified with or without a scheme. Examples: @@ -77,7 +77,7 @@ topLevelDomain(url) **Arguments** -- `url` — URL. [String](../../sql-reference/data-types/string.md). +- `url` — URL. [String](../data-types/string.md). The URL can be specified with or without a scheme. Examples: @@ -89,8 +89,8 @@ https://clickhouse.com/time/ **Returned values** -- Domain name. If ClickHouse can parse the input string as a URL. [String](../../sql-reference/data-types/string.md). -- Empty string. If ClickHouse cannot parse the input string as a URL. [String](../../sql-reference/data-types/string.md). +- Domain name. If ClickHouse can parse the input string as a URL. [String](../data-types/string.md). +- Empty string. If ClickHouse cannot parse the input string as a URL. [String](../data-types/string.md). **Example** @@ -153,12 +153,12 @@ cutToFirstSignificantSubdomainCustom(URL, TLD) **Arguments** -- `URL` — URL. [String](../../sql-reference/data-types/string.md). -- `TLD` — Custom TLD list name. [String](../../sql-reference/data-types/string.md). +- `URL` — URL. [String](../data-types/string.md). +- `TLD` — Custom TLD list name. [String](../data-types/string.md). **Returned value** -- Part of the domain that includes top-level subdomains up to the first significant subdomain. [String](../../sql-reference/data-types/string.md). +- Part of the domain that includes top-level subdomains up to the first significant subdomain. [String](../data-types/string.md). **Example** @@ -205,12 +205,12 @@ cutToFirstSignificantSubdomainCustomWithWWW(URL, TLD) **Arguments** -- `URL` — URL. [String](../../sql-reference/data-types/string.md). -- `TLD` — Custom TLD list name. [String](../../sql-reference/data-types/string.md). +- `URL` — URL. [String](../data-types/string.md). +- `TLD` — Custom TLD list name. [String](../data-types/string.md). 
**Returned value** -- Part of the domain that includes top-level subdomains up to the first significant subdomain without stripping `www`. [String](../../sql-reference/data-types/string.md). +- Part of the domain that includes top-level subdomains up to the first significant subdomain without stripping `www`. [String](../data-types/string.md). **Example** @@ -257,12 +257,12 @@ firstSignificantSubdomainCustom(URL, TLD) **Arguments** -- `URL` — URL. [String](../../sql-reference/data-types/string.md). -- `TLD` — Custom TLD list name. [String](../../sql-reference/data-types/string.md). +- `URL` — URL. [String](../data-types/string.md). +- `TLD` — Custom TLD list name. [String](../data-types/string.md). **Returned value** -- First significant subdomain. [String](../../sql-reference/data-types/string.md). +- First significant subdomain. [String](../data-types/string.md). **Example** @@ -408,7 +408,7 @@ netloc(URL) **Arguments** -- `url` — URL. [String](../../sql-reference/data-types/string.md). +- `url` — URL. [String](../data-types/string.md). **Returned value** @@ -462,8 +462,8 @@ cutURLParameter(URL, name) **Arguments** -- `url` — URL. [String](../../sql-reference/data-types/string.md). -- `name` — name of URL parameter. [String](../../sql-reference/data-types/string.md) or [Array](../../sql-reference/data-types/array.md) of Strings. +- `url` — URL. [String](../data-types/string.md). +- `name` — name of URL parameter. [String](../data-types/string.md) or [Array](../data-types/array.md) of Strings. **Returned value** diff --git a/docs/en/sql-reference/functions/uuid-functions.md b/docs/en/sql-reference/functions/uuid-functions.md index a16663afc5b..a4e4037eedc 100644 --- a/docs/en/sql-reference/functions/uuid-functions.md +++ b/docs/en/sql-reference/functions/uuid-functions.md @@ -18,7 +18,7 @@ generateUUIDv4([expr]) **Arguments** -- `expr` — An arbitrary [expression](../../sql-reference/syntax.md#syntax-expressions) used to bypass [common subexpression elimination](../../sql-reference/functions/index.md#common-subexpression-elimination) if the function is called multiple times in a query. The value of the expression has no effect on the returned UUID. Optional. +- `expr` — An arbitrary [expression](../syntax.md#syntax-expressions) used to bypass [common subexpression elimination](../functions/index.md#common-subexpression-elimination) if the function is called multiple times in a query. The value of the expression has no effect on the returned UUID. Optional. **Returned value** @@ -90,7 +90,7 @@ generateUUIDv7([expr]) **Arguments** -- `expr` — An arbitrary [expression](../../sql-reference/syntax.md#syntax-expressions) used to bypass [common subexpression elimination](../../sql-reference/functions/index.md#common-subexpression-elimination) if the function is called multiple times in a query. The value of the expression has no effect on the returned UUID. Optional. +- `expr` — An arbitrary [expression](../syntax.md#syntax-expressions) used to bypass [common subexpression elimination](../functions/index.md#common-subexpression-elimination) if the function is called multiple times in a query. The value of the expression has no effect on the returned UUID. Optional. 
**Returned value** @@ -163,7 +163,7 @@ generateUUIDv7ThreadMonotonic([expr]) **Arguments** -- `expr` — An arbitrary [expression](../../sql-reference/syntax.md#syntax-expressions) used to bypass [common subexpression elimination](../../sql-reference/functions/index.md#common-subexpression-elimination) if the function is called multiple times in a query. The value of the expression has no effect on the returned UUID. Optional. +- `expr` — An arbitrary [expression](../syntax.md#syntax-expressions) used to bypass [common subexpression elimination](../functions/index.md#common-subexpression-elimination) if the function is called multiple times in a query. The value of the expression has no effect on the returned UUID. Optional. **Returned value** @@ -233,7 +233,7 @@ generateUUIDv7NonMonotonic([expr]) **Arguments** -- `expr` — An arbitrary [expression](../../sql-reference/syntax.md#syntax-expressions) used to bypass [common subexpression elimination](../../sql-reference/functions/index.md#common-subexpression-elimination) if the function is called multiple times in a query. The value of the expression has no effect on the returned UUID. Optional. +- `expr` — An arbitrary [expression](../syntax.md#syntax-expressions) used to bypass [common subexpression elimination](../functions/index.md#common-subexpression-elimination) if the function is called multiple times in a query. The value of the expression has no effect on the returned UUID. Optional. **Returned value** @@ -379,8 +379,8 @@ Result: **Arguments** -- `string` — String of 36 characters or FixedString(36). [String](../../sql-reference/syntax.md#string). -- `default` — UUID to be used as the default if the first argument cannot be converted to a UUID type. [UUID](/docs/en/sql-reference/data-types/uuid.md). +- `string` — String of 36 characters or FixedString(36). [String](../syntax.md#string). +- `default` — UUID to be used as the default if the first argument cannot be converted to a UUID type. [UUID](../data-types/uuid.md). **Returned value** @@ -478,7 +478,7 @@ Result: ## UUIDStringToNum -Accepts `string` containing 36 characters in the format `xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx`, and returns a [FixedString(16)](../../sql-reference/data-types/fixedstring.md) as its binary representation, with its format optionally specified by `variant` (`Big-endian` by default). +Accepts `string` containing 36 characters in the format `xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx`, and returns a [FixedString(16)](../data-types/fixedstring.md) as its binary representation, with its format optionally specified by `variant` (`Big-endian` by default). **Syntax** @@ -488,7 +488,7 @@ UUIDStringToNum(string[, variant = 1]) **Arguments** -- `string` — A [String](../../sql-reference/syntax.md#syntax-string-literal) of 36 characters or [FixedString](../../sql-reference/syntax.md#syntax-string-literal) +- `string` — A [String](../syntax.md#syntax-string-literal) of 36 characters or [FixedString](../syntax.md#syntax-string-literal) - `variant` — Integer, representing a variant as specified by [RFC4122](https://datatracker.ietf.org/doc/html/rfc4122#section-4.1.1). 1 = `Big-endian` (default), 2 = `Microsoft`. **Returned value** @@ -537,7 +537,7 @@ UUIDNumToString(binary[, variant = 1]) **Arguments** -- `binary` — [FixedString(16)](../../sql-reference/data-types/fixedstring.md) as a binary representation of a UUID. +- `binary` — [FixedString(16)](../data-types/fixedstring.md) as a binary representation of a UUID. 
- `variant` — Integer, representing a variant as specified by [RFC4122](https://datatracker.ietf.org/doc/html/rfc4122#section-4.1.1). 1 = `Big-endian` (default), 2 = `Microsoft`. **Returned value** @@ -576,7 +576,7 @@ Result: ## UUIDToNum -Accepts a [UUID](../../sql-reference/data-types/uuid.md) and returns its binary representation as a [FixedString(16)](../../sql-reference/data-types/fixedstring.md), with its format optionally specified by `variant` (`Big-endian` by default). This function replaces calls to two separate functions `UUIDStringToNum(toString(uuid))` so no intermediate conversion from UUID to string is required to extract bytes from a UUID. +Accepts a [UUID](../data-types/uuid.md) and returns its binary representation as a [FixedString(16)](../data-types/fixedstring.md), with its format optionally specified by `variant` (`Big-endian` by default). This function replaces calls to two separate functions `UUIDStringToNum(toString(uuid))` so no intermediate conversion from UUID to string is required to extract bytes from a UUID. **Syntax** @@ -636,11 +636,11 @@ UUIDv7ToDateTime(uuid[, timezone]) **Arguments** - `uuid` — [UUID](../data-types/uuid.md) of version 7. -- `timezone` — [Timezone name](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) for the returned value (optional). [String](../../sql-reference/data-types/string.md). +- `timezone` — [Timezone name](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) for the returned value (optional). [String](../data-types/string.md). **Returned value** -- Timestamp with milliseconds precision. If the UUID is not a valid version 7 UUID, it returns 1970-01-01 00:00:00.000. [DateTime64(3)](/docs/en/sql-reference/data-types/datetime64.md). +- Timestamp with milliseconds precision. If the UUID is not a valid version 7 UUID, it returns 1970-01-01 00:00:00.000. [DateTime64(3)](../data-types/datetime64.md). **Usage examples** @@ -684,4 +684,4 @@ serverUUID() ## See also -- [dictGetUUID](../../sql-reference/functions/ext-dict-functions.md#ext_dict_functions-other) +- [dictGetUUID](../functions/ext-dict-functions.md#ext_dict_functions-other) diff --git a/docs/en/sql-reference/functions/ym-dict-functions.md b/docs/en/sql-reference/functions/ym-dict-functions.md index 043686889c4..03251f0b9af 100644 --- a/docs/en/sql-reference/functions/ym-dict-functions.md +++ b/docs/en/sql-reference/functions/ym-dict-functions.md @@ -432,13 +432,13 @@ regionIn(lhs, rhs\[, geobase\]) **Parameters** -- `lhs` — Lhs region ID from the geobase. [UInt32](../../sql-reference/data-types/int-uint). -- `rhs` — Rhs region ID from the geobase. [UInt32](../../sql-reference/data-types/int-uint). +- `lhs` — Lhs region ID from the geobase. [UInt32](../data-types/int-uint). +- `rhs` — Rhs region ID from the geobase. [UInt32](../data-types/int-uint). - `geobase` — Dictionary key. See [Multiple Geobases](#multiple-geobases). [String](../data-types/string). Optional. **Returned value** -- 1, if it belongs. [UInt8](../../sql-reference/data-types/int-uint). +- 1, if it belongs. [UInt8](../data-types/int-uint). - 0, if it doesn't belong. 
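A usage sketch is shown below; it assumes a geobase is configured on the server and uses purely illustrative region IDs (here `213` stands for a city and `225` for the country that contains it):

```sql
-- Both IDs are hypothetical geobase identifiers, chosen only for illustration.
SELECT
    regionIn(213, 225) AS city_in_country,  -- expected to return 1
    regionIn(225, 213) AS country_in_city;  -- expected to return 0
```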
**Implementation details** From e87c168bd86a0697621b5692f80b1f64e40337a5 Mon Sep 17 00:00:00 2001 From: Blargian Date: Fri, 24 May 2024 06:42:13 +0200 Subject: [PATCH 319/392] Turn multi-line returns into a single line --- .../sql-reference/functions/introspection.md | 13 ++--- .../functions/splitting-merging-functions.md | 3 +- .../functions/string-search-functions.md | 58 +++++++------------ .../functions/time-series-functions.md | 3 +- .../sql-reference/functions/url-functions.md | 6 +- 5 files changed, 31 insertions(+), 52 deletions(-) diff --git a/docs/en/sql-reference/functions/introspection.md b/docs/en/sql-reference/functions/introspection.md index 540e148e3f1..5dc57e70591 100644 --- a/docs/en/sql-reference/functions/introspection.md +++ b/docs/en/sql-reference/functions/introspection.md @@ -112,9 +112,11 @@ trace_source_code_lines: /lib/x86_64-linux-gnu/libpthread-2.27.so ## addressToLineWithInlines -Similar to `addressToLine`, but it will return an Array with all inline functions, and will be much slower as a price. +Similar to `addressToLine`, but returns an Array with all inline functions. As a result of this, it is slower than `addressToLine`. +:::note If you use official ClickHouse packages, you need to install the `clickhouse-common-static-dbg` package. +::: **Syntax** @@ -128,11 +130,7 @@ addressToLineWithInlines(address_of_binary_instruction) **Returned value** -- Array which first element is source code filename and the line number in this file delimited by colon. And from second element, inline functions' source code filename and line number and function name are listed. - -- Array with single element which is name of a binary, if the function couldn’t find the debug information. - -- Empty array, if the address is not valid. [Array(String)](../data-types/array.md). +- An array whose first element is the source code filename and line number in the file delimited by a colon. From the second element onwards, inline functions' source code filenames, line numbers and function names are listed. If the function couldn’t find the debug information, then an array with a single element equal to the name of the binary is returned, otherwise an empty array is returned if the address is not valid. [Array(String)](../data-types/array.md). **Example** @@ -324,8 +322,7 @@ demangle(symbol) **Returned value** -- Name of the C++ function. [String](../data-types/string.md). -- Empty string if a symbol is not valid. [String](../data-types/string.md). +- Name of the C++ function, or an empty string if the symbol is not valid. [String](../data-types/string.md). **Example** diff --git a/docs/en/sql-reference/functions/splitting-merging-functions.md b/docs/en/sql-reference/functions/splitting-merging-functions.md index 9ec4ee974c4..a3c28504a29 100644 --- a/docs/en/sql-reference/functions/splitting-merging-functions.md +++ b/docs/en/sql-reference/functions/splitting-merging-functions.md @@ -328,8 +328,7 @@ extractAllGroups(text, regexp) **Returned values** -- If the function finds at least one matching group, it returns `Array(Array(String))` column, clustered by group_id (1 to N, where N is number of capturing groups in `regexp`). [Array](../data-types/array.md). -- If there is no matching group, returns an empty array. [Array](../data-types/array.md). +- If the function finds at least one matching group, it returns `Array(Array(String))` column, clustered by group_id (1 to N, where N is number of capturing groups in `regexp`). If there is no matching group, it returns an empty array. 
[Array](../data-types/array.md). **Example** diff --git a/docs/en/sql-reference/functions/string-search-functions.md b/docs/en/sql-reference/functions/string-search-functions.md index 07f776906e6..d261cff3580 100644 --- a/docs/en/sql-reference/functions/string-search-functions.md +++ b/docs/en/sql-reference/functions/string-search-functions.md @@ -40,7 +40,7 @@ Alias: - `needle` — Substring to be searched. [String](../../sql-reference/syntax.md#syntax-string-literal). - `start_pos` – Position (1-based) in `haystack` at which the search starts. [UInt](../data-types/int-uint.md). Optional. -**Returned values** +**Returned value** - Starting position in bytes and counting from 1, if the substring was found. [UInt64](../data-types/int-uint.md). - 0, if the substring was not found. [UInt64](../data-types/int-uint.md). @@ -206,7 +206,7 @@ multiSearchAllPositions(haystack, [needle1, needle2, ..., needleN]) - `haystack` — String in which the search is performed. [String](../../sql-reference/syntax.md#syntax-string-literal). - `needle` — Substrings to be searched. [Array](../data-types/array.md). -**Returned values** +**Returned value** - Array of the starting position in bytes and counting from 1, if the substring was found. - 0, if the substring was not found. @@ -492,8 +492,7 @@ multiSearchFirstIndex(haystack, [needle1, needle2, ..., needleN]) **Returned value** -- index (starting from 1) of the leftmost found needle. -- 0, if there was no match. +- index (starting from 1) of the leftmost found needle. Otherwise 0, if there was no match. [UInt8](../data-types/int-uint.md). **Example** @@ -526,8 +525,7 @@ multiSearchFirstIndexCaseInsensitive(haystack, [needle1, needle2, ..., needleN]) **Returned value** -- index (starting from 1) of the leftmost found needle. -- 0, if there was no match. +- index (starting from 1) of the leftmost found needle. Otherwise 0, if there was no match. [UInt8](../data-types/int-uint.md). **Example** @@ -560,8 +558,7 @@ multiSearchFirstIndexUTF8(haystack, [needle1, needle2, ..., needleN]) **Returned value** -- index (starting from 1) of the leftmost found needle. -- 0, if there was no match. +- index (starting from 1) of the leftmost found needle, Otherwise 0, if there was no match. [UInt8](../data-types/int-uint.md). **Example** @@ -596,8 +593,7 @@ multiSearchFirstIndexCaseInsensitiveUTF8(haystack, [needle1, needle2, ..., needl **Returned value** -- index (starting from 1) of the leftmost found needle. -- 0, if there was no match. +- index (starting from 1) of the leftmost found needle. Otherwise 0, if there was no match. [UInt8](../data-types/int-uint.md). **Example** @@ -1340,7 +1336,7 @@ countSubstrings(haystack, needle[, start_pos]) - `needle` — Substring to be searched. [String](../../sql-reference/syntax.md#syntax-string-literal). - `start_pos` – Position (1-based) in `haystack` at which the search starts. [UInt](../data-types/int-uint.md). Optional. -**Returned values** +**Returned value** - The number of occurrences. [UInt64](../data-types/int-uint.md). @@ -1387,7 +1383,7 @@ countSubstringsCaseInsensitive(haystack, needle[, start_pos]) - `needle` — Substring to be searched. [String](../../sql-reference/syntax.md#syntax-string-literal). - `start_pos` – Position (1-based) in `haystack` at which the search starts. [UInt](../data-types/int-uint.md). Optional. -**Returned values** +**Returned value** - The number of occurrences. [UInt64](../data-types/int-uint.md). 
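For reference, a minimal sketch of the case-insensitive variant; occurrences are counted in the same non-overlapping way as in `countSubstrings`:

```sql
SELECT
    countSubstringsCaseInsensitive('AAAA', 'aa') AS total,                          -- non-overlapping matches
    countSubstringsCaseInsensitive('abc__ABC__abc', 'abc', 4) AS from_position_4;   -- matches at or after position 4
```

Both columns should return `2` here.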
@@ -1439,7 +1435,7 @@ countSubstringsCaseInsensitiveUTF8(haystack, needle[, start_pos]) - `needle` — Substring to be searched. [String](../../sql-reference/syntax.md#syntax-string-literal). - `start_pos` – Position (1-based) in `haystack` at which the search starts. [UInt](../data-types/int-uint.md). Optional. -**Returned values** +**Returned value** - The number of occurrences. [UInt64](../data-types/int-uint.md). @@ -1573,7 +1569,7 @@ Alias: `REGEXP_EXTRACT(haystack, pattern[, index])`. - `pattern` — String, regexp expression, must be constant. [String](../../sql-reference/syntax.md#syntax-string-literal). - `index` – An integer number greater or equal 0 with default 1. It represents which regex group to extract. [UInt or Int](../data-types/int-uint.md). Optional. -**Returned values** +**Returned value** `pattern` may contain multiple regexp groups, `index` indicates which regex group to extract. An index of 0 means matching the entire regular expression. [String](../data-types/string.md). @@ -1612,10 +1608,9 @@ hasSubsequence(haystack, needle) - `haystack` — String in which the search is performed. [String](../../sql-reference/syntax.md#syntax-string-literal). - `needle` — Subsequence to be searched. [String](../../sql-reference/syntax.md#syntax-string-literal). -**Returned values** +**Returned value** -- 1, if needle is a subsequence of haystack. [UInt8](../data-types/int-uint.md). -- 0, otherwise. [UInt8](../data-types/int-uint.md). +- 1, if needle is a subsequence of haystack, 0 otherwise. [UInt8](../data-types/int-uint.md). **Examples** @@ -1648,10 +1643,9 @@ hasSubsequenceCaseInsensitive(haystack, needle) - `haystack` — String in which the search is performed. [String](../../sql-reference/syntax.md#syntax-string-literal). - `needle` — Subsequence to be searched. [String](../../sql-reference/syntax.md#syntax-string-literal). -**Returned values** +**Returned value** -- 1, if needle is a subsequence of haystack. [UInt8](../data-types/int-uint.md). -- 0, otherwise. [UInt8](../data-types/int-uint.md). +- 1, if needle is a subsequence of haystack, 0 otherwise [UInt8](../data-types/int-uint.md). **Examples** @@ -1684,10 +1678,9 @@ hasSubsequenceUTF8(haystack, needle) - `haystack` — String in which the search is performed. UTF-8 encoded [String](../../sql-reference/syntax.md#syntax-string-literal). - `needle` — Subsequence to be searched. UTF-8 encoded [String](../../sql-reference/syntax.md#syntax-string-literal). -**Returned values** +**Returned value** -- 1, if needle is a subsequence of haystack. [UInt8](../data-types/int-uint.md). -- 0, otherwise. [UInt8](../data-types/int-uint.md). +- 1, if needle is a subsequence of haystack, 0, otherwise. [UInt8](../data-types/int-uint.md). Query: @@ -1720,10 +1713,9 @@ hasSubsequenceCaseInsensitiveUTF8(haystack, needle) - `haystack` — String in which the search is performed. UTF-8 encoded [String](../../sql-reference/syntax.md#syntax-string-literal). - `needle` — Subsequence to be searched. UTF-8 encoded [String](../../sql-reference/syntax.md#syntax-string-literal). -**Returned values** +**Returned value** -- 1, if needle is a subsequence of haystack. [UInt8](../data-types/int-uint.md). -- 0, otherwise. [UInt8](../data-types/int-uint.md). +- 1, if needle is a subsequence of haystack, 0 otherwise. [UInt8](../data-types/int-uint.md). **Examples** @@ -1758,8 +1750,7 @@ hasToken(haystack, token) **Returned value** -- 1, if the token is present in the haystack. -- 0, if the token is not present. 
+- 1, if the token is present in the haystack, 0 otherwise. [UInt8](../data-types/int-uint.md). **Implementation details** @@ -1794,9 +1785,7 @@ hasTokenOrNull(haystack, token) **Returned value** -- 1, if the token is present in the haystack. -- 0, if the token is not present in the haystack. -- null, if the token is ill-formed. +- 1, if the token is present in the haystack, 0 if it is not present, and null if the token is ill formed. **Implementation details** @@ -1833,8 +1822,7 @@ hasTokenCaseInsensitive(haystack, token) **Returned value** -- 1, if the token is present in the haystack. -- 0, otherwise. +- 1, if the token is present in the haystack, 0 otherwise. [UInt8](../data-types/int-uint.md). **Implementation details** @@ -1869,9 +1857,7 @@ hasTokenCaseInsensitiveOrNull(haystack, token) **Returned value** -- 1, if the token is present in the haystack. -- 0, if token is not present. -- null, if the token is ill-formed. +- 1, if the token is present in the haystack, 0 if the token is not present, otherwise [`null`](../data-types/nullable.md) if the token is ill-formed. [UInt8](../data-types/int-uint.md). **Implementation details** diff --git a/docs/en/sql-reference/functions/time-series-functions.md b/docs/en/sql-reference/functions/time-series-functions.md index da8ed1f51ba..ce5dea14ec5 100644 --- a/docs/en/sql-reference/functions/time-series-functions.md +++ b/docs/en/sql-reference/functions/time-series-functions.md @@ -79,8 +79,7 @@ seriesPeriodDetectFFT(series); **Returned value** -- A real value equal to the period of series data. [Float64](../data-types/float.md). -- Returns NAN when number of data points are less than four. [nan](../data-types/float.md/#nan-and-inf). +- A real value equal to the period of series data. NaN when number of data points are less than four. [Float64](../data-types/float.md). **Examples** diff --git a/docs/en/sql-reference/functions/url-functions.md b/docs/en/sql-reference/functions/url-functions.md index 130f0147ca1..47890e0b271 100644 --- a/docs/en/sql-reference/functions/url-functions.md +++ b/docs/en/sql-reference/functions/url-functions.md @@ -48,8 +48,7 @@ clickhouse.com **Returned values** -- Host name. If ClickHouse can parse the input string as a URL. [String](../data-types/string.md). -- Empty string. If ClickHouse can’t parse the input string as a URL. [String](../data-types/string.md). +- Host name if ClickHouse can parse the input string as a URL, otherwise an empty string. [String](../data-types/string.md). **Example** @@ -89,8 +88,7 @@ https://clickhouse.com/time/ **Returned values** -- Domain name. If ClickHouse can parse the input string as a URL. [String](../data-types/string.md). -- Empty string. If ClickHouse cannot parse the input string as a URL. [String](../data-types/string.md). +- Domain name if ClickHouse can parse the input string as a URL. Otherwise, an empty string. [String](../data-types/string.md). 
**Example** From 67ff6883fd11422231d029cf5a128dd5b87dbdfa Mon Sep 17 00:00:00 2001 From: Blargian Date: Fri, 24 May 2024 06:51:38 +0200 Subject: [PATCH 320/392] Restore original formatting for logical functions and, or, not, xor --- .../functions/logical-functions.md | 26 ++++++++++++------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/docs/en/sql-reference/functions/logical-functions.md b/docs/en/sql-reference/functions/logical-functions.md index 8448dd4ff12..7222dbeeb0d 100644 --- a/docs/en/sql-reference/functions/logical-functions.md +++ b/docs/en/sql-reference/functions/logical-functions.md @@ -30,9 +30,11 @@ Alias: The [AND operator](../../sql-reference/operators/index.md#logical-and-ope **Returned value** -- `0`, if at least one argument evaluates to `false`. [UInt8](../data-types/int-uint.md) or [Nullable](../data-types/nullable.md)([UInt8](../data-types/int-uint.md)). -- `NULL`, if no argument evaluates to `false` and at least one argument is `NULL`. [NULL](../../sql-reference/syntax.md/#null). -- `1`, otherwise. [UInt8](../data-types/int-uint.md) or [Nullable](../data-types/nullable.md)([UInt8](../data-types/int-uint.md)). +- `0`, if at least one argument evaluates to `false`, +- `NULL`, if no argument evaluates to `false` and at least one argument is `NULL`, +- `1`, otherwise. + +Type: [UInt8](../../sql-reference/data-types/int-uint.md) or [Nullable](../../sql-reference/data-types/nullable.md)([UInt8](../../sql-reference/data-types/int-uint.md)). **Example** @@ -86,7 +88,7 @@ Alias: The [OR operator](../../sql-reference/operators/index.md#logical-or-opera - `0`, if all arguments evaluate to `false`, - `NULL`, if all arguments evaluate to `false` and at least one argument is `NULL`. -Type: [UInt8](../data-types/int-uint.md) or [Nullable](../data-types/nullable.md)([UInt8](../data-types/int-uint.md)). +Type: [UInt8](../../sql-reference/data-types/int-uint.md) or [Nullable](../../sql-reference/data-types/nullable.md)([UInt8](../../sql-reference/data-types/int-uint.md)). **Example** @@ -134,9 +136,11 @@ Alias: The [Negation operator](../../sql-reference/operators/index.md#logical-ne **Returned value** -- `1`, if `val` evaluates to `false`. [UInt8](../data-types/int-uint.md) or [Nullable](../data-types/nullable.md)([UInt8](../data-types/int-uint.md)). -- `0`, if `val` evaluates to `true`. [UInt8](../data-types/int-uint.md) or [Nullable](../data-types/nullable.md)([UInt8](../data-types/int-uint.md)). -- `NULL`, if `val` is `NULL`. [NULL](../../sql-reference/syntax.md/#null). +- `1`, if `val` evaluates to `false`, +- `0`, if `val` evaluates to `true`, +- `NULL`, if `val` is `NULL`. + +Type: [UInt8](../../sql-reference/data-types/int-uint.md) or [Nullable](../../sql-reference/data-types/nullable.md)([UInt8](../../sql-reference/data-types/int-uint.md)). **Example** @@ -168,9 +172,11 @@ xor(val1, val2...) **Returned value** -- `1`, for two values: if one of the values evaluates to `false` and other does not. [UInt8](../data-types/int-uint.md) or [Nullable](../data-types/nullable.md)([UInt8](../data-types/int-uint.md)). -- `0`, for two values: if both values evaluate to `false` or to both `true`. [UInt8](../data-types/int-uint.md) or [Nullable](../data-types/nullable.md)([UInt8](../data-types/int-uint.md)). -- `NULL`, if at least one of the inputs is `NULL`. [NULL](../../sql-reference/syntax.md/#null). 
+- `1`, for two values: if one of the values evaluates to `false` and other does not, +- `0`, for two values: if both values evaluate to `false` or to both `true`, +- `NULL`, if at least one of the inputs is `NULL` + +Type: [UInt8](../../sql-reference/data-types/int-uint.md) or [Nullable](../../sql-reference/data-types/nullable.md)([UInt8](../../sql-reference/data-types/int-uint.md)). **Example** From 3071909aca68d73b0e29660896f883ff759ef48e Mon Sep 17 00:00:00 2001 From: Blargian Date: Fri, 24 May 2024 07:00:47 +0200 Subject: [PATCH 321/392] Revert roundAge to original formatting --- .../sql-reference/functions/rounding-functions.md | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/docs/en/sql-reference/functions/rounding-functions.md b/docs/en/sql-reference/functions/rounding-functions.md index ab344f664fd..c2998a82205 100644 --- a/docs/en/sql-reference/functions/rounding-functions.md +++ b/docs/en/sql-reference/functions/rounding-functions.md @@ -328,14 +328,15 @@ roundAge(num) **Returned value** -- Returns `0`, for $age \lt 1$. [UInt8](../data-types/int-uint.md). -- Returns `17`, for $1 \leq age \leq 17$. [UInt8](../data-types/int-uint.md). -- Returns `18`, for $18 \leq age \leq 24$. [UInt8](../data-types/int-uint.md). -- Returns `25`, for $25 \leq age \leq 34$. [UInt8](../data-types/int-uint.md). -- Returns `35`, for $35 \leq age \leq 44$. [UInt8](../data-types/int-uint.md). -- Returns `45`, for $45 \leq age \leq 54$. [UInt8](../data-types/int-uint.md). -- Returns `55`, for $age \geq 55$. [UInt8](../data-types/int-uint.md). +- Returns `0`, for $age \lt 1$. +- Returns `17`, for $1 \leq age \leq 17$. +- Returns `18`, for $18 \leq age \leq 24$. +- Returns `25`, for $25 \leq age \leq 34$. +- Returns `35`, for $35 \leq age \leq 44$. +- Returns `45`, for $45 \leq age \leq 54$. +- Returns `55`, for $age \geq 55$. +Type: [UInt8](../data-types/int-uint.md). **Example** Query: From b19c5ad13ac56d0e2cf6d0b5361ef7992b18e29b Mon Sep 17 00:00:00 2001 From: Blargian Date: Fri, 24 May 2024 07:01:15 +0200 Subject: [PATCH 322/392] Revert roundAge to original formatting --- docs/en/sql-reference/functions/rounding-functions.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/en/sql-reference/functions/rounding-functions.md b/docs/en/sql-reference/functions/rounding-functions.md index c2998a82205..d18185c5013 100644 --- a/docs/en/sql-reference/functions/rounding-functions.md +++ b/docs/en/sql-reference/functions/rounding-functions.md @@ -337,6 +337,7 @@ roundAge(num) - Returns `55`, for $age \geq 55$. Type: [UInt8](../data-types/int-uint.md). + **Example** Query: From 8783647703ec60eb936824c0265a298a33e9ae43 Mon Sep 17 00:00:00 2001 From: Blargian Date: Fri, 24 May 2024 07:03:15 +0200 Subject: [PATCH 323/392] Revert addressToLine to original formatting --- docs/en/sql-reference/functions/introspection.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/docs/en/sql-reference/functions/introspection.md b/docs/en/sql-reference/functions/introspection.md index 5dc57e70591..bec97208843 100644 --- a/docs/en/sql-reference/functions/introspection.md +++ b/docs/en/sql-reference/functions/introspection.md @@ -40,10 +40,12 @@ addressToLine(address_of_binary_instruction) **Returned value** -- Source code filename and the line number in this file delimited by colon. [String](../data-types/string.md). - - For example, `/build/obj-x86_64-linux-gnu/../src/Common/ThreadPool.cpp:199`, where `199` is a line number. 
-- Name of a binary, if the function couldn’t find the debug information. [String](../data-types/string.md). -- Empty string, if the address is not valid. [String](../data-types/string.md). +- Source code filename and the line number in this file delimited by colon. + For example, `/build/obj-x86_64-linux-gnu/../src/Common/ThreadPool.cpp:199`, where `199` is a line number. +- Name of a binary, if the function couldn’t find the debug information. +- Empty string, if the address is not valid. + +Type: [String](../../sql-reference/data-types/string.md). **Example** From c638de90c2d6e0a2aa48d2eadd763ad7aa47e3a7 Mon Sep 17 00:00:00 2001 From: Blargian Date: Fri, 24 May 2024 10:01:06 +0200 Subject: [PATCH 324/392] Fix incorrectly placed :::note blocks --- .../sql-reference/functions/splitting-merging-functions.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/docs/en/sql-reference/functions/splitting-merging-functions.md b/docs/en/sql-reference/functions/splitting-merging-functions.md index a3c28504a29..20d63d84628 100644 --- a/docs/en/sql-reference/functions/splitting-merging-functions.md +++ b/docs/en/sql-reference/functions/splitting-merging-functions.md @@ -27,13 +27,11 @@ splitByChar(separator, s[, max_substrings])) - An array of selected substrings. [Array](../data-types/array.md)([String](../data-types/string.md)). -:::note Empty substrings may be selected when: - A separator occurs at the beginning or end of the string; - There are multiple consecutive separators; - The original string `s` is empty. -::: :::note The behavior of parameter `max_substrings` changed starting with ClickHouse v22.11. In versions older than that, `max_substrings > 0` meant that `max_substring`-many splits were performed and that the remainder of the string was returned as the final element of the list. @@ -80,13 +78,13 @@ splitByString(separator, s[, max_substrings])) - An array of selected substrings. [Array](../data-types/array.md)([String](../data-types/string.md)). -:::note Empty substrings may be selected when: - A non-empty separator occurs at the beginning or end of the string; - There are multiple consecutive non-empty separators; - The original string `s` is empty while the separator is not empty. +:::note Setting [splitby_max_substrings_includes_remaining_string](../../operations/settings/settings.md#splitby_max_substrings_includes_remaining_string) (default: 0) controls if the remaining string is included in the last element of the result array when argument `max_substrings` > 0. ::: @@ -137,13 +135,14 @@ splitByRegexp(regexp, s[, max_substrings])) - An array of selected substrings. [Array](../data-types/array.md)([String](../data-types/string.md)). -:::note + Empty substrings may be selected when: - A non-empty regular expression match occurs at the beginning or end of the string; - There are multiple consecutive non-empty regular expression matches; - The original string `s` is empty while the regular expression is not empty. +:::note Setting [splitby_max_substrings_includes_remaining_string](../../operations/settings/settings.md#splitby_max_substrings_includes_remaining_string) (default: 0) controls if the remaining string is included in the last element of the result array when argument `max_substrings` > 0. 
::: From 480f911c7664c15cccf913b0b7cc3d66645c557c Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 24 May 2024 08:33:44 +0000 Subject: [PATCH 325/392] Fix spelling --- .../aspell-ignore/en/aspell-dict.txt | 117 +++++++++--------- 1 file changed, 59 insertions(+), 58 deletions(-) diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index 1c601bc200a..6df2e426561 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -6,6 +6,7 @@ AMPLab AMQP ANNIndex ANNIndexes +ANOVA AORM APIs ARMv @@ -29,13 +30,6 @@ Alexey AnyEvent AppleClang Approximative -arrayDotProduct -arrayEnumerateDenseRanked -arrayEnumerateUniqRanked -arrayFirstOrNull -arrayLastOrNull -arrayPartialShuffle -arrayShuffle ArrayJoin ArrowStream AsyncInsertCacheSize @@ -53,8 +47,6 @@ AutoFDO AutoML Autocompletion AvroConfluent -analysisOfVariance -ANOVA BIGINT BIGSERIAL BORO @@ -186,7 +178,6 @@ ComplexKeyCache ComplexKeyDirect ComplexKeyHashed Composable -composable Config ConnectionDetails Const @@ -396,8 +387,6 @@ InterserverThreads IsPentagon IsResClassIII IsValid -isNotDistinctFrom -isNullable JBOD JOINed JOINs @@ -466,8 +455,6 @@ KittenHouse Klickhouse Kolmogorov Konstantin -kostik -kostikConsistentHash Korzeniewski Kubernetes LDAP @@ -477,9 +464,8 @@ LLDB LLVM's LOCALTIME LOCALTIMESTAMP -LOONGARCH LONGLONG -LoongArch +LOONGARCH Levenshtein Liao LibFuzzer @@ -497,6 +483,7 @@ LocalThreadActive LogQL Logstash LookML +LoongArch LowCardinality LpDistance LpNorm @@ -571,17 +558,6 @@ MindsDB Mongodb Monotonicity MsgPack -multiSearchAllPositionsCaseInsensitive -multiSearchAllPositionsCaseInsensitiveUTF -multiSearchAnyCaseInsensitive -multiSearchAnyCaseInsensitiveUTF -multiSearchAnyUTF -multiSearchFirstIndexCaseInsensitive -multiSearchFirstIndexCaseInsensitiveUTF -multiSearchFirstIndexUTF -multiSearchFirstPositionCaseInsensitive -multiSearchFirstPositionCaseInsensitiveUTF -multiSearchFirstPositionUTF MultiPolygon Multiline Multiqueries @@ -683,8 +659,8 @@ OSUserTimeNormalized OTLP OUTFILE ObjectId -Observability Oblakov +Observability Octonica Ok OnTime @@ -765,7 +741,6 @@ Promtail Protobuf ProtobufSingle ProxySQL -proportionsZTest Punycode PyArrow PyCharm @@ -886,7 +861,6 @@ Simhash SimpleAggregateFunction SimpleState SipHash -sigmoid Smirnov's Smirnov'test Soundex @@ -932,7 +906,6 @@ TAVG TCPConnection TCPThreads TDigest -ThreadMonotonic TINYINT TLSv TMAX @@ -958,7 +931,6 @@ TablesLoaderForegroundThreads TablesLoaderForegroundThreadsActive TablesToDropQueueSize TargetSpecific -tanh Telegraf TemplateIgnoreSpaces TemporaryFilesForAggregation @@ -968,6 +940,7 @@ TemporaryFilesUnknown Testflows Tgz Theil's +ThreadMonotonic ThreadPoolFSReaderThreads ThreadPoolFSReaderThreadsActive ThreadPoolRemoteFSReaderThreads @@ -1028,7 +1001,6 @@ UncompressedCacheBytes UncompressedCacheCells UnidirectionalEdgeIsValid UniqThetaSketch -unshuffled Updatable Uppercased Uptime @@ -1095,6 +1067,7 @@ activerecord addDate addDays addHours +addInterval addMicroseconds addMilliseconds addMinutes @@ -1102,10 +1075,9 @@ addMonths addNanoseconds addQuarters addSeconds +addTupleOfIntervals addWeeks addYears -addInterval -addTupleOfIntervals addr addressToLine addressToLineWithInlines @@ -1120,6 +1092,7 @@ aiochclient allocator alphaTokens amplab +analysisOfVariance analytics anonymize anonymized @@ -1147,15 +1120,19 @@ arrayCumSum arrayCumSumNonNegative arrayDifference arrayDistinct +arrayDotProduct arrayElement arrayEnumerate 
arrayEnumerateDense +arrayEnumerateDenseRanked arrayEnumerateUniq +arrayEnumerateUniqRanked arrayExists arrayFill arrayFilter arrayFirst arrayFirstIndex +arrayFirstOrNull arrayFlatten arrayFold arrayIntersect @@ -1163,10 +1140,12 @@ arrayJaccardIndex arrayJoin arrayLast arrayLastIndex +arrayLastOrNull arrayMap arrayMax arrayMin arrayPartialReverseSort +arrayPartialShuffle arrayPartialSort arrayPopBack arrayPopFront @@ -1186,6 +1165,7 @@ arrayRotateRight arrayShiftLeft arrayShiftRight arrayShingles +arrayShuffle arraySlice arraySort arraySplit @@ -1367,6 +1347,7 @@ collapsingmergetree combinator combinators comparising +composable compressability concat concatAssumeInjective @@ -1728,8 +1709,8 @@ hasSubsequenceCaseInsensitive hasSubsequenceCaseInsensitiveUTF hasSubsequenceUTF hasSubstr -hasToken hasThreadFuzzer +hasToken hasTokenCaseInsensitive hasTokenCaseInsensitiveOrNull hasTokenOrNull @@ -1802,8 +1783,10 @@ isIPAddressInRange isIPv isInfinite isNaN +isNotDistinctFrom isNotNull isNull +isNullable isValidJSON isValidUTF isZeroOrNull @@ -1855,6 +1838,8 @@ kolmogorovSmirnovTest kolmogorovsmirnovtest kolya konsole +kostik +kostikConsistentHash kurtPop kurtSamp kurtosis @@ -1866,9 +1851,9 @@ laravel largestTriangleThreeBuckets latencies ldap -leftUTF leftPad leftPadUTF +leftUTF lemmatization lemmatize lemmatized @@ -1915,8 +1900,8 @@ logTrace logagent loghouse london -loongarch lookups +loongarch lowcardinality lowerUTF lowercased @@ -1987,8 +1972,8 @@ mispredictions mmap mmapped modularization -moduloOrZero moduli +moduloOrZero mongodb monotonicity monthName @@ -2005,10 +1990,21 @@ multiMatchAllIndices multiMatchAny multiMatchAnyIndex multiSearchAllPositions +multiSearchAllPositionsCaseInsensitive +multiSearchAllPositionsCaseInsensitiveUTF multiSearchAllPositionsUTF multiSearchAny +multiSearchAnyCaseInsensitive +multiSearchAnyCaseInsensitiveUTF +multiSearchAnyUTF multiSearchFirstIndex +multiSearchFirstIndexCaseInsensitive +multiSearchFirstIndexCaseInsensitiveUTF +multiSearchFirstIndexUTF multiSearchFirstPosition +multiSearchFirstPositionCaseInsensitive +multiSearchFirstPositionCaseInsensitiveUTF +multiSearchFirstPositionUTF multibyte multidirectory multiline @@ -2094,6 +2090,7 @@ ok omclickhouse onstraints ontime +onwards openSSL openSUSE openldap @@ -2205,6 +2202,7 @@ procfs profiler proleptic prometheus +proportionsZTest proto protobuf protobufsingle @@ -2343,8 +2341,8 @@ retentions rethrow retransmit retriable -rewritable reverseUTF +rewritable rightPad rightPadUTF rightUTF @@ -2404,8 +2402,9 @@ sharded sharding shortcircuit shortkeys -showCertificate shoutout +showCertificate +sigmoid simdjson simpleJSON simpleJSONExtractBool @@ -2419,8 +2418,8 @@ simpleLinearRegression simpleaggregatefunction simplelinearregression simpod -singlepart singleValueOrNull +singlepart singlevalueornull sinh sipHash @@ -2465,13 +2464,13 @@ statbox stateful stddev stddevPop -stddevSamp -stddevpop -stddevsamp -stddevpopstable stddevPopStable -stddevsampstable +stddevSamp stddevSampStable +stddevpop +stddevpopstable +stddevsamp +stddevsampstable stderr stdin stdout @@ -2532,6 +2531,7 @@ substrings subtitiles subtractDays subtractHours +subtractInterval subtractMicroseconds subtractMilliseconds subtractMinutes @@ -2539,10 +2539,9 @@ subtractMonths subtractNanoseconds subtractQuarters subtractSeconds +subtractTupleOfIntervals subtractWeeks subtractYears -subtractInterval -subtractTupleOfIntervals subtree subtrees subtype @@ -2551,13 +2550,13 @@ sumCount sumKahan sumMap sumMapFiltered +sumMapFilteredWithOverflow 
+sumMapWithOverflow sumWithOverflow sumcount sumkahan summap summapwithoverflow -sumMapWithOverflow -sumMapFilteredWithOverflow summingmergetree sumwithoverflow superaggregates @@ -2580,6 +2579,7 @@ tabseparatedrawwithnames tabseparatedrawwithnamesandtypes tabseparatedwithnames tabseparatedwithnamesandtypes +tanh tcp tcpPort tcpnodelay @@ -2714,18 +2714,18 @@ tupleDivide tupleDivideByNumber tupleElement tupleHammingDistance +tupleIntDiv +tupleIntDivByNumber +tupleIntDivOrZero +tupleIntDivOrZeroByNumber tupleMinus +tupleModulo +tupleModuloByNumber tupleMultiply tupleMultiplyByNumber tupleNegate tuplePlus tupleToNameValuePairs -tupleIntDiv -tupleIntDivByNumber -tupleIntDivOrZero -tupleIntDivOrZeroByNumber -tupleModulo -tupleModuloByNumber turbostat txt typename @@ -2769,6 +2769,7 @@ unrealiable unreplicated unresolvable unrounded +unshuffled untracked untrusted untuple @@ -2779,8 +2780,8 @@ uptime uptrace uring url -urlencoded urlCluster +urlencoded urls usearch userspace From 3e21ff92a38ece0b0ebcf72554e45d33ce612771 Mon Sep 17 00:00:00 2001 From: Max K Date: Fri, 24 May 2024 10:53:19 +0200 Subject: [PATCH 326/392] CI: master workflow with folded jobs --- .github/workflows/master.yml | 825 ++--------------------------- .github/workflows/merge_queue.yml | 6 +- .github/workflows/pull_request.yml | 22 +- 3 files changed, 62 insertions(+), 791 deletions(-) diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml index d2ea714e4e4..11ec484d208 100644 --- a/.github/workflows/master.yml +++ b/.github/workflows/master.yml @@ -53,13 +53,13 @@ jobs: - name: Re-create GH statuses for skipped jobs if any run: | python3 "$GITHUB_WORKSPACE/tests/ci/ci.py" --infile ${{ runner.temp }}/ci_run_data.json --update-gh-statuses - BuildDockers: - needs: [RunConfig] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_docker.yml - with: - data: ${{ needs.RunConfig.outputs.data }} - # Tested in MQ +# Runs in MQ: +# BuildDockers: +# needs: [RunConfig] +# if: ${{ !failure() && !cancelled() }} +# uses: ./.github/workflows/reusable_docker.yml +# with: +# data: ${{ needs.RunConfig.outputs.data }} # StyleCheck: # needs: [RunConfig, BuildDockers] # if: ${{ !failure() && !cancelled() }} @@ -70,262 +70,73 @@ jobs: # data: ${{ needs.RunConfig.outputs.data }} # run_command: | # python3 style_check.py --no-push - CompatibilityCheckX86: - needs: [RunConfig, BuilderDebRelease] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml + + ################################# Main stages ################################# + # for main CI chain + # + Builds_1: + needs: [RunConfig] + if: ${{ !failure() && !cancelled() && contains(fromJson(needs.RunConfig.outputs.data).stages_data.stages_to_do, 'Builds_1') }} + # using callable wf (reusable_stage.yml) allows grouping all nested jobs under a tab + uses: ./.github/workflows/reusable_build_stage.yml with: - test_name: Compatibility check (amd64) - runner_type: style-checker + stage: Builds_1 data: ${{ needs.RunConfig.outputs.data }} - CompatibilityCheckAarch64: - needs: [RunConfig, BuilderDebAarch64] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml + Tests_1: + needs: [RunConfig, Builds_1] + if: ${{ !failure() && !cancelled() && contains(fromJson(needs.RunConfig.outputs.data).stages_data.stages_to_do, 'Tests_1') }} + uses: ./.github/workflows/reusable_test_stage.yml with: - test_name: Compatibility check (aarch64) - runner_type: style-checker + stage: Tests_1 data: ${{ 
needs.RunConfig.outputs.data }} -######################################################################################### -#################################### ORDINARY BUILDS #################################### -######################################################################################### -# TODO: never skip builds! - BuilderDebRelease: - needs: [RunConfig, BuildDockers] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_build.yml + Builds_2: + needs: [RunConfig, Builds_1] + if: ${{ !failure() && !cancelled() && contains(fromJson(needs.RunConfig.outputs.data).stages_data.stages_to_do, 'Builds_2') }} + uses: ./.github/workflows/reusable_build_stage.yml with: - build_name: package_release - checkout_depth: 0 + stage: Builds_2 data: ${{ needs.RunConfig.outputs.data }} - BuilderDebReleaseCoverage: - needs: [RunConfig, BuildDockers] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_build.yml + Tests_2: + needs: [RunConfig, Builds_2] + if: ${{ !failure() && !cancelled() && contains(fromJson(needs.RunConfig.outputs.data).stages_data.stages_to_do, 'Tests_2') }} + uses: ./.github/workflows/reusable_test_stage.yml with: - build_name: package_release_coverage - checkout_depth: 0 + stage: Tests_2 data: ${{ needs.RunConfig.outputs.data }} - BuilderDebAarch64: - needs: [RunConfig, BuildDockers] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_build.yml + # stage for jobs that do not prohibit merge + Tests_3: + needs: [RunConfig, Builds_1] + if: ${{ !failure() && !cancelled() && contains(fromJson(needs.RunConfig.outputs.data).stages_data.stages_to_do, 'Tests_3') }} + uses: ./.github/workflows/reusable_test_stage.yml with: - build_name: package_aarch64 - checkout_depth: 0 + stage: Tests_3 data: ${{ needs.RunConfig.outputs.data }} - BuilderBinRelease: - needs: [RunConfig, BuildDockers] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_build.yml - with: - build_name: binary_release - checkout_depth: 0 # otherwise we will have no info about contributors - data: ${{ needs.RunConfig.outputs.data }} - BuilderDebAsan: - needs: [RunConfig, BuildDockers] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_build.yml - with: - build_name: package_asan - data: ${{ needs.RunConfig.outputs.data }} - BuilderDebUBsan: - needs: [RunConfig, BuildDockers] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_build.yml - with: - build_name: package_ubsan - data: ${{ needs.RunConfig.outputs.data }} - BuilderDebTsan: - needs: [RunConfig, BuildDockers] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_build.yml - with: - build_name: package_tsan - data: ${{ needs.RunConfig.outputs.data }} - BuilderDebMsan: - needs: [RunConfig, BuildDockers] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_build.yml - with: - build_name: package_msan - data: ${{ needs.RunConfig.outputs.data }} - BuilderDebDebug: - needs: [RunConfig, BuildDockers] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_build.yml - with: - build_name: package_debug - data: ${{ needs.RunConfig.outputs.data }} -########################################################################################## -##################################### SPECIAL BUILDS ##################################### -########################################################################################## - BuilderBinClangTidy: - needs: [RunConfig, 
BuilderDebRelease] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_build.yml - with: - build_name: binary_tidy - data: ${{ needs.RunConfig.outputs.data }} - BuilderBinDarwin: - needs: [RunConfig, BuilderDebRelease] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_build.yml - with: - build_name: binary_darwin - data: ${{ needs.RunConfig.outputs.data }} - checkout_depth: 0 - BuilderBinAarch64: - needs: [RunConfig, BuilderDebRelease] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_build.yml - with: - build_name: binary_aarch64 - data: ${{ needs.RunConfig.outputs.data }} - checkout_depth: 0 - BuilderBinFreeBSD: - needs: [RunConfig, BuilderDebRelease] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_build.yml - with: - build_name: binary_freebsd - data: ${{ needs.RunConfig.outputs.data }} - checkout_depth: 0 - BuilderBinDarwinAarch64: - needs: [RunConfig, BuilderDebRelease] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_build.yml - with: - build_name: binary_darwin_aarch64 - data: ${{ needs.RunConfig.outputs.data }} - checkout_depth: 0 - BuilderBinPPC64: - needs: [RunConfig, BuilderDebRelease] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_build.yml - with: - build_name: binary_ppc64le - data: ${{ needs.RunConfig.outputs.data }} - checkout_depth: 0 - BuilderBinAmd64Compat: - needs: [RunConfig, BuilderDebRelease] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_build.yml - with: - build_name: binary_amd64_compat - data: ${{ needs.RunConfig.outputs.data }} - checkout_depth: 0 - BuilderBinAmd64Musl: - needs: [RunConfig, BuilderDebRelease] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_build.yml - with: - build_name: binary_amd64_musl - data: ${{ needs.RunConfig.outputs.data }} - checkout_depth: 0 - BuilderBinAarch64V80Compat: - needs: [RunConfig, BuilderDebRelease] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_build.yml - with: - build_name: binary_aarch64_v80compat - data: ${{ needs.RunConfig.outputs.data }} - checkout_depth: 0 - BuilderBinRISCV64: - needs: [RunConfig, BuilderDebRelease] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_build.yml - with: - build_name: binary_riscv64 - data: ${{ needs.RunConfig.outputs.data }} - checkout_depth: 0 - BuilderBinS390X: - needs: [RunConfig, BuilderDebRelease] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_build.yml - with: - build_name: binary_s390x - data: ${{ needs.RunConfig.outputs.data }} - checkout_depth: 0 - BuilderBinLoongarch64: - needs: [RunConfig, BuilderDebRelease] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_build.yml - with: - build_name: binary_loongarch64 - data: ${{ needs.RunConfig.outputs.data }} - checkout_depth: 0 -############################################################################################ -##################################### Docker images ####################################### -############################################################################################ - DockerServerImage: - needs: [RunConfig, BuilderDebRelease, BuilderDebAarch64] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Docker server image - runner_type: style-checker - data: ${{ needs.RunConfig.outputs.data }} - DockerKeeperImage: - 
needs: [RunConfig, BuilderDebRelease, BuilderDebAarch64] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Docker keeper image - runner_type: style-checker - data: ${{ needs.RunConfig.outputs.data }} -############################################################################################ -##################################### BUILD REPORTER ####################################### -############################################################################################ - BuilderReport: + + ################################# Reports ################################# + # Reports should be run even if Builds_1/2 failed - put them separately in wf (not in Tests_1/2) + Builds_1_Report: # run report check for failed builds to indicate the CI error - if: ${{ !cancelled() }} - needs: - - RunConfig - - BuilderDebAarch64 - - BuilderDebAsan - - BuilderDebDebug - - BuilderDebMsan - - BuilderDebRelease - - BuilderDebTsan - - BuilderDebUBsan + if: ${{ !cancelled() && needs.RunConfig.result == 'success' && contains(fromJson(needs.RunConfig.outputs.data).jobs_data.jobs_to_do, 'ClickHouse build check') }} + needs: [RunConfig, Builds_1] uses: ./.github/workflows/reusable_test.yml with: test_name: ClickHouse build check runner_type: style-checker-aarch64 data: ${{ needs.RunConfig.outputs.data }} - BuilderSpecialReport: + Builds_2_Report: # run report check for failed builds to indicate the CI error - if: ${{ !cancelled() }} - needs: - - RunConfig - - BuilderBinAarch64 - - BuilderBinDarwin - - BuilderBinDarwinAarch64 - - BuilderBinFreeBSD - - BuilderBinPPC64 - - BuilderBinRISCV64 - - BuilderBinS390X - - BuilderBinLoongarch64 - - BuilderBinAmd64Compat - - BuilderBinAarch64V80Compat - - BuilderBinClangTidy - - BuilderBinAmd64Musl - - BuilderDebReleaseCoverage - - BuilderBinRelease + if: ${{ !cancelled() && needs.RunConfig.result == 'success' && contains(fromJson(needs.RunConfig.outputs.data).jobs_data.jobs_to_do, 'ClickHouse special build check') }} + needs: [RunConfig, Builds_2] uses: ./.github/workflows/reusable_test.yml with: test_name: ClickHouse special build check runner_type: style-checker-aarch64 data: ${{ needs.RunConfig.outputs.data }} + MarkReleaseReady: if: ${{ !failure() && !cancelled() }} - needs: - - BuilderBinDarwin - - BuilderBinDarwinAarch64 - - BuilderDebRelease - - BuilderDebAarch64 - runs-on: [self-hosted, style-checker] + needs: [RunConfig, Builds_1] + runs-on: [self-hosted, style-checker-aarch64] steps: - name: Debug run: | @@ -338,7 +149,7 @@ jobs: no both ${{ !(contains(needs.*.result, 'skipped') || contains(needs.*.result, 'failure')) }} EOF - name: Not ready - # fail the job to be able restart it + # fail the job to be able to restart it if: ${{ contains(needs.*.result, 'skipped') || contains(needs.*.result, 'failure') }} run: exit 1 - name: Check out repository code @@ -349,544 +160,14 @@ jobs: run: | cd "$GITHUB_WORKSPACE/tests/ci" python3 mark_release_ready.py -############################################################################################ -#################################### INSTALL PACKAGES ###################################### -############################################################################################ - InstallPackagesTestRelease: - needs: [RunConfig, BuilderDebRelease] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Install packages (amd64) - runner_type: style-checker - data: ${{ needs.RunConfig.outputs.data }} - 
run_command: | - python3 install_check.py "$CHECK_NAME" - InstallPackagesTestAarch64: - needs: [RunConfig, BuilderDebAarch64] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Install packages (arm64) - runner_type: style-checker-aarch64 - data: ${{ needs.RunConfig.outputs.data }} - run_command: | - python3 install_check.py "$CHECK_NAME" -############################################################################################## -########################### FUNCTIONAl STATELESS TESTS ####################################### -############################################################################################## - FunctionalStatelessTestRelease: - needs: [RunConfig, BuilderDebRelease] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Stateless tests (release) - runner_type: func-tester - data: ${{ needs.RunConfig.outputs.data }} - FunctionalStatelessTestReleaseAnalyzerS3Replicated: - needs: [RunConfig, BuilderDebRelease] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Stateless tests (release, old analyzer, s3, DatabaseReplicated) - runner_type: func-tester - data: ${{ needs.RunConfig.outputs.data }} - FunctionalStatelessTestS3Debug: - needs: [RunConfig, BuilderDebDebug] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Stateless tests (debug, s3 storage) - runner_type: func-tester - data: ${{ needs.RunConfig.outputs.data }} - FunctionalStatelessTestS3Tsan: - needs: [RunConfig, BuilderDebTsan] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Stateless tests (tsan, s3 storage) - runner_type: func-tester - data: ${{ needs.RunConfig.outputs.data }} - FunctionalStatelessTestAarch64: - needs: [RunConfig, BuilderDebAarch64] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Stateless tests (aarch64) - runner_type: func-tester-aarch64 - data: ${{ needs.RunConfig.outputs.data }} - FunctionalStatelessTestAsan: - needs: [RunConfig, BuilderDebAsan] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Stateless tests (asan) - runner_type: func-tester - data: ${{ needs.RunConfig.outputs.data }} - FunctionalStatelessTestTsan: - needs: [RunConfig, BuilderDebTsan] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Stateless tests (tsan) - runner_type: func-tester - data: ${{ needs.RunConfig.outputs.data }} - FunctionalStatelessTestMsan: - needs: [RunConfig, BuilderDebMsan] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Stateless tests (msan) - runner_type: func-tester - data: ${{ needs.RunConfig.outputs.data }} - FunctionalStatelessTestUBsan: - needs: [RunConfig, BuilderDebUBsan] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Stateless tests (ubsan) - runner_type: func-tester - data: ${{ needs.RunConfig.outputs.data }} - FunctionalStatelessTestDebug: - needs: [RunConfig, BuilderDebDebug] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Stateless tests (debug) - runner_type: func-tester - data: ${{ needs.RunConfig.outputs.data }} - FunctionalStatelessTestAsanAzure: - needs: 
[RunConfig, BuilderDebAsan] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Stateless tests (azure, asan) - runner_type: func-tester - data: ${{ needs.RunConfig.outputs.data }} -############################################################################################## -############################ FUNCTIONAl STATEFUL TESTS ####################################### -############################################################################################## - FunctionalStatefulTestRelease: - needs: [RunConfig, BuilderDebRelease] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Stateful tests (release) - runner_type: func-tester - data: ${{ needs.RunConfig.outputs.data }} - FunctionalStatefulTestAarch64: - needs: [RunConfig, BuilderDebAarch64] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Stateful tests (aarch64) - runner_type: func-tester-aarch64 - data: ${{ needs.RunConfig.outputs.data }} - FunctionalStatefulTestAsan: - needs: [RunConfig, BuilderDebAsan] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Stateful tests (asan) - runner_type: func-tester - data: ${{ needs.RunConfig.outputs.data }} - FunctionalStatefulTestTsan: - needs: [RunConfig, BuilderDebTsan] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Stateful tests (tsan) - runner_type: func-tester - data: ${{ needs.RunConfig.outputs.data }} - FunctionalStatefulTestMsan: - needs: [RunConfig, BuilderDebMsan] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Stateful tests (msan) - runner_type: func-tester - data: ${{ needs.RunConfig.outputs.data }} - FunctionalStatefulTestUBsan: - needs: [RunConfig, BuilderDebUBsan] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Stateful tests (ubsan) - runner_type: func-tester - data: ${{ needs.RunConfig.outputs.data }} - FunctionalStatefulTestDebug: - needs: [RunConfig, BuilderDebDebug] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Stateful tests (debug) - runner_type: func-tester - data: ${{ needs.RunConfig.outputs.data }} - # Parallel replicas - FunctionalStatefulTestDebugParallelReplicas: - needs: [RunConfig, BuilderDebDebug] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Stateful tests (debug, ParallelReplicas) - runner_type: func-tester - data: ${{ needs.RunConfig.outputs.data }} - FunctionalStatefulTestUBsanParallelReplicas: - needs: [RunConfig, BuilderDebUBsan] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Stateful tests (ubsan, ParallelReplicas) - runner_type: func-tester - data: ${{ needs.RunConfig.outputs.data }} - FunctionalStatefulTestMsanParallelReplicas: - needs: [RunConfig, BuilderDebMsan] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Stateful tests (msan, ParallelReplicas) - runner_type: func-tester - data: ${{ needs.RunConfig.outputs.data }} - FunctionalStatefulTestTsanParallelReplicas: - needs: [RunConfig, BuilderDebTsan] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Stateful 
tests (tsan, ParallelReplicas) - runner_type: func-tester - data: ${{ needs.RunConfig.outputs.data }} - FunctionalStatefulTestAsanParallelReplicas: - needs: [RunConfig, BuilderDebAsan] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Stateful tests (asan, ParallelReplicas) - runner_type: func-tester - data: ${{ needs.RunConfig.outputs.data }} - FunctionalStatefulTestReleaseParallelReplicas: - needs: [RunConfig, BuilderDebRelease] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Stateful tests (release, ParallelReplicas) - runner_type: func-tester - data: ${{ needs.RunConfig.outputs.data }} -############################################################################################## -########################### ClickBench ####################################################### -############################################################################################## - ClickBenchAMD64: - needs: [RunConfig, BuilderDebRelease] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: ClickBench (amd64) - runner_type: func-tester - data: ${{ needs.RunConfig.outputs.data }} - run_command: | - python3 clickbench.py "$CHECK_NAME" - ClickBenchAarch64: - needs: [RunConfig, BuilderDebAarch64] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: ClickBench (aarch64) - runner_type: func-tester-aarch64 - data: ${{ needs.RunConfig.outputs.data }} - run_command: | - python3 clickbench.py "$CHECK_NAME" -############################################################################################## -######################################### STRESS TESTS ####################################### -############################################################################################## - StressTestAsan: - needs: [RunConfig, BuilderDebAsan] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Stress test (asan) - runner_type: stress-tester - data: ${{ needs.RunConfig.outputs.data }} - StressTestTsan: - needs: [RunConfig, BuilderDebTsan] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Stress test (tsan) - runner_type: stress-tester - data: ${{ needs.RunConfig.outputs.data }} - StressTestTsanAzure: - needs: [RunConfig, BuilderDebTsan] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Stress test (azure, tsan) - runner_type: stress-tester - data: ${{ needs.RunConfig.outputs.data }} - StressTestMsan: - needs: [RunConfig, BuilderDebMsan] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Stress test (msan) - runner_type: stress-tester - data: ${{ needs.RunConfig.outputs.data }} - StressTestUBsan: - needs: [RunConfig, BuilderDebUBsan] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Stress test (ubsan) - runner_type: stress-tester - data: ${{ needs.RunConfig.outputs.data }} - StressTestDebug: - needs: [RunConfig, BuilderDebDebug] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Stress test (debug) - runner_type: stress-tester - data: ${{ needs.RunConfig.outputs.data }} 
-############################################################################################# -############################# INTEGRATION TESTS ############################################# -############################################################################################# - IntegrationTestsAsan: - needs: [RunConfig, BuilderDebAsan] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Integration tests (asan) - runner_type: stress-tester - data: ${{ needs.RunConfig.outputs.data }} - IntegrationTestsAnalyzerAsan: - needs: [RunConfig, BuilderDebAsan] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Integration tests (asan, old analyzer) - runner_type: stress-tester - data: ${{ needs.RunConfig.outputs.data }} - IntegrationTestsTsan: - needs: [RunConfig, BuilderDebTsan] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Integration tests (tsan) - runner_type: stress-tester - data: ${{ needs.RunConfig.outputs.data }} - IntegrationTestsRelease: - needs: [RunConfig, BuilderDebRelease] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Integration tests (release) - runner_type: stress-tester - data: ${{ needs.RunConfig.outputs.data }} -############################################################################################## -##################################### AST FUZZERS ############################################ -############################################################################################## - ASTFuzzerTestAsan: - needs: [RunConfig, BuilderDebAsan] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: AST fuzzer (asan) - runner_type: fuzzer-unit-tester - data: ${{ needs.RunConfig.outputs.data }} - ASTFuzzerTestTsan: - needs: [RunConfig, BuilderDebTsan] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: AST fuzzer (tsan) - runner_type: fuzzer-unit-tester - data: ${{ needs.RunConfig.outputs.data }} - ASTFuzzerTestUBSan: - needs: [RunConfig, BuilderDebUBsan] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: AST fuzzer (ubsan) - runner_type: fuzzer-unit-tester - data: ${{ needs.RunConfig.outputs.data }} - ASTFuzzerTestMSan: - needs: [RunConfig, BuilderDebMsan] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: AST fuzzer (msan) - runner_type: fuzzer-unit-tester - data: ${{ needs.RunConfig.outputs.data }} - ASTFuzzerTestDebug: - needs: [RunConfig, BuilderDebDebug] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: AST fuzzer (debug) - runner_type: fuzzer-unit-tester - data: ${{ needs.RunConfig.outputs.data }} -############################################################################################# -#################################### UNIT TESTS ############################################# -############################################################################################# - UnitTestsAsan: - needs: [RunConfig, BuilderDebAsan] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Unit tests (asan) - runner_type: fuzzer-unit-tester - data: ${{ needs.RunConfig.outputs.data }} - UnitTestsReleaseClang: - needs: 
[RunConfig, BuilderBinRelease] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Unit tests (release) - runner_type: fuzzer-unit-tester - data: ${{ needs.RunConfig.outputs.data }} - UnitTestsTsan: - needs: [RunConfig, BuilderDebTsan] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Unit tests (tsan) - runner_type: fuzzer-unit-tester - data: ${{ needs.RunConfig.outputs.data }} - UnitTestsMsan: - needs: [RunConfig, BuilderDebMsan] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Unit tests (msan) - runner_type: fuzzer-unit-tester - data: ${{ needs.RunConfig.outputs.data }} - UnitTestsUBsan: - needs: [RunConfig, BuilderDebUBsan] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Unit tests (ubsan) - runner_type: fuzzer-unit-tester - data: ${{ needs.RunConfig.outputs.data }} -############################################################################################# -#################################### PERFORMANCE TESTS ###################################### -############################################################################################# - PerformanceComparisonX86: - needs: [RunConfig, BuilderDebRelease] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Performance Comparison - runner_type: stress-tester - data: ${{ needs.RunConfig.outputs.data }} - PerformanceComparisonAarch: - needs: [RunConfig, BuilderDebAarch64] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Performance Comparison Aarch64 - runner_type: func-tester-aarch64 - data: ${{ needs.RunConfig.outputs.data }} -############################################################################################## -############################ SQLLOGIC TEST ################################################### -############################################################################################## - SQLLogicTestRelease: - needs: [RunConfig, BuilderDebRelease] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Sqllogic test (release) - runner_type: func-tester - data: ${{ needs.RunConfig.outputs.data }} -############################################################################################## -##################################### SQL TEST ############################################### -############################################################################################## - SQLTest: - needs: [RunConfig, BuilderDebRelease] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: SQLTest - runner_type: fuzzer-unit-tester - data: ${{ needs.RunConfig.outputs.data }} -############################################################################################## -###################################### SQLANCER FUZZERS ###################################### -############################################################################################## - SQLancerTestRelease: - needs: [RunConfig, BuilderDebRelease] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: SQLancer (release) - runner_type: fuzzer-unit-tester - data: ${{ needs.RunConfig.outputs.data }} - SQLancerTestDebug: - needs: [RunConfig, BuilderDebDebug] - 
if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: SQLancer (debug) - runner_type: fuzzer-unit-tester - data: ${{ needs.RunConfig.outputs.data }} FinishCheck: if: ${{ !failure() && !cancelled() }} - needs: - - MarkReleaseReady - - FunctionalStatelessTestDebug - - FunctionalStatelessTestRelease - - FunctionalStatelessTestReleaseAnalyzerS3Replicated - - FunctionalStatelessTestAarch64 - - FunctionalStatelessTestAsan - - FunctionalStatelessTestTsan - - FunctionalStatelessTestMsan - - FunctionalStatelessTestUBsan - - FunctionalStatelessTestS3Debug - - FunctionalStatelessTestS3Tsan - - FunctionalStatefulTestDebug - - FunctionalStatefulTestRelease - - FunctionalStatefulTestAarch64 - - FunctionalStatefulTestAsan - - FunctionalStatefulTestTsan - - FunctionalStatefulTestMsan - - FunctionalStatefulTestUBsan - - FunctionalStatefulTestDebugParallelReplicas - - FunctionalStatefulTestUBsanParallelReplicas - - FunctionalStatefulTestMsanParallelReplicas - - FunctionalStatefulTestTsanParallelReplicas - - FunctionalStatefulTestAsanParallelReplicas - - FunctionalStatefulTestReleaseParallelReplicas - - StressTestDebug - - StressTestAsan - - StressTestTsan - - StressTestMsan - - StressTestUBsan - - IntegrationTestsAsan - - IntegrationTestsAnalyzerAsan - - IntegrationTestsTsan - - IntegrationTestsRelease - - PerformanceComparisonX86 - - PerformanceComparisonAarch - - CompatibilityCheckX86 - - CompatibilityCheckAarch64 - - ASTFuzzerTestDebug - - ASTFuzzerTestAsan - - ASTFuzzerTestTsan - - ASTFuzzerTestMSan - - ASTFuzzerTestUBSan - - UnitTestsAsan - - UnitTestsTsan - - UnitTestsMsan - - UnitTestsUBsan - - UnitTestsReleaseClang - - SQLancerTestRelease - - SQLancerTestDebug - - SQLLogicTestRelease - - SQLTest - runs-on: [self-hosted, style-checker] + needs: [RunConfig, Builds_1, Builds_2, Builds_1_Report, Builds_2_Report, Tests_1, Tests_2, Tests_3] + runs-on: [self-hosted, style-checker-aarch64] steps: - name: Check out repository code uses: ClickHouse/checkout@v1 - with: - clear-repository: true - name: Finish label run: | cd "$GITHUB_WORKSPACE/tests/ci" diff --git a/.github/workflows/merge_queue.yml b/.github/workflows/merge_queue.yml index 97aa0db4cdb..d1b03198485 100644 --- a/.github/workflows/merge_queue.yml +++ b/.github/workflows/merge_queue.yml @@ -20,7 +20,7 @@ jobs: uses: ClickHouse/checkout@v1 with: clear-repository: true # to ensure correct digests - fetch-depth: 0 # to get version + fetch-depth: 0 # to get a version filter: tree:0 - name: Cancel PR workflow run: | @@ -60,7 +60,7 @@ jobs: uses: ./.github/workflows/reusable_test.yml with: test_name: Style check - runner_type: style-checker + runner_type: style-checker-aarch64 run_command: | python3 style_check.py data: ${{ needs.RunConfig.outputs.data }} @@ -85,7 +85,7 @@ jobs: FinishCheck: if: ${{ !failure() && !cancelled() }} needs: [RunConfig, BuildDockers, StyleCheck, FastTest] - runs-on: [self-hosted, style-checker] + runs-on: [self-hosted, style-checker-aarch64] steps: - name: Check out repository code uses: ClickHouse/checkout@v1 diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index 48b4a558580..aa570c3ce2f 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -31,7 +31,7 @@ jobs: uses: ClickHouse/checkout@v1 with: clear-repository: true # to ensure correct digests - fetch-depth: 0 # to get version + fetch-depth: 0 # to get a version filter: tree:0 - name: Cancel Sync PR workflow run: | @@ -78,7 +78,7 @@ jobs: uses: 
./.github/workflows/reusable_test.yml with: test_name: Style check - runner_type: style-checker + runner_type: style-checker-aarch64 run_command: | python3 style_check.py data: ${{ needs.RunConfig.outputs.data }} @@ -98,13 +98,13 @@ jobs: run_command: | python3 fast_test_check.py - ################################# Main statges ################################# + ################################# Main stages ################################# # for main CI chain # Builds_1: needs: [RunConfig, StyleCheck, FastTest] if: ${{ !failure() && !cancelled() && contains(fromJson(needs.RunConfig.outputs.data).stages_data.stages_to_do, 'Builds_1') }} - # using callable wf (reusable_stage.yml) allows to group all nested jobs under a tab + # using callable wf (reusable_stage.yml) allows grouping all nested jobs under a tab uses: ./.github/workflows/reusable_build_stage.yml with: stage: Builds_1 @@ -112,7 +112,6 @@ jobs: Tests_1: needs: [RunConfig, Builds_1] if: ${{ !failure() && !cancelled() && contains(fromJson(needs.RunConfig.outputs.data).stages_data.stages_to_do, 'Tests_1') }} - # using callable wf (reusable_stage.yml) allows to group all nested jobs under a tab uses: ./.github/workflows/reusable_test_stage.yml with: stage: Tests_1 @@ -120,7 +119,6 @@ jobs: Builds_2: needs: [RunConfig, Builds_1] if: ${{ !failure() && !cancelled() && contains(fromJson(needs.RunConfig.outputs.data).stages_data.stages_to_do, 'Builds_2') }} - # using callable wf (reusable_stage.yml) allows to group all nested jobs under a tab uses: ./.github/workflows/reusable_build_stage.yml with: stage: Builds_2 @@ -128,7 +126,6 @@ jobs: Tests_2: needs: [RunConfig, Builds_2] if: ${{ !failure() && !cancelled() && contains(fromJson(needs.RunConfig.outputs.data).stages_data.stages_to_do, 'Tests_2') }} - # using callable wf (reusable_stage.yml) allows to group all nested jobs under a tab uses: ./.github/workflows/reusable_test_stage.yml with: stage: Tests_2 @@ -182,7 +179,7 @@ jobs: FinishCheck: if: ${{ !failure() && !cancelled() }} needs: [RunConfig, BuildDockers, StyleCheck, FastTest, Builds_1, Builds_2, Builds_1_Report, Builds_2_Report, Tests_1, Tests_2, Tests_3] - runs-on: [self-hosted, style-checker] + runs-on: [self-hosted, style-checker-aarch64] steps: - name: Check out repository code uses: ClickHouse/checkout@v1 @@ -192,13 +189,6 @@ jobs: run: | cd "$GITHUB_WORKSPACE/tests/ci" python3 finish_check.py - # FIXME: merge on approval does not work with MQ. 
Could be fixed by using defaul GH's automerge after some corrections in Mergeable Check status - # - name: Auto merge if approved - # if: ${{ github.event_name != 'merge_group' }} - # run: | - # cd "$GITHUB_WORKSPACE/tests/ci" - # python3 merge_pr.py --check-approved - ############################################################################################# ###################################### JEPSEN TESTS ######################################### @@ -216,5 +206,5 @@ jobs: uses: ./.github/workflows/reusable_test.yml with: test_name: ClickHouse Keeper Jepsen - runner_type: style-checker + runner_type: style-checker-aarch64 data: ${{ needs.RunConfig.outputs.data }} From d4fb2d50e95762838b46356a79e7ba8ecd3e4c5e Mon Sep 17 00:00:00 2001 From: Max K Date: Fri, 24 May 2024 11:36:28 +0200 Subject: [PATCH 327/392] CI: Sync, Merge check, CI gh's statuses fixes --- .github/workflows/master.yml | 21 +++++++++++---------- .github/workflows/pull_request.yml | 7 +++++-- tests/ci/ci.py | 29 +++++++++++++++++++++++++++-- tests/ci/commit_status_helper.py | 13 +++++++------ tests/ci/finish_check.py | 2 +- tests/ci/merge_pr.py | 1 - 6 files changed, 51 insertions(+), 22 deletions(-) diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml index 11ec484d208..7c55098bdfd 100644 --- a/.github/workflows/master.yml +++ b/.github/workflows/master.yml @@ -27,15 +27,16 @@ jobs: run: | cd "$GITHUB_WORKSPACE/tests/ci" python3 sync_pr.py --merge || : - - name: Python unit tests - run: | - cd "$GITHUB_WORKSPACE/tests/ci" - echo "Testing the main ci directory" - python3 -m unittest discover -s . -p 'test_*.py' - for dir in *_lambda/; do - echo "Testing $dir" - python3 -m unittest discover -s "$dir" -p 'test_*.py' - done +# Runs in MQ: +# - name: Python unit tests +# run: | +# cd "$GITHUB_WORKSPACE/tests/ci" +# echo "Testing the main ci directory" +# python3 -m unittest discover -s . 
-p 'test_*.py' +# for dir in *_lambda/; do +# echo "Testing $dir" +# python3 -m unittest discover -s "$dir" -p 'test_*.py' +# done - name: PrepareRunConfig id: runconfig run: | @@ -162,7 +163,7 @@ jobs: python3 mark_release_ready.py FinishCheck: - if: ${{ !failure() && !cancelled() }} + if: ${{ !cancelled() }} needs: [RunConfig, Builds_1, Builds_2, Builds_1_Report, Builds_2_Report, Tests_1, Tests_2, Tests_3] runs-on: [self-hosted, style-checker-aarch64] steps: diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index aa570c3ce2f..7d22554473e 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -33,9 +33,12 @@ jobs: clear-repository: true # to ensure correct digests fetch-depth: 0 # to get a version filter: tree:0 - - name: Cancel Sync PR workflow + - name: Cancel previous Sync PR workflow run: | python3 "$GITHUB_WORKSPACE/tests/ci/ci.py" --cancel-previous-run + - name: Set pending Sync status + run: | + python3 "$GITHUB_WORKSPACE/tests/ci/ci.py" --set-pending-status - name: Labels check run: | cd "$GITHUB_WORKSPACE/tests/ci" @@ -177,7 +180,7 @@ jobs: ################################# Stage Final ################################# # FinishCheck: - if: ${{ !failure() && !cancelled() }} + if: ${{ !cancelled() }} needs: [RunConfig, BuildDockers, StyleCheck, FastTest, Builds_1, Builds_2, Builds_1_Report, Builds_2_Report, Tests_1, Tests_2, Tests_3] runs-on: [self-hosted, style-checker-aarch64] steps: diff --git a/tests/ci/ci.py b/tests/ci/ci.py index 4afd3f46f9d..fc25bee354d 100644 --- a/tests/ci/ci.py +++ b/tests/ci/ci.py @@ -17,7 +17,7 @@ from typing import Any, Dict, List, Optional, Sequence, Set, Tuple, Union import docker_images_helper import upload_result_helper from build_check import get_release_or_pr -from ci_config import CI_CONFIG, Build, CILabels, CIStages, JobNames +from ci_config import CI_CONFIG, Build, CILabels, CIStages, JobNames, StatusNames from ci_utils import GHActions, is_hex, normalize_string from clickhouse_helper import ( CiLogsCredentials, @@ -52,7 +52,7 @@ from git_helper import GIT_PREFIX, Git from git_helper import Runner as GitRunner from github_helper import GitHub from pr_info import PRInfo -from report import ERROR, SUCCESS, BuildResult, JobReport +from report import ERROR, SUCCESS, BuildResult, JobReport, PENDING from s3_helper import S3Helper from ci_metadata import CiMetadata from version_helper import get_version_from_repo @@ -996,6 +996,11 @@ def parse_args(parser: argparse.ArgumentParser) -> argparse.Namespace: action="store_true", help="Action that cancels previous running PR workflow if PR added into the Merge Queue", ) + parser.add_argument( + "--set-pending-status", + action="store_true", + help="Action to set needed pending statuses in the beginning of CI workflow, e.g. for Sync wf", + ) parser.add_argument( "--configure", action="store_true", @@ -1930,6 +1935,19 @@ def _cancel_pr_wf(s3: S3Helper, pr_number: int, cancel_sync: bool = False) -> No ) +def _set_pending_statuses(pr_info: PRInfo) -> None: + commit = get_commit(GitHub(get_best_robot_token(), per_page=100), pr_info.sha) + try: + commit.create_status( + state=PENDING, + target_url="", + description="", + context=StatusNames.SYNC, + ) + except Exception as ex: + print(f"ERROR: failed to set GH commit status, ex: {ex}") + + def main() -> int: logging.basicConfig(level=logging.INFO) exit_code = 0 @@ -2265,6 +2283,13 @@ def main() -> int: else: assert False, "BUG! 
Not supported scenario" + ### SET PENDING STATUS + elif args.cancel_previous_run: + if pr_info.is_pr: + _set_pending_statuses(pr_info) + else: + assert False, "BUG! Not supported scenario" + ### print results _print_results(result, args.outfile, args.pretty) diff --git a/tests/ci/commit_status_helper.py b/tests/ci/commit_status_helper.py index e1c47353743..22cc0085781 100644 --- a/tests/ci/commit_status_helper.py +++ b/tests/ci/commit_status_helper.py @@ -433,11 +433,8 @@ def set_mergeable_check( commit: Commit, description: str = "", state: StatusType = SUCCESS, - hide_url: bool = False, ) -> CommitStatus: - report_url = GITHUB_RUN_URL - if hide_url: - report_url = "" + report_url = "" return post_commit_status( commit, state, @@ -469,7 +466,6 @@ def update_mergeable_check(commit: Commit, pr_info: PRInfo, check_name: str) -> def trigger_mergeable_check( commit: Commit, statuses: CommitStatuses, - hide_url: bool = False, set_if_green: bool = False, workflow_failed: bool = False, ) -> StatusType: @@ -484,9 +480,12 @@ def trigger_mergeable_check( success = [] fail = [] + pending = [] for status in required_checks: if status.state == SUCCESS: success.append(status.context) + elif status.state == PENDING: + pending.append(status.context) else: fail.append(status.context) @@ -503,6 +502,8 @@ def trigger_mergeable_check( elif workflow_failed: description = "check workflow failures" state = FAILURE + elif pending: + description = "pending: " + ", ".join(pending) description = format_description(description) if not set_if_green and state == SUCCESS: @@ -510,7 +511,7 @@ def trigger_mergeable_check( pass else: if mergeable_status is None or mergeable_status.description != description: - set_mergeable_check(commit, description, state, hide_url) + set_mergeable_check(commit, description, state) return state diff --git a/tests/ci/finish_check.py b/tests/ci/finish_check.py index 1a7000f5353..130973ee8ff 100644 --- a/tests/ci/finish_check.py +++ b/tests/ci/finish_check.py @@ -67,7 +67,7 @@ def main(): if status.state == PENDING: post_commit_status( commit, - SUCCESS, + state, # map Mergeable Check status to CI Running status.target_url, "All checks finished", StatusNames.CI, diff --git a/tests/ci/merge_pr.py b/tests/ci/merge_pr.py index 500de4eb718..e1c7bf94ff5 100644 --- a/tests/ci/merge_pr.py +++ b/tests/ci/merge_pr.py @@ -250,7 +250,6 @@ def main(): trigger_mergeable_check( commit, statuses, - hide_url=False, set_if_green=True, workflow_failed=(args.wf_status != "success"), ) From 22b441ed40034280d80506150f9f4969966a3f87 Mon Sep 17 00:00:00 2001 From: Max K Date: Fri, 24 May 2024 11:46:50 +0200 Subject: [PATCH 328/392] fix PR template --- .github/PULL_REQUEST_TEMPLATE.md | 64 +++++++++++++++----------------- tests/ci/ci.py | 3 +- 2 files changed, 31 insertions(+), 36 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 64dc9049bc2..663b464d002 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -46,42 +46,36 @@ At a minimum, the following information should be added (but add more as needed) **NOTE:** If your merge the PR with modified CI you **MUST KNOW** what you are doing **NOTE:** Checked options will be applied if set before CI RunConfig/PrepareRunConfig step - -#### Run these jobs only (required builds will be added automatically): -- [ ] Integration Tests -- [ ] Stateless tests -- [ ] Stateful tests -- [ ] Unit tests -- [ ] Performance tests -- [ ] All with aarch64 -- [ ] All with ASAN -- [ ] All with TSAN -- [ ] 
All with Analyzer -- [ ] All with Azure -- [ ] Add your option here - -#### Deny these jobs: -- [ ] Fast test -- [ ] Integration Tests -- [ ] Stateless tests -- [ ] Stateful tests -- [ ] Performance tests -- [ ] All with ASAN -- [ ] All with TSAN -- [ ] All with MSAN -- [ ] All with UBSAN -- [ ] All with Coverage -- [ ] All with Aarch64 - -#### Extra options: +--- +- [ ] Allow: Integration Tests +- [ ] Allow:: Stateless tests +- [ ] Allow: Stateful tests +- [ ] Allow: Unit tests +- [ ] Allow: Performance tests +- [ ] Allow: All with aarch64 +- [ ] Allow: All with ASAN +- [ ] Allow: All with TSAN +- [ ] Allow: All with Analyzer +- [ ] Allow: All with Azure +- [ ] Allow: Add your option here +--- +- [ ] Exclude: Fast test +- [ ] Exclude: Integration Tests +- [ ] Exclude: Stateless tests +- [ ] Exclude: Stateful tests +- [ ] Exclude: Performance tests +- [ ] Exclude: All with ASAN +- [ ] Exclude: All with TSAN +- [ ] Exclude: All with MSAN +- [ ] Exclude: All with UBSAN +- [ ] Exclude: All with Coverage +- [ ] Exclude: All with Aarch64 +--- - [ ] do not test (only style check) - [ ] disable merge-commit (no merge from master before tests) - [ ] disable CI cache (job reuse) - -#### Only specified batches in multi-batch jobs: -- [ ] 1 -- [ ] 2 -- [ ] 3 -- [ ] 4 - +- [ ] only batch 1 for multi-batch jobs +- [ ] only batch 2 for multi-batch jobs +- [ ] only batch 3 for multi-batch jobs +- [ ] only batch 4 for multi-batch jobs
diff --git a/tests/ci/ci.py b/tests/ci/ci.py index fc25bee354d..c4e06ccd79a 100644 --- a/tests/ci/ci.py +++ b/tests/ci/ci.py @@ -1938,6 +1938,7 @@ def _cancel_pr_wf(s3: S3Helper, pr_number: int, cancel_sync: bool = False) -> No def _set_pending_statuses(pr_info: PRInfo) -> None: commit = get_commit(GitHub(get_best_robot_token(), per_page=100), pr_info.sha) try: + print("Set SYNC status to pending") commit.create_status( state=PENDING, target_url="", @@ -2284,7 +2285,7 @@ def main() -> int: assert False, "BUG! Not supported scenario" ### SET PENDING STATUS - elif args.cancel_previous_run: + elif args.set_pending_status: if pr_info.is_pr: _set_pending_statuses(pr_info) else: From a725112c4c7e33ae23e970b2c50f762ca2edea96 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 24 May 2024 10:10:39 +0000 Subject: [PATCH 329/392] Fix different hashes for reading/writing from/to query cache --- src/Interpreters/executeQuery.cpp | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/Interpreters/executeQuery.cpp b/src/Interpreters/executeQuery.cpp index 0b5f68f27f6..59d012a0a0e 100644 --- a/src/Interpreters/executeQuery.cpp +++ b/src/Interpreters/executeQuery.cpp @@ -1093,6 +1093,15 @@ static std::tuple executeQueryImpl( && (ast->as() || ast->as()); QueryCache::Usage query_cache_usage = QueryCache::Usage::None; + /// If the query runs with "use_query_cache = 1", we first probe if the query cache already contains the query result (if yes: + /// return result from cache). If doesn't, we execute the query normally and write the result into the query cache. Both steps use a + /// hash of the AST, the current database and the settings as cache key. Unfortunately, the settings are in some places internally + /// modified between steps 1 and 2 (= during query execution) - this is silly but hard to forbid. As a result, the hashes no longer + /// match and the cache is rendered ineffective. Therefore make a copy of the settings and use it for steps 1 and 2. 
+ std::optional settings_copy; + if (can_use_query_cache) + settings_copy = settings; + if (!async_insert) { /// If it is a non-internal SELECT, and passive (read) use of the query cache is enabled, and the cache knows the query, then set @@ -1101,7 +1110,7 @@ static std::tuple executeQueryImpl( { if (can_use_query_cache && settings.enable_reads_from_query_cache) { - QueryCache::Key key(ast, context->getCurrentDatabase(), settings, context->getUserID(), context->getCurrentRoles()); + QueryCache::Key key(ast, context->getCurrentDatabase(), *settings_copy, context->getUserID(), context->getCurrentRoles()); QueryCache::Reader reader = query_cache->createReader(key); if (reader.hasCacheEntryForKey()) { @@ -1224,7 +1233,7 @@ static std::tuple executeQueryImpl( && (!ast_contains_system_tables || system_table_handling == QueryCacheSystemTableHandling::Save)) { QueryCache::Key key( - ast, context->getCurrentDatabase(), settings, res.pipeline.getHeader(), + ast, context->getCurrentDatabase(), *settings_copy, res.pipeline.getHeader(), context->getUserID(), context->getCurrentRoles(), settings.query_cache_share_between_users, std::chrono::system_clock::now() + std::chrono::seconds(settings.query_cache_ttl), From 0e758722c6da7044fcb2c8958f175a8321c056a5 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 24 May 2024 10:19:01 +0000 Subject: [PATCH 330/392] Enable 02494_query_cache_nested_query_bug for Analyzer --- .../0_stateless/02494_query_cache_nested_query_bug.reference | 2 +- tests/queries/0_stateless/02494_query_cache_nested_query_bug.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/02494_query_cache_nested_query_bug.reference b/tests/queries/0_stateless/02494_query_cache_nested_query_bug.reference index 389e2621455..b261da18d51 100644 --- a/tests/queries/0_stateless/02494_query_cache_nested_query_bug.reference +++ b/tests/queries/0_stateless/02494_query_cache_nested_query_bug.reference @@ -1,2 +1,2 @@ -2 +1 0 diff --git a/tests/queries/0_stateless/02494_query_cache_nested_query_bug.sh b/tests/queries/0_stateless/02494_query_cache_nested_query_bug.sh index 8712c7c84c6..15015761295 100755 --- a/tests/queries/0_stateless/02494_query_cache_nested_query_bug.sh +++ b/tests/queries/0_stateless/02494_query_cache_nested_query_bug.sh @@ -15,7 +15,7 @@ ${CLICKHOUSE_CLIENT} --query "CREATE TABLE tab (a UInt64) ENGINE=MergeTree() ORD ${CLICKHOUSE_CLIENT} --query "INSERT INTO tab VALUES (1) (2) (3)" ${CLICKHOUSE_CLIENT} --query "INSERT INTO tab VALUES (3) (4) (5)" -SETTINGS="SETTINGS use_query_cache=1, max_threads=1, allow_experimental_analyzer=0, merge_tree_read_split_ranges_into_intersecting_and_non_intersecting_injection_probability=0.0" +SETTINGS="SETTINGS use_query_cache=1, max_threads=1, merge_tree_read_split_ranges_into_intersecting_and_non_intersecting_injection_probability=0.0" # Verify that the first query does two aggregations and the second query zero aggregations. Since query cache is currently not integrated # with EXPLAIN PLAN, we need to check the logs. 
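The settings-copy fix in executeQuery.cpp above is easier to see in isolation. The sketch below uses a made-up FakeSettings type and hash function rather than the real Settings or QueryCache::Key classes; it only illustrates why hashing the live settings object once when probing the cache and again when storing the result can produce two different keys if a setting is mutated during execution, while hashing a single snapshot for both steps keeps the keys identical.

/// Standalone illustration; FakeSettings and hashSettings are hypothetical stand-ins.
#include <functional>
#include <iostream>
#include <map>
#include <string>

struct FakeSettings
{
    std::map<std::string, std::string> values;
};

static size_t hashSettings(const FakeSettings & s)
{
    size_t h = 0;
    for (const auto & [name, value] : s.values)
        h ^= std::hash<std::string>{}(name + "=" + value) + 0x9e3779b9 + (h << 6) + (h >> 2);
    return h;
}

int main()
{
    FakeSettings live{{{"max_threads", "8"}}};
    FakeSettings snapshot = live;                 /// plays the role of settings_copy above

    size_t lookup_key = hashSettings(snapshot);   /// step 1: probe the cache before execution

    live.values["max_threads"] = "1";             /// settings mutated while the query runs

    std::cout << "live object reused: " << (lookup_key == hashSettings(live)) << '\n';     /// prints 0, keys diverge
    std::cout << "snapshot reused:    " << (lookup_key == hashSettings(snapshot)) << '\n'; /// prints 1, keys match
}

In the actual diff above, settings_copy is that snapshot: the same copy is passed to QueryCache::Key both when the cache is probed and when the result is written.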
From f1421c9e5c542ed529dd3b225fc06c696a054080 Mon Sep 17 00:00:00 2001 From: Max K Date: Fri, 24 May 2024 12:02:14 +0200 Subject: [PATCH 331/392] style fix --- .github/PULL_REQUEST_TEMPLATE.md | 11 +++++------ tests/ci/commit_status_helper.py | 4 +--- tests/ci/finish_check.py | 4 ++-- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 663b464d002..f9765c1d57b 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -46,9 +46,8 @@ At a minimum, the following information should be added (but add more as needed) **NOTE:** If your merge the PR with modified CI you **MUST KNOW** what you are doing **NOTE:** Checked options will be applied if set before CI RunConfig/PrepareRunConfig step ---- - [ ] Allow: Integration Tests -- [ ] Allow:: Stateless tests +- [ ] Allow: Stateless tests - [ ] Allow: Stateful tests - [ ] Allow: Unit tests - [ ] Allow: Performance tests @@ -74,8 +73,8 @@ At a minimum, the following information should be added (but add more as needed) - [ ] do not test (only style check) - [ ] disable merge-commit (no merge from master before tests) - [ ] disable CI cache (job reuse) -- [ ] only batch 1 for multi-batch jobs -- [ ] only batch 2 for multi-batch jobs -- [ ] only batch 3 for multi-batch jobs -- [ ] only batch 4 for multi-batch jobs +- [ ] allow: batch 1 for multi-batch jobs +- [ ] allow: batch 2 +- [ ] allow: batch 3 +- [ ] allow: batch 4, 5 and 6 diff --git a/tests/ci/commit_status_helper.py b/tests/ci/commit_status_helper.py index 22cc0085781..bdbb0e80653 100644 --- a/tests/ci/commit_status_helper.py +++ b/tests/ci/commit_status_helper.py @@ -20,7 +20,6 @@ from github.Repository import Repository from ci_config import CHECK_DESCRIPTIONS, CheckDescription, StatusNames, is_required from env_helper import ( GITHUB_REPOSITORY, - GITHUB_RUN_URL, GITHUB_UPSTREAM_REPOSITORY, TEMP_PATH, ) @@ -557,13 +556,12 @@ def update_upstream_sync_status( post_commit_status( last_synced_upstream_commit, sync_status, - "", # let's won't expose any urls from cloud + "", "", StatusNames.SYNC, ) trigger_mergeable_check( last_synced_upstream_commit, get_commit_filtered_statuses(last_synced_upstream_commit), - True, set_if_green=can_set_green_mergeable_status, ) diff --git a/tests/ci/finish_check.py b/tests/ci/finish_check.py index 130973ee8ff..269d5aa3175 100644 --- a/tests/ci/finish_check.py +++ b/tests/ci/finish_check.py @@ -15,7 +15,7 @@ from commit_status_helper import ( ) from get_robot_token import get_best_robot_token from pr_info import PRInfo -from report import PENDING, SUCCESS +from report import PENDING from synchronizer_utils import SYNC_BRANCH_PREFIX from env_helper import GITHUB_REPOSITORY, GITHUB_UPSTREAM_REPOSITORY @@ -67,7 +67,7 @@ def main(): if status.state == PENDING: post_commit_status( commit, - state, # map Mergeable Check status to CI Running + state, # map Mergeable Check status to CI Running status.target_url, "All checks finished", StatusNames.CI, From 1f1c2c21b19dc3d29b60f0508b79bceb425585e7 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 24 May 2024 10:32:42 +0000 Subject: [PATCH 332/392] Fix spelling --- utils/check-style/aspell-ignore/en/aspell-dict.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index 6df2e426561..6eae333681d 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ 
b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -1617,6 +1617,8 @@ gcem generateRandom generateRandomStructure generateSeries +generateSnowflakeID +generateSnowflakeIDThreadMonotonic generateULID generateUUIDv geoDistance From 7ccb776ed93196e72485aa0219d7b281ea0f68de Mon Sep 17 00:00:00 2001 From: Max K Date: Fri, 24 May 2024 12:39:35 +0200 Subject: [PATCH 333/392] mcheck fix --- tests/ci/commit_status_helper.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/ci/commit_status_helper.py b/tests/ci/commit_status_helper.py index bdbb0e80653..b17c189c405 100644 --- a/tests/ci/commit_status_helper.py +++ b/tests/ci/commit_status_helper.py @@ -490,11 +490,6 @@ def trigger_mergeable_check( state: StatusType = SUCCESS - if success: - description = ", ".join(success) - else: - description = "awaiting job statuses" - if fail: description = "failed: " + ", ".join(fail) state = FAILURE @@ -503,6 +498,11 @@ def trigger_mergeable_check( state = FAILURE elif pending: description = "pending: " + ", ".join(pending) + state = PENDING + else: + # all good + description = ", ".join(success) + description = format_description(description) if not set_if_green and state == SUCCESS: From 534f996be3ec5baa544b45180fd1ff049eb2cada Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Fri, 24 May 2024 13:07:37 +0200 Subject: [PATCH 334/392] Change input_format_parquet_use_native_reader to 24.6 --- src/Core/SettingsChangesHistory.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index 23f7810835c..9b5bf6b50a5 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -85,6 +85,8 @@ namespace SettingsChangesHistory /// It's used to implement `compatibility` setting (see https://github.com/ClickHouse/ClickHouse/issues/35972) static std::map settings_changes_history = { + {"24.6", {{"input_format_parquet_use_native_reader", false, false, "When reading Parquet files, to use native reader instead of arrow reader."}, + }}, {"24.5", {{"allow_deprecated_functions", true, false, "Allow usage of deprecated functions"}, {"allow_experimental_join_condition", false, false, "Support join with inequal conditions which involve columns from both left and right table. e.g. 
t1.y < t2.y."}, {"input_format_tsv_crlf_end_of_line", false, false, "Enables reading of CRLF line endings with TSV formats"}, @@ -93,7 +95,6 @@ static std::map sett {"cross_join_min_bytes_to_compress", 0, 1_GiB, "A new setting."}, {"http_max_chunk_size", 0, 0, "Internal limitation"}, {"prefer_external_sort_block_bytes", 0, DEFAULT_BLOCK_SIZE * 256, "Prefer maximum block bytes for external sort, reduce the memory usage during merging."}, - {"input_format_parquet_use_native_reader", false, false, "When reading Parquet files, to use native reader instead of arrow reader."}, {"input_format_force_null_for_omitted_fields", false, false, "Disable type-defaults for omitted fields when needed"}, {"cast_string_to_dynamic_use_inference", false, false, "Add setting to allow converting String to Dynamic through parsing"}, {"allow_experimental_dynamic_type", false, false, "Add new experimental Dynamic type"}, From d48fba5b2b4176434242c75121066001846a1e17 Mon Sep 17 00:00:00 2001 From: Jiebin Sun Date: Fri, 19 Apr 2024 00:30:55 +0800 Subject: [PATCH 335/392] Limit the array index of FixedHashTable by min/max If the type of key is 8 bits or 16 bits in aggregation, ClickHouse will use array of 256 or 65536 length to store the key and boost the mergeSingleLevel, rather than key comparison. However, if the key has occupied only small range of the total 65536 cells, most of the cycles are wasted on the `isZero()` to find the next cell which is not zero in iterator++. The solution is to use min/max and update min/max when emplace. Then we can set the upper searching limit to max in iterator++. And just set min as the value of `begin()`, rather than searching the first cell that not equals to 0. We have tested the patch on 2x80 vCPUs server, Query 7 of ClickBench has gained 2.1x performance improvement. Signed-off-by: Jiebin Sun --- src/Common/HashTable/FixedHashTable.h | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/src/Common/HashTable/FixedHashTable.h b/src/Common/HashTable/FixedHashTable.h index 49675aaafbc..d40169028b5 100644 --- a/src/Common/HashTable/FixedHashTable.h +++ b/src/Common/HashTable/FixedHashTable.h @@ -114,6 +114,8 @@ template class FixedHashTable : private boost::noncopyable, protected Allocator, protected Cell::State, protected Size { static constexpr size_t NUM_CELLS = 1ULL << (sizeof(Key) * 8); + size_t min = NUM_CELLS - 1; + size_t max = 0; protected: friend class const_iterator; @@ -169,7 +171,7 @@ protected: ++ptr; /// Skip empty cells in the main buffer. - const auto * buf_end = container->buf + container->NUM_CELLS; + const auto * buf_end= container->buf + container->max + 1; while (ptr < buf_end && ptr->isZero(*container)) ++ptr; @@ -294,14 +296,10 @@ public: const_iterator begin() const { - if (!buf) + if (!buf && min > max) return end(); - const Cell * ptr = buf; - auto buf_end = buf + NUM_CELLS; - while (ptr < buf_end && ptr->isZero(*this)) - ++ptr; - + const Cell * ptr = buf + min; return const_iterator(this, ptr); } @@ -309,21 +307,17 @@ public: iterator begin() { - if (!buf) + if (!buf && min > max) return end(); - Cell * ptr = buf; - auto buf_end = buf + NUM_CELLS; - while (ptr < buf_end && ptr->isZero(*this)) - ++ptr; - + Cell * ptr = buf + min; return iterator(this, ptr); } const_iterator end() const { /// Avoid UBSan warning about adding zero to nullptr. It is valid in C++20 (and earlier) but not valid in C. - return const_iterator(this, buf ? buf + NUM_CELLS : buf); + return const_iterator(this, buf ? 
buf + max + 1: buf); } const_iterator cend() const @@ -333,7 +327,7 @@ public: iterator end() { - return iterator(this, buf ? buf + NUM_CELLS : buf); + return iterator(this, buf ? buf + max + 1 : buf); } @@ -350,6 +344,8 @@ public: new (&buf[x]) Cell(x, *this); inserted = true; + if (x < min) min = x; + if (x > max) max = x; this->increaseSize(); } From 69960a5735fa3f08ddac258e2208d27e2d4e0a01 Mon Sep 17 00:00:00 2001 From: Jiebin Sun Date: Fri, 19 Apr 2024 18:19:25 +0800 Subject: [PATCH 336/392] Fix a bug if the container is empty --- src/Common/HashTable/FixedHashTable.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/Common/HashTable/FixedHashTable.h b/src/Common/HashTable/FixedHashTable.h index d40169028b5..67605417a84 100644 --- a/src/Common/HashTable/FixedHashTable.h +++ b/src/Common/HashTable/FixedHashTable.h @@ -296,7 +296,7 @@ public: const_iterator begin() const { - if (!buf && min > max) + if (!buf || min > max) return end(); const Cell * ptr = buf + min; @@ -307,7 +307,8 @@ public: iterator begin() { - if (!buf && min > max) + /// If the container is empty, the initialization of min/max will not work as min > max. + if (!buf || min > max) return end(); Cell * ptr = buf + min; From 60420f2a8e3809640fd7a6a6b5c26b7b0d9df962 Mon Sep 17 00:00:00 2001 From: Jiebin Sun Date: Thu, 25 Apr 2024 01:53:20 +0800 Subject: [PATCH 337/392] Fix a bug if data will be inserted not by emplace(). --- src/Common/HashTable/FixedHashTable.h | 42 ++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 8 deletions(-) diff --git a/src/Common/HashTable/FixedHashTable.h b/src/Common/HashTable/FixedHashTable.h index 67605417a84..be4f82434b1 100644 --- a/src/Common/HashTable/FixedHashTable.h +++ b/src/Common/HashTable/FixedHashTable.h @@ -171,7 +171,9 @@ protected: ++ptr; /// Skip empty cells in the main buffer. - const auto * buf_end= container->buf + container->max + 1; + const auto * buf_end = container->buf + container->NUM_CELLS; + if (container->min <= container->max) + buf_end = container->buf + container->max + 1; while (ptr < buf_end && ptr->isZero(*container)) ++ptr; @@ -296,10 +298,19 @@ public: const_iterator begin() const { - if (!buf || min > max) + if (!buf) return end(); - const Cell * ptr = buf + min; + const Cell * ptr = buf; + if (min > max) + { + auto buf_end = buf + NUM_CELLS; + while (ptr < buf_end && ptr->isZero(*this)) + ++ptr; + } + else + ptr = buf + min; + return const_iterator(this, ptr); } @@ -307,18 +318,30 @@ public: iterator begin() { - /// If the container is empty, the initialization of min/max will not work as min > max. - if (!buf || min > max) + /// If min > max, it might use emplace to insert the value or the container is empty. + if (!buf) return end(); - Cell * ptr = buf + min; + Cell * ptr = buf; + if (min > max) + { + auto buf_end = buf + NUM_CELLS; + while (ptr < buf_end && ptr->isZero(*this)) + ++ptr; + } + else + ptr = buf + min; + return iterator(this, ptr); } const_iterator end() const { /// Avoid UBSan warning about adding zero to nullptr. It is valid in C++20 (and earlier) but not valid in C. - return const_iterator(this, buf ? buf + max + 1: buf); + if (min > max) + return const_iterator(this, buf ? buf + NUM_CELLS: buf); + else + return const_iterator(this, buf ? buf + max + 1: buf); } const_iterator cend() const @@ -328,7 +351,10 @@ public: iterator end() { - return iterator(this, buf ? buf + max + 1 : buf); + if (min > max) + return iterator(this, buf ? buf + NUM_CELLS: buf); + else + return iterator(this, buf ? 
buf + max + 1: buf); } From 7f960e4e8ad046e4359a9803fb49b49441444bdc Mon Sep 17 00:00:00 2001 From: Jiebin Sun Date: Thu, 9 May 2024 01:13:11 +0800 Subject: [PATCH 338/392] Add the use_emplace_to_insert_data flag. `emplace()` is the only interface to update min/max. If the FixedHashTable.emplace() is not used to revise the hashtable value, then we should not continue the min/max optimization. --- src/Common/HashTable/FixedHashTable.h | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/src/Common/HashTable/FixedHashTable.h b/src/Common/HashTable/FixedHashTable.h index be4f82434b1..25860800f6e 100644 --- a/src/Common/HashTable/FixedHashTable.h +++ b/src/Common/HashTable/FixedHashTable.h @@ -114,6 +114,7 @@ template class FixedHashTable : private boost::noncopyable, protected Allocator, protected Cell::State, protected Size { static constexpr size_t NUM_CELLS = 1ULL << (sizeof(Key) * 8); + bool use_emplace_to_insert_data = true; size_t min = NUM_CELLS - 1; size_t max = 0; @@ -172,7 +173,7 @@ protected: /// Skip empty cells in the main buffer. const auto * buf_end = container->buf + container->NUM_CELLS; - if (container->min <= container->max) + if (container->use_min_max_optimization()) buf_end = container->buf + container->max + 1; while (ptr < buf_end && ptr->isZero(*container)) ++ptr; @@ -302,7 +303,7 @@ public: return end(); const Cell * ptr = buf; - if (min > max) + if (!use_min_max_optimization()) { auto buf_end = buf + NUM_CELLS; while (ptr < buf_end && ptr->isZero(*this)) @@ -323,7 +324,7 @@ public: return end(); Cell * ptr = buf; - if (min > max) + if (!use_min_max_optimization()) { auto buf_end = buf + NUM_CELLS; while (ptr < buf_end && ptr->isZero(*this)) @@ -338,7 +339,7 @@ public: const_iterator end() const { /// Avoid UBSan warning about adding zero to nullptr. It is valid in C++20 (and earlier) but not valid in C. - if (min > max) + if (!use_min_max_optimization()) return const_iterator(this, buf ? buf + NUM_CELLS: buf); else return const_iterator(this, buf ? buf + max + 1: buf); @@ -351,7 +352,7 @@ public: iterator end() { - if (min > max) + if (!use_min_max_optimization()) return iterator(this, buf ? buf + NUM_CELLS: buf); else return iterator(this, buf ? buf + max + 1: buf); @@ -400,6 +401,10 @@ public: bool ALWAYS_INLINE has(const Key & x) const { return !buf[x].isZero(*this); } bool ALWAYS_INLINE has(const Key &, size_t hash_value) const { return !buf[hash_value].isZero(*this); } + /// Decide if we use the min/max optimization. `max < min` means the FixedHashtable is empty. The flag `use_emplace_to_insert_data` + /// will check if the FixedHashTable will use `emplace()` to insert the raw data. 
+ bool ALWAYS_INLINE use_min_max_optimization() const {return ((max >= min) && use_emplace_to_insert_data);} + void write(DB::WriteBuffer & wb) const { Cell::State::write(wb); @@ -456,6 +461,7 @@ public: x.read(rb); new (&buf[place_value]) Cell(x, *this); } + use_emplace_to_insert_data = false; } void readText(DB::ReadBuffer & rb) @@ -478,6 +484,7 @@ public: x.readText(rb); new (&buf[place_value]) Cell(x, *this); } + use_emplace_to_insert_data = false; } size_t size() const { return this->getSize(buf, *this, NUM_CELLS); } @@ -516,7 +523,11 @@ public: } const Cell * data() const { return buf; } - Cell * data() { return buf; } + Cell * data() + { + use_emplace_to_insert_data = false; + return buf; + } #ifdef DBMS_HASH_MAP_COUNT_COLLISIONS size_t getCollisions() const { return 0; } From 4e6f5fba830008091fbb2e62acc7a7e60e193a37 Mon Sep 17 00:00:00 2001 From: Jiebin Sun Date: Fri, 17 May 2024 10:32:41 +0800 Subject: [PATCH 339/392] Update src/Common/HashTable/FixedHashTable.h Add comment by Nikita. Co-authored-by: Nikita Taranov --- src/Common/HashTable/FixedHashTable.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/Common/HashTable/FixedHashTable.h b/src/Common/HashTable/FixedHashTable.h index 25860800f6e..3214c974003 100644 --- a/src/Common/HashTable/FixedHashTable.h +++ b/src/Common/HashTable/FixedHashTable.h @@ -114,7 +114,9 @@ template class FixedHashTable : private boost::noncopyable, protected Allocator, protected Cell::State, protected Size { static constexpr size_t NUM_CELLS = 1ULL << (sizeof(Key) * 8); - bool use_emplace_to_insert_data = true; + /// We maintain min and max values inserted into the hash table to then limit the amount of cells to traverse to the [min; max] range. + /// Both values could be efficiently calculated only within `emplace` calls (and not when we populate the hash table in `read` method for example), so we update them only within `emplace` and track if any other method was called. + bool only_emplace_was_used_to_insert_data = true; size_t min = NUM_CELLS - 1; size_t max = 0; From ca88da11e0e1f96d6e833349130899aa0605263a Mon Sep 17 00:00:00 2001 From: Jiebin Sun Date: Fri, 17 May 2024 10:33:43 +0800 Subject: [PATCH 340/392] Update src/Common/HashTable/FixedHashTable.h Revise the method name by Nikita. Co-authored-by: Nikita Taranov --- src/Common/HashTable/FixedHashTable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Common/HashTable/FixedHashTable.h b/src/Common/HashTable/FixedHashTable.h index 3214c974003..b34f45f0a9a 100644 --- a/src/Common/HashTable/FixedHashTable.h +++ b/src/Common/HashTable/FixedHashTable.h @@ -405,7 +405,7 @@ public: /// Decide if we use the min/max optimization. `max < min` means the FixedHashtable is empty. The flag `use_emplace_to_insert_data` /// will check if the FixedHashTable will use `emplace()` to insert the raw data. 
- bool ALWAYS_INLINE use_min_max_optimization() const {return ((max >= min) && use_emplace_to_insert_data);} + bool ALWAYS_INLINE canUseMinMaxOptimization() const {return ((max >= min) && use_emplace_to_insert_data);} void write(DB::WriteBuffer & wb) const { From d1d57caf0a2b470f7ad9d05b910633d7c08c581e Mon Sep 17 00:00:00 2001 From: Jiebin Sun Date: Fri, 17 May 2024 22:30:51 +0800 Subject: [PATCH 341/392] Generate the seperate function firstPopulatedCell() and lastPopulatedCell() --- src/Common/HashTable/FixedHashTable.h | 66 ++++++++++++--------------- 1 file changed, 28 insertions(+), 38 deletions(-) diff --git a/src/Common/HashTable/FixedHashTable.h b/src/Common/HashTable/FixedHashTable.h index b34f45f0a9a..f842a30e3d8 100644 --- a/src/Common/HashTable/FixedHashTable.h +++ b/src/Common/HashTable/FixedHashTable.h @@ -114,6 +114,7 @@ template class FixedHashTable : private boost::noncopyable, protected Allocator, protected Cell::State, protected Size { static constexpr size_t NUM_CELLS = 1ULL << (sizeof(Key) * 8); + /// We maintain min and max values inserted into the hash table to then limit the amount of cells to traverse to the [min; max] range. /// Both values could be efficiently calculated only within `emplace` calls (and not when we populate the hash table in `read` method for example), so we update them only within `emplace` and track if any other method was called. bool only_emplace_was_used_to_insert_data = true; @@ -175,7 +176,7 @@ protected: /// Skip empty cells in the main buffer. const auto * buf_end = container->buf + container->NUM_CELLS; - if (container->use_min_max_optimization()) + if (container->canUseMinMaxOptimization()) buf_end = container->buf + container->max + 1; while (ptr < buf_end && ptr->isZero(*container)) ++ptr; @@ -304,47 +305,23 @@ public: if (!buf) return end(); - const Cell * ptr = buf; - if (!use_min_max_optimization()) - { - auto buf_end = buf + NUM_CELLS; - while (ptr < buf_end && ptr->isZero(*this)) - ++ptr; - } - else - ptr = buf + min; - - return const_iterator(this, ptr); + return const_iterator(this, firstPopulatedCell()); } const_iterator cbegin() const { return begin(); } iterator begin() { - /// If min > max, it might use emplace to insert the value or the container is empty. if (!buf) return end(); - Cell * ptr = buf; - if (!use_min_max_optimization()) - { - auto buf_end = buf + NUM_CELLS; - while (ptr < buf_end && ptr->isZero(*this)) - ++ptr; - } - else - ptr = buf + min; - - return iterator(this, ptr); + return iterator(this, const_cast(firstPopulatedCell())); } const_iterator end() const { /// Avoid UBSan warning about adding zero to nullptr. It is valid in C++20 (and earlier) but not valid in C. - if (!use_min_max_optimization()) - return const_iterator(this, buf ? buf + NUM_CELLS: buf); - else - return const_iterator(this, buf ? buf + max + 1: buf); + return const_iterator(this, lastPopulatedCell()); } const_iterator cend() const @@ -354,10 +331,7 @@ public: iterator end() { - if (!use_min_max_optimization()) - return iterator(this, buf ? buf + NUM_CELLS: buf); - else - return iterator(this, buf ? buf + max + 1: buf); + return iterator(this, lastPopulatedCell()); } @@ -403,9 +377,25 @@ public: bool ALWAYS_INLINE has(const Key & x) const { return !buf[x].isZero(*this); } bool ALWAYS_INLINE has(const Key &, size_t hash_value) const { return !buf[hash_value].isZero(*this); } - /// Decide if we use the min/max optimization. `max < min` means the FixedHashtable is empty. 
The flag `use_emplace_to_insert_data` - /// will check if the FixedHashTable will use `emplace()` to insert the raw data. - bool ALWAYS_INLINE canUseMinMaxOptimization() const {return ((max >= min) && use_emplace_to_insert_data);} + /// Decide if we use the min/max optimization. `max < min` means the FixedHashtable is empty. The flag `only_emplace_was_used_to_insert_data` + /// will check if the FixedHashTable will only use `emplace()` to insert the raw data. + bool ALWAYS_INLINE canUseMinMaxOptimization() const { return ((max >= min) && only_emplace_was_used_to_insert_data); } + + const Cell * ALWAYS_INLINE firstPopulatedCell() const + { + const Cell * ptr = buf; + if (!canUseMinMaxOptimization()) + { + while (ptr < buf + NUM_CELLS && ptr->isZero(*this)) + ++ptr; + } + else + ptr = buf + min; + + return ptr; + } + + Cell * ALWAYS_INLINE lastPopulatedCell() const { return canUseMinMaxOptimization() ? buf + max + 1 : buf + NUM_CELLS; } void write(DB::WriteBuffer & wb) const { @@ -463,7 +453,7 @@ public: x.read(rb); new (&buf[place_value]) Cell(x, *this); } - use_emplace_to_insert_data = false; + only_emplace_was_used_to_insert_data = false; } void readText(DB::ReadBuffer & rb) @@ -486,7 +476,7 @@ public: x.readText(rb); new (&buf[place_value]) Cell(x, *this); } - use_emplace_to_insert_data = false; + only_emplace_was_used_to_insert_data = false; } size_t size() const { return this->getSize(buf, *this, NUM_CELLS); } @@ -527,7 +517,7 @@ public: const Cell * data() const { return buf; } Cell * data() { - use_emplace_to_insert_data = false; + only_emplace_was_used_to_insert_data = false; return buf; } From d40c5a07becdbaa1652f3860f239e7e83d752f91 Mon Sep 17 00:00:00 2001 From: Jiebin Sun Date: Tue, 21 May 2024 20:31:43 +0800 Subject: [PATCH 342/392] Avoid UBSan warning while buf is nullptr --- src/Common/HashTable/FixedHashTable.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Common/HashTable/FixedHashTable.h b/src/Common/HashTable/FixedHashTable.h index f842a30e3d8..a84391b37e3 100644 --- a/src/Common/HashTable/FixedHashTable.h +++ b/src/Common/HashTable/FixedHashTable.h @@ -321,7 +321,7 @@ public: const_iterator end() const { /// Avoid UBSan warning about adding zero to nullptr. It is valid in C++20 (and earlier) but not valid in C. - return const_iterator(this, lastPopulatedCell()); + return const_iterator(this, buf ? lastPopulatedCell() : buf); } const_iterator cend() const @@ -331,7 +331,7 @@ public: iterator end() { - return iterator(this, lastPopulatedCell()); + return iterator(this, buf ? lastPopulatedCell() : buf); } From a6e06b27d221cfd7f5b7987c2b642487b2a80d01 Mon Sep 17 00:00:00 2001 From: Nikita Fomichev Date: Fri, 24 May 2024 14:17:37 +0200 Subject: [PATCH 343/392] Update description for settings cross_join_min_rows_to_compress and cross_join_min_bytes_to_compress --- src/Core/SettingsChangesHistory.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index 23f7810835c..0521f70a91b 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -89,8 +89,8 @@ static std::map sett {"allow_experimental_join_condition", false, false, "Support join with inequal conditions which involve columns from both left and right table. e.g. 
t1.y < t2.y."}, {"input_format_tsv_crlf_end_of_line", false, false, "Enables reading of CRLF line endings with TSV formats"}, {"output_format_parquet_use_custom_encoder", false, true, "Enable custom Parquet encoder."}, - {"cross_join_min_rows_to_compress", 0, 10000000, "A new setting."}, - {"cross_join_min_bytes_to_compress", 0, 1_GiB, "A new setting."}, + {"cross_join_min_rows_to_compress", 0, 10000000, "Minimal count of rows to compress block in CROSS JOIN. Zero value means - disable this threshold. This block is compressed when any of the two thresholds (by rows or by bytes) are reached."}, + {"cross_join_min_bytes_to_compress", 0, 1_GiB, "Minimal size of block to compress in CROSS JOIN. Zero value means - disable this threshold. This block is compressed when any of the two thresholds (by rows or by bytes) are reached."}, {"http_max_chunk_size", 0, 0, "Internal limitation"}, {"prefer_external_sort_block_bytes", 0, DEFAULT_BLOCK_SIZE * 256, "Prefer maximum block bytes for external sort, reduce the memory usage during merging."}, {"input_format_parquet_use_native_reader", false, false, "When reading Parquet files, to use native reader instead of arrow reader."}, From 3d207039584cb69d9fffe1b3ec923a31fab5f032 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 24 May 2024 12:27:19 +0000 Subject: [PATCH 344/392] Force-enable analyzer so that tests without Analyzer can no longer fail --- tests/queries/0_stateless/02494_query_cache_nested_query_bug.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02494_query_cache_nested_query_bug.sh b/tests/queries/0_stateless/02494_query_cache_nested_query_bug.sh index 15015761295..a5339a098dc 100755 --- a/tests/queries/0_stateless/02494_query_cache_nested_query_bug.sh +++ b/tests/queries/0_stateless/02494_query_cache_nested_query_bug.sh @@ -15,7 +15,7 @@ ${CLICKHOUSE_CLIENT} --query "CREATE TABLE tab (a UInt64) ENGINE=MergeTree() ORD ${CLICKHOUSE_CLIENT} --query "INSERT INTO tab VALUES (1) (2) (3)" ${CLICKHOUSE_CLIENT} --query "INSERT INTO tab VALUES (3) (4) (5)" -SETTINGS="SETTINGS use_query_cache=1, max_threads=1, merge_tree_read_split_ranges_into_intersecting_and_non_intersecting_injection_probability=0.0" +SETTINGS="SETTINGS use_query_cache=1, max_threads=1, allow_experimental_analyzer=1, merge_tree_read_split_ranges_into_intersecting_and_non_intersecting_injection_probability=0.0" # Verify that the first query does two aggregations and the second query zero aggregations. Since query cache is currently not integrated # with EXPLAIN PLAN, we need to check the logs. 
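For context on the settings_changes_history edits above: each release maps to a list of records holding a setting's previous default, new default and a description, and this history is what the compatibility setting uses to restore older defaults. The sketch below is a simplified, self-contained illustration of that lookup and not the actual ClickHouse implementation; in particular it stores defaults as strings and compares version strings lexicographically, which is only adequate for the sample values used here.

#include <iostream>
#include <map>
#include <string>
#include <vector>

/// Illustrative stand-ins for the real SettingsChangesHistory structures.
struct SettingChange
{
    std::string name;
    std::string previous_default;
    std::string new_default;
};

using ChangesHistory = std::map<std::string, std::vector<SettingChange>>;  /// release -> changes

/// Default a setting had at the requested compatibility version: start from the current default
/// and undo every change introduced by releases newer than that version, newest release first.
std::string defaultFor(const ChangesHistory & history, std::string current_default,
                       const std::string & setting, const std::string & compatibility)
{
    for (auto it = history.rbegin(); it != history.rend(); ++it)
    {
        if (it->first <= compatibility)
            break;
        for (const auto & change : it->second)
            if (change.name == setting)
                current_default = change.previous_default;
    }
    return current_default;
}

int main()
{
    ChangesHistory history =
    {
        {"24.5", {{"allow_deprecated_functions", "true", "false"},
                  {"cross_join_min_rows_to_compress", "0", "10000000"}}},
        {"24.6", {{"input_format_parquet_use_native_reader", "false", "false"}}},
    };

    std::cout << defaultFor(history, "false", "allow_deprecated_functions", "24.4") << '\n';  /// true
    std::cout << defaultFor(history, "false", "allow_deprecated_functions", "24.5") << '\n';  /// false
}

Seen through this mechanism, the earlier move of input_format_parquet_use_native_reader into its own 24.6 entry keeps that default change attributed to the release that actually ships it, which is presumably why the entry was relocated rather than edited in place.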
From aada1de796144829b2a6e334764923cef6da4fff Mon Sep 17 00:00:00 2001 From: TTPO100AJIEX Date: Fri, 24 May 2024 15:36:41 +0300 Subject: [PATCH 345/392] Rename function parameters, remove unnecessary virtual --- src/Server/ServersManager/IServersManager.cpp | 8 ++-- src/Server/ServersManager/IServersManager.h | 14 +++---- .../ServersManager/InterServersManager.cpp | 20 +++++----- .../ServersManager/InterServersManager.h | 1 - .../ServersManager/ProtocolServersManager.cpp | 40 +++++++++---------- 5 files changed, 41 insertions(+), 42 deletions(-) diff --git a/src/Server/ServersManager/IServersManager.cpp b/src/Server/ServersManager/IServersManager.cpp index c903d90f766..8b1eee94303 100644 --- a/src/Server/ServersManager/IServersManager.cpp +++ b/src/Server/ServersManager/IServersManager.cpp @@ -17,8 +17,8 @@ extern const int NETWORK_ERROR; extern const int INVALID_CONFIG_PARAMETER; } -IServersManager::IServersManager(ContextMutablePtr l_global_context, Poco::Logger * l_logger) - : global_context(l_global_context), logger(l_logger) +IServersManager::IServersManager(ContextMutablePtr global_context_, Poco::Logger * logger_) + : global_context(global_context_), logger(logger_) { } @@ -107,8 +107,8 @@ void IServersManager::createServer( const Poco::Util::AbstractConfiguration & config, const std::string & listen_host, const char * port_name, - CreateServerFunc && func, - bool start_server) + bool start_server, + CreateServerFunc && func) { /// For testing purposes, user may omit tcp_port or http_port or https_port in configuration file. if (config.getString(port_name, "").empty()) diff --git a/src/Server/ServersManager/IServersManager.h b/src/Server/ServersManager/IServersManager.h index 5218ab63554..7e1d9d50d82 100644 --- a/src/Server/ServersManager/IServersManager.h +++ b/src/Server/ServersManager/IServersManager.h @@ -19,7 +19,7 @@ namespace DB class IServersManager { public: - IServersManager(ContextMutablePtr global_context, Poco::Logger * logger); + IServersManager(ContextMutablePtr global_context_, Poco::Logger * logger_); virtual ~IServersManager() = default; bool empty() const; @@ -35,9 +35,9 @@ public: const ServerType & server_type) = 0; - virtual void startServers(); + void startServers(); - virtual void stopServers(const ServerType & server_type); + void stopServers(const ServerType & server_type); virtual size_t stopServers(const ServerSettings & server_settings, std::mutex & servers_lock) = 0; virtual void updateServers( @@ -58,14 +58,14 @@ protected: const Poco::Util::AbstractConfiguration & config, Poco::Net::ServerSocket & socket, const std::string & host, UInt16 port) const; using CreateServerFunc = std::function; - virtual void createServer( + void createServer( const Poco::Util::AbstractConfiguration & config, const std::string & listen_host, const char * port_name, - CreateServerFunc && func, - bool start_server); + bool start_server, + CreateServerFunc && func); - virtual void stopServersForUpdate(const Poco::Util::AbstractConfiguration & config, ConfigurationPtr latest_config); + void stopServersForUpdate(const Poco::Util::AbstractConfiguration & config, ConfigurationPtr latest_config); Strings getListenHosts(const Poco::Util::AbstractConfiguration & config) const; bool getListenTry(const Poco::Util::AbstractConfiguration & config) const; diff --git a/src/Server/ServersManager/InterServersManager.cpp b/src/Server/ServersManager/InterServersManager.cpp index 28491a4f4f4..4425d468248 100644 --- a/src/Server/ServersManager/InterServersManager.cpp +++ 
b/src/Server/ServersManager/InterServersManager.cpp @@ -71,6 +71,7 @@ void InterServersManager::createServers( config, listen_host, port_name, + /* start_server = */ false, [&](UInt16 port) -> ProtocolServerAdapter { Poco::Net::ServerSocket socket; @@ -92,14 +93,14 @@ void InterServersManager::createServers( false), server_pool, socket)); - }, - /* start_server = */ false); + }); constexpr auto secure_port_name = "keeper_server.tcp_port_secure"; createServer( config, listen_host, secure_port_name, + /* start_server = */ false, [&](UInt16 port) -> ProtocolServerAdapter { # if USE_SSL @@ -128,14 +129,14 @@ void InterServersManager::createServers( ErrorCodes::SUPPORT_IS_DISABLED, "SSL support for TCP protocol is disabled because Poco library was built without NetSSL support."); # endif - }, - /* start_server: */ false); + }); /// HTTP control endpoints createServer( config, listen_host, /* port_name = */ "keeper_server.http_control.port", + /* start_server = */ false, [&](UInt16 port) -> ProtocolServerAdapter { auto http_context = std::make_shared(global_context); @@ -159,8 +160,7 @@ void InterServersManager::createServers( server_pool, socket, http_params)); - }, - /* start_server: */ false); + }); } #else throw Exception( @@ -264,6 +264,7 @@ void InterServersManager::createInterserverServers( config, interserver_listen_host, port_name, + start_servers, [&](UInt16 port) -> ProtocolServerAdapter { Poco::Net::ServerSocket socket; @@ -282,8 +283,7 @@ void InterServersManager::createInterserverServers( http_params, ProfileEvents::InterfaceInterserverReceiveBytes, ProfileEvents::InterfaceInterserverSendBytes)); - }, - start_servers); + }); } if (server_type.shouldStart(ServerType::Type::INTERSERVER_HTTPS)) @@ -293,6 +293,7 @@ void InterServersManager::createInterserverServers( config, interserver_listen_host, port_name, + start_servers, [&](UInt16 port) -> ProtocolServerAdapter { #if USE_SSL @@ -318,8 +319,7 @@ void InterServersManager::createInterserverServers( ErrorCodes::SUPPORT_IS_DISABLED, "SSL support for TCP protocol is disabled because Poco library was built without NetSSL support."); #endif - }, - start_servers); + }); } } } diff --git a/src/Server/ServersManager/InterServersManager.h b/src/Server/ServersManager/InterServersManager.h index 2a389e28c22..8780eae18e0 100644 --- a/src/Server/ServersManager/InterServersManager.h +++ b/src/Server/ServersManager/InterServersManager.h @@ -19,7 +19,6 @@ public: bool start_servers, const ServerType & server_type) override; - using IServersManager::stopServers; size_t stopServers(const ServerSettings & server_settings, std::mutex & servers_lock) override; void updateServers( diff --git a/src/Server/ServersManager/ProtocolServersManager.cpp b/src/Server/ServersManager/ProtocolServersManager.cpp index 17b028eddbb..af57de3ac3c 100644 --- a/src/Server/ServersManager/ProtocolServersManager.cpp +++ b/src/Server/ServersManager/ProtocolServersManager.cpp @@ -99,6 +99,7 @@ void ProtocolServersManager::createServers( config, host, port_name.c_str(), + start_servers, [&](UInt16 port) -> ProtocolServerAdapter { Poco::Net::ServerSocket socket; @@ -110,8 +111,7 @@ void ProtocolServersManager::createServers( port_name.c_str(), description + ": " + address.toString(), std::make_unique(stack.release(), server_pool, socket, new Poco::Net::TCPServerParams)); - }, - start_servers); + }); } } @@ -125,6 +125,7 @@ void ProtocolServersManager::createServers( config, listen_host, port_name, + start_servers, [&](UInt16 port) -> ProtocolServerAdapter { 
Poco::Net::ServerSocket socket; @@ -143,8 +144,7 @@ void ProtocolServersManager::createServers( http_params, ProfileEvents::InterfaceHTTPReceiveBytes, ProfileEvents::InterfaceHTTPSendBytes)); - }, - start_servers); + }); } if (server_type.shouldStart(ServerType::Type::HTTPS)) @@ -155,6 +155,7 @@ void ProtocolServersManager::createServers( config, listen_host, port_name, + start_servers, [&](UInt16 port) -> ProtocolServerAdapter { #if USE_SSL @@ -180,8 +181,7 @@ void ProtocolServersManager::createServers( ErrorCodes::SUPPORT_IS_DISABLED, "HTTPS protocol is disabled because Poco library was built without NetSSL support."); #endif - }, - start_servers); + }); } if (server_type.shouldStart(ServerType::Type::TCP)) @@ -192,6 +192,7 @@ void ProtocolServersManager::createServers( config, listen_host, port_name, + start_servers, [&](UInt16 port) -> ProtocolServerAdapter { Poco::Net::ServerSocket socket; @@ -208,8 +209,7 @@ void ProtocolServersManager::createServers( server_pool, socket, new Poco::Net::TCPServerParams)); - }, - start_servers); + }); } if (server_type.shouldStart(ServerType::Type::TCP_WITH_PROXY)) @@ -220,6 +220,7 @@ void ProtocolServersManager::createServers( config, listen_host, port_name, + start_servers, [&](UInt16 port) -> ProtocolServerAdapter { Poco::Net::ServerSocket socket; @@ -236,8 +237,7 @@ void ProtocolServersManager::createServers( server_pool, socket, new Poco::Net::TCPServerParams)); - }, - start_servers); + }); } if (server_type.shouldStart(ServerType::Type::TCP_SECURE)) @@ -248,6 +248,7 @@ void ProtocolServersManager::createServers( config, listen_host, port_name, + start_servers, [&](UInt16 port) -> ProtocolServerAdapter { #if USE_SSL @@ -271,8 +272,7 @@ void ProtocolServersManager::createServers( ErrorCodes::SUPPORT_IS_DISABLED, "SSL support for TCP protocol is disabled because Poco library was built without NetSSL support."); #endif - }, - start_servers); + }); } if (server_type.shouldStart(ServerType::Type::MYSQL)) @@ -282,6 +282,7 @@ void ProtocolServersManager::createServers( config, listen_host, port_name, + start_servers, [&](UInt16 port) -> ProtocolServerAdapter { Poco::Net::ServerSocket socket; @@ -298,8 +299,7 @@ void ProtocolServersManager::createServers( server_pool, socket, new Poco::Net::TCPServerParams)); - }, - start_servers); + }); } if (server_type.shouldStart(ServerType::Type::POSTGRESQL)) @@ -309,6 +309,7 @@ void ProtocolServersManager::createServers( config, listen_host, port_name, + start_servers, [&](UInt16 port) -> ProtocolServerAdapter { Poco::Net::ServerSocket socket; @@ -325,8 +326,7 @@ void ProtocolServersManager::createServers( server_pool, socket, new Poco::Net::TCPServerParams)); - }, - start_servers); + }); } #if USE_GRPC @@ -337,6 +337,7 @@ void ProtocolServersManager::createServers( config, listen_host, port_name, + start_servers, [&](UInt16 port) -> ProtocolServerAdapter { Poco::Net::SocketAddress server_address(listen_host, port); @@ -345,8 +346,7 @@ void ProtocolServersManager::createServers( port_name, "gRPC protocol: " + server_address.toString(), std::make_unique(server, makeSocketAddress(listen_host, port, logger))); - }, - start_servers); + }); } #endif if (server_type.shouldStart(ServerType::Type::PROMETHEUS)) @@ -357,6 +357,7 @@ void ProtocolServersManager::createServers( config, listen_host, port_name, + start_servers, [&](UInt16 port) -> ProtocolServerAdapter { Poco::Net::ServerSocket socket; @@ -375,8 +376,7 @@ void ProtocolServersManager::createServers( http_params, ProfileEvents::InterfacePrometheusReceiveBytes, 
ProfileEvents::InterfacePrometheusSendBytes)); - }, - start_servers); + }); } } } From 2cc1b27fb5f898a8c728dda03f4dea3941c653b4 Mon Sep 17 00:00:00 2001 From: Nikita Fomichev Date: Fri, 24 May 2024 14:41:04 +0200 Subject: [PATCH 346/392] Update docs for settings cross_join_min_rows_to_compress and cross_join_min_bytes_to_compress --- docs/en/operations/settings/settings.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 2b5cd11819a..b2efe5d2af4 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -5468,3 +5468,15 @@ Defines how MySQL types are converted to corresponding ClickHouse types. A comma - `datetime64`: convert `DATETIME` and `TIMESTAMP` types to `DateTime64` instead of `DateTime` when precision is not `0`. - `date2Date32`: convert `DATE` to `Date32` instead of `Date`. Takes precedence over `date2String`. - `date2String`: convert `DATE` to `String` instead of `Date`. Overridden by `datetime64`. + +## cross_join_min_rows_to_compress + +Minimal count of rows to compress block in CROSS JOIN. Zero value means - disable this threshold. This block is compressed when any of the two thresholds (by rows or by bytes) are reached. + +Default value: `10000000`. + +## cross_join_min_bytes_to_compress + +Minimal size of block to compress in CROSS JOIN. Zero value means - disable this threshold. This block is compressed when any of the two thresholds (by rows or by bytes) are reached. + +Default value: `1GiB`. From 7f450cfbdd7578a0b1519f74ff7998f400793284 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Wed, 22 May 2024 17:17:43 +0000 Subject: [PATCH 347/392] Try add alias to array join. --- src/Analyzer/ArrayJoinNode.cpp | 19 +++++ src/Analyzer/ColumnNode.cpp | 7 +- src/Analyzer/Passes/QueryAnalysisPass.cpp | 69 +++++++++++++++---- src/Analyzer/QueryTreeBuilder.cpp | 4 +- src/Analyzer/createUniqueTableAliases.cpp | 34 +++++++++ src/Parsers/ASTTablesInSelectQuery.cpp | 9 +++ src/Parsers/ASTTablesInSelectQuery.h | 4 ++ src/Parsers/ParserTablesInSelectQuery.cpp | 4 ++ .../QueryPlan/DistributedCreateLocalPlan.cpp | 8 +++ 9 files changed, 144 insertions(+), 14 deletions(-) diff --git a/src/Analyzer/ArrayJoinNode.cpp b/src/Analyzer/ArrayJoinNode.cpp index 59389d4f2a8..9c1eb9dce3e 100644 --- a/src/Analyzer/ArrayJoinNode.cpp +++ b/src/Analyzer/ArrayJoinNode.cpp @@ -24,6 +24,9 @@ void ArrayJoinNode::dumpTreeImpl(WriteBuffer & buffer, FormatState & format_stat buffer << std::string(indent, ' ') << "ARRAY_JOIN id: " << format_state.getNodeId(this); buffer << ", is_left: " << is_left; + if (hasAlias()) + buffer << ", alias: " << getAlias(); + buffer << '\n' << std::string(indent + 2, ' ') << "TABLE EXPRESSION\n"; getTableExpression()->dumpTreeImpl(buffer, format_state, indent + 4); @@ -52,6 +55,8 @@ ASTPtr ArrayJoinNode::toASTImpl(const ConvertToASTOptions & options) const auto array_join_ast = std::make_shared(); array_join_ast->kind = is_left ? 
ASTArrayJoin::Kind::Left : ASTArrayJoin::Kind::Inner; + array_join_ast->setAlias(getAlias()); + auto array_join_expressions_ast = std::make_shared(); const auto & array_join_expressions = getJoinExpressions().getNodes(); @@ -65,7 +70,21 @@ ASTPtr ArrayJoinNode::toASTImpl(const ConvertToASTOptions & options) const else array_join_expression_ast = array_join_expression->toAST(options); + // QueryTreeNodePtr column_source; + // if (column_node) + // column_source = column_node->getColumnSourceOrNull(); + + // if (column_source && column_source->hasAlias()) + // { + // const auto & column_alias = column_node->getAlias(); + // const auto & name_or_alias = column_alias.empty() ? column_node->getColumnName() : column_alias; + + // if (!name_or_alias.starts_with("__")) + // array_join_expression_ast->setAlias(fmt::format("{}.{}", column_source->getAlias(), name_or_alias)); + // } + // else array_join_expression_ast->setAlias(array_join_expression->getAlias()); + array_join_expressions_ast->children.push_back(std::move(array_join_expression_ast)); } diff --git a/src/Analyzer/ColumnNode.cpp b/src/Analyzer/ColumnNode.cpp index 2b514a85121..f76c096a339 100644 --- a/src/Analyzer/ColumnNode.cpp +++ b/src/Analyzer/ColumnNode.cpp @@ -103,10 +103,15 @@ ASTPtr ColumnNode::toASTImpl(const ConvertToASTOptions & options) const if (column_source && options.fully_qualified_identifiers) { auto node_type = column_source->getNodeType(); + + // if (node_type == QueryTreeNodeType::ARRAY_JOIN && column_source->hasAlias()) + // return std::make_shared(std::string(fmt::format("{}.{}", column_source->getAlias(), column.name))); + if (node_type == QueryTreeNodeType::TABLE || node_type == QueryTreeNodeType::TABLE_FUNCTION || node_type == QueryTreeNodeType::QUERY || - node_type == QueryTreeNodeType::UNION) + node_type == QueryTreeNodeType::UNION || + node_type == QueryTreeNodeType::ARRAY_JOIN) { if (column_source->hasAlias()) { diff --git a/src/Analyzer/Passes/QueryAnalysisPass.cpp b/src/Analyzer/Passes/QueryAnalysisPass.cpp index b7c223303eb..f55f6d6c18f 100644 --- a/src/Analyzer/Passes/QueryAnalysisPass.cpp +++ b/src/Analyzer/Passes/QueryAnalysisPass.cpp @@ -1068,10 +1068,25 @@ public: void visitImpl(QueryTreeNodePtr & node) { updateAliasesIfNeeded(node, false /*is_lambda_node*/); + + // if (auto * array_join_node = node->as()) + // { + // for (const auto & elem : array_join_node->getJoinExpressions()) + // { + // for (auto & child : elem->getChildren()) + // { + // // std::cerr << "<<<<<<<<<< " << child->dumpTree() << std::endl; + // visit(child); + // } + // } + // } } bool needChildVisit(const QueryTreeNodePtr &, const QueryTreeNodePtr & child) { + // if (parent->getNodeType() == QueryTreeNodeType::ARRAY_JOIN) + // return false; + if (auto * lambda_node = child->as()) { updateAliasesIfNeeded(child, true /*is_lambda_node*/); @@ -1114,6 +1129,8 @@ private: if (node->getNodeType() == QueryTreeNodeType::WINDOW) return; + // std::cerr << ">>>>>>>>>> " << node->dumpTree() << std::endl; + const auto & alias = node->getAlias(); if (is_lambda_node) @@ -1526,7 +1543,7 @@ private: ProjectionNames resolveFunction(QueryTreeNodePtr & function_node, IdentifierResolveScope & scope); - ProjectionNames resolveExpressionNode(QueryTreeNodePtr & node, IdentifierResolveScope & scope, bool allow_lambda_expression, bool allow_table_expression); + ProjectionNames resolveExpressionNode(QueryTreeNodePtr & node, IdentifierResolveScope & scope, bool allow_lambda_expression, bool allow_table_expression, bool use_alias_table = true); ProjectionNames 
resolveExpressionNodeList(QueryTreeNodePtr & node_list, IdentifierResolveScope & scope, bool allow_lambda_expression, bool allow_table_expression); @@ -3794,6 +3811,8 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveExpressionFromArrayJoinExpressions(con const QueryTreeNodePtr & table_expression_node, IdentifierResolveScope & scope) { + // std::cerr << "tryResolveExpressionFromArrayJoinExpressions " << scope.dump() << std::endl; + const auto & array_join_node = table_expression_node->as(); const auto & array_join_column_expressions_list = array_join_node.getJoinExpressions(); const auto & array_join_column_expressions_nodes = array_join_column_expressions_list.getNodes(); @@ -3871,9 +3890,14 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromArrayJoin(const Identifi const QueryTreeNodePtr & table_expression_node, IdentifierResolveScope & scope) { + // std::cerr << "tryResolveIdentifierFromArrayJoin " << identifier_lookup.identifier.getFullName() << std::endl; + const auto & from_array_join_node = table_expression_node->as(); auto resolved_identifier = tryResolveIdentifierFromJoinTreeNode(identifier_lookup, from_array_join_node.getTableExpression(), scope); + // std::cerr << "tryResolveIdentifierFromArrayJoin 2 " << scope.table_expressions_in_resolve_process.contains(table_expression_node.get()) + // << ' ' << identifier_lookup.dump() << '\n' << table_expression_node->dumpTree() << std::endl; + if (scope.table_expressions_in_resolve_process.contains(table_expression_node.get()) || !identifier_lookup.isExpressionLookup()) return resolved_identifier; @@ -3888,8 +3912,11 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromArrayJoin(const Identifi for (const auto & array_join_column_expression : array_join_column_expressions_nodes) { auto & array_join_column_expression_typed = array_join_column_expression->as(); + // std::cerr << "========== " << identifier_lookup.identifier.getFullName() << ' ' << from_array_join_node.getAlias() << ' ' << array_join_column_expression_typed.getAlias() << std::endl; - if (array_join_column_expression_typed.getAlias() == identifier_lookup.identifier.getFullName()) + const auto & parts = identifier_lookup.identifier.getParts(); + if (array_join_column_expression_typed.getAlias() == identifier_lookup.identifier.getFullName() || + (parts.size() == 2 && parts.front() == from_array_join_node.getAlias() && parts.back() == array_join_column_expression_typed.getAlias())) { auto array_join_column = std::make_shared(array_join_column_expression_typed.getColumn(), array_join_column_expression_typed.getColumnSource()); @@ -3911,6 +3938,8 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromJoinTreeNode(const Ident const QueryTreeNodePtr & join_tree_node, IdentifierResolveScope & scope) { + // std::cerr << "tryResolveIdentifierFromJoinTreeNode " << identifier_lookup.identifier.getFullName() << std::endl; + auto join_tree_node_type = join_tree_node->getNodeType(); switch (join_tree_node_type) @@ -3964,6 +3993,8 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromJoinTree(const Identifie if (identifier_lookup.isFunctionLookup()) return {}; + // std::cerr << "tryResolveIdentifier " << identifier_lookup.identifier.getFullName() << std::endl; + /// Try to resolve identifier from table columns if (auto resolved_identifier = tryResolveIdentifierFromTableColumns(identifier_lookup, scope)) return resolved_identifier; @@ -4112,6 +4143,8 @@ IdentifierResolveResult QueryAnalyzer::tryResolveIdentifier(const IdentifierLook IdentifierResolveScope & scope, 
IdentifierResolveSettings identifier_resolve_settings) { + // std::cerr << "tryResolveIdentifier " << identifier_lookup.identifier.getFullName() << std::endl; + auto it = scope.identifier_lookup_to_resolve_state.find(identifier_lookup); if (it != scope.identifier_lookup_to_resolve_state.end()) { @@ -6284,7 +6317,7 @@ ProjectionNames QueryAnalyzer::resolveFunction(QueryTreeNodePtr & node, Identifi * * 4. If node has alias, update its value in scope alias map. Deregister alias from expression_aliases_in_resolve_process. */ -ProjectionNames QueryAnalyzer::resolveExpressionNode(QueryTreeNodePtr & node, IdentifierResolveScope & scope, bool allow_lambda_expression, bool allow_table_expression) +ProjectionNames QueryAnalyzer::resolveExpressionNode(QueryTreeNodePtr & node, IdentifierResolveScope & scope, bool allow_lambda_expression, bool allow_table_expression, bool use_alias_table) { checkStackSize(); @@ -6334,7 +6367,7 @@ ProjectionNames QueryAnalyzer::resolveExpressionNode(QueryTreeNodePtr & node, Id * To support both (SELECT 1) AS expression in projection and (SELECT 1) as subquery in IN, do not use * alias table because in alias table subquery could be evaluated as scalar. */ - bool use_alias_table = true; + //bool use_alias_table = true; if (is_duplicated_alias || (allow_table_expression && isSubqueryNodeType(node->getNodeType()))) use_alias_table = false; @@ -7569,22 +7602,33 @@ void QueryAnalyzer::resolveArrayJoin(QueryTreeNodePtr & array_join_node, Identif for (auto & array_join_expression : array_join_nodes) { auto array_join_expression_alias = array_join_expression->getAlias(); - if (!array_join_expression_alias.empty() && scope.aliases.alias_name_to_expression_node->contains(array_join_expression_alias)) - throw Exception(ErrorCodes::MULTIPLE_EXPRESSIONS_FOR_ALIAS, - "ARRAY JOIN expression {} with duplicate alias {}. In scope {}", - array_join_expression->formatASTForErrorMessage(), - array_join_expression_alias, - scope.scope_node->formatASTForErrorMessage()); + // if (!array_join_expression_alias.empty() && scope.aliases.alias_name_to_expression_node->contains(array_join_expression_alias)) + // throw Exception(ErrorCodes::MULTIPLE_EXPRESSIONS_FOR_ALIAS, + // "ARRAY JOIN expression {} with duplicate alias {}. In scope {}", + // array_join_expression->formatASTForErrorMessage(), + // array_join_expression_alias, + // scope.scope_node->formatASTForErrorMessage()); /// Add array join expression into scope - expressions_visitor.visit(array_join_expression); + + for (const auto & elem : array_join_nodes) + { + for (auto & child : elem->getChildren()) + { + //std::cerr << "<<<<<<<<<< " << child->dumpTree() << std::endl; + expressions_visitor.visit(child); + //visit(child); + } + } + + //expressions_visitor.visit(array_join_expression); std::string identifier_full_name; if (auto * identifier_node = array_join_expression->as()) identifier_full_name = identifier_node->getIdentifier().getFullName(); - resolveExpressionNode(array_join_expression, scope, false /*allow_lambda_expression*/, false /*allow_table_expression*/); + resolveExpressionNode(array_join_expression, scope, false /*allow_lambda_expression*/, false /*allow_table_expression*/, false); auto process_array_join_expression = [&](QueryTreeNodePtr & expression) { @@ -8456,6 +8500,7 @@ QueryAnalysisPass::QueryAnalysisPass(bool only_analyze_) : only_analyze(only_ana void QueryAnalysisPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) { + // std::cerr << ".... 
qap\n" << query_tree_node->dumpTree() << std::endl; QueryAnalyzer analyzer(only_analyze); analyzer.resolve(query_tree_node, table_expression, context); createUniqueTableAliases(query_tree_node, table_expression, context); diff --git a/src/Analyzer/QueryTreeBuilder.cpp b/src/Analyzer/QueryTreeBuilder.cpp index 6a5db4bc1de..1d4810296b4 100644 --- a/src/Analyzer/QueryTreeBuilder.cpp +++ b/src/Analyzer/QueryTreeBuilder.cpp @@ -957,6 +957,7 @@ QueryTreeNodePtr QueryTreeBuilder::buildJoinTree(const ASTPtr & tables_in_select auto array_join_expressions_list = buildExpressionList(array_join_expression.expression_list, context); auto array_join_node = std::make_shared(std::move(last_table_expression), std::move(array_join_expressions_list), is_left_array_join); + array_join_node->setAlias(array_join_expression.tryGetAlias()); /** Original AST is not set because it will contain only array join part and does * not include left table expression. @@ -1045,7 +1046,8 @@ ColumnTransformersNodes QueryTreeBuilder::buildColumnTransformers(const ASTPtr & QueryTreeNodePtr buildQueryTree(ASTPtr query, ContextPtr context) { QueryTreeBuilder builder(std::move(query), context); - return builder.getQueryTreeNode(); + auto qt = builder.getQueryTreeNode(); + return qt; } } diff --git a/src/Analyzer/createUniqueTableAliases.cpp b/src/Analyzer/createUniqueTableAliases.cpp index 8f850fe8dec..30b8c0a433b 100644 --- a/src/Analyzer/createUniqueTableAliases.cpp +++ b/src/Analyzer/createUniqueTableAliases.cpp @@ -1,6 +1,8 @@ #include #include #include +#include +#include #include #include #include @@ -58,6 +60,38 @@ public: alias = fmt::format("__table{}", ++next_id); node->setAlias(alias); } + + if (auto * array_join = node->as()) + { + //size_t counter = 0; + for (auto & column : array_join->getJoinExpressions()) + { + if (auto * column_node = column->as()) + { + if (!column_node->hasAlias()) + column_node->setAlias(column_node->getColumnName()); + } + } + } + + // if (auto * array_join = node->as()) + // { + // for (auto & column : array_join->getJoinExpressions()) + // { + // if (auto * column_node = column->as()) + // { + // const auto & column_alias = column_node->getAlias(); + // const auto & name_or_alias = column_alias.empty() ? column_node->getColumnName() : column_alias; + + // if (!name_or_alias.starts_with("__")) + // { + + // column_node->setAlias(fmt::format("{}.{}", alias, name_or_alias)); + // } + // } + // } + // } + break; } default: diff --git a/src/Parsers/ASTTablesInSelectQuery.cpp b/src/Parsers/ASTTablesInSelectQuery.cpp index e782bad797e..2f3e9207f81 100644 --- a/src/Parsers/ASTTablesInSelectQuery.cpp +++ b/src/Parsers/ASTTablesInSelectQuery.cpp @@ -247,6 +247,12 @@ void ASTTableJoin::formatImpl(const FormatSettings & settings, FormatState & sta formatImplAfterTable(settings, state, frame); } +static void writeAlias(const String & name, const ASTWithAlias::FormatSettings & settings) +{ + settings.ostr << (settings.hilite ? IAST::hilite_keyword : "") << " AS " << (settings.hilite ? IAST::hilite_alias : ""); + settings.writeIdentifier(name); + settings.ostr << (settings.hilite ? IAST::hilite_none : ""); +} void ASTArrayJoin::formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const { @@ -258,6 +264,9 @@ void ASTArrayJoin::formatImpl(const FormatSettings & settings, FormatState & sta << indent_str << (kind == Kind::Left ? "LEFT " : "") << "ARRAY JOIN" << (settings.hilite ? hilite_none : ""); + if (!alias.empty()) + writeAlias(alias, settings); + settings.one_line ? 
expression_list->formatImpl(settings, state, frame) : expression_list->as().formatImplMultiline(settings, state, frame); diff --git a/src/Parsers/ASTTablesInSelectQuery.h b/src/Parsers/ASTTablesInSelectQuery.h index f3f329ca2b6..4619b22f022 100644 --- a/src/Parsers/ASTTablesInSelectQuery.h +++ b/src/Parsers/ASTTablesInSelectQuery.h @@ -95,6 +95,10 @@ struct ASTArrayJoin : public IAST /// List of array or nested names to JOIN, possible with aliases. ASTPtr expression_list; + String alias; + + String tryGetAlias() const override { return alias; } + void setAlias(const String & to) override { alias = to; } using IAST::IAST; String getID(char) const override { return "ArrayJoin"; } diff --git a/src/Parsers/ParserTablesInSelectQuery.cpp b/src/Parsers/ParserTablesInSelectQuery.cpp index b4d48ae67e9..b2a801c8943 100644 --- a/src/Parsers/ParserTablesInSelectQuery.cpp +++ b/src/Parsers/ParserTablesInSelectQuery.cpp @@ -98,6 +98,10 @@ bool ParserArrayJoin::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) if (!has_array_join) return false; + ASTPtr alias_node; + if (ParserAlias(false).parse(pos, alias_node, expected)) + tryGetIdentifierNameInto(alias_node, res->alias); + if (!ParserExpressionList(false).parse(pos, res->expression_list, expected)) return false; diff --git a/src/Processors/QueryPlan/DistributedCreateLocalPlan.cpp b/src/Processors/QueryPlan/DistributedCreateLocalPlan.cpp index d4545482477..aef3c03255e 100644 --- a/src/Processors/QueryPlan/DistributedCreateLocalPlan.cpp +++ b/src/Processors/QueryPlan/DistributedCreateLocalPlan.cpp @@ -2,6 +2,7 @@ #include #include +#include "Parsers/queryToString.h" #include #include #include @@ -68,12 +69,19 @@ std::unique_ptr createLocalPlan( if (context->getSettingsRef().allow_experimental_analyzer) { + // std::cerr << query_ast->dumpTree() << std::endl; + // std::cerr << queryToString(query_ast) << std::endl; + /// For Analyzer, identifier in GROUP BY/ORDER BY/LIMIT BY lists has been resolved to /// ConstantNode in QueryTree if it is an alias of a constant, so we should not replace /// ConstantNode with ProjectionNode again(https://github.com/ClickHouse/ClickHouse/issues/62289). new_context->setSetting("enable_positional_arguments", Field(false)); auto interpreter = InterpreterSelectQueryAnalyzer(query_ast, new_context, select_query_options); + // std::cerr << interpreter.getQueryTree()->dumpTree() << std::endl; query_plan = std::make_unique(std::move(interpreter).extractQueryPlan()); + WriteBufferFromOwnString buf; + query_plan->explainPlan(buf, {.header=true, .actions=true}); + // std::cerr << buf.str() << std::endl; } else { From b4581286f74bcdfe199c3b8967e237ae3375cd88 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Thu, 23 May 2024 16:34:11 +0000 Subject: [PATCH 348/392] Properly resolve array join columns. 
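Roughly, after this change identifier resolution against an ARRAY JOIN matches the lookup identifier against each array-join column's alias or column name, first popping an optional leading qualifier equal to the ARRAY JOIN alias, and falls back to compound-expression resolution for any remaining identifier parts. A minimal illustration of the query shapes whose expected behaviour changes, taken from the tests updated below (test_table and its columns are defined in those tests):

    SELECT 1 AS value FROM test_table ARRAY JOIN [1, 2, 3] AS value;      -- no longer expected to fail with MULTIPLE_EXPRESSIONS_FOR_ALIAS
    SELECT id AS value FROM test_table ARRAY JOIN value_array AS value;   -- likewise now expected to resolve
    SELECT id, value_element, value FROM test_table ARRAY JOIN [[1, 2, 3]] AS value_element, value_element AS value;  -- now expected to fail with UNKNOWN_IDENTIFIER
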
--- src/Analyzer/Passes/QueryAnalysisPass.cpp | 60 ++++++++++++++++--- .../02374_analyzer_array_join.reference | 16 ++++- .../0_stateless/02374_analyzer_array_join.sql | 4 +- .../02521_analyzer_array_join_crash.reference | 9 ++- .../02521_analyzer_array_join_crash.sql | 6 +- 5 files changed, 75 insertions(+), 20 deletions(-) diff --git a/src/Analyzer/Passes/QueryAnalysisPass.cpp b/src/Analyzer/Passes/QueryAnalysisPass.cpp index f55f6d6c18f..6bce3dff49d 100644 --- a/src/Analyzer/Passes/QueryAnalysisPass.cpp +++ b/src/Analyzer/Passes/QueryAnalysisPass.cpp @@ -607,6 +607,8 @@ struct ScopeAliases std::unordered_set nodes_with_duplicated_aliases; std::vector cloned_nodes_with_duplicated_aliases; + std::unordered_set array_join_aliases; + std::unordered_map & getAliasMap(IdentifierLookupContext lookup_context) { switch (lookup_context) @@ -2875,7 +2877,7 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromExpressionArguments(cons bool QueryAnalyzer::tryBindIdentifierToAliases(const IdentifierLookup & identifier_lookup, const IdentifierResolveScope & scope) { - return scope.aliases.find(identifier_lookup, ScopeAliases::FindOption::FIRST_NAME) != nullptr; + return scope.aliases.find(identifier_lookup, ScopeAliases::FindOption::FIRST_NAME) != nullptr || scope.aliases.array_join_aliases.contains(identifier_lookup.identifier.front()); } /** Resolve identifier from scope aliases. @@ -2924,6 +2926,7 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromAliases(const Identifier IdentifierResolveSettings identifier_resolve_settings) { const auto & identifier_bind_part = identifier_lookup.identifier.front(); + // std::cerr << "tryResolveIdentifierFromAliases " << identifier_lookup.dump() << std::endl; auto * it = scope.aliases.find(identifier_lookup, ScopeAliases::FindOption::FIRST_NAME); if (it == nullptr) @@ -2952,6 +2955,7 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromAliases(const Identifier } auto node_type = alias_node->getNodeType(); + // std::cerr << "tryResolveIdentifierFromAliases 1.5 \n" << alias_node->dumpTree() << std::endl; /// Resolve expression if necessary if (node_type == QueryTreeNodeType::IDENTIFIER) @@ -2960,6 +2964,7 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromAliases(const Identifier auto & alias_identifier_node = alias_node->as(); auto identifier = alias_identifier_node.getIdentifier(); + // std::cerr << "tryResolveIdentifierFromAliases 2 " << identifier.getFullName() << std::endl; auto lookup_result = tryResolveIdentifier(IdentifierLookup{identifier, identifier_lookup.lookup_context}, scope, identifier_resolve_settings); if (!lookup_result.resolved_identifier) { @@ -3136,6 +3141,7 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromStorage( size_t identifier_column_qualifier_parts, bool can_be_not_found) { + // std::cerr << "tryResolveIdentifierFromStorage " << identifier.getFullName() << std::endl; auto identifier_without_column_qualifier = identifier; identifier_without_column_qualifier.popFirst(identifier_column_qualifier_parts); @@ -3278,6 +3284,7 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromStorage( { auto qualified_identifier_with_removed_part = qualified_identifier; qualified_identifier_with_removed_part.popFirst(); + // std::cerr << "tryResolveIdentifierFromStorage qualified_identifier_with_removed_part" << qualified_identifier_with_removed_part.getFullName() << std::endl; if (qualified_identifier_with_removed_part.empty()) break; @@ -3896,7 +3903,7 @@ QueryTreeNodePtr 
QueryAnalyzer::tryResolveIdentifierFromArrayJoin(const Identifi auto resolved_identifier = tryResolveIdentifierFromJoinTreeNode(identifier_lookup, from_array_join_node.getTableExpression(), scope); // std::cerr << "tryResolveIdentifierFromArrayJoin 2 " << scope.table_expressions_in_resolve_process.contains(table_expression_node.get()) - // << ' ' << identifier_lookup.dump() << '\n' << table_expression_node->dumpTree() << std::endl; + // << ' ' << identifier_lookup.dump() << ' ' << (resolved_identifier ? resolved_identifier->dumpTree() : "not resolved ") << std::endl; if (scope.table_expressions_in_resolve_process.contains(table_expression_node.get()) || !identifier_lookup.isExpressionLookup()) return resolved_identifier; @@ -3914,14 +3921,48 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromArrayJoin(const Identifi auto & array_join_column_expression_typed = array_join_column_expression->as(); // std::cerr << "========== " << identifier_lookup.identifier.getFullName() << ' ' << from_array_join_node.getAlias() << ' ' << array_join_column_expression_typed.getAlias() << std::endl; - const auto & parts = identifier_lookup.identifier.getParts(); - if (array_join_column_expression_typed.getAlias() == identifier_lookup.identifier.getFullName() || - (parts.size() == 2 && parts.front() == from_array_join_node.getAlias() && parts.back() == array_join_column_expression_typed.getAlias())) + IdentifierView identifier_view(identifier_lookup.identifier); + + if (identifier_view.isCompound() && from_array_join_node.hasAlias() && identifier_view.front() == from_array_join_node.getAlias()) + identifier_view.popFirst(); + + const auto & alias_or_name = array_join_column_expression_typed.hasAlias() + ? array_join_column_expression_typed.getAlias() + : array_join_column_expression_typed.getColumnName(); + + if (identifier_view.front() == alias_or_name) + identifier_view.popFirst(); + else if (identifier_view.getFullName() == alias_or_name) + identifier_view.popFirst(identifier_view.getPartsSize()); /// Clear + else + continue; + + if (identifier_view.empty()) { auto array_join_column = std::make_shared(array_join_column_expression_typed.getColumn(), array_join_column_expression_typed.getColumnSource()); return array_join_column; } + + auto compound_expr = tryResolveIdentifierFromCompoundExpression( + identifier_lookup.identifier, + identifier_lookup.identifier.getPartsSize() - identifier_view.getPartsSize() /*identifier_bind_size*/, + array_join_column_expression, + {} /* compound_expression_source */, + scope, + true /* can_be_not_found */); + + if (compound_expr) + return compound_expr; + + // const auto & parts = identifier_lookup.identifier.getParts(); + // if (array_join_column_expression_typed.getAlias() == identifier_lookup.identifier.getFullName() || + // (parts.size() == 2 && parts.front() == from_array_join_node.getAlias() && parts.back() == array_join_column_expression_typed.getAlias())) + // { + // auto array_join_column = std::make_shared(array_join_column_expression_typed.getColumn(), + // array_join_column_expression_typed.getColumnSource()); + // return array_join_column; + // } } if (!resolved_identifier) @@ -3993,7 +4034,7 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromJoinTree(const Identifie if (identifier_lookup.isFunctionLookup()) return {}; - // std::cerr << "tryResolveIdentifier " << identifier_lookup.identifier.getFullName() << std::endl; + // std::cerr << "tryResolveIdentifierFromJoinTree " << identifier_lookup.identifier.getFullName() << std::endl; /// Try to 
resolve identifier from table columns if (auto resolved_identifier = tryResolveIdentifierFromTableColumns(identifier_lookup, scope)) @@ -7613,15 +7654,18 @@ void QueryAnalyzer::resolveArrayJoin(QueryTreeNodePtr & array_join_node, Identif for (const auto & elem : array_join_nodes) { + if (elem->hasAlias()) + scope.aliases.array_join_aliases.insert(elem->getAlias()); for (auto & child : elem->getChildren()) { //std::cerr << "<<<<<<<<<< " << child->dumpTree() << std::endl; - expressions_visitor.visit(child); + if (child) + expressions_visitor.visit(child); //visit(child); } } - //expressions_visitor.visit(array_join_expression); + // expressions_visitor.visit(array_join_expression); std::string identifier_full_name; diff --git a/tests/queries/0_stateless/02374_analyzer_array_join.reference b/tests/queries/0_stateless/02374_analyzer_array_join.reference index 6dd384c7d9c..44f3e5a95e9 100644 --- a/tests/queries/0_stateless/02374_analyzer_array_join.reference +++ b/tests/queries/0_stateless/02374_analyzer_array_join.reference @@ -45,7 +45,13 @@ SELECT id, value, value_1, value_2 FROM test_table ARRAY JOIN [[1, 2, 3]] AS val 0 Value [1,2,3] 1 0 Value [1,2,3] 2 0 Value [1,2,3] 3 -SELECT 1 AS value FROM test_table ARRAY JOIN [1,2,3] AS value; -- { serverError 179 } +SELECT 1 AS value FROM test_table ARRAY JOIN [1,2,3] AS value; +1 +2 +3 +1 +2 +3 SELECT 'ARRAY JOIN with column'; ARRAY JOIN with column SELECT id, value, test_table.value_array FROM test_table ARRAY JOIN value_array; @@ -84,7 +90,13 @@ SELECT id, value, value_array AS value_array_array_alias FROM test_table ARRAY J 0 Value [4,5,6] SELECT '--'; -- -SELECT id AS value FROM test_table ARRAY JOIN value_array AS value; -- { serverError 179 } +SELECT id AS value FROM test_table ARRAY JOIN value_array AS value; +1 +2 +3 +4 +5 +6 SELECT '--'; -- SELECT id, value, value_array AS value_array_array_alias, value_array_array_alias_element FROM test_table ARRAY JOIN value_array_array_alias AS value_array_array_alias_element; diff --git a/tests/queries/0_stateless/02374_analyzer_array_join.sql b/tests/queries/0_stateless/02374_analyzer_array_join.sql index bc4bb6616c1..dfd3b755aff 100644 --- a/tests/queries/0_stateless/02374_analyzer_array_join.sql +++ b/tests/queries/0_stateless/02374_analyzer_array_join.sql @@ -33,7 +33,7 @@ SELECT '--'; SELECT id, value, value_1, value_2 FROM test_table ARRAY JOIN [[1, 2, 3]] AS value_1 ARRAY JOIN value_1 AS value_2; -SELECT 1 AS value FROM test_table ARRAY JOIN [1,2,3] AS value; -- { serverError 179 } +SELECT 1 AS value FROM test_table ARRAY JOIN [1,2,3] AS value; SELECT 'ARRAY JOIN with column'; @@ -53,7 +53,7 @@ SELECT id, value, value_array AS value_array_array_alias FROM test_table ARRAY J SELECT '--'; -SELECT id AS value FROM test_table ARRAY JOIN value_array AS value; -- { serverError 179 } +SELECT id AS value FROM test_table ARRAY JOIN value_array AS value; SELECT '--'; diff --git a/tests/queries/0_stateless/02521_analyzer_array_join_crash.reference b/tests/queries/0_stateless/02521_analyzer_array_join_crash.reference index 5e7728e0590..426cfe35e73 100644 --- a/tests/queries/0_stateless/02521_analyzer_array_join_crash.reference +++ b/tests/queries/0_stateless/02521_analyzer_array_join_crash.reference @@ -1,11 +1,10 @@ -- { echoOn } -SELECT id, value_element, value FROM test_table ARRAY JOIN [[1,2,3]] AS value_element, value_element AS value; -0 [1,2,3] [1,2,3] +SELECT id, value_element, value FROM test_table ARRAY JOIN [[1,2,3]] AS value_element, value_element AS value; -- { serverError UNKNOWN_IDENTIFIER 
} SELECT id, value_element, value FROM test_table ARRAY JOIN [[1,2,3]] AS value_element ARRAY JOIN value_element AS value; 0 [1,2,3] 1 0 [1,2,3] 2 0 [1,2,3] 3 -SELECT value_element, value FROM test_table ARRAY JOIN [1048577] AS value_element, arrayMap(x -> value_element, ['']) AS value; -1048577 [1048577] -SELECT arrayFilter(x -> notEmpty(concat(x)), [NULL, NULL]) FROM system.one ARRAY JOIN [1048577] AS elem, arrayMap(x -> splitByChar(x, elem), ['']) AS unused; -- { serverError 44 } +SELECT value_element, value FROM test_table ARRAY JOIN [1048577] AS value_element ARRAY JOIN arrayMap(x -> value_element, ['']) AS value; +1048577 1048577 +SELECT arrayFilter(x -> notEmpty(concat(x)), [NULL, NULL]) FROM system.one ARRAY JOIN [1048577] AS elem ARRAY JOIN arrayMap(x -> splitByChar(x, elem), ['']) AS unused; -- { serverError ILLEGAL_COLUMN } diff --git a/tests/queries/0_stateless/02521_analyzer_array_join_crash.sql b/tests/queries/0_stateless/02521_analyzer_array_join_crash.sql index 53606e01ab7..7842d47d757 100644 --- a/tests/queries/0_stateless/02521_analyzer_array_join_crash.sql +++ b/tests/queries/0_stateless/02521_analyzer_array_join_crash.sql @@ -11,13 +11,13 @@ INSERT INTO test_table VALUES (0, 'Value'); -- { echoOn } -SELECT id, value_element, value FROM test_table ARRAY JOIN [[1,2,3]] AS value_element, value_element AS value; +SELECT id, value_element, value FROM test_table ARRAY JOIN [[1,2,3]] AS value_element, value_element AS value; -- { serverError UNKNOWN_IDENTIFIER } SELECT id, value_element, value FROM test_table ARRAY JOIN [[1,2,3]] AS value_element ARRAY JOIN value_element AS value; -SELECT value_element, value FROM test_table ARRAY JOIN [1048577] AS value_element, arrayMap(x -> value_element, ['']) AS value; +SELECT value_element, value FROM test_table ARRAY JOIN [1048577] AS value_element ARRAY JOIN arrayMap(x -> value_element, ['']) AS value; -SELECT arrayFilter(x -> notEmpty(concat(x)), [NULL, NULL]) FROM system.one ARRAY JOIN [1048577] AS elem, arrayMap(x -> splitByChar(x, elem), ['']) AS unused; -- { serverError 44 } +SELECT arrayFilter(x -> notEmpty(concat(x)), [NULL, NULL]) FROM system.one ARRAY JOIN [1048577] AS elem ARRAY JOIN arrayMap(x -> splitByChar(x, elem), ['']) AS unused; -- { serverError ILLEGAL_COLUMN } -- { echoOff } From 317941f06af836d719e1360b04616970271ecc12 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Thu, 23 May 2024 17:01:46 +0000 Subject: [PATCH 349/392] Add a test. 
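The new test exercises ARRAY JOIN with an alias over remote()/distributed tables under the analyzer. A representative query shape, taken from the added 03156_analyzer_array_join_distributed.sql (the arrays_test table is created by the test itself):

    SELECT s, arr, a FROM remote('127.0.0.2', currentDatabase(), arrays_test) ARRAY JOIN arr AS a WHERE a < 3 ORDER BY a;
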
--- .../03156_analyzer_array_join_distributed.reference | 12 ++++++++++++ .../03156_analyzer_array_join_distributed.sql | 10 ++++++++++ 2 files changed, 22 insertions(+) create mode 100644 tests/queries/0_stateless/03156_analyzer_array_join_distributed.reference create mode 100644 tests/queries/0_stateless/03156_analyzer_array_join_distributed.sql diff --git a/tests/queries/0_stateless/03156_analyzer_array_join_distributed.reference b/tests/queries/0_stateless/03156_analyzer_array_join_distributed.reference new file mode 100644 index 00000000000..b5b2aec9c12 --- /dev/null +++ b/tests/queries/0_stateless/03156_analyzer_array_join_distributed.reference @@ -0,0 +1,12 @@ +Hello [1,2] 1 +Hello [1,2] 2 +Hello [1,2] 1 +Hello [1,2] 1 +Hello [1,2] 2 +Hello [1,2] 2 +Hello 1 +Hello 2 +Hello 1 +Hello 1 +Hello 2 +Hello 2 diff --git a/tests/queries/0_stateless/03156_analyzer_array_join_distributed.sql b/tests/queries/0_stateless/03156_analyzer_array_join_distributed.sql new file mode 100644 index 00000000000..f605a369822 --- /dev/null +++ b/tests/queries/0_stateless/03156_analyzer_array_join_distributed.sql @@ -0,0 +1,10 @@ +CREATE TABLE arrays_test (s String, arr Array(UInt8)) ENGINE = MergeTree() ORDER BY (s); + +INSERT INTO arrays_test VALUES ('Hello', [1,2]), ('World', [3,4,5]), ('Goodbye', []); + +SELECT s, arr, a FROM remote('127.0.0.2', currentDatabase(), arrays_test) ARRAY JOIN arr AS a WHERE a < 3 ORDER BY a; +SELECT s, arr, a FROM remote('127.0.0.{1,2}', currentDatabase(), arrays_test) ARRAY JOIN arr AS a WHERE a < 3 ORDER BY a; + + +SELECT s, arr FROM remote('127.0.0.2', currentDatabase(), arrays_test) ARRAY JOIN arr WHERE arr < 3 ORDER BY arr; +SELECT s, arr FROM remote('127.0.0.{1,2}', currentDatabase(), arrays_test) ARRAY JOIN arr WHERE arr < 3 ORDER BY arr; From bee3c50ecd4a41e64d29812b5607927c12dba111 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Thu, 23 May 2024 17:23:02 +0000 Subject: [PATCH 350/392] Try not to add alias to array join. --- src/Analyzer/ArrayJoinNode.cpp | 2 +- src/Analyzer/ColumnNode.cpp | 4 ++-- src/Analyzer/QueryTreeBuilder.cpp | 2 +- src/Parsers/ASTTablesInSelectQuery.cpp | 16 ++++++++-------- src/Parsers/ASTTablesInSelectQuery.h | 6 +++--- src/Parsers/ParserTablesInSelectQuery.cpp | 6 +++--- 6 files changed, 18 insertions(+), 18 deletions(-) diff --git a/src/Analyzer/ArrayJoinNode.cpp b/src/Analyzer/ArrayJoinNode.cpp index 9c1eb9dce3e..37c198f8472 100644 --- a/src/Analyzer/ArrayJoinNode.cpp +++ b/src/Analyzer/ArrayJoinNode.cpp @@ -55,7 +55,7 @@ ASTPtr ArrayJoinNode::toASTImpl(const ConvertToASTOptions & options) const auto array_join_ast = std::make_shared(); array_join_ast->kind = is_left ? 
ASTArrayJoin::Kind::Left : ASTArrayJoin::Kind::Inner; - array_join_ast->setAlias(getAlias()); + // array_join_ast->setAlias(getAlias()); auto array_join_expressions_ast = std::make_shared(); const auto & array_join_expressions = getJoinExpressions().getNodes(); diff --git a/src/Analyzer/ColumnNode.cpp b/src/Analyzer/ColumnNode.cpp index f76c096a339..d12eac68ab4 100644 --- a/src/Analyzer/ColumnNode.cpp +++ b/src/Analyzer/ColumnNode.cpp @@ -110,8 +110,8 @@ ASTPtr ColumnNode::toASTImpl(const ConvertToASTOptions & options) const if (node_type == QueryTreeNodeType::TABLE || node_type == QueryTreeNodeType::TABLE_FUNCTION || node_type == QueryTreeNodeType::QUERY || - node_type == QueryTreeNodeType::UNION || - node_type == QueryTreeNodeType::ARRAY_JOIN) + node_type == QueryTreeNodeType::UNION)// || + //node_type == QueryTreeNodeType::ARRAY_JOIN) { if (column_source->hasAlias()) { diff --git a/src/Analyzer/QueryTreeBuilder.cpp b/src/Analyzer/QueryTreeBuilder.cpp index 1d4810296b4..02d742f5e49 100644 --- a/src/Analyzer/QueryTreeBuilder.cpp +++ b/src/Analyzer/QueryTreeBuilder.cpp @@ -957,7 +957,7 @@ QueryTreeNodePtr QueryTreeBuilder::buildJoinTree(const ASTPtr & tables_in_select auto array_join_expressions_list = buildExpressionList(array_join_expression.expression_list, context); auto array_join_node = std::make_shared(std::move(last_table_expression), std::move(array_join_expressions_list), is_left_array_join); - array_join_node->setAlias(array_join_expression.tryGetAlias()); + // array_join_node->setAlias(array_join_expression.tryGetAlias()); /** Original AST is not set because it will contain only array join part and does * not include left table expression. diff --git a/src/Parsers/ASTTablesInSelectQuery.cpp b/src/Parsers/ASTTablesInSelectQuery.cpp index 2f3e9207f81..b4058a0950d 100644 --- a/src/Parsers/ASTTablesInSelectQuery.cpp +++ b/src/Parsers/ASTTablesInSelectQuery.cpp @@ -247,12 +247,12 @@ void ASTTableJoin::formatImpl(const FormatSettings & settings, FormatState & sta formatImplAfterTable(settings, state, frame); } -static void writeAlias(const String & name, const ASTWithAlias::FormatSettings & settings) -{ - settings.ostr << (settings.hilite ? IAST::hilite_keyword : "") << " AS " << (settings.hilite ? IAST::hilite_alias : ""); - settings.writeIdentifier(name); - settings.ostr << (settings.hilite ? IAST::hilite_none : ""); -} +// static void writeAlias(const String & name, const ASTWithAlias::FormatSettings & settings) +// { +// settings.ostr << (settings.hilite ? IAST::hilite_keyword : "") << " AS " << (settings.hilite ? IAST::hilite_alias : ""); +// settings.writeIdentifier(name); +// settings.ostr << (settings.hilite ? IAST::hilite_none : ""); +// } void ASTArrayJoin::formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const { @@ -264,8 +264,8 @@ void ASTArrayJoin::formatImpl(const FormatSettings & settings, FormatState & sta << indent_str << (kind == Kind::Left ? "LEFT " : "") << "ARRAY JOIN" << (settings.hilite ? hilite_none : ""); - if (!alias.empty()) - writeAlias(alias, settings); + // if (!alias.empty()) + // writeAlias(alias, settings); settings.one_line ? expression_list->formatImpl(settings, state, frame) diff --git a/src/Parsers/ASTTablesInSelectQuery.h b/src/Parsers/ASTTablesInSelectQuery.h index 4619b22f022..212436b0d9e 100644 --- a/src/Parsers/ASTTablesInSelectQuery.h +++ b/src/Parsers/ASTTablesInSelectQuery.h @@ -95,10 +95,10 @@ struct ASTArrayJoin : public IAST /// List of array or nested names to JOIN, possible with aliases. 
ASTPtr expression_list; - String alias; + // String alias; - String tryGetAlias() const override { return alias; } - void setAlias(const String & to) override { alias = to; } + // String tryGetAlias() const override { return alias; } + // void setAlias(const String & to) override { alias = to; } using IAST::IAST; String getID(char) const override { return "ArrayJoin"; } diff --git a/src/Parsers/ParserTablesInSelectQuery.cpp b/src/Parsers/ParserTablesInSelectQuery.cpp index b2a801c8943..c96b6c1584d 100644 --- a/src/Parsers/ParserTablesInSelectQuery.cpp +++ b/src/Parsers/ParserTablesInSelectQuery.cpp @@ -98,9 +98,9 @@ bool ParserArrayJoin::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) if (!has_array_join) return false; - ASTPtr alias_node; - if (ParserAlias(false).parse(pos, alias_node, expected)) - tryGetIdentifierNameInto(alias_node, res->alias); + // ASTPtr alias_node; + // if (ParserAlias(false).parse(pos, alias_node, expected)) + // tryGetIdentifierNameInto(alias_node, res->alias); if (!ParserExpressionList(false).parse(pos, res->expression_list, expected)) return false; From a19472ddd58d121c8cda910dd7690fa37fb66065 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Thu, 23 May 2024 17:53:17 +0000 Subject: [PATCH 351/392] Connect code. --- src/Processors/QueryPlan/DistributedCreateLocalPlan.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Processors/QueryPlan/DistributedCreateLocalPlan.cpp b/src/Processors/QueryPlan/DistributedCreateLocalPlan.cpp index aef3c03255e..ad94dd2c173 100644 --- a/src/Processors/QueryPlan/DistributedCreateLocalPlan.cpp +++ b/src/Processors/QueryPlan/DistributedCreateLocalPlan.cpp @@ -78,9 +78,9 @@ std::unique_ptr createLocalPlan( new_context->setSetting("enable_positional_arguments", Field(false)); auto interpreter = InterpreterSelectQueryAnalyzer(query_ast, new_context, select_query_options); // std::cerr << interpreter.getQueryTree()->dumpTree() << std::endl; - query_plan = std::make_unique(std::move(interpreter).extractQueryPlan()); - WriteBufferFromOwnString buf; - query_plan->explainPlan(buf, {.header=true, .actions=true}); + // query_plan = std::make_unique(std::move(interpreter).extractQueryPlan()); + // WriteBufferFromOwnString buf; + // query_plan->explainPlan(buf, {.header=true, .actions=true}); // std::cerr << buf.str() << std::endl; } else From 1e5872cb4ea8237d24528d2595a6708a36204a00 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Fri, 24 May 2024 11:02:31 +0200 Subject: [PATCH 352/392] Update DistributedCreateLocalPlan.cpp --- src/Processors/QueryPlan/DistributedCreateLocalPlan.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Processors/QueryPlan/DistributedCreateLocalPlan.cpp b/src/Processors/QueryPlan/DistributedCreateLocalPlan.cpp index ad94dd2c173..e4d908e2af0 100644 --- a/src/Processors/QueryPlan/DistributedCreateLocalPlan.cpp +++ b/src/Processors/QueryPlan/DistributedCreateLocalPlan.cpp @@ -78,7 +78,7 @@ std::unique_ptr createLocalPlan( new_context->setSetting("enable_positional_arguments", Field(false)); auto interpreter = InterpreterSelectQueryAnalyzer(query_ast, new_context, select_query_options); // std::cerr << interpreter.getQueryTree()->dumpTree() << std::endl; - // query_plan = std::make_unique(std::move(interpreter).extractQueryPlan()); + query_plan = std::make_unique(std::move(interpreter).extractQueryPlan()); // WriteBufferFromOwnString buf; // query_plan->explainPlan(buf, {.header=true, .actions=true}); // std::cerr << buf.str() << std::endl; From 
634f7c35e8348cbf0c77de729bde131d34ca6336 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Fri, 24 May 2024 12:43:40 +0000 Subject: [PATCH 353/392] Better. --- src/Analyzer/Passes/QueryAnalysisPass.cpp | 40 +++++++++++-------- .../02374_analyzer_array_join.reference | 24 ++++++----- .../0_stateless/02374_analyzer_array_join.sql | 3 ++ 3 files changed, 41 insertions(+), 26 deletions(-) diff --git a/src/Analyzer/Passes/QueryAnalysisPass.cpp b/src/Analyzer/Passes/QueryAnalysisPass.cpp index 6bce3dff49d..871c3842de0 100644 --- a/src/Analyzer/Passes/QueryAnalysisPass.cpp +++ b/src/Analyzer/Passes/QueryAnalysisPass.cpp @@ -1545,7 +1545,7 @@ private: ProjectionNames resolveFunction(QueryTreeNodePtr & function_node, IdentifierResolveScope & scope); - ProjectionNames resolveExpressionNode(QueryTreeNodePtr & node, IdentifierResolveScope & scope, bool allow_lambda_expression, bool allow_table_expression, bool use_alias_table = true); + ProjectionNames resolveExpressionNode(QueryTreeNodePtr & node, IdentifierResolveScope & scope, bool allow_lambda_expression, bool allow_table_expression, bool ignore_alias = false); ProjectionNames resolveExpressionNodeList(QueryTreeNodePtr & node_list, IdentifierResolveScope & scope, bool allow_lambda_expression, bool allow_table_expression); @@ -3919,6 +3919,7 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromArrayJoin(const Identifi for (const auto & array_join_column_expression : array_join_column_expressions_nodes) { auto & array_join_column_expression_typed = array_join_column_expression->as(); + // std::cerr << "========== " << array_join_column_expression->dumpTree() << std::endl; // std::cerr << "========== " << identifier_lookup.identifier.getFullName() << ' ' << from_array_join_node.getAlias() << ' ' << array_join_column_expression_typed.getAlias() << std::endl; IdentifierView identifier_view(identifier_lookup.identifier); @@ -6358,10 +6359,12 @@ ProjectionNames QueryAnalyzer::resolveFunction(QueryTreeNodePtr & node, Identifi * * 4. If node has alias, update its value in scope alias map. Deregister alias from expression_aliases_in_resolve_process. */ -ProjectionNames QueryAnalyzer::resolveExpressionNode(QueryTreeNodePtr & node, IdentifierResolveScope & scope, bool allow_lambda_expression, bool allow_table_expression, bool use_alias_table) +ProjectionNames QueryAnalyzer::resolveExpressionNode(QueryTreeNodePtr & node, IdentifierResolveScope & scope, bool allow_lambda_expression, bool allow_table_expression, bool ignore_alias) { checkStackSize(); + // std::cerr << "resolveExpressionNode " << ignore_alias << "\n" << node->dumpTree() << std::endl; + auto resolved_expression_it = resolved_expressions.find(node); if (resolved_expression_it != resolved_expressions.end()) { @@ -6378,6 +6381,7 @@ ProjectionNames QueryAnalyzer::resolveExpressionNode(QueryTreeNodePtr & node, Id evaluateScalarSubqueryIfNeeded(node, subquery_scope); } + // std::cerr << "resolveExpressionNode taken from cache \n" << node->dumpTree() << "\n PN " << (resolved_expression_it->second.empty() ? 
"" : resolved_expression_it->second.front()) << std::endl; return resolved_expression_it->second; } @@ -6388,7 +6392,10 @@ ProjectionNames QueryAnalyzer::resolveExpressionNode(QueryTreeNodePtr & node, Id { auto projection_name_it = node_to_projection_name.find(node); if (projection_name_it != node_to_projection_name.end()) + { + // std::cerr << "resolveExpressionNode taken projection name from map : " << projection_name_it->second << " for \n" << node->dumpTree() << std::endl; result_projection_names.push_back(projection_name_it->second); + } } else { @@ -6408,7 +6415,7 @@ ProjectionNames QueryAnalyzer::resolveExpressionNode(QueryTreeNodePtr & node, Id * To support both (SELECT 1) AS expression in projection and (SELECT 1) as subquery in IN, do not use * alias table because in alias table subquery could be evaluated as scalar. */ - //bool use_alias_table = true; + bool use_alias_table = !ignore_alias; if (is_duplicated_alias || (allow_table_expression && isSubqueryNodeType(node->getNodeType()))) use_alias_table = false; @@ -6708,7 +6715,8 @@ ProjectionNames QueryAnalyzer::resolveExpressionNode(QueryTreeNodePtr & node, Id if (is_duplicated_alias) scope.non_cached_identifier_lookups_during_expression_resolve.erase({Identifier{node_alias}, IdentifierLookupContext::EXPRESSION}); - resolved_expressions.emplace(node, result_projection_names); + if (!ignore_alias) + resolved_expressions.emplace(node, result_projection_names); scope.popExpressionNode(); bool expression_was_root = scope.expressions_in_resolve_process_stack.empty(); @@ -7672,7 +7680,7 @@ void QueryAnalyzer::resolveArrayJoin(QueryTreeNodePtr & array_join_node, Identif if (auto * identifier_node = array_join_expression->as()) identifier_full_name = identifier_node->getIdentifier().getFullName(); - resolveExpressionNode(array_join_expression, scope, false /*allow_lambda_expression*/, false /*allow_table_expression*/, false); + resolveExpressionNode(array_join_expression, scope, false /*allow_lambda_expression*/, false /*allow_table_expression*/, true); auto process_array_join_expression = [&](QueryTreeNodePtr & expression) { @@ -7749,17 +7757,17 @@ void QueryAnalyzer::resolveArrayJoin(QueryTreeNodePtr & array_join_node, Identif * with type after ARRAY JOIN. 
*/ array_join_nodes = std::move(array_join_column_expressions); - for (auto & array_join_column_expression : array_join_nodes) - { - auto it = scope.aliases.alias_name_to_expression_node->find(array_join_column_expression->getAlias()); - if (it != scope.aliases.alias_name_to_expression_node->end()) - { - auto & array_join_column_expression_typed = array_join_column_expression->as(); - auto array_join_column = std::make_shared(array_join_column_expression_typed.getColumn(), - array_join_column_expression_typed.getColumnSource()); - it->second = std::move(array_join_column); - } - } + // for (auto & array_join_column_expression : array_join_nodes) + // { + // auto it = scope.aliases.alias_name_to_expression_node->find(array_join_column_expression->getAlias()); + // if (it != scope.aliases.alias_name_to_expression_node->end()) + // { + // auto & array_join_column_expression_typed = array_join_column_expression->as(); + // auto array_join_column = std::make_shared(array_join_column_expression_typed.getColumn(), + // array_join_column_expression_typed.getColumnSource()); + // it->second = std::move(array_join_column); + // } + // } } void QueryAnalyzer::checkDuplicateTableNamesOrAlias(const QueryTreeNodePtr & join_node, QueryTreeNodePtr & left_table_expr, QueryTreeNodePtr & right_table_expr, IdentifierResolveScope & scope) diff --git a/tests/queries/0_stateless/02374_analyzer_array_join.reference b/tests/queries/0_stateless/02374_analyzer_array_join.reference index 44f3e5a95e9..ad7750228d6 100644 --- a/tests/queries/0_stateless/02374_analyzer_array_join.reference +++ b/tests/queries/0_stateless/02374_analyzer_array_join.reference @@ -47,11 +47,11 @@ SELECT id, value, value_1, value_2 FROM test_table ARRAY JOIN [[1, 2, 3]] AS val 0 Value [1,2,3] 3 SELECT 1 AS value FROM test_table ARRAY JOIN [1,2,3] AS value; 1 -2 -3 1 -2 -3 +1 +1 +1 +1 SELECT 'ARRAY JOIN with column'; ARRAY JOIN with column SELECT id, value, test_table.value_array FROM test_table ARRAY JOIN value_array; @@ -91,12 +91,12 @@ SELECT id, value, value_array AS value_array_array_alias FROM test_table ARRAY J SELECT '--'; -- SELECT id AS value FROM test_table ARRAY JOIN value_array AS value; -1 -2 -3 -4 -5 -6 +0 +0 +0 +0 +0 +0 SELECT '--'; -- SELECT id, value, value_array AS value_array_array_alias, value_array_array_alias_element FROM test_table ARRAY JOIN value_array_array_alias AS value_array_array_alias_element; @@ -132,3 +132,7 @@ WHERE NOT ignore(elem) GROUP BY sum(ignore(ignore(ignore(1., 1, 36, 8, 8), ignore(52, 37, 37, '03147_parquet_memory_tracking.parquet', 37, 37, toUInt256(37), 37, 37, toNullable(37), 37, 37), 1., 1, 36, 8, 8), emptyArrayToSingle(arrayMap(x -> toString(x), arrayMap(x -> nullIf(x, 2), arrayJoin([[1]])))))) IGNORE NULLS, modulo(toLowCardinality('03147_parquet_memory_tracking.parquet'), number, toLowCardinality(3)); -- { serverError UNKNOWN_IDENTIFIER } +[1,2] 1 +[1,2] 2 +1 +2 diff --git a/tests/queries/0_stateless/02374_analyzer_array_join.sql b/tests/queries/0_stateless/02374_analyzer_array_join.sql index dfd3b755aff..8c26df1806e 100644 --- a/tests/queries/0_stateless/02374_analyzer_array_join.sql +++ b/tests/queries/0_stateless/02374_analyzer_array_join.sql @@ -80,3 +80,6 @@ GROUP BY -- { echoOff } DROP TABLE test_table; + +select [1, 2] as arr, x from system.one array join arr as x; +select x + 1 as x from (select [number] as arr from numbers(2)) as s array join arr as x; From 9794a193cfb88d7a49b12b9a60986884bf3ebfda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Fri, 24 May 2024 
15:05:49 +0200 Subject: [PATCH 354/392] Rename aggregate_function_group_array_has_limit_size --- .../AggregateFunctionGroupArray.cpp | 11 ++++++----- src/Core/ServerSettings.h | 3 ++- src/Core/SettingsEnums.cpp | 5 +++++ src/Core/SettingsEnums.h | 8 ++++++++ .../configs/group_array_max_element_size.xml | 2 +- .../integration/test_group_array_element_size/test.py | 8 ++++---- 6 files changed, 26 insertions(+), 11 deletions(-) diff --git a/src/AggregateFunctions/AggregateFunctionGroupArray.cpp b/src/AggregateFunctions/AggregateFunctionGroupArray.cpp index d4fb7afcb78..c21b1d376d9 100644 --- a/src/AggregateFunctions/AggregateFunctionGroupArray.cpp +++ b/src/AggregateFunctions/AggregateFunctionGroupArray.cpp @@ -753,10 +753,11 @@ size_t getMaxArraySize() return 0xFFFFFF; } -bool hasLimitArraySize() +bool discardOnLimitReached() { if (auto context = Context::getGlobalContextInstance()) - return context->getServerSettings().aggregate_function_group_array_has_limit_size; + return context->getServerSettings().aggregate_function_group_array_action_when_limit_is_reached + == GroupArrayActionWhenLimitReached::DISCARD; return false; } @@ -767,7 +768,7 @@ AggregateFunctionPtr createAggregateFunctionGroupArray( { assertUnary(name, argument_types); - bool limit_size = hasLimitArraySize(); + bool has_limit = discardOnLimitReached(); UInt64 max_elems = getMaxArraySize(); if (parameters.empty()) @@ -784,14 +785,14 @@ AggregateFunctionPtr createAggregateFunctionGroupArray( (type == Field::Types::UInt64 && parameters[0].get() == 0)) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Parameter for aggregate function {} should be positive number", name); - limit_size = true; + has_limit = true; max_elems = parameters[0].get(); } else throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Incorrect number of parameters for aggregate function {}, should be 0 or 1", name); - if (!limit_size) + if (!has_limit) { if (Tlast) throw Exception(ErrorCodes::BAD_ARGUMENTS, "groupArrayLast make sense only with max_elems (groupArrayLast(max_elems)())"); diff --git a/src/Core/ServerSettings.h b/src/Core/ServerSettings.h index ea0b155b22d..45f235116ab 100644 --- a/src/Core/ServerSettings.h +++ b/src/Core/ServerSettings.h @@ -3,6 +3,7 @@ #include #include +#include namespace Poco::Util @@ -51,7 +52,7 @@ namespace DB M(UInt64, max_temporary_data_on_disk_size, 0, "The maximum amount of storage that could be used for external aggregation, joins or sorting., ", 0) \ M(String, temporary_data_in_cache, "", "Cache disk name for temporary data.", 0) \ M(UInt64, aggregate_function_group_array_max_element_size, 0xFFFFFF, "Max array element size in bytes for groupArray function. This limit is checked at serialization and help to avoid large state size.", 0) \ - M(Bool, aggregate_function_group_array_has_limit_size, false, "When the max array element size is exceeded, a `Too large array size` exception will be thrown by default. When set to true, no exception will be thrown, and the excess elements will be discarded.", 0) \ + M(GroupArrayActionWhenLimitReached, aggregate_function_group_array_action_when_limit_is_reached, GroupArrayActionWhenLimitReached::THROW, "Action to execute when max array element size is exceeded in groupArray: `throw` exception, or `discard` extra values", 0) \ M(UInt64, max_server_memory_usage, 0, "Maximum total memory usage of the server in bytes. Zero means unlimited.", 0) \ M(Double, max_server_memory_usage_to_ram_ratio, 0.9, "Same as max_server_memory_usage but in to RAM ratio. 
Allows to lower max memory on low-memory systems.", 0) \ M(UInt64, merges_mutations_memory_usage_soft_limit, 0, "Maximum total memory usage for merges and mutations in bytes. Zero means unlimited.", 0) \ diff --git a/src/Core/SettingsEnums.cpp b/src/Core/SettingsEnums.cpp index 0caf6e8d609..05985316566 100644 --- a/src/Core/SettingsEnums.cpp +++ b/src/Core/SettingsEnums.cpp @@ -229,4 +229,9 @@ IMPLEMENT_SETTING_ENUM(SQLSecurityType, ErrorCodes::BAD_ARGUMENTS, {{"DEFINER", SQLSecurityType::DEFINER}, {"INVOKER", SQLSecurityType::INVOKER}, {"NONE", SQLSecurityType::NONE}}) + +IMPLEMENT_SETTING_ENUM( + GroupArrayActionWhenLimitReached, + ErrorCodes::BAD_ARGUMENTS, + {{"throw", GroupArrayActionWhenLimitReached::THROW}, {"discard", GroupArrayActionWhenLimitReached::DISCARD}}) } diff --git a/src/Core/SettingsEnums.h b/src/Core/SettingsEnums.h index ab163ba96a3..575cd8700c8 100644 --- a/src/Core/SettingsEnums.h +++ b/src/Core/SettingsEnums.h @@ -370,4 +370,12 @@ DECLARE_SETTING_ENUM(SchemaInferenceMode) DECLARE_SETTING_ENUM_WITH_RENAME(DateTimeOverflowBehavior, FormatSettings::DateTimeOverflowBehavior) DECLARE_SETTING_ENUM(SQLSecurityType) + +enum class GroupArrayActionWhenLimitReached : uint8_t +{ + THROW, + DISCARD +}; +DECLARE_SETTING_ENUM(GroupArrayActionWhenLimitReached) + } diff --git a/tests/integration/test_group_array_element_size/configs/group_array_max_element_size.xml b/tests/integration/test_group_array_element_size/configs/group_array_max_element_size.xml index 80409d3e18b..32d5d131a44 100644 --- a/tests/integration/test_group_array_element_size/configs/group_array_max_element_size.xml +++ b/tests/integration/test_group_array_element_size/configs/group_array_max_element_size.xml @@ -1,4 +1,4 @@ 10 - false + throw diff --git a/tests/integration/test_group_array_element_size/test.py b/tests/integration/test_group_array_element_size/test.py index 1eb7647d734..90b2712ffbf 100644 --- a/tests/integration/test_group_array_element_size/test.py +++ b/tests/integration/test_group_array_element_size/test.py @@ -80,8 +80,8 @@ def test_limit_size(started_cluster): node2.replace_in_config( "/etc/clickhouse-server/config.d/group_array_max_element_size.xml", - "false", - "true", + "throw", + "discard", ) node2.restart_clickhouse() @@ -91,8 +91,8 @@ def test_limit_size(started_cluster): node2.replace_in_config( "/etc/clickhouse-server/config.d/group_array_max_element_size.xml", - "true", - "false", + "discard", + "throw", ) node2.restart_clickhouse() From 16fb2fc5616ae462c1f658f9765c82d935b456e4 Mon Sep 17 00:00:00 2001 From: avogar Date: Fri, 24 May 2024 13:13:19 +0000 Subject: [PATCH 355/392] Split tests 03039_dynamic_all_merge_algorithms to avoid timeouts --- ...9_dynamic_aggregating_merge_tree.reference | 32 +++++++++++++++ .../03039_dynamic_aggregating_merge_tree.sh | 40 +++++++++++++++++++ ...39_dynamic_collapsing_merge_tree.reference | 20 ++++++++++ .../03039_dynamic_collapsing_merge_tree.sh | 38 ++++++++++++++++++ ...039_dynamic_replacing_merge_tree.reference | 20 ++++++++++ .../03039_dynamic_replacing_merge_tree.sh | 39 ++++++++++++++++++ ...03039_dynamic_summing_merge_tree.reference | 32 +++++++++++++++ .../03039_dynamic_summing_merge_tree.sh | 40 +++++++++++++++++++ ..._versioned_collapsing_merge_tree.reference | 20 ++++++++++ ...dynamic_versioned_collapsing_merge_tree.sh | 38 ++++++++++++++++++ 10 files changed, 319 insertions(+) create mode 100644 tests/queries/0_stateless/03039_dynamic_aggregating_merge_tree.reference create mode 100755 
tests/queries/0_stateless/03039_dynamic_aggregating_merge_tree.sh create mode 100644 tests/queries/0_stateless/03039_dynamic_collapsing_merge_tree.reference create mode 100755 tests/queries/0_stateless/03039_dynamic_collapsing_merge_tree.sh create mode 100644 tests/queries/0_stateless/03039_dynamic_replacing_merge_tree.reference create mode 100755 tests/queries/0_stateless/03039_dynamic_replacing_merge_tree.sh create mode 100644 tests/queries/0_stateless/03039_dynamic_summing_merge_tree.reference create mode 100755 tests/queries/0_stateless/03039_dynamic_summing_merge_tree.sh create mode 100644 tests/queries/0_stateless/03039_dynamic_versioned_collapsing_merge_tree.reference create mode 100755 tests/queries/0_stateless/03039_dynamic_versioned_collapsing_merge_tree.sh diff --git a/tests/queries/0_stateless/03039_dynamic_aggregating_merge_tree.reference b/tests/queries/0_stateless/03039_dynamic_aggregating_merge_tree.reference new file mode 100644 index 00000000000..3c186fcc935 --- /dev/null +++ b/tests/queries/0_stateless/03039_dynamic_aggregating_merge_tree.reference @@ -0,0 +1,32 @@ +MergeTree compact + horizontal merge +100000 String +100000 UInt64 +200000 1 +50000 String +100000 UInt64 +100000 1 +50000 2 +MergeTree wide + horizontal merge +100000 String +100000 UInt64 +200000 1 +50000 String +100000 UInt64 +100000 1 +50000 2 +MergeTree compact + vertical merge +100000 String +100000 UInt64 +200000 1 +50000 String +100000 UInt64 +100000 1 +50000 2 +MergeTree wide + vertical merge +100000 String +100000 UInt64 +200000 1 +50000 String +100000 UInt64 +100000 1 +50000 2 diff --git a/tests/queries/0_stateless/03039_dynamic_aggregating_merge_tree.sh b/tests/queries/0_stateless/03039_dynamic_aggregating_merge_tree.sh new file mode 100755 index 00000000000..c433d409c7c --- /dev/null +++ b/tests/queries/0_stateless/03039_dynamic_aggregating_merge_tree.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env bash +# Tags: long + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# reset --log_comment +CLICKHOUSE_LOG_COMMENT= +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +# Fix some settings to avoid timeouts because of some settings randomization +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_merge_tree_settings --allow_experimental_dynamic_type=1 --index_granularity_bytes 10485760 --index_granularity 8128 --merge_max_block_size 8128" + +function test() +{ + $CH_CLIENT -q "create table test (id UInt64, sum AggregateFunction(sum, UInt64), d Dynamic) engine=AggregatingMergeTree() order by id settings $1;" + $CH_CLIENT -q "system stop merges test" + $CH_CLIENT -q "insert into test select number, sumState(1::UInt64), number from numbers(100000) group by number" + $CH_CLIENT -q "insert into test select number, sumState(1::UInt64), 'str_' || toString(number) from numbers(50000, 100000) group by number" + + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -q "select count(), sum from (select sumMerge(sum) as sum from test group by id, _part) group by sum order by sum, count()" + $CH_CLIENT -nm -q "system start merges test; optimize table test final" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -q "select count(), sum from (select sumMerge(sum) as sum from test group by id, _part) group by sum order by sum, count()" + $CH_CLIENT -q "drop table test" +} + +$CH_CLIENT -q "drop table if exists test;" + +echo "MergeTree compact + horizontal merge" +test "min_rows_for_wide_part=100000000000, min_bytes_for_wide_part=1000000000000, vertical_merge_algorithm_min_rows_to_activate=10000000000, vertical_merge_algorithm_min_columns_to_activate=100000000000" + +echo "MergeTree wide + horizontal merge" +test "min_rows_for_wide_part=1, min_bytes_for_wide_part=1,vertical_merge_algorithm_min_rows_to_activate=1000000000, vertical_merge_algorithm_min_columns_to_activate=1000000000000" + +echo "MergeTree compact + vertical merge" +test "min_rows_for_wide_part=100000000000, min_bytes_for_wide_part=1000000000000, vertical_merge_algorithm_min_rows_to_activate=1, vertical_merge_algorithm_min_columns_to_activate=1" + +echo "MergeTree wide + vertical merge" +test "min_rows_for_wide_part=1, min_bytes_for_wide_part=1, vertical_merge_algorithm_min_rows_to_activate=1, vertical_merge_algorithm_min_columns_to_activate=1" diff --git a/tests/queries/0_stateless/03039_dynamic_collapsing_merge_tree.reference b/tests/queries/0_stateless/03039_dynamic_collapsing_merge_tree.reference new file mode 100644 index 00000000000..fc293cc2ec8 --- /dev/null +++ b/tests/queries/0_stateless/03039_dynamic_collapsing_merge_tree.reference @@ -0,0 +1,20 @@ +MergeTree compact + horizontal merge +100000 String +100000 UInt64 +50000 String +50000 UInt64 +MergeTree wide + horizontal merge +100000 String +100000 UInt64 +50000 String +50000 UInt64 +MergeTree compact + vertical merge +100000 String +100000 UInt64 +50000 String +50000 UInt64 +MergeTree wide + vertical merge +100000 String +100000 UInt64 +50000 String +50000 UInt64 diff --git a/tests/queries/0_stateless/03039_dynamic_collapsing_merge_tree.sh b/tests/queries/0_stateless/03039_dynamic_collapsing_merge_tree.sh new file mode 100755 index 00000000000..881c9ec64cc --- /dev/null +++ b/tests/queries/0_stateless/03039_dynamic_collapsing_merge_tree.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash +# Tags: long + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# reset --log_comment +CLICKHOUSE_LOG_COMMENT= +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +# Fix some settings to avoid timeouts because of some settings randomization +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_merge_tree_settings --allow_experimental_dynamic_type=1 --index_granularity_bytes 10485760 --index_granularity 8128 --merge_max_block_size 8128" + +function test() +{ + $CH_CLIENT -q "create table test (id UInt64, sign Int8, d Dynamic) engine=CollapsingMergeTree(sign) order by id settings $1;" + $CH_CLIENT -q "system stop merges test" + $CH_CLIENT -q "insert into test select number, 1, number from numbers(100000)" + $CH_CLIENT -q "insert into test select number, -1, 'str_' || toString(number) from numbers(50000, 100000)" + + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -nm -q "system start merges test; optimize table test final" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -q "drop table test" +} + +$CH_CLIENT -q "drop table if exists test;" + +echo "MergeTree compact + horizontal merge" +test "min_rows_for_wide_part=100000000000, min_bytes_for_wide_part=1000000000000" + +echo "MergeTree wide + horizontal merge" +test "min_rows_for_wide_part=1, min_bytes_for_wide_part=1" + +echo "MergeTree compact + vertical merge" +test "min_rows_for_wide_part=100000000000, min_bytes_for_wide_part=1000000000000, vertical_merge_algorithm_min_rows_to_activate=1, vertical_merge_algorithm_min_columns_to_activate=1" + +echo "MergeTree wide + vertical merge" +test "min_rows_for_wide_part=1, min_bytes_for_wide_part=1, vertical_merge_algorithm_min_rows_to_activate=1, vertical_merge_algorithm_min_columns_to_activate=1" diff --git a/tests/queries/0_stateless/03039_dynamic_replacing_merge_tree.reference b/tests/queries/0_stateless/03039_dynamic_replacing_merge_tree.reference new file mode 100644 index 00000000000..132b9df6b26 --- /dev/null +++ b/tests/queries/0_stateless/03039_dynamic_replacing_merge_tree.reference @@ -0,0 +1,20 @@ +MergeTree compact + horizontal merge +100000 String +100000 UInt64 +50000 UInt64 +100000 String +MergeTree wide + horizontal merge +100000 String +100000 UInt64 +50000 UInt64 +100000 String +MergeTree compact + vertical merge +100000 String +100000 UInt64 +50000 UInt64 +100000 String +MergeTree wide + vertical merge +100000 String +100000 UInt64 +50000 UInt64 +100000 String diff --git a/tests/queries/0_stateless/03039_dynamic_replacing_merge_tree.sh b/tests/queries/0_stateless/03039_dynamic_replacing_merge_tree.sh new file mode 100755 index 00000000000..fc9039ac98c --- /dev/null +++ b/tests/queries/0_stateless/03039_dynamic_replacing_merge_tree.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash +# Tags: long + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# reset --log_comment +CLICKHOUSE_LOG_COMMENT= +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +# Fix some settings to avoid timeouts because of some settings randomization +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_merge_tree_settings --allow_experimental_dynamic_type=1 --index_granularity_bytes 10485760 --index_granularity 8128 --merge_max_block_size 8128" + + +function test() +{ + $CH_CLIENT -q "create table test (id UInt64, d Dynamic) engine=ReplacingMergeTree order by id settings $1;" + $CH_CLIENT -q "system stop merges test" + $CH_CLIENT -q "insert into test select number, number from numbers(100000)" + $CH_CLIENT -q "insert into test select number, 'str_' || toString(number) from numbers(50000, 100000)" + + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -nm -q "system start merges test; optimize table test final" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -q "drop table test" +} + +$CH_CLIENT -q "drop table if exists test;" + +echo "MergeTree compact + horizontal merge" +test "min_rows_for_wide_part=100000000000, min_bytes_for_wide_part=1000000000000, vertical_merge_algorithm_min_rows_to_activate=10000000000, vertical_merge_algorithm_min_columns_to_activate=100000000000" + +echo "MergeTree wide + horizontal merge" +test "min_rows_for_wide_part=1, min_bytes_for_wide_part=1,vertical_merge_algorithm_min_rows_to_activate=1000000000, vertical_merge_algorithm_min_columns_to_activate=1000000000000" + +echo "MergeTree compact + vertical merge" +test "min_rows_for_wide_part=100000000000, min_bytes_for_wide_part=1000000000000, vertical_merge_algorithm_min_rows_to_activate=1, vertical_merge_algorithm_min_columns_to_activate=1" + +echo "MergeTree wide + vertical merge" +test "min_rows_for_wide_part=1, min_bytes_for_wide_part=1, vertical_merge_algorithm_min_rows_to_activate=1, vertical_merge_algorithm_min_columns_to_activate=1" diff --git a/tests/queries/0_stateless/03039_dynamic_summing_merge_tree.reference b/tests/queries/0_stateless/03039_dynamic_summing_merge_tree.reference new file mode 100644 index 00000000000..3c186fcc935 --- /dev/null +++ b/tests/queries/0_stateless/03039_dynamic_summing_merge_tree.reference @@ -0,0 +1,32 @@ +MergeTree compact + horizontal merge +100000 String +100000 UInt64 +200000 1 +50000 String +100000 UInt64 +100000 1 +50000 2 +MergeTree wide + horizontal merge +100000 String +100000 UInt64 +200000 1 +50000 String +100000 UInt64 +100000 1 +50000 2 +MergeTree compact + vertical merge +100000 String +100000 UInt64 +200000 1 +50000 String +100000 UInt64 +100000 1 +50000 2 +MergeTree wide + vertical merge +100000 String +100000 UInt64 +200000 1 +50000 String +100000 UInt64 +100000 1 +50000 2 diff --git a/tests/queries/0_stateless/03039_dynamic_summing_merge_tree.sh b/tests/queries/0_stateless/03039_dynamic_summing_merge_tree.sh new file mode 100755 index 00000000000..f9da70e95ca --- /dev/null +++ b/tests/queries/0_stateless/03039_dynamic_summing_merge_tree.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env bash +# Tags: long + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# reset --log_comment +CLICKHOUSE_LOG_COMMENT= +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +# Fix some settings to avoid timeouts because of some settings randomization +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_merge_tree_settings --allow_experimental_dynamic_type=1 --index_granularity_bytes 10485760 --index_granularity 8128 --merge_max_block_size 8128" + +function test() +{ + $CH_CLIENT -q "create table test (id UInt64, sum UInt64, d Dynamic) engine=SummingMergeTree(sum) order by id settings $1;" + $CH_CLIENT -q "system stop merges test" + $CH_CLIENT -q "insert into test select number, 1, number from numbers(100000)" + $CH_CLIENT -q "insert into test select number, 1, 'str_' || toString(number) from numbers(50000, 100000)" + + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -q "select count(), sum from test group by sum order by sum, count()" + $CH_CLIENT -nm -q "system start merges test; optimize table test final" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -q "select count(), sum from test group by sum order by sum, count()" + $CH_CLIENT -q "drop table test" +} + +$CH_CLIENT -q "drop table if exists test;" + +echo "MergeTree compact + horizontal merge" +test "min_rows_for_wide_part=100000000000, min_bytes_for_wide_part=1000000000000, vertical_merge_algorithm_min_rows_to_activate=10000000000, vertical_merge_algorithm_min_columns_to_activate=100000000000" + +echo "MergeTree wide + horizontal merge" +test "min_rows_for_wide_part=1, min_bytes_for_wide_part=1,vertical_merge_algorithm_min_rows_to_activate=1000000000, vertical_merge_algorithm_min_columns_to_activate=1000000000000" + +echo "MergeTree compact + vertical merge" +test "min_rows_for_wide_part=100000000000, min_bytes_for_wide_part=1000000000000, vertical_merge_algorithm_min_rows_to_activate=1, vertical_merge_algorithm_min_columns_to_activate=1" + +echo "MergeTree wide + vertical merge" +test "min_rows_for_wide_part=1, min_bytes_for_wide_part=1, vertical_merge_algorithm_min_rows_to_activate=1, vertical_merge_algorithm_min_columns_to_activate=1" diff --git a/tests/queries/0_stateless/03039_dynamic_versioned_collapsing_merge_tree.reference b/tests/queries/0_stateless/03039_dynamic_versioned_collapsing_merge_tree.reference new file mode 100644 index 00000000000..cabb0fdefab --- /dev/null +++ b/tests/queries/0_stateless/03039_dynamic_versioned_collapsing_merge_tree.reference @@ -0,0 +1,20 @@ +MergeTree compact + horizontal merge +100000 String +100000 UInt64 +75000 String +75000 UInt64 +MergeTree wide + horizontal merge +100000 String +100000 UInt64 +75000 String +75000 UInt64 +MergeTree compact + vertical merge +100000 String +100000 UInt64 +75000 String +75000 UInt64 +MergeTree wide + vertical merge +100000 String +100000 UInt64 +75000 String +75000 UInt64 diff --git a/tests/queries/0_stateless/03039_dynamic_versioned_collapsing_merge_tree.sh b/tests/queries/0_stateless/03039_dynamic_versioned_collapsing_merge_tree.sh new file mode 100755 index 00000000000..ca313307a6d --- /dev/null +++ b/tests/queries/0_stateless/03039_dynamic_versioned_collapsing_merge_tree.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash +# Tags: long + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# reset --log_comment +CLICKHOUSE_LOG_COMMENT= +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +# Fix some settings to avoid timeouts because of some settings randomization +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_merge_tree_settings --allow_experimental_dynamic_type=1 --index_granularity_bytes 10485760 --index_granularity 8128 --merge_max_block_size 8128" + +function test() +{ + $CH_CLIENT -q "create table test (id UInt64, sign Int8, version UInt8, d Dynamic) engine=VersionedCollapsingMergeTree(sign, version) order by id settings $1;" + $CH_CLIENT -q "system stop merges test" + $CH_CLIENT -q "insert into test select number, 1, 1, number from numbers(100000)" + $CH_CLIENT -q "insert into test select number, -1, number >= 75000 ? 2 : 1, 'str_' || toString(number) from numbers(50000, 100000)" + + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -nm -q "system start merges test; optimize table test final" + $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" + $CH_CLIENT -q "drop table test" +} + +$CH_CLIENT -q "drop table if exists test;" + +echo "MergeTree compact + horizontal merge" +test "min_rows_for_wide_part=100000000000, min_bytes_for_wide_part=1000000000000" + +echo "MergeTree wide + horizontal merge" +test "min_rows_for_wide_part=1, min_bytes_for_wide_part=1" + +echo "MergeTree compact + vertical merge" +test "min_rows_for_wide_part=100000000000, min_bytes_for_wide_part=1000000000000, vertical_merge_algorithm_min_rows_to_activate=1, vertical_merge_algorithm_min_columns_to_activate=1" + +echo "MergeTree wide + vertical merge" +test "min_rows_for_wide_part=1, min_bytes_for_wide_part=1, vertical_merge_algorithm_min_rows_to_activate=1, vertical_merge_algorithm_min_columns_to_activate=1" From 09750cb83b0ed72c5527aaf6ab9211203aa6b7f8 Mon Sep 17 00:00:00 2001 From: avogar Date: Fri, 24 May 2024 13:14:02 +0000 Subject: [PATCH 356/392] Delete old tests --- ...9_dynamic_all_merge_algorithms_1.reference | 88 ------------------- .../03039_dynamic_all_merge_algorithms_1.sh | 65 -------------- ...9_dynamic_all_merge_algorithms_2.reference | 44 ---------- .../03039_dynamic_all_merge_algorithms_2.sh | 50 ----------- 4 files changed, 247 deletions(-) delete mode 100644 tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_1.reference delete mode 100755 tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_1.sh delete mode 100644 tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_2.reference delete mode 100755 tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_2.sh diff --git a/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_1.reference b/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_1.reference deleted file mode 100644 index 6c69b81c183..00000000000 --- a/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_1.reference +++ /dev/null @@ -1,88 +0,0 @@ -MergeTree compact + horizontal merge -ReplacingMergeTree -100000 String -100000 UInt64 -50000 UInt64 -100000 String -SummingMergeTree -100000 String -100000 UInt64 -200000 1 -50000 String -100000 UInt64 -100000 1 -50000 2 -AggregatingMergeTree -100000 String -100000 UInt64 -200000 1 -50000 String -100000 UInt64 -100000 1 -50000 2 -MergeTree wide + horizontal merge -ReplacingMergeTree -100000 String -100000 UInt64 -50000 UInt64 -100000 String -SummingMergeTree -100000 String -100000 UInt64 -200000 1 -50000 String -100000 UInt64 -100000 1 -50000 2 -AggregatingMergeTree -100000 String -100000 UInt64 -200000 1 
-50000 String -100000 UInt64 -100000 1 -50000 2 -MergeTree compact + vertical merge -ReplacingMergeTree -100000 String -100000 UInt64 -50000 UInt64 -100000 String -SummingMergeTree -100000 String -100000 UInt64 -200000 1 -50000 String -100000 UInt64 -100000 1 -50000 2 -AggregatingMergeTree -100000 String -100000 UInt64 -200000 1 -50000 String -100000 UInt64 -100000 1 -50000 2 -MergeTree wide + vertical merge -ReplacingMergeTree -100000 String -100000 UInt64 -50000 UInt64 -100000 String -SummingMergeTree -100000 String -100000 UInt64 -200000 1 -50000 String -100000 UInt64 -100000 1 -50000 2 -AggregatingMergeTree -100000 String -100000 UInt64 -200000 1 -50000 String -100000 UInt64 -100000 1 -50000 2 diff --git a/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_1.sh b/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_1.sh deleted file mode 100755 index 9cfd2294c8d..00000000000 --- a/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_1.sh +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/env bash -# Tags: long - -CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) -# reset --log_comment -CLICKHOUSE_LOG_COMMENT= -# shellcheck source=../shell_config.sh -. "$CUR_DIR"/../shell_config.sh - -CH_CLIENT="$CLICKHOUSE_CLIENT --allow_merge_tree_settings --allow_experimental_dynamic_type=1 --optimize_aggregation_in_order 0 --index_granularity_bytes 10485760 --index_granularity 8128 --merge_max_block_size 8128" - - -function test() -{ - echo "ReplacingMergeTree" - $CH_CLIENT -q "create table test (id UInt64, d Dynamic) engine=ReplacingMergeTree order by id settings $1;" - $CH_CLIENT -q "system stop merges test" - $CH_CLIENT -q "insert into test select number, number from numbers(100000)" - $CH_CLIENT -q "insert into test select number, 'str_' || toString(number) from numbers(50000, 100000)" - - $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" - $CH_CLIENT -nm -q "system start merges test; optimize table test final" - $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" - $CH_CLIENT -q "drop table test" - - echo "SummingMergeTree" - $CH_CLIENT -q "create table test (id UInt64, sum UInt64, d Dynamic) engine=SummingMergeTree(sum) order by id settings $1;" - $CH_CLIENT -q "system stop merges test" - $CH_CLIENT -q "insert into test select number, 1, number from numbers(100000)" - $CH_CLIENT -q "insert into test select number, 1, 'str_' || toString(number) from numbers(50000, 100000)" - - $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" - $CH_CLIENT -q "select count(), sum from test group by sum order by sum, count()" - $CH_CLIENT -nm -q "system start merges test; optimize table test final" - $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" - $CH_CLIENT -q "select count(), sum from test group by sum order by sum, count()" - $CH_CLIENT -q "drop table test" - - echo "AggregatingMergeTree" - $CH_CLIENT -q "create table test (id UInt64, sum AggregateFunction(sum, UInt64), d Dynamic) engine=AggregatingMergeTree() order by id settings $1;" - $CH_CLIENT -q "system stop merges test" - $CH_CLIENT -q "insert into test select number, sumState(1::UInt64), number from numbers(100000) group by number" - $CH_CLIENT -q "insert into test select number, sumState(1::UInt64), 'str_' || toString(number) from numbers(50000, 100000) group by number" - - 
$CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" - $CH_CLIENT -q "select count(), sum from (select sumMerge(sum) as sum from test group by id, _part) group by sum order by sum, count()" - $CH_CLIENT -nm -q "system start merges test; optimize table test final" - $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" - $CH_CLIENT -q "select count(), sum from (select sumMerge(sum) as sum from test group by id, _part) group by sum order by sum, count()" - $CH_CLIENT -q "drop table test" -} - -$CH_CLIENT -q "drop table if exists test;" - -echo "MergeTree compact + horizontal merge" -test "min_rows_for_wide_part=100000000000, min_bytes_for_wide_part=1000000000000, vertical_merge_algorithm_min_rows_to_activate=10000000000, vertical_merge_algorithm_min_columns_to_activate=100000000000" - -echo "MergeTree wide + horizontal merge" -test "min_rows_for_wide_part=1, min_bytes_for_wide_part=1,vertical_merge_algorithm_min_rows_to_activate=1000000000, vertical_merge_algorithm_min_columns_to_activate=1000000000000" - -echo "MergeTree compact + vertical merge" -test "min_rows_for_wide_part=100000000000, min_bytes_for_wide_part=1000000000000, vertical_merge_algorithm_min_rows_to_activate=1, vertical_merge_algorithm_min_columns_to_activate=1" - -echo "MergeTree wide + vertical merge" -test "min_rows_for_wide_part=1, min_bytes_for_wide_part=1, vertical_merge_algorithm_min_rows_to_activate=1, vertical_merge_algorithm_min_columns_to_activate=1" diff --git a/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_2.reference b/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_2.reference deleted file mode 100644 index af6c7d8d567..00000000000 --- a/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_2.reference +++ /dev/null @@ -1,44 +0,0 @@ -MergeTree compact + horizontal merge -CollapsingMergeTree -100000 String -100000 UInt64 -50000 String -50000 UInt64 -VersionedCollapsingMergeTree -100000 String -100000 UInt64 -75000 String -75000 UInt64 -MergeTree wide + horizontal merge -CollapsingMergeTree -100000 String -100000 UInt64 -50000 String -50000 UInt64 -VersionedCollapsingMergeTree -100000 String -100000 UInt64 -75000 String -75000 UInt64 -MergeTree compact + vertical merge -CollapsingMergeTree -100000 String -100000 UInt64 -50000 String -50000 UInt64 -VersionedCollapsingMergeTree -100000 String -100000 UInt64 -75000 String -75000 UInt64 -MergeTree wide + vertical merge -CollapsingMergeTree -100000 String -100000 UInt64 -50000 String -50000 UInt64 -VersionedCollapsingMergeTree -100000 String -100000 UInt64 -75000 String -75000 UInt64 diff --git a/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_2.sh b/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_2.sh deleted file mode 100755 index 02362012960..00000000000 --- a/tests/queries/0_stateless/03039_dynamic_all_merge_algorithms_2.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env bash -# Tags: long - -CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) -# reset --log_comment -CLICKHOUSE_LOG_COMMENT= -# shellcheck source=../shell_config.sh -. 
"$CUR_DIR"/../shell_config.sh - -CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_dynamic_type=1 --index_granularity_bytes 10485760 --index_granularity 8128 --merge_max_block_size 8128" - - -function test() -{ - echo "CollapsingMergeTree" - $CH_CLIENT -q "create table test (id UInt64, sign Int8, d Dynamic) engine=CollapsingMergeTree(sign) order by id settings $1;" - $CH_CLIENT -q "system stop merges test" - $CH_CLIENT -q "insert into test select number, 1, number from numbers(100000)" - $CH_CLIENT -q "insert into test select number, -1, 'str_' || toString(number) from numbers(50000, 100000)" - - $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" - $CH_CLIENT -nm -q "system start merges test; optimize table test final" - $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" - $CH_CLIENT -q "drop table test" - - echo "VersionedCollapsingMergeTree" - $CH_CLIENT -q "create table test (id UInt64, sign Int8, version UInt8, d Dynamic) engine=VersionedCollapsingMergeTree(sign, version) order by id settings $1;" - $CH_CLIENT -q "system stop merges test" - $CH_CLIENT -q "insert into test select number, 1, 1, number from numbers(100000)" - $CH_CLIENT -q "insert into test select number, -1, number >= 75000 ? 2 : 1, 'str_' || toString(number) from numbers(50000, 100000)" - - $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" - $CH_CLIENT -nm -q "system start merges test; optimize table test final" - $CH_CLIENT -q "select count(), dynamicType(d) from test group by dynamicType(d) order by count(), dynamicType(d)" - $CH_CLIENT -q "drop table test" -} - -$CH_CLIENT -q "drop table if exists test;" - -echo "MergeTree compact + horizontal merge" -test "min_rows_for_wide_part=100000000000, min_bytes_for_wide_part=1000000000000" - -echo "MergeTree wide + horizontal merge" -test "min_rows_for_wide_part=1, min_bytes_for_wide_part=1" - -echo "MergeTree compact + vertical merge" -test "min_rows_for_wide_part=100000000000, min_bytes_for_wide_part=1000000000000, vertical_merge_algorithm_min_rows_to_activate=1, vertical_merge_algorithm_min_columns_to_activate=1" - -echo "MergeTree wide + vertical merge" -test "min_rows_for_wide_part=1, min_bytes_for_wide_part=1, vertical_merge_algorithm_min_rows_to_activate=1, vertical_merge_algorithm_min_columns_to_activate=1" From cb37b098ef23b0575b987edf35db2276bdb02a69 Mon Sep 17 00:00:00 2001 From: Max K Date: Fri, 24 May 2024 16:17:25 +0200 Subject: [PATCH 357/392] CI: add secrets to reusable stage wf yml --- .github/workflows/reusable_test_stage.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/reusable_test_stage.yml b/.github/workflows/reusable_test_stage.yml index d7bd55fab43..8926b43d372 100644 --- a/.github/workflows/reusable_test_stage.yml +++ b/.github/workflows/reusable_test_stage.yml @@ -10,6 +10,10 @@ name: StageWF description: ci data type: string required: true + secrets: + secret_envs: + description: if given, it's passed to the environments + required: false jobs: s: @@ -23,3 +27,5 @@ jobs: test_name: ${{ matrix.job_name_and_runner_type.job_name }} runner_type: ${{ matrix.job_name_and_runner_type.runner_type }} data: ${{ inputs.data }} + secrets: + secret_envs: ${{ secrets.secret_envs }} From 4fba9a5c3c3e79bc4b0174410057206b266eb052 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Fri, 24 May 2024 14:35:45 +0000 Subject: [PATCH 358/392] Cleanup. 
--- src/Analyzer/ArrayJoinNode.cpp | 16 ---- src/Analyzer/ColumnNode.cpp | 7 +- src/Analyzer/Passes/QueryAnalysisPass.cpp | 88 +------------------ src/Analyzer/QueryTreeBuilder.cpp | 4 +- src/Analyzer/createUniqueTableAliases.cpp | 31 ------- src/Parsers/ASTTablesInSelectQuery.cpp | 9 -- src/Parsers/ASTTablesInSelectQuery.h | 4 - src/Parsers/ParserTablesInSelectQuery.cpp | 4 - .../QueryPlan/DistributedCreateLocalPlan.cpp | 8 -- 9 files changed, 6 insertions(+), 165 deletions(-) diff --git a/src/Analyzer/ArrayJoinNode.cpp b/src/Analyzer/ArrayJoinNode.cpp index 37c198f8472..27d7229d46a 100644 --- a/src/Analyzer/ArrayJoinNode.cpp +++ b/src/Analyzer/ArrayJoinNode.cpp @@ -55,8 +55,6 @@ ASTPtr ArrayJoinNode::toASTImpl(const ConvertToASTOptions & options) const auto array_join_ast = std::make_shared(); array_join_ast->kind = is_left ? ASTArrayJoin::Kind::Left : ASTArrayJoin::Kind::Inner; - // array_join_ast->setAlias(getAlias()); - auto array_join_expressions_ast = std::make_shared(); const auto & array_join_expressions = getJoinExpressions().getNodes(); @@ -70,21 +68,7 @@ ASTPtr ArrayJoinNode::toASTImpl(const ConvertToASTOptions & options) const else array_join_expression_ast = array_join_expression->toAST(options); - // QueryTreeNodePtr column_source; - // if (column_node) - // column_source = column_node->getColumnSourceOrNull(); - - // if (column_source && column_source->hasAlias()) - // { - // const auto & column_alias = column_node->getAlias(); - // const auto & name_or_alias = column_alias.empty() ? column_node->getColumnName() : column_alias; - - // if (!name_or_alias.starts_with("__")) - // array_join_expression_ast->setAlias(fmt::format("{}.{}", column_source->getAlias(), name_or_alias)); - // } - // else array_join_expression_ast->setAlias(array_join_expression->getAlias()); - array_join_expressions_ast->children.push_back(std::move(array_join_expression_ast)); } diff --git a/src/Analyzer/ColumnNode.cpp b/src/Analyzer/ColumnNode.cpp index d12eac68ab4..2b514a85121 100644 --- a/src/Analyzer/ColumnNode.cpp +++ b/src/Analyzer/ColumnNode.cpp @@ -103,15 +103,10 @@ ASTPtr ColumnNode::toASTImpl(const ConvertToASTOptions & options) const if (column_source && options.fully_qualified_identifiers) { auto node_type = column_source->getNodeType(); - - // if (node_type == QueryTreeNodeType::ARRAY_JOIN && column_source->hasAlias()) - // return std::make_shared(std::string(fmt::format("{}.{}", column_source->getAlias(), column.name))); - if (node_type == QueryTreeNodeType::TABLE || node_type == QueryTreeNodeType::TABLE_FUNCTION || node_type == QueryTreeNodeType::QUERY || - node_type == QueryTreeNodeType::UNION)// || - //node_type == QueryTreeNodeType::ARRAY_JOIN) + node_type == QueryTreeNodeType::UNION) { if (column_source->hasAlias()) { diff --git a/src/Analyzer/Passes/QueryAnalysisPass.cpp b/src/Analyzer/Passes/QueryAnalysisPass.cpp index 871c3842de0..a5992148b14 100644 --- a/src/Analyzer/Passes/QueryAnalysisPass.cpp +++ b/src/Analyzer/Passes/QueryAnalysisPass.cpp @@ -607,6 +607,8 @@ struct ScopeAliases std::unordered_set nodes_with_duplicated_aliases; std::vector cloned_nodes_with_duplicated_aliases; + /// Names which are aliases from ARRAY JOIN. + /// This is needed to properly qualify columns from matchers and avoid name collision. 
std::unordered_set array_join_aliases; std::unordered_map & getAliasMap(IdentifierLookupContext lookup_context) @@ -1070,25 +1072,10 @@ public: void visitImpl(QueryTreeNodePtr & node) { updateAliasesIfNeeded(node, false /*is_lambda_node*/); - - // if (auto * array_join_node = node->as()) - // { - // for (const auto & elem : array_join_node->getJoinExpressions()) - // { - // for (auto & child : elem->getChildren()) - // { - // // std::cerr << "<<<<<<<<<< " << child->dumpTree() << std::endl; - // visit(child); - // } - // } - // } } bool needChildVisit(const QueryTreeNodePtr &, const QueryTreeNodePtr & child) { - // if (parent->getNodeType() == QueryTreeNodeType::ARRAY_JOIN) - // return false; - if (auto * lambda_node = child->as()) { updateAliasesIfNeeded(child, true /*is_lambda_node*/); @@ -1131,8 +1118,6 @@ private: if (node->getNodeType() == QueryTreeNodeType::WINDOW) return; - // std::cerr << ">>>>>>>>>> " << node->dumpTree() << std::endl; - const auto & alias = node->getAlias(); if (is_lambda_node) @@ -2926,7 +2911,6 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromAliases(const Identifier IdentifierResolveSettings identifier_resolve_settings) { const auto & identifier_bind_part = identifier_lookup.identifier.front(); - // std::cerr << "tryResolveIdentifierFromAliases " << identifier_lookup.dump() << std::endl; auto * it = scope.aliases.find(identifier_lookup, ScopeAliases::FindOption::FIRST_NAME); if (it == nullptr) @@ -2955,7 +2939,6 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromAliases(const Identifier } auto node_type = alias_node->getNodeType(); - // std::cerr << "tryResolveIdentifierFromAliases 1.5 \n" << alias_node->dumpTree() << std::endl; /// Resolve expression if necessary if (node_type == QueryTreeNodeType::IDENTIFIER) @@ -2964,7 +2947,6 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromAliases(const Identifier auto & alias_identifier_node = alias_node->as(); auto identifier = alias_identifier_node.getIdentifier(); - // std::cerr << "tryResolveIdentifierFromAliases 2 " << identifier.getFullName() << std::endl; auto lookup_result = tryResolveIdentifier(IdentifierLookup{identifier, identifier_lookup.lookup_context}, scope, identifier_resolve_settings); if (!lookup_result.resolved_identifier) { @@ -3141,7 +3123,6 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromStorage( size_t identifier_column_qualifier_parts, bool can_be_not_found) { - // std::cerr << "tryResolveIdentifierFromStorage " << identifier.getFullName() << std::endl; auto identifier_without_column_qualifier = identifier; identifier_without_column_qualifier.popFirst(identifier_column_qualifier_parts); @@ -3284,7 +3265,6 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromStorage( { auto qualified_identifier_with_removed_part = qualified_identifier; qualified_identifier_with_removed_part.popFirst(); - // std::cerr << "tryResolveIdentifierFromStorage qualified_identifier_with_removed_part" << qualified_identifier_with_removed_part.getFullName() << std::endl; if (qualified_identifier_with_removed_part.empty()) break; @@ -3818,8 +3798,6 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveExpressionFromArrayJoinExpressions(con const QueryTreeNodePtr & table_expression_node, IdentifierResolveScope & scope) { - // std::cerr << "tryResolveExpressionFromArrayJoinExpressions " << scope.dump() << std::endl; - const auto & array_join_node = table_expression_node->as(); const auto & array_join_column_expressions_list = array_join_node.getJoinExpressions(); const auto & 
array_join_column_expressions_nodes = array_join_column_expressions_list.getNodes(); @@ -3897,14 +3875,9 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromArrayJoin(const Identifi const QueryTreeNodePtr & table_expression_node, IdentifierResolveScope & scope) { - // std::cerr << "tryResolveIdentifierFromArrayJoin " << identifier_lookup.identifier.getFullName() << std::endl; - const auto & from_array_join_node = table_expression_node->as(); auto resolved_identifier = tryResolveIdentifierFromJoinTreeNode(identifier_lookup, from_array_join_node.getTableExpression(), scope); - // std::cerr << "tryResolveIdentifierFromArrayJoin 2 " << scope.table_expressions_in_resolve_process.contains(table_expression_node.get()) - // << ' ' << identifier_lookup.dump() << ' ' << (resolved_identifier ? resolved_identifier->dumpTree() : "not resolved ") << std::endl; - if (scope.table_expressions_in_resolve_process.contains(table_expression_node.get()) || !identifier_lookup.isExpressionLookup()) return resolved_identifier; @@ -3919,8 +3892,6 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromArrayJoin(const Identifi for (const auto & array_join_column_expression : array_join_column_expressions_nodes) { auto & array_join_column_expression_typed = array_join_column_expression->as(); - // std::cerr << "========== " << array_join_column_expression->dumpTree() << std::endl; - // std::cerr << "========== " << identifier_lookup.identifier.getFullName() << ' ' << from_array_join_node.getAlias() << ' ' << array_join_column_expression_typed.getAlias() << std::endl; IdentifierView identifier_view(identifier_lookup.identifier); @@ -3955,15 +3926,6 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromArrayJoin(const Identifi if (compound_expr) return compound_expr; - - // const auto & parts = identifier_lookup.identifier.getParts(); - // if (array_join_column_expression_typed.getAlias() == identifier_lookup.identifier.getFullName() || - // (parts.size() == 2 && parts.front() == from_array_join_node.getAlias() && parts.back() == array_join_column_expression_typed.getAlias())) - // { - // auto array_join_column = std::make_shared(array_join_column_expression_typed.getColumn(), - // array_join_column_expression_typed.getColumnSource()); - // return array_join_column; - // } } if (!resolved_identifier) @@ -3980,8 +3942,6 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromJoinTreeNode(const Ident const QueryTreeNodePtr & join_tree_node, IdentifierResolveScope & scope) { - // std::cerr << "tryResolveIdentifierFromJoinTreeNode " << identifier_lookup.identifier.getFullName() << std::endl; - auto join_tree_node_type = join_tree_node->getNodeType(); switch (join_tree_node_type) @@ -4185,8 +4145,6 @@ IdentifierResolveResult QueryAnalyzer::tryResolveIdentifier(const IdentifierLook IdentifierResolveScope & scope, IdentifierResolveSettings identifier_resolve_settings) { - // std::cerr << "tryResolveIdentifier " << identifier_lookup.identifier.getFullName() << std::endl; - auto it = scope.identifier_lookup_to_resolve_state.find(identifier_lookup); if (it != scope.identifier_lookup_to_resolve_state.end()) { @@ -6363,8 +6321,6 @@ ProjectionNames QueryAnalyzer::resolveExpressionNode(QueryTreeNodePtr & node, Id { checkStackSize(); - // std::cerr << "resolveExpressionNode " << ignore_alias << "\n" << node->dumpTree() << std::endl; - auto resolved_expression_it = resolved_expressions.find(node); if (resolved_expression_it != resolved_expressions.end()) { @@ -6381,7 +6337,6 @@ ProjectionNames 
QueryAnalyzer::resolveExpressionNode(QueryTreeNodePtr & node, Id evaluateScalarSubqueryIfNeeded(node, subquery_scope); } - // std::cerr << "resolveExpressionNode taken from cache \n" << node->dumpTree() << "\n PN " << (resolved_expression_it->second.empty() ? "" : resolved_expression_it->second.front()) << std::endl; return resolved_expression_it->second; } @@ -6392,10 +6347,7 @@ ProjectionNames QueryAnalyzer::resolveExpressionNode(QueryTreeNodePtr & node, Id { auto projection_name_it = node_to_projection_name.find(node); if (projection_name_it != node_to_projection_name.end()) - { - // std::cerr << "resolveExpressionNode taken projection name from map : " << projection_name_it->second << " for \n" << node->dumpTree() << std::endl; result_projection_names.push_back(projection_name_it->second); - } } else { @@ -7651,36 +7603,25 @@ void QueryAnalyzer::resolveArrayJoin(QueryTreeNodePtr & array_join_node, Identif for (auto & array_join_expression : array_join_nodes) { auto array_join_expression_alias = array_join_expression->getAlias(); - // if (!array_join_expression_alias.empty() && scope.aliases.alias_name_to_expression_node->contains(array_join_expression_alias)) - // throw Exception(ErrorCodes::MULTIPLE_EXPRESSIONS_FOR_ALIAS, - // "ARRAY JOIN expression {} with duplicate alias {}. In scope {}", - // array_join_expression->formatASTForErrorMessage(), - // array_join_expression_alias, - // scope.scope_node->formatASTForErrorMessage()); - - /// Add array join expression into scope for (const auto & elem : array_join_nodes) { if (elem->hasAlias()) scope.aliases.array_join_aliases.insert(elem->getAlias()); + for (auto & child : elem->getChildren()) { - //std::cerr << "<<<<<<<<<< " << child->dumpTree() << std::endl; if (child) expressions_visitor.visit(child); - //visit(child); } } - // expressions_visitor.visit(array_join_expression); - std::string identifier_full_name; if (auto * identifier_node = array_join_expression->as()) identifier_full_name = identifier_node->getIdentifier().getFullName(); - resolveExpressionNode(array_join_expression, scope, false /*allow_lambda_expression*/, false /*allow_table_expression*/, true); + resolveExpressionNode(array_join_expression, scope, false /*allow_lambda_expression*/, false /*allow_table_expression*/, true /*ignore_alias*/); auto process_array_join_expression = [&](QueryTreeNodePtr & expression) { @@ -7747,27 +7688,7 @@ void QueryAnalyzer::resolveArrayJoin(QueryTreeNodePtr & array_join_node, Identif } } - /** Allow to resolve ARRAY JOIN columns from aliases with types after ARRAY JOIN only after ARRAY JOIN expression list is resolved, because - * during resolution of ARRAY JOIN expression list we must use column type before ARRAY JOIN. - * - * Example: SELECT id, value_element FROM test_table ARRAY JOIN [[1,2,3]] AS value_element, value_element AS value - * It is expected that `value_element AS value` expression inside ARRAY JOIN expression list will be - * resolved as `value_element` expression with type before ARRAY JOIN. - * And it is expected that `value_element` inside projection expression list will be resolved as `value_element` expression - * with type after ARRAY JOIN. 
- */ array_join_nodes = std::move(array_join_column_expressions); - // for (auto & array_join_column_expression : array_join_nodes) - // { - // auto it = scope.aliases.alias_name_to_expression_node->find(array_join_column_expression->getAlias()); - // if (it != scope.aliases.alias_name_to_expression_node->end()) - // { - // auto & array_join_column_expression_typed = array_join_column_expression->as(); - // auto array_join_column = std::make_shared(array_join_column_expression_typed.getColumn(), - // array_join_column_expression_typed.getColumnSource()); - // it->second = std::move(array_join_column); - // } - // } } void QueryAnalyzer::checkDuplicateTableNamesOrAlias(const QueryTreeNodePtr & join_node, QueryTreeNodePtr & left_table_expr, QueryTreeNodePtr & right_table_expr, IdentifierResolveScope & scope) @@ -8552,7 +8473,6 @@ QueryAnalysisPass::QueryAnalysisPass(bool only_analyze_) : only_analyze(only_ana void QueryAnalysisPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) { - // std::cerr << ".... qap\n" << query_tree_node->dumpTree() << std::endl; QueryAnalyzer analyzer(only_analyze); analyzer.resolve(query_tree_node, table_expression, context); createUniqueTableAliases(query_tree_node, table_expression, context); diff --git a/src/Analyzer/QueryTreeBuilder.cpp b/src/Analyzer/QueryTreeBuilder.cpp index 02d742f5e49..6a5db4bc1de 100644 --- a/src/Analyzer/QueryTreeBuilder.cpp +++ b/src/Analyzer/QueryTreeBuilder.cpp @@ -957,7 +957,6 @@ QueryTreeNodePtr QueryTreeBuilder::buildJoinTree(const ASTPtr & tables_in_select auto array_join_expressions_list = buildExpressionList(array_join_expression.expression_list, context); auto array_join_node = std::make_shared(std::move(last_table_expression), std::move(array_join_expressions_list), is_left_array_join); - // array_join_node->setAlias(array_join_expression.tryGetAlias()); /** Original AST is not set because it will contain only array join part and does * not include left table expression. @@ -1046,8 +1045,7 @@ ColumnTransformersNodes QueryTreeBuilder::buildColumnTransformers(const ASTPtr & QueryTreeNodePtr buildQueryTree(ASTPtr query, ContextPtr context) { QueryTreeBuilder builder(std::move(query), context); - auto qt = builder.getQueryTreeNode(); - return qt; + return builder.getQueryTreeNode(); } } diff --git a/src/Analyzer/createUniqueTableAliases.cpp b/src/Analyzer/createUniqueTableAliases.cpp index 30b8c0a433b..b36ba1cafaa 100644 --- a/src/Analyzer/createUniqueTableAliases.cpp +++ b/src/Analyzer/createUniqueTableAliases.cpp @@ -61,37 +61,6 @@ public: node->setAlias(alias); } - if (auto * array_join = node->as()) - { - //size_t counter = 0; - for (auto & column : array_join->getJoinExpressions()) - { - if (auto * column_node = column->as()) - { - if (!column_node->hasAlias()) - column_node->setAlias(column_node->getColumnName()); - } - } - } - - // if (auto * array_join = node->as()) - // { - // for (auto & column : array_join->getJoinExpressions()) - // { - // if (auto * column_node = column->as()) - // { - // const auto & column_alias = column_node->getAlias(); - // const auto & name_or_alias = column_alias.empty() ? 
column_node->getColumnName() : column_alias; - - // if (!name_or_alias.starts_with("__")) - // { - - // column_node->setAlias(fmt::format("{}.{}", alias, name_or_alias)); - // } - // } - // } - // } - break; } default: diff --git a/src/Parsers/ASTTablesInSelectQuery.cpp b/src/Parsers/ASTTablesInSelectQuery.cpp index b4058a0950d..e782bad797e 100644 --- a/src/Parsers/ASTTablesInSelectQuery.cpp +++ b/src/Parsers/ASTTablesInSelectQuery.cpp @@ -247,12 +247,6 @@ void ASTTableJoin::formatImpl(const FormatSettings & settings, FormatState & sta formatImplAfterTable(settings, state, frame); } -// static void writeAlias(const String & name, const ASTWithAlias::FormatSettings & settings) -// { -// settings.ostr << (settings.hilite ? IAST::hilite_keyword : "") << " AS " << (settings.hilite ? IAST::hilite_alias : ""); -// settings.writeIdentifier(name); -// settings.ostr << (settings.hilite ? IAST::hilite_none : ""); -// } void ASTArrayJoin::formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const { @@ -264,9 +258,6 @@ void ASTArrayJoin::formatImpl(const FormatSettings & settings, FormatState & sta << indent_str << (kind == Kind::Left ? "LEFT " : "") << "ARRAY JOIN" << (settings.hilite ? hilite_none : ""); - // if (!alias.empty()) - // writeAlias(alias, settings); - settings.one_line ? expression_list->formatImpl(settings, state, frame) : expression_list->as().formatImplMultiline(settings, state, frame); diff --git a/src/Parsers/ASTTablesInSelectQuery.h b/src/Parsers/ASTTablesInSelectQuery.h index 212436b0d9e..f3f329ca2b6 100644 --- a/src/Parsers/ASTTablesInSelectQuery.h +++ b/src/Parsers/ASTTablesInSelectQuery.h @@ -95,10 +95,6 @@ struct ASTArrayJoin : public IAST /// List of array or nested names to JOIN, possible with aliases. ASTPtr expression_list; - // String alias; - - // String tryGetAlias() const override { return alias; } - // void setAlias(const String & to) override { alias = to; } using IAST::IAST; String getID(char) const override { return "ArrayJoin"; } diff --git a/src/Parsers/ParserTablesInSelectQuery.cpp b/src/Parsers/ParserTablesInSelectQuery.cpp index c96b6c1584d..b4d48ae67e9 100644 --- a/src/Parsers/ParserTablesInSelectQuery.cpp +++ b/src/Parsers/ParserTablesInSelectQuery.cpp @@ -98,10 +98,6 @@ bool ParserArrayJoin::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) if (!has_array_join) return false; - // ASTPtr alias_node; - // if (ParserAlias(false).parse(pos, alias_node, expected)) - // tryGetIdentifierNameInto(alias_node, res->alias); - if (!ParserExpressionList(false).parse(pos, res->expression_list, expected)) return false; diff --git a/src/Processors/QueryPlan/DistributedCreateLocalPlan.cpp b/src/Processors/QueryPlan/DistributedCreateLocalPlan.cpp index e4d908e2af0..d4545482477 100644 --- a/src/Processors/QueryPlan/DistributedCreateLocalPlan.cpp +++ b/src/Processors/QueryPlan/DistributedCreateLocalPlan.cpp @@ -2,7 +2,6 @@ #include #include -#include "Parsers/queryToString.h" #include #include #include @@ -69,19 +68,12 @@ std::unique_ptr createLocalPlan( if (context->getSettingsRef().allow_experimental_analyzer) { - // std::cerr << query_ast->dumpTree() << std::endl; - // std::cerr << queryToString(query_ast) << std::endl; - /// For Analyzer, identifier in GROUP BY/ORDER BY/LIMIT BY lists has been resolved to /// ConstantNode in QueryTree if it is an alias of a constant, so we should not replace /// ConstantNode with ProjectionNode again(https://github.com/ClickHouse/ClickHouse/issues/62289). 
new_context->setSetting("enable_positional_arguments", Field(false)); auto interpreter = InterpreterSelectQueryAnalyzer(query_ast, new_context, select_query_options); - // std::cerr << interpreter.getQueryTree()->dumpTree() << std::endl; query_plan = std::make_unique(std::move(interpreter).extractQueryPlan()); - // WriteBufferFromOwnString buf; - // query_plan->explainPlan(buf, {.header=true, .actions=true}); - // std::cerr << buf.str() << std::endl; } else { From dff7a2f1f6bab1a49669a06f95990d34e71c2cf6 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Fri, 24 May 2024 14:37:33 +0000 Subject: [PATCH 359/392] Cleanup. --- src/Analyzer/Passes/QueryAnalysisPass.cpp | 2 -- src/Analyzer/createUniqueTableAliases.cpp | 3 --- 2 files changed, 5 deletions(-) diff --git a/src/Analyzer/Passes/QueryAnalysisPass.cpp b/src/Analyzer/Passes/QueryAnalysisPass.cpp index a5992148b14..3fca66e6eb8 100644 --- a/src/Analyzer/Passes/QueryAnalysisPass.cpp +++ b/src/Analyzer/Passes/QueryAnalysisPass.cpp @@ -3995,8 +3995,6 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromJoinTree(const Identifie if (identifier_lookup.isFunctionLookup()) return {}; - // std::cerr << "tryResolveIdentifierFromJoinTree " << identifier_lookup.identifier.getFullName() << std::endl; - /// Try to resolve identifier from table columns if (auto resolved_identifier = tryResolveIdentifierFromTableColumns(identifier_lookup, scope)) return resolved_identifier; diff --git a/src/Analyzer/createUniqueTableAliases.cpp b/src/Analyzer/createUniqueTableAliases.cpp index b36ba1cafaa..8f850fe8dec 100644 --- a/src/Analyzer/createUniqueTableAliases.cpp +++ b/src/Analyzer/createUniqueTableAliases.cpp @@ -1,8 +1,6 @@ #include #include #include -#include -#include #include #include #include @@ -60,7 +58,6 @@ public: alias = fmt::format("__table{}", ++next_id); node->setAlias(alias); } - break; } default: From b254be618087e8f949f420406e791b24d11c960a Mon Sep 17 00:00:00 2001 From: Max K Date: Fri, 24 May 2024 16:57:08 +0200 Subject: [PATCH 360/392] CI: add secrets to reusable build stage wf yml --- .github/workflows/reusable_build_stage.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/reusable_build_stage.yml b/.github/workflows/reusable_build_stage.yml index 4463645880b..a8e84819c95 100644 --- a/.github/workflows/reusable_build_stage.yml +++ b/.github/workflows/reusable_build_stage.yml @@ -13,6 +13,10 @@ name: BuildStageWF description: ci data type: string required: true + secrets: + secret_envs: + description: if given, it's passed to the environments + required: false jobs: s: @@ -30,3 +34,5 @@ jobs: # for now let's do I deep checkout for builds checkout_depth: 0 data: ${{ inputs.data }} + secrets: + secret_envs: ${{ secrets.secret_envs }} From 4982d7c85cc7a71ddef773cd57df540e7b8cd33a Mon Sep 17 00:00:00 2001 From: Max K Date: Fri, 24 May 2024 16:59:47 +0200 Subject: [PATCH 361/392] fix for mark release ready --- .github/workflows/master.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml index 7c55098bdfd..c2a893a8e99 100644 --- a/.github/workflows/master.yml +++ b/.github/workflows/master.yml @@ -136,7 +136,7 @@ jobs: MarkReleaseReady: if: ${{ !failure() && !cancelled() }} - needs: [RunConfig, Builds_1] + needs: [RunConfig, Builds_1, Builds_2] runs-on: [self-hosted, style-checker-aarch64] steps: - name: Debug From b3f836fbb1b451c08d57f4956c0a9c5137fe5ede Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Fri, 24 May 
2024 17:08:30 +0200 Subject: [PATCH 362/392] Run 03147_system_columns_access_checks only on release --- tests/queries/0_stateless/03147_system_columns_access_checks.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/03147_system_columns_access_checks.sh b/tests/queries/0_stateless/03147_system_columns_access_checks.sh index 2bd7fb083ea..b027ea28504 100755 --- a/tests/queries/0_stateless/03147_system_columns_access_checks.sh +++ b/tests/queries/0_stateless/03147_system_columns_access_checks.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Tags: no-fasttest, no-parallel, no-ordinary-database, long +# Tags: no-fasttest, no-parallel, no-ordinary-database, long, no-debug, no-asan, no-tsan, no-msan CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh From d5b763d03d581b70b1243ab589223d85d231fe89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Fri, 24 May 2024 17:21:50 +0200 Subject: [PATCH 363/392] Limit max time for 01442_merge_detach_attach_long --- .../01442_merge_detach_attach_long.sh | 26 +++++++++++++------ 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/tests/queries/0_stateless/01442_merge_detach_attach_long.sh b/tests/queries/0_stateless/01442_merge_detach_attach_long.sh index acb2550d48c..e7c20158b5d 100755 --- a/tests/queries/0_stateless/01442_merge_detach_attach_long.sh +++ b/tests/queries/0_stateless/01442_merge_detach_attach_long.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Tags: long, no-parallel, no-debug +# Tags: long, no-parallel set -e @@ -11,14 +11,24 @@ CLICKHOUSE_CLIENT_SERVER_LOGS_LEVEL=none ${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS t" ${CLICKHOUSE_CLIENT} --query="CREATE TABLE t (x Int8) ENGINE = MergeTree ORDER BY tuple()" -for _ in {1..100}; do - ${CLICKHOUSE_CLIENT} --query="INSERT INTO t VALUES (0)" - ${CLICKHOUSE_CLIENT} --query="INSERT INTO t VALUES (0)" - ${CLICKHOUSE_CLIENT} --query="OPTIMIZE TABLE t FINAL" 2>/dev/null & - ${CLICKHOUSE_CLIENT} --query="ALTER TABLE t DETACH PARTITION tuple()" - ${CLICKHOUSE_CLIENT} --query="SELECT count() FROM t HAVING count() > 0" -done +function thread_ops() +{ + local TIMELIMIT=$((SECONDS+$1)) + local it=0 + while [ $SECONDS -lt "$TIMELIMIT" ] && [ $it -lt 100 ]; + do + it=$((it+1)) + ${CLICKHOUSE_CLIENT} --query="INSERT INTO t VALUES (0)" + ${CLICKHOUSE_CLIENT} --query="INSERT INTO t VALUES (0)" + ${CLICKHOUSE_CLIENT} --query="OPTIMIZE TABLE t FINAL" 2>/dev/null & + ${CLICKHOUSE_CLIENT} --query="ALTER TABLE t DETACH PARTITION tuple()" + ${CLICKHOUSE_CLIENT} --query="SELECT count() FROM t HAVING count() > 0" + done +} +export -f thread_ops +TIMEOUT=60 +thread_ops $TIMEOUT & wait $CLICKHOUSE_CLIENT -q "DROP TABLE t" From bd415cc83192a734dccb00bd004775e46bd74b7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Fri, 24 May 2024 17:27:47 +0200 Subject: [PATCH 364/392] Reduce 02228_merge_tree_insert_memory_usage partitions --- .../02228_merge_tree_insert_memory_usage.sql | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/queries/0_stateless/02228_merge_tree_insert_memory_usage.sql b/tests/queries/0_stateless/02228_merge_tree_insert_memory_usage.sql index 8924627a717..26a201ec89f 100644 --- a/tests/queries/0_stateless/02228_merge_tree_insert_memory_usage.sql +++ b/tests/queries/0_stateless/02228_merge_tree_insert_memory_usage.sql @@ -1,16 +1,16 @@ -- Tags: long, no-parallel -SET insert_keeper_fault_injection_probability=0; -- to succeed this test can require 
too many retries due to 1024 partitions, so disable fault injections +SET insert_keeper_fault_injection_probability=0; -- to succeed this test can require too many retries due to 100 partitions, so disable fault injections -- regression for MEMORY_LIMIT_EXCEEDED error because of deferred final part flush drop table if exists data_02228; -create table data_02228 (key1 UInt32, sign Int8, s UInt64) engine = CollapsingMergeTree(sign) order by (key1) partition by key1 % 1024; -insert into data_02228 select number, 1, number from numbers_mt(100e3) settings max_memory_usage='300Mi', max_partitions_per_insert_block=1024, max_insert_delayed_streams_for_parallel_write=0; -insert into data_02228 select number, 1, number from numbers_mt(100e3) settings max_memory_usage='300Mi', max_partitions_per_insert_block=1024, max_insert_delayed_streams_for_parallel_write=10000000; -- { serverError MEMORY_LIMIT_EXCEEDED } +create table data_02228 (key1 UInt32, sign Int8, s UInt64) engine = CollapsingMergeTree(sign) order by (key1) partition by key1 % 100; +insert into data_02228 select number, 1, number from numbers_mt(10_000) settings max_memory_usage='30Mi', max_partitions_per_insert_block=1024, max_insert_delayed_streams_for_parallel_write=0; +insert into data_02228 select number, 1, number from numbers_mt(10_000) settings max_memory_usage='30Mi', max_partitions_per_insert_block=1024, max_insert_delayed_streams_for_parallel_write=1000000; -- { serverError MEMORY_LIMIT_EXCEEDED } drop table data_02228; drop table if exists data_rep_02228 SYNC; -create table data_rep_02228 (key1 UInt32, sign Int8, s UInt64) engine = ReplicatedCollapsingMergeTree('/clickhouse/{database}', 'r1', sign) order by (key1) partition by key1 % 1024; -insert into data_rep_02228 select number, 1, number from numbers_mt(100e3) settings max_memory_usage='300Mi', max_partitions_per_insert_block=1024, max_insert_delayed_streams_for_parallel_write=0; -insert into data_rep_02228 select number, 1, number from numbers_mt(100e3) settings max_memory_usage='300Mi', max_partitions_per_insert_block=1024, max_insert_delayed_streams_for_parallel_write=10000000; -- { serverError MEMORY_LIMIT_EXCEEDED } +create table data_rep_02228 (key1 UInt32, sign Int8, s UInt64) engine = ReplicatedCollapsingMergeTree('/clickhouse/{database}', 'r1', sign) order by (key1) partition by key1 % 100; +insert into data_rep_02228 select number, 1, number from numbers_mt(10_000) settings max_memory_usage='30Mi', max_partitions_per_insert_block=1024, max_insert_delayed_streams_for_parallel_write=0; +insert into data_rep_02228 select number, 1, number from numbers_mt(10_000) settings max_memory_usage='30Mi', max_partitions_per_insert_block=1024, max_insert_delayed_streams_for_parallel_write=1000000; -- { serverError MEMORY_LIMIT_EXCEEDED } drop table data_rep_02228 SYNC; From b396e63ea5721f72e0a1efb15e1c108c93dfad2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Fri, 24 May 2024 17:30:26 +0200 Subject: [PATCH 365/392] Reduce sizes in 02735_parquet_encoder --- tests/queries/0_stateless/02735_parquet_encoder.sql | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/queries/0_stateless/02735_parquet_encoder.sql b/tests/queries/0_stateless/02735_parquet_encoder.sql index fe45a2a317d..9320d0e57c3 100644 --- a/tests/queries/0_stateless/02735_parquet_encoder.sql +++ b/tests/queries/0_stateless/02735_parquet_encoder.sql @@ -41,7 +41,7 @@ create temporary table basic_types_02735 as select * from generateRandom(' decimal128 Decimal128(20), 
decimal256 Decimal256(40), ipv4 IPv4, - ipv6 IPv6') limit 10101; + ipv6 IPv6') limit 1011; insert into function file(basic_types_02735.parquet) select * from basic_types_02735; desc file(basic_types_02735.parquet); select (select sum(cityHash64(*)) from basic_types_02735) - (select sum(cityHash64(*)) from file(basic_types_02735.parquet)); @@ -59,7 +59,7 @@ create temporary table nullables_02735 as select * from generateRandom(' fstr Nullable(FixedString(12)), i256 Nullable(Int256), decimal256 Nullable(Decimal256(40)), - ipv6 Nullable(IPv6)') limit 10000; + ipv6 Nullable(IPv6)') limit 1000; insert into function file(nullables_02735.parquet) select * from nullables_02735; select (select sum(cityHash64(*)) from nullables_02735) - (select sum(cityHash64(*)) from file(nullables_02735.parquet)); drop table nullables_02735; @@ -83,7 +83,7 @@ create table arrays_02735 engine = Memory as select * from generateRandom(' decimal64 Array(Decimal64(10)), ipv4 Array(IPv4), msi Map(String, Int16), - tup Tuple(FixedString(3), Array(String), Map(Int8, Date))') limit 10000; + tup Tuple(FixedString(3), Array(String), Map(Int8, Date))') limit 1000; insert into function file(arrays_02735.parquet) select * from arrays_02735; create temporary table arrays_out_02735 as arrays_02735; insert into arrays_out_02735 select * from file(arrays_02735.parquet); @@ -107,7 +107,7 @@ create temporary table madness_02735 as select * from generateRandom(' mln Map(LowCardinality(String), Nullable(Int8)), t Tuple(Map(FixedString(5), Tuple(Array(UInt16), Nullable(UInt16), Array(Tuple(Int8, Decimal64(10))))), Tuple(kitchen UInt64, sink String)), n Nested(hello UInt64, world Tuple(first String, second FixedString(1))) - ') limit 10000; + ') limit 1000; insert into function file(madness_02735.parquet) select * from madness_02735; insert into function file(a.csv) select * from madness_02735 order by tuple(*); insert into function file(b.csv) select aa, aaa, an, aan, l, ln, arrayMap(x->reinterpret(x, 'UInt128'), al) as al_, aaln, mln, t, n.hello, n.world from file(madness_02735.parquet) order by tuple(aa, aaa, an, aan, l, ln, al_, aaln, mln, t, n.hello, n.world); From 24797a093a216479d70b2b0e065d9f3850d484bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Fri, 24 May 2024 17:31:39 +0200 Subject: [PATCH 366/392] Remove 02344_insert_profile_events_stress from sanitizer run as it's too slow --- .../queries/0_stateless/02344_insert_profile_events_stress.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02344_insert_profile_events_stress.sql b/tests/queries/0_stateless/02344_insert_profile_events_stress.sql index f9fdd3b943f..e9a790bea5d 100644 --- a/tests/queries/0_stateless/02344_insert_profile_events_stress.sql +++ b/tests/queries/0_stateless/02344_insert_profile_events_stress.sql @@ -1,4 +1,4 @@ --- Tags: no-parallel, long, no-debug, no-tsan +-- Tags: no-parallel, long, no-debug, no-tsan, no-msan, no-asan create table data_02344 (key Int) engine=Null; -- 3e9 rows is enough to fill the socket buffer and cause INSERT hung. From 049ca7c71e5c3543e4a63d22f075de2ff96373c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Fri, 24 May 2024 17:34:48 +0200 Subject: [PATCH 367/392] Reduce 01396_inactive_replica_cleanup_nodes_zookeeper! 
--- .../01396_inactive_replica_cleanup_nodes_zookeeper.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/01396_inactive_replica_cleanup_nodes_zookeeper.sh b/tests/queries/0_stateless/01396_inactive_replica_cleanup_nodes_zookeeper.sh index 67a2a70b509..11102b128b2 100755 --- a/tests/queries/0_stateless/01396_inactive_replica_cleanup_nodes_zookeeper.sh +++ b/tests/queries/0_stateless/01396_inactive_replica_cleanup_nodes_zookeeper.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Tags: replica, no-debug, no-parallel +# Tags: replica, no-parallel CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh @@ -11,7 +11,7 @@ REPLICA=$($CLICKHOUSE_CLIENT --query "Select getMacro('replica')") # Check that if we have one inactive replica and a huge number of INSERTs to active replicas, # the number of nodes in ZooKeeper does not grow unbounded. -SCALE=5000 +SCALE=1000 $CLICKHOUSE_CLIENT -n --query " DROP TABLE IF EXISTS r1; From 7f9734d0cc9dc270ea129b75881234ace3cdf1fa Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Fri, 24 May 2024 15:38:21 +0000 Subject: [PATCH 368/392] Fix Logical error: Bad cast for Buffer table with prewhere. --- src/Storages/StorageBuffer.cpp | 2 ++ .../0_stateless/00910_buffer_prewhere_different_types.sql | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/src/Storages/StorageBuffer.cpp b/src/Storages/StorageBuffer.cpp index d9a0b2b4d59..a3f6b6afc5d 100644 --- a/src/Storages/StorageBuffer.cpp +++ b/src/Storages/StorageBuffer.cpp @@ -302,6 +302,8 @@ void StorageBuffer::read( auto src_table_query_info = query_info; if (src_table_query_info.prewhere_info) { + src_table_query_info.prewhere_info = src_table_query_info.prewhere_info->clone(); + auto actions_dag = ActionsDAG::makeConvertingActions( header_after_adding_defaults.getColumnsWithTypeAndName(), header.getColumnsWithTypeAndName(), diff --git a/tests/queries/0_stateless/00910_buffer_prewhere_different_types.sql b/tests/queries/0_stateless/00910_buffer_prewhere_different_types.sql index 8f305914cb8..702d9bb3e6c 100644 --- a/tests/queries/0_stateless/00910_buffer_prewhere_different_types.sql +++ b/tests/queries/0_stateless/00910_buffer_prewhere_different_types.sql @@ -2,8 +2,14 @@ DROP TABLE IF EXISTS buffer_table1__fuzz_28; DROP TABLE IF EXISTS merge_tree_table1; CREATE TABLE merge_tree_table1 (`x` UInt32) ENGINE = MergeTree ORDER BY x; + +CREATE TABLE buffer_table1__fuzz_24 (`s` Nullable(Int128), `x` Nullable(FixedString(17))) ENGINE = Buffer(currentDatabase(), 'merge_tree_table1', 16, 10, 60, 10, 1000, 1048576, 2097152); +SELECT s FROM buffer_table1__fuzz_24 PREWHERE factorial(toNullable(10)); -- { serverError ILLEGAL_TYPE_OF_COLUMN_FOR_FILTER } + INSERT INTO merge_tree_table1 VALUES (1), (2), (3), (4), (5), (6), (7), (8), (9), (10); +SELECT s FROM buffer_table1__fuzz_24 PREWHERE factorial(toNullable(10)); -- { serverError ILLEGAL_TYPE_OF_COLUMN_FOR_FILTER } + SET send_logs_level='error'; CREATE TABLE buffer_table1__fuzz_28 (`x` Nullable(UInt32)) ENGINE = Buffer(currentDatabase(), 'merge_tree_table1', 16, 10, 60, 10, 1000, 1048576, 2097152); From 2669df7296a1b362807693d0cc41833ecf80a148 Mon Sep 17 00:00:00 2001 From: Max K Date: Fri, 24 May 2024 17:30:36 +0200 Subject: [PATCH 369/392] add secrets to reusable build yml --- .github/workflows/reusable_build.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/reusable_build.yml b/.github/workflows/reusable_build.yml index 80d78d93e1b..5e254d785ec 100644 --- 
a/.github/workflows/reusable_build.yml +++ b/.github/workflows/reusable_build.yml @@ -33,6 +33,10 @@ name: Build ClickHouse additional_envs: description: additional ENV variables to setup the job type: string + secrets: + secret_envs: + description: if given, it's passed to the environments + required: false jobs: Build: @@ -54,6 +58,7 @@ jobs: run: | cat >> "$GITHUB_ENV" << 'EOF' ${{inputs.additional_envs}} + ${{secrets.secret_envs}} DOCKER_TAG< Date: Fri, 24 May 2024 17:44:14 +0200 Subject: [PATCH 370/392] Restore tags --- .../01396_inactive_replica_cleanup_nodes_zookeeper.sh | 2 +- tests/queries/0_stateless/01442_merge_detach_attach_long.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/01396_inactive_replica_cleanup_nodes_zookeeper.sh b/tests/queries/0_stateless/01396_inactive_replica_cleanup_nodes_zookeeper.sh index 11102b128b2..1c1eb4489ee 100755 --- a/tests/queries/0_stateless/01396_inactive_replica_cleanup_nodes_zookeeper.sh +++ b/tests/queries/0_stateless/01396_inactive_replica_cleanup_nodes_zookeeper.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Tags: replica, no-parallel +# Tags: replica, no-debug, no-parallel CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh diff --git a/tests/queries/0_stateless/01442_merge_detach_attach_long.sh b/tests/queries/0_stateless/01442_merge_detach_attach_long.sh index e7c20158b5d..85fdf7ed764 100755 --- a/tests/queries/0_stateless/01442_merge_detach_attach_long.sh +++ b/tests/queries/0_stateless/01442_merge_detach_attach_long.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Tags: long, no-parallel +# Tags: long, no-parallel, no-debug set -e From 772d38a0c139ca5ee76bd7886d70db874db503c0 Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Fri, 24 May 2024 18:11:21 +0200 Subject: [PATCH 371/392] Update s3queue.md --- docs/en/engines/table-engines/integrations/s3queue.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/en/engines/table-engines/integrations/s3queue.md b/docs/en/engines/table-engines/integrations/s3queue.md index 8ebab80423f..aa7fa512480 100644 --- a/docs/en/engines/table-engines/integrations/s3queue.md +++ b/docs/en/engines/table-engines/integrations/s3queue.md @@ -202,8 +202,7 @@ Example: CREATE TABLE s3queue_engine_table (name String, value UInt32) ENGINE=S3Queue('https://clickhouse-public-datasets.s3.amazonaws.com/my-test-bucket-768/*', 'CSV', 'gzip') SETTINGS - mode = 'unordered', - keeper_path = '/clickhouse/s3queue/'; + mode = 'unordered'; CREATE TABLE stats (name String, value UInt32) ENGINE = MergeTree() ORDER BY name; From e59097274a72216e99dbec83cbbe4f5142463799 Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Fri, 24 May 2024 13:56:16 -0300 Subject: [PATCH 372/392] test for #64211 --- ...uted_merge_global_in_primary_key.reference | 19 +++++ ...istributed_merge_global_in_primary_key.sql | 83 +++++++++++++++++++ 2 files changed, 102 insertions(+) create mode 100644 tests/queries/0_stateless/01227_distributed_merge_global_in_primary_key.reference create mode 100644 tests/queries/0_stateless/01227_distributed_merge_global_in_primary_key.sql diff --git a/tests/queries/0_stateless/01227_distributed_merge_global_in_primary_key.reference b/tests/queries/0_stateless/01227_distributed_merge_global_in_primary_key.reference new file mode 100644 index 00000000000..f572a3570f4 --- /dev/null +++ b/tests/queries/0_stateless/01227_distributed_merge_global_in_primary_key.reference @@ -0,0 +1,19 
@@ +------------------- Distributed ------------------ +1 +---------- merge() over distributed -------------- +2 +---------- merge() over local -------------------- +1 +1 +1 +---------- remote() over Merge ------------------- +2 +---------- Distributed over Merge ---------------- +1 +---------- remote() over Merge ------------------- +2 +---------- Merge over Distributed ----------------- +1 +1 +1 +2 diff --git a/tests/queries/0_stateless/01227_distributed_merge_global_in_primary_key.sql b/tests/queries/0_stateless/01227_distributed_merge_global_in_primary_key.sql new file mode 100644 index 00000000000..78176e346f4 --- /dev/null +++ b/tests/queries/0_stateless/01227_distributed_merge_global_in_primary_key.sql @@ -0,0 +1,83 @@ +-- https://github.com/ClickHouse/ClickHouse/issues/64211 + +create database test; +use test; + +CREATE TABLE test_local (name String) +ENGINE = MergeTree +ORDER BY name as select 'x'; + +CREATE TABLE test_distributed as test_local +ENGINE = Distributed(default, currentDatabase(), test_local); + +CREATE TABLE test_merge as test_local +ENGINE = Merge(currentDatabase(), 'test_local'); + +CREATE TABLE test_merge_distributed as test_local +ENGINE = Distributed(default, currentDatabase(), test_merge); + +CREATE TABLE test_distributed_merge as test_local +ENGINE = Merge(currentDatabase(), 'test_distributed'); + +SELECT '------------------- Distributed ------------------'; +SELECT count() +FROM test_distributed +WHERE name GLOBAL IN (SELECT name FROM test_distributed); + +SELECT '---------- merge() over distributed --------------'; +SELECT count() +FROM merge(currentDatabase(), 'test_distributed') +WHERE name GLOBAL IN (SELECT name FROM test_distributed); + +SELECT '---------- merge() over local --------------------'; +SELECT count() +FROM merge(currentDatabase(), 'test_local') +WHERE name GLOBAL IN (SELECT name FROM test_distributed); + +SELECT count() +FROM merge(currentDatabase(), 'test_local') +WHERE name GLOBAL IN (SELECT name FROM merge(currentDatabase(), 'test_local')); + +SELECT count() +FROM merge(currentDatabase(), 'test_local') +WHERE name GLOBAL IN (SELECT name FROM remote('127.0.0.{1,2}', currentDatabase(), test_merge)); + +SELECT '---------- remote() over Merge -------------------'; +SELECT count() +FROM remote('127.0.0.{1,2}', currentDatabase(), test_merge) +WHERE name GLOBAL IN (SELECT name FROM test_distributed); + +SELECT '---------- Distributed over Merge ----------------'; +SELECT count() +FROM test_merge_distributed +WHERE name GLOBAL IN (SELECT name FROM test_merge_distributed); + +SELECT '---------- remote() over Merge -------------------'; +SELECT count() +FROM remote('127.0.0.{1,2}', currentDatabase(), test_merge) +WHERE name GLOBAL IN (SELECT name FROM remote('127.0.0.{1,2}', currentDatabase(), test_merge)); + +SELECT '---------- Merge over Distributed -----------------'; +SELECT count() +FROM test_distributed_merge +WHERE name GLOBAL IN (SELECT name FROM remote('127.0.0.{1,2}', currentDatabase(), test_merge)); + +SELECT count() +FROM test_distributed_merge +WHERE name GLOBAL IN (SELECT name FROM remote('127.0.0.{1,2}', currentDatabase(), test_distributed_merge)); + +SELECT count() +FROM test_distributed_merge +WHERE name GLOBAL IN (SELECT name FROM test_distributed_merge); + +SELECT count() +FROM remote('127.0.0.{1,2}', currentDatabase(), test_distributed_merge) +WHERE name GLOBAL IN (SELECT name FROM remote('127.0.0.{1,2}', currentDatabase(), test_merge)); + + +DROP TABLE test_merge; +DROP TABLE test_merge_distributed; +DROP TABLE 
test_distributed_merge; +DROP TABLE test_distributed; +DROP TABLE test_local; +drop database test; From 9a917db4b3eade94941225b4a792f4d2331459ba Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Fri, 24 May 2024 14:27:26 -0300 Subject: [PATCH 373/392] Update 01227_distributed_merge_global_in_primary_key.sql --- .../01227_distributed_merge_global_in_primary_key.sql | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/queries/0_stateless/01227_distributed_merge_global_in_primary_key.sql b/tests/queries/0_stateless/01227_distributed_merge_global_in_primary_key.sql index 78176e346f4..e73d07c193f 100644 --- a/tests/queries/0_stateless/01227_distributed_merge_global_in_primary_key.sql +++ b/tests/queries/0_stateless/01227_distributed_merge_global_in_primary_key.sql @@ -1,8 +1,5 @@ -- https://github.com/ClickHouse/ClickHouse/issues/64211 -create database test; -use test; - CREATE TABLE test_local (name String) ENGINE = MergeTree ORDER BY name as select 'x'; @@ -80,4 +77,3 @@ DROP TABLE test_merge_distributed; DROP TABLE test_distributed_merge; DROP TABLE test_distributed; DROP TABLE test_local; -drop database test; From 91a84f8e17192a70b48d3152ad8b48107d60c117 Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Fri, 24 May 2024 15:03:45 -0300 Subject: [PATCH 374/392] Update 01227_distributed_merge_global_in_primary_key.sql --- .../01227_distributed_merge_global_in_primary_key.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/01227_distributed_merge_global_in_primary_key.sql b/tests/queries/0_stateless/01227_distributed_merge_global_in_primary_key.sql index e73d07c193f..5cd4aaab1e6 100644 --- a/tests/queries/0_stateless/01227_distributed_merge_global_in_primary_key.sql +++ b/tests/queries/0_stateless/01227_distributed_merge_global_in_primary_key.sql @@ -5,13 +5,13 @@ ENGINE = MergeTree ORDER BY name as select 'x'; CREATE TABLE test_distributed as test_local -ENGINE = Distributed(default, currentDatabase(), test_local); +ENGINE = Distributed(test_shard_localhost, currentDatabase(), test_local); CREATE TABLE test_merge as test_local ENGINE = Merge(currentDatabase(), 'test_local'); CREATE TABLE test_merge_distributed as test_local -ENGINE = Distributed(default, currentDatabase(), test_merge); +ENGINE = Distributed(test_shard_localhost, currentDatabase(), test_merge); CREATE TABLE test_distributed_merge as test_local ENGINE = Merge(currentDatabase(), 'test_distributed'); From 3ed1ec2f63582819f005d591459f30cdbff0daff Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Fri, 24 May 2024 23:54:56 -0300 Subject: [PATCH 375/392] Update tests/queries/0_stateless/01227_distributed_merge_global_in_primary_key.sql Co-authored-by: Nikita Mikhaylov --- .../01227_distributed_merge_global_in_primary_key.sql | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/01227_distributed_merge_global_in_primary_key.sql b/tests/queries/0_stateless/01227_distributed_merge_global_in_primary_key.sql index 5cd4aaab1e6..6b0dd4c8747 100644 --- a/tests/queries/0_stateless/01227_distributed_merge_global_in_primary_key.sql +++ b/tests/queries/0_stateless/01227_distributed_merge_global_in_primary_key.sql @@ -1,5 +1,9 @@ -- https://github.com/ClickHouse/ClickHouse/issues/64211 - +DROP TABLE IF EXISTS test_merge; +DROP TABLE IF EXISTS test_merge_distributed; +DROP TABLE IF EXISTS test_distributed_merge; +DROP TABLE IF EXISTS test_distributed; +DROP TABLE IF EXISTS test_local; CREATE TABLE test_local (name String) ENGINE = MergeTree ORDER BY name 
as select 'x'; From 031591f3dd5ae155e3a8d8cf061e2956a29e6a4a Mon Sep 17 00:00:00 2001 From: kssenii Date: Sat, 25 May 2024 15:48:45 +0200 Subject: [PATCH 376/392] Fix settings changes history --- src/Core/SettingsChangesHistory.h | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index a89516436e8..16f28d94640 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -85,6 +85,14 @@ namespace SettingsChangesHistory /// It's used to implement `compatibility` setting (see https://github.com/ClickHouse/ClickHouse/issues/35972) static std::map settings_changes_history = { + {"24.6", {{"hdfs_throw_on_zero_files_match", false, false, "Allow to throw an error when ListObjects request cannot match any files in HDFS engine instead of empty query result"}, + {"azure_throw_on_zero_files_match", false, false, "Allow to throw an error when ListObjects request cannot match any files in AzureBlobStorage engine instead of empty query result"}, + {"s3_validate_request_settings", true, true, "Allow to disable S3 request settings validation"}, + {"azure_skip_empty_files", false, false, "Allow to skip empty files in azure table engine"}, + {"hdfs_ignore_file_doesnt_exist", false, false, "Allow to return 0 rows when the requested files don't exist instead of throwing an exception in HDFS table engine"}, + {"azure_ignore_file_doesnt_exist", false, false, "Allow to return 0 rows when the requested files don't exist instead of throwing an exception in AzureBlobStorage table engine"}, + {"s3_ignore_file_doesnt_exist", false, false, "Allow to return 0 rows when the requested files don't exist instead of throwing an exception in S3 table engine"}, + }}, {"24.5", {{"allow_deprecated_functions", true, false, "Allow usage of deprecated functions"}, {"allow_experimental_join_condition", false, false, "Support join with inequal conditions which involve columns from both left and right table. e.g. t1.y < t2.y."}, {"input_format_tsv_crlf_end_of_line", false, false, "Enables reading of CRLF line endings with TSV formats"}, @@ -93,13 +101,6 @@ static std::map sett {"cross_join_min_bytes_to_compress", 0, 1_GiB, "Minimal size of block to compress in CROSS JOIN. Zero value means - disable this threshold. 
This block is compressed when any of the two thresholds (by rows or by bytes) are reached."}, {"http_max_chunk_size", 0, 0, "Internal limitation"}, {"prefer_external_sort_block_bytes", 0, DEFAULT_BLOCK_SIZE * 256, "Prefer maximum block bytes for external sort, reduce the memory usage during merging."}, - {"hdfs_throw_on_zero_files_match", false, false, "Allow to throw an error when ListObjects request cannot match any files in HDFS engine instead of empty query result"}, - {"azure_throw_on_zero_files_match", false, false, "Allow to throw an error when ListObjects request cannot match any files in AzureBlobStorage engine instead of empty query result"}, - {"s3_validate_request_settings", true, true, "Allow to disable S3 request settings validation"}, - {"azure_skip_empty_files", false, false, "Allow to skip empty files in azure table engine"}, - {"hdfs_ignore_file_doesnt_exist", false, false, "Allow to return 0 rows when the requested files don't exist instead of throwing an exception in HDFS table engine"}, - {"azure_ignore_file_doesnt_exist", false, false, "Allow to return 0 rows when the requested files don't exist instead of throwing an exception in AzureBlobStorage table engine"}, - {"s3_ignore_file_doesnt_exist", false, false, "Allow to return 0 rows when the requested files don't exist instead of throwing an exception in S3 table engine"}, {"input_format_parquet_use_native_reader", false, false, "When reading Parquet files, to use native reader instead of arrow reader."}, {"input_format_force_null_for_omitted_fields", false, false, "Disable type-defaults for omitted fields when needed"}, {"cast_string_to_dynamic_use_inference", false, false, "Add setting to allow converting String to Dynamic through parsing"}, From 142d67d1b298478a0df46b2585d4719a9ef55f4e Mon Sep 17 00:00:00 2001 From: kssenii Date: Sun, 26 May 2024 11:16:48 +0200 Subject: [PATCH 377/392] Fix S3ObjectStorage::applyNewSettings --- .../ObjectStorages/S3/S3ObjectStorage.cpp | 21 ++++++++----------- src/Disks/ObjectStorages/S3/S3ObjectStorage.h | 5 +---- .../ObjectStorage/S3/Configuration.cpp | 2 +- 3 files changed, 11 insertions(+), 17 deletions(-) diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index c07313b52db..69485bd4d01 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -575,24 +575,21 @@ void S3ObjectStorage::applyNewSettings( ContextPtr context, const ApplyNewSettingsOptions & options) { - auto new_s3_settings = getSettings(config, config_prefix, context, context->getSettingsRef().s3_validate_request_settings); - if (!static_headers.empty()) - { - new_s3_settings->auth_settings.headers.insert( - new_s3_settings->auth_settings.headers.end(), - static_headers.begin(), static_headers.end()); - } + auto settings_from_config = getSettings(config, config_prefix, context, context->getSettingsRef().s3_validate_request_settings); + auto modified_settings = std::make_unique(*s3_settings.get()); + modified_settings->auth_settings.updateFrom(settings_from_config->auth_settings); if (auto endpoint_settings = context->getStorageS3Settings().getSettings(uri.uri.toString(), context->getUserName())) - new_s3_settings->auth_settings.updateFrom(endpoint_settings->auth_settings); + modified_settings->auth_settings.updateFrom(endpoint_settings->auth_settings); - auto current_s3_settings = s3_settings.get(); - if (options.allow_client_change && 
(current_s3_settings->auth_settings.hasUpdates(new_s3_settings->auth_settings) || for_disk_s3)) + auto current_settings = s3_settings.get(); + if (options.allow_client_change + && (current_settings->auth_settings.hasUpdates(modified_settings->auth_settings) || for_disk_s3)) { - auto new_client = getClient(config, config_prefix, context, *new_s3_settings, for_disk_s3, &uri); + auto new_client = getClient(config, config_prefix, context, *modified_settings, for_disk_s3, &uri); client.set(std::move(new_client)); } - s3_settings.set(std::move(new_s3_settings)); + s3_settings.set(std::move(modified_settings)); } std::unique_ptr S3ObjectStorage::cloneObjectStorage( diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.h b/src/Disks/ObjectStorages/S3/S3ObjectStorage.h index 1fff6d67e23..062ddd4e2a2 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.h +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.h @@ -54,8 +54,7 @@ private: const S3Capabilities & s3_capabilities_, ObjectStorageKeysGeneratorPtr key_generator_, const String & disk_name_, - bool for_disk_s3_ = true, - const HTTPHeaderEntries & static_headers_ = {}) + bool for_disk_s3_ = true) : uri(uri_) , disk_name(disk_name_) , client(std::move(client_)) @@ -64,7 +63,6 @@ private: , key_generator(std::move(key_generator_)) , log(getLogger(logger_name)) , for_disk_s3(for_disk_s3_) - , static_headers(static_headers_) { } @@ -189,7 +187,6 @@ private: LoggerPtr log; const bool for_disk_s3; - const HTTPHeaderEntries static_headers; }; } diff --git a/src/Storages/ObjectStorage/S3/Configuration.cpp b/src/Storages/ObjectStorage/S3/Configuration.cpp index 6b6cde0c431..4b217b94730 100644 --- a/src/Storages/ObjectStorage/S3/Configuration.cpp +++ b/src/Storages/ObjectStorage/S3/Configuration.cpp @@ -136,7 +136,7 @@ ObjectStoragePtr StorageS3Configuration::createObjectStorage(ContextPtr context, return std::make_shared( std::move(client), std::move(s3_settings), url, s3_capabilities, - key_generator, "StorageS3", false, headers_from_ast); + key_generator, "StorageS3", false); } void StorageS3Configuration::fromNamedCollection(const NamedCollection & collection) From 14f259d9d7a9d53ed8d1c64be36be20a622bf7ce Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Sun, 26 May 2024 13:54:35 +0000 Subject: [PATCH 378/392] Fix flaky test --- tests/queries/0_stateless/03130_generateSnowflakeId.reference | 2 -- tests/queries/0_stateless/03130_generateSnowflakeId.sql | 2 -- 2 files changed, 4 deletions(-) diff --git a/tests/queries/0_stateless/03130_generateSnowflakeId.reference b/tests/queries/0_stateless/03130_generateSnowflakeId.reference index 6ec0cafab16..f5b7872f81e 100644 --- a/tests/queries/0_stateless/03130_generateSnowflakeId.reference +++ b/tests/queries/0_stateless/03130_generateSnowflakeId.reference @@ -1,11 +1,9 @@ -- generateSnowflakeID 1 -1 0 0 1 100 -- generateSnowflakeIDThreadMonotonic 1 -1 100 diff --git a/tests/queries/0_stateless/03130_generateSnowflakeId.sql b/tests/queries/0_stateless/03130_generateSnowflakeId.sql index 903be5b786c..57cdd21a9fe 100644 --- a/tests/queries/0_stateless/03130_generateSnowflakeId.sql +++ b/tests/queries/0_stateless/03130_generateSnowflakeId.sql @@ -1,6 +1,5 @@ SELECT '-- generateSnowflakeID'; -SELECT bitShiftLeft(toUInt64(generateSnowflakeID()), 52) = 0; -- check machine sequence number is zero SELECT bitAnd(bitShiftRight(toUInt64(generateSnowflakeID()), 63), 1) = 0; -- check first bit is zero SELECT generateSnowflakeID(1) = generateSnowflakeID(2); -- disabled common subexpression elimination --> lhs != rhs @@ 
-18,7 +17,6 @@ FROM SELECT '-- generateSnowflakeIDThreadMonotonic'; -SELECT bitShiftLeft(toUInt64(generateSnowflakeIDThreadMonotonic()), 52) = 0; -- check machine sequence number is zero SELECT bitAnd(bitShiftRight(toUInt64(generateSnowflakeIDThreadMonotonic()), 63), 1) = 0; -- check first bit is zero SELECT generateSnowflakeIDThreadMonotonic(1, 2); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } From 8f4422d72917c1885a892200e267268f6b2e3b98 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Sun, 26 May 2024 14:07:50 +0000 Subject: [PATCH 379/392] Test analyzer and non-analyzer execution --- .../02494_query_cache_nested_query_bug.reference | 2 ++ .../02494_query_cache_nested_query_bug.sh | 12 +++++++++--- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/tests/queries/0_stateless/02494_query_cache_nested_query_bug.reference b/tests/queries/0_stateless/02494_query_cache_nested_query_bug.reference index b261da18d51..9ec033cefb1 100644 --- a/tests/queries/0_stateless/02494_query_cache_nested_query_bug.reference +++ b/tests/queries/0_stateless/02494_query_cache_nested_query_bug.reference @@ -1,2 +1,4 @@ +2 +0 1 0 diff --git a/tests/queries/0_stateless/02494_query_cache_nested_query_bug.sh b/tests/queries/0_stateless/02494_query_cache_nested_query_bug.sh index a5339a098dc..6bc3d03ac66 100755 --- a/tests/queries/0_stateless/02494_query_cache_nested_query_bug.sh +++ b/tests/queries/0_stateless/02494_query_cache_nested_query_bug.sh @@ -15,11 +15,17 @@ ${CLICKHOUSE_CLIENT} --query "CREATE TABLE tab (a UInt64) ENGINE=MergeTree() ORD ${CLICKHOUSE_CLIENT} --query "INSERT INTO tab VALUES (1) (2) (3)" ${CLICKHOUSE_CLIENT} --query "INSERT INTO tab VALUES (3) (4) (5)" -SETTINGS="SETTINGS use_query_cache=1, max_threads=1, allow_experimental_analyzer=1, merge_tree_read_split_ranges_into_intersecting_and_non_intersecting_injection_probability=0.0" +SETTINGS_NO_ANALYZER="SETTINGS use_query_cache=1, max_threads=1, allow_experimental_analyzer=0, merge_tree_read_split_ranges_into_intersecting_and_non_intersecting_injection_probability=0.0" +SETTINGS_ANALYZER="SETTINGS use_query_cache=1, max_threads=1, allow_experimental_analyzer=1, merge_tree_read_split_ranges_into_intersecting_and_non_intersecting_injection_probability=0.0" # Verify that the first query does two aggregations and the second query zero aggregations. Since query cache is currently not integrated # with EXPLAIN PLAN, we need to check the logs. -${CLICKHOUSE_CLIENT} --send_logs_level=trace --query "SELECT count(a) / (SELECT sum(a) FROM tab) FROM tab $SETTINGS" 2>&1 | grep "Aggregated. " | wc -l -${CLICKHOUSE_CLIENT} --send_logs_level=trace --query "SELECT count(a) / (SELECT sum(a) FROM tab) FROM tab $SETTINGS" 2>&1 | grep "Aggregated. " | wc -l +${CLICKHOUSE_CLIENT} --send_logs_level=trace --query "SELECT count(a) / (SELECT sum(a) FROM tab) FROM tab $SETTINGS_NO_ANALYZER" 2>&1 | grep "Aggregated. " | wc -l +${CLICKHOUSE_CLIENT} --send_logs_level=trace --query "SELECT count(a) / (SELECT sum(a) FROM tab) FROM tab $SETTINGS_NO_ANALYZER" 2>&1 | grep "Aggregated. " | wc -l + +${CLICKHOUSE_CLIENT} --query "SYSTEM DROP QUERY CACHE" + +${CLICKHOUSE_CLIENT} --send_logs_level=trace --query "SELECT count(a) / (SELECT sum(a) FROM tab) FROM tab $SETTINGS_ANALYZER" 2>&1 | grep "Aggregated. " | wc -l +${CLICKHOUSE_CLIENT} --send_logs_level=trace --query "SELECT count(a) / (SELECT sum(a) FROM tab) FROM tab $SETTINGS_ANALYZER" 2>&1 | grep "Aggregated. 
" | wc -l ${CLICKHOUSE_CLIENT} --query "SYSTEM DROP QUERY CACHE" From 3ee2307024c9a7b2c54247335f0fb0f0f54380e4 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Mon, 27 May 2024 10:04:19 +0200 Subject: [PATCH 380/392] Revert "Refactoring of Server.h: Isolate server management from other logic" --- programs/server/Server.cpp | 987 +++++++++++++++++- programs/server/Server.h | 95 +- src/CMakeLists.txt | 1 - src/Server/ServersManager/IServersManager.cpp | 268 ----- src/Server/ServersManager/IServersManager.h | 74 -- .../ServersManager/InterServersManager.cpp | 327 ------ .../ServersManager/InterServersManager.h | 44 - .../ServersManager/ProtocolServersManager.cpp | 523 ---------- .../ServersManager/ProtocolServersManager.h | 37 - 9 files changed, 1032 insertions(+), 1324 deletions(-) delete mode 100644 src/Server/ServersManager/IServersManager.cpp delete mode 100644 src/Server/ServersManager/IServersManager.h delete mode 100644 src/Server/ServersManager/InterServersManager.cpp delete mode 100644 src/Server/ServersManager/InterServersManager.h delete mode 100644 src/Server/ServersManager/ProtocolServersManager.cpp delete mode 100644 src/Server/ServersManager/ProtocolServersManager.h diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index b62ae40924c..223bc1f77e7 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -6,6 +6,8 @@ #include #include #include +#include +#include #include #include #include @@ -42,9 +44,11 @@ #include #include #include +#include #include #include #include +#include #include #include #include @@ -79,19 +83,29 @@ #include #include #include +#include #include "MetricsTransmitter.h" #include +#include +#include #include #include #include #include #include +#include +#include +#include +#include +#include +#include +#include +#include #include -#include -#include #include #include #include +#include #include "config.h" #include @@ -105,9 +119,19 @@ #endif #if USE_SSL +# include # include #endif +#if USE_GRPC +# include +#endif + +#if USE_NURAFT +# include +# include +#endif + #if USE_JEMALLOC # include #endif @@ -135,6 +159,18 @@ namespace ProfileEvents { extern const Event MainConfigLoads; extern const Event ServerStartupMilliseconds; + extern const Event InterfaceNativeSendBytes; + extern const Event InterfaceNativeReceiveBytes; + extern const Event InterfaceHTTPSendBytes; + extern const Event InterfaceHTTPReceiveBytes; + extern const Event InterfacePrometheusSendBytes; + extern const Event InterfacePrometheusReceiveBytes; + extern const Event InterfaceInterserverSendBytes; + extern const Event InterfaceInterserverReceiveBytes; + extern const Event InterfaceMySQLSendBytes; + extern const Event InterfaceMySQLReceiveBytes; + extern const Event InterfacePostgreSQLSendBytes; + extern const Event InterfacePostgreSQLReceiveBytes; } namespace fs = std::filesystem; @@ -202,9 +238,11 @@ namespace DB namespace ErrorCodes { extern const int NO_ELEMENTS_IN_CONFIG; + extern const int SUPPORT_IS_DISABLED; extern const int ARGUMENT_OUT_OF_BOUND; extern const int EXCESSIVE_ELEMENT_IN_CONFIG; extern const int INVALID_CONFIG_PARAMETER; + extern const int NETWORK_ERROR; extern const int CORRUPTED_DATA; } @@ -219,6 +257,115 @@ static std::string getCanonicalPath(std::string && path) return std::move(path); } +Poco::Net::SocketAddress Server::socketBindListen( + const Poco::Util::AbstractConfiguration & config, + Poco::Net::ServerSocket & socket, + const std::string & host, + UInt16 port, + [[maybe_unused]] bool secure) const +{ + auto address 
= makeSocketAddress(host, port, &logger()); + socket.bind(address, /* reuseAddress = */ true, /* reusePort = */ config.getBool("listen_reuse_port", false)); + /// If caller requests any available port from the OS, discover it after binding. + if (port == 0) + { + address = socket.address(); + LOG_DEBUG(&logger(), "Requested any available port (port == 0), actual port is {:d}", address.port()); + } + + socket.listen(/* backlog = */ config.getUInt("listen_backlog", 4096)); + + return address; +} + +Strings getListenHosts(const Poco::Util::AbstractConfiguration & config) +{ + auto listen_hosts = DB::getMultipleValuesFromConfig(config, "", "listen_host"); + if (listen_hosts.empty()) + { + listen_hosts.emplace_back("::1"); + listen_hosts.emplace_back("127.0.0.1"); + } + return listen_hosts; +} + +Strings getInterserverListenHosts(const Poco::Util::AbstractConfiguration & config) +{ + auto interserver_listen_hosts = DB::getMultipleValuesFromConfig(config, "", "interserver_listen_host"); + if (!interserver_listen_hosts.empty()) + return interserver_listen_hosts; + + /// Use more general restriction in case of emptiness + return getListenHosts(config); +} + +bool getListenTry(const Poco::Util::AbstractConfiguration & config) +{ + bool listen_try = config.getBool("listen_try", false); + if (!listen_try) + { + Poco::Util::AbstractConfiguration::Keys protocols; + config.keys("protocols", protocols); + listen_try = + DB::getMultipleValuesFromConfig(config, "", "listen_host").empty() && + std::none_of(protocols.begin(), protocols.end(), [&](const auto & protocol) + { + return config.has("protocols." + protocol + ".host") && config.has("protocols." + protocol + ".port"); + }); + } + return listen_try; +} + + +void Server::createServer( + Poco::Util::AbstractConfiguration & config, + const std::string & listen_host, + const char * port_name, + bool listen_try, + bool start_server, + std::vector & servers, + CreateServerFunc && func) const +{ + /// For testing purposes, user may omit tcp_port or http_port or https_port in configuration file. + if (config.getString(port_name, "").empty()) + return; + + /// If we already have an active server for this listen_host/port_name, don't create it again + for (const auto & server : servers) + { + if (!server.isStopping() && server.getListenHost() == listen_host && server.getPortName() == port_name) + return; + } + + auto port = config.getInt(port_name); + try + { + servers.push_back(func(port)); + if (start_server) + { + servers.back().start(); + LOG_INFO(&logger(), "Listening for {}", servers.back().getDescription()); + } + global_context->registerServerPort(port_name, port); + } + catch (const Poco::Exception &) + { + if (listen_try) + { + LOG_WARNING(&logger(), "Listen [{}]:{} failed: {}. If it is an IPv6 or IPv4 address and your host has disabled IPv6 or IPv4, " + "then consider to " + "specify not disabled IPv4 or IPv6 address to listen in element of configuration " + "file. Example for disabled IPv6: 0.0.0.0 ." 
+ " Example for disabled IPv4: ::", + listen_host, port, getCurrentExceptionMessage(false)); + } + else + { + throw Exception(ErrorCodes::NETWORK_ERROR, "Listen [{}]:{} failed: {}", listen_host, port, getCurrentExceptionMessage(false)); + } + } +} + #if defined(OS_LINUX) namespace @@ -518,7 +665,6 @@ try ServerSettings server_settings; server_settings.loadSettingsFromConfig(config()); - Poco::ThreadPool server_pool(3, server_settings.max_connections); ASTAlterCommand::setFormatAlterCommandsWithParentheses(server_settings.format_alter_operations_with_parentheses); @@ -575,6 +721,11 @@ try CurrentMetrics::set(CurrentMetrics::Revision, ClickHouseRevision::getVersionRevision()); CurrentMetrics::set(CurrentMetrics::VersionInteger, ClickHouseRevision::getVersionInteger()); + Poco::ThreadPool server_pool(3, server_settings.max_connections); + std::mutex servers_lock; + std::vector servers; + std::vector servers_to_start_before_tables; + /** Context contains all that query execution is dependent: * settings, available functions, data types, aggregate functions, databases, ... */ @@ -624,10 +775,6 @@ try bool will_have_trace_collector = hasPHDRCache() && config().has("trace_log"); - std::mutex servers_lock; - ProtocolServersManager servers(context(), &logger()); - InterServersManager servers_to_start_before_tables(context(), &logger()); - // Initialize global thread pool. Do it before we fetch configs from zookeeper // nodes (`from_zk`), because ZooKeeper interface uses the pool. We will // ignore `max_thread_pool_size` in configs we fetch from ZK, but oh well. @@ -659,7 +806,32 @@ try LOG_DEBUG(log, "Shut down storages."); - servers_to_start_before_tables.stopServers(server_settings, servers_lock); + if (!servers_to_start_before_tables.empty()) + { + LOG_DEBUG(log, "Waiting for current connections to servers for tables to finish."); + size_t current_connections = 0; + { + std::lock_guard lock(servers_lock); + for (auto & server : servers_to_start_before_tables) + { + server.stop(); + current_connections += server.currentConnections(); + } + } + + if (current_connections) + LOG_INFO(log, "Closed all listening sockets. Waiting for {} outstanding connections.", current_connections); + else + LOG_INFO(log, "Closed all listening sockets."); + + if (current_connections > 0) + current_connections = waitServersToFinish(servers_to_start_before_tables, servers_lock, server_settings.shutdown_wait_unfinished); + + if (current_connections) + LOG_INFO(log, "Closed connections to servers for tables. But {} remain. 
Probably some tables of other users cannot finish their connections after context shutdown.", current_connections); + else + LOG_INFO(log, "Closed connections to servers for tables."); + } global_context->shutdownKeeperDispatcher(); @@ -756,13 +928,19 @@ try server_settings.asynchronous_heavy_metrics_update_period_s, [&]() -> std::vector { + std::vector metrics; + std::lock_guard lock(servers_lock); - std::vector metrics1 = servers_to_start_before_tables.getMetrics(); - std::vector metrics2 = servers.getMetrics(); - metrics1.reserve(metrics1.size() + metrics2.size()); - metrics1.insert(metrics1.end(), std::make_move_iterator(metrics2.begin()), std::make_move_iterator(metrics2.end())); - return metrics1; - }); + metrics.reserve(servers_to_start_before_tables.size() + servers.size()); + + for (const auto & server : servers_to_start_before_tables) + metrics.emplace_back(ProtocolServerMetrics{server.getPortName(), server.currentThreads()}); + + for (const auto & server : servers) + metrics.emplace_back(ProtocolServerMetrics{server.getPortName(), server.currentThreads()}); + return metrics; + } + ); zkutil::validateZooKeeperConfig(config()); bool has_zookeeper = zkutil::hasZooKeeperConfig(config()); @@ -1410,8 +1588,7 @@ try if (global_context->isServerCompletelyStarted()) { std::lock_guard lock(servers_lock); - servers.updateServers(*config, *this, servers_lock, server_pool, async_metrics, latest_config); - servers_to_start_before_tables.updateServers(*config, *this, servers_lock, server_pool, async_metrics, latest_config); + updateServers(*config, server_pool, async_metrics, servers, servers_to_start_before_tables); } } @@ -1458,17 +1635,141 @@ try /// Must be the last. latest_config = config; }, - /* already_loaded = */ false); /// Reload it right now (initial loading) + /* already_loaded = */ false); /// Reload it right now (initial loading) - servers_to_start_before_tables.createServers( - config(), - *this, - servers_lock, - server_pool, - async_metrics, - /* start_servers= */ false, - ServerType(ServerType::Type::QUERIES_ALL) - ); + const auto listen_hosts = getListenHosts(config()); + const auto interserver_listen_hosts = getInterserverListenHosts(config()); + const auto listen_try = getListenTry(config()); + + if (config().has("keeper_server.server_id")) + { +#if USE_NURAFT + //// If we don't have configured connection probably someone trying to use clickhouse-server instead + //// of clickhouse-keeper, so start synchronously. + bool can_initialize_keeper_async = false; + + if (has_zookeeper) /// We have configured connection to some zookeeper cluster + { + /// If we cannot connect to some other node from our cluster then we have to wait our Keeper start + /// synchronously. + can_initialize_keeper_async = global_context->tryCheckClientConnectionToMyKeeperCluster(); + } + /// Initialize keeper RAFT. 
+ global_context->initializeKeeperDispatcher(can_initialize_keeper_async); + FourLetterCommandFactory::registerCommands(*global_context->getKeeperDispatcher()); + + auto config_getter = [this] () -> const Poco::Util::AbstractConfiguration & + { + return global_context->getConfigRef(); + }; + + for (const auto & listen_host : listen_hosts) + { + /// TCP Keeper + const char * port_name = "keeper_server.tcp_port"; + createServer( + config(), listen_host, port_name, listen_try, /* start_server: */ false, + servers_to_start_before_tables, + [&](UInt16 port) -> ProtocolServerAdapter + { + Poco::Net::ServerSocket socket; + auto address = socketBindListen(config(), socket, listen_host, port); + socket.setReceiveTimeout(Poco::Timespan(config().getUInt64("keeper_server.socket_receive_timeout_sec", DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC), 0)); + socket.setSendTimeout(Poco::Timespan(config().getUInt64("keeper_server.socket_send_timeout_sec", DBMS_DEFAULT_SEND_TIMEOUT_SEC), 0)); + return ProtocolServerAdapter( + listen_host, + port_name, + "Keeper (tcp): " + address.toString(), + std::make_unique( + new KeeperTCPHandlerFactory( + config_getter, global_context->getKeeperDispatcher(), + global_context->getSettingsRef().receive_timeout.totalSeconds(), + global_context->getSettingsRef().send_timeout.totalSeconds(), + false), server_pool, socket)); + }); + + const char * secure_port_name = "keeper_server.tcp_port_secure"; + createServer( + config(), listen_host, secure_port_name, listen_try, /* start_server: */ false, + servers_to_start_before_tables, + [&](UInt16 port) -> ProtocolServerAdapter + { +#if USE_SSL + Poco::Net::SecureServerSocket socket; + auto address = socketBindListen(config(), socket, listen_host, port, /* secure = */ true); + socket.setReceiveTimeout(Poco::Timespan(config().getUInt64("keeper_server.socket_receive_timeout_sec", DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC), 0)); + socket.setSendTimeout(Poco::Timespan(config().getUInt64("keeper_server.socket_send_timeout_sec", DBMS_DEFAULT_SEND_TIMEOUT_SEC), 0)); + return ProtocolServerAdapter( + listen_host, + secure_port_name, + "Keeper with secure protocol (tcp_secure): " + address.toString(), + std::make_unique( + new KeeperTCPHandlerFactory( + config_getter, global_context->getKeeperDispatcher(), + global_context->getSettingsRef().receive_timeout.totalSeconds(), + global_context->getSettingsRef().send_timeout.totalSeconds(), true), server_pool, socket)); +#else + UNUSED(port); + throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "SSL support for TCP protocol is disabled because Poco library was built without NetSSL support."); +#endif + }); + + /// HTTP control endpoints + port_name = "keeper_server.http_control.port"; + createServer(config(), listen_host, port_name, listen_try, /* start_server: */ false, + servers_to_start_before_tables, + [&](UInt16 port) -> ProtocolServerAdapter + { + auto http_context = httpContext(); + Poco::Timespan keep_alive_timeout(config().getUInt("keep_alive_timeout", 10), 0); + Poco::Net::HTTPServerParams::Ptr http_params = new Poco::Net::HTTPServerParams; + http_params->setTimeout(http_context->getReceiveTimeout()); + http_params->setKeepAliveTimeout(keep_alive_timeout); + + Poco::Net::ServerSocket socket; + auto address = socketBindListen(config(), socket, listen_host, port); + socket.setReceiveTimeout(http_context->getReceiveTimeout()); + socket.setSendTimeout(http_context->getSendTimeout()); + return ProtocolServerAdapter( + listen_host, + port_name, + "HTTP Control: http://" + address.toString(), + std::make_unique( + 
std::move(http_context), + createKeeperHTTPControlMainHandlerFactory( + config_getter(), + global_context->getKeeperDispatcher(), + "KeeperHTTPControlHandler-factory"), server_pool, socket, http_params)); + }); + } +#else + throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "ClickHouse server built without NuRaft library. Cannot use internal coordination."); +#endif + + } + + { + std::lock_guard lock(servers_lock); + /// We should start interserver communications before (and more important shutdown after) tables. + /// Because server can wait for a long-running queries (for example in tcp_handler) after interserver handler was already shut down. + /// In this case we will have replicated tables which are unable to send any parts to other replicas, but still can + /// communicate with zookeeper, execute merges, etc. + createInterserverServers( + config(), + interserver_listen_hosts, + listen_try, + server_pool, + async_metrics, + servers_to_start_before_tables, + /* start_servers= */ false); + + + for (auto & server : servers_to_start_before_tables) + { + server.start(); + LOG_INFO(log, "Listening for {}", server.getDescription()); + } + } /// Initialize access storages. auto & access_control = global_context->getAccessControl(); @@ -1498,18 +1799,19 @@ try global_context->setStopServersCallback([&](const ServerType & server_type) { std::lock_guard lock(servers_lock); - servers.stopServers(server_type); + stopServers(servers, server_type); }); global_context->setStartServersCallback([&](const ServerType & server_type) { std::lock_guard lock(servers_lock); - servers.createServers( + createServers( config(), - *this, - servers_lock, + listen_hosts, + listen_try, server_pool, async_metrics, + servers, /* start_servers= */ true, server_type); }); @@ -1722,21 +2024,18 @@ try { std::lock_guard lock(servers_lock); - servers.createServers( - config(), - *this, - servers_lock, - server_pool, - async_metrics, - false, - ServerType(ServerType::Type::QUERIES_ALL)); + createServers(config(), listen_hosts, listen_try, server_pool, async_metrics, servers); if (servers.empty()) - throw Exception( - ErrorCodes::NO_ELEMENTS_IN_CONFIG, - "No servers started (add valid listen_host and 'tcp_port' " - "or 'http_port' to configuration file.)"); + throw Exception(ErrorCodes::NO_ELEMENTS_IN_CONFIG, + "No servers started (add valid listen_host and 'tcp_port' or 'http_port' " + "to configuration file.)"); } + if (servers.empty()) + throw Exception(ErrorCodes::NO_ELEMENTS_IN_CONFIG, + "No servers started (add valid listen_host and 'tcp_port' or 'http_port' " + "to configuration file.)"); + #if USE_SSL CertificateReloader::instance().tryLoad(config()); #endif @@ -1808,7 +2107,12 @@ try { std::lock_guard lock(servers_lock); - servers.startServers(); + for (auto & server : servers) + { + server.start(); + LOG_INFO(log, "Listening for {}", server.getDescription()); + } + global_context->setServerCompletelyStarted(); LOG_INFO(log, "Ready for connections."); } @@ -1844,10 +2148,46 @@ try access_control.stopPeriodicReloading(); is_cancelled = true; - const auto remaining_connections = servers.stopServers(server_settings, servers_lock); + + LOG_DEBUG(log, "Waiting for current connections to close."); + + size_t current_connections = 0; + { + std::lock_guard lock(servers_lock); + for (auto & server : servers) + { + server.stop(); + current_connections += server.currentConnections(); + } + } + + if (current_connections) + LOG_WARNING(log, "Closed all listening sockets. 
Waiting for {} outstanding connections.", current_connections); + else + LOG_INFO(log, "Closed all listening sockets."); + + /// Wait for unfinished backups and restores. + /// This must be done after closing listening sockets (no more backups/restores) but before ProcessList::killAllQueries + /// (because killAllQueries() will cancel all running backups/restores). + if (server_settings.shutdown_wait_backups_and_restores) + global_context->waitAllBackupsAndRestores(); + + /// Killing remaining queries. + if (!server_settings.shutdown_wait_unfinished_queries) + global_context->getProcessList().killAllQueries(); + + if (current_connections) + current_connections = waitServersToFinish(servers, servers_lock, server_settings.shutdown_wait_unfinished); + + if (current_connections) + LOG_WARNING(log, "Closed connections. But {} remain." + " Tip: To increase wait time add to config: 60", current_connections); + else + LOG_INFO(log, "Closed connections."); + dns_cache_updater.reset(); - if (remaining_connections) + if (current_connections) { /// There is no better way to force connections to close in Poco. /// Otherwise connection handlers will continue to live @@ -1881,4 +2221,561 @@ catch (...) return code ? code : -1; } +std::unique_ptr Server::buildProtocolStackFromConfig( + const Poco::Util::AbstractConfiguration & config, + const std::string & protocol, + Poco::Net::HTTPServerParams::Ptr http_params, + AsynchronousMetrics & async_metrics, + bool & is_secure) +{ + auto create_factory = [&](const std::string & type, const std::string & conf_name) -> TCPServerConnectionFactory::Ptr + { + if (type == "tcp") + return TCPServerConnectionFactory::Ptr(new TCPHandlerFactory(*this, false, false, ProfileEvents::InterfaceNativeReceiveBytes, ProfileEvents::InterfaceNativeSendBytes)); + + if (type == "tls") +#if USE_SSL + return TCPServerConnectionFactory::Ptr(new TLSHandlerFactory(*this, conf_name)); +#else + throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "SSL support for TCP protocol is disabled because Poco library was built without NetSSL support."); +#endif + + if (type == "proxy1") + return TCPServerConnectionFactory::Ptr(new ProxyV1HandlerFactory(*this, conf_name)); + if (type == "mysql") + return TCPServerConnectionFactory::Ptr(new MySQLHandlerFactory(*this, ProfileEvents::InterfaceMySQLReceiveBytes, ProfileEvents::InterfaceMySQLSendBytes)); + if (type == "postgres") + return TCPServerConnectionFactory::Ptr(new PostgreSQLHandlerFactory(*this, ProfileEvents::InterfacePostgreSQLReceiveBytes, ProfileEvents::InterfacePostgreSQLSendBytes)); + if (type == "http") + return TCPServerConnectionFactory::Ptr( + new HTTPServerConnectionFactory(httpContext(), http_params, createHandlerFactory(*this, config, async_metrics, "HTTPHandler-factory"), ProfileEvents::InterfaceHTTPReceiveBytes, ProfileEvents::InterfaceHTTPSendBytes) + ); + if (type == "prometheus") + return TCPServerConnectionFactory::Ptr( + new HTTPServerConnectionFactory(httpContext(), http_params, createHandlerFactory(*this, config, async_metrics, "PrometheusHandler-factory"), ProfileEvents::InterfacePrometheusReceiveBytes, ProfileEvents::InterfacePrometheusSendBytes) + ); + if (type == "interserver") + return TCPServerConnectionFactory::Ptr( + new HTTPServerConnectionFactory(httpContext(), http_params, createHandlerFactory(*this, config, async_metrics, "InterserverIOHTTPHandler-factory"), ProfileEvents::InterfaceInterserverReceiveBytes, ProfileEvents::InterfaceInterserverSendBytes) + ); + + throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, 
"Protocol configuration error, unknown protocol name '{}'", type); + }; + + std::string conf_name = "protocols." + protocol; + std::string prefix = conf_name + "."; + std::unordered_set pset {conf_name}; + + auto stack = std::make_unique(*this, conf_name); + + while (true) + { + // if there is no "type" - it's a reference to another protocol and this is just an endpoint + if (config.has(prefix + "type")) + { + std::string type = config.getString(prefix + "type"); + if (type == "tls") + { + if (is_secure) + throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "Protocol '{}' contains more than one TLS layer", protocol); + is_secure = true; + } + + stack->append(create_factory(type, conf_name)); + } + + if (!config.has(prefix + "impl")) + break; + + conf_name = "protocols." + config.getString(prefix + "impl"); + prefix = conf_name + "."; + + if (!pset.insert(conf_name).second) + throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "Protocol '{}' configuration contains a loop on '{}'", protocol, conf_name); + } + + return stack; +} + +HTTPContextPtr Server::httpContext() const +{ + return std::make_shared(context()); +} + +void Server::createServers( + Poco::Util::AbstractConfiguration & config, + const Strings & listen_hosts, + bool listen_try, + Poco::ThreadPool & server_pool, + AsynchronousMetrics & async_metrics, + std::vector & servers, + bool start_servers, + const ServerType & server_type) +{ + const Settings & settings = global_context->getSettingsRef(); + + Poco::Net::HTTPServerParams::Ptr http_params = new Poco::Net::HTTPServerParams; + http_params->setTimeout(settings.http_receive_timeout); + http_params->setKeepAliveTimeout(global_context->getServerSettings().keep_alive_timeout); + + Poco::Util::AbstractConfiguration::Keys protocols; + config.keys("protocols", protocols); + + for (const auto & protocol : protocols) + { + if (!server_type.shouldStart(ServerType::Type::CUSTOM, protocol)) + continue; + + std::string prefix = "protocols." 
+ protocol + "."; + std::string port_name = prefix + "port"; + std::string description {" protocol"}; + if (config.has(prefix + "description")) + description = config.getString(prefix + "description"); + + if (!config.has(prefix + "port")) + continue; + + std::vector hosts; + if (config.has(prefix + "host")) + hosts.push_back(config.getString(prefix + "host")); + else + hosts = listen_hosts; + + for (const auto & host : hosts) + { + bool is_secure = false; + auto stack = buildProtocolStackFromConfig(config, protocol, http_params, async_metrics, is_secure); + + if (stack->empty()) + throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "Protocol '{}' stack empty", protocol); + + createServer(config, host, port_name.c_str(), listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter + { + Poco::Net::ServerSocket socket; + auto address = socketBindListen(config, socket, host, port, is_secure); + socket.setReceiveTimeout(settings.receive_timeout); + socket.setSendTimeout(settings.send_timeout); + + return ProtocolServerAdapter( + host, + port_name.c_str(), + description + ": " + address.toString(), + std::make_unique( + stack.release(), + server_pool, + socket, + new Poco::Net::TCPServerParams)); + }); + } + } + + for (const auto & listen_host : listen_hosts) + { + const char * port_name; + + if (server_type.shouldStart(ServerType::Type::HTTP)) + { + /// HTTP + port_name = "http_port"; + createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter + { + Poco::Net::ServerSocket socket; + auto address = socketBindListen(config, socket, listen_host, port); + socket.setReceiveTimeout(settings.http_receive_timeout); + socket.setSendTimeout(settings.http_send_timeout); + + return ProtocolServerAdapter( + listen_host, + port_name, + "http://" + address.toString(), + std::make_unique( + httpContext(), createHandlerFactory(*this, config, async_metrics, "HTTPHandler-factory"), server_pool, socket, http_params, ProfileEvents::InterfaceHTTPReceiveBytes, ProfileEvents::InterfaceHTTPSendBytes)); + }); + } + + if (server_type.shouldStart(ServerType::Type::HTTPS)) + { + /// HTTPS + port_name = "https_port"; + createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter + { +#if USE_SSL + Poco::Net::SecureServerSocket socket; + auto address = socketBindListen(config, socket, listen_host, port, /* secure = */ true); + socket.setReceiveTimeout(settings.http_receive_timeout); + socket.setSendTimeout(settings.http_send_timeout); + return ProtocolServerAdapter( + listen_host, + port_name, + "https://" + address.toString(), + std::make_unique( + httpContext(), createHandlerFactory(*this, config, async_metrics, "HTTPSHandler-factory"), server_pool, socket, http_params, ProfileEvents::InterfaceHTTPReceiveBytes, ProfileEvents::InterfaceHTTPSendBytes)); +#else + UNUSED(port); + throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "HTTPS protocol is disabled because Poco library was built without NetSSL support."); +#endif + }); + } + + if (server_type.shouldStart(ServerType::Type::TCP)) + { + /// TCP + port_name = "tcp_port"; + createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter + { + Poco::Net::ServerSocket socket; + auto address = socketBindListen(config, socket, listen_host, port); + socket.setReceiveTimeout(settings.receive_timeout); + socket.setSendTimeout(settings.send_timeout); + return 
ProtocolServerAdapter( + listen_host, + port_name, + "native protocol (tcp): " + address.toString(), + std::make_unique( + new TCPHandlerFactory(*this, /* secure */ false, /* proxy protocol */ false, ProfileEvents::InterfaceNativeReceiveBytes, ProfileEvents::InterfaceNativeSendBytes), + server_pool, + socket, + new Poco::Net::TCPServerParams)); + }); + } + + if (server_type.shouldStart(ServerType::Type::TCP_WITH_PROXY)) + { + /// TCP with PROXY protocol, see https://github.com/wolfeidau/proxyv2/blob/master/docs/proxy-protocol.txt + port_name = "tcp_with_proxy_port"; + createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter + { + Poco::Net::ServerSocket socket; + auto address = socketBindListen(config, socket, listen_host, port); + socket.setReceiveTimeout(settings.receive_timeout); + socket.setSendTimeout(settings.send_timeout); + return ProtocolServerAdapter( + listen_host, + port_name, + "native protocol (tcp) with PROXY: " + address.toString(), + std::make_unique( + new TCPHandlerFactory(*this, /* secure */ false, /* proxy protocol */ true, ProfileEvents::InterfaceNativeReceiveBytes, ProfileEvents::InterfaceNativeSendBytes), + server_pool, + socket, + new Poco::Net::TCPServerParams)); + }); + } + + if (server_type.shouldStart(ServerType::Type::TCP_SECURE)) + { + /// TCP with SSL + port_name = "tcp_port_secure"; + createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter + { + #if USE_SSL + Poco::Net::SecureServerSocket socket; + auto address = socketBindListen(config, socket, listen_host, port, /* secure = */ true); + socket.setReceiveTimeout(settings.receive_timeout); + socket.setSendTimeout(settings.send_timeout); + return ProtocolServerAdapter( + listen_host, + port_name, + "secure native protocol (tcp_secure): " + address.toString(), + std::make_unique( + new TCPHandlerFactory(*this, /* secure */ true, /* proxy protocol */ false, ProfileEvents::InterfaceNativeReceiveBytes, ProfileEvents::InterfaceNativeSendBytes), + server_pool, + socket, + new Poco::Net::TCPServerParams)); + #else + UNUSED(port); + throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "SSL support for TCP protocol is disabled because Poco library was built without NetSSL support."); + #endif + }); + } + + if (server_type.shouldStart(ServerType::Type::MYSQL)) + { + port_name = "mysql_port"; + createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter + { + Poco::Net::ServerSocket socket; + auto address = socketBindListen(config, socket, listen_host, port, /* secure = */ true); + socket.setReceiveTimeout(Poco::Timespan()); + socket.setSendTimeout(settings.send_timeout); + return ProtocolServerAdapter( + listen_host, + port_name, + "MySQL compatibility protocol: " + address.toString(), + std::make_unique(new MySQLHandlerFactory(*this, ProfileEvents::InterfaceMySQLReceiveBytes, ProfileEvents::InterfaceMySQLSendBytes), server_pool, socket, new Poco::Net::TCPServerParams)); + }); + } + + if (server_type.shouldStart(ServerType::Type::POSTGRESQL)) + { + port_name = "postgresql_port"; + createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter + { + Poco::Net::ServerSocket socket; + auto address = socketBindListen(config, socket, listen_host, port, /* secure = */ true); + socket.setReceiveTimeout(Poco::Timespan()); + socket.setSendTimeout(settings.send_timeout); + return 
ProtocolServerAdapter( + listen_host, + port_name, + "PostgreSQL compatibility protocol: " + address.toString(), + std::make_unique(new PostgreSQLHandlerFactory(*this, ProfileEvents::InterfacePostgreSQLReceiveBytes, ProfileEvents::InterfacePostgreSQLSendBytes), server_pool, socket, new Poco::Net::TCPServerParams)); + }); + } + +#if USE_GRPC + if (server_type.shouldStart(ServerType::Type::GRPC)) + { + port_name = "grpc_port"; + createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter + { + Poco::Net::SocketAddress server_address(listen_host, port); + return ProtocolServerAdapter( + listen_host, + port_name, + "gRPC protocol: " + server_address.toString(), + std::make_unique(*this, makeSocketAddress(listen_host, port, &logger()))); + }); + } +#endif + if (server_type.shouldStart(ServerType::Type::PROMETHEUS)) + { + /// Prometheus (if defined and not setup yet with http_port) + port_name = "prometheus.port"; + createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter + { + Poco::Net::ServerSocket socket; + auto address = socketBindListen(config, socket, listen_host, port); + socket.setReceiveTimeout(settings.http_receive_timeout); + socket.setSendTimeout(settings.http_send_timeout); + return ProtocolServerAdapter( + listen_host, + port_name, + "Prometheus: http://" + address.toString(), + std::make_unique( + httpContext(), createHandlerFactory(*this, config, async_metrics, "PrometheusHandler-factory"), server_pool, socket, http_params, ProfileEvents::InterfacePrometheusReceiveBytes, ProfileEvents::InterfacePrometheusSendBytes)); + }); + } + } +} + +void Server::createInterserverServers( + Poco::Util::AbstractConfiguration & config, + const Strings & interserver_listen_hosts, + bool listen_try, + Poco::ThreadPool & server_pool, + AsynchronousMetrics & async_metrics, + std::vector & servers, + bool start_servers, + const ServerType & server_type) +{ + const Settings & settings = global_context->getSettingsRef(); + + Poco::Net::HTTPServerParams::Ptr http_params = new Poco::Net::HTTPServerParams; + http_params->setTimeout(settings.http_receive_timeout); + http_params->setKeepAliveTimeout(global_context->getServerSettings().keep_alive_timeout); + + /// Now iterate over interserver_listen_hosts + for (const auto & interserver_listen_host : interserver_listen_hosts) + { + const char * port_name; + + if (server_type.shouldStart(ServerType::Type::INTERSERVER_HTTP)) + { + /// Interserver IO HTTP + port_name = "interserver_http_port"; + createServer(config, interserver_listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter + { + Poco::Net::ServerSocket socket; + auto address = socketBindListen(config, socket, interserver_listen_host, port); + socket.setReceiveTimeout(settings.http_receive_timeout); + socket.setSendTimeout(settings.http_send_timeout); + return ProtocolServerAdapter( + interserver_listen_host, + port_name, + "replica communication (interserver): http://" + address.toString(), + std::make_unique( + httpContext(), + createHandlerFactory(*this, config, async_metrics, "InterserverIOHTTPHandler-factory"), + server_pool, + socket, + http_params, + ProfileEvents::InterfaceInterserverReceiveBytes, + ProfileEvents::InterfaceInterserverSendBytes)); + }); + } + + if (server_type.shouldStart(ServerType::Type::INTERSERVER_HTTPS)) + { + port_name = "interserver_https_port"; + createServer(config, interserver_listen_host, 
port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter + { +#if USE_SSL + Poco::Net::SecureServerSocket socket; + auto address = socketBindListen(config, socket, interserver_listen_host, port, /* secure = */ true); + socket.setReceiveTimeout(settings.http_receive_timeout); + socket.setSendTimeout(settings.http_send_timeout); + return ProtocolServerAdapter( + interserver_listen_host, + port_name, + "secure replica communication (interserver): https://" + address.toString(), + std::make_unique( + httpContext(), + createHandlerFactory(*this, config, async_metrics, "InterserverIOHTTPSHandler-factory"), + server_pool, + socket, + http_params, + ProfileEvents::InterfaceInterserverReceiveBytes, + ProfileEvents::InterfaceInterserverSendBytes)); +#else + UNUSED(port); + throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "SSL support for TCP protocol is disabled because Poco library was built without NetSSL support."); +#endif + }); + } + } +} + +void Server::stopServers( + std::vector & servers, + const ServerType & server_type +) const +{ + LoggerRawPtr log = &logger(); + + /// Remove servers once all their connections are closed + auto check_server = [&log](const char prefix[], auto & server) + { + if (!server.isStopping()) + return false; + size_t current_connections = server.currentConnections(); + LOG_DEBUG(log, "Server {}{}: {} ({} connections)", + server.getDescription(), + prefix, + !current_connections ? "finished" : "waiting", + current_connections); + return !current_connections; + }; + + std::erase_if(servers, std::bind_front(check_server, " (from one of previous remove)")); + + for (auto & server : servers) + { + if (!server.isStopping()) + { + const std::string server_port_name = server.getPortName(); + + if (server_type.shouldStop(server_port_name)) + server.stop(); + } + } + + std::erase_if(servers, std::bind_front(check_server, "")); +} + +void Server::updateServers( + Poco::Util::AbstractConfiguration & config, + Poco::ThreadPool & server_pool, + AsynchronousMetrics & async_metrics, + std::vector & servers, + std::vector & servers_to_start_before_tables) +{ + LoggerRawPtr log = &logger(); + + const auto listen_hosts = getListenHosts(config); + const auto interserver_listen_hosts = getInterserverListenHosts(config); + const auto listen_try = getListenTry(config); + + /// Remove servers once all their connections are closed + auto check_server = [&log](const char prefix[], auto & server) + { + if (!server.isStopping()) + return false; + size_t current_connections = server.currentConnections(); + LOG_DEBUG(log, "Server {}{}: {} ({} connections)", + server.getDescription(), + prefix, + !current_connections ? "finished" : "waiting", + current_connections); + return !current_connections; + }; + + std::erase_if(servers, std::bind_front(check_server, " (from one of previous reload)")); + + Poco::Util::AbstractConfiguration & previous_config = latest_config ? 
*latest_config : this->config(); + + std::vector all_servers; + all_servers.reserve(servers.size() + servers_to_start_before_tables.size()); + for (auto & server : servers) + all_servers.push_back(&server); + + for (auto & server : servers_to_start_before_tables) + all_servers.push_back(&server); + + for (auto * server : all_servers) + { + if (!server->isStopping()) + { + std::string port_name = server->getPortName(); + bool has_host = false; + bool is_http = false; + if (port_name.starts_with("protocols.")) + { + std::string protocol = port_name.substr(0, port_name.find_last_of('.')); + has_host = config.has(protocol + ".host"); + + std::string conf_name = protocol; + std::string prefix = protocol + "."; + std::unordered_set pset {conf_name}; + while (true) + { + if (config.has(prefix + "type")) + { + std::string type = config.getString(prefix + "type"); + if (type == "http") + { + is_http = true; + break; + } + } + + if (!config.has(prefix + "impl")) + break; + + conf_name = "protocols." + config.getString(prefix + "impl"); + prefix = conf_name + "."; + + if (!pset.insert(conf_name).second) + throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "Protocol '{}' configuration contains a loop on '{}'", protocol, conf_name); + } + } + else + { + /// NOTE: better to compare using getPortName() over using + /// dynamic_cast<> since HTTPServer is also used for prometheus and + /// internal replication communications. + is_http = server->getPortName() == "http_port" || server->getPortName() == "https_port"; + } + + if (!has_host) + has_host = std::find(listen_hosts.begin(), listen_hosts.end(), server->getListenHost()) != listen_hosts.end(); + bool has_port = !config.getString(port_name, "").empty(); + bool force_restart = is_http && !isSameConfiguration(previous_config, config, "http_handlers"); + if (force_restart) + LOG_TRACE(log, " had been changed, will reload {}", server->getDescription()); + + if (!has_host || !has_port || config.getInt(server->getPortName()) != server->portNumber() || force_restart) + { + server->stop(); + LOG_INFO(log, "Stopped listening for {}", server->getDescription()); + } + } + } + + createServers(config, listen_hosts, listen_try, server_pool, async_metrics, servers, /* start_servers= */ true); + createInterserverServers(config, interserver_listen_hosts, listen_try, server_pool, async_metrics, servers_to_start_before_tables, /* start_servers= */ true); + + std::erase_if(servers, std::bind_front(check_server, "")); + std::erase_if(servers_to_start_before_tables, std::bind_front(check_server, "")); +} + } diff --git a/programs/server/Server.h b/programs/server/Server.h index b4931ce53d1..3f03dd137ef 100644 --- a/programs/server/Server.h +++ b/programs/server/Server.h @@ -1,10 +1,15 @@ #pragma once #include + #include +#include +#include +#include +#include /** Server provides three interfaces: - * 1. HTTP, GRPC - simple interfaces for any applications. + * 1. HTTP - simple interface for any applications. * 2. TCP - interface for native clickhouse-client and for server to server internal communications. * More rich and efficient, but less compatible * - data is transferred by columns; @@ -13,21 +18,43 @@ * 3. Interserver HTTP - for replication. 
*/ +namespace Poco +{ + namespace Net + { + class ServerSocket; + } +} + namespace DB { +class AsynchronousMetrics; +class ProtocolServerAdapter; class Server : public BaseDaemon, public IServer { public: using ServerApplication::run; - Poco::Util::LayeredConfiguration & config() const override { return BaseDaemon::config(); } + Poco::Util::LayeredConfiguration & config() const override + { + return BaseDaemon::config(); + } - Poco::Logger & logger() const override { return BaseDaemon::logger(); } + Poco::Logger & logger() const override + { + return BaseDaemon::logger(); + } - ContextMutablePtr context() const override { return global_context; } + ContextMutablePtr context() const override + { + return global_context; + } - bool isCancelled() const override { return BaseDaemon::isCancelled(); } + bool isCancelled() const override + { + return BaseDaemon::isCancelled(); + } void defineOptions(Poco::Util::OptionSet & _options) override; @@ -46,6 +73,64 @@ private: ContextMutablePtr global_context; /// Updated/recent config, to compare http_handlers ConfigurationPtr latest_config; + + HTTPContextPtr httpContext() const; + + Poco::Net::SocketAddress socketBindListen( + const Poco::Util::AbstractConfiguration & config, + Poco::Net::ServerSocket & socket, + const std::string & host, + UInt16 port, + [[maybe_unused]] bool secure = false) const; + + std::unique_ptr buildProtocolStackFromConfig( + const Poco::Util::AbstractConfiguration & config, + const std::string & protocol, + Poco::Net::HTTPServerParams::Ptr http_params, + AsynchronousMetrics & async_metrics, + bool & is_secure); + + using CreateServerFunc = std::function; + void createServer( + Poco::Util::AbstractConfiguration & config, + const std::string & listen_host, + const char * port_name, + bool listen_try, + bool start_server, + std::vector & servers, + CreateServerFunc && func) const; + + void createServers( + Poco::Util::AbstractConfiguration & config, + const Strings & listen_hosts, + bool listen_try, + Poco::ThreadPool & server_pool, + AsynchronousMetrics & async_metrics, + std::vector & servers, + bool start_servers = false, + const ServerType & server_type = ServerType(ServerType::Type::QUERIES_ALL)); + + void createInterserverServers( + Poco::Util::AbstractConfiguration & config, + const Strings & interserver_listen_hosts, + bool listen_try, + Poco::ThreadPool & server_pool, + AsynchronousMetrics & async_metrics, + std::vector & servers, + bool start_servers = false, + const ServerType & server_type = ServerType(ServerType::Type::QUERIES_ALL)); + + void updateServers( + Poco::Util::AbstractConfiguration & config, + Poco::ThreadPool & server_pool, + AsynchronousMetrics & async_metrics, + std::vector & servers, + std::vector & servers_to_start_before_tables); + + void stopServers( + std::vector & servers, + const ServerType & server_type + ) const; }; } diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 33042fbc7fc..f2e10a27b75 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -236,7 +236,6 @@ add_object_library(clickhouse_client Client) add_object_library(clickhouse_bridge BridgeHelper) add_object_library(clickhouse_server Server) add_object_library(clickhouse_server_http Server/HTTP) -add_object_library(clickhouse_server_manager Server/ServersManager) add_object_library(clickhouse_formats Formats) add_object_library(clickhouse_processors Processors) add_object_library(clickhouse_processors_executors Processors/Executors) diff --git a/src/Server/ServersManager/IServersManager.cpp 
b/src/Server/ServersManager/IServersManager.cpp deleted file mode 100644 index 8b1eee94303..00000000000 --- a/src/Server/ServersManager/IServersManager.cpp +++ /dev/null @@ -1,268 +0,0 @@ -#include - -#include -#include -#include -#include -#include -#include -#include - -namespace DB -{ - -namespace ErrorCodes -{ -extern const int NETWORK_ERROR; -extern const int INVALID_CONFIG_PARAMETER; -} - -IServersManager::IServersManager(ContextMutablePtr global_context_, Poco::Logger * logger_) - : global_context(global_context_), logger(logger_) -{ -} - - -bool IServersManager::empty() const -{ - return servers.empty(); -} - -std::vector IServersManager::getMetrics() const -{ - std::vector metrics; - metrics.reserve(servers.size()); - for (const auto & server : servers) - metrics.emplace_back(ProtocolServerMetrics{server.getPortName(), server.currentThreads()}); - return metrics; -} - -void IServersManager::startServers() -{ - for (auto & server : servers) - { - server.start(); - LOG_INFO(logger, "Listening for {}", server.getDescription()); - } -} - -void IServersManager::stopServers(const ServerType & server_type) -{ - /// Remove servers once all their connections are closed - auto check_server = [&](const char prefix[], auto & server) - { - if (!server.isStopping()) - return false; - size_t current_connections = server.currentConnections(); - LOG_DEBUG( - logger, - "Server {}{}: {} ({} connections)", - server.getDescription(), - prefix, - !current_connections ? "finished" : "waiting", - current_connections); - return !current_connections; - }; - - std::erase_if(servers, std::bind_front(check_server, " (from one of previous remove)")); - - for (auto & server : servers) - { - if (!server.isStopping() && server_type.shouldStop(server.getPortName())) - server.stop(); - } - - std::erase_if(servers, std::bind_front(check_server, "")); -} - -void IServersManager::updateServers( - const Poco::Util::AbstractConfiguration & config, - IServer & iserver, - std::mutex & servers_lock, - Poco::ThreadPool & server_pool, - AsynchronousMetrics & async_metrics, - ConfigurationPtr latest_config) -{ - stopServersForUpdate(config, latest_config); - createServers(config, iserver, servers_lock, server_pool, async_metrics, true, ServerType(ServerType::Type::QUERIES_ALL)); -} - -Poco::Net::SocketAddress IServersManager::socketBindListen( - const Poco::Util::AbstractConfiguration & config, Poco::Net::ServerSocket & socket, const std::string & host, UInt16 port) const -{ - auto address = makeSocketAddress(host, port, logger); - socket.bind(address, /* reuseAddress = */ true, /* reusePort = */ config.getBool("listen_reuse_port", false)); - /// If caller requests any available port from the OS, discover it after binding. - if (port == 0) - { - address = socket.address(); - LOG_DEBUG(logger, "Requested any available port (port == 0), actual port is {:d}", address.port()); - } - - socket.listen(/* backlog = */ config.getUInt("listen_backlog", 4096)); - return address; -} - -void IServersManager::createServer( - const Poco::Util::AbstractConfiguration & config, - const std::string & listen_host, - const char * port_name, - bool start_server, - CreateServerFunc && func) -{ - /// For testing purposes, user may omit tcp_port or http_port or https_port in configuration file. 
- if (config.getString(port_name, "").empty()) - return; - - /// If we already have an active server for this listen_host/port_name, don't create it again - for (const auto & server : servers) - { - if (!server.isStopping() && server.getListenHost() == listen_host && server.getPortName() == port_name) - return; - } - - auto port = config.getInt(port_name); - try - { - servers.push_back(func(port)); - if (start_server) - { - servers.back().start(); - LOG_INFO(logger, "Listening for {}", servers.back().getDescription()); - } - global_context->registerServerPort(port_name, port); - } - catch (const Poco::Exception &) - { - if (!getListenTry(config)) - { - throw Exception(ErrorCodes::NETWORK_ERROR, "Listen [{}]:{} failed: {}", listen_host, port, getCurrentExceptionMessage(false)); - } - LOG_WARNING( - logger, - "Listen [{}]:{} failed: {}. If it is an IPv6 or IPv4 address and your host has disabled IPv6 or IPv4, " - "then consider to " - "specify not disabled IPv4 or IPv6 address to listen in element of configuration " - "file. Example for disabled IPv6: 0.0.0.0 ." - " Example for disabled IPv4: ::", - listen_host, - port, - getCurrentExceptionMessage(false)); - } -} - -void IServersManager::stopServersForUpdate(const Poco::Util::AbstractConfiguration & config, ConfigurationPtr latest_config) -{ - /// Remove servers once all their connections are closed - auto check_server = [&](const char prefix[], auto & server) - { - if (!server.isStopping()) - return false; - size_t current_connections = server.currentConnections(); - LOG_DEBUG( - logger, - "Server {}{}: {} ({} connections)", - server.getDescription(), - prefix, - !current_connections ? "finished" : "waiting", - current_connections); - return !current_connections; - }; - - std::erase_if(servers, std::bind_front(check_server, " (from one of previous reload)")); - - const auto listen_hosts = getListenHosts(config); - const Poco::Util::AbstractConfiguration & previous_config = latest_config ? *latest_config : config; - - for (auto & server : servers) - { - if (server.isStopping()) - return; - std::string port_name = server.getPortName(); - bool has_host = false; - bool is_http = false; - if (port_name.starts_with("protocols.")) - { - std::string protocol = port_name.substr(0, port_name.find_last_of('.')); - has_host = config.has(protocol + ".host"); - - std::string conf_name = protocol; - std::string prefix = protocol + "."; - std::unordered_set pset{conf_name}; - while (true) - { - if (config.has(prefix + "type")) - { - std::string type = config.getString(prefix + "type"); - if (type == "http") - { - is_http = true; - break; - } - } - - if (!config.has(prefix + "impl")) - break; - - conf_name = "protocols." + config.getString(prefix + "impl"); - prefix = conf_name + "."; - - if (!pset.insert(conf_name).second) - throw Exception( - ErrorCodes::INVALID_CONFIG_PARAMETER, "Protocol '{}' configuration contains a loop on '{}'", protocol, conf_name); - } - } - else - { - /// NOTE: better to compare using getPortName() over using - /// dynamic_cast<> since HTTPServer is also used for prometheus and - /// internal replication communications. 
- is_http = server.getPortName() == "http_port" || server.getPortName() == "https_port"; - } - - if (!has_host) - has_host = std::find(listen_hosts.begin(), listen_hosts.end(), server.getListenHost()) != listen_hosts.end(); - bool has_port = !config.getString(port_name, "").empty(); - bool force_restart = is_http && !isSameConfiguration(previous_config, config, "http_handlers"); - if (force_restart) - LOG_TRACE(logger, " had been changed, will reload {}", server.getDescription()); - - if (!has_host || !has_port || config.getInt(server.getPortName()) != server.portNumber() || force_restart) - { - server.stop(); - LOG_INFO(logger, "Stopped listening for {}", server.getDescription()); - } - } - - std::erase_if(servers, std::bind_front(check_server, "")); -} - -Strings IServersManager::getListenHosts(const Poco::Util::AbstractConfiguration & config) const -{ - auto listen_hosts = DB::getMultipleValuesFromConfig(config, "", "listen_host"); - if (listen_hosts.empty()) - { - listen_hosts.emplace_back("::1"); - listen_hosts.emplace_back("127.0.0.1"); - } - return listen_hosts; -} - -bool IServersManager::getListenTry(const Poco::Util::AbstractConfiguration & config) const -{ - bool listen_try = config.getBool("listen_try", false); - if (!listen_try) - { - Poco::Util::AbstractConfiguration::Keys protocols; - config.keys("protocols", protocols); - listen_try = DB::getMultipleValuesFromConfig(config, "", "listen_host").empty() - && std::none_of( - protocols.begin(), - protocols.end(), - [&](const auto & protocol) - { return config.has("protocols." + protocol + ".host") && config.has("protocols." + protocol + ".port"); }); - } - return listen_try; -} - -} diff --git a/src/Server/ServersManager/IServersManager.h b/src/Server/ServersManager/IServersManager.h deleted file mode 100644 index 7e1d9d50d82..00000000000 --- a/src/Server/ServersManager/IServersManager.h +++ /dev/null @@ -1,74 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace DB -{ - -class IServersManager -{ -public: - IServersManager(ContextMutablePtr global_context_, Poco::Logger * logger_); - virtual ~IServersManager() = default; - - bool empty() const; - std::vector getMetrics() const; - - virtual void createServers( - const Poco::Util::AbstractConfiguration & config, - IServer & server, - std::mutex & servers_lock, - Poco::ThreadPool & server_pool, - AsynchronousMetrics & async_metrics, - bool start_servers, - const ServerType & server_type) - = 0; - - void startServers(); - - void stopServers(const ServerType & server_type); - virtual size_t stopServers(const ServerSettings & server_settings, std::mutex & servers_lock) = 0; - - virtual void updateServers( - const Poco::Util::AbstractConfiguration & config, - IServer & server, - std::mutex & servers_lock, - Poco::ThreadPool & server_pool, - AsynchronousMetrics & async_metrics, - ConfigurationPtr latest_config); - -protected: - ContextMutablePtr global_context; - Poco::Logger * logger; - - std::vector servers; - - Poco::Net::SocketAddress socketBindListen( - const Poco::Util::AbstractConfiguration & config, Poco::Net::ServerSocket & socket, const std::string & host, UInt16 port) const; - - using CreateServerFunc = std::function; - void createServer( - const Poco::Util::AbstractConfiguration & config, - const std::string & listen_host, - const char * port_name, - bool start_server, - CreateServerFunc && func); - - void stopServersForUpdate(const Poco::Util::AbstractConfiguration & 
config, ConfigurationPtr latest_config); - - Strings getListenHosts(const Poco::Util::AbstractConfiguration & config) const; - bool getListenTry(const Poco::Util::AbstractConfiguration & config) const; -}; - -} diff --git a/src/Server/ServersManager/InterServersManager.cpp b/src/Server/ServersManager/InterServersManager.cpp deleted file mode 100644 index 4425d468248..00000000000 --- a/src/Server/ServersManager/InterServersManager.cpp +++ /dev/null @@ -1,327 +0,0 @@ -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if USE_SSL -# include -#endif - -#if USE_NURAFT -# include -# include -#endif - -namespace ProfileEvents -{ -extern const Event InterfaceInterserverSendBytes; -extern const Event InterfaceInterserverReceiveBytes; -} - -namespace DB -{ - -namespace ErrorCodes -{ -extern const int SUPPORT_IS_DISABLED; -} - -void InterServersManager::createServers( - const Poco::Util::AbstractConfiguration & config, - IServer & server, - std::mutex & servers_lock, - Poco::ThreadPool & server_pool, - AsynchronousMetrics & async_metrics, - bool start_servers, - const ServerType & server_type) -{ - if (config.has("keeper_server.server_id")) - { -#if USE_NURAFT - //// If we don't have configured connection probably someone trying to use clickhouse-server instead - //// of clickhouse-keeper, so start synchronously. - bool can_initialize_keeper_async = false; - - if (zkutil::hasZooKeeperConfig(config)) /// We have configured connection to some zookeeper cluster - { - /// If we cannot connect to some other node from our cluster then we have to wait our Keeper start - /// synchronously. - can_initialize_keeper_async = global_context->tryCheckClientConnectionToMyKeeperCluster(); - } - /// Initialize keeper RAFT. 
- global_context->initializeKeeperDispatcher(can_initialize_keeper_async); - FourLetterCommandFactory::registerCommands(*global_context->getKeeperDispatcher()); - - auto config_getter = [this]() -> const Poco::Util::AbstractConfiguration & { return global_context->getConfigRef(); }; - - for (const auto & listen_host : getListenHosts(config)) - { - /// TCP Keeper - constexpr auto port_name = "keeper_server.tcp_port"; - createServer( - config, - listen_host, - port_name, - /* start_server = */ false, - [&](UInt16 port) -> ProtocolServerAdapter - { - Poco::Net::ServerSocket socket; - auto address = socketBindListen(config, socket, listen_host, port); - socket.setReceiveTimeout( - Poco::Timespan(config.getUInt64("keeper_server.socket_receive_timeout_sec", DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC), 0)); - socket.setSendTimeout( - Poco::Timespan(config.getUInt64("keeper_server.socket_send_timeout_sec", DBMS_DEFAULT_SEND_TIMEOUT_SEC), 0)); - return ProtocolServerAdapter( - listen_host, - port_name, - "Keeper (tcp): " + address.toString(), - std::make_unique( - new KeeperTCPHandlerFactory( - config_getter, - global_context->getKeeperDispatcher(), - global_context->getSettingsRef().receive_timeout.totalSeconds(), - global_context->getSettingsRef().send_timeout.totalSeconds(), - false), - server_pool, - socket)); - }); - - constexpr auto secure_port_name = "keeper_server.tcp_port_secure"; - createServer( - config, - listen_host, - secure_port_name, - /* start_server = */ false, - [&](UInt16 port) -> ProtocolServerAdapter - { -# if USE_SSL - Poco::Net::SecureServerSocket socket; - auto address = socketBindListen(config, socket, listen_host, port); - socket.setReceiveTimeout( - Poco::Timespan(config.getUInt64("keeper_server.socket_receive_timeout_sec", DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC), 0)); - socket.setSendTimeout( - Poco::Timespan(config.getUInt64("keeper_server.socket_send_timeout_sec", DBMS_DEFAULT_SEND_TIMEOUT_SEC), 0)); - return ProtocolServerAdapter( - listen_host, - secure_port_name, - "Keeper with secure protocol (tcp_secure): " + address.toString(), - std::make_unique( - new KeeperTCPHandlerFactory( - config_getter, - global_context->getKeeperDispatcher(), - global_context->getSettingsRef().receive_timeout.totalSeconds(), - global_context->getSettingsRef().send_timeout.totalSeconds(), - true), - server_pool, - socket)); -# else - UNUSED(port); - throw Exception( - ErrorCodes::SUPPORT_IS_DISABLED, - "SSL support for TCP protocol is disabled because Poco library was built without NetSSL support."); -# endif - }); - - /// HTTP control endpoints - createServer( - config, - listen_host, - /* port_name = */ "keeper_server.http_control.port", - /* start_server = */ false, - [&](UInt16 port) -> ProtocolServerAdapter - { - auto http_context = std::make_shared(global_context); - Poco::Timespan keep_alive_timeout(config.getUInt("keep_alive_timeout", 10), 0); - Poco::Net::HTTPServerParams::Ptr http_params = new Poco::Net::HTTPServerParams; - http_params->setTimeout(http_context->getReceiveTimeout()); - http_params->setKeepAliveTimeout(keep_alive_timeout); - - Poco::Net::ServerSocket socket; - auto address = socketBindListen(config, socket, listen_host, port); - socket.setReceiveTimeout(http_context->getReceiveTimeout()); - socket.setSendTimeout(http_context->getSendTimeout()); - return ProtocolServerAdapter( - listen_host, - port_name, - "HTTP Control: http://" + address.toString(), - std::make_unique( - std::move(http_context), - createKeeperHTTPControlMainHandlerFactory( - config_getter(), 
global_context->getKeeperDispatcher(), "KeeperHTTPControlHandler-factory"), - server_pool, - socket, - http_params)); - }); - } -#else - throw Exception( - ErrorCodes::SUPPORT_IS_DISABLED, "ClickHouse server built without NuRaft library. Cannot use internal coordination."); -#endif - } - - { - std::lock_guard lock(servers_lock); - /// We should start interserver communications before (and more important shutdown after) tables. - /// Because server can wait for a long-running queries (for example in tcp_handler) after interserver handler was already shut down. - /// In this case we will have replicated tables which are unable to send any parts to other replicas, but still can - /// communicate with zookeeper, execute merges, etc. - createInterserverServers(config, server, server_pool, async_metrics, start_servers, server_type); - startServers(); - } -} - -size_t InterServersManager::stopServers(const ServerSettings & server_settings, std::mutex & servers_lock) -{ - if (servers.empty()) - { - return 0; - } - - LOG_DEBUG(logger, "Waiting for current connections to servers for tables to finish."); - - size_t current_connections = 0; - { - std::lock_guard lock(servers_lock); - for (auto & server : servers) - { - server.stop(); - current_connections += server.currentConnections(); - } - } - - if (current_connections) - LOG_INFO(logger, "Closed all listening sockets. Waiting for {} outstanding connections.", current_connections); - else - LOG_INFO(logger, "Closed all listening sockets."); - - if (current_connections > 0) - current_connections = waitServersToFinish(servers, servers_lock, server_settings.shutdown_wait_unfinished); - - if (current_connections) - LOG_INFO( - logger, - "Closed connections to servers for tables. But {} remain. Probably some tables of other users cannot finish their connections " - "after context shutdown.", - current_connections); - else - LOG_INFO(logger, "Closed connections to servers for tables."); - return current_connections; -} - -void InterServersManager::updateServers( - const Poco::Util::AbstractConfiguration & config, - IServer & iserver, - std::mutex & /*servers_lock*/, - Poco::ThreadPool & server_pool, - AsynchronousMetrics & async_metrics, - ConfigurationPtr latest_config) -{ - stopServersForUpdate(config, latest_config); - createInterserverServers(config, iserver, server_pool, async_metrics, true, ServerType(ServerType::Type::QUERIES_ALL)); -} - -Strings InterServersManager::getInterserverListenHosts(const Poco::Util::AbstractConfiguration & config) const -{ - auto interserver_listen_hosts = DB::getMultipleValuesFromConfig(config, "", "interserver_listen_host"); - if (!interserver_listen_hosts.empty()) - return interserver_listen_hosts; - - /// Use more general restriction in case of emptiness - return getListenHosts(config); -} - -void InterServersManager::createInterserverServers( - const Poco::Util::AbstractConfiguration & config, - IServer & server, - Poco::ThreadPool & server_pool, - AsynchronousMetrics & async_metrics, - bool start_servers, - const ServerType & server_type) -{ - const Settings & settings = global_context->getSettingsRef(); - - Poco::Net::HTTPServerParams::Ptr http_params = new Poco::Net::HTTPServerParams; - http_params->setTimeout(settings.http_receive_timeout); - http_params->setKeepAliveTimeout(global_context->getServerSettings().keep_alive_timeout); - - /// Now iterate over interserver_listen_hosts - for (const auto & interserver_listen_host : getInterserverListenHosts(config)) - { - if 
(server_type.shouldStart(ServerType::Type::INTERSERVER_HTTP)) - { - /// Interserver IO HTTP - constexpr auto port_name = "interserver_http_port"; - createServer( - config, - interserver_listen_host, - port_name, - start_servers, - [&](UInt16 port) -> ProtocolServerAdapter - { - Poco::Net::ServerSocket socket; - auto address = socketBindListen(config, socket, interserver_listen_host, port); - socket.setReceiveTimeout(settings.http_receive_timeout); - socket.setSendTimeout(settings.http_send_timeout); - return ProtocolServerAdapter( - interserver_listen_host, - port_name, - "replica communication (interserver): http://" + address.toString(), - std::make_unique( - std::make_shared(global_context), - createHandlerFactory(server, config, async_metrics, "InterserverIOHTTPHandler-factory"), - server_pool, - socket, - http_params, - ProfileEvents::InterfaceInterserverReceiveBytes, - ProfileEvents::InterfaceInterserverSendBytes)); - }); - } - - if (server_type.shouldStart(ServerType::Type::INTERSERVER_HTTPS)) - { - constexpr auto port_name = "interserver_https_port"; - createServer( - config, - interserver_listen_host, - port_name, - start_servers, - [&](UInt16 port) -> ProtocolServerAdapter - { -#if USE_SSL - Poco::Net::SecureServerSocket socket; - auto address = socketBindListen(config, socket, interserver_listen_host, port); - socket.setReceiveTimeout(settings.http_receive_timeout); - socket.setSendTimeout(settings.http_send_timeout); - return ProtocolServerAdapter( - interserver_listen_host, - port_name, - "secure replica communication (interserver): https://" + address.toString(), - std::make_unique( - std::make_shared(global_context), - createHandlerFactory(server, config, async_metrics, "InterserverIOHTTPSHandler-factory"), - server_pool, - socket, - http_params, - ProfileEvents::InterfaceInterserverReceiveBytes, - ProfileEvents::InterfaceInterserverSendBytes)); -#else - UNUSED(port); - throw Exception( - ErrorCodes::SUPPORT_IS_DISABLED, - "SSL support for TCP protocol is disabled because Poco library was built without NetSSL support."); -#endif - }); - } - } -} - -} diff --git a/src/Server/ServersManager/InterServersManager.h b/src/Server/ServersManager/InterServersManager.h deleted file mode 100644 index 8780eae18e0..00000000000 --- a/src/Server/ServersManager/InterServersManager.h +++ /dev/null @@ -1,44 +0,0 @@ -#pragma once - -#include - -namespace DB -{ - -class InterServersManager : public IServersManager -{ -public: - using IServersManager::IServersManager; - - void createServers( - const Poco::Util::AbstractConfiguration & config, - IServer & server, - std::mutex & servers_lock, - Poco::ThreadPool & server_pool, - AsynchronousMetrics & async_metrics, - bool start_servers, - const ServerType & server_type) override; - - size_t stopServers(const ServerSettings & server_settings, std::mutex & servers_lock) override; - - void updateServers( - const Poco::Util::AbstractConfiguration & config, - IServer & iserver, - std::mutex & servers_lock, - Poco::ThreadPool & server_pool, - AsynchronousMetrics & async_metrics, - ConfigurationPtr latest_config) override; - -private: - Strings getInterserverListenHosts(const Poco::Util::AbstractConfiguration & config) const; - - void createInterserverServers( - const Poco::Util::AbstractConfiguration & config, - IServer & server, - Poco::ThreadPool & server_pool, - AsynchronousMetrics & async_metrics, - bool start_servers, - const ServerType & server_type); -}; - -} diff --git a/src/Server/ServersManager/ProtocolServersManager.cpp 
b/src/Server/ServersManager/ProtocolServersManager.cpp deleted file mode 100644 index af57de3ac3c..00000000000 --- a/src/Server/ServersManager/ProtocolServersManager.cpp +++ /dev/null @@ -1,523 +0,0 @@ -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if USE_SSL -# include -#endif - -#if USE_GRPC -# include -#endif - -namespace ProfileEvents -{ -extern const Event InterfaceNativeSendBytes; -extern const Event InterfaceNativeReceiveBytes; -extern const Event InterfaceHTTPSendBytes; -extern const Event InterfaceHTTPReceiveBytes; -extern const Event InterfacePrometheusSendBytes; -extern const Event InterfacePrometheusReceiveBytes; -extern const Event InterfaceMySQLSendBytes; -extern const Event InterfaceMySQLReceiveBytes; -extern const Event InterfacePostgreSQLSendBytes; -extern const Event InterfacePostgreSQLReceiveBytes; -extern const Event InterfaceInterserverSendBytes; -extern const Event InterfaceInterserverReceiveBytes; -} - -namespace DB -{ - -namespace ErrorCodes -{ -extern const int SUPPORT_IS_DISABLED; -extern const int INVALID_CONFIG_PARAMETER; -} - -void ProtocolServersManager::createServers( - const Poco::Util::AbstractConfiguration & config, - IServer & server, - std::mutex & /*servers_lock*/, - Poco::ThreadPool & server_pool, - AsynchronousMetrics & async_metrics, - bool start_servers, - const ServerType & server_type) -{ - auto listen_hosts = getListenHosts(config); - const Settings & settings = global_context->getSettingsRef(); - - Poco::Net::HTTPServerParams::Ptr http_params = new Poco::Net::HTTPServerParams; - http_params->setTimeout(settings.http_receive_timeout); - http_params->setKeepAliveTimeout(global_context->getServerSettings().keep_alive_timeout); - - Poco::Util::AbstractConfiguration::Keys protocols; - config.keys("protocols", protocols); - - for (const auto & protocol : protocols) - { - if (!server_type.shouldStart(ServerType::Type::CUSTOM, protocol)) - continue; - - std::string prefix = "protocols." 
+ protocol + "."; - std::string port_name = prefix + "port"; - std::string description{" protocol"}; - if (config.has(prefix + "description")) - description = config.getString(prefix + "description"); - - if (!config.has(prefix + "port")) - continue; - - std::vector hosts; - if (config.has(prefix + "host")) - hosts.push_back(config.getString(prefix + "host")); - else - hosts = listen_hosts; - - for (const auto & host : hosts) - { - bool is_secure = false; - auto stack = buildProtocolStackFromConfig(config, server, protocol, http_params, async_metrics, is_secure); - - if (stack->empty()) - throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "Protocol '{}' stack empty", protocol); - - createServer( - config, - host, - port_name.c_str(), - start_servers, - [&](UInt16 port) -> ProtocolServerAdapter - { - Poco::Net::ServerSocket socket; - auto address = socketBindListen(config, socket, host, port); - socket.setReceiveTimeout(settings.receive_timeout); - socket.setSendTimeout(settings.send_timeout); - return ProtocolServerAdapter( - host, - port_name.c_str(), - description + ": " + address.toString(), - std::make_unique(stack.release(), server_pool, socket, new Poco::Net::TCPServerParams)); - }); - } - } - - for (const auto & listen_host : listen_hosts) - { - if (server_type.shouldStart(ServerType::Type::HTTP)) - { - /// HTTP - constexpr auto port_name = "http_port"; - createServer( - config, - listen_host, - port_name, - start_servers, - [&](UInt16 port) -> ProtocolServerAdapter - { - Poco::Net::ServerSocket socket; - auto address = socketBindListen(config, socket, listen_host, port); - socket.setReceiveTimeout(settings.http_receive_timeout); - socket.setSendTimeout(settings.http_send_timeout); - return ProtocolServerAdapter( - listen_host, - port_name, - "http://" + address.toString(), - std::make_unique( - std::make_shared(global_context), - createHandlerFactory(server, config, async_metrics, "HTTPHandler-factory"), - server_pool, - socket, - http_params, - ProfileEvents::InterfaceHTTPReceiveBytes, - ProfileEvents::InterfaceHTTPSendBytes)); - }); - } - - if (server_type.shouldStart(ServerType::Type::HTTPS)) - { - /// HTTPS - constexpr auto port_name = "https_port"; - createServer( - config, - listen_host, - port_name, - start_servers, - [&](UInt16 port) -> ProtocolServerAdapter - { -#if USE_SSL - Poco::Net::SecureServerSocket socket; - auto address = socketBindListen(config, socket, listen_host, port); - socket.setReceiveTimeout(settings.http_receive_timeout); - socket.setSendTimeout(settings.http_send_timeout); - return ProtocolServerAdapter( - listen_host, - port_name, - "https://" + address.toString(), - std::make_unique( - std::make_shared(global_context), - createHandlerFactory(server, config, async_metrics, "HTTPSHandler-factory"), - server_pool, - socket, - http_params, - ProfileEvents::InterfaceHTTPReceiveBytes, - ProfileEvents::InterfaceHTTPSendBytes)); -#else - UNUSED(port); - throw Exception( - ErrorCodes::SUPPORT_IS_DISABLED, - "HTTPS protocol is disabled because Poco library was built without NetSSL support."); -#endif - }); - } - - if (server_type.shouldStart(ServerType::Type::TCP)) - { - /// TCP - constexpr auto port_name = "tcp_port"; - createServer( - config, - listen_host, - port_name, - start_servers, - [&](UInt16 port) -> ProtocolServerAdapter - { - Poco::Net::ServerSocket socket; - auto address = socketBindListen(config, socket, listen_host, port); - socket.setReceiveTimeout(settings.receive_timeout); - socket.setSendTimeout(settings.send_timeout); - return 
ProtocolServerAdapter( - listen_host, - port_name, - "native protocol (tcp): " + address.toString(), - std::make_unique( - new TCPHandlerFactory( - server, false, false, ProfileEvents::InterfaceNativeReceiveBytes, ProfileEvents::InterfaceNativeSendBytes), - server_pool, - socket, - new Poco::Net::TCPServerParams)); - }); - } - - if (server_type.shouldStart(ServerType::Type::TCP_WITH_PROXY)) - { - /// TCP with PROXY protocol, see https://github.com/wolfeidau/proxyv2/blob/master/docs/proxy-protocol.txt - constexpr auto port_name = "tcp_with_proxy_port"; - createServer( - config, - listen_host, - port_name, - start_servers, - [&](UInt16 port) -> ProtocolServerAdapter - { - Poco::Net::ServerSocket socket; - auto address = socketBindListen(config, socket, listen_host, port); - socket.setReceiveTimeout(settings.receive_timeout); - socket.setSendTimeout(settings.send_timeout); - return ProtocolServerAdapter( - listen_host, - port_name, - "native protocol (tcp) with PROXY: " + address.toString(), - std::make_unique( - new TCPHandlerFactory( - server, false, true, ProfileEvents::InterfaceNativeReceiveBytes, ProfileEvents::InterfaceNativeSendBytes), - server_pool, - socket, - new Poco::Net::TCPServerParams)); - }); - } - - if (server_type.shouldStart(ServerType::Type::TCP_SECURE)) - { - /// TCP with SSL - constexpr auto port_name = "tcp_port_secure"; - createServer( - config, - listen_host, - port_name, - start_servers, - [&](UInt16 port) -> ProtocolServerAdapter - { -#if USE_SSL - Poco::Net::SecureServerSocket socket; - auto address = socketBindListen(config, socket, listen_host, port); - socket.setReceiveTimeout(settings.receive_timeout); - socket.setSendTimeout(settings.send_timeout); - return ProtocolServerAdapter( - listen_host, - port_name, - "secure native protocol (tcp_secure): " + address.toString(), - std::make_unique( - new TCPHandlerFactory( - server, true, false, ProfileEvents::InterfaceNativeReceiveBytes, ProfileEvents::InterfaceNativeSendBytes), - server_pool, - socket, - new Poco::Net::TCPServerParams)); -#else - UNUSED(port); - throw Exception( - ErrorCodes::SUPPORT_IS_DISABLED, - "SSL support for TCP protocol is disabled because Poco library was built without NetSSL support."); -#endif - }); - } - - if (server_type.shouldStart(ServerType::Type::MYSQL)) - { - constexpr auto port_name = "mysql_port"; - createServer( - config, - listen_host, - port_name, - start_servers, - [&](UInt16 port) -> ProtocolServerAdapter - { - Poco::Net::ServerSocket socket; - auto address = socketBindListen(config, socket, listen_host, port); - socket.setReceiveTimeout(Poco::Timespan()); - socket.setSendTimeout(settings.send_timeout); - return ProtocolServerAdapter( - listen_host, - port_name, - "MySQL compatibility protocol: " + address.toString(), - std::make_unique( - new MySQLHandlerFactory( - server, ProfileEvents::InterfaceMySQLReceiveBytes, ProfileEvents::InterfaceMySQLSendBytes), - server_pool, - socket, - new Poco::Net::TCPServerParams)); - }); - } - - if (server_type.shouldStart(ServerType::Type::POSTGRESQL)) - { - constexpr auto port_name = "postgresql_port"; - createServer( - config, - listen_host, - port_name, - start_servers, - [&](UInt16 port) -> ProtocolServerAdapter - { - Poco::Net::ServerSocket socket; - auto address = socketBindListen(config, socket, listen_host, port); - socket.setReceiveTimeout(Poco::Timespan()); - socket.setSendTimeout(settings.send_timeout); - return ProtocolServerAdapter( - listen_host, - port_name, - "PostgreSQL compatibility protocol: " + address.toString(), - 
std::make_unique( - new PostgreSQLHandlerFactory( - server, ProfileEvents::InterfacePostgreSQLReceiveBytes, ProfileEvents::InterfacePostgreSQLSendBytes), - server_pool, - socket, - new Poco::Net::TCPServerParams)); - }); - } - -#if USE_GRPC - if (server_type.shouldStart(ServerType::Type::GRPC)) - { - constexpr auto port_name = "grpc_port"; - createServer( - config, - listen_host, - port_name, - start_servers, - [&](UInt16 port) -> ProtocolServerAdapter - { - Poco::Net::SocketAddress server_address(listen_host, port); - return ProtocolServerAdapter( - listen_host, - port_name, - "gRPC protocol: " + server_address.toString(), - std::make_unique(server, makeSocketAddress(listen_host, port, logger))); - }); - } -#endif - if (server_type.shouldStart(ServerType::Type::PROMETHEUS)) - { - /// Prometheus (if defined and not setup yet with http_port) - constexpr auto port_name = "prometheus.port"; - createServer( - config, - listen_host, - port_name, - start_servers, - [&](UInt16 port) -> ProtocolServerAdapter - { - Poco::Net::ServerSocket socket; - auto address = socketBindListen(config, socket, listen_host, port); - socket.setReceiveTimeout(settings.http_receive_timeout); - socket.setSendTimeout(settings.http_send_timeout); - return ProtocolServerAdapter( - listen_host, - port_name, - "Prometheus: http://" + address.toString(), - std::make_unique( - std::make_shared(global_context), - createHandlerFactory(server, config, async_metrics, "PrometheusHandler-factory"), - server_pool, - socket, - http_params, - ProfileEvents::InterfacePrometheusReceiveBytes, - ProfileEvents::InterfacePrometheusSendBytes)); - }); - } - } -} - -size_t ProtocolServersManager::stopServers(const ServerSettings & server_settings, std::mutex & servers_lock) -{ - if (servers.empty()) - { - return 0; - } - - LOG_DEBUG(logger, "Waiting for current connections to close."); - - size_t current_connections = 0; - { - std::lock_guard lock(servers_lock); - for (auto & server : servers) - { - server.stop(); - current_connections += server.currentConnections(); - } - } - - if (current_connections) - LOG_WARNING(logger, "Closed all listening sockets. Waiting for {} outstanding connections.", current_connections); - else - LOG_INFO(logger, "Closed all listening sockets."); - - /// Wait for unfinished backups and restores. - /// This must be done after closing listening sockets (no more backups/restores) but before ProcessList::killAllQueries - /// (because killAllQueries() will cancel all running backups/restores). - if (server_settings.shutdown_wait_backups_and_restores) - global_context->waitAllBackupsAndRestores(); - /// Killing remaining queries. - if (!server_settings.shutdown_wait_unfinished_queries) - global_context->getProcessList().killAllQueries(); - - if (current_connections) - current_connections = waitServersToFinish(servers, servers_lock, server_settings.shutdown_wait_unfinished); - - if (current_connections) - LOG_WARNING( - logger, - "Closed connections. But {} remain." 
- " Tip: To increase wait time add to config: 60", - current_connections); - else - LOG_INFO(logger, "Closed connections."); - return current_connections; -} - -std::unique_ptr ProtocolServersManager::buildProtocolStackFromConfig( - const Poco::Util::AbstractConfiguration & config, - IServer & server, - const std::string & protocol, - Poco::Net::HTTPServerParams::Ptr http_params, - AsynchronousMetrics & async_metrics, - bool & is_secure) const -{ - auto create_factory = [&](const std::string & type, const std::string & conf_name) -> TCPServerConnectionFactory::Ptr - { - if (type == "tcp") - return TCPServerConnectionFactory::Ptr(new TCPHandlerFactory( - server, false, false, ProfileEvents::InterfaceNativeReceiveBytes, ProfileEvents::InterfaceNativeSendBytes)); - - if (type == "tls") -#if USE_SSL - return TCPServerConnectionFactory::Ptr(new TLSHandlerFactory(server, conf_name)); -#else - throw Exception( - ErrorCodes::SUPPORT_IS_DISABLED, - "SSL support for TCP protocol is disabled because Poco library was built without NetSSL support."); -#endif - - if (type == "proxy1") - return TCPServerConnectionFactory::Ptr(new ProxyV1HandlerFactory(server, conf_name)); - if (type == "mysql") - return TCPServerConnectionFactory::Ptr( - new MySQLHandlerFactory(server, ProfileEvents::InterfaceMySQLReceiveBytes, ProfileEvents::InterfaceMySQLSendBytes)); - if (type == "postgres") - return TCPServerConnectionFactory::Ptr(new PostgreSQLHandlerFactory( - server, ProfileEvents::InterfacePostgreSQLReceiveBytes, ProfileEvents::InterfacePostgreSQLSendBytes)); - if (type == "http") - return TCPServerConnectionFactory::Ptr(new HTTPServerConnectionFactory( - std::make_shared(global_context), - http_params, - createHandlerFactory(server, config, async_metrics, "HTTPHandler-factory"), - ProfileEvents::InterfaceHTTPReceiveBytes, - ProfileEvents::InterfaceHTTPSendBytes)); - if (type == "prometheus") - return TCPServerConnectionFactory::Ptr(new HTTPServerConnectionFactory( - std::make_shared(global_context), - http_params, - createHandlerFactory(server, config, async_metrics, "PrometheusHandler-factory"), - ProfileEvents::InterfacePrometheusReceiveBytes, - ProfileEvents::InterfacePrometheusSendBytes)); - if (type == "interserver") - return TCPServerConnectionFactory::Ptr(new HTTPServerConnectionFactory( - std::make_shared(global_context), - http_params, - createHandlerFactory(server, config, async_metrics, "InterserverIOHTTPHandler-factory"), - ProfileEvents::InterfaceInterserverReceiveBytes, - ProfileEvents::InterfaceInterserverSendBytes)); - - throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "Protocol configuration error, unknown protocol name '{}'", type); - }; - - std::string conf_name = "protocols." + protocol; - std::string prefix = conf_name + "."; - std::unordered_set pset{conf_name}; - - auto stack = std::make_unique(server, conf_name); - - while (true) - { - // if there is no "type" - it's a reference to another protocol and this is just an endpoint - if (config.has(prefix + "type")) - { - std::string type = config.getString(prefix + "type"); - if (type == "tls") - { - if (is_secure) - throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "Protocol '{}' contains more than one TLS layer", protocol); - is_secure = true; - } - - stack->append(create_factory(type, conf_name)); - } - - if (!config.has(prefix + "impl")) - break; - - conf_name = "protocols." 
+ config.getString(prefix + "impl"); - prefix = conf_name + "."; - - if (!pset.insert(conf_name).second) - throw Exception( - ErrorCodes::INVALID_CONFIG_PARAMETER, "Protocol '{}' configuration contains a loop on '{}'", protocol, conf_name); - } - - return stack; -} - -} diff --git a/src/Server/ServersManager/ProtocolServersManager.h b/src/Server/ServersManager/ProtocolServersManager.h deleted file mode 100644 index e9eaaeb2184..00000000000 --- a/src/Server/ServersManager/ProtocolServersManager.h +++ /dev/null @@ -1,37 +0,0 @@ -#pragma once - -#include -#include -#include - -namespace DB -{ - -class ProtocolServersManager : public IServersManager -{ -public: - using IServersManager::IServersManager; - - void createServers( - const Poco::Util::AbstractConfiguration & config, - IServer & server, - std::mutex & servers_lock, - Poco::ThreadPool & server_pool, - AsynchronousMetrics & async_metrics, - bool start_servers, - const ServerType & server_type) override; - - using IServersManager::stopServers; - size_t stopServers(const ServerSettings & server_settings, std::mutex & servers_lock) override; - -private: - std::unique_ptr buildProtocolStackFromConfig( - const Poco::Util::AbstractConfiguration & config, - IServer & server, - const std::string & protocol, - Poco::Net::HTTPServerParams::Ptr http_params, - AsynchronousMetrics & async_metrics, - bool & is_secure) const; -}; - -} From c6660c70b17b8e3c1e22192b825deeb5f9f2120b Mon Sep 17 00:00:00 2001 From: Blargian Date: Mon, 27 May 2024 10:27:50 +0200 Subject: [PATCH 381/392] Add missing reinterpret functions to documentation --- .../functions/type-conversion-functions.md | 617 +++++++++++++++++- 1 file changed, 611 insertions(+), 6 deletions(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 1030d92c76b..2360cecb9a5 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -996,12 +996,585 @@ Result: └─────────────────────────────────────────────┘ ``` -## reinterpretAsUInt(8\|16\|32\|64) +## reinterpretAsUInt8 -## reinterpretAsInt(8\|16\|32\|64) +Performs byte reinterpretation similar to [reinterpret_cast](https://en.cppreference.com/w/cpp/language/reinterpret_cast) to type UInt8. -## reinterpretAsFloat* +**Syntax** +```sql +reinterpretAsUInt8(x) +``` + +**Parameters** + +- `x`: value to byte reinterpret as UInt8. + +:::note +Accepts types that can be interpreted as numeric such as [(U)Int*](../data-types/int-uint.md), [Float](../data-types/float.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), [UUID](../data-types/uuid.md). Accepts [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). +::: + +**Returned value** + +- Reinterpreted value `x` as UInt8. [UInt8](../data-types/int-uint.md/#uint8-uint16-uint32-uint64-uint128-uint256-int8-int16-int32-int64-int128-int256). + +**Example** + +Query: + +```sql +SELECT + toInt8(257) AS x, + toTypeName(x), + reinterpretAsUInt8(x) AS res, + toTypeName(res); +``` + +Result: + +```response +┌─x─┬─toTypeName(x)─┬─res─┬─toTypeName(res)─┐ +│ 1 │ Int8 │ 1 │ UInt8 │ +└───┴───────────────┴─────┴─────────────────┘ +``` + +## reinterpretAsUInt16 + +Performs byte reinterpretation similar to [reinterpret_cast](https://en.cppreference.com/w/cpp/language/reinterpret_cast) to type UInt16. 
+ +**Syntax** + +```sql +reinterpretAsUInt16(x) +``` + +**Parameters** + +- `x`: value to byte reinterpret as UInt16. + +:::note +Accepts types that can be interpreted as numeric such as [(U)Int*](../data-types/int-uint.md), [Float](../data-types/float.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), [UUID](../data-types/uuid.md). Accepts [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). +::: + +**Returned value** + +- Reinterpreted value `x` as UInt16. [UInt16](../data-types/int-uint.md/#uint8-uint16-uint32-uint64-uint128-uint256-int8-int16-int32-int64-int128-int256). + +**Example** + +Query: + +```sql +SELECT + toUInt8(257) AS x, + toTypeName(x), + reinterpretAsUInt16(x) AS res, + toTypeName(res); +``` + +Result: + +```response +┌─x─┬─toTypeName(x)─┬─res─┬─toTypeName(res)─┐ +│ 1 │ UInt8 │ 1 │ UInt16 │ +└───┴───────────────┴─────┴─────────────────┘ +``` + +## reinterpretAsUInt32 + +Performs byte reinterpretation similar to [reinterpret_cast](https://en.cppreference.com/w/cpp/language/reinterpret_cast) to type UInt32. + +**Syntax** + +```sql +reinterpretAsUInt32(x) +``` + +**Parameters** + +- `x`: value to byte reinterpret as UInt32. + +:::note +Accepts types that can be interpreted as numeric such as [(U)Int*](../data-types/int-uint.md), [Float](../data-types/float.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), [UUID](../data-types/uuid.md). Accepts [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). +::: + +**Returned value** + +- Reinterpreted value `x` as UInt32. [UInt32](../data-types/int-uint.md/#uint8-uint16-uint32-uint64-uint128-uint256-int8-int16-int32-int64-int128-int256). + +**Example** + +Query: + +```sql +SELECT + toUInt16(257) AS x, + toTypeName(x), + reinterpretAsUInt32(x) AS res, + toTypeName(res) +``` + +Result: + +```response +┌───x─┬─toTypeName(x)─┬─res─┬─toTypeName(res)─┐ +│ 257 │ UInt16 │ 257 │ UInt32 │ +└─────┴───────────────┴─────┴─────────────────┘ +``` + +## reinterpretAsUInt64 + +Performs byte reinterpretation similar to [reinterpret_cast](https://en.cppreference.com/w/cpp/language/reinterpret_cast) to type UInt64. + +**Syntax** + +```sql +reinterpretAsUInt64(x) +``` + +**Parameters** + +- `x`: value to byte reinterpret as UInt64. + +:::note +Accepts types that can be interpreted as numeric such as [(U)Int*](../data-types/int-uint.md), [Float](../data-types/float.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), [UUID](../data-types/uuid.md). Accepts [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). +::: + +**Returned value** + +- Reinterpreted value `x` as UInt64. [UInt64](../data-types/int-uint.md/#uint8-uint16-uint32-uint64-uint128-uint256-int8-int16-int32-int64-int128-int256). + +**Example** + +Query: + +```sql +SELECT + toUInt32(257) AS x, + toTypeName(x), + reinterpretAsUInt64(x) AS res, + toTypeName(res) +``` + +Result: + +```response +┌───x─┬─toTypeName(x)─┬─res─┬─toTypeName(res)─┐ +│ 257 │ UInt32 │ 257 │ UInt64 │ +└─────┴───────────────┴─────┴─────────────────┘ +``` + +## reinterpretAsUInt128 + +Performs byte reinterpretation similar to [reinterpret_cast](https://en.cppreference.com/w/cpp/language/reinterpret_cast) to type UInt128. + +**Syntax** + +```sql +reinterpretAsUInt128(x) +``` + +**Parameters** + +- `x`: value to byte reinterpret as UInt128.
+ +:::note +Accepts types that can be interpreted as numeric such as [(U)Int*](../data-types/int-uint.md), [Float](../data-types/float.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), [UUID](../data-types/uuid.md). Accepts [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). +::: + +**Returned value** + +- Reinterpreted value `x` as UInt128. [UInt128](../data-types/int-uint.md/#uint8-uint16-uint32-uint64-uint128-uint256-int8-int16-int32-int64-int128-int256). + +**Example** + +Query: + +```sql +SELECT + toUInt64(257) AS x, + toTypeName(x), + reinterpretAsUInt128(x) AS res, + toTypeName(res) +``` + +Result: + +```response +┌───x─┬─toTypeName(x)─┬─res─┬─toTypeName(res)─┐ +│ 257 │ UInt64 │ 257 │ UInt128 │ +└─────┴───────────────┴─────┴─────────────────┘ +``` + +## reinterpretAsUInt256 + +Performs byte reinterpretation similar to [reinterpret_cast](https://en.cppreference.com/w/cpp/language/reinterpret_cast) to type UInt256. + +**Syntax** + +```sql +reinterpretAsUInt256(x) +``` + +**Parameters** + +- `x`: value to byte reinterpret as UInt256. + +:::note +Accepts types that can be interpreted as numeric such as [(U)Int*](../data-types/int-uint.md), [Float](../data-types/float.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), [UUID](../data-types/uuid.md). Accepts [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). +::: + +**Returned value** + +- Reinterpreted value `x` as UInt256. [UInt256](../data-types/int-uint.md/#uint8-uint16-uint32-uint64-uint128-uint256-int8-int16-int32-int64-int128-int256). + +**Example** + +Query: + +```sql +SELECT + toUInt128(257) AS x, + toTypeName(x), + reinterpretAsUInt256(x) AS res, + toTypeName(res) +``` + +Result: + +```response +┌───x─┬─toTypeName(x)─┬─res─┬─toTypeName(res)─┐ +│ 257 │ UInt128 │ 257 │ UInt256 │ +└─────┴───────────────┴─────┴─────────────────┘ +``` + +## reinterpretAsInt8 + +Performs byte reinterpretation similar to [reinterpret_cast](https://en.cppreference.com/w/cpp/language/reinterpret_cast) to type Int8. + +**Syntax** + +```sql +reinterpretAsInt8(x) +``` + +**Parameters** + +- `x`: value to byte reinterpret as Int8. + +:::note +Accepts types that can be interpreted as numeric such as [(U)Int*](../data-types/int-uint.md), [Float](../data-types/float.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), [UUID](../data-types/uuid.md). Accepts [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). +::: + +**Returned value** + +- Reinterpreted value `x` as Int8. [Int8](../data-types/int-uint.md/#int-ranges). + +**Example** + +Query: + +```sql +SELECT + toUInt8(257) AS x, + toTypeName(x), + reinterpretAsInt8(x) AS res, + toTypeName(res); +``` + +Result: + +```response +┌─x─┬─toTypeName(x)─┬─res─┬─toTypeName(res)─┐ +│ 1 │ UInt8 │ 1 │ Int8 │ +└───┴───────────────┴─────┴─────────────────┘ +``` + +## reinterpretAsInt16 + +Performs byte reinterpretation similar to [reinterpret_cast](https://en.cppreference.com/w/cpp/language/reinterpret_cast) to type Int16. + +**Syntax** + +```sql +reinterpretAsInt16(x) +``` + +**Parameters** + +- `x`: value to byte reinterpret as Int16. + +:::note +Accepts types that can be interpreted as numeric such as [(U)Int*](../data-types/int-uint.md), [Float](../data-types/float.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), [UUID](../data-types/uuid.md). Accepts [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). 
+::: + +**Returned value** + +- Reinterpreted value `x` as Int16. [Int16](../data-types/int-uint.md/#int-ranges). + +**Example** + +Query: + +```sql +SELECT + toInt8(257) AS x, + toTypeName(x), + reinterpretAsInt16(x) AS res, + toTypeName(res); +``` + +Result: + +```response +┌─x─┬─toTypeName(x)─┬─res─┬─toTypeName(res)─┐ +│ 1 │ Int8 │ 1 │ Int16 │ +└───┴───────────────┴─────┴─────────────────┘ +``` + +## reinterpretAsInt32 + +Performs byte reinterpretation similar to [reinterpret_cast](https://en.cppreference.com/w/cpp/language/reinterpret_cast) to type Int32. + +**Syntax** + +```sql +reinterpretAsInt32(x) +``` + +**Parameters** + +- `x`: value to byte reinterpret as Int32. + +:::note +Accepts types that can be interpreted as numeric such as [(U)Int*](../data-types/int-uint.md), [Float](../data-types/float.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), [UUID](../data-types/uuid.md). Accepts [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). +::: + +**Returned value** + +- Reinterpreted value `x` as Int32. [Int32](../data-types/int-uint.md/#int-ranges). + +**Example** + +Query: + +```sql +SELECT + toInt16(257) AS x, + toTypeName(x), + reinterpretAsInt32(x) AS res, + toTypeName(res); +``` + +Result: + +```response +┌───x─┬─toTypeName(x)─┬─res─┬─toTypeName(res)─┐ +│ 257 │ Int16 │ 257 │ Int32 │ +└─────┴───────────────┴─────┴─────────────────┘ +``` + +## reinterpretAsInt64 + +Performs byte reinterpretation similar to [reinterpret_cast](https://en.cppreference.com/w/cpp/language/reinterpret_cast) to type Int64. + +**Syntax** + +```sql +reinterpretAsInt64(x) +``` + +**Parameters** + +- `x`: value to byte reinterpret as Int64. + +:::note +Accepts types that can be interpreted as numeric such as [(U)Int*](../data-types/int-uint.md), [Float](../data-types/float.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), [UUID](../data-types/uuid.md). Accepts [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). +::: + +**Returned value** + +- Reinterpreted value `x` as Int64. [Int64](../data-types/int-uint.md/#int-ranges). + +**Example** + +Query: + +```sql +SELECT + toInt32(257) AS x, + toTypeName(x), + reinterpretAsInt64(x) AS res, + toTypeName(res); +``` + +Result: + +```response +┌───x─┬─toTypeName(x)─┬─res─┬─toTypeName(res)─┐ +│ 257 │ Int32 │ 257 │ Int64 │ +└─────┴───────────────┴─────┴─────────────────┘ +``` + +## reinterpretAsInt128 + +Performs byte reinterpretation similar to [reinterpret_cast](https://en.cppreference.com/w/cpp/language/reinterpret_cast) to type Int128. + +**Syntax** + +```sql +reinterpretAsInt128(x) +``` + +**Parameters** + +- `x`: value to byte reinterpret as Int128. + +:::note +Accepts types that can be interpreted as numeric such as [(U)Int*](../data-types/int-uint.md), [Float](../data-types/float.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), [UUID](../data-types/uuid.md). Accepts [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). +::: + +**Returned value** + +- Reinterpreted value `x` as Int128. [Int128](../data-types/int-uint.md/#int-ranges). 
+ +**Example** + +Query: + +```sql +SELECT + toInt64(257) AS x, + toTypeName(x), + reinterpretAsInt128(x) AS res, + toTypeName(res); +``` + +Result: + +```response +┌───x─┬─toTypeName(x)─┬─res─┬─toTypeName(res)─┐ +│ 257 │ Int64 │ 257 │ Int128 │ +└─────┴───────────────┴─────┴─────────────────┘ +``` + +## reinterpretAsInt256 + +Performs byte reinterpretation similar to [reinterpret_cast](https://en.cppreference.com/w/cpp/language/reinterpret_cast) to type Int256. + +**Syntax** + +```sql +reinterpretAsInt256(x) +``` + +**Parameters** + +- `x`: value to byte reinterpret as Int256. + +:::note +Accepts types that can be interpreted as numeric such as [(U)Int*](../data-types/int-uint.md), [Float](../data-types/float.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), [UUID](../data-types/uuid.md). Accepts [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). +::: + +**Returned value** + +- Reinterpreted value `x` as Int256. [Int256](../data-types/int-uint.md/#int-ranges). + +**Example** + +Query: + +```sql +SELECT + toInt128(257) AS x, + toTypeName(x), + reinterpretAsInt256(x) AS res, + toTypeName(res); +``` + +Result: + +```response +┌───x─┬─toTypeName(x)─┬─res─┬─toTypeName(res)─┐ +│ 257 │ Int128 │ 257 │ Int256 │ +└─────┴───────────────┴─────┴─────────────────┘ +``` + +## reinterpretAsFloat32 + +Performs byte reinterpretation similar to [reinterpret_cast](https://en.cppreference.com/w/cpp/language/reinterpret_cast) to type Float32. + +**Syntax** + +```sql +reinterpretAsFloat32(x) +``` + +**Parameters** + +- `x`: value to reinterpret as Float32. + +:::note +Accepts types that can be interpreted as numeric such as [(U)Int*](../data-types/int-uint.md), [Float](../data-types/float.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), [UUID](../data-types/uuid.md). Accepts [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). +::: + +**Returned value** + +- Reinterpreted value `x` as Float32. [Float32](../data-types/float.md). + +**Example** + +Query: + +```sql +SELECT reinterpretAsUInt32(toFloat32(0.2)) as x, reinterpretAsFloat32(x); +``` + +Result: + +```response +┌──────────x─┬─reinterpretAsFloat32(x)─┐ +│ 1045220557 │ 0.2 │ +└────────────┴─────────────────────────┘ +``` + +## reinterpretAsFloat64 + +Performs byte reinterpretation similar to [reinterpret_cast](https://en.cppreference.com/w/cpp/language/reinterpret_cast) to type Float64. + +**Syntax** + +```sql +reinterpretAsFloat64(x) +``` + +**Parameters** + +- `x`: value to reinterpret as Float64. + +:::note +Accepts types that can be interpreted as numeric such as [(U)Int*](../data-types/int-uint.md), [Float](../data-types/float.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), [UUID](../data-types/uuid.md). Accepts [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). +::: + +**Returned value** + +- Reinterpreted value `x` as Float64. [Float64](../data-types/float.md). + +**Example** + +Query: + +```sql +SELECT reinterpretAsUInt64(toFloat64(0.2)) as x, reinterpretAsFloat64(x); +``` + +Result: + +```response +┌───────────────────x─┬─reinterpretAsFloat64(x)─┐ +│ 4596373779694328218 │ 0.2 │ +└─────────────────────┴─────────────────────────┘ +``` ## reinterpretAsDate @@ -1093,11 +1666,43 @@ Result: ## reinterpretAsString -This function accepts a number or date or date with time and returns a string containing bytes representing the corresponding value in host order (little endian). 
Null bytes are dropped from the end. For example, a UInt32 type value of 255 is a string that is one byte long.
+This function accepts a number, date or date with time and returns a string containing bytes representing the corresponding value in host order (little endian). Null bytes are dropped from the end. For example, a UInt32 type value of 255 is a string that is one byte long.
+
+**Syntax**
+
+```sql
+reinterpretAsString(x)
+```
+
+**Parameters**
+
+- `x`: value to reinterpret as String. [(U)Int*](../data-types/int-uint.md), [Float](../data-types/float.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md).
+
+**Returned value**
+
+- String containing bytes representing `x`. [String](../data-types/string.md).
+
+**Example**
+
+Query:
+
+```sql
+SELECT
+ reinterpretAsString(toDateTime('1970-01-01 01:01:05')),
+ reinterpretAsString(toDate('1970-03-07'));
+```
+
+Result:
+
+```response
+┌─reinterpretAsString(toDateTime('1970-01-01 01:01:05'))─┬─reinterpretAsString(toDate('1970-03-07'))─┐
+│ A │ A │
+└────────────────────────────────────────────────────────┴───────────────────────────────────────────┘
+```
 
 ## reinterpretAsFixedString
 
-This function accepts a number or date or date with time and returns a FixedString containing bytes representing the corresponding value in host order (little endian). Null bytes are dropped from the end. For example, a UInt32 type value of 255 is a FixedString that is one byte long.
+This function accepts a number, date or date with time and returns a FixedString containing bytes representing the corresponding value in host order (little endian). Null bytes are dropped from the end. For example, a UInt32 type value of 255 is a FixedString that is one byte long.
 
 **Syntax**
 
@@ -1137,7 +1742,7 @@ Result:
 In addition to the UUID functions listed here, there is dedicated [UUID function documentation](/docs/en/sql-reference/functions/uuid-functions.md).
 :::
 
-Accepts 16 bytes string and returns UUID containing bytes representing the corresponding value in network byte order (big-endian). If the string isn't long enough, the function works as if the string is padded with the necessary number of null bytes to the end. If the string is longer than 16 bytes, the extra bytes at the end are ignored.
+Accepts a 16 byte string and returns a UUID containing bytes representing the corresponding value in network byte order (big-endian). If the string isn't long enough, the function works as if the string is padded with the necessary number of null bytes to the end. If the string is longer than 16 bytes, the extra bytes at the end are ignored. 
**Syntax** From 8b551cc832a765296213ce462a5472d589b1955d Mon Sep 17 00:00:00 2001 From: Blargian Date: Mon, 27 May 2024 10:30:18 +0200 Subject: [PATCH 382/392] Remove unneeded test file - one already exists --- .../03156_reinterpret_functions.sql | 36 ------------------- 1 file changed, 36 deletions(-) delete mode 100644 tests/queries/0_stateless/03156_reinterpret_functions.sql diff --git a/tests/queries/0_stateless/03156_reinterpret_functions.sql b/tests/queries/0_stateless/03156_reinterpret_functions.sql deleted file mode 100644 index 4acaaf47cef..00000000000 --- a/tests/queries/0_stateless/03156_reinterpret_functions.sql +++ /dev/null @@ -1,36 +0,0 @@ --- Date and DateTime - -SELECT reinterpretAsDate(); -- { clientError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } -SELECT reinterpretAsDate('A',''); -- { clientError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } -SELECT reinterpretAsDate([0, 1, 2]); -- { clientError ILLEGAL_TYPE_OF_ARGUMENT} -SELECT reinterpretAsDateTime(); -- { clientError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } -SELECT reinterpretAsDateTime('A',''); -- { clientError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } -SELECT reinterpretAsDateTime([0, 1, 2]); -- { clientError ILLEGAL_TYPE_OF_ARGUMENT} - -SELECT reinterpretAsDate(65); -SELECT reinterpretAsDate('A'); -SELECT reinterpretAsDateTime(65); -SELECT reinterpretAsDate('A'); - --- Fixed String - -SELECT reinterpretAsFixedString(); -- { clientError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } -SELECT reinterpretAsFixedString(toDate('1970-01-01'),''); -- { clientError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } -SELECT reinterpretAsFixedString([0, 1, 2]); -- { clientError ILLEGAL_TYPE_OF_ARGUMENT} - -SELECT reinterpretAsFixedString(toDate('1970-03-07')); -SELECT reinterpretAsFixedString(toDateTime('1970-01-01 01:01:05')); -SELECT reinterpretAsFixedString(65); - --- Float32, Float64 - -SELECT reinterpretAsFloat32(); -- { clientError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } -SELECT reinterpretAsFloat64(); -- { clientError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } -SELECT reinterpretAsFloat32('1970-01-01', ''); -- { clientError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } -SELECT reinterpretAsFloat64('1970-01-01', ''); -- { clientError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } -SELECT reinterpretAsFloat32([0, 1, 2]); -- { clientError ILLEGAL_TYPE_OF_ARGUMENT} -SELECT reinterpretAsFloat64([0, 1, 2]); -- { clientError4 ILLEGAL_TYPE_OF_ARGUMENT} - - - - From 5a868304c04755bb62b30c45e408b65a3e78dcd0 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Mon, 27 May 2024 11:38:22 +0200 Subject: [PATCH 383/392] Revert "Remove some unnecessary `UNREACHABLE`s" --- programs/keeper-client/Commands.cpp | 3 +-- programs/main.cpp | 2 +- src/Access/AccessEntityIO.cpp | 3 ++- src/Access/AccessRights.cpp | 2 +- src/Access/IAccessStorage.cpp | 9 ++++++--- .../AggregateFunctionGroupArray.cpp | 13 +++++++------ .../AggregateFunctionSequenceNextNode.cpp | 1 + src/AggregateFunctions/AggregateFunctionSum.h | 1 + src/Common/DateLUTImpl.cpp | 1 + src/Common/IntervalKind.cpp | 10 ++++++++++ src/Common/TargetSpecific.cpp | 2 ++ src/Common/ThreadProfileEvents.cpp | 1 + src/Common/ZooKeeper/IKeeper.cpp | 2 ++ src/Compression/CompressionCodecDeflateQpl.cpp | 1 + src/Compression/CompressionCodecDoubleDelta.cpp | 10 +++------- src/Coordination/KeeperReconfiguration.cpp | 8 +------- src/Coordination/KeeperServer.cpp | 2 +- src/Core/Field.h | 2 ++ src/DataTypes/Serializations/ISerialization.cpp | 1 + src/Disks/IO/CachedOnDiskReadBufferFromFile.h | 1 + .../MetadataStorageTransactionState.cpp | 1 + src/Disks/VolumeJBOD.cpp | 2 ++ 
src/Formats/EscapingRuleUtils.cpp | 1 + src/Functions/FunctionsRound.h | 8 ++++++++ src/Functions/FunctionsTimeWindow.cpp | 2 ++ src/Functions/PolygonUtils.h | 2 ++ .../UserDefinedSQLObjectsZooKeeperStorage.cpp | 1 + src/IO/CompressionMethod.cpp | 1 + src/IO/HadoopSnappyReadBuffer.h | 1 + src/Interpreters/AggregatedDataVariants.cpp | 8 ++++++++ src/Interpreters/Cache/FileSegment.cpp | 1 + src/Interpreters/ComparisonGraph.cpp | 1 + src/Interpreters/FilesystemCacheLog.cpp | 1 + src/Interpreters/HashJoin.cpp | 3 +++ src/Interpreters/HashJoin.h | 6 ++++++ .../InterpreterTransactionControlQuery.cpp | 1 + src/Interpreters/SetVariants.cpp | 4 ++++ src/Parsers/ASTExplainQuery.h | 2 ++ src/Parsers/Lexer.cpp | 4 +++- .../Formats/Impl/MsgPackRowInputFormat.cpp | 1 + src/Processors/IProcessor.cpp | 2 ++ src/Processors/QueryPlan/ReadFromMergeTree.cpp | 6 ++++++ src/Processors/QueryPlan/TotalsHavingStep.cpp | 2 ++ src/Processors/Transforms/FillingTransform.cpp | 1 + .../Transforms/buildPushingToViewsChain.cpp | 2 ++ src/Storages/MergeTree/BackgroundJobsAssignee.cpp | 1 + src/Storages/MergeTree/KeyCondition.cpp | 2 ++ src/Storages/MergeTree/MergeTreeData.cpp | 2 ++ src/Storages/MergeTree/MergeTreeDataWriter.cpp | 2 ++ .../PartMovesBetweenShardsOrchestrator.cpp | 2 ++ src/Storages/WindowView/StorageWindowView.cpp | 3 +++ 51 files changed, 121 insertions(+), 30 deletions(-) diff --git a/programs/keeper-client/Commands.cpp b/programs/keeper-client/Commands.cpp index 860840a2d06..a109912e6e0 100644 --- a/programs/keeper-client/Commands.cpp +++ b/programs/keeper-client/Commands.cpp @@ -10,7 +10,6 @@ namespace DB namespace ErrorCodes { - extern const int LOGICAL_ERROR; extern const int KEEPER_EXCEPTION; } @@ -442,7 +441,7 @@ void ReconfigCommand::execute(const DB::ASTKeeperQuery * query, DB::KeeperClient new_members = query->args[1].safeGet(); break; default: - throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected operation: {}", operation); + UNREACHABLE(); } auto response = client->zookeeper->reconfig(joining, leaving, new_members); diff --git a/programs/main.cpp b/programs/main.cpp index c270388f17f..bc8476e4ce4 100644 --- a/programs/main.cpp +++ b/programs/main.cpp @@ -155,8 +155,8 @@ auto instructionFailToString(InstructionFail fail) ret("AVX2"); case InstructionFail::AVX512: ret("AVX512"); -#undef ret } + UNREACHABLE(); } diff --git a/src/Access/AccessEntityIO.cpp b/src/Access/AccessEntityIO.cpp index 1b073329296..b0dfd74c53b 100644 --- a/src/Access/AccessEntityIO.cpp +++ b/src/Access/AccessEntityIO.cpp @@ -144,7 +144,8 @@ AccessEntityPtr deserializeAccessEntity(const String & definition, const String catch (Exception & e) { e.addMessage("Could not parse " + file_path); - throw; + e.rethrow(); + UNREACHABLE(); } } diff --git a/src/Access/AccessRights.cpp b/src/Access/AccessRights.cpp index 2127f4ada70..c10931f554c 100644 --- a/src/Access/AccessRights.cpp +++ b/src/Access/AccessRights.cpp @@ -258,7 +258,7 @@ namespace case TABLE_LEVEL: return AccessFlags::allFlagsGrantableOnTableLevel(); case COLUMN_LEVEL: return AccessFlags::allFlagsGrantableOnColumnLevel(); } - chassert(false); + UNREACHABLE(); } } diff --git a/src/Access/IAccessStorage.cpp b/src/Access/IAccessStorage.cpp index 8d4e7d3073e..8e51481e415 100644 --- a/src/Access/IAccessStorage.cpp +++ b/src/Access/IAccessStorage.cpp @@ -257,7 +257,8 @@ std::vector IAccessStorage::insert(const std::vector & mu } e.addMessage("After successfully inserting {}/{}: {}", successfully_inserted.size(), multiple_entities.size(), successfully_inserted_str); } - 
throw; + e.rethrow(); + UNREACHABLE(); } } @@ -360,7 +361,8 @@ std::vector IAccessStorage::remove(const std::vector & ids, bool thr } e.addMessage("After successfully removing {}/{}: {}", removed_names.size(), ids.size(), removed_names_str); } - throw; + e.rethrow(); + UNREACHABLE(); } } @@ -456,7 +458,8 @@ std::vector IAccessStorage::update(const std::vector & ids, const Up } e.addMessage("After successfully updating {}/{}: {}", names_of_updated.size(), ids.size(), names_of_updated_str); } - throw; + e.rethrow(); + UNREACHABLE(); } } diff --git a/src/AggregateFunctions/AggregateFunctionGroupArray.cpp b/src/AggregateFunctions/AggregateFunctionGroupArray.cpp index 930b2c6ce73..d4fb7afcb78 100644 --- a/src/AggregateFunctions/AggregateFunctionGroupArray.cpp +++ b/src/AggregateFunctions/AggregateFunctionGroupArray.cpp @@ -60,13 +60,14 @@ struct GroupArrayTrait template constexpr const char * getNameByTrait() { - if constexpr (Trait::last) + if (Trait::last) return "groupArrayLast"; - switch (Trait::sampler) - { - case Sampler::NONE: return "groupArray"; - case Sampler::RNG: return "groupArraySample"; - } + if (Trait::sampler == Sampler::NONE) + return "groupArray"; + else if (Trait::sampler == Sampler::RNG) + return "groupArraySample"; + + UNREACHABLE(); } template diff --git a/src/AggregateFunctions/AggregateFunctionSequenceNextNode.cpp b/src/AggregateFunctions/AggregateFunctionSequenceNextNode.cpp index a9dd53a75e8..bed10333af0 100644 --- a/src/AggregateFunctions/AggregateFunctionSequenceNextNode.cpp +++ b/src/AggregateFunctions/AggregateFunctionSequenceNextNode.cpp @@ -414,6 +414,7 @@ public: break; return (i == events_size) ? base - i : unmatched_idx; } + UNREACHABLE(); } void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override diff --git a/src/AggregateFunctions/AggregateFunctionSum.h b/src/AggregateFunctions/AggregateFunctionSum.h index 2ce03c530c2..58aaddf357a 100644 --- a/src/AggregateFunctions/AggregateFunctionSum.h +++ b/src/AggregateFunctions/AggregateFunctionSum.h @@ -463,6 +463,7 @@ public: return "sumWithOverflow"; else if constexpr (Type == AggregateFunctionTypeSumKahan) return "sumKahan"; + UNREACHABLE(); } explicit AggregateFunctionSum(const DataTypes & argument_types_) diff --git a/src/Common/DateLUTImpl.cpp b/src/Common/DateLUTImpl.cpp index c87d44a4b95..392ee64dcbf 100644 --- a/src/Common/DateLUTImpl.cpp +++ b/src/Common/DateLUTImpl.cpp @@ -41,6 +41,7 @@ UInt8 getDayOfWeek(const cctz::civil_day & date) case cctz::weekday::saturday: return 6; case cctz::weekday::sunday: return 7; } + UNREACHABLE(); } inline cctz::time_point lookupTz(const cctz::time_zone & cctz_time_zone, const cctz::civil_day & date) diff --git a/src/Common/IntervalKind.cpp b/src/Common/IntervalKind.cpp index 1548d5cf9a5..22c7db504c3 100644 --- a/src/Common/IntervalKind.cpp +++ b/src/Common/IntervalKind.cpp @@ -34,6 +34,8 @@ Int64 IntervalKind::toAvgNanoseconds() const default: return toAvgSeconds() * NANOSECONDS_PER_SECOND; } + + UNREACHABLE(); } Int32 IntervalKind::toAvgSeconds() const @@ -52,6 +54,7 @@ Int32 IntervalKind::toAvgSeconds() const case IntervalKind::Kind::Quarter: return 7889238; /// Exactly 1/4 of a year. 
case IntervalKind::Kind::Year: return 31556952; /// The average length of a Gregorian year is equal to 365.2425 days } + UNREACHABLE(); } Float64 IntervalKind::toSeconds() const @@ -77,6 +80,7 @@ Float64 IntervalKind::toSeconds() const default: throw Exception(ErrorCodes::BAD_ARGUMENTS, "Not possible to get precise number of seconds in non-precise interval"); } + UNREACHABLE(); } bool IntervalKind::isFixedLength() const @@ -95,6 +99,7 @@ bool IntervalKind::isFixedLength() const case IntervalKind::Kind::Quarter: case IntervalKind::Kind::Year: return false; } + UNREACHABLE(); } IntervalKind IntervalKind::fromAvgSeconds(Int64 num_seconds) @@ -136,6 +141,7 @@ const char * IntervalKind::toKeyword() const case IntervalKind::Kind::Quarter: return "QUARTER"; case IntervalKind::Kind::Year: return "YEAR"; } + UNREACHABLE(); } @@ -155,6 +161,7 @@ const char * IntervalKind::toLowercasedKeyword() const case IntervalKind::Kind::Quarter: return "quarter"; case IntervalKind::Kind::Year: return "year"; } + UNREACHABLE(); } @@ -185,6 +192,7 @@ const char * IntervalKind::toDateDiffUnit() const case IntervalKind::Kind::Year: return "year"; } + UNREACHABLE(); } @@ -215,6 +223,7 @@ const char * IntervalKind::toNameOfFunctionToIntervalDataType() const case IntervalKind::Kind::Year: return "toIntervalYear"; } + UNREACHABLE(); } @@ -248,6 +257,7 @@ const char * IntervalKind::toNameOfFunctionExtractTimePart() const case IntervalKind::Kind::Year: return "toYear"; } + UNREACHABLE(); } diff --git a/src/Common/TargetSpecific.cpp b/src/Common/TargetSpecific.cpp index 8540c9a9986..49f396c0926 100644 --- a/src/Common/TargetSpecific.cpp +++ b/src/Common/TargetSpecific.cpp @@ -54,6 +54,8 @@ String toString(TargetArch arch) case TargetArch::AMXTILE: return "amxtile"; case TargetArch::AMXINT8: return "amxint8"; } + + UNREACHABLE(); } } diff --git a/src/Common/ThreadProfileEvents.cpp b/src/Common/ThreadProfileEvents.cpp index 23b41f23bde..6a63d484cd9 100644 --- a/src/Common/ThreadProfileEvents.cpp +++ b/src/Common/ThreadProfileEvents.cpp @@ -75,6 +75,7 @@ const char * TasksStatsCounters::metricsProviderString(MetricsProvider provider) case MetricsProvider::Netlink: return "netlink"; } + UNREACHABLE(); } bool TasksStatsCounters::checkIfAvailable() diff --git a/src/Common/ZooKeeper/IKeeper.cpp b/src/Common/ZooKeeper/IKeeper.cpp index 7cca262baca..7d2602bde1e 100644 --- a/src/Common/ZooKeeper/IKeeper.cpp +++ b/src/Common/ZooKeeper/IKeeper.cpp @@ -146,6 +146,8 @@ const char * errorMessage(Error code) case Error::ZSESSIONMOVED: return "Session moved to another server, so operation is ignored"; case Error::ZNOTREADONLY: return "State-changing request is passed to read-only server"; } + + UNREACHABLE(); } bool isHardwareError(Error zk_return_code) diff --git a/src/Compression/CompressionCodecDeflateQpl.cpp b/src/Compression/CompressionCodecDeflateQpl.cpp index f1b5b24e866..7e0653c69f8 100644 --- a/src/Compression/CompressionCodecDeflateQpl.cpp +++ b/src/Compression/CompressionCodecDeflateQpl.cpp @@ -466,6 +466,7 @@ void CompressionCodecDeflateQpl::doDecompressData(const char * source, UInt32 so sw_codec->doDecompressData(source, source_size, dest, uncompressed_size); return; } + UNREACHABLE(); } void CompressionCodecDeflateQpl::flushAsynchronousDecompressRequests() diff --git a/src/Compression/CompressionCodecDoubleDelta.cpp b/src/Compression/CompressionCodecDoubleDelta.cpp index cbd8cd57a62..e6e8db4c699 100644 --- a/src/Compression/CompressionCodecDoubleDelta.cpp +++ b/src/Compression/CompressionCodecDoubleDelta.cpp @@ -21,11 +21,6 
@@ namespace DB { -namespace ErrorCodes -{ - extern const int BAD_ARGUMENTS; -} - /** NOTE DoubleDelta is surprisingly bad name. The only excuse is that it comes from an academic paper. * Most people will think that "double delta" is just applying delta transform twice. * But in fact it is something more than applying delta transform twice. @@ -147,9 +142,9 @@ namespace ErrorCodes { extern const int CANNOT_COMPRESS; extern const int CANNOT_DECOMPRESS; + extern const int BAD_ARGUMENTS; extern const int ILLEGAL_SYNTAX_FOR_CODEC_TYPE; extern const int ILLEGAL_CODEC_PARAMETER; - extern const int LOGICAL_ERROR; } namespace @@ -168,8 +163,9 @@ inline Int64 getMaxValueForByteSize(Int8 byte_size) case sizeof(UInt64): return std::numeric_limits::max(); default: - throw Exception(ErrorCodes::LOGICAL_ERROR, "only 1, 2, 4 and 8 data sizes are supported"); + assert(false && "only 1, 2, 4 and 8 data sizes are supported"); } + UNREACHABLE(); } struct WriteSpec diff --git a/src/Coordination/KeeperReconfiguration.cpp b/src/Coordination/KeeperReconfiguration.cpp index 05211af6704..e3642913a7a 100644 --- a/src/Coordination/KeeperReconfiguration.cpp +++ b/src/Coordination/KeeperReconfiguration.cpp @@ -5,12 +5,6 @@ namespace DB { - -namespace ErrorCodes -{ - extern const int LOGICAL_ERROR; -} - ClusterUpdateActions joiningToClusterUpdates(const ClusterConfigPtr & cfg, std::string_view joining) { ClusterUpdateActions out; @@ -85,7 +79,7 @@ String serializeClusterConfig(const ClusterConfigPtr & cfg, const ClusterUpdateA new_config.emplace_back(RaftServerConfig{*cfg->get_server(priority->id)}); } else - throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected update"); + UNREACHABLE(); } for (const auto & item : cfg->get_servers()) diff --git a/src/Coordination/KeeperServer.cpp b/src/Coordination/KeeperServer.cpp index 736a01443ce..8d21ce2ab01 100644 --- a/src/Coordination/KeeperServer.cpp +++ b/src/Coordination/KeeperServer.cpp @@ -990,7 +990,7 @@ KeeperServer::ConfigUpdateState KeeperServer::applyConfigUpdate( raft_instance->set_priority(update->id, update->priority, /*broadcast on live leader*/true); return Accepted; } - std::unreachable(); + UNREACHABLE(); } ClusterUpdateActions KeeperServer::getRaftConfigurationDiff(const Poco::Util::AbstractConfiguration & config) diff --git a/src/Core/Field.h b/src/Core/Field.h index 710614cd0a0..4424d669c4d 100644 --- a/src/Core/Field.h +++ b/src/Core/Field.h @@ -667,6 +667,8 @@ public: case Types::AggregateFunctionState: return f(field.template get()); case Types::CustomType: return f(field.template get()); } + + UNREACHABLE(); } String dump() const; diff --git a/src/DataTypes/Serializations/ISerialization.cpp b/src/DataTypes/Serializations/ISerialization.cpp index bbb1d1a6cd1..dbe27a5f3f6 100644 --- a/src/DataTypes/Serializations/ISerialization.cpp +++ b/src/DataTypes/Serializations/ISerialization.cpp @@ -36,6 +36,7 @@ String ISerialization::kindToString(Kind kind) case Kind::SPARSE: return "Sparse"; } + UNREACHABLE(); } ISerialization::Kind ISerialization::stringToKind(const String & str) diff --git a/src/Disks/IO/CachedOnDiskReadBufferFromFile.h b/src/Disks/IO/CachedOnDiskReadBufferFromFile.h index cb34f7932c3..3433698a162 100644 --- a/src/Disks/IO/CachedOnDiskReadBufferFromFile.h +++ b/src/Disks/IO/CachedOnDiskReadBufferFromFile.h @@ -140,6 +140,7 @@ private: case ReadType::REMOTE_FS_READ_AND_PUT_IN_CACHE: return "REMOTE_FS_READ_AND_PUT_IN_CACHE"; } + UNREACHABLE(); } size_t first_offset = 0; diff --git a/src/Disks/ObjectStorages/MetadataStorageTransactionState.cpp 
b/src/Disks/ObjectStorages/MetadataStorageTransactionState.cpp index a37f4ce7e65..245578b5d9e 100644 --- a/src/Disks/ObjectStorages/MetadataStorageTransactionState.cpp +++ b/src/Disks/ObjectStorages/MetadataStorageTransactionState.cpp @@ -17,6 +17,7 @@ std::string toString(MetadataStorageTransactionState state) case MetadataStorageTransactionState::PARTIALLY_ROLLED_BACK: return "PARTIALLY_ROLLED_BACK"; } + UNREACHABLE(); } } diff --git a/src/Disks/VolumeJBOD.cpp b/src/Disks/VolumeJBOD.cpp index f8b9a57affe..d0e9d32ff5e 100644 --- a/src/Disks/VolumeJBOD.cpp +++ b/src/Disks/VolumeJBOD.cpp @@ -112,6 +112,7 @@ DiskPtr VolumeJBOD::getDisk(size_t /* index */) const return disks_by_size.top().disk; } } + UNREACHABLE(); } ReservationPtr VolumeJBOD::reserve(UInt64 bytes) @@ -163,6 +164,7 @@ ReservationPtr VolumeJBOD::reserve(UInt64 bytes) return reservation; } } + UNREACHABLE(); } bool VolumeJBOD::areMergesAvoided() const diff --git a/src/Formats/EscapingRuleUtils.cpp b/src/Formats/EscapingRuleUtils.cpp index 9577ca2a8df..89a7a31d033 100644 --- a/src/Formats/EscapingRuleUtils.cpp +++ b/src/Formats/EscapingRuleUtils.cpp @@ -62,6 +62,7 @@ String escapingRuleToString(FormatSettings::EscapingRule escaping_rule) case FormatSettings::EscapingRule::Raw: return "Raw"; } + UNREACHABLE(); } void skipFieldByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings) diff --git a/src/Functions/FunctionsRound.h b/src/Functions/FunctionsRound.h index dde57e8320d..99f3a14dfec 100644 --- a/src/Functions/FunctionsRound.h +++ b/src/Functions/FunctionsRound.h @@ -149,6 +149,8 @@ struct IntegerRoundingComputation return x; } } + + UNREACHABLE(); } static ALWAYS_INLINE T compute(T x, T scale) @@ -161,6 +163,8 @@ struct IntegerRoundingComputation case ScaleMode::Negative: return computeImpl(x, scale); } + + UNREACHABLE(); } static ALWAYS_INLINE void compute(const T * __restrict in, size_t scale, T * __restrict out) requires std::integral @@ -243,6 +247,8 @@ inline float roundWithMode(float x, RoundingMode mode) case RoundingMode::Ceil: return ceilf(x); case RoundingMode::Trunc: return truncf(x); } + + UNREACHABLE(); } inline double roundWithMode(double x, RoundingMode mode) @@ -254,6 +260,8 @@ inline double roundWithMode(double x, RoundingMode mode) case RoundingMode::Ceil: return ceil(x); case RoundingMode::Trunc: return trunc(x); } + + UNREACHABLE(); } template diff --git a/src/Functions/FunctionsTimeWindow.cpp b/src/Functions/FunctionsTimeWindow.cpp index f93a885ee65..1c9f28c9724 100644 --- a/src/Functions/FunctionsTimeWindow.cpp +++ b/src/Functions/FunctionsTimeWindow.cpp @@ -232,6 +232,7 @@ struct TimeWindowImpl default: throw Exception(ErrorCodes::SYNTAX_ERROR, "Fraction seconds are unsupported by windows yet"); } + UNREACHABLE(); } template @@ -421,6 +422,7 @@ struct TimeWindowImpl default: throw Exception(ErrorCodes::SYNTAX_ERROR, "Fraction seconds are unsupported by windows yet"); } + UNREACHABLE(); } template diff --git a/src/Functions/PolygonUtils.h b/src/Functions/PolygonUtils.h index 57f1243537d..c4851718da6 100644 --- a/src/Functions/PolygonUtils.h +++ b/src/Functions/PolygonUtils.h @@ -381,6 +381,8 @@ bool PointInPolygonWithGrid::contains(CoordinateType x, Coordina case CellType::complexPolygon: return boost::geometry::within(Point(x, y), polygons[cell.index_of_inner_polygon]); } + + UNREACHABLE(); } diff --git a/src/Functions/UserDefined/UserDefinedSQLObjectsZooKeeperStorage.cpp b/src/Functions/UserDefined/UserDefinedSQLObjectsZooKeeperStorage.cpp 
index 766d63eafb0..568e0b9b5d2 100644 --- a/src/Functions/UserDefined/UserDefinedSQLObjectsZooKeeperStorage.cpp +++ b/src/Functions/UserDefined/UserDefinedSQLObjectsZooKeeperStorage.cpp @@ -35,6 +35,7 @@ namespace case UserDefinedSQLObjectType::Function: return "function_"; } + UNREACHABLE(); } constexpr std::string_view sql_extension = ".sql"; diff --git a/src/IO/CompressionMethod.cpp b/src/IO/CompressionMethod.cpp index 22913125e99..b8e1134d422 100644 --- a/src/IO/CompressionMethod.cpp +++ b/src/IO/CompressionMethod.cpp @@ -52,6 +52,7 @@ std::string toContentEncodingName(CompressionMethod method) case CompressionMethod::None: return ""; } + UNREACHABLE(); } CompressionMethod chooseHTTPCompressionMethod(const std::string & list) diff --git a/src/IO/HadoopSnappyReadBuffer.h b/src/IO/HadoopSnappyReadBuffer.h index bbbb84dd6dd..73e52f2c503 100644 --- a/src/IO/HadoopSnappyReadBuffer.h +++ b/src/IO/HadoopSnappyReadBuffer.h @@ -88,6 +88,7 @@ public: case Status::TOO_LARGE_COMPRESSED_BLOCK: return "TOO_LARGE_COMPRESSED_BLOCK"; } + UNREACHABLE(); } explicit HadoopSnappyReadBuffer( diff --git a/src/Interpreters/AggregatedDataVariants.cpp b/src/Interpreters/AggregatedDataVariants.cpp index 8f82f15248f..87cfdda5948 100644 --- a/src/Interpreters/AggregatedDataVariants.cpp +++ b/src/Interpreters/AggregatedDataVariants.cpp @@ -117,6 +117,8 @@ size_t AggregatedDataVariants::size() const APPLY_FOR_AGGREGATED_VARIANTS(M) #undef M } + + UNREACHABLE(); } size_t AggregatedDataVariants::sizeWithoutOverflowRow() const @@ -134,6 +136,8 @@ size_t AggregatedDataVariants::sizeWithoutOverflowRow() const APPLY_FOR_AGGREGATED_VARIANTS(M) #undef M } + + UNREACHABLE(); } const char * AggregatedDataVariants::getMethodName() const @@ -151,6 +155,8 @@ const char * AggregatedDataVariants::getMethodName() const APPLY_FOR_AGGREGATED_VARIANTS(M) #undef M } + + UNREACHABLE(); } bool AggregatedDataVariants::isTwoLevel() const @@ -168,6 +174,8 @@ bool AggregatedDataVariants::isTwoLevel() const APPLY_FOR_AGGREGATED_VARIANTS(M) #undef M } + + UNREACHABLE(); } bool AggregatedDataVariants::isConvertibleToTwoLevel() const diff --git a/src/Interpreters/Cache/FileSegment.cpp b/src/Interpreters/Cache/FileSegment.cpp index 61a356fa3c3..9459029dc4c 100644 --- a/src/Interpreters/Cache/FileSegment.cpp +++ b/src/Interpreters/Cache/FileSegment.cpp @@ -799,6 +799,7 @@ String FileSegment::stateToString(FileSegment::State state) case FileSegment::State::DETACHED: return "DETACHED"; } + UNREACHABLE(); } bool FileSegment::assertCorrectness() const diff --git a/src/Interpreters/ComparisonGraph.cpp b/src/Interpreters/ComparisonGraph.cpp index d53ff4b0227..4eacbae7a30 100644 --- a/src/Interpreters/ComparisonGraph.cpp +++ b/src/Interpreters/ComparisonGraph.cpp @@ -309,6 +309,7 @@ ComparisonGraphCompareResult ComparisonGraph::pathToCompareResult(Path pat case Path::GREATER: return inverse ? ComparisonGraphCompareResult::LESS : ComparisonGraphCompareResult::GREATER; case Path::GREATER_OR_EQUAL: return inverse ? 
ComparisonGraphCompareResult::LESS_OR_EQUAL : ComparisonGraphCompareResult::GREATER_OR_EQUAL; } + UNREACHABLE(); } template diff --git a/src/Interpreters/FilesystemCacheLog.cpp b/src/Interpreters/FilesystemCacheLog.cpp index aa489351a98..80fe1c3a8ef 100644 --- a/src/Interpreters/FilesystemCacheLog.cpp +++ b/src/Interpreters/FilesystemCacheLog.cpp @@ -26,6 +26,7 @@ static String typeToString(FilesystemCacheLogElement::CacheType type) case FilesystemCacheLogElement::CacheType::WRITE_THROUGH_CACHE: return "WRITE_THROUGH_CACHE"; } + UNREACHABLE(); } ColumnsDescription FilesystemCacheLogElement::getColumnsDescription() diff --git a/src/Interpreters/HashJoin.cpp b/src/Interpreters/HashJoin.cpp index 75da8bbc3e7..3a21c13db5e 100644 --- a/src/Interpreters/HashJoin.cpp +++ b/src/Interpreters/HashJoin.cpp @@ -705,6 +705,7 @@ namespace APPLY_FOR_JOIN_VARIANTS(M) #undef M } + UNREACHABLE(); } } @@ -2640,6 +2641,8 @@ private: default: throw Exception(ErrorCodes::UNSUPPORTED_JOIN_KEYS, "Unsupported JOIN keys (type: {})", parent.data->type); } + + UNREACHABLE(); } template diff --git a/src/Interpreters/HashJoin.h b/src/Interpreters/HashJoin.h index a0996556f9a..86db8943926 100644 --- a/src/Interpreters/HashJoin.h +++ b/src/Interpreters/HashJoin.h @@ -322,6 +322,8 @@ public: APPLY_FOR_JOIN_VARIANTS(M) #undef M } + + UNREACHABLE(); } size_t getTotalByteCountImpl(Type which) const @@ -336,6 +338,8 @@ public: APPLY_FOR_JOIN_VARIANTS(M) #undef M } + + UNREACHABLE(); } size_t getBufferSizeInCells(Type which) const @@ -350,6 +354,8 @@ public: APPLY_FOR_JOIN_VARIANTS(M) #undef M } + + UNREACHABLE(); } /// NOLINTEND(bugprone-macro-parentheses) }; diff --git a/src/Interpreters/InterpreterTransactionControlQuery.cpp b/src/Interpreters/InterpreterTransactionControlQuery.cpp index 13872fbe3f5..d31ace758c4 100644 --- a/src/Interpreters/InterpreterTransactionControlQuery.cpp +++ b/src/Interpreters/InterpreterTransactionControlQuery.cpp @@ -33,6 +33,7 @@ BlockIO InterpreterTransactionControlQuery::execute() case ASTTransactionControl::SET_SNAPSHOT: return executeSetSnapshot(session_context, tcl.snapshot); } + UNREACHABLE(); } BlockIO InterpreterTransactionControlQuery::executeBegin(ContextMutablePtr session_context) diff --git a/src/Interpreters/SetVariants.cpp b/src/Interpreters/SetVariants.cpp index c600d096160..64796a013f1 100644 --- a/src/Interpreters/SetVariants.cpp +++ b/src/Interpreters/SetVariants.cpp @@ -41,6 +41,8 @@ size_t SetVariantsTemplate::getTotalRowCount() const APPLY_FOR_SET_VARIANTS(M) #undef M } + + UNREACHABLE(); } template @@ -55,6 +57,8 @@ size_t SetVariantsTemplate::getTotalByteCount() const APPLY_FOR_SET_VARIANTS(M) #undef M } + + UNREACHABLE(); } template diff --git a/src/Parsers/ASTExplainQuery.h b/src/Parsers/ASTExplainQuery.h index eb095b5dbbc..701bde8cebd 100644 --- a/src/Parsers/ASTExplainQuery.h +++ b/src/Parsers/ASTExplainQuery.h @@ -40,6 +40,8 @@ public: case TableOverride: return "EXPLAIN TABLE OVERRIDE"; case CurrentTransaction: return "EXPLAIN CURRENT TRANSACTION"; } + + UNREACHABLE(); } static ExplainKind fromString(const String & str) diff --git a/src/Parsers/Lexer.cpp b/src/Parsers/Lexer.cpp index 5f2bd50524c..34855a7ce20 100644 --- a/src/Parsers/Lexer.cpp +++ b/src/Parsers/Lexer.cpp @@ -42,7 +42,7 @@ Token quotedString(const char *& pos, const char * const token_begin, const char continue; } - chassert(false); + UNREACHABLE(); } } @@ -538,6 +538,8 @@ const char * getTokenName(TokenType type) APPLY_FOR_TOKENS(M) #undef M } + + UNREACHABLE(); } diff --git 
a/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp b/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp index 6b7f1f5206c..98cbdeaaa4b 100644 --- a/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp @@ -657,6 +657,7 @@ DataTypePtr MsgPackSchemaReader::getDataType(const msgpack::object & object) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Msgpack extension type {:x} is not supported", object_ext.type()); } } + UNREACHABLE(); } std::optional MsgPackSchemaReader::readRowAndGetDataTypes() diff --git a/src/Processors/IProcessor.cpp b/src/Processors/IProcessor.cpp index 5ab5e5277aa..8b160153733 100644 --- a/src/Processors/IProcessor.cpp +++ b/src/Processors/IProcessor.cpp @@ -36,6 +36,8 @@ std::string IProcessor::statusToName(Status status) case Status::ExpandPipeline: return "ExpandPipeline"; } + + UNREACHABLE(); } } diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.cpp b/src/Processors/QueryPlan/ReadFromMergeTree.cpp index 24ea8c25fb6..6f0fa55c349 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp +++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp @@ -1136,6 +1136,8 @@ static void addMergingFinal( return std::make_shared(header, num_outputs, sort_description, max_block_size_rows, /*max_block_size_bytes=*/0, merging_params.graphite_params, now); } + + UNREACHABLE(); }; pipe.addTransform(get_merging_processor()); @@ -2123,6 +2125,8 @@ static const char * indexTypeToString(ReadFromMergeTree::IndexType type) case ReadFromMergeTree::IndexType::Skip: return "Skip"; } + + UNREACHABLE(); } static const char * readTypeToString(ReadFromMergeTree::ReadType type) @@ -2138,6 +2142,8 @@ static const char * readTypeToString(ReadFromMergeTree::ReadType type) case ReadFromMergeTree::ReadType::ParallelReplicas: return "Parallel"; } + + UNREACHABLE(); } void ReadFromMergeTree::describeActions(FormatSettings & format_settings) const diff --git a/src/Processors/QueryPlan/TotalsHavingStep.cpp b/src/Processors/QueryPlan/TotalsHavingStep.cpp index ac5e144bf4a..d1bd70fd0b2 100644 --- a/src/Processors/QueryPlan/TotalsHavingStep.cpp +++ b/src/Processors/QueryPlan/TotalsHavingStep.cpp @@ -86,6 +86,8 @@ static String totalsModeToString(TotalsMode totals_mode, double auto_include_thr case TotalsMode::AFTER_HAVING_AUTO: return "after_having_auto threshold " + std::to_string(auto_include_threshold); } + + UNREACHABLE(); } void TotalsHavingStep::describeActions(FormatSettings & settings) const diff --git a/src/Processors/Transforms/FillingTransform.cpp b/src/Processors/Transforms/FillingTransform.cpp index bb38c3e1dc5..05fd2a7254f 100644 --- a/src/Processors/Transforms/FillingTransform.cpp +++ b/src/Processors/Transforms/FillingTransform.cpp @@ -67,6 +67,7 @@ static FillColumnDescription::StepFunction getStepFunction( FOR_EACH_INTERVAL_KIND(DECLARE_CASE) #undef DECLARE_CASE } + UNREACHABLE(); } static bool tryConvertFields(FillColumnDescription & descr, const DataTypePtr & type) diff --git a/src/Processors/Transforms/buildPushingToViewsChain.cpp b/src/Processors/Transforms/buildPushingToViewsChain.cpp index a1a886fb4f7..cdcfad4442c 100644 --- a/src/Processors/Transforms/buildPushingToViewsChain.cpp +++ b/src/Processors/Transforms/buildPushingToViewsChain.cpp @@ -898,6 +898,8 @@ static std::exception_ptr addStorageToException(std::exception_ptr ptr, const St { return std::current_exception(); } + + UNREACHABLE(); } void FinalizingViewsTransform::work() diff --git a/src/Storages/MergeTree/BackgroundJobsAssignee.cpp 
b/src/Storages/MergeTree/BackgroundJobsAssignee.cpp index 0a69bf1109f..56a4378cf9a 100644 --- a/src/Storages/MergeTree/BackgroundJobsAssignee.cpp +++ b/src/Storages/MergeTree/BackgroundJobsAssignee.cpp @@ -93,6 +93,7 @@ String BackgroundJobsAssignee::toString(Type type) case Type::Moving: return "Moving"; } + UNREACHABLE(); } void BackgroundJobsAssignee::start() diff --git a/src/Storages/MergeTree/KeyCondition.cpp b/src/Storages/MergeTree/KeyCondition.cpp index 9666da574fb..bd8642b9f66 100644 --- a/src/Storages/MergeTree/KeyCondition.cpp +++ b/src/Storages/MergeTree/KeyCondition.cpp @@ -2964,6 +2964,8 @@ String KeyCondition::RPNElement::toString(std::string_view column_name, bool pri case ALWAYS_TRUE: return "true"; } + + UNREACHABLE(); } diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index b6373a22d9c..4b3093eeaac 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -1177,6 +1177,8 @@ String MergeTreeData::MergingParams::getModeName() const case Graphite: return "Graphite"; case VersionedCollapsing: return "VersionedCollapsing"; } + + UNREACHABLE(); } Int64 MergeTreeData::getMaxBlockNumber() const diff --git a/src/Storages/MergeTree/MergeTreeDataWriter.cpp b/src/Storages/MergeTree/MergeTreeDataWriter.cpp index df4087b8546..426e36ce9a9 100644 --- a/src/Storages/MergeTree/MergeTreeDataWriter.cpp +++ b/src/Storages/MergeTree/MergeTreeDataWriter.cpp @@ -360,6 +360,8 @@ Block MergeTreeDataWriter::mergeBlock( return std::make_shared( block, 1, sort_description, block_size + 1, /*block_size_bytes=*/0, merging_params.graphite_params, time(nullptr)); } + + UNREACHABLE(); }; auto merging_algorithm = get_merging_algorithm(); diff --git a/src/Storages/MergeTree/PartMovesBetweenShardsOrchestrator.cpp b/src/Storages/MergeTree/PartMovesBetweenShardsOrchestrator.cpp index 4228d7b70b6..78fcfabb704 100644 --- a/src/Storages/MergeTree/PartMovesBetweenShardsOrchestrator.cpp +++ b/src/Storages/MergeTree/PartMovesBetweenShardsOrchestrator.cpp @@ -616,6 +616,8 @@ PartMovesBetweenShardsOrchestrator::Entry PartMovesBetweenShardsOrchestrator::st } } } + + UNREACHABLE(); } void PartMovesBetweenShardsOrchestrator::removePins(const Entry & entry, zkutil::ZooKeeperPtr zk) diff --git a/src/Storages/WindowView/StorageWindowView.cpp b/src/Storages/WindowView/StorageWindowView.cpp index 8bca1c97aad..a9ec1f6c694 100644 --- a/src/Storages/WindowView/StorageWindowView.cpp +++ b/src/Storages/WindowView/StorageWindowView.cpp @@ -297,6 +297,7 @@ namespace CASE_WINDOW_KIND(Year) #undef CASE_WINDOW_KIND } + UNREACHABLE(); } class AddingAggregatedChunkInfoTransform : public ISimpleTransform @@ -919,6 +920,7 @@ UInt32 StorageWindowView::getWindowLowerBound(UInt32 time_sec) CASE_WINDOW_KIND(Year) #undef CASE_WINDOW_KIND } + UNREACHABLE(); } UInt32 StorageWindowView::getWindowUpperBound(UInt32 time_sec) @@ -946,6 +948,7 @@ UInt32 StorageWindowView::getWindowUpperBound(UInt32 time_sec) CASE_WINDOW_KIND(Year) #undef CASE_WINDOW_KIND } + UNREACHABLE(); } void StorageWindowView::addFireSignal(std::set & signals) From c42338b8e0e4a8239fb34001860c9dba091e926a Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 27 May 2024 11:51:46 +0200 Subject: [PATCH 384/392] Fix test --- src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index 69485bd4d01..823e272cf01 100644 --- 
a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -578,6 +578,7 @@ void S3ObjectStorage::applyNewSettings( auto settings_from_config = getSettings(config, config_prefix, context, context->getSettingsRef().s3_validate_request_settings); auto modified_settings = std::make_unique(*s3_settings.get()); modified_settings->auth_settings.updateFrom(settings_from_config->auth_settings); + modified_settings->request_settings = settings_from_config->request_settings; if (auto endpoint_settings = context->getStorageS3Settings().getSettings(uri.uri.toString(), context->getUserName())) modified_settings->auth_settings.updateFrom(endpoint_settings->auth_settings); From 0676b155de8ebbea9cd9f8dcafdfe2dc8a03abfc Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 27 May 2024 12:12:39 +0200 Subject: [PATCH 385/392] Remove logging --- src/Storages/ObjectStorage/ReadBufferIterator.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/Storages/ObjectStorage/ReadBufferIterator.cpp b/src/Storages/ObjectStorage/ReadBufferIterator.cpp index 5e89a0a1b9d..78cdc442f64 100644 --- a/src/Storages/ObjectStorage/ReadBufferIterator.cpp +++ b/src/Storages/ObjectStorage/ReadBufferIterator.cpp @@ -254,21 +254,17 @@ ReadBufferIterator::Data ReadBufferIterator::next() } } - LOG_TEST(getLogger("KSSENII"), "Will read columns from {}", current_object_info->getPath()); - std::unique_ptr read_buf; CompressionMethod compression_method; using ObjectInfoInArchive = StorageObjectStorageSource::ArchiveIterator::ObjectInfoInArchive; if (const auto * object_info_in_archive = dynamic_cast(current_object_info.get())) { - LOG_TEST(getLogger("KSSENII"), "Will read columns from {} from archive", current_object_info->getPath()); compression_method = chooseCompressionMethod(filename, configuration->compression_method); const auto & archive_reader = object_info_in_archive->archive_reader; read_buf = archive_reader->readFile(object_info_in_archive->path_in_archive, /*throw_on_not_found=*/true); } else { - LOG_TEST(getLogger("KSSENII"), "Will read columns from {} from s3", current_object_info->getPath()); compression_method = chooseCompressionMethod(filename, configuration->compression_method); read_buf = object_storage->readObject( StoredObject(current_object_info->getPath()), From 2bffc72d64e62f9f5ddb177f4b617bcc6d2c6253 Mon Sep 17 00:00:00 2001 From: avogar Date: Mon, 27 May 2024 10:57:26 +0000 Subject: [PATCH 386/392] Fix optimize_aggregation_in_order setting --- .../queries/0_stateless/03039_dynamic_aggregating_merge_tree.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/03039_dynamic_aggregating_merge_tree.sh b/tests/queries/0_stateless/03039_dynamic_aggregating_merge_tree.sh index c433d409c7c..b8760ec0e1d 100755 --- a/tests/queries/0_stateless/03039_dynamic_aggregating_merge_tree.sh +++ b/tests/queries/0_stateless/03039_dynamic_aggregating_merge_tree.sh @@ -8,7 +8,7 @@ CLICKHOUSE_LOG_COMMENT= . 
"$CUR_DIR"/../shell_config.sh # Fix some settings to avoid timeouts because of some settings randomization -CH_CLIENT="$CLICKHOUSE_CLIENT --allow_merge_tree_settings --allow_experimental_dynamic_type=1 --index_granularity_bytes 10485760 --index_granularity 8128 --merge_max_block_size 8128" +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_merge_tree_settings --allow_experimental_dynamic_type=1 --index_granularity_bytes 10485760 --index_granularity 8128 --merge_max_block_size 8128 --optimize_aggregation_in_order 0" function test() { From ed6994d372b636b4981593303e8dfde654bc151b Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Mon, 27 May 2024 13:01:35 +0200 Subject: [PATCH 387/392] Clean settings in 02943_variant_read_subcolumns test --- tests/queries/0_stateless/02943_variant_read_subcolumns.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/02943_variant_read_subcolumns.sh b/tests/queries/0_stateless/02943_variant_read_subcolumns.sh index 6bbd127d933..5ca8dd5f36f 100755 --- a/tests/queries/0_stateless/02943_variant_read_subcolumns.sh +++ b/tests/queries/0_stateless/02943_variant_read_subcolumns.sh @@ -7,8 +7,7 @@ CLICKHOUSE_LOG_COMMENT= # shellcheck source=../shell_config.sh . "$CUR_DIR"/../shell_config.sh -CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_variant_type=1 --use_variant_as_common_type=1 --allow_suspicious_variant_types=1 --max_insert_threads 4 --group_by_two_level_threshold 752249 --group_by_two_level_threshold_bytes 15083870 --distributed_aggregation_memory_efficient 1 --fsync_metadata 1 --output_format_parallel_formatting 0 --input_format_parallel_parsing 0 --min_chunk_bytes_for_parallel_parsing 6583861 --max_read_buffer_size 640584 --prefer_localhost_replica 1 --max_block_size 38844 --max_threads 48 --optimize_append_index 0 --optimize_if_chain_to_multiif 1 --optimize_if_transform_strings_to_enum 0 --optimize_read_in_order 1 --optimize_or_like_chain 0 --optimize_substitute_columns 1 --enable_multiple_prewhere_read_steps 1 --read_in_order_two_level_merge_threshold 4 --optimize_aggregation_in_order 0 --aggregation_in_order_max_block_bytes 18284646 --use_uncompressed_cache 1 --min_bytes_to_use_direct_io 10737418240 --min_bytes_to_use_mmap_io 10737418240 --local_filesystem_read_method pread --remote_filesystem_read_method read --local_filesystem_read_prefetch 1 --filesystem_cache_segments_batch_size 0 --read_from_filesystem_cache_if_exists_otherwise_bypass_cache 0 --throw_on_error_from_cache_on_write_operations 1 --remote_filesystem_read_prefetch 0 --allow_prefetched_read_pool_for_remote_filesystem 0 --filesystem_prefetch_max_memory_usage 128Mi --filesystem_prefetches_limit 0 --filesystem_prefetch_min_bytes_for_single_read_task 16Mi --filesystem_prefetch_step_marks 50 --filesystem_prefetch_step_bytes 0 --compile_aggregate_expressions 1 --compile_sort_description 0 --merge_tree_coarse_index_granularity 31 --optimize_distinct_in_order 1 --max_bytes_before_external_sort 1 --max_bytes_before_external_group_by 1 --max_bytes_before_remerge_sort 2640239625 --min_compress_block_size 3114155 --max_compress_block_size 226550 --merge_tree_compact_parts_min_granules_to_multibuffer_read 118 --optimize_sorting_by_input_stream_properties 0 --http_response_buffer_size 543038 --http_wait_end_of_query False --enable_memory_bound_merging_of_aggregation_results 1 --min_count_to_compile_expression 3 --min_count_to_compile_aggregate_expression 3 --min_count_to_compile_sort_description 0 --session_timezone 
America/Mazatlan --prefer_warmed_unmerged_parts_seconds 8 --use_page_cache_for_disks_without_file_cache False --page_cache_inject_eviction True --merge_tree_read_split_ranges_into_intersecting_and_non_intersecting_injection_probability 0.82 " - +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_variant_type=1 --use_variant_as_common_type=1 --allow_suspicious_variant_types=1" function test() { From 747f6ae39c98d2caac1ddd6f5958aecc7bb92e22 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Mon, 27 May 2024 12:52:44 +0000 Subject: [PATCH 388/392] Add a comment after #64226 --- src/Analyzer/Passes/QueryAnalysisPass.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Analyzer/Passes/QueryAnalysisPass.cpp b/src/Analyzer/Passes/QueryAnalysisPass.cpp index 3fca66e6eb8..43edaaa53fd 100644 --- a/src/Analyzer/Passes/QueryAnalysisPass.cpp +++ b/src/Analyzer/Passes/QueryAnalysisPass.cpp @@ -3916,6 +3916,7 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromArrayJoin(const Identifi return array_join_column; } + /// Resolve subcolumns. Example : SELECT x.y.z FROM tab ARRAY JOIN arr AS x auto compound_expr = tryResolveIdentifierFromCompoundExpression( identifier_lookup.identifier, identifier_lookup.identifier.getPartsSize() - identifier_view.getPartsSize() /*identifier_bind_size*/, From 8f775037bfcf6e109ec4c79b5fd943f25789f240 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Mon, 27 May 2024 08:07:05 +0000 Subject: [PATCH 389/392] Address PR review --- src/Backups/BackupIO_S3.cpp | 17 +++++----- src/Disks/DiskEncrypted.h | 6 ++-- src/Disks/IDisk.h | 12 ++++++- .../ObjectStorages/DiskObjectStorage.cpp | 6 ++++ src/Disks/ObjectStorages/DiskObjectStorage.h | 6 ++++ src/Disks/ObjectStorages/IObjectStorage.h | 1 + .../ObjectStorages/S3/S3ObjectStorage.cpp | 31 ++++++++++--------- src/IO/S3/copyS3File.cpp | 15 +++++++-- src/IO/S3/copyS3File.h | 4 +-- .../test_backup_restore_s3/test.py | 4 +++ 10 files changed, 72 insertions(+), 30 deletions(-) diff --git a/src/Backups/BackupIO_S3.cpp b/src/Backups/BackupIO_S3.cpp index ee88556fbd6..be2f81a299c 100644 --- a/src/Backups/BackupIO_S3.cpp +++ b/src/Backups/BackupIO_S3.cpp @@ -188,6 +188,7 @@ void BackupReaderS3::copyFileToDisk(const String & path_in_backup, size_t file_s fs::path(s3_uri.key) / path_in_backup, 0, file_size, + /* dest_s3_client= */ destination_disk->getObjectStorage()->getS3StorageClient(), /* dest_bucket= */ blob_path[1], /* dest_key= */ blob_path[0], s3_settings.request_settings, @@ -195,8 +196,7 @@ void BackupReaderS3::copyFileToDisk(const String & path_in_backup, size_t file_s blob_storage_log, object_attributes, threadPoolCallbackRunnerUnsafe(getBackupsIOThreadPool().get(), "BackupReaderS3"), - /* for_disk_s3= */ true, - destination_disk->getObjectStorage()->getS3StorageClient()); + /* for_disk_s3= */ true); return file_size; }; @@ -258,15 +258,15 @@ void BackupWriterS3::copyFileFromDisk(const String & path_in_backup, DiskPtr src /* src_key= */ blob_path[0], start_pos, length, - s3_uri.bucket, - fs::path(s3_uri.key) / path_in_backup, + /* dest_s3_client= */ client, + /* dest_bucket= */ s3_uri.bucket, + /* dest_key= */ fs::path(s3_uri.key) / path_in_backup, s3_settings.request_settings, read_settings, blob_storage_log, {}, threadPoolCallbackRunnerUnsafe(getBackupsIOThreadPool().get(), "BackupWriterS3"), - /*for_disk_s3=*/false, - client); + /*for_disk_s3=*/false); return; /// copied! 
} } @@ -284,8 +284,9 @@ void BackupWriterS3::copyFile(const String & destination, const String & source, /* src_key= */ fs::path(s3_uri.key) / source, 0, size, - s3_uri.bucket, - fs::path(s3_uri.key) / destination, + /* dest_s3_client= */ client, + /* dest_bucket= */ s3_uri.bucket, + /* dest_key= */ fs::path(s3_uri.key) / destination, s3_settings.request_settings, read_settings, blob_storage_log, diff --git a/src/Disks/DiskEncrypted.h b/src/Disks/DiskEncrypted.h index 27cf3096344..9b575c65bce 100644 --- a/src/Disks/DiskEncrypted.h +++ b/src/Disks/DiskEncrypted.h @@ -350,10 +350,12 @@ public: return delegate; } - ObjectStoragePtr getObjectStorage() override +#if USE_AWS_S3 + std::shared_ptr getS3StorageClient() const override { - return delegate->getObjectStorage(); + return delegate->getS3StorageClient(); } +#endif private: String wrappedPath(const String & path) const diff --git a/src/Disks/IDisk.h b/src/Disks/IDisk.h index b59e5b7f558..658acb01c74 100644 --- a/src/Disks/IDisk.h +++ b/src/Disks/IDisk.h @@ -14,7 +14,6 @@ #include #include -#include #include #include #include @@ -471,6 +470,17 @@ public: virtual DiskPtr getDelegateDiskIfExists() const { return nullptr; } +#if USE_AWS_S3 + virtual std::shared_ptr getS3StorageClient() const + { + throw Exception( + ErrorCodes::NOT_IMPLEMENTED, + "Method getS3StorageClient() is not implemented for disk type: {}", + getDataSourceDescription().toString()); + } +#endif + + protected: friend class DiskDecorator; diff --git a/src/Disks/ObjectStorages/DiskObjectStorage.cpp b/src/Disks/ObjectStorages/DiskObjectStorage.cpp index abf0c1fad0b..5803a985000 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorage.cpp +++ b/src/Disks/ObjectStorages/DiskObjectStorage.cpp @@ -582,6 +582,12 @@ UInt64 DiskObjectStorage::getRevision() const return metadata_helper->getRevision(); } +#if USE_AWS_S3 +std::shared_ptr DiskObjectStorage::getS3StorageClient() const +{ + return object_storage->getS3StorageClient(); +} +#endif DiskPtr DiskObjectStorageReservation::getDisk(size_t i) const { diff --git a/src/Disks/ObjectStorages/DiskObjectStorage.h b/src/Disks/ObjectStorages/DiskObjectStorage.h index 2a27ddf89a7..ffef0a007da 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorage.h +++ b/src/Disks/ObjectStorages/DiskObjectStorage.h @@ -6,6 +6,8 @@ #include #include +#include "config.h" + namespace CurrentMetrics { @@ -210,6 +212,10 @@ public: bool supportsChmod() const override { return metadata_storage->supportsChmod(); } void chmod(const String & path, mode_t mode) override; +#if USE_AWS_S3 + std::shared_ptr getS3StorageClient() const override; +#endif + private: /// Create actual disk object storage transaction for operations diff --git a/src/Disks/ObjectStorages/IObjectStorage.h b/src/Disks/ObjectStorages/IObjectStorage.h index c9f445b9a35..b49dc839561 100644 --- a/src/Disks/ObjectStorages/IObjectStorage.h +++ b/src/Disks/ObjectStorages/IObjectStorage.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index 2e7bb6eeec9..12dda230b79 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -495,13 +495,14 @@ void S3ObjectStorage::copyObjectToAnotherObjectStorage( // NOLINT try { copyS3File( - current_client, - uri.bucket, - object_from.remote_path, - 0, - size, - dest_s3->uri.bucket, - object_to.remote_path, + /*src_s3_client=*/current_client, + /*src_bucket=*/uri.bucket, + 
/*src_key=*/object_from.remote_path, + /*src_offset=*/0, + /*src_size=*/size, + /*dest_s3_client=*/current_client, + /*dest_bucket=*/dest_s3->uri.bucket, + /*dest_key=*/object_to.remote_path, settings_ptr->request_settings, patchSettings(read_settings), BlobStorageLogWriter::create(disk_name), @@ -535,13 +536,15 @@ void S3ObjectStorage::copyObject( // NOLINT auto size = S3::getObjectSize(*current_client, uri.bucket, object_from.remote_path, {}, settings_ptr->request_settings); auto scheduler = threadPoolCallbackRunnerUnsafe(getThreadPoolWriter(), "S3ObjStor_copy"); - copyS3File(current_client, - uri.bucket, - object_from.remote_path, - 0, - size, - uri.bucket, - object_to.remote_path, + copyS3File( + /*src_s3_client=*/current_client, + /*src_bucket=*/uri.bucket, + /*src_key=*/object_from.remote_path, + /*src_offset=*/0, + /*src_size=*/size, + /*dest_s3_client=*/current_client, + /*dest_bucket=*/uri.bucket, + /*dest_key=*/object_to.remote_path, settings_ptr->request_settings, patchSettings(read_settings), BlobStorageLogWriter::create(disk_name), diff --git a/src/IO/S3/copyS3File.cpp b/src/IO/S3/copyS3File.cpp index 8dc2e6c0e0d..24e14985758 100644 --- a/src/IO/S3/copyS3File.cpp +++ b/src/IO/S3/copyS3File.cpp @@ -654,7 +654,16 @@ namespace bool for_disk_s3_, BlobStorageLogWriterPtr blob_storage_log_, std::function fallback_method_) - : UploadHelper(client_ptr_, dest_bucket_, dest_key_, request_settings_, object_metadata_, schedule_, for_disk_s3_, blob_storage_log_, getLogger("copyS3File")) + : UploadHelper( + client_ptr_, + dest_bucket_, + dest_key_, + request_settings_, + object_metadata_, + schedule_, + for_disk_s3_, + blob_storage_log_, + getLogger("copyS3File")) , src_bucket(src_bucket_) , src_key(src_key_) , offset(src_offset_) @@ -869,6 +878,7 @@ void copyS3File( const String & src_key, size_t src_offset, size_t src_size, + std::shared_ptr dest_s3_client, const String & dest_bucket, const String & dest_key, const S3Settings::RequestSettings & settings, @@ -876,8 +886,7 @@ void copyS3File( BlobStorageLogWriterPtr blob_storage_log, const std::optional> & object_metadata, ThreadPoolCallbackRunnerUnsafe schedule, - bool for_disk_s3, - std::shared_ptr dest_s3_client) + bool for_disk_s3) { if (!dest_s3_client) dest_s3_client = src_s3_client; diff --git a/src/IO/S3/copyS3File.h b/src/IO/S3/copyS3File.h index cb1960cc368..85b3870ddbf 100644 --- a/src/IO/S3/copyS3File.h +++ b/src/IO/S3/copyS3File.h @@ -36,6 +36,7 @@ void copyS3File( const String & src_key, size_t src_offset, size_t src_size, + std::shared_ptr dest_s3_client, const String & dest_bucket, const String & dest_key, const S3Settings::RequestSettings & settings, @@ -43,8 +44,7 @@ void copyS3File( BlobStorageLogWriterPtr blob_storage_log, const std::optional> & object_metadata = std::nullopt, ThreadPoolCallbackRunnerUnsafe schedule_ = {}, - bool for_disk_s3 = false, - std::shared_ptr dest_s3_client = nullptr); + bool for_disk_s3 = false); /// Copies data from any seekable source to S3. 
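/// Usage illustration (added for clarity; not part of the patch, and the variable names below
/// are placeholders): with the reordered declaration above, a caller copying a blob between two
/// S3-backed disks would obtain both clients through the new IDisk::getS3StorageClient()
/// accessor introduced earlier in this series and pass the destination client right after the
/// source range, rather than as a trailing defaulted argument:
///
///     copyS3File(
///         /* src_s3_client  = */ source_disk->getS3StorageClient(),
///         /* src_bucket     = */ source_bucket,
///         /* src_key        = */ source_key,
///         /* src_offset     = */ 0,
///         /* src_size       = */ file_size,
///         /* dest_s3_client = */ destination_disk->getS3StorageClient(),
///         /* dest_bucket    = */ destination_bucket,
///         /* dest_key       = */ destination_key,
///         request_settings,
///         read_settings,
///         blob_storage_log);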
/// The same functionality can be done by using the function copyData() and the class WriteBufferFromS3 diff --git a/tests/integration/test_backup_restore_s3/test.py b/tests/integration/test_backup_restore_s3/test.py index a76b32bce39..967ed6a221c 100644 --- a/tests/integration/test_backup_restore_s3/test.py +++ b/tests/integration/test_backup_restore_s3/test.py @@ -28,6 +28,10 @@ node = cluster.add_instance( def setup_minio_users(): + # create 2 extra users with restricted access + # miniorestricted1 - full access to bucket 'root', no access to other buckets + # miniorestricted2 - full access to bucket 'root2', no access to other buckets + # storage policy 'policy_s3_restricted' defines a policy for storing files inside bucket 'root' using 'miniorestricted1' user for user, bucket in [("miniorestricted1", "root"), ("miniorestricted2", "root2")]: print( cluster.exec_in_container( From 8166da7fbb616d9fa2d779ffe8e533b238d3680e Mon Sep 17 00:00:00 2001 From: Blargian Date: Mon, 27 May 2024 16:21:36 +0200 Subject: [PATCH 390/392] Incorporate review changes --- .../functions/type-conversion-functions.md | 124 +++++------------- 1 file changed, 30 insertions(+), 94 deletions(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 2360cecb9a5..c4e0b2946c4 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -998,7 +998,7 @@ Result: ## reinterpretAsUInt8 -Performs byte reinterpretation similar to [reinterpret_cast](https://en.cppreference.com/w/cpp/language/reinterpret_cast) to type UInt8. +Performs byte reinterpretation by treating the input value as a value of type UInt8. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1008,11 +1008,7 @@ reinterpretAsUInt8(x) **Parameters** -- `x`: value to byte reinterpret as UInt8. - -:::note -Accepts types that can be interpreted as numeric such as [(U)Int*](../data-types/int-uint.md), [Float](../data-types/float.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), [UUID](../data-types/uuid.md). Accepts [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). -::: +- `x`: value to byte reinterpret as UInt8. [(U)Int*](../data-types/int-uint.md), [Float](../data-types/float.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), [UUID](../data-types/uuid.md), [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). **Returned value** @@ -1040,7 +1036,7 @@ Result: ## reinterpretAsUInt16 -Performs byte reinterpretation similar to [reinterpret_cast](https://en.cppreference.com/w/cpp/language/reinterpret_cast) to type UInt16. +Performs byte reinterpretation by treating the input value as a value of type UInt16. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1050,11 +1046,7 @@ reinterpretAsUInt16(x) **Parameters** -- `x`: value to byte reinterpret as UInt16. - -:::note -Accepts types that can be interpreted as numeric such as [(U)Int*](../data-types/int-uint.md), [Float](../data-types/float.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), [UUID](../data-types/uuid.md). 
Accepts [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). -::: +- `x`: value to byte reinterpret as UInt16. [(U)Int*](../data-types/int-uint.md), [Float](../data-types/float.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), [UUID](../data-types/uuid.md), [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). **Returned value** @@ -1082,7 +1074,7 @@ Result: ## reinterpretAsUInt32 -Performs byte reinterpretation similar to [reinterpret_cast](https://en.cppreference.com/w/cpp/language/reinterpret_cast) to type UInt32. +Performs byte reinterpretation by treating the input value as a value of type UInt32. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1092,11 +1084,7 @@ reinterpretAsUInt32(x) **Parameters** -- `x`: value to byte reinterpret as UInt32. - -:::note -Accepts types that can be interpreted as numeric such as [(U)Int*](../data-types/int-uint.md), [Float](../data-types/float.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), [UUID](../data-types/uuid.md). Accepts [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). -::: +- `x`: value to byte reinterpret as UInt32. [(U)Int*](../data-types/int-uint.md), [Float](../data-types/float.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), [UUID](../data-types/uuid.md), [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). **Returned value** @@ -1124,7 +1112,7 @@ Result: ## reinterpretAsUInt64 -Performs byte reinterpretation similar to [reinterpret_cast](https://en.cppreference.com/w/cpp/language/reinterpret_cast) to type UInt64. +Performs byte reinterpretation by treating the input value as a value of type UInt64. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1134,11 +1122,7 @@ reinterpretAsUInt64(x) **Parameters** -- `x`: value to byte reinterpret as UInt64. - -:::note -Accepts types that can be interpreted as numeric such as [(U)Int*](../data-types/int-uint.md), [Float](../data-types/float.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), [UUID](../data-types/uuid.md). Accepts [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). -::: +- `x`: value to byte reinterpret as UInt64. [(U)Int*](../data-types/int-uint.md), [Float](../data-types/float.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), [UUID](../data-types/uuid.md), [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). **Returned value** @@ -1166,7 +1150,7 @@ Result: ## reinterpretAsUInt128 -Performs byte reinterpretation similar to [reinterpret_cast](https://en.cppreference.com/w/cpp/language/reinterpret_cast) to type UInt128. +Performs byte reinterpretation by treating the input value as a value of type UInt128. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1176,11 +1160,7 @@ reinterpretAsUInt128(x) **Parameters** -- `x`: value to byte reinterpret as UInt64. 
- -:::note -Accepts types that can be interpreted as numeric such as [(U)Int*](../data-types/int-uint.md), [Float](../data-types/float.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), [UUID](../data-types/uuid.md). Accepts [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). -::: +- `x`: value to byte reinterpret as UInt128. [(U)Int*](../data-types/int-uint.md), [Float](../data-types/float.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), [UUID](../data-types/uuid.md), [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). **Returned value** @@ -1208,7 +1188,7 @@ Result: ## reinterpretAsUInt256 -Performs byte reinterpretation similar to [reinterpret_cast](https://en.cppreference.com/w/cpp/language/reinterpret_cast) to type UInt256. +Performs byte reinterpretation by treating the input value as a value of type UInt256. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1218,11 +1198,7 @@ reinterpretAsUInt256(x) **Parameters** -- `x`: value to byte reinterpret as UInt256. - -:::note -Accepts types that can be interpreted as numeric such as [(U)Int*](../data-types/int-uint.md), [Float](../data-types/float.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), [UUID](../data-types/uuid.md). Accepts [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). -::: +- `x`: value to byte reinterpret as UInt256. [(U)Int*](../data-types/int-uint.md), [Float](../data-types/float.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), [UUID](../data-types/uuid.md), [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). **Returned value** @@ -1250,7 +1226,7 @@ Result: ## reinterpretAsInt8 -Performs byte reinterpretation similar to [reinterpret_cast](https://en.cppreference.com/w/cpp/language/reinterpret_cast) to type Int8. +Performs byte reinterpretation by treating the input value as a value of type Int8. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1260,11 +1236,7 @@ reinterpretAsInt8(x) **Parameters** -- `x`: value to byte reinterpret as Int8. - -:::note -Accepts types that can be interpreted as numeric such as [(U)Int*](../data-types/int-uint.md), [Float](../data-types/float.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), [UUID](../data-types/uuid.md). Accepts [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). -::: +- `x`: value to byte reinterpret as Int8. [(U)Int*](../data-types/int-uint.md), [Float](../data-types/float.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), [UUID](../data-types/uuid.md), [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). **Returned value** @@ -1292,7 +1264,7 @@ Result: ## reinterpretAsInt16 -Performs byte reinterpretation similar to [reinterpret_cast](https://en.cppreference.com/w/cpp/language/reinterpret_cast) to type Int16. +Performs byte reinterpretation by treating the input value as a value of type Int16. 
Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1302,11 +1274,7 @@ reinterpretAsInt16(x) **Parameters** -- `x`: value to byte reinterpret as Int16. - -:::note -Accepts types that can be interpreted as numeric such as [(U)Int*](../data-types/int-uint.md), [Float](../data-types/float.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), [UUID](../data-types/uuid.md). Accepts [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). -::: +- `x`: value to byte reinterpret as Int16. [(U)Int*](../data-types/int-uint.md), [Float](../data-types/float.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), [UUID](../data-types/uuid.md), [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). **Returned value** @@ -1334,7 +1302,7 @@ Result: ## reinterpretAsInt32 -Performs byte reinterpretation similar to [reinterpret_cast](https://en.cppreference.com/w/cpp/language/reinterpret_cast) to type Int32. +Performs byte reinterpretation by treating the input value as a value of type Int32. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1344,11 +1312,7 @@ reinterpretAsInt32(x) **Parameters** -- `x`: value to byte reinterpret as Int32. - -:::note -Accepts types that can be interpreted as numeric such as [(U)Int*](../data-types/int-uint.md), [Float](../data-types/float.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), [UUID](../data-types/uuid.md). Accepts [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). -::: +- `x`: value to byte reinterpret as Int32. [(U)Int*](../data-types/int-uint.md), [Float](../data-types/float.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), [UUID](../data-types/uuid.md), [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). **Returned value** @@ -1376,7 +1340,7 @@ Result: ## reinterpretAsInt64 -Performs byte reinterpretation similar to [reinterpret_cast](https://en.cppreference.com/w/cpp/language/reinterpret_cast) to type Int64. +Performs byte reinterpretation by treating the input value as a value of type Int64. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1386,11 +1350,7 @@ reinterpretAsInt64(x) **Parameters** -- `x`: value to byte reinterpret as Int64. - -:::note -Accepts types that can be interpreted as numeric such as [(U)Int*](../data-types/int-uint.md), [Float](../data-types/float.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), [UUID](../data-types/uuid.md). Accepts [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). -::: +- `x`: value to byte reinterpret as Int64. [(U)Int*](../data-types/int-uint.md), [Float](../data-types/float.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), [UUID](../data-types/uuid.md), [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). 
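For intuition, the reinterpretation performed by this family of functions can be pictured as a raw little-endian byte copy into the target type. The following C++ sketch is an illustration added here (it is not ClickHouse source code) and models the behaviour for string inputs, where missing bytes act as zero padding and surplus bytes are ignored:

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstring>

// Rough model of reinterpretAsUInt32(s) for a string s on a little-endian host:
// copy at most sizeof(uint32_t) raw bytes and leave the remaining bytes zero.
uint32_t reinterpret_as_uint32(const char * data, size_t size)
{
    uint32_t result = 0;                                         // zero padding for short inputs
    std::memcpy(&result, data, std::min(size, sizeof(result)));  // extra input bytes are ignored
    return result;
}
```

The same picture explains the examples in these sections: reinterpreting a small value as a wider integer leaves it unchanged, because the additional bytes are simply zero.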
**Returned value** @@ -1418,7 +1378,7 @@ Result: ## reinterpretAsInt128 -Performs byte reinterpretation similar to [reinterpret_cast](https://en.cppreference.com/w/cpp/language/reinterpret_cast) to type Int128. +Performs byte reinterpretation by treating the input value as a value of type Int128. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1428,11 +1388,7 @@ reinterpretAsInt128(x) **Parameters** -- `x`: value to byte reinterpret as Int128. - -:::note -Accepts types that can be interpreted as numeric such as [(U)Int*](../data-types/int-uint.md), [Float](../data-types/float.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), [UUID](../data-types/uuid.md). Accepts [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). -::: +- `x`: value to byte reinterpret as Int128. [(U)Int*](../data-types/int-uint.md), [Float](../data-types/float.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), [UUID](../data-types/uuid.md), [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). **Returned value** @@ -1460,7 +1416,7 @@ Result: ## reinterpretAsInt256 -Performs byte reinterpretation similar to [reinterpret_cast](https://en.cppreference.com/w/cpp/language/reinterpret_cast) to type Int256. +Performs byte reinterpretation by treating the input value as a value of type Int256. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1470,11 +1426,7 @@ reinterpretAsInt256(x) **Parameters** -- `x`: value to byte reinterpret as Int256. - -:::note -Accepts types that can be interpreted as numeric such as [(U)Int*](../data-types/int-uint.md), [Float](../data-types/float.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), [UUID](../data-types/uuid.md). Accepts [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). -::: +- `x`: value to byte reinterpret as Int256. [(U)Int*](../data-types/int-uint.md), [Float](../data-types/float.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), [UUID](../data-types/uuid.md), [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). **Returned value** @@ -1502,7 +1454,7 @@ Result: ## reinterpretAsFloat32 -Performs byte reinterpretation similar to [reinterpret_cast](https://en.cppreference.com/w/cpp/language/reinterpret_cast) to type Float32. +Performs byte reinterpretation by treating the input value as a value of type Float32. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1512,11 +1464,7 @@ reinterpretAsFloat32(x) **Parameters** -- `x`: value to reinterpret as Float32. - -:::note -Accepts types that can be interpreted as numeric such as [(U)Int*](../data-types/int-uint.md), [Float](../data-types/float.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), [UUID](../data-types/uuid.md). Accepts [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). -::: +- `x`: value to reinterpret as Float32. 
[(U)Int*](../data-types/int-uint.md), [Float](../data-types/float.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), [UUID](../data-types/uuid.md), [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). **Returned value** @@ -1540,7 +1488,7 @@ Result: ## reinterpretAsFloat64 -Performs byte reinterpretation similar to [reinterpret_cast](https://en.cppreference.com/w/cpp/language/reinterpret_cast) to type Float64. +Performs byte reinterpretation by treating the input value as a value of type Float64. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1550,11 +1498,7 @@ reinterpretAsFloat64(x) **Parameters** -- `x`: value to reinterpret as Float64. - -:::note -Accepts types that can be interpreted as numeric such as [(U)Int*](../data-types/int-uint.md), [Float](../data-types/float.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), [UUID](../data-types/uuid.md). Accepts [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). -::: +- `x`: value to reinterpret as Float64. [(U)Int*](../data-types/int-uint.md), [Float](../data-types/float.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), [UUID](../data-types/uuid.md), [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). **Returned value** @@ -1588,11 +1532,7 @@ reinterpretAsDate(x) **Parameters** -- `x`: number of days since the beginning of the Unix Epoch. - -:::note -Accepts types that can be interpreted as numeric such as [(U)Int*](../data-types/int-uint.md), [Float](../data-types/float.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), [UUID](../data-types/uuid.md). Accepts [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). -::: +- `x`: number of days since the beginning of the Unix Epoch. [(U)Int*](../data-types/int-uint.md), [Float](../data-types/float.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), [UUID](../data-types/uuid.md), [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). **Returned value** @@ -1632,11 +1572,7 @@ reinterpretAsDateTime(x) **Parameters** -- `x`: number of seconds since the beginning of the Unix Epoch. - -:::note -Accepts types that can be interpreted as numeric such as [(U)Int*](../data-types/int-uint.md), [Float](../data-types/float.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), [UUID](../data-types/uuid.md). Accepts [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). -::: +- `x`: number of seconds since the beginning of the Unix Epoch. [(U)Int*](../data-types/int-uint.md), [Float](../data-types/float.md), [Date](../data-types/date.md), [DateTime](../data-types/datetime.md), [UUID](../data-types/uuid.md), [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). 
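Since reinterpretAsDate and reinterpretAsDateTime only reinterpret an epoch-relative counter (roughly, Date behaves like a 16-bit day count and DateTime like a 32-bit second count from 1970-01-01), the mapping can be sketched in C++20 chrono terms; the helpers below are illustrative and not ClickHouse source:

```cpp
#include <chrono>
#include <cstdint>

// Illustrative model (not ClickHouse source): treat the input purely as a day
// counter or a second counter measured from 1970-01-01.
std::chrono::sys_days to_date(uint16_t days)
{
    return std::chrono::sys_days{std::chrono::days{days}};
}

std::chrono::sys_seconds to_datetime(uint32_t seconds)
{
    return std::chrono::sys_seconds{std::chrono::seconds{seconds}};
}
```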
**Returned value** From 9eb79530f4b40d0f0dcef4ecd82da97e5136a4bf Mon Sep 17 00:00:00 2001 From: Max K Date: Mon, 27 May 2024 17:35:42 +0200 Subject: [PATCH 391/392] CI: fix build_report selection in case of job reuse --- tests/ci/report.py | 50 +++++++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/tests/ci/report.py b/tests/ci/report.py index 8676c998afb..670a10f4561 100644 --- a/tests/ci/report.py +++ b/tests/ci/report.py @@ -401,30 +401,40 @@ class BuildResult: @classmethod def load_any(cls, build_name: str, pr_number: int, head_ref: str): # type: ignore """ - loads report from suitable report file with the following priority: - 1. report from PR with the same @pr_number - 2. report from branch with the same @head_ref - 3. report from the master - 4. any other report + loads build report from one of all available report files (matching the job digest) + with the following priority: + 1. report for the current PR @pr_number (might happen in PR' wf with or without job reuse) + 2. report for the current branch @head_ref (might happen in release/master' wf with or without job reuse) + 3. report for master branch (might happen in any workflow in case of job reuse) + 4. any other report (job reuse from another PR, if master report is not available yet) """ - reports = [] + pr_report = None + ref_report = None + master_report = None + any_report = None for file in Path(REPORT_PATH).iterdir(): if f"{build_name}.json" in file.name: - reports.append(file) - if not reports: - return None - file_path = None - for file in reports: - if pr_number and f"_{pr_number}_" in file.name: - file_path = file - break - if f"_{head_ref}_" in file.name: - file_path = file - break + any_report = file if "_master_" in file.name: - file_path = file - break - return cls.load_from_file(file_path or reports[-1]) + master_report = file + elif f"_{head_ref}_" in file.name: + ref_report = file + elif pr_number and f"_{pr_number}_" in file.name: + pr_report = file + + if not any_report: + return None + + if pr_report: + file_path = pr_report + elif ref_report: + file_path = ref_report + elif master_report: + file_path = master_report + else: + file_path = any_report + + return cls.load_from_file(file_path) @classmethod def load_from_file(cls, file: Union[Path, str]): # type: ignore From f610af56f7b8b1b2367420b7533b0262c3c8231d Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Tue, 28 May 2024 07:20:25 +0000 Subject: [PATCH 392/392] Fix --- src/Backups/BackupIO_S3.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Backups/BackupIO_S3.cpp b/src/Backups/BackupIO_S3.cpp index be2f81a299c..92f086295a0 100644 --- a/src/Backups/BackupIO_S3.cpp +++ b/src/Backups/BackupIO_S3.cpp @@ -188,7 +188,7 @@ void BackupReaderS3::copyFileToDisk(const String & path_in_backup, size_t file_s fs::path(s3_uri.key) / path_in_backup, 0, file_size, - /* dest_s3_client= */ destination_disk->getObjectStorage()->getS3StorageClient(), + /* dest_s3_client= */ destination_disk->getS3StorageClient(), /* dest_bucket= */ blob_path[1], /* dest_key= */ blob_path[0], s3_settings.request_settings, @@ -253,7 +253,7 @@ void BackupWriterS3::copyFileFromDisk(const String & path_in_backup, DiskPtr src { LOG_TRACE(log, "Copying file {} from disk {} to S3", src_path, src_disk->getName()); copyS3File( - src_disk->getObjectStorage()->getS3StorageClient(), + src_disk->getS3StorageClient(), /* src_bucket */ blob_path[1], /* src_key= */ blob_path[0], start_pos,