From 1376293e908b164f27e656c3f4f117f74966d737 Mon Sep 17 00:00:00 2001 From: avogar Date: Fri, 7 Jul 2023 14:35:07 +0000 Subject: [PATCH 01/74] Fix mistakenly comma parsing as part of datetime in CSV best effort parsing --- .../Serializations/SerializationDateTime.cpp | 27 +++++++++++++++---- .../SerializationDateTime64.cpp | 27 +++++++++++++++---- .../02812_csv_date_time_with_comma.reference | 2 ++ .../02812_csv_date_time_with_comma.sql | 3 +++ 4 files changed, 49 insertions(+), 10 deletions(-) create mode 100644 tests/queries/0_stateless/02812_csv_date_time_with_comma.reference create mode 100644 tests/queries/0_stateless/02812_csv_date_time_with_comma.sql diff --git a/src/DataTypes/Serializations/SerializationDateTime.cpp b/src/DataTypes/Serializations/SerializationDateTime.cpp index 7238d3ce190..dd3d97d6aa1 100644 --- a/src/DataTypes/Serializations/SerializationDateTime.cpp +++ b/src/DataTypes/Serializations/SerializationDateTime.cpp @@ -10,6 +10,7 @@ #include #include #include +#include namespace DB { @@ -145,12 +146,28 @@ void SerializationDateTime::deserializeTextCSV(IColumn & column, ReadBuffer & is char maybe_quote = *istr.position(); if (maybe_quote == '\'' || maybe_quote == '\"') - ++istr.position(); - - readText(x, istr, settings, time_zone, utc_time_zone); - - if (maybe_quote == '\'' || maybe_quote == '\"') + { + readText(x, istr, settings, time_zone, utc_time_zone); assertChar(maybe_quote, istr); + } + else + { + if (settings.csv.delimiter != ',' || settings.date_time_input_format == FormatSettings::DateTimeInputFormat::Basic) + { + readText(x, istr, settings, time_zone, utc_time_zone); + } + /// Best effort parsing supports datetime in format like "01.01.2000, 00:00:00" + /// and can mistakenly read comma as a part of datetime. + /// For example data "...,01.01.2000,some string,..." cannot be parsed correctly. + /// To fix this problem we first read CSV string and then try to parse it as datetime. + else + { + String datetime_str; + readCSVString(datetime_str, istr, settings.csv); + ReadBufferFromString buf(datetime_str); + readText(x, buf, settings, time_zone, utc_time_zone); + } + } if (x < 0) x = 0; diff --git a/src/DataTypes/Serializations/SerializationDateTime64.cpp b/src/DataTypes/Serializations/SerializationDateTime64.cpp index 78c7ea56529..91078c73f80 100644 --- a/src/DataTypes/Serializations/SerializationDateTime64.cpp +++ b/src/DataTypes/Serializations/SerializationDateTime64.cpp @@ -9,6 +9,7 @@ #include #include #include +#include namespace DB { @@ -143,12 +144,28 @@ void SerializationDateTime64::deserializeTextCSV(IColumn & column, ReadBuffer & char maybe_quote = *istr.position(); if (maybe_quote == '\'' || maybe_quote == '\"') - ++istr.position(); - - readText(x, scale, istr, settings, time_zone, utc_time_zone); - - if (maybe_quote == '\'' || maybe_quote == '\"') + { + readText(x, scale, istr, settings, time_zone, utc_time_zone); assertChar(maybe_quote, istr); + } + else + { + if (settings.csv.delimiter != ',' || settings.date_time_input_format == FormatSettings::DateTimeInputFormat::Basic) + { + readText(x, scale, istr, settings, time_zone, utc_time_zone); + } + /// Best effort parsing supports datetime in format like "01.01.2000, 00:00:00" + /// and can mistakenly read comma as a part of datetime. + /// For example data "...,01.01.2000,some string,..." cannot be parsed correctly. + /// To fix this problem we first read CSV string and then try to parse it as datetime. 
+ else + { + String datetime_str; + readCSVString(datetime_str, istr, settings.csv); + ReadBufferFromString buf(datetime_str); + readText(x, scale, buf, settings, time_zone, utc_time_zone); + } + } assert_cast(column).getData().push_back(x); } diff --git a/tests/queries/0_stateless/02812_csv_date_time_with_comma.reference b/tests/queries/0_stateless/02812_csv_date_time_with_comma.reference new file mode 100644 index 00000000000..f569df13dc1 --- /dev/null +++ b/tests/queries/0_stateless/02812_csv_date_time_with_comma.reference @@ -0,0 +1,2 @@ +2000-01-01 00:00:00 abc +2000-01-01 00:00:00.000 abc diff --git a/tests/queries/0_stateless/02812_csv_date_time_with_comma.sql b/tests/queries/0_stateless/02812_csv_date_time_with_comma.sql new file mode 100644 index 00000000000..ecd3cff6ad0 --- /dev/null +++ b/tests/queries/0_stateless/02812_csv_date_time_with_comma.sql @@ -0,0 +1,3 @@ +select * from format(CSV, 'c1 DateTime, c2 String', '01-01-2000,abc') settings date_time_input_format='best_effort'; +select * from format(CSV, 'c1 DateTime64(3), c2 String', '01-01-2000,abc') settings date_time_input_format='best_effort'; + From d2b52eeeef5a3a3aaf314cd585cb58375df3973f Mon Sep 17 00:00:00 2001 From: avogar Date: Mon, 17 Jul 2023 17:11:49 +0000 Subject: [PATCH 02/74] Fix --- src/DataTypes/Serializations/SerializationDateTime.cpp | 1 + src/DataTypes/Serializations/SerializationDateTime64.cpp | 1 + 2 files changed, 2 insertions(+) diff --git a/src/DataTypes/Serializations/SerializationDateTime.cpp b/src/DataTypes/Serializations/SerializationDateTime.cpp index dd3d97d6aa1..2ba24f5351b 100644 --- a/src/DataTypes/Serializations/SerializationDateTime.cpp +++ b/src/DataTypes/Serializations/SerializationDateTime.cpp @@ -147,6 +147,7 @@ void SerializationDateTime::deserializeTextCSV(IColumn & column, ReadBuffer & is if (maybe_quote == '\'' || maybe_quote == '\"') { + ++istr.position(); readText(x, istr, settings, time_zone, utc_time_zone); assertChar(maybe_quote, istr); } diff --git a/src/DataTypes/Serializations/SerializationDateTime64.cpp b/src/DataTypes/Serializations/SerializationDateTime64.cpp index 91078c73f80..c5964f1bd97 100644 --- a/src/DataTypes/Serializations/SerializationDateTime64.cpp +++ b/src/DataTypes/Serializations/SerializationDateTime64.cpp @@ -145,6 +145,7 @@ void SerializationDateTime64::deserializeTextCSV(IColumn & column, ReadBuffer & if (maybe_quote == '\'' || maybe_quote == '\"') { + ++istr.position(); readText(x, scale, istr, settings, time_zone, utc_time_zone); assertChar(maybe_quote, istr); } From 6d77d52dfe034afe196fa1219ddc8897d1070146 Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 27 Jul 2023 18:02:29 +0000 Subject: [PATCH 03/74] Allow variable number of columns in TSV/CuatomSeprarated/JSONCompactEachRow, make schema inference work with variable number of columns --- docs/en/interfaces/formats.md | 11 ++-- .../operations/settings/settings-formats.md | 25 +++++++- src/Core/Settings.h | 5 +- src/Formats/FormatFactory.cpp | 3 + src/Formats/FormatSettings.h | 3 + src/Processors/Formats/ISchemaReader.cpp | 46 +++++++++++--- src/Processors/Formats/ISchemaReader.h | 6 +- .../Formats/Impl/CSVRowInputFormat.cpp | 13 ++-- .../Formats/Impl/CSVRowInputFormat.h | 8 ++- .../Impl/CustomSeparatedRowInputFormat.cpp | 19 +++--- .../Impl/CustomSeparatedRowInputFormat.h | 10 +++- .../Impl/JSONCompactEachRowRowInputFormat.cpp | 8 ++- .../Impl/JSONCompactEachRowRowInputFormat.h | 7 ++- .../Formats/Impl/MsgPackRowInputFormat.cpp | 2 +- .../Formats/Impl/MsgPackRowInputFormat.h | 2 +- 
.../Formats/Impl/MySQLDumpRowInputFormat.cpp | 2 +- .../Formats/Impl/MySQLDumpRowInputFormat.h | 2 +- .../Formats/Impl/RegexpRowInputFormat.cpp | 2 +- .../Formats/Impl/RegexpRowInputFormat.h | 2 +- .../Impl/TabSeparatedRowInputFormat.cpp | 16 +++-- .../Formats/Impl/TabSeparatedRowInputFormat.h | 9 ++- .../Formats/Impl/TemplateRowInputFormat.cpp | 2 +- .../Formats/Impl/TemplateRowInputFormat.h | 2 +- .../Formats/Impl/ValuesBlockInputFormat.cpp | 2 +- .../Formats/Impl/ValuesBlockInputFormat.h | 2 +- .../RowInputFormatWithNamesAndTypes.cpp | 60 ++++++++++--------- .../Formats/RowInputFormatWithNamesAndTypes.h | 10 ++-- ..._with_variable_number_of_columns.reference | 52 ++++++++++++++++ ...ormats_with_variable_number_of_columns.sql | 18 ++++++ 29 files changed, 264 insertions(+), 85 deletions(-) create mode 100644 tests/queries/0_stateless/02834_formats_with_variable_number_of_columns.reference create mode 100644 tests/queries/0_stateless/02834_formats_with_variable_number_of_columns.sql diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index 15f9d1f47bf..95483068cb2 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -195,6 +195,7 @@ SELECT * FROM nestedt FORMAT TSV - [input_format_tsv_skip_first_lines](/docs/en/operations/settings/settings-formats.md/#input_format_tsv_skip_first_lines) - skip specified number of lines at the beginning of data. Default value - `0`. - [input_format_tsv_detect_header](/docs/en/operations/settings/settings-formats.md/#input_format_tsv_detect_header) - automatically detect header with names and types in TSV format. Default value - `true`. - [input_format_tsv_skip_trailing_empty_lines](/docs/en/operations/settings/settings-formats.md/#input_format_tsv_skip_trailing_empty_lines) - skip trailing empty lines at the end of data. Default value - `false`. +- [input_format_tsv_allow_variable_number_of_columns](/docs/en/operations/settings/settings-formats.md/#input_format_tsv_allow_variable_number_of_columns) - allow variable number of columns in TSV format, ignore extra columns and use default values on missing columns. Default value - `false`. ## TabSeparatedRaw {#tabseparatedraw} @@ -472,7 +473,7 @@ The CSV format supports the output of totals and extremes the same way as `TabSe - [input_format_csv_skip_trailing_empty_lines](/docs/en/operations/settings/settings-formats.md/#input_format_csv_skip_trailing_empty_lines) - skip trailing empty lines at the end of data. Default value - `false`. - [input_format_csv_trim_whitespaces](/docs/en/operations/settings/settings-formats.md/#input_format_csv_trim_whitespaces) - trim spaces and tabs in non-quoted CSV strings. Default value - `true`. - [input_format_csv_allow_whitespace_or_tab_as_delimiter](/docs/en/operations/settings/settings-formats.md/# input_format_csv_allow_whitespace_or_tab_as_delimiter) - Allow to use whitespace or tab as field delimiter in CSV strings. Default value - `false`. -- [input_format_csv_allow_variable_number_of_columns](/docs/en/operations/settings/settings-formats.md/#input_format_csv_allow_variable_number_of_columns) - ignore extra columns in CSV input (if file has more columns than expected) and treat missing fields in CSV input as default values. Default value - `false`. +- [input_format_csv_allow_variable_number_of_columns](/docs/en/operations/settings/settings-formats.md/#input_format_csv_allow_variable_number_of_columns) - allow variable number of columns in CSV format, ignore extra columns and use default values on missing columns. 
Default value - `false`. - [input_format_csv_use_default_on_bad_values](/docs/en/operations/settings/settings-formats.md/#input_format_csv_use_default_on_bad_values) - Allow to set default value to column when CSV field deserialization failed on bad value. Default value - `false`. ## CSVWithNames {#csvwithnames} @@ -501,9 +502,10 @@ the types from input data will be compared with the types of the corresponding c Similar to [Template](#format-template), but it prints or reads all names and types of columns and uses escaping rule from [format_custom_escaping_rule](/docs/en/operations/settings/settings-formats.md/#format_custom_escaping_rule) setting and delimiters from [format_custom_field_delimiter](/docs/en/operations/settings/settings-formats.md/#format_custom_field_delimiter), [format_custom_row_before_delimiter](/docs/en/operations/settings/settings-formats.md/#format_custom_row_before_delimiter), [format_custom_row_after_delimiter](/docs/en/operations/settings/settings-formats.md/#format_custom_row_after_delimiter), [format_custom_row_between_delimiter](/docs/en/operations/settings/settings-formats.md/#format_custom_row_between_delimiter), [format_custom_result_before_delimiter](/docs/en/operations/settings/settings-formats.md/#format_custom_result_before_delimiter) and [format_custom_result_after_delimiter](/docs/en/operations/settings/settings-formats.md/#format_custom_result_after_delimiter) settings, not from format strings. -If setting [input_format_custom_detect_header](/docs/en/operations/settings/settings-formats.md/#input_format_custom_detect_header) is enabled, ClickHouse will automatically detect header with names and types if any. - -If setting [input_format_tsv_skip_trailing_empty_lines](/docs/en/operations/settings/settings-formats.md/#input_format_custom_detect_header) is enabled, trailing empty lines at the end of file will be skipped. +Additional settings: +- [input_format_custom_detect_header](/docs/en/operations/settings/settings-formats.md/#input_format_custom_detect_header) - enables automatic detection of header with names and types if any. Default value - `true`. +- [input_format_custom_skip_trailing_empty_lines](/docs/en/operations/settings/settings-formats.md/#input_format_custom_skip_trailing_empty_lines) - skip trailing empty lines at the end of file . Default value - `false`. +- [input_format_custom_allow_variable_number_of_columns](/docs/en/operations/settings/settings-formats.md/#input_format_custom_allow_variable_number_of_columns) - allow variable number of columns in CustomSeparated format, ignore extra columns and use default values on missing columns. Default value - `false`. There is also `CustomSeparatedIgnoreSpaces` format, which is similar to [TemplateIgnoreSpaces](#templateignorespaces). @@ -1261,6 +1263,7 @@ SELECT * FROM json_each_row_nested - [input_format_json_named_tuples_as_objects](/docs/en/operations/settings/settings-formats.md/#input_format_json_named_tuples_as_objects) - parse named tuple columns as JSON objects. Default value - `true`. - [input_format_json_defaults_for_missing_elements_in_named_tuple](/docs/en/operations/settings/settings-formats.md/#input_format_json_defaults_for_missing_elements_in_named_tuple) - insert default values for missing elements in JSON object while parsing named tuple. Default value - `true`. - [input_format_json_ignore_unknown_keys_in_named_tuple](/docs/en/operations/settings/settings-formats.md/#input_format_json_ignore_unknown_keys_in_named_tuple) - Ignore unknown keys in json object for named tuples. 
Default value - `false`. +- [input_format_json_compact_allow_variable_number_of_columns](/docs/en/operations/settings/settings-formats.md/#input_format_json_compact_allow_variable_number_of_columns) - allow variable number of columns in JSONCompact/JSONCompactEachRow format, ignore extra columns and use default values on missing columns. Default value - `false`. - [output_format_json_quote_64bit_integers](/docs/en/operations/settings/settings-formats.md/#output_format_json_quote_64bit_integers) - controls quoting of 64-bit integers in JSON output format. Default value - `true`. - [output_format_json_quote_64bit_floats](/docs/en/operations/settings/settings-formats.md/#output_format_json_quote_64bit_floats) - controls quoting of 64-bit floats in JSON output format. Default value - `false`. - [output_format_json_quote_denormals](/docs/en/operations/settings/settings-formats.md/#output_format_json_quote_denormals) - enables '+nan', '-nan', '+inf', '-inf' outputs in JSON output format. Default value - `false`. diff --git a/docs/en/operations/settings/settings-formats.md b/docs/en/operations/settings/settings-formats.md index ee8e0d547b8..8e3d6b74ffa 100644 --- a/docs/en/operations/settings/settings-formats.md +++ b/docs/en/operations/settings/settings-formats.md @@ -623,6 +623,13 @@ Column type should be String. If value is empty, default names `row_{i}`will be Default value: ''. +### input_format_json_compact_allow_variable_number_of_columns {#input_format_json_compact_allow_variable_number_of_columns} + +Allow variable number of columns in rows in JSONCompact/JSONCompactEachRow input formats. +Ignore extra columns in rows with more columns than expected and treat missing columns as default values. + +Disabled by default. + ## TSV format settings {#tsv-format-settings} ### input_format_tsv_empty_as_default {#input_format_tsv_empty_as_default} @@ -760,6 +767,13 @@ When enabled, trailing empty lines at the end of TSV file will be skipped. Disabled by default. +### input_format_tsv_allow_variable_number_of_columns {#input_format_tsv_allow_variable_number_of_columns} + +Allow variable number of columns in rows in TSV input format. +Ignore extra columns in rows with more columns than expected and treat missing columns as default values. + +Disabled by default. + ## CSV format settings {#csv-format-settings} ### format_csv_delimiter {#format_csv_delimiter} @@ -951,9 +965,11 @@ Result ```text " string " ``` + ### input_format_csv_allow_variable_number_of_columns {#input_format_csv_allow_variable_number_of_columns} -ignore extra columns in CSV input (if file has more columns than expected) and treat missing fields in CSV input as default values. +Allow variable number of columns in rows in CSV input format. +Ignore extra columns in rows with more columns than expected and treat missing columns as default values. Disabled by default. @@ -1589,6 +1605,13 @@ When enabled, trailing empty lines at the end of file in CustomSeparated format Disabled by default. +### input_format_custom_allow_variable_number_of_columns {#input_format_custom_allow_variable_number_of_columns} + +Allow variable number of columns in rows in CustomSeparated input format. +Ignore extra columns in rows with more columns than expected and treat missing columns as default values. + +Disabled by default. 
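For illustration, a minimal usage sketch of the new `*_allow_variable_number_of_columns` settings, in the spirit of the 02834 test added later in this patch and using assumed inline sample data: rows with fewer fields than the declared structure are padded with default values, and extra trailing fields are ignored.

```sql
-- Sketch with the CSV variant of the setting and assumed sample data;
-- it mirrors the behaviour covered by 02834_formats_with_variable_number_of_columns.sql.
SELECT *
FROM format(CSV, 'x UInt32, y UInt32', '1,1\n2\n3,3,3')
SETTINGS input_format_csv_allow_variable_number_of_columns = 1;

-- Expected result:
-- 1  1
-- 2  0   (missing y filled with its default value)
-- 3  3   (extra third field ignored)
```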
+ ## Regexp format settings {#regexp-format-settings} ### format_regexp_escaping_rule {#format_regexp_escaping_rule} diff --git a/src/Core/Settings.h b/src/Core/Settings.h index c69d132ea25..86146bfad07 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -879,6 +879,10 @@ class IColumn; M(Bool, input_format_csv_allow_whitespace_or_tab_as_delimiter, false, "Allow to use spaces and tabs(\\t) as field delimiter in the CSV strings", 0) \ M(Bool, input_format_csv_trim_whitespaces, true, "Trims spaces and tabs (\\t) characters at the beginning and end in CSV strings", 0) \ M(Bool, input_format_csv_use_default_on_bad_values, false, "Allow to set default value to column when CSV field deserialization failed on bad value", 0) \ + M(Bool, input_format_csv_allow_variable_number_of_columns, false, "Ignore extra columns in CSV input (if file has more columns than expected) and treat missing fields in CSV input as default values", 0) \ + M(Bool, input_format_tsv_allow_variable_number_of_columns, false, "Ignore extra columns in TSV input (if file has more columns than expected) and treat missing fields in TSV input as default values", 0) \ + M(Bool, input_format_custom_allow_variable_number_of_columns, false, "Ignore extra columns in CustomSeparated input (if file has more columns than expected) and treat missing fields in CustomSeparated input as default values", 0) \ + M(Bool, input_format_json_compact_allow_variable_number_of_columns, false, "Ignore extra columns in JSONCompact(EachRow) input (if file has more columns than expected) and treat missing fields in JSONCompact(EachRow) input as default values", 0) \ M(Bool, input_format_tsv_detect_header, true, "Automatically detect header with names and types in TSV format", 0) \ M(Bool, input_format_custom_detect_header, true, "Automatically detect header with names and types in CustomSeparated format", 0) \ M(Bool, input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference, false, "Skip columns with unsupported types while schema inference for format Parquet", 0) \ @@ -1023,7 +1027,6 @@ class IColumn; M(Bool, regexp_dict_allow_hyperscan, true, "Allow regexp_tree dictionary using Hyperscan library.", 0) \ \ M(Bool, dictionary_use_async_executor, false, "Execute a pipeline for reading from a dictionary with several threads. It's supported only by DIRECT dictionary with CLICKHOUSE source.", 0) \ - M(Bool, input_format_csv_allow_variable_number_of_columns, false, "Ignore extra columns in CSV input (if file has more columns than expected) and treat missing fields in CSV input as default values", 0) \ // End of FORMAT_FACTORY_SETTINGS // Please add settings non-related to formats into the COMMON_SETTINGS above. 
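The same settings also feed into schema inference (the `ISchemaReader` changes below). A hedged sketch of what that enables, with assumed inline data: column names default to `c1`, `c2`, ... and the inferred schema is widened to the longest row seen.

```sql
-- Assumed sample data; the widest row has four values, so four columns are inferred.
SET input_format_csv_allow_variable_number_of_columns = 1;
DESC format(CSV, '1,1\n2\n3,3,3,3');
-- Inferred columns c1..c4; cells missing from shorter rows are read as NULL/defaults.
```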
diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 663b7f1ba95..dff480d1f79 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -86,6 +86,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.custom.row_between_delimiter = settings.format_custom_row_between_delimiter; format_settings.custom.try_detect_header = settings.input_format_custom_detect_header; format_settings.custom.skip_trailing_empty_lines = settings.input_format_custom_skip_trailing_empty_lines; + format_settings.custom.allow_variable_number_of_columns = settings.input_format_custom_allow_variable_number_of_columns; format_settings.date_time_input_format = settings.date_time_input_format; format_settings.date_time_output_format = settings.date_time_output_format; format_settings.interval.output_format = settings.interval_output_format; @@ -115,6 +116,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.json.validate_utf8 = settings.output_format_json_validate_utf8; format_settings.json_object_each_row.column_for_object_name = settings.format_json_object_each_row_column_for_object_name; format_settings.json.allow_object_type = context->getSettingsRef().allow_experimental_object_type; + format_settings.json.compact_allow_variable_number_of_columns = settings.input_format_json_compact_allow_variable_number_of_columns; format_settings.null_as_default = settings.input_format_null_as_default; format_settings.decimal_trailing_zeros = settings.output_format_decimal_trailing_zeros; format_settings.parquet.row_group_rows = settings.output_format_parquet_row_group_size; @@ -161,6 +163,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.tsv.skip_first_lines = settings.input_format_tsv_skip_first_lines; format_settings.tsv.try_detect_header = settings.input_format_tsv_detect_header; format_settings.tsv.skip_trailing_empty_lines = settings.input_format_tsv_skip_trailing_empty_lines; + format_settings.tsv.allow_variable_number_of_columns = settings.input_format_tsv_allow_variable_number_of_columns; format_settings.values.accurate_types_of_literals = settings.input_format_values_accurate_types_of_literals; format_settings.values.deduce_templates_of_expressions = settings.input_format_values_deduce_templates_of_expressions; format_settings.values.interpret_expressions = settings.input_format_values_interpret_expressions; diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index 3259c46e5ff..68cf9ad817d 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -176,6 +176,7 @@ struct FormatSettings EscapingRule escaping_rule = EscapingRule::Escaped; bool try_detect_header = true; bool skip_trailing_empty_lines = false; + bool allow_variable_number_of_columns = false; } custom; struct @@ -198,6 +199,7 @@ struct FormatSettings bool validate_types_from_metadata = true; bool validate_utf8 = false; bool allow_object_type = false; + bool compact_allow_variable_number_of_columns = false; } json; struct @@ -316,6 +318,7 @@ struct FormatSettings UInt64 skip_first_lines = 0; bool try_detect_header = true; bool skip_trailing_empty_lines = false; + bool allow_variable_number_of_columns = false; } tsv; struct diff --git a/src/Processors/Formats/ISchemaReader.cpp b/src/Processors/Formats/ISchemaReader.cpp index 1fa520eaaee..15b53c2a499 100644 --- a/src/Processors/Formats/ISchemaReader.cpp +++ 
b/src/Processors/Formats/ISchemaReader.cpp @@ -115,21 +115,24 @@ NamesAndTypesList IRowSchemaReader::readSchema() "Cannot read rows to determine the schema, the maximum number of rows (or bytes) to read is set to 0. " "Most likely setting input_format_max_rows_to_read_for_schema_inference or input_format_max_bytes_to_read_for_schema_inference is set to 0"); - DataTypes data_types = readRowAndGetDataTypes(); + auto data_types_maybe = readRowAndGetDataTypes(); /// Check that we read at list one column. - if (data_types.empty()) + if (!data_types_maybe) throw Exception(ErrorCodes::EMPTY_DATA_PASSED, "Cannot read rows from the data"); + DataTypes data_types = std::move(*data_types_maybe); + /// If column names weren't set, use default names 'c1', 'c2', ... - if (column_names.empty()) + bool use_default_column_names = column_names.empty(); + if (use_default_column_names) { column_names.reserve(data_types.size()); for (size_t i = 0; i != data_types.size(); ++i) column_names.push_back("c" + std::to_string(i + 1)); } /// If column names were set, check that the number of names match the number of types. - else if (column_names.size() != data_types.size()) + else if (column_names.size() != data_types.size() && !allowVariableNumberOfColumns()) { throw Exception( ErrorCodes::INCORRECT_DATA, @@ -137,6 +140,9 @@ NamesAndTypesList IRowSchemaReader::readSchema() } else { + if (column_names.size() != data_types.size()) + data_types.resize(column_names.size()); + std::unordered_set names_set; for (const auto & name : column_names) { @@ -155,13 +161,39 @@ NamesAndTypesList IRowSchemaReader::readSchema() for (rows_read = 1; rows_read < max_rows_to_read && in.count() < max_bytes_to_read; ++rows_read) { - DataTypes new_data_types = readRowAndGetDataTypes(); - if (new_data_types.empty()) + auto new_data_types_maybe = readRowAndGetDataTypes(); + if (!new_data_types_maybe) /// We reached eof. break; + DataTypes new_data_types = std::move(*new_data_types_maybe); + if (new_data_types.size() != data_types.size()) - throw Exception(ErrorCodes::INCORRECT_DATA, "Rows have different amount of values"); + { + if (!allowVariableNumberOfColumns()) + throw Exception(ErrorCodes::INCORRECT_DATA, "Rows have different amount of values"); + + if (use_default_column_names) + { + /// Current row contains new columns, add new default names. + if (new_data_types.size() > data_types.size()) + { + for (size_t i = data_types.size(); i < new_data_types.size(); ++i) + column_names.push_back("c" + std::to_string(i + 1)); + data_types.resize(new_data_types.size()); + } + /// Current row contain less columns than previous rows. + else + { + new_data_types.resize(data_types.size()); + } + } + /// If names were explicitly set, ignore all extra columns. + else + { + new_data_types.resize(column_names.size()); + } + } for (field_index = 0; field_index != data_types.size(); ++field_index) { diff --git a/src/Processors/Formats/ISchemaReader.h b/src/Processors/Formats/ISchemaReader.h index 40702198a57..0cc8b98f05e 100644 --- a/src/Processors/Formats/ISchemaReader.h +++ b/src/Processors/Formats/ISchemaReader.h @@ -93,11 +93,13 @@ protected: /// Read one row and determine types of columns in it. /// Return types in the same order in which the values were in the row. /// If it's impossible to determine the type for some column, return nullptr for it. - /// Return empty list if can't read more data. - virtual DataTypes readRowAndGetDataTypes() = 0; + /// Return std::nullopt if can't read more data. 
+ virtual std::optional readRowAndGetDataTypes() = 0; void setColumnNames(const std::vector & names) { column_names = names; } + virtual bool allowVariableNumberOfColumns() const { return false; } + size_t field_index; private: diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp index 244b906549e..9092c7fceba 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp @@ -284,7 +284,7 @@ bool CSVFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer & out) return true; } -bool CSVFormatReader::allowVariableNumberOfColumns() +bool CSVFormatReader::allowVariableNumberOfColumns() const { return format_settings.csv.allow_variable_number_of_columns; } @@ -410,19 +410,22 @@ CSVSchemaReader::CSVSchemaReader(ReadBuffer & in_, bool with_names_, bool with_t { } -std::pair, DataTypes> CSVSchemaReader::readRowAndGetFieldsAndDataTypes() +std::optional, DataTypes>> CSVSchemaReader::readRowAndGetFieldsAndDataTypes() { if (buf.eof()) return {}; auto fields = reader.readRow(); auto data_types = tryInferDataTypesByEscapingRule(fields, format_settings, FormatSettings::EscapingRule::CSV); - return {fields, data_types}; + return std::make_pair(fields, data_types); } -DataTypes CSVSchemaReader::readRowAndGetDataTypesImpl() +std::optional CSVSchemaReader::readRowAndGetDataTypesImpl() { - return std::move(readRowAndGetFieldsAndDataTypes().second); + auto fields_with_types = readRowAndGetFieldsAndDataTypes(); + if (!fields_with_types) + return {}; + return std::move(fields_with_types->second); } diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.h b/src/Processors/Formats/Impl/CSVRowInputFormat.h index 7b1a1fc433d..2444477b184 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.h +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.h @@ -70,7 +70,7 @@ public: void skipPrefixBeforeHeader() override; bool checkForEndOfRow() override; - bool allowVariableNumberOfColumns() override; + bool allowVariableNumberOfColumns() const override; std::vector readNames() override { return readHeaderRow(); } std::vector readTypes() override { return readHeaderRow(); } @@ -102,8 +102,10 @@ public: CSVSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, const FormatSettings & format_settings_); private: - DataTypes readRowAndGetDataTypesImpl() override; - std::pair, DataTypes> readRowAndGetFieldsAndDataTypes() override; + bool allowVariableNumberOfColumns() const override { return format_settings.csv.allow_variable_number_of_columns; } + + std::optional readRowAndGetDataTypesImpl() override; + std::optional, DataTypes>> readRowAndGetFieldsAndDataTypes() override; PeekableReadBuffer buf; CSVFormatReader reader; diff --git a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp index 1e67db79a2c..8f8e12e3c2a 100644 --- a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp @@ -142,7 +142,7 @@ void CustomSeparatedFormatReader::skipField() skipFieldByEscapingRule(*buf, format_settings.custom.escaping_rule, format_settings); } -bool CustomSeparatedFormatReader::checkEndOfRow() +bool CustomSeparatedFormatReader::checkForEndOfRow() { PeekableReadBufferCheckpoint checkpoint{*buf, true}; @@ -200,12 +200,12 @@ std::vector CustomSeparatedFormatReader::readRowImpl() std::vector values; skipRowStartDelimiter(); - if (columns == 0) + if (columns == 0 
|| allowVariableNumberOfColumns()) { do { values.push_back(readFieldIntoString(values.empty(), false, true)); - } while (!checkEndOfRow()); + } while (!checkForEndOfRow()); columns = values.size(); } else @@ -230,7 +230,7 @@ void CustomSeparatedFormatReader::skipHeaderRow() skipField(); } - while (!checkEndOfRow()); + while (!checkForEndOfRow()); skipRowEndDelimiter(); } @@ -369,7 +369,7 @@ CustomSeparatedSchemaReader::CustomSeparatedSchemaReader( { } -std::pair, DataTypes> CustomSeparatedSchemaReader::readRowAndGetFieldsAndDataTypes() +std::optional, DataTypes>> CustomSeparatedSchemaReader::readRowAndGetFieldsAndDataTypes() { if (no_more_data || reader.checkForSuffix()) { @@ -385,12 +385,15 @@ std::pair, DataTypes> CustomSeparatedSchemaReader::readRowAn auto fields = reader.readRow(); auto data_types = tryInferDataTypesByEscapingRule(fields, reader.getFormatSettings(), reader.getEscapingRule(), &json_inference_info); - return {fields, data_types}; + return std::make_pair(fields, data_types); } -DataTypes CustomSeparatedSchemaReader::readRowAndGetDataTypesImpl() +std::optional CustomSeparatedSchemaReader::readRowAndGetDataTypesImpl() { - return readRowAndGetFieldsAndDataTypes().second; + auto fields_with_types = readRowAndGetFieldsAndDataTypes(); + if (!fields_with_types) + return {}; + return std::move(fields_with_types->second); } void CustomSeparatedSchemaReader::transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type) diff --git a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.h b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.h index 2acf35bd143..893f06409f6 100644 --- a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.h +++ b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.h @@ -74,7 +74,9 @@ public: std::vector readRowForHeaderDetection() override { return readRowImpl(); } - bool checkEndOfRow(); + bool checkForEndOfRow() override; + bool allowVariableNumberOfColumns() const override { return format_settings.custom.allow_variable_number_of_columns; } + bool checkForSuffixImpl(bool check_eof); inline void skipSpaces() { if (ignore_spaces) skipWhitespaceIfAny(*buf, true); } @@ -109,9 +111,11 @@ public: CustomSeparatedSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, bool ignore_spaces_, const FormatSettings & format_setting_); private: - DataTypes readRowAndGetDataTypesImpl() override; + bool allowVariableNumberOfColumns() const override { return format_settings.custom.allow_variable_number_of_columns; } - std::pair, DataTypes> readRowAndGetFieldsAndDataTypes() override; + std::optional readRowAndGetDataTypesImpl() override; + + std::optional, DataTypes>> readRowAndGetFieldsAndDataTypes() override; void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type) override; diff --git a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp index b91345bebe3..e3583a3dff0 100644 --- a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp @@ -112,6 +112,12 @@ bool JSONCompactEachRowFormatReader::readField(IColumn & column, const DataTypeP return JSONUtils::readField(*in, column, type, serialization, column_name, format_settings, yield_strings); } +bool JSONCompactEachRowFormatReader::checkForEndOfRow() +{ + skipWhitespaceIfAny(*in); + return !in->eof() && *in->position() == ']'; +} + bool 
JSONCompactEachRowFormatReader::parseRowStartWithDiagnosticInfo(WriteBuffer & out) { skipWhitespaceIfAny(*in); @@ -187,7 +193,7 @@ JSONCompactEachRowRowSchemaReader::JSONCompactEachRowRowSchemaReader( { } -DataTypes JSONCompactEachRowRowSchemaReader::readRowAndGetDataTypesImpl() +std::optional JSONCompactEachRowRowSchemaReader::readRowAndGetDataTypesImpl() { if (first_row) first_row = false; diff --git a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.h b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.h index bb699f0ca2e..378a41e6471 100644 --- a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.h +++ b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.h @@ -68,6 +68,9 @@ public: std::vector readNames() override { return readHeaderRow(); } std::vector readTypes() override { return readHeaderRow(); } + bool checkForEndOfRow() override; + bool allowVariableNumberOfColumns() const override { return format_settings.json.compact_allow_variable_number_of_columns; } + bool yieldStrings() const { return yield_strings; } private: bool yield_strings; @@ -79,7 +82,9 @@ public: JSONCompactEachRowRowSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, bool yield_strings_, const FormatSettings & format_settings_); private: - DataTypes readRowAndGetDataTypesImpl() override; + bool allowVariableNumberOfColumns() const override { return format_settings.json.compact_allow_variable_number_of_columns; } + + std::optional readRowAndGetDataTypesImpl() override; void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type) override; void transformFinalTypeIfNeeded(DataTypePtr & type) override; diff --git a/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp b/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp index eeca14176cc..a46f0018611 100644 --- a/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp @@ -634,7 +634,7 @@ DataTypePtr MsgPackSchemaReader::getDataType(const msgpack::object & object) UNREACHABLE(); } -DataTypes MsgPackSchemaReader::readRowAndGetDataTypes() +std::optional MsgPackSchemaReader::readRowAndGetDataTypes() { if (buf.eof()) return {}; diff --git a/src/Processors/Formats/Impl/MsgPackRowInputFormat.h b/src/Processors/Formats/Impl/MsgPackRowInputFormat.h index 0b485d3b97c..028ab878ad0 100644 --- a/src/Processors/Formats/Impl/MsgPackRowInputFormat.h +++ b/src/Processors/Formats/Impl/MsgPackRowInputFormat.h @@ -91,7 +91,7 @@ public: private: msgpack::object_handle readObject(); DataTypePtr getDataType(const msgpack::object & object); - DataTypes readRowAndGetDataTypes() override; + std::optional readRowAndGetDataTypes() override; PeekableReadBuffer buf; UInt64 number_of_columns; diff --git a/src/Processors/Formats/Impl/MySQLDumpRowInputFormat.cpp b/src/Processors/Formats/Impl/MySQLDumpRowInputFormat.cpp index 90dd07bd5a8..6c754f141da 100644 --- a/src/Processors/Formats/Impl/MySQLDumpRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/MySQLDumpRowInputFormat.cpp @@ -422,7 +422,7 @@ NamesAndTypesList MySQLDumpSchemaReader::readSchema() return IRowSchemaReader::readSchema(); } -DataTypes MySQLDumpSchemaReader::readRowAndGetDataTypes() +std::optional MySQLDumpSchemaReader::readRowAndGetDataTypes() { if (in.eof()) return {}; diff --git a/src/Processors/Formats/Impl/MySQLDumpRowInputFormat.h b/src/Processors/Formats/Impl/MySQLDumpRowInputFormat.h index c28355054d7..14a73bf83b0 100644 --- a/src/Processors/Formats/Impl/MySQLDumpRowInputFormat.h +++ 
b/src/Processors/Formats/Impl/MySQLDumpRowInputFormat.h @@ -33,7 +33,7 @@ public: private: NamesAndTypesList readSchema() override; - DataTypes readRowAndGetDataTypes() override; + std::optional readRowAndGetDataTypes() override; String table_name; }; diff --git a/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp b/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp index d902a8be6a7..8e94a568b1e 100644 --- a/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp @@ -143,7 +143,7 @@ RegexpSchemaReader::RegexpSchemaReader(ReadBuffer & in_, const FormatSettings & { } -DataTypes RegexpSchemaReader::readRowAndGetDataTypes() +std::optional RegexpSchemaReader::readRowAndGetDataTypes() { if (buf.eof()) return {}; diff --git a/src/Processors/Formats/Impl/RegexpRowInputFormat.h b/src/Processors/Formats/Impl/RegexpRowInputFormat.h index 2469774aaf9..7417d48d8c1 100644 --- a/src/Processors/Formats/Impl/RegexpRowInputFormat.h +++ b/src/Processors/Formats/Impl/RegexpRowInputFormat.h @@ -79,7 +79,7 @@ public: RegexpSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings); private: - DataTypes readRowAndGetDataTypes() override; + std::optional readRowAndGetDataTypes() override; void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type) override; diff --git a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp index 2239c8539e3..7fbad583ced 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp @@ -300,6 +300,11 @@ bool TabSeparatedFormatReader::checkForSuffix() return false; } +bool TabSeparatedFormatReader::checkForEndOfRow() +{ + return buf->eof() || *buf->position() == '\n'; +} + TabSeparatedSchemaReader::TabSeparatedSchemaReader( ReadBuffer & in_, bool with_names_, bool with_types_, bool is_raw_, const FormatSettings & format_settings_) : FormatWithNamesAndTypesSchemaReader( @@ -315,19 +320,22 @@ TabSeparatedSchemaReader::TabSeparatedSchemaReader( { } -std::pair, DataTypes> TabSeparatedSchemaReader::readRowAndGetFieldsAndDataTypes() +std::optional, DataTypes>> TabSeparatedSchemaReader::readRowAndGetFieldsAndDataTypes() { if (buf.eof()) return {}; auto fields = reader.readRow(); auto data_types = tryInferDataTypesByEscapingRule(fields, reader.getFormatSettings(), reader.getEscapingRule()); - return {fields, data_types}; + return std::make_pair(fields, data_types); } -DataTypes TabSeparatedSchemaReader::readRowAndGetDataTypesImpl() +std::optional TabSeparatedSchemaReader::readRowAndGetDataTypesImpl() { - return readRowAndGetFieldsAndDataTypes().second; + auto fields_with_types = readRowAndGetFieldsAndDataTypes(); + if (!fields_with_types) + return {}; + return std::move(fields_with_types->second); } void registerInputFormatTabSeparated(FormatFactory & factory) diff --git a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h index 8df57675cf5..e0234761d61 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h +++ b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h @@ -76,6 +76,9 @@ public: void setReadBuffer(ReadBuffer & in_) override; bool checkForSuffix() override; + bool checkForEndOfRow() override; + + bool allowVariableNumberOfColumns() const override { return format_settings.tsv.allow_variable_number_of_columns; } private: template @@ -92,8 +95,10 @@ public: 
TabSeparatedSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, bool is_raw_, const FormatSettings & format_settings); private: - DataTypes readRowAndGetDataTypesImpl() override; - std::pair, DataTypes> readRowAndGetFieldsAndDataTypes() override; + bool allowVariableNumberOfColumns() const override { return format_settings.tsv.allow_variable_number_of_columns; } + + std::optional readRowAndGetDataTypesImpl() override; + std::optional, DataTypes>> readRowAndGetFieldsAndDataTypes() override; PeekableReadBuffer buf; TabSeparatedFormatReader reader; diff --git a/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp b/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp index 8a09e800fa7..b065e00f5d1 100644 --- a/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp @@ -490,7 +490,7 @@ TemplateSchemaReader::TemplateSchemaReader( setColumnNames(row_format.column_names); } -DataTypes TemplateSchemaReader::readRowAndGetDataTypes() +std::optional TemplateSchemaReader::readRowAndGetDataTypes() { if (first_row) format_reader.readPrefix(); diff --git a/src/Processors/Formats/Impl/TemplateRowInputFormat.h b/src/Processors/Formats/Impl/TemplateRowInputFormat.h index 8f9088e2c47..2752cb13e50 100644 --- a/src/Processors/Formats/Impl/TemplateRowInputFormat.h +++ b/src/Processors/Formats/Impl/TemplateRowInputFormat.h @@ -119,7 +119,7 @@ public: std::string row_between_delimiter, const FormatSettings & format_settings_); - DataTypes readRowAndGetDataTypes() override; + std::optional readRowAndGetDataTypes() override; private: void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type) override; diff --git a/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp b/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp index 3a65a6fe4ea..6cb469afca1 100644 --- a/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp @@ -638,7 +638,7 @@ ValuesSchemaReader::ValuesSchemaReader(ReadBuffer & in_, const FormatSettings & { } -DataTypes ValuesSchemaReader::readRowAndGetDataTypes() +std::optional ValuesSchemaReader::readRowAndGetDataTypes() { if (first_row) { diff --git a/src/Processors/Formats/Impl/ValuesBlockInputFormat.h b/src/Processors/Formats/Impl/ValuesBlockInputFormat.h index 8f8d44ec088..7f1dbc0da66 100644 --- a/src/Processors/Formats/Impl/ValuesBlockInputFormat.h +++ b/src/Processors/Formats/Impl/ValuesBlockInputFormat.h @@ -105,7 +105,7 @@ public: ValuesSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings); private: - DataTypes readRowAndGetDataTypes() override; + std::optional readRowAndGetDataTypes() override; PeekableReadBuffer buf; ParserExpression parser; diff --git a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp index fb49779e0af..cb5c11e2d3b 100644 --- a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp +++ b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp @@ -212,8 +212,23 @@ bool RowInputFormatWithNamesAndTypes::readRow(MutableColumns & columns, RowReadE format_reader->skipRowStartDelimiter(); ext.read_columns.resize(data_types.size()); - for (size_t file_column = 0; file_column < column_mapping->column_indexes_for_input_fields.size(); ++file_column) + size_t file_column = 0; + for (; file_column < column_mapping->column_indexes_for_input_fields.size(); ++file_column) { + if (format_reader->allowVariableNumberOfColumns() && 
format_reader->checkForEndOfRow()) + { + while (file_column < column_mapping->column_indexes_for_input_fields.size()) + { + const auto & rem_column_index = column_mapping->column_indexes_for_input_fields[file_column]; + columns[*rem_column_index]->insertDefault(); + ++file_column; + } + break; + } + + if (file_column != 0) + format_reader->skipFieldDelimiter(); + const auto & column_index = column_mapping->column_indexes_for_input_fields[file_column]; const bool is_last_file_column = file_column + 1 == column_mapping->column_indexes_for_input_fields.size(); if (column_index) @@ -225,22 +240,6 @@ bool RowInputFormatWithNamesAndTypes::readRow(MutableColumns & columns, RowReadE column_mapping->names_of_columns[file_column]); else format_reader->skipField(file_column); - - if (!is_last_file_column) - { - if (format_reader->allowVariableNumberOfColumns() && format_reader->checkForEndOfRow()) - { - ++file_column; - while (file_column < column_mapping->column_indexes_for_input_fields.size()) - { - const auto & rem_column_index = column_mapping->column_indexes_for_input_fields[file_column]; - columns[*rem_column_index]->insertDefault(); - ++file_column; - } - } - else - format_reader->skipFieldDelimiter(); - } } if (format_reader->allowVariableNumberOfColumns() && !format_reader->checkForEndOfRow()) @@ -248,7 +247,7 @@ bool RowInputFormatWithNamesAndTypes::readRow(MutableColumns & columns, RowReadE do { format_reader->skipFieldDelimiter(); - format_reader->skipField(1); + format_reader->skipField(file_column++); } while (!format_reader->checkForEndOfRow()); } @@ -419,12 +418,14 @@ namespace void FormatWithNamesAndTypesSchemaReader::tryDetectHeader(std::vector & column_names, std::vector & type_names) { - auto [first_row_values, first_row_types] = readRowAndGetFieldsAndDataTypes(); + auto first_row = readRowAndGetFieldsAndDataTypes(); /// No data. - if (first_row_values.empty()) + if (!first_row) return; + auto [first_row_values, first_row_types] = *first_row; + /// The first row contains non String elements, it cannot be a header. if (!checkIfAllTypesAreString(first_row_types)) { @@ -432,15 +433,17 @@ void FormatWithNamesAndTypesSchemaReader::tryDetectHeader(std::vector & return; } - auto [second_row_values, second_row_types] = readRowAndGetFieldsAndDataTypes(); + auto second_row = readRowAndGetFieldsAndDataTypes(); /// Data contains only 1 row, don't treat it as a header. - if (second_row_values.empty()) + if (!second_row) { buffered_types = first_row_types; return; } + auto [second_row_values, second_row_types] = *second_row; + DataTypes data_types; bool second_row_can_be_type_names = checkIfAllTypesAreString(second_row_types) && checkIfAllValuesAreTypeNames(readNamesFromFields(second_row_values)); size_t row = 2; @@ -450,15 +453,16 @@ void FormatWithNamesAndTypesSchemaReader::tryDetectHeader(std::vector & } else { - data_types = readRowAndGetDataTypes(); + auto data_types_maybe = readRowAndGetDataTypes(); /// Data contains only 2 rows. - if (data_types.empty()) + if (!data_types_maybe) { second_row_can_be_type_names = false; data_types = second_row_types; } else { + data_types = *data_types_maybe; ++row; } } @@ -490,10 +494,10 @@ void FormatWithNamesAndTypesSchemaReader::tryDetectHeader(std::vector & return; } - auto next_row_types = readRowAndGetDataTypes(); + auto next_row_types_maybe = readRowAndGetDataTypes(); /// Check if there are no more rows in data. 
It means that all rows contains only String values and Nulls, /// so, the first two rows with all String elements can be real data and we cannot use them as a header. - if (next_row_types.empty()) + if (!next_row_types_maybe) { /// Buffer first data types from the first row, because it doesn't contain Nulls. buffered_types = first_row_types; @@ -502,11 +506,11 @@ void FormatWithNamesAndTypesSchemaReader::tryDetectHeader(std::vector & ++row; /// Combine types from current row and from previous rows. - chooseResultColumnTypes(*this, data_types, next_row_types, getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule::CSV), default_colum_names, row); + chooseResultColumnTypes(*this, data_types, *next_row_types_maybe, getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule::CSV), default_colum_names, row); } } -DataTypes FormatWithNamesAndTypesSchemaReader::readRowAndGetDataTypes() +std::optional FormatWithNamesAndTypesSchemaReader::readRowAndGetDataTypes() { /// Check if we tried to detect a header and have buffered types from read rows. if (!buffered_types.empty()) diff --git a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h index b5103d3db39..7b3e2cbea67 100644 --- a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h +++ b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h @@ -121,7 +121,7 @@ public: virtual bool checkForEndOfRow() { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method checkForEndOfRow is not implemented"); } - virtual bool allowVariableNumberOfColumns() { return false; } + virtual bool allowVariableNumberOfColumns() const { return false; } const FormatSettings & getFormatSettings() const { return format_settings; } @@ -160,15 +160,15 @@ public: NamesAndTypesList readSchema() override; protected: - virtual DataTypes readRowAndGetDataTypes() override; + virtual std::optional readRowAndGetDataTypes() override; - virtual DataTypes readRowAndGetDataTypesImpl() + virtual std::optional readRowAndGetDataTypesImpl() { throw Exception{ErrorCodes::NOT_IMPLEMENTED, "Method readRowAndGetDataTypesImpl is not implemented"}; } - /// Return column fields with inferred types. In case of no more rows, return empty vectors. - virtual std::pair, DataTypes> readRowAndGetFieldsAndDataTypes() + /// Return column fields with inferred types. In case of no more rows, return nullopt. 
+ virtual std::optional, DataTypes>> readRowAndGetFieldsAndDataTypes() { throw Exception{ErrorCodes::NOT_IMPLEMENTED, "Method readRowAndGetFieldsAndDataTypes is not implemented"}; } diff --git a/tests/queries/0_stateless/02834_formats_with_variable_number_of_columns.reference b/tests/queries/0_stateless/02834_formats_with_variable_number_of_columns.reference new file mode 100644 index 00000000000..39d24f2cbd2 --- /dev/null +++ b/tests/queries/0_stateless/02834_formats_with_variable_number_of_columns.reference @@ -0,0 +1,52 @@ +CSV +1 1 +2 0 +0 0 +3 3 +1 1 \N \N +2 \N \N \N +\N \N \N \N +3 3 3 3 +1 1 +2 \N +\N \N +3 3 +TSV +1 1 +2 0 +0 0 +3 3 +1 1 \N \N +2 \N \N \N +\N \N \N \N +3 3 3 3 +1 1 +2 \N +\N \N +3 3 +JSONCompactEachRow +1 1 +2 0 +0 0 +3 3 +1 1 \N \N +2 \N \N \N +\N \N \N \N +3 3 3 3 +1 1 +2 \N +\N \N +3 3 +CustomSeparated +1 1 +2 0 +0 0 +3 3 +1 1 \N \N +2 \N \N \N +\N \N \N \N +3 3 3 3 +1 1 +2 \N +\N \N +3 3 diff --git a/tests/queries/0_stateless/02834_formats_with_variable_number_of_columns.sql b/tests/queries/0_stateless/02834_formats_with_variable_number_of_columns.sql new file mode 100644 index 00000000000..c0a80bf2114 --- /dev/null +++ b/tests/queries/0_stateless/02834_formats_with_variable_number_of_columns.sql @@ -0,0 +1,18 @@ +select 'CSV'; +select * from format(CSV, 'x UInt32, y UInt32', '1,1\n2\n\n3,3,3,3') settings input_format_csv_allow_variable_number_of_columns=1; +select * from format(CSV, '1,1\n2\n\n3,3,3,3') settings input_format_csv_allow_variable_number_of_columns=1; +select * from format(CSVWithNames, '"x","y"\n1,1\n2\n\n3,3,3,3') settings input_format_csv_allow_variable_number_of_columns=1; +select 'TSV'; +select * from format(TSV, 'x UInt32, y UInt32', '1\t1\n2\n\n3\t3\t3\t3') settings input_format_tsv_allow_variable_number_of_columns=1; +select * from format(TSV, '1\t1\n2\n\n3\t3\t3\t3') settings input_format_tsv_allow_variable_number_of_columns=1; +select * from format(TSVWithNames, 'x\ty\n1\t1\n2\n\n3\t3\t3\t3') settings input_format_tsv_allow_variable_number_of_columns=1; +select 'JSONCompactEachRow'; +select * from format(JSONCompactEachRow, 'x UInt32, y UInt32', '[1,1]\n[2]\n[]\n[3,3,3,3]') settings input_format_json_compact_allow_variable_number_of_columns=1; +select * from format(JSONCompactEachRow, '[1,1]\n[2]\n[]\n[3,3,3,3]') settings input_format_json_compact_allow_variable_number_of_columns=1; +select * from format(JSONCompactEachRowWithNames, '["x","y"]\n[1,1]\n[2]\n[]\n[3,3,3,3]') settings input_format_json_compact_allow_variable_number_of_columns=1; +select 'CustomSeparated'; +set format_custom_escaping_rule='CSV', format_custom_field_delimiter='', format_custom_row_before_delimiter='', format_custom_row_after_delimiter='', format_custom_row_between_delimiter='', format_custom_result_before_delimiter='', format_custom_result_after_delimiter=''; +select * from format(CustomSeparated, 'x UInt32, y UInt32', '1123333') settings input_format_custom_allow_variable_number_of_columns=1; +select * from format(CustomSeparated, '1123333') settings input_format_custom_allow_variable_number_of_columns=1; +select * from format(CustomSeparatedWithNames, '"x""y"1123333') settings input_format_custom_allow_variable_number_of_columns=1; + From c3c64a7dd50ee0f25dd94eb1d1b645e0352471ec Mon Sep 17 00:00:00 2001 From: avogar Date: Fri, 28 Jul 2023 11:40:05 +0000 Subject: [PATCH 04/74] Fix --- .../Impl/CustomSeparatedRowInputFormat.cpp | 5 ++++- .../Formats/RowInputFormatWithNamesAndTypes.cpp | 3 ++- ...ats_with_variable_number_of_columns.reference | 16 
++++++++++++++++ ...4_formats_with_variable_number_of_columns.sql | 4 ++++ 4 files changed, 26 insertions(+), 2 deletions(-) diff --git a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp index 8f8e12e3c2a..ff3d6d49199 100644 --- a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp @@ -139,7 +139,10 @@ void CustomSeparatedFormatReader::skipRowBetweenDelimiter() void CustomSeparatedFormatReader::skipField() { skipSpaces(); - skipFieldByEscapingRule(*buf, format_settings.custom.escaping_rule, format_settings); + if (format_settings.custom.escaping_rule == FormatSettings::EscapingRule::CSV) + readCSVFieldWithTwoPossibleDelimiters(*buf, format_settings.csv, format_settings.custom.field_delimiter, format_settings.custom.row_after_delimiter); + else + skipFieldByEscapingRule(*buf, format_settings.custom.escaping_rule, format_settings); } bool CustomSeparatedFormatReader::checkForEndOfRow() diff --git a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp index cb5c11e2d3b..4000bd14ddc 100644 --- a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp +++ b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp @@ -220,7 +220,8 @@ bool RowInputFormatWithNamesAndTypes::readRow(MutableColumns & columns, RowReadE while (file_column < column_mapping->column_indexes_for_input_fields.size()) { const auto & rem_column_index = column_mapping->column_indexes_for_input_fields[file_column]; - columns[*rem_column_index]->insertDefault(); + if (rem_column_index) + columns[*rem_column_index]->insertDefault(); ++file_column; } break; diff --git a/tests/queries/0_stateless/02834_formats_with_variable_number_of_columns.reference b/tests/queries/0_stateless/02834_formats_with_variable_number_of_columns.reference index 39d24f2cbd2..e9ff548e05c 100644 --- a/tests/queries/0_stateless/02834_formats_with_variable_number_of_columns.reference +++ b/tests/queries/0_stateless/02834_formats_with_variable_number_of_columns.reference @@ -11,6 +11,10 @@ CSV 2 \N \N \N 3 3 +1 0 +2 0 +0 0 +3 0 TSV 1 1 2 0 @@ -24,6 +28,10 @@ TSV 2 \N \N \N 3 3 +1 0 +2 0 +0 0 +3 0 JSONCompactEachRow 1 1 2 0 @@ -37,6 +45,10 @@ JSONCompactEachRow 2 \N \N \N 3 3 +1 0 +2 0 +0 0 +3 0 CustomSeparated 1 1 2 0 @@ -50,3 +62,7 @@ CustomSeparated 2 \N \N \N 3 3 +1 0 +2 0 +0 0 +3 0 diff --git a/tests/queries/0_stateless/02834_formats_with_variable_number_of_columns.sql b/tests/queries/0_stateless/02834_formats_with_variable_number_of_columns.sql index c0a80bf2114..dea4c20db8a 100644 --- a/tests/queries/0_stateless/02834_formats_with_variable_number_of_columns.sql +++ b/tests/queries/0_stateless/02834_formats_with_variable_number_of_columns.sql @@ -2,17 +2,21 @@ select 'CSV'; select * from format(CSV, 'x UInt32, y UInt32', '1,1\n2\n\n3,3,3,3') settings input_format_csv_allow_variable_number_of_columns=1; select * from format(CSV, '1,1\n2\n\n3,3,3,3') settings input_format_csv_allow_variable_number_of_columns=1; select * from format(CSVWithNames, '"x","y"\n1,1\n2\n\n3,3,3,3') settings input_format_csv_allow_variable_number_of_columns=1; +select * from format(CSVWithNames, 'x UInt32, z UInt32', '"x","y"\n1,1\n2\n\n3,3,3,3') settings input_format_csv_allow_variable_number_of_columns=1; select 'TSV'; select * from format(TSV, 'x UInt32, y UInt32', '1\t1\n2\n\n3\t3\t3\t3') settings input_format_tsv_allow_variable_number_of_columns=1; 
select * from format(TSV, '1\t1\n2\n\n3\t3\t3\t3') settings input_format_tsv_allow_variable_number_of_columns=1; select * from format(TSVWithNames, 'x\ty\n1\t1\n2\n\n3\t3\t3\t3') settings input_format_tsv_allow_variable_number_of_columns=1; +select * from format(TSVWithNames, 'x UInt32, z UInt32', 'x\ty\n1\t1\n2\n\n3\t3\t3\t3') settings input_format_tsv_allow_variable_number_of_columns=1; select 'JSONCompactEachRow'; select * from format(JSONCompactEachRow, 'x UInt32, y UInt32', '[1,1]\n[2]\n[]\n[3,3,3,3]') settings input_format_json_compact_allow_variable_number_of_columns=1; select * from format(JSONCompactEachRow, '[1,1]\n[2]\n[]\n[3,3,3,3]') settings input_format_json_compact_allow_variable_number_of_columns=1; select * from format(JSONCompactEachRowWithNames, '["x","y"]\n[1,1]\n[2]\n[]\n[3,3,3,3]') settings input_format_json_compact_allow_variable_number_of_columns=1; +select * from format(JSONCompactEachRowWithNames, 'x UInt32, z UInt32', '["x","y"]\n[1,1]\n[2]\n[]\n[3,3,3,3]') settings input_format_json_compact_allow_variable_number_of_columns=1; select 'CustomSeparated'; set format_custom_escaping_rule='CSV', format_custom_field_delimiter='', format_custom_row_before_delimiter='', format_custom_row_after_delimiter='', format_custom_row_between_delimiter='', format_custom_result_before_delimiter='', format_custom_result_after_delimiter=''; select * from format(CustomSeparated, 'x UInt32, y UInt32', '1123333') settings input_format_custom_allow_variable_number_of_columns=1; select * from format(CustomSeparated, '1123333') settings input_format_custom_allow_variable_number_of_columns=1; select * from format(CustomSeparatedWithNames, '"x""y"1123333') settings input_format_custom_allow_variable_number_of_columns=1; +select * from format(CustomSeparatedWithNames, 'x UInt32, z UInt32', '"x""y"1123333') settings input_format_custom_allow_variable_number_of_columns=1; From 6a0a0ff30d3855be655142140002bb652f8227f8 Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Thu, 3 Aug 2023 15:40:24 +0000 Subject: [PATCH 05/74] One line fix for the specific query --- src/Processors/QueryPlan/ReadFromRemote.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Processors/QueryPlan/ReadFromRemote.cpp b/src/Processors/QueryPlan/ReadFromRemote.cpp index 7a99c363232..4a40d4343fd 100644 --- a/src/Processors/QueryPlan/ReadFromRemote.cpp +++ b/src/Processors/QueryPlan/ReadFromRemote.cpp @@ -240,7 +240,7 @@ void ReadFromRemote::addPipe(Pipes & pipes, const ClusterProxy::SelectStreamFact shard.shard_info.pool, query_string, output_stream->header, context, throttler, scalars, external_tables, stage); remote_query_executor->setLogger(log); - remote_query_executor->setPoolMode(PoolMode::GET_MANY); + remote_query_executor->setPoolMode(PoolMode::GET_ONE); if (!table_func_ptr) remote_query_executor->setMainTable(shard.main_table ? 
shard.main_table : main_table); From 78a59b213c8a366ba9840764a358edd025bfc7d6 Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Thu, 3 Aug 2023 15:58:48 +0000 Subject: [PATCH 06/74] Trivial test with MergeTree --- ...835_parallel_replicas_over_distributed.reference | 1 + .../02835_parallel_replicas_over_distributed.sql | 13 +++++++++++++ 2 files changed, 14 insertions(+) create mode 100644 tests/queries/0_stateless/02835_parallel_replicas_over_distributed.reference create mode 100644 tests/queries/0_stateless/02835_parallel_replicas_over_distributed.sql diff --git a/tests/queries/0_stateless/02835_parallel_replicas_over_distributed.reference b/tests/queries/0_stateless/02835_parallel_replicas_over_distributed.reference new file mode 100644 index 00000000000..ea4483ec305 --- /dev/null +++ b/tests/queries/0_stateless/02835_parallel_replicas_over_distributed.reference @@ -0,0 +1 @@ +100 4950 diff --git a/tests/queries/0_stateless/02835_parallel_replicas_over_distributed.sql b/tests/queries/0_stateless/02835_parallel_replicas_over_distributed.sql new file mode 100644 index 00000000000..199be3771c7 --- /dev/null +++ b/tests/queries/0_stateless/02835_parallel_replicas_over_distributed.sql @@ -0,0 +1,13 @@ +DROP TABLE IF EXISTS test_d; +DROP TABLE IF EXISTS test; +CREATE TABLE test (id UInt64, date Date) +ENGINE = MergeTree +ORDER BY id +as select *, today() from numbers(100); + +CREATE TABLE IF NOT EXISTS test_d as test +ENGINE = Distributed(test_cluster_one_shard_three_replicas_localhost, currentDatabase(), test); + +SELECT count(), sum(id) +FROM test_d +SETTINGS allow_experimental_parallel_reading_from_replicas = 1, max_parallel_replicas = 3, prefer_localhost_replica = 0, parallel_replicas_for_non_replicated_merge_tree=1, use_hedged_requests=0; From 2bda97d8f7e327aee0dcda56f94097ebba9b4e85 Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Sat, 5 Aug 2023 00:05:50 +0000 Subject: [PATCH 07/74] Potential fix for custom key tests --- src/Processors/QueryPlan/ReadFromRemote.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Processors/QueryPlan/ReadFromRemote.cpp b/src/Processors/QueryPlan/ReadFromRemote.cpp index 4a40d4343fd..5bc082b06ee 100644 --- a/src/Processors/QueryPlan/ReadFromRemote.cpp +++ b/src/Processors/QueryPlan/ReadFromRemote.cpp @@ -240,7 +240,8 @@ void ReadFromRemote::addPipe(Pipes & pipes, const ClusterProxy::SelectStreamFact shard.shard_info.pool, query_string, output_stream->header, context, throttler, scalars, external_tables, stage); remote_query_executor->setLogger(log); - remote_query_executor->setPoolMode(PoolMode::GET_ONE); + if (context->getParallelReplicasMode() == Context::ParallelReplicasMode::READ_TASKS) + remote_query_executor->setPoolMode(PoolMode::GET_ONE); if (!table_func_ptr) remote_query_executor->setMainTable(shard.main_table ? 
shard.main_table : main_table); From a8b782ca5a2a3a916001397a0a200d8de50c9fa6 Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Sat, 5 Aug 2023 23:19:09 +0000 Subject: [PATCH 08/74] Parallel reading from replicas over disributed with several shards --- .../ClusterProxy/executeQuery.cpp | 5 --- src/Processors/QueryPlan/ReadFromRemote.cpp | 10 +++++ tests/config/config.d/clusters.xml | 32 ++++++++++++++ ...rallel_replicas_over_distributed.reference | 7 +++- ...835_parallel_replicas_over_distributed.sql | 42 +++++++++++++++++-- 5 files changed, 86 insertions(+), 10 deletions(-) diff --git a/src/Interpreters/ClusterProxy/executeQuery.cpp b/src/Interpreters/ClusterProxy/executeQuery.cpp index 2fed626ffb7..de3150aa69e 100644 --- a/src/Interpreters/ClusterProxy/executeQuery.cpp +++ b/src/Interpreters/ClusterProxy/executeQuery.cpp @@ -270,11 +270,6 @@ void executeQueryWithParallelReplicas( const ASTPtr & query_ast, ContextPtr context, const SelectQueryInfo & query_info, const ClusterPtr & not_optimized_cluster) { - if (not_optimized_cluster->getShardsInfo().size() != 1) - throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Cluster for parallel replicas should consist only from one shard"); - - auto shard_info = not_optimized_cluster->getShardsInfo().front(); - const auto & settings = context->getSettingsRef(); ClusterPtr new_cluster = not_optimized_cluster->getClusterWithReplicasAsShards(settings); diff --git a/src/Processors/QueryPlan/ReadFromRemote.cpp b/src/Processors/QueryPlan/ReadFromRemote.cpp index 5bc082b06ee..287ceece5bd 100644 --- a/src/Processors/QueryPlan/ReadFromRemote.cpp +++ b/src/Processors/QueryPlan/ReadFromRemote.cpp @@ -240,8 +240,18 @@ void ReadFromRemote::addPipe(Pipes & pipes, const ClusterProxy::SelectStreamFact shard.shard_info.pool, query_string, output_stream->header, context, throttler, scalars, external_tables, stage); remote_query_executor->setLogger(log); + if (context->getParallelReplicasMode() == Context::ParallelReplicasMode::READ_TASKS) + // when doing parallel reading from replicas (ParallelReplicasMode::READ_TASKS) on a shard: + // establish a connection to a replica on the shard, the replica will instantiate coordinator to manage parallel reading from replicas on the shard. + // The coordinator will return query result from the shard. + // Only one coordinator per shard is necessary. Therefore using PoolMode::GET_ONE to establish only one connection per shard. + // Using PoolMode::GET_MANY for this mode will(can) lead to instatiation of several coordinators (depends on max_parallel_replicas setting) + // each will execute parallel reading from replicas, + // so the query result will be multiplied by the number of created coordinator remote_query_executor->setPoolMode(PoolMode::GET_ONE); + else + remote_query_executor->setPoolMode(PoolMode::GET_MANY); if (!table_func_ptr) remote_query_executor->setMainTable(shard.main_table ? 
shard.main_table : main_table); diff --git a/tests/config/config.d/clusters.xml b/tests/config/config.d/clusters.xml index 031d6e64bc9..cfd4868f1dc 100644 --- a/tests/config/config.d/clusters.xml +++ b/tests/config/config.d/clusters.xml @@ -176,6 +176,38 @@ + + + false + + 127.0.0.1 + 9000 + + + 127.0.0.2 + 9000 + + + 127.0.0.3 + 9000 + + + + false + + 127.0.0.4 + 9000 + + + 127.0.0.5 + 9000 + + + 127.0.0.6 + 9000 + + + diff --git a/tests/queries/0_stateless/02835_parallel_replicas_over_distributed.reference b/tests/queries/0_stateless/02835_parallel_replicas_over_distributed.reference index ea4483ec305..e6a24987c0d 100644 --- a/tests/queries/0_stateless/02835_parallel_replicas_over_distributed.reference +++ b/tests/queries/0_stateless/02835_parallel_replicas_over_distributed.reference @@ -1 +1,6 @@ -100 4950 +-- 1 shard, 3 replicas +100 0 99 49.5 +200 0 99 49.5 +-- 2 shards, 3 replicas each +200 0 99 49.5 +400 0 99 49.5 diff --git a/tests/queries/0_stateless/02835_parallel_replicas_over_distributed.sql b/tests/queries/0_stateless/02835_parallel_replicas_over_distributed.sql index 199be3771c7..4e7c72ebfd8 100644 --- a/tests/queries/0_stateless/02835_parallel_replicas_over_distributed.sql +++ b/tests/queries/0_stateless/02835_parallel_replicas_over_distributed.sql @@ -1,13 +1,47 @@ +-- 1 shard + +SELECT '-- 1 shard, 3 replicas'; DROP TABLE IF EXISTS test_d; DROP TABLE IF EXISTS test; CREATE TABLE test (id UInt64, date Date) ENGINE = MergeTree -ORDER BY id -as select *, today() from numbers(100); +ORDER BY id; CREATE TABLE IF NOT EXISTS test_d as test ENGINE = Distributed(test_cluster_one_shard_three_replicas_localhost, currentDatabase(), test); -SELECT count(), sum(id) +insert into test select *, today() from numbers(100); + +SELECT count(), min(id), max(id), avg(id) FROM test_d -SETTINGS allow_experimental_parallel_reading_from_replicas = 1, max_parallel_replicas = 3, prefer_localhost_replica = 0, parallel_replicas_for_non_replicated_merge_tree=1, use_hedged_requests=0; +SETTINGS allow_experimental_parallel_reading_from_replicas = 1, max_parallel_replicas = 3, prefer_localhost_replica = 0, parallel_replicas_for_non_replicated_merge_tree=1, use_hedged_requests=0, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost'; + +insert into test select *, today() from numbers(100); + +SELECT count(), min(id), max(id), avg(id) +FROM test_d +SETTINGS allow_experimental_parallel_reading_from_replicas = 1, max_parallel_replicas = 3, prefer_localhost_replica = 0, parallel_replicas_for_non_replicated_merge_tree=1, use_hedged_requests=0, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost'; + +-- 2 shards + +SELECT '-- 2 shards, 3 replicas each'; +DROP TABLE IF EXISTS test2_d; +DROP TABLE IF EXISTS test2; +CREATE TABLE test2 (id UInt64, date Date) +ENGINE = MergeTree +ORDER BY id; + +CREATE TABLE IF NOT EXISTS test2_d as test2 +ENGINE = Distributed(test_cluster_two_shard_three_replicas_localhost, currentDatabase(), test2, id); + +insert into test2 select *, today() from numbers(100); + +SELECT count(), min(id), max(id), avg(id) +FROM test2_d +SETTINGS allow_experimental_parallel_reading_from_replicas = 1, max_parallel_replicas = 3, prefer_localhost_replica = 0, parallel_replicas_for_non_replicated_merge_tree=1, use_hedged_requests=0, cluster_for_parallel_replicas = 'test_cluster_two_shard_three_replicas_localhost'; + +insert into test2 select *, today() from numbers(100); + +SELECT count(), min(id), max(id), avg(id) +FROM test2_d +SETTINGS 
allow_experimental_parallel_reading_from_replicas = 1, max_parallel_replicas = 3, prefer_localhost_replica = 0, parallel_replicas_for_non_replicated_merge_tree=1, use_hedged_requests=0, cluster_for_parallel_replicas = 'test_cluster_two_shard_three_replicas_localhost'; From 22b73c95a70ce82590ca251ec81eb91773c71583 Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Sat, 5 Aug 2023 23:26:50 +0000 Subject: [PATCH 09/74] Fix comment --- src/Processors/QueryPlan/ReadFromRemote.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/Processors/QueryPlan/ReadFromRemote.cpp b/src/Processors/QueryPlan/ReadFromRemote.cpp index 287ceece5bd..f130c7fa502 100644 --- a/src/Processors/QueryPlan/ReadFromRemote.cpp +++ b/src/Processors/QueryPlan/ReadFromRemote.cpp @@ -247,8 +247,7 @@ void ReadFromRemote::addPipe(Pipes & pipes, const ClusterProxy::SelectStreamFact // The coordinator will return query result from the shard. // Only one coordinator per shard is necessary. Therefore using PoolMode::GET_ONE to establish only one connection per shard. // Using PoolMode::GET_MANY for this mode will(can) lead to instatiation of several coordinators (depends on max_parallel_replicas setting) - // each will execute parallel reading from replicas, - // so the query result will be multiplied by the number of created coordinator + // each will execute parallel reading from replicas, so the query result will be multiplied by the number of created coordinators remote_query_executor->setPoolMode(PoolMode::GET_ONE); else remote_query_executor->setPoolMode(PoolMode::GET_MANY); From cc2cf611a3ff8d6f6c7d20f2315a2f3adef6de5e Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Sun, 6 Aug 2023 09:37:41 +0000 Subject: [PATCH 10/74] fix typo/style --- src/Interpreters/ClusterProxy/executeQuery.cpp | 1 - src/Processors/QueryPlan/ReadFromRemote.cpp | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/Interpreters/ClusterProxy/executeQuery.cpp b/src/Interpreters/ClusterProxy/executeQuery.cpp index de3150aa69e..e4a48fdcac0 100644 --- a/src/Interpreters/ClusterProxy/executeQuery.cpp +++ b/src/Interpreters/ClusterProxy/executeQuery.cpp @@ -29,7 +29,6 @@ namespace ErrorCodes { extern const int TOO_LARGE_DISTRIBUTED_DEPTH; extern const int LOGICAL_ERROR; - extern const int SUPPORT_IS_DISABLED; } namespace ClusterProxy diff --git a/src/Processors/QueryPlan/ReadFromRemote.cpp b/src/Processors/QueryPlan/ReadFromRemote.cpp index f130c7fa502..576349844ec 100644 --- a/src/Processors/QueryPlan/ReadFromRemote.cpp +++ b/src/Processors/QueryPlan/ReadFromRemote.cpp @@ -246,7 +246,7 @@ void ReadFromRemote::addPipe(Pipes & pipes, const ClusterProxy::SelectStreamFact // establish a connection to a replica on the shard, the replica will instantiate coordinator to manage parallel reading from replicas on the shard. // The coordinator will return query result from the shard. // Only one coordinator per shard is necessary. Therefore using PoolMode::GET_ONE to establish only one connection per shard. 
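For readers following the pool-mode changes across this series (GET_MANY to GET_ONE, then gating it on READ_TASKS, plus the comment cleanups), the selection in ReadFromRemote::addPipe ends up roughly as below. This is a sketch assembled from the hunks in this series, not a verbatim quote of the final file:

/// Parallel reading from replicas (READ_TASKS): open a single connection per shard;
/// the contacted replica instantiates the coordinator and returns the shard's result.
/// GET_MANY here could create several coordinators per shard (up to max_parallel_replicas),
/// each running its own parallel read and multiplying the query result.
if (context->getParallelReplicasMode() == Context::ParallelReplicasMode::READ_TASKS)
    remote_query_executor->setPoolMode(PoolMode::GET_ONE);
else
    remote_query_executor->setPoolMode(PoolMode::GET_MANY);
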
- // Using PoolMode::GET_MANY for this mode will(can) lead to instatiation of several coordinators (depends on max_parallel_replicas setting) + // Using PoolMode::GET_MANY for this mode will(can) lead to instantiation of several coordinators (depends on max_parallel_replicas setting) // each will execute parallel reading from replicas, so the query result will be multiplied by the number of created coordinators remote_query_executor->setPoolMode(PoolMode::GET_ONE); else From cb49b7b9adc2fa26fffd6926115d9044de204b6c Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Mon, 7 Aug 2023 19:12:53 +0000 Subject: [PATCH 11/74] Integration test --- .../__init__.py | 0 .../configs/remote_servers.xml | 47 +++++++++++ .../test.py | 82 +++++++++++++++++++ 3 files changed, 129 insertions(+) create mode 100644 tests/integration/test_parallel_replicas_over_distributed/__init__.py create mode 100644 tests/integration/test_parallel_replicas_over_distributed/configs/remote_servers.xml create mode 100644 tests/integration/test_parallel_replicas_over_distributed/test.py diff --git a/tests/integration/test_parallel_replicas_over_distributed/__init__.py b/tests/integration/test_parallel_replicas_over_distributed/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_parallel_replicas_over_distributed/configs/remote_servers.xml b/tests/integration/test_parallel_replicas_over_distributed/configs/remote_servers.xml new file mode 100644 index 00000000000..bc39adeb481 --- /dev/null +++ b/tests/integration/test_parallel_replicas_over_distributed/configs/remote_servers.xml @@ -0,0 +1,47 @@ + + + + + + n1 + 9000 + + + n2 + 9000 + + + + + n3 + 9000 + + + n4 + 9000 + + + + + + + n1 + 9000 + + + n2 + 9000 + + + n3 + 9000 + + + n4 + 9000 + + + + + + diff --git a/tests/integration/test_parallel_replicas_over_distributed/test.py b/tests/integration/test_parallel_replicas_over_distributed/test.py new file mode 100644 index 00000000000..8886e1e169a --- /dev/null +++ b/tests/integration/test_parallel_replicas_over_distributed/test.py @@ -0,0 +1,82 @@ +import pytest +from helpers.cluster import ClickHouseCluster + +cluster = ClickHouseCluster(__file__) + +nodes = [ + cluster.add_instance( + f"n{i}", main_configs=["configs/remote_servers.xml"] + ) + for i in (1, 2, 3, 4) +] + + +@pytest.fixture(scope="module", autouse=True) +def start_cluster(): + try: + cluster.start() + yield cluster + finally: + cluster.shutdown() + + +def create_tables(cluster): + for node in nodes: + node.query("DROP TABLE IF EXISTS test_table") + node.query("DROP TABLE IF EXISTS dist_table") + node.query( + "CREATE TABLE IF NOT EXISTS test_table (key Int64, value String) Engine=MergeTree ORDER BY (key)" + ) + + nodes[0].query( + f""" + CREATE TABLE dist_table AS test_table + Engine=Distributed( + {cluster}, + currentDatabase(), + test_table, + rand() + ) + """ + ) + + nodes[0].query( + f"INSERT INTO test_table SELECT number, number FROM numbers(1000)" + ) + nodes[1].query( + f"INSERT INTO test_table SELECT number, number FROM numbers(2000)" + ) + nodes[2].query( + f"INSERT INTO test_table SELECT -number, -number FROM numbers(1000)" + ) + nodes[3].query( + f"INSERT INTO test_table SELECT -number, -number FROM numbers(2000)" + ) + nodes[0].query( + f"INSERT INTO test_table SELECT number, number FROM numbers(1)" + ) + + +@pytest.mark.parametrize( + "cluster", + ["test_multiple_shards_multiple_replicas", "test_single_shard_multiple_replicas"], +) +def test_parallel_replicas_custom_key(start_cluster, cluster): + + 
create_tables(cluster) + + expected_result = f"6001\t-1999\t1999\t0\n" + node = nodes[0] + assert ( + node.query( + "SELECT count(), min(key), max(key), sum(key) FROM dist_table", + settings={ + "allow_experimental_parallel_reading_from_replicas": 1, + "prefer_localhost_replica": 0, + "max_parallel_replicas": 4, + "use_hedged_requests": 1, + "cluster_for_parallel_replicas": cluster, + }, + ) + == expected_result + ) From 9b631e2cefa7d5e0361ea073e14ab98ba33fc174 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Mon, 7 Aug 2023 19:23:31 +0000 Subject: [PATCH 12/74] Automatic style fix --- .../test.py | 25 +++++-------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/tests/integration/test_parallel_replicas_over_distributed/test.py b/tests/integration/test_parallel_replicas_over_distributed/test.py index 8886e1e169a..d4f7d15d0de 100644 --- a/tests/integration/test_parallel_replicas_over_distributed/test.py +++ b/tests/integration/test_parallel_replicas_over_distributed/test.py @@ -4,9 +4,7 @@ from helpers.cluster import ClickHouseCluster cluster = ClickHouseCluster(__file__) nodes = [ - cluster.add_instance( - f"n{i}", main_configs=["configs/remote_servers.xml"] - ) + cluster.add_instance(f"n{i}", main_configs=["configs/remote_servers.xml"]) for i in (1, 2, 3, 4) ] @@ -40,21 +38,11 @@ def create_tables(cluster): """ ) - nodes[0].query( - f"INSERT INTO test_table SELECT number, number FROM numbers(1000)" - ) - nodes[1].query( - f"INSERT INTO test_table SELECT number, number FROM numbers(2000)" - ) - nodes[2].query( - f"INSERT INTO test_table SELECT -number, -number FROM numbers(1000)" - ) - nodes[3].query( - f"INSERT INTO test_table SELECT -number, -number FROM numbers(2000)" - ) - nodes[0].query( - f"INSERT INTO test_table SELECT number, number FROM numbers(1)" - ) + nodes[0].query(f"INSERT INTO test_table SELECT number, number FROM numbers(1000)") + nodes[1].query(f"INSERT INTO test_table SELECT number, number FROM numbers(2000)") + nodes[2].query(f"INSERT INTO test_table SELECT -number, -number FROM numbers(1000)") + nodes[3].query(f"INSERT INTO test_table SELECT -number, -number FROM numbers(2000)") + nodes[0].query(f"INSERT INTO test_table SELECT number, number FROM numbers(1)") @pytest.mark.parametrize( @@ -62,7 +50,6 @@ def create_tables(cluster): ["test_multiple_shards_multiple_replicas", "test_single_shard_multiple_replicas"], ) def test_parallel_replicas_custom_key(start_cluster, cluster): - create_tables(cluster) expected_result = f"6001\t-1999\t1999\t0\n" From 50b0db598eb730015e0158f9b061817dcce32e57 Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Mon, 7 Aug 2023 21:06:12 +0000 Subject: [PATCH 13/74] Update test: use ReplicatedMergeTree --- .../test.py | 84 ++++++++++++++----- 1 file changed, 63 insertions(+), 21 deletions(-) diff --git a/tests/integration/test_parallel_replicas_over_distributed/test.py b/tests/integration/test_parallel_replicas_over_distributed/test.py index d4f7d15d0de..f7eb166f2cd 100644 --- a/tests/integration/test_parallel_replicas_over_distributed/test.py +++ b/tests/integration/test_parallel_replicas_over_distributed/test.py @@ -4,7 +4,7 @@ from helpers.cluster import ClickHouseCluster cluster = ClickHouseCluster(__file__) nodes = [ - cluster.add_instance(f"n{i}", main_configs=["configs/remote_servers.xml"]) + cluster.add_instance(f"n{i}", main_configs=["configs/remote_servers.xml"], with_zookeeper=True) for i in (1, 2, 3, 4) ] @@ -18,50 +18,92 @@ def start_cluster(): cluster.shutdown() -def create_tables(cluster): +def 
create_tables(cluster, table_name): + # create replicated tables for node in nodes: - node.query("DROP TABLE IF EXISTS test_table") - node.query("DROP TABLE IF EXISTS dist_table") - node.query( - "CREATE TABLE IF NOT EXISTS test_table (key Int64, value String) Engine=MergeTree ORDER BY (key)" - ) + node.query(f"DROP TABLE IF EXISTS {table_name} SYNC") + if cluster == "test_single_shard_multiple_replicas": + nodes[0].query( + f"CREATE TABLE IF NOT EXISTS {table_name} (key Int64, value String) Engine=ReplicatedMergeTree('/test_parallel_replicas/shard1/{table_name}', 'r1') ORDER BY (key)" + ) + nodes[1].query( + f"CREATE TABLE IF NOT EXISTS {table_name} (key Int64, value String) Engine=ReplicatedMergeTree('/test_parallel_replicas/shard1/{table_name}', 'r2') ORDER BY (key)" + ) + nodes[2].query( + f"CREATE TABLE IF NOT EXISTS {table_name} (key Int64, value String) Engine=ReplicatedMergeTree('/test_parallel_replicas/shard1/{table_name}', 'r3') ORDER BY (key)" + ) + nodes[3].query( + f"CREATE TABLE IF NOT EXISTS {table_name} (key Int64, value String) Engine=ReplicatedMergeTree('/test_parallel_replicas/shard1/{table_name}', 'r4') ORDER BY (key)" + ) + elif cluster == "test_multiple_shards_multiple_replicas": + # shard 1 + nodes[0].query( + f"CREATE TABLE IF NOT EXISTS {table_name} (key Int64, value String) Engine=ReplicatedMergeTree('/test_parallel_replicas/shard1/{table_name}', 'r1') ORDER BY (key)" + ) + nodes[1].query( + f"CREATE TABLE IF NOT EXISTS {table_name} (key Int64, value String) Engine=ReplicatedMergeTree('/test_parallel_replicas/shard1/{table_name}', 'r2') ORDER BY (key)" + ) + # shard 2 + nodes[2].query( + f"CREATE TABLE IF NOT EXISTS {table_name} (key Int64, value String) Engine=ReplicatedMergeTree('/test_parallel_replicas/shard2/{table_name}', 'r1') ORDER BY (key)" + ) + nodes[3].query( + f"CREATE TABLE IF NOT EXISTS {table_name} (key Int64, value String) Engine=ReplicatedMergeTree('/test_parallel_replicas/shard2/{table_name}', 'r2') ORDER BY (key)" + ) + else: + raise Exception(f"Unexpected cluster: {cluster}") + + # create distributed table + nodes[0].query(f"DROP TABLE IF EXISTS {table_name}_d SYNC") nodes[0].query( f""" - CREATE TABLE dist_table AS test_table + CREATE TABLE {table_name}_d AS {table_name} Engine=Distributed( {cluster}, currentDatabase(), - test_table, - rand() + {table_name}, + key ) """ ) - nodes[0].query(f"INSERT INTO test_table SELECT number, number FROM numbers(1000)") - nodes[1].query(f"INSERT INTO test_table SELECT number, number FROM numbers(2000)") - nodes[2].query(f"INSERT INTO test_table SELECT -number, -number FROM numbers(1000)") - nodes[3].query(f"INSERT INTO test_table SELECT -number, -number FROM numbers(2000)") - nodes[0].query(f"INSERT INTO test_table SELECT number, number FROM numbers(1)") + # populate data + nodes[0].query(f"INSERT INTO {table_name}_d SELECT number, number FROM numbers(1000)", settings={"insert_distributed_sync": 1}) + nodes[0].query(f"INSERT INTO {table_name}_d SELECT number, number FROM numbers(2000)", settings={"insert_distributed_sync": 1}) + nodes[0].query(f"INSERT INTO {table_name}_d SELECT -number, -number FROM numbers(1000)", settings={"insert_distributed_sync": 1}) + nodes[0].query(f"INSERT INTO {table_name}_d SELECT -number, -number FROM numbers(2000)", settings={"insert_distributed_sync": 1}) + nodes[0].query(f"INSERT INTO {table_name}_d SELECT number, number FROM numbers(1)", settings={"insert_distributed_sync": 1}) @pytest.mark.parametrize( "cluster", - ["test_multiple_shards_multiple_replicas", 
"test_single_shard_multiple_replicas"], + ["test_single_shard_multiple_replicas", "test_multiple_shards_multiple_replicas"], ) def test_parallel_replicas_custom_key(start_cluster, cluster): - create_tables(cluster) + table_name = "test_table" + create_tables(cluster, table_name) - expected_result = f"6001\t-1999\t1999\t0\n" node = nodes[0] + expected_result = f"6001\t-1999\t1999\t0\n" + + # w/o parallel replicas assert ( node.query( - "SELECT count(), min(key), max(key), sum(key) FROM dist_table", + f"SELECT count(), min(key), max(key), sum(key) FROM {table_name}_d" + ) + == expected_result + ) + + # parallel replicas + assert ( + node.query( + f"SELECT count(), min(key), max(key), sum(key) FROM {table_name}_d", settings={ - "allow_experimental_parallel_reading_from_replicas": 1, + "allow_experimental_parallel_reading_from_replicas": 2, "prefer_localhost_replica": 0, "max_parallel_replicas": 4, - "use_hedged_requests": 1, + "use_hedged_requests": 0, "cluster_for_parallel_replicas": cluster, }, ) From 8ba67f10bbab809b7012740d9b40042229140aa4 Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Mon, 7 Aug 2023 21:13:08 +0000 Subject: [PATCH 14/74] Update test name --- .../integration/test_parallel_replicas_over_distributed/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_parallel_replicas_over_distributed/test.py b/tests/integration/test_parallel_replicas_over_distributed/test.py index f7eb166f2cd..dae25ee9732 100644 --- a/tests/integration/test_parallel_replicas_over_distributed/test.py +++ b/tests/integration/test_parallel_replicas_over_distributed/test.py @@ -80,7 +80,7 @@ def create_tables(cluster, table_name): "cluster", ["test_single_shard_multiple_replicas", "test_multiple_shards_multiple_replicas"], ) -def test_parallel_replicas_custom_key(start_cluster, cluster): +def test_parallel_replicas_over_distributed(start_cluster, cluster): table_name = "test_table" create_tables(cluster, table_name) From 6dc1772781e88fd0f3656d47a3c847f77d46429d Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Mon, 7 Aug 2023 21:28:44 +0000 Subject: [PATCH 15/74] Automatic style fix --- .../test.py | 33 ++++++++++++++----- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/tests/integration/test_parallel_replicas_over_distributed/test.py b/tests/integration/test_parallel_replicas_over_distributed/test.py index dae25ee9732..5ad191af331 100644 --- a/tests/integration/test_parallel_replicas_over_distributed/test.py +++ b/tests/integration/test_parallel_replicas_over_distributed/test.py @@ -4,7 +4,9 @@ from helpers.cluster import ClickHouseCluster cluster = ClickHouseCluster(__file__) nodes = [ - cluster.add_instance(f"n{i}", main_configs=["configs/remote_servers.xml"], with_zookeeper=True) + cluster.add_instance( + f"n{i}", main_configs=["configs/remote_servers.xml"], with_zookeeper=True + ) for i in (1, 2, 3, 4) ] @@ -69,11 +71,26 @@ def create_tables(cluster, table_name): ) # populate data - nodes[0].query(f"INSERT INTO {table_name}_d SELECT number, number FROM numbers(1000)", settings={"insert_distributed_sync": 1}) - nodes[0].query(f"INSERT INTO {table_name}_d SELECT number, number FROM numbers(2000)", settings={"insert_distributed_sync": 1}) - nodes[0].query(f"INSERT INTO {table_name}_d SELECT -number, -number FROM numbers(1000)", settings={"insert_distributed_sync": 1}) - nodes[0].query(f"INSERT INTO {table_name}_d SELECT -number, -number FROM numbers(2000)", settings={"insert_distributed_sync": 1}) - nodes[0].query(f"INSERT INTO {table_name}_d 
SELECT number, number FROM numbers(1)", settings={"insert_distributed_sync": 1}) + nodes[0].query( + f"INSERT INTO {table_name}_d SELECT number, number FROM numbers(1000)", + settings={"insert_distributed_sync": 1}, + ) + nodes[0].query( + f"INSERT INTO {table_name}_d SELECT number, number FROM numbers(2000)", + settings={"insert_distributed_sync": 1}, + ) + nodes[0].query( + f"INSERT INTO {table_name}_d SELECT -number, -number FROM numbers(1000)", + settings={"insert_distributed_sync": 1}, + ) + nodes[0].query( + f"INSERT INTO {table_name}_d SELECT -number, -number FROM numbers(2000)", + settings={"insert_distributed_sync": 1}, + ) + nodes[0].query( + f"INSERT INTO {table_name}_d SELECT number, number FROM numbers(1)", + settings={"insert_distributed_sync": 1}, + ) @pytest.mark.parametrize( @@ -89,9 +106,7 @@ def test_parallel_replicas_over_distributed(start_cluster, cluster): # w/o parallel replicas assert ( - node.query( - f"SELECT count(), min(key), max(key), sum(key) FROM {table_name}_d" - ) + node.query(f"SELECT count(), min(key), max(key), sum(key) FROM {table_name}_d") == expected_result ) From 0e17d26b88057ad26d8ec6ea34c2b442d6d23291 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Wed, 9 Aug 2023 10:04:29 +0000 Subject: [PATCH 16/74] More formats supported, read single archive from 1 thread --- contrib/libarchive-cmake/CMakeLists.txt | 12 +- src/Backups/BackupImpl.cpp | 4 +- src/IO/Archives/IArchiveReader.h | 4 +- src/IO/Archives/LibArchiveReader.cpp | 16 +- src/IO/Archives/LibArchiveReader.h | 4 +- src/IO/Archives/ZipArchiveReader.cpp | 39 ++-- src/IO/Archives/ZipArchiveReader.h | 4 +- src/IO/Archives/createArchiveReader.cpp | 15 +- .../tests/gtest_archive_reader_and_writer.cpp | 38 ++-- src/Storages/StorageFile.cpp | 166 ++++++++++-------- src/Storages/StorageMergeTree.cpp | 2 +- .../0_stateless/02661_read_from_archive.lib | 34 ++-- ...02661_read_from_archive_tarbzip2.reference | 116 ++++++++++++ .../02661_read_from_archive_tarbzip2.sh | 11 ++ .../02661_read_from_archive_tarxz.reference | 116 ++++++++++++ .../02661_read_from_archive_tarxz.sh | 11 ++ .../02661_read_from_archive_tzst.reference | 116 ++++++++++++ .../02661_read_from_archive_tzst.sh | 11 ++ 18 files changed, 582 insertions(+), 137 deletions(-) create mode 100644 tests/queries/0_stateless/02661_read_from_archive_tarbzip2.reference create mode 100755 tests/queries/0_stateless/02661_read_from_archive_tarbzip2.sh create mode 100644 tests/queries/0_stateless/02661_read_from_archive_tarxz.reference create mode 100755 tests/queries/0_stateless/02661_read_from_archive_tarxz.sh create mode 100644 tests/queries/0_stateless/02661_read_from_archive_tzst.reference create mode 100755 tests/queries/0_stateless/02661_read_from_archive_tzst.sh diff --git a/contrib/libarchive-cmake/CMakeLists.txt b/contrib/libarchive-cmake/CMakeLists.txt index fb64266185e..cd5658b7086 100644 --- a/contrib/libarchive-cmake/CMakeLists.txt +++ b/contrib/libarchive-cmake/CMakeLists.txt @@ -147,7 +147,7 @@ target_compile_definitions(_libarchive PUBLIC target_compile_options(_libarchive PRIVATE "-Wno-reserved-macro-identifier") if (TARGET ch_contrib::xz) - target_compile_definitions(_libarchive PUBLIC HAVE_LZMA_H=1) + target_compile_definitions(_libarchive PUBLIC HAVE_LZMA_H=1 HAVE_LIBLZMA=1) target_link_libraries(_libarchive PRIVATE ch_contrib::xz) endif() @@ -156,6 +156,16 @@ if (TARGET ch_contrib::zlib) target_link_libraries(_libarchive PRIVATE ch_contrib::zlib) endif() +if (TARGET ch_contrib::zstd) + target_compile_definitions(_libarchive PUBLIC 
HAVE_ZSTD_H=1 HAVE_LIBZSTD=1) + target_link_libraries(_libarchive PRIVATE ch_contrib::zstd) +endif() + +if (TARGET ch_contrib::bzip2) + target_compile_definitions(_libarchive PUBLIC HAVE_BZLIB_H=1) + target_link_libraries(_libarchive PRIVATE ch_contrib::bzip2) +endif() + if (OS_LINUX) target_compile_definitions( _libarchive PUBLIC diff --git a/src/Backups/BackupImpl.cpp b/src/Backups/BackupImpl.cpp index 82793f44739..401c93967f6 100644 --- a/src/Backups/BackupImpl.cpp +++ b/src/Backups/BackupImpl.cpp @@ -375,7 +375,7 @@ void BackupImpl::readBackupMetadata() if (!archive_reader->fileExists(".backup")) throw Exception(ErrorCodes::BACKUP_NOT_FOUND, "Archive {} is not a backup", backup_name_for_logging); setCompressedSize(); - in = archive_reader->readFile(".backup"); + in = archive_reader->readFile(".backup", /*throw_on_not_found=*/true); } else { @@ -685,7 +685,7 @@ std::unique_ptr BackupImpl::readFileImpl(const SizeAndChecks { /// Make `read_buffer` if there is data for this backup entry in this backup. if (use_archive) - read_buffer = archive_reader->readFile(info.data_file_name); + read_buffer = archive_reader->readFile(info.data_file_name, /*throw_on_not_found=*/true); else read_buffer = reader->readFile(info.data_file_name); } diff --git a/src/IO/Archives/IArchiveReader.h b/src/IO/Archives/IArchiveReader.h index 03e5392e970..84a1dc21f5b 100644 --- a/src/IO/Archives/IArchiveReader.h +++ b/src/IO/Archives/IArchiveReader.h @@ -50,8 +50,8 @@ public: /// Starts reading a file from the archive. The function returns a read buffer, /// you can read that buffer to extract uncompressed data from the archive. /// Several read buffers can be used at the same time in parallel. - virtual std::unique_ptr readFile(const String & filename) = 0; - virtual std::unique_ptr readFile(NameFilter filter) = 0; + virtual std::unique_ptr readFile(const String & filename, bool throw_on_not_found) = 0; + virtual std::unique_ptr readFile(NameFilter filter, bool throw_on_not_found) = 0; /// It's possible to convert a file enumerator to a read buffer and vice versa. virtual std::unique_ptr readFile(std::unique_ptr enumerator) = 0; diff --git a/src/IO/Archives/LibArchiveReader.cpp b/src/IO/Archives/LibArchiveReader.cpp index c6e16b46ef7..2b7a4cca5de 100644 --- a/src/IO/Archives/LibArchiveReader.cpp +++ b/src/IO/Archives/LibArchiveReader.cpp @@ -155,7 +155,7 @@ private: archive_read_support_filter_all(archive); archive_read_support_format_all(archive); if (archive_read_open_filename(archive, path_to_archive.c_str(), 10240) != ARCHIVE_OK) - throw Exception(ErrorCodes::CANNOT_UNPACK_ARCHIVE, "Couldn't open archive: {}", quoteString(path_to_archive)); + throw Exception(ErrorCodes::CANNOT_UNPACK_ARCHIVE, "Couldn't open archive {}: {}", quoteString(path_to_archive), archive_error_string(archive)); } catch (...) 
{ @@ -293,17 +293,21 @@ std::unique_ptr LibArchiveReader::firstFile() return std::make_unique(std::move(handle)); } -std::unique_ptr LibArchiveReader::readFile(const String & filename) +std::unique_ptr LibArchiveReader::readFile(const String & filename, bool throw_on_not_found) { - return readFile([&](const std::string & file) { return file == filename; }); + return readFile([&](const std::string & file) { return file == filename; }, throw_on_not_found); } -std::unique_ptr LibArchiveReader::readFile(NameFilter filter) +std::unique_ptr LibArchiveReader::readFile(NameFilter filter, bool throw_on_not_found) { Handle handle(path_to_archive, lock_on_reading); if (!handle.locateFile(filter)) - throw Exception( - ErrorCodes::CANNOT_UNPACK_ARCHIVE, "Couldn't unpack archive {}: no file found satisfying the filter", path_to_archive); + { + if (throw_on_not_found) + throw Exception( + ErrorCodes::CANNOT_UNPACK_ARCHIVE, "Couldn't unpack archive {}: no file found satisfying the filter", path_to_archive); + return nullptr; + } return std::make_unique(std::move(handle), path_to_archive); } diff --git a/src/IO/Archives/LibArchiveReader.h b/src/IO/Archives/LibArchiveReader.h index 700e8f70d04..2ea76ad0ab0 100644 --- a/src/IO/Archives/LibArchiveReader.h +++ b/src/IO/Archives/LibArchiveReader.h @@ -35,8 +35,8 @@ public: /// Starts reading a file from the archive. The function returns a read buffer, /// you can read that buffer to extract uncompressed data from the archive. /// Several read buffers can be used at the same time in parallel. - std::unique_ptr readFile(const String & filename) override; - std::unique_ptr readFile(NameFilter filter) override; + std::unique_ptr readFile(const String & filename, bool throw_on_not_found) override; + std::unique_ptr readFile(NameFilter filter, bool throw_on_not_found) override; /// It's possible to convert a file enumerator to a read buffer and vice versa. 
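The throw_on_not_found flag threaded through these readers lets callers probe an archive without relying on exceptions: when it is false, a missing member yields nullptr instead of a CANNOT_UNPACK_ARCHIVE error. A minimal usage sketch against the interface above (the helper name is illustrative and assumes the ClickHouse headers are available):

#include <IO/Archives/createArchiveReader.h>
#include <IO/ReadHelpers.h>
#include <string>

/// Sketch: read one member of an archive if it exists, otherwise return an empty string.
std::string tryReadMember(const std::string & path_to_archive, const std::string & member)
{
    auto reader = DB::createArchiveReader(path_to_archive);
    /// With throw_on_not_found=false a missing member produces nullptr rather than an exception.
    if (auto in = reader->readFile(member, /*throw_on_not_found=*/ false))
    {
        std::string contents;
        DB::readStringUntilEOF(contents, *in);
        return contents;
    }
    return {};
}
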
std::unique_ptr readFile(std::unique_ptr enumerator) override; diff --git a/src/IO/Archives/ZipArchiveReader.cpp b/src/IO/Archives/ZipArchiveReader.cpp index 181174ef6ec..a19c7abf8dd 100644 --- a/src/IO/Archives/ZipArchiveReader.cpp +++ b/src/IO/Archives/ZipArchiveReader.cpp @@ -75,21 +75,22 @@ public: RawHandle getRawHandle() const { return raw_handle; } std::shared_ptr getReader() const { return reader; } - void locateFile(const String & file_name_) + bool locateFile(const String & file_name_) { resetFileInfo(); bool case_sensitive = true; int err = unzLocateFile(raw_handle, file_name_.c_str(), reinterpret_cast(static_cast(case_sensitive))); if (err == UNZ_END_OF_LIST_OF_FILE) - showError("File " + quoteString(file_name_) + " not found"); + return false; file_name = file_name_; + return true; } - void locateFile(NameFilter filter) + bool locateFile(NameFilter filter) { int err = unzGoToFirstFile(raw_handle); if (err == UNZ_END_OF_LIST_OF_FILE) - showError("No file was found satisfying the filter"); + return false; do { @@ -97,12 +98,12 @@ public: resetFileInfo(); retrieveFileInfo(); if (filter(getFileName())) - return; + return true; err = unzGoToNextFile(raw_handle); } while (err != UNZ_END_OF_LIST_OF_FILE); - showError("No file was found satisfying the filter"); + return false; } bool tryLocateFile(const String & file_name_) @@ -513,7 +514,9 @@ bool ZipArchiveReader::fileExists(const String & filename) ZipArchiveReader::FileInfo ZipArchiveReader::getFileInfo(const String & filename) { auto handle = acquireHandle(); - handle.locateFile(filename); + if (!handle.locateFile(filename)) + showError(fmt::format("File {} was not found in archive", quoteString(filename))); + return handle.getFileInfo(); } @@ -525,17 +528,31 @@ std::unique_ptr ZipArchiveReader::firstFile() return std::make_unique(std::move(handle)); } -std::unique_ptr ZipArchiveReader::readFile(const String & filename) +std::unique_ptr ZipArchiveReader::readFile(const String & filename, bool throw_on_not_found) { auto handle = acquireHandle(); - handle.locateFile(filename); + if (!handle.locateFile(filename)) + { + if (throw_on_not_found) + showError(fmt::format("File {} was not found in archive", quoteString(filename))); + + return nullptr; + } + return std::make_unique(std::move(handle)); } -std::unique_ptr ZipArchiveReader::readFile(NameFilter filter) +std::unique_ptr ZipArchiveReader::readFile(NameFilter filter, bool throw_on_not_found) { auto handle = acquireHandle(); - handle.locateFile(filter); + if (!handle.locateFile(filter)) + { + if (throw_on_not_found) + showError(fmt::format("No file satisfying filter in archive")); + + return nullptr; + } + return std::make_unique(std::move(handle)); } diff --git a/src/IO/Archives/ZipArchiveReader.h b/src/IO/Archives/ZipArchiveReader.h index 0b5fa572860..a8788064fec 100644 --- a/src/IO/Archives/ZipArchiveReader.h +++ b/src/IO/Archives/ZipArchiveReader.h @@ -41,8 +41,8 @@ public: /// Starts reading a file from the archive. The function returns a read buffer, /// you can read that buffer to extract uncompressed data from the archive. /// Several read buffers can be used at the same time in parallel. - std::unique_ptr readFile(const String & filename) override; - std::unique_ptr readFile(NameFilter filter) override; + std::unique_ptr readFile(const String & filename, bool throw_on_not_found) override; + std::unique_ptr readFile(NameFilter filter, bool throw_on_not_found) override; /// It's possible to convert a file enumerator to a read buffer and vice versa. 
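The enumerator/read-buffer conversion mentioned in the comment above is what the StorageFile changes later in this patch lean on when a glob selects several members of one archive. A condensed sketch of that pattern (the function name and loop shape are illustrative, mirroring the StorageFile reader):

#include <IO/Archives/createArchiveReader.h>
#include <IO/Archives/IArchiveReader.h>
#include <string>

/// Sketch: visit every archive member whose name passes the filter, one read buffer at a time.
void scanArchive(const std::string & path_to_archive, const DB::IArchiveReader::NameFilter & name_filter)
{
    auto reader = DB::createArchiveReader(path_to_archive);
    auto enumerator = reader->firstFile();                        /// nullptr for an empty archive
    while (enumerator)
    {
        if (!name_filter(enumerator->getFileName()))
        {
            if (!enumerator->nextFile())
                break;                                            /// no more members
            continue;
        }
        auto read_buf = reader->readFile(std::move(enumerator));  /// enumerator -> read buffer
        /// ... consume read_buf here (e.g. feed it to an input format) ...
        enumerator = reader->nextFile(std::move(read_buf));       /// read buffer -> enumerator, advanced
    }
}
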
std::unique_ptr readFile(std::unique_ptr enumerator) override; diff --git a/src/IO/Archives/createArchiveReader.cpp b/src/IO/Archives/createArchiveReader.cpp index 37743da7107..0c998971de1 100644 --- a/src/IO/Archives/createArchiveReader.cpp +++ b/src/IO/Archives/createArchiveReader.cpp @@ -24,6 +24,18 @@ std::shared_ptr createArchiveReader( [[maybe_unused]] const std::function()> & archive_read_function, [[maybe_unused]] size_t archive_size) { + using namespace std::literals; + static constexpr std::array tar_extensions + { + ".tar"sv, + ".tar.gz"sv, + ".tgz"sv, + ".tar.zst"sv, + ".tzst"sv, + ".tar.xz"sv, + ".tar.bz2"sv + }; + if (path_to_archive.ends_with(".zip") || path_to_archive.ends_with(".zipx")) { #if USE_MINIZIP @@ -32,7 +44,8 @@ std::shared_ptr createArchiveReader( throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "minizip library is disabled"); #endif } - else if (path_to_archive.ends_with(".tar") || path_to_archive.ends_with("tar.gz")) + else if (std::any_of( + tar_extensions.begin(), tar_extensions.end(), [&](const auto extension) { return path_to_archive.ends_with(extension); })) { #if USE_LIBARCHIVE return std::make_shared(path_to_archive); diff --git a/src/IO/tests/gtest_archive_reader_and_writer.cpp b/src/IO/tests/gtest_archive_reader_and_writer.cpp index 8eeccbcdf75..b48955c25e7 100644 --- a/src/IO/tests/gtest_archive_reader_and_writer.cpp +++ b/src/IO/tests/gtest_archive_reader_and_writer.cpp @@ -113,11 +113,11 @@ TEST_P(ArchiveReaderAndWriterTest, EmptyArchive) EXPECT_FALSE(reader->fileExists("nofile.txt")); - expectException(ErrorCodes::CANNOT_UNPACK_ARCHIVE, "File 'nofile.txt' not found", + expectException(ErrorCodes::CANNOT_UNPACK_ARCHIVE, "File 'nofile.txt' was not found in archive", [&]{ reader->getFileInfo("nofile.txt"); }); - expectException(ErrorCodes::CANNOT_UNPACK_ARCHIVE, "File 'nofile.txt' not found", - [&]{ reader->readFile("nofile.txt"); }); + expectException(ErrorCodes::CANNOT_UNPACK_ARCHIVE, "File 'nofile.txt' was not found in archive", + [&]{ reader->readFile("nofile.txt", /*throw_on_not_found=*/true); }); EXPECT_EQ(reader->firstFile(), nullptr); } @@ -145,7 +145,7 @@ TEST_P(ArchiveReaderAndWriterTest, SingleFileInArchive) EXPECT_GT(file_info.compressed_size, 0); { - auto in = reader->readFile("a.txt"); + auto in = reader->readFile("a.txt", /*throw_on_not_found=*/true); String str; readStringUntilEOF(str, *in); EXPECT_EQ(str, contents); @@ -215,14 +215,14 @@ TEST_P(ArchiveReaderAndWriterTest, TwoFilesInArchive) EXPECT_EQ(reader->getFileInfo("b/c.txt").uncompressed_size, c_contents.size()); { - auto in = reader->readFile("a.txt"); + auto in = reader->readFile("a.txt", /*throw_on_not_found=*/true); String str; readStringUntilEOF(str, *in); EXPECT_EQ(str, a_contents); } { - auto in = reader->readFile("b/c.txt"); + auto in = reader->readFile("b/c.txt", /*throw_on_not_found=*/true); String str; readStringUntilEOF(str, *in); EXPECT_EQ(str, c_contents); @@ -230,7 +230,7 @@ TEST_P(ArchiveReaderAndWriterTest, TwoFilesInArchive) { /// Read a.txt again. 
- auto in = reader->readFile("a.txt"); + auto in = reader->readFile("a.txt", /*throw_on_not_found=*/true); String str; readStringUntilEOF(str, *in); EXPECT_EQ(str, a_contents); @@ -302,14 +302,14 @@ TEST_P(ArchiveReaderAndWriterTest, InMemory) EXPECT_EQ(reader->getFileInfo("b.txt").uncompressed_size, b_contents.size()); { - auto in = reader->readFile("a.txt"); + auto in = reader->readFile("a.txt", /*throw_on_not_found=*/true); String str; readStringUntilEOF(str, *in); EXPECT_EQ(str, a_contents); } { - auto in = reader->readFile("b.txt"); + auto in = reader->readFile("b.txt", /*throw_on_not_found=*/true); String str; readStringUntilEOF(str, *in); EXPECT_EQ(str, b_contents); @@ -317,7 +317,7 @@ TEST_P(ArchiveReaderAndWriterTest, InMemory) { /// Read a.txt again. - auto in = reader->readFile("a.txt"); + auto in = reader->readFile("a.txt", /*throw_on_not_found=*/true); String str; readStringUntilEOF(str, *in); EXPECT_EQ(str, a_contents); @@ -343,19 +343,19 @@ TEST_P(ArchiveReaderAndWriterTest, Password) /// Try to read without a password. expectException(ErrorCodes::CANNOT_UNPACK_ARCHIVE, "Password is required", - [&]{ reader->readFile("a.txt"); }); + [&]{ reader->readFile("a.txt", /*throw_on_not_found=*/true); }); { /// Try to read with a wrong password. reader->setPassword("123Qwe"); expectException(ErrorCodes::CANNOT_UNPACK_ARCHIVE, "Wrong password", - [&]{ reader->readFile("a.txt"); }); + [&]{ reader->readFile("a.txt", /*throw_on_not_found=*/true); }); } { /// Reading with the right password is successful. reader->setPassword("Qwe123"); - auto in = reader->readFile("a.txt"); + auto in = reader->readFile("a.txt", /*throw_on_not_found=*/true); String str; readStringUntilEOF(str, *in); EXPECT_EQ(str, contents); @@ -387,7 +387,7 @@ TEST(TarArchiveReaderTest, ReadFile) { bool created = createArchiveWithFiles(archive_path, {{filename, contents}}); EXPECT_EQ(created, true); auto reader = createArchiveReader(archive_path); - auto in = reader->readFile(filename); + auto in = reader->readFile(filename, /*throw_on_not_found=*/true); String str; readStringUntilEOF(str, *in); EXPECT_EQ(str, contents); @@ -405,11 +405,11 @@ TEST(TarArchiveReaderTest, ReadTwoFiles) { auto reader = createArchiveReader(archive_path); EXPECT_EQ(reader->fileExists(file1), true); EXPECT_EQ(reader->fileExists(file2), true); - auto in = reader->readFile(file1); + auto in = reader->readFile(file1, /*throw_on_not_found=*/true); String str; readStringUntilEOF(str, *in); EXPECT_EQ(str, contents1); - in = reader->readFile(file2); + in = reader->readFile(file2, /*throw_on_not_found=*/true); readStringUntilEOF(str, *in); EXPECT_EQ(str, contents2); @@ -448,7 +448,7 @@ TEST(SevenZipArchiveReaderTest, ReadFile) { bool created = createArchiveWithFiles(archive_path, {{filename, contents}}); EXPECT_EQ(created, true); auto reader = createArchiveReader(archive_path); - auto in = reader->readFile(filename); + auto in = reader->readFile(filename, /*throw_on_not_found=*/true); String str; readStringUntilEOF(str, *in); EXPECT_EQ(str, contents); @@ -479,11 +479,11 @@ TEST(SevenZipArchiveReaderTest, ReadTwoFiles) { auto reader = createArchiveReader(archive_path); EXPECT_EQ(reader->fileExists(file1), true); EXPECT_EQ(reader->fileExists(file2), true); - auto in = reader->readFile(file1); + auto in = reader->readFile(file1, /*throw_on_not_found=*/true); String str; readStringUntilEOF(str, *in); EXPECT_EQ(str, contents1); - in = reader->readFile(file2); + in = reader->readFile(file2, /*throw_on_not_found=*/true); readStringUntilEOF(str, *in); 
EXPECT_EQ(str, contents2); diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index 5c03540de9a..a0dc98d4312 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -399,11 +399,11 @@ std::unique_ptr createReadBuffer( return reader->readFile([my_matcher = std::move(matcher)](const std::string & path) { return re2::RE2::FullMatch(path, *my_matcher); - }); + }, /*throw_on_not_found=*/true); } else { - return reader->readFile(current_path); + return reader->readFile(current_path, /*throw_on_not_found=*/true); } } @@ -721,28 +721,20 @@ public: { public: explicit FilesIterator( - const Strings & files_, std::vector archives_, std::vector> files_in_archive_) - : files(files_), archives(std::move(archives_)), files_in_archive(std::move(files_in_archive_)) + const Strings & files_, std::vector archives_, const IArchiveReader::NameFilter & name_filter_) + : files(files_), archives(std::move(archives_)), name_filter(name_filter_) { } String next() { + const auto & fs = fromArchive() ? archives : files; + auto current_index = index.fetch_add(1, std::memory_order_relaxed); - if (current_index >= files.size()) + if (current_index >= fs.size()) return ""; - return files[current_index]; - } - - std::pair nextFileFromArchive() - { - auto current_index = index.fetch_add(1, std::memory_order_relaxed); - if (current_index >= files_in_archive.size()) - return {"", ""}; - - const auto & [archive_index, filename] = files_in_archive[current_index]; - return {archives[archive_index], filename}; + return fs[current_index]; } bool fromArchive() const @@ -750,10 +742,29 @@ public: return !archives.empty(); } + bool readSingleFileFromArchive() const + { + return !name_filter; + } + + IArchiveReader::NameFilter getNameFilter() const + { + return name_filter; + } + + const String & getFileName() + { + if (files.size() != 1) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected only 1 filename but got {}", files.size()); + + return files[0]; + } private: std::vector files; + std::vector archives; - std::vector> files_in_archive; + IArchiveReader::NameFilter name_filter; + std::atomic index = 0; }; @@ -863,25 +874,64 @@ public: { if (files_iterator->fromArchive()) { - auto [archive, filename] = files_iterator->nextFileFromArchive(); - if (archive.empty()) - return {}; - - current_path = std::move(filename); - - if (!archive_reader || archive_reader->getPath() != archive) + if (files_iterator->readSingleFileFromArchive()) { + auto archive = files_iterator->next(); + if (archive.empty()) + return {}; + + struct stat file_stat = getFileStat(archive, storage->use_table_fd, storage->table_fd, storage->getName()); + if (context->getSettingsRef().engine_file_skip_empty_files && file_stat.st_size == 0) + continue; + archive_reader = createArchiveReader(archive); - file_enumerator = archive_reader->firstFile(); + current_path = files_iterator->getFileName(); + read_buf = archive_reader->readFile(current_path, /*throw_on_not_found=*/false); + if (!read_buf) + continue; } - - if (file_enumerator == nullptr) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Failed to find a file in archive {}", archive); - - while (file_enumerator->getFileName() != current_path) + else { - if (!file_enumerator->nextFile()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected file {} is missing from archive {}", current_path, archive); + auto name_filter = files_iterator->getNameFilter(); + chassert(name_filter); + while (true) + { + if (file_enumerator == nullptr) + { + auto archive = 
files_iterator->next(); + if (archive.empty()) + return {}; + + struct stat file_stat = getFileStat(archive, storage->use_table_fd, storage->table_fd, storage->getName()); + if (context->getSettingsRef().engine_file_skip_empty_files && file_stat.st_size == 0) + continue; + + archive_reader = createArchiveReader(archive); + file_enumerator = archive_reader->firstFile(); + continue; + } + + bool file_found = true; + while (!name_filter(file_enumerator->getFileName())) + { + if (!file_enumerator->nextFile()) + { + file_found = false; + break; + } + } + + if (file_found) + { + current_path = file_enumerator->getFileName(); + break; + } + + file_enumerator = nullptr; + } + + chassert(file_enumerator); + read_buf = archive_reader->readFile(std::move(file_enumerator)); } } else @@ -903,23 +953,12 @@ public: if (!read_buf) { struct stat file_stat; - if (archive_reader == nullptr) - { - file_stat = getFileStat(current_path, storage->use_table_fd, storage->table_fd, storage->getName()); + file_stat = getFileStat(current_path, storage->use_table_fd, storage->table_fd, storage->getName()); - if (context->getSettingsRef().engine_file_skip_empty_files && file_stat.st_size == 0) - continue; - } + if (context->getSettingsRef().engine_file_skip_empty_files && file_stat.st_size == 0) + continue; - if (archive_reader == nullptr) - { - read_buf = createReadBuffer(current_path, file_stat, storage->use_table_fd, storage->table_fd, storage->compression_method, context); - } - else - { - chassert(file_enumerator); - read_buf = archive_reader->readFile(std::move(file_enumerator)); - } + read_buf = createReadBuffer(current_path, file_stat, storage->use_table_fd, storage->table_fd, storage->compression_method, context); } const Settings & settings = context->getSettingsRef(); @@ -987,10 +1026,10 @@ public: pipeline.reset(); input_format.reset(); - if (archive_reader != nullptr) + if (files_iterator->fromArchive() && !files_iterator->readSingleFileFromArchive()) file_enumerator = archive_reader->nextFile(std::move(read_buf)); - else - read_buf.reset(); + + read_buf.reset(); } return {}; @@ -1050,9 +1089,7 @@ Pipe StorageFile::read( } } - std::vector> files_in_archive; - - size_t files_in_archive_num = 0; + IArchiveReader::NameFilter filter; if (!paths_to_archive.empty()) { if (paths.size() != 1) @@ -1060,7 +1097,6 @@ Pipe StorageFile::read( const auto & path = paths[0]; - IArchiveReader::NameFilter filter; if (path.find_first_of("*?{") != std::string::npos) { auto matcher = std::make_shared(makeRegexpPatternFromGlobs(path)); @@ -1073,32 +1109,14 @@ Pipe StorageFile::read( return re2::RE2::FullMatch(p, *matcher); }; } - - for (size_t i = 0; i < paths_to_archive.size(); ++i) - { - if (filter) - { - const auto & path_to_archive = paths_to_archive[i]; - auto archive_reader = createArchiveReader(path_to_archive); - auto files = archive_reader->getAllFiles(filter); - for (auto & file : files) - files_in_archive.push_back({i, std::move(file)}); - } - else - { - files_in_archive.push_back({i, path}); - } - } - - files_in_archive_num = files_in_archive.size(); } - auto files_iterator = std::make_shared(paths, paths_to_archive, std::move(files_in_archive)); + auto files_iterator = std::make_shared(paths, paths_to_archive, std::move(filter)); auto this_ptr = std::static_pointer_cast(shared_from_this()); size_t num_streams = max_num_streams; - auto files_to_read = std::max(files_in_archive_num, paths.size()); + auto files_to_read = std::max(paths_to_archive.size(), paths.size()); if (max_num_streams > files_to_read) num_streams 
= files_to_read; diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index ad9013d9f13..2aab8587ad8 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -919,7 +919,7 @@ MergeMutateSelectedEntryPtr StorageMergeTree::selectPartsToMerge( if (getCurrentMutationVersion(left, lock) != getCurrentMutationVersion(right, lock)) { - disable_reason = "Some parts have differ mmutatuon version"; + disable_reason = "Some parts have different mutation version"; return false; } diff --git a/tests/queries/0_stateless/02661_read_from_archive.lib b/tests/queries/0_stateless/02661_read_from_archive.lib index 0a015306282..88b2c82f704 100644 --- a/tests/queries/0_stateless/02661_read_from_archive.lib +++ b/tests/queries/0_stateless/02661_read_from_archive.lib @@ -16,33 +16,35 @@ function read_archive_file() { function run_archive_test() { $CLICKHOUSE_CLIENT --query "DROP TABLE IF EXISTS 02661_archive_table" + FILE_PREFIX="${CLICKHOUSE_TEST_UNIQUE_NAME}_$1_" + user_files_path=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') - echo -e "1,2\n3,4" > ${CLICKHOUSE_TEST_UNIQUE_NAME}_data1.csv - echo -e "5,6\n7,8" > ${CLICKHOUSE_TEST_UNIQUE_NAME}_data2.csv - echo -e "9,10\n11,12" > ${CLICKHOUSE_TEST_UNIQUE_NAME}_data3.csv + echo -e "1,2\n3,4" > ${FILE_PREFIX}_data1.csv + echo -e "5,6\n7,8" > ${FILE_PREFIX}_data2.csv + echo -e "9,10\n11,12" > ${FILE_PREFIX}_data3.csv - eval "$2 ${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}_archive1.$1 ${CLICKHOUSE_TEST_UNIQUE_NAME}_data1.csv ${CLICKHOUSE_TEST_UNIQUE_NAME}_data2.csv > /dev/null" - eval "$2 ${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}_archive2.$1 ${CLICKHOUSE_TEST_UNIQUE_NAME}_data1.csv ${CLICKHOUSE_TEST_UNIQUE_NAME}_data3.csv > /dev/null" - eval "$2 ${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}_archive3.$1 ${CLICKHOUSE_TEST_UNIQUE_NAME}_data2.csv ${CLICKHOUSE_TEST_UNIQUE_NAME}_data3.csv > /dev/null" + eval "$2 ${user_files_path}/${FILE_PREFIX}_archive1.$1 ${FILE_PREFIX}_data1.csv ${FILE_PREFIX}_data2.csv > /dev/null" + eval "$2 ${user_files_path}/${FILE_PREFIX}_archive2.$1 ${FILE_PREFIX}_data1.csv ${FILE_PREFIX}_data3.csv > /dev/null" + eval "$2 ${user_files_path}/${FILE_PREFIX}_archive3.$1 ${FILE_PREFIX}_data2.csv ${FILE_PREFIX}_data3.csv > /dev/null" echo "archive1 data1.csv" - read_archive_file "${CLICKHOUSE_TEST_UNIQUE_NAME}_archive1.$1 :: ${CLICKHOUSE_TEST_UNIQUE_NAME}_data1.csv" + read_archive_file "${FILE_PREFIX}_archive1.$1 :: ${FILE_PREFIX}_data1.csv" echo "archive{1..2} data1.csv" - read_archive_file "${CLICKHOUSE_TEST_UNIQUE_NAME}_archive{1..2}.$1 :: ${CLICKHOUSE_TEST_UNIQUE_NAME}_data1.csv" + read_archive_file "${FILE_PREFIX}_archive{1..2}.$1 :: ${FILE_PREFIX}_data1.csv" echo "archive{1,2} data{1,3}.csv" - read_archive_file "${CLICKHOUSE_TEST_UNIQUE_NAME}_archive{1,2}.$1 :: ${CLICKHOUSE_TEST_UNIQUE_NAME}_data{1,3}.csv" + read_archive_file "${FILE_PREFIX}_archive{1,2}.$1 :: ${FILE_PREFIX}_data{1,3}.csv" echo "archive3 data*.csv" - read_archive_file "${CLICKHOUSE_TEST_UNIQUE_NAME}_archive3.$1 :: ${CLICKHOUSE_TEST_UNIQUE_NAME}_data*.csv" + read_archive_file "${FILE_PREFIX}_archive3.$1 :: ${FILE_PREFIX}_data*.csv" echo "archive* *.csv" - read_archive_file "${CLICKHOUSE_TEST_UNIQUE_NAME}_archive*.$1 :: *.csv" + read_archive_file "${FILE_PREFIX}_archive*.$1 :: *.csv" echo "archive* {2..3}.csv" - read_archive_file "${CLICKHOUSE_TEST_UNIQUE_NAME}_archive*.$1 :: 
${CLICKHOUSE_TEST_UNIQUE_NAME}_data{2..3}.csv" + read_archive_file "${FILE_PREFIX}_archive*.$1 :: ${FILE_PREFIX}_data{2..3}.csv" - $CLICKHOUSE_LOCAL --query "SELECT * FROM file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}_archive1.$1::nonexistent.csv')" 2>&1 | grep -q "CANNOT_UNPACK_ARCHIVE" && echo "OK" || echo "FAIL" - $CLICKHOUSE_LOCAL --query "SELECT * FROM file('${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}_archive3.$1::{2..3}.csv')" 2>&1 | grep -q "CANNOT_UNPACK_ARCHIVE" && echo "OK" || echo "FAIL" + $CLICKHOUSE_LOCAL --query "SELECT * FROM file('${user_files_path}/${FILE_PREFIX}_archive1.$1::nonexistent.csv')" 2>&1 | grep -q "CANNOT_UNPACK_ARCHIVE" && echo "OK" || echo "FAIL" + $CLICKHOUSE_LOCAL --query "SELECT * FROM file('${user_files_path}/${FILE_PREFIX}_archive3.$1::{2..3}.csv')" 2>&1 | grep -q "CANNOT_UNPACK_ARCHIVE" && echo "OK" || echo "FAIL" - rm ${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}_archive{1..3}.$1 + rm ${user_files_path}/${FILE_PREFIX}_archive{1..3}.$1 - rm ${CLICKHOUSE_TEST_UNIQUE_NAME}_data{1..3}.csv + rm ${FILE_PREFIX}_data{1..3}.csv } \ No newline at end of file diff --git a/tests/queries/0_stateless/02661_read_from_archive_tarbzip2.reference b/tests/queries/0_stateless/02661_read_from_archive_tarbzip2.reference new file mode 100644 index 00000000000..27edb5536ad --- /dev/null +++ b/tests/queries/0_stateless/02661_read_from_archive_tarbzip2.reference @@ -0,0 +1,116 @@ +archive1 data1.csv +1 2 +3 4 +1 2 +3 4 +1 2 +3 4 +archive{1..2} data1.csv +1 2 +1 2 +3 4 +3 4 +1 2 +1 2 +3 4 +3 4 +1 2 +1 2 +3 4 +3 4 +archive{1,2} data{1,3}.csv +1 2 +1 2 +3 4 +3 4 +9 10 +11 12 +1 2 +1 2 +3 4 +3 4 +9 10 +11 12 +1 2 +1 2 +3 4 +3 4 +9 10 +11 12 +archive3 data*.csv +5 6 +7 8 +9 10 +11 12 +5 6 +7 8 +9 10 +11 12 +5 6 +7 8 +9 10 +11 12 +archive* *.csv +1 2 +1 2 +3 4 +3 4 +5 6 +5 6 +7 8 +7 8 +9 10 +9 10 +11 12 +11 12 +1 2 +1 2 +3 4 +3 4 +5 6 +5 6 +7 8 +7 8 +9 10 +9 10 +11 12 +11 12 +1 2 +1 2 +3 4 +3 4 +5 6 +5 6 +7 8 +7 8 +9 10 +9 10 +11 12 +11 12 +archive* {2..3}.csv +5 6 +5 6 +7 8 +7 8 +9 10 +9 10 +11 12 +11 12 +5 6 +5 6 +7 8 +7 8 +9 10 +9 10 +11 12 +11 12 +5 6 +5 6 +7 8 +7 8 +9 10 +9 10 +11 12 +11 12 +OK +OK diff --git a/tests/queries/0_stateless/02661_read_from_archive_tarbzip2.sh b/tests/queries/0_stateless/02661_read_from_archive_tarbzip2.sh new file mode 100755 index 00000000000..4c3763629f4 --- /dev/null +++ b/tests/queries/0_stateless/02661_read_from_archive_tarbzip2.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash +# Tags: no-fasttest, long + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +# shellcheck source=./02661_read_from_archive.lib +. 
"$CUR_DIR"/02661_read_from_archive.lib + +run_archive_test "tar.bz2" "tar -cjf" \ No newline at end of file diff --git a/tests/queries/0_stateless/02661_read_from_archive_tarxz.reference b/tests/queries/0_stateless/02661_read_from_archive_tarxz.reference new file mode 100644 index 00000000000..27edb5536ad --- /dev/null +++ b/tests/queries/0_stateless/02661_read_from_archive_tarxz.reference @@ -0,0 +1,116 @@ +archive1 data1.csv +1 2 +3 4 +1 2 +3 4 +1 2 +3 4 +archive{1..2} data1.csv +1 2 +1 2 +3 4 +3 4 +1 2 +1 2 +3 4 +3 4 +1 2 +1 2 +3 4 +3 4 +archive{1,2} data{1,3}.csv +1 2 +1 2 +3 4 +3 4 +9 10 +11 12 +1 2 +1 2 +3 4 +3 4 +9 10 +11 12 +1 2 +1 2 +3 4 +3 4 +9 10 +11 12 +archive3 data*.csv +5 6 +7 8 +9 10 +11 12 +5 6 +7 8 +9 10 +11 12 +5 6 +7 8 +9 10 +11 12 +archive* *.csv +1 2 +1 2 +3 4 +3 4 +5 6 +5 6 +7 8 +7 8 +9 10 +9 10 +11 12 +11 12 +1 2 +1 2 +3 4 +3 4 +5 6 +5 6 +7 8 +7 8 +9 10 +9 10 +11 12 +11 12 +1 2 +1 2 +3 4 +3 4 +5 6 +5 6 +7 8 +7 8 +9 10 +9 10 +11 12 +11 12 +archive* {2..3}.csv +5 6 +5 6 +7 8 +7 8 +9 10 +9 10 +11 12 +11 12 +5 6 +5 6 +7 8 +7 8 +9 10 +9 10 +11 12 +11 12 +5 6 +5 6 +7 8 +7 8 +9 10 +9 10 +11 12 +11 12 +OK +OK diff --git a/tests/queries/0_stateless/02661_read_from_archive_tarxz.sh b/tests/queries/0_stateless/02661_read_from_archive_tarxz.sh new file mode 100755 index 00000000000..b8ee5bc46d2 --- /dev/null +++ b/tests/queries/0_stateless/02661_read_from_archive_tarxz.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash +# Tags: no-fasttest, long + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +# shellcheck source=./02661_read_from_archive.lib +. "$CUR_DIR"/02661_read_from_archive.lib + +run_archive_test "tar.xz" "tar -cJf" \ No newline at end of file diff --git a/tests/queries/0_stateless/02661_read_from_archive_tzst.reference b/tests/queries/0_stateless/02661_read_from_archive_tzst.reference new file mode 100644 index 00000000000..27edb5536ad --- /dev/null +++ b/tests/queries/0_stateless/02661_read_from_archive_tzst.reference @@ -0,0 +1,116 @@ +archive1 data1.csv +1 2 +3 4 +1 2 +3 4 +1 2 +3 4 +archive{1..2} data1.csv +1 2 +1 2 +3 4 +3 4 +1 2 +1 2 +3 4 +3 4 +1 2 +1 2 +3 4 +3 4 +archive{1,2} data{1,3}.csv +1 2 +1 2 +3 4 +3 4 +9 10 +11 12 +1 2 +1 2 +3 4 +3 4 +9 10 +11 12 +1 2 +1 2 +3 4 +3 4 +9 10 +11 12 +archive3 data*.csv +5 6 +7 8 +9 10 +11 12 +5 6 +7 8 +9 10 +11 12 +5 6 +7 8 +9 10 +11 12 +archive* *.csv +1 2 +1 2 +3 4 +3 4 +5 6 +5 6 +7 8 +7 8 +9 10 +9 10 +11 12 +11 12 +1 2 +1 2 +3 4 +3 4 +5 6 +5 6 +7 8 +7 8 +9 10 +9 10 +11 12 +11 12 +1 2 +1 2 +3 4 +3 4 +5 6 +5 6 +7 8 +7 8 +9 10 +9 10 +11 12 +11 12 +archive* {2..3}.csv +5 6 +5 6 +7 8 +7 8 +9 10 +9 10 +11 12 +11 12 +5 6 +5 6 +7 8 +7 8 +9 10 +9 10 +11 12 +11 12 +5 6 +5 6 +7 8 +7 8 +9 10 +9 10 +11 12 +11 12 +OK +OK diff --git a/tests/queries/0_stateless/02661_read_from_archive_tzst.sh b/tests/queries/0_stateless/02661_read_from_archive_tzst.sh new file mode 100755 index 00000000000..b4145e0d1d0 --- /dev/null +++ b/tests/queries/0_stateless/02661_read_from_archive_tzst.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash +# Tags: no-fasttest, long + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +# shellcheck source=./02661_read_from_archive.lib +. 
"$CUR_DIR"/02661_read_from_archive.lib + +run_archive_test "tzst" "tar -caf" \ No newline at end of file From 88182b55357f5426638d8f26c1d88168d2a86680 Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Wed, 9 Aug 2023 13:29:45 +0000 Subject: [PATCH 17/74] Fix cluster configuration --- .../configs/remote_servers.xml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/integration/test_parallel_replicas_custom_key/configs/remote_servers.xml b/tests/integration/test_parallel_replicas_custom_key/configs/remote_servers.xml index 308db461498..d5b027072ef 100644 --- a/tests/integration/test_parallel_replicas_custom_key/configs/remote_servers.xml +++ b/tests/integration/test_parallel_replicas_custom_key/configs/remote_servers.xml @@ -2,7 +2,7 @@ - false + true n1 9000 @@ -13,7 +13,7 @@ - false + true n3 9000 @@ -26,7 +26,7 @@ - false + true n1 9000 From 5e7a438631a6114112cec08a3fd0fe33673e1c1f Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Wed, 9 Aug 2023 19:33:45 +0000 Subject: [PATCH 18/74] Fix cluster configs correct internal replication flag --- .../configs/remote_servers.xml | 6 +++--- .../configs/remote_servers.xml | 3 +++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/integration/test_parallel_replicas_custom_key/configs/remote_servers.xml b/tests/integration/test_parallel_replicas_custom_key/configs/remote_servers.xml index d5b027072ef..308db461498 100644 --- a/tests/integration/test_parallel_replicas_custom_key/configs/remote_servers.xml +++ b/tests/integration/test_parallel_replicas_custom_key/configs/remote_servers.xml @@ -2,7 +2,7 @@ - true + false n1 9000 @@ -13,7 +13,7 @@ - true + false n3 9000 @@ -26,7 +26,7 @@ - true + false n1 9000 diff --git a/tests/integration/test_parallel_replicas_over_distributed/configs/remote_servers.xml b/tests/integration/test_parallel_replicas_over_distributed/configs/remote_servers.xml index bc39adeb481..d5b027072ef 100644 --- a/tests/integration/test_parallel_replicas_over_distributed/configs/remote_servers.xml +++ b/tests/integration/test_parallel_replicas_over_distributed/configs/remote_servers.xml @@ -2,6 +2,7 @@ + true n1 9000 @@ -12,6 +13,7 @@ + true n3 9000 @@ -24,6 +26,7 @@ + true n1 9000 From c65f3d1eb870b0f5ce4272ee699edda6bed1423d Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Thu, 10 Aug 2023 10:09:59 +0000 Subject: [PATCH 19/74] Mutex for name filter --- src/Storages/StorageFile.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index a0dc98d4312..0c720f2d7da 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -747,9 +747,10 @@ public: return !name_filter; } - IArchiveReader::NameFilter getNameFilter() const + bool passesFilter(const std::string & name) const { - return name_filter; + std::lock_guard lock(filter_mutex); + return name_filter(name); } const String & getFileName() @@ -763,6 +764,7 @@ public: std::vector files; std::vector archives; + mutable std::mutex filter_mutex; IArchiveReader::NameFilter name_filter; std::atomic index = 0; @@ -892,8 +894,6 @@ public: } else { - auto name_filter = files_iterator->getNameFilter(); - chassert(name_filter); while (true) { if (file_enumerator == nullptr) @@ -912,7 +912,7 @@ public: } bool file_found = true; - while (!name_filter(file_enumerator->getFileName())) + while (!files_iterator->passesFilter(file_enumerator->getFileName())) { if (!file_enumerator->nextFile()) { From bb38918a263dd59307c463bf038ebf0c4d28d184 Mon Sep 17 
00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Thu, 10 Aug 2023 13:21:11 +0200 Subject: [PATCH 20/74] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: JΓ‘nos Benjamin Antal --- src/Processors/Formats/Impl/CSVRowInputFormat.cpp | 2 +- src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp | 2 +- src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp index 9092c7fceba..52f9571f962 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp @@ -417,7 +417,7 @@ std::optional, DataTypes>> CSVSchemaReader::readRo auto fields = reader.readRow(); auto data_types = tryInferDataTypesByEscapingRule(fields, format_settings, FormatSettings::EscapingRule::CSV); - return std::make_pair(fields, data_types); + return std::make_pair(std::move(fields), std::move(data_types)); } std::optional CSVSchemaReader::readRowAndGetDataTypesImpl() diff --git a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp index ff3d6d49199..17cc88425f5 100644 --- a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp @@ -388,7 +388,7 @@ std::optional, DataTypes>> CustomSeparatedSchemaRe auto fields = reader.readRow(); auto data_types = tryInferDataTypesByEscapingRule(fields, reader.getFormatSettings(), reader.getEscapingRule(), &json_inference_info); - return std::make_pair(fields, data_types); + return std::make_pair(std::move(fields), std::move(data_types)); } std::optional CustomSeparatedSchemaReader::readRowAndGetDataTypesImpl() diff --git a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp index 4000bd14ddc..fc2b5cd8207 100644 --- a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp +++ b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp @@ -425,7 +425,7 @@ void FormatWithNamesAndTypesSchemaReader::tryDetectHeader(std::vector & if (!first_row) return; - auto [first_row_values, first_row_types] = *first_row; + const auto & [first_row_values, first_row_types] = *first_row; /// The first row contains non String elements, it cannot be a header. 
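// ---------------------------------------------------------------------------
// [Editorial sketch - not part of the patch] The two hunks above only replace
// copying structured bindings with const references. For orientation, the
// header-detection checks they sit in (checkIfAllTypesAreString /
// checkIfAllValuesAreTypeNames in tryDetectHeader) boil down to roughly the
// following; the standalone types and signatures below are illustrative
// assumptions, not the real ClickHouse helpers.
#include <string>
#include <vector>

struct HeaderGuess
{
    bool first_row_can_be_names = false;
    bool second_row_can_be_types = false;
};

HeaderGuess guessHeader(
    const std::vector<bool> & first_row_fields_are_strings,
    const std::vector<bool> & second_row_fields_are_strings,
    const std::vector<std::string> & second_row_values,
    bool (*is_valid_type_name)(const std::string &))
{
    auto all = [](const std::vector<bool> & v)
    {
        for (bool b : v)
            if (!b)
                return false;
        return true;
    };

    HeaderGuess guess;

    // The first row contains non-String elements => it cannot be a header at all.
    guess.first_row_can_be_names = all(first_row_fields_are_strings);
    if (!guess.first_row_can_be_names)
        return guess;

    // The second row is all Strings and every value names a valid type => likely a types row.
    guess.second_row_can_be_types = all(second_row_fields_are_strings);
    for (const auto & value : second_row_values)
        guess.second_row_can_be_types = guess.second_row_can_be_types && is_valid_type_name(value);

    return guess;
}
// ---------------------------------------------------------------------------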
if (!checkIfAllTypesAreString(first_row_types)) @@ -443,7 +443,7 @@ void FormatWithNamesAndTypesSchemaReader::tryDetectHeader(std::vector & return; } - auto [second_row_values, second_row_types] = *second_row; + const auto & [second_row_values, second_row_types] = *second_row; DataTypes data_types; bool second_row_can_be_type_names = checkIfAllTypesAreString(second_row_types) && checkIfAllValuesAreTypeNames(readNamesFromFields(second_row_values)); From 6eb6c8a320c919ad33d5e9b1b5f7093eb071be1c Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Thu, 10 Aug 2023 11:43:26 +0000 Subject: [PATCH 21/74] Fix: correct execution over cluster with multiple shards respect shard number during parallel replicas query execution --- .../ClusterProxy/executeQuery.cpp | 25 ++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/src/Interpreters/ClusterProxy/executeQuery.cpp b/src/Interpreters/ClusterProxy/executeQuery.cpp index e4a48fdcac0..fbc7bbd5bbb 100644 --- a/src/Interpreters/ClusterProxy/executeQuery.cpp +++ b/src/Interpreters/ClusterProxy/executeQuery.cpp @@ -270,7 +270,28 @@ void executeQueryWithParallelReplicas( const ClusterPtr & not_optimized_cluster) { const auto & settings = context->getSettingsRef(); - ClusterPtr new_cluster = not_optimized_cluster->getClusterWithReplicasAsShards(settings); + auto new_context = Context::createCopy(context); + auto scalars = new_context->hasQueryContext() ? new_context->getQueryContext()->getScalars() : Scalars{}; + + Int64 shard_num = 0; /// shard_num is 1-based, so 0 - no shard specified + auto it = scalars.find("_shard_num"); + if (it != scalars.end()) + { + const Block & block = it->second; + shard_num = block.getColumns()[0]->get64(0); + } + + ClusterPtr new_cluster; + /// if got valid shard_num from query initiator, then parallel replicas scope is the specified shard + /// shards are numbered in order of appearance in the cluster config + if (shard_num > 0) + { + LOG_DEBUG(&Poco::Logger::get("executeQueryWithParallelReplicas"), "Parallel replicas query in shard scope: shard_num={}", shard_num); + /// shard_num is 1-based, but getClusterWithSingleShard expects 0-based index + new_cluster = not_optimized_cluster->getClusterWithSingleShard(shard_num - 1); + } + else + new_cluster = not_optimized_cluster->getClusterWithReplicasAsShards(settings); auto all_replicas_count = std::min(static_cast(settings.max_parallel_replicas), new_cluster->getShardCount()); auto coordinator = std::make_shared(all_replicas_count); @@ -284,8 +305,6 @@ void executeQueryWithParallelReplicas( /// to then tell it about the reading method we chose. query_info.coordinator = coordinator; - auto new_context = Context::createCopy(context); - auto scalars = new_context->hasQueryContext() ? 
new_context->getQueryContext()->getScalars() : Scalars{}; auto external_tables = new_context->getExternalTables(); auto read_from_remote = std::make_unique( From 82aff97dd04605233371c9c6de1e59933961cb78 Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 10 Aug 2023 11:51:36 +0000 Subject: [PATCH 22/74] Add comment, more test --- src/Processors/Formats/RowInputFormatWithNamesAndTypes.h | 1 + ...2834_formats_with_variable_number_of_columns.reference | 8 ++++++++ .../02834_formats_with_variable_number_of_columns.sql | 2 ++ 3 files changed, 11 insertions(+) diff --git a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h index 7b3e2cbea67..377341da685 100644 --- a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h +++ b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h @@ -119,6 +119,7 @@ public: /// Check suffix. virtual bool checkForSuffix() { return in->eof(); } + /// Check if we are at the end of row, not between fields. virtual bool checkForEndOfRow() { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method checkForEndOfRow is not implemented"); } virtual bool allowVariableNumberOfColumns() const { return false; } diff --git a/tests/queries/0_stateless/02834_formats_with_variable_number_of_columns.reference b/tests/queries/0_stateless/02834_formats_with_variable_number_of_columns.reference index e9ff548e05c..50173c150c0 100644 --- a/tests/queries/0_stateless/02834_formats_with_variable_number_of_columns.reference +++ b/tests/queries/0_stateless/02834_formats_with_variable_number_of_columns.reference @@ -37,6 +37,14 @@ JSONCompactEachRow 2 0 0 0 3 3 +1 1 +2 0 +0 0 +3 3 +1 [1,2,3] +2 [] +0 [] +3 [3] 1 1 \N \N 2 \N \N \N \N \N \N \N diff --git a/tests/queries/0_stateless/02834_formats_with_variable_number_of_columns.sql b/tests/queries/0_stateless/02834_formats_with_variable_number_of_columns.sql index dea4c20db8a..7c55cf2e9a7 100644 --- a/tests/queries/0_stateless/02834_formats_with_variable_number_of_columns.sql +++ b/tests/queries/0_stateless/02834_formats_with_variable_number_of_columns.sql @@ -10,6 +10,8 @@ select * from format(TSVWithNames, 'x\ty\n1\t1\n2\n\n3\t3\t3\t3') settings input select * from format(TSVWithNames, 'x UInt32, z UInt32', 'x\ty\n1\t1\n2\n\n3\t3\t3\t3') settings input_format_tsv_allow_variable_number_of_columns=1; select 'JSONCompactEachRow'; select * from format(JSONCompactEachRow, 'x UInt32, y UInt32', '[1,1]\n[2]\n[]\n[3,3,3,3]') settings input_format_json_compact_allow_variable_number_of_columns=1; +select * from format(JSONCompactEachRow, 'x UInt32, y UInt32', '[1,1,[1,2,3]]\n[2]\n[]\n[3,3,3,3,[1,2,3]]') settings input_format_json_compact_allow_variable_number_of_columns=1; +select * from format(JSONCompactEachRow, 'x UInt32, y Array(UInt32)', '[1,[1,2,3],1]\n[2]\n[]\n[3,[3],3,3,[1,2,3]]') settings input_format_json_compact_allow_variable_number_of_columns=1; select * from format(JSONCompactEachRow, '[1,1]\n[2]\n[]\n[3,3,3,3]') settings input_format_json_compact_allow_variable_number_of_columns=1; select * from format(JSONCompactEachRowWithNames, '["x","y"]\n[1,1]\n[2]\n[]\n[3,3,3,3]') settings input_format_json_compact_allow_variable_number_of_columns=1; select * from format(JSONCompactEachRowWithNames, 'x UInt32, z UInt32', '["x","y"]\n[1,1]\n[2]\n[]\n[3,3,3,3]') settings input_format_json_compact_allow_variable_number_of_columns=1; From bcc0fbbf9150de48fdec116f62c0664b84c13c9f Mon Sep 17 00:00:00 2001 From: Nikolay Degterinsky Date: Thu, 10 Aug 2023 16:10:33 +0000 Subject: [PATCH 23/74] 
Add EXCEPT clause to SYSTEM STOP LISTEN query --- docs/en/sql-reference/statements/system.md | 6 +- programs/server/Server.cpp | 6 +- src/Parsers/ASTSystemQuery.cpp | 48 +++++++++-- src/Parsers/ParserSystemQuery.cpp | 81 ++++++++++++++----- src/Server/ServerType.cpp | 58 +++++++++---- src/Server/ServerType.h | 18 ++++- .../test_system_start_stop_listen/test.py | 70 ++++++++++++++++ 7 files changed, 238 insertions(+), 49 deletions(-) diff --git a/docs/en/sql-reference/statements/system.md b/docs/en/sql-reference/statements/system.md index 59970dbeccd..766dd2348ee 100644 --- a/docs/en/sql-reference/statements/system.md +++ b/docs/en/sql-reference/statements/system.md @@ -443,9 +443,9 @@ SYSTEM STOP LISTEN [ON CLUSTER cluster_name] [QUERIES ALL | QUERIES DEFAULT | QU ``` - If `CUSTOM 'protocol'` modifier is specified, the custom protocol with the specified name defined in the protocols section of the server configuration will be stopped. -- If `QUERIES ALL` modifier is specified, all protocols are stopped. -- If `QUERIES DEFAULT` modifier is specified, all default protocols are stopped. -- If `QUERIES CUSTOM` modifier is specified, all custom protocols are stopped. +- If `QUERIES ALL [EXCEPT .. [,..]]` modifier is specified, all protocols are stopped, unless specified with `EXCEPT` clause. +- If `QUERIES DEFAULT [EXCEPT .. [,..]]` modifier is specified, all default protocols are stopped, unless specified with `EXCEPT` clause. +- If `QUERIES CUSTOM [EXCEPT .. [,..]]` modifier is specified, all custom protocols are stopped, unless specified with `EXCEPT` clause. ### SYSTEM START LISTEN diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index e6d5837dd0e..bdff3b79a99 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -2045,6 +2045,9 @@ void Server::createServers( for (const auto & protocol : protocols) { + if (!server_type.shouldStart(ServerType::Type::CUSTOM, protocol)) + continue; + std::string prefix = "protocols." + protocol + "."; std::string port_name = prefix + "port"; std::string description {" protocol"}; @@ -2054,9 +2057,6 @@ void Server::createServers( if (!config.has(prefix + "port")) continue; - if (!server_type.shouldStart(ServerType::Type::CUSTOM, port_name)) - continue; - std::vector hosts; if (config.has(prefix + "host")) hosts.push_back(config.getString(prefix + "host")); diff --git a/src/Parsers/ASTSystemQuery.cpp b/src/Parsers/ASTSystemQuery.cpp index fb10474a4d4..9be01719d8c 100644 --- a/src/Parsers/ASTSystemQuery.cpp +++ b/src/Parsers/ASTSystemQuery.cpp @@ -204,7 +204,7 @@ void ASTSystemQuery::formatImpl(const FormatSettings & settings, FormatState &, } else if (type == Type::SUSPEND) { - settings.ostr << (settings.hilite ? hilite_keyword : "") << " FOR " + settings.ostr << (settings.hilite ? hilite_keyword : "") << " FOR " << (settings.hilite ? hilite_none : "") << seconds << (settings.hilite ? hilite_keyword : "") << " SECOND" << (settings.hilite ? hilite_none : ""); @@ -232,12 +232,50 @@ void ASTSystemQuery::formatImpl(const FormatSettings & settings, FormatState &, } else if (type == Type::START_LISTEN || type == Type::STOP_LISTEN) { - settings.ostr << (settings.hilite ? hilite_keyword : "") << " " << ServerType::serverTypeToString(server_type.type) - << (settings.hilite ? hilite_none : ""); + settings.ostr << (settings.hilite ? hilite_keyword : "") << " " + << ServerType::serverTypeToString(server_type.type) << (settings.hilite ? 
hilite_none : ""); - if (server_type.type == ServerType::CUSTOM) + if (server_type.type == ServerType::Type::CUSTOM) { - settings.ostr << (settings.hilite ? hilite_identifier : "") << " " << backQuoteIfNeed(server_type.custom_name); + settings.ostr << " " << quoteString(server_type.custom_name); + } + + bool comma = false; + + if (!server_type.exclude_types.empty()) + { + settings.ostr << (settings.hilite ? hilite_keyword : "") + << " EXCEPT" << (settings.hilite ? hilite_none : ""); + + for (auto cur_type : server_type.exclude_types) + { + if (cur_type == ServerType::Type::CUSTOM) + continue; + + if (comma) + settings.ostr << ","; + else + comma = true; + + settings.ostr << (settings.hilite ? hilite_keyword : "") << " " + << ServerType::serverTypeToString(cur_type) << (settings.hilite ? hilite_none : ""); + } + + if (server_type.exclude_types.contains(ServerType::Type::CUSTOM)) + { + for (const auto & cur_name : server_type.exclude_custom_names) + { + if (comma) + settings.ostr << ","; + else + comma = true; + + settings.ostr << (settings.hilite ? hilite_keyword : "") << " " + << ServerType::serverTypeToString(ServerType::Type::CUSTOM) << (settings.hilite ? hilite_none : ""); + + settings.ostr << " " << quoteString(cur_name); + } + } } } diff --git a/src/Parsers/ParserSystemQuery.cpp b/src/Parsers/ParserSystemQuery.cpp index 40fc1acae69..ac3aa41048c 100644 --- a/src/Parsers/ParserSystemQuery.cpp +++ b/src/Parsers/ParserSystemQuery.cpp @@ -458,32 +458,71 @@ bool ParserSystemQuery::parseImpl(IParser::Pos & pos, ASTPtr & node, Expected & if (!parseQueryWithOnCluster(res, pos, expected)) return false; - ServerType::Type current_type = ServerType::Type::END; - std::string current_custom_name; - - for (const auto & type : magic_enum::enum_values()) + auto parse_server_type = [&](ServerType::Type & type, std::string & custom_name) -> bool { - if (ParserKeyword{ServerType::serverTypeToString(type)}.ignore(pos, expected)) + type = ServerType::Type::END; + custom_name = ""; + + for (const auto & cur_type : magic_enum::enum_values()) { - current_type = type; - break; + if (ParserKeyword{ServerType::serverTypeToString(cur_type)}.ignore(pos, expected)) + { + type = cur_type; + break; + } + } + + if (type == ServerType::Type::END) + return false; + + if (type == ServerType::CUSTOM) + { + ASTPtr ast; + + if (!ParserStringLiteral{}.parse(pos, ast, expected)) + return false; + + custom_name = ast->as().value.get(); + } + + return true; + }; + + ServerType::Type base_type; + std::string base_custom_name; + + ServerType::Types exclude_type; + ServerType::CustomNames exclude_custom_names; + + if (!parse_server_type(base_type, base_custom_name)) + return false; + + if (ParserKeyword{"EXCEPT"}.ignore(pos, expected)) + { + if (base_type != ServerType::Type::QUERIES_ALL && + base_type != ServerType::Type::QUERIES_DEFAULT && + base_type != ServerType::Type::QUERIES_CUSTOM) + return false; + + ServerType::Type current_type; + std::string current_custom_name; + + while (true) + { + if (!exclude_type.empty() && !ParserToken(TokenType::Comma).ignore(pos, expected)) + break; + + if (!parse_server_type(current_type, current_custom_name)) + return false; + + exclude_type.insert(current_type); + + if (current_type == ServerType::Type::CUSTOM) + exclude_custom_names.insert(current_custom_name); } } - if (current_type == ServerType::Type::END) - return false; - - if (current_type == ServerType::CUSTOM) - { - ASTPtr ast; - - if (!ParserStringLiteral{}.parse(pos, ast, expected)) - return false; - - current_custom_name = 
ast->as().value.get(); - } - - res->server_type = ServerType(current_type, current_custom_name); + res->server_type = ServerType(base_type, base_custom_name, exclude_type, exclude_custom_names); break; } diff --git a/src/Server/ServerType.cpp b/src/Server/ServerType.cpp index 4952cd1bd24..4199a5fd042 100644 --- a/src/Server/ServerType.cpp +++ b/src/Server/ServerType.cpp @@ -42,12 +42,9 @@ const char * ServerType::serverTypeToString(ServerType::Type type) bool ServerType::shouldStart(Type server_type, const std::string & server_custom_name) const { - if (type == Type::QUERIES_ALL) - return true; - - if (type == Type::QUERIES_DEFAULT) + auto is_type_default = [](Type current_type) { - switch (server_type) + switch (current_type) { case Type::TCP: case Type::TCP_WITH_PROXY: @@ -64,21 +61,42 @@ bool ServerType::shouldStart(Type server_type, const std::string & server_custom default: return false; } + }; + + auto is_type_custom = [](Type current_type) + { + return current_type == Type::CUSTOM; + }; + + if (exclude_types.contains(Type::QUERIES_ALL)) + return false; + + if (exclude_types.contains(Type::QUERIES_DEFAULT) && is_type_default(server_type)) + return false; + + if (exclude_types.contains(Type::QUERIES_CUSTOM) && is_type_custom(server_type)) + return false; + + if (exclude_types.contains(server_type)) + { + if (server_type != Type::CUSTOM) + return false; + + if (exclude_custom_names.contains(server_custom_name)) + return false; } + if (type == Type::QUERIES_ALL) + return true; + + if (type == Type::QUERIES_DEFAULT) + return is_type_default(server_type); + if (type == Type::QUERIES_CUSTOM) - { - switch (server_type) - { - case Type::CUSTOM: - return true; - default: - return false; - } - } + return is_type_custom(server_type); if (type == Type::CUSTOM) - return server_type == type && server_custom_name == "protocols." 
+ custom_name + ".port"; + return server_type == type && server_custom_name == custom_name; return server_type == type; } @@ -86,6 +104,7 @@ bool ServerType::shouldStart(Type server_type, const std::string & server_custom bool ServerType::shouldStop(const std::string & port_name) const { Type port_type; + std::string port_custom_name; if (port_name == "http_port") port_type = Type::HTTP; @@ -121,12 +140,19 @@ bool ServerType::shouldStop(const std::string & port_name) const port_type = Type::INTERSERVER_HTTPS; else if (port_name.starts_with("protocols.") && port_name.ends_with(".port")) + { port_type = Type::CUSTOM; + constexpr size_t protocols_size = std::string_view("protocols.").size(); + constexpr size_t ports_size = std::string_view(".ports").size(); + + port_custom_name = port_name.substr(protocols_size, port_name.size() - protocols_size - ports_size + 1); + } + else return false; - return shouldStart(port_type, port_name); + return shouldStart(port_type, port_custom_name); } } diff --git a/src/Server/ServerType.h b/src/Server/ServerType.h index 1fab492222a..bfbe692f5bd 100644 --- a/src/Server/ServerType.h +++ b/src/Server/ServerType.h @@ -1,6 +1,7 @@ #pragma once #include +#include namespace DB { @@ -28,8 +29,20 @@ public: END }; + using Types = std::unordered_set; + using CustomNames = std::unordered_set; + ServerType() = default; - explicit ServerType(Type type_, const std::string & custom_name_ = "") : type(type_), custom_name(custom_name_) {} + + explicit ServerType( + Type type_, + const std::string & custom_name_ = "", + const Types & exclude_types_ = {}, + const CustomNames exclude_custom_names_ = {}) + : type(type_), + custom_name(custom_name_), + exclude_types(exclude_types_), + exclude_custom_names(exclude_custom_names_) {} static const char * serverTypeToString(Type type); @@ -39,6 +52,9 @@ public: Type type; std::string custom_name; + + Types exclude_types; + CustomNames exclude_custom_names; }; } diff --git a/tests/integration/test_system_start_stop_listen/test.py b/tests/integration/test_system_start_stop_listen/test.py index 1925685af03..8a3081e0c15 100644 --- a/tests/integration/test_system_start_stop_listen/test.py +++ b/tests/integration/test_system_start_stop_listen/test.py @@ -143,3 +143,73 @@ def test_all_protocols(started_cluster): backup_node.query("SYSTEM START LISTEN ON CLUSTER default QUERIES ALL") assert_everything_works() + + +def test_except(started_cluster): + custom_client = Client(main_node.ip_address, 9001, command=cluster.client_bin_path) + assert_everything_works() + + # STOP LISTEN QUERIES ALL EXCEPT + main_node.query("SYSTEM STOP LISTEN QUERIES ALL EXCEPT MYSQL, CUSTOM 'tcp'") + assert "Connection refused" in main_node.query_and_get_error(QUERY) + custom_client.query(MYSQL_QUERY) + assert http_works() == False + assert http_works(8124) == False + + # START LISTEN QUERIES ALL EXCEPT + backup_node.query("SYSTEM START LISTEN ON CLUSTER default QUERIES ALL EXCEPT TCP") + assert "Connection refused" in main_node.query_and_get_error(QUERY) + custom_client.query(MYSQL_QUERY) + assert http_works() == True + assert http_works(8124) == True + backup_node.query("SYSTEM START LISTEN ON CLUSTER default QUERIES ALL") + + assert_everything_works() + + # STOP LISTEN QUERIES DEFAULT EXCEPT + main_node.query("SYSTEM STOP LISTEN QUERIES DEFAULT EXCEPT TCP") + main_node.query(QUERY) + assert "Connections to mysql failed" in custom_client.query_and_get_error( + MYSQL_QUERY + ) + custom_client.query(QUERY) + assert http_works() == False + assert http_works(8124) == 
True + + # START LISTEN QUERIES DEFAULT EXCEPT + backup_node.query( + "SYSTEM START LISTEN ON CLUSTER default QUERIES DEFAULT EXCEPT HTTP" + ) + main_node.query(QUERY) + main_node.query(MYSQL_QUERY) + custom_client.query(QUERY) + assert http_works() == False + assert http_works(8124) == True + + backup_node.query("SYSTEM START LISTEN ON CLUSTER default QUERIES ALL") + + assert_everything_works() + + # STOP LISTEN QUERIES CUSTOM EXCEPT + main_node.query("SYSTEM STOP LISTEN QUERIES CUSTOM EXCEPT CUSTOM 'tcp'") + main_node.query(QUERY) + custom_client.query(MYSQL_QUERY) + custom_client.query(QUERY) + assert http_works() == True + assert http_works(8124) == False + + main_node.query("SYSTEM STOP LISTEN QUERIES CUSTOM") + + # START LISTEN QUERIES DEFAULT EXCEPT + backup_node.query( + "SYSTEM START LISTEN ON CLUSTER default QUERIES CUSTOM EXCEPT CUSTOM 'tcp'" + ) + main_node.query(QUERY) + main_node.query(MYSQL_QUERY) + assert "Connection refused" in custom_client.query_and_get_error(QUERY) + assert http_works() == True + assert http_works(8124) == True + + backup_node.query("SYSTEM START LISTEN ON CLUSTER default QUERIES ALL") + + assert_everything_works() From 31aa9f459ed0aae4dcea3218ba790e2abd1080e3 Mon Sep 17 00:00:00 2001 From: yariks5s Date: Mon, 14 Aug 2023 09:35:29 +0000 Subject: [PATCH 24/74] minor fixes --- src/Databases/IDatabase.cpp | 7 +++---- src/Interpreters/DatabaseCatalog.cpp | 10 +++------- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/src/Databases/IDatabase.cpp b/src/Databases/IDatabase.cpp index 7d12ae6c588..09640d2f86e 100644 --- a/src/Databases/IDatabase.cpp +++ b/src/Databases/IDatabase.cpp @@ -23,11 +23,10 @@ StoragePtr IDatabase::getTable(const String & name, ContextPtr context) const return storage; TableNameHints hints(this->shared_from_this(), context); std::vector names = hints.getHints(name); - if (!names.empty()) - { + if (names.empty()) + throw Exception(ErrorCodes::UNKNOWN_TABLE, "Table {}.{} does not exist", backQuoteIfNeed(getDatabaseName()), backQuoteIfNeed(name)); + else throw Exception(ErrorCodes::UNKNOWN_TABLE, "Table {}.{} does not exist. Maybe you meant {}?", backQuoteIfNeed(getDatabaseName()), backQuoteIfNeed(name), backQuoteIfNeed(names[0])); - } - else throw Exception(ErrorCodes::UNKNOWN_TABLE, "Table {}.{} does not exist", backQuoteIfNeed(getDatabaseName()), backQuoteIfNeed(name)); } std::vector> IDatabase::getTablesForBackup(const FilterByNameFunction &, const ContextPtr &) const diff --git a/src/Interpreters/DatabaseCatalog.cpp b/src/Interpreters/DatabaseCatalog.cpp index 3baaa182d51..dad455d487b 100644 --- a/src/Interpreters/DatabaseCatalog.cpp +++ b/src/Interpreters/DatabaseCatalog.cpp @@ -341,14 +341,10 @@ DatabaseAndTable DatabaseCatalog::getTableImpl( { TableNameHints hints(this->tryGetDatabase(table_id.getDatabaseName()), getContext()); std::vector names = hints.getHints(table_id.getTableName()); - if (!names.empty()) - { - /// There is two options: first is to print just the name of the table - /// and the second is to print the result in format: db_name.table_name. I'll comment out the second option below - /// I also leave possibility to print several suggestions + if (names.empty()) + exception->emplace(Exception(ErrorCodes::UNKNOWN_TABLE, "Table {} does not exist", table_id.getNameForLogs())); + else exception->emplace(Exception(ErrorCodes::UNKNOWN_TABLE, "Table {} does not exist. 
Maybe you meant {}?", table_id.getNameForLogs(), backQuoteIfNeed(names[0]))); - } - else exception->emplace(Exception(ErrorCodes::UNKNOWN_TABLE, "Table {} does not exist", table_id.getNameForLogs())); } return {}; } From c94994afcfea3ad3948cd9bd789f584a76b782a0 Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Mon, 14 Aug 2023 22:50:46 +0000 Subject: [PATCH 25/74] Propagate setting cluster_for_parallel_replicas to shards --- src/Interpreters/Cluster.h | 2 ++ src/Interpreters/ClusterProxy/executeQuery.cpp | 9 ++++++--- src/Processors/QueryPlan/ReadFromRemote.cpp | 17 ++++++++++++++++- src/Processors/QueryPlan/ReadFromRemote.h | 6 ++++-- src/Storages/StorageMergeTree.cpp | 13 ++++++++++++- src/Storages/StorageReplicatedMergeTree.cpp | 13 ++++++++++++- .../test.py | 10 +++++----- ...02835_parallel_replicas_over_distributed.sql | 8 ++++---- 8 files changed, 61 insertions(+), 17 deletions(-) diff --git a/src/Interpreters/Cluster.h b/src/Interpreters/Cluster.h index b90acd1d576..f509a2d9847 100644 --- a/src/Interpreters/Cluster.h +++ b/src/Interpreters/Cluster.h @@ -273,6 +273,8 @@ public: /// Are distributed DDL Queries (ON CLUSTER Clause) allowed for this cluster bool areDistributedDDLQueriesAllowed() const { return allow_distributed_ddl_queries; } + String getName() const { return name; } + private: SlotToShard slot_to_shard; diff --git a/src/Interpreters/ClusterProxy/executeQuery.cpp b/src/Interpreters/ClusterProxy/executeQuery.cpp index fbc7bbd5bbb..e335df3b2c5 100644 --- a/src/Interpreters/ClusterProxy/executeQuery.cpp +++ b/src/Interpreters/ClusterProxy/executeQuery.cpp @@ -234,7 +234,8 @@ void executeQuery( std::move(external_tables), log, shards, - query_info.storage_limits); + query_info.storage_limits, + query_info.getCluster()->getName()); read_from_remote->setStepDescription("Read from remote replica"); plan->addStep(std::move(read_from_remote)); @@ -266,14 +267,16 @@ void executeQueryWithParallelReplicas( const StorageID & main_table, const ASTPtr & table_func_ptr, SelectStreamFactory & stream_factory, - const ASTPtr & query_ast, ContextPtr context, const SelectQueryInfo & query_info, + const ASTPtr & query_ast, + ContextPtr context, + const SelectQueryInfo & query_info, const ClusterPtr & not_optimized_cluster) { const auto & settings = context->getSettingsRef(); auto new_context = Context::createCopy(context); auto scalars = new_context->hasQueryContext() ? 
new_context->getQueryContext()->getScalars() : Scalars{}; - Int64 shard_num = 0; /// shard_num is 1-based, so 0 - no shard specified + UInt64 shard_num = 0; /// shard_num is 1-based, so 0 - no shard specified auto it = scalars.find("_shard_num"); if (it != scalars.end()) { diff --git a/src/Processors/QueryPlan/ReadFromRemote.cpp b/src/Processors/QueryPlan/ReadFromRemote.cpp index 576349844ec..e9cf80b0808 100644 --- a/src/Processors/QueryPlan/ReadFromRemote.cpp +++ b/src/Processors/QueryPlan/ReadFromRemote.cpp @@ -1,6 +1,7 @@ #include #include +#include #include #include #include @@ -103,7 +104,8 @@ ReadFromRemote::ReadFromRemote( Tables external_tables_, Poco::Logger * log_, UInt32 shard_count_, - std::shared_ptr storage_limits_) + std::shared_ptr storage_limits_, + const String & cluster_name_) : ISourceStep(DataStream{.header = std::move(header_)}) , shards(std::move(shards_)) , stage(stage_) @@ -116,6 +118,7 @@ ReadFromRemote::ReadFromRemote( , storage_limits(std::move(storage_limits_)) , log(log_) , shard_count(shard_count_) + , cluster_name(cluster_name_) { } @@ -234,6 +237,16 @@ void ReadFromRemote::addPipe(Pipes & pipes, const ClusterProxy::SelectStreamFact scalars["_shard_num"] = Block{{DataTypeUInt32().createColumnConst(1, shard.shard_info.shard_num), std::make_shared(), "_shard_num"}}; + if (context->getParallelReplicasMode() == Context::ParallelReplicasMode::READ_TASKS) + { + String cluster_for_parallel_replicas = cluster_name; + LOG_DEBUG(&Poco::Logger::get(__FUNCTION__), "_cluster_for_parallel_replicas: {}", cluster_for_parallel_replicas); + scalars["_cluster_for_parallel_replicas"] = Block{ + {DataTypeString().createColumnConst(1, cluster_for_parallel_replicas), + std::make_shared(), + "_cluster_for_parallel_replicas"}}; + } + std::shared_ptr remote_query_executor; remote_query_executor = std::make_shared( @@ -242,6 +255,7 @@ void ReadFromRemote::addPipe(Pipes & pipes, const ClusterProxy::SelectStreamFact remote_query_executor->setLogger(log); if (context->getParallelReplicasMode() == Context::ParallelReplicasMode::READ_TASKS) + { // when doing parallel reading from replicas (ParallelReplicasMode::READ_TASKS) on a shard: // establish a connection to a replica on the shard, the replica will instantiate coordinator to manage parallel reading from replicas on the shard. // The coordinator will return query result from the shard. 
@@ -249,6 +263,7 @@ void ReadFromRemote::addPipe(Pipes & pipes, const ClusterProxy::SelectStreamFact // Using PoolMode::GET_MANY for this mode will(can) lead to instantiation of several coordinators (depends on max_parallel_replicas setting) // each will execute parallel reading from replicas, so the query result will be multiplied by the number of created coordinators remote_query_executor->setPoolMode(PoolMode::GET_ONE); + } else remote_query_executor->setPoolMode(PoolMode::GET_MANY); diff --git a/src/Processors/QueryPlan/ReadFromRemote.h b/src/Processors/QueryPlan/ReadFromRemote.h index ac869cd89f9..27e640970ce 100644 --- a/src/Processors/QueryPlan/ReadFromRemote.h +++ b/src/Processors/QueryPlan/ReadFromRemote.h @@ -35,7 +35,8 @@ public: Tables external_tables_, Poco::Logger * log_, UInt32 shard_count_, - std::shared_ptr storage_limits_); + std::shared_ptr storage_limits_, + const String & cluster_name_); String getName() const override { return "ReadFromRemote"; } @@ -55,8 +56,9 @@ private: Tables external_tables; std::shared_ptr storage_limits; Poco::Logger * log; - UInt32 shard_count; + String cluster_name; + void addLazyPipe(Pipes & pipes, const ClusterProxy::SelectStreamFactory::Shard & shard); void addPipe(Pipes & pipes, const ClusterProxy::SelectStreamFactory::Shard & shard); }; diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index ad9013d9f13..25f64ec996d 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -220,7 +220,18 @@ void StorageMergeTree::read( local_context, query_info.query, table_id.database_name, table_id.table_name, /*remote_table_function_ptr*/nullptr); - auto cluster = local_context->getCluster(local_context->getSettingsRef().cluster_for_parallel_replicas); + auto scalars = local_context->hasQueryContext() ? local_context->getQueryContext()->getScalars() : Scalars{}; + String cluster_for_parallel_replicas = local_context->getSettingsRef().cluster_for_parallel_replicas; + { + auto it = scalars.find("_cluster_for_parallel_replicas"); + if (it != scalars.end()) + { + const Block & block = it->second; + cluster_for_parallel_replicas = block.getColumns()[0]->getDataAt(0).toString(); + } + } + LOG_DEBUG(&Poco::Logger::get("StorageMergeTree::read"), "_cluster_for_parallel_replicas: {}", cluster_for_parallel_replicas); + auto cluster = local_context->getCluster(cluster_for_parallel_replicas); Block header; diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 7fce373e26b..5513cc90fff 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -5153,7 +5153,18 @@ void StorageReplicatedMergeTree::readParallelReplicasImpl( { auto table_id = getStorageID(); - auto parallel_replicas_cluster = local_context->getCluster(local_context->getSettingsRef().cluster_for_parallel_replicas); + auto scalars = local_context->hasQueryContext() ? 
local_context->getQueryContext()->getScalars() : Scalars{}; + String cluster_for_parallel_replicas = local_context->getSettingsRef().cluster_for_parallel_replicas; + { + auto it = scalars.find("_cluster_for_parallel_replicas"); + if (it != scalars.end()) + { + const Block & block = it->second; + cluster_for_parallel_replicas = block.getColumns()[0]->getDataAt(0).toString(); + } + } + LOG_DEBUG(&Poco::Logger::get(__FUNCTION__), "_cluster_for_parallel_replicas: {}", cluster_for_parallel_replicas); + auto parallel_replicas_cluster = local_context->getCluster(cluster_for_parallel_replicas); ASTPtr modified_query_ast; Block header; diff --git a/tests/integration/test_parallel_replicas_over_distributed/test.py b/tests/integration/test_parallel_replicas_over_distributed/test.py index 5ad191af331..51ce86d828f 100644 --- a/tests/integration/test_parallel_replicas_over_distributed/test.py +++ b/tests/integration/test_parallel_replicas_over_distributed/test.py @@ -105,10 +105,10 @@ def test_parallel_replicas_over_distributed(start_cluster, cluster): expected_result = f"6001\t-1999\t1999\t0\n" # w/o parallel replicas - assert ( - node.query(f"SELECT count(), min(key), max(key), sum(key) FROM {table_name}_d") - == expected_result - ) + # assert ( + # node.query(f"SELECT count(), min(key), max(key), sum(key) FROM {table_name}_d") + # == expected_result + # ) # parallel replicas assert ( @@ -119,7 +119,7 @@ def test_parallel_replicas_over_distributed(start_cluster, cluster): "prefer_localhost_replica": 0, "max_parallel_replicas": 4, "use_hedged_requests": 0, - "cluster_for_parallel_replicas": cluster, + # "cluster_for_parallel_replicas": cluster, }, ) == expected_result diff --git a/tests/queries/0_stateless/02835_parallel_replicas_over_distributed.sql b/tests/queries/0_stateless/02835_parallel_replicas_over_distributed.sql index 4e7c72ebfd8..30dfb32678d 100644 --- a/tests/queries/0_stateless/02835_parallel_replicas_over_distributed.sql +++ b/tests/queries/0_stateless/02835_parallel_replicas_over_distributed.sql @@ -14,13 +14,13 @@ insert into test select *, today() from numbers(100); SELECT count(), min(id), max(id), avg(id) FROM test_d -SETTINGS allow_experimental_parallel_reading_from_replicas = 1, max_parallel_replicas = 3, prefer_localhost_replica = 0, parallel_replicas_for_non_replicated_merge_tree=1, use_hedged_requests=0, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost'; +SETTINGS allow_experimental_parallel_reading_from_replicas = 1, max_parallel_replicas = 3, prefer_localhost_replica = 0, parallel_replicas_for_non_replicated_merge_tree=1, use_hedged_requests=0; --, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost'; insert into test select *, today() from numbers(100); SELECT count(), min(id), max(id), avg(id) FROM test_d -SETTINGS allow_experimental_parallel_reading_from_replicas = 1, max_parallel_replicas = 3, prefer_localhost_replica = 0, parallel_replicas_for_non_replicated_merge_tree=1, use_hedged_requests=0, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost'; +SETTINGS allow_experimental_parallel_reading_from_replicas = 1, max_parallel_replicas = 3, prefer_localhost_replica = 0, parallel_replicas_for_non_replicated_merge_tree=1, use_hedged_requests=0; --, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost'; -- 2 shards @@ -38,10 +38,10 @@ insert into test2 select *, today() from numbers(100); SELECT count(), min(id), max(id), avg(id) FROM test2_d -SETTINGS 
allow_experimental_parallel_reading_from_replicas = 1, max_parallel_replicas = 3, prefer_localhost_replica = 0, parallel_replicas_for_non_replicated_merge_tree=1, use_hedged_requests=0, cluster_for_parallel_replicas = 'test_cluster_two_shard_three_replicas_localhost'; +SETTINGS allow_experimental_parallel_reading_from_replicas = 1, max_parallel_replicas = 3, prefer_localhost_replica = 0, parallel_replicas_for_non_replicated_merge_tree=1, use_hedged_requests=0; --, cluster_for_parallel_replicas = 'test_cluster_two_shard_three_replicas_localhost'; insert into test2 select *, today() from numbers(100); SELECT count(), min(id), max(id), avg(id) FROM test2_d -SETTINGS allow_experimental_parallel_reading_from_replicas = 1, max_parallel_replicas = 3, prefer_localhost_replica = 0, parallel_replicas_for_non_replicated_merge_tree=1, use_hedged_requests=0, cluster_for_parallel_replicas = 'test_cluster_two_shard_three_replicas_localhost'; +SETTINGS allow_experimental_parallel_reading_from_replicas = 1, max_parallel_replicas = 3, prefer_localhost_replica = 0, parallel_replicas_for_non_replicated_merge_tree=1, use_hedged_requests=0; --, cluster_for_parallel_replicas = 'test_cluster_two_shard_three_replicas_localhost'; From c85986a5dae9b0ff8c05992970438b36da327bd6 Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Mon, 14 Aug 2023 23:06:48 +0000 Subject: [PATCH 26/74] Update test --- .../test_parallel_replicas_over_distributed/test.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/integration/test_parallel_replicas_over_distributed/test.py b/tests/integration/test_parallel_replicas_over_distributed/test.py index 51ce86d828f..ee52c8c040e 100644 --- a/tests/integration/test_parallel_replicas_over_distributed/test.py +++ b/tests/integration/test_parallel_replicas_over_distributed/test.py @@ -105,10 +105,10 @@ def test_parallel_replicas_over_distributed(start_cluster, cluster): expected_result = f"6001\t-1999\t1999\t0\n" # w/o parallel replicas - # assert ( - # node.query(f"SELECT count(), min(key), max(key), sum(key) FROM {table_name}_d") - # == expected_result - # ) + assert ( + node.query(f"SELECT count(), min(key), max(key), sum(key) FROM {table_name}_d") + == expected_result + ) # parallel replicas assert ( From e82b6b02cafa427547253105ebe3c58828738a99 Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Mon, 14 Aug 2023 23:15:27 +0000 Subject: [PATCH 27/74] Fix style --- src/Processors/QueryPlan/ReadFromRemote.cpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/Processors/QueryPlan/ReadFromRemote.cpp b/src/Processors/QueryPlan/ReadFromRemote.cpp index e9cf80b0808..425728a2e80 100644 --- a/src/Processors/QueryPlan/ReadFromRemote.cpp +++ b/src/Processors/QueryPlan/ReadFromRemote.cpp @@ -239,12 +239,9 @@ void ReadFromRemote::addPipe(Pipes & pipes, const ClusterProxy::SelectStreamFact if (context->getParallelReplicasMode() == Context::ParallelReplicasMode::READ_TASKS) { - String cluster_for_parallel_replicas = cluster_name; - LOG_DEBUG(&Poco::Logger::get(__FUNCTION__), "_cluster_for_parallel_replicas: {}", cluster_for_parallel_replicas); + LOG_DEBUG(&Poco::Logger::get(__FUNCTION__), "_cluster_for_parallel_replicas: {}", cluster_name); scalars["_cluster_for_parallel_replicas"] = Block{ - {DataTypeString().createColumnConst(1, cluster_for_parallel_replicas), - std::make_shared(), - "_cluster_for_parallel_replicas"}}; + {DataTypeString().createColumnConst(1, cluster_name), std::make_shared(), "_cluster_for_parallel_replicas"}}; } std::shared_ptr 
remote_query_executor; From 9f7a25eeadcd9c78eb61ec1c97cb46e022377600 Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Mon, 14 Aug 2023 23:27:36 +0000 Subject: [PATCH 28/74] Fix style once more --- src/Processors/QueryPlan/ReadFromRemote.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Processors/QueryPlan/ReadFromRemote.cpp b/src/Processors/QueryPlan/ReadFromRemote.cpp index 425728a2e80..cbcaca3e971 100644 --- a/src/Processors/QueryPlan/ReadFromRemote.cpp +++ b/src/Processors/QueryPlan/ReadFromRemote.cpp @@ -240,8 +240,8 @@ void ReadFromRemote::addPipe(Pipes & pipes, const ClusterProxy::SelectStreamFact if (context->getParallelReplicasMode() == Context::ParallelReplicasMode::READ_TASKS) { LOG_DEBUG(&Poco::Logger::get(__FUNCTION__), "_cluster_for_parallel_replicas: {}", cluster_name); - scalars["_cluster_for_parallel_replicas"] = Block{ - {DataTypeString().createColumnConst(1, cluster_name), std::make_shared(), "_cluster_for_parallel_replicas"}}; + scalars["_cluster_for_parallel_replicas"] = + Block{{DataTypeString().createColumnConst(1, cluster_name), std::make_shared(), "_cluster_for_parallel_replicas"}}; } std::shared_ptr remote_query_executor; From 9825f8c76eec8999d0aa4509ab4604d7f45122f2 Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Tue, 15 Aug 2023 17:03:32 +0000 Subject: [PATCH 29/74] Remove debug logging --- src/Interpreters/Cluster.h | 2 +- src/Processors/QueryPlan/ReadFromRemote.cpp | 2 +- src/Storages/StorageMergeTree.cpp | 11 +++++++---- src/Storages/StorageReplicatedMergeTree.cpp | 9 ++++++--- 4 files changed, 15 insertions(+), 9 deletions(-) diff --git a/src/Interpreters/Cluster.h b/src/Interpreters/Cluster.h index f509a2d9847..cb75487cbbc 100644 --- a/src/Interpreters/Cluster.h +++ b/src/Interpreters/Cluster.h @@ -273,7 +273,7 @@ public: /// Are distributed DDL Queries (ON CLUSTER Clause) allowed for this cluster bool areDistributedDDLQueriesAllowed() const { return allow_distributed_ddl_queries; } - String getName() const { return name; } + const String & getName() const { return name; } private: SlotToShard slot_to_shard; diff --git a/src/Processors/QueryPlan/ReadFromRemote.cpp b/src/Processors/QueryPlan/ReadFromRemote.cpp index cbcaca3e971..bf9a87a692e 100644 --- a/src/Processors/QueryPlan/ReadFromRemote.cpp +++ b/src/Processors/QueryPlan/ReadFromRemote.cpp @@ -239,7 +239,7 @@ void ReadFromRemote::addPipe(Pipes & pipes, const ClusterProxy::SelectStreamFact if (context->getParallelReplicasMode() == Context::ParallelReplicasMode::READ_TASKS) { - LOG_DEBUG(&Poco::Logger::get(__FUNCTION__), "_cluster_for_parallel_replicas: {}", cluster_name); + /// the cluster is defined by Distributed table and passed to shards via `_cluster_for_parallel_replicas` scalar value scalars["_cluster_for_parallel_replicas"] = Block{{DataTypeString().createColumnConst(1, cluster_name), std::make_shared(), "_cluster_for_parallel_replicas"}}; } diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index 3a0d485e5c4..59d86cf3d1e 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -220,17 +220,20 @@ void StorageMergeTree::read( local_context, query_info.query, table_id.database_name, table_id.table_name, /*remote_table_function_ptr*/nullptr); - auto scalars = local_context->hasQueryContext() ? 
local_context->getQueryContext()->getScalars() : Scalars{}; String cluster_for_parallel_replicas = local_context->getSettingsRef().cluster_for_parallel_replicas; { - auto it = scalars.find("_cluster_for_parallel_replicas"); + /// if parallel replicas query executed over Distributed table, + /// the cluster is defined by Distributed table and passed to shards via `_cluster_for_parallel_replicas` scalar value + const auto scalars = local_context->hasQueryContext() ? local_context->getQueryContext()->getScalars() : Scalars{}; + const auto it = scalars.find("_cluster_for_parallel_replicas"); if (it != scalars.end()) { const Block & block = it->second; - cluster_for_parallel_replicas = block.getColumns()[0]->getDataAt(0).toString(); + chassert(block.columns() == 1); + const auto & column = block.getByPosition(0).column; + cluster_for_parallel_replicas = column->getDataAt(0).toString(); } } - LOG_DEBUG(&Poco::Logger::get("StorageMergeTree::read"), "_cluster_for_parallel_replicas: {}", cluster_for_parallel_replicas); auto cluster = local_context->getCluster(cluster_for_parallel_replicas); Block header; diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index f91d0b49928..60d5d26490b 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -5156,14 +5156,17 @@ void StorageReplicatedMergeTree::readParallelReplicasImpl( auto scalars = local_context->hasQueryContext() ? local_context->getQueryContext()->getScalars() : Scalars{}; String cluster_for_parallel_replicas = local_context->getSettingsRef().cluster_for_parallel_replicas; { - auto it = scalars.find("_cluster_for_parallel_replicas"); + /// if parallel replicas query executed over Distributed table, + /// the cluster is defined by Distributed table and passed to shards via `_cluster_for_parallel_replicas` scalar value + const auto it = scalars.find("_cluster_for_parallel_replicas"); if (it != scalars.end()) { const Block & block = it->second; - cluster_for_parallel_replicas = block.getColumns()[0]->getDataAt(0).toString(); + chassert(block.columns() == 1); + const auto & column = block.getByPosition(0).column; + cluster_for_parallel_replicas = column->getDataAt(0).toString(); } } - LOG_DEBUG(&Poco::Logger::get(__FUNCTION__), "_cluster_for_parallel_replicas: {}", cluster_for_parallel_replicas); auto parallel_replicas_cluster = local_context->getCluster(cluster_for_parallel_replicas); ASTPtr modified_query_ast; From f1e040447f13aaff03d8aee5b00b0d486be73088 Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Tue, 15 Aug 2023 17:14:36 +0000 Subject: [PATCH 30/74] Tests: there is no need for `cluster_for_parallel_replicas` setting in queries over distributed anymore --- .../test_parallel_replicas_over_distributed/test.py | 5 ++--- .../02835_parallel_replicas_over_distributed.sql | 8 ++++---- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/tests/integration/test_parallel_replicas_over_distributed/test.py b/tests/integration/test_parallel_replicas_over_distributed/test.py index ee52c8c040e..a0f9d2b71ac 100644 --- a/tests/integration/test_parallel_replicas_over_distributed/test.py +++ b/tests/integration/test_parallel_replicas_over_distributed/test.py @@ -88,7 +88,7 @@ def create_tables(cluster, table_name): settings={"insert_distributed_sync": 1}, ) nodes[0].query( - f"INSERT INTO {table_name}_d SELECT number, number FROM numbers(1)", + f"INSERT INTO {table_name}_d SELECT number, number FROM numbers(3)", 
settings={"insert_distributed_sync": 1}, ) @@ -102,7 +102,7 @@ def test_parallel_replicas_over_distributed(start_cluster, cluster): create_tables(cluster, table_name) node = nodes[0] - expected_result = f"6001\t-1999\t1999\t0\n" + expected_result = f"6003\t-1999\t1999\t3\n" # w/o parallel replicas assert ( @@ -119,7 +119,6 @@ def test_parallel_replicas_over_distributed(start_cluster, cluster): "prefer_localhost_replica": 0, "max_parallel_replicas": 4, "use_hedged_requests": 0, - # "cluster_for_parallel_replicas": cluster, }, ) == expected_result diff --git a/tests/queries/0_stateless/02835_parallel_replicas_over_distributed.sql b/tests/queries/0_stateless/02835_parallel_replicas_over_distributed.sql index 30dfb32678d..c2fc1d8355d 100644 --- a/tests/queries/0_stateless/02835_parallel_replicas_over_distributed.sql +++ b/tests/queries/0_stateless/02835_parallel_replicas_over_distributed.sql @@ -14,13 +14,13 @@ insert into test select *, today() from numbers(100); SELECT count(), min(id), max(id), avg(id) FROM test_d -SETTINGS allow_experimental_parallel_reading_from_replicas = 1, max_parallel_replicas = 3, prefer_localhost_replica = 0, parallel_replicas_for_non_replicated_merge_tree=1, use_hedged_requests=0; --, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost'; +SETTINGS allow_experimental_parallel_reading_from_replicas = 1, max_parallel_replicas = 3, prefer_localhost_replica = 0, parallel_replicas_for_non_replicated_merge_tree=1, use_hedged_requests=0; insert into test select *, today() from numbers(100); SELECT count(), min(id), max(id), avg(id) FROM test_d -SETTINGS allow_experimental_parallel_reading_from_replicas = 1, max_parallel_replicas = 3, prefer_localhost_replica = 0, parallel_replicas_for_non_replicated_merge_tree=1, use_hedged_requests=0; --, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost'; +SETTINGS allow_experimental_parallel_reading_from_replicas = 1, max_parallel_replicas = 3, prefer_localhost_replica = 0, parallel_replicas_for_non_replicated_merge_tree=1, use_hedged_requests=0; -- 2 shards @@ -38,10 +38,10 @@ insert into test2 select *, today() from numbers(100); SELECT count(), min(id), max(id), avg(id) FROM test2_d -SETTINGS allow_experimental_parallel_reading_from_replicas = 1, max_parallel_replicas = 3, prefer_localhost_replica = 0, parallel_replicas_for_non_replicated_merge_tree=1, use_hedged_requests=0; --, cluster_for_parallel_replicas = 'test_cluster_two_shard_three_replicas_localhost'; +SETTINGS allow_experimental_parallel_reading_from_replicas = 1, max_parallel_replicas = 3, prefer_localhost_replica = 0, parallel_replicas_for_non_replicated_merge_tree=1, use_hedged_requests=0; insert into test2 select *, today() from numbers(100); SELECT count(), min(id), max(id), avg(id) FROM test2_d -SETTINGS allow_experimental_parallel_reading_from_replicas = 1, max_parallel_replicas = 3, prefer_localhost_replica = 0, parallel_replicas_for_non_replicated_merge_tree=1, use_hedged_requests=0; --, cluster_for_parallel_replicas = 'test_cluster_two_shard_three_replicas_localhost'; +SETTINGS allow_experimental_parallel_reading_from_replicas = 1, max_parallel_replicas = 3, prefer_localhost_replica = 0, parallel_replicas_for_non_replicated_merge_tree=1, use_hedged_requests=0; From 49134711dc7f98137cb95f42cbe072da038b3e80 Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Tue, 15 Aug 2023 17:37:01 +0000 Subject: [PATCH 31/74] Polishing --- src/Interpreters/ClusterProxy/executeQuery.cpp | 6 +++--- 
src/Storages/StorageMergeTree.cpp | 3 +-- src/Storages/StorageReplicatedMergeTree.cpp | 3 +-- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/Interpreters/ClusterProxy/executeQuery.cpp b/src/Interpreters/ClusterProxy/executeQuery.cpp index 0dcae6a7d65..a42ec25cda8 100644 --- a/src/Interpreters/ClusterProxy/executeQuery.cpp +++ b/src/Interpreters/ClusterProxy/executeQuery.cpp @@ -276,11 +276,12 @@ void executeQueryWithParallelReplicas( auto scalars = new_context->hasQueryContext() ? new_context->getQueryContext()->getScalars() : Scalars{}; UInt64 shard_num = 0; /// shard_num is 1-based, so 0 - no shard specified - auto it = scalars.find("_shard_num"); + const auto it = scalars.find("_shard_num"); if (it != scalars.end()) { const Block & block = it->second; - shard_num = block.getColumns()[0]->get64(0); + const auto & column = block.safeGetByPosition(0).column; + shard_num = column->getUInt(0); } ClusterPtr new_cluster; @@ -297,7 +298,6 @@ void executeQueryWithParallelReplicas( auto all_replicas_count = std::min(static_cast(settings.max_parallel_replicas), new_cluster->getShardCount()); auto coordinator = std::make_shared(all_replicas_count); - auto remote_plan = std::make_unique(); /// This is a little bit weird, but we construct an "empty" coordinator without /// any specified reading/coordination method (like Default, InOrder, InReverseOrder) diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index 59d86cf3d1e..9d46b0a07aa 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -229,8 +229,7 @@ void StorageMergeTree::read( if (it != scalars.end()) { const Block & block = it->second; - chassert(block.columns() == 1); - const auto & column = block.getByPosition(0).column; + const auto & column = block.safeGetByPosition(0).column; cluster_for_parallel_replicas = column->getDataAt(0).toString(); } } diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 60d5d26490b..ba0ce4b76a1 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -5162,8 +5162,7 @@ void StorageReplicatedMergeTree::readParallelReplicasImpl( if (it != scalars.end()) { const Block & block = it->second; - chassert(block.columns() == 1); - const auto & column = block.getByPosition(0).column; + const auto & column = block.safeGetByPosition(0).column; cluster_for_parallel_replicas = column->getDataAt(0).toString(); } } From c6fc31c1e36cd981c370fd98baf83cc670c7d584 Mon Sep 17 00:00:00 2001 From: Nikolay Degterinsky Date: Wed, 16 Aug 2023 23:06:42 +0000 Subject: [PATCH 32/74] Review suggestions --- src/Server/ServerType.cpp | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/Server/ServerType.cpp b/src/Server/ServerType.cpp index 883be4b0892..fb052e7d6e6 100644 --- a/src/Server/ServerType.cpp +++ b/src/Server/ServerType.cpp @@ -63,18 +63,13 @@ bool ServerType::shouldStart(Type server_type, const std::string & server_custom } }; - auto is_type_custom = [](Type current_type) - { - return current_type == Type::CUSTOM; - }; - if (exclude_types.contains(Type::QUERIES_ALL)) return false; if (exclude_types.contains(Type::QUERIES_DEFAULT) && is_type_default(server_type)) return false; - if (exclude_types.contains(Type::QUERIES_CUSTOM) && is_type_custom(server_type)) + if (exclude_types.contains(Type::QUERIES_CUSTOM) && server_type == Type::CUSTOM) return false; if (exclude_types.contains(server_type)) @@ -93,7 +88,7 @@ bool 
ServerType::shouldStart(Type server_type, const std::string & server_custom return is_type_default(server_type); if (type == Type::QUERIES_CUSTOM) - return is_type_custom(server_type); + return server_type == Type::CUSTOM; if (type == Type::CUSTOM) return server_type == type && server_custom_name == custom_name; From 21ef1f1d1c9ba399948239dd166e2d08bb32db0b Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Wed, 16 Aug 2023 23:24:01 +0000 Subject: [PATCH 33/74] Remove unnecessary Scalars usage for cluster_for_parallel_replicas --- src/Processors/QueryPlan/ReadFromRemote.cpp | 15 +++++++++++++-- src/Storages/StorageMergeTree.cpp | 12 ------------ src/Storages/StorageReplicatedMergeTree.cpp | 11 ----------- 3 files changed, 13 insertions(+), 25 deletions(-) diff --git a/src/Processors/QueryPlan/ReadFromRemote.cpp b/src/Processors/QueryPlan/ReadFromRemote.cpp index bf9a87a692e..0c3d106b658 100644 --- a/src/Processors/QueryPlan/ReadFromRemote.cpp +++ b/src/Processors/QueryPlan/ReadFromRemote.cpp @@ -239,9 +239,20 @@ void ReadFromRemote::addPipe(Pipes & pipes, const ClusterProxy::SelectStreamFact if (context->getParallelReplicasMode() == Context::ParallelReplicasMode::READ_TASKS) { + if (context->getSettingsRef().cluster_for_parallel_replicas.changed) + { + const String cluster_for_parallel_replicas = context->getSettingsRef().cluster_for_parallel_replicas; + if (cluster_for_parallel_replicas != cluster_name) + { + LOG_INFO(log, "cluster_for_parallel_replicas was set for the query but has no effect: {}. Distributed table cluster is used: {}", cluster_for_parallel_replicas, cluster_name); + } + } + + context->setSetting("cluster_for_parallel_replicas", cluster_name); + /// the cluster is defined by Distributed table and passed to shards via `_cluster_for_parallel_replicas` scalar value - scalars["_cluster_for_parallel_replicas"] = - Block{{DataTypeString().createColumnConst(1, cluster_name), std::make_shared(), "_cluster_for_parallel_replicas"}}; + // scalars["_cluster_for_parallel_replicas"] = + // Block{{DataTypeString().createColumnConst(1, cluster_name), std::make_shared(), "_cluster_for_parallel_replicas"}}; } std::shared_ptr remote_query_executor; diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index 9d46b0a07aa..c7bedd3aa4a 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -221,18 +221,6 @@ void StorageMergeTree::read( table_id.database_name, table_id.table_name, /*remote_table_function_ptr*/nullptr); String cluster_for_parallel_replicas = local_context->getSettingsRef().cluster_for_parallel_replicas; - { - /// if parallel replicas query executed over Distributed table, - /// the cluster is defined by Distributed table and passed to shards via `_cluster_for_parallel_replicas` scalar value - const auto scalars = local_context->hasQueryContext() ? 
local_context->getQueryContext()->getScalars() : Scalars{}; - const auto it = scalars.find("_cluster_for_parallel_replicas"); - if (it != scalars.end()) - { - const Block & block = it->second; - const auto & column = block.safeGetByPosition(0).column; - cluster_for_parallel_replicas = column->getDataAt(0).toString(); - } - } auto cluster = local_context->getCluster(cluster_for_parallel_replicas); Block header; diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index ba0ce4b76a1..e3698df945a 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -5155,17 +5155,6 @@ void StorageReplicatedMergeTree::readParallelReplicasImpl( auto scalars = local_context->hasQueryContext() ? local_context->getQueryContext()->getScalars() : Scalars{}; String cluster_for_parallel_replicas = local_context->getSettingsRef().cluster_for_parallel_replicas; - { - /// if parallel replicas query executed over Distributed table, - /// the cluster is defined by Distributed table and passed to shards via `_cluster_for_parallel_replicas` scalar value - const auto it = scalars.find("_cluster_for_parallel_replicas"); - if (it != scalars.end()) - { - const Block & block = it->second; - const auto & column = block.safeGetByPosition(0).column; - cluster_for_parallel_replicas = column->getDataAt(0).toString(); - } - } auto parallel_replicas_cluster = local_context->getCluster(cluster_for_parallel_replicas); ASTPtr modified_query_ast; From 2e4d346e445cf8dae1d5b79bb8ab62ad311324d3 Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Wed, 16 Aug 2023 23:31:20 +0000 Subject: [PATCH 34/74] Fixes --- src/Interpreters/ClusterProxy/executeQuery.cpp | 2 +- src/Processors/QueryPlan/ReadFromRemote.cpp | 10 ++-------- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/src/Interpreters/ClusterProxy/executeQuery.cpp b/src/Interpreters/ClusterProxy/executeQuery.cpp index a42ec25cda8..0c2ad35515f 100644 --- a/src/Interpreters/ClusterProxy/executeQuery.cpp +++ b/src/Interpreters/ClusterProxy/executeQuery.cpp @@ -296,7 +296,7 @@ void executeQueryWithParallelReplicas( else new_cluster = not_optimized_cluster->getClusterWithReplicasAsShards(settings); - auto all_replicas_count = std::min(static_cast(settings.max_parallel_replicas), new_cluster->getShardCount()); + auto all_replicas_count = std::min(static_cast(settings.max_parallel_replicas), not_optimized_cluster->getShardCount()); auto coordinator = std::make_shared(all_replicas_count); /// This is a little bit weird, but we construct an "empty" coordinator without diff --git a/src/Processors/QueryPlan/ReadFromRemote.cpp b/src/Processors/QueryPlan/ReadFromRemote.cpp index 0c3d106b658..2ec6c82898f 100644 --- a/src/Processors/QueryPlan/ReadFromRemote.cpp +++ b/src/Processors/QueryPlan/ReadFromRemote.cpp @@ -243,16 +243,10 @@ void ReadFromRemote::addPipe(Pipes & pipes, const ClusterProxy::SelectStreamFact { const String cluster_for_parallel_replicas = context->getSettingsRef().cluster_for_parallel_replicas; if (cluster_for_parallel_replicas != cluster_name) - { - LOG_INFO(log, "cluster_for_parallel_replicas was set for the query but has no effect: {}. Distributed table cluster is used: {}", cluster_for_parallel_replicas, cluster_name); - } + LOG_INFO(log, "cluster_for_parallel_replicas has been set for the query but has no effect: {}. 
Distributed table cluster is used: {}", + cluster_for_parallel_replicas, cluster_name); } - context->setSetting("cluster_for_parallel_replicas", cluster_name); - - /// the cluster is defined by Distributed table and passed to shards via `_cluster_for_parallel_replicas` scalar value - // scalars["_cluster_for_parallel_replicas"] = - // Block{{DataTypeString().createColumnConst(1, cluster_name), std::make_shared(), "_cluster_for_parallel_replicas"}}; } std::shared_ptr remote_query_executor; From df4ca322102c00375685ad1df178a505c3662a37 Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Thu, 17 Aug 2023 11:31:51 +0000 Subject: [PATCH 35/74] Fix: select correct number of replicas for shard on coordinator + more tests --- .../ClusterProxy/executeQuery.cpp | 23 +++++++++- .../configs/remote_servers.xml | 14 ++++-- .../test.py | 43 +++++++++++++++---- 3 files changed, 67 insertions(+), 13 deletions(-) diff --git a/src/Interpreters/ClusterProxy/executeQuery.cpp b/src/Interpreters/ClusterProxy/executeQuery.cpp index 0c2ad35515f..4e07c78e7cb 100644 --- a/src/Interpreters/ClusterProxy/executeQuery.cpp +++ b/src/Interpreters/ClusterProxy/executeQuery.cpp @@ -284,19 +284,38 @@ void executeQueryWithParallelReplicas( shard_num = column->getUInt(0); } + size_t all_replicas_count = 0; ClusterPtr new_cluster; /// if got valid shard_num from query initiator, then parallel replicas scope is the specified shard /// shards are numbered in order of appearance in the cluster config if (shard_num > 0) { - LOG_DEBUG(&Poco::Logger::get("executeQueryWithParallelReplicas"), "Parallel replicas query in shard scope: shard_num={}", shard_num); + const auto shard_count = not_optimized_cluster->getShardCount(); + if (shard_num > shard_count) + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Shard number is greater than shard count: shard_num={} shard_count={} cluster={}", + shard_num, + shard_count, + not_optimized_cluster->getName()); + + chassert(shard_count == not_optimized_cluster->getShardsAddresses().size()); + + LOG_DEBUG( + &Poco::Logger::get("executeQueryWithParallelReplicas"), "Parallel replicas query in shard scope: shard_num={}", shard_num); + + const auto shard_replicas_num = not_optimized_cluster->getShardsAddresses()[shard_num - 1].size(); + all_replicas_count = std::min(static_cast(settings.max_parallel_replicas), shard_replicas_num); + /// shard_num is 1-based, but getClusterWithSingleShard expects 0-based index new_cluster = not_optimized_cluster->getClusterWithSingleShard(shard_num - 1); } else + { new_cluster = not_optimized_cluster->getClusterWithReplicasAsShards(settings); + all_replicas_count = std::min(static_cast(settings.max_parallel_replicas), new_cluster->getShardCount()); + } - auto all_replicas_count = std::min(static_cast(settings.max_parallel_replicas), not_optimized_cluster->getShardCount()); auto coordinator = std::make_shared(all_replicas_count); /// This is a little bit weird, but we construct an "empty" coordinator without diff --git a/tests/integration/test_parallel_replicas_over_distributed/configs/remote_servers.xml b/tests/integration/test_parallel_replicas_over_distributed/configs/remote_servers.xml index d5b027072ef..6d7a45365f7 100644 --- a/tests/integration/test_parallel_replicas_over_distributed/configs/remote_servers.xml +++ b/tests/integration/test_parallel_replicas_over_distributed/configs/remote_servers.xml @@ -11,17 +11,25 @@ n2 9000 - - - true n3 9000 + + + true n4 9000 + + n5 + 9000 + + + n6 + 9000 + diff --git 
a/tests/integration/test_parallel_replicas_over_distributed/test.py b/tests/integration/test_parallel_replicas_over_distributed/test.py index a0f9d2b71ac..1c8a3787356 100644 --- a/tests/integration/test_parallel_replicas_over_distributed/test.py +++ b/tests/integration/test_parallel_replicas_over_distributed/test.py @@ -7,7 +7,7 @@ nodes = [ cluster.add_instance( f"n{i}", main_configs=["configs/remote_servers.xml"], with_zookeeper=True ) - for i in (1, 2, 3, 4) + for i in (1, 2, 3, 4, 5, 6) ] @@ -46,13 +46,19 @@ def create_tables(cluster, table_name): nodes[1].query( f"CREATE TABLE IF NOT EXISTS {table_name} (key Int64, value String) Engine=ReplicatedMergeTree('/test_parallel_replicas/shard1/{table_name}', 'r2') ORDER BY (key)" ) - # shard 2 nodes[2].query( + f"CREATE TABLE IF NOT EXISTS {table_name} (key Int64, value String) Engine=ReplicatedMergeTree('/test_parallel_replicas/shard1/{table_name}', 'r3') ORDER BY (key)" + ) + # shard 2 + nodes[3].query( f"CREATE TABLE IF NOT EXISTS {table_name} (key Int64, value String) Engine=ReplicatedMergeTree('/test_parallel_replicas/shard2/{table_name}', 'r1') ORDER BY (key)" ) - nodes[3].query( + nodes[4].query( f"CREATE TABLE IF NOT EXISTS {table_name} (key Int64, value String) Engine=ReplicatedMergeTree('/test_parallel_replicas/shard2/{table_name}', 'r2') ORDER BY (key)" ) + nodes[5].query( + f"CREATE TABLE IF NOT EXISTS {table_name} (key Int64, value String) Engine=ReplicatedMergeTree('/test_parallel_replicas/shard2/{table_name}', 'r3') ORDER BY (key)" + ) else: raise Exception(f"Unexpected cluster: {cluster}") @@ -94,10 +100,31 @@ def create_tables(cluster, table_name): @pytest.mark.parametrize( - "cluster", - ["test_single_shard_multiple_replicas", "test_multiple_shards_multiple_replicas"], + "cluster,max_parallel_replicas,prefer_localhost_replica", + [ + # prefer_localhost_replica=0 + pytest.param("test_single_shard_multiple_replicas", 2, 0), + pytest.param("test_single_shard_multiple_replicas", 3, 0), + pytest.param("test_single_shard_multiple_replicas", 4, 0), + pytest.param("test_single_shard_multiple_replicas", 10, 0), + # prefer_localhost_replica=1 + pytest.param("test_single_shard_multiple_replicas", 2, 1), + pytest.param("test_single_shard_multiple_replicas", 3, 1), + pytest.param("test_single_shard_multiple_replicas", 4, 1), + pytest.param("test_single_shard_multiple_replicas", 10, 1), + # prefer_localhost_replica=0 + pytest.param("test_multiple_shards_multiple_replicas", 2, 0), + pytest.param("test_multiple_shards_multiple_replicas", 3, 0), + pytest.param("test_multiple_shards_multiple_replicas", 4, 0), + pytest.param("test_multiple_shards_multiple_replicas", 10, 0), + # prefer_localhost_replica=1 + pytest.param("test_multiple_shards_multiple_replicas", 2, 1), + pytest.param("test_multiple_shards_multiple_replicas", 3, 1), + pytest.param("test_multiple_shards_multiple_replicas", 4, 1), + pytest.param("test_multiple_shards_multiple_replicas", 10, 1), + ] ) -def test_parallel_replicas_over_distributed(start_cluster, cluster): +def test_parallel_replicas_over_distributed(start_cluster, cluster, max_parallel_replicas, prefer_localhost_replica): table_name = "test_table" create_tables(cluster, table_name) @@ -116,8 +143,8 @@ def test_parallel_replicas_over_distributed(start_cluster, cluster): f"SELECT count(), min(key), max(key), sum(key) FROM {table_name}_d", settings={ "allow_experimental_parallel_reading_from_replicas": 2, - "prefer_localhost_replica": 0, - "max_parallel_replicas": 4, + "prefer_localhost_replica": 
prefer_localhost_replica, + "max_parallel_replicas": max_parallel_replicas, "use_hedged_requests": 0, }, ) From c36f828f9fd02b360c5f6ff8b41ba0d3a3310bf6 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Thu, 17 Aug 2023 11:42:56 +0000 Subject: [PATCH 36/74] Automatic style fix --- .../test_parallel_replicas_over_distributed/test.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_parallel_replicas_over_distributed/test.py b/tests/integration/test_parallel_replicas_over_distributed/test.py index 1c8a3787356..6525901d366 100644 --- a/tests/integration/test_parallel_replicas_over_distributed/test.py +++ b/tests/integration/test_parallel_replicas_over_distributed/test.py @@ -122,9 +122,11 @@ def create_tables(cluster, table_name): pytest.param("test_multiple_shards_multiple_replicas", 3, 1), pytest.param("test_multiple_shards_multiple_replicas", 4, 1), pytest.param("test_multiple_shards_multiple_replicas", 10, 1), - ] + ], ) -def test_parallel_replicas_over_distributed(start_cluster, cluster, max_parallel_replicas, prefer_localhost_replica): +def test_parallel_replicas_over_distributed( + start_cluster, cluster, max_parallel_replicas, prefer_localhost_replica +): table_name = "test_table" create_tables(cluster, table_name) From 4af3790f34d191cb54a1c003dd3c3266d4fb9502 Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Thu, 17 Aug 2023 12:11:28 +0000 Subject: [PATCH 37/74] Fix style --- src/Interpreters/ClusterProxy/executeQuery.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Interpreters/ClusterProxy/executeQuery.cpp b/src/Interpreters/ClusterProxy/executeQuery.cpp index 4e07c78e7cb..9e91febddb3 100644 --- a/src/Interpreters/ClusterProxy/executeQuery.cpp +++ b/src/Interpreters/ClusterProxy/executeQuery.cpp @@ -28,6 +28,7 @@ namespace DB namespace ErrorCodes { extern const int TOO_LARGE_DISTRIBUTED_DEPTH; + extern const int LOGICAL_ERROR; } namespace ClusterProxy From 9f771b10e96088891d0a6cd4be4ca9419057e123 Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Thu, 17 Aug 2023 20:42:22 +0000 Subject: [PATCH 38/74] Cleanup --- src/Interpreters/ClusterProxy/executeQuery.cpp | 4 ++-- src/Processors/QueryPlan/ReadFromRemote.cpp | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/Interpreters/ClusterProxy/executeQuery.cpp b/src/Interpreters/ClusterProxy/executeQuery.cpp index 9e91febddb3..ccbce0b3dd4 100644 --- a/src/Interpreters/ClusterProxy/executeQuery.cpp +++ b/src/Interpreters/ClusterProxy/executeQuery.cpp @@ -302,8 +302,8 @@ void executeQueryWithParallelReplicas( chassert(shard_count == not_optimized_cluster->getShardsAddresses().size()); - LOG_DEBUG( - &Poco::Logger::get("executeQueryWithParallelReplicas"), "Parallel replicas query in shard scope: shard_num={}", shard_num); + LOG_DEBUG(&Poco::Logger::get("executeQueryWithParallelReplicas"), "Parallel replicas query in shard scope: shard_num={} cluster={}", + shard_num, not_optimized_cluster->getName()); const auto shard_replicas_num = not_optimized_cluster->getShardsAddresses()[shard_num - 1].size(); all_replicas_count = std::min(static_cast(settings.max_parallel_replicas), shard_replicas_num); diff --git a/src/Processors/QueryPlan/ReadFromRemote.cpp b/src/Processors/QueryPlan/ReadFromRemote.cpp index 2ec6c82898f..1e2f5adf2b3 100644 --- a/src/Processors/QueryPlan/ReadFromRemote.cpp +++ b/src/Processors/QueryPlan/ReadFromRemote.cpp @@ -1,7 +1,6 @@ #include #include -#include #include #include #include From 255f171cf945588dff502ec37e96d5754aaedcea Mon Sep 17 00:00:00 
2001 From: Maksim Kita Date: Tue, 15 Aug 2023 21:02:15 +0300 Subject: [PATCH 39/74] AARCH64 enable JIT compilation --- contrib/llvm-project-cmake/CMakeLists.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/contrib/llvm-project-cmake/CMakeLists.txt b/contrib/llvm-project-cmake/CMakeLists.txt index fe6cffd33e2..42d9885588f 100644 --- a/contrib/llvm-project-cmake/CMakeLists.txt +++ b/contrib/llvm-project-cmake/CMakeLists.txt @@ -1,4 +1,4 @@ -if (APPLE OR NOT ARCH_AMD64 OR SANITIZE STREQUAL "undefined") +if (APPLE OR SANITIZE STREQUAL "undefined") set (ENABLE_EMBEDDED_COMPILER_DEFAULT OFF) else() set (ENABLE_EMBEDDED_COMPILER_DEFAULT ON) @@ -11,8 +11,6 @@ if (NOT ENABLE_EMBEDDED_COMPILER) return() endif() -# TODO: Enable compilation on AArch64 - set (LLVM_VERSION "15.0.0bundled") set (LLVM_INCLUDE_DIRS "${ClickHouse_SOURCE_DIR}/contrib/llvm-project/llvm/include" From d9c4b110cec79b8f8f445e561553b8c17affcb8f Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Fri, 18 Aug 2023 14:42:40 +0300 Subject: [PATCH 40/74] LLVM AARCH64 cross compilation fix --- contrib/llvm-project | 2 +- contrib/llvm-project-cmake/CMakeLists.txt | 9 --------- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/contrib/llvm-project b/contrib/llvm-project index d857c707fcc..59bf9d5c174 160000 --- a/contrib/llvm-project +++ b/contrib/llvm-project @@ -1 +1 @@ -Subproject commit d857c707fccd50423bea1c4710dc469cf89607a9 +Subproject commit 59bf9d5c17454e2d899fed505f268a5e97b08ab9 diff --git a/contrib/llvm-project-cmake/CMakeLists.txt b/contrib/llvm-project-cmake/CMakeLists.txt index 42d9885588f..0ca94bf7808 100644 --- a/contrib/llvm-project-cmake/CMakeLists.txt +++ b/contrib/llvm-project-cmake/CMakeLists.txt @@ -97,15 +97,6 @@ set(LLVM_ENABLE_BINDINGS 0 CACHE INTERNAL "") set (LLVM_SOURCE_DIR "${ClickHouse_SOURCE_DIR}/contrib/llvm-project/llvm") set (LLVM_BINARY_DIR "${ClickHouse_BINARY_DIR}/contrib/llvm-project/llvm") -# Since we always use toolchain files to generate hermatic builds, cmake will -# think it's a cross compilation, and LLVM will try to configure NATIVE LLVM -# targets with all tests enabled, which will slow down cmake configuration and -# compilation (You'll see Building native llvm-tblgen...). Let's disable the -# cross compiling indicator for now. -# -# TODO We should let cmake know whether it's indeed a cross compilation in the -# first place. 
-set (CMAKE_CROSSCOMPILING 0) add_subdirectory ("${LLVM_SOURCE_DIR}" "${LLVM_BINARY_DIR}") set_directory_properties (PROPERTIES From 806519acb3db33a91096a136b1f363b8dced1eaf Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Fri, 18 Aug 2023 17:56:30 +0300 Subject: [PATCH 41/74] Test --- contrib/llvm-project | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/llvm-project b/contrib/llvm-project index 59bf9d5c174..4d70dbef5fe 160000 --- a/contrib/llvm-project +++ b/contrib/llvm-project @@ -1 +1 @@ -Subproject commit 59bf9d5c17454e2d899fed505f268a5e97b08ab9 +Subproject commit 4d70dbef5fe573a47b1fd759148597ccd03bd3b4 From 9772beeffc9e98c637f0f28c7f7c1420e4cf6bf2 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Sat, 19 Aug 2023 14:20:10 +0300 Subject: [PATCH 42/74] Test --- contrib/llvm-project-cmake/CMakeLists.txt | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/contrib/llvm-project-cmake/CMakeLists.txt b/contrib/llvm-project-cmake/CMakeLists.txt index 0ca94bf7808..f6381782c77 100644 --- a/contrib/llvm-project-cmake/CMakeLists.txt +++ b/contrib/llvm-project-cmake/CMakeLists.txt @@ -56,18 +56,23 @@ set (REQUIRED_LLVM_LIBRARIES LLVMDemangle ) -# if (ARCH_AMD64) +if (ARCH_AMD64) list(APPEND REQUIRED_LLVM_LIBRARIES LLVMX86Info LLVMX86Desc LLVMX86CodeGen) -# elseif (ARCH_AARCH64) -# list(APPEND REQUIRED_LLVM_LIBRARIES LLVMAArch64Info LLVMAArch64Desc LLVMAArch64CodeGen) -# endif () +elseif (ARCH_AARCH64) + list(APPEND REQUIRED_LLVM_LIBRARIES LLVMAArch64Info LLVMAArch64Desc LLVMAArch64CodeGen) +endif () set (CMAKE_INSTALL_RPATH "ON") # Do not adjust RPATH in llvm, since then it will not be able to find libcxx/libcxxabi/libunwind set (LLVM_COMPILER_CHECKED 1 CACHE INTERNAL "") # Skip internal compiler selection set (LLVM_ENABLE_EH 1 CACHE INTERNAL "") # With exception handling set (LLVM_ENABLE_RTTI 1 CACHE INTERNAL "") set (LLVM_ENABLE_PIC 0 CACHE INTERNAL "") -set (LLVM_TARGETS_TO_BUILD "X86" CACHE STRING "") # for x86 + ARM: "X86;AArch64" + +if (ARCH_AMD64) + set (LLVM_TARGETS_TO_BUILD "X86" CACHE STRING "") +elseif (ARCH_AARCH64) + set (LLVM_TARGETS_TO_BUILD "AArch64" CACHE STRING "") +endif () # Omit unnecessary stuff (just the options which are ON by default) set(LLVM_ENABLE_BACKTRACES 0 CACHE INTERNAL "") From 10db751c7b4cc7b86ad7fc24c2b4d5662fed2282 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Sat, 19 Aug 2023 14:52:59 +0300 Subject: [PATCH 43/74] Test --- contrib/llvm-project | 2 +- contrib/llvm-project-cmake/CMakeLists.txt | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/contrib/llvm-project b/contrib/llvm-project index 4d70dbef5fe..4879e23f4d3 160000 --- a/contrib/llvm-project +++ b/contrib/llvm-project @@ -1 +1 @@ -Subproject commit 4d70dbef5fe573a47b1fd759148597ccd03bd3b4 +Subproject commit 4879e23f4d33349cd4142c6655dcfdbe3295c677 diff --git a/contrib/llvm-project-cmake/CMakeLists.txt b/contrib/llvm-project-cmake/CMakeLists.txt index f6381782c77..3da18ebb808 100644 --- a/contrib/llvm-project-cmake/CMakeLists.txt +++ b/contrib/llvm-project-cmake/CMakeLists.txt @@ -101,6 +101,9 @@ set(LLVM_ENABLE_BINDINGS 0 CACHE INTERNAL "") set (LLVM_SOURCE_DIR "${ClickHouse_SOURCE_DIR}/contrib/llvm-project/llvm") set (LLVM_BINARY_DIR "${ClickHouse_BINARY_DIR}/contrib/llvm-project/llvm") +if (CMAKE_CROSSCOMPILING) + set (LLVM_HOST_TRIPLE "${CMAKE_C_COMPILER_TARGET}") +endif() add_subdirectory ("${LLVM_SOURCE_DIR}" "${LLVM_BINARY_DIR}") From 8a22c8a13b6a5134d7d52bb491dc6102e53d6250 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: 
Sat, 19 Aug 2023 15:24:12 +0300 Subject: [PATCH 44/74] Test --- contrib/llvm-project | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/llvm-project b/contrib/llvm-project index 4879e23f4d3..9154006415b 160000 --- a/contrib/llvm-project +++ b/contrib/llvm-project @@ -1 +1 @@ -Subproject commit 4879e23f4d33349cd4142c6655dcfdbe3295c677 +Subproject commit 9154006415bd7dac436a7e59d63679256b6c231c From 3a92d05014310e85b9c8cfecb0bf2311f6416c64 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Sat, 19 Aug 2023 17:26:55 +0300 Subject: [PATCH 45/74] Test --- contrib/llvm-project-cmake/CMakeLists.txt | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/contrib/llvm-project-cmake/CMakeLists.txt b/contrib/llvm-project-cmake/CMakeLists.txt index 3da18ebb808..4bdb1c3b68f 100644 --- a/contrib/llvm-project-cmake/CMakeLists.txt +++ b/contrib/llvm-project-cmake/CMakeLists.txt @@ -56,6 +56,14 @@ set (REQUIRED_LLVM_LIBRARIES LLVMDemangle ) +if (ARCH_AMD64) + set (LLVM_TARGETS_TO_BUILD "X86" CACHE INTERNAL "") +elseif (ARCH_AARCH64) + set (LLVM_TARGETS_TO_BUILD "AArch64" CACHE INTERNAL "") +endif () + +message (STATUS "LLVM TARGETS TO BUILD ${LLVM_TARGETS_TO_BUILD}") + if (ARCH_AMD64) list(APPEND REQUIRED_LLVM_LIBRARIES LLVMX86Info LLVMX86Desc LLVMX86CodeGen) elseif (ARCH_AARCH64) @@ -68,12 +76,6 @@ set (LLVM_ENABLE_EH 1 CACHE INTERNAL "") # With exception handling set (LLVM_ENABLE_RTTI 1 CACHE INTERNAL "") set (LLVM_ENABLE_PIC 0 CACHE INTERNAL "") -if (ARCH_AMD64) - set (LLVM_TARGETS_TO_BUILD "X86" CACHE STRING "") -elseif (ARCH_AARCH64) - set (LLVM_TARGETS_TO_BUILD "AArch64" CACHE STRING "") -endif () - # Omit unnecessary stuff (just the options which are ON by default) set(LLVM_ENABLE_BACKTRACES 0 CACHE INTERNAL "") set(LLVM_ENABLE_CRASH_OVERRIDES 0 CACHE INTERNAL "") @@ -101,8 +103,11 @@ set(LLVM_ENABLE_BINDINGS 0 CACHE INTERNAL "") set (LLVM_SOURCE_DIR "${ClickHouse_SOURCE_DIR}/contrib/llvm-project/llvm") set (LLVM_BINARY_DIR "${ClickHouse_BINARY_DIR}/contrib/llvm-project/llvm") + +message (STATUS "LLVM CMAKE CROSS COMPILING ${CMAKE_CROSSCOMPILING}") if (CMAKE_CROSSCOMPILING) - set (LLVM_HOST_TRIPLE "${CMAKE_C_COMPILER_TARGET}") + set (LLVM_HOST_TRIPLE "${CMAKE_C_COMPILER_TARGET}" CACHE INTERNAL "") + message (STATUS "CROSS COMPILING SET LLVM HOST TRIPLE ${LLVM_HOST_TRIPLE}") endif() add_subdirectory ("${LLVM_SOURCE_DIR}" "${LLVM_BINARY_DIR}") From 21f2b10f569b3d3a9430b3f49bcbb5d612db62a1 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Sat, 19 Aug 2023 17:38:14 +0300 Subject: [PATCH 46/74] Test --- .../config/users.d/perf-comparison-tweaks-users.xml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docker/test/performance-comparison/config/users.d/perf-comparison-tweaks-users.xml b/docker/test/performance-comparison/config/users.d/perf-comparison-tweaks-users.xml index dab41adca51..df053c8f495 100644 --- a/docker/test/performance-comparison/config/users.d/perf-comparison-tweaks-users.xml +++ b/docker/test/performance-comparison/config/users.d/perf-comparison-tweaks-users.xml @@ -19,8 +19,9 @@ 12 - 0 - 0 + 1 + 1 + 1 60 From 9effe1bbe1ad8b2ab5f7354bd73bd8df48608eab Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 20 Aug 2023 06:25:34 +0200 Subject: [PATCH 47/74] Simplify test `01600_parts_types_metrics_long` --- .../01600_parts_types_metrics_long.sh | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/tests/queries/0_stateless/01600_parts_types_metrics_long.sh 
b/tests/queries/0_stateless/01600_parts_types_metrics_long.sh index 6bc22f2e794..985dcfa9a8a 100755 --- a/tests/queries/0_stateless/01600_parts_types_metrics_long.sh +++ b/tests/queries/0_stateless/01600_parts_types_metrics_long.sh @@ -10,14 +10,14 @@ set -o pipefail # NOTE: database = $CLICKHOUSE_DATABASE is unwanted verify_sql="SELECT - (SELECT sumIf(value, metric = 'PartsInMemory'), sumIf(value, metric = 'PartsCompact'), sumIf(value, metric = 'PartsWide') FROM system.metrics) = - (SELECT countIf(part_type == 'InMemory'), countIf(part_type == 'Compact'), countIf(part_type == 'Wide') - FROM (SELECT part_type FROM system.parts UNION ALL SELECT part_type FROM system.projection_parts))" + (SELECT sumIf(value, metric = 'PartsCompact'), sumIf(value, metric = 'PartsWide') FROM system.metrics) = + (SELECT countIf(part_type == 'Compact'), countIf(part_type == 'Wide') + FROM (SELECT part_type FROM system.parts UNION ALL SELECT part_type FROM system.projection_parts))" # The query is not atomic - it can compare states between system.parts and system.metrics from different points in time. # So, there is inherent race condition (especially in fasttest that runs tests in parallel). # -# But it should get expected result eventually. +# But it should get the expected result eventually. # In case of test failure, this code will do infinite loop and timeout. verify() { @@ -32,21 +32,16 @@ verify() } $CLICKHOUSE_CLIENT --database_atomic_wait_for_drop_and_detach_synchronously=1 --query="DROP TABLE IF EXISTS data_01600" -# InMemory - [0..5] # Compact - (5..10] # Wide - >10 $CLICKHOUSE_CLIENT --query="CREATE TABLE data_01600 (part_type String, key Int) ENGINE = MergeTree PARTITION BY part_type ORDER BY key SETTINGS min_bytes_for_wide_part=0, min_rows_for_wide_part=10, index_granularity = 8192, index_granularity_bytes = '10Mi'" -# InMemory -$CLICKHOUSE_CLIENT --query="INSERT INTO data_01600 SELECT 'InMemory', number FROM system.numbers LIMIT 1" -verify - # Compact -$CLICKHOUSE_CLIENT --query="INSERT INTO data_01600 SELECT 'Compact', number FROM system.numbers LIMIT 6 OFFSET 1" +$CLICKHOUSE_CLIENT --query="INSERT INTO data_01600 SELECT 'Compact', number FROM system.numbers LIMIT 6" verify # Wide -$CLICKHOUSE_CLIENT --query="INSERT INTO data_01600 SELECT 'Wide', number FROM system.numbers LIMIT 11 OFFSET 7" +$CLICKHOUSE_CLIENT --query="INSERT INTO data_01600 SELECT 'Wide', number FROM system.numbers LIMIT 11 OFFSET 6" verify # DROP and check From 28b46a91a6f62674a22bb1a5ed579dc2bf5e2eaf Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 20 Aug 2023 06:28:20 +0200 Subject: [PATCH 48/74] Simplify test `01600_parts_types_metrics_long` --- tests/queries/0_stateless/01600_parts_types_metrics_long.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/01600_parts_types_metrics_long.sh b/tests/queries/0_stateless/01600_parts_types_metrics_long.sh index 985dcfa9a8a..f4a17152f84 100755 --- a/tests/queries/0_stateless/01600_parts_types_metrics_long.sh +++ b/tests/queries/0_stateless/01600_parts_types_metrics_long.sh @@ -11,7 +11,7 @@ set -o pipefail # NOTE: database = $CLICKHOUSE_DATABASE is unwanted verify_sql="SELECT (SELECT sumIf(value, metric = 'PartsCompact'), sumIf(value, metric = 'PartsWide') FROM system.metrics) = - (SELECT countIf(part_type == 'Compact'), countIf(part_type == 'Wide') + (SELECT countIf(part_type = 'Compact'), countIf(part_type = 'Wide') FROM (SELECT part_type FROM system.parts UNION ALL SELECT part_type FROM system.projection_parts))" # The query is 
not atomic - it can compare states between system.parts and system.metrics from different points in time. From 59fced8e992705a1095401e174f3ad32a71f5e68 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 20 Aug 2023 08:00:04 +0200 Subject: [PATCH 49/74] Fix errors in dashboard --- programs/server/dashboard.html | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/programs/server/dashboard.html b/programs/server/dashboard.html index d0ae590eaa0..6dc022f667f 100644 --- a/programs/server/dashboard.html +++ b/programs/server/dashboard.html @@ -906,12 +906,15 @@ async function draw(idx, chart, url_params, query) { } if (error) { - const errorMatch = errorMessages.find(({ regex }) => error.match(regex)) - const match = error.match(errorMatch.regex) - const message = errorMatch.messageFunc(match) + const errorMatch = errorMessages.find(({ regex }) => error.match(regex)); + if (!errorMatch) { + throw new Error(error); + } + const match = error.match(errorMatch.regex); + const message = errorMatch.messageFunc(match); if (message) { - const authError = new Error(message) - throw authError + const authError = new Error(message); + throw authError; } } From fe610d0394c26e249f2b3885e085d930d0eb24ac Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 20 Aug 2023 08:07:11 +0200 Subject: [PATCH 50/74] Fix bad JavaScript --- programs/server/dashboard.html | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/programs/server/dashboard.html b/programs/server/dashboard.html index 6dc022f667f..440cdcae416 100644 --- a/programs/server/dashboard.html +++ b/programs/server/dashboard.html @@ -389,6 +389,7 @@
🌚🌞 +
@@ -865,7 +866,7 @@ function legendAsTooltipPlugin({ className, style = { background: "var(--legend- }; } -let add_http_cors_header = false; +let add_http_cors_header = true; async function draw(idx, chart, url_params, query) { if (plots[idx]) { @@ -913,8 +914,7 @@ async function draw(idx, chart, url_params, query) { const match = error.match(errorMatch.regex); const message = errorMatch.messageFunc(match); if (message) { - const authError = new Error(message); - throw authError; + throw new Error(message); } } @@ -981,23 +981,23 @@ async function draw(idx, chart, url_params, query) { } function showAuthError(message) { - const charts = document.querySelector('#charts'); + const charts = document.getElementById('charts'); charts.style.height = '0px'; charts.style.opacity = '0'; - const add = document.querySelector('#add'); + const add = document.getElementById('add'); add.style.display = 'none'; - const authError = document.querySelector('#auth-error'); + const authError = document.getElementById('auth-error'); authError.textContent = message; authError.style.display = 'flex'; } function hideAuthError() { - const charts = document.querySelector('#charts'); + const charts = document.getElementById('charts'); charts.style.height = 'auto'; charts.style.opacity = '1'; - const authError = document.querySelector('#auth-error'); + const authError = document.getElementById('auth-error'); authError.textContent = ''; authError.style.display = 'none'; } @@ -1028,11 +1028,11 @@ async function drawAll() { if (results.includes(true)) { const element = document.querySelector('.inputs'); element.classList.remove('unconnected'); - const add = document.querySelector('#add'); + const add = document.getElementById('add'); add.style.display = 'block'; } else { - const charts = document.querySelector('#charts') + const charts = document.getElementById('charts') charts.style.height = '0px'; } }) From b2462a1bb7181abee87365129af0a8c9a7d88a1a Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 20 Aug 2023 09:16:38 +0200 Subject: [PATCH 51/74] Add mass editor --- programs/server/dashboard.html | 133 ++++++++++++++++++++++++++++----- 1 file changed, 115 insertions(+), 18 deletions(-) diff --git a/programs/server/dashboard.html b/programs/server/dashboard.html index 440cdcae416..503e4ef0e41 100644 --- a/programs/server/dashboard.html +++ b/programs/server/dashboard.html @@ -230,8 +230,8 @@ filter: contrast(125%); } - #add, #reload { - padding: .25rem 0.5rem; + #add, #reload, #edit { + padding: 0.25rem 0.5rem; text-align: center; font-weight: bold; user-select: none; @@ -244,13 +244,10 @@ margin-right: 1rem !important; margin-left: 0rem; margin-bottom: 1rem; + height: 3ex; } - /* .unconnected #reload { - margin-left: 3px; - } */ - - #add:hover, #reload:hover { + #add:hover, #reload:hover, #edit:hover { background: var(--button-background-color); } @@ -333,18 +330,21 @@ padding: 2rem; } - .query-editor textarea { - grid-row: 1; - grid-column: 1 / span 2; - z-index: 11; + textarea { padding: 0.5rem; outline: none; border: none; font-size: 12pt; - border-bottom: 1px solid var(--edit-title-border); background: var(--chart-background); color: var(--text-color); resize: none; + } + + .query-editor textarea { + grid-row: 1; + grid-column: 1 / span 2; + z-index: 11; + border-bottom: 1px solid var(--edit-title-border); margin: 0; } @@ -367,10 +367,41 @@ filter: contrast(125%); } + .edit-cancel { + cursor: pointer; + background: var(--new-chart-background-color); + } + .edit-cancel:hover { + filter: contrast(125%); + } + 
.nowrap { white-space: pre; } + #mass-editor { + display: none; + grid-template-columns: auto fit-content(10%) fit-content(10%); + grid-template-rows: auto fit-content(10%); + row-gap: 1rem; + column-gap: 1rem; + } + + #mass-editor-textarea { + width: 100%; + height: 100%; + grid-row: 1; + grid-column: 1 / span 3; + } + + #mass-editor input { + padding: 0.5rem; + } + + #mass-editor-message { + color: var(--auth-error-color); + } + /* Source: https://cdn.jsdelivr.net/npm/uplot@1.6.21/dist/uPlot.min.css * It is copy-pasted to lower the number of requests. */ @@ -398,6 +429,12 @@
+
+ +   + + +