From ed318d10353101c76a4493ccd9fa6c239868abd3 Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Wed, 14 Jun 2023 10:35:36 +0000 Subject: [PATCH 01/38] Add input_format_csv_ignore_extra_columns setting (prototype) --- src/Core/Settings.h | 1 + src/Formats/FormatFactory.cpp | 1 + src/Formats/FormatSettings.h | 1 + src/Processors/Formats/Impl/CSVRowInputFormat.cpp | 15 ++++++++++++++- tests/queries/0_stateless/00301_csv.reference | 4 ++++ tests/queries/0_stateless/00301_csv.sh | 10 ++++++++++ 6 files changed, 31 insertions(+), 1 deletion(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index bc879b9bdf6..d38f7767252 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -835,6 +835,7 @@ class IColumn; M(Bool, input_format_import_nested_json, false, "Map nested JSON data to nested tables (it works for JSONEachRow format).", 0) \ M(Bool, input_format_defaults_for_omitted_fields, true, "For input data calculate default expressions for omitted fields (it works for JSONEachRow, -WithNames, -WithNamesAndTypes formats).", IMPORTANT) \ M(Bool, input_format_csv_empty_as_default, true, "Treat empty fields in CSV input as default values.", 0) \ + M(Bool, input_format_csv_ignore_extra_columns, false, "", 0) \ M(Bool, input_format_tsv_empty_as_default, false, "Treat empty fields in TSV input as default values.", 0) \ M(Bool, input_format_tsv_enum_as_number, false, "Treat inserted enum values in TSV formats as enum indices.", 0) \ M(Bool, input_format_null_as_default, true, "Initialize null fields with default values if the data type of this field is not nullable and it is supported by the input format", 0) \ diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index c235afae57e..0218d268c51 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -63,6 +63,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.csv.delimiter = settings.format_csv_delimiter; format_settings.csv.tuple_delimiter = settings.format_csv_delimiter; format_settings.csv.empty_as_default = settings.input_format_csv_empty_as_default; + format_settings.csv.ignore_extra_columns = settings.input_format_csv_ignore_extra_columns; format_settings.csv.enum_as_number = settings.input_format_csv_enum_as_number; format_settings.csv.null_representation = settings.format_csv_null_representation; format_settings.csv.arrays_as_nested_csv = settings.input_format_csv_arrays_as_nested_csv; diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index 787c1a64759..3bc53140fe5 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -128,6 +128,7 @@ struct FormatSettings bool allow_single_quotes = true; bool allow_double_quotes = true; bool empty_as_default = false; + bool ignore_extra_columns = false; bool crlf_end_of_line = false; bool enum_as_number = false; bool arrays_as_nested_csv = false; diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp index ae75240e0ee..0cc5889b732 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp @@ -302,14 +302,27 @@ bool CSVFormatReader::readField( return false; } + auto skip_all = [&]() + { + if (!is_last_file_column || !format_settings.csv.ignore_extra_columns) + { + return; + } + //std::cout << "skip !!!" 
<< std::endl; + buf->position() = find_first_symbols<'\n'>(buf->position(), buf->buffer().end()); + }; if (format_settings.null_as_default && !isNullableOrLowCardinalityNullable(type)) { /// If value is null but type is not nullable then use default value instead. - return SerializationNullable::deserializeTextCSVImpl(column, *buf, format_settings, serialization); + bool res = SerializationNullable::deserializeTextCSVImpl(column, *buf, format_settings, serialization); + skip_all(); + return res; } /// Read the column normally. serialization->deserializeTextCSV(column, *buf, format_settings); + + skip_all(); return true; } diff --git a/tests/queries/0_stateless/00301_csv.reference b/tests/queries/0_stateless/00301_csv.reference index 9863da4b640..61279f3b84a 100644 --- a/tests/queries/0_stateless/00301_csv.reference +++ b/tests/queries/0_stateless/00301_csv.reference @@ -11,3 +11,7 @@ default-eof 1 2019-06-19 2016-01-01 01:02:03 NUL 2016-01-02 01:02:03 Nhello \N \N +Hello world 1 2016-01-01 +Hello world 2 2016-01-02 +Hello world 3 2016-01-03 +Hello world 4 2016-01-04 diff --git a/tests/queries/0_stateless/00301_csv.sh b/tests/queries/0_stateless/00301_csv.sh index b2618343dc0..e99c39a0f6f 100755 --- a/tests/queries/0_stateless/00301_csv.sh +++ b/tests/queries/0_stateless/00301_csv.sh @@ -37,3 +37,13 @@ echo 'NULL, NULL $CLICKHOUSE_CLIENT --query="SELECT * FROM csv ORDER BY s NULLS LAST"; $CLICKHOUSE_CLIENT --query="DROP TABLE csv"; + + +$CLICKHOUSE_CLIENT --query="CREATE TABLE csv (s String, n UInt64 DEFAULT 1, d Date DEFAULT '2019-06-19') ENGINE = Memory"; + +echo 'Hello world, 1, 2016-01-01 +Hello world, 2 ,2016-01-02, +Hello world, 3 ,2016-01-03, 2016-01-13 +Hello world, 4 ,2016-01-04, 2016-01-14, 2016-01-15' | $CLICKHOUSE_CLIENT --input_format_csv_empty_as_default=1 --input_format_csv_ignore_extra_columns=1 --query="INSERT INTO csv FORMAT CSV"; +$CLICKHOUSE_CLIENT --query="SELECT * FROM csv ORDER BY s, n"; +$CLICKHOUSE_CLIENT --query="DROP TABLE csv"; \ No newline at end of file From a91fc3ddb33865d2db8170ff96e636de293b323a Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Wed, 14 Jun 2023 16:44:31 +0000 Subject: [PATCH 02/38] Add docs/ add more cases in test --- docs/en/interfaces/formats.md | 3 +- .../operations/settings/settings-formats.md | 5 +++ docs/ru/interfaces/formats.md | 4 +- docs/ru/operations/settings/settings.md | 8 +++- src/Core/Settings.h | 2 +- src/Formats/FormatFactory.cpp | 2 +- src/Formats/FormatSettings.h | 2 +- .../Formats/Impl/CSVRowInputFormat.cpp | 39 +++++++++---------- .../RowInputFormatWithNamesAndTypes.cpp | 4 ++ tests/queries/0_stateless/00301_csv.reference | 10 +++-- tests/queries/0_stateless/00301_csv.sh | 13 ++++--- 11 files changed, 56 insertions(+), 36 deletions(-) diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index 324930e248f..950692deb77 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -470,6 +470,7 @@ The CSV format supports the output of totals and extremes the same way as `TabSe - [input_format_csv_detect_header](/docs/en/operations/settings/settings-formats.md/#input_format_csv_detect_header) - automatically detect header with names and types in CSV format. Default value - `true`. - [input_format_csv_skip_trailing_empty_lines](/docs/en/operations/settings/settings-formats.md/#input_format_csv_skip_trailing_empty_lines) - skip trailing empty lines at the end of data. Default value - `false`. 
- [input_format_csv_trim_whitespaces](/docs/en/operations/settings/settings-formats.md/#input_format_csv_trim_whitespaces) - trim spaces and tabs in non-quoted CSV strings. Default value - `true`. +- [input_format_csv_ignore_extra_columns](/docs/en/operations/settings/settings-formats.md/#input_format_csv_ignore_extra_columns) - ignore extra colums in CSV input. Default value - `false`. ## CSVWithNames {#csvwithnames} @@ -2062,7 +2063,7 @@ Special format for reading Parquet file metadata (https://parquet.apache.org/doc - logical_type - column logical type - compression - compression used for this column - total_uncompressed_size - total uncompressed bytes size of the column, calculated as the sum of total_uncompressed_size of the column from all row groups - - total_compressed_size - total compressed bytes size of the column, calculated as the sum of total_compressed_size of the column from all row groups + - total_compressed_size - total compressed bytes size of the column, calculated as the sum of total_compressed_size of the column from all row groups - space_saved - percent of space saved by compression, calculated as (1 - total_compressed_size/total_uncompressed_size). - encodings - the list of encodings used for this column - row_groups - the list of row groups metadata with the next structure: diff --git a/docs/en/operations/settings/settings-formats.md b/docs/en/operations/settings/settings-formats.md index 26501f3f3f6..e721c9408e3 100644 --- a/docs/en/operations/settings/settings-formats.md +++ b/docs/en/operations/settings/settings-formats.md @@ -931,6 +931,11 @@ Result ```text " string " ``` +### input_format_csv_ignore_extra_columns {#input_format_csv_ignore_extra_columns} + +Ignore extra colums in CSV input. + +Disabled by default. ## Values format settings {#values-format-settings} diff --git a/docs/ru/interfaces/formats.md b/docs/ru/interfaces/formats.md index 48a6132170a..8488f4ce55a 100644 --- a/docs/ru/interfaces/formats.md +++ b/docs/ru/interfaces/formats.md @@ -401,8 +401,8 @@ $ clickhouse-client --format_csv_delimiter="|" --query="INSERT INTO test.csv FOR - [output_format_csv_crlf_end_of_line](../operations/settings/settings.md#output_format_csv_crlf_end_of_line) - если установлено значение true, конец строки в формате вывода CSV будет `\r\n` вместо `\n`. Значение по умолчанию - `false`. - [input_format_csv_skip_first_lines](../operations/settings/settings.md#input_format_csv_skip_first_lines) - пропустить указанное количество строк в начале данных. Значение по умолчанию - `0`. - [input_format_csv_detect_header](../operations/settings/settings.md#input_format_csv_detect_header) - обнаружить заголовок с именами и типами в формате CSV. Значение по умолчанию - `true`. -- [input_format_csv_trim_whitespaces](../operations/settings/settings.md#input_format_csv_trim_whitespaces) - удалить пробелы и символы табуляции из строк без кавычек. -Значение по умолчанию - `true`. +- [input_format_csv_trim_whitespaces](../operations/settings/settings.md#input_format_csv_trim_whitespaces) - удалить пробелы и символы табуляции из строк без кавычек. Значение по умолчанию - `true`. +- [input_format_csv_ignore_extra_columns](../operations/settings/settings.md/#input_format_csv_ignore_extra_columns) - игнорировать дополнительные столбцы. Значение по умолчанию - `false`. 
## CSVWithNames {#csvwithnames} diff --git a/docs/ru/operations/settings/settings.md b/docs/ru/operations/settings/settings.md index e3da8302fc8..33d9300f8e1 100644 --- a/docs/ru/operations/settings/settings.md +++ b/docs/ru/operations/settings/settings.md @@ -1686,7 +1686,7 @@ SELECT * FROM table_with_enum_column_for_csv_insert; ## input_format_csv_detect_header {#input_format_csv_detect_header} Обнаружить заголовок с именами и типами в формате CSV. - + Значение по умолчанию - `true`. ## input_format_csv_skip_first_lines {#input_format_csv_skip_first_lines} @@ -1727,6 +1727,12 @@ echo ' string ' | ./clickhouse local -q "select * from table FORMAT CSV" --in " string " ``` +## input_format_csv_ignore_extra_columns {#input_format_csv_ignore_extra_columns} + +Игнорировать дополнительные столбцы. + +Выключено по умолчанию. + ## output_format_tsv_crlf_end_of_line {#settings-output-format-tsv-crlf-end-of-line} Использовать в качестве разделителя строк для TSV формата CRLF (DOC/Windows стиль) вместо LF (Unix стиль). diff --git a/src/Core/Settings.h b/src/Core/Settings.h index d38f7767252..9582419b98c 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -835,7 +835,6 @@ class IColumn; M(Bool, input_format_import_nested_json, false, "Map nested JSON data to nested tables (it works for JSONEachRow format).", 0) \ M(Bool, input_format_defaults_for_omitted_fields, true, "For input data calculate default expressions for omitted fields (it works for JSONEachRow, -WithNames, -WithNamesAndTypes formats).", IMPORTANT) \ M(Bool, input_format_csv_empty_as_default, true, "Treat empty fields in CSV input as default values.", 0) \ - M(Bool, input_format_csv_ignore_extra_columns, false, "", 0) \ M(Bool, input_format_tsv_empty_as_default, false, "Treat empty fields in TSV input as default values.", 0) \ M(Bool, input_format_tsv_enum_as_number, false, "Treat inserted enum values in TSV formats as enum indices.", 0) \ M(Bool, input_format_null_as_default, true, "Initialize null fields with default values if the data type of this field is not nullable and it is supported by the input format", 0) \ @@ -1001,6 +1000,7 @@ class IColumn; M(Bool, regexp_dict_allow_hyperscan, true, "Allow regexp_tree dictionary using Hyperscan library.", 0) \ \ M(Bool, dictionary_use_async_executor, false, "Execute a pipeline for reading from a dictionary with several threads. It's supported only by DIRECT dictionary with CLICKHOUSE source.", 0) \ + M(Bool, input_format_csv_ignore_extra_columns, false, "Ignore extra colums in CSV input", 0) \ // End of FORMAT_FACTORY_SETTINGS // Please add settings non-related to formats into the COMMON_SETTINGS above. 
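Before the setting is threaded through FormatFactory and the CSV reader below, a minimal sketch of the behavior it is meant to enable; the clickhouse-local invocation, table structure, and sample data are illustrative only and assume a build that includes this patch:

```bash
# Rows may carry any number of trailing extra fields; with the new
# setting enabled they are skipped instead of causing a parse error.
echo 'Hello world, 1, 2016-01-01
Hello world, 2, 2016-01-02, extra
Hello world, 3, 2016-01-03, extra, extra' | clickhouse local \
    --structure="s String, n UInt64, d Date" \
    --input-format="CSV" \
    --input_format_csv_ignore_extra_columns=1 \
    -q "SELECT * FROM table"
```
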
diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 0218d268c51..f29b55f7e73 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -63,7 +63,6 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.csv.delimiter = settings.format_csv_delimiter; format_settings.csv.tuple_delimiter = settings.format_csv_delimiter; format_settings.csv.empty_as_default = settings.input_format_csv_empty_as_default; - format_settings.csv.ignore_extra_columns = settings.input_format_csv_ignore_extra_columns; format_settings.csv.enum_as_number = settings.input_format_csv_enum_as_number; format_settings.csv.null_representation = settings.format_csv_null_representation; format_settings.csv.arrays_as_nested_csv = settings.input_format_csv_arrays_as_nested_csv; @@ -72,6 +71,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.csv.try_detect_header = settings.input_format_csv_detect_header; format_settings.csv.skip_trailing_empty_lines = settings.input_format_csv_skip_trailing_empty_lines; format_settings.csv.trim_whitespaces = settings.input_format_csv_trim_whitespaces; + format_settings.csv.ignore_extra_columns = settings.input_format_csv_ignore_extra_columns; format_settings.hive_text.fields_delimiter = settings.input_format_hive_text_fields_delimiter; format_settings.hive_text.collection_items_delimiter = settings.input_format_hive_text_collection_items_delimiter; format_settings.hive_text.map_keys_delimiter = settings.input_format_hive_text_map_keys_delimiter; diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index 3bc53140fe5..38148bda373 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -128,7 +128,6 @@ struct FormatSettings bool allow_single_quotes = true; bool allow_double_quotes = true; bool empty_as_default = false; - bool ignore_extra_columns = false; bool crlf_end_of_line = false; bool enum_as_number = false; bool arrays_as_nested_csv = false; @@ -140,6 +139,7 @@ struct FormatSettings bool try_detect_header = true; bool skip_trailing_empty_lines = false; bool trim_whitespaces = true; + bool ignore_extra_columns = false; } csv; struct HiveText diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp index 0cc5889b732..8aaf8fd3e2f 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp @@ -288,6 +288,8 @@ bool CSVFormatReader::readField( const bool at_delimiter = !buf->eof() && *buf->position() == format_settings.csv.delimiter; const bool at_last_column_line_end = is_last_file_column && (buf->eof() || *buf->position() == '\n' || *buf->position() == '\r'); + bool res = false; + /// Note: Tuples are serialized in CSV as separate columns, but with empty_as_default or null_as_default /// only one empty or NULL column will be expected if (format_settings.csv.empty_as_default && (at_delimiter || at_last_column_line_end)) @@ -299,31 +301,28 @@ bool CSVFormatReader::readField( /// they do not contain empty unquoted fields, so this check /// works for tuples as well. column.insertDefault(); - return false; } - - auto skip_all = [&]() - { - if (!is_last_file_column || !format_settings.csv.ignore_extra_columns) - { - return; - } - //std::cout << "skip !!!" 
<< std::endl; - buf->position() = find_first_symbols<'\n'>(buf->position(), buf->buffer().end()); - }; - if (format_settings.null_as_default && !isNullableOrLowCardinalityNullable(type)) + else if (format_settings.null_as_default && !isNullableOrLowCardinalityNullable(type)) { /// If value is null but type is not nullable then use default value instead. - bool res = SerializationNullable::deserializeTextCSVImpl(column, *buf, format_settings, serialization); - skip_all(); - return res; + res = SerializationNullable::deserializeTextCSVImpl(column, *buf, format_settings, serialization); + } + else + { + /// Read the column normally. + serialization->deserializeTextCSV(column, *buf, format_settings); + res = true; } - /// Read the column normally. - serialization->deserializeTextCSV(column, *buf, format_settings); - - skip_all(); - return true; + if (is_last_file_column && format_settings.csv.ignore_extra_columns) + { + while (checkChar(format_settings.csv.delimiter, *buf)) + { + skipField(); + skipWhitespacesAndTabs(*buf); + } + } + return res; } void CSVFormatReader::skipPrefixBeforeHeader() diff --git a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp index eaedbbb4a1e..24bf1d0d595 100644 --- a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp +++ b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp @@ -212,8 +212,12 @@ bool RowInputFormatWithNamesAndTypes::readRow(MutableColumns & columns, RowReadE format_reader->skipRowStartDelimiter(); ext.read_columns.resize(data_types.size()); + //std::cout << "col size " << column_mapping->column_indexes_for_input_fields.size() << std::endl; for (size_t file_column = 0; file_column < column_mapping->column_indexes_for_input_fields.size(); ++file_column) { + // std::cout << " file_column " << file_column << column_mapping->names_of_columns[file_column] << std::endl; + + const auto & column_index = column_mapping->column_indexes_for_input_fields[file_column]; const bool is_last_file_column = file_column + 1 == column_mapping->column_indexes_for_input_fields.size(); if (column_index) diff --git a/tests/queries/0_stateless/00301_csv.reference b/tests/queries/0_stateless/00301_csv.reference index 61279f3b84a..3dbe3116bea 100644 --- a/tests/queries/0_stateless/00301_csv.reference +++ b/tests/queries/0_stateless/00301_csv.reference @@ -11,7 +11,9 @@ default-eof 1 2019-06-19 2016-01-01 01:02:03 NUL 2016-01-02 01:02:03 Nhello \N \N -Hello world 1 2016-01-01 -Hello world 2 2016-01-02 -Hello world 3 2016-01-03 -Hello world 4 2016-01-04 +Hello 1 String1 +Hello 2 String2 +Hello 3 String3 +Hello 4 String4 +Hello 5 String5 +Hello 6 String6 diff --git a/tests/queries/0_stateless/00301_csv.sh b/tests/queries/0_stateless/00301_csv.sh index e99c39a0f6f..fafe75f6f63 100755 --- a/tests/queries/0_stateless/00301_csv.sh +++ b/tests/queries/0_stateless/00301_csv.sh @@ -39,11 +39,14 @@ $CLICKHOUSE_CLIENT --query="SELECT * FROM csv ORDER BY s NULLS LAST"; $CLICKHOUSE_CLIENT --query="DROP TABLE csv"; -$CLICKHOUSE_CLIENT --query="CREATE TABLE csv (s String, n UInt64 DEFAULT 1, d Date DEFAULT '2019-06-19') ENGINE = Memory"; +$CLICKHOUSE_CLIENT --query="CREATE TABLE csv (s String, n UInt64 DEFAULT 3, d String DEFAULT 'String4') ENGINE = Memory"; -echo 'Hello world, 1, 2016-01-01 -Hello world, 2 ,2016-01-02, -Hello world, 3 ,2016-01-03, 2016-01-13 -Hello world, 4 ,2016-01-04, 2016-01-14, 2016-01-15' | $CLICKHOUSE_CLIENT --input_format_csv_empty_as_default=1 
--input_format_csv_ignore_extra_columns=1 --query="INSERT INTO csv FORMAT CSV"; +echo 'Hello, 1, String1 +Hello, 2, String2, +Hello, 3, String3, 2016-01-13 +Hello, 4, , 2016-01-14 +Hello, 5, String5, 2016-01-15, 2016-01-16 +Hello, 6, String6, "line with a +break"' | $CLICKHOUSE_CLIENT --input_format_defaults_for_omitted_fields=1 --input_format_csv_empty_as_default=1 --input_format_csv_ignore_extra_columns=1 --query="INSERT INTO csv FORMAT CSV"; $CLICKHOUSE_CLIENT --query="SELECT * FROM csv ORDER BY s, n"; $CLICKHOUSE_CLIENT --query="DROP TABLE csv"; \ No newline at end of file From 806176d88e0b4237c16e23aed27179ed93aa17c1 Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Thu, 15 Jun 2023 11:23:08 +0000 Subject: [PATCH 03/38] Add input_format_csv_missing_as_default setting and tests --- docs/en/interfaces/formats.md | 3 ++- .../operations/settings/settings-formats.md | 8 +++++++- docs/ru/interfaces/formats.md | 3 ++- docs/ru/operations/settings/settings.md | 8 +++++++- src/Core/Settings.h | 3 ++- src/Dictionaries/CacheDictionary.cpp | 2 +- src/Formats/FormatFactory.cpp | 1 + src/Formats/FormatSettings.h | 1 + .../Formats/Impl/CSVRowInputFormat.cpp | 18 +++++++++++++++++- .../Formats/Impl/CSVRowInputFormat.h | 1 + .../RowInputFormatWithNamesAndTypes.cpp | 4 ---- tests/queries/0_stateless/00301_csv.reference | 10 ++++++++++ tests/queries/0_stateless/00301_csv.sh | 19 +++++++++++++++++-- 13 files changed, 68 insertions(+), 13 deletions(-) diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index 950692deb77..e0b0fcfabd5 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -470,7 +470,8 @@ The CSV format supports the output of totals and extremes the same way as `TabSe - [input_format_csv_detect_header](/docs/en/operations/settings/settings-formats.md/#input_format_csv_detect_header) - automatically detect header with names and types in CSV format. Default value - `true`. - [input_format_csv_skip_trailing_empty_lines](/docs/en/operations/settings/settings-formats.md/#input_format_csv_skip_trailing_empty_lines) - skip trailing empty lines at the end of data. Default value - `false`. - [input_format_csv_trim_whitespaces](/docs/en/operations/settings/settings-formats.md/#input_format_csv_trim_whitespaces) - trim spaces and tabs in non-quoted CSV strings. Default value - `true`. -- [input_format_csv_ignore_extra_columns](/docs/en/operations/settings/settings-formats.md/#input_format_csv_ignore_extra_columns) - ignore extra colums in CSV input. Default value - `false`. +- [input_format_csv_ignore_extra_columns](/docs/en/operations/settings/settings-formats.md/#input_format_csv_ignore_extra_columns) - ignore extra columns in CSV input (if your file has more columns than expected). Default value - `false`. +- [input_format_csv_missing_as_default](/docs/en/operations/settings/settings-formats.md/#input_format_csv_missing_as_default) - treat missing fields in CSV input as default values. Default value - `false`. ## CSVWithNames {#csvwithnames} diff --git a/docs/en/operations/settings/settings-formats.md b/docs/en/operations/settings/settings-formats.md index e721c9408e3..6d9a1fb5160 100644 --- a/docs/en/operations/settings/settings-formats.md +++ b/docs/en/operations/settings/settings-formats.md @@ -933,7 +933,13 @@ Result ``` ### input_format_csv_ignore_extra_columns {#input_format_csv_ignore_extra_columns} -Ignore extra colums in CSV input. +Ignore extra columns in CSV input (if your file has more columns than expected). + +Disabled by default. 
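Following the Example/Result pattern of the neighboring settings in this file, the subsection above could carry an illustration along these lines; the command and sample data are hypothetical and assume a build that includes this patch:

Example

```bash
echo 'Hello, 1, "extra field", 2016-01-19' | ./clickhouse local -q "select * from table" --structure="s String, n UInt64" --input-format="CSV" --input_format_csv_ignore_extra_columns=1
```

Result

```text
Hello	1
```
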
+ +### input_format_csv_missing_as_default {#input_format_csv_missing_as_default} + +Treat missing fields in CSV input as default values. Disabled by default. diff --git a/docs/ru/interfaces/formats.md b/docs/ru/interfaces/formats.md index 8488f4ce55a..7e3bb3f7d26 100644 --- a/docs/ru/interfaces/formats.md +++ b/docs/ru/interfaces/formats.md @@ -402,7 +402,8 @@ $ clickhouse-client --format_csv_delimiter="|" --query="INSERT INTO test.csv FOR - [input_format_csv_skip_first_lines](../operations/settings/settings.md#input_format_csv_skip_first_lines) - пропустить указанное количество строк в начале данных. Значение по умолчанию - `0`. - [input_format_csv_detect_header](../operations/settings/settings.md#input_format_csv_detect_header) - обнаружить заголовок с именами и типами в формате CSV. Значение по умолчанию - `true`. - [input_format_csv_trim_whitespaces](../operations/settings/settings.md#input_format_csv_trim_whitespaces) - удалить пробелы и символы табуляции из строк без кавычек. Значение по умолчанию - `true`. -- [input_format_csv_ignore_extra_columns](../operations/settings/settings.md/#input_format_csv_ignore_extra_columns) - игнорировать дополнительные столбцы. Значение по умолчанию - `false`. +- [input_format_csv_ignore_extra_columns](../operations/settings/settings.md/#input_format_csv_ignore_extra_columns) - игнорировать дополнительные столбцы (если файл содержит больше столбцов чем ожидается). Значение по умолчанию - `false`. +- [input_format_csv_missing_as_default](../operations/settings/settings.md/#input_format_csv_missing_as_default) - рассматривать отсутствующие поля в CSV в качестве значений по умолчанию. Значение по умолчанию - `false`. ## CSVWithNames {#csvwithnames} diff --git a/docs/ru/operations/settings/settings.md b/docs/ru/operations/settings/settings.md index 33d9300f8e1..61cfc332585 100644 --- a/docs/ru/operations/settings/settings.md +++ b/docs/ru/operations/settings/settings.md @@ -1729,7 +1729,13 @@ echo ' string ' | ./clickhouse local -q "select * from table FORMAT CSV" --in ## input_format_csv_ignore_extra_columns {#input_format_csv_ignore_extra_columns} -Игнорировать дополнительные столбцы. +Игнорировать дополнительные столбцы (если файл содержит больше столбцов чем ожидается). + +Выключено по умолчанию. + +## input_format_csv_missing_as_default {#input_format_csv_missing_as_default} + +Рассматривать отсутствующие поля в CSV в качестве значений по умолчанию. Выключено по умолчанию. diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 9582419b98c..ce7c28996e8 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -1000,7 +1000,8 @@ class IColumn; M(Bool, regexp_dict_allow_hyperscan, true, "Allow regexp_tree dictionary using Hyperscan library.", 0) \ \ M(Bool, dictionary_use_async_executor, false, "Execute a pipeline for reading from a dictionary with several threads. It's supported only by DIRECT dictionary with CLICKHOUSE source.", 0) \ - M(Bool, input_format_csv_ignore_extra_columns, false, "Ignore extra colums in CSV input", 0) \ + M(Bool, input_format_csv_ignore_extra_columns, false, "Ignore extra columns in CSV input (if your file has more columns than expected)", 0) \ + M(Bool, input_format_csv_missing_as_default, false, "Treat missing fields in CSV input as default values", 0) \ // End of FORMAT_FACTORY_SETTINGS // Please add settings non-related to formats into the COMMON_SETTINGS above. 
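A minimal sketch of what the companion setting registered above is meant to permit; the invocation and data are illustrative and assume a build that includes this patch:

```bash
# The second row ends after its first field; with the setting enabled
# the remaining columns receive their defaults (0 for the plain UInt64
# column and the empty string for the String column used here).
echo 'Hello, 1, world
Hello' | clickhouse local \
    --structure="a String, b UInt64, c String" \
    --input-format="CSV" \
    --input_format_csv_missing_as_default=1 \
    -q "SELECT * FROM table"
```
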
diff --git a/src/Dictionaries/CacheDictionary.cpp b/src/Dictionaries/CacheDictionary.cpp index c5c88a9f142..359f7c17436 100644 --- a/src/Dictionaries/CacheDictionary.cpp +++ b/src/Dictionaries/CacheDictionary.cpp @@ -138,7 +138,7 @@ Columns CacheDictionary::getColumns( const Columns & default_values_columns) const { /** - * Flow of getColumsImpl + * Flow of getColumnsImpl * 1. Get fetch result from storage * 2. If all keys are found in storage and not expired * 2.1. If storage returns fetched columns in order of keys then result is returned to client. diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index f29b55f7e73..102b5d7eec0 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -72,6 +72,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.csv.skip_trailing_empty_lines = settings.input_format_csv_skip_trailing_empty_lines; format_settings.csv.trim_whitespaces = settings.input_format_csv_trim_whitespaces; format_settings.csv.ignore_extra_columns = settings.input_format_csv_ignore_extra_columns; + format_settings.csv.missing_as_default = settings.input_format_csv_missing_as_default; format_settings.hive_text.fields_delimiter = settings.input_format_hive_text_fields_delimiter; format_settings.hive_text.collection_items_delimiter = settings.input_format_hive_text_collection_items_delimiter; format_settings.hive_text.map_keys_delimiter = settings.input_format_hive_text_map_keys_delimiter; diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index 38148bda373..2b52d88184c 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -140,6 +140,7 @@ struct FormatSettings bool skip_trailing_empty_lines = false; bool trim_whitespaces = true; bool ignore_extra_columns = false; + bool missing_as_default = false; } csv; struct HiveText diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp index 8aaf8fd3e2f..dcc057baef2 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp @@ -147,7 +147,18 @@ CSVFormatReader::CSVFormatReader(PeekableReadBuffer & buf_, const FormatSettings void CSVFormatReader::skipFieldDelimiter() { skipWhitespacesAndTabs(*buf); - assertChar(format_settings.csv.delimiter, *buf); + + bool res = checkChar(format_settings.csv.delimiter, *buf); + if (!res && !format_settings.csv.missing_as_default) + { + char err[2] = {format_settings.csv.delimiter, '\0'}; + throwAtAssertionFailed(err, *buf); + } + + if (!res && format_settings.csv.missing_as_default) + { + current_row_has_missing_fields = true; + } } template @@ -187,6 +198,7 @@ void CSVFormatReader::skipRowEndDelimiter() return; skipEndOfLine(*buf); + current_row_has_missing_fields = false; } void CSVFormatReader::skipHeaderRow() @@ -302,6 +314,10 @@ bool CSVFormatReader::readField( /// works for tuples as well. column.insertDefault(); } + else if (current_row_has_missing_fields) + { + column.insertDefault(); + } else if (format_settings.null_as_default && !isNullableOrLowCardinalityNullable(type)) { /// If value is null but type is not nullable then use default value instead. 
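The reader changes above distinguish a field that is present but empty (its delimiter was found) from a field that is missing outright (no delimiter, so `skipFieldDelimiter` sets `current_row_has_missing_fields`). A sketch of that distinction with both settings enabled; the data and invocation are illustrative:

```bash
# Row 1: field b is empty but delimited -> input_format_csv_empty_as_default.
# Row 2: fields b and c are absent entirely -> the new
#        input_format_csv_missing_as_default path inserts defaults.
echo 'Hello,,x
Hello' | clickhouse local \
    --structure="a String, b UInt64, c String" \
    --input-format="CSV" \
    --input_format_csv_empty_as_default=1 \
    --input_format_csv_missing_as_default=1 \
    -q "SELECT * FROM table"
# Both rows parse; b becomes 0 in each case, c is 'x' and then ''.
```
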
diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.h b/src/Processors/Formats/Impl/CSVRowInputFormat.h index 0c8099a216c..3958c66bbc6 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.h +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.h @@ -89,6 +89,7 @@ public: protected: PeekableReadBuffer * buf; + bool current_row_has_missing_fields = false; }; class CSVSchemaReader : public FormatWithNamesAndTypesSchemaReader diff --git a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp index 24bf1d0d595..eaedbbb4a1e 100644 --- a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp +++ b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp @@ -212,12 +212,8 @@ bool RowInputFormatWithNamesAndTypes::readRow(MutableColumns & columns, RowReadE format_reader->skipRowStartDelimiter(); ext.read_columns.resize(data_types.size()); - //std::cout << "col size " << column_mapping->column_indexes_for_input_fields.size() << std::endl; for (size_t file_column = 0; file_column < column_mapping->column_indexes_for_input_fields.size(); ++file_column) { - // std::cout << " file_column " << file_column << column_mapping->names_of_columns[file_column] << std::endl; - - const auto & column_index = column_mapping->column_indexes_for_input_fields[file_column]; const bool is_last_file_column = file_column + 1 == column_mapping->column_indexes_for_input_fields.size(); if (column_index) diff --git a/tests/queries/0_stateless/00301_csv.reference b/tests/queries/0_stateless/00301_csv.reference index 3dbe3116bea..fa85fd924e1 100644 --- a/tests/queries/0_stateless/00301_csv.reference +++ b/tests/queries/0_stateless/00301_csv.reference @@ -1,19 +1,29 @@ +=== Test input_format_csv_empty_as_default Hello, world 123 2016-01-01 Hello, "world" 456 2016-01-02 Hello "world" 789 2016-01-03 Hello\n world 100 2016-01-04 default 1 2019-06-19 default-eof 1 2019-06-19 +=== Test datetime 2016-01-01 01:02:03 1 2016-01-02 01:02:03 2 2017-08-15 13:15:01 3 1970-01-02 05:46:39 4 +=== Test nullable datetime 2016-01-01 01:02:03 NUL 2016-01-02 01:02:03 Nhello \N \N +=== Test input_format_csv_ignore_extra_columns Hello 1 String1 Hello 2 String2 Hello 3 String3 Hello 4 String4 Hello 5 String5 Hello 6 String6 +=== Test input_format_csv_missing_as_default +Hello 0 33 \N 55 Default +Hello 0 33 \N 55 Default +Hello 1 2 \N 55 Default +Hello 1 2 3 4 String +Hello 1 2 3 4 String diff --git a/tests/queries/0_stateless/00301_csv.sh b/tests/queries/0_stateless/00301_csv.sh index fafe75f6f63..887a75b0ded 100755 --- a/tests/queries/0_stateless/00301_csv.sh +++ b/tests/queries/0_stateless/00301_csv.sh @@ -4,6 +4,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . 
"$CURDIR"/../shell_config.sh +echo === Test input_format_csv_empty_as_default $CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS csv"; $CLICKHOUSE_CLIENT --query="CREATE TABLE csv (s String, n UInt64 DEFAULT 1, d Date DEFAULT '2019-06-19') ENGINE = Memory"; @@ -18,6 +19,7 @@ Hello "world", 789 ,2016-01-03 $CLICKHOUSE_CLIENT --query="SELECT * FROM csv ORDER BY d, s"; $CLICKHOUSE_CLIENT --query="DROP TABLE csv"; +echo === Test datetime $CLICKHOUSE_CLIENT --query="CREATE TABLE csv (t DateTime('Asia/Istanbul'), s String) ENGINE = Memory"; echo '"2016-01-01 01:02:03","1" @@ -28,7 +30,7 @@ echo '"2016-01-01 01:02:03","1" $CLICKHOUSE_CLIENT --query="SELECT * FROM csv ORDER BY s"; $CLICKHOUSE_CLIENT --query="DROP TABLE csv"; - +echo === Test nullable datetime $CLICKHOUSE_CLIENT --query="CREATE TABLE csv (t Nullable(DateTime('Asia/Istanbul')), s Nullable(String)) ENGINE = Memory"; echo 'NULL, NULL @@ -39,6 +41,7 @@ $CLICKHOUSE_CLIENT --query="SELECT * FROM csv ORDER BY s NULLS LAST"; $CLICKHOUSE_CLIENT --query="DROP TABLE csv"; +echo === Test input_format_csv_ignore_extra_columns $CLICKHOUSE_CLIENT --query="CREATE TABLE csv (s String, n UInt64 DEFAULT 3, d String DEFAULT 'String4') ENGINE = Memory"; echo 'Hello, 1, String1 @@ -49,4 +52,16 @@ Hello, 5, String5, 2016-01-15, 2016-01-16 Hello, 6, String6, "line with a break"' | $CLICKHOUSE_CLIENT --input_format_defaults_for_omitted_fields=1 --input_format_csv_empty_as_default=1 --input_format_csv_ignore_extra_columns=1 --query="INSERT INTO csv FORMAT CSV"; $CLICKHOUSE_CLIENT --query="SELECT * FROM csv ORDER BY s, n"; -$CLICKHOUSE_CLIENT --query="DROP TABLE csv"; \ No newline at end of file +$CLICKHOUSE_CLIENT --query="DROP TABLE csv"; + + +echo === Test input_format_csv_missing_as_default +$CLICKHOUSE_CLIENT --query="CREATE TABLE csv (f1 String, f2 UInt64, f3 UInt64 Default 33, f4 Nullable(UInt64), f5 Nullable(UInt64) Default 55, f6 String DEFAULT 'Default') ENGINE = Memory"; + +echo 'Hello +Hello, +Hello, 1, 2 +Hello, 1, 2, 3, 4, String +Hello, 1, 2, 3, 4, String,'| $CLICKHOUSE_CLIENT --input_format_defaults_for_omitted_fields=1 --input_format_csv_empty_as_default=1 --input_format_csv_missing_as_default=1 --query="INSERT INTO csv FORMAT CSV"; +$CLICKHOUSE_CLIENT --query="SELECT * FROM csv ORDER BY f1, f2, f3, f4 NULLS FIRST, f5, f6"; +$CLICKHOUSE_CLIENT --query="DROP TABLE csv"; From 0eeee11dc46d462412ad671a7d59006fba59c403 Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Thu, 15 Jun 2023 12:36:18 +0000 Subject: [PATCH 04/38] Style fix, add comment --- .../Formats/Impl/CSVRowInputFormat.cpp | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp index dcc057baef2..7cd812bc5b0 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp @@ -149,15 +149,17 @@ void CSVFormatReader::skipFieldDelimiter() skipWhitespacesAndTabs(*buf); bool res = checkChar(format_settings.csv.delimiter, *buf); - if (!res && !format_settings.csv.missing_as_default) + if (!res) { - char err[2] = {format_settings.csv.delimiter, '\0'}; - throwAtAssertionFailed(err, *buf); - } - - if (!res && format_settings.csv.missing_as_default) - { - current_row_has_missing_fields = true; + if (!format_settings.csv.missing_as_default) + { + char err[2] = {format_settings.csv.delimiter, '\0'}; + throwAtAssertionFailed(err, *buf); + } + else + { + current_row_has_missing_fields = true; + } } } @@ -332,6 
+334,7 @@ bool CSVFormatReader::readField( if (is_last_file_column && format_settings.csv.ignore_extra_columns) { + // Skip all fields to next line. while (checkChar(format_settings.csv.delimiter, *buf)) { skipField(); From 02b5b50e41f38bd0f0b67fd2c414579aeb0cd051 Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Fri, 16 Jun 2023 12:39:46 +0000 Subject: [PATCH 05/38] Add milli/micro seconds support for date_diff --- .../functions/date-time-functions.md | 2 ++ .../functions/date-time-functions.md | 2 ++ .../functions/date-time-functions.md | 2 ++ src/Functions/DateTimeTransforms.h | 29 +++++++++++++++++++ src/Functions/TransformDateTime64.h | 2 +- src/Functions/dateDiff.cpp | 4 +++ .../02160_special_functions.reference | 8 +++++ .../0_stateless/02160_special_functions.sql | 9 ++++++ 8 files changed, 57 insertions(+), 1 deletion(-) diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index 280b41e7a5f..82f1a8aa237 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -782,6 +782,8 @@ Aliases: `dateDiff`, `DATE_DIFF`, `timestampDiff`, `timestamp_diff`, `TIMESTAMP_ - `unit` — The type of interval for result. [String](../../sql-reference/data-types/string.md). Possible values: + - `microsecond` (possible abbreviations: `us`, `u`) + - `millisecond` (possible abbreviations: `ms`) - `second` (possible abbreviations: `ss`, `s`) - `minute` (possible abbreviations: `mi`, `n`) - `hour` (possible abbreviations: `hh`, `h`) diff --git a/docs/ru/sql-reference/functions/date-time-functions.md b/docs/ru/sql-reference/functions/date-time-functions.md index 867d71d334c..93ae750b10d 100644 --- a/docs/ru/sql-reference/functions/date-time-functions.md +++ b/docs/ru/sql-reference/functions/date-time-functions.md @@ -680,6 +680,8 @@ date_diff('unit', startdate, enddate, [timezone]) - `unit` — единица измерения времени, в которой будет выражено возвращаемое значение функции. [String](../../sql-reference/data-types/string.md). 
Возможные значения: + - `microsecond` (возможные сокращения: `us`, `u`) + - `millisecond` (возможные сокращения: `ms`) - `second` (возможные сокращения: `ss`, `s`) - `minute` (возможные сокращения: `mi`, `n`) - `hour` (возможные сокращения: `hh`, `h`) diff --git a/docs/zh/sql-reference/functions/date-time-functions.md b/docs/zh/sql-reference/functions/date-time-functions.md index 53dadc23c6d..e4b70322477 100644 --- a/docs/zh/sql-reference/functions/date-time-functions.md +++ b/docs/zh/sql-reference/functions/date-time-functions.md @@ -643,6 +643,8 @@ date_diff('unit', startdate, enddate, [timezone]) - `unit` — `value`对应的时间单位。类型为[String](../../sql-reference/data-types/string.md)。 可能的值: + - `microsecond` + - `millisecond` - `second` - `minute` - `hour` diff --git a/src/Functions/DateTimeTransforms.h b/src/Functions/DateTimeTransforms.h index 84c71c89b11..4d15078f2d7 100644 --- a/src/Functions/DateTimeTransforms.h +++ b/src/Functions/DateTimeTransforms.h @@ -1377,6 +1377,35 @@ struct ToRelativeSecondNumImpl using FactorTransform = ZeroTransform; }; +template +struct ToRelativeSubsecondNumImpl +{ + static constexpr auto name = "toRelativeSubsecondNumImpl"; + + static inline UInt64 execute(const DateTime64 & t, DateTime64::NativeType scale, const DateLUTImpl &) + { + if (scale == second_divider) + return t.value; + if (scale > second_divider) + return t.value / (scale / second_divider); + return t.value * (second_divider / scale); + } + static inline UInt64 execute(UInt32 t, const DateLUTImpl &) + { + return t * second_divider; + } + static inline UInt64 execute(Int32 d, const DateLUTImpl & time_zone) + { + return static_cast(time_zone.fromDayNum(ExtendedDayNum(d))) * second_divider; + } + static inline UInt64 execute(UInt16 d, const DateLUTImpl & time_zone) + { + return static_cast(time_zone.fromDayNum(DayNum(d)) * second_divider); + } + + using FactorTransform = ZeroTransform; +}; + struct ToYYYYMMImpl { static constexpr auto name = "toYYYYMM"; diff --git a/src/Functions/TransformDateTime64.h b/src/Functions/TransformDateTime64.h index 3dab9efeb6b..1a1e732ae40 100644 --- a/src/Functions/TransformDateTime64.h +++ b/src/Functions/TransformDateTime64.h @@ -5,7 +5,7 @@ namespace DB { -/** Tansform-type wrapper for DateTime64, simplifies DateTime64 support for given Transform. +/** Transform-type wrapper for DateTime64, simplifies DateTime64 support for given Transform. 
* * Depending on what overloads of Transform::execute() are available, when called with DateTime64 value, * invokes Transform::execute() with either: diff --git a/src/Functions/dateDiff.cpp b/src/Functions/dateDiff.cpp index 8361e9db166..62f01274476 100644 --- a/src/Functions/dateDiff.cpp +++ b/src/Functions/dateDiff.cpp @@ -373,6 +373,10 @@ public: impl.template dispatchForColumns>(x, y, timezone_x, timezone_y, res->getData()); else if (unit == "second" || unit == "ss" || unit == "s") impl.template dispatchForColumns>(x, y, timezone_x, timezone_y, res->getData()); + else if (unit == "millisecond" || unit == "ms") + impl.template dispatchForColumns>(x, y, timezone_x, timezone_y, res->getData()); + else if (unit == "microsecond" || unit == "us" || unit == "u") + impl.template dispatchForColumns>(x, y, timezone_x, timezone_y, res->getData()); else throw Exception(ErrorCodes::BAD_ARGUMENTS, "Function {} does not support '{}' unit", getName(), unit); diff --git a/tests/queries/0_stateless/02160_special_functions.reference b/tests/queries/0_stateless/02160_special_functions.reference index 3a1dcd88902..37278d4b5dc 100644 --- a/tests/queries/0_stateless/02160_special_functions.reference +++ b/tests/queries/0_stateless/02160_special_functions.reference @@ -33,4 +33,12 @@ Hello 2021-01-01 1 1 +86400000 +172800000 +86461000 +86401299 +701 +701 +800 +60200201 1 diff --git a/tests/queries/0_stateless/02160_special_functions.sql b/tests/queries/0_stateless/02160_special_functions.sql index 6d18e7d0d25..6002f793601 100644 --- a/tests/queries/0_stateless/02160_special_functions.sql +++ b/tests/queries/0_stateless/02160_special_functions.sql @@ -41,4 +41,13 @@ SELECT TIMESTAMPSUB(DATE '2022-01-01', INTERVAL 1 YEAR); SELECT DATE_DIFF(YEAR, DATE '2021-01-01', DATE '2022-01-01'); SELECT DATEDIFF(YEAR, DATE '2021-01-01', DATE '2022-01-01'); +SELECT DATEDIFF(millisecond, '2021-01-01'::Date, '2021-01-02'::Date); +SELECT DATEDIFF(millisecond, '2021-01-01'::Date, '2021-01-03'::Date32); +SELECT DATEDIFF(millisecond, '2021-01-01'::Date, '2021-01-02 00:01:01'::DateTime); +SELECT DATEDIFF(millisecond, '2021-01-01'::Date, '2021-01-02 00:00:01.299'::DateTime64); +SELECT DATEDIFF(millisecond, '2021-01-01 23:59:59.299'::DateTime64, '2021-01-02'::Date); +SELECT DATEDIFF(millisecond, '2021-01-01 23:59:59.299999'::DateTime64(6), '2021-01-02'::Date); +SELECT DATEDIFF(millisecond, '2021-01-01 23:59:59.2'::DateTime64(1), '2021-01-02'::Date); +SELECT DATEDIFF(microsecond, '2021-01-01 23:59:59.899999'::DateTime64(6), '2021-01-02 00:01:00.100200300'::DateTime64(9)); + SELECT EXISTS (SELECT 1); From dd43a186adca8b4480e9287127c740f6b05d743d Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Mon, 19 Jun 2023 09:51:29 +0000 Subject: [PATCH 06/38] Minor edit docs / add int256 test --- docs/en/interfaces/formats.md | 2 +- docs/en/operations/settings/settings-formats.md | 2 +- src/Core/Settings.h | 2 +- src/Processors/Formats/Impl/CSVRowInputFormat.cpp | 2 +- tests/queries/0_stateless/00301_csv.reference | 10 +++++----- tests/queries/0_stateless/00301_csv.sh | 10 +++++----- 6 files changed, 14 insertions(+), 14 deletions(-) diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index 7a900ecd869..c5cd19f1743 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -471,7 +471,7 @@ The CSV format supports the output of totals and extremes the same way as `TabSe - 
[input_format_csv_skip_trailing_empty_lines](/docs/en/operations/settings/settings-formats.md/#input_format_csv_skip_trailing_empty_lines) - skip trailing empty lines at the end of data. Default value - `false`. - [input_format_csv_trim_whitespaces](/docs/en/operations/settings/settings-formats.md/#input_format_csv_trim_whitespaces) - trim spaces and tabs in non-quoted CSV strings. Default value - `true`. - [input_format_csv_allow_whitespace_or_tab_as_delimiter](/docs/en/operations/settings/settings-formats.md/# input_format_csv_allow_whitespace_or_tab_as_delimiter) - Allow to use whitespace or tab as field delimiter in CSV strings. Default value - `false`. -- [input_format_csv_ignore_extra_columns](/docs/en/operations/settings/settings-formats.md/#input_format_csv_ignore_extra_columns) - ignore extra columns in CSV input (if your file has more columns than expected). Default value - `false`. +- [input_format_csv_ignore_extra_columns](/docs/en/operations/settings/settings-formats.md/#input_format_csv_ignore_extra_columns) - ignore extra columns in CSV input (if file has more columns than expected). Default value - `false`. - [input_format_csv_missing_as_default](/docs/en/operations/settings/settings-formats.md/#input_format_csv_missing_as_default) - treat missing fields in CSV input as default values. Default value - `false`. ## CSVWithNames {#csvwithnames} diff --git a/docs/en/operations/settings/settings-formats.md b/docs/en/operations/settings/settings-formats.md index c17a24abccf..6b05f41666c 100644 --- a/docs/en/operations/settings/settings-formats.md +++ b/docs/en/operations/settings/settings-formats.md @@ -933,7 +933,7 @@ Result ``` ### input_format_csv_ignore_extra_columns {#input_format_csv_ignore_extra_columns} -Ignore extra columns in CSV input (if your file has more columns than expected). +Ignore extra columns in CSV input (if file has more columns than expected). Disabled by default. diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 76bb7ae9206..e60d2df933f 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -1005,7 +1005,7 @@ class IColumn; M(Bool, regexp_dict_allow_hyperscan, true, "Allow regexp_tree dictionary using Hyperscan library.", 0) \ \ M(Bool, dictionary_use_async_executor, false, "Execute a pipeline for reading from a dictionary with several threads. 
It's supported only by DIRECT dictionary with CLICKHOUSE source.", 0) \ - M(Bool, input_format_csv_ignore_extra_columns, false, "Ignore extra columns in CSV input (if your file has more columns than expected)", 0) \ + M(Bool, input_format_csv_ignore_extra_columns, false, "Ignore extra columns in CSV input (if file has more columns than expected)", 0) \ M(Bool, input_format_csv_missing_as_default, false, "Treat missing fields in CSV input as default values", 0) \ // End of FORMAT_FACTORY_SETTINGS diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp index c80887bde0a..a727a5bc490 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp @@ -346,7 +346,7 @@ bool CSVFormatReader::readField( while (checkChar(format_settings.csv.delimiter, *buf)) { skipField(); - skipWhitespacesAndTabs(*buf); + skipWhitespacesAndTabs(*buf, format_settings.csv.allow_whitespace_or_tab_as_delimiter); } } return res; diff --git a/tests/queries/0_stateless/00301_csv.reference b/tests/queries/0_stateless/00301_csv.reference index fa85fd924e1..e3441a3a4b3 100644 --- a/tests/queries/0_stateless/00301_csv.reference +++ b/tests/queries/0_stateless/00301_csv.reference @@ -22,8 +22,8 @@ Hello 4 String4 Hello 5 String5 Hello 6 String6 === Test input_format_csv_missing_as_default -Hello 0 33 \N 55 Default -Hello 0 33 \N 55 Default -Hello 1 2 \N 55 Default -Hello 1 2 3 4 String -Hello 1 2 3 4 String +Hello 0 0 33 \N 55 Default +Hello 0 0 33 \N 55 Default +Hello 1 3 2 \N 55 Default +Hello 1 4 2 3 4 String +Hello 1 5 2 3 4 String diff --git a/tests/queries/0_stateless/00301_csv.sh b/tests/queries/0_stateless/00301_csv.sh index 887a75b0ded..4555e0476d8 100755 --- a/tests/queries/0_stateless/00301_csv.sh +++ b/tests/queries/0_stateless/00301_csv.sh @@ -56,12 +56,12 @@ $CLICKHOUSE_CLIENT --query="DROP TABLE csv"; echo === Test input_format_csv_missing_as_default -$CLICKHOUSE_CLIENT --query="CREATE TABLE csv (f1 String, f2 UInt64, f3 UInt64 Default 33, f4 Nullable(UInt64), f5 Nullable(UInt64) Default 55, f6 String DEFAULT 'Default') ENGINE = Memory"; +$CLICKHOUSE_CLIENT --query="CREATE TABLE csv (f1 String, f2 UInt64, f3 UInt256, f4 UInt64 Default 33, f5 Nullable(UInt64), f6 Nullable(UInt64) Default 55, f7 String DEFAULT 'Default') ENGINE = Memory"; echo 'Hello Hello, -Hello, 1, 2 -Hello, 1, 2, 3, 4, String -Hello, 1, 2, 3, 4, String,'| $CLICKHOUSE_CLIENT --input_format_defaults_for_omitted_fields=1 --input_format_csv_empty_as_default=1 --input_format_csv_missing_as_default=1 --query="INSERT INTO csv FORMAT CSV"; -$CLICKHOUSE_CLIENT --query="SELECT * FROM csv ORDER BY f1, f2, f3, f4 NULLS FIRST, f5, f6"; +Hello, 1, 3, 2 +Hello, 1, 4, 2, 3, 4, String +Hello, 1, 5, 2, 3, 4, String,'| $CLICKHOUSE_CLIENT --input_format_defaults_for_omitted_fields=1 --input_format_csv_missing_as_default=1 --query="INSERT INTO csv FORMAT CSV"; +$CLICKHOUSE_CLIENT --query="SELECT * FROM csv ORDER BY f1, f2, f3, f4, f5 NULLS FIRST, f6, f7"; $CLICKHOUSE_CLIENT --query="DROP TABLE csv"; From f81401db99e194c4a8d231a38918da53bd221e90 Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Mon, 19 Jun 2023 10:48:38 +0000 Subject: [PATCH 07/38] Add empty line test --- src/Processors/Formats/Impl/CSVRowInputFormat.h | 2 ++ tests/queries/0_stateless/00301_csv.reference | 2 ++ tests/queries/0_stateless/00301_csv.sh | 4 +++- 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.h 
b/src/Processors/Formats/Impl/CSVRowInputFormat.h index 3958c66bbc6..82e03c453e7 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.h +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.h @@ -89,6 +89,8 @@ public: protected: PeekableReadBuffer * buf; + +private: bool current_row_has_missing_fields = false; }; diff --git a/tests/queries/0_stateless/00301_csv.reference b/tests/queries/0_stateless/00301_csv.reference index e3441a3a4b3..140bbda84e7 100644 --- a/tests/queries/0_stateless/00301_csv.reference +++ b/tests/queries/0_stateless/00301_csv.reference @@ -22,6 +22,8 @@ Hello 4 String4 Hello 5 String5 Hello 6 String6 === Test input_format_csv_missing_as_default + 0 0 33 \N 55 Default + 0 0 33 \N 55 Default Hello 0 0 33 \N 55 Default Hello 0 0 33 \N 55 Default Hello 1 3 2 \N 55 Default diff --git a/tests/queries/0_stateless/00301_csv.sh b/tests/queries/0_stateless/00301_csv.sh index 4555e0476d8..aa019147bab 100755 --- a/tests/queries/0_stateless/00301_csv.sh +++ b/tests/queries/0_stateless/00301_csv.sh @@ -58,7 +58,9 @@ $CLICKHOUSE_CLIENT --query="DROP TABLE csv"; echo === Test input_format_csv_missing_as_default $CLICKHOUSE_CLIENT --query="CREATE TABLE csv (f1 String, f2 UInt64, f3 UInt256, f4 UInt64 Default 33, f5 Nullable(UInt64), f6 Nullable(UInt64) Default 55, f7 String DEFAULT 'Default') ENGINE = Memory"; -echo 'Hello +echo ' +, +Hello Hello, Hello, 1, 3, 2 Hello, 1, 4, 2, 3, 4, String From 792cdb6da5b390f953cb4e704f8de0a25e76633b Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Tue, 20 Jun 2023 08:26:59 +0000 Subject: [PATCH 08/38] Add millisecond support to age() / add tests --- .../functions/date-time-functions.md | 4 +- .../functions/date-time-functions.md | 4 +- .../functions/date-time-functions.md | 1 + src/Core/DecimalFunctions.h | 2 +- src/Functions/DateTimeTransforms.h | 50 +++++++--- src/Functions/dateDiff.cpp | 93 ++++++++++++------ .../02477_age_datetime64.reference | 97 +++++++++++++++++++ .../0_stateless/02477_age_datetime64.sql | 59 +++++++++++ 8 files changed, 259 insertions(+), 51 deletions(-) diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index 82f1a8aa237..9dca91668f2 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -691,7 +691,7 @@ SELECT toDate('2016-12-27') AS date, toYearWeek(date) AS yearWeek0, toYearWeek(d ## age -Returns the `unit` component of the difference between `startdate` and `enddate`. The difference is calculated using a precision of 1 second. +Returns the `unit` component of the difference between `startdate` and `enddate`. The difference is calculated using a precision of 1 microsecond. E.g. the difference between `2021-12-29` and `2022-01-01` is 3 days for `day` unit, 0 months for `month` unit, 0 years for `year` unit. For an alternative to `age`, see function `date\_diff`. @@ -707,6 +707,8 @@ age('unit', startdate, enddate, [timezone]) - `unit` — The type of interval for result. [String](../../sql-reference/data-types/string.md). 
Possible values: + - `microsecond` (possible abbreviations: `us`, `u`) + - `millisecond` (possible abbreviations: `ms`) - `second` (possible abbreviations: `ss`, `s`) - `minute` (possible abbreviations: `mi`, `n`) - `hour` (possible abbreviations: `hh`, `h`) diff --git a/docs/ru/sql-reference/functions/date-time-functions.md b/docs/ru/sql-reference/functions/date-time-functions.md index 93ae750b10d..ddd3d2ca6f0 100644 --- a/docs/ru/sql-reference/functions/date-time-functions.md +++ b/docs/ru/sql-reference/functions/date-time-functions.md @@ -593,7 +593,7 @@ SELECT toDate('2016-12-27') AS date, toYearWeek(date) AS yearWeek0, toYearWeek(d ## age -Вычисляет компонент `unit` разницы между `startdate` и `enddate`. Разница вычисляется с точностью в 1 секунду. +Вычисляет компонент `unit` разницы между `startdate` и `enddate`. Разница вычисляется с точностью в 1 микросекунду. Например, разница между `2021-12-29` и `2022-01-01` 3 дня для единицы `day`, 0 месяцев для единицы `month`, 0 лет для единицы `year`. **Синтаксис** @@ -607,6 +607,8 @@ age('unit', startdate, enddate, [timezone]) - `unit` — единица измерения времени, в которой будет выражено возвращаемое значение функции. [String](../../sql-reference/data-types/string.md). Возможные значения: + - `microsecond` (возможные сокращения: `us`, `u`) + - `millisecond` (возможные сокращения: `ms`) - `second` (возможные сокращения: `ss`, `s`) - `minute` (возможные сокращения: `mi`, `n`) - `hour` (возможные сокращения: `hh`, `h`) diff --git a/docs/zh/sql-reference/functions/date-time-functions.md b/docs/zh/sql-reference/functions/date-time-functions.md index e4b70322477..270fa44a421 100644 --- a/docs/zh/sql-reference/functions/date-time-functions.md +++ b/docs/zh/sql-reference/functions/date-time-functions.md @@ -625,6 +625,7 @@ SELECT date_add(YEAR, 3, toDate('2018-01-01')); │ 2021-01-01 │ └───────────────────────────────────────────────┘ ``` +## age {#age} ## date_diff {#date_diff} diff --git a/src/Core/DecimalFunctions.h b/src/Core/DecimalFunctions.h index 357cff2c541..9b6b1e87e5f 100644 --- a/src/Core/DecimalFunctions.h +++ b/src/Core/DecimalFunctions.h @@ -33,7 +33,7 @@ template <> inline constexpr size_t max_precision = 38; template <> inline constexpr size_t max_precision = 76; template -inline auto scaleMultiplier(UInt32 scale) +constexpr inline auto scaleMultiplier(UInt32 scale) { if constexpr (std::is_same_v || std::is_same_v) return common::exp10_i32(scale); diff --git a/src/Functions/DateTimeTransforms.h b/src/Functions/DateTimeTransforms.h index 4d15078f2d7..192efb9820d 100644 --- a/src/Functions/DateTimeTransforms.h +++ b/src/Functions/DateTimeTransforms.h @@ -1377,30 +1377,30 @@ struct ToRelativeSecondNumImpl using FactorTransform = ZeroTransform; }; -template +template struct ToRelativeSubsecondNumImpl { static constexpr auto name = "toRelativeSubsecondNumImpl"; static inline UInt64 execute(const DateTime64 & t, DateTime64::NativeType scale, const DateLUTImpl &) { - if (scale == second_divider) + if (scale == scale_multiplier) return t.value; - if (scale > second_divider) - return t.value / (scale / second_divider); - return t.value * (second_divider / scale); + if (scale > scale_multiplier) + return t.value / (scale / scale_multiplier); + return t.value * (scale_multiplier / scale); } static inline UInt64 execute(UInt32 t, const DateLUTImpl &) { - return t * second_divider; + return t * scale_multiplier; } static inline UInt64 execute(Int32 d, const DateLUTImpl & time_zone) { - return static_cast(time_zone.fromDayNum(ExtendedDayNum(d))) 
* second_divider; + return static_cast(time_zone.fromDayNum(ExtendedDayNum(d))) * scale_multiplier; } static inline UInt64 execute(UInt16 d, const DateLUTImpl & time_zone) { - return static_cast(time_zone.fromDayNum(DayNum(d)) * second_divider); + return static_cast(time_zone.fromDayNum(DayNum(d)) * scale_multiplier); } using FactorTransform = ZeroTransform; @@ -1505,25 +1505,43 @@ struct ToYYYYMMDDhhmmssImpl using FactorTransform = ZeroTransform; }; +struct DateTimeComponentsWithFractionalPart +{ + DateLUTImpl::DateTimeComponents datetime; + UInt16 millisecond = 0; + UInt16 microsecond = 0; +}; + struct ToDateTimeComponentsImpl { static constexpr auto name = "toDateTimeComponents"; - static inline DateLUTImpl::DateTimeComponents execute(Int64 t, const DateLUTImpl & time_zone) + static inline DateTimeComponentsWithFractionalPart execute(const DateTime64 & t, DateTime64::NativeType scale_multiplier, const DateLUTImpl & time_zone) { - return time_zone.toDateTimeComponents(t); + const auto components = DecimalUtils::splitWithScaleMultiplier(t, scale_multiplier); + const auto multiplier = DecimalUtils::scaleMultiplier(6); + Int64 fractional = components.fractional; + + if (scale_multiplier > multiplier) + fractional = fractional / (scale_multiplier / multiplier); + else if (scale_multiplier < multiplier) + fractional = fractional * (multiplier / scale_multiplier); + + UInt16 millisecond = static_cast(fractional / 1000); + UInt16 microsecond = static_cast(fractional % 1000); + return DateTimeComponentsWithFractionalPart{time_zone.toDateTimeComponents(components.whole), millisecond, microsecond}; } - static inline DateLUTImpl::DateTimeComponents execute(UInt32 t, const DateLUTImpl & time_zone) + static inline DateTimeComponentsWithFractionalPart execute(UInt32 t, const DateLUTImpl & time_zone) { - return time_zone.toDateTimeComponents(static_cast(t)); + return DateTimeComponentsWithFractionalPart{time_zone.toDateTimeComponents(static_cast(t)), 0, 0}; } - static inline DateLUTImpl::DateTimeComponents execute(Int32 d, const DateLUTImpl & time_zone) + static inline DateTimeComponentsWithFractionalPart execute(Int32 d, const DateLUTImpl & time_zone) { - return time_zone.toDateTimeComponents(ExtendedDayNum(d)); + return DateTimeComponentsWithFractionalPart{time_zone.toDateTimeComponents(ExtendedDayNum(d)), 0, 0}; } - static inline DateLUTImpl::DateTimeComponents execute(UInt16 d, const DateLUTImpl & time_zone) + static inline DateTimeComponentsWithFractionalPart execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toDateTimeComponents(DayNum(d)); + return DateTimeComponentsWithFractionalPart{time_zone.toDateTimeComponents(DayNum(d)), 0, 0}; } using FactorTransform = ZeroTransform; diff --git a/src/Functions/dateDiff.cpp b/src/Functions/dateDiff.cpp index 62f01274476..5805526ba1a 100644 --- a/src/Functions/dateDiff.cpp +++ b/src/Functions/dateDiff.cpp @@ -174,12 +174,15 @@ public: { auto res = static_cast(transform_y.execute(y, timezone_y)) - static_cast(transform_x.execute(x, timezone_x)); - DateLUTImpl::DateTimeComponents a_comp; - DateLUTImpl::DateTimeComponents b_comp; + DateTimeComponentsWithFractionalPart a_comp; + DateTimeComponentsWithFractionalPart b_comp; + Int64 adjust_value; - auto x_seconds = TransformDateTime64>(transform_x.getScaleMultiplier()).execute(x, timezone_x); - auto y_seconds = TransformDateTime64>(transform_y.getScaleMultiplier()).execute(y, timezone_y); - if (x_seconds <= y_seconds) + const auto multiplier = DecimalUtils::scaleMultiplier(6); + auto 
x_microseconds = TransformDateTime64>(transform_x.getScaleMultiplier()).execute(x, timezone_x); + auto y_microseconds = TransformDateTime64>(transform_y.getScaleMultiplier()).execute(y, timezone_y); + + if (x_microseconds <= y_microseconds) { a_comp = TransformDateTime64(transform_x.getScaleMultiplier()).execute(x, timezone_x); b_comp = TransformDateTime64(transform_y.getScaleMultiplier()).execute(y, timezone_y); @@ -191,36 +194,43 @@ public: b_comp = TransformDateTime64(transform_x.getScaleMultiplier()).execute(x, timezone_x); adjust_value = 1; } + const auto & a_date = a_comp.datetime.date; + const auto & b_date = b_comp.datetime.date; + const auto & a_time = a_comp.datetime.time; + const auto & b_time = b_comp.datetime.time; if constexpr (std::is_same_v>>) { - if ((a_comp.date.month > b_comp.date.month) - || ((a_comp.date.month == b_comp.date.month) && ((a_comp.date.day > b_comp.date.day) - || ((a_comp.date.day == b_comp.date.day) && ((a_comp.time.hour > b_comp.time.hour) - || ((a_comp.time.hour == b_comp.time.hour) && ((a_comp.time.minute > b_comp.time.minute) - || ((a_comp.time.minute == b_comp.time.minute) && (a_comp.time.second > b_comp.time.second)))) - ))))) + if ((a_date.month > b_date.month) + || ((a_date.month == b_date.month) && ((a_date.day > b_date.day) + || ((a_date.day == b_date.day) && ((a_time.hour > b_time.hour) + || ((a_time.hour == b_time.hour) && ((a_time.minute > b_time.minute) + || ((a_time.minute == b_time.minute) && ((a_time.second > b_time.second) + || ((a_time.second == b_time.second) && ((a_comp.millisecond > b_comp.millisecond) + || ((a_comp.millisecond == b_comp.millisecond) && (a_comp.microsecond > b_comp.microsecond))))))))))))) res += adjust_value; } else if constexpr (std::is_same_v>>) { - auto x_month_in_quarter = (a_comp.date.month - 1) % 3; - auto y_month_in_quarter = (b_comp.date.month - 1) % 3; + auto x_month_in_quarter = (a_date.month - 1) % 3; + auto y_month_in_quarter = (b_date.month - 1) % 3; if ((x_month_in_quarter > y_month_in_quarter) - || ((x_month_in_quarter == y_month_in_quarter) && ((a_comp.date.day > b_comp.date.day) - || ((a_comp.date.day == b_comp.date.day) && ((a_comp.time.hour > b_comp.time.hour) - || ((a_comp.time.hour == b_comp.time.hour) && ((a_comp.time.minute > b_comp.time.minute) - || ((a_comp.time.minute == b_comp.time.minute) && (a_comp.time.second > b_comp.time.second)))) - ))))) + || ((x_month_in_quarter == y_month_in_quarter) && ((a_date.day > b_date.day) + || ((a_date.day == b_date.day) && ((a_time.hour > b_time.hour) + || ((a_time.hour == b_time.hour) && ((a_time.minute > b_time.minute) + || ((a_time.minute == b_time.minute) && ((a_time.second > b_time.second) + || ((a_time.second == b_time.second) && ((a_comp.millisecond > b_comp.millisecond) + || ((a_comp.millisecond == b_comp.millisecond) && (a_comp.microsecond > b_comp.microsecond))))))))))))) res += adjust_value; } else if constexpr (std::is_same_v>>) { - if ((a_comp.date.day > b_comp.date.day) - || ((a_comp.date.day == b_comp.date.day) && ((a_comp.time.hour > b_comp.time.hour) - || ((a_comp.time.hour == b_comp.time.hour) && ((a_comp.time.minute > b_comp.time.minute) - || ((a_comp.time.minute == b_comp.time.minute) && (a_comp.time.second > b_comp.time.second)))) - ))) + if ((a_date.day > b_date.day) + || ((a_date.day == b_date.day) && ((a_time.hour > b_time.hour) + || ((a_time.hour == b_time.hour) && ((a_time.minute > b_time.minute) + || ((a_time.minute == b_time.minute) && ((a_time.second > b_time.second) + || ((a_time.second == b_time.second) && 
((a_comp.millisecond > b_comp.millisecond) + || ((a_comp.millisecond == b_comp.millisecond) && (a_comp.microsecond > b_comp.microsecond))))))))))) res += adjust_value; } else if constexpr (std::is_same_v>>) @@ -228,27 +238,46 @@ public: auto x_day_of_week = TransformDateTime64(transform_x.getScaleMultiplier()).execute(x, 0, timezone_x); auto y_day_of_week = TransformDateTime64(transform_y.getScaleMultiplier()).execute(y, 0, timezone_y); if ((x_day_of_week > y_day_of_week) - || ((x_day_of_week == y_day_of_week) && (a_comp.time.hour > b_comp.time.hour)) - || ((a_comp.time.hour == b_comp.time.hour) && ((a_comp.time.minute > b_comp.time.minute) - || ((a_comp.time.minute == b_comp.time.minute) && (a_comp.time.second > b_comp.time.second))))) + || ((x_day_of_week == y_day_of_week) && (a_time.hour > b_time.hour)) + || ((a_time.hour == b_time.hour) && ((a_time.minute > b_time.minute) + || ((a_time.minute == b_time.minute) && ((a_time.second > b_time.second) + || ((a_time.second == b_time.second) && ((a_comp.millisecond > b_comp.millisecond) + || ((a_comp.millisecond == b_comp.millisecond) && (a_comp.microsecond > b_comp.microsecond))))))))) res += adjust_value; } else if constexpr (std::is_same_v>>) { - if ((a_comp.time.hour > b_comp.time.hour) - || ((a_comp.time.hour == b_comp.time.hour) && ((a_comp.time.minute > b_comp.time.minute) - || ((a_comp.time.minute == b_comp.time.minute) && (a_comp.time.second > b_comp.time.second))))) + if ((a_time.hour > b_time.hour) + || ((a_time.hour == b_time.hour) && ((a_time.minute > b_time.minute) + || ((a_time.minute == b_time.minute) && ((a_time.second > b_time.second) + || ((a_time.second == b_time.second) && ((a_comp.millisecond > b_comp.millisecond) + || ((a_comp.millisecond == b_comp.millisecond) && (a_comp.microsecond > b_comp.microsecond))))))))) res += adjust_value; } else if constexpr (std::is_same_v>>) { - if ((a_comp.time.minute > b_comp.time.minute) - || ((a_comp.time.minute == b_comp.time.minute) && (a_comp.time.second > b_comp.time.second))) + if ((a_time.minute > b_time.minute) + || ((a_time.minute == b_time.minute) && ((a_time.second > b_time.second) + || ((a_time.second == b_time.second) && ((a_comp.millisecond > b_comp.millisecond) + || ((a_comp.millisecond == b_comp.millisecond) && (a_comp.microsecond > b_comp.microsecond))))))) res += adjust_value; } else if constexpr (std::is_same_v>>) { - if (a_comp.time.second > b_comp.time.second) + if ((a_time.second > b_time.second) + || ((a_time.second == b_time.second) && ((a_comp.millisecond > b_comp.millisecond) + || ((a_comp.millisecond == b_comp.millisecond) && (a_comp.microsecond > b_comp.microsecond))))) + res += adjust_value; + } + else if constexpr (std::is_same_v>>) + { + if ((a_comp.millisecond > b_comp.millisecond) + || ((a_comp.millisecond == b_comp.millisecond) && (a_comp.microsecond > b_comp.microsecond))) + res += adjust_value; + } + else if constexpr (std::is_same_v>>) + { + if (a_comp.microsecond > b_comp.microsecond) res += adjust_value; } return res; diff --git a/tests/queries/0_stateless/02477_age_datetime64.reference b/tests/queries/0_stateless/02477_age_datetime64.reference index 3b4459dd26d..b732794eef7 100644 --- a/tests/queries/0_stateless/02477_age_datetime64.reference +++ b/tests/queries/0_stateless/02477_age_datetime64.reference @@ -111,3 +111,100 @@ SELECT age('day', materialize(toDateTime64('2015-08-18 00:00:00', 0, 'UTC')), ma 1 SELECT age('day', materialize(toDate('2015-08-18', 'UTC')), materialize(toDateTime64('2015-08-19 00:00:00', 3, 'UTC'))); 1 +-- DateTime64 vs 
DateTime64 with fractional part +SELECT age('microsecond', toDateTime64('2015-08-18 20:30:36.100200005', 9, 'UTC'), toDateTime64('2015-08-18 20:30:41.200400005', 9, 'UTC')); +5100200 +SELECT age('microsecond', toDateTime64('2015-08-18 20:30:36.100200005', 9, 'UTC'), toDateTime64('2015-08-18 20:30:41.200400004', 9, 'UTC')); +5100200 +SELECT age('millisecond', toDateTime64('2015-08-18 20:30:36.450299', 6, 'UTC'), toDateTime64('2015-08-18 20:30:41.550299', 6, 'UTC')); +5100 +SELECT age('millisecond', toDateTime64('2015-08-18 20:30:36.450299', 6, 'UTC'), toDateTime64('2015-08-18 20:30:41.550298', 6, 'UTC')); +5099 +SELECT age('second', toDateTime64('2023-03-01 19:18:36.999003', 6, 'UTC'), toDateTime64('2023-03-01 19:18:41.999002', 6, 'UTC')); +4 +SELECT age('second', toDateTime64('2023-03-01 19:18:36.999', 3, 'UTC'), toDateTime64('2023-03-01 19:18:41.001', 3, 'UTC')); +4 +SELECT age('minute', toDateTime64('2015-01-01 20:30:36.200', 3, 'UTC'), toDateTime64('2015-01-01 20:35:36.300', 3, 'UTC')); +5 +SELECT age('minute', toDateTime64('2015-01-01 20:30:36.200', 3, 'UTC'), toDateTime64('2015-01-01 20:35:36.100', 3, 'UTC')); +4 +SELECT age('minute', toDateTime64('2015-01-01 20:30:36.200101', 6, 'UTC'), toDateTime64('2015-01-01 20:35:36.200100', 6, 'UTC')); +4 +SELECT age('hour', toDateTime64('2015-01-01 20:30:36.200', 3, 'UTC'), toDateTime64('2015-01-01 23:30:36.200', 3, 'UTC')); +3 +SELECT age('hour', toDateTime64('2015-01-01 20:31:36.200', 3, 'UTC'), toDateTime64('2015-01-01 23:30:36.200', 3, 'UTC')); +2 +SELECT age('hour', toDateTime64('2015-01-01 20:30:37.200', 3, 'UTC'), toDateTime64('2015-01-01 23:30:36.200', 3, 'UTC')); +2 +SELECT age('hour', toDateTime64('2015-01-01 20:30:36.300', 3, 'UTC'), toDateTime64('2015-01-01 23:30:36.200', 3, 'UTC')); +2 +SELECT age('hour', toDateTime64('2015-01-01 20:30:36.200101', 6, 'UTC'), toDateTime64('2015-01-01 23:30:36.200100', 6, 'UTC')); +2 +SELECT age('day', toDateTime64('2015-01-01 20:30:36.200', 3, 'UTC'), toDateTime64('2015-01-04 20:30:36.200', 3, 'UTC')); +3 +SELECT age('day', toDateTime64('2015-01-01 20:30:36.200', 3, 'UTC'), toDateTime64('2015-01-04 19:30:36.200', 3, 'UTC')); +2 +SELECT age('day', toDateTime64('2015-01-01 20:30:36.200', 3, 'UTC'), toDateTime64('2015-01-04 20:28:36.200', 3, 'UTC')); +2 +SELECT age('day', toDateTime64('2015-01-01 20:30:36.200', 3, 'UTC'), toDateTime64('2015-01-04 20:30:35.200', 3, 'UTC')); +2 +SELECT age('day', toDateTime64('2015-01-01 20:30:36.200', 3, 'UTC'), toDateTime64('2015-01-04 20:30:36.199', 3, 'UTC')); +2 +SELECT age('day', toDateTime64('2015-01-01 20:30:36.200101', 6, 'UTC'), toDateTime64('2015-01-04 20:30:36.200100', 6, 'UTC')); +2 +SELECT age('week', toDateTime64('2015-01-01 20:30:36.200', 3, 'UTC'), toDateTime64('2015-01-15 20:30:36.200', 3, 'UTC')); +2 +SELECT age('week', toDateTime64('2015-01-01 20:30:36.200', 3, 'UTC'), toDateTime64('2015-01-15 19:30:36.200', 3, 'UTC')); +1 +SELECT age('week', toDateTime64('2015-01-01 20:30:36.200', 3, 'UTC'), toDateTime64('2015-01-15 20:29:36.200', 3, 'UTC')); +1 +SELECT age('week', toDateTime64('2015-01-01 20:30:36.200', 3, 'UTC'), toDateTime64('2015-01-15 20:30:35.200', 3, 'UTC')); +1 +SELECT age('week', toDateTime64('2015-01-01 20:30:36.200', 3, 'UTC'), toDateTime64('2015-01-15 20:30:36.100', 3, 'UTC')); +1 +SELECT age('week', toDateTime64('2015-01-01 20:30:36.200101', 6, 'UTC'), toDateTime64('2015-01-15 20:30:36.200100', 6, 'UTC')); +1 +SELECT age('month', toDateTime64('2015-01-02 20:30:36.200', 3, 'UTC'), toDateTime64('2016-05-02 20:30:36.200', 3, 'UTC')); +16 
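The expected values in this block all follow one truncation rule: age() takes the raw difference in the requested unit and subtracts one whenever any finer-grained component of the later timestamp, now including the new millisecond and microsecond fields, is still behind the corresponding component of the earlier timestamp. The nested ||/&& chains added to dateDiff.cpp spell out what is really a lexicographic comparison. A minimal self-contained sketch of that adjustment for the 'month' unit follows; the type and function names are illustrative, not the ones used in the patch:

    #include <cassert>
    #include <tuple>

    // Stand-in for the sub-month fields of DateTimeComponentsWithFractionalPart.
    struct SubMonthParts
    {
        int day, hour, minute, second, millisecond, microsecond;
    };

    // a = components of the earlier timestamp, b = of the later one.
    // If a's sub-month components are lexicographically ahead of b's,
    // the last month is not yet complete and the raw difference drops by one.
    long long adjustedMonthAge(long long raw_months, const SubMonthParts & a, const SubMonthParts & b)
    {
        auto key = [](const SubMonthParts & p)
        { return std::tie(p.day, p.hour, p.minute, p.second, p.millisecond, p.microsecond); };
        return key(a) > key(b) ? raw_months - 1 : raw_months;
    }

    int main()
    {
        // 2015-01-02 20:30:36.200 -> 2016-05-02 20:30:36.100 is 16 raw months,
        // but .100 ms lags .200 ms, so age('month', ...) yields 15, as in the
        // reference lines that follow.
        SubMonthParts a{2, 20, 30, 36, 200, 0};
        SubMonthParts b{2, 20, 30, 36, 100, 0};
        assert(adjustedMonthAge(16, a, b) == 15);
    }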
+SELECT age('month', toDateTime64('2015-01-02 20:30:36.200', 3, 'UTC'), toDateTime64('2016-05-01 20:30:36.200', 3, 'UTC')); +15 +SELECT age('month', toDateTime64('2015-01-02 20:30:36.200', 3, 'UTC'), toDateTime64('2016-05-02 19:30:36.200', 3, 'UTC')); +15 +SELECT age('month', toDateTime64('2015-01-02 20:30:36.200', 3, 'UTC'), toDateTime64('2016-05-02 20:29:36.200', 3, 'UTC')); +15 +SELECT age('month', toDateTime64('2015-01-02 20:30:36.200', 3, 'UTC'), toDateTime64('2016-05-02 20:30:35.200', 3, 'UTC')); +15 +SELECT age('month', toDateTime64('2015-01-02 20:30:36.200', 3, 'UTC'), toDateTime64('2016-05-02 20:30:36.100', 3, 'UTC')); +15 +SELECT age('month', toDateTime64('2015-01-02 20:30:36.200101', 6, 'UTC'), toDateTime64('2016-05-02 20:30:36.200100', 6, 'UTC')); +15 +SELECT age('quarter', toDateTime64('2015-01-02 20:30:36.200', 3, 'UTC'), toDateTime64('2016-04-02 20:30:36.200', 3, 'UTC')); +5 +SELECT age('quarter', toDateTime64('2015-01-02 20:30:36.200', 3, 'UTC'), toDateTime64('2016-04-01 20:30:36.200', 3, 'UTC')); +4 +SELECT age('quarter', toDateTime64('2015-01-02 20:30:36.200', 3, 'UTC'), toDateTime64('2016-04-02 19:30:36.200', 3, 'UTC')); +4 +SELECT age('quarter', toDateTime64('2015-01-02 20:30:36.200', 3, 'UTC'), toDateTime64('2016-04-02 20:29:36.200', 3, 'UTC')); +4 +SELECT age('quarter', toDateTime64('2015-01-02 20:30:36.200', 3, 'UTC'), toDateTime64('2016-04-02 20:30:35.200', 3, 'UTC')); +4 +SELECT age('quarter', toDateTime64('2015-01-02 20:30:36.200', 3, 'UTC'), toDateTime64('2016-04-02 20:30:36.100', 3, 'UTC')); +4 +SELECT age('quarter', toDateTime64('2015-01-02 20:30:36.200101', 6, 'UTC'), toDateTime64('2016-04-02 20:30:36.200100', 6, 'UTC')); +4 +SELECT age('year', toDateTime64('2015-02-02 20:30:36.200', 3, 'UTC'), toDateTime64('2023-02-02 20:30:36.200', 3, 'UTC')); +8 +SELECT age('year', toDateTime64('2015-02-02 20:30:36.200', 3, 'UTC'), toDateTime64('2023-01-02 20:30:36.200', 3, 'UTC')); +7 +SELECT age('year', toDateTime64('2015-02-02 20:30:36.200', 3, 'UTC'), toDateTime64('2023-02-01 20:30:36.200', 3, 'UTC')); +7 +SELECT age('year', toDateTime64('2015-02-02 20:30:36.200', 3, 'UTC'), toDateTime64('2023-02-02 19:30:36.200', 3, 'UTC')); +7 +SELECT age('year', toDateTime64('2015-02-02 20:30:36.200', 3, 'UTC'), toDateTime64('2023-02-02 20:29:36.200', 3, 'UTC')); +7 +SELECT age('year', toDateTime64('2015-02-02 20:30:36.200', 3, 'UTC'), toDateTime64('2023-02-02 20:30:35.200', 3, 'UTC')); +7 +SELECT age('year', toDateTime64('2015-02-02 20:30:36.200', 3, 'UTC'), toDateTime64('2023-02-02 20:30:36.100', 3, 'UTC')); +7 +SELECT age('year', toDateTime64('2015-02-02 20:30:36.200101', 6, 'UTC'), toDateTime64('2023-02-02 20:30:36.200100', 6, 'UTC')); +7 diff --git a/tests/queries/0_stateless/02477_age_datetime64.sql b/tests/queries/0_stateless/02477_age_datetime64.sql index 1bed93991ca..809270f4cce 100644 --- a/tests/queries/0_stateless/02477_age_datetime64.sql +++ b/tests/queries/0_stateless/02477_age_datetime64.sql @@ -75,3 +75,62 @@ SELECT age('second', materialize(toDateTime64('2015-08-18 00:00:00', 0, 'UTC')), SELECT age('second', materialize(toDateTime('2015-08-18 00:00:00', 'UTC')), materialize(toDateTime64('2015-08-18 00:00:10', 3, 'UTC'))); SELECT age('day', materialize(toDateTime64('2015-08-18 00:00:00', 0, 'UTC')), materialize(toDate('2015-08-19', 'UTC'))); SELECT age('day', materialize(toDate('2015-08-18', 'UTC')), materialize(toDateTime64('2015-08-19 00:00:00', 3, 'UTC'))); + +-- DateTime64 vs DateTime64 with fractional part +SELECT age('microsecond', toDateTime64('2015-08-18 
20:30:36.100200005', 9, 'UTC'), toDateTime64('2015-08-18 20:30:41.200400005', 9, 'UTC')); +SELECT age('microsecond', toDateTime64('2015-08-18 20:30:36.100200005', 9, 'UTC'), toDateTime64('2015-08-18 20:30:41.200400004', 9, 'UTC')); + +SELECT age('millisecond', toDateTime64('2015-08-18 20:30:36.450299', 6, 'UTC'), toDateTime64('2015-08-18 20:30:41.550299', 6, 'UTC')); +SELECT age('millisecond', toDateTime64('2015-08-18 20:30:36.450299', 6, 'UTC'), toDateTime64('2015-08-18 20:30:41.550298', 6, 'UTC')); + +SELECT age('second', toDateTime64('2023-03-01 19:18:36.999003', 6, 'UTC'), toDateTime64('2023-03-01 19:18:41.999002', 6, 'UTC')); +SELECT age('second', toDateTime64('2023-03-01 19:18:36.999', 3, 'UTC'), toDateTime64('2023-03-01 19:18:41.001', 3, 'UTC')); + +SELECT age('minute', toDateTime64('2015-01-01 20:30:36.200', 3, 'UTC'), toDateTime64('2015-01-01 20:35:36.300', 3, 'UTC')); +SELECT age('minute', toDateTime64('2015-01-01 20:30:36.200', 3, 'UTC'), toDateTime64('2015-01-01 20:35:36.100', 3, 'UTC')); +SELECT age('minute', toDateTime64('2015-01-01 20:30:36.200101', 6, 'UTC'), toDateTime64('2015-01-01 20:35:36.200100', 6, 'UTC')); + +SELECT age('hour', toDateTime64('2015-01-01 20:30:36.200', 3, 'UTC'), toDateTime64('2015-01-01 23:30:36.200', 3, 'UTC')); +SELECT age('hour', toDateTime64('2015-01-01 20:31:36.200', 3, 'UTC'), toDateTime64('2015-01-01 23:30:36.200', 3, 'UTC')); +SELECT age('hour', toDateTime64('2015-01-01 20:30:37.200', 3, 'UTC'), toDateTime64('2015-01-01 23:30:36.200', 3, 'UTC')); +SELECT age('hour', toDateTime64('2015-01-01 20:30:36.300', 3, 'UTC'), toDateTime64('2015-01-01 23:30:36.200', 3, 'UTC')); +SELECT age('hour', toDateTime64('2015-01-01 20:30:36.200101', 6, 'UTC'), toDateTime64('2015-01-01 23:30:36.200100', 6, 'UTC')); + +SELECT age('day', toDateTime64('2015-01-01 20:30:36.200', 3, 'UTC'), toDateTime64('2015-01-04 20:30:36.200', 3, 'UTC')); +SELECT age('day', toDateTime64('2015-01-01 20:30:36.200', 3, 'UTC'), toDateTime64('2015-01-04 19:30:36.200', 3, 'UTC')); +SELECT age('day', toDateTime64('2015-01-01 20:30:36.200', 3, 'UTC'), toDateTime64('2015-01-04 20:28:36.200', 3, 'UTC')); +SELECT age('day', toDateTime64('2015-01-01 20:30:36.200', 3, 'UTC'), toDateTime64('2015-01-04 20:30:35.200', 3, 'UTC')); +SELECT age('day', toDateTime64('2015-01-01 20:30:36.200', 3, 'UTC'), toDateTime64('2015-01-04 20:30:36.199', 3, 'UTC')); +SELECT age('day', toDateTime64('2015-01-01 20:30:36.200101', 6, 'UTC'), toDateTime64('2015-01-04 20:30:36.200100', 6, 'UTC')); + +SELECT age('week', toDateTime64('2015-01-01 20:30:36.200', 3, 'UTC'), toDateTime64('2015-01-15 20:30:36.200', 3, 'UTC')); +SELECT age('week', toDateTime64('2015-01-01 20:30:36.200', 3, 'UTC'), toDateTime64('2015-01-15 19:30:36.200', 3, 'UTC')); +SELECT age('week', toDateTime64('2015-01-01 20:30:36.200', 3, 'UTC'), toDateTime64('2015-01-15 20:29:36.200', 3, 'UTC')); +SELECT age('week', toDateTime64('2015-01-01 20:30:36.200', 3, 'UTC'), toDateTime64('2015-01-15 20:30:35.200', 3, 'UTC')); +SELECT age('week', toDateTime64('2015-01-01 20:30:36.200', 3, 'UTC'), toDateTime64('2015-01-15 20:30:36.100', 3, 'UTC')); +SELECT age('week', toDateTime64('2015-01-01 20:30:36.200101', 6, 'UTC'), toDateTime64('2015-01-15 20:30:36.200100', 6, 'UTC')); + +SELECT age('month', toDateTime64('2015-01-02 20:30:36.200', 3, 'UTC'), toDateTime64('2016-05-02 20:30:36.200', 3, 'UTC')); +SELECT age('month', toDateTime64('2015-01-02 20:30:36.200', 3, 'UTC'), toDateTime64('2016-05-01 20:30:36.200', 3, 'UTC')); +SELECT age('month', toDateTime64('2015-01-02 
20:30:36.200', 3, 'UTC'), toDateTime64('2016-05-02 19:30:36.200', 3, 'UTC')); +SELECT age('month', toDateTime64('2015-01-02 20:30:36.200', 3, 'UTC'), toDateTime64('2016-05-02 20:29:36.200', 3, 'UTC')); +SELECT age('month', toDateTime64('2015-01-02 20:30:36.200', 3, 'UTC'), toDateTime64('2016-05-02 20:30:35.200', 3, 'UTC')); +SELECT age('month', toDateTime64('2015-01-02 20:30:36.200', 3, 'UTC'), toDateTime64('2016-05-02 20:30:36.100', 3, 'UTC')); +SELECT age('month', toDateTime64('2015-01-02 20:30:36.200101', 6, 'UTC'), toDateTime64('2016-05-02 20:30:36.200100', 6, 'UTC')); + +SELECT age('quarter', toDateTime64('2015-01-02 20:30:36.200', 3, 'UTC'), toDateTime64('2016-04-02 20:30:36.200', 3, 'UTC')); +SELECT age('quarter', toDateTime64('2015-01-02 20:30:36.200', 3, 'UTC'), toDateTime64('2016-04-01 20:30:36.200', 3, 'UTC')); +SELECT age('quarter', toDateTime64('2015-01-02 20:30:36.200', 3, 'UTC'), toDateTime64('2016-04-02 19:30:36.200', 3, 'UTC')); +SELECT age('quarter', toDateTime64('2015-01-02 20:30:36.200', 3, 'UTC'), toDateTime64('2016-04-02 20:29:36.200', 3, 'UTC')); +SELECT age('quarter', toDateTime64('2015-01-02 20:30:36.200', 3, 'UTC'), toDateTime64('2016-04-02 20:30:35.200', 3, 'UTC')); +SELECT age('quarter', toDateTime64('2015-01-02 20:30:36.200', 3, 'UTC'), toDateTime64('2016-04-02 20:30:36.100', 3, 'UTC')); +SELECT age('quarter', toDateTime64('2015-01-02 20:30:36.200101', 6, 'UTC'), toDateTime64('2016-04-02 20:30:36.200100', 6, 'UTC')); + +SELECT age('year', toDateTime64('2015-02-02 20:30:36.200', 3, 'UTC'), toDateTime64('2023-02-02 20:30:36.200', 3, 'UTC')); +SELECT age('year', toDateTime64('2015-02-02 20:30:36.200', 3, 'UTC'), toDateTime64('2023-01-02 20:30:36.200', 3, 'UTC')); +SELECT age('year', toDateTime64('2015-02-02 20:30:36.200', 3, 'UTC'), toDateTime64('2023-02-01 20:30:36.200', 3, 'UTC')); +SELECT age('year', toDateTime64('2015-02-02 20:30:36.200', 3, 'UTC'), toDateTime64('2023-02-02 19:30:36.200', 3, 'UTC')); +SELECT age('year', toDateTime64('2015-02-02 20:30:36.200', 3, 'UTC'), toDateTime64('2023-02-02 20:29:36.200', 3, 'UTC')); +SELECT age('year', toDateTime64('2015-02-02 20:30:36.200', 3, 'UTC'), toDateTime64('2023-02-02 20:30:35.200', 3, 'UTC')); +SELECT age('year', toDateTime64('2015-02-02 20:30:36.200', 3, 'UTC'), toDateTime64('2023-02-02 20:30:36.100', 3, 'UTC')); +SELECT age('year', toDateTime64('2015-02-02 20:30:36.200101', 6, 'UTC'), toDateTime64('2023-02-02 20:30:36.200100', 6, 'UTC')); \ No newline at end of file From d1cb371d8d9834a596ab50ee198693bc1569bb51 Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Tue, 20 Jun 2023 08:53:25 +0000 Subject: [PATCH 09/38] Style fix (whitespaces) --- src/Functions/dateDiff.cpp | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/Functions/dateDiff.cpp b/src/Functions/dateDiff.cpp index 5805526ba1a..c8ee899d8fb 100644 --- a/src/Functions/dateDiff.cpp +++ b/src/Functions/dateDiff.cpp @@ -176,7 +176,6 @@ public: - static_cast(transform_x.execute(x, timezone_x)); DateTimeComponentsWithFractionalPart a_comp; DateTimeComponentsWithFractionalPart b_comp; - Int64 adjust_value; const auto multiplier = DecimalUtils::scaleMultiplier(6); auto x_microseconds = TransformDateTime64>(transform_x.getScaleMultiplier()).execute(x, timezone_x); @@ -219,7 +218,7 @@ public: || ((a_date.day == b_date.day) && ((a_time.hour > b_time.hour) || ((a_time.hour == b_time.hour) && ((a_time.minute > b_time.minute) || ((a_time.minute == b_time.minute) && ((a_time.second > b_time.second) - || ((a_time.second == 
b_time.second) && ((a_comp.millisecond > b_comp.millisecond) + || ((a_time.second == b_time.second) && ((a_comp.millisecond > b_comp.millisecond) || ((a_comp.millisecond == b_comp.millisecond) && (a_comp.microsecond > b_comp.microsecond))))))))))))) res += adjust_value; } @@ -229,7 +228,7 @@ public: || ((a_date.day == b_date.day) && ((a_time.hour > b_time.hour) || ((a_time.hour == b_time.hour) && ((a_time.minute > b_time.minute) || ((a_time.minute == b_time.minute) && ((a_time.second > b_time.second) - || ((a_time.second == b_time.second) && ((a_comp.millisecond > b_comp.millisecond) + || ((a_time.second == b_time.second) && ((a_comp.millisecond > b_comp.millisecond) || ((a_comp.millisecond == b_comp.millisecond) && (a_comp.microsecond > b_comp.microsecond))))))))))) res += adjust_value; } @@ -241,7 +240,7 @@ public: || ((x_day_of_week == y_day_of_week) && (a_time.hour > b_time.hour)) || ((a_time.hour == b_time.hour) && ((a_time.minute > b_time.minute) || ((a_time.minute == b_time.minute) && ((a_time.second > b_time.second) - || ((a_time.second == b_time.second) && ((a_comp.millisecond > b_comp.millisecond) + || ((a_time.second == b_time.second) && ((a_comp.millisecond > b_comp.millisecond) || ((a_comp.millisecond == b_comp.millisecond) && (a_comp.microsecond > b_comp.microsecond))))))))) res += adjust_value; } @@ -250,7 +249,7 @@ public: if ((a_time.hour > b_time.hour) || ((a_time.hour == b_time.hour) && ((a_time.minute > b_time.minute) || ((a_time.minute == b_time.minute) && ((a_time.second > b_time.second) - || ((a_time.second == b_time.second) && ((a_comp.millisecond > b_comp.millisecond) + || ((a_time.second == b_time.second) && ((a_comp.millisecond > b_comp.millisecond) || ((a_comp.millisecond == b_comp.millisecond) && (a_comp.microsecond > b_comp.microsecond))))))))) res += adjust_value; } @@ -403,9 +402,9 @@ public: else if (unit == "second" || unit == "ss" || unit == "s") impl.template dispatchForColumns>(x, y, timezone_x, timezone_y, res->getData()); else if (unit == "millisecond" || unit == "ms") - impl.template dispatchForColumns>(x, y, timezone_x, timezone_y, res->getData()); + impl.template dispatchForColumns(3)>>(x, y, timezone_x, timezone_y, res->getData()); else if (unit == "microsecond" || unit == "us" || unit == "u") - impl.template dispatchForColumns>(x, y, timezone_x, timezone_y, res->getData()); + impl.template dispatchForColumns(6)>>(x, y, timezone_x, timezone_y, res->getData()); else throw Exception(ErrorCodes::BAD_ARGUMENTS, "Function {} does not support '{}' unit", getName(), unit); From cda37f4d733e1cc0a86f7e211b18e3d540640f3e Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Tue, 20 Jun 2023 18:31:04 +0000 Subject: [PATCH 10/38] Fix negative support for date_diff/age(), fix whole int TransformDateTime64 --- src/Core/DecimalFunctions.h | 6 +- src/Functions/DateTimeTransforms.h | 24 ++++--- src/Functions/TransformDateTime64.h | 5 +- src/Functions/dateDiff.cpp | 67 +++++++++---------- src/Functions/toStartOfInterval.cpp | 1 - ...0479_date_and_datetime_to_number.reference | 1 + .../00479_date_and_datetime_to_number.sql | 1 + .../02160_special_functions.reference | 2 + .../0_stateless/02160_special_functions.sql | 3 + .../02477_age_datetime64.reference | 9 +++ .../0_stateless/02477_age_datetime64.sql | 8 ++- 11 files changed, 78 insertions(+), 49 deletions(-) diff --git a/src/Core/DecimalFunctions.h b/src/Core/DecimalFunctions.h index 9b6b1e87e5f..6d855498f6c 100644 --- a/src/Core/DecimalFunctions.h +++ b/src/Core/DecimalFunctions.h @@ -48,7 +48,11 @@ constexpr 
inline auto scaleMultiplier(UInt32 scale) /** Components of DecimalX value: * whole - represents whole part of decimal, can be negative or positive. - * fractional - for fractional part of decimal, always positive. + * fractional - for fractional part of decimal. + * + * 0.123 represent 0 / 0.123 + * -0.123 represent 0 / -0.123 + * -1.123 represent -1 / 0.123 */ template struct DecimalComponents diff --git a/src/Functions/DateTimeTransforms.h b/src/Functions/DateTimeTransforms.h index 192efb9820d..a0b649bbd9b 100644 --- a/src/Functions/DateTimeTransforms.h +++ b/src/Functions/DateTimeTransforms.h @@ -1382,7 +1382,7 @@ struct ToRelativeSubsecondNumImpl { static constexpr auto name = "toRelativeSubsecondNumImpl"; - static inline UInt64 execute(const DateTime64 & t, DateTime64::NativeType scale, const DateLUTImpl &) + static inline Int64 execute(const DateTime64 & t, DateTime64::NativeType scale, const DateLUTImpl &) { if (scale == scale_multiplier) return t.value; @@ -1390,17 +1390,17 @@ struct ToRelativeSubsecondNumImpl return t.value / (scale / scale_multiplier); return t.value * (scale_multiplier / scale); } - static inline UInt64 execute(UInt32 t, const DateLUTImpl &) + static inline Int64 execute(UInt32 t, const DateLUTImpl &) { return t * scale_multiplier; } - static inline UInt64 execute(Int32 d, const DateLUTImpl & time_zone) + static inline Int64 execute(Int32 d, const DateLUTImpl & time_zone) { - return static_cast(time_zone.fromDayNum(ExtendedDayNum(d))) * scale_multiplier; + return static_cast(time_zone.fromDayNum(ExtendedDayNum(d))) * scale_multiplier; } - static inline UInt64 execute(UInt16 d, const DateLUTImpl & time_zone) + static inline Int64 execute(UInt16 d, const DateLUTImpl & time_zone) { - return static_cast(time_zone.fromDayNum(DayNum(d)) * scale_multiplier); + return static_cast(time_zone.fromDayNum(DayNum(d)) * scale_multiplier); } using FactorTransform = ZeroTransform; @@ -1505,9 +1505,8 @@ struct ToYYYYMMDDhhmmssImpl using FactorTransform = ZeroTransform; }; -struct DateTimeComponentsWithFractionalPart +struct DateTimeComponentsWithFractionalPart : public DateLUTImpl::DateTimeComponents { - DateLUTImpl::DateTimeComponents datetime; UInt16 millisecond = 0; UInt16 microsecond = 0; }; @@ -1518,10 +1517,15 @@ struct ToDateTimeComponentsImpl static inline DateTimeComponentsWithFractionalPart execute(const DateTime64 & t, DateTime64::NativeType scale_multiplier, const DateLUTImpl & time_zone) { - const auto components = DecimalUtils::splitWithScaleMultiplier(t, scale_multiplier); + auto components = DecimalUtils::splitWithScaleMultiplier(t, scale_multiplier); const auto multiplier = DecimalUtils::scaleMultiplier(6); - Int64 fractional = components.fractional; + if (t.value < 0 && components.fractional) + { + components.fractional = scale_multiplier + (components.whole ? 
Int64(-1) : Int64(1)) * components.fractional; + --components.whole; + } + Int64 fractional = components.fractional; if (scale_multiplier > multiplier) fractional = fractional / (scale_multiplier / multiplier); else if (scale_multiplier < multiplier) diff --git a/src/Functions/TransformDateTime64.h b/src/Functions/TransformDateTime64.h index 1a1e732ae40..fcee2753066 100644 --- a/src/Functions/TransformDateTime64.h +++ b/src/Functions/TransformDateTime64.h @@ -80,7 +80,10 @@ public: } else { - const auto components = DecimalUtils::splitWithScaleMultiplier(t, scale_multiplier); + auto components = DecimalUtils::splitWithScaleMultiplier(t, scale_multiplier); + if (t.value < 0 && components.fractional) + --components.whole; + return wrapped_transform.execute(static_cast(components.whole), std::forward(args)...); } } diff --git a/src/Functions/dateDiff.cpp b/src/Functions/dateDiff.cpp index c8ee899d8fb..e41fe91818a 100644 --- a/src/Functions/dateDiff.cpp +++ b/src/Functions/dateDiff.cpp @@ -193,42 +193,39 @@ public: b_comp = TransformDateTime64(transform_x.getScaleMultiplier()).execute(x, timezone_x); adjust_value = 1; } - const auto & a_date = a_comp.datetime.date; - const auto & b_date = b_comp.datetime.date; - const auto & a_time = a_comp.datetime.time; - const auto & b_time = b_comp.datetime.time; + if constexpr (std::is_same_v>>) { - if ((a_date.month > b_date.month) - || ((a_date.month == b_date.month) && ((a_date.day > b_date.day) - || ((a_date.day == b_date.day) && ((a_time.hour > b_time.hour) - || ((a_time.hour == b_time.hour) && ((a_time.minute > b_time.minute) - || ((a_time.minute == b_time.minute) && ((a_time.second > b_time.second) - || ((a_time.second == b_time.second) && ((a_comp.millisecond > b_comp.millisecond) + if ((a_comp.date.month > b_comp.date.month) + || ((a_comp.date.month == b_comp.date.month) && ((a_comp.date.day > b_comp.date.day) + || ((a_comp.date.day == b_comp.date.day) && ((a_comp.time.hour > b_comp.time.hour) + || ((a_comp.time.hour == b_comp.time.hour) && ((a_comp.time.minute > b_comp.time.minute) + || ((a_comp.time.minute == b_comp.time.minute) && ((a_comp.time.second > b_comp.time.second) + || ((a_comp.time.second == b_comp.time.second) && ((a_comp.millisecond > b_comp.millisecond) || ((a_comp.millisecond == b_comp.millisecond) && (a_comp.microsecond > b_comp.microsecond))))))))))))) res += adjust_value; } else if constexpr (std::is_same_v>>) { - auto x_month_in_quarter = (a_date.month - 1) % 3; - auto y_month_in_quarter = (b_date.month - 1) % 3; + auto x_month_in_quarter = (a_comp.date.month - 1) % 3; + auto y_month_in_quarter = (b_comp.date.month - 1) % 3; if ((x_month_in_quarter > y_month_in_quarter) - || ((x_month_in_quarter == y_month_in_quarter) && ((a_date.day > b_date.day) - || ((a_date.day == b_date.day) && ((a_time.hour > b_time.hour) - || ((a_time.hour == b_time.hour) && ((a_time.minute > b_time.minute) - || ((a_time.minute == b_time.minute) && ((a_time.second > b_time.second) - || ((a_time.second == b_time.second) && ((a_comp.millisecond > b_comp.millisecond) + || ((x_month_in_quarter == y_month_in_quarter) && ((a_comp.date.day > b_comp.date.day) + || ((a_comp.date.day == b_comp.date.day) && ((a_comp.time.hour > b_comp.time.hour) + || ((a_comp.time.hour == b_comp.time.hour) && ((a_comp.time.minute > b_comp.time.minute) + || ((a_comp.time.minute == b_comp.time.minute) && ((a_comp.time.second > b_comp.time.second) + || ((a_comp.time.second == b_comp.time.second) && ((a_comp.millisecond > b_comp.millisecond) || ((a_comp.millisecond == 
b_comp.millisecond) && (a_comp.microsecond > b_comp.microsecond))))))))))))) res += adjust_value; } else if constexpr (std::is_same_v>>) { - if ((a_date.day > b_date.day) - || ((a_date.day == b_date.day) && ((a_time.hour > b_time.hour) - || ((a_time.hour == b_time.hour) && ((a_time.minute > b_time.minute) - || ((a_time.minute == b_time.minute) && ((a_time.second > b_time.second) - || ((a_time.second == b_time.second) && ((a_comp.millisecond > b_comp.millisecond) + if ((a_comp.date.day > b_comp.date.day) + || ((a_comp.date.day == b_comp.date.day) && ((a_comp.time.hour > b_comp.time.hour) + || ((a_comp.time.hour == b_comp.time.hour) && ((a_comp.time.minute > b_comp.time.minute) + || ((a_comp.time.minute == b_comp.time.minute) && ((a_comp.time.second > b_comp.time.second) + || ((a_comp.time.second == b_comp.time.second) && ((a_comp.millisecond > b_comp.millisecond) || ((a_comp.millisecond == b_comp.millisecond) && (a_comp.microsecond > b_comp.microsecond))))))))))) res += adjust_value; } @@ -237,34 +234,34 @@ public: auto x_day_of_week = TransformDateTime64(transform_x.getScaleMultiplier()).execute(x, 0, timezone_x); auto y_day_of_week = TransformDateTime64(transform_y.getScaleMultiplier()).execute(y, 0, timezone_y); if ((x_day_of_week > y_day_of_week) - || ((x_day_of_week == y_day_of_week) && (a_time.hour > b_time.hour)) - || ((a_time.hour == b_time.hour) && ((a_time.minute > b_time.minute) - || ((a_time.minute == b_time.minute) && ((a_time.second > b_time.second) - || ((a_time.second == b_time.second) && ((a_comp.millisecond > b_comp.millisecond) + || ((x_day_of_week == y_day_of_week) && (a_comp.time.hour > b_comp.time.hour)) + || ((a_comp.time.hour == b_comp.time.hour) && ((a_comp.time.minute > b_comp.time.minute) + || ((a_comp.time.minute == b_comp.time.minute) && ((a_comp.time.second > b_comp.time.second) + || ((a_comp.time.second == b_comp.time.second) && ((a_comp.millisecond > b_comp.millisecond) || ((a_comp.millisecond == b_comp.millisecond) && (a_comp.microsecond > b_comp.microsecond))))))))) res += adjust_value; } else if constexpr (std::is_same_v>>) { - if ((a_time.hour > b_time.hour) - || ((a_time.hour == b_time.hour) && ((a_time.minute > b_time.minute) - || ((a_time.minute == b_time.minute) && ((a_time.second > b_time.second) - || ((a_time.second == b_time.second) && ((a_comp.millisecond > b_comp.millisecond) + if ((a_comp.time.hour > b_comp.time.hour) + || ((a_comp.time.hour == b_comp.time.hour) && ((a_comp.time.minute > b_comp.time.minute) + || ((a_comp.time.minute == b_comp.time.minute) && ((a_comp.time.second > b_comp.time.second) + || ((a_comp.time.second == b_comp.time.second) && ((a_comp.millisecond > b_comp.millisecond) || ((a_comp.millisecond == b_comp.millisecond) && (a_comp.microsecond > b_comp.microsecond))))))))) res += adjust_value; } else if constexpr (std::is_same_v>>) { - if ((a_time.minute > b_time.minute) - || ((a_time.minute == b_time.minute) && ((a_time.second > b_time.second) - || ((a_time.second == b_time.second) && ((a_comp.millisecond > b_comp.millisecond) + if ((a_comp.time.minute > b_comp.time.minute) + || ((a_comp.time.minute == b_comp.time.minute) && ((a_comp.time.second > b_comp.time.second) + || ((a_comp.time.second == b_comp.time.second) && ((a_comp.millisecond > b_comp.millisecond) || ((a_comp.millisecond == b_comp.millisecond) && (a_comp.microsecond > b_comp.microsecond))))))) res += adjust_value; } else if constexpr (std::is_same_v>>) { - if ((a_time.second > b_time.second) - || ((a_time.second == b_time.second) && ((a_comp.millisecond > 
b_comp.millisecond) + if ((a_comp.time.second > b_comp.time.second) + || ((a_comp.time.second == b_comp.time.second) && ((a_comp.millisecond > b_comp.millisecond) || ((a_comp.millisecond == b_comp.millisecond) && (a_comp.microsecond > b_comp.microsecond))))) res += adjust_value; } diff --git a/src/Functions/toStartOfInterval.cpp b/src/Functions/toStartOfInterval.cpp index 649242d0d86..48bf88cb14c 100644 --- a/src/Functions/toStartOfInterval.cpp +++ b/src/Functions/toStartOfInterval.cpp @@ -10,7 +10,6 @@ #include #include #include -#include #include diff --git a/tests/queries/0_stateless/00479_date_and_datetime_to_number.reference b/tests/queries/0_stateless/00479_date_and_datetime_to_number.reference index 1375ccb1542..168b733d702 100644 --- a/tests/queries/0_stateless/00479_date_and_datetime_to_number.reference +++ b/tests/queries/0_stateless/00479_date_and_datetime_to_number.reference @@ -4,3 +4,4 @@ 201707 20170721 20170721112233 +19691231235959 diff --git a/tests/queries/0_stateless/00479_date_and_datetime_to_number.sql b/tests/queries/0_stateless/00479_date_and_datetime_to_number.sql index 71151690028..1e35e99a802 100644 --- a/tests/queries/0_stateless/00479_date_and_datetime_to_number.sql +++ b/tests/queries/0_stateless/00479_date_and_datetime_to_number.sql @@ -4,3 +4,4 @@ SELECT toYYYYMMDDhhmmss(toDate('2017-07-21')); SELECT toYYYYMM(toDateTime('2017-07-21T11:22:33')); SELECT toYYYYMMDD(toDateTime('2017-07-21T11:22:33')); SELECT toYYYYMMDDhhmmss(toDateTime('2017-07-21T11:22:33')); +SELECT toYYYYMMDDhhmmss(toDateTime64('1969-12-31 23:59:59.900', 3)); diff --git a/tests/queries/0_stateless/02160_special_functions.reference b/tests/queries/0_stateless/02160_special_functions.reference index 37278d4b5dc..5e7e3383d8d 100644 --- a/tests/queries/0_stateless/02160_special_functions.reference +++ b/tests/queries/0_stateless/02160_special_functions.reference @@ -41,4 +41,6 @@ Hello 701 800 60200201 +60 +10 1 diff --git a/tests/queries/0_stateless/02160_special_functions.sql b/tests/queries/0_stateless/02160_special_functions.sql index 6002f793601..64919536be3 100644 --- a/tests/queries/0_stateless/02160_special_functions.sql +++ b/tests/queries/0_stateless/02160_special_functions.sql @@ -50,4 +50,7 @@ SELECT DATEDIFF(millisecond, '2021-01-01 23:59:59.299999'::DateTime64(6), '2021- SELECT DATEDIFF(millisecond, '2021-01-01 23:59:59.2'::DateTime64(1), '2021-01-02'::Date); SELECT DATEDIFF(microsecond, '2021-01-01 23:59:59.899999'::DateTime64(6), '2021-01-02 00:01:00.100200300'::DateTime64(9)); +SELECT DATEDIFF(microsecond, '1969-12-31 23:59:59.999950'::DateTime64(6), '1970-01-01 00:00:00.000010'::DateTime64(6)); +SELECT DATEDIFF(second, '1969-12-31 23:59:59.123'::DateTime64(6), '1970-01-01 00:00:09.123'::DateTime64(6)); + SELECT EXISTS (SELECT 1); diff --git a/tests/queries/0_stateless/02477_age_datetime64.reference b/tests/queries/0_stateless/02477_age_datetime64.reference index b732794eef7..e2ac97cbcd9 100644 --- a/tests/queries/0_stateless/02477_age_datetime64.reference +++ b/tests/queries/0_stateless/02477_age_datetime64.reference @@ -208,3 +208,12 @@ SELECT age('year', toDateTime64('2015-02-02 20:30:36.200', 3, 'UTC'), toDateTime 7 SELECT age('year', toDateTime64('2015-02-02 20:30:36.200101', 6, 'UTC'), toDateTime64('2023-02-02 20:30:36.200100', 6, 'UTC')); 7 +-- DateTime64 vs DateTime64 with negative time +SELECT age('millisecond', toDateTime64('1969-12-31 23:59:58.001', 3), toDateTime64('1970-01-01 00:00:00.350', 3, 'UTC')); +2349 +SELECT age('second', toDateTime64('1969-12-31 
23:59:58.001', 3), toDateTime64('1970-01-01 00:00:00.35', 3, 'UTC')); +2 +SELECT age('second', toDateTime64('1969-12-31 23:59:50.001', 3), toDateTime64('1969-12-31 23:59:55.002', 3, 'UTC')); +5 +SELECT age('second', toDateTime64('1969-12-31 23:59:50.003', 3), toDateTime64('1969-12-31 23:59:55.002', 3, 'UTC')); +4 diff --git a/tests/queries/0_stateless/02477_age_datetime64.sql b/tests/queries/0_stateless/02477_age_datetime64.sql index 809270f4cce..2992f73a2c1 100644 --- a/tests/queries/0_stateless/02477_age_datetime64.sql +++ b/tests/queries/0_stateless/02477_age_datetime64.sql @@ -133,4 +133,10 @@ SELECT age('year', toDateTime64('2015-02-02 20:30:36.200', 3, 'UTC'), toDateTime SELECT age('year', toDateTime64('2015-02-02 20:30:36.200', 3, 'UTC'), toDateTime64('2023-02-02 20:29:36.200', 3, 'UTC')); SELECT age('year', toDateTime64('2015-02-02 20:30:36.200', 3, 'UTC'), toDateTime64('2023-02-02 20:30:35.200', 3, 'UTC')); SELECT age('year', toDateTime64('2015-02-02 20:30:36.200', 3, 'UTC'), toDateTime64('2023-02-02 20:30:36.100', 3, 'UTC')); -SELECT age('year', toDateTime64('2015-02-02 20:30:36.200101', 6, 'UTC'), toDateTime64('2023-02-02 20:30:36.200100', 6, 'UTC')); \ No newline at end of file +SELECT age('year', toDateTime64('2015-02-02 20:30:36.200101', 6, 'UTC'), toDateTime64('2023-02-02 20:30:36.200100', 6, 'UTC')); + +-- DateTime64 vs DateTime64 with negative time +SELECT age('millisecond', toDateTime64('1969-12-31 23:59:58.001', 3), toDateTime64('1970-01-01 00:00:00.350', 3, 'UTC')); +SELECT age('second', toDateTime64('1969-12-31 23:59:58.001', 3), toDateTime64('1970-01-01 00:00:00.35', 3, 'UTC')); +SELECT age('second', toDateTime64('1969-12-31 23:59:50.001', 3), toDateTime64('1969-12-31 23:59:55.002', 3, 'UTC')); +SELECT age('second', toDateTime64('1969-12-31 23:59:50.003', 3), toDateTime64('1969-12-31 23:59:55.002', 3, 'UTC')); \ No newline at end of file From 6050813d2e43d09a39fa2cb4b1d6daf1a4a8e27d Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Tue, 20 Jun 2023 18:56:07 +0000 Subject: [PATCH 11/38] Remove trailing whitespaces --- src/Core/DecimalFunctions.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Core/DecimalFunctions.h b/src/Core/DecimalFunctions.h index 6d855498f6c..648c09d3d72 100644 --- a/src/Core/DecimalFunctions.h +++ b/src/Core/DecimalFunctions.h @@ -49,7 +49,7 @@ constexpr inline auto scaleMultiplier(UInt32 scale) /** Components of DecimalX value: * whole - represents whole part of decimal, can be negative or positive. * fractional - for fractional part of decimal. 
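The pre-epoch cases above only produce these values because of the normalization this patch series adds for negative DateTime64 values: before calendar components are computed, the whole/fractional split is adjusted so that the fractional part always counts forward from the whole second below. Here is a compact sketch of the idea, using plain truncating integer division rather than DecimalUtils::splitWithScaleMultiplier (whose sign conventions the surrounding comment block describes), with illustrative names:

    #include <cassert>
    #include <cstdint>

    struct Split { std::int64_t whole; std::int64_t fractional; };

    // Normalize so that whole + fractional / scale_multiplier equals
    // value / scale_multiplier (as real numbers), with
    // 0 <= fractional < scale_multiplier even for negative inputs.
    Split splitNormalized(std::int64_t value, std::int64_t scale_multiplier)
    {
        Split s{value / scale_multiplier, value % scale_multiplier};
        if (value < 0 && s.fractional)
        {
            s.fractional += scale_multiplier; // count forward from the second below
            --s.whole;
        }
        return s;
    }

    int main()
    {
        // -1.900 s (a DateTime64(3) tick just before the epoch) is 100 ms past
        // the whole second -2, not 900 ms before -1.
        Split s = splitNormalized(-1900, 1000);
        assert(s.whole == -2 && s.fractional == 100);
    }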
- * + * * 0.123 represent 0 / 0.123 * -0.123 represent 0 / -0.123 * -1.123 represent -1 / 0.123 From b20ecc2060ac74945e897de688daa1eaf032dda0 Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Wed, 21 Jun 2023 06:25:11 +0000 Subject: [PATCH 12/38] Add line without spaces --- tests/queries/0_stateless/00301_csv.reference | 1 + tests/queries/0_stateless/00301_csv.sh | 11 ++++++----- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/queries/0_stateless/00301_csv.reference b/tests/queries/0_stateless/00301_csv.reference index 140bbda84e7..804ccf0c713 100644 --- a/tests/queries/0_stateless/00301_csv.reference +++ b/tests/queries/0_stateless/00301_csv.reference @@ -28,4 +28,5 @@ Hello 0 0 33 \N 55 Default Hello 0 0 33 \N 55 Default Hello 1 3 2 \N 55 Default Hello 1 4 2 3 4 String +Hello 1 4 2 3 4 String Hello 1 5 2 3 4 String diff --git a/tests/queries/0_stateless/00301_csv.sh b/tests/queries/0_stateless/00301_csv.sh index aa019147bab..c598be44261 100755 --- a/tests/queries/0_stateless/00301_csv.sh +++ b/tests/queries/0_stateless/00301_csv.sh @@ -60,10 +60,11 @@ $CLICKHOUSE_CLIENT --query="CREATE TABLE csv (f1 String, f2 UInt64, f3 UInt256, echo ' , -Hello -Hello, -Hello, 1, 3, 2 -Hello, 1, 4, 2, 3, 4, String -Hello, 1, 5, 2, 3, 4, String,'| $CLICKHOUSE_CLIENT --input_format_defaults_for_omitted_fields=1 --input_format_csv_missing_as_default=1 --query="INSERT INTO csv FORMAT CSV"; +"Hello" +"Hello", +"Hello", 1, 3, 2 +"Hello",1,4,2,3,4,"String" +"Hello", 1, 4, 2, 3, 4, "String" +"Hello", 1, 5, 2, 3, 4, "String",'| $CLICKHOUSE_CLIENT --input_format_defaults_for_omitted_fields=1 --input_format_csv_missing_as_default=1 --query="INSERT INTO csv FORMAT CSV"; $CLICKHOUSE_CLIENT --query="SELECT * FROM csv ORDER BY f1, f2, f3, f4, f5 NULLS FIRST, f6, f7"; $CLICKHOUSE_CLIENT --query="DROP TABLE csv"; From d58152cb82cf2abb6112077fc4f79c656380f582 Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Wed, 21 Jun 2023 08:04:40 +0000 Subject: [PATCH 13/38] Add constants / fix tests --- src/Core/DecimalFunctions.h | 6 +++--- src/Functions/DateTimeTransforms.h | 14 +++++++++----- src/Functions/dateDiff.cpp | 6 +++--- .../0_stateless/02477_age_datetime64.reference | 8 ++++---- tests/queries/0_stateless/02477_age_datetime64.sql | 8 ++++---- 5 files changed, 23 insertions(+), 19 deletions(-) diff --git a/src/Core/DecimalFunctions.h b/src/Core/DecimalFunctions.h index 648c09d3d72..defc21a5f43 100644 --- a/src/Core/DecimalFunctions.h +++ b/src/Core/DecimalFunctions.h @@ -50,9 +50,9 @@ constexpr inline auto scaleMultiplier(UInt32 scale) * whole - represents whole part of decimal, can be negative or positive. * fractional - for fractional part of decimal. 
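The DateTimeTransforms.h hunk a little further below replaces bare numeric literals with named scale constants, but the underlying fractional-part computation stays the same: the stored fraction is first rescaled to microsecond resolution, then split into millisecond and microsecond components. A self-contained illustration with a hypothetical helper name, not part of the patch:

    #include <cassert>
    #include <cstdint>

    void splitFractional(std::int64_t fractional, std::int64_t scale_multiplier,
                         int & millisecond, int & microsecond)
    {
        constexpr std::int64_t microsecond_multiplier = 1000000; // 10^6
        constexpr std::int64_t millisecond_multiplier = 1000;    // 10^3

        // Bring the fraction to microsecond resolution (both multipliers are
        // powers of ten, so one always divides the other exactly).
        if (scale_multiplier > microsecond_multiplier)
            fractional /= scale_multiplier / microsecond_multiplier; // e.g. scale 9: drop ns
        else if (scale_multiplier < microsecond_multiplier)
            fractional *= microsecond_multiplier / scale_multiplier; // e.g. scale 3: pad

        millisecond = static_cast<int>(fractional / (microsecond_multiplier / millisecond_multiplier));
        microsecond = static_cast<int>(fractional % (microsecond_multiplier / millisecond_multiplier));
    }

    int main()
    {
        int ms = 0, us = 0;
        splitFractional(123, 1000, ms, us);             // 0.123 s stored at scale 3
        assert(ms == 123 && us == 0);
        splitFractional(100200005, 1000000000, ms, us); // .100200005 s at scale 9
        assert(ms == 100 && us == 200);                 // trailing nanoseconds are dropped
    }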
* - * 0.123 represent 0 / 0.123 - * -0.123 represent 0 / -0.123 - * -1.123 represent -1 / 0.123 + * 0.123 represents 0 / 0.123 + * -0.123 represents 0 / -0.123 + * -1.123 represents -1 / 0.123 */ template struct DecimalComponents diff --git a/src/Functions/DateTimeTransforms.h b/src/Functions/DateTimeTransforms.h index a0b649bbd9b..afff8d6523d 100644 --- a/src/Functions/DateTimeTransforms.h +++ b/src/Functions/DateTimeTransforms.h @@ -19,6 +19,9 @@ namespace DB { +static constexpr auto microsecond_scale = 6; +static constexpr auto millisecond_scale = 3; + namespace ErrorCodes { extern const int ILLEGAL_TYPE_OF_ARGUMENT; @@ -1507,8 +1510,8 @@ struct ToYYYYMMDDhhmmssImpl struct DateTimeComponentsWithFractionalPart : public DateLUTImpl::DateTimeComponents { - UInt16 millisecond = 0; - UInt16 microsecond = 0; + UInt16 millisecond; + UInt16 microsecond; }; struct ToDateTimeComponentsImpl @@ -1518,7 +1521,7 @@ struct ToDateTimeComponentsImpl static inline DateTimeComponentsWithFractionalPart execute(const DateTime64 & t, DateTime64::NativeType scale_multiplier, const DateLUTImpl & time_zone) { auto components = DecimalUtils::splitWithScaleMultiplier(t, scale_multiplier); - const auto multiplier = DecimalUtils::scaleMultiplier(6); + constexpr auto multiplier = DecimalUtils::scaleMultiplier(microsecond_scale); if (t.value < 0 && components.fractional) { @@ -1531,8 +1534,9 @@ struct ToDateTimeComponentsImpl else if (scale_multiplier < multiplier) fractional = fractional * (multiplier / scale_multiplier); - UInt16 millisecond = static_cast(fractional / 1000); - UInt16 microsecond = static_cast(fractional % 1000); + constexpr auto divider = DecimalUtils::scaleMultiplier(microsecond_scale - millisecond_scale); + UInt16 millisecond = static_cast(fractional / divider); + UInt16 microsecond = static_cast(fractional % divider); return DateTimeComponentsWithFractionalPart{time_zone.toDateTimeComponents(components.whole), millisecond, microsecond}; } static inline DateTimeComponentsWithFractionalPart execute(UInt32 t, const DateLUTImpl & time_zone) diff --git a/src/Functions/dateDiff.cpp b/src/Functions/dateDiff.cpp index e41fe91818a..79be3059b2a 100644 --- a/src/Functions/dateDiff.cpp +++ b/src/Functions/dateDiff.cpp @@ -177,7 +177,7 @@ public: DateTimeComponentsWithFractionalPart a_comp; DateTimeComponentsWithFractionalPart b_comp; Int64 adjust_value; - const auto multiplier = DecimalUtils::scaleMultiplier(6); + constexpr auto multiplier = DecimalUtils::scaleMultiplier(microsecond_scale); auto x_microseconds = TransformDateTime64>(transform_x.getScaleMultiplier()).execute(x, timezone_x); auto y_microseconds = TransformDateTime64>(transform_y.getScaleMultiplier()).execute(y, timezone_y); @@ -399,9 +399,9 @@ public: else if (unit == "second" || unit == "ss" || unit == "s") impl.template dispatchForColumns>(x, y, timezone_x, timezone_y, res->getData()); else if (unit == "millisecond" || unit == "ms") - impl.template dispatchForColumns(3)>>(x, y, timezone_x, timezone_y, res->getData()); + impl.template dispatchForColumns(millisecond_scale)>>(x, y, timezone_x, timezone_y, res->getData()); else if (unit == "microsecond" || unit == "us" || unit == "u") - impl.template dispatchForColumns(6)>>(x, y, timezone_x, timezone_y, res->getData()); + impl.template dispatchForColumns(microsecond_scale)>>(x, y, timezone_x, timezone_y, res->getData()); else throw Exception(ErrorCodes::BAD_ARGUMENTS, "Function {} does not support '{}' unit", getName(), unit); diff --git 
a/tests/queries/0_stateless/02477_age_datetime64.reference b/tests/queries/0_stateless/02477_age_datetime64.reference index e2ac97cbcd9..c8c716e1e9a 100644 --- a/tests/queries/0_stateless/02477_age_datetime64.reference +++ b/tests/queries/0_stateless/02477_age_datetime64.reference @@ -209,11 +209,11 @@ SELECT age('year', toDateTime64('2015-02-02 20:30:36.200', 3, 'UTC'), toDateTime SELECT age('year', toDateTime64('2015-02-02 20:30:36.200101', 6, 'UTC'), toDateTime64('2023-02-02 20:30:36.200100', 6, 'UTC')); 7 -- DateTime64 vs DateTime64 with negative time -SELECT age('millisecond', toDateTime64('1969-12-31 23:59:58.001', 3), toDateTime64('1970-01-01 00:00:00.350', 3, 'UTC')); +SELECT age('millisecond', toDateTime64('1969-12-31 23:59:58.001', 3, 'UTC'), toDateTime64('1970-01-01 00:00:00.350', 3, 'UTC')); 2349 -SELECT age('second', toDateTime64('1969-12-31 23:59:58.001', 3), toDateTime64('1970-01-01 00:00:00.35', 3, 'UTC')); +SELECT age('second', toDateTime64('1969-12-31 23:59:58.001', 3, 'UTC'), toDateTime64('1970-01-01 00:00:00.35', 3, 'UTC')); 2 -SELECT age('second', toDateTime64('1969-12-31 23:59:50.001', 3), toDateTime64('1969-12-31 23:59:55.002', 3, 'UTC')); +SELECT age('second', toDateTime64('1969-12-31 23:59:50.001', 3, 'UTC'), toDateTime64('1969-12-31 23:59:55.002', 3, 'UTC')); 5 -SELECT age('second', toDateTime64('1969-12-31 23:59:50.003', 3), toDateTime64('1969-12-31 23:59:55.002', 3, 'UTC')); +SELECT age('second', toDateTime64('1969-12-31 23:59:50.003', 3, 'UTC'), toDateTime64('1969-12-31 23:59:55.002', 3, 'UTC')); 4 diff --git a/tests/queries/0_stateless/02477_age_datetime64.sql b/tests/queries/0_stateless/02477_age_datetime64.sql index 2992f73a2c1..889137395a3 100644 --- a/tests/queries/0_stateless/02477_age_datetime64.sql +++ b/tests/queries/0_stateless/02477_age_datetime64.sql @@ -136,7 +136,7 @@ SELECT age('year', toDateTime64('2015-02-02 20:30:36.200', 3, 'UTC'), toDateTime SELECT age('year', toDateTime64('2015-02-02 20:30:36.200101', 6, 'UTC'), toDateTime64('2023-02-02 20:30:36.200100', 6, 'UTC')); -- DateTime64 vs DateTime64 with negative time -SELECT age('millisecond', toDateTime64('1969-12-31 23:59:58.001', 3), toDateTime64('1970-01-01 00:00:00.350', 3, 'UTC')); -SELECT age('second', toDateTime64('1969-12-31 23:59:58.001', 3), toDateTime64('1970-01-01 00:00:00.35', 3, 'UTC')); -SELECT age('second', toDateTime64('1969-12-31 23:59:50.001', 3), toDateTime64('1969-12-31 23:59:55.002', 3, 'UTC')); -SELECT age('second', toDateTime64('1969-12-31 23:59:50.003', 3), toDateTime64('1969-12-31 23:59:55.002', 3, 'UTC')); \ No newline at end of file +SELECT age('millisecond', toDateTime64('1969-12-31 23:59:58.001', 3, 'UTC'), toDateTime64('1970-01-01 00:00:00.350', 3, 'UTC')); +SELECT age('second', toDateTime64('1969-12-31 23:59:58.001', 3, 'UTC'), toDateTime64('1970-01-01 00:00:00.35', 3, 'UTC')); +SELECT age('second', toDateTime64('1969-12-31 23:59:50.001', 3, 'UTC'), toDateTime64('1969-12-31 23:59:55.002', 3, 'UTC')); +SELECT age('second', toDateTime64('1969-12-31 23:59:50.003', 3, 'UTC'), toDateTime64('1969-12-31 23:59:55.002', 3, 'UTC')); \ No newline at end of file From 8bd53cad7849816a6bd6591eddebd9ba19fa7272 Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Wed, 21 Jun 2023 14:01:05 +0000 Subject: [PATCH 14/38] Add quotes to test --- tests/queries/0_stateless/00301_csv.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/queries/0_stateless/00301_csv.sh b/tests/queries/0_stateless/00301_csv.sh index c598be44261..dc354433af9 100755 --- 
a/tests/queries/0_stateless/00301_csv.sh +++ b/tests/queries/0_stateless/00301_csv.sh @@ -44,12 +44,12 @@ $CLICKHOUSE_CLIENT --query="DROP TABLE csv"; echo === Test input_format_csv_ignore_extra_columns $CLICKHOUSE_CLIENT --query="CREATE TABLE csv (s String, n UInt64 DEFAULT 3, d String DEFAULT 'String4') ENGINE = Memory"; -echo 'Hello, 1, String1 -Hello, 2, String2, -Hello, 3, String3, 2016-01-13 -Hello, 4, , 2016-01-14 -Hello, 5, String5, 2016-01-15, 2016-01-16 -Hello, 6, String6, "line with a +echo '"Hello", 1, "String1" +"Hello", 2, "String2", +"Hello", 3, "String3", "2016-01-13" +"Hello", 4, , "2016-01-14" +"Hello", 5, "String5", "2016-01-15", "2016-01-16" +"Hello", 6, "String6", "line with a break"' | $CLICKHOUSE_CLIENT --input_format_defaults_for_omitted_fields=1 --input_format_csv_empty_as_default=1 --input_format_csv_ignore_extra_columns=1 --query="INSERT INTO csv FORMAT CSV"; $CLICKHOUSE_CLIENT --query="SELECT * FROM csv ORDER BY s, n"; $CLICKHOUSE_CLIENT --query="DROP TABLE csv"; From 2c3a4cb90de34569277edb3e4cf9f50fa9e5d5a2 Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Thu, 22 Jun 2023 10:47:07 +0000 Subject: [PATCH 15/38] Style fix --- src/Processors/Formats/Impl/CSVRowInputFormat.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp index a727a5bc490..59b0f25f0bf 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp @@ -156,8 +156,7 @@ void CSVFormatReader::skipFieldDelimiter() { skipWhitespacesAndTabs(*buf, format_settings.csv.allow_whitespace_or_tab_as_delimiter); - bool res = checkChar(format_settings.csv.delimiter, *buf); - if (!res) + if (!checkChar(format_settings.csv.delimiter, *buf)) { if (!format_settings.csv.missing_as_default) { @@ -165,9 +164,7 @@ void CSVFormatReader::skipFieldDelimiter() throwAtAssertionFailed(err, *buf); } else - { current_row_has_missing_fields = true; - } } } From a0fde6a55b3ddb9cac0b3914fc18af58f6419eac Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Thu, 22 Jun 2023 10:50:14 +0000 Subject: [PATCH 16/38] Style fix --- .../Formats/Impl/CSVRowInputFormat.cpp | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp index 59b0f25f0bf..edbc33fb3c3 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp @@ -156,16 +156,17 @@ void CSVFormatReader::skipFieldDelimiter() { skipWhitespacesAndTabs(*buf, format_settings.csv.allow_whitespace_or_tab_as_delimiter); - if (!checkChar(format_settings.csv.delimiter, *buf)) + bool res = checkChar(format_settings.csv.delimiter, *buf); + if (res) + return; + + if (!format_settings.csv.missing_as_default) { - if (!format_settings.csv.missing_as_default) - { - char err[2] = {format_settings.csv.delimiter, '\0'}; - throwAtAssertionFailed(err, *buf); - } - else - current_row_has_missing_fields = true; + char err[2] = {format_settings.csv.delimiter, '\0'}; + throwAtAssertionFailed(err, *buf); } + else + current_row_has_missing_fields = true; } template From c85ade9c27ae56584c924b4b18541bc8615d816e Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Mon, 26 Jun 2023 07:44:19 +0000 Subject: [PATCH 17/38] Add const multiplier --- .../functions/date-time-functions.md | 1 - src/Core/DecimalFunctions.h | 2 +- 
src/Functions/DateTimeTransforms.h | 16 ++++++++-------- src/Functions/dateDiff.cpp | 9 ++++----- 4 files changed, 13 insertions(+), 15 deletions(-) diff --git a/docs/zh/sql-reference/functions/date-time-functions.md b/docs/zh/sql-reference/functions/date-time-functions.md index 270fa44a421..e4b70322477 100644 --- a/docs/zh/sql-reference/functions/date-time-functions.md +++ b/docs/zh/sql-reference/functions/date-time-functions.md @@ -625,7 +625,6 @@ SELECT date_add(YEAR, 3, toDate('2018-01-01')); │ 2021-01-01 │ └───────────────────────────────────────────────┘ ``` -## age {#age} ## date_diff {#date_diff} diff --git a/src/Core/DecimalFunctions.h b/src/Core/DecimalFunctions.h index defc21a5f43..17d95650730 100644 --- a/src/Core/DecimalFunctions.h +++ b/src/Core/DecimalFunctions.h @@ -33,7 +33,7 @@ template <> inline constexpr size_t max_precision = 38; template <> inline constexpr size_t max_precision = 76; template -constexpr inline auto scaleMultiplier(UInt32 scale) +inline auto scaleMultiplier(UInt32 scale) { if constexpr (std::is_same_v || std::is_same_v) return common::exp10_i32(scale); diff --git a/src/Functions/DateTimeTransforms.h b/src/Functions/DateTimeTransforms.h index afff8d6523d..c967d74da0c 100644 --- a/src/Functions/DateTimeTransforms.h +++ b/src/Functions/DateTimeTransforms.h @@ -19,8 +19,8 @@ namespace DB { -static constexpr auto microsecond_scale = 6; -static constexpr auto millisecond_scale = 3; +static constexpr auto microsecond_multiplier = 1000000; +static constexpr auto millisecond_multiplier = 1000; namespace ErrorCodes { @@ -1387,6 +1387,7 @@ struct ToRelativeSubsecondNumImpl static inline Int64 execute(const DateTime64 & t, DateTime64::NativeType scale, const DateLUTImpl &) { + static_assert(scale_multiplier == 1000 || scale_multiplier == 1000000); if (scale == scale_multiplier) return t.value; if (scale > scale_multiplier) @@ -1521,7 +1522,6 @@ struct ToDateTimeComponentsImpl static inline DateTimeComponentsWithFractionalPart execute(const DateTime64 & t, DateTime64::NativeType scale_multiplier, const DateLUTImpl & time_zone) { auto components = DecimalUtils::splitWithScaleMultiplier(t, scale_multiplier); - constexpr auto multiplier = DecimalUtils::scaleMultiplier(microsecond_scale); if (t.value < 0 && components.fractional) { @@ -1529,12 +1529,12 @@ struct ToDateTimeComponentsImpl --components.whole; } Int64 fractional = components.fractional; - if (scale_multiplier > multiplier) - fractional = fractional / (scale_multiplier / multiplier); - else if (scale_multiplier < multiplier) - fractional = fractional * (multiplier / scale_multiplier); + if (scale_multiplier > microsecond_multiplier) + fractional = fractional / (scale_multiplier / microsecond_multiplier); + else if (scale_multiplier < microsecond_multiplier) + fractional = fractional * (microsecond_multiplier / scale_multiplier); - constexpr auto divider = DecimalUtils::scaleMultiplier(microsecond_scale - millisecond_scale); + constexpr Int64 divider = microsecond_multiplier/ millisecond_multiplier; UInt16 millisecond = static_cast(fractional / divider); UInt16 microsecond = static_cast(fractional % divider); return DateTimeComponentsWithFractionalPart{time_zone.toDateTimeComponents(components.whole), millisecond, microsecond}; diff --git a/src/Functions/dateDiff.cpp b/src/Functions/dateDiff.cpp index 79be3059b2a..253ed703bb9 100644 --- a/src/Functions/dateDiff.cpp +++ b/src/Functions/dateDiff.cpp @@ -177,9 +177,8 @@ public: DateTimeComponentsWithFractionalPart a_comp; DateTimeComponentsWithFractionalPart 
        Int64 adjust_value;
-        constexpr auto multiplier = DecimalUtils::scaleMultiplier<Int64>(microsecond_scale);
-        auto x_microseconds = TransformDateTime64<ToRelativeSubsecondNumImpl<multiplier>>(transform_x.getScaleMultiplier()).execute(x, timezone_x);
-        auto y_microseconds = TransformDateTime64<ToRelativeSubsecondNumImpl<multiplier>>(transform_y.getScaleMultiplier()).execute(y, timezone_y);
+        auto x_microseconds = TransformDateTime64<ToRelativeSubsecondNumImpl<microsecond_multiplier>>(transform_x.getScaleMultiplier()).execute(x, timezone_x);
+        auto y_microseconds = TransformDateTime64<ToRelativeSubsecondNumImpl<microsecond_multiplier>>(transform_y.getScaleMultiplier()).execute(y, timezone_y);
 
         if (x_microseconds <= y_microseconds)
         {
@@ -399,9 +398,9 @@ public:
         else if (unit == "second" || unit == "ss" || unit == "s")
             impl.template dispatchForColumns<ToRelativeSecondNumImpl<1>>(x, y, timezone_x, timezone_y, res->getData());
         else if (unit == "millisecond" || unit == "ms")
-            impl.template dispatchForColumns<ToRelativeSubsecondNumImpl<DecimalUtils::scaleMultiplier<Int64>(millisecond_scale)>>(x, y, timezone_x, timezone_y, res->getData());
+            impl.template dispatchForColumns<ToRelativeSubsecondNumImpl<millisecond_multiplier>>(x, y, timezone_x, timezone_y, res->getData());
         else if (unit == "microsecond" || unit == "us" || unit == "u")
-            impl.template dispatchForColumns<ToRelativeSubsecondNumImpl<DecimalUtils::scaleMultiplier<Int64>(microsecond_scale)>>(x, y, timezone_x, timezone_y, res->getData());
+            impl.template dispatchForColumns<ToRelativeSubsecondNumImpl<microsecond_multiplier>>(x, y, timezone_x, timezone_y, res->getData());
         else
             throw Exception(ErrorCodes::BAD_ARGUMENTS,
                 "Function {} does not support '{}' unit", getName(), unit);

From e34597e43d09c3c164fb516a544f82d347be6afa Mon Sep 17 00:00:00 2001
From: Dmitry Kardymon
Date: Tue, 27 Jun 2023 18:36:32 +0300
Subject: [PATCH 18/38] Fix tabulation

---
 src/Functions/dateDiff.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Functions/dateDiff.cpp b/src/Functions/dateDiff.cpp
index 253ed703bb9..6bfbbb7c735 100644
--- a/src/Functions/dateDiff.cpp
+++ b/src/Functions/dateDiff.cpp
@@ -399,7 +399,7 @@ public:
             impl.template dispatchForColumns<ToRelativeSecondNumImpl<1>>(x, y, timezone_x, timezone_y, res->getData());
         else if (unit == "millisecond" || unit == "ms")
             impl.template dispatchForColumns<ToRelativeSubsecondNumImpl<millisecond_multiplier>>(x, y, timezone_x, timezone_y, res->getData());
-        else if (unit == "microsecond" || unit == "us" || unit == "u")
+        else if (unit == "microsecond" || unit == "us" || unit == "u")
             impl.template dispatchForColumns<ToRelativeSubsecondNumImpl<microsecond_multiplier>>(x, y, timezone_x, timezone_y, res->getData());
         else
             throw Exception(ErrorCodes::BAD_ARGUMENTS,

From 50449cc68d03f213c6b128ed51416d1de21ad1cd Mon Sep 17 00:00:00 2001
From: Sema Checherinda
Date: Fri, 30 Jun 2023 19:07:32 +0200
Subject: [PATCH 19/38] fix write to finalized buffer

---
 src/Core/Settings.h                              |  2 +-
 src/Server/HTTPHandler.cpp                       |  7 +++----
 .../0_stateless/00429_long_http_bufferization.sh | 13 ++++++++++++-
 3 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/src/Core/Settings.h b/src/Core/Settings.h
index 658f3c8025b..288413857d4 100644
--- a/src/Core/Settings.h
+++ b/src/Core/Settings.h
@@ -274,7 +274,7 @@ class IColumn;
     \
     M(UInt64, http_headers_progress_interval_ms, 100, "Do not send HTTP headers X-ClickHouse-Progress more frequently than at each specified interval.", 0) \
     M(Bool, http_wait_end_of_query, false, "Enable HTTP response buffering on the server-side.", 0) \
-    M(UInt64, http_response_buffer_size, false, "The number of bytes to buffer in the server memory before sending a HTTP response to the client or flushing to disk (when http_wait_end_of_query is enabled).", 0) \
+    M(UInt64, http_response_buffer_size, 0, "The number of bytes to buffer in the server memory before sending a HTTP response to the client or flushing to disk (when http_wait_end_of_query is enabled).", 0) \
     \
     M(Bool, fsync_metadata, true, "Do fsync after changing metadata for tables and
databases (.sql files). Could be disabled in case of poor latency on server with high load of DDL queries and high load of disk subsystem.", 0) \ \ diff --git a/src/Server/HTTPHandler.cpp b/src/Server/HTTPHandler.cpp index fe98ae5f69e..a391e3bb2e4 100644 --- a/src/Server/HTTPHandler.cpp +++ b/src/Server/HTTPHandler.cpp @@ -902,10 +902,9 @@ try /// Destroy CascadeBuffer to actualize buffers' positions and reset extra references if (used_output.hasDelayed()) { - if (used_output.out_maybe_delayed_and_compressed) - { - used_output.out_maybe_delayed_and_compressed->finalize(); - } + /// do not call finalize here for CascadeWriteBuffer used_output.out_maybe_delayed_and_compressed, + /// exception is written into used_output.out_maybe_compressed later + /// HTTPHandler::trySendExceptionToClient is called with exception context, it is Ok to destroy buffers used_output.out_maybe_delayed_and_compressed.reset(); } diff --git a/tests/queries/0_stateless/00429_long_http_bufferization.sh b/tests/queries/0_stateless/00429_long_http_bufferization.sh index 34d07cef7e3..55192422389 100755 --- a/tests/queries/0_stateless/00429_long_http_bufferization.sh +++ b/tests/queries/0_stateless/00429_long_http_bufferization.sh @@ -7,9 +7,11 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . "$CURDIR"/../shell_config.sh +format="RowBinary" + function query { # bash isn't able to store \0 bytes, so use [1; 255] random range - echo "SELECT greatest(toUInt8(1), toUInt8(intHash64(number))) FROM system.numbers LIMIT $1 FORMAT RowBinary" + echo "SELECT greatest(toUInt8(1), toUInt8(intHash64(number))) FROM system.numbers LIMIT $1 FORMAT $format" } function ch_url() { @@ -42,6 +44,14 @@ function check_last_line_exception() { } function check_exception_handling() { + # it is impossible to override max_block_size, details here https://github.com/ClickHouse/ClickHouse/issues/51694 + # rebuild CLICKHOUSE_URL for one call in order to avoid using random parameters from CLICKHOUSE_URL_PARAMS + CLICKHOUSE_URL="${CLICKHOUSE_PORT_HTTP_PROTO}://${CLICKHOUSE_HOST}:${CLICKHOUSE_PORT_HTTP}/?wait_end_of_query=0" \ + max_block_size=30000 \ + format=TSV \ + check_last_line_exception \ + "max_result_rows=400000&buffer_size=1048577&wait_end_of_query=0" 111222333444 + check_only_exception "max_result_bytes=1000" 1001 check_only_exception "max_result_bytes=1000&wait_end_of_query=1" 1001 @@ -60,6 +70,7 @@ check_exception_handling # Tune setting to speed up combinatorial test +# max_block_size has no effect here, that value has been set inside CLICKHOUSE_URL max_block_size=500000 corner_sizes="1048576 $(seq 500000 1000000 3500000)" From 31ced70ced6cf8a82aac60b7cd7e9d2740aae2bf Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Tue, 4 Jul 2023 20:19:20 +0200 Subject: [PATCH 20/38] remove wrong commit, fix the exceptions in tests --- .../00429_long_http_bufferization.sh | 27 +++++++++---------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/tests/queries/0_stateless/00429_long_http_bufferization.sh b/tests/queries/0_stateless/00429_long_http_bufferization.sh index 55192422389..98dd300e6ab 100755 --- a/tests/queries/0_stateless/00429_long_http_bufferization.sh +++ b/tests/queries/0_stateless/00429_long_http_bufferization.sh @@ -15,7 +15,9 @@ function query { } function ch_url() { - ${CLICKHOUSE_CURL_COMMAND} -q -sS "${CLICKHOUSE_URL}&max_block_size=$max_block_size&$1" -d "$(query "$2")" + ${CLICKHOUSE_CURL_COMMAND} -q -sS \ + 
"${CLICKHOUSE_URL}${max_block_size:+"&max_block_size=$max_block_size"}&$1" \ + -d "$(query "$2")" } @@ -26,9 +28,9 @@ exception_pattern="DB::Exception:[[:print:]]*" function check_only_exception() { local res res=$(ch_url "$1" "$2") - #(echo "$res") - #(echo "$res" | wc -l) - #(echo "$res" | grep -c "$exception_pattern") + # echo "$res" + # echo "$res" | wc -l + # echo "$res" | grep -c "$exception_pattern" [[ $(echo "$res" | wc -l) -eq 1 ]] || echo FAIL 1 "$@" [[ $(echo "$res" | grep -c "$exception_pattern") -eq 1 ]] || echo FAIL 2 "$@" } @@ -36,27 +38,23 @@ function check_only_exception() { function check_last_line_exception() { local res res=$(ch_url "$1" "$2") - #echo "$res" > res - #echo "$res" | wc -c - #echo "$res" | tail -n -2 + # echo "$res" > res + # echo "$res" | wc -c + # echo "$res" | tail -n -2 [[ $(echo "$res" | tail -n -1 | grep -c "$exception_pattern") -eq 1 ]] || echo FAIL 3 "$@" [[ $(echo "$res" | head -n -1 | grep -c "$exception_pattern") -eq 0 ]] || echo FAIL 4 "$@" } function check_exception_handling() { - # it is impossible to override max_block_size, details here https://github.com/ClickHouse/ClickHouse/issues/51694 - # rebuild CLICKHOUSE_URL for one call in order to avoid using random parameters from CLICKHOUSE_URL_PARAMS - CLICKHOUSE_URL="${CLICKHOUSE_PORT_HTTP_PROTO}://${CLICKHOUSE_HOST}:${CLICKHOUSE_PORT_HTTP}/?wait_end_of_query=0" \ - max_block_size=30000 \ format=TSV \ check_last_line_exception \ - "max_result_rows=400000&buffer_size=1048577&wait_end_of_query=0" 111222333444 + "max_block_size=30000&max_result_rows=400000&buffer_size=1048577&wait_end_of_query=0" 111222333444 check_only_exception "max_result_bytes=1000" 1001 check_only_exception "max_result_bytes=1000&wait_end_of_query=1" 1001 - check_only_exception "max_result_bytes=1048576&buffer_size=1048576&wait_end_of_query=0" 1048577 - check_only_exception "max_result_bytes=1048576&buffer_size=1048576&wait_end_of_query=1" 1048577 + check_last_line_exception "max_result_bytes=1048576&buffer_size=1048576&wait_end_of_query=0" 1048577 + check_only_exception "max_result_bytes=1048576&buffer_size=1048576&wait_end_of_query=1" 1048577 check_only_exception "max_result_bytes=1500000&buffer_size=2500000&wait_end_of_query=0" 1500001 check_only_exception "max_result_bytes=1500000&buffer_size=1500000&wait_end_of_query=1" 1500001 @@ -70,7 +68,6 @@ check_exception_handling # Tune setting to speed up combinatorial test -# max_block_size has no effect here, that value has been set inside CLICKHOUSE_URL max_block_size=500000 corner_sizes="1048576 $(seq 500000 1000000 3500000)" From 86014a60a308ec41c7416bdbbfe6b360dcf1617b Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Wed, 5 Jul 2023 11:42:02 +0000 Subject: [PATCH 21/38] Fixed case with spaces before delimiter --- src/Processors/Formats/Impl/CSVRowInputFormat.cpp | 1 + tests/queries/0_stateless/00301_csv.sh | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp index edbc33fb3c3..9731b4ba465 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp @@ -341,6 +341,7 @@ bool CSVFormatReader::readField( if (is_last_file_column && format_settings.csv.ignore_extra_columns) { // Skip all fields to next line. 
+ skipWhitespacesAndTabs(*buf, format_settings.csv.allow_whitespace_or_tab_as_delimiter); while (checkChar(format_settings.csv.delimiter, *buf)) { skipField(); diff --git a/tests/queries/0_stateless/00301_csv.sh b/tests/queries/0_stateless/00301_csv.sh index dc354433af9..7657745e9f7 100755 --- a/tests/queries/0_stateless/00301_csv.sh +++ b/tests/queries/0_stateless/00301_csv.sh @@ -49,7 +49,7 @@ echo '"Hello", 1, "String1" "Hello", 3, "String3", "2016-01-13" "Hello", 4, , "2016-01-14" "Hello", 5, "String5", "2016-01-15", "2016-01-16" -"Hello", 6, "String6", "line with a +"Hello", 6, "String6" , "line with a break"' | $CLICKHOUSE_CLIENT --input_format_defaults_for_omitted_fields=1 --input_format_csv_empty_as_default=1 --input_format_csv_ignore_extra_columns=1 --query="INSERT INTO csv FORMAT CSV"; $CLICKHOUSE_CLIENT --query="SELECT * FROM csv ORDER BY s, n"; $CLICKHOUSE_CLIENT --query="DROP TABLE csv"; From 24b5c9c204dcc0f3c181d13528d46d012dae86c9 Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Thu, 6 Jul 2023 06:05:43 +0000 Subject: [PATCH 22/38] Use one setting input_format_csv_allow_variable_number_of_colums and code in RowInput --- docs/en/interfaces/formats.md | 3 +- .../operations/settings/settings-formats.md | 10 +--- docs/ru/interfaces/formats.md | 3 +- docs/ru/operations/settings/settings.md | 10 +--- src/Core/Settings.h | 3 +- src/Formats/FormatFactory.cpp | 3 +- src/Formats/FormatSettings.h | 3 +- .../Formats/Impl/CSVRowInputFormat.cpp | 58 ++++++------------- .../Formats/Impl/CSVRowInputFormat.h | 6 +- .../RowInputFormatWithNamesAndTypes.cpp | 23 ++++++++ .../Formats/RowInputFormatWithNamesAndTypes.h | 4 ++ 11 files changed, 58 insertions(+), 68 deletions(-) diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index 79790cef5b2..34f9abb91d4 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -471,8 +471,7 @@ The CSV format supports the output of totals and extremes the same way as `TabSe - [input_format_csv_skip_trailing_empty_lines](/docs/en/operations/settings/settings-formats.md/#input_format_csv_skip_trailing_empty_lines) - skip trailing empty lines at the end of data. Default value - `false`. - [input_format_csv_trim_whitespaces](/docs/en/operations/settings/settings-formats.md/#input_format_csv_trim_whitespaces) - trim spaces and tabs in non-quoted CSV strings. Default value - `true`. - [input_format_csv_allow_whitespace_or_tab_as_delimiter](/docs/en/operations/settings/settings-formats.md/# input_format_csv_allow_whitespace_or_tab_as_delimiter) - Allow to use whitespace or tab as field delimiter in CSV strings. Default value - `false`. -- [input_format_csv_ignore_extra_columns](/docs/en/operations/settings/settings-formats.md/#input_format_csv_ignore_extra_columns) - ignore extra columns in CSV input (if file has more columns than expected). Default value - `false`. -- [input_format_csv_missing_as_default](/docs/en/operations/settings/settings-formats.md/#input_format_csv_missing_as_default) - treat missing fields in CSV input as default values. Default value - `false`. +- [input_format_csv_allow_variable_number_of_colums](/docs/en/operations/settings/settings-formats.md/#input_format_csv_allow_variable_number_of_colums) - ignore extra columns in CSV input (if file has more columns than expected) and treat missing fields in CSV input as default values. Default value - `false`. 
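For illustration, a minimal sketch of the "extra columns" half of this behavior; the table and input are hypothetical, and the example uses the corrected spelling `input_format_csv_allow_variable_number_of_columns` that a later commit in this series settles on:

```bash
# Assumes a table csv_demo(s String, n UInt64) already exists.
# The third CSV field has no matching column and should be skipped, not rejected.
echo '"a", 1, "unexpected trailing field"' | clickhouse-client \
    --input_format_csv_allow_variable_number_of_columns=1 \
    --query="INSERT INTO csv_demo FORMAT CSV"
```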
## CSVWithNames {#csvwithnames} diff --git a/docs/en/operations/settings/settings-formats.md b/docs/en/operations/settings/settings-formats.md index 6b05f41666c..43e410ceee8 100644 --- a/docs/en/operations/settings/settings-formats.md +++ b/docs/en/operations/settings/settings-formats.md @@ -931,15 +931,9 @@ Result ```text " string " ``` -### input_format_csv_ignore_extra_columns {#input_format_csv_ignore_extra_columns} +### input_format_csv_allow_variable_number_of_colums {#input_format_csv_allow_variable_number_of_colums} -Ignore extra columns in CSV input (if file has more columns than expected). - -Disabled by default. - -### input_format_csv_missing_as_default {#input_format_csv_missing_as_default} - -Treat missing fields in CSV input as default values. +ignore extra columns in CSV input (if file has more columns than expected) and treat missing fields in CSV input as default values. Disabled by default. diff --git a/docs/ru/interfaces/formats.md b/docs/ru/interfaces/formats.md index 7e3bb3f7d26..e7c57fff749 100644 --- a/docs/ru/interfaces/formats.md +++ b/docs/ru/interfaces/formats.md @@ -402,8 +402,7 @@ $ clickhouse-client --format_csv_delimiter="|" --query="INSERT INTO test.csv FOR - [input_format_csv_skip_first_lines](../operations/settings/settings.md#input_format_csv_skip_first_lines) - пропустить указанное количество строк в начале данных. Значение по умолчанию - `0`. - [input_format_csv_detect_header](../operations/settings/settings.md#input_format_csv_detect_header) - обнаружить заголовок с именами и типами в формате CSV. Значение по умолчанию - `true`. - [input_format_csv_trim_whitespaces](../operations/settings/settings.md#input_format_csv_trim_whitespaces) - удалить пробелы и символы табуляции из строк без кавычек. Значение по умолчанию - `true`. -- [input_format_csv_ignore_extra_columns](../operations/settings/settings.md/#input_format_csv_ignore_extra_columns) - игнорировать дополнительные столбцы (если файл содержит больше столбцов чем ожидается). Значение по умолчанию - `false`. -- [input_format_csv_missing_as_default](../operations/settings/settings.md/#input_format_csv_missing_as_default) - рассматривать отсутствующие поля в CSV в качестве значений по умолчанию. Значение по умолчанию - `false`. +- [input_format_csv_allow_variable_number_of_colums](../operations/settings/settings.md/#input_format_csv_allow_variable_number_of_colums) - игнорировать дополнительные столбцы (если файл содержит больше столбцов чем ожидается) и рассматривать отсутствующие поля в CSV в качестве значений по умолчанию. Значение по умолчанию - `false`. ## CSVWithNames {#csvwithnames} diff --git a/docs/ru/operations/settings/settings.md b/docs/ru/operations/settings/settings.md index e679ce6abe1..ddc101c6991 100644 --- a/docs/ru/operations/settings/settings.md +++ b/docs/ru/operations/settings/settings.md @@ -1727,15 +1727,9 @@ echo ' string ' | ./clickhouse local -q "select * from table FORMAT CSV" --in " string " ``` -## input_format_csv_ignore_extra_columns {#input_format_csv_ignore_extra_columns} +## input_format_csv_allow_variable_number_of_colums {#input_format_csv_allow_variable_number_of_colums} -Игнорировать дополнительные столбцы (если файл содержит больше столбцов чем ожидается). - -Выключено по умолчанию. - -## input_format_csv_missing_as_default {#input_format_csv_missing_as_default} - -Рассматривать отсутствующие поля в CSV в качестве значений по умолчанию. 
+Игнорировать дополнительные столбцы (если файл содержит больше столбцов чем ожидается) и рассматривать отсутствующие поля в CSV в качестве значений по умолчанию. Выключено по умолчанию. diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 044b3c34dc2..df2a916b7cf 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -1009,8 +1009,7 @@ class IColumn; M(Bool, regexp_dict_allow_hyperscan, true, "Allow regexp_tree dictionary using Hyperscan library.", 0) \ \ M(Bool, dictionary_use_async_executor, false, "Execute a pipeline for reading from a dictionary with several threads. It's supported only by DIRECT dictionary with CLICKHOUSE source.", 0) \ - M(Bool, input_format_csv_ignore_extra_columns, false, "Ignore extra columns in CSV input (if file has more columns than expected)", 0) \ - M(Bool, input_format_csv_missing_as_default, false, "Treat missing fields in CSV input as default values", 0) \ + M(Bool, input_format_csv_allow_variable_number_of_colums, false, "Ignore extra columns in CSV input (if file has more columns than expected) and treat missing fields in CSV input as default values", 0) \ // End of FORMAT_FACTORY_SETTINGS // Please add settings non-related to formats into the COMMON_SETTINGS above. diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 04b095a92d6..af9823dde73 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -72,8 +72,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.csv.skip_trailing_empty_lines = settings.input_format_csv_skip_trailing_empty_lines; format_settings.csv.trim_whitespaces = settings.input_format_csv_trim_whitespaces; format_settings.csv.allow_whitespace_or_tab_as_delimiter = settings.input_format_csv_allow_whitespace_or_tab_as_delimiter; - format_settings.csv.ignore_extra_columns = settings.input_format_csv_ignore_extra_columns; - format_settings.csv.missing_as_default = settings.input_format_csv_missing_as_default; + format_settings.csv.allow_variable_number_of_colums = settings.input_format_csv_allow_variable_number_of_colums; format_settings.hive_text.fields_delimiter = settings.input_format_hive_text_fields_delimiter; format_settings.hive_text.collection_items_delimiter = settings.input_format_hive_text_collection_items_delimiter; format_settings.hive_text.map_keys_delimiter = settings.input_format_hive_text_map_keys_delimiter; diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index 4bdc9077a0b..653578f8496 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -140,8 +140,7 @@ struct FormatSettings bool skip_trailing_empty_lines = false; bool trim_whitespaces = true; bool allow_whitespace_or_tab_as_delimiter = false; - bool ignore_extra_columns = false; - bool missing_as_default = false; + bool allow_variable_number_of_colums = false; } csv; struct HiveText diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp index 9731b4ba465..57e05ae7cd3 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp @@ -155,18 +155,7 @@ CSVFormatReader::CSVFormatReader(PeekableReadBuffer & buf_, const FormatSettings void CSVFormatReader::skipFieldDelimiter() { skipWhitespacesAndTabs(*buf, format_settings.csv.allow_whitespace_or_tab_as_delimiter); - - bool res = checkChar(format_settings.csv.delimiter, *buf); - if (res) - return; - - if (!format_settings.csv.missing_as_default) - 
{ - char err[2] = {format_settings.csv.delimiter, '\0'}; - throwAtAssertionFailed(err, *buf); - } - else - current_row_has_missing_fields = true; + assertChar(format_settings.csv.delimiter, *buf); } template @@ -206,7 +195,6 @@ void CSVFormatReader::skipRowEndDelimiter() return; skipEndOfLine(*buf); - current_row_has_missing_fields = false; } void CSVFormatReader::skipHeaderRow() @@ -295,6 +283,11 @@ bool CSVFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer & out) return true; } +bool CSVFormatReader::allowVariableNumberOfColumns() +{ + return format_settings.csv.allow_variable_number_of_colums; +} + bool CSVFormatReader::readField( IColumn & column, const DataTypePtr & type, @@ -308,8 +301,6 @@ bool CSVFormatReader::readField( const bool at_delimiter = !buf->eof() && *buf->position() == format_settings.csv.delimiter; const bool at_last_column_line_end = is_last_file_column && (buf->eof() || *buf->position() == '\n' || *buf->position() == '\r'); - bool res = false; - /// Note: Tuples are serialized in CSV as separate columns, but with empty_as_default or null_as_default /// only one empty or NULL column will be expected if (format_settings.csv.empty_as_default && (at_delimiter || at_last_column_line_end)) @@ -321,34 +312,18 @@ bool CSVFormatReader::readField( /// they do not contain empty unquoted fields, so this check /// works for tuples as well. column.insertDefault(); - } - else if (current_row_has_missing_fields) - { - column.insertDefault(); - } - else if (format_settings.null_as_default && !isNullableOrLowCardinalityNullable(type)) - { - /// If value is null but type is not nullable then use default value instead. - res = SerializationNullable::deserializeTextCSVImpl(column, *buf, format_settings, serialization); - } - else - { - /// Read the column normally. - serialization->deserializeTextCSV(column, *buf, format_settings); - res = true; + return false; } - if (is_last_file_column && format_settings.csv.ignore_extra_columns) + if (format_settings.null_as_default && !isNullableOrLowCardinalityNullable(type)) { - // Skip all fields to next line. - skipWhitespacesAndTabs(*buf, format_settings.csv.allow_whitespace_or_tab_as_delimiter); - while (checkChar(format_settings.csv.delimiter, *buf)) - { - skipField(); - skipWhitespacesAndTabs(*buf, format_settings.csv.allow_whitespace_or_tab_as_delimiter); - } + /// If value is null but type is not nullable then use default value instead. + return SerializationNullable::deserializeTextCSVImpl(column, *buf, format_settings, serialization); } - return res; + + /// Read the column normally. 
+ serialization->deserializeTextCSV(column, *buf, format_settings); + return true; } void CSVFormatReader::skipPrefixBeforeHeader() @@ -377,6 +352,11 @@ bool CSVFormatReader::checkForSuffix() return false; } +bool CSVFormatReader::checkForEndOfRow() +{ + return buf->eof() || *buf->position() == '\n' || *buf->position() == '\r'; +} + CSVSchemaReader::CSVSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, const FormatSettings & format_settings_) : FormatWithNamesAndTypesSchemaReader( buf, diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.h b/src/Processors/Formats/Impl/CSVRowInputFormat.h index 82e03c453e7..8ccf04feed3 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.h +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.h @@ -69,6 +69,9 @@ public: void skipRowEndDelimiter() override; void skipPrefixBeforeHeader() override; + bool checkForEndOfRow() override; + bool allowVariableNumberOfColumns() override; + std::vector readNames() override { return readHeaderRow(); } std::vector readTypes() override { return readHeaderRow(); } std::vector readHeaderRow() { return readRowImpl(); } @@ -89,9 +92,6 @@ public: protected: PeekableReadBuffer * buf; - -private: - bool current_row_has_missing_fields = false; }; class CSVSchemaReader : public FormatWithNamesAndTypesSchemaReader diff --git a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp index eaedbbb4a1e..fb49779e0af 100644 --- a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp +++ b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp @@ -227,7 +227,30 @@ bool RowInputFormatWithNamesAndTypes::readRow(MutableColumns & columns, RowReadE format_reader->skipField(file_column); if (!is_last_file_column) + { + if (format_reader->allowVariableNumberOfColumns() && format_reader->checkForEndOfRow()) + { + ++file_column; + while (file_column < column_mapping->column_indexes_for_input_fields.size()) + { + const auto & rem_column_index = column_mapping->column_indexes_for_input_fields[file_column]; + columns[*rem_column_index]->insertDefault(); + ++file_column; + } + } + else + format_reader->skipFieldDelimiter(); + } + } + + if (format_reader->allowVariableNumberOfColumns() && !format_reader->checkForEndOfRow()) + { + do + { format_reader->skipFieldDelimiter(); + format_reader->skipField(1); + } + while (!format_reader->checkForEndOfRow()); } format_reader->skipRowEndDelimiter(); diff --git a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h index 5648acd392d..b5103d3db39 100644 --- a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h +++ b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h @@ -119,6 +119,10 @@ public: /// Check suffix. 
virtual bool checkForSuffix() { return in->eof(); } + virtual bool checkForEndOfRow() { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method checkForEndOfRow is not implemented"); } + + virtual bool allowVariableNumberOfColumns() { return false; } + const FormatSettings & getFormatSettings() const { return format_settings; } virtual void setReadBuffer(ReadBuffer & in_) { in = &in_; } From 32f5a7830229b53df80f9e788b860066a4a86947 Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Thu, 6 Jul 2023 07:32:46 +0000 Subject: [PATCH 23/38] Fix setting name --- docs/en/interfaces/formats.md | 2 +- docs/en/operations/settings/settings-formats.md | 2 +- docs/ru/interfaces/formats.md | 2 +- docs/ru/operations/settings/settings.md | 2 +- src/Core/Settings.h | 2 +- src/Formats/FormatFactory.cpp | 2 +- src/Formats/FormatSettings.h | 2 +- src/Processors/Formats/Impl/CSVRowInputFormat.cpp | 2 +- tests/queries/0_stateless/00301_csv.reference | 4 ++-- tests/queries/0_stateless/00301_csv.sh | 8 ++++---- 10 files changed, 14 insertions(+), 14 deletions(-) diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index 34f9abb91d4..ed2f010a632 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -471,7 +471,7 @@ The CSV format supports the output of totals and extremes the same way as `TabSe - [input_format_csv_skip_trailing_empty_lines](/docs/en/operations/settings/settings-formats.md/#input_format_csv_skip_trailing_empty_lines) - skip trailing empty lines at the end of data. Default value - `false`. - [input_format_csv_trim_whitespaces](/docs/en/operations/settings/settings-formats.md/#input_format_csv_trim_whitespaces) - trim spaces and tabs in non-quoted CSV strings. Default value - `true`. - [input_format_csv_allow_whitespace_or_tab_as_delimiter](/docs/en/operations/settings/settings-formats.md/# input_format_csv_allow_whitespace_or_tab_as_delimiter) - Allow to use whitespace or tab as field delimiter in CSV strings. Default value - `false`. -- [input_format_csv_allow_variable_number_of_colums](/docs/en/operations/settings/settings-formats.md/#input_format_csv_allow_variable_number_of_colums) - ignore extra columns in CSV input (if file has more columns than expected) and treat missing fields in CSV input as default values. Default value - `false`. +- [input_format_csv_allow_variable_number_of_columns](/docs/en/operations/settings/settings-formats.md/#input_format_csv_allow_variable_number_of_columns) - ignore extra columns in CSV input (if file has more columns than expected) and treat missing fields in CSV input as default values. Default value - `false`. ## CSVWithNames {#csvwithnames} diff --git a/docs/en/operations/settings/settings-formats.md b/docs/en/operations/settings/settings-formats.md index 43e410ceee8..3eea5ef4ad9 100644 --- a/docs/en/operations/settings/settings-formats.md +++ b/docs/en/operations/settings/settings-formats.md @@ -931,7 +931,7 @@ Result ```text " string " ``` -### input_format_csv_allow_variable_number_of_colums {#input_format_csv_allow_variable_number_of_colums} +### input_format_csv_allow_variable_number_of_columns {#input_format_csv_allow_variable_number_of_columns} ignore extra columns in CSV input (if file has more columns than expected) and treat missing fields in CSV input as default values. 
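A rough sketch of the "missing fields" half of the behavior (table name and data are hypothetical; as in the 00301_csv.sh test updated below, filling column defaults also requires `input_format_defaults_for_omitted_fields`):

```bash
clickhouse-client --query="CREATE TABLE csv_demo (s String, n UInt64, x UInt64 DEFAULT 7) ENGINE = Memory"
# The first row omits the trailing field, so x should fall back to its default.
echo '"a", 1
"b", 2, 42' | clickhouse-client \
    --input_format_csv_allow_variable_number_of_columns=1 \
    --input_format_defaults_for_omitted_fields=1 \
    --query="INSERT INTO csv_demo FORMAT CSV"
clickhouse-client --query="SELECT * FROM csv_demo ORDER BY s"  # expected: a 1 7, then b 2 42
```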
diff --git a/docs/ru/interfaces/formats.md b/docs/ru/interfaces/formats.md index e7c57fff749..e232b63f049 100644 --- a/docs/ru/interfaces/formats.md +++ b/docs/ru/interfaces/formats.md @@ -402,7 +402,7 @@ $ clickhouse-client --format_csv_delimiter="|" --query="INSERT INTO test.csv FOR - [input_format_csv_skip_first_lines](../operations/settings/settings.md#input_format_csv_skip_first_lines) - пропустить указанное количество строк в начале данных. Значение по умолчанию - `0`. - [input_format_csv_detect_header](../operations/settings/settings.md#input_format_csv_detect_header) - обнаружить заголовок с именами и типами в формате CSV. Значение по умолчанию - `true`. - [input_format_csv_trim_whitespaces](../operations/settings/settings.md#input_format_csv_trim_whitespaces) - удалить пробелы и символы табуляции из строк без кавычек. Значение по умолчанию - `true`. -- [input_format_csv_allow_variable_number_of_colums](../operations/settings/settings.md/#input_format_csv_allow_variable_number_of_colums) - игнорировать дополнительные столбцы (если файл содержит больше столбцов чем ожидается) и рассматривать отсутствующие поля в CSV в качестве значений по умолчанию. Значение по умолчанию - `false`. +- [input_format_csv_allow_variable_number_of_columns](../operations/settings/settings.md/#input_format_csv_allow_variable_number_of_columns) - игнорировать дополнительные столбцы (если файл содержит больше столбцов чем ожидается) и рассматривать отсутствующие поля в CSV в качестве значений по умолчанию. Значение по умолчанию - `false`. ## CSVWithNames {#csvwithnames} diff --git a/docs/ru/operations/settings/settings.md b/docs/ru/operations/settings/settings.md index ddc101c6991..42e21f6140b 100644 --- a/docs/ru/operations/settings/settings.md +++ b/docs/ru/operations/settings/settings.md @@ -1727,7 +1727,7 @@ echo ' string ' | ./clickhouse local -q "select * from table FORMAT CSV" --in " string " ``` -## input_format_csv_allow_variable_number_of_colums {#input_format_csv_allow_variable_number_of_colums} +## input_format_csv_allow_variable_number_of_columns {#input_format_csv_allow_variable_number_of_columns} Игнорировать дополнительные столбцы (если файл содержит больше столбцов чем ожидается) и рассматривать отсутствующие поля в CSV в качестве значений по умолчанию. diff --git a/src/Core/Settings.h b/src/Core/Settings.h index df2a916b7cf..7f8a52c69fa 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -1009,7 +1009,7 @@ class IColumn; M(Bool, regexp_dict_allow_hyperscan, true, "Allow regexp_tree dictionary using Hyperscan library.", 0) \ \ M(Bool, dictionary_use_async_executor, false, "Execute a pipeline for reading from a dictionary with several threads. It's supported only by DIRECT dictionary with CLICKHOUSE source.", 0) \ - M(Bool, input_format_csv_allow_variable_number_of_colums, false, "Ignore extra columns in CSV input (if file has more columns than expected) and treat missing fields in CSV input as default values", 0) \ + M(Bool, input_format_csv_allow_variable_number_of_columns, false, "Ignore extra columns in CSV input (if file has more columns than expected) and treat missing fields in CSV input as default values", 0) \ // End of FORMAT_FACTORY_SETTINGS // Please add settings non-related to formats into the COMMON_SETTINGS above. 
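Once the `M(Bool, ...)` entry above is registered, the setting becomes visible through runtime introspection; a quick sanity check (output shape may vary between builds):

```bash
clickhouse-client --query="SELECT name, value, changed FROM system.settings WHERE name = 'input_format_csv_allow_variable_number_of_columns'"
```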
diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index af9823dde73..182abc84ffe 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -72,7 +72,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.csv.skip_trailing_empty_lines = settings.input_format_csv_skip_trailing_empty_lines; format_settings.csv.trim_whitespaces = settings.input_format_csv_trim_whitespaces; format_settings.csv.allow_whitespace_or_tab_as_delimiter = settings.input_format_csv_allow_whitespace_or_tab_as_delimiter; - format_settings.csv.allow_variable_number_of_colums = settings.input_format_csv_allow_variable_number_of_colums; + format_settings.csv.allow_variable_number_of_columns = settings.input_format_csv_allow_variable_number_of_columns; format_settings.hive_text.fields_delimiter = settings.input_format_hive_text_fields_delimiter; format_settings.hive_text.collection_items_delimiter = settings.input_format_hive_text_collection_items_delimiter; format_settings.hive_text.map_keys_delimiter = settings.input_format_hive_text_map_keys_delimiter; diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index 653578f8496..dd4608227d0 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -140,7 +140,7 @@ struct FormatSettings bool skip_trailing_empty_lines = false; bool trim_whitespaces = true; bool allow_whitespace_or_tab_as_delimiter = false; - bool allow_variable_number_of_colums = false; + bool allow_variable_number_of_columns = false; } csv; struct HiveText diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp index 57e05ae7cd3..60f1cbe1f80 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp @@ -285,7 +285,7 @@ bool CSVFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer & out) bool CSVFormatReader::allowVariableNumberOfColumns() { - return format_settings.csv.allow_variable_number_of_colums; + return format_settings.csv.allow_variable_number_of_columns; } bool CSVFormatReader::readField( diff --git a/tests/queries/0_stateless/00301_csv.reference b/tests/queries/0_stateless/00301_csv.reference index 804ccf0c713..ec8c5f2b371 100644 --- a/tests/queries/0_stateless/00301_csv.reference +++ b/tests/queries/0_stateless/00301_csv.reference @@ -14,14 +14,14 @@ default-eof 1 2019-06-19 2016-01-01 01:02:03 NUL 2016-01-02 01:02:03 Nhello \N \N -=== Test input_format_csv_ignore_extra_columns +=== Test ignore extra columns Hello 1 String1 Hello 2 String2 Hello 3 String3 Hello 4 String4 Hello 5 String5 Hello 6 String6 -=== Test input_format_csv_missing_as_default +=== Test missing as default 0 0 33 \N 55 Default 0 0 33 \N 55 Default Hello 0 0 33 \N 55 Default diff --git a/tests/queries/0_stateless/00301_csv.sh b/tests/queries/0_stateless/00301_csv.sh index 7657745e9f7..776bd39fc03 100755 --- a/tests/queries/0_stateless/00301_csv.sh +++ b/tests/queries/0_stateless/00301_csv.sh @@ -41,7 +41,7 @@ $CLICKHOUSE_CLIENT --query="SELECT * FROM csv ORDER BY s NULLS LAST"; $CLICKHOUSE_CLIENT --query="DROP TABLE csv"; -echo === Test input_format_csv_ignore_extra_columns +echo === Test ignore extra columns $CLICKHOUSE_CLIENT --query="CREATE TABLE csv (s String, n UInt64 DEFAULT 3, d String DEFAULT 'String4') ENGINE = Memory"; echo '"Hello", 1, "String1" @@ -50,12 +50,12 @@ echo '"Hello", 1, "String1" "Hello", 4, , "2016-01-14" "Hello", 5, "String5", "2016-01-15", "2016-01-16" 
"Hello", 6, "String6" , "line with a -break"' | $CLICKHOUSE_CLIENT --input_format_defaults_for_omitted_fields=1 --input_format_csv_empty_as_default=1 --input_format_csv_ignore_extra_columns=1 --query="INSERT INTO csv FORMAT CSV"; +break"' | $CLICKHOUSE_CLIENT --input_format_defaults_for_omitted_fields=1 --input_format_csv_empty_as_default=1 --input_format_csv_allow_variable_number_of_columns=1 --query="INSERT INTO csv FORMAT CSV"; $CLICKHOUSE_CLIENT --query="SELECT * FROM csv ORDER BY s, n"; $CLICKHOUSE_CLIENT --query="DROP TABLE csv"; -echo === Test input_format_csv_missing_as_default +echo === Test missing as default $CLICKHOUSE_CLIENT --query="CREATE TABLE csv (f1 String, f2 UInt64, f3 UInt256, f4 UInt64 Default 33, f5 Nullable(UInt64), f6 Nullable(UInt64) Default 55, f7 String DEFAULT 'Default') ENGINE = Memory"; echo ' @@ -65,6 +65,6 @@ echo ' "Hello", 1, 3, 2 "Hello",1,4,2,3,4,"String" "Hello", 1, 4, 2, 3, 4, "String" -"Hello", 1, 5, 2, 3, 4, "String",'| $CLICKHOUSE_CLIENT --input_format_defaults_for_omitted_fields=1 --input_format_csv_missing_as_default=1 --query="INSERT INTO csv FORMAT CSV"; +"Hello", 1, 5, 2, 3, 4, "String",'| $CLICKHOUSE_CLIENT --input_format_defaults_for_omitted_fields=1 --input_format_csv_allow_variable_number_of_columns=1 --query="INSERT INTO csv FORMAT CSV"; $CLICKHOUSE_CLIENT --query="SELECT * FROM csv ORDER BY f1, f2, f3, f4, f5 NULLS FIRST, f6, f7"; $CLICKHOUSE_CLIENT --query="DROP TABLE csv"; From 86fc70223693db8aac9edfa7c85e7e80286042ec Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Thu, 6 Jul 2023 15:14:18 +0300 Subject: [PATCH 24/38] Add skipWhitespacesAndTabs() Co-authored-by: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> --- src/Processors/Formats/Impl/CSVRowInputFormat.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp index 60f1cbe1f80..79ce2549b4d 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp @@ -354,6 +354,7 @@ bool CSVFormatReader::checkForSuffix() bool CSVFormatReader::checkForEndOfRow() { + skipWhitespacesAndTabs(*buf, format_settings.csv.allow_whitespace_or_tab_as_delimiter); return buf->eof() || *buf->position() == '\n' || *buf->position() == '\r'; } From 28332076054cc77660a4dbc3e13dcea1999a6342 Mon Sep 17 00:00:00 2001 From: Dmitry Kardymon Date: Thu, 6 Jul 2023 13:09:49 +0000 Subject: [PATCH 25/38] Edit tests to test last commit --- tests/queries/0_stateless/00301_csv.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/00301_csv.sh b/tests/queries/0_stateless/00301_csv.sh index 776bd39fc03..80053c99a17 100755 --- a/tests/queries/0_stateless/00301_csv.sh +++ b/tests/queries/0_stateless/00301_csv.sh @@ -44,7 +44,7 @@ $CLICKHOUSE_CLIENT --query="DROP TABLE csv"; echo === Test ignore extra columns $CLICKHOUSE_CLIENT --query="CREATE TABLE csv (s String, n UInt64 DEFAULT 3, d String DEFAULT 'String4') ENGINE = Memory"; -echo '"Hello", 1, "String1" +echo '"Hello", 1, "String1" "Hello", 2, "String2", "Hello", 3, "String3", "2016-01-13" "Hello", 4, , "2016-01-14" From c35294317dbff31b8ff8b48f6256162d6d5dc02e Mon Sep 17 00:00:00 2001 From: vdimir Date: Thu, 6 Jul 2023 15:06:54 +0000 Subject: [PATCH 26/38] Remove parts in order for object storage always --- src/Storages/MergeTree/MergeTreeData.cpp | 34 +++++++++++++++--------- 1 file changed, 22 insertions(+), 12 deletions(-) diff --git 
a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp
index fa9bfd38a23..0ef71895999 100644
--- a/src/Storages/MergeTree/MergeTreeData.cpp
+++ b/src/Storages/MergeTree/MergeTreeData.cpp
@@ -2137,20 +2137,20 @@ MergeTreeData::DataPartsVector MergeTreeData::grabOldParts(bool force)
 
     /// Please don't use "zero-copy replication" (a non-production feature) in production.
     /// It is not ready for production usage. Don't use it.
 
-    bool need_remove_parts_in_order = supportsReplication() && getSettings()->allow_remote_fs_zero_copy_replication;
+    /// It also is disabled for any object storage, because it can lead to race conditions on blob removal.
+    /// (see comment at `clearPartsFromFilesystemImpl`).
+    bool need_remove_parts_in_order = false;
 
-    if (need_remove_parts_in_order)
+    if (supportsReplication())
     {
-        bool has_zero_copy_disk = false;
         for (const auto & disk : getDisks())
         {
-            if (disk->supportZeroCopyReplication())
+            if (disk->isRemote())
             {
-                has_zero_copy_disk = true;
+                need_remove_parts_in_order = true;
                 break;
             }
         }
-        need_remove_parts_in_order = has_zero_copy_disk;
     }
 
     std::vector<DataPartIteratorByStateAndInfo> parts_to_delete;
@@ -2394,18 +2394,28 @@ void MergeTreeData::clearPartsFromFilesystemImpl(const DataPartsVector & parts_t
     std::mutex part_names_mutex;
     auto runner = threadPoolCallbackRunner<void>(getPartsCleaningThreadPool().get(), "PartsCleaning");
 
-    /// This flag disallow straightforward concurrent parts removal. It's required only in case
-    /// when we have parts on zero-copy disk + at least some of them were mutated.
+    /** Straightforward concurrent parts removal can be applied for the case
+      * when we have parts on object storage disk + at least some of them were mutated
+      * (thus, can contain hardlinks to files in the previous parts).
+      * If we are deleting parts that contain hardlinks to the same file we may run into a race condition
+      * and delete only local metadata files, but not the blobs on object storage.
+      * Given that, we remove in parallel only "independent" parts that don't have such hardlinks.
+      * Note that it also may be applicable for the regular MergeTree, fixed only for Replicated.
+      *
+      * To avoid this we need to fix race conditions on parts and blob removal.
+      */
     bool remove_parts_in_order = false;
-    if (settings->allow_remote_fs_zero_copy_replication && dynamic_cast<StorageReplicatedMergeTree *>(this) != nullptr)
+    if (dynamic_cast<StorageReplicatedMergeTree *>(this) != nullptr)
     {
         remove_parts_in_order = std::any_of(
            parts_to_remove.begin(), parts_to_remove.end(),
-            [] (const auto & data_part) { return data_part->isStoredOnRemoteDiskWithZeroCopySupport() && data_part->info.getMutationVersion() > 0; }
+            [] (const auto & data_part)
+            {
+                return data_part->isStoredOnRemoteDisk() && data_part->info.getMutationVersion() > 0;
+            }
         );
     }
-
     if (!remove_parts_in_order)
     {
         /// NOTE: Under heavy system load you may get "Cannot schedule a task" from ThreadPool.
@@ -2441,7 +2451,7 @@ void MergeTreeData::clearPartsFromFilesystemImpl(const DataPartsVector & parts_t
 
         /// NOTE: Under heavy system load you may get "Cannot schedule a task" from ThreadPool.
         LOG_DEBUG(
-            log, "Removing {} parts from filesystem (concurrently): Parts: [{}]", parts_to_remove.size(), fmt::join(parts_to_remove, ", "));
+            log, "Removing {} parts from filesystem (concurrently in order): Parts: [{}]", parts_to_remove.size(), fmt::join(parts_to_remove, ", "));
 
         /// We have "zero copy replication" parts and we are going to remove them in parallel.
         /// The problem is that all parts in a mutation chain must be removed sequentially to avoid "key does not exits" issues.
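The sequential-removal constraint referenced in the comments above comes from hardlinks shared between a part and its mutations. The reference counting involved can be sketched with plain coreutils (the part directory names are hypothetical):

```bash
mkdir part_1_1_0 part_1_1_0_2
touch part_1_1_0/data.bin
ln part_1_1_0/data.bin part_1_1_0_2/data.bin   # a no-op mutation hardlinks the unchanged file
stat -c %h part_1_1_0/data.bin                 # prints 2: the mutated part still references it
rm part_1_1_0/data.bin                         # dropping the old part removes only one link
stat -c %h part_1_1_0_2/data.bin               # prints 1: only now may the blob itself be deleted
```

If two removers race on such a chain, each may observe a stale nonzero link count, delete only its local metadata, and leave the shared blob behind, which is why hardlink-connected parts are not removed in parallel.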
From eed1ecb6ba7ba4fdebd1c572881d064c66a0a102 Mon Sep 17 00:00:00 2001
From: vdimir
Date: Fri, 7 Jul 2023 15:01:23 +0000
Subject: [PATCH 27/38] Revert "Remove parts in order for object storage
 always"

This reverts commit c35294317dbff31b8ff8b48f6256162d6d5dc02e.

---
 src/Storages/MergeTree/MergeTreeData.cpp | 34 +++++++++---------------
 1 file changed, 12 insertions(+), 22 deletions(-)

diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp
index 0ef71895999..fa9bfd38a23 100644
--- a/src/Storages/MergeTree/MergeTreeData.cpp
+++ b/src/Storages/MergeTree/MergeTreeData.cpp
@@ -2137,20 +2137,20 @@ MergeTreeData::DataPartsVector MergeTreeData::grabOldParts(bool force)
 
     /// Please don't use "zero-copy replication" (a non-production feature) in production.
     /// It is not ready for production usage. Don't use it.
 
-    /// It also is disabled for any object storage, because it can lead to race conditions on blob removal.
-    /// (see comment at `clearPartsFromFilesystemImpl`).
-    bool need_remove_parts_in_order = false;
+    bool need_remove_parts_in_order = supportsReplication() && getSettings()->allow_remote_fs_zero_copy_replication;
 
-    if (supportsReplication())
+    if (need_remove_parts_in_order)
     {
+        bool has_zero_copy_disk = false;
         for (const auto & disk : getDisks())
         {
-            if (disk->isRemote())
+            if (disk->supportZeroCopyReplication())
             {
-                need_remove_parts_in_order = true;
+                has_zero_copy_disk = true;
                 break;
             }
         }
+        need_remove_parts_in_order = has_zero_copy_disk;
     }
 
     std::vector<DataPartIteratorByStateAndInfo> parts_to_delete;
@@ -2394,28 +2394,18 @@ void MergeTreeData::clearPartsFromFilesystemImpl(const DataPartsVector & parts_t
     std::mutex part_names_mutex;
     auto runner = threadPoolCallbackRunner<void>(getPartsCleaningThreadPool().get(), "PartsCleaning");
 
-    /** Straightforward concurrent parts removal can be applied for the case
-      * when we have parts on object storage disk + at least some of them were mutated
-      * (thus, can contain hardlinks to files in the previous parts).
-      * If we are deleting parts that contain hardlinks to the same file we may run into a race condition
-      * and delete only local metadata files, but not the blobs on object storage.
-      * Given that, we remove in parallel only "independent" parts that don't have such hardlinks.
-      * Note that it also may be applicable for the regular MergeTree, fixed only for Replicated.
-      *
-      * To avoid this we need to fix race conditions on parts and blob removal.
-      */
+    /// This flag disallow straightforward concurrent parts removal. It's required only in case
+    /// when we have parts on zero-copy disk + at least some of them were mutated.
     bool remove_parts_in_order = false;
-    if (dynamic_cast<StorageReplicatedMergeTree *>(this) != nullptr)
+    if (settings->allow_remote_fs_zero_copy_replication && dynamic_cast<StorageReplicatedMergeTree *>(this) != nullptr)
     {
         remove_parts_in_order = std::any_of(
             parts_to_remove.begin(), parts_to_remove.end(),
-            [] (const auto & data_part)
-            {
-                return data_part->isStoredOnRemoteDisk() && data_part->info.getMutationVersion() > 0;
-            }
+            [] (const auto & data_part) { return data_part->isStoredOnRemoteDiskWithZeroCopySupport() && data_part->info.getMutationVersion() > 0; }
         );
     }
+
     if (!remove_parts_in_order)
     {
         /// NOTE: Under heavy system load you may get "Cannot schedule a task" from ThreadPool.
@@ -2441,7 +2441,7 @@ void MergeTreeData::clearPartsFromFilesystemImpl(const DataPartsVector & parts_t
 
         /// NOTE: Under heavy system load you may get "Cannot schedule a task" from ThreadPool.
LOG_DEBUG( - log, "Removing {} parts from filesystem (concurrently in order): Parts: [{}]", parts_to_remove.size(), fmt::join(parts_to_remove, ", ")); + log, "Removing {} parts from filesystem (concurrently): Parts: [{}]", parts_to_remove.size(), fmt::join(parts_to_remove, ", ")); /// We have "zero copy replication" parts and we are going to remove them in parallel. /// The problem is that all parts in a mutation chain must be removed sequentially to avoid "key does not exits" issues. From 227e415d6d71ca49b486052513786c5f050a6279 Mon Sep 17 00:00:00 2001 From: vdimir Date: Fri, 7 Jul 2023 15:08:21 +0000 Subject: [PATCH 28/38] Check refcount in `RemoveManyObjectStorageOperation::finalize` instead of `execute` --- .../DiskObjectStorageTransaction.cpp | 39 ++++++++++++------- src/Disks/ObjectStorages/IMetadataStorage.h | 5 ++- .../MetadataStorageFromDisk.cpp | 7 +++- .../ObjectStorages/MetadataStorageFromDisk.h | 5 ++- ...taStorageFromDiskTransactionOperations.cpp | 5 +++ ...dataStorageFromDiskTransactionOperations.h | 12 ++++++ .../MetadataStorageFromPlainObjectStorage.cpp | 5 ++- .../MetadataStorageFromPlainObjectStorage.h | 5 ++- 8 files changed, 63 insertions(+), 20 deletions(-) diff --git a/src/Disks/ObjectStorages/DiskObjectStorageTransaction.cpp b/src/Disks/ObjectStorages/DiskObjectStorageTransaction.cpp index bd66ada492f..f3dbac445a5 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorageTransaction.cpp +++ b/src/Disks/ObjectStorages/DiskObjectStorageTransaction.cpp @@ -6,6 +6,8 @@ #include #include +#include + namespace DB { @@ -150,7 +152,15 @@ struct RemoveManyObjectStorageOperation final : public IDiskObjectStorageOperati RemoveBatchRequest remove_paths; bool keep_all_batch_data; NameSet file_names_remove_metadata_only; - StoredObjects objects_to_remove; + + struct ObjectsToRemove + { + StoredObjects objects; + UnlinkMetadataFileOperationOutcomePtr unlink_outcome; + }; + + std::vector objects_to_remove; + bool remove_from_cache = false; RemoveManyObjectStorageOperation( @@ -174,7 +184,6 @@ struct RemoveManyObjectStorageOperation final : public IDiskObjectStorageOperati { for (const auto & [path, if_exists] : remove_paths) { - if (!metadata_storage.exists(path)) { if (if_exists) @@ -188,14 +197,12 @@ struct RemoveManyObjectStorageOperation final : public IDiskObjectStorageOperati try { - uint32_t hardlink_count = metadata_storage.getHardlinkCount(path); - auto objects = metadata_storage.getStorageObjects(path); - - tx->unlinkMetadata(path); - - /// File is really redundant - if (hardlink_count == 0 && !keep_all_batch_data && !file_names_remove_metadata_only.contains(fs::path(path).filename())) - std::move(objects.begin(), objects.end(), std::back_inserter(objects_to_remove)); + auto unlink_outcome = tx->unlinkMetadata(path); + if (unlink_outcome && !keep_all_batch_data && !file_names_remove_metadata_only.contains(fs::path(path).filename())) + { + auto objects = metadata_storage.getStorageObjects(path); + objects_to_remove.emplace_back(ObjectsToRemove{std::move(objects), std::move(unlink_outcome)}); + } } catch (const Exception & e) { @@ -215,15 +222,21 @@ struct RemoveManyObjectStorageOperation final : public IDiskObjectStorageOperati void undo() override { - } void finalize() override { + StoredObjects remove_from_remote; + for (auto && [objects, unlink_outcome] : objects_to_remove) + { + if (unlink_outcome->num_hardlinks == 0) + std::move(objects.begin(), objects.end(), std::back_inserter(remove_from_remote)); + } + /// Read comment inside RemoveObjectStorageOperation class /// 
TL;DR Don't pay any attention to 404 status code - if (!objects_to_remove.empty()) - object_storage.removeObjectsIfExist(objects_to_remove); + if (!remove_from_remote.empty()) + object_storage.removeObjectsIfExist(remove_from_remote); } }; diff --git a/src/Disks/ObjectStorages/IMetadataStorage.h b/src/Disks/ObjectStorages/IMetadataStorage.h index 00150df9fa3..264c481ee08 100644 --- a/src/Disks/ObjectStorages/IMetadataStorage.h +++ b/src/Disks/ObjectStorages/IMetadataStorage.h @@ -22,6 +22,8 @@ namespace ErrorCodes } class IMetadataStorage; +struct UnlinkMetadataFileOperationOutcome; +using UnlinkMetadataFileOperationOutcomePtr = std::shared_ptr; /// Tries to provide some "transactions" interface, which allow /// to execute (commit) operations simultaneously. We don't provide @@ -127,9 +129,10 @@ public: /// Unlink metadata file and do something special if required /// By default just remove file (unlink file). - virtual void unlinkMetadata(const std::string & path) + virtual UnlinkMetadataFileOperationOutcomePtr unlinkMetadata(const std::string & path) { unlinkFile(path); + return nullptr; } virtual ~IMetadataTransaction() = default; diff --git a/src/Disks/ObjectStorages/MetadataStorageFromDisk.cpp b/src/Disks/ObjectStorages/MetadataStorageFromDisk.cpp index 9461a82845f..53428c2f6e1 100644 --- a/src/Disks/ObjectStorages/MetadataStorageFromDisk.cpp +++ b/src/Disks/ObjectStorages/MetadataStorageFromDisk.cpp @@ -340,9 +340,12 @@ void MetadataStorageFromDiskTransaction::addBlobToMetadata(const std::string & p addOperation(std::make_unique(path, blob_name, metadata_storage.object_storage_root_path, size_in_bytes, *metadata_storage.disk, metadata_storage)); } -void MetadataStorageFromDiskTransaction::unlinkMetadata(const std::string & path) +UnlinkMetadataFileOperationOutcomePtr MetadataStorageFromDiskTransaction::unlinkMetadata(const std::string & path) { - addOperation(std::make_unique(path, *metadata_storage.disk, metadata_storage)); + auto operation = std::make_unique(path, *metadata_storage.getDisk(), metadata_storage); + auto result = operation->outcome; + addOperation(std::move(operation)); + return result; } } diff --git a/src/Disks/ObjectStorages/MetadataStorageFromDisk.h b/src/Disks/ObjectStorages/MetadataStorageFromDisk.h index 5273f0b041e..b518f5e3622 100644 --- a/src/Disks/ObjectStorages/MetadataStorageFromDisk.h +++ b/src/Disks/ObjectStorages/MetadataStorageFromDisk.h @@ -11,6 +11,9 @@ namespace DB { +struct UnlinkMetadataFileOperationOutcome; +using UnlinkMetadataFileOperationOutcomePtr = std::shared_ptr; + /// Store metadata on a separate disk /// (used for object storages, like S3 and related). 
class MetadataStorageFromDisk final : public IMetadataStorage @@ -131,7 +134,7 @@ public: void replaceFile(const std::string & path_from, const std::string & path_to) override; - void unlinkMetadata(const std::string & path) override; + UnlinkMetadataFileOperationOutcomePtr unlinkMetadata(const std::string & path) override; }; diff --git a/src/Disks/ObjectStorages/MetadataStorageFromDiskTransactionOperations.cpp b/src/Disks/ObjectStorages/MetadataStorageFromDiskTransactionOperations.cpp index 7463622cb06..78e8764f8fc 100644 --- a/src/Disks/ObjectStorages/MetadataStorageFromDiskTransactionOperations.cpp +++ b/src/Disks/ObjectStorages/MetadataStorageFromDiskTransactionOperations.cpp @@ -319,6 +319,8 @@ void UnlinkMetadataFileOperation::execute(std::unique_lock & metada write_operation = std::make_unique(path, disk, metadata->serializeToString()); write_operation->execute(metadata_lock); } + outcome->num_hardlinks = ref_count; + unlink_operation = std::make_unique(path, disk); unlink_operation->execute(metadata_lock); } @@ -334,6 +336,9 @@ void UnlinkMetadataFileOperation::undo() if (write_operation) write_operation->undo(); + + /// Update outcome to reflect the fact that we have restored the file. + outcome->num_hardlinks++; } void SetReadonlyFileOperation::execute(std::unique_lock & metadata_lock) diff --git a/src/Disks/ObjectStorages/MetadataStorageFromDiskTransactionOperations.h b/src/Disks/ObjectStorages/MetadataStorageFromDiskTransactionOperations.h index d8e4892a0a5..4662ebc3423 100644 --- a/src/Disks/ObjectStorages/MetadataStorageFromDiskTransactionOperations.h +++ b/src/Disks/ObjectStorages/MetadataStorageFromDiskTransactionOperations.h @@ -3,6 +3,8 @@ #include #include +#include + namespace DB { class MetadataStorageFromDisk; @@ -242,9 +244,19 @@ private: std::unique_ptr write_operation; }; +/// Return the result of operation to the caller. +/// It is used in `IDiskObjectStorageOperation::finalize` after metadata transaction executed to make decision on blob removal. +struct UnlinkMetadataFileOperationOutcome +{ + UInt32 num_hardlinks = std::numeric_limits::max(); +}; + +using UnlinkMetadataFileOperationOutcomePtr = std::shared_ptr; struct UnlinkMetadataFileOperation final : public IMetadataOperation { + const UnlinkMetadataFileOperationOutcomePtr outcome = std::make_shared(); + UnlinkMetadataFileOperation( const std::string & path_, IDisk & disk_, diff --git a/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.cpp b/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.cpp index c119e9f3adc..3650c7eaac8 100644 --- a/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.cpp +++ b/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.cpp @@ -135,9 +135,10 @@ void MetadataStorageFromPlainObjectStorageTransaction::addBlobToMetadata( { /// Noop, local metadata files is only one file, it is the metadata file itself. } -void MetadataStorageFromPlainObjectStorageTransaction::unlinkMetadata(const std::string &) + +UnlinkMetadataFileOperationOutcomePtr MetadataStorageFromPlainObjectStorageTransaction::unlinkMetadata(const std::string &) { - /// Noop, no separate metadata. 
+ return nullptr; } } diff --git a/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.h b/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.h index fb5b6d0757c..bd068c1362f 100644 --- a/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.h +++ b/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.h @@ -9,6 +9,9 @@ namespace DB { +struct UnlinkMetadataFileOperationOutcome; +using UnlinkMetadataFileOperationOutcomePtr = std::shared_ptr; + /// Object storage is used as a filesystem, in a limited form: /// - no directory concept, files only /// - no stat/chmod/... @@ -104,7 +107,7 @@ public: void unlinkFile(const std::string & path) override; - void unlinkMetadata(const std::string & path) override; + UnlinkMetadataFileOperationOutcomePtr unlinkMetadata(const std::string & path) override; void commit() override { From 88911e1378900d6687e05f08c6cbe592b5d32001 Mon Sep 17 00:00:00 2001 From: vdimir Date: Fri, 7 Jul 2023 16:42:03 +0000 Subject: [PATCH 29/38] Check refcount in finalize for other RemoveObjectStorageOperations --- .../DiskObjectStorageTransaction.cpp | 50 +++++++++---------- 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/src/Disks/ObjectStorages/DiskObjectStorageTransaction.cpp b/src/Disks/ObjectStorages/DiskObjectStorageTransaction.cpp index f3dbac445a5..0ae577602b1 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorageTransaction.cpp +++ b/src/Disks/ObjectStorages/DiskObjectStorageTransaction.cpp @@ -65,11 +65,18 @@ struct PureMetadataObjectStorageOperation final : public IDiskObjectStorageOpera std::string getInfoForLog() const override { return fmt::format("PureMetadataObjectStorageOperation"); } }; + +struct ObjectsToRemove +{ + StoredObjects objects; + UnlinkMetadataFileOperationOutcomePtr unlink_outcome; +}; + struct RemoveObjectStorageOperation final : public IDiskObjectStorageOperation { std::string path; bool delete_metadata_only; - StoredObjects objects_to_remove; + ObjectsToRemove objects_to_remove; bool if_exists; bool remove_from_cache = false; @@ -105,15 +112,12 @@ struct RemoveObjectStorageOperation final : public IDiskObjectStorageOperation try { - uint32_t hardlink_count = metadata_storage.getHardlinkCount(path); auto objects = metadata_storage.getStorageObjects(path); - tx->unlinkMetadata(path); + auto unlink_outcome = tx->unlinkMetadata(path); - if (hardlink_count == 0) - { - objects_to_remove = std::move(objects); - } + if (unlink_outcome) + objects_to_remove = ObjectsToRemove{std::move(objects), std::move(unlink_outcome)}; } catch (const Exception & e) { @@ -142,8 +146,11 @@ struct RemoveObjectStorageOperation final : public IDiskObjectStorageOperation /// due to network error or similar. And when it will retry an operation it may receive /// a 404 HTTP code. We don't want to threat this code as a real error for deletion process /// (e.g. 
        /// (e.g. throwing some exceptions) and thus we just use method `removeObjectsIfExists`
-        if (!delete_metadata_only && !objects_to_remove.empty())
-            object_storage.removeObjectsIfExist(objects_to_remove);
+        if (!delete_metadata_only && !objects_to_remove.objects.empty()
+            && objects_to_remove.unlink_outcome->num_hardlinks == 0)
+        {
+            object_storage.removeObjectsIfExist(objects_to_remove.objects);
+        }
     }
 };

@@ -153,12 +160,6 @@ struct RemoveManyObjectStorageOperati
     bool keep_all_batch_data;
     NameSet file_names_remove_metadata_only;

-    struct ObjectsToRemove
-    {
-        StoredObjects objects;
-        UnlinkMetadataFileOperationOutcomePtr unlink_outcome;
-    };
-
     std::vector<ObjectsToRemove> objects_to_remove;
     bool remove_from_cache = false;

@@ -197,10 +198,10 @@ struct RemoveManyObjectStorageOperati

         try
         {
+            auto objects = metadata_storage.getStorageObjects(path);
             auto unlink_outcome = tx->unlinkMetadata(path);
             if (unlink_outcome && !keep_all_batch_data && !file_names_remove_metadata_only.contains(fs::path(path).filename()))
             {
-                auto objects = metadata_storage.getStorageObjects(path);
                 objects_to_remove.emplace_back(ObjectsToRemove{std::move(objects), std::move(unlink_outcome)});
             }
         }
@@ -244,10 +245,9 @@ struct RemoveManyObjectStorageOperati
 struct RemoveRecursiveObjectStorageOperation final : public IDiskObjectStorageOperation
 {
     std::string path;
-    std::unordered_map<std::string, StoredObjects> objects_to_remove;
+    std::unordered_map<std::string, ObjectsToRemove> objects_to_remove_by_path;
     bool keep_all_batch_data;
     NameSet file_names_remove_metadata_only;
-    StoredObjects objects_to_remove_from_cache;

     RemoveRecursiveObjectStorageOperation(
         IObjectStorage & object_storage_,
@@ -274,14 +274,11 @@ struct RemoveRecursiveObjectStorageOp
     {
         try
         {
-            uint32_t hardlink_count = metadata_storage.getHardlinkCount(path_to_remove);
             auto objects_paths = metadata_storage.getStorageObjects(path_to_remove);
-
-            tx->unlinkMetadata(path_to_remove);
-
-            if (hardlink_count == 0)
+            auto unlink_outcome = tx->unlinkMetadata(path_to_remove);
+            if (unlink_outcome)
             {
-                objects_to_remove[path_to_remove] = std::move(objects_paths);
+                objects_to_remove_by_path[path_to_remove] = ObjectsToRemove{std::move(objects_paths), std::move(unlink_outcome)};
             }
         }
         catch (const Exception & e)
@@ -331,11 +328,12 @@ struct RemoveRecursiveObjectStorageOp
         if (!keep_all_batch_data)
         {
             StoredObjects remove_from_remote;
-            for (auto && [local_path, remote_paths] : objects_to_remove)
+            for (auto && [local_path, objects_to_remove] : objects_to_remove_by_path)
             {
                 if (!file_names_remove_metadata_only.contains(fs::path(local_path).filename()))
                 {
-                    std::move(remote_paths.begin(), remote_paths.end(), std::back_inserter(remove_from_remote));
+                    if (objects_to_remove.unlink_outcome->num_hardlinks == 0)
+                        std::move(objects_to_remove.objects.begin(), objects_to_remove.objects.end(), std::back_inserter(remove_from_remote));
                 }
             }
             /// Read comment inside RemoveObjectStorageOperation class

From 790b438b7c19979d49bf35e97b518b4266d7c515 Mon Sep 17 00:00:00 2001
From: vdimir
Date: Sat, 8 Jul 2023 13:33:49 +0000
Subject: [PATCH 30/38] add test_delete_race_leftovers

---
 .../configs/config.d/storage_conf.xml | 19 +++-
 .../test_alter_moving_garbage/test.py | 91 ++++++++++++++++++-
 2 files changed, 104 insertions(+), 6 deletions(-)

diff --git a/tests/integration/test_alter_moving_garbage/configs/config.d/storage_conf.xml b/tests/integration/test_alter_moving_garbage/configs/config.d/storage_conf.xml
index 1450a459257..67c4cc2d489 100644
--- a/tests/integration/test_alter_moving_garbage/configs/config.d/storage_conf.xml
+++ b/tests/integration/test_alter_moving_garbage/configs/config.d/storage_conf.xml
@@ -1,12 +1,18 @@
 <clickhouse>
     <storage_configuration>
         <disks>
-            <s3>
+            <s31>
                 <type>s3</type>
                 <endpoint>http://minio1:9001/root/data/</endpoint>
                 <access_key_id>minio</access_key_id>
                 <secret_access_key>minio123</secret_access_key>
-            </s3>
+            </s31>
+            <s32>
+                <type>s3</type>
+                <endpoint>http://minio1:9001/root/data2/</endpoint>
+                <access_key_id>minio</access_key_id>
+                <secret_access_key>minio123</secret_access_key>
+            </s32>
         </disks>
         <policies>
             <two_disks>
@@ -15,10 +21,17 @@
                     <disk>default</disk>
                 </default>
                 <external>
-                    <disk>s3</disk>
+                    <disk>s31</disk>
                 </external>
             </volumes>
         </two_disks>
+        <one_disk>
+            <volumes>
+                <external>
+                    <disk>s32</disk>
+                </external>
+            </volumes>
+        </one_disk>
     </policies>
 </storage_configuration>
 </clickhouse>
diff --git a/tests/integration/test_alter_moving_garbage/test.py b/tests/integration/test_alter_moving_garbage/test.py
index 330df3ac490..af9fffbb74d 100644
--- a/tests/integration/test_alter_moving_garbage/test.py
+++ b/tests/integration/test_alter_moving_garbage/test.py
@@ -39,7 +39,7 @@ def cluster():
 def create_table(node, table_name, replicated, additional_settings):
     settings = {
         "storage_policy": "two_disks",
-        "old_parts_lifetime": 1,
+        "old_parts_lifetime": 0,
         "index_granularity": 512,
         "temporary_directories_lifetime": 0,
         "merge_tree_clear_old_temporary_directories_interval_seconds": 1,
@@ -73,9 +73,13 @@ def create_table(node, table_name, replicated, additional_settings):
     "allow_remote_fs_zero_copy_replication,replicated_engine",
     [(False, False), (False, True), (True, True)],
 )
-def test_create_table(
+def test_alter_moving(
     cluster, allow_remote_fs_zero_copy_replication, replicated_engine
 ):
+    """
+    Test that we correctly move parts during ALTER TABLE
+    """
+
     if replicated_engine:
         nodes = list(cluster.instances.values())
     else:
@@ -126,7 +130,7 @@ def test_alter_moving(
             partition = f"2021-01-{i:02d}"
             try:
                 random.choice(nodes).query(
-                    f"ALTER TABLE {table_name} MOVE PARTITION '{partition}' TO DISK 's3'",
+                    f"ALTER TABLE {table_name} MOVE PARTITION '{partition}' TO DISK 's31'",
                 )
             except QueryRuntimeException as e:
                 if "PART_IS_TEMPORARILY_LOCKED" in str(e):
@@ -153,3 +157,84 @@ def test_alter_moving(
     )

     assert data_digest == "1000\n"
+
+
+def test_delete_race_leftovers(cluster):
+    """
+    Test that we correctly delete outdated parts and do not leave any leftovers on s3
+    """
+
+    node = cluster.instances["node1"]
+
+    table_name = "test_delete_race_leftovers"
+    additional_settings = {
+        # use another disk not to interfere with other tests
+        "storage_policy": "one_disk",
+        # always remove parts in parallel
+        "concurrent_part_removal_threshold": 1,
+    }
+
+    create_table(
+        node, table_name, replicated=True, additional_settings=additional_settings
+    )
+
+    # Stop merges to have several small parts in active set
+    node.query(f"SYSTEM STOP MERGES {table_name}")
+
+    # Create several small parts in one partition
+    for i in range(1, 11):
+        node.query(
+            f"INSERT INTO {table_name} SELECT toDate('2021-01-01'), number as id, toString(sipHash64(number, {i})) FROM numbers(10_000)"
+        )
+    table_digest_query = f"SELECT count(), sum(sipHash64(id, data)) FROM {table_name}"
+    table_digest = node.query(table_digest_query)
+
+    # Execute several noop deletes to have parts with updated mutation id without changes in data
+    # New parts will have symlinks to old parts
+    node.query(f"SYSTEM START MERGES {table_name}")
+    for i in range(10):
+        node.query(f"DELETE FROM {table_name} WHERE data = ''")
+
+    # Make existing parts outdated
+    # Also we don't want to have a changing parts set,
+    # because it will be difficult to match objects on s3 and in remote_data_paths to check correctness
+    node.query(f"OPTIMIZE TABLE {table_name} FINAL")
+
+    inactive_parts_query = (
+        f"SELECT count() FROM system.parts "
+        f"WHERE not active AND table = '{table_name}' AND database = 'default'"
+    )
+
+    # Try to wait for deletion of outdated parts
+    # However, we do not want to wait too long
+    # If some parts are not deleted after several iterations, we will just continue
+    for i in range(20):
+        inactive_parts_count = int(node.query(inactive_parts_query).strip())
+        if inactive_parts_count == 0:
+            print(f"Inactive parts are deleted after {i} iterations")
+            break
+
+        print(f"Inactive parts count: {inactive_parts_count}")
+        time.sleep(5)
+
+    # Check that we correctly deleted all outdated parts and no leftovers on s3
+    known_remote_paths = set(
+        node.query(
+            f"SELECT remote_path FROM system.remote_data_paths WHERE disk_name = 's32'"
+        ).splitlines()
+    )
+
+    all_remote_paths = set(
+        obj.object_name
+        for obj in cluster.minio_client.list_objects(
+            cluster.minio_bucket, "data2/", recursive=True
+        )
+    )
+
+    # Some blobs can be deleted after we listed remote_data_paths
+    # It's alright, thus we check only that all remote paths are known
+    # (in other words, all remote paths are a subset of known paths)
+    assert all_remote_paths == {p for p in known_remote_paths if p in all_remote_paths}
+
+    # Check that we have all data
+    assert table_digest == node.query(table_digest_query)

From bd5f39351502d67b35ef5e02f166be2d31b7e6fe Mon Sep 17 00:00:00 2001
From: vdimir
Date: Tue, 11 Jul 2023 05:14:39 +0000
Subject: [PATCH 31/38] Always remove blobs in MetadataStorageFromPlainObjectStorageTransaction::unlinkMetadata

---
 .../ObjectStorages/MetadataStorageFromPlainObjectStorage.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.cpp b/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.cpp
index 3650c7eaac8..022ff86df50 100644
--- a/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.cpp
+++ b/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.cpp
@@ -138,7 +138,8 @@

 UnlinkMetadataFileOperationOutcomePtr MetadataStorageFromPlainObjectStorageTransaction::unlinkMetadata(const std::string &)
 {
-    return nullptr;
+    /// No hardlinks, so we will always remove the file.
+ return std::make_shared(UnlinkMetadataFileOperationOutcome{0}); } } From 062b1c464c2961672dc23cc85c6226431f1c44f3 Mon Sep 17 00:00:00 2001 From: Mike Kot Date: Tue, 11 Jul 2023 10:04:43 +0000 Subject: [PATCH 32/38] watch for certificate file updates in configreloader --- programs/keeper/Keeper.cpp | 315 ++++++++++++------------ programs/server/Server.cpp | 9 +- src/Access/UsersConfigAccessStorage.cpp | 2 +- src/Common/Config/ConfigReloader.cpp | 28 ++- src/Common/Config/ConfigReloader.h | 26 +- 5 files changed, 197 insertions(+), 183 deletions(-) diff --git a/programs/keeper/Keeper.cpp b/programs/keeper/Keeper.cpp index db35e30bd30..2a54ddb952f 100644 --- a/programs/keeper/Keeper.cpp +++ b/programs/keeper/Keeper.cpp @@ -1,28 +1,28 @@ #include "Keeper.h" -#include -#include -#include -#include -#include -#include #include -#include +#include +#include #include -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include #include +#include +#include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include @@ -30,13 +30,13 @@ #include #include -#include #include +#include #include "Core/Defines.h" #include "config.h" -#include "config_version.h" #include "config_tools.h" +#include "config_version.h" #if USE_SSL @@ -44,8 +44,8 @@ # include #endif -#include #include +#include #include @@ -70,9 +70,9 @@ int mainEntryClickHouseKeeper(int argc, char ** argv) // Weak symbols don't work correctly on Darwin // so we have a stub implementation to avoid linker errors -void collectCrashLog( - Int32, UInt64, const String &, const StackTrace &) -{} +void collectCrashLog(Int32, UInt64, const String &, const StackTrace &) +{ +} #endif @@ -87,7 +87,8 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -Poco::Net::SocketAddress Keeper::socketBindListen(Poco::Net::ServerSocket & socket, const std::string & host, UInt16 port, [[maybe_unused]] bool secure) const +Poco::Net::SocketAddress +Keeper::socketBindListen(Poco::Net::ServerSocket & socket, const std::string & host, UInt16 port, [[maybe_unused]] bool secure) const { auto address = makeSocketAddress(host, port, &logger()); socket.bind(address, /* reuseAddress = */ true, /* reusePort = */ config().getBool("listen_reuse_port", false)); @@ -113,7 +114,9 @@ void Keeper::createServer(const std::string & listen_host, const char * port_nam if (listen_try) { - LOG_WARNING(&logger(), "{}. If it is an IPv6 or IPv4 address and your host has disabled IPv6 or IPv4, then consider to " + LOG_WARNING( + &logger(), + "{}. If it is an IPv6 or IPv4 address and your host has disabled IPv6 or IPv4, then consider to " "specify not disabled IPv4 or IPv6 address to listen in element of configuration " "file. Example for disabled IPv6: 0.0.0.0 ." 
" Example for disabled IPv4: ::", @@ -137,12 +140,13 @@ int Keeper::run() if (config().hasOption("help")) { Poco::Util::HelpFormatter help_formatter(Keeper::options()); - auto header_str = fmt::format("{0} [OPTION] [-- [ARG]...]\n" + auto header_str = fmt::format( + "{0} [OPTION] [-- [ARG]...]\n" #if ENABLE_CLICKHOUSE_KEEPER_CLIENT - "{0} client [OPTION]\n" + "{0} client [OPTION]\n" #endif - "positional arguments can be used to rewrite config.xml properties, for example, --http_port=8010", - commandName()); + "positional arguments can be used to rewrite config.xml properties, for example, --http_port=8010", + commandName()); help_formatter.setHeader(header_str); help_formatter.format(std::cout); return 0; @@ -161,7 +165,9 @@ void Keeper::initialize(Poco::Util::Application & self) BaseDaemon::initialize(self); logger().information("starting up"); - LOG_INFO(&logger(), "OS Name = {}, OS Version = {}, OS Architecture = {}", + LOG_INFO( + &logger(), + "OS Name = {}, OS Version = {}, OS Architecture = {}", Poco::Environment::osName(), Poco::Environment::osVersion(), Poco::Environment::osArchitecture()); @@ -186,82 +192,64 @@ void Keeper::handleCustomArguments(const std::string & arg, [[maybe_unused]] con void Keeper::defineOptions(Poco::Util::OptionSet & options) { + options.addOption(Poco::Util::Option("help", "h", "show help and exit").required(false).repeatable(false).binding("help")); + options.addOption(Poco::Util::Option("version", "V", "show version and exit").required(false).repeatable(false).binding("version")); options.addOption( - Poco::Util::Option("help", "h", "show help and exit") + Poco::Util::Option( + "force-recovery", "force-recovery", "Force recovery mode allowing Keeper to overwrite cluster configuration without quorum") .required(false) .repeatable(false) - .binding("help")); - options.addOption( - Poco::Util::Option("version", "V", "show version and exit") - .required(false) - .repeatable(false) - .binding("version")); - options.addOption( - Poco::Util::Option("force-recovery", "force-recovery", "Force recovery mode allowing Keeper to overwrite cluster configuration without quorum") - .required(false) - .repeatable(false) - .noArgument() - .callback(Poco::Util::OptionCallback(this, &Keeper::handleCustomArguments))); + .noArgument() + .callback(Poco::Util::OptionCallback(this, &Keeper::handleCustomArguments))); BaseDaemon::defineOptions(options); } namespace { -struct KeeperHTTPContext : public IHTTPContext -{ - explicit KeeperHTTPContext(ContextPtr context_) - : context(std::move(context_)) - {} - - uint64_t getMaxHstsAge() const override + struct KeeperHTTPContext : public IHTTPContext { - return context->getConfigRef().getUInt64("keeper_server.hsts_max_age", 0); - } + explicit KeeperHTTPContext(ContextPtr context_) : context(std::move(context_)) { } - uint64_t getMaxUriSize() const override + uint64_t getMaxHstsAge() const override { return context->getConfigRef().getUInt64("keeper_server.hsts_max_age", 0); } + + uint64_t getMaxUriSize() const override { return context->getConfigRef().getUInt64("keeper_server.http_max_uri_size", 1048576); } + + uint64_t getMaxFields() const override { return context->getConfigRef().getUInt64("keeper_server.http_max_fields", 1000000); } + + uint64_t getMaxFieldNameSize() const override + { + return context->getConfigRef().getUInt64("keeper_server.http_max_field_name_size", 128 * 1024); + } + + uint64_t getMaxFieldValueSize() const override + { + return context->getConfigRef().getUInt64("keeper_server.http_max_field_value_size", 128 * 
1024); + } + + uint64_t getMaxChunkSize() const override + { + return context->getConfigRef().getUInt64("keeper_server.http_max_chunk_size", 100_GiB); + } + + Poco::Timespan getReceiveTimeout() const override + { + return {context->getConfigRef().getInt64("keeper_server.http_receive_timeout", DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC), 0}; + } + + Poco::Timespan getSendTimeout() const override + { + return {context->getConfigRef().getInt64("keeper_server.http_send_timeout", DBMS_DEFAULT_SEND_TIMEOUT_SEC), 0}; + } + + ContextPtr context; + }; + + HTTPContextPtr httpContext() { - return context->getConfigRef().getUInt64("keeper_server.http_max_uri_size", 1048576); + return std::make_shared(Context::getGlobalContextInstance()); } - uint64_t getMaxFields() const override - { - return context->getConfigRef().getUInt64("keeper_server.http_max_fields", 1000000); - } - - uint64_t getMaxFieldNameSize() const override - { - return context->getConfigRef().getUInt64("keeper_server.http_max_field_name_size", 128 * 1024); - } - - uint64_t getMaxFieldValueSize() const override - { - return context->getConfigRef().getUInt64("keeper_server.http_max_field_value_size", 128 * 1024); - } - - uint64_t getMaxChunkSize() const override - { - return context->getConfigRef().getUInt64("keeper_server.http_max_chunk_size", 100_GiB); - } - - Poco::Timespan getReceiveTimeout() const override - { - return {context->getConfigRef().getInt64("keeper_server.http_receive_timeout", DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC), 0}; - } - - Poco::Timespan getSendTimeout() const override - { - return {context->getConfigRef().getInt64("keeper_server.http_send_timeout", DBMS_DEFAULT_SEND_TIMEOUT_SEC), 0}; - } - - ContextPtr context; -}; - -HTTPContextPtr httpContext() -{ - return std::make_shared(Context::getGlobalContextInstance()); -} - } int Keeper::main(const std::vector & /*args*/) @@ -298,7 +286,7 @@ try std::filesystem::create_directories(path); /// Check that the process user id matches the owner of the data. 
- assertProcessUserMatchesDataOwner(path, [&](const std::string & message){ LOG_WARNING(log, fmt::runtime(message)); }); + assertProcessUserMatchesDataOwner(path, [&](const std::string & message) { LOG_WARNING(log, fmt::runtime(message)); }); DB::ServerUUID::load(path + "/uuid", log); @@ -307,8 +295,7 @@ try GlobalThreadPool::initialize( config().getUInt("max_thread_pool_size", 100), config().getUInt("max_thread_pool_free_size", 1000), - config().getUInt("thread_pool_queue_size", 10000) - ); + config().getUInt("thread_pool_queue_size", 10000)); static ServerErrorHandler error_handler; Poco::ErrorHandler::set(&error_handler); @@ -350,8 +337,7 @@ try for (const auto & server : *servers) metrics.emplace_back(ProtocolServerMetrics{server.getPortName(), server.currentThreads()}); return metrics; - } - ); + }); std::vector listen_hosts = DB::getMultipleValuesFromConfig(config(), "", "listen_host"); @@ -367,10 +353,7 @@ try global_context->initializeKeeperDispatcher(/* start_async = */ true); FourLetterCommandFactory::registerCommands(*global_context->getKeeperDispatcher()); - auto config_getter = [&] () -> const Poco::Util::AbstractConfiguration & - { - return global_context->getConfigRef(); - }; + auto config_getter = [&]() -> const Poco::Util::AbstractConfiguration & { return global_context->getConfigRef(); }; auto tcp_receive_timeout = config().getInt64("keeper_server.socket_receive_timeout_sec", DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC); auto tcp_send_timeout = config().getInt64("keeper_server.socket_send_timeout_sec", DBMS_DEFAULT_SEND_TIMEOUT_SEC); @@ -379,43 +362,55 @@ try { /// TCP Keeper const char * port_name = "keeper_server.tcp_port"; - createServer(listen_host, port_name, listen_try, [&](UInt16 port) - { - Poco::Net::ServerSocket socket; - auto address = socketBindListen(socket, listen_host, port); - socket.setReceiveTimeout(Poco::Timespan{tcp_receive_timeout, 0}); - socket.setSendTimeout(Poco::Timespan{tcp_send_timeout, 0}); - servers->emplace_back( - listen_host, - port_name, - "Keeper (tcp): " + address.toString(), - std::make_unique( - new KeeperTCPHandlerFactory( - config_getter, global_context->getKeeperDispatcher(), - tcp_receive_timeout, tcp_send_timeout, false), server_pool, socket)); - }); + createServer( + listen_host, + port_name, + listen_try, + [&](UInt16 port) + { + Poco::Net::ServerSocket socket; + auto address = socketBindListen(socket, listen_host, port); + socket.setReceiveTimeout(Poco::Timespan{tcp_receive_timeout, 0}); + socket.setSendTimeout(Poco::Timespan{tcp_send_timeout, 0}); + servers->emplace_back( + listen_host, + port_name, + "Keeper (tcp): " + address.toString(), + std::make_unique( + new KeeperTCPHandlerFactory( + config_getter, global_context->getKeeperDispatcher(), tcp_receive_timeout, tcp_send_timeout, false), + server_pool, + socket)); + }); const char * secure_port_name = "keeper_server.tcp_port_secure"; - createServer(listen_host, secure_port_name, listen_try, [&](UInt16 port) - { + createServer( + listen_host, + secure_port_name, + listen_try, + [&](UInt16 port) + { #if USE_SSL - Poco::Net::SecureServerSocket socket; - auto address = socketBindListen(socket, listen_host, port, /* secure = */ true); - socket.setReceiveTimeout(Poco::Timespan{tcp_receive_timeout, 0}); - socket.setSendTimeout(Poco::Timespan{tcp_send_timeout, 0}); - servers->emplace_back( - listen_host, - secure_port_name, - "Keeper with secure protocol (tcp_secure): " + address.toString(), - std::make_unique( - new KeeperTCPHandlerFactory( - config_getter, 
global_context->getKeeperDispatcher(), - tcp_receive_timeout, tcp_send_timeout, true), server_pool, socket)); + Poco::Net::SecureServerSocket socket; + auto address = socketBindListen(socket, listen_host, port, /* secure = */ true); + socket.setReceiveTimeout(Poco::Timespan{tcp_receive_timeout, 0}); + socket.setSendTimeout(Poco::Timespan{tcp_send_timeout, 0}); + servers->emplace_back( + listen_host, + secure_port_name, + "Keeper with secure protocol (tcp_secure): " + address.toString(), + std::make_unique( + new KeeperTCPHandlerFactory( + config_getter, global_context->getKeeperDispatcher(), tcp_receive_timeout, tcp_send_timeout, true), + server_pool, + socket)); #else - UNUSED(port); - throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "SSL support for TCP protocol is disabled because Poco library was built without NetSSL support."); + UNUSED(port); + throw Exception( + ErrorCodes::SUPPORT_IS_DISABLED, + "SSL support for TCP protocol is disabled because Poco library was built without NetSSL support."); #endif - }); + }); const auto & config = config_getter(); auto http_context = httpContext(); @@ -426,19 +421,27 @@ try /// Prometheus (if defined and not setup yet with http_port) port_name = "prometheus.port"; - createServer(listen_host, port_name, listen_try, [&, my_http_context = std::move(http_context)](UInt16 port) mutable - { - Poco::Net::ServerSocket socket; - auto address = socketBindListen(socket, listen_host, port); - socket.setReceiveTimeout(my_http_context->getReceiveTimeout()); - socket.setSendTimeout(my_http_context->getSendTimeout()); - servers->emplace_back( - listen_host, - port_name, - "Prometheus: http://" + address.toString(), - std::make_unique( - std::move(my_http_context), createPrometheusMainHandlerFactory(*this, config_getter(), async_metrics, "PrometheusHandler-factory"), server_pool, socket, http_params)); - }); + createServer( + listen_host, + port_name, + listen_try, + [&, my_http_context = std::move(http_context)](UInt16 port) mutable + { + Poco::Net::ServerSocket socket; + auto address = socketBindListen(socket, listen_host, port); + socket.setReceiveTimeout(my_http_context->getReceiveTimeout()); + socket.setSendTimeout(my_http_context->getSendTimeout()); + servers->emplace_back( + listen_host, + port_name, + "Prometheus: http://" + address.toString(), + std::make_unique( + std::move(my_http_context), + createPrometheusMainHandlerFactory(*this, config_getter(), async_metrics, "PrometheusHandler-factory"), + server_pool, + socket, + http_params)); + }); } for (auto & server : *servers) @@ -454,7 +457,7 @@ try /// ConfigReloader have to strict parameters which are redundant in our case auto main_config_reloader = std::make_unique( config_path, - include_from_path, + std::vector{{include_from_path}}, config().getString("path", ""), std::move(unused_cache), unused_event, @@ -463,7 +466,7 @@ try if (config->has("keeper_server")) global_context->updateKeeperConfiguration(*config); }, - /* already_loaded = */ false); /// Reload it right now (initial loading) + /* already_loaded = */ false); /// Reload it right now (initial loading) SCOPE_EXIT({ LOG_INFO(log, "Shutting down."); @@ -488,7 +491,10 @@ try current_connections = waitServersToFinish(*servers, servers_lock, config().getInt("shutdown_wait_unfinished", 5)); if (current_connections) - LOG_INFO(log, "Closed connections to Keeper. But {} remain. Probably some users cannot finish their connections after context shutdown.", current_connections); + LOG_INFO( + log, + "Closed connections to Keeper. But {} remain. 
Probably some users cannot finish their connections after context shutdown.", + current_connections); else LOG_INFO(log, "Closed connections to Keeper."); @@ -526,11 +532,10 @@ catch (...) void Keeper::logRevision() const { - Poco::Logger::root().information("Starting ClickHouse Keeper " + std::string{VERSION_STRING} - + "(revision : " + std::to_string(ClickHouseRevision::getVersionRevision()) - + ", git hash: " + (git_hash.empty() ? "" : git_hash) - + ", build id: " + (build_id.empty() ? "" : build_id) + ")" - + ", PID " + std::to_string(getpid())); + Poco::Logger::root().information( + "Starting ClickHouse Keeper " + std::string{VERSION_STRING} + "(revision : " + + std::to_string(ClickHouseRevision::getVersionRevision()) + ", git hash: " + (git_hash.empty() ? "" : git_hash) + + ", build id: " + (build_id.empty() ? "" : build_id) + ")" + ", PID " + std::to_string(getpid())); } diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 071f7d3177e..af9abc1024f 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -1100,9 +1100,16 @@ try SensitiveDataMasker::setInstance(std::make_unique(config(), "query_masking_rules")); } + const std::string cert_path = config().getString("openSSL.server.certificateFile", ""); + const std::string key_path = config().getString("openSSL.server.privateKeyFile", ""); + + std::vector extra_paths = {include_from_path}; + if (!cert_path.empty()) extra_paths.emplace_back(cert_path); + if (!key_path.empty()) extra_paths.emplace_back(key_path); + auto main_config_reloader = std::make_unique( config_path, - include_from_path, + extra_paths, config().getString("path", ""), std::move(main_config_zk_node_cache), main_config_zk_changed_event, diff --git a/src/Access/UsersConfigAccessStorage.cpp b/src/Access/UsersConfigAccessStorage.cpp index 15765045c97..bb7d9dfd4f7 100644 --- a/src/Access/UsersConfigAccessStorage.cpp +++ b/src/Access/UsersConfigAccessStorage.cpp @@ -807,7 +807,7 @@ void UsersConfigAccessStorage::load( config_reloader.reset(); config_reloader = std::make_unique( users_config_path, - include_from_path, + std::vector{{include_from_path}}, preprocessed_dir, zkutil::ZooKeeperNodeCache(get_zookeeper_function), std::make_shared(), diff --git a/src/Common/Config/ConfigReloader.cpp b/src/Common/Config/ConfigReloader.cpp index de7011b67bf..5ff2fcbf502 100644 --- a/src/Common/Config/ConfigReloader.cpp +++ b/src/Common/Config/ConfigReloader.cpp @@ -14,14 +14,15 @@ namespace DB { ConfigReloader::ConfigReloader( - const std::string & path_, - const std::string & include_from_path_, + std::string_view config_path_, + const std::vector& extra_paths_, const std::string & preprocessed_dir_, zkutil::ZooKeeperNodeCache && zk_node_cache_, const zkutil::EventPtr & zk_changed_event_, Updater && updater_, bool already_loaded) - : path(path_), include_from_path(include_from_path_) + : config_path(config_path_) + , extra_paths(extra_paths_) , preprocessed_dir(preprocessed_dir_) , zk_node_cache(std::move(zk_node_cache_)) , zk_changed_event(zk_changed_event_) @@ -98,10 +99,10 @@ void ConfigReloader::reloadIfNewer(bool force, bool throw_on_error, bool fallbac FilesChangesTracker new_files = getNewFileList(); if (force || need_reload_from_zk || new_files.isDifferOrNewerThan(files)) { - ConfigProcessor config_processor(path); + ConfigProcessor config_processor(config_path); ConfigProcessor::LoadedConfig loaded_config; - LOG_DEBUG(log, "Loading config '{}'", path); + LOG_DEBUG(log, "Loading config '{}'", config_path); try { @@ -118,7 +119,7 @@ 
void ConfigReloader::reloadIfNewer(bool force, bool throw_on_error, bool fallbac if (throw_on_error) throw; - tryLogCurrentException(log, "ZooKeeper error when loading config from '" + path + "'"); + tryLogCurrentException(log, "ZooKeeper error when loading config from '" + config_path + "'"); return; } catch (...) @@ -126,7 +127,7 @@ void ConfigReloader::reloadIfNewer(bool force, bool throw_on_error, bool fallbac if (throw_on_error) throw; - tryLogCurrentException(log, "Error loading config from '" + path + "'"); + tryLogCurrentException(log, "Error loading config from '" + config_path + "'"); return; } config_processor.savePreprocessedConfig(loaded_config, preprocessed_dir); @@ -142,7 +143,7 @@ void ConfigReloader::reloadIfNewer(bool force, bool throw_on_error, bool fallbac need_reload_from_zk = false; } - LOG_DEBUG(log, "Loaded config '{}', performing update on configuration", path); + LOG_DEBUG(log, "Loaded config '{}', performing update on configuration", config_path); try { @@ -152,11 +153,11 @@ void ConfigReloader::reloadIfNewer(bool force, bool throw_on_error, bool fallbac { if (throw_on_error) throw; - tryLogCurrentException(log, "Error updating configuration from '" + path + "' config."); + tryLogCurrentException(log, "Error updating configuration from '" + config_path + "' config."); return; } - LOG_DEBUG(log, "Loaded config '{}', performed update on configuration", path); + LOG_DEBUG(log, "Loaded config '{}', performed update on configuration", config_path); } } @@ -196,10 +197,11 @@ ConfigReloader::FilesChangesTracker ConfigReloader::getNewFileList() const { FilesChangesTracker file_list; - file_list.addIfExists(path); - file_list.addIfExists(include_from_path); + file_list.addIfExists(config_path); + for (const std::string& path : extra_paths) + file_list.addIfExists(path); - for (const auto & merge_path : ConfigProcessor::getConfigMergeFiles(path)) + for (const auto & merge_path : ConfigProcessor::getConfigMergeFiles(config_path)) file_list.addIfExists(merge_path); return file_list; diff --git a/src/Common/Config/ConfigReloader.h b/src/Common/Config/ConfigReloader.h index 982e21c91e2..c5a18524318 100644 --- a/src/Common/Config/ConfigReloader.h +++ b/src/Common/Config/ConfigReloader.h @@ -4,7 +4,8 @@ #include #include #include -#include +#include +#include #include #include #include @@ -22,23 +23,21 @@ class Context; /** Every two seconds checks configuration files for update. * If configuration is changed, then config will be reloaded by ConfigProcessor * and the reloaded config will be applied via Updater functor. - * It doesn't take into account changes of --config-file, and parameters. + * It doesn't take into account changes of --config-file and . */ class ConfigReloader { public: using Updater = std::function; - /** include_from_path is usually /etc/metrika.xml (i.e. 
value of tag) - */ ConfigReloader( - const std::string & path, - const std::string & include_from_path, - const std::string & preprocessed_dir, - zkutil::ZooKeeperNodeCache && zk_node_cache, - const zkutil::EventPtr & zk_changed_event, - Updater && updater, - bool already_loaded); + std::string_view path_, + const std::vector& extra_paths_, + const std::string & preprocessed_dir, + zkutil::ZooKeeperNodeCache && zk_node_cache, + const zkutil::EventPtr & zk_changed_event, + Updater && updater, + bool already_loaded); ~ConfigReloader(); @@ -73,8 +72,9 @@ private: Poco::Logger * log = &Poco::Logger::get("ConfigReloader"); - std::string path; - std::string include_from_path; + std::string config_path; + std::vector extra_paths; + std::string preprocessed_dir; FilesChangesTracker files; zkutil::ZooKeeperNodeCache zk_node_cache; From 23e3c57a6ad61074a3bb76f3e419faee9313f727 Mon Sep 17 00:00:00 2001 From: Mike Kot Date: Tue, 11 Jul 2023 10:13:30 +0000 Subject: [PATCH 33/38] reload certificates for Keeper --- programs/keeper/Keeper.cpp | 330 +++++++++++++++++++------------------ 1 file changed, 168 insertions(+), 162 deletions(-) diff --git a/programs/keeper/Keeper.cpp b/programs/keeper/Keeper.cpp index 2a54ddb952f..f4bb08676d2 100644 --- a/programs/keeper/Keeper.cpp +++ b/programs/keeper/Keeper.cpp @@ -1,28 +1,29 @@ #include "Keeper.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include -#include +#include #include +#include +#include +#include +#include +#include +#include +#include #include #include -#include -#include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include @@ -30,13 +31,13 @@ #include #include -#include #include +#include #include "Core/Defines.h" #include "config.h" -#include "config_tools.h" #include "config_version.h" +#include "config_tools.h" #if USE_SSL @@ -44,8 +45,8 @@ # include #endif -#include #include +#include #include @@ -70,9 +71,9 @@ int mainEntryClickHouseKeeper(int argc, char ** argv) // Weak symbols don't work correctly on Darwin // so we have a stub implementation to avoid linker errors -void collectCrashLog(Int32, UInt64, const String &, const StackTrace &) -{ -} +void collectCrashLog( + Int32, UInt64, const String &, const StackTrace &) +{} #endif @@ -87,8 +88,7 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -Poco::Net::SocketAddress -Keeper::socketBindListen(Poco::Net::ServerSocket & socket, const std::string & host, UInt16 port, [[maybe_unused]] bool secure) const +Poco::Net::SocketAddress Keeper::socketBindListen(Poco::Net::ServerSocket & socket, const std::string & host, UInt16 port, [[maybe_unused]] bool secure) const { auto address = makeSocketAddress(host, port, &logger()); socket.bind(address, /* reuseAddress = */ true, /* reusePort = */ config().getBool("listen_reuse_port", false)); @@ -114,9 +114,7 @@ void Keeper::createServer(const std::string & listen_host, const char * port_nam if (listen_try) { - LOG_WARNING( - &logger(), - "{}. If it is an IPv6 or IPv4 address and your host has disabled IPv6 or IPv4, then consider to " + LOG_WARNING(&logger(), "{}. If it is an IPv6 or IPv4 address and your host has disabled IPv6 or IPv4, then consider to " "specify not disabled IPv4 or IPv6 address to listen in element of configuration " "file. Example for disabled IPv6: 0.0.0.0 ." 
" Example for disabled IPv4: ::", @@ -140,13 +138,12 @@ int Keeper::run() if (config().hasOption("help")) { Poco::Util::HelpFormatter help_formatter(Keeper::options()); - auto header_str = fmt::format( - "{0} [OPTION] [-- [ARG]...]\n" + auto header_str = fmt::format("{0} [OPTION] [-- [ARG]...]\n" #if ENABLE_CLICKHOUSE_KEEPER_CLIENT - "{0} client [OPTION]\n" + "{0} client [OPTION]\n" #endif - "positional arguments can be used to rewrite config.xml properties, for example, --http_port=8010", - commandName()); + "positional arguments can be used to rewrite config.xml properties, for example, --http_port=8010", + commandName()); help_formatter.setHeader(header_str); help_formatter.format(std::cout); return 0; @@ -165,9 +162,7 @@ void Keeper::initialize(Poco::Util::Application & self) BaseDaemon::initialize(self); logger().information("starting up"); - LOG_INFO( - &logger(), - "OS Name = {}, OS Version = {}, OS Architecture = {}", + LOG_INFO(&logger(), "OS Name = {}, OS Version = {}, OS Architecture = {}", Poco::Environment::osName(), Poco::Environment::osVersion(), Poco::Environment::osArchitecture()); @@ -192,64 +187,82 @@ void Keeper::handleCustomArguments(const std::string & arg, [[maybe_unused]] con void Keeper::defineOptions(Poco::Util::OptionSet & options) { - options.addOption(Poco::Util::Option("help", "h", "show help and exit").required(false).repeatable(false).binding("help")); - options.addOption(Poco::Util::Option("version", "V", "show version and exit").required(false).repeatable(false).binding("version")); options.addOption( - Poco::Util::Option( - "force-recovery", "force-recovery", "Force recovery mode allowing Keeper to overwrite cluster configuration without quorum") + Poco::Util::Option("help", "h", "show help and exit") .required(false) .repeatable(false) - .noArgument() - .callback(Poco::Util::OptionCallback(this, &Keeper::handleCustomArguments))); + .binding("help")); + options.addOption( + Poco::Util::Option("version", "V", "show version and exit") + .required(false) + .repeatable(false) + .binding("version")); + options.addOption( + Poco::Util::Option("force-recovery", "force-recovery", "Force recovery mode allowing Keeper to overwrite cluster configuration without quorum") + .required(false) + .repeatable(false) + .noArgument() + .callback(Poco::Util::OptionCallback(this, &Keeper::handleCustomArguments))); BaseDaemon::defineOptions(options); } namespace { - struct KeeperHTTPContext : public IHTTPContext +struct KeeperHTTPContext : public IHTTPContext +{ + explicit KeeperHTTPContext(ContextPtr context_) + : context(std::move(context_)) + {} + + uint64_t getMaxHstsAge() const override { - explicit KeeperHTTPContext(ContextPtr context_) : context(std::move(context_)) { } - - uint64_t getMaxHstsAge() const override { return context->getConfigRef().getUInt64("keeper_server.hsts_max_age", 0); } - - uint64_t getMaxUriSize() const override { return context->getConfigRef().getUInt64("keeper_server.http_max_uri_size", 1048576); } - - uint64_t getMaxFields() const override { return context->getConfigRef().getUInt64("keeper_server.http_max_fields", 1000000); } - - uint64_t getMaxFieldNameSize() const override - { - return context->getConfigRef().getUInt64("keeper_server.http_max_field_name_size", 128 * 1024); - } - - uint64_t getMaxFieldValueSize() const override - { - return context->getConfigRef().getUInt64("keeper_server.http_max_field_value_size", 128 * 1024); - } - - uint64_t getMaxChunkSize() const override - { - return 
context->getConfigRef().getUInt64("keeper_server.http_max_chunk_size", 100_GiB); - } - - Poco::Timespan getReceiveTimeout() const override - { - return {context->getConfigRef().getInt64("keeper_server.http_receive_timeout", DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC), 0}; - } - - Poco::Timespan getSendTimeout() const override - { - return {context->getConfigRef().getInt64("keeper_server.http_send_timeout", DBMS_DEFAULT_SEND_TIMEOUT_SEC), 0}; - } - - ContextPtr context; - }; - - HTTPContextPtr httpContext() - { - return std::make_shared(Context::getGlobalContextInstance()); + return context->getConfigRef().getUInt64("keeper_server.hsts_max_age", 0); } + uint64_t getMaxUriSize() const override + { + return context->getConfigRef().getUInt64("keeper_server.http_max_uri_size", 1048576); + } + + uint64_t getMaxFields() const override + { + return context->getConfigRef().getUInt64("keeper_server.http_max_fields", 1000000); + } + + uint64_t getMaxFieldNameSize() const override + { + return context->getConfigRef().getUInt64("keeper_server.http_max_field_name_size", 128 * 1024); + } + + uint64_t getMaxFieldValueSize() const override + { + return context->getConfigRef().getUInt64("keeper_server.http_max_field_value_size", 128 * 1024); + } + + uint64_t getMaxChunkSize() const override + { + return context->getConfigRef().getUInt64("keeper_server.http_max_chunk_size", 100_GiB); + } + + Poco::Timespan getReceiveTimeout() const override + { + return {context->getConfigRef().getInt64("keeper_server.http_receive_timeout", DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC), 0}; + } + + Poco::Timespan getSendTimeout() const override + { + return {context->getConfigRef().getInt64("keeper_server.http_send_timeout", DBMS_DEFAULT_SEND_TIMEOUT_SEC), 0}; + } + + ContextPtr context; +}; + +HTTPContextPtr httpContext() +{ + return std::make_shared(Context::getGlobalContextInstance()); +} + } int Keeper::main(const std::vector & /*args*/) @@ -286,7 +299,7 @@ try std::filesystem::create_directories(path); /// Check that the process user id matches the owner of the data. 
- assertProcessUserMatchesDataOwner(path, [&](const std::string & message) { LOG_WARNING(log, fmt::runtime(message)); }); + assertProcessUserMatchesDataOwner(path, [&](const std::string & message){ LOG_WARNING(log, fmt::runtime(message)); }); DB::ServerUUID::load(path + "/uuid", log); @@ -295,7 +308,8 @@ try GlobalThreadPool::initialize( config().getUInt("max_thread_pool_size", 100), config().getUInt("max_thread_pool_free_size", 1000), - config().getUInt("thread_pool_queue_size", 10000)); + config().getUInt("thread_pool_queue_size", 10000) + ); static ServerErrorHandler error_handler; Poco::ErrorHandler::set(&error_handler); @@ -337,7 +351,8 @@ try for (const auto & server : *servers) metrics.emplace_back(ProtocolServerMetrics{server.getPortName(), server.currentThreads()}); return metrics; - }); + } + ); std::vector listen_hosts = DB::getMultipleValuesFromConfig(config(), "", "listen_host"); @@ -353,7 +368,10 @@ try global_context->initializeKeeperDispatcher(/* start_async = */ true); FourLetterCommandFactory::registerCommands(*global_context->getKeeperDispatcher()); - auto config_getter = [&]() -> const Poco::Util::AbstractConfiguration & { return global_context->getConfigRef(); }; + auto config_getter = [&] () -> const Poco::Util::AbstractConfiguration & + { + return global_context->getConfigRef(); + }; auto tcp_receive_timeout = config().getInt64("keeper_server.socket_receive_timeout_sec", DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC); auto tcp_send_timeout = config().getInt64("keeper_server.socket_send_timeout_sec", DBMS_DEFAULT_SEND_TIMEOUT_SEC); @@ -362,55 +380,43 @@ try { /// TCP Keeper const char * port_name = "keeper_server.tcp_port"; - createServer( - listen_host, - port_name, - listen_try, - [&](UInt16 port) - { - Poco::Net::ServerSocket socket; - auto address = socketBindListen(socket, listen_host, port); - socket.setReceiveTimeout(Poco::Timespan{tcp_receive_timeout, 0}); - socket.setSendTimeout(Poco::Timespan{tcp_send_timeout, 0}); - servers->emplace_back( - listen_host, - port_name, - "Keeper (tcp): " + address.toString(), - std::make_unique( - new KeeperTCPHandlerFactory( - config_getter, global_context->getKeeperDispatcher(), tcp_receive_timeout, tcp_send_timeout, false), - server_pool, - socket)); - }); + createServer(listen_host, port_name, listen_try, [&](UInt16 port) + { + Poco::Net::ServerSocket socket; + auto address = socketBindListen(socket, listen_host, port); + socket.setReceiveTimeout(Poco::Timespan{tcp_receive_timeout, 0}); + socket.setSendTimeout(Poco::Timespan{tcp_send_timeout, 0}); + servers->emplace_back( + listen_host, + port_name, + "Keeper (tcp): " + address.toString(), + std::make_unique( + new KeeperTCPHandlerFactory( + config_getter, global_context->getKeeperDispatcher(), + tcp_receive_timeout, tcp_send_timeout, false), server_pool, socket)); + }); const char * secure_port_name = "keeper_server.tcp_port_secure"; - createServer( - listen_host, - secure_port_name, - listen_try, - [&](UInt16 port) - { + createServer(listen_host, secure_port_name, listen_try, [&](UInt16 port) + { #if USE_SSL - Poco::Net::SecureServerSocket socket; - auto address = socketBindListen(socket, listen_host, port, /* secure = */ true); - socket.setReceiveTimeout(Poco::Timespan{tcp_receive_timeout, 0}); - socket.setSendTimeout(Poco::Timespan{tcp_send_timeout, 0}); - servers->emplace_back( - listen_host, - secure_port_name, - "Keeper with secure protocol (tcp_secure): " + address.toString(), - std::make_unique( - new KeeperTCPHandlerFactory( - config_getter, 
global_context->getKeeperDispatcher(), tcp_receive_timeout, tcp_send_timeout, true), - server_pool, - socket)); + Poco::Net::SecureServerSocket socket; + auto address = socketBindListen(socket, listen_host, port, /* secure = */ true); + socket.setReceiveTimeout(Poco::Timespan{tcp_receive_timeout, 0}); + socket.setSendTimeout(Poco::Timespan{tcp_send_timeout, 0}); + servers->emplace_back( + listen_host, + secure_port_name, + "Keeper with secure protocol (tcp_secure): " + address.toString(), + std::make_unique( + new KeeperTCPHandlerFactory( + config_getter, global_context->getKeeperDispatcher(), + tcp_receive_timeout, tcp_send_timeout, true), server_pool, socket)); #else - UNUSED(port); - throw Exception( - ErrorCodes::SUPPORT_IS_DISABLED, - "SSL support for TCP protocol is disabled because Poco library was built without NetSSL support."); + UNUSED(port); + throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "SSL support for TCP protocol is disabled because Poco library was built without NetSSL support."); #endif - }); + }); const auto & config = config_getter(); auto http_context = httpContext(); @@ -421,27 +427,19 @@ try /// Prometheus (if defined and not setup yet with http_port) port_name = "prometheus.port"; - createServer( - listen_host, - port_name, - listen_try, - [&, my_http_context = std::move(http_context)](UInt16 port) mutable - { - Poco::Net::ServerSocket socket; - auto address = socketBindListen(socket, listen_host, port); - socket.setReceiveTimeout(my_http_context->getReceiveTimeout()); - socket.setSendTimeout(my_http_context->getSendTimeout()); - servers->emplace_back( - listen_host, - port_name, - "Prometheus: http://" + address.toString(), - std::make_unique( - std::move(my_http_context), - createPrometheusMainHandlerFactory(*this, config_getter(), async_metrics, "PrometheusHandler-factory"), - server_pool, - socket, - http_params)); - }); + createServer(listen_host, port_name, listen_try, [&, my_http_context = std::move(http_context)](UInt16 port) mutable + { + Poco::Net::ServerSocket socket; + auto address = socketBindListen(socket, listen_host, port); + socket.setReceiveTimeout(my_http_context->getReceiveTimeout()); + socket.setSendTimeout(my_http_context->getSendTimeout()); + servers->emplace_back( + listen_host, + port_name, + "Prometheus: http://" + address.toString(), + std::make_unique( + std::move(my_http_context), createPrometheusMainHandlerFactory(*this, config_getter(), async_metrics, "PrometheusHandler-factory"), server_pool, socket, http_params)); + }); } for (auto & server : *servers) @@ -454,10 +452,18 @@ try zkutil::EventPtr unused_event = std::make_shared(); zkutil::ZooKeeperNodeCache unused_cache([] { return nullptr; }); + + const std::string cert_path = config().getString("openSSL.server.certificateFile", ""); + const std::string key_path = config().getString("openSSL.server.privateKeyFile", ""); + + std::vector extra_paths = {include_from_path}; + if (!cert_path.empty()) extra_paths.emplace_back(cert_path); + if (!key_path.empty()) extra_paths.emplace_back(key_path); + /// ConfigReloader have to strict parameters which are redundant in our case auto main_config_reloader = std::make_unique( config_path, - std::vector{{include_from_path}}, + extra_paths, config().getString("path", ""), std::move(unused_cache), unused_event, @@ -465,8 +471,10 @@ try { if (config->has("keeper_server")) global_context->updateKeeperConfiguration(*config); + + CertificateReloader::instance().tryLoad(*config); }, - /* already_loaded = */ false); /// Reload it right now (initial 
loading) + /* already_loaded = */ false); /// Reload it right now (initial loading) SCOPE_EXIT({ LOG_INFO(log, "Shutting down."); @@ -491,10 +499,7 @@ try current_connections = waitServersToFinish(*servers, servers_lock, config().getInt("shutdown_wait_unfinished", 5)); if (current_connections) - LOG_INFO( - log, - "Closed connections to Keeper. But {} remain. Probably some users cannot finish their connections after context shutdown.", - current_connections); + LOG_INFO(log, "Closed connections to Keeper. But {} remain. Probably some users cannot finish their connections after context shutdown.", current_connections); else LOG_INFO(log, "Closed connections to Keeper."); @@ -532,10 +537,11 @@ catch (...) void Keeper::logRevision() const { - Poco::Logger::root().information( - "Starting ClickHouse Keeper " + std::string{VERSION_STRING} + "(revision : " - + std::to_string(ClickHouseRevision::getVersionRevision()) + ", git hash: " + (git_hash.empty() ? "" : git_hash) - + ", build id: " + (build_id.empty() ? "" : build_id) + ")" + ", PID " + std::to_string(getpid())); + Poco::Logger::root().information("Starting ClickHouse Keeper " + std::string{VERSION_STRING} + + "(revision : " + std::to_string(ClickHouseRevision::getVersionRevision()) + + ", git hash: " + (git_hash.empty() ? "" : git_hash) + + ", build id: " + (build_id.empty() ? "" : build_id) + ")" + + ", PID " + std::to_string(getpid())); } From 471e5a056dc14251136372b3e22925df5a178f94 Mon Sep 17 00:00:00 2001 From: Mike Kot Date: Tue, 11 Jul 2023 10:40:55 +0000 Subject: [PATCH 34/38] fix --- programs/keeper/Keeper.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/programs/keeper/Keeper.cpp b/programs/keeper/Keeper.cpp index f4bb08676d2..4c0f08eb314 100644 --- a/programs/keeper/Keeper.cpp +++ b/programs/keeper/Keeper.cpp @@ -472,7 +472,9 @@ try if (config->has("keeper_server")) global_context->updateKeeperConfiguration(*config); +#if USE_SSL CertificateReloader::instance().tryLoad(*config); +#endif }, /* already_loaded = */ false); /// Reload it right now (initial loading) From 3e8906e5a231c714020d180bb40edfea6838d404 Mon Sep 17 00:00:00 2001 From: Mike Kot Date: Tue, 11 Jul 2023 12:23:10 +0000 Subject: [PATCH 35/38] fix build for non-ssl systems --- programs/keeper/Keeper.cpp | 2 +- programs/server/Server.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/programs/keeper/Keeper.cpp b/programs/keeper/Keeper.cpp index 4c0f08eb314..6034d63a016 100644 --- a/programs/keeper/Keeper.cpp +++ b/programs/keeper/Keeper.cpp @@ -21,7 +21,6 @@ #include #include #include -#include #include #include @@ -43,6 +42,7 @@ #if USE_SSL # include # include +# include #endif #include diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index af9abc1024f..c7a7ba71e83 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -88,7 +88,6 @@ #include #include #include -#include #include #include #include @@ -109,6 +108,7 @@ #if USE_SSL # include +# include #endif #if USE_GRPC From 49c1beb8705399dfb40ae5213ebca70ba31e3852 Mon Sep 17 00:00:00 2001 From: Alexey Gerasimchuck Date: Tue, 11 Jul 2023 22:25:30 +1000 Subject: [PATCH 36/38] Used timeout function instead of undefined clickhouse_client_loop_timeout in functional tests (#51923) * removed undefined clickhouse_client_loop_timeout * test fix and improvement * Changes after review iteration --- .../0_stateless/02242_delete_user_race.sh | 27 ++++++++++++------- .../0_stateless/02243_drop_user_grant_race.sh | 13 ++++----- 2 files changed, 24 insertions(+), 
 16 deletions(-)

diff --git a/tests/queries/0_stateless/02242_delete_user_race.sh b/tests/queries/0_stateless/02242_delete_user_race.sh
index f22b7796bd4..8f387333c33 100755
--- a/tests/queries/0_stateless/02242_delete_user_race.sh
+++ b/tests/queries/0_stateless/02242_delete_user_race.sh
@@ -22,18 +22,27 @@ $CLICKHOUSE_CLIENT -nm -q "

 function delete_user()
 {
-    $CLICKHOUSE_CLIENT -q "DROP USER IF EXISTS test_user_02242" ||:
+    while true; do
+        $CLICKHOUSE_CLIENT -q "DROP USER IF EXISTS test_user_02242" ||:
+        sleep 0.$RANDOM;
+    done
 }

 function create_and_login_user()
 {
-    $CLICKHOUSE_CLIENT -q "CREATE USER IF NOT EXISTS test_user_02242" ||:
-    $CLICKHOUSE_CLIENT -u "test_user_02242" -q "SELECT COUNT(*) FROM system.session_log WHERE user == 'test_user_02242'" > /dev/null ||:
+    while true; do
+        $CLICKHOUSE_CLIENT -q "CREATE USER IF NOT EXISTS test_user_02242" ||:
+        $CLICKHOUSE_CLIENT -u "test_user_02242" -q "SELECT COUNT(*) FROM system.session_log WHERE user == 'test_user_02242'" > /dev/null ||:
+        sleep 0.$RANDOM;
+    done
 }

 function set_role()
 {
-    $CLICKHOUSE_CLIENT -q "SET ROLE test_role_02242 TO test_user_02242" ||:
+    while true; do
+        $CLICKHOUSE_CLIENT -q "SET DEFAULT ROLE test_role_02242 TO test_user_02242" ||:
+        sleep 0.$RANDOM;
+    done
 }

 export -f delete_user
@@ -42,12 +51,10 @@ export -f set_role

 TIMEOUT=10

-for (( i = 0 ; i < 100; ++i ))
-do
-    clickhouse_client_loop_timeout $TIMEOUT create_and_login_user 2> /dev/null &
-    clickhouse_client_loop_timeout $TIMEOUT delete_user 2> /dev/null &
-    clickhouse_client_loop_timeout $TIMEOUT set_role 2> /dev/null &
-done
+
+timeout $TIMEOUT bash -c create_and_login_user 2> /dev/null &
+timeout $TIMEOUT bash -c delete_user 2> /dev/null &
+timeout $TIMEOUT bash -c set_role 2> /dev/null &

 wait
diff --git a/tests/queries/0_stateless/02243_drop_user_grant_race.sh b/tests/queries/0_stateless/02243_drop_user_grant_race.sh
index e36be96aa02..46ad776006e 100755
--- a/tests/queries/0_stateless/02243_drop_user_grant_race.sh
+++ b/tests/queries/0_stateless/02243_drop_user_grant_race.sh
@@ -19,17 +19,18 @@ $CLICKHOUSE_CLIENT -nm -q "

 function create_drop_grant()
 {
-    $CLICKHOUSE_CLIENT -q "CREATE USER IF NOT EXISTS test_user_02243 GRANTEES NONE" ||:
-    $CLICKHOUSE_CLIENT -q "GRANT ALL ON *.* TO test_user_02243 WITH GRANT OPTION" ||:
-    $CLICKHOUSE_CLIENT -q "DROP USER IF EXISTS test_user_02243" &
-    $CLICKHOUSE_CLIENT --user test_user_02243 -q "GRANT ALL ON *.* TO kek_02243" &
-    wait
+    while true; do
+        $CLICKHOUSE_CLIENT -q "CREATE USER IF NOT EXISTS test_user_02243 GRANTEES NONE" ||:
+        $CLICKHOUSE_CLIENT -q "GRANT ALL ON *.* TO test_user_02243 WITH GRANT OPTION" ||:
+        $CLICKHOUSE_CLIENT -q "DROP USER IF EXISTS test_user_02243" &
+        $CLICKHOUSE_CLIENT --user test_user_02243 -q "GRANT ALL ON *.* TO kek_02243" &
+    done
 }

 export -f create_drop_grant

 TIMEOUT=10
-clickhouse_client_loop_timeout $TIMEOUT create_drop_grant 2> /dev/null &
+timeout $TIMEOUT bash -c create_drop_grant 2> /dev/null &
 wait

 $CLICKHOUSE_CLIENT --user kek_02243 -q "SELECT * FROM test" 2>&1| grep -Fa "Exception: " | grep -Eo ACCESS_DENIED | uniq

From da3de04470e6305dc7d0e6a8713791b479b378d4 Mon Sep 17 00:00:00 2001
From: Mike Kot
Date: Tue, 11 Jul 2023 12:27:23 +0000
Subject: [PATCH 37/38] fix

---
 src/Common/Config/ConfigReloader.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/Common/Config/ConfigReloader.h b/src/Common/Config/ConfigReloader.h
index c5a18524318..2529c7a5236 100644
--- a/src/Common/Config/ConfigReloader.h
+++ b/src/Common/Config/ConfigReloader.h
@@ -4,8 +4,7 @@
 #include
 #include
 #include
-#include
-#include
+#include
 #include
 #include
 #include

From d0f81fb1cd10f45ad4fe9679ad9b55841b59b7ba Mon Sep 17 00:00:00 2001
From: Mike Kot
Date: Tue, 11 Jul 2023 13:33:07 +0000
Subject: [PATCH 38/38] fix

---
 programs/keeper/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/programs/keeper/CMakeLists.txt b/programs/keeper/CMakeLists.txt
index 18bdc8f317c..54c39f5709f 100644
--- a/programs/keeper/CMakeLists.txt
+++ b/programs/keeper/CMakeLists.txt
@@ -73,6 +73,7 @@ if (BUILD_STANDALONE_KEEPER)
     ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Server/TCPServer.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Server/NotFoundHandler.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Server/ProtocolServerAdapter.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Server/CertificateReloader.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Server/PrometheusRequestHandler.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Server/PrometheusMetricsWriter.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Server/waitServersToFinish.cpp
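
---

For reference, the core mechanism built up by patches 28-31 can be shown in one self-contained C++ sketch: `unlinkMetadata` reports how many hardlinks still reference the metadata after the unlink, and `finalize` only removes the remote blobs when that count is zero. Only the names `UnlinkMetadataFileOperationOutcome`, `UnlinkMetadataFileOperationOutcomePtr`, `ObjectsToRemove`, and the `num_hardlinks == 0` check are taken from the diffs above; the toy `MetadataStore`, its members, and `main` are illustrative scaffolding with simplified, assumed semantics, not ClickHouse API.

#include <cstdint>
#include <iostream>
#include <map>
#include <memory>
#include <string>
#include <utility>
#include <vector>

/// Illustrative stand-in for ClickHouse's StoredObjects (a list of blob keys).
using StoredObjects = std::vector<std::string>;

/// Mirrors UnlinkMetadataFileOperationOutcome from the diffs above:
/// the number of hardlinks still referencing the blobs after the unlink.
struct UnlinkMetadataFileOperationOutcome
{
    uint32_t num_hardlinks = UINT32_MAX;
};
using UnlinkMetadataFileOperationOutcomePtr = std::shared_ptr<UnlinkMetadataFileOperationOutcome>;

/// Toy metadata store: path -> (blob keys, count of *other* hardlinks).
struct MetadataStore
{
    std::map<std::string, std::pair<StoredObjects, uint32_t>> files;

    /// Like the transaction's unlinkMetadata: drop one link and report how many
    /// hardlinks remain, so the caller can decide whether the blobs may be removed.
    UnlinkMetadataFileOperationOutcomePtr unlinkMetadata(const std::string & path)
    {
        auto it = files.find(path);
        if (it == files.end())
            return nullptr;

        auto outcome = std::make_shared<UnlinkMetadataFileOperationOutcome>();
        outcome->num_hardlinks = it->second.second;
        if (it->second.second == 0)
            files.erase(it);        /// last link gone, metadata removed
        else
            --it->second.second;    /// another hardlink still owns the blobs
        return outcome;
    }
};

/// Mirrors the ObjectsToRemove struct introduced in PATCH 29.
struct ObjectsToRemove
{
    StoredObjects objects;
    UnlinkMetadataFileOperationOutcomePtr unlink_outcome;
};

int main()
{
    MetadataStore store;
    store.files["data/part_0.bin"] = {{"s3://bucket/abc"}, 1};  /// one extra hardlink

    /// execute(): capture the blob list, then unlink the metadata.
    auto objects = store.files["data/part_0.bin"].first;
    auto outcome = store.unlinkMetadata("data/part_0.bin");
    ObjectsToRemove to_remove{std::move(objects), std::move(outcome)};

    /// finalize(): remove blobs from object storage only when nothing links to them.
    if (to_remove.unlink_outcome && to_remove.unlink_outcome->num_hardlinks == 0)
        std::cout << "removing " << to_remove.objects.size() << " blob(s)\n";
    else
        std::cout << "keeping blobs, hardlinks left: "
                  << to_remove.unlink_outcome->num_hardlinks << '\n';
}

The design choice visible in the diffs follows the same shape: the outcome object is a shared_ptr created eagerly by the metadata operation, so execute() can record the hardlink count at unlink time, and the object-storage finalize(), which runs after the metadata transaction has committed, can read that count without re-querying metadata that may no longer exist.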