From 9820beae68642144b981d0ca17d3d82bbb1ed3d8 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Thu, 19 Jan 2023 16:11:13 +0100 Subject: [PATCH] Apply suggestions from code review Co-authored-by: Antonio Andelic --- docs/en/interfaces/schema-inference.md | 10 +++++----- src/Formats/EscapingRuleUtils.cpp | 2 +- src/Processors/Formats/ISchemaReader.h | 2 +- src/Processors/Formats/Impl/CSVRowInputFormat.cpp | 8 ++++---- .../Formats/Impl/CustomSeparatedRowInputFormat.h | 2 +- .../Formats/Impl/TabSeparatedRowInputFormat.cpp | 2 +- .../Formats/RowInputFormatWithNamesAndTypes.cpp | 6 +++--- 7 files changed, 16 insertions(+), 16 deletions(-) diff --git a/docs/en/interfaces/schema-inference.md b/docs/en/interfaces/schema-inference.md index a8e5a8b3e66..45d284b7682 100644 --- a/docs/en/interfaces/schema-inference.md +++ b/docs/en/interfaces/schema-inference.md @@ -558,7 +558,7 @@ and if the value is not a number, ClickHouse treats it as a string. If you don't want ClickHouse to try to determine complex types using some parsers and heuristics, you can disable setting `input_format_csv_use_best_effort_in_schema_inference` and ClickHouse will treat all columns as Strings. -If setting `input_format_csv_detect_header` is enabled, ClickHouse will try to detect a header with column names (and maybe types) while schema inference. This setting is enabled by default. +If setting `input_format_csv_detect_header` is enabled, ClickHouse will try to detect the header with column names (and maybe types) while inferring schema. This setting is enabled by default. **Examples:** @@ -708,7 +708,7 @@ $$) └────────┴───────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘ ``` -Not that header can be detected only if there is at least one column with non-String type. If all columns have String type, header is not detected: +Note that the header can be detected only if there is at least one column with a non-String type. If all columns have String type, the header is not detected: ```sql SELECT * FROM format(CSV, @@ -734,7 +734,7 @@ the recursive parser to determine the most appropriate type. If the type cannot If you don't want ClickHouse to try to determine complex types using some parsers and heuristics, you can disable setting `input_format_tsv_use_best_effort_in_schema_inference` and ClickHouse will treat all columns as Strings. -If setting `input_format_tsv_detect_header` is enabled, ClickHouse will try to detect a header with column names (and maybe types) while schema inference. This setting is enabled by default. +If setting `input_format_tsv_detect_header` is enabled, ClickHouse will try to detect the header with column names (and maybe types) while inferring schema. This setting is enabled by default. **Examples:** @@ -894,7 +894,7 @@ $$) └────────┴───────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘ ``` -Not that header can be detected only if there is at least one column with non-String type. If all columns have String type, header is not detected: +Note that the header can be detected only if there is at least one column with a non-String type. If all columns have String type, the header is not detected: ```sql SELECT * FROM format(TSV, @@ -1024,7 +1024,7 @@ DESC format(TSV, '[1,2,3] 42.42 Hello World!') In CustomSeparated format ClickHouse first extracts all column values from the row according to specified delimiters and then tries to infer the data type for each value according to escaping rule. -If setting `input_format_custom_detect_header` is enabled, ClickHouse will try to detect a header with column names (and maybe types) while schema inference. This setting is enabled by default. +If setting `input_format_custom_detect_header` is enabled, ClickHouse will try to detect the header with column names (and maybe types) while inferring schema. This setting is enabled by default. **Example** diff --git a/src/Formats/EscapingRuleUtils.cpp b/src/Formats/EscapingRuleUtils.cpp index 96e8851c782..e1f6929887c 100644 --- a/src/Formats/EscapingRuleUtils.cpp +++ b/src/Formats/EscapingRuleUtils.cpp @@ -310,7 +310,7 @@ DataTypePtr tryInferDataTypeByEscapingRule(const String & field, const FormatSet return type; } - /// Case when CSV value is not in quotes. Check if it's a number or date/datetime, and if not, determine it's as a string. + /// Case when CSV value is not in quotes. Check if it's a number or date/datetime, and if not, determine it as a string. if (auto number_type = tryInferNumberFromString(field, format_settings)) return number_type; diff --git a/src/Processors/Formats/ISchemaReader.h b/src/Processors/Formats/ISchemaReader.h index 285bb02fa42..edc5c6068c3 100644 --- a/src/Processors/Formats/ISchemaReader.h +++ b/src/Processors/Formats/ISchemaReader.h @@ -190,7 +190,7 @@ void chooseResultColumnTypes( throw Exception(ErrorCodes::INCORRECT_DATA, "Rows have different amount of values"); if (types.size() != column_names.size()) - throw Exception(ErrorCodes::INCORRECT_DATA,"The number of column names {} differs with the number of types {}", column_names.size(), types.size()); + throw Exception(ErrorCodes::INCORRECT_DATA, "The number of column names {} differs from the number of types {}", column_names.size(), types.size()); for (size_t i = 0; i != types.size(); ++i) chooseResultColumnType(schema_reader, types[i], new_types[i], default_type, column_names[i], row); diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp index df55f03a15c..4465f722478 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp @@ -26,8 +26,8 @@ namespace { void checkBadDelimiter(char delimiter) { - const String bad_delimiters = " \t\"'.UL"; - if (bad_delimiters.find(delimiter) != String::npos) + constexpr std::string_view bad_delimiters = " \t\"'.UL"; + if (bad_delimiters.find(delimiter) != std::string_view::npos) throw Exception( String("CSV format may not work correctly with delimiter '") + delimiter + "'. Try use CustomSeparated format instead.", @@ -343,7 +343,7 @@ std::pair, DataTypes> CSVSchemaReader::readRowAndGetFieldsAn DataTypes CSVSchemaReader::readRowAndGetDataTypesImpl() { - return readRowAndGetFieldsAndDataTypes().second; + return std::move(readRowAndGetFieldsAndDataTypes().second); } @@ -435,7 +435,7 @@ void registerFileSegmentationEngineCSV(FormatFactory & factory) { auto register_func = [&](const String & format_name, bool, bool) { - size_t min_rows = 3; /// Make it 3 for header auto detection (first 3 rows must be always in the same segment). + static constexpr size_t min_rows = 3; /// Make it 3 for header auto detection (first 3 rows must be always in the same segment). factory.registerFileSegmentationEngine(format_name, [min_rows](ReadBuffer & in, DB::Memory<> & memory, size_t min_bytes, size_t max_rows) { return fileSegmentationEngineCSVImpl(in, memory, min_bytes, min_rows, max_rows); diff --git a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.h b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.h index a127ed53403..26ee32be370 100644 --- a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.h +++ b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.h @@ -83,7 +83,7 @@ public: void setReadBuffer(ReadBuffer & in_) override; private: - enum class ReadFieldMode + enum class ReadFieldMode : uint8_t { AS_STRING, AS_FIELD, diff --git a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp index 11739645847..931fc0d6d06 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp @@ -422,7 +422,7 @@ void registerFileSegmentationEngineTabSeparated(FormatFactory & factory) { auto register_func = [&](const String & format_name, bool, bool) { - size_t min_rows = 3; /// Make it 3 for header auto detection (first 3 rows must be always in the same segment). + static constexpr size_t min_rows = 3; /// Make it 3 for header auto detection (first 3 rows must be always in the same segment). factory.registerFileSegmentationEngine(format_name, [is_raw, min_rows](ReadBuffer & in, DB::Memory<> & memory, size_t min_bytes, size_t max_rows) { return fileSegmentationEngineTabSeparatedImpl(in, memory, is_raw, min_bytes, min_rows, max_rows); diff --git a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp index 5337b4cad4d..618375c3341 100644 --- a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp +++ b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp @@ -160,7 +160,7 @@ void RowInputFormatWithNamesAndTypes::tryDetectHeader(std::vector & colu } /// First row is a header with column names. - column_names_out = first_row_values; + column_names_out = std::move(first_row_values); peekable_buf->dropCheckpoint(); is_header_detected = true; @@ -187,7 +187,7 @@ void RowInputFormatWithNamesAndTypes::tryDetectHeader(std::vector & colu } /// The second row is a header with type names. - type_names_out = second_row_values; + type_names_out = std::move(second_row_values); peekable_buf->dropCheckpoint(); } @@ -461,7 +461,7 @@ void FormatWithNamesAndTypesSchemaReader::tryDetectHeader(std::vector & /// with all String elements can be real data and we cannot use them as a header. if (checkIfAllTypesAreString(data_types)) { - buffered_types = data_types; + buffered_types = std::move(data_types); return; }