From 395e0d6756478fba784c2cc7e2575d590b9ac996 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Thu, 10 Jun 2021 13:57:19 +0000 Subject: [PATCH 1/2] done --- src/Processors/Formats/IInputFormat.h | 7 ++++++- .../Formats/Impl/CSVRowInputFormat.cpp | 6 +++--- .../Formats/Impl/CSVRowInputFormat.h | 6 +----- ...3_csvwithnames_subset_of_columns.reference | 2 ++ .../01903_csvwithnames_subset_of_columns.sh | 19 +++++++++++++++++++ 5 files changed, 31 insertions(+), 9 deletions(-) create mode 100644 tests/queries/0_stateless/01903_csvwithnames_subset_of_columns.reference create mode 100755 tests/queries/0_stateless/01903_csvwithnames_subset_of_columns.sh diff --git a/src/Processors/Formats/IInputFormat.h b/src/Processors/Formats/IInputFormat.h index 95910bf51e5..f8811962260 100644 --- a/src/Processors/Formats/IInputFormat.h +++ b/src/Processors/Formats/IInputFormat.h @@ -12,7 +12,7 @@ struct ColumnMapping { /// Non-atomic because there is strict `happens-before` between read and write access /// See InputFormatParallelParsing - bool is_set; + bool is_set{false}; /// Maps indexes of columns in the input file to indexes of table columns using OptionalIndexes = std::vector>; OptionalIndexes column_indexes_for_input_fields; @@ -22,6 +22,11 @@ struct ColumnMapping /// read the file header, and never changed afterwards. /// For other columns, it is updated on each read() call. std::vector read_columns; + + + /// Whether we have any columns that are not read from file at all, + /// and must be always initialized with defaults. 
+ bool have_always_default_columns{false}; }; using ColumnMappingPtr = std::shared_ptr; diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp index 4ccc0db4cfe..93c39995e34 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp @@ -193,7 +193,7 @@ void CSVRowInputFormat::readPrefix() { if (!read_column) { - have_always_default_columns = true; + column_mapping->have_always_default_columns = true; break; } } @@ -221,7 +221,7 @@ bool CSVRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & ext /// Track whether we have to fill any columns in this row with default /// values. If not, we return an empty column mask to the caller, so that /// it doesn't have to check it. - bool have_default_columns = have_always_default_columns; + bool have_default_columns = column_mapping->have_always_default_columns; ext.read_columns.assign(column_mapping->read_columns.size(), true); const auto delimiter = format_settings.csv.delimiter; @@ -416,7 +416,7 @@ void CSVRowInputFormat::resetParser() RowInputFormatWithDiagnosticInfo::resetParser(); column_mapping->column_indexes_for_input_fields.clear(); column_mapping->read_columns.clear(); - have_always_default_columns = false; + column_mapping->have_always_default_columns = false; } diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.h b/src/Processors/Formats/Impl/CSVRowInputFormat.h index 230acc51268..b6075745b39 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.h +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.h @@ -31,17 +31,13 @@ public: void resetParser() override; private: + /// These fields are computed in the constructor. 
bool with_names; const FormatSettings format_settings; DataTypes data_types; - using IndexesMap = std::unordered_map; IndexesMap column_indexes_by_names; - /// Whether we have any columns that are not read from file at all, - /// and must be always initialized with defaults. - bool have_always_default_columns = false; - void addInputColumn(const String & column_name); void setupAllColumnsByTableSchema(); diff --git a/tests/queries/0_stateless/01903_csvwithnames_subset_of_columns.reference b/tests/queries/0_stateless/01903_csvwithnames_subset_of_columns.reference new file mode 100644 index 00000000000..0d28df3186f --- /dev/null +++ b/tests/queries/0_stateless/01903_csvwithnames_subset_of_columns.reference @@ -0,0 +1,2 @@ +1000000 +2000000 diff --git a/tests/queries/0_stateless/01903_csvwithnames_subset_of_columns.sh b/tests/queries/0_stateless/01903_csvwithnames_subset_of_columns.sh new file mode 100755 index 00000000000..7ad7a4b67bd --- /dev/null +++ b/tests/queries/0_stateless/01903_csvwithnames_subset_of_columns.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS test_01903" + +$CLICKHOUSE_CLIENT -q "CREATE TABLE test_01903 (col0 Date, col1 Nullable(UInt8)) ENGINE MergeTree() PARTITION BY toYYYYMM(col0) ORDER BY col0;" + +(echo col0,col1; for _ in `seq 1 1000000`; do echo '2021-05-05',1; done) | $CLICKHOUSE_CLIENT -q "INSERT INTO test_01903 FORMAT CSVWithNames" + +$CLICKHOUSE_CLIENT -q "SELECT count() FROM test_01903" + +(echo col0; for _ in `seq 1 1000000`; do echo '2021-05-05'; done) | $CLICKHOUSE_CLIENT -q "INSERT INTO test_01903 (col0) FORMAT CSVWithNames" + +$CLICKHOUSE_CLIENT -q "SELECT count() FROM test_01903" + +$CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS test_01903" From a0d7e97732db48c6fad21bf7eb00aad6bbc44629 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Thu, 10 Jun 2021 15:25:38 +0000 Subject: [PATCH 2/2] better --- .../01903_csvwithnames_subset_of_columns.reference | 1 + .../0_stateless/01903_csvwithnames_subset_of_columns.sh | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/tests/queries/0_stateless/01903_csvwithnames_subset_of_columns.reference b/tests/queries/0_stateless/01903_csvwithnames_subset_of_columns.reference index 0d28df3186f..1e008a436cc 100644 --- a/tests/queries/0_stateless/01903_csvwithnames_subset_of_columns.reference +++ b/tests/queries/0_stateless/01903_csvwithnames_subset_of_columns.reference @@ -1,2 +1,3 @@ 1000000 2000000 +3000000 diff --git a/tests/queries/0_stateless/01903_csvwithnames_subset_of_columns.sh b/tests/queries/0_stateless/01903_csvwithnames_subset_of_columns.sh index 7ad7a4b67bd..17d2c4a47d5 100755 --- a/tests/queries/0_stateless/01903_csvwithnames_subset_of_columns.sh +++ b/tests/queries/0_stateless/01903_csvwithnames_subset_of_columns.sh @@ -16,4 +16,8 @@ $CLICKHOUSE_CLIENT -q "SELECT count() FROM test_01903" $CLICKHOUSE_CLIENT -q "SELECT count() FROM test_01903" +(echo col0; for _ in `seq 1 1000000`; do echo '2021-05-05'; done) | $CLICKHOUSE_CLIENT -q "INSERT INTO test_01903 (col0) 
FORMAT TSVWithNames" + +$CLICKHOUSE_CLIENT -q "SELECT count() FROM test_01903" + $CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS test_01903"