From ca4088b400d28b39165588c5596b80ecc555e744 Mon Sep 17 00:00:00 2001 From: Pavel Kruglov Date: Thu, 5 Nov 2020 16:53:56 +0300 Subject: [PATCH] Try to treat input values as enum id in tsv/csv --- src/DataTypes/DataTypeEnum.cpp | 6 ++--- src/DataTypes/DataTypeEnum.h | 23 +++++++++++++++---- src/IO/ReadHelpers.cpp | 8 +++++++ src/IO/ReadHelpers.h | 2 ++ ...558_enum_as_num_in_tsv_csv_input.reference | 10 ++++++++ .../01558_enum_as_num_in_tsv_csv_input.sql | 23 +++++++++++++++++++ 6 files changed, 64 insertions(+), 8 deletions(-) create mode 100644 tests/queries/0_stateless/01558_enum_as_num_in_tsv_csv_input.reference create mode 100644 tests/queries/0_stateless/01558_enum_as_num_in_tsv_csv_input.sql diff --git a/src/DataTypes/DataTypeEnum.cpp b/src/DataTypes/DataTypeEnum.cpp index ce61794facd..53b309e1db7 100644 --- a/src/DataTypes/DataTypeEnum.cpp +++ b/src/DataTypes/DataTypeEnum.cpp @@ -155,7 +155,7 @@ void DataTypeEnum::deserializeTextEscaped(IColumn & column, ReadBuffer & i /// NOTE It would be nice to do without creating a temporary object - at least extract std::string out. 
std::string field_name; readEscapedString(field_name, istr); - assert_cast(column).getData().push_back(getValue(StringRef(field_name))); + assert_cast(column).getData().push_back(getValue(StringRef(field_name), true)); } } @@ -182,7 +182,7 @@ void DataTypeEnum::deserializeWholeText(IColumn & column, ReadBuffer & ist { std::string field_name; readString(field_name, istr); - assert_cast(column).getData().push_back(getValue(StringRef(field_name))); + assert_cast(column).getData().push_back(getValue(StringRef(field_name), true)); } } @@ -226,7 +226,7 @@ void DataTypeEnum::deserializeTextCSV(IColumn & column, ReadBuffer & istr, { std::string field_name; readCSVString(field_name, istr, settings.csv); - assert_cast(column).getData().push_back(getValue(StringRef(field_name))); + assert_cast(column).getData().push_back(getValue(StringRef(field_name), true)); } } diff --git a/src/DataTypes/DataTypeEnum.h b/src/DataTypes/DataTypeEnum.h index a66b36c6a8d..c6850e022d3 100644 --- a/src/DataTypes/DataTypeEnum.h +++ b/src/DataTypes/DataTypeEnum.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -80,13 +81,25 @@ public: return findByValue(value)->second; } - FieldType getValue(StringRef field_name) const + FieldType getValue(StringRef field_name, bool try_treat_as_id=false) const { - const auto it = name_to_value_map.find(field_name); - if (!it) + const auto value_it = name_to_value_map.find(field_name); + if (!value_it) + { + /// It is used in CSV and TSV input formats. If we fail to find given string in + /// enum names and this string is number, we will try to treat it as enum id. 
+ if (try_treat_as_id && isStringNumber(field_name)) + { + FieldType x; + ReadBufferFromMemory tmp_buf(field_name.data, field_name.size); + readText(x, tmp_buf); + const auto name_it = value_to_name_map.find(x); + if (name_it != value_to_name_map.end()) + return x; + } throw Exception{"Unknown element '" + field_name.toString() + "' for type " + getName(), ErrorCodes::BAD_ARGUMENTS}; - - return it->getMapped(); + } + return value_it->getMapped(); } FieldType readValue(ReadBuffer & istr) const diff --git a/src/IO/ReadHelpers.cpp b/src/IO/ReadHelpers.cpp index bf41de3959a..73b4738b078 100644 --- a/src/IO/ReadHelpers.cpp +++ b/src/IO/ReadHelpers.cpp @@ -1111,4 +1111,12 @@ bool loadAtPosition(ReadBuffer & in, DB::Memory<> & memory, char * & current) return loaded_more; } +bool isStringNumber(StringRef str) +{ + for (size_t i = 0; i != str.size; ++i) + if (!isdigit(str.data[i])) + return false; + return true; +} + } diff --git a/src/IO/ReadHelpers.h b/src/IO/ReadHelpers.h index 9ff1858c723..5cb4ec680c7 100644 --- a/src/IO/ReadHelpers.h +++ b/src/IO/ReadHelpers.h @@ -1228,4 +1228,6 @@ void saveUpToPosition(ReadBuffer & in, Memory<> & memory, char * current); */ bool loadAtPosition(ReadBuffer & in, Memory<> & memory, char * & current); +bool isStringNumber(StringRef str); + } diff --git a/tests/queries/0_stateless/01558_enum_as_num_in_tsv_csv_input.reference b/tests/queries/0_stateless/01558_enum_as_num_in_tsv_csv_input.reference new file mode 100644 index 00000000000..103709cf498 --- /dev/null +++ b/tests/queries/0_stateless/01558_enum_as_num_in_tsv_csv_input.reference @@ -0,0 +1,10 @@ +1 a +2 3 +3 3 +4 a +5 b +6 a +7 3 +8 3 +9 a +10 b diff --git a/tests/queries/0_stateless/01558_enum_as_num_in_tsv_csv_input.sql b/tests/queries/0_stateless/01558_enum_as_num_in_tsv_csv_input.sql new file mode 100644 index 00000000000..6a0f2a97b4f --- /dev/null +++ b/tests/queries/0_stateless/01558_enum_as_num_in_tsv_csv_input.sql @@ -0,0 +1,23 @@ +DROP TABLE IF EXISTS enum_as_num; + 
+CREATE TABLE enum_as_num ( + Id Int32, + Value Enum('a' = 1, '3' = 2, 'b' = 3) +) ENGINE=Memory(); + +INSERT INTO enum_as_num FORMAT TSV 1 1 +INSERT INTO enum_as_num FORMAT TSV 2 2 +INSERT INTO enum_as_num FORMAT TSV 3 3 +INSERT INTO enum_as_num FORMAT TSV 4 a +INSERT INTO enum_as_num FORMAT TSV 5 b + +INSERT INTO enum_as_num FORMAT CSV 6,1 +INSERT INTO enum_as_num FORMAT CSV 7,2 +INSERT INTO enum_as_num FORMAT CSV 8,3 +INSERT INTO enum_as_num FORMAT CSV 9,a +INSERT INTO enum_as_num FORMAT CSV 10,b + +SELECT * FROM enum_as_num ORDER BY Id; + + +DROP TABLE IF EXISTS enum_as_num;