Try to treat input values as enum ids in TSV/CSV

This commit is contained in:
Pavel Kruglov 2020-11-05 16:53:56 +03:00
parent 39cd1c0371
commit ca4088b400
6 changed files with 64 additions and 8 deletions

View File

@ -155,7 +155,7 @@ void DataTypeEnum<Type>::deserializeTextEscaped(IColumn & column, ReadBuffer & i
/// NOTE It would be nice to do without creating a temporary object - at least extract std::string out. /// NOTE It would be nice to do without creating a temporary object - at least extract std::string out.
std::string field_name; std::string field_name;
readEscapedString(field_name, istr); readEscapedString(field_name, istr);
assert_cast<ColumnType &>(column).getData().push_back(getValue(StringRef(field_name))); assert_cast<ColumnType &>(column).getData().push_back(getValue(StringRef(field_name), true));
} }
} }
@ -182,7 +182,7 @@ void DataTypeEnum<Type>::deserializeWholeText(IColumn & column, ReadBuffer & ist
{ {
std::string field_name; std::string field_name;
readString(field_name, istr); readString(field_name, istr);
assert_cast<ColumnType &>(column).getData().push_back(getValue(StringRef(field_name))); assert_cast<ColumnType &>(column).getData().push_back(getValue(StringRef(field_name), true));
} }
} }
@ -226,7 +226,7 @@ void DataTypeEnum<Type>::deserializeTextCSV(IColumn & column, ReadBuffer & istr,
{ {
std::string field_name; std::string field_name;
readCSVString(field_name, istr, settings.csv); readCSVString(field_name, istr, settings.csv);
assert_cast<ColumnType &>(column).getData().push_back(getValue(StringRef(field_name))); assert_cast<ColumnType &>(column).getData().push_back(getValue(StringRef(field_name), true));
} }
} }

View File

@ -4,6 +4,7 @@
#include <Columns/ColumnVector.h> #include <Columns/ColumnVector.h>
#include <Columns/ColumnConst.h> #include <Columns/ColumnConst.h>
#include <Common/HashTable/HashMap.h> #include <Common/HashTable/HashMap.h>
#include <IO/ReadHelpers.h>
#include <vector> #include <vector>
#include <unordered_map> #include <unordered_map>
@ -80,13 +81,25 @@ public:
return findByValue(value)->second; return findByValue(value)->second;
} }
/// Returns the enum value registered under `field_name`.
/// If the name is not found and `try_treat_as_id` is true, a string accepted by isStringNumber()
/// is parsed as a numeric id and returned when that id is present in value_to_name_map.
/// Throws Exception with ErrorCodes::BAD_ARGUMENTS in every other case.
/// Names take precedence: the id interpretation is attempted only after the name lookup fails.
FieldType getValue(StringRef field_name) const FieldType getValue(StringRef field_name, bool try_treat_as_id=false) const
{ {
const auto it = name_to_value_map.find(field_name); const auto value_it = name_to_value_map.find(field_name);
if (!it) if (!value_it)
{
/// Used by the CSV and TSV input formats: if the given string is not one of the
/// enum names but is a number, try to treat it as an enum id instead.
/// NOTE(review): no range check on the parsed text is visible here; presumably a value that
/// does not fit FieldType is handled inside readText - confirm.
if (try_treat_as_id && isStringNumber(field_name))
{
FieldType x;
ReadBufferFromMemory tmp_buf(field_name.data, field_name.size);
readText(x, tmp_buf);
const auto name_it = value_to_name_map.find(x);
if (name_it != value_to_name_map.end())
return x;
}
throw Exception{"Unknown element '" + field_name.toString() + "' for type " + getName(), ErrorCodes::BAD_ARGUMENTS}; throw Exception{"Unknown element '" + field_name.toString() + "' for type " + getName(), ErrorCodes::BAD_ARGUMENTS};
}
return it->getMapped(); return value_it->getMapped();
} }
FieldType readValue(ReadBuffer & istr) const FieldType readValue(ReadBuffer & istr) const

View File

@ -1111,4 +1111,12 @@ bool loadAtPosition(ReadBuffer & in, DB::Memory<> & memory, char * & current)
return loaded_more; return loaded_more;
} }
/// Returns true when `str` is non-empty and every character is an ASCII decimal digit ('0'..'9').
/// A sign, decimal point, whitespace or any other character makes the result false.
bool isStringNumber(StringRef str)
{
    /// An empty string is not a number. Reporting it as one would let the caller
    /// go on to parse zero characters as an integer.
    if (str.size == 0)
        return false;

    /// Compare against the digit range directly instead of calling isdigit():
    /// passing a plain `char` with a negative value to isdigit() is undefined behaviour.
    for (size_t i = 0; i != str.size; ++i)
    {
        const char c = str.data[i];
        if (c < '0' || c > '9')
            return false;
    }
    return true;
}
} }

View File

@ -1228,4 +1228,6 @@ void saveUpToPosition(ReadBuffer & in, Memory<> & memory, char * current);
*/ */
bool loadAtPosition(ReadBuffer & in, Memory<> & memory, char * & current); bool loadAtPosition(ReadBuffer & in, Memory<> & memory, char * & current);
/// Checks whether `str` is made up of decimal digit characters only (no sign, no decimal point).
bool isStringNumber(StringRef str);
} }

View File

@ -0,0 +1,10 @@
1 a
2 3
3 3
4 a
5 b
6 a
7 3
8 3
9 a
10 b

View File

@ -0,0 +1,23 @@
-- Checks that TSV and CSV input accept either an enum name or a numeric enum id for an Enum column.
-- Rows are inserted with ids (1, 2, 3) and with names (a, b) in both formats, then read back ordered by Id.
DROP TABLE IF EXISTS enum_as_num;
-- The name '3' (id 2) deliberately looks like a number: the input `3` should resolve to that name,
-- not to id 3 ('b'), while the input `2` matches no name and is treated as id 2.
CREATE TABLE enum_as_num (
Id Int32,
Value Enum('a' = 1, '3' = 2, 'b' = 3)
) ENGINE=Memory();
INSERT INTO enum_as_num FORMAT TSV 1 1
INSERT INTO enum_as_num FORMAT TSV 2 2
INSERT INTO enum_as_num FORMAT TSV 3 3
INSERT INTO enum_as_num FORMAT TSV 4 a
INSERT INTO enum_as_num FORMAT TSV 5 b
INSERT INTO enum_as_num FORMAT CSV 6,1
INSERT INTO enum_as_num FORMAT CSV 7,2
INSERT INTO enum_as_num FORMAT CSV 8,3
INSERT INTO enum_as_num FORMAT CSV 9,a
INSERT INTO enum_as_num FORMAT CSV 10,b
SELECT * FROM enum_as_num ORDER BY Id;
DROP TABLE IF EXISTS enum_as_num;