Merge pull request #35724 from Avogar/fix-order

Improve schema inference for JSONEachRow and TSKV formats
This commit is contained in:
Anton Popov 2022-04-04 11:00:21 +02:00 committed by GitHub
commit 11e18a16f3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 190 additions and 88 deletions

View File

@ -270,13 +270,13 @@ struct JSONEachRowFieldsExtractor
std::vector<String> column_names;
};
std::unordered_map<String, DataTypePtr> readRowAndGetNamesAndDataTypesForJSONEachRow(ReadBuffer & in, bool json_strings)
NamesAndTypesList readRowAndGetNamesAndDataTypesForJSONEachRow(ReadBuffer & in, bool json_strings)
{
JSONEachRowFieldsExtractor extractor;
auto data_types = determineColumnDataTypesFromJSONEachRowDataImpl<JSONEachRowFieldsExtractor, '{', '}'>(in, json_strings, extractor);
std::unordered_map<String, DataTypePtr> result;
NamesAndTypesList result;
for (size_t i = 0; i != extractor.column_names.size(); ++i)
result[extractor.column_names[i]] = data_types[i];
result.emplace_back(extractor.column_names[i], data_types[i]);
return result;
}

View File

@ -20,9 +20,9 @@ std::pair<bool, size_t> fileSegmentationEngineJSONCompactEachRow(ReadBuffer & in
DataTypePtr getDataTypeFromJSONField(const String & field);
/// Read row in JSONEachRow format and try to determine type for each field.
/// Return map {column_name : type}.
/// Return list of names and types.
/// If cannot determine the type of some field, return nullptr for it.
std::unordered_map<String, DataTypePtr> readRowAndGetNamesAndDataTypesForJSONEachRow(ReadBuffer & in, bool json_strings);
NamesAndTypesList readRowAndGetNamesAndDataTypesForJSONEachRow(ReadBuffer & in, bool json_strings);
/// Read row in JSONCompactEachRow format and try to determine type for each field.
/// If cannot determine the type of some field, return nullptr for it.

View File

@ -96,21 +96,33 @@ IRowWithNamesSchemaReader::IRowWithNamesSchemaReader(ReadBuffer & in_, size_t ma
NamesAndTypesList IRowWithNamesSchemaReader::readSchema()
{
auto names_and_types = readRowAndGetNamesAndDataTypes();
bool eof = false;
auto names_and_types = readRowAndGetNamesAndDataTypes(eof);
std::unordered_map<String, DataTypePtr> names_to_types;
std::vector<String> names_order;
names_to_types.reserve(names_and_types.size());
names_order.reserve(names_and_types.size());
for (const auto & [name, type] : names_and_types)
{
names_to_types[name] = type;
names_order.push_back(name);
}
for (size_t row = 1; row < max_rows_to_read; ++row)
{
auto new_names_and_types = readRowAndGetNamesAndDataTypes();
if (new_names_and_types.empty())
auto new_names_and_types = readRowAndGetNamesAndDataTypes(eof);
if (eof)
/// We reached eof.
break;
for (const auto & [name, new_type] : new_names_and_types)
{
auto it = names_and_types.find(name);
auto it = names_to_types.find(name);
/// If we didn't see this column before, just add it.
if (it == names_and_types.end())
if (it == names_to_types.end())
{
names_and_types[name] = new_type;
names_to_types[name] = new_type;
names_order.push_back(name);
continue;
}
@ -133,12 +145,13 @@ NamesAndTypesList IRowWithNamesSchemaReader::readSchema()
}
/// Check that we read at least one column.
if (names_and_types.empty())
if (names_to_types.empty())
throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot read rows from the data");
NamesAndTypesList result;
for (auto & [name, type] : names_and_types)
for (auto & name : names_order)
{
auto & type = names_to_types[name];
/// Check that we could determine the type of this column.
if (!type)
{

View File

@ -68,10 +68,10 @@ public:
protected:
/// Read one row and determine types of columns in it.
/// Return map {column_name : type}.
/// Return list with names and types.
/// If it's impossible to determine the type for some column, return nullptr for it.
/// Return empty map if can't read more data.
virtual std::unordered_map<String, DataTypePtr> readRowAndGetNamesAndDataTypes() = 0;
/// Set eof = true if can't read more data.
virtual NamesAndTypesList readRowAndGetNamesAndDataTypes(bool & eof) = 0;
private:
size_t max_rows_to_read;

View File

@ -312,7 +312,7 @@ JSONEachRowSchemaReader::JSONEachRowSchemaReader(ReadBuffer & in_, bool json_str
}
std::unordered_map<String, DataTypePtr> JSONEachRowSchemaReader::readRowAndGetNamesAndDataTypes()
NamesAndTypesList JSONEachRowSchemaReader::readRowAndGetNamesAndDataTypes(bool & eof)
{
if (first_row)
{
@ -339,7 +339,10 @@ std::unordered_map<String, DataTypePtr> JSONEachRowSchemaReader::readRowAndGetNa
skipWhitespaceIfAny(in);
if (in.eof())
{
eof = true;
return {};
}
return readRowAndGetNamesAndDataTypesForJSONEachRow(in, json_strings);
}

View File

@ -91,7 +91,7 @@ public:
JSONEachRowSchemaReader(ReadBuffer & in_, bool json_strings, const FormatSettings & format_settings);
private:
std::unordered_map<String, DataTypePtr> readRowAndGetNamesAndDataTypes() override;
NamesAndTypesList readRowAndGetNamesAndDataTypes(bool & eof) override;
bool json_strings;
bool first_row = true;

View File

@ -222,7 +222,7 @@ TSKVSchemaReader::TSKVSchemaReader(ReadBuffer & in_, const FormatSettings & form
{
}
std::unordered_map<String, DataTypePtr> TSKVSchemaReader::readRowAndGetNamesAndDataTypes()
NamesAndTypesList TSKVSchemaReader::readRowAndGetNamesAndDataTypes(bool & eof)
{
if (first_row)
{
@ -231,7 +231,10 @@ std::unordered_map<String, DataTypePtr> TSKVSchemaReader::readRowAndGetNamesAndD
}
if (in.eof())
{
eof = true;
return {};
}
if (*in.position() == '\n')
{
@ -239,7 +242,7 @@ std::unordered_map<String, DataTypePtr> TSKVSchemaReader::readRowAndGetNamesAndD
return {};
}
std::unordered_map<String, DataTypePtr> names_and_types;
NamesAndTypesList names_and_types;
StringRef name_ref;
String name_buf;
String value;
@ -250,7 +253,7 @@ std::unordered_map<String, DataTypePtr> TSKVSchemaReader::readRowAndGetNamesAndD
if (has_value)
{
readEscapedString(value, in);
names_and_types[std::move(name)] = determineDataTypeByEscapingRule(value, format_settings, FormatSettings::EscapingRule::Escaped);
names_and_types.emplace_back(std::move(name), determineDataTypeByEscapingRule(value, format_settings, FormatSettings::EscapingRule::Escaped));
}
else
{

View File

@ -59,7 +59,7 @@ public:
TSKVSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_);
private:
std::unordered_map<String, DataTypePtr> readRowAndGetNamesAndDataTypes() override;
NamesAndTypesList readRowAndGetNamesAndDataTypes(bool & eof) override;
const FormatSettings format_settings;
bool first_row = true;

View File

@ -53,34 +53,34 @@ c Map(String, Nullable(Float64))
d Nullable(UInt8)
42.42 [(1,'String'),(2,'abcd')] {'key':42,'key2':24} 1
JSONEachRow
d Nullable(UInt8)
a Nullable(Float64)
b Array(Tuple(Nullable(Float64), Nullable(String)))
c Map(String, Nullable(Float64))
a Nullable(Float64)
1 [(1,'String'),(2,'abcd')] {'key':42,'key2':24} 42.42
d Nullable(UInt8)
42.42 [(1,'String'),(2,'abcd')] {'key':42,'key2':24} 1
a Nullable(Float64)
b Array(Tuple(Nullable(Float64), Nullable(String)))
c Map(String, Nullable(Float64))
a Nullable(Float64)
d Nullable(UInt8)
\N [(1,'String'),(2,NULL)] {'key':NULL,'key2':24} \N
1 [(2,'String 2'),(3,'hello')] {'key3':4242,'key4':2424} 32
32 [(2,'String 2'),(3,'hello')] {'key3':4242,'key4':2424} 1
a Nullable(Float64)
b Nullable(String)
c Array(Nullable(Float64))
a Nullable(Float64)
s1 [] 1
\N [2] 2
\N [] \N
\N [] \N
\N [3] \N
1 s1 []
2 \N [2]
\N \N []
\N \N []
\N \N [3]
TSKV
a Nullable(String)
b Nullable(String)
c Nullable(String)
a Nullable(String)
s1 \N 1
} [2] 2
1 s1 \N
2 } [2]
\N \N \N
\N \N \N
\N [3] \N
\N \N [3]
Values
c1 Nullable(Float64)
c2 Nullable(String)

View File

@ -1,52 +1,52 @@
111 Hello
123 World
111 Hello
123 World
111 Hello
123 World
111 Hello
123 World
111 Hello
123 World
111 Hello
123 World
111 Hello
123 World
111 Hello
123 World
111 Hello
123 World
111 Hello
123 World
111 Hello
123 World
111 Hello
123 World
111 Hello
123 World
111 Hello
123 World
111 Hello
123 World
111 Hello
123 World
111 Hello
123 World
111 Hello
123 World
111 Hello
123 World
111 Hello
123 World
Hello 111
World 123
Hello 111
World 123
Hello 111
World 123
Hello 111
World 123
Hello 111
World 123
Hello 111
World 123
Hello 111
World 123
Hello 111
World 123
Hello 111
World 123
Hello 111
World 123
Hello 111
World 123
Hello 111
World 123
Hello 111
World 123
Hello 111
World 123
Hello 111
World 123
Hello 111
World 123
Hello 111
World 123
Hello 111
World 123
Hello 111
World 123
Hello 111
World 123
1 2 [1,2,3] [['abc'],[],['d','e']]
c1 Nullable(Float64)
c2 Nullable(Float64)
c3 Array(Nullable(Float64))
c4 Array(Array(Nullable(String)))
111 Hello
123 World
111 Hello
131 Hello
123 World
b Nullable(Float64)
Hello 111
World 123
Hello 111
Hello 131
World 123
a Nullable(String)
b Nullable(Float64)

View File

@ -1,3 +1,3 @@
CREATE TABLE default.test\n(\n `y` Nullable(String),\n `x` Nullable(Float64)\n)\nENGINE = File(\'JSONEachRow\', \'data.jsonl\')
CREATE TABLE default.test\n(\n `x` Nullable(Float64),\n `y` Nullable(String)\n)\nENGINE = File(\'JSONEachRow\', \'data.jsonl\')
OK
OK

View File

@ -1,8 +1,8 @@
a Nullable(String)
b Nullable(String)
c Nullable(String)
a Nullable(String)
s1 \N 1
} [2] 2
1 s1 \N
2 } [2]
\N \N \N
\N \N \N
\N [3] \N
\N \N [3]

View File

@ -0,0 +1,34 @@
a Nullable(String)
b Nullable(String)
c Nullable(String)
1 s1 \N
2 } [2]
\N \N \N
\N \N \N
\N \N [3]
b Nullable(String)
a Nullable(String)
c Nullable(String)
e Nullable(String)
1 \N \N \N
\N 2 3 \N
\N \N \N \N
\N \N \N 3
3 3 1 \N
a Nullable(Float64)
b Nullable(String)
c Array(Nullable(Float64))
1 s1 []
2 \N [2]
\N \N []
\N \N []
\N \N [3]
b Nullable(Float64)
a Nullable(Float64)
c Nullable(Float64)
e Nullable(Float64)
1 \N \N \N
\N 2 3 \N
\N \N \N \N
\N \N \N 3
3 3 1 \N

View File

@ -0,0 +1,49 @@
#!/usr/bin/env bash
# Tags: no-parallel, no-fasttest
#
# Writes small TSKV and JSONEachRow data files whose rows list their keys in
# different orders (and with different key sets), then prints the inferred
# schema (`desc`) and the parsed rows (`select *`) for each of them.
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh

# Discover the user_files directory: query a file that does not exist and take
# the 9th whitespace-separated field of the Exception line, with the
# "/nonexist.txt" suffix stripped from it.
USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}')
FILE_NAME=test_02247.data
# ${VAR:?} aborts the script if the path could not be determined, so the data
# file is never created relative to the filesystem root.
DATA_FILE=${USER_FILES_PATH:?}/$FILE_NAME

# NOTE(review): $CLICKHOUSE_CLIENT is deliberately left unquoted below; it is
# defined outside this script and presumably may expand to a command with
# arguments - confirm against shell_config.sh.
# "$DATA_FILE" is quoted everywhere so that a path with whitespace or glob
# characters is not word-split or expanded.
touch "$DATA_FILE"

# TSKV: every key appears in the first row; later rows reorder or omit keys.
echo -e 'a=1\tb=s1\tc=\N
c=[2]\ta=2\tb=\N}
a=\N
c=[3]\ta=\N' > "$DATA_FILE"
$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSKV')"
$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSKV')"

# TSKV: new keys keep appearing in later rows (b, then a and c, then e).
echo -e 'b=1
a=2\tc=3
e=3
c=1\tb=3\ta=3' > "$DATA_FILE"
$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSKV')"
$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSKV')"

# JSONEachRow: same key layout as the first TSKV data set, including an empty object.
echo -e '{"a" : 1, "b" : "s1", "c" : null}
{"c" : [2], "a" : 2, "b" : null}
{}
{"a" : null}
{"c" : [3], "a" : null}' > "$DATA_FILE"
$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'JSONEachRow')"
$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'JSONEachRow')"

# JSONEachRow: same key layout as the second TSKV data set, including an empty object.
echo -e '{"b" : 1}
{"a" : 2, "c" : 3}
{}
{"e" : 3}
{"c" : 1, "b" : 3, "a" : 3}' > "$DATA_FILE"
$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'JSONEachRow')"
$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'JSONEachRow')"

# Remove the temporary data file.
rm "$DATA_FILE"