Improve schema inference for JSONEachRow and TSKV formats

This commit is contained in:
avogar 2022-03-29 14:47:51 +00:00
parent eb787c1ddc
commit ce97ccbfb9
10 changed files with 122 additions and 22 deletions

View File

@ -270,13 +270,13 @@ struct JSONEachRowFieldsExtractor
std::vector<String> column_names;
};
std::unordered_map<String, DataTypePtr> readRowAndGetNamesAndDataTypesForJSONEachRow(ReadBuffer & in, bool json_strings)
NamesAndTypesList readRowAndGetNamesAndDataTypesForJSONEachRow(ReadBuffer & in, bool json_strings)
{
JSONEachRowFieldsExtractor extractor;
auto data_types = determineColumnDataTypesFromJSONEachRowDataImpl<JSONEachRowFieldsExtractor, '{', '}'>(in, json_strings, extractor);
std::unordered_map<String, DataTypePtr> result;
NamesAndTypesList result;
for (size_t i = 0; i != extractor.column_names.size(); ++i)
result[extractor.column_names[i]] = data_types[i];
result.emplace_back(extractor.column_names[i], data_types[i]);
return result;
}

View File

@ -20,9 +20,9 @@ std::pair<bool, size_t> fileSegmentationEngineJSONCompactEachRow(ReadBuffer & in
DataTypePtr getDataTypeFromJSONField(const String & field);
/// Read row in JSONEachRow format and try to determine type for each field.
/// Return map {column_name : type}.
/// Return list of names and types.
/// If cannot determine the type of some field, return nullptr for it.
std::unordered_map<String, DataTypePtr> readRowAndGetNamesAndDataTypesForJSONEachRow(ReadBuffer & in, bool json_strings);
NamesAndTypesList readRowAndGetNamesAndDataTypesForJSONEachRow(ReadBuffer & in, bool json_strings);
/// Read row in JSONCompactEachRow format and try to determine type for each field.
/// If cannot determine the type of some field, return nullptr for it.

View File

@ -96,21 +96,33 @@ IRowWithNamesSchemaReader::IRowWithNamesSchemaReader(ReadBuffer & in_, size_t ma
NamesAndTypesList IRowWithNamesSchemaReader::readSchema()
{
auto names_and_types = readRowAndGetNamesAndDataTypes();
bool eof = false;
auto names_and_types = readRowAndGetNamesAndDataTypes(eof);
std::unordered_map<String, DataTypePtr> names_to_types;
std::vector<String> names_order;
names_to_types.reserve(names_and_types.size());
names_order.reserve(names_and_types.size());
for (const auto & [name, type] : names_and_types)
{
names_to_types[name] = type;
names_order.push_back(name);
}
for (size_t row = 1; row < max_rows_to_read; ++row)
{
auto new_names_and_types = readRowAndGetNamesAndDataTypes();
if (new_names_and_types.empty())
auto new_names_and_types = readRowAndGetNamesAndDataTypes(eof);
if (eof)
/// We reached eof.
break;
for (const auto & [name, new_type] : new_names_and_types)
{
auto it = names_and_types.find(name);
auto it = names_to_types.find(name);
/// If we didn't see this column before, just add it.
if (it == names_and_types.end())
if (it == names_to_types.end())
{
names_and_types[name] = new_type;
names_to_types[name] = new_type;
names_order.push_back(name);
continue;
}
@ -133,12 +145,13 @@ NamesAndTypesList IRowWithNamesSchemaReader::readSchema()
}
/// Check that we read at least one column.
if (names_and_types.empty())
if (names_to_types.empty())
throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot read rows from the data");
NamesAndTypesList result;
for (auto & [name, type] : names_and_types)
for (auto & name : names_order)
{
auto & type = names_to_types[name];
/// Check that we could determine the type of this column.
if (!type)
{

View File

@ -68,10 +68,10 @@ public:
protected:
/// Read one row and determine types of columns in it.
/// Return map {column_name : type}.
/// Return list with names and types.
/// If it's impossible to determine the type for some column, return nullptr for it.
/// Return empty map is can't read more data.
virtual std::unordered_map<String, DataTypePtr> readRowAndGetNamesAndDataTypes() = 0;
/// Set eof = true if can't read more data.
virtual NamesAndTypesList readRowAndGetNamesAndDataTypes(bool & eof) = 0;
private:
size_t max_rows_to_read;

View File

@ -312,7 +312,7 @@ JSONEachRowSchemaReader::JSONEachRowSchemaReader(ReadBuffer & in_, bool json_str
}
std::unordered_map<String, DataTypePtr> JSONEachRowSchemaReader::readRowAndGetNamesAndDataTypes()
NamesAndTypesList JSONEachRowSchemaReader::readRowAndGetNamesAndDataTypes(bool & eof)
{
if (first_row)
{
@ -339,7 +339,10 @@ std::unordered_map<String, DataTypePtr> JSONEachRowSchemaReader::readRowAndGetNa
skipWhitespaceIfAny(in);
if (in.eof())
{
eof = true;
return {};
}
return readRowAndGetNamesAndDataTypesForJSONEachRow(in, json_strings);
}

View File

@ -91,7 +91,7 @@ public:
JSONEachRowSchemaReader(ReadBuffer & in_, bool json_strings, const FormatSettings & format_settings);
private:
std::unordered_map<String, DataTypePtr> readRowAndGetNamesAndDataTypes() override;
NamesAndTypesList readRowAndGetNamesAndDataTypes(bool & eof) override;
bool json_strings;
bool first_row = true;

View File

@ -222,7 +222,7 @@ TSKVSchemaReader::TSKVSchemaReader(ReadBuffer & in_, const FormatSettings & form
{
}
std::unordered_map<String, DataTypePtr> TSKVSchemaReader::readRowAndGetNamesAndDataTypes()
NamesAndTypesList TSKVSchemaReader::readRowAndGetNamesAndDataTypes(bool & eof)
{
if (first_row)
{
@ -231,7 +231,10 @@ std::unordered_map<String, DataTypePtr> TSKVSchemaReader::readRowAndGetNamesAndD
}
if (in.eof())
{
eof = true;
return {};
}
if (*in.position() == '\n')
{
@ -239,7 +242,7 @@ std::unordered_map<String, DataTypePtr> TSKVSchemaReader::readRowAndGetNamesAndD
return {};
}
std::unordered_map<String, DataTypePtr> names_and_types;
NamesAndTypesList names_and_types;
StringRef name_ref;
String name_buf;
String value;
@ -250,7 +253,7 @@ std::unordered_map<String, DataTypePtr> TSKVSchemaReader::readRowAndGetNamesAndD
if (has_value)
{
readEscapedString(value, in);
names_and_types[std::move(name)] = determineDataTypeByEscapingRule(value, format_settings, FormatSettings::EscapingRule::Escaped);
names_and_types.emplace_back(std::move(name), determineDataTypeByEscapingRule(value, format_settings, FormatSettings::EscapingRule::Escaped));
}
else
{

View File

@ -59,7 +59,7 @@ public:
TSKVSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_);
private:
std::unordered_map<String, DataTypePtr> readRowAndGetNamesAndDataTypes() override;
NamesAndTypesList readRowAndGetNamesAndDataTypes(bool & eof) override;
const FormatSettings format_settings;
bool first_row = true;

View File

@ -0,0 +1,34 @@
a Nullable(String)
b Nullable(String)
c Nullable(String)
1 s1 \N
2 } [2]
\N \N \N
\N \N \N
\N \N [3]
b Nullable(String)
a Nullable(String)
c Nullable(String)
e Nullable(String)
1 \N \N \N
\N 2 3 \N
\N \N \N \N
\N \N \N 3
3 3 1 \N
a Nullable(Float64)
b Nullable(String)
c Array(Nullable(Float64))
1 s1 []
2 \N [2]
\N \N []
\N \N []
\N \N [3]
b Nullable(Float64)
a Nullable(Float64)
c Nullable(Float64)
e Nullable(Float64)
1 \N \N \N
\N 2 3 \N
\N \N \N \N
\N \N \N 3
3 3 1 \N

View File

@ -0,0 +1,47 @@
#!/usr/bin/env bash
# Tags: no-parallel, no-fasttest

# Schema inference test for the TSKV and JSONEachRow formats.
#
# For every data set below the script prints the inferred structure
# (`desc file(...)`) followed by the parsed rows (`select * from file(...)`).
# The data sets list keys in a different order from row to row, contain rows
# that omit some keys, and contain a completely empty row, so the output shows
# both the order in which columns are reported and how missing values are read.

CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh

# Query a file that does not exist and take the directory from the path printed
# in the exception message: the 9th whitespace-separated field of the
# "Exception" line, with the "/nonexist.txt" suffix removed.
USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}')

FILE_NAME=test_02247.data
# `:?` aborts the script if the directory could not be determined, instead of
# silently writing to "/test_02247.data".
DATA_FILE=${USER_FILES_PATH:?}/$FILE_NAME

# DATA_FILE is quoted everywhere below: an unquoted redirect target that
# contains whitespace fails with "ambiguous redirect".
touch "$DATA_FILE"

# --- TSKV, data set 1: keys reordered between rows, some keys missing.
# The empty line inside the data is intentional: it is an empty TSKV row.
# The trailing '}' after \N on the second row is part of the test data as well.
echo -e 'a=1\tb=s1\tc=\N
c=[2]\ta=2\tb=\N}

a=\N
c=[3]\ta=\N' > "$DATA_FILE"

$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSKV')"
$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSKV')"

# --- TSKV, data set 2: new keys keep appearing in later rows.
# The empty line is again an intentional empty row.
echo -e 'b=1
a=2\tc=3

e=3
c=1\tb=3\ta=3' > "$DATA_FILE"

$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSKV')"
$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSKV')"

# --- JSONEachRow, data set 1: same shape as TSKV data set 1, with `{}` as the
# empty row.
echo -e '{"a" : 1, "b" : "s1", "c" : null}
{"c" : [2], "a" : 2, "b" : null}
{}
{"a" : null}
{"c" : [3], "a" : null}' > "$DATA_FILE"

$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'JSONEachRow')"
$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'JSONEachRow')"

# --- JSONEachRow, data set 2: same shape as TSKV data set 2.
echo -e '{"b" : 1}
{"a" : 2, "c" : 3}
{}
{"e" : 3}
{"c" : 1, "b" : 3, "a" : 3}' > "$DATA_FILE"

$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'JSONEachRow')"
$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'JSONEachRow')"

# Remove the temporary data file.
rm "$DATA_FILE"