mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-21 23:21:59 +00:00
Improve schema inference for JSONEachRow and TSKV formats
This commit is contained in:
parent
eb787c1ddc
commit
ce97ccbfb9
@ -270,13 +270,13 @@ struct JSONEachRowFieldsExtractor
|
||||
std::vector<String> column_names;
|
||||
};
|
||||
|
||||
std::unordered_map<String, DataTypePtr> readRowAndGetNamesAndDataTypesForJSONEachRow(ReadBuffer & in, bool json_strings)
|
||||
NamesAndTypesList readRowAndGetNamesAndDataTypesForJSONEachRow(ReadBuffer & in, bool json_strings)
|
||||
{
|
||||
JSONEachRowFieldsExtractor extractor;
|
||||
auto data_types = determineColumnDataTypesFromJSONEachRowDataImpl<JSONEachRowFieldsExtractor, '{', '}'>(in, json_strings, extractor);
|
||||
std::unordered_map<String, DataTypePtr> result;
|
||||
NamesAndTypesList result;
|
||||
for (size_t i = 0; i != extractor.column_names.size(); ++i)
|
||||
result[extractor.column_names[i]] = data_types[i];
|
||||
result.emplace_back(extractor.column_names[i], data_types[i]);
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -20,9 +20,9 @@ std::pair<bool, size_t> fileSegmentationEngineJSONCompactEachRow(ReadBuffer & in
|
||||
DataTypePtr getDataTypeFromJSONField(const String & field);
|
||||
|
||||
/// Read row in JSONEachRow format and try to determine type for each field.
|
||||
/// Return map {column_name : type}.
|
||||
/// Return list of names and types.
|
||||
/// If cannot determine the type of some field, return nullptr for it.
|
||||
std::unordered_map<String, DataTypePtr> readRowAndGetNamesAndDataTypesForJSONEachRow(ReadBuffer & in, bool json_strings);
|
||||
NamesAndTypesList readRowAndGetNamesAndDataTypesForJSONEachRow(ReadBuffer & in, bool json_strings);
|
||||
|
||||
/// Read row in JSONCompactEachRow format and try to determine type for each field.
|
||||
/// If cannot determine the type of some field, return nullptr for it.
|
||||
|
@ -96,21 +96,33 @@ IRowWithNamesSchemaReader::IRowWithNamesSchemaReader(ReadBuffer & in_, size_t ma
|
||||
|
||||
NamesAndTypesList IRowWithNamesSchemaReader::readSchema()
|
||||
{
|
||||
auto names_and_types = readRowAndGetNamesAndDataTypes();
|
||||
bool eof = false;
|
||||
auto names_and_types = readRowAndGetNamesAndDataTypes(eof);
|
||||
std::unordered_map<String, DataTypePtr> names_to_types;
|
||||
std::vector<String> names_order;
|
||||
names_to_types.reserve(names_and_types.size());
|
||||
names_order.reserve(names_and_types.size());
|
||||
for (const auto & [name, type] : names_and_types)
|
||||
{
|
||||
names_to_types[name] = type;
|
||||
names_order.push_back(name);
|
||||
}
|
||||
|
||||
for (size_t row = 1; row < max_rows_to_read; ++row)
|
||||
{
|
||||
auto new_names_and_types = readRowAndGetNamesAndDataTypes();
|
||||
if (new_names_and_types.empty())
|
||||
auto new_names_and_types = readRowAndGetNamesAndDataTypes(eof);
|
||||
if (eof)
|
||||
/// We reached eof.
|
||||
break;
|
||||
|
||||
for (const auto & [name, new_type] : new_names_and_types)
|
||||
{
|
||||
auto it = names_and_types.find(name);
|
||||
auto it = names_to_types.find(name);
|
||||
/// If we didn't see this column before, just add it.
|
||||
if (it == names_and_types.end())
|
||||
if (it == names_to_types.end())
|
||||
{
|
||||
names_and_types[name] = new_type;
|
||||
names_to_types[name] = new_type;
|
||||
names_order.push_back(name);
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -133,12 +145,13 @@ NamesAndTypesList IRowWithNamesSchemaReader::readSchema()
|
||||
}
|
||||
|
||||
/// Check that we read at least one column.
|
||||
if (names_and_types.empty())
|
||||
if (names_to_types.empty())
|
||||
throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot read rows from the data");
|
||||
|
||||
NamesAndTypesList result;
|
||||
for (auto & [name, type] : names_and_types)
|
||||
for (auto & name : names_order)
|
||||
{
|
||||
auto & type = names_to_types[name];
|
||||
/// Check that we could determine the type of this column.
|
||||
if (!type)
|
||||
{
|
||||
|
@ -68,10 +68,10 @@ public:
|
||||
|
||||
protected:
|
||||
/// Read one row and determine types of columns in it.
|
||||
/// Return map {column_name : type}.
|
||||
/// Return list with names and types.
|
||||
/// If it's impossible to determine the type for some column, return nullptr for it.
|
||||
/// Return empty map is can't read more data.
|
||||
virtual std::unordered_map<String, DataTypePtr> readRowAndGetNamesAndDataTypes() = 0;
|
||||
/// Set eof = true if there is no more data to read.
|
||||
virtual NamesAndTypesList readRowAndGetNamesAndDataTypes(bool & eof) = 0;
|
||||
|
||||
private:
|
||||
size_t max_rows_to_read;
|
||||
|
@ -312,7 +312,7 @@ JSONEachRowSchemaReader::JSONEachRowSchemaReader(ReadBuffer & in_, bool json_str
|
||||
}
|
||||
|
||||
|
||||
std::unordered_map<String, DataTypePtr> JSONEachRowSchemaReader::readRowAndGetNamesAndDataTypes()
|
||||
NamesAndTypesList JSONEachRowSchemaReader::readRowAndGetNamesAndDataTypes(bool & eof)
|
||||
{
|
||||
if (first_row)
|
||||
{
|
||||
@ -339,7 +339,10 @@ std::unordered_map<String, DataTypePtr> JSONEachRowSchemaReader::readRowAndGetNa
|
||||
|
||||
skipWhitespaceIfAny(in);
|
||||
if (in.eof())
|
||||
{
|
||||
eof = true;
|
||||
return {};
|
||||
}
|
||||
|
||||
return readRowAndGetNamesAndDataTypesForJSONEachRow(in, json_strings);
|
||||
}
|
||||
|
@ -91,7 +91,7 @@ public:
|
||||
JSONEachRowSchemaReader(ReadBuffer & in_, bool json_strings, const FormatSettings & format_settings);
|
||||
|
||||
private:
|
||||
std::unordered_map<String, DataTypePtr> readRowAndGetNamesAndDataTypes() override;
|
||||
NamesAndTypesList readRowAndGetNamesAndDataTypes(bool & eof) override;
|
||||
|
||||
bool json_strings;
|
||||
bool first_row = true;
|
||||
|
@ -222,7 +222,7 @@ TSKVSchemaReader::TSKVSchemaReader(ReadBuffer & in_, const FormatSettings & form
|
||||
{
|
||||
}
|
||||
|
||||
std::unordered_map<String, DataTypePtr> TSKVSchemaReader::readRowAndGetNamesAndDataTypes()
|
||||
NamesAndTypesList TSKVSchemaReader::readRowAndGetNamesAndDataTypes(bool & eof)
|
||||
{
|
||||
if (first_row)
|
||||
{
|
||||
@ -231,7 +231,10 @@ std::unordered_map<String, DataTypePtr> TSKVSchemaReader::readRowAndGetNamesAndD
|
||||
}
|
||||
|
||||
if (in.eof())
|
||||
{
|
||||
eof = true;
|
||||
return {};
|
||||
}
|
||||
|
||||
if (*in.position() == '\n')
|
||||
{
|
||||
@ -239,7 +242,7 @@ std::unordered_map<String, DataTypePtr> TSKVSchemaReader::readRowAndGetNamesAndD
|
||||
return {};
|
||||
}
|
||||
|
||||
std::unordered_map<String, DataTypePtr> names_and_types;
|
||||
NamesAndTypesList names_and_types;
|
||||
StringRef name_ref;
|
||||
String name_buf;
|
||||
String value;
|
||||
@ -250,7 +253,7 @@ std::unordered_map<String, DataTypePtr> TSKVSchemaReader::readRowAndGetNamesAndD
|
||||
if (has_value)
|
||||
{
|
||||
readEscapedString(value, in);
|
||||
names_and_types[std::move(name)] = determineDataTypeByEscapingRule(value, format_settings, FormatSettings::EscapingRule::Escaped);
|
||||
names_and_types.emplace_back(std::move(name), determineDataTypeByEscapingRule(value, format_settings, FormatSettings::EscapingRule::Escaped));
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -59,7 +59,7 @@ public:
|
||||
TSKVSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_);
|
||||
|
||||
private:
|
||||
std::unordered_map<String, DataTypePtr> readRowAndGetNamesAndDataTypes() override;
|
||||
NamesAndTypesList readRowAndGetNamesAndDataTypes(bool & eof) override;
|
||||
|
||||
const FormatSettings format_settings;
|
||||
bool first_row = true;
|
||||
|
@ -0,0 +1,34 @@
|
||||
a Nullable(String)
|
||||
b Nullable(String)
|
||||
c Nullable(String)
|
||||
1 s1 \N
|
||||
2 } [2]
|
||||
\N \N \N
|
||||
\N \N \N
|
||||
\N \N [3]
|
||||
b Nullable(String)
|
||||
a Nullable(String)
|
||||
c Nullable(String)
|
||||
e Nullable(String)
|
||||
1 \N \N \N
|
||||
\N 2 3 \N
|
||||
\N \N \N \N
|
||||
\N \N \N 3
|
||||
3 3 1 \N
|
||||
a Nullable(Float64)
|
||||
b Nullable(String)
|
||||
c Array(Nullable(Float64))
|
||||
1 s1 []
|
||||
2 \N [2]
|
||||
\N \N []
|
||||
\N \N []
|
||||
\N \N [3]
|
||||
b Nullable(Float64)
|
||||
a Nullable(Float64)
|
||||
c Nullable(Float64)
|
||||
e Nullable(Float64)
|
||||
1 \N \N \N
|
||||
\N 2 3 \N
|
||||
\N \N \N \N
|
||||
\N \N \N 3
|
||||
3 3 1 \N
|
@ -0,0 +1,47 @@
|
||||
#!/usr/bin/env bash
|
||||
# Tags: no-parallel, no-fasttest
|
||||
|
||||
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
# shellcheck source=../shell_config.sh
|
||||
. "$CURDIR"/../shell_config.sh
|
||||
|
||||
|
||||
USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}')
|
||||
FILE_NAME=test_02247.data
|
||||
DATA_FILE=${USER_FILES_PATH:?}/$FILE_NAME
|
||||
|
||||
touch $DATA_FILE
|
||||
|
||||
echo -e 'a=1\tb=s1\tc=\N
|
||||
c=[2]\ta=2\tb=\N}
|
||||
a=\N
|
||||
c=[3]\ta=\N' > $DATA_FILE
|
||||
$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSKV')"
|
||||
$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSKV')"
|
||||
|
||||
echo -e 'b=1
|
||||
a=2\tc=3
|
||||
e=3
|
||||
c=1\tb=3\ta=3' > $DATA_FILE
|
||||
$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSKV')"
|
||||
$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSKV')"
|
||||
|
||||
|
||||
echo -e '{"a" : 1, "b" : "s1", "c" : null}
|
||||
{"c" : [2], "a" : 2, "b" : null}
|
||||
{}
|
||||
{"a" : null}
|
||||
{"c" : [3], "a" : null}' > $DATA_FILE
|
||||
$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'JSONEachRow')"
|
||||
$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'JSONEachRow')"
|
||||
|
||||
echo -e '{"b" : 1}
|
||||
{"a" : 2, "c" : 3}
|
||||
{}
|
||||
{"e" : 3}
|
||||
{"c" : 1, "b" : 3, "a" : 3}' > $DATA_FILE
|
||||
$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'JSONEachRow')"
|
||||
$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'JSONEachRow')"
|
||||
|
||||
|
||||
rm $DATA_FILE
|
Loading…
Reference in New Issue
Block a user