2019-12-05 10:13:40 +00:00
|
|
|
#include <IO/ReadHelpers.h>
|
2020-09-02 04:05:02 +00:00
|
|
|
#include <IO/ReadBufferFromString.h>
|
2019-12-05 10:13:40 +00:00
|
|
|
|
|
|
|
#include <Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.h>
|
|
|
|
#include <Formats/FormatFactory.h>
|
|
|
|
#include <DataTypes/NestedUtils.h>
|
2021-03-09 14:46:52 +00:00
|
|
|
#include <DataTypes/Serializations/SerializationNullable.h>
|
2019-12-05 10:13:40 +00:00
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
|
|
|
namespace ErrorCodes
|
|
|
|
{
|
|
|
|
extern const int INCORRECT_DATA;
|
|
|
|
extern const int CANNOT_READ_ALL_DATA;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/// Creates the parser and caches, for every column of the port header, its data type
/// (in data_types, by position) and a name -> position entry (in column_indexes_by_names).
///
/// with_names_    - stored in with_names; readPrefix() then expects a row of names and a row of types.
/// yield_strings_ - stored in yield_strings; readField() then reads every value as a JSON string.
JSONCompactEachRowRowInputFormat::JSONCompactEachRowRowInputFormat(
    ReadBuffer & in_,
    const Block & header_,
    Params params_,
    const FormatSettings & format_settings_,
    bool with_names_,
    bool yield_strings_)
    : IRowInputFormat(header_, in_, std::move(params_))
    , format_settings(format_settings_)
    , with_names(with_names_)
    , yield_strings(yield_strings_)
{
    const auto & header = getPort().getHeader();
    const size_t columns_count = header.columns();

    column_indexes_by_names.reserve(columns_count);
    data_types.resize(columns_count);

    for (size_t position = 0; position != columns_count; ++position)
    {
        const auto & column = header.getByPosition(position);
        column_indexes_by_names.emplace(column.name, position);
        data_types[position] = column.type;
    }
}
|
|
|
|
|
2020-06-11 00:51:27 +00:00
|
|
|
void JSONCompactEachRowRowInputFormat::resetParser()
|
|
|
|
{
|
|
|
|
IRowInputFormat::resetParser();
|
|
|
|
column_indexes_for_input_fields.clear();
|
|
|
|
not_seen_columns.clear();
|
|
|
|
}
|
|
|
|
|
2019-12-05 10:13:40 +00:00
|
|
|
void JSONCompactEachRowRowInputFormat::readPrefix()
|
|
|
|
{
|
2020-06-11 00:51:27 +00:00
|
|
|
/// In this format, BOM at beginning of stream cannot be confused with value, so it is safe to skip it.
|
2021-08-27 03:00:12 +00:00
|
|
|
skipBOMIfExists(*in);
|
2020-06-11 00:51:27 +00:00
|
|
|
|
2019-12-05 10:13:40 +00:00
|
|
|
if (with_names)
|
|
|
|
{
|
|
|
|
size_t num_columns = getPort().getHeader().columns();
|
|
|
|
read_columns.assign(num_columns, false);
|
|
|
|
|
2021-08-27 03:00:12 +00:00
|
|
|
assertChar('[', *in);
|
2019-12-05 10:13:40 +00:00
|
|
|
do
|
|
|
|
{
|
2021-08-27 03:00:12 +00:00
|
|
|
skipWhitespaceIfAny(*in);
|
2019-12-05 10:13:40 +00:00
|
|
|
String column_name;
|
2021-08-27 03:00:12 +00:00
|
|
|
readJSONString(column_name, *in);
|
2019-12-05 10:13:40 +00:00
|
|
|
addInputColumn(column_name);
|
2021-08-27 03:00:12 +00:00
|
|
|
skipWhitespaceIfAny(*in);
|
2019-12-05 10:13:40 +00:00
|
|
|
}
|
2021-08-27 03:00:12 +00:00
|
|
|
while (checkChar(',', *in));
|
|
|
|
assertChar(']', *in);
|
2019-12-05 10:13:40 +00:00
|
|
|
skipEndOfLine();
|
|
|
|
|
|
|
|
/// Type checking
|
2021-08-27 03:00:12 +00:00
|
|
|
assertChar('[', *in);
|
2019-12-05 10:13:40 +00:00
|
|
|
for (size_t i = 0; i < column_indexes_for_input_fields.size(); ++i)
|
|
|
|
{
|
2021-08-27 03:00:12 +00:00
|
|
|
skipWhitespaceIfAny(*in);
|
2019-12-05 10:13:40 +00:00
|
|
|
String data_type;
|
2021-08-27 03:00:12 +00:00
|
|
|
readJSONString(data_type, *in);
|
2019-12-05 10:13:40 +00:00
|
|
|
|
|
|
|
if (column_indexes_for_input_fields[i] &&
|
|
|
|
data_types[*column_indexes_for_input_fields[i]]->getName() != data_type)
|
|
|
|
{
|
|
|
|
throw Exception(
|
|
|
|
"Type of '" + getPort().getHeader().getByPosition(*column_indexes_for_input_fields[i]).name
|
|
|
|
+ "' must be " + data_types[*column_indexes_for_input_fields[i]]->getName() +
|
|
|
|
", not " + data_type,
|
|
|
|
ErrorCodes::INCORRECT_DATA
|
|
|
|
);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (i != column_indexes_for_input_fields.size() - 1)
|
2021-08-27 03:00:12 +00:00
|
|
|
assertChar(',', *in);
|
|
|
|
skipWhitespaceIfAny(*in);
|
2019-12-05 10:13:40 +00:00
|
|
|
}
|
2021-08-27 03:00:12 +00:00
|
|
|
assertChar(']', *in);
|
2019-12-05 10:13:40 +00:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
size_t num_columns = getPort().getHeader().columns();
|
|
|
|
read_columns.assign(num_columns, true);
|
|
|
|
column_indexes_for_input_fields.resize(num_columns);
|
|
|
|
|
|
|
|
for (size_t i = 0; i < num_columns; ++i)
|
|
|
|
{
|
|
|
|
column_indexes_for_input_fields[i] = i;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
for (size_t i = 0; i < read_columns.size(); ++i)
|
|
|
|
{
|
|
|
|
if (!read_columns[i])
|
|
|
|
{
|
|
|
|
not_seen_columns.emplace_back(i);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void JSONCompactEachRowRowInputFormat::addInputColumn(const String & column_name)
|
|
|
|
{
|
|
|
|
names_of_columns.emplace_back(column_name);
|
|
|
|
|
|
|
|
const auto column_it = column_indexes_by_names.find(column_name);
|
|
|
|
if (column_it == column_indexes_by_names.end())
|
|
|
|
{
|
|
|
|
if (format_settings.skip_unknown_fields)
|
|
|
|
{
|
|
|
|
column_indexes_for_input_fields.push_back(std::nullopt);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
throw Exception(
|
|
|
|
"Unknown field found in JSONCompactEachRow header: '" + column_name + "' " +
|
|
|
|
"at position " + std::to_string(column_indexes_for_input_fields.size()) +
|
|
|
|
"\nSet the 'input_format_skip_unknown_fields' parameter explicitly to ignore and proceed",
|
|
|
|
ErrorCodes::INCORRECT_DATA
|
|
|
|
);
|
|
|
|
}
|
|
|
|
|
|
|
|
const auto column_index = column_it->second;
|
|
|
|
|
|
|
|
if (read_columns[column_index])
|
|
|
|
throw Exception("Duplicate field found while parsing JSONCompactEachRow header: " + column_name, ErrorCodes::INCORRECT_DATA);
|
|
|
|
|
|
|
|
read_columns[column_index] = true;
|
|
|
|
column_indexes_for_input_fields.emplace_back(column_index);
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Reads one row, written as a JSON array with one element per input field.
///
/// columns - destination columns; mapped fields are parsed by readField(), fields without
///           a mapping are skipped with skipJSONField(), and every column listed in
///           not_seen_columns receives a default value.
/// ext     - ext.read_columns is set to the per-column flags collected for this row.
///
/// Returns false if the input is exhausted before the row starts, true otherwise.
/// Throws ParsingException (CANNOT_READ_ALL_DATA) if the stream ends inside a row.
bool JSONCompactEachRowRowInputFormat::readRow(DB::MutableColumns &columns, DB::RowReadExtension &ext)
{
    skipEndOfLine();

    if (in->eof())
        return false;

    read_columns.assign(columns.size(), false);

    assertChar('[', *in);
    /// JSON permits insignificant whitespace after '['. It is already tolerated before ',' and ']'
    /// below and after '[' in the header rows, so accept it here as well instead of failing
    /// on the first value of a row like `[ 1, 2]`.
    skipWhitespaceIfAny(*in);

    const size_t fields_in_row = column_indexes_for_input_fields.size();
    for (size_t file_column = 0; file_column < fields_in_row; ++file_column)
    {
        const auto & table_column = column_indexes_for_input_fields[file_column];
        if (table_column)
            readField(*table_column, columns);
        else
            skipJSONField(*in, StringRef(names_of_columns[file_column]));

        skipWhitespaceIfAny(*in);
        if (in->eof())
            throw ParsingException("Unexpected end of stream while parsing JSONCompactEachRow format", ErrorCodes::CANNOT_READ_ALL_DATA);

        /// Every field except the last one must be followed by a comma.
        if (file_column + 1 != fields_in_row)
        {
            assertChar(',', *in);
            skipWhitespaceIfAny(*in);
        }
    }
    assertChar(']', *in);

    /// Header columns that are absent from the input get default values.
    for (const auto & column_position : not_seen_columns)
        columns[column_position]->insertDefault();

    ext.read_columns = read_columns;
    return true;
}
|
|
|
|
|
|
|
|
void JSONCompactEachRowRowInputFormat::skipEndOfLine()
|
|
|
|
{
|
2021-08-27 03:00:12 +00:00
|
|
|
skipWhitespaceIfAny(*in);
|
|
|
|
if (!in->eof() && (*in->position() == ',' || *in->position() == ';'))
|
|
|
|
++in->position();
|
2019-12-05 10:13:40 +00:00
|
|
|
|
2021-08-27 03:00:12 +00:00
|
|
|
skipWhitespaceIfAny(*in);
|
2019-12-05 10:13:40 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void JSONCompactEachRowRowInputFormat::readField(size_t index, MutableColumns & columns)
|
|
|
|
{
|
|
|
|
try
|
|
|
|
{
|
|
|
|
read_columns[index] = true;
|
|
|
|
const auto & type = data_types[index];
|
2021-03-09 14:46:52 +00:00
|
|
|
const auto & serialization = serializations[index];
|
2020-09-02 04:05:02 +00:00
|
|
|
|
|
|
|
if (yield_strings)
|
|
|
|
{
|
|
|
|
String str;
|
2021-08-27 03:00:12 +00:00
|
|
|
readJSONString(str, *in);
|
2020-09-02 04:05:02 +00:00
|
|
|
|
|
|
|
ReadBufferFromString buf(str);
|
|
|
|
|
2020-09-09 06:20:14 +00:00
|
|
|
if (format_settings.null_as_default && !type->isNullable())
|
2021-03-09 14:46:52 +00:00
|
|
|
read_columns[index] = SerializationNullable::deserializeWholeTextImpl(*columns[index], buf, format_settings, serialization);
|
2020-09-09 06:20:14 +00:00
|
|
|
else
|
2021-03-09 14:46:52 +00:00
|
|
|
serialization->deserializeWholeText(*columns[index], buf, format_settings);
|
2020-09-02 04:05:02 +00:00
|
|
|
}
|
2019-12-05 10:13:40 +00:00
|
|
|
else
|
2020-09-02 04:05:02 +00:00
|
|
|
{
|
|
|
|
if (format_settings.null_as_default && !type->isNullable())
|
2021-08-27 03:00:12 +00:00
|
|
|
read_columns[index] = SerializationNullable::deserializeTextJSONImpl(*columns[index], *in, format_settings, serialization);
|
2020-09-02 04:05:02 +00:00
|
|
|
else
|
2021-08-27 03:00:12 +00:00
|
|
|
serialization->deserializeTextJSON(*columns[index], *in, format_settings);
|
2020-09-02 04:05:02 +00:00
|
|
|
}
|
2019-12-05 10:13:40 +00:00
|
|
|
}
|
|
|
|
catch (Exception & e)
|
|
|
|
{
|
2020-11-19 14:44:58 +00:00
|
|
|
e.addMessage("(while reading the value of key " + getPort().getHeader().getByPosition(index).name + ")");
|
2019-12-05 10:13:40 +00:00
|
|
|
throw;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void JSONCompactEachRowRowInputFormat::syncAfterError()
|
|
|
|
{
|
2021-08-27 03:00:12 +00:00
|
|
|
skipToUnescapedNextLineOrEOF(*in);
|
2019-12-05 10:13:40 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void registerInputFormatProcessorJSONCompactEachRow(FormatFactory & factory)
|
|
|
|
{
|
|
|
|
factory.registerInputFormatProcessor("JSONCompactEachRow", [](
|
|
|
|
ReadBuffer & buf,
|
|
|
|
const Block & sample,
|
|
|
|
IRowInputFormat::Params params,
|
|
|
|
const FormatSettings & settings)
|
|
|
|
{
|
2020-09-02 04:05:02 +00:00
|
|
|
return std::make_shared<JSONCompactEachRowRowInputFormat>(buf, sample, std::move(params), settings, false, false);
|
2019-12-05 10:13:40 +00:00
|
|
|
});
|
|
|
|
|
|
|
|
factory.registerInputFormatProcessor("JSONCompactEachRowWithNamesAndTypes", [](
|
|
|
|
ReadBuffer & buf,
|
|
|
|
const Block & sample,
|
|
|
|
IRowInputFormat::Params params,
|
|
|
|
const FormatSettings & settings)
|
|
|
|
{
|
2020-09-02 04:05:02 +00:00
|
|
|
return std::make_shared<JSONCompactEachRowRowInputFormat>(buf, sample, std::move(params), settings, true, false);
|
|
|
|
});
|
|
|
|
|
|
|
|
factory.registerInputFormatProcessor("JSONCompactStringsEachRow", [](
|
|
|
|
ReadBuffer & buf,
|
|
|
|
const Block & sample,
|
|
|
|
IRowInputFormat::Params params,
|
|
|
|
const FormatSettings & settings)
|
|
|
|
{
|
|
|
|
return std::make_shared<JSONCompactEachRowRowInputFormat>(buf, sample, std::move(params), settings, false, true);
|
|
|
|
});
|
|
|
|
|
|
|
|
factory.registerInputFormatProcessor("JSONCompactStringsEachRowWithNamesAndTypes", [](
|
|
|
|
ReadBuffer & buf,
|
|
|
|
const Block & sample,
|
|
|
|
IRowInputFormat::Params params,
|
|
|
|
const FormatSettings & settings)
|
|
|
|
{
|
|
|
|
return std::make_shared<JSONCompactEachRowRowInputFormat>(buf, sample, std::move(params), settings, true, true);
|
2019-12-05 10:13:40 +00:00
|
|
|
});
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|