2021-10-14 10:32:49 +00:00
|
|
|
#include <Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.h>
|
|
|
|
|
2019-12-05 10:13:40 +00:00
|
|
|
#include <IO/ReadHelpers.h>
|
2021-10-14 10:32:49 +00:00
|
|
|
#include <IO/Operators.h>
|
2019-12-05 10:13:40 +00:00
|
|
|
#include <Formats/FormatFactory.h>
|
2021-10-14 10:32:49 +00:00
|
|
|
#include <Formats/verbosePrintString.h>
|
|
|
|
#include <Formats/JSONEachRowUtils.h>
|
2021-10-20 11:48:54 +00:00
|
|
|
#include <Formats/registerWithNamesAndTypes.h>
|
2019-12-05 10:13:40 +00:00
|
|
|
#include <DataTypes/NestedUtils.h>
|
2021-03-09 14:46:52 +00:00
|
|
|
#include <DataTypes/Serializations/SerializationNullable.h>
|
2019-12-05 10:13:40 +00:00
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
|
|
|
namespace ErrorCodes
{
    /// Used in this file when a header row contains more fields than there are known column names.
    extern const int INCORRECT_DATA;
}
|
|
|
|
|
|
|
|
|
2021-10-14 10:32:49 +00:00
|
|
|
/// Builds the input format.
/// The header, input buffer, params, the with_names/with_types flags and the format settings
/// are forwarded unchanged to RowInputFormatWithNamesAndTypes; only yield_strings_ is kept here.
JSONCompactEachRowRowInputFormat::JSONCompactEachRowRowInputFormat(
    const Block & header_,
    ReadBuffer & in_,
    Params params_,
    bool with_names_,
    bool with_types_,
    bool yield_strings_,
    const FormatSettings & format_settings_)
    : RowInputFormatWithNamesAndTypes(
        header_,
        in_,
        std::move(params_),
        with_names_,
        with_types_,
        format_settings_)
    , yield_strings(yield_strings_)
{
}
|
|
|
|
|
2021-10-14 10:32:49 +00:00
|
|
|
void JSONCompactEachRowRowInputFormat::skipRowStartDelimiter()
|
2020-06-11 00:51:27 +00:00
|
|
|
{
|
2021-10-14 10:32:49 +00:00
|
|
|
skipWhitespaceIfAny(*in);
|
|
|
|
assertChar('[', *in);
|
2020-06-11 00:51:27 +00:00
|
|
|
}
|
|
|
|
|
2021-10-14 10:32:49 +00:00
|
|
|
void JSONCompactEachRowRowInputFormat::skipFieldDelimiter()
|
2019-12-05 10:13:40 +00:00
|
|
|
{
|
2021-10-14 10:32:49 +00:00
|
|
|
skipWhitespaceIfAny(*in);
|
|
|
|
assertChar(',', *in);
|
|
|
|
}
|
2020-06-11 00:51:27 +00:00
|
|
|
|
2021-10-14 10:32:49 +00:00
|
|
|
void JSONCompactEachRowRowInputFormat::skipRowEndDelimiter()
|
|
|
|
{
|
|
|
|
skipWhitespaceIfAny(*in);
|
|
|
|
assertChar(']', *in);
|
2019-12-05 10:13:40 +00:00
|
|
|
|
2021-10-14 10:32:49 +00:00
|
|
|
skipWhitespaceIfAny(*in);
|
|
|
|
if (!in->eof() && (*in->position() == ',' || *in->position() == ';'))
|
|
|
|
++in->position();
|
2019-12-05 10:13:40 +00:00
|
|
|
|
2021-10-14 10:32:49 +00:00
|
|
|
skipWhitespaceIfAny(*in);
|
|
|
|
}
|
2019-12-05 10:13:40 +00:00
|
|
|
|
2021-10-14 10:32:49 +00:00
|
|
|
/// Skips optional whitespace, reads one JSON string via readJSONString and returns it.
String JSONCompactEachRowRowInputFormat::readFieldIntoString()
{
    auto & buf = *in;
    skipWhitespaceIfAny(buf);

    String result;
    readJSONString(result, buf);
    return result;
}
|
2019-12-05 10:13:40 +00:00
|
|
|
|
2021-10-14 12:05:49 +00:00
|
|
|
void JSONCompactEachRowRowInputFormat::skipField(size_t file_column)
|
2021-10-14 10:32:49 +00:00
|
|
|
{
|
|
|
|
skipWhitespaceIfAny(*in);
|
2021-10-14 12:05:49 +00:00
|
|
|
skipJSONField(*in, column_mapping->names_of_columns[file_column]);
|
2021-10-14 10:32:49 +00:00
|
|
|
}
|
2019-12-05 10:13:40 +00:00
|
|
|
|
2021-10-14 12:05:49 +00:00
|
|
|
void JSONCompactEachRowRowInputFormat::skipHeaderRow()
|
2021-10-14 10:32:49 +00:00
|
|
|
{
|
|
|
|
skipRowStartDelimiter();
|
|
|
|
size_t i = 0;
|
|
|
|
do
|
2019-12-05 10:13:40 +00:00
|
|
|
{
|
2021-10-14 10:32:49 +00:00
|
|
|
if (i >= column_mapping->names_of_columns.size())
|
|
|
|
throw Exception(ErrorCodes::INCORRECT_DATA, "The number of columns in a row differs from the number of column names");
|
2021-10-14 12:05:49 +00:00
|
|
|
skipField(i++);
|
2021-10-14 10:32:49 +00:00
|
|
|
skipWhitespaceIfAny(*in);
|
2019-12-05 10:13:40 +00:00
|
|
|
}
|
2021-10-14 10:32:49 +00:00
|
|
|
while (checkChar(',', *in));
|
|
|
|
|
|
|
|
skipRowEndDelimiter();
|
2019-12-05 10:13:40 +00:00
|
|
|
}
|
|
|
|
|
2021-10-14 10:32:49 +00:00
|
|
|
std::vector<String> JSONCompactEachRowRowInputFormat::readHeaderRow()
|
2019-12-05 10:13:40 +00:00
|
|
|
{
|
2021-10-14 10:32:49 +00:00
|
|
|
skipRowStartDelimiter();
|
|
|
|
std::vector<String> fields;
|
|
|
|
do
|
2019-12-05 10:13:40 +00:00
|
|
|
{
|
2021-10-14 10:32:49 +00:00
|
|
|
fields.push_back(readFieldIntoString());
|
|
|
|
skipWhitespaceIfAny(*in);
|
2019-12-05 10:13:40 +00:00
|
|
|
}
|
2021-10-14 10:32:49 +00:00
|
|
|
while (checkChar(',', *in));
|
2019-12-05 10:13:40 +00:00
|
|
|
|
2021-10-14 10:32:49 +00:00
|
|
|
skipRowEndDelimiter();
|
|
|
|
return fields;
|
|
|
|
}
|
2019-12-05 10:13:40 +00:00
|
|
|
|
2021-10-14 10:32:49 +00:00
|
|
|
/// Reads one field into `column`.
/// After skipping optional whitespace, all work is delegated to readFieldImpl, which also receives
/// this format's settings and the yield_strings flag. The result of readFieldImpl is returned as is.
/// The is_last_file_column argument is not used by this format.
bool JSONCompactEachRowRowInputFormat::readField(
    IColumn & column,
    const DataTypePtr & type,
    const SerializationPtr & serialization,
    bool /*is_last_file_column*/,
    const String & column_name)
{
    auto & buf = *in;
    skipWhitespaceIfAny(buf);
    return readFieldImpl(buf, column, type, serialization, column_name, format_settings, yield_strings);
}
|
|
|
|
|
2021-10-14 10:32:49 +00:00
|
|
|
void JSONCompactEachRowRowInputFormat::syncAfterError()
|
2019-12-05 10:13:40 +00:00
|
|
|
{
|
2021-10-14 10:32:49 +00:00
|
|
|
skipToUnescapedNextLineOrEOF(*in);
|
|
|
|
}
|
2019-12-05 10:13:40 +00:00
|
|
|
|
2021-10-14 10:32:49 +00:00
|
|
|
/// Diagnostic counterpart of skipRowStartDelimiter.
/// Returns true when a '[' was consumed after optional whitespace; otherwise writes an
/// explanation to `out` and returns false.
bool JSONCompactEachRowRowInputFormat::parseRowStartWithDiagnosticInfo(WriteBuffer & out)
{
    auto & buf = *in;
    skipWhitespaceIfAny(buf);

    if (checkChar('[', buf))
        return true;

    out << "ERROR: There is no '[' before the row.\n";
    return false;
}
|
2019-12-05 10:13:40 +00:00
|
|
|
|
2021-10-14 10:32:49 +00:00
|
|
|
/// Diagnostic counterpart of skipFieldDelimiter.
/// Returns true when a ',' was consumed after optional whitespace. Otherwise a human-readable
/// explanation of what was found instead is written to `out` and false is returned.
bool JSONCompactEachRowRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out)
{
    try
    {
        skipWhitespaceIfAny(*in);
        assertChar(',', *in);
    }
    catch (const DB::Exception &)
    {
        /// The failure may be caused by the end of input. In that case there is no current
        /// character, so the buffer position must not be dereferenced.
        if (in->eof())
        {
            out << "ERROR: Unexpected end of file. ',' expected between fields.\n";
        }
        else if (*in->position() == ']')
        {
            out << "ERROR: Closing parenthesis (']') found where comma is expected."
                   " It's like your file has less columns than expected.\n"
                   "And if your file has the right number of columns, maybe it has unescaped quotes in values.\n";
        }
        else
        {
            out << "ERROR: There is no comma. ";
            verbosePrintString(in->position(), in->position() + 1, out);
            out << " found instead.\n";
        }
        return false;
    }

    return true;
}
|
|
|
|
|
2021-10-14 10:32:49 +00:00
|
|
|
/// Diagnostic counterpart of skipRowEndDelimiter.
/// Returns false, after writing an explanation to `out`, when the input ends before the row is
/// closed or when the next character is not ']'. On success the closing bracket, one optional
/// ',' or ';' after it and the surrounding whitespace are consumed, and true is returned.
bool JSONCompactEachRowRowInputFormat::parseRowEndWithDiagnosticInfo(WriteBuffer & out)
{
    auto & buf = *in;
    skipWhitespaceIfAny(buf);

    if (buf.eof())
    {
        out << "ERROR: Unexpected end of file. ']' expected at the end of row.";
        return false;
    }

    if (!checkChar(']', buf))
    {
        out << "ERROR: There is no closing parenthesis (']') at the end of the row. ";
        verbosePrintString(buf.position(), buf.position() + 1, out);
        out << " found instead.\n";
        return false;
    }

    skipWhitespaceIfAny(buf);

    /// Anything after the row is optional; only inspect it when there is more input.
    if (!buf.eof())
    {
        const char next = *buf.position();
        if (next == ',' || next == ';')
            ++buf.position();

        skipWhitespaceIfAny(buf);
    }

    return true;
}
|
|
|
|
|
2021-10-14 10:32:49 +00:00
|
|
|
/// Registers the input format creators for both flavours of the format:
/// "JSONCompactStringsEachRow" (yield_strings = true) and "JSONCompactEachRow" (yield_strings = false).
/// registerWithNamesAndTypes invokes the callback with the concrete format name and the
/// with_names / with_types flags to use for it.
void registerInputFormatJSONCompactEachRow(FormatFactory & factory)
{
    for (bool yield_strings : {true, false})
    {
        const char * base_name = yield_strings ? "JSONCompactStringsEachRow" : "JSONCompactEachRow";

        auto register_func = [&factory, yield_strings](const String & format_name, bool with_names, bool with_types)
        {
            /// The creator copies the flags, so it stays valid after this function returns.
            auto creator = [with_names, with_types, yield_strings](
                ReadBuffer & buf,
                const Block & sample,
                IRowInputFormat::Params params,
                const FormatSettings & settings)
            {
                return std::make_shared<JSONCompactEachRowRowInputFormat>(
                    sample, buf, std::move(params), with_names, with_types, yield_strings, settings);
            };

            factory.registerInputFormat(format_name, creator);
        };

        registerWithNamesAndTypes(base_name, register_func);
    }
}
|
|
|
|
|
2021-10-14 10:32:49 +00:00
|
|
|
/// Registers the file segmentation engine for "JSONCompactEachRow" and "JSONCompactStringsEachRow",
/// using registerWithNamesAndTypes to supply the format name and the with_names / with_types flags.
void registerFileSegmentationEngineJSONCompactEachRow(FormatFactory & factory)
{
    auto register_func = [&factory](const String & format_name, bool with_names, bool with_types)
    {
        /// When names and/or types occupy the first one or two rows, at least one more row
        /// of actual data has to be read. So the minimum number of rows for the segmentation
        /// engine is one data row plus one row per enabled header kind.
        size_t min_rows = 1;
        if (with_names)
            ++min_rows;
        if (with_types)
            ++min_rows;

        auto engine = [min_rows](ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size)
        {
            return fileSegmentationEngineJSONCompactEachRow(in, memory, min_chunk_size, min_rows);
        };

        factory.registerFileSegmentationEngine(format_name, engine);
    };

    for (const char * base_name : {"JSONCompactEachRow", "JSONCompactStringsEachRow"})
        registerWithNamesAndTypes(base_name, register_func);
}
|
|
|
|
|
|
|
|
}
|