ClickHouse/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp

226 lines
7.7 KiB
C++
Raw Normal View History

2019-02-19 18:41:18 +00:00
#include <IO/ReadHelpers.h>
#include <Processors/Formats/Impl/TSKVRowInputFormat.h>
#include <Formats/FormatFactory.h>
2021-03-09 14:46:52 +00:00
#include <DataTypes/Serializations/SerializationNullable.h>
2019-02-19 18:41:18 +00:00
namespace DB
{
/// Error codes thrown by the TSKV parser in this file.
/// They are only declared here (`extern`); the definitions live elsewhere.
namespace ErrorCodes
{
/// Unknown field, duplicate field, or a field without a value (other than the `tskv` marker).
extern const int INCORRECT_DATA;
/// The stream ended right after a backslash inside a field name.
extern const int CANNOT_PARSE_ESCAPE_SEQUENCE;
/// The stream ended in the middle of a field name or right after a field.
extern const int CANNOT_READ_ALL_DATA;
/// A field is followed by something other than a tab or a newline.
extern const int CANNOT_PARSE_INPUT_ASSERTION_FAILED;
}
2019-08-03 11:02:40 +00:00
/** Constructs the TSKV row reader.
  * `in_` is the buffer to read from, `header_` describes the expected columns,
  * `params_` is forwarded to IRowInputFormat, `format_settings_` is stored for use while parsing.
  * After construction `name_map` maps every header column name to its position.
  */
TSKVRowInputFormat::TSKVRowInputFormat(ReadBuffer & in_, Block header_, Params params_, const FormatSettings & format_settings_)
    : IRowInputFormat(std::move(header_), in_, std::move(params_))
    , format_settings(format_settings_)
    /// `header_` has already been moved into the base class above, so the size hint
    /// must be taken from the header held by the port, not from the moved-from argument.
    , name_map(getPort().getHeader().columns())
{
    const auto & sample_block = getPort().getHeader();
    const size_t num_columns = sample_block.columns();

    for (size_t i = 0; i < num_columns; ++i)
        name_map[sample_block.getByPosition(i).name] = i;        /// NOTE You could place names more cache-locally.
}
/// Called before the first row: consumes a byte order mark if the stream starts with one.
void TSKVRowInputFormat::readPrefix()
{
    /// A column name is assumed never to contain a BOM, so a BOM at the very start of the
    /// stream cannot be mistaken for (part of) a field name and can be dropped safely.
    auto & input = *in;
    skipBOMIfExists(input);
}
2019-02-19 18:41:18 +00:00
/** Read the field name in the `tskv` format.
  * Returns true if the name is followed by an equal sign,
  * otherwise (field with no value) returns false.
  *
  * `ref` is set to reference the field name: either directly inside `buf`
  * (when the whole name was found in the current buffer without escapes),
  * or inside `tmp`, which is used as scratch storage in all other cases.
  *
  * On return the position of `buf` is after the name and after the equal sign (if any);
  * a terminating tab or newline is left unread.
  *
  * Throws CANNOT_PARSE_ESCAPE_SEQUENCE if the stream ends right after a backslash
  * and CANNOT_READ_ALL_DATA if the stream ends before the name is terminated.
  */
static bool readName(ReadBuffer & buf, StringRef & ref, String & tmp)
{
    tmp.clear();

    while (!buf.eof())
    {
        const char * next_pos = find_first_symbols<'\t', '\n', '\\', '='>(buf.position(), buf.buffer().end());

        if (next_pos == buf.buffer().end())
        {
            /// No terminator in the current buffer: keep what we have and refill.
            tmp.append(buf.position(), next_pos - buf.position());
            /// The copied bytes are consumed; move the position to the end so that
            /// the buffer is refilled without any data still marked as pending.
            buf.position() = buf.buffer().end();
            buf.next();
            continue;
        }

        /// Came to the end of the name.
        if (*next_pos != '\\')
        {
            bool have_value = *next_pos == '=';
            if (tmp.empty())
            {
                /// No need to copy data, you can refer directly to the `buf`.
                ref = StringRef(buf.position(), next_pos - buf.position());
                buf.position() += next_pos + have_value - buf.position();
            }
            else
            {
                /// Copy the data to a temporary string and return a reference to it.
                tmp.append(buf.position(), next_pos - buf.position());
                buf.position() += next_pos + have_value - buf.position();
                ref = StringRef(tmp);
            }
            return have_value;
        }
        /// The name has an escape sequence.
        else
        {
            /// Keep the part before the backslash, then step over the backslash itself.
            tmp.append(buf.position(), next_pos - buf.position());
            buf.position() += next_pos + 1 - buf.position();
            if (buf.eof())
                throw Exception("Cannot parse escape sequence", ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE);

            tmp.push_back(parseEscapeSequence(*buf.position()));
            ++buf.position();
            continue;
        }
    }

    throw ParsingException("Unexpected end of stream while reading key name from TSKV format", ErrorCodes::CANNOT_READ_ALL_DATA);
}
/** Reads one row: a sequence of `name=value` fields separated by tabs and terminated by a newline.
  *
  * Returns false if the stream is already at EOF, true after a row was read.
  * For every column of the header exactly one value is appended to `columns`:
  * the parsed value if the field was present, the type's default otherwise.
  * `ext.read_columns` tells, per column, whether a real value was read.
  *
  * Throws INCORRECT_DATA for an unknown field (unless `skip_unknown_fields` is set),
  * a duplicate field, or a field without a value other than the `tskv` marker;
  * CANNOT_READ_ALL_DATA if the stream ends right after a field;
  * CANNOT_PARSE_INPUT_ASSERTION_FAILED if a field is followed by anything but a tab or a newline.
  */
bool TSKVRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & ext)
{
    if (in->eof())
        return false;

    const auto & header = getPort().getHeader();
    size_t num_columns = columns.size();

    /// Set of columns for which the values were read. The rest will be filled with default values.
    read_columns.assign(num_columns, false);
    seen_columns.assign(num_columns, false);

    if (unlikely(*in->position() == '\n'))
    {
        /// An empty string. It is permissible, but it is unclear why.
        ++in->position();
    }
    else
    {
        while (true)
        {
            /// NOTE(review): `name_ref` may point directly into the read buffer; it is still used in
            /// error messages after the value has been read - confirm the buffer cannot be refilled in between.
            StringRef name_ref;
            bool has_value = readName(*in, name_ref, name_buf);

            /// Position of the column this field belongs to, or -1 if nothing was written to a column.
            ssize_t index = -1;

            if (has_value)
            {
                /// NOTE Optimization is possible by caching the order of fields (which is almost always the same)
                /// and quickly checking for the next expected field, instead of searching the hash table.
                auto * it = name_map.find(name_ref);
                if (!it)
                {
                    if (!format_settings.skip_unknown_fields)
                        throw Exception("Unknown field found while parsing TSKV format: " + name_ref.toString(), ErrorCodes::INCORRECT_DATA);

                    /// If the key is not found, skip the value.
                    NullOutput sink;
                    readEscapedStringInto(sink, *in);
                }
                else
                {
                    index = it->getMapped();

                    if (seen_columns[index])
                        throw Exception("Duplicate field found while parsing TSKV format: " + name_ref.toString(), ErrorCodes::INCORRECT_DATA);

                    seen_columns[index] = read_columns[index] = true;

                    /// Reuse the header bound above instead of fetching it again for every field.
                    const auto & type = header.getByPosition(index).type;
                    const auto & serialization = serializations[index];

                    /// With `null_as_default`, the result of the call tells whether the column counts as read.
                    if (format_settings.null_as_default && !type->isNullable())
                        read_columns[index] = SerializationNullable::deserializeTextEscapedImpl(*columns[index], *in, format_settings, serialization);
                    else
                        serialization->deserializeTextEscaped(*columns[index], *in, format_settings);
                }
            }
            else
            {
                /// The only thing that can go without value is `tskv` fragment that is ignored.
                if (!(name_ref.size == 4 && 0 == memcmp(name_ref.data, "tskv", 4)))
                    throw Exception("Found field without value while parsing TSKV format: " + name_ref.toString(), ErrorCodes::INCORRECT_DATA);
            }

            if (in->eof())
            {
                throw ParsingException("Unexpected end of stream after field in TSKV format: " + name_ref.toString(), ErrorCodes::CANNOT_READ_ALL_DATA);
            }
            else if (*in->position() == '\t')
            {
                /// Field separator: go on with the next field of the same row.
                ++in->position();
                continue;
            }
            else if (*in->position() == '\n')
            {
                /// End of the row.
                ++in->position();
                break;
            }
            else
            {
                /// Possibly a garbage was written into column, remove it
                if (index >= 0)
                {
                    columns[index]->popBack(1);
                    seen_columns[index] = read_columns[index] = false;
                }

                throw Exception("Found garbage after field in TSKV format: " + name_ref.toString(), ErrorCodes::CANNOT_PARSE_INPUT_ASSERTION_FAILED);
            }
        }
    }

    /// Fill in the not met columns with default values.
    for (size_t i = 0; i < num_columns; ++i)
        if (!seen_columns[i])
            header.getByPosition(i).type->insertDefaultInto(*columns[i]);

    /// return info about defaults set
    ext.read_columns = read_columns;

    return true;
}
/// Error recovery: skips the rest of the current (broken) row,
/// i.e. everything up to the next unescaped line feed or the end of the stream.
void TSKVRowInputFormat::syncAfterError()
{
    skipToUnescapedNextLineOrEOF(*in);
}
/// Brings the parser back to its initial state: resets the base class
/// and drops all per-row scratch data kept by this format.
void TSKVRowInputFormat::resetParser()
{
    IRowInputFormat::resetParser();

    /// Scratch buffer for field names, then the per-column flags of the last row.
    name_buf.clear();
    seen_columns.clear();
    read_columns.clear();
}
2021-10-11 16:11:50 +00:00
/// Registers the "TSKV" input format in `factory`.
/// The registered creator builds a TSKVRowInputFormat from the given buffer, sample block, params and settings.
void registerInputFormatTSKV(FormatFactory & factory)
{
    factory.registerInputFormat("TSKV", [](
        ReadBuffer & buf,
        const Block & sample,
        IRowInputFormat::Params params,
        const FormatSettings & settings)
    {
        return std::make_shared<TSKVRowInputFormat>(buf, sample, std::move(params), settings);
    });
}
}