ClickHouse/dbms/src/DataStreams/JSONEachRowRowInputStream.cpp

145 lines
3.6 KiB
C++
Raw Normal View History

#include <DB/IO/ReadHelpers.h>
#include <DB/DataStreams/JSONEachRowRowInputStream.h>
namespace DB
{
/// Error codes thrown by this translation unit. They are only declared here;
/// the definitions live elsewhere in the project.
namespace ErrorCodes
{
    /// Thrown when the stream ends in the middle of a JSON object.
    extern const int CANNOT_READ_ALL_DATA;
    /// Thrown on an unknown or duplicated field name.
    extern const int INCORRECT_DATA;
}
/** Creates a reader over `istr_` for the columns described by `sample_`.
  * `skip_unknown_` is stored and later decides whether a field name that is absent
  * from `sample_` is skipped or reported as an error.
  * A leading BOM is consumed, and a lookup table from column name to column position is built.
  */
JSONEachRowRowInputStream::JSONEachRowRowInputStream(ReadBuffer & istr_, const Block & sample_, bool skip_unknown_)
    : istr(istr_), sample(sample_), skip_unknown(skip_unknown_), name_map(sample.columns())
{
    /// In this format a BOM at the very beginning of the stream cannot be mistaken for a value,
    /// so it is safe to drop it.
    skipBOMIfExists(istr);

    /// Remember the position of every column under its name.
    /// NOTE The names could be laid out more cache-locally.
    const size_t num_columns = sample.columns();
    size_t position = 0;
    while (position < num_columns)
    {
        name_map[sample.safeGetByPosition(position).name] = position;
        ++position;
    }
}
2017-03-25 20:12:56 +00:00
/** Reads a field name written as a JSON string.
  *
  * Fast path: if the closing quote is already inside the current buffer and no backslash
  * precedes it, the returned reference points directly at the bytes inside `buf`
  * (nothing is copied). Such a reference should be consumed before further reads from `buf`.
  *
  * Slow path: otherwise the name is parsed with readJSONString into `tmp`,
  * and the returned reference points at `tmp`.
  */
static StringRef readName(ReadBuffer & buf, String & tmp)
{
    const char * const buffer_end = buf.buffer().end();
    const char * const after_first_char = buf.position() + 1;

    if (after_first_char < buffer_end)
    {
        /// Look for whatever comes first: the closing quote or the start of an escape sequence.
        const char * const stop = find_first_symbols<'\\', '"'>(after_first_char, buffer_end);
        const bool closing_quote_in_buffer = stop != buffer_end && *stop != '\\';

        if (closing_quote_in_buffer)
        {
            /// The most likely case: the key has no escape sequences and lies entirely in the buffer.
            assertChar('"', buf);
            StringRef name(buf.position(), stop - buf.position());
            buf.position() += name.size;
            assertChar('"', buf);
            return name;
        }
    }

    /// Either an escape sequence is present or the name crosses the buffer boundary.
    readJSONString(tmp, buf);
    return tmp;
}
static void skipColonDelimeter(ReadBuffer & istr)
{
skipWhitespaceIfAny(istr);
assertChar(':', istr);
skipWhitespaceIfAny(istr);
}
/** Reads one JSON object (one row) from the stream and appends its values to the columns of `block`.
  *
  * Returns false when nothing but whitespace is left in the stream (no row is appended),
  * true after a row has been read. Columns that are not mentioned in the object
  * receive a default value.
  *
  * Throws Exception with
  *  - CANNOT_READ_ALL_DATA if the stream ends inside an object;
  *  - INCORRECT_DATA on a field name that is not in the name map (unless skip_unknown is set)
  *    or on a field that occurs twice in the same object.
  */
bool JSONEachRowRowInputStream::read(Block & block)
{
    skipWhitespaceIfAny(istr);
    if (istr.eof())
        return false;

    assertChar('{', istr);

    size_t columns = block.columns();

    /// Set of columns for which the values were read. The rest will be filled with default values.
    /// TODO Ability to provide your DEFAULTs.
    /// The array is deliberately kept on the stack: this function runs once per row.
    /// NOTE(review): the indices stored in name_map were built from `sample`; this assumes
    /// `block` has the same column layout - confirm against the caller.
    bool read_columns[columns];
    memset(read_columns, 0, columns * sizeof(bool));

    bool first = true;
    while (true)
    {
        skipWhitespaceIfAny(istr);

        if (istr.eof())
            throw Exception("Unexpected end of stream while parsing JSONEachRow format", ErrorCodes::CANNOT_READ_ALL_DATA);
        else if (*istr.position() == '}')
        {
            ++istr.position();
            break;
        }

        /// Every field except the first one must be preceded by a comma.
        if (first)
            first = false;
        else
        {
            assertChar(',', istr);
            skipWhitespaceIfAny(istr);
        }

        StringRef name_ref = readName(istr, name_buf);

        /// NOTE Optimization is possible by caching the order of fields (which is almost always the same)
        /// and a quick check to match the next expected field, instead of searching the hash table.
        auto it = name_map.find(name_ref);
        if (name_map.end() == it)
        {
            if (!skip_unknown)
                throw Exception("Unknown field found while parsing JSONEachRow format: " + name_ref.toString(), ErrorCodes::INCORRECT_DATA);

            /// name_ref may point directly into the read buffer, and the reads below may replace
            /// the buffer contents. Copy the name into name_buf (unless it already lives there)
            /// so that it stays valid while the value is being skipped.
            if (name_ref.data != name_buf.data())
                name_buf.assign(name_ref.data, name_ref.size);
            name_ref = StringRef(name_buf);

            skipColonDelimeter(istr);
            skipJSONFieldPlain(istr, name_ref);
            continue;
        }

        size_t index = it->second;

        /// name_ref is still valid here: nothing has been read from the stream since readName.
        if (read_columns[index])
            throw Exception("Duplicate field found while parsing JSONEachRow format: " + name_ref.toString(), ErrorCodes::INCORRECT_DATA);

        skipColonDelimeter(istr);

        read_columns[index] = true;

        auto & col = block.getByPosition(index);
        col.type.get()->deserializeTextJSON(*col.column.get(), istr);
    }

    /// An optional comma after the object is tolerated and consumed.
    skipWhitespaceIfAny(istr);
    if (!istr.eof() && *istr.position() == ',')
        ++istr.position();

    /// Fill non-visited columns with the default values.
    for (size_t i = 0; i < columns; ++i)
        if (!read_columns[i])
            block.getByPosition(i).column.get()->insertDefault();

    return true;
}
/// Change history note (2017-01-27 04:29:47 +00:00): "Text formats allow to skip errors (#407)".
/// Allow to skip errors in text formats: added settings 'input_format_allow_errors_num' and
/// 'input_format_allow_errors_ratio' [#CLICKHOUSE-2778]; added test [#CLICKHOUSE-2778].
/// See https://github.com/yandex/ClickHouse/issues/134
/// Error-recovery hook: repositions `istr` by delegating to skipToUnescapedNextLineOrEOF,
/// which - judging by its name - skips to the next unescaped line break or to the end of the stream.
/// NOTE(review): presumably called after a failed read() so that parsing can resume with the next row - confirm against the caller.
void JSONEachRowRowInputStream::syncAfterError()
{
skipToUnescapedNextLineOrEOF(istr);
}
}