mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-22 15:42:02 +00:00
Merge pull request #3332 from yandex/fix-performance-regression-while-parsing-jsoneachrow
Fix performance regression in parsing JSONEachRow format.
This commit is contained in:
commit
1dba31a313
@ -12,6 +12,7 @@ namespace ErrorCodes
|
|||||||
{
|
{
|
||||||
extern const int INCORRECT_DATA;
|
extern const int INCORRECT_DATA;
|
||||||
extern const int CANNOT_READ_ALL_DATA;
|
extern const int CANNOT_READ_ALL_DATA;
|
||||||
|
extern const int LOGICAL_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
namespace
|
namespace
|
||||||
@ -47,6 +48,8 @@ JSONEachRowRowInputStream::JSONEachRowRowInputStream(ReadBuffer & istr_, const B
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
prev_positions.assign(num_columns, name_map.end());
|
||||||
}
|
}
|
||||||
|
|
||||||
const String & JSONEachRowRowInputStream::columnName(size_t i) const
|
const String & JSONEachRowRowInputStream::columnName(size_t i) const
|
||||||
@ -54,34 +57,52 @@ const String & JSONEachRowRowInputStream::columnName(size_t i) const
|
|||||||
return header.getByPosition(i).name;
|
return header.getByPosition(i).name;
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t JSONEachRowRowInputStream::columnIndex(const StringRef& name) const
|
inline size_t JSONEachRowRowInputStream::columnIndex(const StringRef & name, size_t key_index)
|
||||||
{
|
{
|
||||||
/// NOTE Optimization is possible by caching the order of fields (which is almost always the same)
|
/// Optimization by caching the order of fields (which is almost always the same)
|
||||||
/// and a quick check to match the next expected field, instead of searching the hash table.
|
/// and a quick check to match the next expected field, instead of searching the hash table.
|
||||||
|
|
||||||
const auto it = name_map.find(name);
|
if (prev_positions.size() > key_index
|
||||||
return name_map.end() == it ? UNKNOWN_FIELD : it->second;
|
&& prev_positions[key_index] != name_map.end()
|
||||||
|
&& name == prev_positions[key_index]->first)
|
||||||
|
{
|
||||||
|
return prev_positions[key_index]->second;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
const auto it = name_map.find(name);
|
||||||
|
|
||||||
|
if (name_map.end() != it)
|
||||||
|
{
|
||||||
|
if (key_index < prev_positions.size())
|
||||||
|
prev_positions[key_index] = it;
|
||||||
|
|
||||||
|
return it->second;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
return UNKNOWN_FIELD;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Read the field name and convert it to column name
|
/** Read the field name and convert it to column name
|
||||||
* (taking into account the current nested name prefix)
|
* (taking into account the current nested name prefix)
|
||||||
|
* Resulting StringRef is valid only before next read from buf.
|
||||||
*/
|
*/
|
||||||
StringRef JSONEachRowRowInputStream::readColumnName(ReadBuffer & buf)
|
StringRef JSONEachRowRowInputStream::readColumnName(ReadBuffer & buf)
|
||||||
{
|
{
|
||||||
// This is just an optimization: try to avoid calling readJSONStringInto()
|
// This is just an optimization: try to avoid copying the name into current_column_name
|
||||||
|
|
||||||
if (nested_prefix_length == 0 && buf.position() + 1 < buf.buffer().end())
|
if (nested_prefix_length == 0 && buf.position() + 1 < buf.buffer().end())
|
||||||
{
|
{
|
||||||
const char * next_pos = find_first_symbols<'\\', '"'>(buf.position() + 1, buf.buffer().end());
|
char * next_pos = find_first_symbols<'\\', '"'>(buf.position() + 1, buf.buffer().end());
|
||||||
|
|
||||||
if (next_pos != buf.buffer().end() && *next_pos != '\\')
|
if (next_pos != buf.buffer().end() && *next_pos != '\\')
|
||||||
{
|
{
|
||||||
/// The most likely option is that there is no escape sequence in the key name, and the entire name is placed in the buffer.
|
/// The most likely option is that there is no escape sequence in the key name, and the entire name is placed in the buffer.
|
||||||
assertChar('"', buf);
|
assertChar('"', buf);
|
||||||
current_column_name.assign(buf.position(), next_pos - buf.position());
|
StringRef res(buf.position(), next_pos - buf.position());
|
||||||
buf.position() += next_pos - buf.position();
|
buf.position() = next_pos + 1;
|
||||||
assertChar('"', buf);
|
return res;
|
||||||
return current_column_name;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -91,7 +112,7 @@ StringRef JSONEachRowRowInputStream::readColumnName(ReadBuffer & buf)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static void skipColonDelimeter(ReadBuffer & istr)
|
static inline void skipColonDelimeter(ReadBuffer & istr)
|
||||||
{
|
{
|
||||||
skipWhitespaceIfAny(istr);
|
skipWhitespaceIfAny(istr);
|
||||||
assertChar(':', istr);
|
assertChar(':', istr);
|
||||||
@ -124,7 +145,7 @@ void JSONEachRowRowInputStream::readField(size_t index, MutableColumns & columns
|
|||||||
read_columns[index] = true;
|
read_columns[index] = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool JSONEachRowRowInputStream::advanceToNextKey(size_t key_index)
|
inline bool JSONEachRowRowInputStream::advanceToNextKey(size_t key_index)
|
||||||
{
|
{
|
||||||
skipWhitespaceIfAny(istr);
|
skipWhitespaceIfAny(istr);
|
||||||
|
|
||||||
@ -151,15 +172,31 @@ void JSONEachRowRowInputStream::readJSONObject(MutableColumns & columns)
|
|||||||
for (size_t key_index = 0; advanceToNextKey(key_index); ++key_index)
|
for (size_t key_index = 0; advanceToNextKey(key_index); ++key_index)
|
||||||
{
|
{
|
||||||
StringRef name_ref = readColumnName(istr);
|
StringRef name_ref = readColumnName(istr);
|
||||||
skipColonDelimeter(istr);
|
const size_t column_index = columnIndex(name_ref, key_index);
|
||||||
|
|
||||||
const size_t column_index = columnIndex(name_ref);
|
if (unlikely(ssize_t(column_index) < 0))
|
||||||
if (column_index == UNKNOWN_FIELD)
|
{
|
||||||
skipUnknownField(name_ref);
|
/// name_ref may point directly to the input buffer
|
||||||
else if (column_index == NESTED_FIELD)
|
/// and input buffer may be filled with new data on next read
|
||||||
readNestedData(name_ref.toString(), columns);
|
/// If we want to use name_ref after another reads from buffer, we must copy it to temporary string.
|
||||||
|
|
||||||
|
current_column_name.assign(name_ref.data, name_ref.size);
|
||||||
|
name_ref = StringRef(current_column_name);
|
||||||
|
|
||||||
|
skipColonDelimeter(istr);
|
||||||
|
|
||||||
|
if (column_index == UNKNOWN_FIELD)
|
||||||
|
skipUnknownField(name_ref);
|
||||||
|
else if (column_index == NESTED_FIELD)
|
||||||
|
readNestedData(name_ref.toString(), columns);
|
||||||
|
else
|
||||||
|
throw Exception("Logical error: illegal value of column_index", ErrorCodes::LOGICAL_ERROR);
|
||||||
|
}
|
||||||
else
|
else
|
||||||
|
{
|
||||||
|
skipColonDelimeter(istr);
|
||||||
readField(column_index, columns);
|
readField(column_index, columns);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -28,7 +28,7 @@ public:
|
|||||||
|
|
||||||
private:
|
private:
|
||||||
const String & columnName(size_t i) const;
|
const String & columnName(size_t i) const;
|
||||||
size_t columnIndex(const StringRef & name) const;
|
size_t columnIndex(const StringRef & name, size_t key_index);
|
||||||
bool advanceToNextKey(size_t key_index);
|
bool advanceToNextKey(size_t key_index);
|
||||||
void skipUnknownField(const StringRef & name_ref);
|
void skipUnknownField(const StringRef & name_ref);
|
||||||
StringRef readColumnName(ReadBuffer & buf);
|
StringRef readColumnName(ReadBuffer & buf);
|
||||||
@ -60,6 +60,9 @@ private:
|
|||||||
/// Hash table match `field name -> position in the block`. NOTE You can use perfect hash map.
|
/// Hash table match `field name -> position in the block`. NOTE You can use perfect hash map.
|
||||||
using NameMap = HashMap<StringRef, size_t, StringRefHash>;
|
using NameMap = HashMap<StringRef, size_t, StringRefHash>;
|
||||||
NameMap name_map;
|
NameMap name_map;
|
||||||
|
|
||||||
|
/// Cached search results for previous row (keyed as index in JSON object) - used as a hint.
|
||||||
|
std::vector<NameMap::iterator> prev_positions;
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -71,7 +71,7 @@ UInt128 stringToUUID(const String & str)
|
|||||||
return parseFromString<UUID>(str);
|
return parseFromString<UUID>(str);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void __attribute__((__noinline__)) throwAtAssertionFailed(const char * s, ReadBuffer & buf)
|
void NO_INLINE throwAtAssertionFailed(const char * s, ReadBuffer & buf)
|
||||||
{
|
{
|
||||||
WriteBufferFromOwnString out;
|
WriteBufferFromOwnString out;
|
||||||
out << "Cannot parse input: expected " << escape << s;
|
out << "Cannot parse input: expected " << escape << s;
|
||||||
@ -120,15 +120,6 @@ void assertString(const char * s, ReadBuffer & buf)
|
|||||||
throwAtAssertionFailed(s, buf);
|
throwAtAssertionFailed(s, buf);
|
||||||
}
|
}
|
||||||
|
|
||||||
void assertChar(char symbol, ReadBuffer & buf)
|
|
||||||
{
|
|
||||||
if (buf.eof() || *buf.position() != symbol)
|
|
||||||
{
|
|
||||||
char err[2] = {symbol, '\0'};
|
|
||||||
throwAtAssertionFailed(err, buf);
|
|
||||||
}
|
|
||||||
++buf.position();
|
|
||||||
}
|
|
||||||
|
|
||||||
void assertEOF(ReadBuffer & buf)
|
void assertEOF(ReadBuffer & buf)
|
||||||
{
|
{
|
||||||
|
@ -162,7 +162,18 @@ void readVectorBinary(std::vector<T> & v, ReadBuffer & buf, size_t MAX_VECTOR_SI
|
|||||||
|
|
||||||
void assertString(const char * s, ReadBuffer & buf);
|
void assertString(const char * s, ReadBuffer & buf);
|
||||||
void assertEOF(ReadBuffer & buf);
|
void assertEOF(ReadBuffer & buf);
|
||||||
void assertChar(char symbol, ReadBuffer & buf);
|
|
||||||
|
void throwAtAssertionFailed(const char * s, ReadBuffer & buf);
|
||||||
|
|
||||||
|
inline void assertChar(char symbol, ReadBuffer & buf)
|
||||||
|
{
|
||||||
|
if (buf.eof() || *buf.position() != symbol)
|
||||||
|
{
|
||||||
|
char err[2] = {symbol, '\0'};
|
||||||
|
throwAtAssertionFailed(err, buf);
|
||||||
|
}
|
||||||
|
++buf.position();
|
||||||
|
}
|
||||||
|
|
||||||
inline void assertString(const String & s, ReadBuffer & buf)
|
inline void assertString(const String & s, ReadBuffer & buf)
|
||||||
{
|
{
|
||||||
|
Loading…
Reference in New Issue
Block a user