Merge pull request #3332 from yandex/fix-performance-regression-while-parsing-jsoneachrow

Fix performance regression in parsing JSONEachRow format.
This commit is contained in:
alexey-milovidov 2018-10-10 02:09:22 +03:00 committed by GitHub
commit 1dba31a313
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 72 additions and 30 deletions

View File

@ -12,6 +12,7 @@ namespace ErrorCodes
{
extern const int INCORRECT_DATA;
extern const int CANNOT_READ_ALL_DATA;
extern const int LOGICAL_ERROR;
}
namespace
@ -47,6 +48,8 @@ JSONEachRowRowInputStream::JSONEachRowRowInputStream(ReadBuffer & istr_, const B
}
}
}
prev_positions.assign(num_columns, name_map.end());
}
const String & JSONEachRowRowInputStream::columnName(size_t i) const
@ -54,34 +57,52 @@ const String & JSONEachRowRowInputStream::columnName(size_t i) const
return header.getByPosition(i).name;
}
/// Look up the position of the column called `name`.
/// Returns the value stored in `name_map` for `name`, or UNKNOWN_FIELD when the map has no such key.
/// In the two-argument form, `key_index` is the ordinal of the key inside the JSON object being parsed;
/// it selects the slot of `prev_positions` that is tried before the hash table is searched.
/// NOTE(review): this listing appears to contain two revisions of the function interleaved
/// (a one-argument const version doing a plain lookup, and the two-argument cached version) -
/// confirm which lines are live before changing any code here.
size_t JSONEachRowRowInputStream::columnIndex(const StringRef& name) const
inline size_t JSONEachRowRowInputStream::columnIndex(const StringRef & name, size_t key_index)
{
/// NOTE Optimization is possible by caching the order of fields (which is almost always the same)
/// Optimization by caching the order of fields (which is almost always the same)
/// and a quick check to match the next expected field, instead of searching the hash table.
const auto it = name_map.find(name);
return name_map.end() == it ? UNKNOWN_FIELD : it->second;
/// Fast path: the slot for this key position exists, holds a valid iterator,
/// and that iterator's key equals `name` - return the cached value without a hash lookup.
if (prev_positions.size() > key_index
&& prev_positions[key_index] != name_map.end()
&& name == prev_positions[key_index]->first)
{
return prev_positions[key_index]->second;
}
else
{
/// Slow path: ordinary hash table lookup.
const auto it = name_map.find(name);
if (name_map.end() != it)
{
/// Remember the hit for this key position; key positions beyond prev_positions.size() are not cached.
if (key_index < prev_positions.size())
prev_positions[key_index] = it;
return it->second;
}
else
return UNKNOWN_FIELD;
}
}
/** Read the field name and convert it to a column name
* (taking into account the current nested name prefix).
* The resulting StringRef is valid only until the next read from buf.
*/
StringRef JSONEachRowRowInputStream::readColumnName(ReadBuffer & buf)
{
// This is just an optimization: try to avoid calling readJSONStringInto()
// This is just an optimization: try to avoid copying the name into current_column_name
if (nested_prefix_length == 0 && buf.position() + 1 < buf.buffer().end())
{
const char * next_pos = find_first_symbols<'\\', '"'>(buf.position() + 1, buf.buffer().end());
char * next_pos = find_first_symbols<'\\', '"'>(buf.position() + 1, buf.buffer().end());
if (next_pos != buf.buffer().end() && *next_pos != '\\')
{
/// The most likely option is that there is no escape sequence in the key name, and the entire name is placed in the buffer.
assertChar('"', buf);
current_column_name.assign(buf.position(), next_pos - buf.position());
buf.position() += next_pos - buf.position();
assertChar('"', buf);
return current_column_name;
StringRef res(buf.position(), next_pos - buf.position());
buf.position() = next_pos + 1;
return res;
}
}
@ -91,7 +112,7 @@ StringRef JSONEachRowRowInputStream::readColumnName(ReadBuffer & buf)
}
static void skipColonDelimeter(ReadBuffer & istr)
static inline void skipColonDelimeter(ReadBuffer & istr)
{
skipWhitespaceIfAny(istr);
assertChar(':', istr);
@ -124,7 +145,7 @@ void JSONEachRowRowInputStream::readField(size_t index, MutableColumns & columns
read_columns[index] = true;
}
bool JSONEachRowRowInputStream::advanceToNextKey(size_t key_index)
inline bool JSONEachRowRowInputStream::advanceToNextKey(size_t key_index)
{
skipWhitespaceIfAny(istr);
@ -151,15 +172,31 @@ void JSONEachRowRowInputStream::readJSONObject(MutableColumns & columns)
/// Process the keys of the current JSON object one by one; key_index counts the keys in the order they are read.
/// NOTE(review): two revisions of the loop body appear to be interleaved in this listing
/// (e.g. two declarations of column_index) - confirm which lines are live before changing any code here.
for (size_t key_index = 0; advanceToNextKey(key_index); ++key_index)
{
StringRef name_ref = readColumnName(istr);
skipColonDelimeter(istr);
/// key_index is passed along so that columnIndex can first try its cached result for this key position.
const size_t column_index = columnIndex(name_ref, key_index);
const size_t column_index = columnIndex(name_ref);
if (column_index == UNKNOWN_FIELD)
skipUnknownField(name_ref);
else if (column_index == NESTED_FIELD)
readNestedData(name_ref.toString(), columns);
/// Only UNKNOWN_FIELD and NESTED_FIELD are handled in this branch; any other value throws LOGICAL_ERROR.
/// Presumably both are size_t sentinels that become negative when cast to ssize_t - confirm at their definition.
if (unlikely(ssize_t(column_index) < 0))
{
/// name_ref may point directly into the input buffer,
/// and the input buffer may be refilled with new data on the next read.
/// If we want to use name_ref after further reads from the buffer, we must copy it to a temporary string.
current_column_name.assign(name_ref.data, name_ref.size);
name_ref = StringRef(current_column_name);
/// The colon is consumed only after the name has been copied, because this call reads from the buffer.
skipColonDelimeter(istr);
if (column_index == UNKNOWN_FIELD)
skipUnknownField(name_ref);
else if (column_index == NESTED_FIELD)
readNestedData(name_ref.toString(), columns);
else
throw Exception("Logical error: illegal value of column_index", ErrorCodes::LOGICAL_ERROR);
}
else
{
/// Known column: name_ref is not used past this point, so no copy of the name is made.
skipColonDelimeter(istr);
readField(column_index, columns);
}
}
}

View File

@ -28,7 +28,7 @@ public:
private:
const String & columnName(size_t i) const;
size_t columnIndex(const StringRef & name) const;
size_t columnIndex(const StringRef & name, size_t key_index);
bool advanceToNextKey(size_t key_index);
void skipUnknownField(const StringRef & name_ref);
StringRef readColumnName(ReadBuffer & buf);
@ -60,6 +60,9 @@ private:
/// Hash table match `field name -> position in the block`. NOTE You can use perfect hash map.
using NameMap = HashMap<StringRef, size_t, StringRefHash>;
NameMap name_map;
/// Cached search results for previous row (keyed as index in JSON object) - used as a hint.
std::vector<NameMap::iterator> prev_positions;
};
}

View File

@ -71,7 +71,7 @@ UInt128 stringToUUID(const String & str)
return parseFromString<UUID>(str);
}
static void __attribute__((__noinline__)) throwAtAssertionFailed(const char * s, ReadBuffer & buf)
void NO_INLINE throwAtAssertionFailed(const char * s, ReadBuffer & buf)
{
WriteBufferFromOwnString out;
out << "Cannot parse input: expected " << escape << s;
@ -120,15 +120,6 @@ void assertString(const char * s, ReadBuffer & buf)
throwAtAssertionFailed(s, buf);
}
/// Consume exactly one character from `buf`, requiring it to be `symbol`.
/// If the buffer is at EOF or the next character differs, the failure is reported through
/// throwAtAssertionFailed with `symbol` as a one-character C string.
void assertChar(char symbol, ReadBuffer & buf)
{
    /// eof() must be evaluated before the current position is dereferenced.
    const bool next_char_matches = !buf.eof() && *buf.position() == symbol;

    if (!next_char_matches)
    {
        const char expected[2] = {symbol, '\0'};
        throwAtAssertionFailed(expected, buf);
    }

    buf.position() += 1;
}
void assertEOF(ReadBuffer & buf)
{

View File

@ -162,7 +162,18 @@ void readVectorBinary(std::vector<T> & v, ReadBuffer & buf, size_t MAX_VECTOR_SI
void assertString(const char * s, ReadBuffer & buf);
void assertEOF(ReadBuffer & buf);
void assertChar(char symbol, ReadBuffer & buf);
void throwAtAssertionFailed(const char * s, ReadBuffer & buf);
/// Require the next character of `buf` to be `symbol` and step over it.
/// At EOF, or when a different character is found, throwAtAssertionFailed is called
/// with the expected character formatted as a NUL-terminated string.
/// Defined in the header so that calls can be inlined.
inline void assertChar(char symbol, ReadBuffer & buf)
{
    /// Check eof() first: the position is only dereferenced when data is available.
    const bool at_expected_char = !buf.eof() && *buf.position() == symbol;

    if (!at_expected_char)
    {
        const char expected[2] = {symbol, '\0'};
        throwAtAssertionFailed(expected, buf);
    }

    buf.position() += 1;
}
inline void assertString(const String & s, ReadBuffer & buf)
{