Merge pull request #3332 from yandex/fix-performance-regression-while-parsing-jsoneachrow

Fix performance regression in parsing JSONEachRow format.
This commit is contained in:
alexey-milovidov 2018-10-10 02:09:22 +03:00 committed by GitHub
commit 1dba31a313
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 72 additions and 30 deletions

View File

@ -12,6 +12,7 @@ namespace ErrorCodes
{ {
extern const int INCORRECT_DATA; extern const int INCORRECT_DATA;
extern const int CANNOT_READ_ALL_DATA; extern const int CANNOT_READ_ALL_DATA;
extern const int LOGICAL_ERROR;
} }
namespace namespace
@ -47,6 +48,8 @@ JSONEachRowRowInputStream::JSONEachRowRowInputStream(ReadBuffer & istr_, const B
} }
} }
} }
prev_positions.assign(num_columns, name_map.end());
} }
const String & JSONEachRowRowInputStream::columnName(size_t i) const const String & JSONEachRowRowInputStream::columnName(size_t i) const
@ -54,34 +57,52 @@ const String & JSONEachRowRowInputStream::columnName(size_t i) const
return header.getByPosition(i).name; return header.getByPosition(i).name;
} }
size_t JSONEachRowRowInputStream::columnIndex(const StringRef& name) const inline size_t JSONEachRowRowInputStream::columnIndex(const StringRef & name, size_t key_index)
{ {
/// NOTE Optimization is possible by caching the order of fields (which is almost always the same) /// Optimization by caching the order of fields (which is almost always the same)
/// and a quick check to match the next expected field, instead of searching the hash table. /// and a quick check to match the next expected field, instead of searching the hash table.
const auto it = name_map.find(name); if (prev_positions.size() > key_index
return name_map.end() == it ? UNKNOWN_FIELD : it->second; && prev_positions[key_index] != name_map.end()
&& name == prev_positions[key_index]->first)
{
return prev_positions[key_index]->second;
}
else
{
const auto it = name_map.find(name);
if (name_map.end() != it)
{
if (key_index < prev_positions.size())
prev_positions[key_index] = it;
return it->second;
}
else
return UNKNOWN_FIELD;
}
} }
/** Read the field name and convert it to column name /** Read the field name and convert it to column name
* (taking into account the current nested name prefix) * (taking into account the current nested name prefix)
* Resulting StringRef is valid only before next read from buf.
*/ */
StringRef JSONEachRowRowInputStream::readColumnName(ReadBuffer & buf) StringRef JSONEachRowRowInputStream::readColumnName(ReadBuffer & buf)
{ {
// This is just an optimization: try to avoid calling readJSONStringInto() // This is just an optimization: try to avoid copying the name into current_column_name
if (nested_prefix_length == 0 && buf.position() + 1 < buf.buffer().end()) if (nested_prefix_length == 0 && buf.position() + 1 < buf.buffer().end())
{ {
const char * next_pos = find_first_symbols<'\\', '"'>(buf.position() + 1, buf.buffer().end()); char * next_pos = find_first_symbols<'\\', '"'>(buf.position() + 1, buf.buffer().end());
if (next_pos != buf.buffer().end() && *next_pos != '\\') if (next_pos != buf.buffer().end() && *next_pos != '\\')
{ {
/// The most likely option is that there is no escape sequence in the key name, and the entire name is placed in the buffer. /// The most likely option is that there is no escape sequence in the key name, and the entire name is placed in the buffer.
assertChar('"', buf); assertChar('"', buf);
current_column_name.assign(buf.position(), next_pos - buf.position()); StringRef res(buf.position(), next_pos - buf.position());
buf.position() += next_pos - buf.position(); buf.position() = next_pos + 1;
assertChar('"', buf); return res;
return current_column_name;
} }
} }
@ -91,7 +112,7 @@ StringRef JSONEachRowRowInputStream::readColumnName(ReadBuffer & buf)
} }
static void skipColonDelimeter(ReadBuffer & istr) static inline void skipColonDelimeter(ReadBuffer & istr)
{ {
skipWhitespaceIfAny(istr); skipWhitespaceIfAny(istr);
assertChar(':', istr); assertChar(':', istr);
@ -124,7 +145,7 @@ void JSONEachRowRowInputStream::readField(size_t index, MutableColumns & columns
read_columns[index] = true; read_columns[index] = true;
} }
bool JSONEachRowRowInputStream::advanceToNextKey(size_t key_index) inline bool JSONEachRowRowInputStream::advanceToNextKey(size_t key_index)
{ {
skipWhitespaceIfAny(istr); skipWhitespaceIfAny(istr);
@ -151,15 +172,31 @@ void JSONEachRowRowInputStream::readJSONObject(MutableColumns & columns)
for (size_t key_index = 0; advanceToNextKey(key_index); ++key_index) for (size_t key_index = 0; advanceToNextKey(key_index); ++key_index)
{ {
StringRef name_ref = readColumnName(istr); StringRef name_ref = readColumnName(istr);
skipColonDelimeter(istr); const size_t column_index = columnIndex(name_ref, key_index);
const size_t column_index = columnIndex(name_ref); if (unlikely(ssize_t(column_index) < 0))
if (column_index == UNKNOWN_FIELD) {
skipUnknownField(name_ref); /// name_ref may point directly to the input buffer
else if (column_index == NESTED_FIELD) /// and input buffer may be filled with new data on next read
readNestedData(name_ref.toString(), columns); /// If we want to use name_ref after further reads from the buffer, we must copy it to a temporary string.
current_column_name.assign(name_ref.data, name_ref.size);
name_ref = StringRef(current_column_name);
skipColonDelimeter(istr);
if (column_index == UNKNOWN_FIELD)
skipUnknownField(name_ref);
else if (column_index == NESTED_FIELD)
readNestedData(name_ref.toString(), columns);
else
throw Exception("Logical error: illegal value of column_index", ErrorCodes::LOGICAL_ERROR);
}
else else
{
skipColonDelimeter(istr);
readField(column_index, columns); readField(column_index, columns);
}
} }
} }

View File

@ -28,7 +28,7 @@ public:
private: private:
const String & columnName(size_t i) const; const String & columnName(size_t i) const;
size_t columnIndex(const StringRef & name) const; size_t columnIndex(const StringRef & name, size_t key_index);
bool advanceToNextKey(size_t key_index); bool advanceToNextKey(size_t key_index);
void skipUnknownField(const StringRef & name_ref); void skipUnknownField(const StringRef & name_ref);
StringRef readColumnName(ReadBuffer & buf); StringRef readColumnName(ReadBuffer & buf);
@ -60,6 +60,9 @@ private:
/// Hash table match `field name -> position in the block`. NOTE You can use perfect hash map. /// Hash table match `field name -> position in the block`. NOTE You can use perfect hash map.
using NameMap = HashMap<StringRef, size_t, StringRefHash>; using NameMap = HashMap<StringRef, size_t, StringRefHash>;
NameMap name_map; NameMap name_map;
/// Cached search results for previous row (keyed as index in JSON object) - used as a hint.
std::vector<NameMap::iterator> prev_positions;
}; };
} }

View File

@ -71,7 +71,7 @@ UInt128 stringToUUID(const String & str)
return parseFromString<UUID>(str); return parseFromString<UUID>(str);
} }
static void __attribute__((__noinline__)) throwAtAssertionFailed(const char * s, ReadBuffer & buf) void NO_INLINE throwAtAssertionFailed(const char * s, ReadBuffer & buf)
{ {
WriteBufferFromOwnString out; WriteBufferFromOwnString out;
out << "Cannot parse input: expected " << escape << s; out << "Cannot parse input: expected " << escape << s;
@ -120,15 +120,6 @@ void assertString(const char * s, ReadBuffer & buf)
throwAtAssertionFailed(s, buf); throwAtAssertionFailed(s, buf);
} }
void assertChar(char symbol, ReadBuffer & buf)
{
if (buf.eof() || *buf.position() != symbol)
{
char err[2] = {symbol, '\0'};
throwAtAssertionFailed(err, buf);
}
++buf.position();
}
void assertEOF(ReadBuffer & buf) void assertEOF(ReadBuffer & buf)
{ {

View File

@ -162,7 +162,18 @@ void readVectorBinary(std::vector<T> & v, ReadBuffer & buf, size_t MAX_VECTOR_SI
void assertString(const char * s, ReadBuffer & buf); void assertString(const char * s, ReadBuffer & buf);
void assertEOF(ReadBuffer & buf); void assertEOF(ReadBuffer & buf);
void assertChar(char symbol, ReadBuffer & buf);
void throwAtAssertionFailed(const char * s, ReadBuffer & buf);
inline void assertChar(char symbol, ReadBuffer & buf)
{
if (buf.eof() || *buf.position() != symbol)
{
char err[2] = {symbol, '\0'};
throwAtAssertionFailed(err, buf);
}
++buf.position();
}
inline void assertString(const String & s, ReadBuffer & buf) inline void assertString(const String & s, ReadBuffer & buf)
{ {