Merge pull request #54613 from bigo-sg/improve_json_query

Improve JSON SQL functions by serializing JSON elements directly into the column's buffer
This commit is contained in:
robot-clickhouse 2023-09-15 19:35:30 +02:00 committed by GitHub
commit 51851ecc21
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 370 additions and 33 deletions

View File

@ -8,6 +8,9 @@
# include <base/defines.h>
# include <simdjson.h>
# include "ElementTypes.h"
# include <Common/PODArray_fwd.h>
# include <Common/PODArray.h>
# include <charconv>
namespace DB
{
@ -16,6 +19,254 @@ namespace ErrorCodes
extern const int CANNOT_ALLOCATE_MEMORY;
}
/// Formats elements of basic types into a string.
/// The original implementation is mini_formatter in simdjson.h, but it is not part of the public API,
/// so we add an implementation here.
class SimdJSONBasicFormatter
{
public:
/// Binds the formatter to `buffer_`; every method below appends its output to that buffer.
explicit SimdJSONBasicFormatter(PaddedPODArray<UInt8> & buffer_) : buffer(buffer_) {}
/** Prints a comma **/
inline void comma() { oneChar(','); }
/** Start an array, prints [ **/
inline void startArray() { oneChar('['); }
/** End an array, prints ] **/
inline void endArray() { oneChar(']'); }
/** Start an object, prints { **/
inline void startObject() { oneChar('{'); }
/** End an object, prints } **/
inline void endObject() { oneChar('}'); }
/** Prints a true **/
inline void trueAtom()
{
const char * s = "true";
buffer.insert(s, s + 4);
}
/** Prints a false **/
inline void falseAtom()
{
const char * s = "false";
buffer.insert(s, s + 5);
}
/** Prints a null **/
inline void nullAtom()
{
const char * s = "null";
buffer.insert(s, s + 4);
}
/** Prints a signed 64-bit integer in decimal **/
inline void number(int64_t x)
{
// A 64-bit integer needs at most 20 characters (sign included), so the
// 24-byte scratch buffer cannot overflow and res.ec is not checked.
char number_buffer[24];
auto res = std::to_chars(number_buffer, number_buffer + sizeof(number_buffer), x);
buffer.insert(number_buffer, res.ptr);
}
/** Prints an unsigned 64-bit integer in decimal **/
inline void number(uint64_t x)
{
// At most 20 digits are needed, so the 24-byte scratch buffer cannot
// overflow and res.ec is not checked.
char number_buffer[24];
auto res = std::to_chars(number_buffer, number_buffer + sizeof(number_buffer), x);
buffer.insert(number_buffer, res.ptr);
}
/** Prints a double using the default std::to_chars formatting **/
inline void number(double x)
{
// res.ec is not checked here.
// NOTE(review): 24 bytes looks sufficient for the longest shortest-round-trip
// double (e.g. -1.7976931348623157e+308 is exactly 24 characters) - confirm.
char number_buffer[24];
auto res = std::to_chars(number_buffer, number_buffer + sizeof(number_buffer), x);
buffer.insert(number_buffer, res.ptr);
}
/** Prints a key (string + colon) **/
inline void key(std::string_view unescaped)
{
string(unescaped);
oneChar(':');
}
/** Prints a string surrounded by double quotes. The string is escaped as needed. **/
inline void string(std::string_view unescaped)
{
oneChar('\"');
size_t i = 0;
// Fast path for the case where we have no control character, no ", and no backslash.
// This should include most keys.
//
// We would like to use 'bool' but some compilers take offense to bitwise operation
// with bool types.
//
// The table is indexed by byte value: entries 0x00-0x1F (control characters),
// 0x22 (double quote) and 0x5C (backslash) are 1, all other entries are 0.
constexpr static char needs_escaping[] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
// Scan eight bytes per iteration; stop at the first group that contains a byte needing escaping.
for (; i + 8 <= unescaped.length(); i += 8)
{
// Poor man's vectorization. This could get much faster if we used SIMD.
//
// It is not the case that replacing '|' with '||' would be neutral performance-wise.
if (needs_escaping[uint8_t(unescaped[i])] | needs_escaping[uint8_t(unescaped[i + 1])]
| needs_escaping[uint8_t(unescaped[i + 2])] | needs_escaping[uint8_t(unescaped[i + 3])]
| needs_escaping[uint8_t(unescaped[i + 4])] | needs_escaping[uint8_t(unescaped[i + 5])]
| needs_escaping[uint8_t(unescaped[i + 6])] | needs_escaping[uint8_t(unescaped[i + 7])])
{
break;
}
}
// Byte-by-byte scan: pins down the exact offending position inside the group
// found above, or walks the tail of fewer than eight bytes.
for (; i < unescaped.length(); i++)
{
if (needs_escaping[uint8_t(unescaped[i])])
{
break;
}
}
// The following is also possible and omits a 256-byte table, but it is slower:
// for (; (i < unescaped.length()) && (uint8_t(unescaped[i]) > 0x1F)
// && (unescaped[i] != '\"') && (unescaped[i] != '\\'); i++) {}
// At least for long strings, the following should be fast. We could
// do better by integrating the checks and the insertion.
// Copy the clean prefix [0, i) in a single insert.
buffer.insert(unescaped.data(), unescaped.data() + i);
// We enter this loop (slow) only if we caught a character that needs escaping.
// Note that we do not restart from the beginning, but rather we continue
// from the point where we encountered something that requires escaping.
for (; i < unescaped.length(); i++)
{
switch (unescaped[i])
{
case '\"': {
const char * s = "\\\"";
buffer.insert(s, s + 2);
}
break;
case '\\': {
const char * s = "\\\\";
buffer.insert(s, s + 2);
}
break;
default:
if (uint8_t(unescaped[i]) <= 0x1F)
{
// If packed, this uses 8 * 32 bytes.
// Note that we expect most compilers to embed this code in the data
// section.
// Each entry pairs the escape text for control character 0x00-0x1F with its length.
constexpr static simdjson::escape_sequence escaped[32] = {
{6, "\\u0000"}, {6, "\\u0001"}, {6, "\\u0002"}, {6, "\\u0003"}, {6, "\\u0004"}, {6, "\\u0005"}, {6, "\\u0006"},
{6, "\\u0007"}, {2, "\\b"}, {2, "\\t"}, {2, "\\n"}, {6, "\\u000b"}, {2, "\\f"}, {2, "\\r"},
{6, "\\u000e"}, {6, "\\u000f"}, {6, "\\u0010"}, {6, "\\u0011"}, {6, "\\u0012"}, {6, "\\u0013"}, {6, "\\u0014"},
{6, "\\u0015"}, {6, "\\u0016"}, {6, "\\u0017"}, {6, "\\u0018"}, {6, "\\u0019"}, {6, "\\u001a"}, {6, "\\u001b"},
{6, "\\u001c"}, {6, "\\u001d"}, {6, "\\u001e"}, {6, "\\u001f"}};
auto u = escaped[uint8_t(unescaped[i])];
buffer.insert(u.string, u.string + u.length);
}
else
{
oneChar(unescaped[i]);
}
} // switch
} // for
oneChar('\"');
}
/** Appends a single character to the buffer **/
inline void oneChar(char c)
{
buffer.push_back(c);
}
private:
/// Destination of all output; held by reference, not owned by the formatter.
PaddedPODArray<UInt8> & buffer;
};
/// Formats simdjson DOM values (element, array, object, key-value pair) into a string.
/// Similar to string_builder in simdjson.h.
class SimdJSONElementFormatter
{
public:
explicit SimdJSONElementFormatter(PaddedPODArray<UInt8> & buffer_) : format(buffer_) {}
/** Append an element to the builder (to be printed) **/
inline void append(simdjson::dom::element value)
{
switch (value.type())
{
case simdjson::dom::element_type::UINT64: {
format.number(value.get_uint64().value_unsafe());
break;
}
case simdjson::dom::element_type::INT64: {
format.number(value.get_int64().value_unsafe());
break;
}
case simdjson::dom::element_type::DOUBLE: {
format.number(value.get_double().value_unsafe());
break;
}
case simdjson::dom::element_type::STRING: {
format.string(value.get_string().value_unsafe());
break;
}
case simdjson::dom::element_type::BOOL: {
if (value.get_bool().value_unsafe())
format.trueAtom();
else
format.falseAtom();
break;
}
case simdjson::dom::element_type::NULL_VALUE: {
format.nullAtom();
break;
}
case simdjson::dom::element_type::ARRAY: {
append(value.get_array().value_unsafe());
break;
}
case simdjson::dom::element_type::OBJECT: {
append(value.get_object().value_unsafe());
break;
}
}
}
/** Append an array to the builder (to be printed) **/
inline void append(simdjson::dom::array value)
{
format.startArray();
auto iter = value.begin();
auto end = value.end();
if (iter != end)
{
append(*iter);
for (++iter; iter != end; ++iter)
{
format.comma();
append(*iter);
}
}
format.endArray();
}
inline void append(simdjson::dom::object value)
{
format.startObject();
auto pair = value.begin();
auto end = value.end();
if (pair != end)
{
append(*pair);
for (++pair; pair != end; ++pair)
{
format.comma();
append(*pair);
}
}
format.endObject();
}
inline void append(simdjson::dom::key_value_pair kv)
{
format.key(kv.key);
append(kv.value);
}
private:
SimdJSONBasicFormatter format;
};
/// This class can be used as an argument for the template class FunctionJSON.
/// It provides ability to parse JSONs using simdjson library.
struct SimdJSONParser

View File

@ -35,10 +35,92 @@ extern const int TOO_FEW_ARGUMENTS_FOR_FUNCTION;
extern const int BAD_ARGUMENTS;
}
/// The operator << is implemented for json elements, so a stringstream can be used to serialize them.
/// But stringstream has poor performance, so using it is not recommended.
template <typename Element>
class DefaultJSONStringSerializer
{
public:
    explicit DefaultJSONStringSerializer(ColumnString & col_str_) : col_str(col_str_) { }

    /// Appends `len` bytes starting at `ptr` to the pending output.
    inline void addRawData(const char * ptr, size_t len) { addRawString(std::string_view(ptr, len)); }

    /// Appends `str` to the pending output as is.
    inline void addRawString(std::string_view str) { out << str; }

    /// Streams the wrapped json element into the pending output via its operator <<.
    inline void addElement(const Element & element) { out << element.getElement(); }

    /// Inserts everything accumulated so far into the column as one value.
    inline void commit()
    {
        const std::string serialized = out.str();
        col_str.insertData(serialized.data(), serialized.size());
    }

    /// Nothing to undo: the column is only touched by commit().
    inline void rollback() {}

private:
    ColumnString & col_str;
    std::stringstream out; // STYLE_CHECK_ALLOW_STD_STRING_STREAM
};
/// A more efficient way to serialize json elements into the destination column.
/// The formatter takes the chars buffer of the ColumnString and puts data into it directly.
template<typename Element, typename Formatter>
class JSONStringSerializer
{
public:
    /// Remembers the column's last offset (0 if it has none) so that a
    /// partially written value can be discarded by rollback().
    explicit JSONStringSerializer(ColumnString & col_str_)
        : col_str(col_str_)
        , chars(col_str_.getChars())
        , offsets(col_str_.getOffsets())
        , formatter(col_str_.getChars())
        , prev_offset(offsets.empty() ? 0 : offsets.back())
    {
    }

    /// Appends `len` bytes starting at `ptr` to the column's chars buffer.
    inline void addRawData(const char * ptr, size_t len)
    {
        chars.insert(ptr, ptr + len);
    }

    /// Appends `str` to the column's chars buffer as is.
    inline void addRawString(std::string_view str)
    {
        addRawData(str.data(), str.size());
    }

    /// Lets the formatter write the json element into the chars buffer.
    inline void addElement(const Element & element)
    {
        formatter.append(element.getElement());
    }

    /// Finishes the value: appends a zero byte, then records the new end offset.
    inline void commit()
    {
        chars.push_back(0);
        offsets.push_back(chars.size());
    }

    /// Drops everything written since construction by shrinking chars back
    /// to the offset captured in the constructor.
    inline void rollback()
    {
        chars.resize(prev_offset);
    }

private:
    ColumnString & col_str;
    ColumnString::Chars & chars;
    IColumn::Offsets & offsets;
    Formatter formatter;
    size_t prev_offset;
};
/// Stateless placeholder type: it declares no members and no serializer methods.
class EmptyJSONStringSerializer
{
};
class FunctionSQLJSONHelpers
{
public:
template <typename Name, template <typename> typename Impl, class JSONParser>
template <typename Name, typename Impl, class JSONParser>
class Executor
{
public:
@ -116,7 +198,7 @@ public:
bool document_ok = false;
/// Parse JSON for every row
Impl<JSONParser> impl;
Impl impl;
for (const auto i : collections::range(0, input_rows_count))
{
std::string_view json{
@ -138,7 +220,7 @@ public:
};
};
template <typename Name, template <typename> typename Impl>
template <typename Name, template <typename, typename> typename Impl>
class FunctionSQLJSON : public IFunction, WithConstContext
{
public:
@ -155,7 +237,8 @@ public:
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
{
return Impl<DummyJSONParser>::getReturnType(Name::name, arguments, getContext());
return Impl<DummyJSONParser, DefaultJSONStringSerializer<DummyJSONParser::Element>>::getReturnType(
Name::name, arguments, getContext());
}
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override
@ -168,9 +251,14 @@ public:
unsigned parse_depth = static_cast<unsigned>(getContext()->getSettingsRef().max_parser_depth);
#if USE_SIMDJSON
if (getContext()->getSettingsRef().allow_simdjson)
return FunctionSQLJSONHelpers::Executor<Name, Impl, SimdJSONParser>::run(arguments, result_type, input_rows_count, parse_depth, getContext());
return FunctionSQLJSONHelpers::Executor<
Name,
Impl<SimdJSONParser, JSONStringSerializer<SimdJSONParser::Element, SimdJSONElementFormatter>>,
SimdJSONParser>::run(arguments, result_type, input_rows_count, parse_depth, getContext());
#endif
return FunctionSQLJSONHelpers::Executor<Name, Impl, DummyJSONParser>::run(arguments, result_type, input_rows_count, parse_depth, getContext());
return FunctionSQLJSONHelpers::
Executor<Name, Impl<DummyJSONParser, DefaultJSONStringSerializer<DummyJSONParser::Element>>, DummyJSONParser>::run(
arguments, result_type, input_rows_count, parse_depth, getContext());
}
};
@ -189,7 +277,7 @@ struct NameJSONQuery
static constexpr auto name{"JSON_QUERY"};
};
template <typename JSONParser>
template <typename JSONParser, typename JSONStringSerializer>
class JSONExistsImpl
{
public:
@ -228,7 +316,7 @@ public:
}
};
template <typename JSONParser>
template <typename JSONParser, typename JSONStringSerializer>
class JSONValueImpl
{
public:
@ -279,11 +367,7 @@ public:
if (status == VisitorStatus::Exhausted)
return false;
std::stringstream out; // STYLE_CHECK_ALLOW_STD_STRING_STREAM
out << current_element.getElement();
auto output_str = out.str();
ColumnString * col_str;
ColumnString * col_str = nullptr;
if (isColumnNullable(dest))
{
ColumnNullable & col_null = assert_cast<ColumnNullable &>(dest);
@ -294,20 +378,15 @@ public:
{
col_str = assert_cast<ColumnString *>(&dest);
}
ColumnString::Chars & data = col_str->getChars();
ColumnString::Offsets & offsets = col_str->getOffsets();
JSONStringSerializer json_serializer(*col_str);
if (current_element.isString())
{
ReadBufferFromString buf(output_str);
readJSONStringInto(data, buf);
data.push_back(0);
offsets.push_back(data.size());
auto str = current_element.getString();
json_serializer.addRawString(str);
}
else
{
col_str->insertData(output_str.data(), output_str.size());
}
json_serializer.addElement(current_element);
json_serializer.commit();
return true;
}
};
@ -316,7 +395,7 @@ public:
* Function to test jsonpath member access, will be removed in final PR
* @tparam JSONParser parser
*/
template <typename JSONParser>
template <typename JSONParser, typename JSONStringSerializer>
class JSONQueryImpl
{
public:
@ -328,23 +407,27 @@ public:
static bool insertResultToColumn(IColumn & dest, const Element & root, ASTPtr & query_ptr, const ContextPtr &)
{
ColumnString & col_str = assert_cast<ColumnString &>(dest);
GeneratorJSONPath<JSONParser> generator_json_path(query_ptr);
Element current_element = root;
VisitorStatus status;
std::stringstream out; // STYLE_CHECK_ALLOW_STD_STRING_STREAM
/// Create json array of results: [res1, res2, ...]
out << "[";
bool success = false;
const char * array_begin = "[";
const char * array_end = "]";
const char * comma = ", ";
JSONStringSerializer json_serializer(col_str);
json_serializer.addRawData(array_begin, 1);
while ((status = generator_json_path.getNextItem(current_element)) != VisitorStatus::Exhausted)
{
if (status == VisitorStatus::Ok)
{
if (success)
{
out << ", ";
json_serializer.addRawData(comma, 2);
}
success = true;
out << current_element.getElement();
json_serializer.addElement(current_element);
}
else if (status == VisitorStatus::Error)
{
@ -354,14 +437,13 @@ public:
}
current_element = root;
}
out << "]";
if (!success)
{
json_serializer.rollback();
return false;
}
ColumnString & col_str = assert_cast<ColumnString &>(dest);
auto output_str = out.str();
col_str.insertData(output_str.data(), output_str.size());
json_serializer.addRawData(array_end, 1);
json_serializer.commit();
return true;
}
};

View File

@ -33,4 +33,8 @@
<query>SELECT 'simdjson-12', count() FROM zeros(5000000) WHERE NOT ignore(JSONExtractFloat(materialize({long_json}), 'fparam', 'nested_2', -2))</query>
<query>SELECT 'simdjson-13', count() FROM zeros(5000000) WHERE NOT ignore(JSONExtractBool(materialize({long_json}), 'bparam'))</query>
<query>SELECT 'simdjson-14', count() FROM zeros(5000000) WHERE NOT ignore(JSON_VALUE(materialize({long_json}), '$.nparam'))</query>
<query>SELECT 'simdjson-15', count() FROM zeros(5000000) WHERE NOT ignore(JSON_QUERY(materialize({long_json}), '$.nparam'))</query>
<query>SELECT 'simdjson-16', count() FROM zeros(5000000) WHERE NOT ignore(JSON_VALUE(materialize({json}), '$.nparam'))</query>
<query>SELECT 'simdjson-17', count() FROM zeros(5000000) WHERE NOT ignore(JSON_QUERY(materialize({json}), '$.nparam'))</query>
</test>