mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-21 23:21:59 +00:00
add simd ondemand parser
This commit is contained in:
parent
4511352efc
commit
93464f52f4
@ -164,3 +164,10 @@ template <typename... Args>
|
||||
constexpr void UNUSED(Args &&... args [[maybe_unused]]) // NOLINT(cppcoreguidelines-missing-std-forward)
|
||||
{
|
||||
}
|
||||
|
||||
#define DB_CONCATENATE_IMPL(s1, s2) s1##s2
|
||||
#define DB_CONCATENATE(s1, s2) DB_CONCATENATE_IMPL(s1, s2)
|
||||
|
||||
#define DB_ANONYMOUS_VARIABLE(str) \
|
||||
DB_CONCATENATE(DB_CONCATENATE(DB_CONCATENATE(str, __COUNTER__), _), __LINE__)
|
||||
|
||||
|
@ -611,6 +611,7 @@
|
||||
M(730, REFRESH_FAILED) \
|
||||
M(731, QUERY_CACHE_USED_WITH_NON_THROW_OVERFLOW_MODE) \
|
||||
M(733, TABLE_IS_BEING_RESTARTED) \
|
||||
M(734, JSON_PARSE_ERROR) \
|
||||
\
|
||||
M(900, DISTRIBUTED_CACHE_ERROR) \
|
||||
M(901, CANNOT_USE_DISTRIBUTED_CACHE) \
|
||||
|
@ -3,6 +3,7 @@
|
||||
#include "config.h"
|
||||
|
||||
#if USE_SIMDJSON
|
||||
# include <Common/logger_useful.h>
|
||||
# include <base/types.h>
|
||||
# include <Common/Exception.h>
|
||||
# include <base/defines.h>
|
||||
@ -18,8 +19,19 @@ namespace DB
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int CANNOT_ALLOCATE_MEMORY;
|
||||
extern const int JSON_PARSE_ERROR;
|
||||
}
|
||||
|
||||
#define SIMDJSON_ASSIGN_OR_THROW_IMPL(_result, _lhs, _rexpr) \
|
||||
auto && _result = (_rexpr); \
|
||||
if (_result.error() != ::simdjson::SUCCESS) \
|
||||
throw DB::ErrnoException(ErrorCodes::JSON_PARSE_ERROR, "simdjson error: {}", std::string(::simdjson::error_message(_result.error()))); \
|
||||
_lhs = std::move(_result).value_unsafe()
|
||||
|
||||
#define SIMDJSON_ASSIGN_OR_THROW(_lhs, _rexpr) \
|
||||
SIMDJSON_ASSIGN_OR_THROW_IMPL( \
|
||||
DB_ANONYMOUS_VARIABLE(_simdjson_sesult), _lhs, _rexpr)
|
||||
|
||||
/// Format elements of basic types into string.
|
||||
/// The original implementation is mini_formatter in simdjson.h. But it is not public API, so we
|
||||
/// add a implementation here.
|
||||
@ -264,13 +276,93 @@ public:
|
||||
format.key(kv.key);
|
||||
append(kv.value);
|
||||
}
|
||||
|
||||
void append(simdjson::ondemand::value value)
|
||||
{
|
||||
switch (value.type())
|
||||
{
|
||||
case simdjson::ondemand::json_type::array:
|
||||
append(value.get_array());
|
||||
break;
|
||||
case simdjson::ondemand::json_type::object:
|
||||
append(value.get_object());
|
||||
break;
|
||||
case simdjson::ondemand::json_type::number:
|
||||
{
|
||||
|
||||
simdjson::ondemand::number_type nt{};
|
||||
auto res = value.get_number_type().get(nt);
|
||||
chassert(res == simdjson::SUCCESS);
|
||||
switch(nt)
|
||||
{
|
||||
case simdjson::ondemand::number_type::signed_integer:
|
||||
format.number(value.get_int64().value_unsafe());
|
||||
break;
|
||||
case simdjson::ondemand::number_type::unsigned_integer:
|
||||
format.number(value.get_uint64().value_unsafe());
|
||||
break;
|
||||
case simdjson::ondemand::number_type::floating_point_number:
|
||||
format.number(value.get_double().value_unsafe());
|
||||
break;
|
||||
case simdjson::ondemand::number_type::big_integer:
|
||||
format.string(value.get_string().value_unsafe());
|
||||
break;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case simdjson::ondemand::json_type::string:
|
||||
format.string(value.get_string().value_unsafe());
|
||||
break;
|
||||
case simdjson::ondemand::json_type::boolean:
|
||||
if (value.get_bool().value_unsafe())
|
||||
format.trueAtom();
|
||||
else
|
||||
format.falseAtom();
|
||||
break;
|
||||
case simdjson::ondemand::json_type::null:
|
||||
format.nullAtom();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void append(simdjson::ondemand::array array)
|
||||
{
|
||||
format.startArray();
|
||||
int i = 0;
|
||||
for (simdjson::ondemand::value value : array)
|
||||
{
|
||||
if (i++ != 0)
|
||||
format.comma();
|
||||
append(value);
|
||||
}
|
||||
format.endArray();
|
||||
}
|
||||
|
||||
void append(simdjson::ondemand::object object)
|
||||
{
|
||||
format.startObject();
|
||||
int i = 0;
|
||||
for (simdjson::ondemand::field field : object)
|
||||
{
|
||||
if (i++ != 0)
|
||||
format.comma();
|
||||
append(field);
|
||||
}
|
||||
format.endObject();
|
||||
}
|
||||
|
||||
void append(simdjson::ondemand::field field)
|
||||
{
|
||||
format.key(field.unescaped_key());
|
||||
append(field.value());
|
||||
}
|
||||
private:
|
||||
SimdJSONBasicFormatter format;
|
||||
};
|
||||
|
||||
/// This class can be used as an argument for the template class FunctionJSON.
|
||||
/// It provides ability to parse JSONs using simdjson library.
|
||||
struct SimdJSONParser
|
||||
struct DomSimdJSONParser
|
||||
{
|
||||
class Array;
|
||||
class Object;
|
||||
@ -419,16 +511,215 @@ private:
|
||||
simdjson::dom::parser parser;
|
||||
};
|
||||
|
||||
inline ALWAYS_INLINE SimdJSONParser::Array SimdJSONParser::Element::getArray() const
|
||||
inline ALWAYS_INLINE DomSimdJSONParser::Array DomSimdJSONParser::Element::getArray() const
|
||||
{
|
||||
return element.get_array().value_unsafe();
|
||||
}
|
||||
|
||||
inline ALWAYS_INLINE SimdJSONParser::Object SimdJSONParser::Element::getObject() const
|
||||
inline ALWAYS_INLINE DomSimdJSONParser::Object DomSimdJSONParser::Element::getObject() const
|
||||
{
|
||||
return element.get_object().value_unsafe();
|
||||
}
|
||||
|
||||
struct OnDemandSimdJSONParser
|
||||
{
|
||||
class Array;
|
||||
class Object;
|
||||
|
||||
/// References an element in a JSON document, representing a JSON null, boolean, string, number,
|
||||
/// array or object.
|
||||
class Element
|
||||
{
|
||||
public:
|
||||
ALWAYS_INLINE Element() {} /// NOLINT
|
||||
ALWAYS_INLINE Element(simdjson::ondemand::value && value_) { value = std::move(value_); }
|
||||
ALWAYS_INLINE Element & operator=(const simdjson::ondemand::value & value_) { value = value_; return *this; }
|
||||
|
||||
ALWAYS_INLINE ElementType type() const
|
||||
{
|
||||
if (value.type() == simdjson::ondemand::json_type::object)
|
||||
return ElementType::OBJECT;
|
||||
if (value.type() == simdjson::ondemand::json_type::array)
|
||||
return ElementType::ARRAY;
|
||||
if (value.type() == simdjson::ondemand::json_type::boolean)
|
||||
return ElementType::BOOL;
|
||||
if (value.type() == simdjson::ondemand::json_type::string)
|
||||
return ElementType::STRING;
|
||||
if (value.type() == simdjson::ondemand::json_type::number)
|
||||
{
|
||||
auto res = value.get_number_type();
|
||||
if (res.error())
|
||||
return ElementType::NULL_VALUE;
|
||||
if (res.value() == simdjson::ondemand::number_type::signed_integer)
|
||||
return ElementType::INT64;
|
||||
if (res.value() == simdjson::ondemand::number_type::unsigned_integer)
|
||||
return ElementType::UINT64;
|
||||
if (res.value() == simdjson::ondemand::number_type::floating_point_number)
|
||||
return ElementType::DOUBLE;
|
||||
}
|
||||
return ElementType::NULL_VALUE;
|
||||
}
|
||||
|
||||
ALWAYS_INLINE bool isInt64() const { auto res = value.get_number_type(); return !res.error() && res.value() == simdjson::ondemand::number_type::signed_integer; }
|
||||
ALWAYS_INLINE bool isUInt64() const { auto res = value.get_number_type(); return !res.error() && res.value() == simdjson::ondemand::number_type::unsigned_integer; }
|
||||
ALWAYS_INLINE bool isDouble() const { auto res = value.get_number_type(); return !res.error() && res.value() == simdjson::ondemand::number_type::floating_point_number; }
|
||||
ALWAYS_INLINE bool isString() const { auto r = value.type(); return !r.error() && r.value() == simdjson::ondemand::json_type::string; }
|
||||
ALWAYS_INLINE bool isArray() const
|
||||
{
|
||||
auto r = value.type();
|
||||
return !r.error() && r.value() == simdjson::ondemand::json_type::array;
|
||||
}
|
||||
ALWAYS_INLINE bool isObject() const
|
||||
{
|
||||
auto r = value.type();
|
||||
return !r.error() && r.value() == simdjson::ondemand::json_type::object;
|
||||
}
|
||||
ALWAYS_INLINE bool isBool() const { return value.type() == simdjson::ondemand::json_type::boolean; }
|
||||
ALWAYS_INLINE bool isNull() const { return value.type() == simdjson::ondemand::json_type::null; }
|
||||
|
||||
ALWAYS_INLINE Int64 getInt64() const { return value.get_int64().value(); }
|
||||
ALWAYS_INLINE UInt64 getUInt64() const { return value.get_uint64().value(); }
|
||||
ALWAYS_INLINE double getDouble() const { return value.get_double().value(); }
|
||||
ALWAYS_INLINE bool getBool() const { return value.get_bool().value(); }
|
||||
ALWAYS_INLINE std::string_view getString() const
|
||||
{
|
||||
auto r = value.get_string();
|
||||
if (r.error())
|
||||
return {};
|
||||
return r.value();
|
||||
}
|
||||
ALWAYS_INLINE Array getArray() const
|
||||
{
|
||||
return value.get_array().value();
|
||||
}
|
||||
ALWAYS_INLINE Object getObject() const
|
||||
{
|
||||
return value.get_object().value();
|
||||
}
|
||||
|
||||
ALWAYS_INLINE simdjson::ondemand::value getElement() const { return value; }
|
||||
|
||||
private:
|
||||
mutable simdjson::ondemand::value value;
|
||||
};
|
||||
|
||||
/// References an array in a JSON document.
|
||||
class Array
|
||||
{
|
||||
public:
|
||||
class Iterator
|
||||
{
|
||||
public:
|
||||
ALWAYS_INLINE Iterator(const simdjson::ondemand::array_iterator & it_) : it(it_) {} /// NOLINT
|
||||
ALWAYS_INLINE Element operator*() const { return (*it).value(); }
|
||||
ALWAYS_INLINE Iterator & operator++() { ++it; return *this; }
|
||||
ALWAYS_INLINE friend bool operator!=(const Iterator & left, const Iterator & right) { return left.it != right.it; }
|
||||
ALWAYS_INLINE friend bool operator==(const Iterator & left, const Iterator & right) { return !(left != right); }
|
||||
private:
|
||||
mutable simdjson::ondemand::array_iterator it;
|
||||
};
|
||||
|
||||
ALWAYS_INLINE Array(const simdjson::ondemand::array & array_) : array(array_) {} /// NOLINT
|
||||
ALWAYS_INLINE Iterator begin() const { return array.begin().value(); }
|
||||
ALWAYS_INLINE Iterator end() const { return array.end().value(); }
|
||||
ALWAYS_INLINE size_t size() const { return array.count_elements().value(); }
|
||||
ALWAYS_INLINE Element operator[](size_t index) const { return array.at(index).value(); }
|
||||
|
||||
private:
|
||||
mutable simdjson::ondemand::array array;
|
||||
};
|
||||
|
||||
using KeyValuePair = std::pair<std::string_view, Element>;
|
||||
|
||||
/// References an object in a JSON document.
|
||||
class Object
|
||||
{
|
||||
public:
|
||||
class Iterator
|
||||
{
|
||||
public:
|
||||
ALWAYS_INLINE Iterator(const simdjson::ondemand::object_iterator & it_) : it(it_) {} /// NOLINT
|
||||
|
||||
ALWAYS_INLINE KeyValuePair operator*() const
|
||||
{
|
||||
SIMDJSON_ASSIGN_OR_THROW(auto field_wrapper, *it);
|
||||
SIMDJSON_ASSIGN_OR_THROW(std::string_view key, field_wrapper.unescaped_key());
|
||||
::simdjson::ondemand::value v = field_wrapper.value();
|
||||
return {key, Element(std::move(v))};
|
||||
}
|
||||
|
||||
ALWAYS_INLINE Iterator & operator++() { ++it; return *this; }
|
||||
ALWAYS_INLINE friend bool operator!=(const Iterator & left, const Iterator & right) { return left.it != right.it; }
|
||||
ALWAYS_INLINE friend bool operator==(const Iterator & left, const Iterator & right) { return !(left != right); }
|
||||
private:
|
||||
mutable simdjson::ondemand::object_iterator it;
|
||||
};
|
||||
|
||||
ALWAYS_INLINE Object(const simdjson::ondemand::object & object_) : object(object_) {} /// NOLINT
|
||||
ALWAYS_INLINE Iterator begin() const { return object.begin().value(); }
|
||||
ALWAYS_INLINE Iterator end() const { return object.end().value(); }
|
||||
///NOTE: call size() before iterate
|
||||
ALWAYS_INLINE size_t size() const
|
||||
{
|
||||
return object.count_fields().value();
|
||||
}
|
||||
|
||||
bool find(std::string_view key, Element & result) const
|
||||
{
|
||||
auto x = object.find_field_unordered(key);
|
||||
if (x.error())
|
||||
return false;
|
||||
|
||||
result = x.value_unsafe();
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Optional: Provides access to an object's element by index.
|
||||
KeyValuePair operator[](size_t index) const
|
||||
{
|
||||
///SIMDJSON_ASSIGN_OR_THROW(auto b, object.reset());
|
||||
///(void)b;
|
||||
SIMDJSON_ASSIGN_OR_THROW(auto it, object.begin());
|
||||
while (index--)
|
||||
{
|
||||
(void)*(it); /// NEED TO DO THIS TO ITERATE
|
||||
++it;
|
||||
}
|
||||
SIMDJSON_ASSIGN_OR_THROW(auto field, *it);
|
||||
std::string_view key = field.unescaped_key().value();
|
||||
simdjson::ondemand::value value = field.value();
|
||||
return {key, Element(std::move(value))};
|
||||
}
|
||||
|
||||
private:
|
||||
mutable simdjson::ondemand::object object;
|
||||
};
|
||||
|
||||
/// Parses a JSON document, returns the reference to its root element if succeeded.
|
||||
bool parse(std::string_view json, Element & result)
|
||||
{
|
||||
padstr = json;
|
||||
auto res = parser.iterate(padstr);
|
||||
if (res.error())
|
||||
return false;
|
||||
|
||||
document = std::move(res.value());
|
||||
auto v = document.get_value();
|
||||
if (v.error())
|
||||
return false;
|
||||
|
||||
result = v.value();
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
simdjson::ondemand::parser parser;
|
||||
simdjson::ondemand::document document{};
|
||||
simdjson::padded_string padstr;
|
||||
};
|
||||
|
||||
using SimdJSONParser = OnDemandSimdJSONParser;
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@ -1210,9 +1210,9 @@ public:
|
||||
auto array = element.getArray();
|
||||
auto it = array.begin();
|
||||
|
||||
for (size_t index = 0; (index != nested.size()) && (it != array.end()); ++index)
|
||||
for (size_t index = 0; (index != nested.size()) && (it != array.end()); ++index, ++it)
|
||||
{
|
||||
if (nested[index]->insertResultToColumn(tuple.getColumn(index), *it++, insert_settings, format_settings, error))
|
||||
if (nested[index]->insertResultToColumn(tuple.getColumn(index), *it, insert_settings, format_settings, error))
|
||||
{
|
||||
were_valid_elements = true;
|
||||
}
|
||||
@ -1238,9 +1238,9 @@ public:
|
||||
if (name_to_index_map.empty())
|
||||
{
|
||||
auto it = object.begin();
|
||||
for (size_t index = 0; (index != nested.size()) && (it != object.end()); ++index)
|
||||
for (size_t index = 0; (index != nested.size()) && (it != object.end()); ++index, ++it)
|
||||
{
|
||||
if (nested[index]->insertResultToColumn(tuple.getColumn(index), (*it++).second, insert_settings, format_settings, error))
|
||||
if (nested[index]->insertResultToColumn(tuple.getColumn(index), (*it).second, insert_settings, format_settings, error))
|
||||
{
|
||||
were_valid_elements = true;
|
||||
}
|
||||
@ -1315,18 +1315,18 @@ public:
|
||||
error = fmt::format("cannot read Map value from JSON element: {}", jsonElementToString<JSONParser>(element, format_settings));
|
||||
return false;
|
||||
}
|
||||
|
||||
auto & map_col = assert_cast<ColumnMap &>(column);
|
||||
auto & offsets = map_col.getNestedColumn().getOffsets();
|
||||
auto & tuple_col = map_col.getNestedData();
|
||||
auto & key_col = tuple_col.getColumn(0);
|
||||
auto & value_col = tuple_col.getColumn(1);
|
||||
size_t old_size = tuple_col.size();
|
||||
|
||||
auto object = element.getObject();
|
||||
auto it = object.begin();
|
||||
size_t object_size{};
|
||||
for (; it != object.end(); ++it)
|
||||
{
|
||||
++object_size;
|
||||
auto pair = *it;
|
||||
|
||||
/// Insert key
|
||||
@ -1349,7 +1349,7 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
offsets.push_back(old_size + object.size());
|
||||
offsets.push_back(old_size + object_size);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -273,12 +273,13 @@ private:
|
||||
if (element.isArray())
|
||||
{
|
||||
auto array = element.getArray();
|
||||
size_t array_size = array.size();
|
||||
if (index >= 0)
|
||||
--index;
|
||||
else
|
||||
index += array.size();
|
||||
index += array_size;
|
||||
|
||||
if (static_cast<size_t>(index) >= array.size())
|
||||
if (static_cast<size_t>(index) >= array_size)
|
||||
return false;
|
||||
element = array[index];
|
||||
out_key = {};
|
||||
@ -290,12 +291,13 @@ private:
|
||||
if (element.isObject())
|
||||
{
|
||||
auto object = element.getObject();
|
||||
size_t object_size = object.size();
|
||||
if (index >= 0)
|
||||
--index;
|
||||
else
|
||||
index += object.size();
|
||||
index += object_size;
|
||||
|
||||
if (static_cast<size_t>(index) >= object.size())
|
||||
if (static_cast<size_t>(index) >= object_size)
|
||||
return false;
|
||||
std::tie(out_key, element) = object[index];
|
||||
return true;
|
||||
@ -621,7 +623,7 @@ public:
|
||||
|
||||
static bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view, const FormatSettings &, String &)
|
||||
{
|
||||
size_t size;
|
||||
size_t size{};
|
||||
if (element.isArray())
|
||||
size = element.getArray().size();
|
||||
else if (element.isObject())
|
||||
@ -984,10 +986,13 @@ public:
|
||||
auto array = element.getArray();
|
||||
ColumnArray & col_res = assert_cast<ColumnArray &>(dest);
|
||||
|
||||
size_t size = 0;
|
||||
for (auto value : array)
|
||||
{
|
||||
++size;
|
||||
JSONExtractRawImpl<JSONParser>::insertResultToColumn(col_res.getData(), value, {}, format_settings, error);
|
||||
|
||||
col_res.getOffsets().push_back(col_res.getOffsets().back() + array.size());
|
||||
}
|
||||
col_res.getOffsets().push_back(col_res.getOffsets().back() + size);
|
||||
return true;
|
||||
}
|
||||
};
|
||||
@ -1020,13 +1025,15 @@ public:
|
||||
auto & col_key = assert_cast<ColumnString &>(col_tuple.getColumn(0));
|
||||
auto & col_value = assert_cast<ColumnString &>(col_tuple.getColumn(1));
|
||||
|
||||
size_t size = 0;
|
||||
for (const auto & [key, value] : object)
|
||||
{
|
||||
col_key.insertData(key.data(), key.size());
|
||||
JSONExtractRawImpl<JSONParser>::insertResultToColumn(col_value, value, {}, format_settings, error);
|
||||
++size;
|
||||
}
|
||||
|
||||
col_arr.getOffsets().push_back(col_arr.getOffsets().back() + object.size());
|
||||
col_arr.getOffsets().push_back(col_arr.getOffsets().back() + size);
|
||||
return true;
|
||||
}
|
||||
};
|
||||
@ -1054,12 +1061,14 @@ public:
|
||||
ColumnArray & col_res = assert_cast<ColumnArray &>(dest);
|
||||
auto & col_key = assert_cast<ColumnString &>(col_res.getData());
|
||||
|
||||
size_t count = 0;
|
||||
for (const auto & [key, value] : object)
|
||||
{
|
||||
++count;
|
||||
col_key.insertData(key.data(), key.size());
|
||||
}
|
||||
|
||||
col_res.getOffsets().push_back(col_res.getOffsets().back() + object.size());
|
||||
col_res.getOffsets().push_back(col_res.getOffsets().back() + count);
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
Loading…
Reference in New Issue
Block a user