Merge pull request #5235 from vitlibar/improve-new-json-functions

Improve new json functions
This commit is contained in:
alexey-milovidov 2019-05-18 12:51:41 +03:00 committed by GitHub
commit 24b16da9fd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
20 changed files with 1684 additions and 625 deletions

3
.gitmodules vendored
View File

@ -82,3 +82,6 @@
[submodule "contrib/simdjson"]
path = contrib/simdjson
url = https://github.com/lemire/simdjson.git
[submodule "contrib/rapidjson"]
path = contrib/rapidjson
url = https://github.com/Tencent/rapidjson

View File

@ -328,6 +328,7 @@ include (cmake/find_base64.cmake)
include (cmake/find_hyperscan.cmake)
include (cmake/find_lfalloc.cmake)
include (cmake/find_simdjson.cmake)
include (cmake/find_rapidjson.cmake)
find_contrib_lib(cityhash)
find_contrib_lib(farmhash)
find_contrib_lib(metrohash)

View File

@ -0,0 +1,9 @@
# Locates the rapidjson headers shipped as the contrib/rapidjson git submodule.
#
# Results:
#   USE_RAPIDJSON         - cache option (ON by default); overridden to 0 for this
#                           configure run when the submodule headers are missing.
#   RAPIDJSON_INCLUDE_DIR - directory to pass to target_include_directories().
if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/rapidjson/include/rapidjson/rapidjson.h")
    message (WARNING "submodule contrib/rapidjson is missing. to fix try run: \n git submodule update --init --recursive")
    # An earlier configure may have stored USE_RAPIDJSON=ON in the cache.
    # Shadow it with a normal variable so that nothing is built against the missing headers.
    set (USE_RAPIDJSON 0)
    return()
endif ()

option (USE_RAPIDJSON "Use rapidjson" ON)
set (RAPIDJSON_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/rapidjson/include")
message(STATUS "Using rapidjson=${USE_RAPIDJSON}: ${RAPIDJSON_INCLUDE_DIR}")

1
contrib/rapidjson vendored Submodule

@ -0,0 +1 @@
Subproject commit 01950eb7acec78818d68b762efc869bba2420d82

2
contrib/simdjson vendored

@ -1 +1 @@
Subproject commit 681cd3369860f4eada49a387cbff93030f759c95
Subproject commit 14cd1f7a0b0563db78bda8053a9f6ac2ea95a441

View File

@ -26,6 +26,7 @@
#cmakedefine01 USE_SSL
#cmakedefine01 USE_HYPERSCAN
#cmakedefine01 USE_SIMDJSON
#cmakedefine01 USE_RAPIDJSON
#cmakedefine01 USE_LFALLOC
#cmakedefine01 USE_LFALLOC_RANDOM_HINT

View File

@ -430,6 +430,13 @@ inline bool_if_safe_conversion<A, B> greaterOrEqualsOp(A a, B b)
template <typename From, typename To>
inline bool NO_SANITIZE_UNDEFINED convertNumeric(From value, To & result)
{
/// If the type is actually the same it's not necessary to do any checks.
if constexpr (std::is_same_v<From, To>)
{
result = value;
return true;
}
/// Note that NaNs don't compare equal to anything, but they are still in range of any Float type.
if (isNaN(value) && std::is_floating_point_v<To>)
{

View File

@ -323,8 +323,9 @@ struct Settings : public SettingsCollection<Settings>
M(SettingBool, allow_experimental_data_skipping_indices, false, "If it is set to true, data skipping indices can be used in CREATE TABLE/ALTER TABLE queries.") \
\
M(SettingBool, allow_hyperscan, true, "Allow functions that use Hyperscan library. Disable to avoid potentially long compilation times and excessive resource usage.") \
M(SettingBool, allow_simdjson, 1, "Allow using simdjson library in 'JSON*' functions if AVX2 instructions are available. If disabled rapidjson will be used.") \
\
M(SettingUInt64, max_partitions_per_insert_block, 100, "Limit maximum number of partitions in single INSERTed block. Zero means unlimited. Throw exception if the block contains too many partitions. This setting is a safety threshold, because using large number of partitions is a common misconception.") \
M(SettingUInt64, max_partitions_per_insert_block, 100, "Limit maximum number of partitions in single INSERTed block. Zero means unlimited. Throw exception if the block contains too many partitions. This setting is a safety threshold, because using large number of partitions is a common misconception.")
DECLARE_SETTINGS_COLLECTION(LIST_OF_SETTINGS)

View File

@ -73,3 +73,7 @@ endif()
if(USE_SIMDJSON)
target_link_libraries(clickhouse_functions PRIVATE ${SIMDJSON_LIBRARY})
endif()
if(USE_RAPIDJSON)
target_include_directories(clickhouse_functions SYSTEM PRIVATE ${RAPIDJSON_INCLUDE_DIR})
endif()

View File

@ -0,0 +1,57 @@
#pragma once
#include <common/StringRef.h>
#include <Common/Exception.h>
#include <Core/Types.h>
namespace DB
{
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
}
/// Placeholder parser that can be passed to the template class FunctionJSON when we are unable to parse JSON.
/// It does nothing useful: parse() always throws, and every accessor returns a fixed empty/false/zero value.
struct DummyJSONParser
{
    /// Nothing to allocate, so preallocate() is a no-op.
    static constexpr bool need_preallocate = false;
    void preallocate(size_t) {}

    /// Always throws NOT_IMPLEMENTED.
    bool parse(const StringRef &)
    {
        throw Exception{"Functions JSON* are not supported without AVX2", ErrorCodes::NOT_IMPLEMENTED};
    }

    /// There is no document to iterate over, so the iterator carries no state.
    using Iterator = std::nullptr_t;
    Iterator getRoot() const { return Iterator{}; }

    /// Type checks: every value looks like null.
    static bool isInt64(const Iterator &) { return false; }
    static bool isUInt64(const Iterator &) { return false; }
    static bool isDouble(const Iterator &) { return false; }
    static bool isString(const Iterator &) { return false; }
    static bool isArray(const Iterator &) { return false; }
    static bool isObject(const Iterator &) { return false; }
    static bool isBool(const Iterator &) { return false; }
    static bool isNull(const Iterator &) { return true; }

    /// Value getters: zero / false / empty string.
    static Int64 getInt64(const Iterator &) { return 0; }
    static UInt64 getUInt64(const Iterator &) { return 0; }
    static double getDouble(const Iterator &) { return 0; }
    static bool getBool(const Iterator &) { return false; }
    static StringRef getString(const Iterator &) { return StringRef{}; }

    /// Array navigation: arrays are always empty and no move ever succeeds.
    static size_t sizeOfArray(const Iterator &) { return 0; }
    static bool firstArrayElement(Iterator &) { return false; }
    static bool arrayElementByIndex(Iterator &, size_t) { return false; }
    static bool nextArrayElement(Iterator &) { return false; }

    /// Object navigation: objects are always empty and no move ever succeeds.
    static size_t sizeOfObject(const Iterator &) { return 0; }
    static bool firstObjectMember(Iterator &) { return false; }
    static bool firstObjectMember(Iterator &, StringRef &) { return false; }
    static bool objectMemberByIndex(Iterator &, size_t) { return false; }
    static bool objectMemberByName(Iterator &, const StringRef &) { return false; }
    static bool nextObjectMember(Iterator &) { return false; }
    static bool nextObjectMember(Iterator &, StringRef &) { return false; }
    static bool isObjectMember(const Iterator &) { return false; }
    static StringRef getKey(const Iterator &) { return StringRef{}; }
};
}

View File

@ -1,378 +1,24 @@
#include <Functions/FunctionsJSON.h>
#include <Functions/FunctionFactory.h>
#include <Common/config.h>
#if USE_SIMDJSON
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypesNumber.h>
namespace DB
{
template <typename T>
class JSONNullableImplBase
{
public:
static DataTypePtr getType() { return std::make_shared<DataTypeNullable>(std::make_shared<T>()); }
static Field getDefault() { return {}; }
};
class JSONHasImpl : public JSONNullableImplBase<DataTypeUInt8>
{
public:
static constexpr auto name{"jsonHas"};
static Field getValue(ParsedJson::iterator &) { return {1}; }
};
class JSONLengthImpl : public JSONNullableImplBase<DataTypeUInt64>
{
public:
static constexpr auto name{"jsonLength"};
static Field getValue(ParsedJson::iterator & pjh)
{
if (!pjh.is_object_or_array())
return getDefault();
size_t size = 0;
if (pjh.down())
{
size += 1;
while (pjh.next())
size += 1;
}
return {size};
}
};
class JSONTypeImpl : public JSONNullableImplBase<DataTypeString>
{
public:
static constexpr auto name{"jsonType"};
static Field getValue(ParsedJson::iterator & pjh)
{
switch (pjh.get_type())
{
case '[':
return "Array";
case '{':
return "Object";
case '"':
return "String";
case 'l':
return "Int64";
case 'd':
return "Float64";
case 't':
return "Bool";
case 'f':
return "Bool";
case 'n':
return "Null";
default:
return "Unknown";
}
}
};
class JSONExtractImpl
{
public:
static constexpr auto name{"jsonExtract"};
static DataTypePtr getType(const DataTypePtr & type)
{
WhichDataType which{type};
if (which.isNativeUInt() || which.isNativeInt() || which.isFloat() || which.isEnum() || which.isDateOrDateTime()
|| which.isStringOrFixedString() || which.isInterval())
return std::make_shared<DataTypeNullable>(type);
if (which.isArray())
{
auto array_type = static_cast<const DataTypeArray *>(type.get());
return std::make_shared<DataTypeArray>(getType(array_type->getNestedType()));
}
if (which.isTuple())
{
auto tuple_type = static_cast<const DataTypeTuple *>(type.get());
DataTypes types;
types.reserve(tuple_type->getElements().size());
for (const DataTypePtr & element : tuple_type->getElements())
{
types.push_back(getType(element));
}
return std::make_shared<DataTypeTuple>(std::move(types));
}
throw Exception{"Unsupported return type schema: " + type->getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT};
}
static Field getDefault(const DataTypePtr & type)
{
WhichDataType which{type};
if (which.isNativeUInt() || which.isNativeInt() || which.isFloat() || which.isEnum() || which.isDateOrDateTime()
|| which.isStringOrFixedString() || which.isInterval())
return {};
if (which.isArray())
return {Array{}};
if (which.isTuple())
{
auto tuple_type = static_cast<const DataTypeTuple *>(type.get());
Tuple tuple;
tuple.toUnderType().reserve(tuple_type->getElements().size());
for (const DataTypePtr & element : tuple_type->getElements())
tuple.toUnderType().push_back(getDefault(element));
return {tuple};
}
// should not reach
throw Exception{"Unsupported return type schema: " + type->getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT};
}
static Field getValue(ParsedJson::iterator & pjh, const DataTypePtr & type)
{
WhichDataType which{type};
if (which.isNativeUInt() || which.isNativeInt() || which.isEnum() || which.isDateOrDateTime() || which.isInterval())
{
if (pjh.is_integer())
return {pjh.get_integer()};
else
return getDefault(type);
}
if (which.isFloat())
{
if (pjh.is_integer())
return {static_cast<double>(pjh.get_integer())};
else if (pjh.is_double())
return {pjh.get_double()};
else
return getDefault(type);
}
if (which.isStringOrFixedString())
{
if (pjh.is_string())
return {String{pjh.get_string()}};
else
return getDefault(type);
}
if (which.isArray())
{
if (!pjh.is_object_or_array())
return getDefault(type);
auto array_type = static_cast<const DataTypeArray *>(type.get());
Array array;
bool first = true;
while (first ? pjh.down() : pjh.next())
{
first = false;
ParsedJson::iterator pjh1{pjh};
array.push_back(getValue(pjh1, array_type->getNestedType()));
}
return {array};
}
if (which.isTuple())
{
if (!pjh.is_object_or_array())
return getDefault(type);
auto tuple_type = static_cast<const DataTypeTuple *>(type.get());
Tuple tuple;
tuple.toUnderType().reserve(tuple_type->getElements().size());
bool valid = true;
bool first = true;
for (const DataTypePtr & element : tuple_type->getElements())
{
if (valid)
{
valid &= first ? pjh.down() : pjh.next();
first = false;
ParsedJson::iterator pjh1{pjh};
tuple.toUnderType().push_back(getValue(pjh1, element));
}
else
tuple.toUnderType().push_back(getDefault(element));
}
return {tuple};
}
// should not reach
throw Exception{"Unsupported return type schema: " + type->getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT};
}
};
class JSONExtractUIntImpl : public JSONNullableImplBase<DataTypeUInt64>
{
public:
static constexpr auto name{"jsonExtractUInt"};
static Field getValue(ParsedJson::iterator & pjh)
{
if (pjh.is_integer())
return {pjh.get_integer()};
else
return getDefault();
}
};
class JSONExtractIntImpl : public JSONNullableImplBase<DataTypeInt64>
{
public:
static constexpr auto name{"jsonExtractInt"};
static Field getValue(ParsedJson::iterator & pjh)
{
if (pjh.is_integer())
return {pjh.get_integer()};
else
return getDefault();
}
};
class JSONExtractFloatImpl : public JSONNullableImplBase<DataTypeFloat64>
{
public:
static constexpr auto name{"jsonExtractFloat"};
static Field getValue(ParsedJson::iterator & pjh)
{
if (pjh.is_double())
return {pjh.get_double()};
else
return getDefault();
}
};
class JSONExtractBoolImpl : public JSONNullableImplBase<DataTypeUInt8>
{
public:
static constexpr auto name{"jsonExtractBool"};
static Field getValue(ParsedJson::iterator & pjh)
{
if (pjh.get_type() == 't')
return {1};
else if (pjh.get_type() == 'f')
return {0};
else
return getDefault();
}
};
// class JSONExtractRawImpl: public JSONNullableImplBase<DataTypeString>
// {
// public:
// static constexpr auto name {"jsonExtractRaw"};
// static Field getValue(ParsedJson::iterator & pjh)
// {
// //
// }
// };
class JSONExtractStringImpl : public JSONNullableImplBase<DataTypeString>
{
public:
static constexpr auto name{"jsonExtractString"};
static Field getValue(ParsedJson::iterator & pjh)
{
if (pjh.is_string())
return {String{pjh.get_string()}};
else
return getDefault();
}
};
}
#else
namespace DB
{
struct JSONHasImpl { static constexpr auto name{"jsonHas"}; };
struct JSONLengthImpl { static constexpr auto name{"jsonLength"}; };
struct JSONTypeImpl { static constexpr auto name{"jsonType"}; };
struct JSONExtractImpl { static constexpr auto name{"jsonExtract"}; };
struct JSONExtractUIntImpl { static constexpr auto name{"jsonExtractUInt"}; };
struct JSONExtractIntImpl { static constexpr auto name{"jsonExtractInt"}; };
struct JSONExtractFloatImpl { static constexpr auto name{"jsonExtractFloat"}; };
struct JSONExtractBoolImpl { static constexpr auto name{"jsonExtractBool"}; };
//struct JSONExtractRawImpl { static constexpr auto name {"jsonExtractRaw"}; };
struct JSONExtractStringImpl { static constexpr auto name{"jsonExtractString"}; };
}
#endif
namespace DB
{
void registerFunctionsJSON(FunctionFactory & factory)
{
#if USE_SIMDJSON
if (__builtin_cpu_supports("avx2"))
{
factory.registerFunction<FunctionJSONBase<JSONHasImpl, false>>();
factory.registerFunction<FunctionJSONBase<JSONLengthImpl, false>>();
factory.registerFunction<FunctionJSONBase<JSONTypeImpl, false>>();
factory.registerFunction<FunctionJSONBase<JSONExtractImpl, true>>();
factory.registerFunction<FunctionJSONBase<JSONExtractUIntImpl, false>>();
factory.registerFunction<FunctionJSONBase<JSONExtractIntImpl, false>>();
factory.registerFunction<FunctionJSONBase<JSONExtractFloatImpl, false>>();
factory.registerFunction<FunctionJSONBase<JSONExtractBoolImpl, false>>();
// factory.registerFunction<FunctionJSONBase<
// JSONExtractRawImpl,
// false
// >>();
factory.registerFunction<FunctionJSONBase<JSONExtractStringImpl, false>>();
return;
}
#endif
factory.registerFunction<FunctionJSONDummy<JSONHasImpl>>();
factory.registerFunction<FunctionJSONDummy<JSONLengthImpl>>();
factory.registerFunction<FunctionJSONDummy<JSONTypeImpl>>();
factory.registerFunction<FunctionJSONDummy<JSONExtractImpl>>();
factory.registerFunction<FunctionJSONDummy<JSONExtractUIntImpl>>();
factory.registerFunction<FunctionJSONDummy<JSONExtractIntImpl>>();
factory.registerFunction<FunctionJSONDummy<JSONExtractFloatImpl>>();
factory.registerFunction<FunctionJSONDummy<JSONExtractBoolImpl>>();
//factory.registerFunction<FunctionJSONDummy<JSONExtractRawImpl>>();
factory.registerFunction<FunctionJSONDummy<JSONExtractStringImpl>>();
factory.registerFunction<FunctionJSON<NameJSONHas, JSONHasImpl>>();
factory.registerFunction<FunctionJSON<NameJSONLength, JSONLengthImpl>>();
factory.registerFunction<FunctionJSON<NameJSONKey, JSONKeyImpl>>();
factory.registerFunction<FunctionJSON<NameJSONType, JSONTypeImpl>>();
factory.registerFunction<FunctionJSON<NameJSONExtractInt, JSONExtractInt64Impl>>();
factory.registerFunction<FunctionJSON<NameJSONExtractUInt, JSONExtractUInt64Impl>>();
factory.registerFunction<FunctionJSON<NameJSONExtractFloat, JSONExtractFloat64Impl>>();
factory.registerFunction<FunctionJSON<NameJSONExtractBool, JSONExtractBoolImpl>>();
factory.registerFunction<FunctionJSON<NameJSONExtractString, JSONExtractStringImpl>>();
factory.registerFunction<FunctionJSON<NameJSONExtract, JSONExtractImpl>>();
factory.registerFunction<FunctionJSON<NameJSONExtractKeysAndValues, JSONExtractKeysAndValuesImpl>>();
factory.registerFunction<FunctionJSON<NameJSONExtractRaw, JSONExtractRawImpl>>();
}
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,211 @@
#pragma once
#include <Common/config.h>
#if USE_RAPIDJSON
#include <common/StringRef.h>
#include <Common/Exception.h>
#include <Core/Types.h>
#include <rapidjson/document.h>
namespace DB
{
/// This class can be used as an argument for the template class FunctionJSON.
/// It provides ability to parse JSONs using rapidjson library.
struct RapidJSONParser
{
static constexpr bool need_preallocate = false;
void preallocate(size_t) {}
bool parse(const StringRef & json)
{
document.Parse(json.data);
return !document.HasParseError();
}
struct Iterator
{
public:
Iterator() {}
Iterator(const rapidjson::Document & document) : value(&document) {}
Iterator(const Iterator & src)
: value(src.value)
, is_object_member(src.is_object_member)
, current_in_array(src.current_in_array)
, end_of_array(src.end_of_array) {}
Iterator & operator =(const Iterator & src)
{
value = src.value;
is_object_member = src.is_object_member;
current_in_array = src.current_in_array;
end_of_array = src.end_of_array;
return *this;
}
bool isInt64() const { return value->IsInt64(); }
bool isUInt64() const { return value->IsUint64(); }
bool isDouble() const { return value->IsDouble(); }
bool isBool() const { return value->IsBool(); }
bool isString() const { return value->IsString(); }
bool isArray() const { return value->IsArray(); }
bool isObject() const { return value->IsObject(); }
bool isNull() const { return value->IsNull(); }
Int64 getInt64() const { return value->GetInt64(); }
UInt64 getUInt64() const { return value->GetUint64(); }
double getDouble() const { return value->GetDouble(); }
bool getBool() const { return value->GetBool(); }
StringRef getString() const { return {value->GetString(), value->GetStringLength()}; }
size_t sizeOfArray() const { return value->Size(); }
bool arrayElementByIndex(size_t index)
{
if (index >= value->Size())
return false;
setRange(value->Begin() + index, value->End());
value = current_in_array++;
return true;
}
bool nextArrayElement()
{
if (current_in_array == end_of_array)
return false;
value = current_in_array++;
return true;
}
size_t sizeOfObject() const { return value->MemberCount(); }
bool objectMemberByIndex(size_t index)
{
if (index >= value->MemberCount())
return false;
setRange(value->MemberBegin() + index, value->MemberEnd());
value = &(current_in_object++)->value;
return true;
}
bool objectMemberByIndex(size_t index, StringRef & key)
{
if (index >= value->MemberCount())
return false;
setRange(value->MemberBegin() + index, value->MemberEnd());
key = getKeyImpl(current_in_object);
value = &(current_in_object++)->value;
return true;
}
bool objectMemberByName(const StringRef & name)
{
auto it = value->FindMember(name.data);
if (it == value->MemberEnd())
return false;
setRange(it, value->MemberEnd());
value = &(current_in_object++)->value;
return true;
}
bool nextObjectMember()
{
if (current_in_object == end_of_object)
return false;
value = &(current_in_object++)->value;
return true;
}
bool nextObjectMember(StringRef & key)
{
if (current_in_object == end_of_object)
return false;
key = getKeyImpl(current_in_object);
value = &(current_in_object++)->value;
return true;
}
bool isObjectMember() const { return is_object_member; }
StringRef getKey() const
{
return getKeyImpl(current_in_object - 1);
}
private:
void setRange(rapidjson::Value::ConstValueIterator current, rapidjson::Value::ConstValueIterator end)
{
current_in_array = &*current;
end_of_array = &*end;
is_object_member = false;
}
void setRange(rapidjson::Value::ConstMemberIterator current, rapidjson::Value::ConstMemberIterator end)
{
current_in_object = &*current;
end_of_object = &*end;
is_object_member = true;
}
static StringRef getKeyImpl(const rapidjson::GenericMember<rapidjson::UTF8<>, rapidjson::MemoryPoolAllocator<>> * member)
{
const auto & name = member->name;
return {name.GetString(), name.GetStringLength()};
}
const rapidjson::Value * value = nullptr;
bool is_object_member = false;
union
{
const rapidjson::GenericMember<rapidjson::UTF8<>, rapidjson::MemoryPoolAllocator<>> * current_in_object;
const rapidjson::Value * current_in_array;
};
union
{
const rapidjson::GenericMember<rapidjson::UTF8<>, rapidjson::MemoryPoolAllocator<>> * end_of_object;
const rapidjson::Value * end_of_array;
};
};
Iterator getRoot() { return Iterator{document}; }
static bool isInt64(const Iterator & it) { return it.isInt64(); }
static bool isUInt64(const Iterator & it) { return it.isUInt64(); }
static bool isDouble(const Iterator & it) { return it.isDouble(); }
static bool isBool(const Iterator & it) { return it.isBool(); }
static bool isString(const Iterator & it) { return it.isString(); }
static bool isArray(const Iterator & it) { return it.isArray(); }
static bool isObject(const Iterator & it) { return it.isObject(); }
static bool isNull(const Iterator & it) { return it.isNull(); }
static Int64 getInt64(const Iterator & it) { return it.getInt64(); }
static UInt64 getUInt64(const Iterator & it) { return it.getUInt64(); }
static double getDouble(const Iterator & it) { return it.getDouble(); }
static bool getBool(const Iterator & it) { return it.getBool(); }
static StringRef getString(const Iterator & it) { return it.getString(); }
static size_t sizeOfArray(const Iterator & it) { return it.sizeOfArray(); }
static bool firstArrayElement(Iterator & it) { return it.arrayElementByIndex(0); }
static bool arrayElementByIndex(Iterator & it, size_t index) { return it.arrayElementByIndex(index); }
static bool nextArrayElement(Iterator & it) { return it.nextArrayElement(); }
static size_t sizeOfObject(const Iterator & it) { return it.sizeOfObject(); }
static bool firstObjectMember(Iterator & it) { return it.objectMemberByIndex(0); }
static bool firstObjectMember(Iterator & it, StringRef & first_key) { return it.objectMemberByIndex(0, first_key); }
static bool objectMemberByIndex(Iterator & it, size_t index) { return it.objectMemberByIndex(index); }
static bool objectMemberByName(Iterator & it, const StringRef & name) { return it.objectMemberByName(name); }
static bool nextObjectMember(Iterator & it) { return it.nextObjectMember(); }
static bool nextObjectMember(Iterator & it, StringRef & next_key) { return it.nextObjectMember(next_key); }
static bool isObjectMember(const Iterator & it) { return it.isObjectMember(); }
static StringRef getKey(const Iterator & it) { return it.getKey(); }
private:
rapidjson::Document document;
};
}
#endif

View File

@ -0,0 +1,150 @@
#pragma once
#include <Common/config.h>
#if USE_SIMDJSON
#include <common/StringRef.h>
#include <Common/Exception.h>
#include <Core/Types.h>
#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wold-style-cast"
#pragma clang diagnostic ignored "-Wnewline-eof"
#endif
#include <simdjson/jsonparser.h>
#ifdef __clang__
#pragma clang diagnostic pop
#endif
namespace DB
{
namespace ErrorCodes
{
extern const int CANNOT_ALLOCATE_MEMORY;
}
/// This class can be used as an argument for the template class FunctionJSON.
/// It provides ability to parse JSONs using simdjson library.
struct SimdJSONParser
{
static constexpr bool need_preallocate = true;
void preallocate(size_t max_size)
{
if (!pj.allocateCapacity(max_size))
throw Exception{"Can not allocate memory for " + std::to_string(max_size) + " units when parsing JSON",
ErrorCodes::CANNOT_ALLOCATE_MEMORY};
}
bool parse(const StringRef & json) { return !json_parse(json.data, json.size, pj); }
using Iterator = ParsedJson::iterator;
Iterator getRoot() { return Iterator{pj}; }
static bool isInt64(const Iterator & it) { return it.is_integer(); }
static bool isUInt64(const Iterator &) { return false; /* See https://github.com/lemire/simdjson/issues/68 */ }
static bool isDouble(const Iterator & it) { return it.is_double(); }
static bool isString(const Iterator & it) { return it.is_string(); }
static bool isArray(const Iterator & it) { return it.is_array(); }
static bool isObject(const Iterator & it) { return it.is_object(); }
static bool isBool(const Iterator & it) { return it.get_type() == 't' || it.get_type() == 'f'; }
static bool isNull(const Iterator & it) { return it.is_null(); }
static Int64 getInt64(const Iterator & it) { return it.get_integer(); }
static UInt64 getUInt64(const Iterator &) { return 0; /* isUInt64() never returns true */ }
static double getDouble(const Iterator & it) { return it.get_double(); }
static bool getBool(const Iterator & it) { return it.get_type() == 't'; }
static StringRef getString(const Iterator & it) { return StringRef{it.get_string(), it.get_string_length()}; }
static size_t sizeOfArray(const Iterator & it)
{
size_t size = 0;
Iterator it2 = it;
if (it2.down())
{
do
++size;
while (it2.next());
}
return size;
}
static bool firstArrayElement(Iterator & it) { return it.down(); }
static bool arrayElementByIndex(Iterator & it, size_t index)
{
if (!it.down())
return false;
while (index--)
if (!it.next())
return false;
return true;
}
static bool nextArrayElement(Iterator & it) { return it.next(); }
static size_t sizeOfObject(const Iterator & it)
{
size_t size = 0;
Iterator it2 = it;
if (it2.down())
{
do
++size;
while (it2.next() && it2.next());
}
return size;
}
static bool firstObjectMember(Iterator & it) { return it.down() && it.next(); }
static bool firstObjectMember(Iterator & it, StringRef & first_key)
{
if (!it.down())
return false;
first_key.data = it.get_string();
first_key.size = it.get_string_length();
return it.next();
}
static bool objectMemberByIndex(Iterator & it, size_t index)
{
if (!it.down())
return false;
while (index--)
if (!it.next() || !it.next())
return false;
return it.next();
}
static bool objectMemberByName(Iterator & it, const StringRef & name) { return it.move_to_key(name.data); }
static bool nextObjectMember(Iterator & it) { return it.next() && it.next(); }
static bool nextObjectMember(Iterator & it, StringRef & next_key)
{
if (!it.next())
return false;
next_key.data = it.get_string();
next_key.size = it.get_string_length();
return it.next();
}
static bool isObjectMember(const Iterator & it) { return it.get_scope_type() == '{'; }
static StringRef getKey(const Iterator & it)
{
Iterator it2 = it;
it2.prev();
return StringRef{it2.get_string(), it2.get_string_length()};
}
private:
ParsedJson pj;
};
}
#endif

View File

@ -38,18 +38,29 @@ private:
working_buffer = internal_buffer;
}
static constexpr size_t initial_size = 32;
public:
WriteBufferFromVector(VectorType & vector_)
: WriteBuffer(reinterpret_cast<Position>(vector_.data()), vector_.size()), vector(vector_)
{
if (vector.empty())
{
static constexpr size_t initial_size = 32;
vector.resize(initial_size);
set(reinterpret_cast<Position>(vector.data()), vector.size());
}
}
/// Append to vector instead of rewrite.
struct AppendModeTag {};

/// Keeps the existing contents of `vector_` and positions the working buffer right after them.
/// The vector is resized to at least `initial_size` elements, and always to more than its
/// current size, so the working buffer handed to WriteBuffer is never empty.
WriteBufferFromVector(VectorType & vector_, AppendModeTag)
    : WriteBuffer(nullptr, 0), vector(vector_)
{
    size_t old_size = vector.size();
    size_t new_size = vector.capacity() < initial_size ? initial_size : vector.capacity();

    /// The vector can be completely full (size == capacity). Resizing to the capacity would then
    /// leave a zero-length working buffer, so grow it instead.
    /// NOTE(review): doubling is assumed to match the growth policy used elsewhere in this class - confirm.
    if (new_size <= old_size)
        new_size = old_size * 2;

    vector.resize(new_size);
    set(reinterpret_cast<Position>(vector.data() + old_size), (new_size - old_size) * sizeof(typename VectorType::value_type));
}
void finish()
{
if (is_finished)

View File

@ -0,0 +1,66 @@
--JSONLength--
2
3
0
--JSONHas--
1
1
0
--JSONKey--
a
b
b
a
--JSONType--
Object
Array
--JSONExtract<numeric>--
-100
200
300
1
0
--JSONExtractString--
hello
hello
\n\0
--JSONExtract (generic)--
('hello',[-100,200,300])
('hello',[-100,200,300])
([-100,200,300],'hello')
('hello\0',0)
hello
[-100,200,300]
(-100,200,300)
[-100,0,0]
[-100,NULL,NULL]
[0,200,0]
[NULL,200,NULL]
-100
200
\N
1
Thursday
Friday
--JSONExtractKeysAndValues--
[('a','hello')]
[('b',[-100,200,300])]
[('a','hello'),('b','world')]
[('a',5),('b',7),('c',11)]
--JSONExtractRaw--
{"a":"hello","b":[-100,200,300]}
"hello"
[-100,200,300]
-100
{"a":"hello","b":[-100,200,300],"c":{"d":[121,144]}}
{"d":[121,144]}
[121,144]
144
{"passed":true}
{}
"\\n\\u0000"
"☺"

View File

@ -0,0 +1,76 @@
SET allow_simdjson=1;
SELECT '--JSONLength--';
SELECT JSONLength('{"a": "hello", "b": [-100, 200.0, 300]}');
SELECT JSONLength('{"a": "hello", "b": [-100, 200.0, 300]}', 'b');
SELECT JSONLength('{}');
SELECT '--JSONHas--';
SELECT JSONHas('{"a": "hello", "b": [-100, 200.0, 300]}', 'a');
SELECT JSONHas('{"a": "hello", "b": [-100, 200.0, 300]}', 'b');
SELECT JSONHas('{"a": "hello", "b": [-100, 200.0, 300]}', 'c');
SELECT '--JSONKey--';
SELECT JSONKey('{"a": "hello", "b": [-100, 200.0, 300]}', 1);
SELECT JSONKey('{"a": "hello", "b": [-100, 200.0, 300]}', 2);
SELECT JSONKey('{"a": "hello", "b": [-100, 200.0, 300]}', -1);
SELECT JSONKey('{"a": "hello", "b": [-100, 200.0, 300]}', -2);
SELECT '--JSONType--';
SELECT JSONType('{"a": "hello", "b": [-100, 200.0, 300]}');
SELECT JSONType('{"a": "hello", "b": [-100, 200.0, 300]}', 'b');
SELECT '--JSONExtract<numeric>--';
SELECT JSONExtractInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 1);
SELECT JSONExtractFloat('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 2);
SELECT JSONExtractUInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', -1);
SELECT JSONExtractBool('{"passed": true}', 'passed');
SELECT JSONExtractBool('"HX-=');
SELECT '--JSONExtractString--';
SELECT JSONExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', 'a');
SELECT JSONExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', 1);
select JSONExtractString('{"abc":"\\n\\u0000"}', 'abc');
select JSONExtractString('{"abc":"\\u263a"}', 'abc');
select JSONExtractString('{"abc":"\\u263"}', 'abc');
select JSONExtractString('{"abc":"hello}', 'abc');
SELECT '--JSONExtract (generic)--';
SELECT JSONExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'Tuple(String, Array(Float64))');
SELECT JSONExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'Tuple(a String, b Array(Float64))');
SELECT JSONExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'Tuple(b Array(Float64), a String)');
SELECT JSONExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'Tuple(a FixedString(6), c UInt8)');
SELECT JSONExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'a', 'String');
SELECT JSONExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 'Array(Float32)');
SELECT JSONExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 'Tuple(Int8, Float32, UInt16)');
SELECT JSONExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 'Array(Int8)');
SELECT JSONExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 'Array(Nullable(Int8))');
SELECT JSONExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 'Array(UInt8)');
SELECT JSONExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 'Array(Nullable(UInt8))');
SELECT JSONExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 1, 'Int8');
SELECT JSONExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 2, 'Int32');
SELECT JSONExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 4, 'Nullable(Int64)');
SELECT JSONExtract('{"passed": true}', 'passed', 'UInt8');
SELECT JSONExtract('{"day": "Thursday"}', 'day', 'Enum8(\'Sunday\' = 0, \'Monday\' = 1, \'Tuesday\' = 2, \'Wednesday\' = 3, \'Thursday\' = 4, \'Friday\' = 5, \'Saturday\' = 6)');
SELECT JSONExtract('{"day": 5}', 'day', 'Enum8(\'Sunday\' = 0, \'Monday\' = 1, \'Tuesday\' = 2, \'Wednesday\' = 3, \'Thursday\' = 4, \'Friday\' = 5, \'Saturday\' = 6)');
SELECT '--JSONExtractKeysAndValues--';
SELECT JSONExtractKeysAndValues('{"a": "hello", "b": [-100, 200.0, 300]}', 'String');
SELECT JSONExtractKeysAndValues('{"a": "hello", "b": [-100, 200.0, 300]}', 'Array(Float64)');
SELECT JSONExtractKeysAndValues('{"a": "hello", "b": "world"}', 'String');
SELECT JSONExtractKeysAndValues('{"x": {"a": 5, "b": 7, "c": 11}}', 'x', 'Int8');
SELECT '--JSONExtractRaw--';
SELECT JSONExtractRaw('{"a": "hello", "b": [-100, 200.0, 300]}');
SELECT JSONExtractRaw('{"a": "hello", "b": [-100, 200.0, 300]}', 'a');
SELECT JSONExtractRaw('{"a": "hello", "b": [-100, 200.0, 300]}', 'b');
SELECT JSONExtractRaw('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 1);
SELECT JSONExtractRaw('{"a": "hello", "b": [-100, 200.0, 300], "c":{"d":[121,144]}}');
SELECT JSONExtractRaw('{"a": "hello", "b": [-100, 200.0, 300], "c":{"d":[121,144]}}', 'c');
SELECT JSONExtractRaw('{"a": "hello", "b": [-100, 200.0, 300], "c":{"d":[121,144]}}', 'c', 'd');
SELECT JSONExtractRaw('{"a": "hello", "b": [-100, 200.0, 300], "c":{"d":[121,144]}}', 'c', 'd', 2);
SELECT JSONExtractRaw('{"a": "hello", "b": [-100, 200.0, 300], "c":{"d":[121,144]}}', 'c', 'd', 3);
SELECT JSONExtractRaw('{"passed": true}');
SELECT JSONExtractRaw('{}');
select JSONExtractRaw('{"abc":"\\n\\u0000"}', 'abc');
select JSONExtractRaw('{"abc":"\\u263a"}', 'abc');

View File

@ -1,16 +0,0 @@
4
Object
1
1
a
hello
hello
3
Array
-100
200
300
('a','hello','b',[-100,200,300])
[-100,NULL,300]
['a','hello','b',NULL]
[(NULL,NULL,NULL),(NULL,NULL,NULL),(NULL,NULL,NULL),(-100,200,44)]

View File

@ -1,16 +0,0 @@
select jsonLength('{"a": "hello", "b": [-100, 200.0, 300]}');
select jsonType('{"a": "hello", "b": [-100, 200.0, 300]}');
select jsonHas('{"a": "hello", "b": [-100, 200.0, 300]}', 'a');
select jsonHas('{"a": "hello", "b": [-100, 200.0, 300]}', 'b');
select jsonExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', 1);
select jsonExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', 2);
select jsonExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', 'a');
select jsonLength('{"a": "hello", "b": [-100, 200.0, 300]}', 'b');
select jsonType('{"a": "hello", "b": [-100, 200.0, 300]}', 'b');
select jsonExtractInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 1);
select jsonExtractFloat('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 2);
select jsonExtractUInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', -1);
select jsonExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'Tuple(String, String, String, Array(Float64))');
select jsonExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'Array(Int32)', 'b');
select jsonExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'Array(String)');
select jsonExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'Array(Tuple(Int16, Float32, UInt8))');

View File

@ -60,98 +60,138 @@ There is currently no support for code points in the format `\uXXXX\uYYYY` that
The following functions are based on [simdjson](https://github.com/lemire/simdjson), which is designed for more complex JSON parsing requirements. Assumption 2 mentioned above still applies.
## jsonHas(params[, accessors]...)
## JSONHas(json[, indices_or_keys]...)
If the value exists in the JSON document, `1` will be returned.
If the value does not exist, `null` will be returned.
If the value does not exist, `0` will be returned.
Examples:
```
select jsonHas('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 1
select jsonHas('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 4) = null
select JSONHas('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 1
select JSONHas('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 4) = 0
```
An accessor can be either a string, a positive integer or a negative integer.
`indices_or_keys` is a list of zero or more arguments, each of which can be either a string or an integer.
* String = access object member by key.
* Positive integer = access the n-th member/key from the beginning.
* Negative integer = access the n-th member/key from the end.
You may use integers to access both JSON arrays and JSON objects. JSON objects are accessed as an array with the `[key, value, key, value, ...]` layout.
You may use integers to access both JSON arrays and JSON objects.
So, for example:
```
select jsonExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', 1) = 'a'
select jsonExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', 2) = 'hello'
select jsonExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', -2) = 'b'
select JSONExtractKey('{"a": "hello", "b": [-100, 200.0, 300]}', 1) = 'a'
select JSONExtractKey('{"a": "hello", "b": [-100, 200.0, 300]}', 2) = 'b'
select JSONExtractKey('{"a": "hello", "b": [-100, 200.0, 300]}', -1) = 'b'
select JSONExtractKey('{"a": "hello", "b": [-100, 200.0, 300]}', -2) = 'a'
select JSONExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', 1) = 'hello'
```
## jsonLength(params[, accessors]...)
## JSONLength(json[, indices_or_keys]...)
Return the length of a JSON array or a JSON object. For JSON objects, both keys and values are included.
Return the length of a JSON array or a JSON object.
If the value does not exist or has a wrong type, `null` will be returned.
If the value does not exist or has a wrong type, `0` will be returned.
Examples:
```
select jsonLength('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 3
select jsonLength('{"a": "hello", "b": [-100, 200.0, 300]}') = 4
select JSONLength('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 3
select JSONLength('{"a": "hello", "b": [-100, 200.0, 300]}') = 2
```
The usage of accessors is the same as above.
## jsonType(params[, accessors]...)
## JSONType(json[, indices_or_keys]...)
Return the type of a JSON value.
If the value does not exist, `null` will be returned.
If the value does not exist, `Null` will be returned.
Examples:
```
select jsonType('{"a": "hello", "b": [-100, 200.0, 300]}') = 'Object'
select jsonType('{"a": "hello", "b": [-100, 200.0, 300]}', 'a') = 'String'
select jsonType('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 'Array'
select JSONType('{"a": "hello", "b": [-100, 200.0, 300]}') = 'Object'
select JSONType('{"a": "hello", "b": [-100, 200.0, 300]}', 'a') = 'String'
select JSONType('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 'Array'
```
The usage of accessors is the same as above.
## JSONExtractUInt(json[, indices_or_keys]...)
## JSONExtractInt(json[, indices_or_keys]...)
## JSONExtractFloat(json[, indices_or_keys]...)
## JSONExtractBool(json[, indices_or_keys]...)
## jsonExtractUInt(params[, accessors]...)
## jsonExtractInt(params[, accessors]...)
## jsonExtractFloat(params[, accessors]...)
## jsonExtractBool(params[, accessors]...)
## jsonExtractString(params[, accessors]...)
Parses JSON and extracts a value. These functions are similar to the `visitParam` functions.
Parse data from JSON values which is similar to `visitParam` functions.
If the value does not exist or has a wrong type, `null` will be returned.
If the value does not exist or has a wrong type, `0` will be returned.
Examples:
```
select jsonExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', 'a') = 'hello'
select jsonExtractInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 1) = -100
select jsonExtractFloat('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 2) = 200.0
select jsonExtractUInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', -1) = 300
select JSONExtractInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 1) = -100
select JSONExtractFloat('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 2) = 200.0
select JSONExtractUInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', -1) = 300
```
The usage of accessors is the same as above.
## JSONExtractString(json[, indices_or_keys]...)
## jsonExtract(params, type[, accessors]...)
Parses JSON and extracts a string. This function is similar to the `visitParamExtractString` function.
Parse data from JSON values with a given ClickHouse data type.
If the value does not exist or has a wrong type, an empty string will be returned.
If the value does not exist or has a wrong type, `null` will be returned.
The value is unescaped. If unescaping fails, an empty string is returned.
Examples:
```
select jsonExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'Int8', 'b', 1) = -100
select jsonExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'Tuple(String, String, String, Array(Float64))') = ('a', 'hello', 'b', [-100.0, 200.0, 300.0])
select JSONExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', 'a') = 'hello'
select JSONExtractString('{"abc":"\\n\\u0000"}', 'abc') = '\n\0'
select JSONExtractString('{"abc":"\\u263a"}', 'abc') = '☺'
select JSONExtractString('{"abc":"\\u263"}', 'abc') = ''
select JSONExtractString('{"abc":"hello}', 'abc') = ''
```
The usage of accessors is the same as above.
## JSONExtract(json[, indices_or_keys...], return_type)
Parses JSON and extracts a value of the given ClickHouse data type.
This is a generalization of the previous `JSONExtract<type>` functions.
This means
`JSONExtract(..., 'String')` returns exactly the same as `JSONExtractString()`,
`JSONExtract(..., 'Float64')` returns exactly the same as `JSONExtractFloat()`.
Examples:
```
SELECT JSONExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'Tuple(String, Array(Float64))') = ('hello',[-100,200,300])
SELECT JSONExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'Tuple(b Array(Float64), a String)') = ([-100,200,300],'hello')
SELECT JSONExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 'Array(Nullable(Int8))') = [-100, NULL, NULL]
SELECT JSONExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 4, 'Nullable(Int64)') = NULL
SELECT JSONExtract('{"passed": true}', 'passed', 'UInt8') = 1
SELECT JSONExtract('{"day": "Thursday"}', 'day', 'Enum8(\'Sunday\' = 0, \'Monday\' = 1, \'Tuesday\' = 2, \'Wednesday\' = 3, \'Thursday\' = 4, \'Friday\' = 5, \'Saturday\' = 6)') = 'Thursday'
SELECT JSONExtract('{"day": 5}', 'day', 'Enum8(\'Sunday\' = 0, \'Monday\' = 1, \'Tuesday\' = 2, \'Wednesday\' = 3, \'Thursday\' = 4, \'Friday\' = 5, \'Saturday\' = 6)') = 'Friday'
```
## JSONExtractKeysAndValues(json[, indices_or_keys...], value_type)
Parses key-value pairs from JSON, where the values are of the given ClickHouse data type.
Example:
```
SELECT JSONExtractKeysAndValues('{"x": {"a": 5, "b": 7, "c": 11}}', 'x', 'Int8') = [('a',5),('b',7),('c',11)];
```
## JSONExtractRaw(json[, indices_or_keys]...)
Returns a part of JSON.
If the part does not exist or has a wrong type, an empty string will be returned.
Example:
```
select JSONExtractRaw('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = '[-100, 200.0, 300]'
```