Add Simdjson API

This commit is contained in:
hcz 2019-03-14 10:55:04 +08:00 committed by Vitaly Baranov
parent e14dc2cac0
commit 560246c0c3
7 changed files with 693 additions and 0 deletions

View File

@ -313,3 +313,5 @@ endif()
if (USE_INTERNAL_HYPERSCAN_LIBRARY)
add_subdirectory (hyperscan)
endif()
add_subdirectory (simdjson)
set (SIMDJSON_BUILD_STATIC ON CACHE INTERNAL "")

1
contrib/simdjson vendored Submodule

@ -0,0 +1 @@
Subproject commit b0c43028875e40461046774318088666e6284c52

View File

@ -0,0 +1,429 @@
#include <Functions/FunctionsJSON.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeTuple.h>
#include <Functions/FunctionFactory.h>
namespace DB
{
template <typename T>
class JSONNullableImplBase
{
public:
static DataTypePtr getType()
{
return std::make_shared<DataTypeNullable>(
std::make_shared<T>()
);
}
static Field getDefault()
{
return {};
}
};
class JSONHasImpl: public JSONNullableImplBase<DataTypeUInt8>
{
public:
static constexpr auto name {"jsonHas"};
static Field getValue(ParsedJson::iterator &)
{
return {1};
}
};
class JSONLengthImpl: public JSONNullableImplBase<DataTypeUInt64>
{
public:
static constexpr auto name {"jsonLength"};
static Field getValue(ParsedJson::iterator & pjh)
{
if (!pjh.is_object_or_array())
return getDefault();
size_t size = 0;
if (pjh.down())
{
size += 1;
while (pjh.next())
size += 1;
}
return {size};
}
};
class JSONTypeImpl: public JSONNullableImplBase<DataTypeUInt8>
{
public:
static constexpr auto name {"jsonType"};
static Field getValue(ParsedJson::iterator & pjh)
{
// types: [{"sltfn
return pjh.get_type();
}
};
class JSONExtractImpl
{
public:
static constexpr auto name {"jsonExtract"};
static DataTypePtr getType(const DataTypePtr & type)
{
WhichDataType which {
type
};
if (which.isNativeUInt())
return std::make_shared<DataTypeNullable>(
std::make_shared<DataTypeUInt64>()
);
if (which.isNativeInt())
return std::make_shared<DataTypeNullable>(
std::make_shared<DataTypeInt64>()
);
if (which.isFloat())
return std::make_shared<DataTypeNullable>(
std::make_shared<DataTypeFloat64>()
);
if (
which.isEnum()
|| which.isDateOrDateTime()
|| which.isStringOrFixedString()
|| which.isInterval()
)
return std::make_shared<DataTypeNullable>(
type
);
if (which.isArray())
{
auto array_type {
static_cast<const DataTypeArray *>(type.get())
};
return std::make_shared<DataTypeArray>(
getType(array_type->getNestedType())
);
}
if (which.isTuple())
{
auto tuple_type {
static_cast<const DataTypeTuple *>(type.get())
};
DataTypes types;
types.reserve(tuple_type->getElements().size());
for (const DataTypePtr & element: tuple_type->getElements())
{
types.push_back(getType(element));
}
return std::make_shared<DataTypeTuple>(
std::move(types)
);
}
throw Exception {
"Unsupported return type schema: " + type->getName(),
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT
};
}
static Field getDefault(const DataTypePtr & type)
{
WhichDataType which {
type
};
if (
which.isNativeUInt()
|| which.isNativeInt()
|| which.isFloat()
|| which.isEnum()
|| which.isDateOrDateTime()
|| which.isStringOrFixedString()
|| which.isInterval()
)
return {};
if (which.isArray())
return {Array {}};
if (which.isTuple())
{
auto tuple_type {
static_cast<const DataTypeTuple *>(type.get())
};
Tuple tuple;
tuple.toUnderType().reserve(tuple_type->getElements().size());
for (const DataTypePtr & element: tuple_type->getElements())
tuple.toUnderType().push_back(getDefault(element));
return {tuple};
}
// should not reach
throw Exception {
"Unsupported return type schema: " + type->getName(),
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT
};
}
static Field getValue(ParsedJson::iterator & pjh, const DataTypePtr & type)
{
WhichDataType which {
type
};
if (
which.isNativeUInt()
|| which.isNativeInt()
|| which.isEnum()
|| which.isDateOrDateTime()
|| which.isInterval()
)
{
if (pjh.is_integer())
return {pjh.get_integer()};
else
return getDefault(type);
}
if (which.isFloat())
{
if (pjh.is_integer())
return {(double) pjh.get_integer()};
else if (pjh.is_double())
return {pjh.get_double()};
else
return getDefault(type);
}
if (which.isStringOrFixedString())
{
if (pjh.is_string())
return {String {pjh.get_string()}};
else
return getDefault(type);
}
if (which.isArray())
{
if (!pjh.is_object_or_array())
return getDefault(type);
auto array_type {
static_cast<const DataTypeArray *>(type.get())
};
Array array;
bool first = true;
while (first ? pjh.down() : pjh.next())
{
first = false;
ParsedJson::iterator pjh1 {
pjh
};
array.push_back(getValue(pjh1, array_type->getNestedType()));
}
return {array};
}
if (which.isTuple())
{
if (!pjh.is_object_or_array())
return getDefault(type);
auto tuple_type {
static_cast<const DataTypeTuple *>(type.get())
};
Tuple tuple;
tuple.toUnderType().reserve(tuple_type->getElements().size());
bool valid = true;
bool first = true;
for (const DataTypePtr & element: tuple_type->getElements())
{
if (valid)
{
valid &= first ? pjh.down() : pjh.next();
first = false;
ParsedJson::iterator pjh1 {
pjh
};
tuple.toUnderType().push_back(getValue(pjh1, element));
}
else
tuple.toUnderType().push_back(getDefault(element));
}
return {tuple};
}
// should not reach
throw Exception {
"Unsupported return type schema: " + type->getName(),
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT
};
}
};
class JSONExtractUIntImpl: public JSONNullableImplBase<DataTypeUInt64>
{
public:
static constexpr auto name {"jsonExtractUInt"};
static Field getValue(ParsedJson::iterator & pjh)
{
if (pjh.is_integer())
return {pjh.get_integer()};
else
return getDefault();
}
};
class JSONExtractIntImpl: public JSONNullableImplBase<DataTypeInt64>
{
public:
static constexpr auto name {"jsonExtractInt"};
static Field getValue(ParsedJson::iterator & pjh)
{
if (pjh.is_integer())
return {pjh.get_integer()};
else
return getDefault();
}
};
class JSONExtractFloatImpl: public JSONNullableImplBase<DataTypeFloat64>
{
public:
static constexpr auto name {"jsonExtractFloat"};
static Field getValue(ParsedJson::iterator & pjh)
{
if (pjh.is_double())
return {pjh.get_double()};
else
return getDefault();
}
};
class JSONExtractBoolImpl: public JSONNullableImplBase<DataTypeUInt8>
{
public:
static constexpr auto name {"jsonExtractBool"};
static Field getValue(ParsedJson::iterator & pjh)
{
if (pjh.get_type() == 't')
return {1};
else if (pjh.get_type() == 'f')
return {0};
else
return getDefault();
}
};
// class JSONExtractRawImpl: public JSONNullableImplBase<DataTypeString>
// {
// public:
// static constexpr auto name {"jsonExtractRaw"};
// static Field getValue(ParsedJson::iterator & pjh)
// {
// //
// }
// };
class JSONExtractStringImpl: public JSONNullableImplBase<DataTypeString>
{
public:
static constexpr auto name {"jsonExtractString"};
static Field getValue(ParsedJson::iterator & pjh)
{
if (pjh.is_string())
return {String {pjh.get_string()}};
else
return getDefault();
}
};
void registerFunctionsJSON(FunctionFactory & factory)
{
factory.registerFunction<FunctionJSONBase<
JSONHasImpl,
false
>>();
factory.registerFunction<FunctionJSONBase<
JSONLengthImpl,
false
>>();
factory.registerFunction<FunctionJSONBase<
JSONTypeImpl,
false
>>();
factory.registerFunction<FunctionJSONBase<
JSONExtractImpl,
true
>>();
factory.registerFunction<FunctionJSONBase<
JSONExtractUIntImpl,
false
>>();
factory.registerFunction<FunctionJSONBase<
JSONExtractIntImpl,
false
>>();
factory.registerFunction<FunctionJSONBase<
JSONExtractFloatImpl,
false
>>();
factory.registerFunction<FunctionJSONBase<
JSONExtractBoolImpl,
false
>>();
// factory.registerFunction<FunctionJSONBase<
// JSONExtractRawImpl,
// false
// >>();
factory.registerFunction<FunctionJSONBase<
JSONExtractStringImpl,
false
>>();
}
}

View File

@ -0,0 +1,227 @@
#pragma once
#include <Columns/ColumnConst.h>
#include <Columns/ColumnsNumber.h>
#include <Common/typeid_cast.h>
#include <DataTypes/DataTypesNumber.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/IFunction.h>
#include <ext/range.h>
#include <simdjson/jsonparser.h>
namespace DB
{
namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
}
template <typename Impl, bool ExtraArg>
class FunctionJSONBase : public IFunction
{
private:
enum class Action
{
key = 1,
index = 2,
};
mutable std::vector<Action> actions;
mutable DataTypePtr virtual_type;
bool tryMove(
ParsedJson::iterator & pjh,
Action action,
const Field & accessor
)
{
switch (action)
{
case Action::key:
if (
!pjh.is_object()
|| !pjh.move_to_key(accessor.get<String>().data())
)
return false;
break;
case Action::index:
if (
!pjh.is_object_or_array()
|| !pjh.down()
)
return false;
int steps = accessor.get<Int64>();
if (steps > 0)
steps -= 1;
else if (steps < 0)
{
steps += 1;
ParsedJson::iterator pjh1 {
pjh
};
while (pjh1.next())
steps += 1;
}
else
return false;
for (const auto i : ext::range(0, steps))
{
(void) i;
if (!pjh.next())
return false;
}
break;
}
return true;
}
public:
static constexpr auto name = Impl::name;
static FunctionPtr create(const Context &)
{
return std::make_shared<FunctionJSONBase>();
}
String getName() const override
{
return Impl::name;
}
bool isVariadic() const override
{
return true;
}
size_t getNumberOfArguments() const override
{
return 0;
}
bool useDefaultImplementationForConstants() const override
{
return true;
}
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
if constexpr (ExtraArg)
{
if (arguments.size() < 2)
throw Exception {
"Function " + getName() + " requires at least two arguments",
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH
};
virtual_type = arguments[1];
}
else
{
if (arguments.size() < 1)
throw Exception {
"Function " + getName() + " requires at least one arguments",
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH
};
}
if (!isString(arguments[0]))
throw Exception {
"Illegal type " + arguments[0]->getName()
+ " of argument of function " + getName(),
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT
};
actions.reserve(arguments.size() - 1 - ExtraArg);
for (const auto i : ext::range(1 + ExtraArg, arguments.size()))
{
if (isString(arguments[i]))
actions.push_back(Action::key);
else if (isInteger(arguments[i]))
actions.push_back(Action::index);
else
throw Exception {
"Illegal type " + arguments[i]->getName()
+ " of argument of function " + getName(),
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT
};
}
if constexpr (ExtraArg)
return Impl::getType(virtual_type);
else
return Impl::getType();
}
void executeImpl(
Block & block,
const ColumnNumbers & arguments,
size_t result_pos,
size_t input_rows_count
) override
{
MutableColumnPtr to {
block.getByPosition(result_pos).type->createColumn()
};
to->reserve(input_rows_count);
const ColumnPtr & arg_json = block.getByPosition(arguments[0]).column;
for (const auto i : ext::range(0, input_rows_count))
{
// TODO: avoid multiple memory allocation?
ParsedJson pj {
build_parsed_json((*arg_json)[i].get<String>())
};
ParsedJson::iterator pjh {
pj
};
bool ok = true;
for (const auto j : ext::range(0, actions.size()))
{
ok = tryMove(
pjh,
actions[j],
(*block.getByPosition(arguments[j + 1 + ExtraArg]).column)[i]
);
if (!ok)
break;
}
if (ok)
{
if constexpr (ExtraArg)
to->insert(Impl::getValue(pjh, virtual_type));
else
to->insert(Impl::getValue(pjh));
}
else
{
if constexpr (ExtraArg)
to->insert(Impl::getDefault(virtual_type));
else
to->insert(Impl::getDefault());
}
}
block.getByPosition(result_pos).column = std::move(to);
}
};
}

View File

@ -40,6 +40,7 @@ void registerFunctionsMath(FunctionFactory &);
void registerFunctionsGeo(FunctionFactory &);
void registerFunctionsNull(FunctionFactory &);
void registerFunctionsFindCluster(FunctionFactory &);
void registerFunctionsJSON(FunctionFactory &);
void registerFunctionTransform(FunctionFactory &);
#if USE_ICU
@ -82,6 +83,7 @@ void registerFunctions()
registerFunctionsGeo(factory);
registerFunctionsNull(factory);
registerFunctionsFindCluster(factory);
registerFunctionsJSON(factory);
registerFunctionTransform(factory);
#if USE_ICU

View File

@ -0,0 +1,16 @@
4
123
1
1
a
hello
hello
3
91
-100
200
300
('a','hello','b',[-100,200,300])
[-100,NULL,300]
['a','hello','b',NULL]
[(NULL,NULL,NULL),(NULL,NULL,NULL),(NULL,NULL,NULL),(-100,200,300)]

View File

@ -0,0 +1,16 @@
select jsonLength('{"a": "hello", "b": [-100, 200.0, 300]}');
select jsonType('{"a": "hello", "b": [-100, 200.0, 300]}');
select jsonHas('{"a": "hello", "b": [-100, 200.0, 300]}', 'a');
select jsonHas('{"a": "hello", "b": [-100, 200.0, 300]}', 'b');
select jsonExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', 1);
select jsonExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', 2);
select jsonExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', 'a');
select jsonLength('{"a": "hello", "b": [-100, 200.0, 300]}', 'b');
select jsonType('{"a": "hello", "b": [-100, 200.0, 300]}', 'b');
select jsonExtractInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 1);
select jsonExtractFloat('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 2);
select jsonExtractUInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', -1);
select jsonExtract('{"a": "hello", "b": [-100, 200.0, 300]}', ('', '', '', [0.0]));
select jsonExtract('{"a": "hello", "b": [-100, 200.0, 300]}', [-0], 'b');
select jsonExtract('{"a": "hello", "b": [-100, 200.0, 300]}', ['']);
select jsonExtract('{"a": "hello", "b": [-100, 200.0, 300]}', [(-0, 0.0, 0)]);