From 560246c0c30102f92c4b93077fbc019b12940b1a Mon Sep 17 00:00:00 2001 From: hcz Date: Thu, 14 Mar 2019 10:55:04 +0800 Subject: [PATCH] Add Simdjson API --- contrib/CMakeLists.txt | 2 + contrib/simdjson | 1 + dbms/src/Functions/FunctionsJSON.cpp | 429 ++++++++++++++++++ dbms/src/Functions/FunctionsJSON.h | 227 +++++++++ dbms/src/Functions/registerFunctions.cpp | 2 + .../00918_json_simdjson_api.reference | 16 + .../0_stateless/00918_json_simdjson_api.sql | 16 + 7 files changed, 693 insertions(+) create mode 160000 contrib/simdjson create mode 100644 dbms/src/Functions/FunctionsJSON.cpp create mode 100644 dbms/src/Functions/FunctionsJSON.h create mode 100644 dbms/tests/queries/0_stateless/00918_json_simdjson_api.reference create mode 100644 dbms/tests/queries/0_stateless/00918_json_simdjson_api.sql diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index e553773e84a..f7c6c0dc2c5 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt @@ -313,3 +313,5 @@ endif() if (USE_INTERNAL_HYPERSCAN_LIBRARY) add_subdirectory (hyperscan) endif() +add_subdirectory (simdjson) +set (SIMDJSON_BUILD_STATIC ON CACHE INTERNAL "") diff --git a/contrib/simdjson b/contrib/simdjson new file mode 160000 index 00000000000..b0c43028875 --- /dev/null +++ b/contrib/simdjson @@ -0,0 +1 @@ +Subproject commit b0c43028875e40461046774318088666e6284c52 diff --git a/dbms/src/Functions/FunctionsJSON.cpp b/dbms/src/Functions/FunctionsJSON.cpp new file mode 100644 index 00000000000..2f6d9f0d4b2 --- /dev/null +++ b/dbms/src/Functions/FunctionsJSON.cpp @@ -0,0 +1,429 @@ +#include + +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +template +class JSONNullableImplBase +{ +public: + static DataTypePtr getType() + { + return std::make_shared( + std::make_shared() + ); + } + + static Field getDefault() + { + return {}; + } +}; + +class JSONHasImpl: public JSONNullableImplBase +{ +public: + static constexpr auto name {"jsonHas"}; + + static Field getValue(ParsedJson::iterator &) + { + return {1}; + } +}; + +class JSONLengthImpl: public JSONNullableImplBase +{ +public: + static constexpr auto name {"jsonLength"}; + + static Field getValue(ParsedJson::iterator & pjh) + { + if (!pjh.is_object_or_array()) + return getDefault(); + + size_t size = 0; + + if (pjh.down()) + { + size += 1; + + while (pjh.next()) + size += 1; + } + + return {size}; + } +}; + +class JSONTypeImpl: public JSONNullableImplBase +{ +public: + static constexpr auto name {"jsonType"}; + + static Field getValue(ParsedJson::iterator & pjh) + { + // types: [{"sltfn + return pjh.get_type(); + } +}; + +class JSONExtractImpl +{ +public: + static constexpr auto name {"jsonExtract"}; + + static DataTypePtr getType(const DataTypePtr & type) + { + WhichDataType which { + type + }; + + if (which.isNativeUInt()) + return std::make_shared( + std::make_shared() + ); + + if (which.isNativeInt()) + return std::make_shared( + std::make_shared() + ); + + if (which.isFloat()) + return std::make_shared( + std::make_shared() + ); + + if ( + which.isEnum() + || which.isDateOrDateTime() + || which.isStringOrFixedString() + || which.isInterval() + ) + return std::make_shared( + type + ); + + if (which.isArray()) + { + auto array_type { + static_cast(type.get()) + }; + + return std::make_shared( + getType(array_type->getNestedType()) + ); + } + + if (which.isTuple()) + { + auto tuple_type { + static_cast(type.get()) + }; + + DataTypes types; + types.reserve(tuple_type->getElements().size()); + + for (const DataTypePtr & element: tuple_type->getElements()) + { + types.push_back(getType(element)); + } + + return std::make_shared( + std::move(types) + ); + } + + throw Exception { + "Unsupported return type schema: " + type->getName(), + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT + }; + } + + static Field getDefault(const DataTypePtr & type) + { + WhichDataType which { + type + }; + + if ( + which.isNativeUInt() + || which.isNativeInt() + || which.isFloat() + || which.isEnum() + || which.isDateOrDateTime() + || which.isStringOrFixedString() + || which.isInterval() + ) + return {}; + + if (which.isArray()) + return {Array {}}; + + if (which.isTuple()) + { + auto tuple_type { + static_cast(type.get()) + }; + + Tuple tuple; + tuple.toUnderType().reserve(tuple_type->getElements().size()); + + for (const DataTypePtr & element: tuple_type->getElements()) + tuple.toUnderType().push_back(getDefault(element)); + + return {tuple}; + } + + // should not reach + throw Exception { + "Unsupported return type schema: " + type->getName(), + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT + }; + } + + static Field getValue(ParsedJson::iterator & pjh, const DataTypePtr & type) + { + WhichDataType which { + type + }; + + if ( + which.isNativeUInt() + || which.isNativeInt() + || which.isEnum() + || which.isDateOrDateTime() + || which.isInterval() + ) + { + if (pjh.is_integer()) + return {pjh.get_integer()}; + else + return getDefault(type); + } + + if (which.isFloat()) + { + if (pjh.is_integer()) + return {(double) pjh.get_integer()}; + else if (pjh.is_double()) + return {pjh.get_double()}; + else + return getDefault(type); + } + + if (which.isStringOrFixedString()) + { + if (pjh.is_string()) + return {String {pjh.get_string()}}; + else + return getDefault(type); + + } + + if (which.isArray()) + { + if (!pjh.is_object_or_array()) + return getDefault(type); + + auto array_type { + static_cast(type.get()) + }; + + Array array; + + bool first = true; + + while (first ? pjh.down() : pjh.next()) + { + first = false; + + ParsedJson::iterator pjh1 { + pjh + }; + + array.push_back(getValue(pjh1, array_type->getNestedType())); + } + + return {array}; + } + + if (which.isTuple()) + { + if (!pjh.is_object_or_array()) + return getDefault(type); + + auto tuple_type { + static_cast(type.get()) + }; + + Tuple tuple; + tuple.toUnderType().reserve(tuple_type->getElements().size()); + + bool valid = true; + bool first = true; + + for (const DataTypePtr & element: tuple_type->getElements()) + { + if (valid) + { + valid &= first ? pjh.down() : pjh.next(); + first = false; + + ParsedJson::iterator pjh1 { + pjh + }; + + tuple.toUnderType().push_back(getValue(pjh1, element)); + } + else + tuple.toUnderType().push_back(getDefault(element)); + } + + return {tuple}; + } + + // should not reach + throw Exception { + "Unsupported return type schema: " + type->getName(), + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT + }; + } +}; + +class JSONExtractUIntImpl: public JSONNullableImplBase +{ +public: + static constexpr auto name {"jsonExtractUInt"}; + + static Field getValue(ParsedJson::iterator & pjh) + { + if (pjh.is_integer()) + return {pjh.get_integer()}; + else + return getDefault(); + } +}; + +class JSONExtractIntImpl: public JSONNullableImplBase +{ +public: + static constexpr auto name {"jsonExtractInt"}; + + static Field getValue(ParsedJson::iterator & pjh) + { + if (pjh.is_integer()) + return {pjh.get_integer()}; + else + return getDefault(); + } +}; + +class JSONExtractFloatImpl: public JSONNullableImplBase +{ +public: + static constexpr auto name {"jsonExtractFloat"}; + + static Field getValue(ParsedJson::iterator & pjh) + { + if (pjh.is_double()) + return {pjh.get_double()}; + else + return getDefault(); + } +}; + +class JSONExtractBoolImpl: public JSONNullableImplBase +{ +public: + static constexpr auto name {"jsonExtractBool"}; + + static Field getValue(ParsedJson::iterator & pjh) + { + if (pjh.get_type() == 't') + return {1}; + else if (pjh.get_type() == 'f') + return {0}; + else + return getDefault(); + } +}; + +// class JSONExtractRawImpl: public JSONNullableImplBase +// { +// public: +// static constexpr auto name {"jsonExtractRaw"}; + +// static Field getValue(ParsedJson::iterator & pjh) +// { +// // +// } +// }; + +class JSONExtractStringImpl: public JSONNullableImplBase +{ +public: + static constexpr auto name {"jsonExtractString"}; + + static Field getValue(ParsedJson::iterator & pjh) + { + if (pjh.is_string()) + return {String {pjh.get_string()}}; + else + return getDefault(); + + } +}; + +void registerFunctionsJSON(FunctionFactory & factory) +{ + factory.registerFunction>(); + factory.registerFunction>(); + factory.registerFunction>(); + factory.registerFunction>(); + factory.registerFunction>(); + factory.registerFunction>(); + factory.registerFunction>(); + factory.registerFunction>(); + // factory.registerFunction>(); + factory.registerFunction>(); +} + +} diff --git a/dbms/src/Functions/FunctionsJSON.h b/dbms/src/Functions/FunctionsJSON.h new file mode 100644 index 00000000000..b6c8f952f1e --- /dev/null +++ b/dbms/src/Functions/FunctionsJSON.h @@ -0,0 +1,227 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int ILLEGAL_TYPE_OF_ARGUMENT; + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; +} + +template +class FunctionJSONBase : public IFunction +{ +private: + enum class Action + { + key = 1, + index = 2, + }; + + mutable std::vector actions; + mutable DataTypePtr virtual_type; + + bool tryMove( + ParsedJson::iterator & pjh, + Action action, + const Field & accessor + ) + { + switch (action) + { + case Action::key: + if ( + !pjh.is_object() + || !pjh.move_to_key(accessor.get().data()) + ) + return false; + + break; + case Action::index: + if ( + !pjh.is_object_or_array() + || !pjh.down() + ) + return false; + + int steps = accessor.get(); + + if (steps > 0) + steps -= 1; + else if (steps < 0) + { + steps += 1; + + ParsedJson::iterator pjh1 { + pjh + }; + + while (pjh1.next()) + steps += 1; + } + else + return false; + + for (const auto i : ext::range(0, steps)) + { + (void) i; + + if (!pjh.next()) + return false; + } + + break; + } + + return true; + } + +public: + static constexpr auto name = Impl::name; + + static FunctionPtr create(const Context &) + { + return std::make_shared(); + } + + String getName() const override + { + return Impl::name; + } + + bool isVariadic() const override + { + return true; + } + + size_t getNumberOfArguments() const override + { + return 0; + } + + bool useDefaultImplementationForConstants() const override + { + return true; + } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + if constexpr (ExtraArg) + { + if (arguments.size() < 2) + throw Exception { + "Function " + getName() + " requires at least two arguments", + ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH + }; + + virtual_type = arguments[1]; + } + else + { + if (arguments.size() < 1) + throw Exception { + "Function " + getName() + " requires at least one arguments", + ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH + }; + } + + if (!isString(arguments[0])) + throw Exception { + "Illegal type " + arguments[0]->getName() + + " of argument of function " + getName(), + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT + }; + + actions.reserve(arguments.size() - 1 - ExtraArg); + + for (const auto i : ext::range(1 + ExtraArg, arguments.size())) + { + if (isString(arguments[i])) + actions.push_back(Action::key); + else if (isInteger(arguments[i])) + actions.push_back(Action::index); + else + throw Exception { + "Illegal type " + arguments[i]->getName() + + " of argument of function " + getName(), + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT + }; + } + + if constexpr (ExtraArg) + return Impl::getType(virtual_type); + else + return Impl::getType(); + } + + void executeImpl( + Block & block, + const ColumnNumbers & arguments, + size_t result_pos, + size_t input_rows_count + ) override + { + MutableColumnPtr to { + block.getByPosition(result_pos).type->createColumn() + }; + to->reserve(input_rows_count); + + const ColumnPtr & arg_json = block.getByPosition(arguments[0]).column; + + for (const auto i : ext::range(0, input_rows_count)) + { + // TODO: avoid multiple memory allocation? + ParsedJson pj { + build_parsed_json((*arg_json)[i].get()) + }; + ParsedJson::iterator pjh { + pj + }; + + bool ok = true; + + for (const auto j : ext::range(0, actions.size())) + { + ok = tryMove( + pjh, + actions[j], + (*block.getByPosition(arguments[j + 1 + ExtraArg]).column)[i] + ); + + if (!ok) + break; + } + + if (ok) + { + if constexpr (ExtraArg) + to->insert(Impl::getValue(pjh, virtual_type)); + else + to->insert(Impl::getValue(pjh)); + } + else + { + if constexpr (ExtraArg) + to->insert(Impl::getDefault(virtual_type)); + else + to->insert(Impl::getDefault()); + } + } + + block.getByPosition(result_pos).column = std::move(to); + } +}; + +} diff --git a/dbms/src/Functions/registerFunctions.cpp b/dbms/src/Functions/registerFunctions.cpp index 4fa77aaa02d..af1bd6a34cf 100644 --- a/dbms/src/Functions/registerFunctions.cpp +++ b/dbms/src/Functions/registerFunctions.cpp @@ -40,6 +40,7 @@ void registerFunctionsMath(FunctionFactory &); void registerFunctionsGeo(FunctionFactory &); void registerFunctionsNull(FunctionFactory &); void registerFunctionsFindCluster(FunctionFactory &); +void registerFunctionsJSON(FunctionFactory &); void registerFunctionTransform(FunctionFactory &); #if USE_ICU @@ -82,6 +83,7 @@ void registerFunctions() registerFunctionsGeo(factory); registerFunctionsNull(factory); registerFunctionsFindCluster(factory); + registerFunctionsJSON(factory); registerFunctionTransform(factory); #if USE_ICU diff --git a/dbms/tests/queries/0_stateless/00918_json_simdjson_api.reference b/dbms/tests/queries/0_stateless/00918_json_simdjson_api.reference new file mode 100644 index 00000000000..85ae16b842c --- /dev/null +++ b/dbms/tests/queries/0_stateless/00918_json_simdjson_api.reference @@ -0,0 +1,16 @@ +4 +123 +1 +1 +a +hello +hello +3 +91 +-100 +200 +300 +('a','hello','b',[-100,200,300]) +[-100,NULL,300] +['a','hello','b',NULL] +[(NULL,NULL,NULL),(NULL,NULL,NULL),(NULL,NULL,NULL),(-100,200,300)] diff --git a/dbms/tests/queries/0_stateless/00918_json_simdjson_api.sql b/dbms/tests/queries/0_stateless/00918_json_simdjson_api.sql new file mode 100644 index 00000000000..6fad2e5ce42 --- /dev/null +++ b/dbms/tests/queries/0_stateless/00918_json_simdjson_api.sql @@ -0,0 +1,16 @@ +select jsonLength('{"a": "hello", "b": [-100, 200.0, 300]}'); +select jsonType('{"a": "hello", "b": [-100, 200.0, 300]}'); +select jsonHas('{"a": "hello", "b": [-100, 200.0, 300]}', 'a'); +select jsonHas('{"a": "hello", "b": [-100, 200.0, 300]}', 'b'); +select jsonExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', 1); +select jsonExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', 2); +select jsonExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', 'a'); +select jsonLength('{"a": "hello", "b": [-100, 200.0, 300]}', 'b'); +select jsonType('{"a": "hello", "b": [-100, 200.0, 300]}', 'b'); +select jsonExtractInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 1); +select jsonExtractFloat('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 2); +select jsonExtractUInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', -1); +select jsonExtract('{"a": "hello", "b": [-100, 200.0, 300]}', ('', '', '', [0.0])); +select jsonExtract('{"a": "hello", "b": [-100, 200.0, 300]}', [-0], 'b'); +select jsonExtract('{"a": "hello", "b": [-100, 200.0, 300]}', ['']); +select jsonExtract('{"a": "hello", "b": [-100, 200.0, 300]}', [(-0, 0.0, 0)]);