Merge pull request #42761 from AlfVII/fix-slow-json-extract-with-low-cardinality

Fixed slowness in JSONExtract with LowCardinality(String) tuples
This commit is contained in:
Kruglov Pavel 2022-11-17 12:49:18 +01:00 committed by GitHub
commit 1b68f605a2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 305 additions and 31 deletions

View File

@ -20,17 +20,19 @@
#include <Columns/ColumnArray.h>
#include <Columns/ColumnTuple.h>
#include <DataTypes/Serializations/SerializationDecimal.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeLowCardinality.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypesDecimal.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeEnum.h>
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/DataTypeFixedString.h>
#include <DataTypes/DataTypeLowCardinality.h>
#include <DataTypes/DataTypeNothing.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypeUUID.h>
#include <DataTypes/DataTypesDecimal.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/Serializations/SerializationDecimal.h>
#include <Functions/FunctionFactory.h>
#include <Functions/IFunction.h>
@ -720,8 +722,16 @@ public:
return false;
}
auto & col_vec = assert_cast<ColumnVector<NumberType> &>(dest);
col_vec.insertValue(value);
if (dest.getDataType() == TypeIndex::LowCardinality)
{
ColumnLowCardinality & col_low = assert_cast<ColumnLowCardinality &>(dest);
col_low.insertData(reinterpret_cast<const char *>(&value), sizeof(value));
}
else
{
auto & col_vec = assert_cast<ColumnVector<NumberType> &>(dest);
col_vec.insertValue(value);
}
return true;
}
};
@ -825,8 +835,17 @@ public:
return JSONExtractRawImpl<JSONParser>::insertResultToColumn(dest, element, {});
auto str = element.getString();
ColumnString & col_str = assert_cast<ColumnString &>(dest);
col_str.insertData(str.data(), str.size());
if (dest.getDataType() == TypeIndex::LowCardinality)
{
ColumnLowCardinality & col_low = assert_cast<ColumnLowCardinality &>(dest);
col_low.insertData(str.data(), str.size());
}
else
{
ColumnString & col_str = assert_cast<ColumnString &>(dest);
col_str.insertData(str.data(), str.size());
}
return true;
}
};
@ -855,25 +874,41 @@ struct JSONExtractTree
}
};
/// Extraction node that writes a JSON value directly into a LowCardinality(FixedString(N)) column.
///
/// `dest` is always accessed as a ColumnLowCardinality; the fixed length N is supplied at
/// construction because it is needed to validate and pad incoming values.
class LowCardinalityFixedStringNode : public Node
{
public:
    /// @param fixed_length_  N of the underlying FixedString(N) dictionary type.
    explicit LowCardinalityFixedStringNode(const size_t fixed_length_) : fixed_length(fixed_length_) { }

    /// Inserts `element` into `dest`.
    /// Returns false (inserting nothing) when the element is neither an object nor a string,
    /// or when the string is longer than `fixed_length`.
    bool insertResultToColumn(IColumn & dest, const Element & element) override
    {
        // If element is an object we delegate the insertion to JSONExtractRawImpl
        if (element.isObject())
            return JSONExtractRawImpl<JSONParser>::insertResultToLowCardinalityFixedStringColumn(dest, element, fixed_length);
        else if (!element.isString())
            return false;

        auto str = element.getString();
        if (str.size() > fixed_length)
            return false;

        // For the non low cardinality case of FixedString, the padding is done in the FixedString Column implementation.
        // In order to avoid having to pass the data to a FixedString Column and read it back (which would slow down the execution)
        // the data is padded here and written directly to the Low Cardinality Column
        if (str.size() == fixed_length)
        {
            // Already exactly N bytes: no copy needed.
            assert_cast<ColumnLowCardinality &>(dest).insertData(str.data(), str.size());
        }
        else
        {
            // Shorter than N: copy and right-pad with zero bytes up to N.
            String padded_str(str);
            padded_str.resize(fixed_length, '\0');
            assert_cast<ColumnLowCardinality &>(dest).insertData(padded_str.data(), padded_str.size());
        }
        return true;
    }

private:
    const size_t fixed_length;
};
class UUIDNode : public Node
@ -885,7 +920,15 @@ struct JSONExtractTree
return false;
auto uuid = parseFromString<UUID>(element.getString());
assert_cast<ColumnUUID &>(dest).insert(uuid);
if (dest.getDataType() == TypeIndex::LowCardinality)
{
ColumnLowCardinality & col_low = assert_cast<ColumnLowCardinality &>(dest);
col_low.insertData(reinterpret_cast<const char *>(&uuid), sizeof(uuid));
}
else
{
assert_cast<ColumnUUID &>(dest).insert(uuid);
}
return true;
}
};
@ -928,6 +971,7 @@ struct JSONExtractTree
assert_cast<ColumnDecimal<DecimalType> &>(dest).insert(value);
return true;
}
private:
DataTypePtr data_type;
};
@ -946,13 +990,18 @@ struct JSONExtractTree
public:
/// Inserts `element` into a FixedString destination column.
///
/// - JSON null: nothing is inserted, returns false.
/// - Any other non-string element: delegated to
///   JSONExtractRawImpl<JSONParser>::insertResultToFixedStringColumn.
/// - String: inserted if it is not longer than the column's N, otherwise returns false.
bool insertResultToColumn(IColumn & dest, const Element & element) override
{
    if (element.isNull())
        return false;

    if (!element.isString())
        return JSONExtractRawImpl<JSONParser>::insertResultToFixedStringColumn(dest, element, {});

    auto str = element.getString();
    auto & col_str = assert_cast<ColumnFixedString &>(dest);
    if (str.size() > col_str.getN())
        return false;

    // Strings shorter than N are passed through as-is; padding is presumably done by ColumnFixedString::insertData.
    col_str.insertData(str.data(), str.size());
    return true;
}
};
@ -1178,9 +1227,18 @@ struct JSONExtractTree
case TypeIndex::UUID: return std::make_unique<UUIDNode>();
case TypeIndex::LowCardinality:
{
// The low cardinality case is treated in two different ways:
// For the FixedString type, a special class is implemented for inserting the data into the destination column,
// as the string length must be passed in order to check and pad the incoming data.
// For the rest of the low cardinality types, the insertion is done in their corresponding class, adapting the data
// as needed for the insertData function of ColumnLowCardinality.
auto dictionary_type = typeid_cast<const DataTypeLowCardinality *>(type.get())->getDictionaryType();
auto impl = build(function_name, dictionary_type);
return std::make_unique<LowCardinalityNode>(dictionary_type, std::move(impl));
if ((*dictionary_type).getTypeId() == TypeIndex::FixedString)
{
auto fixed_length = typeid_cast<const DataTypeFixedString *>(dictionary_type.get())->getN();
return std::make_unique<LowCardinalityFixedStringNode>(fixed_length);
}
return build(function_name, dictionary_type);
}
case TypeIndex::Decimal256: return std::make_unique<DecimalNode<Decimal256>>(type);
case TypeIndex::Decimal128: return std::make_unique<DecimalNode<Decimal128>>(type);
@ -1332,13 +1390,63 @@ public:
/// Serializes `element` with traverse() and appends the resulting text to `dest`.
///
/// `dest` is either a ColumnLowCardinality or a ColumnString; the column kind is
/// checked first so that the matching cast is the only one performed.
/// The trailing std::string_view parameter is unused. Always returns true.
static bool insertResultToColumn(IColumn & dest, const Element & element, std::string_view)
{
    if (dest.getDataType() == TypeIndex::LowCardinality)
    {
        // Serialize into a temporary buffer and hand the bytes to the low cardinality column.
        ColumnString::Chars chars;
        WriteBufferFromVector<ColumnString::Chars> buf(chars, AppendModeTag());
        traverse(element, buf);
        buf.finalize();
        assert_cast<ColumnLowCardinality &>(dest).insertData(reinterpret_cast<const char *>(chars.data()), chars.size());
    }
    else
    {
        // Serialize straight into the string column's character storage (append mode),
        // then add the terminating zero byte and record the new end offset.
        ColumnString & col_str = assert_cast<ColumnString &>(dest);
        auto & chars = col_str.getChars();
        WriteBufferFromVector<ColumnString::Chars> buf(chars, AppendModeTag());
        traverse(element, buf);
        buf.finalize();
        chars.push_back(0);
        col_str.getOffsets().push_back(chars.size());
    }
    return true;
}
// We use insertResultToFixedStringColumn in case we are inserting raw data in a FixedString column
/// Serializes `element` with traverse() into a temporary buffer and inserts it into the
/// ColumnFixedString `dest`, zero-filled up to the column's N.
/// Returns false (inserting nothing) when the serialized text is longer than N.
/// The trailing std::string_view parameter is unused.
static bool insertResultToFixedStringColumn(IColumn & dest, const Element & element, std::string_view)
{
    ColumnFixedString::Chars chars;
    WriteBufferFromVector<ColumnFixedString::Chars> buf(chars, AppendModeTag());
    traverse(element, buf);
    buf.finalize();

    auto & col_str = assert_cast<ColumnFixedString &>(dest);
    if (chars.size() > col_str.getN())
        return false;

    // Pad to exactly N bytes before inserting; no terminator or offset is written here.
    chars.resize_fill(col_str.getN());
    col_str.insertData(reinterpret_cast<const char *>(chars.data()), chars.size());
    return true;
}
// We use insertResultToLowCardinalityFixedStringColumn in case we are inserting raw data in a Low Cardinality FixedString column
/// Serializes `element` with traverse() into a temporary buffer, zero-fills it up to
/// `fixed_length` and inserts it into the ColumnLowCardinality `dest`.
/// Returns false (inserting nothing) when the serialized text is longer than `fixed_length`.
static bool insertResultToLowCardinalityFixedStringColumn(IColumn & dest, const Element & element, size_t fixed_length)
{
    // Cheap early-out before serializing: an object with more members than available bytes cannot fit.
    // Guarded by isObject() so that getObject() is never called on a non-object element.
    if (element.isObject() && element.getObject().size() > fixed_length)
        return false;

    ColumnFixedString::Chars chars;
    WriteBufferFromVector<ColumnFixedString::Chars> buf(chars, AppendModeTag());
    traverse(element, buf);
    buf.finalize();

    if (chars.size() > fixed_length)
        return false;

    chars.resize_fill(fixed_length);
    assert_cast<ColumnLowCardinality &>(dest).insertData(reinterpret_cast<const char *>(chars.data()), chars.size());
    return true;
}

View File

@ -0,0 +1,73 @@
<test>
<!--
Performance test for JSONExtract into Tuples whose elements are LowCardinality types.
The substitutions below define three JSON documents (string, integer and UUID values under keys a..d)
and one Tuple type per LowCardinality element type (String, FixedString(20), Int8/16/32/64, UUID).
Each query runs JSONExtract over 500000 rows produced by zeros(); the extracted value is passed to
ignore() and the output is discarded with FORMAT Null.
-->
<substitutions>
<substitution>
<name>string_json</name>
<values>
<value>'{"a": "hi", "b": "hello", "c": "hola", "d": "see you, bye, bye"}'</value>
</values>
</substitution>
<substitution>
<name>int_json</name>
<values>
<value>'{"a": 11, "b": 2222, "c": 33333333, "d": 4444444444444444}'</value>
</values>
</substitution>
<substitution>
<name>uuid_json</name>
<values>
<value>'{"a": "2d49dc6e-ddce-4cd0-afb8-790956df54c4", "b": "2d49dc6e-ddce-4cd0-afb8-790956df54c3", "c": "2d49dc6e-ddce-4cd0-afb8-790956df54c1", "d": "2d49dc6e-ddce-4cd0-afb8-790956df54c1"}'</value>
</values>
</substitution>
<substitution>
<name>low_cardinality_tuple_string</name>
<values>
<value>'Tuple(a LowCardinality(String), b LowCardinality(String), c LowCardinality(String), d LowCardinality(String) )'</value>
</values>
</substitution>
<substitution>
<name>low_cardinality_tuple_fixed_string</name>
<values>
<value>'Tuple(a LowCardinality(FixedString(20)), b LowCardinality(FixedString(20)), c LowCardinality(FixedString(20)), d LowCardinality(FixedString(20)) )'</value>
</values>
</substitution>
<substitution>
<name>low_cardinality_tuple_int8</name>
<values>
<value>'Tuple(a LowCardinality(Int8), b LowCardinality(Int8), c LowCardinality(Int8), d LowCardinality(Int8) )'</value>
</values>
</substitution>
<substitution>
<name>low_cardinality_tuple_int16</name>
<values>
<value>'Tuple(a LowCardinality(Int16), b LowCardinality(Int16), c LowCardinality(Int16), d LowCardinality(Int16) )'</value>
</values>
</substitution>
<substitution>
<name>low_cardinality_tuple_int32</name>
<values>
<value>'Tuple(a LowCardinality(Int32), b LowCardinality(Int32), c LowCardinality(Int32), d LowCardinality(Int32) )'</value>
</values>
</substitution>
<substitution>
<name>low_cardinality_tuple_int64</name>
<values>
<value>'Tuple(a LowCardinality(Int64), b LowCardinality(Int64), c LowCardinality(Int64), d LowCardinality(Int64) )'</value>
</values>
</substitution>
<substitution>
<name>low_cardinality_tuple_uuid</name>
<values>
<value>'Tuple(a LowCardinality(UUID), b LowCardinality(UUID), c LowCardinality(UUID), d LowCardinality(UUID) )'</value>
</values>
</substitution>
</substitutions>
<!-- String JSON document extracted as FixedString(20) and String tuples. -->
<query>SELECT 'fixed_string_json' FROM zeros(500000) WHERE NOT ignore(JSONExtract(materialize({string_json}), {low_cardinality_tuple_fixed_string})) FORMAT Null </query>
<query>SELECT 'string_json' FROM zeros(500000) WHERE NOT ignore(JSONExtract(materialize({string_json}), {low_cardinality_tuple_string})) FORMAT Null </query>
<!-- Integer JSON document extracted with each signed integer width. -->
<query>SELECT 'int8_json' FROM zeros(500000) WHERE NOT ignore(JSONExtract(materialize({int_json}), {low_cardinality_tuple_int8})) FORMAT Null </query>
<query>SELECT 'int16_json' FROM zeros(500000) WHERE NOT ignore(JSONExtract(materialize({int_json}), {low_cardinality_tuple_int16})) FORMAT Null </query>
<query>SELECT 'int32_json' FROM zeros(500000) WHERE NOT ignore(JSONExtract(materialize({int_json}), {low_cardinality_tuple_int32})) FORMAT Null </query>
<query>SELECT 'int64_json' FROM zeros(500000) WHERE NOT ignore(JSONExtract(materialize({int_json}), {low_cardinality_tuple_int64})) FORMAT Null </query>
<!-- UUID JSON document extracted as a UUID tuple. -->
<query>SELECT 'uuid_json' FROM zeros(500000) WHERE NOT ignore(JSONExtract(materialize({uuid_json}), {low_cardinality_tuple_uuid})) FORMAT Null </query>
</test>

View File

@ -0,0 +1,7 @@
('hi','hello','hola','see you, bye, bye')
('hi\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0','hello\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0','hola\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0','see you, bye, bye\0\0\0')
(11,0,0,0)
(11,2222,0,0)
(11,2222,33333333,0)
(11,2222,33333333,4444444444444444)
('2d49dc6e-ddce-4cd0-afb8-790956df54c4','2d49dc6e-ddce-4cd0-afb8-790956df54c3','2d49dc6e-ddce-4cd0-afb8-790956df54c1','2d49dc6e-ddce-4cd0-afb8-790956df54c1')

View File

@ -0,0 +1,55 @@
-- Tags: no-fasttest
-- Functional test: JSONExtract into Tuples whose elements are LowCardinality(String),
-- LowCardinality(FixedString(20)), LowCardinality(Int8/16/32/64) and LowCardinality(UUID).
-- Each source table holds a single JSON document with keys a..d.
DROP TABLE IF EXISTS test_low_cardinality_string;
DROP TABLE IF EXISTS test_low_cardinality_uuid;
DROP TABLE IF EXISTS test_low_cardinality_int;
CREATE TABLE test_low_cardinality_string (data String) ENGINE MergeTree ORDER BY data;
CREATE TABLE test_low_cardinality_uuid (data String) ENGINE MergeTree ORDER BY data;
CREATE TABLE test_low_cardinality_int (data String) ENGINE MergeTree ORDER BY data;
INSERT INTO test_low_cardinality_string (data) VALUES ('{"a": "hi", "b": "hello", "c": "hola", "d": "see you, bye, bye"}');
INSERT INTO test_low_cardinality_int (data) VALUES ('{"a": 11, "b": 2222, "c": 33333333, "d": 4444444444444444}');
INSERT INTO test_low_cardinality_uuid (data) VALUES ('{"a": "2d49dc6e-ddce-4cd0-afb8-790956df54c4", "b": "2d49dc6e-ddce-4cd0-afb8-790956df54c3", "c": "2d49dc6e-ddce-4cd0-afb8-790956df54c1", "d": "2d49dc6e-ddce-4cd0-afb8-790956df54c1"}');
-- String values into LowCardinality(String).
SELECT JSONExtract(data, 'Tuple(
a LowCardinality(String),
b LowCardinality(String),
c LowCardinality(String),
d LowCardinality(String)
)') AS json FROM test_low_cardinality_string;
-- Same strings into LowCardinality(FixedString(20)); every value is shorter than 20 bytes.
SELECT JSONExtract(data, 'Tuple(
a LowCardinality(FixedString(20)),
b LowCardinality(FixedString(20)),
c LowCardinality(FixedString(20)),
d LowCardinality(FixedString(20))
)') AS json FROM test_low_cardinality_string;
-- Integer values into progressively wider integer types; b, c and d exceed the range of the
-- narrower types (presumably yielding the default value there; confirm against the .reference file).
SELECT JSONExtract(data, 'Tuple(
a LowCardinality(Int8),
b LowCardinality(Int8),
c LowCardinality(Int8),
d LowCardinality(Int8)
)') AS json FROM test_low_cardinality_int;
SELECT JSONExtract(data, 'Tuple(
a LowCardinality(Int16),
b LowCardinality(Int16),
c LowCardinality(Int16),
d LowCardinality(Int16)
)') AS json FROM test_low_cardinality_int;
SELECT JSONExtract(data, 'Tuple(
a LowCardinality(Int32),
b LowCardinality(Int32),
c LowCardinality(Int32),
d LowCardinality(Int32)
)') AS json FROM test_low_cardinality_int;
SELECT JSONExtract(data, 'Tuple(
a LowCardinality(Int64),
b LowCardinality(Int64),
c LowCardinality(Int64),
d LowCardinality(Int64)
)') AS json FROM test_low_cardinality_int;
-- UUID strings into LowCardinality(UUID).
SELECT JSONExtract(data, 'Tuple(
a LowCardinality(UUID),
b LowCardinality(UUID),
c LowCardinality(UUID),
d LowCardinality(UUID)
)') AS json FROM test_low_cardinality_uuid;
-- Clean up.
DROP TABLE test_low_cardinality_string;
DROP TABLE test_low_cardinality_uuid;
DROP TABLE test_low_cardinality_int;

View File

@ -0,0 +1 @@
('{"b":{"c":1,"d":"str"}}\0')

View File

@ -0,0 +1,6 @@
-- Tags: no-fasttest
-- Functional test: extracting a nested JSON object into a FixedString tuple element.
-- The value under key "a" is itself an object, not a string; FixedString(24) is one byte
-- longer than its 23-character compact form {"b":{"c":1,"d":"str"}}.
DROP TABLE IF EXISTS test_fixed_string_nested_json;
CREATE TABLE test_fixed_string_nested_json (data String) ENGINE MergeTree ORDER BY data;
INSERT INTO test_fixed_string_nested_json (data) VALUES ('{"a" : {"b" : {"c" : 1, "d" : "str"}}}');
SELECT JSONExtract(data, 'Tuple(a FixedString(24))') AS json FROM test_fixed_string_nested_json;
DROP TABLE test_fixed_string_nested_json;

View File

@ -0,0 +1,2 @@
('{"b":{"c":1,"d":"str"}}','','','')
('{"b":{"c":1,"d":"str"}}','','','')

View File

@ -0,0 +1,3 @@
-- Tags: no-fasttest
-- Functional test: the value under key "a" is a nested object and keys b, c, d are absent at the top level.
-- First query: every tuple element is LowCardinality(String).
SELECT JSONExtract('{"a" : {"b" : {"c" : 1, "d" : "str"}}}', 'Tuple( a LowCardinality(String), b LowCardinality(String), c LowCardinality(String), d LowCardinality(String))');
-- Second query: same input, but element a is a plain String while the rest stay LowCardinality(String).
SELECT JSONExtract('{"a" : {"b" : {"c" : 1, "d" : "str"}}}', 'Tuple( a String, b LowCardinality(String), c LowCardinality(String), d LowCardinality(String))');

View File

@ -0,0 +1,10 @@
\0\0\0\0\0\0\0\0\0\0\0
{"a":123456}
\0\0\0\0\0
123456
\0\0\0\0\0
123456
\0\0\0\0\0
\0\0\0\0\0
131231
131231

View File

@ -0,0 +1,9 @@
-- Tags: no-fasttest
-- Functional test: JSONExtract into FixedString(N) where N is one byte too small or exactly large enough.
-- Whole document as FixedString: its compact form {"a":123456} is 12 characters long.
SELECT JSONExtract('{"a": 123456}', 'FixedString(11)');
SELECT JSONExtract('{"a": 123456}', 'FixedString(12)');
-- String value "123456" (6 characters) under key a.
SELECT JSONExtract('{"a": "123456"}', 'a', 'FixedString(5)');
SELECT JSONExtract('{"a": "123456"}', 'a', 'FixedString(6)');
-- Numeric value 123456 under key a (not a JSON string).
SELECT JSONExtract('{"a": 123456}', 'a', 'FixedString(5)');
SELECT JSONExtract('{"a": 123456}', 'a', 'FixedString(6)');
-- Numeric value into LowCardinality(FixedString(N)), with a non-constant input, over two rows.
SELECT JSONExtract(materialize('{"a": 131231}'), 'a', 'LowCardinality(FixedString(5))') FROM numbers(2);
SELECT JSONExtract(materialize('{"a": 131231}'), 'a', 'LowCardinality(FixedString(6))') FROM numbers(2);