Refactor JSONExtract functions and support more types and reuse its code in new JSON type

This commit is contained in:
avogar 2024-07-03 13:40:41 +00:00
parent 5a6e6d3c5d
commit ea3b0e735d
13 changed files with 4158 additions and 1810 deletions

View File

@ -493,3 +493,37 @@ SELECT count(), dynamicType(d), _part FROM test GROUP BY _part, dynamicType(d) O
```
As we can see, ClickHouse kept the most frequent types `UInt64` and `Array(UInt64)` and cast all other types to `String`.
## JSONExtract functions with Dynamic
All `JSONExtract*` functions support `Dynamic` type:
```sql
SELECT JSONExtract('{"a" : [1, 2, 3]}', 'a', 'Dynamic') AS dynamic, dynamicType(dynamic) AS dynamic_type;
```
```text
┌─dynamic─┬─dynamic_type───────────┐
│ [1,2,3] │ Array(Nullable(Int64)) │
└─────────┴────────────────────────┘
```
```sql
SELECT JSONExtract('{"obj" : {"a" : 42, "b" : "Hello", "c" : [1,2,3]}}', 'obj', 'Map(String, Variant(UInt32, String, Array(UInt32)))') AS map_of_dynamics, mapApply((k, v) -> (k, variantType(v)), map_of_dynamics) AS map_of_dynamic_types;
```
```text
┌─map_of_dynamics──────────────────┬─map_of_dynamic_types────────────────────────────┐
│ {'a':42,'b':'Hello','c':[1,2,3]} │ {'a':'UInt32','b':'String','c':'Array(UInt32)'} │
└──────────────────────────────────┴─────────────────────────────────────────────────┘
```
```sql
SELECT JSONExtractKeysAndValues('{"a" : 42, "b" : "Hello", "c" : [1,2,3]}', 'Variant(UInt32, String, Array(UInt32))') AS dynamics, arrayMap(x -> (x.1, variantType(x.2)), dynamics) AS dynamic_types;
```
```text
┌─dynamics───────────────────────────────┬─dynamic_types─────────────────────────────────────────┐
│ [('a',42),('b','Hello'),('c',[1,2,3])] │ [('a','UInt32'),('b','String'),('c','Array(UInt32)')] │
└────────────────────────────────────────┴───────────────────────────────────────────────────────┘
```

View File

@ -14,6 +14,7 @@
namespace DB
{
namespace ErrorCodes
{
extern const int CANNOT_ALLOCATE_MEMORY;

View File

@ -12,6 +12,7 @@
#include <Parsers/ASTFunction.h>
#include <Parsers/ASTIdentifier.h>
#include <Parsers/ASTLiteral.h>
#include <Interpreters/Context.h>
namespace DB
{

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,35 @@
#pragma once
#include <DataTypes/IDataType.h>
#include <Columns/IColumn.h>
#include <Formats/FormatSettings.h>
namespace DB
{
/// Options passed to JSONExtractTreeNode::insertResultToColumn that tune how a JSON element
/// is inserted into a column.
struct JSONExtractInsertSettings
{
/// Enabled by default.
/// NOTE(review): judging by the name, allows JSON booleans to be inserted into integer columns — confirm against the node implementations.
bool convert_bool_to_integer = true;
/// Disabled by default.
/// NOTE(review): judging by the name, makes complex types (Array/Map/Tuple?) insert a default value
/// for elements that cannot be parsed instead of failing the whole insert — confirm.
bool insert_default_on_invalid_elements_in_complex_types = false;
};
/// Abstract node of the JSON extraction tree: inserts one JSON element into a column.
/// Instances are obtained from buildJSONExtractTree(), which returns them as std::unique_ptr
/// to this base class.
template <typename JSONParser>
class JSONExtractTreeNode
{
public:
JSONExtractTreeNode() = default;
/// Virtual because nodes are owned and destroyed through a pointer to the base class.
virtual ~JSONExtractTreeNode() = default;
/// Inserts the given JSON element into the column, honouring insert_setting and format_settings.
/// `error` is an output parameter.
/// NOTE(review): presumably returns false and fills `error` when the element cannot be inserted — confirm.
virtual bool insertResultToColumn(IColumn &, const typename JSONParser::Element &, const JSONExtractInsertSettings & insert_setting, const FormatSettings & format_settings, String & error) const = 0;
};
/// Builds a tree of nodes for inserting a JSON element into a column of the provided data type.
/// NOTE(review): source_for_exception_message presumably names the caller in error messages
/// (e.g. when the type is not supported) — confirm against the definition.
template <typename JSONParser>
std::unique_ptr<JSONExtractTreeNode<JSONParser>> buildJSONExtractTree(const DataTypePtr & type, const char * source_for_exception_message);
/// Takes a JSON element, an output buffer and format settings.
/// NOTE(review): presumably writes a textual representation of the element into buf — confirm against the definition.
template <typename JSONParser>
void jsonElementToString(const typename JSONParser::Element & element, WriteBuffer & buf, const FormatSettings & format_settings);
/// Tries to obtain a value of NumberType from a JSON element; `value` and `error` are output parameters.
/// NOTE(review): presumably returns false and fills `error` on failure, and convert_bool_to_integer
/// controls whether JSON booleans are accepted as numbers — confirm against the definition.
template <typename JSONParser, typename NumberType>
bool tryGetNumericValueFromJSONElement(NumberType & value, const typename JSONParser::Element & element, bool convert_bool_to_integer, String & error);
}

View File

@ -225,19 +225,6 @@ namespace
Paths paths;
};
/// Returns true if every type in the list equals the first one.
/// An empty list is treated as "all equal".
bool checkIfTypesAreEqual(const DataTypes & types)
{
if (types.empty())
return true;
/// The first element is the reference, so the comparison starts from index 1.
for (size_t i = 1; i < types.size(); ++i)
{
if (!types[0]->equals(*types[i]))
return false;
}
return true;
}
void updateTypeIndexes(DataTypes & data_types, TypeIndexesSet & type_indexes)
{
type_indexes.clear();
@ -272,24 +259,31 @@ namespace
type_indexes.erase(TypeIndex::Nothing);
}
/// If we have both Int64 and UInt64, convert all Int64 to UInt64,
/// If we have both Int64 and UInt64, convert all not-negative Int64 to UInt64,
/// because UInt64 is inferred only in case of Int64 overflow.
void transformIntegers(DataTypes & data_types, TypeIndexesSet & type_indexes)
void transformIntegers(DataTypes & data_types, TypeIndexesSet & type_indexes, JSONInferenceInfo * json_info)
{
if (!type_indexes.contains(TypeIndex::Int64) || !type_indexes.contains(TypeIndex::UInt64))
return;
bool have_negative_integers = false;
for (auto & type : data_types)
{
if (WhichDataType(type).isInt64())
type = std::make_shared<DataTypeUInt64>();
{
bool is_negative = json_info->negative_integers.contains(type.get());
have_negative_integers |= is_negative;
if (!is_negative)
type = std::make_shared<DataTypeUInt64>();
}
}
type_indexes.erase(TypeIndex::Int64);
if (!have_negative_integers)
type_indexes.erase(TypeIndex::Int64);
}
/// If we have both Int64 and Float64 types, convert all Int64 to Float64.
void transformIntegersAndFloatsToFloats(DataTypes & data_types, TypeIndexesSet & type_indexes)
void transformIntegersAndFloatsToFloats(DataTypes & data_types, TypeIndexesSet & type_indexes, JSONInferenceInfo * json_info)
{
bool have_floats = type_indexes.contains(TypeIndex::Float64);
bool have_integers = type_indexes.contains(TypeIndex::Int64) || type_indexes.contains(TypeIndex::UInt64);
@ -300,7 +294,12 @@ namespace
{
WhichDataType which(type);
if (which.isInt64() || which.isUInt64())
{
    auto new_type = std::make_shared<DataTypeFloat64>();
    /// Types are tracked by address, so when an integer that was parsed from a JSON string
    /// is replaced, the bookkeeping has to be moved to the new type object.
    /// json_info is optional elsewhere in this file, hence the null check.
    if (json_info && json_info->numbers_parsed_from_json_strings.erase(type.get()))
        json_info->numbers_parsed_from_json_strings.insert(new_type.get());
    type = new_type;
}
}
type_indexes.erase(TypeIndex::Int64);
@ -635,9 +634,9 @@ namespace
if (settings.try_infer_integers)
{
/// Transform Int64 to UInt64 if needed.
transformIntegers(data_types, type_indexes);
transformIntegers(data_types, type_indexes, json_info);
/// Transform integers to floats if needed.
transformIntegersAndFloatsToFloats(data_types, type_indexes);
transformIntegersAndFloatsToFloats(data_types, type_indexes, json_info);
}
/// Transform Date to DateTime or both to String if needed.
@ -887,7 +886,7 @@ namespace
}
template <bool is_json>
DataTypePtr tryInferNumber(ReadBuffer & buf, const FormatSettings & settings)
DataTypePtr tryInferNumber(ReadBuffer & buf, const FormatSettings & settings, JSONInferenceInfo * json_info)
{
if (buf.eof())
return nullptr;
@ -911,7 +910,12 @@ namespace
Int64 tmp_int;
buf.position() = number_start;
if (tryReadIntText(tmp_int, buf))
return std::make_shared<DataTypeInt64>();
{
auto type = std::make_shared<DataTypeInt64>();
if (json_info && tmp_int < 0)
json_info->negative_integers.insert(type.get());
return type;
}
/// In case of Int64 overflow we can try to infer UInt64.
UInt64 tmp_uint;
@ -934,7 +938,12 @@ namespace
Int64 tmp_int;
if (tryReadIntText(tmp_int, peekable_buf))
return std::make_shared<DataTypeInt64>();
{
auto type = std::make_shared<DataTypeInt64>();
if (json_info && tmp_int < 0)
json_info->negative_integers.insert(type.get());
return type;
}
peekable_buf.rollbackToCheckpoint(/* drop= */ true);
/// In case of Int64 overflow we can try to infer UInt64.
@ -952,7 +961,7 @@ namespace
}
template <bool is_json>
DataTypePtr tryInferNumberFromStringImpl(std::string_view field, const FormatSettings & settings)
DataTypePtr tryInferNumberFromStringImpl(std::string_view field, const FormatSettings & settings, JSONInferenceInfo * json_inference_info = nullptr)
{
ReadBufferFromString buf(field);
@ -960,7 +969,12 @@ namespace
{
Int64 tmp_int;
if (tryReadIntText(tmp_int, buf) && buf.eof())
return std::make_shared<DataTypeInt64>();
{
auto type = std::make_shared<DataTypeInt64>();
if (json_inference_info && tmp_int < 0)
json_inference_info->negative_integers.insert(type.get());
return type;
}
/// We can safely get back to the start of buffer, because we read from a string and we didn't reach eof.
buf.position() = buf.buffer().begin();
@ -1011,7 +1025,7 @@ namespace
{
if (settings.json.try_infer_numbers_from_strings)
{
if (auto number_type = tryInferNumberFromStringImpl<true>(field, settings))
if (auto number_type = tryInferNumberFromStringImpl<true>(field, settings, json_info))
{
json_info->numbers_parsed_from_json_strings.insert(number_type.get());
return number_type;
@ -1254,10 +1268,23 @@ namespace
}
/// Number
return tryInferNumber<is_json>(buf, settings);
return tryInferNumber<is_json>(buf, settings, json_info);
}
}
bool checkIfTypesAreEqual(const DataTypes & types)
{
if (types.empty())
return true;
for (size_t i = 1; i < types.size(); ++i)
{
if (!types[0]->equals(*types[i]))
return false;
}
return true;
}
void transformInferredTypesIfNeeded(DataTypePtr & first, DataTypePtr & second, const FormatSettings & settings)
{
DataTypes types = {first, second};
@ -1275,6 +1302,11 @@ void transformInferredJSONTypesIfNeeded(
second = std::move(types[1]);
}
/// Overload working on a whole list of types: forwards to the JSON variant of
/// transformInferredTypesIfNeededImpl, passing `types`, `settings` and `json_info` through unchanged.
void transformInferredJSONTypesIfNeeded(DataTypes & types, const FormatSettings & settings, JSONInferenceInfo * json_info)
{
transformInferredTypesIfNeededImpl<true>(types, settings, json_info);
}
void transformInferredJSONTypesFromDifferentFilesIfNeeded(DataTypePtr & first, DataTypePtr & second, const FormatSettings & settings)
{
JSONInferenceInfo json_info;
@ -1396,6 +1428,12 @@ DataTypePtr tryInferNumberFromString(std::string_view field, const FormatSetting
return tryInferNumberFromStringImpl<false>(field, settings);
}
/// Same as tryInferNumberFromString, but also passes json_info to the implementation,
/// which registers Int64 types inferred from negative values in json_info->negative_integers.
/// NOTE(review): this instantiates the implementation with is_json = false, while the JSON string
/// inference path in this file uses <true> — confirm that <false> is intended here.
DataTypePtr tryInferJSONNumberFromString(std::string_view field, const FormatSettings & settings, JSONInferenceInfo * json_info)
{
return tryInferNumberFromStringImpl<false>(field, settings, json_info);
}
DataTypePtr tryInferDateOrDateTimeFromString(std::string_view field, const FormatSettings & settings)
{
if (settings.try_infer_dates && tryInferDate(field))

View File

@ -2,6 +2,7 @@
#include <DataTypes/IDataType.h>
#include <IO/ReadBuffer.h>
#include <Formats/FormatSettings.h>
#include <vector>
@ -18,6 +19,11 @@ struct JSONInferenceInfo
/// We store numbers that were parsed from strings.
/// It's used in types transformation to change such numbers back to string if needed.
std::unordered_set<const IDataType *> numbers_parsed_from_json_strings;
/// Store integer types that were inferred from negative numbers.
/// It's used to determine common type for Int64 and UInt64
/// TODO: check it not only in JSON formats.
std::unordered_set<const IDataType *> negative_integers;
/// Indicates if currently we are inferring type for Map/Object key.
bool is_object_key = false;
/// When we transform types for the same column from different files
@ -48,6 +54,7 @@ DataTypePtr tryInferDateOrDateTimeFromString(std::string_view field, const Forma
/// Try to parse a number value from a string. By default, it tries to parse Float64,
/// but if setting try_infer_integers is enabled, it also tries to parse Int64.
DataTypePtr tryInferNumberFromString(std::string_view field, const FormatSettings & settings);
/// Variant of tryInferNumberFromString that additionally receives JSON inference info,
/// so that integers inferred from negative values can be recorded in it.
DataTypePtr tryInferJSONNumberFromString(std::string_view field, const FormatSettings & settings, JSONInferenceInfo * json_info);
/// It takes two types inferred for the same column and tries to transform them to a common type if possible.
/// It's also used when we try to infer some not ordinary types from another types.
@ -77,6 +84,7 @@ void transformInferredTypesIfNeeded(DataTypePtr & first, DataTypePtr & second, c
/// Example 2:
/// We merge DataTypeJSONPaths types to a single DataTypeJSONPaths type with union of all JSON paths.
void transformInferredJSONTypesIfNeeded(DataTypePtr & first, DataTypePtr & second, const FormatSettings & settings, JSONInferenceInfo * json_info);
/// Overload that takes the whole list of inferred types (by non-const reference) instead of a pair.
void transformInferredJSONTypesIfNeeded(DataTypes & types, const FormatSettings & settings, JSONInferenceInfo * json_info);
/// Make final transform for types inferred in JSON format. It does 3 types of transformation:
/// 1) Checks if type is unnamed Tuple(...), tries to transform nested types to find a common type for them and if all nested types
@ -107,4 +115,6 @@ NamesAndTypesList getNamesAndRecursivelyNullableTypes(const Block & header);
/// Check if type contains Nothing, like Array(Tuple(Nullable(Nothing), String))
bool checkIfTypeIsComplete(const DataTypePtr & type);
/// Check if all types in the list are equal to each other (an empty list is considered equal).
bool checkIfTypesAreEqual(const DataTypes & types);
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,21 @@
2020-01-01
2020-01-01
2020-01-01 00:00:00
2020-01-01 00:00:00.000000
127.0.0.1
2001:db8:85a3::8a2e:370:7334
42
42
42
42
42
42
42
42
42
42
Hello
Hello
\0\0\0
Hello\0\0\0\0\0
5801c962-1182-458a-89f8-d077da5074f9

View File

@ -0,0 +1,29 @@
-- Checks JSONExtract with Date/DateTime, IP, LowCardinality and UUID result types.
-- LowCardinality over numeric types requires this setting.
set allow_suspicious_low_cardinality_types=1;
-- Dates and times.
select JSONExtract('{"a" : "2020-01-01"}', 'a', 'Date');
select JSONExtract('{"a" : "2020-01-01"}', 'a', 'Date32');
select JSONExtract('{"a" : "2020-01-01 00:00:00"}', 'a', 'DateTime');
select JSONExtract('{"a" : "2020-01-01 00:00:00.000000"}', 'a', 'DateTime64(6)');
-- IP addresses.
select JSONExtract('{"a" : "127.0.0.1"}', 'a', 'IPv4');
select JSONExtract('{"a" : "2001:0db8:85a3:0000:0000:8a2e:0370:7334"}', 'a', 'IPv6');
-- LowCardinality over every integer and float width.
select JSONExtract('{"a" : 42}', 'a', 'LowCardinality(UInt8)');
select JSONExtract('{"a" : 42}', 'a', 'LowCardinality(Int8)');
select JSONExtract('{"a" : 42}', 'a', 'LowCardinality(UInt16)');
select JSONExtract('{"a" : 42}', 'a', 'LowCardinality(Int16)');
select JSONExtract('{"a" : 42}', 'a', 'LowCardinality(UInt32)');
select JSONExtract('{"a" : 42}', 'a', 'LowCardinality(Int32)');
select JSONExtract('{"a" : 42}', 'a', 'LowCardinality(UInt64)');
select JSONExtract('{"a" : 42}', 'a', 'LowCardinality(Int64)');
select JSONExtract('{"a" : 42}', 'a', 'LowCardinality(Float32)');
-- Was a second Float32 query; Float64 was never covered. The expected output (42) is unchanged.
select JSONExtract('{"a" : 42}', 'a', 'LowCardinality(Float64)');
-- LowCardinality over strings: exact, shorter and longer FixedString sizes.
select JSONExtract('{"a" : "Hello"}', 'a', 'LowCardinality(String)');
select JSONExtract('{"a" : "Hello"}', 'a', 'LowCardinality(FixedString(5))');
select JSONExtract('{"a" : "Hello"}', 'a', 'LowCardinality(FixedString(3))');
select JSONExtract('{"a" : "Hello"}', 'a', 'LowCardinality(FixedString(10))');
select JSONExtract('{"a" : "5801c962-1182-458a-89f8-d077da5074f9"}', 'a', 'LowCardinality(UUID)');

View File

@ -0,0 +1,30 @@
true Bool
42 Int64
-42 Int64
18446744073709551615 UInt64
42.42 Float64
42 Int64
-42 Int64
18446744073709551615 UInt64
Hello String
2020-01-01 Date
2020-01-01 00:00:00.000000000 DateTime64(9)
[1,2,3] Array(Nullable(Int64))
['str1','str2','str3'] Array(Nullable(String))
[[[1],[2,3,4]],[[5,6],[7]]] Array(Array(Array(Nullable(Int64))))
['2020-01-01 00:00:00.000000000','2020-01-01 00:00:00.000000000'] Array(Nullable(DateTime64(9)))
['2020-01-01','2020-01-01 date'] Array(Nullable(String))
['2020-01-01','2020-01-01 00:00:00','str'] Array(Nullable(String))
['2020-01-01','2020-01-01 00:00:00','42'] Array(Nullable(String))
['str','42'] Array(Nullable(String))
[42,42.42] Array(Nullable(Float64))
[42,18446744073709552000,42.42] Array(Nullable(Float64))
[42,42.42] Array(Nullable(Float64))
[NULL,NULL] Array(Nullable(String))
[NULL,42] Array(Nullable(Int64))
[[NULL],[],[42]] Array(Array(Nullable(Int64)))
[[],[NULL,NULL],[1,NULL,3],[NULL,2,NULL]] Array(Array(Nullable(Int64)))
[[],[NULL,NULL],['1',NULL,'3'],[NULL,'2',NULL],['2020-01-01']] Array(Array(Nullable(String)))
('str',42,[42]) Tuple(Nullable(String), Nullable(Int64), Array(Nullable(Int64)))
[42,18446744073709551615] Array(Nullable(UInt64))
(-42,18446744073709551615) Tuple(Nullable(Int64), Nullable(UInt64))

View File

@ -0,0 +1,37 @@
-- Checks which type JSONExtract infers for the 'Dynamic' result type on various JSON values.
-- Numbers inside JSON strings are inferred as numbers in this test.
set input_format_json_try_infer_numbers_from_strings=1;
-- Scalars: bool, integers (including a value that only fits UInt64) and floats.
select JSONExtract(materialize('{"d" : true}'), 'd', 'Dynamic') as d, dynamicType(d);
select JSONExtract(materialize('{"d" : 42}'), 'd', 'Dynamic') as d, dynamicType(d);
select JSONExtract(materialize('{"d" : -42}'), 'd', 'Dynamic') as d, dynamicType(d);
select JSONExtract(materialize('{"d" : 18446744073709551615}'), 'd', 'Dynamic') as d, dynamicType(d);
select JSONExtract(materialize('{"d" : 42.42}'), 'd', 'Dynamic') as d, dynamicType(d);
-- Numbers stored as JSON strings.
select JSONExtract(materialize('{"d" : "42"}'), 'd', 'Dynamic') as d, dynamicType(d);
select JSONExtract(materialize('{"d" : "-42"}'), 'd', 'Dynamic') as d, dynamicType(d);
select JSONExtract(materialize('{"d" : "18446744073709551615"}'), 'd', 'Dynamic') as d, dynamicType(d);
-- Plain strings, dates and date-times.
select JSONExtract(materialize('{"d" : "Hello"}'), 'd', 'Dynamic') as d, dynamicType(d);
select JSONExtract(materialize('{"d" : "2020-01-01"}'), 'd', 'Dynamic') as d, dynamicType(d);
select JSONExtract(materialize('{"d" : "2020-01-01 00:00:00.000"}'), 'd', 'Dynamic') as d, dynamicType(d);
-- Arrays with homogeneous elements, including nested arrays.
select JSONExtract(materialize('{"d" : [1, 2, 3]}'), 'd', 'Dynamic') as d, dynamicType(d);
select JSONExtract(materialize('{"d" : ["str1", "str2", "str3"]}'), 'd', 'Dynamic') as d, dynamicType(d);
select JSONExtract(materialize('{"d" : [[[1], [2, 3, 4]], [[5, 6], [7]]]}'), 'd', 'Dynamic') as d, dynamicType(d);
-- Arrays mixing dates, date-times, strings and numbers-in-strings.
select JSONExtract(materialize('{"d" : ["2020-01-01", "2020-01-01 00:00:00"]}'), 'd', 'Dynamic') as d, dynamicType(d);
select JSONExtract(materialize('{"d" : ["2020-01-01", "2020-01-01 date"]}'), 'd', 'Dynamic') as d, dynamicType(d);
select JSONExtract(materialize('{"d" : ["2020-01-01", "2020-01-01 00:00:00", "str"]}'), 'd', 'Dynamic') as d, dynamicType(d);
select JSONExtract(materialize('{"d" : ["2020-01-01", "2020-01-01 00:00:00", "42"]}'), 'd', 'Dynamic') as d, dynamicType(d);
select JSONExtract(materialize('{"d" : ["str", "42"]}'), 'd', 'Dynamic') as d, dynamicType(d);
-- Arrays mixing integers and floats.
select JSONExtract(materialize('{"d" : [42, 42.42]}'), 'd', 'Dynamic') as d, dynamicType(d);
select JSONExtract(materialize('{"d" : [42, 18446744073709551615, 42.42]}'), 'd', 'Dynamic') as d, dynamicType(d);
-- NOTE(review): same input as the [42, 42.42] case two queries above — possibly a different case was intended.
select JSONExtract(materialize('{"d" : [42, 42.42]}'), 'd', 'Dynamic') as d, dynamicType(d);
-- Arrays containing nulls and empty nested arrays.
select JSONExtract(materialize('{"d" : [null, null]}'), 'd', 'Dynamic') as d, dynamicType(d);
select JSONExtract(materialize('{"d" : [null, 42]}'), 'd', 'Dynamic') as d, dynamicType(d);
select JSONExtract(materialize('{"d" : [[null], [], [42]]}'), 'd', 'Dynamic') as d, dynamicType(d);
select JSONExtract(materialize('{"a" : [[], [null, null], ["1", null, "3"], [null, "2", null]]}'), 'a', 'Dynamic') as d, dynamicType(d);
select JSONExtract(materialize('{"a" : [[], [null, null], ["1", null, "3"], [null, "2", null], ["2020-01-01"]]}'), 'a', 'Dynamic') as d, dynamicType(d);
-- Heterogeneous array (expected to be inferred as a Tuple).
select JSONExtract(materialize('{"d" : ["str", 42, [42]]}'), 'd', 'Dynamic') as d, dynamicType(d);
-- Int64 together with UInt64: non-negative values become UInt64, a negative value keeps the types distinct.
select JSONExtract(materialize('{"d" : [42, 18446744073709551615]}'), 'd', 'Dynamic') as d, dynamicType(d);
select JSONExtract(materialize('{"d" : [-42, 18446744073709551615]}'), 'd', 'Dynamic') as d, dynamicType(d);