Merge pull request #44019 from Avogar/refactor-schema-inference

Refactor and improve schema inference for text formats
This commit is contained in:
Kruglov Pavel 2022-12-20 17:29:03 +01:00 committed by GitHub
commit 643a35bed1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
69 changed files with 1881 additions and 1209 deletions

View File

@ -3588,6 +3588,13 @@ y Nullable(String)
z IPv4
```
## schema_inference_make_columns_nullable {#schema_inference_make_columns_nullable}
Controls making inferred types `Nullable` in schema inference for formats without information about nullability.
If the setting is enabled, the inferred type will be `Nullable` only if column contains `NULL` in a sample that is parsed during schema inference.
Default value: `false`.
## input_format_try_infer_integers {#input_format_try_infer_integers}
If enabled, ClickHouse will try to infer integers instead of floats in schema inference for text formats. If all numbers in the column from input data are integers, the result type will be `Int64`, if at least one number is float, the result type will be `Float64`.

View File

@ -764,6 +764,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
M(Bool, input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference, false, "Skip columns with unsupported types while schema inference for format Arrow", 0) \
M(String, column_names_for_schema_inference, "", "The list of column names to use in schema inference for formats without column names. The format: 'column1,column2,column3,...'", 0) \
M(String, schema_inference_hints, "", "The list of column names and types to use in schema inference for formats without column names. The format: 'column_name1 column_type1, column_name2 column_type2, ...'", 0) \
M(Bool, schema_inference_make_columns_nullable, true, "If set to true, all inferred types will be Nullable in schema inference for formats without information about nullability.", 0) \
M(Bool, input_format_json_read_bools_as_numbers, true, "Allow to parse bools as numbers in JSON input formats", 0) \
M(Bool, input_format_json_try_infer_numbers_from_strings, true, "Try to infer numbers from string fields while schema inference", 0) \
M(Bool, input_format_json_validate_types_from_metadata, true, "For JSON/JSONCompact/JSONColumnsWithMetadata input formats this controls whether format parser should check if data types from input metadata match data types of the corresponding columns from the table", 0) \

View File

@ -3,6 +3,7 @@
#include <cstdint>
#include <string>
#include <vector>
#include <unordered_set>
#include <base/strong_typedef.h>
#include <base/Decimal.h>
#include <base/defines.h>
@ -93,4 +94,5 @@ using Int256 = ::Int256;
/// Not a data type in database, defined just for convenience.
using Strings = std::vector<String>;
using TypeIndexesSet = std::unordered_set<TypeIndex>;
}

View File

@ -41,6 +41,8 @@ public:
SerializationPtr doGetDefaultSerialization() const override;
bool hasNullableSubcolumns() const { return is_nullable; }
const String & getSchemaFormat() const { return schema_format; }
};
}

View File

@ -8,74 +8,62 @@
namespace DB
{
void transformTypesRecursively(DataTypes & types, std::function<void(DataTypes &)> transform_simple_types, std::function<void(DataTypes &)> transform_complex_types)
void transformTypesRecursively(DataTypes & types, std::function<void(DataTypes &, const TypeIndexesSet &)> transform_simple_types, std::function<void(DataTypes &, const TypeIndexesSet &)> transform_complex_types)
{
TypeIndexesSet type_indexes;
for (const auto & type : types)
type_indexes.insert(type->getTypeId());
/// Arrays
if (type_indexes.contains(TypeIndex::Array))
{
/// Arrays
bool have_array = false;
bool all_arrays = true;
DataTypes nested_types;
for (const auto & type : types)
/// All types are Array
if (type_indexes.size() == 1)
{
if (const DataTypeArray * type_array = typeid_cast<const DataTypeArray *>(type.get()))
{
have_array = true;
nested_types.push_back(type_array->getNestedType());
}
else
all_arrays = false;
DataTypes nested_types;
for (const auto & type : types)
nested_types.push_back(typeid_cast<const DataTypeArray *>(type.get())->getNestedType());
transformTypesRecursively(nested_types, transform_simple_types, transform_complex_types);
for (size_t i = 0; i != types.size(); ++i)
types[i] = std::make_shared<DataTypeArray>(nested_types[i]);
}
if (have_array)
{
if (all_arrays)
{
transformTypesRecursively(nested_types, transform_simple_types, transform_complex_types);
for (size_t i = 0; i != types.size(); ++i)
types[i] = std::make_shared<DataTypeArray>(nested_types[i]);
}
if (transform_complex_types)
transform_complex_types(types, type_indexes);
if (transform_complex_types)
transform_complex_types(types);
return;
}
return;
}
/// Tuples
if (type_indexes.contains(TypeIndex::Tuple))
{
/// Tuples
bool have_tuple = false;
bool all_tuples = true;
size_t tuple_size = 0;
std::vector<DataTypes> nested_types;
for (const auto & type : types)
/// All types are Tuple
if (type_indexes.size() == 1)
{
if (const DataTypeTuple * type_tuple = typeid_cast<const DataTypeTuple *>(type.get()))
{
if (!have_tuple)
{
tuple_size = type_tuple->getElements().size();
nested_types.resize(tuple_size);
for (size_t elem_idx = 0; elem_idx < tuple_size; ++elem_idx)
nested_types[elem_idx].reserve(types.size());
}
else if (tuple_size != type_tuple->getElements().size())
return;
std::vector<DataTypes> nested_types;
const DataTypeTuple * type_tuple = typeid_cast<const DataTypeTuple *>(types[0].get());
size_t tuple_size = type_tuple->getElements().size();
nested_types.resize(tuple_size);
for (size_t elem_idx = 0; elem_idx < tuple_size; ++elem_idx)
nested_types[elem_idx].reserve(types.size());
have_tuple = true;
bool sizes_are_equal = true;
for (const auto & type : types)
{
type_tuple = typeid_cast<const DataTypeTuple *>(type.get());
if (type_tuple->getElements().size() != tuple_size)
{
sizes_are_equal = false;
break;
}
for (size_t elem_idx = 0; elem_idx < tuple_size; ++elem_idx)
nested_types[elem_idx].emplace_back(type_tuple->getElements()[elem_idx]);
}
else
all_tuples = false;
}
if (have_tuple)
{
if (all_tuples)
if (sizes_are_equal)
{
std::vector<DataTypes> transposed_nested_types(types.size());
for (size_t elem_idx = 0; elem_idx < tuple_size; ++elem_idx)
@ -88,56 +76,47 @@ void transformTypesRecursively(DataTypes & types, std::function<void(DataTypes &
for (size_t i = 0; i != types.size(); ++i)
types[i] = std::make_shared<DataTypeTuple>(transposed_nested_types[i]);
}
if (transform_complex_types)
transform_complex_types(types);
return;
}
if (transform_complex_types)
transform_complex_types(types, type_indexes);
return;
}
/// Maps
if (type_indexes.contains(TypeIndex::Map))
{
/// Maps
bool have_maps = false;
bool all_maps = true;
DataTypes key_types;
DataTypes value_types;
key_types.reserve(types.size());
value_types.reserve(types.size());
for (const auto & type : types)
/// All types are Map
if (type_indexes.size() == 1)
{
if (const DataTypeMap * type_map = typeid_cast<const DataTypeMap *>(type.get()))
DataTypes key_types;
DataTypes value_types;
key_types.reserve(types.size());
value_types.reserve(types.size());
for (const auto & type : types)
{
have_maps = true;
const DataTypeMap * type_map = typeid_cast<const DataTypeMap *>(type.get());
key_types.emplace_back(type_map->getKeyType());
value_types.emplace_back(type_map->getValueType());
}
else
all_maps = false;
transformTypesRecursively(key_types, transform_simple_types, transform_complex_types);
transformTypesRecursively(value_types, transform_simple_types, transform_complex_types);
for (size_t i = 0; i != types.size(); ++i)
types[i] = std::make_shared<DataTypeMap>(key_types[i], value_types[i]);
}
if (have_maps)
{
if (all_maps)
{
transformTypesRecursively(key_types, transform_simple_types, transform_complex_types);
transformTypesRecursively(value_types, transform_simple_types, transform_complex_types);
if (transform_complex_types)
transform_complex_types(types, type_indexes);
for (size_t i = 0; i != types.size(); ++i)
types[i] = std::make_shared<DataTypeMap>(key_types[i], value_types[i]);
}
if (transform_complex_types)
transform_complex_types(types);
return;
}
return;
}
/// Nullable
if (type_indexes.contains(TypeIndex::Nullable))
{
/// Nullable
bool have_nullable = false;
std::vector<UInt8> is_nullable;
is_nullable.reserve(types.size());
DataTypes nested_types;
@ -146,7 +125,6 @@ void transformTypesRecursively(DataTypes & types, std::function<void(DataTypes &
{
if (const DataTypeNullable * type_nullable = typeid_cast<const DataTypeNullable *>(type.get()))
{
have_nullable = true;
is_nullable.push_back(1);
nested_types.push_back(type_nullable->getNestedType());
}
@ -157,28 +135,28 @@ void transformTypesRecursively(DataTypes & types, std::function<void(DataTypes &
}
}
if (have_nullable)
transformTypesRecursively(nested_types, transform_simple_types, transform_complex_types);
for (size_t i = 0; i != types.size(); ++i)
{
transformTypesRecursively(nested_types, transform_simple_types, transform_complex_types);
for (size_t i = 0; i != types.size(); ++i)
{
if (is_nullable[i])
types[i] = makeNullable(nested_types[i]);
else
types[i] = nested_types[i];
}
return;
if (is_nullable[i])
types[i] = makeNullable(nested_types[i]);
else
types[i] = nested_types[i];
}
if (transform_complex_types)
transform_complex_types(types, type_indexes);
return;
}
transform_simple_types(types);
transform_simple_types(types, type_indexes);
}
void callOnNestedSimpleTypes(DataTypePtr & type, std::function<void(DataTypePtr &)> callback)
{
DataTypes types = {type};
transformTypesRecursively(types, [callback](auto & data_types){ callback(data_types[0]); }, {});
transformTypesRecursively(types, [callback](auto & data_types, const TypeIndexesSet &){ callback(data_types[0]); }, {});
}
}

View File

@ -12,7 +12,7 @@ namespace DB
/// If not all types are the same complex type (Array/Map/Tuple), this function won't be called to nested types.
/// Function transform_simple_types will be applied to resulting simple types after all recursive calls.
/// Function transform_complex_types will be applied to complex types (Array/Map/Tuple) after recursive call to their nested types.
void transformTypesRecursively(DataTypes & types, std::function<void(DataTypes &)> transform_simple_types, std::function<void(DataTypes &)> transform_complex_types);
void transformTypesRecursively(DataTypes & types, std::function<void(DataTypes &, const TypeIndexesSet &)> transform_simple_types, std::function<void(DataTypes &, const TypeIndexesSet &)> transform_complex_types);
void callOnNestedSimpleTypes(DataTypePtr & type, std::function<void(DataTypePtr &)> callback);

View File

@ -1,21 +1,11 @@
#include <Formats/EscapingRuleUtils.h>
#include <Formats/JSONUtils.h>
#include <Formats/ReadSchemaUtils.h>
#include <Formats/SchemaInferenceUtils.h>
#include <DataTypes/Serializations/SerializationNullable.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeNothing.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypeDate.h>
#include <DataTypes/DataTypeDateTime64.h>
#include <DataTypes/DataTypeLowCardinality.h>
#include <DataTypes/DataTypeMap.h>
#include <DataTypes/DataTypeObject.h>
#include <DataTypes/getLeastSupertype.h>
#include <DataTypes/transformTypesRecursively.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <IO/ReadBufferFromString.h>
@ -261,556 +251,76 @@ String readStringByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule e
return readByEscapingRule<true>(buf, escaping_rule, format_settings);
}
void transformInferredTypesIfNeededImpl(DataTypes & types, const FormatSettings & settings, bool is_json, const std::unordered_set<const IDataType *> * numbers_parsed_from_json_strings = nullptr)
{
/// Do nothing if we didn't try to infer something special.
if (!settings.try_infer_integers && !settings.try_infer_dates && !settings.try_infer_datetimes && !is_json)
return;
auto transform_simple_types = [&](DataTypes & data_types)
{
/// If we have floats and integers convert them all to float.
if (settings.try_infer_integers)
{
bool have_floats = false;
bool have_integers = false;
for (const auto & type : data_types)
{
have_floats |= isFloat(type);
have_integers |= isInteger(type) && !isBool(type);
}
if (have_floats && have_integers)
{
for (auto & type : data_types)
{
if (isInteger(type))
type = std::make_shared<DataTypeFloat64>();
}
}
}
/// If we have only dates and datetimes, convert dates to datetime.
/// If we have date/datetimes and smth else, convert them to string, because
/// There is a special case when we inferred both Date/DateTime and Int64 from Strings,
/// for example: "arr: ["2020-01-01", "2000"]" -> Tuple(Date, Int64),
/// so if we have Date/DateTime and smth else (not only String) we should
/// convert Date/DateTime back to String, so then we will be able to
/// convert Int64 back to String as well.
if (settings.try_infer_dates || settings.try_infer_datetimes)
{
bool have_dates = false;
bool have_datetimes = false;
bool all_dates_or_datetimes = true;
for (const auto & type : data_types)
{
have_dates |= isDate(type);
have_datetimes |= isDateTime64(type);
all_dates_or_datetimes &= isDate(type) || isDateTime64(type);
}
if (!all_dates_or_datetimes && (have_dates || have_datetimes))
{
for (auto & type : data_types)
{
if (isDate(type) || isDateTime64(type))
type = std::make_shared<DataTypeString>();
}
}
else if (have_dates && have_datetimes)
{
for (auto & type : data_types)
{
if (isDate(type))
type = std::make_shared<DataTypeDateTime64>(9);
}
}
}
if (!is_json)
return;
/// Check settings specific for JSON formats.
/// If we have numbers and strings, convert numbers to strings.
if (settings.json.try_infer_numbers_from_strings || settings.json.read_numbers_as_strings)
{
bool have_strings = false;
bool have_numbers = false;
for (const auto & type : data_types)
{
have_strings |= isString(type);
have_numbers |= isNumber(type);
}
if (have_strings && have_numbers)
{
for (auto & type : data_types)
{
if (isNumber(type)
&& (settings.json.read_numbers_as_strings || !numbers_parsed_from_json_strings
|| numbers_parsed_from_json_strings->contains(type.get())))
type = std::make_shared<DataTypeString>();
}
}
}
if (settings.json.read_bools_as_numbers)
{
/// Note that have_floats and have_integers both cannot be
/// equal to true as in one of previous checks we convert
/// integers to floats if we have both.
bool have_floats = false;
bool have_integers = false;
bool have_bools = false;
for (const auto & type : data_types)
{
have_floats |= isFloat(type);
have_integers |= isInteger(type) && !isBool(type);
have_bools |= isBool(type);
}
if (have_bools && (have_integers || have_floats))
{
for (auto & type : data_types)
{
if (isBool(type))
{
if (have_integers)
type = std::make_shared<DataTypeInt64>();
else
type = std::make_shared<DataTypeFloat64>();
}
}
}
}
};
auto transform_complex_types = [&](DataTypes & data_types)
{
if (!is_json)
return;
bool have_maps = false;
bool have_objects = false;
bool have_strings = false;
bool are_maps_equal = true;
DataTypePtr first_map_type;
for (const auto & type : data_types)
{
if (isMap(type))
{
if (!have_maps)
{
first_map_type = type;
have_maps = true;
}
else
{
are_maps_equal &= type->equals(*first_map_type);
}
}
else if (isObject(type))
{
have_objects = true;
}
else if (isString(type))
{
have_strings = false;
}
}
if (have_maps && (have_objects || !are_maps_equal))
{
for (auto & type : data_types)
{
if (isMap(type))
type = std::make_shared<DataTypeObject>("json", true);
}
}
if (settings.json.read_objects_as_strings && have_strings && (have_maps || have_objects))
{
for (auto & type : data_types)
{
if (isMap(type) || isObject(type))
type = std::make_shared<DataTypeString>();
}
}
};
transformTypesRecursively(types, transform_simple_types, transform_complex_types);
}
void transformInferredTypesIfNeeded(DataTypes & types, const FormatSettings & settings, FormatSettings::EscapingRule escaping_rule)
{
transformInferredTypesIfNeededImpl(types, settings, escaping_rule == FormatSettings::EscapingRule::JSON);
}
void transformInferredTypesIfNeeded(DataTypePtr & first, DataTypePtr & second, const FormatSettings & settings, FormatSettings::EscapingRule escaping_rule)
{
DataTypes types = {first, second};
transformInferredTypesIfNeeded(types, settings, escaping_rule);
first = std::move(types[0]);
second = std::move(types[1]);
}
void transformInferredJSONTypesIfNeeded(DataTypes & types, const FormatSettings & settings, const std::unordered_set<const IDataType *> * numbers_parsed_from_json_strings)
{
transformInferredTypesIfNeededImpl(types, settings, true, numbers_parsed_from_json_strings);
}
void transformInferredJSONTypesIfNeeded(DataTypePtr & first, DataTypePtr & second, const FormatSettings & settings)
{
DataTypes types = {first, second};
transformInferredJSONTypesIfNeeded(types, settings);
first = std::move(types[0]);
second = std::move(types[1]);
}
bool tryInferDate(const std::string_view & field)
{
ReadBufferFromString buf(field);
DayNum tmp;
return tryReadDateText(tmp, buf) && buf.eof();
}
bool tryInferDateTime(const std::string_view & field, const FormatSettings & settings)
{
if (field.empty())
return false;
ReadBufferFromString buf(field);
Float64 tmp_float;
/// Check if it's just a number, and if so, don't try to infer DateTime from it,
/// because we can interpret this number as a timestamp and it will lead to
/// inferring DateTime instead of simple Int64/Float64 in some cases.
if (tryReadFloatText(tmp_float, buf) && buf.eof())
return false;
buf.seek(0, SEEK_SET); /// Return position to the beginning
DateTime64 tmp;
switch (settings.date_time_input_format)
{
case FormatSettings::DateTimeInputFormat::Basic:
if (tryReadDateTime64Text(tmp, 9, buf) && buf.eof())
return true;
break;
case FormatSettings::DateTimeInputFormat::BestEffort:
if (tryParseDateTime64BestEffort(tmp, 9, buf, DateLUT::instance(), DateLUT::instance("UTC")) && buf.eof())
return true;
break;
case FormatSettings::DateTimeInputFormat::BestEffortUS:
if (tryParseDateTime64BestEffortUS(tmp, 9, buf, DateLUT::instance(), DateLUT::instance("UTC")) && buf.eof())
return true;
break;
}
return false;
}
DataTypePtr tryInferDateOrDateTime(const std::string_view & field, const FormatSettings & settings)
{
if (settings.try_infer_dates && tryInferDate(field))
return makeNullable(std::make_shared<DataTypeDate>());
if (settings.try_infer_datetimes && tryInferDateTime(field, settings))
return makeNullable(std::make_shared<DataTypeDateTime64>(9));
return nullptr;
}
static DataTypePtr determineDataTypeForSingleFieldImpl(ReadBufferFromString & buf, const FormatSettings & settings)
{
if (buf.eof())
return nullptr;
/// Array
if (checkChar('[', buf))
{
skipWhitespaceIfAny(buf);
DataTypes nested_types;
bool first = true;
while (!buf.eof() && *buf.position() != ']')
{
if (!first)
{
skipWhitespaceIfAny(buf);
if (!checkChar(',', buf))
return nullptr;
skipWhitespaceIfAny(buf);
}
else
first = false;
auto nested_type = determineDataTypeForSingleFieldImpl(buf, settings);
if (!nested_type)
return nullptr;
nested_types.push_back(nested_type);
}
if (buf.eof())
return nullptr;
++buf.position();
if (nested_types.empty())
return std::make_shared<DataTypeArray>(std::make_shared<DataTypeNothing>());
transformInferredTypesIfNeeded(nested_types, settings);
auto least_supertype = tryGetLeastSupertype(nested_types);
if (!least_supertype)
return nullptr;
return std::make_shared<DataTypeArray>(least_supertype);
}
/// Tuple
if (checkChar('(', buf))
{
skipWhitespaceIfAny(buf);
DataTypes nested_types;
bool first = true;
while (!buf.eof() && *buf.position() != ')')
{
if (!first)
{
skipWhitespaceIfAny(buf);
if (!checkChar(',', buf))
return nullptr;
skipWhitespaceIfAny(buf);
}
else
first = false;
auto nested_type = determineDataTypeForSingleFieldImpl(buf, settings);
if (!nested_type)
return nullptr;
nested_types.push_back(nested_type);
}
if (buf.eof() || nested_types.empty())
return nullptr;
++buf.position();
return std::make_shared<DataTypeTuple>(nested_types);
}
/// Map
if (checkChar('{', buf))
{
skipWhitespaceIfAny(buf);
DataTypes key_types;
DataTypes value_types;
bool first = true;
while (!buf.eof() && *buf.position() != '}')
{
if (!first)
{
skipWhitespaceIfAny(buf);
if (!checkChar(',', buf))
return nullptr;
skipWhitespaceIfAny(buf);
}
else
first = false;
auto key_type = determineDataTypeForSingleFieldImpl(buf, settings);
if (!key_type)
return nullptr;
key_types.push_back(key_type);
skipWhitespaceIfAny(buf);
if (!checkChar(':', buf))
return nullptr;
skipWhitespaceIfAny(buf);
auto value_type = determineDataTypeForSingleFieldImpl(buf, settings);
if (!value_type)
return nullptr;
value_types.push_back(value_type);
}
if (buf.eof())
return nullptr;
++buf.position();
skipWhitespaceIfAny(buf);
if (key_types.empty())
return std::make_shared<DataTypeMap>(std::make_shared<DataTypeNothing>(), std::make_shared<DataTypeNothing>());
transformInferredTypesIfNeeded(key_types, settings);
transformInferredTypesIfNeeded(value_types, settings);
auto key_least_supertype = tryGetLeastSupertype(key_types);
auto value_least_supertype = tryGetLeastSupertype(value_types);
if (!key_least_supertype || !value_least_supertype)
return nullptr;
if (!DataTypeMap::checkKeyType(key_least_supertype))
return nullptr;
return std::make_shared<DataTypeMap>(key_least_supertype, value_least_supertype);
}
/// String
if (*buf.position() == '\'')
{
++buf.position();
String field;
while (!buf.eof())
{
char * next_pos = find_first_symbols<'\\', '\''>(buf.position(), buf.buffer().end());
field.append(buf.position(), next_pos);
buf.position() = next_pos;
if (!buf.hasPendingData())
continue;
if (*buf.position() == '\'')
break;
field.push_back(*buf.position());
if (*buf.position() == '\\')
++buf.position();
}
if (buf.eof())
return nullptr;
++buf.position();
if (auto type = tryInferDateOrDateTime(field, settings))
return type;
return std::make_shared<DataTypeString>();
}
/// Bool
if (checkStringCaseInsensitive("true", buf) || checkStringCaseInsensitive("false", buf))
return DataTypeFactory::instance().get("Bool");
/// Null
if (checkStringCaseInsensitive("NULL", buf))
return std::make_shared<DataTypeNothing>();
/// Number
Float64 tmp;
auto * pos_before_float = buf.position();
if (tryReadFloatText(tmp, buf))
{
if (settings.try_infer_integers)
{
auto * float_end_pos = buf.position();
buf.position() = pos_before_float;
Int64 tmp_int;
if (tryReadIntText(tmp_int, buf) && buf.position() == float_end_pos)
return std::make_shared<DataTypeInt64>();
buf.position() = float_end_pos;
}
return std::make_shared<DataTypeFloat64>();
}
return nullptr;
}
static DataTypePtr determineDataTypeForSingleField(ReadBufferFromString & buf, const FormatSettings & settings)
{
return makeNullableRecursivelyAndCheckForNothing(determineDataTypeForSingleFieldImpl(buf, settings));
}
DataTypePtr determineDataTypeByEscapingRule(const String & field, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule)
DataTypePtr tryInferDataTypeByEscapingRule(const String & field, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule, JSONInferenceInfo * json_info)
{
switch (escaping_rule)
{
case FormatSettings::EscapingRule::Quoted:
{
ReadBufferFromString buf(field);
auto type = determineDataTypeForSingleField(buf, format_settings);
return buf.eof() ? type : nullptr;
}
return tryInferDataTypeForSingleField(field, format_settings);
case FormatSettings::EscapingRule::JSON:
return JSONUtils::getDataTypeFromField(field, format_settings);
return tryInferDataTypeForSingleJSONField(field, format_settings, json_info);
case FormatSettings::EscapingRule::CSV:
{
if (!format_settings.csv.use_best_effort_in_schema_inference)
return makeNullable(std::make_shared<DataTypeString>());
return std::make_shared<DataTypeString>();
if (field.empty() || field == format_settings.csv.null_representation)
if (field.empty())
return nullptr;
if (field == format_settings.bool_false_representation || field == format_settings.bool_true_representation)
return DataTypeFactory::instance().get("Nullable(Bool)");
if (field == format_settings.csv.null_representation)
return makeNullable(std::make_shared<DataTypeNothing>());
if (field == format_settings.bool_false_representation || field == format_settings.bool_true_representation)
return DataTypeFactory::instance().get("Bool");
/// In CSV complex types are serialized in quotes. If we have quotes, we should try to infer type
/// from data inside quotes.
if (field.size() > 1 && ((field.front() == '\'' && field.back() == '\'') || (field.front() == '"' && field.back() == '"')))
{
auto data = std::string_view(field.data() + 1, field.size() - 2);
if (auto date_type = tryInferDateOrDateTime(data, format_settings))
/// First, try to infer dates and datetimes.
if (auto date_type = tryInferDateOrDateTimeFromString(data, format_settings))
return date_type;
ReadBufferFromString buf(data);
/// Try to determine the type of value inside quotes
auto type = determineDataTypeForSingleField(buf, format_settings);
auto type = tryInferDataTypeForSingleField(data, format_settings);
if (!type)
return nullptr;
/// If it's a number or tuple in quotes or there is some unread data in buffer, we determine it as a string.
if (isNumber(removeNullable(type)) || isTuple(type) || !buf.eof())
return makeNullable(std::make_shared<DataTypeString>());
/// If we couldn't infer any type or it's a number or tuple in quotes, we determine it as a string.
if (!type || isNumber(removeNullable(type)) || isTuple(type))
return std::make_shared<DataTypeString>();
return type;
}
/// Case when CSV value is not in quotes. Check if it's a number, and if not, determine it's as a string.
if (format_settings.try_infer_integers)
{
ReadBufferFromString buf(field);
Int64 tmp_int;
if (tryReadIntText(tmp_int, buf) && buf.eof())
return makeNullable(std::make_shared<DataTypeInt64>());
}
auto type = tryInferNumberFromString(field, format_settings);
ReadBufferFromString buf(field);
Float64 tmp;
if (tryReadFloatText(tmp, buf) && buf.eof())
return makeNullable(std::make_shared<DataTypeFloat64>());
if (!type)
return std::make_shared<DataTypeString>();
return makeNullable(std::make_shared<DataTypeString>());
return type;
}
case FormatSettings::EscapingRule::Raw: [[fallthrough]];
case FormatSettings::EscapingRule::Escaped:
{
if (!format_settings.tsv.use_best_effort_in_schema_inference)
return makeNullable(std::make_shared<DataTypeString>());
return std::make_shared<DataTypeString>();
if (field.empty() || field == format_settings.tsv.null_representation)
if (field.empty())
return nullptr;
if (field == format_settings.bool_false_representation || field == format_settings.bool_true_representation)
return DataTypeFactory::instance().get("Nullable(Bool)");
if (field == format_settings.tsv.null_representation)
return makeNullable(std::make_shared<DataTypeNothing>());
if (auto date_type = tryInferDateOrDateTime(field, format_settings))
if (field == format_settings.bool_false_representation || field == format_settings.bool_true_representation)
return DataTypeFactory::instance().get("Bool");
if (auto date_type = tryInferDateOrDateTimeFromString(field, format_settings))
return date_type;
ReadBufferFromString buf(field);
auto type = determineDataTypeForSingleField(buf, format_settings);
if (!buf.eof())
return makeNullable(std::make_shared<DataTypeString>());
auto type = tryInferDataTypeForSingleField(field, format_settings);
if (!type)
return std::make_shared<DataTypeString>();
return type;
}
default:
@ -818,15 +328,34 @@ DataTypePtr determineDataTypeByEscapingRule(const String & field, const FormatSe
}
}
DataTypes determineDataTypesByEscapingRule(const std::vector<String> & fields, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule)
DataTypes tryInferDataTypesByEscapingRule(const std::vector<String> & fields, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule, JSONInferenceInfo * json_info)
{
DataTypes data_types;
data_types.reserve(fields.size());
for (const auto & field : fields)
data_types.push_back(determineDataTypeByEscapingRule(field, format_settings, escaping_rule));
data_types.push_back(tryInferDataTypeByEscapingRule(field, format_settings, escaping_rule, json_info));
return data_types;
}
void transformInferredTypesByEscapingRuleIfNeeded(DataTypePtr & first, DataTypePtr & second, const FormatSettings & settings, FormatSettings::EscapingRule escaping_rule, JSONInferenceInfo * json_info)
{
switch (escaping_rule)
{
case FormatSettings::EscapingRule::JSON:
transformInferredJSONTypesIfNeeded(first, second, settings, json_info);
break;
case FormatSettings::EscapingRule::Escaped: [[fallthrough]];
case FormatSettings::EscapingRule::Raw: [[fallthrough]];
case FormatSettings::EscapingRule::Quoted: [[fallthrough]];
case FormatSettings::EscapingRule::CSV:
transformInferredTypesIfNeeded(first, second, settings);
break;
default:
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot transform inferred types for value with {} escaping rule", escapingRuleToString(escaping_rule));
}
}
DataTypePtr getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule escaping_rule)
{
switch (escaping_rule)
@ -834,7 +363,7 @@ DataTypePtr getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule escap
case FormatSettings::EscapingRule::CSV:
case FormatSettings::EscapingRule::Escaped:
case FormatSettings::EscapingRule::Raw:
return makeNullable(std::make_shared<DataTypeString>());
return std::make_shared<DataTypeString>();
default:
return nullptr;
}
@ -851,9 +380,10 @@ DataTypes getDefaultDataTypeForEscapingRules(const std::vector<FormatSettings::E
String getAdditionalFormatInfoForAllRowBasedFormats(const FormatSettings & settings)
{
return fmt::format(
"schema_inference_hints={}, max_rows_to_read_for_schema_inference={}",
"schema_inference_hints={}, max_rows_to_read_for_schema_inference={}, schema_inference_make_columns_nullable={}",
settings.schema_inference_hints,
settings.max_rows_to_read_for_schema_inference);
settings.max_rows_to_read_for_schema_inference,
settings.schema_inference_make_columns_nullable);
}
String getAdditionalFormatInfoByEscapingRule(const FormatSettings & settings, FormatSettings::EscapingRule escaping_rule)
@ -890,7 +420,11 @@ String getAdditionalFormatInfoByEscapingRule(const FormatSettings & settings, Fo
settings.csv.tuple_delimiter);
break;
case FormatSettings::EscapingRule::JSON:
result += fmt::format(", try_infer_numbers_from_strings={}, read_bools_as_numbers={}", settings.json.try_infer_numbers_from_strings, settings.json.read_bools_as_numbers);
result += fmt::format(
", try_infer_numbers_from_strings={}, read_bools_as_numbers={}, try_infer_objects={}",
settings.json.try_infer_numbers_from_strings,
settings.json.read_bools_as_numbers,
settings.json.try_infer_objects);
break;
default:
break;

View File

@ -1,6 +1,7 @@
#pragma once
#include <Formats/FormatSettings.h>
#include <Formats/SchemaInferenceUtils.h>
#include <DataTypes/IDataType.h>
#include <DataTypes/Serializations/ISerialization.h>
#include <IO/ReadBuffer.h>
@ -38,45 +39,17 @@ String readFieldByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule es
/// Try to determine the type of the field written by a specific escaping rule.
/// If cannot, return nullptr.
/// - For Quoted escaping rule we can interpret a single field as a constant
/// expression and get it's type by evaluation this expression.
/// - For JSON escaping rule we can use JSON parser to parse a single field
/// and then convert JSON type of this field to ClickHouse type.
/// - For CSV escaping rule we can do the next:
/// - If the field is an unquoted string, then we try to parse it as a number,
/// and if we cannot, treat it as a String.
/// - If the field is a string in quotes, then we try to use some
/// tweaks and heuristics to determine the type inside quotes, and if we can't or
/// the result is a number or tuple (we don't parse numbers in quotes and don't
/// support tuples in CSV) we treat it as a String.
/// - If input_format_csv_use_best_effort_in_schema_inference is disabled, we
/// treat everything as a string.
/// - For TSV and TSVRaw we try to use some tweaks and heuristics to determine the type
/// of value if setting input_format_tsv_use_best_effort_in_schema_inference is enabled,
/// otherwise we treat everything as a string.
DataTypePtr determineDataTypeByEscapingRule(const String & field, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule);
DataTypes determineDataTypesByEscapingRule(const std::vector<String> & fields, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule);
/// See tryInferDataTypeForSingle(JSON)Field in SchemaInferenceUtils.h
DataTypePtr tryInferDataTypeByEscapingRule(const String & field, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule, JSONInferenceInfo * json_info = nullptr);
DataTypes tryInferDataTypesByEscapingRule(const std::vector<String> & fields, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule, JSONInferenceInfo * json_info = nullptr);
/// Check if we need to transform types inferred from data and transform it if necessary.
/// See transformInferred(JSON)TypesIfNeeded in SchemaInferenceUtils.h
void transformInferredTypesByEscapingRuleIfNeeded(DataTypePtr & first, DataTypePtr & second, const FormatSettings & settings, FormatSettings::EscapingRule escaping_rule, JSONInferenceInfo * json_info = nullptr);
DataTypePtr getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule escaping_rule);
DataTypes getDefaultDataTypeForEscapingRules(const std::vector<FormatSettings::EscapingRule> & escaping_rules);
/// Try to infer Date or Datetime from string if corresponding settings are enabled.
DataTypePtr tryInferDateOrDateTime(const std::string_view & field, const FormatSettings & settings);
/// Check if we need to transform types inferred from data and transform it if necessary.
/// It's used when we try to infer some not ordinary types from another types.
/// For example dates from strings, we should check if dates were inferred from all strings
/// in the same way and if not, transform inferred dates back to strings.
/// For example, if we have array of strings and we tried to infer dates from them,
/// to make the result type Array(Date) we should ensure that all strings were
/// successfully parsed as dated and if not, convert all dates back to strings and make result type Array(String).
void transformInferredTypesIfNeeded(DataTypes & types, const FormatSettings & settings, FormatSettings::EscapingRule escaping_rule = FormatSettings::EscapingRule::Escaped);
void transformInferredTypesIfNeeded(DataTypePtr & first, DataTypePtr & second, const FormatSettings & settings, FormatSettings::EscapingRule escaping_rule = FormatSettings::EscapingRule::Escaped);
/// Same as transformInferredTypesIfNeeded but takes into account settings that are special for JSON formats.
void transformInferredJSONTypesIfNeeded(DataTypes & types, const FormatSettings & settings, const std::unordered_set<const IDataType *> * numbers_parsed_from_json_strings = nullptr);
void transformInferredJSONTypesIfNeeded(DataTypePtr & first, DataTypePtr & second, const FormatSettings & settings);
String getAdditionalFormatInfoForAllRowBasedFormats(const FormatSettings & settings);
String getAdditionalFormatInfoByEscapingRule(const FormatSettings & settings, FormatSettings::EscapingRule escaping_rule);

View File

@ -169,6 +169,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
format_settings.max_rows_to_read_for_schema_inference = settings.input_format_max_rows_to_read_for_schema_inference;
format_settings.column_names_for_schema_inference = settings.column_names_for_schema_inference;
format_settings.schema_inference_hints = settings.schema_inference_hints;
format_settings.schema_inference_make_columns_nullable = settings.schema_inference_make_columns_nullable;
format_settings.mysql_dump.table_name = settings.input_format_mysql_dump_table_name;
format_settings.mysql_dump.map_column_names = settings.input_format_mysql_dump_map_column_names;
format_settings.sql_insert.max_batch_size = settings.output_format_sql_insert_max_batch_size;
@ -182,6 +183,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
format_settings.bson.output_string_as_string = settings.output_format_bson_string_as_string;
format_settings.bson.skip_fields_with_unsupported_types_in_schema_inference = settings.input_format_bson_skip_fields_with_unsupported_types_in_schema_inference;
format_settings.max_binary_string_size = settings.format_binary_max_string_size;
format_settings.max_parser_depth = context->getSettingsRef().max_parser_depth;
/// Validate avro_schema_registry_url with RemoteHostFilter when non-empty and in Server context
if (format_settings.schema.is_server)

View File

@ -71,6 +71,8 @@ struct FormatSettings
Raw
};
bool schema_inference_make_columns_nullable = true;
DateTimeOutputFormat date_time_output_format = DateTimeOutputFormat::Simple;
bool input_format_ipv4_default_on_conversion_error = false;
@ -81,6 +83,8 @@ struct FormatSettings
UInt64 max_binary_string_size = 0;
UInt64 max_parser_depth = DBMS_DEFAULT_MAX_PARSER_DEPTH;
struct
{
UInt64 row_group_size = 1000000;

View File

@ -6,19 +6,13 @@
#include <IO/WriteBufferValidUTF8.h>
#include <DataTypes/Serializations/SerializationNullable.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypeMap.h>
#include <DataTypes/DataTypeObject.h>
#include <DataTypes/DataTypeFactory.h>
#include <Common/JSONParsers/SimdJSONParser.h>
#include <Common/JSONParsers/RapidJSONParser.h>
#include <Common/JSONParsers/DummyJSONParser.h>
#include <base/find_symbols.h>
#include <Common/logger_useful.h>
namespace DB
{
@ -122,206 +116,6 @@ namespace JSONUtils
return {loadAtPosition(in, memory, pos), number_of_rows};
}
template <const char opening_bracket, const char closing_bracket>
static String readJSONEachRowLineIntoStringImpl(ReadBuffer & in)
{
Memory memory;
fileSegmentationEngineJSONEachRowImpl<opening_bracket, closing_bracket>(in, memory, 0, 1, 1);
return String(memory.data(), memory.size());
}
template <class Element>
DataTypePtr getDataTypeFromFieldImpl(const Element & field, const FormatSettings & settings, std::unordered_set<const IDataType *> & numbers_parsed_from_json_strings)
{
if (field.isNull())
return nullptr;
if (field.isBool())
return DataTypeFactory::instance().get("Nullable(Bool)");
if (field.isInt64() || field.isUInt64())
{
if (settings.try_infer_integers)
return makeNullable(std::make_shared<DataTypeInt64>());
return makeNullable(std::make_shared<DataTypeFloat64>());
}
if (field.isDouble())
return makeNullable(std::make_shared<DataTypeFloat64>());
if (field.isString())
{
if (auto date_type = tryInferDateOrDateTime(field.getString(), settings))
return date_type;
if (!settings.json.try_infer_numbers_from_strings)
return makeNullable(std::make_shared<DataTypeString>());
ReadBufferFromString buf(field.getString());
if (settings.try_infer_integers)
{
Int64 tmp_int;
if (tryReadIntText(tmp_int, buf) && buf.eof())
{
auto type = std::make_shared<DataTypeInt64>();
numbers_parsed_from_json_strings.insert(type.get());
return makeNullable(type);
}
}
Float64 tmp;
if (tryReadFloatText(tmp, buf) && buf.eof())
{
auto type = std::make_shared<DataTypeFloat64>();
numbers_parsed_from_json_strings.insert(type.get());
return makeNullable(type);
}
return makeNullable(std::make_shared<DataTypeString>());
}
if (field.isArray())
{
auto array = field.getArray();
/// Return nullptr in case of empty array because we cannot determine nested type.
if (array.size() == 0)
return nullptr;
DataTypes nested_data_types;
/// If this array contains fields with different types we will treat it as Tuple.
bool are_types_the_same = true;
for (const auto element : array)
{
auto type = getDataTypeFromFieldImpl(element, settings, numbers_parsed_from_json_strings);
if (!type)
return nullptr;
if (!nested_data_types.empty() && !type->equals(*nested_data_types.back()))
are_types_the_same = false;
nested_data_types.push_back(std::move(type));
}
if (!are_types_the_same)
{
auto nested_types_copy = nested_data_types;
transformInferredJSONTypesIfNeeded(nested_types_copy, settings, &numbers_parsed_from_json_strings);
are_types_the_same = true;
for (size_t i = 1; i < nested_types_copy.size(); ++i)
are_types_the_same &= nested_types_copy[i]->equals(*nested_types_copy[i - 1]);
if (are_types_the_same)
nested_data_types = std::move(nested_types_copy);
}
if (!are_types_the_same)
return std::make_shared<DataTypeTuple>(nested_data_types);
return std::make_shared<DataTypeArray>(nested_data_types.back());
}
if (field.isObject())
{
auto object = field.getObject();
DataTypes value_types;
for (const auto key_value_pair : object)
{
auto type = getDataTypeFromFieldImpl(key_value_pair.second, settings, numbers_parsed_from_json_strings);
if (!type)
{
/// If we couldn't infer nested type and Object type is not enabled,
/// we can't determine the type of this JSON field.
if (!settings.json.try_infer_objects)
{
/// If read_objects_as_strings is enabled, we can read objects into strings.
if (settings.json.read_objects_as_strings)
return makeNullable(std::make_shared<DataTypeString>());
return nullptr;
}
continue;
}
if (settings.json.try_infer_objects && isObject(type))
return std::make_shared<DataTypeObject>("json", true);
value_types.push_back(type);
}
if (value_types.empty())
return nullptr;
transformInferredJSONTypesIfNeeded(value_types, settings, &numbers_parsed_from_json_strings);
bool are_types_equal = true;
for (size_t i = 1; i < value_types.size(); ++i)
are_types_equal &= value_types[i]->equals(*value_types[0]);
if (!are_types_equal)
{
if (!settings.json.try_infer_objects)
{
/// If read_objects_as_strings is enabled, we can read objects into strings.
if (settings.json.read_objects_as_strings)
return makeNullable(std::make_shared<DataTypeString>());
return nullptr;
}
return std::make_shared<DataTypeObject>("json", true);
}
return std::make_shared<DataTypeMap>(std::make_shared<DataTypeString>(), value_types[0]);
}
throw Exception{ErrorCodes::INCORRECT_DATA, "Unexpected JSON type"};
}
auto getJSONParserAndElement()
{
#if USE_SIMDJSON
return std::pair<SimdJSONParser, SimdJSONParser::Element>();
#elif USE_RAPIDJSON
return std::pair<RapidJSONParser, RapidJSONParser::Element>();
#else
return std::pair<DummyJSONParser, DummyJSONParser::Element>();
#endif
}
DataTypePtr getDataTypeFromField(const String & field, const FormatSettings & settings)
{
auto [parser, element] = getJSONParserAndElement();
bool parsed = parser.parse(field, element);
if (!parsed)
throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse JSON object here: {}", field);
std::unordered_set<const IDataType *> numbers_parsed_from_json_strings;
return getDataTypeFromFieldImpl(element, settings, numbers_parsed_from_json_strings);
}
template <class Extractor, const char opening_bracket, const char closing_bracket>
static DataTypes determineColumnDataTypesFromJSONEachRowDataImpl(ReadBuffer & in, const FormatSettings & settings, bool /*json_strings*/, Extractor & extractor)
{
String line = readJSONEachRowLineIntoStringImpl<opening_bracket, closing_bracket>(in);
auto [parser, element] = getJSONParserAndElement();
bool parsed = parser.parse(line, element);
if (!parsed)
throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse JSON object here: {}", line);
auto fields = extractor.extract(element);
DataTypes data_types;
data_types.reserve(fields.size());
std::unordered_set<const IDataType *> numbers_parsed_from_json_strings;
for (const auto & field : fields)
data_types.push_back(getDataTypeFromFieldImpl(field, settings, numbers_parsed_from_json_strings));
/// TODO: For JSONStringsEachRow/JSONCompactStringsEach all types will be strings.
/// Should we try to parse data inside strings somehow in this case?
return data_types;
}
std::pair<bool, size_t> fileSegmentationEngineJSONEachRow(ReadBuffer & in, DB::Memory<> & memory, size_t min_bytes, size_t max_rows)
{
return fileSegmentationEngineJSONEachRowImpl<'{', '}'>(in, memory, min_bytes, 1, max_rows);
@ -333,68 +127,56 @@ namespace JSONUtils
return fileSegmentationEngineJSONEachRowImpl<'[', ']'>(in, memory, min_bytes, min_rows, max_rows);
}
struct JSONEachRowFieldsExtractor
NamesAndTypesList readRowAndGetNamesAndDataTypesForJSONEachRow(ReadBuffer & in, const FormatSettings & settings, JSONInferenceInfo * inference_info)
{
template <class Element>
std::vector<Element> extract(const Element & element)
skipWhitespaceIfAny(in);
assertChar('{', in);
bool first = true;
NamesAndTypesList names_and_types;
String field;
while (!in.eof() && *in.position() != '}')
{
/// {..., "<column_name>" : <value>, ...}
if (!first)
skipComma(in);
else
first = false;
if (!element.isObject())
throw Exception(ErrorCodes::INCORRECT_DATA, "Root JSON value is not an object");
auto object = element.getObject();
std::vector<Element> fields;
fields.reserve(object.size());
column_names.reserve(object.size());
for (const auto & key_value_pair : object)
{
column_names.emplace_back(key_value_pair.first);
fields.push_back(key_value_pair.second);
}
return fields;
auto name = readFieldName(in);
auto type = tryInferDataTypeForSingleJSONField(in, settings, inference_info);
names_and_types.emplace_back(name, type);
}
std::vector<String> column_names;
};
if (in.eof())
throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected EOF while reading JSON object");
NamesAndTypesList readRowAndGetNamesAndDataTypesForJSONEachRow(ReadBuffer & in, const FormatSettings & settings, bool json_strings)
{
JSONEachRowFieldsExtractor extractor;
auto data_types
= determineColumnDataTypesFromJSONEachRowDataImpl<JSONEachRowFieldsExtractor, '{', '}'>(in, settings, json_strings, extractor);
NamesAndTypesList result;
for (size_t i = 0; i != extractor.column_names.size(); ++i)
result.emplace_back(extractor.column_names[i], data_types[i]);
return result;
assertChar('}', in);
return names_and_types;
}
struct JSONCompactEachRowFieldsExtractor
DataTypes readRowAndGetDataTypesForJSONCompactEachRow(ReadBuffer & in, const FormatSettings & settings, JSONInferenceInfo * inference_info)
{
template <class Element>
std::vector<Element> extract(const Element & element)
skipWhitespaceIfAny(in);
assertChar('[', in);
bool first = true;
DataTypes types;
String field;
while (!in.eof() && *in.position() != ']')
{
/// [..., <value>, ...]
if (!element.isArray())
throw Exception(ErrorCodes::INCORRECT_DATA, "Root JSON value is not an array");
auto array = element.getArray();
std::vector<Element> fields;
fields.reserve(array.size());
for (size_t i = 0; i != array.size(); ++i)
fields.push_back(array[i]);
return fields;
if (!first)
skipComma(in);
else
first = false;
auto type = tryInferDataTypeForSingleJSONField(in, settings, inference_info);
types.push_back(std::move(type));
}
};
DataTypes readRowAndGetDataTypesForJSONCompactEachRow(ReadBuffer & in, const FormatSettings & settings, bool json_strings)
{
JSONCompactEachRowFieldsExtractor extractor;
return determineColumnDataTypesFromJSONEachRowDataImpl<JSONCompactEachRowFieldsExtractor, '[', ']'>(in, settings, json_strings, extractor);
if (in.eof())
throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected EOF while reading JSON array");
assertChar(']', in);
return types;
}
bool nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl(ReadBuffer & buf)
{
/// For JSONEachRow we can safely skip whitespace characters

View File

@ -13,24 +13,21 @@
namespace DB
{
struct JSONInferenceInfo;
namespace JSONUtils
{
std::pair<bool, size_t> fileSegmentationEngineJSONEachRow(ReadBuffer & in, DB::Memory<> & memory, size_t min_bytes, size_t max_rows);
std::pair<bool, size_t> fileSegmentationEngineJSONCompactEachRow(ReadBuffer & in, DB::Memory<> & memory, size_t min_bytes, size_t min_rows, size_t max_rows);
/// Parse JSON from string and convert it's type to ClickHouse type. Make the result type always Nullable.
/// JSON array with different nested types is treated as Tuple.
/// If cannot convert (for example when field contains null), return nullptr.
DataTypePtr getDataTypeFromField(const String & field, const FormatSettings & settings);
/// Read row in JSONEachRow format and try to determine type for each field.
/// Return list of names and types.
/// If cannot determine the type of some field, return nullptr for it.
NamesAndTypesList readRowAndGetNamesAndDataTypesForJSONEachRow(ReadBuffer & in, const FormatSettings & settings, bool json_strings);
NamesAndTypesList readRowAndGetNamesAndDataTypesForJSONEachRow(ReadBuffer & in, const FormatSettings & settings, JSONInferenceInfo * inference_info);
/// Read row in JSONCompactEachRow format and try to determine type for each field.
/// If cannot determine the type of some field, return nullptr for it.
DataTypes readRowAndGetDataTypesForJSONCompactEachRow(ReadBuffer & in, const FormatSettings & settings, bool json_strings);
DataTypes readRowAndGetDataTypesForJSONCompactEachRow(ReadBuffer & in, const FormatSettings & settings, JSONInferenceInfo * inference_info);
bool nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl(ReadBuffer & buf);

View File

@ -197,69 +197,6 @@ ColumnsDescription readSchemaFromFormat(const String & format_name, const std::o
return readSchemaFromFormat(format_name, format_settings, read_buffer_iterator, retry, context, buf_out);
}
DataTypePtr makeNullableRecursivelyAndCheckForNothing(DataTypePtr type)
{
if (!type)
return nullptr;
WhichDataType which(type);
if (which.isNothing())
return nullptr;
if (which.isNullable())
{
const auto * nullable_type = assert_cast<const DataTypeNullable *>(type.get());
return makeNullableRecursivelyAndCheckForNothing(nullable_type->getNestedType());
}
if (which.isArray())
{
const auto * array_type = assert_cast<const DataTypeArray *>(type.get());
auto nested_type = makeNullableRecursivelyAndCheckForNothing(array_type->getNestedType());
return nested_type ? std::make_shared<DataTypeArray>(nested_type) : nullptr;
}
if (which.isTuple())
{
const auto * tuple_type = assert_cast<const DataTypeTuple *>(type.get());
DataTypes nested_types;
for (const auto & element : tuple_type->getElements())
{
auto nested_type = makeNullableRecursivelyAndCheckForNothing(element);
if (!nested_type)
return nullptr;
nested_types.push_back(nested_type);
}
return std::make_shared<DataTypeTuple>(std::move(nested_types));
}
if (which.isMap())
{
const auto * map_type = assert_cast<const DataTypeMap *>(type.get());
auto key_type = makeNullableRecursivelyAndCheckForNothing(map_type->getKeyType());
auto value_type = makeNullableRecursivelyAndCheckForNothing(map_type->getValueType());
return key_type && value_type ? std::make_shared<DataTypeMap>(removeNullable(key_type), value_type) : nullptr;
}
if (which.isLowCarnality())
{
const auto * lc_type = assert_cast<const DataTypeLowCardinality *>(type.get());
auto nested_type = makeNullableRecursivelyAndCheckForNothing(lc_type->getDictionaryType());
return nested_type ? std::make_shared<DataTypeLowCardinality>(nested_type) : nullptr;
}
return makeNullable(type);
}
NamesAndTypesList getNamesAndRecursivelyNullableTypes(const Block & header)
{
NamesAndTypesList result;
for (auto & [name, type] : header.getNamesAndTypesList())
result.emplace_back(name, makeNullableRecursivelyAndCheckForNothing(type));
return result;
}
SchemaCache::Key getKeyForSchemaCache(const String & source, const String & format, const std::optional<FormatSettings> & format_settings, const ContextPtr & context)
{
return getKeysForSchemaCache({source}, format, format_settings, context).front();

View File

@ -35,21 +35,7 @@ ColumnsDescription readSchemaFromFormat(
ContextPtr & context,
std::unique_ptr<ReadBuffer> & buf_out);
/// Make type Nullable recursively:
/// - Type -> Nullable(type)
/// - Array(Type) -> Array(Nullable(Type))
/// - Tuple(Type1, ..., TypeN) -> Tuple(Nullable(Type1), ..., Nullable(TypeN))
/// - Map(KeyType, ValueType) -> Map(KeyType, Nullable(ValueType))
/// - LowCardinality(Type) -> LowCardinality(Nullable(Type))
/// If type is Nothing or one of the nested types is Nothing, return nullptr.
DataTypePtr makeNullableRecursivelyAndCheckForNothing(DataTypePtr type);
/// Call makeNullableRecursivelyAndCheckForNothing for all types
/// in the block and return names and types.
NamesAndTypesList getNamesAndRecursivelyNullableTypes(const Block & header);
SchemaCache::Key getKeyForSchemaCache(const String & source, const String & format, const std::optional<FormatSettings> & format_settings, const ContextPtr & context);
SchemaCache::Keys getKeysForSchemaCache(const Strings & sources, const String & format, const std::optional<FormatSettings> & format_settings, const ContextPtr & context);
void splitSchemaCacheKey(const String & key, String & source, String & format, String & additional_format_info);
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,93 @@
#pragma once
#include <DataTypes/IDataType.h>
#include <IO/ReadBuffer.h>
namespace DB
{
/// Struct with some additional information about inferred types for JSON formats.
struct JSONInferenceInfo
{
/// We store numbers that were parsed from strings.
/// It's used in types transformation to change such numbers back to string if needed.
std::unordered_set<const IDataType *> numbers_parsed_from_json_strings;
/// Indicates if currently we are inferring type for Map/Object key.
bool is_object_key = false;
};
/// Try to determine datatype of the value in buffer/string. If the type cannot be inferred, return nullptr.
/// In general, it tries to parse a type using the following logic:
/// If we see '[', we try to parse an array of values and recursively determine datatype for each element.
/// If we see '(', we try to parse a tuple of values and recursively determine datatype for each element.
/// If we see '{', we try to parse a Map of keys and values and recursively determine datatype for each key/value.
/// If we see a quote '\'', we treat it as a string and read until next quote.
/// If we see NULL it returns Nullable(Nothing)
/// Otherwise we try to read a number.
DataTypePtr tryInferDataTypeForSingleField(ReadBuffer & buf, const FormatSettings & settings);
DataTypePtr tryInferDataTypeForSingleField(std::string_view field, const FormatSettings & settings);
/// The same as tryInferDataTypeForSingleField, but for JSON values.
DataTypePtr tryInferDataTypeForSingleJSONField(ReadBuffer & buf, const FormatSettings & settings, JSONInferenceInfo * json_info);
DataTypePtr tryInferDataTypeForSingleJSONField(std::string_view field, const FormatSettings & settings, JSONInferenceInfo * json_info);
/// Try to parse Date or DateTime value from a string.
DataTypePtr tryInferDateOrDateTimeFromString(std::string_view field, const FormatSettings & settings);
/// Try to parse a number value from a string. By default, it tries to parse Float64,
/// but if setting try_infer_integers is enabled, it also tries to parse Int64.
DataTypePtr tryInferNumberFromString(std::string_view field, const FormatSettings & settings);
/// It takes two types inferred for the same column and tries to transform them to a common type if possible.
/// It's also used when we try to infer some not ordinary types from another types.
/// Example 1:
/// Dates inferred from strings. In this case we should check if dates were inferred from all strings
/// in the same way and if not, transform inferred dates back to strings.
/// For example, when we have Array(Date) (like `['2020-01-01', '2020-02-02']`) and Array(String) (like `['string', 'abc']`
/// we will convert the first type to Array(String).
/// Example 2:
/// When we have integers and floats for the same value, we should convert all integers to floats.
/// For example, when we have Array(Int64) (like `[123, 456]`) and Array(Float64) (like `[42.42, 4.42]`)
/// we will convert the first type to Array(Float64)
/// Example 3:
/// When we have not complete types like Nullable(Nothing), Array(Nullable(Nothing)) or Tuple(UInt64, Nullable(Nothing)),
/// we try to complete them using the other type.
/// For example, if we have Tuple(UInt64, Nullable(Nothing)) and Tuple(Nullable(Nothing), String) we will convert both
/// types to common type Tuple(Nullable(UInt64), Nullable(String))
void transformInferredTypesIfNeeded(DataTypePtr & first, DataTypePtr & second, const FormatSettings & settings);
/// The same as transformInferredTypesIfNeeded but uses some specific transformations for JSON.
/// Example 1:
/// When we have numbers inferred from strings and strings, we convert all such numbers back to string.
/// For example, if we have Array(Int64) (like `['123', '456']`) and Array(String) (like `['str', 'abc']`)
/// we will convert the first type to Array(String). Note that we collect information about numbers inferred
/// from strings in json_info while inference and use it here, so we will know that Array(Int64) contains
/// integer inferred from a string.
/// Example 2:
/// When we have maps with different value types, we convert all types to JSON object type.
/// For example, if we have Map(String, UInt64) (like `{"a" : 123}`) and Map(String, String) (like `{"b" : 'abc'}`)
/// we will convert both types to Object('JSON').
void transformInferredJSONTypesIfNeeded(DataTypePtr & first, DataTypePtr & second, const FormatSettings & settings, JSONInferenceInfo * json_info);
/// Check if type is Tuple(...), try to transform nested types to find a common type for them and if all nested types
/// are the same after transform, we convert this tuple to an Array with common nested type.
/// For example, if we have Tuple(String, Nullable(Nothing)) we will convert it to Array(String).
/// It's used when all rows were read and we have Tuple in the result type that can be actually an Array.
void transformJSONTupleToArrayIfPossible(DataTypePtr & data_type, const FormatSettings & settings, JSONInferenceInfo * json_info);
/// Make type Nullable recursively:
/// - Type -> Nullable(type)
/// - Array(Type) -> Array(Nullable(Type))
/// - Tuple(Type1, ..., TypeN) -> Tuple(Nullable(Type1), ..., Nullable(TypeN))
/// - Map(KeyType, ValueType) -> Map(KeyType, Nullable(ValueType))
/// - LowCardinality(Type) -> LowCardinality(Nullable(Type))
DataTypePtr makeNullableRecursively(DataTypePtr type);
/// Call makeNullableRecursively for all types
/// in the block and return names and types.
NamesAndTypesList getNamesAndRecursivelyNullableTypes(const Block & header);
/// Check if type contains Nothing, like Array(Tuple(Nullable(Nothing), String))
bool checkIfTypeIsComplete(const DataTypePtr & type);
}

View File

@ -319,12 +319,17 @@ template void readStringUntilEOFInto<PaddedPODArray<UInt8>>(PaddedPODArray<UInt8
/** Parse the escape sequence, which can be simple (one character after backslash) or more complex (multiple characters).
* It is assumed that the cursor is located on the `\` symbol
*/
template <typename Vector>
static void parseComplexEscapeSequence(Vector & s, ReadBuffer & buf)
template <typename Vector, typename ReturnType = void>
static ReturnType parseComplexEscapeSequence(Vector & s, ReadBuffer & buf)
{
++buf.position();
if (buf.eof())
throw Exception("Cannot parse escape sequence", ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE);
{
if constexpr (std::is_same_v<ReturnType, void>)
throw Exception("Cannot parse escape sequence", ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE);
else
return ReturnType(false);
}
char char_after_backslash = *buf.position();
@ -363,6 +368,8 @@ static void parseComplexEscapeSequence(Vector & s, ReadBuffer & buf)
s.push_back(decoded_char);
++buf.position();
}
return ReturnType(true);
}
@ -521,14 +528,18 @@ template void readEscapedStringInto<NullOutput>(NullOutput & s, ReadBuffer & buf
* backslash escape sequences are also parsed,
* that could be slightly confusing.
*/
template <char quote, bool enable_sql_style_quoting, typename Vector>
static void readAnyQuotedStringInto(Vector & s, ReadBuffer & buf)
template <char quote, bool enable_sql_style_quoting, typename Vector, typename ReturnType = void>
static ReturnType readAnyQuotedStringInto(Vector & s, ReadBuffer & buf)
{
static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
if (buf.eof() || *buf.position() != quote)
{
throw ParsingException(ErrorCodes::CANNOT_PARSE_QUOTED_STRING,
"Cannot parse quoted string: expected opening quote '{}', got '{}'",
std::string{quote}, buf.eof() ? "EOF" : std::string{*buf.position()});
if constexpr (throw_exception)
throw ParsingException(ErrorCodes::CANNOT_PARSE_QUOTED_STRING,
"Cannot parse quoted string: expected opening quote '{}', got '{}'",
std::string{quote}, buf.eof() ? "EOF" : std::string{*buf.position()});
else
return ReturnType(false);
}
++buf.position();
@ -554,15 +565,26 @@ static void readAnyQuotedStringInto(Vector & s, ReadBuffer & buf)
continue;
}
return;
return ReturnType(true);
}
if (*buf.position() == '\\')
parseComplexEscapeSequence(s, buf);
{
if constexpr (throw_exception)
parseComplexEscapeSequence<Vector, ReturnType>(s, buf);
else
{
if (!parseComplexEscapeSequence<Vector, ReturnType>(s, buf))
return ReturnType(false);
}
}
}
throw ParsingException("Cannot parse quoted string: expected closing quote",
ErrorCodes::CANNOT_PARSE_QUOTED_STRING);
if constexpr (throw_exception)
throw ParsingException("Cannot parse quoted string: expected closing quote",
ErrorCodes::CANNOT_PARSE_QUOTED_STRING);
else
return ReturnType(false);
}
template <bool enable_sql_style_quoting, typename Vector>
@ -571,6 +593,14 @@ void readQuotedStringInto(Vector & s, ReadBuffer & buf)
readAnyQuotedStringInto<'\'', enable_sql_style_quoting>(s, buf);
}
template <typename Vector>
bool tryReadQuotedStringInto(Vector & s, ReadBuffer & buf)
{
return readAnyQuotedStringInto<'\'', false, Vector, bool>(s, buf);
}
template bool tryReadQuotedStringInto(String & s, ReadBuffer & buf);
template <bool enable_sql_style_quoting, typename Vector>
void readDoubleQuotedStringInto(Vector & s, ReadBuffer & buf)
{
@ -934,6 +964,7 @@ template void readJSONStringInto<PaddedPODArray<UInt8>, void>(PaddedPODArray<UIn
template bool readJSONStringInto<PaddedPODArray<UInt8>, bool>(PaddedPODArray<UInt8> & s, ReadBuffer & buf);
template void readJSONStringInto<NullOutput>(NullOutput & s, ReadBuffer & buf);
template void readJSONStringInto<String>(String & s, ReadBuffer & buf);
template bool readJSONStringInto<String, bool>(String & s, ReadBuffer & buf);
template <typename Vector, typename ReturnType>
ReturnType readJSONObjectPossiblyInvalid(Vector & s, ReadBuffer & buf)
@ -1501,6 +1532,43 @@ static void readParsedValueInto(Vector & s, ReadBuffer & buf, ParseFunc parse_fu
peekable_buf.position() = end;
}
template <typename Vector>
static void readQuotedStringFieldInto(Vector & s, ReadBuffer & buf)
{
assertChar('\'', buf);
s.push_back('\'');
while (!buf.eof())
{
char * next_pos = find_first_symbols<'\\', '\''>(buf.position(), buf.buffer().end());
s.append(buf.position(), next_pos);
buf.position() = next_pos;
if (!buf.hasPendingData())
continue;
if (*buf.position() == '\'')
break;
s.push_back(*buf.position());
if (*buf.position() == '\\')
{
++buf.position();
if (!buf.eof())
{
s.push_back(*buf.position());
++buf.position();
}
}
}
if (buf.eof())
return;
++buf.position();
s.push_back('\'');
}
template <char opening_bracket, char closing_bracket, typename Vector>
static void readQuotedFieldInBracketsInto(Vector & s, ReadBuffer & buf)
{
@ -1518,20 +1586,19 @@ static void readQuotedFieldInBracketsInto(Vector & s, ReadBuffer & buf)
if (!buf.hasPendingData())
continue;
s.push_back(*buf.position());
if (*buf.position() == '\'')
{
readQuotedStringInto<false>(s, buf);
s.push_back('\'');
readQuotedStringFieldInto(s, buf);
}
else if (*buf.position() == opening_bracket)
{
s.push_back(opening_bracket);
++balance;
++buf.position();
}
else if (*buf.position() == closing_bracket)
{
s.push_back(closing_bracket);
--balance;
++buf.position();
}
@ -1554,11 +1621,7 @@ void readQuotedFieldInto(Vector & s, ReadBuffer & buf)
/// - Number: integer, float, decimal.
if (*buf.position() == '\'')
{
s.push_back('\'');
readQuotedStringInto<false>(s, buf);
s.push_back('\'');
}
readQuotedStringFieldInto(s, buf);
else if (*buf.position() == '[')
readQuotedFieldInBracketsInto<'[', ']'>(s, buf);
else if (*buf.position() == '(')

View File

@ -613,6 +613,9 @@ bool tryReadJSONStringInto(Vector & s, ReadBuffer & buf)
return readJSONStringInto<Vector, bool>(s, buf);
}
template <typename Vector>
bool tryReadQuotedStringInto(Vector & s, ReadBuffer & buf);
/// Reads chunk of data between {} in that way,
/// that it has balanced parentheses sequence of {}.
/// So, it may form a JSON object, but it can be incorrenct.

View File

@ -1,6 +1,5 @@
#include <Processors/Formats/ISchemaReader.h>
#include <Formats/ReadSchemaUtils.h>
#include <Formats/EscapingRuleUtils.h>
#include <Formats/SchemaInferenceUtils.h>
#include <DataTypes/DataTypeString.h>
#include <Interpreters/parseColumnsListForTableFunction.h>
#include <boost/algorithm/string.hpp>
@ -11,65 +10,29 @@ namespace DB
namespace ErrorCodes
{
extern const int ONLY_NULLS_WHILE_READING_SCHEMA;
extern const int TYPE_MISMATCH;
extern const int INCORRECT_DATA;
extern const int EMPTY_DATA_PASSED;
extern const int BAD_ARGUMENTS;
}
void chooseResultColumnType(
DataTypePtr & type,
DataTypePtr & new_type,
std::function<void(DataTypePtr &, DataTypePtr &)> transform_types_if_needed,
const DataTypePtr & default_type,
const String & column_name,
size_t row)
void checkFinalInferredType(DataTypePtr & type, const String & name, const FormatSettings & settings, const DataTypePtr & default_type, size_t rows_read)
{
if (!type)
{
type = new_type;
return;
}
if (!new_type || type->equals(*new_type))
return;
transform_types_if_needed(type, new_type);
if (type->equals(*new_type))
return;
/// If the new type and the previous type for this column are different,
/// we will use default type if we have it or throw an exception.
if (default_type)
type = default_type;
else
{
throw Exception(
ErrorCodes::TYPE_MISMATCH,
"Automatically defined type {} for column '{}' in row {} differs from type defined by previous rows: {}. "
"You can specify the type for this column using setting schema_inference_hints",
type->getName(),
column_name,
row,
new_type->getName());
}
}
void checkResultColumnTypeAndAppend(NamesAndTypesList & result, DataTypePtr & type, const String & name, const DataTypePtr & default_type, size_t rows_read)
{
if (!type)
if (!checkIfTypeIsComplete(type))
{
if (!default_type)
throw Exception(
ErrorCodes::ONLY_NULLS_WHILE_READING_SCHEMA,
"Cannot determine type for column '{}' by first {} rows of data, most likely this column contains only Nulls or empty "
"Arrays/Maps. You can specify the type for this column using setting schema_inference_hints",
"Arrays/Maps. You can specify the type for this column using setting schema_inference_hints. "
"If your data contains complex JSON objects, try enabling one of the settings allow_experimental_object_type/input_format_json_read_objects_as_strings",
name,
rows_read);
type = default_type;
}
result.emplace_back(name, type);
if (settings.schema_inference_make_columns_nullable)
type = makeNullableRecursively(type);
}
IIRowSchemaReader::IIRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_, DataTypePtr default_type_)
@ -88,6 +51,11 @@ void IIRowSchemaReader::setContext(ContextPtr & context)
}
}
void IIRowSchemaReader::transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type)
{
transformInferredTypesIfNeeded(type, new_type, format_settings);
}
IRowSchemaReader::IRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_)
: IIRowSchemaReader(in_, format_settings_), column_names(splitColumnNames(format_settings.column_names_for_schema_inference))
{
@ -160,23 +128,28 @@ NamesAndTypesList IRowSchemaReader::readSchema()
if (new_data_types.size() != data_types.size())
throw Exception(ErrorCodes::INCORRECT_DATA, "Rows have different amount of values");
for (size_t i = 0; i != data_types.size(); ++i)
for (field_index = 0; field_index != data_types.size(); ++field_index)
{
/// Check if we couldn't determine the type of this column in a new row
/// or the type for this column was taken from hints.
if (!new_data_types[i] || hints.contains(column_names[i]))
if (!new_data_types[field_index] || hints.contains(column_names[field_index]))
continue;
auto transform_types_if_needed = [&](DataTypePtr & type, DataTypePtr & new_type){ transformTypesIfNeeded(type, new_type, i); };
chooseResultColumnType(data_types[i], new_data_types[i], transform_types_if_needed, getDefaultType(i), std::to_string(i + 1), rows_read);
chooseResultColumnType(*this, data_types[field_index], new_data_types[field_index], getDefaultType(field_index), std::to_string(field_index + 1), rows_read);
}
}
NamesAndTypesList result;
for (size_t i = 0; i != data_types.size(); ++i)
for (field_index = 0; field_index != data_types.size(); ++field_index)
{
/// Check that we could determine the type of this column.
checkResultColumnTypeAndAppend(result, data_types[i], column_names[i], getDefaultType(i), rows_read);
/// Don't check/change types from hints.
if (!hints.contains(column_names[field_index]))
{
transformFinalTypeIfNeeded(data_types[field_index]);
/// Check that we could determine the type of this column.
checkFinalInferredType(data_types[field_index], column_names[field_index], format_settings, getDefaultType(field_index), rows_read);
}
result.emplace_back(column_names[field_index], data_types[field_index]);
}
return result;
@ -208,11 +181,6 @@ DataTypePtr IRowSchemaReader::getDefaultType(size_t column) const
return nullptr;
}
void IRowSchemaReader::transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type, size_t)
{
transformInferredTypesIfNeeded(type, new_type, format_settings);
}
IRowWithNamesSchemaReader::IRowWithNamesSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_, DataTypePtr default_type_)
: IIRowSchemaReader(in_, format_settings_, default_type_)
{
@ -245,7 +213,6 @@ NamesAndTypesList IRowWithNamesSchemaReader::readSchema()
names_order.push_back(name);
}
auto transform_types_if_needed = [&](DataTypePtr & type, DataTypePtr & new_type){ transformTypesIfNeeded(type, new_type); };
for (rows_read = 1; rows_read < max_rows_to_read; ++rows_read)
{
auto new_names_and_types = readRowAndGetNamesAndDataTypes(eof);
@ -277,7 +244,7 @@ NamesAndTypesList IRowWithNamesSchemaReader::readSchema()
continue;
auto & type = it->second;
chooseResultColumnType(type, new_type, transform_types_if_needed, default_type, name, rows_read);
chooseResultColumnType(*this, type, new_type, default_type, name, rows_read);
}
}
@ -285,20 +252,21 @@ NamesAndTypesList IRowWithNamesSchemaReader::readSchema()
if (names_to_types.empty())
throw Exception(ErrorCodes::EMPTY_DATA_PASSED, "Cannot read rows from the data");
NamesAndTypesList result;
NamesAndTypesList result = getStaticNamesAndTypes();
for (auto & name : names_order)
{
auto & type = names_to_types[name];
/// Check that we could determine the type of this column.
checkResultColumnTypeAndAppend(result, type, name, default_type, rows_read);
/// Don't check/change types from hints.
if (!hints.contains(name))
{
transformFinalTypeIfNeeded(type);
/// Check that we could determine the type of this column.
checkFinalInferredType(type, name, format_settings, default_type, rows_read);
}
result.emplace_back(name, type);
}
return result;
}
void IRowWithNamesSchemaReader::transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type)
{
transformInferredTypesIfNeeded(type, new_type, format_settings);
}
}

View File

@ -9,6 +9,11 @@
namespace DB
{
namespace ErrorCodes
{
extern const int TYPE_MISMATCH;
}
/// Base class for schema inference for the data in some specific format.
/// It reads some data from read buffer and try to determine the schema
/// from read data.
@ -45,10 +50,14 @@ public:
bool needContext() const override { return !hints_str.empty(); }
void setContext(ContextPtr & context) override;
virtual void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type);
protected:
void setMaxRowsToRead(size_t max_rows) override { max_rows_to_read = max_rows; }
size_t getNumRowsRead() const override { return rows_read; }
virtual void transformFinalTypeIfNeeded(DataTypePtr &) {}
size_t max_rows_to_read;
size_t rows_read = 0;
DataTypePtr default_type;
@ -82,7 +91,7 @@ protected:
void setColumnNames(const std::vector<String> & names) { column_names = names; }
virtual void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type, size_t index);
size_t field_index;
private:
DataTypePtr getDefaultType(size_t column) const;
@ -111,7 +120,10 @@ protected:
/// Set eof = true if can't read more data.
virtual NamesAndTypesList readRowAndGetNamesAndDataTypes(bool & eof) = 0;
virtual void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type);
/// Get special static types that have the same name/type for each row.
/// For example, in JSONObjectEachRow format we have static column with
/// type String and name from a settings for object keys.
virtual NamesAndTypesList getStaticNamesAndTypes() { return {}; }
};
/// Base class for schema inference for formats that don't need any data to
@ -125,16 +137,46 @@ public:
virtual ~IExternalSchemaReader() = default;
};
template <class SchemaReader>
void chooseResultColumnType(
SchemaReader & schema_reader,
DataTypePtr & type,
DataTypePtr & new_type,
std::function<void(DataTypePtr &, DataTypePtr &)> transform_types_if_needed,
const DataTypePtr & default_type,
const String & column_name,
size_t row);
size_t row)
{
if (!type)
{
type = new_type;
return;
}
void checkResultColumnTypeAndAppend(
NamesAndTypesList & result, DataTypePtr & type, const String & name, const DataTypePtr & default_type, size_t rows_read);
if (!new_type || type->equals(*new_type))
return;
schema_reader.transformTypesIfNeeded(type, new_type);
if (type->equals(*new_type))
return;
/// If the new type and the previous type for this column are different,
/// we will use default type if we have it or throw an exception.
if (default_type)
type = default_type;
else
{
throw Exception(
ErrorCodes::TYPE_MISMATCH,
"Automatically defined type {} for column '{}' in row {} differs from type defined by previous rows: {}. "
"You can specify the type for this column using setting schema_inference_hints",
type->getName(),
column_name,
row,
new_type->getName());
}
}
void checkFinalInferredType(DataTypePtr & type, const String & name, const FormatSettings & settings, const DataTypePtr & default_type, size_t rows_read);
Strings splitColumnNames(const String & column_names_str);

View File

@ -3,7 +3,7 @@
#if USE_ARROW
#include <Formats/FormatFactory.h>
#include <Formats/ReadSchemaUtils.h>
#include <Formats/SchemaInferenceUtils.h>
#include <IO/ReadBufferFromMemory.h>
#include <IO/WriteHelpers.h>
#include <IO/copyData.h>

View File

@ -772,27 +772,27 @@ DataTypePtr BSONEachRowSchemaReader::getDataTypeFromBSONField(BSONType type, boo
case BSONType::DOUBLE:
{
in.ignore(sizeof(Float64));
return makeNullable(std::make_shared<DataTypeFloat64>());
return std::make_shared<DataTypeFloat64>();
}
case BSONType::BOOL:
{
in.ignore(sizeof(UInt8));
return makeNullable(DataTypeFactory::instance().get("Bool"));
return DataTypeFactory::instance().get("Bool");
}
case BSONType::INT64:
{
in.ignore(sizeof(Int64));
return makeNullable(std::make_shared<DataTypeInt64>());
return std::make_shared<DataTypeInt64>();
}
case BSONType::DATETIME:
{
in.ignore(sizeof(Int64));
return makeNullable(std::make_shared<DataTypeDateTime64>(6, "UTC"));
return std::make_shared<DataTypeDateTime64>(6, "UTC");
}
case BSONType::INT32:
{
in.ignore(sizeof(Int32));
return makeNullable(std::make_shared<DataTypeInt32>());
return std::make_shared<DataTypeInt32>();
}
case BSONType::SYMBOL: [[fallthrough]];
case BSONType::JAVA_SCRIPT_CODE: [[fallthrough]];
@ -802,7 +802,7 @@ DataTypePtr BSONEachRowSchemaReader::getDataTypeFromBSONField(BSONType type, boo
BSONSizeT size;
readBinary(size, in);
in.ignore(size);
return makeNullable(std::make_shared<DataTypeString>());
return std::make_shared<DataTypeString>();
}
case BSONType::DOCUMENT:
{
@ -856,10 +856,10 @@ DataTypePtr BSONEachRowSchemaReader::getDataTypeFromBSONField(BSONType type, boo
{
case BSONBinarySubtype::BINARY_OLD: [[fallthrough]];
case BSONBinarySubtype::BINARY:
return makeNullable(std::make_shared<DataTypeString>());
return std::make_shared<DataTypeString>();
case BSONBinarySubtype::UUID_OLD: [[fallthrough]];
case BSONBinarySubtype::UUID:
return makeNullable(std::make_shared<DataTypeUUID>());
return std::make_shared<DataTypeUUID>();
default:
throw Exception(ErrorCodes::UNKNOWN_TYPE, "BSON binary subtype {} is not supported", getBSONBinarySubtypeName(subtype));
}

View File

@ -274,15 +274,15 @@ void CSVFormatReader::skipPrefixBeforeHeader()
}
CSVSchemaReader::CSVSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, const FormatSettings & format_setting_)
CSVSchemaReader::CSVSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, const FormatSettings & format_settings_)
: FormatWithNamesAndTypesSchemaReader(
in_,
format_setting_,
format_settings_,
with_names_,
with_types_,
&reader,
getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule::CSV))
, reader(in_, format_setting_)
, reader(in_, format_settings_)
{
}
@ -293,7 +293,7 @@ DataTypes CSVSchemaReader::readRowAndGetDataTypes()
return {};
auto fields = reader.readRow();
return determineDataTypesByEscapingRule(fields, reader.getFormatSettings(), FormatSettings::EscapingRule::CSV);
return tryInferDataTypesByEscapingRule(fields, reader.getFormatSettings(), FormatSettings::EscapingRule::CSV);
}

View File

@ -75,7 +75,7 @@ public:
class CSVSchemaReader : public FormatWithNamesAndTypesSchemaReader
{
public:
CSVSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, const FormatSettings & format_setting_);
CSVSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, const FormatSettings & format_settings_);
private:
DataTypes readRowAndGetDataTypes() override;

View File

@ -1,6 +1,7 @@
#include <Processors/Formats/Impl/CustomSeparatedRowInputFormat.h>
#include <Processors/Formats/Impl/TemplateRowInputFormat.h>
#include <Formats/EscapingRuleUtils.h>
#include <Formats/SchemaInferenceUtils.h>
#include <Formats/registerWithNamesAndTypes.h>
#include <IO/Operators.h>
@ -370,12 +371,12 @@ DataTypes CustomSeparatedSchemaReader::readRowAndGetDataTypes()
first_row = false;
auto fields = reader.readRow();
return determineDataTypesByEscapingRule(fields, reader.getFormatSettings(), reader.getEscapingRule());
return tryInferDataTypesByEscapingRule(fields, reader.getFormatSettings(), reader.getEscapingRule(), &json_inference_info);
}
void CustomSeparatedSchemaReader::transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type, size_t)
void CustomSeparatedSchemaReader::transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type)
{
transformInferredTypesIfNeeded(type, new_type, format_settings, reader.getEscapingRule());
transformInferredTypesByEscapingRuleIfNeeded(type, new_type, format_settings, reader.getEscapingRule(), &json_inference_info);
}
void registerInputFormatCustomSeparated(FormatFactory & factory)

View File

@ -2,6 +2,7 @@
#include <Processors/Formats/RowInputFormatWithNamesAndTypes.h>
#include <Formats/ParsedTemplateFormatString.h>
#include <Formats/SchemaInferenceUtils.h>
#include <IO/PeekableReadBuffer.h>
#include <IO/ReadHelpers.h>
@ -100,11 +101,12 @@ public:
private:
DataTypes readRowAndGetDataTypes() override;
void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type, size_t) override;
void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type) override;
PeekableReadBuffer buf;
CustomSeparatedFormatReader reader;
bool first_row = true;
JSONInferenceInfo json_inference_info;
};
}

View File

@ -2,8 +2,11 @@
#include <Processors/Formats/ISchemaReader.h>
#include <Formats/JSONUtils.h>
#include <Formats/EscapingRuleUtils.h>
#include <Formats/SchemaInferenceUtils.h>
#include <Interpreters/parseColumnsListForTableFunction.h>
#include <IO/ReadHelpers.h>
#include <base/find_symbols.h>
#include <Common/logger_useful.h>
namespace DB
{
@ -170,19 +173,25 @@ JSONColumnsSchemaReaderBase::JSONColumnsSchemaReaderBase(
ReadBuffer & in_, const FormatSettings & format_settings_, std::unique_ptr<JSONColumnsReaderBase> reader_)
: ISchemaReader(in_)
, format_settings(format_settings_)
, hints_str(format_settings_.schema_inference_hints)
, reader(std::move(reader_))
, column_names_from_settings(splitColumnNames(format_settings_.column_names_for_schema_inference))
{
}
void JSONColumnsSchemaReaderBase::chooseResulType(DataTypePtr & type, DataTypePtr & new_type, const String & column_name, size_t row) const
void JSONColumnsSchemaReaderBase::setContext(ContextPtr & ctx)
{
auto convert_types_if_needed = [&](DataTypePtr & first, DataTypePtr & second)
ColumnsDescription columns;
if (tryParseColumnsListFromString(hints_str, columns, ctx))
{
DataTypes types = {first, second};
transformInferredJSONTypesIfNeeded(types, format_settings);
};
chooseResultColumnType(type, new_type, convert_types_if_needed, nullptr, column_name, row);
for (const auto & [name, type] : columns.getAll())
hints[name] = type;
}
}
void JSONColumnsSchemaReaderBase::transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type)
{
transformInferredJSONTypesIfNeeded(type, new_type, format_settings, &inference_info);
}
NamesAndTypesList JSONColumnsSchemaReaderBase::readSchema()
@ -220,9 +229,18 @@ NamesAndTypesList JSONColumnsSchemaReaderBase::readSchema()
if (!names_to_types.contains(column_name))
names_order.push_back(column_name);
rows_in_block = 0;
auto column_type = readColumnAndGetDataType(column_name, rows_in_block, format_settings.max_rows_to_read_for_schema_inference - total_rows_read);
chooseResulType(names_to_types[column_name], column_type, column_name, total_rows_read + 1);
if (const auto it = hints.find(column_name); it != hints.end())
{
names_to_types[column_name] = it->second;
}
else
{
rows_in_block = 0;
auto column_type = readColumnAndGetDataType(
column_name, rows_in_block, format_settings.max_rows_to_read_for_schema_inference - total_rows_read);
chooseResultColumnType(*this, names_to_types[column_name], column_type, nullptr, column_name, total_rows_read + 1);
}
++iteration;
}
while (!reader->checkChunkEndOrSkipColumnDelimiter());
@ -237,8 +255,14 @@ NamesAndTypesList JSONColumnsSchemaReaderBase::readSchema()
for (auto & name : names_order)
{
auto & type = names_to_types[name];
/// Check that we could determine the type of this column.
checkResultColumnTypeAndAppend(result, type, name, nullptr, format_settings.max_rows_to_read_for_schema_inference);
/// Don't check/change types from hints.
if (!hints.contains(name))
{
transformJSONTupleToArrayIfPossible(type, format_settings, &inference_info);
/// Check that we could determine the type of this column.
checkFinalInferredType(type, name, format_settings, nullptr, format_settings.max_rows_to_read_for_schema_inference);
}
result.emplace_back(name, type);
}
return result;
@ -262,8 +286,8 @@ DataTypePtr JSONColumnsSchemaReaderBase::readColumnAndGetDataType(const String &
}
readJSONField(field, in);
DataTypePtr field_type = JSONUtils::getDataTypeFromField(field, format_settings);
chooseResulType(column_type, field_type, column_name, rows_read);
DataTypePtr field_type = tryInferDataTypeForSingleJSONField(field, format_settings, &inference_info);
chooseResultColumnType(*this, column_type, field_type, nullptr, column_name, rows_read);
++rows_read;
}
while (!reader->checkColumnEndOrSkipFieldDelimiter());

View File

@ -1,6 +1,7 @@
#pragma once
#include <Formats/FormatSettings.h>
#include <Formats/SchemaInferenceUtils.h>
#include <Processors/Formats/IInputFormat.h>
#include <Processors/Formats/ISchemaReader.h>
@ -76,18 +77,23 @@ class JSONColumnsSchemaReaderBase : public ISchemaReader
public:
JSONColumnsSchemaReaderBase(ReadBuffer & in_, const FormatSettings & format_settings_, std::unique_ptr<JSONColumnsReaderBase> reader_);
void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type);
bool needContext() const override { return !hints_str.empty(); }
void setContext(ContextPtr & ctx) override;
private:
NamesAndTypesList readSchema() override;
/// Read whole column in the block (up to max_rows_to_read rows) and extract the data type.
DataTypePtr readColumnAndGetDataType(const String & column_name, size_t & rows_read, size_t max_rows_to_read);
/// Choose result type for column from two inferred types from different rows.
void chooseResulType(DataTypePtr & type, DataTypePtr & new_type, const String & column_name, size_t row) const;
const FormatSettings format_settings;
String hints_str;
std::unordered_map<String, DataTypePtr> hints;
std::unique_ptr<JSONColumnsReaderBase> reader;
Names column_names_from_settings;
JSONInferenceInfo inference_info;
};
}

View File

@ -7,6 +7,7 @@
#include <Formats/verbosePrintString.h>
#include <Formats/JSONUtils.h>
#include <Formats/EscapingRuleUtils.h>
#include <Formats/SchemaInferenceUtils.h>
#include <Formats/registerWithNamesAndTypes.h>
#include <DataTypes/NestedUtils.h>
#include <DataTypes/Serializations/SerializationNullable.h>
@ -202,12 +203,17 @@ DataTypes JSONCompactEachRowRowSchemaReader::readRowAndGetDataTypes()
if (in.eof())
return {};
return JSONUtils::readRowAndGetDataTypesForJSONCompactEachRow(in, format_settings, reader.yieldStrings());
return JSONUtils::readRowAndGetDataTypesForJSONCompactEachRow(in, format_settings, &inference_info);
}
void JSONCompactEachRowRowSchemaReader::transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type, size_t)
void JSONCompactEachRowRowSchemaReader::transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type)
{
transformInferredJSONTypesIfNeeded(type, new_type, format_settings);
transformInferredJSONTypesIfNeeded(type, new_type, format_settings, &inference_info);
}
void JSONCompactEachRowRowSchemaReader::transformFinalTypeIfNeeded(DataTypePtr & type)
{
transformJSONTupleToArrayIfPossible(type, format_settings, &inference_info);
}
void registerInputFormatJSONCompactEachRow(FormatFactory & factory)

View File

@ -4,6 +4,7 @@
#include <Processors/Formats/RowInputFormatWithNamesAndTypes.h>
#include <Processors/Formats/ISchemaReader.h>
#include <Formats/FormatSettings.h>
#include <Formats/SchemaInferenceUtils.h>
#include <Common/HashTable/HashMap.h>
namespace DB
@ -80,10 +81,12 @@ public:
private:
DataTypes readRowAndGetDataTypes() override;
void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type, size_t) override;
void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type) override;
void transformFinalTypeIfNeeded(DataTypePtr & type) override;
JSONCompactEachRowFormatReader reader;
bool first_row = true;
JSONInferenceInfo inference_info;
};
}

View File

@ -4,6 +4,7 @@
#include <Processors/Formats/Impl/JSONEachRowRowInputFormat.h>
#include <Formats/JSONUtils.h>
#include <Formats/EscapingRuleUtils.h>
#include <Formats/SchemaInferenceUtils.h>
#include <Formats/FormatFactory.h>
#include <DataTypes/NestedUtils.h>
#include <DataTypes/Serializations/SerializationNullable.h>
@ -300,9 +301,8 @@ void JSONEachRowRowInputFormat::readSuffix()
assertEOF(*in);
}
JSONEachRowSchemaReader::JSONEachRowSchemaReader(ReadBuffer & in_, bool json_strings_, const FormatSettings & format_settings_)
JSONEachRowSchemaReader::JSONEachRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_)
: IRowWithNamesSchemaReader(in_, format_settings_)
, json_strings(json_strings_)
{
}
@ -336,12 +336,17 @@ NamesAndTypesList JSONEachRowSchemaReader::readRowAndGetNamesAndDataTypes(bool &
return {};
}
return JSONUtils::readRowAndGetNamesAndDataTypesForJSONEachRow(in, format_settings, json_strings);
return JSONUtils::readRowAndGetNamesAndDataTypesForJSONEachRow(in, format_settings, &inference_info);
}
void JSONEachRowSchemaReader::transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type)
{
transformInferredJSONTypesIfNeeded(type, new_type, format_settings);
transformInferredJSONTypesIfNeeded(type, new_type, format_settings, &inference_info);
}
void JSONEachRowSchemaReader::transformFinalTypeIfNeeded(DataTypePtr & type)
{
transformJSONTupleToArrayIfPossible(type, format_settings, &inference_info);
}
void registerInputFormatJSONEachRow(FormatFactory & factory)
@ -391,11 +396,11 @@ void registerNonTrivialPrefixAndSuffixCheckerJSONEachRow(FormatFactory & factory
void registerJSONEachRowSchemaReader(FormatFactory & factory)
{
auto register_schema_reader = [&](const String & format_name, bool json_strings)
auto register_schema_reader = [&](const String & format_name)
{
factory.registerSchemaReader(format_name, [json_strings](ReadBuffer & buf, const FormatSettings & settings)
factory.registerSchemaReader(format_name, [](ReadBuffer & buf, const FormatSettings & settings)
{
return std::make_unique<JSONEachRowSchemaReader>(buf, json_strings, settings);
return std::make_unique<JSONEachRowSchemaReader>(buf, settings);
});
factory.registerAdditionalInfoForSchemaCacheGetter(format_name, [](const FormatSettings & settings)
{
@ -403,10 +408,10 @@ void registerJSONEachRowSchemaReader(FormatFactory & factory)
});
};
register_schema_reader("JSONEachRow", false);
register_schema_reader("JSONLines", false);
register_schema_reader("NDJSON", false);
register_schema_reader("JSONStringsEachRow", true);
register_schema_reader("JSONEachRow");
register_schema_reader("JSONLines");
register_schema_reader("NDJSON");
register_schema_reader("JSONStringsEachRow");
}
}

View File

@ -4,6 +4,7 @@
#include <Processors/Formats/IRowInputFormat.h>
#include <Processors/Formats/ISchemaReader.h>
#include <Formats/FormatSettings.h>
#include <Formats/SchemaInferenceUtils.h>
#include <Common/HashTable/HashMap.h>
@ -94,15 +95,16 @@ protected:
class JSONEachRowSchemaReader : public IRowWithNamesSchemaReader
{
public:
JSONEachRowSchemaReader(ReadBuffer & in_, bool json_strings, const FormatSettings & format_settings_);
JSONEachRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_);
private:
NamesAndTypesList readRowAndGetNamesAndDataTypes(bool & eof) override;
void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type) override;
void transformFinalTypeIfNeeded(DataTypePtr & type) override;
bool json_strings;
bool first_row = true;
bool data_in_square_brackets = false;
JSONInferenceInfo inference_info;
};
}

View File

@ -2,6 +2,7 @@
#include <Formats/JSONUtils.h>
#include <Formats/FormatFactory.h>
#include <Formats/EscapingRuleUtils.h>
#include <Formats/SchemaInferenceUtils.h>
#include <DataTypes/DataTypeString.h>
namespace DB
@ -85,15 +86,25 @@ NamesAndTypesList JSONObjectEachRowSchemaReader::readRowAndGetNamesAndDataTypes(
JSONUtils::skipComma(in);
JSONUtils::readFieldName(in);
auto names_and_types = JSONUtils::readRowAndGetNamesAndDataTypesForJSONEachRow(in, format_settings, false);
return JSONUtils::readRowAndGetNamesAndDataTypesForJSONEachRow(in, format_settings, &inference_info);
}
NamesAndTypesList JSONObjectEachRowSchemaReader::getStaticNamesAndTypes()
{
if (!format_settings.json_object_each_row.column_for_object_name.empty())
names_and_types.emplace_front(format_settings.json_object_each_row.column_for_object_name, std::make_shared<DataTypeString>());
return names_and_types;
return {{format_settings.json_object_each_row.column_for_object_name, std::make_shared<DataTypeString>()}};
return {};
}
void JSONObjectEachRowSchemaReader::transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type)
{
transformInferredJSONTypesIfNeeded(type, new_type, format_settings);
transformInferredJSONTypesIfNeeded(type, new_type, format_settings, &inference_info);
}
void JSONObjectEachRowSchemaReader::transformFinalTypeIfNeeded(DataTypePtr & type)
{
transformJSONTupleToArrayIfPossible(type, format_settings, &inference_info);
}
void registerInputFormatJSONObjectEachRow(FormatFactory & factory)

View File

@ -4,6 +4,7 @@
#include <Processors/Formats/Impl/JSONEachRowRowInputFormat.h>
#include <Processors/Formats/ISchemaReader.h>
#include <Formats/FormatSettings.h>
#include <Formats/SchemaInferenceUtils.h>
#include <Common/HashTable/HashMap.h>
@ -41,9 +42,12 @@ public:
private:
NamesAndTypesList readRowAndGetNamesAndDataTypes(bool & eof) override;
NamesAndTypesList getStaticNamesAndTypes() override;
void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type) override;
void transformFinalTypeIfNeeded(DataTypePtr & type) override;
bool first_row = true;
JSONInferenceInfo inference_info;
};
std::optional<size_t> getColumnIndexForJSONObjectEachRowObjectName(const Block & header, const FormatSettings & settings);

View File

@ -247,6 +247,14 @@ static void insertNull(IColumn & column, DataTypePtr type)
static void insertUUID(IColumn & column, DataTypePtr type, const char * value, size_t size)
{
auto insert_func = [&](IColumn & column_, DataTypePtr type_)
{
insertUUID(column_, type_, value, size);
};
if (checkAndInsertNullable(column, type, insert_func) || checkAndInsertLowCardinality(column, type, insert_func))
return;
if (!isUUID(type))
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot insert MessagePack UUID into column with type {}.", type->getName());
ReadBufferFromMemory buf(value, size);
@ -470,16 +478,16 @@ DataTypePtr MsgPackSchemaReader::getDataType(const msgpack::object & object)
{
case msgpack::type::object_type::POSITIVE_INTEGER: [[fallthrough]];
case msgpack::type::object_type::NEGATIVE_INTEGER:
return makeNullable(std::make_shared<DataTypeInt64>());
return std::make_shared<DataTypeInt64>();
case msgpack::type::object_type::FLOAT32:
return makeNullable(std::make_shared<DataTypeFloat32>());
return std::make_shared<DataTypeFloat32>();
case msgpack::type::object_type::FLOAT64:
return makeNullable(std::make_shared<DataTypeFloat64>());
return std::make_shared<DataTypeFloat64>();
case msgpack::type::object_type::BOOLEAN:
return makeNullable(std::make_shared<DataTypeUInt8>());
return std::make_shared<DataTypeUInt8>();
case msgpack::type::object_type::BIN: [[fallthrough]];
case msgpack::type::object_type::STR:
return makeNullable(std::make_shared<DataTypeString>());
return std::make_shared<DataTypeString>();
case msgpack::type::object_type::ARRAY:
{
msgpack::object_array object_array = object.via.array;

View File

@ -435,7 +435,7 @@ DataTypes MySQLDumpSchemaReader::readRowAndGetDataTypes()
skipFieldDelimiter(in);
readQuotedField(value, in);
auto type = determineDataTypeByEscapingRule(value, format_settings, FormatSettings::EscapingRule::Quoted);
auto type = tryInferDataTypeByEscapingRule(value, format_settings, FormatSettings::EscapingRule::Quoted);
data_types.push_back(std::move(type));
}
skipEndOfRow(in, table_name);

View File

@ -3,7 +3,7 @@
#if USE_ORC
#include <Formats/FormatFactory.h>
#include <Formats/ReadSchemaUtils.h>
#include <Formats/SchemaInferenceUtils.h>
#include <IO/ReadBufferFromMemory.h>
#include <IO/WriteHelpers.h>
#include <IO/copyData.h>

View File

@ -4,7 +4,7 @@
#if USE_PARQUET
#include <Formats/FormatFactory.h>
#include <Formats/ReadSchemaUtils.h>
#include <Formats/SchemaInferenceUtils.h>
#include <IO/ReadBufferFromMemory.h>
#include <IO/copyData.h>
#include <arrow/api.h>

View File

@ -3,6 +3,7 @@
#include <Processors/Formats/Impl/RegexpRowInputFormat.h>
#include <DataTypes/Serializations/SerializationNullable.h>
#include <Formats/EscapingRuleUtils.h>
#include <Formats/SchemaInferenceUtils.h>
#include <Formats/newLineSegmentationEngine.h>
#include <IO/ReadHelpers.h>
@ -155,15 +156,15 @@ DataTypes RegexpSchemaReader::readRowAndGetDataTypes()
for (size_t i = 0; i != field_extractor.getMatchedFieldsSize(); ++i)
{
String field(field_extractor.getField(i));
data_types.push_back(determineDataTypeByEscapingRule(field, format_settings, format_settings.regexp.escaping_rule));
data_types.push_back(tryInferDataTypeByEscapingRule(field, format_settings, format_settings.regexp.escaping_rule, &json_inference_info));
}
return data_types;
}
void RegexpSchemaReader::transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type, size_t)
void RegexpSchemaReader::transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type)
{
transformInferredTypesIfNeeded(type, new_type, format_settings, format_settings.regexp.escaping_rule);
transformInferredTypesByEscapingRuleIfNeeded(type, new_type, format_settings, format_settings.regexp.escaping_rule, &json_inference_info);
}

View File

@ -5,12 +5,13 @@
#include <string>
#include <vector>
#include <Core/Block.h>
#include <IO/PeekableReadBuffer.h>
#include <Processors/Formats/IRowInputFormat.h>
#include <Processors/Formats/ISchemaReader.h>
#include <Formats/FormatSettings.h>
#include <Formats/FormatFactory.h>
#include <IO/PeekableReadBuffer.h>
#include <Formats/ParsedTemplateFormatString.h>
#include <Formats/SchemaInferenceUtils.h>
namespace DB
@ -81,12 +82,13 @@ public:
private:
DataTypes readRowAndGetDataTypes() override;
void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type, size_t) override;
void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type) override;
using EscapingRule = FormatSettings::EscapingRule;
RegexpFieldExtractor field_extractor;
PeekableReadBuffer buf;
JSONInferenceInfo json_inference_info;
};
}

View File

@ -249,7 +249,7 @@ NamesAndTypesList TSKVSchemaReader::readRowAndGetNamesAndDataTypes(bool & eof)
if (has_value)
{
readEscapedString(value, in);
names_and_types.emplace_back(std::move(name), determineDataTypeByEscapingRule(value, format_settings, FormatSettings::EscapingRule::Escaped));
names_and_types.emplace_back(std::move(name), tryInferDataTypeByEscapingRule(value, format_settings, FormatSettings::EscapingRule::Escaped));
}
else
{

View File

@ -268,7 +268,7 @@ DataTypes TabSeparatedSchemaReader::readRowAndGetDataTypes()
return {};
auto fields = reader.readRow();
return determineDataTypesByEscapingRule(fields, reader.getFormatSettings(), reader.getEscapingRule());
return tryInferDataTypesByEscapingRule(fields, reader.getFormatSettings(), reader.getEscapingRule());
}
void registerInputFormatTabSeparated(FormatFactory & factory)

View File

@ -2,6 +2,7 @@
#include <Formats/FormatFactory.h>
#include <Formats/verbosePrintString.h>
#include <Formats/EscapingRuleUtils.h>
#include <Formats/SchemaInferenceUtils.h>
#include <IO/Operators.h>
#include <DataTypes/DataTypeNothing.h>
#include <DataTypes/Serializations/SerializationNullable.h>
@ -511,16 +512,16 @@ DataTypes TemplateSchemaReader::readRowAndGetDataTypes()
format_reader.skipDelimiter(i);
updateFormatSettingsIfNeeded(row_format.escaping_rules[i], format_settings, row_format, default_csv_delimiter, i);
field = readFieldByEscapingRule(buf, row_format.escaping_rules[i], format_settings);
data_types.push_back(determineDataTypeByEscapingRule(field, format_settings, row_format.escaping_rules[i]));
data_types.push_back(tryInferDataTypeByEscapingRule(field, format_settings, row_format.escaping_rules[i], &json_inference_info));
}
format_reader.skipRowEndDelimiter();
return data_types;
}
void TemplateSchemaReader::transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type, size_t column_idx)
void TemplateSchemaReader::transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type)
{
transformInferredTypesIfNeeded(type, new_type, format_settings, row_format.escaping_rules[column_idx]);
transformInferredTypesByEscapingRuleIfNeeded(type, new_type, format_settings, row_format.escaping_rules[field_index], &json_inference_info);
}
static ParsedTemplateFormatString fillResultSetFormat(const FormatSettings & settings)

View File

@ -5,6 +5,7 @@
#include <Processors/Formats/ISchemaReader.h>
#include <Formats/FormatSettings.h>
#include <Formats/ParsedTemplateFormatString.h>
#include <Formats/SchemaInferenceUtils.h>
#include <IO/ReadHelpers.h>
#include <IO/PeekableReadBuffer.h>
#include <Interpreters/Context.h>
@ -121,13 +122,14 @@ public:
DataTypes readRowAndGetDataTypes() override;
private:
void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type, size_t column_idx) override;
void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type) override;
PeekableReadBuffer buf;
const ParsedTemplateFormatString format;
const ParsedTemplateFormatString row_format;
TemplateFormatReader format_reader;
bool first_row = true;
JSONInferenceInfo json_inference_info;
const char default_csv_delimiter;
};

View File

@ -593,14 +593,14 @@ DataTypes ValuesSchemaReader::readRowAndGetDataTypes()
{
if (!data_types.empty())
{
skipWhitespaceIfAny(buf);
assertChar(',', buf);
skipWhitespaceIfAny(buf);
}
readQuotedField(value, buf);
auto type = determineDataTypeByEscapingRule(value, format_settings, FormatSettings::EscapingRule::Quoted);
auto type = tryInferDataTypeByEscapingRule(value, format_settings, FormatSettings::EscapingRule::Quoted);
data_types.push_back(std::move(type));
skipWhitespaceIfAny(buf);
}
assertChar(')', buf);

View File

@ -0,0 +1,23 @@
<test>
<substitutions>
<substitution>
<name>format</name>
<values>
<value>TabSeparated</value>
<value>CSV</value>
<value>Values</value>
<value>JSONEachRow</value>
<value>JSONCompactEachRow</value>
</values>
</substitution>
</substitutions>
<fill_query>INSERT INTO function file(data.{format}) SELECT WatchID, Title, EventTime, RefererCategories, RefererRegions FROM test.hits LIMIT 25000 SETTINGS engine_file_truncate_on_insert=1</fill_query>
<query>DESC file(data.{format}) SETTINGS schema_inference_use_cache_for_file=0</query>
<drop_query>INSERT INTO FUNCTION file(data.{format}) SELECT * FROM numbers(0) SETTINGS engine_file_truncate_on_insert=1</drop_query>
</test>

View File

@ -24,12 +24,12 @@ fixed_string Nullable(String)
Str: 0 100
Str: 1 200
array Array(Nullable(UInt64))
tuple Tuple(Nullable(UInt64), Nullable(String))
tuple Tuple(`1` Nullable(UInt64), `2` Nullable(String))
map Map(String, Nullable(UInt64))
[0,1] (0,'0') {'0':0}
[1,2] (1,'1') {'1':1}
nested1 Array(Tuple(Array(Nullable(UInt64)), Map(String, Nullable(UInt64))))
nested2 Tuple(Tuple(Array(Array(Nullable(UInt64))), Map(UInt64, Array(Tuple(Nullable(UInt64), Nullable(String))))), Nullable(UInt8))
nested1 Array(Tuple(`1` Array(Nullable(UInt64)), `2` Map(String, Nullable(UInt64))))
nested2 Tuple(`1` Tuple(`1` Array(Array(Nullable(UInt64))), `2` Map(UInt64, Array(Tuple(`1` Nullable(UInt64), `2` Nullable(String))))), `2` Nullable(UInt8))
[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42)
[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42)
ArrowStream
@ -58,12 +58,12 @@ fixed_string Nullable(String)
Str: 0 100
Str: 1 200
array Array(Nullable(UInt64))
tuple Tuple(Nullable(UInt64), Nullable(String))
tuple Tuple(`1` Nullable(UInt64), `2` Nullable(String))
map Map(String, Nullable(UInt64))
[0,1] (0,'0') {'0':0}
[1,2] (1,'1') {'1':1}
nested1 Array(Tuple(Array(Nullable(UInt64)), Map(String, Nullable(UInt64))))
nested2 Tuple(Tuple(Array(Array(Nullable(UInt64))), Map(UInt64, Array(Tuple(Nullable(UInt64), Nullable(String))))), Nullable(UInt8))
nested1 Array(Tuple(`1` Array(Nullable(UInt64)), `2` Map(String, Nullable(UInt64))))
nested2 Tuple(`1` Tuple(`1` Array(Array(Nullable(UInt64))), `2` Map(UInt64, Array(Tuple(`1` Nullable(UInt64), `2` Nullable(String))))), `2` Nullable(UInt8))
[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42)
[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42)
Parquet
@ -92,12 +92,12 @@ fixed_string Nullable(String)
Str: 0 100
Str: 1 200
array Array(Nullable(UInt64))
tuple Tuple(Nullable(UInt64), Nullable(String))
tuple Tuple(`1` Nullable(UInt64), `2` Nullable(String))
map Map(String, Nullable(UInt64))
[0,1] (0,'0') {'0':0}
[1,2] (1,'1') {'1':1}
nested1 Array(Tuple(Array(Nullable(UInt64)), Map(String, Nullable(UInt64))))
nested2 Tuple(Tuple(Array(Array(Nullable(UInt64))), Map(UInt64, Array(Tuple(Nullable(UInt64), Nullable(String))))), Nullable(UInt8))
nested1 Array(Tuple(`1` Array(Nullable(UInt64)), `2` Map(String, Nullable(UInt64))))
nested2 Tuple(`1` Tuple(`1` Array(Array(Nullable(UInt64))), `2` Map(UInt64, Array(Tuple(`1` Nullable(UInt64), `2` Nullable(String))))), `2` Nullable(UInt8))
[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42)
[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42)
ORC
@ -126,12 +126,12 @@ fixed_string Nullable(String)
Str: 0 100
Str: 1 200
array Array(Nullable(Int64))
tuple Tuple(Nullable(Int64), Nullable(String))
tuple Tuple(`1` Nullable(Int64), `2` Nullable(String))
map Map(String, Nullable(Int64))
[0,1] (0,'0') {'0':0}
[1,2] (1,'1') {'1':1}
nested1 Array(Tuple(Array(Nullable(Int64)), Map(String, Nullable(Int64))))
nested2 Tuple(Tuple(Array(Array(Nullable(Int64))), Map(Int64, Array(Tuple(Nullable(Int64), Nullable(String))))), Nullable(Int8))
nested1 Array(Tuple(`1` Array(Nullable(Int64)), `2` Map(String, Nullable(Int64))))
nested2 Tuple(`1` Tuple(`1` Array(Array(Nullable(Int64))), `2` Map(Int64, Array(Tuple(`1` Nullable(Int64), `2` Nullable(String))))), `2` Nullable(Int8))
[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42)
[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42)
Native

View File

@ -1,4 +1,4 @@
5e7084e0-019f-461f-9e70-84e0019f561f
5e7084e0-019f-461f-9e70-84e0019f561f
5e7084e0-019f-461f-9e70-84e0019f561f
5e7084e0-019f-461f-9e70-84e0019f561f UUID
5e7084e0-019f-461f-9e70-84e0019f561f Nullable(UUID)

View File

@ -1,5 +1,5 @@
drop table if exists test;
create table test (x UInt32, y String, d Date) engine=Memory() as select number as x, toString(number) as y, toDate(number) as d from numbers(10);
insert into table function file('data.native.zst') select * from test;
insert into table function file('data.native.zst') select * from test settings engine_file_truncate_on_insert=1;
desc file('data.native.zst');
select * from file('data.native.zst');

View File

@ -2,7 +2,7 @@ Arrow
x Nullable(UInt64)
arr1 Array(Nullable(UInt64))
arr2 Array(Array(Nullable(String)))
arr3 Array(Tuple(Nullable(String), Nullable(UInt64)))
arr3 Array(Tuple(`1` Nullable(String), `2` Nullable(UInt64)))
0 [0,1] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,0)]
\N [NULL,2] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,1)]
2 [2,3] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,2)]
@ -12,7 +12,7 @@ ArrowStream
x Nullable(UInt64)
arr1 Array(Nullable(UInt64))
arr2 Array(Array(Nullable(String)))
arr3 Array(Tuple(Nullable(String), Nullable(UInt64)))
arr3 Array(Tuple(`1` Nullable(String), `2` Nullable(UInt64)))
0 [0,1] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,0)]
\N [NULL,2] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,1)]
2 [2,3] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,2)]
@ -22,7 +22,7 @@ Parquet
x Nullable(UInt64)
arr1 Array(Nullable(UInt64))
arr2 Array(Array(Nullable(String)))
arr3 Array(Tuple(Nullable(String), Nullable(UInt64)))
arr3 Array(Tuple(`1` Nullable(String), `2` Nullable(UInt64)))
0 [0,1] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,0)]
\N [NULL,2] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,1)]
2 [2,3] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,2)]
@ -32,7 +32,7 @@ ORC
x Nullable(Int64)
arr1 Array(Nullable(Int64))
arr2 Array(Array(Nullable(String)))
arr3 Array(Tuple(Nullable(String), Nullable(Int64)))
arr3 Array(Tuple(`1` Nullable(String), `2` Nullable(Int64)))
0 [0,1] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,0)]
\N [NULL,2] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,1)]
2 [2,3] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,2)]

View File

@ -4,7 +4,8 @@ drop table if exists test_02245_s3_nested_parquet2;
set input_format_parquet_import_nested = 1;
create table test_02245_s3_nested_parquet1(a Int64, b Tuple(a Int64, b String)) engine=S3(s3_conn, filename='test_02245_s3_nested_parquet1_{_partition_id}', format='Parquet') partition by a;
insert into test_02245_s3_nested_parquet1 values (1, (2, 'a'));
select a, b.a, b.b from s3(s3_conn, filename='test_02245_s3_nested_parquet1_*', format='Parquet'); -- { serverError 47 }
select a, b.a, b.b from s3(s3_conn, filename='test_02245_s3_nested_parquet1_*', format='Parquet');
1 2 a
create table test_02245_s3_nested_parquet2(a Int64, b Tuple(a Int64, b Tuple(c Int64, d String))) engine=S3(s3_conn, filename='test_02245_s3_nested_parquet2_{_partition_id}', format='Parquet') partition by a;
insert into test_02245_s3_nested_parquet2 values (1, (2, (3, 'a')));
select a, b.a, b.b.c, b.b.d from s3(s3_conn, filename='test_02245_s3_nested_parquet2_*', format='Parquet', structure='a Int64, b Tuple(a Int64, b Tuple(c Int64, d String))');
@ -14,7 +15,8 @@ drop table if exists test_02245_s3_nested_arrow2;
set input_format_arrow_import_nested=1;
create table test_02245_s3_nested_arrow1(a Int64, b Tuple(a Int64, b String)) engine=S3(s3_conn, filename='test_02245_s3_nested_arrow1_{_partition_id}', format='Arrow') partition by a;
insert into test_02245_s3_nested_arrow1 values (1, (2, 'a'));
select a, b.a, b.b from s3(s3_conn, filename='test_02245_s3_nested_arrow1_*', format='Arrow'); -- { serverError 47 }
select a, b.a, b.b from s3(s3_conn, filename='test_02245_s3_nested_arrow1_*', format='Arrow');
1 2 a
create table test_02245_s3_nested_arrow2(a Int64, b Tuple(a Int64, b Tuple(c Int64, d String))) engine=S3(s3_conn, filename='test_02245_s3_nested_arrow2_{_partition_id}', format='Arrow') partition by a;
insert into test_02245_s3_nested_arrow2 values (1, (2, (3, 'a')));
select a, b.a, b.b.c, b.b.d from s3(s3_conn, filename='test_02245_s3_nested_arrow2_*', format='Arrow', structure='a Int64, b Tuple(a Int64, b Tuple(c Int64, d String))');
@ -24,7 +26,8 @@ drop table if exists test_02245_s3_nested_orc2;
set input_format_orc_import_nested=1;
create table test_02245_s3_nested_orc1(a Int64, b Tuple(a Int64, b String)) engine=S3(s3_conn, filename='test_02245_s3_nested_orc1_{_partition_id}', format='ORC') partition by a;
insert into test_02245_s3_nested_orc1 values (1, (2, 'a'));
select a, b.a, b.b from s3(s3_conn, filename='test_02245_s3_nested_orc1_*', format='ORC'); -- { serverError 47 }
select a, b.a, b.b from s3(s3_conn, filename='test_02245_s3_nested_orc1_*', format='ORC');
1 2 a
create table test_02245_s3_nested_orc2(a Int64, b Tuple(a Int64, b Tuple(c Int64, d String))) engine=S3(s3_conn, filename='test_02245_s3_nested_orc2_{_partition_id}', format='ORC') partition by a;
insert into test_02245_s3_nested_orc2 values (1, (2, (3, 'a')));
select a, b.a, b.b.c, b.b.d from s3(s3_conn, filename='test_02245_s3_nested_orc2_*', format='ORC', structure='a Int64, b Tuple(a Int64, b Tuple(c Int64, d String))');

View File

@ -8,7 +8,7 @@ set input_format_parquet_import_nested = 1;
create table test_02245_s3_nested_parquet1(a Int64, b Tuple(a Int64, b String)) engine=S3(s3_conn, filename='test_02245_s3_nested_parquet1_{_partition_id}', format='Parquet') partition by a;
insert into test_02245_s3_nested_parquet1 values (1, (2, 'a'));
select a, b.a, b.b from s3(s3_conn, filename='test_02245_s3_nested_parquet1_*', format='Parquet'); -- { serverError 47 }
select a, b.a, b.b from s3(s3_conn, filename='test_02245_s3_nested_parquet1_*', format='Parquet');
create table test_02245_s3_nested_parquet2(a Int64, b Tuple(a Int64, b Tuple(c Int64, d String))) engine=S3(s3_conn, filename='test_02245_s3_nested_parquet2_{_partition_id}', format='Parquet') partition by a;
insert into test_02245_s3_nested_parquet2 values (1, (2, (3, 'a')));
@ -22,7 +22,7 @@ set input_format_arrow_import_nested=1;
create table test_02245_s3_nested_arrow1(a Int64, b Tuple(a Int64, b String)) engine=S3(s3_conn, filename='test_02245_s3_nested_arrow1_{_partition_id}', format='Arrow') partition by a;
insert into test_02245_s3_nested_arrow1 values (1, (2, 'a'));
select a, b.a, b.b from s3(s3_conn, filename='test_02245_s3_nested_arrow1_*', format='Arrow'); -- { serverError 47 }
select a, b.a, b.b from s3(s3_conn, filename='test_02245_s3_nested_arrow1_*', format='Arrow');
create table test_02245_s3_nested_arrow2(a Int64, b Tuple(a Int64, b Tuple(c Int64, d String))) engine=S3(s3_conn, filename='test_02245_s3_nested_arrow2_{_partition_id}', format='Arrow') partition by a;
insert into test_02245_s3_nested_arrow2 values (1, (2, (3, 'a')));
@ -36,7 +36,7 @@ set input_format_orc_import_nested=1;
create table test_02245_s3_nested_orc1(a Int64, b Tuple(a Int64, b String)) engine=S3(s3_conn, filename='test_02245_s3_nested_orc1_{_partition_id}', format='ORC') partition by a;
insert into test_02245_s3_nested_orc1 values (1, (2, 'a'));
select a, b.a, b.b from s3(s3_conn, filename='test_02245_s3_nested_orc1_*', format='ORC'); -- { serverError 47 }
select a, b.a, b.b from s3(s3_conn, filename='test_02245_s3_nested_orc1_*', format='ORC');
create table test_02245_s3_nested_orc2(a Int64, b Tuple(a Int64, b Tuple(c Int64, d String))) engine=S3(s3_conn, filename='test_02245_s3_nested_orc2_{_partition_id}', format='ORC') partition by a;
insert into test_02245_s3_nested_orc2 values (1, (2, (3, 'a')));

View File

@ -14,5 +14,5 @@ $CLICKHOUSE_CLIENT -q "select * from file('${CLICKHOUSE_TEST_UNIQUE_NAME}_data*.
$CLICKHOUSE_CLIENT -q "insert into function file('${CLICKHOUSE_TEST_UNIQUE_NAME}_data4.jsonl', 'TSV') select 1 as x";
$CLICKHOUSE_CLIENT -q "insert into function file('${CLICKHOUSE_TEST_UNIQUE_NAME}_data1.jsonl', 'TSV') select [1,2,3] as x SETTINGS engine_file_truncate_on_insert = 1";
$CLICKHOUSE_CLIENT -q "select * from file('${CLICKHOUSE_TEST_UNIQUE_NAME}_data*.jsonl') settings schema_inference_use_cache_for_file=0" 2>&1 | grep -F -q "INCORRECT_DATA" && echo "OK" || echo "FAIL";
$CLICKHOUSE_CLIENT -q "select * from file('${CLICKHOUSE_TEST_UNIQUE_NAME}_data*.jsonl') settings schema_inference_use_cache_for_file=0" 2>&1 | grep -F -q "CANNOT_PARSE_INPUT_ASSERTION_FAILED" && echo "OK" || echo "FAIL";

View File

@ -1,7 +1,7 @@
-- Tags: no-fasttest
insert into function file('02268_data.jsonl', 'TSV') select 1;
select * from file('02268_data.jsonl'); --{serverError 117}
select * from file('02268_data.jsonl'); --{serverError CANNOT_PARSE_INPUT_ASSERTION_FAILED}
insert into function file('02268_data.jsonCompactEachRow', 'TSV') select 1;
select * from file('02268_data.jsonCompactEachRow'); --{serverError 117}
select * from file('02268_data.jsonCompactEachRow'); --{serverError CANNOT_PARSE_INPUT_ASSERTION_FAILED}

View File

@ -1,2 +1,2 @@
insert into function file('02269_data', 'RowBinary') select 1;
insert into function file('02269_data', 'RowBinary') select 1 settings engine_file_truncate_on_insert=1;
select * from file('02269_data', 'RowBinary', 'x UInt8');

View File

@ -233,11 +233,11 @@ Schema inference
x Nullable(Int32)
x Nullable(Int64)
x Nullable(Int64)
FAIL
OK
x Array(Nullable(Int32))
x Array(Nullable(Int64))
x Array(Nullable(Int64))
FAIL
OK
OK
OK
OK

View File

@ -164,7 +164,7 @@ $CLICKHOUSE_CLIENT -q "insert into function file(02475_data.bsonEachRow) select
$CLICKHOUSE_CLIENT -q "desc file(02475_data.bsonEachRow)"
$CLICKHOUSE_CLIENT -q "insert into function file(02475_data.bsonEachRow) select number::Int64 as x from numbers(2)"
$CLICKHOUSE_CLIENT -q "desc file(02475_data.bsonEachRow)"
$CLICKHOUSE_CLIENT -q "insert into function file(02475_data.bsonEachRow) select number::UInt64 as x from numbers(2)"
$CLICKHOUSE_CLIENT -q "insert into function file(02475_data.bsonEachRow) select toString(number) as x from numbers(2)"
$CLICKHOUSE_CLIENT -q "desc file(02475_data.bsonEachRow)" 2>&1 | grep -q -F "TYPE_MISMATCH" && echo "OK" || echo "FAIL"
$CLICKHOUSE_CLIENT -q "insert into function file(02475_data.bsonEachRow) select [number::Bool] as x from numbers(2) settings engine_file_truncate_on_insert=1"
@ -174,7 +174,7 @@ $CLICKHOUSE_CLIENT -q "insert into function file(02475_data.bsonEachRow) select
$CLICKHOUSE_CLIENT -q "desc file(02475_data.bsonEachRow)"
$CLICKHOUSE_CLIENT -q "insert into function file(02475_data.bsonEachRow) select [number::Int64] as x from numbers(2)"
$CLICKHOUSE_CLIENT -q "desc file(02475_data.bsonEachRow)"
$CLICKHOUSE_CLIENT -q "insert into function file(02475_data.bsonEachRow) select [number::UInt64] as x from numbers(2)"
$CLICKHOUSE_CLIENT -q "insert into function file(02475_data.bsonEachRow) select [toString(number)] as x from numbers(2)"
$CLICKHOUSE_CLIENT -q "desc file(02475_data.bsonEachRow)" 2>&1 | grep -q -F "TYPE_MISMATCH" && echo "OK" || echo "FAIL"
$CLICKHOUSE_CLIENT -q "insert into function file(02475_data.bsonEachRow) select [] as x from numbers(2) settings engine_file_truncate_on_insert=1"

View File

@ -0,0 +1,48 @@
JSONEachRow
x Array(Nullable(Int64))
x Array(Nullable(Int64))
x Array(Nullable(Int64))
x Array(Nullable(Int64))
x Tuple(Nullable(String), Nullable(Int64))
x Tuple(Nullable(String), Nullable(Int64))
x Map(String, Nullable(Int64))
x Map(String, Nullable(Int64))
x Array(Nullable(Int64))
x Array(Array(Nullable(Int64)))
x Array(Map(String, Nullable(Int64)))
x Array(Array(Nullable(String)))
x Array(Int64)
x Array(Nullable(Int64))
x Array(Int64)
x Array(Nullable(Int64))
JSONCompactEachRow
c1 Array(Nullable(Int64))
c1 Array(Nullable(Int64))
c1 Array(Nullable(Int64))
c1 Array(Nullable(Int64))
c1 Tuple(Nullable(String), Nullable(Int64))
c1 Tuple(Nullable(String), Nullable(Int64))
c1 Map(String, Nullable(Int64))
c1 Map(String, Nullable(Int64))
c1 Array(Nullable(Int64))
c1 Array(Array(Nullable(Int64)))
c1 Array(Map(String, Nullable(Int64)))
c1 Array(Array(Nullable(String)))
c1 Array(Int64)
c1 Array(Nullable(Int64))
c1 Array(Int64)
c1 Array(Nullable(Int64))
CSV
c1 Array(Nullable(Int64))
c1 Array(Nullable(Int64))
c1 Array(Nullable(Int64))
c1 Array(Nullable(Int64))
c1 Map(String, Nullable(Int64))
c1 Map(String, Nullable(Int64))
c1 Array(Array(Nullable(Int64)))
c1 Array(Map(String, Nullable(Int64)))
c1 Array(Array(Nullable(String)))
c1 Array(Int64)
c1 Array(Nullable(Int64))
c1 Array(Int64)
c1 Array(Nullable(Int64))

View File

@ -0,0 +1,63 @@
select 'JSONEachRow';
set schema_inference_make_columns_nullable=1;
desc format(JSONEachRow, '{"x" : 1234}, {"x" : "String"}') settings input_format_json_try_infer_numbers_from_strings=1; -- { serverError TYPE_MISMATCH }
desc format(JSONEachRow, '{"x" : [null, 1]}');
desc format(JSONEachRow, '{"x" : [null, 1]}, {"x" : []}');
desc format(JSONEachRow, '{"x" : [null, 1]}, {"x" : [null]}');
desc format(JSONEachRow, '{"x" : [null, 1]}, {"x" : [1, null]}');
desc format(JSONEachRow, '{"x" : [null, 1]}, {"x" : ["abc", 1]}');
desc format(JSONEachRow, '{"x" : [null, 1]}, {"x" : ["abc", null]}');
desc format(JSONEachRow, '{"x" : {}}, {"x" : {"a" : 1}}');
desc format(JSONEachRow, '{"x" : {"a" : null}}, {"x" : {"b" : 1}}');
desc format(JSONEachRow, '{"x" : null}, {"x" : [1, 2]}');
desc format(JSONEachRow, '{"x" : [[], [null], [1, 2, 3]]}');
desc format(JSONEachRow, '{"x" : [{"a" : null}, {"b" : 1}]}');
desc format(JSONEachRow, '{"x" : [["2020-01-01", null, "1234"], ["abcd"]]}');
set schema_inference_make_columns_nullable=0;
desc format(JSONEachRow, '{"x" : [1, 2]}');
desc format(JSONEachRow, '{"x" : [null, 1]}');
desc format(JSONEachRow, '{"x" : [1, 2]}, {"x" : [3]}');
desc format(JSONEachRow, '{"x" : [1, 2]}, {"x" : [null]}');
select 'JSONCompactEachRow';
set schema_inference_make_columns_nullable=1;
desc format(JSONCompactEachRow, '[1234], ["String"]') settings input_format_json_try_infer_numbers_from_strings=1; -- { serverError TYPE_MISMATCH }
desc format(JSONCompactEachRow, '[[null, 1]]');
desc format(JSONCompactEachRow, '[[null, 1]], [[]]');
desc format(JSONCompactEachRow, '[[null, 1]], [[null]]');
desc format(JSONCompactEachRow, '[[null, 1]], [[1, null]]');
desc format(JSONCompactEachRow, '[[null, 1]], [["abc", 1]]');
desc format(JSONCompactEachRow, '[[null, 1]], [["abc", null]]');
desc format(JSONCompactEachRow, '[{}], [{"a" : 1}]');
desc format(JSONCompactEachRow, '[{"a" : null}], [{"b" : 1}]');
desc format(JSONCompactEachRow, '[null], [[1, 2]]');
desc format(JSONCompactEachRow, '[[[], [null], [1, 2, 3]]]');
desc format(JSONCompactEachRow, '[[{"a" : null}, {"b" : 1}]]');
desc format(JSONCompactEachRow, '[[["2020-01-01", null, "1234"], ["abcd"]]]');
set schema_inference_make_columns_nullable=0;
desc format(JSONCompactEachRow, '[[1, 2]]');
desc format(JSONCompactEachRow, '[[null, 1]]');
desc format(JSONCompactEachRow, '[[1, 2]], [[3]]');
desc format(JSONCompactEachRow, '[[1, 2]], [[null]]');
select 'CSV';
set schema_inference_make_columns_nullable=1;
desc format(CSV, '"[null, 1]"');
desc format(CSV, '"[null, 1]"\n"[]"');
desc format(CSV, '"[null, 1]"\n"[null]"');
desc format(CSV, '"[null, 1]"\n"[1, null]"');
desc format(CSV, '"{}"\n"{\'a\' : 1}"');
desc format(CSV, '"{\'a\' : null}"\n"{\'b\' : 1}"');
desc format(CSV, '"[[], [null], [1, 2, 3]]"');
desc format(CSV, '"[{\'a\' : null}, {\'b\' : 1}]"');
desc format(CSV, '"[[\'2020-01-01\', null, \'1234\'], [\'abcd\']]"');
set schema_inference_make_columns_nullable=0;
desc format(CSV, '"[1,2]"');
desc format(CSV, '"[NULL, 1]"');
desc format(CSV, '"[1, 2]"\n"[3]"');
desc format(CSV, '"[1, 2]"\n"[null]"');

View File

@ -0,0 +1 @@
s Nullable(String)

View File

@ -0,0 +1,7 @@
#!/usr/bin/env bash
CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh
$CLICKHOUSE_LOCAL -q "select randomString(100) as s format JSONEachRow" | $CLICKHOUSE_LOCAL -q "desc test" --table='test' --input-format='JSONEachRow'

View File

@ -0,0 +1,2 @@
c1 Array(Nullable(String))
c1 Nullable(String)

View File

@ -0,0 +1,2 @@
desc format(CSV, '"[\'abc\\\'\']"');
desc format(Values, '(\'abc\\\'\')');

View File

@ -0,0 +1,20 @@
x Nullable(Float64)
x Nullable(Float64)
x Nullable(Int64)
x Nullable(Int64)
x Nullable(Float64)
x Nullable(Float64)
x Array(Nullable(Float64))
x Array(Nullable(Float64))
x Array(Nullable(Float64))
x Array(Nullable(Float64))
c1 Nullable(Float64)
c1 Nullable(Float64)
c1 Nullable(Int64)
c1 Nullable(Int64)
c1 Nullable(Float64)
c1 Nullable(Float64)
c1 Array(Nullable(Float64))
c1 Array(Nullable(Float64))
c1 Array(Nullable(Float64))
c1 Array(Nullable(Float64))

View File

@ -0,0 +1,28 @@
#!/usr/bin/env bash
CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh
$CLICKHOUSE_LOCAL -q "desc format(JSONEachRow, '{\"x\" : 1.2}')";
echo '{"x" : 1.2}' | $CLICKHOUSE_LOCAL --input-format='JSONEachRow' --table='test' -q "desc test";
$CLICKHOUSE_LOCAL -q "desc format(JSONEachRow, '{\"x\" : 1}')";
echo '{"x" : 1}' | $CLICKHOUSE_LOCAL --input-format='JSONEachRow' --table='test' -q "desc test";
$CLICKHOUSE_LOCAL -q "desc format(JSONEachRow, '{\"x\" : 1e10}')";
echo '{"x" : 1e10}' | $CLICKHOUSE_LOCAL --input-format='JSONEachRow' --table='test' -q "desc test";
$CLICKHOUSE_LOCAL -q "desc format(JSONEachRow, '{\"x\" : [1, 42.42, 1, 1e10]}')";
echo '{"x" : [1, 42.42, 1, 1e10]}' | $CLICKHOUSE_LOCAL --input-format='JSONEachRow' --table='test' -q "desc test";
$CLICKHOUSE_LOCAL -q "desc format(JSONEachRow, '{\"x\" : [1, 42.42, false]}')";
echo '{"x" : [1, 42.42, false]}' | $CLICKHOUSE_LOCAL --input-format='JSONEachRow' --table='test' -q "desc test";
$CLICKHOUSE_LOCAL -q "desc format(TSV, '1.2')";
echo '1.2' | $CLICKHOUSE_LOCAL --input-format='TSV' --table='test' -q "desc test";
$CLICKHOUSE_LOCAL -q "desc format(TSV, '1')";
echo '1' | $CLICKHOUSE_LOCAL --input-format='TSV' --table='test' -q "desc test";
$CLICKHOUSE_LOCAL -q "desc format(TSV, '1e10')";
echo '1e10' | $CLICKHOUSE_LOCAL --input-format='TSV' --table='test' -q "desc test";
$CLICKHOUSE_LOCAL -q "desc format(TSV, '[1, 42.42, 1, 1e10]')";
echo '[1, 42.42, 1, 1e10]' | $CLICKHOUSE_LOCAL --input-format='TSV' --table='test' -q "desc test";
$CLICKHOUSE_LOCAL -q "desc format(TSV, '[1, 42.42, false]')";
echo '[1, 42.42, false]' | $CLICKHOUSE_LOCAL --input-format='TSV' --table='test' -q "desc test";

View File

@ -0,0 +1,11 @@
#!/usr/bin/env bash
# Tags: no-parallel
CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh
$CLICKHOUSE_LOCAL -q "select repeat('[', 10000) || '1,2,3' || repeat(']', 10000)" > 02501_deep_nested_array.tsv
$CLICKHOUSE_LOCAL -q "desc file(02501_deep_nested_array.tsv)" 2>&1 | grep -q -F "TOO_DEEP_RECURSION" && echo "OK" || echo "FAIL"
rm 02501_deep_nested_array.tsv

View File

@ -0,0 +1,2 @@
desc format(Values, '(\'abc)'); -- { serverError CANNOT_PARSE_INPUT_ASSERTION_FAILED }