Merge pull request #35582 from Avogar/improve-schema-inference

Improve schema inference and add some fixes
This commit is contained in:
Kruglov Pavel 2022-04-08 13:44:52 +02:00 committed by GitHub
commit d7b88d7683
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
59 changed files with 1167 additions and 368 deletions

View File

@ -641,6 +641,12 @@ class IColumn;
M(UInt64, input_format_msgpack_number_of_columns, 0, "The number of columns in inserted MsgPack data. Used for automatic schema inference from data.", 0) \
M(MsgPackUUIDRepresentation, output_format_msgpack_uuid_representation, FormatSettings::MsgPackUUIDRepresentation::EXT, "The way how to output UUID in MsgPack format.", 0) \
M(UInt64, input_format_max_rows_to_read_for_schema_inference, 100, "The maximum rows of data to read for automatic schema inference", 0) \
M(Bool, input_format_csv_use_best_effort_in_schema_inference, true, "Use some tweaks and heuristics to infer schema in CSV format", 0) \
M(Bool, input_format_tsv_use_best_effort_in_schema_inference, true, "Use some tweaks and heuristics to infer schema in TSV format", 0) \
M(Bool, input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference, false, "Allow to skip columns with unsupported types while schema inference for format Parquet", 0) \
M(Bool, input_format_orc_skip_columns_with_unsupported_types_in_schema_inference, false, "Allow to skip columns with unsupported types while schema inference for format ORC", 0) \
M(Bool, input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference, false, "Allow to skip columns with unsupported types while schema inference for format Arrow", 0) \
M(String, column_names_for_schema_inference, "", "The list of column names to use in schema inference for formats without column names. The format: 'column1,column2,column3,...'", 0) \
M(Bool, input_format_json_read_bools_as_numbers, true, "Allow to parse bools as numbers in JSON input formats", 0) \
\
M(DateTimeInputFormat, date_time_input_format, FormatSettings::DateTimeInputFormat::Basic, "Method to read DateTime from text input formats. Possible values: 'basic', 'best_effort' and 'best_effort_us'.", 0) \

View File

@ -45,22 +45,7 @@ DataTypeMap::DataTypeMap(const DataTypePtr & key_type_, const DataTypePtr & valu
void DataTypeMap::assertKeyType() const
{
bool type_error = false;
if (key_type->getTypeId() == TypeIndex::LowCardinality)
{
const auto & low_cardinality_data_type = assert_cast<const DataTypeLowCardinality &>(*key_type);
if (!isStringOrFixedString(*(low_cardinality_data_type.getDictionaryType())))
type_error = true;
}
else if (!key_type->isValueRepresentedByInteger()
&& !isStringOrFixedString(*key_type)
&& !WhichDataType(key_type).isNothing()
&& !WhichDataType(key_type).isUUID())
{
type_error = true;
}
if (type_error)
if (!checkKeyType(key_type))
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Type of Map key must be a type, that can be represented by integer or String or FixedString (possibly LowCardinality) or UUID,"
" but {} given", key_type->getName());
@ -102,6 +87,25 @@ bool DataTypeMap::equals(const IDataType & rhs) const
return nested->equals(*rhs_map.nested);
}
/// Tells whether `key_type` may be used as the key type of a Map.
/// Accepted keys are: types whose value is represented by an integer, String, FixedString,
/// Nothing, UUID, and LowCardinality whose dictionary type is String or FixedString.
/// Returns true if the type is accepted and false otherwise; never throws by itself.
bool DataTypeMap::checkKeyType(DataTypePtr key_type)
{
    /// For LowCardinality only the dictionary type matters.
    if (key_type->getTypeId() == TypeIndex::LowCardinality)
    {
        const auto & lc_type = assert_cast<const DataTypeLowCardinality &>(*key_type);
        return isStringOrFixedString(*(lc_type.getDictionaryType()));
    }

    return key_type->isValueRepresentedByInteger()
        || isStringOrFixedString(*key_type)
        || WhichDataType(key_type).isNothing()
        || WhichDataType(key_type).isUUID();
}
static DataTypePtr create(const ASTPtr & arguments)
{
if (!arguments || arguments->children.size() != 2)

View File

@ -48,6 +48,8 @@ public:
SerializationPtr doGetDefaultSerialization() const override;
static bool checkKeyType(DataTypePtr key_type);
private:
void assertKeyType() const;
};

View File

@ -5,12 +5,17 @@
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeNothing.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/getLeastSupertype.h>
#include <DataTypes/DataTypeMap.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <Poco/JSON/Parser.h>
#include <IO/ReadBufferFromString.h>
#include <Parsers/TokenIterator.h>
#include <Parsers/ExpressionListParsers.h>
#include <Interpreters/evaluateConstantExpression.h>
namespace DB
{
@ -18,7 +23,6 @@ namespace DB
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
extern const int LOGICAL_ERROR;
}
FormatSettings::EscapingRule stringToEscapingRule(const String & escaping_rule)
@ -138,7 +142,8 @@ bool deserializeFieldByEscapingRule(
serialization->deserializeTextRaw(column, buf, format_settings);
break;
default:
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Escaping rule {} is not suitable for deserialization", escapingRuleToString(escaping_rule));
throw Exception(
ErrorCodes::BAD_ARGUMENTS, "Escaping rule {} is not suitable for deserialization", escapingRuleToString(escaping_rule));
}
return read;
}
@ -176,7 +181,8 @@ void serializeFieldByEscapingRule(
}
}
void writeStringByEscapingRule(const String & value, WriteBuffer & out, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings)
void writeStringByEscapingRule(
const String & value, WriteBuffer & out, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings)
{
switch (escaping_rule)
{
@ -249,85 +255,269 @@ String readStringByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule e
return readByEscapingRule<true>(buf, escaping_rule, format_settings);
}
static bool evaluateConstantExpressionFromString(const StringRef & field, DataTypePtr & type, ContextPtr context)
static DataTypePtr determineDataTypeForSingleFieldImpl(ReadBuffer & buf)
{
if (!context)
throw Exception(ErrorCodes::LOGICAL_ERROR, "You must provide context to evaluate constant expression");
if (buf.eof())
return nullptr;
ParserExpression parser;
Expected expected;
Tokens tokens(field.data, field.data + field.size);
IParser::Pos token_iterator(tokens, context->getSettingsRef().max_parser_depth);
ASTPtr ast;
/// FIXME: Our parser cannot parse maps in the form of '{key : value}' that is used in text formats.
bool parsed = parser.parse(token_iterator, ast, expected);
if (!parsed || !token_iterator->isEnd())
return false;
try
/// Array
if (checkChar('[', buf))
{
std::pair<Field, DataTypePtr> result = evaluateConstantExpression(ast, context);
type = generalizeDataType(result.second);
return true;
skipWhitespaceIfAny(buf);
DataTypes nested_types;
bool first = true;
while (!buf.eof() && *buf.position() != ']')
{
if (!first)
{
skipWhitespaceIfAny(buf);
if (!checkChar(',', buf))
return nullptr;
skipWhitespaceIfAny(buf);
}
else
first = false;
auto nested_type = determineDataTypeForSingleFieldImpl(buf);
if (!nested_type)
return nullptr;
nested_types.push_back(nested_type);
}
if (buf.eof())
return nullptr;
++buf.position();
if (nested_types.empty())
return std::make_shared<DataTypeArray>(std::make_shared<DataTypeNothing>());
auto least_supertype = tryGetLeastSupertype(nested_types);
if (!least_supertype)
return nullptr;
return std::make_shared<DataTypeArray>(least_supertype);
}
catch (...)
/// Tuple
if (checkChar('(', buf))
{
return false;
skipWhitespaceIfAny(buf);
DataTypes nested_types;
bool first = true;
while (!buf.eof() && *buf.position() != ')')
{
if (!first)
{
skipWhitespaceIfAny(buf);
if (!checkChar(',', buf))
return nullptr;
skipWhitespaceIfAny(buf);
}
else
first = false;
auto nested_type = determineDataTypeForSingleFieldImpl(buf);
if (!nested_type)
return nullptr;
nested_types.push_back(nested_type);
}
if (buf.eof() || nested_types.empty())
return nullptr;
++buf.position();
return std::make_shared<DataTypeTuple>(nested_types);
}
/// Map
if (checkChar('{', buf))
{
skipWhitespaceIfAny(buf);
DataTypes key_types;
DataTypes value_types;
bool first = true;
while (!buf.eof() && *buf.position() != '}')
{
if (!first)
{
skipWhitespaceIfAny(buf);
if (!checkChar(',', buf))
return nullptr;
skipWhitespaceIfAny(buf);
}
else
first = false;
auto key_type = determineDataTypeForSingleFieldImpl(buf);
if (!key_type)
return nullptr;
key_types.push_back(key_type);
skipWhitespaceIfAny(buf);
if (!checkChar(':', buf))
return nullptr;
skipWhitespaceIfAny(buf);
auto value_type = determineDataTypeForSingleFieldImpl(buf);
if (!value_type)
return nullptr;
value_types.push_back(value_type);
}
if (buf.eof())
return nullptr;
++buf.position();
skipWhitespaceIfAny(buf);
if (key_types.empty())
return std::make_shared<DataTypeMap>(std::make_shared<DataTypeNothing>(), std::make_shared<DataTypeNothing>());
auto key_least_supertype = tryGetLeastSupertype(key_types);
auto value_least_supertype = tryGetLeastSupertype(value_types);
if (!key_least_supertype || !value_least_supertype)
return nullptr;
if (!DataTypeMap::checkKeyType(key_least_supertype))
return nullptr;
return std::make_shared<DataTypeMap>(key_least_supertype, value_least_supertype);
}
/// String
/// Scan a single-quoted literal up to its closing quote. A backslash escapes the
/// character that follows it, so an escaped quote (\') must not terminate the literal.
if (*buf.position() == '\'')
{
    ++buf.position();
    while (!buf.eof())
    {
        char * next_pos = find_first_symbols<'\\', '\''>(buf.position(), buf.buffer().end());
        buf.position() = next_pos;

        /// No quote or backslash in the data available so far; go back to the eof() check.
        if (!buf.hasPendingData())
            continue;

        if (*buf.position() == '\'')
            break;

        if (*buf.position() == '\\')
        {
            /// Skip the backslash together with the escaped character, otherwise
            /// an escaped quote would be taken for the end of the string.
            ++buf.position();
            if (!buf.eof())
                ++buf.position();
        }
    }

    /// The input ended before the closing quote was found: not a valid string literal.
    if (buf.eof())
        return nullptr;

    /// Step over the closing quote.
    ++buf.position();
    return std::make_shared<DataTypeString>();
}
/// Bool
if (checkStringCaseInsensitive("true", buf) || checkStringCaseInsensitive("false", buf))
return DataTypeFactory::instance().get("Bool");
/// Null
if (checkStringCaseInsensitive("NULL", buf))
return std::make_shared<DataTypeNothing>();
/// Number
Float64 tmp;
if (tryReadFloatText(tmp, buf))
return std::make_shared<DataTypeFloat64>();
return nullptr;
}
DataTypePtr determineDataTypeByEscapingRule(const String & field, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule, ContextPtr context)
/// Infers the type of one textual field read from `buf` and makes the result Nullable recursively.
/// Returns nullptr when the type could not be inferred (the recursive helper passes nullptr through).
static DataTypePtr determineDataTypeForSingleField(ReadBuffer & buf)
{
    auto inferred_type = determineDataTypeForSingleFieldImpl(buf);
    return makeNullableRecursivelyAndCheckForNothing(inferred_type);
}
DataTypePtr determineDataTypeByEscapingRule(const String & field, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule)
{
switch (escaping_rule)
{
case FormatSettings::EscapingRule::Quoted:
{
DataTypePtr type;
bool parsed = evaluateConstantExpressionFromString(field, type, context);
return parsed ? type : nullptr;
ReadBufferFromString buf(field);
auto type = determineDataTypeForSingleField(buf);
return buf.eof() ? type : nullptr;
}
case FormatSettings::EscapingRule::JSON:
return getDataTypeFromJSONField(field);
case FormatSettings::EscapingRule::CSV:
{
if (!format_settings.csv.input_format_use_best_effort_in_schema_inference)
return makeNullable(std::make_shared<DataTypeString>());
if (field.empty() || field == format_settings.csv.null_representation)
return nullptr;
if (field == format_settings.bool_false_representation || field == format_settings.bool_true_representation)
return std::make_shared<DataTypeUInt8>();
return DataTypeFactory::instance().get("Nullable(Bool)");
DataTypePtr type;
bool parsed;
if (field[0] == '\'' || field[0] == '"')
if (field.size() > 1 && ((field.front() == '\'' && field.back() == '\'') || (field.front() == '"' && field.back() == '"')))
{
/// Try to evaluate expression inside quotes.
parsed = evaluateConstantExpressionFromString(StringRef(field.data() + 1, field.size() - 2), type, context);
/// If it's a number in quotes we determine it as a string.
if (parsed && type && isNumber(removeNullable(type)))
return makeNullable(std::make_shared<DataTypeString>());
}
else
parsed = evaluateConstantExpressionFromString(field, type, context);
ReadBufferFromString buf(std::string_view(field.data() + 1, field.size() - 2));
/// Try to determine the type of value inside quotes
auto type = determineDataTypeForSingleField(buf);
/// If we couldn't parse an expression, determine it as a string.
return parsed ? type : makeNullable(std::make_shared<DataTypeString>());
if (!type)
return nullptr;
/// If it's a number or tuple in quotes or there is some unread data in buffer, we determine it as a string.
if (isNumber(removeNullable(type)) || isTuple(type) || !buf.eof())
return makeNullable(std::make_shared<DataTypeString>());
return type;
}
/// Case when CSV value is not in quotes. Check if it's a number, and if not, determine it as a string.
ReadBufferFromString buf(field);
Float64 tmp;
if (tryReadFloatText(tmp, buf) && buf.eof())
return makeNullable(std::make_shared<DataTypeFloat64>());
return makeNullable(std::make_shared<DataTypeString>());
}
case FormatSettings::EscapingRule::Raw: [[fallthrough]];
case FormatSettings::EscapingRule::Escaped:
/// TODO: Try to use some heuristics here to determine the type of data.
return field.empty() ? nullptr : makeNullable(std::make_shared<DataTypeString>());
{
if (!format_settings.tsv.input_format_use_best_effort_in_schema_inference)
return makeNullable(std::make_shared<DataTypeString>());
if (field.empty() || field == format_settings.tsv.null_representation)
return nullptr;
if (field == format_settings.bool_false_representation || field == format_settings.bool_true_representation)
return DataTypeFactory::instance().get("Nullable(Bool)");
ReadBufferFromString buf(field);
auto type = determineDataTypeForSingleField(buf);
if (!buf.eof())
return makeNullable(std::make_shared<DataTypeString>());
return type;
}
default:
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot determine the type for value with {} escaping rule", escapingRuleToString(escaping_rule));
}
}
DataTypes determineDataTypesByEscapingRule(const std::vector<String> & fields, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule, ContextPtr context)
DataTypes determineDataTypesByEscapingRule(const std::vector<String> & fields, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule)
{
DataTypes data_types;
data_types.reserve(fields.size());
for (const auto & field : fields)
data_types.push_back(determineDataTypeByEscapingRule(field, format_settings, escaping_rule, context));
data_types.push_back(determineDataTypeByEscapingRule(field, format_settings, escaping_rule));
return data_types;
}
@ -344,4 +534,12 @@ DataTypePtr getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule escap
}
}
/// Returns the default data type for every escaping rule in `escaping_rules`,
/// in the same order, by calling getDefaultDataTypeForEscapingRule for each element.
DataTypes getDefaultDataTypeForEscapingRules(const std::vector<FormatSettings::EscapingRule> & escaping_rules)
{
    DataTypes data_types;
    /// The result size is known in advance, so allocate once.
    data_types.reserve(escaping_rules.size());
    for (const auto & rule : escaping_rules)
        data_types.push_back(getDefaultDataTypeForEscapingRule(rule));
    return data_types;
}
}

View File

@ -43,15 +43,21 @@ String readFieldByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule es
/// - For JSON escaping rule we can use JSON parser to parse a single field
/// and then convert JSON type of this field to ClickHouse type.
/// - For CSV escaping rule we do the following:
/// - If the field is an unquoted string, then we could try to evaluate it
/// as a constant expression, and if it fails, treat it as a String.
/// - If the field is a string in quotes, then we can try to evaluate
/// expression inside quotes as a constant expression, and if it fails or
/// the result is a number (we don't parse numbers in quotes) we treat it as a String.
/// - For TSV and TSVRaw we treat each field as a String (TODO: try to use some tweaks and heuristics here)
DataTypePtr determineDataTypeByEscapingRule(const String & field, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule, ContextPtr context = nullptr);
DataTypes determineDataTypesByEscapingRule(const std::vector<String> & fields, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule, ContextPtr context = nullptr);
/// - If the field is an unquoted string, then we try to parse it as a number,
/// and if we cannot, treat it as a String.
/// - If the field is a string in quotes, then we try to use some
/// tweaks and heuristics to determine the type inside quotes, and if we can't or
/// the result is a number or tuple (we don't parse numbers in quotes and don't
/// support tuples in CSV) we treat it as a String.
/// - If input_format_csv_use_best_effort_in_schema_inference is disabled, we
/// treat everything as a string.
/// - For TSV and TSVRaw we try to use some tweaks and heuristics to determine the type
/// of value if setting input_format_tsv_use_best_effort_in_schema_inference is enabled,
/// otherwise we treat everything as a string.
DataTypePtr determineDataTypeByEscapingRule(const String & field, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule);
DataTypes determineDataTypesByEscapingRule(const std::vector<String> & fields, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule);
DataTypePtr getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule escaping_rule);
DataTypes getDefaultDataTypeForEscapingRules(const std::vector<FormatSettings::EscapingRule> & escaping_rules);
}

View File

@ -65,6 +65,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
format_settings.csv.input_format_enum_as_number = settings.input_format_csv_enum_as_number;
format_settings.csv.null_representation = settings.format_csv_null_representation;
format_settings.csv.input_format_arrays_as_nested_csv = settings.input_format_csv_arrays_as_nested_csv;
format_settings.csv.input_format_use_best_effort_in_schema_inference = settings.input_format_csv_use_best_effort_in_schema_inference;
format_settings.hive_text.fields_delimiter = settings.input_format_hive_text_fields_delimiter;
format_settings.hive_text.collection_items_delimiter = settings.input_format_hive_text_collection_items_delimiter;
format_settings.hive_text.map_keys_delimiter = settings.input_format_hive_text_map_keys_delimiter;
@ -97,6 +98,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
format_settings.parquet.import_nested = settings.input_format_parquet_import_nested;
format_settings.parquet.case_insensitive_column_matching = settings.input_format_parquet_case_insensitive_column_matching;
format_settings.parquet.allow_missing_columns = settings.input_format_parquet_allow_missing_columns;
format_settings.parquet.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference;
format_settings.pretty.charset = settings.output_format_pretty_grid_charset.toString() == "ASCII" ? FormatSettings::Pretty::Charset::ASCII : FormatSettings::Pretty::Charset::UTF8;
format_settings.pretty.color = settings.output_format_pretty_color;
format_settings.pretty.max_column_pad_width = settings.output_format_pretty_max_column_pad_width;
@ -117,6 +119,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
format_settings.tsv.empty_as_default = settings.input_format_tsv_empty_as_default;
format_settings.tsv.input_format_enum_as_number = settings.input_format_tsv_enum_as_number;
format_settings.tsv.null_representation = settings.format_tsv_null_representation;
format_settings.tsv.input_format_use_best_effort_in_schema_inference = settings.input_format_tsv_use_best_effort_in_schema_inference;
format_settings.values.accurate_types_of_literals = settings.input_format_values_accurate_types_of_literals;
format_settings.values.deduce_templates_of_expressions = settings.input_format_values_deduce_templates_of_expressions;
format_settings.values.interpret_expressions = settings.input_format_values_interpret_expressions;
@ -126,10 +129,17 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
format_settings.arrow.low_cardinality_as_dictionary = settings.output_format_arrow_low_cardinality_as_dictionary;
format_settings.arrow.import_nested = settings.input_format_arrow_import_nested;
format_settings.arrow.allow_missing_columns = settings.input_format_arrow_allow_missing_columns;
format_settings.arrow.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference;
format_settings.orc.import_nested = settings.input_format_orc_import_nested;
format_settings.orc.allow_missing_columns = settings.input_format_orc_allow_missing_columns;
format_settings.orc.row_batch_size = settings.input_format_orc_row_batch_size;
format_settings.orc.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_orc_skip_columns_with_unsupported_types_in_schema_inference;
format_settings.arrow.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference;
format_settings.arrow.case_insensitive_column_matching = settings.input_format_arrow_case_insensitive_column_matching;
format_settings.orc.import_nested = settings.input_format_orc_import_nested;
format_settings.orc.allow_missing_columns = settings.input_format_orc_allow_missing_columns;
format_settings.orc.row_batch_size = settings.input_format_orc_row_batch_size;
format_settings.orc.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_orc_skip_columns_with_unsupported_types_in_schema_inference;
format_settings.orc.case_insensitive_column_matching = settings.input_format_orc_case_insensitive_column_matching;
format_settings.defaults_for_omitted_fields = settings.input_format_defaults_for_omitted_fields;
format_settings.capn_proto.enum_comparing_mode = settings.format_capn_proto_enum_comparising_mode;
@ -137,6 +147,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
format_settings.msgpack.number_of_columns = settings.input_format_msgpack_number_of_columns;
format_settings.msgpack.output_uuid_representation = settings.output_format_msgpack_uuid_representation;
format_settings.max_rows_to_read_for_schema_inference = settings.input_format_max_rows_to_read_for_schema_inference;
format_settings.column_names_for_schema_inference = settings.column_names_for_schema_inference;
/// Validate avro_schema_registry_url with RemoteHostFilter when non-empty and in Server context
if (format_settings.schema.is_server)
@ -371,7 +382,7 @@ SchemaReaderPtr FormatFactory::getSchemaReader(
throw Exception("FormatFactory: Format " + name + " doesn't support schema inference.", ErrorCodes::LOGICAL_ERROR);
auto format_settings = _format_settings ? *_format_settings : getFormatSettings(context);
return schema_reader_creator(buf, format_settings, context);
return schema_reader_creator(buf, format_settings);
}
ExternalSchemaReaderPtr FormatFactory::getExternalSchemaReader(

View File

@ -97,7 +97,7 @@ private:
/// The checker should return true if format support append.
using AppendSupportChecker = std::function<bool(const FormatSettings & settings)>;
using SchemaReaderCreator = std::function<SchemaReaderPtr(ReadBuffer & in, const FormatSettings & settings, ContextPtr context)>;
using SchemaReaderCreator = std::function<SchemaReaderPtr(ReadBuffer & in, const FormatSettings & settings)>;
using ExternalSchemaReaderCreator = std::function<ExternalSchemaReaderPtr(const FormatSettings & settings)>;
struct Creators

View File

@ -36,6 +36,8 @@ struct FormatSettings
bool seekable_read = true;
UInt64 max_rows_to_read_for_schema_inference = 100;
String column_names_for_schema_inference = "";
enum class DateTimeInputFormat
{
Basic, /// Default format for fast parsing: YYYY-MM-DD hh:mm:ss (ISO-8601 without fractional part and timezone) or NNNNNNNNNN unix timestamp.
@ -77,6 +79,7 @@ struct FormatSettings
bool low_cardinality_as_dictionary = false;
bool import_nested = false;
bool allow_missing_columns = false;
bool skip_columns_with_unsupported_types_in_schema_inference = false;
bool case_insensitive_column_matching = false;
} arrow;
@ -104,6 +107,7 @@ struct FormatSettings
bool input_format_arrays_as_nested_csv = false;
String null_representation = "\\N";
char tuple_delimiter = ',';
bool input_format_use_best_effort_in_schema_inference = true;
} csv;
struct HiveText
@ -141,6 +145,7 @@ struct FormatSettings
UInt64 row_group_size = 1000000;
bool import_nested = false;
bool allow_missing_columns = false;
bool skip_columns_with_unsupported_types_in_schema_inference = false;
bool case_insensitive_column_matching = false;
std::unordered_set<int> skip_row_groups = {};
} parquet;
@ -209,6 +214,7 @@ struct FormatSettings
bool crlf_end_of_line = false;
String null_representation = "\\N";
bool input_format_enum_as_number = false;
bool input_format_use_best_effort_in_schema_inference = true;
} tsv;
struct
@ -223,6 +229,7 @@ struct FormatSettings
bool import_nested = false;
bool allow_missing_columns = false;
int64_t row_batch_size = 100'000;
bool skip_columns_with_unsupported_types_in_schema_inference = false;
bool case_insensitive_column_matching = false;
std::unordered_set<int> skip_stripes = {};
} orc;

View File

@ -105,8 +105,11 @@ ColumnsDescription readSchemaFromFormat(const String & format_name, const std::o
return readSchemaFromFormat(format_name, format_settings, read_buffer_creator, context, buf_out);
}
DataTypePtr generalizeDataType(DataTypePtr type)
DataTypePtr makeNullableRecursivelyAndCheckForNothing(DataTypePtr type)
{
if (!type)
return nullptr;
WhichDataType which(type);
if (which.isNothing())
@ -115,16 +118,13 @@ DataTypePtr generalizeDataType(DataTypePtr type)
if (which.isNullable())
{
const auto * nullable_type = assert_cast<const DataTypeNullable *>(type.get());
return generalizeDataType(nullable_type->getNestedType());
return makeNullableRecursivelyAndCheckForNothing(nullable_type->getNestedType());
}
if (isNumber(type))
return makeNullable(std::make_shared<DataTypeFloat64>());
if (which.isArray())
{
const auto * array_type = assert_cast<const DataTypeArray *>(type.get());
auto nested_type = generalizeDataType(array_type->getNestedType());
auto nested_type = makeNullableRecursivelyAndCheckForNothing(array_type->getNestedType());
return nested_type ? std::make_shared<DataTypeArray>(nested_type) : nullptr;
}
@ -134,7 +134,7 @@ DataTypePtr generalizeDataType(DataTypePtr type)
DataTypes nested_types;
for (const auto & element : tuple_type->getElements())
{
auto nested_type = generalizeDataType(element);
auto nested_type = makeNullableRecursivelyAndCheckForNothing(element);
if (!nested_type)
return nullptr;
nested_types.push_back(nested_type);
@ -145,19 +145,27 @@ DataTypePtr generalizeDataType(DataTypePtr type)
if (which.isMap())
{
const auto * map_type = assert_cast<const DataTypeMap *>(type.get());
auto key_type = removeNullable(generalizeDataType(map_type->getKeyType()));
auto value_type = generalizeDataType(map_type->getValueType());
return key_type && value_type ? std::make_shared<DataTypeMap>(key_type, value_type) : nullptr;
auto key_type = makeNullableRecursivelyAndCheckForNothing(map_type->getKeyType());
auto value_type = makeNullableRecursivelyAndCheckForNothing(map_type->getValueType());
return key_type && value_type ? std::make_shared<DataTypeMap>(removeNullable(key_type), value_type) : nullptr;
}
if (which.isLowCarnality())
{
const auto * lc_type = assert_cast<const DataTypeLowCardinality *>(type.get());
auto nested_type = generalizeDataType(lc_type->getDictionaryType());
auto nested_type = makeNullableRecursivelyAndCheckForNothing(lc_type->getDictionaryType());
return nested_type ? std::make_shared<DataTypeLowCardinality>(nested_type) : nullptr;
}
return makeNullable(type);
}
/// Builds a list of (name, type) pairs from the columns of `header`, where every type
/// is passed through makeNullableRecursivelyAndCheckForNothing. Column names are kept as is.
NamesAndTypesList getNamesAndRecursivelyNullableTypes(const Block & header)
{
    NamesAndTypesList names_and_types;
    for (auto & [column_name, column_type] : header.getNamesAndTypesList())
    {
        names_and_types.emplace_back(column_name, makeNullableRecursivelyAndCheckForNothing(column_type));
    }
    return names_and_types;
}
}

View File

@ -29,14 +29,16 @@ ColumnsDescription readSchemaFromFormat(
ContextPtr context,
std::unique_ptr<ReadBuffer> & buf_out);
/// Convert type to the most general type:
/// - IntN, UIntN, FloatN, Decimal -> Float64
/// Make type Nullable recursively:
/// - Type -> Nullable(type)
/// - Array(Type) -> Array(Nullable(Type))
/// - Tuple(Type1, ..., TypeN) -> Tuple(Nullable(Type1), ..., Nullable(TypeN))
/// - Map(KeyType, ValueType) -> Map(KeyType, Nullable(ValueType))
/// - LowCardinality(Type) -> LowCardinality(Nullable(Type))
/// If type is Nothing or one of the nested types is Nothing, return nullptr.
DataTypePtr generalizeDataType(DataTypePtr type);
DataTypePtr makeNullableRecursivelyAndCheckForNothing(DataTypePtr type);
/// Call makeNullableRecursivelyAndCheckForNothing for all types
/// in the block and return names and types.
NamesAndTypesList getNamesAndRecursivelyNullableTypes(const Block & header);
}

View File

@ -1366,6 +1366,7 @@ void readQuotedFieldIntoString(String & s, ReadBuffer & buf)
/// - Tuples: (...)
/// - Maps: {...}
/// - NULL
/// - Bool: true/false
/// - Number: integer, float, decimal.
if (*buf.position() == '\'')
@ -1394,6 +1395,16 @@ void readQuotedFieldIntoString(String & s, ReadBuffer & buf)
s.append("NaN");
}
}
else if (checkCharCaseInsensitive('t', buf))
{
assertStringCaseInsensitive("rue", buf);
s.append("true");
}
else if (checkCharCaseInsensitive('f', buf))
{
assertStringCaseInsensitive("alse", buf);
s.append("false");
}
else
{
/// It's an integer, float or decimal. They all can be parsed as float.

View File

@ -2,6 +2,7 @@
#include <Formats/ReadSchemaUtils.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeNullable.h>
#include <boost/algorithm/string.hpp>
namespace DB
{
@ -66,9 +67,32 @@ static void checkTypeAndAppend(NamesAndTypesList & result, DataTypePtr & type, c
result.emplace_back(name, type);
}
IRowSchemaReader::IRowSchemaReader(ReadBuffer & in_, size_t max_rows_to_read_, DataTypePtr default_type_, bool allow_bools_as_numbers_)
: ISchemaReader(in_), max_rows_to_read(max_rows_to_read_), default_type(default_type_), allow_bools_as_numbers(allow_bools_as_numbers_)
/// Creates a reader without default types.
/// The row limit is taken from format_settings.max_rows_to_read_for_schema_inference.
/// If format_settings.column_names_for_schema_inference is set, it is split by ',' into column_names
/// and each name is trimmed (a name that becomes empty after trimming is left unchanged).
IRowSchemaReader::IRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings, bool allow_bools_as_numbers_)
    : ISchemaReader(in_), max_rows_to_read(format_settings.max_rows_to_read_for_schema_inference), allow_bools_as_numbers(allow_bools_as_numbers_)
{
    /// Nothing to do when the user did not provide column names.
    if (format_settings.column_names_for_schema_inference.empty())
        return;

    /// column_names_for_schema_inference is a string in format 'column1,column2,column3,...'
    boost::split(column_names, format_settings.column_names_for_schema_inference, boost::is_any_of(","));
    for (auto & column_name : column_names)
    {
        std::string trimmed_name = boost::trim_copy(column_name);
        if (trimmed_name.empty())
            continue;
        column_name = trimmed_name;
    }
}
/// Creates a reader with one default type shared by all columns.
/// Delegates the common setup to the constructor without default types.
IRowSchemaReader::IRowSchemaReader(
    ReadBuffer & in_,
    const FormatSettings & format_settings,
    DataTypePtr default_type_,
    bool allow_bools_as_numbers_)
    : IRowSchemaReader(in_, format_settings, allow_bools_as_numbers_)
{
    this->default_type = default_type_;
}
/// Creates a reader with per-column default types (indexed by column position).
/// Delegates the common setup to the constructor without default types.
IRowSchemaReader::IRowSchemaReader(
    ReadBuffer & in_,
    const FormatSettings & format_settings,
    const DataTypes & default_types_,
    bool allow_bools_as_numbers_)
    : IRowSchemaReader(in_, format_settings, allow_bools_as_numbers_)
{
    this->default_types = default_types_;
}
NamesAndTypesList IRowSchemaReader::readSchema()
@ -90,7 +114,7 @@ NamesAndTypesList IRowSchemaReader::readSchema()
if (!new_data_types[i])
continue;
chooseResultType(data_types[i], new_data_types[i], allow_bools_as_numbers, default_type, std::to_string(i + 1), row);
chooseResultType(data_types[i], new_data_types[i], allow_bools_as_numbers, getDefaultType(i), std::to_string(i + 1), row);
}
}
@ -115,12 +139,21 @@ NamesAndTypesList IRowSchemaReader::readSchema()
for (size_t i = 0; i != data_types.size(); ++i)
{
/// Check that we could determine the type of this column.
checkTypeAndAppend(result, data_types[i], column_names[i], default_type, max_rows_to_read);
checkTypeAndAppend(result, data_types[i], column_names[i], getDefaultType(i), max_rows_to_read);
}
return result;
}
/// Returns the default type to use for the column at position `column`,
/// or a null pointer when no default is available for it.
DataTypePtr IRowSchemaReader::getDefaultType(size_t column) const
{
    /// A single default type shared by all columns takes precedence over per-column defaults.
    if (default_type)
        return default_type;

    /// No per-column entry exists for this position.
    if (column >= default_types.size())
        return nullptr;

    /// The entry itself may be null, in which case the result is null as well.
    return default_types[column];
}
IRowWithNamesSchemaReader::IRowWithNamesSchemaReader(ReadBuffer & in_, size_t max_rows_to_read_, DataTypePtr default_type_, bool allow_bools_as_numbers_)
: ISchemaReader(in_), max_rows_to_read(max_rows_to_read_), default_type(default_type_), allow_bools_as_numbers(allow_bools_as_numbers_)
{

View File

@ -31,12 +31,17 @@ protected:
/// Base class for schema inference for formats that read data row by row.
/// It reads data row by row (up to max_rows_to_read), determines types of columns
/// for each row and compares them with types from the previous rows. If some column
/// contains values with different types in different rows, the default type will be
/// used for this column or the exception will be thrown (if default type is not set).
/// contains values with different types in different rows, the default type
/// (from argument default_type_) will be used for this column or the exception
/// will be thrown (if default type is not set). If different columns have different
/// default types, you can provide them by default_types_ argument.
class IRowSchemaReader : public ISchemaReader
{
public:
IRowSchemaReader(ReadBuffer & in_, size_t max_rows_to_read_, DataTypePtr default_type_ = nullptr, bool allow_bools_as_numbers_ = false);
IRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings, bool allow_bools_as_numbers_ = false);
IRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings, DataTypePtr default_type_, bool allow_bools_as_numbers_ = false);
IRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings, const DataTypes & default_types_, bool allow_bools_as_numbers_ = false);
NamesAndTypesList readSchema() override;
protected:
@ -49,8 +54,11 @@ protected:
void setColumnNames(const std::vector<String> & names) { column_names = names; }
private:
DataTypePtr getDefaultType(size_t column) const;
size_t max_rows_to_read;
DataTypePtr default_type;
DataTypes default_types;
bool allow_bools_as_numbers;
std::vector<String> column_names;
};

View File

@ -3,6 +3,7 @@
#if USE_ARROW
#include <Formats/FormatFactory.h>
#include <Formats/ReadSchemaUtils.h>
#include <IO/ReadBufferFromMemory.h>
#include <IO/WriteHelpers.h>
#include <IO/copyData.h>
@ -171,8 +172,9 @@ NamesAndTypesList ArrowSchemaReader::readSchema()
schema = createFileReader(in, format_settings, is_stopped)->schema();
}
auto header = ArrowColumnToCHColumn::arrowSchemaToCHHeader(*schema, stream ? "ArrowStream" : "Arrow");
return header.getNamesAndTypesList();
auto header = ArrowColumnToCHColumn::arrowSchemaToCHHeader(
*schema, stream ? "ArrowStream" : "Arrow", format_settings.arrow.skip_columns_with_unsupported_types_in_schema_inference);
return getNamesAndRecursivelyNullableTypes(header);
}
void registerInputFormatArrow(FormatFactory & factory)
@ -202,13 +204,13 @@ void registerArrowSchemaReader(FormatFactory & factory)
{
factory.registerSchemaReader(
"Arrow",
[](ReadBuffer & buf, const FormatSettings & settings, ContextPtr)
[](ReadBuffer & buf, const FormatSettings & settings)
{
return std::make_shared<ArrowSchemaReader>(buf, false, settings);
});
factory.registerSchemaReader(
"ArrowStream",
[](ReadBuffer & buf, const FormatSettings & settings, ContextPtr)
[](ReadBuffer & buf, const FormatSettings & settings)
{
return std::make_shared<ArrowSchemaReader>(buf, true, settings);
});}

View File

@ -15,6 +15,7 @@
#include <DataTypes/DataTypeDate.h>
#include <DataTypes/NestedUtils.h>
#include <DataTypes/DataTypeDateTime64.h>
#include <DataTypes/DataTypeNothing.h>
#include <Common/DateLUTImpl.h>
#include <base/types.h>
#include <Processors/Chunk.h>
@ -26,11 +27,13 @@
#include <Columns/ColumnUnique.h>
#include <Columns/ColumnMap.h>
#include <Columns/ColumnsNumber.h>
#include <Columns/ColumnNothing.h>
#include <Interpreters/castColumn.h>
#include <Common/quoteString.h>
#include <algorithm>
#include <arrow/builder.h>
#include <arrow/array.h>
#include <boost/algorithm/string.hpp>
#include <boost/algorithm/string/case_conv.hpp>
/// UINT16 and UINT32 are processed separately, see comments in readColumnFromArrowColumn.
@ -329,12 +332,17 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
const std::string & format_name,
bool is_nullable,
std::unordered_map<String, std::shared_ptr<ColumnWithTypeAndName>> & dictionary_values,
bool read_ints_as_dates)
bool read_ints_as_dates,
bool allow_null_type,
bool skip_columns_with_unsupported_types,
bool & skipped)
{
if (!is_nullable && arrow_column->null_count() && arrow_column->type()->id() != arrow::Type::LIST
&& arrow_column->type()->id() != arrow::Type::MAP && arrow_column->type()->id() != arrow::Type::STRUCT)
{
auto nested_column = readColumnFromArrowColumn(arrow_column, column_name, format_name, true, dictionary_values, read_ints_as_dates);
auto nested_column = readColumnFromArrowColumn(arrow_column, column_name, format_name, true, dictionary_values, read_ints_as_dates, allow_null_type, skip_columns_with_unsupported_types, skipped);
if (skipped)
return {};
auto nullmap_column = readByteMapFromArrowColumn(arrow_column);
auto nullable_type = std::make_shared<DataTypeNullable>(std::move(nested_column.type));
auto nullable_column = ColumnNullable::create(nested_column.column, nullmap_column);
@ -379,7 +387,10 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
case arrow::Type::MAP:
{
auto arrow_nested_column = getNestedArrowColumn(arrow_column);
auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_values, read_ints_as_dates);
auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_values, read_ints_as_dates, allow_null_type, skip_columns_with_unsupported_types, skipped);
if (skipped)
return {};
auto offsets_column = readOffsetsFromArrowListColumn(arrow_column);
const auto * tuple_column = assert_cast<const ColumnTuple *>(nested_column.column.get());
@ -391,7 +402,9 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
case arrow::Type::LIST:
{
auto arrow_nested_column = getNestedArrowColumn(arrow_column);
auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_values, read_ints_as_dates);
auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_values, read_ints_as_dates, allow_null_type, skip_columns_with_unsupported_types, skipped);
if (skipped)
return {};
auto offsets_column = readOffsetsFromArrowListColumn(arrow_column);
auto array_column = ColumnArray::create(nested_column.column, offsets_column);
auto array_type = std::make_shared<DataTypeArray>(nested_column.type);
@ -416,7 +429,9 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
for (int i = 0; i != arrow_struct_type->num_fields(); ++i)
{
auto nested_arrow_column = std::make_shared<arrow::ChunkedArray>(nested_arrow_columns[i]);
auto element = readColumnFromArrowColumn(nested_arrow_column, arrow_struct_type->field(i)->name(), format_name, false, dictionary_values, read_ints_as_dates);
auto element = readColumnFromArrowColumn(nested_arrow_column, arrow_struct_type->field(i)->name(), format_name, false, dictionary_values, read_ints_as_dates, allow_null_type, skip_columns_with_unsupported_types, skipped);
if (skipped)
return {};
tuple_elements.emplace_back(std::move(element.column));
tuple_types.emplace_back(std::move(element.type));
tuple_names.emplace_back(std::move(element.name));
@ -439,7 +454,7 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
dict_array.emplace_back(dict_chunk.dictionary());
}
auto arrow_dict_column = std::make_shared<arrow::ChunkedArray>(dict_array);
auto dict_column = readColumnFromArrowColumn(arrow_dict_column, column_name, format_name, false, dictionary_values, read_ints_as_dates);
auto dict_column = readColumnFromArrowColumn(arrow_dict_column, column_name, format_name, false, dictionary_values, read_ints_as_dates, allow_null_type, skip_columns_with_unsupported_types, skipped);
/// We should convert read column to ColumnUnique.
auto tmp_lc_column = DataTypeLowCardinality(dict_column.type).createColumn();
@ -469,9 +484,33 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
# undef DISPATCH
// TODO: read JSON as a string?
// TODO: read UUID as a string?
case arrow::Type::NA:
{
if (allow_null_type)
{
auto type = std::make_shared<DataTypeNothing>();
auto column = ColumnNothing::create(arrow_column->length());
return {std::move(column), type, column_name};
}
[[fallthrough]];
}
default:
throw Exception(ErrorCodes::UNKNOWN_TYPE,
"Unsupported {} type '{}' of an input column '{}'.", format_name, arrow_column->type()->name(), column_name);
{
if (skip_columns_with_unsupported_types)
{
skipped = true;
return {};
}
throw Exception(
ErrorCodes::UNKNOWN_TYPE,
"Unsupported {} type '{}' of an input column '{}'. If it happens during schema inference and you want to skip columns with "
"unsupported types, you can enable setting input_format_{}_skip_columns_with_unsupported_types_in_schema_inference",
format_name,
arrow_column->type()->name(),
column_name,
boost::algorithm::to_lower_copy(format_name));
}
}
}
@ -485,8 +524,9 @@ static void checkStatus(const arrow::Status & status, const String & column_name
throw Exception{ErrorCodes::UNKNOWN_EXCEPTION, "Error with a {} column '{}': {}.", format_name, column_name, status.ToString()};
}
Block ArrowColumnToCHColumn::arrowSchemaToCHHeader(
const arrow::Schema & schema, const std::string & format_name, const Block * hint_header, bool ignore_case)
const arrow::Schema & schema, const std::string & format_name, bool skip_columns_with_unsupported_types, const Block * hint_header, bool ignore_case)
{
ColumnsWithTypeAndName sample_columns;
std::unordered_set<String> nested_table_names;
@ -512,9 +552,14 @@ Block ArrowColumnToCHColumn::arrowSchemaToCHHeader(
arrow::ArrayVector array_vector = {arrow_array};
auto arrow_column = std::make_shared<arrow::ChunkedArray>(array_vector);
std::unordered_map<std::string, std::shared_ptr<ColumnWithTypeAndName>> dict_values;
ColumnWithTypeAndName sample_column = readColumnFromArrowColumn(arrow_column, field->name(), format_name, false, dict_values, false);
sample_columns.emplace_back(std::move(sample_column));
bool skipped = false;
bool allow_null_type = false;
if (hint_header && hint_header->has(field->name()) && hint_header->getByName(field->name()).type->isNullable())
allow_null_type = true;
ColumnWithTypeAndName sample_column = readColumnFromArrowColumn(
arrow_column, field->name(), format_name, false, dict_values, false, allow_null_type, skip_columns_with_unsupported_types, skipped);
if (!skipped)
sample_columns.emplace_back(std::move(sample_column));
}
return Block(std::move(sample_columns));
}
@ -559,6 +604,7 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr &
UInt64 num_rows = name_to_column_ptr.begin()->second->length();
columns_list.reserve(header.rows());
std::unordered_map<String, BlockPtr> nested_tables;
bool skipped = false;
for (size_t column_i = 0, columns = header.columns(); column_i < columns; ++column_i)
{
const ColumnWithTypeAndName & header_column = header.getByPosition(column_i);
@ -582,7 +628,7 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr &
{
std::shared_ptr<arrow::ChunkedArray> arrow_column = name_to_column_ptr[search_nested_table_name];
ColumnsWithTypeAndName cols
= {readColumnFromArrowColumn(arrow_column, nested_table_name, format_name, false, dictionary_values, true)};
= {readColumnFromArrowColumn(arrow_column, nested_table_name, format_name, false, dictionary_values, true, true, false, skipped)};
Block block(cols);
nested_tables[search_nested_table_name] = std::make_shared<Block>(Nested::flatten(block));
}
@ -615,7 +661,7 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr &
else
{
auto arrow_column = name_to_column_ptr[search_column_name];
column = readColumnFromArrowColumn(arrow_column, header_column.name, format_name, false, dictionary_values, true);
column = readColumnFromArrowColumn(arrow_column, header_column.name, format_name, false, dictionary_values, true, true, false, skipped);
}
try
@ -642,7 +688,7 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr &
std::vector<size_t> ArrowColumnToCHColumn::getMissingColumns(const arrow::Schema & schema) const
{
std::vector<size_t> missing_columns;
auto block_from_arrow = arrowSchemaToCHHeader(schema, format_name, &header, case_insensitive_matching);
auto block_from_arrow = arrowSchemaToCHHeader(schema, format_name, false, &header, case_insensitive_matching);
auto flatten_block_from_arrow = Nested::flatten(block_from_arrow);
for (size_t i = 0, columns = header.columns(); i < columns; ++i)

View File

@ -38,7 +38,11 @@ public:
/// Transform arrow schema to ClickHouse header. If hint_header is provided,
/// we will skip columns in schema that are not in hint_header.
static Block arrowSchemaToCHHeader(
const arrow::Schema & schema, const std::string & format_name, const Block * hint_header = nullptr, bool ignore_case = false);
const arrow::Schema & schema,
const std::string & format_name,
bool skip_columns_with_unsupported_types = false,
const Block * hint_header = nullptr,
bool ignore_case = false);
private:
const Block & header;

View File

@ -924,12 +924,12 @@ void registerInputFormatAvro(FormatFactory & factory)
void registerAvroSchemaReader(FormatFactory & factory)
{
factory.registerSchemaReader("Avro", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr)
factory.registerSchemaReader("Avro", [](ReadBuffer & buf, const FormatSettings & settings)
{
return std::make_shared<AvroSchemaReader>(buf, false, settings);
});
factory.registerSchemaReader("AvroConfluent", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr)
factory.registerSchemaReader("AvroConfluent", [](ReadBuffer & buf, const FormatSettings & settings)
{
return std::make_shared<AvroSchemaReader>(buf, true, settings);
});

View File

@ -95,7 +95,7 @@ void BinaryFormatReader::skipField(size_t file_column)
}
BinaryWithNamesAndTypesSchemaReader::BinaryWithNamesAndTypesSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_)
: FormatWithNamesAndTypesSchemaReader(in_, 0, true, true, &reader), reader(in_, format_settings_)
: FormatWithNamesAndTypesSchemaReader(in_, format_settings_, true, true, &reader), reader(in_, format_settings_)
{
}
@ -119,7 +119,7 @@ void registerInputFormatRowBinary(FormatFactory & factory)
void registerRowBinaryWithNamesAndTypesSchemaReader(FormatFactory & factory)
{
factory.registerSchemaReader("RowBinaryWithNamesAndTypes", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr)
factory.registerSchemaReader("RowBinaryWithNamesAndTypes", [](ReadBuffer & buf, const FormatSettings & settings)
{
return std::make_shared<BinaryWithNamesAndTypesSchemaReader>(buf, settings);
});

View File

@ -9,7 +9,6 @@
#include <Formats/EscapingRuleUtils.h>
#include <Processors/Formats/Impl/CSVRowInputFormat.h>
#include <DataTypes/Serializations/SerializationNullable.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeString.h>
@ -259,16 +258,15 @@ bool CSVFormatReader::readField(
}
CSVSchemaReader::CSVSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, const FormatSettings & format_setting_, ContextPtr context_)
CSVSchemaReader::CSVSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, const FormatSettings & format_setting_)
: FormatWithNamesAndTypesSchemaReader(
in_,
format_setting_.max_rows_to_read_for_schema_inference,
format_setting_,
with_names_,
with_types_,
&reader,
getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule::CSV))
, reader(in_, format_setting_)
, context(context_)
{
}
@ -279,7 +277,7 @@ DataTypes CSVSchemaReader::readRowAndGetDataTypes()
return {};
auto fields = reader.readRow();
return determineDataTypesByEscapingRule(fields, reader.getFormatSettings(), FormatSettings::EscapingRule::CSV, context);
return determineDataTypesByEscapingRule(fields, reader.getFormatSettings(), FormatSettings::EscapingRule::CSV);
}
@ -382,9 +380,9 @@ void registerCSVSchemaReader(FormatFactory & factory)
{
auto register_func = [&](const String & format_name, bool with_names, bool with_types)
{
factory.registerSchemaReader(format_name, [with_names, with_types](ReadBuffer & buf, const FormatSettings & settings, ContextPtr context)
factory.registerSchemaReader(format_name, [with_names, with_types](ReadBuffer & buf, const FormatSettings & settings)
{
return std::make_shared<CSVSchemaReader>(buf, with_names, with_types, settings, context);
return std::make_shared<CSVSchemaReader>(buf, with_names, with_types, settings);
});
};

View File

@ -74,13 +74,12 @@ public:
class CSVSchemaReader : public FormatWithNamesAndTypesSchemaReader
{
public:
CSVSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, const FormatSettings & format_setting_, ContextPtr context_);
CSVSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, const FormatSettings & format_setting_);
private:
DataTypes readRowAndGetDataTypes() override;
CSVFormatReader reader;
ContextPtr context;
};
}

View File

@ -289,17 +289,16 @@ void CustomSeparatedFormatReader::setReadBuffer(ReadBuffer & in_)
}
CustomSeparatedSchemaReader::CustomSeparatedSchemaReader(
ReadBuffer & in_, bool with_names_, bool with_types_, bool ignore_spaces_, const FormatSettings & format_setting_, ContextPtr context_)
ReadBuffer & in_, bool with_names_, bool with_types_, bool ignore_spaces_, const FormatSettings & format_setting_)
: FormatWithNamesAndTypesSchemaReader(
buf,
format_setting_.max_rows_to_read_for_schema_inference,
format_setting_,
with_names_,
with_types_,
&reader,
getDefaultDataTypeForEscapingRule(format_setting_.custom.escaping_rule))
, buf(in_)
, reader(buf, ignore_spaces_, updateFormatSettings(format_setting_))
, context(context_)
{
}
@ -315,7 +314,7 @@ DataTypes CustomSeparatedSchemaReader::readRowAndGetDataTypes()
first_row = false;
auto fields = reader.readRow();
return determineDataTypesByEscapingRule(fields, reader.getFormatSettings(), reader.getEscapingRule(), context);
return determineDataTypesByEscapingRule(fields, reader.getFormatSettings(), reader.getEscapingRule());
}
void registerInputFormatCustomSeparated(FormatFactory & factory)
@ -343,9 +342,9 @@ void registerCustomSeparatedSchemaReader(FormatFactory & factory)
{
auto register_func = [&](const String & format_name, bool with_names, bool with_types)
{
factory.registerSchemaReader(format_name, [with_names, with_types, ignore_spaces](ReadBuffer & buf, const FormatSettings & settings, ContextPtr context)
factory.registerSchemaReader(format_name, [with_names, with_types, ignore_spaces](ReadBuffer & buf, const FormatSettings & settings)
{
return std::make_shared<CustomSeparatedSchemaReader>(buf, with_names, with_types, ignore_spaces, settings, context);
return std::make_shared<CustomSeparatedSchemaReader>(buf, with_names, with_types, ignore_spaces, settings);
});
};

View File

@ -92,14 +92,13 @@ private:
class CustomSeparatedSchemaReader : public FormatWithNamesAndTypesSchemaReader
{
public:
CustomSeparatedSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, bool ignore_spaces_, const FormatSettings & format_setting_, ContextPtr context_);
CustomSeparatedSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, bool ignore_spaces_, const FormatSettings & format_setting_);
private:
DataTypes readRowAndGetDataTypes() override;
PeekableReadBuffer buf;
CustomSeparatedFormatReader reader;
ContextPtr context;
bool first_row = true;
};

View File

@ -181,15 +181,10 @@ bool JSONCompactEachRowFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer &
return true;
}
JSONCompactEachRowRowSchemaReader::JSONCompactEachRowRowSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, bool yield_strings_, const FormatSettings & format_settings_)
JSONCompactEachRowRowSchemaReader::JSONCompactEachRowRowSchemaReader(
ReadBuffer & in_, bool with_names_, bool with_types_, bool yield_strings_, const FormatSettings & format_settings_)
: FormatWithNamesAndTypesSchemaReader(
in_,
format_settings_.max_rows_to_read_for_schema_inference,
with_names_,
with_types_,
&reader,
nullptr,
format_settings_.json.read_bools_as_numbers)
in_, format_settings_, with_names_, with_types_, &reader, nullptr, format_settings_.json.read_bools_as_numbers)
, reader(in_, yield_strings_, format_settings_)
{
}
@ -239,7 +234,7 @@ void registerJSONCompactEachRowSchemaReader(FormatFactory & factory)
{
auto register_func = [&](const String & format_name, bool with_names, bool with_types)
{
factory.registerSchemaReader(format_name, [=](ReadBuffer & buf, const FormatSettings & settings, ContextPtr)
factory.registerSchemaReader(format_name, [=](ReadBuffer & buf, const FormatSettings & settings)
{
return std::make_shared<JSONCompactEachRowRowSchemaReader>(buf, with_names, with_types, json_strings, settings);
});

View File

@ -387,12 +387,12 @@ void registerNonTrivialPrefixAndSuffixCheckerJSONEachRow(FormatFactory & factory
void registerJSONEachRowSchemaReader(FormatFactory & factory)
{
factory.registerSchemaReader("JSONEachRow", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr)
factory.registerSchemaReader("JSONEachRow", [](ReadBuffer & buf, const FormatSettings & settings)
{
return std::make_unique<JSONEachRowSchemaReader>(buf, false, settings);
});
factory.registerSchemaReader("JSONStringsEachRow", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr)
factory.registerSchemaReader("JSONStringsEachRow", [](ReadBuffer & buf, const FormatSettings & settings)
{
return std::make_unique<JSONEachRowSchemaReader>(buf, true, settings);
});

View File

@ -414,7 +414,7 @@ void MsgPackRowInputFormat::setReadBuffer(ReadBuffer & in_)
}
MsgPackSchemaReader::MsgPackSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_)
: IRowSchemaReader(buf, format_settings_.max_rows_to_read_for_schema_inference), buf(in_), number_of_columns(format_settings_.msgpack.number_of_columns)
: IRowSchemaReader(buf, format_settings_), buf(in_), number_of_columns(format_settings_.msgpack.number_of_columns)
{
if (!number_of_columns)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "You must specify setting input_format_msgpack_number_of_columns to extract table schema from MsgPack data");
@ -535,7 +535,7 @@ void registerInputFormatMsgPack(FormatFactory & factory)
void registerMsgPackSchemaReader(FormatFactory & factory)
{
factory.registerSchemaReader("MsgPack", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr)
factory.registerSchemaReader("MsgPack", [](ReadBuffer & buf, const FormatSettings & settings)
{
return std::make_shared<MsgPackSchemaReader>(buf, settings);
});

View File

@ -133,7 +133,7 @@ void registerOutputFormatNative(FormatFactory & factory)
void registerNativeSchemaReader(FormatFactory & factory)
{
factory.registerSchemaReader("Native", [](ReadBuffer & buf, const FormatSettings &, ContextPtr)
factory.registerSchemaReader("Native", [](ReadBuffer & buf, const FormatSettings &)
{
return std::make_shared<NativeSchemaReader>(buf);
});

View File

@ -3,6 +3,7 @@
#if USE_ORC
#include <Formats/FormatFactory.h>
#include <Formats/ReadSchemaUtils.h>
#include <IO/ReadBufferFromMemory.h>
#include <IO/WriteHelpers.h>
#include <IO/copyData.h>
@ -183,8 +184,9 @@ NamesAndTypesList ORCSchemaReader::readSchema()
std::shared_ptr<arrow::Schema> schema;
std::atomic<int> is_stopped = 0;
getFileReaderAndSchema(in, file_reader, schema, format_settings, is_stopped);
auto header = ArrowColumnToCHColumn::arrowSchemaToCHHeader(*schema, "ORC");
return header.getNamesAndTypesList();
auto header = ArrowColumnToCHColumn::arrowSchemaToCHHeader(
*schema, "ORC", format_settings.orc.skip_columns_with_unsupported_types_in_schema_inference);
return getNamesAndRecursivelyNullableTypes(header);
}
void registerInputFormatORC(FormatFactory & factory)
@ -205,7 +207,7 @@ void registerORCSchemaReader(FormatFactory & factory)
{
factory.registerSchemaReader(
"ORC",
[](ReadBuffer & buf, const FormatSettings & settings, ContextPtr)
[](ReadBuffer & buf, const FormatSettings & settings)
{
return std::make_shared<ORCSchemaReader>(buf, settings);
}

View File

@ -4,6 +4,7 @@
#if USE_PARQUET
#include <Formats/FormatFactory.h>
#include <Formats/ReadSchemaUtils.h>
#include <IO/ReadBufferFromMemory.h>
#include <IO/copyData.h>
#include <arrow/api.h>
@ -176,8 +177,9 @@ NamesAndTypesList ParquetSchemaReader::readSchema()
std::shared_ptr<arrow::Schema> schema;
std::atomic<int> is_stopped = 0;
getFileReaderAndSchema(in, file_reader, schema, format_settings, is_stopped);
auto header = ArrowColumnToCHColumn::arrowSchemaToCHHeader(*schema, "Parquet");
return header.getNamesAndTypesList();
auto header = ArrowColumnToCHColumn::arrowSchemaToCHHeader(
*schema, "Parquet", format_settings.parquet.skip_columns_with_unsupported_types_in_schema_inference);
return getNamesAndRecursivelyNullableTypes(header);
}
void registerInputFormatParquet(FormatFactory & factory)
@ -198,7 +200,7 @@ void registerParquetSchemaReader(FormatFactory & factory)
{
factory.registerSchemaReader(
"Parquet",
[](ReadBuffer & buf, const FormatSettings & settings, ContextPtr)
[](ReadBuffer & buf, const FormatSettings & settings)
{
return std::make_shared<ParquetSchemaReader>(buf, settings);
}

View File

@ -128,15 +128,14 @@ void RegexpRowInputFormat::setReadBuffer(ReadBuffer & in_)
IInputFormat::setReadBuffer(*buf);
}
RegexpSchemaReader::RegexpSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_, ContextPtr context_)
RegexpSchemaReader::RegexpSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_)
: IRowSchemaReader(
buf,
format_settings_.max_rows_to_read_for_schema_inference,
format_settings_,
getDefaultDataTypeForEscapingRule(format_settings_.regexp.escaping_rule))
, format_settings(format_settings_)
, field_extractor(format_settings)
, buf(in_)
, context(context_)
{
}
@ -152,7 +151,7 @@ DataTypes RegexpSchemaReader::readRowAndGetDataTypes()
for (size_t i = 0; i != field_extractor.getMatchedFieldsSize(); ++i)
{
String field(field_extractor.getField(i));
data_types.push_back(determineDataTypeByEscapingRule(field, format_settings, format_settings.regexp.escaping_rule, context));
data_types.push_back(determineDataTypeByEscapingRule(field, format_settings, format_settings.regexp.escaping_rule));
}
return data_types;
@ -203,9 +202,9 @@ void registerFileSegmentationEngineRegexp(FormatFactory & factory)
void registerRegexpSchemaReader(FormatFactory & factory)
{
factory.registerSchemaReader("Regexp", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr context)
factory.registerSchemaReader("Regexp", [](ReadBuffer & buf, const FormatSettings & settings)
{
return std::make_shared<RegexpSchemaReader>(buf, settings, context);
return std::make_shared<RegexpSchemaReader>(buf, settings);
});
}

View File

@ -76,7 +76,7 @@ private:
class RegexpSchemaReader : public IRowSchemaReader
{
public:
RegexpSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings, ContextPtr context_);
RegexpSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings);
private:
DataTypes readRowAndGetDataTypes() override;
@ -85,7 +85,6 @@ private:
const FormatSettings format_settings;
RegexpFieldExtractor field_extractor;
PeekableReadBuffer buf;
ContextPtr context;
};
}

View File

@ -283,7 +283,7 @@ void registerInputFormatTSKV(FormatFactory & factory)
}
void registerTSKVSchemaReader(FormatFactory & factory)
{
factory.registerSchemaReader("TSKV", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr)
factory.registerSchemaReader("TSKV", [](ReadBuffer & buf, const FormatSettings & settings)
{
return std::make_shared<TSKVSchemaReader>(buf, settings);
});

View File

@ -235,7 +235,7 @@ TabSeparatedSchemaReader::TabSeparatedSchemaReader(
ReadBuffer & in_, bool with_names_, bool with_types_, bool is_raw_, const FormatSettings & format_settings_)
: FormatWithNamesAndTypesSchemaReader(
in_,
format_settings_.max_rows_to_read_for_schema_inference,
format_settings_,
with_names_,
with_types_,
&reader,
@ -280,7 +280,7 @@ void registerTSVSchemaReader(FormatFactory & factory)
{
auto register_func = [&](const String & format_name, bool with_names, bool with_types)
{
factory.registerSchemaReader(format_name, [with_names, with_types, is_raw](ReadBuffer & buf, const FormatSettings & settings, ContextPtr)
factory.registerSchemaReader(format_name, [with_names, with_types, is_raw](ReadBuffer & buf, const FormatSettings & settings)
{
return std::make_shared<TabSeparatedSchemaReader>(buf, with_names, with_types, is_raw, settings);
});

View File

@ -453,14 +453,12 @@ TemplateSchemaReader::TemplateSchemaReader(
const ParsedTemplateFormatString & format_,
const ParsedTemplateFormatString & row_format_,
std::string row_between_delimiter,
const FormatSettings & format_settings_,
ContextPtr context_)
: IRowSchemaReader(buf, format_settings_.max_rows_to_read_for_schema_inference)
const FormatSettings & format_settings_)
: IRowSchemaReader(buf, format_settings_, getDefaultDataTypeForEscapingRules(row_format_.escaping_rules))
, buf(in_)
, format(format_)
, row_format(row_format_)
, format_settings(format_settings_)
, context(context_)
, format_reader(buf, ignore_spaces_, format, row_format, row_between_delimiter, format_settings)
{
setColumnNames(row_format.column_names);
@ -489,7 +487,7 @@ DataTypes TemplateSchemaReader::readRowAndGetDataTypes()
format_settings.csv.delimiter = row_format.delimiters[i + 1].empty() ? format_settings.csv.delimiter : row_format.delimiters[i + 1].front();
field = readFieldByEscapingRule(buf, row_format.escaping_rules[i], format_settings);
data_types.push_back(determineDataTypeByEscapingRule(field, format_settings, row_format.escaping_rules[i], context));
data_types.push_back(determineDataTypeByEscapingRule(field, format_settings, row_format.escaping_rules[i]));
}
format_reader.skipRowEndDelimiter();
@ -564,12 +562,12 @@ void registerTemplateSchemaReader(FormatFactory & factory)
{
for (bool ignore_spaces : {false, true})
{
factory.registerSchemaReader(ignore_spaces ? "TemplateIgnoreSpaces" : "Template", [ignore_spaces](ReadBuffer & buf, const FormatSettings & settings, ContextPtr context)
factory.registerSchemaReader(ignore_spaces ? "TemplateIgnoreSpaces" : "Template", [ignore_spaces](ReadBuffer & buf, const FormatSettings & settings)
{
size_t index = 0;
auto idx_getter = [&](const String &) -> std::optional<size_t> { return index++; };
auto row_format = fillRowFormat(settings, idx_getter, false);
return std::make_shared<TemplateSchemaReader>(buf, ignore_spaces, fillResultSetFormat(settings), row_format, settings.template_settings.row_between_delimiter, settings, context);
return std::make_shared<TemplateSchemaReader>(buf, ignore_spaces, fillResultSetFormat(settings), row_format, settings.template_settings.row_between_delimiter, settings);
});
}
}

View File

@ -116,8 +116,7 @@ public:
const ParsedTemplateFormatString & format_,
const ParsedTemplateFormatString & row_format_,
std::string row_between_delimiter,
const FormatSettings & format_settings_,
ContextPtr context_);
const FormatSettings & format_settings_);
DataTypes readRowAndGetDataTypes() override;
@ -126,7 +125,6 @@ private:
const ParsedTemplateFormatString format;
const ParsedTemplateFormatString row_format;
FormatSettings format_settings;
ContextPtr context;
TemplateFormatReader format_reader;
bool first_row = true;
};

View File

@ -6,6 +6,7 @@
#include <Processors/Formats/Impl/ValuesBlockInputFormat.h>
#include <Formats/FormatFactory.h>
#include <Formats/ReadSchemaUtils.h>
#include <Formats/EscapingRuleUtils.h>
#include <Core/Block.h>
#include <base/find_symbols.h>
#include <Common/typeid_cast.h>
@ -571,8 +572,8 @@ void ValuesBlockInputFormat::setReadBuffer(ReadBuffer & in_)
IInputFormat::setReadBuffer(*buf);
}
ValuesSchemaReader::ValuesSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_, ContextPtr context_)
: IRowSchemaReader(buf, format_settings_.max_rows_to_read_for_schema_inference), buf(in_), context(context_)
ValuesSchemaReader::ValuesSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_)
: IRowSchemaReader(buf, format_settings_), buf(in_), format_settings(format_settings_)
{
}
@ -589,38 +590,25 @@ DataTypes ValuesSchemaReader::readRowAndGetDataTypes()
return {};
assertChar('(', buf);
PeekableReadBufferCheckpoint checkpoint(buf);
skipToNextRow(&buf, 0, 1);
buf.makeContinuousMemoryFromCheckpointToPos();
buf.rollbackToCheckpoint();
Tokens tokens(buf.position(), buf.buffer().end());
IParser::Pos token_iterator(tokens, context->getSettingsRef().max_parser_depth);
skipWhitespaceIfAny(buf);
DataTypes data_types;
bool finish = false;
while (!finish)
String value;
while (!buf.eof() && *buf.position() != ')')
{
Expected expected;
ASTPtr ast;
if (!data_types.empty())
{
skipWhitespaceIfAny(buf);
assertChar(',', buf);
skipWhitespaceIfAny(buf);
}
bool parsed = parser.parse(token_iterator, ast, expected);
/// Consider delimiter after value (',' or ')') as part of expression
parsed &= token_iterator->type == TokenType::Comma || token_iterator->type == TokenType::ClosingRoundBracket;
if (!parsed)
throw Exception(ErrorCodes::SYNTAX_ERROR, "Cannot parse expression here: {}, token: {}",
String(buf.position(), std::min(SHOW_CHARS_ON_SYNTAX_ERROR, buf.buffer().end() - buf.position())), String(token_iterator.get().begin, token_iterator.get().end));
std::pair<Field, DataTypePtr> result = evaluateConstantExpression(ast, context);
data_types.push_back(generalizeDataType(result.second));
if (token_iterator->type == TokenType::ClosingRoundBracket)
finish = true;
++token_iterator;
buf.position() = const_cast<char *>(token_iterator->begin);
readQuotedFieldIntoString(value, buf);
auto type = determineDataTypeByEscapingRule(value, format_settings, FormatSettings::EscapingRule::Quoted);
data_types.push_back(std::move(type));
}
assertChar(')', buf);
skipWhitespaceIfAny(buf);
if (!buf.eof() && *buf.position() == ',')
++buf.position();
@ -642,9 +630,9 @@ void registerInputFormatValues(FormatFactory & factory)
void registerValuesSchemaReader(FormatFactory & factory)
{
factory.registerSchemaReader("Values", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr context)
factory.registerSchemaReader("Values", [](ReadBuffer & buf, const FormatSettings & settings)
{
return std::make_shared<ValuesSchemaReader>(buf, settings, context);
return std::make_shared<ValuesSchemaReader>(buf, settings);
});
}

View File

@ -97,13 +97,13 @@ private:
class ValuesSchemaReader : public IRowSchemaReader
{
public:
ValuesSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings, ContextPtr context_);
ValuesSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings);
private:
DataTypes readRowAndGetDataTypes() override;
PeekableReadBuffer buf;
ContextPtr context;
const FormatSettings format_settings;
ParserExpression parser;
bool first_row = true;
};

View File

@ -293,13 +293,13 @@ void RowInputFormatWithNamesAndTypes::setReadBuffer(ReadBuffer & in_)
FormatWithNamesAndTypesSchemaReader::FormatWithNamesAndTypesSchemaReader(
ReadBuffer & in_,
size_t max_rows_to_read_,
const FormatSettings & format_settings,
bool with_names_,
bool with_types_,
FormatWithNamesAndTypesReader * format_reader_,
DataTypePtr default_type_,
bool allow_bools_as_numbers_)
: IRowSchemaReader(in_, max_rows_to_read_, default_type_, allow_bools_as_numbers_), with_names(with_names_), with_types(with_types_), format_reader(format_reader_)
: IRowSchemaReader(in_, format_settings, default_type_, allow_bools_as_numbers_), with_names(with_names_), with_types(with_types_), format_reader(format_reader_)
{
}

View File

@ -124,7 +124,7 @@ class FormatWithNamesAndTypesSchemaReader : public IRowSchemaReader
public:
FormatWithNamesAndTypesSchemaReader(
ReadBuffer & in,
size_t max_rows_to_read_,
const FormatSettings & format_settings,
bool with_names_,
bool with_types_,
FormatWithNamesAndTypesReader * format_reader_,

View File

@ -2,30 +2,6 @@
0 0 0
0 0 0
1 2 3
10 11 12
13 14 15
16 17 18
20 21 22
23 24 25
26 27 28
4 5 6
7 8 9
0 0 0
0 0 0
0 0 0
1 2 3
10 11 12
13 14 15
16 17 18
20 21 22
23 24 25
26 27 28
4 5 6
7 8 9
0 0 0
0 0 0
0 0 0
1 2 3
4 5 6
7 8 9
10 11 12
@ -38,14 +14,26 @@
0 0 0
0 0 0
1 2 3
4 5 6
7 8 9
10 11 12
13 14 15
16 17 18
20 21 22
23 24 25
26 27 28
0 0 0
0 0 0
0 0 0
1 2 3
4 5 6
7 8 9
10 11 12
13 14 15
16 17 18
20 21 22
23 24 25
26 27 28
0 0 0
0 0 0
0 0 0
@ -62,14 +50,26 @@
0 0 0
0 0 0
1 2 3
4 5 6
7 8 9
10 11 12
13 14 15
16 17 18
20 21 22
23 24 25
26 27 28
0 0 0
0 0 0
0 0 0
1 2 3
4 5 6
7 8 9
10 11 12
13 14 15
16 17 18
20 21 22
23 24 25
26 27 28
0 0 0
0 0 0
0 0 0

View File

@ -1,17 +1,17 @@
TSV
c1 Nullable(String)
c1 Nullable(Float64)
c2 Nullable(String)
c3 Nullable(String)
c4 Nullable(String)
42 Some string [1, 2, 3, 4] (1, 2, 3)
42 abcd [] (4, 5, 6)
c3 Array(Nullable(Float64))
c4 Tuple(Nullable(Float64), Nullable(Float64), Nullable(Float64))
42 Some string [1,2,3,4] (1,2,3)
42 abcd [] (4,5,6)
TSVWithNames
number Nullable(String)
number Nullable(Float64)
string Nullable(String)
array Nullable(String)
tuple Nullable(String)
42 Some string [1, 2, 3, 4] (1, 2, 3)
42 abcd [] (4, 5, 6)
array Array(Nullable(Float64))
tuple Tuple(Nullable(Float64), Nullable(Float64), Nullable(Float64))
42 Some string [1,2,3,4] (1,2,3)
42 abcd [] (4,5,6)
CSV
c1 Nullable(Float64)
c2 Nullable(String)
@ -73,13 +73,13 @@ c Array(Nullable(Float64))
\N \N []
\N \N [3]
TSKV
a Nullable(String)
a Nullable(Float64)
b Nullable(String)
c Nullable(String)
1 s1 \N
c Array(Nullable(Float64))
1 s1 []
2 } [2]
\N \N \N
\N \N \N
\N \N []
\N \N []
\N \N [3]
Values
c1 Nullable(Float64)
@ -96,7 +96,7 @@ c5 Tuple(Array(Nullable(Float64)), Array(Tuple(Nullable(Float64), Nullable(Strin
42.42 \N [1,NULL,3] (1,NULL) ([1,2],[(3,'4'),(5,'6')])
\N Some string [10] (1,2) ([],[])
Regexp
c1 Nullable(String)
c1 Nullable(Float64)
c2 Nullable(String)
c3 Nullable(String)
42 Some string 1 [([1, 2, 3], String 1), ([], String 1)]

View File

@ -1,137 +1,137 @@
Arrow
int8 Int8
uint8 UInt8
int16 Int16
uint16 UInt16
int32 Int32
uint32 UInt32
int64 Int64
uint64 UInt64
int8 Nullable(Int8)
uint8 Nullable(UInt8)
int16 Nullable(Int16)
uint16 Nullable(UInt16)
int32 Nullable(Int32)
uint32 Nullable(UInt32)
int64 Nullable(Int64)
uint64 Nullable(UInt64)
0 0 0 0 0 0 0 0
-1 1 -1 1 -1 1 -1 1
float32 Float32
float64 Float64
decimal32 Decimal(9, 5)
decimal64 Decimal(18, 5)
float32 Nullable(Float32)
float64 Nullable(Float64)
decimal32 Nullable(Decimal(9, 5))
decimal64 Nullable(Decimal(18, 5))
0 0 0 0
1.2 0.7692307692307692 3.33333 333.33333
date UInt16
date32 Date32
date Nullable(UInt16)
date32 Nullable(Date32)
0 1970-01-01
1 1970-01-02
str String
fixed_string String
str Nullable(String)
fixed_string Nullable(String)
Str: 0 100
Str: 1 200
array Array(UInt64)
tuple Tuple(`tuple.0` UInt64, `tuple.1` String)
map Map(String, UInt64)
array Array(Nullable(UInt64))
tuple Tuple(Nullable(UInt64), Nullable(String))
map Map(String, Nullable(UInt64))
[0,1] (0,'0') {'0':0}
[1,2] (1,'1') {'1':1}
nested1 Array(Tuple(`nested1.0` Array(UInt64), `nested1.1` Map(String, UInt64)))
nested2 Tuple(`nested2.0` Tuple(`nested2.0.0` Array(Array(UInt64)), `nested2.0.1` Map(UInt64, Array(Tuple(`nested2.0.1.0` UInt64, `nested2.0.1.1` String)))), `nested2.1` UInt8)
nested1 Array(Tuple(Array(Nullable(UInt64)), Map(String, Nullable(UInt64))))
nested2 Tuple(Tuple(Array(Array(Nullable(UInt64))), Map(UInt64, Array(Tuple(Nullable(UInt64), Nullable(String))))), Nullable(UInt8))
[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42)
[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42)
ArrowStream
int8 Int8
uint8 UInt8
int16 Int16
uint16 UInt16
int32 Int32
uint32 UInt32
int64 Int64
uint64 UInt64
int8 Nullable(Int8)
uint8 Nullable(UInt8)
int16 Nullable(Int16)
uint16 Nullable(UInt16)
int32 Nullable(Int32)
uint32 Nullable(UInt32)
int64 Nullable(Int64)
uint64 Nullable(UInt64)
0 0 0 0 0 0 0 0
-1 1 -1 1 -1 1 -1 1
float32 Float32
float64 Float64
decimal32 Decimal(9, 5)
decimal64 Decimal(18, 5)
float32 Nullable(Float32)
float64 Nullable(Float64)
decimal32 Nullable(Decimal(9, 5))
decimal64 Nullable(Decimal(18, 5))
0 0 0 0
1.2 0.7692307692307692 3.33333 333.33333
date UInt16
date32 Date32
date Nullable(UInt16)
date32 Nullable(Date32)
0 1970-01-01
1 1970-01-02
str String
fixed_string String
str Nullable(String)
fixed_string Nullable(String)
Str: 0 100
Str: 1 200
array Array(UInt64)
tuple Tuple(`tuple.0` UInt64, `tuple.1` String)
map Map(String, UInt64)
array Array(Nullable(UInt64))
tuple Tuple(Nullable(UInt64), Nullable(String))
map Map(String, Nullable(UInt64))
[0,1] (0,'0') {'0':0}
[1,2] (1,'1') {'1':1}
nested1 Array(Tuple(`nested1.0` Array(UInt64), `nested1.1` Map(String, UInt64)))
nested2 Tuple(`nested2.0` Tuple(`nested2.0.0` Array(Array(UInt64)), `nested2.0.1` Map(UInt64, Array(Tuple(`nested2.0.1.0` UInt64, `nested2.0.1.1` String)))), `nested2.1` UInt8)
nested1 Array(Tuple(Array(Nullable(UInt64)), Map(String, Nullable(UInt64))))
nested2 Tuple(Tuple(Array(Array(Nullable(UInt64))), Map(UInt64, Array(Tuple(Nullable(UInt64), Nullable(String))))), Nullable(UInt8))
[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42)
[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42)
Parquet
int8 Int8
uint8 UInt8
int16 Int16
uint16 UInt16
int32 Int32
uint32 Int64
int64 Int64
uint64 UInt64
int8 Nullable(Int8)
uint8 Nullable(UInt8)
int16 Nullable(Int16)
uint16 Nullable(UInt16)
int32 Nullable(Int32)
uint32 Nullable(Int64)
int64 Nullable(Int64)
uint64 Nullable(UInt64)
0 0 0 0 0 0 0 0
-1 1 -1 1 -1 1 -1 1
float32 Float32
float64 Float64
decimal32 Decimal(9, 5)
decimal64 Decimal(18, 5)
float32 Nullable(Float32)
float64 Nullable(Float64)
decimal32 Nullable(Decimal(9, 5))
decimal64 Nullable(Decimal(18, 5))
0 0 0 0
1.2 0.7692307692307692 3.33333 333.33333
date UInt16
date32 Date32
date Nullable(UInt16)
date32 Nullable(Date32)
0 1970-01-01
1 1970-01-02
str String
fixed_string String
str Nullable(String)
fixed_string Nullable(String)
Str: 0 100
Str: 1 200
array Array(UInt64)
tuple Tuple(`tuple.0` UInt64, `tuple.1` String)
map Map(String, UInt64)
array Array(Nullable(UInt64))
tuple Tuple(Nullable(UInt64), Nullable(String))
map Map(String, Nullable(UInt64))
[0,1] (0,'0') {'0':0}
[1,2] (1,'1') {'1':1}
nested1 Array(Tuple(`nested1.0` Array(UInt64), `nested1.1` Map(String, UInt64)))
nested2 Tuple(`nested2.0` Tuple(`nested2.0.0` Array(Array(UInt64)), `nested2.0.1` Map(UInt64, Array(Tuple(`nested2.0.1.0` UInt64, `nested2.0.1.1` String)))), `nested2.1` UInt8)
nested1 Array(Tuple(Array(Nullable(UInt64)), Map(String, Nullable(UInt64))))
nested2 Tuple(Tuple(Array(Array(Nullable(UInt64))), Map(UInt64, Array(Tuple(Nullable(UInt64), Nullable(String))))), Nullable(UInt8))
[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42)
[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42)
ORC
int8 Int8
uint8 Int8
int16 Int16
uint16 Int16
int32 Int32
uint32 Int32
int64 Int64
uint64 Int64
int8 Nullable(Int8)
uint8 Nullable(Int8)
int16 Nullable(Int16)
uint16 Nullable(Int16)
int32 Nullable(Int32)
uint32 Nullable(Int32)
int64 Nullable(Int64)
uint64 Nullable(Int64)
0 0 0 0 0 0 0 0
-1 1 -1 1 -1 1 -1 1
float32 Float32
float64 Float64
decimal32 Decimal(9, 5)
decimal64 Decimal(18, 5)
float32 Nullable(Float32)
float64 Nullable(Float64)
decimal32 Nullable(Decimal(9, 5))
decimal64 Nullable(Decimal(18, 5))
0 0 0 0
1.2 0.7692307692307692 3.33333 333.33333
date Date32
date32 Date32
date Nullable(Date32)
date32 Nullable(Date32)
1970-01-01 1970-01-01
1970-01-02 1970-01-02
str String
fixed_string String
str Nullable(String)
fixed_string Nullable(String)
Str: 0 100
Str: 1 200
array Array(Int64)
tuple Tuple(`tuple.0` Int64, `tuple.1` String)
map Map(String, Int64)
array Array(Nullable(Int64))
tuple Tuple(Nullable(Int64), Nullable(String))
map Map(String, Nullable(Int64))
[0,1] (0,'0') {'0':0}
[1,2] (1,'1') {'1':1}
nested1 Array(Tuple(`nested1.0` Array(Int64), `nested1.1` Map(String, Int64)))
nested2 Tuple(`nested2.0` Tuple(`nested2.0.0` Array(Array(Int64)), `nested2.0.1` Map(Int64, Array(Tuple(`nested2.0.1.0` Int64, `nested2.0.1.1` String)))), `nested2.1` Int8)
nested1 Array(Tuple(Array(Nullable(Int64)), Map(String, Nullable(Int64))))
nested2 Tuple(Tuple(Array(Array(Nullable(Int64))), Map(Int64, Array(Tuple(Nullable(Int64), Nullable(String))))), Nullable(Int8))
[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42)
[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42)
Native

View File

@ -1 +1 @@
x LowCardinality(UInt64)
x LowCardinality(Nullable(UInt64))

View File

@ -5,7 +5,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh
$CLICKHOUSE_CLIENT -q "insert into table function file('arrow.dict', 'Arrow', 'x LowCardinality(UInt64)') select number from numbers(10) settings output_format_arrow_low_cardinality_as_dictionary=1"
$CLICKHOUSE_CLIENT -q "insert into table function file('arrow.dict', 'Arrow', 'x LowCardinality(UInt64)') select number from numbers(10) settings output_format_arrow_low_cardinality_as_dictionary=1, engine_file_truncate_on_insert=1"
$CLICKHOUSE_CLIENT -q "desc file('arrow.dict', 'Arrow')"

View File

@ -9,7 +9,7 @@ x Nullable(Float64)
7
8
9
c1 Nullable(String)
c2 Nullable(String)
c3 Nullable(String)
c1 Nullable(Float64)
c2 Nullable(Float64)
c3 Nullable(Float64)
1 2 3

View File

@ -1,8 +1,8 @@
a Nullable(String)
a Nullable(Float64)
b Nullable(String)
c Nullable(String)
1 s1 \N
c Array(Nullable(Float64))
1 s1 []
2 } [2]
\N \N \N
\N \N \N
\N \N []
\N \N []
\N \N [3]

View File

@ -0,0 +1,40 @@
Arrow
x Nullable(UInt64)
arr1 Array(Nullable(UInt64))
arr2 Array(Array(Nullable(String)))
arr3 Array(Tuple(Nullable(String), Nullable(UInt64)))
0 [0,1] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,0)]
\N [NULL,2] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,1)]
2 [2,3] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,2)]
\N [NULL,4] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,3)]
4 [4,5] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,4)]
ArrowStream
x Nullable(UInt64)
arr1 Array(Nullable(UInt64))
arr2 Array(Array(Nullable(String)))
arr3 Array(Tuple(Nullable(String), Nullable(UInt64)))
0 [0,1] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,0)]
\N [NULL,2] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,1)]
2 [2,3] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,2)]
\N [NULL,4] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,3)]
4 [4,5] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,4)]
Parquet
x Nullable(UInt64)
arr1 Array(Nullable(UInt64))
arr2 Array(Array(Nullable(String)))
arr3 Array(Tuple(Nullable(String), Nullable(UInt64)))
0 [0,1] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,0)]
\N [NULL,2] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,1)]
2 [2,3] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,2)]
\N [NULL,4] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,3)]
4 [4,5] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,4)]
ORC
x Nullable(Int64)
arr1 Array(Nullable(Int64))
arr2 Array(Array(Nullable(String)))
arr3 Array(Tuple(Nullable(String), Nullable(Int64)))
0 [0,1] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,0)]
\N [NULL,2] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,1)]
2 [2,3] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,2)]
\N [NULL,4] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,3)]
4 [4,5] [[NULL,'String'],[NULL],[]] [(NULL,NULL),('String',NULL),(NULL,4)]

View File

@ -0,0 +1,21 @@
#!/usr/bin/env bash
# Tags: no-parallel, no-fasttest
# For each of Arrow, ArrowStream, Parquet and ORC: writes five rows containing NULLs in a
# scalar column, in arrays, in nested arrays and in tuples, then prints the schema inferred
# from the file and the data read back from it.
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh
# Query a file that does not exist and cut a directory out of the exception text
# (9th whitespace-separated field with "/nonexist.txt" stripped). Presumably this is the
# server's user_files directory, against which file() resolves relative names.
USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}')
FILE_NAME=test_02242.data
# ":?" aborts the script if the path could not be extracted, so the redirect and the
# final rm below can never target "/test_02242.data".
DATA_FILE=${USER_FILES_PATH:?}/$FILE_NAME
for format in Arrow ArrowStream Parquet ORC
do
    echo "$format"
    # Produce the data file in the current format, overwriting the previous iteration's file.
    $CLICKHOUSE_CLIENT -q "select number % 2 ? NULL : number as x, [number % 2 ? NULL : number, number + 1] as arr1, [[NULL, 'String'], [NULL], []] as arr2, [(NULL, NULL), ('String', NULL), (NULL, number)] as arr3 from numbers(5) format $format" > "$DATA_FILE"
    # No structure argument is passed, so the schema comes from the file itself.
    $CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', '$format')"
    $CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', '$format')"
done
rm "$DATA_FILE"

View File

@ -0,0 +1,28 @@
#!/usr/bin/env bash
# Tags: no-fasttest
# Inserts the Parquet fixture data_parquet_bad_column/metadata_0.parquet into a Memory table
# with an explicitly declared structure and prints the number of rows that arrived.
CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh
$CLICKHOUSE_CLIENT -q "drop table if exists test_02243"
# Every column is Nullable; the column list is given by hand, so no schema inference is
# involved in this test.
$CLICKHOUSE_CLIENT -q "create table test_02243 (image_path Nullable(String),
caption Nullable(String),
NSFW Nullable(String),
similarity Nullable(Float64),
LICENSE Nullable(String),
url Nullable(String),
key Nullable(UInt64),
shard_id Nullable(UInt64),
status Nullable(String),
error_message Nullable(String),
width Nullable(UInt32),
height Nullable(UInt32),
exif Nullable(String),
original_width Nullable(UInt32),
original_height Nullable(UInt32)) engine=Memory"
# The fixture path is quoted so that a checkout directory containing spaces does not get
# word-split into several arguments for cat.
cat "$CUR_DIR"/data_parquet_bad_column/metadata_0.parquet | $CLICKHOUSE_CLIENT --stacktrace -q "insert into test_02243 format Parquet"
$CLICKHOUSE_CLIENT -q "select count() from test_02243"
$CLICKHOUSE_CLIENT -q "drop table test_02243"

View File

@ -0,0 +1,8 @@
x Nullable(String)
y Nullable(Float64)
x Nullable(String)
y Nullable(Float64)
x Nullable(String)
y Nullable(Float64)
x Nullable(String)
y Nullable(Float64)

View File

@ -0,0 +1,14 @@
-- Tags: no-fasttest, no-parallel
-- Checks the column_names_for_schema_inference setting: for each format below, one row
-- ('Hello, world!', 42) is written to the file 'test_02244' and `desc` is then run with
-- the setting 'x,y' and without an explicit structure argument.
-- engine_file_truncate_on_insert=1 makes every insert replace the file written by the
-- previous case, so each `desc` sees only the data of its own format.

-- TSV
insert into function file('test_02244', 'TSV', 'x String, y UInt32') select 'Hello, world!', 42 settings engine_file_truncate_on_insert=1;
desc file('test_02244', 'TSV') settings column_names_for_schema_inference='x,y';
-- CSV
insert into function file('test_02244', 'CSV', 'x String, y UInt32') select 'Hello, world!', 42 settings engine_file_truncate_on_insert=1;
desc file('test_02244', 'CSV') settings column_names_for_schema_inference='x,y';
-- JSONCompactEachRow
insert into function file('test_02244', 'JSONCompactEachRow', 'x String, y UInt32') select 'Hello, world!', 42 settings engine_file_truncate_on_insert=1;
desc file('test_02244', 'JSONCompactEachRow') settings column_names_for_schema_inference='x,y';
-- Values
insert into function file('test_02244', 'Values', 'x String, y UInt32') select 'Hello, world!', 42 settings engine_file_truncate_on_insert=1;
desc file('test_02244', 'Values') settings column_names_for_schema_inference='x,y';

View File

@ -0,0 +1,16 @@
OK
image_path Nullable(String)
caption Nullable(String)
NSFW Nullable(String)
similarity Nullable(Float64)
LICENSE Nullable(String)
url Nullable(String)
key Nullable(Int64)
shard_id Nullable(Int64)
status Nullable(String)
width Nullable(Int64)
height Nullable(Int64)
exif Nullable(String)
original_width Nullable(Int64)
original_height Nullable(Int64)
10

View File

@ -0,0 +1,18 @@
#!/usr/bin/env bash
# Tags: no-fasttest
# Checks input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference on the
# fixture data_parquet_bad_column/metadata_0.parquet: without the setting `desc` must fail
# with CANNOT_EXTRACT_TABLE_STRUCTURE, with the setting both `desc` and `select` must work.
CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh
# Query a file that does not exist and cut a directory out of the exception text
# (9th whitespace-separated field with "/nonexist.txt" stripped). Presumably this is the
# server's user_files directory, against which file() resolves relative names.
USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}')
FILE_NAME=test_02245.parquet
# ":?" aborts the script if the path could not be extracted, so cp and rm below can never
# target "/test_02245.parquet".
DATA_FILE=${USER_FILES_PATH:?}/$FILE_NAME
cp "$CUR_DIR"/data_parquet_bad_column/metadata_0.parquet "$DATA_FILE"
# Only the file name is passed to file(); no format argument is given in any query below.
# The expected error goes to stderr, hence 2>&1 before grep.
$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME')" 2>&1 | grep -qF "CANNOT_EXTRACT_TABLE_STRUCTURE" && echo "OK" || echo "FAIL"
$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME') settings input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference=1"
$CLICKHOUSE_CLIENT -q "select count(*) from file('$FILE_NAME') settings input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference=1"
# Do not leave the copied fixture behind.
rm "$DATA_FILE"

View File

@ -1,21 +1,21 @@
c1 Nullable(String)
c2 Nullable(String)
c3 Nullable(String)
c1 Nullable(String)
c2 Nullable(String)
c3 Nullable(String)
c1 Nullable(Float64)
c2 Nullable(Float64)
c3 Nullable(Float64)
c1 Nullable(Float64)
c2 Nullable(Float64)
c3 Nullable(Float64)
c1 UInt64
c2 UInt64
c3 UInt64
c1 Nullable(String)
c2 Nullable(String)
c3 Nullable(String)
c1 Nullable(Float64)
c2 Nullable(Float64)
c3 Nullable(Float64)
c1 UInt64
c2 UInt64
c3 UInt64
c1 Nullable(String)
c2 Nullable(String)
c3 Nullable(String)
c1 Nullable(Float64)
c2 Nullable(Float64)
c3 Nullable(Float64)
c1 UInt64
c2 UInt64
c3 UInt64

View File

@ -10,4 +10,5 @@ desc s3Cluster('test_cluster_two_shards_localhost', 'http://localhost:11111/test
desc s3Cluster('test_cluster_two_shards_localhost', 'http://localhost:11111/test/{a,b,c}.tsv', 'test', 'testtest', 'TSV', 'c1 UInt64, c2 UInt64, c3 UInt64');
desc s3Cluster('test_cluster_two_shards_localhost', 'http://localhost:11111/test/{a,b,c}.tsv', 'test', 'testtest', 'TSV', 'c1 UInt64, c2 UInt64, c3 UInt64', 'auto');
SELECT * FROM s3(decodeURLComponent(NULL), [NULL]); --{serverError 170}

View File

@ -0,0 +1,107 @@
TSV
c1 Nullable(Float64)
c2 Nullable(String)
c3 Array(Nullable(Float64))
c4 Tuple(Nullable(Float64), Nullable(Float64), Nullable(Float64))
42 Some string [1,2,3,4] (1,2,3)
42 abcd [] (4,5,6)
c1 Nullable(String)
[({\'key\' : 42.42}, [\'String\', \'String2\'], 42.42), ({}, [], -42), ({\'key2\' : NULL}, [NULL], NULL)]
[]
[({}, [], 0)]
[({}, [NULL], NULL)]
[({}, [\'String3\'], NULL)]
[({\'key3\': NULL}, []), NULL]
c1 Array(Tuple(Map(String, Nullable(Float64)), Array(Nullable(String)), Nullable(Float64)))
[({'key':42.42},['String','String2'],42.42),({},[],-42),({'key2':NULL},[NULL],NULL)]
[]
[({},[],0)]
[({},[NULL],NULL)]
[({},['String3'],NULL)]
[({'key3':NULL},[],NULL)]
c1 Nullable(Bool)
true
false
\N
c1 Array(Nullable(Bool))
[true,NULL]
[]
[NULL]
[false]
c1 Nullable(String)
[]
c1 Nullable(String)
{}
c1 Nullable(String)
()
c1 Nullable(String)
[1, 2, 3
c1 Nullable(String)
[(1, 2, 3 4)]
c1 Nullable(String)
[1, 2, 3 + 4]
c1 Nullable(String)
(1, 2,
c1 Nullable(String)
[1, Some trash, 42.2]
c1 Nullable(String)
[1, \'String\', {\'key\' : 2}]
c1 Nullable(String)
{\'key\' : 1, [1] : 10}
c1 Nullable(String)
{}{}
c1 Nullable(String)
[1, 2, 3
c1 Nullable(String)
[abc, def]
c1 Array(Nullable(String))
['abc','def']
c1 Nullable(String)
[\'string]
c1 Nullable(String)
\'string
c1 Nullable(Float64)
42.42
c1 Nullable(String)
42.42sometrash
c1 Nullable(String)
[42.42sometrash, 42.42]
CSV
c1 Nullable(String)
c2 Nullable(String)
c3 Array(Nullable(Float64))
c4 Array(Tuple(Nullable(Float64), Nullable(Float64), Nullable(Float64)))
42 Some string [1,2,3,4] [(1,2,3)]
42\\ abcd [] [(4,5,6)]
c1 Nullable(String)
[({\'key\' : 42.42}, [\'String\', \'String2\'], 42.42), ({}, [], -42), ({\'key2\' : NULL}, [NULL], NULL)]
[]
[({}, [], 0)]
[({}, [NULL], NULL)]
[({}, [\'String3\'], NULL)]
[({\'key3\': NULL}, []), NULL]
c1 Array(Tuple(Map(String, Nullable(Float64)), Array(Nullable(String)), Nullable(Float64)))
[({'key':42.42},['String','String2'],42.42),({},[],-42),({'key2':NULL},[NULL],NULL)]
[]
[({},[],0)]
[({},[NULL],NULL)]
[({},['String3'],NULL)]
[({'key3':NULL},[],NULL)]
c1 Nullable(Bool)
true
false
\N
c1 Array(Nullable(Bool))
[true,NULL]
[]
[NULL]
[false]
c1 Nullable(String)
(1, 2, 3)
c1 Nullable(String)
123.123
c1 Array(Tuple(Nullable(Float64), Nullable(Float64), Nullable(Float64)))
[(1,2,3)]
c1 Array(Tuple(Nullable(Float64), Nullable(Float64), Nullable(Float64)))
[(1,2,3)]

View File

@ -0,0 +1,220 @@
#!/usr/bin/env bash
# Tags: no-parallel, no-fasttest
# Exercises schema inference for TSV and CSV on a series of small hand-written inputs:
# well-formed numbers, bools, arrays, tuples and maps, as well as malformed or ambiguous
# values. For every input the inferred schema and the parsed rows are printed.
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh
# Query a file that does not exist and cut a directory out of the exception text
# (9th whitespace-separated field with "/nonexist.txt" stripped). Presumably this is the
# server's user_files directory, against which file() resolves relative names.
USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}')
FILE_NAME=test_02149.data
# ":?" aborts the script if the path could not be extracted.
DATA_FILE=${USER_FILES_PATH:?}/$FILE_NAME
touch "$DATA_FILE"

# Prints the inferred schema of the current data file and then its rows.
#   $1 - input format name (TSV or CSV)
#   $2 - optional "settings ..." clause appended to both queries
function describe_and_select()
{
    $CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', '$1')${2:+ $2}"
    $CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', '$1')${2:+ $2}"
}

echo "TSV"
echo -e "42\tSome string\t[1, 2, 3, 4]\t(1, 2, 3)
42\tabcd\t[]\t(4, 5, 6)" > "$DATA_FILE"
describe_and_select TSV
# The last row below has a different shape from the others; this input is read with the
# best-effort inference switched off.
echo -e "[({'key' : 42.42}, ['String', 'String2'], 42.42), ({}, [], -42), ({'key2' : NULL}, [NULL], NULL)]
[]
[({}, [], 0)]
[({}, [NULL], NULL)]
[({}, ['String3'], NULL)]
[({'key3': NULL}, []), NULL]" > "$DATA_FILE"
describe_and_select TSV "settings input_format_tsv_use_best_effort_in_schema_inference=false"
# Same data with a consistent last row, read with the default settings.
echo -e "[({'key' : 42.42}, ['String', 'String2'], 42.42), ({}, [], -42), ({'key2' : NULL}, [NULL], NULL)]
[]
[({}, [], 0)]
[({}, [NULL], NULL)]
[({}, ['String3'], NULL)]
[({'key3': NULL}, [], NULL)]" > "$DATA_FILE"
describe_and_select TSV
echo -e "true
false
\N" > "$DATA_FILE"
describe_and_select TSV
echo -e "[true, NULL]
[]
[NULL]
[false]" > "$DATA_FILE"
describe_and_select TSV
echo -e "[]" > "$DATA_FILE"
describe_and_select TSV
echo -e "{}" > "$DATA_FILE"
describe_and_select TSV
echo -e "()" > "$DATA_FILE"
describe_and_select TSV
echo -e "[1, 2, 3" > "$DATA_FILE"
describe_and_select TSV
echo -e "[(1, 2, 3 4)]" > "$DATA_FILE"
describe_and_select TSV
echo -e "[1, 2, 3 + 4]" > "$DATA_FILE"
describe_and_select TSV
echo -e "(1, 2," > "$DATA_FILE"
describe_and_select TSV
echo -e "[1, Some trash, 42.2]" > "$DATA_FILE"
describe_and_select TSV
echo -e "[1, 'String', {'key' : 2}]" > "$DATA_FILE"
describe_and_select TSV
echo -e "{'key' : 1, [1] : 10}" > "$DATA_FILE"
describe_and_select TSV
echo -e "{}{}" > "$DATA_FILE"
describe_and_select TSV
# This input repeats an earlier case; it is kept because the expected output lists it twice.
echo -e "[1, 2, 3" > "$DATA_FILE"
describe_and_select TSV
echo -e "[abc, def]" > "$DATA_FILE"
describe_and_select TSV
echo -e "['abc', 'def']" > "$DATA_FILE"
describe_and_select TSV
echo -e "['string]" > "$DATA_FILE"
describe_and_select TSV
echo -e "'string" > "$DATA_FILE"
describe_and_select TSV
echo -e "42.42" > "$DATA_FILE"
describe_and_select TSV
echo -e "42.42sometrash" > "$DATA_FILE"
describe_and_select TSV
echo -e "[42.42sometrash, 42.42]" > "$DATA_FILE"
describe_and_select TSV

echo
echo "CSV"
echo -e "42,Some string,'[1, 2, 3, 4]','[(1, 2, 3)]'
42\,abcd,'[]','[(4, 5, 6)]'" > "$DATA_FILE"
describe_and_select CSV
# The last row below has a different shape from the others; this input is read with the
# best-effort inference switched off.
echo -e "\"[({'key' : 42.42}, ['String', 'String2'], 42.42), ({}, [], -42), ({'key2' : NULL}, [NULL], NULL)]\"
'[]'
'[({}, [], 0)]'
'[({}, [NULL], NULL)]'
\"[({}, ['String3'], NULL)]\"
\"[({'key3': NULL}, []), NULL]\"" > "$DATA_FILE"
describe_and_select CSV "settings input_format_csv_use_best_effort_in_schema_inference=false"
# Same data with a consistent last row, read with the default settings.
echo -e "\"[({'key' : 42.42}, ['String', 'String2'], 42.42), ({}, [], -42), ({'key2' : NULL}, [NULL], NULL)]\"
'[]'
'[({}, [], 0)]'
'[({}, [NULL], NULL)]'
\"[({}, ['String3'], NULL)]\"
\"[({'key3': NULL}, [], NULL)]\"" > "$DATA_FILE"
describe_and_select CSV
echo -e "true
false
\N" > "$DATA_FILE"
describe_and_select CSV
echo -e "'[true, NULL]'
'[]'
'[NULL]'
'[false]'" > "$DATA_FILE"
describe_and_select CSV
echo -e "'(1, 2, 3)'" > "$DATA_FILE"
describe_and_select CSV
echo -e "'123.123'" > "$DATA_FILE"
describe_and_select CSV
echo -e "'[(1, 2, 3)]'" > "$DATA_FILE"
describe_and_select CSV
echo -e "\"[(1, 2, 3)]\"" > "$DATA_FILE"
describe_and_select CSV

# Do not leave the scratch file behind.
rm "$DATA_FILE"

View File

@ -1,15 +1,15 @@
a Nullable(String)
a Nullable(Float64)
b Nullable(String)
c Nullable(String)
1 s1 \N
c Array(Nullable(Float64))
1 s1 []
2 } [2]
\N \N \N
\N \N \N
\N \N []
\N \N []
\N \N [3]
b Nullable(String)
a Nullable(String)
c Nullable(String)
e Nullable(String)
b Nullable(Float64)
a Nullable(Float64)
c Nullable(Float64)
e Nullable(Float64)
1 \N \N \N
\N 2 3 \N
\N \N \N \N

View File

@ -10,3 +10,4 @@ ths
offsett
numer
ue
alse