Merge branch 'master' into fix-30975

This commit is contained in:
Alexey Milovidov 2022-12-27 23:21:10 +03:00 committed by GitHub
commit 14d7266e70
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
33 changed files with 220 additions and 40 deletions

0
docker/test/fuzzer/run-fuzzer.sh Normal file → Executable file
View File

View File

@ -72,9 +72,12 @@ AggregateFunctionPtr AggregateFunctionFactory::get(
{
auto types_without_low_cardinality = convertLowCardinalityTypesToNested(argument_types);
/// If one of the types is Nullable, we apply aggregate function combinator "Null".
if (std::any_of(types_without_low_cardinality.begin(), types_without_low_cardinality.end(),
/// If one of the types is Nullable, we apply aggregate function combinator "Null" if it's not window function.
/// Window functions are not real aggregate functions. Applying combinators doesn't make sense for them,
/// they must handle the nullability themselves
auto properties = tryGetPropertiesImpl(name);
bool is_window_function = properties.has_value() && properties->is_window_function;
if (!is_window_function && std::any_of(types_without_low_cardinality.begin(), types_without_low_cardinality.end(),
[](const auto & type) { return type->isNullable(); }))
{
AggregateFunctionCombinatorPtr combinator = AggregateFunctionCombinatorFactory::instance().tryFindSuffix("Null");

View File

@ -23,7 +23,7 @@ public:
throw Exception("Incorrect number of arguments for aggregate function with " + getName() + " suffix",
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
if (!isUInt8(arguments.back()))
if (!isUInt8(arguments.back()) && !arguments.back()->onlyNull())
throw Exception("Illegal type " + arguments.back()->getName() + " of last argument for aggregate function with " + getName() + " suffix",
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
@ -52,6 +52,7 @@ class AggregateFunctionIfNullUnary final
private:
size_t num_arguments;
bool filter_is_nullable = false;
bool filter_is_only_null = false;
/// The name of the nested function, including combinators (i.e. *If)
///
@ -84,10 +85,8 @@ private:
return assert_cast<const ColumnUInt8 &>(*filter_column).getData()[row_num] && !filter_null_map[row_num];
}
else
{
return assert_cast<const ColumnUInt8 &>(*filter_column).getData()[row_num];
}
return assert_cast<const ColumnUInt8 &>(*filter_column).getData()[row_num];
}
public:
@ -106,10 +105,14 @@ public:
"Aggregate function {} require at least one argument", getName());
filter_is_nullable = arguments[num_arguments - 1]->isNullable();
filter_is_only_null = arguments[num_arguments - 1]->onlyNull();
}
void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena * arena) const override
{
if (filter_is_only_null)
return;
const ColumnNullable * column = assert_cast<const ColumnNullable *>(columns[0]);
const IColumn * nested_column = &column->getNestedColumn();
if (!column->isNullAt(row_num) && singleFilter(columns, row_num))
@ -127,6 +130,9 @@ public:
Arena * arena,
ssize_t) const override
{
if (filter_is_only_null)
return;
const ColumnNullable * column = assert_cast<const ColumnNullable *>(columns[0]);
const UInt8 * null_map = column->getNullMapData().data();
const IColumn * columns_param[] = {&column->getNestedColumn()};
@ -177,6 +183,11 @@ public:
#if USE_EMBEDDED_COMPILER
bool isCompilable() const override
{
return canBeNativeType(*this->argument_types.back()) && this->nested_function->isCompilable();
}
void compileAdd(llvm::IRBuilderBase & builder, llvm::Value * aggregate_data_ptr, const DataTypes & arguments_types, const std::vector<llvm::Value *> & argument_values) const override
{
llvm::IRBuilder<> & b = static_cast<llvm::IRBuilder<> &>(builder);
@ -224,6 +235,9 @@ class AggregateFunctionIfNullVariadic final : public AggregateFunctionNullBase<
serialize_flag,
AggregateFunctionIfNullVariadic<result_is_nullable, serialize_flag>>
{
private:
bool filter_is_only_null = false;
public:
String getName() const override
@ -243,6 +257,8 @@ public:
for (size_t i = 0; i < number_of_arguments; ++i)
is_nullable[i] = arguments[i]->isNullable();
filter_is_only_null = arguments.back()->onlyNull();
}
static inline bool singleFilter(const IColumn ** columns, size_t row_num, size_t num_arguments)
@ -282,6 +298,9 @@ public:
void addBatchSinglePlace(
size_t row_begin, size_t row_end, AggregateDataPtr __restrict place, const IColumn ** columns, Arena * arena, ssize_t) const final
{
if (filter_is_only_null)
return;
std::unique_ptr<UInt8[]> final_null_flags = std::make_unique<UInt8[]>(row_end);
const size_t filter_column_num = number_of_arguments - 1;
@ -346,6 +365,11 @@ public:
#if USE_EMBEDDED_COMPILER
bool isCompilable() const override
{
return canBeNativeType(*this->argument_types.back()) && this->nested_function->isCompilable();
}
void compileAdd(llvm::IRBuilderBase & builder, llvm::Value * aggregate_data_ptr, const DataTypes & arguments_types, const std::vector<llvm::Value *> & argument_values) const override
{
/// TODO: Check

View File

@ -42,7 +42,7 @@ public:
if (num_arguments == 0)
throw Exception("Aggregate function " + getName() + " require at least one argument", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
if (!isUInt8(types.back()))
if (!isUInt8(types.back()) && !types.back()->onlyNull())
throw Exception("Last argument for aggregate function " + getName() + " must be UInt8", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
}
@ -199,12 +199,16 @@ public:
AggregateFunctionPtr getNestedFunction() const override { return nested_func; }
std::unordered_set<size_t> getArgumentsThatCanBeOnlyNull() const override
{
return {num_arguments - 1};
}
#if USE_EMBEDDED_COMPILER
bool isCompilable() const override
{
return nested_func->isCompilable();
return canBeNativeType(*this->argument_types.back()) && nested_func->isCompilable();
}
void compileCreate(llvm::IRBuilderBase & builder, llvm::Value * aggregate_data_ptr) const override

View File

@ -29,7 +29,13 @@ public:
size_t size = arguments.size();
DataTypes res(size);
for (size_t i = 0; i < size; ++i)
res[i] = removeNullable(arguments[i]);
{
/// Nullable(Nothing) is processed separately, don't convert it to Nothing.
if (arguments[i]->onlyNull())
res[i] = arguments[i];
else
res[i] = removeNullable(arguments[i]);
}
return res;
}
@ -41,12 +47,16 @@ public:
{
bool has_nullable_types = false;
bool has_null_types = false;
for (const auto & arg_type : arguments)
std::unordered_set<size_t> arguments_that_can_be_only_null;
if (nested_function)
arguments_that_can_be_only_null = nested_function->getArgumentsThatCanBeOnlyNull();
for (size_t i = 0; i < arguments.size(); ++i)
{
if (arg_type->isNullable())
if (arguments[i]->isNullable())
{
has_nullable_types = true;
if (arg_type->onlyNull())
if (arguments[i]->onlyNull() && !arguments_that_can_be_only_null.contains(i))
{
has_null_types = true;
break;

View File

@ -345,6 +345,14 @@ public:
return nullptr;
}
/// For most functions if one of arguments is always NULL, we return NULL (it's implemented in combinator Null),
/// but in some functions we can want to process this argument somehow (for example condition argument in If combinator).
/// This method returns the set of argument indexes that can be always NULL, they will be skipped in combinator Null.
virtual std::unordered_set<size_t> getArgumentsThatCanBeOnlyNull() const
{
return {};
}
/** Return the nested function if this is an Aggregate Function Combinator.
* Otherwise return nullptr.
*/
@ -828,6 +836,9 @@ struct AggregateFunctionProperties
* Some may also name this property as "non-commutative".
*/
bool is_order_dependent = false;
/// Indicates if it's actually window function.
bool is_window_function = false;
};

View File

@ -81,6 +81,7 @@ public:
if (nested_if_function_arguments_nodes.size() != 3)
return;
auto & cond_argument = nested_if_function_arguments_nodes[0];
const auto * if_true_condition_constant_node = nested_if_function_arguments_nodes[1]->as<ConstantNode>();
const auto * if_false_condition_constant_node = nested_if_function_arguments_nodes[2]->as<ConstantNode>();
@ -107,8 +108,8 @@ public:
return;
}
/// Rewrite `sum(if(cond, 0, 1))` into `countIf(not(cond))`.
if (if_true_condition_value == 0 && if_false_condition_value == 1)
/// Rewrite `sum(if(cond, 0, 1))` into `countIf(not(cond))` if condition is not Nullable (otherwise the result can be different).
if (if_true_condition_value == 0 && if_false_condition_value == 1 && !cond_argument->getResultType()->isNullable())
{
DataTypePtr not_function_result_type = std::make_shared<DataTypeUInt8>();

View File

@ -318,6 +318,12 @@ DataTypePtr tryInferDataTypeByEscapingRule(const String & field, const FormatSet
if (auto date_type = tryInferDateOrDateTimeFromString(field, format_settings))
return date_type;
/// Special case when we have number that starts with 0. In TSV we don't parse such numbers,
/// see readIntTextUnsafe in ReadHelpers.h. If we see data started with 0, we can determine it
/// as a String, so parsing won't fail.
if (field[0] == '0' && field.size() != 1)
return std::make_shared<DataTypeString>();
auto type = tryInferDataTypeForSingleField(field, format_settings);
if (!type)
return std::make_shared<DataTypeString>();

View File

@ -250,7 +250,7 @@ namespace
{
if (isArray(type))
nested_types.push_back(assert_cast<const DataTypeArray &>(*type).getNestedType());
else
else if (isTuple(type))
{
const auto & elements = assert_cast<const DataTypeTuple &>(*type).getElements();
for (const auto & element : elements)
@ -262,7 +262,10 @@ namespace
if (checkIfTypesAreEqual(nested_types))
{
for (auto & type : data_types)
type = std::make_shared<DataTypeArray>(nested_types.back());
{
if (isArray(type) || isTuple(type))
type = std::make_shared<DataTypeArray>(nested_types.back());
}
}
}
@ -826,14 +829,40 @@ void transformInferredJSONTypesIfNeeded(
void transformJSONTupleToArrayIfPossible(DataTypePtr & data_type, const FormatSettings & settings, JSONInferenceInfo * json_info)
{
if (!data_type || !isTuple(data_type))
if (!data_type)
return;
const auto * tuple_type = assert_cast<const DataTypeTuple *>(data_type.get());
auto nested_types = tuple_type->getElements();
transformInferredTypesIfNeededImpl<true>(nested_types, settings, json_info);
if (checkIfTypesAreEqual(nested_types))
data_type = std::make_shared<DataTypeArray>(nested_types.back());
if (const auto * array_type = typeid_cast<const DataTypeArray *>(data_type.get()))
{
auto nested_type = array_type->getNestedType();
transformJSONTupleToArrayIfPossible(nested_type, settings, json_info);
data_type = std::make_shared<DataTypeArray>(nested_type);
return;
}
if (const auto * map_type = typeid_cast<const DataTypeMap *>(data_type.get()))
{
auto value_type = map_type->getValueType();
transformJSONTupleToArrayIfPossible(value_type, settings, json_info);
data_type = std::make_shared<DataTypeMap>(map_type->getKeyType(), value_type);
return;
}
if (const auto * tuple_type = typeid_cast<const DataTypeTuple *>(data_type.get()))
{
auto nested_types = tuple_type->getElements();
for (auto & nested_type : nested_types)
transformJSONTupleToArrayIfPossible(nested_type, settings, json_info);
auto nested_types_copy = nested_types;
transformInferredTypesIfNeededImpl<true>(nested_types_copy, settings, json_info);
if (checkIfTypesAreEqual(nested_types_copy))
data_type = std::make_shared<DataTypeArray>(nested_types_copy.back());
else
data_type = std::make_shared<DataTypeTuple>(nested_types);
return;
}
}
DataTypePtr tryInferNumberFromString(std::string_view field, const FormatSettings & settings)

View File

@ -176,6 +176,7 @@ public:
ColumnPtr executeShortCircuit(ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type) const;
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; }
size_t getNumberOfArguments() const override { return 0; }
bool canBeExecutedOnLowCardinalityDictionary() const override { return false; }
bool useDefaultImplementationForNulls() const override { return !Impl::specialImplementationForNulls(); }

View File

@ -1026,6 +1026,7 @@ public:
}
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; }
ColumnNumbers getArgumentsThatDontImplyNullableReturnType(size_t /*number_of_arguments*/) const override { return {0}; }
bool canBeExecutedOnLowCardinalityDictionary() const override { return false; }
/// Get result types by argument types. If the function does not apply to these arguments, throw an exception.
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override

View File

@ -51,6 +51,7 @@ public:
size_t getNumberOfArguments() const override { return 0; }
bool useDefaultImplementationForNulls() const override { return false; }
bool useDefaultImplementationForNothing() const override { return false; }
bool canBeExecutedOnLowCardinalityDictionary() const override { return false; }
ColumnNumbers getArgumentsThatDontImplyNullableReturnType(size_t number_of_arguments) const override
{

View File

@ -71,7 +71,7 @@ Chunk ArrowBlockInputFormat::generate()
++record_batch_current;
arrow_column_to_ch_column->arrowTableToCHChunk(res, *table_result);
arrow_column_to_ch_column->arrowTableToCHChunk(res, *table_result, (*table_result)->num_rows());
/// If defaults_for_omitted_fields is true, calculate the default values from default expression for omitted fields.
/// Otherwise fill the missing columns with zero values of its type.

View File

@ -69,7 +69,6 @@ namespace ErrorCodes
extern const int DUPLICATE_COLUMN;
extern const int THERE_IS_NO_COLUMN;
extern const int UNKNOWN_EXCEPTION;
extern const int INCORRECT_NUMBER_OF_COLUMNS;
extern const int INCORRECT_DATA;
}
@ -810,7 +809,7 @@ ArrowColumnToCHColumn::ArrowColumnToCHColumn(
{
}
void ArrowColumnToCHColumn::arrowTableToCHChunk(Chunk & res, std::shared_ptr<arrow::Table> & table)
void ArrowColumnToCHColumn::arrowTableToCHChunk(Chunk & res, std::shared_ptr<arrow::Table> & table, size_t num_rows)
{
NameToColumnPtr name_to_column_ptr;
for (auto column_name : table->ColumnNames())
@ -824,16 +823,12 @@ void ArrowColumnToCHColumn::arrowTableToCHChunk(Chunk & res, std::shared_ptr<arr
name_to_column_ptr[std::move(column_name)] = arrow_column;
}
arrowColumnsToCHChunk(res, name_to_column_ptr);
arrowColumnsToCHChunk(res, name_to_column_ptr, num_rows);
}
void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr & name_to_column_ptr)
void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr & name_to_column_ptr, size_t num_rows)
{
if (unlikely(name_to_column_ptr.empty()))
throw Exception(ErrorCodes::INCORRECT_NUMBER_OF_COLUMNS, "Columns is empty");
Columns columns_list;
UInt64 num_rows = name_to_column_ptr.begin()->second->length();
columns_list.reserve(header.columns());
std::unordered_map<String, std::pair<BlockPtr, std::shared_ptr<NestedColumnExtractHelper>>> nested_tables;
bool skipped = false;

View File

@ -28,9 +28,9 @@ public:
bool allow_missing_columns_,
bool case_insensitive_matching_ = false);
void arrowTableToCHChunk(Chunk & res, std::shared_ptr<arrow::Table> & table);
void arrowTableToCHChunk(Chunk & res, std::shared_ptr<arrow::Table> & table, size_t num_rows);
void arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr & name_to_column_ptr);
void arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr & name_to_column_ptr, size_t num_rows);
/// Get missing columns that exists in header but not in arrow::Schema
std::vector<size_t> getMissingColumns(const arrow::Schema & schema) const;

View File

@ -54,14 +54,19 @@ Chunk ORCBlockInputFormat::generate()
throw ParsingException(
ErrorCodes::CANNOT_READ_ALL_DATA, "Error while reading batch of ORC data: {}", table_result.status().ToString());
/// We should extract the number of rows directly from the stripe, because in case when
/// record batch contains 0 columns (for example if we requested only columns that
/// are not presented in data) the number of rows in record batch will be 0.
size_t num_rows = file_reader->GetRawORCReader()->getStripe(stripe_current)->getNumberOfRows();
auto table = table_result.ValueOrDie();
if (!table || !table->num_rows())
if (!table || !num_rows)
return {};
++stripe_current;
Chunk res;
arrow_column_to_ch_column->arrowTableToCHChunk(res, table);
arrow_column_to_ch_column->arrowTableToCHChunk(res, table, num_rows);
/// If defaults_for_omitted_fields is true, calculate the default values from default expression for omitted fields.
/// Otherwise fill the missing columns with zero values of its type.
if (format_settings.defaults_for_omitted_fields)

View File

@ -70,7 +70,7 @@ Chunk ParquetBlockInputFormat::generate()
++row_group_current;
arrow_column_to_ch_column->arrowTableToCHChunk(res, table);
arrow_column_to_ch_column->arrowTableToCHChunk(res, table, table->num_rows());
/// If defaults_for_omitted_fields is true, calculate the default values from default expression for omitted fields.
/// Otherwise fill the missing columns with zero values of its type.

View File

@ -2313,7 +2313,8 @@ void registerWindowFunctions(AggregateFunctionFactory & factory)
.returns_default_when_only_null = true,
// This probably doesn't make any difference for window functions because
// it is an Aggregator-specific setting.
.is_order_dependent = true };
.is_order_dependent = true,
.is_window_function = true};
factory.registerFunction("rank", {[](const std::string & name,
const DataTypes & argument_types, const Array & parameters, const Settings *)

View File

@ -25,7 +25,7 @@ MAX_RETRY = 5
# Number of times a check can re-run as a whole.
# It is needed, because we are using AWS "spot" instances, that are terminated often
MAX_WORKFLOW_RERUN = 20
MAX_WORKFLOW_RERUN = 30
WorkflowDescription = namedtuple(
"WorkflowDescription",

View File

@ -0,0 +1,3 @@
x Array(Array(Nullable(Int64)))
x Tuple(Array(Array(Nullable(Int64))), Nullable(Int64))
x Map(String, Array(Nullable(Int64)))

View File

@ -0,0 +1,4 @@
desc format(JSONEachRow, '{"x" : [[42, null], [24, null]]}');
desc format(JSONEachRow, '{"x" : [[[42, null], []], 24]}');
desc format(JSONEachRow, '{"x" : {"key" : [42, null]}}');

View File

@ -0,0 +1,18 @@
-- { echoOn }
select sumIf(1, NULL);
0
select sumIf(NULL, 1);
\N
select sumIf(NULL, NULL);
\N
select countIf(1, NULL);
0
select countIf(NULL, 1);
0
select countIf(1, NULL);
0
select sumArray([NULL, NULL]);
\N
select countArray([NULL, NULL]);
0

View File

@ -0,0 +1,11 @@
-- { echoOn }
select sumIf(1, NULL);
select sumIf(NULL, 1);
select sumIf(NULL, NULL);
select countIf(1, NULL);
select countIf(NULL, 1);
select countIf(1, NULL);
select sumArray([NULL, NULL]);
select countArray([NULL, NULL]);

View File

@ -0,0 +1,3 @@
1024
0
1024

View File

@ -0,0 +1,4 @@
select sum(if((number % NULL) = 2, 0, 1)) FROM numbers(1024) settings optimize_rewrite_sum_if_to_count_if=0;
select sum(if((number % NULL) = 2, 0, 1)) FROM numbers(1024) settings optimize_rewrite_sum_if_to_count_if=1, allow_experimental_analyzer=0;
select sum(if((number % NULL) = 2, 0, 1)) FROM numbers(1024) settings optimize_rewrite_sum_if_to_count_if=1, allow_experimental_analyzer=1;

View File

@ -0,0 +1,8 @@
Hello
Hello
Hello
6 6
Hello
Hello
Hello
6 6

View File

@ -0,0 +1,19 @@
#!/usr/bin/env bash
#Tags: no-fasttest, no-parallel
CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh
$CLICKHOUSE_LOCAL -q "select number as x from numbers(3) format Parquet" > 02511_data1.parquet
$CLICKHOUSE_LOCAL -q "select y from file(02511_data1.parquet, auto, 'x UInt64, y String default \'Hello\'') settings input_format_parquet_allow_missing_columns=1"
$CLICKHOUSE_LOCAL -q "select number as x, 'Hello' as y from numbers(3) format Parquet" > 02511_data2.parquet
$CLICKHOUSE_LOCAL -q "select count(*), count(y) from file('02511_data*.parquet', auto, 'x UInt64, y String') settings input_format_parquet_allow_missing_columns=1"
$CLICKHOUSE_LOCAL -q "select number as x from numbers(3) format ORC" > 02511_data1.orc
$CLICKHOUSE_LOCAL -q "select y from file(02511_data1.orc, auto, 'x UInt64, y String default \'Hello\'') settings input_format_orc_allow_missing_columns=1"
$CLICKHOUSE_LOCAL -q "select number as x, 'Hello' as y from numbers(3) format ORC" > 02511_data2.orc
$CLICKHOUSE_LOCAL -q "select count(*), count(y) from file('02511_data*.orc', auto, 'x UInt64, y String') settings input_format_orc_allow_missing_columns=1"
rm 02511_data*

View File

@ -0,0 +1,5 @@
create table if not exists t (`arr.key` Array(LowCardinality(String)), `arr.value` Array(LowCardinality(String))) engine = Memory;
insert into t (`arr.key`, `arr.value`) values (['a'], ['b']);
select if(true, if(lowerUTF8(arr.key) = 'a', 1, 2), 3) as x from t left array join arr;
drop table t;

View File

@ -0,0 +1 @@
Nullable(String) 0123

View File

@ -0,0 +1,2 @@
select toTypeName(*), * from format(TSV, '0123');

View File

@ -0,0 +1,4 @@
UInt8
UInt8
UInt8
UInt8

View File

@ -0,0 +1,5 @@
select toTypeName(if(toLowCardinality(number % 2), 1, 2)) from numbers(1);
select toTypeName(multiIf(toLowCardinality(number % 2), 1, 1, 2, 3)) from numbers(1);
select toTypeName(toLowCardinality(number % 2) and 2) from numbers(1);
select toTypeName(toLowCardinality(number % 2) or 2) from numbers(1);