Support unaligned array join

For LEFT ARRAY JOIN, expand the arrays to the greatest size among them. If all sizes are zero,
resize to one, which matches the old behavior (i.e. emptyArrayToSingle).

For non-LEFT ARRAY JOIN, expand to the greatest size, but keep the arrays empty if all sizes are zero.
This commit is contained in:
Amos Bird 2018-12-12 11:13:02 +08:00
parent 1cc69100f1
commit 86f462acff
6 changed files with 108 additions and 4 deletions

View File

@ -143,8 +143,15 @@ ExpressionAction ExpressionAction::arrayJoin(const NameSet & array_joined_column
a.type = ARRAY_JOIN;
a.array_joined_columns = array_joined_columns;
a.array_join_is_left = array_join_is_left;
a.unaligned_array_join = context.getSettingsRef().enable_unaligned_array_join;
if (array_join_is_left)
if (a.unaligned_array_join)
{
a.function_length = FunctionFactory::instance().get("length", context);
a.function_greatest = FunctionFactory::instance().get("greatest", context);
a.function_arrayResize = FunctionFactory::instance().get("arrayResize", context);
}
else if (array_join_is_left)
a.function_builder = FunctionFactory::instance().get("emptyArrayToSingle", context);
return a;
@ -375,7 +382,44 @@ void ExpressionAction::execute(Block & block) const
/// If LEFT ARRAY JOIN, then we create columns in which empty arrays are replaced by arrays with one element - the default value.
std::map<String, ColumnPtr> non_empty_array_columns;
if (array_join_is_left)
if (unaligned_array_join)
{
/// Resize all array-joined columns to the length of the longest one (at least 1 for LEFT ARRAY JOIN), padding with default values.
auto rows = block.rows();
auto uint64 = std::make_shared<DataTypeUInt64>();
ColumnWithTypeAndName column_of_max_length;
if (array_join_is_left)
column_of_max_length = ColumnWithTypeAndName(uint64->createColumnConst(rows, 1u), uint64, {});
else
column_of_max_length = ColumnWithTypeAndName(uint64->createColumnConst(rows, 0u), uint64, {});
for (const auto & name : array_joined_columns)
{
auto & src_col = block.getByName(name);
Block tmp_block{src_col, {{}, uint64, {}}};
function_length->build({src_col})->execute(tmp_block, {0}, 1, rows);
Block tmp_block2{
column_of_max_length, tmp_block.safeGetByPosition(1), {{}, uint64, {}}};
function_greatest->build({column_of_max_length, tmp_block.safeGetByPosition(1)})->execute(tmp_block2, {0, 1}, 2, rows);
column_of_max_length = tmp_block2.safeGetByPosition(2);
}
for (const auto & name : array_joined_columns)
{
auto & src_col = block.getByName(name);
Block tmp_block{src_col, column_of_max_length, {{}, src_col.type, {}}};
function_arrayResize->build({src_col, column_of_max_length})->execute(tmp_block, {0, 1}, 2, rows);
any_array_ptr = src_col.column = tmp_block.safeGetByPosition(2).column;
}
if (ColumnPtr converted = any_array_ptr->convertToFullColumnIfConst())
any_array_ptr = converted;
any_array = typeid_cast<const ColumnArray *>(&*any_array_ptr);
}
else if (array_join_is_left && !unaligned_array_join)
{
for (const auto & name : array_joined_columns)
{
@ -404,13 +448,13 @@ void ExpressionAction::execute(Block & block) const
if (!typeid_cast<const DataTypeArray *>(&*current.type))
throw Exception("ARRAY JOIN of not array: " + current.name, ErrorCodes::TYPE_MISMATCH);
ColumnPtr array_ptr = array_join_is_left ? non_empty_array_columns[current.name] : current.column;
ColumnPtr array_ptr = (array_join_is_left && !unaligned_array_join) ? non_empty_array_columns[current.name] : current.column;
if (ColumnPtr converted = array_ptr->convertToFullColumnIfConst())
array_ptr = converted;
const ColumnArray & array = typeid_cast<const ColumnArray &>(*array_ptr);
if (!array.hasEqualOffsets(typeid_cast<const ColumnArray &>(*any_array_ptr)))
if (!unaligned_array_join && !array.hasEqualOffsets(typeid_cast<const ColumnArray &>(*any_array_ptr)))
throw Exception("Sizes of ARRAY-JOIN-ed arrays do not match", ErrorCodes::SIZES_OF_ARRAYS_DOESNT_MATCH);
current.column = typeid_cast<const ColumnArray &>(*array_ptr).getDataPtr();

View File

@ -87,6 +87,12 @@ public:
/// For APPLY_FUNCTION and LEFT ARRAY JOIN.
/// FunctionBuilder is used before action was added to ExpressionActions (when we don't know types of arguments).
FunctionBuilderPtr function_builder;
/// For unaligned [LEFT] ARRAY JOIN
FunctionBuilderPtr function_length;
FunctionBuilderPtr function_greatest;
FunctionBuilderPtr function_arrayResize;
/// Can be used after action was added to ExpressionActions if we want to get function signature or properties like monotonicity.
FunctionBasePtr function_base;
/// Prepared function which is used in function execution.
@ -97,6 +103,7 @@ public:
/// For ARRAY_JOIN
NameSet array_joined_columns;
bool array_join_is_left = false;
bool unaligned_array_join = false;
/// For JOIN
std::shared_ptr<const Join> join;

View File

@ -292,6 +292,7 @@ struct Settings
M(SettingBool, allow_ddl, true, "If it is set to true, then a user is allowed to executed DDL queries.") \
M(SettingBool, parallel_view_processing, false, "Enables pushing to attached views concurrently instead of sequentially.") \
M(SettingBool, enable_debug_queries, false, "Enables debug queries such as AST.") \
M(SettingBool, enable_unaligned_array_join, false, "Enables unaligned array join.") \
#define DECLARE(TYPE, NAME, DEFAULT, DESCRIPTION) \

View File

@ -0,0 +1,26 @@
<test>
<!-- Performance test comparing ARRAY JOIN and LEFT ARRAY JOIN over two array columns,
     with and without the enable_unaligned_array_join setting. -->
<name>array_join</name>
<type>once</type>
<stop_conditions>
<!-- NOTE(review): any_of presumably ends the run as soon as either limit below is reached;
     confirm against the performance-test runner. -->
<any_of>
<average_speed_not_changing_for_ms>10000</average_speed_not_changing_for_ms>
<total_time_ms>1000</total_time_ms>
</any_of>
</stop_conditions>
<!-- The only collected metric, max_rows_per_second, is also the main metric. -->
<metrics>
<max_rows_per_second />
</metrics>
<main_metric>
<max_rows_per_second />
</main_metric>
<!-- Baseline: both arrays have one element and the setting is not specified in the query. -->
<query>SELECT count() FROM (SELECT [number] a, [number * 2] b FROM system.numbers) AS t ARRAY JOIN a, b WHERE NOT ignore(a + b)</query>
<query>SELECT count() FROM (SELECT [number] a, [number * 2] b FROM system.numbers) AS t LEFT ARRAY JOIN a, b WHERE NOT ignore(a + b)</query>
<!-- Same equal-length arrays, but with enable_unaligned_array_join = 1. -->
<query>SELECT count() FROM (SELECT [number] a, [number * 2] b FROM system.numbers) AS t ARRAY JOIN a, b WHERE NOT ignore(a + b) SETTINGS enable_unaligned_array_join = 1</query>
<query>SELECT count() FROM (SELECT [number] a, [number * 2] b FROM system.numbers) AS t LEFT ARRAY JOIN a, b WHERE NOT ignore(a + b) SETTINGS enable_unaligned_array_join = 1</query>
<!-- Arrays of different lengths (one element vs. two) with enable_unaligned_array_join = 1. -->
<query>SELECT count() FROM (SELECT [number] a, [number * 2, number] b FROM system.numbers) AS t ARRAY JOIN a, b WHERE NOT ignore(a + b) SETTINGS enable_unaligned_array_join = 1</query>
<query>SELECT count() FROM (SELECT [number] a, [number * 2, number] b FROM system.numbers) AS t LEFT ARRAY JOIN a, b WHERE NOT ignore(a + b) SETTINGS enable_unaligned_array_join = 1</query>
</test>

View File

@ -0,0 +1,24 @@
1 [0] [0] 0 0
2 [] [0,1] 0 0
2 [] [0,1] 0 1
3 [0] [] 0 0
4 [] [0] 0 0
5 [0] [0,1] 0 0
5 [0] [0,1] 0 1
7 [0] [0] 0 0
8 [] [0,1] 0 0
8 [] [0,1] 0 1
9 [0] [] 0 0
0 [] [] 0 0
1 [0] [0] 0 0
2 [] [0,1] 0 0
2 [] [0,1] 0 1
3 [0] [] 0 0
4 [] [0] 0 0
5 [0] [0,1] 0 0
5 [0] [0,1] 0 1
6 [] [] 0 0
7 [0] [0] 0 0
8 [] [0,1] 0 0
8 [] [0,1] 0 1
9 [0] [] 0 0

View File

@ -0,0 +1,2 @@
-- Unaligned ARRAY JOIN over two arrays whose lengths differ per row:
-- arr1 = range(number % 2) and arr2 = range(number % 3), for the first 10 numbers.
-- In the expected output the shorter array is padded with the default value 0, and
-- numbers where both arrays are empty (0 and 6) produce no rows.
SELECT number, arr1, arr2, x, y FROM (SELECT number, range(number % 2) AS arr1, range(number % 3) arr2 FROM system.numbers LIMIT 10) ARRAY JOIN arr1 AS x, arr2 AS y SETTINGS enable_unaligned_array_join = 1;
-- Same query with LEFT ARRAY JOIN: in the expected output, numbers where both arrays
-- are empty (0 and 6) still produce one row, with x and y set to the default value 0.
SELECT number, arr1, arr2, x, y FROM (SELECT number, range(number % 2) AS arr1, range(number % 3) arr2 FROM system.numbers LIMIT 10) LEFT ARRAY JOIN arr1 AS x, arr2 AS y SETTINGS enable_unaligned_array_join = 1;