ClickHouse/src/Functions/concat.cpp

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

241 lines
8.7 KiB
C++
Raw Normal View History

#include <Columns/ColumnString.h>
2018-09-09 23:36:06 +00:00
#include <DataTypes/DataTypeString.h>
2018-09-10 00:51:08 +00:00
#include <DataTypes/getLeastSupertype.h>
2018-09-09 23:36:06 +00:00
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/GatherUtils/Algorithms.h>
2018-09-09 23:47:56 +00:00
#include <Functions/GatherUtils/Sinks.h>
#include <Functions/GatherUtils/Slices.h>
#include <Functions/GatherUtils/Sources.h>
2021-05-17 07:30:42 +00:00
#include <Functions/IFunction.h>
2018-09-09 23:36:06 +00:00
#include <IO/WriteHelpers.h>
2021-10-02 07:13:14 +00:00
#include <base/map.h>
#include <base/range.h>
2018-09-09 23:36:06 +00:00
#include "formatString.h"
2018-09-09 23:36:06 +00:00
namespace DB
{
/// Error codes thrown by the functions in this file. Only declared here;
/// the definitions live outside this translation unit.
namespace ErrorCodes
{
    extern const int ILLEGAL_COLUMN;
    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
    extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
}
/// Makes the GatherUtils names available unqualified for the rest of the file
/// (presumably where `concat`, `StringSource`, `ConstSource` and `StringSink`
/// used below come from - confirm against the GatherUtils headers).
using namespace GatherUtils;
2020-09-07 18:00:37 +00:00
namespace
{
2018-09-09 23:36:06 +00:00
template <typename Name, bool is_injective>
class ConcatImpl : public IFunction
{
public:
static constexpr auto name = Name::name;
2021-06-01 12:20:52 +00:00
explicit ConcatImpl(ContextPtr context_) : context(context_) {}
static FunctionPtr create(ContextPtr context) { return std::make_shared<ConcatImpl>(context); }
2018-09-09 23:36:06 +00:00
String getName() const override { return name; }
2018-09-09 23:36:06 +00:00
bool isVariadic() const override { return true; }
2018-09-09 23:36:06 +00:00
size_t getNumberOfArguments() const override { return 0; }
2018-09-09 23:36:06 +00:00
bool isInjective(const ColumnsWithTypeAndName &) const override { return is_injective; }
2018-09-09 23:36:06 +00:00
2021-06-22 16:21:23 +00:00
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
2018-09-09 23:36:06 +00:00
bool useDefaultImplementationForConstants() const override { return true; }
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
if (arguments.size() < 2)
throw Exception(
2022-03-28 04:58:22 +00:00
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
"Number of arguments for function {} doesn't match: passed {}, should be at least 2",
getName(),
arguments.size());
2019-05-20 21:12:25 +00:00
2021-06-15 19:55:21 +00:00
for (const auto arg_idx : collections::range(0, arguments.size()))
2018-09-09 23:36:06 +00:00
{
2020-04-22 08:31:10 +00:00
const auto * arg = arguments[arg_idx].get();
2018-09-09 23:36:06 +00:00
if (!isStringOrFixedString(arg))
2022-03-28 04:58:22 +00:00
throw Exception(
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"Illegal type {} of argument {} of function {}",
arg->getName(),
arg_idx + 1,
getName());
2018-09-09 23:36:06 +00:00
}
return std::make_shared<DataTypeString>();
}
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
2018-09-09 23:36:06 +00:00
{
/// Format function is not proven to be faster for two arguments.
/// Actually there is overhead of 2 to 5 extra instructions for each string for checking empty strings in FormatImpl.
/// Though, benchmarks are really close, for most examples we saw executeBinary is slightly faster (0-3%).
/// For 3 and more arguments FormatImpl is much faster (up to 50-60%).
2018-09-09 23:36:06 +00:00
if (arguments.size() == 2)
2020-10-17 16:48:53 +00:00
return executeBinary(arguments, input_rows_count);
2018-09-09 23:36:06 +00:00
else
2020-10-17 16:48:53 +00:00
return executeFormatImpl(arguments, input_rows_count);
2018-09-09 23:36:06 +00:00
}
private:
2021-06-01 12:20:52 +00:00
ContextWeakPtr context;
2018-09-09 23:36:06 +00:00
ColumnPtr executeBinary(const ColumnsWithTypeAndName & arguments, size_t input_rows_count) const
2018-09-09 23:36:06 +00:00
{
2020-10-17 16:48:53 +00:00
const IColumn * c0 = arguments[0].column.get();
const IColumn * c1 = arguments[1].column.get();
2018-09-09 23:36:06 +00:00
const ColumnString * c0_string = checkAndGetColumn<ColumnString>(c0);
const ColumnString * c1_string = checkAndGetColumn<ColumnString>(c1);
const ColumnConst * c0_const_string = checkAndGetColumnConst<ColumnString>(c0);
const ColumnConst * c1_const_string = checkAndGetColumnConst<ColumnString>(c1);
auto c_res = ColumnString::create();
if (c0_string && c1_string)
concat(StringSource(*c0_string), StringSource(*c1_string), StringSink(*c_res, c0->size()));
else if (c0_string && c1_const_string)
concat(StringSource(*c0_string), ConstSource<StringSource>(*c1_const_string), StringSink(*c_res, c0->size()));
else if (c0_const_string && c1_string)
concat(ConstSource<StringSource>(*c0_const_string), StringSource(*c1_string), StringSink(*c_res, c0->size()));
else
{
/// Fallback: use generic implementation for not very important cases.
2020-10-17 16:48:53 +00:00
return executeFormatImpl(arguments, input_rows_count);
2018-09-09 23:36:06 +00:00
}
2020-10-17 16:48:53 +00:00
return c_res;
2018-09-09 23:36:06 +00:00
}
ColumnPtr executeFormatImpl(const ColumnsWithTypeAndName & arguments, size_t input_rows_count) const
2018-09-09 23:36:06 +00:00
{
2020-05-15 18:55:30 +00:00
const size_t num_arguments = arguments.size();
2020-05-16 20:15:19 +00:00
assert(num_arguments >= 2);
2020-05-15 15:48:19 +00:00
auto c_res = ColumnString::create();
2020-05-15 18:55:30 +00:00
std::vector<const ColumnString::Chars *> data(num_arguments);
std::vector<const ColumnString::Offsets *> offsets(num_arguments);
std::vector<size_t> fixed_string_sizes(num_arguments);
2022-03-28 04:58:22 +00:00
std::vector<std::optional<String>> constant_strings(num_arguments);
bool has_column_string = false;
bool has_column_fixed_string = false;
2020-05-15 18:55:30 +00:00
for (size_t i = 0; i < num_arguments; ++i)
{
2020-10-17 16:48:53 +00:00
const ColumnPtr & column = arguments[i].column;
if (const ColumnString * col = checkAndGetColumn<ColumnString>(column.get()))
{
has_column_string = true;
data[i] = &col->getChars();
offsets[i] = &col->getOffsets();
}
else if (const ColumnFixedString * fixed_col = checkAndGetColumn<ColumnFixedString>(column.get()))
{
has_column_fixed_string = true;
data[i] = &fixed_col->getChars();
fixed_string_sizes[i] = fixed_col->getN();
}
else if (const ColumnConst * const_col = checkAndGetColumnConstStringOrFixedString(column.get()))
{
constant_strings[i] = const_col->getValue<String>();
}
else
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of argument of function {}",
column->getName(), getName());
}
2018-09-09 23:36:06 +00:00
String pattern;
2020-05-15 18:55:30 +00:00
pattern.reserve(2 * num_arguments);
2020-05-15 18:55:30 +00:00
for (size_t i = 0; i < num_arguments; ++i)
pattern += "{}";
FormatImpl::formatExecute(
has_column_string,
has_column_fixed_string,
std::move(pattern),
data,
offsets,
fixed_string_sizes,
constant_strings,
c_res->getChars(),
c_res->getOffsets(),
input_rows_count);
2018-09-09 23:36:06 +00:00
2020-10-17 16:48:53 +00:00
return c_res;
2018-09-09 23:36:06 +00:00
}
};
2018-09-10 00:51:08 +00:00
2018-09-09 23:36:06 +00:00
/// Name tag for the plain `concat` instantiation of ConcatImpl.
struct NameConcat
{
    static constexpr const char * name = "concat";
};
/// Name tag for the `concatAssumeInjective` instantiation of ConcatImpl.
struct NameConcatAssumeInjective
{
    static constexpr const char * name = "concatAssumeInjective";
};
/// The two concrete instantiations of ConcatImpl: they differ only in the
/// reported name and in what isInjective() returns (false vs. true).
using FunctionConcat = ConcatImpl<NameConcat, false>;
using FunctionConcatAssumeInjective = ConcatImpl<NameConcatAssumeInjective, true>;
2018-09-10 00:51:08 +00:00
/// Also works with arrays.
2021-05-15 17:33:15 +00:00
class ConcatOverloadResolver : public IFunctionOverloadResolver
2018-09-10 00:51:08 +00:00
{
public:
static constexpr auto name = "concat";
2021-06-01 12:20:52 +00:00
static FunctionOverloadResolverPtr create(ContextPtr context) { return std::make_unique<ConcatOverloadResolver>(context); }
2018-09-10 00:51:08 +00:00
2021-06-01 12:20:52 +00:00
explicit ConcatOverloadResolver(ContextPtr context_) : context(context_) {}
2018-09-10 00:51:08 +00:00
String getName() const override { return name; }
2018-09-10 01:19:14 +00:00
size_t getNumberOfArguments() const override { return 0; }
2018-09-10 01:39:06 +00:00
bool isVariadic() const override { return true; }
2018-09-10 00:51:08 +00:00
2021-05-15 17:33:15 +00:00
FunctionBasePtr buildImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & return_type) const override
2018-09-10 00:51:08 +00:00
{
if (isArray(arguments.at(0).type))
2019-12-10 13:21:26 +00:00
{
2021-05-15 17:33:15 +00:00
return FunctionFactory::instance().getImpl("arrayConcat", context)->build(arguments);
2019-12-10 13:21:26 +00:00
}
else if (isMap(arguments.at(0).type))
{
return FunctionFactory::instance().getImpl("mapConcat", context)->build(arguments);
}
2018-09-10 00:51:08 +00:00
else
2021-05-15 17:33:15 +00:00
return std::make_unique<FunctionToFunctionBaseAdaptor>(
2021-06-15 19:55:21 +00:00
FunctionConcat::create(context), collections::map<DataTypes>(arguments, [](const auto & elem) { return elem.type; }), return_type);
2018-09-10 00:51:08 +00:00
}
2021-05-15 17:33:15 +00:00
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
2018-09-10 00:51:08 +00:00
{
2018-09-10 02:36:33 +00:00
if (arguments.size() < 2)
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
"Number of arguments for function {} doesn't match: passed {}, should be at least 2.",
getName(), arguments.size());
2018-09-10 00:51:08 +00:00
2019-11-05 13:23:09 +00:00
/// We always return Strings from concat, even if arguments were fixed strings.
return std::make_shared<DataTypeString>();
2018-09-10 00:51:08 +00:00
}
private:
2021-06-01 12:20:52 +00:00
ContextPtr context;
2018-09-10 00:51:08 +00:00
};
2020-09-07 18:00:37 +00:00
}
2018-09-10 00:51:08 +00:00
/// Registers both functions defined in this file with the function factory.
REGISTER_FUNCTION(Concat)
{
    /// `concat` goes through the overload resolver and is registered case-insensitively,
    /// with an empty first argument.
    const auto concat_case_sensitivity = FunctionFactory::CaseInsensitive;
    factory.registerFunction<ConcatOverloadResolver>({}, concat_case_sensitivity);

    /// `concatAssumeInjective` is registered with the factory defaults.
    factory.registerFunction<FunctionConcatAssumeInjective>();
}
}