2017-04-21 17:47:27 +00:00
|
|
|
|
#pragma once
|
2011-10-03 04:06:34 +00:00
|
|
|
|
|
2011-10-03 05:29:11 +00:00
|
|
|
|
#include <Poco/UTF8Encoding.h>
|
|
|
|
|
#include <Poco/Unicode.h>
|
2017-04-01 09:19:00 +00:00
|
|
|
|
#include <Columns/ColumnConst.h>
|
|
|
|
|
#include <Columns/ColumnFixedString.h>
|
|
|
|
|
#include <Columns/ColumnString.h>
|
|
|
|
|
#include <Columns/ColumnString.h>
|
2017-07-13 20:58:19 +00:00
|
|
|
|
#include <Common/typeid_cast.h>
|
2017-04-01 09:19:00 +00:00
|
|
|
|
#include <DataTypes/DataTypeFixedString.h>
|
|
|
|
|
#include <DataTypes/DataTypeString.h>
|
|
|
|
|
#include <Functions/IFunction.h>
|
2017-07-21 06:35:58 +00:00
|
|
|
|
#include <Functions/FunctionHelpers.h>
|
2015-05-28 15:34:37 +00:00
|
|
|
|
|
2011-10-03 04:06:34 +00:00
|
|
|
|
|
|
|
|
|
namespace DB
|
|
|
|
|
{
|
2017-06-13 02:06:53 +00:00
|
|
|
|
|
|
|
|
|
/// Error codes used by the functions in this header. They are only declared here (`extern`);
/// the definitions live elsewhere in the project.
namespace ErrorCodes
{
    extern const int ILLEGAL_COLUMN;
    /// Thrown by FunctionStringToString::getReturnTypeImpl; declared explicitly so that this header
    /// does not depend on another include happening to declare it.
    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
}
|
|
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
|
/** String functions
|
2011-10-03 04:06:34 +00:00
|
|
|
|
*
|
2012-09-23 00:04:17 +00:00
|
|
|
|
* length, empty, notEmpty,
|
2016-06-23 22:28:15 +00:00
|
|
|
|
* concat, substring, lower, upper, reverse
|
|
|
|
|
* lengthUTF8, substringUTF8, lowerUTF8, upperUTF8, reverseUTF8
|
2011-10-03 04:06:34 +00:00
|
|
|
|
*
|
2017-04-01 07:20:54 +00:00
|
|
|
|
* s -> UInt8: empty, notEmpty
|
2017-05-12 14:01:02 +00:00
|
|
|
|
* s -> UInt64: length, lengthUTF8
|
|
|
|
|
* s -> s: lower, upper, lowerUTF8, upperUTF8, reverse, reverseUTF8
|
|
|
|
|
* s, s -> s: concat
|
|
|
|
|
* s, c1, c2 -> s: substring, substringUTF8
|
2017-04-01 07:20:54 +00:00
|
|
|
|
* s, c1, c2, s2 -> s: replace, replaceUTF8
|
2014-06-26 00:58:14 +00:00
|
|
|
|
*
|
2017-05-27 15:45:25 +00:00
|
|
|
|
* The search functions for strings and regular expressions are located separately.
|
|
|
|
|
* URL functions are located separately.
|
|
|
|
|
* String encoding functions, converting to other types are located separately.
|
2012-09-03 04:45:46 +00:00
|
|
|
|
*
|
2017-05-27 15:45:25 +00:00
|
|
|
|
* The functions length, empty, notEmpty, reverse also work with arrays.
|
2011-10-03 04:06:34 +00:00
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
|
2017-03-10 17:53:32 +00:00
|
|
|
|
/** Compile-time switch between "flip bits" and "leave as is".
  *
  * The primary template (used for the `true` instantiation) returns `c` XOR-ed with `mask`,
  * truncated back to a single byte. The `false` specialization returns `c` unchanged and ignores the mask.
  */
template <bool>
UInt8 xor_or_identity(const UInt8 c, const int mask)
{
    /// `c` is promoted to int for the XOR; only the low byte of the result is kept.
    return static_cast<UInt8>(c ^ mask);
}

/// Identity variant: the mask is accepted only to keep both variants call-compatible.
template <>
inline UInt8 xor_or_identity<false>(const UInt8 c, const int)
{
    return c;
}
|
2015-06-04 12:16:42 +00:00
|
|
|
|
|
|
|
|
|
/// It is caller's responsibility to ensure the presence of a valid cyrillic sequence in array
|
|
|
|
|
template <bool to_lower>
|
2017-07-21 06:35:58 +00:00
|
|
|
|
inline void UTF8CyrillicToCase(const UInt8 *& src, const UInt8 * src_end, UInt8 *& dst)
|
2015-06-04 12:16:42 +00:00
|
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
|
if (src[0] == 0xD0u && (src[1] >= 0x80u && src[1] <= 0x8Fu))
|
|
|
|
|
{
|
|
|
|
|
/// ЀЁЂЃЄЅІЇЈЉЊЋЌЍЎЏ
|
|
|
|
|
*dst++ = xor_or_identity<to_lower>(*src++, 0x1);
|
|
|
|
|
*dst++ = xor_or_identity<to_lower>(*src++, 0x10);
|
|
|
|
|
}
|
|
|
|
|
else if (src[0] == 0xD1u && (src[1] >= 0x90u && src[1] <= 0x9Fu))
|
|
|
|
|
{
|
|
|
|
|
/// ѐёђѓєѕіїјљњћќѝўџ
|
|
|
|
|
*dst++ = xor_or_identity<!to_lower>(*src++, 0x1);
|
|
|
|
|
*dst++ = xor_or_identity<!to_lower>(*src++, 0x10);
|
|
|
|
|
}
|
|
|
|
|
else if (src[0] == 0xD0u && (src[1] >= 0x90u && src[1] <= 0x9Fu))
|
|
|
|
|
{
|
|
|
|
|
/// А-П
|
|
|
|
|
*dst++ = *src++;
|
|
|
|
|
*dst++ = xor_or_identity<to_lower>(*src++, 0x20);
|
|
|
|
|
}
|
|
|
|
|
else if (src[0] == 0xD0u && (src[1] >= 0xB0u && src[1] <= 0xBFu))
|
|
|
|
|
{
|
|
|
|
|
/// а-п
|
|
|
|
|
*dst++ = *src++;
|
|
|
|
|
*dst++ = xor_or_identity<!to_lower>(*src++, 0x20);
|
|
|
|
|
}
|
|
|
|
|
else if (src[0] == 0xD0u && (src[1] >= 0xA0u && src[1] <= 0xAFu))
|
|
|
|
|
{
|
2017-05-12 14:01:02 +00:00
|
|
|
|
/// Р-Я
|
2017-04-01 07:20:54 +00:00
|
|
|
|
*dst++ = xor_or_identity<to_lower>(*src++, 0x1);
|
|
|
|
|
*dst++ = xor_or_identity<to_lower>(*src++, 0x20);
|
|
|
|
|
}
|
|
|
|
|
else if (src[0] == 0xD1u && (src[1] >= 0x80u && src[1] <= 0x8Fu))
|
|
|
|
|
{
|
|
|
|
|
/// р-я
|
|
|
|
|
*dst++ = xor_or_identity<!to_lower>(*src++, 0x1);
|
|
|
|
|
*dst++ = xor_or_identity<!to_lower>(*src++, 0x20);
|
|
|
|
|
}
|
2017-03-10 17:53:32 +00:00
|
|
|
|
}
|
|
|
|
|
|
2015-06-04 12:16:42 +00:00
|
|
|
|
|
2017-05-27 15:45:25 +00:00
|
|
|
|
/** If the string contains UTF-8 encoded text, convert it to the lower (upper) case.
|
|
|
|
|
* Note: It is assumed that after the character is converted to another case,
|
|
|
|
|
* the length of its multibyte sequence in UTF-8 does not change.
|
|
|
|
|
* Otherwise, the behavior is undefined.
|
2015-06-10 12:47:27 +00:00
|
|
|
|
*/
|
2017-03-10 17:53:32 +00:00
|
|
|
|
template <char not_case_lower_bound,
|
2017-04-01 07:20:54 +00:00
|
|
|
|
char not_case_upper_bound,
|
|
|
|
|
int to_case(int),
|
|
|
|
|
void cyrillic_to_case(const UInt8 *&, const UInt8 *, UInt8 *&)>
|
2015-06-10 12:47:27 +00:00
|
|
|
|
struct LowerUpperUTF8Impl
|
2015-05-28 12:32:43 +00:00
|
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
|
static void vector(const ColumnString::Chars_t & data,
|
|
|
|
|
const ColumnString::Offsets_t & offsets,
|
|
|
|
|
ColumnString::Chars_t & res_data,
|
|
|
|
|
ColumnString::Offsets_t & res_offsets);
|
2015-05-28 12:32:43 +00:00
|
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
|
static void vector_fixed(const ColumnString::Chars_t & data, size_t n, ColumnString::Chars_t & res_data);
|
2015-05-28 12:32:43 +00:00
|
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
|
static void constant(const std::string & data, std::string & res_data);
|
2015-05-28 12:32:43 +00:00
|
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
|
/** Converts a single code point starting at `src` to desired case, storing result starting at `dst`.
|
|
|
|
|
* `src` and `dst` are incremented by corresponding sequence lengths. */
|
2017-07-21 06:35:58 +00:00
|
|
|
|
static void toCase(const UInt8 *& src, const UInt8 * src_end, UInt8 *& dst);
|
2015-09-24 12:58:18 +00:00
|
|
|
|
|
2015-05-28 12:32:43 +00:00
|
|
|
|
private:
|
2017-04-01 07:20:54 +00:00
|
|
|
|
static constexpr auto ascii_upper_bound = '\x7f';
|
|
|
|
|
static constexpr auto flip_case_mask = 'A' ^ 'a';
|
2015-09-24 12:58:18 +00:00
|
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
|
static void array(const UInt8 * src, const UInt8 * src_end, UInt8 * dst);
|
2011-10-10 12:26:26 +00:00
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
2017-03-10 17:53:32 +00:00
|
|
|
|
template <typename Impl, typename Name, bool is_injective = false>
|
|
|
|
|
class FunctionStringToString : public IFunction
|
2011-10-03 04:06:34 +00:00
|
|
|
|
{
|
|
|
|
|
public:
|
2017-04-01 07:20:54 +00:00
|
|
|
|
static constexpr auto name = Name::name;
|
|
|
|
|
static FunctionPtr create(const Context & context)
|
|
|
|
|
{
|
|
|
|
|
return std::make_shared<FunctionStringToString>();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
String getName() const override
|
|
|
|
|
{
|
|
|
|
|
return name;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
size_t getNumberOfArguments() const override
|
|
|
|
|
{
|
|
|
|
|
return 1;
|
|
|
|
|
}
|
2017-07-23 08:40:43 +00:00
|
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
|
bool isInjective(const Block &) override
|
|
|
|
|
{
|
|
|
|
|
return is_injective;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
|
|
|
|
|
{
|
2017-07-21 06:35:58 +00:00
|
|
|
|
if (!checkDataType<DataTypeString>(&*arguments[0]) && !checkDataType<DataTypeFixedString>(&*arguments[0]))
|
2017-04-01 07:20:54 +00:00
|
|
|
|
throw Exception(
|
|
|
|
|
"Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
|
|
|
|
|
|
|
|
|
return arguments[0]->clone();
|
|
|
|
|
}
|
|
|
|
|
|
2017-07-23 08:40:43 +00:00
|
|
|
|
bool useDefaultImplementationForConstants() const override { return true; }
|
|
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
|
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) override
|
|
|
|
|
{
|
2017-07-21 06:35:58 +00:00
|
|
|
|
const ColumnPtr column = block.getByPosition(arguments[0]).column;
|
|
|
|
|
if (const ColumnString * col = checkAndGetColumn<ColumnString>(column.get()))
|
2017-04-01 07:20:54 +00:00
|
|
|
|
{
|
|
|
|
|
std::shared_ptr<ColumnString> col_res = std::make_shared<ColumnString>();
|
2017-07-21 06:35:58 +00:00
|
|
|
|
block.getByPosition(result).column = col_res;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
Impl::vector(col->getChars(), col->getOffsets(), col_res->getChars(), col_res->getOffsets());
|
|
|
|
|
}
|
2017-07-21 06:35:58 +00:00
|
|
|
|
else if (const ColumnFixedString * col = checkAndGetColumn<ColumnFixedString>(column.get()))
|
2017-04-01 07:20:54 +00:00
|
|
|
|
{
|
|
|
|
|
auto col_res = std::make_shared<ColumnFixedString>(col->getN());
|
2017-07-21 06:35:58 +00:00
|
|
|
|
block.getByPosition(result).column = col_res;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
Impl::vector_fixed(col->getChars(), col->getN(), col_res->getChars());
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
throw Exception(
|
2017-07-21 06:35:58 +00:00
|
|
|
|
"Illegal column " + block.getByPosition(arguments[0]).column->getName() + " of argument of function " + getName(),
|
2017-04-01 07:20:54 +00:00
|
|
|
|
ErrorCodes::ILLEGAL_COLUMN);
|
|
|
|
|
}
|
2011-10-03 05:29:11 +00:00
|
|
|
|
};
|
|
|
|
|
|
2017-03-10 17:53:32 +00:00
|
|
|
|
/// Name tag for the lowerUTF8 function; used as the `Name` parameter of FunctionStringToString.
struct NameLowerUTF8
{
    static constexpr const char * name = "lowerUTF8";
};
|
2017-03-10 17:53:32 +00:00
|
|
|
|
/// Name tag for the upperUTF8 function; used as the `Name` parameter of FunctionStringToString.
struct NameUpperUTF8
{
    static constexpr const char * name = "upperUTF8";
};
|
|
|
|
|
|
|
|
|
|
|
2017-06-10 09:04:31 +00:00
|
|
|
|
/// lowerUTF8: range bounds 'A'..'Z', Poco::Unicode::toLower per code point, Cyrillic helper in "to lower" mode.
using FunctionLowerUTF8 = FunctionStringToString<
    LowerUpperUTF8Impl<'A', 'Z', Poco::Unicode::toLower, UTF8CyrillicToCase<true>>,
    NameLowerUTF8>;

/// upperUTF8: range bounds 'a'..'z', Poco::Unicode::toUpper per code point, Cyrillic helper in "to upper" mode.
using FunctionUpperUTF8 = FunctionStringToString<
    LowerUpperUTF8Impl<'a', 'z', Poco::Unicode::toUpper, UTF8CyrillicToCase<false>>,
    NameUpperUTF8>;
|
|
|
|
|
|
2011-10-03 04:06:34 +00:00
|
|
|
|
}
|