ClickHouse/dbms/include/DB/Functions/FunctionsString.h

1174 lines
38 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#pragma once
#include <Poco/UTF8Encoding.h>
#include <Poco/Unicode.h>
#include <DB/DataTypes/DataTypesNumberFixed.h>
#include <DB/DataTypes/DataTypeString.h>
#include <DB/DataTypes/DataTypeFixedString.h>
#include <DB/DataTypes/DataTypeArray.h>
#include <DB/Columns/ColumnString.h>
#include <DB/Columns/ColumnArray.h>
#include <DB/Columns/ColumnFixedString.h>
#include <DB/Columns/ColumnConst.h>
#include <DB/Functions/IFunction.h>
namespace DB
{
/** Функции работы со строками:
*
* length, empty, notEmpty,
* concat, substring, lower, upper, reverse, // left, right, insert, replace,
* // escape, quote, unescape, unquote, trim, trimLeft, trimRight, pad, padLeft
* lengthUTF8, substringUTF8, lowerUTF8, upperUTF8, reverseUTF8, // leftUTF8, rightUTF8, insertUTF8, replaceUTF8
* // padUTF8, padLeftUTF8.
*
* s -> UInt8: empty, notEmpty
* s -> UInt64: length, lengthUTF8
* s -> s: lower, upper, lowerUTF8, upperUTF8, reverse, reverseUTF8, escape, quote, unescape, unquote, trim, trimLeft, trimRight
* s, s -> s: concat
* s, c1, c2 -> s: substring, substringUTF8
* s, c1 -> s: left, right, leftUTF8, rightUTF8
* s, c1, s2 -> s: insert, insertUTF8, pad, padLeft, padUTF8, padLeftUTF8
* s, c1, c2, s2 -> s: replace, replaceUTF8
*
* Функции поиска строк и регулярных выражений расположены отдельно.
* Функции работы с URL расположены отдельно.
* Функции кодирования строк, конвертации в другие типы расположены отдельно.
*
* Функции length, empty, notEmpty также работают с массивами.
*/
template <bool negative = false>
struct EmptyImpl
{
static void vector(const ColumnString::Chars_t & data, const ColumnString::Offsets_t & offsets,
PODArray<UInt8> & res)
{
size_t size = offsets.size();
ColumnString::Offset_t prev_offset = 1;
for (size_t i = 0; i < size; ++i)
{
res[i] = negative ^ (offsets[i] == prev_offset);
prev_offset = offsets[i] + 1;
}
}
static void vector_fixed_to_constant(const ColumnString::Chars_t & data, size_t n,
UInt8 & res)
{
res = negative ^ (n == 0);
}
static void vector_fixed_to_vector(const ColumnString::Chars_t & data, size_t n,
PODArray<UInt8> & res)
{
}
static void constant(const std::string & data, UInt8 & res)
{
res = negative ^ (data.empty());
}
static void array(const ColumnString::Offsets_t & offsets, PODArray<UInt8> & res)
{
size_t size = offsets.size();
ColumnString::Offset_t prev_offset = 0;
for (size_t i = 0; i < size; ++i)
{
res[i] = negative ^ (offsets[i] == prev_offset);
prev_offset = offsets[i];
}
}
static void constant_array(const Array & data, UInt8 & res)
{
res = negative ^ (data.empty());
}
};
/** Вычисляет длину строки в байтах.
*/
struct LengthImpl
{
static void vector(const ColumnString::Chars_t & data, const ColumnString::Offsets_t & offsets,
PODArray<UInt64> & res)
{
size_t size = offsets.size();
for (size_t i = 0; i < size; ++i)
res[i] = i == 0
? (offsets[i] - 1)
: (offsets[i] - 1 - offsets[i - 1]);
}
static void vector_fixed_to_constant(const ColumnString::Chars_t & data, size_t n,
UInt64 & res)
{
res = n;
}
static void vector_fixed_to_vector(const ColumnString::Chars_t & data, size_t n,
PODArray<UInt64> & res)
{
}
static void constant(const std::string & data, UInt64 & res)
{
res = data.size();
}
static void array(const ColumnString::Offsets_t & offsets, PODArray<UInt64> & res)
{
size_t size = offsets.size();
for (size_t i = 0; i < size; ++i)
res[i] = i == 0
? (offsets[i])
: (offsets[i] - offsets[i - 1]);
}
static void constant_array(const Array & data, UInt64 & res)
{
res = data.size();
}
};
/** Если строка представляет собой текст в кодировке UTF-8, то возвращает длину текста в кодовых точках.
* (не в символах: длина текста "ё" может быть как 1, так и 2, в зависимости от нормализации)
* Иначе - поведение не определено.
*/
struct LengthUTF8Impl
{
static void vector(const ColumnString::Chars_t & data, const ColumnString::Offsets_t & offsets,
PODArray<UInt64> & res)
{
size_t size = offsets.size();
ColumnString::Offset_t prev_offset = 0;
for (size_t i = 0; i < size; ++i)
{
res[i] = 0;
for (const UInt8 * c = &data[prev_offset]; c + 1 < &data[offsets[i]]; ++c)
if (*c <= 0x7F || *c >= 0xC0)
++res[i];
prev_offset = offsets[i];
}
}
static void vector_fixed_to_constant(const ColumnString::Chars_t & data, size_t n,
UInt64 & res)
{
}
static void vector_fixed_to_vector(const ColumnString::Chars_t & data, size_t n,
PODArray<UInt64> & res)
{
size_t size = data.size() / n;
for (size_t i = 0; i < size; ++i)
{
res[i] = 0;
for (const UInt8 * c = &data[i * n]; c < &data[(i + 1) * n]; ++c)
if (*c <= 0x7F || *c >= 0xC0)
++res[i];
}
}
static void constant(const std::string & data, UInt64 & res)
{
res = 0;
for (const UInt8 * c = reinterpret_cast<const UInt8 *>(data.data()); c < reinterpret_cast<const UInt8 *>(data.data() + data.size()); ++c)
if (*c <= 0x7F || *c >= 0xC0)
++res;
}
static void array(const ColumnString::Offsets_t & offsets, PODArray<UInt64> & res)
{
throw Exception("Cannot apply function lengthUTF8 to Array argument", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
}
static void constant_array(const Array & data, UInt64 & res)
{
throw Exception("Cannot apply function lengthUTF8 to Array argument", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
}
};
/** Переводит строку в нижний (верхний) регистр, в текущей локали, в однобайтовой кодировке.
*/
template <int F(int)>
struct LowerUpperImpl
{
static void vector(const ColumnString::Chars_t & data, const ColumnString::Offsets_t & offsets,
ColumnString::Chars_t & res_data, ColumnString::Offsets_t & res_offsets)
{
res_data.resize(data.size());
res_offsets.assign(offsets);
array(&*data.begin(), &*data.end(), &*res_data.begin());
}
static void vector_fixed(const ColumnString::Chars_t & data, size_t n,
ColumnString::Chars_t & res_data)
{
res_data.resize(data.size());
array(&*data.begin(), &*data.end(), &*res_data.begin());
}
static void constant(const std::string & data, std::string & res_data)
{
res_data.resize(data.size());
array(reinterpret_cast<const UInt8 *>(&*data.begin()), reinterpret_cast<const UInt8 *>(&*data.end()),
reinterpret_cast<UInt8 *>(&*res_data.begin()));
}
private:
static void array(const UInt8 * src, const UInt8 * src_end, UInt8 * dst)
{
for (; src < src_end; ++src, ++dst)
*dst = F(*src);
}
};
/** Если строка содержит текст в кодировке UTF-8 - перевести его в нижний (верхний) регистр.
* Замечание: предполагается, что после перевода символа в другой регистр,
* длина его мультибайтовой последовательности в UTF-8 не меняется.
* Иначе - поведение не определено.
*/
template <int F(int)>
struct LowerUpperUTF8Impl
{
static void vector(const ColumnString::Chars_t & data, const ColumnString::Offsets_t & offsets,
ColumnString::Chars_t & res_data, ColumnString::Offsets_t & res_offsets)
{
res_data.resize(data.size());
res_offsets.assign(offsets);
array(&*data.begin(), &*data.end(), &*res_data.begin());
}
static void vector_fixed(const ColumnString::Chars_t & data, size_t n,
ColumnString::Chars_t & res_data)
{
res_data.resize(data.size());
array(&*data.begin(), &*data.end(), &*res_data.begin());
}
static void constant(const std::string & data, std::string & res_data)
{
res_data.resize(data.size());
array(reinterpret_cast<const UInt8 *>(&*data.begin()), reinterpret_cast<const UInt8 *>(&*data.end()),
reinterpret_cast<UInt8 *>(&*res_data.begin()));
}
private:
static void array(const UInt8 * src, const UInt8 * src_end, UInt8 * dst)
{
static Poco::UTF8Encoding utf8;
while (src < src_end)
{
int chars = utf8.convert(F(utf8.convert(src)), dst, src_end - src);
if (chars)
{
src += chars;
dst += chars;
}
else
{
++src;
++dst;
}
}
}
};
/** Разворачивает строку в байтах.
*/
struct ReverseImpl
{
static void vector(const ColumnString::Chars_t & data, const ColumnString::Offsets_t & offsets,
ColumnString::Chars_t & res_data, ColumnString::Offsets_t & res_offsets)
{
res_data.resize(data.size());
res_offsets.assign(offsets);
size_t size = offsets.size();
ColumnString::Offset_t prev_offset = 0;
for (size_t i = 0; i < size; ++i)
{
for (size_t j = prev_offset; j < offsets[i] - 1; ++j)
res_data[j] = data[offsets[i] + prev_offset - 2 - j];
res_data[offsets[i] - 1] = 0;
prev_offset = offsets[i];
}
}
static void vector_fixed(const ColumnString::Chars_t & data, size_t n,
ColumnString::Chars_t & res_data)
{
res_data.resize(data.size());
size_t size = data.size() / n;
for (size_t i = 0; i < size; ++i)
for (size_t j = i * n; j < (i + 1) * n; ++j)
res_data[j] = data[(i * 2 + 1) * n - j - 1];
}
static void constant(const std::string & data, std::string & res_data)
{
res_data.resize(data.size());
for (size_t j = 0; j < data.size(); ++j)
res_data[j] = data[data.size() - j - 1];
}
};
/** Разворачивает последовательность кодовых точек в строке в кодировке UTF-8.
* Результат может не соответствовать ожидаемому, так как модифицирующие кодовые точки (например, диакритика) могут примениться не к тем символам.
* Если строка не в кодировке UTF-8, то поведение не определено.
*/
struct ReverseUTF8Impl
{
static void vector(const ColumnString::Chars_t & data, const ColumnString::Offsets_t & offsets,
ColumnString::Chars_t & res_data, ColumnString::Offsets_t & res_offsets)
{
res_data.resize(data.size());
res_offsets.assign(offsets);
size_t size = offsets.size();
ColumnString::Offset_t prev_offset = 0;
for (size_t i = 0; i < size; ++i)
{
ColumnString::Offset_t j = prev_offset;
while (j < offsets[i] - 1)
{
if (data[j] < 0xBF)
{
res_data[offsets[i] + prev_offset - 2 - j] = data[j];
j += 1;
}
else if (data[j] < 0xE0)
{
memcpy(&res_data[offsets[i] + prev_offset - 2 - j - 1], &data[j], 2);
j += 2;
}
else if (data[j] < 0xF0)
{
memcpy(&res_data[offsets[i] + prev_offset - 2 - j - 2], &data[j], 3);
j += 3;
}
else
{
res_data[offsets[i] + prev_offset - 2 - j] = data[j];
j += 1;
}
}
res_data[offsets[i] - 1] = 0;
prev_offset = offsets[i];
}
}
static void vector_fixed(const ColumnString::Chars_t & data, size_t n,
ColumnString::Chars_t & res_data)
{
throw Exception("Cannot apply function reverseUTF8 to fixed string.", ErrorCodes::ILLEGAL_COLUMN);
}
static void constant(const std::string & data, std::string & res_data)
{
res_data.resize(data.size());
size_t j = 0;
while (j < data.size())
{
if (static_cast<unsigned char>(data[j]) < 0xBF)
{
res_data[data.size() - 1 - j] = data[j];
j += 1;
}
else if (static_cast<unsigned char>(data[j]) < 0xE0)
{
memcpy(&res_data[data.size() - 1 - j - 1], &data[j], 2);
j += 2;
}
else if (static_cast<unsigned char>(data[j]) < 0xF0)
{
memcpy(&res_data[data.size() - 1 - j - 2], &data[j], 3);
j += 3;
}
else
{
res_data[data.size() - 1 - j] = data[j];
j += 1;
}
}
}
};
/** Склеивает две строки.
*/
struct ConcatImpl
{
static void vector_vector(
const ColumnString::Chars_t & a_data, const ColumnString::Offsets_t & a_offsets,
const ColumnString::Chars_t & b_data, const ColumnString::Offsets_t & b_offsets,
ColumnString::Chars_t & c_data, ColumnString::Offsets_t & c_offsets)
{
size_t size = a_offsets.size();
c_data.resize(a_data.size() + b_data.size() - size);
c_offsets.resize(size);
ColumnString::Offset_t offset = 0;
ColumnString::Offset_t a_offset = 0;
ColumnString::Offset_t b_offset = 0;
for (size_t i = 0; i < size; ++i)
{
memcpy(&c_data[offset], &a_data[a_offset], a_offsets[i] - a_offset - 1);
offset += a_offsets[i] - a_offset - 1;
memcpy(&c_data[offset], &b_data[b_offset], b_offsets[i] - b_offset);
offset += b_offsets[i] - b_offset;
a_offset = a_offsets[i];
b_offset = b_offsets[i];
c_offsets[i] = offset;
}
}
static void vector_fixed_vector(
const ColumnString::Chars_t & a_data, const ColumnString::Offsets_t & a_offsets,
const ColumnString::Chars_t & b_data, ColumnString::Offset_t b_n,
ColumnString::Chars_t & c_data, ColumnString::Offsets_t & c_offsets)
{
size_t size = a_offsets.size();
c_data.resize(a_data.size() + b_data.size());
c_offsets.resize(size);
ColumnString::Offset_t offset = 0;
ColumnString::Offset_t a_offset = 0;
ColumnString::Offset_t b_offset = 0;
for (size_t i = 0; i < size; ++i)
{
memcpy(&c_data[offset], &a_data[a_offset], a_offsets[i] - a_offset - 1);
offset += a_offsets[i] - a_offset - 1;
memcpy(&c_data[offset], &b_data[b_offset], b_n);
offset += b_n;
c_data[offset] = 0;
offset += 1;
a_offset = a_offsets[i];
b_offset += b_n;
c_offsets[i] = offset;
}
}
static void vector_constant(
const ColumnString::Chars_t & a_data, const ColumnString::Offsets_t & a_offsets,
const std::string & b,
ColumnString::Chars_t & c_data, ColumnString::Offsets_t & c_offsets)
{
size_t size = a_offsets.size();
c_data.resize(a_data.size() + b.size() * size);
c_offsets.assign(a_offsets);
for (size_t i = 0; i < size; ++i)
c_offsets[i] += b.size() * (i + 1);
ColumnString::Offset_t offset = 0;
ColumnString::Offset_t a_offset = 0;
for (size_t i = 0; i < size; ++i)
{
memcpy(&c_data[offset], &a_data[a_offset], a_offsets[i] - a_offset - 1);
offset += a_offsets[i] - a_offset - 1;
memcpy(&c_data[offset], b.data(), b.size() + 1);
offset += b.size() + 1;
a_offset = a_offsets[i];
}
}
static void fixed_vector_vector(
const ColumnString::Chars_t & a_data, ColumnString::Offset_t a_n,
const ColumnString::Chars_t & b_data, const ColumnString::Offsets_t & b_offsets,
ColumnString::Chars_t & c_data, ColumnString::Offsets_t & c_offsets)
{
size_t size = b_offsets.size();
c_data.resize(a_data.size() + b_data.size());
c_offsets.resize(size);
ColumnString::Offset_t offset = 0;
ColumnString::Offset_t a_offset = 0;
ColumnString::Offset_t b_offset = 0;
for (size_t i = 0; i < size; ++i)
{
memcpy(&c_data[offset], &a_data[a_offset], a_n);
offset += a_n;
memcpy(&c_data[offset], &b_data[b_offset], b_offsets[i] - b_offset);
offset += b_offsets[i] - b_offset;
a_offset = a_n;
b_offset = b_offsets[i];
c_offsets[i] = offset;
}
}
static void fixed_vector_fixed_vector(
const ColumnString::Chars_t & a_data, ColumnString::Offset_t a_n,
const ColumnString::Chars_t & b_data, ColumnString::Offset_t b_n,
ColumnString::Chars_t & c_data, ColumnString::Offsets_t & c_offsets)
{
size_t size = a_data.size() / a_n;
c_data.resize(a_data.size() + b_data.size() + size);
c_offsets.resize(size);
ColumnString::Offset_t offset = 0;
for (size_t i = 0; i < size; ++i)
{
memcpy(&c_data[offset], &a_data[i * a_n], a_n);
offset += a_n;
memcpy(&c_data[offset], &b_data[i * b_n], b_n);
offset += b_n;
c_data[offset] = 0;
++offset;
c_offsets[i] = offset;
}
}
static void fixed_vector_constant(
const ColumnString::Chars_t & a_data, ColumnString::Offset_t a_n,
const std::string & b,
ColumnString::Chars_t & c_data, ColumnString::Offsets_t & c_offsets)
{
size_t size = a_data.size() / a_n;
ColumnString::Offset_t b_n = b.size();
c_data.resize(a_data.size() + size * b_n + size);
c_offsets.resize(size);
ColumnString::Offset_t offset = 0;
for (size_t i = 0; i < size; ++i)
{
memcpy(&c_data[offset], &a_data[i * a_n], a_n);
offset += a_n;
memcpy(&c_data[offset], b.data(), b_n);
offset += b_n;
c_data[offset] = 0;
++offset;
c_offsets[i] = offset;
}
}
static void constant_vector(
const std::string & a,
const ColumnString::Chars_t & b_data, const ColumnString::Offsets_t & b_offsets,
ColumnString::Chars_t & c_data, ColumnString::Offsets_t & c_offsets)
{
size_t size = b_offsets.size();
c_data.resize(b_data.size() + a.size() * size);
c_offsets.assign(b_offsets);
for (size_t i = 0; i < size; ++i)
c_offsets[i] += a.size() * (i + 1);
ColumnString::Offset_t offset = 0;
ColumnString::Offset_t b_offset = 0;
for (size_t i = 0; i < size; ++i)
{
memcpy(&c_data[offset], a.data(), a.size());
offset += a.size();
memcpy(&c_data[offset], &b_data[b_offset], b_offsets[i] - b_offset);
offset += b_offsets[i] - b_offset;
b_offset = b_offsets[i];
}
}
static void constant_fixed_vector(
const std::string & a,
const ColumnString::Chars_t & b_data, ColumnString::Offset_t b_n,
ColumnString::Chars_t & c_data, ColumnString::Offsets_t & c_offsets)
{
size_t size = b_data.size() / b_n;
ColumnString::Offset_t a_n = a.size();
c_data.resize(size * a_n + b_data.size() + size);
c_offsets.resize(size);
ColumnString::Offset_t offset = 0;
for (size_t i = 0; i < size; ++i)
{
memcpy(&c_data[offset], a.data(), a_n);
offset += a_n;
memcpy(&c_data[offset], &b_data[i * b_n], b_n);
offset += b_n;
c_data[offset] = 0;
++offset;
c_offsets[i] = offset;
}
}
static void constant_constant(
const std::string & a,
const std::string & b,
std::string & c)
{
c = a + b;
}
};
/** Выделяет подстроку в строке, как последовательности байт.
*/
struct SubstringImpl
{
static void vector(const ColumnString::Chars_t & data, const ColumnString::Offsets_t & offsets,
size_t start, size_t length,
ColumnString::Chars_t & res_data, ColumnString::Offsets_t & res_offsets)
{
res_data.reserve(data.size());
size_t size = offsets.size();
res_offsets.resize(size);
ColumnString::Offset_t prev_offset = 0;
ColumnString::Offset_t res_offset = 0;
for (size_t i = 0; i < size; ++i)
{
if (start + prev_offset >= offsets[i] + 1)
{
res_data.resize(res_data.size() + 1);
res_data[res_offset] = 0;
++res_offset;
}
else
{
size_t bytes_to_copy = std::min(offsets[i] - prev_offset - start, length);
res_data.resize(res_data.size() + bytes_to_copy + 1);
memcpy(&res_data[res_offset], &data[prev_offset + start - 1], bytes_to_copy);
res_offset += bytes_to_copy + 1;
res_data[res_offset - 1] = 0;
}
res_offsets[i] = res_offset;
prev_offset = offsets[i];
}
}
static void vector_fixed(const ColumnString::Chars_t & data, size_t n,
size_t start, size_t length,
ColumnString::Chars_t & res_data, ColumnString::Offsets_t & res_offsets)
{
if (length == 0 || start + length > n + 1)
throw Exception("Index out of bound for function substring of fixed size value", ErrorCodes::ARGUMENT_OUT_OF_BOUND);
size_t size = data.size() / n;
res_offsets.resize(size);
res_data.resize(length * size + size);
ColumnString::Offset_t res_offset = 0;
for (size_t i = 0; i < size; ++i)
{
memcpy(&res_data[res_offset], &data[i * n + start - 1], length);
res_offset += length;
res_data[res_offset] = 0;
++res_offset;
res_offsets[i] = res_offset;
}
}
static void constant(const std::string & data,
size_t start, size_t length,
std::string & res_data)
{
if (start + length > data.size() + 1)
throw Exception("Index out of bound for function substring of fixed size value", ErrorCodes::ARGUMENT_OUT_OF_BOUND);
res_data = data.substr(start - 1, length);
}
};
/** Если строка в кодировке UTF-8, то выделяет в ней подстроку кодовых точек.
* Иначе - поведение не определено.
*/
struct SubstringUTF8Impl
{
static void vector(const ColumnString::Chars_t & data, const ColumnString::Offsets_t & offsets,
size_t start, size_t length,
ColumnString::Chars_t & res_data, ColumnString::Offsets_t & res_offsets)
{
res_data.reserve(data.size());
size_t size = offsets.size();
res_offsets.resize(size);
ColumnString::Offset_t prev_offset = 0;
ColumnString::Offset_t res_offset = 0;
for (size_t i = 0; i < size; ++i)
{
ColumnString::Offset_t j = prev_offset;
ColumnString::Offset_t pos = 1;
ColumnString::Offset_t bytes_start = 0;
ColumnString::Offset_t bytes_length = 0;
while (j < offsets[i] - 1)
{
if (pos == start)
bytes_start = j - prev_offset + 1;
if (data[j] < 0xBF)
j += 1;
else if (data[j] < 0xE0)
j += 2;
else if (data[j] < 0xF0)
j += 3;
else
j += 1;
if (pos >= start && pos < start + length)
bytes_length = j - prev_offset + 1 - bytes_start;
else if (pos >= start + length)
break;
++pos;
}
if (bytes_start == 0)
{
res_data.resize(res_data.size() + 1);
res_data[res_offset] = 0;
++res_offset;
}
else
{
size_t bytes_to_copy = std::min(offsets[i] - prev_offset - bytes_start, bytes_length);
res_data.resize(res_data.size() + bytes_to_copy + 1);
memcpy(&res_data[res_offset], &data[prev_offset + bytes_start - 1], bytes_to_copy);
res_offset += bytes_to_copy + 1;
res_data[res_offset - 1] = 0;
}
res_offsets[i] = res_offset;
prev_offset = offsets[i];
}
}
static void vector_fixed(const ColumnString::Chars_t & data, ColumnString::Offset_t n,
size_t start, size_t length,
ColumnString::Chars_t & res_data, ColumnString::Offsets_t & res_offsets)
{
throw Exception("Cannot apply function substringUTF8 to fixed string.", ErrorCodes::ILLEGAL_COLUMN);
}
static void constant(const std::string & data,
size_t start, size_t length,
std::string & res_data)
{
if (start + length > data.size() + 1)
throw Exception("Index out of bound for function substring of constant value", ErrorCodes::ARGUMENT_OUT_OF_BOUND);
ColumnString::Offset_t j = 0;
ColumnString::Offset_t pos = 1;
ColumnString::Offset_t bytes_start = 0;
ColumnString::Offset_t bytes_length = 0;
while (j < data.size())
{
if (pos == start)
bytes_start = j + 1;
if (static_cast<unsigned char>(data[j]) < 0xBF)
j += 1;
else if (static_cast<unsigned char>(data[j]) < 0xE0)
j += 2;
else if (static_cast<unsigned char>(data[j]) < 0xF0)
j += 3;
else
j += 1;
if (pos >= start && pos < start + length)
bytes_length = j + 1 - bytes_start;
else if (pos >= start + length)
break;
++pos;
}
if (bytes_start != 0)
res_data = data.substr(bytes_start - 1, bytes_length);
}
};
template <typename Impl, typename Name, typename ResultType>
class FunctionStringOrArrayToT : public IFunction
{
public:
/// Получить имя функции.
String getName() const
{
return Name::get();
}
/// Получить тип результата по типам аргументов. Если функция неприменима для данных аргументов - кинуть исключение.
DataTypePtr getReturnType(const DataTypes & arguments) const
{
if (arguments.size() != 1)
throw Exception("Number of arguments for function " + getName() + " doesn't match: passed "
+ toString(arguments.size()) + ", should be 1.",
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
if (!dynamic_cast<const DataTypeString *>(&*arguments[0]) && !dynamic_cast<const DataTypeFixedString *>(&*arguments[0])
&& !dynamic_cast<const DataTypeArray *>(&*arguments[0]))
throw Exception("Illegal type " + arguments[0]->getName() + " of argument of function " + getName(),
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
return new typename DataTypeFromFieldType<ResultType>::Type;
}
/// Выполнить функцию над блоком.
void execute(Block & block, const ColumnNumbers & arguments, size_t result)
{
const ColumnPtr column = block.getByPosition(arguments[0]).column;
if (const ColumnString * col = dynamic_cast<const ColumnString *>(&*column))
{
ColumnVector<ResultType> * col_res = new ColumnVector<ResultType>;
block.getByPosition(result).column = col_res;
typename ColumnVector<ResultType>::Container_t & vec_res = col_res->getData();
vec_res.resize(col->size());
Impl::vector(col->getChars(), col->getOffsets(), vec_res);
}
else if (const ColumnFixedString * col = dynamic_cast<const ColumnFixedString *>(&*column))
{
/// Для фиксированной строки, только функция lengthUTF8 возвращает не константу.
if ("lengthUTF8" != getName())
{
ResultType res = 0;
Impl::vector_fixed_to_constant(col->getChars(), col->getN(), res);
ColumnConst<ResultType> * col_res = new ColumnConst<ResultType>(col->size(), res);
block.getByPosition(result).column = col_res;
}
else
{
ColumnVector<ResultType> * col_res = new ColumnVector<ResultType>;
block.getByPosition(result).column = col_res;
typename ColumnVector<ResultType>::Container_t & vec_res = col_res->getData();
vec_res.resize(col->size());
Impl::vector_fixed_to_vector(col->getChars(), col->getN(), vec_res);
}
}
else if (const ColumnConstString * col = dynamic_cast<const ColumnConstString *>(&*column))
{
ResultType res = 0;
Impl::constant(col->getData(), res);
ColumnConst<ResultType> * col_res = new ColumnConst<ResultType>(col->size(), res);
block.getByPosition(result).column = col_res;
}
else if (const ColumnArray * col = dynamic_cast<const ColumnArray *>(&*column))
{
ColumnVector<ResultType> * col_res = new ColumnVector<ResultType>;
block.getByPosition(result).column = col_res;
typename ColumnVector<ResultType>::Container_t & vec_res = col_res->getData();
vec_res.resize(col->size());
Impl::array(col->getOffsets(), vec_res);
}
else if (const ColumnConstArray * col = dynamic_cast<const ColumnConstArray *>(&*column))
{
ResultType res = 0;
Impl::constant_array(col->getData(), res);
ColumnConst<ResultType> * col_res = new ColumnConst<ResultType>(col->size(), res);
block.getByPosition(result).column = col_res;
}
else
throw Exception("Illegal column " + block.getByPosition(arguments[0]).column->getName()
+ " of argument of function " + getName(),
ErrorCodes::ILLEGAL_COLUMN);
}
};
template <typename Impl, typename Name>
class FunctionStringToString : public IFunction
{
public:
/// Получить имя функции.
String getName() const
{
return Name::get();
}
/// Получить тип результата по типам аргументов. Если функция неприменима для данных аргументов - кинуть исключение.
DataTypePtr getReturnType(const DataTypes & arguments) const
{
if (arguments.size() != 1)
throw Exception("Number of arguments for function " + getName() + " doesn't match: passed "
+ toString(arguments.size()) + ", should be 1.",
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
if (!dynamic_cast<const DataTypeString *>(&*arguments[0]) && !dynamic_cast<const DataTypeFixedString *>(&*arguments[0]))
throw Exception("Illegal type " + arguments[0]->getName() + " of argument of function " + getName(),
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
return arguments[0]->clone();
}
/// Выполнить функцию над блоком.
void execute(Block & block, const ColumnNumbers & arguments, size_t result)
{
const ColumnPtr column = block.getByPosition(arguments[0]).column;
if (const ColumnString * col = dynamic_cast<const ColumnString *>(&*column))
{
ColumnString * col_res = new ColumnString;
block.getByPosition(result).column = col_res;
Impl::vector(col->getChars(), col->getOffsets(),
col_res->getChars(), col_res->getOffsets());
}
else if (const ColumnFixedString * col = dynamic_cast<const ColumnFixedString *>(&*column))
{
ColumnFixedString * col_res = new ColumnFixedString(col->getN());
block.getByPosition(result).column = col_res;
Impl::vector_fixed(col->getChars(), col->getN(),
col_res->getChars());
}
else if (const ColumnConstString * col = dynamic_cast<const ColumnConstString *>(&*column))
{
String res;
Impl::constant(col->getData(), res);
ColumnConstString * col_res = new ColumnConstString(col->size(), res);
block.getByPosition(result).column = col_res;
}
else
throw Exception("Illegal column " + block.getByPosition(arguments[0]).column->getName()
+ " of argument of function " + getName(),
ErrorCodes::ILLEGAL_COLUMN);
}
};
template <typename Impl, typename Name>
class FunctionStringStringToString : public IFunction
{
public:
/// Получить имя функции.
String getName() const
{
return Name::get();
}
/// Получить тип результата по типам аргументов. Если функция неприменима для данных аргументов - кинуть исключение.
DataTypePtr getReturnType(const DataTypes & arguments) const
{
if (arguments.size() != 2)
throw Exception("Number of arguments for function " + getName() + " doesn't match: passed "
+ toString(arguments.size()) + ", should be 2.",
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
if (!dynamic_cast<const DataTypeString *>(&*arguments[0]) && !dynamic_cast<const DataTypeFixedString *>(&*arguments[0]))
throw Exception("Illegal type " + arguments[0]->getName() + " of first argument of function " + getName(),
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
if (!dynamic_cast<const DataTypeString *>(&*arguments[1]) && !dynamic_cast<const DataTypeFixedString *>(&*arguments[1]))
throw Exception("Illegal type " + arguments[1]->getName() + " of second argument of function " + getName(),
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
return new DataTypeString;
}
/// Выполнить функцию над блоком.
void execute(Block & block, const ColumnNumbers & arguments, size_t result)
{
const IColumn * c0 = &*block.getByPosition(arguments[0]).column;
const IColumn * c1 = &*block.getByPosition(arguments[1]).column;
const ColumnString * c0_string = dynamic_cast<const ColumnString *>(c0);
const ColumnString * c1_string = dynamic_cast<const ColumnString *>(c1);
const ColumnFixedString * c0_fixed_string = dynamic_cast<const ColumnFixedString *>(c0);
const ColumnFixedString * c1_fixed_string = dynamic_cast<const ColumnFixedString *>(c1);
const ColumnConstString * c0_const = dynamic_cast<const ColumnConstString *>(c0);
const ColumnConstString * c1_const = dynamic_cast<const ColumnConstString *>(c1);
/// Результат - const string
if (c0_const && c1_const)
{
ColumnConstString * c_res = new ColumnConstString(c0_const->size(), "");
block.getByPosition(result).column = c_res;
Impl::constant_constant(c0_const->getData(), c1_const->getData(), c_res->getData());
}
else
{
ColumnString * c_res = new ColumnString;
block.getByPosition(result).column = c_res;
ColumnString::Chars_t & vec_res = c_res->getChars();
ColumnString::Offsets_t & offsets_res = c_res->getOffsets();
if (c0_string && c1_string)
Impl::vector_vector(
c0_string->getChars(), c0_string->getOffsets(),
c1_string->getChars(), c1_string->getOffsets(),
vec_res, offsets_res);
else if (c0_string && c1_fixed_string)
Impl::vector_fixed_vector(
c0_string->getChars(), c0_string->getOffsets(),
c1_fixed_string->getChars(), c1_fixed_string->getN(),
vec_res, offsets_res);
else if (c0_string && c1_const)
Impl::vector_constant(
c0_string->getChars(), c0_string->getOffsets(),
c1_const->getData(),
vec_res, offsets_res);
else if (c0_fixed_string && c1_string)
Impl::fixed_vector_vector(
c0_fixed_string->getChars(), c0_fixed_string->getN(),
c1_string->getChars(), c1_string->getOffsets(),
vec_res, offsets_res);
else if (c0_const && c1_string)
Impl::constant_vector(
c0_const->getData(),
c1_string->getChars(), c1_string->getOffsets(),
vec_res, offsets_res);
else if (c0_fixed_string && c1_fixed_string)
Impl::fixed_vector_fixed_vector(
c0_fixed_string->getChars(), c0_fixed_string->getN(),
c1_fixed_string->getChars(), c1_fixed_string->getN(),
vec_res, offsets_res);
else if (c0_fixed_string && c1_const)
Impl::fixed_vector_constant(
c0_fixed_string->getChars(), c0_fixed_string->getN(),
c1_const->getData(),
vec_res, offsets_res);
else if (c0_const && c1_fixed_string)
Impl::constant_fixed_vector(
c0_const->getData(),
c1_fixed_string->getChars(), c1_fixed_string->getN(),
vec_res, offsets_res);
else
throw Exception("Illegal columns "
+ block.getByPosition(arguments[0]).column->getName() + " and "
+ block.getByPosition(arguments[1]).column->getName()
+ " of arguments of function " + getName(),
ErrorCodes::ILLEGAL_COLUMN);
}
}
};
template <typename Impl, typename Name>
class FunctionStringNumNumToString : public IFunction
{
public:
/// Получить имя функции.
String getName() const
{
return Name::get();
}
/// Получить тип результата по типам аргументов. Если функция неприменима для данных аргументов - кинуть исключение.
DataTypePtr getReturnType(const DataTypes & arguments) const
{
if (arguments.size() != 3)
throw Exception("Number of arguments for function " + getName() + " doesn't match: passed "
+ toString(arguments.size()) + ", should be 3.",
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
if (!dynamic_cast<const DataTypeString *>(&*arguments[0]) && !dynamic_cast<const DataTypeFixedString *>(&*arguments[0]))
throw Exception("Illegal type " + arguments[0]->getName() + " of argument of function " + getName(),
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
if (!arguments[1]->isNumeric() || !arguments[2]->isNumeric())
throw Exception("Illegal type " + (arguments[1]->isNumeric() ? arguments[2]->getName() : arguments[1]->getName()) + " of argument of function " + getName(),
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
return new DataTypeString;
}
/// Выполнить функцию над блоком.
void execute(Block & block, const ColumnNumbers & arguments, size_t result)
{
const ColumnPtr column_string = block.getByPosition(arguments[0]).column;
const ColumnPtr column_start = block.getByPosition(arguments[1]).column;
const ColumnPtr column_length = block.getByPosition(arguments[2]).column;
if (!column_start->isConst() || !column_length->isConst())
throw Exception("2nd and 3rd arguments of function " + getName() + " must be constants.");
FieldVisitorConvertToNumber<UInt64> converter;
Field start_field = (*block.getByPosition(arguments[1]).column)[0];
Field length_field = (*block.getByPosition(arguments[2]).column)[0];
UInt64 start = apply_visitor(converter, start_field);
UInt64 length = apply_visitor(converter, length_field);
if (start == 0)
throw Exception("Second argument of function substring must be greater than 0.", ErrorCodes::ARGUMENT_OUT_OF_BOUND);
if (const ColumnString * col = dynamic_cast<const ColumnString *>(&*column_string))
{
ColumnString * col_res = new ColumnString;
block.getByPosition(result).column = col_res;
Impl::vector(col->getChars(), col->getOffsets(),
start, length,
col_res->getChars(), col_res->getOffsets());
}
else if (const ColumnFixedString * col = dynamic_cast<const ColumnFixedString *>(&*column_string))
{
ColumnString * col_res = new ColumnString;
block.getByPosition(result).column = col_res;
Impl::vector_fixed(col->getChars(), col->getN(),
start, length,
col_res->getChars(), col_res->getOffsets());
}
else if (const ColumnConstString * col = dynamic_cast<const ColumnConstString *>(&*column_string))
{
String res;
Impl::constant(col->getData(), start, length, res);
ColumnConstString * col_res = new ColumnConstString(col->size(), res);
block.getByPosition(result).column = col_res;
}
else
throw Exception("Illegal column " + block.getByPosition(arguments[0]).column->getName()
+ " of first argument of function " + getName(),
ErrorCodes::ILLEGAL_COLUMN);
}
};
struct NameEmpty { static const char * get() { return "empty"; } };
struct NameNotEmpty { static const char * get() { return "notEmpty"; } };
struct NameLength { static const char * get() { return "length"; } };
struct NameLengthUTF8 { static const char * get() { return "lengthUTF8"; } };
struct NameLower { static const char * get() { return "lower"; } };
struct NameUpper { static const char * get() { return "upper"; } };
struct NameLowerUTF8 { static const char * get() { return "lowerUTF8"; } };
struct NameUpperUTF8 { static const char * get() { return "upperUTF8"; } };
struct NameReverse { static const char * get() { return "reverse"; } };
struct NameReverseUTF8 { static const char * get() { return "reverseUTF8"; } };
struct NameConcat { static const char * get() { return "concat"; } };
struct NameSubstring { static const char * get() { return "substring"; } };
struct NameSubstringUTF8 { static const char * get() { return "substringUTF8"; } };
typedef FunctionStringOrArrayToT<EmptyImpl<false>, NameEmpty, UInt8> FunctionEmpty;
typedef FunctionStringOrArrayToT<EmptyImpl<true>, NameNotEmpty, UInt8> FunctionNotEmpty;
typedef FunctionStringOrArrayToT<LengthImpl, NameLength, UInt64> FunctionLength;
typedef FunctionStringOrArrayToT<LengthUTF8Impl, NameLengthUTF8, UInt64> FunctionLengthUTF8;
typedef FunctionStringToString<LowerUpperImpl<tolower>, NameLower> FunctionLower;
typedef FunctionStringToString<LowerUpperImpl<toupper>, NameUpper> FunctionUpper;
typedef FunctionStringToString<LowerUpperUTF8Impl<Poco::Unicode::toLower>, NameLowerUTF8> FunctionLowerUTF8;
typedef FunctionStringToString<LowerUpperUTF8Impl<Poco::Unicode::toUpper>, NameUpperUTF8> FunctionUpperUTF8;
typedef FunctionStringToString<ReverseImpl, NameReverse> FunctionReverse;
typedef FunctionStringToString<ReverseUTF8Impl, NameReverseUTF8> FunctionReverseUTF8;
typedef FunctionStringStringToString<ConcatImpl, NameConcat> FunctionConcat;
typedef FunctionStringNumNumToString<SubstringImpl, NameSubstring> FunctionSubstring;
typedef FunctionStringNumNumToString<SubstringUTF8Impl, NameSubstringUTF8> FunctionSubstringUTF8;
}