#pragma once #include #include #include #include #include #include #include #include #include #include #include namespace DB { namespace ErrorCodes { extern const int ILLEGAL_COLUMN; } /** String functions * * length, empty, notEmpty, * concat, substring, lower, upper, reverse * lengthUTF8, substringUTF8, lowerUTF8, upperUTF8, reverseUTF8 * * s -> UInt8: empty, notEmpty * s -> UInt64: length, lengthUTF8 * s -> s: lower, upper, lowerUTF8, upperUTF8, reverse, reverseUTF8 * s, s -> s: concat * s, c1, c2 -> s: substring, substringUTF8 * s, c1, c2, s2 -> s: replace, replaceUTF8 * * The search functions for strings and regular expressions are located separately. * URL functions are located separately. * String encoding functions, converting to other types are located separately. * * The functions length, empty, notEmpty, reverse also work with arrays. */ /// xor or do nothing template UInt8 xor_or_identity(const UInt8 c, const int mask) { return c ^ mask; }; template <> inline UInt8 xor_or_identity(const UInt8 c, const int) { return c; } /// It is caller's responsibility to ensure the presence of a valid cyrillic sequence in array template inline void UTF8CyrillicToCase(const UInt8 *& src, const UInt8 * src_end, UInt8 *& dst) { if (src[0] == 0xD0u && (src[1] >= 0x80u && src[1] <= 0x8Fu)) { /// ЀЁЂЃЄЅІЇЈЉЊЋЌЍЎЏ *dst++ = xor_or_identity(*src++, 0x1); *dst++ = xor_or_identity(*src++, 0x10); } else if (src[0] == 0xD1u && (src[1] >= 0x90u && src[1] <= 0x9Fu)) { /// ѐёђѓєѕіїјљњћќѝўџ *dst++ = xor_or_identity(*src++, 0x1); *dst++ = xor_or_identity(*src++, 0x10); } else if (src[0] == 0xD0u && (src[1] >= 0x90u && src[1] <= 0x9Fu)) { /// А-П *dst++ = *src++; *dst++ = xor_or_identity(*src++, 0x20); } else if (src[0] == 0xD0u && (src[1] >= 0xB0u && src[1] <= 0xBFu)) { /// а-п *dst++ = *src++; *dst++ = xor_or_identity(*src++, 0x20); } else if (src[0] == 0xD0u && (src[1] >= 0xA0u && src[1] <= 0xAFu)) { /// Р-Я *dst++ = xor_or_identity(*src++, 0x1); *dst++ = xor_or_identity(*src++, 0x20); } else if (src[0] == 0xD1u && (src[1] >= 0x80u && src[1] <= 0x8Fu)) { /// р-я *dst++ = xor_or_identity(*src++, 0x1); *dst++ = xor_or_identity(*src++, 0x20); } } /** If the string contains UTF-8 encoded text, convert it to the lower (upper) case. * Note: It is assumed that after the character is converted to another case, * the length of its multibyte sequence in UTF-8 does not change. * Otherwise, the behavior is undefined. */ template struct LowerUpperUTF8Impl { static void vector(const ColumnString::Chars_t & data, const ColumnString::Offsets_t & offsets, ColumnString::Chars_t & res_data, ColumnString::Offsets_t & res_offsets); static void vector_fixed(const ColumnString::Chars_t & data, size_t n, ColumnString::Chars_t & res_data); static void constant(const std::string & data, std::string & res_data); /** Converts a single code point starting at `src` to desired case, storing result starting at `dst`. * `src` and `dst` are incremented by corresponding sequence lengths. */ static void toCase(const UInt8 *& src, const UInt8 * src_end, UInt8 *& dst); private: static constexpr auto ascii_upper_bound = '\x7f'; static constexpr auto flip_case_mask = 'A' ^ 'a'; static void array(const UInt8 * src, const UInt8 * src_end, UInt8 * dst); }; template class FunctionStringToString : public IFunction { public: static constexpr auto name = Name::name; static FunctionPtr create(const Context & context) { return std::make_shared(); } String getName() const override { return name; } size_t getNumberOfArguments() const override { return 1; } bool isInjective(const Block &) override { return is_injective; } DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { if (!checkDataType(&*arguments[0]) && !checkDataType(&*arguments[0])) throw Exception( "Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); return arguments[0]->clone(); } bool useDefaultImplementationForConstants() const override { return true; } void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) override { const ColumnPtr column = block.getByPosition(arguments[0]).column; if (const ColumnString * col = checkAndGetColumn(column.get())) { std::shared_ptr col_res = std::make_shared(); block.getByPosition(result).column = col_res; Impl::vector(col->getChars(), col->getOffsets(), col_res->getChars(), col_res->getOffsets()); } else if (const ColumnFixedString * col = checkAndGetColumn(column.get())) { auto col_res = std::make_shared(col->getN()); block.getByPosition(result).column = col_res; Impl::vector_fixed(col->getChars(), col->getN(), col_res->getChars()); } else throw Exception( "Illegal column " + block.getByPosition(arguments[0]).column->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_COLUMN); } }; struct NameLowerUTF8 { static constexpr auto name = "lowerUTF8"; }; struct NameUpperUTF8 { static constexpr auto name = "upperUTF8"; }; using FunctionLowerUTF8 = FunctionStringToString>, NameLowerUTF8>; using FunctionUpperUTF8 = FunctionStringToString>, NameUpperUTF8>; }