#include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace DB { using namespace GatherUtils; namespace ErrorCodes { extern const int ILLEGAL_COLUMN; extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int ZERO_ARRAY_OR_TUPLE_INDEX; } namespace { /// If 'is_utf8' - measure offset and length in code points instead of bytes. template class FunctionSubstring : public IFunction { public: static constexpr auto name = is_utf8 ? "substringUTF8" : "substring"; static FunctionPtr create(ContextPtr) { return std::make_shared(); } String getName() const override { return name; } bool isVariadic() const override { return true; } size_t getNumberOfArguments() const override { return 0; } bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } bool useDefaultImplementationForConstants() const override { return true; } DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { const size_t number_of_arguments = arguments.size(); if (number_of_arguments < 2 || number_of_arguments > 3) throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Number of arguments for function {} doesn't match: " "passed {}, should be 2 or 3", getName(), number_of_arguments); if constexpr (is_utf8) { /// UTF8 variant is not available for FixedString and Enum arguments. if (!isString(arguments[0])) throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of first argument of function {}, expected String", arguments[0]->getName(), getName()); } else { if (!isStringOrFixedString(arguments[0]) && !isEnum(arguments[0])) throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of first argument of function {}, expected String, FixedString or Enum", arguments[0]->getName(), getName()); } if (!isNativeNumber(arguments[1])) throw Exception( ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of second argument of function {}, expected (U)Int*", arguments[1]->getName(), getName()); if (number_of_arguments == 3 && !isNativeNumber(arguments[2])) throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of second argument of function {}, expected (U)Int*", arguments[2]->getName(), getName()); return std::make_shared(); } template ColumnPtr executeForSource(const ColumnPtr & column_offset, const ColumnPtr & column_length, bool column_offset_const, bool column_length_const, Int64 offset, Int64 length, Source && source, size_t input_rows_count) const { auto col_res = ColumnString::create(); if (!column_length) { if (column_offset_const) { if (offset > 0) sliceFromLeftConstantOffsetUnbounded(source, StringSink(*col_res, input_rows_count), static_cast(offset - 1)); else if (offset < 0) sliceFromRightConstantOffsetUnbounded(source, StringSink(*col_res, input_rows_count), -static_cast(offset)); else throw Exception(ErrorCodes::ZERO_ARRAY_OR_TUPLE_INDEX, "Indices in strings are 1-based"); } else sliceDynamicOffsetUnbounded(source, StringSink(*col_res, input_rows_count), *column_offset); } else { if (column_offset_const && column_length_const) { if (offset > 0) sliceFromLeftConstantOffsetBounded(source, StringSink(*col_res, input_rows_count), static_cast(offset - 1), length); else if (offset < 0) sliceFromRightConstantOffsetBounded(source, StringSink(*col_res, input_rows_count), -static_cast(offset), length); else throw Exception(ErrorCodes::ZERO_ARRAY_OR_TUPLE_INDEX, "Indices in strings are 1-based"); } else sliceDynamicOffsetBounded(source, StringSink(*col_res, input_rows_count), *column_offset, *column_length); } return col_res; } ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override { const size_t number_of_arguments = arguments.size(); ColumnPtr column_string = arguments[0].column; ColumnPtr column_offset = arguments[1].column; ColumnPtr column_length; if (number_of_arguments == 3) column_length = arguments[2].column; const ColumnConst * column_offset_const = checkAndGetColumn(column_offset.get()); const ColumnConst * column_length_const = nullptr; if (number_of_arguments == 3) column_length_const = checkAndGetColumn(column_length.get()); Int64 offset = 0; Int64 length = 0; if (column_offset_const) offset = column_offset_const->getInt(0); if (column_length_const) length = column_length_const->getInt(0); if constexpr (is_utf8) { if (const ColumnString * col = checkAndGetColumn(column_string.get())) { bool all_ascii = UTF8::isAllASCII(col->getChars().data(), col->getChars().size()); if (all_ascii) return executeForSource(column_offset, column_length, column_offset_const, column_length_const, offset, length, StringSource(*col), input_rows_count); else return executeForSource(column_offset, column_length, column_offset_const, column_length_const, offset, length, UTF8StringSource(*col), input_rows_count); } if (const ColumnConst * col_const = checkAndGetColumnConst(column_string.get())) { StringRef str_ref = col_const->getDataAt(0); bool all_ascii = UTF8::isAllASCII(reinterpret_cast(str_ref.data), str_ref.size); if (all_ascii) return executeForSource(column_offset, column_length, column_offset_const, column_length_const, offset, length, ConstSource(*col_const), input_rows_count); else return executeForSource(column_offset, column_length, column_offset_const, column_length_const, offset, length, ConstSource(*col_const), input_rows_count); } throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}", arguments[0].column->getName(), getName()); } else { if (const ColumnString * col = checkAndGetColumn(column_string.get())) return executeForSource(column_offset, column_length, column_offset_const, column_length_const, offset, length, StringSource(*col), input_rows_count); if (const ColumnFixedString * col_fixed = checkAndGetColumn(column_string.get())) return executeForSource(column_offset, column_length, column_offset_const, column_length_const, offset, length, FixedStringSource(*col_fixed), input_rows_count); if (const ColumnConst * col_const = checkAndGetColumnConst(column_string.get())) return executeForSource(column_offset, column_length, column_offset_const, column_length_const, offset, length, ConstSource(*col_const), input_rows_count); if (const ColumnConst * col_const_fixed = checkAndGetColumnConst(column_string.get())) return executeForSource(column_offset, column_length, column_offset_const, column_length_const, offset, length, ConstSource(*col_const_fixed), input_rows_count); if (isEnum(arguments[0].type)) { if (const typename DataTypeEnum8::ColumnType * col_enum8 = checkAndGetColumn(column_string.get())) { const auto * type_enum8 = assert_cast(arguments[0].type.get()); return executeForSource(column_offset, column_length, column_offset_const, column_length_const, offset, length, EnumSource(*col_enum8, *type_enum8), input_rows_count); } if (const typename DataTypeEnum16::ColumnType * col_enum16 = checkAndGetColumn(column_string.get())) { const auto * type_enum16 = assert_cast(arguments[0].type.get()); return executeForSource(column_offset, column_length, column_offset_const, column_length_const, offset, length, EnumSource(*col_enum16, *type_enum16), input_rows_count); } } throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}", arguments[0].column->getName(), getName()); } } }; } REGISTER_FUNCTION(Substring) { factory.registerFunction>({}, FunctionFactory::CaseInsensitive); factory.registerAlias("substr", "substring", FunctionFactory::CaseInsensitive); // MySQL alias factory.registerAlias("mid", "substring", FunctionFactory::CaseInsensitive); /// MySQL alias factory.registerAlias("byteSlice", "substring", FunctionFactory::CaseInsensitive); /// resembles PostgreSQL's get_byte function, similar to ClickHouse's bitSlice factory.registerFunction>({}, FunctionFactory::CaseSensitive); } }