ClickHouse/src/Storages/MergeTree/MergeTreeIndexFullText.cpp

948 lines
33 KiB
C++
Raw Normal View History

2019-05-10 03:42:28 +00:00
#include <Storages/MergeTree/MergeTreeIndexFullText.h>
2019-02-20 11:22:07 +00:00
2019-02-25 17:12:09 +00:00
#include <Common/StringUtils/StringUtils.h>
2019-02-20 12:48:50 +00:00
#include <Common/UTF8Helpers.h>
2019-02-20 11:22:07 +00:00
#include <DataTypes/DataTypesNumber.h>
#include <IO/WriteHelpers.h>
#include <IO/ReadHelpers.h>
2019-02-20 12:12:41 +00:00
#include <Interpreters/ExpressionActions.h>
#include <Interpreters/ExpressionAnalyzer.h>
#include <Interpreters/TreeRewriter.h>
#include <Interpreters/misc.h>
2019-02-20 12:12:41 +00:00
#include <Storages/MergeTree/MergeTreeData.h>
2019-03-07 09:15:58 +00:00
#include <Storages/MergeTree/RPNBuilder.h>
2019-02-24 21:17:52 +00:00
#include <Parsers/ASTIdentifier.h>
2019-02-20 12:12:41 +00:00
#include <Parsers/ASTLiteral.h>
2019-02-24 21:17:52 +00:00
#include <Parsers/ASTSubquery.h>
#include <Core/Defines.h>
#include <Columns/ColumnMap.h>
#include <DataTypes/DataTypeMap.h>
2019-02-20 12:12:41 +00:00
#include <Poco/Logger.h>
2019-02-20 11:22:07 +00:00
2019-02-21 20:32:36 +00:00
#include <boost/algorithm/string.hpp>
#if defined(__SSE2__)
#include <immintrin.h>
#if defined(__SSE4_2__)
#include <nmmintrin.h>
#endif
#endif
2019-02-20 11:22:07 +00:00
namespace DB
{
namespace ErrorCodes
{
2020-02-25 18:02:41 +00:00
extern const int LOGICAL_ERROR;
2019-02-20 11:22:07 +00:00
extern const int INCORRECT_QUERY;
2020-07-10 17:53:58 +00:00
extern const int BAD_ARGUMENTS;
2019-02-20 11:22:07 +00:00
}
2019-02-21 20:32:36 +00:00
/// Adds all tokens from string to bloom filter.
2019-02-20 16:24:46 +00:00
static void stringToBloomFilter(
const String & string, TokenExtractorPtr token_extractor, BloomFilter & bloom_filter)
{
const char * data = string.data();
size_t size = string.size();
size_t cur = 0;
size_t token_start = 0;
size_t token_len = 0;
while (cur < size && token_extractor->nextInField(data, size, &cur, &token_start, &token_len))
bloom_filter.add(data + token_start, token_len);
}
static void columnToBloomFilter(
2020-05-28 12:37:05 +00:00
const char * data, size_t size, TokenExtractorPtr token_extractor, BloomFilter & bloom_filter)
2019-02-20 16:24:46 +00:00
{
2019-02-20 13:18:15 +00:00
size_t cur = 0;
size_t token_start = 0;
size_t token_len = 0;
while (cur < size && token_extractor->nextInColumn(data, size, &cur, &token_start, &token_len))
2019-02-20 16:24:46 +00:00
bloom_filter.add(data + token_start, token_len);
2019-02-20 13:18:15 +00:00
}
2019-02-21 20:32:36 +00:00
/// Adds all tokens from like pattern string to bloom filter. (Because like pattern can contain `\%` and `\_`.)
static void likeStringToBloomFilter(
2020-05-28 12:37:05 +00:00
const String & data, TokenExtractorPtr token_extractor, BloomFilter & bloom_filter)
2019-02-21 20:32:36 +00:00
{
size_t cur = 0;
String token;
2019-02-21 21:29:24 +00:00
while (cur < data.size() && token_extractor->nextLike(data, &cur, token))
2019-02-21 20:32:36 +00:00
bloom_filter.add(token.c_str(), token.size());
}
2019-07-16 11:40:11 +00:00
/// Unified condition for equals, startsWith and endsWith
2020-07-10 08:13:21 +00:00
bool MergeTreeConditionFullText::createFunctionEqualsCondition(
RPNElement & out, const Field & value, const BloomFilterParameters & params, TokenExtractorPtr token_extractor)
2019-07-16 11:40:11 +00:00
{
out.function = RPNElement::FUNCTION_EQUALS;
2020-05-28 12:37:05 +00:00
out.bloom_filter = std::make_unique<BloomFilter>(params);
stringToBloomFilter(value.get<String>(), token_extractor, *out.bloom_filter);
2019-07-16 11:40:11 +00:00
return true;
}
2019-02-20 13:18:15 +00:00
2020-05-28 12:37:05 +00:00
/// Builds a granule that holds `columns_number` bloom filters, each constructed from the
/// stored parameters. The granule starts out flagged as having no elements.
MergeTreeIndexGranuleFullText::MergeTreeIndexGranuleFullText(
    const String & index_name_,
    size_t columns_number,
    const BloomFilterParameters & params_)
    : index_name(index_name_)
    , params(params_)
    , bloom_filters(columns_number, BloomFilter(params))
    , has_elems(false)
{
}
2019-02-20 11:22:07 +00:00
2019-05-10 03:42:28 +00:00
/// Writes the raw bytes of every bloom filter (`params.filter_size` bytes each) to `ostr`,
/// in column order. Throws LOGICAL_ERROR when the granule is empty.
void MergeTreeIndexGranuleFullText::serializeBinary(WriteBuffer & ostr) const
{
    if (empty())
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to write empty fulltext index {}.", backQuote(index_name));

    for (size_t i = 0; i < bloom_filters.size(); ++i)
    {
        const auto & filter_storage = bloom_filters[i].getFilter();
        ostr.write(reinterpret_cast<const char *>(filter_storage.data()), params.filter_size);
    }
}
/// Reads the raw bytes of every bloom filter (`params.filter_size` bytes each) from `istr`,
/// in column order, and marks the granule as non-empty.
/// Only index version 1 is understood; any other version throws LOGICAL_ERROR.
void MergeTreeIndexGranuleFullText::deserializeBinary(ReadBuffer & istr, MergeTreeIndexVersion version)
{
    if (version != 1)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown index version {}.", version);

    for (auto & bloom_filter : bloom_filters)
    {
        /// `readStrict` fails on a short read. Plain `read` would return fewer bytes without
        /// any error, leaving the filter partially filled while the granule is still marked
        /// valid, so a truncated index file could make the index wrongly skip data.
        istr.readStrict(reinterpret_cast<char *>(bloom_filter.getFilter().data()), params.filter_size);
    }

    has_elems = true;
}
2019-03-11 17:59:36 +00:00
2020-05-28 12:37:05 +00:00
/// Stores the index description (columns, name, bloom filter parameters, tokenizer) and
/// allocates the first granule, with one bloom filter slot per index column.
MergeTreeIndexAggregatorFullText::MergeTreeIndexAggregatorFullText(
    const Names & index_columns_,
    const String & index_name_,
    const BloomFilterParameters & params_,
    TokenExtractorPtr token_extractor_)
    : index_columns(index_columns_)
    , index_name(index_name_)
    , params(params_)
    , token_extractor(token_extractor_)
    , granule(std::make_shared<MergeTreeIndexGranuleFullText>(index_name, index_columns.size(), params))
{
}
2019-03-11 17:59:36 +00:00
2019-05-10 03:42:28 +00:00
/// Hands the accumulated granule to the caller and installs a fresh one in its place.
MergeTreeIndexGranulePtr MergeTreeIndexAggregatorFullText::getGranuleAndReset()
{
    /// Allocate the replacement first, so that `granule` is untouched if the allocation throws.
    auto fresh = std::make_shared<MergeTreeIndexGranuleFullText>(index_name, index_columns.size(), params);

    MergeTreeIndexGranulePtr finished = std::move(granule);
    granule = std::move(fresh);
    return finished;
}
2019-05-10 03:42:28 +00:00
void MergeTreeIndexAggregatorFullText::update(const Block & block, size_t * pos, size_t limit)
2019-02-20 11:22:07 +00:00
{
if (*pos >= block.rows())
throw Exception(
"The provided position is not less than the number of block rows. Position: "
+ toString(*pos) + ", Block rows: " + toString(block.rows()) + ".", ErrorCodes::LOGICAL_ERROR);
size_t rows_read = std::min(limit, block.rows() - *pos);
2020-05-28 12:37:05 +00:00
for (size_t col = 0; col < index_columns.size(); ++col)
2019-02-20 11:22:07 +00:00
{
2020-05-28 12:37:05 +00:00
const auto & column = block.getByName(index_columns[col]).column;
if (column->getDataType() == TypeIndex::Map)
2019-02-22 19:59:40 +00:00
{
//update for the key of Map type
auto * column_map = assert_cast<ColumnMap *>(const_cast<IColumn *>(column.get()));
auto & column_array = assert_cast<ColumnArray &>(column_map->getNestedColumn());
auto & column_tuple = assert_cast<ColumnTuple &>(column_array.getData());
auto & column_key = column_tuple.getColumn(0);
for (size_t i = 0; i < rows_read; ++i)
{
size_t element_start_row = column_array.getOffsets()[*pos - 1];
size_t elements_size = column_array.getOffsets()[*pos] - element_start_row;
for (size_t row_num = 0; row_num < elements_size; row_num++)
{
auto ref = column_key.getDataAt(element_start_row + row_num);
columnToBloomFilter(ref.data, ref.size, token_extractor, granule->bloom_filters[col]);
}
*pos += 1;
}
}
else
{
for (size_t i = 0; i < rows_read; ++i)
{
auto ref = column->getDataAt(*pos + i);
columnToBloomFilter(ref.data, ref.size, token_extractor, granule->bloom_filters[col]);
}
*pos += rows_read;
2019-02-22 19:59:40 +00:00
}
2019-02-20 11:22:07 +00:00
}
2019-03-11 17:59:36 +00:00
granule->has_elems = true;
2019-02-20 11:22:07 +00:00
}
2019-02-20 12:12:41 +00:00
2019-05-10 03:42:28 +00:00
/// Captures the index description and converts the query's filter expression into an RPN
/// sequence, using `atomFromAST` to recognise the atoms this index can evaluate.
MergeTreeConditionFullText::MergeTreeConditionFullText(
    const SelectQueryInfo & query_info,
    ContextPtr context,
    const Block & index_sample_block,
    const BloomFilterParameters & params_,
    TokenExtractorPtr token_extactor_)
    : index_columns(index_sample_block.getNames())
    , index_data_types(index_sample_block.getNamesAndTypesList().getTypes())
    , params(params_)
    , token_extractor(token_extactor_)
    , prepared_sets(query_info.sets)
{
    /// The builder supplies a context argument that atom recognition does not need.
    const auto atom_builder =
        [this](const ASTPtr & node, ContextPtr /* context */, Block & block_with_constants, RPNElement & out) -> bool
    {
        return atomFromAST(node, block_with_constants, out);
    };

    rpn = RPNBuilder<RPNElement>(query_info, context, atom_builder).extractRPN();
}
2019-05-10 03:42:28 +00:00
bool MergeTreeConditionFullText::alwaysUnknownOrTrue() const
2019-02-20 16:24:46 +00:00
{
/// Check like in KeyCondition.
std::vector<bool> rpn_stack;
for (const auto & element : rpn)
{
if (element.function == RPNElement::FUNCTION_UNKNOWN
|| element.function == RPNElement::ALWAYS_TRUE)
{
rpn_stack.push_back(true);
}
else if (element.function == RPNElement::FUNCTION_EQUALS
2019-02-20 20:17:44 +00:00
|| element.function == RPNElement::FUNCTION_NOT_EQUALS
2019-02-24 21:17:52 +00:00
|| element.function == RPNElement::FUNCTION_IN
|| element.function == RPNElement::FUNCTION_NOT_IN
|| element.function == RPNElement::FUNCTION_MULTI_SEARCH
2019-02-20 20:17:44 +00:00
|| element.function == RPNElement::ALWAYS_FALSE)
2019-02-20 16:24:46 +00:00
{
rpn_stack.push_back(false);
}
else if (element.function == RPNElement::FUNCTION_NOT)
{
// do nothing
}
else if (element.function == RPNElement::FUNCTION_AND)
{
auto arg1 = rpn_stack.back();
rpn_stack.pop_back();
auto arg2 = rpn_stack.back();
rpn_stack.back() = arg1 && arg2;
}
else if (element.function == RPNElement::FUNCTION_OR)
{
auto arg1 = rpn_stack.back();
rpn_stack.pop_back();
auto arg2 = rpn_stack.back();
rpn_stack.back() = arg1 || arg2;
}
else
throw Exception("Unexpected function type in KeyCondition::RPNElement", ErrorCodes::LOGICAL_ERROR);
}
return rpn_stack[0];
}
2019-05-10 03:42:28 +00:00
/// Evaluates the RPN against the bloom filters of one granule and reports whether the
/// condition can be true for some row of that granule.
/// Throws LOGICAL_ERROR if the granule has the wrong type, if an unexpected element is met,
/// or if the evaluation does not leave exactly one value on the stack.
bool MergeTreeConditionFullText::mayBeTrueOnGranule(MergeTreeIndexGranulePtr idx_granule) const
{
    const auto granule = std::dynamic_pointer_cast<MergeTreeIndexGranuleFullText>(idx_granule);
    if (!granule)
        throw Exception(
                "BloomFilter index condition got a granule with the wrong type.", ErrorCodes::LOGICAL_ERROR);

    /// True when at least one per-row flag is still set.
    const auto any_row_matches = [](const std::vector<bool> & flags)
    {
        return std::find(flags.begin(), flags.end(), true) != flags.end();
    };

    /// Check like in KeyCondition.
    std::vector<BoolMask> rpn_stack;

    for (const auto & element : rpn)
    {
        switch (element.function)
        {
            case RPNElement::FUNCTION_UNKNOWN:
            {
                rpn_stack.emplace_back(true, true);
                break;
            }
            case RPNElement::FUNCTION_EQUALS:
            case RPNElement::FUNCTION_NOT_EQUALS:
            {
                const bool may_contain = granule->bloom_filters[element.key_column].contains(*element.bloom_filter);
                rpn_stack.emplace_back(may_contain, true);

                if (element.function == RPNElement::FUNCTION_NOT_EQUALS)
                    rpn_stack.back() = !rpn_stack.back();
                break;
            }
            case RPNElement::FUNCTION_IN:
            case RPNElement::FUNCTION_NOT_IN:
            {
                /// One flag per set row; a row survives only if every mapped key column may contain it.
                std::vector<bool> row_flags(element.set_bloom_filters.back().size(), true);

                for (size_t column = 0; column < element.set_key_position.size(); ++column)
                {
                    const size_t key_idx = element.set_key_position[column];
                    const auto & set_filters = element.set_bloom_filters[column];

                    for (size_t row = 0; row < set_filters.size(); ++row)
                        row_flags[row] = row_flags[row] && granule->bloom_filters[key_idx].contains(set_filters[row]);
                }

                rpn_stack.emplace_back(any_row_matches(row_flags), true);

                if (element.function == RPNElement::FUNCTION_NOT_IN)
                    rpn_stack.back() = !rpn_stack.back();
                break;
            }
            case RPNElement::FUNCTION_MULTI_SEARCH:
            {
                /// One flag per needle; all needles are checked against the single key column.
                std::vector<bool> needle_flags(element.set_bloom_filters.back().size(), true);
                const auto & needle_filters = element.set_bloom_filters[0];

                for (size_t row = 0; row < needle_filters.size(); ++row)
                    needle_flags[row] = needle_flags[row] && granule->bloom_filters[element.key_column].contains(needle_filters[row]);

                rpn_stack.emplace_back(any_row_matches(needle_flags), true);
                break;
            }
            case RPNElement::FUNCTION_NOT:
            {
                rpn_stack.back() = !rpn_stack.back();
                break;
            }
            case RPNElement::FUNCTION_AND:
            {
                const auto top = rpn_stack.back();
                rpn_stack.pop_back();
                const auto below = rpn_stack.back();
                rpn_stack.back() = top & below;
                break;
            }
            case RPNElement::FUNCTION_OR:
            {
                const auto top = rpn_stack.back();
                rpn_stack.pop_back();
                const auto below = rpn_stack.back();
                rpn_stack.back() = top | below;
                break;
            }
            case RPNElement::ALWAYS_FALSE:
            {
                rpn_stack.emplace_back(false, true);
                break;
            }
            case RPNElement::ALWAYS_TRUE:
            {
                rpn_stack.emplace_back(true, false);
                break;
            }
            default:
                throw Exception("Unexpected function type in BloomFilterCondition::RPNElement", ErrorCodes::LOGICAL_ERROR);
        }
    }

    if (rpn_stack.size() != 1)
        throw Exception("Unexpected stack size in BloomFilterCondition::mayBeTrueOnGranule", ErrorCodes::LOGICAL_ERROR);

    return rpn_stack[0].can_be_true;
}
bool MergeTreeConditionFullText::getKey(const std::string & key_column_name, size_t & key_column_num)
2019-02-20 16:24:46 +00:00
{
auto it = std::find(index_columns.begin(), index_columns.end(), key_column_name);
2020-05-28 12:37:05 +00:00
if (it == index_columns.end())
2019-02-20 16:24:46 +00:00
return false;
2020-05-28 12:37:05 +00:00
key_column_num = static_cast<size_t>(it - index_columns.begin());
2019-02-20 16:24:46 +00:00
return true;
}
2019-05-10 03:42:28 +00:00
bool MergeTreeConditionFullText::atomFromAST(
const ASTPtr & node, Block & block_with_constants, RPNElement & out)
2019-02-20 16:24:46 +00:00
{
Field const_value;
DataTypePtr const_type;
2019-02-22 08:15:52 +00:00
if (const auto * func = typeid_cast<const ASTFunction *>(node.get()))
2019-02-20 16:24:46 +00:00
{
const ASTs & args = typeid_cast<const ASTExpressionList &>(*func->arguments).children;
if (args.size() != 2)
return false;
size_t key_arg_pos; /// Position of argument with key column (non-const argument)
2019-02-20 16:24:46 +00:00
size_t key_column_num = -1; /// Number of a key column (inside key_column_names array)
2019-07-16 11:40:11 +00:00
const auto & func_name = func->name;
2019-02-20 16:24:46 +00:00
if (functionIsInOrGlobalInOperator(func_name) && tryPrepareSetBloomFilter(args, out))
2019-02-22 10:51:19 +00:00
{
key_arg_pos = 0;
2019-02-22 10:51:19 +00:00
}
else if (KeyCondition::getConstant(args[1], block_with_constants, const_value, const_type) && getKey(args[0]->getColumnName(), key_column_num))
2019-02-20 16:24:46 +00:00
{
key_arg_pos = 0;
}
else if (KeyCondition::getConstant(args[0], block_with_constants, const_value, const_type) && getKey(args[1]->getColumnName(), key_column_num))
{
key_arg_pos = 1;
2019-02-20 16:24:46 +00:00
}
else if (const auto * index_function = args[0].get()->as<ASTFunction>())
{
if (index_function->name == "arrayElement")
{
auto column_name = assert_cast<ASTIdentifier *>(index_function->arguments.get()->children[0].get())->name();
if (!getKey(column_name, key_column_num))
return false;
key_arg_pos = 0;
auto & argument = index_function->arguments.get()->children[1];
if (const auto * literal = argument->as<ASTLiteral>())
{
const_value = literal->value;
if (const_value.getType() != Field::Types::String)
return false;
const_type = std::make_shared<DataTypeString>();
}
else
{
return false;
}
}
else
{
return false;
}
}
2019-02-20 16:24:46 +00:00
else
{
2019-02-20 16:24:46 +00:00
return false;
}
2019-02-20 16:24:46 +00:00
if (const_type && const_type->getTypeId() != TypeIndex::String
&& const_type->getTypeId() != TypeIndex::FixedString
&& const_type->getTypeId() != TypeIndex::Array)
{
return false;
}
if (key_arg_pos == 1 && (func_name != "equals" && func_name != "notEquals"))
return false;
2020-05-28 12:37:05 +00:00
else if (!token_extractor->supportLike() && (func_name == "like" || func_name == "notLike"))
return false;
if (func_name == "notEquals")
{
out.key_column = key_column_num;
out.function = RPNElement::FUNCTION_NOT_EQUALS;
2020-05-28 12:37:05 +00:00
out.bloom_filter = std::make_unique<BloomFilter>(params);
stringToBloomFilter(const_value.get<String>(), token_extractor, *out.bloom_filter);
return true;
}
else if (func_name == "equals")
{
out.key_column = key_column_num;
2020-05-28 12:37:05 +00:00
return createFunctionEqualsCondition(out, const_value, params, token_extractor);
}
else if (func_name == "like")
{
out.key_column = key_column_num;
out.function = RPNElement::FUNCTION_EQUALS;
2020-05-28 12:37:05 +00:00
out.bloom_filter = std::make_unique<BloomFilter>(params);
likeStringToBloomFilter(const_value.get<String>(), token_extractor, *out.bloom_filter);
return true;
}
else if (func_name == "notLike")
{
out.key_column = key_column_num;
out.function = RPNElement::FUNCTION_NOT_EQUALS;
2020-05-28 12:37:05 +00:00
out.bloom_filter = std::make_unique<BloomFilter>(params);
likeStringToBloomFilter(const_value.get<String>(), token_extractor, *out.bloom_filter);
return true;
}
else if (func_name == "hasToken")
{
out.key_column = key_column_num;
out.function = RPNElement::FUNCTION_EQUALS;
2020-05-28 12:37:05 +00:00
out.bloom_filter = std::make_unique<BloomFilter>(params);
stringToBloomFilter(const_value.get<String>(), token_extractor, *out.bloom_filter);
return true;
}
else if (func_name == "startsWith")
{
out.key_column = key_column_num;
2020-05-28 12:37:05 +00:00
return createFunctionEqualsCondition(out, const_value, params, token_extractor);
}
else if (func_name == "endsWith")
{
out.key_column = key_column_num;
2020-05-28 12:37:05 +00:00
return createFunctionEqualsCondition(out, const_value, params, token_extractor);
}
else if (func_name == "multiSearchAny")
{
out.key_column = key_column_num;
out.function = RPNElement::FUNCTION_MULTI_SEARCH;
2019-02-20 16:24:46 +00:00
/// 2d vector is not needed here but is used because already exists for FUNCTION_IN
std::vector<std::vector<BloomFilter>> bloom_filters;
bloom_filters.emplace_back();
for (const auto & element : const_value.get<Array>())
{
if (element.getType() != Field::Types::String)
return false;
2020-05-28 12:37:05 +00:00
bloom_filters.back().emplace_back(params);
stringToBloomFilter(element.get<String>(), token_extractor, bloom_filters.back().back());
}
out.set_bloom_filters = std::move(bloom_filters);
return true;
}
else if (func_name == "notIn")
{
out.key_column = key_column_num;
out.function = RPNElement::FUNCTION_NOT_IN;
return true;
}
else if (func_name == "in")
{
out.key_column = key_column_num;
out.function = RPNElement::FUNCTION_IN;
return true;
}
return false;
2019-02-20 16:24:46 +00:00
}
else if (KeyCondition::getConstant(node, block_with_constants, const_value, const_type))
{
/// Check constant like in KeyCondition
if (const_value.getType() == Field::Types::UInt64
|| const_value.getType() == Field::Types::Int64
|| const_value.getType() == Field::Types::Float64)
2019-02-20 19:27:23 +00:00
{
/// Zero in all types is represented in memory the same way as in UInt64.
out.function = const_value.get<UInt64>()
? RPNElement::ALWAYS_TRUE
: RPNElement::ALWAYS_FALSE;
2019-02-20 16:24:46 +00:00
2019-02-20 19:27:23 +00:00
return true;
}
2019-02-20 16:24:46 +00:00
}
return false;
}
2019-05-10 03:42:28 +00:00
bool MergeTreeConditionFullText::tryPrepareSetBloomFilter(
2019-02-22 10:51:19 +00:00
const ASTs & args,
2019-02-24 21:17:52 +00:00
RPNElement & out)
2019-02-22 10:51:19 +00:00
{
2019-02-24 21:17:52 +00:00
const ASTPtr & left_arg = args[0];
const ASTPtr & right_arg = args[1];
std::vector<KeyTuplePositionMapping> key_tuple_mapping;
DataTypes data_types;
const auto * left_arg_tuple = typeid_cast<const ASTFunction *>(left_arg.get());
if (left_arg_tuple && left_arg_tuple->name == "tuple")
{
const auto & tuple_elements = left_arg_tuple->arguments->children;
for (size_t i = 0; i < tuple_elements.size(); ++i)
{
size_t key = 0;
if (getKey(tuple_elements[i]->getColumnName(), key))
2019-02-24 21:17:52 +00:00
{
key_tuple_mapping.emplace_back(i, key);
2020-05-28 12:37:05 +00:00
data_types.push_back(index_data_types[key]);
2019-02-24 21:17:52 +00:00
}
}
}
else
{
size_t key = 0;
if (getKey(left_arg->getColumnName(), key))
2019-02-24 21:17:52 +00:00
{
2019-02-25 08:43:19 +00:00
key_tuple_mapping.emplace_back(0, key);
2020-05-28 12:37:05 +00:00
data_types.push_back(index_data_types[key]);
2019-02-24 21:17:52 +00:00
}
}
if (key_tuple_mapping.empty())
return false;
PreparedSetKey set_key;
if (typeid_cast<const ASTSubquery *>(right_arg.get()) || typeid_cast<const ASTIdentifier *>(right_arg.get()))
set_key = PreparedSetKey::forSubquery(*right_arg);
else
set_key = PreparedSetKey::forLiteral(*right_arg, data_types);
auto set_it = prepared_sets.find(set_key);
if (set_it == prepared_sets.end())
return false;
const SetPtr & prepared_set = set_it->second;
if (!prepared_set->hasExplicitSetElements())
return false;
2019-02-25 08:43:19 +00:00
for (const auto & data_type : prepared_set->getDataTypes())
if (data_type->getTypeId() != TypeIndex::String && data_type->getTypeId() != TypeIndex::FixedString)
return false;
2019-05-10 03:42:28 +00:00
std::vector<std::vector<BloomFilter>> bloom_filters;
2019-02-25 18:38:57 +00:00
std::vector<size_t> key_position;
2019-02-24 21:17:52 +00:00
Columns columns = prepared_set->getSetElements();
2020-03-09 02:05:04 +00:00
for (const auto & elem : key_tuple_mapping)
2019-02-24 21:17:52 +00:00
{
2019-02-25 08:43:19 +00:00
bloom_filters.emplace_back();
2020-03-09 02:05:04 +00:00
key_position.push_back(elem.key_index);
2019-02-25 18:38:57 +00:00
2020-03-09 02:05:04 +00:00
size_t tuple_idx = elem.tuple_index;
2019-02-24 21:17:52 +00:00
const auto & column = columns[tuple_idx];
for (size_t row = 0; row < prepared_set->getTotalRowCount(); ++row)
{
2020-05-28 12:37:05 +00:00
bloom_filters.back().emplace_back(params);
2019-02-24 21:17:52 +00:00
auto ref = column->getDataAt(row);
columnToBloomFilter(ref.data, ref.size, token_extractor, bloom_filters.back().back());
2019-02-24 21:17:52 +00:00
}
}
2019-02-25 18:38:57 +00:00
out.set_key_position = std::move(key_position);
2019-02-24 21:17:52 +00:00
out.set_bloom_filters = std::move(bloom_filters);
return true;
}
2019-02-22 10:51:19 +00:00
2019-05-10 03:42:28 +00:00
/// Creates an empty granule with one bloom filter per index column.
MergeTreeIndexGranulePtr MergeTreeIndexFullText::createIndexGranule() const
{
    const size_t columns_number = index.column_names.size();
    return std::make_shared<MergeTreeIndexGranuleFullText>(index.name, columns_number, params);
}
2019-05-10 03:42:28 +00:00
/// Creates an aggregator that fills granules for this index's columns.
/// The aggregator receives a non-owning pointer to this index's token extractor.
MergeTreeIndexAggregatorPtr MergeTreeIndexFullText::createIndexAggregator() const
{
    return std::make_shared<MergeTreeIndexAggregatorFullText>(
        index.column_names,
        index.name,
        params,
        token_extractor.get());
}
2019-06-19 15:30:48 +00:00
/// Creates the condition object that evaluates `query` against granules of this index.
/// The condition receives a non-owning pointer to this index's token extractor.
MergeTreeIndexConditionPtr MergeTreeIndexFullText::createIndexCondition(
    const SelectQueryInfo & query, ContextPtr context) const
{
    return std::make_shared<MergeTreeConditionFullText>(
        query,
        context,
        index.sample_block,
        params,
        token_extractor.get());
}
2019-05-10 03:42:28 +00:00
bool MergeTreeIndexFullText::mayBenefitFromIndexForIn(const ASTPtr & node) const
2019-02-25 08:43:19 +00:00
{
2020-05-28 12:37:05 +00:00
return std::find(std::cbegin(index.column_names), std::cend(index.column_names), node->getColumnName()) != std::cend(index.column_names);
2019-02-25 08:43:19 +00:00
}
2019-02-20 12:12:41 +00:00
bool NgramTokenExtractor::nextInField(const char * data, size_t len, size_t * pos, size_t * token_start, size_t * token_len) const
2019-02-21 20:32:36 +00:00
{
2019-02-21 21:29:24 +00:00
*token_start = *pos;
*token_len = 0;
2019-02-24 18:55:56 +00:00
size_t code_points = 0;
for (; code_points < n && *token_start + *token_len < len; ++code_points)
2019-02-21 21:29:24 +00:00
{
size_t sz = UTF8::seqLength(static_cast<UInt8>(data[*token_start + *token_len]));
*token_len += sz;
}
2019-02-22 07:59:07 +00:00
*pos += UTF8::seqLength(static_cast<UInt8>(data[*pos]));
2019-02-24 18:55:56 +00:00
return code_points == n;
2019-02-21 20:32:36 +00:00
}
2019-02-21 21:29:24 +00:00
bool NgramTokenExtractor::nextLike(const String & str, size_t * pos, String & token) const
2019-02-20 12:12:41 +00:00
{
2019-02-21 21:29:24 +00:00
token.clear();
2019-02-20 12:12:41 +00:00
2019-02-22 07:59:07 +00:00
size_t code_points = 0;
2019-02-21 21:29:24 +00:00
bool escaped = false;
2019-02-22 07:59:07 +00:00
for (size_t i = *pos; i < str.size();)
2019-02-20 12:12:41 +00:00
{
2019-02-22 07:59:07 +00:00
if (escaped && (str[i] == '%' || str[i] == '_' || str[i] == '\\'))
2019-02-20 12:48:50 +00:00
{
2019-02-22 07:59:07 +00:00
token += str[i];
++code_points;
2019-02-21 21:29:24 +00:00
escaped = false;
2019-02-22 07:59:07 +00:00
++i;
2019-02-20 12:48:50 +00:00
}
2019-02-22 07:59:07 +00:00
else if (!escaped && (str[i] == '%' || str[i] == '_'))
2019-02-21 20:32:36 +00:00
{
2019-02-21 21:29:24 +00:00
/// This token is too small, go to the next.
token.clear();
2019-02-22 07:59:07 +00:00
code_points = 0;
2019-02-21 21:29:24 +00:00
escaped = false;
2019-02-22 07:59:07 +00:00
*pos = ++i;
2019-02-21 21:29:24 +00:00
}
2019-02-22 07:59:07 +00:00
else if (!escaped && str[i] == '\\')
2019-02-21 21:29:24 +00:00
{
escaped = true;
2019-02-22 07:59:07 +00:00
++i;
2019-02-21 21:29:24 +00:00
}
else
{
2019-02-22 07:59:07 +00:00
const size_t sz = UTF8::seqLength(static_cast<UInt8>(str[i]));
for (size_t j = 0; j < sz; ++j)
token += str[i + j];
i += sz;
++code_points;
2019-02-21 21:29:24 +00:00
escaped = false;
2019-02-21 20:32:36 +00:00
}
2019-02-25 18:46:54 +00:00
if (code_points == n)
{
2019-02-22 07:59:07 +00:00
*pos += UTF8::seqLength(static_cast<UInt8>(str[*pos]));
2019-02-21 21:29:24 +00:00
return true;
}
2019-02-21 20:32:36 +00:00
}
2019-02-21 21:29:24 +00:00
return false;
}
2019-02-20 12:12:41 +00:00
/// Extracts the next token starting at `*pos`.
///
/// ASCII bytes that are not alphanumeric act as separators; all other bytes belong to tokens.
/// Leading separators are skipped. When true is returned because a separator ended the
/// token, `*pos` is left on that separator.
/// Returns false when no token bytes remain.
bool SplitTokenExtractor::nextInField(const char * data, size_t len, size_t * pos, size_t * token_start, size_t * token_len) const
{
    *token_start = *pos;
    *token_len = 0;

    for (; *pos < len;)
    {
        const char c = data[*pos];
        const bool is_separator = isASCII(c) && !isAlphaNumericASCII(c);

        if (!is_separator)
        {
            /// Note that UTF-8 sequence is completely consisted of non-ASCII bytes.
            ++*pos;
            ++*token_len;
            continue;
        }

        /// Finish current token if any
        if (*token_len > 0)
            return true;

        ++*pos;
        *token_start = *pos;
    }

    return *token_len > 0;
}
bool SplitTokenExtractor::nextInColumn(const char * data, size_t len, size_t * pos, size_t * token_start, size_t * token_len) const
2019-02-25 14:23:19 +00:00
{
*token_start = *pos;
*token_len = 0;
2019-02-25 14:23:19 +00:00
while (*pos < len)
{
2020-08-08 01:01:47 +00:00
#if defined(__SSE2__) && !defined(MEMORY_SANITIZER) /// We read uninitialized bytes and decide on the calculated mask
// NOTE: we assume that `data` string is padded from the right with 15 bytes.
const __m128i haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(data + *pos));
const size_t haystack_length = 16;
#if defined(__SSE4_2__)
// With the help of https://www.strchr.com/strcmp_and_strlen_using_sse_4.2
const auto alnum_chars_ranges = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
'\xFF', '\x80', 'z', 'a', 'Z', 'A', '9', '0');
2020-04-11 21:37:19 +00:00
// Every bit represents if `haystack` character is in the ranges (1) or not (0)
const int result_bitmask = _mm_cvtsi128_si32(_mm_cmpestrm(alnum_chars_ranges, 8, haystack, haystack_length, _SIDD_CMP_RANGES));
#else
// NOTE: -1 and +1 required since SSE2 has no `>=` and `<=` instructions on packed 8-bit integers (epi8).
const auto number_begin = _mm_set1_epi8('0' - 1);
const auto number_end = _mm_set1_epi8('9' + 1);
const auto alpha_lower_begin = _mm_set1_epi8('a' - 1);
const auto alpha_lower_end = _mm_set1_epi8('z' + 1);
const auto alpha_upper_begin = _mm_set1_epi8('A' - 1);
const auto alpha_upper_end = _mm_set1_epi8('Z' + 1);
2021-01-22 23:57:35 +00:00
const auto zero = _mm_set1_epi8(0);
2020-08-08 01:01:47 +00:00
// every bit represents if `haystack` character `c` satisfies condition:
// (c < 0) || (c > '0' - 1 && c < '9' + 1) || (c > 'a' - 1 && c < 'z' + 1) || (c > 'A' - 1 && c < 'Z' + 1)
// < 0 since _mm_cmplt_epi8 threats chars as SIGNED, and so all chars > 0x80 are negative.
const int result_bitmask = _mm_movemask_epi8(_mm_or_si128(_mm_or_si128(_mm_or_si128(
_mm_cmplt_epi8(haystack, zero),
_mm_and_si128(_mm_cmpgt_epi8(haystack, number_begin), _mm_cmplt_epi8(haystack, number_end))),
_mm_and_si128(_mm_cmpgt_epi8(haystack, alpha_lower_begin), _mm_cmplt_epi8(haystack, alpha_lower_end))),
_mm_and_si128(_mm_cmpgt_epi8(haystack, alpha_upper_begin), _mm_cmplt_epi8(haystack, alpha_upper_end))));
#endif
if (result_bitmask == 0)
{
if (*token_len != 0)
// end of token started on previous haystack
return true;
*pos += haystack_length;
continue;
}
const auto token_start_pos_in_current_haystack = getTrailingZeroBitsUnsafe(result_bitmask);
if (*token_len == 0)
// new token
*token_start = *pos + token_start_pos_in_current_haystack;
else if (token_start_pos_in_current_haystack != 0)
// end of token starting in one of previous haystacks
return true;
const auto token_bytes_in_current_haystack = getTrailingZeroBitsUnsafe(~(result_bitmask >> token_start_pos_in_current_haystack));
*token_len += token_bytes_in_current_haystack;
*pos += token_start_pos_in_current_haystack + token_bytes_in_current_haystack;
if (token_start_pos_in_current_haystack + token_bytes_in_current_haystack == haystack_length)
// check if there are leftovers in next `haystack`
continue;
break;
#else
2019-03-12 15:20:54 +00:00
if (isASCII(data[*pos]) && !isAlphaNumericASCII(data[*pos]))
2019-02-25 14:23:19 +00:00
{
2020-01-08 10:20:55 +00:00
/// Finish current token if any
2019-02-25 14:23:19 +00:00
if (*token_len > 0)
return true;
*token_start = ++*pos;
}
else
2019-02-25 17:12:09 +00:00
{
2020-01-08 10:20:55 +00:00
/// Note that UTF-8 sequence is completely consisted of non-ASCII bytes.
++*pos;
++*token_len;
2019-02-25 17:12:09 +00:00
}
#endif
2019-02-25 14:23:19 +00:00
}
#if defined(__SSE2__) && !defined(MEMORY_SANITIZER)
2021-01-22 23:57:35 +00:00
// Could happen only if string is not padded with zeros, and we accidentally hopped over the end of data.
if (*token_start > len)
return false;
*token_len = std::min(len - *token_start, *token_len);
#endif
2019-02-25 14:23:19 +00:00
return *token_len > 0;
}
2019-03-06 15:30:27 +00:00
bool SplitTokenExtractor::nextLike(const String & str, size_t * pos, String & token) const
2019-02-25 14:23:19 +00:00
{
2019-03-06 15:30:27 +00:00
token.clear();
bool bad_token = false; // % or _ before token
2019-03-12 13:17:22 +00:00
bool escaped = false;
2019-03-12 15:20:54 +00:00
while (*pos < str.size())
2019-03-06 15:30:27 +00:00
{
2019-03-12 13:17:22 +00:00
if (!escaped && (str[*pos] == '%' || str[*pos] == '_'))
2019-03-06 15:30:27 +00:00
{
token.clear();
bad_token = true;
2019-03-12 15:20:54 +00:00
++*pos;
2019-03-06 15:30:27 +00:00
}
2019-03-12 13:17:22 +00:00
else if (!escaped && str[*pos] == '\\')
{
escaped = true;
2019-03-12 15:20:54 +00:00
++*pos;
2019-03-12 13:17:22 +00:00
}
else if (isASCII(str[*pos]) && !isAlphaNumericASCII(str[*pos]))
2019-03-06 15:30:27 +00:00
{
if (!bad_token && !token.empty())
return true;
token.clear();
bad_token = false;
2019-03-12 13:17:22 +00:00
escaped = false;
2019-03-12 15:20:54 +00:00
++*pos;
2019-03-06 15:30:27 +00:00
}
else
{
2019-03-12 15:20:54 +00:00
const size_t sz = UTF8::seqLength(static_cast<UInt8>(str[*pos]));
for (size_t j = 0; j < sz; ++j)
{
token += str[*pos];
++*pos;
}
2019-03-12 13:17:22 +00:00
escaped = false;
2019-03-06 15:30:27 +00:00
}
}
return !bad_token && !token.empty();
2019-02-25 14:23:19 +00:00
}
2020-05-28 13:45:08 +00:00
/// Creates a full-text bloom filter index from its description.
///
/// For the n-gram type, argument 0 is the n-gram size and arguments 1..3 are the bloom
/// filter parameters. For the split-token type, arguments 0..2 are the bloom filter parameters.
/// Throws LOGICAL_ERROR for any other index type.
MergeTreeIndexPtr bloomFilterIndexCreator(
    const IndexDescription & index)
{
    const auto & arguments = index.arguments;

    if (index.type == NgramTokenExtractor::getName())
    {
        const size_t n = arguments[0].get<size_t>();
        BloomFilterParameters params(
            arguments[1].get<size_t>(),
            arguments[2].get<size_t>(),
            arguments[3].get<size_t>());

        auto tokenizer = std::make_unique<NgramTokenExtractor>(n);
        return std::make_shared<MergeTreeIndexFullText>(index, params, std::move(tokenizer));
    }

    if (index.type == SplitTokenExtractor::getName())
    {
        BloomFilterParameters params(
            arguments[0].get<size_t>(),
            arguments[1].get<size_t>(),
            arguments[2].get<size_t>());

        auto tokenizer = std::make_unique<SplitTokenExtractor>();
        return std::make_shared<MergeTreeIndexFullText>(index, params, std::move(tokenizer));
    }

    throw Exception("Unknown index type: " + backQuote(index.name), ErrorCodes::LOGICAL_ERROR);
}
2019-02-20 12:12:41 +00:00
2020-05-28 13:09:03 +00:00
void bloomFilterIndexValidator(const IndexDescription & index, bool /*attach*/)
2020-05-28 12:37:05 +00:00
{
2021-09-06 10:22:06 +00:00
for (const auto & data_type : index.data_types)
{
DataTypePtr index_key_data_type = data_type;
if (data_type->getTypeId() == TypeIndex::Map)
{
DataTypeMap * map_type = assert_cast<DataTypeMap *>(const_cast<IDataType *>(data_type.get()));
index_key_data_type = map_type->getKeyType();
}
if (index_key_data_type->getTypeId() != TypeIndex::String && index_key_data_type->getTypeId() != TypeIndex::FixedString)
throw Exception(ErrorCodes::INCORRECT_QUERY,
"Bloom filter index can be used only with `String`,`FixedString` or `Map` with key of `String` or `FixedString` type.");
2019-02-20 12:12:41 +00:00
}
2020-05-28 12:37:05 +00:00
if (index.type == NgramTokenExtractor::getName())
2019-02-25 18:46:54 +00:00
{
2020-05-28 12:37:05 +00:00
if (index.arguments.size() != 4)
2019-02-24 18:55:56 +00:00
throw Exception("`ngrambf` index must have exactly 4 arguments.", ErrorCodes::INCORRECT_QUERY);
2019-02-25 18:46:54 +00:00
}
2020-05-28 12:37:05 +00:00
else if (index.type == SplitTokenExtractor::getName())
2019-02-25 18:46:54 +00:00
{
2020-05-28 12:37:05 +00:00
if (index.arguments.size() != 3)
2019-02-25 14:23:19 +00:00
throw Exception("`tokenbf` index must have exactly 3 arguments.", ErrorCodes::INCORRECT_QUERY);
2019-02-25 18:46:54 +00:00
}
else
{
2020-05-28 12:37:05 +00:00
throw Exception("Unknown index type: " + backQuote(index.name), ErrorCodes::LOGICAL_ERROR);
2019-02-20 12:12:41 +00:00
}
2020-07-10 08:13:21 +00:00
assert(index.arguments.size() >= 3);
2020-07-10 08:21:40 +00:00
for (const auto & arg : index.arguments)
if (arg.getType() != Field::Types::UInt64)
throw Exception("All parameters to *bf_v1 index must be unsigned integers", ErrorCodes::BAD_ARGUMENTS);
2020-07-10 08:13:21 +00:00
/// Just validate
BloomFilterParameters params(
index.arguments[0].get<size_t>(),
index.arguments[1].get<size_t>(),
index.arguments[2].get<size_t>());
2019-02-20 12:12:41 +00:00
}
2019-02-20 11:22:07 +00:00
}