utf8 + fixes

This commit is contained in:
Nikita Vasilev 2019-02-20 15:48:50 +03:00
parent 4343ede944
commit f7c091d497
9 changed files with 47 additions and 12 deletions

View File

@ -40,6 +40,11 @@ void StringBloomFilter::add(const char * data, size_t len)
} }
} }
void StringBloomFilter::clear()
{
filter.assign(size, 0);
}
bool StringBloomFilter::contains(const StringBloomFilter & bf) bool StringBloomFilter::contains(const StringBloomFilter & bf)
{ {
for (size_t i = 0; i < size; ++i) for (size_t i = 0; i < size; ++i)

View File

@ -41,6 +41,7 @@ public:
bool find(const char * data, size_t len); bool find(const char * data, size_t len);
void add(const char * data, size_t len); void add(const char * data, size_t len);
void clear();
/// Checks if this contains everything from another bloom filter. /// Checks if this contains everything from another bloom filter.
/// Bloom filters must have equal size and seed. /// Bloom filters must have equal size and seed.

View File

@ -1,5 +1,6 @@
#include <Storages/MergeTree/MergeTreeBloomFilterIndex.h> #include <Storages/MergeTree/MergeTreeBloomFilterIndex.h>
#include <Common/UTF8Helpers.h>
#include <DataTypes/DataTypesNumber.h> #include <DataTypes/DataTypesNumber.h>
#include <IO/WriteHelpers.h> #include <IO/WriteHelpers.h>
#include <IO/ReadHelpers.h> #include <IO/ReadHelpers.h>
@ -23,7 +24,10 @@ namespace ErrorCodes
MergeTreeBloomFilterIndexGranule::MergeTreeBloomFilterIndexGranule(const MergeTreeBloomFilterIndex & index) MergeTreeBloomFilterIndexGranule::MergeTreeBloomFilterIndexGranule(const MergeTreeBloomFilterIndex & index)
: IMergeTreeIndexGranule(), index(index), bloom_filter(index.bloom_filter_size, index.bloom_filter_hashes, index.seed) : IMergeTreeIndexGranule()
, index(index)
, bloom_filter(index.bloom_filter_size, index.bloom_filter_hashes, index.seed)
, has_elems(false)
{ {
} }
@ -42,6 +46,7 @@ void MergeTreeBloomFilterIndexGranule::deserializeBinary(ReadBuffer & istr)
std::vector<UInt8> filter(index.bloom_filter_size, 0); std::vector<UInt8> filter(index.bloom_filter_size, 0);
istr.read(reinterpret_cast<char *>(filter.data()), index.bloom_filter_size); istr.read(reinterpret_cast<char *>(filter.data()), index.bloom_filter_size);
bloom_filter.setFilter(std::move(filter)); bloom_filter.setFilter(std::move(filter));
has_elems = true;
} }
void MergeTreeBloomFilterIndexGranule::update(const Block & block, size_t * pos, size_t limit) void MergeTreeBloomFilterIndexGranule::update(const Block & block, size_t * pos, size_t limit)
@ -65,6 +70,7 @@ void MergeTreeBloomFilterIndexGranule::update(const Block & block, size_t * pos,
bloom_filter.add(ref.data + token_start, token_len); bloom_filter.add(ref.data + token_start, token_len);
} }
has_elems = true;
*pos += rows_read; *pos += rows_read;
} }
@ -86,17 +92,22 @@ struct NgramTokenExtractor
NgramTokenExtractor(size_t n) : n(n) {} NgramTokenExtractor(size_t n) : n(n) {}
static String getName() { static String getName() {
static String name = "ngram"; static String name = "ngrambf";
return name; return name;
} }
bool operator() (const char *, size_t len, size_t * pos, size_t * token_start, size_t * token_len) bool operator() (const char * data, size_t len, size_t * pos, size_t * token_start, size_t * token_len)
{ {
if (*pos + n > len) {
return false;
}
*token_start = *pos; *token_start = *pos;
*token_len = n; *token_len = 0;
for (size_t i = 0; i < n; ++i)
{
size_t sz = UTF8::seqLength(static_cast<UInt8>(data[*token_start + *token_len]));
if (*token_start + *token_len + sz > len) {
return false;
}
*token_len += sz;
}
++*pos; ++*pos;
return true; return true;
} }
@ -105,7 +116,7 @@ struct NgramTokenExtractor
}; };
std::unique_ptr<IMergeTreeIndex> BloomFilterIndexCreator( std::unique_ptr<IMergeTreeIndex> bloomFilterIndexCreator(
const NamesAndTypesList & new_columns, const NamesAndTypesList & new_columns,
std::shared_ptr<ASTIndexDeclaration> node, std::shared_ptr<ASTIndexDeclaration> node,
const MergeTreeData & data, const MergeTreeData & data,
@ -153,8 +164,8 @@ std::unique_ptr<IMergeTreeIndex> BloomFilterIndexCreator(
size_t seed = typeid_cast<const ASTLiteral &>( size_t seed = typeid_cast<const ASTLiteral &>(
*node->type->arguments->children[2]).value.get<size_t>();\ *node->type->arguments->children[2]).value.get<size_t>();\
size_t bloom_filter_hashes = static_cast<size_t>( auto bloom_filter_hashes = static_cast<size_t>(
n / (node->granularity * data.index_granularity) * log(2.)); n * log(2.) / (node->granularity * data.index_granularity));
return std::make_unique<MergeTreeBloomFilterIndex>( return std::make_unique<MergeTreeBloomFilterIndex>(
node->name, std::move(index_expr), columns, data_types, sample, node->granularity, node->name, std::move(index_expr), columns, data_types, sample, node->granularity,

View File

@ -22,11 +22,12 @@ struct MergeTreeBloomFilterIndexGranule : public IMergeTreeIndexGranule
void deserializeBinary(ReadBuffer & istr) override; void deserializeBinary(ReadBuffer & istr) override;
bool empty() const override; bool empty() const override { return !has_elems; };
void update(const Block & block, size_t * pos, size_t limit) override; void update(const Block & block, size_t * pos, size_t limit) override;
const MergeTreeBloomFilterIndex & index; const MergeTreeBloomFilterIndex & index;
StringBloomFilter bloom_filter; StringBloomFilter bloom_filter;
bool has_elems;
}; };

View File

@ -348,6 +348,7 @@ void MergeTreeData::setPrimaryKeyIndicesAndColumns(
MergeTreeIndexFactory::instance().get( MergeTreeIndexFactory::instance().get(
all_columns, all_columns,
std::dynamic_pointer_cast<ASTIndexDeclaration>(index_decl->clone()), std::dynamic_pointer_cast<ASTIndexDeclaration>(index_decl->clone()),
*this,
global_context)); global_context));
if (indices_names.find(new_indices.back()->name) != indices_names.end()) if (indices_names.find(new_indices.back()->name) != indices_names.end())

View File

@ -29,6 +29,7 @@ void MergeTreeIndexFactory::registerIndex(const std::string &name, Creator creat
std::unique_ptr<IMergeTreeIndex> MergeTreeIndexFactory::get( std::unique_ptr<IMergeTreeIndex> MergeTreeIndexFactory::get(
const NamesAndTypesList & columns, const NamesAndTypesList & columns,
std::shared_ptr<ASTIndexDeclaration> node, std::shared_ptr<ASTIndexDeclaration> node,
const MergeTreeData & data,
const Context & context) const const Context & context) const
{ {
if (!node->type) if (!node->type)
@ -51,24 +52,35 @@ std::unique_ptr<IMergeTreeIndex> MergeTreeIndexFactory::get(
return lft + ", " + rht.first; return lft + ", " + rht.first;
}), }),
ErrorCodes::INCORRECT_QUERY); ErrorCodes::INCORRECT_QUERY);
return it->second(columns, node, context); return it->second(columns, node, data, context);
} }
std::unique_ptr<IMergeTreeIndex> minmaxIndexCreator( std::unique_ptr<IMergeTreeIndex> minmaxIndexCreator(
const NamesAndTypesList & columns, const NamesAndTypesList & columns,
std::shared_ptr<ASTIndexDeclaration> node, std::shared_ptr<ASTIndexDeclaration> node,
const MergeTreeData & data,
const Context & context); const Context & context);
std::unique_ptr<IMergeTreeIndex> setIndexCreator( std::unique_ptr<IMergeTreeIndex> setIndexCreator(
const NamesAndTypesList & columns, const NamesAndTypesList & columns,
std::shared_ptr<ASTIndexDeclaration> node, std::shared_ptr<ASTIndexDeclaration> node,
const MergeTreeData & data,
const Context & context); const Context & context);
std::unique_ptr<IMergeTreeIndex> bloomFilterIndexCreator(
const NamesAndTypesList & columns,
std::shared_ptr<ASTIndexDeclaration> node,
const MergeTreeData & data,
const Context & context);
MergeTreeIndexFactory::MergeTreeIndexFactory() MergeTreeIndexFactory::MergeTreeIndexFactory()
{ {
registerIndex("minmax", minmaxIndexCreator); registerIndex("minmax", minmaxIndexCreator);
registerIndex("set", setIndexCreator); registerIndex("set", setIndexCreator);
registerIndex("ngrambf", bloomFilterIndexCreator);
registerIndex("tokenbf", bloomFilterIndexCreator);
} }
} }

View File

@ -106,11 +106,13 @@ public:
std::unique_ptr<IMergeTreeIndex>( std::unique_ptr<IMergeTreeIndex>(
const NamesAndTypesList & columns, const NamesAndTypesList & columns,
std::shared_ptr<ASTIndexDeclaration> node, std::shared_ptr<ASTIndexDeclaration> node,
const MergeTreeData & data,
const Context & context)>; const Context & context)>;
std::unique_ptr<IMergeTreeIndex> get( std::unique_ptr<IMergeTreeIndex> get(
const NamesAndTypesList & columns, const NamesAndTypesList & columns,
std::shared_ptr<ASTIndexDeclaration> node, std::shared_ptr<ASTIndexDeclaration> node,
const MergeTreeData & data,
const Context & context) const; const Context & context) const;
void registerIndex(const std::string & name, Creator creator); void registerIndex(const std::string & name, Creator creator);

View File

@ -119,6 +119,7 @@ IndexConditionPtr MergeTreeMinMaxIndex::createIndexCondition(
std::unique_ptr<IMergeTreeIndex> minmaxIndexCreator( std::unique_ptr<IMergeTreeIndex> minmaxIndexCreator(
const NamesAndTypesList & new_columns, const NamesAndTypesList & new_columns,
std::shared_ptr<ASTIndexDeclaration> node, std::shared_ptr<ASTIndexDeclaration> node,
const MergeTreeData &,
const Context & context) const Context & context)
{ {
if (node->name.empty()) if (node->name.empty())

View File

@ -354,6 +354,7 @@ IndexConditionPtr MergeTreeSetSkippingIndex::createIndexCondition(
std::unique_ptr<IMergeTreeIndex> setIndexCreator( std::unique_ptr<IMergeTreeIndex> setIndexCreator(
const NamesAndTypesList & new_columns, const NamesAndTypesList & new_columns,
std::shared_ptr<ASTIndexDeclaration> node, std::shared_ptr<ASTIndexDeclaration> node,
const MergeTreeData &,
const Context & context) const Context & context)
{ {
if (node->name.empty()) if (node->name.empty())