// Review notes (added by reviewer; not part of the original commit):
// - This is a git format-patch for commit f7c091d4 ("utf8 + fixes"); its diffstat reports
//   9 files changed, 47 insertions, 12 deletions. In this copy the patch's line breaks are
//   collapsed into spaces, so it must be re-wrapped before any tool can apply it.
// - BloomFilter.cpp hunk: adds StringBloomFilter::clear(), whose whole body is
//   `filter.assign(size, 0)`.
// - BloomFilter.h hunk: adds the matching `void clear();` declaration right after the
//   find()/add() declarations.
// - NOTE(review): the declarations of `filter` and `size` are outside this patch; presumably
//   assign() resets the filter to `size` zeroed elements - confirm against BloomFilter.h.
// - NOTE(review): no call to clear() appears anywhere in the visible patch text; confirm
//   that a caller exists (or is planned) before keeping the new method.
From f7c091d497bdc9537e5ce2a222197ba074b0d455 Mon Sep 17 00:00:00 2001 From: Nikita Vasilev Date: Wed, 20 Feb 2019 15:48:50 +0300 Subject: [PATCH] utf8 + fixes --- dbms/src/Interpreters/BloomFilter.cpp | 5 +++ dbms/src/Interpreters/BloomFilter.h | 1 + .../MergeTree/MergeTreeBloomFilterIndex.cpp | 31 +++++++++++++------ .../MergeTree/MergeTreeBloomFilterIndex.h | 3 +- dbms/src/Storages/MergeTree/MergeTreeData.cpp | 1 + .../Storages/MergeTree/MergeTreeIndices.cpp | 14 ++++++++- .../src/Storages/MergeTree/MergeTreeIndices.h | 2 ++ .../MergeTree/MergeTreeMinMaxIndex.cpp | 1 + .../MergeTree/MergeTreeSetSkippingIndex.cpp | 1 + 9 files changed, 47 insertions(+), 12 deletions(-) diff --git a/dbms/src/Interpreters/BloomFilter.cpp b/dbms/src/Interpreters/BloomFilter.cpp index 033bfbbefc7..70eca860439 100644 --- a/dbms/src/Interpreters/BloomFilter.cpp +++ b/dbms/src/Interpreters/BloomFilter.cpp @@ -40,6 +40,11 @@ void StringBloomFilter::add(const char * data, size_t len) } } +void StringBloomFilter::clear() +{ + filter.assign(size, 0); +} + bool StringBloomFilter::contains(const StringBloomFilter & bf) { for (size_t i = 0; i < size; ++i) diff --git a/dbms/src/Interpreters/BloomFilter.h b/dbms/src/Interpreters/BloomFilter.h index a0e1bb55937..d8610498019 100644 --- a/dbms/src/Interpreters/BloomFilter.h +++ b/dbms/src/Interpreters/BloomFilter.h @@ -41,6 +41,7 @@ public: bool find(const char * data, size_t len); void add(const char * data, size_t len); + void clear(); /// Checks if this contains everything from another bloom filter. /// Bloom filters must have equal size and seed. 
// Review notes (added by reviewer; not part of the original commit) for the hunks below.
//
// Extraction damage visible in this copy:
// - Line breaks are collapsed into spaces, and text that was inside angle brackets appears to
//   have been stripped (`#include` with no target, `static_cast(` with no type,
//   `std::unique_ptr` with no argument). Restore both from the upstream commit before applying.
//
// MergeTreeBloomFilterIndex.cpp / .h:
// - The granule constructor's initializer list is split one member per line and gains
//   `has_elems(false)`; deserializeBinary() and update() now set `has_elems = true`.
// - The header replaces the `empty()` declaration with the inline body `{ return !has_elems; }`
//   and adds the `bool has_elems;` member. The `;` after that inline body is redundant.
// - NOTE(review): update() sets `has_elems = true` unconditionally in the visible hunk; confirm
//   that is intended when no rows were actually read.
// - NgramTokenExtractor::getName() now returns "ngrambf" instead of "ngram".
// - NgramTokenExtractor::operator() now names its `data` parameter and builds each token from `n`
//   sequences whose byte lengths come from UTF8::seqLength(), returning false as soon as the next
//   sequence would extend past `len`.
// - NOTE(review): `data[*token_start + *token_len]` is read BEFORE the `> len` check, so when that
//   index equals (or exceeds) `len` the read falls outside the `len` bytes. Confirm callers
//   guarantee a readable byte there, or move the bounds check ahead of the read.
// - NOTE(review): the position is still advanced with `++*pos` (one byte) although tokens are now
//   measured in UTF-8 sequences, so after a multi-byte sequence the next token would start in the
//   middle of it. Presumably this should advance by the sequence length at `*pos` - confirm.
// - The creator is renamed BloomFilterIndexCreator -> bloomFilterIndexCreator.
// - `bloom_filter_hashes` now multiplies by log(2.) before dividing and is declared `auto`.
// - NOTE(review): the types of `n` and the cast target are not visible here; if `n` and the
//   divisor are integers, the old form truncated before the multiplication - confirm that this is
//   the motivation for the reordering.
// - NOTE(review): the context line ending in `.value.get();\` carries a trailing backslash that
//   looks unintended (it would splice the following source line onto it) - confirm upstream.
//
// MergeTreeData.cpp / MergeTreeIndices.cpp / MergeTreeIndices.h:
// - MergeTreeIndexFactory::get() and the creator signature gain a `const MergeTreeData & data`
//   parameter; the call site in setPrimaryKeyIndicesAndColumns() passes `*this`.
// - bloomFilterIndexCreator is forward-declared and registered under both "ngrambf" and "tokenbf".
// - NOTE(review): both names map to the same creator; how it tells them apart is not visible in
//   this patch.
//
// MergeTreeMinMaxIndex.cpp / MergeTreeSetSkippingIndex.cpp:
// - minmaxIndexCreator and setIndexCreator accept the new parameter as an unnamed
//   `const MergeTreeData &` in the lines shown.
diff --git a/dbms/src/Storages/MergeTree/MergeTreeBloomFilterIndex.cpp b/dbms/src/Storages/MergeTree/MergeTreeBloomFilterIndex.cpp index 1109726c3a0..95ea0139cdd 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeBloomFilterIndex.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreeBloomFilterIndex.cpp @@ -1,5 +1,6 @@ #include +#include #include #include #include @@ -23,7 +24,10 @@ namespace ErrorCodes MergeTreeBloomFilterIndexGranule::MergeTreeBloomFilterIndexGranule(const MergeTreeBloomFilterIndex & index) - : IMergeTreeIndexGranule(), index(index), bloom_filter(index.bloom_filter_size, index.bloom_filter_hashes, index.seed) + : IMergeTreeIndexGranule() + , index(index) + , bloom_filter(index.bloom_filter_size, index.bloom_filter_hashes, index.seed) + , has_elems(false) { } @@ -42,6 +46,7 @@ void MergeTreeBloomFilterIndexGranule::deserializeBinary(ReadBuffer & istr) std::vector filter(index.bloom_filter_size, 0); istr.read(reinterpret_cast(filter.data()), index.bloom_filter_size); bloom_filter.setFilter(std::move(filter)); + has_elems = true; } void MergeTreeBloomFilterIndexGranule::update(const Block & block, size_t * pos, size_t limit) @@ -65,6 +70,7 @@ void MergeTreeBloomFilterIndexGranule::update(const Block & block, size_t * pos, bloom_filter.add(ref.data + token_start, token_len); } + has_elems = true; *pos += rows_read; } @@ -86,17 +92,22 @@ struct NgramTokenExtractor NgramTokenExtractor(size_t n) : n(n) {} static String getName() { - static String name = "ngram"; + static String name = "ngrambf"; return name; } - bool operator() (const char *, size_t len, size_t * pos, size_t * token_start, size_t * token_len) + bool operator() (const char * data, size_t len, size_t * pos, size_t * token_start, size_t * token_len) { - if (*pos + n > len) { - return false; - } *token_start = *pos; - *token_len = n; + *token_len = 0; + for (size_t i = 0; i < n; ++i) + { + size_t sz = UTF8::seqLength(static_cast(data[*token_start + *token_len])); + if (*token_start + *token_len + 
sz > len) { + return false; + } + *token_len += sz; + } ++*pos; return true; } @@ -105,7 +116,7 @@ struct NgramTokenExtractor }; -std::unique_ptr BloomFilterIndexCreator( +std::unique_ptr bloomFilterIndexCreator( const NamesAndTypesList & new_columns, std::shared_ptr node, const MergeTreeData & data, @@ -153,8 +164,8 @@ std::unique_ptr BloomFilterIndexCreator( size_t seed = typeid_cast( *node->type->arguments->children[2]).value.get();\ - size_t bloom_filter_hashes = static_cast( - n / (node->granularity * data.index_granularity) * log(2.)); + auto bloom_filter_hashes = static_cast( + n * log(2.) / (node->granularity * data.index_granularity)); return std::make_unique( node->name, std::move(index_expr), columns, data_types, sample, node->granularity, diff --git a/dbms/src/Storages/MergeTree/MergeTreeBloomFilterIndex.h b/dbms/src/Storages/MergeTree/MergeTreeBloomFilterIndex.h index c7383683da6..fe809e7c110 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeBloomFilterIndex.h +++ b/dbms/src/Storages/MergeTree/MergeTreeBloomFilterIndex.h @@ -22,11 +22,12 @@ struct MergeTreeBloomFilterIndexGranule : public IMergeTreeIndexGranule void deserializeBinary(ReadBuffer & istr) override; - bool empty() const override; + bool empty() const override { return !has_elems; }; void update(const Block & block, size_t * pos, size_t limit) override; const MergeTreeBloomFilterIndex & index; StringBloomFilter bloom_filter; + bool has_elems; }; diff --git a/dbms/src/Storages/MergeTree/MergeTreeData.cpp b/dbms/src/Storages/MergeTree/MergeTreeData.cpp index fe0a73705b0..d303363939a 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeData.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreeData.cpp @@ -348,6 +348,7 @@ void MergeTreeData::setPrimaryKeyIndicesAndColumns( MergeTreeIndexFactory::instance().get( all_columns, std::dynamic_pointer_cast(index_decl->clone()), + *this, global_context)); if (indices_names.find(new_indices.back()->name) != indices_names.end()) diff --git 
a/dbms/src/Storages/MergeTree/MergeTreeIndices.cpp b/dbms/src/Storages/MergeTree/MergeTreeIndices.cpp index 0d426eacba5..9e366f06ed5 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeIndices.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreeIndices.cpp @@ -29,6 +29,7 @@ void MergeTreeIndexFactory::registerIndex(const std::string &name, Creator creat std::unique_ptr MergeTreeIndexFactory::get( const NamesAndTypesList & columns, std::shared_ptr node, + const MergeTreeData & data, const Context & context) const { if (!node->type) @@ -51,24 +52,35 @@ std::unique_ptr MergeTreeIndexFactory::get( return lft + ", " + rht.first; }), ErrorCodes::INCORRECT_QUERY); - return it->second(columns, node, context); + return it->second(columns, node, data, context); } std::unique_ptr minmaxIndexCreator( const NamesAndTypesList & columns, std::shared_ptr node, + const MergeTreeData & data, const Context & context); std::unique_ptr setIndexCreator( const NamesAndTypesList & columns, std::shared_ptr node, + const MergeTreeData & data, const Context & context); +std::unique_ptr bloomFilterIndexCreator( + const NamesAndTypesList & columns, + std::shared_ptr node, + const MergeTreeData & data, + const Context & context); + + MergeTreeIndexFactory::MergeTreeIndexFactory() { registerIndex("minmax", minmaxIndexCreator); registerIndex("set", setIndexCreator); + registerIndex("ngrambf", bloomFilterIndexCreator); + registerIndex("tokenbf", bloomFilterIndexCreator); } } diff --git a/dbms/src/Storages/MergeTree/MergeTreeIndices.h b/dbms/src/Storages/MergeTree/MergeTreeIndices.h index 6738d667b44..e62f487b9ef 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeIndices.h +++ b/dbms/src/Storages/MergeTree/MergeTreeIndices.h @@ -106,11 +106,13 @@ public: std::unique_ptr( const NamesAndTypesList & columns, std::shared_ptr node, + const MergeTreeData & data, const Context & context)>; std::unique_ptr get( const NamesAndTypesList & columns, std::shared_ptr node, + const MergeTreeData & data, const Context & 
context) const; void registerIndex(const std::string & name, Creator creator); diff --git a/dbms/src/Storages/MergeTree/MergeTreeMinMaxIndex.cpp b/dbms/src/Storages/MergeTree/MergeTreeMinMaxIndex.cpp index d9816c3e119..bb94a537ac3 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeMinMaxIndex.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreeMinMaxIndex.cpp @@ -119,6 +119,7 @@ IndexConditionPtr MergeTreeMinMaxIndex::createIndexCondition( std::unique_ptr minmaxIndexCreator( const NamesAndTypesList & new_columns, std::shared_ptr node, + const MergeTreeData &, const Context & context) { if (node->name.empty()) diff --git a/dbms/src/Storages/MergeTree/MergeTreeSetSkippingIndex.cpp b/dbms/src/Storages/MergeTree/MergeTreeSetSkippingIndex.cpp index 69323fe8bb4..aa138ece245 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeSetSkippingIndex.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreeSetSkippingIndex.cpp @@ -354,6 +354,7 @@ IndexConditionPtr MergeTreeSetSkippingIndex::createIndexCondition( std::unique_ptr setIndexCreator( const NamesAndTypesList & new_columns, std::shared_ptr node, + const MergeTreeData &, const Context & context) { if (node->name.empty())