utf8 + fixes

This commit is contained in:
Nikita Vasilev 2019-02-20 15:48:50 +03:00
parent 4343ede944
commit f7c091d497
9 changed files with 47 additions and 12 deletions

View File

@ -40,6 +40,11 @@ void StringBloomFilter::add(const char * data, size_t len)
}
}
/// Resets the filter: `filter` is re-assigned to hold `size` elements, all equal to zero,
/// so nothing set by earlier calls remains.
void StringBloomFilter::clear()
{
filter.assign(size, 0);
}
bool StringBloomFilter::contains(const StringBloomFilter & bf)
{
for (size_t i = 0; i < size; ++i)

View File

@ -41,6 +41,7 @@ public:
bool find(const char * data, size_t len);
void add(const char * data, size_t len);
/// Resets the filter: the underlying storage is re-filled with zeros.
void clear();
/// Checks if this contains everything from another bloom filter.
/// Bloom filters must have equal size and seed.

View File

@ -1,5 +1,6 @@
#include <Storages/MergeTree/MergeTreeBloomFilterIndex.h>
#include <Common/UTF8Helpers.h>
#include <DataTypes/DataTypesNumber.h>
#include <IO/WriteHelpers.h>
#include <IO/ReadHelpers.h>
@ -23,7 +24,10 @@ namespace ErrorCodes
/// Creates an empty granule bound to `index`.
/// The bloom filter is constructed from the index parameters (size, number of hashes, seed).
/// `has_elems` starts as false and is switched to true by update() and deserializeBinary().
MergeTreeBloomFilterIndexGranule::MergeTreeBloomFilterIndexGranule(const MergeTreeBloomFilterIndex & index)
    : IMergeTreeIndexGranule()
    , index(index)
    , bloom_filter(index.bloom_filter_size, index.bloom_filter_hashes, index.seed)
    , has_elems(false)
{
}
@ -42,6 +46,7 @@ void MergeTreeBloomFilterIndexGranule::deserializeBinary(ReadBuffer & istr)
std::vector<UInt8> filter(index.bloom_filter_size, 0);
istr.read(reinterpret_cast<char *>(filter.data()), index.bloom_filter_size);
bloom_filter.setFilter(std::move(filter));
has_elems = true;
}
void MergeTreeBloomFilterIndexGranule::update(const Block & block, size_t * pos, size_t limit)
@ -65,6 +70,7 @@ void MergeTreeBloomFilterIndexGranule::update(const Block & block, size_t * pos,
bloom_filter.add(ref.data + token_start, token_len);
}
has_elems = true;
*pos += rows_read;
}
@ -86,17 +92,22 @@ struct NgramTokenExtractor
/// `n` is the number of UTF-8 code points that make up each extracted token.
NgramTokenExtractor(size_t n) : n(n) {}
/// Name of this tokenizer. It is the same string as the "ngrambf" index type
/// registered in MergeTreeIndexFactory.
static String getName()
{
    static const String name = "ngrambf";
    return name;
}
bool operator() (const char *, size_t len, size_t * pos, size_t * token_start, size_t * token_len)
bool operator() (const char * data, size_t len, size_t * pos, size_t * token_start, size_t * token_len)
{
if (*pos + n > len) {
return false;
}
*token_start = *pos;
*token_len = n;
*token_len = 0;
for (size_t i = 0; i < n; ++i)
{
size_t sz = UTF8::seqLength(static_cast<UInt8>(data[*token_start + *token_len]));
if (*token_start + *token_len + sz > len) {
return false;
}
*token_len += sz;
}
++*pos;
return true;
}
@ -105,7 +116,7 @@ struct NgramTokenExtractor
};
std::unique_ptr<IMergeTreeIndex> BloomFilterIndexCreator(
std::unique_ptr<IMergeTreeIndex> bloomFilterIndexCreator(
const NamesAndTypesList & new_columns,
std::shared_ptr<ASTIndexDeclaration> node,
const MergeTreeData & data,
@ -153,8 +164,8 @@ std::unique_ptr<IMergeTreeIndex> BloomFilterIndexCreator(
size_t seed = typeid_cast<const ASTLiteral &>(
*node->type->arguments->children[2]).value.get<size_t>();\
size_t bloom_filter_hashes = static_cast<size_t>(
n / (node->granularity * data.index_granularity) * log(2.));
auto bloom_filter_hashes = static_cast<size_t>(
n * log(2.) / (node->granularity * data.index_granularity));
return std::make_unique<MergeTreeBloomFilterIndex>(
node->name, std::move(index_expr), columns, data_types, sample, node->granularity,

View File

@ -22,11 +22,12 @@ struct MergeTreeBloomFilterIndexGranule : public IMergeTreeIndexGranule
void deserializeBinary(ReadBuffer & istr) override;
/// A granule counts as empty until update() or deserializeBinary() has set `has_elems`.
bool empty() const override { return !has_elems; }
void update(const Block & block, size_t * pos, size_t limit) override;
const MergeTreeBloomFilterIndex & index;
StringBloomFilter bloom_filter;
bool has_elems;
};

View File

@ -348,6 +348,7 @@ void MergeTreeData::setPrimaryKeyIndicesAndColumns(
MergeTreeIndexFactory::instance().get(
all_columns,
std::dynamic_pointer_cast<ASTIndexDeclaration>(index_decl->clone()),
*this,
global_context));
if (indices_names.find(new_indices.back()->name) != indices_names.end())

View File

@ -29,6 +29,7 @@ void MergeTreeIndexFactory::registerIndex(const std::string &name, Creator creat
std::unique_ptr<IMergeTreeIndex> MergeTreeIndexFactory::get(
const NamesAndTypesList & columns,
std::shared_ptr<ASTIndexDeclaration> node,
const MergeTreeData & data,
const Context & context) const
{
if (!node->type)
@ -51,24 +52,35 @@ std::unique_ptr<IMergeTreeIndex> MergeTreeIndexFactory::get(
return lft + ", " + rht.first;
}),
ErrorCodes::INCORRECT_QUERY);
return it->second(columns, node, context);
return it->second(columns, node, data, context);
}
/// Creator functions for the built-in index types. They are only declared here;
/// each one is defined in the source file of the corresponding index implementation.
/// All of them share the signature expected by MergeTreeIndexFactory::registerIndex.
std::unique_ptr<IMergeTreeIndex> minmaxIndexCreator(
const NamesAndTypesList & columns,
std::shared_ptr<ASTIndexDeclaration> node,
const MergeTreeData & data,
const Context & context);
std::unique_ptr<IMergeTreeIndex> setIndexCreator(
const NamesAndTypesList & columns,
std::shared_ptr<ASTIndexDeclaration> node,
const MergeTreeData & data,
const Context & context);
std::unique_ptr<IMergeTreeIndex> bloomFilterIndexCreator(
const NamesAndTypesList & columns,
std::shared_ptr<ASTIndexDeclaration> node,
const MergeTreeData & data,
const Context & context);
/// Registers every built-in index type under the name used in index declarations.
MergeTreeIndexFactory::MergeTreeIndexFactory()
{
    /// One row per index type name; "ngrambf" and "tokenbf" are both served by bloomFilterIndexCreator.
    struct BuiltinIndex
    {
        const char * type_name;
        Creator creator;
    };

    const BuiltinIndex builtin_indices[] =
    {
        {"minmax", minmaxIndexCreator},
        {"set", setIndexCreator},
        {"ngrambf", bloomFilterIndexCreator},
        {"tokenbf", bloomFilterIndexCreator},
    };

    /// Registration order is the same as the order of the rows above.
    for (const auto & entry : builtin_indices)
        registerIndex(entry.type_name, entry.creator);
}
}

View File

@ -106,11 +106,13 @@ public:
std::unique_ptr<IMergeTreeIndex>(
const NamesAndTypesList & columns,
std::shared_ptr<ASTIndexDeclaration> node,
const MergeTreeData & data,
const Context & context)>;
std::unique_ptr<IMergeTreeIndex> get(
const NamesAndTypesList & columns,
std::shared_ptr<ASTIndexDeclaration> node,
const MergeTreeData & data,
const Context & context) const;
void registerIndex(const std::string & name, Creator creator);

View File

@ -119,6 +119,7 @@ IndexConditionPtr MergeTreeMinMaxIndex::createIndexCondition(
std::unique_ptr<IMergeTreeIndex> minmaxIndexCreator(
const NamesAndTypesList & new_columns,
std::shared_ptr<ASTIndexDeclaration> node,
const MergeTreeData &,
const Context & context)
{
if (node->name.empty())

View File

@ -354,6 +354,7 @@ IndexConditionPtr MergeTreeSetSkippingIndex::createIndexCondition(
std::unique_ptr<IMergeTreeIndex> setIndexCreator(
const NamesAndTypesList & new_columns,
std::shared_ptr<ASTIndexDeclaration> node,
const MergeTreeData &,
const Context & context)
{
if (node->name.empty())