mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-12-01 20:12:02 +00:00
utf8 + fixes
This commit is contained in:
parent
4343ede944
commit
f7c091d497
@ -40,6 +40,11 @@ void StringBloomFilter::add(const char * data, size_t len)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void StringBloomFilter::clear()
|
||||||
|
{
|
||||||
|
filter.assign(size, 0);
|
||||||
|
}
|
||||||
|
|
||||||
bool StringBloomFilter::contains(const StringBloomFilter & bf)
|
bool StringBloomFilter::contains(const StringBloomFilter & bf)
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < size; ++i)
|
for (size_t i = 0; i < size; ++i)
|
||||||
|
@ -41,6 +41,7 @@ public:
|
|||||||
|
|
||||||
bool find(const char * data, size_t len);
|
bool find(const char * data, size_t len);
|
||||||
void add(const char * data, size_t len);
|
void add(const char * data, size_t len);
|
||||||
|
void clear();
|
||||||
|
|
||||||
/// Checks if this contains everything from another bloom filter.
|
/// Checks if this contains everything from another bloom filter.
|
||||||
/// Bloom filters must have equal size and seed.
|
/// Bloom filters must have equal size and seed.
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
#include <Storages/MergeTree/MergeTreeBloomFilterIndex.h>
|
#include <Storages/MergeTree/MergeTreeBloomFilterIndex.h>
|
||||||
|
|
||||||
|
#include <Common/UTF8Helpers.h>
|
||||||
#include <DataTypes/DataTypesNumber.h>
|
#include <DataTypes/DataTypesNumber.h>
|
||||||
#include <IO/WriteHelpers.h>
|
#include <IO/WriteHelpers.h>
|
||||||
#include <IO/ReadHelpers.h>
|
#include <IO/ReadHelpers.h>
|
||||||
@ -23,7 +24,10 @@ namespace ErrorCodes
|
|||||||
|
|
||||||
|
|
||||||
MergeTreeBloomFilterIndexGranule::MergeTreeBloomFilterIndexGranule(const MergeTreeBloomFilterIndex & index)
|
MergeTreeBloomFilterIndexGranule::MergeTreeBloomFilterIndexGranule(const MergeTreeBloomFilterIndex & index)
|
||||||
: IMergeTreeIndexGranule(), index(index), bloom_filter(index.bloom_filter_size, index.bloom_filter_hashes, index.seed)
|
: IMergeTreeIndexGranule()
|
||||||
|
, index(index)
|
||||||
|
, bloom_filter(index.bloom_filter_size, index.bloom_filter_hashes, index.seed)
|
||||||
|
, has_elems(false)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -42,6 +46,7 @@ void MergeTreeBloomFilterIndexGranule::deserializeBinary(ReadBuffer & istr)
|
|||||||
std::vector<UInt8> filter(index.bloom_filter_size, 0);
|
std::vector<UInt8> filter(index.bloom_filter_size, 0);
|
||||||
istr.read(reinterpret_cast<char *>(filter.data()), index.bloom_filter_size);
|
istr.read(reinterpret_cast<char *>(filter.data()), index.bloom_filter_size);
|
||||||
bloom_filter.setFilter(std::move(filter));
|
bloom_filter.setFilter(std::move(filter));
|
||||||
|
has_elems = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
void MergeTreeBloomFilterIndexGranule::update(const Block & block, size_t * pos, size_t limit)
|
void MergeTreeBloomFilterIndexGranule::update(const Block & block, size_t * pos, size_t limit)
|
||||||
@ -65,6 +70,7 @@ void MergeTreeBloomFilterIndexGranule::update(const Block & block, size_t * pos,
|
|||||||
bloom_filter.add(ref.data + token_start, token_len);
|
bloom_filter.add(ref.data + token_start, token_len);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
has_elems = true;
|
||||||
*pos += rows_read;
|
*pos += rows_read;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -86,17 +92,22 @@ struct NgramTokenExtractor
|
|||||||
NgramTokenExtractor(size_t n) : n(n) {}
|
NgramTokenExtractor(size_t n) : n(n) {}
|
||||||
|
|
||||||
static String getName() {
|
static String getName() {
|
||||||
static String name = "ngram";
|
static String name = "ngrambf";
|
||||||
return name;
|
return name;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool operator() (const char *, size_t len, size_t * pos, size_t * token_start, size_t * token_len)
|
bool operator() (const char * data, size_t len, size_t * pos, size_t * token_start, size_t * token_len)
|
||||||
{
|
{
|
||||||
if (*pos + n > len) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
*token_start = *pos;
|
*token_start = *pos;
|
||||||
*token_len = n;
|
*token_len = 0;
|
||||||
|
for (size_t i = 0; i < n; ++i)
|
||||||
|
{
|
||||||
|
size_t sz = UTF8::seqLength(static_cast<UInt8>(data[*token_start + *token_len]));
|
||||||
|
if (*token_start + *token_len + sz > len) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
*token_len += sz;
|
||||||
|
}
|
||||||
++*pos;
|
++*pos;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -105,7 +116,7 @@ struct NgramTokenExtractor
|
|||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
std::unique_ptr<IMergeTreeIndex> BloomFilterIndexCreator(
|
std::unique_ptr<IMergeTreeIndex> bloomFilterIndexCreator(
|
||||||
const NamesAndTypesList & new_columns,
|
const NamesAndTypesList & new_columns,
|
||||||
std::shared_ptr<ASTIndexDeclaration> node,
|
std::shared_ptr<ASTIndexDeclaration> node,
|
||||||
const MergeTreeData & data,
|
const MergeTreeData & data,
|
||||||
@ -153,8 +164,8 @@ std::unique_ptr<IMergeTreeIndex> BloomFilterIndexCreator(
|
|||||||
size_t seed = typeid_cast<const ASTLiteral &>(
|
size_t seed = typeid_cast<const ASTLiteral &>(
|
||||||
*node->type->arguments->children[2]).value.get<size_t>();\
|
*node->type->arguments->children[2]).value.get<size_t>();\
|
||||||
|
|
||||||
size_t bloom_filter_hashes = static_cast<size_t>(
|
auto bloom_filter_hashes = static_cast<size_t>(
|
||||||
n / (node->granularity * data.index_granularity) * log(2.));
|
n * log(2.) / (node->granularity * data.index_granularity));
|
||||||
|
|
||||||
return std::make_unique<MergeTreeBloomFilterIndex>(
|
return std::make_unique<MergeTreeBloomFilterIndex>(
|
||||||
node->name, std::move(index_expr), columns, data_types, sample, node->granularity,
|
node->name, std::move(index_expr), columns, data_types, sample, node->granularity,
|
||||||
|
@ -22,11 +22,12 @@ struct MergeTreeBloomFilterIndexGranule : public IMergeTreeIndexGranule
|
|||||||
|
|
||||||
void deserializeBinary(ReadBuffer & istr) override;
|
void deserializeBinary(ReadBuffer & istr) override;
|
||||||
|
|
||||||
bool empty() const override;
|
bool empty() const override { return !has_elems; };
|
||||||
void update(const Block & block, size_t * pos, size_t limit) override;
|
void update(const Block & block, size_t * pos, size_t limit) override;
|
||||||
|
|
||||||
const MergeTreeBloomFilterIndex & index;
|
const MergeTreeBloomFilterIndex & index;
|
||||||
StringBloomFilter bloom_filter;
|
StringBloomFilter bloom_filter;
|
||||||
|
bool has_elems;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
@ -348,6 +348,7 @@ void MergeTreeData::setPrimaryKeyIndicesAndColumns(
|
|||||||
MergeTreeIndexFactory::instance().get(
|
MergeTreeIndexFactory::instance().get(
|
||||||
all_columns,
|
all_columns,
|
||||||
std::dynamic_pointer_cast<ASTIndexDeclaration>(index_decl->clone()),
|
std::dynamic_pointer_cast<ASTIndexDeclaration>(index_decl->clone()),
|
||||||
|
*this,
|
||||||
global_context));
|
global_context));
|
||||||
|
|
||||||
if (indices_names.find(new_indices.back()->name) != indices_names.end())
|
if (indices_names.find(new_indices.back()->name) != indices_names.end())
|
||||||
|
@ -29,6 +29,7 @@ void MergeTreeIndexFactory::registerIndex(const std::string &name, Creator creat
|
|||||||
std::unique_ptr<IMergeTreeIndex> MergeTreeIndexFactory::get(
|
std::unique_ptr<IMergeTreeIndex> MergeTreeIndexFactory::get(
|
||||||
const NamesAndTypesList & columns,
|
const NamesAndTypesList & columns,
|
||||||
std::shared_ptr<ASTIndexDeclaration> node,
|
std::shared_ptr<ASTIndexDeclaration> node,
|
||||||
|
const MergeTreeData & data,
|
||||||
const Context & context) const
|
const Context & context) const
|
||||||
{
|
{
|
||||||
if (!node->type)
|
if (!node->type)
|
||||||
@ -51,24 +52,35 @@ std::unique_ptr<IMergeTreeIndex> MergeTreeIndexFactory::get(
|
|||||||
return lft + ", " + rht.first;
|
return lft + ", " + rht.first;
|
||||||
}),
|
}),
|
||||||
ErrorCodes::INCORRECT_QUERY);
|
ErrorCodes::INCORRECT_QUERY);
|
||||||
return it->second(columns, node, context);
|
return it->second(columns, node, data, context);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
std::unique_ptr<IMergeTreeIndex> minmaxIndexCreator(
|
std::unique_ptr<IMergeTreeIndex> minmaxIndexCreator(
|
||||||
const NamesAndTypesList & columns,
|
const NamesAndTypesList & columns,
|
||||||
std::shared_ptr<ASTIndexDeclaration> node,
|
std::shared_ptr<ASTIndexDeclaration> node,
|
||||||
|
const MergeTreeData & data,
|
||||||
const Context & context);
|
const Context & context);
|
||||||
|
|
||||||
std::unique_ptr<IMergeTreeIndex> setIndexCreator(
|
std::unique_ptr<IMergeTreeIndex> setIndexCreator(
|
||||||
const NamesAndTypesList & columns,
|
const NamesAndTypesList & columns,
|
||||||
std::shared_ptr<ASTIndexDeclaration> node,
|
std::shared_ptr<ASTIndexDeclaration> node,
|
||||||
|
const MergeTreeData & data,
|
||||||
const Context & context);
|
const Context & context);
|
||||||
|
|
||||||
|
std::unique_ptr<IMergeTreeIndex> bloomFilterIndexCreator(
|
||||||
|
const NamesAndTypesList & columns,
|
||||||
|
std::shared_ptr<ASTIndexDeclaration> node,
|
||||||
|
const MergeTreeData & data,
|
||||||
|
const Context & context);
|
||||||
|
|
||||||
|
|
||||||
MergeTreeIndexFactory::MergeTreeIndexFactory()
|
MergeTreeIndexFactory::MergeTreeIndexFactory()
|
||||||
{
|
{
|
||||||
registerIndex("minmax", minmaxIndexCreator);
|
registerIndex("minmax", minmaxIndexCreator);
|
||||||
registerIndex("set", setIndexCreator);
|
registerIndex("set", setIndexCreator);
|
||||||
|
registerIndex("ngrambf", bloomFilterIndexCreator);
|
||||||
|
registerIndex("tokenbf", bloomFilterIndexCreator);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -106,11 +106,13 @@ public:
|
|||||||
std::unique_ptr<IMergeTreeIndex>(
|
std::unique_ptr<IMergeTreeIndex>(
|
||||||
const NamesAndTypesList & columns,
|
const NamesAndTypesList & columns,
|
||||||
std::shared_ptr<ASTIndexDeclaration> node,
|
std::shared_ptr<ASTIndexDeclaration> node,
|
||||||
|
const MergeTreeData & data,
|
||||||
const Context & context)>;
|
const Context & context)>;
|
||||||
|
|
||||||
std::unique_ptr<IMergeTreeIndex> get(
|
std::unique_ptr<IMergeTreeIndex> get(
|
||||||
const NamesAndTypesList & columns,
|
const NamesAndTypesList & columns,
|
||||||
std::shared_ptr<ASTIndexDeclaration> node,
|
std::shared_ptr<ASTIndexDeclaration> node,
|
||||||
|
const MergeTreeData & data,
|
||||||
const Context & context) const;
|
const Context & context) const;
|
||||||
|
|
||||||
void registerIndex(const std::string & name, Creator creator);
|
void registerIndex(const std::string & name, Creator creator);
|
||||||
|
@ -119,6 +119,7 @@ IndexConditionPtr MergeTreeMinMaxIndex::createIndexCondition(
|
|||||||
std::unique_ptr<IMergeTreeIndex> minmaxIndexCreator(
|
std::unique_ptr<IMergeTreeIndex> minmaxIndexCreator(
|
||||||
const NamesAndTypesList & new_columns,
|
const NamesAndTypesList & new_columns,
|
||||||
std::shared_ptr<ASTIndexDeclaration> node,
|
std::shared_ptr<ASTIndexDeclaration> node,
|
||||||
|
const MergeTreeData &,
|
||||||
const Context & context)
|
const Context & context)
|
||||||
{
|
{
|
||||||
if (node->name.empty())
|
if (node->name.empty())
|
||||||
|
@ -354,6 +354,7 @@ IndexConditionPtr MergeTreeSetSkippingIndex::createIndexCondition(
|
|||||||
std::unique_ptr<IMergeTreeIndex> setIndexCreator(
|
std::unique_ptr<IMergeTreeIndex> setIndexCreator(
|
||||||
const NamesAndTypesList & new_columns,
|
const NamesAndTypesList & new_columns,
|
||||||
std::shared_ptr<ASTIndexDeclaration> node,
|
std::shared_ptr<ASTIndexDeclaration> node,
|
||||||
|
const MergeTreeData &,
|
||||||
const Context & context)
|
const Context & context)
|
||||||
{
|
{
|
||||||
if (node->name.empty())
|
if (node->name.empty())
|
||||||
|
Loading…
Reference in New Issue
Block a user