mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-29 11:02:08 +00:00
utf8 + fixes
This commit is contained in:
parent
4343ede944
commit
f7c091d497
@ -40,6 +40,11 @@ void StringBloomFilter::add(const char * data, size_t len)
|
||||
}
|
||||
}
|
||||
|
||||
/// Resets the filter: `filter` is reassigned to hold `size` elements, all equal to 0.
void StringBloomFilter::clear()
|
||||
{
|
||||
filter.assign(size, 0);
|
||||
}
|
||||
|
||||
bool StringBloomFilter::contains(const StringBloomFilter & bf)
|
||||
{
|
||||
for (size_t i = 0; i < size; ++i)
|
||||
|
@ -41,6 +41,7 @@ public:
|
||||
|
||||
bool find(const char * data, size_t len);
|
||||
void add(const char * data, size_t len);
|
||||
void clear();
|
||||
|
||||
/// Checks if this contains everything from another bloom filter.
|
||||
/// Bloom filters must have equal size and seed.
|
||||
|
@ -1,5 +1,6 @@
|
||||
#include <Storages/MergeTree/MergeTreeBloomFilterIndex.h>
|
||||
|
||||
#include <Common/UTF8Helpers.h>
|
||||
#include <DataTypes/DataTypesNumber.h>
|
||||
#include <IO/WriteHelpers.h>
|
||||
#include <IO/ReadHelpers.h>
|
||||
@ -23,7 +24,10 @@ namespace ErrorCodes
|
||||
|
||||
|
||||
/// Builds a granule for the given index.
/// The bloom filter is constructed from the index's size, hash count and seed.
/// `has_elems` starts out false, so empty() reports true until deserializeBinary()
/// or update() sets the flag.
MergeTreeBloomFilterIndexGranule::MergeTreeBloomFilterIndexGranule(const MergeTreeBloomFilterIndex & index)
    : IMergeTreeIndexGranule()
    , index(index)
    , bloom_filter(index.bloom_filter_size, index.bloom_filter_hashes, index.seed)
    , has_elems(false)
{
}
|
||||
|
||||
@ -42,6 +46,7 @@ void MergeTreeBloomFilterIndexGranule::deserializeBinary(ReadBuffer & istr)
|
||||
std::vector<UInt8> filter(index.bloom_filter_size, 0);
|
||||
istr.read(reinterpret_cast<char *>(filter.data()), index.bloom_filter_size);
|
||||
bloom_filter.setFilter(std::move(filter));
|
||||
has_elems = true;
|
||||
}
|
||||
|
||||
void MergeTreeBloomFilterIndexGranule::update(const Block & block, size_t * pos, size_t limit)
|
||||
@ -65,6 +70,7 @@ void MergeTreeBloomFilterIndexGranule::update(const Block & block, size_t * pos,
|
||||
bloom_filter.add(ref.data + token_start, token_len);
|
||||
}
|
||||
|
||||
has_elems = true;
|
||||
*pos += rows_read;
|
||||
}
|
||||
|
||||
@ -86,17 +92,22 @@ struct NgramTokenExtractor
|
||||
/// Stores `n`, the number of UTF-8 sequences that operator() packs into each token.
NgramTokenExtractor(size_t n) : n(n) {}
|
||||
|
||||
/// Name of this extractor. It equals the "ngrambf" key under which
/// MergeTreeIndexFactory registers the bloom filter index creator.
static String getName()
{
    static String name = "ngrambf";
    return name;
}
|
||||
|
||||
bool operator() (const char *, size_t len, size_t * pos, size_t * token_start, size_t * token_len)
|
||||
bool operator() (const char * data, size_t len, size_t * pos, size_t * token_start, size_t * token_len)
|
||||
{
|
||||
if (*pos + n > len) {
|
||||
return false;
|
||||
}
|
||||
*token_start = *pos;
|
||||
*token_len = n;
|
||||
*token_len = 0;
|
||||
for (size_t i = 0; i < n; ++i)
|
||||
{
|
||||
size_t sz = UTF8::seqLength(static_cast<UInt8>(data[*token_start + *token_len]));
|
||||
if (*token_start + *token_len + sz > len) {
|
||||
return false;
|
||||
}
|
||||
*token_len += sz;
|
||||
}
|
||||
++*pos;
|
||||
return true;
|
||||
}
|
||||
@ -105,7 +116,7 @@ struct NgramTokenExtractor
|
||||
};
|
||||
|
||||
|
||||
std::unique_ptr<IMergeTreeIndex> BloomFilterIndexCreator(
|
||||
std::unique_ptr<IMergeTreeIndex> bloomFilterIndexCreator(
|
||||
const NamesAndTypesList & new_columns,
|
||||
std::shared_ptr<ASTIndexDeclaration> node,
|
||||
const MergeTreeData & data,
|
||||
@ -153,8 +164,8 @@ std::unique_ptr<IMergeTreeIndex> BloomFilterIndexCreator(
|
||||
size_t seed = typeid_cast<const ASTLiteral &>(
|
||||
*node->type->arguments->children[2]).value.get<size_t>();\
|
||||
|
||||
size_t bloom_filter_hashes = static_cast<size_t>(
|
||||
n / (node->granularity * data.index_granularity) * log(2.));
|
||||
auto bloom_filter_hashes = static_cast<size_t>(
|
||||
n * log(2.) / (node->granularity * data.index_granularity));
|
||||
|
||||
return std::make_unique<MergeTreeBloomFilterIndex>(
|
||||
node->name, std::move(index_expr), columns, data_types, sample, node->granularity,
|
||||
|
@ -22,11 +22,12 @@ struct MergeTreeBloomFilterIndexGranule : public IMergeTreeIndexGranule
|
||||
|
||||
void deserializeBinary(ReadBuffer & istr) override;
|
||||
|
||||
bool empty() const override;
|
||||
bool empty() const override { return !has_elems; };
|
||||
void update(const Block & block, size_t * pos, size_t limit) override;
|
||||
|
||||
const MergeTreeBloomFilterIndex & index;
|
||||
StringBloomFilter bloom_filter;
|
||||
bool has_elems;
|
||||
};
|
||||
|
||||
|
||||
|
@ -348,6 +348,7 @@ void MergeTreeData::setPrimaryKeyIndicesAndColumns(
|
||||
MergeTreeIndexFactory::instance().get(
|
||||
all_columns,
|
||||
std::dynamic_pointer_cast<ASTIndexDeclaration>(index_decl->clone()),
|
||||
*this,
|
||||
global_context));
|
||||
|
||||
if (indices_names.find(new_indices.back()->name) != indices_names.end())
|
||||
|
@ -29,6 +29,7 @@ void MergeTreeIndexFactory::registerIndex(const std::string &name, Creator creat
|
||||
std::unique_ptr<IMergeTreeIndex> MergeTreeIndexFactory::get(
|
||||
const NamesAndTypesList & columns,
|
||||
std::shared_ptr<ASTIndexDeclaration> node,
|
||||
const MergeTreeData & data,
|
||||
const Context & context) const
|
||||
{
|
||||
if (!node->type)
|
||||
@ -51,24 +52,35 @@ std::unique_ptr<IMergeTreeIndex> MergeTreeIndexFactory::get(
|
||||
return lft + ", " + rht.first;
|
||||
}),
|
||||
ErrorCodes::INCORRECT_QUERY);
|
||||
return it->second(columns, node, context);
|
||||
return it->second(columns, node, data, context);
|
||||
}
|
||||
|
||||
|
||||
/// Forward declarations of the index creator functions that the
/// MergeTreeIndexFactory constructor registers. Each takes the column list, the
/// index declaration node, the owning MergeTreeData and the context.
std::unique_ptr<IMergeTreeIndex> minmaxIndexCreator(
|
||||
const NamesAndTypesList & columns,
|
||||
std::shared_ptr<ASTIndexDeclaration> node,
|
||||
const MergeTreeData & data,
|
||||
const Context & context);
|
||||
|
||||
std::unique_ptr<IMergeTreeIndex> setIndexCreator(
|
||||
const NamesAndTypesList & columns,
|
||||
std::shared_ptr<ASTIndexDeclaration> node,
|
||||
const MergeTreeData & data,
|
||||
const Context & context);
|
||||
|
||||
std::unique_ptr<IMergeTreeIndex> bloomFilterIndexCreator(
|
||||
const NamesAndTypesList & columns,
|
||||
std::shared_ptr<ASTIndexDeclaration> node,
|
||||
const MergeTreeData & data,
|
||||
const Context & context);
|
||||
|
||||
|
||||
/// Registers the index types by name: "minmax" and "set" each get their own creator,
/// while "ngrambf" and "tokenbf" are both mapped to bloomFilterIndexCreator.
MergeTreeIndexFactory::MergeTreeIndexFactory()
|
||||
{
|
||||
registerIndex("minmax", minmaxIndexCreator);
|
||||
registerIndex("set", setIndexCreator);
|
||||
registerIndex("ngrambf", bloomFilterIndexCreator);
|
||||
registerIndex("tokenbf", bloomFilterIndexCreator);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -106,11 +106,13 @@ public:
|
||||
std::unique_ptr<IMergeTreeIndex>(
|
||||
const NamesAndTypesList & columns,
|
||||
std::shared_ptr<ASTIndexDeclaration> node,
|
||||
const MergeTreeData & data,
|
||||
const Context & context)>;
|
||||
|
||||
std::unique_ptr<IMergeTreeIndex> get(
|
||||
const NamesAndTypesList & columns,
|
||||
std::shared_ptr<ASTIndexDeclaration> node,
|
||||
const MergeTreeData & data,
|
||||
const Context & context) const;
|
||||
|
||||
void registerIndex(const std::string & name, Creator creator);
|
||||
|
@ -119,6 +119,7 @@ IndexConditionPtr MergeTreeMinMaxIndex::createIndexCondition(
|
||||
std::unique_ptr<IMergeTreeIndex> minmaxIndexCreator(
|
||||
const NamesAndTypesList & new_columns,
|
||||
std::shared_ptr<ASTIndexDeclaration> node,
|
||||
const MergeTreeData &,
|
||||
const Context & context)
|
||||
{
|
||||
if (node->name.empty())
|
||||
|
@ -354,6 +354,7 @@ IndexConditionPtr MergeTreeSetSkippingIndex::createIndexCondition(
|
||||
std::unique_ptr<IMergeTreeIndex> setIndexCreator(
|
||||
const NamesAndTypesList & new_columns,
|
||||
std::shared_ptr<ASTIndexDeclaration> node,
|
||||
const MergeTreeData &,
|
||||
const Context & context)
|
||||
{
|
||||
if (node->name.empty())
|
||||
|
Loading…
Reference in New Issue
Block a user