2022-05-13 10:37:04 +00:00
|
|
|
#ifdef ENABLE_ANNOY
|
|
|
|
|
2022-04-01 18:47:48 +00:00
|
|
|
#include <Storages/MergeTree/MergeTreeIndexAnnoy.h>
|
|
|
|
#include <IO/ReadHelpers.h>
|
|
|
|
#include <IO/WriteHelpers.h>
|
2022-04-15 12:08:54 +00:00
|
|
|
#include <Common/FieldVisitorsAccurateComparison.h>
|
2022-04-01 18:47:48 +00:00
|
|
|
#include <Common/typeid_cast.h>
|
2022-04-15 12:08:54 +00:00
|
|
|
#include <Parsers/ASTFunction.h>
|
|
|
|
|
|
|
|
#include <Poco/Logger.h>
|
|
|
|
#include <base/logger_useful.h>
|
|
|
|
|
|
|
|
#include "Core/Field.h"
|
|
|
|
#include "Interpreters/Context_fwd.h"
|
|
|
|
#include "MergeTreeIndices.h"
|
|
|
|
#include "KeyCondition.h"
|
|
|
|
#include "Parsers/ASTIdentifier.h"
|
|
|
|
#include "Parsers/ASTSelectQuery.h"
|
|
|
|
#include "Parsers/IAST_fwd.h"
|
|
|
|
#include "Storages/SelectQueryInfo.h"
|
|
|
|
#include "base/types.h"
|
2022-04-01 18:47:48 +00:00
|
|
|
|
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
|
|
|
namespace Annoy
|
|
|
|
{
|
|
|
|
|
|
|
|
template<typename Dist>
|
|
|
|
void AnnoyIndexSerialize<Dist>::serialize(WriteBuffer& ostr) const
|
|
|
|
{
|
2022-05-14 08:24:54 +00:00
|
|
|
if (!Base::_built)
|
|
|
|
{
|
2022-04-01 18:47:48 +00:00
|
|
|
throw Exception("Annoy Index should be built before serialization", ErrorCodes::LOGICAL_ERROR);
|
|
|
|
}
|
|
|
|
writeIntBinary(Base::_s, ostr);
|
|
|
|
writeIntBinary(Base::_n_items, ostr);
|
|
|
|
writeIntBinary(Base::_n_nodes, ostr);
|
|
|
|
writeIntBinary(Base::_nodes_size, ostr);
|
|
|
|
writeIntBinary(Base::_K, ostr);
|
|
|
|
writeIntBinary(Base::_seed, ostr);
|
2022-04-02 11:17:09 +00:00
|
|
|
writeVectorBinary(Base::_roots, ostr);
|
2022-04-01 18:47:48 +00:00
|
|
|
ostr.write(reinterpret_cast<const char*>(Base::_nodes), Base::_s * Base::_n_nodes);
|
|
|
|
}
|
|
|
|
|
|
|
|
template<typename Dist>
|
|
|
|
void AnnoyIndexSerialize<Dist>::deserialize(ReadBuffer& istr)
|
|
|
|
{
|
|
|
|
readIntBinary(Base::_s, istr);
|
|
|
|
readIntBinary(Base::_n_items, istr);
|
|
|
|
readIntBinary(Base::_n_nodes, istr);
|
|
|
|
readIntBinary(Base::_nodes_size, istr);
|
|
|
|
readIntBinary(Base::_K, istr);
|
|
|
|
readIntBinary(Base::_seed, istr);
|
2022-04-02 11:17:09 +00:00
|
|
|
readVectorBinary(Base::_roots, istr);
|
2022-04-01 18:47:48 +00:00
|
|
|
Base::_nodes = realloc(Base::_nodes, Base::_s * Base::_n_nodes);
|
|
|
|
istr.read(reinterpret_cast<char*>(Base::_nodes), Base::_s * Base::_n_nodes);
|
|
|
|
|
2022-04-02 11:17:09 +00:00
|
|
|
Base::_fd = 0;
|
2022-04-01 18:47:48 +00:00
|
|
|
// set flags
|
|
|
|
Base::_loaded = false;
|
|
|
|
Base::_verbose = false;
|
|
|
|
Base::_on_disk = false;
|
|
|
|
Base::_built = true;
|
|
|
|
}
|
|
|
|
|
2022-05-04 21:38:11 +00:00
|
|
|
template<typename Dist>
|
2022-05-14 08:24:54 +00:00
|
|
|
float AnnoyIndexSerialize<Dist>::getSpaceDim() const
|
|
|
|
{
|
2022-05-04 21:38:11 +00:00
|
|
|
return Base::get_f();
|
|
|
|
}
|
|
|
|
|
2022-04-01 18:47:48 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
namespace ErrorCodes
|
|
|
|
{
|
|
|
|
extern const int LOGICAL_ERROR;
|
2022-05-08 21:24:58 +00:00
|
|
|
extern const int INCORRECT_QUERY;
|
2022-04-01 18:47:48 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Creates a granule that has no Annoy index attached yet (index_base is null),
/// so empty() reports true until an index is deserialized into it.
MergeTreeIndexGranuleAnnoy::MergeTreeIndexGranuleAnnoy(const String & index_name_, const Block & index_sample_block_)
    : index_name(index_name_), index_sample_block(index_sample_block_), index_base(nullptr)
{
}
|
|
|
|
|
|
|
|
/// Creates a granule around an already existing Annoy index; the index pointer
/// is moved into the granule.
MergeTreeIndexGranuleAnnoy::MergeTreeIndexGranuleAnnoy(
    const String & index_name_, const Block & index_sample_block_, AnnoyIndexPtr index_base_)
    : index_name(index_name_), index_sample_block(index_sample_block_), index_base(std::move(index_base_))
{
}
|
|
|
|
|
2022-05-14 08:24:54 +00:00
|
|
|
bool MergeTreeIndexGranuleAnnoy::empty() const
|
|
|
|
{
|
2022-04-01 18:47:48 +00:00
|
|
|
return !static_cast<bool>(index_base);
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Writes the granule to `ostr`: the index dimension first, followed by the
/// serialized Annoy index itself.
void MergeTreeIndexGranuleAnnoy::serializeBinary(WriteBuffer & ostr) const
{
    // deserializeBinary() reads the dimension first so it can construct the index
    // before loading the rest.
    const auto dimension = index_base->get_f();
    writeIntBinary(dimension, ostr);

    index_base->serialize(ostr);
}
|
|
|
|
|
|
|
|
/// Reads a granule from `istr`: the dimension comes first and is used to construct a
/// fresh Annoy index, which then loads its own contents from the buffer.
/// The format version argument is not used.
void MergeTreeIndexGranuleAnnoy::deserializeBinary(ReadBuffer & istr, MergeTreeIndexVersion /*version*/)
{
    int stored_dimension;
    readIntBinary(stored_dimension, istr);

    auto loaded_index = std::make_shared<AnnoyIndex>(stored_dimension);
    index_base = loaded_index;
    index_base->deserialize(istr);
}
|
|
|
|
|
|
|
|
|
|
|
|
/// Stores the index name, the sample block and the index parameter;
/// `index_param_` is later handed to the Annoy index's build() in getGranuleAndReset().
MergeTreeIndexAggregatorAnnoy::MergeTreeIndexAggregatorAnnoy(
    const String & index_name_, const Block & index_sample_block_, int index_param_)
    : index_name(index_name_), index_sample_block(index_sample_block_), index_param(index_param_)
{
}
|
|
|
|
|
|
|
|
bool MergeTreeIndexAggregatorAnnoy::empty() const
|
|
|
|
{
|
2022-05-12 12:30:18 +00:00
|
|
|
return !index_base || index_base->get_n_items() == 0;
|
2022-04-01 18:47:48 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Builds the accumulated Annoy index, wraps it into a granule and drops the
/// aggregator's own pointer, so the next update() starts a new index.
MergeTreeIndexGranulePtr MergeTreeIndexAggregatorAnnoy::getGranuleAndReset()
{
    // The index has to be built before the granule can be serialized.
    index_base->build(index_param);

    auto finished_granule = std::make_shared<MergeTreeIndexGranuleAnnoy>(index_name, index_sample_block, index_base);

    // Release our reference; the granule keeps the index alive.
    index_base = nullptr;

    return finished_granule;
}
|
|
|
|
|
|
|
|
void MergeTreeIndexAggregatorAnnoy::update(const Block & block, size_t * pos, size_t limit)
|
|
|
|
{
|
|
|
|
if (*pos >= block.rows())
|
|
|
|
throw Exception(
|
|
|
|
"The provided position is not less than the number of block rows. Position: "
|
|
|
|
+ toString(*pos) + ", Block rows: " + toString(block.rows()) + ".", ErrorCodes::LOGICAL_ERROR);
|
|
|
|
|
|
|
|
size_t rows_read = std::min(limit, block.rows() - *pos);
|
|
|
|
|
2022-05-14 08:24:54 +00:00
|
|
|
if (index_sample_block.columns() > 1)
|
|
|
|
{
|
2022-04-01 18:47:48 +00:00
|
|
|
throw Exception("Only one column is supported", ErrorCodes::LOGICAL_ERROR);
|
|
|
|
}
|
|
|
|
|
|
|
|
auto index_column_name = index_sample_block.getByPosition(0).name;
|
|
|
|
const auto & column_cut = block.getByName(index_column_name).column->cut(*pos, rows_read);
|
2022-04-15 12:08:54 +00:00
|
|
|
const auto & column_tuple = typeid_cast<const ColumnTuple*>(column_cut.get());
|
|
|
|
const auto & columns = column_tuple->getColumns();
|
|
|
|
|
|
|
|
std::vector<std::vector<Float32>> data{column_tuple->size(), std::vector<Float32>()};
|
2022-05-14 08:24:54 +00:00
|
|
|
for (size_t j = 0; j < columns.size(); ++j)
|
|
|
|
{
|
2022-04-15 12:08:54 +00:00
|
|
|
const auto& pod_array = typeid_cast<const ColumnFloat32*>(columns[j].get())->getData();
|
2022-05-14 08:24:54 +00:00
|
|
|
for (size_t i = 0; i < pod_array.size(); ++i)
|
|
|
|
{
|
2022-04-15 12:08:54 +00:00
|
|
|
data[i].push_back(pod_array[i]);
|
|
|
|
}
|
|
|
|
}
|
2022-05-12 12:30:18 +00:00
|
|
|
assert(!data.empty());
|
2022-05-14 08:24:54 +00:00
|
|
|
if (!index_base)
|
|
|
|
{
|
2022-05-12 12:30:18 +00:00
|
|
|
index_base = std::make_shared<AnnoyIndex>(data[0].size());
|
|
|
|
}
|
2022-05-14 08:24:54 +00:00
|
|
|
for (const auto& item : data)
|
|
|
|
{
|
2022-04-15 12:08:54 +00:00
|
|
|
index_base->add_item(index_base->get_n_items(), &item[0]);
|
2022-04-01 18:47:48 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
*pos += rows_read;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2022-04-15 12:08:54 +00:00
|
|
|
/// Builds the condition object from the query and context.
/// The index description is accepted but not used.
MergeTreeIndexConditionAnnoy::MergeTreeIndexConditionAnnoy(
    const IndexDescription & /*index*/, const SelectQueryInfo & query, ContextPtr context)
    : condition(query, context)
{
}
|
2022-04-15 12:08:54 +00:00
|
|
|
|
|
|
|
|
2022-04-01 18:47:48 +00:00
|
|
|
/// Decides whether the granule may contain a vector that is closer to the query's target
/// vector than the distance the query compares against.
///
/// Searches the granule's Annoy index for the single nearest neighbour of the target
/// vector and returns true when its distance is below condition.getComparisonDistance().
/// When the granule holds no index, or the search yields no neighbour, nothing can be
/// ruled out and true is returned so that the granule is kept.
///
/// Throws LOGICAL_ERROR when the granule is not a MergeTreeIndexGranuleAnnoy, and
/// INCORRECT_QUERY when the query dimension differs from the index dimension or the
/// index setting string cannot be parsed as an int.
bool MergeTreeIndexConditionAnnoy::mayBeTrueOnGranule(MergeTreeIndexGranulePtr idx_granule) const
{
    auto granule = std::dynamic_pointer_cast<MergeTreeIndexGranuleAnnoy>(idx_granule);
    if (granule == nullptr)
    {
        throw Exception("Granule has the wrong type", ErrorCodes::LOGICAL_ERROR);
    }

    auto annoy = granule->index_base;
    // An empty granule carries a null index; dereferencing it would crash.
    if (!annoy)
        return true;

    if (condition.getSpaceDim() != annoy->getSpaceDim())
    {
        throw Exception("The dimension of the space in the request (" + toString(condition.getSpaceDim()) + ") "
            + "does not match with the dimension in the index (" + toString(annoy->getSpaceDim()) + ")", ErrorCodes::INCORRECT_QUERY);
    }

    std::vector<float> target_vec = condition.getTargetVector();
    float max_distance = condition.getComparisonDistance();

    std::vector<int32_t> items;
    std::vector<float> dist;

    // -1 is passed through to the search unless the setting string overrides it.
    int k_search = -1;
    auto settings_str = condition.getSettingsStr();
    if (!settings_str.empty())
    {
        try
        {
            k_search = std::stoi(settings_str);
        }
        catch (...)
        {
            throw Exception("Setting of the annoy index should be int", ErrorCodes::INCORRECT_QUERY);
        }
    }

    annoy->get_nns_by_vector(target_vec.data(), 1, k_search, &items, &dist);

    // The search may come back empty (e.g. an index without items); reading dist[0]
    // unconditionally would then be out of bounds.
    if (dist.empty())
        return true;

    return dist.front() < max_distance;
}
|
|
|
|
|
|
|
|
bool MergeTreeIndexConditionAnnoy::alwaysUnknownOrTrue() const
|
|
|
|
{
|
2022-05-08 21:24:58 +00:00
|
|
|
return condition.alwaysUnknownOrTrue("L2Distance");
|
2022-04-01 18:47:48 +00:00
|
|
|
}
|
|
|
|
|
2022-05-14 08:24:54 +00:00
|
|
|
/// Returns the distinct range numbers (neighbour id / 8192) that contain nearest
/// neighbours of the query's target vector, in unspecified order.
///
/// Up to condition.getLimitCount() neighbours are requested from the granule's Annoy
/// index. When the query has a WHERE clause, neighbours farther away than
/// condition.getComparisonDistance() are dropped. A granule without an index yields an
/// empty result, since it holds no vectors to search.
///
/// Throws LOGICAL_ERROR when the granule is not a MergeTreeIndexGranuleAnnoy, and
/// INCORRECT_QUERY when the query dimension differs from the index dimension or the
/// index setting string cannot be parsed as an int.
std::vector<size_t> MergeTreeIndexConditionAnnoy::getUsefulRanges(MergeTreeIndexGranulePtr idx_granule) const
{
    const UInt64 limit = condition.getLimitCount();
    const std::optional<float> comp_dist
        = condition.queryHasWhereClause() ? std::optional<float>(condition.getComparisonDistance()) : std::nullopt;
    std::vector<float> target_vec = condition.getTargetVector();

    auto granule = std::dynamic_pointer_cast<MergeTreeIndexGranuleAnnoy>(idx_granule);
    if (granule == nullptr)
    {
        throw Exception("Granule has the wrong type", ErrorCodes::LOGICAL_ERROR);
    }

    auto annoy = granule->index_base;
    // An empty granule carries a null index; dereferencing it would crash.
    if (!annoy)
        return {};

    if (condition.getSpaceDim() != annoy->getSpaceDim())
    {
        throw Exception("The dimension of the space in the request (" + toString(condition.getSpaceDim()) + ") "
            + "does not match with the dimension in the index (" + toString(annoy->getSpaceDim()) + ")", ErrorCodes::INCORRECT_QUERY);
    }

    // Request as many neighbours as the query limit asks for, but never more than the
    // index stores. The original always requested one neighbour although it read `limit`
    // and reserved space for it.
    // NOTE(review): a limit of 0 falls back to one neighbour, which is what the previous
    // code always requested — confirm what getLimitCount() returns for queries without LIMIT.
    const auto items_in_index = static_cast<UInt64>(annoy->get_n_items());
    UInt64 neighbours_wanted = std::min(limit, items_in_index);
    if (neighbours_wanted == 0)
        neighbours_wanted = 1;

    std::vector<int32_t> items;
    std::vector<float> dist;
    items.reserve(neighbours_wanted);
    dist.reserve(neighbours_wanted);

    // -1 is passed through to the search unless the setting string overrides it.
    int k_search = -1;
    auto settings_str = condition.getSettingsStr();
    if (!settings_str.empty())
    {
        try
        {
            k_search = std::stoi(settings_str);
        }
        catch (...)
        {
            throw Exception("Setting of the annoy index should be int", ErrorCodes::INCORRECT_QUERY);
        }
    }

    annoy->get_nns_by_vector(target_vec.data(), neighbours_wanted, k_search, &items, &dist);

    // NOTE(review): 8192 presumably mirrors the table's index granularity — confirm and
    // take it from the table settings instead of hard-coding it.
    static constexpr int32_t rows_per_range = 8192;

    std::unordered_set<size_t> result;
    for (size_t i = 0; i < items.size(); ++i)
    {
        // Only a WHERE-style query carries a distance bound to filter on.
        if (comp_dist && dist[i] > *comp_dist)
        {
            continue;
        }
        result.insert(items[i] / rows_per_range);
    }

    return std::vector<size_t>(result.begin(), result.end());
}
|
|
|
|
|
2022-04-01 18:47:48 +00:00
|
|
|
|
|
|
|
/// Creates a granule for this index with no Annoy index attached yet.
MergeTreeIndexGranulePtr MergeTreeIndexAnnoy::createIndexGranule() const
{
    auto blank_granule = std::make_shared<MergeTreeIndexGranuleAnnoy>(index.name, index.sample_block);
    return blank_granule;
}
|
|
|
|
|
|
|
|
/// Creates an aggregator for this index, passing the index parameter on to it.
MergeTreeIndexAggregatorPtr MergeTreeIndexAnnoy::createIndexAggregator() const
{
    auto aggregator = std::make_shared<MergeTreeIndexAggregatorAnnoy>(index.name, index.sample_block, index_param);
    return aggregator;
}
|
|
|
|
|
|
|
|
/// Creates the condition object that evaluates `query` against this index.
MergeTreeIndexConditionPtr MergeTreeIndexAnnoy::createIndexCondition(
    const SelectQueryInfo & query, ContextPtr context) const
{
    return std::make_shared<MergeTreeIndexConditionAnnoy>(index, query, context);
}
|
|
|
|
|
|
|
|
/// Detects which on-disk index file exists for `relative_path_prefix`:
/// ".idx2" (version 2) is preferred over ".idx" (version 1);
/// version 0 with an empty extension means neither file was found.
MergeTreeIndexFormat MergeTreeIndexAnnoy::getDeserializedFormat(const DiskPtr disk, const std::string & relative_path_prefix) const
{
    const auto file_exists = [&](const char * extension) { return disk->exists(relative_path_prefix + extension); };

    if (file_exists(".idx2"))
        return {2, ".idx2"};

    if (file_exists(".idx"))
        return {1, ".idx"};

    return {0 /* unknown */, ""};
}
|
|
|
|
|
2022-05-14 10:27:05 +00:00
|
|
|
/// Creates an Annoy index object from its description.
/// The first index argument is read as an int and passed on as the index parameter.
MergeTreeIndexPtr AnnoyIndexCreator(const IndexDescription & index)
{
    const auto & first_argument = index.arguments[0];
    int param = first_argument.get<int>();
    return std::make_shared<MergeTreeIndexAnnoy>(index, param);
}
|
|
|
|
|
2022-05-12 12:30:18 +00:00
|
|
|
void AnnoyIndexValidator(const IndexDescription & index, bool /* attach */)
|
|
|
|
{
|
2022-05-14 08:24:54 +00:00
|
|
|
if (index.arguments.size() != 1)
|
|
|
|
{
|
2022-05-12 12:30:18 +00:00
|
|
|
throw Exception("Annoy index must have exactly one argument.", ErrorCodes::INCORRECT_QUERY);
|
|
|
|
}
|
2022-05-14 08:24:54 +00:00
|
|
|
if (index.arguments[0].getType() != Field::Types::UInt64)
|
|
|
|
{
|
2022-05-12 12:30:18 +00:00
|
|
|
throw Exception("Annoy index argument must be UInt64.", ErrorCodes::INCORRECT_QUERY);
|
|
|
|
}
|
|
|
|
}
|
2022-04-01 18:47:48 +00:00
|
|
|
|
|
|
|
}
|
2022-05-13 10:37:04 +00:00
|
|
|
#endif // ENABLE_ANNOY
|