mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-21 15:12:02 +00:00
improvements
This commit is contained in:
parent
ad459bc6e4
commit
3b30cd9259
3
.gitmodules
vendored
3
.gitmodules
vendored
@ -261,7 +261,8 @@
|
||||
url = https://github.com/zlib-ng/minizip-ng
|
||||
[submodule "contrib/annoy"]
|
||||
path = contrib/annoy
|
||||
url = https://github.com/Vector-Similarity-Search-for-ClickHouse/annoy.git
|
||||
url = https://github.com/ClickHouse/annoy.git
|
||||
branch = ClickHouse-master
|
||||
[submodule "contrib/wyhash"]
|
||||
path = contrib/wyhash
|
||||
url = https://github.com/wangyi-fudan/wyhash.git
|
||||
|
8
contrib/CMakeLists.txt
vendored
8
contrib/CMakeLists.txt
vendored
@ -158,12 +158,8 @@ add_contrib (sqlite-cmake sqlite-amalgamation)
|
||||
add_contrib (s2geometry-cmake s2geometry)
|
||||
add_contrib (base-x-cmake base-x)
|
||||
|
||||
set (ENABLE_ANNOY_DEFAULT ${ENABLE_LIBRARIES})
|
||||
option(ENABLE_ANNOY "Enable Annoy index support" ${ENABLE_ANNOY_DEFAULT})
|
||||
if (ENABLE_ANNOY)
|
||||
add_contrib(annoy-cmake annoy)
|
||||
target_compile_definitions(_annoy PUBLIC ENABLE_ANNOY)
|
||||
endif()
|
||||
add_contrib(annoy-cmake annoy)
|
||||
|
||||
# Put all targets defined here and in subdirectories under "contrib/<immediate-subdir>" folders in GUI-based IDEs.
|
||||
# Some of third-party projects may override CMAKE_FOLDER or FOLDER property of their targets, so they would not appear
|
||||
# in "contrib/..." as originally planned, so we workaround this by fixing FOLDER properties of all targets manually,
|
||||
|
2
contrib/annoy
vendored
2
contrib/annoy
vendored
@ -1 +1 @@
|
||||
Subproject commit f2ae13120a2d2a6b35ee27ea7f275782541fdd75
|
||||
Subproject commit ebaa60e5c83d140901b9719471eb47800e5744ac
|
@ -1,3 +1,10 @@
|
||||
option(ENABLE_ANNOY "Enable Annoy index support" ${ENABLE_LIBRARIES})
|
||||
|
||||
if (NOT ENABLE_ANNOY)
|
||||
message (STATUS "Not using annoy")
|
||||
return()
|
||||
endif()
|
||||
|
||||
set(ANNOY_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/annoy")
|
||||
set(ANNOY_SOURCE_DIR "${ANNOY_PROJECT_DIR}/src")
|
||||
|
||||
@ -6,4 +13,4 @@ target_include_directories(_annoy SYSTEM PUBLIC ${ANNOY_SOURCE_DIR})
|
||||
set_target_properties(_annoy PROPERTIES LINKER_LANGUAGE CXX)
|
||||
|
||||
add_library(ch_contrib::annoy ALIAS _annoy)
|
||||
|
||||
target_compile_definitions(_annoy PUBLIC ENABLE_ANNOY)
|
||||
|
@ -1,19 +1,19 @@
|
||||
# Approximate Nearest Neighbor Search Indexes [experimental] {#table_engines-ANNIndex}
|
||||
|
||||
The main task that indexes help to solve is to find the nearest neighbors for multidimensional data. An example of such a problem could be similar pictures or texts, for which the problem is reduced to finding the nearest [embeddings](https://cloud.google.com/architecture/overview-extracting-and-serving-feature-embeddings-for-machine-learning). They can be created from data using [UDF](../../../sql-reference/functions/index.md#executable-user-defined-functions).
|
||||
The main task of these indexes is to quickly find the nearest neighbors for multidimensional data. An example of such a problem is finding similar pictures (texts) for a given picture (text). That problem can be reduced to finding the nearest [embeddings](https://cloud.google.com/architecture/overview-extracting-and-serving-feature-embeddings-for-machine-learning). Embeddings can be created from data using a [UDF](../../../sql-reference/functions/index.md#executable-user-defined-functions).
|
||||
|
||||
Next query can find closest neighbor in L2 space:
|
||||
The next query finds the closest neighbors in N-dimensional space using the L2 (Euclidean) distance:
|
||||
``` sql
|
||||
SELECT *
|
||||
FROM table_name
|
||||
WHERE L2Distance(Column, TargetEmbedding) < Value
|
||||
WHERE L2Distance(Column, Point) < MaxDistance
|
||||
LIMIT N
|
||||
```
|
||||
But it will take some time for execution because of the long calculation of the distance between `TargetEmbedding` and all other vectors. This is where indexes can help. As they store the overall data structure using some methods (clustering, building search trees, etc.), they can give only approximate results for finding the nearest neighbors.
|
||||
However, this query takes a long time to execute because the distance between `Point` and every other vector has to be calculated. This is where ANN indexes can help. They store a compact approximation of the search space (e.g. using clustering, search trees, etc.) and are able to compute approximate neighbors quickly.
|
||||
|
||||
## Indexes Structure
|
||||
|
||||
Approximate Nearest Neighbor Search Indexes (`ANNIndexes`) are simmilar to skip indexes. They are constructed by some granules and determine which of them should be skipped. Compared to skip indices, ANN indices use their results not only to skip some group of granules, but also to select particular granules from a set of granules.
|
||||
Approximate Nearest Neighbor Search Indexes (`ANNIndexes`) are similar to skip indexes. They are constructed by some granules and determine which of them should be skipped. Compared to skip indices, ANN indices use their results not only to skip some group of granules, but also to select particular granules from a set of granules.
|
||||
|
||||
`ANNIndexes` are designed to speed up two types of queries:
|
||||
|
||||
@ -21,21 +21,21 @@ Approximate Nearest Neighbor Search Indexes (`ANNIndexes`) are simmilar to skip
|
||||
``` sql
|
||||
SELECT *
|
||||
FROM table_name
|
||||
WHERE DistanceFunction(Column, TargetEmbedding) < Value
|
||||
WHERE DistanceFunction(Column, Point) < MaxDistance
|
||||
LIMIT N
|
||||
```
|
||||
- ###### Type 2: Order by
|
||||
``` sql
|
||||
SELECT *
|
||||
FROM table_name [WHERE ...]
|
||||
ORDER BY DistanceFunction(Column, TargetEmbedding)
|
||||
ORDER BY DistanceFunction(Column, Point)
|
||||
LIMIT N
|
||||
```
|
||||
|
||||
In these queries, `DistanceFunction` is selected from [tuples of distance functions](../../../sql-reference/functions/tuple-functions/#l1norm). `TargetEmbedding` is a known embedding (something like `(0.1, 0.1, ... )`). `Value` - a float value that will bound the neighbourhood.
|
||||
In these queries, `DistanceFunction` is selected from [distance functions](../../../sql-reference/functions/distance-functions). `Point` is a known vector (something like `(0.1, 0.1, ... )`). `MaxDistance` is a float value that bounds the neighbourhood.
|
||||
|
||||
!!! note "Note"
|
||||
ANNIndex can't speed up query that satisfies both types(`where + order by`, only one of them). All queries must have the limit, as algorithms are used to find nearest neighbors and need a specific number of them.
|
||||
An ANN index can't speed up a query that matches both types at once (`WHERE` + `ORDER BY`); only one of them may be used. All queries must have a `LIMIT` clause, because the algorithms search for a specific number of nearest neighbors.
|
||||
|
||||
Both types of queries are handled the same way. The indexes get `n` neighbors (where `n` is taken from the `LIMIT` clause) and work with them. In an `ORDER BY` query they remember the numbers of all parts of the granule that contain at least one neighbor. In a `WHERE` query they remember only those parts that satisfy the requirements.
|
||||
|
||||
@ -63,7 +63,7 @@ ENGINE = MergeTree
|
||||
ORDER BY id;
|
||||
```
|
||||
|
||||
Number of granules in granularity should be large. With greater `GRANULARITY` indexes remember the data structure better. The `GRANULARITY` indicates how many granules will be used to construct the index. The more data is provided for the index, the more of it can be handled by one index and the more chances that with the right hyperparameters the index will remember the data structure better. But some indexes can't be built if they don't have enough data, so this granule will always participate in the query. For more information, see the description of indexes.
|
||||
With greater `GRANULARITY` indexes remember the data structure better. The `GRANULARITY` indicates how many granules will be used to construct the index. The more data is provided for the index, the more of it can be handled by one index and the more chances that with the right hyperparameters the index will remember the data structure better. But some indexes can't be built if they don't have enough data, so this granule will always participate in the query. For more information, see the description of indexes.
|
||||
|
||||
As the indexes are built only during insertions into table, `INSERT` and `OPTIMIZE` queries are slower than for ordinary table. At this stage indexes remember all the information about the given data. ANNIndexes should be used if you have immutable or rarely changed data and many read requests.
|
||||
|
||||
@ -79,7 +79,7 @@ Short description of the algorithm:
|
||||
The algorithm recursively divides the whole space in half by random linear surfaces (lines in 2D, planes in 3D, etc.). Thus it builds a tree of polyhedrons and the points that they contain. By repeating the operation several times for greater accuracy, it creates a forest.
|
||||
To find the K nearest neighbours, it goes down through the trees and fills a buffer of the closest points using a priority queue of polyhedrons. Next, it sorts the buffer and returns the nearest K points.
|
||||
|
||||
__Example__:
|
||||
__Examples__:
|
||||
```sql
|
||||
CREATE TABLE t
|
||||
(
|
||||
@ -101,7 +101,20 @@ CREATE TABLE t
|
||||
ENGINE = MergeTree
|
||||
ORDER BY id;
|
||||
```
|
||||
Parameter `T` is the number of trees which algorithm will create. The bigger it is, the slower (approximately linear) it works (in both `CREATE` and `SELECT` requests), but the better accuracy you get (adjusted for randomness).
|
||||
!!! note "Note"
|
||||
Table with array field will work faster, but all arrays **must** have same length.
|
||||
|
||||
In the `SELECT` in the settings (`ann_index_params`) you can specify the size of the internal buffer (more details in the description above or in the [original repository](https://github.com/spotify/annoy)).
|
||||
This parameter may help you to adjust the trade-off between query speed and accuracy.
|
||||
Parameter `T` is the number of trees which algorithm will create. The bigger it is, the slower (approximately linear) it works (in both `CREATE` and `SELECT` requests), but the better accuracy you get (adjusted for randomness).
|
||||
|
||||
Annoy supports only `L2Distance`.
|
||||
|
||||
In a `SELECT` query, the setting `ann_index_select_query_params` lets you specify the size of the internal buffer via the `k_search` parameter (called `search_k` in the [original repository](https://github.com/spotify/annoy); see also the description above). During the query, Annoy inspects up to `k_search` nodes; if the parameter is not provided, it defaults to `n_trees * n`. `k_search` gives you a run-time trade-off between accuracy and speed.
|
||||
|
||||
__Example__:
|
||||
``` sql
|
||||
SELECT *
|
||||
FROM table_name [WHERE ...]
|
||||
ORDER BY L2Distance(Column, Point)
|
||||
LIMIT N
|
||||
SETTINGS ann_index_select_query_params='k_search=100'
|
||||
```
|
@ -554,7 +554,7 @@ endif()
|
||||
|
||||
dbms_target_link_libraries(PUBLIC ch_contrib::consistent_hashing)
|
||||
|
||||
if (TARGET ch_contrib::annoy AND ENABLE_ANNOY)
|
||||
if (TARGET ch_contrib::annoy)
|
||||
dbms_target_link_libraries(PUBLIC ch_contrib::annoy)
|
||||
endif()
|
||||
|
||||
|
@ -603,7 +603,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
|
||||
M(Bool, allow_experimental_hash_functions, false, "Enable experimental hash functions (hashid, etc)", 0) \
|
||||
M(Bool, allow_experimental_object_type, false, "Allow Object and JSON data types", 0) \
|
||||
M(String, insert_deduplication_token, "", "If not empty, used for duplicate detection instead of data digest", 0) \
|
||||
M(String, ann_index_params, "", "Parameters for ANNIndexes in select queries. String of parameters like `param1=x,param2=y...`. See ANNIndexes documentation for each index", 0) \
|
||||
M(String, ann_index_select_query_params, "", "Parameters passed to ANN indexes in SELECT queries, the format is 'param1=x, param2=y, ...'", 0) \
|
||||
M(Bool, count_distinct_optimization, false, "Rewrite count distinct to subquery of group by", 0) \
|
||||
M(Bool, throw_on_unsupported_query_inside_transaction, true, "Throw exception if unsupported query is used inside transaction", 0) \
|
||||
M(TransactionsWaitCSNMode, wait_changes_become_visible_after_commit_mode, TransactionsWaitCSNMode::WAIT_UNKNOWN, "Wait for committed changes to become actually visible in the latest snapshot", 0) \
|
||||
|
@ -24,6 +24,8 @@ namespace ErrorCodes
|
||||
namespace
|
||||
{
|
||||
|
||||
namespace ANN = ApproximateNearestNeighbour;
|
||||
|
||||
template <typename Literal>
|
||||
void extractTargetVectorFromLiteral(ANN::ANNQueryInformation::Embedding & target, Literal literal)
|
||||
{
|
||||
@ -47,6 +49,15 @@ void extractTargetVectorFromLiteral(ANN::ANNQueryInformation::Embedding & target
|
||||
}
|
||||
}
|
||||
|
||||
ANN::ANNQueryInformation::Metric castMetricFromStringToType(String metric_name)
|
||||
{
|
||||
if (metric_name == "L2Distance")
|
||||
return ANN::ANNQueryInformation::Metric::L2;
|
||||
if (metric_name == "LpDistance")
|
||||
return ANN::ANNQueryInformation::Metric::Lp;
|
||||
return ANN::ANNQueryInformation::Metric::Unknown;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
namespace ApproximateNearestNeighbour
|
||||
@ -55,7 +66,7 @@ namespace ApproximateNearestNeighbour
|
||||
ANNCondition::ANNCondition(const SelectQueryInfo & query_info,
|
||||
ContextPtr context) :
|
||||
block_with_constants{KeyCondition::getBlockWithConstants(query_info.query, query_info.syntax_analyzer_result, context)},
|
||||
ann_index_params{context->getSettings().get("ann_index_params").get<String>()},
|
||||
ann_index_select_query_params{context->getSettings().get("ann_index_select_query_params").get<String>()},
|
||||
index_granularity{context->getMergeTreeSettings().get("index_granularity").get<UInt64>()},
|
||||
index_is_useful{checkQueryStructure(query_info)} {}
|
||||
|
||||
@ -66,20 +77,20 @@ bool ANNCondition::alwaysUnknownOrTrue(String metric_name) const
|
||||
return true; // Query isn't supported
|
||||
}
|
||||
// If query is supported, check metrics for match
|
||||
return !(metric_name == query_information->metric_name);
|
||||
return !(castMetricFromStringToType(metric_name) == query_information->metric);
|
||||
}
|
||||
|
||||
float ANNCondition::getComparisonDistanceForWhereQuery() const
|
||||
{
|
||||
if (index_is_useful && query_information.has_value()
|
||||
&& query_information->query_type == ANNQueryInformation::Type::WhereQuery)
|
||||
&& query_information->query_type == ANNQueryInformation::Type::Where)
|
||||
{
|
||||
return query_information->distance;
|
||||
}
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Not supported method for this query type");
|
||||
}
|
||||
|
||||
UInt64 ANNCondition::getLimitCount() const
|
||||
UInt64 ANNCondition::getLimit() const
|
||||
{
|
||||
if (index_is_useful && query_information.has_value())
|
||||
{
|
||||
@ -115,11 +126,11 @@ String ANNCondition::getColumnName() const
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Column name was requested for useless or uninitialized index.");
|
||||
}
|
||||
|
||||
String ANNCondition::getMetricName() const
|
||||
ANNQueryInformation::Metric ANNCondition::getMetricType() const
|
||||
{
|
||||
if (index_is_useful && query_information.has_value())
|
||||
{
|
||||
return query_information->metric_name;
|
||||
return query_information->metric;
|
||||
}
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Metric name was requested for useless or uninitialized index.");
|
||||
}
|
||||
@ -133,20 +144,11 @@ float ANNCondition::getPValueForLpDistance() const
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "P from LPDistance was requested for useless or uninitialized index.");
|
||||
}
|
||||
|
||||
bool ANNCondition::queryHasOrderByClause() const
|
||||
ANNQueryInformation::Type ANNCondition::getQueryType() const
|
||||
{
|
||||
if (index_is_useful && query_information.has_value())
|
||||
{
|
||||
return query_information->query_type == ANNQueryInformation::Type::OrderByQuery;
|
||||
}
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Query type was requested for useless or uninitialized index.");
|
||||
}
|
||||
|
||||
bool ANNCondition::queryHasWhereClause() const
|
||||
{
|
||||
if (index_is_useful && query_information.has_value())
|
||||
{
|
||||
return query_information->query_type == ANNQueryInformation::Type::WhereQuery;
|
||||
return query_information->query_type;
|
||||
}
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Query type was requested for useless or uninitialized index.");
|
||||
}
|
||||
@ -465,10 +467,10 @@ bool ANNCondition::matchMainParts(RPN::iterator & iter, const RPN::iterator & en
|
||||
return false;
|
||||
}
|
||||
|
||||
expr.metric_name = iter->func_name;
|
||||
expr.metric = castMetricFromStringToType(iter->func_name);
|
||||
++iter;
|
||||
|
||||
if (expr.metric_name == "LpDistance")
|
||||
if (expr.metric == ANN::ANNQueryInformation::Metric::Lp)
|
||||
{
|
||||
if (iter->function != RPNElement::FUNCTION_FLOAT_LITERAL &&
|
||||
iter->function != RPNElement::FUNCTION_INT_LITERAL)
|
||||
|
@ -31,15 +31,19 @@ struct ANNQueryInformation
|
||||
|
||||
// Extracted data from valid query
|
||||
Embedding target;
|
||||
String metric_name;
|
||||
enum class Metric
|
||||
{
|
||||
Unknown,
|
||||
L2,
|
||||
Lp
|
||||
} metric;
|
||||
String column_name;
|
||||
UInt64 limit;
|
||||
|
||||
enum class Type
|
||||
{
|
||||
Undefined,
|
||||
OrderByQuery,
|
||||
WhereQuery
|
||||
OrderBy,
|
||||
Where
|
||||
} query_type;
|
||||
|
||||
float p_for_lp_dist = -1.0;
|
||||
@ -78,7 +82,7 @@ struct ANNQueryInformation
|
||||
* spaceDimension(which is targetVector's components count)
|
||||
* column
|
||||
* objects count from LIMIT clause(for both queries)
|
||||
* settings str, if query has settings section with new 'ann_index_params' value,
|
||||
* settings str, if query has settings section with new 'ann_index_select_query_params' value,
|
||||
then you can get the new value (empty by default) by calling the method getParamsStr
|
||||
getQueryType returns which type of query was matched (OrderBy or Where)
|
||||
|
||||
@ -105,22 +109,20 @@ public:
|
||||
|
||||
String getColumnName() const;
|
||||
|
||||
String getMetricName() const;
|
||||
ANNQueryInformation::Metric getMetricType() const;
|
||||
|
||||
// the P- value if the metric is 'LpDistance'
|
||||
float getPValueForLpDistance() const;
|
||||
|
||||
bool queryHasOrderByClause() const;
|
||||
|
||||
bool queryHasWhereClause() const;
|
||||
ANNQueryInformation::Type getQueryType() const;
|
||||
|
||||
UInt64 getIndexGranularity() const { return index_granularity; }
|
||||
|
||||
// length's value from LIMIT clause, nullopt if not any
|
||||
UInt64 getLimitCount() const;
|
||||
// length's value from LIMIT clause
|
||||
UInt64 getLimit() const;
|
||||
|
||||
// value of 'ann_index_params' if have in SETTINGS clause, empty string otherwise
|
||||
String getParamsStr() const { return ann_index_params; }
|
||||
// value of 'ann_index_select_query_params' if have in SETTINGS clause, empty string otherwise
|
||||
String getParamsStr() const { return ann_index_select_query_params; }
|
||||
|
||||
private:
|
||||
|
||||
@ -188,10 +190,6 @@ private:
|
||||
// Traverses the AST of ORDERBY section
|
||||
void traverseOrderByAST(const ASTPtr & node, RPN & rpn);
|
||||
|
||||
// Checks that at least one rpn is matching for index
|
||||
// New RPNs for other query types can be added here
|
||||
bool matchAllRPNS();
|
||||
|
||||
// Returns true and stores ANNExpr if the query has valid WHERE section
|
||||
static bool matchRPNWhere(RPN & rpn, ANNQueryInformation & expr);
|
||||
|
||||
@ -213,7 +211,7 @@ private:
|
||||
std::optional<ANNQueryInformation> query_information;
|
||||
|
||||
// Get from settings ANNIndex parameters
|
||||
String ann_index_params;
|
||||
String ann_index_select_query_params;
|
||||
UInt64 index_granularity;
|
||||
bool index_is_useful = false;
|
||||
};
|
||||
@ -227,6 +225,4 @@ public:
|
||||
|
||||
}
|
||||
|
||||
namespace ANN = ApproximateNearestNeighbour;
|
||||
|
||||
}
|
||||
|
@ -1641,7 +1641,7 @@ MarkRanges MergeTreeDataSelectExecutor::filterMarksUsingIndex(
|
||||
if (index_mark != index_range.begin || !granule || last_index_mark != index_range.begin)
|
||||
granule = reader.read();
|
||||
// Cast to Ann condition
|
||||
auto ann_condition = std::dynamic_pointer_cast<ANN::IMergeTreeIndexConditionAnn>(condition);
|
||||
auto ann_condition = std::dynamic_pointer_cast<ApproximateNearestNeighbour::IMergeTreeIndexConditionAnn>(condition);
|
||||
if (ann_condition != nullptr)
|
||||
{
|
||||
// vector of indexes of useful ranges
|
||||
|
@ -18,7 +18,7 @@ namespace ApproximateNearestNeighbour
|
||||
{
|
||||
|
||||
template<typename Dist>
|
||||
void AnnoyIndexSerialize<Dist>::serialize(WriteBuffer& ostr) const
|
||||
void AnnoyIndex<Dist>::serialize(WriteBuffer& ostr) const
|
||||
{
|
||||
assert(Base::_built);
|
||||
writeIntBinary(Base::_s, ostr);
|
||||
@ -32,8 +32,9 @@ void AnnoyIndexSerialize<Dist>::serialize(WriteBuffer& ostr) const
|
||||
}
|
||||
|
||||
template<typename Dist>
|
||||
void AnnoyIndexSerialize<Dist>::deserialize(ReadBuffer& istr)
|
||||
void AnnoyIndex<Dist>::deserialize(ReadBuffer& istr)
|
||||
{
|
||||
assert(!Base::_built);
|
||||
readIntBinary(Base::_s, istr);
|
||||
readIntBinary(Base::_n_items, istr);
|
||||
readIntBinary(Base::_n_nodes, istr);
|
||||
@ -53,7 +54,7 @@ void AnnoyIndexSerialize<Dist>::deserialize(ReadBuffer& istr)
|
||||
}
|
||||
|
||||
template<typename Dist>
|
||||
uint64_t AnnoyIndexSerialize<Dist>::getNumOfDimensions() const
|
||||
uint64_t AnnoyIndex<Dist>::getNumOfDimensions() const
|
||||
{
|
||||
return Base::get_f();
|
||||
}
|
||||
@ -71,7 +72,7 @@ namespace ErrorCodes
|
||||
MergeTreeIndexGranuleAnnoy::MergeTreeIndexGranuleAnnoy(const String & index_name_, const Block & index_sample_block_)
|
||||
: index_name(index_name_)
|
||||
, index_sample_block(index_sample_block_)
|
||||
, index_base(nullptr)
|
||||
, index(nullptr)
|
||||
{}
|
||||
|
||||
MergeTreeIndexGranuleAnnoy::MergeTreeIndexGranuleAnnoy(
|
||||
@ -80,48 +81,40 @@ MergeTreeIndexGranuleAnnoy::MergeTreeIndexGranuleAnnoy(
|
||||
AnnoyIndexPtr index_base_)
|
||||
: index_name(index_name_)
|
||||
, index_sample_block(index_sample_block_)
|
||||
, index_base(std::move(index_base_))
|
||||
, index(std::move(index_base_))
|
||||
{}
|
||||
|
||||
bool MergeTreeIndexGranuleAnnoy::empty() const
|
||||
{
|
||||
return !static_cast<bool>(index_base);
|
||||
}
|
||||
|
||||
void MergeTreeIndexGranuleAnnoy::serializeBinary(WriteBuffer & ostr) const
|
||||
{
|
||||
writeIntBinary(index_base->getNumOfDimensions(), ostr); // write dimension
|
||||
index_base->serialize(ostr);
|
||||
/// number of dimensions is required in the constructor,
|
||||
/// so it must be written and read separately from the other part
|
||||
writeIntBinary(index->getNumOfDimensions(), ostr); // write dimension
|
||||
index->serialize(ostr);
|
||||
}
|
||||
|
||||
void MergeTreeIndexGranuleAnnoy::deserializeBinary(ReadBuffer & istr, MergeTreeIndexVersion /*version*/)
|
||||
{
|
||||
uint64_t dimension;
|
||||
readIntBinary(dimension, istr);
|
||||
index_base = std::make_shared<AnnoyIndex>(dimension);
|
||||
index_base->deserialize(istr);
|
||||
index = std::make_shared<AnnoyIndex>(dimension);
|
||||
index->deserialize(istr);
|
||||
}
|
||||
|
||||
|
||||
MergeTreeIndexAggregatorAnnoy::MergeTreeIndexAggregatorAnnoy(
|
||||
const String & index_name_,
|
||||
const Block & index_sample_block_,
|
||||
uint64_t index_param_)
|
||||
uint64_t number_of_trees_)
|
||||
: index_name(index_name_)
|
||||
, index_sample_block(index_sample_block_)
|
||||
, index_param(index_param_)
|
||||
, number_of_trees(number_of_trees_)
|
||||
{}
|
||||
|
||||
bool MergeTreeIndexAggregatorAnnoy::empty() const
|
||||
{
|
||||
return !index_base || index_base->get_n_items() == 0;
|
||||
}
|
||||
|
||||
MergeTreeIndexGranulePtr MergeTreeIndexAggregatorAnnoy::getGranuleAndReset()
|
||||
{
|
||||
index_base->build(index_param);
|
||||
auto granule = std::make_shared<MergeTreeIndexGranuleAnnoy>(index_name, index_sample_block, index_base);
|
||||
index_base = nullptr;
|
||||
index->build(number_of_trees);
|
||||
auto granule = std::make_shared<MergeTreeIndexGranuleAnnoy>(index_name, index_sample_block, index);
|
||||
index = nullptr;
|
||||
return granule;
|
||||
}
|
||||
|
||||
@ -159,11 +152,11 @@ void MergeTreeIndexAggregatorAnnoy::update(const Block & block, size_t * pos, si
|
||||
throw Exception(ErrorCodes::INCORRECT_DATA, "Arrays should have same length");
|
||||
}
|
||||
}
|
||||
index_base = std::make_shared<AnnoyIndex>(size);
|
||||
index = std::make_shared<AnnoyIndex>(size);
|
||||
|
||||
for (size_t current_row = 0; current_row < num_rows; ++current_row)
|
||||
{
|
||||
index_base->add_item(index_base->get_n_items(), &array[offsets[current_row]]);
|
||||
index->add_item(index->get_n_items(), &array[offsets[current_row]]);
|
||||
}
|
||||
}
|
||||
else
|
||||
@ -186,13 +179,13 @@ void MergeTreeIndexAggregatorAnnoy::update(const Block & block, size_t * pos, si
|
||||
}
|
||||
}
|
||||
assert(!data.empty());
|
||||
if (!index_base)
|
||||
if (!index)
|
||||
{
|
||||
index_base = std::make_shared<AnnoyIndex>(data[0].size());
|
||||
index = std::make_shared<AnnoyIndex>(data[0].size());
|
||||
}
|
||||
for (const auto& item : data)
|
||||
{
|
||||
index_base->add_item(index_base->get_n_items(), item.data());
|
||||
index->add_item(index->get_n_items(), item.data());
|
||||
}
|
||||
}
|
||||
|
||||
@ -220,10 +213,10 @@ bool MergeTreeIndexConditionAnnoy::alwaysUnknownOrTrue() const
|
||||
|
||||
std::vector<size_t> MergeTreeIndexConditionAnnoy::getUsefulRanges(MergeTreeIndexGranulePtr idx_granule) const
|
||||
{
|
||||
UInt64 limit = condition.getLimitCount();
|
||||
UInt64 limit = condition.getLimit();
|
||||
UInt64 index_granularity = condition.getIndexGranularity();
|
||||
std::optional<float> comp_dist
|
||||
= condition.queryHasWhereClause() ? std::optional<float>(condition.getComparisonDistanceForWhereQuery()) : std::nullopt;
|
||||
std::optional<float> comp_dist = condition.getQueryType() == ANN::ANNQueryInformation::Type::Where ?
|
||||
std::optional<float>(condition.getComparisonDistanceForWhereQuery()) : std::nullopt;
|
||||
|
||||
if (comp_dist && comp_dist.value() < 0)
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to optimize query with where without distance");
|
||||
@ -235,7 +228,7 @@ std::vector<size_t> MergeTreeIndexConditionAnnoy::getUsefulRanges(MergeTreeIndex
|
||||
{
|
||||
throw Exception("Granule has the wrong type", ErrorCodes::LOGICAL_ERROR);
|
||||
}
|
||||
auto annoy = granule->index_base;
|
||||
auto annoy = granule->index;
|
||||
|
||||
if (condition.getNumOfDimensions() != annoy->getNumOfDimensions())
|
||||
{
|
||||
@ -243,40 +236,42 @@ std::vector<size_t> MergeTreeIndexConditionAnnoy::getUsefulRanges(MergeTreeIndex
|
||||
+ "does not match with the dimension in the index (" + toString(annoy->getNumOfDimensions()) + ")", ErrorCodes::INCORRECT_QUERY);
|
||||
}
|
||||
|
||||
std::vector<int32_t> items;
|
||||
std::vector<float> dist;
|
||||
items.reserve(limit);
|
||||
dist.reserve(limit);
|
||||
/// neighbors contain indexes of dots which were closest to target vector
|
||||
std::vector<UInt64> neighbors;
|
||||
std::vector<Float32> distances;
|
||||
neighbors.reserve(limit);
|
||||
distances.reserve(limit);
|
||||
|
||||
int k_search = -1;
|
||||
auto params_str = condition.getParamsStr();
|
||||
String params_str = condition.getParamsStr();
|
||||
if (!params_str.empty())
|
||||
{
|
||||
try
|
||||
{
|
||||
k_search = std::stoi(params_str);
|
||||
/// k_search=... (algorithm will inspect up to search_k nodes which defaults to n_trees * n if not provided)
|
||||
k_search = std::stoi(params_str.data() + 9);
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
throw Exception("Setting of the annoy index should be int", ErrorCodes::INCORRECT_QUERY);
|
||||
}
|
||||
}
|
||||
annoy->get_nns_by_vector(target_vec.data(), 1, k_search, &items, &dist);
|
||||
std::unordered_set<size_t> result;
|
||||
for (size_t i = 0; i < items.size(); ++i)
|
||||
annoy->get_nns_by_vector(target_vec.data(), limit, k_search, &neighbors, &distances);
|
||||
std::unordered_set<size_t> granule_numbers;
|
||||
for (size_t i = 0; i < neighbors.size(); ++i)
|
||||
{
|
||||
if (comp_dist && dist[i] > comp_dist)
|
||||
if (comp_dist && distances[i] > comp_dist)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
result.insert(items[i] / index_granularity);
|
||||
granule_numbers.insert(neighbors[i] / index_granularity);
|
||||
}
|
||||
|
||||
std::vector<size_t> result_vector;
|
||||
result_vector.reserve(result.size());
|
||||
for (auto range : result)
|
||||
result_vector.reserve(granule_numbers.size());
|
||||
for (auto granule_number : granule_numbers)
|
||||
{
|
||||
result_vector.push_back(range);
|
||||
result_vector.push_back(granule_number);
|
||||
}
|
||||
|
||||
return result_vector;
|
||||
@ -290,7 +285,7 @@ MergeTreeIndexGranulePtr MergeTreeIndexAnnoy::createIndexGranule() const
|
||||
|
||||
MergeTreeIndexAggregatorPtr MergeTreeIndexAnnoy::createIndexAggregator() const
|
||||
{
|
||||
return std::make_shared<MergeTreeIndexAggregatorAnnoy>(index.name, index.sample_block, index_param);
|
||||
return std::make_shared<MergeTreeIndexAggregatorAnnoy>(index.name, index.sample_block, number_of_trees);
|
||||
}
|
||||
|
||||
MergeTreeIndexConditionPtr MergeTreeIndexAnnoy::createIndexCondition(
|
||||
@ -299,13 +294,13 @@ MergeTreeIndexConditionPtr MergeTreeIndexAnnoy::createIndexCondition(
|
||||
return std::make_shared<MergeTreeIndexConditionAnnoy>(index, query, context);
|
||||
};
|
||||
|
||||
MergeTreeIndexPtr AnnoyIndexCreator(const IndexDescription & index)
|
||||
MergeTreeIndexPtr annoyIndexCreator(const IndexDescription & index)
|
||||
{
|
||||
uint64_t param = index.arguments[0].get<uint64_t>();
|
||||
return std::make_shared<MergeTreeIndexAnnoy>(index, param);
|
||||
}
|
||||
|
||||
void AnnoyIndexValidator(const IndexDescription & index, bool /* attach */)
|
||||
void annoyIndexValidator(const IndexDescription & index, bool /* attach */)
|
||||
{
|
||||
if (index.arguments.size() != 1)
|
||||
{
|
||||
|
@ -10,6 +10,8 @@
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace ANN = ApproximateNearestNeighbour;
|
||||
|
||||
// auxiliary namespace for working with spotify-annoy library
|
||||
// mainly for serialization and deserialization of the index
|
||||
namespace ApproximateNearestNeighbour
|
||||
@ -17,13 +19,12 @@ namespace ApproximateNearestNeighbour
|
||||
using AnnoyIndexThreadedBuildPolicy = ::Annoy::AnnoyIndexSingleThreadedBuildPolicy;
|
||||
// TODO: Support different metrics. List of available metrics can be taken from here:
|
||||
// https://github.com/spotify/annoy/blob/master/src/annoymodule.cc#L151-L171
|
||||
template <typename Dist = ::Annoy::Euclidean>
|
||||
class AnnoyIndexSerialize : public ::Annoy::AnnoyIndex<Int32, Float32, Dist, ::Annoy::Kiss64Random, AnnoyIndexThreadedBuildPolicy>
|
||||
template <typename Distance = ::Annoy::Euclidean>
|
||||
class AnnoyIndex : public ::Annoy::AnnoyIndex<UInt64, Float32, Distance, ::Annoy::Kiss64Random, AnnoyIndexThreadedBuildPolicy>
|
||||
{
|
||||
using Base = ::Annoy::AnnoyIndex<Int32, Float32, Dist, ::Annoy::Kiss64Random, AnnoyIndexThreadedBuildPolicy>;
|
||||
using Base = ::Annoy::AnnoyIndex<UInt64, Float32, Distance, ::Annoy::Kiss64Random, AnnoyIndexThreadedBuildPolicy>;
|
||||
public:
|
||||
AnnoyIndexSerialize() = delete;
|
||||
explicit AnnoyIndexSerialize(const uint64_t dim) : Base::AnnoyIndex(dim) {}
|
||||
explicit AnnoyIndex(const uint64_t dim) : Base::AnnoyIndex(dim) {}
|
||||
void serialize(WriteBuffer& ostr) const;
|
||||
void deserialize(ReadBuffer& istr);
|
||||
uint64_t getNumOfDimensions() const;
|
||||
@ -32,7 +33,7 @@ namespace ApproximateNearestNeighbour
|
||||
|
||||
struct MergeTreeIndexGranuleAnnoy final : public IMergeTreeIndexGranule
|
||||
{
|
||||
using AnnoyIndex = ANN::AnnoyIndexSerialize<>;
|
||||
using AnnoyIndex = ANN::AnnoyIndex<>;
|
||||
using AnnoyIndexPtr = std::shared_ptr<AnnoyIndex>;
|
||||
|
||||
MergeTreeIndexGranuleAnnoy(const String & index_name_, const Block & index_sample_block_);
|
||||
@ -46,30 +47,30 @@ struct MergeTreeIndexGranuleAnnoy final : public IMergeTreeIndexGranule
|
||||
void serializeBinary(WriteBuffer & ostr) const override;
|
||||
void deserializeBinary(ReadBuffer & istr, MergeTreeIndexVersion version) override;
|
||||
|
||||
bool empty() const override;
|
||||
bool empty() const override { return !index.get(); }
|
||||
|
||||
String index_name;
|
||||
Block index_sample_block;
|
||||
AnnoyIndexPtr index_base;
|
||||
AnnoyIndexPtr index;
|
||||
};
|
||||
|
||||
|
||||
struct MergeTreeIndexAggregatorAnnoy final : IMergeTreeIndexAggregator
|
||||
{
|
||||
using AnnoyIndex = ANN::AnnoyIndexSerialize<>;
|
||||
using AnnoyIndex = ANN::AnnoyIndex<>;
|
||||
using AnnoyIndexPtr = std::shared_ptr<AnnoyIndex>;
|
||||
|
||||
MergeTreeIndexAggregatorAnnoy(const String & index_name_, const Block & index_sample_block, uint64_t index_param);
|
||||
MergeTreeIndexAggregatorAnnoy(const String & index_name_, const Block & index_sample_block, uint64_t number_of_trees);
|
||||
~MergeTreeIndexAggregatorAnnoy() override = default;
|
||||
|
||||
bool empty() const override;
|
||||
bool empty() const override { return !index || index->get_n_items() == 0; }
|
||||
MergeTreeIndexGranulePtr getGranuleAndReset() override;
|
||||
void update(const Block & block, size_t * pos, size_t limit) override;
|
||||
|
||||
String index_name;
|
||||
Block index_sample_block;
|
||||
const uint64_t index_param;
|
||||
AnnoyIndexPtr index_base;
|
||||
const uint64_t number_of_trees;
|
||||
AnnoyIndexPtr index;
|
||||
};
|
||||
|
||||
|
||||
@ -97,9 +98,9 @@ private:
|
||||
class MergeTreeIndexAnnoy : public IMergeTreeIndex
|
||||
{
|
||||
public:
|
||||
MergeTreeIndexAnnoy(const IndexDescription & index_, uint64_t index_param_)
|
||||
MergeTreeIndexAnnoy(const IndexDescription & index_, uint64_t number_of_trees_)
|
||||
: IMergeTreeIndex(index_)
|
||||
, index_param(index_param_)
|
||||
, number_of_trees(number_of_trees_)
|
||||
{}
|
||||
|
||||
~MergeTreeIndexAnnoy() override = default;
|
||||
@ -113,7 +114,7 @@ public:
|
||||
bool mayBenefitFromIndexForIn(const ASTPtr & /*node*/) const override { return false; }
|
||||
|
||||
private:
|
||||
const uint64_t index_param;
|
||||
const uint64_t number_of_trees;
|
||||
};
|
||||
|
||||
|
||||
|
@ -103,8 +103,8 @@ MergeTreeIndexFactory::MergeTreeIndexFactory()
|
||||
registerValidator("hypothesis", hypothesisIndexValidator);
|
||||
|
||||
#ifdef ENABLE_ANNOY
|
||||
registerCreator("annoy", AnnoyIndexCreator);
|
||||
registerValidator("annoy", AnnoyIndexValidator);
|
||||
registerCreator("annoy", annoyIndexCreator);
|
||||
registerValidator("annoy", annoyIndexValidator);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -225,8 +225,8 @@ MergeTreeIndexPtr hypothesisIndexCreator(const IndexDescription & index);
|
||||
void hypothesisIndexValidator(const IndexDescription & index, bool attach);
|
||||
|
||||
#ifdef ENABLE_ANNOY
|
||||
MergeTreeIndexPtr AnnoyIndexCreator(const IndexDescription & index);
|
||||
void AnnoyIndexValidator(const IndexDescription & index, bool attach);
|
||||
MergeTreeIndexPtr annoyIndexCreator(const IndexDescription & index);
|
||||
void annoyIndexValidator(const IndexDescription & index, bool attach);
|
||||
#endif
|
||||
|
||||
}
|
||||
|
@ -3,3 +3,6 @@
|
||||
3 [0,0,9.5]
|
||||
4 [0,0,9.7]
|
||||
5 [0,0,10.2]
|
||||
1 [0,0,10]
|
||||
5 [0,0,10.2]
|
||||
4 [0,0,9.7]
|
@ -33,4 +33,16 @@ FROM 02354_annoy
|
||||
WHERE L2Distance(embedding, [0.0, 0.0, 10.0]) < 1.0
|
||||
LIMIT 5;
|
||||
|
||||
SELECT *
|
||||
FROM 02354_annoy
|
||||
ORDER BY L2Distance(embedding, [0.0, 0.0, 10.0])
|
||||
LIMIT 3;
|
||||
|
||||
INSERT INTO 02354_annoy VALUES (1, 1); -- { serverError 53 }
|
||||
|
||||
SELECT *
|
||||
FROM 02354_annoy
|
||||
ORDER BY L2Distance(embedding, [0.0, 0.0])
|
||||
LIMIT 3; -- { serverError 80 }
|
||||
|
||||
DROP TABLE IF EXISTS 02354_annoy;
|
||||
|
@ -1,3 +1,4 @@
|
||||
return()
|
||||
add_executable (pre_compressor compressor.cpp)
|
||||
target_link_libraries(pre_compressor PUBLIC ch_contrib::zstd)
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user