mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-21 15:12:02 +00:00
Dictionaries/TrieDictionary: IP prefix dictionary
This commit implements a basic IP prefix dictionary that allows storing IPv4/IPv6 prefixes and matching them against a single IP address at query time. This makes it possible to do IP -> ASN matching and other similar lookups at query time. The implementation is basic to start with: it uses a simple bitwise trie and reuses the interface for complex key dictionaries (so the key is passed as a tuple instead of UInt32/FixedString(16)). A faster bitwise trie implementation (like poptrie) is desirable to improve lookup performance and memory consumption with large prefix tables.
This commit is contained in:
parent
e7b7f6f73d
commit
9520234365
1
contrib/CMakeLists.txt
vendored
1
contrib/CMakeLists.txt
vendored
@ -31,6 +31,7 @@ endif ()
|
||||
add_subdirectory (libcityhash)
|
||||
add_subdirectory (libfarmhash)
|
||||
add_subdirectory (libmetrohash)
|
||||
add_subdirectory (libbtrie)
|
||||
|
||||
if (USE_INTERNAL_ZLIB_LIBRARY)
|
||||
add_subdirectory (libzlib-ng)
|
||||
|
6
contrib/libbtrie/CMakeLists.txt
Normal file
6
contrib/libbtrie/CMakeLists.txt
Normal file
@ -0,0 +1,6 @@
|
||||
# Bitwise IP prefix trie, built as a small standalone library.
include_directories (BEFORE include)

add_library (btrie src/btrie.c include/btrie.h)
|
23
contrib/libbtrie/LICENSE
Normal file
23
contrib/libbtrie/LICENSE
Normal file
@ -0,0 +1,23 @@
|
||||
Copyright (c) 2013, CobbLiu
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification,
|
||||
are permitted provided that the following conditions are met:
|
||||
|
||||
Redistributions of source code must retain the above copyright notice, this
|
||||
list of conditions and the following disclaimer.
|
||||
|
||||
Redistributions in binary form must reproduce the above copyright notice, this
|
||||
list of conditions and the following disclaimer in the documentation and/or
|
||||
other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
|
||||
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
||||
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
155
contrib/libbtrie/include/btrie.h
Normal file
155
contrib/libbtrie/include/btrie.h
Normal file
@ -0,0 +1,155 @@
|
||||
#pragma once

#include <stdlib.h>
#include <stdint.h>

#if defined (__cplusplus)
extern "C" {
#endif

/**
 * In btrie, each node stands for one bit of an IP address.
 * The left child means 0, and the right child means 1.
 */

/*
 * BTRIE_NULL marks a node that carries no value; the find functions also
 * return it when no stored prefix matches.
 * MAX_PAGES is the capacity of the page table inside btrie_t.
 * Both are parenthesized so that they expand safely inside larger expressions.
 */
#define BTRIE_NULL   ((uintptr_t) -1)
#define MAX_PAGES    (1024 * 16)

typedef struct btrie_node_s btrie_node_t;

struct btrie_node_s {
    btrie_node_t  *right;   /* child for bit 1; also links the free list */
    btrie_node_t  *left;    /* child for bit 0 */
    btrie_node_t  *parent;  /* NULL for the root */
    uintptr_t      value;   /* BTRIE_NULL when no prefix ends here */
};


typedef struct btrie_s {
    btrie_node_t  *root;

    btrie_node_t  *free;    /* free list of btrie */
    char          *start;   /* next unused byte of the current page */
    size_t         size;    /* bytes left in the current page */

    /*
     * memory pool.
     * memory management(esp free) will be so easy by using this facility.
     */
    char          *pools[MAX_PAGES];
    size_t         len;     /* number of pages recorded in pools */
} btrie_t;


/**
 * Create an empty btrie
 *
 * @Return:
 *  The newly created tree.
 *  NULL if creation failed.
 */
btrie_t *btrie_create(void);

/**
 * Destroy the tree and release all of its pages.
 *
 * @Return:
 *  0 (the current implementation cannot fail).
 */
int btrie_destroy(btrie_t *tree);

/**
 * Count the nodes in the tree (the root node is included).
 */
size_t btrie_count(btrie_t *tree);

/**
 * Return the number of bytes held in node pages.
 */
size_t btrie_allocated(btrie_t *tree);


/**
 * Add an ipv4 prefix into btrie
 *
 * @Args:
 *  key: ip address
 *  mask: key's mask; bits are consumed from the most significant one down
 *        to the first zero bit
 *  value: value of this prefix; must not be BTRIE_NULL, which is reserved
 *         for "no value"
 *
 * @Return:
 *  0 for success.
 *  -1 for failure (the prefix already has a value, or out of memory).
 */
int btrie_insert(btrie_t *tree, uint32_t key, uint32_t mask,
    uintptr_t value);


/**
 * Delete an ipv4 prefix from btrie
 *
 * @Args:
 *  key: ip address
 *  mask: key's mask, interpreted as in btrie_insert
 *
 * @Return:
 *  0 for success.
 *  -1 for failure (no such prefix).
 */
int btrie_delete(btrie_t *tree, uint32_t key, uint32_t mask);


/**
 * Find an ipv4 address in btrie
 *
 * @Args:
 *  key: ip address to look up
 *
 * @Return:
 *  Value of the longest stored prefix that covers key.
 *  BTRIE_NULL if no stored prefix covers key.
 */
uintptr_t btrie_find(btrie_t *tree, uint32_t key);


/**
 * Add an ipv6 prefix into btrie
 *
 * @Args:
 *  key: ip address, 16 bytes, byte 0 and its top bit are consumed first
 *  mask: key's mask, 16 bytes, same layout as key
 *  value: value of this prefix; must not be BTRIE_NULL
 *
 * @Return:
 *  0 for success.
 *  -1 for failure (the prefix already has a value, or out of memory).
 */
int btrie_insert_a6(btrie_t *tree, const uint8_t *key, const uint8_t *mask,
    uintptr_t value);

/**
 * Delete an ipv6 prefix from btrie
 *
 * @Args:
 *  key: ip address, 16 bytes
 *  mask: key's mask, 16 bytes
 *
 * @Return:
 *  0 for success.
 *  -1 for failure (no such prefix).
 */
int btrie_delete_a6(btrie_t *tree, const uint8_t *key, const uint8_t *mask);

/**
 * Find an ipv6 address in btrie
 *
 * @Args:
 *  key: ip address to look up, 16 bytes
 *
 * @Return:
 *  Value of the longest stored prefix that covers key.
 *  BTRIE_NULL if no stored prefix covers key.
 */
uintptr_t btrie_find_a6(btrie_t *tree, const uint8_t *key);

#if defined (__cplusplus)
}
#endif
|
460
contrib/libbtrie/src/btrie.c
Normal file
460
contrib/libbtrie/src/btrie.c
Normal file
@ -0,0 +1,460 @@
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <btrie.h>
|
||||
|
||||
#define PAGE_SIZE 4096
|
||||
|
||||
|
||||
static btrie_node_t *
|
||||
btrie_alloc(btrie_t *tree)
|
||||
{
|
||||
btrie_node_t *p;
|
||||
|
||||
if (tree->free) {
|
||||
p = tree->free;
|
||||
tree->free = tree->free->right;
|
||||
return p;
|
||||
}
|
||||
|
||||
if (tree->size < sizeof(btrie_node_t)) {
|
||||
tree->start = (char *) calloc(sizeof(char), PAGE_SIZE);
|
||||
if (tree->start == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
tree->pools[tree->len++] = tree->start;
|
||||
tree->size = PAGE_SIZE;
|
||||
}
|
||||
|
||||
p = (btrie_node_t *) tree->start;
|
||||
|
||||
tree->start += sizeof(btrie_node_t);
|
||||
tree->size -= sizeof(btrie_node_t);
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
|
||||
btrie_t *
|
||||
btrie_create()
|
||||
{
|
||||
btrie_t *tree = (btrie_t *) malloc(sizeof(btrie_t));
|
||||
if (tree == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
tree->free = NULL;
|
||||
tree->start = NULL;
|
||||
tree->size = 0;
|
||||
memset(tree->pools, 0, sizeof(btrie_t *) * MAX_PAGES);
|
||||
tree->len = 0;
|
||||
|
||||
tree->root = btrie_alloc(tree);
|
||||
if (tree->root == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
tree->root->right = NULL;
|
||||
tree->root->left = NULL;
|
||||
tree->root->parent = NULL;
|
||||
tree->root->value = BTRIE_NULL;
|
||||
|
||||
return tree;
|
||||
}
|
||||
|
||||
static size_t
|
||||
subtree_weight(btrie_node_t *node)
|
||||
{
|
||||
size_t weight = 1;
|
||||
if (node->left) {
|
||||
weight += subtree_weight(node->left);
|
||||
}
|
||||
if (node->right) {
|
||||
weight += subtree_weight(node->right);
|
||||
}
|
||||
return weight;
|
||||
}
|
||||
|
||||
size_t
|
||||
btrie_count(btrie_t *tree)
|
||||
{
|
||||
if (tree->root == NULL) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
return subtree_weight(tree->root);
|
||||
}
|
||||
|
||||
size_t
|
||||
btrie_allocated(btrie_t *tree)
|
||||
{
|
||||
return tree->len * PAGE_SIZE;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
btrie_insert(btrie_t *tree, uint32_t key, uint32_t mask,
|
||||
uintptr_t value)
|
||||
{
|
||||
uint32_t bit;
|
||||
btrie_node_t *node, *next;
|
||||
|
||||
bit = 0x80000000;
|
||||
|
||||
node = tree->root;
|
||||
next = tree->root;
|
||||
|
||||
while (bit & mask) {
|
||||
if (key & bit) {
|
||||
next = node->right;
|
||||
|
||||
} else {
|
||||
next = node->left;
|
||||
}
|
||||
|
||||
if (next == NULL) {
|
||||
break;
|
||||
}
|
||||
|
||||
bit >>= 1;
|
||||
node = next;
|
||||
}
|
||||
|
||||
if (next) {
|
||||
if (node->value != BTRIE_NULL) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
node->value = value;
|
||||
return 0;
|
||||
}
|
||||
|
||||
while (bit & mask) {
|
||||
next = btrie_alloc(tree);
|
||||
if (next == NULL) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
next->right = NULL;
|
||||
next->left = NULL;
|
||||
next->parent = node;
|
||||
next->value = BTRIE_NULL;
|
||||
|
||||
if (key & bit) {
|
||||
node->right = next;
|
||||
|
||||
} else {
|
||||
node->left = next;
|
||||
}
|
||||
|
||||
bit >>= 1;
|
||||
node = next;
|
||||
}
|
||||
|
||||
node->value = value;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
btrie_delete(btrie_t *tree, uint32_t key, uint32_t mask)
|
||||
{
|
||||
uint32_t bit;
|
||||
btrie_node_t *node;
|
||||
|
||||
bit = 0x80000000;
|
||||
node = tree->root;
|
||||
|
||||
while (node && (bit & mask)) {
|
||||
if (key & bit) {
|
||||
node = node->right;
|
||||
|
||||
} else {
|
||||
node = node->left;
|
||||
}
|
||||
|
||||
bit >>= 1;
|
||||
}
|
||||
|
||||
if (node == NULL) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (node->right || node->left) {
|
||||
if (node->value != BTRIE_NULL) {
|
||||
node->value = BTRIE_NULL;
|
||||
return 0;
|
||||
}
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
for ( ;; ) {
|
||||
if (node->parent->right == node) {
|
||||
node->parent->right = NULL;
|
||||
|
||||
} else {
|
||||
node->parent->left = NULL;
|
||||
}
|
||||
|
||||
node->right = tree->free;
|
||||
tree->free = node;
|
||||
|
||||
node = node->parent;
|
||||
|
||||
if (node->right || node->left) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (node->value != BTRIE_NULL) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (node->parent == NULL) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
uintptr_t
|
||||
btrie_find(btrie_t *tree, uint32_t key)
|
||||
{
|
||||
uint32_t bit;
|
||||
uintptr_t value;
|
||||
btrie_node_t *node;
|
||||
|
||||
bit = 0x80000000;
|
||||
value = BTRIE_NULL;
|
||||
node = tree->root;
|
||||
|
||||
while (node) {
|
||||
if (node->value != BTRIE_NULL) {
|
||||
value = node->value;
|
||||
}
|
||||
|
||||
if (key & bit) {
|
||||
node = node->right;
|
||||
|
||||
} else {
|
||||
node = node->left;
|
||||
}
|
||||
|
||||
bit >>= 1;
|
||||
}
|
||||
|
||||
return value;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
btrie_insert_a6(btrie_t *tree, const uint8_t *key, const uint8_t *mask,
|
||||
uintptr_t value)
|
||||
{
|
||||
uint8_t bit;
|
||||
uint i;
|
||||
btrie_node_t *node, *next;
|
||||
|
||||
i = 0;
|
||||
bit = 0x80;
|
||||
|
||||
node = tree->root;
|
||||
next = tree->root;
|
||||
|
||||
while (bit & mask[i]) {
|
||||
if (key[i] & bit) {
|
||||
next = node->right;
|
||||
|
||||
} else {
|
||||
next = node->left;
|
||||
}
|
||||
|
||||
if (next == NULL) {
|
||||
break;
|
||||
}
|
||||
|
||||
bit >>= 1;
|
||||
node = next;
|
||||
|
||||
if (bit == 0) {
|
||||
if (++i == 16) {
|
||||
break;
|
||||
}
|
||||
|
||||
bit = 0x80;
|
||||
}
|
||||
}
|
||||
|
||||
if (next) {
|
||||
if (node->value != BTRIE_NULL) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
node->value = value;
|
||||
return 0;
|
||||
}
|
||||
|
||||
while (bit & mask[i]) {
|
||||
next = btrie_alloc(tree);
|
||||
if (next == NULL) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
next->right = NULL;
|
||||
next->left = NULL;
|
||||
next->parent = node;
|
||||
next->value = BTRIE_NULL;
|
||||
|
||||
if (key[i] & bit) {
|
||||
node->right = next;
|
||||
|
||||
} else {
|
||||
node->left = next;
|
||||
}
|
||||
|
||||
bit >>= 1;
|
||||
node = next;
|
||||
|
||||
if (bit == 0) {
|
||||
if (++i == 16) {
|
||||
break;
|
||||
}
|
||||
|
||||
bit = 0x80;
|
||||
}
|
||||
}
|
||||
|
||||
node->value = value;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
btrie_delete_a6(btrie_t *tree, const uint8_t *key, const uint8_t *mask)
|
||||
{
|
||||
uint8_t bit;
|
||||
uint i;
|
||||
btrie_node_t *node;
|
||||
|
||||
i = 0;
|
||||
bit = 0x80;
|
||||
node = tree->root;
|
||||
|
||||
while (node && (bit & mask[i])) {
|
||||
if (key[i] & bit) {
|
||||
node = node->right;
|
||||
|
||||
} else {
|
||||
node = node->left;
|
||||
}
|
||||
|
||||
bit >>= 1;
|
||||
|
||||
if (bit == 0) {
|
||||
if (++i == 16) {
|
||||
break;
|
||||
}
|
||||
|
||||
bit = 0x80;
|
||||
}
|
||||
}
|
||||
|
||||
if (node == NULL) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (node->right || node->left) {
|
||||
if (node->value != BTRIE_NULL) {
|
||||
node->value = BTRIE_NULL;
|
||||
return 0;
|
||||
}
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
for ( ;; ) {
|
||||
if (node->parent->right == node) {
|
||||
node->parent->right = NULL;
|
||||
|
||||
} else {
|
||||
node->parent->left = NULL;
|
||||
}
|
||||
|
||||
node->right = tree->free;
|
||||
tree->free = node;
|
||||
|
||||
node = node->parent;
|
||||
|
||||
if (node->right || node->left) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (node->value != BTRIE_NULL) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (node->parent == NULL) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
uintptr_t
|
||||
btrie_find_a6(btrie_t *tree, const uint8_t *key)
|
||||
{
|
||||
uint8_t bit;
|
||||
uintptr_t value;
|
||||
uint i;
|
||||
btrie_node_t *node;
|
||||
|
||||
i = 0;
|
||||
bit = 0x80;
|
||||
value = BTRIE_NULL;
|
||||
node = tree->root;
|
||||
|
||||
while (node) {
|
||||
if (node->value != BTRIE_NULL) {
|
||||
value = node->value;
|
||||
}
|
||||
|
||||
if (key[i] & bit) {
|
||||
node = node->right;
|
||||
|
||||
} else {
|
||||
node = node->left;
|
||||
}
|
||||
|
||||
bit >>= 1;
|
||||
|
||||
if (bit == 0) {
|
||||
i++;
|
||||
bit = 0x80;
|
||||
}
|
||||
}
|
||||
|
||||
return value;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
btrie_destroy(btrie_t *tree)
|
||||
{
|
||||
size_t i;
|
||||
|
||||
|
||||
/* free memory pools */
|
||||
for (i = 0; i < tree->len; i++) {
|
||||
free(tree->pools[i]);
|
||||
}
|
||||
|
||||
free(tree);
|
||||
|
||||
return 0;
|
||||
}
|
94
contrib/libbtrie/test/test_btrie.c
Normal file
94
contrib/libbtrie/test/test_btrie.c
Normal file
@ -0,0 +1,94 @@
|
||||
#include <stdio.h>
|
||||
#include <btrie.h>
|
||||
|
||||
int main()
|
||||
{
|
||||
btrie_t *it;
|
||||
int ret;
|
||||
|
||||
uint8_t prefix_v6[16] = {0xde, 0xad, 0xbe, 0xef};
|
||||
uint8_t mask_v6[16] = {0xff, 0xff, 0xff};
|
||||
uint8_t ip_v6[16] = {0xde, 0xad, 0xbe, 0xef, 0xde};
|
||||
|
||||
it = btrie_create();
|
||||
if (it == NULL) {
|
||||
printf("create error!\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
//add 101.45.69.50/16
|
||||
ret = btrie_insert(it, 1697465650, 0xffff0000, 1);
|
||||
if (ret != 0) {
|
||||
printf("insert 1 error.\n");
|
||||
goto error;
|
||||
}
|
||||
|
||||
//add 10.45.69.50/16
|
||||
ret = btrie_insert(it, 170738994, 0xffff0000, 1);
|
||||
if (ret != 0) {
|
||||
printf("insert 2 error.\n");
|
||||
goto error;
|
||||
}
|
||||
|
||||
//add 10.45.79.50/16
|
||||
ret = btrie_insert(it, 170741554, 0xffff0000, 1);
|
||||
if (ret == 0) {
|
||||
printf("insert 3 error.\n");
|
||||
goto error;
|
||||
}
|
||||
|
||||
//add 102.45.79.50/24
|
||||
ret = btrie_insert(it, 1714245426, 0xffffff00, 1);
|
||||
if (ret != 0) {
|
||||
printf("insert 4 error.\n");
|
||||
goto error;
|
||||
}
|
||||
|
||||
ret = btrie_find(it, 170741554);
|
||||
if (ret == 1) {
|
||||
printf("test case 1 passed\n");
|
||||
} else {
|
||||
printf("test case 1 error\n");
|
||||
}
|
||||
|
||||
ret = btrie_find(it, 170786817);
|
||||
if (ret != 1) {
|
||||
printf("test case 2 passed\n");
|
||||
} else {
|
||||
printf("test case 2 error\n");
|
||||
}
|
||||
|
||||
ret = btrie_delete(it, 1714245426, 0xffffff00);
|
||||
if (ret != 0) {
|
||||
printf("delete 1 error\n");
|
||||
goto error;
|
||||
}
|
||||
|
||||
ret = btrie_find(it, 1714245426);
|
||||
if (ret != 1) {
|
||||
printf("test case 3 passed\n");
|
||||
} else {
|
||||
printf("test case 3 error\n");
|
||||
}
|
||||
|
||||
//add dead:beef::/32
|
||||
ret = btrie_insert_a6(it, prefix_v6, mask_v6, 1);
|
||||
if (ret != 0) {
|
||||
printf("insert 5 error\n");
|
||||
goto error;
|
||||
}
|
||||
|
||||
ret = btrie_find_a6(it, ip_v6);
|
||||
if (ret == 1) {
|
||||
printf("test case 4 passed\n");
|
||||
} else {
|
||||
printf("test case 4 error\n");
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
error:
|
||||
btrie_destroy(it);
|
||||
printf("test failed\n");
|
||||
return 1;
|
||||
}
|
@ -21,6 +21,7 @@ include_directories (BEFORE ${ClickHouse_SOURCE_DIR}/contrib/libdivide)
|
||||
include_directories (BEFORE ${ClickHouse_SOURCE_DIR}/contrib/libcpuid/include)
|
||||
include_directories (BEFORE ${ClickHouse_SOURCE_DIR}/contrib/libfarmhash)
|
||||
include_directories (BEFORE ${ClickHouse_SOURCE_DIR}/contrib/libmetrohash/src)
|
||||
include_directories (BEFORE ${ClickHouse_SOURCE_DIR}/contrib/libbtrie/include)
|
||||
include_directories (${ClickHouse_SOURCE_DIR}/libs/libdaemon/include)
|
||||
include_directories (${ClickHouse_BINARY_DIR}/dbms/src)
|
||||
|
||||
@ -153,6 +154,7 @@ if (NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
|
||||
src/Dictionaries/FlatDictionary.cpp
|
||||
src/Dictionaries/HashedDictionary.cpp
|
||||
src/Dictionaries/CacheDictionary.cpp
|
||||
src/Dictionaries/TrieDictionary.cpp
|
||||
src/Dictionaries/RangeHashedDictionary.cpp
|
||||
src/Dictionaries/ComplexKeyHashedDictionary.cpp
|
||||
src/Dictionaries/ComplexKeyCacheDictionary.cpp
|
||||
@ -185,6 +187,7 @@ target_link_libraries (dbms
|
||||
${OPENSSL_CRYPTO_LIBRARY}
|
||||
${Boost_SYSTEM_LIBRARY}
|
||||
${Poco_Data_LIBRARY}
|
||||
btrie
|
||||
)
|
||||
|
||||
if (Poco_DataODBC_FOUND)
|
||||
|
545
dbms/src/Dictionaries/TrieDictionary.cpp
Normal file
545
dbms/src/Dictionaries/TrieDictionary.cpp
Normal file
@ -0,0 +1,545 @@
|
||||
#include <ext/map.hpp>
|
||||
#include <ext/range.hpp>
|
||||
#include <Poco/Net/IPAddress.h>
|
||||
#include <Poco/ByteOrder.h>
|
||||
#include <Dictionaries/TrieDictionary.h>
|
||||
#include <iostream>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
/// Error codes referenced in this file; they are only declared here.
namespace ErrorCodes
{
    extern const int TYPE_MISMATCH;
    extern const int ARGUMENT_OUT_OF_BOUND;
    extern const int BAD_ARGUMENTS;
    extern const int DICTIONARY_IS_EMPTY;
}
|
||||
|
||||
/// Builds the attribute containers, creates the trie and loads all data from the source.
/// An exception thrown while loading is stored in `creation_exception` instead of being propagated.
TrieDictionary::TrieDictionary(
    const std::string & name, const DictionaryStructure & dict_struct, DictionarySourcePtr source_ptr,
    const DictionaryLifetime dict_lifetime, bool require_nonempty)
    : name{name}
    , dict_struct(dict_struct)
    , source_ptr{std::move(source_ptr)}
    , dict_lifetime(dict_lifetime)
    , require_nonempty(require_nonempty)
{
    createAttributes();
    trie = btrie_create();

    try
    {
        loadData();
        calculateBytesAllocated();
    }
    catch (...)
    {
        creation_exception = std::current_exception();
    }

    creation_time = std::chrono::system_clock::now();
}
|
||||
|
||||
TrieDictionary::TrieDictionary(const TrieDictionary & other)
|
||||
: TrieDictionary{other.name, other.dict_struct, other.source_ptr->clone(), other.dict_lifetime, other.require_nonempty}
|
||||
{
|
||||
trie = btrie_create();
|
||||
}
|
||||
|
||||
TrieDictionary::~TrieDictionary()
|
||||
{
|
||||
btrie_destroy(trie);
|
||||
}
|
||||
|
||||
#define DECLARE(TYPE)\
|
||||
void TrieDictionary::get##TYPE(\
|
||||
const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types,\
|
||||
PaddedPODArray<TYPE> & out) const\
|
||||
{\
|
||||
validateKeyTypes(key_types);\
|
||||
\
|
||||
const auto & attribute = getAttribute(attribute_name);\
|
||||
if (!isAttributeTypeConvertibleTo(attribute.type, AttributeUnderlyingType::TYPE))\
|
||||
throw Exception{\
|
||||
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),\
|
||||
ErrorCodes::TYPE_MISMATCH};\
|
||||
\
|
||||
const auto null_value = std::get<TYPE>(attribute.null_values);\
|
||||
\
|
||||
getItemsNumber<TYPE>(attribute, key_columns,\
|
||||
[&] (const std::size_t row, const auto value) { out[row] = value; },\
|
||||
[&] (const std::size_t) { return null_value; });\
|
||||
}
|
||||
DECLARE(UInt8)
|
||||
DECLARE(UInt16)
|
||||
DECLARE(UInt32)
|
||||
DECLARE(UInt64)
|
||||
DECLARE(Int8)
|
||||
DECLARE(Int16)
|
||||
DECLARE(Int32)
|
||||
DECLARE(Int64)
|
||||
DECLARE(Float32)
|
||||
DECLARE(Float64)
|
||||
#undef DECLARE
|
||||
|
||||
void TrieDictionary::getString(
|
||||
const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types,
|
||||
ColumnString * out) const
|
||||
{
|
||||
validateKeyTypes(key_types);
|
||||
|
||||
const auto & attribute = getAttribute(attribute_name);
|
||||
if (!isAttributeTypeConvertibleTo(attribute.type, AttributeUnderlyingType::String))
|
||||
throw Exception{
|
||||
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),
|
||||
ErrorCodes::TYPE_MISMATCH};
|
||||
|
||||
const auto & null_value = StringRef{std::get<String>(attribute.null_values)};
|
||||
|
||||
getItemsImpl<StringRef, StringRef>(attribute, key_columns,
|
||||
[&] (const std::size_t row, const StringRef value) { out->insertData(value.data, value.size); },
|
||||
[&] (const std::size_t) { return null_value; });
|
||||
}
|
||||
|
||||
#define DECLARE(TYPE)\
|
||||
void TrieDictionary::get##TYPE(\
|
||||
const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types,\
|
||||
const PaddedPODArray<TYPE> & def, PaddedPODArray<TYPE> & out) const\
|
||||
{\
|
||||
validateKeyTypes(key_types);\
|
||||
\
|
||||
const auto & attribute = getAttribute(attribute_name);\
|
||||
if (!isAttributeTypeConvertibleTo(attribute.type, AttributeUnderlyingType::TYPE))\
|
||||
throw Exception{\
|
||||
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),\
|
||||
ErrorCodes::TYPE_MISMATCH};\
|
||||
\
|
||||
getItemsNumber<TYPE>(attribute, key_columns,\
|
||||
[&] (const std::size_t row, const auto value) { out[row] = value; },\
|
||||
[&] (const std::size_t row) { return def[row]; });\
|
||||
}
|
||||
DECLARE(UInt8)
|
||||
DECLARE(UInt16)
|
||||
DECLARE(UInt32)
|
||||
DECLARE(UInt64)
|
||||
DECLARE(Int8)
|
||||
DECLARE(Int16)
|
||||
DECLARE(Int32)
|
||||
DECLARE(Int64)
|
||||
DECLARE(Float32)
|
||||
DECLARE(Float64)
|
||||
#undef DECLARE
|
||||
|
||||
void TrieDictionary::getString(
|
||||
const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types,
|
||||
const ColumnString * const def, ColumnString * const out) const
|
||||
{
|
||||
validateKeyTypes(key_types);
|
||||
|
||||
const auto & attribute = getAttribute(attribute_name);
|
||||
if (!isAttributeTypeConvertibleTo(attribute.type, AttributeUnderlyingType::String))
|
||||
throw Exception{
|
||||
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),
|
||||
ErrorCodes::TYPE_MISMATCH};
|
||||
|
||||
getItemsImpl<StringRef, StringRef>(attribute, key_columns,
|
||||
[&] (const std::size_t row, const StringRef value) { out->insertData(value.data, value.size); },
|
||||
[&] (const std::size_t row) { return def->getDataAt(row); });
|
||||
}
|
||||
|
||||
#define DECLARE(TYPE)\
|
||||
void TrieDictionary::get##TYPE(\
|
||||
const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types,\
|
||||
const TYPE def, PaddedPODArray<TYPE> & out) const\
|
||||
{\
|
||||
validateKeyTypes(key_types);\
|
||||
\
|
||||
const auto & attribute = getAttribute(attribute_name);\
|
||||
if (!isAttributeTypeConvertibleTo(attribute.type, AttributeUnderlyingType::TYPE))\
|
||||
throw Exception{\
|
||||
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),\
|
||||
ErrorCodes::TYPE_MISMATCH};\
|
||||
\
|
||||
getItemsNumber<TYPE>(attribute, key_columns,\
|
||||
[&] (const std::size_t row, const auto value) { out[row] = value; },\
|
||||
[&] (const std::size_t) { return def; });\
|
||||
}
|
||||
DECLARE(UInt8)
|
||||
DECLARE(UInt16)
|
||||
DECLARE(UInt32)
|
||||
DECLARE(UInt64)
|
||||
DECLARE(Int8)
|
||||
DECLARE(Int16)
|
||||
DECLARE(Int32)
|
||||
DECLARE(Int64)
|
||||
DECLARE(Float32)
|
||||
DECLARE(Float64)
|
||||
#undef DECLARE
|
||||
|
||||
void TrieDictionary::getString(
|
||||
const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types,
|
||||
const String & def, ColumnString * const out) const
|
||||
{
|
||||
validateKeyTypes(key_types);
|
||||
|
||||
const auto & attribute = getAttribute(attribute_name);
|
||||
if (!isAttributeTypeConvertibleTo(attribute.type, AttributeUnderlyingType::String))
|
||||
throw Exception{
|
||||
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),
|
||||
ErrorCodes::TYPE_MISMATCH};
|
||||
|
||||
getItemsImpl<StringRef, StringRef>(attribute, key_columns,
|
||||
[&] (const std::size_t row, const StringRef value) { out->insertData(value.data, value.size); },
|
||||
[&] (const std::size_t) { return StringRef{def}; });
|
||||
}
|
||||
|
||||
void TrieDictionary::has(const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types, PaddedPODArray<UInt8> & out) const
|
||||
{
|
||||
validateKeyTypes(key_types);
|
||||
|
||||
const auto & attribute = attributes.front();
|
||||
|
||||
switch (attribute.type)
|
||||
{
|
||||
case AttributeUnderlyingType::UInt8: has<UInt8>(attribute, key_columns, out); break;
|
||||
case AttributeUnderlyingType::UInt16: has<UInt16>(attribute, key_columns, out); break;
|
||||
case AttributeUnderlyingType::UInt32: has<UInt32>(attribute, key_columns, out); break;
|
||||
case AttributeUnderlyingType::UInt64: has<UInt64>(attribute, key_columns, out); break;
|
||||
case AttributeUnderlyingType::Int8: has<Int8>(attribute, key_columns, out); break;
|
||||
case AttributeUnderlyingType::Int16: has<Int16>(attribute, key_columns, out); break;
|
||||
case AttributeUnderlyingType::Int32: has<Int32>(attribute, key_columns, out); break;
|
||||
case AttributeUnderlyingType::Int64: has<Int64>(attribute, key_columns, out); break;
|
||||
case AttributeUnderlyingType::Float32: has<Float32>(attribute, key_columns, out); break;
|
||||
case AttributeUnderlyingType::Float64: has<Float64>(attribute, key_columns, out); break;
|
||||
case AttributeUnderlyingType::String: has<StringRef>(attribute, key_columns, out); break;
|
||||
}
|
||||
}
|
||||
|
||||
void TrieDictionary::createAttributes()
|
||||
{
|
||||
const auto size = dict_struct.attributes.size();
|
||||
attributes.reserve(size);
|
||||
|
||||
for (const auto & attribute : dict_struct.attributes)
|
||||
{
|
||||
attribute_index_by_name.emplace(attribute.name, attributes.size());
|
||||
attributes.push_back(createAttributeWithType(attribute.underlying_type, attribute.null_value));
|
||||
|
||||
if (attribute.hierarchical)
|
||||
throw Exception{
|
||||
name + ": hierarchical attributes not supported for dictionary of type " + getTypeName(),
|
||||
ErrorCodes::TYPE_MISMATCH};
|
||||
}
|
||||
}
|
||||
|
||||
void TrieDictionary::loadData()
|
||||
{
|
||||
auto stream = source_ptr->loadAll();
|
||||
stream->readPrefix();
|
||||
|
||||
/// created upfront to avoid excess allocations
|
||||
const auto keys_size = dict_struct.key.value().size();
|
||||
StringRefs keys(keys_size);
|
||||
|
||||
const auto attributes_size = attributes.size();
|
||||
|
||||
while (const auto block = stream->read())
|
||||
{
|
||||
const auto rows = block.rows();
|
||||
element_count += rows;
|
||||
|
||||
const auto key_column_ptrs = ext::map<ConstColumnPlainPtrs>(ext::range(0, keys_size),
|
||||
[&] (const std::size_t attribute_idx) {
|
||||
return block.safeGetByPosition(attribute_idx).column.get();
|
||||
});
|
||||
|
||||
const auto attribute_column_ptrs = ext::map<ConstColumnPlainPtrs>(ext::range(0, attributes_size),
|
||||
[&] (const std::size_t attribute_idx) {
|
||||
return block.safeGetByPosition(keys_size + attribute_idx).column.get();
|
||||
});
|
||||
|
||||
for (const auto row_idx : ext::range(0, rows))
|
||||
{
|
||||
/// calculate key once per row
|
||||
const auto key_column = key_column_ptrs.front();
|
||||
|
||||
for (const auto attribute_idx : ext::range(0, attributes_size))
|
||||
{
|
||||
const auto & attribute_column = *attribute_column_ptrs[attribute_idx];
|
||||
auto & attribute = attributes[attribute_idx];
|
||||
setAttributeValue(attribute, key_column->getDataAt(row_idx), attribute_column[row_idx]);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
stream->readSuffix();
|
||||
|
||||
if (require_nonempty && 0 == element_count)
|
||||
throw Exception{
|
||||
name + ": dictionary source is empty and 'require_nonempty' property is set.",
|
||||
ErrorCodes::DICTIONARY_IS_EMPTY};
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void TrieDictionary::addAttributeSize(const Attribute & attribute)
|
||||
{
|
||||
const auto & vec = *std::get<ContainerPtrType<T>>(attribute.maps);
|
||||
bytes_allocated += sizeof(ContainerType<T>) + (vec.capacity() * sizeof(T));
|
||||
bucket_count = vec.size();
|
||||
}
|
||||
|
||||
void TrieDictionary::calculateBytesAllocated()
|
||||
{
|
||||
bytes_allocated += attributes.size() * sizeof(attributes.front());
|
||||
|
||||
for (const auto & attribute : attributes)
|
||||
{
|
||||
switch (attribute.type)
|
||||
{
|
||||
case AttributeUnderlyingType::UInt8: addAttributeSize<UInt8>(attribute); break;
|
||||
case AttributeUnderlyingType::UInt16: addAttributeSize<UInt16>(attribute); break;
|
||||
case AttributeUnderlyingType::UInt32: addAttributeSize<UInt32>(attribute); break;
|
||||
case AttributeUnderlyingType::UInt64: addAttributeSize<UInt64>(attribute); break;
|
||||
case AttributeUnderlyingType::Int8: addAttributeSize<Int8>(attribute); break;
|
||||
case AttributeUnderlyingType::Int16: addAttributeSize<Int16>(attribute); break;
|
||||
case AttributeUnderlyingType::Int32: addAttributeSize<Int32>(attribute); break;
|
||||
case AttributeUnderlyingType::Int64: addAttributeSize<Int64>(attribute); break;
|
||||
case AttributeUnderlyingType::Float32: addAttributeSize<Float32>(attribute); break;
|
||||
case AttributeUnderlyingType::Float64: addAttributeSize<Float64>(attribute); break;
|
||||
case AttributeUnderlyingType::String:
|
||||
{
|
||||
addAttributeSize<StringRef>(attribute);
|
||||
bytes_allocated += sizeof(Arena) + attribute.string_arena->size();
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bytes_allocated += btrie_allocated(trie);
|
||||
}
|
||||
|
||||
void TrieDictionary::validateKeyTypes(const DataTypes & key_types) const
|
||||
{
|
||||
if (key_types.size() != 1)
|
||||
throw Exception{
|
||||
"Expected a single IP address",
|
||||
ErrorCodes::TYPE_MISMATCH};
|
||||
|
||||
const auto & actual_type = key_types[0]->getName();
|
||||
|
||||
if (actual_type != "UInt32" && actual_type != "FixedString(16)")
|
||||
throw Exception{
|
||||
"Key does not match, expected either UInt32 or FixedString(16)",
|
||||
ErrorCodes::TYPE_MISMATCH};
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
void TrieDictionary::createAttributeImpl(Attribute & attribute, const Field & null_value)
|
||||
{
|
||||
std::get<T>(attribute.null_values) = null_value.get<typename NearestFieldType<T>::Type>();
|
||||
std::get<ContainerPtrType<T>>(attribute.maps) = std::make_unique<ContainerType<T>>();
|
||||
}
|
||||
|
||||
/*
 * Builds an Attribute of the requested underlying type with its null value set
 * from `null_value` and an empty value container. String attributes also get
 * an arena for the string bytes.
 */
TrieDictionary::Attribute TrieDictionary::createAttributeWithType(const AttributeUnderlyingType type, const Field & null_value)
{
    Attribute result{type};

    if (type == AttributeUnderlyingType::String)
    {
        // Strings keep a String null value, but their container stores StringRef.
        std::get<String>(result.null_values) = null_value.get<String>();
        std::get<ContainerPtrType<StringRef>>(result.maps) = std::make_unique<ContainerType<StringRef>>();
        result.string_arena = std::make_unique<Arena>();
    }
#define CREATE(TYPE) \
    else if (type == AttributeUnderlyingType::TYPE) \
        createAttributeImpl<TYPE>(result, null_value);
    CREATE(UInt8)
    CREATE(UInt16)
    CREATE(UInt32)
    CREATE(UInt64)
    CREATE(Int8)
    CREATE(Int16)
    CREATE(Int32)
    CREATE(Int64)
    CREATE(Float32)
    CREATE(Float64)
#undef CREATE

    return result;
}
|
||||
|
||||
|
||||
template <typename OutputType, typename ValueSetter, typename DefaultGetter>
|
||||
void TrieDictionary::getItemsNumber(
|
||||
const Attribute & attribute,
|
||||
const ConstColumnPlainPtrs & key_columns,
|
||||
ValueSetter && set_value,
|
||||
DefaultGetter && get_default) const
|
||||
{
|
||||
if (false) {}
|
||||
#define DISPATCH(TYPE) \
|
||||
else if (attribute.type == AttributeUnderlyingType::TYPE) \
|
||||
getItemsImpl<TYPE, OutputType>(attribute, key_columns, std::forward<ValueSetter>(set_value), std::forward<DefaultGetter>(get_default));
|
||||
DISPATCH(UInt8)
|
||||
DISPATCH(UInt16)
|
||||
DISPATCH(UInt32)
|
||||
DISPATCH(UInt64)
|
||||
DISPATCH(Int8)
|
||||
DISPATCH(Int16)
|
||||
DISPATCH(Int32)
|
||||
DISPATCH(Int64)
|
||||
DISPATCH(Float32)
|
||||
DISPATCH(Float64)
|
||||
#undef DISPATCH
|
||||
else
|
||||
throw Exception("Unexpected type of attribute: " + toString(attribute.type), ErrorCodes::LOGICAL_ERROR);
|
||||
}
|
||||
|
||||
template <typename AttributeType, typename OutputType, typename ValueSetter, typename DefaultGetter>
|
||||
void TrieDictionary::getItemsImpl(
|
||||
const Attribute & attribute,
|
||||
const ConstColumnPlainPtrs & key_columns,
|
||||
ValueSetter && set_value,
|
||||
DefaultGetter && get_default) const
|
||||
{
|
||||
auto & vec = *std::get<ContainerPtrType<AttributeType>>(attribute.maps);
|
||||
|
||||
const auto first_column = key_columns.front();
|
||||
const auto rows = first_column->size();
|
||||
if (first_column->isNumeric())
|
||||
{
|
||||
for (const auto i : ext::range(0, rows))
|
||||
{
|
||||
auto addr = Int32(first_column->get64(i));
|
||||
uintptr_t slot = btrie_find(trie, addr);
|
||||
set_value(i, slot != BTRIE_NULL ? vec[slot] : get_default(i));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for (const auto i : ext::range(0, rows))
|
||||
{
|
||||
auto addr = first_column->getDataAt(i);
|
||||
if (addr.size != 16)
|
||||
throw Exception("Expected key to be FixedString(16)", ErrorCodes::LOGICAL_ERROR);
|
||||
|
||||
uintptr_t slot = btrie_find_a6(trie, reinterpret_cast<const UInt8*>(addr.data));
|
||||
set_value(i, slot != BTRIE_NULL ? vec[slot] : get_default(i));
|
||||
}
|
||||
}
|
||||
|
||||
query_count.fetch_add(rows, std::memory_order_relaxed);
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
bool TrieDictionary::setAttributeValueImpl(Attribute & attribute, const StringRef key, const T value)
|
||||
{
|
||||
// Insert value into appropriate vector type
|
||||
auto & vec = *std::get<ContainerPtrType<T>>(attribute.maps);
|
||||
size_t row = vec.size();
|
||||
vec.push_back(value);
|
||||
|
||||
// Parse IP address and subnet length from string (e.g. 2a02:6b8::3/64)
|
||||
Poco::Net::IPAddress addr, mask;
|
||||
std::string addr_str(key.toString());
|
||||
size_t pos = addr_str.find('/');
|
||||
if (pos != std::string::npos)
|
||||
{
|
||||
|
||||
addr = Poco::Net::IPAddress(addr_str.substr(0, pos));
|
||||
mask = Poco::Net::IPAddress(std::stoi(addr_str.substr(pos + 1), nullptr, 10), addr.family());
|
||||
}
|
||||
else
|
||||
{
|
||||
addr = Poco::Net::IPAddress(addr_str);
|
||||
mask = Poco::Net::IPAddress(addr.length() * 8, addr.family());
|
||||
}
|
||||
|
||||
/*
|
||||
* Here we might overwrite the same key with the same slot as each key can map to multiple attributes.
|
||||
* However, all columns have equal number of rows so it is okay to store only row number for each key
|
||||
* instead of building a trie for each column. This comes at the cost of additional lookup in attribute
|
||||
* vector on lookup time to return cell from row + column. The reason for this is to save space,
|
||||
* and build only single trie instead of trie for each column.
|
||||
*/
|
||||
if (addr.family() == Poco::Net::IPAddress::IPv4)
|
||||
{
|
||||
UInt32 addr_v4 = Poco::ByteOrder::toNetwork(*reinterpret_cast<const UInt32*>(addr.addr()));
|
||||
UInt32 mask_v4 = Poco::ByteOrder::toNetwork(*reinterpret_cast<const UInt32*>(mask.addr()));
|
||||
return btrie_insert(trie, addr_v4, mask_v4, row) == 0;
|
||||
}
|
||||
|
||||
const uint8_t* addr_v6 = reinterpret_cast<const uint8_t*>(addr.addr());
|
||||
const uint8_t* mask_v6 = reinterpret_cast<const uint8_t*>(mask.addr());
|
||||
return btrie_insert_a6(trie, addr_v6, mask_v6, row) == 0;
|
||||
}
|
||||
|
||||
/*
 * Stores `value` for `key` in `attribute`, converting the Field to the
 * attribute's underlying type. Unsigned types are read from the Field as
 * UInt64, signed types as Int64 and floating-point types as Float64. String
 * values are first copied into the attribute's arena.
 *
 * Returns the result of setAttributeValueImpl for numeric types, true for
 * String, and false for an attribute type that is not handled.
 */
bool TrieDictionary::setAttributeValue(Attribute & attribute, const StringRef key, const Field & value)
{
    if (attribute.type == AttributeUnderlyingType::String)
    {
        const auto & str = value.get<String>();
        const auto arena_copy = attribute.string_arena->insert(str.data(), str.size());
        // NOTE(review): unlike the numeric branches, the insert result is
        // dropped here and success is always reported - confirm this is intended.
        setAttributeValueImpl<StringRef>(attribute, key, StringRef{arena_copy, str.size()});
        return true;
    }

    switch (attribute.type)
    {
#define STORE(TYPE, FIELD_TYPE) \
        case AttributeUnderlyingType::TYPE: \
            return setAttributeValueImpl<TYPE>(attribute, key, value.get<FIELD_TYPE>());
        STORE(UInt8, UInt64)
        STORE(UInt16, UInt64)
        STORE(UInt32, UInt64)
        STORE(UInt64, UInt64)
        STORE(Int8, Int64)
        STORE(Int16, Int64)
        STORE(Int32, Int64)
        STORE(Int64, Int64)
        STORE(Float32, Float64)
        STORE(Float64, Float64)
#undef STORE
        default:
            return false;
    }
}
|
||||
|
||||
/*
 * Returns the attribute registered under `attribute_name`.
 * Throws BAD_ARGUMENTS if the dictionary has no attribute with that name.
 */
const TrieDictionary::Attribute & TrieDictionary::getAttribute(const std::string & attribute_name) const
{
    const auto found = attribute_index_by_name.find(attribute_name);

    if (found != std::end(attribute_index_by_name))
        return attributes[found->second];

    throw Exception{name + ": no such attribute '" + attribute_name + "'", ErrorCodes::BAD_ARGUMENTS};
}
|
||||
|
||||
template <typename T>
|
||||
void TrieDictionary::has(const Attribute & attribute, const ConstColumnPlainPtrs & key_columns, PaddedPODArray<UInt8> & out) const
|
||||
{
|
||||
const auto first_column = key_columns.front();
|
||||
const auto rows = first_column->size();
|
||||
if (first_column->isNumeric())
|
||||
{
|
||||
for (const auto i : ext::range(0, rows))
|
||||
{
|
||||
auto addr = Int32(first_column->get64(i));
|
||||
uintptr_t slot = btrie_find(trie, addr);
|
||||
out[i] = (slot != BTRIE_NULL);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for (const auto i : ext::range(0, rows))
|
||||
{
|
||||
auto addr = first_column->getDataAt(i);
|
||||
if (unlikely(addr.size != 16))
|
||||
throw Exception("Expected key to be FixedString(16)", ErrorCodes::LOGICAL_ERROR);
|
||||
|
||||
uintptr_t slot = btrie_find_a6(trie, reinterpret_cast<const UInt8*>(addr.data));
|
||||
out[i] = (slot != BTRIE_NULL);
|
||||
}
|
||||
}
|
||||
|
||||
query_count.fetch_add(rows, std::memory_order_relaxed);}
|
||||
|
||||
}
|
216
dbms/src/Dictionaries/TrieDictionary.h
Normal file
216
dbms/src/Dictionaries/TrieDictionary.h
Normal file
@ -0,0 +1,216 @@
|
||||
#pragma once
|
||||
|
||||
#include <Dictionaries/IDictionary.h>
|
||||
#include <Dictionaries/IDictionarySource.h>
|
||||
#include <Dictionaries/DictionaryStructure.h>
|
||||
#include <Core/StringRef.h>
|
||||
#include <Common/HashTable/HashMap.h>
|
||||
#include <Columns/ColumnString.h>
|
||||
#include <Common/Arena.h>
|
||||
#include <ext/range.hpp>
|
||||
#include <btrie.h>
|
||||
#include <atomic>
|
||||
#include <memory>
|
||||
#include <tuple>
|
||||
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
class TrieDictionary final : public IDictionaryBase
|
||||
{
|
||||
public:
|
||||
TrieDictionary(
|
||||
const std::string & name, const DictionaryStructure & dict_struct, DictionarySourcePtr source_ptr,
|
||||
const DictionaryLifetime dict_lifetime, bool require_nonempty);
|
||||
|
||||
TrieDictionary(const TrieDictionary & other);
|
||||
|
||||
~TrieDictionary();
|
||||
|
||||
std::string getKeyDescription() const { return key_description; };
|
||||
|
||||
std::exception_ptr getCreationException() const override { return creation_exception; }
|
||||
|
||||
std::string getName() const override { return name; }
|
||||
|
||||
std::string getTypeName() const override { return "Trie"; }
|
||||
|
||||
std::size_t getBytesAllocated() const override { return bytes_allocated; }
|
||||
|
||||
std::size_t getQueryCount() const override { return query_count.load(std::memory_order_relaxed); }
|
||||
|
||||
double getHitRate() const override { return 1.0; }
|
||||
|
||||
std::size_t getElementCount() const override { return element_count; }
|
||||
|
||||
double getLoadFactor() const override { return static_cast<double>(element_count) / bucket_count; }
|
||||
|
||||
bool isCached() const override { return false; }
|
||||
|
||||
DictionaryPtr clone() const override { return std::make_unique<TrieDictionary>(*this); }
|
||||
|
||||
const IDictionarySource * getSource() const override { return source_ptr.get(); }
|
||||
|
||||
const DictionaryLifetime & getLifetime() const override { return dict_lifetime; }
|
||||
|
||||
const DictionaryStructure & getStructure() const override { return dict_struct; }
|
||||
|
||||
std::chrono::time_point<std::chrono::system_clock> getCreationTime() const override
|
||||
{
|
||||
return creation_time;
|
||||
}
|
||||
|
||||
bool isInjective(const std::string & attribute_name) const override
|
||||
{
|
||||
return dict_struct.attributes[&getAttribute(attribute_name) - attributes.data()].injective;
|
||||
}
|
||||
|
||||
#define DECLARE(TYPE)\
|
||||
void get##TYPE(\
|
||||
const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types,\
|
||||
PaddedPODArray<TYPE> & out) const;
|
||||
DECLARE(UInt8)
|
||||
DECLARE(UInt16)
|
||||
DECLARE(UInt32)
|
||||
DECLARE(UInt64)
|
||||
DECLARE(Int8)
|
||||
DECLARE(Int16)
|
||||
DECLARE(Int32)
|
||||
DECLARE(Int64)
|
||||
DECLARE(Float32)
|
||||
DECLARE(Float64)
|
||||
#undef DECLARE
|
||||
|
||||
void getString(
|
||||
const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types,
|
||||
ColumnString * out) const;
|
||||
|
||||
#define DECLARE(TYPE)\
|
||||
void get##TYPE(\
|
||||
const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types,\
|
||||
const PaddedPODArray<TYPE> & def, PaddedPODArray<TYPE> & out) const;
|
||||
DECLARE(UInt8)
|
||||
DECLARE(UInt16)
|
||||
DECLARE(UInt32)
|
||||
DECLARE(UInt64)
|
||||
DECLARE(Int8)
|
||||
DECLARE(Int16)
|
||||
DECLARE(Int32)
|
||||
DECLARE(Int64)
|
||||
DECLARE(Float32)
|
||||
DECLARE(Float64)
|
||||
#undef DECLARE
|
||||
|
||||
void getString(
|
||||
const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types,
|
||||
const ColumnString * const def, ColumnString * const out) const;
|
||||
|
||||
#define DECLARE(TYPE)\
|
||||
void get##TYPE(\
|
||||
const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types,\
|
||||
const TYPE def, PaddedPODArray<TYPE> & out) const;
|
||||
DECLARE(UInt8)
|
||||
DECLARE(UInt16)
|
||||
DECLARE(UInt32)
|
||||
DECLARE(UInt64)
|
||||
DECLARE(Int8)
|
||||
DECLARE(Int16)
|
||||
DECLARE(Int32)
|
||||
DECLARE(Int64)
|
||||
DECLARE(Float32)
|
||||
DECLARE(Float64)
|
||||
#undef DECLARE
|
||||
|
||||
void getString(
|
||||
const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types,
|
||||
const String & def, ColumnString * const out) const;
|
||||
|
||||
void has(const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types, PaddedPODArray<UInt8> & out) const;
|
||||
|
||||
private:
|
||||
template <typename Value> using ContainerType = std::vector<Value>;
|
||||
template <typename Value> using ContainerPtrType = std::unique_ptr<ContainerType<Value>>;
|
||||
|
||||
struct Attribute final
|
||||
{
|
||||
AttributeUnderlyingType type;
|
||||
std::tuple<
|
||||
UInt8, UInt16, UInt32, UInt64,
|
||||
Int8, Int16, Int32, Int64,
|
||||
Float32, Float64,
|
||||
String> null_values;
|
||||
std::tuple<
|
||||
ContainerPtrType<UInt8>, ContainerPtrType<UInt16>, ContainerPtrType<UInt32>, ContainerPtrType<UInt64>,
|
||||
ContainerPtrType<Int8>, ContainerPtrType<Int16>, ContainerPtrType<Int32>, ContainerPtrType<Int64>,
|
||||
ContainerPtrType<Float32>, ContainerPtrType<Float64>,
|
||||
ContainerPtrType<StringRef>> maps;
|
||||
std::unique_ptr<Arena> string_arena;
|
||||
};
|
||||
|
||||
void createAttributes();
|
||||
|
||||
void loadData();
|
||||
|
||||
template <typename T>
|
||||
void addAttributeSize(const Attribute & attribute);
|
||||
|
||||
void calculateBytesAllocated();
|
||||
|
||||
void validateKeyTypes(const DataTypes & key_types) const;
|
||||
|
||||
template <typename T>
|
||||
void createAttributeImpl(Attribute & attribute, const Field & null_value);
|
||||
|
||||
Attribute createAttributeWithType(const AttributeUnderlyingType type, const Field & null_value);
|
||||
|
||||
|
||||
template <typename OutputType, typename ValueSetter, typename DefaultGetter>
|
||||
void getItemsNumber(
|
||||
const Attribute & attribute,
|
||||
const ConstColumnPlainPtrs & key_columns,
|
||||
ValueSetter && set_value,
|
||||
DefaultGetter && get_default) const;
|
||||
|
||||
template <typename AttributeType, typename OutputType, typename ValueSetter, typename DefaultGetter>
|
||||
void getItemsImpl(
|
||||
const Attribute & attribute,
|
||||
const ConstColumnPlainPtrs & key_columns,
|
||||
ValueSetter && set_value,
|
||||
DefaultGetter && get_default) const;
|
||||
|
||||
|
||||
template <typename T>
|
||||
bool setAttributeValueImpl(Attribute & attribute, const StringRef key, const T value);
|
||||
|
||||
bool setAttributeValue(Attribute & attribute, const StringRef key, const Field & value);
|
||||
|
||||
const Attribute & getAttribute(const std::string & attribute_name) const;
|
||||
|
||||
template <typename T>
|
||||
void has(const Attribute & attribute, const ConstColumnPlainPtrs & key_columns, PaddedPODArray<UInt8> & out) const;
|
||||
|
||||
const std::string name;
|
||||
const DictionaryStructure dict_struct;
|
||||
const DictionarySourcePtr source_ptr;
|
||||
const DictionaryLifetime dict_lifetime;
|
||||
const bool require_nonempty;
|
||||
const std::string key_description{dict_struct.getKeyDescription()};
|
||||
|
||||
|
||||
btrie_t *trie;
|
||||
std::map<std::string, std::size_t> attribute_index_by_name;
|
||||
std::vector<Attribute> attributes;
|
||||
|
||||
std::size_t bytes_allocated = 0;
|
||||
std::size_t element_count = 0;
|
||||
std::size_t bucket_count = 0;
|
||||
mutable std::atomic<std::size_t> query_count{0};
|
||||
|
||||
std::chrono::time_point<std::chrono::system_clock> creation_time;
|
||||
|
||||
std::exception_ptr creation_exception;
|
||||
};
|
||||
|
||||
|
||||
}
|
@ -23,6 +23,7 @@
|
||||
#include <Dictionaries/ComplexKeyHashedDictionary.h>
|
||||
#include <Dictionaries/ComplexKeyCacheDictionary.h>
|
||||
#include <Dictionaries/RangeHashedDictionary.h>
|
||||
#include <Dictionaries/TrieDictionary.h>
|
||||
|
||||
#include <ext/range.hpp>
|
||||
|
||||
@ -102,7 +103,8 @@ private:
|
||||
!executeDispatchSimple<HashedDictionary>(block, arguments, result, dict_ptr) &&
|
||||
!executeDispatchSimple<CacheDictionary>(block, arguments, result, dict_ptr) &&
|
||||
!executeDispatchComplex<ComplexKeyHashedDictionary>(block, arguments, result, dict_ptr) &&
|
||||
!executeDispatchComplex<ComplexKeyCacheDictionary>(block, arguments, result, dict_ptr))
|
||||
!executeDispatchComplex<ComplexKeyCacheDictionary>(block, arguments, result, dict_ptr) &&
|
||||
!executeDispatchComplex<TrieDictionary>(block, arguments, result, dict_ptr))
|
||||
throw Exception{
|
||||
"Unsupported dictionary type " + dict_ptr->getTypeName(),
|
||||
ErrorCodes::UNKNOWN_TYPE};
|
||||
@ -285,6 +287,7 @@ private:
|
||||
!executeDispatch<CacheDictionary>(block, arguments, result, dict_ptr) &&
|
||||
!executeDispatchComplex<ComplexKeyHashedDictionary>(block, arguments, result, dict_ptr) &&
|
||||
!executeDispatchComplex<ComplexKeyCacheDictionary>(block, arguments, result, dict_ptr) &&
|
||||
!executeDispatchComplex<TrieDictionary>(block, arguments, result, dict_ptr) &&
|
||||
!executeDispatchRange<RangeHashedDictionary>(block, arguments, result, dict_ptr))
|
||||
throw Exception{
|
||||
"Unsupported dictionary type " + dict_ptr->getTypeName(),
|
||||
@ -551,7 +554,8 @@ private:
|
||||
!executeDispatch<HashedDictionary>(block, arguments, result, dict_ptr) &&
|
||||
!executeDispatch<CacheDictionary>(block, arguments, result, dict_ptr) &&
|
||||
!executeDispatchComplex<ComplexKeyHashedDictionary>(block, arguments, result, dict_ptr) &&
|
||||
!executeDispatchComplex<ComplexKeyCacheDictionary>(block, arguments, result, dict_ptr))
|
||||
!executeDispatchComplex<ComplexKeyCacheDictionary>(block, arguments, result, dict_ptr) &&
|
||||
!executeDispatchComplex<TrieDictionary>(block, arguments, result, dict_ptr))
|
||||
throw Exception{
|
||||
"Unsupported dictionary type " + dict_ptr->getTypeName(),
|
||||
ErrorCodes::UNKNOWN_TYPE};
|
||||
@ -844,6 +848,7 @@ private:
|
||||
!executeDispatch<CacheDictionary>(block, arguments, result, dict_ptr) &&
|
||||
!executeDispatchComplex<ComplexKeyHashedDictionary>(block, arguments, result, dict_ptr) &&
|
||||
!executeDispatchComplex<ComplexKeyCacheDictionary>(block, arguments, result, dict_ptr) &&
|
||||
!executeDispatchComplex<TrieDictionary>(block, arguments, result, dict_ptr) &&
|
||||
!executeDispatchRange<RangeHashedDictionary>(block, arguments, result, dict_ptr))
|
||||
throw Exception{
|
||||
"Unsupported dictionary type " + dict_ptr->getTypeName(),
|
||||
@ -1153,7 +1158,8 @@ private:
|
||||
!executeDispatch<HashedDictionary>(block, arguments, result, dict_ptr) &&
|
||||
!executeDispatch<CacheDictionary>(block, arguments, result, dict_ptr) &&
|
||||
!executeDispatchComplex<ComplexKeyHashedDictionary>(block, arguments, result, dict_ptr) &&
|
||||
!executeDispatchComplex<ComplexKeyCacheDictionary>(block, arguments, result, dict_ptr))
|
||||
!executeDispatchComplex<ComplexKeyCacheDictionary>(block, arguments, result, dict_ptr) &&
|
||||
!executeDispatchComplex<TrieDictionary>(block, arguments, result, dict_ptr))
|
||||
throw Exception{
|
||||
"Unsupported dictionary type " + dict_ptr->getTypeName(),
|
||||
ErrorCodes::UNKNOWN_TYPE};
|
||||
|
@ -6,6 +6,7 @@
|
||||
#include <Dictionaries/RangeHashedDictionary.h>
|
||||
#include <Dictionaries/ComplexKeyHashedDictionary.h>
|
||||
#include <Dictionaries/ComplexKeyCacheDictionary.h>
|
||||
#include <Dictionaries/TrieDictionary.h>
|
||||
#include <Dictionaries/DictionaryStructure.h>
|
||||
#include <memory>
|
||||
|
||||
@ -81,6 +82,15 @@ DictionaryPtr DictionaryFactory::create(const std::string & name, Poco::Util::Ab
|
||||
|
||||
return std::make_unique<ComplexKeyCacheDictionary>(name, dict_struct, std::move(source_ptr), dict_lifetime, size);
|
||||
}
|
||||
else if ("ip_trie" == layout_type)
|
||||
{
|
||||
if (!dict_struct.key)
|
||||
throw Exception{"'key' is required for dictionary of layout 'ip_trie'",
|
||||
ErrorCodes::BAD_ARGUMENTS};
|
||||
|
||||
// This is specialised trie for storing IPv4 and IPv6 prefixes.
|
||||
return std::make_unique<TrieDictionary>(name, dict_struct, std::move(source_ptr), dict_lifetime, require_nonempty);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (dict_struct.key)
|
||||
|
@ -101,7 +101,10 @@ The dictionary config file has the following format:
|
||||
<cache>
|
||||
<!- - Cache size in number of cells; rounded up to a degree of two. - ->
|
||||
<size_in_cells>1000000000</size_in_cells>
|
||||
</cache> -->
|
||||
</cache>
|
||||
or
|
||||
<ip_trie />
|
||||
-->
|
||||
</layout>
|
||||
|
||||
<!-- Structure. -->
|
||||
@ -243,6 +246,58 @@ Example of a dictionary by ranges:
|
||||
</dictionary>
|
||||
</dictionaries>
|
||||
|
||||
ip_trie
|
||||
-------
|
||||
The table stores IP prefixes for each key (IP address), which makes it possible to map IP addresses to metadata such as ASN or threat score.
|
||||
|
||||
Example: the table contains prefixes and the AS number and country code each of them maps to:
|
||||
::
|
||||
prefix asn cca2
|
||||
202.79.32.0/20 17501 NP
|
||||
2620:0:870::/48 3856 US
|
||||
2a02:6b8:1::/48 13238 RU
|
||||
2001:db8::/32 65536 ZZ
|
||||
|
||||
|
||||
When using such a layout, the structure should have the "key" element.
|
||||
|
||||
Example:
|
||||
|
||||
.. code-block:: xml
|
||||
|
||||
<structure>
|
||||
<key>
|
||||
<attribute>
|
||||
<name>prefix</name>
|
||||
<type>String</type>
|
||||
</attribute>
|
||||
</key>
|
||||
<attribute>
|
||||
<name>asn</name>
|
||||
<type>UInt32</type>
|
||||
<null_value />
|
||||
</attribute>
|
||||
<attribute>
|
||||
<name>cca2</name>
|
||||
<type>String</type>
|
||||
<null_value>??</null_value>
|
||||
</attribute>
|
||||
...
|
||||
|
||||
The key must have exactly one attribute of type String, containing a valid IP prefix. Other types are not yet supported.
|
||||
|
||||
For querying, the same functions (dictGetT with a tuple) as for complex key dictionaries have to be used:
|
||||
|
||||
``dictGetT('dict_name', 'attr_name', tuple(ip))``
|
||||
|
||||
The function accepts either a UInt32 for an IPv4 address or a FixedString(16) for an IPv6 address in wire format:
|
||||
|
||||
``dictGetUInt32('prefix', 'asn', tuple(IPv6StringToNum('2001:db8::1')))``
|
||||
|
||||
No other type is supported. The function returns the attribute for the prefix matching the given IP address. If there are overlapping prefixes, the most specific one is returned.
|
||||
|
||||
The data is currently stored in a bitwise trie; it has to fit in memory.
|
||||
|
||||
complex_key_hashed
|
||||
----------------
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user