Merge remote-tracking branch 'origin/master' into add_ttl_option_for_syslog

This commit is contained in:
felixxdu 2020-11-27 10:02:57 +08:00
commit 17e83cbb8d
136 changed files with 5600 additions and 3444 deletions

View File

@ -475,9 +475,6 @@ find_contrib_lib(cityhash)
find_contrib_lib(farmhash)
set (USE_INTERNAL_BTRIE_LIBRARY ON CACHE INTERNAL "")
find_contrib_lib(btrie)
if (ENABLE_TESTS)
include (cmake/find/gtest.cmake)
endif ()

View File

@ -1,44 +0,0 @@
# - Try to find btrie headers and libraries.
#
# Usage of this module as follows:
#
# find_package(btrie)
#
# Variables used by this module, they can change the default behaviour and need
# to be set before calling find_package:
#
# BTRIE_ROOT_DIR Set this variable to the root installation of
# btrie if the module has problems finding
# the proper installation path.
#
# Variables defined by this module:
#
# BTRIE_FOUND System has btrie libs/headers
# BTRIE_LIBRARIES The btrie library/libraries
# BTRIE_INCLUDE_DIR The location of btrie headers
find_path(BTRIE_ROOT_DIR
NAMES include/btrie.h
)
find_library(BTRIE_LIBRARIES
NAMES btrie
PATHS ${BTRIE_ROOT_DIR}/lib ${BTRIE_LIBRARIES_PATHS}
)
find_path(BTRIE_INCLUDE_DIR
NAMES btrie.h
PATHS ${BTRIE_ROOT_DIR}/include ${BTRIE_INCLUDE_PATHS}
)
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(btrie DEFAULT_MSG
BTRIE_LIBRARIES
BTRIE_INCLUDE_DIR
)
mark_as_advanced(
BTRIE_ROOT_DIR
BTRIE_LIBRARIES
BTRIE_INCLUDE_DIR
)

View File

@ -1,3 +1,4 @@
# Needed when using Apache Avro serialization format
option (ENABLE_AVRO "Enable Avro" ${ENABLE_LIBRARIES})
if (NOT ENABLE_AVRO)

View File

@ -1,3 +1,5 @@
# Needed when securely connecting to an external server, e.g.
# clickhouse-client --host ... --secure
option(ENABLE_SSL "Enable ssl" ${ENABLE_LIBRARIES})
if(NOT ENABLE_SSL)

View File

@ -66,10 +66,6 @@ if (USE_INTERNAL_FARMHASH_LIBRARY)
add_subdirectory (libfarmhash)
endif ()
if (USE_INTERNAL_BTRIE_LIBRARY)
add_subdirectory (libbtrie)
endif ()
if (USE_INTERNAL_ZLIB_LIBRARY)
set (ZLIB_ENABLE_TESTS 0 CACHE INTERNAL "")
set (SKIP_INSTALL_ALL 1 CACHE INTERNAL "")

View File

@ -1,6 +0,0 @@
add_library(btrie
src/btrie.c
include/btrie.h
)
target_include_directories (btrie SYSTEM PUBLIC include)

View File

@ -1,23 +0,0 @@
Copyright (c) 2013, CobbLiu
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
Redistributions in binary form must reproduce the above copyright notice, this
list of conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@ -1,160 +0,0 @@
#pragma once
#if defined (__cplusplus)
extern "C" {
#endif
#include <stdlib.h>
#include <stdint.h>
/**
* In btrie, each leaf means one bit in ip tree.
* Left means 0, and right means 1.
*/
#define BTRIE_NULL (uintptr_t) -1
#if !defined(BTRIE_MAX_PAGES)
/// 54 ip per page. 8 bytes memory per page when empty
#define BTRIE_MAX_PAGES 1024 * 2048 /// 128m ips , ~16mb ram when empty
// #define BTRIE_MAX_PAGES 1024 * 65535 /// 4g ips (whole ipv4), ~512mb ram when empty
#endif
typedef struct btrie_node_s btrie_node_t;
struct btrie_node_s {
btrie_node_t *right;
btrie_node_t *left;
btrie_node_t *parent;
uintptr_t value;
};
typedef struct btrie_s {
btrie_node_t *root;
btrie_node_t *free; /* free list of btrie */
char *start;
size_t size;
/*
* memory pool.
* memory management(esp free) will be so easy by using this facility.
*/
char *pools[BTRIE_MAX_PAGES];
size_t len;
} btrie_t;
/**
* Create an empty btrie
*
* @Return:
* An ip radix_tree created.
* NULL if creation failed.
*/
btrie_t *btrie_create();
/**
* Destroy the ip radix_tree
*
* @Return:
* OK if deletion succeed.
* ERROR if error occurs while deleting.
*/
int btrie_destroy(btrie_t *tree);
/**
* Count the nodes in the radix tree.
*/
size_t btrie_count(btrie_t *tree);
/**
* Return the allocated number of bytes.
*/
size_t btrie_allocated(btrie_t *tree);
/**
* Add an ipv4 into btrie
*
* @Args:
* key: ip address
* mask: key's mask
* value: value of this IP, may be NULL.
*
* @Return:
* OK for success.
* ERROR for failure.
*/
int btrie_insert(btrie_t *tree, uint32_t key, uint32_t mask,
uintptr_t value);
/**
* Delete an ipv4 from btrie
*
* @Args:
*
* @Return:
* OK for success.
* ERROR for failure.
*/
int btrie_delete(btrie_t *tree, uint32_t key, uint32_t mask);
/**
* Find an ipv4 from btrie
*
* @Args:
*
* @Return:
* Value if succeed.
* NULL if failed.
*/
uintptr_t btrie_find(btrie_t *tree, uint32_t key);
/**
* Add an ipv6 into btrie
*
* @Args:
* key: ip address
* mask: key's mask
* value: value of this IP, may be NULL.
*
* @Return:
* OK for success.
* ERROR for failure.
*/
int btrie_insert_a6(btrie_t *tree, const uint8_t *key, const uint8_t *mask,
uintptr_t value);
/**
* Delete an ipv6 from btrie
*
* @Args:
*
* @Return:
* OK for success.
* ERROR for failure.
*/
int btrie_delete_a6(btrie_t *tree, const uint8_t *key, const uint8_t *mask);
/**
* Find an ipv6 from btrie
*
* @Args:
*
* @Return:
* Value if succeed.
* NULL if failed.
*/
uintptr_t btrie_find_a6(btrie_t *tree, const uint8_t *key);
#if defined (__cplusplus)
}
#endif

View File

@ -1,460 +0,0 @@
#include <stdlib.h>
#include <string.h>
#include <btrie.h>
#define PAGE_SIZE 4096
static btrie_node_t *
btrie_alloc(btrie_t *tree)
{
btrie_node_t *p;
if (tree->free) {
p = tree->free;
tree->free = tree->free->right;
return p;
}
if (tree->size < sizeof(btrie_node_t)) {
tree->start = (char *) calloc(sizeof(char), PAGE_SIZE);
if (tree->start == NULL) {
return NULL;
}
tree->pools[tree->len++] = tree->start;
tree->size = PAGE_SIZE;
}
p = (btrie_node_t *) tree->start;
tree->start += sizeof(btrie_node_t);
tree->size -= sizeof(btrie_node_t);
return p;
}
btrie_t *
btrie_create()
{
btrie_t *tree = (btrie_t *) malloc(sizeof(btrie_t));
if (tree == NULL) {
return NULL;
}
tree->free = NULL;
tree->start = NULL;
tree->size = 0;
memset(tree->pools, 0, sizeof(btrie_t *) * BTRIE_MAX_PAGES);
tree->len = 0;
tree->root = btrie_alloc(tree);
if (tree->root == NULL) {
return NULL;
}
tree->root->right = NULL;
tree->root->left = NULL;
tree->root->parent = NULL;
tree->root->value = BTRIE_NULL;
return tree;
}
static size_t
subtree_weight(btrie_node_t *node)
{
size_t weight = 1;
if (node->left) {
weight += subtree_weight(node->left);
}
if (node->right) {
weight += subtree_weight(node->right);
}
return weight;
}
size_t
btrie_count(btrie_t *tree)
{
if (tree->root == NULL) {
return 0;
}
return subtree_weight(tree->root);
}
size_t
btrie_allocated(btrie_t *tree)
{
return tree->len * PAGE_SIZE;
}
int
btrie_insert(btrie_t *tree, uint32_t key, uint32_t mask,
uintptr_t value)
{
uint32_t bit;
btrie_node_t *node, *next;
bit = 0x80000000;
node = tree->root;
next = tree->root;
while (bit & mask) {
if (key & bit) {
next = node->right;
} else {
next = node->left;
}
if (next == NULL) {
break;
}
bit >>= 1;
node = next;
}
if (next) {
if (node->value != BTRIE_NULL) {
return -1;
}
node->value = value;
return 0;
}
while (bit & mask) {
next = btrie_alloc(tree);
if (next == NULL) {
return -1;
}
next->right = NULL;
next->left = NULL;
next->parent = node;
next->value = BTRIE_NULL;
if (key & bit) {
node->right = next;
} else {
node->left = next;
}
bit >>= 1;
node = next;
}
node->value = value;
return 0;
}
int
btrie_delete(btrie_t *tree, uint32_t key, uint32_t mask)
{
uint32_t bit;
btrie_node_t *node;
bit = 0x80000000;
node = tree->root;
while (node && (bit & mask)) {
if (key & bit) {
node = node->right;
} else {
node = node->left;
}
bit >>= 1;
}
if (node == NULL) {
return -1;
}
if (node->right || node->left) {
if (node->value != BTRIE_NULL) {
node->value = BTRIE_NULL;
return 0;
}
return -1;
}
for ( ;; ) {
if (node->parent->right == node) {
node->parent->right = NULL;
} else {
node->parent->left = NULL;
}
node->right = tree->free;
tree->free = node;
node = node->parent;
if (node->right || node->left) {
break;
}
if (node->value != BTRIE_NULL) {
break;
}
if (node->parent == NULL) {
break;
}
}
return 0;
}
uintptr_t
btrie_find(btrie_t *tree, uint32_t key)
{
uint32_t bit;
uintptr_t value;
btrie_node_t *node;
bit = 0x80000000;
value = BTRIE_NULL;
node = tree->root;
while (node) {
if (node->value != BTRIE_NULL) {
value = node->value;
}
if (key & bit) {
node = node->right;
} else {
node = node->left;
}
bit >>= 1;
}
return value;
}
int
btrie_insert_a6(btrie_t *tree, const uint8_t *key, const uint8_t *mask,
uintptr_t value)
{
uint8_t bit;
unsigned int i;
btrie_node_t *node, *next;
i = 0;
bit = 0x80;
node = tree->root;
next = tree->root;
while (bit & mask[i]) {
if (key[i] & bit) {
next = node->right;
} else {
next = node->left;
}
if (next == NULL) {
break;
}
bit >>= 1;
node = next;
if (bit == 0) {
if (++i == 16) {
break;
}
bit = 0x80;
}
}
if (next) {
if (node->value != BTRIE_NULL) {
return -1;
}
node->value = value;
return 0;
}
while (bit & mask[i]) {
next = btrie_alloc(tree);
if (next == NULL) {
return -1;
}
next->right = NULL;
next->left = NULL;
next->parent = node;
next->value = BTRIE_NULL;
if (key[i] & bit) {
node->right = next;
} else {
node->left = next;
}
bit >>= 1;
node = next;
if (bit == 0) {
if (++i == 16) {
break;
}
bit = 0x80;
}
}
node->value = value;
return 0;
}
int
btrie_delete_a6(btrie_t *tree, const uint8_t *key, const uint8_t *mask)
{
uint8_t bit;
unsigned int i;
btrie_node_t *node;
i = 0;
bit = 0x80;
node = tree->root;
while (node && (bit & mask[i])) {
if (key[i] & bit) {
node = node->right;
} else {
node = node->left;
}
bit >>= 1;
if (bit == 0) {
if (++i == 16) {
break;
}
bit = 0x80;
}
}
if (node == NULL) {
return -1;
}
if (node->right || node->left) {
if (node->value != BTRIE_NULL) {
node->value = BTRIE_NULL;
return 0;
}
return -1;
}
for ( ;; ) {
if (node->parent->right == node) {
node->parent->right = NULL;
} else {
node->parent->left = NULL;
}
node->right = tree->free;
tree->free = node;
node = node->parent;
if (node->right || node->left) {
break;
}
if (node->value != BTRIE_NULL) {
break;
}
if (node->parent == NULL) {
break;
}
}
return 0;
}
uintptr_t
btrie_find_a6(btrie_t *tree, const uint8_t *key)
{
uint8_t bit;
uintptr_t value;
unsigned int i;
btrie_node_t *node;
i = 0;
bit = 0x80;
value = BTRIE_NULL;
node = tree->root;
while (node) {
if (node->value != BTRIE_NULL) {
value = node->value;
}
if (key[i] & bit) {
node = node->right;
} else {
node = node->left;
}
bit >>= 1;
if (bit == 0) {
i++;
bit = 0x80;
}
}
return value;
}
int
btrie_destroy(btrie_t *tree)
{
size_t i;
/* free memory pools */
for (i = 0; i < tree->len; i++) {
free(tree->pools[i]);
}
free(tree);
return 0;
}

View File

@ -1,103 +0,0 @@
#include <stdio.h>
#include <btrie.h>
int main()
{
btrie_t *it;
int ret;
uint8_t prefix_v6[16] = {0xde, 0xad, 0xbe, 0xef};
uint8_t mask_v6[16] = {0xff, 0xff, 0xff};
uint8_t ip_v6[16] = {0xde, 0xad, 0xbe, 0xef, 0xde};
it = btrie_create();
if (it == NULL) {
printf("create error!\n");
return 0;
}
//add 101.45.69.50/16
ret = btrie_insert(it, 1697465650, 0xffff0000, 1);
if (ret != 0) {
printf("insert 1 error.\n");
goto error;
}
//add 10.45.69.50/16
ret = btrie_insert(it, 170738994, 0xffff0000, 1);
if (ret != 0) {
printf("insert 2 error.\n");
goto error;
}
//add 10.45.79.50/16
ret = btrie_insert(it, 170741554, 0xffff0000, 1);
if (ret == 0) {
printf("insert 3 error.\n");
goto error;
}
//add 102.45.79.50/24
ret = btrie_insert(it, 1714245426, 0xffffff00, 1);
if (ret != 0) {
printf("insert 4 error.\n");
goto error;
}
ret = btrie_find(it, 170741554);
if (ret == 1) {
printf("test case 1 passed\n");
} else {
printf("test case 1 error\n");
}
ret = btrie_find(it, 170786817);
if (ret != 1) {
printf("test case 2 passed\n");
} else {
printf("test case 2 error\n");
}
ret = btrie_delete(it, 1714245426, 0xffffff00);
if (ret != 0) {
printf("delete 1 error\n");
goto error;
}
ret = btrie_find(it, 1714245426);
if (ret != 1) {
printf("test case 3 passed\n");
} else {
printf("test case 3 error\n");
}
//add dead:beef::/32
ret = btrie_insert_a6(it, prefix_v6, mask_v6, 1);
if (ret != 0) {
printf("insert 5 error\n");
goto error;
}
ret = btrie_find_a6(it, ip_v6);
if (ret == 1) {
printf("test case 4 passed\n");
} else {
printf("test case 4 error\n");
}
// insert 4m ips
for (size_t ip = 1; ip < 1024 * 1024 * 4; ++ip) {
ret = btrie_insert(it, ip, 0xffffffff, 1);
if (ret != 0) {
printf("insert 5 error (%d) (%zu) .\n", ret, ip);
goto error;
}
}
return 0;
error:
btrie_destroy(it);
printf("test failed\n");
return 1;
}

View File

@ -67,26 +67,6 @@ if uname -mpi | grep -q 'x86_64'; then
fi
is_running()
{
pgrep --pidfile "$CLICKHOUSE_PIDFILE" $(echo "${PROGRAM}" | cut -c1-15) 1> /dev/null 2> /dev/null
}
wait_for_done()
{
timeout=$1
attempts=0
while is_running; do
attempts=$(($attempts + 1))
if [ -n "$timeout" ] && [ $attempts -gt $timeout ]; then
return 1
fi
sleep 1
done
}
die()
{
echo $1 >&2
@ -105,49 +85,7 @@ check_config()
initdb()
{
if [ -x "$CLICKHOUSE_BINDIR/$EXTRACT_FROM_CONFIG" ]; then
CLICKHOUSE_DATADIR_FROM_CONFIG=$(su -s $SHELL ${CLICKHOUSE_USER} -c "$CLICKHOUSE_BINDIR/$EXTRACT_FROM_CONFIG --config-file=\"$CLICKHOUSE_CONFIG\" --key=path")
if [ "(" "$?" -ne "0" ")" -o "(" -z "${CLICKHOUSE_DATADIR_FROM_CONFIG}" ")" ]; then
die "Cannot obtain value of path from config file: ${CLICKHOUSE_CONFIG}";
fi
echo "Path to data directory in ${CLICKHOUSE_CONFIG}: ${CLICKHOUSE_DATADIR_FROM_CONFIG}"
else
CLICKHOUSE_DATADIR_FROM_CONFIG=$CLICKHOUSE_DATADIR
fi
if ! getent passwd ${CLICKHOUSE_USER} >/dev/null; then
echo "Can't chown to non-existing user ${CLICKHOUSE_USER}"
return
fi
if ! getent group ${CLICKHOUSE_GROUP} >/dev/null; then
echo "Can't chown to non-existing group ${CLICKHOUSE_GROUP}"
return
fi
if ! $(su -s $SHELL ${CLICKHOUSE_USER} -c "test -r ${CLICKHOUSE_CONFIG}"); then
echo "Warning! clickhouse config [${CLICKHOUSE_CONFIG}] not readable by user [${CLICKHOUSE_USER}]"
fi
if ! $(su -s $SHELL ${CLICKHOUSE_USER} -c "test -O \"${CLICKHOUSE_DATADIR_FROM_CONFIG}\" && test -G \"${CLICKHOUSE_DATADIR_FROM_CONFIG}\""); then
if [ $(dirname "${CLICKHOUSE_DATADIR_FROM_CONFIG}") = "/" ]; then
echo "Directory ${CLICKHOUSE_DATADIR_FROM_CONFIG} seems too dangerous to chown."
else
if [ ! -e "${CLICKHOUSE_DATADIR_FROM_CONFIG}" ]; then
echo "Creating directory ${CLICKHOUSE_DATADIR_FROM_CONFIG}"
mkdir -p "${CLICKHOUSE_DATADIR_FROM_CONFIG}"
fi
echo "Changing owner of [${CLICKHOUSE_DATADIR_FROM_CONFIG}] to [${CLICKHOUSE_USER}:${CLICKHOUSE_GROUP}]"
chown -R ${CLICKHOUSE_USER}:${CLICKHOUSE_GROUP} "${CLICKHOUSE_DATADIR_FROM_CONFIG}"
fi
fi
if ! $(su -s $SHELL ${CLICKHOUSE_USER} -c "test -w ${CLICKHOUSE_LOGDIR}"); then
echo "Changing owner of [${CLICKHOUSE_LOGDIR}/*] to [${CLICKHOUSE_USER}:${CLICKHOUSE_GROUP}]"
chown -R ${CLICKHOUSE_USER}:${CLICKHOUSE_GROUP} ${CLICKHOUSE_LOGDIR}/*
echo "Changing owner of [${CLICKHOUSE_LOGDIR}] to [${CLICKHOUSE_LOGDIR_USER}:${CLICKHOUSE_GROUP}]"
chown ${CLICKHOUSE_LOGDIR_USER}:${CLICKHOUSE_GROUP} ${CLICKHOUSE_LOGDIR}
fi
${CLICKHOUSE_GENERIC_PROGRAM} install --user "${CLICKHOUSE_USER}" --pid-path "${CLICKHOUSE_PIDDIR}" --config-path "${CLICKHOUSE_CONFDIR}" --binary-path "${CLICKHOUSE_BINDIR}"
}
@ -171,17 +109,7 @@ restart()
forcestop()
{
local EXIT_STATUS
EXIT_STATUS=0
echo -n "Stop forcefully $PROGRAM service: "
kill -KILL $(cat "$CLICKHOUSE_PIDFILE")
wait_for_done
echo "DONE"
return $EXIT_STATUS
${CLICKHOUSE_GENERIC_PROGRAM} stop --force --pid-path "${CLICKHOUSE_PIDDIR}"
}
@ -261,16 +189,16 @@ main()
service_or_func restart
;;
condstart)
is_running || service_or_func start
service_or_func start
;;
condstop)
is_running && service_or_func stop
service_or_func stop
;;
condrestart)
is_running && service_or_func restart
service_or_func restart
;;
condreload)
is_running && service_or_func restart
service_or_func restart
;;
initdb)
initdb
@ -293,17 +221,7 @@ main()
status()
{
if is_running; then
echo "$PROGRAM service is running"
exit 0
else
if is_cron_disabled; then
echo "$PROGRAM service is stopped";
else
echo "$PROGRAM: process unexpectedly terminated"
fi
exit 3
fi
${CLICKHOUSE_GENERIC_PROGRAM} status --pid-path "${CLICKHOUSE_PIDDIR}"
}

View File

@ -288,6 +288,7 @@ TESTS_TO_SKIP=(
# Require python libraries like scipy, pandas and numpy
01322_ttest_scipy
01561_mann_whitney_scipy
01545_system_errors
# Checks system.errors

View File

@ -415,4 +415,4 @@ if not args.keep_created_tables and not args.use_existing_tables:
c.execute(q)
print(f'drop\t{conn_index}\t{c.last_query.elapsed}\t{tsv_escape(q)}')
reportStageEnd('drop-2')
reportStageEnd('drop-2')

View File

@ -13,9 +13,9 @@ cmake .. \
-DENABLE_CLICKHOUSE_SERVER=ON \
-DENABLE_CLICKHOUSE_CLIENT=ON \
-DUSE_STATIC_LIBRARIES=OFF \
-DCLICKHOUSE_SPLIT_BINARY=ON \
-DSPLIT_SHARED_LIBRARIES=ON \
-DENABLE_LIBRARIES=OFF \
-DUSE_UNWIND=ON \
-DENABLE_UTILS=OFF \
-DENABLE_TESTS=OFF
```

View File

@ -17,7 +17,6 @@ toc_title: Third-Party Libraries Used
| googletest | [BSD 3-Clause License](https://github.com/google/googletest/blob/master/LICENSE) |
| h3 | [Apache License 2.0](https://github.com/uber/h3/blob/master/LICENSE) |
| hyperscan | [BSD 3-Clause License](https://github.com/intel/hyperscan/blob/master/LICENSE) |
| libbtrie | [BSD 2-Clause License](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libbtrie/LICENSE) |
| libcxxabi | [BSD + MIT](https://github.com/ClickHouse/ClickHouse/blob/master/libs/libglibc-compatibility/libcxxabi/LICENSE.TXT) |
| libdivide | [Zlib License](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libdivide/LICENSE.txt) |
| libgsasl | [LGPL v2.1](https://github.com/ClickHouse-Extras/libgsasl/blob/3b8948a4042e34fb00b4fb987535dc9e02e39040/LICENSE) |

View File

@ -4,4 +4,59 @@ toc_priority: 5
# avg {#agg_function-avg}
Calculates the average. Only works for numbers. The result is always Float64.
Calculates the arithmetic mean.
**Syntax**
``` sql
avgWeighted(x)
```
**Parameter**
- `x` — Values.
`x` must be
[Integer](../../../sql-reference/data-types/int-uint.md),
[floating-point](../../../sql-reference/data-types/float.md), or
[Decimal](../../../sql-reference/data-types/decimal.md).
**Returned value**
- `NaN` if the supplied parameter is empty.
- Mean otherwise.
**Return type** is always [Float64](../../../sql-reference/data-types/float.md).
**Example**
Query:
``` sql
SELECT avg(x) FROM values('x Int8', 0, 1, 2, 3, 4, 5)
```
Result:
``` text
┌─avg(x)─┐
│ 2.5 │
└────────┘
```
**Example**
Query:
``` sql
CREATE table test (t UInt8) ENGINE = Memory;
SELECT avg(t) FROM test
```
Result:
``` text
┌─avg(x)─┐
│ nan │
└────────┘
```

View File

@ -14,17 +14,21 @@ avgWeighted(x, weight)
**Parameters**
- `x` — Values. [Integer](../../../sql-reference/data-types/int-uint.md) or [floating-point](../../../sql-reference/data-types/float.md).
- `weight` — Weights of the values. [Integer](../../../sql-reference/data-types/int-uint.md) or [floating-point](../../../sql-reference/data-types/float.md).
- `x` — Values.
- `weight` — Weights of the values.
Type of `x` and `weight` must be the same.
`x` and `weight` must both be
[Integer](../../../sql-reference/data-types/int-uint.md),
[floating-point](../../../sql-reference/data-types/float.md), or
[Decimal](../../../sql-reference/data-types/decimal.md),
but may have different types.
**Returned value**
- Weighted mean.
- `NaN`. If all the weights are equal to 0.
- `NaN` if all the weights are equal to 0 or the supplied weights parameter is empty.
- Weighted mean otherwise.
Type: [Float64](../../../sql-reference/data-types/float.md).
**Return type** is always [Float64](../../../sql-reference/data-types/float.md).
**Example**
@ -42,3 +46,54 @@ Result:
│ 8 │
└────────────────────────┘
```
**Example**
Query:
``` sql
SELECT avgWeighted(x, w)
FROM values('x Int8, w Float64', (4, 1), (1, 0), (10, 2))
```
Result:
``` text
┌─avgWeighted(x, weight)─┐
│ 8 │
└────────────────────────┘
```
**Example**
Query:
``` sql
SELECT avgWeighted(x, w)
FROM values('x Int8, w Int8', (0, 0), (1, 0), (10, 0))
```
Result:
``` text
┌─avgWeighted(x, weight)─┐
│ nan │
└────────────────────────┘
```
**Example**
Query:
``` sql
CREATE table test (t UInt8) ENGINE = Memory;
SELECT avgWeighted(t) FROM test
```
Result:
``` text
┌─avgWeighted(x, weight)─┐
│ nan │
└────────────────────────┘
```

View File

@ -19,7 +19,6 @@ toc_title: Bibliotecas de terceros utilizadas
| Más información | [Licencia de 3 cláusulas BSD](https://github.com/google/googletest/blob/master/LICENSE) |
| H3 | [Licencia Apache 2.0](https://github.com/uber/h3/blob/master/LICENSE) |
| hyperscan | [Licencia de 3 cláusulas BSD](https://github.com/intel/hyperscan/blob/master/LICENSE) |
| libbtrie | [Licencia BSD de 2 cláusulas](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libbtrie/LICENSE) |
| libcxxabi | [BSD + MIT](https://github.com/ClickHouse/ClickHouse/blob/master/libs/libglibc-compatibility/libcxxabi/LICENSE.TXT) |
| libdivide | [Licencia Zlib](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libdivide/LICENSE.txt) |
| libgsasl | [Información adicional](https://github.com/ClickHouse-Extras/libgsasl/blob/3b8948a4042e34fb00b4fb987535dc9e02e39040/LICENSE) |

View File

@ -21,7 +21,6 @@ toc_title: "\u06A9\u062A\u0627\u0628\u062E\u0627\u0646\u0647 \u0647\u0627\u06CC
| googletest | [لیسانس 3 بند](https://github.com/google/googletest/blob/master/LICENSE) |
| اچ 3 | [نمایی مجوز 2.0](https://github.com/uber/h3/blob/master/LICENSE) |
| hyperscan | [لیسانس 3 بند](https://github.com/intel/hyperscan/blob/master/LICENSE) |
| لیبتری | [لیسانس 2 بند](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libbtrie/LICENSE) |
| شکنجه نوجوان | [BSD + MIT](https://github.com/ClickHouse/ClickHouse/blob/master/libs/libglibc-compatibility/libcxxabi/LICENSE.TXT) |
| لیبیدوید | [مجوز زلب](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libdivide/LICENSE.txt) |
| نوشیدن شراب | [الجی پی ال2.1](https://github.com/ClickHouse-Extras/libgsasl/blob/3b8948a4042e34fb00b4fb987535dc9e02e39040/LICENSE) |

View File

@ -19,7 +19,6 @@ toc_title: "Biblioth\xE8ques Tierces Utilis\xE9es"
| googletest | [Licence BSD 3-Clause](https://github.com/google/googletest/blob/master/LICENSE) |
| h3 | [Licence Apache 2.0](https://github.com/uber/h3/blob/master/LICENSE) |
| hyperscan | [Licence BSD 3-Clause](https://github.com/intel/hyperscan/blob/master/LICENSE) |
| libbtrie | [Licence BSD 2-Clause](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libbtrie/LICENSE) |
| libcxxabi | [BSD + MIT](https://github.com/ClickHouse/ClickHouse/blob/master/libs/libglibc-compatibility/libcxxabi/LICENSE.TXT) |
| libdivide | [Licence Zlib](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libdivide/LICENSE.txt) |
| libgsasl | [LGPL v2.1](https://github.com/ClickHouse-Extras/libgsasl/blob/3b8948a4042e34fb00b4fb987535dc9e02e39040/LICENSE) |

View File

@ -20,7 +20,6 @@ toc_title: "\u30B5\u30FC\u30C9\u30D1\u30FC\u30C6\u30A3\u88FD\u30E9\u30A4\u30D6\u
| googletest | [BSD3条項ライセンス](https://github.com/google/googletest/blob/master/LICENSE) |
| h3 | [Apacheライセンス2.0](https://github.com/uber/h3/blob/master/LICENSE) |
| hyperscan | [BSD3条項ライセンス](https://github.com/intel/hyperscan/blob/master/LICENSE) |
| libbtrie | [BSD2条項ライセンス](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libbtrie/LICENSE) |
| libcxxabi | [BSD + MIT](https://github.com/ClickHouse/ClickHouse/blob/master/libs/libglibc-compatibility/libcxxabi/LICENSE.TXT) |
| libdivide | [Zlibライセンス](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libdivide/LICENSE.txt) |
| libgsasl | [LGPL v2.1](https://github.com/ClickHouse-Extras/libgsasl/blob/3b8948a4042e34fb00b4fb987535dc9e02e39040/LICENSE) |

View File

@ -18,7 +18,6 @@ toc_title: "\u0418\u0441\u043f\u043e\u043b\u044c\u0437\u0443\u0435\u043c\u044b\u
| googletest | [BSD 3-Clause License](https://github.com/google/googletest/blob/master/LICENSE) |
| h3 | [Apache License 2.0](https://github.com/uber/h3/blob/master/LICENSE) |
| hyperscan | [BSD 3-Clause License](https://github.com/intel/hyperscan/blob/master/LICENSE) |
| libbtrie | [BSD 2-Clause License](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libbtrie/LICENSE) |
| libcxxabi | [BSD + MIT](https://github.com/ClickHouse/ClickHouse/blob/master/libs/libglibc-compatibility/libcxxabi/LICENSE.TXT) |
| libdivide | [Zlib License](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libdivide/LICENSE.txt) |
| libgsasl | [LGPL v2.1](https://github.com/ClickHouse-Extras/libgsasl/blob/3b8948a4042e34fb00b4fb987535dc9e02e39040/LICENSE) |

View File

@ -19,7 +19,6 @@ toc_title: "Kullan\u0131lan \xDC\xE7\xFCnc\xFC Taraf K\xFCt\xFCphaneleri"
| googletest | [BSD 3-Clause Lisansı](https://github.com/google/googletest/blob/master/LICENSE) |
| h33 | [Apache Lic 2.0ense 2.0](https://github.com/uber/h3/blob/master/LICENSE) |
| hyperscan | [BSD 3-Clause Lisansı](https://github.com/intel/hyperscan/blob/master/LICENSE) |
| libbtrie | [BSD 2-Clause Lisansı](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libbtrie/LICENSE) |
| libcxxabi | [BSD + MIT](https://github.com/ClickHouse/ClickHouse/blob/master/libs/libglibc-compatibility/libcxxabi/LICENSE.TXT) |
| libdivide | [Zlib Lisansı](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libdivide/LICENSE.txt) |
| libgsasl | [LGPL v2. 1](https://github.com/ClickHouse-Extras/libgsasl/blob/3b8948a4042e34fb00b4fb987535dc9e02e39040/LICENSE) |

View File

@ -11,7 +11,6 @@
| FastMemcpy | [MIT](https://github.com/ClickHouse/ClickHouse/blob/master/libs/libmemcpy/impl/LICENSE) |
| googletest | [BSD3-条款许可](https://github.com/google/googletest/blob/master/LICENSE) |
| 超扫描 | [BSD3-条款许可](https://github.com/intel/hyperscan/blob/master/LICENSE) |
| libbtrie | [BSD2-条款许可](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libbtrie/LICENSE) |
| libcxxabi | [BSD + MIT](https://github.com/ClickHouse/ClickHouse/blob/master/libs/libglibc-compatibility/libcxxabi/LICENSE.TXT) |
| libdivide | [Zlib许可证](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libdivide/LICENSE.txt) |
| libgsasl | [LGPL v2.1](https://github.com/ClickHouse-Extras/libgsasl/blob/3b8948a4042e34fb00b4fb987535dc9e02e39040/LICENSE) |

View File

@ -43,13 +43,81 @@ else ()
${ENABLE_CLICKHOUSE_ALL})
endif ()
message(STATUS "ClickHouse modes:")
if (NOT ENABLE_CLICKHOUSE_SERVER)
message(WARNING "ClickHouse server mode is not going to be built.")
else()
message(STATUS "Server mode: ON")
endif()
if (NOT ENABLE_CLICKHOUSE_CLIENT)
message(WARNING "ClickHouse client mode is not going to be built. You won't be able to connect to the server and run
tests")
else()
message(STATUS "Client mode: ON")
endif()
if (ENABLE_CLICKHOUSE_LOCAL)
message(STATUS "Local mode: ON")
else()
message(STATUS "Local mode: OFF")
endif()
if (ENABLE_CLICKHOUSE_BENCHMARK)
message(STATUS "Benchmark mode: ON")
else()
message(STATUS "Benchmark mode: OFF")
endif()
if (ENABLE_CLICKHOUSE_EXTRACT_FROM_CONFIG)
message(STATUS "Extract from config mode: ON")
else()
message(STATUS "Extract from config mode: OFF")
endif()
if (ENABLE_CLICKHOUSE_COMPRESSOR)
message(STATUS "Compressor mode: ON")
else()
message(STATUS "Compressor mode: OFF")
endif()
if (ENABLE_CLICKHOUSE_COPIER)
message(STATUS "Copier mode: ON")
else()
message(STATUS "Copier mode: OFF")
endif()
if (ENABLE_CLICKHOUSE_FORMAT)
message(STATUS "Format mode: ON")
else()
message(STATUS "Format mode: OFF")
endif()
if (ENABLE_CLICKHOUSE_OBFUSCATOR)
message(STATUS "Obfuscator mode: ON")
else()
message(STATUS "Obfuscator mode: OFF")
endif()
if (ENABLE_CLICKHOUSE_ODBC_BRIDGE)
message(STATUS "ODBC bridge mode: ON")
else()
message(STATUS "ODBC bridge mode: OFF")
endif()
if (ENABLE_CLICKHOUSE_INSTALL)
message(STATUS "ClickHouse install: ON")
else()
message(STATUS "ClickHouse install: OFF")
endif()
if(NOT (MAKE_STATIC_LIBRARIES OR SPLIT_SHARED_LIBRARIES))
set(CLICKHOUSE_ONE_SHARED ON)
endif()
configure_file (config_tools.h.in ${ConfigIncludePath}/config_tools.h)
macro(clickhouse_target_link_split_lib target name)
if(NOT CLICKHOUSE_ONE_SHARED)
target_link_libraries(${target} PRIVATE clickhouse-${name}-lib)

View File

@ -21,6 +21,7 @@
#include <IO/WriteBufferFromFileDescriptor.h>
#include <IO/ReadBufferFromFile.h>
#include <IO/WriteBufferFromFile.h>
#include <IO/MMapReadBufferFromFile.h>
#include <IO/ReadBufferFromMemory.h>
#include <IO/copyData.h>
#include <IO/Operators.h>
@ -70,7 +71,7 @@ namespace po = boost::program_options;
namespace fs = std::filesystem;
auto executeScript(const std::string & command, bool throw_on_error = false)
static auto executeScript(const std::string & command, bool throw_on_error = false)
{
auto sh = ShellCommand::execute(command);
WriteBufferFromFileDescriptor wb_stdout(STDOUT_FILENO);
@ -87,7 +88,7 @@ auto executeScript(const std::string & command, bool throw_on_error = false)
return sh->tryWait();
}
bool ask(std::string question)
static bool ask(std::string question)
{
while (true)
{
@ -104,6 +105,16 @@ bool ask(std::string question)
}
}
static bool filesEqual(std::string path1, std::string path2)
{
MMapReadBufferFromFile in1(path1, 0);
MMapReadBufferFromFile in2(path2, 0);
/// memcmp is faster than hashing and comparing hashes
return in1.buffer().size() == in2.buffer().size()
&& 0 == memcmp(in1.buffer().begin(), in2.buffer().begin(), in1.buffer().size());
}
int mainEntryClickHouseInstall(int argc, char ** argv)
{
@ -143,57 +154,89 @@ int mainEntryClickHouseInstall(int argc, char ** argv)
throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "Cannot obtain path to the binary from {}, file doesn't exist",
binary_self_path.string());
fs::path binary_self_canonical_path = fs::canonical(binary_self_path);
/// Copy binary to the destination directory.
/// TODO An option to link instead of copy - useful for developers.
/// TODO Check if the binary is the same.
size_t binary_size = fs::file_size(binary_self_path);
fs::path prefix = fs::path(options["prefix"].as<std::string>());
fs::path bin_dir = prefix / fs::path(options["binary-path"].as<std::string>());
size_t available_space = fs::space(bin_dir).available;
if (available_space < binary_size)
throw Exception(ErrorCodes::NOT_ENOUGH_SPACE, "Not enough space for clickhouse binary in {}, required {}, available {}.",
bin_dir.string(), ReadableSize(binary_size), ReadableSize(available_space));
fs::path main_bin_path = bin_dir / "clickhouse";
fs::path main_bin_tmp_path = bin_dir / "clickhouse.new";
fs::path main_bin_old_path = bin_dir / "clickhouse.old";
fmt::print("Copying ClickHouse binary to {}\n", main_bin_tmp_path.string());
size_t binary_size = fs::file_size(binary_self_path);
try
bool old_binary_exists = fs::exists(main_bin_path);
bool already_installed = false;
/// Check if the binary is the same file (already installed).
if (old_binary_exists && binary_self_canonical_path == fs::canonical(main_bin_path))
{
ReadBufferFromFile in(binary_self_path.string());
WriteBufferFromFile out(main_bin_tmp_path.string());
copyData(in, out);
out.sync();
if (0 != fchmod(out.getFD(), S_IRUSR | S_IRGRP | S_IROTH | S_IXUSR | S_IXGRP | S_IXOTH))
throwFromErrno(fmt::format("Cannot chmod {}", main_bin_tmp_path.string()), ErrorCodes::SYSTEM_ERROR);
out.finalize();
already_installed = true;
fmt::print("ClickHouse binary is already located at {}\n", main_bin_path.string());
}
catch (const Exception & e)
/// Check if binary has the same content.
else if (old_binary_exists && binary_size == fs::file_size(main_bin_path))
{
if (e.code() == ErrorCodes::CANNOT_OPEN_FILE && geteuid() != 0)
std::cerr << "Install must be run as root: sudo ./clickhouse install\n";
throw;
fmt::print("Found already existing ClickHouse binary at {} having the same size. Will check its contents.\n",
main_bin_path.string());
if (filesEqual(binary_self_path.string(), main_bin_path.string()))
{
already_installed = true;
fmt::print("ClickHouse binary is already located at {} and it has the same content as {}\n",
main_bin_path.string(), binary_self_canonical_path.string());
}
}
if (fs::exists(main_bin_path))
if (already_installed)
{
fmt::print("{} already exists, will rename existing binary to {} and put the new binary in place\n",
main_bin_path.string(), main_bin_old_path.string());
/// There is file exchange operation in Linux but it's not portable.
fs::rename(main_bin_path, main_bin_old_path);
if (0 != chmod(main_bin_path.string().c_str(), S_IRUSR | S_IRGRP | S_IROTH | S_IXUSR | S_IXGRP | S_IXOTH))
throwFromErrno(fmt::format("Cannot chmod {}", main_bin_path.string()), ErrorCodes::SYSTEM_ERROR);
}
else
{
size_t available_space = fs::space(bin_dir).available;
if (available_space < binary_size)
throw Exception(ErrorCodes::NOT_ENOUGH_SPACE, "Not enough space for clickhouse binary in {}, required {}, available {}.",
bin_dir.string(), ReadableSize(binary_size), ReadableSize(available_space));
fmt::print("Renaming {} to {}.\n", main_bin_tmp_path.string(), main_bin_path.string());
fs::rename(main_bin_tmp_path, main_bin_path);
fmt::print("Copying ClickHouse binary to {}\n", main_bin_tmp_path.string());
try
{
ReadBufferFromFile in(binary_self_path.string());
WriteBufferFromFile out(main_bin_tmp_path.string());
copyData(in, out);
out.sync();
if (0 != fchmod(out.getFD(), S_IRUSR | S_IRGRP | S_IROTH | S_IXUSR | S_IXGRP | S_IXOTH))
throwFromErrno(fmt::format("Cannot chmod {}", main_bin_tmp_path.string()), ErrorCodes::SYSTEM_ERROR);
out.finalize();
}
catch (const Exception & e)
{
if (e.code() == ErrorCodes::CANNOT_OPEN_FILE && geteuid() != 0)
std::cerr << "Install must be run as root: sudo ./clickhouse install\n";
throw;
}
if (old_binary_exists)
{
fmt::print("{} already exists, will rename existing binary to {} and put the new binary in place\n",
main_bin_path.string(), main_bin_old_path.string());
/// There is file exchange operation in Linux but it's not portable.
fs::rename(main_bin_path, main_bin_old_path);
}
fmt::print("Renaming {} to {}.\n", main_bin_tmp_path.string(), main_bin_path.string());
fs::rename(main_bin_tmp_path, main_bin_path);
}
/// Create symlinks.
@ -401,8 +444,8 @@ int mainEntryClickHouseInstall(int argc, char ** argv)
ConfigurationPtr configuration(new Poco::Util::XMLConfiguration(processor.processConfig()));
if (!configuration->getString("users.default.password", "").empty()
|| configuration->getString("users.default.password_sha256_hex", "").empty()
|| configuration->getString("users.default.password_double_sha1_hex", "").empty())
|| !configuration->getString("users.default.password_sha256_hex", "").empty()
|| !configuration->getString("users.default.password_double_sha1_hex", "").empty())
{
has_password_for_default_user = true;
}
@ -576,7 +619,7 @@ int mainEntryClickHouseInstall(int argc, char ** argv)
" || echo \"Cannot set 'net_admin' or 'ipc_lock' or 'sys_nice' capability for clickhouse binary."
" This is optional. Taskstats accounting will be disabled."
" To enable taskstats accounting you may add the required capability later manually.\"",
"/tmp/test_setcap.sh", main_bin_path.string());
"/tmp/test_setcap.sh", fs::canonical(main_bin_path).string());
fmt::print(" {}\n", command);
executeScript(command);
#endif
@ -597,10 +640,6 @@ int mainEntryClickHouseInstall(int argc, char ** argv)
}
}
std::string maybe_sudo;
if (getuid() != 0)
maybe_sudo = "sudo ";
std::string maybe_password;
if (has_password_for_default_user)
maybe_password = " --password";
@ -608,10 +647,19 @@ int mainEntryClickHouseInstall(int argc, char ** argv)
fmt::print(
"\nClickHouse has been successfully installed.\n"
"\nStart clickhouse-server with:\n"
" {}clickhouse start\n"
" sudo clickhouse start\n"
"\nStart clickhouse-client with:\n"
" clickhouse-client{}\n\n",
maybe_sudo, maybe_password);
maybe_password);
}
catch (const fs::filesystem_error &)
{
std::cerr << getCurrentExceptionMessage(false) << '\n';
if (getuid() != 0)
std::cerr << "\nRun with sudo.\n";
return getCurrentExceptionCode();
}
catch (...)
{
@ -783,17 +831,20 @@ namespace
return pid;
}
int stop(const fs::path & pid_file)
int stop(const fs::path & pid_file, bool force)
{
UInt64 pid = isRunning(pid_file);
if (!pid)
return 0;
if (0 == kill(pid, 15)) /// Terminate
fmt::print("Sent termination signal.\n", pid);
int signal = force ? SIGKILL : SIGTERM;
const char * signal_name = force ? "kill" : "terminate";
if (0 == kill(pid, signal))
fmt::print("Sent {} signal to process with pid {}.\n", signal_name, pid);
else
throwFromErrno("Cannot send termination signal", ErrorCodes::SYSTEM_ERROR);
throwFromErrno(fmt::format("Cannot send {} signal", signal_name), ErrorCodes::SYSTEM_ERROR);
size_t try_num = 0;
constexpr size_t num_tries = 60;
@ -869,6 +920,7 @@ int mainEntryClickHouseStop(int argc, char ** argv)
desc.add_options()
("help,h", "produce help message")
("pid-path", po::value<std::string>()->default_value("/var/run/clickhouse-server"), "directory for pid file")
("force", po::value<bool>()->default_value(false), "Stop with KILL signal instead of TERM")
;
po::variables_map options;
@ -887,7 +939,7 @@ int mainEntryClickHouseStop(int argc, char ** argv)
{
fs::path pid_file = fs::path(options["pid-path"].as<std::string>()) / "clickhouse-server.pid";
return stop(pid_file);
return stop(pid_file, options["force"].as<bool>());
}
catch (...)
{
@ -940,6 +992,7 @@ int mainEntryClickHouseRestart(int argc, char ** argv)
("config-path", po::value<std::string>()->default_value("/etc/clickhouse-server"), "directory with configs")
("pid-path", po::value<std::string>()->default_value("/var/run/clickhouse-server"), "directory for pid file")
("user", po::value<std::string>()->default_value("clickhouse"), "clickhouse user")
("force", po::value<bool>()->default_value(false), "Stop with KILL signal instead of TERM")
;
po::variables_map options;
@ -962,7 +1015,7 @@ int mainEntryClickHouseRestart(int argc, char ** argv)
fs::path config = fs::path(options["config-path"].as<std::string>()) / "config.xml";
fs::path pid_file = fs::path(options["pid-path"].as<std::string>()) / "clickhouse-server.pid";
if (int res = stop(pid_file))
if (int res = stop(pid_file, options["force"].as<bool>()))
return res;
return start(user, executable, config, pid_file);
}

View File

@ -709,7 +709,7 @@
<!-- Configuration of external dictionaries. See:
https://clickhouse.yandex/docs/en/dicts/external_dicts/
https://clickhouse.tech/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts
-->
<dictionaries_config>*_dictionary.xml</dictionaries_config>

View File

@ -1,3 +1,4 @@
#include <memory>
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/AggregateFunctionAvg.h>
#include <AggregateFunctions/Helpers.h>
@ -13,43 +14,37 @@ namespace ErrorCodes
namespace
{
template <typename T>
struct Avg
bool allowType(const DataTypePtr& type) noexcept
{
using FieldType = std::conditional_t<IsDecimalNumber<T>,
std::conditional_t<std::is_same_v<T, Decimal256>, Decimal256, Decimal128>,
NearestFieldType<T>>;
// using FieldType = std::conditional_t<IsDecimalNumber<T>, Decimal128, NearestFieldType<T>>;
using Function = AggregateFunctionAvg<T, AggregateFunctionAvgData<FieldType, UInt64>>;
};
template <typename T>
using AggregateFuncAvg = typename Avg<T>::Function;
const WhichDataType t(type);
return t.isInt() || t.isUInt() || t.isFloat() || t.isDecimal();
}
AggregateFunctionPtr createAggregateFunctionAvg(const std::string & name, const DataTypes & argument_types, const Array & parameters)
{
assertNoParameters(name, parameters);
assertUnary(name, argument_types);
AggregateFunctionPtr res;
DataTypePtr data_type = argument_types[0];
if (isDecimal(data_type))
res.reset(createWithDecimalType<AggregateFuncAvg>(*data_type, *data_type, argument_types));
else
res.reset(createWithNumericType<AggregateFuncAvg>(*data_type, argument_types));
const DataTypePtr& data_type = argument_types[0];
if (!allowType(data_type))
throw Exception("Illegal type " + data_type->getName() + " of argument for aggregate function " + name,
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
AggregateFunctionPtr res;
if (isDecimal(data_type))
res.reset(createWithDecimalType<AggregateFunctionAvg>(
*data_type, argument_types, getDecimalScale(*data_type)));
else
res.reset(createWithNumericType<AggregateFunctionAvg>(*data_type, argument_types));
if (!res)
throw Exception("Illegal type " + argument_types[0]->getName() + " of argument for aggregate function " + name,
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
return res;
}
}
void registerAggregateFunctionAvg(AggregateFunctionFactory & factory)
{
factory.registerFunction("avg", createAggregateFunctionAvg, AggregateFunctionFactory::CaseInsensitive);
}
}

View File

@ -1,78 +1,102 @@
#pragma once
#include <type_traits>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <Columns/ColumnsNumber.h>
#include <DataTypes/DataTypesDecimal.h>
#include <DataTypes/DataTypesNumber.h>
#include <AggregateFunctions/IAggregateFunction.h>
#include "Core/DecimalFunctions.h"
namespace DB
{
namespace ErrorCodes
{
}
template <class T>
using DecimalOrVectorCol = std::conditional_t<IsDecimalNumber<T>, ColumnDecimal<T>, ColumnVector<T>>;
template <typename T, typename Denominator>
struct AggregateFunctionAvgData
{
using NumeratorType = T;
using DenominatorType = Denominator;
template <class T> constexpr bool DecimalOrExtendedInt =
IsDecimalNumber<T>
|| std::is_same_v<T, Int128>
|| std::is_same_v<T, Int256>
|| std::is_same_v<T, UInt128>
|| std::is_same_v<T, UInt256>;
T numerator{0};
/**
* Helper class to encapsulate values conversion for avg and avgWeighted.
*/
template <class Numerator, class Denominator>
struct AvgFraction
{
Numerator numerator{0};
Denominator denominator{0};
template <typename ResultT>
ResultT NO_SANITIZE_UNDEFINED result() const
/// Allow division by zero as sometimes we need to return NaN.
/// Invoked only is either Numerator or Denominator are Decimal.
Float64 NO_SANITIZE_UNDEFINED divideIfAnyDecimal(UInt32 num_scale, UInt32 denom_scale) const
{
if constexpr (std::is_floating_point_v<ResultT>)
if constexpr (std::numeric_limits<ResultT>::is_iec559)
{
if constexpr (is_big_int_v<Denominator>)
return static_cast<ResultT>(numerator) / static_cast<ResultT>(denominator);
else
return static_cast<ResultT>(numerator) / denominator; /// allow division by zero
}
if constexpr (IsDecimalNumber<Numerator> && IsDecimalNumber<Denominator>)
{
// According to the docs, num(S1) / denom(S2) would have scale S1
if (denominator == static_cast<Denominator>(0))
return static_cast<ResultT>(0);
if constexpr (std::is_same_v<Numerator, Decimal256> && std::is_same_v<Denominator, Decimal128>)
///Special case as Decimal256 / Decimal128 = compile error (as Decimal128 is not parametrized by a wide
///int), but an __int128 instead
return DecimalUtils::convertTo<Float64>(
numerator / (denominator.template convertTo<Decimal256>()), num_scale);
else
return DecimalUtils::convertTo<Float64>(numerator / denominator, num_scale);
}
if constexpr (std::is_same_v<T, Decimal256>)
return static_cast<ResultT>(numerator / static_cast<T>(denominator));
/// Numerator is always casted to Float64 to divide correctly if the denominator is not Float64.
Float64 num_converted;
if constexpr (IsDecimalNumber<Numerator>)
num_converted = DecimalUtils::convertTo<Float64>(numerator, num_scale);
else
return static_cast<ResultT>(numerator / denominator);
num_converted = static_cast<Float64>(numerator); /// all other types, including extended integral.
std::conditional_t<DecimalOrExtendedInt<Denominator>,
Float64, Denominator> denom_converted;
if constexpr (IsDecimalNumber<Denominator>)
denom_converted = DecimalUtils::convertTo<Float64>(denominator, denom_scale);
else if constexpr (DecimalOrExtendedInt<Denominator>)
/// no way to divide Float64 and extended integral type without an explicit cast.
denom_converted = static_cast<Float64>(denominator);
else
denom_converted = denominator; /// can divide on float, no cast required.
return num_converted / denom_converted;
}
Float64 NO_SANITIZE_UNDEFINED divide() const
{
if constexpr (DecimalOrExtendedInt<Denominator>) /// if extended int
return static_cast<Float64>(numerator) / static_cast<Float64>(denominator);
else
return static_cast<Float64>(numerator) / denominator;
}
};
/// Calculates arithmetic mean of numbers.
template <typename T, typename Data, typename Derived>
class AggregateFunctionAvgBase : public IAggregateFunctionDataHelper<Data, Derived>
/**
* @tparam Derived When deriving from this class, use the child class name as in CRTP, e.g.
* class Self : Agg<char, bool, bool, Self>.
*/
template <class Numerator, class Denominator, class Derived>
class AggregateFunctionAvgBase : public
IAggregateFunctionDataHelper<AvgFraction<Numerator, Denominator>, Derived>
{
public:
using ResultType = std::conditional_t<IsDecimalNumber<T>, T, Float64>;
using ResultDataType = std::conditional_t<IsDecimalNumber<T>, DataTypeDecimal<T>, DataTypeNumber<Float64>>;
using ColVecType = std::conditional_t<IsDecimalNumber<T>, ColumnDecimal<T>, ColumnVector<T>>;
using ColVecResult = std::conditional_t<IsDecimalNumber<T>, ColumnDecimal<T>, ColumnVector<Float64>>;
using Fraction = AvgFraction<Numerator, Denominator>;
using Base = IAggregateFunctionDataHelper<Fraction, Derived>;
/// ctor for native types
AggregateFunctionAvgBase(const DataTypes & argument_types_) : IAggregateFunctionDataHelper<Data, Derived>(argument_types_, {}), scale(0) {}
explicit AggregateFunctionAvgBase(const DataTypes & argument_types_,
UInt32 num_scale_ = 0, UInt32 denom_scale_ = 0)
: Base(argument_types_, {}), num_scale(num_scale_), denom_scale(denom_scale_) {}
/// ctor for Decimals
AggregateFunctionAvgBase(const IDataType & data_type, const DataTypes & argument_types_)
: IAggregateFunctionDataHelper<Data, Derived>(argument_types_, {}), scale(getDecimalScale(data_type))
{
}
DataTypePtr getReturnType() const override
{
if constexpr (IsDecimalNumber<T>)
return std::make_shared<ResultDataType>(ResultDataType::maxPrecision(), scale);
else
return std::make_shared<ResultDataType>();
}
DataTypePtr getReturnType() const final { return std::make_shared<DataTypeNumber<Float64>>(); }
void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena *) const override
{
@ -84,7 +108,7 @@ public:
{
writeBinary(this->data(place).numerator, buf);
if constexpr (std::is_unsigned_v<typename Data::DenominatorType>)
if constexpr (std::is_unsigned_v<Denominator>)
writeVarUInt(this->data(place).denominator, buf);
else /// Floating point denominator type can be used
writeBinary(this->data(place).denominator, buf);
@ -94,7 +118,7 @@ public:
{
readBinary(this->data(place).numerator, buf);
if constexpr (std::is_unsigned_v<typename Data::DenominatorType>)
if constexpr (std::is_unsigned_v<Denominator>)
readVarUInt(this->data(place).denominator, buf);
else /// Floating point denominator type can be used
readBinary(this->data(place).denominator, buf);
@ -102,29 +126,34 @@ public:
void insertResultInto(AggregateDataPtr place, IColumn & to, Arena *) const override
{
auto & column = static_cast<ColVecResult &>(to);
column.getData().push_back(this->data(place).template result<ResultType>());
if constexpr (IsDecimalNumber<Numerator> || IsDecimalNumber<Denominator>)
static_cast<ColumnVector<Float64> &>(to).getData().push_back(
this->data(place).divideIfAnyDecimal(num_scale, denom_scale));
else
static_cast<ColumnVector<Float64> &>(to).getData().push_back(this->data(place).divide());
}
protected:
UInt32 scale;
private:
UInt32 num_scale;
UInt32 denom_scale;
};
template <typename T, typename Data>
class AggregateFunctionAvg final : public AggregateFunctionAvgBase<T, Data, AggregateFunctionAvg<T, Data>>
template <class T>
using AvgFieldType = std::conditional_t<IsDecimalNumber<T>,
std::conditional_t<std::is_same_v<T, Decimal256>, Decimal256, Decimal128>,
NearestFieldType<T>>;
template <class T>
class AggregateFunctionAvg final : public AggregateFunctionAvgBase<AvgFieldType<T>, UInt64, AggregateFunctionAvg<T>>
{
public:
using AggregateFunctionAvgBase<T, Data, AggregateFunctionAvg<T, Data>>::AggregateFunctionAvgBase;
using AggregateFunctionAvgBase<AvgFieldType<T>, UInt64, AggregateFunctionAvg<T>>::AggregateFunctionAvgBase;
using ColVecType = std::conditional_t<IsDecimalNumber<T>, ColumnDecimal<T>, ColumnVector<T>>;
void add(AggregateDataPtr place, const IColumn ** columns, size_t row_num, Arena *) const override
void add(AggregateDataPtr place, const IColumn ** columns, size_t row_num, Arena *) const final
{
const auto & column = static_cast<const ColVecType &>(*columns[0]);
this->data(place).numerator += column.getData()[row_num];
this->data(place).denominator += 1;
this->data(place).numerator += static_cast<const DecimalOrVectorCol<T> &>(*columns[0]).getData()[row_num];
++this->data(place).denominator;
}
String getName() const override { return "avg"; }
String getName() const final { return "avg"; }
};
}

View File

@ -1,3 +1,5 @@
#include <memory>
#include <type_traits>
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/AggregateFunctionAvgWeighted.h>
#include <AggregateFunctions/Helpers.h>
@ -13,47 +15,91 @@ namespace ErrorCodes
namespace
{
template <typename T>
struct AvgWeighted
bool allowTypes(const DataTypePtr& left, const DataTypePtr& right) noexcept
{
using FieldType = std::conditional_t<IsDecimalNumber<T>,
std::conditional_t<std::is_same_v<T, Decimal256>, Decimal256, Decimal128>,
NearestFieldType<T>>;
// using FieldType = std::conditional_t<IsDecimalNumber<T>, Decimal128, NearestFieldType<T>>;
using Function = AggregateFunctionAvgWeighted<T, AggregateFunctionAvgData<FieldType, FieldType>>;
};
const WhichDataType l_dt(left), r_dt(right);
template <typename T>
using AggregateFuncAvgWeighted = typename AvgWeighted<T>::Function;
constexpr auto allow = [](WhichDataType t)
{
return t.isInt() || t.isUInt() || t.isFloat() || t.isDecimal();
};
return allow(l_dt) && allow(r_dt);
}
#define AT_SWITCH(LINE) \
switch (which.idx) \
{ \
LINE(Int8); LINE(Int16); LINE(Int32); LINE(Int64); LINE(Int128); LINE(Int256); \
LINE(UInt8); LINE(UInt16); LINE(UInt32); LINE(UInt64); LINE(UInt128); LINE(UInt256); \
LINE(Decimal32); LINE(Decimal64); LINE(Decimal128); LINE(Decimal256); \
LINE(Float32); LINE(Float64); \
default: return nullptr; \
}
template <class First, class ... TArgs>
static IAggregateFunction * create(const IDataType & second_type, TArgs && ... args)
{
const WhichDataType which(second_type);
#define LINE(Type) \
case TypeIndex::Type: return new AggregateFunctionAvgWeighted<First, Type>(std::forward<TArgs>(args)...)
AT_SWITCH(LINE)
#undef LINE
}
// Not using helper functions because there are no templates for binary decimal/numeric function.
template <class... TArgs>
static IAggregateFunction * create(const IDataType & first_type, const IDataType & second_type, TArgs && ... args)
{
const WhichDataType which(first_type);
#define LINE(Type) \
case TypeIndex::Type: return create<Type, TArgs...>(second_type, std::forward<TArgs>(args)...)
AT_SWITCH(LINE)
#undef LINE
}
AggregateFunctionPtr createAggregateFunctionAvgWeighted(const std::string & name, const DataTypes & argument_types, const Array & parameters)
{
assertNoParameters(name, parameters);
assertBinary(name, argument_types);
AggregateFunctionPtr res;
const auto data_type = static_cast<const DataTypePtr>(argument_types[0]);
const auto data_type_weight = static_cast<const DataTypePtr>(argument_types[1]);
if (!data_type->equals(*data_type_weight))
throw Exception("Different types " + data_type->getName() + " and " + data_type_weight->getName() + " of arguments for aggregate function " + name,
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
if (isDecimal(data_type))
res.reset(createWithDecimalType<AggregateFuncAvgWeighted>(*data_type, *data_type, argument_types));
if (!allowTypes(data_type, data_type_weight))
throw Exception(
"Types " + data_type->getName() +
" and " + data_type_weight->getName() +
" are non-conforming as arguments for aggregate function " + name,
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
AggregateFunctionPtr ptr;
const bool left_decimal = isDecimal(data_type);
const bool right_decimal = isDecimal(data_type_weight);
if (left_decimal && right_decimal)
ptr.reset(create(*data_type, *data_type_weight,
argument_types,
getDecimalScale(*data_type), getDecimalScale(*data_type_weight)));
else if (left_decimal)
ptr.reset(create(*data_type, *data_type_weight, argument_types,
getDecimalScale(*data_type)));
else if (right_decimal)
ptr.reset(create(*data_type, *data_type_weight, argument_types,
// numerator is not decimal, so its scale is 0
0, getDecimalScale(*data_type_weight)));
else
res.reset(createWithNumericType<AggregateFuncAvgWeighted>(*data_type, argument_types));
ptr.reset(create(*data_type, *data_type_weight, argument_types));
if (!res)
throw Exception("Illegal type " + data_type->getName() + " of argument for aggregate function " + name,
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
return res;
return ptr;
}
}
void registerAggregateFunctionAvgWeighted(AggregateFunctionFactory & factory)
{
factory.registerFunction("avgWeighted", createAggregateFunctionAvgWeighted, AggregateFunctionFactory::CaseSensitive);
}
}

View File

@ -1,26 +1,44 @@
#pragma once
#include <type_traits>
#include <AggregateFunctions/AggregateFunctionAvg.h>
namespace DB
{
template <typename T, typename Data>
class AggregateFunctionAvgWeighted final : public AggregateFunctionAvgBase<T, Data, AggregateFunctionAvgWeighted<T, Data>>
template <class T>
using AvgWeightedFieldType = std::conditional_t<IsDecimalNumber<T>,
std::conditional_t<std::is_same_v<T, Decimal256>, Decimal256, Decimal128>,
std::conditional_t<DecimalOrExtendedInt<T>,
Float64, // no way to do UInt128 * UInt128, better cast to Float64
NearestFieldType<T>>>;
template <class T, class U>
using MaxFieldType = std::conditional_t<(sizeof(AvgWeightedFieldType<T>) > sizeof(AvgWeightedFieldType<U>)),
AvgWeightedFieldType<T>, AvgWeightedFieldType<U>>;
template <class Value, class Weight>
class AggregateFunctionAvgWeighted final :
public AggregateFunctionAvgBase<
MaxFieldType<Value, Weight>, AvgWeightedFieldType<Weight>, AggregateFunctionAvgWeighted<Value, Weight>>
{
public:
using AggregateFunctionAvgBase<T, Data, AggregateFunctionAvgWeighted<T, Data>>::AggregateFunctionAvgBase;
using Base = AggregateFunctionAvgBase<
MaxFieldType<Value, Weight>, AvgWeightedFieldType<Weight>, AggregateFunctionAvgWeighted<Value, Weight>>;
using Base::Base;
using ValueT = MaxFieldType<Value, Weight>;
using ColVecType = std::conditional_t<IsDecimalNumber<T>, ColumnDecimal<T>, ColumnVector<T>>;
void add(AggregateDataPtr place, const IColumn ** columns, size_t row_num, Arena *) const override
{
const auto & values = static_cast<const ColVecType &>(*columns[0]);
const auto & weights = static_cast<const ColVecType &>(*columns[1]);
const auto& weights = static_cast<const DecimalOrVectorCol<Weight> &>(*columns[1]);
this->data(place).numerator += static_cast<typename Data::NumeratorType>(values.getData()[row_num]) * weights.getData()[row_num];
this->data(place).denominator += weights.getData()[row_num];
this->data(place).numerator += static_cast<ValueT>(
static_cast<const DecimalOrVectorCol<Value> &>(*columns[0]).getData()[row_num]) *
static_cast<ValueT>(weights.getData()[row_num]);
this->data(place).denominator += static_cast<AvgWeightedFieldType<Weight>>(weights.getData()[row_num]);
}
String getName() const override { return "avgWeighted"; }
};
}

View File

@ -21,7 +21,8 @@ class IDataType;
using DataTypePtr = std::shared_ptr<const IDataType>;
using DataTypes = std::vector<DataTypePtr>;
/** Creator have arguments: name of aggregate function, types of arguments, values of parameters.
/**
* The invoker has arguments: name of aggregate function, types of arguments, values of parameters.
* Parameters are for "parametric" aggregate functions.
* For example, in quantileWeighted(0.9)(x, weight), 0.9 is "parameter" and x, weight are "arguments".
*/
@ -87,7 +88,6 @@ private:
std::optional<AggregateFunctionProperties> tryGetPropertiesImpl(const String & name) const;
private:
using AggregateFunctions = std::unordered_map<String, Value>;
AggregateFunctions aggregate_functions;

View File

@ -0,0 +1,37 @@
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/AggregateFunctionMannWhitney.h>
#include <AggregateFunctions/FactoryHelpers.h>
#include "registerAggregateFunctions.h"
#include <AggregateFunctions/Helpers.h>
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
}
namespace DB
{
namespace
{
AggregateFunctionPtr createAggregateFunctionMannWhitneyUTest(const std::string & name, const DataTypes & argument_types, const Array & parameters)
{
assertBinary(name, argument_types);
if (!isNumber(argument_types[0]) || !isNumber(argument_types[1]))
throw Exception("Aggregate function " + name + " only supports numerical types", ErrorCodes::NOT_IMPLEMENTED);
return std::make_shared<AggregateFunctionMannWhitney>(argument_types, parameters);
}
}
void registerAggregateFunctionMannWhitney(AggregateFunctionFactory & factory)
{
factory.registerFunction("mannWhitneyUTest", createAggregateFunctionMannWhitneyUTest);
}
}

View File

@ -0,0 +1,246 @@
#pragma once
#include <AggregateFunctions/IAggregateFunction.h>
#include <AggregateFunctions/StatCommon.h>
#include <Columns/ColumnArray.h>
#include <Columns/ColumnVector.h>
#include <Columns/ColumnTuple.h>
#include <Common/assert_cast.h>
#include <Common/FieldVisitors.h>
#include <Common/PODArray_fwd.h>
#include <common/types.h>
#include <DataTypes/DataTypesDecimal.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeTuple.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <limits>
#include <DataTypes/DataTypeArray.h>
#include <Common/ArenaAllocator.h>
#include <iostream>
namespace DB
{
namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int BAD_ARGUMENTS;
}
struct MannWhitneyData : public StatisticalSample<Float64, Float64>
{
/*Since null hypothesis is "for randomly selected values X and Y from two populations,
*the probability of X being greater than Y is equal to the probability of Y being greater than X".
*Or "the distribution F of first sample equals to the distribution G of second sample".
*Then alternative for this hypothesis (H1) is "two-sided"(F != G), "less"(F < G), "greater" (F > G). */
enum class Alternative
{
TwoSided,
Less,
Greater
};
/// The behaviour equals to the similar function from scipy.
/// https://github.com/scipy/scipy/blob/ab9e9f17e0b7b2d618c4d4d8402cd4c0c200d6c0/scipy/stats/stats.py#L6978
std::pair<Float64, Float64> getResult(Alternative alternative, bool continuity_correction)
{
ConcatenatedSamples both(this->x, this->y);
RanksArray ranks;
Float64 tie_correction;
/// Compute ranks according to both samples.
std::tie(ranks, tie_correction) = computeRanksAndTieCorrection(both);
const Float64 n1 = this->size_x;
const Float64 n2 = this->size_y;
Float64 r1 = 0;
for (size_t i = 0; i < n1; ++i)
r1 += ranks[i];
const Float64 u1 = n1 * n2 + (n1 * (n1 + 1.)) / 2. - r1;
const Float64 u2 = n1 * n2 - u1;
/// The distribution of U-statistic under null hypothesis H0 is symmetric with respect to meanrank.
const Float64 meanrank = n1 * n2 /2. + 0.5 * continuity_correction;
const Float64 sd = std::sqrt(tie_correction * n1 * n2 * (n1 + n2 + 1) / 12.0);
Float64 u = 0;
if (alternative == Alternative::TwoSided)
/// There is no difference which u_i to take as u, because z will be differ only in sign and we take std::abs() from it.
u = std::max(u1, u2);
else if (alternative == Alternative::Less)
u = u1;
else if (alternative == Alternative::Greater)
u = u2;
Float64 z = (u - meanrank) / sd;
if (alternative == Alternative::TwoSided)
z = std::abs(z);
/// In fact cdf is a probability function, so it is intergral of density from (-inf, z].
/// But since standard normal distribution is symmetric, cdf(0) = 0.5 and we have to compute integral from [0, z].
const Float64 cdf = integrateSimpson(0, z, [] (Float64 t) { return std::pow(M_E, -0.5 * t * t) / std::sqrt(2 * M_PI);});
Float64 p_value = 0;
if (alternative == Alternative::TwoSided)
p_value = 1 - 2 * cdf;
else
p_value = 0.5 - cdf;
return {u2, p_value};
}
private:
using Sample = typename StatisticalSample<Float64, Float64>::SampleX;
/// We need to compute ranks according to all samples. Use this class to avoid extra copy and memory allocation.
class ConcatenatedSamples
{
public:
ConcatenatedSamples(const Sample & first_, const Sample & second_)
: first(first_), second(second_) {}
const Float64 & operator[](size_t ind) const
{
if (ind < first.size())
return first[ind];
return second[ind % first.size()];
}
size_t size() const
{
return first.size() + second.size();
}
private:
const Sample & first;
const Sample & second;
};
};
class AggregateFunctionMannWhitney final:
public IAggregateFunctionDataHelper<MannWhitneyData, AggregateFunctionMannWhitney>
{
private:
using Alternative = typename MannWhitneyData::Alternative;
Alternative alternative;
bool continuity_correction{true};
public:
explicit AggregateFunctionMannWhitney(const DataTypes & arguments, const Array & params)
:IAggregateFunctionDataHelper<MannWhitneyData, AggregateFunctionMannWhitney> ({arguments}, {})
{
if (params.size() > 2)
throw Exception("Aggregate function " + getName() + " require two parameter or less", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
if (params.empty())
{
alternative = Alternative::TwoSided;
return;
}
if (params[0].getType() != Field::Types::String)
throw Exception("Aggregate function " + getName() + " require require first parameter to be a String", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
auto param = params[0].get<String>();
if (param == "two-sided")
alternative = Alternative::TwoSided;
else if (param == "less")
alternative = Alternative::Less;
else if (param == "greater")
alternative = Alternative::Greater;
else
throw Exception("Unknown parameter in aggregate function " + getName() +
". It must be one of: 'two sided', 'less', 'greater'", ErrorCodes::BAD_ARGUMENTS);
if (params.size() != 2)
return;
if (params[1].getType() != Field::Types::UInt64)
throw Exception("Aggregate function " + getName() + " require require second parameter to be a UInt64", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
continuity_correction = static_cast<bool>(params[1].get<UInt64>());
}
String getName() const override
{
return "mannWhitneyUTest";
}
DataTypePtr getReturnType() const override
{
DataTypes types
{
std::make_shared<DataTypeNumber<Float64>>(),
std::make_shared<DataTypeNumber<Float64>>(),
};
Strings names
{
"u_statistic",
"p_value"
};
return std::make_shared<DataTypeTuple>(
std::move(types),
std::move(names)
);
}
void add(AggregateDataPtr place, const IColumn ** columns, size_t row_num, Arena * arena) const override
{
Float64 value = columns[0]->getFloat64(row_num);
UInt8 is_second = columns[1]->getUInt(row_num);
if (is_second)
this->data(place).addY(value, arena);
else
this->data(place).addX(value, arena);
}
void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena * arena) const override
{
auto & a = this->data(place);
auto & b = this->data(rhs);
a.merge(b, arena);
}
void serialize(ConstAggregateDataPtr place, WriteBuffer & buf) const override
{
this->data(place).write(buf);
}
void deserialize(AggregateDataPtr place, ReadBuffer & buf, Arena * arena) const override
{
this->data(place).read(buf, arena);
}
void insertResultInto(AggregateDataPtr place, IColumn & to, Arena *) const override
{
if (!this->data(place).size_x || !this->data(place).size_y)
throw Exception("Aggregate function " + getName() + " require both samples to be non empty", ErrorCodes::BAD_ARGUMENTS);
auto [u_statistic, p_value] = this->data(place).getResult(alternative, continuity_correction);
/// Because p-value is a probability.
p_value = std::min(1.0, std::max(0.0, p_value));
auto & column_tuple = assert_cast<ColumnTuple &>(to);
auto & column_stat = assert_cast<ColumnVector<Float64> &>(column_tuple.getColumn(0));
auto & column_value = assert_cast<ColumnVector<Float64> &>(column_tuple.getColumn(1));
column_stat.getData().push_back(u_statistic);
column_value.getData().push_back(p_value);
}
};
};

View File

@ -21,23 +21,10 @@ AggregateFunctionPtr createAggregateFunctionRankCorrelation(const std::string &
assertBinary(name, argument_types);
assertNoParameters(name, parameters);
AggregateFunctionPtr res;
if (isDecimal(argument_types[0]) || isDecimal(argument_types[1]))
{
if (!isNumber(argument_types[0]) || !isNumber(argument_types[1]))
throw Exception("Aggregate function " + name + " only supports numerical types", ErrorCodes::NOT_IMPLEMENTED);
}
else
{
res.reset(createWithTwoNumericTypes<AggregateFunctionRankCorrelation>(*argument_types[0], *argument_types[1], argument_types));
}
if (!res)
{
throw Exception("Aggregate function " + name + " only supports numerical types", ErrorCodes::NOT_IMPLEMENTED);
}
return res;
return std::make_shared<AggregateFunctionRankCorrelation>(argument_types);
}
}

View File

@ -1,73 +1,56 @@
#pragma once
#include <AggregateFunctions/IAggregateFunction.h>
#include <AggregateFunctions/StatCommon.h>
#include <Columns/ColumnArray.h>
#include <Columns/ColumnVector.h>
#include <Columns/ColumnTuple.h>
#include <Common/assert_cast.h>
#include <Common/FieldVisitors.h>
#include <Common/PODArray_fwd.h>
#include <common/types.h>
#include <DataTypes/DataTypesDecimal.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeTuple.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <limits>
#include <DataTypes/DataTypeArray.h>
#include <Common/ArenaAllocator.h>
#include <type_traits>
namespace DB
{
template <template <typename> class Comparator>
struct ComparePairFirst final
struct RankCorrelationData : public StatisticalSample<Float64, Float64>
{
template <typename X, typename Y>
bool operator()(const std::pair<X, Y> & lhs, const std::pair<X, Y> & rhs) const
Float64 getResult()
{
return Comparator<X>{}(lhs.first, rhs.first);
RanksArray ranks_x;
std::tie(ranks_x, std::ignore) = computeRanksAndTieCorrection(this->x);
RanksArray ranks_y;
std::tie(ranks_y, std::ignore) = computeRanksAndTieCorrection(this->y);
/// In our case sizes of both samples are equal.
const auto size = this->size_x;
/// Count d^2 sum
Float64 answer = 0;
for (size_t j = 0; j < size; ++j)
answer += (ranks_x[j] - ranks_y[j]) * (ranks_x[j] - ranks_y[j]);
answer *= 6;
answer /= size * (size * size - 1);
answer = 1 - answer;
return answer;
}
};
template <template <typename> class Comparator>
struct ComparePairSecond final
{
template <typename X, typename Y>
bool operator()(const std::pair<X, Y> & lhs, const std::pair<X, Y> & rhs) const
{
return Comparator<Y>{}(lhs.second, rhs.second);
}
};
template <typename X = Float64, typename Y = Float64>
struct AggregateFunctionRankCorrelationData final
{
size_t size_x = 0;
using Allocator = MixedAlignedArenaAllocator<alignof(std::pair<X, Y>), 4096>;
using Array = PODArray<std::pair<X, Y>, 32, Allocator>;
Array values;
};
template <typename X, typename Y>
class AggregateFunctionRankCorrelation :
public IAggregateFunctionDataHelper<AggregateFunctionRankCorrelationData<X, Y>, AggregateFunctionRankCorrelation<X, Y>>
public IAggregateFunctionDataHelper<RankCorrelationData, AggregateFunctionRankCorrelation>
{
using Data = AggregateFunctionRankCorrelationData<X, Y>;
using Allocator = MixedAlignedArenaAllocator<alignof(std::pair<Float64, Float64>), 4096>;
using Array = PODArray<std::pair<Float64, Float64>, 32, Allocator>;
public:
explicit AggregateFunctionRankCorrelation(const DataTypes & arguments)
:IAggregateFunctionDataHelper<AggregateFunctionRankCorrelationData<X, Y>,AggregateFunctionRankCorrelation<X, Y>> ({arguments}, {})
:IAggregateFunctionDataHelper<RankCorrelationData, AggregateFunctionRankCorrelation> ({arguments}, {})
{}
String getName() const override
@ -80,24 +63,12 @@ public:
return std::make_shared<DataTypeNumber<Float64>>();
}
void insert(Data & a, const std::pair<X, Y> & x, Arena * arena) const
{
++a.size_x;
a.values.push_back(x, arena);
}
void add(AggregateDataPtr place, const IColumn ** columns, size_t row_num, Arena * arena) const override
{
auto & a = this->data(place);
auto new_x = assert_cast<const ColumnVector<X> &>(*columns[0]).getData()[row_num];
auto new_y = assert_cast<const ColumnVector<Y> &>(*columns[1]).getData()[row_num];
auto new_arg = std::make_pair(new_x, new_y);
a.size_x += 1;
a.values.push_back(new_arg, arena);
Float64 new_x = columns[0]->getFloat64(row_num);
Float64 new_y = columns[1]->getFloat64(row_num);
this->data(place).addX(new_x, arena);
this->data(place).addY(new_y, arena);
}
void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena * arena) const override
@ -105,116 +76,22 @@ public:
auto & a = this->data(place);
auto & b = this->data(rhs);
if (b.size_x)
for (size_t i = 0; i < b.size_x; ++i)
insert(a, b.values[i], arena);
a.merge(b, arena);
}
void serialize(ConstAggregateDataPtr place, WriteBuffer & buf) const override
{
const auto & value = this->data(place).values;
size_t size = this->data(place).size_x;
writeVarUInt(size, buf);
buf.write(reinterpret_cast<const char *>(value.data()), size * sizeof(value[0]));
this->data(place).write(buf);
}
void deserialize(AggregateDataPtr place, ReadBuffer & buf, Arena * arena) const override
{
size_t size = 0;
readVarUInt(size, buf);
auto & value = this->data(place).values;
value.resize(size, arena);
buf.read(reinterpret_cast<char *>(value.data()), size * sizeof(value[0]));
this->data(place).read(buf, arena);
}
void insertResultInto(AggregateDataPtr place, IColumn & to, Arena * /*arena*/) const override
void insertResultInto(AggregateDataPtr place, IColumn & to, Arena *) const override
{
const auto & value = this->data(place).values;
size_t size = this->data(place).size_x;
// create a copy of values not to format data
PODArrayWithStackMemory<std::pair<Float64, Float64>, 32> tmp_values;
tmp_values.resize(size);
for (size_t j = 0; j < size; ++ j)
tmp_values[j] = static_cast<std::pair<Float64, Float64>>(value[j]);
// sort x_values
std::sort(std::begin(tmp_values), std::end(tmp_values), ComparePairFirst<std::greater>{});
for (size_t j = 0; j < size;)
{
// replace x_values with their ranks
size_t rank = j + 1;
size_t same = 1;
size_t cur_sum = rank;
size_t cur_start = j;
while (j < size - 1)
{
if (tmp_values[j].first == tmp_values[j + 1].first)
{
// rank of (j + 1)th number
rank += 1;
++same;
cur_sum += rank;
++j;
}
else
break;
}
// insert rank is calculated as average of ranks of equal values
Float64 insert_rank = static_cast<Float64>(cur_sum) / same;
for (size_t i = cur_start; i <= j; ++i)
tmp_values[i].first = insert_rank;
++j;
}
// sort y_values
std::sort(std::begin(tmp_values), std::end(tmp_values), ComparePairSecond<std::greater>{});
// replace y_values with their ranks
for (size_t j = 0; j < size;)
{
// replace x_values with their ranks
size_t rank = j + 1;
size_t same = 1;
size_t cur_sum = rank;
size_t cur_start = j;
while (j < size - 1)
{
if (tmp_values[j].second == tmp_values[j + 1].second)
{
// rank of (j + 1)th number
rank += 1;
++same;
cur_sum += rank;
++j;
}
else
{
break;
}
}
// insert rank is calculated as average of ranks of equal values
Float64 insert_rank = static_cast<Float64>(cur_sum) / same;
for (size_t i = cur_start; i <= j; ++i)
tmp_values[i].second = insert_rank;
++j;
}
// count d^2 sum
Float64 answer = static_cast<Float64>(0);
for (size_t j = 0; j < size; ++ j)
answer += (tmp_values[j].first - tmp_values[j].second) * (tmp_values[j].first - tmp_values[j].second);
answer *= 6;
answer /= size * (size * size - 1);
answer = 1 - answer;
auto answer = this->data(place).getResult();
auto & column = static_cast<ColumnVector<Float64> &>(to);
column.getData().push_back(answer);

View File

@ -8,6 +8,7 @@
#include <IO/ReadHelpers.h>
#include <AggregateFunctions/IAggregateFunction.h>
#include <AggregateFunctions/Moments.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypesDecimal.h>
@ -30,310 +31,6 @@
namespace DB
{
namespace ErrorCodes
{
extern const int DECIMAL_OVERFLOW;
}
/**
Calculating univariate central moments
Levels:
level 2 (pop & samp): var, stddev
level 3: skewness
level 4: kurtosis
References:
https://en.wikipedia.org/wiki/Moment_(mathematics)
https://en.wikipedia.org/wiki/Skewness
https://en.wikipedia.org/wiki/Kurtosis
*/
template <typename T, size_t _level>
struct VarMoments
{
T m[_level + 1]{};
void add(T x)
{
++m[0];
m[1] += x;
m[2] += x * x;
if constexpr (_level >= 3) m[3] += x * x * x;
if constexpr (_level >= 4) m[4] += x * x * x * x;
}
void merge(const VarMoments & rhs)
{
m[0] += rhs.m[0];
m[1] += rhs.m[1];
m[2] += rhs.m[2];
if constexpr (_level >= 3) m[3] += rhs.m[3];
if constexpr (_level >= 4) m[4] += rhs.m[4];
}
void write(WriteBuffer & buf) const
{
writePODBinary(*this, buf);
}
void read(ReadBuffer & buf)
{
readPODBinary(*this, buf);
}
T getPopulation() const
{
if (m[0] == 0)
return std::numeric_limits<T>::quiet_NaN();
/// Due to numerical errors, the result can be slightly less than zero,
/// but it should be impossible. Trim to zero.
return std::max(T{}, (m[2] - m[1] * m[1] / m[0]) / m[0]);
}
T getSample() const
{
if (m[0] <= 1)
return std::numeric_limits<T>::quiet_NaN();
return std::max(T{}, (m[2] - m[1] * m[1] / m[0]) / (m[0] - 1));
}
T getMoment3() const
{
if (m[0] == 0)
return std::numeric_limits<T>::quiet_NaN();
// to avoid accuracy problem
if (m[0] == 1)
return 0;
return (m[3]
- (3 * m[2]
- 2 * m[1] * m[1] / m[0]
) * m[1] / m[0]
) / m[0];
}
T getMoment4() const
{
if (m[0] == 0)
return std::numeric_limits<T>::quiet_NaN();
// to avoid accuracy problem
if (m[0] == 1)
return 0;
return (m[4]
- (4 * m[3]
- (6 * m[2]
- 3 * m[1] * m[1] / m[0]
) * m[1] / m[0]
) * m[1] / m[0]
) / m[0];
}
};
template <typename T, size_t _level>
class VarMomentsDecimal
{
public:
using NativeType = typename T::NativeType;
void add(NativeType x)
{
++m0;
getM(1) += x;
NativeType tmp;
bool overflow = common::mulOverflow(x, x, tmp) || common::addOverflow(getM(2), tmp, getM(2));
if constexpr (_level >= 3)
overflow = overflow || common::mulOverflow(tmp, x, tmp) || common::addOverflow(getM(3), tmp, getM(3));
if constexpr (_level >= 4)
overflow = overflow || common::mulOverflow(tmp, x, tmp) || common::addOverflow(getM(4), tmp, getM(4));
if (overflow)
throw Exception("Decimal math overflow", ErrorCodes::DECIMAL_OVERFLOW);
}
void merge(const VarMomentsDecimal & rhs)
{
m0 += rhs.m0;
getM(1) += rhs.getM(1);
bool overflow = common::addOverflow(getM(2), rhs.getM(2), getM(2));
if constexpr (_level >= 3)
overflow = overflow || common::addOverflow(getM(3), rhs.getM(3), getM(3));
if constexpr (_level >= 4)
overflow = overflow || common::addOverflow(getM(4), rhs.getM(4), getM(4));
if (overflow)
throw Exception("Decimal math overflow", ErrorCodes::DECIMAL_OVERFLOW);
}
void write(WriteBuffer & buf) const { writePODBinary(*this, buf); }
void read(ReadBuffer & buf) { readPODBinary(*this, buf); }
Float64 getPopulation(UInt32 scale) const
{
if (m0 == 0)
return std::numeric_limits<Float64>::infinity();
NativeType tmp;
if (common::mulOverflow(getM(1), getM(1), tmp) ||
common::subOverflow(getM(2), NativeType(tmp / m0), tmp))
throw Exception("Decimal math overflow", ErrorCodes::DECIMAL_OVERFLOW);
return std::max(Float64{}, DecimalUtils::convertTo<Float64>(T(tmp / m0), scale));
}
Float64 getSample(UInt32 scale) const
{
if (m0 == 0)
return std::numeric_limits<Float64>::quiet_NaN();
if (m0 == 1)
return std::numeric_limits<Float64>::infinity();
NativeType tmp;
if (common::mulOverflow(getM(1), getM(1), tmp) ||
common::subOverflow(getM(2), NativeType(tmp / m0), tmp))
throw Exception("Decimal math overflow", ErrorCodes::DECIMAL_OVERFLOW);
return std::max(Float64{}, DecimalUtils::convertTo<Float64>(T(tmp / (m0 - 1)), scale));
}
Float64 getMoment3(UInt32 scale) const
{
if (m0 == 0)
return std::numeric_limits<Float64>::infinity();
NativeType tmp;
if (common::mulOverflow(2 * getM(1), getM(1), tmp) ||
common::subOverflow(3 * getM(2), NativeType(tmp / m0), tmp) ||
common::mulOverflow(tmp, getM(1), tmp) ||
common::subOverflow(getM(3), NativeType(tmp / m0), tmp))
throw Exception("Decimal math overflow", ErrorCodes::DECIMAL_OVERFLOW);
return DecimalUtils::convertTo<Float64>(T(tmp / m0), scale);
}
Float64 getMoment4(UInt32 scale) const
{
if (m0 == 0)
return std::numeric_limits<Float64>::infinity();
NativeType tmp;
if (common::mulOverflow(3 * getM(1), getM(1), tmp) ||
common::subOverflow(6 * getM(2), NativeType(tmp / m0), tmp) ||
common::mulOverflow(tmp, getM(1), tmp) ||
common::subOverflow(4 * getM(3), NativeType(tmp / m0), tmp) ||
common::mulOverflow(tmp, getM(1), tmp) ||
common::subOverflow(getM(4), NativeType(tmp / m0), tmp))
throw Exception("Decimal math overflow", ErrorCodes::DECIMAL_OVERFLOW);
return DecimalUtils::convertTo<Float64>(T(tmp / m0), scale);
}
private:
UInt64 m0{};
NativeType m[_level]{};
NativeType & getM(size_t i) { return m[i - 1]; }
const NativeType & getM(size_t i) const { return m[i - 1]; }
};
/**
Calculating multivariate central moments
Levels:
level 2 (pop & samp): covar
References:
https://en.wikipedia.org/wiki/Moment_(mathematics)
*/
template <typename T>
struct CovarMoments
{
T m0{};
T x1{};
T y1{};
T xy{};
void add(T x, T y)
{
++m0;
x1 += x;
y1 += y;
xy += x * y;
}
void merge(const CovarMoments & rhs)
{
m0 += rhs.m0;
x1 += rhs.x1;
y1 += rhs.y1;
xy += rhs.xy;
}
void write(WriteBuffer & buf) const
{
writePODBinary(*this, buf);
}
void read(ReadBuffer & buf)
{
readPODBinary(*this, buf);
}
T NO_SANITIZE_UNDEFINED getPopulation() const
{
return (xy - x1 * y1 / m0) / m0;
}
T NO_SANITIZE_UNDEFINED getSample() const
{
if (m0 == 0)
return std::numeric_limits<T>::quiet_NaN();
return (xy - x1 * y1 / m0) / (m0 - 1);
}
};
template <typename T>
struct CorrMoments
{
T m0{};
T x1{};
T y1{};
T xy{};
T x2{};
T y2{};
void add(T x, T y)
{
++m0;
x1 += x;
y1 += y;
xy += x * y;
x2 += x * x;
y2 += y * y;
}
void merge(const CorrMoments & rhs)
{
m0 += rhs.m0;
x1 += rhs.x1;
y1 += rhs.y1;
xy += rhs.xy;
x2 += rhs.x2;
y2 += rhs.y2;
}
void write(WriteBuffer & buf) const
{
writePODBinary(*this, buf);
}
void read(ReadBuffer & buf)
{
readPODBinary(*this, buf);
}
T NO_SANITIZE_UNDEFINED get() const
{
return (m0 * xy - x1 * y1) / sqrt((m0 * x2 - x1 * x1) * (m0 * y2 - y1 * y1));
}
};
enum class StatisticsFunctionKind
{
varPop, varSamp,

View File

@ -0,0 +1,77 @@
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/AggregateFunctionTTest.h>
#include <AggregateFunctions/FactoryHelpers.h>
#include <AggregateFunctions/Moments.h>
#include "registerAggregateFunctions.h"
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
}
namespace DB
{
namespace
{
/** Student T-test applies to two samples of independent random variables
* that have normal distributions with equal (but unknown) variances.
* It allows to answer the question whether means of the distributions differ.
*
* If variances are not considered equal, Welch T-test should be used instead.
*/
struct StudentTTestData : public TTestMoments<Float64>
{
static constexpr auto name = "studentTTest";
std::pair<Float64, Float64> getResult() const
{
Float64 mean_x = x1 / nx;
Float64 mean_y = y1 / ny;
/// To estimate the variance we first estimate two means.
/// That's why the number of degrees of freedom is the total number of values of both samples minus 2.
Float64 degrees_of_freedom = nx + ny - 2;
/// Calculate s^2
/// The original formulae looks like
/// \frac{\sum_{i = 1}^{n_x}{(x_i - \bar{x}) ^ 2} + \sum_{i = 1}^{n_y}{(y_i - \bar{y}) ^ 2}}{n_x + n_y - 2}
/// But we made some mathematical transformations not to store original sequences.
/// Also we dropped sqrt, because later it will be squared later.
Float64 all_x = x2 + nx * mean_x * mean_x - 2 * mean_x * x1;
Float64 all_y = y2 + ny * mean_y * mean_y - 2 * mean_y * y1;
Float64 s2 = (all_x + all_y) / degrees_of_freedom;
Float64 std_err2 = s2 * (1. / nx + 1. / ny);
/// t-statistic
Float64 t_stat = (mean_x - mean_y) / sqrt(std_err2);
return {t_stat, getPValue(degrees_of_freedom, t_stat * t_stat)};
}
};
AggregateFunctionPtr createAggregateFunctionStudentTTest(const std::string & name, const DataTypes & argument_types, const Array & parameters)
{
assertBinary(name, argument_types);
assertNoParameters(name, parameters);
if (!isNumber(argument_types[0]) || !isNumber(argument_types[1]))
throw Exception("Aggregate function " + name + " only supports numerical types", ErrorCodes::BAD_ARGUMENTS);
return std::make_shared<AggregateFunctionTTest<StudentTTestData>>(argument_types);
}
}
void registerAggregateFunctionStudentTTest(AggregateFunctionFactory & factory)
{
factory.registerFunction("studentTTest", createAggregateFunctionStudentTTest);
}
}

View File

@ -0,0 +1,154 @@
#pragma once
#include <AggregateFunctions/IAggregateFunction.h>
#include <AggregateFunctions/StatCommon.h>
#include <Columns/ColumnVector.h>
#include <Columns/ColumnTuple.h>
#include <Common/assert_cast.h>
#include <Core/Types.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeTuple.h>
#include <cmath>
/// This function is used in implementations of different T-Tests.
/// On Darwin it's unavailable in math.h but actually exists in the library (can be linked successfully).
#if defined(OS_DARWIN)
extern "C"
{
double lgamma_r(double x, int * signgamp);
}
#endif
namespace DB
{
class ReadBuffer;
class WriteBuffer;
/**
* If you have a cumulative distribution function F, then calculating the p-value for given statistic T is simply 1F(T)
* In our case p-value is two-sided, so we multiply it by 2.
* So cumulative distribution function F equals to
* \[ F(t) = \int_{-\infty}^{t} f(u)du = 1 - \frac{1}{2} I_{x(t)}(\frac{v}{2}, \frac{1}{2}) \]
* where \[ x(t) = \frac{v}{t^2 + v} \]: https://en.wikipedia.org/wiki/Student%27s_t-distribution#Cumulative_distribution_function
*
* so our resulting \[ p-value = I_{x(t)}(\frac{v}{2}, \frac{1}{2}) \].
*
* And I is regularized incomplete beta function: https://en.wikipedia.org/wiki/Beta_function#Incomplete_beta_function
*
* Keepenig in mind that \[ \mathrm {B} (x;a,b)=\int _{0}^{x}r^{a-1}\,(1-r)^{b-1}\,\mathrm {d} r.\! \]
* and
* \[ \mathrm {B} (x,y)={\dfrac {\Gamma (x)\,\Gamma (y)}{\Gamma (x+y)}}=\
* \exp(\ln {\dfrac {\Gamma (x)\,\Gamma (y)}{\Gamma (x+y)}})=\exp((\ln(\Gamma (x))+\ln(\Gamma (y))-\ln(\Gamma (x+y))) \]
*
* p-value can be calculated in terms of gamma functions and integrals more simply:
* \[ {\frac {\int _{0}^{\frac {\nu }{t^{2}+\nu }}r^{{\frac {\nu }{2}}-1}\,(1-r)^{-0.5}\,\mathrm {d} r}\
* {\exp((\ln(\Gamma ({\frac {\nu }{2}}))+\ln(\Gamma (0.5))-\ln(\Gamma ({\frac {\nu }{2}}+0.5)))}} \]
*
* which simplifies to:
*
* \[ {\frac {\int _{0}^{\frac {\nu }{t^{2}+\nu }}{\frac {r^{{\frac {\nu }{2}}-1}}{\sqrt {1-r}}}\,\mathrm {d} r}\
* {\exp((\ln(\Gamma ({\frac {\nu }{2}}))+\ln(\Gamma (0.5))-\ln(\Gamma ({\frac {\nu }{2}}+0.5)))}} \]
*
* Read here for details https://rosettacode.org/wiki/Welch%27s_t-test#
*
* Both WelchTTest and StudentTTest have t-statistric with Student distribution but with different degrees of freedom.
* So the procedure of computing p-value is the same.
*/
static inline Float64 getPValue(Float64 degrees_of_freedom, Float64 t_stat2)
{
Float64 numerator = integrateSimpson(0, degrees_of_freedom / (t_stat2 + degrees_of_freedom),
[degrees_of_freedom](double x) { return std::pow(x, degrees_of_freedom / 2 - 1) / std::sqrt(1 - x); });
int unused;
Float64 denominator = std::exp(
lgamma_r(degrees_of_freedom / 2, &unused)
+ lgamma_r(0.5, &unused)
- lgamma_r(degrees_of_freedom / 2 + 0.5, &unused));
return std::min(1.0, std::max(0.0, numerator / denominator));
}
/// Returns tuple of (t-statistic, p-value)
/// https://cpb-us-w2.wpmucdn.com/voices.uchicago.edu/dist/9/1193/files/2016/01/05b-TandP.pdf
template <typename Data>
class AggregateFunctionTTest :
public IAggregateFunctionDataHelper<Data, AggregateFunctionTTest<Data>>
{
public:
AggregateFunctionTTest(const DataTypes & arguments)
: IAggregateFunctionDataHelper<Data, AggregateFunctionTTest<Data>>({arguments}, {})
{
}
String getName() const override
{
return Data::name;
}
DataTypePtr getReturnType() const override
{
DataTypes types
{
std::make_shared<DataTypeNumber<Float64>>(),
std::make_shared<DataTypeNumber<Float64>>(),
};
Strings names
{
"t_statistic",
"p_value"
};
return std::make_shared<DataTypeTuple>(
std::move(types),
std::move(names)
);
}
void add(AggregateDataPtr place, const IColumn ** columns, size_t row_num, Arena *) const override
{
Float64 value = columns[0]->getFloat64(row_num);
UInt8 is_second = columns[1]->getUInt(row_num);
if (is_second)
this->data(place).addY(value);
else
this->data(place).addX(value);
}
void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena *) const override
{
this->data(place).merge(this->data(rhs));
}
void serialize(ConstAggregateDataPtr place, WriteBuffer & buf) const override
{
this->data(place).write(buf);
}
void deserialize(AggregateDataPtr place, ReadBuffer & buf, Arena *) const override
{
this->data(place).read(buf);
}
void insertResultInto(AggregateDataPtr place, IColumn & to, Arena *) const override
{
auto [t_statistic, p_value] = this->data(place).getResult();
/// Because p-value is a probability.
p_value = std::min(1.0, std::max(0.0, p_value));
auto & column_tuple = assert_cast<ColumnTuple &>(to);
auto & column_stat = assert_cast<ColumnVector<Float64> &>(column_tuple.getColumn(0));
auto & column_value = assert_cast<ColumnVector<Float64> &>(column_tuple.getColumn(1));
column_stat.getData().push_back(t_statistic);
column_value.getData().push_back(p_value);
}
};
};

View File

@ -1,35 +0,0 @@
#include "AggregateFunctionTimeSeriesGroupSum.h"
#include "AggregateFunctionFactory.h"
#include "FactoryHelpers.h"
#include "Helpers.h"
#include "registerAggregateFunctions.h"
namespace DB
{
namespace ErrorCodes
{
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
}
namespace
{
template <bool rate>
AggregateFunctionPtr createAggregateFunctionTimeSeriesGroupSum(const std::string & name, const DataTypes & arguments, const Array & params)
{
assertNoParameters(name, params);
if (arguments.size() < 3)
throw Exception("Not enough event arguments for aggregate function " + name, ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
return std::make_shared<AggregateFunctionTimeSeriesGroupSum<rate>>(arguments);
}
}
void registerAggregateFunctionTimeSeriesGroupSum(AggregateFunctionFactory & factory)
{
factory.registerFunction("timeSeriesGroupSum", createAggregateFunctionTimeSeriesGroupSum<false>);
factory.registerFunction("timeSeriesGroupRateSum", createAggregateFunctionTimeSeriesGroupSum<true>);
}
}

View File

@ -1,291 +0,0 @@
#pragma once
#include <bitset>
#include <map>
#include <queue>
#include <unordered_set>
#include <utility>
#include <Columns/ColumnArray.h>
#include <Columns/ColumnTuple.h>
#include <Columns/ColumnsNumber.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypesNumber.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <Common/ArenaAllocator.h>
#include <Common/assert_cast.h>
#include <ext/range.h>
#include "IAggregateFunction.h"
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
}
template <bool rate>
struct AggregateFunctionTimeSeriesGroupSumData
{
using DataPoint = std::pair<Int64, Float64>;
struct Points
{
using Dps = std::queue<DataPoint>;
Dps dps;
void add(Int64 t, Float64 v)
{
dps.push(std::make_pair(t, v));
if (dps.size() > 2)
dps.pop();
}
Float64 getval(Int64 t)
{
Int64 t1, t2;
Float64 v1, v2;
if (rate)
{
if (dps.size() < 2)
return 0;
t1 = dps.back().first;
t2 = dps.front().first;
v1 = dps.back().second;
v2 = dps.front().second;
return (v1 - v2) / Float64(t1 - t2);
}
else
{
if (dps.size() == 1 && t == dps.front().first)
return dps.front().second;
t1 = dps.back().first;
t2 = dps.front().first;
v1 = dps.back().second;
v2 = dps.front().second;
return v2 + ((v1 - v2) * Float64(t - t2)) / Float64(t1 - t2);
}
}
};
typedef std::map<UInt64, Points> Series;
typedef PODArrayWithStackMemory<DataPoint, 128> AggSeries;
Series ss;
AggSeries result;
void add(UInt64 uid, Int64 t, Float64 v)
{ //suppose t is coming asc
typename Series::iterator it_ss;
if (ss.count(uid) == 0)
{ //time series not exist, insert new one
Points tmp;
tmp.add(t, v);
ss.emplace(uid, tmp);
it_ss = ss.find(uid);
}
else
{
it_ss = ss.find(uid);
it_ss->second.add(t, v);
}
if (result.size() > 0 && t < result.back().first)
throw Exception{"timeSeriesGroupSum or timeSeriesGroupRateSum must order by timestamp asc.", ErrorCodes::LOGICAL_ERROR};
if (result.size() > 0 && t == result.back().first)
{
//do not add new point
if (rate)
result.back().second += it_ss->second.getval(t);
else
result.back().second += v;
}
else
{
if (rate)
result.emplace_back(std::make_pair(t, it_ss->second.getval(t)));
else
result.emplace_back(std::make_pair(t, v));
}
ssize_t i = result.size() - 1;
//reverse find out the index of timestamp that more than previous timestamp of t
while (result[i].first > it_ss->second.dps.front().first && i >= 0)
i--;
i++;
while (i < ssize_t(result.size()) - 1)
{
result[i].second += it_ss->second.getval(result[i].first);
i++;
}
}
void merge(const AggregateFunctionTimeSeriesGroupSumData & other)
{
//if ts has overlap, then aggregate two series by interpolation;
AggSeries tmp;
tmp.reserve(other.result.size() + result.size());
size_t i = 0, j = 0;
Int64 t1, t2;
Float64 v1, v2;
while (i < result.size() && j < other.result.size())
{
if (result[i].first < other.result[j].first)
{
if (j == 0)
{
tmp.emplace_back(result[i]);
}
else
{
t1 = other.result[j].first;
t2 = other.result[j - 1].first;
v1 = other.result[j].second;
v2 = other.result[j - 1].second;
Float64 value = result[i].second + v2 + (v1 - v2) * (Float64(result[i].first - t2)) / Float64(t1 - t2);
tmp.emplace_back(std::make_pair(result[i].first, value));
}
i++;
}
else if (result[i].first > other.result[j].first)
{
if (i == 0)
{
tmp.emplace_back(other.result[j]);
}
else
{
t1 = result[i].first;
t2 = result[i - 1].first;
v1 = result[i].second;
v2 = result[i - 1].second;
Float64 value = other.result[j].second + v2 + (v1 - v2) * (Float64(other.result[j].first - t2)) / Float64(t1 - t2);
tmp.emplace_back(std::make_pair(other.result[j].first, value));
}
j++;
}
else
{
tmp.emplace_back(std::make_pair(result[i].first, result[i].second + other.result[j].second));
i++;
j++;
}
}
while (i < result.size())
{
tmp.emplace_back(result[i]);
i++;
}
while (j < other.result.size())
{
tmp.push_back(other.result[j]);
j++;
}
swap(result, tmp);
}
void serialize(WriteBuffer & buf) const
{
size_t size = result.size();
writeVarUInt(size, buf);
if (size > 0)
{
buf.write(reinterpret_cast<const char *>(result.data()), size * sizeof(result[0]));
}
}
void deserialize(ReadBuffer & buf)
{
size_t size = 0;
readVarUInt(size, buf);
result.resize(size);
if (size > 0)
{
buf.read(reinterpret_cast<char *>(result.data()), size * sizeof(result[0]));
}
}
};
template <bool rate>
class AggregateFunctionTimeSeriesGroupSum final
: public IAggregateFunctionDataHelper<AggregateFunctionTimeSeriesGroupSumData<rate>, AggregateFunctionTimeSeriesGroupSum<rate>>
{
private:
public:
String getName() const override { return rate ? "timeSeriesGroupRateSum" : "timeSeriesGroupSum"; }
AggregateFunctionTimeSeriesGroupSum(const DataTypes & arguments)
: IAggregateFunctionDataHelper<AggregateFunctionTimeSeriesGroupSumData<rate>, AggregateFunctionTimeSeriesGroupSum<rate>>(arguments, {})
{
if (!WhichDataType(arguments[0].get()).isUInt64())
throw Exception{"Illegal type " + arguments[0].get()->getName() + " of argument 1 of aggregate function " + getName()
+ ", must be UInt64",
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT};
if (!WhichDataType(arguments[1].get()).isInt64())
throw Exception{"Illegal type " + arguments[1].get()->getName() + " of argument 2 of aggregate function " + getName()
+ ", must be Int64",
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT};
if (!WhichDataType(arguments[2].get()).isFloat64())
throw Exception{"Illegal type " + arguments[2].get()->getName() + " of argument 3 of aggregate function " + getName()
+ ", must be Float64",
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT};
}
DataTypePtr getReturnType() const override
{
auto datatypes = std::vector<DataTypePtr>();
datatypes.push_back(std::make_shared<DataTypeInt64>());
datatypes.push_back(std::make_shared<DataTypeFloat64>());
return std::make_shared<DataTypeArray>(std::make_shared<DataTypeTuple>(datatypes));
}
void add(AggregateDataPtr place, const IColumn ** columns, const size_t row_num, Arena *) const override
{
auto uid = assert_cast<const ColumnVector<UInt64> *>(columns[0])->getData()[row_num];
auto ts = assert_cast<const ColumnVector<Int64> *>(columns[1])->getData()[row_num];
auto val = assert_cast<const ColumnVector<Float64> *>(columns[2])->getData()[row_num];
if (uid && ts && val)
{
this->data(place).add(uid, ts, val);
}
}
void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena *) const override { this->data(place).merge(this->data(rhs)); }
void serialize(ConstAggregateDataPtr place, WriteBuffer & buf) const override { this->data(place).serialize(buf); }
void deserialize(AggregateDataPtr place, ReadBuffer & buf, Arena *) const override { this->data(place).deserialize(buf); }
void insertResultInto(AggregateDataPtr place, IColumn & to, Arena *) const override
{
const auto & value = this->data(place).result;
size_t size = value.size();
ColumnArray & arr_to = assert_cast<ColumnArray &>(to);
ColumnArray::Offsets & offsets_to = arr_to.getOffsets();
size_t old_size = offsets_to.back();
offsets_to.push_back(offsets_to.back() + size);
if (size)
{
typename ColumnInt64::Container & ts_to
= assert_cast<ColumnInt64 &>(assert_cast<ColumnTuple &>(arr_to.getData()).getColumn(0)).getData();
typename ColumnFloat64::Container & val_to
= assert_cast<ColumnFloat64 &>(assert_cast<ColumnTuple &>(arr_to.getData()).getColumn(1)).getData();
ts_to.reserve(old_size + size);
val_to.reserve(old_size + size);
size_t i = 0;
while (i < this->data(place).result.size())
{
ts_to.push_back(this->data(place).result[i].first);
val_to.push_back(this->data(place).result[i].second);
i++;
}
}
}
bool allocatesMemoryInArena() const override { return true; }
};
}

View File

@ -0,0 +1,74 @@
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/AggregateFunctionTTest.h>
#include <AggregateFunctions/FactoryHelpers.h>
#include <AggregateFunctions/Moments.h>
#include "registerAggregateFunctions.h"
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
}
namespace DB
{
namespace
{
struct WelchTTestData : public TTestMoments<Float64>
{
static constexpr auto name = "welchTTest";
std::pair<Float64, Float64> getResult() const
{
Float64 mean_x = x1 / nx;
Float64 mean_y = y1 / ny;
/// s_x^2, s_y^2
/// The original formulae looks like \frac{1}{size_x - 1} \sum_{i = 1}^{size_x}{(x_i - \bar{x}) ^ 2}
/// But we made some mathematical transformations not to store original sequences.
/// Also we dropped sqrt, because later it will be squared later.
Float64 sx2 = (x2 + nx * mean_x * mean_x - 2 * mean_x * x1) / (nx - 1);
Float64 sy2 = (y2 + ny * mean_y * mean_y - 2 * mean_y * y1) / (ny - 1);
/// t-statistic
Float64 t_stat = (mean_x - mean_y) / sqrt(sx2 / nx + sy2 / ny);
/// degrees of freedom
Float64 numerator_sqrt = sx2 / nx + sy2 / ny;
Float64 numerator = numerator_sqrt * numerator_sqrt;
Float64 denominator_x = sx2 * sx2 / (nx * nx * (nx - 1));
Float64 denominator_y = sy2 * sy2 / (ny * ny * (ny - 1));
Float64 degrees_of_freedom = numerator / (denominator_x + denominator_y);
return {t_stat, getPValue(degrees_of_freedom, t_stat * t_stat)};
}
};
AggregateFunctionPtr createAggregateFunctionWelchTTest(const std::string & name, const DataTypes & argument_types, const Array & parameters)
{
assertBinary(name, argument_types);
assertNoParameters(name, parameters);
if (!isNumber(argument_types[0]) || !isNumber(argument_types[1]))
throw Exception("Aggregate function " + name + " only supports numerical types", ErrorCodes::BAD_ARGUMENTS);
return std::make_shared<AggregateFunctionTTest<WelchTTestData>>(argument_types);
}
}
void registerAggregateFunctionWelchTTest(AggregateFunctionFactory & factory)
{
factory.registerFunction("welchTTest", createAggregateFunctionWelchTTest);
}
}

View File

@ -15,6 +15,7 @@
M(Float32) \
M(Float64)
// No UInt128 here because of the name conflict
#define FOR_NUMERIC_TYPES(M) \
M(UInt8) \
M(UInt16) \

View File

@ -0,0 +1,361 @@
#pragma once
#include <IO/WriteHelpers.h>
#include <IO/ReadHelpers.h>
namespace DB
{
namespace ErrorCodes
{
extern const int DECIMAL_OVERFLOW;
}
/**
Calculating univariate central moments
Levels:
level 2 (pop & samp): var, stddev
level 3: skewness
level 4: kurtosis
References:
https://en.wikipedia.org/wiki/Moment_(mathematics)
https://en.wikipedia.org/wiki/Skewness
https://en.wikipedia.org/wiki/Kurtosis
*/
template <typename T, size_t _level>
struct VarMoments
{
T m[_level + 1]{};
void add(T x)
{
++m[0];
m[1] += x;
m[2] += x * x;
if constexpr (_level >= 3) m[3] += x * x * x;
if constexpr (_level >= 4) m[4] += x * x * x * x;
}
void merge(const VarMoments & rhs)
{
m[0] += rhs.m[0];
m[1] += rhs.m[1];
m[2] += rhs.m[2];
if constexpr (_level >= 3) m[3] += rhs.m[3];
if constexpr (_level >= 4) m[4] += rhs.m[4];
}
void write(WriteBuffer & buf) const
{
writePODBinary(*this, buf);
}
void read(ReadBuffer & buf)
{
readPODBinary(*this, buf);
}
T getPopulation() const
{
if (m[0] == 0)
return std::numeric_limits<T>::quiet_NaN();
/// Due to numerical errors, the result can be slightly less than zero,
/// but it should be impossible. Trim to zero.
return std::max(T{}, (m[2] - m[1] * m[1] / m[0]) / m[0]);
}
T getSample() const
{
if (m[0] <= 1)
return std::numeric_limits<T>::quiet_NaN();
return std::max(T{}, (m[2] - m[1] * m[1] / m[0]) / (m[0] - 1));
}
T getMoment3() const
{
if (m[0] == 0)
return std::numeric_limits<T>::quiet_NaN();
// to avoid accuracy problem
if (m[0] == 1)
return 0;
/// \[ \frac{1}{m_0} (m_3 - (3 * m_2 - \frac{2 * {m_1}^2}{m_0}) * \frac{m_1}{m_0});\]
return (m[3]
- (3 * m[2]
- 2 * m[1] * m[1] / m[0]
) * m[1] / m[0]
) / m[0];
}
T getMoment4() const
{
if (m[0] == 0)
return std::numeric_limits<T>::quiet_NaN();
// to avoid accuracy problem
if (m[0] == 1)
return 0;
/// \[ \frac{1}{m_0}(m_4 - (4 * m_3 - (6 * m_2 - \frac{3 * m_1^2}{m_0} ) \frac{m_1}{m_0})\frac{m_1}{m_0})\]
return (m[4]
- (4 * m[3]
- (6 * m[2]
- 3 * m[1] * m[1] / m[0]
) * m[1] / m[0]
) * m[1] / m[0]
) / m[0];
}
};
template <typename T, size_t _level>
class VarMomentsDecimal
{
public:
using NativeType = typename T::NativeType;
void add(NativeType x)
{
++m0;
getM(1) += x;
NativeType tmp;
bool overflow = common::mulOverflow(x, x, tmp) || common::addOverflow(getM(2), tmp, getM(2));
if constexpr (_level >= 3)
overflow = overflow || common::mulOverflow(tmp, x, tmp) || common::addOverflow(getM(3), tmp, getM(3));
if constexpr (_level >= 4)
overflow = overflow || common::mulOverflow(tmp, x, tmp) || common::addOverflow(getM(4), tmp, getM(4));
if (overflow)
throw Exception("Decimal math overflow", ErrorCodes::DECIMAL_OVERFLOW);
}
void merge(const VarMomentsDecimal & rhs)
{
m0 += rhs.m0;
getM(1) += rhs.getM(1);
bool overflow = common::addOverflow(getM(2), rhs.getM(2), getM(2));
if constexpr (_level >= 3)
overflow = overflow || common::addOverflow(getM(3), rhs.getM(3), getM(3));
if constexpr (_level >= 4)
overflow = overflow || common::addOverflow(getM(4), rhs.getM(4), getM(4));
if (overflow)
throw Exception("Decimal math overflow", ErrorCodes::DECIMAL_OVERFLOW);
}
void write(WriteBuffer & buf) const { writePODBinary(*this, buf); }
void read(ReadBuffer & buf) { readPODBinary(*this, buf); }
Float64 getPopulation(UInt32 scale) const
{
if (m0 == 0)
return std::numeric_limits<Float64>::infinity();
NativeType tmp;
if (common::mulOverflow(getM(1), getM(1), tmp) ||
common::subOverflow(getM(2), NativeType(tmp / m0), tmp))
throw Exception("Decimal math overflow", ErrorCodes::DECIMAL_OVERFLOW);
return std::max(Float64{}, DecimalUtils::convertTo<Float64>(T(tmp / m0), scale));
}
Float64 getSample(UInt32 scale) const
{
if (m0 == 0)
return std::numeric_limits<Float64>::quiet_NaN();
if (m0 == 1)
return std::numeric_limits<Float64>::infinity();
NativeType tmp;
if (common::mulOverflow(getM(1), getM(1), tmp) ||
common::subOverflow(getM(2), NativeType(tmp / m0), tmp))
throw Exception("Decimal math overflow", ErrorCodes::DECIMAL_OVERFLOW);
return std::max(Float64{}, DecimalUtils::convertTo<Float64>(T(tmp / (m0 - 1)), scale));
}
Float64 getMoment3(UInt32 scale) const
{
if (m0 == 0)
return std::numeric_limits<Float64>::infinity();
NativeType tmp;
if (common::mulOverflow(2 * getM(1), getM(1), tmp) ||
common::subOverflow(3 * getM(2), NativeType(tmp / m0), tmp) ||
common::mulOverflow(tmp, getM(1), tmp) ||
common::subOverflow(getM(3), NativeType(tmp / m0), tmp))
throw Exception("Decimal math overflow", ErrorCodes::DECIMAL_OVERFLOW);
return DecimalUtils::convertTo<Float64>(T(tmp / m0), scale);
}
Float64 getMoment4(UInt32 scale) const
{
if (m0 == 0)
return std::numeric_limits<Float64>::infinity();
NativeType tmp;
if (common::mulOverflow(3 * getM(1), getM(1), tmp) ||
common::subOverflow(6 * getM(2), NativeType(tmp / m0), tmp) ||
common::mulOverflow(tmp, getM(1), tmp) ||
common::subOverflow(4 * getM(3), NativeType(tmp / m0), tmp) ||
common::mulOverflow(tmp, getM(1), tmp) ||
common::subOverflow(getM(4), NativeType(tmp / m0), tmp))
throw Exception("Decimal math overflow", ErrorCodes::DECIMAL_OVERFLOW);
return DecimalUtils::convertTo<Float64>(T(tmp / m0), scale);
}
private:
UInt64 m0{};
NativeType m[_level]{};
NativeType & getM(size_t i) { return m[i - 1]; }
const NativeType & getM(size_t i) const { return m[i - 1]; }
};
/**
Calculating multivariate central moments
Levels:
level 2 (pop & samp): covar
References:
https://en.wikipedia.org/wiki/Moment_(mathematics)
*/
template <typename T>
struct CovarMoments
{
T m0{};
T x1{};
T y1{};
T xy{};
void add(T x, T y)
{
++m0;
x1 += x;
y1 += y;
xy += x * y;
}
void merge(const CovarMoments & rhs)
{
m0 += rhs.m0;
x1 += rhs.x1;
y1 += rhs.y1;
xy += rhs.xy;
}
void write(WriteBuffer & buf) const
{
writePODBinary(*this, buf);
}
void read(ReadBuffer & buf)
{
readPODBinary(*this, buf);
}
T NO_SANITIZE_UNDEFINED getPopulation() const
{
return (xy - x1 * y1 / m0) / m0;
}
T NO_SANITIZE_UNDEFINED getSample() const
{
if (m0 == 0)
return std::numeric_limits<T>::quiet_NaN();
return (xy - x1 * y1 / m0) / (m0 - 1);
}
};
template <typename T>
struct CorrMoments
{
T m0{};
T x1{};
T y1{};
T xy{};
T x2{};
T y2{};
void add(T x, T y)
{
++m0;
x1 += x;
y1 += y;
xy += x * y;
x2 += x * x;
y2 += y * y;
}
void merge(const CorrMoments & rhs)
{
m0 += rhs.m0;
x1 += rhs.x1;
y1 += rhs.y1;
xy += rhs.xy;
x2 += rhs.x2;
y2 += rhs.y2;
}
void write(WriteBuffer & buf) const
{
writePODBinary(*this, buf);
}
void read(ReadBuffer & buf)
{
readPODBinary(*this, buf);
}
T NO_SANITIZE_UNDEFINED get() const
{
return (m0 * xy - x1 * y1) / sqrt((m0 * x2 - x1 * x1) * (m0 * y2 - y1 * y1));
}
};
/// Data for calculation of Student and Welch T-Tests.
template <typename T>
struct TTestMoments
{
T nx{};
T ny{};
T x1{};
T y1{};
T x2{};
T y2{};
void addX(T value)
{
++nx;
x1 += value;
x2 += value * value;
}
void addY(T value)
{
++ny;
y1 += value;
y2 += value * value;
}
void merge(const TTestMoments & rhs)
{
nx += rhs.nx;
ny += rhs.ny;
x1 += rhs.x1;
y1 += rhs.y1;
x2 += rhs.x2;
y2 += rhs.y2;
}
void write(WriteBuffer & buf) const
{
writePODBinary(*this, buf);
}
void read(ReadBuffer & buf)
{
readPODBinary(*this, buf);
}
};
}

View File

@ -114,7 +114,7 @@ class QuantileTDigest
static constexpr size_t PART_SIZE_BITS = 8;
using Transform = RadixSortFloatTransform<KeyBits>;
using Allocator = RadixSortMallocAllocator;
using Allocator = RadixSortAllocator;
/// The function to get the key from an array element.
static Key & extractKey(Element & elem) { return elem.mean; }

View File

@ -0,0 +1,114 @@
#pragma once
#include <IO/WriteHelpers.h>
#include <IO/ReadHelpers.h>
#include <Common/ArenaAllocator.h>
#include <numeric>
#include <algorithm>
#include <utility>
namespace DB
{
template <typename F>
static Float64 integrateSimpson(Float64 a, Float64 b, F && func)
{
const size_t iterations = std::max(1e6, 1e4 * std::abs(std::round(b) - std::round(a)));
const long double h = (b - a) / iterations;
Float64 sum_odds = 0.0;
for (size_t i = 1; i < iterations; i += 2)
sum_odds += func(a + i * h);
Float64 sum_evens = 0.0;
for (size_t i = 2; i < iterations; i += 2)
sum_evens += func(a + i * h);
return (func(a) + func(b) + 2 * sum_evens + 4 * sum_odds) * h / 3;
}
/// Because ranks are adjusted, we have to store each of them in Float type.
using RanksArray = std::vector<Float64>;
template <typename Values>
std::pair<RanksArray, Float64> computeRanksAndTieCorrection(const Values & values)
{
const size_t size = values.size();
/// Save initial positions, than sort indices according to the values.
std::vector<size_t> indexes(size);
std::iota(indexes.begin(), indexes.end(), 0);
std::sort(indexes.begin(), indexes.end(),
[&] (size_t lhs, size_t rhs) { return values[lhs] < values[rhs]; });
size_t left = 0;
Float64 tie_numenator = 0;
RanksArray out(size);
while (left < size)
{
size_t right = left;
while (right < size && values[indexes[left]] == values[indexes[right]])
++right;
auto adjusted = (left + right + 1.) / 2.;
auto count_equal = right - left;
tie_numenator += std::pow(count_equal, 3) - count_equal;
for (size_t iter = left; iter < right; ++iter)
out[indexes[iter]] = adjusted;
left = right;
}
return {out, 1 - (tie_numenator / (std::pow(size, 3) - size))};
}
template <typename X, typename Y>
struct StatisticalSample
{
using AllocatorXSample = MixedAlignedArenaAllocator<alignof(X), 4096>;
using SampleX = PODArray<X, 32, AllocatorXSample>;
using AllocatorYSample = MixedAlignedArenaAllocator<alignof(Y), 4096>;
using SampleY = PODArray<Y, 32, AllocatorYSample>;
SampleX x{};
SampleY y{};
size_t size_x{0};
size_t size_y{0};
void addX(X value, Arena * arena)
{
++size_x;
x.push_back(value, arena);
}
void addY(Y value, Arena * arena)
{
++size_y;
y.push_back(value, arena);
}
void merge(const StatisticalSample & rhs, Arena * arena)
{
size_x += rhs.size_x;
size_y += rhs.size_y;
x.insert(rhs.x.begin(), rhs.x.end(), arena);
y.insert(rhs.y.begin(), rhs.y.end(), arena);
}
void write(WriteBuffer & buf) const
{
writeVarUInt(size_x, buf);
writeVarUInt(size_y, buf);
buf.write(reinterpret_cast<const char *>(x.data()), size_x * sizeof(x[0]));
buf.write(reinterpret_cast<const char *>(y.data()), size_y * sizeof(y[0]));
}
void read(ReadBuffer & buf, Arena * arena)
{
readVarUInt(size_x, buf);
readVarUInt(size_y, buf);
x.resize(size_x, arena);
y.resize(size_y, arena);
buf.read(reinterpret_cast<char *>(x.data()), size_x * sizeof(x[0]));
buf.read(reinterpret_cast<char *>(y.data()), size_y * sizeof(y[0]));
}
};
}

View File

@ -32,16 +32,16 @@ void registerAggregateFunctionsBitmap(AggregateFunctionFactory &);
void registerAggregateFunctionsMaxIntersections(AggregateFunctionFactory &);
void registerAggregateFunctionHistogram(AggregateFunctionFactory &);
void registerAggregateFunctionRetention(AggregateFunctionFactory &);
void registerAggregateFunctionTimeSeriesGroupSum(AggregateFunctionFactory &);
void registerAggregateFunctionMLMethod(AggregateFunctionFactory &);
void registerAggregateFunctionEntropy(AggregateFunctionFactory &);
void registerAggregateFunctionSimpleLinearRegression(AggregateFunctionFactory &);
void registerAggregateFunctionMoving(AggregateFunctionFactory &);
void registerAggregateFunctionCategoricalIV(AggregateFunctionFactory &);
void registerAggregateFunctionAggThrow(AggregateFunctionFactory &);
void registerAggregateFunctionRankCorrelation(AggregateFunctionFactory &);
void registerAggregateFunctionMannWhitney(AggregateFunctionFactory &);
void registerAggregateFunctionWelchTTest(AggregateFunctionFactory &);
void registerAggregateFunctionStudentTTest(AggregateFunctionFactory &);
void registerAggregateFunctionRankCorrelation(AggregateFunctionFactory &);
class AggregateFunctionCombinatorFactory;
void registerAggregateFunctionCombinatorIf(AggregateFunctionCombinatorFactory &);
@ -86,7 +86,6 @@ void registerAggregateFunctions()
registerAggregateFunctionsMaxIntersections(factory);
registerAggregateFunctionHistogram(factory);
registerAggregateFunctionRetention(factory);
registerAggregateFunctionTimeSeriesGroupSum(factory);
registerAggregateFunctionMLMethod(factory);
registerAggregateFunctionEntropy(factory);
registerAggregateFunctionSimpleLinearRegression(factory);
@ -94,6 +93,9 @@ void registerAggregateFunctions()
registerAggregateFunctionCategoricalIV(factory);
registerAggregateFunctionAggThrow(factory);
registerAggregateFunctionRankCorrelation(factory);
registerAggregateFunctionMannWhitney(factory);
registerAggregateFunctionWelchTTest(factory);
registerAggregateFunctionStudentTTest(factory);
}
{

View File

@ -0,0 +1,27 @@
#include <IO/WriteBufferFromString.h>
#include <IO/ReadBufferFromString.h>
#include <Common/PODArray.h>
#include <AggregateFunctions/StatCommon.h>
#include <iostream>
#include <gtest/gtest.h>
TEST(Ranks, Simple)
{
using namespace DB;
RanksArray sample = {310, 195, 480, 530, 155, 530, 245, 385, 450, 450, 465, 545, 170, 180, 125, 180, 230, 170, 75, 430, 480, 495, 295};
RanksArray ranks;
Float64 t = 0;
std::tie(ranks, t) = computeRanksAndTieCorrection(sample);
RanksArray expected{12.0, 8.0, 18.5, 21.5, 3.0, 21.5, 10.0, 13.0, 15.5, 15.5, 17.0, 23.0, 4.5, 6.5, 2.0, 6.5, 9.0, 4.5, 1.0, 14.0, 18.5, 20.0, 11.0};
ASSERT_EQ(ranks.size(), expected.size());
for (size_t i = 0; i < ranks.size(); ++i)
ASSERT_DOUBLE_EQ(ranks[i], expected[i]);
ASSERT_DOUBLE_EQ(t, 0.9975296442687747);
}

View File

@ -29,6 +29,7 @@ SRCS(
AggregateFunctionHistogram.cpp
AggregateFunctionIf.cpp
AggregateFunctionMLMethod.cpp
AggregateFunctionMannWhitney.cpp
AggregateFunctionMaxIntersections.cpp
AggregateFunctionMerge.cpp
AggregateFunctionMinMaxAny.cpp
@ -43,13 +44,14 @@ SRCS(
AggregateFunctionState.cpp
AggregateFunctionStatistics.cpp
AggregateFunctionStatisticsSimple.cpp
AggregateFunctionStudentTTest.cpp
AggregateFunctionSum.cpp
AggregateFunctionSumMap.cpp
AggregateFunctionTimeSeriesGroupSum.cpp
AggregateFunctionTopK.cpp
AggregateFunctionUniq.cpp
AggregateFunctionUniqCombined.cpp
AggregateFunctionUniqUpTo.cpp
AggregateFunctionWelchTTest.cpp
AggregateFunctionWindowFunnel.cpp
UniqCombinedBiasData.cpp
UniqVariadicHash.cpp

View File

@ -223,7 +223,7 @@ if (CMAKE_BUILD_TYPE_UC STREQUAL "RELEASE" OR CMAKE_BUILD_TYPE_UC STREQUAL "RELW
Dictionaries/FlatDictionary.cpp
Dictionaries/HashedDictionary.cpp
Dictionaries/CacheDictionary.cpp
Dictionaries/TrieDictionary.cpp
Dictionaries/IPAddressDictionary.cpp
Dictionaries/RangeHashedDictionary.cpp
Dictionaries/ComplexKeyHashedDictionary.cpp
Dictionaries/ComplexKeyCacheDictionary.cpp
@ -305,7 +305,6 @@ endif()
dbms_target_link_libraries (
PRIVATE
${BTRIE_LIBRARIES}
boost::filesystem
boost::program_options
clickhouse_common_config

View File

@ -82,7 +82,7 @@ public:
bool isNumeric() const override { return false; }
bool canBeInsideNullable() const override { return true; }
bool isFixedAndContiguous() const override { return true; }
bool isFixedAndContiguous() const final { return true; }
size_t sizeOfValueIfFixed() const override { return sizeof(T); }
size_t size() const override { return data.size(); }

View File

@ -1,30 +1,35 @@
#include "IPv6ToBinary.h"
#include <Poco/Net/IPAddress.h>
#include <Poco/ByteOrder.h>
#include <cstring>
namespace DB
{
std::array<char, 16> IPv6ToBinary(const Poco::Net::IPAddress & address)
void IPv6ToRawBinary(const Poco::Net::IPAddress & address, char * res)
{
std::array<char, 16> res;
if (Poco::Net::IPAddress::IPv6 == address.family())
{
memcpy(res.data(), address.addr(), 16);
memcpy(res, address.addr(), 16);
}
else if (Poco::Net::IPAddress::IPv4 == address.family())
{
/// Convert to IPv6-mapped address.
memset(res.data(), 0, 10);
memset(res, 0, 10);
res[10] = '\xFF';
res[11] = '\xFF';
memcpy(&res[12], address.addr(), 4);
}
else
memset(res.data(), 0, 16);
memset(res, 0, 16);
}
std::array<char, 16> IPv6ToBinary(const Poco::Net::IPAddress & address)
{
std::array<char, 16> res;
IPv6ToRawBinary(address, res.data());
return res;
}

View File

@ -1,11 +1,16 @@
#pragma once
#include <array>
#include <common/types.h>
namespace Poco { namespace Net { class IPAddress; }}
namespace DB
{
/// Convert IP address to raw binary with IPv6 data (big endian). If it's an IPv4, map it to IPv6.
/// Saves result into the first 16 bytes of `res`.
void IPv6ToRawBinary(const Poco::Net::IPAddress & address, char * res);
/// Convert IP address to 16-byte array with IPv6 data (big endian). If it's an IPv4, map it to IPv6.
std::array<char, 16> IPv6ToBinary(const Poco::Net::IPAddress & address);

View File

@ -35,16 +35,16 @@
/** Used as a template parameter. See below.
*/
struct RadixSortMallocAllocator
struct RadixSortAllocator
{
void * allocate(size_t size)
{
return malloc(size);
return ::operator new(size);
}
void deallocate(void * ptr, size_t /*size*/)
void deallocate(void * ptr, size_t size)
{
return free(ptr);
::operator delete(ptr, size);
}
};
@ -100,7 +100,7 @@ struct RadixSortFloatTraits
/// An object with the functions allocate and deallocate.
/// Can be used, for example, to allocate memory for a temporary array on the stack.
/// To do this, the allocator itself is created on the stack.
using Allocator = RadixSortMallocAllocator;
using Allocator = RadixSortAllocator;
/// The function to get the key from an array element.
static Key & extractKey(Element & elem) { return elem; }
@ -139,7 +139,7 @@ struct RadixSortUIntTraits
static constexpr size_t PART_SIZE_BITS = 8;
using Transform = RadixSortIdentityTransform<KeyBits>;
using Allocator = RadixSortMallocAllocator;
using Allocator = RadixSortAllocator;
static Key & extractKey(Element & elem) { return elem; }
static Result & extractResult(Element & elem) { return elem; }
@ -173,7 +173,7 @@ struct RadixSortIntTraits
static constexpr size_t PART_SIZE_BITS = 8;
using Transform = RadixSortSignedTransform<KeyBits>;
using Allocator = RadixSortMallocAllocator;
using Allocator = RadixSortAllocator;
static Key & extractKey(Element & elem) { return elem; }
static Result & extractResult(Element & elem) { return elem; }

View File

@ -11,7 +11,6 @@ add_library(clickhouse_dictionaries ${clickhouse_dictionaries_sources})
target_link_libraries(clickhouse_dictionaries
PRIVATE
${BTRIE_LIBRARIES}
clickhouse_common_io
dbms
Poco::Data

File diff suppressed because it is too large Load Diff

View File

@ -7,6 +7,9 @@
#include <Columns/ColumnString.h>
#include <Common/Arena.h>
#include <Common/HashTable/HashMap.h>
#include <Columns/ColumnFixedString.h>
#include <Columns/ColumnVector.h>
#include <Poco/Net/IPAddress.h>
#include <common/StringRef.h>
#include <common/logger_useful.h>
#include <ext/range.h>
@ -14,23 +17,18 @@
#include "IDictionary.h"
#include "IDictionarySource.h"
struct btrie_s;
typedef struct btrie_s btrie_t;
namespace DB
{
class TrieDictionary final : public IDictionaryBase
class IPAddressDictionary final : public IDictionaryBase
{
public:
TrieDictionary(
IPAddressDictionary(
const StorageID & dict_id_,
const DictionaryStructure & dict_struct_,
DictionarySourcePtr source_ptr_,
const DictionaryLifetime dict_lifetime_,
bool require_nonempty_);
~TrieDictionary() override;
std::string getKeyDescription() const { return key_description; }
std::string getTypeName() const override { return "Trie"; }
@ -47,7 +45,7 @@ public:
std::shared_ptr<const IExternalLoadable> clone() const override
{
return std::make_shared<TrieDictionary>(getDictionaryID(), dict_struct, source_ptr->clone(), dict_lifetime, require_nonempty);
return std::make_shared<IPAddressDictionary>(getDictionaryID(), dict_struct, source_ptr->clone(), dict_lifetime, require_nonempty);
}
const IDictionarySource * getSource() const override { return source_ptr.get(); }
@ -150,9 +148,17 @@ public:
BlockInputStreamPtr getBlockInputStream(const Names & column_names, size_t max_block_size) const override;
private:
template <typename Value>
using ContainerType = std::vector<Value>;
using IPAddress = Poco::Net::IPAddress;
using IPv4Container = PODArray<UInt32>;
using IPv6Container = PaddedPODArray<UInt8>;
using IPMaskContainer = PODArray<UInt8>;
using RowIdxConstIter = ContainerType<size_t>::const_iterator;
struct Attribute final
{
AttributeUnderlyingType type;
@ -207,16 +213,18 @@ private:
Attribute createAttributeWithType(const AttributeUnderlyingType type, const Field & null_value);
template <typename AttributeType, typename OutputType, typename ValueSetter, typename DefaultGetter>
void getItemsByTwoKeyColumnsImpl(
const Attribute & attribute, const Columns & key_columns, ValueSetter && set_value, DefaultGetter && get_default) const;
template <typename AttributeType, typename OutputType, typename ValueSetter, typename DefaultGetter>
void
getItemsImpl(const Attribute & attribute, const Columns & key_columns, ValueSetter && set_value, DefaultGetter && get_default) const;
template <typename T>
bool setAttributeValueImpl(Attribute & attribute, const StringRef key, const T value);
void setAttributeValueImpl(Attribute & attribute, const T value);
bool setAttributeValue(Attribute & attribute, const StringRef key, const Field & value);
void setAttributeValue(Attribute & attribute, const Field & value);
const Attribute & getAttribute(const std::string & attribute_name) const;
@ -224,6 +232,14 @@ private:
void has(const Attribute & attribute, const Columns & key_columns, PaddedPODArray<UInt8> & out) const;
Columns getKeyColumns() const;
RowIdxConstIter ipNotFound() const;
RowIdxConstIter tryLookupIPv4(UInt32 addr, uint8_t * buf) const;
RowIdxConstIter tryLookupIPv6(const uint8_t * addr) const;
template <typename IPContainerType, typename IPValueType>
RowIdxConstIter lookupIP(IPValueType target) const;
static const uint8_t * getIPv6FromOffset(const IPv6Container & ipv6_col, size_t i);
const DictionaryStructure dict_struct;
const DictionarySourcePtr source_ptr;
@ -231,8 +247,22 @@ private:
const bool require_nonempty;
const std::string key_description{dict_struct.getKeyDescription()};
/// Contains sorted IP subnetworks. If some addresses equals, subnet with lower mask is placed first.
std::variant<IPv4Container, IPv6Container> ip_column;
/// Prefix lengths corresponding to ip_column.
IPMaskContainer mask_column;
/** Contains links to parent subnetworks in ip_column.
* Array holds such ip_column's (and mask_column's) indices that
* - if parent_subnet[i] < i, then ip_column[i] is subnetwork of ip_column[parent_subnet[i]],
* - if parent_subnet[i] == i, then ip_column[i] doesn't belong to any other subnet.
*/
ContainerType<size_t> parent_subnet;
/// Contains corresponding indices in attributes array.
ContainerType<size_t> row_idx;
btrie_t * trie = nullptr;
std::map<std::string, size_t> attribute_index_by_name;
std::vector<Attribute> attributes;

View File

@ -1,779 +0,0 @@
#include "TrieDictionary.h"
#include <stack>
#include <Columns/ColumnFixedString.h>
#include <Columns/ColumnVector.h>
#include <Common/assert_cast.h>
#include <DataTypes/DataTypeFixedString.h>
#include <DataTypes/DataTypeString.h>
#include <IO/WriteIntText.h>
#include <Poco/ByteOrder.h>
#include <Poco/Net/IPAddress.h>
#include <Common/formatIPv6.h>
#include <common/itoa.h>
#include <ext/map.h>
#include <ext/range.h>
#include "DictionaryBlockInputStream.h"
#include "DictionaryFactory.h"
#ifdef __clang__
#pragma clang diagnostic ignored "-Wold-style-cast"
#pragma clang diagnostic ignored "-Wnewline-eof"
#endif
#include <btrie.h>
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
extern const int TYPE_MISMATCH;
extern const int BAD_ARGUMENTS;
extern const int DICTIONARY_IS_EMPTY;
extern const int NOT_IMPLEMENTED;
}
static void validateKeyTypes(const DataTypes & key_types)
{
if (key_types.size() != 1)
throw Exception{"Expected a single IP address", ErrorCodes::TYPE_MISMATCH};
const auto & actual_type = key_types[0]->getName();
if (actual_type != "UInt32" && actual_type != "FixedString(16)")
throw Exception{"Key does not match, expected either UInt32 or FixedString(16)", ErrorCodes::TYPE_MISMATCH};
}
TrieDictionary::TrieDictionary(
const StorageID & dict_id_,
const DictionaryStructure & dict_struct_,
DictionarySourcePtr source_ptr_,
const DictionaryLifetime dict_lifetime_,
bool require_nonempty_)
: IDictionaryBase(dict_id_)
, dict_struct(dict_struct_)
, source_ptr{std::move(source_ptr_)}
, dict_lifetime(dict_lifetime_)
, require_nonempty(require_nonempty_)
, logger(&Poco::Logger::get("TrieDictionary"))
{
createAttributes();
trie = btrie_create();
loadData();
calculateBytesAllocated();
}
TrieDictionary::~TrieDictionary()
{
btrie_destroy(trie);
}
#define DECLARE(TYPE) \
void TrieDictionary::get##TYPE( \
const std::string & attribute_name, const Columns & key_columns, const DataTypes & key_types, ResultArrayType<TYPE> & out) const \
{ \
validateKeyTypes(key_types); \
\
const auto & attribute = getAttribute(attribute_name); \
checkAttributeType(this, attribute_name, attribute.type, AttributeUnderlyingType::ut##TYPE); \
\
const auto null_value = std::get<TYPE>(attribute.null_values); \
\
getItemsImpl<TYPE, TYPE>( \
attribute, \
key_columns, \
[&](const size_t row, const auto value) { out[row] = value; }, \
[&](const size_t) { return null_value; }); \
}
DECLARE(UInt8)
DECLARE(UInt16)
DECLARE(UInt32)
DECLARE(UInt64)
DECLARE(UInt128)
DECLARE(Int8)
DECLARE(Int16)
DECLARE(Int32)
DECLARE(Int64)
DECLARE(Float32)
DECLARE(Float64)
DECLARE(Decimal32)
DECLARE(Decimal64)
DECLARE(Decimal128)
#undef DECLARE
void TrieDictionary::getString(
const std::string & attribute_name, const Columns & key_columns, const DataTypes & key_types, ColumnString * out) const
{
validateKeyTypes(key_types);
const auto & attribute = getAttribute(attribute_name);
checkAttributeType(this, attribute_name, attribute.type, AttributeUnderlyingType::utString);
const auto & null_value = StringRef{std::get<String>(attribute.null_values)};
getItemsImpl<StringRef, StringRef>(
attribute,
key_columns,
[&](const size_t, const StringRef value) { out->insertData(value.data, value.size); },
[&](const size_t) { return null_value; });
}
#define DECLARE(TYPE) \
void TrieDictionary::get##TYPE( \
const std::string & attribute_name, \
const Columns & key_columns, \
const DataTypes & key_types, \
const PaddedPODArray<TYPE> & def, \
ResultArrayType<TYPE> & out) const \
{ \
validateKeyTypes(key_types); \
\
const auto & attribute = getAttribute(attribute_name); \
checkAttributeType(this, attribute_name, attribute.type, AttributeUnderlyingType::ut##TYPE); \
\
getItemsImpl<TYPE, TYPE>( \
attribute, \
key_columns, \
[&](const size_t row, const auto value) { out[row] = value; }, \
[&](const size_t row) { return def[row]; }); \
}
DECLARE(UInt8)
DECLARE(UInt16)
DECLARE(UInt32)
DECLARE(UInt64)
DECLARE(UInt128)
DECLARE(Int8)
DECLARE(Int16)
DECLARE(Int32)
DECLARE(Int64)
DECLARE(Float32)
DECLARE(Float64)
DECLARE(Decimal32)
DECLARE(Decimal64)
DECLARE(Decimal128)
#undef DECLARE
void TrieDictionary::getString(
const std::string & attribute_name,
const Columns & key_columns,
const DataTypes & key_types,
const ColumnString * const def,
ColumnString * const out) const
{
validateKeyTypes(key_types);
const auto & attribute = getAttribute(attribute_name);
checkAttributeType(this, attribute_name, attribute.type, AttributeUnderlyingType::utString);
getItemsImpl<StringRef, StringRef>(
attribute,
key_columns,
[&](const size_t, const StringRef value) { out->insertData(value.data, value.size); },
[&](const size_t row) { return def->getDataAt(row); });
}
#define DECLARE(TYPE) \
void TrieDictionary::get##TYPE( \
const std::string & attribute_name, \
const Columns & key_columns, \
const DataTypes & key_types, \
const TYPE def, \
ResultArrayType<TYPE> & out) const \
{ \
validateKeyTypes(key_types); \
\
const auto & attribute = getAttribute(attribute_name); \
checkAttributeType(this, attribute_name, attribute.type, AttributeUnderlyingType::ut##TYPE); \
\
getItemsImpl<TYPE, TYPE>( \
attribute, key_columns, [&](const size_t row, const auto value) { out[row] = value; }, [&](const size_t) { return def; }); \
}
DECLARE(UInt8)
DECLARE(UInt16)
DECLARE(UInt32)
DECLARE(UInt64)
DECLARE(UInt128)
DECLARE(Int8)
DECLARE(Int16)
DECLARE(Int32)
DECLARE(Int64)
DECLARE(Float32)
DECLARE(Float64)
DECLARE(Decimal32)
DECLARE(Decimal64)
DECLARE(Decimal128)
#undef DECLARE
void TrieDictionary::getString(
const std::string & attribute_name,
const Columns & key_columns,
const DataTypes & key_types,
const String & def,
ColumnString * const out) const
{
validateKeyTypes(key_types);
const auto & attribute = getAttribute(attribute_name);
checkAttributeType(this, attribute_name, attribute.type, AttributeUnderlyingType::utString);
getItemsImpl<StringRef, StringRef>(
attribute,
key_columns,
[&](const size_t, const StringRef value) { out->insertData(value.data, value.size); },
[&](const size_t) { return StringRef{def}; });
}
void TrieDictionary::has(const Columns & key_columns, const DataTypes & key_types, PaddedPODArray<UInt8> & out) const
{
validateKeyTypes(key_types);
const auto & attribute = attributes.front();
switch (attribute.type)
{
case AttributeUnderlyingType::utUInt8:
has<UInt8>(attribute, key_columns, out);
break;
case AttributeUnderlyingType::utUInt16:
has<UInt16>(attribute, key_columns, out);
break;
case AttributeUnderlyingType::utUInt32:
has<UInt32>(attribute, key_columns, out);
break;
case AttributeUnderlyingType::utUInt64:
has<UInt64>(attribute, key_columns, out);
break;
case AttributeUnderlyingType::utUInt128:
has<UInt128>(attribute, key_columns, out);
break;
case AttributeUnderlyingType::utInt8:
has<Int8>(attribute, key_columns, out);
break;
case AttributeUnderlyingType::utInt16:
has<Int16>(attribute, key_columns, out);
break;
case AttributeUnderlyingType::utInt32:
has<Int32>(attribute, key_columns, out);
break;
case AttributeUnderlyingType::utInt64:
has<Int64>(attribute, key_columns, out);
break;
case AttributeUnderlyingType::utFloat32:
has<Float32>(attribute, key_columns, out);
break;
case AttributeUnderlyingType::utFloat64:
has<Float64>(attribute, key_columns, out);
break;
case AttributeUnderlyingType::utString:
has<StringRef>(attribute, key_columns, out);
break;
case AttributeUnderlyingType::utDecimal32:
has<Decimal32>(attribute, key_columns, out);
break;
case AttributeUnderlyingType::utDecimal64:
has<Decimal64>(attribute, key_columns, out);
break;
case AttributeUnderlyingType::utDecimal128:
has<Decimal128>(attribute, key_columns, out);
break;
}
}
void TrieDictionary::createAttributes()
{
const auto size = dict_struct.attributes.size();
attributes.reserve(size);
for (const auto & attribute : dict_struct.attributes)
{
attribute_index_by_name.emplace(attribute.name, attributes.size());
attributes.push_back(createAttributeWithType(attribute.underlying_type, attribute.null_value));
if (attribute.hierarchical)
throw Exception{full_name + ": hierarchical attributes not supported for dictionary of type " + getTypeName(),
ErrorCodes::TYPE_MISMATCH};
}
}
void TrieDictionary::loadData()
{
auto stream = source_ptr->loadAll();
stream->readPrefix();
/// created upfront to avoid excess allocations
const auto keys_size = dict_struct.key->size();
StringRefs keys(keys_size);
const auto attributes_size = attributes.size();
while (const auto block = stream->read())
{
const auto rows = block.rows();
element_count += rows;
const auto key_column_ptrs = ext::map<Columns>(
ext::range(0, keys_size), [&](const size_t attribute_idx) { return block.safeGetByPosition(attribute_idx).column; });
const auto attribute_column_ptrs = ext::map<Columns>(ext::range(0, attributes_size), [&](const size_t attribute_idx)
{
return block.safeGetByPosition(keys_size + attribute_idx).column;
});
for (const auto row_idx : ext::range(0, rows))
{
/// calculate key once per row
const auto key_column = key_column_ptrs.front();
for (const auto attribute_idx : ext::range(0, attributes_size))
{
const auto & attribute_column = *attribute_column_ptrs[attribute_idx];
auto & attribute = attributes[attribute_idx];
setAttributeValue(attribute, key_column->getDataAt(row_idx), attribute_column[row_idx]);
}
}
}
stream->readSuffix();
if (require_nonempty && 0 == element_count)
throw Exception{full_name + ": dictionary source is empty and 'require_nonempty' property is set.", ErrorCodes::DICTIONARY_IS_EMPTY};
}
template <typename T>
void TrieDictionary::addAttributeSize(const Attribute & attribute)
{
const auto & vec = std::get<ContainerType<T>>(attribute.maps);
bytes_allocated += sizeof(ContainerType<T>) + (vec.capacity() * sizeof(T));
bucket_count = vec.size();
}
void TrieDictionary::calculateBytesAllocated()
{
bytes_allocated += attributes.size() * sizeof(attributes.front());
for (const auto & attribute : attributes)
{
switch (attribute.type)
{
case AttributeUnderlyingType::utUInt8:
addAttributeSize<UInt8>(attribute);
break;
case AttributeUnderlyingType::utUInt16:
addAttributeSize<UInt16>(attribute);
break;
case AttributeUnderlyingType::utUInt32:
addAttributeSize<UInt32>(attribute);
break;
case AttributeUnderlyingType::utUInt64:
addAttributeSize<UInt64>(attribute);
break;
case AttributeUnderlyingType::utUInt128:
addAttributeSize<UInt128>(attribute);
break;
case AttributeUnderlyingType::utInt8:
addAttributeSize<Int8>(attribute);
break;
case AttributeUnderlyingType::utInt16:
addAttributeSize<Int16>(attribute);
break;
case AttributeUnderlyingType::utInt32:
addAttributeSize<Int32>(attribute);
break;
case AttributeUnderlyingType::utInt64:
addAttributeSize<Int64>(attribute);
break;
case AttributeUnderlyingType::utFloat32:
addAttributeSize<Float32>(attribute);
break;
case AttributeUnderlyingType::utFloat64:
addAttributeSize<Float64>(attribute);
break;
case AttributeUnderlyingType::utDecimal32:
addAttributeSize<Decimal32>(attribute);
break;
case AttributeUnderlyingType::utDecimal64:
addAttributeSize<Decimal64>(attribute);
break;
case AttributeUnderlyingType::utDecimal128:
addAttributeSize<Decimal128>(attribute);
break;
case AttributeUnderlyingType::utString:
{
addAttributeSize<StringRef>(attribute);
bytes_allocated += sizeof(Arena) + attribute.string_arena->size();
break;
}
}
}
bytes_allocated += btrie_allocated(trie);
}
template <typename T>
void TrieDictionary::createAttributeImpl(Attribute & attribute, const Field & null_value)
{
attribute.null_values = T(null_value.get<NearestFieldType<T>>());
attribute.maps.emplace<ContainerType<T>>();
}
TrieDictionary::Attribute TrieDictionary::createAttributeWithType(const AttributeUnderlyingType type, const Field & null_value)
{
Attribute attr{type, {}, {}, {}};
switch (type)
{
case AttributeUnderlyingType::utUInt8:
createAttributeImpl<UInt8>(attr, null_value);
break;
case AttributeUnderlyingType::utUInt16:
createAttributeImpl<UInt16>(attr, null_value);
break;
case AttributeUnderlyingType::utUInt32:
createAttributeImpl<UInt32>(attr, null_value);
break;
case AttributeUnderlyingType::utUInt64:
createAttributeImpl<UInt64>(attr, null_value);
break;
case AttributeUnderlyingType::utUInt128:
createAttributeImpl<UInt128>(attr, null_value);
break;
case AttributeUnderlyingType::utInt8:
createAttributeImpl<Int8>(attr, null_value);
break;
case AttributeUnderlyingType::utInt16:
createAttributeImpl<Int16>(attr, null_value);
break;
case AttributeUnderlyingType::utInt32:
createAttributeImpl<Int32>(attr, null_value);
break;
case AttributeUnderlyingType::utInt64:
createAttributeImpl<Int64>(attr, null_value);
break;
case AttributeUnderlyingType::utFloat32:
createAttributeImpl<Float32>(attr, null_value);
break;
case AttributeUnderlyingType::utFloat64:
createAttributeImpl<Float64>(attr, null_value);
break;
case AttributeUnderlyingType::utDecimal32:
createAttributeImpl<Decimal32>(attr, null_value);
break;
case AttributeUnderlyingType::utDecimal64:
createAttributeImpl<Decimal64>(attr, null_value);
break;
case AttributeUnderlyingType::utDecimal128:
createAttributeImpl<Decimal128>(attr, null_value);
break;
case AttributeUnderlyingType::utString:
{
attr.null_values = null_value.get<String>();
attr.maps.emplace<ContainerType<StringRef>>();
attr.string_arena = std::make_unique<Arena>();
break;
}
}
return attr;
}
template <typename AttributeType, typename OutputType, typename ValueSetter, typename DefaultGetter>
void TrieDictionary::getItemsImpl(
const Attribute & attribute, const Columns & key_columns, ValueSetter && set_value, DefaultGetter && get_default) const
{
auto & vec = std::get<ContainerType<AttributeType>>(attribute.maps);
const auto first_column = key_columns.front();
const auto rows = first_column->size();
if (first_column->isNumeric())
{
for (const auto i : ext::range(0, rows))
{
auto addr = Int32(first_column->get64(i));
uintptr_t slot = btrie_find(trie, addr);
#pragma GCC diagnostic push
#pragma GCC diagnostic warning "-Wold-style-cast"
set_value(i, slot != BTRIE_NULL ? static_cast<OutputType>(vec[slot]) : get_default(i));
#pragma GCC diagnostic pop
}
}
else
{
for (const auto i : ext::range(0, rows))
{
auto addr = first_column->getDataAt(i);
if (addr.size != 16)
throw Exception("Expected key to be FixedString(16)", ErrorCodes::LOGICAL_ERROR);
uintptr_t slot = btrie_find_a6(trie, reinterpret_cast<const uint8_t *>(addr.data));
#pragma GCC diagnostic push
#pragma GCC diagnostic warning "-Wold-style-cast"
set_value(i, slot != BTRIE_NULL ? static_cast<OutputType>(vec[slot]) : get_default(i));
#pragma GCC diagnostic pop
}
}
query_count.fetch_add(rows, std::memory_order_relaxed);
}
template <typename T>
bool TrieDictionary::setAttributeValueImpl(Attribute & attribute, const StringRef key, const T value)
{
// Insert value into appropriate vector type
auto & vec = std::get<ContainerType<T>>(attribute.maps);
size_t row = vec.size();
vec.push_back(value);
// Parse IP address and subnet length from string (e.g. 2a02:6b8::3/64)
Poco::Net::IPAddress addr, mask;
std::string addr_str(key.toString());
size_t pos = addr_str.find('/');
if (pos != std::string::npos)
{
addr = Poco::Net::IPAddress(addr_str.substr(0, pos));
mask = Poco::Net::IPAddress(std::stoi(addr_str.substr(pos + 1), nullptr, 10), addr.family());
}
else
{
addr = Poco::Net::IPAddress(addr_str);
mask = Poco::Net::IPAddress(addr.length() * 8, addr.family());
}
/*
* Here we might overwrite the same key with the same slot as each key can map to multiple attributes.
* However, all columns have equal number of rows so it is okay to store only row number for each key
* instead of building a trie for each column. This comes at the cost of additional lookup in attribute
* vector on lookup time to return cell from row + column. The reason for this is to save space,
* and build only single trie instead of trie for each column.
*/
if (addr.family() == Poco::Net::IPAddress::IPv4)
{
UInt32 addr_v4 = Poco::ByteOrder::toNetwork(*reinterpret_cast<const UInt32 *>(addr.addr()));
UInt32 mask_v4 = Poco::ByteOrder::toNetwork(*reinterpret_cast<const UInt32 *>(mask.addr()));
return btrie_insert(trie, addr_v4, mask_v4, row) == 0;
}
const uint8_t * addr_v6 = reinterpret_cast<const uint8_t *>(addr.addr());
const uint8_t * mask_v6 = reinterpret_cast<const uint8_t *>(mask.addr());
return btrie_insert_a6(trie, addr_v6, mask_v6, row) == 0;
}
bool TrieDictionary::setAttributeValue(Attribute & attribute, const StringRef key, const Field & value)
{
switch (attribute.type)
{
case AttributeUnderlyingType::utUInt8:
return setAttributeValueImpl<UInt8>(attribute, key, value.get<UInt64>());
case AttributeUnderlyingType::utUInt16:
return setAttributeValueImpl<UInt16>(attribute, key, value.get<UInt64>());
case AttributeUnderlyingType::utUInt32:
return setAttributeValueImpl<UInt32>(attribute, key, value.get<UInt64>());
case AttributeUnderlyingType::utUInt64:
return setAttributeValueImpl<UInt64>(attribute, key, value.get<UInt64>());
case AttributeUnderlyingType::utUInt128:
return setAttributeValueImpl<UInt128>(attribute, key, value.get<UInt128>());
case AttributeUnderlyingType::utInt8:
return setAttributeValueImpl<Int8>(attribute, key, value.get<Int64>());
case AttributeUnderlyingType::utInt16:
return setAttributeValueImpl<Int16>(attribute, key, value.get<Int64>());
case AttributeUnderlyingType::utInt32:
return setAttributeValueImpl<Int32>(attribute, key, value.get<Int64>());
case AttributeUnderlyingType::utInt64:
return setAttributeValueImpl<Int64>(attribute, key, value.get<Int64>());
case AttributeUnderlyingType::utFloat32:
return setAttributeValueImpl<Float32>(attribute, key, value.get<Float64>());
case AttributeUnderlyingType::utFloat64:
return setAttributeValueImpl<Float64>(attribute, key, value.get<Float64>());
case AttributeUnderlyingType::utDecimal32:
return setAttributeValueImpl<Decimal32>(attribute, key, value.get<Decimal32>());
case AttributeUnderlyingType::utDecimal64:
return setAttributeValueImpl<Decimal64>(attribute, key, value.get<Decimal64>());
case AttributeUnderlyingType::utDecimal128:
return setAttributeValueImpl<Decimal128>(attribute, key, value.get<Decimal128>());
case AttributeUnderlyingType::utString:
{
const auto & string = value.get<String>();
const auto * string_in_arena = attribute.string_arena->insert(string.data(), string.size());
setAttributeValueImpl<StringRef>(attribute, key, StringRef{string_in_arena, string.size()});
return true;
}
}
return {};
}
const TrieDictionary::Attribute & TrieDictionary::getAttribute(const std::string & attribute_name) const
{
const auto it = attribute_index_by_name.find(attribute_name);
if (it == std::end(attribute_index_by_name))
throw Exception{full_name + ": no such attribute '" + attribute_name + "'", ErrorCodes::BAD_ARGUMENTS};
return attributes[it->second];
}
template <typename T>
void TrieDictionary::has(const Attribute &, const Columns & key_columns, PaddedPODArray<UInt8> & out) const
{
const auto first_column = key_columns.front();
const auto rows = first_column->size();
if (first_column->isNumeric())
{
for (const auto i : ext::range(0, rows))
{
auto addr = Int32(first_column->get64(i));
uintptr_t slot = btrie_find(trie, addr);
#pragma GCC diagnostic push
#pragma GCC diagnostic warning "-Wold-style-cast"
out[i] = (slot != BTRIE_NULL);
#pragma GCC diagnostic pop
}
}
else
{
for (const auto i : ext::range(0, rows))
{
auto addr = first_column->getDataAt(i);
if (unlikely(addr.size != 16))
throw Exception("Expected key to be FixedString(16)", ErrorCodes::LOGICAL_ERROR);
uintptr_t slot = btrie_find_a6(trie, reinterpret_cast<const uint8_t *>(addr.data));
#pragma GCC diagnostic push
#pragma GCC diagnostic warning "-Wold-style-cast"
out[i] = (slot != BTRIE_NULL);
#pragma GCC diagnostic pop
}
}
query_count.fetch_add(rows, std::memory_order_relaxed);
}
template <typename Getter, typename KeyType>
static void trieTraverse(const btrie_t * trie, Getter && getter)
{
KeyType key = 0;
const KeyType high_bit = ~((~key) >> 1);
btrie_node_t * node;
node = trie->root;
std::stack<btrie_node_t *> stack;
while (node)
{
stack.push(node);
node = node->left;
}
auto get_bit = [&high_bit](size_t size) { return size ? (high_bit >> (size - 1)) : 0; };
while (!stack.empty())
{
node = stack.top();
stack.pop();
#pragma GCC diagnostic push
#pragma GCC diagnostic warning "-Wold-style-cast"
if (node && node->value != BTRIE_NULL)
#pragma GCC diagnostic pop
getter(key, stack.size());
if (node && node->right)
{
stack.push(nullptr);
key |= get_bit(stack.size());
stack.push(node->right);
while (stack.top()->left)
stack.push(stack.top()->left);
}
else
key &= ~get_bit(stack.size());
}
}
Columns TrieDictionary::getKeyColumns() const
{
auto ip_column = ColumnFixedString::create(IPV6_BINARY_LENGTH);
auto mask_column = ColumnVector<UInt8>::create();
#if defined(__SIZEOF_INT128__)
auto getter = [&ip_column, &mask_column](__uint128_t ip, size_t mask)
{
Poco::UInt64 * ip_array = reinterpret_cast<Poco::UInt64 *>(&ip); // Poco:: for old poco + macos
ip_array[0] = Poco::ByteOrder::fromNetwork(ip_array[0]);
ip_array[1] = Poco::ByteOrder::fromNetwork(ip_array[1]);
std::swap(ip_array[0], ip_array[1]);
ip_column->insertData(reinterpret_cast<const char *>(ip_array), IPV6_BINARY_LENGTH);
mask_column->insertValue(static_cast<UInt8>(mask));
};
trieTraverse<decltype(getter), __uint128_t>(trie, std::move(getter));
#else
throw Exception("TrieDictionary::getKeyColumns is not implemented for 32bit arch", ErrorCodes::NOT_IMPLEMENTED);
#endif
return {std::move(ip_column), std::move(mask_column)};
}
BlockInputStreamPtr TrieDictionary::getBlockInputStream(const Names & column_names, size_t max_block_size) const
{
using BlockInputStreamType = DictionaryBlockInputStream<TrieDictionary, UInt64>;
auto get_keys = [](const Columns & columns, const std::vector<DictionaryAttribute> & dict_attributes)
{
const auto & attr = dict_attributes.front();
return ColumnsWithTypeAndName(
{ColumnWithTypeAndName(columns.front(), std::make_shared<DataTypeFixedString>(IPV6_BINARY_LENGTH), attr.name)});
};
auto get_view = [](const Columns & columns, const std::vector<DictionaryAttribute> & dict_attributes)
{
auto column = ColumnString::create();
const auto & ip_column = assert_cast<const ColumnFixedString &>(*columns.front());
const auto & mask_column = assert_cast<const ColumnVector<UInt8> &>(*columns.back());
char buffer[48];
for (size_t row : ext::range(0, ip_column.size()))
{
UInt8 mask = mask_column.getElement(row);
char * ptr = buffer;
formatIPv6(reinterpret_cast<const unsigned char *>(ip_column.getDataAt(row).data), ptr);
*(ptr - 1) = '/';
ptr = itoa(mask, ptr);
column->insertData(buffer, ptr - buffer);
}
return ColumnsWithTypeAndName{
ColumnWithTypeAndName(std::move(column), std::make_shared<DataTypeString>(), dict_attributes.front().name)};
};
return std::make_shared<BlockInputStreamType>(
shared_from_this(), max_block_size, getKeyColumns(), column_names, std::move(get_keys), std::move(get_view));
}
void registerDictionaryTrie(DictionaryFactory & factory)
{
auto create_layout = [=](const std::string &,
const DictionaryStructure & dict_struct,
const Poco::Util::AbstractConfiguration & config,
const std::string & config_prefix,
DictionarySourcePtr source_ptr) -> DictionaryPtr
{
if (!dict_struct.key)
throw Exception{"'key' is required for dictionary of layout 'ip_trie'", ErrorCodes::BAD_ARGUMENTS};
const auto dict_id = StorageID::fromDictionaryConfig(config, config_prefix);
const DictionaryLifetime dict_lifetime{config, config_prefix + ".lifetime"};
const bool require_nonempty = config.getBool(config_prefix + ".require_nonempty", false);
// This is specialised trie for storing IPv4 and IPv6 prefixes.
return std::make_unique<TrieDictionary>(dict_id, dict_struct, std::move(source_ptr), dict_lifetime, require_nonempty);
};
factory.registerLayout("ip_trie", create_layout, true);
}
}

View File

@ -27,9 +27,7 @@ void registerDictionaries()
registerDictionaryComplexKeyHashed(factory);
registerDictionaryComplexKeyCache(factory);
registerDictionaryComplexKeyDirect(factory);
#if !defined(ARCADIA_BUILD)
registerDictionaryTrie(factory);
#endif
registerDictionaryFlat(factory);
registerDictionaryHashed(factory);
registerDictionaryCache(factory);

View File

@ -53,6 +53,7 @@ SRCS(
FlatDictionary.cpp
HTTPDictionarySource.cpp
HashedDictionary.cpp
IPAddressDictionary.cpp
LibraryDictionarySource.cpp
LibraryDictionarySourceExternal.cpp
MongoDBDictionarySource.cpp

View File

@ -91,6 +91,9 @@ inline UInt32 extractToDecimalScale(const ColumnWithTypeAndName & named_column)
return field.get<UInt32>();
}
/// Function toUnixTimestamp has exactly the same implementation as toDateTime of String type.
struct NameToUnixTimestamp { static constexpr auto name = "toUnixTimestamp"; };
/** Conversion of number types to each other, enums to numbers, dates and datetimes to numbers and back: done by straight assignment.
* (Date is represented internally as number of days from some day; DateTime - as unix timestamp)
@ -111,6 +114,13 @@ struct ConvertImpl
using ColVecFrom = typename FromDataType::ColumnType;
using ColVecTo = typename ToDataType::ColumnType;
if (std::is_same_v<Name, NameToUnixTimestamp>)
{
if (isDate(named_from.type))
throw Exception("Illegal column " + named_from.column->getName() + " of first argument of function " + Name::name,
ErrorCodes::ILLEGAL_COLUMN);
}
if constexpr ((IsDataTypeDecimal<FromDataType> || IsDataTypeDecimal<ToDataType>)
&& !(std::is_same_v<DataTypeDateTime64, FromDataType> || std::is_same_v<DataTypeDateTime64, ToDataType>))
{
@ -923,9 +933,6 @@ struct ConvertImplGenericFromString
};
/// Function toUnixTimestamp has exactly the same implementation as toDateTime of String type.
struct NameToUnixTimestamp { static constexpr auto name = "toUnixTimestamp"; };
template <>
struct ConvertImpl<DataTypeString, DataTypeUInt32, NameToUnixTimestamp>
: ConvertImpl<DataTypeString, DataTypeDateTime, NameToUnixTimestamp> {};

View File

@ -37,7 +37,7 @@
#include <Dictionaries/ComplexKeyCacheDictionary.h>
#include <Dictionaries/ComplexKeyDirectDictionary.h>
#include <Dictionaries/RangeHashedDictionary.h>
#include <Dictionaries/TrieDictionary.h>
#include <Dictionaries/IPAddressDictionary.h>
#include <Dictionaries/PolygonDictionaryImplementations.h>
#include <Dictionaries/DirectDictionary.h>
@ -192,7 +192,7 @@ private:
|| (res = executeDispatchComplex<SSDComplexKeyCacheDictionary>(arguments, dict))
#endif
#if !defined(ARCADIA_BUILD)
|| (res = executeDispatchComplex<TrieDictionary>(arguments, dict))
|| (res = executeDispatchComplex<IPAddressDictionary>(arguments, dict))
#endif
|| (res = executeDispatchComplex<PolygonDictionarySimple>(arguments, dict))
|| (res = executeDispatchComplex<PolygonDictionaryIndexEach>(arguments, dict))
@ -346,7 +346,7 @@ private:
|| (res = executeDispatchComplex<SSDComplexKeyCacheDictionary>(arguments, dict))
#endif
#if !defined(ARCADIA_BUILD)
|| (res = executeDispatchComplex<TrieDictionary>(arguments, dict))
|| (res = executeDispatchComplex<IPAddressDictionary>(arguments, dict))
#endif
|| (res = executeDispatchComplex<PolygonDictionarySimple>(arguments, dict))
|| (res = executeDispatchComplex<PolygonDictionaryIndexEach>(arguments, dict))
@ -524,7 +524,7 @@ private:
|| (res = executeDispatchComplex<SSDComplexKeyCacheDictionary>(arguments, dict))
#endif
#if !defined(ARCADIA_BUILD)
|| (res = executeDispatchComplex<TrieDictionary>(arguments, dict))
|| (res = executeDispatchComplex<IPAddressDictionary>(arguments, dict))
#endif
|| (res = executeDispatchComplex<PolygonDictionarySimple>(arguments, dict))
|| (res = executeDispatchComplex<PolygonDictionaryIndexEach>(arguments, dict))
@ -861,7 +861,7 @@ private:
|| (res = executeDispatchComplex<SSDComplexKeyCacheDictionary>(arguments, dict))
#endif
#if !defined(ARCADIA_BUILD)
|| (res = executeDispatchComplex<TrieDictionary>(arguments, dict))
|| (res = executeDispatchComplex<IPAddressDictionary>(arguments, dict))
#endif
|| (res = executeDispatchComplex<PolygonDictionarySimple>(arguments, dict))
|| (res = executeDispatchComplex<PolygonDictionaryIndexEach>(arguments, dict))
@ -1116,7 +1116,7 @@ private:
|| (res = executeDispatchComplex<SSDComplexKeyCacheDictionary>(arguments, dict))
#endif
#if !defined(ARCADIA_BUILD)
|| (res = executeDispatchComplex<TrieDictionary>(arguments, dict))
|| (res = executeDispatchComplex<IPAddressDictionary>(arguments, dict))
#endif
|| (res = executeDispatchComplex<PolygonDictionarySimple>(arguments, dict))
|| (res = executeDispatchComplex<PolygonDictionaryIndexEach>(arguments, dict))

View File

@ -1177,7 +1177,7 @@ void InterpreterSelectQuery::executeFetchColumns(
const auto & func = desc.function;
std::optional<UInt64> num_rows{};
if (!query.prewhere() && !query.where())
num_rows = storage->totalRows();
num_rows = storage->totalRows(settings);
else // It's possible to optimize count() given only partition predicates
{
SelectQueryInfo temp_query_info;

View File

@ -463,7 +463,7 @@ public:
/// - For total_rows column in system.tables
///
/// Does takes underlying Storage (if any) into account.
virtual std::optional<UInt64> totalRows() const { return {}; }
virtual std::optional<UInt64> totalRows(const Settings &) const { return {}; }
/// Same as above but also take partition predicate into account.
virtual std::optional<UInt64> totalRowsByPartitionPredicate(const SelectQueryInfo &, const Context &) const { return {}; }
@ -481,7 +481,7 @@ public:
/// Memory part should be estimated as a resident memory size.
/// In particular, alloctedBytes() is preferable over bytes()
/// when considering in-memory blocks.
virtual std::optional<UInt64> totalBytes() const { return {}; }
virtual std::optional<UInt64> totalBytes(const Settings &) const { return {}; }
/// Number of rows INSERTed since server start.
///

View File

@ -36,34 +36,31 @@ double IBackgroundJobExecutor::getSleepRandomAdd()
return std::uniform_real_distribution<double>(0, sleep_settings.task_sleep_seconds_when_no_work_random_part)(rng);
}
void IBackgroundJobExecutor::scheduleTask(bool job_done, bool with_backoff)
void IBackgroundJobExecutor::runTaskWithoutDelay()
{
if (job_done)
{
no_work_done_count = 0;
/// We have background jobs, schedule task as soon as possible
scheduling_task->schedule();
no_work_done_count = 0;
/// We have background jobs, schedule task as soon as possible
scheduling_task->schedule();
}
void IBackgroundJobExecutor::scheduleTask(bool with_backoff)
{
size_t next_time_to_execute;
if (with_backoff)
{
auto no_work_done_times = no_work_done_count.fetch_add(1, std::memory_order_relaxed);
next_time_to_execute = 1000 * (std::min(
sleep_settings.task_sleep_seconds_when_no_work_max,
sleep_settings.thread_sleep_seconds_if_nothing_to_do * std::pow(sleep_settings.task_sleep_seconds_when_no_work_multiplier, no_work_done_times))
+ getSleepRandomAdd());
}
else
{
size_t next_time_to_execute;
if (with_backoff)
{
auto no_work_done_times = no_work_done_count.fetch_add(1, std::memory_order_relaxed);
next_time_to_execute = 1000 * (std::min(
sleep_settings.task_sleep_seconds_when_no_work_max,
sleep_settings.thread_sleep_seconds_if_nothing_to_do * std::pow(sleep_settings.task_sleep_seconds_when_no_work_multiplier, no_work_done_times))
+ getSleepRandomAdd());
}
else
{
next_time_to_execute = 1000 * sleep_settings.thread_sleep_seconds_if_nothing_to_do;
}
scheduling_task->scheduleAfter(next_time_to_execute, false);
next_time_to_execute = 1000 * sleep_settings.thread_sleep_seconds_if_nothing_to_do;
}
scheduling_task->scheduleAfter(next_time_to_execute, false);
}
namespace
@ -105,42 +102,42 @@ try
/// Job done, decrement metric and reset no_work counter
CurrentMetrics::values[pool_config.tasks_metric]--;
/// Job done, new empty space in pool, schedule background task
scheduleTask(true);
runTaskWithoutDelay();
}
catch (...)
{
tryLogCurrentException(__PRETTY_FUNCTION__);
CurrentMetrics::values[pool_config.tasks_metric]--;
scheduleTask(false);
scheduleTask(/* with_backoff = */ true);
}
});
/// We've scheduled task in the background pool and when it will finish we will be triggered again. But this task can be
/// extremely long and we may have a lot of other small tasks to do, so we schedule ourselves here.
scheduleTask(true);
runTaskWithoutDelay();
}
catch (...)
{
/// With our Pool settings scheduleOrThrowOnError shouldn't throw exceptions, but for safety catch added here
tryLogCurrentException(__PRETTY_FUNCTION__);
CurrentMetrics::values[pool_config.tasks_metric]--;
scheduleTask(false);
scheduleTask(/* with_backoff = */ true);
}
}
else /// Pool is full and we have some work to do
{
scheduleTask(false, /* with_backoff = */ false);
scheduleTask(/* with_backoff = */ false);
}
}
else /// Nothing to do, no jobs
{
scheduleTask(false);
scheduleTask(/* with_backoff = */ true);
}
}
catch (...) /// Exception while we looking for a task, reschedule
{
tryLogCurrentException(__PRETTY_FUNCTION__);
scheduleTask(false);
scheduleTask(/* with_backoff = */ true);
}
void IBackgroundJobExecutor::start()

View File

@ -117,7 +117,9 @@ private:
/// Function that executes in background scheduling pool
void jobExecutingTask();
/// Recalculate timeouts when we have to check for a new job
void scheduleTask(bool job_done, bool with_backoff=false);
void scheduleTask(bool with_backoff);
/// Run background task as fast as possible and reset errors counter
void runTaskWithoutDelay();
/// Return random add for sleep in case of error
double getSleepRandomAdd();
};

View File

@ -867,13 +867,13 @@ void StorageBuffer::checkAlterIsPossible(const AlterCommands & commands, const S
}
}
std::optional<UInt64> StorageBuffer::totalRows() const
std::optional<UInt64> StorageBuffer::totalRows(const Settings & settings) const
{
std::optional<UInt64> underlying_rows;
auto underlying = DatabaseCatalog::instance().tryGetTable(destination_id, global_context);
if (underlying)
underlying_rows = underlying->totalRows();
underlying_rows = underlying->totalRows(settings);
if (!underlying_rows)
return underlying_rows;
@ -886,7 +886,7 @@ std::optional<UInt64> StorageBuffer::totalRows() const
return rows + *underlying_rows;
}
std::optional<UInt64> StorageBuffer::totalBytes() const
std::optional<UInt64> StorageBuffer::totalBytes(const Settings & /*settings*/) const
{
UInt64 bytes = 0;
for (const auto & buffer : buffers)

View File

@ -109,8 +109,8 @@ public:
/// The structure of the subordinate table is not checked and does not change.
void alter(const AlterCommands & params, const Context & context, TableLockHolder & table_lock_holder) override;
std::optional<UInt64> totalRows() const override;
std::optional<UInt64> totalBytes() const override;
std::optional<UInt64> totalRows(const Settings & settings) const override;
std::optional<UInt64> totalBytes(const Settings & settings) const override;
std::optional<UInt64> lifetimeRows() const override { return writes.rows; }
std::optional<UInt64> lifetimeBytes() const override { return writes.bytes; }

View File

@ -102,8 +102,8 @@ HashJoinPtr StorageJoin::getJoin(std::shared_ptr<TableJoin> analyzed_join) const
void StorageJoin::insertBlock(const Block & block) { join->addJoinedBlock(block, true); }
size_t StorageJoin::getSize() const { return join->getTotalRowCount(); }
std::optional<UInt64> StorageJoin::totalRows() const { return join->getTotalRowCount(); }
std::optional<UInt64> StorageJoin::totalBytes() const { return join->getTotalByteCount(); }
std::optional<UInt64> StorageJoin::totalRows(const Settings &) const { return join->getTotalRowCount(); }
std::optional<UInt64> StorageJoin::totalBytes(const Settings &) const { return join->getTotalByteCount(); }
void registerStorageJoin(StorageFactory & factory)

View File

@ -46,8 +46,8 @@ public:
size_t max_block_size,
unsigned num_streams) override;
std::optional<UInt64> totalRows() const override;
std::optional<UInt64> totalBytes() const override;
std::optional<UInt64> totalRows(const Settings & settings) const override;
std::optional<UInt64> totalBytes(const Settings & settings) const override;
private:
Block sample_block;

View File

@ -1,9 +1,11 @@
#include <cassert>
#include <Common/Exception.h>
#include <DataStreams/IBlockInputStream.h>
#include <Storages/StorageMemory.h>
#include <Interpreters/MutationsInterpreter.h>
#include <Storages/StorageFactory.h>
#include <Storages/StorageMemory.h>
#include <IO/WriteHelpers.h>
#include <Processors/Sources/SourceWithProgress.h>
@ -21,7 +23,7 @@ namespace ErrorCodes
class MemorySource : public SourceWithProgress
{
using InitializerFunc = std::function<void(BlocksList::const_iterator &, size_t &)>;
using InitializerFunc = std::function<void(BlocksList::const_iterator &, size_t &, std::shared_ptr<const BlocksList> &)>;
public:
/// Blocks are stored in std::list which may be appended in another thread.
/// We use pointer to the beginning of the list and its current size.
@ -34,11 +36,13 @@ public:
size_t num_blocks_,
const StorageMemory & storage,
const StorageMetadataPtr & metadata_snapshot,
InitializerFunc initializer_func_ = [](BlocksList::const_iterator &, size_t &) {})
std::shared_ptr<const BlocksList> data_,
InitializerFunc initializer_func_ = [](BlocksList::const_iterator &, size_t &, std::shared_ptr<const BlocksList> &) {})
: SourceWithProgress(metadata_snapshot->getSampleBlockForColumns(column_names_, storage.getVirtuals(), storage.getStorageID()))
, column_names(std::move(column_names_))
, current_it(first_)
, num_blocks(num_blocks_)
, data(data_)
, initializer_func(std::move(initializer_func_))
{
}
@ -50,7 +54,7 @@ protected:
{
if (!postponed_init_done)
{
initializer_func(current_it, num_blocks);
initializer_func(current_it, num_blocks, data);
postponed_init_done = true;
}
@ -77,6 +81,7 @@ private:
size_t num_blocks;
size_t current_block_idx = 0;
std::shared_ptr<const BlocksList> data;
bool postponed_init_done = false;
InitializerFunc initializer_func;
};
@ -102,7 +107,9 @@ public:
metadata_snapshot->check(block, true);
{
std::lock_guard lock(storage.mutex);
storage.data.push_back(block);
auto new_data = std::make_unique<BlocksList>(*(storage.data.get()));
new_data->push_back(block);
storage.data.set(std::move(new_data));
storage.total_size_bytes.fetch_add(size_bytes_diff, std::memory_order_relaxed);
storage.total_size_rows.fetch_add(size_rows_diff, std::memory_order_relaxed);
@ -116,7 +123,7 @@ private:
StorageMemory::StorageMemory(const StorageID & table_id_, ColumnsDescription columns_description_, ConstraintsDescription constraints_)
: IStorage(table_id_)
: IStorage(table_id_), data(std::make_unique<const BlocksList>())
{
StorageInMemoryMetadata storage_metadata;
storage_metadata.setColumns(std::move(columns_description_));
@ -146,30 +153,31 @@ Pipe StorageMemory::read(
/// set for IN or hash table for JOIN, which can't be done concurrently.
/// Since no other manipulation with data is done, multiple sources shouldn't give any profit.
return Pipe(
std::make_shared<MemorySource>(
column_names, data.end(), 0, *this, metadata_snapshot,
/// This hack is needed for global subqueries.
/// It allows to set up this Source for read AFTER Storage::read() has been called and just before actual reading
[this](BlocksList::const_iterator & current_it, size_t & num_blocks)
{
std::lock_guard guard(mutex);
current_it = data.begin();
num_blocks = data.size();
}
));
return Pipe(std::make_shared<MemorySource>(
column_names,
data.get()->end(),
0,
*this,
metadata_snapshot,
data.get(),
[this](BlocksList::const_iterator & current_it, size_t & num_blocks, std::shared_ptr<const BlocksList> & current_data)
{
current_data = data.get();
current_it = current_data->begin();
num_blocks = current_data->size();
}));
}
std::lock_guard lock(mutex);
auto current_data = data.get();
size_t size = data.size();
size_t size = current_data->size();
if (num_streams > size)
num_streams = size;
Pipes pipes;
BlocksList::const_iterator it = data.begin();
BlocksList::const_iterator it = current_data->begin();
size_t offset = 0;
for (size_t stream = 0; stream < num_streams; ++stream)
@ -179,7 +187,7 @@ Pipe StorageMemory::read(
assert(num_blocks > 0);
pipes.emplace_back(std::make_shared<MemorySource>(column_names, it, num_blocks, *this, metadata_snapshot));
pipes.emplace_back(std::make_shared<MemorySource>(column_names, it, num_blocks, *this, metadata_snapshot, current_data));
while (offset < next_offset)
{
@ -200,37 +208,99 @@ BlockOutputStreamPtr StorageMemory::write(const ASTPtr & /*query*/, const Storag
void StorageMemory::drop()
{
std::lock_guard lock(mutex);
data.clear();
data.set(std::make_unique<BlocksList>());
total_size_bytes.store(0, std::memory_order_relaxed);
total_size_rows.store(0, std::memory_order_relaxed);
}
static inline void updateBlockData(Block & old_block, const Block & new_block)
{
for (const auto & it : new_block)
{
auto col_name = it.name;
auto & col_with_type_name = old_block.getByName(col_name);
col_with_type_name.column = it.column;
}
}
void StorageMemory::mutate(const MutationCommands & commands, const Context & context)
{
std::lock_guard lock(mutex);
auto metadata_snapshot = getInMemoryMetadataPtr();
auto storage = getStorageID();
auto storage_ptr = DatabaseCatalog::instance().getTable(storage, context);
auto interpreter = std::make_unique<MutationsInterpreter>(storage_ptr, metadata_snapshot, commands, context, true);
auto in = interpreter->execute();
in->readPrefix();
BlocksList out;
Block block;
while ((block = in->read()))
{
out.push_back(block);
}
in->readSuffix();
std::unique_ptr<BlocksList> new_data;
// all column affected
if (interpreter->isAffectingAllColumns())
{
new_data = std::make_unique<BlocksList>(out);
}
else
{
/// just some of the column affected, we need update it with new column
new_data = std::make_unique<BlocksList>(*(data.get()));
auto data_it = new_data->begin();
auto out_it = out.begin();
while (data_it != new_data->end())
{
/// Mutation does not change the number of blocks
assert(out_it != out.end());
updateBlockData(*data_it, *out_it);
++data_it;
++out_it;
}
assert(out_it == out.end());
}
size_t rows = 0;
size_t bytes = 0;
for (const auto & buffer : *new_data)
{
rows += buffer.rows();
bytes += buffer.bytes();
}
total_size_bytes.store(rows, std::memory_order_relaxed);
total_size_rows.store(bytes, std::memory_order_relaxed);
data.set(std::move(new_data));
}
void StorageMemory::truncate(
const ASTPtr &, const StorageMetadataPtr &, const Context &, TableExclusiveLockHolder &)
{
std::lock_guard lock(mutex);
data.clear();
data.set(std::make_unique<BlocksList>());
total_size_bytes.store(0, std::memory_order_relaxed);
total_size_rows.store(0, std::memory_order_relaxed);
}
std::optional<UInt64> StorageMemory::totalRows() const
std::optional<UInt64> StorageMemory::totalRows(const Settings &) const
{
/// All modifications of these counters are done under mutex which automatically guarantees synchronization/consistency
/// When run concurrently we are fine with any value: "before" or "after"
return total_size_rows.load(std::memory_order_relaxed);
}
std::optional<UInt64> StorageMemory::totalBytes() const
std::optional<UInt64> StorageMemory::totalBytes(const Settings &) const
{
return total_size_bytes.load(std::memory_order_relaxed);
}
void registerStorageMemory(StorageFactory & factory)
{
factory.registerStorage("Memory", [](const StorageFactory::Arguments & args)

View File

@ -10,6 +10,7 @@
#include <Storages/IStorage.h>
#include <DataStreams/IBlockOutputStream.h>
#include <Common/MultiVersion.h>
namespace DB
{
@ -27,7 +28,7 @@ friend struct ext::shared_ptr_helper<StorageMemory>;
public:
String getName() const override { return "Memory"; }
size_t getSize() const { return data.size(); }
size_t getSize() const { return data.get()->size(); }
Pipe read(
const Names & column_names,
@ -44,10 +45,12 @@ public:
void drop() override;
void mutate(const MutationCommands & commands, const Context & context) override;
void truncate(const ASTPtr &, const StorageMetadataPtr &, const Context &, TableExclusiveLockHolder &) override;
std::optional<UInt64> totalRows() const override;
std::optional<UInt64> totalBytes() const override;
std::optional<UInt64> totalRows(const Settings &) const override;
std::optional<UInt64> totalBytes(const Settings &) const override;
/** Delays initialization of StorageMemory::read() until the first read is actually happen.
* Usually, fore code like this:
@ -87,8 +90,8 @@ public:
void delayReadForGlobalSubqueries() { delay_read_for_global_subqueries = true; }
private:
/// The data itself. `list` - so that when inserted to the end, the existing iterators are not invalidated.
BlocksList data;
/// MultiVersion data storage, so that we can copy the list of blocks to readers.
MultiVersion<BlocksList> data;
mutable std::mutex mutex;

View File

@ -202,7 +202,7 @@ Pipe StorageMergeTree::read(
return plan.convertToPipe();
}
std::optional<UInt64> StorageMergeTree::totalRows() const
std::optional<UInt64> StorageMergeTree::totalRows(const Settings &) const
{
return getTotalActiveSizeInRows();
}
@ -223,7 +223,7 @@ std::optional<UInt64> StorageMergeTree::totalRowsByPartitionPredicate(const Sele
return res;
}
std::optional<UInt64> StorageMergeTree::totalBytes() const
std::optional<UInt64> StorageMergeTree::totalBytes(const Settings &) const
{
return getTotalActiveSizeInBytes();
}

View File

@ -56,9 +56,9 @@ public:
size_t max_block_size,
unsigned num_streams) override;
std::optional<UInt64> totalRows() const override;
std::optional<UInt64> totalRows(const Settings &) const override;
std::optional<UInt64> totalRowsByPartitionPredicate(const SelectQueryInfo &, const Context &) const override;
std::optional<UInt64> totalBytes() const override;
std::optional<UInt64> totalBytes(const Settings &) const override;
BlockOutputStreamPtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, const Context & context) override;

View File

@ -45,11 +45,11 @@ public:
void alter(const AlterCommands & params, const Context & context, TableLockHolder & table_lock_holder) override;
std::optional<UInt64> totalRows() const override
std::optional<UInt64> totalRows(const Settings &) const override
{
return {0};
}
std::optional<UInt64> totalBytes() const override
std::optional<UInt64> totalBytes(const Settings &) const override
{
return {0};
}

View File

@ -148,8 +148,8 @@ public:
bool storesDataOnDisk() const override { return getNested()->storesDataOnDisk(); }
Strings getDataPaths() const override { return getNested()->getDataPaths(); }
StoragePolicyPtr getStoragePolicy() const override { return getNested()->getStoragePolicy(); }
std::optional<UInt64> totalRows() const override { return getNested()->totalRows(); }
std::optional<UInt64> totalBytes() const override { return getNested()->totalBytes(); }
std::optional<UInt64> totalRows(const Settings & settings) const override { return getNested()->totalRows(settings); }
std::optional<UInt64> totalBytes(const Settings & settings) const override { return getNested()->totalBytes(settings); }
std::optional<UInt64> lifetimeRows() const override { return getNested()->lifetimeRows(); }
std::optional<UInt64> lifetimeBytes() const override { return getNested()->lifetimeBytes(); }

View File

@ -3742,27 +3742,37 @@ Pipe StorageReplicatedMergeTree::read(
template <class Func>
void StorageReplicatedMergeTree::foreachCommittedParts(const Func & func) const
void StorageReplicatedMergeTree::foreachCommittedParts(Func && func, bool select_sequential_consistency) const
{
auto max_added_blocks = getMaxAddedBlocks();
std::optional<ReplicatedMergeTreeQuorumAddedParts::PartitionIdToMaxBlock> max_added_blocks = {};
/**
* Synchronously go to ZooKeeper when select_sequential_consistency enabled
*/
if (select_sequential_consistency)
max_added_blocks = getMaxAddedBlocks();
auto lock = lockParts();
for (const auto & part : getDataPartsStateRange(DataPartState::Committed))
{
if (part->isEmpty())
continue;
auto blocks_iterator = max_added_blocks.find(part->info.partition_id);
if (blocks_iterator == max_added_blocks.end() || part->info.max_block > blocks_iterator->second)
continue;
if (max_added_blocks)
{
auto blocks_iterator = max_added_blocks->find(part->info.partition_id);
if (blocks_iterator == max_added_blocks->end() || part->info.max_block > blocks_iterator->second)
continue;
}
func(part);
}
}
std::optional<UInt64> StorageReplicatedMergeTree::totalRows() const
std::optional<UInt64> StorageReplicatedMergeTree::totalRows(const Settings & settings) const
{
UInt64 res = 0;
foreachCommittedParts([&res](auto & part) { res += part->rows_count; });
foreachCommittedParts([&res](auto & part) { res += part->rows_count; }, settings.select_sequential_consistency);
return res;
}
@ -3777,14 +3787,14 @@ std::optional<UInt64> StorageReplicatedMergeTree::totalRowsByPartitionPredicate(
{
if (!partition_pruner.canBePruned(part))
res += part->rows_count;
});
}, context.getSettingsRef().select_sequential_consistency);
return res;
}
std::optional<UInt64> StorageReplicatedMergeTree::totalBytes() const
std::optional<UInt64> StorageReplicatedMergeTree::totalBytes(const Settings & settings) const
{
UInt64 res = 0;
foreachCommittedParts([&res](auto & part) { res += part->getBytesOnDisk(); });
foreachCommittedParts([&res](auto & part) { res += part->getBytesOnDisk(); }, settings.select_sequential_consistency);
return res;
}

View File

@ -107,9 +107,9 @@ public:
size_t max_block_size,
unsigned num_streams) override;
std::optional<UInt64> totalRows() const override;
std::optional<UInt64> totalRows(const Settings & settings) const override;
std::optional<UInt64> totalRowsByPartitionPredicate(const SelectQueryInfo & query_info, const Context & context) const override;
std::optional<UInt64> totalBytes() const override;
std::optional<UInt64> totalBytes(const Settings & settings) const override;
BlockOutputStreamPtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, const Context & context) override;
@ -326,7 +326,7 @@ private:
const size_t replicated_fetches_pool_size;
template <class Func>
void foreachCommittedParts(const Func & func) const;
void foreachCommittedParts(Func && func, bool select_sequential_consistency) const;
/** Creates the minimum set of nodes in ZooKeeper and create first replica.
* Returns true if was created, false if exists.

View File

@ -153,8 +153,8 @@ void StorageSet::insertBlock(const Block & block) { set->insertFromBlock(block);
void StorageSet::finishInsert() { set->finishInsert(); }
size_t StorageSet::getSize() const { return set->getTotalRowCount(); }
std::optional<UInt64> StorageSet::totalRows() const { return set->getTotalRowCount(); }
std::optional<UInt64> StorageSet::totalBytes() const { return set->getTotalByteCount(); }
std::optional<UInt64> StorageSet::totalRows(const Settings &) const { return set->getTotalRowCount(); }
std::optional<UInt64> StorageSet::totalBytes(const Settings &) const { return set->getTotalByteCount(); }
void StorageSet::truncate(const ASTPtr &, const StorageMetadataPtr & metadata_snapshot, const Context &, TableExclusiveLockHolder &)
{

View File

@ -73,8 +73,8 @@ public:
void truncate(const ASTPtr &, const StorageMetadataPtr & metadata_snapshot, const Context &, TableExclusiveLockHolder &) override;
std::optional<UInt64> totalRows() const override;
std::optional<UInt64> totalBytes() const override;
std::optional<UInt64> totalRows(const Settings & settings) const override;
std::optional<UInt64> totalBytes(const Settings & settings) const override;
private:
SetPtr set;

View File

@ -429,7 +429,7 @@ protected:
if (columns_mask[src_index++])
{
assert(table != nullptr);
auto total_rows = table->totalRows();
auto total_rows = table->totalRows(context.getSettingsRef());
if (total_rows)
res_columns[res_index++]->insert(*total_rows);
else
@ -439,7 +439,7 @@ protected:
if (columns_mask[src_index++])
{
assert(table != nullptr);
auto total_bytes = table->totalBytes();
auto total_bytes = table->totalBytes(context.getSettingsRef());
if (total_bytes)
res_columns[res_index++]->insert(*total_bytes);
else

View File

@ -112,7 +112,6 @@ def get_db_engine(args):
return "" # Will use default engine
def run_single_test(args, ext, server_logs_level, client_options, case_file, stdout_file, stderr_file):
# print(client_options)
if args.database:
@ -149,10 +148,12 @@ def run_single_test(args, ext, server_logs_level, client_options, case_file, std
pattern = "{client} --send_logs_level={logs_level} --testmode --multiquery {options} < " + pattern
command = pattern.format(**params)
#print(command)
# print(command)
proc = Popen(command, shell=True, env=os.environ)
start_time = datetime.now()
while (datetime.now() - start_time).total_seconds() < args.timeout and proc.poll() is None:
sleep(0.01)
@ -317,6 +318,7 @@ def run_tests_array(all_tests_with_params):
stderr_file = os.path.join(suite_tmp_dir, name) + '.stderr'
proc, stdout, stderr, total_time = run_single_test(args, ext, server_logs_level, client_options, case_file, stdout_file, stderr_file)
if proc.returncode is None:
try:
proc.kill()
@ -347,7 +349,7 @@ def run_tests_array(all_tests_with_params):
if stderr:
print(stderr)
# Stop on fatal errors like segmentation fault. They are send to client via logs.
# Stop on fatal errors like segmentation fault. They are sent to client via logs.
if ' <Fatal> ' in stderr:
SERVER_DIED = True
@ -486,7 +488,7 @@ def collect_build_flags(client):
elif b'-fsanitize=memory' in stdout:
result.append(BuildFlags.MEMORY)
else:
raise Exception("Cannot get inforamtion about build from server errorcode {}, stderr {}".format(clickhouse_proc.returncode, stderr))
raise Exception("Cannot get information about build from server errorcode {}, stderr {}".format(clickhouse_proc.returncode, stderr))
clickhouse_proc = Popen(shlex.split(client), stdin=PIPE, stdout=PIPE, stderr=PIPE)
(stdout, stderr) = clickhouse_proc.communicate(b"SELECT value FROM system.build_options WHERE name = 'BUILD_TYPE'")
@ -497,7 +499,7 @@ def collect_build_flags(client):
elif b'RelWithDebInfo' in stdout or b'Release' in stdout:
result.append(BuildFlags.RELEASE)
else:
raise Exception("Cannot get inforamtion about build from server errorcode {}, stderr {}".format(clickhouse_proc.returncode, stderr))
raise Exception("Cannot get information about build from server errorcode {}, stderr {}".format(clickhouse_proc.returncode, stderr))
clickhouse_proc = Popen(shlex.split(client), stdin=PIPE, stdout=PIPE, stderr=PIPE)
(stdout, stderr) = clickhouse_proc.communicate(b"SELECT value FROM system.build_options WHERE name = 'UNBUNDLED'")
@ -506,7 +508,7 @@ def collect_build_flags(client):
if b'ON' in stdout or b'1' in stdout:
result.append(BuildFlags.UNBUNDLED)
else:
raise Exception("Cannot get inforamtion about build from server errorcode {}, stderr {}".format(clickhouse_proc.returncode, stderr))
raise Exception("Cannot get information about build from server errorcode {}, stderr {}".format(clickhouse_proc.returncode, stderr))
clickhouse_proc = Popen(shlex.split(client), stdin=PIPE, stdout=PIPE, stderr=PIPE)
(stdout, stderr) = clickhouse_proc.communicate(b"SELECT value FROM system.settings WHERE name = 'default_database_engine'")
@ -515,7 +517,7 @@ def collect_build_flags(client):
if b'Ordinary' in stdout:
result.append(BuildFlags.DATABASE_ORDINARY)
else:
raise Exception("Cannot get inforamtion about build from server errorcode {}, stderr {}".format(clickhouse_proc.returncode, stderr))
raise Exception("Cannot get information about build from server errorcode {}, stderr {}".format(clickhouse_proc.returncode, stderr))
clickhouse_proc = Popen(shlex.split(client), stdin=PIPE, stdout=PIPE, stderr=PIPE)
(stdout, stderr) = clickhouse_proc.communicate(b"SELECT value FROM system.merge_tree_settings WHERE name = 'min_bytes_for_wide_part'")
@ -544,8 +546,12 @@ def main(args):
return stdout.startswith(b'1')
if not check_server_started(args.client, args.server_check_retries):
raise Exception("clickhouse-server is not responding. Cannot execute 'SELECT 1' query.")
raise Exception(
"Server is not responding. Cannot execute 'SELECT 1' query. \
Note: if you are using unbundled mode, you also have to specify -c option.")
build_flags = collect_build_flags(args.client)
if args.use_skip_list:
tests_to_skip_from_list = collect_tests_to_skip(args.skip_list_path, build_flags)
else:
@ -790,8 +796,13 @@ if __name__ == '__main__':
parser=ArgumentParser(description='ClickHouse functional tests')
parser.add_argument('-q', '--queries', help='Path to queries dir')
parser.add_argument('--tmp', help='Path to tmp dir')
parser.add_argument('-b', '--binary', default='clickhouse', help='Path to clickhouse binary or name of binary in PATH')
parser.add_argument('-c', '--client', help='Client program')
parser.add_argument('-b', '--binary', default='clickhouse',
help='Path to clickhouse (if bundled, clickhouse-server otherwise) binary or name of binary in PATH')
parser.add_argument('-c', '--client',
help='Path to clickhouse-client (if unbundled, useless otherwise) binary of name of binary in PATH')
parser.add_argument('--extract_from_config', help='extract-from-config program')
parser.add_argument('--configclient', help='Client config (if you use not default ports)')
parser.add_argument('--configserver', default= '/etc/clickhouse-server/config.xml', help='Preprocessed server config')
@ -865,10 +876,14 @@ if __name__ == '__main__':
if args.client is None:
if find_binary(args.binary + '-client'):
args.client = args.binary + '-client'
print("Using " + args.client + " as client program (expecting unbundled mode)")
elif find_binary(args.binary):
args.client = args.binary + ' client'
print("Using " + args.client + " as client program (expecting bundled mode)")
else:
print("No 'clickhouse' binary found in PATH", file=sys.stderr)
print("No 'clickhouse' or 'clickhouse-client' client binary found", file=sys.stderr)
parser.print_help()
exit(1)

View File

@ -344,8 +344,6 @@
"TABLE"
"TABLES"
"TEMPORARY"
"timeSeriesGroupRateSum"
"timeSeriesGroupSum"
"TIMESTAMP"
"TIMESTAMP_ADD"
"TIMESTAMPADD"

View File

@ -13,7 +13,7 @@
<host>zoo3</host>
<port>2181</port>
</node>
<session_timeout_ms>2000</session_timeout_ms>
<session_timeout_ms>20000</session_timeout_ms>
</zookeeper>
</yandex>

View File

@ -62,14 +62,16 @@ def test_reload_zookeeper(start_cluster):
## stop all zookeepers, table will be readonly
cluster.stop_zookeeper_nodes(["zoo1", "zoo2", "zoo3"])
node.query("SELECT COUNT() FROM test_table")
with pytest.raises(QueryRuntimeException):
node.query("SELECT COUNT() FROM test_table")
node.query("SELECT COUNT() FROM test_table", settings={"select_sequential_consistency" : 1})
## start zoo2, zoo3, table will be readonly too, because it only connect to zoo1
cluster.start_zookeeper_nodes(["zoo2", "zoo3"])
wait_zookeeper_node_to_start(["zoo2", "zoo3"])
node.query("SELECT COUNT() FROM test_table")
with pytest.raises(QueryRuntimeException):
node.query("SELECT COUNT() FROM test_table")
node.query("SELECT COUNT() FROM test_table", settings={"select_sequential_consistency" : 1})
## set config to zoo2, server will be normal
new_config = """

View File

@ -0,0 +1,38 @@
<test>
<settings>
<allow_experimental_bigint_types>1</allow_experimental_bigint_types>
<max_threads>1</max_threads>
<max_insert_threads>8</max_insert_threads>
</settings>
<preconditions>
<table_exists>hits_100m_single</table_exists>
</preconditions>
<create_query>CREATE TABLE perf_avg(
num UInt64,
num_u Decimal256(75) DEFAULT toDecimal256(num / 400000, 75),
num_f Float64 DEFAULT num / 100
) ENGINE = MergeTree() ORDER BY num
</create_query>
<fill_query>
INSERT INTO perf_avg(num)
SELECT toUInt64(UserID / (WatchID + 1) * 1000000)
FROM hits_100m_single
LIMIT 50000000
</fill_query>
<query>SELECT avg(num) FROM perf_avg FORMAT Null</query>
<query>SELECT avg(2 * num) FROM perf_avg FORMAT Null</query>
<query>SELECT avg(num_u) FROM perf_avg FORMAT Null</query>
<query>SELECT avg(num_f) FROM perf_avg FORMAT Null</query>
<query>SELECT avgWeighted(num_f, num) FROM perf_avg FORMAT Null</query>
<query>SELECT avgWeighted(num_f, num_f) FROM perf_avg FORMAT Null</query>
<query>SELECT avgWeighted(num_f, num_u) FROM perf_avg FORMAT Null</query>
<query>SELECT avgWeighted(num_u, num_f) FROM perf_avg FORMAT Null</query>
<query>SELECT avgWeighted(num_u, num) FROM perf_avg FORMAT Null</query>
<query>SELECT avgWeighted(num_u, num_u) FROM perf_avg FORMAT Null</query>
<drop_query>DROP TABLE IF EXISTS perf_avg</drop_query>
</test>

View File

@ -13,7 +13,6 @@
<value>toMonday</value>
<value>toRelativeDayNum</value>
<value>toYYYYMMDDhhmmss</value>
<value>toUnixTimestamp</value>
</values>
</substitution>
<substitution>
@ -33,8 +32,15 @@
</substitution>
</substitutions>
<!-- date_transform -->
<query>SELECT count() FROM numbers(100000000) WHERE NOT ignore(toDateTime('2017-01-01 00:00:00') + number % 100000000 + rand() % 100000 AS t, {date_transform}(t, '{time_zone}'))</query>
<query>SELECT count() FROM numbers(100000000) WHERE NOT ignore(toDate('2017-01-01') + number % 1000 + rand() % 10 AS t, {date_transform}(t))</query>
<!-- toUnixTimestamp() -->
<query>SELECT count() FROM numbers(100000000) WHERE NOT ignore(toDateTime('2017-01-01 00:00:00') + number % 100000000 + rand() % 100000 AS t, toUnixTimestamp(t, '{time_zone}'))</query>
<!-- toUnixTimestamp(Date()) is prohibit, wrap Date() with toUInt16() to overcome -->
<query>SELECT count() FROM numbers(100000000) WHERE NOT ignore(toDate('2017-01-01') + number % 1000 + rand() % 10 AS t, toUnixTimestamp(toUInt16(t)))</query>
<query>SELECT count() FROM numbers(100000000) WHERE NOT ignore(toDateTime('2017-01-01 00:00:00') + number % 100000000 + rand() % 100000 AS t, {binary_function}(t, 1))</query>
<query>SELECT count() FROM numbers(100000000) WHERE NOT ignore(toDateTime('2017-01-01 00:00:00') + number % 100000000 + rand() % 100000 AS t, toStartOfInterval(t, INTERVAL 1 month))</query>
<query>SELECT count() FROM numbers(100000000) WHERE NOT ignore(toDateTime('2017-01-01 00:00:00') + number % 100000000 + rand() % 100000 AS t, date_trunc('month', t))</query>

View File

@ -0,0 +1,89 @@
<test>
<create_query>
CREATE TABLE table_ip_trie
(
ip String,
ver UInt8,
val Float32
) ENGINE = TinyLog
</create_query>
<create_query>
INSERT INTO table_ip_trie
SELECT
IPv4NumToString(ipv4) || '/' || toString(rand() % 17 + 16) as ip,
4 as ver,
val
FROM generateRandom('ipv4 UInt32, val Float32', 0, 30, 30)
LIMIT 200000
</create_query>
<create_query>
INSERT INTO table_ip_trie
SELECT
IPv6NumToString(ipv6) || '/' || toString(rand() % 65 + 64) as ip,
6 as ver,
val
FROM generateRandom('ipv6 FixedString(16), val Float32', 0, 30, 30)
LIMIT 2500000
</create_query>
<create_query>
CREATE DICTIONARY dict_ip_trie
(
ip String,
ver UInt8,
val Float32
)
PRIMARY KEY ip
SOURCE(CLICKHOUSE(DB 'default' TABLE 'table_ip_trie'))
LAYOUT(IP_TRIE())
LIFETIME(300)
</create_query>
<create_query>
CREATE TABLE dict_ip_trie_table
(
`ip` String,
`ver` UInt8,
`val` Float32
) ENGINE = Dictionary(default.dict_ip_trie)
</create_query>
<create_query>
CREATE TABLE table_ip_from_dict (`ip` String, `ver` UInt8) ENGINE = TinyLog
</create_query>
<create_query>
INSERT INTO table_ip_from_dict
SELECT splitByChar('/', ip )[1] as ip, ver FROM dict_ip_trie_table
</create_query>
<query>
SELECT dictGetFloat32('default.dict_ip_trie', 'val', tuple(rand32()))
FROM numbers(500000) FORMAT Null
</query>
<query>
SELECT dictGetFloat32('default.dict_ip_trie', 'val', tuple(randomFixedString(16)))
FROM numbers(500000) FORMAT Null
</query>
<query>
SELECT dictGetFloat32('default.dict_ip_trie', 'val', tuple(IPv6StringToNum(ip)))
FROM table_ip_from_dict
WHERE ver == 4
LIMIT 500000 FORMAT Null
</query>
<query>
SELECT dictGetFloat32('default.dict_ip_trie', 'val', tuple(IPv6StringToNum(ip)))
FROM table_ip_from_dict
WHERE ver == 6
LIMIT 500000 FORMAT Null
</query>
<drop_query>DROP DICTIONARY IF EXISTS default.dict_ip_trie</drop_query>
<drop_query>DROP TABLE IF EXISTS table_ip_trie</drop_query>
<drop_query>DROP TABLE IF EXISTS dict_ip_trie_table</drop_query>
<drop_query>DROP TABLE IF EXISTS table_ip_from_dict</drop_query>
</test>

View File

@ -5,9 +5,6 @@
-1275.0000 -424.99999983 -255.00000000 -1275.0000 -424.99999983 -255.00000000
101.0000 101.00000000 101.00000000 101.0000 101.00000000 101.00000000
-101.0000 -101.00000000 -101.00000000 -101.0000 -101.00000000 -101.00000000
0.0000 0.00000000 0.00000000
25.5000 8.49999999 5.10000000
-25.5000 -8.49999999 -5.10000000
(101,101,101) (101,101,101) (101,101,101) (101,101,101) (102,100,101)
5 5 5
10 10 10

View File

@ -20,10 +20,6 @@ SELECT sum(a), sum(b), sum(c), sumWithOverflow(a), sumWithOverflow(b), sumWithOv
SELECT sum(a+1), sum(b+1), sum(c+1), sumWithOverflow(a+1), sumWithOverflow(b+1), sumWithOverflow(c+1) FROM decimal;
SELECT sum(a-1), sum(b-1), sum(c-1), sumWithOverflow(a-1), sumWithOverflow(b-1), sumWithOverflow(c-1) FROM decimal;
SELECT avg(a), avg(b), avg(c) FROM decimal;
SELECT avg(a), avg(b), avg(c) FROM decimal WHERE a > 0;
SELECT avg(a), avg(b), avg(c) FROM decimal WHERE a < 0;
SELECT (uniq(a), uniq(b), uniq(c)),
(uniqCombined(a), uniqCombined(b), uniqCombined(c)),
(uniqCombined(17)(a), uniqCombined(17)(b), uniqCombined(17)(c)),

View File

@ -5,9 +5,6 @@
0.0000 0.0000000 0.00000000 0.0000 0.0000000 0.00000000
0.0000 0.0000000 0.00000000 0.0000 0.0000000 0.00000000
0.0000 0.0000000 0.00000000 0.0000 0.0000000 0.00000000
0.0000 0.0000000 0.00000000 Decimal(9, 4) Decimal(18, 7) Decimal(38, 8)
0.0000 0.0000000 0.00000000 Decimal(9, 4) Decimal(18, 7) Decimal(38, 8)
0.0000 0.0000000 0.00000000 Decimal(9, 4) Decimal(18, 7) Decimal(38, 8)
(0,0,0) (0,0,0) (0,0,0) (0,0,0) (0,0,0)
0 0 0
0 0 0

View File

@ -16,10 +16,6 @@ SELECT sum(a), sum(b), sum(c), sumWithOverflow(a), sumWithOverflow(b), sumWithOv
SELECT sum(a+1), sum(b+1), sum(c+1), sumWithOverflow(a+1), sumWithOverflow(b+1), sumWithOverflow(c+1) FROM decimal;
SELECT sum(a-1), sum(b-1), sum(c-1), sumWithOverflow(a-1), sumWithOverflow(b-1), sumWithOverflow(c-1) FROM decimal;
SELECT avg(a) as aa, avg(b) as ab, avg(c) as ac, toTypeName(aa), toTypeName(ab),toTypeName(ac) FROM decimal;
SELECT avg(a) as aa, avg(b) as ab, avg(c) as ac, toTypeName(aa), toTypeName(ab),toTypeName(ac) FROM decimal WHERE a > 0;
SELECT avg(a) as aa, avg(b) as ab, avg(c) as ac, toTypeName(aa), toTypeName(ab),toTypeName(ac) FROM decimal WHERE a < 0;
SELECT (uniq(a), uniq(b), uniq(c)),
(uniqCombined(a), uniqCombined(b), uniqCombined(c)),
(uniqCombined(17)(a), uniqCombined(17)(b), uniqCombined(17)(c)),

View File

@ -1,2 +0,0 @@
[(2,0.2),(3,0.8999999999999999),(7,2.0999999999999996),(8,2.4),(12,3.5999999999999996),(17,5.1000000000000005),(18,5.4),(24,7.199999999999999),(25,2.5)]
[(2,0),(3,0.09999999999999999),(7,0.3),(8,0.30000000000000004),(12,0.29999999999999993),(17,0.30000000000000004),(18,0.30000000000000004),(24,0.29999999999999993),(25,0.1)]

View File

@ -1,10 +0,0 @@
drop table if exists tsgroupsum_test;
create table tsgroupsum_test (uid UInt64, ts Int64, value Float64) engine=Memory;
insert into tsgroupsum_test values (1,2,0.2),(1,7,0.7),(1,12,1.2),(1,17,1.7),(1,25,2.5);
insert into tsgroupsum_test values (2,3,0.6),(2,8,1.6),(2,12,2.4),(2,18,3.6),(2,24,4.8);
select timeSeriesGroupSum(uid, ts, value) from (select * from tsgroupsum_test order by ts asc);
select timeSeriesGroupRateSum(uid, ts, value) from (select * from tsgroupsum_test order by ts asc);
drop table tsgroupsum_test;

View File

@ -4,6 +4,6 @@
[499500.00]
[499500.00000]
[499500.0000000000]
1545081300 [('ed87e57c-9331-462a-80b4-9f0c005e88c8',0.4400)]
1545081300 [('ed87e57c-9331-462a-80b4-9f0c005e88c8',0.44)]
4341757 5657967 2018-11-01 16:47:46 txt 321.380000000000 315.080000000000 0.000000000000 2018-11-02 00:00:00
4360430 5681495 2018-11-02 09:00:07 txt 274.350000000000 268.970000000000 0.000000000000 2018-11-02 00:00:00

View File

@ -53,7 +53,8 @@ Code: 43
"UInt8",11
------------------------------------------
SELECT toUnixTimestamp(N)
"UInt32",18155
Code: 44
"UInt32",1568650811
"UInt32",1568650811
------------------------------------------

View File

@ -8,9 +8,6 @@
0.42
0.46
0
***ip trie dict***
17501
NP
***hierarchy dict***
Moscow
[3,2,1,10000]

View File

@ -72,33 +72,6 @@ SELECT dictGetFloat64('database_for_dict.dict2', 'Tax', toUInt64(1), toDateTime(
SELECT dictGetFloat64('database_for_dict.dict2', 'Tax', toUInt64(2), toDateTime('2019-05-29 00:00:00'));
SELECT dictGetFloat64('database_for_dict.dict2', 'Tax', toUInt64(2), toDateTime('2019-05-31 00:00:00'));
SELECT '***ip trie dict***';
CREATE TABLE database_for_dict.table_ip_trie
(
prefix String,
asn UInt32,
cca2 String
)
engine = TinyLog;
INSERT INTO database_for_dict.table_ip_trie VALUES ('202.79.32.0/20', 17501, 'NP'), ('2620:0:870::/48', 3856, 'US'), ('2a02:6b8:1::/48', 13238, 'RU'), ('2001:db8::/32', 65536, 'ZZ');
CREATE DICTIONARY database_for_dict.dict_ip_trie
(
prefix String,
asn UInt32,
cca2 String
)
PRIMARY KEY prefix
SOURCE(CLICKHOUSE(host 'localhost' port 9000 user 'default' db 'database_for_dict' table 'table_ip_trie'))
LAYOUT(IP_TRIE())
LIFETIME(MIN 10 MAX 100);
SELECT dictGetUInt32('database_for_dict.dict_ip_trie', 'asn', tuple(IPv4StringToNum('202.79.32.0')));
SELECT dictGetString('database_for_dict.dict_ip_trie', 'cca2', tuple(IPv4StringToNum('202.79.32.0')));
SELECT '***hierarchy dict***';
CREATE TABLE database_for_dict.table_with_hierarchy

View File

@ -41,7 +41,7 @@ nan
\N
\N
\N
0.00
0
\N
0
\N

View File

@ -0,0 +1,425 @@
***ipv4 trie dict***
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
***ipv4 trie dict mask***
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
***ipv4 trie dict pt2***
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
***ipv6 trie dict***
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
***ipv6 trie dict mask***
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1

Some files were not shown because too many files have changed in this diff Show More