1. GraphiteMergeTree is added.

2. Merge remote-tracking branch 'upstream/master'.
This commit is contained in:
BayoNet 2017-05-16 19:54:45 +03:00
commit 2d8df96f7e
329 changed files with 7551 additions and 3432 deletions

View File

@ -4,7 +4,7 @@ macro(add_glob cur_list)
endmacro()
macro(add_headers_and_sources prefix common_path)
add_glob(${prefix}_headers RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} ${common_path}/*.h ${common_path}/*.inl)
add_glob(${prefix}_headers RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} ${common_path}/*.h)
add_glob(${prefix}_sources ${common_path}/*.cpp ${common_path}/*.h)
endmacro()

View File

@ -31,6 +31,7 @@ endif ()
add_subdirectory (libcityhash)
add_subdirectory (libfarmhash)
add_subdirectory (libmetrohash)
add_subdirectory (libbtrie)
if (USE_INTERNAL_ZLIB_LIBRARY)
add_subdirectory (libzlib-ng)

View File

@ -0,0 +1,6 @@
# Build the bundled libbtrie (bit-trie for IP longest-prefix match) as a
# static library; the header is listed so IDEs index it.
include_directories (BEFORE include)
add_library (btrie
src/btrie.c
include/btrie.h
)

23
contrib/libbtrie/LICENSE Normal file
View File

@ -0,0 +1,23 @@
Copyright (c) 2013, CobbLiu
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
Redistributions in binary form must reproduce the above copyright notice, this
list of conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@ -0,0 +1,155 @@
#pragma once

#if defined (__cplusplus)
extern "C" {
#endif

#include <stdlib.h>
#include <stdint.h>

/**
 * In btrie, each node represents one bit of an IP prefix tree.
 * Left child means bit 0, right child means bit 1.
 */

/* Sentinel stored in nodes that carry no value.
 * Parenthesized so the macro expands safely inside larger expressions. */
#define BTRIE_NULL ((uintptr_t) -1)

/* Maximum number of memory-pool pages a trie may allocate.
 * Parenthesized: the original bare `1024 * 16` would mis-expand in
 * expressions such as `x % MAX_PAGES`. */
#define MAX_PAGES (1024 * 16)

typedef struct btrie_node_s btrie_node_t;

struct btrie_node_s {
    btrie_node_t *right;   /* child for bit 1 */
    btrie_node_t *left;    /* child for bit 0 */
    btrie_node_t *parent;
    uintptr_t     value;   /* BTRIE_NULL when no prefix ends here */
};

typedef struct btrie_s {
    btrie_node_t *root;
    btrie_node_t *free;    /* free list of recycled nodes */
    char         *start;   /* bump pointer into the current pool page */
    size_t        size;    /* bytes remaining in the current pool page */

    /*
     * Memory pool.
     * Nodes are carved out of whole pages, so destruction only needs
     * to free the pages, never individual nodes.
     */
    char  *pools[MAX_PAGES];
    size_t len;            /* number of pages allocated so far */
} btrie_t;

/**
 * Create an empty btrie.
 *
 * @Return:
 *   The created ip radix tree, or NULL if creation failed.
 */
btrie_t *btrie_create(void);

/**
 * Destroy the ip radix tree and release all of its memory.
 *
 * @Return:
 *   OK (0) if deletion succeeded.
 */
int btrie_destroy(btrie_t *tree);

/**
 * Count the nodes in the radix tree.
 */
size_t btrie_count(btrie_t *tree);

/**
 * Return the number of bytes allocated for the tree's node pool.
 */
size_t btrie_allocated(btrie_t *tree);

/**
 * Add an ipv4 prefix into the btrie.
 *
 * @Args:
 *   key:   ip address (host byte order)
 *   mask:  key's netmask
 *   value: value of this IP; may be any uintptr_t except BTRIE_NULL.
 *
 * @Return:
 *   0 for success.
 *   -1 for failure (duplicate prefix or out of memory).
 */
int btrie_insert(btrie_t *tree, uint32_t key, uint32_t mask,
    uintptr_t value);

/**
 * Delete an ipv4 prefix from the btrie.
 *
 * @Return:
 *   0 for success.
 *   -1 for failure (prefix not present).
 */
int btrie_delete(btrie_t *tree, uint32_t key, uint32_t mask);

/**
 * Longest-prefix lookup of an ipv4 address.
 *
 * @Return:
 *   The value of the longest matching prefix.
 *   BTRIE_NULL if no prefix matches.
 */
uintptr_t btrie_find(btrie_t *tree, uint32_t key);

/**
 * Add an ipv6 prefix into the btrie.
 *
 * @Args:
 *   key:   16-byte ip address
 *   mask:  16-byte netmask
 *   value: value of this IP; may be any uintptr_t except BTRIE_NULL.
 *
 * @Return:
 *   0 for success.
 *   -1 for failure (duplicate prefix or out of memory).
 */
int btrie_insert_a6(btrie_t *tree, const uint8_t *key, const uint8_t *mask,
    uintptr_t value);

/**
 * Delete an ipv6 prefix from the btrie.
 *
 * @Return:
 *   0 for success.
 *   -1 for failure (prefix not present).
 */
int btrie_delete_a6(btrie_t *tree, const uint8_t *key, const uint8_t *mask);

/**
 * Longest-prefix lookup of a 16-byte ipv6 address.
 *
 * @Return:
 *   The value of the longest matching prefix.
 *   BTRIE_NULL if no prefix matches.
 */
uintptr_t btrie_find_a6(btrie_t *tree, const uint8_t *key);

#if defined (__cplusplus)
}
#endif

View File

@ -0,0 +1,460 @@
#include <stdlib.h>
#include <string.h>
#include <btrie.h>
#define PAGE_SIZE 4096
/*
 * Allocate one trie node: pop the free list if possible, otherwise carve
 * a node out of the current memory-pool page, allocating a fresh page
 * when the current one is exhausted.
 *
 * Returns NULL on allocation failure or when the page table is full.
 */
static btrie_node_t *
btrie_alloc(btrie_t *tree)
{
    btrie_node_t *p;

    if (tree->free) {
        /* Reuse a node recycled by btrie_delete*(); the free list is
         * chained through the `right` pointers. */
        p = tree->free;
        tree->free = tree->free->right;
        return p;
    }

    if (tree->size < sizeof(btrie_node_t)) {
        /* Fix: the original indexed pools[tree->len] unchecked and
         * overflowed the page table after MAX_PAGES allocations. */
        if (tree->len >= MAX_PAGES) {
            return NULL;
        }
        /* calloc gives a zeroed page; also note the conventional
         * (nmemb, size) argument order. */
        tree->start = (char *) calloc(PAGE_SIZE, sizeof(char));
        if (tree->start == NULL) {
            return NULL;
        }
        tree->pools[tree->len++] = tree->start;
        tree->size = PAGE_SIZE;
    }

    p = (btrie_node_t *) tree->start;
    tree->start += sizeof(btrie_node_t);
    tree->size -= sizeof(btrie_node_t);

    return p;
}
/*
 * Create an empty btrie: allocate the handle, zero the pool table and
 * allocate the (valueless) root node.
 *
 * Returns NULL on allocation failure.
 */
btrie_t *
btrie_create()
{
    btrie_t *tree = (btrie_t *) malloc(sizeof(btrie_t));
    if (tree == NULL) {
        return NULL;
    }

    tree->free = NULL;
    tree->start = NULL;
    tree->size = 0;
    /* Fix: size the memset from the member itself; the original used
     * sizeof(btrie_t *) * MAX_PAGES, which only matched the array size
     * because all object pointers happen to be the same width. */
    memset(tree->pools, 0, sizeof(tree->pools));
    tree->len = 0;

    tree->root = btrie_alloc(tree);
    if (tree->root == NULL) {
        /* Fix: the original leaked `tree` on this path. */
        free(tree);
        return NULL;
    }

    tree->root->right = NULL;
    tree->root->left = NULL;
    tree->root->parent = NULL;
    tree->root->value = BTRIE_NULL;

    return tree;
}
/* Recursively count the nodes of the subtree rooted at `node`
 * (the node itself plus both child subtrees). */
static size_t
subtree_weight(btrie_node_t *node)
{
    size_t total = 1;  /* count this node */

    if (node->left != NULL) {
        total += subtree_weight(node->left);
    }
    if (node->right != NULL) {
        total += subtree_weight(node->right);
    }

    return total;
}
/* Total number of nodes in the trie (0 for a rootless tree). */
size_t
btrie_count(btrie_t *tree)
{
    return (tree->root == NULL) ? 0 : subtree_weight(tree->root);
}
/* Bytes held by the node pool: one full page per allocated pool slot. */
size_t
btrie_allocated(btrie_t *tree)
{
    return PAGE_SIZE * tree->len;
}
/*
 * Insert an IPv4 prefix (key/mask, host byte order) carrying `value`.
 *
 * Phase 1 walks the existing path bit by bit, most significant first;
 * phase 2 appends fresh nodes for whatever mask bits remain.
 *
 * Returns 0 on success, -1 if the prefix is already present or a node
 * allocation fails.
 */
int
btrie_insert(btrie_t *tree, uint32_t key, uint32_t mask,
    uintptr_t value)
{
    uint32_t bit;
    btrie_node_t *node, *next;

    bit = 0x80000000;
    node = tree->root;
    next = tree->root;

    /* Phase 1: follow the existing path while mask bits remain. */
    while (bit & mask) {
        if (key & bit) {
            next = node->right;
        } else {
            next = node->left;
        }
        if (next == NULL) {
            break;  /* path ends; remaining bits need new nodes */
        }
        bit >>= 1;
        node = next;
    }

    if (next) {
        /* The whole path already exists: `node` is the prefix node. */
        if (node->value != BTRIE_NULL) {
            return -1;  /* duplicate prefix */
        }
        node->value = value;
        return 0;
    }

    /* Phase 2: grow the path with fresh, valueless nodes. */
    while (bit & mask) {
        next = btrie_alloc(tree);
        if (next == NULL) {
            return -1;
        }
        next->right = NULL;
        next->left = NULL;
        next->parent = node;
        next->value = BTRIE_NULL;
        if (key & bit) {
            node->right = next;
        } else {
            node->left = next;
        }
        bit >>= 1;
        node = next;
    }

    /* `node` is now the node for the last mask bit. */
    node->value = value;
    return 0;
}
/*
 * Delete an IPv4 prefix (key/mask) from the trie.
 *
 * If the prefix node still has children it is only cleared (value reset
 * to BTRIE_NULL); otherwise the node and any ancestors made useless by
 * its removal are unlinked and pushed onto the tree's free list.
 *
 * Returns 0 on success, -1 if the prefix is not present.
 */
int
btrie_delete(btrie_t *tree, uint32_t key, uint32_t mask)
{
    uint32_t bit;
    btrie_node_t *node;

    bit = 0x80000000;
    node = tree->root;

    /* Walk down the path described by key/mask. */
    while (node && (bit & mask)) {
        if (key & bit) {
            node = node->right;
        } else {
            node = node->left;
        }
        bit >>= 1;
    }

    if (node == NULL) {
        return -1;  /* path does not exist */
    }

    if (node->right || node->left) {
        /* Interior node: keep the structure, just drop the value. */
        if (node->value != BTRIE_NULL) {
            node->value = BTRIE_NULL;
            return 0;
        }
        return -1;  /* no prefix stored here */
    }

    /* Leaf node: unlink it and prune ancestors that became useless.
     * NOTE(review): `node->parent` is dereferenced unchecked, so this
     * assumes node != tree->root; a mask of 0 addressing a childless
     * root would crash — confirm callers never pass mask == 0. */
    for ( ;; ) {
        if (node->parent->right == node) {
            node->parent->right = NULL;
        } else {
            node->parent->left = NULL;
        }

        /* Recycle the node on the free list (chained via `right`). */
        node->right = tree->free;
        tree->free = node;

        node = node->parent;

        if (node->right || node->left) {
            break;  /* parent still has another child */
        }
        if (node->value != BTRIE_NULL) {
            break;  /* parent stores a prefix itself */
        }
        if (node->parent == NULL) {
            break;  /* reached the root */
        }
    }

    return 0;
}
/*
 * Longest-prefix lookup of an IPv4 address: walk from the root, bit by
 * bit (MSB first), remembering the last value seen along the path.
 *
 * Returns the value of the longest matching prefix, or BTRIE_NULL.
 */
uintptr_t
btrie_find(btrie_t *tree, uint32_t key)
{
    uintptr_t best = BTRIE_NULL;
    uint32_t probe = 0x80000000;
    btrie_node_t *cur = tree->root;

    while (cur != NULL) {
        if (cur->value != BTRIE_NULL) {
            best = cur->value;  /* deeper match found so far */
        }
        cur = (key & probe) ? cur->right : cur->left;
        probe >>= 1;
    }

    return best;
}
/*
 * Insert an IPv6 prefix (16-byte key/mask) carrying `value`.
 *
 * Same two-phase walk as btrie_insert(), but the bit cursor advances
 * through the 16 mask bytes; depth is capped at 128 bits (i == 16).
 *
 * Returns 0 on success, -1 if the prefix is already present or a node
 * allocation fails.
 */
int
btrie_insert_a6(btrie_t *tree, const uint8_t *key, const uint8_t *mask,
    uintptr_t value)
{
    uint8_t bit;
    unsigned int i;  /* fix: `uint` is a non-standard glibc typedef */
    btrie_node_t *node, *next;

    i = 0;
    bit = 0x80;
    node = tree->root;
    next = tree->root;

    /* Phase 1: follow the existing path while mask bits remain. */
    while (bit & mask[i]) {
        if (key[i] & bit) {
            next = node->right;
        } else {
            next = node->left;
        }
        if (next == NULL) {
            break;  /* path ends; remaining bits need new nodes */
        }

        bit >>= 1;
        node = next;

        if (bit == 0) {
            if (++i == 16) {
                break;  /* all 128 bits consumed */
            }
            bit = 0x80;
        }
    }

    if (next) {
        /* The whole path already exists: `node` is the prefix node. */
        if (node->value != BTRIE_NULL) {
            return -1;  /* duplicate prefix */
        }
        node->value = value;
        return 0;
    }

    /* Phase 2: grow the path with fresh, valueless nodes. */
    while (bit & mask[i]) {
        next = btrie_alloc(tree);
        if (next == NULL) {
            return -1;
        }

        next->right = NULL;
        next->left = NULL;
        next->parent = node;
        next->value = BTRIE_NULL;

        if (key[i] & bit) {
            node->right = next;
        } else {
            node->left = next;
        }

        bit >>= 1;
        node = next;

        if (bit == 0) {
            if (++i == 16) {
                break;  /* all 128 bits consumed */
            }
            bit = 0x80;
        }
    }

    node->value = value;
    return 0;
}
/*
 * Delete an IPv6 prefix (16-byte key/mask) from the trie.
 *
 * Mirrors btrie_delete(): an interior prefix node is only cleared,
 * while a leaf is unlinked and pruned together with any ancestors
 * made useless by its removal.
 *
 * Returns 0 on success, -1 if the prefix is not present.
 */
int
btrie_delete_a6(btrie_t *tree, const uint8_t *key, const uint8_t *mask)
{
    uint8_t bit;
    unsigned int i;  /* fix: `uint` is a non-standard glibc typedef */
    btrie_node_t *node;

    i = 0;
    bit = 0x80;
    node = tree->root;

    /* Walk down the path described by key/mask (max 128 bits). */
    while (node && (bit & mask[i])) {
        if (key[i] & bit) {
            node = node->right;
        } else {
            node = node->left;
        }

        bit >>= 1;
        if (bit == 0) {
            if (++i == 16) {
                break;  /* all 128 bits consumed */
            }
            bit = 0x80;
        }
    }

    if (node == NULL) {
        return -1;  /* path does not exist */
    }

    if (node->right || node->left) {
        /* Interior node: keep the structure, just drop the value. */
        if (node->value != BTRIE_NULL) {
            node->value = BTRIE_NULL;
            return 0;
        }
        return -1;  /* no prefix stored here */
    }

    /* Leaf node: unlink it and prune ancestors that became useless.
     * NOTE(review): as in btrie_delete(), node->parent is dereferenced
     * unchecked — assumes the addressed node is never the root. */
    for ( ;; ) {
        if (node->parent->right == node) {
            node->parent->right = NULL;
        } else {
            node->parent->left = NULL;
        }

        /* Recycle the node on the free list (chained via `right`). */
        node->right = tree->free;
        tree->free = node;

        node = node->parent;

        if (node->right || node->left) {
            break;  /* parent still has another child */
        }
        if (node->value != BTRIE_NULL) {
            break;  /* parent stores a prefix itself */
        }
        if (node->parent == NULL) {
            break;  /* reached the root */
        }
    }

    return 0;
}
/*
 * Longest-prefix lookup of a 16-byte IPv6 address: walk from the root,
 * bit by bit (MSB first), remembering the last value seen on the path.
 *
 * Returns the value of the longest matching prefix, or BTRIE_NULL.
 */
uintptr_t
btrie_find_a6(btrie_t *tree, const uint8_t *key)
{
    uint8_t bit;
    uintptr_t value;
    unsigned int i;  /* fix: `uint` is a non-standard glibc typedef */
    btrie_node_t *node;

    i = 0;
    bit = 0x80;
    value = BTRIE_NULL;
    node = tree->root;

    while (node) {
        if (node->value != BTRIE_NULL) {
            value = node->value;  /* deeper match found so far */
        }

        /* Fix: stop once all 128 key bits are consumed. The original
         * kept going and read key[16] (one byte out of bounds) when a
         * /128 prefix was present in the trie. */
        if (i == 16) {
            break;
        }

        if (key[i] & bit) {
            node = node->right;
        } else {
            node = node->left;
        }

        bit >>= 1;
        if (bit == 0) {
            i++;
            bit = 0x80;
        }
    }

    return value;
}
/*
 * Destroy the trie: all nodes live inside the pool pages, so freeing
 * every page plus the handle releases everything.
 */
int
btrie_destroy(btrie_t *tree)
{
    size_t page;

    for (page = 0; page < tree->len; ++page) {
        free(tree->pools[page]);
    }
    free(tree);

    return 0;
}

View File

@ -0,0 +1,94 @@
#include <stdio.h>
#include <btrie.h>
/*
 * Smoke test for libbtrie: exercises IPv4/IPv6 insert, longest-prefix
 * find and delete. Prints per-case pass/error lines; returns 0 on a
 * completed run, 1 on setup failure.
 */
int main()
{
    btrie_t *it;
    int ret;
    uintptr_t found;  /* fix: find() returns uintptr_t; don't truncate to int */

    uint8_t prefix_v6[16] = {0xde, 0xad, 0xbe, 0xef};
    uint8_t mask_v6[16] = {0xff, 0xff, 0xff};
    uint8_t ip_v6[16] = {0xde, 0xad, 0xbe, 0xef, 0xde};

    it = btrie_create();
    if (it == NULL) {
        printf("create error!\n");
        return 0;
    }

    //add 101.45.69.50/16
    ret = btrie_insert(it, 1697465650, 0xffff0000, 1);
    if (ret != 0) {
        printf("insert 1 error.\n");
        goto error;
    }

    //add 10.45.69.50/16
    ret = btrie_insert(it, 170738994, 0xffff0000, 1);
    if (ret != 0) {
        printf("insert 2 error.\n");
        goto error;
    }

    //add 10.45.79.50/16 (duplicate /16 prefix: expected to be rejected)
    ret = btrie_insert(it, 170741554, 0xffff0000, 1);
    if (ret == 0) {
        printf("insert 3 error.\n");
        goto error;
    }

    //add 102.45.79.50/24
    ret = btrie_insert(it, 1714245426, 0xffffff00, 1);
    if (ret != 0) {
        printf("insert 4 error.\n");
        goto error;
    }

    found = btrie_find(it, 170741554);
    if (found == 1) {
        printf("test case 1 passed\n");
    } else {
        printf("test case 1 error\n");
    }

    found = btrie_find(it, 170786817);
    if (found != 1) {
        printf("test case 2 passed\n");
    } else {
        printf("test case 2 error\n");
    }

    ret = btrie_delete(it, 1714245426, 0xffffff00);
    if (ret != 0) {
        printf("delete 1 error\n");
        goto error;
    }
    found = btrie_find(it, 1714245426);
    if (found != 1) {
        printf("test case 3 passed\n");
    } else {
        printf("test case 3 error\n");
    }

    //add dead:beef::/32
    ret = btrie_insert_a6(it, prefix_v6, mask_v6, 1);
    if (ret != 0) {
        printf("insert 5 error\n");
        goto error;
    }

    found = btrie_find_a6(it, ip_v6);
    if (found == 1) {
        printf("test case 4 passed\n");
    } else {
        printf("test case 4 error\n");
    }

    /* Fix: the original leaked the trie on the success path. */
    btrie_destroy(it);
    return 0;

error:
    btrie_destroy(it);
    printf("test failed\n");
    return 1;
}

View File

@ -5,4 +5,6 @@ add_library (lz4
src/lz4hc.c
include/lz4/lz4.h
include/lz4/lz4hc.h)
include/lz4/lz4hc.h
include/lz4/lz4opt.h)

View File

@ -1,7 +1,7 @@
/*
LZ4 - Fast LZ compression algorithm
Header File
Copyright (C) 2011-2015, Yann Collet.
* LZ4 - Fast LZ compression algorithm
* Header File
* Copyright (C) 2011-2016, Yann Collet.
BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
@ -29,34 +29,79 @@
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
You can contact the author at :
- LZ4 source repository : https://github.com/Cyan4973/lz4
- LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c
- LZ4 homepage : http://www.lz4.org
- LZ4 source repository : https://github.com/lz4/lz4
*/
#pragma once
#ifndef LZ4_H_2983827168210
#define LZ4_H_2983827168210
#if defined (__cplusplus)
extern "C" {
#endif
/*
* lz4.h provides block compression functions, and gives full buffer control to programmer.
* If you need to generate inter-operable compressed data (respecting LZ4 frame specification),
* and can let the library handle its own memory, please use lz4frame.h instead.
/* --- Dependency --- */
#include <stddef.h> /* size_t */
/**
Introduction
LZ4 is lossless compression algorithm, providing compression speed at 400 MB/s per core,
scalable with multi-cores CPU. It features an extremely fast decoder, with speed in
multiple GB/s per core, typically reaching RAM speed limits on multi-core systems.
The LZ4 compression library provides in-memory compression and decompression functions.
Compression can be done in:
- a single step (described as Simple Functions)
- a single step, reusing a context (described in Advanced Functions)
- unbounded multiple steps (described as Streaming compression)
lz4.h provides block compression functions. It gives full buffer control to user.
Decompressing an lz4-compressed block also requires metadata (such as compressed size).
Each application is free to encode such metadata in whichever way it wants.
An additional format, called LZ4 frame specification (doc/lz4_Frame_format.md),
take care of encoding standard metadata alongside LZ4-compressed blocks.
If your application requires interoperability, it's recommended to use it.
A library is provided to take care of it, see lz4frame.h.
*/
/**************************************
* Version
**************************************/
/*^***************************************************************
* Export parameters
*****************************************************************/
/*
* LZ4_DLL_EXPORT :
* Enable exporting of functions when building a Windows DLL
*/
#if defined(LZ4_DLL_EXPORT) && (LZ4_DLL_EXPORT==1)
# define LZ4LIB_API __declspec(dllexport)
#elif defined(LZ4_DLL_IMPORT) && (LZ4_DLL_IMPORT==1)
# define LZ4LIB_API __declspec(dllimport) /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/
#else
# define LZ4LIB_API
#endif
/*========== Version =========== */
#define LZ4_VERSION_MAJOR 1 /* for breaking interface changes */
#define LZ4_VERSION_MINOR 7 /* for new (non-breaking) interface capabilities */
#define LZ4_VERSION_RELEASE 1 /* for tweaks, bug-fixes, or development */
#define LZ4_VERSION_NUMBER (LZ4_VERSION_MAJOR *100*100 + LZ4_VERSION_MINOR *100 + LZ4_VERSION_RELEASE)
int LZ4_versionNumber (void);
#define LZ4_VERSION_RELEASE 5 /* for tweaks, bug-fixes, or development */
/**************************************
#define LZ4_VERSION_NUMBER (LZ4_VERSION_MAJOR *100*100 + LZ4_VERSION_MINOR *100 + LZ4_VERSION_RELEASE)
#define LZ4_LIB_VERSION LZ4_VERSION_MAJOR.LZ4_VERSION_MINOR.LZ4_VERSION_RELEASE
#define LZ4_QUOTE(str) #str
#define LZ4_EXPAND_AND_QUOTE(str) LZ4_QUOTE(str)
#define LZ4_VERSION_STRING LZ4_EXPAND_AND_QUOTE(LZ4_LIB_VERSION)
LZ4LIB_API int LZ4_versionNumber (void);
LZ4LIB_API const char* LZ4_versionString (void);
/*-************************************
* Tuning parameter
**************************************/
/*
/*!
* LZ4_MEMORY_USAGE :
* Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
* Increasing memory usage improves compression ratio
@ -66,15 +111,10 @@ int LZ4_versionNumber (void);
#define LZ4_MEMORY_USAGE 14
/**************************************
/*-************************************
* Simple Functions
**************************************/
int LZ4_compress_default(const char* source, char* dest, int sourceSize, int maxDestSize);
int LZ4_decompress_safe (const char* source, char* dest, int compressedSize, int maxDecompressedSize);
/*
LZ4_compress_default() :
/*! LZ4_compress_default() :
Compresses 'sourceSize' bytes from buffer 'source'
into already allocated 'dest' buffer of size 'maxDestSize'.
Compression is guaranteed to succeed if 'maxDestSize' >= LZ4_compressBound(sourceSize).
@ -86,9 +126,10 @@ LZ4_compress_default() :
sourceSize : Max supported value is LZ4_MAX_INPUT_VALUE
maxDestSize : full or partial size of buffer 'dest' (which must be already allocated)
return : the number of bytes written into buffer 'dest' (necessarily <= maxOutputSize)
or 0 if compression fails
or 0 if compression fails */
LZ4LIB_API int LZ4_compress_default(const char* source, char* dest, int sourceSize, int maxDestSize);
LZ4_decompress_safe() :
/*! LZ4_decompress_safe() :
compressedSize : is the precise full size of the compressed block.
maxDecompressedSize : is the size of destination buffer, which must be already allocated.
return : the number of bytes decompressed into destination buffer (necessarily <= maxDecompressedSize)
@ -97,15 +138,16 @@ LZ4_decompress_safe() :
This function is protected against buffer overflow exploits, including malicious data packets.
It never writes outside output buffer, nor reads outside input buffer.
*/
LZ4LIB_API int LZ4_decompress_safe (const char* source, char* dest, int compressedSize, int maxDecompressedSize);
/**************************************
/*-************************************
* Advanced Functions
**************************************/
#define LZ4_MAX_INPUT_SIZE 0x7E000000 /* 2 113 929 216 bytes */
#define LZ4_COMPRESSBOUND(isize) ((unsigned)(isize) > (unsigned)LZ4_MAX_INPUT_SIZE ? 0 : (isize) + ((isize)/255) + 16)
/*
/*!
LZ4_compressBound() :
Provides the maximum size that LZ4 compression may output in a "worst case" scenario (input data not compressible)
This function is primarily useful for memory allocation purposes (destination buffer size).
@ -115,9 +157,9 @@ LZ4_compressBound() :
return : maximum output size in a "worst case" scenario
or 0, if input size is too large ( > LZ4_MAX_INPUT_SIZE)
*/
int LZ4_compressBound(int inputSize);
LZ4LIB_API int LZ4_compressBound(int inputSize);
/*
/*!
LZ4_compress_fast() :
Same as LZ4_compress_default(), but allows to select an "acceleration" factor.
The larger the acceleration value, the faster the algorithm, but also the lesser the compression.
@ -125,21 +167,21 @@ LZ4_compress_fast() :
An acceleration value of "1" is the same as regular LZ4_compress_default()
Values <= 0 will be replaced by ACCELERATION_DEFAULT (see lz4.c), which is 1.
*/
int LZ4_compress_fast (const char* source, char* dest, int sourceSize, int maxDestSize, int acceleration);
LZ4LIB_API int LZ4_compress_fast (const char* source, char* dest, int sourceSize, int maxDestSize, int acceleration);
/*
/*!
LZ4_compress_fast_extState() :
Same compression function, just using an externally allocated memory space to store compression state.
Use LZ4_sizeofState() to know how much memory must be allocated,
and allocate it on 8-bytes boundaries (using malloc() typically).
Then, provide it as 'void* state' to compression function.
*/
int LZ4_sizeofState(void);
int LZ4_compress_fast_extState (void* state, const char* source, char* dest, int inputSize, int maxDestSize, int acceleration);
LZ4LIB_API int LZ4_sizeofState(void);
LZ4LIB_API int LZ4_compress_fast_extState (void* state, const char* source, char* dest, int inputSize, int maxDestSize, int acceleration);
/*
/*!
LZ4_compress_destSize() :
Reverse the logic, by compressing as much data as possible from 'source' buffer
into already allocated buffer 'dest' of size 'targetDestSize'.
@ -150,10 +192,10 @@ LZ4_compress_destSize() :
return : Nb bytes written into 'dest' (necessarily <= targetDestSize)
or 0 if compression fails
*/
int LZ4_compress_destSize (const char* source, char* dest, int* sourceSizePtr, int targetDestSize);
LZ4LIB_API int LZ4_compress_destSize (const char* source, char* dest, int* sourceSizePtr, int targetDestSize);
/*
/*!
LZ4_decompress_fast() :
originalSize : is the original and therefore uncompressed size
return : the number of bytes read from the source buffer (in other words, the compressed size)
@ -164,9 +206,9 @@ LZ4_decompress_fast() :
However, it does not provide any protection against intentionally modified data stream (malicious input).
Use this function in trusted environment only (data to decode comes from a trusted source).
*/
int LZ4_decompress_fast (const char* source, char* dest, int originalSize);
LZ4LIB_API int LZ4_decompress_fast (const char* source, char* dest, int originalSize);
/*
/*!
LZ4_decompress_safe_partial() :
This function decompress a compressed block of size 'compressedSize' at position 'source'
into destination buffer 'dest' of size 'maxDecompressedSize'.
@ -178,98 +220,73 @@ LZ4_decompress_safe_partial() :
If the source stream is detected malformed, the function will stop decoding and return a negative result.
This function never writes outside of output buffer, and never reads outside of input buffer. It is therefore protected against malicious data packets
*/
int LZ4_decompress_safe_partial (const char* source, char* dest, int compressedSize, int targetOutputSize, int maxDecompressedSize);
LZ4LIB_API int LZ4_decompress_safe_partial (const char* source, char* dest, int compressedSize, int targetOutputSize, int maxDecompressedSize);
/***********************************************
/*-*********************************************
* Streaming Compression Functions
***********************************************/
#define LZ4_STREAMSIZE_U64 ((1 << (LZ4_MEMORY_USAGE-3)) + 4)
#define LZ4_STREAMSIZE (LZ4_STREAMSIZE_U64 * sizeof(long long))
/*
* LZ4_stream_t
* information structure to track an LZ4 stream.
* important : init this structure content before first use !
* note : only allocated directly the structure if you are statically linking LZ4
* If you are using liblz4 as a DLL, please use below construction methods instead.
typedef union LZ4_stream_u LZ4_stream_t; /* incomplete type (defined later) */
/*! LZ4_createStream() and LZ4_freeStream() :
* LZ4_createStream() will allocate and initialize an `LZ4_stream_t` structure.
* LZ4_freeStream() releases its memory.
*/
typedef struct { long long table[LZ4_STREAMSIZE_U64]; } LZ4_stream_t;
LZ4LIB_API LZ4_stream_t* LZ4_createStream(void);
LZ4LIB_API int LZ4_freeStream (LZ4_stream_t* streamPtr);
/*
* LZ4_resetStream
* Use this function to init an allocated LZ4_stream_t structure
/*! LZ4_resetStream() :
* An LZ4_stream_t structure can be allocated once and re-used multiple times.
* Use this function to init an allocated `LZ4_stream_t` structure and start a new compression.
*/
void LZ4_resetStream (LZ4_stream_t* streamPtr);
LZ4LIB_API void LZ4_resetStream (LZ4_stream_t* streamPtr);
/*
* LZ4_createStream will allocate and initialize an LZ4_stream_t structure
* LZ4_freeStream releases its memory.
* In the context of a DLL (liblz4), please use these methods rather than the static struct.
* They are more future proof, in case of a change of LZ4_stream_t size.
/*! LZ4_loadDict() :
* Use this function to load a static dictionary into LZ4_stream.
* Any previous data will be forgotten, only 'dictionary' will remain in memory.
* Loading a size of 0 is allowed.
* Return : dictionary size, in bytes (necessarily <= 64 KB)
*/
LZ4_stream_t* LZ4_createStream(void);
int LZ4_freeStream (LZ4_stream_t* streamPtr);
LZ4LIB_API int LZ4_loadDict (LZ4_stream_t* streamPtr, const char* dictionary, int dictSize);
/*
* LZ4_loadDict
* Use this function to load a static dictionary into LZ4_stream.
* Any previous data will be forgotten, only 'dictionary' will remain in memory.
* Loading a size of 0 is allowed.
* Return : dictionary size, in bytes (necessarily <= 64 KB)
/*! LZ4_compress_fast_continue() :
* Compress buffer content 'src', using data from previously compressed blocks as dictionary to improve compression ratio.
* Important : Previous data blocks are assumed to still be present and unmodified !
* 'dst' buffer must be already allocated.
* If maxDstSize >= LZ4_compressBound(srcSize), compression is guaranteed to succeed, and runs faster.
* If not, and if compressed data cannot fit into 'dst' buffer size, compression stops, and function returns a zero.
*/
int LZ4_loadDict (LZ4_stream_t* streamPtr, const char* dictionary, int dictSize);
LZ4LIB_API int LZ4_compress_fast_continue (LZ4_stream_t* streamPtr, const char* src, char* dst, int srcSize, int maxDstSize, int acceleration);
/*
* LZ4_compress_fast_continue
* Compress buffer content 'src', using data from previously compressed blocks as dictionary to improve compression ratio.
* Important : Previous data blocks are assumed to still be present and unmodified !
* 'dst' buffer must be already allocated.
* If maxDstSize >= LZ4_compressBound(srcSize), compression is guaranteed to succeed, and runs faster.
* If not, and if compressed data cannot fit into 'dst' buffer size, compression stops, and function returns a zero.
/*! LZ4_saveDict() :
* If previously compressed data block is not guaranteed to remain available at its memory location,
* save it into a safer place (char* safeBuffer).
* Note : you don't need to call LZ4_loadDict() afterwards,
* dictionary is immediately usable, you can therefore call LZ4_compress_fast_continue().
* Return : saved dictionary size in bytes (necessarily <= dictSize), or 0 if error.
*/
int LZ4_compress_fast_continue (LZ4_stream_t* streamPtr, const char* src, char* dst, int srcSize, int maxDstSize, int acceleration);
/*
* LZ4_saveDict
* If previously compressed data block is not guaranteed to remain available at its memory location
* save it into a safer place (char* safeBuffer)
* Note : you don't need to call LZ4_loadDict() afterwards,
* dictionary is immediately usable, you can therefore call LZ4_compress_fast_continue()
* Return : saved dictionary size in bytes (necessarily <= dictSize), or 0 if error
*/
int LZ4_saveDict (LZ4_stream_t* streamPtr, char* safeBuffer, int dictSize);
LZ4LIB_API int LZ4_saveDict (LZ4_stream_t* streamPtr, char* safeBuffer, int dictSize);
/************************************************
/*-**********************************************
* Streaming Decompression Functions
* Bufferless synchronous API
************************************************/
typedef union LZ4_streamDecode_u LZ4_streamDecode_t; /* incomplete type (defined later) */
#define LZ4_STREAMDECODESIZE_U64 4
#define LZ4_STREAMDECODESIZE (LZ4_STREAMDECODESIZE_U64 * sizeof(unsigned long long))
typedef struct { unsigned long long table[LZ4_STREAMDECODESIZE_U64]; } LZ4_streamDecode_t;
/*
* LZ4_streamDecode_t
* information structure to track an LZ4 stream.
* init this structure content using LZ4_setStreamDecode or memset() before first use !
*
* In the context of a DLL (liblz4) please prefer usage of construction methods below.
* They are more future proof, in case of a change of LZ4_streamDecode_t size in the future.
* LZ4_createStreamDecode will allocate and initialize an LZ4_streamDecode_t structure
* LZ4_freeStreamDecode releases its memory.
/* creation / destruction of streaming decompression tracking structure */
LZ4LIB_API LZ4_streamDecode_t* LZ4_createStreamDecode(void);
LZ4LIB_API int LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream);
/*! LZ4_setStreamDecode() :
* Use this function to instruct where to find the dictionary.
* Setting a size of 0 is allowed (same effect as reset).
* @return : 1 if OK, 0 if error
*/
LZ4_streamDecode_t* LZ4_createStreamDecode(void);
int LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream);
LZ4LIB_API int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize);
/*
* LZ4_setStreamDecode
* Use this function to instruct where to find the dictionary.
* Setting a size of 0 is allowed (same effect as reset).
* Return : 1 if OK, 0 if error
*/
int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize);
/*
*_continue() :
/*!
LZ4_decompress_*_continue() :
These decoding functions allow decompression of multiple blocks in "streaming" mode.
Previously decoded blocks *must* remain available at the memory position where they were decoded (up to 64 KB)
In the case of a ring buffers, decoding buffer must be either :
@ -285,35 +302,120 @@ int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dicti
Whenever these conditions are not possible, save the last 64KB of decoded data into a safe buffer,
and indicate where it is saved using LZ4_setStreamDecode()
*/
int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int compressedSize, int maxDecompressedSize);
int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int originalSize);
LZ4LIB_API int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int compressedSize, int maxDecompressedSize);
LZ4LIB_API int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int originalSize);
/*
Advanced decoding functions :
*_usingDict() :
These decoding functions work the same as
a combination of LZ4_setStreamDecode() followed by LZ4_decompress_x_continue()
They are stand-alone. They don't need nor update an LZ4_streamDecode_t structure.
*/
int LZ4_decompress_safe_usingDict (const char* source, char* dest, int compressedSize, int maxDecompressedSize, const char* dictStart, int dictSize);
int LZ4_decompress_fast_usingDict (const char* source, char* dest, int originalSize, const char* dictStart, int dictSize);
/*! LZ4_decompress_*_usingDict() :
* These decoding functions work the same as
* a combination of LZ4_setStreamDecode() followed by LZ4_decompress_*_continue()
* They are stand-alone, and don't need an LZ4_streamDecode_t structure.
*/
LZ4LIB_API int LZ4_decompress_safe_usingDict (const char* source, char* dest, int compressedSize, int maxDecompressedSize, const char* dictStart, int dictSize);
LZ4LIB_API int LZ4_decompress_fast_usingDict (const char* source, char* dest, int originalSize, const char* dictStart, int dictSize);
/*^**********************************************
* !!!!!! STATIC LINKING ONLY !!!!!!
***********************************************/
/*-************************************
* Private definitions
**************************************
* Do not use these definitions.
* They are exposed to allow static allocation of `LZ4_stream_t` and `LZ4_streamDecode_t`.
* Using these definitions will expose code to API and/or ABI break in future versions of the library.
**************************************/
#define LZ4_HASHLOG (LZ4_MEMORY_USAGE-2)
#define LZ4_HASHTABLESIZE (1 << LZ4_MEMORY_USAGE)
#define LZ4_HASH_SIZE_U32 (1 << LZ4_HASHLOG) /* required as macro for static allocation */
/**************************************
#if defined(__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
#include <stdint.h>
typedef struct {
uint32_t hashTable[LZ4_HASH_SIZE_U32];
uint32_t currentOffset;
uint32_t initCheck;
const uint8_t* dictionary;
uint8_t* bufferStart; /* obsolete, used for slideInputBuffer */
uint32_t dictSize;
} LZ4_stream_t_internal;
typedef struct {
const uint8_t* externalDict;
size_t extDictSize;
const uint8_t* prefixEnd;
size_t prefixSize;
} LZ4_streamDecode_t_internal;
#else
typedef struct {
unsigned int hashTable[LZ4_HASH_SIZE_U32];
unsigned int currentOffset;
unsigned int initCheck;
const unsigned char* dictionary;
unsigned char* bufferStart; /* obsolete, used for slideInputBuffer */
unsigned int dictSize;
} LZ4_stream_t_internal;
typedef struct {
const unsigned char* externalDict;
size_t extDictSize;
const unsigned char* prefixEnd;
size_t prefixSize;
} LZ4_streamDecode_t_internal;
#endif
/*!
* LZ4_stream_t :
* information structure to track an LZ4 stream.
* init this structure before first use.
* note : only use in association with static linking !
* this definition is not API/ABI safe,
* and may change in a future version !
*/
#define LZ4_STREAMSIZE_U64 ((1 << (LZ4_MEMORY_USAGE-3)) + 4)
#define LZ4_STREAMSIZE (LZ4_STREAMSIZE_U64 * sizeof(unsigned long long))
/* Public opaque wrapper: the `table` member forces the union's size
 * (LZ4_STREAMSIZE bytes) and 8-byte alignment so users can allocate it
 * statically; `internal_donotuse` is the real state and is private. */
union LZ4_stream_u {
    unsigned long long table[LZ4_STREAMSIZE_U64];
    LZ4_stream_t_internal internal_donotuse;
} ;  /* previously typedef'd to LZ4_stream_t */
/*!
* LZ4_streamDecode_t :
* information structure to track an LZ4 stream during decompression.
* init this structure using LZ4_setStreamDecode (or memset()) before first use
* note : only use in association with static linking !
* this definition is not API/ABI safe,
* and may change in a future version !
*/
#define LZ4_STREAMDECODESIZE_U64 4
#define LZ4_STREAMDECODESIZE (LZ4_STREAMDECODESIZE_U64 * sizeof(unsigned long long))
/* Public opaque wrapper for the decode stream, sized/aligned by `table`;
 * `internal_donotuse` is private implementation state. */
union LZ4_streamDecode_u {
    unsigned long long table[LZ4_STREAMDECODESIZE_U64];
    LZ4_streamDecode_t_internal internal_donotuse;
} ;   /* previously typedef'd to LZ4_streamDecode_t */
/*=************************************
* Obsolete Functions
**************************************/
/* Deprecation warnings */
/* Should these warnings be a problem,
it is generally possible to disable them,
with -Wno-deprecated-declarations for gcc
or _CRT_SECURE_NO_WARNINGS in Visual for example.
You can also define LZ4_DEPRECATE_WARNING_DEFBLOCK. */
#ifndef LZ4_DEPRECATE_WARNING_DEFBLOCK
# define LZ4_DEPRECATE_WARNING_DEFBLOCK
typically with -Wno-deprecated-declarations for gcc
or _CRT_SECURE_NO_WARNINGS in Visual.
Otherwise, it's also possible to define LZ4_DISABLE_DEPRECATE_WARNINGS */
#ifdef LZ4_DISABLE_DEPRECATE_WARNINGS
# define LZ4_DEPRECATED(message) /* disable deprecation warnings */
#else
# define LZ4_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
# if (LZ4_GCC_VERSION >= 405) || defined(__clang__)
# if defined (__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */
# define LZ4_DEPRECATED(message) [[deprecated(message)]]
# elif (LZ4_GCC_VERSION >= 405) || defined(__clang__)
# define LZ4_DEPRECATED(message) __attribute__((deprecated(message)))
# elif (LZ4_GCC_VERSION >= 301)
# define LZ4_DEPRECATED(message) __attribute__((deprecated))
@ -323,20 +425,19 @@ int LZ4_decompress_fast_usingDict (const char* source, char* dest, int originalS
# pragma message("WARNING: You need to implement LZ4_DEPRECATED for this compiler")
# define LZ4_DEPRECATED(message)
# endif
#endif /* LZ4_DEPRECATE_WARNING_DEFBLOCK */
#endif /* LZ4_DISABLE_DEPRECATE_WARNINGS */
/* Obsolete compression functions */
/* These functions are planned to start generate warnings by r131 approximately */
int LZ4_compress (const char* source, char* dest, int sourceSize);
int LZ4_compress_limitedOutput (const char* source, char* dest, int sourceSize, int maxOutputSize);
int LZ4_compress_withState (void* state, const char* source, char* dest, int inputSize);
int LZ4_compress_limitedOutput_withState (void* state, const char* source, char* dest, int inputSize, int maxOutputSize);
int LZ4_compress_continue (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize);
int LZ4_compress_limitedOutput_continue (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize, int maxOutputSize);
LZ4_DEPRECATED("use LZ4_compress_default() instead") int LZ4_compress (const char* source, char* dest, int sourceSize);
LZ4_DEPRECATED("use LZ4_compress_default() instead") int LZ4_compress_limitedOutput (const char* source, char* dest, int sourceSize, int maxOutputSize);
LZ4_DEPRECATED("use LZ4_compress_fast_extState() instead") int LZ4_compress_withState (void* state, const char* source, char* dest, int inputSize);
LZ4_DEPRECATED("use LZ4_compress_fast_extState() instead") int LZ4_compress_limitedOutput_withState (void* state, const char* source, char* dest, int inputSize, int maxOutputSize);
LZ4_DEPRECATED("use LZ4_compress_fast_continue() instead") int LZ4_compress_continue (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize);
LZ4_DEPRECATED("use LZ4_compress_fast_continue() instead") int LZ4_compress_limitedOutput_continue (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize, int maxOutputSize);
/* Obsolete decompression functions */
/* These function names are completely deprecated and must no longer be used.
   They are only provided in lz4.c for compatibility with older programs.
- LZ4_uncompress is the same as LZ4_decompress_fast
- LZ4_uncompress_unknownOutputSize is the same as LZ4_decompress_safe
These function prototypes are now disabled; uncomment them only if you really need them.
@ -358,3 +459,5 @@ LZ4_DEPRECATED("use LZ4_decompress_fast_usingDict() instead") int LZ4_decompress
#if defined (__cplusplus)
}
#endif
#endif /* LZ4_H_2983827168210 */

View File

@ -1,7 +1,7 @@
/*
LZ4 HC - High Compression Mode of LZ4
Header File
Copyright (C) 2011-2015, Yann Collet.
Copyright (C) 2011-2016, Yann Collet.
BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
Redistribution and use in source and binary forms, with or without
@ -28,107 +28,92 @@
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
You can contact the author at :
- LZ4 source repository : https://github.com/Cyan4973/lz4
- LZ4 source repository : https://github.com/lz4/lz4
- LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c
*/
#pragma once
#ifndef LZ4_HC_H_19834876238432
#define LZ4_HC_H_19834876238432
#if defined (__cplusplus)
extern "C" {
#endif
/*****************************
* Includes
*****************************/
#include <stddef.h> /* size_t */
/* --- Dependency --- */
/* note : lz4hc is not an independent module, it requires lz4.h/lz4.c for proper compilation */
#include "lz4.h" /* stddef, LZ4LIB_API, LZ4_DEPRECATED */
/**************************************
* Block Compression
**************************************/
int LZ4_compress_HC (const char* src, char* dst, int srcSize, int maxDstSize, int compressionLevel);
/*
LZ4_compress_HC :
Destination buffer 'dst' must be already allocated.
Compression completion is guaranteed if 'dst' buffer is sized to handle worst circumstances (data not compressible)
Worst size evaluation is provided by function LZ4_compressBound() (see "lz4.h")
srcSize : Max supported value is LZ4_MAX_INPUT_SIZE (see "lz4.h")
compressionLevel : Recommended values are between 4 and 9, although any value between 0 and 16 will work.
0 means "use default value" (see lz4hc.c).
Values >16 behave the same as 16.
return : the number of bytes written into buffer 'dst'
or 0 if compression fails.
*/
/* --- Useful constants --- */
#define LZ4HC_CLEVEL_MIN 3
#define LZ4HC_CLEVEL_DEFAULT 9
#define LZ4HC_CLEVEL_OPT_MIN 11
#define LZ4HC_CLEVEL_MAX 12
/*-************************************
* Block Compression
**************************************/
/*! LZ4_compress_HC() :
* Compress data from `src` into `dst`, using the more powerful but slower "HC" algorithm.
* `dst` must be already allocated.
* Compression is guaranteed to succeed if `dstCapacity >= LZ4_compressBound(srcSize)` (see "lz4.h")
* Max supported `srcSize` value is LZ4_MAX_INPUT_SIZE (see "lz4.h")
* `compressionLevel` : Recommended values are between 4 and 9, although any value between 1 and LZ4HC_MAX_CLEVEL will work.
* Values >LZ4HC_MAX_CLEVEL behave the same as LZ4HC_MAX_CLEVEL.
* @return : the number of bytes written into 'dst'
* or 0 if compression fails.
*/
LZ4LIB_API int LZ4_compress_HC (const char* src, char* dst, int srcSize, int dstCapacity, int compressionLevel);
/* Note :
Decompression functions are provided within LZ4 source code (see "lz4.h") (BSD license)
*/
* Decompression functions are provided within "lz4.h" (BSD license)
*/
int LZ4_sizeofStateHC(void);
int LZ4_compress_HC_extStateHC(void* state, const char* src, char* dst, int srcSize, int maxDstSize, int compressionLevel);
/*
LZ4_compress_HC_extStateHC() :
Use this function if you prefer to manually allocate memory for compression tables.
To know how much memory must be allocated for the compression tables, use :
int LZ4_sizeofStateHC();
Allocated memory must be aligned on 8-bytes boundaries (which a normal malloc() will do properly).
The allocated memory can then be provided to the compression functions using 'void* state' parameter.
LZ4_compress_HC_extStateHC() is equivalent to previously described function.
It just uses externally allocated memory for stateHC.
*/
/*! LZ4_compress_HC_extStateHC() :
* Same as LZ4_compress_HC(), but using an externally allocated memory segment for `state`.
* `state` size is provided by LZ4_sizeofStateHC().
* Memory segment must be aligned on 8-bytes boundaries (which a normal malloc() will do properly).
*/
LZ4LIB_API int LZ4_compress_HC_extStateHC(void* state, const char* src, char* dst, int srcSize, int maxDstSize, int compressionLevel);
LZ4LIB_API int LZ4_sizeofStateHC(void);
/**************************************
* Streaming Compression
**************************************/
#define LZ4_STREAMHCSIZE 262192
#define LZ4_STREAMHCSIZE_SIZET (LZ4_STREAMHCSIZE / sizeof(size_t))
typedef struct { size_t table[LZ4_STREAMHCSIZE_SIZET]; } LZ4_streamHC_t;
/*
LZ4_streamHC_t
This structure allows static allocation of LZ4 HC streaming state.
State must then be initialized using LZ4_resetStreamHC() before first use.
/*-************************************
* Streaming Compression
* Bufferless synchronous API
**************************************/
typedef union LZ4_streamHC_u LZ4_streamHC_t; /* incomplete type (defined later) */
Static allocation should only be used in combination with static linking.
If you want to use LZ4 as a DLL, please use construction functions below, which are future-proof.
*/
/*! LZ4_createStreamHC() and LZ4_freeStreamHC() :
* These functions create and release memory for LZ4 HC streaming state.
* Newly created states are automatically initialized.
* Existing states can be re-used several times, using LZ4_resetStreamHC().
* These methods are API and ABI stable, they can be used in combination with a DLL.
*/
LZ4LIB_API LZ4_streamHC_t* LZ4_createStreamHC(void);
LZ4LIB_API int LZ4_freeStreamHC (LZ4_streamHC_t* streamHCPtr);
LZ4LIB_API void LZ4_resetStreamHC (LZ4_streamHC_t* streamHCPtr, int compressionLevel);
LZ4LIB_API int LZ4_loadDictHC (LZ4_streamHC_t* streamHCPtr, const char* dictionary, int dictSize);
LZ4_streamHC_t* LZ4_createStreamHC(void);
int LZ4_freeStreamHC (LZ4_streamHC_t* streamHCPtr);
/*
These functions create and release memory for LZ4 HC streaming state.
Newly created states are already initialized.
Existing state space can be re-used anytime using LZ4_resetStreamHC().
If you use LZ4 as a DLL, use these functions instead of static structure allocation,
to avoid size mismatch between different versions.
*/
LZ4LIB_API int LZ4_compress_HC_continue (LZ4_streamHC_t* streamHCPtr, const char* src, char* dst, int srcSize, int maxDstSize);
void LZ4_resetStreamHC (LZ4_streamHC_t* streamHCPtr, int compressionLevel);
int LZ4_loadDictHC (LZ4_streamHC_t* streamHCPtr, const char* dictionary, int dictSize);
int LZ4_compress_HC_continue (LZ4_streamHC_t* streamHCPtr, const char* src, char* dst, int srcSize, int maxDstSize);
int LZ4_saveDictHC (LZ4_streamHC_t* streamHCPtr, char* safeBuffer, int maxDictSize);
LZ4LIB_API int LZ4_saveDictHC (LZ4_streamHC_t* streamHCPtr, char* safeBuffer, int maxDictSize);
/*
These functions compress data in successive blocks of any size, using previous blocks as dictionary.
One key assumption is that previous blocks (up to 64 KB) remain read-accessible while compressing next blocks.
There is an exception for ring buffers, which can be smaller 64 KB.
Such case is automatically detected and correctly handled by LZ4_compress_HC_continue().
There is an exception for ring buffers, which can be smaller than 64 KB.
Ring buffers scenario is automatically detected and handled by LZ4_compress_HC_continue().
Before starting compression, state must be properly initialized, using LZ4_resetStreamHC().
A first "fictional block" can then be designated as initial dictionary, using LZ4_loadDictHC() (Optional).
Then, use LZ4_compress_HC_continue() to compress each successive block.
It works like LZ4_compress_HC(), but use previous memory blocks as dictionary to improve compression.
Previous memory blocks (including initial dictionary when present) must remain accessible and unmodified during compression.
As a reminder, size 'dst' buffer to handle worst cases, using LZ4_compressBound(), to ensure success of compression operation.
'dst' buffer should be sized to handle worst case scenarios, using LZ4_compressBound(), to ensure operation success.
If, for any reason, previous data blocks can't be preserved unmodified in memory during next compression block,
you must save it to a safer memory space, using LZ4_saveDictHC().
@ -136,50 +121,102 @@ int LZ4_saveDictHC (LZ4_streamHC_t* streamHCPtr, char* safeBuffer, int maxDictSi
*/
/*-******************************************
* !!!!! STATIC LINKING ONLY !!!!!
*******************************************/
/**************************************
/*-*************************************
* PRIVATE DEFINITIONS :
* Do not use these definitions.
* They are exposed to allow static allocation of `LZ4_streamHC_t`.
* Using these definitions makes the code vulnerable to potential API break when upgrading LZ4
**************************************/
#define LZ4HC_DICTIONARY_LOGSIZE 17
#define LZ4HC_MAXD (1<<LZ4HC_DICTIONARY_LOGSIZE)
#define LZ4HC_MAXD_MASK (LZ4HC_MAXD - 1)
#define LZ4HC_HASH_LOG 15
#define LZ4HC_HASHTABLESIZE (1 << LZ4HC_HASH_LOG)
#define LZ4HC_HASH_MASK (LZ4HC_HASHTABLESIZE - 1)
#if defined(__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
#include <stdint.h>
/* Internal LZ4HC compression context (C99 fixed-width variant).
 * Exposed only so LZ4_streamHC_t can be statically allocated; layout is
 * not API/ABI stable (see PRIVATE DEFINITIONS note above). */
typedef struct
{
    uint32_t hashTable[LZ4HC_HASHTABLESIZE];
    uint16_t chainTable[LZ4HC_MAXD];
    const uint8_t* end;         /* next block here to continue on current prefix */
    const uint8_t* base;        /* All index relative to this position */
    const uint8_t* dictBase;    /* alternate base for extDict */
    uint8_t* inputBuffer;       /* deprecated */
    uint32_t dictLimit;         /* below that point, need extDict */
    uint32_t lowLimit;          /* below that point, no more dict */
    uint32_t nextToUpdate;      /* index from which to continue dictionary update */
    uint32_t searchNum;         /* only for optimal parser */
    uint32_t compressionLevel;  /* level the stream was reset with */
} LZ4HC_CCtx_internal;
#else
/* Pre-C99 fallback: must mirror the C99 LZ4HC_CCtx_internal above
 * field-for-field, using plain `unsigned` types. */
typedef struct
{
    unsigned int hashTable[LZ4HC_HASHTABLESIZE];
    unsigned short chainTable[LZ4HC_MAXD];
    const unsigned char* end;        /* next block here to continue on current prefix */
    const unsigned char* base;       /* All index relative to this position */
    const unsigned char* dictBase;   /* alternate base for extDict */
    unsigned char* inputBuffer;      /* deprecated */
    unsigned int dictLimit;          /* below that point, need extDict */
    unsigned int lowLimit;           /* below that point, no more dict */
    unsigned int nextToUpdate;       /* index from which to continue dictionary update */
    unsigned int searchNum;          /* only for optimal parser */
    unsigned int compressionLevel;
} LZ4HC_CCtx_internal;
#endif
#define LZ4_STREAMHCSIZE (4*LZ4HC_HASHTABLESIZE + 2*LZ4HC_MAXD + 56) /* 393268 */
#define LZ4_STREAMHCSIZE_SIZET (LZ4_STREAMHCSIZE / sizeof(size_t))
/* Public opaque wrapper for the HC stream: `table` fixes the size
 * (LZ4_STREAMHCSIZE) and alignment; `internal_donotuse` is private. */
union LZ4_streamHC_u {
    size_t table[LZ4_STREAMHCSIZE_SIZET];
    LZ4HC_CCtx_internal internal_donotuse;
};  /* previously typedef'd to LZ4_streamHC_t */
/*
LZ4_streamHC_t :
This structure allows static allocation of LZ4 HC streaming state.
State must be initialized using LZ4_resetStreamHC() before first use.
Static allocation shall only be used in combination with static linking.
When invoking LZ4 from a DLL, use create/free functions instead, which are API and ABI stable.
*/
/*-************************************
* Deprecated Functions
**************************************/
/* Deprecate Warnings */
/* Should these warnings messages be a problem,
it is generally possible to disable them,
with -Wno-deprecated-declarations for gcc
or _CRT_SECURE_NO_WARNINGS in Visual for example.
You can also define LZ4_DEPRECATE_WARNING_DEFBLOCK. */
#ifndef LZ4_DEPRECATE_WARNING_DEFBLOCK
# define LZ4_DEPRECATE_WARNING_DEFBLOCK
# define LZ4_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
# if (LZ4_GCC_VERSION >= 405) || defined(__clang__)
# define LZ4_DEPRECATED(message) __attribute__((deprecated(message)))
# elif (LZ4_GCC_VERSION >= 301)
# define LZ4_DEPRECATED(message) __attribute__((deprecated))
# elif defined(_MSC_VER)
# define LZ4_DEPRECATED(message) __declspec(deprecated(message))
# else
# pragma message("WARNING: You need to implement LZ4_DEPRECATED for this compiler")
# define LZ4_DEPRECATED(message)
# endif
#endif // LZ4_DEPRECATE_WARNING_DEFBLOCK
/* see lz4.h LZ4_DISABLE_DEPRECATE_WARNINGS to turn off deprecation warnings */
/* compression functions */
/* these functions are planned to trigger warning messages by r131 approximately */
int LZ4_compressHC (const char* source, char* dest, int inputSize);
int LZ4_compressHC_limitedOutput (const char* source, char* dest, int inputSize, int maxOutputSize);
int LZ4_compressHC2 (const char* source, char* dest, int inputSize, int compressionLevel);
int LZ4_compressHC2_limitedOutput (const char* source, char* dest, int inputSize, int maxOutputSize, int compressionLevel);
int LZ4_compressHC_withStateHC (void* state, const char* source, char* dest, int inputSize);
int LZ4_compressHC_limitedOutput_withStateHC (void* state, const char* source, char* dest, int inputSize, int maxOutputSize);
int LZ4_compressHC2_withStateHC (void* state, const char* source, char* dest, int inputSize, int compressionLevel);
int LZ4_compressHC2_limitedOutput_withStateHC(void* state, const char* source, char* dest, int inputSize, int maxOutputSize, int compressionLevel);
int LZ4_compressHC_continue (LZ4_streamHC_t* LZ4_streamHCPtr, const char* source, char* dest, int inputSize);
int LZ4_compressHC_limitedOutput_continue (LZ4_streamHC_t* LZ4_streamHCPtr, const char* source, char* dest, int inputSize, int maxOutputSize);
/* deprecated compression functions */
/* these functions will trigger warning messages in future releases */
LZ4_DEPRECATED("use LZ4_compress_HC() instead") int LZ4_compressHC (const char* source, char* dest, int inputSize);
LZ4_DEPRECATED("use LZ4_compress_HC() instead") int LZ4_compressHC_limitedOutput (const char* source, char* dest, int inputSize, int maxOutputSize);
LZ4_DEPRECATED("use LZ4_compress_HC() instead") int LZ4_compressHC2 (const char* source, char* dest, int inputSize, int compressionLevel);
LZ4_DEPRECATED("use LZ4_compress_HC() instead") int LZ4_compressHC2_limitedOutput (const char* source, char* dest, int inputSize, int maxOutputSize, int compressionLevel);
LZ4_DEPRECATED("use LZ4_compress_HC_extStateHC() instead") int LZ4_compressHC_withStateHC (void* state, const char* source, char* dest, int inputSize);
LZ4_DEPRECATED("use LZ4_compress_HC_extStateHC() instead") int LZ4_compressHC_limitedOutput_withStateHC (void* state, const char* source, char* dest, int inputSize, int maxOutputSize);
LZ4_DEPRECATED("use LZ4_compress_HC_extStateHC() instead") int LZ4_compressHC2_withStateHC (void* state, const char* source, char* dest, int inputSize, int compressionLevel);
LZ4_DEPRECATED("use LZ4_compress_HC_extStateHC() instead") int LZ4_compressHC2_limitedOutput_withStateHC(void* state, const char* source, char* dest, int inputSize, int maxOutputSize, int compressionLevel);
LZ4_DEPRECATED("use LZ4_compress_HC_continue() instead") int LZ4_compressHC_continue (LZ4_streamHC_t* LZ4_streamHCPtr, const char* source, char* dest, int inputSize);
LZ4_DEPRECATED("use LZ4_compress_HC_continue() instead") int LZ4_compressHC_limitedOutput_continue (LZ4_streamHC_t* LZ4_streamHCPtr, const char* source, char* dest, int inputSize, int maxOutputSize);
/* Deprecated streaming functions using the older model; should no longer be used */
LZ4_DEPRECATED("use LZ4_createStreamHC() instead") void* LZ4_createHC (char* inputBuffer);
LZ4_DEPRECATED("use LZ4_saveDictHC() instead") char* LZ4_slideInputBufferHC (void* LZ4HC_Data);
LZ4_DEPRECATED("use LZ4_freeStreamHC() instead") int LZ4_freeHC (void* LZ4HC_Data);
LZ4_DEPRECATED("use LZ4_compress_HC_continue() instead") int LZ4_compressHC2_continue (void* LZ4HC_Data, const char* source, char* dest, int inputSize, int compressionLevel);
LZ4_DEPRECATED("use LZ4_compress_HC_continue() instead") int LZ4_compressHC2_limitedOutput_continue (void* LZ4HC_Data, const char* source, char* dest, int inputSize, int maxOutputSize, int compressionLevel);
LZ4_DEPRECATED("use LZ4_compress_HC_continue() instead") int LZ4_compressHC2_continue (void* LZ4HC_Data, const char* source, char* dest, int inputSize, int compressionLevel);
LZ4_DEPRECATED("use LZ4_compress_HC_continue() instead") int LZ4_compressHC2_limitedOutput_continue (void* LZ4HC_Data, const char* source, char* dest, int inputSize, int maxOutputSize, int compressionLevel);
LZ4_DEPRECATED("use LZ4_createStreamHC() instead") int LZ4_sizeofStreamStateHC(void);
LZ4_DEPRECATED("use LZ4_resetStreamHC() instead") int LZ4_resetStreamStateHC(void* state, char* inputBuffer);
@ -187,3 +224,5 @@ LZ4_DEPRECATED("use LZ4_resetStreamHC() instead") int LZ4_resetStreamStateHC(
#if defined (__cplusplus)
}
#endif
#endif /* LZ4_HC_H_19834876238432 */

View File

@ -0,0 +1,361 @@
/*
lz4opt.h - Optimal Mode of LZ4
Copyright (C) 2015-2017, Przemyslaw Skibinski <inikep@gmail.com>
Note : this file is intended to be included within lz4hc.c
BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
You can contact the author at :
- LZ4 source repository : https://github.com/lz4/lz4
- LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c
*/
#define LZ4_OPT_NUM (1<<12)
/* One candidate match found by the binary-tree search :
 * backward offset and length, both in bytes. */
typedef struct {
    int off;   /* distance back to the match (ip - matchPos) */
    int len;   /* match length */
} LZ4HC_match_t;
typedef struct {
int price;
int off;
int mlen;
int litlen;
} LZ4HC_optimal_t;
/* price in bytes */
FORCE_INLINE size_t LZ4HC_literalsPrice(size_t litlen)
{
size_t price = litlen;
if (litlen >= (size_t)RUN_MASK)
price += 1 + (litlen-RUN_MASK)/255;
return price;
}
/* requires mlen >= MINMATCH */
FORCE_INLINE size_t LZ4HC_sequencePrice(size_t litlen, size_t mlen)
{
size_t price = 2 + 1; /* 16-bit offset + token */
price += LZ4HC_literalsPrice(litlen);
if (mlen >= (size_t)(ML_MASK+MINMATCH))
price+= 1 + (mlen-(ML_MASK+MINMATCH))/255;
return price;
}
/*-*************************************
* Binary Tree search
***************************************/
/*! LZ4HC_BinTree_InsertAndGetAllMatches() :
 *  Inserts position `ip` into the search structure (hash table giving the
 *  tree root, chainTable entries at index*2 / index*2+1 acting as the two
 *  descent links) and, while descending, records into `matches` every match
 *  strictly longer than `best_mlen` (when `matches` != NULL).
 *  Writes the number of recorded matches through `matchNum` when provided.
 *  @return : always 1 == number of positions consumed (caller advances by it). */
FORCE_INLINE int LZ4HC_BinTree_InsertAndGetAllMatches (
    LZ4HC_CCtx_internal* ctx,
    const BYTE* const ip,
    const BYTE* const iHighLimit,
    size_t best_mlen,
    LZ4HC_match_t* matches,
    int* matchNum)
{
    U16* const chainTable = ctx->chainTable;
    U32* const HashTable = ctx->hashTable;
    const BYTE* const base = ctx->base;
    const U32 dictLimit = ctx->dictLimit;
    const U32 current = (U32)(ip - base);
    /* never look back further than MAX_DISTANCE nor below ctx->lowLimit */
    const U32 lowLimit = (ctx->lowLimit + MAX_DISTANCE > current) ? ctx->lowLimit : current - (MAX_DISTANCE - 1);
    const BYTE* const dictBase = ctx->dictBase;
    const BYTE* match;
    int nbAttempts = ctx->searchNum;   /* bounds the tree descent */
    int mnum = 0;                      /* matches recorded so far */
    U16 *ptr0, *ptr1, delta0, delta1;
    U32 matchIndex;
    size_t matchLength = 0;
    U32* HashPos;

    if (ip + MINMATCH > iHighLimit) return 1;   /* too close to the end to form a match */

    /* HC4 match finder */
    HashPos = &HashTable[LZ4HC_hashPtr(ip)];
    matchIndex = *HashPos;    /* old tree root for this hash */
    *HashPos = current;       /* current position becomes the new root */

    /* ptr0/ptr1 : links of the new root, to be re-wired during descent */
    ptr0 = &DELTANEXTMAXD(current*2+1);
    ptr1 = &DELTANEXTMAXD(current*2);
    delta0 = delta1 = (U16)(current - matchIndex);

    while ((matchIndex < current) && (matchIndex>=lowLimit) && (nbAttempts)) {
        nbAttempts--;
        if (matchIndex >= dictLimit) {
            /* candidate lies in the current prefix */
            match = base + matchIndex;
            matchLength = LZ4_count(ip, match, iHighLimit);
        } else {
            /* candidate lies in the external dictionary : count within it,
             * then continue counting across the dict/prefix boundary */
            const BYTE* vLimit = ip + (dictLimit - matchIndex);
            match = dictBase + matchIndex;
            if (vLimit > iHighLimit) vLimit = iHighLimit;
            matchLength = LZ4_count(ip, match, vLimit);
            if ((ip+matchLength == vLimit) && (vLimit < iHighLimit))
                matchLength += LZ4_count(ip+matchLength, base+dictLimit, iHighLimit);
        }

        if (matchLength > best_mlen) {
            /* strictly longer than anything seen so far : record it */
            best_mlen = matchLength;
            if (matches) {
                if (matchIndex >= dictLimit)
                    matches[mnum].off = (int)(ip - match);
                else
                    matches[mnum].off = (int)(ip - (base + matchIndex)); /* virtual matchpos */
                matches[mnum].len = (int)matchLength;
                mnum++;
            }
            if (best_mlen > LZ4_OPT_NUM) break;
        }

        if (ip+matchLength >= iHighLimit)   /* equal : no way to know if inf or sup */
            break;   /* drop , to guarantee consistency ; miss a bit of compression, but other solutions can corrupt the tree */

        /* descend : compare the first differing byte to pick a branch,
         * re-wiring the pending link as we go */
        if (*(ip+matchLength) < *(match+matchLength)) {
            *ptr0 = delta0;
            ptr0 = &DELTANEXTMAXD(matchIndex*2);
            if (*ptr0 == (U16)-1) break;   /* (U16)-1 marks a leaf */
            delta0 = *ptr0;
            delta1 += delta0;
            matchIndex -= delta0;
        } else {
            *ptr1 = delta1;
            ptr1 = &DELTANEXTMAXD(matchIndex*2+1);
            if (*ptr1 == (U16)-1) break;
            delta1 = *ptr1;
            delta0 += delta1;
            matchIndex -= delta1;
        }
    }

    /* terminate the two dangling links of the re-built subtree */
    *ptr0 = (U16)-1;
    *ptr1 = (U16)-1;
    if (matchNum) *matchNum = mnum;
    /* if (best_mlen > 8) return best_mlen-8; */
    if (!matchNum) return 1;
    return 1;
}
/*! LZ4HC_updateBinTree() :
 *  Inserts every not-yet-indexed position in [ctx->nextToUpdate, ip) into the
 *  binary tree, advancing by the step each insertion reports.
 *  Note : ctx->nextToUpdate itself is updated by the caller. */
FORCE_INLINE void LZ4HC_updateBinTree(LZ4HC_CCtx_internal* ctx, const BYTE* const ip, const BYTE* const iHighLimit)
{
    const BYTE* const prefixStart = ctx->base;
    const U32 target = (U32)(ip - prefixStart);
    U32 pos;
    for (pos = ctx->nextToUpdate; pos < target; )
        pos += LZ4HC_BinTree_InsertAndGetAllMatches(ctx, prefixStart+pos, iHighLimit, 8, NULL, NULL);
}
/** Tree updater, providing best match */
FORCE_INLINE int LZ4HC_BinTree_GetAllMatches (
LZ4HC_CCtx_internal* ctx,
const BYTE* const ip, const BYTE* const iHighLimit,
size_t best_mlen, LZ4HC_match_t* matches, const int fullUpdate)
{
int mnum = 0;
if (ip < ctx->base + ctx->nextToUpdate) return 0; /* skipped area */
if (fullUpdate) LZ4HC_updateBinTree(ctx, ip, iHighLimit);
best_mlen = LZ4HC_BinTree_InsertAndGetAllMatches(ctx, ip, iHighLimit, best_mlen, matches, &mnum);
ctx->nextToUpdate = (U32)(ip - ctx->base + best_mlen);
return mnum;
}
/* SET_PRICE(pos, ml, offset, ll, cost) :
 * records (ml, offset, ll, cost) into opt[pos], first padding any cells
 * between last_pos and pos with an "infinite" price (1<<30) so they are
 * never preferred. Mutates the caller's `opt` and `last_pos` — only usable
 * inside LZ4HC_compress_optimal(). */
#define SET_PRICE(pos, ml, offset, ll, cost) \
{ \
    while (last_pos < pos) { opt[last_pos+1].price = 1<<30; last_pos++; } \
    opt[pos].mlen = (int)ml; \
    opt[pos].off = (int)offset; \
    opt[pos].litlen = (int)ll; \
    opt[pos].price = (int)cost; \
}
/*! LZ4HC_compress_optimal() :
 *  Optimal-parsing compressor : at each input position it gathers all
 *  candidate matches, fills a dynamic-programming price table (opt[]) with
 *  the cheapest encoding cost of every reachable position, then walks the
 *  table backwards to emit the cheapest sequence chain.
 *  @param limit          : whether output is bounded by maxOutputSize
 *  @param sufficient_len : a match this long is taken immediately, skipping DP
 *  @param fullUpdate     : insert every position into the tree (slower, better)
 *  @return : compressed size, or 0 if the output limit was exceeded. */
static int LZ4HC_compress_optimal (
    LZ4HC_CCtx_internal* ctx,
    const char* const source,
    char* dest,
    int inputSize,
    int maxOutputSize,
    limitedOutput_directive limit,
    size_t sufficient_len,
    const int fullUpdate
    )
{
    LZ4HC_optimal_t opt[LZ4_OPT_NUM + 1];   /* this uses a bit too much stack memory to my taste ... */
    LZ4HC_match_t matches[LZ4_OPT_NUM + 1];
    const BYTE* ip = (const BYTE*) source;
    const BYTE* anchor = ip;                /* start of pending (not-yet-emitted) literals */
    const BYTE* const iend = ip + inputSize;
    const BYTE* const mflimit = iend - MFLIMIT;
    const BYTE* const matchlimit = (iend - LASTLITERALS);
    BYTE* op = (BYTE*) dest;
    BYTE* const oend = op + maxOutputSize;

    /* init */
    if (sufficient_len >= LZ4_OPT_NUM) sufficient_len = LZ4_OPT_NUM-1;
    ctx->end += inputSize;
    ip++;

    /* Main Loop */
    while (ip < mflimit) {
        size_t const llen = ip - anchor;
        size_t last_pos = 0;                /* furthest position priced in opt[] */
        size_t match_num, cur, best_mlen, best_off;
        memset(opt, 0, sizeof(LZ4HC_optimal_t));  /* memset only the first one */

        match_num = LZ4HC_BinTree_GetAllMatches(ctx, ip, matchlimit, MINMATCH-1, matches, fullUpdate);
        if (!match_num) { ip++; continue; }   /* no match here : emit as literal later */

        if ((size_t)matches[match_num-1].len > sufficient_len) {
            /* good enough solution : immediate encoding */
            best_mlen = matches[match_num-1].len;
            best_off = matches[match_num-1].off;
            cur = 0;
            last_pos = 1;
            goto encode;
        }

        /* set prices using matches at position = 0 */
        {   size_t matchNb;
            for (matchNb = 0; matchNb < match_num; matchNb++) {
                size_t mlen = (matchNb>0) ? (size_t)matches[matchNb-1].len+1 : MINMATCH;
                best_mlen = matches[matchNb].len;   /* necessarily < sufficient_len < LZ4_OPT_NUM */
                for ( ; mlen <= best_mlen ; mlen++) {
                    size_t const cost = LZ4HC_sequencePrice(llen, mlen) - LZ4HC_literalsPrice(llen);
                    SET_PRICE(mlen, mlen, matches[matchNb].off, 0, cost);   /* updates last_pos and opt[pos] */
        }   }   }

        if (last_pos < MINMATCH) { ip++; continue; }  /* note : on clang at least, this test improves performance */

        /* check further positions */
        opt[0].mlen = opt[1].mlen = 1;
        for (cur = 1; cur <= last_pos; cur++) {
            const BYTE* const curPtr = ip + cur;

            /* establish baseline price if cur is literal */
            {   size_t price, litlen;
                if (opt[cur-1].mlen == 1) {
                    /* no match at previous position : extend its literal run */
                    litlen = opt[cur-1].litlen + 1;
                    if (cur > litlen) {
                        price = opt[cur - litlen].price + LZ4HC_literalsPrice(litlen);
                    } else {
                        /* run reaches back before ip : price only the marginal part */
                        price = LZ4HC_literalsPrice(llen + litlen) - LZ4HC_literalsPrice(llen);
                    }
                } else {
                    litlen = 1;
                    price = opt[cur - 1].price + LZ4HC_literalsPrice(1);
                }

                if (price < (size_t)opt[cur].price)
                    SET_PRICE(cur, 1 /*mlen*/, 0 /*off*/, litlen, price);   /* note : increases last_pos */
            }

            if (cur == last_pos || curPtr >= mflimit) break;

            match_num = LZ4HC_BinTree_GetAllMatches(ctx, curPtr, matchlimit, MINMATCH-1, matches, fullUpdate);
            if ((match_num > 0) && (size_t)matches[match_num-1].len > sufficient_len) {
                /* immediate encoding */
                best_mlen = matches[match_num-1].len;
                best_off = matches[match_num-1].off;
                last_pos = cur + 1;
                goto encode;
            }

            /* set prices using matches at position = cur */
            {   size_t matchNb;
                for (matchNb = 0; matchNb < match_num; matchNb++) {
                    size_t ml = (matchNb>0) ? (size_t)matches[matchNb-1].len+1 : MINMATCH;
                    /* clamp so cur+ml never runs past the opt[] table */
                    best_mlen = (cur + matches[matchNb].len < LZ4_OPT_NUM) ?
                                (size_t)matches[matchNb].len : LZ4_OPT_NUM - cur;

                    for ( ; ml <= best_mlen ; ml++) {
                        size_t ll, price;
                        if (opt[cur].mlen == 1) {
                            ll = opt[cur].litlen;
                            if (cur > ll)
                                price = opt[cur - ll].price + LZ4HC_sequencePrice(ll, ml);
                            else
                                price = LZ4HC_sequencePrice(llen + ll, ml) - LZ4HC_literalsPrice(llen);
                        } else {
                            ll = 0;
                            price = opt[cur].price + LZ4HC_sequencePrice(0, ml);
                        }

                        if (cur + ml > last_pos || price < (size_t)opt[cur + ml].price) {
                            SET_PRICE(cur + ml, ml, matches[matchNb].off, ll, price);
            }   }   }   }
        } /* for (cur = 1; cur <= last_pos; cur++) */

        best_mlen = opt[last_pos].mlen;
        best_off = opt[last_pos].off;
        cur = last_pos - best_mlen;

encode: /* cur, last_pos, best_mlen, best_off must be set */
        opt[0].mlen = 1;
        /* reverse the implicit chain : rewrite opt[] so it can be read forward */
        while (1) {  /* from end to beginning */
            size_t const ml = opt[cur].mlen;
            int const offset = opt[cur].off;
            opt[cur].mlen = (int)best_mlen;
            opt[cur].off = (int)best_off;
            best_mlen = ml;
            best_off = offset;
            if (ml > cur) break;  /* can this happen ? */
            cur -= ml;
        }

        /* encode all recorded sequences */
        cur = 0;
        while (cur < last_pos) {
            int const ml = opt[cur].mlen;
            int const offset = opt[cur].off;
            if (ml == 1) { ip++; cur++; continue; }   /* literal : leave for next sequence's run */
            cur += ml;
            if ( LZ4HC_encodeSequence(&ip, &op, &anchor, ml, ip - offset, limit, oend) ) return 0;
        }
    }  /* while (ip < mflimit) */

    /* Encode Last Literals */
    {   int lastRun = (int)(iend - anchor);
        if ((limit) && (((char*)op - dest) + lastRun + 1 + ((lastRun+255-RUN_MASK)/255) > (U32)maxOutputSize)) return 0;  /* Check output limit */
        if (lastRun>=(int)RUN_MASK) { *op++=(RUN_MASK<<ML_BITS); lastRun-=RUN_MASK; for(; lastRun > 254 ; lastRun-=255) *op++ = 255; *op++ = (BYTE) lastRun; }
        else *op++ = (BYTE)(lastRun<<ML_BITS);
        memcpy(op, anchor, iend - anchor);
        op += iend-anchor;
    }

    /* End */
    return (int) ((char*)op-dest);
}

File diff suppressed because it is too large Load Diff

View File

@ -1,6 +1,6 @@
/*
LZ4 HC - High Compression Mode of LZ4
Copyright (C) 2011-2015, Yann Collet.
Copyright (C) 2011-2016, Yann Collet.
BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
@ -28,27 +28,36 @@
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
You can contact the author at :
- LZ4 source repository : https://github.com/Cyan4973/lz4
- LZ4 source repository : https://github.com/lz4/lz4
- LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c
*/
/* note : lz4hc is not an independent module, it requires lz4.h/lz4.c for proper compilation */
/**************************************
/* *************************************
* Tuning Parameter
**************************************/
static const int LZ4HC_compressionLevel_default = 9;
***************************************/
/*!
* HEAPMODE :
* Select how default compression function will allocate workplace memory,
* in stack (0:fastest), or in heap (1:requires malloc()).
* Since workplace is rather large, heap mode is recommended.
*/
#ifndef LZ4HC_HEAPMODE
# define LZ4HC_HEAPMODE 1
#endif
/**************************************
* Includes
**************************************/
#include <lz4/lz4hc.h>
/* *************************************
* Dependency
***************************************/
#include "lz4hc.h"
/**************************************
/* *************************************
* Local Compiler Options
**************************************/
***************************************/
#if defined(__GNUC__)
# pragma GCC diagnostic ignored "-Wunused-function"
#endif
@ -58,52 +67,24 @@ static const int LZ4HC_compressionLevel_default = 9;
#endif
/**************************************
/* *************************************
* Common LZ4 definition
**************************************/
***************************************/
#define LZ4_COMMONDEFS_ONLY
#include "lz4.c"
/**************************************
/* *************************************
* Local Constants
**************************************/
#define DICTIONARY_LOGSIZE 16
#define MAXD (1<<DICTIONARY_LOGSIZE)
#define MAXD_MASK (MAXD - 1)
#define HASH_LOG (DICTIONARY_LOGSIZE-1)
#define HASHTABLESIZE (1 << HASH_LOG)
#define HASH_MASK (HASHTABLESIZE - 1)
***************************************/
#define OPTIMAL_ML (int)((ML_MASK-1)+MINMATCH)
static const int g_maxCompressionLevel = 16;
/**************************************
* Local Types
**************************************/
typedef struct
{
U32 hashTable[HASHTABLESIZE];
U16 chainTable[MAXD];
const BYTE* end; /* next block here to continue on current prefix */
const BYTE* base; /* All index relative to this position */
const BYTE* dictBase; /* alternate base for extDict */
BYTE* inputBuffer; /* deprecated */
U32 dictLimit; /* below that point, need extDict */
U32 lowLimit; /* below that point, no more dict */
U32 nextToUpdate; /* index from which to continue dictionary update */
U32 compressionLevel;
} LZ4HC_Data_Structure;
/**************************************
* Local Macros
**************************************/
#define HASH_FUNCTION(i) (((i) * 2654435761U) >> ((MINMATCH*8)-HASH_LOG))
//#define DELTANEXTU16(p) chainTable[(p) & MAXD_MASK] /* flexible, MAXD dependent */
#define HASH_FUNCTION(i) (((i) * 2654435761U) >> ((MINMATCH*8)-LZ4HC_HASH_LOG))
#define DELTANEXTMAXD(p) chainTable[(p) & LZ4HC_MAXD_MASK] /* flexible, LZ4HC_MAXD dependent */
#define DELTANEXTU16(p) chainTable[(U16)(p)] /* faster */
static U32 LZ4HC_hashPtr(const void* ptr) { return HASH_FUNCTION(LZ4_read32(ptr)); }
@ -113,7 +94,7 @@ static U32 LZ4HC_hashPtr(const void* ptr) { return HASH_FUNCTION(LZ4_read32(ptr)
/**************************************
* HC Compression
**************************************/
static void LZ4HC_init (LZ4HC_Data_Structure* hc4, const BYTE* start)
static void LZ4HC_init (LZ4HC_CCtx_internal* hc4, const BYTE* start)
{
MEM_INIT((void*)hc4->hashTable, 0, sizeof(hc4->hashTable));
MEM_INIT(hc4->chainTable, 0xFF, sizeof(hc4->chainTable));
@ -127,21 +108,20 @@ static void LZ4HC_init (LZ4HC_Data_Structure* hc4, const BYTE* start)
/* Update chains up to ip (excluded) */
FORCE_INLINE void LZ4HC_Insert (LZ4HC_Data_Structure* hc4, const BYTE* ip)
FORCE_INLINE void LZ4HC_Insert (LZ4HC_CCtx_internal* hc4, const BYTE* ip)
{
U16* chainTable = hc4->chainTable;
U32* HashTable = hc4->hashTable;
U16* const chainTable = hc4->chainTable;
U32* const hashTable = hc4->hashTable;
const BYTE* const base = hc4->base;
const U32 target = (U32)(ip - base);
U32 const target = (U32)(ip - base);
U32 idx = hc4->nextToUpdate;
while(idx < target)
{
U32 h = LZ4HC_hashPtr(base+idx);
size_t delta = idx - HashTable[h];
while (idx < target) {
U32 const h = LZ4HC_hashPtr(base+idx);
size_t delta = idx - hashTable[h];
if (delta>MAX_DISTANCE) delta = MAX_DISTANCE;
DELTANEXTU16(idx) = (U16)delta;
HashTable[h] = idx;
hashTable[h] = idx;
idx++;
}
@ -149,7 +129,7 @@ FORCE_INLINE void LZ4HC_Insert (LZ4HC_Data_Structure* hc4, const BYTE* ip)
}
FORCE_INLINE int LZ4HC_InsertAndFindBestMatch (LZ4HC_Data_Structure* hc4, /* Index table will be updated */
FORCE_INLINE int LZ4HC_InsertAndFindBestMatch (LZ4HC_CCtx_internal* hc4, /* Index table will be updated */
const BYTE* ip, const BYTE* const iLimit,
const BYTE** matchpos,
const int maxNbAttempts)
@ -161,7 +141,6 @@ FORCE_INLINE int LZ4HC_InsertAndFindBestMatch (LZ4HC_Data_Structure* hc4, /* I
const U32 dictLimit = hc4->dictLimit;
const U32 lowLimit = (hc4->lowLimit + 64 KB > (U32)(ip-base)) ? hc4->lowLimit : (U32)(ip - base) - (64 KB - 1);
U32 matchIndex;
const BYTE* match;
int nbAttempts=maxNbAttempts;
size_t ml=0;
@ -169,24 +148,19 @@ FORCE_INLINE int LZ4HC_InsertAndFindBestMatch (LZ4HC_Data_Structure* hc4, /* I
LZ4HC_Insert(hc4, ip);
matchIndex = HashTable[LZ4HC_hashPtr(ip)];
while ((matchIndex>=lowLimit) && (nbAttempts))
{
while ((matchIndex>=lowLimit) && (nbAttempts)) {
nbAttempts--;
if (matchIndex >= dictLimit)
{
match = base + matchIndex;
if (matchIndex >= dictLimit) {
const BYTE* const match = base + matchIndex;
if (*(match+ml) == *(ip+ml)
&& (LZ4_read32(match) == LZ4_read32(ip)))
{
size_t mlt = LZ4_count(ip+MINMATCH, match+MINMATCH, iLimit) + MINMATCH;
size_t const mlt = LZ4_count(ip+MINMATCH, match+MINMATCH, iLimit) + MINMATCH;
if (mlt > ml) { ml = mlt; *matchpos = match; }
}
}
else
{
match = dictBase + matchIndex;
if (LZ4_read32(match) == LZ4_read32(ip))
{
} else {
const BYTE* const match = dictBase + matchIndex;
if (LZ4_read32(match) == LZ4_read32(ip)) {
size_t mlt;
const BYTE* vLimit = ip + (dictLimit - matchIndex);
if (vLimit > iLimit) vLimit = iLimit;
@ -204,7 +178,7 @@ FORCE_INLINE int LZ4HC_InsertAndFindBestMatch (LZ4HC_Data_Structure* hc4, /* I
FORCE_INLINE int LZ4HC_InsertAndGetWiderMatch (
LZ4HC_Data_Structure* hc4,
LZ4HC_CCtx_internal* hc4,
const BYTE* const ip,
const BYTE* const iLowLimit,
const BYTE* const iHighLimit,
@ -229,38 +203,32 @@ FORCE_INLINE int LZ4HC_InsertAndGetWiderMatch (
LZ4HC_Insert(hc4, ip);
matchIndex = HashTable[LZ4HC_hashPtr(ip)];
while ((matchIndex>=lowLimit) && (nbAttempts))
{
while ((matchIndex>=lowLimit) && (nbAttempts)) {
nbAttempts--;
if (matchIndex >= dictLimit)
{
if (matchIndex >= dictLimit) {
const BYTE* matchPtr = base + matchIndex;
if (*(iLowLimit + longest) == *(matchPtr - delta + longest))
if (LZ4_read32(matchPtr) == LZ4_read32(ip))
{
if (*(iLowLimit + longest) == *(matchPtr - delta + longest)) {
if (LZ4_read32(matchPtr) == LZ4_read32(ip)) {
int mlt = MINMATCH + LZ4_count(ip+MINMATCH, matchPtr+MINMATCH, iHighLimit);
int back = 0;
while ((ip+back>iLowLimit)
while ((ip+back > iLowLimit)
&& (matchPtr+back > lowPrefixPtr)
&& (ip[back-1] == matchPtr[back-1]))
back--;
mlt -= back;
if (mlt > longest)
{
if (mlt > longest) {
longest = (int)mlt;
*matchpos = matchPtr+back;
*startpos = ip+back;
}
}
}
else
{
const BYTE* matchPtr = dictBase + matchIndex;
if (LZ4_read32(matchPtr) == LZ4_read32(ip))
{
}
} else {
const BYTE* const matchPtr = dictBase + matchIndex;
if (LZ4_read32(matchPtr) == LZ4_read32(ip)) {
size_t mlt;
int back=0;
const BYTE* vLimit = ip + (dictLimit - matchIndex);
@ -320,8 +288,15 @@ FORCE_INLINE int LZ4HC_encodeSequence (
/* Encode MatchLength */
length = (int)(matchLength-MINMATCH);
if ((limitedOutputBuffer) && (*op + (length>>8) + (1 + LASTLITERALS) > oend)) return 1; /* Check output limit */
if (length>=(int)ML_MASK) { *token+=ML_MASK; length-=ML_MASK; for(; length > 509 ; length-=510) { *(*op)++ = 255; *(*op)++ = 255; } if (length > 254) { length-=255; *(*op)++ = 255; } *(*op)++ = (BYTE)length; }
else *token += (BYTE)(length);
if (length>=(int)ML_MASK) {
*token += ML_MASK;
length -= ML_MASK;
for(; length > 509 ; length-=510) { *(*op)++ = 255; *(*op)++ = 255; }
if (length > 254) { length-=255; *(*op)++ = 255; }
*(*op)++ = (BYTE)length;
} else {
*token += (BYTE)(length);
}
/* Prepare next loop */
*ip += matchLength;
@ -330,18 +305,18 @@ FORCE_INLINE int LZ4HC_encodeSequence (
return 0;
}
#include "lz4opt.h"
static int LZ4HC_compress_generic (
void* ctxvoid,
const char* source,
char* dest,
int inputSize,
int maxOutputSize,
int compressionLevel,
static int LZ4HC_compress_hashChain (
LZ4HC_CCtx_internal* const ctx,
const char* const source,
char* const dest,
int const inputSize,
int const maxOutputSize,
unsigned maxNbAttempts,
limitedOutput_directive limit
)
{
LZ4HC_Data_Structure* ctx = (LZ4HC_Data_Structure*) ctxvoid;
const BYTE* ip = (const BYTE*) source;
const BYTE* anchor = ip;
const BYTE* const iend = ip + inputSize;
@ -351,28 +326,22 @@ static int LZ4HC_compress_generic (
BYTE* op = (BYTE*) dest;
BYTE* const oend = op + maxOutputSize;
unsigned maxNbAttempts;
int ml, ml2, ml3, ml0;
const BYTE* ref=NULL;
const BYTE* start2=NULL;
const BYTE* ref2=NULL;
const BYTE* start3=NULL;
const BYTE* ref3=NULL;
const BYTE* ref = NULL;
const BYTE* start2 = NULL;
const BYTE* ref2 = NULL;
const BYTE* start3 = NULL;
const BYTE* ref3 = NULL;
const BYTE* start0;
const BYTE* ref0;
/* init */
if (compressionLevel > g_maxCompressionLevel) compressionLevel = g_maxCompressionLevel;
if (compressionLevel < 1) compressionLevel = LZ4HC_compressionLevel_default;
maxNbAttempts = 1 << (compressionLevel-1);
ctx->end += inputSize;
ip++;
/* Main Loop */
while (ip < mflimit)
{
while (ip < mflimit) {
ml = LZ4HC_InsertAndFindBestMatch (ctx, ip, matchlimit, (&ref), maxNbAttempts);
if (!ml) { ip++; continue; }
@ -383,19 +352,16 @@ static int LZ4HC_compress_generic (
_Search2:
if (ip+ml < mflimit)
ml2 = LZ4HC_InsertAndGetWiderMatch(ctx, ip + ml - 2, ip + 1, matchlimit, ml, &ref2, &start2, maxNbAttempts);
ml2 = LZ4HC_InsertAndGetWiderMatch(ctx, ip + ml - 2, ip + 0, matchlimit, ml, &ref2, &start2, maxNbAttempts);
else ml2 = ml;
if (ml2 == ml) /* No better match */
{
if (ml2 == ml) { /* No better match */
if (LZ4HC_encodeSequence(&ip, &op, &anchor, ml, ref, limit, oend)) return 0;
continue;
}
if (start0 < ip)
{
if (start2 < ip + ml0) /* empirical */
{
if (start0 < ip) {
if (start2 < ip + ml0) { /* empirical */
ip = start0;
ref = ref0;
ml = ml0;
@ -403,8 +369,7 @@ _Search2:
}
/* Here, start0==ip */
if ((start2 - ip) < 3) /* First Match too small : removed */
{
if ((start2 - ip) < 3) { /* First Match too small : removed */
ml = ml2;
ip = start2;
ref =ref2;
@ -417,15 +382,13 @@ _Search3:
* ml2 > ml1, and
* ip1+3 <= ip2 (usually < ip1+ml1)
*/
if ((start2 - ip) < OPTIMAL_ML)
{
if ((start2 - ip) < OPTIMAL_ML) {
int correction;
int new_ml = ml;
if (new_ml > OPTIMAL_ML) new_ml = OPTIMAL_ML;
if (ip+new_ml > start2 + ml2 - MINMATCH) new_ml = (int)(start2 - ip) + ml2 - MINMATCH;
correction = new_ml - (int)(start2 - ip);
if (correction > 0)
{
if (correction > 0) {
start2 += correction;
ref2 += correction;
ml2 -= correction;
@ -437,8 +400,7 @@ _Search3:
ml3 = LZ4HC_InsertAndGetWiderMatch(ctx, start2 + ml2 - 3, start2, matchlimit, ml2, &ref3, &start3, maxNbAttempts);
else ml3 = ml2;
if (ml3 == ml2) /* No better match : 2 sequences to encode */
{
if (ml3 == ml2) { /* No better match : 2 sequences to encode */
/* ip & ref are known; Now for ml */
if (start2 < ip+ml) ml = (int)(start2 - ip);
/* Now, encode 2 sequences */
@ -448,18 +410,14 @@ _Search3:
continue;
}
if (start3 < ip+ml+3) /* Not enough space for match 2 : remove it */
{
if (start3 >= (ip+ml)) /* can write Seq1 immediately ==> Seq2 is removed, so Seq3 becomes Seq1 */
{
if (start2 < ip+ml)
{
if (start3 < ip+ml+3) { /* Not enough space for match 2 : remove it */
if (start3 >= (ip+ml)) { /* can write Seq1 immediately ==> Seq2 is removed, so Seq3 becomes Seq1 */
if (start2 < ip+ml) {
int correction = (int)(ip+ml - start2);
start2 += correction;
ref2 += correction;
ml2 -= correction;
if (ml2 < MINMATCH)
{
if (ml2 < MINMATCH) {
start2 = start3;
ref2 = ref3;
ml2 = ml3;
@ -487,23 +445,18 @@ _Search3:
* OK, now we have 3 ascending matches; let's write at least the first one
* ip & ref are known; Now for ml
*/
if (start2 < ip+ml)
{
if ((start2 - ip) < (int)ML_MASK)
{
if (start2 < ip+ml) {
if ((start2 - ip) < (int)ML_MASK) {
int correction;
if (ml > OPTIMAL_ML) ml = OPTIMAL_ML;
if (ip + ml > start2 + ml2 - MINMATCH) ml = (int)(start2 - ip) + ml2 - MINMATCH;
correction = ml - (int)(start2 - ip);
if (correction > 0)
{
if (correction > 0) {
start2 += correction;
ref2 += correction;
ml2 -= correction;
}
}
else
{
} else {
ml = (int)(start2 - ip);
}
}
@ -521,8 +474,7 @@ _Search3:
}
/* Encode Last Literals */
{
int lastRun = (int)(iend - anchor);
{ int lastRun = (int)(iend - anchor);
if ((limit) && (((char*)op - dest) + lastRun + 1 + ((lastRun+255-RUN_MASK)/255) > (U32)maxOutputSize)) return 0; /* Check output limit */
if (lastRun>=(int)RUN_MASK) { *op++=(RUN_MASK<<ML_BITS); lastRun-=RUN_MASK; for(; lastRun > 254 ; lastRun-=255) *op++ = 255; *op++ = (BYTE) lastRun; }
else *op++ = (BYTE)(lastRun<<ML_BITS);
@ -534,23 +486,64 @@ _Search3:
return (int) (((char*)op)-dest);
}
static int LZ4HC_getSearchNum(int compressionLevel)
{
switch (compressionLevel) {
default: return 0; /* unused */
case 11: return 128;
case 12: return 1<<10;
}
}
int LZ4_sizeofStateHC(void) { return sizeof(LZ4HC_Data_Structure); }
static int LZ4HC_compress_generic (
LZ4HC_CCtx_internal* const ctx,
const char* const source,
char* const dest,
int const inputSize,
int const maxOutputSize,
int compressionLevel,
limitedOutput_directive limit
)
{
if (compressionLevel < 1) compressionLevel = LZ4HC_CLEVEL_DEFAULT;
if (compressionLevel > 9) {
switch (compressionLevel) {
case 10: return LZ4HC_compress_hashChain(ctx, source, dest, inputSize, maxOutputSize, 1 << (16-1), limit);
case 11: ctx->searchNum = LZ4HC_getSearchNum(compressionLevel); return LZ4HC_compress_optimal(ctx, source, dest, inputSize, maxOutputSize, limit, 128, 0);
default:
case 12: ctx->searchNum = LZ4HC_getSearchNum(compressionLevel); return LZ4HC_compress_optimal(ctx, source, dest, inputSize, maxOutputSize, limit, LZ4_OPT_NUM, 1);
}
}
return LZ4HC_compress_hashChain(ctx, source, dest, inputSize, maxOutputSize, 1 << (compressionLevel-1), limit);
}
int LZ4_sizeofStateHC(void) { return sizeof(LZ4_streamHC_t); }
int LZ4_compress_HC_extStateHC (void* state, const char* src, char* dst, int srcSize, int maxDstSize, int compressionLevel)
{
LZ4HC_CCtx_internal* ctx = &((LZ4_streamHC_t*)state)->internal_donotuse;
if (((size_t)(state)&(sizeof(void*)-1)) != 0) return 0; /* Error : state is not aligned for pointers (32 or 64 bits) */
LZ4HC_init ((LZ4HC_Data_Structure*)state, (const BYTE*)src);
LZ4HC_init (ctx, (const BYTE*)src);
if (maxDstSize < LZ4_compressBound(srcSize))
return LZ4HC_compress_generic (state, src, dst, srcSize, maxDstSize, compressionLevel, limitedOutput);
return LZ4HC_compress_generic (ctx, src, dst, srcSize, maxDstSize, compressionLevel, limitedOutput);
else
return LZ4HC_compress_generic (state, src, dst, srcSize, maxDstSize, compressionLevel, noLimit);
return LZ4HC_compress_generic (ctx, src, dst, srcSize, maxDstSize, compressionLevel, noLimit);
}
int LZ4_compress_HC(const char* src, char* dst, int srcSize, int maxDstSize, int compressionLevel)
{
LZ4HC_Data_Structure state;
return LZ4_compress_HC_extStateHC(&state, src, dst, srcSize, maxDstSize, compressionLevel);
#if defined(LZ4HC_HEAPMODE) && LZ4HC_HEAPMODE==1
LZ4_streamHC_t* const statePtr = (LZ4_streamHC_t*)malloc(sizeof(LZ4_streamHC_t));
#else
LZ4_streamHC_t state;
LZ4_streamHC_t* const statePtr = &state;
#endif
int const cSize = LZ4_compress_HC_extStateHC(statePtr, src, dst, srcSize, maxDstSize, compressionLevel);
#if defined(LZ4HC_HEAPMODE) && LZ4HC_HEAPMODE==1
free(statePtr);
#endif
return cSize;
}
@ -566,32 +559,38 @@ int LZ4_freeStreamHC (LZ4_streamHC_t* LZ4_streamHCPtr) { free(LZ4_st
/* initialization */
void LZ4_resetStreamHC (LZ4_streamHC_t* LZ4_streamHCPtr, int compressionLevel)
{
LZ4_STATIC_ASSERT(sizeof(LZ4HC_Data_Structure) <= sizeof(LZ4_streamHC_t)); /* if compilation fails here, LZ4_STREAMHCSIZE must be increased */
((LZ4HC_Data_Structure*)LZ4_streamHCPtr)->base = NULL;
((LZ4HC_Data_Structure*)LZ4_streamHCPtr)->compressionLevel = (unsigned)compressionLevel;
LZ4_STATIC_ASSERT(sizeof(LZ4HC_CCtx_internal) <= sizeof(size_t) * LZ4_STREAMHCSIZE_SIZET); /* if compilation fails here, LZ4_STREAMHCSIZE must be increased */
LZ4_streamHCPtr->internal_donotuse.base = NULL;
LZ4_streamHCPtr->internal_donotuse.compressionLevel = (unsigned)compressionLevel;
LZ4_streamHCPtr->internal_donotuse.searchNum = LZ4HC_getSearchNum(compressionLevel);
}
int LZ4_loadDictHC (LZ4_streamHC_t* LZ4_streamHCPtr, const char* dictionary, int dictSize)
{
LZ4HC_Data_Structure* ctxPtr = (LZ4HC_Data_Structure*) LZ4_streamHCPtr;
if (dictSize > 64 KB)
{
LZ4HC_CCtx_internal* ctxPtr = &LZ4_streamHCPtr->internal_donotuse;
if (dictSize > 64 KB) {
dictionary += dictSize - 64 KB;
dictSize = 64 KB;
}
LZ4HC_init (ctxPtr, (const BYTE*)dictionary);
if (dictSize >= 4) LZ4HC_Insert (ctxPtr, (const BYTE*)dictionary +(dictSize-3));
ctxPtr->end = (const BYTE*)dictionary + dictSize;
if (ctxPtr->compressionLevel >= LZ4HC_CLEVEL_OPT_MIN)
LZ4HC_updateBinTree(ctxPtr, ctxPtr->end - MFLIMIT, ctxPtr->end - LASTLITERALS);
else
if (dictSize >= 4) LZ4HC_Insert (ctxPtr, ctxPtr->end-3);
return dictSize;
}
/* compression */
static void LZ4HC_setExternalDict(LZ4HC_Data_Structure* ctxPtr, const BYTE* newBlock)
static void LZ4HC_setExternalDict(LZ4HC_CCtx_internal* ctxPtr, const BYTE* newBlock)
{
if (ctxPtr->end >= ctxPtr->base + 4)
LZ4HC_Insert (ctxPtr, ctxPtr->end-3); /* Referencing remaining dictionary content */
if (ctxPtr->compressionLevel >= LZ4HC_CLEVEL_OPT_MIN)
LZ4HC_updateBinTree(ctxPtr, ctxPtr->end - MFLIMIT, ctxPtr->end - LASTLITERALS);
else
if (ctxPtr->end >= ctxPtr->base + 4) LZ4HC_Insert (ctxPtr, ctxPtr->end-3); /* Referencing remaining dictionary content */
/* Only one memory segment for extDict, so any previous extDict is lost at this stage */
ctxPtr->lowLimit = ctxPtr->dictLimit;
ctxPtr->dictLimit = (U32)(ctxPtr->end - ctxPtr->base);
@ -601,34 +600,29 @@ static void LZ4HC_setExternalDict(LZ4HC_Data_Structure* ctxPtr, const BYTE* newB
ctxPtr->nextToUpdate = ctxPtr->dictLimit; /* match referencing will resume from there */
}
static int LZ4_compressHC_continue_generic (LZ4HC_Data_Structure* ctxPtr,
static int LZ4_compressHC_continue_generic (LZ4_streamHC_t* LZ4_streamHCPtr,
const char* source, char* dest,
int inputSize, int maxOutputSize, limitedOutput_directive limit)
{
LZ4HC_CCtx_internal* ctxPtr = &LZ4_streamHCPtr->internal_donotuse;
/* auto-init if forgotten */
if (ctxPtr->base == NULL)
LZ4HC_init (ctxPtr, (const BYTE*) source);
if (ctxPtr->base == NULL) LZ4HC_init (ctxPtr, (const BYTE*) source);
/* Check overflow */
if ((size_t)(ctxPtr->end - ctxPtr->base) > 2 GB)
{
if ((size_t)(ctxPtr->end - ctxPtr->base) > 2 GB) {
size_t dictSize = (size_t)(ctxPtr->end - ctxPtr->base) - ctxPtr->dictLimit;
if (dictSize > 64 KB) dictSize = 64 KB;
LZ4_loadDictHC((LZ4_streamHC_t*)ctxPtr, (const char*)(ctxPtr->end) - dictSize, (int)dictSize);
LZ4_loadDictHC(LZ4_streamHCPtr, (const char*)(ctxPtr->end) - dictSize, (int)dictSize);
}
/* Check if blocks follow each other */
if ((const BYTE*)source != ctxPtr->end)
LZ4HC_setExternalDict(ctxPtr, (const BYTE*)source);
if ((const BYTE*)source != ctxPtr->end) LZ4HC_setExternalDict(ctxPtr, (const BYTE*)source);
/* Check overlapping input/dictionary space */
{
const BYTE* sourceEnd = (const BYTE*) source + inputSize;
const BYTE* dictBegin = ctxPtr->dictBase + ctxPtr->lowLimit;
const BYTE* dictEnd = ctxPtr->dictBase + ctxPtr->dictLimit;
if ((sourceEnd > dictBegin) && ((const BYTE*)source < dictEnd))
{
{ const BYTE* sourceEnd = (const BYTE*) source + inputSize;
const BYTE* const dictBegin = ctxPtr->dictBase + ctxPtr->lowLimit;
const BYTE* const dictEnd = ctxPtr->dictBase + ctxPtr->dictLimit;
if ((sourceEnd > dictBegin) && ((const BYTE*)source < dictEnd)) {
if (sourceEnd > dictEnd) sourceEnd = dictEnd;
ctxPtr->lowLimit = (U32)(sourceEnd - ctxPtr->dictBase);
if (ctxPtr->dictLimit - ctxPtr->lowLimit < 4) ctxPtr->lowLimit = ctxPtr->dictLimit;
@ -641,9 +635,9 @@ static int LZ4_compressHC_continue_generic (LZ4HC_Data_Structure* ctxPtr,
int LZ4_compress_HC_continue (LZ4_streamHC_t* LZ4_streamHCPtr, const char* source, char* dest, int inputSize, int maxOutputSize)
{
if (maxOutputSize < LZ4_compressBound(inputSize))
return LZ4_compressHC_continue_generic ((LZ4HC_Data_Structure*)LZ4_streamHCPtr, source, dest, inputSize, maxOutputSize, limitedOutput);
return LZ4_compressHC_continue_generic (LZ4_streamHCPtr, source, dest, inputSize, maxOutputSize, limitedOutput);
else
return LZ4_compressHC_continue_generic ((LZ4HC_Data_Structure*)LZ4_streamHCPtr, source, dest, inputSize, maxOutputSize, noLimit);
return LZ4_compressHC_continue_generic (LZ4_streamHCPtr, source, dest, inputSize, maxOutputSize, noLimit);
}
@ -651,14 +645,13 @@ int LZ4_compress_HC_continue (LZ4_streamHC_t* LZ4_streamHCPtr, const char* sourc
int LZ4_saveDictHC (LZ4_streamHC_t* LZ4_streamHCPtr, char* safeBuffer, int dictSize)
{
LZ4HC_Data_Structure* streamPtr = (LZ4HC_Data_Structure*)LZ4_streamHCPtr;
int prefixSize = (int)(streamPtr->end - (streamPtr->base + streamPtr->dictLimit));
LZ4HC_CCtx_internal* const streamPtr = &LZ4_streamHCPtr->internal_donotuse;
int const prefixSize = (int)(streamPtr->end - (streamPtr->base + streamPtr->dictLimit));
if (dictSize > 64 KB) dictSize = 64 KB;
if (dictSize < 4) dictSize = 0;
if (dictSize > prefixSize) dictSize = prefixSize;
memmove(safeBuffer, streamPtr->end - dictSize, dictSize);
{
U32 endIndex = (U32)(streamPtr->end - streamPtr->base);
{ U32 const endIndex = (U32)(streamPtr->end - streamPtr->base);
streamPtr->end = (const BYTE*)safeBuffer + dictSize;
streamPtr->base = streamPtr->end - endIndex;
streamPtr->dictLimit = endIndex - dictSize;
@ -672,8 +665,8 @@ int LZ4_saveDictHC (LZ4_streamHC_t* LZ4_streamHCPtr, char* safeBuffer, int dictS
/***********************************
* Deprecated Functions
***********************************/
/* These functions currently generate deprecation warnings */
/* Deprecated compression functions */
/* These functions are planned to start generate warnings by r131 approximately */
int LZ4_compressHC(const char* src, char* dst, int srcSize) { return LZ4_compress_HC (src, dst, srcSize, LZ4_compressBound(srcSize), 0); }
int LZ4_compressHC_limitedOutput(const char* src, char* dst, int srcSize, int maxDstSize) { return LZ4_compress_HC(src, dst, srcSize, maxDstSize, 0); }
int LZ4_compressHC2(const char* src, char* dst, int srcSize, int cLevel) { return LZ4_compress_HC (src, dst, srcSize, LZ4_compressBound(srcSize), cLevel); }
@ -687,45 +680,41 @@ int LZ4_compressHC_limitedOutput_continue (LZ4_streamHC_t* ctx, const char* src,
/* Deprecated streaming functions */
/* These functions currently generate deprecation warnings */
int LZ4_sizeofStreamStateHC(void) { return LZ4_STREAMHCSIZE; }
int LZ4_resetStreamStateHC(void* state, char* inputBuffer)
{
LZ4HC_CCtx_internal *ctx = &((LZ4_streamHC_t*)state)->internal_donotuse;
if ((((size_t)state) & (sizeof(void*)-1)) != 0) return 1; /* Error : pointer is not aligned for pointer (32 or 64 bits) */
LZ4HC_init((LZ4HC_Data_Structure*)state, (const BYTE*)inputBuffer);
((LZ4HC_Data_Structure*)state)->inputBuffer = (BYTE*)inputBuffer;
LZ4HC_init(ctx, (const BYTE*)inputBuffer);
ctx->inputBuffer = (BYTE*)inputBuffer;
return 0;
}
void* LZ4_createHC (char* inputBuffer)
{
void* hc4 = ALLOCATOR(1, sizeof(LZ4HC_Data_Structure));
LZ4_streamHC_t* hc4 = (LZ4_streamHC_t*)ALLOCATOR(1, sizeof(LZ4_streamHC_t));
if (hc4 == NULL) return NULL; /* not enough memory */
LZ4HC_init ((LZ4HC_Data_Structure*)hc4, (const BYTE*)inputBuffer);
((LZ4HC_Data_Structure*)hc4)->inputBuffer = (BYTE*)inputBuffer;
LZ4HC_init (&hc4->internal_donotuse, (const BYTE*)inputBuffer);
hc4->internal_donotuse.inputBuffer = (BYTE*)inputBuffer;
return hc4;
}
int LZ4_freeHC (void* LZ4HC_Data)
{
FREEMEM(LZ4HC_Data);
return (0);
}
int LZ4_freeHC (void* LZ4HC_Data) { FREEMEM(LZ4HC_Data); return 0; }
int LZ4_compressHC2_continue (void* LZ4HC_Data, const char* source, char* dest, int inputSize, int compressionLevel)
{
return LZ4HC_compress_generic (LZ4HC_Data, source, dest, inputSize, 0, compressionLevel, noLimit);
return LZ4HC_compress_generic (&((LZ4_streamHC_t*)LZ4HC_Data)->internal_donotuse, source, dest, inputSize, 0, compressionLevel, noLimit);
}
int LZ4_compressHC2_limitedOutput_continue (void* LZ4HC_Data, const char* source, char* dest, int inputSize, int maxOutputSize, int compressionLevel)
{
return LZ4HC_compress_generic (LZ4HC_Data, source, dest, inputSize, maxOutputSize, compressionLevel, limitedOutput);
return LZ4HC_compress_generic (&((LZ4_streamHC_t*)LZ4HC_Data)->internal_donotuse, source, dest, inputSize, maxOutputSize, compressionLevel, limitedOutput);
}
char* LZ4_slideInputBufferHC(void* LZ4HC_Data)
{
LZ4HC_Data_Structure* hc4 = (LZ4HC_Data_Structure*)LZ4HC_Data;
int dictSize = LZ4_saveDictHC((LZ4_streamHC_t*)LZ4HC_Data, (char*)(hc4->inputBuffer), 64 KB);
LZ4HC_CCtx_internal* const hc4 = &((LZ4_streamHC_t*)LZ4HC_Data)->internal_donotuse;
int const dictSize = LZ4_saveDictHC((LZ4_streamHC_t*)LZ4HC_Data, (char*)(hc4->inputBuffer), 64 KB);
return (char*)(hc4->inputBuffer + dictSize);
}

View File

@ -96,7 +96,7 @@ public:
ENC_BASE64 = 0x01, /// Base64-encoded output
ENC_BINHEX = 0x02, /// BinHex-encoded output
ENC_BASE64_NO_LF = 0x81, /// Base64-encoded output, no linefeeds
ENC_BINHEX_NO_LF = 0x82, /// BinHex-encoded output, no linefeeds
ENC_BINHEX_NO_LF = 0x82 /// BinHex-encoded output, no linefeeds
};

View File

@ -22,7 +22,6 @@
#define Crypto_Crypto_INCLUDED
#pragma GCC diagnostic push
#if defined(__APPLE__)
// OS X 10.7 deprecates some OpenSSL functions
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
@ -116,6 +115,5 @@ void Crypto_API uninitializeCrypto();
} } // namespace Poco::Crypto
#pragma GCC diagnostic pop
#endif // Crypto_Crypto_INCLUDED

View File

@ -61,7 +61,7 @@ protected:
private:
std::string _name;
EVP_MD_CTX* _ctx;
EVP_MD_CTX* _pContext;
Poco::DigestEngine::Digest _digest;
OpenSSLInitializer _openSSLInitializer;
};

View File

@ -130,6 +130,14 @@ public:
/// Returns true if verification against the issuer certificate
/// was successfull, false otherwise.
bool equals(const X509Certificate& otherCertificate) const;
/// Checks whether the certificate is equal to
/// the other certificate, by comparing the hashes
/// of both certificates.
///
/// Returns true if both certificates are identical,
/// otherwise false.
const X509* certificate() const;
/// Returns the underlying OpenSSL certificate.

View File

@ -30,7 +30,7 @@ namespace
{
unsigned long err;
std::string msg;
while ((err = ERR_get_error()))
{
if (!msg.empty())
@ -60,24 +60,28 @@ namespace
Direction dir);
~CryptoTransformImpl();
std::size_t blockSize() const;
int setPadding(int padding);
int setPadding(int padding);
std::streamsize transform(
const unsigned char* input,
std::streamsize inputLength,
unsigned char* output,
std::streamsize outputLength);
std::streamsize finalize(
unsigned char* output,
std::streamsize length);
private:
const EVP_CIPHER* _pCipher;
EVP_CIPHER_CTX _ctx;
#if OPENSSL_VERSION_NUMBER >= 0x10100000L
EVP_CIPHER_CTX* _pContext;
#else
EVP_CIPHER_CTX _context;
#endif
ByteVec _key;
ByteVec _iv;
};
@ -92,32 +96,54 @@ namespace
_key(key),
_iv(iv)
{
#if OPENSSL_VERSION_NUMBER >= 0x10100000L
_pContext = EVP_CIPHER_CTX_new();
EVP_CipherInit(
&_ctx,
_pContext,
_pCipher,
&_key[0],
_iv.empty() ? 0 : &_iv[0],
(dir == DIR_ENCRYPT) ? 1 : 0);
#else
EVP_CipherInit(
&_context,
_pCipher,
&_key[0],
_iv.empty() ? 0 : &_iv[0],
(dir == DIR_ENCRYPT) ? 1 : 0);
#endif
}
CryptoTransformImpl::~CryptoTransformImpl()
{
EVP_CIPHER_CTX_cleanup(&_ctx);
#if OPENSSL_VERSION_NUMBER >= 0x10100000L
EVP_CIPHER_CTX_cleanup(_pContext);
#else
EVP_CIPHER_CTX_cleanup(&_context);
#endif
}
std::size_t CryptoTransformImpl::blockSize() const
{
return EVP_CIPHER_CTX_block_size(&_ctx);
#if OPENSSL_VERSION_NUMBER >= 0x10100000L
return EVP_CIPHER_CTX_block_size(_pContext);
#else
return EVP_CIPHER_CTX_block_size(&_context);
#endif
}
int CryptoTransformImpl::setPadding(int padding)
{
return EVP_CIPHER_CTX_set_padding(&_ctx, padding);
#if OPENSSL_VERSION_NUMBER >= 0x10100000L
return EVP_CIPHER_CTX_block_size(_pContext);
#else
return EVP_CIPHER_CTX_set_padding(&_context, padding);
#endif
}
std::streamsize CryptoTransformImpl::transform(
const unsigned char* input,
@ -125,16 +151,24 @@ namespace
unsigned char* output,
std::streamsize outputLength)
{
poco_assert (outputLength >= std::streamsize(inputLength + blockSize() - 1));
poco_assert (outputLength >= (inputLength + blockSize() - 1));
int outLen = static_cast<int>(outputLength);
#if OPENSSL_VERSION_NUMBER >= 0x10100000L
int rc = EVP_CipherUpdate(
&_ctx,
_pContext,
output,
&outLen,
input,
static_cast<int>(inputLength));
#else
int rc = EVP_CipherUpdate(
&_context,
output,
&outLen,
input,
static_cast<int>(inputLength));
#endif
if (rc == 0)
throwError();
@ -146,18 +180,22 @@ namespace
unsigned char* output,
std::streamsize length)
{
poco_assert (length >= (std::streamsize)blockSize());
poco_assert (length >= blockSize());
int len = static_cast<int>(length);
// Use the '_ex' version that does not perform implicit cleanup since we
// will call EVP_CIPHER_CTX_cleanup() from the dtor as there is no
// guarantee that finalize() will be called if an error occurred.
int rc = EVP_CipherFinal_ex(&_ctx, output, &len);
#if OPENSSL_VERSION_NUMBER >= 0x10100000L
int rc = EVP_CipherFinal_ex(_pContext, output, &len);
#else
int rc = EVP_CipherFinal_ex(&_context, output, &len);
#endif
if (rc == 0)
throwError();
return static_cast<std::streamsize>(len);
}
}

View File

@ -27,8 +27,8 @@ namespace Poco {
namespace Crypto {
CipherKeyImpl::CipherKeyImpl(const std::string& name,
const std::string& passphrase,
CipherKeyImpl::CipherKeyImpl(const std::string& name,
const std::string& passphrase,
const std::string& salt,
int iterationCount):
_pCipher(0),
@ -48,8 +48,8 @@ CipherKeyImpl::CipherKeyImpl(const std::string& name,
}
CipherKeyImpl::CipherKeyImpl(const std::string& name,
const ByteVec& key,
CipherKeyImpl::CipherKeyImpl(const std::string& name,
const ByteVec& key,
const ByteVec& iv):
_pCipher(0),
_name(name),
@ -64,7 +64,7 @@ CipherKeyImpl::CipherKeyImpl(const std::string& name,
throw Poco::NotFoundException("Cipher " + name + " was not found");
}
CipherKeyImpl::CipherKeyImpl(const std::string& name):
_pCipher(0),
_name(name),
@ -117,7 +117,7 @@ void CipherKeyImpl::generateKey()
getRandomBytes(vec, keySize());
setKey(vec);
getRandomBytes(vec, ivSize());
setIV(vec);
}
@ -126,11 +126,11 @@ void CipherKeyImpl::generateKey()
void CipherKeyImpl::getRandomBytes(ByteVec& vec, std::size_t count)
{
Poco::RandomInputStream random;
vec.clear();
vec.reserve(count);
for (std::size_t i = 0; i < count; ++i)
for (int i = 0; i < count; ++i)
vec.push_back(static_cast<unsigned char>(random.get()));
}

View File

@ -43,7 +43,7 @@ CryptoStreamBuf::CryptoStreamBuf(std::istream& istr, CryptoTransform* pTransform
_buffer(static_cast<std::size_t>(bufferSize))
{
poco_check_ptr (pTransform);
poco_assert ((size_t)bufferSize > 2 * pTransform->blockSize());
poco_assert (bufferSize > 2 * pTransform->blockSize());
}
@ -56,7 +56,7 @@ CryptoStreamBuf::CryptoStreamBuf(std::ostream& ostr, CryptoTransform* pTransform
_buffer(static_cast<std::size_t>(bufferSize))
{
poco_check_ptr (pTransform);
poco_assert ((size_t)bufferSize > 2 * pTransform->blockSize());
poco_assert (bufferSize > 2 * pTransform->blockSize());
}
@ -88,10 +88,10 @@ void CryptoStreamBuf::close()
// thrown.
std::ostream* pOstr = _pOstr;
_pOstr = 0;
// Finalize transformation.
std::streamsize n = _pTransform->finalize(_buffer.begin(), static_cast<std::streamsize>(_buffer.size()));
if (n > 0)
{
pOstr->write(reinterpret_cast<char*>(_buffer.begin()), n);
@ -159,7 +159,7 @@ int CryptoStreamBuf::writeToDevice(const char* buffer, std::streamsize length)
std::size_t maxChunkSize = _buffer.size()/2;
std::size_t count = 0;
while (count < (size_t)length)
while (count < length)
{
// Truncate chunk size so that the maximum output fits into _buffer.
std::size_t n = static_cast<std::size_t>(length) - count;

View File

@ -23,46 +23,51 @@ namespace Crypto {
DigestEngine::DigestEngine(const std::string& name):
_name(name)
_name(name),
_pContext(EVP_MD_CTX_create())
{
const EVP_MD* md = EVP_get_digestbyname(_name.c_str());
if (!md) throw Poco::NotFoundException(_name);
_ctx = EVP_MD_CTX_create();
EVP_DigestInit_ex(_ctx, md, NULL);
EVP_DigestInit_ex(_pContext, md, NULL);
}
DigestEngine::~DigestEngine()
{
EVP_MD_CTX_destroy(_ctx);
EVP_MD_CTX_destroy(_pContext);
}
int DigestEngine::nid() const
{
return EVP_MD_nid(_ctx->digest);
return EVP_MD_nid(EVP_MD_CTX_md(_pContext));
}
std::size_t DigestEngine::digestLength() const
{
return EVP_MD_CTX_size(_ctx);
return EVP_MD_CTX_size(_pContext);
}
void DigestEngine::reset()
{
EVP_MD_CTX_cleanup(_ctx);
#if OPENSSL_VERSION_NUMBER >= 0x10100000L
EVP_MD_CTX_free(_pContext);
_pContext = EVP_MD_CTX_create();
#else
EVP_MD_CTX_cleanup(_pContext);
#endif
const EVP_MD* md = EVP_get_digestbyname(_name.c_str());
if (!md) throw Poco::NotFoundException(_name);
EVP_DigestInit_ex(_ctx, md, NULL);
EVP_DigestInit_ex(_pContext, md, NULL);
}
const Poco::DigestEngine::Digest& DigestEngine::digest()
{
_digest.clear();
unsigned len = EVP_MD_CTX_size(_ctx);
unsigned len = EVP_MD_CTX_size(_pContext);
_digest.resize(len);
EVP_DigestFinal_ex(_ctx, &_digest[0], &len);
EVP_DigestFinal_ex(_pContext, &_digest[0], &len);
reset();
return _digest;
}
@ -70,7 +75,7 @@ const Poco::DigestEngine::Digest& DigestEngine::digest()
void DigestEngine::updateImpl(const void* data, std::size_t length)
{
EVP_DigestUpdate(_ctx, data, length);
EVP_DigestUpdate(_pContext, data, length);
}

View File

@ -32,7 +32,7 @@ namespace
{
unsigned long err;
std::string msg;
while ((err = ERR_get_error()))
{
if (!msg.empty())
@ -68,7 +68,7 @@ namespace
public:
RSAEncryptImpl(const RSA* pRSA, RSAPaddingMode paddingMode);
~RSAEncryptImpl();
std::size_t blockSize() const;
std::size_t maxDataSize() const;
@ -77,7 +77,7 @@ namespace
std::streamsize inputLength,
unsigned char* output,
std::streamsize outputLength);
std::streamsize finalize(unsigned char* output, std::streamsize length);
private:
@ -156,7 +156,7 @@ namespace
output += n;
outputLength -= n;
_pos = 0;
}
else
{
@ -175,8 +175,8 @@ namespace
std::streamsize RSAEncryptImpl::finalize(unsigned char* output, std::streamsize length)
{
poco_assert ((size_t)length >= blockSize());
poco_assert ((size_t)_pos <= maxDataSize());
poco_assert (length >= blockSize());
poco_assert (_pos <= maxDataSize());
int rc = 0;
if (_pos > 0)
{
@ -192,7 +192,7 @@ namespace
public:
RSADecryptImpl(const RSA* pRSA, RSAPaddingMode paddingMode);
~RSADecryptImpl();
std::size_t blockSize() const;
std::streamsize transform(
@ -200,7 +200,7 @@ namespace
std::streamsize inputLength,
unsigned char* output,
std::streamsize outputLength);
std::streamsize finalize(
unsigned char* output,
std::streamsize length);
@ -241,7 +241,7 @@ namespace
unsigned char* output,
std::streamsize outputLength)
{
// always fill up the buffer before decrypting!
std::streamsize rsaSize = static_cast<std::streamsize>(blockSize());
poco_assert_dbg(_pos <= rsaSize);
@ -261,7 +261,7 @@ namespace
output += tmp;
outputLength -= tmp;
_pos = 0;
}
else
{
@ -280,7 +280,7 @@ namespace
std::streamsize RSADecryptImpl::finalize(unsigned char* output, std::streamsize length)
{
poco_assert ((size_t)length >= blockSize());
poco_assert (length >= blockSize());
int rc = 0;
if (_pos > 0)
{

View File

@ -207,19 +207,43 @@ int RSAKeyImpl::size() const
RSAKeyImpl::ByteVec RSAKeyImpl::modulus() const
{
#if OPENSSL_VERSION_NUMBER >= 0x10100000L
const BIGNUM* n = 0;
const BIGNUM* e = 0;
const BIGNUM* d = 0;
RSA_get0_key(_pRSA, &n, &e, &d);
return convertToByteVec(n);
#else
return convertToByteVec(_pRSA->n);
#endif
}
RSAKeyImpl::ByteVec RSAKeyImpl::encryptionExponent() const
{
#if OPENSSL_VERSION_NUMBER >= 0x10100000L
const BIGNUM* n = 0;
const BIGNUM* e = 0;
const BIGNUM* d = 0;
RSA_get0_key(_pRSA, &n, &e, &d);
return convertToByteVec(e);
#else
return convertToByteVec(_pRSA->e);
#endif
}
RSAKeyImpl::ByteVec RSAKeyImpl::decryptionExponent() const
{
#if OPENSSL_VERSION_NUMBER >= 0x10100000L
const BIGNUM* n = 0;
const BIGNUM* e = 0;
const BIGNUM* d = 0;
RSA_get0_key(_pRSA, &n, &e, &d);
return convertToByteVec(d);
#else
return convertToByteVec(_pRSA->d);
#endif
}

View File

@ -59,7 +59,11 @@ X509Certificate::X509Certificate(X509* pCert, bool shared):
if (shared)
{
#if OPENSSL_VERSION_NUMBER >= 0x10100000L
X509_up_ref(_pCert);
#else
_pCert->references++;
#endif
}
init();
@ -205,10 +209,10 @@ std::string X509Certificate::issuerName(NID nid) const
if (X509_NAME* issuer = X509_get_issuer_name(_pCert))
{
char buffer[NAME_BUFFER_SIZE];
X509_NAME_get_text_by_NID(issuer, nid, buffer, sizeof(buffer));
return std::string(buffer);
if (X509_NAME_get_text_by_NID(issuer, nid, buffer, sizeof(buffer)) >= 0)
return std::string(buffer);
}
else return std::string();
return std::string();
}
@ -217,10 +221,10 @@ std::string X509Certificate::subjectName(NID nid) const
if (X509_NAME* subj = X509_get_subject_name(_pCert))
{
char buffer[NAME_BUFFER_SIZE];
X509_NAME_get_text_by_NID(subj, nid, buffer, sizeof(buffer));
return std::string(buffer);
if (X509_NAME_get_text_by_NID(subj, nid, buffer, sizeof(buffer)) >= 0)
return std::string(buffer);
}
else return std::string();
return std::string();
}
@ -280,4 +284,12 @@ bool X509Certificate::issuedBy(const X509Certificate& issuerCertificate) const
}
bool X509Certificate::equals(const X509Certificate& otherCertificate) const
{
X509* pCert = const_cast<X509*>(_pCert);
X509* pOtherCert = const_cast<X509*>(otherCertificate.certificate());
return X509_cmp(pCert, pOtherCert) == 0;
}
} } // namespace Poco::Crypto

View File

@ -246,6 +246,11 @@ void CryptoTest::testCertificate()
// fails with recent OpenSSL versions:
// assert (cert.issuedBy(cert));
std::istringstream otherCertStream(APPINF_PEM);
X509Certificate otherCert(otherCertStream);
assert (cert.equals(otherCert));
}

View File

@ -21,6 +21,7 @@ include_directories (BEFORE ${ClickHouse_SOURCE_DIR}/contrib/libdivide)
include_directories (BEFORE ${ClickHouse_SOURCE_DIR}/contrib/libcpuid/include)
include_directories (BEFORE ${ClickHouse_SOURCE_DIR}/contrib/libfarmhash)
include_directories (BEFORE ${ClickHouse_SOURCE_DIR}/contrib/libmetrohash/src)
include_directories (BEFORE ${ClickHouse_SOURCE_DIR}/contrib/libbtrie/include)
include_directories (${ClickHouse_SOURCE_DIR}/libs/libdaemon/include)
include_directories (${ClickHouse_BINARY_DIR}/dbms/src)
@ -44,7 +45,6 @@ include(${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake)
add_headers_and_sources(dbms src/TableFunctions)
add_headers_and_sources(dbms src/Parsers)
add_headers_and_sources(dbms src/Analyzers)
add_headers_and_sources(dbms src/AggregateFunctions)
add_headers_and_sources(dbms src/Core)
add_headers_and_sources(dbms src/DataStreams)
add_headers_and_sources(dbms src/DataTypes)
@ -70,6 +70,33 @@ list (APPEND dbms_headers ${CONFIG_VERSION} ${CONFIG_COMMON})
list (APPEND dbms_sources src/Functions/IFunction.cpp src/Functions/FunctionFactory.cpp src/Functions/DataTypeTraits.cpp)
list (APPEND dbms_headers src/Functions/IFunction.h src/Functions/FunctionFactory.h src/Functions/DataTypeTraits.h)
list (APPEND dbms_sources
src/AggregateFunctions/AggregateFunctionFactory.cpp
src/AggregateFunctions/AggregateFunctionState.cpp
src/AggregateFunctions/AggregateFunctionFactory.cpp
src/AggregateFunctions/AggregateFunctionState.cpp
src/AggregateFunctions/AggregateFunctionArray.cpp
src/AggregateFunctions/AggregateFunctionNull.cpp
src/AggregateFunctions/AggregateFunctionForEach.cpp
src/AggregateFunctions/AggregateFunctionIf.cpp
src/AggregateFunctions/AggregateFunctionMerge.cpp
src/AggregateFunctions/AggregateFunctionCount.cpp
)
list (APPEND dbms_headers
src/AggregateFunctions/IAggregateFunction.h
src/AggregateFunctions/AggregateFunctionFactory.h
src/AggregateFunctions/AggregateFunctionState.h
src/AggregateFunctions/AggregateFunctionFactory.h
src/AggregateFunctions/AggregateFunctionState.h
src/AggregateFunctions/AggregateFunctionArray.h
src/AggregateFunctions/AggregateFunctionNull.h
src/AggregateFunctions/AggregateFunctionForEach.h
src/AggregateFunctions/AggregateFunctionIf.h
src/AggregateFunctions/AggregateFunctionMerge.h
src/AggregateFunctions/AggregateFunctionCount.h
)
list(REMOVE_ITEM dbms_sources
src/Client/Client.cpp
@ -127,6 +154,7 @@ if (NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
src/Dictionaries/FlatDictionary.cpp
src/Dictionaries/HashedDictionary.cpp
src/Dictionaries/CacheDictionary.cpp
src/Dictionaries/TrieDictionary.cpp
src/Dictionaries/RangeHashedDictionary.cpp
src/Dictionaries/ComplexKeyHashedDictionary.cpp
src/Dictionaries/ComplexKeyCacheDictionary.cpp
@ -159,6 +187,7 @@ target_link_libraries (dbms
${OPENSSL_CRYPTO_LIBRARY}
${Boost_SYSTEM_LIBRARY}
${Poco_Data_LIBRARY}
btrie
)
if (Poco_DataODBC_FOUND)

View File

@ -1,6 +1,6 @@
#This strings autochanged from release_lib.sh :
set(VERSION_DESCRIBE v1.1.54232-testing)
set(VERSION_REVISION 54232)
set(VERSION_DESCRIBE v1.1.54234-testing)
set(VERSION_REVISION 54234)
#===end of autochange
set (VERSION_MAJOR 1)

View File

@ -30,24 +30,6 @@ std::string trimRight(const std::string & in, const char * suffix)
}
void registerAggregateFunctionAvg(AggregateFunctionFactory & factory);
void registerAggregateFunctionCount(AggregateFunctionFactory & factory);
void registerAggregateFunctionGroupArray(AggregateFunctionFactory & factory);
void registerAggregateFunctionGroupUniqArray(AggregateFunctionFactory & factory);
void registerAggregateFunctionsQuantile(AggregateFunctionFactory & factory);
void registerAggregateFunctionsQuantileExact(AggregateFunctionFactory & factory);
void registerAggregateFunctionsQuantileExactWeighted(AggregateFunctionFactory & factory);
void registerAggregateFunctionsQuantileDeterministic(AggregateFunctionFactory & factory);
void registerAggregateFunctionsQuantileTiming(AggregateFunctionFactory & factory);
void registerAggregateFunctionsQuantileTDigest(AggregateFunctionFactory & factory);
void registerAggregateFunctionsSequenceMatch(AggregateFunctionFactory & factory);
void registerAggregateFunctionsMinMaxAny(AggregateFunctionFactory & factory);
void registerAggregateFunctionsStatistics(AggregateFunctionFactory & factory);
void registerAggregateFunctionSum(AggregateFunctionFactory & factory);
void registerAggregateFunctionsUniq(AggregateFunctionFactory & factory);
void registerAggregateFunctionUniqUpTo(AggregateFunctionFactory & factory);
void registerAggregateFunctionDebug(AggregateFunctionFactory & factory);
AggregateFunctionPtr createAggregateFunctionArray(AggregateFunctionPtr & nested);
AggregateFunctionPtr createAggregateFunctionForEach(AggregateFunctionPtr & nested);
AggregateFunctionPtr createAggregateFunctionIf(AggregateFunctionPtr & nested);
@ -60,23 +42,6 @@ AggregateFunctionPtr createAggregateFunctionCountNotNull(const DataTypes & argum
AggregateFunctionFactory::AggregateFunctionFactory()
{
registerAggregateFunctionAvg(*this);
registerAggregateFunctionCount(*this);
registerAggregateFunctionGroupArray(*this);
registerAggregateFunctionGroupUniqArray(*this);
registerAggregateFunctionsQuantile(*this);
registerAggregateFunctionsQuantileExact(*this);
registerAggregateFunctionsQuantileExactWeighted(*this);
registerAggregateFunctionsQuantileDeterministic(*this);
registerAggregateFunctionsQuantileTiming(*this);
registerAggregateFunctionsQuantileTDigest(*this);
registerAggregateFunctionsSequenceMatch(*this);
registerAggregateFunctionsMinMaxAny(*this);
registerAggregateFunctionsStatistics(*this);
registerAggregateFunctionSum(*this);
registerAggregateFunctionsUniq(*this);
registerAggregateFunctionUniqUpTo(*this);
registerAggregateFunctionDebug(*this);
}

View File

@ -2,6 +2,7 @@
#include <unordered_map>
#include <AggregateFunctions/IAggregateFunction.h>
#include <common/singleton.h>
namespace DB
@ -14,7 +15,7 @@ using DataTypes = std::vector<DataTypePtr>;
/** Creates an aggregate function by name.
*/
class AggregateFunctionFactory final
class AggregateFunctionFactory final : public Singleton<AggregateFunctionFactory>
{
friend class StorageSystemFunctions;

View File

@ -11,7 +11,7 @@ namespace
AggregateFunctionPtr createAggregateFunctionGroupArray(const std::string & name, const DataTypes & argument_types)
{
if (argument_types.size() != 1)
throw Exception("Incorrect number of arguments for aggregate function " + name,
throw Exception("Incorrect number of arguments for aggregate function " + name + ", should be 2",
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
AggregateFunctionPtr res(createWithNumericType<AggregateFunctionGroupArrayNumeric>(*argument_types[0]));

View File

@ -100,7 +100,7 @@ public:
/// General case (ineffective). NOTE You can also implement a special case for strings.
/// General case (inefficient). NOTE You can also implement a special case for strings.
struct AggregateFunctionGroupArrayDataGeneric
{
Array value; /// TODO Add MemoryTracker
@ -109,7 +109,7 @@ struct AggregateFunctionGroupArrayDataGeneric
/// Puts all values to an array, general case. Implemented inefficiently.
class AggregateFunctionGroupArrayGeneric final
: public IUnaryAggregateFunction<AggregateFunctionGroupArrayDataGeneric, AggregateFunctionGroupArrayGeneric>
: public IUnaryAggregateFunction<AggregateFunctionGroupArrayDataGeneric, AggregateFunctionGroupArrayGeneric>
{
private:
DataTypePtr type;

View File

@ -0,0 +1,27 @@
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/AggregateFunctionGroupArrayInsertAt.h>
#include <AggregateFunctions/Helpers.h>

namespace DB
{

namespace
{

/// Validates the argument count and creates the generic (field-based) implementation.
/// The function always takes exactly two arguments: the value and the position.
AggregateFunctionPtr createAggregateFunctionGroupArrayInsertAt(const std::string & name, const DataTypes & argument_types)
{
    const size_t required_arguments = 2;

    if (required_arguments != argument_types.size())
        throw Exception("Incorrect number of arguments for aggregate function " + name + ", should be 2",
            ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);

    return std::make_shared<AggregateFunctionGroupArrayInsertAtGeneric>();
}

}

/// Registers "groupArrayInsertAt" in the aggregate function factory.
void registerAggregateFunctionGroupArrayInsertAt(AggregateFunctionFactory & factory)
{
    factory.registerFunction("groupArrayInsertAt", createAggregateFunctionGroupArrayInsertAt);
}

}
View File

@ -0,0 +1,210 @@
#pragma once
#include <IO/WriteHelpers.h>
#include <IO/ReadHelpers.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypesNumber.h>
#include <Columns/ColumnArray.h>
#include <Columns/ColumnVector.h>
#include <Core/FieldVisitors.h>
#include <Interpreters/convertFieldToType.h>
#include <AggregateFunctions/IBinaryAggregateFunction.h>
#define AGGREGATE_FUNCTION_GROUP_ARRAY_INSERT_AT_MAX_SIZE 0xFFFFFF
namespace DB
{
namespace ErrorCodes
{
extern const int TOO_LARGE_ARRAY_SIZE;
extern const int CANNOT_CONVERT_TYPE;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
}
/** Aggregate function, that takes two arguments: value and position,
* and as a result, builds an array with values are located at corresponding positions.
*
* If more than one value was inserted to single position, the any value (first in case of single thread) is stored.
* If no values was inserted to some position, then default value will be substituted.
*
* Aggregate function also accept optional parameters:
* - default value to substitute;
* - length to resize result arrays (if you want to have results of same length for all aggregation keys);
*
* If you want to pass length, default value should be also given.
*/
/// Generic case (inefficient).
/// Aggregation state: the array built so far. Positions that have not yet
/// received a value hold Null (see addImpl/merge, which test isNull()).
struct AggregateFunctionGroupArrayInsertAtDataGeneric
{
    Array value;    /// TODO Add MemoryTracker
};
/// Builds an array by writing each aggregated value at the position given by the
/// second argument. First value written to a position wins; unfilled positions are
/// padded with `default_value` when the result is produced.
class AggregateFunctionGroupArrayInsertAtGeneric final
    : public IBinaryAggregateFunction<AggregateFunctionGroupArrayInsertAtDataGeneric, AggregateFunctionGroupArrayInsertAtGeneric>
{
private:
    DataTypePtr type;               /// Type of the value argument; also the result array's element type.
    Field default_value;            /// Substituted for positions that received no value (parameter 1, or type default).
    size_t length_to_resize = 0;    /// zero means - do not do resizing.

public:
    String getName() const override { return "groupArrayInsertAt"; }

    /// Result is Array of the value argument's type.
    DataTypePtr getReturnType() const override
    {
        return std::make_shared<DataTypeArray>(type);
    }

    /// Checks argument types and fixes up `default_value`:
    /// if no default was given as a parameter, the type's own default is used;
    /// otherwise the given default is converted to the value type (error if impossible).
    void setArgumentsImpl(const DataTypes & arguments)
    {
        if (!arguments.at(1)->behavesAsNumber())    /// TODO filter out floating point types.
            throw Exception("Second argument of aggregate function " + getName() + " must be integer.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);

        type = arguments.front();

        if (default_value.isNull())
            default_value = type->getDefault();
        else
        {
            Field converted = convertFieldToType(default_value, *type);
            if (converted.isNull())
                throw Exception("Cannot convert parameter of aggregate function " + getName() + " (" + applyVisitor(FieldVisitorToString(), default_value) + ")"
                    " to type " + type->getName() + " to be used as default value in array", ErrorCodes::CANNOT_CONVERT_TYPE);
            default_value = converted;
        }
    }

    /// Optional parameters: [default_value[, result_length]].
    /// Note: conversion of the default to the value type happens in setArgumentsImpl.
    void setParameters(const Array & params) override
    {
        if (params.empty())
            return;

        if (params.size() > 2)
            throw Exception("Aggregate function " + getName() + " requires at most two parameters.", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);

        default_value = params[0];

        if (params.size() == 2)
        {
            length_to_resize = applyVisitor(FieldVisitorConvertToNumber<size_t>(), params[1]);
        }
    }

    /// Inserts one (value, position) pair into the state array.
    /// Values at positions beyond `length_to_resize` (when set) are silently dropped.
    void addImpl(AggregateDataPtr place, const IColumn & column_value, const IColumn & column_position, size_t row_num, Arena *) const
    {
        /// TODO Do positions need to be 1-based for this function?
        size_t position = column_position.get64(row_num);

        /// If position is larger than size to which array will be cutted - simply ignore value.
        if (length_to_resize && position >= length_to_resize)
            return;

        if (position >= AGGREGATE_FUNCTION_GROUP_ARRAY_INSERT_AT_MAX_SIZE)
            throw Exception("Too large array size: position argument (" + toString(position) + ")"
                " is greater or equals to limit (" + toString(AGGREGATE_FUNCTION_GROUP_ARRAY_INSERT_AT_MAX_SIZE) + ")",
                ErrorCodes::TOO_LARGE_ARRAY_SIZE);

        Array & arr = data(place).value;

        if (arr.size() <= position)
            arr.resize(position + 1);
        else if (!arr[position].isNull())
            return; /// Element was already inserted to the specified position.

        column_value.get(row_num, arr[position]);
    }

    /// Merge keeps the left-hand value when a position is filled in both states;
    /// Null (unfilled) slots on the left take the right-hand value.
    void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena * arena) const override
    {
        Array & arr_lhs = data(place).value;
        const Array & arr_rhs = data(rhs).value;

        if (arr_lhs.size() < arr_rhs.size())
            arr_lhs.resize(arr_rhs.size());

        for (size_t i = 0, size = arr_rhs.size(); i < size; ++i)
            if (arr_lhs[i].isNull() && !arr_rhs[i].isNull())
                arr_lhs[i] = arr_rhs[i];
    }

    /// Wire format: varuint length, then for each slot a 1-byte "is null" flag
    /// followed by the binary value when the flag is 0.
    void serialize(ConstAggregateDataPtr place, WriteBuffer & buf) const override
    {
        const Array & arr = data(place).value;
        size_t size = arr.size();
        writeVarUInt(size, buf);

        for (const Field & elem : arr)
        {
            if (elem.isNull())
            {
                writeBinary(UInt8(1), buf);
            }
            else
            {
                writeBinary(UInt8(0), buf);
                type->serializeBinary(elem, buf);
            }
        }
    }

    /// Inverse of serialize. Size is validated against the global limit so a
    /// corrupt stream cannot force an enormous allocation.
    void deserialize(AggregateDataPtr place, ReadBuffer & buf, Arena *) const override
    {
        size_t size = 0;
        readVarUInt(size, buf);

        if (size > AGGREGATE_FUNCTION_GROUP_ARRAY_INSERT_AT_MAX_SIZE)
            throw Exception("Too large array size", ErrorCodes::TOO_LARGE_ARRAY_SIZE);

        Array & arr = data(place).value;
        arr.resize(size);

        for (size_t i = 0; i < size; ++i)
        {
            UInt8 is_null = 0;
            readBinary(is_null, buf);
            if (!is_null)
                type->deserializeBinary(arr[i], buf);
        }
    }

    /// Emits the final array: Null slots become `default_value`, and the array is
    /// padded up to `length_to_resize` when that parameter was given.
    void insertResultInto(ConstAggregateDataPtr place, IColumn & to) const override
    {
        ColumnArray & to_array = static_cast<ColumnArray &>(to);
        IColumn & to_data = to_array.getData();
        ColumnArray::Offsets_t & to_offsets = to_array.getOffsets();

        const Array & arr = data(place).value;

        for (const Field & elem : arr)
        {
            if (!elem.isNull())
                to_data.insert(elem);
            else
                to_data.insert(default_value);
        }

        size_t result_array_size = length_to_resize ? length_to_resize : arr.size();

        /// Pad array if need.
        for (size_t i = arr.size(); i < result_array_size; ++i)
            to_data.insert(default_value);

        to_offsets.push_back((to_offsets.empty() ? 0 : to_offsets.back()) + result_array_size);
    }
};
#undef AGGREGATE_FUNCTION_GROUP_ARRAY_INSERT_AT_MAX_SIZE
}

View File

@ -34,6 +34,11 @@ public:
return nested_func->getReturnType();
}
AggregateFunctionPtr getNestedFunction() const
{
return nested_func_owner;
}
void setArguments(const DataTypes & arguments) override
{
if (arguments.size() != 1)

View File

@ -286,7 +286,7 @@ private:
ParserString dot_p(".");
ParserNumber number_p;
auto pos = pattern.data();
const char * pos = pattern.data();
const auto begin = pos;
const auto end = pos + pattern.size();

View File

@ -1,8 +1,34 @@
#include <AggregateFunctions/AggregateFunctionState.h>
#include <AggregateFunctions/AggregateFunctionMerge.h>
namespace DB
{
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
}
DataTypePtr AggregateFunctionState::getReturnType() const
{
auto ptr = std::make_shared<DataTypeAggregateFunction>(nested_func_owner, arguments, params);
/// Special case: it is -MergeState combinator
if (typeid_cast<const AggregateFunctionMerge *>(ptr->getFunction().get()))
{
if (arguments.size() != 1)
throw Exception("Combinator -MergeState expects only one argument", ErrorCodes::BAD_ARGUMENTS);
if (!typeid_cast<const DataTypeAggregateFunction *>(arguments[0].get()))
throw Exception("Combinator -MergeState expects argument with AggregateFunction type", ErrorCodes::BAD_ARGUMENTS);
return arguments[0];
}
return ptr;
}
AggregateFunctionPtr createAggregateFunctionState(AggregateFunctionPtr & nested)
{
return std::make_shared<AggregateFunctionState>(nested);

View File

@ -1,3 +1,4 @@
#pragma once
#include <DataTypes/DataTypeAggregateFunction.h>
@ -30,10 +31,7 @@ public:
return nested_func->getName() + "State";
}
DataTypePtr getReturnType() const override
{
return std::make_shared<DataTypeAggregateFunction>(nested_func_owner, arguments, params);
}
DataTypePtr getReturnType() const override;
void setArguments(const DataTypes & arguments_) override
{

View File

@ -0,0 +1,70 @@
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/AggregateFunctionTopK.h>
#include <AggregateFunctions/Helpers.h>

namespace DB
{

namespace
{

/// Substitute return type for Date and DateTime
class AggregateFunctionTopKDate : public AggregateFunctionTopK<DataTypeDate::FieldType>
{
    DataTypePtr getReturnType() const override { return std::make_shared<DataTypeArray>(std::make_shared<DataTypeDate>()); }
};

class AggregateFunctionTopKDateTime : public AggregateFunctionTopK<DataTypeDateTime::FieldType>
{
    DataTypePtr getReturnType() const override { return std::make_shared<DataTypeArray>(std::make_shared<DataTypeDateTime>()); }
};

/// Fallback for argument types that are not plain numbers: Date/DateTime get
/// specialized result types; everything else goes through the generic
/// (serialized-representation) implementation, choosing the "plain" variant
/// when the column stores elements contiguously.
static IAggregateFunction * createWithExtraTypes(const IDataType & argument_type)
{
    if (typeid_cast<const DataTypeDate *>(&argument_type)) return new AggregateFunctionTopKDate;
    else if (typeid_cast<const DataTypeDateTime *>(&argument_type)) return new AggregateFunctionTopKDateTime;
    else
    {
        /// Check that we can use plain version of AggregateFunctionTopKGeneric
        if (typeid_cast<const DataTypeString*>(&argument_type) || typeid_cast<const DataTypeFixedString*>(&argument_type))
            return new AggregateFunctionTopKGeneric<true>;

        auto * array_type = typeid_cast<const DataTypeArray *>(&argument_type);
        if (array_type)
        {
            auto nested_type = array_type->getNestedType();
            if (nested_type->isNumeric() || typeid_cast<DataTypeFixedString *>(nested_type.get()))
                return new AggregateFunctionTopKGeneric<true>;
        }

        return new AggregateFunctionTopKGeneric<false>;
    }
}

/// Creates topK for the single argument type: numeric types first, then the
/// extra/generic dispatch above.
AggregateFunctionPtr createAggregateFunctionTopK(const std::string & name, const DataTypes & argument_types)
{
    if (argument_types.size() != 1)
        throw Exception("Incorrect number of arguments for aggregate function " + name,
            ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);

    AggregateFunctionPtr res(createWithNumericType<AggregateFunctionTopK>(*argument_types[0]));

    if (!res)
        res = AggregateFunctionPtr(createWithExtraTypes(*argument_types[0]));

    if (!res)
        throw Exception("Illegal type " + argument_types[0]->getName() +
            " of argument for aggregate function " + name, ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);

    return res;
}

}

/// Registers "topK" in the aggregate function factory.
void registerAggregateFunctionTopK(AggregateFunctionFactory & factory)
{
    factory.registerFunction("topK", createAggregateFunctionTopK);
}

}

View File

@ -0,0 +1,261 @@
#pragma once
#include <IO/WriteHelpers.h>
#include <IO/ReadHelpers.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeString.h>
#include <Columns/ColumnArray.h>
#include <Common/SpaceSaving.h>
#include <Core/FieldVisitors.h>
#include <AggregateFunctions/AggregateFunctionGroupArray.h>
namespace DB
{
// Allow NxK more space before calculating top K to increase accuracy
#define TOP_K_DEFAULT 10
#define TOP_K_LOAD_FACTOR 3
#define TOP_K_MAX_SIZE 0xFFFFFF
/// Aggregation state for numeric topK: a SpaceSaving sketch keyed by the value
/// itself, with a small stack-allocated hash table to avoid heap allocation for
/// small states.
template <typename T>
struct AggregateFunctionTopKData
{
    using Set = SpaceSaving
    <
        T,
        T,
        HashCRC32<T>,
        HashTableGrower<4>,
        HashTableAllocatorWithStackMemory<sizeof(T) * (1 << 4)>
    >;
    Set value;
};
/// topK for numeric argument types. Keeps an approximate set of the most
/// frequent values using the SpaceSaving sketch; the sketch capacity is
/// TOP_K_LOAD_FACTOR times the requested K to improve accuracy.
template <typename T>
class AggregateFunctionTopK
    : public IUnaryAggregateFunction<AggregateFunctionTopKData<T>, AggregateFunctionTopK<T>>
{
private:
    using State = AggregateFunctionTopKData<T>;

    size_t threshold = TOP_K_DEFAULT;                   /// Requested K (number of results).
    size_t reserved = TOP_K_LOAD_FACTOR * threshold;    /// Sketch capacity actually maintained.

public:
    String getName() const override { return "topK"; }

    DataTypePtr getReturnType() const override
    {
        return std::make_shared<DataTypeArray>(std::make_shared<DataTypeNumber<T>>());
    }

    /// Nothing to do: the element type is fixed by the template parameter.
    void setArgument(const DataTypePtr & argument)
    {
    }

    /// Single parameter: K, limited by TOP_K_MAX_SIZE.
    void setParameters(const Array & params) override
    {
        if (params.size() != 1)
            throw Exception("Aggregate function " + getName() + " requires exactly one parameter.", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);

        std::size_t k = applyVisitor(FieldVisitorConvertToNumber<size_t>(), params[0]);

        if (k > TOP_K_MAX_SIZE)
            throw Exception("Too large parameter for aggregate function " + getName() + ". Maximum: " + toString(TOP_K_MAX_SIZE),
                ErrorCodes::ARGUMENT_OUT_OF_BOUND);

        threshold = k;
        reserved = TOP_K_LOAD_FACTOR * k;
    }

    void addImpl(AggregateDataPtr place, const IColumn & column, size_t row_num, Arena *) const
    {
        auto & set = this->data(place).value;
        /// Lazily size the sketch on first use (or after parameter change).
        if (set.capacity() != reserved)
            set.resize(reserved);
        set.insert(static_cast<const ColumnVector<T> &>(column).getData()[row_num]);
    }

    void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena * arena) const override
    {
        this->data(place).value.merge(this->data(rhs).value);
    }

    void serialize(ConstAggregateDataPtr place, WriteBuffer & buf) const override
    {
        this->data(place).value.write(buf);
    }

    void deserialize(AggregateDataPtr place, ReadBuffer & buf, Arena *) const override
    {
        auto & set = this->data(place).value;
        set.resize(reserved);
        set.read(buf);
    }

    /// Appends the top `threshold` keys (at most) as one array row of the result column.
    void insertResultInto(ConstAggregateDataPtr place, IColumn & to) const override
    {
        ColumnArray & arr_to = static_cast<ColumnArray &>(to);
        ColumnArray::Offsets_t & offsets_to = arr_to.getOffsets();

        const typename State::Set & set = this->data(place).value;
        auto resultVec = set.topK(threshold);
        size_t size = resultVec.size();

        offsets_to.push_back((offsets_to.size() == 0 ? 0 : offsets_to.back()) + size);

        typename ColumnVector<T>::Container_t & data_to = static_cast<ColumnVector<T> &>(arr_to.getData()).getData();
        size_t old_size = data_to.size();
        data_to.resize(old_size + size);

        size_t i = 0;
        for (auto it = resultVec.begin(); it != resultVec.end(); ++it, ++i)
            data_to[old_size + i] = it->key;
    }
};
/// Generic implementation, it uses serialized representation as object descriptor.
/// The SpaceSaving sketch is keyed by the serialized bytes (owned std::string,
/// looked up via StringRef), so any column type can be aggregated.
struct AggregateFunctionTopKGenericData
{
    using Set = SpaceSaving
    <
        std::string,
        StringRef,
        StringRefHash,
        HashTableGrower<4>,
        HashTableAllocatorWithStackMemory<sizeof(StringRef) * (1 << 4)>
    >;

    Set value;
};
/** Template parameter with true value should be used for columns that store their elements in memory continuously.
 *  For such columns topK() can be implemented more efficiently (especially for small numeric arrays).
 */
template <bool is_plain_column = false>
class AggregateFunctionTopKGeneric : public IUnaryAggregateFunction<AggregateFunctionTopKGenericData, AggregateFunctionTopKGeneric<is_plain_column>>
{
private:
    using State = AggregateFunctionTopKGenericData;

    DataTypePtr input_data_type;                        /// Element type of the result array (the argument's type).
    size_t threshold = TOP_K_DEFAULT;                   /// Requested K (number of results).
    size_t reserved = TOP_K_LOAD_FACTOR * threshold;    /// Sketch capacity actually maintained.

    /// Converts a stored key (serialized bytes, or plain data for plain columns) back into the result column.
    static void deserializeAndInsert(StringRef str, IColumn & data_to);

public:
    String getName() const override { return "topK"; }

    void setArgument(const DataTypePtr & argument)
    {
        input_data_type = argument;
    }

    /// Single parameter: K, limited by TOP_K_MAX_SIZE.
    void setParameters(const Array & params) override
    {
        if (params.size() != 1)
            throw Exception("Aggregate function " + getName() + " requires exactly one parameter.", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);

        size_t k = applyVisitor(FieldVisitorConvertToNumber<size_t>(), params[0]);

        if (k > TOP_K_MAX_SIZE)
            throw Exception("Too large parameter for aggregate function " + getName() + ". Maximum: " + toString(TOP_K_MAX_SIZE),
                ErrorCodes::ARGUMENT_OUT_OF_BOUND);

        threshold = k;
        reserved = TOP_K_LOAD_FACTOR * k;
    }

    DataTypePtr getReturnType() const override
    {
        return std::make_shared<DataTypeArray>(input_data_type->clone());
    }

    bool allocatesMemoryInArena() const override
    {
        return true;
    }

    void serialize(ConstAggregateDataPtr place, WriteBuffer & buf) const override
    {
        this->data(place).value.write(buf);
    }

    /// Wire format (written by SpaceSaving::write): varuint element count, then for
    /// each element its key string followed by varuint count and varuint error.
    void deserialize(AggregateDataPtr place, ReadBuffer & buf, Arena * arena) const override
    {
        auto & set = this->data(place).value;
        set.resize(reserved);

        size_t size = 0;
        readVarUInt(size, buf);
        for (size_t i = 0; i < size; ++i)
        {
            std::string key_string;
            readStringBinary(key_string, buf);

            /// Per-element counter and overestimation error of the sketch entry.
            /// (Named distinctly: the original code shadowed the loop bound here.)
            UInt64 count;
            UInt64 error;
            readVarUInt(count, buf);
            readVarUInt(error, buf);
            set.insert(key_string, count, error);
        }
    }

    void addImpl(AggregateDataPtr place, const IColumn & column, size_t row_num, Arena * arena) const
    {
        auto & set = this->data(place).value;
        /// Lazily size the sketch on first use (or after parameter change).
        if (set.capacity() != reserved)
            set.resize(reserved);

        StringRef str_serialized = column.getDataAt(row_num);
        set.insert(str_serialized.toString());
    }

    void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena * arena) const override
    {
        this->data(place).value.merge(this->data(rhs).value);
    }

    /// Appends the top `threshold` keys (at most) as one array row of the result column.
    void insertResultInto(ConstAggregateDataPtr place, IColumn & to) const override
    {
        ColumnArray & arr_to = static_cast<ColumnArray &>(to);
        ColumnArray::Offsets_t & offsets_to = arr_to.getOffsets();
        IColumn & data_to = arr_to.getData();

        auto resultVec = this->data(place).value.topK(threshold);
        offsets_to.push_back((offsets_to.size() == 0 ? 0 : offsets_to.back()) + resultVec.size());

        for (auto & elem : resultVec)
        {
            deserializeAndInsert(elem.key, data_to);
        }
    }
};
/// Template parameter == false: the stored key is a serialized representation
/// of the value, so restore it through the column's own deserialization.
/// NOTE(review): assumes the key was produced by the column's matching
/// serialization path — confirm against the insertion side.
template <>
inline void AggregateFunctionTopKGeneric<false>::deserializeAndInsert(StringRef str, IColumn & data_to)
{
data_to.deserializeAndInsertFromArena(str.data);
}
/// Template parameter == true (plain column): the key bytes are the raw value
/// itself, so they can be inserted into the column directly.
template <>
inline void AggregateFunctionTopKGeneric<true>::deserializeAndInsert(StringRef str, IColumn & data_to)
{
data_to.insertData(str.data, str.size);
}
#undef TOP_K_DEFAULT
#undef TOP_K_MAX_SIZE
#undef TOP_K_LOAD_FACTOR
}

View File

@ -86,12 +86,11 @@ struct AggregateFunctionUniqExactData
using Key = T;
/// When creating, the hash table must be small.
typedef HashSet<
using Set = HashSet<
Key,
HashCRC32<Key>,
HashTableGrower<4>,
HashTableAllocatorWithStackMemory<sizeof(Key) * (1 << 4)>
> Set;
HashTableAllocatorWithStackMemory<sizeof(Key) * (1 << 4)>>;
Set set;
@ -105,12 +104,11 @@ struct AggregateFunctionUniqExactData<String>
using Key = UInt128;
/// When creating, the hash table must be small.
typedef HashSet<
using Set = HashSet<
Key,
UInt128TrivialHash,
HashTableGrower<3>,
HashTableAllocatorWithStackMemory<sizeof(Key) * (1 << 3)>
> Set;
HashTableAllocatorWithStackMemory<sizeof(Key) * (1 << 3)>>;
Set set;

View File

@ -0,0 +1,27 @@
# Glob all .h/.cpp files of this directory into
# clickhouse_aggregate_functions_{headers,sources}.
include(${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake)
add_headers_and_sources(clickhouse_aggregate_functions .)
# Exclude these translation units from the library.
# NOTE(review): presumably they are compiled into another target — confirm.
list(REMOVE_ITEM clickhouse_aggregate_functions_sources
AggregateFunctionFactory.cpp
AggregateFunctionState.cpp
AggregateFunctionArray.cpp
AggregateFunctionNull.cpp
AggregateFunctionForEach.cpp
AggregateFunctionIf.cpp
AggregateFunctionMerge.cpp
AggregateFunctionCount.cpp
)
# Keep the excluded headers out of the target's file list as well.
list(REMOVE_ITEM clickhouse_aggregate_functions_headers
AggregateFunction.h
AggregateFunctionFactory.h
AggregateFunctionState.h
AggregateFunctionArray.h
AggregateFunctionNull.h
AggregateFunctionForEach.h
AggregateFunctionIf.h
AggregateFunctionMerge.h
AggregateFunctionCount.h
)
add_library(clickhouse_aggregate_functions ${clickhouse_aggregate_functions_sources})

View File

@ -19,8 +19,7 @@ public:
if (arguments.size() != 2)
throw Exception{
"Passed " + toString(arguments.size()) + " arguments to binary aggregate function " + this->getName(),
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH
};
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH};
getDerived().setArgumentsImpl(arguments);
}

View File

@ -0,0 +1,54 @@
#include <AggregateFunctions/registerAggregateFunctions.h>
#include <AggregateFunctions/AggregateFunctionFactory.h>
namespace DB
{
/// Forward declarations: each family of aggregate functions registers itself
/// in the factory from its own translation unit.
void registerAggregateFunctionAvg(AggregateFunctionFactory & factory);
void registerAggregateFunctionCount(AggregateFunctionFactory & factory);
void registerAggregateFunctionGroupArray(AggregateFunctionFactory & factory);
void registerAggregateFunctionGroupUniqArray(AggregateFunctionFactory & factory);
void registerAggregateFunctionGroupArrayInsertAt(AggregateFunctionFactory & factory);
void registerAggregateFunctionsQuantile(AggregateFunctionFactory & factory);
void registerAggregateFunctionsQuantileExact(AggregateFunctionFactory & factory);
void registerAggregateFunctionsQuantileExactWeighted(AggregateFunctionFactory & factory);
void registerAggregateFunctionsQuantileDeterministic(AggregateFunctionFactory & factory);
void registerAggregateFunctionsQuantileTiming(AggregateFunctionFactory & factory);
void registerAggregateFunctionsQuantileTDigest(AggregateFunctionFactory & factory);
void registerAggregateFunctionsSequenceMatch(AggregateFunctionFactory & factory);
void registerAggregateFunctionsMinMaxAny(AggregateFunctionFactory & factory);
void registerAggregateFunctionsStatistics(AggregateFunctionFactory & factory);
void registerAggregateFunctionSum(AggregateFunctionFactory & factory);
void registerAggregateFunctionsUniq(AggregateFunctionFactory & factory);
void registerAggregateFunctionUniqUpTo(AggregateFunctionFactory & factory);
void registerAggregateFunctionTopK(AggregateFunctionFactory & factory);
void registerAggregateFunctionDebug(AggregateFunctionFactory & factory);
/// Registers all built-in aggregate functions in the singleton factory.
/// Call during program initialization, before the factory is used to
/// look up or create aggregate functions.
void registerAggregateFunctions()
{
auto & factory = AggregateFunctionFactory::instance();
registerAggregateFunctionAvg(factory);
registerAggregateFunctionCount(factory);
registerAggregateFunctionGroupArray(factory);
registerAggregateFunctionGroupUniqArray(factory);
registerAggregateFunctionGroupArrayInsertAt(factory);
registerAggregateFunctionsQuantile(factory);
registerAggregateFunctionsQuantileExact(factory);
registerAggregateFunctionsQuantileExactWeighted(factory);
registerAggregateFunctionsQuantileDeterministic(factory);
registerAggregateFunctionsQuantileTiming(factory);
registerAggregateFunctionsQuantileTDigest(factory);
registerAggregateFunctionsSequenceMatch(factory);
registerAggregateFunctionsMinMaxAny(factory);
registerAggregateFunctionsStatistics(factory);
registerAggregateFunctionSum(factory);
registerAggregateFunctionsUniq(factory);
registerAggregateFunctionUniqUpTo(factory);
registerAggregateFunctionTopK(factory);
registerAggregateFunctionDebug(factory);
}
}

View File

@ -0,0 +1,8 @@
#pragma once
namespace DB
{
/// Registers all built-in aggregate functions in AggregateFunctionFactory::instance().
void registerAggregateFunctions();
}

View File

@ -165,7 +165,7 @@ void processFunction(const String & column_name, ASTPtr & ast, TypeAndConstantIn
}
/// Aggregate function.
if (AggregateFunctionPtr aggregate_function_ptr = context.getAggregateFunctionFactory().tryGet(function->name, argument_types))
if (AggregateFunctionPtr aggregate_function_ptr = AggregateFunctionFactory::instance().tryGet(function->name, argument_types))
{
/// NOTE Not considering aggregate function parameters in type inference. It could become needed in future.
/// Note that aggregate function could never be constant expression.

View File

@ -2,22 +2,22 @@ add_executable(collect_aliases collect_aliases.cpp)
target_link_libraries(collect_aliases dbms)
add_executable(collect_tables collect_tables.cpp)
target_link_libraries(collect_tables dbms storages_system)
target_link_libraries(collect_tables dbms clickhouse_storages_system)
add_executable(analyze_columns analyze_columns.cpp)
target_link_libraries(analyze_columns dbms storages_system)
target_link_libraries(analyze_columns dbms clickhouse_storages_system)
add_executable(type_and_constant_inference type_and_constant_inference.cpp)
target_link_libraries(type_and_constant_inference storages_system clickhouse_functions dbms)
target_link_libraries(type_and_constant_inference clickhouse_storages_system clickhouse_functions dbms)
add_executable(analyze_result_of_query analyze_result_of_query.cpp)
target_link_libraries(analyze_result_of_query dbms storages_system)
target_link_libraries(analyze_result_of_query dbms clickhouse_storages_system)
add_executable(translate_positional_arguments translate_positional_arguments.cpp)
target_link_libraries(translate_positional_arguments dbms)
add_executable(optimize_group_order_limit_by optimize_group_order_limit_by.cpp)
target_link_libraries(optimize_group_order_limit_by dbms storages_system)
target_link_libraries(optimize_group_order_limit_by dbms clickhouse_storages_system)
add_executable(analyze_lambdas analyze_lambdas.cpp)
target_link_libraries(analyze_lambdas dbms)

View File

@ -1,5 +1,5 @@
add_library (clickhouse-client Client.cpp)
target_link_libraries (clickhouse-client dbms ${LINE_EDITING_LIBS} ${Boost_PROGRAM_OPTIONS_LIBRARY})
target_link_libraries (clickhouse-client dbms clickhouse_aggregate_functions ${LINE_EDITING_LIBS} ${Boost_PROGRAM_OPTIONS_LIBRARY})
install (FILES config.xml DESTINATION ${CLICKHOUSE_ETC_DIR}/clickhouse-client COMPONENT clickhouse-client)
add_library (clickhouse-benchmark Benchmark.cpp)

View File

@ -46,6 +46,7 @@
#include <Common/NetException.h>
#include <common/readline_use.h>
#include <Functions/registerFunctions.h>
#include <AggregateFunctions/registerAggregateFunctions.h>
/// http://en.wikipedia.org/wiki/ANSI_escape_code
@ -191,6 +192,7 @@ private:
#undef EXTRACT_LIMIT
registerFunctions();
registerAggregateFunctions();
}

View File

@ -15,7 +15,7 @@
#include <unistd.h>
/** Небольшие обёртки для асинхронного ввода-вывода.
/** Small wrappers for asynchronous I/O.
*/

View File

@ -22,15 +22,15 @@ namespace ErrorCodes
}
/** Многие современные аллокаторы (например, tcmalloc) не умеют делать mremap для realloc,
* даже в случае достаточно больших кусков памяти.
* Хотя это позволяет увеличить производительность и уменьшить потребление памяти во время realloc-а.
* Чтобы это исправить, делаем mremap самостоятельно, если кусок памяти достаточно большой.
* Порог (64 МБ) выбран достаточно большим, так как изменение адресного пространства
* довольно сильно тормозит, особенно в случае наличия большого количества потоков.
* Рассчитываем, что набор операций mmap/что-то сделать/mremap может выполняться всего лишь около 1000 раз в секунду.
/** Many modern allocators (for example, tcmalloc) do not do a mremap for realloc,
* even in case of large enough chunks of memory.
* Although this allows you to increase performance and reduce memory consumption during realloc.
* To fix this, we do mremap manually if the chunk of memory is large enough.
* The threshold (64 MB) is chosen quite large, since changing the address space is
* very slow, especially in the case of a large number of threads.
* We expect that the set of operations mmap/something to do/mremap can only be performed about 1000 times per second.
*
* PS. Также это требуется, потому что tcmalloc не может выделить кусок памяти больше 16 GB.
* PS. This is also required, because tcmalloc can not allocate a chunk of memory greater than 16 GB.
*/
static constexpr size_t MMAP_THRESHOLD = 64 * (1 << 20);
static constexpr size_t MMAP_MIN_ALIGNMENT = 4096;

View File

@ -3,13 +3,13 @@
#include <string.h>
/** Отвечает за выделение/освобождение памяти. Используется, например, в PODArray, Arena.
* Также используется в хэш-таблицах.
* Интерфейс отличается от std::allocator
* - наличием метода realloc, который для больших кусков памяти использует mremap;
* - передачей размера в метод free;
* - наличием аргумента alignment;
* - возможностью зануления памяти (используется в хэш-таблицах);
/** Responsible for allocating / freeing memory. Used, for example, in PODArray, Arena.
* Also used in hash tables.
* The interface is different from std::allocator
* - the presence of the method realloc, which for large chunks of memory uses mremap;
* - passing the size into the `free` method;
* - by the presence of the `alignment` argument;
* - the possibility of zeroing memory (used in hash tables);
*/
template <bool clear_memory_>
class Allocator
@ -38,9 +38,9 @@ protected:
};
/** При использовании AllocatorWithStackMemory, размещённом на стеке,
* GCC 4.9 ошибочно делает предположение, что мы можем вызывать free от указателя на стек.
* На самом деле, комбинация условий внутри AllocatorWithStackMemory этого не допускает.
/** When using AllocatorWithStackMemory, located on the stack,
* GCC 4.9 mistakenly assumes that we can call `free` from a pointer to the stack.
* In fact, the combination of conditions inside AllocatorWithStackMemory does not allow this.
*/
#if !__clang__
#pragma GCC diagnostic push

View File

@ -8,40 +8,40 @@ namespace DB
{
/** В отличие от Arena, позволяет освобождать (для последующего повторного использования)
* выделенные ранее (не обязательно только что) куски памяти.
* Для этого, запрашиваемый размер округляется вверх до степени двух
* (или до 8, если меньше; или используется выделение памяти вне Arena, если размер больше 65536).
* При освобождении памяти, для каждого размера (всего 14 вариантов: 8, 16... 65536),
* поддерживается односвязный список свободных блоков.
* При аллокации, мы берём голову списка свободных блоков,
* либо, если список пуст - выделяем новый блок, используя Arena.
/** Unlike Arena, allows you to release (for later re-use)
* previously allocated (not necessarily just recently) chunks of memory.
* For this, the requested size is rounded up to the power of two
* (or up to 8, if less, or using memory allocation outside Arena if the size is greater than 65536).
* When freeing memory, for each size (14 options in all: 8, 16 ... 65536),
* a singly-linked list of free blocks is maintained.
* When allocating, we take the head of the list of free blocks,
* or, if the list is empty - allocate a new block using Arena.
*/
class ArenaWithFreeLists : private Allocator<false>, private boost::noncopyable
{
private:
/// Если блок свободен, то в его начале хранится указатель на следующий свободный блок, либо nullptr, если свободных блоков больше нет.
/// Если блок используется, то в нём хранятся какие-то данные.
/// If the block is free, then the pointer to the next free block is stored at its beginning, or nullptr, if there are no more free blocks.
/// If the block is used, then some data is stored in it.
union Block
{
Block * next;
char data[0];
};
/// Максимальный размер куска памяти, который выделяется с помощью Arena. Иначе используем Allocator напрямую.
/// The maximum size of a piece of memory that is allocated with Arena. Otherwise, we use Allocator directly.
static constexpr size_t max_fixed_block_size = 65536;
/// Получить индекс в массиве freelist-ов для заданного размера.
/// Get the index in the freelist array for the specified size.
static size_t findFreeListIndex(const size_t size)
{
return size <= 8 ? 2 : bitScanReverse(size - 1);
}
/// Для выделения блоков не слишком большого размера используется Arena.
/// Arena is used to allocate blocks that are not too large.
Arena pool;
/// Списки свободных блоков. Каждый элемент указывает на голову соответствующего списка, либо равен nullptr.
/// Первые два элемента не используются, а предназначены для упрощения арифметики.
/// Lists of free blocks. Each element points to the head of the corresponding list, or is nullptr.
/// The first two elements are not used, but are intended to simplify arithmetic.
Block * free_lists[16] {};
public:
@ -60,10 +60,10 @@ public:
/// find list of required size
const auto list_idx = findFreeListIndex(size);
/// Если есть свободный блок.
/// If there is a free block.
if (auto & free_block_ptr = free_lists[list_idx])
{
/// Возьмём его. И поменяем голову списка на следующий элемент списка.
/// Let's take it. And change the head of the list to the next item in the list.
const auto res = free_block_ptr->data;
free_block_ptr = free_block_ptr->next;
return res;
@ -81,14 +81,14 @@ public:
/// find list of required size
const auto list_idx = findFreeListIndex(size);
/// Вставим освобождённый блок в голову списка.
/// Insert the released block into the head of the list.
auto & free_block_ptr = free_lists[list_idx];
const auto old_head = free_block_ptr;
free_block_ptr = reinterpret_cast<Block *>(ptr);
free_block_ptr->next = old_head;
}
/// Размер выделенного пула в байтах
/// Size of the allocated pool in bytes
size_t size() const
{
return pool.size();

View File

@ -8,30 +8,30 @@
namespace DB
{
/** Массив (почти) неизменяемого размера:
* размер задаётся в конструкторе;
* метод resize приводит к удалению старых данных и нужен лишь для того,
* чтобы можно было сначала создать пустой объект, используя конструктор по-умолчанию,
* а потом уже определиться с размером.
/** An array of (almost) unchangable size:
* the size is specified in the constructor;
* the `resize` method removes old data, and is needed only
* so that you can first create an empty object using the default constructor,
* and then decide on the size.
*
* Есть возможность не инициализировать элементы по-умолчанию, а создавать их inplace.
* Деструкторы элементов вызываются автоматически.
* There is a possibility to not initialize elements by default, but create them inplace.
* Member destructors are called automatically.
*
* sizeof равен размеру одного указателя.
* `sizeof` is equal to the size of one pointer.
*
* Не exception-safe.
* Копирование не поддерживается. Перемещение опустошает исходный объект.
* То есть, использовать этот массив во многих случаях неудобно.
* Not exception-safe.
* Copying is not supported. Moving empties the original object.
* That is, it is inconvenient to use this array in many cases.
*
* Предназначен для ситуаций, в которых создаётся много массивов одинакового небольшого размера,
* но при этом размер не известен во время компиляции.
* Также даёт существенное преимущество в случаях, когда важно, чтобы sizeof был минимальным.
* Например, если массивы кладутся в open-addressing хэш-таблицу с inplace хранением значений (как HashMap)
* Designed for situations in which many arrays of the same small size are created,
* but the size is not known at compile time.
* Also gives a significant advantage in cases where it is important that `sizeof` is minimal.
* For example, if arrays are put in an open-addressing hash table with inplace storage of values (like HashMap)
*
* В этом случае, по сравнению с std::vector:
* - для массивов размером в 1 элемент - преимущество примерно в 2 раза;
* - для массивов размером в 5 элементов - преимущество примерно в 1.5 раза
* (в качестве T использовались DB::Field, содержащие UInt64 и String);
* In this case, compared to std::vector:
* - for arrays of 1 element size - an advantage of about 2 times;
* - for arrays of 5 elements - an advantage of about 1.5 times
* (DB::Field, containing UInt64 and String, used as T);
*/
const size_t empty_auto_array_helper = 0;
@ -42,7 +42,7 @@ template <typename T>
class AutoArray
{
public:
/// Для отложенного создания.
/// For deferred creation.
AutoArray()
{
setEmpty();
@ -53,16 +53,16 @@ public:
init(size_, false);
}
/** Не будут вызваны конструкторы по-умолчанию для элементов.
* В этом случае, вы должны вставить все элементы с помощью функции place и placement new,
* так как для них потом будут вызваны деструкторы.
/** The default constructors for elements will not be called.
* In this case, you must insert all elements using the `place` and `placement new` functions,
* since destructors are then called for them.
*/
AutoArray(size_t size_, const DontInitElemsTag & tag)
{
init(size_, true);
}
/** Инициализирует все элементы копирующим конструктором с параметром value.
/** Initializes all elements with a copy constructor with the `value` parameter.
*/
AutoArray(size_t size_, const T & value)
{
@ -74,7 +74,7 @@ public:
}
}
/** resize удаляет все существующие элементы.
/** `resize` removes all existing items.
*/
void resize(size_t size_, bool dont_init_elems = false)
{
@ -82,7 +82,7 @@ public:
init(size_, dont_init_elems);
}
/** Премещение.
/** Move operations.
*/
AutoArray(AutoArray && src)
{
@ -125,10 +125,10 @@ public:
setEmpty();
}
/** Можно читать и модифицировать элементы с помощью оператора []
* только если элементы были инициализированы
* (то есть, в конструктор не был передан DontInitElemsTag,
* или вы их инициализировали с помощью place и placement new).
/** You can read and modify elements using the [] operator
* only if items were initialized
* (that is, into the constructor was not passed DontInitElemsTag,
* or you initialized them using `place` and `placement new`).
*/
T & operator[](size_t i)
{
@ -140,9 +140,9 @@ public:
return elem(i);
}
/** Получить кусок памяти, в котором должен быть расположен элемент.
* Функция предназначена, чтобы инициализировать элемент,
* который ещё не был инициализирован:
/** Get the piece of memory in which the element should be located.
* The function is intended to initialize an element,
* which has not yet been initialized
* new (arr.place(i)) T(args);
*/
char * place(size_t i)

View File

@ -23,9 +23,9 @@ static inline ContainerType max(const ContainerType & lhs, const ContainerType &
}
/** Для маленького количества ключей - массив фиксированного размера "на стеке".
* Для среднего - выделяется HashSet.
* Для большого - выделяется HyperLogLog.
/** For a small number of keys - an array of fixed size "on the stack".
* For the average, HashSet is allocated.
* For large, HyperLogLog is allocated.
*/
template
<
@ -146,7 +146,7 @@ public:
getContainer<Large>().merge(rhs.getContainer<Large>());
}
/// Можно вызывать только для пустого объекта.
/// You can only call for an empty object.
void read(DB::ReadBuffer & in)
{
UInt8 v;
@ -171,8 +171,8 @@ public:
{
auto container_type = getContainerType();
/// Если readAndMerge вызывается с пустым состоянием, просто десериализуем
/// состояние задано в качестве параметра.
/// If readAndMerge is called with an empty state, just deserialize
/// the state is specified as a parameter.
if ((container_type == details::ContainerType::SMALL) && small.empty())
{
read(in);

View File

@ -15,11 +15,11 @@ namespace ErrorCodes
}
/** Компактный массив для хранения данных, размер content_width, в битах, которых составляет
* меньше одного байта. Вместо того, чтобы хранить каждое значение в отдельный
* байт, что приводит к растрате 37.5% пространства для content_width=5, CompactArray хранит
* смежные content_width-битные значения в массиве байтов, т.е. фактически CompactArray
* симулирует массив content_width-битных значений.
/** Compact array for data storage, size `content_width`, in bits, of which is
* less than one byte. Instead of storing each value in a separate
* byte (which would waste 37.5% of the space for content_width = 5), CompactArray stores
* adjacent `content_width`-bit values in the byte array, that is actually CompactArray
* simulates an array of `content_width`-bit values.
*/
template <typename BucketIndex, UInt8 content_width, size_t bucket_count>
class __attribute__ ((packed)) CompactArray final
@ -76,12 +76,12 @@ public:
}
private:
/// число байт в битсете
/// number of bytes in bitset
static constexpr size_t BITSET_SIZE = (static_cast<size_t>(bucket_count) * content_width + 7) / 8;
UInt8 bitset[BITSET_SIZE] = { 0 };
};
/** Класс для последовательного чтения ячеек из компактного массива на диске.
/** A class for sequentially reading cells from a compact array on a disk.
*/
template <typename BucketIndex, UInt8 content_width, size_t bucket_count>
class CompactArray<BucketIndex, content_width, bucket_count>::Reader final
@ -135,7 +135,7 @@ public:
return true;
}
/** Вернуть текущий номер ячейки и соответствующее содержание.
/** Return the current cell number and the corresponding content.
*/
inline std::pair<BucketIndex, UInt8> get() const
{
@ -150,26 +150,26 @@ public:
private:
ReadBuffer & in;
/// Физическое расположение текущей ячейки.
/// The physical location of the current cell.
Locus locus;
/// Текущая позиция в файле в виде номера ячейки.
/// The current position in the file as a cell number.
BucketIndex current_bucket_index = 0;
/// Количество прочитанных байтов.
/// The number of bytes read.
size_t read_count = 0;
/// Содержание в текущей позиции.
/// The content in the current position.
UInt8 value_l;
UInt8 value_r;
///
bool is_eof = false;
/// Влезает ли ячейка полностью в один байт?
/// Does the cell fully fit into one byte?
bool fits_in_byte;
};
/** Структура Locus содержит необходимую информацию, чтобы найти для каждой ячейки
* соответствующие байт и смещение, в битах, от начала ячейки. Поскольку в общем
* случае размер одного байта не делится на размер одной ячейки, возможны случаи,
* когда одна ячейка перекрывает два байта. Поэтому структура Locus содержит две
* пары (индекс, смещение).
/** The `Locus` structure contains the necessary information to find for each cell
* the corresponding byte and offset, in bits, from the beginning of the cell. Since in general
* case the size of one byte is not divisible by the size of one cell, cases possible
* when one cell overlaps two bytes. Therefore, the `Locus` structure contains two
* pairs (index, offset).
*/
template <typename BucketIndex, UInt8 content_width, size_t bucket_count>
class CompactArray<BucketIndex, content_width, bucket_count>::Locus final
@ -190,13 +190,13 @@ public:
{
if ((index_l == index_r) || (index_l == (BITSET_SIZE - 1)))
{
/// Ячейка полностью влезает в один байт.
/// The cell completely fits into one byte.
*content_l &= ~(((1 << content_width) - 1) << offset_l);
*content_l |= content << offset_l;
}
else
{
/// Ячейка перекрывает два байта.
/// The cell overlaps two bytes.
size_t left = 8 - offset_l;
*content_l &= ~(((1 << left) - 1) << offset_l);
@ -230,13 +230,13 @@ private:
UInt8 ALWAYS_INLINE read(UInt8 value_l) const
{
/// Ячейка полностью влезает в один байт.
/// The cell completely fits into one byte.
return (value_l >> offset_l) & ((1 << content_width) - 1);
}
UInt8 ALWAYS_INLINE read(UInt8 value_l, UInt8 value_r) const
{
/// Ячейка перекрывает два байта.
/// The cell overlaps two bytes.
return ((value_l >> offset_l) & ((1 << (8 - offset_l)) - 1))
| ((value_r & ((1 << offset_r) - 1)) << (8 - offset_l));
}
@ -250,7 +250,7 @@ private:
UInt8 * content_l;
UInt8 * content_r;
/// Проверки
/// Checks
static_assert((content_width > 0) && (content_width < 8), "Invalid parameter value");
static_assert(bucket_count <= (std::numeric_limits<size_t>::max() / content_width), "Invalid parameter value");
};

View File

@ -38,9 +38,9 @@ namespace detail
}
};
/** Очень простая thread-safe очередь ограниченной длины.
* Если пытаться вынуть элемент из пустой очереди, то поток блокируется, пока очередь не станет непустой.
* Если пытаться вставить элемент в переполненную очередь, то поток блокируется, пока в очереди не появится элемент.
/** A very simple thread-safe queue of limited length.
* If you try to pop an item from an empty queue, the thread is blocked until the queue becomes nonempty.
* If you try to push an element into an overflowed queue, the thread is blocked until space appears in the queue.
*/
template <typename T>
class ConcurrentBoundedQueue

View File

@ -22,24 +22,24 @@
#define SMALL_READ_WRITE_BUFFER_SIZE 16
/** Хранит в файле число.
* Предназначен для редких вызовов (не рассчитан на производительность).
/** Stores a number in the file.
* Designed for rare calls (not designed for performance).
*/
class CounterInFile
{
public:
/// path - имя файла, включая путь
/// path - the name of the file, including the path
CounterInFile(const std::string & path_) : path(path_) {}
/** Добавить delta к числу в файле и вернуть новое значение.
* Если параметр create_if_need не установлен в true, то
* в файле уже должно быть записано какое-нибудь число (если нет - создайте файл вручную с нулём).
/** Add `delta` to the number in the file and return the new value.
* If the `create_if_need` parameter is not set to true, then
* the file should already have a number written (if not - create the file manually with zero).
*
* Для защиты от race condition-ов между разными процессами, используются файловые блокировки.
* (Но при первом создании файла race condition возможен, так что лучше создать файл заранее.)
* To protect against race conditions between different processes, file locks are used.
* (But when the first file is created, the race condition is possible, so it's better to create the file in advance.)
*
* locked_callback вызывается при заблокированном файле со счетчиком. В него передается новое значение.
* locked_callback можно использовать, чтобы делать что-нибудь атомарно с увеличением счетчика (например, переименовывать файлы).
* `locked_callback` is called when the counter file is locked. A new value is passed to it.
* `locked_callback` can be used to do something atomically with incrementing the counter (for example, renaming files).
*/
template <typename Callback>
Int64 add(Int64 delta, Callback && locked_callback, bool create_if_need = false)
@ -74,7 +74,7 @@ public:
}
catch (const DB::Exception & e)
{
/// Более понятное сообщение об ошибке.
/// A more understandable error message.
if (e.code() == DB::ErrorCodes::CANNOT_READ_ALL_DATA || e.code() == DB::ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF)
throw DB::Exception("File " + path + " is empty. You must fill it manually with appropriate value.", e.code());
else
@ -118,13 +118,13 @@ public:
return path;
}
/// Изменить путь к файлу.
/// Change the path to the file.
void setPath(std::string path_)
{
path = path_;
}
// Не thread-safe и не синхронизирован между процессами.
// Not thread-safe and not synchronized between processes.
void fixIfBroken(UInt64 value)
{
bool file_exists = Poco::File(path).exists();

View File

@ -35,7 +35,7 @@ public:
DB::Exception * clone() const override { return new DB::Exception(*this); }
void rethrow() const override { throw *this; }
/// Дописать к существующему сообщению что-нибудь ещё.
/// Add something to the existing message.
void addMessage(const std::string & arg) { extendedMessage(arg); }
const StackTrace & getStackTrace() const { return trace; }
@ -45,7 +45,7 @@ private:
};
/// Содержит дополнительный член saved_errno. См. функцию throwFromErrno.
/// Contains an additional member `saved_errno`. See the throwFromErrno function.
class ErrnoException : public Exception
{
public:
@ -73,8 +73,8 @@ using Exceptions = std::vector<std::exception_ptr>;
void throwFromErrno(const std::string & s, int code = 0, int the_errno = errno);
/** Попробовать записать исключение в лог (и забыть про него).
* Можно использовать в деструкторах в блоке catch (...).
/** Try to write an exception to the log (and forget about it).
* Can be used in destructors in the catch-all block.
*/
void tryLogCurrentException(const char * log_name, const std::string & start_of_message = "");
void tryLogCurrentException(Poco::Logger * logger, const std::string & start_of_message = "");

View File

@ -25,16 +25,16 @@ namespace ErrorCodes
}
/// Базовый класс содержащий основную информацию о внешней таблице и
/// основные функции для извлечения этой информации из текстовых полей.
/// The base class containing the basic information about external table and
/// basic functions for extracting this information from text fields.
class BaseExternalTable
{
public:
std::string file; /// Файл с данными или '-' если stdin
std::string name; /// Имя таблицы
std::string format; /// Название формата хранения данных
std::string file; /// File with data or '-' if stdin
std::string name; /// The name of the table
std::string format; /// Name of the data storage format
/// Описание структуры таблицы: (имя столбца, имя типа данных)
/// Description of the table structure: (column name, data type name)
std::vector<std::pair<std::string, std::string> > structure;
std::unique_ptr<ReadBuffer> read_buffer;
@ -42,10 +42,10 @@ public:
virtual ~BaseExternalTable() {};
/// Инициализировать read_buffer в зависимости от источника данных. По умолчанию не делает ничего.
/// Initialize read_buffer, depending on the data source. By default, does nothing.
virtual void initReadBuffer() {};
/// Инициализировать sample_block по структуре таблицы сохраненной в structure
/// Initialize sample_block according to the structure of the table stored in the `structure`
virtual void initSampleBlock(const Context & context)
{
const DataTypeFactory & data_type_factory = DataTypeFactory::instance();
@ -60,7 +60,7 @@ public:
}
}
/// Получить данные таблицы - пару (поток с содержимым таблицы, имя таблицы)
/// Get the table data - a pair (a thread with the contents of the table, the name of the table)
virtual ExternalTableData getData(const Context & context)
{
initReadBuffer();
@ -71,7 +71,7 @@ public:
}
protected:
/// Очистить всю накопленную информацию
/// Clear all accumulated information
void clean()
{
name = "";
@ -82,7 +82,7 @@ protected:
read_buffer.reset();
}
/// Функция для отладочного вывода информации
/// Function for debugging information output
void write()
{
std::cerr << "file " << file << std::endl;
@ -100,7 +100,7 @@ protected:
return res;
}
/// Построить вектор structure по текстовому полю structure
/// Construct the `structure` vector from the text field `structure`
virtual void parseStructureFromStructureField(const std::string & argument)
{
std::vector<std::string> vals = split(argument, " ,");
@ -112,7 +112,7 @@ protected:
structure.emplace_back(vals[i], vals[i + 1]);
}
/// Построить вектор structure по текстовому полю types
/// Construct the `structure` vector from the text field `types`
virtual void parseStructureFromTypesField(const std::string & argument)
{
std::vector<std::string> vals = split(argument, " ,");
@ -123,7 +123,7 @@ protected:
};
/// Парсинг внешей таблицы, используемый в tcp клиенте.
/// Parsing of external table used in the tcp client.
class ExternalTable : public BaseExternalTable
{
public:
@ -135,7 +135,7 @@ public:
read_buffer = std::make_unique<ReadBufferFromFile>(file);
}
/// Извлечение параметров из variables_map, которая строится по командной строке клиента
/// Extract parameters from variables_map, which is built on the client command line
ExternalTable(const boost::program_options::variables_map & external_options)
{
if (external_options.count("file"))
@ -162,9 +162,9 @@ public:
}
};
/// Парсинг внешей таблицы, используемый при отправке таблиц через http
/// Функция handlePart будет вызываться для каждой переданной таблицы,
/// поэтому так же необходимо вызывать clean в конце handlePart.
/// Parsing of external table used when sending tables via http
/// The `handlePart` function will be called for each table passed,
/// so it's also necessary to call `clean` at the end of the `handlePart`.
class ExternalTablesHandler : public Poco::Net::PartHandler, BaseExternalTable
{
public:
@ -174,15 +174,15 @@ public:
void handlePart(const Poco::Net::MessageHeader & header, std::istream & stream)
{
/// Буфер инициализируется здесь, а не в виртуальной функции initReadBuffer
/// The buffer is initialized here, not in the virtual function initReadBuffer
read_buffer = std::make_unique<ReadBufferFromIStream>(stream);
/// Извлекаем коллекцию параметров из MessageHeader
/// Retrieve a collection of parameters from MessageHeader
Poco::Net::NameValueCollection content;
std::string label;
Poco::Net::MessageHeader::splitParameters(header.get("Content-Disposition"), label, content);
/// Получаем параметры
/// Get parameters
name = content.get("name", "_data");
format = params.get(name + "_format", "TabSeparated");
@ -195,13 +195,13 @@ public:
ExternalTableData data = getData(context);
/// Создаем таблицу
/// Create table
NamesAndTypesListPtr columns = std::make_shared<NamesAndTypesList>(sample_block.getColumnsList());
StoragePtr storage = StorageMemory::create(data.second, columns);
context.addExternalTable(data.second, storage);
BlockOutputStreamPtr output = storage->write(ASTPtr(), context.getSettingsRef());
/// Записываем данные
/// Write data
data.first->readPrefix();
output->writePrefix();
while(Block block = data.first->read())
@ -210,7 +210,7 @@ public:
output->writeSuffix();
names.push_back(name);
/// Подготавливаемся к приему следующего файла, для этого очищаем всю полученную информацию
/// Prepare to receive the next file; to do this, we clear all the received information
clean();
}

View File

@ -136,7 +136,7 @@ void FileChecker::load(Map & map) const
ReadBufferFromFile in(files_info_path);
WriteBufferFromString out(content);
/// The JSON library does not support whitespace. We delete them. Ineffective.
/// The JSON library does not support whitespace. We delete them. Inefficient.
while (!in.eof())
{
char c;

View File

@ -8,11 +8,11 @@
namespace DB
{
/// хранит размеры всех столбцов, и может проверять не побились ли столбцы
/// stores the sizes of all columns, and can check whether the columns are corrupted
class FileChecker
{
private:
/// Имя файла -> размер.
/// File name -> size.
using Map = std::map<std::string, size_t>;
public:
@ -23,7 +23,7 @@ public:
void update(const Poco::File & file);
void update(const Files::const_iterator & begin, const Files::const_iterator & end);
/// Проверяем файлы, параметры которых указаны в sizes.json
/// Check the files whose parameters are specified in sizes.json
bool check() const;
private:
@ -35,7 +35,7 @@ private:
std::string files_info_path;
std::string tmp_files_info_path;
/// Данные из файла читаются лениво.
/// The data from the file is read lazily.
Map map;
bool initialized = false;

View File

@ -4,12 +4,12 @@
#include <Common/HashTable/HashSet.h>
/** Хеш-таблица, позволяющая очищать таблицу за O(1).
* Еще более простая, чем HashSet: Key и Mapped должны быть POD-типами.
/** A hash table that allows you to clear the table in O(1).
* Even simpler than HashSet: Key and Mapped must be POD-types.
*
* Вместо этого класса можно было бы просто использовать в HashSet в качестве ключа пару <версия, ключ>,
* но тогда таблица накапливала бы все ключи, которые в нее когда-либо складывали, и неоправданно росла.
* Этот класс идет на шаг дальше и считает ключи со старой версией пустыми местами в хеш-таблице.
* Instead of this class, you could just use the pair (version, key) in the HashSet as the key
* but then the table would accumulate all the keys that were ever stored in it, and it would grow unreasonably.
* This class goes a step further and considers the keys with the old version empty in the hash table.
*/
@ -17,11 +17,11 @@ struct ClearableHashSetState
{
UInt32 version = 1;
/// Сериализация, в бинарном и текстовом виде.
/// Serialization, in binary and text form.
void write(DB::WriteBuffer & wb) const { DB::writeBinary(version, wb); }
void writeText(DB::WriteBuffer & wb) const { DB::writeText(version, wb); }
/// Десериализация, в бинарном и текстовом виде.
/// Deserialization, in binary and text form.
void read(DB::ReadBuffer & rb) { DB::readBinary(version, rb); }
void readText(DB::ReadBuffer & rb) { DB::readText(version, rb); }
};
@ -38,10 +38,10 @@ struct ClearableHashTableCell : public BaseCell
bool isZero(const State & state) const { return version != state.version; }
static bool isZero(const Key & key, const State & state) { return false; }
/// Установить значение ключа в ноль.
/// Set the key value to zero.
void setZero() { version = 0; }
/// Нужно ли хранить нулевой ключ отдельно (то есть, могут ли в хэш-таблицу вставить нулевой ключ).
/// Whether the zero key needs to be stored separately (that is, whether a zero key can be inserted into the hash table).
static constexpr bool need_zero_value_storage = false;
ClearableHashTableCell() {}

View File

@ -3,12 +3,19 @@
#include <Core/Types.h>
/** Хэш функции, которые лучше чем тривиальная функция std::hash.
* (при агрегации по идентификатору посетителя, прирост производительности более чем в 5 раз)
/** Hash functions that are better than the trivial function std::hash.
*
* Example: when we do aggregation by the visitor ID, the performance increase is more than 5 times.
* This is because of following reasons:
* - in Yandex, visitor identifier is an integer that has timestamp with seconds resolution in lower bits;
* - in typical implementation of standard library, hash function for integers is trivial and just use lower bits;
* - traffic is non-uniformly distributed across a day;
* - we are using open-addressing linear probing hash tables that are most critical to hash function quality,
* and trivial hash function gives disastrous results.
*/
/** Взято из MurmurHash.
* Быстрее, чем intHash32 при вставке в хэш-таблицу UInt64 -> UInt64, где ключ - идентификатор посетителя.
/** Taken from MurmurHash. This is Murmur finalizer.
* Faster than intHash32 when inserting into the hash table UInt64 -> UInt64, where the key is the visitor ID.
*/
inline DB::UInt64 intHash64(DB::UInt64 x)
{
@ -21,21 +28,22 @@ inline DB::UInt64 intHash64(DB::UInt64 x)
return x;
}
/** CRC32C является не очень качественной в роли хэш функции,
* согласно avalanche и bit independence тестам, а также малым количеством бит,
* но может вести себя хорошо при использовании в хэш-таблицах,
* за счёт высокой скорости (latency 3 + 1 такт, througput 1 такт).
* Работает только при поддержке SSE 4.2.
* Используется asm вместо интринсика, чтобы не обязательно было собирать весь проект с -msse4.
/** CRC32C is not very high-quality as a hash function,
* according to avalanche and bit independence tests (see SMHasher software), as well as a small number of bits,
* but can behave well when used in hash tables,
* due to high speed (latency 3 + 1 clock cycle, throughput 1 clock cycle).
* Works only with SSE 4.2 support.
*/
#if __SSE4_2__
#include <nmmintrin.h>
#endif
inline DB::UInt64 intHashCRC32(DB::UInt64 x)
{
#if defined(__x86_64__)
DB::UInt64 crc = -1ULL;
asm("crc32q %[x], %[crc]\n" : [crc] "+r" (crc) : [x] "rm" (x));
return crc;
#if __SSE4_2__
return _mm_crc32_u64(-1ULL, x);
#else
/// На других платформах используем не обязательно CRC32. NOTE Это может сбить с толку.
/// On other platforms we do not have CRC32. NOTE This can be confusing.
return intHash64(x);
#endif
}
@ -117,7 +125,7 @@ DEFINE_HASH(DB::Float64)
#undef DEFINE_HASH
/// Разумно использовать для UInt8, UInt16 при достаточном размере хэш-таблицы.
/// It is reasonable to use for UInt8, UInt16 with sufficient hash table size.
struct TrivialHash
{
template <typename T>
@ -128,17 +136,22 @@ struct TrivialHash
};
/** Сравнительно неплохая некриптографическая хэш функция из UInt64 в UInt32.
* Но хуже (и по качеству и по скорости), чем просто срезка intHash64.
* Взята отсюда: http://www.concentric.net/~ttwang/tech/inthash.htm
/** A relatively good non-cryptographic hash function from UInt64 to UInt32.
* But worse (both in quality and speed) than just cutting intHash64.
* Taken from here: http://www.concentric.net/~ttwang/tech/inthash.htm
*
* Немного изменена по сравнению с функцией по ссылке: сдвиги вправо случайно заменены на цикличесвие сдвиги вправо.
* Это изменение никак не повлияло на результаты тестов smhasher.
* Slightly changed compared to the function from the link: right shifts were accidentally replaced with cyclic right shifts.
* This change did not affect the smhasher test results.
*
* Рекомендуется для разных задач использовать разные salt.
* А то был случай, что в БД значения сортировались по хэшу (для некачественного псевдослучайного разбрасывания),
* а в другом месте, в агрегатной функции, в хэш таблице использовался такой же хэш,
* в результате чего, эта агрегатная функция чудовищно тормозила из-за коллизий.
* It is recommended to use different salt for different tasks.
* There was a case where, in a database, values were sorted by hash (for low-quality pseudo-random scattering),
* while in another place, inside an aggregate function, the same hash was used in a hash table;
* as a result, that aggregate function was monstrously slowed down by collisions.
*
* NOTE Salting is far from perfect, because it commutes with first steps of calculation.
*
* NOTE As mentioned, this function is slower than intHash64.
* But occasionaly, it is faster, when written in a loop and loop is vectorized.
*/
template <DB::UInt64 salt>
inline DB::UInt32 intHash32(DB::UInt64 key)
@ -156,7 +169,7 @@ inline DB::UInt32 intHash32(DB::UInt64 key)
}
/// Для контейнеров.
/// For containers.
template <typename T, DB::UInt64 salt = 0>
struct IntHash32
{

View File

@ -13,7 +13,7 @@
struct NoInitTag {};
/// Пара, которая не инициализирует элементы, если не нужно.
/// A pair that does not initialize the elements, if not needed.
template <typename First, typename Second>
struct PairNoInit
{
@ -60,18 +60,18 @@ struct HashMapCell
bool isZero(const State & state) const { return isZero(value.first, state); }
static bool isZero(const Key & key, const State & state) { return ZeroTraits::check(key); }
/// Установить значение ключа в ноль.
/// Set the key value to zero.
void setZero() { ZeroTraits::set(value.first); }
/// Нужно ли хранить нулевой ключ отдельно (то есть, могут ли в хэш-таблицу вставить нулевой ключ).
/// Whether the zero key needs to be stored separately (that is, whether a zero key can be inserted into the hash table).
static constexpr bool need_zero_value_storage = true;
/// Является ли ячейка удалённой.
/// Whether the cell was deleted.
bool isDeleted() const { return false; }
void setMapped(const value_type & value_) { value.second = value_.second; }
/// Сериализация, в бинарном и текстовом виде.
/// Serialization, in binary and text form.
void write(DB::WriteBuffer & wb) const
{
DB::writeBinary(value.first, wb);
@ -85,7 +85,7 @@ struct HashMapCell
DB::writeDoubleQuoted(value.second, wb);
}
/// Десериализация, в бинарном и текстовом виде.
/// Deserialization, in binary and text form.
void read(DB::ReadBuffer & rb)
{
DB::readBinary(value.first, rb);
@ -141,19 +141,19 @@ public:
bool inserted;
this->emplace(x, it, inserted);
/** Может показаться, что инициализация не обязательна для POD-типов (или __has_trivial_constructor),
* так как кусок памяти для хэш-таблицы изначально инициализирован нулями.
* Но, на самом деле, пустая ячейка может быть не инициализирована нулями в следующих случаях:
* - ZeroValueStorage (в нём зануляется только ключ);
* - после ресайза и переноса части ячеек в новую половину хэш-таблицы, у старых ячеек, тоже зануляется только ключ.
/** It may seem that initialization is not necessary for POD-types (or __has_trivial_constructor),
* since the hash table memory is initially initialized with zeros.
* But, in fact, an empty cell may not be initialized with zeros in the following cases:
* - ZeroValueStorage (it only zeros the key);
* - after resizing and moving some of the cells to the new half of the hash table, only the key is zeroed in the old cells as well.
*
* По производительности, разницы почти всегда нет, за счёт того, что it->second как правило присваивается сразу
* после вызова operator[], и так как operator[] инлайнится, компилятор убирает лишнюю инициализацию.
* On performance, there is almost always no difference, due to the fact that it->second is usually assigned immediately
* after calling `operator[]`, and since `operator[]` is inlined, the compiler removes unnecessary initialization.
*
* Иногда из-за инициализации, производительность даже растёт. Это происходит в коде вида ++map[key].
* Когда мы делаем инициализацию, то для новых ячеек, достаточно сразу сделать store 1.
* А если бы мы не делали инициализацию, то не смотря на то, что в ячейке был ноль,
* компилятор не может об этом догадаться, и генерирует код load, increment, store.
* Sometimes due to initialization, the performance even grows. This occurs in code like `++map[key]`.
* When we do the initialization, for new cells, it's enough to make `store 1` right away.
* And if we did not initialize, then even though there was zero in the cell,
* the compiler can not guess about this, and generates the `load`, `increment`, `store` code.
*/
if (inserted)
new(&it->second) mapped_type();

View File

@ -44,27 +44,27 @@ namespace ErrorCodes
}
/** Состояние хэш-таблицы, которое влияет на свойства её ячеек.
* Используется в качестве параметра шаблона.
* Например, существует реализация мгновенно-очищаемой хэш-таблицы - ClearableHashMap.
* Для неё, в каждой ячейке хранится номер версии, и в самой хэш-таблице - текущая версия.
* При очистке, просто увеличивается текущая версия; все ячейки с несовпадающей версией считаются пустыми.
* Другой пример: для приближённого рассчёта количества уникальных посетителей, есть хэш-таблица UniquesHashSet.
* В ней имеется понятие "степень". При каждом переполнении, ячейки с ключами, не делящимися на соответствующую степень двух, удаляются.
/** The state of the hash table that affects the properties of its cells.
* Used as a template parameter.
* For example, there is an implementation of an instantly clearable hash table - ClearableHashMap.
* For it, each cell holds the version number, and in the hash table itself is the current version.
* When clearing, the current version is simply incremented; all cells with a mismatching version are considered empty.
* Another example: for approximate calculation of the number of unique visitors, there is the UniquesHashSet hash table.
* It has the concept of "degree". At each overflow, cells whose keys are not divisible by the corresponding power of two are deleted.
*/
struct HashTableNoState
{
/// Сериализация, в бинарном и текстовом виде.
/// Serialization, in binary and text form.
void write(DB::WriteBuffer & wb) const {}
void writeText(DB::WriteBuffer & wb) const {}
/// Десериализация, в бинарном и текстовом виде.
/// Deserialization, in binary and text form.
void read(DB::ReadBuffer & rb) {}
void readText(DB::ReadBuffer & rb) {}
};
/// Эти функции могут быть перегружены для пользовательских типов.
/// These functions can be overloaded for custom types.
namespace ZeroTraits
{
@ -77,11 +77,11 @@ void set(T & x) { x = 0; }
};
/** Compile-time интерфейс ячейки хэш-таблицы.
* Разные ячейки используются для реализации разных хэш-таблиц.
* Ячейка должна содержать ключ.
* Также может содержать значение и произвольные дополнительные данные
* (пример: запомненное значение хэш-функции; номер версии для ClearableHashMap).
/** Compile-time interface for cell of the hash table.
* Different cell types are used to implement different hash tables.
* The cell must contain a key.
* It can also contain a value and arbitrary additional data
* (example: the stored hash value; version number for ClearableHashMap).
*/
template <typename Key, typename Hash, typename TState = HashTableNoState>
struct HashTableCell
@ -93,89 +93,89 @@ struct HashTableCell
HashTableCell() {}
/// Создать ячейку с заданным ключём / ключём и значением.
/// Create a cell with the given key / key and value.
HashTableCell(const Key & key_, const State & state) : key(key_) {}
/// HashTableCell(const value_type & value_, const State & state) : key(value_) {}
/// HashTableCell(const value_type & value_, const State & state) : key(value_) {}
/// Получить то, что будет value_type контейнера.
/// Get what the value_type of the container will be.
value_type & getValue() { return key; }
const value_type & getValue() const { return key; }
/// Получить ключ.
/// Get the key.
static Key & getKey(value_type & value) { return value; }
static const Key & getKey(const value_type & value) { return value; }
/// Равны ли ключи у ячеек.
/// Are the keys at the cells equal?
bool keyEquals(const Key & key_) const { return key == key_; }
bool keyEquals(const Key & key_, size_t hash_) const { return key == key_; }
/// Если ячейка умеет запоминать в себе значение хэш-функции, то запомнить его.
/// If the cell can remember the value of the hash function, then remember it.
void setHash(size_t hash_value) {}
/// Если ячейка умеет запоминать в себе значение хэш-функции, то вернуть запомненное значение.
/// Оно должно быть хотя бы один раз вычислено до этого.
/// Если запоминание значения хэш-функции не предусмотрено, то просто вычислить хэш.
/// If the cell can store the hash value in itself, then return the stored value.
/// It must be at least once calculated before.
/// If storing the hash value is not provided, then just compute the hash.
size_t getHash(const Hash & hash) const { return hash(key); }
/// Является ли ключ нулевым. В основном буфере, ячейки с нулевым ключём, считаются пустыми.
/// Если нулевые ключи могут быть вставлены в таблицу, то ячейка для нулевого ключа хранится отдельно, не в основном буфере.
/// Нулевые ключи должны быть такими, что занулённый кусок памяти представляет собой нулевой ключ.
/// Whether the key is zero. In the main buffer, cells with a zero key are considered empty.
/// If zero keys can be inserted into the table, then the cell for the zero key is stored separately, not in the main buffer.
/// Zero keys must be such that the zeroed-down piece of memory is a zero key.
bool isZero(const State & state) const { return isZero(key, state); }
static bool isZero(const Key & key, const State & state) { return ZeroTraits::check(key); }
/// Установить значение ключа в ноль.
/// Set the key value to zero.
void setZero() { ZeroTraits::set(key); }
/// Нужно ли хранить нулевой ключ отдельно (то есть, могут ли в хэш-таблицу вставить нулевой ключ).
/// Does the hash table need to store the zero key separately (that is, can a zero key be inserted into the hash table)?
static constexpr bool need_zero_value_storage = true;
/// Является ли ячейка удалённой.
/// Whether the cell is deleted.
bool isDeleted() const { return false; }
/// Установить отображаемое значение, если есть (для HashMap), в соответствующиее из value.
/// Set the mapped value, if any (for HashMap), to the corresponding `value`.
void setMapped(const value_type & value) {}
/// Сериализация, в бинарном и текстовом виде.
/// Serialization, in binary and text form.
void write(DB::WriteBuffer & wb) const { DB::writeBinary(key, wb); }
void writeText(DB::WriteBuffer & wb) const { DB::writeDoubleQuoted(key, wb); }
/// Десериализация, в бинарном и текстовом виде.
/// Deserialization, in binary and text form.
void read(DB::ReadBuffer & rb) { DB::readBinary(key, rb); }
void readText(DB::ReadBuffer & rb) { DB::writeDoubleQuoted(key, rb); }
};
/** Определяет размер хэш-таблицы, а также когда и во сколько раз её надо ресайзить.
/** Determines the size of the hash table, and when and how much it should be resized.
*/
template <size_t initial_size_degree = 8>
struct HashTableGrower
{
/// Состояние этой структуры достаточно, чтобы получить размер буфера хэш-таблицы.
/// The state of this structure is enough to get the buffer size of the hash table.
UInt8 size_degree = initial_size_degree;
/// Размер хэш-таблицы в ячейках.
/// The size of the hash table in the cells.
size_t bufSize() const { return 1 << size_degree; }
size_t maxFill() const { return 1 << (size_degree - 1); }
size_t mask() const { return bufSize() - 1; }
/// Из значения хэш-функции получить номер ячейки в хэш-таблице.
/// From the hash value, get the cell number in the hash table.
size_t place(size_t x) const { return x & mask(); }
/// Следующая ячейка в цепочке разрешения коллизий.
/// The next cell in the collision resolution chain.
size_t next(size_t pos) const { ++pos; return pos & mask(); }
/// Является ли хэш-таблица достаточно заполненной. Нужно увеличить размер хэш-таблицы, или удалить из неё что-нибудь ненужное.
/// Whether the hash table is sufficiently full. You need to increase the size of the hash table, or remove something unnecessary from it.
bool overflow(size_t elems) const { return elems > maxFill(); }
/// Увеличить размер хэш-таблицы.
/// Increase the size of the hash table.
void increaseSize()
{
size_degree += size_degree >= 23 ? 1 : 2;
}
/// Установить размер буфера по количеству элементов хэш-таблицы. Используется при десериализации хэш-таблицы.
/// Set the buffer size by the number of elements in the hash table. Used when deserializing a hash table.
void set(size_t num_elems)
{
size_degree = num_elems <= 1
@ -192,17 +192,17 @@ struct HashTableGrower
};
/** При использовании в качестве Grower-а, превращает хэш-таблицу в что-то типа lookup-таблицы.
* Остаётся неоптимальность - в ячейках хранятся ключи.
* Также компилятору не удаётся полностью удалить код хождения по цепочке разрешения коллизий, хотя он не нужен.
* TODO Сделать полноценную lookup-таблицу.
/** When used as a Grower, it turns a hash table into something like a lookup table.
* It remains non-optimal - the cells store the keys.
* Also, the compiler can not completely remove the code of passing through the collision resolution chain, although it is not needed.
* TODO Make a proper lookup table.
*/
template <size_t key_bits>
struct HashTableFixedGrower
{
size_t bufSize() const { return 1 << key_bits; }
size_t place(size_t x) const { return x; }
/// Тут можно было бы написать __builtin_unreachable(), но компилятор не до конца всё оптимизирует, и получается менее эффективно.
/// You could write __builtin_unreachable(), but the compiler does not optimize everything, and it turns out less efficiently.
size_t next(size_t pos) const { return pos + 1; }
bool overflow(size_t elems) const { return false; }
@ -212,7 +212,7 @@ struct HashTableFixedGrower
};
/** Если нужно хранить нулевой ключ отдельно - место для его хранения. */
/** If you want to store the zero key separately - a place to store it. */
template <bool need_zero_value_storage, typename Cell>
struct ZeroValueStorage;
@ -271,15 +271,15 @@ protected:
using Self = HashTable<Key, Cell, Hash, Grower, Allocator>;
using cell_type = Cell;
size_t m_size = 0; /// Количество элементов
Cell * buf; /// Кусок памяти для всех элементов кроме элемента с ключём 0.
size_t m_size = 0; /// Amount of elements
Cell * buf; /// A piece of memory for all elements except the element with zero key.
Grower grower;
#ifdef DBMS_HASH_MAP_COUNT_COLLISIONS
mutable size_t collisions = 0;
#endif
/// Найти ячейку с тем же ключём или пустую ячейку, начиная с заданного места и далее по цепочке разрешения коллизий.
/// Find a cell with the same key or an empty cell, starting from the specified position and further along the collision resolution chain.
size_t ALWAYS_INLINE findCell(const Key & x, size_t hash_value, size_t place_value) const
{
while (!buf[place_value].isZero(*this) && !buf[place_value].keyEquals(x, hash_value))
@ -293,7 +293,7 @@ protected:
return place_value;
}
/// Найти пустую ячейку, начиная с заданного места и далее по цепочке разрешения коллизий.
/// Find an empty cell, starting with the specified position and further along the collision resolution chain.
size_t ALWAYS_INLINE findEmptyCell(const Key & x, size_t hash_value, size_t place_value) const
{
while (!buf[place_value].isZero(*this))
@ -323,7 +323,7 @@ protected:
}
/// Увеличить размер буфера.
/// Increase the size of the buffer.
void resize(size_t for_num_elems = 0, size_t for_buf_size = 0)
{
#ifdef DBMS_HASH_MAP_DEBUG_RESIZES
@ -332,10 +332,10 @@ protected:
size_t old_size = grower.bufSize();
/** Чтобы в случае исключения, объект остался в корректном состоянии,
* изменение переменной grower (определяющией размер буфера хэш-таблицы)
* откладываем на момент после реального изменения буфера.
* Временная переменная new_grower используется, чтобы определить новый размер.
/** In case of exception for the object to remain in the correct state,
* changing the variable `grower` (which determines the buffer size of the hash table)
* is postponed for a moment after a real buffer change.
* The temporary variable `new_grower` is used to determine the new size.
*/
Grower new_grower = grower;
@ -354,29 +354,29 @@ protected:
else
new_grower.increaseSize();
/// Расширим пространство.
/// Expand the space.
buf = reinterpret_cast<Cell *>(Allocator::realloc(buf, getBufferSizeInBytes(), new_grower.bufSize() * sizeof(Cell)));
grower = new_grower;
/** Теперь некоторые элементы может потребоваться переместить на новое место.
* Элемент может остаться на месте, или переместиться в новое место "справа",
* или переместиться левее по цепочке разрешения коллизий, из-за того, что элементы левее него были перемещены в новое место "справа".
/** Now some items may need to be moved to a new location.
* The element can stay in place, or move to a new location "on the right",
* or move to the left of the collision resolution chain, because the elements to the left of it have been moved to the new "right" location.
*/
size_t i = 0;
for (; i < old_size; ++i)
if (!buf[i].isZero(*this) && !buf[i].isDeleted())
reinsert(buf[i]);
reinsert(buf[i], buf[i].getHash(*this));
/** Также имеется особый случай:
* если элемент должен был быть в конце старого буфера, [ x]
* но находится в начале из-за цепочки разрешения коллизий, [o x]
* то после ресайза, он сначала снова окажется не на своём месте, [ xo ]
* и для того, чтобы перенести его куда надо,
* надо будет после переноса всех элементов из старой половинки [ o x ]
* обработать ещё хвостик из цепочки разрешения коллизий сразу после неё [ o x ]
/** There is also a special case:
* if the element was to be at the end of the old buffer, [ x]
* but is at the beginning because of the collision resolution chain, [o x]
* then after resizing, it will first be out of place again, [ xo ]
* and in order to transfer it where necessary,
* after transferring all the elements from the old halves you need to [ o x ]
* process tail from the collision resolution chain immediately after it [ o x ]
*/
for (; !buf[i].isZero(*this) && !buf[i].isDeleted(); ++i)
reinsert(buf[i]);
reinsert(buf[i], buf[i].getHash(*this));
#ifdef DBMS_HASH_MAP_DEBUG_RESIZES
watch.stop();
@ -387,30 +387,30 @@ protected:
}
/** Вставить в новый буфер значение, которое было в старом буфере.
* Используется при увеличении размера буфера.
/** Insert into the new buffer a value that was in the old buffer.
* Used when increasing the buffer size.
*/
void reinsert(Cell & x)
void reinsert(Cell & x, size_t hash_value)
{
size_t hash_value = x.getHash(*this);
size_t place_value = grower.place(hash_value);
/// Если элемент на своём месте.
/// If the element is in its place.
if (&x == &buf[place_value])
return;
/// Вычисление нового места, с учётом цепочки разрешения коллизий.
/// Compute a new location, taking into account the collision resolution chain.
place_value = findCell(Cell::getKey(x.getValue()), hash_value, place_value);
/// Если элемент остался на своём месте в старой цепочке разрешения коллизий.
/// If the item remains in its place in the old collision resolution chain.
if (!buf[place_value].isZero(*this))
return;
/// Копирование на новое место и зануление старого.
/// Copy to a new location and zero the old one.
x.setHash(hash_value);
memcpy(&buf[place_value], &x, sizeof(x));
x.setZero();
/// Потом на старое место могут переместиться элементы, которые раньше были в коллизии с этим.
/// Then the elements that previously were in collision with this can move to the old place.
}
@ -611,10 +611,10 @@ protected:
iterator iteratorToZero() { return iteratorTo(this->zeroValue()); }
/// Если ключ нулевой - вставить его в специальное место и вернуть true.
bool ALWAYS_INLINE emplaceIfZero(Key x, iterator & it, bool & inserted)
/// If the key is zero, insert it into a special place and return true.
bool ALWAYS_INLINE emplaceIfZero(Key x, iterator & it, bool & inserted, size_t hash_value)
{
/// Если утверждается, что нулевой ключ не могут вставить в таблицу.
/// If it is claimed that the zero key can not be inserted into the table.
if (!Cell::need_zero_value_storage)
return false;
@ -625,7 +625,7 @@ protected:
{
++m_size;
this->setHasZero();
it.ptr->setHash(hash(x));
it.ptr->setHash(hash_value);
inserted = true;
}
else
@ -638,7 +638,7 @@ protected:
}
/// Только для ненулевых ключей. Найти нужное место, вставить туда ключ, если его ещё нет, вернуть итератор на ячейку.
/// Only for non-zero keys. Find the right place and insert the key there if it does not already exist. The iterator to the cell is set in the output parameter.
void ALWAYS_INLINE emplaceNonZero(Key x, iterator & it, bool & inserted, size_t hash_value)
{
size_t place_value = findCell(x, hash_value, grower.place(hash_value));
@ -664,9 +664,9 @@ protected:
}
catch (...)
{
/** Если этого не делать, то будут проблемы.
* Ведь останется ключ, но неинициализированное mapped-значение,
* у которого, возможно, даже нельзя вызвать деструктор.
/** If we do not do this, there will be problems.
* A key would remain, but with an uninitialized mapped value,
* whose destructor, perhaps, cannot even be called.
*/
--m_size;
buf[place_value].setZero();
@ -679,13 +679,14 @@ protected:
public:
/// Вставить значение. В случае хоть сколько-нибудь сложных значений, лучше используйте функцию emplace.
/// Insert a value. In the case of any more complex values, it is better to use the `emplace` function.
std::pair<iterator, bool> ALWAYS_INLINE insert(const value_type & x)
{
std::pair<iterator, bool> res;
if (!emplaceIfZero(Cell::getKey(x), res.first, res.second))
emplaceNonZero(Cell::getKey(x), res.first, res.second, hash(Cell::getKey(x)));
size_t hash_value = hash(Cell::getKey(x));
if (!emplaceIfZero(Cell::getKey(x), res.first, res.second, hash_value))
emplaceNonZero(Cell::getKey(x), res.first, res.second, hash_value);
if (res.second)
res.first.ptr->setMapped(x);
@ -694,14 +695,21 @@ public:
}
/** Вставить ключ,
* вернуть итератор на позицию, которую можно использовать для placement new значения,
* а также флаг - был ли вставлен новый ключ.
/// Reinsert the node pointed to by the iterator (e.g. after the table has been resized).
/// Delegates to the cell-based overload; `hash_value` must be the precomputed hash of the cell's key.
void ALWAYS_INLINE reinsert(iterator & it, size_t hash_value)
{
    reinsert(*it.getPtr(), hash_value);
}
/** Insert the key,
* return an iterator to a position that can be used for `placement new` of value,
* as well as the flag - whether a new key was inserted.
*
* Вы обязаны сделать placement new значения, если был вставлен новый ключ,
* так как при уничтожении хэш-таблицы для него будет вызываться деструктор!
* You have to make `placement new` of value if you inserted a new key,
* since when destroying a hash table, it will call the destructor!
*
* Пример использования:
* Example usage:
*
* Map::iterator it;
* bool inserted;
@ -711,20 +719,21 @@ public:
*/
void ALWAYS_INLINE emplace(Key x, iterator & it, bool & inserted)
{
if (!emplaceIfZero(x, it, inserted))
emplaceNonZero(x, it, inserted, hash(x));
}
/// То же самое, но с заранее вычисленным значением хэш-функции.
void ALWAYS_INLINE emplace(Key x, iterator & it, bool & inserted, size_t hash_value)
{
if (!emplaceIfZero(x, it, inserted))
size_t hash_value = hash(x);
if (!emplaceIfZero(x, it, inserted, hash_value))
emplaceNonZero(x, it, inserted, hash_value);
}
/// Скопировать ячейку из другой хэш-таблицы. Предполагается, что ячейка не нулевая, а также, что такого ключа в таблице ещё не было.
/// Same as emplace(x, it, inserted), but with a precalculated value of the hash function.
/// `hash_value` must be hash(x); the caller is responsible for this invariant.
/// On return, `it` points to the cell for `x`, and `inserted` tells whether a new key was added.
void ALWAYS_INLINE emplace(Key x, iterator & it, bool & inserted, size_t hash_value)
{
    if (!emplaceIfZero(x, it, inserted, hash_value))
        emplaceNonZero(x, it, inserted, hash_value);
}
/// Copy the cell from another hash table. It is assumed that the cell is not zero, and also that there was no such key in the table yet.
void ALWAYS_INLINE insertUniqueNonZero(const Cell * cell, size_t hash_value)
{
size_t place_value = findEmptyCell(cell->getKey(cell->getValue()), hash_value, grower.place(hash_value));
@ -903,8 +912,8 @@ public:
memset(buf, 0, grower.bufSize() * sizeof(*buf));
}
/// После выполнения этой функции, таблицу можно только уничтожить,
/// а также можно использовать методы size, empty, begin, end.
/// After executing this function, the table can only be destroyed,
/// and also you can use the methods `size`, `empty`, `begin`, `end`.
void clearAndShrink()
{
destroyElements();

View File

@ -3,15 +3,15 @@
#include <Common/HashTable/HashMap.h>
/** Замена хэш-таблицы для маленького количества (единицы) ключей.
* Реализована в виде массива с линейным поиском.
* Массив расположен внутри объекта.
* Интерфейс является подмножеством интерфейса HashTable.
/** Replacement of the hash table for a small number (<10) of keys.
* Implemented as an array with linear search.
* The array is located inside the object.
* The interface is a subset of the HashTable interface.
*
* Вставка возможна только если метод full возвращает false.
* При неизвестном количестве различных ключей,
* вы должны проверять, не заполнена ли таблица,
* и делать fallback в этом случае (например, использовать полноценную хэш-таблицу).
* Insert is possible only if the `full` method returns false.
* With an unknown number of different keys,
* you should check if the table is not full,
* and do a `fallback` in this case (for example, use a real hash table).
*/
template
@ -32,11 +32,11 @@ protected:
using Self = SmallTable<Key, Cell, capacity>;
using cell_type = Cell;
size_t m_size = 0; /// Количество элементов.
Cell buf[capacity]; /// Кусок памяти для всех элементов.
size_t m_size = 0; /// Amount of elements.
Cell buf[capacity]; /// A piece of memory for all elements.
/// Найти ячейку с тем же ключём или пустую ячейку, начиная с заданного места и далее по цепочке разрешения коллизий.
/// Find a cell with the same key or an empty cell, starting from the specified position and then by the collision resolution chain.
const Cell * ALWAYS_INLINE findCell(const Key & x) const
{
const Cell * it = buf;
@ -188,8 +188,8 @@ protected:
public:
/** Таблица переполнена.
* В переполненную таблицу ничего нельзя вставлять.
/** The table is full.
* You can not insert anything into the full table.
*/
bool full()
{
@ -197,7 +197,7 @@ public:
}
/// Вставить значение. В случае хоть сколько-нибудь сложных значений, лучше используйте функцию emplace.
/// Insert the value. In the case of any more complex values, it is better to use the `emplace` function.
std::pair<iterator, bool> ALWAYS_INLINE insert(const value_type & x)
{
std::pair<iterator, bool> res;
@ -211,14 +211,14 @@ public:
}
/** Вставить ключ,
* вернуть итератор на позицию, которую можно использовать для placement new значения,
* а также флаг - был ли вставлен новый ключ.
/** Insert the key,
* return an iterator to a position that can be used for `placement new` of value,
* as well as the flag - whether a new key was inserted.
*
* Вы обязаны сделать placement new значения, если был вставлен новый ключ,
* так как при уничтожении хэш-таблицы для него будет вызываться деструктор!
* You have to make `placement new` of value if you inserted a new key,
* since when destroying a hash table, a destructor will be called for it!
*
* Пример использования:
* Example usage:
*
* Map::iterator it;
* bool inserted;
@ -239,7 +239,7 @@ public:
}
/// То же самое, но вернуть false, если переполнено.
/// Same, but return false if it's full.
bool ALWAYS_INLINE tryEmplace(Key x, iterator & it, bool & inserted)
{
Cell * res = findCell(x);
@ -257,7 +257,7 @@ public:
}
/// Скопировать ячейку из другой хэш-таблицы. Предполагается, что такого ключа в таблице ещё не было.
/// Copy the cell from another hash table. It is assumed that there was no such key in the table yet.
void ALWAYS_INLINE insertUnique(const Cell * cell)
{
memcpy(&buf[m_size], cell, sizeof(*cell));

View File

@ -3,21 +3,21 @@
#include <Common/HashTable/HashTable.h>
/** Двухуровневая хэш-таблица.
* Представляет собой 256 (или 1 << BITS_FOR_BUCKET) маленьких хэш-таблиц (bucket-ов первого уровня).
* Для определения, какую из них использовать, берётся один из байтов хэш-функции.
/** Two-level hash table.
* Represents 256 (or 1 << BITS_FOR_BUCKET) small hash tables (buckets of the first level).
* To determine which one to use, one of the bytes of the hash function is taken.
*
* Обычно работает чуть-чуть медленнее простой хэш-таблицы.
* Тем не менее, обладает преимуществами в некоторых случаях:
* - если надо мерджить две хэш-таблицы вместе, то это можно легко распараллелить по bucket-ам;
* - лаг при ресайзах размазан, так как маленькие хэш-таблицы ресайзятся по-отдельности;
* - по идее, ресайзы кэш-локальны в большем диапазоне размеров.
* Usually works a little slower than a simple hash table.
* However, it has advantages in some cases:
* - if you need to merge two hash tables together, then you can easily parallelize it by buckets;
* - delay during resizes is amortized, since the small hash tables will be resized separately;
* - in theory, resizes are cache-local in a larger range of sizes.
*/
template <size_t initial_size_degree = 8>
struct TwoLevelHashTableGrower : public HashTableGrower<initial_size_degree>
{
/// Увеличить размер хэш-таблицы.
/// Increase the size of the hash table.
void increaseSize()
{
this->size_degree += this->size_degree >= 15 ? 1 : 2;
@ -52,7 +52,7 @@ public:
size_t hash(const Key & x) const { return Hash::operator()(x); }
/// NOTE Плохо для хэш-таблиц больше чем на 2^32 ячеек.
/// Pick the bucket index from the top BITS_FOR_BUCKET bits of the low 32 bits of the hash.
/// NOTE Bad for hash tables with more than 2^32 cells (only 32 bits of the hash are consumed).
static size_t getBucketFromHash(size_t hash_value) { return (hash_value >> (32 - BITS_FOR_BUCKET)) & MAX_BUCKET; }
protected:
@ -89,13 +89,13 @@ public:
TwoLevelHashTable() {}
/// Скопировать данные из другой (обычной) хэш-таблицы. У неё должна быть такая же хэш-функция.
/// Copy the data from another (normal) hash table. It should have the same hash function.
template <typename Source>
TwoLevelHashTable(const Source & src)
{
typename Source::const_iterator it = src.begin();
/// Предполагается, что нулевой ключ (хранящийся отдельно) при итерировании идёт первым.
/// It is assumed that the zero key (stored separately) is first in iteration order.
if (it != src.end() && it.getPtr()->isZero(src))
{
insert(*it);
@ -205,7 +205,7 @@ public:
iterator end() { return { this, MAX_BUCKET, impls[MAX_BUCKET].end() }; }
/// Вставить значение. В случае хоть сколько-нибудь сложных значений, лучше используйте функцию emplace.
/// Insert a value. In the case of any more complex values, it is better to use the `emplace` function.
std::pair<iterator, bool> ALWAYS_INLINE insert(const value_type & x)
{
size_t hash_value = hash(Cell::getKey(x));
@ -220,14 +220,14 @@ public:
}
/** Вставить ключ,
* вернуть итератор на позицию, которую можно использовать для placement new значения,
* а также флаг - был ли вставлен новый ключ.
/** Insert the key,
* return an iterator to a position that can be used for `placement new` of value,
* as well as the flag - whether a new key was inserted.
*
* Вы обязаны сделать placement new значения, если был вставлен новый ключ,
* так как при уничтожении хэш-таблицы для него будет вызываться деструктор!
* You have to do a `placement new` of the value if you inserted a new key,
* since when destroying a hash table, the destructor will be invoked for it!
*
* Пример использования:
* Example usage:
*
* Map::iterator it;
* bool inserted;
@ -242,7 +242,7 @@ public:
}
/// То же самое, но с заранее вычисленным значением хэш-функции.
/// Same, but with a precalculated value of the hash function.
void ALWAYS_INLINE emplace(Key x, iterator & it, bool & inserted, size_t hash_value)
{
size_t buck = getBucketFromHash(hash_value);

View File

@ -7,10 +7,10 @@
#include <tuple>
#include <type_traits>
/** Этот класс предоставляет способ, чтобы оценить погрешность результата применения алгоритма HyperLogLog.
* Эмирические наблюдения показывают, что большие погрешности возникают при E < 5 * 2^precision, где
* E - возвращаемое значение алгоритмом HyperLogLog, и precision - параметр точности HyperLogLog.
* См. "HyperLogLog in Practice: Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm".
/** This class provides a way to evaluate the error in the result of applying the HyperLogLog algorithm.
* Empirical observations show that large errors occur at E < 5 * 2^precision, where
* E is the return value of the HyperLogLog algorithm, and `precision` is the HyperLogLog precision parameter.
* See "HyperLogLog in Practice: Algorithmic Engineering of a State of the Art Cardinality Estimation Algorithm".
* (S. Heule et al., Proceedings of the EDBT 2013 Conference).
*/
template <typename BiasData>
@ -22,14 +22,14 @@ public:
return false;
}
/// Предельное количество уникальных значений до которого должна примениться поправка
/// из алгоритма LinearCounting.
/// Maximum number of unique values up to which the correction
/// from the LinearCounting algorithm should be applied.
static double getThreshold()
{
    return BiasData::getThreshold();
}
/// Вернуть оценку погрешности.
/// Return the error estimate.
static double getBias(double raw_estimate)
{
const auto & estimates = BiasData::getRawEstimates();
@ -52,7 +52,7 @@ public:
}
else
{
/// Получаем оценку погрешности путём линейной интерполяции.
/// We get the error estimate by linear interpolation.
size_t index = std::distance(estimates.begin(), it);
double estimate1 = estimates[index - 1];
@ -60,7 +60,7 @@ public:
double bias1 = biases[index - 1];
double bias2 = biases[index];
/// Предполагается, что условие estimate1 < estimate2 всегда выполнено.
/// It is assumed that the estimate1 < estimate2 condition is always satisfied.
double slope = (bias2 - bias1) / (estimate2 - estimate1);
return bias1 + slope * (raw_estimate - estimate1);
@ -68,7 +68,7 @@ public:
}
private:
/// Статические проверки.
/// Static checks.
using TRawEstimatesRef = decltype(BiasData::getRawEstimates());
using TRawEstimates = typename std::remove_reference<TRawEstimatesRef>::type;
@ -82,10 +82,10 @@ private:
"Bias estimator has inconsistent data");
};
/** Тривиальный случай HyperLogLogBiasEstimator: употребляется, если не хотим исправить
* погрешность. Это имеет смысль при маленьких значениях параметра точности, например 5 или 12.
* Тогда применяются поправки из оригинальной версии алгоритма HyperLogLog.
* См. "HyperLogLog: The analysis of a near-optimal cardinality estimation algorithm"
/** Trivial case of HyperLogLogBiasEstimator: used if we do not want to correct
* the error. This makes sense for small values of the precision parameter, for example 5 or 12.
* Then the corrections from the original version of the HyperLogLog algorithm are applied.
* See "HyperLogLog: The analysis of a near-optimal cardinality estimation algorithm"
* (P. Flajolet et al., AOFA '07: Proceedings of the 2007 International Conference on Analysis
* of Algorithms)
*/

View File

@ -9,10 +9,10 @@ namespace DB
{
/** Для маленького количества ключей - массив фиксированного размера "на стеке".
* Для большого - выделяется HyperLogLog.
* Смотрите также более практичную реализацию в CombinedCardinalityEstimator.h,
* где используется также хэш-таблица для множеств среднего размера.
/** For a small number of keys - an array of fixed size "on the stack".
* For large, HyperLogLog is allocated.
* See also the more practical implementation in CombinedCardinalityEstimator.h,
* where a hash table is also used for medium-sized sets.
*/
template
<
@ -39,7 +39,7 @@ private:
{
CurrentMemoryTracker::alloc(sizeof(large));
/// На время копирования данных из tiny, устанавливать значение large ещё нельзя (иначе оно перезатрёт часть данных).
/// At the time of copying data from `tiny`, setting the value of `large` is still not possible (otherwise it will overwrite some data).
Large * tmp_large = new Large;
for (const auto & x : small)
@ -99,7 +99,7 @@ public:
}
}
/// Можно вызывать только для пустого объекта.
/// You can only call for an empty object.
void read(DB::ReadBuffer & in)
{
bool is_large;

View File

@ -3,24 +3,24 @@
#include <Common/CounterInFile.h>
/** Позволяет получать авто-инкрементное число, храня его в файле.
* Предназначен для редких вызовов (не рассчитан на производительность).
/** Allows to get an auto-increment number, storing it in a file.
* Intended for rare calls (not designed for performance).
*/
class Increment
{
public:
/// path - имя файла, включая путь
/// path - the name of the file, including the path
Increment(const std::string & path_) : counter(path_) {}
/** Получить следующее число.
* Если параметр create_if_need не установлен в true, то
* в файле уже должно быть записано какое-нибудь число (если нет - создайте файл вручную с нулём).
/** Get the next number.
* If the `create_if_need` parameter is not set to true, then
* the file must already have a number written (if not - create the file manually with zero).
*
* Для защиты от race condition-ов между разными процессами, используются файловые блокировки.
* (Но при первом создании файла race condition возможен, так что лучше создать файл заранее.)
* To protect against race conditions between different processes, file locks are used.
* (But when the first file is created, the race condition is possible, so it's better to create the file in advance.)
*
* locked_callback вызывается при заблокированном файле со счетчиком. В него передается новое значение.
* locked_callback можно использовать, чтобы делать что-нибудь атомарно с увеличением счетчика (например, переименовывать файлы).
* `locked_callback` is called when the counter file is locked. A new value is passed to it.
* `locked_callback` can be used to do something atomically with the increment of the counter (for example, rename files).
*/
template <typename Callback>
UInt64 get(Callback && locked_callback, bool create_if_need = false)
@ -33,25 +33,25 @@ public:
return getBunch(1, create_if_need);
}
/// Посмотреть следующее значение.
/// Peek the next value without advancing the counter (implemented as getBunch with count = 0).
UInt64 peek(bool create_if_need = false)
{
    return getBunch(0, create_if_need);
}
/** Получить следующее число и увеличить счетчик на count.
* Если параметр create_if_need не установлен в true, то
* в файле уже должно быть записано какое-нибудь число (если нет - создайте файл вручную с нулём).
*
* Для защиты от race condition-ов между разными процессами, используются файловые блокировки.
* (Но при первом создании файла race condition возможен, так что лучше создать файл заранее.)
*/
/** Get the next number and increase the counter by `count`.
* If the `create_if_need` parameter is not set to true, then
* the file should already have a number written (if not - create the file manually with zero).
*
* To protect against race conditions between different processes, file locks are used.
* (But when the first file is created, the race condition is possible, so it's better to create the file in advance.)
*/
/// Reserve `count` consecutive numbers by advancing the file-backed counter,
/// and return the first number of the reserved range.
/// With count = 0 this peeks the next value without advancing the counter.
UInt64 getBunch(UInt64 count, bool create_if_need = false)
{
    return static_cast<UInt64>(counter.add(static_cast<Int64>(count), create_if_need) - count + 1);
}
/// Изменить путь к файлу.
/// Change the path to the file.
void setPath(std::string path_)
{
counter.setPath(path_);
@ -65,23 +65,3 @@ public:
private:
CounterInFile counter;
};
/** The same as Increment, but without storing the counter in a file.
  */
struct SimpleIncrement : private boost::noncopyable
{
    /// Current counter value; atomic, so concurrent set()/get() calls are safe.
    std::atomic<UInt64> value;

    SimpleIncrement(UInt64 start = 0) : value(start) {}

    /// Reset the counter; the next get() will return new_value + 1.
    void set(UInt64 new_value)
    {
        value = new_value;
    }

    /// Atomically increment the counter and return the new value.
    UInt64 get()
    {
        return ++value;
    }
};

View File

@ -4,10 +4,11 @@
#include <Poco/Util/AbstractConfiguration.h>
#include <map>
namespace DB
{
/** Раскрывает в строке макросы из конфига.
/** Apply substitutions from the macros in config to the string.
*/
class Macros
{
@ -15,8 +16,8 @@ public:
Macros();
Macros(const Poco::Util::AbstractConfiguration & config, const String & key);
/** Заменить в строке подстроки вида {macro_name} на значение для macro_name, полученное из конфига.
* level - уровень рекурсии.
/** Replace the substring of the form {macro_name} with the value for macro_name, obtained from the config file.
* level - the level of recursion.
*/
String expand(const String & s, size_t level = 0) const;

View File

@ -102,10 +102,10 @@ public:
};
/** Объект MemoryTracker довольно трудно протащить во все места, где выделяются существенные объёмы памяти.
* Поэтому, используется thread-local указатель на используемый MemoryTracker или nullptr, если его не нужно использовать.
* Этот указатель выставляется, когда в данном потоке следует отслеживать потребление памяти.
* Таким образом, его нужно всего-лишь протащить во все потоки, в которых обрабатывается один запрос.
/** The MemoryTracker object is quite difficult to pass to all places where significant amounts of memory are allocated.
* Therefore, a thread-local pointer to used MemoryTracker is set, or nullptr if MemoryTracker does not need to be used.
* This pointer is set when memory consumption is monitored in current thread.
* So, you just need to pass it to all the threads that handle one request.
*/
extern __thread MemoryTracker * current_memory_tracker;

View File

@ -12,20 +12,22 @@
#endif
/** Использует два способа оптимизации регулярного выражения:
* 1. Если регулярное выражение является тривиальным (сводится к поиску подстроки в строке),
* то заменяет поиск на strstr или strcasestr.
* 2. Если регулярное выражение содержит безальтернативную подстроку достаточной длины,
* то перед проверкой используется strstr или strcasestr достаточной длины;
* регулярное выражение проверяется полностью только если подстрока найдена.
* 3. В остальных случаях, используется движок re2.
/** Uses two ways to optimize a regular expression:
* 1. If the regular expression is trivial (reduces to finding a substring in a string),
* then replaces the search with strstr or strcasestr.
* 2. If the regular expression contains a non-alternative substring of sufficient length,
* then before testing, strstr or strcasestr of sufficient length is used;
* regular expression is only fully checked if a substring is found.
* 3. In other cases, the re2 engine is used.
*
* Это имеет смысл, так как strstr и strcasestr в libc под Linux хорошо оптимизированы.
* This makes sense, since strstr and strcasestr in libc for Linux are well optimized.
*
* Подходит, если одновременно выполнены следующие условия:
* - если в большинстве вызовов, регулярное выражение не матчится;
* - если регулярное выражение совместимо с движком re2;
* - можете использовать на свой риск, так как, возможно, не все случаи учтены.
* Suitable if the following conditions are simultaneously met:
* - if in most calls, the regular expression does not match;
* - if the regular expression is compatible with the re2 engine;
* - you can use at your own risk, since, probably, not all cases are taken into account.
*
* NOTE: Multi-character metasymbols such as \Pl are handled incorrectly.
*/
namespace OptimizedRegularExpressionDetails
@ -82,7 +84,7 @@ public:
unsigned getNumberOfSubpatterns() const { return number_of_subpatterns; }
/// Получить регексп re2 или nullptr, если шаблон тривиален (для вывода в лог).
/// Get the regexp re2 or nullptr if the pattern is trivial (for output to the log).
const std::unique_ptr<RegexType>& getRE2() const { return re2; }
static void analyze(const std::string & regexp_, std::string & required_substring, bool & is_trivial, bool & required_substring_is_prefix);
@ -105,4 +107,4 @@ private:
using OptimizedRegularExpression = OptimizedRegularExpressionImpl<true>;
#include "OptimizedRegularExpression.inl"
#include "OptimizedRegularExpression.inl.h"

View File

@ -1,431 +0,0 @@
#include <iostream>
#include <Poco/Exception.h>
#include <Common/OptimizedRegularExpression.h>
#define MIN_LENGTH_FOR_STRSTR 3
#define MAX_SUBPATTERNS 5
/** Scan the pattern and extract: whether it is trivial (a plain substring search),
  * a mandatory ("non-alternative") substring that any match must contain,
  * and whether that substring is a prefix of any match.
  */
template <bool b>
void OptimizedRegularExpressionImpl<b>::analyze(
    const std::string & regexp,
    std::string & required_substring,
    bool & is_trivial,
    bool & required_substring_is_prefix)
{
    /** The expression is trivial if all its metacharacters are escaped.
      * A non-alternative string is
      *  a string outside of brackets,
      *  in which all metacharacters are escaped,
      *  and also if there is no '|' outside of brackets,
      *  and substrings like http:// or www. are avoided.
      */
    const char * begin = regexp.data();
    const char * pos = begin;
    const char * end = regexp.data() + regexp.size();
    int depth = 0;
    is_trivial = true;
    required_substring_is_prefix = false;
    required_substring.clear();
    bool has_alternative_on_depth_0 = false;

    /// A substring together with its position in the pattern.
    typedef std::pair<std::string, size_t> Substring;
    typedef std::vector<Substring> Substrings;

    Substrings trivial_substrings(1);
    Substring * last_substring = &trivial_substrings.back();

    bool in_curly_braces = false;
    bool in_square_braces = false;

    while (pos != end)
    {
        switch (*pos)
        {
            case '\0':
                pos = end;
                break;

            case '\\':
            {
                ++pos;
                if (pos == end)
                    break;

                switch (*pos)
                {
                    case '|': case '(': case ')': case '^': case '$': case '.': case '[': case '?': case '*': case '+': case '{':
                        /// An escaped metacharacter is a literal character: it may extend the current substring.
                        if (depth == 0 && !in_curly_braces && !in_square_braces)
                        {
                            if (last_substring->first.empty())
                                last_substring->second = pos - begin;
                            last_substring->first.push_back(*pos);
                        }
                        break;
                    default:
                        /// all other escape sequences are not supported
                        is_trivial = false;
                        if (!last_substring->first.empty())
                        {
                            trivial_substrings.resize(trivial_substrings.size() + 1);
                            last_substring = &trivial_substrings.back();
                        }
                        break;
                }

                ++pos;
                break;
            }

            case '|':
                if (depth == 0)
                    has_alternative_on_depth_0 = true;
                is_trivial = false;
                if (!in_square_braces && !last_substring->first.empty())
                {
                    trivial_substrings.resize(trivial_substrings.size() + 1);
                    last_substring = &trivial_substrings.back();
                }
                ++pos;
                break;

            case '(':
                if (!in_square_braces)
                {
                    ++depth;
                    is_trivial = false;
                    if (!last_substring->first.empty())
                    {
                        trivial_substrings.resize(trivial_substrings.size() + 1);
                        last_substring = &trivial_substrings.back();
                    }
                }
                ++pos;
                break;

            case '[':
                in_square_braces = true;
                ++depth;
                is_trivial = false;
                if (!last_substring->first.empty())
                {
                    trivial_substrings.resize(trivial_substrings.size() + 1);
                    last_substring = &trivial_substrings.back();
                }
                ++pos;
                break;

            case ']':
                /// ']' outside a character class is an ordinary character.
                if (!in_square_braces)
                    goto ordinary;

                in_square_braces = false;
                --depth;
                is_trivial = false;
                if (!last_substring->first.empty())
                {
                    trivial_substrings.resize(trivial_substrings.size() + 1);
                    last_substring = &trivial_substrings.back();
                }
                ++pos;
                break;

            case ')':
                if (!in_square_braces)
                {
                    --depth;
                    is_trivial = false;
                    if (!last_substring->first.empty())
                    {
                        trivial_substrings.resize(trivial_substrings.size() + 1);
                        last_substring = &trivial_substrings.back();
                    }
                }
                ++pos;
                break;

            case '^': case '$': case '.': case '+':
                is_trivial = false;
                if (!last_substring->first.empty() && !in_square_braces)
                {
                    trivial_substrings.resize(trivial_substrings.size() + 1);
                    last_substring = &trivial_substrings.back();
                }
                ++pos;
                break;

            /// Quantifiers that allow a zero number of occurrences.
            case '{':
                in_curly_braces = true;
                /// NOTE: intentional fallthrough — '{' is treated like '?'/'*' for the preceding character.
            case '?': case '*':
                is_trivial = false;
                if (!last_substring->first.empty() && !in_square_braces)
                {
                    /// Drop the last collected character: the quantifier may make it optional.
                    last_substring->first.resize(last_substring->first.size() - 1);
                    trivial_substrings.resize(trivial_substrings.size() + 1);
                    last_substring = &trivial_substrings.back();
                }
                ++pos;
                break;

            case '}':
                if (!in_curly_braces)
                    goto ordinary;

                in_curly_braces = false;
                ++pos;
                break;

            ordinary:   /// An ordinary, not escaped character.
            default:
                if (depth == 0 && !in_curly_braces && !in_square_braces)
                {
                    if (last_substring->first.empty())
                        last_substring->second = pos - begin;
                    last_substring->first.push_back(*pos);
                }
                ++pos;
                break;
        }
    }

    if (last_substring && last_substring->first.empty())
        trivial_substrings.pop_back();

    if (!is_trivial)
    {
        if (!has_alternative_on_depth_0)
        {
            /** Choose the non-alternative substring of the maximum length among the prefixes,
              *  or the non-alternative substring of the maximum length.
              */
            size_t max_length = 0;
            Substrings::const_iterator candidate_it = trivial_substrings.begin();
            for (Substrings::const_iterator it = trivial_substrings.begin(); it != trivial_substrings.end(); ++it)
            {
                if (((it->second == 0 && candidate_it->second != 0)
                        || ((it->second == 0) == (candidate_it->second == 0) && it->first.size() > max_length))
                    /// Domain-specific tuning: never pick typical URL prefixes as the required substring.
                    && (it->first.size() > strlen("://") || strncmp(it->first.data(), "://", strlen("://")))
                    && (it->first.size() > strlen("http://") || strncmp(it->first.data(), "http", strlen("http")))
                    && (it->first.size() > strlen("www.") || strncmp(it->first.data(), "www", strlen("www")))
                    && (it->first.size() > strlen("Windows ") || strncmp(it->first.data(), "Windows ", strlen("Windows "))))
                {
                    max_length = it->first.size();
                    candidate_it = it;
                }
            }

            if (max_length >= MIN_LENGTH_FOR_STRSTR)
            {
                required_substring = candidate_it->first;
                required_substring_is_prefix = candidate_it->second == 0;
            }
        }
    }
    else
    {
        /// Trivial pattern: the whole pattern is one literal substring.
        required_substring = trivial_substrings.front().first;
        required_substring_is_prefix = trivial_substrings.front().second == 0;
    }

/*    std::cerr
        << "regexp: " << regexp
        << ", is_trivial: " << is_trivial
        << ", required_substring: " << required_substring
        << ", required_substring_is_prefix: " << required_substring_is_prefix
        << std::endl;*/
}
/** Construct the optimized regexp: analyze the pattern for a trivial form / required substring,
  * validate the option bits, and compile an re2 engine only when the pattern is non-trivial.
  * Throws Poco::Exception on unsupported options, re2 compilation failure, or too many subpatterns.
  */
template <bool b>
OptimizedRegularExpressionImpl<b>::OptimizedRegularExpressionImpl(const std::string & regexp_, int options)
{
    analyze(regexp_, required_substring, is_trivial, required_substring_is_prefix);

    /// Only 3 options are supported.
    if (options & (~(RE_CASELESS | RE_NO_CAPTURE | RE_DOT_NL)))
        throw Poco::Exception("OptimizedRegularExpression: Unsupported option.");

    is_case_insensitive = options & RE_CASELESS;
    bool is_no_capture = options & RE_NO_CAPTURE;
    bool is_dot_nl = options & RE_DOT_NL;

    number_of_subpatterns = 0;
    if (!is_trivial)
    {
        /// Compile the re2 regular expression.
        /// Renamed from `options` to avoid shadowing the `int options` parameter above.
        typename RegexType::Options regexp_options;

        if (is_case_insensitive)
            regexp_options.set_case_sensitive(false);

        if (is_dot_nl)
            regexp_options.set_dot_nl(true);

        re2 = std::make_unique<RegexType>(regexp_, regexp_options);
        if (!re2->ok())
            throw Poco::Exception("OptimizedRegularExpression: cannot compile re2: " + regexp_ + ", error: " + re2->error());

        if (!is_no_capture)
        {
            number_of_subpatterns = re2->NumberOfCapturingGroups();
            if (number_of_subpatterns > MAX_SUBPATTERNS)
                throw Poco::Exception("OptimizedRegularExpression: too many subpatterns in regexp: " + regexp_);
        }
    }
}
/** Test whether the subject matches the pattern anywhere (unanchored).
  * The subject must be null-terminated, since strstr/strcasestr are used for the substring pre-filter.
  */
template <bool b>
bool OptimizedRegularExpressionImpl<b>::match(const char * subject, size_t subject_size) const
{
    if (is_trivial)
    {
        /// The pattern reduces to a plain substring search.
        const char * found = is_case_insensitive
            ? strcasestr(subject, required_substring.data())
            : strstr(subject, required_substring.data());
        return found != nullptr;
    }

    /// Fast pre-filter: if a mandatory substring is known and absent, the regexp cannot match.
    if (!required_substring.empty())
    {
        const char * found = is_case_insensitive
            ? strcasestr(subject, required_substring.data())
            : strstr(subject, required_substring.data());
        if (found == nullptr)
            return false;
    }

    return re2->Match(StringPieceType(subject, subject_size), 0, subject_size, RegexType::UNANCHORED, nullptr, 0);
}
/** Find the first match of the pattern in the subject and report its offset and length in `match`.
  * Returns true if a match was found. The subject must be null-terminated
  * (strstr/strcasestr are used for the trivial case and the substring pre-filter).
  */
template <bool b>
bool OptimizedRegularExpressionImpl<b>::match(const char * subject, size_t subject_size, Match & match) const
{
    /// Locate the required substring, case-sensitively or not.
    auto find_substring = [this](const char * haystack) -> const char *
    {
        return is_case_insensitive
            ? strcasestr(haystack, required_substring.data())
            : strstr(haystack, required_substring.data());
    };

    if (is_trivial)
    {
        /// The pattern is a plain substring: report its first occurrence.
        const char * found = find_substring(subject);
        if (found == nullptr)
            return false;

        match.offset = found - subject;
        match.length = required_substring.size();
        return true;
    }

    /// Pre-filter by the mandatory substring before running the regexp engine.
    if (!required_substring.empty() && find_substring(subject) == nullptr)
        return false;

    StringPieceType piece;
    if (!RegexType::PartialMatch(StringPieceType(subject, subject_size), *re2, &piece))
        return false;

    match.offset = piece.data() - subject;
    match.length = piece.length();
    return true;
}
/** Find the first match and its capturing subgroups.
  * Fills `matches` with up to `limit` entries (capped at number_of_subpatterns + 1,
  * i.e. the whole match plus one entry per capturing group) and returns how many were filled.
  * Unmatched groups get offset = std::string::npos and length = 0.
  * The subject must be null-terminated (strstr/strcasestr are used for the pre-filter).
  */
template <bool b>
unsigned OptimizedRegularExpressionImpl<b>::match(const char * subject, size_t subject_size, MatchVec & matches, unsigned limit) const
{
    matches.clear();

    if (limit == 0)
        return 0;

    /// Never ask re2 for more groups than the pattern has (whole match + subpatterns).
    if (limit > number_of_subpatterns + 1)
        limit = number_of_subpatterns + 1;

    if (is_trivial)
    {
        /// Trivial pattern: a single "match" is the first occurrence of the literal substring.
        const char * pos;
        if (is_case_insensitive)
            pos = strcasestr(subject, required_substring.data());
        else
            pos = strstr(subject, required_substring.data());

        if (pos == nullptr)
            return 0;
        else
        {
            Match match;
            match.offset = pos - subject;
            match.length = required_substring.size();
            matches.push_back(match);
            return 1;
        }
    }
    else
    {
        /// Pre-filter by the mandatory substring before running the regexp engine.
        if (!required_substring.empty())
        {
            const char * pos;
            if (is_case_insensitive)
                pos = strcasestr(subject, required_substring.data());
            else
                pos = strstr(subject, required_substring.data());

            if (nullptr == pos)
                return 0;
        }

        StringPieceType pieces[MAX_SUBPATTERNS];

        if (!re2->Match(StringPieceType(subject, subject_size), 0, subject_size, RegexType::UNANCHORED, pieces, limit))
            return 0;
        else
        {
            matches.resize(limit);
            for (size_t i = 0; i < limit; ++i)
            {
                if (pieces[i] != nullptr)
                {
                    matches[i].offset = pieces[i].data() - subject;
                    matches[i].length = pieces[i].length();
                }
                else
                {
                    /// Group did not participate in the match.
                    matches[i].offset = std::string::npos;
                    matches[i].length = 0;
                }
            }
            return limit;
        }
    }
}
#undef MIN_LENGTH_FOR_STRSTR
#undef MAX_SUBPATTERNS

View File

@ -0,0 +1,433 @@
#include <iostream>
#include <Poco/Exception.h>
#include <Common/OptimizedRegularExpression.h>
#define MIN_LENGTH_FOR_STRSTR 3
#define MAX_SUBPATTERNS 5
/** Analyze the pattern and derive data for fast pre-filtering:
  *  - is_trivial: the whole regexp is a plain literal string (every metacharacter is escaped);
  *  - required_substring: a literal that must occur in any match (empty if none of sufficient length was found);
  *  - required_substring_is_prefix: whether that literal starts at position 0 of the pattern.
  */
template <bool thread_safe>
void OptimizedRegularExpressionImpl<thread_safe>::analyze(
    const std::string & regexp,
    std::string & required_substring,
    bool & is_trivial,
    bool & required_substring_is_prefix)
{
    /** The expression is trivial if all the metacharacters in it are escaped.
      * The non-alternative string is
      *  a string outside parentheses,
      *  in which all metacharacters are escaped,
      *  and also if there are no '|' outside the brackets,
      *  and also avoid substrings of the form `http://` or `www` and some other
      *   (this is the hack for typical use case in Yandex.Metrica).
      */
    const char * begin = regexp.data();
    const char * pos = begin;
    const char * end = regexp.data() + regexp.size();
    int depth = 0;          /// Nesting level of '(' / '[' groups.
    is_trivial = true;
    required_substring_is_prefix = false;
    required_substring.clear();
    bool has_alternative_on_depth_0 = false;

    /// Substring with a position (offset of its first character in the pattern).
    using Substring = std::pair<std::string, size_t>;
    using Substrings = std::vector<Substring>;

    Substrings trivial_substrings(1);
    Substring * last_substring = &trivial_substrings.back();

    bool in_curly_braces = false;
    bool in_square_braces = false;

    /// Single pass over the pattern; literal runs are accumulated into trivial_substrings,
    /// any unescaped metacharacter terminates the current run.
    while (pos != end)
    {
        switch (*pos)
        {
            case '\0':
                pos = end;
                break;

            case '\\':
            {
                ++pos;
                if (pos == end)
                    break;

                switch (*pos)
                {
                    /// An escaped metacharacter is an ordinary literal character.
                    case '|': case '(': case ')': case '^': case '$': case '.': case '[': case '?': case '*': case '+': case '{':
                        if (depth == 0 && !in_curly_braces && !in_square_braces)
                        {
                            if (last_substring->first.empty())
                                last_substring->second = pos - begin;
                            last_substring->first.push_back(*pos);
                        }
                        break;
                    default:
                        /// all other escape sequences are not supported
                        is_trivial = false;
                        if (!last_substring->first.empty())
                        {
                            trivial_substrings.resize(trivial_substrings.size() + 1);
                            last_substring = &trivial_substrings.back();
                        }
                        break;
                }

                ++pos;
                break;
            }

            case '|':
                if (depth == 0)
                    has_alternative_on_depth_0 = true;
                is_trivial = false;
                if (!in_square_braces && !last_substring->first.empty())
                {
                    trivial_substrings.resize(trivial_substrings.size() + 1);
                    last_substring = &trivial_substrings.back();
                }
                ++pos;
                break;

            case '(':
                if (!in_square_braces)
                {
                    ++depth;
                    is_trivial = false;
                    if (!last_substring->first.empty())
                    {
                        trivial_substrings.resize(trivial_substrings.size() + 1);
                        last_substring = &trivial_substrings.back();
                    }
                }
                ++pos;
                break;

            case '[':
                in_square_braces = true;
                ++depth;
                is_trivial = false;
                if (!last_substring->first.empty())
                {
                    trivial_substrings.resize(trivial_substrings.size() + 1);
                    last_substring = &trivial_substrings.back();
                }
                ++pos;
                break;

            case ']':
                if (!in_square_braces)
                    goto ordinary;

                in_square_braces = false;
                --depth;
                is_trivial = false;
                if (!last_substring->first.empty())
                {
                    trivial_substrings.resize(trivial_substrings.size() + 1);
                    last_substring = &trivial_substrings.back();
                }
                ++pos;
                break;

            case ')':
                if (!in_square_braces)
                {
                    --depth;
                    is_trivial = false;
                    if (!last_substring->first.empty())
                    {
                        trivial_substrings.resize(trivial_substrings.size() + 1);
                        last_substring = &trivial_substrings.back();
                    }
                }
                ++pos;
                break;

            case '^': case '$': case '.': case '+':
                is_trivial = false;
                if (!last_substring->first.empty() && !in_square_braces)
                {
                    trivial_substrings.resize(trivial_substrings.size() + 1);
                    last_substring = &trivial_substrings.back();
                }
                ++pos;
                break;

            /// Quantifiers that allow a zero number of occurrences.
            case '{':
                in_curly_braces = true;
                /// NOTE: intentional fallthrough into '?' / '*' handling — '{' starts a quantifier like {0,3}.
            case '?': case '*':
                is_trivial = false;
                if (!last_substring->first.empty() && !in_square_braces)
                {
                    /// The previous character may occur zero times — drop it from the literal run.
                    last_substring->first.resize(last_substring->first.size() - 1);
                    trivial_substrings.resize(trivial_substrings.size() + 1);
                    last_substring = &trivial_substrings.back();
                }
                ++pos;
                break;

            case '}':
                if (!in_curly_braces)
                    goto ordinary;

                in_curly_braces = false;
                ++pos;
                break;

            ordinary:   /// Normal, not escaped symbol.
            default:
                if (depth == 0 && !in_curly_braces && !in_square_braces)
                {
                    if (last_substring->first.empty())
                        last_substring->second = pos - begin;
                    last_substring->first.push_back(*pos);
                }
                ++pos;
                break;
        }
    }

    if (last_substring && last_substring->first.empty())
        trivial_substrings.pop_back();

    if (!is_trivial)
    {
        if (!has_alternative_on_depth_0)
        {
            /** We choose the non-alternative substring of the maximum length, among the prefixes,
              * or a non-alternative substring of maximum length.
              */
            size_t max_length = 0;
            Substrings::const_iterator candidate_it = trivial_substrings.begin();
            for (Substrings::const_iterator it = trivial_substrings.begin(); it != trivial_substrings.end(); ++it)
            {
                if (((it->second == 0 && candidate_it->second != 0)
                        || ((it->second == 0) == (candidate_it->second == 0) && it->first.size() > max_length))
                    /// Tuning for typical usage domain: reject common web boilerplate like "http", "www", "://".
                    && (it->first.size() > strlen("://") || strncmp(it->first.data(), "://", strlen("://")))
                    && (it->first.size() > strlen("http://") || strncmp(it->first.data(), "http", strlen("http")))
                    && (it->first.size() > strlen("www.") || strncmp(it->first.data(), "www", strlen("www")))
                    && (it->first.size() > strlen("Windows ") || strncmp(it->first.data(), "Windows ", strlen("Windows "))))
                {
                    max_length = it->first.size();
                    candidate_it = it;
                }
            }

            if (max_length >= MIN_LENGTH_FOR_STRSTR)
            {
                required_substring = candidate_it->first;
                required_substring_is_prefix = candidate_it->second == 0;
            }
        }
    }
    else if (!trivial_substrings.empty())
    {
        /// The whole pattern is a single literal string.
        /// The emptiness check fixes undefined behaviour for an empty pattern:
        /// the only (empty) substring was popped above, and front() on an empty vector is UB.
        /// For an empty pattern, required_substring stays empty (matches everything).
        required_substring = trivial_substrings.front().first;
        required_substring_is_prefix = trivial_substrings.front().second == 0;
    }

/*    std::cerr
        << "regexp: " << regexp
        << ", is_trivial: " << is_trivial
        << ", required_substring: " << required_substring
        << ", required_substring_is_prefix: " << required_substring_is_prefix
        << std::endl;*/
}
/** Construct from a pattern and an (optional) bitmask of RE_* options.
  * Throws Poco::Exception on an unsupported option, an uncompilable pattern,
  * or a pattern with more than MAX_SUBPATTERNS capturing groups.
  * For trivial patterns the re2 engine is not created at all.
  */
template <bool thread_safe>
OptimizedRegularExpressionImpl<thread_safe>::OptimizedRegularExpressionImpl(const std::string & regexp_, int options)
{
    analyze(regexp_, required_substring, is_trivial, required_substring_is_prefix);

    /// Just three following options are supported.
    if (options & (~(RE_CASELESS | RE_NO_CAPTURE | RE_DOT_NL)))
        throw Poco::Exception("OptimizedRegularExpression: Unsupported option.");

    is_case_insensitive = options & RE_CASELESS;
    bool is_no_capture = options & RE_NO_CAPTURE;
    bool is_dot_nl = options & RE_DOT_NL;

    number_of_subpatterns = 0;
    if (!is_trivial)
    {
        /// Compile the re2 regular expression.
        /// Named `regexp_options` so it does not shadow the `options` function parameter.
        typename RegexType::Options regexp_options;

        if (is_case_insensitive)
            regexp_options.set_case_sensitive(false);

        if (is_dot_nl)
            regexp_options.set_dot_nl(true);

        re2 = std::make_unique<RegexType>(regexp_, regexp_options);
        if (!re2->ok())
            throw Poco::Exception("OptimizedRegularExpression: cannot compile re2: " + regexp_ + ", error: " + re2->error());

        if (!is_no_capture)
        {
            number_of_subpatterns = re2->NumberOfCapturingGroups();
            if (number_of_subpatterns > MAX_SUBPATTERNS)
                throw Poco::Exception("OptimizedRegularExpression: too many subpatterns in regexp: " + regexp_);
        }
    }
}
/** Whole-string check: does the subject contain a match of the pattern?
  * Submatches are not extracted. NOTE: strstr/strcasestr rely on `subject` being NUL-terminated.
  */
template <bool thread_safe>
bool OptimizedRegularExpressionImpl<thread_safe>::match(const char * subject, size_t subject_size) const
{
    if (is_trivial)
    {
        /// The pattern is a plain literal: a substring search decides the result.
        const char * found = is_case_insensitive
            ? strcasestr(subject, required_substring.data())
            : strstr(subject, required_substring.data());
        return found != nullptr;
    }

    /// Fast rejection: the required literal must be present before running re2.
    if (!required_substring.empty())
    {
        const char * found = is_case_insensitive
            ? strcasestr(subject, required_substring.data())
            : strstr(subject, required_substring.data());
        if (found == nullptr)
            return false;
    }

    return re2->Match(StringPieceType(subject, subject_size), 0, subject_size, RegexType::UNANCHORED, nullptr, 0);
}
/** Find the first match and report its offset/length via `match`.
  * Returns true if a match was found. `match` is untouched on failure.
  */
template <bool thread_safe>
bool OptimizedRegularExpressionImpl<thread_safe>::match(const char * subject, size_t subject_size, Match & match) const
{
    if (is_trivial)
    {
        /// Literal pattern: the match is the first occurrence of the literal itself.
        const char * found = is_case_insensitive
            ? strcasestr(subject, required_substring.data())
            : strstr(subject, required_substring.data());
        if (found == nullptr)
            return false;

        match.offset = found - subject;
        match.length = required_substring.size();
        return true;
    }

    /// Fast rejection via the required literal before invoking re2.
    if (!required_substring.empty())
    {
        const char * found = is_case_insensitive
            ? strcasestr(subject, required_substring.data())
            : strstr(subject, required_substring.data());
        if (found == nullptr)
            return false;
    }

    StringPieceType piece;
    if (!RegexType::PartialMatch(StringPieceType(subject, subject_size), *re2, &piece))
        return false;

    match.offset = piece.data() - subject;
    match.length = piece.length();
    return true;
}
/** Find the first match and its capturing subpatterns.
  * `matches[0]` is the whole match, `matches[i]` (i >= 1) the i-th capture group;
  * unmatched groups get offset == std::string::npos. Returns the number of entries
  * written (0 if no match). At most `limit` entries are produced.
  */
template <bool thread_safe>
unsigned OptimizedRegularExpressionImpl<thread_safe>::match(const char * subject, size_t subject_size, MatchVec & matches, unsigned limit) const
{
    matches.clear();

    if (limit == 0)
        return 0;

    /// One entry for the whole match plus one per capturing subpattern.
    if (limit > number_of_subpatterns + 1)
        limit = number_of_subpatterns + 1;

    if (is_trivial)
    {
        const char * pos;
        if (is_case_insensitive)
            pos = strcasestr(subject, required_substring.data());
        else
            pos = strstr(subject, required_substring.data());

        if (pos == nullptr)
            return 0;
        else
        {
            Match match;
            match.offset = pos - subject;
            match.length = required_substring.size();
            matches.push_back(match);
            return 1;
        }
    }
    else
    {
        if (!required_substring.empty())
        {
            const char * pos;
            if (is_case_insensitive)
                pos = strcasestr(subject, required_substring.data());
            else
                pos = strstr(subject, required_substring.data());

            if (nullptr == pos)
                return 0;
        }

        /// +1: pieces[0] holds the whole match, followed by up to MAX_SUBPATTERNS capture groups.
        /// The constructor permits exactly MAX_SUBPATTERNS groups, so `limit` may reach
        /// MAX_SUBPATTERNS + 1; an array of only MAX_SUBPATTERNS elements would be overrun
        /// by re2->Match writing `limit` pieces (stack buffer overflow).
        StringPieceType pieces[MAX_SUBPATTERNS + 1];

        if (!re2->Match(StringPieceType(subject, subject_size), 0, subject_size, RegexType::UNANCHORED, pieces, limit))
            return 0;
        else
        {
            matches.resize(limit);
            for (size_t i = 0; i < limit; ++i)
            {
                if (pieces[i] != nullptr)
                {
                    matches[i].offset = pieces[i].data() - subject;
                    matches[i].length = pieces[i].length();
                }
                else
                {
                    /// This capture group did not participate in the match.
                    matches[i].offset = std::string::npos;
                    matches[i].length = 0;
                }
            }
            return limit;
        }
    }
}
#undef MIN_LENGTH_FOR_STRSTR
#undef MAX_SUBPATTERNS

View File

@ -19,46 +19,46 @@
namespace DB
{
/** Динамический массив для POD-типов.
* Предназначен для небольшого количества больших массивов (а не большого количества маленьких).
* А точнее - для использования в ColumnVector.
* Отличается от std::vector тем, что не инициализирует элементы.
/** A dynamic array for POD types.
* Designed for a small number of large arrays (rather than a lot of small ones).
* To be more precise - for use in ColumnVector.
* It differs from std::vector in that it does not initialize the elements.
*
* Сделан некопируемым, чтобы не было случайных копий. Скопировать данные можно с помощью метода assign.
* Made noncopyable so that there are no accidential copies. You can copy the data using `assign` method.
*
* Поддерживается только часть интерфейса std::vector.
* Only part of the std::vector interface is supported.
*
* Конструктор по-умолчанию создаёт пустой объект, который не выделяет память.
* Затем выделяется память минимум в INITIAL_SIZE байт.
* The default constructor creates an empty object that does not allocate memory.
* Then the memory is allocated at least INITIAL_SIZE bytes.
*
* Если вставлять элементы push_back-ом, не делая reserve, то PODArray примерно в 2.5 раза быстрее std::vector.
* If you insert elements with push_back, without making a `reserve`, then PODArray is about 2.5 times faster than std::vector.
*
* Шаблонный параметр pad_right - всегда выделять в конце массива столько неиспользуемых байт.
* Может использоваться для того, чтобы делать оптимистичное чтение, запись, копирование невыровненными SIMD-инструкциями.
* The template parameter `pad_right` - always allocate at the end of the array as many unused bytes.
* Can be used to make optimistic reading, writing, copying with unaligned SIMD instructions.
*/
template <typename T, size_t INITIAL_SIZE = 4096, typename TAllocator = Allocator<false>, size_t pad_right_ = 0>
class PODArray : private boost::noncopyable, private TAllocator /// empty base optimization
{
private:
/// Округление padding-а вверх до целого количества элементов, чтобы упростить арифметику.
/// Round padding up to an whole number of elements to simplify arithmetic.
static constexpr size_t pad_right = (pad_right_ + sizeof(T) - 1) / sizeof(T) * sizeof(T);
char * c_start = nullptr;
char * c_end = nullptr;
char * c_end_of_storage = nullptr; /// Не включает в себя pad_right.
char * c_start = nullptr;
char * c_end = nullptr;
char * c_end_of_storage = nullptr; /// Does not include pad_right.
T * t_start() { return reinterpret_cast<T *>(c_start); }
T * t_end() { return reinterpret_cast<T *>(c_end); }
T * t_end_of_storage() { return reinterpret_cast<T *>(c_end_of_storage); }
T * t_start() { return reinterpret_cast<T *>(c_start); }
T * t_end() { return reinterpret_cast<T *>(c_end); }
T * t_end_of_storage() { return reinterpret_cast<T *>(c_end_of_storage); }
const T * t_start() const { return reinterpret_cast<const T *>(c_start); }
const T * t_end() const { return reinterpret_cast<const T *>(c_end); }
const T * t_end_of_storage() const { return reinterpret_cast<const T *>(c_end_of_storage); }
const T * t_start() const { return reinterpret_cast<const T *>(c_start); }
const T * t_end() const { return reinterpret_cast<const T *>(c_end); }
const T * t_end_of_storage() const { return reinterpret_cast<const T *>(c_end_of_storage); }
/// Количество памяти, занимаемое num_elements элементов.
/// The amount of memory occupied by the num_elements of the elements.
static size_t byte_size(size_t num_elements) { return num_elements * sizeof(T); }
/// Минимальное количество памяти, которое нужно выделить для num_elements элементов, включая padding.
/// Minimum amount of memory to allocate for num_elements, including padding.
static size_t minimum_memory_for_elements(size_t num_elements) { return byte_size(num_elements) + pad_right; }
void alloc_for_num_elements(size_t num_elements)
@ -112,7 +112,7 @@ public:
size_t allocated_size() const { return c_end_of_storage - c_start + pad_right; }
/// Просто typedef нельзя, так как возникает неоднозначность для конструкторов и функций assign.
/// You can not just use `typedef`, because there is ambiguity for the constructors and `assign` functions.
struct iterator : public boost::iterator_adaptor<iterator, T*>
{
iterator() {}
@ -173,16 +173,16 @@ public:
const T & operator[] (size_t n) const { return t_start()[n]; }
T & front() { return t_start()[0]; }
T & back() { return t_end()[-1]; }
T & back() { return t_end()[-1]; }
const T & front() const { return t_start()[0]; }
const T & back() const { return t_end()[-1]; }
iterator begin() { return t_start(); }
iterator end() { return t_end(); }
const_iterator begin() const { return t_start(); }
const_iterator end() const { return t_end(); }
const_iterator cbegin() const { return t_start(); }
const_iterator cend() const { return t_end(); }
iterator begin() { return t_start(); }
iterator end() { return t_end(); }
const_iterator begin() const { return t_start(); }
const_iterator end() const { return t_end(); }
const_iterator cbegin() const { return t_start(); }
const_iterator cend() const { return t_end(); }
void reserve(size_t n)
{
@ -209,7 +209,7 @@ public:
c_end = c_start + byte_size(n);
}
/// Как resize, но обнуляет новые элементы.
/// Same as resize, but zeroes new elements.
void resize_fill(size_t n)
{
size_t old_size = size();
@ -261,7 +261,7 @@ public:
c_end -= byte_size(1);
}
/// Не вставляйте в массив кусок самого себя. Потому что при ресайзе, итераторы на самого себя могут инвалидироваться.
/// Do not insert into the array a piece of itself. Because with the resize, the iterators on themselves can be invalidated.
template <typename It1, typename It2>
void insert(It1 from_begin, It2 from_end)
{
@ -458,7 +458,7 @@ void swap(PODArray<T, INITIAL_SIZE, TAllocator, pad_right_> & lhs, PODArray<T, I
lhs.swap(rhs);
}
/** Для столбцов. Padding-а хватает, чтобы читать и писать xmm-регистр по адресу последнего элемента. */
/** For columns. Padding is enough to read and write xmm-register at the address of the last element. */
template <typename T, size_t INITIAL_SIZE = 4096, typename TAllocator = Allocator<false>>
using PaddedPODArray = PODArray<T, INITIAL_SIZE, TAllocator, 15>;

View File

@ -8,8 +8,17 @@
#include <common/logger_useful.h>
#include <Common/Exception.h>
/** Класс, от которого можно унаследоваться и получить пул чего-нибудь. Используется для пулов соединений с БД.
* Наследник должен предоставить метод для создания нового объекта для помещения в пул.
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
}
}
/** A class from which you can inherit and get a pool of something. Used for database connection pools.
* Descendant class must provide a method for creating a new object to place in the pool.
*/
template <typename TObject>
@ -22,7 +31,7 @@ public:
private:
/** Объект с флагом, используется ли он сейчас. */
/** The object with the flag, whether it is currently used. */
struct PooledObject
{
PooledObject(ObjectPtr object_, PoolBase & pool_)
@ -37,8 +46,8 @@ private:
using Objects = std::vector<std::shared_ptr<PooledObject>>;
/** Помощник, который устанавливает флаг использования объекта, а в деструкторе - снимает,
* а также уведомляет о событии с помощью condvar-а.
/** The helper, which sets the flag for using the object, and in the destructor - removes,
* and also notifies the event using condvar.
*/
struct PoolEntryHelper
{
@ -54,36 +63,36 @@ private:
};
public:
/** То, что выдаётся пользователю. */
/** What is given to the user. */
class Entry
{
public:
friend class PoolBase<Object>;
Entry() {} /// Для отложенной инициализации.
Entry() {} /// For deferred initialization.
/** Объект Entry защищает ресурс от использования другим потоком.
* Следующие методы запрещены для rvalue, чтобы нельзя было написать подобное
*
* auto q = pool.Get()->query("SELECT .."); // Упс, после этой строчки Entry уничтожился
* q.execute(); // Кто-то еще может использовать этот Connection
*/
/** The `Entry` object protects the resource from being used by another thread.
* The following methods are forbidden for `rvalue`, so you can not write a similar to
*
* auto q = pool.Get()->query("SELECT .."); // Oops, after this line Entry was destroyed
* q.execute (); // Someone else can use this Connection
*/
Object * operator->() && = delete;
const Object * operator->() const && = delete;
Object & operator*() && = delete;
const Object & operator*() const && = delete;
Object * operator->() & { return &*data->data.object; }
const Object * operator->() const & { return &*data->data.object; }
Object & operator*() & { return *data->data.object; }
const Object & operator*() const & { return *data->data.object; }
Object * operator->() & { return &*data->data.object; }
const Object * operator->() const & { return &*data->data.object; }
Object & operator*() & { return *data->data.object; }
const Object & operator*() const & { return *data->data.object; }
bool isNull() const { return data == nullptr; }
PoolBase * getPool() const
{
if (!data)
throw DB::Exception("attempt to get pool from uninitialized entry");
throw DB::Exception("Attempt to get pool from uninitialized entry", DB::ErrorCodes::LOGICAL_ERROR);
return &data->data.pool;
}
@ -95,7 +104,7 @@ public:
virtual ~PoolBase() {}
/** Выделяет объект для работы. При timeout < 0 таймаут бесконечный. */
/** Allocates the object. Wait for free object in pool for 'timeout'. With 'timeout' < 0, the timeout is infinite. */
Entry get(Poco::Timespan::TimeDiff timeout)
{
std::unique_lock<std::mutex> lock(mutex);
@ -131,13 +140,13 @@ public:
}
private:
/** Максимальный размер пула. */
/** The maximum size of the pool. */
unsigned max_items;
/** Пул. */
/** Pool. */
Objects items;
/** Блокировка для доступа к пулу. */
/** Lock to access the pool. */
std::mutex mutex;
std::condition_variable available;
@ -151,7 +160,7 @@ protected:
items.reserve(max_items);
}
/** Создает новый объект для помещения в пул. */
/** Creates a new object to put into the pool. */
virtual ObjectPtr allocObject() = 0;
};

View File

@ -13,18 +13,18 @@
#include <Core/Defines.h>
/** Поразрядная сортировка, обладает следующей функциональностью:
* Может сортировать unsigned, signed числа, а также float-ы.
* Может сортировать массив элементов фиксированной длины, которые содержат что-то ещё кроме ключа.
* Настраиваемый размер разряда.
/** Radix sort, has the following functionality:
* Can sort unsigned, signed numbers, and floats.
* Can sort an array of fixed length elements that contain something else besides the key.
* Customizable radix size.
*
* LSB, stable.
* NOTE Для некоторых приложений имеет смысл добавить MSB-radix-sort,
* а также алгоритмы radix-select, radix-partial-sort, radix-get-permutation на его основе.
* NOTE For some applications it makes sense to add MSB-radix-sort,
* as well as radix-select, radix-partial-sort, radix-get-permutation algorithms based on it.
*/
/** Используется в качестве параметра шаблона. См. ниже.
/** Used as a template parameter. See below.
*/
struct RadixSortMallocAllocator
{
@ -40,16 +40,16 @@ struct RadixSortMallocAllocator
};
/** Преобразование, которое переводит битовое представление ключа в такое целое беззнаковое число,
* что отношение порядка над ключами будет соответствовать отношению порядка над полученными беззнаковыми числами.
* Для float-ов это преобразование делает следующее:
* если выставлен знаковый бит, то переворачивает все остальные биты.
* При этом, NaN-ы оказываются больше всех нормальных чисел.
/** A transformation that transforms the bit representation of a key into an unsigned integer number,
* that the order relation over the keys will match the order relation over the obtained unsigned numbers.
* For floats this conversion does the following:
* if the signed bit is set, it flips all other bits.
* In this case, NaN-s are bigger than all normal numbers.
*/
template <typename KeyBits>
struct RadixSortFloatTransform
{
/// Стоит ли записывать результат в память, или лучше делать его каждый раз заново?
/// Is it worth writing the result in memory, or is it better to do calculation every time again?
static constexpr bool transform_is_simple = false;
static KeyBits forward(KeyBits x)
@ -67,24 +67,24 @@ struct RadixSortFloatTransform
template <typename Float>
struct RadixSortFloatTraits
{
using Element = Float; /// Тип элемента. Это может быть структура с ключём и ещё каким-то payload-ом. Либо просто ключ.
using Key = Float; /// Ключ, по которому нужно сортировать.
using CountType = uint32_t; /// Тип для подсчёта гистограмм. В случае заведомо маленького количества элементов, может быть меньше чем size_t.
using Element = Float; /// The type of the element. It can be a structure with a key and some other payload. Or just a key.
using Key = Float; /// The key to sort.
using CountType = uint32_t; /// Type for calculating histograms. In the case of a known small number of elements, it can be less than size_t.
/// Тип, в который переводится ключ, чтобы делать битовые операции. Это UInt такого же размера, как ключ.
/// The type to which the key is transformed to do bit operations. This UInt is the same size as the key.
using KeyBits = typename std::conditional<sizeof(Float) == 8, uint64_t, uint32_t>::type;
static constexpr size_t PART_SIZE_BITS = 8; /// Какими кусочками ключа в количестве бит делать один проход - перестановку массива.
static constexpr size_t PART_SIZE_BITS = 8; /// With what pieces of the key, in bits, to do one pass - reshuffle of the array.
/// Преобразования ключа в KeyBits такое, что отношение порядка над ключём соответствует отношению порядка над KeyBits.
/// Converting a key into KeyBits is such that the order relation over the key corresponds to the order relation over KeyBits.
using Transform = RadixSortFloatTransform<KeyBits>;
/// Объект с функциями allocate и deallocate.
/// Может быть использован, например, чтобы выделить память для временного массива на стеке.
/// Для этого сам аллокатор создаётся на стеке.
/// An object with the functions allocate and deallocate.
/// Can be used, for example, to allocate memory for a temporary array on the stack.
/// To do this, the allocator itself is created on the stack.
using Allocator = RadixSortMallocAllocator;
/// Функция получения ключа из элемента массива.
/// The function to get the key from an array element.
static Key & extractKey(Element & elem) { return elem; }
};
@ -95,7 +95,7 @@ struct RadixSortIdentityTransform
static constexpr bool transform_is_simple = true;
static KeyBits forward(KeyBits x) { return x; }
static KeyBits backward(KeyBits x) { return x; }
static KeyBits backward(KeyBits x) { return x; }
};
@ -105,7 +105,7 @@ struct RadixSortSignedTransform
static constexpr bool transform_is_simple = true;
static KeyBits forward(KeyBits x) { return x ^ (KeyBits(1) << (sizeof(KeyBits) * 8 - 1)); }
static KeyBits backward(KeyBits x) { return x ^ (KeyBits(1) << (sizeof(KeyBits) * 8 - 1)); }
static KeyBits backward(KeyBits x) { return x ^ (KeyBits(1) << (sizeof(KeyBits) * 8 - 1)); }
};
@ -122,7 +122,7 @@ struct RadixSortUIntTraits
using Transform = RadixSortIdentityTransform<KeyBits>;
using Allocator = RadixSortMallocAllocator;
/// Функция получения ключа из элемента массива.
/// The function to get the key from an array element.
static Key & extractKey(Element & elem) { return elem; }
};
@ -139,7 +139,7 @@ struct RadixSortIntTraits
using Transform = RadixSortSignedTransform<KeyBits>;
using Allocator = RadixSortMallocAllocator;
/// Функция получения ключа из элемента массива.
/// The function to get the key from an array element.
static Key & extractKey(Element & elem) { return elem; }
};
@ -150,7 +150,7 @@ struct RadixSort
private:
using Element = typename Traits::Element;
using Key = typename Traits::Key;
using CountType = typename Traits::CountType;
using CountType = typename Traits::CountType;
using KeyBits = typename Traits::KeyBits;
static constexpr size_t HISTOGRAM_SIZE = 1 << Traits::PART_SIZE_BITS;
@ -172,19 +172,19 @@ private:
public:
static void execute(Element * arr, size_t size)
{
/// Если массив имеет размер меньше 256, то лучше использовать другой алгоритм.
/// If the array is smaller than 256, then it is better to use another algorithm.
/// Здесь есть циклы по NUM_PASSES. Очень важно, что они разворачиваются в compile-time.
/// There are loops of NUM_PASSES. It is very important that they are unfolded at compile-time.
/// Для каждого из NUM_PASSES кусков бит ключа, считаем, сколько раз каждое значение этого куска встретилось.
/// For each of the NUM_PASSES bit ranges of the key, consider how many times each value of this bit range met.
CountType histograms[HISTOGRAM_SIZE * NUM_PASSES] = {0};
typename Traits::Allocator allocator;
/// Будем делать несколько проходов по массиву. На каждом проходе, данные перекладываются в другой массив. Выделим этот временный массив.
/// We will do several passes through the array. On each pass, the data is transferred to another array. Let's allocate this temporary array.
Element * swap_buffer = reinterpret_cast<Element *>(allocator.allocate(size * sizeof(Element)));
/// Трансформируем массив и вычисляем гистограмму.
/// Transform the array and calculate the histogram.
for (size_t i = 0; i < size; ++i)
{
if (!Traits::Transform::transform_is_simple)
@ -195,7 +195,7 @@ public:
}
{
/// Заменяем гистограммы на суммы с накоплением: значение в позиции i равно сумме в предыдущих позициях минус один.
/// Replace the histograms with the accumulated sums: the value in position i is the sum of the previous positions minus one.
size_t sums[NUM_PASSES] = {0};
for (size_t i = 0; i < HISTOGRAM_SIZE; ++i)
@ -209,7 +209,7 @@ public:
}
}
/// Перекладываем элементы в порядке начиная от младшего куска бит, и далее делаем несколько проходов по количеству кусков.
/// Move the elements in the order starting from the least bit piece, and then do a few passes on the number of pieces.
for (size_t j = 0; j < NUM_PASSES; ++j)
{
Element * writer = j % 2 ? arr : swap_buffer;
@ -219,17 +219,18 @@ public:
{
size_t pos = getPart(j, keyToBits(Traits::extractKey(reader[i])));
/// Размещаем элемент на следующей свободной позиции.
/// Place the element on the next free position.
auto & dest = writer[++histograms[j * HISTOGRAM_SIZE + pos]];
dest = reader[i];
/// На последнем перекладывании, делаем обратную трансформацию.
/// On the last pass, we do the reverse transformation.
if (!Traits::Transform::transform_is_simple && j == NUM_PASSES - 1)
Traits::extractKey(dest) = bitsToKey(Traits::Transform::backward(keyToBits(Traits::extractKey(reader[i]))));
}
}
/// Если число проходов нечётное, то результирующий массив находится во временном буфере. Скопируем его на место исходного массива.
/// If the number of passes is odd, the result array is in a temporary buffer. Copy it to the place of the original array.
/// NOTE Sometimes it will be more optimal to provide non-destructive interface, that will not modify original array.
if (NUM_PASSES % 2)
memcpy(arr, swap_buffer, size * sizeof(Element));

View File

@ -9,19 +9,19 @@ namespace DB
{
/** Позволяет запустить команду,
* читать её stdout, stderr, писать в stdin,
* дождаться завершения.
/** Lets you run the command,
* read it stdout and stderr; write to stdin;
* wait for completion.
*
* Реализация похожа на функцию popen из POSIX (посмотреть можно в исходниках libc).
* The implementation is similar to the popen function from POSIX (see libc source code).
*
* Наиболее важное отличие: использует vfork вместо fork.
* Это сделано, потому что fork не работает (с ошибкой о нехватке памяти),
* при некоторых настройках overcommit-а, если размер адресного пространства процесса больше половины количества доступной памяти.
* Также, изменение memory map-ов - довольно ресурсоёмкая операция.
* The most important difference: uses vfork instead of fork.
* This is done because fork does not work (with a memory shortage error),
* with some overcommit settings, if the address space of the process is more than half the amount of available memory.
* Also, changing memory maps - a fairly resource-intensive operation.
*
* Второе отличие - позволяет работать одновременно и с stdin, и с stdout, и с stderr запущенного процесса,
* а также узнать код и статус завершения.
* The second difference - allows to work simultaneously with stdin, and with stdout, and with stderr of running process,
* and also to obtain the return code and completion status.
*/
class ShellCommand
{
@ -34,20 +34,20 @@ private:
static std::unique_ptr<ShellCommand> executeImpl(const char * filename, char * const argv[], bool pipe_stdin_only);
public:
WriteBufferFromFile in; /// Если команда читает из stdin, то не забудьте вызвать in.close() после записи туда всех данных.
WriteBufferFromFile in; /// If the command reads from stdin, do not forget to call in.close() after writing all the data there.
ReadBufferFromFile out;
ReadBufferFromFile err;
/// Выполнить команду с использованием /bin/sh -c
/// Run the command using /bin/sh -c
static std::unique_ptr<ShellCommand> execute(const std::string & command, bool pipe_stdin_only = false);
/// Выполнить исполняемый файл с указаннами аргументами. arguments - без argv[0].
/// Run the executable with the specified arguments. `arguments` - without argv[0].
static std::unique_ptr<ShellCommand> executeDirect(const std::string & path, const std::vector<std::string> & arguments);
/// Подождать завершения процесса, кинуть исключение, если код не 0 или если процесс был завершён не самостоятельно.
/// Wait for the process to end, throw an exception if the code is not 0 or if the process was not completed by itself.
void wait();
/// Подождать завершения процесса, узнать код возврата. Кинуть исключение, если процесс был завершён не самостоятельно.
/// Wait for the process to finish, see the return code. To throw an exception if the process was not completed independently.
int tryWait();
};

View File

@ -6,13 +6,13 @@
#include <ext/function_traits.hpp>
/** Простейший кэш для свободной функции.
* Можете также передать статический метод класса или лямбду без захвата.
* Размер неограничен. Значения не устаревают.
* Для синхронизации используется mutex.
* Подходит только для простейших случаев.
/** The simplest cache for a free function.
* You can also pass a static class method or lambda without captures.
* The size is unlimited. Values are stored permanently and never evicted.
* Mutex is used for synchronization.
* Suitable only for the simplest cases.
*
* Использование:
* Usage
*
* SimpleCache<decltype(func), &func> func_cached;
* std::cerr << func_cached(args...);
@ -41,7 +41,7 @@ public:
return it->second;
}
/// Сами вычисления делаются не под mutex-ом.
/// The calculations themselves are not done under mutex.
Result res = f(std::forward<Args>(args)...);
{

View File

@ -0,0 +1,24 @@
#pragma once
#include <common/Types.h>
#include <atomic>
/** Is used for numbering of files.
*/
struct SimpleIncrement
{
std::atomic<UInt64> value;
SimpleIncrement(UInt64 start = 0) : value(start) {}
void set(UInt64 new_value)
{
value = new_value;
}
UInt64 get()
{
return ++value;
}
};

View File

@ -1,57 +1,54 @@
#pragma once
/** SipHash - быстрая криптографическая хэш функция для коротких строк.
* Взято отсюда: https://www.131002.net/siphash/
/** SipHash is a fast cryptographic hash function for short strings.
* Taken from here: https://www.131002.net/siphash/
*
* Сделано два изменения:
* - возвращает 128 бит, а не 64;
* - сделано потоковой (можно вычислять по частям).
* This is SipHash 2-4 variant.
*
* На коротких строках (URL, поисковые фразы) более чем в 3 раза быстрее MD5 от OpenSSL.
* (~ 700 МБ/сек., 15 млн. строк в секунду)
* Two changes are made:
* - returns also 128 bits, not only 64;
* - done streaming (can be calculated in parts).
*
* On short strings (URL, search phrases) more than 3 times faster than MD5 from OpenSSL.
* (~ 700 MB/sec, 15 million strings per second)
*/
#include <cstdint>
#include <cstddef>
#include <Core/Types.h>
#include <common/Types.h>
#define ROTL(x,b) static_cast<u64>( ((x) << (b)) | ( (x) >> (64 - (b))) )
#define ROTL(x, b) static_cast<UInt64>(((x) << (b)) | ((x) >> (64 - (b))))
#define SIPROUND \
do \
{ \
v0 += v1; v1=ROTL(v1,13); v1 ^= v0; v0=ROTL(v0,32); \
v2 += v3; v3=ROTL(v3,16); v3 ^= v2; \
v0 += v3; v3=ROTL(v3,21); v3 ^= v0; \
v2 += v1; v1=ROTL(v1,17); v1 ^= v2; v2=ROTL(v2,32); \
#define SIPROUND \
do \
{ \
v0 += v1; v1 = ROTL(v1, 13); v1 ^= v0; v0 = ROTL(v0, 32); \
v2 += v3; v3 = ROTL(v3, 16); v3 ^= v2; \
v0 += v3; v3 = ROTL(v3, 21); v3 ^= v0; \
v2 += v1; v1 = ROTL(v1, 17); v1 ^= v2; v2 = ROTL(v2, 32); \
} while(0)
class SipHash
{
private:
using u64 = DB::UInt64;
using u8 = DB::UInt8;
/// State.
UInt64 v0;
UInt64 v1;
UInt64 v2;
UInt64 v3;
/// Состояние.
u64 v0;
u64 v1;
u64 v2;
u64 v3;
/// How many bytes have been processed.
UInt64 cnt;
/// Сколько байт обработано.
u64 cnt;
/// Текущие 8 байт входных данных.
/// The current 8 bytes of input data.
union
{
u64 current_word;
u8 current_bytes[8];
UInt64 current_word;
UInt8 current_bytes[8];
};
void finalize()
{
/// В последний свободный байт пишем остаток от деления длины на 256.
/// In the last free byte, we write the remainder of the division by 256.
current_bytes[7] = cnt;
v3 ^= current_word;
@ -67,10 +64,10 @@ private:
}
public:
/// Аргументы - seed.
SipHash(u64 k0 = 0, u64 k1 = 0)
/// Arguments - seed.
SipHash(UInt64 k0 = 0, UInt64 k1 = 0)
{
/// Инициализируем состояние некоторыми случайными байтами и seed-ом.
/// Initialize the state with some random bytes and seed.
v0 = 0x736f6d6570736575ULL ^ k0;
v1 = 0x646f72616e646f6dULL ^ k1;
v2 = 0x6c7967656e657261ULL ^ k0;
@ -80,11 +77,11 @@ public:
current_word = 0;
}
void update(const char * data, u64 size)
void update(const char * data, UInt64 size)
{
const char * end = data + size;
/// Дообработаем остаток от предыдущего апдейта, если есть.
/// We'll finish to process the remainder of the previous update, if any.
if (cnt & 7)
{
while (cnt & 7 && data < end)
@ -94,7 +91,7 @@ public:
++cnt;
}
/// Если всё ещё не хватает байт до восьмибайтового слова.
/// If we still do not have enough bytes to an 8-byte word.
if (cnt & 7)
return;
@ -108,7 +105,7 @@ public:
while (data + 8 <= end)
{
current_word = *reinterpret_cast<const u64 *>(data);
current_word = *reinterpret_cast<const UInt64 *>(data);
v3 ^= current_word;
SIPROUND;
@ -118,7 +115,7 @@ public:
data += 8;
}
/// Заполняем остаток, которого не хватает до восьмибайтового слова.
/// Pad the remainder, which is missing up to an 8-byte word.
current_word = 0;
switch (end - data)
{
@ -133,23 +130,23 @@ public:
}
}
/// Получить результат в некотором виде. Это можно сделать только один раз!
/// Get the result in some form. This can only be done once!
void get128(char * out)
{
finalize();
reinterpret_cast<u64 *>(out)[0] = v0 ^ v1;
reinterpret_cast<u64 *>(out)[1] = v2 ^ v3;
reinterpret_cast<UInt64 *>(out)[0] = v0 ^ v1;
reinterpret_cast<UInt64 *>(out)[1] = v2 ^ v3;
}
void get128(u64 & lo, u64 & hi)
void get128(UInt64 & lo, UInt64 & hi)
{
finalize();
lo = v0 ^ v1;
hi = v2 ^ v3;
}
u64 get64()
UInt64 get64()
{
finalize();
return v0 ^ v1 ^ v2 ^ v3;
@ -160,6 +157,7 @@ public:
#undef ROTL
#undef SIPROUND
#include <cstddef>
inline void sipHash128(const char * data, const size_t size, char * out)
{
@ -168,7 +166,7 @@ inline void sipHash128(const char * data, const size_t size, char * out)
hash.get128(out);
}
inline DB::UInt64 sipHash64(const char * data, const size_t size)
inline UInt64 sipHash64(const char * data, const size_t size)
{
SipHash hash;
hash.update(data, size);
@ -177,7 +175,7 @@ inline DB::UInt64 sipHash64(const char * data, const size_t size)
#include <string>
inline DB::UInt64 sipHash64(const std::string & s)
inline UInt64 sipHash64(const std::string & s)
{
return sipHash64(s.data(), s.size());
}

View File

@ -73,7 +73,7 @@ public:
free_list = block;
}
/// Размер выделенного пула в байтах
/// The size of the allocated pool in bytes
size_t size() const
{
return pool.size();

View File

@ -0,0 +1,288 @@
#pragma once
#include <iostream>
#include <vector>
#include <boost/range/adaptor/reversed.hpp>
#include <Common/UInt128.h>
#include <Common/HashTable/Hash.h>
#include <Common/HashTable/HashMap.h>
#include <IO/WriteBuffer.h>
#include <IO/WriteHelpers.h>
#include <IO/ReadBuffer.h>
#include <IO/ReadHelpers.h>
#include <IO/VarInt.h>
/*
* Implementation of the Filtered Space-Saving for TopK streaming analysis.
* http://www.l2f.inesc-id.pt/~fmmb/wiki/uploads/Work/misnis.ref0a.pdf
* It implements suggested reduce-and-combine algorithm from Parallel Space Saving:
* https://arxiv.org/pdf/1401.0702.pdf
*/
namespace DB
{
template
<
    typename TKey,
    typename HashKey = TKey,
    typename Hash = DefaultHash<HashKey>,
    typename Grower = HashTableGrower<>,
    typename Allocator = HashTableAllocator
>
class SpaceSaving
{
private:
    // Suggested constants in the paper "Finding top-k elements in data streams", chap 6. equation (24)
    // Round to nearest power of 2 for cheaper binning without modulo
    /// Returns the power of two strictly greater than x * 6; used to size `alpha_map`
    /// so that bucket selection can be done with a bitwise AND instead of modulo.
    constexpr uint64_t nextAlphaSize (uint64_t x)
    {
        constexpr uint64_t ALPHA_MAP_ELEMENTS_PER_COUNTER = 6;
        return 1ULL<<(sizeof(uint64_t) * 8 - __builtin_clzll(x * ALPHA_MAP_ELEMENTS_PER_COUNTER));
    }

public:
    using Self = SpaceSaving<TKey, HashKey, Hash, Grower, Allocator>;

    /// One monitored element: the key together with its estimated count and error bound.
    struct Counter
    {
        Counter() {}

        Counter(const TKey & k, UInt64 c = 0, UInt64 e = 0, size_t h = 0)
          : key(k), slot(0), hash(h), count(c), error(e) {}

        /// Serialize key, count and error (slot and hash are recomputed on read).
        void write(WriteBuffer & wb) const
        {
            writeBinary(key, wb);
            writeVarUInt(count, wb);
            writeVarUInt(error, wb);
        }

        void read(ReadBuffer & rb)
        {
            readBinary(key, rb);
            readVarUInt(count, rb);
            readVarUInt(error, rb);
        }

        // greater() taking slot error into account
        /// Of two counters with equal count, the one with the smaller error is "greater"
        /// (its estimate is more trustworthy), so it sorts closer to the front.
        bool operator> (const Counter & b) const
        {
            return (count > b.count) || (count == b.count && error < b.error);
        }

        TKey key;
        size_t slot, hash;   /// slot: index of this counter in counter_list; hash: cached hash of key.
        UInt64 count;        /// Estimated occurrence count.
        UInt64 error;        /// Upper bound on how much `count` may overestimate the true count.
    };

    SpaceSaving(size_t c = 10) : alpha_map(nextAlphaSize(c)), m_capacity(c) {}

    ~SpaceSaving() { destroyElements(); }

    /// Number of keys currently monitored (<= capacity).
    inline size_t size() const
    {
        return counter_list.size();
    }

    /// Maximum number of keys that can be monitored.
    inline size_t capacity() const
    {
        return m_capacity;
    }

    /// Grow (or shrink) the capacity; alpha_map is re-sized accordingly.
    void resize(size_t new_capacity)
    {
        counter_list.reserve(new_capacity);
        alpha_map.resize(nextAlphaSize(new_capacity));
        m_capacity = new_capacity;
    }

    /// Account `increment` occurrences of `key` (with an optional pre-existing error).
    /// Implements the Filtered Space-Saving update: unmonitored keys first accumulate
    /// weight in the per-bucket `alpha_map` and only displace the minimum counter
    /// once their bucket weight would exceed it.
    void insert(const TKey & key, UInt64 increment = 1, UInt64 error = 0)
    {
        // Increase weight of a key that already exists
        // It uses hashtable for both value mapping as a presence test (c_i != 0)
        auto hash = counter_map.hash(key);
        auto it = counter_map.find(key, hash);
        if (it != counter_map.end())
        {
            auto c = it->second;
            c->count += increment;
            c->error += error;
            percolate(c);
            return;
        }
        // Key doesn't exist, but can fit in the top K
        else if (unlikely(size() < capacity()))
        {
            auto c = new Counter(key, increment, error, hash);
            push(c);
            return;
        }

        auto min = counter_list.back();
        const size_t alpha_mask = alpha_map.size() - 1;
        auto & alpha = alpha_map[hash & alpha_mask];
        /// Not heavy enough to displace the minimum yet: just accumulate in the bucket.
        if (alpha + increment < min->count)
        {
            alpha += increment;
            return;
        }

        // Erase the current minimum element
        /// Remember the evicted counter's count in its bucket, so a later re-insertion
        /// of that key starts from this estimate rather than from zero.
        alpha_map[min->hash & alpha_mask] = min->count;
        it = counter_map.find(min->key, min->hash);

        // Replace minimum with newly inserted element
        /// The evicted Counter object is reused in place; the bucket weight `alpha`
        /// becomes both the count base and the error bound of the new key.
        if (it != counter_map.end())
        {
            min->hash = hash;
            min->key = key;
            min->count = alpha + increment;
            min->error = alpha + error;
            percolate(min);

            it->second = min;
            it->first = key;
            counter_map.reinsert(it, hash);
        }
    }

    /*
     * Parallel Space Saving reduction and combine step from:
     *  https://arxiv.org/pdf/1401.0702.pdf
     */
    /// Merge another SpaceSaving sketch into this one. m1/m2 are the minimum counts
    /// of the two sketches (the worst-case count of any unmonitored key in each).
    void merge(const Self & rhs)
    {
        UInt64 m1 = 0;
        UInt64 m2 = 0;

        if (size() == capacity())
        {
            m1 = counter_list.back()->count;
        }

        if (rhs.size() == rhs.capacity())
        {
            m2 = rhs.counter_list.back()->count;
        }

        /*
         * Updated algorithm to mutate current table in place
         * without mutating rhs table or creating new one
         * in the first step we expect that no elements overlap
         * and in the second sweep we correct the error if they do.
         */
        if (m2 > 0)
        {
            for (auto counter : counter_list)
            {
                counter->count += m2;
                counter->error += m2;
            }
        }

        // The list is sorted in descending order, we have to scan in reverse
        for (auto counter : boost::adaptors::reverse(rhs.counter_list))
        {
            if (counter_map.find(counter->key) != counter_map.end())
            {
                // Subtract m2 previously added, guaranteed not negative
                insert(counter->key, counter->count - m2, counter->error - m2);
            }
            else
            {
                // Counters not monitored in S1
                insert(counter->key, counter->count + m1, counter->error + m1);
            }
        }
    }

    /// Return up to k counters with the highest estimated counts
    /// (counter_list is kept sorted in descending order).
    std::vector<Counter> topK(size_t k) const
    {
        std::vector<Counter> res;
        for (auto counter : counter_list)
        {
            res.push_back(*counter);
            if (res.size() == k)
                break;
        }
        return res;
    }

    /// Serialize all counters followed by the alpha_map buckets.
    void write(WriteBuffer & wb) const
    {
        writeVarUInt(size(), wb);
        for (auto counter : counter_list)
            counter->write(wb);

        for (auto alpha : alpha_map)
            writeVarUInt(alpha, wb);
    }

    /// Deserialize: replaces current contents. Hashes are recomputed from the keys.
    /// NOTE(review): assumes m_capacity matches the capacity used when writing,
    /// since exactly nextAlphaSize(m_capacity) alpha values are read back.
    void read(ReadBuffer & rb)
    {
        destroyElements();
        size_t count = 0;
        readVarUInt(count, rb);

        for (size_t i = 0; i < count; ++i)
        {
            auto counter = new Counter();
            counter->read(rb);
            counter->hash = counter_map.hash(counter->key);
            push(counter);
        }

        for (size_t i = 0; i < nextAlphaSize(m_capacity); ++i)
        {
            UInt64 alpha = 0;
            readVarUInt(alpha, rb);
            alpha_map.push_back(alpha);
        }
    }

protected:
    /// Append a newly allocated counter to the tail of the list (the minimum position),
    /// register it in the lookup map and bubble it up to its sorted place.
    void push(Counter * counter)
    {
        counter->slot = counter_list.size();
        counter_list.push_back(counter);
        counter_map[counter->key] = counter;
        percolate(counter);
    }

    // This is equivalent to one step of bubble sort
    /// Move the counter toward the front of counter_list while it outranks its neighbor,
    /// keeping the list sorted in descending order after a count increase.
    void percolate(Counter * counter)
    {
        while (counter->slot > 0)
        {
            auto next = counter_list[counter->slot - 1];
            if (*counter > *next)
            {
                std::swap(next->slot, counter->slot);
                std::swap(counter_list[next->slot], counter_list[counter->slot]);
            }
            else
                break;
        }
    }

private:
    /// Free all heap-allocated counters and reset all containers.
    void destroyElements()
    {
        for (auto counter : counter_list)
            delete counter;

        counter_map.clear();
        counter_list.clear();
        alpha_map.clear();
    }

    HashMap<HashKey, Counter *, Hash, Grower, Allocator> counter_map;  /// key -> counter, for O(1) presence test and lookup.
    std::vector<Counter *> counter_list;                               /// Counters sorted by count, descending; back() is the minimum.
    std::vector<UInt64> alpha_map;                                     /// Per-bucket accumulated weight of unmonitored keys (size is a power of two).
    size_t m_capacity;
};
};

View File

@ -6,14 +6,14 @@
#define STACK_TRACE_MAX_DEPTH 32
/// Позволяет получить стек-трейс
/// Lets you get a stacktrace
class StackTrace
{
public:
/// Стектрейс снимается в момент создания объекта
/// The stacktrace is captured when the object is created
StackTrace();
/// Вывести в строку
/// Print to string
std::string toString() const;
private:

View File

@ -19,15 +19,14 @@
namespace DB
{
namespace ErrorCodes
{
extern const int UNSUPPORTED_PARAMETER;
}
/** Варианты поиска подстроки в строке.
* В большинстве случаев, менее производительные, чем Volnitsky (см. Volnitsky.h).
/** Variants for searching a substring in a string.
* In most cases, performance is less than Volnitsky (see Volnitsky.h).
*/
@ -37,7 +36,7 @@ struct StringSearcherBase
static constexpr auto n = sizeof(__m128i);
const int page_size = getpagesize();
bool page_safe(const void * const ptr) const
bool pageSafe(const void * const ptr) const
{
return ((page_size - 1) & reinterpret_cast<std::uintptr_t>(ptr)) <= page_size - n;
}
@ -55,7 +54,7 @@ class StringSearcher<false, false> : private StringSearcherBase
private:
using UTF8SequenceBuffer = UInt8[6];
/// string to be searched for
/// substring to be searched for
const UInt8 * const needle;
const std::size_t needle_size;
const UInt8 * const needle_end = needle + needle_size;
@ -135,8 +134,7 @@ public:
if (!(dst_l_len == dst_u_len && dst_u_len == src_len))
throw DB::Exception{
"UTF8 sequences with different lowercase and uppercase lengths are not supported",
DB::ErrorCodes::UNSUPPORTED_PARAMETER
};
DB::ErrorCodes::UNSUPPORTED_PARAMETER};
cache_actual_len += src_len;
if (cache_actual_len < n)
@ -165,7 +163,7 @@ public:
static const Poco::UTF8Encoding utf8;
#if __SSE4_1__
if (page_safe(pos))
if (pageSafe(pos))
{
const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pos));
const auto v_against_l = _mm_cmpeq_epi8(v_haystack, cachel);
@ -230,7 +228,7 @@ public:
while (haystack < haystack_end)
{
#if __SSE4_1__
if (haystack + n <= haystack_end && page_safe(haystack))
if (haystack + n <= haystack_end && pageSafe(haystack))
{
const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(haystack));
const auto v_against_l = _mm_cmpeq_epi8(v_haystack, patl);
@ -249,7 +247,7 @@ public:
const auto offset = __builtin_ctz(mask);
haystack += offset;
if (haystack < haystack_end && haystack + n <= haystack_end && page_safe(haystack))
if (haystack < haystack_end && haystack + n <= haystack_end && pageSafe(haystack))
{
const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(haystack));
const auto v_against_l = _mm_cmpeq_epi8(v_haystack, cachel);
@ -377,7 +375,7 @@ public:
bool compare(const UInt8 * pos) const
{
#if __SSE4_1__
if (page_safe(pos))
if (pageSafe(pos))
{
const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pos));
const auto v_against_l = _mm_cmpeq_epi8(v_haystack, cachel);
@ -429,7 +427,7 @@ public:
while (haystack < haystack_end)
{
#if __SSE4_1__
if (haystack + n <= haystack_end && page_safe(haystack))
if (haystack + n <= haystack_end && pageSafe(haystack))
{
const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(haystack));
const auto v_against_l = _mm_cmpeq_epi8(v_haystack, patl);
@ -447,7 +445,7 @@ public:
const auto offset = __builtin_ctz(mask);
haystack += offset;
if (haystack < haystack_end && haystack + n <= haystack_end && page_safe(haystack))
if (haystack < haystack_end && haystack + n <= haystack_end && pageSafe(haystack))
{
const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(haystack));
const auto v_against_l = _mm_cmpeq_epi8(v_haystack, cachel);
@ -559,7 +557,7 @@ public:
bool compare(const UInt8 * pos) const
{
#if __SSE4_1__
if (page_safe(pos))
if (pageSafe(pos))
{
const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pos));
const auto v_against_cache = _mm_cmpeq_epi8(v_haystack, cache);
@ -609,7 +607,7 @@ public:
while (haystack < haystack_end)
{
#if __SSE4_1__
if (haystack + n <= haystack_end && page_safe(haystack))
if (haystack + n <= haystack_end && pageSafe(haystack))
{
/// find first character
const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(haystack));
@ -627,7 +625,7 @@ public:
const auto offset = __builtin_ctz(mask);
haystack += offset;
if (haystack < haystack_end && haystack + n <= haystack_end && page_safe(haystack))
if (haystack < haystack_end && haystack + n <= haystack_end && pageSafe(haystack))
{
/// check for first 16 octets
const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(haystack));
@ -693,10 +691,10 @@ using UTF8CaseSensitiveStringSearcher = StringSearcher<true, false>;
using UTF8CaseInsensitiveStringSearcher = StringSearcher<false, false>;
/** Используют функции из libc.
* Имеет смысл использовать для коротких строк, когда требуется дешёвая инициализация.
* Нет варианта для регистронезависимого поиска UTF-8 строк.
* Требуется, чтобы за концом строк был нулевой байт.
/** Uses functions from libc.
* It makes sense to use only with short haystacks when cheap initialization is required.
* There is no option for case-insensitive search for UTF-8 strings.
* It is required that strings are zero-terminated.
*/
struct LibCASCIICaseSensitiveStringSearcher

View File

@ -1,11 +1,13 @@
#pragma once
#include <time.h> /// nanosleep
#include <mutex>
#include <memory>
#include <Common/Stopwatch.h>
#include <Common/Exception.h>
#include <IO/WriteHelpers.h>
namespace DB
{
@ -15,12 +17,12 @@ namespace ErrorCodes
}
/** Позволяет ограничить скорость чего либо (в штуках в секунду) с помощью sleep.
* Особенности работы:
* - считается только средняя скорость, от момента первого вызова функции add;
* если были периоды с низкой скоростью, то в течение промежутка времени после них, скорость будет выше;
/** Allows you to limit the speed of something (in entities per second) using sleep.
* Specifics of work:
* - only the average speed is considered, from the moment of the first call of `add` function;
* if there were periods with low speed, then during some time after them, the speed will be higher;
*
* Также позволяет задать ограничение на максимальное количество в штуках. При превышении кидается исключение.
* Also allows you to set a limit on the maximum number of entities. If exceeded, an exception will be thrown.
*/
class Throttler
{
@ -56,7 +58,7 @@ public:
if (max_speed)
{
/// Сколько должно было бы пройти времени, если бы скорость была равна max_speed.
/// How much time to wait for the average speed to become `max_speed`.
UInt64 desired_ns = new_count * 1000000000 / max_speed;
if (desired_ns > elapsed_ns)
@ -65,7 +67,7 @@ public:
timespec sleep_ts;
sleep_ts.tv_sec = sleep_ns / 1000000000;
sleep_ts.tv_nsec = sleep_ns % 1000000000;
nanosleep(&sleep_ts, nullptr); /// NOTE Завершается раньше в случае сигнала. Это считается нормальным.
nanosleep(&sleep_ts, nullptr); /// NOTE Returns early in case of a signal. This is considered normal.
}
}
}
@ -73,7 +75,7 @@ public:
private:
size_t max_speed = 0;
size_t count = 0;
size_t limit = 0; /// 0 - не ограничено.
size_t limit = 0; /// 0 - not limited.
const char * limit_exceeded_exception_message = nullptr;
Stopwatch watch {CLOCK_MONOTONIC_COARSE};
std::mutex mutex;

View File

@ -4,12 +4,16 @@
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#if __SSE4_2__
#include <nmmintrin.h>
#endif
namespace DB
{
/// Для агрегации по SipHash или конкатенации нескольких полей.
/// For aggregation by SipHash or concatenation of several fields.
struct UInt128
{
/// Suppress gcc7 warnings: 'prev_key.DB::UInt128::first' may be used uninitialized in this function
@ -42,22 +46,22 @@ struct UInt128Hash
}
};
#if defined(__x86_64__)
#if __SSE4_2__
struct UInt128HashCRC32
{
size_t operator()(UInt128 x) const
{
UInt64 crc = -1ULL;
asm("crc32q %[x], %[crc]\n" : [crc] "+r" (crc) : [x] "rm" (x.first));
asm("crc32q %[x], %[crc]\n" : [crc] "+r" (crc) : [x] "rm" (x.second));
crc = _mm_crc32_u64(crc, x.first);
crc = _mm_crc32_u64(crc, x.second);
return crc;
}
};
#else
/// На других платформах используем не обязательно CRC32. NOTE Это может сбить с толку.
/// On other platforms we do not use CRC32. NOTE This can be confusing.
struct UInt128HashCRC32 : public UInt128Hash {};
#endif
@ -71,7 +75,7 @@ inline void readBinary(UInt128 & x, ReadBuffer & buf) { readPODBinary(x, buf); }
inline void writeBinary(const UInt128 & x, WriteBuffer & buf) { writePODBinary(x, buf); }
/** Используется при агрегации, для укладки большого количества ключей постоянной длины в хэш-таблицу.
/** Used for aggregation, for putting a large number of constant-length keys in a hash table.
*/
struct UInt256
{
@ -91,7 +95,7 @@ struct UInt256
{
return a == rhs.a && b == rhs.b && c == rhs.c && d == rhs.d;
/* Так получается не лучше.
/* So it's no better.
return 0xFFFF == _mm_movemask_epi8(_mm_and_si128(
_mm_cmpeq_epi8(
_mm_loadu_si128(reinterpret_cast<const __m128i *>(&a)),
@ -122,30 +126,30 @@ struct UInt256Hash
}
};
#if defined(__x86_64__)
#if __SSE4_2__
struct UInt256HashCRC32
{
size_t operator()(UInt256 x) const
{
UInt64 crc = -1ULL;
asm("crc32q %[x], %[crc]\n" : [crc] "+r" (crc) : [x] "rm" (x.a));
asm("crc32q %[x], %[crc]\n" : [crc] "+r" (crc) : [x] "rm" (x.b));
asm("crc32q %[x], %[crc]\n" : [crc] "+r" (crc) : [x] "rm" (x.c));
asm("crc32q %[x], %[crc]\n" : [crc] "+r" (crc) : [x] "rm" (x.d));
crc = _mm_crc32_u64(crc, x.a);
crc = _mm_crc32_u64(crc, x.b);
crc = _mm_crc32_u64(crc, x.c);
crc = _mm_crc32_u64(crc, x.d);
return crc;
}
};
#else
/// На других платформах используем не обязательно CRC32. NOTE Это может сбить с толку.
/// We do not need to use CRC32 on other platforms. NOTE This can be confusing.
struct UInt256HashCRC32
{
DefaultHash<UInt64> hash64;
size_t operator()(UInt256 x) const
{
/// TODO Это не оптимально.
/// TODO This is not optimal.
return hash64(hash64(hash64(hash64(x.a) ^ x.b) ^ x.c) ^ x.d);
}
};

View File

@ -8,7 +8,7 @@
#define UNICODE_BAR_CHAR_SIZE (strlen("█"))
/** Позволяет нарисовать unicode-art полоску, ширина которой отображается с разрешением 1/8 символа.
/** Allows you to draw a unicode-art bar whose width is displayed with a resolution of 1/8 character.
*/
@ -32,7 +32,7 @@ namespace UnicodeBar
return ceil(width - 1.0 / 8) * UNICODE_BAR_CHAR_SIZE;
}
/// В dst должно быть место для barWidthInBytes(width) символов и завершающего нуля.
/// In `dst` there must be a space for barWidthInBytes(width) characters and a trailing zero.
inline void render(double width, char * dst)
{
size_t floor_width = floor(width);

View File

@ -16,23 +16,23 @@ class Context;
namespace VirtualColumnUtils
{
/// Вычислить минимальный числовый суффикс, который надо добавить к строке, чтобы она не присутствовала в множестве
/// Calculate the minimum numeric suffix to add to the string so that it is not present in the set
String chooseSuffix(const NamesAndTypesList & columns, const String & name);
/// Вычислить минимальный общий числовый суффикс, который надо добавить к каждой строке,
/// чтобы ни одна не присутствовала в множестве.
/// Calculate the minimum total numeric suffix to add to each string,
/// so that none is present in the set.
String chooseSuffixForSet(const NamesAndTypesList & columns, const std::vector<String> & names);
/// Добавляет в селект запрос секцию select column_name as value
/// Например select _port as 9000.
/// Adds to the select query section `select column_name as value`
/// For example select _port as 9000.
void rewriteEntityInAst(ASTPtr ast, const String & column_name, const Field & value);
/// Оставить в блоке только строки, подходящие под секции WHERE и PREWHERE запроса.
/// Рассматриваются только элементы внешней конъюнкции, зависящие только от столбцов, присутствующих в блоке.
/// Возвращает true, если хоть одна строка выброшена.
/// Leave in the block only the rows that fit under the WHERE clause and the PREWHERE clause of the query.
/// Only elements of the outer conjunction are considered, depending only on the columns present in the block.
/// Returns true if at least one row is discarded.
bool filterBlockWithQuery(ASTPtr query, Block & block, const Context & context);
/// Извлечь из входного потока множество значений столбца name
/// Extract from the input stream a set of `name` column values
template<typename T1>
std::multiset<T1> extractSingleValueFromBlock(const Block & block, const String & name)
{

View File

@ -2,6 +2,7 @@
#include <Common/StringSearcher.h>
#include <Common/StringUtils.h>
#include <Core/Types.h>
#include <Poco/UTF8Encoding.h>
#include <Poco/Unicode.h>
#include <ext/range.hpp>
@ -9,24 +10,24 @@
#include <string.h>
/** Поиск подстроки в строке по алгоритму Вольницкого:
/** Search for a substring in a string by Volnitsky's algorithm
* http://volnitsky.com/project/str_search/
*
* haystack и needle могут содержать нулевые байты.
* `haystack` and `needle` can contain zero bytes.
*
* Алгоритм:
* - при слишком маленьком или слишком большом размере needle, или слишком маленьком haystack, используем std::search или memchr;
* - при инициализации, заполняем open-addressing linear probing хэш-таблицу вида:
* хэш от биграммы из needle -> позиция этой биграммы в needle + 1.
* (прибавлена единица только чтобы отличить смещение ноль от пустой ячейки)
* - в хэш-таблице ключи не хранятся, хранятся только значения;
* - биграммы могут быть вставлены несколько раз, если они встречаются в needle несколько раз;
* - при поиске, берём из haystack биграмму, которая должна соответствовать последней биграмме needle (сравниваем с конца);
* - ищем её в хэш-таблице, если нашли - достаём смещение из хэш-таблицы и сравниваем строку побайтово;
* - если сравнить не получилось - проверяем следующую ячейку хэш-таблицы из цепочки разрешения коллизий;
* - если не нашли, пропускаем в haystack почти размер needle байт;
* Algorithm:
* - if the `needle` is too small or too large, or too small `haystack`, use std::search or memchr;
* - when initializing, fill in an open-addressing linear probing hash table of the form
* hash from the bigram of needle -> the position of this bigram in needle + 1.
* (one is added only to distinguish zero offset from an empty cell)
* - the keys are not stored in the hash table, only the values are stored;
* - bigrams can be inserted several times if they occur in the needle several times;
* - when searching, take from haystack bigram, which should correspond to the last bigram of needle (comparing from the end);
* - look for it in the hash table, if found - get the offset from the hash table and compare the string bytewise;
* - if it did not match, we check the next cell of the hash table from the collision resolution chain;
* - if not found, skip to haystack almost the size of the needle bytes;
*
* Используется невыровненный доступ к памяти.
* Unaligned memory access is used.
*/
@ -39,34 +40,35 @@ template <typename CRTP>
class VolnitskyBase
{
protected:
using offset_t = uint8_t; /// Смещение в needle. Для основного алгоритма, длина needle не должна быть больше 255.
using ngram_t = uint16_t; /// n-грамма (2 байта).
using Offset = UInt8; /// Offset in the needle. For the basic algorithm, the length of the needle must not be greater than 255.
using Ngram = UInt16; /// n-gram (2 bytes).
const UInt8 * const needle;
const size_t needle_size;
const UInt8 * const needle_end = needle + needle_size;
/// На сколько двигаемся, если n-грамма из haystack не нашлась в хэш-таблице.
const size_t step = needle_size - sizeof(ngram_t) + 1;
/// For how long we move, if the n-gram from haystack is not found in the hash table.
const size_t step = needle_size - sizeof(Ngram) + 1;
/** max needle length is 255, max distinct ngrams for case-sensitive is (255 - 1), case-insensitive is 4 * (255 - 1)
* storage of 64K ngrams (n = 2, 128 KB) should be large enough for both cases */
static const size_t hash_size = 64 * 1024; /// Помещается в L2-кэш.
offset_t hash[hash_size]; /// Хэш-таблица.
* storage of 64K ngrams (n = 2, 128 KB) should be large enough for both cases */
static const size_t hash_size = 64 * 1024; /// Fits into the L2 cache (of common Intel CPUs).
Offset hash[hash_size]; /// Hash table.
/// min haystack size to use main algorithm instead of fallback
static constexpr auto min_haystack_size_for_algorithm = 20000;
const bool fallback; /// Нужно ли использовать fallback алгоритм.
const bool fallback; /// Do we need to use the fallback algorithm.
public:
/** haystack_size_hint - ожидаемый суммарный размер haystack при вызовах search. Можно не указывать.
* Если указать его достаточно маленьким, то будет использован fallback алгоритм,
* так как считается, что тратить время на инициализацию хэш-таблицы не имеет смысла.
/** haystack_size_hint - the expected total size of the haystack for `search` calls. Optional (zero means unspecified).
* If you specify it small enough, the fallback algorithm will be used,
* since it is considered that it's useless to waste time initializing the hash table.
*/
VolnitskyBase(const char * const needle, const size_t needle_size, size_t haystack_size_hint = 0)
: needle{reinterpret_cast<const UInt8 *>(needle)}, needle_size{needle_size},
fallback{
needle_size < 2 * sizeof(ngram_t) || needle_size >= std::numeric_limits<offset_t>::max() ||
(haystack_size_hint && haystack_size_hint < min_haystack_size_for_algorithm)}
needle_size < 2 * sizeof(Ngram)
|| needle_size >= std::numeric_limits<Offset>::max()
|| (haystack_size_hint && haystack_size_hint < min_haystack_size_for_algorithm)}
{
if (fallback)
return;
@ -74,12 +76,12 @@ public:
memset(hash, 0, sizeof(hash));
/// int is used here because unsigned can't be used with condition like `i >= 0`, unsigned always >= 0
for (auto i = static_cast<int>(needle_size - sizeof(ngram_t)); i >= 0; --i)
for (auto i = static_cast<int>(needle_size - sizeof(Ngram)); i >= 0; --i)
self().putNGram(this->needle + i, i + 1, this->needle);
}
/// Если не найдено - возвращается конец haystack.
/// If not found, the end of the haystack is returned.
const UInt8 * search(const UInt8 * const haystack, const size_t haystack_size) const
{
if (needle_size == 0)
@ -90,15 +92,15 @@ public:
if (needle_size == 1 || fallback || haystack_size <= needle_size)
return self().search_fallback(haystack, haystack_end);
/// Будем "прикладывать" needle к haystack и сравнивать n-грам из конца needle.
const auto * pos = haystack + needle_size - sizeof(ngram_t);
/// Let's "apply" the needle to the haystack and compare the n-gram from the end of the needle.
const auto * pos = haystack + needle_size - sizeof(Ngram);
for (; pos <= haystack_end - needle_size; pos += step)
{
/// Смотрим все ячейки хэш-таблицы, которые могут соответствовать n-граму из haystack.
/// We look at all the cells of the hash table that can correspond to the n-gram from haystack.
for (size_t cell_num = toNGram(pos) % hash_size; hash[cell_num];
cell_num = (cell_num + 1) % hash_size)
{
/// Когда нашли - сравниваем побайтово, используя смещение из хэш-таблицы.
/// When found - compare bytewise, using the offset from the hash table.
const auto res = pos - (hash[cell_num] - 1);
if (self().compare(res))
@ -106,7 +108,7 @@ public:
}
}
/// Оставшийся хвостик.
/// The remaining tail.
return self().search_fallback(pos - step + 1, haystack_end);
}
@ -119,18 +121,18 @@ protected:
CRTP & self() { return static_cast<CRTP &>(*this); }
const CRTP & self() const { return const_cast<VolnitskyBase *>(this)->self(); }
static const ngram_t & toNGram(const UInt8 * const pos)
static const Ngram & toNGram(const UInt8 * const pos)
{
return *reinterpret_cast<const ngram_t *>(pos);
return *reinterpret_cast<const Ngram *>(pos);
}
void putNGramBase(const ngram_t ngram, const int offset)
void putNGramBase(const Ngram ngram, const int offset)
{
/// Кладём смещение для n-грама в соответствующую ему ячейку или ближайшую свободную.
/// Put the offset for the n-gram in the corresponding cell or the nearest free cell.
size_t cell_num = ngram % hash_size;
while (hash[cell_num])
cell_num = (cell_num + 1) % hash_size; /// Поиск следующей свободной ячейки.
cell_num = (cell_num + 1) % hash_size; /// Search for the next free cell.
hash[cell_num] = offset;
}
@ -145,7 +147,7 @@ protected:
union
{
ngram_t n;
Ngram n;
Chars chars;
};
@ -260,7 +262,7 @@ template <> struct VolnitskyImpl<false, false> : VolnitskyBase<VolnitskyImpl<fal
union
{
ngram_t n;
Ngram n;
Chars chars;
};
@ -272,15 +274,17 @@ template <> struct VolnitskyImpl<false, false> : VolnitskyBase<VolnitskyImpl<fal
}
else
{
/** n-грам (в случае n = 2)
* может быть целиком расположен внутри одной кодовой точки,
* либо пересекаться с двумя кодовыми точками.
/** n-gram (in the case of n = 2)
* can be entirely located within one code point,
* or intersect with two code points.
*
* В первом случае, нужно рассматривать до двух альтернатив - эта кодовая точка в верхнем и нижнем регистре,
* а во втором случае - до четырёх альтернатив - фрагменты двух кодовых точек во всех комбинациях регистров.
* In the first case, you need to consider up to two alternatives - this code point in upper and lower case,
* and in the second case - up to four alternatives - fragments of two code points in all combinations of cases.
*
* При этом не учитывается зависимость перевода между регистрами от локали (пример - турецкие Ii)
* а также композиция/декомпозиция и другие особенности.
* It does not take into account the dependence of the case-transformation from the locale (for example - Turkish `Ii`)
* as well as composition / decomposition and other features.
*
* It also does not work if characters with lower and upper cases are represented by different number of bytes or code points.
*/
using Seq = UInt8[6];
@ -302,12 +306,12 @@ template <> struct VolnitskyImpl<false, false> : VolnitskyBase<VolnitskyImpl<fal
putNGramBase(n, offset);
else
{
/// where is the given ngram in respect to UTF-8 sequence start?
/// where is the given ngram in respect to the start of UTF-8 sequence?
const auto seq_ngram_offset = pos - seq_pos;
Seq seq;
/// put ngram from lowercase
/// put ngram for lowercase
utf8.convert(l_u32, seq, sizeof(seq));
chars.c0 = seq[seq_ngram_offset];
chars.c1 = seq[seq_ngram_offset + 1];
@ -326,7 +330,7 @@ template <> struct VolnitskyImpl<false, false> : VolnitskyBase<VolnitskyImpl<fal
/// first sequence may start before u_pos if it is not ASCII
auto first_seq_pos = pos;
UTF8::syncBackward(first_seq_pos, begin);
/// where is the given ngram in respect to the first UTF-8 sequence start?
/// where is the given ngram in respect to the start of first UTF-8 sequence?
const auto seq_ngram_offset = pos - first_seq_pos;
const auto first_u32 = utf8.convert(first_seq_pos);

View File

@ -4,14 +4,14 @@
#include <IO/WriteBuffer.h>
/// Выводит переданный размер в байтах в виде 123.45 GiB.
/// Displays the passed size in bytes as 123.45 GiB.
void formatReadableSizeWithBinarySuffix(double value, DB::WriteBuffer & out, int precision = 2);
std::string formatReadableSizeWithBinarySuffix(double value, int precision = 2);
/// Выводит переданный размер в байтах в виде 132.55 GB.
/// Displays the passed size in bytes as 132.55 GB.
void formatReadableSizeWithDecimalSuffix(double value, DB::WriteBuffer & out, int precision = 2);
std::string formatReadableSizeWithDecimalSuffix(double value, int precision = 2);
/// Выводит число в виде 123.45 billion.
/// Prints the number as 123.45 billion.
void formatReadableQuantity(double value, DB::WriteBuffer & out, int precision = 2);
std::string formatReadableQuantity(double value, int precision = 2);

View File

@ -2,7 +2,7 @@
#include <string>
/** Получить FQDN для локального сервера путём DNS-резолвинга hostname - аналогично вызову утилиты hostname с флагом -f.
* Если не получилось отрезолвить, то вернуть hostname - аналогично вызову утилиты hostname без флагов или uname -n.
/** Get the FQDN for the local server by resolving DNS hostname - similar to calling the 'hostname' tool with the -f flag.
* If it does not work, return hostname - similar to calling 'hostname' without flags or 'uname -n'.
*/
const std::string & getFQDNOrHostName();

View File

@ -39,10 +39,7 @@ std::ostream & operator<<(std::ostream & stream, const DB::IDataType & what)
std::ostream & operator<<(std::ostream & stream, const DB::IStorage & what)
{
stream << "IStorage(name = " << what.getName() << ", tableName = " << what.getTableName() << ") {"
// TODO: uncomment #if and fix me:
#if !defined(__APPLE__)
<< what.getColumnsList()
#endif
<< what.getColumnsList().toString()
<< "}";
// isRemote supportsSampling supportsFinal supportsPrewhere supportsParallelReplicas
return stream;
@ -64,10 +61,7 @@ std::ostream & operator<<(std::ostream & stream, const DB::IFunction & what)
std::ostream & operator<<(std::ostream & stream, const DB::Block & what)
{
stream << "Block("
// TODO: uncomment #if and fix me:
#if !defined(__APPLE__)
<< "data = " << what.getColumns()
#endif
<< "size = " << what.getColumns().size()
<< ")";
return stream;
}

View File

@ -12,13 +12,13 @@ namespace Poco
namespace DB
{
/** Позволяет проверить, похож ли адрес на localhost.
* Цель этой проверки обычно состоит в том, чтобы сделать предположение,
* что при хождении на этот адрес через интернет, мы попадём на себя.
* Следует иметь ввиду, что эта проверка делается неточно:
* - адрес просто сравнивается с адресами сетевых интерфейсов;
* - для каждого сетевого интерфейса берётся только первый адрес;
* - не проверяются правила маршрутизации, которые влияют, через какой сетевой интерфейс мы пойдём на заданный адрес.
/** Lets you check if the address is similar to `localhost`.
* The purpose of this check is usually to make an assumption,
* that when we go to this address via the Internet, we'll get to ourselves.
* Please note that this check is not accurate:
* - the address is simply compared to the addresses of the network interfaces;
* - only the first address is taken for each network interface;
 * - the routing rules that determine which network interface is used to reach the specified address are not checked.
*/
bool isLocalAddress(const Poco::Net::SocketAddress & address);

View File

@ -3,14 +3,14 @@
#include <Poco/Path.h>
/** Создаёт локальный (в той же точке монтирования) бэкап (снэпшот) директории.
/** Creates a local (at the same mount point) backup (snapshot) directory.
*
* В указанной destination-директории создаёт hard link-и на все файлы source-директории
* и во всех вложенных директориях, с сохранением (созданием) всех относительных путей;
* а также делает chown, снимая разрешение на запись.
 * In the specified destination directory, it creates hard links to all the files of the source directory
 * and of all nested directories, preserving (recreating) all relative paths;
 * it also does `chown`, removing write permission.
*
* Это защищает данные от случайного удаления или модификации,
* и предназначено для использования как простое средство защиты от человеческой или программной ошибки,
* но не от аппаратного сбоя.
* This protects data from accidental deletion or modification,
* and is intended to be used as a simple means of protection against a human or program error,
* but not from a hardware failure.
*/
void localBackup(Poco::Path source_path, Poco::Path destination_path);

View File

@ -1,7 +1,7 @@
#pragma once
/** Устанавливает имя потока (максимальная длина - 15 байт),
* которое будет видно в ps, gdb, /proc,
* для удобства наблюдений и отладки.
/** Sets the thread name (maximum length is 15 bytes),
* which will be visible in ps, gdb, /proc,
* for convenience of observation and debugging.
*/
void setThreadName(const char * name);

View File

@ -54,3 +54,6 @@ target_link_libraries (thread_pool dbms)
add_executable (array_cache array_cache.cpp)
target_link_libraries (array_cache dbms)
add_executable (space_saving space_saving.cpp)
target_link_libraries (space_saving dbms)

View File

@ -0,0 +1,101 @@
#include <iostream>
#include <iomanip>
#include <string>
#include <map>
#include <Core/StringRef.h>
#include <Common/SpaceSaving.h>
/** Smoke test for DB::SpaceSaving — the Space-Saving top-K heavy-hitter sketch
  * (Metwally et al.). Exercises three things:
  *   1. biased insertion of int keys and exact retention of the heavy hitters;
  *   2. merging of two sketches;
  *   3. the same biased-insertion scenario with string keys (lossier hashing,
  *      so only a lower bound on the counts is checked).
  * Exits via abort() on any mismatch so that the test harness notices failure.
  */
int main(int argc, char ** argv)
{
    {
        using Cont = DB::SpaceSaving<int>;
        Cont first(10);

        /* Biased insertion: each of 0..199 once, plus keys 0..4 forty extra times. */
        for (int i = 0; i < 200; ++i) {
            first.insert(i);
            int k = i % 5; // Bias towards 0-4
            first.insert(k);
        }

        /* The biased keys 0..4 must be retained with exact counts: 1 + 40 = 41. */
        std::map<int, UInt64> expect;
        for (int i = 0; i < 5; ++i) {
            expect[i] = 41;
        }

        for (auto x : first.topK(5)) {
            if (expect[x.key] != x.count) {
                std::cerr << "key: " << x.key << " value: " << x.count << " expected: " << expect[x.key] << std::endl;
                abort(); // fail loudly instead of silently logging (consistent with the string-key section)
            } else {
                std::cout << "key: " << x.key << " value: " << x.count << std::endl;
            }
            expect.erase(x.key);
        }

        if (!expect.empty()) {
            std::cerr << "expected to find all heavy hitters" << std::endl;
            abort();
        }

        /* Create another sketch and test merging.
         * NOTE: the insertions must go into `second` (previously they went into
         * `first`, which left `second` empty and made the merge a no-op). */
        Cont second(10);
        for (int i = 0; i < 200; ++i) {
            second.insert(i);
        }

        /* After the merge, keys 0..4 gain one more occurrence each: 41 + 1 = 42. */
        for (int i = 0; i < 5; ++i) {
            expect[i] = 42;
        }

        first.merge(second);

        for (auto x : first.topK(5)) {
            if (expect[x.key] != x.count) {
                std::cerr << "key: " << x.key << " value: " << x.count << " expected: " << expect[x.key] << std::endl;
                abort();
            } else {
                std::cout << "key: " << x.key << " value: " << x.count << std::endl;
            }
            expect.erase(x.key);
        }
    }

    {
        /* Same test with string keys. */
        using Cont = DB::SpaceSaving<std::string, StringRef, StringRefHash>;
        Cont cont(10);

        for (int i = 0; i < 400; ++i) {
            cont.insert(std::to_string(i));
            cont.insert(std::to_string(i % 5)); // Bias towards 0-4
        }

        // The hashing is going to be more lossy, so only require
        // at least ~10% of the true count for each heavy hitter.
        std::map<std::string, UInt64> expect;
        for (int i = 0; i < 5; ++i) {
            expect[std::to_string(i)] = 38;
        }

        for (auto x : cont.topK(5)) {
            auto key = x.key;
            if (x.count < expect[key]) {
                std::cerr << "key: " << key << " value: " << x.count << " expected: " << expect[key] << std::endl;
                abort();
            } else {
                std::cout << "key: " << key << " value: " << x.count << std::endl;
            }
            expect.erase(key);
        }

        if (!expect.empty()) {
            std::cerr << "expected to find all heavy hitters" << std::endl;
            abort();
        }
    }

    return 0;
}

View File

@ -16,9 +16,9 @@ namespace DB
}
/** Проверяет совпадение типа путём сравнения typeid-ов.
* Проверяется точное совпадение типа. То есть, cast в предка будет неуспешным.
* В остальном, ведёт себя как dynamic_cast.
/** Checks the type by comparing typeids.
  * An exact type match is required; that is, casting to an ancestor type will fail.
  * Otherwise, it behaves like dynamic_cast.
*/
template <typename To, typename From>
typename std::enable_if<std::is_reference<To>::value, To>::type typeid_cast(From & from)

Some files were not shown because too many files have changed in this diff Show More