diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000000..f288c211faa --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "doc/presentations"] + path = doc/presentations + url = https://github.com/yandex/clickhouse-presentations.git diff --git a/cmake/dbms_glob_sources.cmake b/cmake/dbms_glob_sources.cmake index 7d9bd2444b1..2d2200c2c55 100644 --- a/cmake/dbms_glob_sources.cmake +++ b/cmake/dbms_glob_sources.cmake @@ -4,7 +4,7 @@ macro(add_glob cur_list) endmacro() macro(add_headers_and_sources prefix common_path) - add_glob(${prefix}_headers RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} ${common_path}/*.h ${common_path}/*.inl) + add_glob(${prefix}_headers RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} ${common_path}/*.h) add_glob(${prefix}_sources ${common_path}/*.cpp ${common_path}/*.h) endmacro() diff --git a/cmake/find_libtool.cmake b/cmake/find_libtool.cmake index b99f324da59..bee27b2d69b 100644 --- a/cmake/find_libtool.cmake +++ b/cmake/find_libtool.cmake @@ -1,3 +1,3 @@ set (LTDL_PATHS "/usr/local/opt/libtool/lib") -find_library (LTDL_LIB ltdl PATHSS ${LTDL_PATHS}) +find_library (LTDL_LIB ltdl PATHS ${LTDL_PATHS}) message (STATUS "Using ltdl: ${LTDL_LIB}") diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index 0aba2218671..d5ed783e2cc 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt @@ -31,6 +31,7 @@ endif () add_subdirectory (libcityhash) add_subdirectory (libfarmhash) add_subdirectory (libmetrohash) +add_subdirectory (libbtrie) if (USE_INTERNAL_ZLIB_LIBRARY) add_subdirectory (libzlib-ng) diff --git a/contrib/libbtrie/CMakeLists.txt b/contrib/libbtrie/CMakeLists.txt new file mode 100644 index 00000000000..8d91eb1c316 --- /dev/null +++ b/contrib/libbtrie/CMakeLists.txt @@ -0,0 +1,6 @@ +include_directories (BEFORE include) + +add_library (btrie + src/btrie.c + include/btrie.h +) diff --git a/contrib/libbtrie/LICENSE b/contrib/libbtrie/LICENSE new file mode 100644 index 00000000000..d386c6f7b79 --- /dev/null +++ 
b/contrib/libbtrie/LICENSE @@ -0,0 +1,23 @@ +Copyright (c) 2013, CobbLiu +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + + Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/contrib/libbtrie/include/btrie.h b/contrib/libbtrie/include/btrie.h new file mode 100644 index 00000000000..e395ddb09fe --- /dev/null +++ b/contrib/libbtrie/include/btrie.h @@ -0,0 +1,155 @@ +#pragma once + +#if defined (__cplusplus) +extern "C" { +#endif + +#include +#include + +/** + * In btrie, each leaf means one bit in ip tree. + * Left means 0, and right means 1. 
+ */ + +#define BTRIE_NULL ((uintptr_t) -1) +#define MAX_PAGES (1024 * 16) + +typedef struct btrie_node_s btrie_node_t; + +struct btrie_node_s { + btrie_node_t *right; + btrie_node_t *left; + btrie_node_t *parent; + uintptr_t value; +}; + + +typedef struct btrie_s { + btrie_node_t *root; + + btrie_node_t *free; /* free list of btrie */ + char *start; + size_t size; + + /* + * memory pool. + * memory management(esp free) will be so easy by using this facility. + */ + char *pools[MAX_PAGES]; + size_t len; +} btrie_t; + + +/** + * Create an empty btrie + * + * @Return: + * An ip radix_tree created. + * NULL if creation failed. + */ + +btrie_t *btrie_create(); + +/** + * Destroy the ip radix_tree + * + * @Return: + * 0 once the tree and its memory pools are freed. + * No error value is currently returned. + */ +int btrie_destroy(btrie_t *tree); + +/** + * Count the nodes in the radix tree. + */ +size_t btrie_count(btrie_t *tree); + +/** + * Return the allocated number of bytes. + */ +size_t btrie_allocated(btrie_t *tree); + + +/** + * Add an ipv4 into btrie + * + * @Args: + * key: ip address + * mask: key's mask + * value: value of this IP, may be NULL. + * + * @Return: + * 0 for success. + * -1 for failure (prefix already has a value, or out of memory). + */ +int btrie_insert(btrie_t *tree, uint32_t key, uint32_t mask, + uintptr_t value); + + +/** + * Delete an ipv4 from btrie + * + * @Args: + * + * @Return: + * 0 for success. + * -1 for failure (no such prefix). + */ +int btrie_delete(btrie_t *tree, uint32_t key, uint32_t mask); + + +/** + * Find an ipv4 from btrie + * + + * @Args: + * + * @Return: + * Value of the longest matching prefix if succeed. + * BTRIE_NULL if failed. + */ +uintptr_t btrie_find(btrie_t *tree, uint32_t key); + + +/** + * Add an ipv6 into btrie + * + * @Args: + * key: ip address + * mask: key's mask + * value: value of this IP, may be NULL. + * + * @Return: + * 0 for success. + * -1 for failure.
+ */ +int btrie_insert_a6(btrie_t *tree, const uint8_t *key, const uint8_t *mask, + uintptr_t value); + +/** + * Delete an ipv6 from btrie + * + * @Args: + * + * @Return: + * 0 for success. + * -1 for failure (no such prefix). + */ +int btrie_delete_a6(btrie_t *tree, const uint8_t *key, const uint8_t *mask); + +/** + * Find an ipv6 from btrie + * + + * @Args: + * + * @Return: + * Value of the longest matching prefix if succeed. + * BTRIE_NULL if failed. + */ +uintptr_t btrie_find_a6(btrie_t *tree, const uint8_t *key); + +#if defined (__cplusplus) +} +#endif \ No newline at end of file diff --git a/contrib/libbtrie/src/btrie.c b/contrib/libbtrie/src/btrie.c new file mode 100644 index 00000000000..e959d3f1786 --- /dev/null +++ b/contrib/libbtrie/src/btrie.c @@ -0,0 +1,460 @@ +#include <stdlib.h> +#include <string.h> +#include <btrie.h> + +#define PAGE_SIZE 4096 + + +static btrie_node_t * +btrie_alloc(btrie_t *tree) +{ + btrie_node_t *p; + + if (tree->free) { + p = tree->free; + tree->free = tree->free->right; + return p; + } + + if (tree->size < sizeof(btrie_node_t)) { + tree->start = (tree->len < MAX_PAGES) ? (char *) calloc(sizeof(char), PAGE_SIZE) : NULL; /* pools[] holds MAX_PAGES entries: fail instead of writing past its end */ + if (tree->start == NULL) { + return NULL; + } + + tree->pools[tree->len++] = tree->start; + tree->size = PAGE_SIZE; + } + + p = (btrie_node_t *) tree->start; + + tree->start += sizeof(btrie_node_t); + tree->size -= sizeof(btrie_node_t); + + return p; +} + + +btrie_t * +btrie_create() +{ + btrie_t *tree = (btrie_t *) malloc(sizeof(btrie_t)); + if (tree == NULL) { + return NULL; + } + + tree->free = NULL; + tree->start = NULL; + tree->size = 0; + memset(tree->pools, 0, sizeof(tree->pools)); + tree->len = 0; + + tree->root = btrie_alloc(tree); + if (tree->root == NULL) { + free(tree); return NULL; /* no page was recorded in pools[], so only the tree header needs releasing */ + } + + tree->root->right = NULL; + tree->root->left = NULL; + tree->root->parent = NULL; + tree->root->value = BTRIE_NULL; + + return tree; +} + +static size_t +subtree_weight(btrie_node_t *node) +{ + size_t weight = 1; + if (node->left) { + weight += subtree_weight(node->left); + } + if (node->right) { + weight +=
subtree_weight(node->right); + } + return weight; +} + +size_t +btrie_count(btrie_t *tree) +{ + if (tree->root == NULL) { + return 0; + } + + return subtree_weight(tree->root); +} + +size_t +btrie_allocated(btrie_t *tree) +{ + return tree->len * PAGE_SIZE; +} + + +int +btrie_insert(btrie_t *tree, uint32_t key, uint32_t mask, + uintptr_t value) +{ + uint32_t bit; + btrie_node_t *node, *next; + + bit = 0x80000000; + + node = tree->root; + next = tree->root; + + while (bit & mask) { + if (key & bit) { + next = node->right; + + } else { + next = node->left; + } + + if (next == NULL) { + break; + } + + bit >>= 1; + node = next; + } + + if (next) { + if (node->value != BTRIE_NULL) { + return -1; + } + + node->value = value; + return 0; + } + + while (bit & mask) { + next = btrie_alloc(tree); + if (next == NULL) { + return -1; + } + + next->right = NULL; + next->left = NULL; + next->parent = node; + next->value = BTRIE_NULL; + + if (key & bit) { + node->right = next; + + } else { + node->left = next; + } + + bit >>= 1; + node = next; + } + + node->value = value; + + return 0; +} + + +int +btrie_delete(btrie_t *tree, uint32_t key, uint32_t mask) +{ + uint32_t bit; + btrie_node_t *node; + + bit = 0x80000000; + node = tree->root; + + while (node && (bit & mask)) { + if (key & bit) { + node = node->right; + + } else { + node = node->left; + } + + bit >>= 1; + } + + if (node == NULL) { + return -1; + } + + if (node->right || node->left || node->parent == NULL) { /* interior node or the root: keep the node and only clear its value (the root has no parent to unlink from) */ + if (node->value != BTRIE_NULL) { + node->value = BTRIE_NULL; + return 0; + } + + return -1; + } + + for ( ;; ) { + if (node->parent->right == node) { + node->parent->right = NULL; + + } else { + node->parent->left = NULL; + } + + node->right = tree->free; + tree->free = node; + + node = node->parent; + + if (node->right || node->left) { + break; + } + + if (node->value != BTRIE_NULL) { + break; + } + + if (node->parent == NULL) { + break; + } + } + + return 0; +} + + +uintptr_t +btrie_find(btrie_t *tree, uint32_t key) +{ + uint32_t bit; +
uintptr_t value; + btrie_node_t *node; + + bit = 0x80000000; + value = BTRIE_NULL; + node = tree->root; + + while (node) { + if (node->value != BTRIE_NULL) { + value = node->value; + } + + if (key & bit) { + node = node->right; + + } else { + node = node->left; + } + + bit >>= 1; + } + + return value; +} + + +int +btrie_insert_a6(btrie_t *tree, const uint8_t *key, const uint8_t *mask, + uintptr_t value) +{ + uint8_t bit; + unsigned int i; + btrie_node_t *node, *next; + + i = 0; + bit = 0x80; + + node = tree->root; + next = tree->root; + + while (bit & mask[i]) { + if (key[i] & bit) { + next = node->right; + + } else { + next = node->left; + } + + if (next == NULL) { + break; + } + + bit >>= 1; + node = next; + + if (bit == 0) { + if (++i == 16) { + break; + } + + bit = 0x80; + } + } + + if (next) { + if (node->value != BTRIE_NULL) { + return -1; + } + + node->value = value; + return 0; + } + + while (bit & mask[i]) { + next = btrie_alloc(tree); + if (next == NULL) { + return -1; + } + + next->right = NULL; + next->left = NULL; + next->parent = node; + next->value = BTRIE_NULL; + + if (key[i] & bit) { + node->right = next; + + } else { + node->left = next; + } + + bit >>= 1; + node = next; + + if (bit == 0) { + if (++i == 16) { + break; + } + + bit = 0x80; + } + } + + node->value = value; + + return 0; +} + + +int +btrie_delete_a6(btrie_t *tree, const uint8_t *key, const uint8_t *mask) +{ + uint8_t bit; + unsigned int i; + btrie_node_t *node; + + i = 0; + bit = 0x80; + node = tree->root; + + while (node && (bit & mask[i])) { + if (key[i] & bit) { + node = node->right; + + } else { + node = node->left; + } + + bit >>= 1; + + if (bit == 0) { + if (++i == 16) { + break; + } + + bit = 0x80; + } + } + + if (node == NULL) { + return -1; + } + + if (node->right || node->left || node->parent == NULL) { /* interior node or the root: keep the node and only clear its value (the root has no parent to unlink from) */ + if (node->value != BTRIE_NULL) { + node->value = BTRIE_NULL; + return 0; + } + + return -1; + } + + for ( ;; ) { + if (node->parent->right == node) { + node->parent->right = NULL; + + } else { +
node->parent->left = NULL; + } + + node->right = tree->free; + tree->free = node; + + node = node->parent; + + if (node->right || node->left) { + break; + } + + if (node->value != BTRIE_NULL) { + break; + } + + if (node->parent == NULL) { + break; + } + } + + return 0; +} + + +uintptr_t +btrie_find_a6(btrie_t *tree, const uint8_t *key) +{ + uint8_t bit; + uintptr_t value; + unsigned int i; + btrie_node_t *node; + + i = 0; + bit = 0x80; + value = BTRIE_NULL; + node = tree->root; + + while (node) { + if (node->value != BTRIE_NULL) { + value = node->value; + } + + if (i < 16 && (key[i] & bit)) { /* at depth 128 all 16 key bytes are consumed: do not read key[16]; such a node has no children, so the walk ends */ + node = node->right; + + } else { + node = node->left; + } + + bit >>= 1; + + if (bit == 0) { + i++; + bit = 0x80; + } + } + + return value; +} + + +int +btrie_destroy(btrie_t *tree) +{ + size_t i; + + + /* free memory pools */ + for (i = 0; i < tree->len; i++) { + free(tree->pools[i]); + } + + free(tree); + + return 0; +} diff --git a/contrib/libbtrie/test/test_btrie.c b/contrib/libbtrie/test/test_btrie.c new file mode 100644 index 00000000000..a2daca2d6d5 --- /dev/null +++ b/contrib/libbtrie/test/test_btrie.c @@ -0,0 +1,94 @@ +#include <stdio.h> +#include <btrie.h> + +int main() +{ + btrie_t *it; + int ret; + + uint8_t prefix_v6[16] = {0xde, 0xad, 0xbe, 0xef}; + uint8_t mask_v6[16] = {0xff, 0xff, 0xff}; + uint8_t ip_v6[16] = {0xde, 0xad, 0xbe, 0xef, 0xde}; + + it = btrie_create(); + if (it == NULL) { + printf("create error!\n"); + return 1; + } + + //add 101.45.69.50/16 + ret = btrie_insert(it, 1697465650, 0xffff0000, 1); + if (ret != 0) { + printf("insert 1 error.\n"); + goto error; + } + + //add 10.45.69.50/16 + ret = btrie_insert(it, 170738994, 0xffff0000, 1); + if (ret != 0) { + printf("insert 2 error.\n"); + goto error; + } + + //add 10.45.79.50/16 + ret = btrie_insert(it, 170741554, 0xffff0000, 1); + if (ret == 0) { + printf("insert 3 error.\n"); + goto error; + } + + //add 102.45.79.50/24 + ret = btrie_insert(it, 1714245426, 0xffffff00, 1); + if (ret != 0) { + printf("insert 4 error.\n"); + goto error; + }
+ + ret = btrie_find(it, 170741554); + if (ret == 1) { + printf("test case 1 passed\n"); + } else { + printf("test case 1 error\n"); + } + + ret = btrie_find(it, 170786817); + if (ret != 1) { + printf("test case 2 passed\n"); + } else { + printf("test case 2 error\n"); + } + + ret = btrie_delete(it, 1714245426, 0xffffff00); + if (ret != 0) { + printf("delete 1 error\n"); + goto error; + } + + ret = btrie_find(it, 1714245426); + if (ret != 1) { + printf("test case 3 passed\n"); + } else { + printf("test case 3 error\n"); + } + + //add dead:beef::/32 + ret = btrie_insert_a6(it, prefix_v6, mask_v6, 1); + if (ret != 0) { + printf("insert 5 error\n"); + goto error; + } + + ret = btrie_find_a6(it, ip_v6); + if (ret == 1) { + printf("test case 4 passed\n"); + } else { + printf("test case 4 error\n"); + } + + return 0; + + error: + btrie_destroy(it); + printf("test failed\n"); + return 1; +} diff --git a/contrib/liblz4/CMakeLists.txt b/contrib/liblz4/CMakeLists.txt index bce214ca215..033b923e8e3 100644 --- a/contrib/liblz4/CMakeLists.txt +++ b/contrib/liblz4/CMakeLists.txt @@ -5,4 +5,6 @@ add_library (lz4 src/lz4hc.c include/lz4/lz4.h - include/lz4/lz4hc.h) + include/lz4/lz4hc.h + include/lz4/lz4opt.h) + diff --git a/contrib/liblz4/include/lz4/lz4.h b/contrib/liblz4/include/lz4/lz4.h index 3e740022561..0aae19c9a73 100644 --- a/contrib/liblz4/include/lz4/lz4.h +++ b/contrib/liblz4/include/lz4/lz4.h @@ -1,7 +1,7 @@ /* - LZ4 - Fast LZ compression algorithm - Header File - Copyright (C) 2011-2015, Yann Collet. + * LZ4 - Fast LZ compression algorithm + * Header File + * Copyright (C) 2011-2016, Yann Collet. BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) @@ -29,34 +29,79 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
You can contact the author at : - - LZ4 source repository : https://github.com/Cyan4973/lz4 - - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c + - LZ4 homepage : http://www.lz4.org + - LZ4 source repository : https://github.com/lz4/lz4 */ -#pragma once +#ifndef LZ4_H_2983827168210 +#define LZ4_H_2983827168210 #if defined (__cplusplus) extern "C" { #endif -/* - * lz4.h provides block compression functions, and gives full buffer control to programmer. - * If you need to generate inter-operable compressed data (respecting LZ4 frame specification), - * and can let the library handle its own memory, please use lz4frame.h instead. +/* --- Dependency --- */ +#include /* size_t */ + + +/** + Introduction + + LZ4 is lossless compression algorithm, providing compression speed at 400 MB/s per core, + scalable with multi-cores CPU. It features an extremely fast decoder, with speed in + multiple GB/s per core, typically reaching RAM speed limits on multi-core systems. + + The LZ4 compression library provides in-memory compression and decompression functions. + Compression can be done in: + - a single step (described as Simple Functions) + - a single step, reusing a context (described in Advanced Functions) + - unbounded multiple steps (described as Streaming compression) + + lz4.h provides block compression functions. It gives full buffer control to user. + Decompressing an lz4-compressed block also requires metadata (such as compressed size). + Each application is free to encode such metadata in whichever way it wants. + + An additional format, called LZ4 frame specification (doc/lz4_Frame_format.md), + take care of encoding standard metadata alongside LZ4-compressed blocks. + If your application requires interoperability, it's recommended to use it. + A library is provided to take care of it, see lz4frame.h. 
*/ -/************************************** -* Version -**************************************/ +/*^*************************************************************** +* Export parameters +*****************************************************************/ +/* +* LZ4_DLL_EXPORT : +* Enable exporting of functions when building a Windows DLL +*/ +#if defined(LZ4_DLL_EXPORT) && (LZ4_DLL_EXPORT==1) +# define LZ4LIB_API __declspec(dllexport) +#elif defined(LZ4_DLL_IMPORT) && (LZ4_DLL_IMPORT==1) +# define LZ4LIB_API __declspec(dllimport) /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/ +#else +# define LZ4LIB_API +#endif + + +/*========== Version =========== */ #define LZ4_VERSION_MAJOR 1 /* for breaking interface changes */ #define LZ4_VERSION_MINOR 7 /* for new (non-breaking) interface capabilities */ -#define LZ4_VERSION_RELEASE 1 /* for tweaks, bug-fixes, or development */ -#define LZ4_VERSION_NUMBER (LZ4_VERSION_MAJOR *100*100 + LZ4_VERSION_MINOR *100 + LZ4_VERSION_RELEASE) -int LZ4_versionNumber (void); +#define LZ4_VERSION_RELEASE 5 /* for tweaks, bug-fixes, or development */ -/************************************** +#define LZ4_VERSION_NUMBER (LZ4_VERSION_MAJOR *100*100 + LZ4_VERSION_MINOR *100 + LZ4_VERSION_RELEASE) + +#define LZ4_LIB_VERSION LZ4_VERSION_MAJOR.LZ4_VERSION_MINOR.LZ4_VERSION_RELEASE +#define LZ4_QUOTE(str) #str +#define LZ4_EXPAND_AND_QUOTE(str) LZ4_QUOTE(str) +#define LZ4_VERSION_STRING LZ4_EXPAND_AND_QUOTE(LZ4_LIB_VERSION) + +LZ4LIB_API int LZ4_versionNumber (void); +LZ4LIB_API const char* LZ4_versionString (void); + + +/*-************************************ * Tuning parameter **************************************/ -/* +/*! * LZ4_MEMORY_USAGE : * Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.) 
* Increasing memory usage improves compression ratio @@ -66,15 +111,10 @@ int LZ4_versionNumber (void); #define LZ4_MEMORY_USAGE 14 -/************************************** +/*-************************************ * Simple Functions **************************************/ - -int LZ4_compress_default(const char* source, char* dest, int sourceSize, int maxDestSize); -int LZ4_decompress_safe (const char* source, char* dest, int compressedSize, int maxDecompressedSize); - -/* -LZ4_compress_default() : +/*! LZ4_compress_default() : Compresses 'sourceSize' bytes from buffer 'source' into already allocated 'dest' buffer of size 'maxDestSize'. Compression is guaranteed to succeed if 'maxDestSize' >= LZ4_compressBound(sourceSize). @@ -86,9 +126,10 @@ LZ4_compress_default() : sourceSize : Max supported value is LZ4_MAX_INPUT_VALUE maxDestSize : full or partial size of buffer 'dest' (which must be already allocated) return : the number of bytes written into buffer 'dest' (necessarily <= maxOutputSize) - or 0 if compression fails + or 0 if compression fails */ +LZ4LIB_API int LZ4_compress_default(const char* source, char* dest, int sourceSize, int maxDestSize); -LZ4_decompress_safe() : +/*! LZ4_decompress_safe() : compressedSize : is the precise full size of the compressed block. maxDecompressedSize : is the size of destination buffer, which must be already allocated. return : the number of bytes decompressed into destination buffer (necessarily <= maxDecompressedSize) @@ -97,15 +138,16 @@ LZ4_decompress_safe() : This function is protected against buffer overflow exploits, including malicious data packets. It never writes outside output buffer, nor reads outside input buffer. 
*/ +LZ4LIB_API int LZ4_decompress_safe (const char* source, char* dest, int compressedSize, int maxDecompressedSize); -/************************************** +/*-************************************ * Advanced Functions **************************************/ #define LZ4_MAX_INPUT_SIZE 0x7E000000 /* 2 113 929 216 bytes */ #define LZ4_COMPRESSBOUND(isize) ((unsigned)(isize) > (unsigned)LZ4_MAX_INPUT_SIZE ? 0 : (isize) + ((isize)/255) + 16) -/* +/*! LZ4_compressBound() : Provides the maximum size that LZ4 compression may output in a "worst case" scenario (input data not compressible) This function is primarily useful for memory allocation purposes (destination buffer size). @@ -115,9 +157,9 @@ LZ4_compressBound() : return : maximum output size in a "worst case" scenario or 0, if input size is too large ( > LZ4_MAX_INPUT_SIZE) */ -int LZ4_compressBound(int inputSize); +LZ4LIB_API int LZ4_compressBound(int inputSize); -/* +/*! LZ4_compress_fast() : Same as LZ4_compress_default(), but allows to select an "acceleration" factor. The larger the acceleration value, the faster the algorithm, but also the lesser the compression. @@ -125,21 +167,21 @@ LZ4_compress_fast() : An acceleration value of "1" is the same as regular LZ4_compress_default() Values <= 0 will be replaced by ACCELERATION_DEFAULT (see lz4.c), which is 1. */ -int LZ4_compress_fast (const char* source, char* dest, int sourceSize, int maxDestSize, int acceleration); +LZ4LIB_API int LZ4_compress_fast (const char* source, char* dest, int sourceSize, int maxDestSize, int acceleration); -/* +/*! LZ4_compress_fast_extState() : Same compression function, just using an externally allocated memory space to store compression state. Use LZ4_sizeofState() to know how much memory must be allocated, and allocate it on 8-bytes boundaries (using malloc() typically). Then, provide it as 'void* state' to compression function. 
*/ -int LZ4_sizeofState(void); -int LZ4_compress_fast_extState (void* state, const char* source, char* dest, int inputSize, int maxDestSize, int acceleration); +LZ4LIB_API int LZ4_sizeofState(void); +LZ4LIB_API int LZ4_compress_fast_extState (void* state, const char* source, char* dest, int inputSize, int maxDestSize, int acceleration); -/* +/*! LZ4_compress_destSize() : Reverse the logic, by compressing as much data as possible from 'source' buffer into already allocated buffer 'dest' of size 'targetDestSize'. @@ -150,10 +192,10 @@ LZ4_compress_destSize() : return : Nb bytes written into 'dest' (necessarily <= targetDestSize) or 0 if compression fails */ -int LZ4_compress_destSize (const char* source, char* dest, int* sourceSizePtr, int targetDestSize); +LZ4LIB_API int LZ4_compress_destSize (const char* source, char* dest, int* sourceSizePtr, int targetDestSize); -/* +/*! LZ4_decompress_fast() : originalSize : is the original and therefore uncompressed size return : the number of bytes read from the source buffer (in other words, the compressed size) @@ -164,9 +206,9 @@ LZ4_decompress_fast() : However, it does not provide any protection against intentionally modified data stream (malicious input). Use this function in trusted environment only (data to decode comes from a trusted source). */ -int LZ4_decompress_fast (const char* source, char* dest, int originalSize); +LZ4LIB_API int LZ4_decompress_fast (const char* source, char* dest, int originalSize); -/* +/*! LZ4_decompress_safe_partial() : This function decompress a compressed block of size 'compressedSize' at position 'source' into destination buffer 'dest' of size 'maxDecompressedSize'. @@ -178,98 +220,73 @@ LZ4_decompress_safe_partial() : If the source stream is detected malformed, the function will stop decoding and return a negative result. This function never writes outside of output buffer, and never reads outside of input buffer. 
It is therefore protected against malicious data packets */ -int LZ4_decompress_safe_partial (const char* source, char* dest, int compressedSize, int targetOutputSize, int maxDecompressedSize); +LZ4LIB_API int LZ4_decompress_safe_partial (const char* source, char* dest, int compressedSize, int targetOutputSize, int maxDecompressedSize); -/*********************************************** +/*-********************************************* * Streaming Compression Functions ***********************************************/ -#define LZ4_STREAMSIZE_U64 ((1 << (LZ4_MEMORY_USAGE-3)) + 4) -#define LZ4_STREAMSIZE (LZ4_STREAMSIZE_U64 * sizeof(long long)) -/* - * LZ4_stream_t - * information structure to track an LZ4 stream. - * important : init this structure content before first use ! - * note : only allocated directly the structure if you are statically linking LZ4 - * If you are using liblz4 as a DLL, please use below construction methods instead. +typedef union LZ4_stream_u LZ4_stream_t; /* incomplete type (defined later) */ + +/*! LZ4_createStream() and LZ4_freeStream() : + * LZ4_createStream() will allocate and initialize an `LZ4_stream_t` structure. + * LZ4_freeStream() releases its memory. */ -typedef struct { long long table[LZ4_STREAMSIZE_U64]; } LZ4_stream_t; +LZ4LIB_API LZ4_stream_t* LZ4_createStream(void); +LZ4LIB_API int LZ4_freeStream (LZ4_stream_t* streamPtr); -/* - * LZ4_resetStream - * Use this function to init an allocated LZ4_stream_t structure +/*! LZ4_resetStream() : + * An LZ4_stream_t structure can be allocated once and re-used multiple times. + * Use this function to init an allocated `LZ4_stream_t` structure and start a new compression. */ -void LZ4_resetStream (LZ4_stream_t* streamPtr); +LZ4LIB_API void LZ4_resetStream (LZ4_stream_t* streamPtr); -/* - * LZ4_createStream will allocate and initialize an LZ4_stream_t structure - * LZ4_freeStream releases its memory. 
- * In the context of a DLL (liblz4), please use these methods rather than the static struct. - * They are more future proof, in case of a change of LZ4_stream_t size. +/*! LZ4_loadDict() : + * Use this function to load a static dictionary into LZ4_stream. + * Any previous data will be forgotten, only 'dictionary' will remain in memory. + * Loading a size of 0 is allowed. + * Return : dictionary size, in bytes (necessarily <= 64 KB) */ -LZ4_stream_t* LZ4_createStream(void); -int LZ4_freeStream (LZ4_stream_t* streamPtr); +LZ4LIB_API int LZ4_loadDict (LZ4_stream_t* streamPtr, const char* dictionary, int dictSize); -/* - * LZ4_loadDict - * Use this function to load a static dictionary into LZ4_stream. - * Any previous data will be forgotten, only 'dictionary' will remain in memory. - * Loading a size of 0 is allowed. - * Return : dictionary size, in bytes (necessarily <= 64 KB) +/*! LZ4_compress_fast_continue() : + * Compress buffer content 'src', using data from previously compressed blocks as dictionary to improve compression ratio. + * Important : Previous data blocks are assumed to still be present and unmodified ! + * 'dst' buffer must be already allocated. + * If maxDstSize >= LZ4_compressBound(srcSize), compression is guaranteed to succeed, and runs faster. + * If not, and if compressed data cannot fit into 'dst' buffer size, compression stops, and function returns a zero. */ -int LZ4_loadDict (LZ4_stream_t* streamPtr, const char* dictionary, int dictSize); +LZ4LIB_API int LZ4_compress_fast_continue (LZ4_stream_t* streamPtr, const char* src, char* dst, int srcSize, int maxDstSize, int acceleration); -/* - * LZ4_compress_fast_continue - * Compress buffer content 'src', using data from previously compressed blocks as dictionary to improve compression ratio. - * Important : Previous data blocks are assumed to still be present and unmodified ! - * 'dst' buffer must be already allocated. 
- * If maxDstSize >= LZ4_compressBound(srcSize), compression is guaranteed to succeed, and runs faster. - * If not, and if compressed data cannot fit into 'dst' buffer size, compression stops, and function returns a zero. +/*! LZ4_saveDict() : + * If previously compressed data block is not guaranteed to remain available at its memory location, + * save it into a safer place (char* safeBuffer). + * Note : you don't need to call LZ4_loadDict() afterwards, + * dictionary is immediately usable, you can therefore call LZ4_compress_fast_continue(). + * Return : saved dictionary size in bytes (necessarily <= dictSize), or 0 if error. */ -int LZ4_compress_fast_continue (LZ4_stream_t* streamPtr, const char* src, char* dst, int srcSize, int maxDstSize, int acceleration); - -/* - * LZ4_saveDict - * If previously compressed data block is not guaranteed to remain available at its memory location - * save it into a safer place (char* safeBuffer) - * Note : you don't need to call LZ4_loadDict() afterwards, - * dictionary is immediately usable, you can therefore call LZ4_compress_fast_continue() - * Return : saved dictionary size in bytes (necessarily <= dictSize), or 0 if error - */ -int LZ4_saveDict (LZ4_stream_t* streamPtr, char* safeBuffer, int dictSize); +LZ4LIB_API int LZ4_saveDict (LZ4_stream_t* streamPtr, char* safeBuffer, int dictSize); -/************************************************ +/*-********************************************** * Streaming Decompression Functions +* Bufferless synchronous API ************************************************/ +typedef union LZ4_streamDecode_u LZ4_streamDecode_t; /* incomplete type (defined later) */ -#define LZ4_STREAMDECODESIZE_U64 4 -#define LZ4_STREAMDECODESIZE (LZ4_STREAMDECODESIZE_U64 * sizeof(unsigned long long)) -typedef struct { unsigned long long table[LZ4_STREAMDECODESIZE_U64]; } LZ4_streamDecode_t; -/* - * LZ4_streamDecode_t - * information structure to track an LZ4 stream. 
- * init this structure content using LZ4_setStreamDecode or memset() before first use ! - * - * In the context of a DLL (liblz4) please prefer usage of construction methods below. - * They are more future proof, in case of a change of LZ4_streamDecode_t size in the future. - * LZ4_createStreamDecode will allocate and initialize an LZ4_streamDecode_t structure - * LZ4_freeStreamDecode releases its memory. +/* creation / destruction of streaming decompression tracking structure */ +LZ4LIB_API LZ4_streamDecode_t* LZ4_createStreamDecode(void); +LZ4LIB_API int LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream); + +/*! LZ4_setStreamDecode() : + * Use this function to instruct where to find the dictionary. + * Setting a size of 0 is allowed (same effect as reset). + * @return : 1 if OK, 0 if error */ -LZ4_streamDecode_t* LZ4_createStreamDecode(void); -int LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream); +LZ4LIB_API int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize); -/* - * LZ4_setStreamDecode - * Use this function to instruct where to find the dictionary. - * Setting a size of 0 is allowed (same effect as reset). - * Return : 1 if OK, 0 if error - */ -int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize); - -/* -*_continue() : +/*! +LZ4_decompress_*_continue() : These decoding functions allow decompression of multiple blocks in "streaming" mode. 
Previously decoded blocks *must* remain available at the memory position where they were decoded (up to 64 KB) In the case of a ring buffers, decoding buffer must be either : @@ -285,35 +302,120 @@ int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dicti Whenever these conditions are not possible, save the last 64KB of decoded data into a safe buffer, and indicate where it is saved using LZ4_setStreamDecode() */ -int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int compressedSize, int maxDecompressedSize); -int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int originalSize); +LZ4LIB_API int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int compressedSize, int maxDecompressedSize); +LZ4LIB_API int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int originalSize); -/* -Advanced decoding functions : -*_usingDict() : - These decoding functions work the same as - a combination of LZ4_setStreamDecode() followed by LZ4_decompress_x_continue() - They are stand-alone. They don't need nor update an LZ4_streamDecode_t structure. -*/ -int LZ4_decompress_safe_usingDict (const char* source, char* dest, int compressedSize, int maxDecompressedSize, const char* dictStart, int dictSize); -int LZ4_decompress_fast_usingDict (const char* source, char* dest, int originalSize, const char* dictStart, int dictSize); +/*! LZ4_decompress_*_usingDict() : + * These decoding functions work the same as + * a combination of LZ4_setStreamDecode() followed by LZ4_decompress_*_continue() + * They are stand-alone, and don't need an LZ4_streamDecode_t structure. 
+ */ +LZ4LIB_API int LZ4_decompress_safe_usingDict (const char* source, char* dest, int compressedSize, int maxDecompressedSize, const char* dictStart, int dictSize); +LZ4LIB_API int LZ4_decompress_fast_usingDict (const char* source, char* dest, int originalSize, const char* dictStart, int dictSize); +/*^********************************************** + * !!!!!! STATIC LINKING ONLY !!!!!! + ***********************************************/ +/*-************************************ + * Private definitions + ************************************** + * Do not use these definitions. + * They are exposed to allow static allocation of `LZ4_stream_t` and `LZ4_streamDecode_t`. + * Using these definitions will expose code to API and/or ABI break in future versions of the library. + **************************************/ +#define LZ4_HASHLOG (LZ4_MEMORY_USAGE-2) +#define LZ4_HASHTABLESIZE (1 << LZ4_MEMORY_USAGE) +#define LZ4_HASH_SIZE_U32 (1 << LZ4_HASHLOG) /* required as macro for static allocation */ -/************************************** +#if defined(__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +#include <stdint.h> + +typedef struct { + uint32_t hashTable[LZ4_HASH_SIZE_U32]; + uint32_t currentOffset; + uint32_t initCheck; + const uint8_t* dictionary; + uint8_t* bufferStart; /* obsolete, used for slideInputBuffer */ + uint32_t dictSize; +} LZ4_stream_t_internal; + +typedef struct { + const uint8_t* externalDict; + size_t extDictSize; + const uint8_t* prefixEnd; + size_t prefixSize; +} LZ4_streamDecode_t_internal; + +#else + +typedef struct { + unsigned int hashTable[LZ4_HASH_SIZE_U32]; + unsigned int currentOffset; + unsigned int initCheck; + const unsigned char* dictionary; + unsigned char* bufferStart; /* obsolete, used for slideInputBuffer */ + unsigned int dictSize; +} LZ4_stream_t_internal; + +typedef struct { + const unsigned char* externalDict; + size_t extDictSize; + const unsigned char* prefixEnd; + size_t prefixSize; +} 
LZ4_streamDecode_t_internal; + +#endif + +/*! + * LZ4_stream_t : + * information structure to track an LZ4 stream. + * init this structure before first use. + * note : only use in association with static linking ! + * this definition is not API/ABI safe, + * and may change in a future version ! + */ +#define LZ4_STREAMSIZE_U64 ((1 << (LZ4_MEMORY_USAGE-3)) + 4) +#define LZ4_STREAMSIZE (LZ4_STREAMSIZE_U64 * sizeof(unsigned long long)) +union LZ4_stream_u { + unsigned long long table[LZ4_STREAMSIZE_U64]; + LZ4_stream_t_internal internal_donotuse; +} ; /* previously typedef'd to LZ4_stream_t */ + + +/*! + * LZ4_streamDecode_t : + * information structure to track an LZ4 stream during decompression. + * init this structure using LZ4_setStreamDecode (or memset()) before first use + * note : only use in association with static linking ! + * this definition is not API/ABI safe, + * and may change in a future version ! + */ +#define LZ4_STREAMDECODESIZE_U64 4 +#define LZ4_STREAMDECODESIZE (LZ4_STREAMDECODESIZE_U64 * sizeof(unsigned long long)) +union LZ4_streamDecode_u { + unsigned long long table[LZ4_STREAMDECODESIZE_U64]; + LZ4_streamDecode_t_internal internal_donotuse; +} ; /* previously typedef'd to LZ4_streamDecode_t */ + + +/*=************************************ * Obsolete Functions **************************************/ -/* Deprecate Warnings */ -/* Should these warnings messages be a problem, +/* Deprecation warnings */ +/* Should these warnings be a problem, it is generally possible to disable them, - with -Wno-deprecated-declarations for gcc - or _CRT_SECURE_NO_WARNINGS in Visual for example. - You can also define LZ4_DEPRECATE_WARNING_DEFBLOCK. */ -#ifndef LZ4_DEPRECATE_WARNING_DEFBLOCK -# define LZ4_DEPRECATE_WARNING_DEFBLOCK + typically with -Wno-deprecated-declarations for gcc + or _CRT_SECURE_NO_WARNINGS in Visual. 
+ Otherwise, it's also possible to define LZ4_DISABLE_DEPRECATE_WARNINGS */ +#ifdef LZ4_DISABLE_DEPRECATE_WARNINGS +# define LZ4_DEPRECATED(message) /* disable deprecation warnings */ +#else # define LZ4_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) -# if (LZ4_GCC_VERSION >= 405) || defined(__clang__) +# if defined (__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */ +# define LZ4_DEPRECATED(message) [[deprecated(message)]] +# elif (LZ4_GCC_VERSION >= 405) || defined(__clang__) # define LZ4_DEPRECATED(message) __attribute__((deprecated(message))) # elif (LZ4_GCC_VERSION >= 301) # define LZ4_DEPRECATED(message) __attribute__((deprecated)) @@ -323,20 +425,19 @@ int LZ4_decompress_fast_usingDict (const char* source, char* dest, int originalS # pragma message("WARNING: You need to implement LZ4_DEPRECATED for this compiler") # define LZ4_DEPRECATED(message) # endif -#endif /* LZ4_DEPRECATE_WARNING_DEFBLOCK */ +#endif /* LZ4_DISABLE_DEPRECATE_WARNINGS */ /* Obsolete compression functions */ -/* These functions are planned to start generate warnings by r131 approximately */ -int LZ4_compress (const char* source, char* dest, int sourceSize); -int LZ4_compress_limitedOutput (const char* source, char* dest, int sourceSize, int maxOutputSize); -int LZ4_compress_withState (void* state, const char* source, char* dest, int inputSize); -int LZ4_compress_limitedOutput_withState (void* state, const char* source, char* dest, int inputSize, int maxOutputSize); -int LZ4_compress_continue (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize); -int LZ4_compress_limitedOutput_continue (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize, int maxOutputSize); +LZ4_DEPRECATED("use LZ4_compress_default() instead") int LZ4_compress (const char* source, char* dest, int sourceSize); +LZ4_DEPRECATED("use LZ4_compress_default() instead") int LZ4_compress_limitedOutput (const char* source, char* dest, int sourceSize, int maxOutputSize); 
+LZ4_DEPRECATED("use LZ4_compress_fast_extState() instead") int LZ4_compress_withState (void* state, const char* source, char* dest, int inputSize); +LZ4_DEPRECATED("use LZ4_compress_fast_extState() instead") int LZ4_compress_limitedOutput_withState (void* state, const char* source, char* dest, int inputSize, int maxOutputSize); +LZ4_DEPRECATED("use LZ4_compress_fast_continue() instead") int LZ4_compress_continue (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize); +LZ4_DEPRECATED("use LZ4_compress_fast_continue() instead") int LZ4_compress_limitedOutput_continue (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize, int maxOutputSize); /* Obsolete decompression functions */ /* These function names are completely deprecated and must no longer be used. - They are only provided here for compatibility with older programs. + They are only provided in lz4.c for compatibility with older programs. - LZ4_uncompress is the same as LZ4_decompress_fast - LZ4_uncompress_unknownOutputSize is the same as LZ4_decompress_safe These function prototypes are now disabled; uncomment them only if you really need them. @@ -358,3 +459,5 @@ LZ4_DEPRECATED("use LZ4_decompress_fast_usingDict() instead") int LZ4_decompress #if defined (__cplusplus) } #endif + +#endif /* LZ4_H_2983827168210 */ diff --git a/contrib/liblz4/include/lz4/lz4hc.h b/contrib/liblz4/include/lz4/lz4hc.h index 431f7c87c86..1036fd0bf5c 100644 --- a/contrib/liblz4/include/lz4/lz4hc.h +++ b/contrib/liblz4/include/lz4/lz4hc.h @@ -1,7 +1,7 @@ /* LZ4 HC - High Compression Mode of LZ4 Header File - Copyright (C) 2011-2015, Yann Collet. + Copyright (C) 2011-2016, Yann Collet. BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without @@ -28,107 +28,92 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
You can contact the author at : - - LZ4 source repository : https://github.com/Cyan4973/lz4 + - LZ4 source repository : https://github.com/lz4/lz4 - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c */ -#pragma once - +#ifndef LZ4_HC_H_19834876238432 +#define LZ4_HC_H_19834876238432 #if defined (__cplusplus) extern "C" { #endif -/***************************** -* Includes -*****************************/ -#include /* size_t */ +/* --- Dependency --- */ +/* note : lz4hc is not an independent module, it requires lz4.h/lz4.c for proper compilation */ +#include "lz4.h" /* stddef, LZ4LIB_API, LZ4_DEPRECATED */ -/************************************** -* Block Compression -**************************************/ -int LZ4_compress_HC (const char* src, char* dst, int srcSize, int maxDstSize, int compressionLevel); -/* -LZ4_compress_HC : - Destination buffer 'dst' must be already allocated. - Compression completion is guaranteed if 'dst' buffer is sized to handle worst circumstances (data not compressible) - Worst size evaluation is provided by function LZ4_compressBound() (see "lz4.h") - srcSize : Max supported value is LZ4_MAX_INPUT_SIZE (see "lz4.h") - compressionLevel : Recommended values are between 4 and 9, although any value between 0 and 16 will work. - 0 means "use default value" (see lz4hc.c). - Values >16 behave the same as 16. - return : the number of bytes written into buffer 'dst' - or 0 if compression fails. -*/ +/* --- Useful constants --- */ +#define LZ4HC_CLEVEL_MIN 3 +#define LZ4HC_CLEVEL_DEFAULT 9 +#define LZ4HC_CLEVEL_OPT_MIN 11 +#define LZ4HC_CLEVEL_MAX 12 + + +/*-************************************ + * Block Compression + **************************************/ +/*! LZ4_compress_HC() : + * Compress data from `src` into `dst`, using the more powerful but slower "HC" algorithm. + * `dst` must be already allocated. 
+ * Compression is guaranteed to succeed if `dstCapacity >= LZ4_compressBound(srcSize)` (see "lz4.h") + * Max supported `srcSize` value is LZ4_MAX_INPUT_SIZE (see "lz4.h") + * `compressionLevel` : Recommended values are between 4 and 9, although any value between 1 and LZ4HC_CLEVEL_MAX will work. + * Values >LZ4HC_CLEVEL_MAX behave the same as LZ4HC_CLEVEL_MAX. + * @return : the number of bytes written into 'dst' + * or 0 if compression fails. + */ +LZ4LIB_API int LZ4_compress_HC (const char* src, char* dst, int srcSize, int dstCapacity, int compressionLevel); /* Note : - Decompression functions are provided within LZ4 source code (see "lz4.h") (BSD license) -*/ + * Decompression functions are provided within "lz4.h" (BSD license) + */ -int LZ4_sizeofStateHC(void); -int LZ4_compress_HC_extStateHC(void* state, const char* src, char* dst, int srcSize, int maxDstSize, int compressionLevel); -/* -LZ4_compress_HC_extStateHC() : - Use this function if you prefer to manually allocate memory for compression tables. - To know how much memory must be allocated for the compression tables, use : - int LZ4_sizeofStateHC(); - - Allocated memory must be aligned on 8-bytes boundaries (which a normal malloc() will do properly). - - The allocated memory can then be provided to the compression functions using 'void* state' parameter. - LZ4_compress_HC_extStateHC() is equivalent to previously described function. - It just uses externally allocated memory for stateHC. -*/ +/*! LZ4_compress_HC_extStateHC() : + * Same as LZ4_compress_HC(), but using an externally allocated memory segment for `state`. + * `state` size is provided by LZ4_sizeofStateHC(). + * Memory segment must be aligned on 8-bytes boundaries (which a normal malloc() will do properly). 
+ */ +LZ4LIB_API int LZ4_compress_HC_extStateHC(void* state, const char* src, char* dst, int srcSize, int maxDstSize, int compressionLevel); +LZ4LIB_API int LZ4_sizeofStateHC(void); -/************************************** -* Streaming Compression -**************************************/ -#define LZ4_STREAMHCSIZE 262192 -#define LZ4_STREAMHCSIZE_SIZET (LZ4_STREAMHCSIZE / sizeof(size_t)) -typedef struct { size_t table[LZ4_STREAMHCSIZE_SIZET]; } LZ4_streamHC_t; -/* - LZ4_streamHC_t - This structure allows static allocation of LZ4 HC streaming state. - State must then be initialized using LZ4_resetStreamHC() before first use. +/*-************************************ + * Streaming Compression + * Bufferless synchronous API + **************************************/ + typedef union LZ4_streamHC_u LZ4_streamHC_t; /* incomplete type (defined later) */ - Static allocation should only be used in combination with static linking. - If you want to use LZ4 as a DLL, please use construction functions below, which are future-proof. -*/ +/*! LZ4_createStreamHC() and LZ4_freeStreamHC() : + * These functions create and release memory for LZ4 HC streaming state. + * Newly created states are automatically initialized. + * Existing states can be re-used several times, using LZ4_resetStreamHC(). + * These methods are API and ABI stable, they can be used in combination with a DLL. + */ +LZ4LIB_API LZ4_streamHC_t* LZ4_createStreamHC(void); +LZ4LIB_API int LZ4_freeStreamHC (LZ4_streamHC_t* streamHCPtr); +LZ4LIB_API void LZ4_resetStreamHC (LZ4_streamHC_t* streamHCPtr, int compressionLevel); +LZ4LIB_API int LZ4_loadDictHC (LZ4_streamHC_t* streamHCPtr, const char* dictionary, int dictSize); -LZ4_streamHC_t* LZ4_createStreamHC(void); -int LZ4_freeStreamHC (LZ4_streamHC_t* streamHCPtr); -/* - These functions create and release memory for LZ4 HC streaming state. - Newly created states are already initialized. - Existing state space can be re-used anytime using LZ4_resetStreamHC(). 
- If you use LZ4 as a DLL, use these functions instead of static structure allocation, - to avoid size mismatch between different versions. -*/ +LZ4LIB_API int LZ4_compress_HC_continue (LZ4_streamHC_t* streamHCPtr, const char* src, char* dst, int srcSize, int maxDstSize); -void LZ4_resetStreamHC (LZ4_streamHC_t* streamHCPtr, int compressionLevel); -int LZ4_loadDictHC (LZ4_streamHC_t* streamHCPtr, const char* dictionary, int dictSize); - -int LZ4_compress_HC_continue (LZ4_streamHC_t* streamHCPtr, const char* src, char* dst, int srcSize, int maxDstSize); - -int LZ4_saveDictHC (LZ4_streamHC_t* streamHCPtr, char* safeBuffer, int maxDictSize); +LZ4LIB_API int LZ4_saveDictHC (LZ4_streamHC_t* streamHCPtr, char* safeBuffer, int maxDictSize); /* These functions compress data in successive blocks of any size, using previous blocks as dictionary. One key assumption is that previous blocks (up to 64 KB) remain read-accessible while compressing next blocks. - There is an exception for ring buffers, which can be smaller 64 KB. - Such case is automatically detected and correctly handled by LZ4_compress_HC_continue(). + There is an exception for ring buffers, which can be smaller than 64 KB. + Ring buffers scenario is automatically detected and handled by LZ4_compress_HC_continue(). Before starting compression, state must be properly initialized, using LZ4_resetStreamHC(). A first "fictional block" can then be designated as initial dictionary, using LZ4_loadDictHC() (Optional). Then, use LZ4_compress_HC_continue() to compress each successive block. - It works like LZ4_compress_HC(), but use previous memory blocks as dictionary to improve compression. Previous memory blocks (including initial dictionary when present) must remain accessible and unmodified during compression. - As a reminder, size 'dst' buffer to handle worst cases, using LZ4_compressBound(), to ensure success of compression operation. 
+ 'dst' buffer should be sized to handle worst case scenarios, using LZ4_compressBound(), to ensure operation success. If, for any reason, previous data blocks can't be preserved unmodified in memory during next compression block, you must save it to a safer memory space, using LZ4_saveDictHC(). @@ -136,50 +121,102 @@ int LZ4_saveDictHC (LZ4_streamHC_t* streamHCPtr, char* safeBuffer, int maxDictSi */ +/*-****************************************** + * !!!!! STATIC LINKING ONLY !!!!! + *******************************************/ -/************************************** + /*-************************************* + * PRIVATE DEFINITIONS : + * Do not use these definitions. + * They are exposed to allow static allocation of `LZ4_streamHC_t`. + * Using these definitions makes the code vulnerable to potential API break when upgrading LZ4 + **************************************/ +#define LZ4HC_DICTIONARY_LOGSIZE 17 +#define LZ4HC_MAXD (1<<LZ4HC_DICTIONARY_LOGSIZE) +#define LZ4HC_MAXD_MASK (LZ4HC_MAXD - 1) + +#define LZ4HC_HASH_LOG 15 +#define LZ4HC_HASHTABLESIZE (1 << LZ4HC_HASH_LOG) +#define LZ4HC_HASH_MASK (LZ4HC_HASHTABLESIZE - 1) + + +#if defined(__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +#include <stdint.h> + +typedef struct +{ + uint32_t hashTable[LZ4HC_HASHTABLESIZE]; + uint16_t chainTable[LZ4HC_MAXD]; + const uint8_t* end; /* next block here to continue on current prefix */ + const uint8_t* base; /* All index relative to this position */ + const uint8_t* dictBase; /* alternate base for extDict */ + uint8_t* inputBuffer; /* deprecated */ + uint32_t dictLimit; /* below that point, need extDict */ + uint32_t lowLimit; /* below that point, no more dict */ + uint32_t nextToUpdate; /* index from which to continue dictionary update */ + uint32_t searchNum; /* only for optimal parser */ + uint32_t compressionLevel; +} LZ4HC_CCtx_internal; + +#else + +typedef struct +{ + unsigned int hashTable[LZ4HC_HASHTABLESIZE]; + unsigned short chainTable[LZ4HC_MAXD]; + const unsigned char* end; /* next block here to continue on current prefix */ + const unsigned char* base; /* All index relative to this position */ + const unsigned char* dictBase; /* alternate base for extDict */ + unsigned char* inputBuffer; /* deprecated */ + 
unsigned int dictLimit; /* below that point, need extDict */ + unsigned int lowLimit; /* below that point, no more dict */ + unsigned int nextToUpdate; /* index from which to continue dictionary update */ + unsigned int searchNum; /* only for optimal parser */ + unsigned int compressionLevel; +} LZ4HC_CCtx_internal; + +#endif + +#define LZ4_STREAMHCSIZE (4*LZ4HC_HASHTABLESIZE + 2*LZ4HC_MAXD + 56) /* 393268 */ +#define LZ4_STREAMHCSIZE_SIZET (LZ4_STREAMHCSIZE / sizeof(size_t)) +union LZ4_streamHC_u { + size_t table[LZ4_STREAMHCSIZE_SIZET]; + LZ4HC_CCtx_internal internal_donotuse; +}; /* previously typedef'd to LZ4_streamHC_t */ +/* + LZ4_streamHC_t : + This structure allows static allocation of LZ4 HC streaming state. + State must be initialized using LZ4_resetStreamHC() before first use. + + Static allocation shall only be used in combination with static linking. + When invoking LZ4 from a DLL, use create/free functions instead, which are API and ABI stable. +*/ + + +/*-************************************ * Deprecated Functions **************************************/ -/* Deprecate Warnings */ -/* Should these warnings messages be a problem, - it is generally possible to disable them, - with -Wno-deprecated-declarations for gcc - or _CRT_SECURE_NO_WARNINGS in Visual for example. - You can also define LZ4_DEPRECATE_WARNING_DEFBLOCK. 
*/ -#ifndef LZ4_DEPRECATE_WARNING_DEFBLOCK -# define LZ4_DEPRECATE_WARNING_DEFBLOCK -# define LZ4_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) -# if (LZ4_GCC_VERSION >= 405) || defined(__clang__) -# define LZ4_DEPRECATED(message) __attribute__((deprecated(message))) -# elif (LZ4_GCC_VERSION >= 301) -# define LZ4_DEPRECATED(message) __attribute__((deprecated)) -# elif defined(_MSC_VER) -# define LZ4_DEPRECATED(message) __declspec(deprecated(message)) -# else -# pragma message("WARNING: You need to implement LZ4_DEPRECATED for this compiler") -# define LZ4_DEPRECATED(message) -# endif -#endif // LZ4_DEPRECATE_WARNING_DEFBLOCK +/* see lz4.h LZ4_DISABLE_DEPRECATE_WARNINGS to turn off deprecation warnings */ -/* compression functions */ -/* these functions are planned to trigger warning messages by r131 approximately */ -int LZ4_compressHC (const char* source, char* dest, int inputSize); -int LZ4_compressHC_limitedOutput (const char* source, char* dest, int inputSize, int maxOutputSize); -int LZ4_compressHC2 (const char* source, char* dest, int inputSize, int compressionLevel); -int LZ4_compressHC2_limitedOutput (const char* source, char* dest, int inputSize, int maxOutputSize, int compressionLevel); -int LZ4_compressHC_withStateHC (void* state, const char* source, char* dest, int inputSize); -int LZ4_compressHC_limitedOutput_withStateHC (void* state, const char* source, char* dest, int inputSize, int maxOutputSize); -int LZ4_compressHC2_withStateHC (void* state, const char* source, char* dest, int inputSize, int compressionLevel); -int LZ4_compressHC2_limitedOutput_withStateHC(void* state, const char* source, char* dest, int inputSize, int maxOutputSize, int compressionLevel); -int LZ4_compressHC_continue (LZ4_streamHC_t* LZ4_streamHCPtr, const char* source, char* dest, int inputSize); -int LZ4_compressHC_limitedOutput_continue (LZ4_streamHC_t* LZ4_streamHCPtr, const char* source, char* dest, int inputSize, int maxOutputSize); +/* deprecated compression functions */ 
+/* these functions will trigger warning messages in future releases */ +LZ4_DEPRECATED("use LZ4_compress_HC() instead") int LZ4_compressHC (const char* source, char* dest, int inputSize); +LZ4_DEPRECATED("use LZ4_compress_HC() instead") int LZ4_compressHC_limitedOutput (const char* source, char* dest, int inputSize, int maxOutputSize); +LZ4_DEPRECATED("use LZ4_compress_HC() instead") int LZ4_compressHC2 (const char* source, char* dest, int inputSize, int compressionLevel); +LZ4_DEPRECATED("use LZ4_compress_HC() instead") int LZ4_compressHC2_limitedOutput (const char* source, char* dest, int inputSize, int maxOutputSize, int compressionLevel); +LZ4_DEPRECATED("use LZ4_compress_HC_extStateHC() instead") int LZ4_compressHC_withStateHC (void* state, const char* source, char* dest, int inputSize); +LZ4_DEPRECATED("use LZ4_compress_HC_extStateHC() instead") int LZ4_compressHC_limitedOutput_withStateHC (void* state, const char* source, char* dest, int inputSize, int maxOutputSize); +LZ4_DEPRECATED("use LZ4_compress_HC_extStateHC() instead") int LZ4_compressHC2_withStateHC (void* state, const char* source, char* dest, int inputSize, int compressionLevel); +LZ4_DEPRECATED("use LZ4_compress_HC_extStateHC() instead") int LZ4_compressHC2_limitedOutput_withStateHC(void* state, const char* source, char* dest, int inputSize, int maxOutputSize, int compressionLevel); +LZ4_DEPRECATED("use LZ4_compress_HC_continue() instead") int LZ4_compressHC_continue (LZ4_streamHC_t* LZ4_streamHCPtr, const char* source, char* dest, int inputSize); +LZ4_DEPRECATED("use LZ4_compress_HC_continue() instead") int LZ4_compressHC_limitedOutput_continue (LZ4_streamHC_t* LZ4_streamHCPtr, const char* source, char* dest, int inputSize, int maxOutputSize); -/* Streaming functions following the older model; should no longer be used */ +/* Deprecated Streaming functions using older model; should no longer be used */ LZ4_DEPRECATED("use LZ4_createStreamHC() instead") void* LZ4_createHC (char* inputBuffer); 
LZ4_DEPRECATED("use LZ4_saveDictHC() instead") char* LZ4_slideInputBufferHC (void* LZ4HC_Data); LZ4_DEPRECATED("use LZ4_freeStreamHC() instead") int LZ4_freeHC (void* LZ4HC_Data); -LZ4_DEPRECATED("use LZ4_compress_HC_continue() instead") int LZ4_compressHC2_continue (void* LZ4HC_Data, const char* source, char* dest, int inputSize, int compressionLevel); -LZ4_DEPRECATED("use LZ4_compress_HC_continue() instead") int LZ4_compressHC2_limitedOutput_continue (void* LZ4HC_Data, const char* source, char* dest, int inputSize, int maxOutputSize, int compressionLevel); +LZ4_DEPRECATED("use LZ4_compress_HC_continue() instead") int LZ4_compressHC2_continue (void* LZ4HC_Data, const char* source, char* dest, int inputSize, int compressionLevel); +LZ4_DEPRECATED("use LZ4_compress_HC_continue() instead") int LZ4_compressHC2_limitedOutput_continue (void* LZ4HC_Data, const char* source, char* dest, int inputSize, int maxOutputSize, int compressionLevel); LZ4_DEPRECATED("use LZ4_createStreamHC() instead") int LZ4_sizeofStreamStateHC(void); LZ4_DEPRECATED("use LZ4_resetStreamHC() instead") int LZ4_resetStreamStateHC(void* state, char* inputBuffer); @@ -187,3 +224,5 @@ LZ4_DEPRECATED("use LZ4_resetStreamHC() instead") int LZ4_resetStreamStateHC( #if defined (__cplusplus) } #endif + +#endif /* LZ4_HC_H_19834876238432 */ diff --git a/contrib/liblz4/include/lz4/lz4opt.h b/contrib/liblz4/include/lz4/lz4opt.h new file mode 100644 index 00000000000..b346eba87f1 --- /dev/null +++ b/contrib/liblz4/include/lz4/lz4opt.h @@ -0,0 +1,361 @@ +/* + lz4opt.h - Optimal Mode of LZ4 + Copyright (C) 2015-2017, Przemyslaw Skibinski + Note : this file is intended to be included within lz4hc.c + + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of 
conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + You can contact the author at : + - LZ4 source repository : https://github.com/lz4/lz4 + - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c +*/ + +#define LZ4_OPT_NUM (1<<12) + + +typedef struct { + int off; + int len; +} LZ4HC_match_t; + +typedef struct { + int price; + int off; + int mlen; + int litlen; +} LZ4HC_optimal_t; + + +/* price in bytes */ +FORCE_INLINE size_t LZ4HC_literalsPrice(size_t litlen) +{ + size_t price = litlen; + if (litlen >= (size_t)RUN_MASK) + price += 1 + (litlen-RUN_MASK)/255; + return price; +} + + +/* requires mlen >= MINMATCH */ +FORCE_INLINE size_t LZ4HC_sequencePrice(size_t litlen, size_t mlen) +{ + size_t price = 2 + 1; /* 16-bit offset + token */ + + price += LZ4HC_literalsPrice(litlen); + + if (mlen >= (size_t)(ML_MASK+MINMATCH)) + price+= 1 + (mlen-(ML_MASK+MINMATCH))/255; + + return price; +} + + +/*-************************************* +* Binary Tree search +***************************************/ +FORCE_INLINE int LZ4HC_BinTree_InsertAndGetAllMatches ( + LZ4HC_CCtx_internal* ctx, + const BYTE* const ip, + const BYTE* const iHighLimit, + size_t best_mlen, + LZ4HC_match_t* matches, + int* matchNum) +{ + U16* const chainTable = ctx->chainTable; + U32* const HashTable = ctx->hashTable; + const BYTE* const base = ctx->base; + const U32 dictLimit = ctx->dictLimit; + const U32 current = (U32)(ip - base); + const U32 lowLimit = (ctx->lowLimit + MAX_DISTANCE > current) ? 
ctx->lowLimit : current - (MAX_DISTANCE - 1); + const BYTE* const dictBase = ctx->dictBase; + const BYTE* match; + int nbAttempts = ctx->searchNum; + int mnum = 0; + U16 *ptr0, *ptr1, delta0, delta1; + U32 matchIndex; + size_t matchLength = 0; + U32* HashPos; + + if (ip + MINMATCH > iHighLimit) return 1; + + /* HC4 match finder */ + HashPos = &HashTable[LZ4HC_hashPtr(ip)]; + matchIndex = *HashPos; + *HashPos = current; + + ptr0 = &DELTANEXTMAXD(current*2+1); + ptr1 = &DELTANEXTMAXD(current*2); + delta0 = delta1 = (U16)(current - matchIndex); + + while ((matchIndex < current) && (matchIndex>=lowLimit) && (nbAttempts)) { + nbAttempts--; + if (matchIndex >= dictLimit) { + match = base + matchIndex; + matchLength = LZ4_count(ip, match, iHighLimit); + } else { + const BYTE* vLimit = ip + (dictLimit - matchIndex); + match = dictBase + matchIndex; + if (vLimit > iHighLimit) vLimit = iHighLimit; + matchLength = LZ4_count(ip, match, vLimit); + if ((ip+matchLength == vLimit) && (vLimit < iHighLimit)) + matchLength += LZ4_count(ip+matchLength, base+dictLimit, iHighLimit); + } + + if (matchLength > best_mlen) { + best_mlen = matchLength; + if (matches) { + if (matchIndex >= dictLimit) + matches[mnum].off = (int)(ip - match); + else + matches[mnum].off = (int)(ip - (base + matchIndex)); /* virtual matchpos */ + matches[mnum].len = (int)matchLength; + mnum++; + } + if (best_mlen > LZ4_OPT_NUM) break; + } + + if (ip+matchLength >= iHighLimit) /* equal : no way to know if inf or sup */ + break; /* drop , to guarantee consistency ; miss a bit of compression, but other solutions can corrupt the tree */ + + if (*(ip+matchLength) < *(match+matchLength)) { + *ptr0 = delta0; + ptr0 = &DELTANEXTMAXD(matchIndex*2); + if (*ptr0 == (U16)-1) break; + delta0 = *ptr0; + delta1 += delta0; + matchIndex -= delta0; + } else { + *ptr1 = delta1; + ptr1 = &DELTANEXTMAXD(matchIndex*2+1); + if (*ptr1 == (U16)-1) break; + delta1 = *ptr1; + delta0 += delta1; + matchIndex -= delta1; + } + } + + *ptr0 = 
(U16)-1; + *ptr1 = (U16)-1; + if (matchNum) *matchNum = mnum; + /* if (best_mlen > 8) return best_mlen-8; */ + if (!matchNum) return 1; + return 1; +} + + +FORCE_INLINE void LZ4HC_updateBinTree(LZ4HC_CCtx_internal* ctx, const BYTE* const ip, const BYTE* const iHighLimit) +{ + const BYTE* const base = ctx->base; + const U32 target = (U32)(ip - base); + U32 idx = ctx->nextToUpdate; + while(idx < target) + idx += LZ4HC_BinTree_InsertAndGetAllMatches(ctx, base+idx, iHighLimit, 8, NULL, NULL); +} + + +/** Tree updater, providing best match */ +FORCE_INLINE int LZ4HC_BinTree_GetAllMatches ( + LZ4HC_CCtx_internal* ctx, + const BYTE* const ip, const BYTE* const iHighLimit, + size_t best_mlen, LZ4HC_match_t* matches, const int fullUpdate) +{ + int mnum = 0; + if (ip < ctx->base + ctx->nextToUpdate) return 0; /* skipped area */ + if (fullUpdate) LZ4HC_updateBinTree(ctx, ip, iHighLimit); + best_mlen = LZ4HC_BinTree_InsertAndGetAllMatches(ctx, ip, iHighLimit, best_mlen, matches, &mnum); + ctx->nextToUpdate = (U32)(ip - ctx->base + best_mlen); + return mnum; +} + + +#define SET_PRICE(pos, ml, offset, ll, cost) \ +{ \ + while (last_pos < pos) { opt[last_pos+1].price = 1<<30; last_pos++; } \ + opt[pos].mlen = (int)ml; \ + opt[pos].off = (int)offset; \ + opt[pos].litlen = (int)ll; \ + opt[pos].price = (int)cost; \ +} + + +static int LZ4HC_compress_optimal ( + LZ4HC_CCtx_internal* ctx, + const char* const source, + char* dest, + int inputSize, + int maxOutputSize, + limitedOutput_directive limit, + size_t sufficient_len, + const int fullUpdate + ) +{ + LZ4HC_optimal_t opt[LZ4_OPT_NUM + 1]; /* this uses a bit too much stack memory to my taste ... 
*/ + LZ4HC_match_t matches[LZ4_OPT_NUM + 1]; + + const BYTE* ip = (const BYTE*) source; + const BYTE* anchor = ip; + const BYTE* const iend = ip + inputSize; + const BYTE* const mflimit = iend - MFLIMIT; + const BYTE* const matchlimit = (iend - LASTLITERALS); + BYTE* op = (BYTE*) dest; + BYTE* const oend = op + maxOutputSize; + + /* init */ + if (sufficient_len >= LZ4_OPT_NUM) sufficient_len = LZ4_OPT_NUM-1; + ctx->end += inputSize; + ip++; + + /* Main Loop */ + while (ip < mflimit) { + size_t const llen = ip - anchor; + size_t last_pos = 0; + size_t match_num, cur, best_mlen, best_off; + memset(opt, 0, sizeof(LZ4HC_optimal_t)); /* memset only the first one */ + + match_num = LZ4HC_BinTree_GetAllMatches(ctx, ip, matchlimit, MINMATCH-1, matches, fullUpdate); + if (!match_num) { ip++; continue; } + + if ((size_t)matches[match_num-1].len > sufficient_len) { + /* good enough solution : immediate encoding */ + best_mlen = matches[match_num-1].len; + best_off = matches[match_num-1].off; + cur = 0; + last_pos = 1; + goto encode; + } + + /* set prices using matches at position = 0 */ + { size_t matchNb; + for (matchNb = 0; matchNb < match_num; matchNb++) { + size_t mlen = (matchNb>0) ? 
(size_t)matches[matchNb-1].len+1 : MINMATCH; + best_mlen = matches[matchNb].len; /* necessarily < sufficient_len < LZ4_OPT_NUM */ + for ( ; mlen <= best_mlen ; mlen++) { + size_t const cost = LZ4HC_sequencePrice(llen, mlen) - LZ4HC_literalsPrice(llen); + SET_PRICE(mlen, mlen, matches[matchNb].off, 0, cost); /* updates last_pos and opt[pos] */ + } } } + + if (last_pos < MINMATCH) { ip++; continue; } /* note : on clang at least, this test improves performance */ + + /* check further positions */ + opt[0].mlen = opt[1].mlen = 1; + for (cur = 1; cur <= last_pos; cur++) { + const BYTE* const curPtr = ip + cur; + + /* establish baseline price if cur is literal */ + { size_t price, litlen; + if (opt[cur-1].mlen == 1) { + /* no match at previous position */ + litlen = opt[cur-1].litlen + 1; + if (cur > litlen) { + price = opt[cur - litlen].price + LZ4HC_literalsPrice(litlen); + } else { + price = LZ4HC_literalsPrice(llen + litlen) - LZ4HC_literalsPrice(llen); + } + } else { + litlen = 1; + price = opt[cur - 1].price + LZ4HC_literalsPrice(1); + } + + if (price < (size_t)opt[cur].price) + SET_PRICE(cur, 1 /*mlen*/, 0 /*off*/, litlen, price); /* note : increases last_pos */ + } + + if (cur == last_pos || curPtr >= mflimit) break; + + match_num = LZ4HC_BinTree_GetAllMatches(ctx, curPtr, matchlimit, MINMATCH-1, matches, fullUpdate); + if ((match_num > 0) && (size_t)matches[match_num-1].len > sufficient_len) { + /* immediate encoding */ + best_mlen = matches[match_num-1].len; + best_off = matches[match_num-1].off; + last_pos = cur + 1; + goto encode; + } + + /* set prices using matches at position = cur */ + { size_t matchNb; + for (matchNb = 0; matchNb < match_num; matchNb++) { + size_t ml = (matchNb>0) ? (size_t)matches[matchNb-1].len+1 : MINMATCH; + best_mlen = (cur + matches[matchNb].len < LZ4_OPT_NUM) ? 
+ (size_t)matches[matchNb].len : LZ4_OPT_NUM - cur; + + for ( ; ml <= best_mlen ; ml++) { + size_t ll, price; + if (opt[cur].mlen == 1) { + ll = opt[cur].litlen; + if (cur > ll) + price = opt[cur - ll].price + LZ4HC_sequencePrice(ll, ml); + else + price = LZ4HC_sequencePrice(llen + ll, ml) - LZ4HC_literalsPrice(llen); + } else { + ll = 0; + price = opt[cur].price + LZ4HC_sequencePrice(0, ml); + } + + if (cur + ml > last_pos || price < (size_t)opt[cur + ml].price) { + SET_PRICE(cur + ml, ml, matches[matchNb].off, ll, price); + } } } } + } /* for (cur = 1; cur <= last_pos; cur++) */ + + best_mlen = opt[last_pos].mlen; + best_off = opt[last_pos].off; + cur = last_pos - best_mlen; + +encode: /* cur, last_pos, best_mlen, best_off must be set */ + opt[0].mlen = 1; + while (1) { /* from end to beginning */ + size_t const ml = opt[cur].mlen; + int const offset = opt[cur].off; + opt[cur].mlen = (int)best_mlen; + opt[cur].off = (int)best_off; + best_mlen = ml; + best_off = offset; + if (ml > cur) break; /* can this happen ? 
*/ + cur -= ml; + } + + /* encode all recorded sequences */ + cur = 0; + while (cur < last_pos) { + int const ml = opt[cur].mlen; + int const offset = opt[cur].off; + if (ml == 1) { ip++; cur++; continue; } + cur += ml; + if ( LZ4HC_encodeSequence(&ip, &op, &anchor, ml, ip - offset, limit, oend) ) return 0; + } + } /* while (ip < mflimit) */ + + /* Encode Last Literals */ + { int lastRun = (int)(iend - anchor); + if ((limit) && (((char*)op - dest) + lastRun + 1 + ((lastRun+255-RUN_MASK)/255) > (U32)maxOutputSize)) return 0; /* Check output limit */ + if (lastRun>=(int)RUN_MASK) { *op++=(RUN_MASK< 254 ; lastRun-=255) *op++ = 255; *op++ = (BYTE) lastRun; } + else *op++ = (BYTE)(lastRun< 1 > 2) + */ +#ifndef LZ4_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */ +# if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) ) +# define LZ4_FORCE_MEMORY_ACCESS 2 +# elif defined(__INTEL_COMPILER) || \ + (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) )) +# define LZ4_FORCE_MEMORY_ACCESS 1 +# endif +#endif + /* * LZ4_FORCE_SW_BITCOUNT * Define this parameter if your target system or compiler does not support hardware bit count @@ -62,13 +86,14 @@ #endif -/************************************** -* Includes +/*-************************************ +* Dependency **************************************/ -#include +#include "lz4.h" +/* see also "memory routines" below */ -/************************************** +/*-************************************ * Compiler Options **************************************/ #ifdef _MSC_VER /* Visual Studio */ @@ -77,19 +102,16 @@ # pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ # pragma warning(disable : 4293) /* disable: C4293: too 
large shift (32-bits) */ #else -# if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */ -# if defined(__GNUC__) || defined(__clang__) -# define FORCE_INLINE static inline __attribute__((always_inline)) -# else -# define FORCE_INLINE static inline -# endif +# if defined(__GNUC__) || defined(__clang__) +# define FORCE_INLINE static inline __attribute__((always_inline)) +# elif defined(__cplusplus) || (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +# define FORCE_INLINE static inline # else # define FORCE_INLINE static -# endif /* __STDC_VERSION__ */ +# endif #endif /* _MSC_VER */ -/* LZ4_GCC_VERSION is defined into lz4.h */ -#if (LZ4_GCC_VERSION >= 302) || (__INTEL_COMPILER >= 800) || defined(__clang__) +#if (defined(__GNUC__) && (__GNUC__ >= 3)) || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) || defined(__clang__) # define expect(expr,value) (__builtin_expect ((expr),(value)) ) #else # define expect(expr,value) (expr) @@ -99,7 +121,7 @@ #define unlikely(expr) expect((expr) != 0, 0) -/************************************** +/*-************************************ * Memory routines **************************************/ #include /* malloc, calloc, free */ @@ -109,54 +131,100 @@ #define MEM_INIT memset -/************************************** +/*-************************************ * Basic Types **************************************/ -#if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */ +#if defined(__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) # include typedef uint8_t BYTE; typedef uint16_t U16; typedef uint32_t U32; typedef int32_t S32; typedef uint64_t U64; + typedef uintptr_t uptrval; #else typedef unsigned char BYTE; typedef unsigned short U16; typedef unsigned int U32; typedef signed int S32; typedef unsigned long long U64; + typedef size_t uptrval; /* generally true, except OpenVMS-64 */ #endif +#if defined(__x86_64__) + typedef U64 reg_t; /* 
64-bits in x32 mode */ +#else + typedef size_t reg_t; /* 32-bits in x32 mode */ +#endif -/************************************** +/*-************************************ * Reading and writing into memory **************************************/ -#define STEPSIZE sizeof(size_t) - -static unsigned LZ4_64bits(void) { return sizeof(void*)==8; } - static unsigned LZ4_isLittleEndian(void) { - const union { U32 i; BYTE c[4]; } one = { 1 }; /* don't use static : performance detrimental */ + const union { U32 u; BYTE c[4]; } one = { 1 }; /* don't use static : performance detrimental */ return one.c[0]; } +#if defined(LZ4_FORCE_MEMORY_ACCESS) && (LZ4_FORCE_MEMORY_ACCESS==2) +/* lie to the compiler about data alignment; use with caution */ + +static U16 LZ4_read16(const void* memPtr) { return *(const U16*) memPtr; } +static U32 LZ4_read32(const void* memPtr) { return *(const U32*) memPtr; } +static reg_t LZ4_read_ARCH(const void* memPtr) { return *(const reg_t*) memPtr; } + +static void LZ4_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; } +static void LZ4_write32(void* memPtr, U32 value) { *(U32*)memPtr = value; } + +#elif defined(LZ4_FORCE_MEMORY_ACCESS) && (LZ4_FORCE_MEMORY_ACCESS==1) + +/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ +/* currently only defined for gcc and icc */ +typedef union { U16 u16; U32 u32; reg_t uArch; } __attribute__((packed)) unalign; + +static U16 LZ4_read16(const void* ptr) { return ((const unalign*)ptr)->u16; } +static U32 LZ4_read32(const void* ptr) { return ((const unalign*)ptr)->u32; } +static reg_t LZ4_read_ARCH(const void* ptr) { return ((const unalign*)ptr)->uArch; } + +static void LZ4_write16(void* memPtr, U16 value) { ((unalign*)memPtr)->u16 = value; } +static void LZ4_write32(void* memPtr, U32 value) { ((unalign*)memPtr)->u32 = value; } + +#else /* safe and portable access through memcpy() */ + static U16 LZ4_read16(const void* memPtr) { - U16 val16; - 
memcpy(&val16, memPtr, 2); - return val16; + U16 val; memcpy(&val, memPtr, sizeof(val)); return val; } +static U32 LZ4_read32(const void* memPtr) +{ + U32 val; memcpy(&val, memPtr, sizeof(val)); return val; +} + +static reg_t LZ4_read_ARCH(const void* memPtr) +{ + reg_t val; memcpy(&val, memPtr, sizeof(val)); return val; +} + +static void LZ4_write16(void* memPtr, U16 value) +{ + memcpy(memPtr, &value, sizeof(value)); +} + +static void LZ4_write32(void* memPtr, U32 value) +{ + memcpy(memPtr, &value, sizeof(value)); +} + +#endif /* LZ4_FORCE_MEMORY_ACCESS */ + + static U16 LZ4_readLE16(const void* memPtr) { - if (LZ4_isLittleEndian()) - { + if (LZ4_isLittleEndian()) { return LZ4_read16(memPtr); - } - else - { + } else { const BYTE* p = (const BYTE*)memPtr; return (U16)((U16)p[0] + (p[1]<<8)); } @@ -164,63 +232,39 @@ static U16 LZ4_readLE16(const void* memPtr) static void LZ4_writeLE16(void* memPtr, U16 value) { - if (LZ4_isLittleEndian()) - { - memcpy(memPtr, &value, 2); - } - else - { + if (LZ4_isLittleEndian()) { + LZ4_write16(memPtr, value); + } else { BYTE* p = (BYTE*)memPtr; p[0] = (BYTE) value; p[1] = (BYTE)(value>>8); } } -static U32 LZ4_read32(const void* memPtr) +static void LZ4_copy8(void* dst, const void* src) { - U32 val32; - memcpy(&val32, memPtr, 4); - return val32; + memcpy(dst,src,8); } -static U64 LZ4_read64(const void* memPtr) -{ - U64 val64; - memcpy(&val64, memPtr, 8); - return val64; -} - -static size_t LZ4_read_ARCH(const void* p) -{ - if (LZ4_64bits()) - return (size_t)LZ4_read64(p); - else - return (size_t)LZ4_read32(p); -} - - -static void LZ4_copy4(void* dstPtr, const void* srcPtr) { memcpy(dstPtr, srcPtr, 4); } - -static void LZ4_copy8(void* dstPtr, const void* srcPtr) { memcpy(dstPtr, srcPtr, 8); } - -/* customized version of memcpy, which may overwrite up to 7 bytes beyond dstEnd */ +/* customized variant of memcpy, which can overwrite up to 8 bytes beyond dstEnd */ static void LZ4_wildCopy(void* dstPtr, const void* srcPtr, void* dstEnd) 
{ BYTE* d = (BYTE*)dstPtr; const BYTE* s = (const BYTE*)srcPtr; - BYTE* e = (BYTE*)dstEnd; + BYTE* const e = (BYTE*)dstEnd; + do { LZ4_copy8(d,s); d+=8; s+=8; } while (d>3); -# elif (defined(__clang__) || (LZ4_GCC_VERSION >= 304)) && !defined(LZ4_FORCE_SW_BITCOUNT) +# elif (defined(__clang__) || (defined(__GNUC__) && (__GNUC__>=3))) && !defined(LZ4_FORCE_SW_BITCOUNT) return (__builtin_ctzll((U64)val) >> 3); # else static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 }; return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; # endif - } - else /* 32 bits */ - { + } else /* 32 bits */ { # if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) unsigned long r; _BitScanForward( &r, (U32)val ); return (int)(r>>3); -# elif (defined(__clang__) || (LZ4_GCC_VERSION >= 304)) && !defined(LZ4_FORCE_SW_BITCOUNT) +# elif (defined(__clang__) || (defined(__GNUC__) && (__GNUC__>=3))) && !defined(LZ4_FORCE_SW_BITCOUNT) return (__builtin_ctz((U32)val) >> 3); # else static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 }; return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; # endif } - } - else /* Big Endian CPU */ - { - if (LZ4_64bits()) - { + } else /* Big Endian CPU */ { + if (sizeof(val)==8) { # if defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT) unsigned long r = 0; _BitScanReverse64( &r, val ); return (unsigned)(r>>3); -# elif (defined(__clang__) || (LZ4_GCC_VERSION >= 304)) && !defined(LZ4_FORCE_SW_BITCOUNT) +# elif (defined(__clang__) || (defined(__GNUC__) && (__GNUC__>=3))) && !defined(LZ4_FORCE_SW_BITCOUNT) return (__builtin_clzll((U64)val) >> 3); # else unsigned r; @@ -293,14 +330,12 @@ static unsigned LZ4_NbCommonBytes (register size_t val) r 
+= (!val); return r; # endif - } - else /* 32 bits */ - { + } else /* 32 bits */ { # if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) unsigned long r = 0; _BitScanReverse( &r, (unsigned long)val ); return (unsigned)(r>>3); -# elif (defined(__clang__) || (LZ4_GCC_VERSION >= 304)) && !defined(LZ4_FORCE_SW_BITCOUNT) +# elif (defined(__clang__) || (defined(__GNUC__) && (__GNUC__>=3))) && !defined(LZ4_FORCE_SW_BITCOUNT) return (__builtin_clz((U32)val) >> 3); # else unsigned r; @@ -312,19 +347,19 @@ static unsigned LZ4_NbCommonBytes (register size_t val) } } +#define STEPSIZE sizeof(reg_t) static unsigned LZ4_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* pInLimit) { const BYTE* const pStart = pIn; - while (likely(pIn compression run slower on incompressible data */ -/************************************** +/*-************************************ * Local Structures and types **************************************/ -typedef struct { - U32 hashTable[HASH_SIZE_U32]; - U32 currentOffset; - U32 initCheck; - const BYTE* dictionary; - BYTE* bufferStart; /* obsolete, used for slideInputBuffer */ - U32 dictSize; -} LZ4_stream_t_internal; - typedef enum { notLimited = 0, limitedOutput = 1 } limitedOutput_directive; typedef enum { byPtr, byU32, byU16 } tableType_t; @@ -365,44 +387,43 @@ typedef enum { endOnOutputSize = 0, endOnInputSize = 1 } endCondition_directive; typedef enum { full = 0, partial = 1 } earlyEnd_directive; -/************************************** +/*-************************************ * Local Utils **************************************/ int LZ4_versionNumber (void) { return LZ4_VERSION_NUMBER; } +const char* LZ4_versionString(void) { return LZ4_VERSION_STRING; } int LZ4_compressBound(int isize) { return LZ4_COMPRESSBOUND(isize); } int LZ4_sizeofState() { return LZ4_STREAMSIZE; } - -/******************************** +/*-****************************** * Compression functions ********************************/ - -static U32 LZ4_hashSequence(U32 
sequence, tableType_t const tableType) +static U32 LZ4_hash4(U32 sequence, tableType_t const tableType) { if (tableType == byU16) - return (((sequence) * 2654435761U) >> ((MINMATCH*8)-(LZ4_HASHLOG+1))); + return ((sequence * 2654435761U) >> ((MINMATCH*8)-(LZ4_HASHLOG+1))); else - return (((sequence) * 2654435761U) >> ((MINMATCH*8)-LZ4_HASHLOG)); + return ((sequence * 2654435761U) >> ((MINMATCH*8)-LZ4_HASHLOG)); } -static const U64 prime5bytes = 889523592379ULL; -static U32 LZ4_hashSequence64(size_t sequence, tableType_t const tableType) +static U32 LZ4_hash5(U64 sequence, tableType_t const tableType) { + static const U64 prime5bytes = 889523592379ULL; + static const U64 prime8bytes = 11400714785074694791ULL; const U32 hashLog = (tableType == byU16) ? LZ4_HASHLOG+1 : LZ4_HASHLOG; - const U32 hashMask = (1<> (40 - hashLog)) & hashMask; + if (LZ4_isLittleEndian()) + return (U32)(((sequence << 24) * prime5bytes) >> (64 - hashLog)); + else + return (U32)(((sequence >> 24) * prime8bytes) >> (64 - hashLog)); } -static U32 LZ4_hashSequenceT(size_t sequence, tableType_t const tableType) +FORCE_INLINE U32 LZ4_hashPosition(const void* const p, tableType_t const tableType) { - if (LZ4_64bits()) - return LZ4_hashSequence64(sequence, tableType); - return LZ4_hashSequence((U32)sequence, tableType); + if ((sizeof(reg_t)==8) && (tableType != byU16)) return LZ4_hash5(LZ4_read_ARCH(p), tableType); + return LZ4_hash4(LZ4_read32(p), tableType); } -static U32 LZ4_hashPosition(const void* p, tableType_t tableType) { return LZ4_hashSequenceT(LZ4_read_ARCH(p), tableType); } - static void LZ4_putPositionOnHash(const BYTE* p, U32 h, void* tableBase, tableType_t const tableType, const BYTE* srcBase) { switch (tableType) @@ -413,27 +434,30 @@ static void LZ4_putPositionOnHash(const BYTE* p, U32 h, void* tableBase, tableTy } } -static void LZ4_putPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase) +FORCE_INLINE void LZ4_putPosition(const BYTE* p, void* 
tableBase, tableType_t tableType, const BYTE* srcBase) { - U32 h = LZ4_hashPosition(p, tableType); + U32 const h = LZ4_hashPosition(p, tableType); LZ4_putPositionOnHash(p, h, tableBase, tableType, srcBase); } static const BYTE* LZ4_getPositionOnHash(U32 h, void* tableBase, tableType_t tableType, const BYTE* srcBase) { if (tableType == byPtr) { const BYTE** hashTable = (const BYTE**) tableBase; return hashTable[h]; } - if (tableType == byU32) { U32* hashTable = (U32*) tableBase; return hashTable[h] + srcBase; } - { U16* hashTable = (U16*) tableBase; return hashTable[h] + srcBase; } /* default, to ensure a return */ + if (tableType == byU32) { const U32* const hashTable = (U32*) tableBase; return hashTable[h] + srcBase; } + { const U16* const hashTable = (U16*) tableBase; return hashTable[h] + srcBase; } /* default, to ensure a return */ } -static const BYTE* LZ4_getPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase) +FORCE_INLINE const BYTE* LZ4_getPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase) { - U32 h = LZ4_hashPosition(p, tableType); + U32 const h = LZ4_hashPosition(p, tableType); return LZ4_getPositionOnHash(h, tableBase, tableType, srcBase); } + +/** LZ4_compress_generic() : + inlined, to ensure branches are decided at compilation time */ FORCE_INLINE int LZ4_compress_generic( - void* const ctx, + LZ4_stream_t_internal* const cctx, const char* const source, char* const dest, const int inputSize, @@ -444,15 +468,13 @@ FORCE_INLINE int LZ4_compress_generic( const dictIssue_directive dictIssue, const U32 acceleration) { - LZ4_stream_t_internal* const dictPtr = (LZ4_stream_t_internal*)ctx; - const BYTE* ip = (const BYTE*) source; const BYTE* base; const BYTE* lowLimit; - const BYTE* const lowRefLimit = ip - dictPtr->dictSize; - const BYTE* const dictionary = dictPtr->dictionary; - const BYTE* const dictEnd = dictionary + dictPtr->dictSize; - const size_t dictDelta = dictEnd - (const 
BYTE*)source; + const BYTE* const lowRefLimit = ip - cctx->dictSize; + const BYTE* const dictionary = cctx->dictionary; + const BYTE* const dictEnd = dictionary + cctx->dictSize; + const ptrdiff_t dictDelta = dictEnd - (const BYTE*)source; const BYTE* anchor = (const BYTE*) source; const BYTE* const iend = ip + inputSize; const BYTE* const mflimit = iend - MFLIMIT; @@ -462,10 +484,9 @@ FORCE_INLINE int LZ4_compress_generic( BYTE* const olimit = op + maxOutputSize; U32 forwardH; - size_t refDelta=0; /* Init conditions */ - if ((U32)inputSize > (U32)LZ4_MAX_INPUT_SIZE) return 0; /* Unsupported input size, too large (or negative) */ + if ((U32)inputSize > (U32)LZ4_MAX_INPUT_SIZE) return 0; /* Unsupported inputSize, too large (or negative) */ switch(dict) { case noDict: @@ -474,11 +495,11 @@ FORCE_INLINE int LZ4_compress_generic( lowLimit = (const BYTE*)source; break; case withPrefix64k: - base = (const BYTE*)source - dictPtr->currentOffset; - lowLimit = (const BYTE*)source - dictPtr->dictSize; + base = (const BYTE*)source - cctx->currentOffset; + lowLimit = (const BYTE*)source - cctx->dictSize; break; case usingExtDict: - base = (const BYTE*)source - dictPtr->currentOffset; + base = (const BYTE*)source - cctx->currentOffset; lowLimit = (const BYTE*)source; break; } @@ -486,44 +507,38 @@ FORCE_INLINE int LZ4_compress_generic( if (inputSizehashTable, tableType, base); ip++; forwardH = LZ4_hashPosition(ip, tableType); /* Main Loop */ - for ( ; ; ) - { + for ( ; ; ) { + ptrdiff_t refDelta = 0; const BYTE* match; BYTE* token; - { - const BYTE* forwardIp = ip; + + /* Find a match */ + { const BYTE* forwardIp = ip; unsigned step = 1; unsigned searchMatchNb = acceleration << LZ4_skipTrigger; - - /* Find a match */ do { - U32 h = forwardH; + U32 const h = forwardH; ip = forwardIp; forwardIp += step; step = (searchMatchNb++ >> LZ4_skipTrigger); if (unlikely(forwardIp > mflimit)) goto _last_literals; - match = LZ4_getPositionOnHash(h, ctx, tableType, base); - if 
(dict==usingExtDict) - { - if (match<(const BYTE*)source) - { + match = LZ4_getPositionOnHash(h, cctx->hashTable, tableType, base); + if (dict==usingExtDict) { + if (match < (const BYTE*)source) { refDelta = dictDelta; lowLimit = dictionary; - } - else - { + } else { refDelta = 0; lowLimit = (const BYTE*)source; - } - } + } } forwardH = LZ4_hashPosition(forwardIp, tableType); - LZ4_putPositionOnHash(ip, h, ctx, tableType, base); + LZ4_putPositionOnHash(ip, h, cctx->hashTable, tableType, base); } while ( ((dictIssue==dictSmall) ? (match < lowRefLimit) : 0) || ((tableType==byU16) ? 0 : (match + MAX_DISTANCE < ip)) @@ -531,18 +546,17 @@ FORCE_INLINE int LZ4_compress_generic( } /* Catch up */ - while ((ip>anchor) && (match+refDelta > lowLimit) && (unlikely(ip[-1]==match[refDelta-1]))) { ip--; match--; } + while (((ip>anchor) & (match+refDelta > lowLimit)) && (unlikely(ip[-1]==match[refDelta-1]))) { ip--; match--; } - { - /* Encode Literal length */ - unsigned litLength = (unsigned)(ip - anchor); + /* Encode Literals */ + { unsigned const litLength = (unsigned)(ip - anchor); token = op++; - if ((outputLimited) && (unlikely(op + litLength + (2 + 1 + LASTLITERALS) + (litLength/255) > olimit))) - return 0; /* Check output limit */ - if (litLength>=RUN_MASK) - { + if ((outputLimited) && /* Check output buffer overflow */ + (unlikely(op + litLength + (2 + 1 + LASTLITERALS) + (litLength/255) > olimit))) + return 0; + if (litLength >= RUN_MASK) { int len = (int)litLength-RUN_MASK; - *token=(RUN_MASK<= 255 ; len-=255) *op++ = 255; *op++ = (BYTE)len; } @@ -558,41 +572,37 @@ _next_match: LZ4_writeLE16(op, (U16)(ip-match)); op+=2; /* Encode MatchLength */ - { - unsigned matchLength; + { unsigned matchCode; - if ((dict==usingExtDict) && (lowLimit==dictionary)) - { + if ((dict==usingExtDict) && (lowLimit==dictionary)) { const BYTE* limit; match += refDelta; limit = ip + (dictEnd-match); if (limit > matchlimit) limit = matchlimit; - matchLength = LZ4_count(ip+MINMATCH, 
match+MINMATCH, limit); - ip += MINMATCH + matchLength; - if (ip==limit) - { - unsigned more = LZ4_count(ip, (const BYTE*)source, matchlimit); - matchLength += more; + matchCode = LZ4_count(ip+MINMATCH, match+MINMATCH, limit); + ip += MINMATCH + matchCode; + if (ip==limit) { + unsigned const more = LZ4_count(ip, (const BYTE*)source, matchlimit); + matchCode += more; ip += more; } - } - else - { - matchLength = LZ4_count(ip+MINMATCH, match+MINMATCH, matchlimit); - ip += MINMATCH + matchLength; + } else { + matchCode = LZ4_count(ip+MINMATCH, match+MINMATCH, matchlimit); + ip += MINMATCH + matchCode; } - if ((outputLimited) && (unlikely(op + (1 + LASTLITERALS) + (matchLength>>8) > olimit))) - return 0; /* Check output limit */ - if (matchLength>=ML_MASK) - { + if ( outputLimited && /* Check output buffer overflow */ + (unlikely(op + (1 + LASTLITERALS) + (matchCode>>8) > olimit)) ) + return 0; + if (matchCode >= ML_MASK) { *token += ML_MASK; - matchLength -= ML_MASK; - for (; matchLength >= 510 ; matchLength-=510) { *op++ = 255; *op++ = 255; } - if (matchLength >= 255) { matchLength-=255; *op++ = 255; } - *op++ = (BYTE)matchLength; - } - else *token += (BYTE)(matchLength); + matchCode -= ML_MASK; + LZ4_write32(op, 0xFFFFFFFF); + while (matchCode >= 4*255) op+=4, LZ4_write32(op, 0xFFFFFFFF), matchCode -= 4*255; + op += matchCode / 255; + *op++ = (BYTE)(matchCode % 255); + } else + *token += (BYTE)(matchCode); } anchor = ip; @@ -601,24 +611,19 @@ _next_match: if (ip > mflimit) break; /* Fill table */ - LZ4_putPosition(ip-2, ctx, tableType, base); + LZ4_putPosition(ip-2, cctx->hashTable, tableType, base); /* Test next position */ - match = LZ4_getPosition(ip, ctx, tableType, base); - if (dict==usingExtDict) - { - if (match<(const BYTE*)source) - { + match = LZ4_getPosition(ip, cctx->hashTable, tableType, base); + if (dict==usingExtDict) { + if (match < (const BYTE*)source) { refDelta = dictDelta; lowLimit = dictionary; - } - else - { + } else { refDelta = 0; lowLimit = 
(const BYTE*)source; - } - } - LZ4_putPosition(ip, ctx, tableType, base); + } } + LZ4_putPosition(ip, cctx->hashTable, tableType, base); if ( ((dictIssue==dictSmall) ? (match>=lowRefLimit) : 1) && (match+MAX_DISTANCE>=ip) && (LZ4_read32(match+refDelta)==LZ4_read32(ip)) ) @@ -630,19 +635,16 @@ _next_match: _last_literals: /* Encode Last Literals */ - { - const size_t lastRun = (size_t)(iend - anchor); - if ((outputLimited) && ((op - (BYTE*)dest) + lastRun + 1 + ((lastRun+255-RUN_MASK)/255) > (U32)maxOutputSize)) - return 0; /* Check output limit */ - if (lastRun >= RUN_MASK) - { + { size_t const lastRun = (size_t)(iend - anchor); + if ( (outputLimited) && /* Check output buffer overflow */ + ((op - (BYTE*)dest) + lastRun + 1 + ((lastRun+255-RUN_MASK)/255) > (U32)maxOutputSize) ) + return 0; + if (lastRun >= RUN_MASK) { size_t accumulator = lastRun - RUN_MASK; *op++ = RUN_MASK << ML_BITS; for(; accumulator >= 255 ; accumulator-=255) *op++ = 255; *op++ = (BYTE) accumulator; - } - else - { + } else { *op++ = (BYTE)(lastRun<internal_donotuse; LZ4_resetStream((LZ4_stream_t*)state); if (acceleration < 1) acceleration = ACCELERATION_DEFAULT; - if (maxOutputSize >= LZ4_compressBound(inputSize)) - { + if (maxOutputSize >= LZ4_compressBound(inputSize)) { if (inputSize < LZ4_64Klimit) - return LZ4_compress_generic(state, source, dest, inputSize, 0, notLimited, byU16, noDict, noDictIssue, acceleration); + return LZ4_compress_generic(ctx, source, dest, inputSize, 0, notLimited, byU16, noDict, noDictIssue, acceleration); else - return LZ4_compress_generic(state, source, dest, inputSize, 0, notLimited, LZ4_64bits() ? byU32 : byPtr, noDict, noDictIssue, acceleration); - } - else - { + return LZ4_compress_generic(ctx, source, dest, inputSize, 0, notLimited, (sizeof(void*)==8) ? 
byU32 : byPtr, noDict, noDictIssue, acceleration); + } else { if (inputSize < LZ4_64Klimit) - return LZ4_compress_generic(state, source, dest, inputSize, maxOutputSize, limitedOutput, byU16, noDict, noDictIssue, acceleration); + return LZ4_compress_generic(ctx, source, dest, inputSize, maxOutputSize, limitedOutput, byU16, noDict, noDictIssue, acceleration); else - return LZ4_compress_generic(state, source, dest, inputSize, maxOutputSize, limitedOutput, LZ4_64bits() ? byU32 : byPtr, noDict, noDictIssue, acceleration); + return LZ4_compress_generic(ctx, source, dest, inputSize, maxOutputSize, limitedOutput, (sizeof(void*)==8) ? byU32 : byPtr, noDict, noDictIssue, acceleration); } } @@ -682,10 +682,10 @@ int LZ4_compress_fast(const char* source, char* dest, int inputSize, int maxOutp void* ctxPtr = ALLOCATOR(1, sizeof(LZ4_stream_t)); /* malloc-calloc always properly aligned */ #else LZ4_stream_t ctx; - void* ctxPtr = &ctx; + void* const ctxPtr = &ctx; #endif - int result = LZ4_compress_fast_extState(ctxPtr, source, dest, inputSize, maxOutputSize, acceleration); + int const result = LZ4_compress_fast_extState(ctxPtr, source, dest, inputSize, maxOutputSize, acceleration); #if (HEAPMODE) FREEMEM(ctxPtr); @@ -705,22 +705,21 @@ int LZ4_compress_default(const char* source, char* dest, int inputSize, int maxO int LZ4_compress_fast_force(const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration) { LZ4_stream_t ctx; - LZ4_resetStream(&ctx); if (inputSize < LZ4_64Klimit) - return LZ4_compress_generic(&ctx, source, dest, inputSize, maxOutputSize, limitedOutput, byU16, noDict, noDictIssue, acceleration); + return LZ4_compress_generic(&ctx.internal_donotuse, source, dest, inputSize, maxOutputSize, limitedOutput, byU16, noDict, noDictIssue, acceleration); else - return LZ4_compress_generic(&ctx, source, dest, inputSize, maxOutputSize, limitedOutput, LZ4_64bits() ? 
byU32 : byPtr, noDict, noDictIssue, acceleration); + return LZ4_compress_generic(&ctx.internal_donotuse, source, dest, inputSize, maxOutputSize, limitedOutput, sizeof(void*)==8 ? byU32 : byPtr, noDict, noDictIssue, acceleration); } -/******************************** -* destSize variant +/*-****************************** +* *_destSize() variant ********************************/ static int LZ4_compress_destSize_generic( - void* const ctx, + LZ4_stream_t_internal* const ctx, const char* const src, char* const dst, int* const srcSizePtr, @@ -752,32 +751,30 @@ static int LZ4_compress_destSize_generic( /* First Byte */ *srcSizePtr = 0; - LZ4_putPosition(ip, ctx, tableType, base); + LZ4_putPosition(ip, ctx->hashTable, tableType, base); ip++; forwardH = LZ4_hashPosition(ip, tableType); /* Main Loop */ - for ( ; ; ) - { + for ( ; ; ) { const BYTE* match; BYTE* token; - { - const BYTE* forwardIp = ip; + + /* Find a match */ + { const BYTE* forwardIp = ip; unsigned step = 1; unsigned searchMatchNb = 1 << LZ4_skipTrigger; - /* Find a match */ do { U32 h = forwardH; ip = forwardIp; forwardIp += step; step = (searchMatchNb++ >> LZ4_skipTrigger); - if (unlikely(forwardIp > mflimit)) - goto _last_literals; + if (unlikely(forwardIp > mflimit)) goto _last_literals; - match = LZ4_getPositionOnHash(h, ctx, tableType, base); + match = LZ4_getPositionOnHash(h, ctx->hashTable, tableType, base); forwardH = LZ4_hashPosition(forwardIp, tableType); - LZ4_putPositionOnHash(ip, h, ctx, tableType, base); + LZ4_putPositionOnHash(ip, h, ctx->hashTable, tableType, base); } while ( ((tableType==byU16) ? 
0 : (match + MAX_DISTANCE < ip)) || (LZ4_read32(match) != LZ4_read32(ip)) ); @@ -786,18 +783,15 @@ static int LZ4_compress_destSize_generic( /* Catch up */ while ((ip>anchor) && (match > lowLimit) && (unlikely(ip[-1]==match[-1]))) { ip--; match--; } - { - /* Encode Literal length */ - unsigned litLength = (unsigned)(ip - anchor); + /* Encode Literal length */ + { unsigned litLength = (unsigned)(ip - anchor); token = op++; - if (op + ((litLength+240)/255) + litLength > oMaxLit) - { + if (op + ((litLength+240)/255) + litLength > oMaxLit) { /* Not enough space for a last match */ op--; goto _last_literals; } - if (litLength>=RUN_MASK) - { + if (litLength>=RUN_MASK) { unsigned len = litLength - RUN_MASK; *token=(RUN_MASK<= 255 ; len-=255) *op++ = 255; @@ -815,21 +809,15 @@ _next_match: LZ4_writeLE16(op, (U16)(ip-match)); op+=2; /* Encode MatchLength */ - { - size_t matchLength; + { size_t matchLength = LZ4_count(ip+MINMATCH, match+MINMATCH, matchlimit); - matchLength = LZ4_count(ip+MINMATCH, match+MINMATCH, matchlimit); - - if (op + ((matchLength+240)/255) > oMaxMatch) - { + if (op + ((matchLength+240)/255) > oMaxMatch) { /* Match description too long : reduce it */ matchLength = (15-1) + (oMaxMatch-op) * 255; } - //printf("offset %5i, matchLength%5i \n", (int)(ip-match), matchLength + MINMATCH); ip += MINMATCH + matchLength; - if (matchLength>=ML_MASK) - { + if (matchLength>=ML_MASK) { *token += ML_MASK; matchLength -= ML_MASK; while (matchLength >= 255) { matchLength-=255; *op++ = 255; } @@ -845,11 +833,11 @@ _next_match: if (op > oMaxSeq) break; /* Fill table */ - LZ4_putPosition(ip-2, ctx, tableType, base); + LZ4_putPosition(ip-2, ctx->hashTable, tableType, base); /* Test next position */ - match = LZ4_getPosition(ip, ctx, tableType, base); - LZ4_putPosition(ip, ctx, tableType, base); + match = LZ4_getPosition(ip, ctx->hashTable, tableType, base); + LZ4_putPosition(ip, ctx->hashTable, tableType, base); if ( (match+MAX_DISTANCE>=ip) && 
(LZ4_read32(match)==LZ4_read32(ip)) ) { token=op++; *token=0; goto _next_match; } @@ -860,25 +848,20 @@ _next_match: _last_literals: /* Encode Last Literals */ - { - size_t lastRunSize = (size_t)(iend - anchor); - if (op + 1 /* token */ + ((lastRunSize+240)/255) /* litLength */ + lastRunSize /* literals */ > oend) - { + { size_t lastRunSize = (size_t)(iend - anchor); + if (op + 1 /* token */ + ((lastRunSize+240)/255) /* litLength */ + lastRunSize /* literals */ > oend) { /* adapt lastRunSize to fill 'dst' */ lastRunSize = (oend-op) - 1; lastRunSize -= (lastRunSize+240)/255; } ip = anchor + lastRunSize; - if (lastRunSize >= RUN_MASK) - { + if (lastRunSize >= RUN_MASK) { size_t accumulator = lastRunSize - RUN_MASK; *op++ = RUN_MASK << ML_BITS; for(; accumulator >= 255 ; accumulator-=255) *op++ = 255; *op++ = (BYTE) accumulator; - } - else - { + } else { *op++ = (BYTE)(lastRunSize<= LZ4_compressBound(*srcSizePtr)) /* compression success is guaranteed */ - { + if (targetDstSize >= LZ4_compressBound(*srcSizePtr)) { /* compression success is guaranteed */ return LZ4_compress_fast_extState(state, src, dst, *srcSizePtr, targetDstSize, 1); - } - else - { + } else { if (*srcSizePtr < LZ4_64Klimit) - return LZ4_compress_destSize_generic(state, src, dst, srcSizePtr, targetDstSize, byU16); + return LZ4_compress_destSize_generic(&state->internal_donotuse, src, dst, srcSizePtr, targetDstSize, byU16); else - return LZ4_compress_destSize_generic(state, src, dst, srcSizePtr, targetDstSize, LZ4_64bits() ? byU32 : byPtr); + return LZ4_compress_destSize_generic(&state->internal_donotuse, src, dst, srcSizePtr, targetDstSize, sizeof(void*)==8 ? 
byU32 : byPtr); } } @@ -912,10 +892,10 @@ static int LZ4_compress_destSize_extState (void* state, const char* src, char* d int LZ4_compress_destSize(const char* src, char* dst, int* srcSizePtr, int targetDstSize) { #if (HEAPMODE) - void* ctx = ALLOCATOR(1, sizeof(LZ4_stream_t)); /* malloc-calloc always properly aligned */ + LZ4_stream_t* ctx = (LZ4_stream_t*)ALLOCATOR(1, sizeof(LZ4_stream_t)); /* malloc-calloc always properly aligned */ #else LZ4_stream_t ctxBody; - void* ctx = &ctxBody; + LZ4_stream_t* ctx = &ctxBody; #endif int result = LZ4_compress_destSize_extState(ctx, src, dst, srcSizePtr, targetDstSize); @@ -928,7 +908,7 @@ int LZ4_compress_destSize(const char* src, char* dst, int* srcSizePtr, int targe -/******************************** +/*-****************************** * Streaming functions ********************************/ @@ -952,10 +932,10 @@ int LZ4_freeStream (LZ4_stream_t* LZ4_stream) } -#define HASH_UNIT sizeof(size_t) +#define HASH_UNIT sizeof(reg_t) int LZ4_loadDict (LZ4_stream_t* LZ4_dict, const char* dictionary, int dictSize) { - LZ4_stream_t_internal* dict = (LZ4_stream_t_internal*) LZ4_dict; + LZ4_stream_t_internal* dict = &LZ4_dict->internal_donotuse; const BYTE* p = (const BYTE*)dictionary; const BYTE* const dictEnd = p + dictSize; const BYTE* base; @@ -963,8 +943,7 @@ int LZ4_loadDict (LZ4_stream_t* LZ4_dict, const char* dictionary, int dictSize) if ((dict->initCheck) || (dict->currentOffset > 1 GB)) /* Uninitialized structure, or reuse overflow */ LZ4_resetStream(LZ4_dict); - if (dictSize < (int)HASH_UNIT) - { + if (dictSize < (int)HASH_UNIT) { dict->dictionary = NULL; dict->dictSize = 0; return 0; @@ -977,8 +956,7 @@ int LZ4_loadDict (LZ4_stream_t* LZ4_dict, const char* dictionary, int dictSize) dict->dictSize = (U32)(dictEnd - p); dict->currentOffset += dict->dictSize; - while (p <= dictEnd-HASH_UNIT) - { + while (p <= dictEnd-HASH_UNIT) { LZ4_putPosition(p, dict->hashTable, byU32, base); p+=3; } @@ -990,14 +968,12 @@ int LZ4_loadDict 
(LZ4_stream_t* LZ4_dict, const char* dictionary, int dictSize) static void LZ4_renormDictT(LZ4_stream_t_internal* LZ4_dict, const BYTE* src) { if ((LZ4_dict->currentOffset > 0x80000000) || - ((size_t)LZ4_dict->currentOffset > (size_t)src)) /* address space overflow */ - { + ((uptrval)LZ4_dict->currentOffset > (uptrval)src)) { /* address space overflow */ /* rescale hash table */ - U32 delta = LZ4_dict->currentOffset - 64 KB; + U32 const delta = LZ4_dict->currentOffset - 64 KB; const BYTE* dictEnd = LZ4_dict->dictionary + LZ4_dict->dictSize; int i; - for (i=0; ihashTable[i] < delta) LZ4_dict->hashTable[i]=0; else LZ4_dict->hashTable[i] -= delta; } @@ -1010,7 +986,7 @@ static void LZ4_renormDictT(LZ4_stream_t_internal* LZ4_dict, const BYTE* src) int LZ4_compress_fast_continue (LZ4_stream_t* LZ4_stream, const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration) { - LZ4_stream_t_internal* streamPtr = (LZ4_stream_t_internal*)LZ4_stream; + LZ4_stream_t_internal* streamPtr = &LZ4_stream->internal_donotuse; const BYTE* const dictEnd = streamPtr->dictionary + streamPtr->dictSize; const BYTE* smallest = (const BYTE*) source; @@ -1020,10 +996,8 @@ int LZ4_compress_fast_continue (LZ4_stream_t* LZ4_stream, const char* source, ch if (acceleration < 1) acceleration = ACCELERATION_DEFAULT; /* Check overlapping input/dictionary space */ - { - const BYTE* sourceEnd = (const BYTE*) source + inputSize; - if ((sourceEnd > streamPtr->dictionary) && (sourceEnd < dictEnd)) - { + { const BYTE* sourceEnd = (const BYTE*) source + inputSize; + if ((sourceEnd > streamPtr->dictionary) && (sourceEnd < dictEnd)) { streamPtr->dictSize = (U32)(dictEnd - sourceEnd); if (streamPtr->dictSize > 64 KB) streamPtr->dictSize = 64 KB; if (streamPtr->dictSize < 4) streamPtr->dictSize = 0; @@ -1032,25 +1006,23 @@ int LZ4_compress_fast_continue (LZ4_stream_t* LZ4_stream, const char* source, ch } /* prefix mode : source data follows dictionary */ - if (dictEnd == (const BYTE*)source) - 
{ + if (dictEnd == (const BYTE*)source) { int result; if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset)) - result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, withPrefix64k, dictSmall, acceleration); + result = LZ4_compress_generic(streamPtr, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, withPrefix64k, dictSmall, acceleration); else - result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, withPrefix64k, noDictIssue, acceleration); + result = LZ4_compress_generic(streamPtr, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, withPrefix64k, noDictIssue, acceleration); streamPtr->dictSize += (U32)inputSize; streamPtr->currentOffset += (U32)inputSize; return result; } /* external dictionary mode */ - { - int result; + { int result; if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset)) - result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, usingExtDict, dictSmall, acceleration); + result = LZ4_compress_generic(streamPtr, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, usingExtDict, dictSmall, acceleration); else - result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, usingExtDict, noDictIssue, acceleration); + result = LZ4_compress_generic(streamPtr, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, usingExtDict, noDictIssue, acceleration); streamPtr->dictionary = (const BYTE*)source; streamPtr->dictSize = (U32)inputSize; streamPtr->currentOffset += (U32)inputSize; @@ -1062,15 +1034,15 @@ int LZ4_compress_fast_continue (LZ4_stream_t* LZ4_stream, const char* source, ch /* Hidden debug function, to force external dictionary mode */ int LZ4_compress_forceExtDict (LZ4_stream_t* LZ4_dict, const char* source, char* dest, int inputSize) { - 
LZ4_stream_t_internal* streamPtr = (LZ4_stream_t_internal*)LZ4_dict; + LZ4_stream_t_internal* streamPtr = &LZ4_dict->internal_donotuse; int result; const BYTE* const dictEnd = streamPtr->dictionary + streamPtr->dictSize; const BYTE* smallest = dictEnd; if (smallest > (const BYTE*) source) smallest = (const BYTE*) source; - LZ4_renormDictT((LZ4_stream_t_internal*)LZ4_dict, smallest); + LZ4_renormDictT(streamPtr, smallest); - result = LZ4_compress_generic(LZ4_dict, source, dest, inputSize, 0, notLimited, byU32, usingExtDict, noDictIssue, 1); + result = LZ4_compress_generic(streamPtr, source, dest, inputSize, 0, notLimited, byU32, usingExtDict, noDictIssue, 1); streamPtr->dictionary = (const BYTE*)source; streamPtr->dictSize = (U32)inputSize; @@ -1080,10 +1052,17 @@ int LZ4_compress_forceExtDict (LZ4_stream_t* LZ4_dict, const char* source, char* } +/*! LZ4_saveDict() : + * If previously compressed data block is not guaranteed to remain available at its memory location, + * save it into a safer place (char* safeBuffer). + * Note : you don't need to call LZ4_loadDict() afterwards, + * dictionary is immediately usable, you can therefore call LZ4_compress_fast_continue(). + * Return : saved dictionary size in bytes (necessarily <= dictSize), or 0 if error. 
+ */ int LZ4_saveDict (LZ4_stream_t* LZ4_dict, char* safeBuffer, int dictSize) { - LZ4_stream_t_internal* dict = (LZ4_stream_t_internal*) LZ4_dict; - const BYTE* previousDictEnd = dict->dictionary + dict->dictSize; + LZ4_stream_t_internal* const dict = &LZ4_dict->internal_donotuse; + const BYTE* const previousDictEnd = dict->dictionary + dict->dictSize; if ((U32)dictSize > 64 KB) dictSize = 64 KB; /* useless to define a dictionary > 64 KB */ if ((U32)dictSize > dict->dictSize) dictSize = dict->dictSize; @@ -1098,14 +1077,14 @@ int LZ4_saveDict (LZ4_stream_t* LZ4_dict, char* safeBuffer, int dictSize) -/******************************* +/*-***************************** * Decompression functions *******************************/ -/* - * This generic decompression function cover all use cases. - * It shall be instantiated several times, using different sets of directives - * Note that it is essential this generic function is really inlined, - * in order to remove useless branches during compilation optimization. +/*! LZ4_decompress_generic() : + * This generic decompression function cover all use cases. + * It shall be instantiated several times, using different sets of directives + * Note that it is important this generic function is really inlined, + * in order to remove useless branches during compilation optimization. 
*/ FORCE_INLINE int LZ4_decompress_generic( const char* const source, @@ -1117,7 +1096,7 @@ FORCE_INLINE int LZ4_decompress_generic( int partialDecoding, /* full, partial */ int targetOutputSize, /* only used if partialDecoding==partial */ int dict, /* noDict, withPrefix64k, usingExtDict */ - const BYTE* const lowPrefix, /* == dest if dict == noDict */ + const BYTE* const lowPrefix, /* == dest when no prefix */ const BYTE* const dictStart, /* only if dict==usingExtDict */ const size_t dictSize /* note : = 0 if noDict */ ) @@ -1133,53 +1112,45 @@ FORCE_INLINE int LZ4_decompress_generic( const BYTE* const lowLimit = lowPrefix - dictSize; const BYTE* const dictEnd = (const BYTE*)dictStart + dictSize; - const size_t dec32table[] = {4, 1, 2, 1, 4, 4, 4, 4}; - const size_t dec64table[] = {0, 0, 0, (size_t)-1, 0, 1, 2, 3}; + const unsigned dec32table[] = {0, 1, 2, 1, 4, 4, 4, 4}; + const int dec64table[] = {0, 0, 0, -1, 0, 1, 2, 3}; const int safeDecode = (endOnInput==endOnInputSize); const int checkOffset = ((safeDecode) && (dictSize < (int)(64 KB))); /* Special cases */ - if ((partialDecoding) && (oexit> oend-MFLIMIT)) oexit = oend-MFLIMIT; /* targetOutputSize too high => decode everything */ + if ((partialDecoding) && (oexit > oend-MFLIMIT)) oexit = oend-MFLIMIT; /* targetOutputSize too high => decode everything */ if ((endOnInput) && (unlikely(outputSize==0))) return ((inputSize==1) && (*ip==0)) ? 
0 : -1; /* Empty output buffer */ if ((!endOnInput) && (unlikely(outputSize==0))) return (*ip==0?1:-1); - - /* Main Loop */ - while (1) - { - unsigned token; + /* Main Loop : decode sequences */ + while (1) { size_t length; const BYTE* match; + size_t offset; /* get literal length */ - token = *ip++; - if ((length=(token>>ML_BITS)) == RUN_MASK) - { + unsigned const token = *ip++; + if ((length=(token>>ML_BITS)) == RUN_MASK) { unsigned s; - do - { + do { s = *ip++; length += s; - } - while (likely((endOnInput)?ip(partialDecoding?oexit:oend-MFLIMIT)) || (ip+length>iend-(2+1+LASTLITERALS))) ) - || ((!endOnInput) && (cpy>oend-COPYLENGTH))) + if ( ((endOnInput) && ((cpy>(partialDecoding?oexit:oend-MFLIMIT)) || (ip+length>iend-(2+1+LASTLITERALS))) ) + || ((!endOnInput) && (cpy>oend-WILDCOPYLENGTH)) ) { - if (partialDecoding) - { + if (partialDecoding) { if (cpy > oend) goto _output_error; /* Error : write attempt beyond end of output buffer */ if ((endOnInput) && (ip+length > iend)) goto _output_error; /* Error : read attempt beyond end of input buffer */ - } - else - { + } else { if ((!endOnInput) && (cpy != oend)) goto _output_error; /* Error : block decoding must stop exactly there */ if ((endOnInput) && ((ip+length != iend) || (cpy > oend))) goto _output_error; /* Error : input must be consumed */ } @@ -1192,84 +1163,76 @@ FORCE_INLINE int LZ4_decompress_generic( ip += length; op = cpy; /* get offset */ - match = cpy - LZ4_readLE16(ip); ip+=2; - if ((checkOffset) && (unlikely(match < lowLimit))) goto _output_error; /* Error : offset outside destination buffer */ + offset = LZ4_readLE16(ip); ip+=2; + match = op - offset; + if ((checkOffset) && (unlikely(match < lowLimit))) goto _output_error; /* Error : offset outside buffers */ + LZ4_write32(op, (U32)offset); /* costs ~1%; silence an msan warning when offset==0 */ /* get matchlength */ length = token & ML_MASK; - if (length == ML_MASK) - { + if (length == ML_MASK) { unsigned s; - do - { - if ((endOnInput) && (ip > 
iend-LASTLITERALS)) goto _output_error; + do { s = *ip++; + if ((endOnInput) && (ip > iend-LASTLITERALS)) goto _output_error; length += s; } while (s==255); - if ((safeDecode) && unlikely((size_t)(op+length)<(size_t)op)) goto _output_error; /* overflow detection */ + if ((safeDecode) && unlikely((uptrval)(op)+length<(uptrval)op)) goto _output_error; /* overflow detection */ } length += MINMATCH; /* check external dictionary */ - if ((dict==usingExtDict) && (match < lowPrefix)) - { + if ((dict==usingExtDict) && (match < lowPrefix)) { if (unlikely(op+length > oend-LASTLITERALS)) goto _output_error; /* doesn't respect parsing restriction */ - if (length <= (size_t)(lowPrefix-match)) - { + if (length <= (size_t)(lowPrefix-match)) { /* match can be copied as a single segment from external dictionary */ - match = dictEnd - (lowPrefix-match); - memmove(op, match, length); op += length; - } - else - { - /* match encompass external dictionary and current segment */ - size_t copySize = (size_t)(lowPrefix-match); + memmove(op, dictEnd - (lowPrefix-match), length); + op += length; + } else { + /* match encompass external dictionary and current block */ + size_t const copySize = (size_t)(lowPrefix-match); + size_t const restSize = length - copySize; memcpy(op, dictEnd - copySize, copySize); op += copySize; - copySize = length - copySize; - if (copySize > (size_t)(op-lowPrefix)) /* overlap within current segment */ - { - BYTE* const endOfMatch = op + copySize; + if (restSize > (size_t)(op-lowPrefix)) { /* overlap copy */ + BYTE* const endOfMatch = op + restSize; const BYTE* copyFrom = lowPrefix; while (op < endOfMatch) *op++ = *copyFrom++; - } - else - { - memcpy(op, lowPrefix, copySize); - op += copySize; - } - } + } else { + memcpy(op, lowPrefix, restSize); + op += restSize; + } } continue; } - /* copy repeated sequence */ + /* copy match within block */ cpy = op + length; - if (unlikely((op-match)<8)) - { - const size_t dec64 = dec64table[op-match]; + if (unlikely(offset<8)) 
{ + const int dec64 = dec64table[offset]; op[0] = match[0]; op[1] = match[1]; op[2] = match[2]; op[3] = match[3]; - match += dec32table[op-match]; - LZ4_copy4(op+4, match); - op += 8; match -= dec64; - } else { LZ4_copy8(op, match); op+=8; match+=8; } + match += dec32table[offset]; + memcpy(op+4, match, 4); + match -= dec64; + } else { LZ4_copy8(op, match); match+=8; } + op += 8; - if (unlikely(cpy>oend-12)) - { - if (cpy > oend-LASTLITERALS) goto _output_error; /* Error : last LASTLITERALS bytes must be literals */ - if (op < oend-8) - { - LZ4_wildCopy(op, match, oend-8); - match += (oend-8) - op; - op = oend-8; + if (unlikely(cpy>oend-12)) { + BYTE* const oCopyLimit = oend-(WILDCOPYLENGTH-1); + if (cpy > oend-LASTLITERALS) goto _output_error; /* Error : last LASTLITERALS bytes must be literals (uncompressed) */ + if (op < oCopyLimit) { + LZ4_wildCopy(op, match, oCopyLimit); + match += oCopyLimit - op; + op = oCopyLimit; } while (op16) LZ4_wildCopy(op+8, match+8, cpy); } - else - LZ4_wildCopy(op, match, cpy); op=cpy; /* correction */ } @@ -1301,15 +1264,7 @@ int LZ4_decompress_fast(const char* source, char* dest, int originalSize) } -/* streaming decompression functions */ - -typedef struct -{ - const BYTE* externalDict; - size_t extDictSize; - const BYTE* prefixEnd; - size_t prefixSize; -} LZ4_streamDecode_t_internal; +/*===== streaming decompression functions =====*/ /* * If you prefer dynamic allocation methods, @@ -1328,16 +1283,16 @@ int LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream) return 0; } -/* - * LZ4_setStreamDecode - * Use this function to instruct where to find the dictionary +/*! + * LZ4_setStreamDecode() : + * Use this function to instruct where to find the dictionary. * This function is not necessary if previous data is still available where it was decoded. * Loading a size of 0 is allowed (same effect as no dictionary). 
* Return : 1 if OK, 0 if error */ int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize) { - LZ4_streamDecode_t_internal* lz4sd = (LZ4_streamDecode_t_internal*) LZ4_streamDecode; + LZ4_streamDecode_t_internal* lz4sd = &LZ4_streamDecode->internal_donotuse; lz4sd->prefixSize = (size_t) dictSize; lz4sd->prefixEnd = (const BYTE*) dictionary + dictSize; lz4sd->externalDict = NULL; @@ -1354,20 +1309,17 @@ int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dicti */ int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int compressedSize, int maxOutputSize) { - LZ4_streamDecode_t_internal* lz4sd = (LZ4_streamDecode_t_internal*) LZ4_streamDecode; + LZ4_streamDecode_t_internal* lz4sd = &LZ4_streamDecode->internal_donotuse; int result; - if (lz4sd->prefixEnd == (BYTE*)dest) - { + if (lz4sd->prefixEnd == (BYTE*)dest) { result = LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, endOnInputSize, full, 0, usingExtDict, lz4sd->prefixEnd - lz4sd->prefixSize, lz4sd->externalDict, lz4sd->extDictSize); if (result <= 0) return result; lz4sd->prefixSize += result; lz4sd->prefixEnd += result; - } - else - { + } else { lz4sd->extDictSize = lz4sd->prefixSize; lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize; result = LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, @@ -1383,22 +1335,19 @@ int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const ch int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int originalSize) { - LZ4_streamDecode_t_internal* lz4sd = (LZ4_streamDecode_t_internal*) LZ4_streamDecode; + LZ4_streamDecode_t_internal* lz4sd = &LZ4_streamDecode->internal_donotuse; int result; - if (lz4sd->prefixEnd == (BYTE*)dest) - { + if (lz4sd->prefixEnd == (BYTE*)dest) { result = LZ4_decompress_generic(source, dest, 0, originalSize, endOnOutputSize, 
full, 0, usingExtDict, lz4sd->prefixEnd - lz4sd->prefixSize, lz4sd->externalDict, lz4sd->extDictSize); if (result <= 0) return result; lz4sd->prefixSize += originalSize; lz4sd->prefixEnd += originalSize; - } - else - { + } else { lz4sd->extDictSize = lz4sd->prefixSize; - lz4sd->externalDict = (BYTE*)dest - lz4sd->extDictSize; + lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize; result = LZ4_decompress_generic(source, dest, 0, originalSize, endOnOutputSize, full, 0, usingExtDict, (BYTE*)dest, lz4sd->externalDict, lz4sd->extDictSize); @@ -1422,8 +1371,7 @@ FORCE_INLINE int LZ4_decompress_usingDict_generic(const char* source, char* dest { if (dictSize==0) return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, noDict, (BYTE*)dest, NULL, 0); - if (dictStart+dictSize == dest) - { + if (dictStart+dictSize == dest) { if (dictSize >= (int)(64 KB - 1)) return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, withPrefix64k, (BYTE*)dest-64 KB, NULL, 0); return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, noDict, (BYTE*)dest-dictSize, NULL, 0); @@ -1448,7 +1396,7 @@ int LZ4_decompress_safe_forceExtDict(const char* source, char* dest, int compres } -/*************************************************** +/*=************************************************* * Obsolete Functions ***************************************************/ /* obsolete compression functions */ @@ -1473,29 +1421,29 @@ int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int LZ4_sizeofStreamState() { return LZ4_STREAMSIZE; } -static void LZ4_init(LZ4_stream_t_internal* lz4ds, BYTE* base) +static void LZ4_init(LZ4_stream_t* lz4ds, BYTE* base) { - MEM_INIT(lz4ds, 0, LZ4_STREAMSIZE); - lz4ds->bufferStart = base; + MEM_INIT(lz4ds, 0, sizeof(LZ4_stream_t)); + lz4ds->internal_donotuse.bufferStart = base; } int LZ4_resetStreamState(void* state, char* inputBuffer) { - if 
((((size_t)state) & 3) != 0) return 1; /* Error : pointer is not aligned on 4-bytes boundary */ - LZ4_init((LZ4_stream_t_internal*)state, (BYTE*)inputBuffer); + if ((((uptrval)state) & 3) != 0) return 1; /* Error : pointer is not aligned on 4-bytes boundary */ + LZ4_init((LZ4_stream_t*)state, (BYTE*)inputBuffer); return 0; } void* LZ4_create (char* inputBuffer) { - void* lz4ds = ALLOCATOR(8, LZ4_STREAMSIZE_U64); - LZ4_init ((LZ4_stream_t_internal*)lz4ds, (BYTE*)inputBuffer); + LZ4_stream_t* lz4ds = (LZ4_stream_t*)ALLOCATOR(8, sizeof(LZ4_stream_t)); + LZ4_init (lz4ds, (BYTE*)inputBuffer); return lz4ds; } char* LZ4_slideInputBuffer (void* LZ4_Data) { - LZ4_stream_t_internal* ctx = (LZ4_stream_t_internal*)LZ4_Data; + LZ4_stream_t_internal* ctx = &((LZ4_stream_t*)LZ4_Data)->internal_donotuse; int dictSize = LZ4_saveDict((LZ4_stream_t*)LZ4_Data, (char*)ctx->bufferStart, 64 KB); return (char*)(ctx->bufferStart + dictSize); } @@ -1513,4 +1461,3 @@ int LZ4_decompress_fast_withPrefix64k(const char* source, char* dest, int origin } #endif /* LZ4_COMMONDEFS_ONLY */ - diff --git a/contrib/liblz4/src/lz4hc.c b/contrib/liblz4/src/lz4hc.c index 8cc1a3398da..5d4ea3e6328 100644 --- a/contrib/liblz4/src/lz4hc.c +++ b/contrib/liblz4/src/lz4hc.c @@ -1,6 +1,6 @@ /* LZ4 HC - High Compression Mode of LZ4 - Copyright (C) 2011-2015, Yann Collet. + Copyright (C) 2011-2016, Yann Collet. BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) @@ -28,27 +28,36 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
You can contact the author at : - - LZ4 source repository : https://github.com/Cyan4973/lz4 + - LZ4 source repository : https://github.com/lz4/lz4 - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c */ +/* note : lz4hc is not an independent module, it requires lz4.h/lz4.c for proper compilation */ - -/************************************** +/* ************************************* * Tuning Parameter -**************************************/ -static const int LZ4HC_compressionLevel_default = 9; +***************************************/ + +/*! + * HEAPMODE : + * Select how default compression function will allocate workplace memory, + * in stack (0:fastest), or in heap (1:requires malloc()). + * Since workplace is rather large, heap mode is recommended. + */ +#ifndef LZ4HC_HEAPMODE +# define LZ4HC_HEAPMODE 1 +#endif -/************************************** -* Includes -**************************************/ -#include +/* ************************************* +* Dependency +***************************************/ +#include "lz4hc.h" -/************************************** +/* ************************************* * Local Compiler Options -**************************************/ +***************************************/ #if defined(__GNUC__) # pragma GCC diagnostic ignored "-Wunused-function" #endif @@ -58,52 +67,24 @@ static const int LZ4HC_compressionLevel_default = 9; #endif -/************************************** +/* ************************************* * Common LZ4 definition -**************************************/ +***************************************/ #define LZ4_COMMONDEFS_ONLY #include "lz4.c" -/************************************** +/* ************************************* * Local Constants -**************************************/ -#define DICTIONARY_LOGSIZE 16 -#define MAXD (1<> ((MINMATCH*8)-HASH_LOG)) -//#define DELTANEXTU16(p) chainTable[(p) & MAXD_MASK] /* flexible, MAXD dependent */ +#define HASH_FUNCTION(i) (((i) * 2654435761U) 
>> ((MINMATCH*8)-LZ4HC_HASH_LOG)) +#define DELTANEXTMAXD(p) chainTable[(p) & LZ4HC_MAXD_MASK] /* flexible, LZ4HC_MAXD dependent */ #define DELTANEXTU16(p) chainTable[(U16)(p)] /* faster */ static U32 LZ4HC_hashPtr(const void* ptr) { return HASH_FUNCTION(LZ4_read32(ptr)); } @@ -113,7 +94,7 @@ static U32 LZ4HC_hashPtr(const void* ptr) { return HASH_FUNCTION(LZ4_read32(ptr) /************************************** * HC Compression **************************************/ -static void LZ4HC_init (LZ4HC_Data_Structure* hc4, const BYTE* start) +static void LZ4HC_init (LZ4HC_CCtx_internal* hc4, const BYTE* start) { MEM_INIT((void*)hc4->hashTable, 0, sizeof(hc4->hashTable)); MEM_INIT(hc4->chainTable, 0xFF, sizeof(hc4->chainTable)); @@ -127,21 +108,20 @@ static void LZ4HC_init (LZ4HC_Data_Structure* hc4, const BYTE* start) /* Update chains up to ip (excluded) */ -FORCE_INLINE void LZ4HC_Insert (LZ4HC_Data_Structure* hc4, const BYTE* ip) +FORCE_INLINE void LZ4HC_Insert (LZ4HC_CCtx_internal* hc4, const BYTE* ip) { - U16* chainTable = hc4->chainTable; - U32* HashTable = hc4->hashTable; + U16* const chainTable = hc4->chainTable; + U32* const hashTable = hc4->hashTable; const BYTE* const base = hc4->base; - const U32 target = (U32)(ip - base); + U32 const target = (U32)(ip - base); U32 idx = hc4->nextToUpdate; - while(idx < target) - { - U32 h = LZ4HC_hashPtr(base+idx); - size_t delta = idx - HashTable[h]; + while (idx < target) { + U32 const h = LZ4HC_hashPtr(base+idx); + size_t delta = idx - hashTable[h]; if (delta>MAX_DISTANCE) delta = MAX_DISTANCE; DELTANEXTU16(idx) = (U16)delta; - HashTable[h] = idx; + hashTable[h] = idx; idx++; } @@ -149,7 +129,7 @@ FORCE_INLINE void LZ4HC_Insert (LZ4HC_Data_Structure* hc4, const BYTE* ip) } -FORCE_INLINE int LZ4HC_InsertAndFindBestMatch (LZ4HC_Data_Structure* hc4, /* Index table will be updated */ +FORCE_INLINE int LZ4HC_InsertAndFindBestMatch (LZ4HC_CCtx_internal* hc4, /* Index table will be updated */ const BYTE* ip, const BYTE* const 
iLimit, const BYTE** matchpos, const int maxNbAttempts) @@ -161,7 +141,6 @@ FORCE_INLINE int LZ4HC_InsertAndFindBestMatch (LZ4HC_Data_Structure* hc4, /* I const U32 dictLimit = hc4->dictLimit; const U32 lowLimit = (hc4->lowLimit + 64 KB > (U32)(ip-base)) ? hc4->lowLimit : (U32)(ip - base) - (64 KB - 1); U32 matchIndex; - const BYTE* match; int nbAttempts=maxNbAttempts; size_t ml=0; @@ -169,24 +148,19 @@ FORCE_INLINE int LZ4HC_InsertAndFindBestMatch (LZ4HC_Data_Structure* hc4, /* I LZ4HC_Insert(hc4, ip); matchIndex = HashTable[LZ4HC_hashPtr(ip)]; - while ((matchIndex>=lowLimit) && (nbAttempts)) - { + while ((matchIndex>=lowLimit) && (nbAttempts)) { nbAttempts--; - if (matchIndex >= dictLimit) - { - match = base + matchIndex; + if (matchIndex >= dictLimit) { + const BYTE* const match = base + matchIndex; if (*(match+ml) == *(ip+ml) && (LZ4_read32(match) == LZ4_read32(ip))) { - size_t mlt = LZ4_count(ip+MINMATCH, match+MINMATCH, iLimit) + MINMATCH; + size_t const mlt = LZ4_count(ip+MINMATCH, match+MINMATCH, iLimit) + MINMATCH; if (mlt > ml) { ml = mlt; *matchpos = match; } } - } - else - { - match = dictBase + matchIndex; - if (LZ4_read32(match) == LZ4_read32(ip)) - { + } else { + const BYTE* const match = dictBase + matchIndex; + if (LZ4_read32(match) == LZ4_read32(ip)) { size_t mlt; const BYTE* vLimit = ip + (dictLimit - matchIndex); if (vLimit > iLimit) vLimit = iLimit; @@ -204,7 +178,7 @@ FORCE_INLINE int LZ4HC_InsertAndFindBestMatch (LZ4HC_Data_Structure* hc4, /* I FORCE_INLINE int LZ4HC_InsertAndGetWiderMatch ( - LZ4HC_Data_Structure* hc4, + LZ4HC_CCtx_internal* hc4, const BYTE* const ip, const BYTE* const iLowLimit, const BYTE* const iHighLimit, @@ -229,38 +203,32 @@ FORCE_INLINE int LZ4HC_InsertAndGetWiderMatch ( LZ4HC_Insert(hc4, ip); matchIndex = HashTable[LZ4HC_hashPtr(ip)]; - while ((matchIndex>=lowLimit) && (nbAttempts)) - { + while ((matchIndex>=lowLimit) && (nbAttempts)) { nbAttempts--; - if (matchIndex >= dictLimit) - { + if (matchIndex >= dictLimit) { 
const BYTE* matchPtr = base + matchIndex; - if (*(iLowLimit + longest) == *(matchPtr - delta + longest)) - if (LZ4_read32(matchPtr) == LZ4_read32(ip)) - { + if (*(iLowLimit + longest) == *(matchPtr - delta + longest)) { + if (LZ4_read32(matchPtr) == LZ4_read32(ip)) { int mlt = MINMATCH + LZ4_count(ip+MINMATCH, matchPtr+MINMATCH, iHighLimit); int back = 0; - while ((ip+back>iLowLimit) + while ((ip+back > iLowLimit) && (matchPtr+back > lowPrefixPtr) && (ip[back-1] == matchPtr[back-1])) back--; mlt -= back; - if (mlt > longest) - { + if (mlt > longest) { longest = (int)mlt; *matchpos = matchPtr+back; *startpos = ip+back; } } - } - else - { - const BYTE* matchPtr = dictBase + matchIndex; - if (LZ4_read32(matchPtr) == LZ4_read32(ip)) - { + } + } else { + const BYTE* const matchPtr = dictBase + matchIndex; + if (LZ4_read32(matchPtr) == LZ4_read32(ip)) { size_t mlt; int back=0; const BYTE* vLimit = ip + (dictLimit - matchIndex); @@ -320,8 +288,15 @@ FORCE_INLINE int LZ4HC_encodeSequence ( /* Encode MatchLength */ length = (int)(matchLength-MINMATCH); if ((limitedOutputBuffer) && (*op + (length>>8) + (1 + LASTLITERALS) > oend)) return 1; /* Check output limit */ - if (length>=(int)ML_MASK) { *token+=ML_MASK; length-=ML_MASK; for(; length > 509 ; length-=510) { *(*op)++ = 255; *(*op)++ = 255; } if (length > 254) { length-=255; *(*op)++ = 255; } *(*op)++ = (BYTE)length; } - else *token += (BYTE)(length); + if (length>=(int)ML_MASK) { + *token += ML_MASK; + length -= ML_MASK; + for(; length > 509 ; length-=510) { *(*op)++ = 255; *(*op)++ = 255; } + if (length > 254) { length-=255; *(*op)++ = 255; } + *(*op)++ = (BYTE)length; + } else { + *token += (BYTE)(length); + } /* Prepare next loop */ *ip += matchLength; @@ -330,18 +305,18 @@ FORCE_INLINE int LZ4HC_encodeSequence ( return 0; } +#include "lz4opt.h" -static int LZ4HC_compress_generic ( - void* ctxvoid, - const char* source, - char* dest, - int inputSize, - int maxOutputSize, - int compressionLevel, +static int 
LZ4HC_compress_hashChain ( + LZ4HC_CCtx_internal* const ctx, + const char* const source, + char* const dest, + int const inputSize, + int const maxOutputSize, + unsigned maxNbAttempts, limitedOutput_directive limit ) { - LZ4HC_Data_Structure* ctx = (LZ4HC_Data_Structure*) ctxvoid; const BYTE* ip = (const BYTE*) source; const BYTE* anchor = ip; const BYTE* const iend = ip + inputSize; @@ -351,28 +326,22 @@ static int LZ4HC_compress_generic ( BYTE* op = (BYTE*) dest; BYTE* const oend = op + maxOutputSize; - unsigned maxNbAttempts; int ml, ml2, ml3, ml0; - const BYTE* ref=NULL; - const BYTE* start2=NULL; - const BYTE* ref2=NULL; - const BYTE* start3=NULL; - const BYTE* ref3=NULL; + const BYTE* ref = NULL; + const BYTE* start2 = NULL; + const BYTE* ref2 = NULL; + const BYTE* start3 = NULL; + const BYTE* ref3 = NULL; const BYTE* start0; const BYTE* ref0; - /* init */ - if (compressionLevel > g_maxCompressionLevel) compressionLevel = g_maxCompressionLevel; - if (compressionLevel < 1) compressionLevel = LZ4HC_compressionLevel_default; - maxNbAttempts = 1 << (compressionLevel-1); ctx->end += inputSize; ip++; /* Main Loop */ - while (ip < mflimit) - { + while (ip < mflimit) { ml = LZ4HC_InsertAndFindBestMatch (ctx, ip, matchlimit, (&ref), maxNbAttempts); if (!ml) { ip++; continue; } @@ -383,19 +352,16 @@ static int LZ4HC_compress_generic ( _Search2: if (ip+ml < mflimit) - ml2 = LZ4HC_InsertAndGetWiderMatch(ctx, ip + ml - 2, ip + 1, matchlimit, ml, &ref2, &start2, maxNbAttempts); + ml2 = LZ4HC_InsertAndGetWiderMatch(ctx, ip + ml - 2, ip + 0, matchlimit, ml, &ref2, &start2, maxNbAttempts); else ml2 = ml; - if (ml2 == ml) /* No better match */ - { + if (ml2 == ml) { /* No better match */ if (LZ4HC_encodeSequence(&ip, &op, &anchor, ml, ref, limit, oend)) return 0; continue; } - if (start0 < ip) - { - if (start2 < ip + ml0) /* empirical */ - { + if (start0 < ip) { + if (start2 < ip + ml0) { /* empirical */ ip = start0; ref = ref0; ml = ml0; @@ -403,8 +369,7 @@ _Search2: } /* 
Here, start0==ip */ - if ((start2 - ip) < 3) /* First Match too small : removed */ - { + if ((start2 - ip) < 3) { /* First Match too small : removed */ ml = ml2; ip = start2; ref =ref2; @@ -417,15 +382,13 @@ _Search3: * ml2 > ml1, and * ip1+3 <= ip2 (usually < ip1+ml1) */ - if ((start2 - ip) < OPTIMAL_ML) - { + if ((start2 - ip) < OPTIMAL_ML) { int correction; int new_ml = ml; if (new_ml > OPTIMAL_ML) new_ml = OPTIMAL_ML; if (ip+new_ml > start2 + ml2 - MINMATCH) new_ml = (int)(start2 - ip) + ml2 - MINMATCH; correction = new_ml - (int)(start2 - ip); - if (correction > 0) - { + if (correction > 0) { start2 += correction; ref2 += correction; ml2 -= correction; @@ -437,8 +400,7 @@ _Search3: ml3 = LZ4HC_InsertAndGetWiderMatch(ctx, start2 + ml2 - 3, start2, matchlimit, ml2, &ref3, &start3, maxNbAttempts); else ml3 = ml2; - if (ml3 == ml2) /* No better match : 2 sequences to encode */ - { + if (ml3 == ml2) { /* No better match : 2 sequences to encode */ /* ip & ref are known; Now for ml */ if (start2 < ip+ml) ml = (int)(start2 - ip); /* Now, encode 2 sequences */ @@ -448,18 +410,14 @@ _Search3: continue; } - if (start3 < ip+ml+3) /* Not enough space for match 2 : remove it */ - { - if (start3 >= (ip+ml)) /* can write Seq1 immediately ==> Seq2 is removed, so Seq3 becomes Seq1 */ - { - if (start2 < ip+ml) - { + if (start3 < ip+ml+3) { /* Not enough space for match 2 : remove it */ + if (start3 >= (ip+ml)) { /* can write Seq1 immediately ==> Seq2 is removed, so Seq3 becomes Seq1 */ + if (start2 < ip+ml) { int correction = (int)(ip+ml - start2); start2 += correction; ref2 += correction; ml2 -= correction; - if (ml2 < MINMATCH) - { + if (ml2 < MINMATCH) { start2 = start3; ref2 = ref3; ml2 = ml3; @@ -487,23 +445,18 @@ _Search3: * OK, now we have 3 ascending matches; let's write at least the first one * ip & ref are known; Now for ml */ - if (start2 < ip+ml) - { - if ((start2 - ip) < (int)ML_MASK) - { + if (start2 < ip+ml) { + if ((start2 - ip) < (int)ML_MASK) { int correction; 
if (ml > OPTIMAL_ML) ml = OPTIMAL_ML; if (ip + ml > start2 + ml2 - MINMATCH) ml = (int)(start2 - ip) + ml2 - MINMATCH; correction = ml - (int)(start2 - ip); - if (correction > 0) - { + if (correction > 0) { start2 += correction; ref2 += correction; ml2 -= correction; } - } - else - { + } else { ml = (int)(start2 - ip); } } @@ -521,8 +474,7 @@ _Search3: } /* Encode Last Literals */ - { - int lastRun = (int)(iend - anchor); + { int lastRun = (int)(iend - anchor); if ((limit) && (((char*)op - dest) + lastRun + 1 + ((lastRun+255-RUN_MASK)/255) > (U32)maxOutputSize)) return 0; /* Check output limit */ if (lastRun>=(int)RUN_MASK) { *op++=(RUN_MASK< 254 ; lastRun-=255) *op++ = 255; *op++ = (BYTE) lastRun; } else *op++ = (BYTE)(lastRun< 9) { + switch (compressionLevel) { + case 10: return LZ4HC_compress_hashChain(ctx, source, dest, inputSize, maxOutputSize, 1 << (16-1), limit); + case 11: ctx->searchNum = LZ4HC_getSearchNum(compressionLevel); return LZ4HC_compress_optimal(ctx, source, dest, inputSize, maxOutputSize, limit, 128, 0); + default: + case 12: ctx->searchNum = LZ4HC_getSearchNum(compressionLevel); return LZ4HC_compress_optimal(ctx, source, dest, inputSize, maxOutputSize, limit, LZ4_OPT_NUM, 1); + } + } + return LZ4HC_compress_hashChain(ctx, source, dest, inputSize, maxOutputSize, 1 << (compressionLevel-1), limit); +} + + +int LZ4_sizeofStateHC(void) { return sizeof(LZ4_streamHC_t); } int LZ4_compress_HC_extStateHC (void* state, const char* src, char* dst, int srcSize, int maxDstSize, int compressionLevel) { + LZ4HC_CCtx_internal* ctx = &((LZ4_streamHC_t*)state)->internal_donotuse; if (((size_t)(state)&(sizeof(void*)-1)) != 0) return 0; /* Error : state is not aligned for pointers (32 or 64 bits) */ - LZ4HC_init ((LZ4HC_Data_Structure*)state, (const BYTE*)src); + LZ4HC_init (ctx, (const BYTE*)src); if (maxDstSize < LZ4_compressBound(srcSize)) - return LZ4HC_compress_generic (state, src, dst, srcSize, maxDstSize, compressionLevel, limitedOutput); + return 
LZ4HC_compress_generic (ctx, src, dst, srcSize, maxDstSize, compressionLevel, limitedOutput); else - return LZ4HC_compress_generic (state, src, dst, srcSize, maxDstSize, compressionLevel, noLimit); + return LZ4HC_compress_generic (ctx, src, dst, srcSize, maxDstSize, compressionLevel, noLimit); } int LZ4_compress_HC(const char* src, char* dst, int srcSize, int maxDstSize, int compressionLevel) { - LZ4HC_Data_Structure state; - return LZ4_compress_HC_extStateHC(&state, src, dst, srcSize, maxDstSize, compressionLevel); +#if defined(LZ4HC_HEAPMODE) && LZ4HC_HEAPMODE==1 + LZ4_streamHC_t* const statePtr = (LZ4_streamHC_t*)malloc(sizeof(LZ4_streamHC_t)); +#else + LZ4_streamHC_t state; + LZ4_streamHC_t* const statePtr = &state; +#endif + int const cSize = LZ4_compress_HC_extStateHC(statePtr, src, dst, srcSize, maxDstSize, compressionLevel); +#if defined(LZ4HC_HEAPMODE) && LZ4HC_HEAPMODE==1 + free(statePtr); +#endif + return cSize; } @@ -566,32 +559,38 @@ int LZ4_freeStreamHC (LZ4_streamHC_t* LZ4_streamHCPtr) { free(LZ4_st /* initialization */ void LZ4_resetStreamHC (LZ4_streamHC_t* LZ4_streamHCPtr, int compressionLevel) { - LZ4_STATIC_ASSERT(sizeof(LZ4HC_Data_Structure) <= sizeof(LZ4_streamHC_t)); /* if compilation fails here, LZ4_STREAMHCSIZE must be increased */ - ((LZ4HC_Data_Structure*)LZ4_streamHCPtr)->base = NULL; - ((LZ4HC_Data_Structure*)LZ4_streamHCPtr)->compressionLevel = (unsigned)compressionLevel; + LZ4_STATIC_ASSERT(sizeof(LZ4HC_CCtx_internal) <= sizeof(size_t) * LZ4_STREAMHCSIZE_SIZET); /* if compilation fails here, LZ4_STREAMHCSIZE must be increased */ + LZ4_streamHCPtr->internal_donotuse.base = NULL; + LZ4_streamHCPtr->internal_donotuse.compressionLevel = (unsigned)compressionLevel; + LZ4_streamHCPtr->internal_donotuse.searchNum = LZ4HC_getSearchNum(compressionLevel); } int LZ4_loadDictHC (LZ4_streamHC_t* LZ4_streamHCPtr, const char* dictionary, int dictSize) { - LZ4HC_Data_Structure* ctxPtr = (LZ4HC_Data_Structure*) LZ4_streamHCPtr; - if (dictSize > 64 KB) 
- { + LZ4HC_CCtx_internal* ctxPtr = &LZ4_streamHCPtr->internal_donotuse; + if (dictSize > 64 KB) { dictionary += dictSize - 64 KB; dictSize = 64 KB; } LZ4HC_init (ctxPtr, (const BYTE*)dictionary); - if (dictSize >= 4) LZ4HC_Insert (ctxPtr, (const BYTE*)dictionary +(dictSize-3)); ctxPtr->end = (const BYTE*)dictionary + dictSize; + if (ctxPtr->compressionLevel >= LZ4HC_CLEVEL_OPT_MIN) + LZ4HC_updateBinTree(ctxPtr, ctxPtr->end - MFLIMIT, ctxPtr->end - LASTLITERALS); + else + if (dictSize >= 4) LZ4HC_Insert (ctxPtr, ctxPtr->end-3); return dictSize; } /* compression */ -static void LZ4HC_setExternalDict(LZ4HC_Data_Structure* ctxPtr, const BYTE* newBlock) +static void LZ4HC_setExternalDict(LZ4HC_CCtx_internal* ctxPtr, const BYTE* newBlock) { - if (ctxPtr->end >= ctxPtr->base + 4) - LZ4HC_Insert (ctxPtr, ctxPtr->end-3); /* Referencing remaining dictionary content */ + if (ctxPtr->compressionLevel >= LZ4HC_CLEVEL_OPT_MIN) + LZ4HC_updateBinTree(ctxPtr, ctxPtr->end - MFLIMIT, ctxPtr->end - LASTLITERALS); + else + if (ctxPtr->end >= ctxPtr->base + 4) LZ4HC_Insert (ctxPtr, ctxPtr->end-3); /* Referencing remaining dictionary content */ + /* Only one memory segment for extDict, so any previous extDict is lost at this stage */ ctxPtr->lowLimit = ctxPtr->dictLimit; ctxPtr->dictLimit = (U32)(ctxPtr->end - ctxPtr->base); @@ -601,34 +600,29 @@ static void LZ4HC_setExternalDict(LZ4HC_Data_Structure* ctxPtr, const BYTE* newB ctxPtr->nextToUpdate = ctxPtr->dictLimit; /* match referencing will resume from there */ } -static int LZ4_compressHC_continue_generic (LZ4HC_Data_Structure* ctxPtr, +static int LZ4_compressHC_continue_generic (LZ4_streamHC_t* LZ4_streamHCPtr, const char* source, char* dest, int inputSize, int maxOutputSize, limitedOutput_directive limit) { + LZ4HC_CCtx_internal* ctxPtr = &LZ4_streamHCPtr->internal_donotuse; /* auto-init if forgotten */ - if (ctxPtr->base == NULL) - LZ4HC_init (ctxPtr, (const BYTE*) source); + if (ctxPtr->base == NULL) LZ4HC_init (ctxPtr, (const 
BYTE*) source); /* Check overflow */ - if ((size_t)(ctxPtr->end - ctxPtr->base) > 2 GB) - { + if ((size_t)(ctxPtr->end - ctxPtr->base) > 2 GB) { size_t dictSize = (size_t)(ctxPtr->end - ctxPtr->base) - ctxPtr->dictLimit; if (dictSize > 64 KB) dictSize = 64 KB; - - LZ4_loadDictHC((LZ4_streamHC_t*)ctxPtr, (const char*)(ctxPtr->end) - dictSize, (int)dictSize); + LZ4_loadDictHC(LZ4_streamHCPtr, (const char*)(ctxPtr->end) - dictSize, (int)dictSize); } /* Check if blocks follow each other */ - if ((const BYTE*)source != ctxPtr->end) - LZ4HC_setExternalDict(ctxPtr, (const BYTE*)source); + if ((const BYTE*)source != ctxPtr->end) LZ4HC_setExternalDict(ctxPtr, (const BYTE*)source); /* Check overlapping input/dictionary space */ - { - const BYTE* sourceEnd = (const BYTE*) source + inputSize; - const BYTE* dictBegin = ctxPtr->dictBase + ctxPtr->lowLimit; - const BYTE* dictEnd = ctxPtr->dictBase + ctxPtr->dictLimit; - if ((sourceEnd > dictBegin) && ((const BYTE*)source < dictEnd)) - { + { const BYTE* sourceEnd = (const BYTE*) source + inputSize; + const BYTE* const dictBegin = ctxPtr->dictBase + ctxPtr->lowLimit; + const BYTE* const dictEnd = ctxPtr->dictBase + ctxPtr->dictLimit; + if ((sourceEnd > dictBegin) && ((const BYTE*)source < dictEnd)) { if (sourceEnd > dictEnd) sourceEnd = dictEnd; ctxPtr->lowLimit = (U32)(sourceEnd - ctxPtr->dictBase); if (ctxPtr->dictLimit - ctxPtr->lowLimit < 4) ctxPtr->lowLimit = ctxPtr->dictLimit; @@ -641,9 +635,9 @@ static int LZ4_compressHC_continue_generic (LZ4HC_Data_Structure* ctxPtr, int LZ4_compress_HC_continue (LZ4_streamHC_t* LZ4_streamHCPtr, const char* source, char* dest, int inputSize, int maxOutputSize) { if (maxOutputSize < LZ4_compressBound(inputSize)) - return LZ4_compressHC_continue_generic ((LZ4HC_Data_Structure*)LZ4_streamHCPtr, source, dest, inputSize, maxOutputSize, limitedOutput); + return LZ4_compressHC_continue_generic (LZ4_streamHCPtr, source, dest, inputSize, maxOutputSize, limitedOutput); else - return 
LZ4_compressHC_continue_generic ((LZ4HC_Data_Structure*)LZ4_streamHCPtr, source, dest, inputSize, maxOutputSize, noLimit); + return LZ4_compressHC_continue_generic (LZ4_streamHCPtr, source, dest, inputSize, maxOutputSize, noLimit); } @@ -651,14 +645,13 @@ int LZ4_compress_HC_continue (LZ4_streamHC_t* LZ4_streamHCPtr, const char* sourc int LZ4_saveDictHC (LZ4_streamHC_t* LZ4_streamHCPtr, char* safeBuffer, int dictSize) { - LZ4HC_Data_Structure* streamPtr = (LZ4HC_Data_Structure*)LZ4_streamHCPtr; - int prefixSize = (int)(streamPtr->end - (streamPtr->base + streamPtr->dictLimit)); + LZ4HC_CCtx_internal* const streamPtr = &LZ4_streamHCPtr->internal_donotuse; + int const prefixSize = (int)(streamPtr->end - (streamPtr->base + streamPtr->dictLimit)); if (dictSize > 64 KB) dictSize = 64 KB; if (dictSize < 4) dictSize = 0; if (dictSize > prefixSize) dictSize = prefixSize; memmove(safeBuffer, streamPtr->end - dictSize, dictSize); - { - U32 endIndex = (U32)(streamPtr->end - streamPtr->base); + { U32 const endIndex = (U32)(streamPtr->end - streamPtr->base); streamPtr->end = (const BYTE*)safeBuffer + dictSize; streamPtr->base = streamPtr->end - endIndex; streamPtr->dictLimit = endIndex - dictSize; @@ -672,8 +665,8 @@ int LZ4_saveDictHC (LZ4_streamHC_t* LZ4_streamHCPtr, char* safeBuffer, int dictS /*********************************** * Deprecated Functions ***********************************/ +/* These functions currently generate deprecation warnings */ /* Deprecated compression functions */ -/* These functions are planned to start generate warnings by r131 approximately */ int LZ4_compressHC(const char* src, char* dst, int srcSize) { return LZ4_compress_HC (src, dst, srcSize, LZ4_compressBound(srcSize), 0); } int LZ4_compressHC_limitedOutput(const char* src, char* dst, int srcSize, int maxDstSize) { return LZ4_compress_HC(src, dst, srcSize, maxDstSize, 0); } int LZ4_compressHC2(const char* src, char* dst, int srcSize, int cLevel) { return LZ4_compress_HC (src, dst, srcSize, 
LZ4_compressBound(srcSize), cLevel); } @@ -687,45 +680,41 @@ int LZ4_compressHC_limitedOutput_continue (LZ4_streamHC_t* ctx, const char* src, /* Deprecated streaming functions */ -/* These functions currently generate deprecation warnings */ int LZ4_sizeofStreamStateHC(void) { return LZ4_STREAMHCSIZE; } int LZ4_resetStreamStateHC(void* state, char* inputBuffer) { + LZ4HC_CCtx_internal *ctx = &((LZ4_streamHC_t*)state)->internal_donotuse; if ((((size_t)state) & (sizeof(void*)-1)) != 0) return 1; /* Error : pointer is not aligned for pointer (32 or 64 bits) */ - LZ4HC_init((LZ4HC_Data_Structure*)state, (const BYTE*)inputBuffer); - ((LZ4HC_Data_Structure*)state)->inputBuffer = (BYTE*)inputBuffer; + LZ4HC_init(ctx, (const BYTE*)inputBuffer); + ctx->inputBuffer = (BYTE*)inputBuffer; return 0; } void* LZ4_createHC (char* inputBuffer) { - void* hc4 = ALLOCATOR(1, sizeof(LZ4HC_Data_Structure)); + LZ4_streamHC_t* hc4 = (LZ4_streamHC_t*)ALLOCATOR(1, sizeof(LZ4_streamHC_t)); if (hc4 == NULL) return NULL; /* not enough memory */ - LZ4HC_init ((LZ4HC_Data_Structure*)hc4, (const BYTE*)inputBuffer); - ((LZ4HC_Data_Structure*)hc4)->inputBuffer = (BYTE*)inputBuffer; + LZ4HC_init (&hc4->internal_donotuse, (const BYTE*)inputBuffer); + hc4->internal_donotuse.inputBuffer = (BYTE*)inputBuffer; return hc4; } -int LZ4_freeHC (void* LZ4HC_Data) -{ - FREEMEM(LZ4HC_Data); - return (0); -} +int LZ4_freeHC (void* LZ4HC_Data) { FREEMEM(LZ4HC_Data); return 0; } int LZ4_compressHC2_continue (void* LZ4HC_Data, const char* source, char* dest, int inputSize, int compressionLevel) { - return LZ4HC_compress_generic (LZ4HC_Data, source, dest, inputSize, 0, compressionLevel, noLimit); + return LZ4HC_compress_generic (&((LZ4_streamHC_t*)LZ4HC_Data)->internal_donotuse, source, dest, inputSize, 0, compressionLevel, noLimit); } int LZ4_compressHC2_limitedOutput_continue (void* LZ4HC_Data, const char* source, char* dest, int inputSize, int maxOutputSize, int compressionLevel) { - return LZ4HC_compress_generic 
(LZ4HC_Data, source, dest, inputSize, maxOutputSize, compressionLevel, limitedOutput); + return LZ4HC_compress_generic (&((LZ4_streamHC_t*)LZ4HC_Data)->internal_donotuse, source, dest, inputSize, maxOutputSize, compressionLevel, limitedOutput); } char* LZ4_slideInputBufferHC(void* LZ4HC_Data) { - LZ4HC_Data_Structure* hc4 = (LZ4HC_Data_Structure*)LZ4HC_Data; - int dictSize = LZ4_saveDictHC((LZ4_streamHC_t*)LZ4HC_Data, (char*)(hc4->inputBuffer), 64 KB); + LZ4HC_CCtx_internal* const hc4 = &((LZ4_streamHC_t*)LZ4HC_Data)->internal_donotuse; + int const dictSize = LZ4_saveDictHC((LZ4_streamHC_t*)LZ4HC_Data, (char*)(hc4->inputBuffer), 64 KB); return (char*)(hc4->inputBuffer + dictSize); } diff --git a/contrib/libpoco/Crypto/include/Poco/Crypto/Cipher.h b/contrib/libpoco/Crypto/include/Poco/Crypto/Cipher.h index 30d17f3cd0d..92ba5da9f4e 100644 --- a/contrib/libpoco/Crypto/include/Poco/Crypto/Cipher.h +++ b/contrib/libpoco/Crypto/include/Poco/Crypto/Cipher.h @@ -96,7 +96,7 @@ public: ENC_BASE64 = 0x01, /// Base64-encoded output ENC_BINHEX = 0x02, /// BinHex-encoded output ENC_BASE64_NO_LF = 0x81, /// Base64-encoded output, no linefeeds - ENC_BINHEX_NO_LF = 0x82, /// BinHex-encoded output, no linefeeds + ENC_BINHEX_NO_LF = 0x82 /// BinHex-encoded output, no linefeeds }; diff --git a/contrib/libpoco/Crypto/include/Poco/Crypto/Crypto.h b/contrib/libpoco/Crypto/include/Poco/Crypto/Crypto.h index 1ed50941da1..fcfb20ec26f 100644 --- a/contrib/libpoco/Crypto/include/Poco/Crypto/Crypto.h +++ b/contrib/libpoco/Crypto/include/Poco/Crypto/Crypto.h @@ -22,7 +22,6 @@ #define Crypto_Crypto_INCLUDED -#pragma GCC diagnostic push #if defined(__APPLE__) // OS X 10.7 deprecates some OpenSSL functions #pragma GCC diagnostic ignored "-Wdeprecated-declarations" @@ -116,6 +115,5 @@ void Crypto_API uninitializeCrypto(); } } // namespace Poco::Crypto -#pragma GCC diagnostic pop #endif // Crypto_Crypto_INCLUDED diff --git a/contrib/libpoco/Crypto/include/Poco/Crypto/DigestEngine.h 
b/contrib/libpoco/Crypto/include/Poco/Crypto/DigestEngine.h index 5de75392a83..e2121c414df 100644 --- a/contrib/libpoco/Crypto/include/Poco/Crypto/DigestEngine.h +++ b/contrib/libpoco/Crypto/include/Poco/Crypto/DigestEngine.h @@ -61,7 +61,7 @@ protected: private: std::string _name; - EVP_MD_CTX* _ctx; + EVP_MD_CTX* _pContext; Poco::DigestEngine::Digest _digest; OpenSSLInitializer _openSSLInitializer; }; diff --git a/contrib/libpoco/Crypto/include/Poco/Crypto/X509Certificate.h b/contrib/libpoco/Crypto/include/Poco/Crypto/X509Certificate.h index 472c537637e..a6d86901248 100644 --- a/contrib/libpoco/Crypto/include/Poco/Crypto/X509Certificate.h +++ b/contrib/libpoco/Crypto/include/Poco/Crypto/X509Certificate.h @@ -130,6 +130,14 @@ public: /// Returns true if verification against the issuer certificate /// was successfull, false otherwise. + bool equals(const X509Certificate& otherCertificate) const; + /// Checks whether the certificate is equal to + /// the other certificate, by comparing the hashes + /// of both certificates. + /// + /// Returns true if both certificates are identical, + /// otherwise false. + const X509* certificate() const; /// Returns the underlying OpenSSL certificate. 
diff --git a/contrib/libpoco/Crypto/src/CipherImpl.cpp b/contrib/libpoco/Crypto/src/CipherImpl.cpp index c953aae52e2..b8708a78c81 100644 --- a/contrib/libpoco/Crypto/src/CipherImpl.cpp +++ b/contrib/libpoco/Crypto/src/CipherImpl.cpp @@ -30,7 +30,7 @@ namespace { unsigned long err; std::string msg; - + while ((err = ERR_get_error())) { if (!msg.empty()) @@ -60,24 +60,28 @@ namespace Direction dir); ~CryptoTransformImpl(); - + std::size_t blockSize() const; - int setPadding(int padding); + int setPadding(int padding); std::streamsize transform( const unsigned char* input, std::streamsize inputLength, unsigned char* output, std::streamsize outputLength); - + std::streamsize finalize( unsigned char* output, std::streamsize length); private: const EVP_CIPHER* _pCipher; - EVP_CIPHER_CTX _ctx; +#if OPENSSL_VERSION_NUMBER >= 0x10100000L + EVP_CIPHER_CTX* _pContext; +#else + EVP_CIPHER_CTX _context; +#endif ByteVec _key; ByteVec _iv; }; @@ -92,32 +96,54 @@ namespace _key(key), _iv(iv) { +#if OPENSSL_VERSION_NUMBER >= 0x10100000L + _pContext = EVP_CIPHER_CTX_new(); EVP_CipherInit( - &_ctx, + _pContext, _pCipher, &_key[0], _iv.empty() ? 0 : &_iv[0], (dir == DIR_ENCRYPT) ? 1 : 0); +#else + EVP_CipherInit( + &_context, + _pCipher, + &_key[0], + _iv.empty() ? 0 : &_iv[0], + (dir == DIR_ENCRYPT) ? 
1 : 0); +#endif } CryptoTransformImpl::~CryptoTransformImpl() { - EVP_CIPHER_CTX_cleanup(&_ctx); +#if OPENSSL_VERSION_NUMBER >= 0x10100000L + EVP_CIPHER_CTX_cleanup(_pContext); +#else + EVP_CIPHER_CTX_cleanup(&_context); +#endif } std::size_t CryptoTransformImpl::blockSize() const { - return EVP_CIPHER_CTX_block_size(&_ctx); +#if OPENSSL_VERSION_NUMBER >= 0x10100000L + return EVP_CIPHER_CTX_block_size(_pContext); +#else + return EVP_CIPHER_CTX_block_size(&_context); +#endif } - + int CryptoTransformImpl::setPadding(int padding) { - return EVP_CIPHER_CTX_set_padding(&_ctx, padding); +#if OPENSSL_VERSION_NUMBER >= 0x10100000L + return EVP_CIPHER_CTX_block_size(_pContext); +#else + return EVP_CIPHER_CTX_set_padding(&_context, padding); +#endif } - + std::streamsize CryptoTransformImpl::transform( const unsigned char* input, @@ -125,16 +151,24 @@ namespace unsigned char* output, std::streamsize outputLength) { - poco_assert (outputLength >= std::streamsize(inputLength + blockSize() - 1)); + poco_assert (outputLength >= (inputLength + blockSize() - 1)); int outLen = static_cast(outputLength); +#if OPENSSL_VERSION_NUMBER >= 0x10100000L int rc = EVP_CipherUpdate( - &_ctx, + _pContext, output, &outLen, input, static_cast(inputLength)); - +#else + int rc = EVP_CipherUpdate( + &_context, + output, + &outLen, + input, + static_cast(inputLength)); +#endif if (rc == 0) throwError(); @@ -146,18 +180,22 @@ namespace unsigned char* output, std::streamsize length) { - poco_assert (length >= (std::streamsize)blockSize()); - + poco_assert (length >= blockSize()); + int len = static_cast(length); // Use the '_ex' version that does not perform implicit cleanup since we // will call EVP_CIPHER_CTX_cleanup() from the dtor as there is no // guarantee that finalize() will be called if an error occurred. 
- int rc = EVP_CipherFinal_ex(&_ctx, output, &len); +#if OPENSSL_VERSION_NUMBER >= 0x10100000L + int rc = EVP_CipherFinal_ex(_pContext, output, &len); +#else + int rc = EVP_CipherFinal_ex(&_context, output, &len); +#endif if (rc == 0) throwError(); - + return static_cast(len); } } diff --git a/contrib/libpoco/Crypto/src/CipherKeyImpl.cpp b/contrib/libpoco/Crypto/src/CipherKeyImpl.cpp index 58b51c9424d..bcd7452c696 100644 --- a/contrib/libpoco/Crypto/src/CipherKeyImpl.cpp +++ b/contrib/libpoco/Crypto/src/CipherKeyImpl.cpp @@ -27,8 +27,8 @@ namespace Poco { namespace Crypto { -CipherKeyImpl::CipherKeyImpl(const std::string& name, - const std::string& passphrase, +CipherKeyImpl::CipherKeyImpl(const std::string& name, + const std::string& passphrase, const std::string& salt, int iterationCount): _pCipher(0), @@ -48,8 +48,8 @@ CipherKeyImpl::CipherKeyImpl(const std::string& name, } -CipherKeyImpl::CipherKeyImpl(const std::string& name, - const ByteVec& key, +CipherKeyImpl::CipherKeyImpl(const std::string& name, + const ByteVec& key, const ByteVec& iv): _pCipher(0), _name(name), @@ -64,7 +64,7 @@ CipherKeyImpl::CipherKeyImpl(const std::string& name, throw Poco::NotFoundException("Cipher " + name + " was not found"); } - + CipherKeyImpl::CipherKeyImpl(const std::string& name): _pCipher(0), _name(name), @@ -117,7 +117,7 @@ void CipherKeyImpl::generateKey() getRandomBytes(vec, keySize()); setKey(vec); - + getRandomBytes(vec, ivSize()); setIV(vec); } @@ -126,11 +126,11 @@ void CipherKeyImpl::generateKey() void CipherKeyImpl::getRandomBytes(ByteVec& vec, std::size_t count) { Poco::RandomInputStream random; - + vec.clear(); vec.reserve(count); - for (std::size_t i = 0; i < count; ++i) + for (int i = 0; i < count; ++i) vec.push_back(static_cast(random.get())); } diff --git a/contrib/libpoco/Crypto/src/CryptoStream.cpp b/contrib/libpoco/Crypto/src/CryptoStream.cpp index 34ce13b4c39..97e73ce810f 100644 --- a/contrib/libpoco/Crypto/src/CryptoStream.cpp +++ 
b/contrib/libpoco/Crypto/src/CryptoStream.cpp @@ -43,7 +43,7 @@ CryptoStreamBuf::CryptoStreamBuf(std::istream& istr, CryptoTransform* pTransform _buffer(static_cast(bufferSize)) { poco_check_ptr (pTransform); - poco_assert ((size_t)bufferSize > 2 * pTransform->blockSize()); + poco_assert (bufferSize > 2 * pTransform->blockSize()); } @@ -56,7 +56,7 @@ CryptoStreamBuf::CryptoStreamBuf(std::ostream& ostr, CryptoTransform* pTransform _buffer(static_cast(bufferSize)) { poco_check_ptr (pTransform); - poco_assert ((size_t)bufferSize > 2 * pTransform->blockSize()); + poco_assert (bufferSize > 2 * pTransform->blockSize()); } @@ -88,10 +88,10 @@ void CryptoStreamBuf::close() // thrown. std::ostream* pOstr = _pOstr; _pOstr = 0; - + // Finalize transformation. std::streamsize n = _pTransform->finalize(_buffer.begin(), static_cast(_buffer.size())); - + if (n > 0) { pOstr->write(reinterpret_cast(_buffer.begin()), n); @@ -159,7 +159,7 @@ int CryptoStreamBuf::writeToDevice(const char* buffer, std::streamsize length) std::size_t maxChunkSize = _buffer.size()/2; std::size_t count = 0; - while (count < (size_t)length) + while (count < length) { // Truncate chunk size so that the maximum output fits into _buffer. 
std::size_t n = static_cast(length) - count; diff --git a/contrib/libpoco/Crypto/src/DigestEngine.cpp b/contrib/libpoco/Crypto/src/DigestEngine.cpp index 6e574ab42e1..64042589f17 100644 --- a/contrib/libpoco/Crypto/src/DigestEngine.cpp +++ b/contrib/libpoco/Crypto/src/DigestEngine.cpp @@ -23,46 +23,51 @@ namespace Crypto { DigestEngine::DigestEngine(const std::string& name): - _name(name) + _name(name), + _pContext(EVP_MD_CTX_create()) { const EVP_MD* md = EVP_get_digestbyname(_name.c_str()); if (!md) throw Poco::NotFoundException(_name); - _ctx = EVP_MD_CTX_create(); - EVP_DigestInit_ex(_ctx, md, NULL); + EVP_DigestInit_ex(_pContext, md, NULL); } DigestEngine::~DigestEngine() { - EVP_MD_CTX_destroy(_ctx); + EVP_MD_CTX_destroy(_pContext); } int DigestEngine::nid() const { - return EVP_MD_nid(_ctx->digest); + return EVP_MD_nid(EVP_MD_CTX_md(_pContext)); } std::size_t DigestEngine::digestLength() const { - return EVP_MD_CTX_size(_ctx); + return EVP_MD_CTX_size(_pContext); } void DigestEngine::reset() { - EVP_MD_CTX_cleanup(_ctx); +#if OPENSSL_VERSION_NUMBER >= 0x10100000L + EVP_MD_CTX_free(_pContext); + _pContext = EVP_MD_CTX_create(); +#else + EVP_MD_CTX_cleanup(_pContext); +#endif const EVP_MD* md = EVP_get_digestbyname(_name.c_str()); if (!md) throw Poco::NotFoundException(_name); - EVP_DigestInit_ex(_ctx, md, NULL); + EVP_DigestInit_ex(_pContext, md, NULL); } const Poco::DigestEngine::Digest& DigestEngine::digest() { _digest.clear(); - unsigned len = EVP_MD_CTX_size(_ctx); + unsigned len = EVP_MD_CTX_size(_pContext); _digest.resize(len); - EVP_DigestFinal_ex(_ctx, &_digest[0], &len); + EVP_DigestFinal_ex(_pContext, &_digest[0], &len); reset(); return _digest; } @@ -70,7 +75,7 @@ const Poco::DigestEngine::Digest& DigestEngine::digest() void DigestEngine::updateImpl(const void* data, std::size_t length) { - EVP_DigestUpdate(_ctx, data, length); + EVP_DigestUpdate(_pContext, data, length); } diff --git a/contrib/libpoco/Crypto/src/RSACipherImpl.cpp 
b/contrib/libpoco/Crypto/src/RSACipherImpl.cpp index 01b23851282..91c5b815d61 100644 --- a/contrib/libpoco/Crypto/src/RSACipherImpl.cpp +++ b/contrib/libpoco/Crypto/src/RSACipherImpl.cpp @@ -32,7 +32,7 @@ namespace { unsigned long err; std::string msg; - + while ((err = ERR_get_error())) { if (!msg.empty()) @@ -68,7 +68,7 @@ namespace public: RSAEncryptImpl(const RSA* pRSA, RSAPaddingMode paddingMode); ~RSAEncryptImpl(); - + std::size_t blockSize() const; std::size_t maxDataSize() const; @@ -77,7 +77,7 @@ namespace std::streamsize inputLength, unsigned char* output, std::streamsize outputLength); - + std::streamsize finalize(unsigned char* output, std::streamsize length); private: @@ -156,7 +156,7 @@ namespace output += n; outputLength -= n; _pos = 0; - + } else { @@ -175,8 +175,8 @@ namespace std::streamsize RSAEncryptImpl::finalize(unsigned char* output, std::streamsize length) { - poco_assert ((size_t)length >= blockSize()); - poco_assert ((size_t)_pos <= maxDataSize()); + poco_assert (length >= blockSize()); + poco_assert (_pos <= maxDataSize()); int rc = 0; if (_pos > 0) { @@ -192,7 +192,7 @@ namespace public: RSADecryptImpl(const RSA* pRSA, RSAPaddingMode paddingMode); ~RSADecryptImpl(); - + std::size_t blockSize() const; std::streamsize transform( @@ -200,7 +200,7 @@ namespace std::streamsize inputLength, unsigned char* output, std::streamsize outputLength); - + std::streamsize finalize( unsigned char* output, std::streamsize length); @@ -241,7 +241,7 @@ namespace unsigned char* output, std::streamsize outputLength) { - + // always fill up the buffer before decrypting! 
std::streamsize rsaSize = static_cast(blockSize()); poco_assert_dbg(_pos <= rsaSize); @@ -261,7 +261,7 @@ namespace output += tmp; outputLength -= tmp; _pos = 0; - + } else { @@ -280,7 +280,7 @@ namespace std::streamsize RSADecryptImpl::finalize(unsigned char* output, std::streamsize length) { - poco_assert ((size_t)length >= blockSize()); + poco_assert (length >= blockSize()); int rc = 0; if (_pos > 0) { diff --git a/contrib/libpoco/Crypto/src/RSAKeyImpl.cpp b/contrib/libpoco/Crypto/src/RSAKeyImpl.cpp index 8333453cee0..3a1580f6912 100644 --- a/contrib/libpoco/Crypto/src/RSAKeyImpl.cpp +++ b/contrib/libpoco/Crypto/src/RSAKeyImpl.cpp @@ -207,19 +207,43 @@ int RSAKeyImpl::size() const RSAKeyImpl::ByteVec RSAKeyImpl::modulus() const { +#if OPENSSL_VERSION_NUMBER >= 0x10100000L + const BIGNUM* n = 0; + const BIGNUM* e = 0; + const BIGNUM* d = 0; + RSA_get0_key(_pRSA, &n, &e, &d); + return convertToByteVec(n); +#else return convertToByteVec(_pRSA->n); +#endif } RSAKeyImpl::ByteVec RSAKeyImpl::encryptionExponent() const { +#if OPENSSL_VERSION_NUMBER >= 0x10100000L + const BIGNUM* n = 0; + const BIGNUM* e = 0; + const BIGNUM* d = 0; + RSA_get0_key(_pRSA, &n, &e, &d); + return convertToByteVec(e); +#else return convertToByteVec(_pRSA->e); +#endif } RSAKeyImpl::ByteVec RSAKeyImpl::decryptionExponent() const { +#if OPENSSL_VERSION_NUMBER >= 0x10100000L + const BIGNUM* n = 0; + const BIGNUM* e = 0; + const BIGNUM* d = 0; + RSA_get0_key(_pRSA, &n, &e, &d); + return convertToByteVec(d); +#else return convertToByteVec(_pRSA->d); +#endif } diff --git a/contrib/libpoco/Crypto/src/X509Certificate.cpp b/contrib/libpoco/Crypto/src/X509Certificate.cpp index dd9ebd2cab3..f7f37965ed8 100644 --- a/contrib/libpoco/Crypto/src/X509Certificate.cpp +++ b/contrib/libpoco/Crypto/src/X509Certificate.cpp @@ -59,7 +59,11 @@ X509Certificate::X509Certificate(X509* pCert, bool shared): if (shared) { +#if OPENSSL_VERSION_NUMBER >= 0x10100000L + X509_up_ref(_pCert); +#else _pCert->references++; 
+#endif } init(); @@ -205,10 +209,10 @@ std::string X509Certificate::issuerName(NID nid) const if (X509_NAME* issuer = X509_get_issuer_name(_pCert)) { char buffer[NAME_BUFFER_SIZE]; - X509_NAME_get_text_by_NID(issuer, nid, buffer, sizeof(buffer)); - return std::string(buffer); + if (X509_NAME_get_text_by_NID(issuer, nid, buffer, sizeof(buffer)) >= 0) + return std::string(buffer); } - else return std::string(); + return std::string(); } @@ -217,10 +221,10 @@ std::string X509Certificate::subjectName(NID nid) const if (X509_NAME* subj = X509_get_subject_name(_pCert)) { char buffer[NAME_BUFFER_SIZE]; - X509_NAME_get_text_by_NID(subj, nid, buffer, sizeof(buffer)); - return std::string(buffer); + if (X509_NAME_get_text_by_NID(subj, nid, buffer, sizeof(buffer)) >= 0) + return std::string(buffer); } - else return std::string(); + return std::string(); } @@ -280,4 +284,12 @@ bool X509Certificate::issuedBy(const X509Certificate& issuerCertificate) const } +bool X509Certificate::equals(const X509Certificate& otherCertificate) const +{ + X509* pCert = const_cast(_pCert); + X509* pOtherCert = const_cast(otherCertificate.certificate()); + return X509_cmp(pCert, pOtherCert) == 0; +} + + } } // namespace Poco::Crypto diff --git a/contrib/libpoco/Crypto/testsuite/src/CryptoTest.cpp b/contrib/libpoco/Crypto/testsuite/src/CryptoTest.cpp index 8c403470950..53764df137c 100644 --- a/contrib/libpoco/Crypto/testsuite/src/CryptoTest.cpp +++ b/contrib/libpoco/Crypto/testsuite/src/CryptoTest.cpp @@ -246,6 +246,11 @@ void CryptoTest::testCertificate() // fails with recent OpenSSL versions: // assert (cert.issuedBy(cert)); + + std::istringstream otherCertStream(APPINF_PEM); + X509Certificate otherCert(otherCertStream); + + assert (cert.equals(otherCert)); } diff --git a/dbms/CMakeLists.txt b/dbms/CMakeLists.txt index 1ba41dca53c..5f93dc281b0 100644 --- a/dbms/CMakeLists.txt +++ b/dbms/CMakeLists.txt @@ -21,6 +21,7 @@ include_directories (BEFORE ${ClickHouse_SOURCE_DIR}/contrib/libdivide) 
include_directories (BEFORE ${ClickHouse_SOURCE_DIR}/contrib/libcpuid/include) include_directories (BEFORE ${ClickHouse_SOURCE_DIR}/contrib/libfarmhash) include_directories (BEFORE ${ClickHouse_SOURCE_DIR}/contrib/libmetrohash/src) +include_directories (BEFORE ${ClickHouse_SOURCE_DIR}/contrib/libbtrie/include) include_directories (${ClickHouse_SOURCE_DIR}/libs/libdaemon/include) include_directories (${ClickHouse_BINARY_DIR}/dbms/src) @@ -44,7 +45,6 @@ include(${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake) add_headers_and_sources(dbms src/TableFunctions) add_headers_and_sources(dbms src/Parsers) add_headers_and_sources(dbms src/Analyzers) -add_headers_and_sources(dbms src/AggregateFunctions) add_headers_and_sources(dbms src/Core) add_headers_and_sources(dbms src/DataStreams) add_headers_and_sources(dbms src/DataTypes) @@ -70,6 +70,33 @@ list (APPEND dbms_headers ${CONFIG_VERSION} ${CONFIG_COMMON}) list (APPEND dbms_sources src/Functions/IFunction.cpp src/Functions/FunctionFactory.cpp src/Functions/DataTypeTraits.cpp) list (APPEND dbms_headers src/Functions/IFunction.h src/Functions/FunctionFactory.h src/Functions/DataTypeTraits.h) +list (APPEND dbms_sources + src/AggregateFunctions/AggregateFunctionFactory.cpp + src/AggregateFunctions/AggregateFunctionState.cpp + src/AggregateFunctions/AggregateFunctionFactory.cpp + src/AggregateFunctions/AggregateFunctionState.cpp + src/AggregateFunctions/AggregateFunctionArray.cpp + src/AggregateFunctions/AggregateFunctionNull.cpp + src/AggregateFunctions/AggregateFunctionForEach.cpp + src/AggregateFunctions/AggregateFunctionIf.cpp + src/AggregateFunctions/AggregateFunctionMerge.cpp + src/AggregateFunctions/AggregateFunctionCount.cpp +) + +list (APPEND dbms_headers + src/AggregateFunctions/IAggregateFunction.h + src/AggregateFunctions/AggregateFunctionFactory.h + src/AggregateFunctions/AggregateFunctionState.h + src/AggregateFunctions/AggregateFunctionFactory.h + src/AggregateFunctions/AggregateFunctionState.h + 
src/AggregateFunctions/AggregateFunctionArray.h + src/AggregateFunctions/AggregateFunctionNull.h + src/AggregateFunctions/AggregateFunctionForEach.h + src/AggregateFunctions/AggregateFunctionIf.h + src/AggregateFunctions/AggregateFunctionMerge.h + src/AggregateFunctions/AggregateFunctionCount.h +) + list(REMOVE_ITEM dbms_sources src/Client/Client.cpp @@ -127,6 +154,7 @@ if (NOT CMAKE_BUILD_TYPE STREQUAL "Debug") src/Dictionaries/FlatDictionary.cpp src/Dictionaries/HashedDictionary.cpp src/Dictionaries/CacheDictionary.cpp + src/Dictionaries/TrieDictionary.cpp src/Dictionaries/RangeHashedDictionary.cpp src/Dictionaries/ComplexKeyHashedDictionary.cpp src/Dictionaries/ComplexKeyCacheDictionary.cpp @@ -159,6 +187,7 @@ target_link_libraries (dbms ${OPENSSL_CRYPTO_LIBRARY} ${Boost_SYSTEM_LIBRARY} ${Poco_Data_LIBRARY} + btrie ) if (Poco_DataODBC_FOUND) diff --git a/dbms/cmake/version.cmake b/dbms/cmake/version.cmake index 5e625d6e5de..248c11bf89e 100644 --- a/dbms/cmake/version.cmake +++ b/dbms/cmake/version.cmake @@ -1,6 +1,6 @@ #This strings autochanged from release_lib.sh : -set(VERSION_DESCRIBE v1.1.54227-testing) -set(VERSION_REVISION 54227) +set(VERSION_DESCRIBE v1.1.54234-testing) +set(VERSION_REVISION 54234) #===end of autochange set (VERSION_MAJOR 1) diff --git a/dbms/src/AggregateFunctions/AggregateFunctionFactory.cpp b/dbms/src/AggregateFunctions/AggregateFunctionFactory.cpp index 6a1e922a8a4..5cd9d88f85b 100644 --- a/dbms/src/AggregateFunctions/AggregateFunctionFactory.cpp +++ b/dbms/src/AggregateFunctions/AggregateFunctionFactory.cpp @@ -30,24 +30,6 @@ std::string trimRight(const std::string & in, const char * suffix) } -void registerAggregateFunctionAvg(AggregateFunctionFactory & factory); -void registerAggregateFunctionCount(AggregateFunctionFactory & factory); -void registerAggregateFunctionGroupArray(AggregateFunctionFactory & factory); -void registerAggregateFunctionGroupUniqArray(AggregateFunctionFactory & factory); -void 
registerAggregateFunctionsQuantile(AggregateFunctionFactory & factory); -void registerAggregateFunctionsQuantileExact(AggregateFunctionFactory & factory); -void registerAggregateFunctionsQuantileExactWeighted(AggregateFunctionFactory & factory); -void registerAggregateFunctionsQuantileDeterministic(AggregateFunctionFactory & factory); -void registerAggregateFunctionsQuantileTiming(AggregateFunctionFactory & factory); -void registerAggregateFunctionsQuantileTDigest(AggregateFunctionFactory & factory); -void registerAggregateFunctionsSequenceMatch(AggregateFunctionFactory & factory); -void registerAggregateFunctionsMinMaxAny(AggregateFunctionFactory & factory); -void registerAggregateFunctionsStatistics(AggregateFunctionFactory & factory); -void registerAggregateFunctionSum(AggregateFunctionFactory & factory); -void registerAggregateFunctionsUniq(AggregateFunctionFactory & factory); -void registerAggregateFunctionUniqUpTo(AggregateFunctionFactory & factory); -void registerAggregateFunctionDebug(AggregateFunctionFactory & factory); - AggregateFunctionPtr createAggregateFunctionArray(AggregateFunctionPtr & nested); AggregateFunctionPtr createAggregateFunctionForEach(AggregateFunctionPtr & nested); AggregateFunctionPtr createAggregateFunctionIf(AggregateFunctionPtr & nested); @@ -60,23 +42,6 @@ AggregateFunctionPtr createAggregateFunctionCountNotNull(const DataTypes & argum AggregateFunctionFactory::AggregateFunctionFactory() { - registerAggregateFunctionAvg(*this); - registerAggregateFunctionCount(*this); - registerAggregateFunctionGroupArray(*this); - registerAggregateFunctionGroupUniqArray(*this); - registerAggregateFunctionsQuantile(*this); - registerAggregateFunctionsQuantileExact(*this); - registerAggregateFunctionsQuantileExactWeighted(*this); - registerAggregateFunctionsQuantileDeterministic(*this); - registerAggregateFunctionsQuantileTiming(*this); - registerAggregateFunctionsQuantileTDigest(*this); - registerAggregateFunctionsSequenceMatch(*this); - 
registerAggregateFunctionsMinMaxAny(*this); - registerAggregateFunctionsStatistics(*this); - registerAggregateFunctionSum(*this); - registerAggregateFunctionsUniq(*this); - registerAggregateFunctionUniqUpTo(*this); - registerAggregateFunctionDebug(*this); } diff --git a/dbms/src/AggregateFunctions/AggregateFunctionFactory.h b/dbms/src/AggregateFunctions/AggregateFunctionFactory.h index ade5f84065e..6ab27a22b3b 100644 --- a/dbms/src/AggregateFunctions/AggregateFunctionFactory.h +++ b/dbms/src/AggregateFunctions/AggregateFunctionFactory.h @@ -2,6 +2,7 @@ #include #include +#include namespace DB @@ -14,7 +15,7 @@ using DataTypes = std::vector; /** Creates an aggregate function by name. */ -class AggregateFunctionFactory final +class AggregateFunctionFactory final : public Singleton { friend class StorageSystemFunctions; diff --git a/dbms/src/AggregateFunctions/AggregateFunctionGroupArray.cpp b/dbms/src/AggregateFunctions/AggregateFunctionGroupArray.cpp index 5f52e80eb4a..6989e4ef93a 100644 --- a/dbms/src/AggregateFunctions/AggregateFunctionGroupArray.cpp +++ b/dbms/src/AggregateFunctions/AggregateFunctionGroupArray.cpp @@ -11,7 +11,7 @@ namespace AggregateFunctionPtr createAggregateFunctionGroupArray(const std::string & name, const DataTypes & argument_types) { if (argument_types.size() != 1) - throw Exception("Incorrect number of arguments for aggregate function " + name, + throw Exception("Incorrect number of arguments for aggregate function " + name + ", should be 2", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); AggregateFunctionPtr res(createWithNumericType(*argument_types[0])); diff --git a/dbms/src/AggregateFunctions/AggregateFunctionGroupArray.h b/dbms/src/AggregateFunctions/AggregateFunctionGroupArray.h index eb9f035f5fe..3bc72c1f974 100644 --- a/dbms/src/AggregateFunctions/AggregateFunctionGroupArray.h +++ b/dbms/src/AggregateFunctions/AggregateFunctionGroupArray.h @@ -100,7 +100,7 @@ public: -/// General case (ineffective). 
NOTE You can also implement a special case for strings. +/// General case (inefficient). NOTE You can also implement a special case for strings. struct AggregateFunctionGroupArrayDataGeneric { Array value; /// TODO Add MemoryTracker @@ -109,7 +109,7 @@ struct AggregateFunctionGroupArrayDataGeneric /// Puts all values to an array, general case. Implemented inefficiently. class AggregateFunctionGroupArrayGeneric final -: public IUnaryAggregateFunction + : public IUnaryAggregateFunction { private: DataTypePtr type; diff --git a/dbms/src/AggregateFunctions/AggregateFunctionGroupArrayInsertAt.cpp b/dbms/src/AggregateFunctions/AggregateFunctionGroupArrayInsertAt.cpp new file mode 100644 index 00000000000..09af12fa132 --- /dev/null +++ b/dbms/src/AggregateFunctions/AggregateFunctionGroupArrayInsertAt.cpp @@ -0,0 +1,27 @@ +#include +#include +#include + +namespace DB +{ + +namespace +{ + +AggregateFunctionPtr createAggregateFunctionGroupArrayInsertAt(const std::string & name, const DataTypes & argument_types) +{ + if (argument_types.size() != 2) + throw Exception("Incorrect number of arguments for aggregate function " + name + ", should be 2", + ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); + + return std::make_shared(); +} + +} + +void registerAggregateFunctionGroupArrayInsertAt(AggregateFunctionFactory & factory) +{ + factory.registerFunction("groupArrayInsertAt", createAggregateFunctionGroupArrayInsertAt); +} + +} diff --git a/dbms/src/AggregateFunctions/AggregateFunctionGroupArrayInsertAt.h b/dbms/src/AggregateFunctions/AggregateFunctionGroupArrayInsertAt.h new file mode 100644 index 00000000000..6a48e621444 --- /dev/null +++ b/dbms/src/AggregateFunctions/AggregateFunctionGroupArrayInsertAt.h @@ -0,0 +1,210 @@ +#pragma once + +#include +#include + +#include +#include + +#include +#include + +#include +#include + +#include + +#define AGGREGATE_FUNCTION_GROUP_ARRAY_INSERT_AT_MAX_SIZE 0xFFFFFF + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int 
TOO_LARGE_ARRAY_SIZE; + extern const int CANNOT_CONVERT_TYPE; + extern const int ILLEGAL_TYPE_OF_ARGUMENT; +} + + +/** Aggregate function, that takes two arguments: value and position, + * and as a result, builds an array with values are located at corresponding positions. + * + * If more than one value was inserted to single position, the any value (first in case of single thread) is stored. + * If no values was inserted to some position, then default value will be substituted. + * + * Aggregate function also accept optional parameters: + * - default value to substitute; + * - length to resize result arrays (if you want to have results of same length for all aggregation keys); + * + * If you want to pass length, default value should be also given. + */ + + +/// Generic case (inefficient). +struct AggregateFunctionGroupArrayInsertAtDataGeneric +{ + Array value; /// TODO Add MemoryTracker +}; + + +class AggregateFunctionGroupArrayInsertAtGeneric final + : public IBinaryAggregateFunction +{ +private: + DataTypePtr type; + Field default_value; + size_t length_to_resize = 0; /// zero means - do not do resizing. + +public: + String getName() const override { return "groupArrayInsertAt"; } + + DataTypePtr getReturnType() const override + { + return std::make_shared(type); + } + + void setArgumentsImpl(const DataTypes & arguments) + { + if (!arguments.at(1)->behavesAsNumber()) /// TODO filter out floating point types. 
+ throw Exception("Second argument of aggregate function " + getName() + " must be integer.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + + type = arguments.front(); + + if (default_value.isNull()) + default_value = type->getDefault(); + else + { + Field converted = convertFieldToType(default_value, *type); + if (converted.isNull()) + throw Exception("Cannot convert parameter of aggregate function " + getName() + " (" + applyVisitor(FieldVisitorToString(), default_value) + ")" + " to type " + type->getName() + " to be used as default value in array", ErrorCodes::CANNOT_CONVERT_TYPE); + + default_value = converted; + } + } + + void setParameters(const Array & params) override + { + if (params.empty()) + return; + + if (params.size() > 2) + throw Exception("Aggregate function " + getName() + " requires at most two parameters.", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); + + default_value = params[0]; + + if (params.size() == 2) + { + length_to_resize = applyVisitor(FieldVisitorConvertToNumber(), params[1]); + } + } + + void addImpl(AggregateDataPtr place, const IColumn & column_value, const IColumn & column_position, size_t row_num, Arena *) const + { + /// TODO Do positions need to be 1-based for this function? + size_t position = column_position.get64(row_num); + + /// If position is larger than size to which array will be cutted - simply ignore value. + if (length_to_resize && position >= length_to_resize) + return; + + if (position >= AGGREGATE_FUNCTION_GROUP_ARRAY_INSERT_AT_MAX_SIZE) + throw Exception("Too large array size: position argument (" + toString(position) + ")" + " is greater or equals to limit (" + toString(AGGREGATE_FUNCTION_GROUP_ARRAY_INSERT_AT_MAX_SIZE) + ")", + ErrorCodes::TOO_LARGE_ARRAY_SIZE); + + Array & arr = data(place).value; + + if (arr.size() <= position) + arr.resize(position + 1); + else if (!arr[position].isNull()) + return; /// Element was already inserted to the specified position. 
+ + column_value.get(row_num, arr[position]); + } + + void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena * arena) const override + { + Array & arr_lhs = data(place).value; + const Array & arr_rhs = data(rhs).value; + + if (arr_lhs.size() < arr_rhs.size()) + arr_lhs.resize(arr_rhs.size()); + + for (size_t i = 0, size = arr_rhs.size(); i < size; ++i) + if (arr_lhs[i].isNull() && !arr_rhs[i].isNull()) + arr_lhs[i] = arr_rhs[i]; + } + + void serialize(ConstAggregateDataPtr place, WriteBuffer & buf) const override + { + const Array & arr = data(place).value; + size_t size = arr.size(); + writeVarUInt(size, buf); + + for (const Field & elem : arr) + { + if (elem.isNull()) + { + writeBinary(UInt8(1), buf); + } + else + { + writeBinary(UInt8(0), buf); + type->serializeBinary(elem, buf); + } + } + } + + void deserialize(AggregateDataPtr place, ReadBuffer & buf, Arena *) const override + { + size_t size = 0; + readVarUInt(size, buf); + + if (size > AGGREGATE_FUNCTION_GROUP_ARRAY_INSERT_AT_MAX_SIZE) + throw Exception("Too large array size", ErrorCodes::TOO_LARGE_ARRAY_SIZE); + + Array & arr = data(place).value; + + arr.resize(size); + for (size_t i = 0; i < size; ++i) + { + UInt8 is_null = 0; + readBinary(is_null, buf); + if (!is_null) + type->deserializeBinary(arr[i], buf); + } + } + + void insertResultInto(ConstAggregateDataPtr place, IColumn & to) const override + { + ColumnArray & to_array = static_cast(to); + IColumn & to_data = to_array.getData(); + ColumnArray::Offsets_t & to_offsets = to_array.getOffsets(); + + const Array & arr = data(place).value; + + for (const Field & elem : arr) + { + if (!elem.isNull()) + to_data.insert(elem); + else + to_data.insert(default_value); + } + + size_t result_array_size = length_to_resize ? length_to_resize : arr.size(); + + /// Pad array if need. + for (size_t i = arr.size(); i < result_array_size; ++i) + to_data.insert(default_value); + + to_offsets.push_back((to_offsets.empty() ? 
0 : to_offsets.back()) + result_array_size); + } +}; + + +#undef AGGREGATE_FUNCTION_GROUP_ARRAY_INSERT_AT_MAX_SIZE + +} diff --git a/dbms/src/AggregateFunctions/AggregateFunctionMerge.h b/dbms/src/AggregateFunctions/AggregateFunctionMerge.h index bf4ad82c98d..435d5c4c638 100644 --- a/dbms/src/AggregateFunctions/AggregateFunctionMerge.h +++ b/dbms/src/AggregateFunctions/AggregateFunctionMerge.h @@ -34,6 +34,11 @@ public: return nested_func->getReturnType(); } + AggregateFunctionPtr getNestedFunction() const + { + return nested_func_owner; + } + void setArguments(const DataTypes & arguments) override { if (arguments.size() != 1) diff --git a/dbms/src/AggregateFunctions/AggregateFunctionSequenceMatch.h b/dbms/src/AggregateFunctions/AggregateFunctionSequenceMatch.h index cb5c55adc92..bb1c1a8b103 100644 --- a/dbms/src/AggregateFunctions/AggregateFunctionSequenceMatch.h +++ b/dbms/src/AggregateFunctions/AggregateFunctionSequenceMatch.h @@ -286,7 +286,7 @@ private: ParserString dot_p("."); ParserNumber number_p; - auto pos = pattern.data(); + const char * pos = pattern.data(); const auto begin = pos; const auto end = pos + pattern.size(); diff --git a/dbms/src/AggregateFunctions/AggregateFunctionState.cpp b/dbms/src/AggregateFunctions/AggregateFunctionState.cpp index f9d60a2f885..0bb1345cf00 100644 --- a/dbms/src/AggregateFunctions/AggregateFunctionState.cpp +++ b/dbms/src/AggregateFunctions/AggregateFunctionState.cpp @@ -1,8 +1,34 @@ #include +#include namespace DB { +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; +} + +DataTypePtr AggregateFunctionState::getReturnType() const +{ + auto ptr = std::make_shared(nested_func_owner, arguments, params); + + /// Special case: it is -MergeState combinator + if (typeid_cast(ptr->getFunction().get())) + { + if (arguments.size() != 1) + throw Exception("Combinator -MergeState expects only one argument", ErrorCodes::BAD_ARGUMENTS); + + if (!typeid_cast(arguments[0].get())) + throw Exception("Combinator -MergeState expects 
argument with AggregateFunction type", ErrorCodes::BAD_ARGUMENTS); + + return arguments[0]; + } + + return ptr; +} + + AggregateFunctionPtr createAggregateFunctionState(AggregateFunctionPtr & nested) { return std::make_shared(nested); diff --git a/dbms/src/AggregateFunctions/AggregateFunctionState.h b/dbms/src/AggregateFunctions/AggregateFunctionState.h index 59e5f984399..86511be93d0 100644 --- a/dbms/src/AggregateFunctions/AggregateFunctionState.h +++ b/dbms/src/AggregateFunctions/AggregateFunctionState.h @@ -1,3 +1,4 @@ + #pragma once #include @@ -30,10 +31,7 @@ public: return nested_func->getName() + "State"; } - DataTypePtr getReturnType() const override - { - return std::make_shared(nested_func_owner, arguments, params); - } + DataTypePtr getReturnType() const override; void setArguments(const DataTypes & arguments_) override { diff --git a/dbms/src/AggregateFunctions/AggregateFunctionTopK.cpp b/dbms/src/AggregateFunctions/AggregateFunctionTopK.cpp new file mode 100644 index 00000000000..3310691a8ab --- /dev/null +++ b/dbms/src/AggregateFunctions/AggregateFunctionTopK.cpp @@ -0,0 +1,70 @@ +#include +#include +#include + +namespace DB +{ + +namespace +{ + +/// Substitute return type for Date and DateTime +class AggregateFunctionTopKDate : public AggregateFunctionTopK +{ + DataTypePtr getReturnType() const override { return std::make_shared(std::make_shared()); } +}; + +class AggregateFunctionTopKDateTime : public AggregateFunctionTopK +{ + DataTypePtr getReturnType() const override { return std::make_shared(std::make_shared()); } +}; + + +static IAggregateFunction * createWithExtraTypes(const IDataType & argument_type) +{ + if (typeid_cast(&argument_type)) return new AggregateFunctionTopKDate; + else if (typeid_cast(&argument_type)) return new AggregateFunctionTopKDateTime; + else + { + /// Check that we can use plain version of AggregateFunctionTopKGeneric + if (typeid_cast(&argument_type) || typeid_cast(&argument_type)) + return new 
AggregateFunctionTopKGeneric; + + auto * array_type = typeid_cast(&argument_type); + if (array_type) + { + auto nested_type = array_type->getNestedType(); + if (nested_type->isNumeric() || typeid_cast(nested_type.get())) + return new AggregateFunctionTopKGeneric; + } + + return new AggregateFunctionTopKGeneric; + } +} + +AggregateFunctionPtr createAggregateFunctionTopK(const std::string & name, const DataTypes & argument_types) +{ + if (argument_types.size() != 1) + throw Exception("Incorrect number of arguments for aggregate function " + name, + ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); + + AggregateFunctionPtr res(createWithNumericType(*argument_types[0])); + + if (!res) + res = AggregateFunctionPtr(createWithExtraTypes(*argument_types[0])); + + if (!res) + throw Exception("Illegal type " + argument_types[0]->getName() + + " of argument for aggregate function " + name, ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + + return res; +} + +} + +void registerAggregateFunctionTopK(AggregateFunctionFactory & factory) +{ + factory.registerFunction("topK", createAggregateFunctionTopK); +} + +} diff --git a/dbms/src/AggregateFunctions/AggregateFunctionTopK.h b/dbms/src/AggregateFunctions/AggregateFunctionTopK.h new file mode 100644 index 00000000000..f8143cbe7b4 --- /dev/null +++ b/dbms/src/AggregateFunctions/AggregateFunctionTopK.h @@ -0,0 +1,261 @@ +#pragma once + +#include +#include + +#include +#include +#include + +#include + +#include + +#include + +#include + + +namespace DB +{ + + +// Allow NxK more space before calculating top K to increase accuracy +#define TOP_K_DEFAULT 10 +#define TOP_K_LOAD_FACTOR 3 +#define TOP_K_MAX_SIZE 0xFFFFFF + + +template +struct AggregateFunctionTopKData +{ + using Set = SpaceSaving + < + T, + T, + HashCRC32, + HashTableGrower<4>, + HashTableAllocatorWithStackMemory + >; + Set value; +}; + + +template +class AggregateFunctionTopK + : public IUnaryAggregateFunction, AggregateFunctionTopK> +{ +private: + using State = 
AggregateFunctionTopKData; + size_t threshold = TOP_K_DEFAULT; + size_t reserved = TOP_K_LOAD_FACTOR * threshold; + +public: + String getName() const override { return "topK"; } + + DataTypePtr getReturnType() const override + { + return std::make_shared(std::make_shared>()); + } + + void setArgument(const DataTypePtr & argument) + { + } + + void setParameters(const Array & params) override + { + if (params.size() != 1) + throw Exception("Aggregate function " + getName() + " requires exactly one parameter.", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); + + std::size_t k = applyVisitor(FieldVisitorConvertToNumber(), params[0]); + + if (k > TOP_K_MAX_SIZE) + throw Exception("Too large parameter for aggregate function " + getName() + ". Maximum: " + toString(TOP_K_MAX_SIZE), + ErrorCodes::ARGUMENT_OUT_OF_BOUND); + + threshold = k; + reserved = TOP_K_LOAD_FACTOR * k; + } + + void addImpl(AggregateDataPtr place, const IColumn & column, size_t row_num, Arena *) const + { + auto & set = this->data(place).value; + if (set.capacity() != reserved) + set.resize(reserved); + set.insert(static_cast &>(column).getData()[row_num]); + } + + void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena * arena) const override + { + this->data(place).value.merge(this->data(rhs).value); + } + + void serialize(ConstAggregateDataPtr place, WriteBuffer & buf) const override + { + this->data(place).value.write(buf); + } + + void deserialize(AggregateDataPtr place, ReadBuffer & buf, Arena *) const override + { + auto & set = this->data(place).value; + set.resize(reserved); + set.read(buf); + } + + void insertResultInto(ConstAggregateDataPtr place, IColumn & to) const override + { + ColumnArray & arr_to = static_cast(to); + ColumnArray::Offsets_t & offsets_to = arr_to.getOffsets(); + + const typename State::Set & set = this->data(place).value; + auto resultVec = set.topK(threshold); + size_t size = resultVec.size(); + + offsets_to.push_back((offsets_to.size() == 0 ? 
0 : offsets_to.back()) + size); + + typename ColumnVector::Container_t & data_to = static_cast &>(arr_to.getData()).getData(); + size_t old_size = data_to.size(); + data_to.resize(old_size + size); + + size_t i = 0; + for (auto it = resultVec.begin(); it != resultVec.end(); ++it, ++i) + data_to[old_size + i] = it->key; + } +}; + + +/// Generic implementation, it uses serialized representation as object descriptor. +struct AggregateFunctionTopKGenericData +{ + using Set = SpaceSaving + < + std::string, + StringRef, + StringRefHash, + HashTableGrower<4>, + HashTableAllocatorWithStackMemory + >; + + Set value; +}; + +/** Template parameter with true value should be used for columns that store their elements in memory continuously. + * For such columns topK() can be implemented more efficently (especially for small numeric arrays). + */ +template +class AggregateFunctionTopKGeneric : public IUnaryAggregateFunction> +{ +private: + using State = AggregateFunctionTopKGenericData; + DataTypePtr input_data_type; + size_t threshold = TOP_K_DEFAULT; + size_t reserved = TOP_K_LOAD_FACTOR * threshold; + + static void deserializeAndInsert(StringRef str, IColumn & data_to); + +public: + String getName() const override { return "topK"; } + + void setArgument(const DataTypePtr & argument) + { + input_data_type = argument; + } + + void setParameters(const Array & params) override + { + if (params.size() != 1) + throw Exception("Aggregate function " + getName() + " requires exactly one parameter.", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); + + size_t k = applyVisitor(FieldVisitorConvertToNumber(), params[0]); + + if (k > TOP_K_MAX_SIZE) + throw Exception("Too large parameter for aggregate function " + getName() + ". 
Maximum: " + toString(TOP_K_MAX_SIZE), + ErrorCodes::ARGUMENT_OUT_OF_BOUND); + + threshold = k; + reserved = TOP_K_LOAD_FACTOR * k; + } + + DataTypePtr getReturnType() const override + { + return std::make_shared(input_data_type->clone()); + } + + bool allocatesMemoryInArena() const override + { + return true; + } + + void serialize(ConstAggregateDataPtr place, WriteBuffer & buf) const override + { + this->data(place).value.write(buf); + } + + void deserialize(AggregateDataPtr place, ReadBuffer & buf, Arena * arena) const override + { + auto & set = this->data(place).value; + set.resize(reserved); + + size_t count = 0; + readVarUInt(count, buf); + for (size_t i = 0; i < count; ++i) { + std::string key_string; + readStringBinary(key_string, buf); + UInt64 count, error; + readVarUInt(count, buf); + readVarUInt(error, buf); + set.insert(key_string, count, error); + } + } + + void addImpl(AggregateDataPtr place, const IColumn & column, size_t row_num, Arena * arena) const + { + auto & set = this->data(place).value; + if (set.capacity() != reserved) { + set.resize(reserved); + } + + StringRef str_serialized = column.getDataAt(row_num); + set.insert(str_serialized.toString()); + } + + void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena * arena) const override + { + this->data(place).value.merge(this->data(rhs).value); + } + + void insertResultInto(ConstAggregateDataPtr place, IColumn & to) const override + { + ColumnArray & arr_to = static_cast(to); + ColumnArray::Offsets_t & offsets_to = arr_to.getOffsets(); + IColumn & data_to = arr_to.getData(); + + auto resultVec = this->data(place).value.topK(threshold); + offsets_to.push_back((offsets_to.size() == 0 ? 
0 : offsets_to.back()) + resultVec.size()); + + for (auto & elem : resultVec) + { + deserializeAndInsert(elem.key, data_to); + } + } +}; + + +template <> +inline void AggregateFunctionTopKGeneric::deserializeAndInsert(StringRef str, IColumn & data_to) +{ + data_to.deserializeAndInsertFromArena(str.data); +} + +template <> +inline void AggregateFunctionTopKGeneric::deserializeAndInsert(StringRef str, IColumn & data_to) +{ + data_to.insertData(str.data, str.size); +} + + +#undef TOP_K_DEFAULT +#undef TOP_K_MAX_SIZE +#undef TOP_K_LOAD_FACTOR + +} diff --git a/dbms/src/AggregateFunctions/AggregateFunctionUniq.h b/dbms/src/AggregateFunctions/AggregateFunctionUniq.h index 690fa330693..51ee93024d1 100644 --- a/dbms/src/AggregateFunctions/AggregateFunctionUniq.h +++ b/dbms/src/AggregateFunctions/AggregateFunctionUniq.h @@ -86,12 +86,11 @@ struct AggregateFunctionUniqExactData using Key = T; /// When creating, the hash table must be small. - typedef HashSet< + using Set = HashSet< Key, HashCRC32, HashTableGrower<4>, - HashTableAllocatorWithStackMemory - > Set; + HashTableAllocatorWithStackMemory>; Set set; @@ -105,12 +104,11 @@ struct AggregateFunctionUniqExactData using Key = UInt128; /// When creating, the hash table must be small. - typedef HashSet< + using Set = HashSet< Key, UInt128TrivialHash, HashTableGrower<3>, - HashTableAllocatorWithStackMemory - > Set; + HashTableAllocatorWithStackMemory>; Set set; diff --git a/dbms/src/AggregateFunctions/CMakeLists.txt b/dbms/src/AggregateFunctions/CMakeLists.txt new file mode 100644 index 00000000000..8e2f0368f99 --- /dev/null +++ b/dbms/src/AggregateFunctions/CMakeLists.txt @@ -0,0 +1,27 @@ +include(${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake) +add_headers_and_sources(clickhouse_aggregate_functions .) 
+ +list(REMOVE_ITEM clickhouse_aggregate_functions_sources + AggregateFunctionFactory.cpp + AggregateFunctionState.cpp + AggregateFunctionArray.cpp + AggregateFunctionNull.cpp + AggregateFunctionForEach.cpp + AggregateFunctionIf.cpp + AggregateFunctionMerge.cpp + AggregateFunctionCount.cpp +) + +list(REMOVE_ITEM clickhouse_aggregate_functions_headers + AggregateFunction.h + AggregateFunctionFactory.h + AggregateFunctionState.h + AggregateFunctionArray.h + AggregateFunctionNull.h + AggregateFunctionForEach.h + AggregateFunctionIf.h + AggregateFunctionMerge.h + AggregateFunctionCount.h +) + +add_library(clickhouse_aggregate_functions ${clickhouse_aggregate_functions_sources}) diff --git a/dbms/src/AggregateFunctions/IBinaryAggregateFunction.h b/dbms/src/AggregateFunctions/IBinaryAggregateFunction.h index 94aea8d947b..1d07cd22f74 100644 --- a/dbms/src/AggregateFunctions/IBinaryAggregateFunction.h +++ b/dbms/src/AggregateFunctions/IBinaryAggregateFunction.h @@ -19,8 +19,7 @@ public: if (arguments.size() != 2) throw Exception{ "Passed " + toString(arguments.size()) + " arguments to binary aggregate function " + this->getName(), - ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH - }; + ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH}; getDerived().setArgumentsImpl(arguments); } diff --git a/dbms/src/AggregateFunctions/registerAggregateFunctions.cpp b/dbms/src/AggregateFunctions/registerAggregateFunctions.cpp new file mode 100644 index 00000000000..5c8646fd583 --- /dev/null +++ b/dbms/src/AggregateFunctions/registerAggregateFunctions.cpp @@ -0,0 +1,54 @@ +#include + +#include + +namespace DB +{ + +void registerAggregateFunctionAvg(AggregateFunctionFactory & factory); +void registerAggregateFunctionCount(AggregateFunctionFactory & factory); +void registerAggregateFunctionGroupArray(AggregateFunctionFactory & factory); +void registerAggregateFunctionGroupUniqArray(AggregateFunctionFactory & factory); +void registerAggregateFunctionGroupArrayInsertAt(AggregateFunctionFactory & 
factory); +void registerAggregateFunctionsQuantile(AggregateFunctionFactory & factory); +void registerAggregateFunctionsQuantileExact(AggregateFunctionFactory & factory); +void registerAggregateFunctionsQuantileExactWeighted(AggregateFunctionFactory & factory); +void registerAggregateFunctionsQuantileDeterministic(AggregateFunctionFactory & factory); +void registerAggregateFunctionsQuantileTiming(AggregateFunctionFactory & factory); +void registerAggregateFunctionsQuantileTDigest(AggregateFunctionFactory & factory); +void registerAggregateFunctionsSequenceMatch(AggregateFunctionFactory & factory); +void registerAggregateFunctionsMinMaxAny(AggregateFunctionFactory & factory); +void registerAggregateFunctionsStatistics(AggregateFunctionFactory & factory); +void registerAggregateFunctionSum(AggregateFunctionFactory & factory); +void registerAggregateFunctionsUniq(AggregateFunctionFactory & factory); +void registerAggregateFunctionUniqUpTo(AggregateFunctionFactory & factory); +void registerAggregateFunctionTopK(AggregateFunctionFactory & factory); +void registerAggregateFunctionDebug(AggregateFunctionFactory & factory); + + +void registerAggregateFunctions() +{ + auto & factory = AggregateFunctionFactory::instance(); + + registerAggregateFunctionAvg(factory); + registerAggregateFunctionCount(factory); + registerAggregateFunctionGroupArray(factory); + registerAggregateFunctionGroupUniqArray(factory); + registerAggregateFunctionGroupArrayInsertAt(factory); + registerAggregateFunctionsQuantile(factory); + registerAggregateFunctionsQuantileExact(factory); + registerAggregateFunctionsQuantileExactWeighted(factory); + registerAggregateFunctionsQuantileDeterministic(factory); + registerAggregateFunctionsQuantileTiming(factory); + registerAggregateFunctionsQuantileTDigest(factory); + registerAggregateFunctionsSequenceMatch(factory); + registerAggregateFunctionsMinMaxAny(factory); + registerAggregateFunctionsStatistics(factory); + registerAggregateFunctionSum(factory); + 
registerAggregateFunctionsUniq(factory); + registerAggregateFunctionUniqUpTo(factory); + registerAggregateFunctionTopK(factory); + registerAggregateFunctionDebug(factory); +} + +} diff --git a/dbms/src/AggregateFunctions/registerAggregateFunctions.h b/dbms/src/AggregateFunctions/registerAggregateFunctions.h new file mode 100644 index 00000000000..2a2e0bb7d3f --- /dev/null +++ b/dbms/src/AggregateFunctions/registerAggregateFunctions.h @@ -0,0 +1,8 @@ +#pragma once + +namespace DB +{ + +void registerAggregateFunctions(); + +} diff --git a/dbms/src/Analyzers/TypeAndConstantInference.cpp b/dbms/src/Analyzers/TypeAndConstantInference.cpp index 0b227b0f8cc..39b85811f56 100644 --- a/dbms/src/Analyzers/TypeAndConstantInference.cpp +++ b/dbms/src/Analyzers/TypeAndConstantInference.cpp @@ -165,7 +165,7 @@ void processFunction(const String & column_name, ASTPtr & ast, TypeAndConstantIn } /// Aggregate function. - if (AggregateFunctionPtr aggregate_function_ptr = context.getAggregateFunctionFactory().tryGet(function->name, argument_types)) + if (AggregateFunctionPtr aggregate_function_ptr = AggregateFunctionFactory::instance().tryGet(function->name, argument_types)) { /// NOTE Not considering aggregate function parameters in type inference. It could become needed in future. /// Note that aggregate function could never be constant expression. 
diff --git a/dbms/src/Analyzers/tests/CMakeLists.txt b/dbms/src/Analyzers/tests/CMakeLists.txt index 512d955e038..42fcdc54e5e 100644 --- a/dbms/src/Analyzers/tests/CMakeLists.txt +++ b/dbms/src/Analyzers/tests/CMakeLists.txt @@ -2,22 +2,22 @@ add_executable(collect_aliases collect_aliases.cpp) target_link_libraries(collect_aliases dbms) add_executable(collect_tables collect_tables.cpp) -target_link_libraries(collect_tables dbms storages_system) +target_link_libraries(collect_tables dbms clickhouse_storages_system) add_executable(analyze_columns analyze_columns.cpp) -target_link_libraries(analyze_columns dbms storages_system) +target_link_libraries(analyze_columns dbms clickhouse_storages_system) add_executable(type_and_constant_inference type_and_constant_inference.cpp) -target_link_libraries(type_and_constant_inference storages_system clickhouse_functions dbms) +target_link_libraries(type_and_constant_inference clickhouse_storages_system clickhouse_functions dbms) add_executable(analyze_result_of_query analyze_result_of_query.cpp) -target_link_libraries(analyze_result_of_query dbms storages_system) +target_link_libraries(analyze_result_of_query dbms clickhouse_storages_system) add_executable(translate_positional_arguments translate_positional_arguments.cpp) target_link_libraries(translate_positional_arguments dbms) add_executable(optimize_group_order_limit_by optimize_group_order_limit_by.cpp) -target_link_libraries(optimize_group_order_limit_by dbms storages_system) +target_link_libraries(optimize_group_order_limit_by dbms clickhouse_storages_system) add_executable(analyze_lambdas analyze_lambdas.cpp) target_link_libraries(analyze_lambdas dbms) diff --git a/dbms/src/Client/CMakeLists.txt b/dbms/src/Client/CMakeLists.txt index 999a46a0826..da5dc27cdb5 100644 --- a/dbms/src/Client/CMakeLists.txt +++ b/dbms/src/Client/CMakeLists.txt @@ -1,5 +1,5 @@ add_library (clickhouse-client Client.cpp) -target_link_libraries (clickhouse-client dbms ${LINE_EDITING_LIBS} 
${Boost_PROGRAM_OPTIONS_LIBRARY}) +target_link_libraries (clickhouse-client dbms clickhouse_aggregate_functions ${LINE_EDITING_LIBS} ${Boost_PROGRAM_OPTIONS_LIBRARY}) install (FILES config.xml DESTINATION ${CLICKHOUSE_ETC_DIR}/clickhouse-client COMPONENT clickhouse-client) add_library (clickhouse-benchmark Benchmark.cpp) diff --git a/dbms/src/Client/Client.cpp b/dbms/src/Client/Client.cpp index 7e4ad1a7607..530301cabd4 100644 --- a/dbms/src/Client/Client.cpp +++ b/dbms/src/Client/Client.cpp @@ -46,6 +46,7 @@ #include #include #include +#include /// http://en.wikipedia.org/wiki/ANSI_escape_code @@ -191,6 +192,7 @@ private: #undef EXTRACT_LIMIT registerFunctions(); + registerAggregateFunctions(); } diff --git a/dbms/src/Common/AIO.h b/dbms/src/Common/AIO.h index 5702a8bc683..df3e22a5338 100644 --- a/dbms/src/Common/AIO.h +++ b/dbms/src/Common/AIO.h @@ -15,7 +15,7 @@ #include -/** Небольшие обёртки для асинхронного ввода-вывода. +/** Small wrappers for asynchronous I/O. */ diff --git a/dbms/src/Common/Allocator.cpp b/dbms/src/Common/Allocator.cpp index 7eca841833e..b57ba0dd798 100644 --- a/dbms/src/Common/Allocator.cpp +++ b/dbms/src/Common/Allocator.cpp @@ -22,15 +22,15 @@ namespace ErrorCodes } -/** Многие современные аллокаторы (например, tcmalloc) не умеют делать mremap для realloc, - * даже в случае достаточно больших кусков памяти. - * Хотя это позволяет увеличить производительность и уменьшить потребление памяти во время realloc-а. - * Чтобы это исправить, делаем mremap самостоятельно, если кусок памяти достаточно большой. - * Порог (64 МБ) выбран достаточно большим, так как изменение адресного пространства - * довольно сильно тормозит, особенно в случае наличия большого количества потоков. - * Рассчитываем, что набор операций mmap/что-то сделать/mremap может выполняться всего лишь около 1000 раз в секунду. +/** Many modern allocators (for example, tcmalloc) do not do a mremap for realloc, + * even in case of large enough chunks of memory. 
+ * Although this allows you to increase performance and reduce memory consumption during realloc. + * To fix this, we do mremap manually if the chunk of memory is large enough. + * The threshold (64 MB) is chosen quite large, since changing the address space is + * very slow, especially in the case of a large number of threads. + * We expect that the set of operations mmap/something to do/mremap can only be performed about 1000 times per second. * - * PS. Также это требуется, потому что tcmalloc не может выделить кусок памяти больше 16 GB. + * PS. This is also required, because tcmalloc can not allocate a chunk of memory greater than 16 GB. */ static constexpr size_t MMAP_THRESHOLD = 64 * (1 << 20); static constexpr size_t MMAP_MIN_ALIGNMENT = 4096; diff --git a/dbms/src/Common/Allocator.h b/dbms/src/Common/Allocator.h index 8018ebbde79..8079b37cf0c 100644 --- a/dbms/src/Common/Allocator.h +++ b/dbms/src/Common/Allocator.h @@ -3,13 +3,13 @@ #include -/** Отвечает за выделение/освобождение памяти. Используется, например, в PODArray, Arena. - * Также используется в хэш-таблицах. - * Интерфейс отличается от std::allocator - * - наличием метода realloc, который для больших кусков памяти использует mremap; - * - передачей размера в метод free; - * - наличием аргумента alignment; - * - возможностью зануления памяти (используется в хэш-таблицах); +/** Responsible for allocating / freeing memory. Used, for example, in PODArray, Arena. + * Also used in hash tables. 
+ * The interface is different from std::allocator + * - the presence of the method realloc, which for large chunks of memory uses mremap; + * - passing the size into the `free` method; + * - by the presence of the `alignment` argument; + * - the possibility of zeroing memory (used in hash tables); */ template class Allocator @@ -38,9 +38,9 @@ protected: }; -/** При использовании AllocatorWithStackMemory, размещённом на стеке, - * GCC 4.9 ошибочно делает предположение, что мы можем вызывать free от указателя на стек. - * На самом деле, комбинация условий внутри AllocatorWithStackMemory этого не допускает. +/** When using AllocatorWithStackMemory, located on the stack, + * GCC 4.9 mistakenly assumes that we can call `free` from a pointer to the stack. + * In fact, the combination of conditions inside AllocatorWithStackMemory does not allow this. */ #if !__clang__ #pragma GCC diagnostic push diff --git a/dbms/src/Common/ArenaWithFreeLists.h b/dbms/src/Common/ArenaWithFreeLists.h index 08a3c84d8ca..4a8801ed2f1 100644 --- a/dbms/src/Common/ArenaWithFreeLists.h +++ b/dbms/src/Common/ArenaWithFreeLists.h @@ -8,40 +8,40 @@ namespace DB { -/** В отличие от Arena, позволяет освобождать (для последующего повторного использования) - * выделенные ранее (не обязательно только что) куски памяти. - * Для этого, запрашиваемый размер округляется вверх до степени двух - * (или до 8, если меньше; или используется выделение памяти вне Arena, если размер больше 65536). - * При освобождении памяти, для каждого размера (всего 14 вариантов: 8, 16... 65536), - * поддерживается односвязный список свободных блоков. - * При аллокации, мы берём голову списка свободных блоков, - * либо, если список пуст - выделяем новый блок, используя Arena. +/** Unlike Arena, allows you to release (for later re-use) + * previously allocated (not necessarily just recently) chunks of memory. 
+ * For this, the requested size is rounded up to the power of two + * (or up to 8, if less, or using memory allocation outside Arena if the size is greater than 65536). + * When freeing memory, for each size (14 options in all: 8, 16 ... 65536), + * a single-linked list of free blocks is kept track. + * When allocating, we take the head of the list of free blocks, + * or, if the list is empty - allocate a new block using Arena. */ class ArenaWithFreeLists : private Allocator, private boost::noncopyable { private: - /// Если блок свободен, то в его начале хранится указатель на следующий свободный блок, либо nullptr, если свободных блоков больше нет. - /// Если блок используется, то в нём хранятся какие-то данные. + /// If the block is free, then the pointer to the next free block is stored at its beginning, or nullptr, if there are no more free blocks. + /// If the block is used, then some data is stored in it. union Block { Block * next; char data[0]; }; - /// Максимальный размер куска памяти, который выделяется с помощью Arena. Иначе используем Allocator напрямую. + /// The maximum size of a piece of memory that is allocated with Arena. Otherwise, we use Allocator directly. static constexpr size_t max_fixed_block_size = 65536; - /// Получить индекс в массиве freelist-ов для заданного размера. + /// Get the index in the freelist array for the specified size. static size_t findFreeListIndex(const size_t size) { return size <= 8 ? 2 : bitScanReverse(size - 1); } - /// Для выделения блоков не слишком большого размера используется Arena. + /// Arena is used to allocate blocks that are not too large. Arena pool; - /// Списки свободных блоков. Каждый элемент указывает на голову соответствующего списка, либо равен nullptr. - /// Первые два элемента не используются, а предназначены для упрощения арифметики. + /// Lists of free blocks. Each element points to the head of the corresponding list, or is nullptr. 
+ /// The first two elements are not used, but are intended to simplify arithmetic. Block * free_lists[16] {}; public: @@ -60,10 +60,10 @@ public: /// find list of required size const auto list_idx = findFreeListIndex(size); - /// Если есть свободный блок. + /// If there is a free block. if (auto & free_block_ptr = free_lists[list_idx]) { - /// Возьмём его. И поменяем голову списка на следующий элемент списка. + /// Let's take it. And change the head of the list to the next item in the list. const auto res = free_block_ptr->data; free_block_ptr = free_block_ptr->next; return res; @@ -81,14 +81,14 @@ public: /// find list of required size const auto list_idx = findFreeListIndex(size); - /// Вставим освобождённый блок в голову списка. + /// Insert the released block into the head of the list. auto & free_block_ptr = free_lists[list_idx]; const auto old_head = free_block_ptr; free_block_ptr = reinterpret_cast(ptr); free_block_ptr->next = old_head; } - /// Размер выделенного пула в байтах + /// Size of the allocated pool in bytes size_t size() const { return pool.size(); diff --git a/dbms/src/Common/AutoArray.h b/dbms/src/Common/AutoArray.h index 60246b4d1b2..7b5d13d417c 100644 --- a/dbms/src/Common/AutoArray.h +++ b/dbms/src/Common/AutoArray.h @@ -8,30 +8,30 @@ namespace DB { -/** Массив (почти) неизменяемого размера: - * размер задаётся в конструкторе; - * метод resize приводит к удалению старых данных и нужен лишь для того, - * чтобы можно было сначала создать пустой объект, используя конструктор по-умолчанию, - * а потом уже определиться с размером. +/** An array of (almost) unchangable size: + * the size is specified in the constructor; + * `resize` method removes old data, and necessary only for + * so that you can first create an empty object using the default constructor, + * and then decide on the size. * - * Есть возможность не инициализировать элементы по-умолчанию, а создавать их inplace. - * Деструкторы элементов вызываются автоматически. 
+ * There is a possibility to not initialize elements by default, but create them inplace. + * Member destructors are called automatically. * - * sizeof равен размеру одного указателя. + * `sizeof` is equal to the size of one pointer. * - * Не exception-safe. - * Копирование не поддерживается. Перемещение опустошает исходный объект. - * То есть, использовать этот массив во многих случаях неудобно. + * Not exception-safe. + * Copying is not supported. Moving empties the original object. + * That is, it is inconvenient to use this array in many cases. * - * Предназначен для ситуаций, в которых создаётся много массивов одинакового небольшого размера, - * но при этом размер не известен во время компиляции. - * Также даёт существенное преимущество в случаях, когда важно, чтобы sizeof был минимальным. - * Например, если массивы кладутся в open-addressing хэш-таблицу с inplace хранением значений (как HashMap) + * Designed for situations in which many arrays of the same small size are created, + * but the size is not known at compile time. + * Also gives a significant advantage in cases where it is important that `sizeof` is minimal. + * For example, if arrays are put in an open-addressing hash table with inplace storage of values (like HashMap) * - * В этом случае, по сравнению с std::vector: - * - для массивов размером в 1 элемент - преимущество примерно в 2 раза; - * - для массивов размером в 5 элементов - преимущество примерно в 1.5 раза - * (в качестве T использовались DB::Field, содержащие UInt64 и String); + * In this case, compared to std::vector: + * - for arrays of 1 element size - an advantage of about 2 times; + * - for arrays of 5 elements - an advantage of about 1.5 times + * (DB::Field, containing UInt64 and String, used as T); */ const size_t empty_auto_array_helper = 0; @@ -42,7 +42,7 @@ template class AutoArray { public: - /// Для отложенного создания. + /// For deferred creation. 
AutoArray() { setEmpty(); @@ -53,16 +53,16 @@ public: init(size_, false); } - /** Не будут вызваны конструкторы по-умолчанию для элементов. - * В этом случае, вы должны вставить все элементы с помощью функции place и placement new, - * так как для них потом будут вызваны деструкторы. + /** The default constructors for elements will not be called. + * In this case, you must insert all elements using the `place` and `placement new` functions, + * since destructors are then called for them. */ AutoArray(size_t size_, const DontInitElemsTag & tag) { init(size_, true); } - /** Инициализирует все элементы копирующим конструктором с параметром value. + /** Initializes all elements with a copy constructor with the `value` parameter. */ AutoArray(size_t size_, const T & value) { @@ -74,7 +74,7 @@ public: } } - /** resize удаляет все существующие элементы. + /** `resize` removes all existing items. */ void resize(size_t size_, bool dont_init_elems = false) { @@ -82,7 +82,7 @@ public: init(size_, dont_init_elems); } - /** Премещение. + /** Move operations. */ AutoArray(AutoArray && src) { @@ -125,10 +125,10 @@ public: setEmpty(); } - /** Можно читать и модифицировать элементы с помощью оператора [] - * только если элементы были инициализированы - * (то есть, в конструктор не был передан DontInitElemsTag, - * или вы их инициализировали с помощью place и placement new). + /** You can read and modify elements using the [] operator + * only if items were initialized + * (that is, into the constructor was not passed DontInitElemsTag, + * or you initialized them using `place` and `placement new`). */ T & operator[](size_t i) { @@ -140,9 +140,9 @@ public: return elem(i); } - /** Получить кусок памяти, в котором должен быть расположен элемент. - * Функция предназначена, чтобы инициализировать элемент, - * который ещё не был инициализирован: + /** Get the piece of memory in which the element should be located. 
+ * The function is intended to initialize an element, + * which has not yet been initialized * new (arr.place(i)) T(args); */ char * place(size_t i) diff --git a/dbms/src/Common/CombinedCardinalityEstimator.h b/dbms/src/Common/CombinedCardinalityEstimator.h index e89b62699f6..94d21064a42 100644 --- a/dbms/src/Common/CombinedCardinalityEstimator.h +++ b/dbms/src/Common/CombinedCardinalityEstimator.h @@ -23,9 +23,9 @@ static inline ContainerType max(const ContainerType & lhs, const ContainerType & } -/** Для маленького количества ключей - массив фиксированного размера "на стеке". - * Для среднего - выделяется HashSet. - * Для большого - выделяется HyperLogLog. +/** For a small number of keys - an array of fixed size "on the stack". + * For the average, HashSet is allocated. + * For large, HyperLogLog is allocated. */ template < @@ -146,7 +146,7 @@ public: getContainer().merge(rhs.getContainer()); } - /// Можно вызывать только для пустого объекта. + /// You can only call for an empty object. void read(DB::ReadBuffer & in) { UInt8 v; @@ -171,8 +171,8 @@ public: { auto container_type = getContainerType(); - /// Если readAndMerge вызывается с пустым состоянием, просто десериализуем - /// состояние задано в качестве параметра. + /// If readAndMerge is called with an empty state, just deserialize + /// the state is specified as a parameter. if ((container_type == details::ContainerType::SMALL) && small.empty()) { read(in); diff --git a/dbms/src/Common/CompactArray.h b/dbms/src/Common/CompactArray.h index c53fb4d5b7b..ee4d74b3ed2 100644 --- a/dbms/src/Common/CompactArray.h +++ b/dbms/src/Common/CompactArray.h @@ -15,11 +15,11 @@ namespace ErrorCodes } -/** Компактный массив для хранения данных, размер content_width, в битах, которых составляет - * меньше одного байта. 
Вместо того, чтобы хранить каждое значение в отдельный - * байт, что приводит к растрате 37.5% пространства для content_width=5, CompactArray хранит - * смежные content_width-битные значения в массиве байтов, т.е. фактически CompactArray - * симулирует массив content_width-битных значений. +/** Compact array for data storage, size `content_width`, in bits, of which is + * less than one byte. Instead of storing each value in a separate + * bytes, which leads to a waste of 37.5% of the space for content_width = 5, CompactArray stores + * adjacent `content_width`-bit values in the byte array, that is actually CompactArray + * simulates an array of `content_width`-bit values. */ template class __attribute__ ((packed)) CompactArray final @@ -76,12 +76,12 @@ public: } private: - /// число байт в битсете + /// number of bytes in bitset static constexpr size_t BITSET_SIZE = (static_cast(bucket_count) * content_width + 7) / 8; UInt8 bitset[BITSET_SIZE] = { 0 }; }; -/** Класс для последовательного чтения ячеек из компактного массива на диске. +/** A class for sequentially reading cells from a compact array on a disk. */ template class CompactArray::Reader final @@ -135,7 +135,7 @@ public: return true; } - /** Вернуть текущий номер ячейки и соответствующее содержание. + /** Return the current cell number and the corresponding content. */ inline std::pair get() const { @@ -150,26 +150,26 @@ public: private: ReadBuffer & in; - /// Физическое расположение текущей ячейки. + /// The physical location of the current cell. Locus locus; - /// Текущая позиция в файле в виде номера ячейки. + /// The current position in the file as a cell number. BucketIndex current_bucket_index = 0; - /// Количество прочитанных байтов. + /// The number of bytes read. size_t read_count = 0; - /// Содержание в текущей позиции. + /// The content in the current position. UInt8 value_l; UInt8 value_r; /// bool is_eof = false; - /// Влезает ли ячейка полностью в один байт? 
+ /// Does the cell fully fit into one byte? bool fits_in_byte; }; -/** Структура Locus содержит необходимую информацию, чтобы найти для каждой ячейки - * соответствующие байт и смещение, в битах, от начала ячейки. Поскольку в общем - * случае размер одного байта не делится на размер одной ячейки, возможны случаи, - * когда одна ячейка перекрывает два байта. Поэтому структура Locus содержит две - * пары (индекс, смещение). +/** The `Locus` structure contains the necessary information to find for each cell + * the corresponding byte and offset, in bits, from the beginning of the cell. Since in general + * case the size of one byte is not divisible by the size of one cell, cases possible + * when one cell overlaps two bytes. Therefore, the `Locus` structure contains two + * pairs (index, offset). */ template class CompactArray::Locus final @@ -190,13 +190,13 @@ public: { if ((index_l == index_r) || (index_l == (BITSET_SIZE - 1))) { - /// Ячейка полностью влезает в один байт. + /// The cell completely fits into one byte. *content_l &= ~(((1 << content_width) - 1) << offset_l); *content_l |= content << offset_l; } else { - /// Ячейка перекрывает два байта. + /// The cell overlaps two bytes. size_t left = 8 - offset_l; *content_l &= ~(((1 << left) - 1) << offset_l); @@ -230,13 +230,13 @@ private: UInt8 ALWAYS_INLINE read(UInt8 value_l) const { - /// Ячейка полностью влезает в один байт. + /// The cell completely fits into one byte. return (value_l >> offset_l) & ((1 << content_width) - 1); } UInt8 ALWAYS_INLINE read(UInt8 value_l, UInt8 value_r) const { - /// Ячейка перекрывает два байта. + /// The cell overlaps two bytes. 
return ((value_l >> offset_l) & ((1 << (8 - offset_l)) - 1)) | ((value_r & ((1 << offset_r) - 1)) << (8 - offset_l)); } @@ -250,7 +250,7 @@ private: UInt8 * content_l; UInt8 * content_r; - /// Проверки + /// Checks static_assert((content_width > 0) && (content_width < 8), "Invalid parameter value"); static_assert(bucket_count <= (std::numeric_limits::max() / content_width), "Invalid parameter value"); }; diff --git a/dbms/src/Common/ConcurrentBoundedQueue.h b/dbms/src/Common/ConcurrentBoundedQueue.h index cbcfd1b5eb8..9b9a80dd9f1 100644 --- a/dbms/src/Common/ConcurrentBoundedQueue.h +++ b/dbms/src/Common/ConcurrentBoundedQueue.h @@ -38,9 +38,9 @@ namespace detail } }; -/** Очень простая thread-safe очередь ограниченной длины. - * Если пытаться вынуть элемент из пустой очереди, то поток блокируется, пока очередь не станет непустой. - * Если пытаться вставить элемент в переполненную очередь, то поток блокируется, пока в очереди не появится элемент. +/** A very simple thread-safe queue of limited length. + * If you try to pop an item from an empty queue, the thread is blocked until the queue becomes nonempty. + * If you try to push an element into an overflowed queue, the thread is blocked until space appears in the queue. */ template class ConcurrentBoundedQueue diff --git a/dbms/src/Common/CounterInFile.h b/dbms/src/Common/CounterInFile.h index 3ff4d9aeb67..99320a1fbfd 100644 --- a/dbms/src/Common/CounterInFile.h +++ b/dbms/src/Common/CounterInFile.h @@ -22,24 +22,24 @@ #define SMALL_READ_WRITE_BUFFER_SIZE 16 -/** Хранит в файле число. - * Предназначен для редких вызовов (не рассчитан на производительность). +/** Stores a number in the file. + * Designed for rare calls (not designed for performance). */ class CounterInFile { public: - /// path - имя файла, включая путь + /// path - the name of the file, including the path CounterInFile(const std::string & path_) : path(path_) {} - /** Добавить delta к числу в файле и вернуть новое значение. 
- * Если параметр create_if_need не установлен в true, то - * в файле уже должно быть записано какое-нибудь число (если нет - создайте файл вручную с нулём). + /** Add `delta` to the number in the file and return the new value. + * If the `create_if_need` parameter is not set to true, then + * the file should already have a number written (if not - create the file manually with zero). * - * Для защиты от race condition-ов между разными процессами, используются файловые блокировки. - * (Но при первом создании файла race condition возможен, так что лучше создать файл заранее.) + * To protect against race conditions between different processes, file locks are used. + * (But when the first file is created, the race condition is possible, so it's better to create the file in advance.) * - * locked_callback вызывается при заблокированном файле со счетчиком. В него передается новое значение. - * locked_callback можно использовать, чтобы делать что-нибудь атомарно с увеличением счетчика (например, переименовывать файлы). + * `locked_callback` is called when the counter file is locked. A new value is passed to it. + * `locked_callback` can be used to do something atomically with incrementing the counter (for example, renaming files). */ template Int64 add(Int64 delta, Callback && locked_callback, bool create_if_need = false) @@ -74,7 +74,7 @@ public: } catch (const DB::Exception & e) { - /// Более понятное сообщение об ошибке. + /// A more understandable error message. if (e.code() == DB::ErrorCodes::CANNOT_READ_ALL_DATA || e.code() == DB::ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF) throw DB::Exception("File " + path + " is empty. You must fill it manually with appropriate value.", e.code()); else @@ -118,13 +118,13 @@ public: return path; } - /// Изменить путь к файлу. + /// Change the path to the file. void setPath(std::string path_) { path = path_; } - // Не thread-safe и не синхронизирован между процессами. + // Not thread-safe and not synchronized between processes. 
void fixIfBroken(UInt64 value) { bool file_exists = Poco::File(path).exists(); diff --git a/dbms/src/Common/Exception.h b/dbms/src/Common/Exception.h index aa3f4544b74..05a40479308 100644 --- a/dbms/src/Common/Exception.h +++ b/dbms/src/Common/Exception.h @@ -35,7 +35,7 @@ public: DB::Exception * clone() const override { return new DB::Exception(*this); } void rethrow() const override { throw *this; } - /// Дописать к существующему сообщению что-нибудь ещё. + /// Add something to the existing message. void addMessage(const std::string & arg) { extendedMessage(arg); } const StackTrace & getStackTrace() const { return trace; } @@ -45,7 +45,7 @@ private: }; -/// Содержит дополнительный член saved_errno. См. функцию throwFromErrno. +/// Contains an additional member `saved_errno`. See the throwFromErrno function. class ErrnoException : public Exception { public: @@ -73,8 +73,8 @@ using Exceptions = std::vector; void throwFromErrno(const std::string & s, int code = 0, int the_errno = errno); -/** Попробовать записать исключение в лог (и забыть про него). - * Можно использовать в деструкторах в блоке catch (...). +/** Try to write an exception to the log (and forget about it). + * Can be used in destructors in the catch-all block. */ void tryLogCurrentException(const char * log_name, const std::string & start_of_message = ""); void tryLogCurrentException(Poco::Logger * logger, const std::string & start_of_message = ""); diff --git a/dbms/src/Common/ExternalTable.h b/dbms/src/Common/ExternalTable.h index e275ce2e37f..257431a5d15 100644 --- a/dbms/src/Common/ExternalTable.h +++ b/dbms/src/Common/ExternalTable.h @@ -25,16 +25,16 @@ namespace ErrorCodes } -/// Базовый класс содержащий основную информацию о внешней таблице и -/// основные функции для извлечения этой информации из текстовых полей. +/// The base class containing the basic information about external table and +/// basic functions for extracting this information from text fields. 
class BaseExternalTable { public: - std::string file; /// Файл с данными или '-' если stdin - std::string name; /// Имя таблицы - std::string format; /// Название формата хранения данных + std::string file; /// File with data or '-' if stdin + std::string name; /// The name of the table + std::string format; /// Name of the data storage format - /// Описание структуры таблицы: (имя столбца, имя типа данных) + /// Description of the table structure: (column name, data type name) std::vector > structure; std::unique_ptr read_buffer; @@ -42,10 +42,10 @@ public: virtual ~BaseExternalTable() {}; - /// Инициализировать read_buffer в зависимости от источника данных. По умолчанию не делает ничего. + /// Initialize read_buffer, depending on the data source. By default, does nothing. virtual void initReadBuffer() {}; - /// Инициализировать sample_block по структуре таблицы сохраненной в structure + /// Initialize sample_block according to the structure of the table stored in the `structure` virtual void initSampleBlock(const Context & context) { const DataTypeFactory & data_type_factory = DataTypeFactory::instance(); @@ -60,7 +60,7 @@ public: } } - /// Получить данные таблицы - пару (поток с содержимым таблицы, имя таблицы) + /// Get the table data - a pair (a thread with the contents of the table, the name of the table) virtual ExternalTableData getData(const Context & context) { initReadBuffer(); @@ -71,7 +71,7 @@ public: } protected: - /// Очистить всю накопленную информацию + /// Clear all accumulated information void clean() { name = ""; @@ -82,7 +82,7 @@ protected: read_buffer.reset(); } - /// Функция для отладочного вывода информации + /// Function for debugging information output void write() { std::cerr << "file " << file << std::endl; @@ -100,7 +100,7 @@ protected: return res; } - /// Построить вектор structure по текстовому полю structure + /// Construct the `structure` vector from the text field `structure` virtual void parseStructureFromStructureField(const 
std::string & argument) { std::vector vals = split(argument, " ,"); @@ -112,7 +112,7 @@ protected: structure.emplace_back(vals[i], vals[i + 1]); } - /// Построить вектор structure по текстовому полю types + /// Construct the `structure` vector from the text field `types` virtual void parseStructureFromTypesField(const std::string & argument) { std::vector vals = split(argument, " ,"); @@ -123,7 +123,7 @@ protected: }; -/// Парсинг внешей таблицы, используемый в tcp клиенте. +/// Parsing of external table used in the tcp client. class ExternalTable : public BaseExternalTable { public: @@ -135,7 +135,7 @@ public: read_buffer = std::make_unique(file); } - /// Извлечение параметров из variables_map, которая строится по командной строке клиента + /// Extract parameters from variables_map, which is built on the client command line ExternalTable(const boost::program_options::variables_map & external_options) { if (external_options.count("file")) @@ -162,9 +162,9 @@ public: } }; -/// Парсинг внешей таблицы, используемый при отправке таблиц через http -/// Функция handlePart будет вызываться для каждой переданной таблицы, -/// поэтому так же необходимо вызывать clean в конце handlePart. +/// Parsing of external table used when sending tables via http +/// The `handlePart` function will be called for each table passed, + /// so it's also necessary to call `clean` at the end of the `handlePart`. 
class ExternalTablesHandler : public Poco::Net::PartHandler, BaseExternalTable { public: @@ -174,15 +174,15 @@ public: void handlePart(const Poco::Net::MessageHeader & header, std::istream & stream) { - /// Буфер инициализируется здесь, а не в виртуальной функции initReadBuffer + /// The buffer is initialized here, not in the virtual function initReadBuffer read_buffer = std::make_unique(stream); - /// Извлекаем коллекцию параметров из MessageHeader + /// Retrieve a collection of parameters from MessageHeader Poco::Net::NameValueCollection content; std::string label; Poco::Net::MessageHeader::splitParameters(header.get("Content-Disposition"), label, content); - /// Получаем параметры + /// Get parameters name = content.get("name", "_data"); format = params.get(name + "_format", "TabSeparated"); @@ -195,13 +195,13 @@ public: ExternalTableData data = getData(context); - /// Создаем таблицу + /// Create table NamesAndTypesListPtr columns = std::make_shared(sample_block.getColumnsList()); StoragePtr storage = StorageMemory::create(data.second, columns); context.addExternalTable(data.second, storage); BlockOutputStreamPtr output = storage->write(ASTPtr(), context.getSettingsRef()); - /// Записываем данные + /// Write data data.first->readPrefix(); output->writePrefix(); while(Block block = data.first->read()) @@ -210,7 +210,7 @@ public: output->writeSuffix(); names.push_back(name); - /// Подготавливаемся к приему следующего файла, для этого очищаем всю полученную информацию + /// We are ready to receive the next file, for this we clear all the information received clean(); } diff --git a/dbms/src/Common/FileChecker.cpp b/dbms/src/Common/FileChecker.cpp index fb5a12c5bf7..66e315fd754 100644 --- a/dbms/src/Common/FileChecker.cpp +++ b/dbms/src/Common/FileChecker.cpp @@ -136,7 +136,7 @@ void FileChecker::load(Map & map) const ReadBufferFromFile in(files_info_path); WriteBufferFromString out(content); - /// The JSON library does not support whitespace. We delete them. 
Ineffective. + /// The JSON library does not support whitespace. We delete them. Inefficient. while (!in.eof()) { char c; diff --git a/dbms/src/Common/FileChecker.h b/dbms/src/Common/FileChecker.h index aa1cb601125..4b44b7ede7b 100644 --- a/dbms/src/Common/FileChecker.h +++ b/dbms/src/Common/FileChecker.h @@ -8,11 +8,11 @@ namespace DB { -/// хранит размеры всех столбцов, и может проверять не побились ли столбцы +/// stores the sizes of all columns, and can check whether the columns are corrupted class FileChecker { private: - /// Имя файла -> размер. + /// File name -> size. using Map = std::map; public: @@ -23,7 +23,7 @@ public: void update(const Poco::File & file); void update(const Files::const_iterator & begin, const Files::const_iterator & end); - /// Проверяем файлы, параметры которых указаны в sizes.json + /// Check the files whose parameters are specified in sizes.json bool check() const; private: @@ -35,7 +35,7 @@ private: std::string files_info_path; std::string tmp_files_info_path; - /// Данные из файла читаются лениво. + /// The data from the file is read lazily. Map map; bool initialized = false; diff --git a/dbms/src/Common/HashTable/ClearableHashSet.h b/dbms/src/Common/HashTable/ClearableHashSet.h index e0939d43071..4979e7ab03c 100644 --- a/dbms/src/Common/HashTable/ClearableHashSet.h +++ b/dbms/src/Common/HashTable/ClearableHashSet.h @@ -4,12 +4,12 @@ #include -/** Хеш-таблица, позволяющая очищать таблицу за O(1). - * Еще более простая, чем HashSet: Key и Mapped должны быть POD-типами. +/** A hash table that allows you to clear the table in O(1). + * Even simpler than HashSet: Key and Mapped must be POD-types. * - * Вместо этого класса можно было бы просто использовать в HashSet в качестве ключа пару <версия, ключ>, - * но тогда таблица накапливала бы все ключи, которые в нее когда-либо складывали, и неоправданно росла. - * Этот класс идет на шаг дальше и считает ключи со старой версией пустыми местами в хеш-таблице. 
+ * Instead of this class, you could just use the pair (version, key) in the HashSet as the key + * but then the table would accumulate all the keys that it ever stored, and it was unreasonably growing. + * This class goes a step further and considers the keys with the old version empty in the hash table. */ @@ -17,11 +17,11 @@ struct ClearableHashSetState { UInt32 version = 1; - /// Сериализация, в бинарном и текстовом виде. + /// Serialization, in binary and text form. void write(DB::WriteBuffer & wb) const { DB::writeBinary(version, wb); } void writeText(DB::WriteBuffer & wb) const { DB::writeText(version, wb); } - /// Десериализация, в бинарном и текстовом виде. + /// Deserialization, in binary and text form. void read(DB::ReadBuffer & rb) { DB::readBinary(version, rb); } void readText(DB::ReadBuffer & rb) { DB::readText(version, rb); } }; @@ -38,10 +38,10 @@ struct ClearableHashTableCell : public BaseCell bool isZero(const State & state) const { return version != state.version; } static bool isZero(const Key & key, const State & state) { return false; } - /// Установить значение ключа в ноль. + /// Set the key value to zero. void setZero() { version = 0; } - /// Нужно ли хранить нулевой ключ отдельно (то есть, могут ли в хэш-таблицу вставить нулевой ключ). + /// Do I need to store the zero key separately (that is, can a zero key be inserted into the hash table). static constexpr bool need_zero_value_storage = false; ClearableHashTableCell() {} diff --git a/dbms/src/Common/HashTable/Hash.h b/dbms/src/Common/HashTable/Hash.h index ed3bc228b65..a9517e3e5e1 100644 --- a/dbms/src/Common/HashTable/Hash.h +++ b/dbms/src/Common/HashTable/Hash.h @@ -3,12 +3,19 @@ #include -/** Хэш функции, которые лучше чем тривиальная функция std::hash. - * (при агрегации по идентификатору посетителя, прирост производительности более чем в 5 раз) +/** Hash functions that are better than the trivial function std::hash. 
+ * + * Example: when we do aggregation by the visitor ID, the performance increase is more than 5 times. + * This is because of following reasons: + * - in Yandex, visitor identifier is an integer that has timestamp with seconds resolution in lower bits; + * - in typical implementation of standard library, hash function for integers is trivial and just uses lower bits; + * - traffic is non-uniformly distributed across a day; + * - we are using open-addressing linear probing hash tables that are most critical to hash function quality, + * and trivial hash function gives disastrous results. */ -/** Взято из MurmurHash. - * Быстрее, чем intHash32 при вставке в хэш-таблицу UInt64 -> UInt64, где ключ - идентификатор посетителя. +/** Taken from MurmurHash. This is Murmur finalizer. + * Faster than intHash32 when inserting into the hash table UInt64 -> UInt64, where the key is the visitor ID. */ inline DB::UInt64 intHash64(DB::UInt64 x) { @@ -21,21 +28,22 @@ inline DB::UInt64 intHash64(DB::UInt64 x) return x; } -/** CRC32C является не очень качественной в роли хэш функции, - * согласно avalanche и bit independence тестам, а также малым количеством бит, - * но может вести себя хорошо при использовании в хэш-таблицах, - * за счёт высокой скорости (latency 3 + 1 такт, througput 1 такт). - * Работает только при поддержке SSE 4.2. - * Используется asm вместо интринсика, чтобы не обязательно было собирать весь проект с -msse4. +/** CRC32C is not very high-quality as a hash function, + * according to avalanche and bit independence tests (see SMHasher software), as well as a small number of bits, + * but can behave well when used in hash tables, + * due to high speed (latency 3 + 1 clock cycle, throughput 1 clock cycle). + * Works only with SSE 4.2 support.
*/ +#if __SSE4_2__ +#include +#endif + inline DB::UInt64 intHashCRC32(DB::UInt64 x) { -#if defined(__x86_64__) - DB::UInt64 crc = -1ULL; - asm("crc32q %[x], %[crc]\n" : [crc] "+r" (crc) : [x] "rm" (x)); - return crc; +#if __SSE4_2__ + return _mm_crc32_u64(-1ULL, x); #else - /// На других платформах используем не обязательно CRC32. NOTE Это может сбить с толку. + /// On other platforms we do not have CRC32. NOTE This can be confusing. return intHash64(x); #endif } @@ -117,7 +125,7 @@ DEFINE_HASH(DB::Float64) #undef DEFINE_HASH -/// Разумно использовать для UInt8, UInt16 при достаточном размере хэш-таблицы. +/// It is reasonable to use for UInt8, UInt16 with sufficient hash table size. struct TrivialHash { template @@ -128,17 +136,22 @@ struct TrivialHash }; -/** Сравнительно неплохая некриптографическая хэш функция из UInt64 в UInt32. - * Но хуже (и по качеству и по скорости), чем просто срезка intHash64. - * Взята отсюда: http://www.concentric.net/~ttwang/tech/inthash.htm +/** A relatively good non-cryptographic hash function from UInt64 to UInt32. + * But worse (both in quality and speed) than just cutting intHash64. + * Taken from here: http://www.concentric.net/~ttwang/tech/inthash.htm * - * Немного изменена по сравнению с функцией по ссылке: сдвиги вправо случайно заменены на цикличесвие сдвиги вправо. - * Это изменение никак не повлияло на результаты тестов smhasher. + * Slightly changed compared to the function by link: shifts to the right are accidentally replaced by a cyclic shift to the right. + * This change did not affect the smhasher test results. * - * Рекомендуется для разных задач использовать разные salt. - * А то был случай, что в БД значения сортировались по хэшу (для некачественного псевдослучайного разбрасывания), - * а в другом месте, в агрегатной функции, в хэш таблице использовался такой же хэш, - * в результате чего, эта агрегатная функция чудовищно тормозила из-за коллизий. + * It is recommended to use different salt for different tasks. 
+ * That was the case that in the database values were sorted by hash (for low-quality pseudo-random spread), + * and in another place, in the aggregate function, the same hash was used in the hash table, + * as a result, this aggregate function was monstrously slowed due to collisions. + * + * NOTE Salting is far from perfect, because it commutes with first steps of calculation. + * + * NOTE As mentioned, this function is slower than intHash64. + * But occasionally, it is faster, when written in a loop and loop is vectorized. */ template inline DB::UInt32 intHash32(DB::UInt64 key) @@ -156,7 +169,7 @@ inline DB::UInt32 intHash32(DB::UInt64 key) } -/// Для контейнеров. +/// For containers. template struct IntHash32 { diff --git a/dbms/src/Common/HashTable/HashMap.h b/dbms/src/Common/HashTable/HashMap.h index 438022f7c02..9068807e21f 100644 --- a/dbms/src/Common/HashTable/HashMap.h +++ b/dbms/src/Common/HashTable/HashMap.h @@ -13,7 +13,7 @@ struct NoInitTag {}; -/// Пара, которая не инициализирует элементы, если не нужно. +/// A pair that does not initialize the elements, if not needed. template struct PairNoInit { @@ -60,18 +60,18 @@ struct HashMapCell bool isZero(const State & state) const { return isZero(value.first, state); } static bool isZero(const Key & key, const State & state) { return ZeroTraits::check(key); } - /// Установить значение ключа в ноль. + /// Set the key value to zero. void setZero() { ZeroTraits::set(value.first); } - /// Нужно ли хранить нулевой ключ отдельно (то есть, могут ли в хэш-таблицу вставить нулевой ключ). + /// Do I need to store the zero key separately (that is, can a zero key be inserted into the hash table). static constexpr bool need_zero_value_storage = true; - /// Является ли ячейка удалённой. + /// Whether the cell was deleted. bool isDeleted() const { return false; } void setMapped(const value_type & value_) { value.second = value_.second; } - /// Сериализация, в бинарном и текстовом виде.
+ /// Serialization, in binary and text form. void write(DB::WriteBuffer & wb) const { DB::writeBinary(value.first, wb); @@ -85,7 +85,7 @@ struct HashMapCell DB::writeDoubleQuoted(value.second, wb); } - /// Десериализация, в бинарном и текстовом виде. + /// Deserialization, in binary and text form. void read(DB::ReadBuffer & rb) { DB::readBinary(value.first, rb); @@ -141,19 +141,19 @@ public: bool inserted; this->emplace(x, it, inserted); - /** Может показаться, что инициализация не обязательна для POD-типов (или __has_trivial_constructor), - * так как кусок памяти для хэш-таблицы изначально инициализирован нулями. - * Но, на самом деле, пустая ячейка может быть не инициализирована нулями в следующих случаях: - * - ZeroValueStorage (в нём зануляется только ключ); - * - после ресайза и переноса части ячеек в новую половину хэш-таблицы, у старых ячеек, тоже зануляется только ключ. + /** It may seem that initialization is not necessary for POD-types (or __has_trivial_constructor), + * since the hash table memory is initially initialized with zeros. + * But, in fact, an empty cell may not be initialized with zeros in the following cases: + * - ZeroValueStorage (it only zeros the key); + * - after resizing and moving a part of the cells to the new half of the hash table, the old cells also have only the key to zero. * - * По производительности, разницы почти всегда нет, за счёт того, что it->second как правило присваивается сразу - * после вызова operator[], и так как operator[] инлайнится, компилятор убирает лишнюю инициализацию. + * On performance, there is almost always no difference, due to the fact that it->second is usually assigned immediately + * after calling `operator[]`, and since `operator[]` is inlined, the compiler removes unnecessary initialization. * - * Иногда из-за инициализации, производительность даже растёт. Это происходит в коде вида ++map[key]. - * Когда мы делаем инициализацию, то для новых ячеек, достаточно сразу сделать store 1. 
- * А если бы мы не делали инициализацию, то не смотря на то, что в ячейке был ноль, - * компилятор не может об этом догадаться, и генерирует код load, increment, store. + * Sometimes due to initialization, the performance even grows. This occurs in code like `++map[key]`. + * When we do the initialization, for new cells, it's enough to make `store 1` right away. + * And if we did not initialize, then even though there was zero in the cell, + * the compiler can not guess about this, and generates the `load`, `increment`, `store` code. */ if (inserted) new(&it->second) mapped_type(); diff --git a/dbms/src/Common/HashTable/HashTable.h b/dbms/src/Common/HashTable/HashTable.h index 50ffb4058b2..d505bf2c94c 100644 --- a/dbms/src/Common/HashTable/HashTable.h +++ b/dbms/src/Common/HashTable/HashTable.h @@ -44,27 +44,27 @@ namespace ErrorCodes } -/** Состояние хэш-таблицы, которое влияет на свойства её ячеек. - * Используется в качестве параметра шаблона. - * Например, существует реализация мгновенно-очищаемой хэш-таблицы - ClearableHashMap. - * Для неё, в каждой ячейке хранится номер версии, и в самой хэш-таблице - текущая версия. - * При очистке, просто увеличивается текущая версия; все ячейки с несовпадающей версией считаются пустыми. - * Другой пример: для приближённого рассчёта количества уникальных посетителей, есть хэш-таблица UniquesHashSet. - * В ней имеется понятие "степень". При каждом переполнении, ячейки с ключами, не делящимися на соответствующую степень двух, удаляются. +/** The state of the hash table that affects the properties of its cells. + * Used as a template parameter. + * For example, there is an implementation of an instantly clearable hash table - ClearableHashMap. + * For it, each cell holds the version number, and in the hash table itself is the current version. + * When clearing, the current version simply increases; All cells with a mismatching version are considered empty. 
+ * Another example: for an approximate calculation of the number of unique visitors, there is a hash table for UniquesHashSet. + * It has the concept of "degree". At each overflow, cells with keys that do not divide by the corresponding power of the two are deleted. */ struct HashTableNoState { - /// Сериализация, в бинарном и текстовом виде. + /// Serialization, in binary and text form. void write(DB::WriteBuffer & wb) const {} void writeText(DB::WriteBuffer & wb) const {} - /// Десериализация, в бинарном и текстовом виде. + /// Deserialization, in binary and text form. void read(DB::ReadBuffer & rb) {} void readText(DB::ReadBuffer & rb) {} }; -/// Эти функции могут быть перегружены для пользовательских типов. +/// These functions can be overloaded for custom types. namespace ZeroTraits { @@ -77,11 +77,11 @@ void set(T & x) { x = 0; } }; -/** Compile-time интерфейс ячейки хэш-таблицы. - * Разные ячейки используются для реализации разных хэш-таблиц. - * Ячейка должна содержать ключ. - * Также может содержать значение и произвольные дополнительные данные - * (пример: запомненное значение хэш-функции; номер версии для ClearableHashMap). +/** Compile-time interface for cell of the hash table. + * Different cell types are used to implement different hash tables. + * The cell must contain a key. + * It can also contain a value and arbitrary additional data + * (example: the stored hash value; version number for ClearableHashMap). */ template struct HashTableCell @@ -93,89 +93,89 @@ struct HashTableCell HashTableCell() {} - /// Создать ячейку с заданным ключём / ключём и значением. + /// Create a cell with the given key / key and value. HashTableCell(const Key & key_, const State & state) : key(key_) {} -/// HashTableCell(const value_type & value_, const State & state) : key(value_) {} +/// HashTableCell(const value_type & value_, const State & state) : key(value_) {} - /// Получить то, что будет value_type контейнера. 
+ /// Get what the value_type of the container will be. value_type & getValue() { return key; } const value_type & getValue() const { return key; } - /// Получить ключ. + /// Get the key. static Key & getKey(value_type & value) { return value; } static const Key & getKey(const value_type & value) { return value; } - /// Равны ли ключи у ячеек. + /// Are the keys at the cells equal? bool keyEquals(const Key & key_) const { return key == key_; } bool keyEquals(const Key & key_, size_t hash_) const { return key == key_; } - /// Если ячейка умеет запоминать в себе значение хэш-функции, то запомнить его. + /// If the cell can remember the value of the hash function, then remember it. void setHash(size_t hash_value) {} - /// Если ячейка умеет запоминать в себе значение хэш-функции, то вернуть запомненное значение. - /// Оно должно быть хотя бы один раз вычислено до этого. - /// Если запоминание значения хэш-функции не предусмотрено, то просто вычислить хэш. + /// If the cell can store the hash value in itself, then return the stored value. + /// It must be at least once calculated before. + /// If storing the hash value is not provided, then just compute the hash. size_t getHash(const Hash & hash) const { return hash(key); } - /// Является ли ключ нулевым. В основном буфере, ячейки с нулевым ключём, считаются пустыми. - /// Если нулевые ключи могут быть вставлены в таблицу, то ячейка для нулевого ключа хранится отдельно, не в основном буфере. - /// Нулевые ключи должны быть такими, что занулённый кусок памяти представляет собой нулевой ключ. + /// Whether the key is zero. In the main buffer, cells with a zero key are considered empty. + /// If zero keys can be inserted into the table, then the cell for the zero key is stored separately, not in the main buffer. + /// Zero keys must be such that the zeroed-down piece of memory is a zero key. 
bool isZero(const State & state) const { return isZero(key, state); } static bool isZero(const Key & key, const State & state) { return ZeroTraits::check(key); } - /// Установить значение ключа в ноль. + /// Set the key value to zero. void setZero() { ZeroTraits::set(key); } - /// Нужно ли хранить нулевой ключ отдельно (то есть, могут ли в хэш-таблицу вставить нулевой ключ). + /// Do the hash table need to store the zero key separately (that is, can a zero key be inserted into the hash table). static constexpr bool need_zero_value_storage = true; - /// Является ли ячейка удалённой. + /// Whether the cell is deleted. bool isDeleted() const { return false; } - /// Установить отображаемое значение, если есть (для HashMap), в соответствующиее из value. + /// Set the mapped value, if any (for HashMap), to the corresponding `value`. void setMapped(const value_type & value) {} - /// Сериализация, в бинарном и текстовом виде. + /// Serialization, in binary and text form. void write(DB::WriteBuffer & wb) const { DB::writeBinary(key, wb); } void writeText(DB::WriteBuffer & wb) const { DB::writeDoubleQuoted(key, wb); } - /// Десериализация, в бинарном и текстовом виде. + /// Deserialization, in binary and text form. void read(DB::ReadBuffer & rb) { DB::readBinary(key, rb); } void readText(DB::ReadBuffer & rb) { DB::writeDoubleQuoted(key, rb); } }; -/** Определяет размер хэш-таблицы, а также когда и во сколько раз её надо ресайзить. +/** Determines the size of the hash table, and when and how much it should be resized. */ template struct HashTableGrower { - /// Состояние этой структуры достаточно, чтобы получить размер буфера хэш-таблицы. + /// The state of this structure is enough to get the buffer size of the hash table. UInt8 size_degree = initial_size_degree; - /// Размер хэш-таблицы в ячейках. + /// The size of the hash table in the cells. 
size_t bufSize() const { return 1 << size_degree; } size_t maxFill() const { return 1 << (size_degree - 1); } size_t mask() const { return bufSize() - 1; } - /// Из значения хэш-функции получить номер ячейки в хэш-таблице. + /// From the hash value, get the cell number in the hash table. size_t place(size_t x) const { return x & mask(); } - /// Следующая ячейка в цепочке разрешения коллизий. + /// The next cell in the collision resolution chain. size_t next(size_t pos) const { ++pos; return pos & mask(); } - /// Является ли хэш-таблица достаточно заполненной. Нужно увеличить размер хэш-таблицы, или удалить из неё что-нибудь ненужное. + /// Whether the hash table is sufficiently full. You need to increase the size of the hash table, or remove something unnecessary from it. bool overflow(size_t elems) const { return elems > maxFill(); } - /// Увеличить размер хэш-таблицы. + /// Increase the size of the hash table. void increaseSize() { size_degree += size_degree >= 23 ? 1 : 2; } - /// Установить размер буфера по количеству элементов хэш-таблицы. Используется при десериализации хэш-таблицы. + /// Set the buffer size by the number of elements in the hash table. Used when deserializing a hash table. void set(size_t num_elems) { size_degree = num_elems <= 1 @@ -192,17 +192,17 @@ struct HashTableGrower }; -/** При использовании в качестве Grower-а, превращает хэш-таблицу в что-то типа lookup-таблицы. - * Остаётся неоптимальность - в ячейках хранятся ключи. - * Также компилятору не удаётся полностью удалить код хождения по цепочке разрешения коллизий, хотя он не нужен. - * TODO Сделать полноценную lookup-таблицу. +/** When used as a Grower, it turns a hash table into something like a lookup table. + * It remains non-optimal - the cells store the keys. + * Also, the compiler can not completely remove the code of passing through the collision resolution chain, although it is not needed. + * TODO Make a proper lookup table. 
*/ template struct HashTableFixedGrower { size_t bufSize() const { return 1 << key_bits; } size_t place(size_t x) const { return x; } - /// Тут можно было бы написать __builtin_unreachable(), но компилятор не до конца всё оптимизирует, и получается менее эффективно. + /// You could write __builtin_unreachable(), but the compiler does not optimize everything, and it turns out less efficiently. size_t next(size_t pos) const { return pos + 1; } bool overflow(size_t elems) const { return false; } @@ -212,7 +212,7 @@ struct HashTableFixedGrower }; -/** Если нужно хранить нулевой ключ отдельно - место для его хранения. */ +/** If you want to store the zero key separately - a place to store it. */ template struct ZeroValueStorage; @@ -271,15 +271,15 @@ protected: using Self = HashTable; using cell_type = Cell; - size_t m_size = 0; /// Количество элементов - Cell * buf; /// Кусок памяти для всех элементов кроме элемента с ключём 0. + size_t m_size = 0; /// Amount of elements + Cell * buf; /// A piece of memory for all elements except the element with zero key. Grower grower; #ifdef DBMS_HASH_MAP_COUNT_COLLISIONS mutable size_t collisions = 0; #endif - /// Найти ячейку с тем же ключём или пустую ячейку, начиная с заданного места и далее по цепочке разрешения коллизий. + /// Find a cell with the same key or an empty cell, starting from the specified position and further along the collision resolution chain. size_t ALWAYS_INLINE findCell(const Key & x, size_t hash_value, size_t place_value) const { while (!buf[place_value].isZero(*this) && !buf[place_value].keyEquals(x, hash_value)) @@ -293,7 +293,7 @@ protected: return place_value; } - /// Найти пустую ячейку, начиная с заданного места и далее по цепочке разрешения коллизий. + /// Find an empty cell, starting with the specified position and further along the collision resolution chain. 
size_t ALWAYS_INLINE findEmptyCell(const Key & x, size_t hash_value, size_t place_value) const { while (!buf[place_value].isZero(*this)) @@ -323,7 +323,7 @@ protected: } - /// Увеличить размер буфера. + /// Increase the size of the buffer. void resize(size_t for_num_elems = 0, size_t for_buf_size = 0) { #ifdef DBMS_HASH_MAP_DEBUG_RESIZES @@ -332,10 +332,10 @@ protected: size_t old_size = grower.bufSize(); - /** Чтобы в случае исключения, объект остался в корректном состоянии, - * изменение переменной grower (определяющией размер буфера хэш-таблицы) - * откладываем на момент после реального изменения буфера. - * Временная переменная new_grower используется, чтобы определить новый размер. + /** In case of exception for the object to remain in the correct state, + * changing the variable `grower` (which determines the buffer size of the hash table) + * is postponed for a moment after a real buffer change. + * The temporary variable `new_grower` is used to determine the new size. */ Grower new_grower = grower; @@ -354,29 +354,29 @@ protected: else new_grower.increaseSize(); - /// Расширим пространство. + /// Expand the space. buf = reinterpret_cast(Allocator::realloc(buf, getBufferSizeInBytes(), new_grower.bufSize() * sizeof(Cell))); grower = new_grower; - /** Теперь некоторые элементы может потребоваться переместить на новое место. - * Элемент может остаться на месте, или переместиться в новое место "справа", - * или переместиться левее по цепочке разрешения коллизий, из-за того, что элементы левее него были перемещены в новое место "справа". + /** Now some items may need to be moved to a new location. + * The element can stay in place, or move to a new location "on the right", + * or move to the left of the collision resolution chain, because the elements to the left of it have been moved to the new "right" location. 
*/ size_t i = 0; for (; i < old_size; ++i) if (!buf[i].isZero(*this) && !buf[i].isDeleted()) - reinsert(buf[i]); + reinsert(buf[i], buf[i].getHash(*this)); - /** Также имеется особый случай: - * если элемент должен был быть в конце старого буфера, [ x] - * но находится в начале из-за цепочки разрешения коллизий, [o x] - * то после ресайза, он сначала снова окажется не на своём месте, [ xo ] - * и для того, чтобы перенести его куда надо, - * надо будет после переноса всех элементов из старой половинки [ o x ] - * обработать ещё хвостик из цепочки разрешения коллизий сразу после неё [ o x ] + /** There is also a special case: + * if the element was to be at the end of the old buffer, [ x] + * but is at the beginning because of the collision resolution chain, [o x] + * then after resizing, it will first be out of place again, [ xo ] + * and in order to transfer it where necessary, + * after transferring all the elements from the old halves you need to [ o x ] + * process tail from the collision resolution chain immediately after it [ o x ] */ for (; !buf[i].isZero(*this) && !buf[i].isDeleted(); ++i) - reinsert(buf[i]); + reinsert(buf[i], buf[i].getHash(*this)); #ifdef DBMS_HASH_MAP_DEBUG_RESIZES watch.stop(); @@ -387,30 +387,30 @@ protected: } - /** Вставить в новый буфер значение, которое было в старом буфере. - * Используется при увеличении размера буфера. + /** Paste into the new buffer the value that was in the old buffer. + * Used when increasing the buffer size. */ - void reinsert(Cell & x) + void reinsert(Cell & x, size_t hash_value) { - size_t hash_value = x.getHash(*this); size_t place_value = grower.place(hash_value); - /// Если элемент на своём месте. + /// If the element is in its place. if (&x == &buf[place_value]) return; - /// Вычисление нового места, с учётом цепочки разрешения коллизий. + /// Compute a new location, taking into account the collision resolution chain. 
place_value = findCell(Cell::getKey(x.getValue()), hash_value, place_value); - /// Если элемент остался на своём месте в старой цепочке разрешения коллизий. + /// If the item remains in its place in the old collision resolution chain. if (!buf[place_value].isZero(*this)) return; - /// Копирование на новое место и зануление старого. + /// Copy to a new location and zero the old one. + x.setHash(hash_value); memcpy(&buf[place_value], &x, sizeof(x)); x.setZero(); - /// Потом на старое место могут переместиться элементы, которые раньше были в коллизии с этим. + /// Then the elements that previously were in collision with this can move to the old place. } @@ -611,10 +611,10 @@ protected: iterator iteratorToZero() { return iteratorTo(this->zeroValue()); } - /// Если ключ нулевой - вставить его в специальное место и вернуть true. - bool ALWAYS_INLINE emplaceIfZero(Key x, iterator & it, bool & inserted) + /// If the key is zero, insert it into a special place and return true. + bool ALWAYS_INLINE emplaceIfZero(Key x, iterator & it, bool & inserted, size_t hash_value) { - /// Если утверждается, что нулевой ключ не могут вставить в таблицу. + /// If it is claimed that the zero key can not be inserted into the table. if (!Cell::need_zero_value_storage) return false; @@ -625,7 +625,7 @@ protected: { ++m_size; this->setHasZero(); - it.ptr->setHash(hash(x)); + it.ptr->setHash(hash_value); inserted = true; } else @@ -638,7 +638,7 @@ protected: } - /// Только для ненулевых ключей. Найти нужное место, вставить туда ключ, если его ещё нет, вернуть итератор на ячейку. + /// Only for non-zero keys. Find the right place, insert the key there, if it does not already exist. Set iterator to the cell in output parameter. void ALWAYS_INLINE emplaceNonZero(Key x, iterator & it, bool & inserted, size_t hash_value) { size_t place_value = findCell(x, hash_value, grower.place(hash_value)); @@ -664,9 +664,9 @@ protected: } catch (...) { - /** Если этого не делать, то будут проблемы. 
- * Ведь останется ключ, но неинициализированное mapped-значение, - * у которого, возможно, даже нельзя вызвать деструктор. + /** If we have not resized successfully, then there will be problems. + * A key would remain, but with an uninitialized mapped-value, + * for which, perhaps, the destructor cannot even be called. */ --m_size; buf[place_value].setZero(); @@ -679,13 +679,14 @@ protected: public: - /// Вставить значение. В случае хоть сколько-нибудь сложных значений, лучше используйте функцию emplace. + /// Insert a value. In the case of any more complex values, it is better to use the `emplace` function. std::pair ALWAYS_INLINE insert(const value_type & x) { std::pair res; - if (!emplaceIfZero(Cell::getKey(x), res.first, res.second)) - emplaceNonZero(Cell::getKey(x), res.first, res.second, hash(Cell::getKey(x))); + size_t hash_value = hash(Cell::getKey(x)); + if (!emplaceIfZero(Cell::getKey(x), res.first, res.second, hash_value)) + emplaceNonZero(Cell::getKey(x), res.first, res.second, hash_value); if (res.second) res.first.ptr->setMapped(x); @@ -694,14 +695,21 @@ public: } - /** Вставить ключ, - * вернуть итератор на позицию, которую можно использовать для placement new значения, - * а также флаг - был ли вставлен новый ключ. + /// Reinsert node pointed to by iterator + void ALWAYS_INLINE reinsert(iterator & it, size_t hash_value) + { + reinsert(*it.getPtr(), hash_value); + } + + + /** Insert the key, + * return an iterator to a position that can be used for `placement new` of value, + * as well as the flag - whether a new key was inserted. * - * Вы обязаны сделать placement new значения, если был вставлен новый ключ, - * так как при уничтожении хэш-таблицы для него будет вызываться деструктор! + * You have to make `placement new` of value if you inserted a new key, + * since when destroying a hash table, it will call the destructor!
* - * Пример использования: + * Example usage: * * Map::iterator it; * bool inserted; @@ -711,20 +719,21 @@ public: */ void ALWAYS_INLINE emplace(Key x, iterator & it, bool & inserted) { - if (!emplaceIfZero(x, it, inserted)) - emplaceNonZero(x, it, inserted, hash(x)); - } - - - /// То же самое, но с заранее вычисленным значением хэш-функции. - void ALWAYS_INLINE emplace(Key x, iterator & it, bool & inserted, size_t hash_value) - { - if (!emplaceIfZero(x, it, inserted)) + size_t hash_value = hash(x); + if (!emplaceIfZero(x, it, inserted, hash_value)) emplaceNonZero(x, it, inserted, hash_value); } - /// Скопировать ячейку из другой хэш-таблицы. Предполагается, что ячейка не нулевая, а также, что такого ключа в таблице ещё не было. + /// Same, but with a precalculated value of hash function. + void ALWAYS_INLINE emplace(Key x, iterator & it, bool & inserted, size_t hash_value) + { + if (!emplaceIfZero(x, it, inserted, hash_value)) + emplaceNonZero(x, it, inserted, hash_value); + } + + + /// Copy the cell from another hash table. It is assumed that the cell is not zero, and also that there was no such key in the table yet. void ALWAYS_INLINE insertUniqueNonZero(const Cell * cell, size_t hash_value) { size_t place_value = findEmptyCell(cell->getKey(cell->getValue()), hash_value, grower.place(hash_value)); @@ -903,8 +912,8 @@ public: memset(buf, 0, grower.bufSize() * sizeof(*buf)); } - /// После выполнения этой функции, таблицу можно только уничтожить, - /// а также можно использовать методы size, empty, begin, end. + /// After executing this function, the table can only be destroyed, + /// and also you can use the methods `size`, `empty`, `begin`, `end`. 
void clearAndShrink() { destroyElements(); diff --git a/dbms/src/Common/HashTable/SmallTable.h b/dbms/src/Common/HashTable/SmallTable.h index 413adac0018..823bc93bf5d 100644 --- a/dbms/src/Common/HashTable/SmallTable.h +++ b/dbms/src/Common/HashTable/SmallTable.h @@ -3,15 +3,15 @@ #include -/** Замена хэш-таблицы для маленького количества (единицы) ключей. - * Реализована в виде массива с линейным поиском. - * Массив расположен внутри объекта. - * Интерфейс является подмножеством интерфейса HashTable. +/** Replacement of the hash table for a small number (<10) of keys. + * Implemented as an array with linear search. + * The array is located inside the object. + * The interface is a subset of the HashTable interface. * - * Вставка возможна только если метод full возвращает false. - * При неизвестном количестве различных ключей, - * вы должны проверять, не заполнена ли таблица, - * и делать fallback в этом случае (например, использовать полноценную хэш-таблицу). + * Insert is possible only if the `full` method returns false. + * With an unknown number of different keys, + * you should check if the table is not full, + * and do a `fallback` in this case (for example, use a real hash table). */ template @@ -32,11 +32,11 @@ protected: using Self = SmallTable; using cell_type = Cell; - size_t m_size = 0; /// Количество элементов. - Cell buf[capacity]; /// Кусок памяти для всех элементов. + size_t m_size = 0; /// Amount of elements. + Cell buf[capacity]; /// A piece of memory for all elements. - /// Найти ячейку с тем же ключём или пустую ячейку, начиная с заданного места и далее по цепочке разрешения коллизий. + /// Find a cell with the same key or an empty cell, starting from the specified position and then by the collision resolution chain. const Cell * ALWAYS_INLINE findCell(const Key & x) const { const Cell * it = buf; @@ -188,8 +188,8 @@ protected: public: - /** Таблица переполнена. - * В переполненную таблицу ничего нельзя вставлять. + /** The table is full. 
+ * You can not insert anything into the full table. */ bool full() { @@ -197,7 +197,7 @@ public: } - /// Вставить значение. В случае хоть сколько-нибудь сложных значений, лучше используйте функцию emplace. + /// Insert the value. In the case of any more complex values, it is better to use the `emplace` function. std::pair ALWAYS_INLINE insert(const value_type & x) { std::pair res; @@ -211,14 +211,14 @@ public: } - /** Вставить ключ, - * вернуть итератор на позицию, которую можно использовать для placement new значения, - * а также флаг - был ли вставлен новый ключ. + /** Insert the key, + * return an iterator to a position that can be used for `placement new` of value, + * as well as the flag - whether a new key was inserted. * - * Вы обязаны сделать placement new значения, если был вставлен новый ключ, - * так как при уничтожении хэш-таблицы для него будет вызываться деструктор! + * You have to make `placement new` of value if you inserted a new key, + * since when destroying a hash table, a destructor will be called for it! * - * Пример использования: + * Example usage: * * Map::iterator it; * bool inserted; @@ -239,7 +239,7 @@ public: } - /// То же самое, но вернуть false, если переполнено. + /// Same, but return false if it's full. bool ALWAYS_INLINE tryEmplace(Key x, iterator & it, bool & inserted) { Cell * res = findCell(x); @@ -257,7 +257,7 @@ public: } - /// Скопировать ячейку из другой хэш-таблицы. Предполагается, что такого ключа в таблице ещё не было. + /// Copy the cell from another hash table. It is assumed that there was no such key in the table yet. void ALWAYS_INLINE insertUnique(const Cell * cell) { memcpy(&buf[m_size], cell, sizeof(*cell)); diff --git a/dbms/src/Common/HashTable/TwoLevelHashTable.h b/dbms/src/Common/HashTable/TwoLevelHashTable.h index 519b0db7110..6d4edf49fc7 100644 --- a/dbms/src/Common/HashTable/TwoLevelHashTable.h +++ b/dbms/src/Common/HashTable/TwoLevelHashTable.h @@ -3,21 +3,21 @@ #include -/** Двухуровневая хэш-таблица. 
- * Представляет собой 256 (или 1 << BITS_FOR_BUCKET) маленьких хэш-таблиц (bucket-ов первого уровня). - * Для определения, какую из них использовать, берётся один из байтов хэш-функции. +/** Two-level hash table. + * Represents 256 (or 1 << BITS_FOR_BUCKET) small hash tables (buckets of the first level). + * To determine which one to use, one of the bytes of the hash function is taken. * - * Обычно работает чуть-чуть медленнее простой хэш-таблицы. - * Тем не менее, обладает преимуществами в некоторых случаях: - * - если надо мерджить две хэш-таблицы вместе, то это можно легко распараллелить по bucket-ам; - * - лаг при ресайзах размазан, так как маленькие хэш-таблицы ресайзятся по-отдельности; - * - по идее, ресайзы кэш-локальны в большем диапазоне размеров. + * Usually works a little slower than a simple hash table. + * However, it has advantages in some cases: + * - if you need to merge two hash tables together, then you can easily parallelize it by buckets; + * - delay during resizes is amortized, since the small hash tables will be resized separately; + * - in theory, resizes are cache-local in a larger range of sizes. */ template struct TwoLevelHashTableGrower : public HashTableGrower { - /// Увеличить размер хэш-таблицы. + /// Increase the size of the hash table. void increaseSize() { this->size_degree += this->size_degree >= 15 ? 1 : 2; @@ -52,7 +52,7 @@ public: size_t hash(const Key & x) const { return Hash::operator()(x); } - /// NOTE Плохо для хэш-таблиц больше чем на 2^32 ячеек. + /// NOTE Bad for hash tables with more than 2^32 cells. static size_t getBucketFromHash(size_t hash_value) { return (hash_value >> (32 - BITS_FOR_BUCKET)) & MAX_BUCKET; } protected: @@ -89,13 +89,13 @@ public: TwoLevelHashTable() {} - /// Скопировать данные из другой (обычной) хэш-таблицы. У неё должна быть такая же хэш-функция. + /// Copy the data from another (normal) hash table. It should have the same hash function. 
template TwoLevelHashTable(const Source & src) { typename Source::const_iterator it = src.begin(); - /// Предполагается, что нулевой ключ (хранящийся отдельно) при итерировании идёт первым. + /// It is assumed that the zero key (stored separately) is first in iteration order. if (it != src.end() && it.getPtr()->isZero(src)) { insert(*it); @@ -205,7 +205,7 @@ public: iterator end() { return { this, MAX_BUCKET, impls[MAX_BUCKET].end() }; } - /// Вставить значение. В случае хоть сколько-нибудь сложных значений, лучше используйте функцию emplace. + /// Insert a value. In the case of any more complex values, it is better to use the `emplace` function. std::pair ALWAYS_INLINE insert(const value_type & x) { size_t hash_value = hash(Cell::getKey(x)); @@ -220,14 +220,14 @@ public: } - /** Вставить ключ, - * вернуть итератор на позицию, которую можно использовать для placement new значения, - * а также флаг - был ли вставлен новый ключ. + /** Insert the key, + * return an iterator to a position that can be used for `placement new` of value, + * as well as the flag - whether a new key was inserted. * - * Вы обязаны сделать placement new значения, если был вставлен новый ключ, - * так как при уничтожении хэш-таблицы для него будет вызываться деструктор! + * You have to make `placement new` of value if you inserted a new key, + * since when destroying a hash table, the destructor will be invoked for it! * - * Пример использования: + * Example usage: * * Map::iterator it; * bool inserted; @@ -242,7 +242,7 @@ public: } - /// То же самое, но с заранее вычисленным значением хэш-функции. + /// Same, but with a precalculated value of hash function.
void ALWAYS_INLINE emplace(Key x, iterator & it, bool & inserted, size_t hash_value) { size_t buck = getBucketFromHash(hash_value); diff --git a/dbms/src/Common/HyperLogLogBiasEstimator.h b/dbms/src/Common/HyperLogLogBiasEstimator.h index 75116de1ce9..82da2f66597 100644 --- a/dbms/src/Common/HyperLogLogBiasEstimator.h +++ b/dbms/src/Common/HyperLogLogBiasEstimator.h @@ -7,10 +7,10 @@ #include #include -/** Этот класс предоставляет способ, чтобы оценить погрешность результата применения алгоритма HyperLogLog. - * Эмирические наблюдения показывают, что большие погрешности возникают при E < 5 * 2^precision, где - * E - возвращаемое значение алгоритмом HyperLogLog, и precision - параметр точности HyperLogLog. - * См. "HyperLogLog in Practice: Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm". +/** This class provides a way to evaluate the error in the result of applying the HyperLogLog algorithm. + * Empirical observations show that large errors occur at E < 5 * 2^precision, where + * E is the return value of the HyperLogLog algorithm, and `precision` is the HyperLogLog precision parameter. + * See "HyperLogLog in Practice: Algorithmic Engineering of a State of the Art Cardinality Estimation Algorithm". * (S. Heule et al., Proceedings of the EDBT 2013 Conference). */ template @@ -22,14 +22,14 @@ public: return false; } - /// Предельное количество уникальных значений до которого должна примениться поправка - /// из алгоритма LinearCounting. + /// Maximum number of unique values to which the correction should apply + /// from the LinearCounting algorithm. static double getThreshold() { return BiasData::getThreshold(); } - /// Вернуть оценку погрешности. + /// Return the error estimate. static double getBias(double raw_estimate) { const auto & estimates = BiasData::getRawEstimates(); @@ -52,7 +52,7 @@ public: } else { - /// Получаем оценку погрешности путём линейной интерполяции. + /// We get the error estimate by linear interpolation. 
size_t index = std::distance(estimates.begin(), it); double estimate1 = estimates[index - 1]; @@ -60,7 +60,7 @@ public: double bias1 = biases[index - 1]; double bias2 = biases[index]; - /// Предполагается, что условие estimate1 < estimate2 всегда выполнено. + /// It is assumed that the estimate1 < estimate2 condition is always satisfied. double slope = (bias2 - bias1) / (estimate2 - estimate1); return bias1 + slope * (raw_estimate - estimate1); @@ -68,7 +68,7 @@ public: } private: - /// Статические проверки. + /// Static checks. using TRawEstimatesRef = decltype(BiasData::getRawEstimates()); using TRawEstimates = typename std::remove_reference::type; @@ -82,10 +82,10 @@ private: "Bias estimator has inconsistent data"); }; -/** Тривиальный случай HyperLogLogBiasEstimator: употребляется, если не хотим исправить - * погрешность. Это имеет смысль при маленьких значениях параметра точности, например 5 или 12. - * Тогда применяются поправки из оригинальной версии алгоритма HyperLogLog. - * См. "HyperLogLog: The analysis of a near-optimal cardinality estimation algorithm" +/** Trivial case of HyperLogLogBiasEstimator: used if we do not want to correct + * the error. This makes sense for small values of the precision parameter, for example 5 or 12. + * Then the corrections from the original version of the HyperLogLog algorithm are applied. + * See "HyperLogLog: The analysis of a near-optimal cardinality estimation algorithm" * (P. Flajolet et al., AOFA '07: Proceedings of the 2007 International Conference on Analysis * of Algorithms) */ diff --git a/dbms/src/Common/HyperLogLogWithSmallSetOptimization.h b/dbms/src/Common/HyperLogLogWithSmallSetOptimization.h index faa6b90c9db..5296a606121 100644 --- a/dbms/src/Common/HyperLogLogWithSmallSetOptimization.h +++ b/dbms/src/Common/HyperLogLogWithSmallSetOptimization.h @@ -9,10 +9,10 @@ namespace DB { -/** Для маленького количества ключей - массив фиксированного размера "на стеке". - * Для большого - выделяется HyperLogLog.
- * Смотрите также более практичную реализацию в CombinedCardinalityEstimator.h, - * где используется также хэш-таблица для множеств среднего размера. +/** For a small number of keys - an array of fixed size "on the stack". + * For a large number of keys, HyperLogLog is allocated. + * See also the more practical implementation in CombinedCardinalityEstimator.h, + * where a hash table is also used for medium-sized sets. */ template < @@ -39,7 +39,7 @@ private: { CurrentMemoryTracker::alloc(sizeof(large)); - /// На время копирования данных из tiny, устанавливать значение large ещё нельзя (иначе оно перезатрёт часть данных). + /// While data is being copied from `small`, the value of `large` must not be set yet (otherwise it will overwrite some of the data). Large * tmp_large = new Large; for (const auto & x : small) @@ -99,7 +99,7 @@ public: } } - /// Можно вызывать только для пустого объекта. + /// Can only be called for an empty object. void read(DB::ReadBuffer & in) { bool is_large; diff --git a/dbms/src/Common/Increment.h b/dbms/src/Common/Increment.h index fafc424073c..45ef604ccd9 100644 --- a/dbms/src/Common/Increment.h +++ b/dbms/src/Common/Increment.h @@ -3,24 +3,24 @@ #include -/** Позволяет получать авто-инкрементное число, храня его в файле. - * Предназначен для редких вызовов (не рассчитан на производительность). +/** Allows getting an auto-increment number, storing it in a file. + * Intended for rare calls (not designed for performance). */ class Increment { public: - /// path - имя файла, включая путь + /// path - the name of the file, including the path Increment(const std::string & path_) : counter(path_) {} - /** Получить следующее число. - * Если параметр create_if_need не установлен в true, то - * в файле уже должно быть записано какое-нибудь число (если нет - создайте файл вручную с нулём). + /** Get the next number.
+ * If the `create_if_need` parameter is not set to true, then + * the file must already have a number written (if not - create the file manually with zero). * - * Для защиты от race condition-ов между разными процессами, используются файловые блокировки. - * (Но при первом создании файла race condition возможен, так что лучше создать файл заранее.) + * To protect against race conditions between different processes, file locks are used. + * (But a race condition is possible when the file is first created, so it's better to create the file in advance.) * - * locked_callback вызывается при заблокированном файле со счетчиком. В него передается новое значение. - * locked_callback можно использовать, чтобы делать что-нибудь атомарно с увеличением счетчика (например, переименовывать файлы). + * `locked_callback` is called when the counter file is locked. The new value is passed to it. + * `locked_callback` can be used to do something atomically with the increment of the counter (for example, rename files). */ template UInt64 get(Callback && locked_callback, bool create_if_need = false) @@ -33,25 +33,25 @@ public: return getBunch(1, create_if_need); } - /// Посмотреть следующее значение. + /// Peek at the next value. UInt64 peek(bool create_if_need = false) { return getBunch(0, create_if_need); } - /** Получить следующее число и увеличить счетчик на count. - * Если параметр create_if_need не установлен в true, то - * в файле уже должно быть записано какое-нибудь число (если нет - создайте файл вручную с нулём). - * - * Для защиты от race condition-ов между разными процессами, используются файловые блокировки. - * (Но при первом создании файла race condition возможен, так что лучше создать файл заранее.) - */ + /** Get the next number and increase the counter by `count`. + * If the `create_if_need` parameter is not set to true, then + * the file should already have a number written (if not - create the file manually with zero).
+ * + * To protect against race conditions between different processes, file locks are used. + * (But a race condition is possible when the file is first created, so it's better to create the file in advance.) + */ UInt64 getBunch(UInt64 count, bool create_if_need = false) { return static_cast(counter.add(static_cast(count), create_if_need) - count + 1); } - /// Изменить путь к файлу. + /// Change the path to the file. void setPath(std::string path_) { counter.setPath(path_); @@ -65,23 +65,3 @@ public: private: CounterInFile counter; }; - - -/** То же самое, но без хранения в файле. - */ -struct SimpleIncrement : private boost::noncopyable -{ - std::atomic value; - - SimpleIncrement(UInt64 start = 0) : value(start) {} - - void set(UInt64 new_value) - { - value = new_value; - } - - UInt64 get() - { - return ++value; - } -}; diff --git a/dbms/src/Common/Macros.h b/dbms/src/Common/Macros.h index 0e6cfe94c39..0ebf52afd02 100644 --- a/dbms/src/Common/Macros.h +++ b/dbms/src/Common/Macros.h @@ -4,10 +4,11 @@ #include #include + namespace DB { -/** Раскрывает в строке макросы из конфига. +/** Apply substitutions from the macros in config to the string. */ class Macros { @@ -15,8 +16,8 @@ public: Macros(); Macros(const Poco::Util::AbstractConfiguration & config, const String & key); - /** Заменить в строке подстроки вида {macro_name} на значение для macro_name, полученное из конфига. - * level - уровень рекурсии. + /** Replace substrings of the form {macro_name} in the string with the value for macro_name, obtained from the config file. + * level - the level of recursion. */ String expand(const String & s, size_t level = 0) const; diff --git a/dbms/src/Common/MemoryTracker.h b/dbms/src/Common/MemoryTracker.h index df805e7f999..c06fc33444e 100644 --- a/dbms/src/Common/MemoryTracker.h +++ b/dbms/src/Common/MemoryTracker.h @@ -102,10 +102,10 @@ public: }; -/** Объект MemoryTracker довольно трудно протащить во все места, где выделяются существенные объёмы памяти.
- * Поэтому, используется thread-local указатель на используемый MemoryTracker или nullptr, если его не нужно использовать. - * Этот указатель выставляется, когда в данном потоке следует отслеживать потребление памяти. - * Таким образом, его нужно всего-лишь протащить во все потоки, в которых обрабатывается один запрос. +/** The MemoryTracker object is quite difficult to pass to all places where significant amounts of memory are allocated. + * Therefore, a thread-local pointer to used MemoryTracker is set, or nullptr if MemoryTracker does not need to be used. + * This pointer is set when memory consumption is monitored in current thread. + * So, you just need to pass it to all the threads that handle one request. */ extern __thread MemoryTracker * current_memory_tracker; diff --git a/dbms/src/Common/OptimizedRegularExpression.h b/dbms/src/Common/OptimizedRegularExpression.h index d6a6475c1fb..906359df503 100644 --- a/dbms/src/Common/OptimizedRegularExpression.h +++ b/dbms/src/Common/OptimizedRegularExpression.h @@ -12,20 +12,22 @@ #endif -/** Использует два способа оптимизации регулярного выражения: - * 1. Если регулярное выражение является тривиальным (сводится к поиску подстроки в строке), - * то заменяет поиск на strstr или strcasestr. - * 2. Если регулярное выражение содержит безальтернативную подстроку достаточной длины, - * то перед проверкой используется strstr или strcasestr достаточной длины; - * регулярное выражение проверяется полностью только если подстрока найдена. - * 3. В остальных случаях, используется движок re2. +/** Uses two ways to optimize a regular expression: + * 1. If the regular expression is trivial (reduces to finding a substring in a string), + * then replaces the search with strstr or strcasestr. + * 2. If the regular expression contains a non-alternative substring of sufficient length, + * then before testing, strstr or strcasestr of sufficient length is used; + * regular expression is only fully checked if a substring is found. + * 3. 
In other cases, the re2 engine is used. * - * Это имеет смысл, так как strstr и strcasestr в libc под Linux хорошо оптимизированы. + * This makes sense, since strstr and strcasestr in libc for Linux are well optimized. * - * Подходит, если одновременно выполнены следующие условия: - * - если в большинстве вызовов, регулярное выражение не матчится; - * - если регулярное выражение совместимо с движком re2; - * - можете использовать на свой риск, так как, возможно, не все случаи учтены. + * Suitable if the following conditions are simultaneously met: + * - if in most calls, the regular expression does not match; + * - if the regular expression is compatible with the re2 engine; + * - you can use at your own risk, since, probably, not all cases are taken into account. + * + * NOTE: Multi-character metasymbols such as \Pl are handled incorrectly. */ namespace OptimizedRegularExpressionDetails @@ -82,7 +84,7 @@ public: unsigned getNumberOfSubpatterns() const { return number_of_subpatterns; } - /// Получить регексп re2 или nullptr, если шаблон тривиален (для вывода в лог). + /// Get the regexp re2 or nullptr if the pattern is trivial (for output to the log). 
const std::unique_ptr& getRE2() const { return re2; } static void analyze(const std::string & regexp_, std::string & required_substring, bool & is_trivial, bool & required_substring_is_prefix); @@ -105,4 +107,4 @@ private: using OptimizedRegularExpression = OptimizedRegularExpressionImpl; -#include "OptimizedRegularExpression.inl" +#include "OptimizedRegularExpression.inl.h" diff --git a/dbms/src/Common/OptimizedRegularExpression.inl b/dbms/src/Common/OptimizedRegularExpression.inl deleted file mode 100644 index 2689901ca99..00000000000 --- a/dbms/src/Common/OptimizedRegularExpression.inl +++ /dev/null @@ -1,431 +0,0 @@ -#include - -#include - -#include - - -#define MIN_LENGTH_FOR_STRSTR 3 -#define MAX_SUBPATTERNS 5 - -template -void OptimizedRegularExpressionImpl::analyze( - const std::string & regexp, - std::string & required_substring, - bool & is_trivial, - bool & required_substring_is_prefix) -{ - /** Выражение тривиально, если в нём все метасимволы эскейплены. - * Безальтернативная строка - это - * строка вне скобок, - * в которой все метасимволы эскейплены, - * а также если вне скобок нет '|', - * а также избегаются подстроки вида http:// или www. - */ - const char * begin = regexp.data(); - const char * pos = begin; - const char * end = regexp.data() + regexp.size(); - int depth = 0; - is_trivial = true; - required_substring_is_prefix = false; - required_substring.clear(); - bool has_alternative_on_depth_0 = false; - - /// Подстрока с позицией. 
- typedef std::pair Substring; - - typedef std::vector Substrings; - Substrings trivial_substrings(1); - Substring * last_substring = &trivial_substrings.back(); - - bool in_curly_braces = false; - bool in_square_braces = false; - - while (pos != end) - { - switch (*pos) - { - case '\0': - pos = end; - break; - - case '\\': - { - ++pos; - if (pos == end) - break; - - switch (*pos) - { - case '|': case '(': case ')': case '^': case '$': case '.': case '[': case '?': case '*': case '+': case '{': - if (depth == 0 && !in_curly_braces && !in_square_braces) - { - if (last_substring->first.empty()) - last_substring->second = pos - begin; - last_substring->first.push_back(*pos); - } - break; - default: - /// все остальные escape-последовательности не поддерживаем - is_trivial = false; - if (!last_substring->first.empty()) - { - trivial_substrings.resize(trivial_substrings.size() + 1); - last_substring = &trivial_substrings.back(); - } - break; - } - - ++pos; - break; - } - - case '|': - if (depth == 0) - has_alternative_on_depth_0 = true; - is_trivial = false; - if (!in_square_braces && !last_substring->first.empty()) - { - trivial_substrings.resize(trivial_substrings.size() + 1); - last_substring = &trivial_substrings.back(); - } - ++pos; - break; - - case '(': - if (!in_square_braces) - { - ++depth; - is_trivial = false; - if (!last_substring->first.empty()) - { - trivial_substrings.resize(trivial_substrings.size() + 1); - last_substring = &trivial_substrings.back(); - } - } - ++pos; - break; - - case '[': - in_square_braces = true; - ++depth; - is_trivial = false; - if (!last_substring->first.empty()) - { - trivial_substrings.resize(trivial_substrings.size() + 1); - last_substring = &trivial_substrings.back(); - } - ++pos; - break; - - case ']': - if (!in_square_braces) - goto ordinary; - - in_square_braces = false; - --depth; - is_trivial = false; - if (!last_substring->first.empty()) - { - trivial_substrings.resize(trivial_substrings.size() + 1); - last_substring = 
&trivial_substrings.back(); - } - ++pos; - break; - - case ')': - if (!in_square_braces) - { - --depth; - is_trivial = false; - if (!last_substring->first.empty()) - { - trivial_substrings.resize(trivial_substrings.size() + 1); - last_substring = &trivial_substrings.back(); - } - } - ++pos; - break; - - case '^': case '$': case '.': case '+': - is_trivial = false; - if (!last_substring->first.empty() && !in_square_braces) - { - trivial_substrings.resize(trivial_substrings.size() + 1); - last_substring = &trivial_substrings.back(); - } - ++pos; - break; - - /// Квантификаторы, допускающие нулевое количество. - case '{': - in_curly_braces = true; - case '?': case '*': - is_trivial = false; - if (!last_substring->first.empty() && !in_square_braces) - { - last_substring->first.resize(last_substring->first.size() - 1); - trivial_substrings.resize(trivial_substrings.size() + 1); - last_substring = &trivial_substrings.back(); - } - ++pos; - break; - - case '}': - if (!in_curly_braces) - goto ordinary; - - in_curly_braces = false; - ++pos; - break; - - ordinary: /// Обычный, не заэскейпленный символ. - default: - if (depth == 0 && !in_curly_braces && !in_square_braces) - { - if (last_substring->first.empty()) - last_substring->second = pos - begin; - last_substring->first.push_back(*pos); - } - ++pos; - break; - } - } - - if (last_substring && last_substring->first.empty()) - trivial_substrings.pop_back(); - - if (!is_trivial) - { - if (!has_alternative_on_depth_0) - { - /** Выберем безальтернативную подстроку максимальной длины, среди префиксов, - * или безальтернативную подстроку максимальной длины. 
- */ - size_t max_length = 0; - Substrings::const_iterator candidate_it = trivial_substrings.begin(); - for (Substrings::const_iterator it = trivial_substrings.begin(); it != trivial_substrings.end(); ++it) - { - if (((it->second == 0 && candidate_it->second != 0) - || ((it->second == 0) == (candidate_it->second == 0) && it->first.size() > max_length)) - /// Тюнинг для предметной области - && (it->first.size() > strlen("://") || strncmp(it->first.data(), "://", strlen("://"))) - && (it->first.size() > strlen("http://") || strncmp(it->first.data(), "http", strlen("http"))) - && (it->first.size() > strlen("www.") || strncmp(it->first.data(), "www", strlen("www"))) - && (it->first.size() > strlen("Windows ") || strncmp(it->first.data(), "Windows ", strlen("Windows ")))) - { - max_length = it->first.size(); - candidate_it = it; - } - } - - if (max_length >= MIN_LENGTH_FOR_STRSTR) - { - required_substring = candidate_it->first; - required_substring_is_prefix = candidate_it->second == 0; - } - } - } - else - { - required_substring = trivial_substrings.front().first; - required_substring_is_prefix = trivial_substrings.front().second == 0; - } - -/* std::cerr - << "regexp: " << regexp - << ", is_trivial: " << is_trivial - << ", required_substring: " << required_substring - << ", required_substring_is_prefix: " << required_substring_is_prefix - << std::endl;*/ -} - - -template -OptimizedRegularExpressionImpl::OptimizedRegularExpressionImpl(const std::string & regexp_, int options) -{ - analyze(regexp_, required_substring, is_trivial, required_substring_is_prefix); - - /// Поддерживаются 3 опции - if (options & (~(RE_CASELESS | RE_NO_CAPTURE | RE_DOT_NL))) - throw Poco::Exception("OptimizedRegularExpression: Unsupported option."); - - is_case_insensitive = options & RE_CASELESS; - bool is_no_capture = options & RE_NO_CAPTURE; - bool is_dot_nl = options & RE_DOT_NL; - - number_of_subpatterns = 0; - if (!is_trivial) - { - /// Скомпилируем регулярное выражение re2. 
- typename RegexType::Options options; - - if (is_case_insensitive) - options.set_case_sensitive(false); - - if (is_dot_nl) - options.set_dot_nl(true); - - re2 = std::make_unique(regexp_, options); - if (!re2->ok()) - throw Poco::Exception("OptimizedRegularExpression: cannot compile re2: " + regexp_ + ", error: " + re2->error()); - - if (!is_no_capture) - { - number_of_subpatterns = re2->NumberOfCapturingGroups(); - if (number_of_subpatterns > MAX_SUBPATTERNS) - throw Poco::Exception("OptimizedRegularExpression: too many subpatterns in regexp: " + regexp_); - } - } -} - - -template -bool OptimizedRegularExpressionImpl::match(const char * subject, size_t subject_size) const -{ - if (is_trivial) - { - if (is_case_insensitive) - return nullptr != strcasestr(subject, required_substring.data()); - else - return nullptr != strstr(subject, required_substring.data()); - } - else - { - if (!required_substring.empty()) - { - const char * pos; - if (is_case_insensitive) - pos = strcasestr(subject, required_substring.data()); - else - pos = strstr(subject, required_substring.data()); - - if (nullptr == pos) - return 0; - } - - return re2->Match(StringPieceType(subject, subject_size), 0, subject_size, RegexType::UNANCHORED, nullptr, 0); - } -} - - -template -bool OptimizedRegularExpressionImpl::match(const char * subject, size_t subject_size, Match & match) const -{ - if (is_trivial) - { - const char * pos; - if (is_case_insensitive) - pos = strcasestr(subject, required_substring.data()); - else - pos = strstr(subject, required_substring.data()); - - if (pos == nullptr) - return 0; - else - { - match.offset = pos - subject; - match.length = required_substring.size(); - return 1; - } - } - else - { - if (!required_substring.empty()) - { - const char * pos; - if (is_case_insensitive) - pos = strcasestr(subject, required_substring.data()); - else - pos = strstr(subject, required_substring.data()); - - if (nullptr == pos) - return 0; - } - - StringPieceType piece; - - if 
(!RegexType::PartialMatch(StringPieceType(subject, subject_size), *re2, &piece)) - return 0; - else - { - match.offset = piece.data() - subject; - match.length = piece.length(); - return 1; - } - } -} - - -template -unsigned OptimizedRegularExpressionImpl::match(const char * subject, size_t subject_size, MatchVec & matches, unsigned limit) const -{ - matches.clear(); - - if (limit == 0) - return 0; - - if (limit > number_of_subpatterns + 1) - limit = number_of_subpatterns + 1; - - if (is_trivial) - { - const char * pos; - if (is_case_insensitive) - pos = strcasestr(subject, required_substring.data()); - else - pos = strstr(subject, required_substring.data()); - - if (pos == nullptr) - return 0; - else - { - Match match; - match.offset = pos - subject; - match.length = required_substring.size(); - matches.push_back(match); - return 1; - } - } - else - { - if (!required_substring.empty()) - { - const char * pos; - if (is_case_insensitive) - pos = strcasestr(subject, required_substring.data()); - else - pos = strstr(subject, required_substring.data()); - - if (nullptr == pos) - return 0; - } - - StringPieceType pieces[MAX_SUBPATTERNS]; - - if (!re2->Match(StringPieceType(subject, subject_size), 0, subject_size, RegexType::UNANCHORED, pieces, limit)) - return 0; - else - { - matches.resize(limit); - for (size_t i = 0; i < limit; ++i) - { - if (pieces[i] != nullptr) - { - matches[i].offset = pieces[i].data() - subject; - matches[i].length = pieces[i].length(); - } - else - { - matches[i].offset = std::string::npos; - matches[i].length = 0; - } - } - return limit; - } - } -} - -#undef MIN_LENGTH_FOR_STRSTR -#undef MAX_SUBPATTERNS - diff --git a/dbms/src/Common/OptimizedRegularExpression.inl.h b/dbms/src/Common/OptimizedRegularExpression.inl.h new file mode 100644 index 00000000000..ef6cb781a39 --- /dev/null +++ b/dbms/src/Common/OptimizedRegularExpression.inl.h @@ -0,0 +1,433 @@ +#include + +#include + +#include + + +#define MIN_LENGTH_FOR_STRSTR 3 +#define 
MAX_SUBPATTERNS 5 + + +template +void OptimizedRegularExpressionImpl::analyze( + const std::string & regexp, + std::string & required_substring, + bool & is_trivial, + bool & required_substring_is_prefix) +{ + /** The expression is trivial if all the metacharacters in it are escaped. + * The non-alternative string is + * a string outside parentheses, + * in which all metacharacters are escaped, + * and also if there are no '|' outside the brackets, + * and also avoid substrings of the form `http://` or `www` and some other + * (this is the hack for typical use case in Yandex.Metrica). + */ + const char * begin = regexp.data(); + const char * pos = begin; + const char * end = regexp.data() + regexp.size(); + int depth = 0; + is_trivial = true; + required_substring_is_prefix = false; + required_substring.clear(); + bool has_alternative_on_depth_0 = false; + + /// Substring with a position. + using Substring = std::pair; + using Substrings = std::vector; + + Substrings trivial_substrings(1); + Substring * last_substring = &trivial_substrings.back(); + + bool in_curly_braces = false; + bool in_square_braces = false; + + while (pos != end) + { + switch (*pos) + { + case '\0': + pos = end; + break; + + case '\\': + { + ++pos; + if (pos == end) + break; + + switch (*pos) + { + case '|': case '(': case ')': case '^': case '$': case '.': case '[': case '?': case '*': case '+': case '{': + if (depth == 0 && !in_curly_braces && !in_square_braces) + { + if (last_substring->first.empty()) + last_substring->second = pos - begin; + last_substring->first.push_back(*pos); + } + break; + default: + /// all other escape sequences are not supported + is_trivial = false; + if (!last_substring->first.empty()) + { + trivial_substrings.resize(trivial_substrings.size() + 1); + last_substring = &trivial_substrings.back(); + } + break; + } + + ++pos; + break; + } + + case '|': + if (depth == 0) + has_alternative_on_depth_0 = true; + is_trivial = false; + if (!in_square_braces && 
!last_substring->first.empty()) + { + trivial_substrings.resize(trivial_substrings.size() + 1); + last_substring = &trivial_substrings.back(); + } + ++pos; + break; + + case '(': + if (!in_square_braces) + { + ++depth; + is_trivial = false; + if (!last_substring->first.empty()) + { + trivial_substrings.resize(trivial_substrings.size() + 1); + last_substring = &trivial_substrings.back(); + } + } + ++pos; + break; + + case '[': + in_square_braces = true; + ++depth; + is_trivial = false; + if (!last_substring->first.empty()) + { + trivial_substrings.resize(trivial_substrings.size() + 1); + last_substring = &trivial_substrings.back(); + } + ++pos; + break; + + case ']': + if (!in_square_braces) + goto ordinary; + + in_square_braces = false; + --depth; + is_trivial = false; + if (!last_substring->first.empty()) + { + trivial_substrings.resize(trivial_substrings.size() + 1); + last_substring = &trivial_substrings.back(); + } + ++pos; + break; + + case ')': + if (!in_square_braces) + { + --depth; + is_trivial = false; + if (!last_substring->first.empty()) + { + trivial_substrings.resize(trivial_substrings.size() + 1); + last_substring = &trivial_substrings.back(); + } + } + ++pos; + break; + + case '^': case '$': case '.': case '+': + is_trivial = false; + if (!last_substring->first.empty() && !in_square_braces) + { + trivial_substrings.resize(trivial_substrings.size() + 1); + last_substring = &trivial_substrings.back(); + } + ++pos; + break; + + /// Quantifiers that allow a zero number of occurrences.
+ case '{': + in_curly_braces = true; + case '?': case '*': + is_trivial = false; + if (!last_substring->first.empty() && !in_square_braces) + { + last_substring->first.resize(last_substring->first.size() - 1); + trivial_substrings.resize(trivial_substrings.size() + 1); + last_substring = &trivial_substrings.back(); + } + ++pos; + break; + + case '}': + if (!in_curly_braces) + goto ordinary; + + in_curly_braces = false; + ++pos; + break; + + ordinary: /// Normal, not escaped symbol. + default: + if (depth == 0 && !in_curly_braces && !in_square_braces) + { + if (last_substring->first.empty()) + last_substring->second = pos - begin; + last_substring->first.push_back(*pos); + } + ++pos; + break; + } + } + + if (last_substring && last_substring->first.empty()) + trivial_substrings.pop_back(); + + if (!is_trivial) + { + if (!has_alternative_on_depth_0) + { + /** We choose the non-alternative substring of the maximum length, among the prefixes, + * or a non-alternative substring of maximum length. 
+ */ + size_t max_length = 0; + Substrings::const_iterator candidate_it = trivial_substrings.begin(); + for (Substrings::const_iterator it = trivial_substrings.begin(); it != trivial_substrings.end(); ++it) + { + if (((it->second == 0 && candidate_it->second != 0) + || ((it->second == 0) == (candidate_it->second == 0) && it->first.size() > max_length)) + /// Tuning for typical usage domain + && (it->first.size() > strlen("://") || strncmp(it->first.data(), "://", strlen("://"))) + && (it->first.size() > strlen("http://") || strncmp(it->first.data(), "http", strlen("http"))) + && (it->first.size() > strlen("www.") || strncmp(it->first.data(), "www", strlen("www"))) + && (it->first.size() > strlen("Windows ") || strncmp(it->first.data(), "Windows ", strlen("Windows ")))) + { + max_length = it->first.size(); + candidate_it = it; + } + } + + if (max_length >= MIN_LENGTH_FOR_STRSTR) + { + required_substring = candidate_it->first; + required_substring_is_prefix = candidate_it->second == 0; + } + } + } + else + { + required_substring = trivial_substrings.front().first; + required_substring_is_prefix = trivial_substrings.front().second == 0; + } + +/* std::cerr + << "regexp: " << regexp + << ", is_trivial: " << is_trivial + << ", required_substring: " << required_substring + << ", required_substring_is_prefix: " << required_substring_is_prefix + << std::endl;*/ +} + + +template +OptimizedRegularExpressionImpl::OptimizedRegularExpressionImpl(const std::string & regexp_, int options) +{ + analyze(regexp_, required_substring, is_trivial, required_substring_is_prefix); + + /// Just three following options are supported + if (options & (~(RE_CASELESS | RE_NO_CAPTURE | RE_DOT_NL))) + throw Poco::Exception("OptimizedRegularExpression: Unsupported option."); + + is_case_insensitive = options & RE_CASELESS; + bool is_no_capture = options & RE_NO_CAPTURE; + bool is_dot_nl = options & RE_DOT_NL; + + number_of_subpatterns = 0; + if (!is_trivial) + { + /// Compile the re2 regular 
expression. + typename RegexType::Options options; + + if (is_case_insensitive) + options.set_case_sensitive(false); + + if (is_dot_nl) + options.set_dot_nl(true); + + re2 = std::make_unique(regexp_, options); + if (!re2->ok()) + throw Poco::Exception("OptimizedRegularExpression: cannot compile re2: " + regexp_ + ", error: " + re2->error()); + + if (!is_no_capture) + { + number_of_subpatterns = re2->NumberOfCapturingGroups(); + if (number_of_subpatterns > MAX_SUBPATTERNS) + throw Poco::Exception("OptimizedRegularExpression: too many subpatterns in regexp: " + regexp_); + } + } +} + + +template +bool OptimizedRegularExpressionImpl::match(const char * subject, size_t subject_size) const +{ + if (is_trivial) + { + if (is_case_insensitive) + return nullptr != strcasestr(subject, required_substring.data()); + else + return nullptr != strstr(subject, required_substring.data()); + } + else + { + if (!required_substring.empty()) + { + const char * pos; + if (is_case_insensitive) + pos = strcasestr(subject, required_substring.data()); + else + pos = strstr(subject, required_substring.data()); + + if (nullptr == pos) + return 0; + } + + return re2->Match(StringPieceType(subject, subject_size), 0, subject_size, RegexType::UNANCHORED, nullptr, 0); + } +} + + +template +bool OptimizedRegularExpressionImpl::match(const char * subject, size_t subject_size, Match & match) const +{ + if (is_trivial) + { + const char * pos; + if (is_case_insensitive) + pos = strcasestr(subject, required_substring.data()); + else + pos = strstr(subject, required_substring.data()); + + if (pos == nullptr) + return 0; + else + { + match.offset = pos - subject; + match.length = required_substring.size(); + return 1; + } + } + else + { + if (!required_substring.empty()) + { + const char * pos; + if (is_case_insensitive) + pos = strcasestr(subject, required_substring.data()); + else + pos = strstr(subject, required_substring.data()); + + if (nullptr == pos) + return 0; + } + + StringPieceType piece; + + 
if (!RegexType::PartialMatch(StringPieceType(subject, subject_size), *re2, &piece)) + return 0; + else + { + match.offset = piece.data() - subject; + match.length = piece.length(); + return 1; + } + } +} + + +template +unsigned OptimizedRegularExpressionImpl::match(const char * subject, size_t subject_size, MatchVec & matches, unsigned limit) const +{ + matches.clear(); + + if (limit == 0) + return 0; + + if (limit > number_of_subpatterns + 1) + limit = number_of_subpatterns + 1; + + if (is_trivial) + { + const char * pos; + if (is_case_insensitive) + pos = strcasestr(subject, required_substring.data()); + else + pos = strstr(subject, required_substring.data()); + + if (pos == nullptr) + return 0; + else + { + Match match; + match.offset = pos - subject; + match.length = required_substring.size(); + matches.push_back(match); + return 1; + } + } + else + { + if (!required_substring.empty()) + { + const char * pos; + if (is_case_insensitive) + pos = strcasestr(subject, required_substring.data()); + else + pos = strstr(subject, required_substring.data()); + + if (nullptr == pos) + return 0; + } + + StringPieceType pieces[MAX_SUBPATTERNS]; + + if (!re2->Match(StringPieceType(subject, subject_size), 0, subject_size, RegexType::UNANCHORED, pieces, limit)) + return 0; + else + { + matches.resize(limit); + for (size_t i = 0; i < limit; ++i) + { + if (pieces[i] != nullptr) + { + matches[i].offset = pieces[i].data() - subject; + matches[i].length = pieces[i].length(); + } + else + { + matches[i].offset = std::string::npos; + matches[i].length = 0; + } + } + return limit; + } + } +} + +#undef MIN_LENGTH_FOR_STRSTR +#undef MAX_SUBPATTERNS + diff --git a/dbms/src/Common/PODArray.h b/dbms/src/Common/PODArray.h index 2d42c1c39dc..e098756c8ae 100644 --- a/dbms/src/Common/PODArray.h +++ b/dbms/src/Common/PODArray.h @@ -19,46 +19,46 @@ namespace DB { -/** Динамический массив для POD-типов. 
- * Предназначен для небольшого количества больших массивов (а не большого количества маленьких). - * А точнее - для использования в ColumnVector. - * Отличается от std::vector тем, что не инициализирует элементы. +/** A dynamic array for POD types. + * Designed for a small number of large arrays (rather than a lot of small ones). + * To be more precise - for use in ColumnVector. + * It differs from std::vector in that it does not initialize the elements. * - * Сделан некопируемым, чтобы не было случайных копий. Скопировать данные можно с помощью метода assign. + * Made noncopyable so that there are no accidental copies. You can copy the data using `assign` method. * - * Поддерживается только часть интерфейса std::vector. + * Only part of the std::vector interface is supported. * - * Конструктор по-умолчанию создаёт пустой объект, который не выделяет память. - * Затем выделяется память минимум в INITIAL_SIZE байт. + * The default constructor creates an empty object that does not allocate memory. + * Then the memory is allocated at least INITIAL_SIZE bytes. * - * Если вставлять элементы push_back-ом, не делая reserve, то PODArray примерно в 2.5 раза быстрее std::vector. + * If you insert elements with push_back, without making a `reserve`, then PODArray is about 2.5 times faster than std::vector. * - * Шаблонный параметр pad_right - всегда выделять в конце массива столько неиспользуемых байт. - * Может использоваться для того, чтобы делать оптимистичное чтение, запись, копирование невыровненными SIMD-инструкциями. + * The template parameter `pad_right` - always allocate at the end of the array as many unused bytes. + * Can be used to make optimistic reading, writing, copying with unaligned SIMD instructions. */ template , size_t pad_right_ = 0> class PODArray : private boost::noncopyable, private TAllocator /// empty base optimization { private: - /// Округление padding-а вверх до целого количества элементов, чтобы упростить арифметику.
+ /// Round padding up to a whole number of elements to simplify arithmetic. static constexpr size_t pad_right = (pad_right_ + sizeof(T) - 1) / sizeof(T) * sizeof(T); - char * c_start = nullptr; - char * c_end = nullptr; - char * c_end_of_storage = nullptr; /// Не включает в себя pad_right. + char * c_start = nullptr; + char * c_end = nullptr; + char * c_end_of_storage = nullptr; /// Does not include pad_right. - T * t_start() { return reinterpret_cast(c_start); } - T * t_end() { return reinterpret_cast(c_end); } - T * t_end_of_storage() { return reinterpret_cast(c_end_of_storage); } + T * t_start() { return reinterpret_cast(c_start); } + T * t_end() { return reinterpret_cast(c_end); } + T * t_end_of_storage() { return reinterpret_cast(c_end_of_storage); } - const T * t_start() const { return reinterpret_cast(c_start); } - const T * t_end() const { return reinterpret_cast(c_end); } - const T * t_end_of_storage() const { return reinterpret_cast(c_end_of_storage); } + const T * t_start() const { return reinterpret_cast(c_start); } + const T * t_end() const { return reinterpret_cast(c_end); } + const T * t_end_of_storage() const { return reinterpret_cast(c_end_of_storage); } - /// Количество памяти, занимаемое num_elements элементов. + /// The amount of memory occupied by num_elements elements. static size_t byte_size(size_t num_elements) { return num_elements * sizeof(T); } - /// Минимальное количество памяти, которое нужно выделить для num_elements элементов, включая padding. + /// Minimum amount of memory to allocate for num_elements, including padding. static size_t minimum_memory_for_elements(size_t num_elements) { return byte_size(num_elements) + pad_right; } void alloc_for_num_elements(size_t num_elements) @@ -112,7 +112,7 @@ public: size_t allocated_size() const { return c_end_of_storage - c_start + pad_right; } - /// Просто typedef нельзя, так как возникает неоднозначность для конструкторов и функций assign.
+ /// You can not just use `typedef`, because there is ambiguity for the constructors and `assign` functions. struct iterator : public boost::iterator_adaptor { iterator() {} @@ -173,16 +173,16 @@ public: const T & operator[] (size_t n) const { return t_start()[n]; } T & front() { return t_start()[0]; } - T & back() { return t_end()[-1]; } + T & back() { return t_end()[-1]; } const T & front() const { return t_start()[0]; } const T & back() const { return t_end()[-1]; } - iterator begin() { return t_start(); } - iterator end() { return t_end(); } - const_iterator begin() const { return t_start(); } - const_iterator end() const { return t_end(); } - const_iterator cbegin() const { return t_start(); } - const_iterator cend() const { return t_end(); } + iterator begin() { return t_start(); } + iterator end() { return t_end(); } + const_iterator begin() const { return t_start(); } + const_iterator end() const { return t_end(); } + const_iterator cbegin() const { return t_start(); } + const_iterator cend() const { return t_end(); } void reserve(size_t n) { @@ -209,7 +209,7 @@ public: c_end = c_start + byte_size(n); } - /// Как resize, но обнуляет новые элементы. + /// Same as resize, but zeroes new elements. void resize_fill(size_t n) { size_t old_size = size(); @@ -261,7 +261,7 @@ public: c_end -= byte_size(1); } - /// Не вставляйте в массив кусок самого себя. Потому что при ресайзе, итераторы на самого себя могут инвалидироваться. + /// Do not insert into the array a piece of itself. Because with the resize, the iterators on themselves can be invalidated. 
template void insert(It1 from_begin, It2 from_end) { @@ -458,7 +458,7 @@ void swap(PODArray & lhs, PODArray> using PaddedPODArray = PODArray; diff --git a/dbms/src/Common/PoolBase.h b/dbms/src/Common/PoolBase.h index 5c470ada446..194d7e421ad 100644 --- a/dbms/src/Common/PoolBase.h +++ b/dbms/src/Common/PoolBase.h @@ -8,8 +8,17 @@ #include #include -/** Класс, от которого можно унаследоваться и получить пул чего-нибудь. Используется для пулов соединений с БД. - * Наследник должен предоставить метод для создания нового объекта для помещения в пул. + +namespace DB +{ + namespace ErrorCodes + { + extern const int LOGICAL_ERROR; + } +} + +/** A class from which you can inherit and get a pool of something. Used for database connection pools. + * Descendant class must provide a method for creating a new object to place in the pool. */ template @@ -22,7 +31,7 @@ public: private: - /** Объект с флагом, используется ли он сейчас. */ + /** The object with the flag, whether it is currently used. */ struct PooledObject { PooledObject(ObjectPtr object_, PoolBase & pool_) @@ -37,8 +46,8 @@ private: using Objects = std::vector>; - /** Помощник, который устанавливает флаг использования объекта, а в деструкторе - снимает, - * а также уведомляет о событии с помощью condvar-а. + /** The helper, which sets the flag for using the object, and in the destructor - removes, + * and also notifies the event using condvar. */ struct PoolEntryHelper { @@ -54,36 +63,36 @@ private: }; public: - /** То, что выдаётся пользователю. */ + /** What is given to the user. */ class Entry { public: friend class PoolBase; - Entry() {} /// Для отложенной инициализации. + Entry() {} /// For deferred initialization. - /** Объект Entry защищает ресурс от использования другим потоком. 
- * Следующие методы запрещены для rvalue, чтобы нельзя было написать подобное - * - * auto q = pool.Get()->query("SELECT .."); // Упс, после этой строчки Entry уничтожился - * q.execute(); // Кто-то еще может использовать этот Connection - */ + /** The `Entry` object protects the resource from being used by another thread. + * The following methods are forbidden for rvalues, so that you cannot write something like + * + * auto q = pool.Get()->query("SELECT .."); // Oops, after this line Entry was destroyed + * q.execute(); // Someone else can use this Connection + */ Object * operator->() && = delete; const Object * operator->() const && = delete; Object & operator*() && = delete; const Object & operator*() const && = delete; - Object * operator->() & { return &*data->data.object; } - const Object * operator->() const & { return &*data->data.object; } - Object & operator*() & { return *data->data.object; } - const Object & operator*() const & { return *data->data.object; } + Object * operator->() & { return &*data->data.object; } + const Object * operator->() const & { return &*data->data.object; } + Object & operator*() & { return *data->data.object; } + const Object & operator*() const & { return *data->data.object; } bool isNull() const { return data == nullptr; } PoolBase * getPool() const { if (!data) - throw DB::Exception("attempt to get pool from uninitialized entry"); + throw DB::Exception("Attempt to get pool from uninitialized entry", DB::ErrorCodes::LOGICAL_ERROR); return &data->data.pool; } @@ -95,7 +104,7 @@ public: virtual ~PoolBase() {} - /** Выделяет объект для работы. При timeout < 0 таймаут бесконечный. */ + /** Allocates the object. Wait for free object in pool for 'timeout'. With 'timeout' < 0, the timeout is infinite. */ Entry get(Poco::Timespan::TimeDiff timeout) { std::unique_lock lock(mutex); @@ -131,13 +140,13 @@ public: } private: - /** Максимальный размер пула. */ + /** The maximum size of the pool. */ unsigned max_items; - /** Пул.
*/ + /** Pool. */ Objects items; - /** Блокировка для доступа к пулу. */ + /** Lock to access the pool. */ std::mutex mutex; std::condition_variable available; @@ -151,7 +160,7 @@ protected: items.reserve(max_items); } - /** Создает новый объект для помещения в пул. */ + /** Creates a new object to put into the pool. */ virtual ObjectPtr allocObject() = 0; }; diff --git a/dbms/src/Common/RadixSort.h b/dbms/src/Common/RadixSort.h index 39a21a8e02f..ee844fa83a8 100644 --- a/dbms/src/Common/RadixSort.h +++ b/dbms/src/Common/RadixSort.h @@ -13,18 +13,18 @@ #include -/** Поразрядная сортировка, обладает следующей функциональностью: - * Может сортировать unsigned, signed числа, а также float-ы. - * Может сортировать массив элементов фиксированной длины, которые содержат что-то ещё кроме ключа. - * Настраиваемый размер разряда. +/** Radix sort, has the following functionality: + * Can sort unsigned, signed numbers, and floats. + * Can sort an array of fixed length elements that contain something else besides the key. + * Customizable radix size. * * LSB, stable. - * NOTE Для некоторых приложений имеет смысл добавить MSB-radix-sort, - * а также алгоритмы radix-select, radix-partial-sort, radix-get-permutation на его основе. + * NOTE For some applications it makes sense to add MSB-radix-sort, + * as well as radix-select, radix-partial-sort, radix-get-permutation algorithms based on it. */ -/** Используется в качестве параметра шаблона. См. ниже. +/** Used as a template parameter. See below. */ struct RadixSortMallocAllocator { @@ -40,16 +40,16 @@ struct RadixSortMallocAllocator }; -/** Преобразование, которое переводит битовое представление ключа в такое целое беззнаковое число, - * что отношение порядка над ключами будет соответствовать отношению порядка над полученными беззнаковыми числами. - * Для float-ов это преобразование делает следующее: - * если выставлен знаковый бит, то переворачивает все остальные биты. 
- * При этом, NaN-ы оказываются больше всех нормальных чисел. +/** A transformation that transforms the bit representation of a key into an unsigned integer number, + * such that the order relation over the keys will match the order relation over the obtained unsigned numbers. + * For floats this conversion does the following: + * if the sign bit is set, it flips all other bits. + * In this case, NaN-s are bigger than all normal numbers. */ template struct RadixSortFloatTransform { - /// Стоит ли записывать результат в память, или лучше делать его каждый раз заново? + /// Is it worth writing the result in memory, or is it better to do calculation every time again? static constexpr bool transform_is_simple = false; static KeyBits forward(KeyBits x) @@ -67,24 +67,24 @@ struct RadixSortFloatTransform template struct RadixSortFloatTraits { - using Element = Float; /// Тип элемента. Это может быть структура с ключём и ещё каким-то payload-ом. Либо просто ключ. - using Key = Float; /// Ключ, по которому нужно сортировать. - using CountType = uint32_t; /// Тип для подсчёта гистограмм. В случае заведомо маленького количества элементов, может быть меньше чем size_t. + using Element = Float; /// The type of the element. It can be a structure with a key and some other payload. Or just a key. + using Key = Float; /// The key to sort by. + using CountType = uint32_t; /// Type for calculating histograms. In the case of a known small number of elements, it can be less than size_t. - /// Тип, в который переводится ключ, чтобы делать битовые операции. Это UInt такого же размера, как ключ. + /// The type to which the key is transformed to do bit operations. It is a UInt of the same size as the key. using KeyBits = typename std::conditional::type; - static constexpr size_t PART_SIZE_BITS = 8; /// Какими кусочками ключа в количестве бит делать один проход - перестановку массива.
+ static constexpr size_t PART_SIZE_BITS = 8; /// With what pieces of the key, in bits, to do one pass - reshuffle of the array. - /// Преобразования ключа в KeyBits такое, что отношение порядка над ключём соответствует отношению порядка над KeyBits. + /// Converting a key into KeyBits is such that the order relation over the key corresponds to the order relation over KeyBits. using Transform = RadixSortFloatTransform; - /// Объект с функциями allocate и deallocate. - /// Может быть использован, например, чтобы выделить память для временного массива на стеке. - /// Для этого сам аллокатор создаётся на стеке. + /// An object with the functions allocate and deallocate. + /// Can be used, for example, to allocate memory for a temporary array on the stack. + /// To do this, the allocator itself is created on the stack. using Allocator = RadixSortMallocAllocator; - /// Функция получения ключа из элемента массива. + /// The function to get the key from an array element. static Key & extractKey(Element & elem) { return elem; } }; @@ -95,7 +95,7 @@ struct RadixSortIdentityTransform static constexpr bool transform_is_simple = true; static KeyBits forward(KeyBits x) { return x; } - static KeyBits backward(KeyBits x) { return x; } + static KeyBits backward(KeyBits x) { return x; } }; @@ -105,7 +105,7 @@ struct RadixSortSignedTransform static constexpr bool transform_is_simple = true; static KeyBits forward(KeyBits x) { return x ^ (KeyBits(1) << (sizeof(KeyBits) * 8 - 1)); } - static KeyBits backward(KeyBits x) { return x ^ (KeyBits(1) << (sizeof(KeyBits) * 8 - 1)); } + static KeyBits backward(KeyBits x) { return x ^ (KeyBits(1) << (sizeof(KeyBits) * 8 - 1)); } }; @@ -122,7 +122,7 @@ struct RadixSortUIntTraits using Transform = RadixSortIdentityTransform; using Allocator = RadixSortMallocAllocator; - /// Функция получения ключа из элемента массива. + /// The function to get the key from an array element. 
static Key & extractKey(Element & elem) { return elem; } }; @@ -139,7 +139,7 @@ struct RadixSortIntTraits using Transform = RadixSortSignedTransform; using Allocator = RadixSortMallocAllocator; - /// Функция получения ключа из элемента массива. + /// The function to get the key from an array element. static Key & extractKey(Element & elem) { return elem; } }; @@ -150,7 +150,7 @@ struct RadixSort private: using Element = typename Traits::Element; using Key = typename Traits::Key; - using CountType = typename Traits::CountType; + using CountType = typename Traits::CountType; using KeyBits = typename Traits::KeyBits; static constexpr size_t HISTOGRAM_SIZE = 1 << Traits::PART_SIZE_BITS; @@ -172,19 +172,19 @@ private: public: static void execute(Element * arr, size_t size) { - /// Если массив имеет размер меньше 256, то лучше использовать другой алгоритм. + /// If the array is smaller than 256, then it is better to use another algorithm. - /// Здесь есть циклы по NUM_PASSES. Очень важно, что они разворачиваются в compile-time. + /// There are loops of NUM_PASSES. It is very important that they are unfolded at compile-time. - /// Для каждого из NUM_PASSES кусков бит ключа, считаем, сколько раз каждое значение этого куска встретилось. + /// For each of the NUM_PASSES bit ranges of the key, consider how many times each value of this bit range met. CountType histograms[HISTOGRAM_SIZE * NUM_PASSES] = {0}; typename Traits::Allocator allocator; - /// Будем делать несколько проходов по массиву. На каждом проходе, данные перекладываются в другой массив. Выделим этот временный массив. + /// We will do several passes through the array. On each pass, the data is transferred to another array. Let's allocate this temporary array. Element * swap_buffer = reinterpret_cast(allocator.allocate(size * sizeof(Element))); - /// Трансформируем массив и вычисляем гистограмму. + /// Transform the array and calculate the histogram. 
for (size_t i = 0; i < size; ++i) { if (!Traits::Transform::transform_is_simple) @@ -195,7 +195,7 @@ public: } { - /// Заменяем гистограммы на суммы с накоплением: значение в позиции i равно сумме в предыдущих позициях минус один. + /// Replace the histograms with the accumulated sums: the value in position i is the sum of the previous positions minus one. size_t sums[NUM_PASSES] = {0}; for (size_t i = 0; i < HISTOGRAM_SIZE; ++i) @@ -209,7 +209,7 @@ public: } } - /// Перекладываем элементы в порядке начиная от младшего куска бит, и далее делаем несколько проходов по количеству кусков. + /// Move the elements in the order starting from the least bit piece, and then do a few passes on the number of pieces. for (size_t j = 0; j < NUM_PASSES; ++j) { Element * writer = j % 2 ? arr : swap_buffer; @@ -219,17 +219,18 @@ public: { size_t pos = getPart(j, keyToBits(Traits::extractKey(reader[i]))); - /// Размещаем элемент на следующей свободной позиции. + /// Place the element on the next free position. auto & dest = writer[++histograms[j * HISTOGRAM_SIZE + pos]]; dest = reader[i]; - /// На последнем перекладывании, делаем обратную трансформацию. + /// On the last pass, we do the reverse transformation. if (!Traits::Transform::transform_is_simple && j == NUM_PASSES - 1) Traits::extractKey(dest) = bitsToKey(Traits::Transform::backward(keyToBits(Traits::extractKey(reader[i])))); } } - /// Если число проходов нечётное, то результирующий массив находится во временном буфере. Скопируем его на место исходного массива. + /// If the number of passes is odd, the result array is in a temporary buffer. Copy it to the place of the original array. + /// NOTE Sometimes it will be more optimal to provide non-destructive interface, that will not modify original array. 
if (NUM_PASSES % 2) memcpy(arr, swap_buffer, size * sizeof(Element)); diff --git a/dbms/src/Common/ShellCommand.h b/dbms/src/Common/ShellCommand.h index 21ccab08878..a558216fcbf 100644 --- a/dbms/src/Common/ShellCommand.h +++ b/dbms/src/Common/ShellCommand.h @@ -9,19 +9,19 @@ namespace DB { -/** Позволяет запустить команду, - * читать её stdout, stderr, писать в stdin, - * дождаться завершения. +/** Lets you run a command, + * read its stdout and stderr; write to stdin; + * wait for completion. * - * Реализация похожа на функцию popen из POSIX (посмотреть можно в исходниках libc). + * The implementation is similar to the popen function from POSIX (see libc source code). * - * Наиболее важное отличие: использует vfork вместо fork. - * Это сделано, потому что fork не работает (с ошибкой о нехватке памяти), - * при некоторых настройках overcommit-а, если размер адресного пространства процесса больше половины количества доступной памяти. - * Также, изменение memory map-ов - довольно ресурсоёмкая операция. + * The most important difference: uses vfork instead of fork. + * This is done because fork does not work (with a memory shortage error), + * with some overcommit settings, if the address space of the process is more than half the amount of available memory. + * Also, changing memory maps is a fairly resource-intensive operation. * - * Второе отличие - позволяет работать одновременно и с stdin, и с stdout, и с stderr запущенного процесса, - * а также узнать код и статус завершения. + * The second difference: it allows working simultaneously with stdin, stdout and stderr of the running process, + * and also to obtain the return code and completion status. */ class ShellCommand { @@ -34,20 +34,20 @@ private: static std::unique_ptr executeImpl(const char * filename, char * const argv[], bool pipe_stdin_only); public: - WriteBufferFromFile in; /// Если команда читает из stdin, то не забудьте вызвать in.close() после записи туда всех данных.
+ WriteBufferFromFile in; /// If the command reads from stdin, do not forget to call in.close() after writing all the data there. ReadBufferFromFile out; ReadBufferFromFile err; - /// Выполнить команду с использованием /bin/sh -c + /// Run the command using /bin/sh -c static std::unique_ptr execute(const std::string & command, bool pipe_stdin_only = false); - /// Выполнить исполняемый файл с указаннами аргументами. arguments - без argv[0]. + /// Run the executable with the specified arguments. `arguments` - without argv[0]. static std::unique_ptr executeDirect(const std::string & path, const std::vector & arguments); - /// Подождать завершения процесса, кинуть исключение, если код не 0 или если процесс был завершён не самостоятельно. + /// Wait for the process to end, throw an exception if the code is not 0 or if the process was not completed by itself. void wait(); - /// Подождать завершения процесса, узнать код возврата. Кинуть исключение, если процесс был завершён не самостоятельно. + /// Wait for the process to finish and get the return code. Throw an exception if the process was not completed by itself. int tryWait(); }; diff --git a/dbms/src/Common/SimpleCache.h b/dbms/src/Common/SimpleCache.h index ef37ff3fdb9..4de92baa9f5 100644 --- a/dbms/src/Common/SimpleCache.h +++ b/dbms/src/Common/SimpleCache.h @@ -6,13 +6,13 @@ #include -/** Простейший кэш для свободной функции. - * Можете также передать статический метод класса или лямбду без захвата. - * Размер неограничен. Значения не устаревают. - * Для синхронизации используется mutex. - * Подходит только для простейших случаев. +/** The simplest cache for a free function. + * You can also pass a static class method or lambda without captures. + * The size is unlimited. Values are stored permanently and never evicted. + * Mutex is used for synchronization. + * Suitable only for the simplest cases.
* - * Использование: + * Usage * * SimpleCache func_cached; * std::cerr << func_cached(args...); @@ -41,7 +41,7 @@ public: return it->second; } - /// Сами вычисления делаются не под mutex-ом. + /// The calculations themselves are not done under mutex. Result res = f(std::forward(args)...); { diff --git a/dbms/src/Common/SimpleIncrement.h b/dbms/src/Common/SimpleIncrement.h new file mode 100644 index 00000000000..29e0010b0fa --- /dev/null +++ b/dbms/src/Common/SimpleIncrement.h @@ -0,0 +1,24 @@ +#pragma once + +#include +#include + + +/** Is used for numbering of files. + */ +struct SimpleIncrement +{ + std::atomic value; + + SimpleIncrement(UInt64 start = 0) : value(start) {} + + void set(UInt64 new_value) + { + value = new_value; + } + + UInt64 get() + { + return ++value; + } +}; diff --git a/dbms/src/Common/SipHash.h b/dbms/src/Common/SipHash.h index 62b76ce5ce6..55b4574d851 100644 --- a/dbms/src/Common/SipHash.h +++ b/dbms/src/Common/SipHash.h @@ -1,57 +1,54 @@ #pragma once -/** SipHash - быстрая криптографическая хэш функция для коротких строк. - * Взято отсюда: https://www.131002.net/siphash/ +/** SipHash is a fast cryptographic hash function for short strings. + * Taken from here: https://www.131002.net/siphash/ * - * Сделано два изменения: - * - возвращает 128 бит, а не 64; - * - сделано потоковой (можно вычислять по частям). + * This is SipHash 2-4 variant. * - * На коротких строках (URL, поисковые фразы) более чем в 3 раза быстрее MD5 от OpenSSL. - * (~ 700 МБ/сек., 15 млн. строк в секунду) + * Two changes are made: + * - returns also 128 bits, not only 64; + * - done streaming (can be calculated in parts). + * + * On short strings (URL, search phrases) more than 3 times faster than MD5 from OpenSSL. 
+ * (~ 700 MB/sec, 15 million strings per second) */ -#include -#include -#include +#include -#define ROTL(x,b) static_cast( ((x) << (b)) | ( (x) >> (64 - (b))) ) +#define ROTL(x, b) static_cast(((x) << (b)) | ((x) >> (64 - (b)))) -#define SIPROUND \ - do \ - { \ - v0 += v1; v1=ROTL(v1,13); v1 ^= v0; v0=ROTL(v0,32); \ - v2 += v3; v3=ROTL(v3,16); v3 ^= v2; \ - v0 += v3; v3=ROTL(v3,21); v3 ^= v0; \ - v2 += v1; v1=ROTL(v1,17); v1 ^= v2; v2=ROTL(v2,32); \ +#define SIPROUND \ + do \ + { \ + v0 += v1; v1 = ROTL(v1, 13); v1 ^= v0; v0 = ROTL(v0, 32); \ + v2 += v3; v3 = ROTL(v3, 16); v3 ^= v2; \ + v0 += v3; v3 = ROTL(v3, 21); v3 ^= v0; \ + v2 += v1; v1 = ROTL(v1, 17); v1 ^= v2; v2 = ROTL(v2, 32); \ } while(0) class SipHash { private: - using u64 = DB::UInt64; - using u8 = DB::UInt8; + /// State. + UInt64 v0; + UInt64 v1; + UInt64 v2; + UInt64 v3; - /// Состояние. - u64 v0; - u64 v1; - u64 v2; - u64 v3; + /// How many bytes have been processed. + UInt64 cnt; - /// Сколько байт обработано. - u64 cnt; - - /// Текущие 8 байт входных данных. + /// The current 8 bytes of input data. union { - u64 current_word; - u8 current_bytes[8]; + UInt64 current_word; + UInt8 current_bytes[8]; }; void finalize() { - /// В последний свободный байт пишем остаток от деления длины на 256. + /// In the last free byte, we write the remainder of the division by 256. current_bytes[7] = cnt; v3 ^= current_word; @@ -67,10 +64,10 @@ private: } public: - /// Аргументы - seed. - SipHash(u64 k0 = 0, u64 k1 = 0) + /// Arguments - seed. + SipHash(UInt64 k0 = 0, UInt64 k1 = 0) { - /// Инициализируем состояние некоторыми случайными байтами и seed-ом. + /// Initialize the state with some random bytes and seed. 
v0 = 0x736f6d6570736575ULL ^ k0; v1 = 0x646f72616e646f6dULL ^ k1; v2 = 0x6c7967656e657261ULL ^ k0; @@ -80,11 +77,11 @@ public: current_word = 0; } - void update(const char * data, u64 size) + void update(const char * data, UInt64 size) { const char * end = data + size; - /// Дообработаем остаток от предыдущего апдейта, если есть. + /// We'll finish to process the remainder of the previous update, if any. if (cnt & 7) { while (cnt & 7 && data < end) @@ -94,7 +91,7 @@ public: ++cnt; } - /// Если всё ещё не хватает байт до восьмибайтового слова. + /// If we still do not have enough bytes to an 8-byte word. if (cnt & 7) return; @@ -108,7 +105,7 @@ public: while (data + 8 <= end) { - current_word = *reinterpret_cast(data); + current_word = *reinterpret_cast(data); v3 ^= current_word; SIPROUND; @@ -118,7 +115,7 @@ public: data += 8; } - /// Заполняем остаток, которого не хватает до восьмибайтового слова. + /// Pad the remainder, which is missing up to an 8-byte word. current_word = 0; switch (end - data) { @@ -133,23 +130,23 @@ public: } } - /// Получить результат в некотором виде. Это можно сделать только один раз! + /// Get the result in some form. This can only be done once! 
void get128(char * out) { finalize(); - reinterpret_cast(out)[0] = v0 ^ v1; - reinterpret_cast(out)[1] = v2 ^ v3; + reinterpret_cast(out)[0] = v0 ^ v1; + reinterpret_cast(out)[1] = v2 ^ v3; } - void get128(u64 & lo, u64 & hi) + void get128(UInt64 & lo, UInt64 & hi) { finalize(); lo = v0 ^ v1; hi = v2 ^ v3; } - u64 get64() + UInt64 get64() { finalize(); return v0 ^ v1 ^ v2 ^ v3; @@ -160,6 +157,7 @@ public: #undef ROTL #undef SIPROUND +#include inline void sipHash128(const char * data, const size_t size, char * out) { @@ -168,7 +166,7 @@ inline void sipHash128(const char * data, const size_t size, char * out) hash.get128(out); } -inline DB::UInt64 sipHash64(const char * data, const size_t size) +inline UInt64 sipHash64(const char * data, const size_t size) { SipHash hash; hash.update(data, size); @@ -177,7 +175,7 @@ inline DB::UInt64 sipHash64(const char * data, const size_t size) #include -inline DB::UInt64 sipHash64(const std::string & s) +inline UInt64 sipHash64(const std::string & s) { return sipHash64(s.data(), s.size()); } diff --git a/dbms/src/Common/SmallObjectPool.h b/dbms/src/Common/SmallObjectPool.h index f3002f6ebcb..e53a9234ae2 100644 --- a/dbms/src/Common/SmallObjectPool.h +++ b/dbms/src/Common/SmallObjectPool.h @@ -73,7 +73,7 @@ public: free_list = block; } - /// Размер выделенного пула в байтах + /// The size of the allocated pool in bytes size_t size() const { return pool.size(); diff --git a/dbms/src/Common/SpaceSaving.h b/dbms/src/Common/SpaceSaving.h new file mode 100644 index 00000000000..7483d098990 --- /dev/null +++ b/dbms/src/Common/SpaceSaving.h @@ -0,0 +1,288 @@ +#pragma once + +#include +#include + +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +/* + * Implementation of the Filtered Space-Saving for TopK streaming analysis. 
+ * http://www.l2f.inesc-id.pt/~fmmb/wiki/uploads/Work/misnis.ref0a.pdf + * It implements suggested reduce-and-combine algorithm from Parallel Space Saving: + * https://arxiv.org/pdf/1401.0702.pdf + */ + +namespace DB +{ + +template +< + typename TKey, + typename HashKey = TKey, + typename Hash = DefaultHash, + typename Grower = HashTableGrower<>, + typename Allocator = HashTableAllocator +> +class SpaceSaving +{ +private: + // Suggested constants in the paper "Finding top-k elements in data streams", chap 6. equation (24) + // Round to nearest power of 2 for cheaper binning without modulo + constexpr uint64_t nextAlphaSize (uint64_t x) + { + constexpr uint64_t ALPHA_MAP_ELEMENTS_PER_COUNTER = 6; + return 1ULL<<(sizeof(uint64_t) * 8 - __builtin_clzll(x * ALPHA_MAP_ELEMENTS_PER_COUNTER)); + } + +public: + using Self = SpaceSaving; + + struct Counter + { + Counter() {} + + Counter(const TKey & k, UInt64 c = 0, UInt64 e = 0, size_t h = 0) + : key(k), slot(0), hash(h), count(c), error(e) {} + + void write(WriteBuffer & wb) const + { + writeBinary(key, wb); + writeVarUInt(count, wb); + writeVarUInt(error, wb); + } + + void read(ReadBuffer & rb) + { + readBinary(key, rb); + readVarUInt(count, rb); + readVarUInt(error, rb); + } + + // greater() taking slot error into account + bool operator> (const Counter & b) const + { + return (count > b.count) || (count == b.count && error < b.error); + } + + TKey key; + size_t slot, hash; + UInt64 count; + UInt64 error; + }; + + SpaceSaving(size_t c = 10) : alpha_map(nextAlphaSize(c)), m_capacity(c) {} + ~SpaceSaving() { destroyElements(); } + + inline size_t size() const + { + return counter_list.size(); + } + + inline size_t capacity() const + { + return m_capacity; + } + + void resize(size_t new_capacity) + { + counter_list.reserve(new_capacity); + alpha_map.resize(nextAlphaSize(new_capacity)); + m_capacity = new_capacity; + } + + void insert(const TKey & key, UInt64 increment = 1, UInt64 error = 0) + { + // Increase weight of a 
key that already exists + // It uses hashtable for both value mapping as a presence test (c_i != 0) + auto hash = counter_map.hash(key); + auto it = counter_map.find(key, hash); + if (it != counter_map.end()) + { + auto c = it->second; + c->count += increment; + c->error += error; + percolate(c); + return; + } + // Key doesn't exist, but can fit in the top K + else if (unlikely(size() < capacity())) + { + auto c = new Counter(key, increment, error, hash); + push(c); + return; + } + + auto min = counter_list.back(); + const size_t alpha_mask = alpha_map.size() - 1; + auto & alpha = alpha_map[hash & alpha_mask]; + if (alpha + increment < min->count) + { + alpha += increment; + return; + } + + // Erase the current minimum element + alpha_map[min->hash & alpha_mask] = min->count; + it = counter_map.find(min->key, min->hash); + + // Replace minimum with newly inserted element + if (it != counter_map.end()) + { + min->hash = hash; + min->key = key; + min->count = alpha + increment; + min->error = alpha + error; + percolate(min); + + it->second = min; + it->first = key; + counter_map.reinsert(it, hash); + } + } + + /* + * Parallel Space Saving reduction and combine step from: + * https://arxiv.org/pdf/1401.0702.pdf + */ + void merge(const Self & rhs) + { + UInt64 m1 = 0; + UInt64 m2 = 0; + + if (size() == capacity()) + { + m1 = counter_list.back()->count; + } + + if (rhs.size() == rhs.capacity()) + { + m2 = rhs.counter_list.back()->count; + } + + /* + * Updated algorithm to mutate current table in place + * without mutating rhs table or creating new one + * in the first step we expect that no elements overlap + * and in the second sweep we correct the error if they do. 
+ */ + if (m2 > 0) + { + for (auto counter : counter_list) + { + counter->count += m2; + counter->error += m2; + } + } + + // The list is sorted in descending order, we have to scan in reverse + for (auto counter : boost::adaptors::reverse(rhs.counter_list)) + { + if (counter_map.find(counter->key) != counter_map.end()) + { + // Subtract m2 previously added, guaranteed not negative + insert(counter->key, counter->count - m2, counter->error - m2); + } + else + { + // Counters not monitored in S1 + insert(counter->key, counter->count + m1, counter->error + m1); + } + } + } + + std::vector topK(size_t k) const + { + std::vector res; + for (auto counter : counter_list) + { + res.push_back(*counter); + if (res.size() == k) + break; + } + return res; + } + + void write(WriteBuffer & wb) const + { + writeVarUInt(size(), wb); + for (auto counter : counter_list) + counter->write(wb); + for (auto alpha : alpha_map) + writeVarUInt(alpha, wb); + } + + void read(ReadBuffer & rb) + { + destroyElements(); + size_t count = 0; + readVarUInt(count, rb); + + for (size_t i = 0; i < count; ++i) + { + auto counter = new Counter(); + counter->read(rb); + counter->hash = counter_map.hash(counter->key); + push(counter); + } + + for (size_t i = 0; i < nextAlphaSize(m_capacity); ++i) + { + UInt64 alpha = 0; + readVarUInt(alpha, rb); + alpha_map.push_back(alpha); + } + } + +protected: + void push(Counter * counter) + { + counter->slot = counter_list.size(); + counter_list.push_back(counter); + counter_map[counter->key] = counter; + percolate(counter); + } + + // This is equivalent to one step of bubble sort + void percolate(Counter * counter) + { + while (counter->slot > 0) + { + auto next = counter_list[counter->slot - 1]; + if (*counter > *next) + { + std::swap(next->slot, counter->slot); + std::swap(counter_list[next->slot], counter_list[counter->slot]); + } + else + break; + } + } + +private: + void destroyElements() + { + for (auto counter : counter_list) + delete counter; + +
counter_map.clear(); + counter_list.clear(); + alpha_map.clear(); + } + + HashMap counter_map; + std::vector counter_list; + std::vector alpha_map; + size_t m_capacity; +}; + +}; diff --git a/dbms/src/Common/StackTrace.h b/dbms/src/Common/StackTrace.h index 68822dfc019..3ac4ddb9354 100644 --- a/dbms/src/Common/StackTrace.h +++ b/dbms/src/Common/StackTrace.h @@ -6,14 +6,14 @@ #define STACK_TRACE_MAX_DEPTH 32 -/// Позволяет получить стек-трейс +/// Lets you get a stacktrace class StackTrace { public: - /// Стектрейс снимается в момент создания объекта + /// The stacktrace is captured when the object is created StackTrace(); - /// Вывести в строку + /// Print to string std::string toString() const; private: diff --git a/dbms/src/Common/StringSearcher.h b/dbms/src/Common/StringSearcher.h index 00edba6fe47..ba1947f515c 100644 --- a/dbms/src/Common/StringSearcher.h +++ b/dbms/src/Common/StringSearcher.h @@ -19,15 +19,14 @@ namespace DB { - namespace ErrorCodes { extern const int UNSUPPORTED_PARAMETER; } -/** Варианты поиска подстроки в строке. - * В большинстве случаев, менее производительные, чем Volnitsky (см. Volnitsky.h). +/** Variants for searching a substring in a string. + * In most cases, performance is less than Volnitsky (see Volnitsky.h). 
*/ @@ -37,7 +36,7 @@ struct StringSearcherBase static constexpr auto n = sizeof(__m128i); const int page_size = getpagesize(); - bool page_safe(const void * const ptr) const + bool pageSafe(const void * const ptr) const { return ((page_size - 1) & reinterpret_cast(ptr)) <= page_size - n; } @@ -55,7 +54,7 @@ class StringSearcher : private StringSearcherBase private: using UTF8SequenceBuffer = UInt8[6]; - /// string to be searched for + /// substring to be searched for const UInt8 * const needle; const std::size_t needle_size; const UInt8 * const needle_end = needle + needle_size; @@ -135,8 +134,7 @@ public: if (!(dst_l_len == dst_u_len && dst_u_len == src_len)) throw DB::Exception{ "UTF8 sequences with different lowercase and uppercase lengths are not supported", - DB::ErrorCodes::UNSUPPORTED_PARAMETER - }; + DB::ErrorCodes::UNSUPPORTED_PARAMETER}; cache_actual_len += src_len; if (cache_actual_len < n) @@ -165,7 +163,7 @@ public: static const Poco::UTF8Encoding utf8; #if __SSE4_1__ - if (page_safe(pos)) + if (pageSafe(pos)) { const auto v_haystack = _mm_loadu_si128(reinterpret_cast(pos)); const auto v_against_l = _mm_cmpeq_epi8(v_haystack, cachel); @@ -230,7 +228,7 @@ public: while (haystack < haystack_end) { #if __SSE4_1__ - if (haystack + n <= haystack_end && page_safe(haystack)) + if (haystack + n <= haystack_end && pageSafe(haystack)) { const auto v_haystack = _mm_loadu_si128(reinterpret_cast(haystack)); const auto v_against_l = _mm_cmpeq_epi8(v_haystack, patl); @@ -249,7 +247,7 @@ public: const auto offset = __builtin_ctz(mask); haystack += offset; - if (haystack < haystack_end && haystack + n <= haystack_end && page_safe(haystack)) + if (haystack < haystack_end && haystack + n <= haystack_end && pageSafe(haystack)) { const auto v_haystack = _mm_loadu_si128(reinterpret_cast(haystack)); const auto v_against_l = _mm_cmpeq_epi8(v_haystack, cachel); @@ -377,7 +375,7 @@ public: bool compare(const UInt8 * pos) const { #if __SSE4_1__ - if (page_safe(pos)) + if 
(pageSafe(pos)) { const auto v_haystack = _mm_loadu_si128(reinterpret_cast(pos)); const auto v_against_l = _mm_cmpeq_epi8(v_haystack, cachel); @@ -429,7 +427,7 @@ public: while (haystack < haystack_end) { #if __SSE4_1__ - if (haystack + n <= haystack_end && page_safe(haystack)) + if (haystack + n <= haystack_end && pageSafe(haystack)) { const auto v_haystack = _mm_loadu_si128(reinterpret_cast(haystack)); const auto v_against_l = _mm_cmpeq_epi8(v_haystack, patl); @@ -447,7 +445,7 @@ public: const auto offset = __builtin_ctz(mask); haystack += offset; - if (haystack < haystack_end && haystack + n <= haystack_end && page_safe(haystack)) + if (haystack < haystack_end && haystack + n <= haystack_end && pageSafe(haystack)) { const auto v_haystack = _mm_loadu_si128(reinterpret_cast(haystack)); const auto v_against_l = _mm_cmpeq_epi8(v_haystack, cachel); @@ -559,7 +557,7 @@ public: bool compare(const UInt8 * pos) const { #if __SSE4_1__ - if (page_safe(pos)) + if (pageSafe(pos)) { const auto v_haystack = _mm_loadu_si128(reinterpret_cast(pos)); const auto v_against_cache = _mm_cmpeq_epi8(v_haystack, cache); @@ -609,7 +607,7 @@ public: while (haystack < haystack_end) { #if __SSE4_1__ - if (haystack + n <= haystack_end && page_safe(haystack)) + if (haystack + n <= haystack_end && pageSafe(haystack)) { /// find first character const auto v_haystack = _mm_loadu_si128(reinterpret_cast(haystack)); @@ -627,7 +625,7 @@ public: const auto offset = __builtin_ctz(mask); haystack += offset; - if (haystack < haystack_end && haystack + n <= haystack_end && page_safe(haystack)) + if (haystack < haystack_end && haystack + n <= haystack_end && pageSafe(haystack)) { /// check for first 16 octets const auto v_haystack = _mm_loadu_si128(reinterpret_cast(haystack)); @@ -693,10 +691,10 @@ using UTF8CaseSensitiveStringSearcher = StringSearcher; using UTF8CaseInsensitiveStringSearcher = StringSearcher; -/** Используют функции из libc. 
- * Имеет смысл использовать для коротких строк, когда требуется дешёвая инициализация. - * Нет варианта для регистронезависимого поиска UTF-8 строк. - * Требуется, чтобы за концом строк был нулевой байт. +/** Uses functions from libc. + * It makes sense to use only with short haystacks when cheap initialization is required. + * There is no option for case-insensitive search for UTF-8 strings. + * It is required that strings are zero-terminated. */ struct LibCASCIICaseSensitiveStringSearcher diff --git a/dbms/src/Common/StringUtils.h b/dbms/src/Common/StringUtils.h index 69cd0336cf8..c0ff2a9b6ea 100644 --- a/dbms/src/Common/StringUtils.h +++ b/dbms/src/Common/StringUtils.h @@ -101,6 +101,12 @@ inline bool isWordCharASCII(char c) || c == '_'; } +inline bool isValidIdentifierBegin(char c) +{ + return isAlphaASCII(c) + || c == '_'; +} + inline bool isWhitespaceASCII(char c) { return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == '\v'; diff --git a/dbms/src/Common/Throttler.h b/dbms/src/Common/Throttler.h index 6cd0175022b..0b242b25110 100644 --- a/dbms/src/Common/Throttler.h +++ b/dbms/src/Common/Throttler.h @@ -1,11 +1,13 @@ #pragma once +#include /// nanosleep #include #include #include #include #include + namespace DB { @@ -15,12 +17,12 @@ namespace ErrorCodes } -/** Позволяет ограничить скорость чего либо (в штуках в секунду) с помощью sleep. - * Особенности работы: - * - считается только средняя скорость, от момента первого вызова функции add; - * если были периоды с низкой скоростью, то в течение промежутка времени после них, скорость будет выше; +/** Allows you to limit the speed of something (in entities per second) using sleep. + * Specifics of work: + * - only the average speed is considered, from the moment of the first call of `add` function; + * if there were periods with low speed, then during some time after them, the speed will be higher; * - * Также позволяет задать ограничение на максимальное количество в штуках. 
При превышении кидается исключение. + * Also allows you to set a limit on the maximum number of entities. If exceeded, an exception will be thrown. */ class Throttler { @@ -56,7 +58,7 @@ public: if (max_speed) { - /// Сколько должно было бы пройти времени, если бы скорость была равна max_speed. + /// How much time to wait for the average speed to become `max_speed`. UInt64 desired_ns = new_count * 1000000000 / max_speed; if (desired_ns > elapsed_ns) @@ -65,7 +67,7 @@ public: timespec sleep_ts; sleep_ts.tv_sec = sleep_ns / 1000000000; sleep_ts.tv_nsec = sleep_ns % 1000000000; - nanosleep(&sleep_ts, nullptr); /// NOTE Завершается раньше в случае сигнала. Это считается нормальным. + nanosleep(&sleep_ts, nullptr); /// NOTE Returns early in case of a signal. This is considered normal. } } } @@ -73,7 +75,7 @@ public: private: size_t max_speed = 0; size_t count = 0; - size_t limit = 0; /// 0 - не ограничено. + size_t limit = 0; /// 0 - not limited. const char * limit_exceeded_exception_message = nullptr; Stopwatch watch {CLOCK_MONOTONIC_COARSE}; std::mutex mutex; diff --git a/dbms/src/Common/UInt128.h b/dbms/src/Common/UInt128.h index 3df43e0a8f5..2b46af6f4a3 100644 --- a/dbms/src/Common/UInt128.h +++ b/dbms/src/Common/UInt128.h @@ -4,12 +4,16 @@ #include #include +#if __SSE4_2__ +#include +#endif + namespace DB { -/// Для агрегации по SipHash или конкатенации нескольких полей. +/// For aggregation by SipHash or concatenation of several fields. 
struct UInt128 { /// Suppress gcc7 warnings: 'prev_key.DB::UInt128::first' may be used uninitialized in this function @@ -42,22 +46,22 @@ struct UInt128Hash } }; -#if defined(__x86_64__) +#if __SSE4_2__ struct UInt128HashCRC32 { size_t operator()(UInt128 x) const { UInt64 crc = -1ULL; - asm("crc32q %[x], %[crc]\n" : [crc] "+r" (crc) : [x] "rm" (x.first)); - asm("crc32q %[x], %[crc]\n" : [crc] "+r" (crc) : [x] "rm" (x.second)); + crc = _mm_crc32_u64(crc, x.first); + crc = _mm_crc32_u64(crc, x.second); return crc; } }; #else -/// На других платформах используем не обязательно CRC32. NOTE Это может сбить с толку. +/// On other platforms we do not use CRC32. NOTE This can be confusing. struct UInt128HashCRC32 : public UInt128Hash {}; #endif @@ -71,7 +75,7 @@ inline void readBinary(UInt128 & x, ReadBuffer & buf) { readPODBinary(x, buf); } inline void writeBinary(const UInt128 & x, WriteBuffer & buf) { writePODBinary(x, buf); } -/** Используется при агрегации, для укладки большого количества ключей постоянной длины в хэш-таблицу. +/** Used for aggregation, for putting a large number of constant-length keys in a hash table. */ struct UInt256 { @@ -91,7 +95,7 @@ struct UInt256 { return a == rhs.a && b == rhs.b && c == rhs.c && d == rhs.d; - /* Так получается не лучше. + /* So it's no better. 
return 0xFFFF == _mm_movemask_epi8(_mm_and_si128( _mm_cmpeq_epi8( _mm_loadu_si128(reinterpret_cast(&a)), @@ -122,30 +126,30 @@ struct UInt256Hash } }; -#if defined(__x86_64__) +#if __SSE4_2__ struct UInt256HashCRC32 { size_t operator()(UInt256 x) const { UInt64 crc = -1ULL; - asm("crc32q %[x], %[crc]\n" : [crc] "+r" (crc) : [x] "rm" (x.a)); - asm("crc32q %[x], %[crc]\n" : [crc] "+r" (crc) : [x] "rm" (x.b)); - asm("crc32q %[x], %[crc]\n" : [crc] "+r" (crc) : [x] "rm" (x.c)); - asm("crc32q %[x], %[crc]\n" : [crc] "+r" (crc) : [x] "rm" (x.d)); + crc = _mm_crc32_u64(crc, x.a); + crc = _mm_crc32_u64(crc, x.b); + crc = _mm_crc32_u64(crc, x.c); + crc = _mm_crc32_u64(crc, x.d); return crc; } }; #else -/// На других платформах используем не обязательно CRC32. NOTE Это может сбить с толку. +/// We do not need to use CRC32 on other platforms. NOTE This can be confusing. struct UInt256HashCRC32 { DefaultHash hash64; size_t operator()(UInt256 x) const { - /// TODO Это не оптимально. + /// TODO This is not optimal. return hash64(hash64(hash64(hash64(x.a) ^ x.b) ^ x.c) ^ x.d); } }; diff --git a/dbms/src/Common/UnicodeBar.h b/dbms/src/Common/UnicodeBar.h index 6182784fef7..beee179ea1d 100644 --- a/dbms/src/Common/UnicodeBar.h +++ b/dbms/src/Common/UnicodeBar.h @@ -8,7 +8,7 @@ #define UNICODE_BAR_CHAR_SIZE (strlen("█")) -/** Позволяет нарисовать unicode-art полоску, ширина которой отображается с разрешением 1/8 символа. +/** Allows you to draw a unicode-art bar whose width is displayed with a resolution of 1/8 character. */ @@ -32,7 +32,7 @@ namespace UnicodeBar return ceil(width - 1.0 / 8) * UNICODE_BAR_CHAR_SIZE; } - /// В dst должно быть место для barWidthInBytes(width) символов и завершающего нуля. + /// In `dst` there must be a space for barWidthInBytes(width) characters and a trailing zero. 
inline void render(double width, char * dst) { size_t floor_width = floor(width); diff --git a/dbms/src/Common/VirtualColumnUtils.h b/dbms/src/Common/VirtualColumnUtils.h index eb1cc5106b7..b70245f0333 100644 --- a/dbms/src/Common/VirtualColumnUtils.h +++ b/dbms/src/Common/VirtualColumnUtils.h @@ -16,23 +16,23 @@ class Context; namespace VirtualColumnUtils { -/// Вычислить минимальный числовый суффикс, который надо добавить к строке, чтобы она не присутствовала в множестве +/// Calculate the minimum numeric suffix to add to the string so that it is not present in the set String chooseSuffix(const NamesAndTypesList & columns, const String & name); -/// Вычислить минимальный общий числовый суффикс, который надо добавить к каждой строке, -/// чтобы ни одна не присутствовала в множестве. +/// Calculate the minimum common numeric suffix to add to each string, +/// so that none of them is present in the set. String chooseSuffixForSet(const NamesAndTypesList & columns, const std::vector & names); -/// Добавляет в селект запрос секцию select column_name as value -/// Например select _port as 9000. +/// Adds a section `select column_name as value` to the select query. +/// For example select _port as 9000. void rewriteEntityInAst(ASTPtr ast, const String & column_name, const Field & value); -/// Оставить в блоке только строки, подходящие под секции WHERE и PREWHERE запроса. -/// Рассматриваются только элементы внешней конъюнкции, зависящие только от столбцов, присутствующих в блоке. -/// Возвращает true, если хоть одна строка выброшена. +/// Leave in the block only the rows that satisfy the WHERE and PREWHERE clauses of the query. +/// Only those elements of the outer conjunction that depend solely on columns present in the block are considered. +/// Returns true if at least one row is discarded.
bool filterBlockWithQuery(ASTPtr query, Block & block, const Context & context); -/// Извлечь из входного потока множество значений столбца name +/// Extract from the input stream a set of `name` column values template std::multiset extractSingleValueFromBlock(const Block & block, const String & name) { diff --git a/dbms/src/Common/Volnitsky.h b/dbms/src/Common/Volnitsky.h index f0baf9ec321..e1fda9f0bb0 100644 --- a/dbms/src/Common/Volnitsky.h +++ b/dbms/src/Common/Volnitsky.h @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -9,24 +10,24 @@ #include -/** Поиск подстроки в строке по алгоритму Вольницкого: +/** Search for a substring in a string by Volnitsky's algorithm * http://volnitsky.com/project/str_search/ * - * haystack и needle могут содержать нулевые байты. + * `haystack` and `needle` can contain zero bytes. * - * Алгоритм: - * - при слишком маленьком или слишком большом размере needle, или слишком маленьком haystack, используем std::search или memchr; - * - при инициализации, заполняем open-addressing linear probing хэш-таблицу вида: - * хэш от биграммы из needle -> позиция этой биграммы в needle + 1. 
- * (прибавлена единица только чтобы отличить смещение ноль от пустой ячейки) - * - в хэш-таблице ключи не хранятся, хранятся только значения; - * - биграммы могут быть вставлены несколько раз, если они встречаются в needle несколько раз; - * - при поиске, берём из haystack биграмму, которая должна соответствовать последней биграмме needle (сравниваем с конца); - * - ищем её в хэш-таблице, если нашли - достаём смещение из хэш-таблицы и сравниваем строку побайтово; - * - если сравнить не получилось - проверяем следующую ячейку хэш-таблицы из цепочки разрешения коллизий; - * - если не нашли, пропускаем в haystack почти размер needle байт; + * Algorithm: + * - if the `needle` is too small or too large, or too small `haystack`, use std::search or memchr; + * - when initializing, fill in an open-addressing linear probing hash table of the form + * hash from the bigram of needle -> the position of this bigram in needle + 1. + * (one is added only to distinguish zero offset from an empty cell) + * - the keys are not stored in the hash table, only the values are stored; + * - bigrams can be inserted several times if they occur in the needle several times; + * - when searching, take from haystack bigram, which should correspond to the last bigram of needle (comparing from the end); + * - look for it in the hash table, if found - get the offset from the hash table and compare the string bytewise; + * - if it did not match, we check the next cell of the hash table from the collision resolution chain; + * - if not found, skip to haystack almost the size of the needle bytes; * - * Используется невыровненный доступ к памяти. + * Unaligned memory access is used. */ @@ -39,34 +40,35 @@ template class VolnitskyBase { protected: - using offset_t = uint8_t; /// Смещение в needle. Для основного алгоритма, длина needle не должна быть больше 255. - using ngram_t = uint16_t; /// n-грамма (2 байта). + using Offset = UInt8; /// Offset in the needle. 
For the basic algorithm, the length of the needle must not be greater than 255. + using Ngram = UInt16; /// n-gram (2 bytes). const UInt8 * const needle; const size_t needle_size; const UInt8 * const needle_end = needle + needle_size; - /// На сколько двигаемся, если n-грамма из haystack не нашлась в хэш-таблице. - const size_t step = needle_size - sizeof(ngram_t) + 1; + /// For how long we move, if the n-gram from haystack is not found in the hash table. + const size_t step = needle_size - sizeof(Ngram) + 1; /** max needle length is 255, max distinct ngrams for case-sensitive is (255 - 1), case-insensitive is 4 * (255 - 1) - * storage of 64K ngrams (n = 2, 128 KB) should be large enough for both cases */ - static const size_t hash_size = 64 * 1024; /// Помещается в L2-кэш. - offset_t hash[hash_size]; /// Хэш-таблица. + * storage of 64K ngrams (n = 2, 128 KB) should be large enough for both cases */ + static const size_t hash_size = 64 * 1024; /// Fits into the L2 cache (of common Intel CPUs). + Offset hash[hash_size]; /// Hash table. /// min haystack size to use main algorithm instead of fallback static constexpr auto min_haystack_size_for_algorithm = 20000; - const bool fallback; /// Нужно ли использовать fallback алгоритм. + const bool fallback; /// Do we need to use the fallback algorithm. public: - /** haystack_size_hint - ожидаемый суммарный размер haystack при вызовах search. Можно не указывать. - * Если указать его достаточно маленьким, то будет использован fallback алгоритм, - * так как считается, что тратить время на инициализацию хэш-таблицы не имеет смысла. + /** haystack_size_hint - the expected total size of the haystack for `search` calls. Optional (zero means unspecified). + * If you specify it small enough, the fallback algorithm will be used, + * since it is considered that it's useless to waste time initializing the hash table. 
*/ VolnitskyBase(const char * const needle, const size_t needle_size, size_t haystack_size_hint = 0) : needle{reinterpret_cast(needle)}, needle_size{needle_size}, fallback{ - needle_size < 2 * sizeof(ngram_t) || needle_size >= std::numeric_limits::max() || - (haystack_size_hint && haystack_size_hint < min_haystack_size_for_algorithm)} + needle_size < 2 * sizeof(Ngram) + || needle_size >= std::numeric_limits::max() + || (haystack_size_hint && haystack_size_hint < min_haystack_size_for_algorithm)} { if (fallback) return; @@ -74,12 +76,12 @@ public: memset(hash, 0, sizeof(hash)); /// int is used here because unsigned can't be used with condition like `i >= 0`, unsigned always >= 0 - for (auto i = static_cast(needle_size - sizeof(ngram_t)); i >= 0; --i) + for (auto i = static_cast(needle_size - sizeof(Ngram)); i >= 0; --i) self().putNGram(this->needle + i, i + 1, this->needle); } - /// Если не найдено - возвращается конец haystack. + /// If not found, the end of the haystack is returned. const UInt8 * search(const UInt8 * const haystack, const size_t haystack_size) const { if (needle_size == 0) @@ -90,15 +92,15 @@ public: if (needle_size == 1 || fallback || haystack_size <= needle_size) return self().search_fallback(haystack, haystack_end); - /// Будем "прикладывать" needle к haystack и сравнивать n-грам из конца needle. - const auto * pos = haystack + needle_size - sizeof(ngram_t); + /// Let's "apply" the needle to the haystack and compare the n-gram from the end of the needle. + const auto * pos = haystack + needle_size - sizeof(Ngram); for (; pos <= haystack_end - needle_size; pos += step) { - /// Смотрим все ячейки хэш-таблицы, которые могут соответствовать n-граму из haystack. + /// We look at all the cells of the hash table that can correspond to the n-gram from haystack. for (size_t cell_num = toNGram(pos) % hash_size; hash[cell_num]; cell_num = (cell_num + 1) % hash_size) { - /// Когда нашли - сравниваем побайтово, используя смещение из хэш-таблицы. 
+ /// When found - compare bytewise, using the offset from the hash table. const auto res = pos - (hash[cell_num] - 1); if (self().compare(res)) @@ -106,7 +108,7 @@ public: } } - /// Оставшийся хвостик. + /// The remaining tail. return self().search_fallback(pos - step + 1, haystack_end); } @@ -119,18 +121,18 @@ protected: CRTP & self() { return static_cast(*this); } const CRTP & self() const { return const_cast(this)->self(); } - static const ngram_t & toNGram(const UInt8 * const pos) + static const Ngram & toNGram(const UInt8 * const pos) { - return *reinterpret_cast(pos); + return *reinterpret_cast(pos); } - void putNGramBase(const ngram_t ngram, const int offset) + void putNGramBase(const Ngram ngram, const int offset) { - /// Кладём смещение для n-грама в соответствующую ему ячейку или ближайшую свободную. + /// Put the offset for the n-gram in the corresponding cell or the nearest free cell. size_t cell_num = ngram % hash_size; while (hash[cell_num]) - cell_num = (cell_num + 1) % hash_size; /// Поиск следующей свободной ячейки. + cell_num = (cell_num + 1) % hash_size; /// Search for the next free cell. hash[cell_num] = offset; } @@ -145,7 +147,7 @@ protected: union { - ngram_t n; + Ngram n; Chars chars; }; @@ -260,7 +262,7 @@ template <> struct VolnitskyImpl : VolnitskyBase struct VolnitskyImpl : VolnitskyBase struct VolnitskyImpl : VolnitskyBase struct VolnitskyImpl : VolnitskyBase -/// Выводит переданный размер в байтах в виде 123.45 GiB. +/// Displays the passed size in bytes as 123.45 GiB. void formatReadableSizeWithBinarySuffix(double value, DB::WriteBuffer & out, int precision = 2); std::string formatReadableSizeWithBinarySuffix(double value, int precision = 2); -/// Выводит переданный размер в байтах в виде 132.55 GB. +/// Displays the passed size in bytes as 132.55 GB. 
void formatReadableSizeWithDecimalSuffix(double value, DB::WriteBuffer & out, int precision = 2); std::string formatReadableSizeWithDecimalSuffix(double value, int precision = 2); -/// Выводит число в виде 123.45 billion. +/// Prints the number as 123.45 billion. void formatReadableQuantity(double value, DB::WriteBuffer & out, int precision = 2); std::string formatReadableQuantity(double value, int precision = 2); diff --git a/dbms/src/Common/getFQDNOrHostName.h b/dbms/src/Common/getFQDNOrHostName.h index 7e1c1b43040..a4367a72622 100644 --- a/dbms/src/Common/getFQDNOrHostName.h +++ b/dbms/src/Common/getFQDNOrHostName.h @@ -2,7 +2,7 @@ #include -/** Получить FQDN для локального сервера путём DNS-резолвинга hostname - аналогично вызову утилиты hostname с флагом -f. - * Если не получилось отрезолвить, то вернуть hostname - аналогично вызову утилиты hostname без флагов или uname -n. +/** Get the FQDN for the local server by resolving DNS hostname - similar to calling the 'hostname' tool with the -f flag. + * If it does not work, return hostname - similar to calling 'hostname' without flags or 'uname -n'. 
*/ const std::string & getFQDNOrHostName(); diff --git a/dbms/src/Common/iostream_debug_helpers.cpp b/dbms/src/Common/iostream_debug_helpers.cpp new file mode 100644 index 00000000000..d4cded2fab7 --- /dev/null +++ b/dbms/src/Common/iostream_debug_helpers.cpp @@ -0,0 +1,81 @@ +#include "iostream_debug_helpers.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +std::ostream & operator<<(std::ostream & stream, const DB::IBlockInputStream & what) +{ + stream << "IBlockInputStream(id = " << what.getID() << ", name = " << what.getName() << ")"; + //what.dumpTree(stream); // todo: set const + return stream; +} + +std::ostream & operator<<(std::ostream & stream, const DB::Field & what) +{ + stream << "Field(type = " << what.getTypeName() << ")"; + return stream; +} + +std::ostream & operator<<(std::ostream & stream, const DB::NameAndTypePair & what) +{ + stream << "NameAndTypePair(name = " << what.name << ", type = " << what.type << ")"; + return stream; +} + +std::ostream & operator<<(std::ostream & stream, const DB::IDataType & what) +{ + stream << "IDataType(name = " << what.getName() << ", default = " << what.getDefault() << ", isNullable = " << what.isNullable() + << ", isNumeric = " << what.isNumeric() << ", behavesAsNumber = " << what.behavesAsNumber() << ")"; + return stream; +} + +std::ostream & operator<<(std::ostream & stream, const DB::IStorage & what) +{ + stream << "IStorage(name = " << what.getName() << ", tableName = " << what.getTableName() << ") {" + << what.getColumnsList().toString() + << "}"; + // isRemote supportsSampling supportsFinal supportsPrewhere supportsParallelReplicas + return stream; +} + +std::ostream & operator<<(std::ostream & stream, const DB::TableStructureReadLock & what) +{ + stream << "TableStructureReadLock()"; + return stream; +} + +std::ostream & operator<<(std::ostream & stream, const DB::IFunction & what) +{ + stream << "IFunction(name = " << what.getName() << ", variadic = " << 
what.isVariadic() << ", args = " << what.getNumberOfArguments() + << ")"; + return stream; +} + +std::ostream & operator<<(std::ostream & stream, const DB::Block & what) +{ + stream << "Block(" + << "size = " << what.getColumns().size() + << ")"; + return stream; +} + +std::ostream & operator<<(std::ostream & stream, const DB::ColumnWithTypeAndName & what) +{ + stream << "ColumnWithTypeAndName(name = " << what.name << ", type = " << what.type << ", column = " << what.column << ")"; + return stream; +} + +std::ostream & operator<<(std::ostream & stream, const DB::IColumn & what) +{ + stream << "IColumn(name = " << what.getName() + // TODO: maybe many flags here + << ")"; + return stream; +} diff --git a/dbms/src/Common/iostream_debug_helpers.h b/dbms/src/Common/iostream_debug_helpers.h new file mode 100644 index 00000000000..c2cf913273d --- /dev/null +++ b/dbms/src/Common/iostream_debug_helpers.h @@ -0,0 +1,37 @@ +#pragma once +#include + + +namespace DB { class IBlockInputStream; } +std::ostream & operator<<(std::ostream & stream, const DB::IBlockInputStream & what); + +namespace DB { class Field; } +std::ostream & operator<<(std::ostream & stream, const DB::Field & what); + +namespace DB { struct NameAndTypePair; } +std::ostream & operator<<(std::ostream & stream, const DB::NameAndTypePair & what); + +namespace DB { class IDataType; } +std::ostream & operator<<(std::ostream & stream, const DB::IDataType & what); + +namespace DB { class IStorage; } +std::ostream & operator<<(std::ostream & stream, const DB::IStorage & what); + +namespace DB { class TableStructureReadLock; } +std::ostream & operator<<(std::ostream & stream, const DB::TableStructureReadLock & what); + +namespace DB { class IFunction; } +std::ostream & operator<<(std::ostream & stream, const DB::IFunction & what); + +namespace DB { class Block; } +std::ostream & operator<<(std::ostream & stream, const DB::Block & what); + +namespace DB { struct ColumnWithTypeAndName; } +std::ostream & 
operator<<(std::ostream & stream, const DB::ColumnWithTypeAndName & what); + +namespace DB { class IColumn; } +std::ostream & operator<<(std::ostream & stream, const DB::IColumn & what); + + +/// some operator<< should be declared before operator<<(... std::shared_ptr<>) +#include diff --git a/dbms/src/Common/isLocalAddress.h b/dbms/src/Common/isLocalAddress.h index 3bbc72b26db..d63b42dbb97 100644 --- a/dbms/src/Common/isLocalAddress.h +++ b/dbms/src/Common/isLocalAddress.h @@ -12,13 +12,13 @@ namespace Poco namespace DB { - /** Позволяет проверить, похож ли адрес на localhost. - * Цель этой проверки обычно состоит в том, чтобы сделать предположение, - * что при хождении на этот адрес через интернет, мы попадём на себя. - * Следует иметь ввиду, что эта проверка делается неточно: - * - адрес просто сравнивается с адресами сетевых интерфейсов; - * - для каждого сетевого интерфейса берётся только первый адрес; - * - не проверяются правила маршрутизации, которые влияют, через какой сетевой интерфейс мы пойдём на заданный адрес. + /** Lets you check if the address is similar to `localhost`. + * The purpose of this check is usually to make an assumption, + * that when we go to this address via the Internet, we'll get to ourselves. + * Please note that this check is not accurate: + * - the address is simply compared to the addresses of the network interfaces; + * - only the first address is taken for each network interface; + * - the routing rules that affect which network interface we go to the specified address are not checked. */ bool isLocalAddress(const Poco::Net::SocketAddress & address); diff --git a/dbms/src/Common/localBackup.h b/dbms/src/Common/localBackup.h index 25b1de4e8e5..91107294e26 100644 --- a/dbms/src/Common/localBackup.h +++ b/dbms/src/Common/localBackup.h @@ -3,14 +3,14 @@ #include -/** Создаёт локальный (в той же точке монтирования) бэкап (снэпшот) директории. +/** Creates a local (at the same mount point) backup (snapshot) directory. 
* - * В указанной destination-директории создаёт hard link-и на все файлы source-директории - * и во всех вложенных директориях, с сохранением (созданием) всех относительных путей; - * а также делает chown, снимая разрешение на запись. + * In the specified destination directory, it creates hard links to all files of the source directory + * and of all nested directories, preserving (creating) all relative paths; + * and also does `chown`, removing the write permission. * - * Это защищает данные от случайного удаления или модификации, - * и предназначено для использования как простое средство защиты от человеческой или программной ошибки, - * но не от аппаратного сбоя. + * This protects data from accidental deletion or modification, + * and is intended to be used as a simple means of protection against a human or program error, + * but not from a hardware failure. */ void localBackup(Poco::Path source_path, Poco::Path destination_path); diff --git a/dbms/src/Common/setThreadName.h b/dbms/src/Common/setThreadName.h index 476aa47dbdf..dc6af7336e0 100644 --- a/dbms/src/Common/setThreadName.h +++ b/dbms/src/Common/setThreadName.h @@ -1,7 +1,7 @@ #pragma once -/** Устанавливает имя потока (максимальная длина - 15 байт), - * которое будет видно в ps, gdb, /proc, - * для удобства наблюдений и отладки. +/** Sets the thread name (maximum length is 15 bytes), + * which will be visible in ps, gdb, /proc, + * for convenience of observation and debugging.
*/ void setThreadName(const char * name); diff --git a/dbms/src/Common/tests/CMakeLists.txt b/dbms/src/Common/tests/CMakeLists.txt index 3b8f53b307c..072d5547301 100644 --- a/dbms/src/Common/tests/CMakeLists.txt +++ b/dbms/src/Common/tests/CMakeLists.txt @@ -54,3 +54,6 @@ target_link_libraries (thread_pool dbms) add_executable (array_cache array_cache.cpp) target_link_libraries (array_cache dbms) + +add_executable (space_saving space_saving.cpp) +target_link_libraries (space_saving dbms) diff --git a/dbms/src/Common/tests/space_saving.cpp b/dbms/src/Common/tests/space_saving.cpp new file mode 100644 index 00000000000..50c15424ef5 --- /dev/null +++ b/dbms/src/Common/tests/space_saving.cpp @@ -0,0 +1,101 @@ +#include +#include +#include +#include + +#include +#include + +int main(int argc, char ** argv) +{ + { + using Cont = DB::SpaceSaving; + Cont first(10); + + /* Test biased insertion */ + + for (int i = 0; i < 200; ++i) { + first.insert(i); + int k = i % 5; // Bias towards 0-4 + first.insert(k); + } + + /* Test whether the biased elements are retained */ + + std::map expect; + for (int i = 0; i < 5; ++i) { + expect[i] = 41; + } + + for (auto x : first.topK(5)) { + if (expect[x.key] != x.count) { + std::cerr << "key: " << x.key << " value: " << x.count << " expected: " << expect[x.key] << std::endl; + } else { + std::cout << "key: " << x.key << " value: " << x.count << std::endl; + } + expect.erase(x.key); + } + + if (!expect.empty()) { + std::cerr << "expected to find all heavy hitters" << std::endl; + } + + /* Create another table and test merging */ + + Cont second(10); + for (int i = 0; i < 200; ++i) { + first.insert(i); + } + + for (int i = 0; i < 5; ++i) { + expect[i] = 42; + } + + first.merge(second); + + for (auto x : first.topK(5)) { + if (expect[x.key] != x.count) { + std::cerr << "key: " << x.key << " value: " << x.count << " expected: " << expect[x.key] << std::endl; + } else { + std::cout << "key: " << x.key << " value: " << x.count << std::endl; + } 
+ expect.erase(x.key); + } + } + + { + /* Same test for string keys */ + + using Cont = DB::SpaceSaving; + Cont cont(10); + + for (int i = 0; i < 400; ++i) { + cont.insert(std::to_string(i)); + cont.insert(std::to_string(i % 5)); // Bias towards 0-4 + } + + // The hashing is going to be more lossy + // Expect at least ~ 10% count + std::map expect; + for (int i = 0; i < 5; ++i) { + expect[std::to_string(i)] = 38; + } + + for (auto x : cont.topK(5)) { + auto key = x.key; + if (x.count < expect[key]) { + std::cerr << "key: " << key << " value: " << x.count << " expected: " << expect[key] << std::endl; + } else { + std::cout << "key: " << key << " value: " << x.count << std::endl; + } + expect.erase(key); + } + + if (!expect.empty()) { + std::cerr << "expected to find all heavy hitters" << std::endl; + abort(); + } + } + + return 0; +} diff --git a/dbms/src/Common/typeid_cast.h b/dbms/src/Common/typeid_cast.h index e3f47870bb9..e335f8f9672 100644 --- a/dbms/src/Common/typeid_cast.h +++ b/dbms/src/Common/typeid_cast.h @@ -16,9 +16,9 @@ namespace DB } -/** Проверяет совпадение типа путём сравнения typeid-ов. - * Проверяется точное совпадение типа. То есть, cast в предка будет неуспешным. - * В остальном, ведёт себя как dynamic_cast. +/** Checks type by comparing typeid. + * The exact match of the type is checked. That is, cast in the ancestor will be unsuccessful. + * In the rest, behaves like a dynamic_cast. 
*/ template typename std::enable_if::value, To>::type typeid_cast(From & from) diff --git a/dbms/src/Core/Block.h b/dbms/src/Core/Block.h index f30be28f2f1..93c8279c400 100644 --- a/dbms/src/Core/Block.h +++ b/dbms/src/Core/Block.h @@ -42,20 +42,20 @@ public: Block(std::initializer_list il); Block(const ColumnsWithTypeAndName & data_); - /// вставить столбец в заданную позицию + /// insert the column at the specified position void insert(size_t position, const ColumnWithTypeAndName & elem); void insert(size_t position, ColumnWithTypeAndName && elem); - /// вставить столбец в конец + /// insert the column to the end void insert(const ColumnWithTypeAndName & elem); void insert(ColumnWithTypeAndName && elem); - /// вставить столбец в конец, если столбца с таким именем ещё нет + /// insert the column to the end, if there is no column with that name yet void insertUnique(const ColumnWithTypeAndName & elem); void insertUnique(ColumnWithTypeAndName && elem); - /// удалить столбец в заданной позиции + /// remove the column at the specified position void erase(size_t position); - /// удалить столбец с заданным именем + /// remove the column with the specified name void erase(const String & name); - /// Добавляет в блок недостающие столбцы со значениями по-умолчанию + /// Adds missing columns to the block with default values void addDefaults(const NamesAndTypesList & required_columns); /// References are invalidated after calling functions above. @@ -90,23 +90,23 @@ public: operator bool() const { return !data.empty(); } bool operator!() const { return data.empty(); } - /** Получить список имён столбцов через запятую. */ + /** Get a list of column names separated by commas. */ std::string dumpNames() const; - /** Список имен, типов и длин столбцов. Предназначен для отладки. */ + /** List of names, types and lengths of columns. Designed for debugging. */ std::string dumpStructure() const; - /** Получить такой же блок, но пустой. */ + /** Get the same block, but empty. 
*/ Block cloneEmpty() const; - /** Получить блок со столбцами, переставленными в порядке их имён. */ + /** Get a block with columns that have been rearranged in the order of their names. */ Block sortColumns() const; - /** Заменяет столбцы смещений внутри вложенных таблиц на один общий для таблицы. - * Кидает исключение, если эти смещения вдруг оказались неодинаковы. + /** Replaces the offset columns within the nested tables by one common for the table. + * Throws an exception if these offsets suddenly turn out to be different. */ void optimizeNestedArraysOffsets(); - /** Тоже самое, только без замены смещений. */ + /** The same, only without changing the offsets. */ void checkNestedArraysOffsets() const; void clear(); @@ -128,15 +128,15 @@ using Blocks = std::vector; using BlocksList = std::list; -/// Сравнить типы столбцов у блоков. Порядок столбцов имеет значение. Имена не имеют значения. +/// Compare column types for blocks. The order of the columns matters. Names do not matter. bool blocksHaveEqualStructure(const Block & lhs, const Block & rhs); /// Calculate difference in structure of blocks and write description into output strings. void getBlocksDifference(const Block & lhs, const Block & rhs, std::string & out_lhs_diff, std::string & out_rhs_diff); -/** Дополнительные данные к блокам. Они пока нужны только для запроса - * DESCRIBE TABLE с Distributed-таблицами. +/** Additional data to the blocks. They are only needed for a query + * DESCRIBE TABLE with Distributed tables. */ struct BlockExtraInfo { diff --git a/dbms/src/Core/BlockInfo.h b/dbms/src/Core/BlockInfo.h index f978f5c0a52..ebfbd117de7 100644 --- a/dbms/src/Core/BlockInfo.h +++ b/dbms/src/Core/BlockInfo.h @@ -9,20 +9,20 @@ namespace DB class ReadBuffer; class WriteBuffer; -/** Дополнительная информация о блоке. +/** More information about the block. */ struct BlockInfo { /** is_overflows: - * После выполнения GROUP BY ... 
WITH TOTALS с настройками max_rows_to_group_by и group_by_overflow_mode = 'any', - * в отдельный блок засовывается строчка с аргегированными значениями, не прошедшими max_rows_to_group_by. - * Если это такой блок, то для него is_overflows выставляется в true. + * After running GROUP BY ... WITH TOTALS with the max_rows_to_group_by and group_by_overflow_mode = 'any' settings, + * a row is inserted in the separate block with aggregated values that have not passed max_rows_to_group_by. + * If it is such a block, then is_overflows is set to true for it. */ /** bucket_num: - * При использовании двухуровневого метода агрегации, данные с разными группами ключей раскидываются по разным корзинам. - * В таком случае здесь указывается номер корзины. Он используется для оптимизации слияния при распределённой аргегации. - * Иначе - -1. + * When using the two-level aggregation method, data with different key groups are scattered across different buckets. + * In this case, the bucket number is indicated here. It is used to optimize the merge for distributed aggregation. + * Otherwise -1. */ #define APPLY_FOR_BLOCK_INFO_FIELDS(M) \ @@ -36,10 +36,10 @@ struct BlockInfo #undef DECLARE_FIELD - /// Записать значения в бинарном виде. NOTE: Можно было бы использовать protobuf, но он был бы overkill для данного случая. + /// Write the values in binary form. NOTE: You could use protobuf, but it would be overkill for this case. void write(WriteBuffer & out) const; - /// Прочитать значения в бинарном виде. + /// Read the values in binary form. void read(ReadBuffer & in); }; diff --git a/dbms/src/Core/Defines.h b/dbms/src/Core/Defines.h index 7aeba1fdc93..7dde20e6ccb 100644 --- a/dbms/src/Core/Defines.h +++ b/dbms/src/Core/Defines.h @@ -16,30 +16,30 @@ #define DBMS_DEFAULT_SYNC_REQUEST_TIMEOUT_SEC 5 #define DBMS_DEFAULT_POLL_INTERVAL 10 -/// Размер буфера ввода-вывода по-умолчанию. +/// The size of the I/O buffer by default. 
#define DBMS_DEFAULT_BUFFER_SIZE 1048576ULL -/// При записи данных, для сжатия выделяется буфер размером max_compress_block_size. При переполнении буфера или если в буфер -/// записано данных больше или равно, чем min_compress_block_size, то при очередной засечке, данные так же будут сжиматься -/// В результате, для маленьких столбцов (числа 1-8 байт), при index_granularity = 8192, размер блока будет 64 KБ. -/// А для больших столбцов (Title - строка ~100 байт), размер блока будет ~819 КБ. За счёт этого, коэффициент сжатия почти не ухудшится. +/// When writing data, a buffer of `max_compress_block_size` size is allocated for compression. When the buffer overflows, or if the amount of data written to the buffer +/// is greater than or equal to `min_compress_block_size`, then at the next mark the data will also be compressed. +/// As a result, for small columns (numbers 1-8 bytes), with index_granularity = 8192, the block size will be 64 KB. +/// And for large columns (Title - string ~100 bytes), the block size will be ~819 KB. Due to this, the compression ratio almost does not get worse. #define DEFAULT_MIN_COMPRESS_BLOCK_SIZE 65536 #define DEFAULT_MAX_COMPRESS_BLOCK_SIZE 1048576 -/** Какими блоками по-умолчанию читаются данные (в числе строк). - * Меньшие значения дают лучшую кэш-локальность, меньшее потребление оперативки, но больший оверхед на обработку запроса. +/** The size of the blocks (in number of rows) in which data is read by default. + * Smaller values give better cache locality, less consumption of RAM, but more overhead to process the query. */ #define DEFAULT_BLOCK_SIZE 65536 -/** Какие блоки следует формировать для вставки в таблицу, если мы управляем формированием блоков. - * (Иногда в таблицу вставляются ровно такие блоки, какие были считаны / переданы извне, и на их размер этот параметр не влияет.)
- * Больше, чем DEFAULT_BLOCK_SIZE, так как в некоторых таблицах на каждый блок создаётся кусок данных на диске (довольно большая штука), - * и если бы куски были маленькими, то их было бы накладно потом объединять. +/** The size of the blocks that should be formed for insertion into the table, if we control the formation of blocks. + * (Sometimes exactly the blocks that were read / passed in from the outside are inserted into the table, and this parameter does not affect their size.) + * Larger than DEFAULT_BLOCK_SIZE, because in some tables a data part is created on the disk for each block (quite a big thing), + * and if the parts were small, it would be costly to merge them later. */ #define DEFAULT_INSERT_BLOCK_SIZE 1048576 -/** То же самое, но для операций слияния. Меньше DEFAULT_BLOCK_SIZE для экономии оперативки (так как читаются все столбцы). - * Сильно меньше, так как бывают 10-way слияния. +/** The same, but for merge operations. Less than DEFAULT_BLOCK_SIZE to save RAM (since all the columns are read). + * Significantly less, since there are 10-way merges. */ #define DEFAULT_MERGE_BLOCK_SIZE 8192 @@ -49,16 +49,16 @@ #define DEFAULT_INTERACTIVE_DELAY 100000 #define DBMS_DEFAULT_DISTRIBUTED_CONNECTIONS_POOL_SIZE 1024 #define DBMS_CONNECTION_POOL_WITH_FAILOVER_DEFAULT_MAX_TRIES 3 -/// каждый период уменьшаем счетчик ошибок в 2 раза -/// слишком маленький период может приводить, что ошибки исчезают сразу после создания. +/// each period, the error counter is halved +/// too short a period can cause errors to disappear immediately after creation. #define DBMS_CONNECTION_POOL_WITH_FAILOVER_DEFAULT_DECREASE_ERROR_PERIOD (2 * DBMS_DEFAULT_SEND_TIMEOUT_SEC) -#define DEFAULT_QUERIES_QUEUE_WAIT_TIME_MS 5000 /// Максимальное время ожидания в очереди запросов. +#define DEFAULT_QUERIES_QUEUE_WAIT_TIME_MS 5000 /// Maximum waiting time in the request queue.
#define DBMS_DEFAULT_BACKGROUND_POOL_SIZE 16 -/// Используется в методе reserve, когда известно число строк, но неизвестны их размеры. +/// Used in the `reserve` method, when the number of rows is known, but their sizes are unknown. #define DBMS_APPROX_STRING_SIZE 64 -/// Суффикс имени для столбца, содержащего смещения массива. +/// Name suffix for the column containing the array offsets. #define ARRAY_SIZES_COLUMN_NAME_SUFFIX ".size" #define DBMS_MIN_REVISION_WITH_TEMPORARY_TABLES 50264 @@ -74,7 +74,7 @@ #define DBMS_DISTRIBUTED_DIRECTORY_MONITOR_SLEEP_TIME_MS 100 -/// Граница, на которых должны быть выровнены блоки для асинхронных файловых операций. +/// The boundary on which the blocks for asynchronous file operations should be aligned. #define DEFAULT_AIO_FILE_BLOCK_SIZE 4096 #define DEFAULT_QUERY_LOG_FLUSH_INTERVAL_MILLISECONDS 7500 diff --git a/dbms/src/Core/Field.h b/dbms/src/Core/Field.h index 7c88cd2f07d..87b2c819d89 100644 --- a/dbms/src/Core/Field.h +++ b/dbms/src/Core/Field.h @@ -31,13 +31,13 @@ STRONG_TYPEDEF(TupleBackend, Tuple); /// Array and Tuple are different types wit #define DBMS_MIN_FIELD_SIZE 32 -/** Discriminated union из нескольких типов. - * Сделан для замены boost::variant: - * является не обобщённым, - * зато несколько более эффективным, и более простым. +/** Discriminated union of several types. + * Made as a replacement for `boost::variant`: + * it is not generic, + * but it is somewhat more efficient and simpler. * - * Используется для представления единичного значения одного из нескольких типов в оперативке. - * Внимание! Предпочтительно вместо единичных значений хранить кусочки столбцов. См. Column.h + * Used to represent a single value of one of several types in RAM. + * Warning! Preferably, instead of single values, store pieces of columns. See Column.h */ class Field { @@ -47,16 +47,16 @@ public: /// Type tag.
enum Which { - Null = 0, - UInt64 = 1, - Int64 = 2, - Float64 = 3, + Null = 0, + UInt64 = 1, + Int64 = 2, + Float64 = 3, /// Non-POD types. - String = 16, - Array = 17, - Tuple = 18, + String = 16, + Array = 17, + Tuple = 18, }; static const int MIN_NON_POD = 16; @@ -65,13 +65,13 @@ public: { switch (which) { - case Null: return "Null"; - case UInt64: return "UInt64"; - case Int64: return "Int64"; - case Float64: return "Float64"; - case String: return "String"; - case Array: return "Array"; - case Tuple: return "Tuple"; + case Null: return "Null"; + case UInt64: return "UInt64"; + case Int64: return "Int64"; + case Float64: return "Float64"; + case String: return "String"; + case Array: return "Array"; + case Tuple: return "Tuple"; default: throw Exception("Bad type of Field", ErrorCodes::BAD_TYPE_OF_FIELD); @@ -80,7 +80,7 @@ public: }; - /// Позволяет получить идентификатор для типа или наоборот. + /// Returns an identifier for the type or vice versa. template struct TypeToEnum; template struct EnumToType; @@ -90,8 +90,8 @@ public: { } - /** Не смотря на наличие шаблонного конструктора, этот конструктор всё-равно нужен, - * так как при его отсутствии, компилятор всё-равно сгенерирует конструктор по-умолчанию. + /** Despite the presence of a template constructor, this constructor is still needed, + * since, in its absence, the compiler will still generate the default constructor. */ Field(const Field & rhs) { @@ -110,7 +110,7 @@ public: createConcrete(std::forward(rhs)); } - /// Создать строку inplace. + /// Create a string inplace. 
Field(const char * data, size_t size) { create(data, size); @@ -231,13 +231,13 @@ public: switch (which) { - case Types::Null: return false; - case Types::UInt64: return get() < rhs.get(); - case Types::Int64: return get() < rhs.get(); - case Types::Float64: return get() < rhs.get(); - case Types::String: return get() < rhs.get(); - case Types::Array: return get() < rhs.get(); - case Types::Tuple: return get() < rhs.get(); + case Types::Null: return false; + case Types::UInt64: return get() < rhs.get(); + case Types::Int64: return get() < rhs.get(); + case Types::Float64: return get() < rhs.get(); + case Types::String: return get() < rhs.get(); + case Types::Array: return get() < rhs.get(); + case Types::Tuple: return get() < rhs.get(); default: throw Exception("Bad type of Field", ErrorCodes::BAD_TYPE_OF_FIELD); @@ -258,13 +258,13 @@ public: switch (which) { - case Types::Null: return true; - case Types::UInt64: return get() <= rhs.get(); - case Types::Int64: return get() <= rhs.get(); - case Types::Float64: return get() <= rhs.get(); - case Types::String: return get() <= rhs.get(); - case Types::Array: return get() <= rhs.get(); - case Types::Tuple: return get() <= rhs.get(); + case Types::Null: return true; + case Types::UInt64: return get() <= rhs.get(); + case Types::Int64: return get() <= rhs.get(); + case Types::Float64: return get() <= rhs.get(); + case Types::String: return get() <= rhs.get(); + case Types::Array: return get() <= rhs.get(); + case Types::Tuple: return get() <= rhs.get(); default: throw Exception("Bad type of Field", ErrorCodes::BAD_TYPE_OF_FIELD); @@ -283,13 +283,13 @@ public: switch (which) { - case Types::Null: return true; + case Types::Null: return true; case Types::UInt64: case Types::Int64: - case Types::Float64: return get() == rhs.get(); - case Types::String: return get() == rhs.get(); - case Types::Array: return get() == rhs.get(); - case Types::Tuple: return get() == rhs.get(); + case Types::Float64: return get() == rhs.get(); + 
case Types::String: return get() == rhs.get(); + case Types::Array: return get() == rhs.get(); + case Types::Tuple: return get() == rhs.get(); default: throw Exception("Bad type of Field", ErrorCodes::BAD_TYPE_OF_FIELD); @@ -335,13 +335,13 @@ private: { switch (field.which) { - case Types::Null: f(field.template get()); return; - case Types::UInt64: f(field.template get()); return; - case Types::Int64: f(field.template get()); return; - case Types::Float64: f(field.template get()); return; - case Types::String: f(field.template get()); return; - case Types::Array: f(field.template get()); return; - case Types::Tuple: f(field.template get()); return; + case Types::Null: f(field.template get()); return; + case Types::UInt64: f(field.template get()); return; + case Types::Int64: f(field.template get()); return; + case Types::Float64: f(field.template get()); return; + case Types::String: f(field.template get()); return; + case Types::Array: f(field.template get()); return; + case Types::Tuple: f(field.template get()); return; default: throw Exception("Bad type of Field", ErrorCodes::BAD_TYPE_OF_FIELD); @@ -416,21 +416,21 @@ private: #undef DBMS_MIN_FIELD_SIZE -template <> struct Field::TypeToEnum { static const Types::Which value = Types::Null; }; -template <> struct Field::TypeToEnum { static const Types::Which value = Types::UInt64; }; -template <> struct Field::TypeToEnum { static const Types::Which value = Types::Int64; }; -template <> struct Field::TypeToEnum { static const Types::Which value = Types::Float64; }; -template <> struct Field::TypeToEnum { static const Types::Which value = Types::String; }; -template <> struct Field::TypeToEnum { static const Types::Which value = Types::Array; }; -template <> struct Field::TypeToEnum { static const Types::Which value = Types::Tuple; }; +template <> struct Field::TypeToEnum { static const Types::Which value = Types::Null; }; +template <> struct Field::TypeToEnum { static const Types::Which value = Types::UInt64; }; 
+template <> struct Field::TypeToEnum { static const Types::Which value = Types::Int64; }; +template <> struct Field::TypeToEnum { static const Types::Which value = Types::Float64; }; +template <> struct Field::TypeToEnum { static const Types::Which value = Types::String; }; +template <> struct Field::TypeToEnum { static const Types::Which value = Types::Array; }; +template <> struct Field::TypeToEnum { static const Types::Which value = Types::Tuple; }; -template <> struct Field::EnumToType { using Type = Null ; }; -template <> struct Field::EnumToType { using Type = UInt64 ; }; -template <> struct Field::EnumToType { using Type = Int64 ; }; -template <> struct Field::EnumToType { using Type = Float64 ; }; -template <> struct Field::EnumToType { using Type = String ; }; -template <> struct Field::EnumToType { using Type = Array ; }; -template <> struct Field::EnumToType { using Type = Tuple ; }; +template <> struct Field::EnumToType { using Type = Null; }; +template <> struct Field::EnumToType { using Type = UInt64; }; +template <> struct Field::EnumToType { using Type = Int64; }; +template <> struct Field::EnumToType { using Type = Float64; }; +template <> struct Field::EnumToType { using Type = String; }; +template <> struct Field::EnumToType { using Type = Array; }; +template <> struct Field::EnumToType { using Type = Tuple; }; template @@ -464,21 +464,21 @@ template <> struct TypeName { static std::string get() { return "Tuple"; template struct NearestFieldType; -template <> struct NearestFieldType { using Type = UInt64 ; }; -template <> struct NearestFieldType { using Type = UInt64 ; }; -template <> struct NearestFieldType { using Type = UInt64 ; }; -template <> struct NearestFieldType { using Type = UInt64 ; }; -template <> struct NearestFieldType { using Type = Int64 ; }; -template <> struct NearestFieldType { using Type = Int64 ; }; -template <> struct NearestFieldType { using Type = Int64 ; }; -template <> struct NearestFieldType { using Type = Int64 ; }; 
-template <> struct NearestFieldType { using Type = Float64 ; }; -template <> struct NearestFieldType { using Type = Float64 ; }; -template <> struct NearestFieldType { using Type = String ; }; -template <> struct NearestFieldType { using Type = Array ; }; -template <> struct NearestFieldType { using Type = Tuple ; }; -template <> struct NearestFieldType { using Type = UInt64 ; }; -template <> struct NearestFieldType { using Type = Null; }; +template <> struct NearestFieldType { using Type = UInt64; }; +template <> struct NearestFieldType { using Type = UInt64; }; +template <> struct NearestFieldType { using Type = UInt64; }; +template <> struct NearestFieldType { using Type = UInt64; }; +template <> struct NearestFieldType { using Type = Int64; }; +template <> struct NearestFieldType { using Type = Int64; }; +template <> struct NearestFieldType { using Type = Int64; }; +template <> struct NearestFieldType { using Type = Int64; }; +template <> struct NearestFieldType { using Type = Float64; }; +template <> struct NearestFieldType { using Type = Float64; }; +template <> struct NearestFieldType { using Type = String; }; +template <> struct NearestFieldType { using Type = Array; }; +template <> struct NearestFieldType { using Type = Tuple; }; +template <> struct NearestFieldType { using Type = UInt64; }; +template <> struct NearestFieldType { using Type = Null; }; template @@ -491,13 +491,13 @@ typename NearestFieldType::Type nearestFieldType(const T & x) class ReadBuffer; class WriteBuffer; -/// Предполагается что у всех элементов массива одинаковый тип. +/// It is assumed that all elements of the array have the same type. 
void readBinary(Array & x, ReadBuffer & buf); -inline void readText(Array & x, ReadBuffer & buf) { throw Exception("Cannot read Array.", ErrorCodes::NOT_IMPLEMENTED); } -inline void readQuoted(Array & x, ReadBuffer & buf) { throw Exception("Cannot read Array.", ErrorCodes::NOT_IMPLEMENTED); } +inline void readText(Array & x, ReadBuffer & buf) { throw Exception("Cannot read Array.", ErrorCodes::NOT_IMPLEMENTED); } +inline void readQuoted(Array & x, ReadBuffer & buf) { throw Exception("Cannot read Array.", ErrorCodes::NOT_IMPLEMENTED); } -/// Предполагается что у всех элементов массива одинаковый тип. +/// It is assumed that all elements of the array have the same type. void writeBinary(const Array & x, WriteBuffer & buf); void writeText(const Array & x, WriteBuffer & buf); @@ -506,8 +506,8 @@ inline void writeQuoted(const Array & x, WriteBuffer & buf) { throw Exception("C void readBinary(Tuple & x, ReadBuffer & buf); -inline void readText(Tuple & x, ReadBuffer & buf) { throw Exception("Cannot read Tuple.", ErrorCodes::NOT_IMPLEMENTED); } -inline void readQuoted(Tuple & x, ReadBuffer & buf) { throw Exception("Cannot read Tuple.", ErrorCodes::NOT_IMPLEMENTED); } +inline void readText(Tuple & x, ReadBuffer & buf) { throw Exception("Cannot read Tuple.", ErrorCodes::NOT_IMPLEMENTED); } +inline void readQuoted(Tuple & x, ReadBuffer & buf) { throw Exception("Cannot read Tuple.", ErrorCodes::NOT_IMPLEMENTED); } void writeBinary(const Tuple & x, WriteBuffer & buf); diff --git a/dbms/src/Core/FieldVisitors.h b/dbms/src/Core/FieldVisitors.h index 92a7ace374d..99a18ee3d8d 100644 --- a/dbms/src/Core/FieldVisitors.h +++ b/dbms/src/Core/FieldVisitors.h @@ -33,13 +33,13 @@ typename std::decay::type::ResultType applyVisitor(Visitor && visitor, { switch (field.getType()) { - case Field::Types::Null: return visitor(field.template get()); - case Field::Types::UInt64: return visitor(field.template get()); - case Field::Types::Int64: return visitor(field.template get()); + case 
Field::Types::Null: return visitor(field.template get()); + case Field::Types::UInt64: return visitor(field.template get()); + case Field::Types::Int64: return visitor(field.template get()); case Field::Types::Float64: return visitor(field.template get()); - case Field::Types::String: return visitor(field.template get()); - case Field::Types::Array: return visitor(field.template get()); - case Field::Types::Tuple: return visitor(field.template get()); + case Field::Types::String: return visitor(field.template get()); + case Field::Types::Array: return visitor(field.template get()); + case Field::Types::Tuple: return visitor(field.template get()); default: throw Exception("Bad type of Field", ErrorCodes::BAD_TYPE_OF_FIELD); @@ -52,13 +52,13 @@ static typename std::decay::type::ResultType applyBinaryVisitorImpl(Vis { switch (field2.getType()) { - case Field::Types::Null: return visitor(field1, field2.template get()); - case Field::Types::UInt64: return visitor(field1, field2.template get()); - case Field::Types::Int64: return visitor(field1, field2.template get()); + case Field::Types::Null: return visitor(field1, field2.template get()); + case Field::Types::UInt64: return visitor(field1, field2.template get()); + case Field::Types::Int64: return visitor(field1, field2.template get()); case Field::Types::Float64: return visitor(field1, field2.template get()); - case Field::Types::String: return visitor(field1, field2.template get()); - case Field::Types::Array: return visitor(field1, field2.template get()); - case Field::Types::Tuple: return visitor(field1, field2.template get()); + case Field::Types::String: return visitor(field1, field2.template get()); + case Field::Types::Array: return visitor(field1, field2.template get()); + case Field::Types::Tuple: return visitor(field1, field2.template get()); default: throw Exception("Bad type of Field", ErrorCodes::BAD_TYPE_OF_FIELD); @@ -102,13 +102,13 @@ typename std::decay::type::ResultType applyVisitor(Visitor && 
visitor, class FieldVisitorToString : public StaticVisitor { public: - String operator() (const Null & x) const; - String operator() (const UInt64 & x) const; - String operator() (const Int64 & x) const; - String operator() (const Float64 & x) const; - String operator() (const String & x) const; - String operator() (const Array & x) const; - String operator() (const Tuple & x) const; + String operator() (const Null & x) const; + String operator() (const UInt64 & x) const; + String operator() (const Int64 & x) const; + String operator() (const Float64 & x) const; + String operator() (const String & x) const; + String operator() (const Array & x) const; + String operator() (const Tuple & x) const; }; @@ -116,13 +116,13 @@ public: class FieldVisitorDump : public StaticVisitor { public: - String operator() (const Null & x) const; - String operator() (const UInt64 & x) const; - String operator() (const Int64 & x) const; - String operator() (const Float64 & x) const; - String operator() (const String & x) const; - String operator() (const Array & x) const; - String operator() (const Tuple & x) const; + String operator() (const Null & x) const; + String operator() (const UInt64 & x) const; + String operator() (const Int64 & x) const; + String operator() (const Float64 & x) const; + String operator() (const String & x) const; + String operator() (const Array & x) const; + String operator() (const Tuple & x) const; }; @@ -151,8 +151,8 @@ public: throw Exception("Cannot convert Tuple to " + TypeName::get(), ErrorCodes::CANNOT_CONVERT_TYPE); } - T operator() (const UInt64 & x) const { return x; } - T operator() (const Int64 & x) const { return x; } + T operator() (const UInt64 & x) const { return x; } + T operator() (const Int64 & x) const { return x; } T operator() (const Float64 & x) const { return x; } }; @@ -165,12 +165,12 @@ private: public: FieldVisitorHash(SipHash & hash); - void operator() (const Null & x) const; - void operator() (const UInt64 & x) const; - void 
operator() (const Int64 & x) const; - void operator() (const Float64 & x) const; - void operator() (const String & x) const; - void operator() (const Array & x) const; + void operator() (const Null & x) const; + void operator() (const UInt64 & x) const; + void operator() (const Int64 & x) const; + void operator() (const Float64 & x) const; + void operator() (const String & x) const; + void operator() (const Array & x) const; }; diff --git a/dbms/src/Core/NamesAndTypes.h b/dbms/src/Core/NamesAndTypes.h index 5da8a5a06a4..86c0b17b66c 100644 --- a/dbms/src/Core/NamesAndTypes.h +++ b/dbms/src/Core/NamesAndTypes.h @@ -44,22 +44,22 @@ public: String toString() const; static NamesAndTypesList parse(const String & s); - /// Все элементы rhs должны быть различны. + /// All `rhs` elements must be different. bool isSubsetOf(const NamesAndTypesList & rhs) const; - /// Расстояние Хемминга между множествами - /// (иными словами, добавленные и удаленные столбцы считаются один раз; столбцы, изменившие тип, - дважды). + /// Hamming distance between sets + /// (in other words, the added and deleted columns are counted once, the columns that changed the type - twice). size_t sizeOfDifference(const NamesAndTypesList & rhs) const; Names getNames() const; - /// Оставить только столбцы, имена которых есть в names. В names могут быть лишние столбцы. + /// Leave only the columns whose names are in the `names`. In `names` there can be superfluous columns. NamesAndTypesList filter(const NameSet & names) const; - /// Оставить только столбцы, имена которых есть в names. В names могут быть лишние столбцы. + /// Leave only the columns whose names are in the `names`. In `names` there can be superfluous columns. NamesAndTypesList filter(const Names & names) const; - /// В отличие от filter, возвращает столбцы в том порядке, в котором они идут в names. + /// Unlike `filter`, returns columns in the order in which they go in `names`. 
NamesAndTypesList addTypes(const Names & names) const; }; diff --git a/dbms/src/Core/QueryProcessingStage.h b/dbms/src/Core/QueryProcessingStage.h index 58d6ea3ead3..a16b041ae35 100644 --- a/dbms/src/Core/QueryProcessingStage.h +++ b/dbms/src/Core/QueryProcessingStage.h @@ -6,15 +6,15 @@ namespace DB { -/// До какой стадии выполнен или нужно выполнить SELECT запрос. +/// Up to what stage the SELECT query is executed or needs to be executed. namespace QueryProcessingStage { - /// Номера имеют значение - более поздняя стадия имеет больший номер. + /// Numbers matter - the later stage has a larger number. enum Enum { - FetchColumns = 0, /// Только прочитать/прочитаны указанные в запросе столбцы. - WithMergeableState = 1, /// До стадии, когда результаты обработки на разных серверах можно объединить. - Complete = 2, /// Полностью. + FetchColumns = 0, /// Only read/have been read the columns specified in the query. + WithMergeableState = 1, /// Until the stage where the results of processing on different servers can be combined. + Complete = 2, /// Completely. }; inline const char * toString(UInt64 stage) diff --git a/dbms/src/Core/Row.h b/dbms/src/Core/Row.h index b163f40a0e8..0dc105922ff 100644 --- a/dbms/src/Core/Row.h +++ b/dbms/src/Core/Row.h @@ -9,8 +9,8 @@ namespace DB { -/** Тип данных для представления одной строки таблицы в оперативке. - * Внимание! Предпочтительно вместо единичных строк хранить блоки столбцов. См. Block.h +/** The data type for representing one row of the table in the RAM. + * Warning! It is preferable to store column blocks instead of single rows. See Block.h */ using Row = AutoArray; diff --git a/dbms/src/Core/SortCursor.h b/dbms/src/Core/SortCursor.h index 3d7b5610e3a..385e57bcba4 100644 --- a/dbms/src/Core/SortCursor.h +++ b/dbms/src/Core/SortCursor.h @@ -35,7 +35,7 @@ struct SortCursorImpl /** Should we use Collator to sort a column? */ NeedCollationFlags need_collation; - /** Есть ли хотя бы один столбец с Collator. 
*/ + /** Whether there is at least one column with a Collator. */ bool has_collation = false; SortCursorImpl() {} @@ -48,7 +48,7 @@ struct SortCursorImpl bool empty() const { return rows == 0; } - /// Установить курсор в начало нового блока. + /// Set the cursor to the beginning of the new block. void reset(const Block & block) { all_columns.clear(); @@ -81,7 +81,7 @@ struct SortCursorImpl }; -/// Для лёгкости копирования. +/// For easy copying. struct SortCursor { SortCursorImpl * impl; @@ -90,7 +90,7 @@ struct SortCursor SortCursorImpl * operator-> () { return impl; } const SortCursorImpl * operator-> () const { return impl; } - /// Указанная строка данного курсора больше указанной строки другого курсора. + /// The specified row of this cursor is greater than the specified row of another cursor. bool greaterAt(const SortCursor & rhs, size_t lhs_pos, size_t rhs_pos) const { for (size_t i = 0; i < impl->sort_columns_size; ++i) @@ -106,13 +106,13 @@ struct SortCursor return impl->order > rhs.impl->order; } - /// Проверяет, что все строки в текущем блоке данного курсора меньше или равны, чем все строки текущего блока другого курсора. + /// Checks that all rows in the current block of this cursor are less than or equal to all the rows of the current block of another cursor. bool totallyLessOrEquals(const SortCursor & rhs) const { if (impl->rows == 0 || rhs.impl->rows == 0) return false; - /// Последняя строка данного курсора не больше первой строки другого. + /// The last row of this cursor is no larger than the first row of the other cursor. return !greaterAt(rhs, impl->rows - 1, 0); } @@ -121,7 +121,7 @@ struct SortCursor return greaterAt(rhs, impl->pos, rhs.impl->pos); } - /// Инвертировано, чтобы из priority queue элементы вынимались в порядке по возрастанию. + /// Inverted so that elements are taken out of the priority queue in ascending order.
bool operator< (const SortCursor & rhs) const { return greater(rhs); @@ -129,7 +129,7 @@ struct SortCursor }; -/// Отдельный компаратор для locale-sensitive сравнения строк +/// Separate comparator for locale-sensitive string comparisons struct SortCursorWithCollation { SortCursorImpl * impl; @@ -167,7 +167,7 @@ struct SortCursorWithCollation if (impl->rows == 0 || rhs.impl->rows == 0) return false; - /// Последняя строка данного курсора не больше первой строки другого. + /// The last row of this cursor is no larger than the first row of the other cursor. return !greaterAt(rhs, impl->rows - 1, 0); } diff --git a/dbms/src/Core/SortDescription.h b/dbms/src/Core/SortDescription.h index 5e93181f815..70069b4beb9 100644 --- a/dbms/src/Core/SortDescription.h +++ b/dbms/src/Core/SortDescription.h @@ -11,11 +11,11 @@ class Collator; namespace DB { -/// Описание правила сортировки по одному столбцу. +/// Description of the sorting rule by one column. struct SortColumnDescription { - std::string column_name; /// Имя столбца. - size_t column_number; /// Номер столбца (используется, если не задано имя). + std::string column_name; /// The name of the column. + size_t column_number; /// Column number (used if no name is given). int direction; /// 1 - ascending, -1 - descending. int nulls_direction; /// 1 - NULLs and NaNs are greater, -1 - less. /// To achieve NULLS LAST, set it equal to direction, to achieve NULLS FIRST, set it opposite. @@ -31,7 +31,7 @@ struct SortColumnDescription std::string getID() const; }; -/// Описание правила сортировки по нескольким столбцам. +/// Description of the sorting rule for several columns. using SortDescription = std::vector; } diff --git a/dbms/src/Core/StringRef.h b/dbms/src/Core/StringRef.h index 5a397b401ee..ba1d32b80d5 100644 --- a/dbms/src/Core/StringRef.h +++ b/dbms/src/Core/StringRef.h @@ -15,7 +15,7 @@ #include -/// Штука, чтобы не создавать строки для поиска подстроки в хэш таблице. 
+/// The thing to avoid creating strings to find substrings in the hash table. struct StringRef { const char * data = nullptr; @@ -37,9 +37,9 @@ using UInt64 = DB::UInt64; #if __SSE2__ -/** Сравнение строк на равенство. - * Подход является спорным и выигрывает не во всех случаях. - * Подробнее смотрите hash_map_string_2.cpp +/** Compare strings for equality. + * The approach is controversial and does not win in all cases. + * For more information, see hash_map_string_2.cpp */ inline bool compareSSE2(const char * p1, const char * p2) @@ -153,12 +153,12 @@ inline bool operator> (StringRef lhs, StringRef rhs) } -/** Хэш-функции. - * Можно использовать либо CityHash64, - * либо функцию на основе инструкции crc32, - * которая является заведомо менее качественной, но на реальных наборах данных, - * при использовании в хэш-таблице, работает существенно быстрее. - * Подробнее см. hash_map_string_3.cpp +/** Hash functions. + * You can use either CityHash64, + * or a function based on the crc32 instruction, + * which is known to be of lower quality, but on real data sets, + * when used in a hash table, works much faster. + * For more information, see hash_map_string_3.cpp */ struct StringRefHash64 @@ -171,19 +171,9 @@ struct StringRefHash64 #if __SSE4_2__ -#ifdef __SSE4_1__ #include -#else -inline UInt64 _mm_crc32_u64(UInt64 crc, UInt64 value) -{ - asm("crc32q %[value], %[crc]\n" : [crc] "+r" (crc) : [value] "rm" (value)); - return crc; -} - -#endif - -/// Кусочки взяты из CityHash. +/// Parts are taken from CityHash. inline UInt64 hashLen16(UInt64 u, UInt64 v) { @@ -262,7 +252,7 @@ struct CRC32Hash pos += 8; } while (pos + 8 < end); - UInt64 word = unalignedLoad(end - 8); /// Не уверен, что это нормально. + UInt64 word = unalignedLoad(end - 8); /// I'm not sure if this is normal. 
res = _mm_crc32_u64(res, word); return res; diff --git a/dbms/src/Core/Types.h b/dbms/src/Core/Types.h index 2da5274ae82..99491fe0518 100644 --- a/dbms/src/Core/Types.h +++ b/dbms/src/Core/Types.h @@ -9,7 +9,7 @@ namespace DB { -/** Типы данных для представления значений из БД в оперативке. +/** Data types for representing values from a database in RAM. */ STRONG_TYPEDEF(char, Null); @@ -74,7 +74,7 @@ template <> struct TypeName { static std::string get() { return "Fl template <> struct TypeName { static std::string get() { return "Float64"; } }; template <> struct TypeName { static std::string get() { return "String"; } }; -/// Этот тип не поддерживается СУБД, но используется в некоторых внутренних преобразованиях. +/// This type is not supported by the DBMS, but is used in some internal transformations. template <> struct TypeName{ static std::string get() { return "long double"; } }; } diff --git a/dbms/src/Core/toField.h b/dbms/src/Core/toField.h index 3185516ad15..b11dbdcae24 100644 --- a/dbms/src/Core/toField.h +++ b/dbms/src/Core/toField.h @@ -12,7 +12,7 @@ namespace DB { -/// Перевести что угодно в Field. +/// Transform anything to Field. template inline Field toField(const T & x) { diff --git a/dbms/src/DataStreams/AddingConstColumnBlockInputStream.h b/dbms/src/DataStreams/AddingConstColumnBlockInputStream.h index 7a430537f57..bf278ed9547 100644 --- a/dbms/src/DataStreams/AddingConstColumnBlockInputStream.h +++ b/dbms/src/DataStreams/AddingConstColumnBlockInputStream.h @@ -7,7 +7,7 @@ namespace DB { -/** Добавляет в блок материализованный const column с заданным значением. +/** Adds a materialized const column to the block with a specified value. 
*/ template class AddingConstColumnBlockInputStream : public IProfilingBlockInputStream diff --git a/dbms/src/DataStreams/AddingDefaultBlockInputStream.h b/dbms/src/DataStreams/AddingDefaultBlockInputStream.h index 10d3e5affab..d34d8b1adf4 100644 --- a/dbms/src/DataStreams/AddingDefaultBlockInputStream.h +++ b/dbms/src/DataStreams/AddingDefaultBlockInputStream.h @@ -11,8 +11,8 @@ namespace DB { -/** Добавляет в блок недостающие столбцы со значениями по-умолчанию. - * Эти столбцы - материалированные (не константы). +/** Adds missing columns to the block with default values. + * These columns are materialized (not constants). */ class AddingDefaultBlockInputStream : public IProfilingBlockInputStream { diff --git a/dbms/src/DataStreams/AddingDefaultBlockOutputStream.h b/dbms/src/DataStreams/AddingDefaultBlockOutputStream.h index fda6cd35c32..e8f1cb30b80 100644 --- a/dbms/src/DataStreams/AddingDefaultBlockOutputStream.h +++ b/dbms/src/DataStreams/AddingDefaultBlockOutputStream.h @@ -11,8 +11,8 @@ namespace DB { -/** Добавляет в блок недостающие столбцы со значениями по-умолчанию. - * Эти столбцы - материалированные (не константы). +/** Adds missing columns to the block with default values. + * These columns are materialized (not constants). */ class AddingDefaultBlockOutputStream : public IBlockOutputStream { diff --git a/dbms/src/DataStreams/AggregatingBlockInputStream.h b/dbms/src/DataStreams/AggregatingBlockInputStream.h index 195aa7d2282..8bc92c3c3ec 100644 --- a/dbms/src/DataStreams/AggregatingBlockInputStream.h +++ b/dbms/src/DataStreams/AggregatingBlockInputStream.h @@ -10,17 +10,17 @@ namespace DB { -/** Агрегирует поток блоков, используя заданные столбцы-ключи и агрегатные функции. - * Столбцы с агрегатными функциями добавляет в конец блока. - * Если final=false, агрегатные функции не финализируются, то есть, не заменяются на своё значение, а содержат промежуточное состояние вычислений. 
- * Это необходимо, чтобы можно было продолжить агрегацию (например, объединяя потоки частично агрегированных данных). +/** Aggregates the stream of blocks using the specified key columns and aggregate functions. + * Adds columns with aggregate functions to the end of the block. + * If final = false, the aggregate functions are not finalized, that is, they are not replaced by their value, but contain an intermediate state of calculations. + * This is necessary so that aggregation can continue (for example, by combining streams of partially aggregated data). */ class AggregatingBlockInputStream : public IProfilingBlockInputStream { public: - /** keys берутся из GROUP BY части запроса - * Агрегатные функции ищутся везде в выражении. - * Столбцы, соответствующие keys и аргументам агрегатных функций, уже должны быть вычислены. + /** keys are taken from the GROUP BY part of the query + * Aggregate functions are searched everywhere in the expression. + * Columns corresponding to keys and arguments of aggregate functions must already be computed. */ AggregatingBlockInputStream(BlockInputStreamPtr input_, const Aggregator::Params & params_, bool final_) : params(params_), aggregator(params), final(final_) @@ -46,7 +46,7 @@ protected: bool executed = false; - /// Для чтения сброшенных во временный файл данных. + /// To read the data that was flushed into the temporary data file. struct TemporaryFileStream { ReadBufferFromFile file_in; @@ -57,7 +57,7 @@ protected: }; std::vector> temporary_inputs; - /** Отсюда будем доставать готовые блоки после агрегации. */ + /** From here we will get the completed blocks after the aggregation. 
*/ std::unique_ptr impl; Logger * log = &Logger::get("AggregatingBlockInputStream"); diff --git a/dbms/src/DataStreams/AggregatingSortedBlockInputStream.h b/dbms/src/DataStreams/AggregatingSortedBlockInputStream.h index 8a5907e976e..455af621df1 100644 --- a/dbms/src/DataStreams/AggregatingSortedBlockInputStream.h +++ b/dbms/src/DataStreams/AggregatingSortedBlockInputStream.h @@ -12,11 +12,11 @@ namespace DB { -/** Соединяет несколько сортированных потоков в один. - * При этом, для каждой группы идущих подряд одинаковых значений первичного ключа (столбцов, по которым сортируются данные), - * сливает их в одну строку. При слиянии, производится доагрегация данных - слияние состояний агрегатных функций, - * соответствующих одному значению первичного ключа. Для столбцов, не входящих в первичный ключ, и не имеющих тип AggregateFunction, - * при слиянии, выбирается первое попавшееся значение. +/** Merges several sorted streams to one. + * During this for each group of consecutive identical values of the primary key (the columns by which the data is sorted), + * merges them into one row. When merging, the data is further aggregated - merge of states of aggregate functions, + * corresponding to one value of the primary key. For columns that are not part of the primary key and which do not have the AggregateFunction type, + * when merged, the first value encountered is selected. */ class AggregatingSortedBlockInputStream : public MergingSortedBlockInputStream { @@ -50,30 +50,30 @@ public: const SortDescription & getSortDescription() const override { return description; } protected: - /// Может возвращаться на 1 больше записей, чем max_block_size. + /// Can return 1 more records than max_block_size. Block readImpl() override; private: Logger * log = &Logger::get("AggregatingSortedBlockInputStream"); - /// Прочитали до конца. + /// Read finished. bool finished = false; - /// Столбцы с какими номерами надо аггрегировать. + /// Columns with which numbers should be aggregated. 
ColumnNumbers column_numbers_to_aggregate; ColumnNumbers column_numbers_not_to_aggregate; std::vector columns_to_aggregate; - RowRef current_key; /// Текущий первичный ключ. - RowRef next_key; /// Первичный ключ следующей строки. + RowRef current_key; /// The current primary key. + RowRef next_key; /// The primary key of the next row. - /** Делаем поддержку двух разных курсоров - с Collation и без. - * Шаблоны используем вместо полиморфных SortCursor'ов и вызовов виртуальных функций. + /** We support two different cursors - with Collation and without. + * Templates are used instead of polymorphic SortCursor and calls to virtual functions. */ template void merge(ColumnPlainPtrs & merged_columns, std::priority_queue & queue); - /** Извлечь все состояния аггрегатных функций и объединить с текущей группой. + /** Extract all states of aggregate functions and merge them with the current group. */ template void addRow(TSortCursor & cursor); diff --git a/dbms/src/DataStreams/AsynchronousBlockInputStream.h b/dbms/src/DataStreams/AsynchronousBlockInputStream.h index 37784e7b33a..7776b661e25 100644 --- a/dbms/src/DataStreams/AsynchronousBlockInputStream.h +++ b/dbms/src/DataStreams/AsynchronousBlockInputStream.h @@ -17,12 +17,13 @@ namespace CurrentMetrics namespace DB { -/** Выполняет другой BlockInputStream в отдельном потоке. - * Это служит для двух целей: - * 1. Позволяет сделать так, чтобы разные стадии конвейера выполнения запроса работали параллельно. - * 2. Позволяет не ждать до того, как данные будут готовы, а периодически проверять их готовность без блокировки. - * Это нужно, например, чтобы можно было во время ожидания проверить, не пришёл ли по сети пакет с просьбой прервать выполнение запроса. - * Также это позволяет выполнить несколько запросов одновременно. +/** Executes another BlockInputStream in a separate thread. + * This serves two purposes: + * 1. Allows you to make the different stages of the query execution pipeline work in parallel. + * 2. 
Allows you not to wait until the data is ready, and periodically check their readiness without blocking. + * This is necessary, for example, so that during the waiting period you can check if a packet + * has come over the network with a request to interrupt the execution of the query. + * It also allows you to execute multiple queries at the same time. */ class AsynchronousBlockInputStream : public IProfilingBlockInputStream { @@ -43,7 +44,7 @@ public: void readPrefix() override { - /// Не будем вызывать readPrefix у ребёнка, чтобы соответствующие действия совершались в отдельном потоке. + /// Do not call `readPrefix` on the child, so that the corresponding actions are performed in a separate thread. if (!started) { next(); @@ -64,8 +65,8 @@ public: } - /** Ждать готовность данных не более заданного таймаута. Запустить получение данных, если нужно. - * Если функция вернула true - данные готовы и можно делать read(); нельзя вызвать функцию сразу ещё раз. + /** Wait for the data to be ready no more than the specified timeout. Start receiving data if necessary. + * If the function returned true - the data is ready and you can do `read()`; you must not call this function again right away. 
*/ bool poll(UInt64 milliseconds) { @@ -97,13 +98,13 @@ protected: Block readImpl() override { - /// Если вычислений ещё не было - вычислим первый блок синхронно + /// If there were no calculations yet, calculate the first block synchronously if (!started) { calculate(current_memory_tracker); started = true; } - else /// Если вычисления уже идут - подождём результата + else /// If the calculations are already in progress - wait for the result pool.wait(); if (exception) @@ -113,7 +114,7 @@ protected: if (!res) return res; - /// Запустим вычисления следующего блока + /// Start the next block calculation block = Block(); next(); @@ -128,7 +129,7 @@ protected: } - /// Вычисления, которые могут выполняться в отдельном потоке + /// Calculations that can be performed in a separate thread void calculate(MemoryTracker * memory_tracker) { CurrentMetrics::Increment metric_increment{CurrentMetrics::QueryThread}; diff --git a/dbms/src/DataStreams/BinaryRowInputStream.h b/dbms/src/DataStreams/BinaryRowInputStream.h index 86b7d9fa1c9..af3cccf535a 100644 --- a/dbms/src/DataStreams/BinaryRowInputStream.h +++ b/dbms/src/DataStreams/BinaryRowInputStream.h @@ -10,7 +10,7 @@ class Block; class ReadBuffer; -/** Поток для ввода данных в бинарном построчном формате. +/** A stream for inputting data in a binary line-by-line format. */ class BinaryRowInputStream : public IRowInputStream { diff --git a/dbms/src/DataStreams/BinaryRowOutputStream.h b/dbms/src/DataStreams/BinaryRowOutputStream.h index f91b16cc79d..190e7e15afd 100644 --- a/dbms/src/DataStreams/BinaryRowOutputStream.h +++ b/dbms/src/DataStreams/BinaryRowOutputStream.h @@ -11,7 +11,7 @@ class IDataType; class WriteBuffer; -/** Поток для вывода данных в бинарном построчном формате. +/** A stream for outputting data in a binary line-by-line format. 
*/ class BinaryRowOutputStream : public IRowOutputStream { diff --git a/dbms/src/DataStreams/BlockExtraInfoInputStream.h b/dbms/src/DataStreams/BlockExtraInfoInputStream.h index be816fe78ac..fb3c7e03817 100644 --- a/dbms/src/DataStreams/BlockExtraInfoInputStream.h +++ b/dbms/src/DataStreams/BlockExtraInfoInputStream.h @@ -5,8 +5,8 @@ namespace DB { -/** Прибавляет к одному потоку дополнительную информацию о блоках, которая задана - * в качестве параметра конструктора. +/** Adds to one stream additional block information that is specified + * as the constructor parameter. */ class BlockExtraInfoInputStream : public IProfilingBlockInputStream { diff --git a/dbms/src/DataStreams/BlockIO.h b/dbms/src/DataStreams/BlockIO.h index c8c9333cb9d..0d704993ea3 100644 --- a/dbms/src/DataStreams/BlockIO.h +++ b/dbms/src/DataStreams/BlockIO.h @@ -21,14 +21,14 @@ struct BlockIO BlockInputStreamPtr in; BlockOutputStreamPtr out; - Block in_sample; /// Пример блока, который будет прочитан из in. - Block out_sample; /// Пример блока, которого нужно писать в out. + Block in_sample; /// Example of a block to be read from `in`. + Block out_sample; /// Example of a block to be written to `out`. /// Callbacks for query logging could be set here. std::function finish_callback; std::function exception_callback; - /// Вызывайте эти функции, если нужно логгировать запрос. + /// Call these functions if you want to log the request. void onFinish() { if (finish_callback) @@ -43,7 +43,7 @@ struct BlockIO BlockIO & operator= (const BlockIO & rhs) { - /// Обеспечиваем правильный порядок уничтожения. + /// We provide the correct order of destruction. 
out = nullptr; in = nullptr; process_list_entry = nullptr; diff --git a/dbms/src/DataStreams/BlockOutputStreamFromRowOutputStream.h b/dbms/src/DataStreams/BlockOutputStreamFromRowOutputStream.h index b52b0ffca60..63743f7827a 100644 --- a/dbms/src/DataStreams/BlockOutputStreamFromRowOutputStream.h +++ b/dbms/src/DataStreams/BlockOutputStreamFromRowOutputStream.h @@ -7,8 +7,8 @@ namespace DB { -/** Преобразует поток для записи данных по строкам в поток для записи данных по блокам. - * Наример, для записи текстового дампа. +/** Transforms a stream to write data by rows to a stream to write data by blocks. + * For example, to write a text dump. */ class BlockOutputStreamFromRowOutputStream : public IBlockOutputStream { diff --git a/dbms/src/DataStreams/BlockStreamProfileInfo.h b/dbms/src/DataStreams/BlockStreamProfileInfo.h index bb0c80f88a7..e95fd3b4f9b 100644 --- a/dbms/src/DataStreams/BlockStreamProfileInfo.h +++ b/dbms/src/DataStreams/BlockStreamProfileInfo.h @@ -16,29 +16,29 @@ class Block; class ReadBuffer; class WriteBuffer; -/// Информация для профайлинга. См. IProfilingBlockInputStream.h +/// Information for profiling. See IProfilingBlockInputStream.h struct BlockStreamProfileInfo { bool started = false; - Stopwatch total_stopwatch {CLOCK_MONOTONIC_COARSE}; /// Время с учётом ожидания + Stopwatch total_stopwatch {CLOCK_MONOTONIC_COARSE}; /// Time with waiting time - String stream_name; /// Короткое имя потока, для которого собирается информация + String stream_name; /// The short name of the stream for which information is collected size_t rows = 0; size_t blocks = 0; size_t bytes = 0; - /// Информация о вложенных потоках - для выделения чистого времени работы. + /// Information about nested streams - to calculate pure processing time. using BlockStreamProfileInfos = std::vector; BlockStreamProfileInfos nested_infos; - /// Собрать BlockStreamProfileInfo для ближайших в дереве источников с именем name. Пример; собрать все info для PartialSorting stream-ов. 
+ /// Collect BlockStreamProfileInfo for the nearest sources in the tree named `name`. Example; collect all info for PartialSorting streams. void collectInfosForStreamsWithName(const char * name, BlockStreamProfileInfos & res) const; - /** Получить число строк, если бы не было LIMIT-а. - * Если нет LIMIT-а - возвращается 0. - * Если запрос не содержит ORDER BY, то число может быть занижено - возвращается количество строк в блоках, которые были прочитаны до LIMIT-а. - * Если запрос содержит ORDER BY, то возвращается точное число строк, которое было бы, если убрать LIMIT. + /** Get the number of rows if there were no LIMIT. + * If there is no LIMIT, 0 is returned. + * If the query does not contain ORDER BY, the number can be underestimated - return the number of rows in blocks that were read before LIMIT reached. + * If the query contains an ORDER BY, then returns the exact number of rows as if LIMIT is removed from query. */ size_t getRowsBeforeLimit() const; bool hasAppliedLimit() const; @@ -57,10 +57,10 @@ struct BlockStreamProfileInfo private: void calculateRowsBeforeLimit() const; - /// Для этих полей сделаем accessor'ы, т.к. их необходимо предварительно вычислять. - mutable bool applied_limit = false; /// Применялся ли LIMIT + /// For these fields we make accessors, because they must be calculated beforehand. 
+ mutable bool applied_limit = false; /// Whether LIMIT was applied mutable size_t rows_before_limit = 0; - mutable bool calculated_rows_before_limit = false; /// Вычислялось ли поле rows_before_limit + mutable bool calculated_rows_before_limit = false; /// Whether the field rows_before_limit was calculated }; } diff --git a/dbms/src/DataStreams/BlocksListBlockInputStream.h b/dbms/src/DataStreams/BlocksListBlockInputStream.h index a83748f399f..b477e34d3ca 100644 --- a/dbms/src/DataStreams/BlocksListBlockInputStream.h +++ b/dbms/src/DataStreams/BlocksListBlockInputStream.h @@ -6,17 +6,17 @@ namespace DB { -/** Поток блоков, из которого можно прочитать следующий блок из явно предоставленного списка. - * Также смотрите OneBlockInputStream. +/** A stream of blocks from which you can read the next block from an explicitly provided list. + * Also see OneBlockInputStream. */ class BlocksListBlockInputStream : public IProfilingBlockInputStream { public: - /// Захватывает владение списком блоков. + /// Acquires the ownership of the block list. BlocksListBlockInputStream(BlocksList && list_) : list(std::move(list_)), it(list.begin()), end(list.end()) {} - /// Использует лежащий где-то ещё список блоков. + /// Uses a list of blocks lying somewhere else. BlocksListBlockInputStream(BlocksList::iterator & begin_, BlocksList::iterator & end_) : it(begin_), end(end_) {} diff --git a/dbms/src/DataStreams/CSVRowInputStream.h b/dbms/src/DataStreams/CSVRowInputStream.h index 61bf3e11fb5..16d1f1056b8 100644 --- a/dbms/src/DataStreams/CSVRowInputStream.h +++ b/dbms/src/DataStreams/CSVRowInputStream.h @@ -9,14 +9,14 @@ namespace DB class ReadBuffer; -/** Поток для ввода данных в формате csv. - * Не соответствует https://tools.ietf.org/html/rfc4180 потому что пропускает пробелы и табы между значениями. +/** A stream for inputting data in csv format. + * Does not conform with https://tools.ietf.org/html/rfc4180 because it skips spaces and tabs between values. 
*/ class CSVRowInputStream : public IRowInputStream { public: - /** with_names - в первой строке заголовок с именами столбцов - * with_types - на следующей строке заголовок с именами типов + /** with_names - in the first line the header with column names + * with_types - on the next line header with type names */ CSVRowInputStream(ReadBuffer & istr_, const Block & sample_, const char delimiter_, bool with_names_ = false, bool with_types_ = false); @@ -35,11 +35,11 @@ private: bool with_types; DataTypes data_types; - /// Для удобной диагностики в случае ошибки. + /// For convenient diagnostics in case of an error. size_t row_num = 0; - /// Сколько байт было считано, не считая тех, что ещё в буфере. + /// How many bytes were read, not counting those that are still in the buffer. size_t bytes_read_at_start_of_buffer_on_current_row = 0; size_t bytes_read_at_start_of_buffer_on_prev_row = 0; diff --git a/dbms/src/DataStreams/CSVRowOutputStream.h b/dbms/src/DataStreams/CSVRowOutputStream.h index 81f536613af..161eab16985 100644 --- a/dbms/src/DataStreams/CSVRowOutputStream.h +++ b/dbms/src/DataStreams/CSVRowOutputStream.h @@ -10,14 +10,14 @@ namespace DB class WriteBuffer; -/** Поток для вывода данных в формате csv. - * Не соответствует https://tools.ietf.org/html/rfc4180 потому что использует LF, а не CR LF. +/** The stream for outputting data in csv format. + * Does not conform with https://tools.ietf.org/html/rfc4180 because it uses LF, not CR LF. 
*/ class CSVRowOutputStream : public IRowOutputStream { public: - /** with_names - выводить в первой строке заголовок с именами столбцов - * with_types - выводить на следующей строке заголовок с именами типов + /** with_names - output in the first line a header with column names + * with_types - output in the next line header with the names of the types */ CSVRowOutputStream(WriteBuffer & ostr_, const Block & sample_, bool with_names_ = false, bool with_types_ = false); diff --git a/dbms/src/DataStreams/CastTypeBlockInputStream.cpp b/dbms/src/DataStreams/CastTypeBlockInputStream.cpp new file mode 100644 index 00000000000..c8c7696fb16 --- /dev/null +++ b/dbms/src/DataStreams/CastTypeBlockInputStream.cpp @@ -0,0 +1,128 @@ +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + + +CastTypeBlockInputStream::CastTypeBlockInputStream( + const Context & context_, + BlockInputStreamPtr input_, + const Block & in_sample_, + const Block & out_sample_) + : context(context_) +{ + collectDifferent(in_sample_, out_sample_); + cast_functions.resize(in_sample_.columns()); + children.push_back(input_); +} + +String CastTypeBlockInputStream::getName() const +{ + return "CastType"; +} + +String CastTypeBlockInputStream::getID() const +{ + std::stringstream res; + res << "CastType(" << children.back()->getID() << ")"; + return res.str(); +} + +Block CastTypeBlockInputStream::readImpl() +{ + Block block = children.back()->read(); + + if (!block || cast_types.empty()) + return block; + + size_t block_size = block.columns(); + + if (block_size != cast_types.size()) + { + LOG_ERROR(log, "Number of columns do not match, skipping cast"); + return block; + } + + Block res; + + for (size_t i = 0; i < block_size; ++i) + { + const auto & elem = block.getByPosition(i); + + if (bool(cast_types[i])) + { + const auto & type = cast_types[i]->type; + Block temporary_block + { + { + elem.column, + elem.type, + elem.name + }, + { + std::make_shared(1, type->getName()), + 
std::make_shared(), + "" + }, + { + nullptr, + cast_types[i]->type, + "" + } + }; + + FunctionPtr & cast_function = cast_functions[i]; + + /// Initialize function. + if (!cast_function) + { + cast_function = FunctionFactory::instance().get("CAST", context); + + DataTypePtr unused_return_type; + ColumnsWithTypeAndName arguments{ temporary_block.getByPosition(0), temporary_block.getByPosition(1) }; + std::vector unused_prerequisites; + + /// Prepares function to execution. TODO It is not obvious. + cast_function->getReturnTypeAndPrerequisites(arguments, unused_return_type, unused_prerequisites); + } + + cast_function->execute(temporary_block, {0, 1}, 2); + + res.insert({ + temporary_block.getByPosition(2).column, + cast_types[i]->type, + cast_types[i]->name}); + } + else + { + res.insert(elem); + } + } + + return res; +} + +void CastTypeBlockInputStream::collectDifferent(const Block & in_sample, const Block & out_sample) +{ + size_t in_size = in_sample.columns(); + cast_types.resize(in_size); + for (size_t i = 0; i < in_size; ++i) + { + const auto & in_elem = in_sample.getByPosition(i); + const auto & out_elem = out_sample.getByPosition(i); + + /// Force conversion if source and destination types is different. + if (!out_elem.type->equals(*in_elem.type)) + { + cast_types[i] = NameAndTypePair(out_elem.name, out_elem.type); + } + } +} + +} diff --git a/dbms/src/DataStreams/CastTypeBlockInputStream.h b/dbms/src/DataStreams/CastTypeBlockInputStream.h new file mode 100644 index 00000000000..2f8c6adfb31 --- /dev/null +++ b/dbms/src/DataStreams/CastTypeBlockInputStream.h @@ -0,0 +1,41 @@ +#pragma once + +#include + +#include +#include +#include + + +namespace DB +{ + +class IFunction; + +/// Implicitly converts string and numeric values to Enum, numeric types to other numeric types. 
+class CastTypeBlockInputStream : public IProfilingBlockInputStream +{ +public: + CastTypeBlockInputStream(const Context & context_, + BlockInputStreamPtr input_, + const Block & in_sample_, + const Block & out_sample_); + + String getName() const override; + + String getID() const override; + +protected: + Block readImpl() override; + +private: + void collectDifferent(const Block & in_sample, const Block & out_sample); + +private: + const Context & context; + std::vector> cast_types; + std::vector> cast_functions; /// Used to perform type conversions. + Logger * log = &Logger::get("CastTypeBlockInputStream"); +}; + +} diff --git a/dbms/src/DataStreams/CollapsingFinalBlockInputStream.h b/dbms/src/DataStreams/CollapsingFinalBlockInputStream.h index 42ab1b96f22..4a867f96f2d 100644 --- a/dbms/src/DataStreams/CollapsingFinalBlockInputStream.h +++ b/dbms/src/DataStreams/CollapsingFinalBlockInputStream.h @@ -8,9 +8,9 @@ namespace DB { -/// Схлопывает одинаковые строки с противоположным знаком примерно как CollapsingSortedBlockInputStream. -/// Выдает строки в произвольном порядке (входные потоки по-прежнему должны быть упорядочены). -/// Выдает только строки с положительным знаком. +/// Collapses the same rows with the opposite sign roughly like CollapsingSortedBlockInputStream. +/// Outputs the rows in arbitrary order (the input streams must still be ordered). +/// Outputs only rows with a positive sign. class CollapsingFinalBlockInputStream : public IProfilingBlockInputStream { public: @@ -79,31 +79,31 @@ private: throw Exception("Sign column must have type Int8", ErrorCodes::BAD_TYPE_OF_FIELD); rows = sign_column->size(); - /// Заполняется целиком нулями. Потом выставляются единички в позициях строчек, которых нужно оставить. + /// Filled entirely with zeros. Then `1` are set in the positions of the rows to be left. filter.resize_fill(rows); } Block block; - /// Строки с одинаковым ключом будут упорядочены по возрастанию stream_index. 
+ /// Rows with the same key will be sorted in ascending order of stream_index. size_t stream_index; size_t rows; - /// Какие строки нужно оставить. Заполняется при слиянии потоков. + /// Which rows should be kept. Filled when the streams are merged. IColumn::Filter filter; - /// Указывают в block. + /// Point to `block`. ConstColumnPlainPtrs sort_columns; const ColumnInt8 * sign_column; - /// Когда достигает нуля, блок можно выдавать в ответ. + /// When it reaches zero, the block can be outputted in response. int refcount = 0; - /// Куда положить блок, когда он готов попасть в ответ. + /// Where to put the block when it is ready to be outputted in response. BlockPlainPtrs * output_blocks; }; - /// При удалении последней ссылки на блок, добавляет блок в output_blocks. + /// When deleting the last block reference, adds a block to `output_blocks`. class MergingBlockPtr { public: @@ -135,7 +135,7 @@ private: destroy(); } - /// Обнулить указатель и не добавлять блок в output_blocks. + /// Zero the pointer and do not add a block to output_blocks. void cancel() { if (ptr) @@ -194,7 +194,7 @@ private: return block->stream_index > rhs.block->stream_index; } - /// Не согласован с operator< : не учитывает order. + /// Not consistent with operator< : does not consider order. bool equal(const Cursor & rhs) const { if (!block || !rhs.block) @@ -215,7 +215,7 @@ private: return block->sign_column->getData()[pos]; } - /// Помечает, что эту строку нужно взять в ответ. + /// Indicates that this row should be outputted in response. void addToFilter() { block->filter[pos] = 1; @@ -245,16 +245,16 @@ private: Queue queue; - Cursor previous; /// Текущий первичный ключ. - Cursor last_positive; /// Последняя положительная строка для текущего первичного ключа. + Cursor previous; /// The current primary key. + Cursor last_positive; /// The last positive row for the current primary key. - size_t count_positive = 0; /// Количество положительных строк для текущего первичного ключа. 
- size_t count_negative = 0; /// Количество отрицательных строк для текущего первичного ключа. - bool last_is_positive = false; /// true, если последняя строка для текущего первичного ключа положительная. + size_t count_positive = 0; /// The number of positive rows for the current primary key. + size_t count_negative = 0; /// The number of negative rows for the current primary key. + bool last_is_positive = false; /// true if the last row for the current primary key is positive. - size_t count_incorrect_data = 0; /// Чтобы не писать в лог слишком много сообщений об ошибке. + size_t count_incorrect_data = 0; /// To avoid writing too many error messages to the log. - /// Посчитаем, сколько блоков получили на вход и отдали на выход. + /// Count the number of blocks fetched and output. size_t blocks_fetched = 0; size_t blocks_output = 0; diff --git a/dbms/src/DataStreams/CollapsingSortedBlockInputStream.h b/dbms/src/DataStreams/CollapsingSortedBlockInputStream.h index c229246813f..0f082fbcb80 100644 --- a/dbms/src/DataStreams/CollapsingSortedBlockInputStream.h +++ b/dbms/src/DataStreams/CollapsingSortedBlockInputStream.h @@ -8,17 +8,17 @@ namespace DB { -/** Соединяет несколько сортированных потоков в один. - * При этом, для каждой группы идущих подряд одинаковых значений первичного ключа (столбцов, по которым сортируются данные), - * оставляет не более одной строки со значением столбца sign_column = -1 ("отрицательной строки") - * и не более одиной строки со значением столбца sign_column = 1 ("положительной строки"). - * То есть - производит схлопывание записей из лога изменений. +/** Merges several sorted streams into one. + * For each group of consecutive identical values of the primary key (the columns by which the data is sorted), + * keeps no more than one row with the value of the column `sign_column = -1` ("negative row") + * and no more than one row with the value of the column `sign_column = 1` ("positive row").
+ * That is, it collapses the records from the change log. * - * Если количество положительных и отрицательных строк совпадает, и последняя строка положительная - то пишет первую отрицательную и последнюю положительную строку. - * Если количество положительных и отрицательных строк совпадает, и последняя строка отрицательная - то ничего не пишет. - * Если положительных на 1 больше, чем отрицательных - то пишет только последнюю положительную строку. - * Если отрицательных на 1 больше, чем положительных - то пишет только первую отрицательную строку. - * Иначе - логическая ошибка. + * If the number of positive and negative rows is the same, and the last row is positive, then the first negative and last positive rows are written. + * If the number of positive and negative rows is the same, and the last row is negative, then nothing is written. + * If there is one more positive row than negative rows, then only the last positive row is written. + * If there is one more negative row than positive rows, then only the first negative row is written. + * Otherwise, it is a logical error. */ class CollapsingSortedBlockInputStream : public MergingSortedBlockInputStream { @@ -50,7 +50,7 @@ public: } protected: - /// Может возвращаться на 1 больше записей, чем max_block_size. + /// May return one more record than max_block_size. Block readImpl() override; private: @@ -59,21 +59,21 @@ private: Logger * log = &Logger::get("CollapsingSortedBlockInputStream"); - /// Прочитали до конца. + /// Reading is finished. bool finished = false; - RowRef current_key; /// Текущий первичный ключ. - RowRef next_key; /// Первичный ключ следующей строки. + RowRef current_key; /// The current primary key. + RowRef next_key; /// The primary key of the next row. - RowRef first_negative; /// Первая отрицательная строка для текущего первичного ключа. - RowRef last_positive; /// Последняя положительная строка для текущего первичного ключа. - RowRef last_negative; /// Последняя отрицательная.
Сорраняется только если ни одной строки в ответ еще не выписано. + RowRef first_negative; /// The first negative row for the current primary key. + RowRef last_positive; /// The last positive row for the current primary key. + RowRef last_negative; /// Last negative row. It is only stored if no rows have been written to the output yet. - size_t count_positive = 0; /// Количество положительных строк для текущего первичного ключа. - size_t count_negative = 0; /// Количество отрицательных строк для текущего первичного ключа. - bool last_is_positive = false; /// true, если последняя строка для текущего первичного ключа положительная. + size_t count_positive = 0; /// The number of positive rows for the current primary key. + size_t count_negative = 0; /// The number of negative rows for the current primary key. + bool last_is_positive = false; /// true if the last row for the current primary key is positive. - size_t count_incorrect_data = 0; /// Чтобы не писать в лог слишком много сообщений об ошибке. + size_t count_incorrect_data = 0; /// To avoid writing too many error messages to the log. size_t blocks_written = 0; @@ -83,13 +83,13 @@ private: size_t last_positive_pos = 0; /// Global row number of last_positive size_t last_negative_pos = 0; /// Global row number of last_negative - /** Делаем поддержку двух разных курсоров - с Collation и без. - * Шаблоны используем вместо полиморфных SortCursor'ов и вызовов виртуальных функций. + /** We support two different cursors - with Collation and without. + * Templates are used instead of polymorphic SortCursors and calls to virtual functions. */ template void merge(ColumnPlainPtrs & merged_columns, std::priority_queue & queue); - /// Вставить в результат строки для текущего первичного ключа. + /// Insert the rows for the current primary key into the result.
void insertRows(ColumnPlainPtrs & merged_columns, size_t & merged_rows, bool last_in_stream = false); void reportIncorrectData(); diff --git a/dbms/src/DataStreams/ConcatBlockInputStream.h b/dbms/src/DataStreams/ConcatBlockInputStream.h index 5133265208b..e8da47a7c02 100644 --- a/dbms/src/DataStreams/ConcatBlockInputStream.h +++ b/dbms/src/DataStreams/ConcatBlockInputStream.h @@ -7,9 +7,9 @@ namespace DB { -/** Объединяет несколько источников в один. - * В отличие от UnionBlockInputStream, делает это последовательно. - * Блоки разных источников не перемежаются друг с другом. +/** Combines several sources into one. + * Unlike UnionBlockInputStream, it does this sequentially. + * Blocks of different sources are not interleaved with each other. */ class ConcatBlockInputStream : public IProfilingBlockInputStream { @@ -31,7 +31,7 @@ public: for (size_t i = 0; i < children.size(); ++i) children_ids[i] = children[i]->getID(); - /// Будем считать, что порядок конкатенации блоков не имеет значения. + /// Let's assume that the order of concatenation of blocks does not matter. std::sort(children_ids.begin(), children_ids.end()); for (size_t i = 0; i < children_ids.size(); ++i) diff --git a/dbms/src/DataStreams/CreatingSetsBlockInputStream.h b/dbms/src/DataStreams/CreatingSetsBlockInputStream.h index d9d096f6a92..c3b5bfcf3b7 100644 --- a/dbms/src/DataStreams/CreatingSetsBlockInputStream.h +++ b/dbms/src/DataStreams/CreatingSetsBlockInputStream.h @@ -10,9 +10,9 @@ namespace Poco { class Logger; } namespace DB { -/** Отдаёт без изменений данные из потока блоков, но - * в функции readPrefix или перед чтением первого блока - * инициализирует все переданные множества. +/** Returns the data from the stream of blocks without changes, but + * in the `readPrefix` function or before reading the first block + * initializes all the passed sets. 
*/ class CreatingSetsBlockInputStream : public IProfilingBlockInputStream { @@ -44,7 +44,7 @@ public: for (size_t i = 0; i < children.size(); ++i) children_ids[i] = children[i]->getID(); - /// Будем считать, что порядок создания множеств не имеет значения. + /// Let's assume that the order of creating sets does not matter. std::sort(children_ids.begin(), children_ids.end() - 1); for (size_t i = 0; i < children_ids.size(); ++i) @@ -54,7 +54,7 @@ public: return res.str(); } - /// Берёт totals только из основного источника, а не из источников подзапросов. + /// Takes `totals` only from the main source, not from subquery sources. const Block & getTotals() override; protected: diff --git a/dbms/src/DataStreams/EmptyBlockOutputStream.h b/dbms/src/DataStreams/EmptyBlockOutputStream.h index 340be887939..b84e8d483e6 100644 --- a/dbms/src/DataStreams/EmptyBlockOutputStream.h +++ b/dbms/src/DataStreams/EmptyBlockOutputStream.h @@ -12,8 +12,8 @@ namespace ErrorCodes extern const int CANNOT_WRITE_TO_EMPTY_BLOCK_OUTPUT_STREAM; } -/** При попытке записать в этот поток блоков, кидает исключение. - * Используется там, где, в общем случае, нужно передать поток блоков, но в некоторых случаях, он не должен быть использован. +/** When trying to write blocks to this stream of blocks, throws an exception. + * Used where, in general, you need to pass a stream of blocks, but in some cases, it should not be used. */ class EmptyBlockOutputStream : public IBlockOutputStream { diff --git a/dbms/src/DataStreams/ExpressionBlockInputStream.h b/dbms/src/DataStreams/ExpressionBlockInputStream.h index c19921e1b67..6e5b166bad8 100644 --- a/dbms/src/DataStreams/ExpressionBlockInputStream.h +++ b/dbms/src/DataStreams/ExpressionBlockInputStream.h @@ -8,10 +8,10 @@ namespace DB class ExpressionActions; -/** Выполняет над блоком вычисление некоторого выражения. - * Выражение состоит из идентификаторов столбцов из блока, констант, обычных функций. 
- * Например: hits * 2 + 3, url LIKE '%yandex%' - * Выражение обрабатывает каждую строку независимо от других. +/** Executes a certain expression over the block. + * The expression consists of column identifiers from the block, constants, common functions. + * For example: hits * 2 + 3, url LIKE '%yandex%' + * The expression processes each row independently of the others. */ class ExpressionBlockInputStream : public IProfilingBlockInputStream { diff --git a/dbms/src/DataStreams/FilterBlockInputStream.h b/dbms/src/DataStreams/FilterBlockInputStream.h index 25767b22dc2..3f0cda420c6 100644 --- a/dbms/src/DataStreams/FilterBlockInputStream.h +++ b/dbms/src/DataStreams/FilterBlockInputStream.h @@ -9,9 +9,9 @@ namespace DB class ExpressionActions; -/** Реализует операции WHERE, HAVING. - * На вход подаётся поток блоков и выражение, добавляющее в блок один столбец типа ColumnUInt8, содержащий условия фильтрации. - * Выражение вычисляется и возвращается поток блоков, в котором содержатся только отфильтрованные строки. +/** Implements WHERE, HAVING operations. + * A stream of blocks and an expression, which adds to the block one ColumnUInt8 column containing the filtering conditions, are passed as input. + * The expression is evaluated and a stream of blocks is returned, which contains only the filtered rows. */ class FilterBlockInputStream : public IProfilingBlockInputStream { @@ -19,7 +19,7 @@ private: using ExpressionActionsPtr = std::shared_ptr; public: - /// filter_column_ - номер столбца с условиями фильтрации. + /// filter_column_ - the number of the column with filter conditions. 
FilterBlockInputStream(BlockInputStreamPtr input_, ExpressionActionsPtr expression_, ssize_t filter_column_); FilterBlockInputStream(BlockInputStreamPtr input_, ExpressionActionsPtr expression_, const String & filter_column_name_); diff --git a/dbms/src/DataStreams/ForkBlockInputStreams.h b/dbms/src/DataStreams/ForkBlockInputStreams.h index 0fc7cd6a88d..e10e0d3f8fd 100644 --- a/dbms/src/DataStreams/ForkBlockInputStreams.h +++ b/dbms/src/DataStreams/ForkBlockInputStreams.h @@ -7,27 +7,27 @@ namespace DB { -/** Позволяет из одного источника сделать несколько. - * Используется для однопроходного выполнения сразу нескольких запросов. +/** Allows you to make several sources from one. + * Used for single-pass execution of several queries at once. * - * Несколько полученных источников должны читаться из разных потоков! - * Расходует O(1) оперативки (не буферизует все данные). - * Для этого, чтения из разных полученных источников синхронизируются: - * чтение следующего блока блокируется, пока все источники не прочитают текущий блок. + * The resulting sources must be read from different threads! + * Uses O(1) RAM (does not buffer all data). + * To achieve this, reads from the different resulting sources are synchronized: + * reading of the next block is blocked until all sources have read the current block. */ class ForkBlockInputStreams : private boost::noncopyable { public: ForkBlockInputStreams(BlockInputStreamPtr source_) : source(source_) {} - /// Создать источник. Вызывайте функцию столько раз, сколько размноженных источников вам нужно. + /// Create a source. Call this function once for each forked source you need. BlockInputStreamPtr createInput() { destinations.emplace_back(std::make_shared(1)); return destinations.back(); } - /// Перед тем, как из полученных источников можно будет читать, необходимо "запустить" эту конструкцию. + /// Before the resulting sources can be read from, this construct must be "run".
void run() { while (1) @@ -56,12 +56,12 @@ public: } private: - /// Откуда читать. + /// From where to read. BlockInputStreamPtr source; - /** Размноженные источники. - * Сделаны на основе очереди небольшой длины. - * Блок из source кладётся в каждую очередь. + /** Forked sources. + * Made on the basis of a queue of small length. + * A block from `source` is put in each queue. */ using Destination = std::shared_ptr; using Destinations = std::list; diff --git a/dbms/src/DataStreams/FormatFactory.h b/dbms/src/DataStreams/FormatFactory.h index a58f778df4f..c2aaa9a9fb7 100644 --- a/dbms/src/DataStreams/FormatFactory.h +++ b/dbms/src/DataStreams/FormatFactory.h @@ -9,8 +9,8 @@ namespace DB class Context; -/** Позволяет создать IBlockInputStream или IBlockOutputStream по названию формата. - * Замечание: формат и сжатие - независимые вещи. +/** Allows to create an IBlockInputStream or IBlockOutputStream by the name of the format. + * Note: format and compression are independent things. */ class FormatFactory { diff --git a/dbms/src/DataStreams/GraphiteRollupSortedBlockInputStream.h b/dbms/src/DataStreams/GraphiteRollupSortedBlockInputStream.h index 5a7a6a74e70..0c289cb025d 100644 --- a/dbms/src/DataStreams/GraphiteRollupSortedBlockInputStream.h +++ b/dbms/src/DataStreams/GraphiteRollupSortedBlockInputStream.h @@ -111,16 +111,16 @@ namespace Graphite }; } -/** Соединяет несколько сортированных потоков в один. +/** Merges several sorted streams into one. * - * При этом, для каждой группы идущих подряд одинаковых значений столбца path, - * и одинаковых значений time с округлением до некоторой точности - * (где точность округления зависит от набора шаблонов на path - * и количества времени, прошедшего от time до заданного времени), - * оставляет одну строку, - * выполняя округление времени, - * слияние значений value, используя заданные агрегатные функции, - * а также оставляя максимальное значение столбца version. 
+ * For each group of consecutive identical values of the `path` column, + * and the same `time` values, rounded to some precision + * (where rounding accuracy depends on the template set for `path` + * and the amount of time elapsed from `time` to the specified time), + * keeps one line, + * performing the rounding of time, + * merge `value` values using the specified aggregate functions, + * as well as keeping the maximum value of the `version` column. */ class GraphiteRollupSortedBlockInputStream : public MergingSortedBlockInputStream { @@ -200,14 +200,14 @@ private: template void merge(ColumnPlainPtrs & merged_columns, std::priority_queue & queue); - /// Вставить значения в результирующие столбцы, которые не будут меняться в дальнейшем. + /// Insert the values into the resulting columns, which will not be changed in the future. template void startNextRow(ColumnPlainPtrs & merged_columns, TSortCursor & cursor); - /// Вставить в результирующие столбцы вычисленные значения time, value, version по последней группе строк. + /// Insert the calculated `time`, `value`, `version` values into the resulting columns by the last group of rows. void finishCurrentRow(ColumnPlainPtrs & merged_columns); - /// Обновить состояние агрегатной функции новым значением value. + /// Update the state of the aggregate function with the new `value`. void accumulateRow(RowRef & row); }; diff --git a/dbms/src/DataStreams/IBlockInputStream.h b/dbms/src/DataStreams/IBlockInputStream.h index e0d3c44f0ef..24c4692e3f7 100644 --- a/dbms/src/DataStreams/IBlockInputStream.h +++ b/dbms/src/DataStreams/IBlockInputStream.h @@ -31,53 +31,53 @@ namespace ErrorCodes } -/** Коллбэк для отслеживания прогресса выполнения запроса. - * Используется в IProfilingBlockInputStream и Context-е. - * Функция принимает количество строк в последнем блоке, количество байт в последнем блоке. - * Следует иметь ввиду, что колбэк может вызываться из разных потоков. +/** Callback to track the progress of the query. 
+ * Used in IProfilingBlockInputStream and Context. + * The function takes the number of rows in the last block, the number of bytes in the last block. + * Note that the callback can be called from different threads. */ using ProgressCallback = std::function; -/** Интерфейс потока для чтения данных по блокам из БД. - * Реляционные операции предполагается делать также реализациями этого интерфейса. +/** The stream interface for reading data by blocks from the database. + * Relational operations are supposed to be done also as implementations of this interface. */ class IBlockInputStream : private boost::noncopyable { public: IBlockInputStream() {} - /** Прочитать следующий блок. - * Если блоков больше нет - вернуть пустой блок (для которого operator bool возвращает false). + /** Read next block. + * If there are no more blocks, return an empty block (for which operator `bool` returns false). */ virtual Block read() = 0; - /** Получить информацию про последний полученный блок. + /** Get information about the last block received. */ virtual BlockExtraInfo getBlockExtraInfo() const { throw Exception("Method getBlockExtraInfo is not supported by the data stream " + getName(), ErrorCodes::NOT_IMPLEMENTED); } - /** Прочитать что-нибудь перед началом всех данных или после конца всех данных. - * В функции readSuffix можно реализовать финализацию, которая может привести к исключению. - * readPrefix() должна вызываться до первого вызова read(). - * readSuffix() должна вызываться после того, как read() вернула пустой блок, или после вызова cancel(), но не во время выполнения read(). + /** Read something before starting all data or after the end of all data. + * In the `readSuffix` function, you can implement a finalization that can lead to an exception. + * readPrefix() must be called before the first call to read(). + * readSuffix() should be called after read() returns an empty block, or after a call to cancel(), but not during read() execution. 
*/ virtual void readPrefix() {} virtual void readSuffix() {} virtual ~IBlockInputStream() {} - /** Для вывода дерева преобразований потока данных (плана выполнения запроса). + /** To output the data stream transformation tree (query execution plan). */ virtual String getName() const = 0; - /** Уникальный идентификатор части конвейера выполнения запроса. - * Источники с одинаковым идентификатором считаются идентичными - * (выдающими одинаковые данные), и могут быть заменены на один источник - * при одновременном выполнении сразу нескольких запросов. - * Если источник нельзя склеивать ни с каким другим - верните в качестве идентификатора адрес объекта. + /** The unique identifier of the pipeline part of the query execution. + * Sources with the same identifier are considered identical + * (producing the same data), and can be replaced by one source + * if several queries are executed simultaneously. + * If the source can not be glued together with any other - return the object's address as an identifier. */ virtual String getID() const = 0; @@ -92,18 +92,18 @@ public: void dumpTree(std::ostream & ostr, size_t indent = 0, size_t multiplier = 1); - /// Получить листовые источники (не считая этот). + /// Get leaf sources (not including this one). BlockInputStreams getLeaves(); - /// Получить количество строк и байт, прочитанных в листовых источниках. + /// Get the number of rows and bytes read in the leaf sources. void getLeafRowsBytes(size_t & rows, size_t & bytes); - /** Проверить глубину конвейера. - * Если задано max_depth и глубина больше - кинуть исключение. + /** Check the depth of the pipeline. + * If max_depth is specified and the `depth` is greater - throw an exception. */ size_t checkDepth(size_t max_depth) const; - /** Не давать изменить таблицу, пока жив поток блоков. + /** Do not allow to change the table while the blocks stream is alive. 
*/ void addTableLock(const TableStructureReadLockPtr & lock) { table_locks.push_back(lock); } @@ -117,8 +117,8 @@ private: size_t checkDepthImpl(size_t max_depth, size_t level) const; - /** Получить текст, который идентифицирует этот источник и всё поддерево. - * В отличие от getID - без учёта параметров. + /** Get text that identifies this source and the entire subtree. + * Unlike getID - without taking into account the parameters. */ String getTreeID() const; }; diff --git a/dbms/src/DataStreams/IProfilingBlockInputStream.h b/dbms/src/DataStreams/IProfilingBlockInputStream.h index 17806f79785..88834789464 100644 --- a/dbms/src/DataStreams/IProfilingBlockInputStream.h +++ b/dbms/src/DataStreams/IProfilingBlockInputStream.h @@ -19,66 +19,66 @@ class IProfilingBlockInputStream; using ProfilingBlockInputStreamPtr = std::shared_ptr; -/** Смотрит за тем, как работает источник блоков. - * Позволяет получить информацию для профайлинга: - * строк в секунду, блоков в секунду, мегабайт в секунду и т. п. - * Позволяет остановить чтение данных (во вложенных источниках). +/** Watches out at how the source of the blocks works. + * Lets you get information for profiling: + * rows per second, blocks per second, megabytes per second, etc. + * Allows you to stop reading data (in nested sources). */ class IProfilingBlockInputStream : public IBlockInputStream { public: Block read() override final; - /** Реализация по-умолчанию вызывает readPrefixImpl() у себя, а затем readPrefix() у всех детей рекурсивно. - * Есть случаи, когда вы не хотите, чтобы readPrefix у детей вызывался синхронно, в этой функции, - * а хотите, чтобы они вызывались, например, в отдельных потоках (для распараллеливания инициализации детей). - * Тогда перегрузите функцию readPrefix. + /** The default implementation calls readPrefixImpl() on itself, and then readPrefix() recursively for all children. 
+ * There are cases when you do not want `readPrefix` of children to be called synchronously, in this function, + * but you want them to be called, for example, in separate threads (for parallel initialization of children). + * Then override the `readPrefix` function. */ void readPrefix() override; - /** Реализация по-умолчанию вызывает рекурсивно readSuffix() у всех детей, а затем readSuffixImpl() у себя. - * Если этот поток вызывает у детей read() в отдельном потоке, этот поведение обычно неверно: - * readSuffix() у ребенка нельзя вызывать в момент, когда read() того же ребенка выполняется в другом потоке. - * В таком случае нужно переопределить этот метод, чтобы readSuffix() у детей вызывался, например, после соединения потоков. + /** The default implementation recursively calls readSuffix() on all children, and then readSuffixImpl() on itself. + * If this stream calls read() in children in a separate thread, this behavior is usually incorrect: + * readSuffix() of the child cannot be called at the moment when the same child's read() is executed in another thread. + * In this case, you need to override this method so that readSuffix() in children is called, for example, after joining the threads. */ void readSuffix() override; /// Get information about execution speed. const BlockStreamProfileInfo & getProfileInfo() const { return info; } - /** Получить "тотальные" значения. - * Реализация по-умолчанию берёт их из себя или из первого дочернего источника, в котором они есть. - * Переопределённый метод может провести некоторые вычисления. Например, применить выражение к totals дочернего источника. - * Тотальных значений может не быть - тогда возвращается пустой блок. * - * Вызывайте этот метод только после получения всех данных с помощью read, - * иначе будут проблемы, если какие-то данные в это же время вычисляются в другом потоке. + /** Get "total" values. + * The default implementation takes them from itself or from the first child source that has them. + * The overridden method can perform some calculations. For example, apply an expression to the `totals` of the child source.
+ * There can be no total values - then an empty block is returned. * - * Вызывайте этот метод только после получения всех данных с помощью read, - * иначе будут проблемы, если какие-то данные в это же время вычисляются в другом потоке. + * Call this method only after all the data has been retrieved with `read`, + * otherwise there will be problems if any data at the same time is computed in another thread. */ virtual const Block & getTotals(); - /// То же самое для минимумов и максимумов. + /// The same for minimums and maximums. const Block & getExtremes() const; - /** Установить колбэк прогресса выполнения. - * Колбэк пробрасывается во все дочерние источники. - * По-умолчанию, он вызывается для листовых источников, после каждого блока. - * (Но это может быть переопределено в методе progress()) - * Функция принимает количество строк в последнем блоке, количество байт в последнем блоке. - * Следует иметь ввиду, что колбэк может вызываться из разных потоков. + /** Set the execution progress bar callback. + * The callback is passed to all child sources. + * By default, it is called for leaf sources, after each block. + * (But this can be overridden in the progress() method) + * The function takes the number of rows in the last block, the number of bytes in the last block. + * Note that the callback can be called from different threads. */ void setProgressCallback(ProgressCallback callback); - /** В этом методе: - * - вызывается колбэк прогресса; - * - обновляется статус выполнения запроса в ProcessList-е; - * - проверяются ограничения и квоты, которые должны быть проверены не в рамках одного источника, - * а над общим количеством потраченных ресурсов во всех источниках сразу (информация в ProcessList-е). 
+ /** In this method: + * - the progress callback is called; + * - the status of the query execution in ProcessList is updated; + * - checks restrictions and quotas that should be checked not within the same source, + * but over the total amount of resources spent in all sources at once (information in the ProcessList). */ virtual void progress(const Progress & value) { - /// Данные для прогресса берутся из листовых источников. + /// The data for progress is taken from leaf sources. if (children.empty()) progressImpl(value); } @@ -86,26 +86,26 @@ public: void progressImpl(const Progress & value); - /** Установить указатель на элемент списка процессов. - * Пробрасывается во все дочерние источники. - * В него будет записываться общая информация о потраченных на запрос ресурсах. - * На основе этой информации будет проверяться квота, и некоторые ограничения. - * Также эта информация будет доступна в запросе SHOW PROCESSLIST. + /** Set the pointer to the process list item. + * It is passed to all child sources. + * General information about the resources spent on the request will be written into it. + * Based on this information, the quota and some restrictions will be checked. + * This information will also be available in the SHOW PROCESSLIST request. */ void setProcessListElement(ProcessListElement * elem); - /** Установить информацию о приблизительном общем количестве строк, которых нужно прочитать. + /** Set the approximate total number of rows to read. */ void setTotalRowsApprox(size_t value) { total_rows_approx = value; } - /** Попросить прервать получение данных как можно скорее. - * По-умолчанию - просто выставляет флаг is_cancelled и просит прерваться всех детей. - * Эта функция может вызываться несколько раз, в том числе, одновременно из разных потоков. + /** Ask to abort the receipt of data as soon as possible. + * By default - just sets the flag is_cancelled and asks that all children be interrupted. 
+ * This function can be called several times, including simultaneously from different threads. */ virtual void cancel(); - /** Требуется ли прервать получение данных. + /** Whether the receipt of data must be aborted. */ bool isCancelled() const { @@ -140,7 +140,7 @@ public: /// in rows per second size_t min_execution_speed = 0; - /// Проверять, что скорость не слишком низкая, после прошествия указанного времени. + /// Verify that the speed is not too low after the specified time has elapsed. Poco::Timespan timeout_before_checking_execution_speed = 0; }; @@ -155,15 +155,15 @@ public: return limits; } - /** Установить квоту. Если устанавливается квота на объём исходных данных, - * то следует ещё установить mode = LIMITS_TOTAL в LocalLimits с помощью setLimits. + /** Set the quota. If you set a quota on the amount of raw data, + * then you should also set mode = LIMITS_TOTAL in LocalLimits using setLimits. */ void setQuota(QuotaForIntervals & quota_) { quota = &quota_; } - /// Включить рассчёт минимумов и максимумов по столбцам результата. + /// Enable calculation of minimums and maximums by the result columns. void enableExtremes() { enabled_extremes = true; } protected: @@ -174,49 +174,49 @@ protected: bool enabled_extremes = false; - /// Дополнительная информация, которая может образоваться в процессе работы. + /// Additional information that can be generated during the work process. - /// Тотальные значения при агрегации. + /// Total values during aggregation. Block totals; - /// Минимумы и максимумы. Первая строчка блока - минимумы, вторая - максимумы. + /// Minimums and maximums. The first row of the block - minimums, the second - the maximums. Block extremes; - /// Приблизительное общее количество строк, которых нужно прочитать. Для прогресс-бара. + /// The approximate total number of rows to read. For progress bar. size_t total_rows_approx = 0; - /// Информация о приблизительном общем количестве строк собрана в родительском источнике.
+ /// Information about the approximate total number of rows is collected in the parent source. bool collected_total_rows_approx = false; - /// Превышено ограничение на количество строк/байт, и нужно прекратить выполнение на следующем вызове read, как будто поток иссяк. + /// The limit on the number of rows/bytes has been exceeded, and you need to stop execution on the next `read` call, as if the stream has run out. bool limit_exceeded_need_break = false; - /// Ограничения и квоты. + /// Limitations and quotas. LocalLimits limits; - QuotaForIntervals * quota = nullptr; /// Если nullptr - квота не используется. + QuotaForIntervals * quota = nullptr; /// If nullptr - the quota is not used. double prev_elapsed = 0; - /// Наследники должны реализовать эту функцию. + /// Derived classes must implement this function. virtual Block readImpl() = 0; - /// Здесь можно делать предварительную инициализацию. + /// Here you can do a preliminary initialization. virtual void readPrefixImpl() {} - /// Здесь необходимо делать финализацию, которая может привести к исключению. + /// Here you need to do a finalization, which can lead to an exception. virtual void readSuffixImpl() {} void updateExtremes(Block & block); - /** Проверить ограничения и квоты. - * Но только те, что могут быть проверены в рамках каждого отдельного источника. + /** Check limits and quotas. + * But only those that can be checked within each separate source. */ bool checkLimits(); void checkQuota(Block & block); - /// Собрать информацию о приблизительном общем числе строк по всем детям. + /// Gather information about the approximate total number of rows from all children. void collectTotalRowsApprox(); - /** Передать информацию о приблизительном общем числе строк в колбэк прогресса. - * Сделано так, что отправка происходит лишь в верхнем источнике. + /** Send information about the approximate total number of rows to the progress callback. + * It is done so that sending occurs only in the upper source.
*/ void collectAndSendTotalRowsApprox(); }; diff --git a/dbms/src/DataStreams/JSONCompactRowOutputStream.h b/dbms/src/DataStreams/JSONCompactRowOutputStream.h index 9423cde034c..21ebc58b7c3 100644 --- a/dbms/src/DataStreams/JSONCompactRowOutputStream.h +++ b/dbms/src/DataStreams/JSONCompactRowOutputStream.h @@ -9,7 +9,7 @@ namespace DB { -/** Поток для вывода данных в формате JSONCompact. +/** The stream for outputting data in the JSONCompact format. */ class JSONCompactRowOutputStream : public JSONRowOutputStream { diff --git a/dbms/src/DataStreams/JSONEachRowRowInputStream.h b/dbms/src/DataStreams/JSONEachRowRowInputStream.h index 7c7471203e7..a57ebe3c11f 100644 --- a/dbms/src/DataStreams/JSONEachRowRowInputStream.h +++ b/dbms/src/DataStreams/JSONEachRowRowInputStream.h @@ -11,10 +11,10 @@ namespace DB class ReadBuffer; -/** Поток для чтения данных в формате JSON, где каждая строчка представлена отдельным JSON объектом. - * Объекты могут быть разделены переводом строки, другими пробельными символами в любом количестве и, возможно, запятой. - * Поля могут быть перечислены в произвольном порядке (в том числе, в разных строках может быть разный порядок), - * и часть полей может отсутствовать. +/** A stream for reading data in JSON format, where each row is represented by a separate JSON object. + * Objects can be separated by a line feed, other whitespace characters in any number and possibly a comma. + * Fields can be listed in any order (in particular, the order may differ between rows), + * and some fields may be missing. */ class JSONEachRowRowInputStream : public IRowInputStream { @@ -30,10 +30,10 @@ private: const Block sample; bool skip_unknown; - /// Буфер для прочитанного из потока имени поля. Используется, если его потребовалось скопировать. + /// Buffer for the field name read from the stream. Used when you have to copy it. String name_buf; - /// Хэш-таблица соответствия имя поля -> позиция в блоке. NOTE Можно использовать perfect hash map.
+ /// Hash table match `field name -> position in the block`. NOTE You can use perfect hash map. using NameMap = HashMap; NameMap name_map; }; diff --git a/dbms/src/DataStreams/JSONEachRowRowOutputStream.h b/dbms/src/DataStreams/JSONEachRowRowOutputStream.h index cbf92aa6576..48dce75a324 100644 --- a/dbms/src/DataStreams/JSONEachRowRowOutputStream.h +++ b/dbms/src/DataStreams/JSONEachRowRowOutputStream.h @@ -8,8 +8,8 @@ namespace DB { -/** Поток для вывода данных в формате JSON, по объекту на каждую строчку. - * Не валидирует UTF-8. +/** The stream for outputting data in JSON format, by object per line. + * Does not validate UTF-8. */ class JSONEachRowRowOutputStream : public IRowOutputStream { diff --git a/dbms/src/DataStreams/LazyBlockInputStream.h b/dbms/src/DataStreams/LazyBlockInputStream.h index b9a9904ba4a..6239bfcafb4 100644 --- a/dbms/src/DataStreams/LazyBlockInputStream.h +++ b/dbms/src/DataStreams/LazyBlockInputStream.h @@ -6,9 +6,9 @@ namespace DB { -/** Инициализировать другой источник при первом вызове read, и затем использовать его. - * Это нужно, например, для чтения из таблицы, которая будет заполнена - * после создания объекта LazyBlockInputStream, но до первого вызова read. +/** Initialize another source on the first `read` call, and then use it. + * This is needed, for example, to read from a table that will be populated + * after creation of LazyBlockInputStream object, but before the first `read` call. */ class LazyBlockInputStream : public IProfilingBlockInputStream { @@ -41,7 +41,7 @@ protected: if (IProfilingBlockInputStream * p_input = dynamic_cast(input.get())) { - /// Они могли быть установлены раньше, но не были протащены в input. + /// They could have been set before, but were not passed into the `input`. 
if (progress_callback) p_input->setProgressCallback(progress_callback); if (process_list_elem) diff --git a/dbms/src/DataStreams/LimitBlockInputStream.h b/dbms/src/DataStreams/LimitBlockInputStream.h index 0ca994916e0..1588441ee91 100644 --- a/dbms/src/DataStreams/LimitBlockInputStream.h +++ b/dbms/src/DataStreams/LimitBlockInputStream.h @@ -7,15 +7,15 @@ namespace DB { -/** Реализует реляционную операцию LIMIT. +/** Implements the LIMIT relational operation. */ class LimitBlockInputStream : public IProfilingBlockInputStream { public: - /** Если always_read_till_end = false (по-умолчанию), то после чтения достаточного количества данных, - * возвращает пустой блок, и это приводит к отмене выполнения запроса. - * Если always_read_till_end = true - читает все данные до конца, но игнорирует их. Это нужно в редких случаях: - * когда иначе, из-за отмены запроса, мы бы не получили данные для GROUP BY WITH TOTALS с удалённого сервера. + /** If always_read_till_end = false (by default), then after reading enough data, + * returns an empty block, and this causes the query to be canceled. + * If always_read_till_end = true - reads all the data to the end, but ignores them. This is necessary in rare cases: + * when otherwise, due to the cancellation of the request, we would not have received the data for GROUP BY WITH TOTALS from the remote server. */ LimitBlockInputStream(BlockInputStreamPtr input_, size_t limit_, size_t offset_, bool always_read_till_end_ = false); diff --git a/dbms/src/DataStreams/MarkInCompressedFile.h b/dbms/src/DataStreams/MarkInCompressedFile.h index ba0578bffa3..3a1d9aa0f19 100644 --- a/dbms/src/DataStreams/MarkInCompressedFile.h +++ b/dbms/src/DataStreams/MarkInCompressedFile.h @@ -10,8 +10,8 @@ namespace DB { -/** Засечка - позиция в сжатом файле. Сжатый файл состоит из уложенных подряд сжатых блоков. - * Засечка представляют собой пару - смещение в файле до начала сжатого блока, смещение в разжатом блоке до начала данных. 
+/** A mark is a position in the compressed file. The compressed file consists of adjacent compressed blocks. + * A mark is a pair - the offset in the file to the start of the compressed block, the offset in the decompressed block to the start of the data. */ struct MarkInCompressedFile { diff --git a/dbms/src/DataStreams/MaterializingBlockInputStream.h b/dbms/src/DataStreams/MaterializingBlockInputStream.h index afcc670010e..654249e309a 100644 --- a/dbms/src/DataStreams/MaterializingBlockInputStream.h +++ b/dbms/src/DataStreams/MaterializingBlockInputStream.h @@ -5,7 +5,7 @@ namespace DB { -/** Преобразует столбцы-константы в полноценные столбцы ("материализует" их). +/** Converts constant columns to full columns ("materializes" them). */ class MaterializingBlockInputStream : public IProfilingBlockInputStream { diff --git a/dbms/src/DataStreams/MaterializingBlockOutputStream.h b/dbms/src/DataStreams/MaterializingBlockOutputStream.h index 3d57c53cc9e..c71531d5338 100644 --- a/dbms/src/DataStreams/MaterializingBlockOutputStream.h +++ b/dbms/src/DataStreams/MaterializingBlockOutputStream.h @@ -7,7 +7,7 @@ namespace DB { -/** Преобразует столбцы-константы в полноценные столбцы ("материализует" их). +/** Converts constant columns to full columns ("materializes" them). */ class MaterializingBlockOutputStream : public IBlockOutputStream { diff --git a/dbms/src/DataStreams/MergingAggregatedBlockInputStream.h b/dbms/src/DataStreams/MergingAggregatedBlockInputStream.h index 1b50896f568..2346f7bd221 100644 --- a/dbms/src/DataStreams/MergingAggregatedBlockInputStream.h +++ b/dbms/src/DataStreams/MergingAggregatedBlockInputStream.h @@ -8,8 +8,8 @@ namespace DB { -/** Доагрегирует поток блоков, в котором каждый блок уже агрегирован. - * Агрегатные функции в блоках не должны быть финализированы, чтобы их состояния можно было объединить. +/** Completes the aggregation of a stream of blocks in which each block is already aggregated.
+ * Aggregate functions in blocks should not be finalized so that their states can be merged. */ class MergingAggregatedBlockInputStream : public IProfilingBlockInputStream { diff --git a/dbms/src/DataStreams/MergingAggregatedMemoryEfficientBlockInputStream.h b/dbms/src/DataStreams/MergingAggregatedMemoryEfficientBlockInputStream.h index 42e9ddfa657..f6d2b311858 100644 --- a/dbms/src/DataStreams/MergingAggregatedMemoryEfficientBlockInputStream.h +++ b/dbms/src/DataStreams/MergingAggregatedMemoryEfficientBlockInputStream.h @@ -13,48 +13,48 @@ namespace DB { -/** Доагрегирует потоки блоков, держа в оперативной памяти только по одному или несколько (до merging_threads) блоков из каждого источника. - * Это экономит оперативку в случае использования двухуровневой агрегации, где в каждом источнике будет до 256 блоков с частями результата. +/** Completes the aggregation of block streams, holding in RAM only one or a few (up to merging_threads) blocks from each source. + * This saves RAM in case of using two-level aggregation, where in each source there will be up to 256 blocks with parts of the result. * - * Агрегатные функции в блоках не должны быть финализированы, чтобы их состояния можно было объединить. + * Aggregate functions in blocks should not be finalized so that their states can be combined. * - * Используется для решения двух задач: + * Used to solve two tasks: * - * 1. Внешняя агрегация со сбросом данных на диск. - * Частично агрегированные данные (предварительно разбитые на 256 корзин) сброшены в какое-то количество файлов на диске. - * Нужно читать их и мерджить по корзинам - держа в оперативке одновременно только несколько корзин из каждого файла. + * 1. External aggregation with data flush to disk. + * Partially aggregated data (previously divided into 256 buckets) is flushed to some number of files on the disk. + * We need to read them and merge them by buckets - keeping only a few buckets from each file in RAM simultaneously. * - * 2.
Слияние результатов агрегации при распределённой обработке запроса. - * С разных серверов приезжают частично агрегированные данные, которые могут быть разбиты, а могут быть не разбиты на 256 корзин, - * и эти корзины отдаются нам по сети с каждого сервера последовательно, друг за другом. - * Надо так же читать и мерджить по корзинам. + * 2. Merge aggregation results for distributed query processing. + * Partially aggregated data arrives from different servers, which can be splitted down or not, into 256 buckets, + * and these buckets are passed to us by the network from each server in sequence, one by one. + * You should also read and merge by the buckets. * - * Суть работы: + * The essence of the work: * - * Есть какое-то количество источников. Они отдают блоки с частично агрегированными данными. - * Каждый источник может отдать одну из следующих последовательностей блоков: - * 1. "неразрезанный" блок с bucket_num = -1; - * 2. "разрезанные" (two_level) блоки с bucket_num от 0 до 255; - * В обоих случаях, может ещё присутствовать блок "переполнений" (overflows) с bucket_num = -1 и is_overflows = true; + * There are a number of sources. They give out blocks with partially aggregated data. + * Each source can return one of the following block sequences: + * 1. "unsplitted" block with bucket_num = -1; + * 2. "splitted" (two_level) blocks with bucket_num from 0 to 255; + * In both cases, there may also be a block of "overflows" with bucket_num = -1 and is_overflows = true; * - * Исходим из соглашения, что разрезанные блоки всегда передаются в порядке bucket_num. - * То есть, если a < b, то блок с bucket_num = a идёт раньше bucket_num = b. - * Это нужно для экономного по памяти слияния - * - чтобы не надо было читать блоки наперёд, а идти по всем последовательностям по возрастанию bucket_num. + * We start from the convention that splitted blocks are always passed in the order of bucket_num. + * That is, if a < b, then the bucket_num = a block goes before bucket_num = b. 
+ * This is needed for a memory-efficient merge + * - so that you do not need to read the blocks up front, but go all the way up by bucket_num. * - * При этом, не все bucket_num из диапазона 0..255 могут присутствовать. - * Блок переполнений может присутствовать в любом порядке относительно других блоков (но он может быть только один). + * In this case, not all bucket_num from the range of 0..255 can be present. + * The overflow block can be presented in any order relative to other blocks (but it can be only one). * - * Необходимо объединить эти последовательности блоков и отдать результат в виде последовательности с такими же свойствами. - * То есть, на выходе, если в последовательности есть "разрезанные" блоки, то они должны идти в порядке bucket_num. + * It is necessary to combine these sequences of blocks and return the result as a sequence with the same properties. + * That is, at the output, if there are "splitted" blocks in the sequence, then they should go in the order of bucket_num. * - * Мердж можно осуществлять с использованием нескольких (merging_threads) потоков. - * Для этого, получение набора блоков для следующего bucket_num надо делать последовательно, - * а затем, когда мы имеем несколько полученных наборов, их объединение можно делать параллельно. + * The merge can be performed using several (merging_threads) threads. + * For this, receiving of a set of blocks for the next bucket_num should be done sequentially, + * and then, when we have several received sets, they can be merged in parallel. * - * При получении следующих блоков из разных источников, - * данные из источников можно также читать в несколько потоков (reading_threads) - * для оптимальной работы при наличии быстрой сети или дисков (откуда эти блоки читаются). 
+ * When you receive next blocks from different sources, + * data from sources can also be read in several threads (reading_threads) + * for optimal performance in the presence of a fast network or disks (from where these blocks are read). */ class MergingAggregatedMemoryEfficientBlockInputStream : public IProfilingBlockInputStream { @@ -69,14 +69,14 @@ public: String getID() const override; - /// Отправляет запрос (инициирует вычисления) раньше, чем read. + /// Sends the request (initiates calculations) earlier than `read`. void readPrefix() override; - /// Вызывается либо после того, как всё прочитано, либо после cancel-а. + /// Called either after everything is read, or after cancel. void readSuffix() override; - /** Отличается от реализации по-умолчанию тем, что пытается остановить все источники, - * пропуская отвалившиеся по эксепшену. + /** Differs from the default implementation in that it tries to stop all sources, + * skipping those that have failed with an exception. */ void cancel() override; @@ -117,32 +117,33 @@ private: void start(); - /// Получить блоки, которые можно мерджить. Это позволяет мерджить их параллельно в отдельных потоках. + /// Get blocks that you can merge. This allows you to merge them in parallel in separate threads. BlocksToMerge getNextBlocksToMerge(); std::unique_ptr reading_pool; - /// Для параллельного мерджа. + /// For a parallel merge. struct ParallelMergeData { ThreadPool pool; - /// Сейчас один из мерджащих потоков получает следующие блоки для мерджа. Эта операция должна делаться последовательно. + /// Now one of the merging threads receives next blocks for the merge. This operation must be done sequentially. std::mutex get_next_blocks_mutex; std::atomic exhausted {false}; /// No more source data. std::atomic finish {false}; /// Need to terminate early. std::exception_ptr exception; - /// Следует отдавать блоки стого в порядке ключа (bucket_num). - /// Если значение - пустой блок - то нужно дождаться его мерджа.
- /// (Такое значение означает обещание, что здесь будут данные. Это важно, потому что данные нужно отдавать в порядке ключа - bucket_num) + /// It is necessary to give out blocks in the order of the key (bucket_num). + /// If the value is an empty block, you need to wait for its merge. + /// (This means the promise that there will be data here, which is important because the data should be given out + /// in the order of the key - bucket_num) std::map merged_blocks; std::mutex merged_blocks_mutex; - /// Событие, с помощью которого мерджащие потоки говорят главному потоку, что новый блок готов. + /// An event that is used by merging threads to tell the main thread that the new block is ready. std::condition_variable merged_blocks_changed; - /// Событие, с помощью которого главный поток говорят мерджащим потокам, что можно обработать следующую группу блоков. + /// An event by which the main thread is telling merging threads that it is possible to process the next group of blocks. std::condition_variable have_space; ParallelMergeData(size_t max_threads) : pool(max_threads) {} diff --git a/dbms/src/DataStreams/MergingSortedBlockInputStream.h b/dbms/src/DataStreams/MergingSortedBlockInputStream.h index 437c9c96997..bc1a9772614 100644 --- a/dbms/src/DataStreams/MergingSortedBlockInputStream.h +++ b/dbms/src/DataStreams/MergingSortedBlockInputStream.h @@ -22,12 +22,12 @@ namespace ErrorCodes } -/// Позволяет ссылаться на строку в блоке и удерживать владение блоком, -/// и таким образом избежать создания временного объекта-строки. -/// Не используется std::shared_ptr, так как не нужно место для weak_count и deleter; -/// не используется Poco::SharedPtr, так как нужно выделять блок и refcount одним куском; -/// не используется Poco::AutoPtr, так как у него нет move конструктора и есть лишние проверки на nullptr; -/// Счётчик ссылок неатомарный, так как используется из одного потока. 
+/// Allows you refer to the row in the block and hold the block ownership, +/// and thus avoid creating a temporary row object. +/// Do not use std::shared_ptr, since there is no need for a place for `weak_count` and `deleter`; +/// does not use Poco::SharedPtr, since you need to allocate a block and `refcount` in one piece; +/// does not use Poco::AutoPtr, since it does not have a `move` constructor and there are extra checks for nullptr; +/// The reference counter is not atomic, since it is used from one thread. namespace detail { struct SharedBlock : Block @@ -87,7 +87,7 @@ protected: std::swap(shared_block, other.shared_block); } - /// Количество и типы столбцов обязаны соответствовать. + /// The number and types of columns must match. bool operator==(const RowRef & other) const { size_t size = columns.size(); @@ -111,10 +111,10 @@ protected: void readSuffixImpl() override; - /// Инициализирует очередь и следующий блок результата. + /// Initializes the queue and the next result block. void init(Block & merged_block, ColumnPlainPtrs & merged_columns); - /// Достаёт из источника, соответствующего current следующий блок. + /// Gets the next block from the source corresponding to the `current`. template void fetchNextBlock(const TSortCursor & current, std::priority_queue & queue); @@ -131,7 +131,7 @@ protected: /// May be smaller or equal to max_block_size. To do 'reserve' for columns. size_t expected_block_size = 0; - /// Текущие сливаемые блоки. + /// Blocks currently being merged. size_t num_columns = 0; std::vector source_blocks; @@ -149,9 +149,9 @@ protected: MergedRowSources * out_row_sources = nullptr; - /// Эти методы используются в Collapsing/Summing/Aggregating... SortedBlockInputStream-ах. + /// These methods are used in Collapsing/Summing/Aggregating... SortedBlockInputStream-s. - /// Сохранить строчку, на которую указывает cursor, в row. + /// Save the row pointed to by cursor in `row`. 
template void setRow(Row & row, TSortCursor & cursor) { @@ -165,7 +165,7 @@ protected: { tryLogCurrentException(__PRETTY_FUNCTION__); - /// Узнаем имя столбца и бросим исключение поинформативней. + /// Find out the name of the column and throw more informative exception. String column_name; for (const auto & block : source_blocks) @@ -206,8 +206,8 @@ protected: private: - /** Делаем поддержку двух разных курсоров - с Collation и без. - * Шаблоны используем вместо полиморфных SortCursor'ов и вызовов виртуальных функций. + /** We support two different cursors - with Collation and without. + * Templates are used instead of polymorphic SortCursor and calls to virtual functions. */ template void initQueue(std::priority_queue & queue); @@ -217,7 +217,7 @@ private: Logger * log = &Logger::get("MergingSortedBlockInputStream"); - /// Прочитали до конца. + /// Read is finished. bool finished = false; }; diff --git a/dbms/src/DataStreams/NativeBlockInputStream.h b/dbms/src/DataStreams/NativeBlockInputStream.h index 879bff9bc62..74128408ff1 100644 --- a/dbms/src/DataStreams/NativeBlockInputStream.h +++ b/dbms/src/DataStreams/NativeBlockInputStream.h @@ -10,12 +10,12 @@ namespace DB class CompressedReadBufferFromFile; -/** Формат Native может содержать отдельно расположенный индекс, - * который позволяет понять, где какой столбец расположен, - * и пропускать ненужные столбцы. +/** The Native format can contain a separately located index, + * which allows you to understand where what column is located, + * and skip unnecessary columns. */ -/** Позиция одного кусочка одного столбца. */ +/** The position of one piece of a single column. */ struct IndexOfOneColumnForNativeFormat { String name; @@ -23,7 +23,7 @@ struct IndexOfOneColumnForNativeFormat MarkInCompressedFile location; }; -/** Индекс для блока данных. */ +/** The index for the data block. 
*/ struct IndexOfBlockForNativeFormat { using Columns = std::vector; @@ -33,7 +33,7 @@ struct IndexOfBlockForNativeFormat Columns columns; }; -/** Весь индекс. */ +/** The whole index. */ struct IndexForNativeFormat { using Blocks = std::vector; @@ -46,24 +46,24 @@ struct IndexForNativeFormat read(istr, required_columns); } - /// Прочитать индекс, только для нужных столбцов. + /// Read the index, only for the required columns. void read(ReadBuffer & istr, const NameSet & required_columns); }; -/** Десериализует поток блоков из родного бинарного формата (с именами и типами столбцов). - * Предназначено для взаимодействия между серверами. +/** Deserializes the stream of blocks from the native binary format (with names and column types). + * Designed for communication between servers. * - * Также может использоваться для хранения данных на диске. - * В этом случае, может использовать индекс. + * Can also be used to store data on disk. + * In this case, can use the index. */ class NativeBlockInputStream : public IProfilingBlockInputStream { public: - /** В случае указания ненулевой server_revision, может ожидаться и считываться дополнительная информация о блоке, - * в зависимости от поддерживаемой для указанной ревизии. + /** If a non-zero server_revision is specified, additional block information may be expected and read, + * depending on what is supported for the specified revision. * - * index - не обязательный параметр. Если задан, то будут читаться только указанные в индексе кусочки столбцов. + * `index` is not required parameter. If set, only parts of columns specified in the index will be read. */ NativeBlockInputStream( ReadBuffer & istr_, UInt64 server_revision_ = 0, @@ -94,7 +94,7 @@ private: IndexForNativeFormat::Blocks::const_iterator index_block_end; IndexOfBlockForNativeFormat::Columns::const_iterator index_column_it; - /// Если задан индекс, то istr должен быть CompressedReadBufferFromFile. 
+ /// If an index is specified, then `istr` must be CompressedReadBufferFromFile. CompressedReadBufferFromFile * istr_concrete; }; diff --git a/dbms/src/DataStreams/NativeBlockOutputStream.h b/dbms/src/DataStreams/NativeBlockOutputStream.h index e0af3523284..16ba2415cc7 100644 --- a/dbms/src/DataStreams/NativeBlockOutputStream.h +++ b/dbms/src/DataStreams/NativeBlockOutputStream.h @@ -11,17 +11,17 @@ class WriteBuffer; class CompressedWriteBuffer; -/** Сериализует поток блоков в родном бинарном формате (с именами и типами столбцов). - * Предназначено для взаимодействия между серверами. +/** Serializes the stream of blocks in their native binary format (with names and column types). + * Designed for communication between servers. * - * Может быть указан поток для записи индекса. Индекс содержит смещения до каждого кусочка каждого столбца. - * Если делается append в уже существующий файл, и нужно записать индекс, то укажите initial_size_of_file. + * A stream can be specified to write the index. The index contains offsets to each part of each column. + * If an `append` is made to an existing file, and you need to write the index, then specify `initial_size_of_file`. */ class NativeBlockOutputStream : public IBlockOutputStream { public: - /** В случае указания ненулевой client_revision, может записываться дополнительная информация о блоке, - * в зависимости от поддерживаемой для указанной ревизии. + /** If non-zero client_revision is specified, additional block information can be written, + * depending on what is supported for the specified revision. */ NativeBlockOutputStream( WriteBuffer & ostr_, UInt64 client_revision_ = 0, @@ -39,8 +39,8 @@ private: UInt64 client_revision; WriteBuffer * index_ostr; - size_t initial_size_of_file; /// Начальный размер файла с данными, если делается append. Используется для индекса. - /// Если требуется записывать индекс, то ostr обязан быть CompressedWriteBuffer. 
+ size_t initial_size_of_file; /// The initial size of the data file, if `append` done. Used for the index. + /// If you need to write index, then `ostr` must be a CompressedWriteBuffer. CompressedWriteBuffer * ostr_concrete = nullptr; }; diff --git a/dbms/src/DataStreams/NullAndDoCopyBlockInputStream.h b/dbms/src/DataStreams/NullAndDoCopyBlockInputStream.h index a1f49de2470..3aa39e2da23 100644 --- a/dbms/src/DataStreams/NullAndDoCopyBlockInputStream.h +++ b/dbms/src/DataStreams/NullAndDoCopyBlockInputStream.h @@ -11,11 +11,11 @@ class IBlockOutputStream; using BlockOutputStreamPtr = std::shared_ptr; -/** Пустой поток блоков. - * Но при первой попытке чтения, копирует данные из переданного input-а в переданный output. - * Это нужно для выполнения запроса INSERT SELECT - запрос копирует данные, но сам ничего не возвращает. - * Запрос можно было бы выполнять и без оборачивания в пустой BlockInputStream, - * но не работал бы прогресс выполнения запроса и возможность отменить запрос. +/** An empty stream of blocks. + * But at the first read attempt, copies the data from the passed `input` to the `output`. + * This is necessary to execute the query INSERT SELECT - the query copies data, but returns nothing. + * The query could be executed without wrapping it in an empty BlockInputStream, + * but the progress of query execution and the ability to cancel the query would not work. */ class NullAndDoCopyBlockInputStream : public IProfilingBlockInputStream { diff --git a/dbms/src/DataStreams/NullBlockInputStream.h b/dbms/src/DataStreams/NullBlockInputStream.h index 7c42c97bcd3..a68612afb19 100644 --- a/dbms/src/DataStreams/NullBlockInputStream.h +++ b/dbms/src/DataStreams/NullBlockInputStream.h @@ -6,7 +6,7 @@ namespace DB { -/** Пустой поток блоков. +/** Empty stream of blocks. 
*/ class NullBlockInputStream : public IBlockInputStream { diff --git a/dbms/src/DataStreams/NullBlockOutputStream.h b/dbms/src/DataStreams/NullBlockOutputStream.h index 2c51a138cff..1742bd6e33c 100644 --- a/dbms/src/DataStreams/NullBlockOutputStream.h +++ b/dbms/src/DataStreams/NullBlockOutputStream.h @@ -6,7 +6,7 @@ namespace DB { -/** Ничего не делает. Используется для отладки и бенчмарков. +/** Does nothing. Used for debugging and benchmarks. */ class NullBlockOutputStream : public IBlockOutputStream { diff --git a/dbms/src/DataStreams/NullableAdapterBlockInputStream.cpp b/dbms/src/DataStreams/NullableAdapterBlockInputStream.cpp index 81543c832ea..a19278de3cf 100644 --- a/dbms/src/DataStreams/NullableAdapterBlockInputStream.cpp +++ b/dbms/src/DataStreams/NullableAdapterBlockInputStream.cpp @@ -1,6 +1,7 @@ #include #include #include +#include namespace DB { @@ -13,13 +14,6 @@ extern const int TYPE_MISMATCH; } -static DataTypePtr removeNullable(DataTypePtr type) -{ - while (type->isNullable()) - type = typeid_cast(type.get())->getNestedType(); - return type; -} - NullableAdapterBlockInputStream::NullableAdapterBlockInputStream( BlockInputStreamPtr input_, const Block & in_sample_, const Block & out_sample_) @@ -104,11 +98,12 @@ void NullableAdapterBlockInputStream::buildActions( actions.reserve(in_size); rename.reserve(in_size); - for (size_t i = 0; i < in_size; ++i) { + for (size_t i = 0; i < in_size; ++i) + { const auto & in_elem = in_sample.getByPosition(i); const auto & out_elem = out_sample.getByPosition(i); - if (removeNullable(in_elem.type)->getName() == removeNullable(out_elem.type)->getName()) + if (isConvertableTypes(in_elem.type, out_elem.type)) { bool is_in_nullable = in_elem.type->isNullable(); bool is_out_nullable = out_elem.type->isNullable(); diff --git a/dbms/src/DataStreams/NullableAdapterBlockInputStream.h b/dbms/src/DataStreams/NullableAdapterBlockInputStream.h index 613a6309f5e..5646e807a45 100644 --- 
a/dbms/src/DataStreams/NullableAdapterBlockInputStream.h +++ b/dbms/src/DataStreams/NullableAdapterBlockInputStream.h @@ -51,7 +51,6 @@ private: void buildActions(const Block & in_sample, const Block & out_sample); private: - NamesAndTypesListPtr required_columns; Actions actions; std::vector> rename; bool must_transform; diff --git a/dbms/src/DataStreams/ODBCDriverBlockOutputStream.h b/dbms/src/DataStreams/ODBCDriverBlockOutputStream.h index df998db16c6..09795b72a3a 100644 --- a/dbms/src/DataStreams/ODBCDriverBlockOutputStream.h +++ b/dbms/src/DataStreams/ODBCDriverBlockOutputStream.h @@ -2,7 +2,7 @@ #include #include - +#include namespace DB { @@ -10,11 +10,11 @@ namespace DB class WriteBuffer; -/** Формат данных, предназначенный для упрощения реализации ODBC драйвера. - * ODBC драйвер предназначен для сборки под разные платформы без зависимостей от основного кода, - * поэтому формат сделан так, чтобы в нём можно было как можно проще его распарсить. - * Выводится заголовок с нужной информацией. - * Затем данные выводятся в порядке строк. Каждое значение выводится так: длина в формате VarUInt, затем данные в текстовом виде. +/** A data format designed to simplify the implementation of the ODBC driver. + * ODBC driver is designed to be build for different platforms without dependencies from the main code, + * so the format is made that way so that it can be as easy as possible to parse it. + * A header is displayed with the required information. + * The data is then output in the order of the rows. Each value is displayed as follows: length in VarUInt format, then data in text form. 
*/ class ODBCDriverBlockOutputStream : public IBlockOutputStream { diff --git a/dbms/src/DataStreams/OneBlockInputStream.h b/dbms/src/DataStreams/OneBlockInputStream.h index 74f81c3c46a..a9f0f928696 100644 --- a/dbms/src/DataStreams/OneBlockInputStream.h +++ b/dbms/src/DataStreams/OneBlockInputStream.h @@ -6,8 +6,8 @@ namespace DB { -/** Поток блоков, из которого можно прочитать один блок. - * Также смотрите BlocksListBlockInputStream. +/** A stream of blocks from which you can read one block. + * Also see BlocksListBlockInputStream. */ class OneBlockInputStream : public IProfilingBlockInputStream { diff --git a/dbms/src/DataStreams/ParallelAggregatingBlockInputStream.h b/dbms/src/DataStreams/ParallelAggregatingBlockInputStream.h index b10f2c51f35..3bc08e75ea3 100644 --- a/dbms/src/DataStreams/ParallelAggregatingBlockInputStream.h +++ b/dbms/src/DataStreams/ParallelAggregatingBlockInputStream.h @@ -11,15 +11,15 @@ namespace DB { -/** Агрегирует несколько источников параллельно. - * Производит агрегацию блоков из разных источников независимо в разных потоках, затем объединяет результаты. - * Если final == false, агрегатные функции не финализируются, то есть, не заменяются на своё значение, а содержат промежуточное состояние вычислений. - * Это необходимо, чтобы можно было продолжить агрегацию (например, объединяя потоки частично агрегированных данных). +/** Aggregates several sources in parallel. + * Makes aggregation of blocks from different sources independently in different threads, then combines the results. + * If final == false, aggregate functions are not finalized, that is, they are not replaced by their value, but contain an intermediate state of calculations. + * This is necessary so that aggregation can continue (for example, by combining streams of partially aggregated data). */ class ParallelAggregatingBlockInputStream : public IProfilingBlockInputStream { public: - /** Столбцы из key_names и аргументы агрегатных функций, уже должны быть вычислены. 
+ /** Columns from key_names and arguments of aggregate functions must already be computed. */ ParallelAggregatingBlockInputStream( BlockInputStreams inputs, BlockInputStreamPtr additional_input_at_end, @@ -32,7 +32,7 @@ public: void cancel() override; protected: - /// Ничего не делаем, чтобы подготовка к выполнению запроса делалась параллельно, в ParallelInputsProcessor. + /// Do nothing, so that preparation for query execution is done in parallel, in ParallelInputsProcessor. void readPrefix() override { } @@ -49,16 +49,16 @@ private: size_t keys_size; size_t aggregates_size; - /** Используется, если есть ограничение на максимальное количество строк при агрегации, - * и если group_by_overflow_mode == ANY. - * В этом случае, новые ключи не добавляются в набор, а производится агрегация только по - * ключам, которые уже успели попасть в набор. + /** Used if there is a limit on the maximum number of rows in the aggregation, + * and if group_by_overflow_mode == ANY. + * In this case, new keys are not added to the set, but aggregation is performed only by + * keys that have already been added into the set. */ bool no_more_keys = false; std::atomic executed {false}; - /// Для чтения сброшенных во временный файл данных. + /// For reading data that was flushed into a temporary file. struct TemporaryFileStream { ReadBufferFromFile file_in; @@ -117,7 +117,7 @@ private: void execute(); - /** Отсюда будем доставать готовые блоки после агрегации. + /** From here we get the finished blocks after the aggregation. */ std::unique_ptr impl; }; diff --git a/dbms/src/DataStreams/ParallelInputsProcessor.h b/dbms/src/DataStreams/ParallelInputsProcessor.h index cf098c1b09b..cdf17e1ab4f 100644 --- a/dbms/src/DataStreams/ParallelInputsProcessor.h +++ b/dbms/src/DataStreams/ParallelInputsProcessor.h @@ -33,33 +33,33 @@ namespace CurrentMetrics namespace DB { -/** Режим объединения. +/** Union mode.
*/ enum class StreamUnionMode { - Basic = 0, /// вынимать блоки - ExtraInfo /// вынимать блоки + дополнительную информацию + Basic = 0, /// take out blocks + ExtraInfo /// take out blocks + additional information }; -/// Пример обработчика. +/// Example of the handler. struct ParallelInputsHandler { - /// Обработка блока данных. + /// Processing the data block. void onBlock(Block & block, size_t thread_num) {} - /// Обработка блока данных + дополнительных информаций. + /// Processing the data block + additional information. void onBlock(Block & block, BlockExtraInfo & extra_info, size_t thread_num) {} - /// Вызывается для каждого потока, когда потоку стало больше нечего делать. - /// Из-за того, что иссякла часть источников, и сейчас источников осталось меньше, чем потоков. - /// Вызывается, если метод onException не кидает исключение; вызывается до метода onFinish. + /// Called for each thread, when the thread has nothing else to do. + /// Due to the fact that part of the sources has run out, and now there are fewer sources left than threads. + /// Called if the `onException` method does not throw an exception; is called before the `onFinish` method. void onFinishThread(size_t thread_num) {} - /// Блоки закончились. Из-за того, что все источники иссякли или из-за отмены работы. - /// Этот метод всегда вызывается ровно один раз, в конце работы, если метод onException не кидает исключение. + /// Blocks are over. Due to the fact that all sources ran out or because of the cancellation of work. + /// This method is always called exactly once, at the end of the work, if the `onException` method does not throw an exception. void onFinish() {} - /// Обработка исключения. Разумно вызывать в этом методе метод ParallelInputsProcessor::cancel, а также передавать эксепшен в основной поток. + /// Exception handling. It is reasonable to call the ParallelInputsProcessor::cancel method in this method, and also pass the exception to the main thread.
void onException(std::exception_ptr & exception, size_t thread_num) {} }; @@ -68,13 +68,13 @@ template class ParallelInputsProcessor { public: - /** additional_input_at_end - если не nullptr, - * то из этого источника начинают доставаться блоки лишь после того, как все остальные источники обработаны. - * Это делается в основном потоке. + /** additional_input_at_end - if not nullptr, + * then the blocks from this source will start to be processed only after all other sources are processed. + * This is done in the main thread. * - * Предназначено для реализации FULL и RIGHT JOIN - * - где нужно сначала параллельно сделать JOIN, при этом отмечая, какие ключи не найдены, - * и только после завершения этой работы, создать блоки из ненайденных ключей. + * Intended for implementation of FULL and RIGHT JOIN + * - where you must first make JOIN in parallel, while noting which keys are not found, + * and only after the completion of this work, create blocks of keys that are not found. */ ParallelInputsProcessor(BlockInputStreams inputs_, BlockInputStreamPtr additional_input_at_end_, size_t max_threads_, Handler & handler_) : inputs(inputs_), additional_input_at_end(additional_input_at_end_), max_threads(std::min(inputs_.size(), max_threads_)), handler(handler_) @@ -95,7 +95,7 @@ public: } } - /// Запустить фоновые потоки, начать работу. + /// Start background threads, start work. void process() { active_threads = max_threads; @@ -104,7 +104,7 @@ public: threads.emplace_back(std::bind(&ParallelInputsProcessor::thread, this, current_memory_tracker, i)); } - /// Попросить все источники остановиться раньше, чем они иссякнут. + /// Ask all sources to stop earlier than they run out. void cancel() { finish = true; @@ -119,9 +119,9 @@ public: } catch (...) { - /** Если не удалось попросить остановиться одного или несколько источников. - * (например, разорвано соединение при распределённой обработке запроса) - * - то пофиг. + /** If you can not ask one or more sources to stop. 
+ * (for example, the connection is broken for distributed query processing) + * - then do not care. */ LOG_ERROR(log, "Exception while cancelling " << child->getName()); } @@ -129,7 +129,7 @@ public: } } - /// Подождать завершения работы всех потоков раньше деструктора. + /// Wait until all threads are finished, before the destructor. void wait() { if (joined_threads) @@ -148,11 +148,11 @@ public: } private: - /// Данные отдельного источника + /// Single source data struct InputData { BlockInputStreamPtr in; - size_t i; /// Порядковый номер источника (для отладки). + size_t i; /// The source number (for debugging). InputData() {} InputData(BlockInputStreamPtr & in_, size_t i_) : in(in_), i(i_) {} @@ -197,10 +197,10 @@ private: handler.onFinishThread(thread_num); - /// Последний поток при выходе сообщает, что данных больше нет. + /// The last thread on the output indicates that there is no more data. if (0 == --active_threads) { - /// И ещё обрабатывает дополнительный источник, если такой есть. + /// And then it processes an additional source, if there is one. if (additional_input_at_end) { try @@ -219,38 +219,38 @@ private: } } - handler.onFinish(); /// TODO Если в onFinish или onFinishThread эксепшен, то вызывается std::terminate. + handler.onFinish (); /// TODO If in `onFinish` or `onFinishThread` there is an exception, then std::terminate is called. } } void loop(size_t thread_num) { - while (!finish) /// Может потребоваться прекратить работу раньше, чем все источники иссякнут. + while (!finish) /// You may need to stop work earlier than all sources run out. { InputData input; - /// Выбираем следующий источник. + /// Select the next source. { std::lock_guard lock(available_inputs_mutex); - /// Если свободных источников нет, то этот поток больше не нужен. (Но другие потоки могут работать со своими источниками.) + /// If there are no free sources, then this thread is no longer needed. (But other threads can work with their sources.) 
if (available_inputs.empty()) break; input = available_inputs.front(); - /// Убираем источник из очереди доступных источников. + /// We remove the source from the queue of available sources. available_inputs.pop(); } - /// Основная работа. + /// The main work. Block block = input.in->read(); { if (finish) break; - /// Если этот источник ещё не иссяк, то положим полученный блок в очередь готовых. + /// If this source is not run out yet, then put the resulting block in the ready queue. { std::lock_guard lock(available_inputs_mutex); @@ -280,38 +280,38 @@ private: Handler & handler; - /// Потоки. + /// Streams. using ThreadsData = std::vector; ThreadsData threads; - /** Набор доступных источников, которые не заняты каким-либо потоком в данный момент. - * Каждый поток берёт из этого набора один источник, вынимает из источника блок (в этот момент источник делает вычисления), - * и (если источник не исчерпан), кладёт назад в набор доступных источников. + /** A set of available sources that are not currently processed by any thread. + * Each thread takes one source from this set, takes a block out of the source (at this moment the source does the calculations) + * and (if the source is not run out), puts it back into the set of available sources. * - * Возникает вопрос, что лучше использовать: - * - очередь (только что обработанный источник будет в следующий раз обработан позже остальных) - * - стек (только что обработанный источник будет обработан как можно раньше). + * The question arises what is better to use: + * - the queue (just processed source will be processed the next time later than the rest) + * - stack (just processed source will be processed as soon as possible). * - * Стек лучше очереди, когда надо выполнять работу по чтению одного источника более последовательно, - * и теоретически, это позволяет достичь более последовательных чтений с диска. 
+ * The stack is better than the queue when you need to do work on reading one source more sequentially, + * and theoretically, this allows you to achieve more sequential reads from the disk. * - * Но при использовании стека, возникает проблема при распределённой обработке запроса: - * данные всё-время читаются только с части серверов, а на остальных серверах - * возникает таймаут при send-е, и обработка запроса завершается с исключением. + * But when using the stack, there is a problem with distributed query processing: + * data is read only from a part of the servers, and on the other servers + * a timeout occurs during send, and the request processing ends with an exception. * - * Поэтому, используется очередь. Это можно улучшить в дальнейшем. + * Therefore, a queue is used. This can be improved in the future. */ using AvailableInputs = std::queue; AvailableInputs available_inputs; - /// Для операций с available_inputs. + /// For operations with available_inputs. std::mutex available_inputs_mutex; - /// Сколько источников иссякло. + /// How many sources ran out. std::atomic active_threads { 0 }; - /// Завершить работу потоков (раньше, чем иссякнут источники). + /// Finish the threads work (before the sources run out). std::atomic finish { false }; - /// Подождали завершения всех потоков. + /// Already waited for the completion of all threads. std::atomic joined_threads { false }; Logger * log = &Logger::get("ParallelInputsProcessor"); diff --git a/dbms/src/DataStreams/PartialSortingBlockInputStream.h b/dbms/src/DataStreams/PartialSortingBlockInputStream.h index 438f6b44739..924fdb0e2bf 100644 --- a/dbms/src/DataStreams/PartialSortingBlockInputStream.h +++ b/dbms/src/DataStreams/PartialSortingBlockInputStream.h @@ -8,13 +8,13 @@ namespace DB { -/** Сортирует каждый блок по отдельности по значениям указанных столбцов. - * На данный момент, используется не очень оптимальный алгоритм. +/** Sorts each block individually by the values of the specified columns.
+ * At the moment, a not very optimal algorithm is used. */ class PartialSortingBlockInputStream : public IProfilingBlockInputStream { public: - /// limit - если не 0, то можно каждый блок сортировать не полностью, а только limit первых по порядку строк. + /// limit - if not 0, then each block may be sorted not completely, but only its first `limit` rows in order. PartialSortingBlockInputStream(BlockInputStreamPtr input_, SortDescription & description_, size_t limit_ = 0) : description(description_), limit(limit_) { diff --git a/dbms/src/DataStreams/PrettyBlockOutputStream.h b/dbms/src/DataStreams/PrettyBlockOutputStream.h index 22222c539b3..9d60abf07e4 100644 --- a/dbms/src/DataStreams/PrettyBlockOutputStream.h +++ b/dbms/src/DataStreams/PrettyBlockOutputStream.h @@ -11,12 +11,12 @@ class WriteBuffer; class Context; -/** Выводит результат в виде красивых таблиц. +/** Prints the result in the form of beautiful tables. */ class PrettyBlockOutputStream : public IBlockOutputStream { public: - /// no_escapes - не использовать ANSI escape sequences - для отображения в браузере, а не в консоли. + /// no_escapes - do not use ANSI escape sequences - to display in the browser, not in the console. PrettyBlockOutputStream(WriteBuffer & ostr_, bool no_escapes_, size_t max_rows_, const Context & context_); void write(const Block & block) override; @@ -33,7 +33,7 @@ protected: using Widths_t = std::vector; - /// Вычислить видимую (при выводе на консоль с кодировкой UTF-8) ширину значений и имён столбцов. + /// Calculate the visible (when output to a console with UTF-8 encoding) width of the values and column names.
void calculateWidths(Block & block, Widths_t & max_widths, Widths_t & name_widths); WriteBuffer & ostr; diff --git a/dbms/src/DataStreams/PrettyCompactBlockOutputStream.h b/dbms/src/DataStreams/PrettyCompactBlockOutputStream.h index ddde71b1316..80d23690755 100644 --- a/dbms/src/DataStreams/PrettyCompactBlockOutputStream.h +++ b/dbms/src/DataStreams/PrettyCompactBlockOutputStream.h @@ -6,7 +6,7 @@ namespace DB { -/** Выводит результат в виде красивых таблиц, но с меньшим количеством строк-разделителей. +/** Prints the result in the form of beautiful tables, but with fewer delimiter lines. */ class PrettyCompactBlockOutputStream : public PrettyBlockOutputStream { diff --git a/dbms/src/DataStreams/PrettyCompactMonoBlockOutputStream.h b/dbms/src/DataStreams/PrettyCompactMonoBlockOutputStream.h index 26e727d5d8c..aa3ac169d2f 100644 --- a/dbms/src/DataStreams/PrettyCompactMonoBlockOutputStream.h +++ b/dbms/src/DataStreams/PrettyCompactMonoBlockOutputStream.h @@ -6,8 +6,8 @@ namespace DB { -/** Тоже самое, что и PrettyCompactBlockOutputStream, но выводит все max_rows (или меньше, - * если результат содержит меньшее число строк) одним блоком с одной шапкой. +/** Same as PrettyCompactBlockOutputStream, but prints all max_rows (or less, + * if the result contains fewer rows) by one block with one header. */ class PrettyCompactMonoBlockOutputStream : public PrettyCompactBlockOutputStream { diff --git a/dbms/src/DataStreams/PrettySpaceBlockOutputStream.h b/dbms/src/DataStreams/PrettySpaceBlockOutputStream.h index e8d968e27e5..2fd78fa883f 100644 --- a/dbms/src/DataStreams/PrettySpaceBlockOutputStream.h +++ b/dbms/src/DataStreams/PrettySpaceBlockOutputStream.h @@ -6,7 +6,7 @@ namespace DB { -/** Выводит результат, выравнивая пробелами. +/** Prints the result, aligned with spaces. 
*/ class PrettySpaceBlockOutputStream : public PrettyBlockOutputStream { diff --git a/dbms/src/DataStreams/PushingToViewsBlockOutputStream.h b/dbms/src/DataStreams/PushingToViewsBlockOutputStream.h index 8e53a316776..6a8ea5fbb44 100644 --- a/dbms/src/DataStreams/PushingToViewsBlockOutputStream.h +++ b/dbms/src/DataStreams/PushingToViewsBlockOutputStream.h @@ -12,8 +12,8 @@ namespace DB { -/** Записывает данные в указанную таблицу, при этом рекурсивно вызываясь от всех зависимых вьюшек. - * Если вьюшка не материализованная, то в нее данные не записываются, лишь перенаправляются дальше. +/** Writes data to the specified table, recursively being called from all dependent views. + * If the view is not materialized, then the data is not written to it, only redirected further. */ class PushingToViewsBlockOutputStream : public IBlockOutputStream { @@ -23,9 +23,9 @@ public: { storage = context.getTable(database, table); - /** TODO Это очень важная строчка. При любой вставке в таблицу один из stream-ов должен владеть lock-ом. - * Хотя сейчас любая вставка в таблицу делается через PushingToViewsBlockOutputStream, - * но ясно, что здесь - не лучшее место для этой функциональности. + /** TODO This is a very important line. At any insertion into the table one of streams should own lock. + * Although now any insertion into the table is done via PushingToViewsBlockOutputStream, + * but it's clear that here is not the best place for this functionality. */ addTableLock(storage->lockStructure(true)); diff --git a/dbms/src/DataStreams/QueueBlockIOStream.h b/dbms/src/DataStreams/QueueBlockIOStream.h index aa9274426fb..cb5fed7eb1c 100644 --- a/dbms/src/DataStreams/QueueBlockIOStream.h +++ b/dbms/src/DataStreams/QueueBlockIOStream.h @@ -12,17 +12,17 @@ namespace DB { -/** Является одновременно InputStream и OutputStream. - * При записи, кладёт блоки в очередь. - * При чтении, вынимает их из очереди. - * Используется thread-safe очередь. - * Если очередь пуста - чтение блокируется. 
- * Если очередь переполнена - запись блокируется. +/** Is both an InputStream and an OutputStream. + * When writing, puts the blocks in the queue. + * When reading, it takes them out of the queue. + * A thread-safe queue is used. + * If the queue is empty, the read is blocked. + * If the queue is full, the write is blocked. * - * Используется для того, чтобы временно сохранить куда-то результат, и позже передать его дальше. - * Также используется для синхронизации, когда нужно из одного источника сделать несколько - * - для однопроходного выполнения сразу нескольких запросов. - * Также может использоваться для распараллеливания: несколько потоков кладут блоки в очередь, а один - вынимает. + * Used to temporarily store the result somewhere, and later pass it further. + * Also used for synchronization, when you need to make several sources from one + * - for single-pass execution of several queries at once. + * It can also be used for parallelization: several threads put blocks in the queue, and one - takes out. */ class QueueBlockIOStream : public IProfilingBlockInputStream, public IBlockOutputStream diff --git a/dbms/src/DataStreams/RemoveColumnsBlockInputStream.h b/dbms/src/DataStreams/RemoveColumnsBlockInputStream.h index 922e442b67c..3198ba2a803 100644 --- a/dbms/src/DataStreams/RemoveColumnsBlockInputStream.h +++ b/dbms/src/DataStreams/RemoveColumnsBlockInputStream.h @@ -8,7 +8,7 @@ namespace DB { -/** Удаляет из блока указанные столбцы. +/** Removes the specified columns from the block. */ class RemoveColumnsBlockInputStream : public IProfilingBlockInputStream { diff --git a/dbms/src/DataStreams/ReplacingSortedBlockInputStream.h b/dbms/src/DataStreams/ReplacingSortedBlockInputStream.h index c7c1625f5e5..e01c0fb6cf0 100644 --- a/dbms/src/DataStreams/ReplacingSortedBlockInputStream.h +++ b/dbms/src/DataStreams/ReplacingSortedBlockInputStream.h @@ -8,9 +8,9 @@ namespace DB { -/** Соединяет несколько сортированных потоков в один. 
- * При этом, для каждой группы идущих подряд одинаковых значений первичного ключа (столбцов, по которым сортируются данные), - * оставляет +/** Merges several sorted streams into one. + * For each group of consecutive identical values of the primary key (the columns by which the data is sorted), + * keeps row with max `version` value. */ class ReplacingSortedBlockInputStream : public MergingSortedBlockInputStream { @@ -42,7 +42,7 @@ public: } protected: - /// Может возвращаться на 1 больше записей, чем max_block_size. + /// Can return 1 more records than max_block_size. Block readImpl() override; private: @@ -64,7 +64,7 @@ private: template void merge(ColumnPlainPtrs & merged_columns, std::priority_queue & queue); - /// Вставить в результат строки для текущего первичного ключа. + /// Output into result the rows for current primary key. void insertRow(ColumnPlainPtrs & merged_columns, size_t & merged_rows); }; diff --git a/dbms/src/DataStreams/SummingSortedBlockInputStream.h b/dbms/src/DataStreams/SummingSortedBlockInputStream.h index 387506d9869..b401d469d21 100644 --- a/dbms/src/DataStreams/SummingSortedBlockInputStream.h +++ b/dbms/src/DataStreams/SummingSortedBlockInputStream.h @@ -14,17 +14,17 @@ namespace ErrorCodes } -/** Соединяет несколько сортированных потоков в один. - * При этом, для каждой группы идущих подряд одинаковых значений первичного ключа (столбцов, по которым сортируются данные), - * схлопывает их в одну строку, суммируя все числовые столбцы кроме первичного ключа. - * Если во всех числовых столбцах кроме первичного ключа получился ноль, то удаляет строчку. +/** Merges several sorted streams into one. + * For each group of consecutive identical values of the primary key (the columns by which the data is sorted), + * collapses them into one row, summing all the numeric columns except the primary key. + * If in all numeric columns, except for the primary key, the result is zero, it deletes the row. 
*/ class SummingSortedBlockInputStream : public MergingSortedBlockInputStream { public: SummingSortedBlockInputStream(BlockInputStreams inputs_, const SortDescription & description_, - /// Список столбцов, которых нужно суммировать. Если пустое - берутся все числовые столбцы, не входящие в description. + /// List of columns to be summed. If empty, all numeric columns that are not in the description are taken. const Names & column_names_to_sum_, size_t max_block_size_) : MergingSortedBlockInputStream(inputs_, description_, max_block_size_), column_names_to_sum(column_names_to_sum_) @@ -36,68 +36,68 @@ public: String getID() const override; protected: - /// Может возвращаться на 1 больше записей, чем max_block_size. + /// Can return 1 more records than max_block_size. Block readImpl() override; private: Logger * log = &Logger::get("SummingSortedBlockInputStream"); - /// Прочитали до конца. + /// Read up to the end. bool finished = false; - /// Столбцы с какими номерами надо суммировать. - Names column_names_to_sum; /// Если задано - преобразуется в column_numbers_to_sum при инициализации. + /// Columns with which numbers should be summed. + Names column_names_to_sum; /// If set, it is converted to column_numbers_to_sum when initialized. ColumnNumbers column_numbers_to_sum; - /** Таблица может иметь вложенные таблицы, обрабатываемые особым образом. - * Если название вложенной таблицы заканчинвается на `Map` и она содержит не менее двух столбцов, - * удовлетворяющих следующим критериям: - * - первый столбец, а также все столбцы, имена которых заканчиваются на ID, Key или Type - числовые ((U)IntN, Date, DateTime); - * (кортеж из таких столбцов назовём keys) - * - остальные столбцы - арифметические ((U)IntN, Float32/64), условно (values...). - * Такая вложенная таблица воспринимается как отображение (keys...) => (values...) и при слиянии - * ее строк выполняется слияние элементов двух множеств по (keys...) со сложением соответствующих (values...). 
+ /** A table can have nested tables that are treated in a special way. + * If the name of the nested table ends in `Map` and it contains at least two columns, + * satisfying the following criteria: + * - the first column, as well as all columns whose names end with `ID`, `Key` or `Type` - numeric ((U)IntN, Date, DateTime); + * (a tuple of such columns will be called `keys`) + * - the remaining columns are arithmetic ((U)IntN, Float32/64), called (`values`...). + * Such a nested table is treated as a mapping (keys...) => (values...), and when merging + * its rows, the elements of the two sets are merged by (keys...) with summing of the corresponding (values...). * - * Пример: + * Example: * [(1, 100)] + [(2, 150)] -> [(1, 100), (2, 150)] * [(1, 100)] + [(1, 150)] -> [(1, 250)] * [(1, 100)] + [(1, 150), (2, 150)] -> [(1, 250), (2, 150)] * [(1, 100), (2, 150)] + [(1, -100)] -> [(2, 150)] * - * Эта весьма необычная функциональность сделана исключительно для БК, - * не предназначена для использования кем-либо ещё, - * и может быть удалена в любой момент. + * This very unusual functionality is made exclusively for the banner system, + * is not intended for use by anyone else, + * and can be deleted at any time. */ - /// Хранит номера столбцов-ключей и столбцов-значений. + /// Stores numbers of key-columns and value-columns. struct MapDescription { std::vector key_col_nums; std::vector val_col_nums; }; - /// Найденные вложенные Map-таблицы. + /// Found nested Map-tables. std::vector maps_to_sum; - RowRef current_key; /// Текущий первичный ключ. - RowRef next_key; /// Первичный ключ следующей строки. + RowRef current_key; /// The current primary key. + RowRef next_key; /// The primary key of the next row. Row current_row; - bool current_row_is_zero = true; /// Текущая строчка просуммировалась в ноль, и её следует удалить. + bool current_row_is_zero = true; /// The current row is summed to zero, and it should be deleted. 
- bool output_is_non_empty = false; /// Отдали ли мы наружу хоть одну строку. + bool output_is_non_empty = false; /// Have we given out at least one row as a result. - /** Делаем поддержку двух разных курсоров - с Collation и без. - * Шаблоны используем вместо полиморфных SortCursor'ов и вызовов виртуальных функций. + /** We support two different cursors - with Collation and without. + * Templates are used instead of polymorphic SortCursor and calls to virtual functions. */ template void merge(ColumnPlainPtrs & merged_columns, std::priority_queue & queue); - /// Вставить в результат просуммированную строку для текущей группы. + /// Insert the summed row for the current group into the result. void insertCurrentRow(ColumnPlainPtrs & merged_columns); - /** Для вложенных Map выполняется слияние по ключу с выбрасыванием строк вложенных массивов, в которых - * все элементы - нулевые. + /** For nested Map, a merge by key is performed with the ejection of rows of nested arrays, in which + * all items are zero. */ template bool mergeMaps(Row & row, TSortCursor & cursor); @@ -105,8 +105,8 @@ private: template bool mergeMap(const MapDescription & map, Row & row, TSortCursor & cursor); - /** Прибавить строчку под курсором к row. - * Возвращает false, если результат получился нулевым. + /** Add the row under the cursor to the `row`. + * Returns false if the result is zero. */ template bool addRow(Row & row, TSortCursor & cursor); diff --git a/dbms/src/DataStreams/TSKVRowInputStream.h b/dbms/src/DataStreams/TSKVRowInputStream.h index 438705833e5..b057c22af8a 100644 --- a/dbms/src/DataStreams/TSKVRowInputStream.h +++ b/dbms/src/DataStreams/TSKVRowInputStream.h @@ -11,13 +11,13 @@ namespace DB class ReadBuffer; -/** Поток для чтения данных в формате TSKV. - * TSKV - очень неэффективный формат данных. - * Похож на TSV, но каждое поле записано в виде key=value. 
- * Поля могут быть перечислены в произвольном порядке (в том числе, в разных строках может быть разный порядок), - * и часть полей может отсутствовать. - * В имени поля может быть заэскейплен знак равенства. - * Также, в качестве дополнительного элемента может присутствовать бесполезный фрагмент tskv - его нужно игнорировать. +/** Stream for reading data in TSKV format. + * TSKV is a very inefficient data format. + * Similar to TSV, but each field is written as key=value. + * Fields can be listed in any order (including, in different lines there may be different order), + * and some fields may be missing. + * An equal sign can be escaped in the field name. + * Also, as an additional element there may be a useless tskv fragment - it needs to be ignored. */ class TSKVRowInputStream : public IRowInputStream { @@ -31,13 +31,13 @@ public: private: ReadBuffer & istr; const Block sample; - /// Пропускать неизвестные поля. + /// Skip unknown fields. bool skip_unknown; - /// Буфер для прочитанного из потока имени поля. Используется, если его потребовалось скопировать. + /// Buffer for the read from the stream the field name. Used when you have to copy it. String name_buf; - /// Хэш-таблица соответствия имя поля -> позиция в блоке. NOTE Можно использовать perfect hash map. + /// Hash table matching `field name -> position in the block`. NOTE You can use perfect hash map. using NameMap = HashMap; NameMap name_map; }; diff --git a/dbms/src/DataStreams/TSKVRowOutputStream.h b/dbms/src/DataStreams/TSKVRowOutputStream.h index 8d1c7d05a62..255c7bc5b77 100644 --- a/dbms/src/DataStreams/TSKVRowOutputStream.h +++ b/dbms/src/DataStreams/TSKVRowOutputStream.h @@ -6,9 +6,9 @@ namespace DB { -/** Поток для вывода данных в формате TSKV. - * TSKV похож на TabSeparated, но перед каждым значением указывается его имя и знак равенства: name=value. - * Этот формат весьма неэффективен. +/** The stream for outputting data in the TSKV format. 
+ * TSKV is similar to TabSeparated, but before every value, its name and equal sign are specified: name=value. + * This format is very inefficient. */ class TSKVRowOutputStream : public TabSeparatedRowOutputStream { diff --git a/dbms/src/DataStreams/TabSeparatedBlockOutputStream.h b/dbms/src/DataStreams/TabSeparatedBlockOutputStream.h index af991fb507b..00b02c7de9b 100644 --- a/dbms/src/DataStreams/TabSeparatedBlockOutputStream.h +++ b/dbms/src/DataStreams/TabSeparatedBlockOutputStream.h @@ -10,9 +10,9 @@ class Block; class WriteBuffer; -/** Пишет данные в tab-separated файл, но по столбцам, блоками. - * Блоки разделены двойным переводом строки. - * На каждой строке блока - данные одного столбца. +/** Writes the data into a tab-separated file, but by columns, in blocks. + * Blocks are separated by a double line feed. + * On each row of the block - the data of one column. */ class TabSeparatedBlockOutputStream : public IBlockOutputStream { diff --git a/dbms/src/DataStreams/TabSeparatedRawRowOutputStream.h b/dbms/src/DataStreams/TabSeparatedRawRowOutputStream.h index 3f6b152e275..7cf8ab5ce19 100644 --- a/dbms/src/DataStreams/TabSeparatedRawRowOutputStream.h +++ b/dbms/src/DataStreams/TabSeparatedRawRowOutputStream.h @@ -6,8 +6,8 @@ namespace DB { -/** Поток для вывода данных в формате tsv, но без эскейпинга отдельных значений. - * (То есть - вывод необратимый.) +/** A stream for outputting data in tsv format, but without escaping individual values. + * (That is, the output is irreversible.) */ class TabSeparatedRawRowOutputStream : public TabSeparatedRowOutputStream { diff --git a/dbms/src/DataStreams/TabSeparatedRowInputStream.h b/dbms/src/DataStreams/TabSeparatedRowInputStream.h index bd186be734e..9674a522703 100644 --- a/dbms/src/DataStreams/TabSeparatedRowInputStream.h +++ b/dbms/src/DataStreams/TabSeparatedRowInputStream.h @@ -10,13 +10,13 @@ namespace DB class ReadBuffer; -/** Поток для ввода данных в формате tsv. +/** A stream to input data in tsv format. 
*/ class TabSeparatedRowInputStream : public IRowInputStream { public: - /** with_names - в первой строке заголовок с именами столбцов - * with_types - на следующей строке заголовок с именами типов + /** with_names - the first line is the header with the names of the columns + * with_types - on the next line header with type names */ TabSeparatedRowInputStream(ReadBuffer & istr_, const Block & sample_, bool with_names_ = false, bool with_types_ = false); @@ -34,11 +34,11 @@ private: bool with_types; DataTypes data_types; - /// Для удобной диагностики в случае ошибки. + /// For convenient diagnostics in case of an error. size_t row_num = 0; - /// Сколько байт было считано, не считая тех, что ещё в буфере. + /// How many bytes were read, not counting those still in the buffer. size_t bytes_read_at_start_of_buffer_on_current_row = 0; size_t bytes_read_at_start_of_buffer_on_prev_row = 0; diff --git a/dbms/src/DataStreams/TabSeparatedRowOutputStream.h b/dbms/src/DataStreams/TabSeparatedRowOutputStream.h index b8750a1e382..9847b18872c 100644 --- a/dbms/src/DataStreams/TabSeparatedRowOutputStream.h +++ b/dbms/src/DataStreams/TabSeparatedRowOutputStream.h @@ -9,13 +9,13 @@ namespace DB class WriteBuffer; -/** Поток для вывода данных в формате tsv. +/** A stream for outputting data in tsv format. 
*/ class TabSeparatedRowOutputStream : public IRowOutputStream { public: - /** with_names - выводить в первой строке заголовок с именами столбцов - * with_types - выводить на следующей строке заголовок с именами типов + /** with_names - output in the first line a header with column names + * with_types - output the next line header with the names of the types */ TabSeparatedRowOutputStream(WriteBuffer & ostr_, const Block & sample_, bool with_names_ = false, bool with_types_ = false); diff --git a/dbms/src/DataStreams/TotalsHavingBlockInputStream.h b/dbms/src/DataStreams/TotalsHavingBlockInputStream.h index 124dcedd006..42137947427 100644 --- a/dbms/src/DataStreams/TotalsHavingBlockInputStream.h +++ b/dbms/src/DataStreams/TotalsHavingBlockInputStream.h @@ -9,9 +9,9 @@ namespace DB class ExpressionActions; -/** Принимает блоки после группировки, с нефиализированными агрегатными функциями. - * Вычисляет тотальные значения в соответствии с totals_mode. - * Если нужно, вычисляет выражение из HAVING и фильтрует им строки. Отдает финализированные и отфильтрованные блоки. +/** Takes blocks after grouping, with non-finalized aggregate functions. + * Calculates total values according to totals_mode. + * If necessary, evaluates the expression from HAVING and filters rows. Returns the finalized and filtered blocks. */ class TotalsHavingBlockInputStream : public IProfilingBlockInputStream { @@ -42,15 +42,15 @@ private: size_t passed_keys = 0; size_t total_keys = 0; - /** Здесь находятся значения, не прошедшие max_rows_to_group_by. - * Они прибавляются или не прибавляются к current_totals в зависимости от totals_mode. + /** Here are the values that did not pass max_rows_to_group_by. + * They are added or not added to the current_totals, depending on the totals_mode. */ Block overflow_aggregates; - /// Здесь накапливаются тотальные значения. После окончания работы, они будут помещены в IProfilingBlockInputStream::totals. + /// Here, total values are accumulated. 
After the work is finished, they will be placed in IProfilingBlockInputStream::totals. Block current_totals; - /// Если filter == nullptr - прибавлять все строки. Иначе - только строки, проходящие фильтр (HAVING). + /// If filter == nullptr - add all rows. Otherwise, only the rows that pass the filter (HAVING). void addToTotals(Block & totals, Block & block, const IColumn::Filter * filter); }; diff --git a/dbms/src/DataStreams/UnionBlockInputStream.h b/dbms/src/DataStreams/UnionBlockInputStream.h index f988ef59bc4..2267a8ae218 100644 --- a/dbms/src/DataStreams/UnionBlockInputStream.h +++ b/dbms/src/DataStreams/UnionBlockInputStream.h @@ -22,7 +22,7 @@ namespace template struct OutputData; -/// Блок или эксепшен. +/// A block or an exception. template <> struct OutputData { @@ -34,7 +34,7 @@ struct OutputData OutputData(std::exception_ptr & exception_) : exception(exception_) {} }; -/// Блок + дополнительнцю информацию или эксепшен. +/// Block + additional information or an exception. template <> struct OutputData { @@ -49,17 +49,17 @@ struct OutputData } -/** Объединяет несколько источников в один. - * Блоки из разных источников перемежаются друг с другом произвольным образом. - * Можно указать количество потоков (max_threads), - * в которых будет выполняться получение данных из разных источников. +/** Merges several sources into one. + * Blocks from different sources are interleaved with each other in an arbitrary way. + * You can specify the number of threads (max_threads), + * in which data will be retrieved from different sources. 
* - * Устроено так: - * - с помощью ParallelInputsProcessor в нескольких потоках вынимает из источников блоки; - * - полученные блоки складываются в ограниченную очередь готовых блоков; - * - основной поток вынимает готовые блоки из очереди готовых блоков; - * - если указан режим StreamUnionMode::ExtraInfo, в дополнение к блокам UnionBlockInputStream - * вынимает информацию о блоках; в таком случае все источники должны поддержать такой режим. + * It's managed like this: + * - with the help of ParallelInputsProcessor in several threads it takes out blocks from the sources; + * - the completed blocks are added to a limited queue of finished blocks; + * - the main thread takes out completed blocks from the queue of finished blocks; + * - if the StreamUnionMode::ExtraInfo mode is specified, in addition to the blocks, UnionBlockInputStream + * extracts information about the blocks; in this case all sources must support such mode. */ template @@ -95,7 +95,7 @@ public: for (size_t i = 0; i < children.size(); ++i) children_ids[i] = children[i]->getID(); - /// Порядок не имеет значения. + /// Order does not matter. std::sort(children_ids.begin(), children_ids.end()); for (size_t i = 0; i < children_ids.size(); ++i) @@ -121,8 +121,8 @@ public: } } - /** Отличается от реализации по-умолчанию тем, что пытается остановить все источники, - * пропуская отвалившиеся по эксепшену. + /** Differs from the default implementation in that it tries to stop all sources, + * skipping those that failed with an exception. */ void cancel() override { @@ -150,8 +150,8 @@ protected: std::exception_ptr exception; if (!all_read) { - /** Прочитаем всё до конца, чтобы ParallelInputsProcessor не заблокировался при попытке вставить в очередь. - * Может быть, в очереди есть ещё эксепшен. + /** Let's read everything up to the end, so that ParallelInputsProcessor is not blocked when trying to insert into the queue. + * Maybe there is an exception in the queue. 
*/ OutputData res; while (true) @@ -181,17 +181,17 @@ protected: std::rethrow_exception(exception); } - /// Ничего не делаем, чтобы подготовка к выполнению запроса делалась параллельно, в ParallelInputsProcessor. + /// Do nothing, to make the preparation for the query execution in parallel, in ParallelInputsProcessor. void readPrefix() override { } - /** Возможны следующие варианты: - * 1. Функция readImpl вызывается до тех пор, пока она не вернёт пустой блок. - * Затем вызывается функция readSuffix и затем деструктор. - * 2. Вызывается функция readImpl. В какой-то момент, возможно из другого потока вызывается функция cancel. - * Затем вызывается функция readSuffix и затем деструктор. - * 3. В любой момент, объект может быть и так уничтожен (вызываться деструктор). + /** The following options are possible: + * 1. `readImpl` function is called until it returns an empty block. + * Then `readSuffix` function is called and then destructor. + * 2. `readImpl` function is called. At some point, `cancel` function is called perhaps from another thread. + * Then `readSuffix` function is called and then destructor. + * 3. At any time, the object can be destroyed (destructor called). */ Block readImpl() override @@ -199,14 +199,14 @@ protected: if (all_read) return received_payload.block; - /// Запускаем потоки, если это ещё не было сделано. + /// Run threads if this has not already been done. if (!started) { started = true; processor.process(); } - /// Будем ждать, пока будет готов следующий блок или будет выкинуто исключение. + /// We will wait until the next block is ready or an exception is thrown. //std::cerr << "popping\n"; output_queue.pop(received_payload); @@ -223,7 +223,7 @@ protected: return received_payload.block; } - /// Вызывается либо после того, как всё прочитано, либо после cancel-а. + /// Called either after everything is read, or after cancel. 
void readSuffix() override { //std::cerr << "readSuffix\n"; @@ -255,11 +255,11 @@ private: using OutputQueue = ConcurrentBoundedQueue; private: - /** Очередь готовых блоков. Также туда можно положить эксепшен вместо блока. - * Когда данные закончатся - в очередь вставляется пустой блок. - * В очередь всегда (даже после исключения или отмены запроса) рано или поздно вставляется пустой блок. - * Очередь всегда (даже после исключения или отмены запроса, даже в деструкторе) нужно дочитывать до пустого блока, - * иначе ParallelInputsProcessor может заблокироваться при вставке в очередь. + /** The queue of the finished blocks. Also, you can put an exception instead of a block. + * When the data runs out, an empty block is inserted into the queue. + * Sooner or later, an empty block is always inserted into the queue (even after exception or query cancellation). + * The queue must always (even after exception or query cancellation, even in destructor) be read up to an empty block, + * otherwise ParallelInputsProcessor can be blocked during insertion into the queue. */ OutputQueue output_queue; @@ -297,12 +297,12 @@ private: { //std::cerr << "pushing exception\n"; - /// Порядок строк имеет значение. Если его поменять, то возможна ситуация, - /// когда перед эксепшеном, в очередь окажется вставлен пустой блок (конец данных), - /// и эксепшен потеряется. + /// The order of the following lines matters. If it is changed, then a situation is possible + /// when an empty block (end of data) is put into the queue before the exception, + /// and the exception is lost. parent.output_queue.push(exception); - parent.cancel(); /// Не кидает исключений. + parent.cancel(); /// Does not throw exceptions. 
} Self & parent; diff --git a/dbms/src/DataStreams/ValuesRowOutputStream.h b/dbms/src/DataStreams/ValuesRowOutputStream.h index 2e0a9f63957..9b9dfdf6a0b 100644 --- a/dbms/src/DataStreams/ValuesRowOutputStream.h +++ b/dbms/src/DataStreams/ValuesRowOutputStream.h @@ -9,7 +9,7 @@ namespace DB class WriteBuffer; -/** Поток для вывода данных в формате VALUES (как в INSERT запросе). +/** A stream for outputting data in the VALUES format (as in the INSERT request). */ class ValuesRowOutputStream : public IRowOutputStream { diff --git a/dbms/src/DataStreams/XMLRowOutputStream.h b/dbms/src/DataStreams/XMLRowOutputStream.h index a412461b616..b11db4e3cf4 100644 --- a/dbms/src/DataStreams/XMLRowOutputStream.h +++ b/dbms/src/DataStreams/XMLRowOutputStream.h @@ -10,7 +10,7 @@ namespace DB { -/** Поток для вывода данных в формате XML. +/** A stream for outputting data in XML format. */ class XMLRowOutputStream : public IRowOutputStream { diff --git a/dbms/src/DataStreams/copyData.h b/dbms/src/DataStreams/copyData.h index c73e16a2803..2a42ef191cb 100644 --- a/dbms/src/DataStreams/copyData.h +++ b/dbms/src/DataStreams/copyData.h @@ -9,8 +9,8 @@ namespace DB class IBlockInputStream; class IBlockOutputStream; -/** Копирует данные из InputStream в OutputStream - * (например, из БД в консоль и т. п.) +/** Copies data from the InputStream into the OutputStream + * (for example, from the database to the console, etc.) */ void copyData(IBlockInputStream & from, IBlockOutputStream & to, std::atomic * is_cancelled = nullptr); diff --git a/dbms/src/DataStreams/glueBlockInputStreams.h b/dbms/src/DataStreams/glueBlockInputStreams.h index f3a084f724c..bf0272b675c 100644 --- a/dbms/src/DataStreams/glueBlockInputStreams.h +++ b/dbms/src/DataStreams/glueBlockInputStreams.h @@ -6,11 +6,11 @@ namespace DB { -/** Если переданные источники (конвейеры выполнения запроса) имеют одинаковые части, - * то склеивает эти части, заменяя на один источник и вставляя "вилки" (размножители). 
- * Это используется для однопроходного выполнения нескольких запросов. +/** If passed sources (query execution pipelines) have the same parts, + * then glues these parts, replacing them with one source and inserting "forks" (multipliers). + * This is used for single-pass execution of multiple queries. * - * Для выполнения склеенного конвейера, все inputs и forks должны использоваться в разных потоках. + * To execute a glued pipeline, all `inputs` and `forks` must be used in different threads. */ void glueBlockInputStreams(BlockInputStreams & inputs, Forks & forks); diff --git a/dbms/src/DataStreams/isConvertableTypes.cpp b/dbms/src/DataStreams/isConvertableTypes.cpp new file mode 100644 index 00000000000..082c8dc8ec3 --- /dev/null +++ b/dbms/src/DataStreams/isConvertableTypes.cpp @@ -0,0 +1,34 @@ +#include + +#include +#include +#include + +namespace DB +{ + +static DataTypePtr removeNullable(const DataTypePtr & type) +{ + if (type->isNullable()) + return typeid_cast(type.get())->getNestedType(); + return type; +} + +bool isConvertableTypes(const DataTypePtr & from, const DataTypePtr & to) +{ + auto from_nn = removeNullable(from); + auto to_nn = removeNullable(to); + + if ( dynamic_cast(to_nn.get()) && + !dynamic_cast(from_nn.get())) + { + if (typeid_cast(from_nn.get())) + return true; + if (from_nn->isNumeric()) + return true; + } + + return from_nn->equals(*to_nn); +} + +} diff --git a/dbms/src/DataStreams/isConvertableTypes.h b/dbms/src/DataStreams/isConvertableTypes.h new file mode 100644 index 00000000000..af48f9abe38 --- /dev/null +++ b/dbms/src/DataStreams/isConvertableTypes.h @@ -0,0 +1,11 @@ +#pragma once + +#include + +namespace DB +{ + +/// Check that type 'from' can be implicitly converted to type 'to'. 
+bool isConvertableTypes(const DataTypePtr & from, const DataTypePtr & to); + +} diff --git a/dbms/src/DataStreams/narrowBlockInputStreams.h b/dbms/src/DataStreams/narrowBlockInputStreams.h index b8fbd77cc75..305342185b7 100644 --- a/dbms/src/DataStreams/narrowBlockInputStreams.h +++ b/dbms/src/DataStreams/narrowBlockInputStreams.h @@ -6,12 +6,12 @@ namespace DB { -/** Если количество источников inputs больше width, - * то клеит источники друг с другом (с помощью ConcatBlockInputStream), - * чтобы количество источников стало не больше width. +/** If the number of sources of `inputs` is greater than `width`, + * then glues the sources to each other (using ConcatBlockInputStream), + * so that the number of sources becomes no more than `width`. * - * Старается клеить источники друг с другом равномерно-случайно. - * (чтобы избежать перевеса в случае, если распределение количества данных в разных источниках подчиняется некоторому шаблону) + * Trying to glue the sources with each other uniformly randomly. 
+ * (to avoid overweighting if the distribution of the amount of data in different sources is subject to some pattern) */ BlockInputStreams narrowBlockInputStreams(BlockInputStreams & inputs, size_t width); diff --git a/dbms/src/DataStreams/tests/CMakeLists.txt b/dbms/src/DataStreams/tests/CMakeLists.txt index 405e2b1a4d6..2a07e8d9b72 100644 --- a/dbms/src/DataStreams/tests/CMakeLists.txt +++ b/dbms/src/DataStreams/tests/CMakeLists.txt @@ -9,7 +9,7 @@ add_executable (block_row_transforms block_row_transforms.cpp ${SRCS}) target_link_libraries (block_row_transforms dbms) add_executable (expression_stream expression_stream.cpp ${SRCS}) -target_link_libraries (expression_stream dbms storages_system) +target_link_libraries (expression_stream dbms clickhouse_storages_system) add_executable (block_tab_separated_streams block_tab_separated_streams.cpp ${SRCS}) target_link_libraries (block_tab_separated_streams dbms) @@ -18,7 +18,7 @@ add_executable (native_streams native_streams.cpp ${SRCS}) target_link_libraries (native_streams dbms) add_executable (filter_stream filter_stream.cpp ${SRCS}) -target_link_libraries (filter_stream dbms storages_system) +target_link_libraries (filter_stream dbms clickhouse_storages_system) add_executable (filter_stream_hitlog filter_stream_hitlog.cpp ${SRCS}) target_link_libraries (filter_stream_hitlog dbms) @@ -30,7 +30,7 @@ add_executable (aggregating_stream aggregating_stream.cpp ${SRCS}) target_link_libraries (aggregating_stream dbms) add_executable (union_stream union_stream.cpp ${SRCS}) -target_link_libraries (union_stream dbms storages_system) +target_link_libraries (union_stream dbms clickhouse_storages_system) add_executable (union_stream2 union_stream2.cpp ${SRCS}) target_link_libraries (union_stream2 dbms) @@ -39,7 +39,7 @@ add_executable (collapsing_sorted_stream collapsing_sorted_stream.cpp ${SRCS}) target_link_libraries (collapsing_sorted_stream dbms) add_executable (fork_streams fork_streams.cpp ${SRCS}) -target_link_libraries 
(fork_streams dbms storages_system) +target_link_libraries (fork_streams dbms clickhouse_storages_system) add_executable (glue_streams glue_streams.cpp ${SRCS}) target_link_libraries (glue_streams dbms) diff --git a/dbms/src/DataTypes/DataTypeAggregateFunction.h b/dbms/src/DataTypes/DataTypeAggregateFunction.h index 1866b760d6b..0de4487ed70 100644 --- a/dbms/src/DataTypes/DataTypeAggregateFunction.h +++ b/dbms/src/DataTypes/DataTypeAggregateFunction.h @@ -41,7 +41,7 @@ public: DataTypePtr clone() const override { return std::make_shared(function, argument_types, parameters); } - /// NOTE These two functions for serializing single values ​​are incompatible with the functions below. + /// NOTE These two functions for serializing single values are incompatible with the functions below. void serializeBinary(const Field & field, WriteBuffer & ostr) const override; void deserializeBinary(Field & field, ReadBuffer & istr) const override; diff --git a/dbms/src/DataTypes/DataTypeEnum.cpp b/dbms/src/DataTypes/DataTypeEnum.cpp index c963ef53ed6..8b1855273da 100644 --- a/dbms/src/DataTypes/DataTypeEnum.cpp +++ b/dbms/src/DataTypes/DataTypeEnum.cpp @@ -272,7 +272,8 @@ Field DataTypeEnum::castToValue(const Field & value_or_name) const { return static_cast(getValue(value_or_name.get())); } - else if (value_or_name.getType() == Field::Types::Int64) + else if (value_or_name.getType() == Field::Types::Int64 + || value_or_name.getType() == Field::Types::UInt64) { Int64 value = value_or_name.get(); checkOverflow(value); diff --git a/dbms/src/DataTypes/DataTypeFactory.cpp b/dbms/src/DataTypes/DataTypeFactory.cpp index 3fb29ca5089..0630f4907f1 100644 --- a/dbms/src/DataTypes/DataTypeFactory.cpp +++ b/dbms/src/DataTypes/DataTypeFactory.cpp @@ -180,7 +180,7 @@ DataTypePtr DataTypeFactory::get(const String & name) const if (function_name.empty()) throw Exception("Logical error: empty name of aggregate function passed", ErrorCodes::LOGICAL_ERROR); - function = 
AggregateFunctionFactory().get(function_name, argument_types); + function = AggregateFunctionFactory::instance().get(function_name, argument_types); if (!params_row.empty()) function->setParameters(params_row); function->setArguments(argument_types); diff --git a/dbms/src/DataTypes/DataTypeSet.h b/dbms/src/DataTypes/DataTypeSet.h index 514547136b9..8e4f11b100d 100644 --- a/dbms/src/DataTypes/DataTypeSet.h +++ b/dbms/src/DataTypes/DataTypeSet.h @@ -6,7 +6,7 @@ namespace DB { -/** The data type corresponding to the set of values ​​in the IN section. +/** The data type corresponding to the set of values in the IN section. * Used only as an intermediate option when evaluating expressions. */ class DataTypeSet final : public IDataTypeDummy diff --git a/dbms/src/Databases/DatabaseCloud.h b/dbms/src/Databases/DatabaseCloud.h index 3c31ab56dd4..e8fa46e7097 100644 --- a/dbms/src/Databases/DatabaseCloud.h +++ b/dbms/src/Databases/DatabaseCloud.h @@ -26,7 +26,7 @@ namespace DB * * cloud_path - the path to the "cloud"; There may be several different independent clouds /table_definitions - set of unique table definitions so you do not write them many times for a large number of tables - /hash128 -> sql - mapping: hash from table definition (identifier) ​​-> table definition itself as CREATE query + /hash128 -> sql - mapping: hash from table definition (identifier) -> table definition itself as CREATE query /tables - list of tables /database_name - name of the database /name_hash_mod -> compressed_table_list diff --git a/dbms/src/Dictionaries/MongoDBDictionarySource.cpp b/dbms/src/Dictionaries/MongoDBDictionarySource.cpp index 4f8b599ae0e..f9108c43803 100644 --- a/dbms/src/Dictionaries/MongoDBDictionarySource.cpp +++ b/dbms/src/Dictionaries/MongoDBDictionarySource.cpp @@ -32,6 +32,7 @@ namespace ErrorCodes static const size_t max_block_size = 8192; +#if POCO_VERSION < 0x01070800 /// See https://pocoproject.org/forum/viewtopic.php?f=10&t=6326&p=11426&hilit=mongodb+auth#p11485 
static void authenticate(Poco::MongoDB::Connection & connection, const std::string & database, const std::string & user, const std::string & password) @@ -117,6 +118,7 @@ static void authenticate(Poco::MongoDB::Connection & connection, } } } +#endif MongoDBDictionarySource::MongoDBDictionarySource( diff --git a/dbms/src/Dictionaries/ODBCDictionarySource.cpp b/dbms/src/Dictionaries/ODBCDictionarySource.cpp index 9e783273c2b..34d41c48c06 100644 --- a/dbms/src/Dictionaries/ODBCDictionarySource.cpp +++ b/dbms/src/Dictionaries/ODBCDictionarySource.cpp @@ -21,13 +21,14 @@ ODBCDictionarySource::ODBCDictionarySource(const DictionaryStructure & dict_stru table{config.getString(config_prefix + ".table")}, where{config.getString(config_prefix + ".where", "")}, sample_block{sample_block}, - pool{std::make_shared( - config.getString(config_prefix + ".connector", "ODBC"), - config.getString(config_prefix + ".connection_string"))}, query_builder{dict_struct, db, table, where, ExternalQueryBuilder::None}, /// NOTE Better to obtain quoting style via ODBC interface. 
load_all_query{query_builder.composeLoadAllQuery()}, invalidate_query{config.getString(config_prefix + ".invalidate_query", "")} { + pool = createAndCheckResizePocoSessionPool([&] () { return std::make_shared( + config.getString(config_prefix + ".connector", "ODBC"), + config.getString(config_prefix + ".connection_string")); + }); } /// copy-constructor is provided in order to support cloneability @@ -45,6 +46,21 @@ ODBCDictionarySource::ODBCDictionarySource(const ODBCDictionarySource & other) { } +std::shared_ptr ODBCDictionarySource::createAndCheckResizePocoSessionPool(PocoSessionPoolConstructor pool_constr) +{ + static std::mutex mutex; + + Poco::ThreadPool & pool = Poco::ThreadPool::defaultPool(); + + /// NOTE: The lock don't guarantee that external users of the pool don't change its capacity + std::unique_lock lock(mutex); + + if (pool.available() == 0) + pool.addCapacity(2 * std::max(pool.capacity(), 1)); + + return pool_constr(); +} + BlockInputStreamPtr ODBCDictionarySource::loadAll() { LOG_TRACE(log, load_all_query); diff --git a/dbms/src/Dictionaries/ODBCDictionarySource.h b/dbms/src/Dictionaries/ODBCDictionarySource.h index 449a707d848..f51b61f4af2 100644 --- a/dbms/src/Dictionaries/ODBCDictionarySource.h +++ b/dbms/src/Dictionaries/ODBCDictionarySource.h @@ -61,11 +61,17 @@ private: const std::string table; const std::string where; Block sample_block; - std::shared_ptr pool; + std::shared_ptr pool = nullptr; ExternalQueryBuilder query_builder; const std::string load_all_query; std::string invalidate_query; mutable std::string invalidate_query_response; + + using PocoSessionPoolConstructor = std::function()>; + + /// Is used to adjust max size of default Poco thread pool. 
See issue #750 + /// Acquire the lock, resize pool and construct new Session + static std::shared_ptr createAndCheckResizePocoSessionPool(PocoSessionPoolConstructor pool_constr); }; diff --git a/dbms/src/Dictionaries/TrieDictionary.cpp b/dbms/src/Dictionaries/TrieDictionary.cpp new file mode 100644 index 00000000000..3e2ed7b4e8f --- /dev/null +++ b/dbms/src/Dictionaries/TrieDictionary.cpp @@ -0,0 +1,545 @@ +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int TYPE_MISMATCH; + extern const int ARGUMENT_OUT_OF_BOUND; + extern const int BAD_ARGUMENTS; + extern const int DICTIONARY_IS_EMPTY; +} + +TrieDictionary::TrieDictionary( + const std::string & name, const DictionaryStructure & dict_struct, DictionarySourcePtr source_ptr, + const DictionaryLifetime dict_lifetime, bool require_nonempty) + : name{name}, dict_struct(dict_struct), source_ptr{std::move(source_ptr)}, dict_lifetime(dict_lifetime), + require_nonempty(require_nonempty) +{ + createAttributes(); + trie = btrie_create(); + + try + { + loadData(); + calculateBytesAllocated(); + } + catch (...) 
+ { + creation_exception = std::current_exception(); + } + + creation_time = std::chrono::system_clock::now(); +} + +TrieDictionary::TrieDictionary(const TrieDictionary & other) + : TrieDictionary{other.name, other.dict_struct, other.source_ptr->clone(), other.dict_lifetime, other.require_nonempty} +{ + trie = btrie_create(); +} + +TrieDictionary::~TrieDictionary() +{ + btrie_destroy(trie); +} + +#define DECLARE(TYPE)\ +void TrieDictionary::get##TYPE(\ + const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types,\ + PaddedPODArray & out) const\ +{\ + validateKeyTypes(key_types);\ + \ + const auto & attribute = getAttribute(attribute_name);\ + if (!isAttributeTypeConvertibleTo(attribute.type, AttributeUnderlyingType::TYPE))\ + throw Exception{\ + name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),\ + ErrorCodes::TYPE_MISMATCH};\ + \ + const auto null_value = std::get(attribute.null_values);\ + \ + getItemsNumber(attribute, key_columns,\ + [&] (const std::size_t row, const auto value) { out[row] = value; },\ + [&] (const std::size_t) { return null_value; });\ +} +DECLARE(UInt8) +DECLARE(UInt16) +DECLARE(UInt32) +DECLARE(UInt64) +DECLARE(Int8) +DECLARE(Int16) +DECLARE(Int32) +DECLARE(Int64) +DECLARE(Float32) +DECLARE(Float64) +#undef DECLARE + +void TrieDictionary::getString( + const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types, + ColumnString * out) const +{ + validateKeyTypes(key_types); + + const auto & attribute = getAttribute(attribute_name); + if (!isAttributeTypeConvertibleTo(attribute.type, AttributeUnderlyingType::String)) + throw Exception{ + name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type), + ErrorCodes::TYPE_MISMATCH}; + + const auto & null_value = StringRef{std::get(attribute.null_values)}; + + getItemsImpl(attribute, key_columns, + [&] (const std::size_t row, 
const StringRef value) { out->insertData(value.data, value.size); }, + [&] (const std::size_t) { return null_value; }); +} + +#define DECLARE(TYPE)\ +void TrieDictionary::get##TYPE(\ + const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types,\ + const PaddedPODArray & def, PaddedPODArray & out) const\ +{\ + validateKeyTypes(key_types);\ + \ + const auto & attribute = getAttribute(attribute_name);\ + if (!isAttributeTypeConvertibleTo(attribute.type, AttributeUnderlyingType::TYPE))\ + throw Exception{\ + name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),\ + ErrorCodes::TYPE_MISMATCH};\ + \ + getItemsNumber(attribute, key_columns,\ + [&] (const std::size_t row, const auto value) { out[row] = value; },\ + [&] (const std::size_t row) { return def[row]; });\ +} +DECLARE(UInt8) +DECLARE(UInt16) +DECLARE(UInt32) +DECLARE(UInt64) +DECLARE(Int8) +DECLARE(Int16) +DECLARE(Int32) +DECLARE(Int64) +DECLARE(Float32) +DECLARE(Float64) +#undef DECLARE + +void TrieDictionary::getString( + const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types, + const ColumnString * const def, ColumnString * const out) const +{ + validateKeyTypes(key_types); + + const auto & attribute = getAttribute(attribute_name); + if (!isAttributeTypeConvertibleTo(attribute.type, AttributeUnderlyingType::String)) + throw Exception{ + name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type), + ErrorCodes::TYPE_MISMATCH}; + + getItemsImpl(attribute, key_columns, + [&] (const std::size_t row, const StringRef value) { out->insertData(value.data, value.size); }, + [&] (const std::size_t row) { return def->getDataAt(row); }); +} + +#define DECLARE(TYPE)\ +void TrieDictionary::get##TYPE(\ + const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types,\ + const TYPE def, PaddedPODArray & out) 
const\ +{\ + validateKeyTypes(key_types);\ + \ + const auto & attribute = getAttribute(attribute_name);\ + if (!isAttributeTypeConvertibleTo(attribute.type, AttributeUnderlyingType::TYPE))\ + throw Exception{\ + name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),\ + ErrorCodes::TYPE_MISMATCH};\ + \ + getItemsNumber(attribute, key_columns,\ + [&] (const std::size_t row, const auto value) { out[row] = value; },\ + [&] (const std::size_t) { return def; });\ +} +DECLARE(UInt8) +DECLARE(UInt16) +DECLARE(UInt32) +DECLARE(UInt64) +DECLARE(Int8) +DECLARE(Int16) +DECLARE(Int32) +DECLARE(Int64) +DECLARE(Float32) +DECLARE(Float64) +#undef DECLARE + +void TrieDictionary::getString( + const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types, + const String & def, ColumnString * const out) const +{ + validateKeyTypes(key_types); + + const auto & attribute = getAttribute(attribute_name); + if (!isAttributeTypeConvertibleTo(attribute.type, AttributeUnderlyingType::String)) + throw Exception{ + name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type), + ErrorCodes::TYPE_MISMATCH}; + + getItemsImpl(attribute, key_columns, + [&] (const std::size_t row, const StringRef value) { out->insertData(value.data, value.size); }, + [&] (const std::size_t) { return StringRef{def}; }); +} + +void TrieDictionary::has(const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types, PaddedPODArray & out) const +{ + validateKeyTypes(key_types); + + const auto & attribute = attributes.front(); + + switch (attribute.type) + { + case AttributeUnderlyingType::UInt8: has(attribute, key_columns, out); break; + case AttributeUnderlyingType::UInt16: has(attribute, key_columns, out); break; + case AttributeUnderlyingType::UInt32: has(attribute, key_columns, out); break; + case AttributeUnderlyingType::UInt64: has(attribute, key_columns, out); break; + case 
AttributeUnderlyingType::Int8: has(attribute, key_columns, out); break; + case AttributeUnderlyingType::Int16: has(attribute, key_columns, out); break; + case AttributeUnderlyingType::Int32: has(attribute, key_columns, out); break; + case AttributeUnderlyingType::Int64: has(attribute, key_columns, out); break; + case AttributeUnderlyingType::Float32: has(attribute, key_columns, out); break; + case AttributeUnderlyingType::Float64: has(attribute, key_columns, out); break; + case AttributeUnderlyingType::String: has(attribute, key_columns, out); break; + } +} + +void TrieDictionary::createAttributes() +{ + const auto size = dict_struct.attributes.size(); + attributes.reserve(size); + + for (const auto & attribute : dict_struct.attributes) + { + attribute_index_by_name.emplace(attribute.name, attributes.size()); + attributes.push_back(createAttributeWithType(attribute.underlying_type, attribute.null_value)); + + if (attribute.hierarchical) + throw Exception{ + name + ": hierarchical attributes not supported for dictionary of type " + getTypeName(), + ErrorCodes::TYPE_MISMATCH}; + } +} + +void TrieDictionary::loadData() +{ + auto stream = source_ptr->loadAll(); + stream->readPrefix(); + + /// created upfront to avoid excess allocations + const auto keys_size = dict_struct.key.value().size(); + StringRefs keys(keys_size); + + const auto attributes_size = attributes.size(); + + while (const auto block = stream->read()) + { + const auto rows = block.rows(); + element_count += rows; + + const auto key_column_ptrs = ext::map(ext::range(0, keys_size), + [&] (const std::size_t attribute_idx) { + return block.safeGetByPosition(attribute_idx).column.get(); + }); + + const auto attribute_column_ptrs = ext::map(ext::range(0, attributes_size), + [&] (const std::size_t attribute_idx) { + return block.safeGetByPosition(keys_size + attribute_idx).column.get(); + }); + + for (const auto row_idx : ext::range(0, rows)) + { + /// calculate key once per row + const auto key_column = 
key_column_ptrs.front(); + + for (const auto attribute_idx : ext::range(0, attributes_size)) + { + const auto & attribute_column = *attribute_column_ptrs[attribute_idx]; + auto & attribute = attributes[attribute_idx]; + setAttributeValue(attribute, key_column->getDataAt(row_idx), attribute_column[row_idx]); + } + } + + } + + stream->readSuffix(); + + if (require_nonempty && 0 == element_count) + throw Exception{ + name + ": dictionary source is empty and 'require_nonempty' property is set.", + ErrorCodes::DICTIONARY_IS_EMPTY}; +} + +template +void TrieDictionary::addAttributeSize(const Attribute & attribute) +{ + const auto & vec = *std::get>(attribute.maps); + bytes_allocated += sizeof(ContainerType) + (vec.capacity() * sizeof(T)); + bucket_count = vec.size(); +} + +void TrieDictionary::calculateBytesAllocated() +{ + bytes_allocated += attributes.size() * sizeof(attributes.front()); + + for (const auto & attribute : attributes) + { + switch (attribute.type) + { + case AttributeUnderlyingType::UInt8: addAttributeSize(attribute); break; + case AttributeUnderlyingType::UInt16: addAttributeSize(attribute); break; + case AttributeUnderlyingType::UInt32: addAttributeSize(attribute); break; + case AttributeUnderlyingType::UInt64: addAttributeSize(attribute); break; + case AttributeUnderlyingType::Int8: addAttributeSize(attribute); break; + case AttributeUnderlyingType::Int16: addAttributeSize(attribute); break; + case AttributeUnderlyingType::Int32: addAttributeSize(attribute); break; + case AttributeUnderlyingType::Int64: addAttributeSize(attribute); break; + case AttributeUnderlyingType::Float32: addAttributeSize(attribute); break; + case AttributeUnderlyingType::Float64: addAttributeSize(attribute); break; + case AttributeUnderlyingType::String: + { + addAttributeSize(attribute); + bytes_allocated += sizeof(Arena) + attribute.string_arena->size(); + + break; + } + } + } + + bytes_allocated += btrie_allocated(trie); +} + +void TrieDictionary::validateKeyTypes(const 
DataTypes & key_types) const +{ + if (key_types.size() != 1) + throw Exception{ + "Expected a single IP address", + ErrorCodes::TYPE_MISMATCH}; + + const auto & actual_type = key_types[0]->getName(); + + if (actual_type != "UInt32" && actual_type != "FixedString(16)") + throw Exception{ + "Key does not match, expected either UInt32 or FixedString(16)", + ErrorCodes::TYPE_MISMATCH}; +} + + +template +void TrieDictionary::createAttributeImpl(Attribute & attribute, const Field & null_value) +{ + std::get(attribute.null_values) = null_value.get::Type>(); + std::get>(attribute.maps) = std::make_unique>(); +} + +TrieDictionary::Attribute TrieDictionary::createAttributeWithType(const AttributeUnderlyingType type, const Field & null_value) +{ + Attribute attr{type}; + + switch (type) + { + case AttributeUnderlyingType::UInt8: createAttributeImpl(attr, null_value); break; + case AttributeUnderlyingType::UInt16: createAttributeImpl(attr, null_value); break; + case AttributeUnderlyingType::UInt32: createAttributeImpl(attr, null_value); break; + case AttributeUnderlyingType::UInt64: createAttributeImpl(attr, null_value); break; + case AttributeUnderlyingType::Int8: createAttributeImpl(attr, null_value); break; + case AttributeUnderlyingType::Int16: createAttributeImpl(attr, null_value); break; + case AttributeUnderlyingType::Int32: createAttributeImpl(attr, null_value); break; + case AttributeUnderlyingType::Int64: createAttributeImpl(attr, null_value); break; + case AttributeUnderlyingType::Float32: createAttributeImpl(attr, null_value); break; + case AttributeUnderlyingType::Float64: createAttributeImpl(attr, null_value); break; + case AttributeUnderlyingType::String: + { + std::get(attr.null_values) = null_value.get(); + std::get>(attr.maps) = std::make_unique>(); + attr.string_arena = std::make_unique(); + break; + } + } + + return attr; +} + + +template +void TrieDictionary::getItemsNumber( + const Attribute & attribute, + const ConstColumnPlainPtrs & key_columns, + 
ValueSetter && set_value, + DefaultGetter && get_default) const +{ + if (false) {} +#define DISPATCH(TYPE) \ + else if (attribute.type == AttributeUnderlyingType::TYPE) \ + getItemsImpl(attribute, key_columns, std::forward(set_value), std::forward(get_default)); + DISPATCH(UInt8) + DISPATCH(UInt16) + DISPATCH(UInt32) + DISPATCH(UInt64) + DISPATCH(Int8) + DISPATCH(Int16) + DISPATCH(Int32) + DISPATCH(Int64) + DISPATCH(Float32) + DISPATCH(Float64) +#undef DISPATCH + else + throw Exception("Unexpected type of attribute: " + toString(attribute.type), ErrorCodes::LOGICAL_ERROR); +} + +template +void TrieDictionary::getItemsImpl( + const Attribute & attribute, + const ConstColumnPlainPtrs & key_columns, + ValueSetter && set_value, + DefaultGetter && get_default) const +{ + auto & vec = *std::get>(attribute.maps); + + const auto first_column = key_columns.front(); + const auto rows = first_column->size(); + if (first_column->isNumeric()) + { + for (const auto i : ext::range(0, rows)) + { + auto addr = Int32(first_column->get64(i)); + uintptr_t slot = btrie_find(trie, addr); + set_value(i, slot != BTRIE_NULL ? vec[slot] : get_default(i)); + } + } + else + { + for (const auto i : ext::range(0, rows)) + { + auto addr = first_column->getDataAt(i); + if (addr.size != 16) + throw Exception("Expected key to be FixedString(16)", ErrorCodes::LOGICAL_ERROR); + + uintptr_t slot = btrie_find_a6(trie, reinterpret_cast(addr.data)); + set_value(i, slot != BTRIE_NULL ? vec[slot] : get_default(i)); + } + } + + query_count.fetch_add(rows, std::memory_order_relaxed); +} + + +template +bool TrieDictionary::setAttributeValueImpl(Attribute & attribute, const StringRef key, const T value) +{ + // Insert value into appropriate vector type + auto & vec = *std::get>(attribute.maps); + size_t row = vec.size(); + vec.push_back(value); + + // Parse IP address and subnet length from string (e.g. 
2a02:6b8::3/64) + Poco::Net::IPAddress addr, mask; + std::string addr_str(key.toString()); + size_t pos = addr_str.find('/'); + if (pos != std::string::npos) + { + + addr = Poco::Net::IPAddress(addr_str.substr(0, pos)); + mask = Poco::Net::IPAddress(std::stoi(addr_str.substr(pos + 1), nullptr, 10), addr.family()); + } + else + { + addr = Poco::Net::IPAddress(addr_str); + mask = Poco::Net::IPAddress(addr.length() * 8, addr.family()); + } + + /* + * Here we might overwrite the same key with the same slot as each key can map to multiple attributes. + * However, all columns have equal number of rows so it is okay to store only row number for each key + * instead of building a trie for each column. This comes at the cost of additional lookup in attribute + * vector on lookup time to return cell from row + column. The reason for this is to save space, + * and build only single trie instead of trie for each column. + */ + if (addr.family() == Poco::Net::IPAddress::IPv4) + { + UInt32 addr_v4 = Poco::ByteOrder::toNetwork(*reinterpret_cast(addr.addr())); + UInt32 mask_v4 = Poco::ByteOrder::toNetwork(*reinterpret_cast(mask.addr())); + return btrie_insert(trie, addr_v4, mask_v4, row) == 0; + } + + const uint8_t* addr_v6 = reinterpret_cast(addr.addr()); + const uint8_t* mask_v6 = reinterpret_cast(mask.addr()); + return btrie_insert_a6(trie, addr_v6, mask_v6, row) == 0; +} + +bool TrieDictionary::setAttributeValue(Attribute & attribute, const StringRef key, const Field & value) +{ + switch (attribute.type) + { + case AttributeUnderlyingType::UInt8: return setAttributeValueImpl(attribute, key, value.get()); + case AttributeUnderlyingType::UInt16: return setAttributeValueImpl(attribute, key, value.get()); + case AttributeUnderlyingType::UInt32: return setAttributeValueImpl(attribute, key, value.get()); + case AttributeUnderlyingType::UInt64: return setAttributeValueImpl(attribute, key, value.get()); + case AttributeUnderlyingType::Int8: return setAttributeValueImpl(attribute, key, 
value.get()); + case AttributeUnderlyingType::Int16: return setAttributeValueImpl(attribute, key, value.get()); + case AttributeUnderlyingType::Int32: return setAttributeValueImpl(attribute, key, value.get()); + case AttributeUnderlyingType::Int64: return setAttributeValueImpl(attribute, key, value.get()); + case AttributeUnderlyingType::Float32: return setAttributeValueImpl(attribute, key, value.get()); + case AttributeUnderlyingType::Float64: return setAttributeValueImpl(attribute, key, value.get()); + case AttributeUnderlyingType::String: + { + const auto & string = value.get(); + const auto string_in_arena = attribute.string_arena->insert(string.data(), string.size()); + setAttributeValueImpl(attribute, key, StringRef{string_in_arena, string.size()}); + return true; + } + } + + return {}; +} + +const TrieDictionary::Attribute & TrieDictionary::getAttribute(const std::string & attribute_name) const +{ + const auto it = attribute_index_by_name.find(attribute_name); + if (it == std::end(attribute_index_by_name)) + throw Exception{ + name + ": no such attribute '" + attribute_name + "'", + ErrorCodes::BAD_ARGUMENTS}; + + return attributes[it->second]; +} + +template +void TrieDictionary::has(const Attribute & attribute, const ConstColumnPlainPtrs & key_columns, PaddedPODArray & out) const +{ + const auto first_column = key_columns.front(); + const auto rows = first_column->size(); + if (first_column->isNumeric()) + { + for (const auto i : ext::range(0, rows)) + { + auto addr = Int32(first_column->get64(i)); + uintptr_t slot = btrie_find(trie, addr); + out[i] = (slot != BTRIE_NULL); + } + } + else + { + for (const auto i : ext::range(0, rows)) + { + auto addr = first_column->getDataAt(i); + if (unlikely(addr.size != 16)) + throw Exception("Expected key to be FixedString(16)", ErrorCodes::LOGICAL_ERROR); + + uintptr_t slot = btrie_find_a6(trie, reinterpret_cast(addr.data)); + out[i] = (slot != BTRIE_NULL); + } + } + + query_count.fetch_add(rows, 
std::memory_order_relaxed);} + +} diff --git a/dbms/src/Dictionaries/TrieDictionary.h b/dbms/src/Dictionaries/TrieDictionary.h new file mode 100644 index 00000000000..09f745f012b --- /dev/null +++ b/dbms/src/Dictionaries/TrieDictionary.h @@ -0,0 +1,216 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +class TrieDictionary final : public IDictionaryBase +{ +public: + TrieDictionary( + const std::string & name, const DictionaryStructure & dict_struct, DictionarySourcePtr source_ptr, + const DictionaryLifetime dict_lifetime, bool require_nonempty); + + TrieDictionary(const TrieDictionary & other); + + ~TrieDictionary(); + + std::string getKeyDescription() const { return key_description; }; + + std::exception_ptr getCreationException() const override { return creation_exception; } + + std::string getName() const override { return name; } + + std::string getTypeName() const override { return "Trie"; } + + std::size_t getBytesAllocated() const override { return bytes_allocated; } + + std::size_t getQueryCount() const override { return query_count.load(std::memory_order_relaxed); } + + double getHitRate() const override { return 1.0; } + + std::size_t getElementCount() const override { return element_count; } + + double getLoadFactor() const override { return static_cast(element_count) / bucket_count; } + + bool isCached() const override { return false; } + + DictionaryPtr clone() const override { return std::make_unique(*this); } + + const IDictionarySource * getSource() const override { return source_ptr.get(); } + + const DictionaryLifetime & getLifetime() const override { return dict_lifetime; } + + const DictionaryStructure & getStructure() const override { return dict_struct; } + + std::chrono::time_point getCreationTime() const override + { + return creation_time; + } + + bool isInjective(const std::string & attribute_name) const override + { + return 
dict_struct.attributes[&getAttribute(attribute_name) - attributes.data()].injective; + } + +#define DECLARE(TYPE)\ + void get##TYPE(\ + const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types,\ + PaddedPODArray & out) const; + DECLARE(UInt8) + DECLARE(UInt16) + DECLARE(UInt32) + DECLARE(UInt64) + DECLARE(Int8) + DECLARE(Int16) + DECLARE(Int32) + DECLARE(Int64) + DECLARE(Float32) + DECLARE(Float64) +#undef DECLARE + + void getString( + const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types, + ColumnString * out) const; + +#define DECLARE(TYPE)\ + void get##TYPE(\ + const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types,\ + const PaddedPODArray & def, PaddedPODArray & out) const; + DECLARE(UInt8) + DECLARE(UInt16) + DECLARE(UInt32) + DECLARE(UInt64) + DECLARE(Int8) + DECLARE(Int16) + DECLARE(Int32) + DECLARE(Int64) + DECLARE(Float32) + DECLARE(Float64) +#undef DECLARE + + void getString( + const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types, + const ColumnString * const def, ColumnString * const out) const; + +#define DECLARE(TYPE)\ + void get##TYPE(\ + const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types,\ + const TYPE def, PaddedPODArray & out) const; + DECLARE(UInt8) + DECLARE(UInt16) + DECLARE(UInt32) + DECLARE(UInt64) + DECLARE(Int8) + DECLARE(Int16) + DECLARE(Int32) + DECLARE(Int64) + DECLARE(Float32) + DECLARE(Float64) +#undef DECLARE + + void getString( + const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types, + const String & def, ColumnString * const out) const; + + void has(const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types, PaddedPODArray & out) const; + +private: + template using ContainerType = std::vector; + template using 
ContainerPtrType = std::unique_ptr>; + + struct Attribute final + { + AttributeUnderlyingType type; + std::tuple< + UInt8, UInt16, UInt32, UInt64, + Int8, Int16, Int32, Int64, + Float32, Float64, + String> null_values; + std::tuple< + ContainerPtrType, ContainerPtrType, ContainerPtrType, ContainerPtrType, + ContainerPtrType, ContainerPtrType, ContainerPtrType, ContainerPtrType, + ContainerPtrType, ContainerPtrType, + ContainerPtrType> maps; + std::unique_ptr string_arena; + }; + + void createAttributes(); + + void loadData(); + + template + void addAttributeSize(const Attribute & attribute); + + void calculateBytesAllocated(); + + void validateKeyTypes(const DataTypes & key_types) const; + + template + void createAttributeImpl(Attribute & attribute, const Field & null_value); + + Attribute createAttributeWithType(const AttributeUnderlyingType type, const Field & null_value); + + + template + void getItemsNumber( + const Attribute & attribute, + const ConstColumnPlainPtrs & key_columns, + ValueSetter && set_value, + DefaultGetter && get_default) const; + + template + void getItemsImpl( + const Attribute & attribute, + const ConstColumnPlainPtrs & key_columns, + ValueSetter && set_value, + DefaultGetter && get_default) const; + + + template + bool setAttributeValueImpl(Attribute & attribute, const StringRef key, const T value); + + bool setAttributeValue(Attribute & attribute, const StringRef key, const Field & value); + + const Attribute & getAttribute(const std::string & attribute_name) const; + + template + void has(const Attribute & attribute, const ConstColumnPlainPtrs & key_columns, PaddedPODArray & out) const; + + const std::string name; + const DictionaryStructure dict_struct; + const DictionarySourcePtr source_ptr; + const DictionaryLifetime dict_lifetime; + const bool require_nonempty; + const std::string key_description{dict_struct.getKeyDescription()}; + + + btrie_t *trie; + std::map attribute_index_by_name; + std::vector attributes; + + std::size_t 
bytes_allocated = 0; + std::size_t element_count = 0; + std::size_t bucket_count = 0; + mutable std::atomic query_count{0}; + + std::chrono::time_point creation_time; + + std::exception_ptr creation_exception; +}; + + +} diff --git a/dbms/src/Functions/FunctionsArithmetic.h b/dbms/src/Functions/FunctionsArithmetic.h index 9fb659af477..234a2f981c6 100644 --- a/dbms/src/Functions/FunctionsArithmetic.h +++ b/dbms/src/Functions/FunctionsArithmetic.h @@ -91,7 +91,7 @@ struct PlusImpl template static inline Result apply(A a, B b) { - /// Далее везде, static_cast - чтобы не было неправильного результата в выражениях вида Int64 c = UInt32(a) * Int32(-1). + /// Next everywhere, static_cast - so that there is no wrong result in expressions of the form Int64 c = UInt32(a) * Int32(-1). return static_cast(a) + b; } }; @@ -140,7 +140,7 @@ struct DivideFloatingImpl template inline void throwIfDivisionLeadsToFPE(A a, B b) { - /// Возможно, лучше вместо проверок использовать siglongjmp? + /// Is it better to use siglongjmp instead of checks? if (unlikely(b == 0)) throw Exception("Division by zero", ErrorCodes::ILLEGAL_DIVISION); @@ -153,7 +153,7 @@ inline void throwIfDivisionLeadsToFPE(A a, B b) template inline bool divisionLeadsToFPE(A a, B b) { - /// Возможно, лучше вместо проверок использовать siglongjmp? + /// Is it better to use siglongjmp instead of checks? if (unlikely(b == 0)) return true; @@ -308,7 +308,7 @@ struct LeastBaseImpl template static inline Result apply(A a, B b) { - /** gcc 4.9.2 успешно векторизует цикл из этой функции. */ + /** gcc 4.9.2 successfully vectorizes a loop from this function. */ return static_cast(a) < static_cast(b) ? static_cast(a) : static_cast(b); } }; @@ -937,7 +937,7 @@ using FunctionBitRotateRight = FunctionBinaryArithmetic; using FunctionGreatest = FunctionBinaryArithmetic; -/// Свойства монотонности для некоторых функций. +/// Monotonicity properties for some functions. 
template <> struct FunctionUnaryArithmeticMonotonicity { @@ -974,7 +974,7 @@ template <> struct FunctionUnaryArithmeticMonotonicity } -/// Оптимизации для целочисленного деления на константу. +/// Optimizations for integer division by a constant. #if __SSE2__ #define LIBDIVIDE_USE_SSE2 1 @@ -1065,16 +1065,16 @@ struct ModuloByConstantImpl libdivide::divider divider(b); - /// Тут не удалось сделать так, чтобы SSE вариант из libdivide давал преимущество. + /// Here we failed to make the SSE variant from libdivide give an advantage. size_t size = a.size(); for (size_t i = 0; i < size; ++i) - c[i] = a[i] - (a[i] / divider) * b; /// NOTE: возможно, не сохраняется семантика деления с остатком отрицательных чисел. + c[i] = a[i] - (a[i] / divider) * b; /// NOTE: perhaps, the division semantics with the remainder of negative numbers is not preserved. } }; -/** Прописаны специализации для деления чисел типа UInt64 и UInt32 на числа той же знаковости. - * Можно дополнить до всех возможных комбинаций, но потребуется больше кода. +/** Specializations are specified for dividing numbers of the type UInt64 and UInt32 by the numbers of the same sign. + * Can be expanded to all possible combinations, but more code is needed. 
*/ template <> struct BinaryOperationImpl> : DivideIntegralByConstantImpl {}; diff --git a/dbms/src/Functions/FunctionsArray.cpp b/dbms/src/Functions/FunctionsArray.cpp index d2c83cb30e6..76a5c1cc2d2 100644 --- a/dbms/src/Functions/FunctionsArray.cpp +++ b/dbms/src/Functions/FunctionsArray.cpp @@ -2818,7 +2818,7 @@ void FunctionArrayReduce::getReturnTypeAndPrerequisitesImpl( } } - aggregate_function = AggregateFunctionFactory().get(aggregate_function_name, argument_types); + aggregate_function = AggregateFunctionFactory::instance().get(aggregate_function_name, argument_types); if (has_parameters) aggregate_function->setParameters(params_row); diff --git a/dbms/src/Functions/FunctionsCoding.cpp b/dbms/src/Functions/FunctionsCoding.cpp index 32e64ddb83d..12f3f7c315f 100644 --- a/dbms/src/Functions/FunctionsCoding.cpp +++ b/dbms/src/Functions/FunctionsCoding.cpp @@ -14,6 +14,8 @@ void registerFunctionsCoding(FunctionFactory & factory) factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); diff --git a/dbms/src/Functions/FunctionsCoding.h b/dbms/src/Functions/FunctionsCoding.h index 96a5ac965a8..3258608ad9c 100644 --- a/dbms/src/Functions/FunctionsCoding.h +++ b/dbms/src/Functions/FunctionsCoding.h @@ -985,6 +985,211 @@ private: }; +class FunctionMACNumToString : public IFunction +{ +public: + static constexpr auto name = "MACNumToString"; + static FunctionPtr create(const Context & context) { return std::make_shared(); } + + String getName() const override + { + return name; + } + + size_t getNumberOfArguments() const override { return 1; } + bool isInjective(const Block &) override { return true; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + if (!typeid_cast(&*arguments[0])) + throw Exception("Illegal type " + arguments[0]->getName() + " of argument of 
function " + getName() + ", expected UInt64", + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + + return std::make_shared(); + } + + static void formatMAC(UInt64 mac, char *& out) + { + char * begin = out; + + /// mapping of digits up to base 16 + static char digits[] = "0123456789ABCDEF"; + + /// Запишем все задом наперед. + for (size_t offset = 0; offset <= 40; offset += 8) + { + if (offset > 0) + *(out++) = ':'; + + /// Достаем очередной байт. + UInt64 value = (mac >> offset) & static_cast(255); + + /// Быстрее, чем sprintf. + if (value < 16) + { + *(out++) = '0'; + } + if (value == 0) + { + *(out++) = '0'; + } + else + { + while (value > 0) + { + *(out++) = digits[value % 16]; + value /= 16; + } + } + } + + /// И развернем. + std::reverse(begin, out); + + *(out++) = '\0'; + } + + void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) override + { + const ColumnPtr & column = block.safeGetByPosition(arguments[0]).column; + + if (const ColumnUInt64 * col = typeid_cast(column.get())) + { + const ColumnUInt64::Container_t & vec_in = col->getData(); + + std::shared_ptr col_res = std::make_shared(); + block.safeGetByPosition(result).column = col_res; + + ColumnString::Chars_t & vec_res = col_res->getChars(); + ColumnString::Offsets_t & offsets_res = col_res->getOffsets(); + + vec_res.resize(vec_in.size() * 18); /// самое длинное значение: xx:xx:xx:xx:xx:xx\0 + offsets_res.resize(vec_in.size()); + char * begin = reinterpret_cast(&vec_res[0]); + char * pos = begin; + + for (size_t i = 0; i < vec_in.size(); ++i) + { + formatMAC(vec_in[i], pos); + offsets_res[i] = pos - begin; + } + + vec_res.resize(pos - begin); + } + else if (const ColumnConst * col = typeid_cast *>(column.get())) + { + char buf[18]; + char * pos = buf; + formatMAC(col->getData(), pos); + + auto col_res = std::make_shared(col->size(), buf); + block.safeGetByPosition(result).column = col_res; + } + else + throw Exception("Illegal column " + 
block.safeGetByPosition(arguments[0]).column->getName() + + " of argument of function " + getName(), + ErrorCodes::ILLEGAL_COLUMN); + } +}; + + +class FunctionMACStringToNum : public IFunction +{ +public: + static constexpr auto name = "MACStringToNum"; + static FunctionPtr create(const Context & context) { return std::make_shared(); } + + String getName() const override + { + return name; + } + + size_t getNumberOfArguments() const override { return 1; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + if (!typeid_cast(&*arguments[0])) + throw Exception("Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + + return std::make_shared(); + } + + static UInt64 parseMAC(const char * pos) + { + + /// get integer value for a hexademical char digit, or -1 + const auto number_by_char = [] (const char ch) + { + if ('A' <= ch && ch <= 'F') + return 10 + ch - 'A'; + + if ('a' <= ch && ch <= 'f') + return 10 + ch - 'a'; + + if ('0' <= ch && ch <= '9') + return ch - '0'; + + return -1; + }; + + UInt64 res = 0; + for (int offset = 40; offset >= 0; offset -= 8) + { + UInt64 value = 0; + size_t len = 0; + int val = 0; + while ((val = number_by_char(*pos)) >= 0 && len <= 2) + { + value = value * 16 + val; + ++len; + ++pos; + } + if (len == 0 || value > 255 || (offset > 0 && *pos != ':')) + return 0; + res |= value << offset; + ++pos; + } + if (*(pos - 1) != '\0') + return 0; + return res; + } + + void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) override + { + const ColumnPtr & column = block.safeGetByPosition(arguments[0]).column; + + if (const ColumnString * col = typeid_cast(column.get())) + { + auto col_res = std::make_shared(); + block.safeGetByPosition(result).column = col_res; + + ColumnUInt64::Container_t & vec_res = col_res->getData(); + vec_res.resize(col->size()); + + const ColumnString::Chars_t & vec_src = col->getChars(); + const 
ColumnString::Offsets_t & offsets_src = col->getOffsets(); + size_t prev_offset = 0; + + for (size_t i = 0; i < vec_res.size(); ++i) + { + vec_res[i] = parseMAC(reinterpret_cast(&vec_src[prev_offset])); + prev_offset = offsets_src[i]; + } + } + else if (const ColumnConstString * col = typeid_cast(column.get())) + { + auto col_res = std::make_shared>(col->size(), parseMAC(col->getData().c_str())); + block.safeGetByPosition(result).column = col_res; + } + else + throw Exception("Illegal column " + block.safeGetByPosition(arguments[0]).column->getName() + + " of argument of function " + getName(), + ErrorCodes::ILLEGAL_COLUMN); + } +}; + + class FunctionUUIDNumToString : public IFunction { private: diff --git a/dbms/src/Functions/FunctionsConversion.h b/dbms/src/Functions/FunctionsConversion.h index 816a6d2e5ae..b7a8cae759f 100644 --- a/dbms/src/Functions/FunctionsConversion.h +++ b/dbms/src/Functions/FunctionsConversion.h @@ -900,9 +900,9 @@ public: size_t getNumberOfArguments() const override { return 2; } bool isInjective(const Block &) override { return true; } - /** Получить тип результата по типам аргументов и значениям константных аргументов. - * Если функция неприменима для данных аргументов - кинуть исключение. - * Для неконстантных столбцов arguments[i].column = nullptr. + /** Get the result type by argument types and constant argument values. + * If the function does not apply to these arguments, throw an exception. + * For non-constant columns arguments[i].column = nullptr. */ void getReturnTypeAndPrerequisitesImpl(const ColumnsWithTypeAndName & arguments, DataTypePtr & out_return_type, @@ -1089,8 +1089,8 @@ struct ToIntMonotonicity } }; -/** Монотонность для функции toString определяем, в основном, для тестовых целей. - * Всерьёз вряд ли кто-нибудь рассчитывает на оптимизацию запросов с условиями toString(CounterID) = 34. +/** The monotonicity for the `toString` function is mainly determined for test purposes. 
+ * It is doubtful that anyone is looking to optimize queries with conditions `toString(CounterID) = 34`. */ struct ToStringMonotonicity { @@ -1101,7 +1101,7 @@ struct ToStringMonotonicity IFunction::Monotonicity positive(true, true); IFunction::Monotonicity not_monotonic; - /// Функция toString монотонна, если аргумент - Date или DateTime, или неотрицательные числа с одинаковым количеством знаков. + /// `toString` function is monotonous if the argument is Date or DateTime, or non-negative numbers with the same number of symbols. if (typeid_cast(&type) || typeid_cast(&type)) diff --git a/dbms/src/Functions/FunctionsEmbeddedDictionaries.h b/dbms/src/Functions/FunctionsEmbeddedDictionaries.h index 828de8603f5..554f94be304 100644 --- a/dbms/src/Functions/FunctionsEmbeddedDictionaries.h +++ b/dbms/src/Functions/FunctionsEmbeddedDictionaries.h @@ -33,19 +33,19 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; } -/** Функции, использующие словари Яндекс.Метрики - * - словари регионов, операционных систем, поисковых систем. +/** Functions using Yandex.Metrica dictionaries + * - dictionaries of regions, operating systems, search engines. * - * Подняться по дереву до определенного уровня. + * Climb up the tree to a certain level. * regionToCity, regionToArea, regionToCountry, ... * - * Преобразовать значения в столбце + * Convert values of a column * regionToName * - * Является ли первый идентификатор потомком второго. + * Whether the first identifier is a descendant of the second. * regionIn * - * Получить массив идентификаторов регионов, состоящий из исходного и цепочки родителей. Порядок implementation defined. + * Get an array of region identifiers, consisting of the source and the parents chain. Order implementation defined. * regionHierarchy */ @@ -131,9 +131,9 @@ struct SEHierarchyImpl #endif -/** Вспомогательная вещь, позволяющая достать из словаря конкретный словарь, соответствующий точке зрения - * (ключу словаря, передаваемому в аргументе функции). 
- * Пример: при вызове regionToCountry(x, 'ua'), может быть использован словарь, в котором Крым относится к Украине. +/** Auxiliary thing, allowing to get from the dictionary a specific dictionary, corresponding to the point of view + * (the dictionary key passed as function argument). + * Example: when calling regionToCountry(x, 'ua'), a dictionary can be used, in which Crimea refers to Ukraine. */ struct RegionsHierarchyGetter { @@ -146,7 +146,7 @@ struct RegionsHierarchyGetter } }; -/** Для словарей без поддержки ключей. Ничего не делает. +/** For dictionaries without key support. Doing nothing. */ template struct IdentityDictionaryGetter @@ -164,7 +164,7 @@ struct IdentityDictionaryGetter }; -/// Преобразует идентификатор, используя словарь. +/// Converts an identifier using a dictionary. template class FunctionTransformWithDictionary : public IFunction { @@ -213,7 +213,7 @@ public: void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) override { - /// Ключ словаря, определяющий "точку зрения". + /// The dictionary key that defines the "point of view". std::string dict_key; if (arguments.size() == 2) @@ -257,7 +257,7 @@ public: }; -/// Проверяет принадлежность, используя словарь. +/// Checks belonging using a dictionary. template class FunctionIsInWithDictionary : public IFunction { @@ -311,7 +311,7 @@ public: void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) override { - /// Ключ словаря, определяющий "точку зрения". + /// The dictionary key that defines the "point of view". std::string dict_key; if (arguments.size() == 3) @@ -390,7 +390,7 @@ public: }; -/// Получает массив идентификаторов, состоящий из исходного и цепочки родителей. +/// Gets an array of identifiers consisting of the source and the parents chain. 
template class FunctionHierarchyWithDictionary : public IFunction { @@ -439,7 +439,7 @@ public: void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) override { - /// Ключ словаря, определяющий "точку зрения". + /// The dictionary key that defines the "point of view". std::string dict_key; if (arguments.size() == 2) @@ -670,7 +670,7 @@ struct FunctionSEHierarchy : #endif -/// Преобразует числовой идентификатор региона в имя на заданном языке, используя словарь. +/// Converts a region's numeric identifier to a name in the specified language using a dictionary. class FunctionRegionToName : public IFunction { public: @@ -727,7 +727,7 @@ public: { RegionsNames::Language language = RegionsNames::Language::RU; - /// Если указан язык результата + /// If the result language is specified if (arguments.size() == 2) { if (const ColumnConstString * col_language = typeid_cast(block.safeGetByPosition(arguments[1]).column.get())) diff --git a/dbms/src/Functions/FunctionsExternalDictionaries.h b/dbms/src/Functions/FunctionsExternalDictionaries.h index 375f5212911..817466fd75d 100644 --- a/dbms/src/Functions/FunctionsExternalDictionaries.h +++ b/dbms/src/Functions/FunctionsExternalDictionaries.h @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -37,17 +38,17 @@ namespace ErrorCodes extern const int UNKNOWN_TYPE; } -/** Функции, использующие подключаемые (внешние) словари. +/** Functions that use plug-ins (external) dictionaries. * - * Получить значение аттрибута заданного типа. + * Get the value of the attribute of the specified type. * dictGetType(dictionary, attribute, id), - * Type - placeholder для имени типа, в данный момент поддерживаются любые числовые и строковой типы. - * Тип должен соответствовать реальному типу аттрибута, с которым он был объявлен в структуре словаря. + * Type - placeholder for the type name, any numeric and string types are currently supported. 
+ * The type must match the actual attribute type with which it was declared in the dictionary structure. * - * Получить массив идентификаторов, состоящий из исходного и цепочки родителей. + * Get an array of identifiers, consisting of the source and parents chain. * dictGetHierarchy(dictionary, id). * - * Является ли первы йидентификатор потомком второго. + * Is the first identifier the child of the second. * dictIsIn(dictionary, child_id, parent_id). */ @@ -102,7 +103,8 @@ private: !executeDispatchSimple(block, arguments, result, dict_ptr) && !executeDispatchSimple(block, arguments, result, dict_ptr) && !executeDispatchComplex(block, arguments, result, dict_ptr) && - !executeDispatchComplex(block, arguments, result, dict_ptr)) + !executeDispatchComplex(block, arguments, result, dict_ptr) && + !executeDispatchComplex(block, arguments, result, dict_ptr)) throw Exception{ "Unsupported dictionary type " + dict_ptr->getTypeName(), ErrorCodes::UNKNOWN_TYPE}; @@ -155,7 +157,7 @@ private: if (typeid_cast(key_col_with_type.column.get()) || typeid_cast(key_col_with_type.column.get())) { - /// Функции у внешних словарей поддерживают только полноценные (не константные) столбцы с ключами. + /// Functions in external dictionaries only support full-value (not constant) columns with keys. 
const ColumnPtr key_col_materialized = key_col_with_type.column->convertToFullColumnIfConst(); const auto key_columns = ext::map( @@ -285,6 +287,7 @@ private: !executeDispatch(block, arguments, result, dict_ptr) && !executeDispatchComplex(block, arguments, result, dict_ptr) && !executeDispatchComplex(block, arguments, result, dict_ptr) && + !executeDispatchComplex(block, arguments, result, dict_ptr) && !executeDispatchRange(block, arguments, result, dict_ptr)) throw Exception{ "Unsupported dictionary type " + dict_ptr->getTypeName(), @@ -551,7 +554,8 @@ private: !executeDispatch(block, arguments, result, dict_ptr) && !executeDispatch(block, arguments, result, dict_ptr) && !executeDispatchComplex(block, arguments, result, dict_ptr) && - !executeDispatchComplex(block, arguments, result, dict_ptr)) + !executeDispatchComplex(block, arguments, result, dict_ptr) && + !executeDispatchComplex(block, arguments, result, dict_ptr)) throw Exception{ "Unsupported dictionary type " + dict_ptr->getTypeName(), ErrorCodes::UNKNOWN_TYPE}; @@ -844,6 +848,7 @@ private: !executeDispatch(block, arguments, result, dict_ptr) && !executeDispatchComplex(block, arguments, result, dict_ptr) && !executeDispatchComplex(block, arguments, result, dict_ptr) && + !executeDispatchComplex(block, arguments, result, dict_ptr) && !executeDispatchRange(block, arguments, result, dict_ptr)) throw Exception{ "Unsupported dictionary type " + dict_ptr->getTypeName(), @@ -1153,7 +1158,8 @@ private: !executeDispatch(block, arguments, result, dict_ptr) && !executeDispatch(block, arguments, result, dict_ptr) && !executeDispatchComplex(block, arguments, result, dict_ptr) && - !executeDispatchComplex(block, arguments, result, dict_ptr)) + !executeDispatchComplex(block, arguments, result, dict_ptr) && + !executeDispatchComplex(block, arguments, result, dict_ptr)) throw Exception{ "Unsupported dictionary type " + dict_ptr->getTypeName(), ErrorCodes::UNKNOWN_TYPE}; diff --git a/dbms/src/Functions/FunctionsFormatting.h 
b/dbms/src/Functions/FunctionsFormatting.h index 288c453f464..25cb80f6ab1 100644 --- a/dbms/src/Functions/FunctionsFormatting.h +++ b/dbms/src/Functions/FunctionsFormatting.h @@ -13,12 +13,12 @@ namespace DB { -/** Функция для необычного преобразования в строку: +/** Function for an unusual conversion to a string: * - * bitmaskToList - принимает целое число - битовую маску, возвращает строку из степеней двойки через запятую. - * например, bitmaskToList(50) = '2,16,32' + * bitmaskToList - takes an integer - a bitmask, returns a string of degrees of 2 separated by a comma. + * for example, bitmaskToList(50) = '2,16,32' * - * formatReadableSize - выводит переданный размер в байтах в виде 123.45 GiB. + * formatReadableSize - prints the transferred size in bytes in form `123.45 GiB`. */ class FunctionBitmaskToList : public IFunction diff --git a/dbms/src/Functions/FunctionsMath.h b/dbms/src/Functions/FunctionsMath.h index ed808067b53..115790efe04 100644 --- a/dbms/src/Functions/FunctionsMath.h +++ b/dbms/src/Functions/FunctionsMath.h @@ -478,7 +478,7 @@ struct BinaryFunctionVectorized struct EImpl { static constexpr auto name = "e"; - static const double value; /// См. .cpp + static const double value; /// See .cpp }; struct PiImpl diff --git a/dbms/src/Functions/FunctionsMiscellaneous.h b/dbms/src/Functions/FunctionsMiscellaneous.h index 8bf61862206..951b3d3663b 100644 --- a/dbms/src/Functions/FunctionsMiscellaneous.h +++ b/dbms/src/Functions/FunctionsMiscellaneous.h @@ -49,8 +49,8 @@ public: }; -/** Создаёт массив, размножая столбец (первый аргумент) по количеству элементов в массиве (втором аргументе). - * Используется только в качестве prerequisites для функций высшего порядка. +/** Creates an array, multiplying the column (the first argument) by the number of elements in the array (the second argument). + * Used only as prerequisites for higher-order functions. 
*/ class FunctionReplicate : public IFunction { diff --git a/dbms/src/Functions/FunctionsReinterpret.h b/dbms/src/Functions/FunctionsReinterpret.h index 5e7f72075b9..1ecc5beacbd 100644 --- a/dbms/src/Functions/FunctionsReinterpret.h +++ b/dbms/src/Functions/FunctionsReinterpret.h @@ -15,7 +15,7 @@ namespace DB { -/** Функции преобразования чисел и дат в строки, содержащие тот же набор байт в машинном представлении, и обратно. +/** Functions for transforming numbers and dates to strings that contain the same set of bytes in the machine representation, and vice versa. */ diff --git a/dbms/src/Functions/FunctionsString.h b/dbms/src/Functions/FunctionsString.h index 420829ffb4b..617832cba3f 100644 --- a/dbms/src/Functions/FunctionsString.h +++ b/dbms/src/Functions/FunctionsString.h @@ -20,10 +20,10 @@ namespace DB * lengthUTF8, substringUTF8, lowerUTF8, upperUTF8, reverseUTF8 * * s -> UInt8: empty, notEmpty - * s -> UInt64: length, lengthUTF8 - * s -> s: lower, upper, lowerUTF8, upperUTF8, reverse, reverseUTF8 - * s, s -> s: concat - * s, c1, c2 -> s: substring, substringUTF8 + * s -> UInt64: length, lengthUTF8 + * s -> s: lower, upper, lowerUTF8, upperUTF8, reverse, reverseUTF8 + * s, s -> s: concat + * s, c1, c2 -> s: substring, substringUTF8 * s, c1, c2, s2 -> s: replace, replaceUTF8 * * Функции поиска строк и регулярных выражений расположены отдельно. @@ -76,7 +76,7 @@ inline void UTF8CyrillicToCase(const UInt8 *& src, const UInt8 * const src_end, } else if (src[0] == 0xD0u && (src[1] >= 0xA0u && src[1] <= 0xAFu)) { - /// Р-Я + /// Р-Я *dst++ = xor_or_identity(*src++, 0x1); *dst++ = xor_or_identity(*src++, 0x20); } diff --git a/dbms/src/Functions/Regexps.h b/dbms/src/Functions/Regexps.h index 117930aebbd..c1f774e798d 100644 --- a/dbms/src/Functions/Regexps.h +++ b/dbms/src/Functions/Regexps.h @@ -29,7 +29,7 @@ namespace Regexps inline Pool::Pointer get(const std::string & pattern) { /// C++11 has thread-safe function-local statics on most modern compilers. 
- static Pool known_regexps; /// Разные переменные для разных параметров шаблона. + static Pool known_regexps; /// Different variables for different pattern parameters. return known_regexps.get(pattern, [&pattern] { diff --git a/dbms/src/Functions/likePatternToRegexp.h b/dbms/src/Functions/likePatternToRegexp.h index 9d769eb6fe4..3a078b468c2 100644 --- a/dbms/src/Functions/likePatternToRegexp.h +++ b/dbms/src/Functions/likePatternToRegexp.h @@ -4,7 +4,7 @@ namespace DB { -/// Переводит выражение LIKE в regexp re2. Например, abc%def -> ^abc.*def$ +/// Transforms the LIKE expression into regexp re2. For example, abc%def -> ^abc.*def$ inline String likePatternToRegexp(const String & pattern) { String res; diff --git a/dbms/src/IO/CompressedWriteBuffer.cpp b/dbms/src/IO/CompressedWriteBuffer.cpp index ba6fbe8307a..eb00b400196 100644 --- a/dbms/src/IO/CompressedWriteBuffer.cpp +++ b/dbms/src/IO/CompressedWriteBuffer.cpp @@ -70,15 +70,18 @@ void CompressedWriteBuffer::nextImpl() compressed_buffer[0] = static_cast(CompressionMethodByte::LZ4); if (method == CompressionMethod::LZ4) - compressed_size = header_size + LZ4_compress( + compressed_size = header_size + LZ4_compress_default( working_buffer.begin(), &compressed_buffer[header_size], - uncompressed_size); + uncompressed_size, + LZ4_COMPRESSBOUND(uncompressed_size)); else - compressed_size = header_size + LZ4_compressHC( + compressed_size = header_size + LZ4_compress_HC( working_buffer.begin(), &compressed_buffer[header_size], - uncompressed_size); + uncompressed_size, + LZ4_COMPRESSBOUND(uncompressed_size), + 0); UInt32 compressed_size_32 = compressed_size; UInt32 uncompressed_size_32 = uncompressed_size; diff --git a/dbms/src/IO/MySQLxxHelpers.h b/dbms/src/IO/MySQLxxHelpers.h index 4b54064faff..4441392fec3 100644 --- a/dbms/src/IO/MySQLxxHelpers.h +++ b/dbms/src/IO/MySQLxxHelpers.h @@ -3,12 +3,11 @@ #include #include #include - +#include #include #include #include - /// This is for Yandex.Metrica code. 
namespace mysqlxx diff --git a/dbms/src/IO/WriteHelpers.h b/dbms/src/IO/WriteHelpers.h index 46084d622eb..f94d0600ab4 100644 --- a/dbms/src/IO/WriteHelpers.h +++ b/dbms/src/IO/WriteHelpers.h @@ -368,7 +368,7 @@ inline void writeBackQuotedString(const String & s, WriteBuffer & buf) /// То же самое, но обратные кавычки применяются только при наличии символов, не подходящих для идентификатора без обратных кавычек. inline void writeProbablyBackQuotedString(const String & s, WriteBuffer & buf) { - if (s.empty() || !isWordCharASCII(s[0])) + if (s.empty() || !isValidIdentifierBegin(s[0])) writeBackQuotedString(s, buf); else { @@ -583,6 +583,7 @@ inline typename std::enable_if::value, void>::type writeBinary(const T & x, WriteBuffer & buf) { writePODBinary(x, buf); } inline void writeBinary(const String & x, WriteBuffer & buf) { writeStringBinary(x, buf); } +inline void writeBinary(const StringRef & x, WriteBuffer & buf) { writeStringBinary(x, buf); } inline void writeBinary(const uint128 & x, WriteBuffer & buf) { writePODBinary(x, buf); } inline void writeBinary(const LocalDate & x, WriteBuffer & buf) { writePODBinary(x, buf); } inline void writeBinary(const LocalDateTime & x, WriteBuffer & buf) { writePODBinary(x, buf); } diff --git a/dbms/src/Interpreters/Context.cpp b/dbms/src/Interpreters/Context.cpp index b57639e9bf6..54550f94003 100644 --- a/dbms/src/Interpreters/Context.cpp +++ b/dbms/src/Interpreters/Context.cpp @@ -104,7 +104,6 @@ struct ContextShared String flags_path; /// Databases databases; /// List of databases and tables in them. TableFunctionFactory table_function_factory; /// Table functions. - AggregateFunctionFactory aggregate_function_factory; /// Aggregate functions. FormatFactory format_factory; /// Formats. mutable std::shared_ptr embedded_dictionaries; /// Metrica's dictionaeis. Have lazy initialization. 
mutable std::shared_ptr external_dictionaries; @@ -211,7 +210,6 @@ Context::~Context() = default; const TableFunctionFactory & Context::getTableFunctionFactory() const { return shared->table_function_factory; } -const AggregateFunctionFactory & Context::getAggregateFunctionFactory() const { return shared->aggregate_function_factory; } InterserverIOHandler & Context::getInterserverIOHandler() { return shared->interserver_io_handler; } std::unique_lock Context::getLock() const diff --git a/dbms/src/Interpreters/Context.h b/dbms/src/Interpreters/Context.h index fa0d1b7b8b1..579f7c31b50 100644 --- a/dbms/src/Interpreters/Context.h +++ b/dbms/src/Interpreters/Context.h @@ -187,7 +187,6 @@ public: void setSetting(const String & name, const std::string & value); const TableFunctionFactory & getTableFunctionFactory() const; - const AggregateFunctionFactory & getAggregateFunctionFactory() const; const EmbeddedDictionaries & getEmbeddedDictionaries() const; const ExternalDictionaries & getExternalDictionaries() const; void tryCreateEmbeddedDictionaries() const; diff --git a/dbms/src/Interpreters/DictionaryFactory.cpp b/dbms/src/Interpreters/DictionaryFactory.cpp index 2d1f7d34bd9..a160c763971 100644 --- a/dbms/src/Interpreters/DictionaryFactory.cpp +++ b/dbms/src/Interpreters/DictionaryFactory.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -81,6 +82,15 @@ DictionaryPtr DictionaryFactory::create(const std::string & name, Poco::Util::Ab return std::make_unique(name, dict_struct, std::move(source_ptr), dict_lifetime, size); } + else if ("ip_trie" == layout_type) + { + if (!dict_struct.key) + throw Exception{"'key' is required for dictionary of layout 'ip_trie'", + ErrorCodes::BAD_ARGUMENTS}; + + // This is specialised trie for storing IPv4 and IPv6 prefixes. 
+ return std::make_unique(name, dict_struct, std::move(source_ptr), dict_lifetime, require_nonempty); + } else { if (dict_struct.key) diff --git a/dbms/src/Interpreters/ExpressionAnalyzer.cpp b/dbms/src/Interpreters/ExpressionAnalyzer.cpp index 9bf69cf0541..dd59e96b4a4 100644 --- a/dbms/src/Interpreters/ExpressionAnalyzer.cpp +++ b/dbms/src/Interpreters/ExpressionAnalyzer.cpp @@ -883,7 +883,7 @@ void ExpressionAnalyzer::normalizeTreeImpl( { node->kind = ASTFunction::LAMBDA_EXPRESSION; } - else if (context.getAggregateFunctionFactory().isAggregateFunctionName(node->name)) + else if (AggregateFunctionFactory::instance().isAggregateFunctionName(node->name)) { node->kind = ASTFunction::AGGREGATE_FUNCTION; } @@ -2077,7 +2077,7 @@ void ExpressionAnalyzer::getAggregates(const ASTPtr & ast, ExpressionActionsPtr aggregate.argument_names[i] = name; } - aggregate.function = context.getAggregateFunctionFactory().get(node->name, types); + aggregate.function = AggregateFunctionFactory::instance().get(node->name, types); if (node->parameters) { diff --git a/dbms/src/Interpreters/InterpreterInsertQuery.cpp b/dbms/src/Interpreters/InterpreterInsertQuery.cpp index 295cd3b57b8..5f943f94a36 100644 --- a/dbms/src/Interpreters/InterpreterInsertQuery.cpp +++ b/dbms/src/Interpreters/InterpreterInsertQuery.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -121,7 +122,9 @@ BlockIO InterpreterInsertQuery::execute() res.in_sample = interpreter_select.getSampleBlock(); res.in = interpreter_select.execute().in; + res.in = std::make_shared(res.in, res.in_sample, res.out_sample); + res.in = std::make_shared(context, res.in, res.in_sample, res.out_sample); res.in = std::make_shared(res.in, out); } diff --git a/dbms/src/Interpreters/Join.cpp b/dbms/src/Interpreters/Join.cpp index afa653b6260..a179360e7c0 100644 --- a/dbms/src/Interpreters/Join.cpp +++ b/dbms/src/Interpreters/Join.cpp @@ -968,7 +968,7 @@ struct AdderNonJoined class NonJoinedBlockInputStream : public 
IProfilingBlockInputStream { public: - NonJoinedBlockInputStream(const Join & parent_, Block & left_sample_block, size_t max_block_size_) + NonJoinedBlockInputStream(const Join & parent_, const Block & left_sample_block, size_t max_block_size_) : parent(parent_), max_block_size(max_block_size_) { /** left_sample_block contains keys and "left" columns. @@ -981,39 +981,43 @@ public: result_sample_block = left_sample_block; -// std::cerr << result_sample_block.dumpStructure() << "\n"; - - /// Add new columns to the block. + /// Add columns from the right-side table to the block. for (size_t i = 0; i < num_columns_right; ++i) { - const ColumnWithTypeAndName & src_column = parent.sample_block_with_columns_to_add.safeGetByPosition(i); - ColumnWithTypeAndName new_column = src_column.cloneEmpty(); - result_sample_block.insert(std::move(new_column)); + const ColumnWithTypeAndName & src_column = parent.sample_block_with_columns_to_add.getByPosition(i); + result_sample_block.insert(src_column.cloneEmpty()); } - column_numbers_left.reserve(num_columns_left); - column_numbers_keys_and_right.reserve(num_keys + num_columns_right); + column_indices_left.reserve(num_columns_left); + column_indices_keys_and_right.reserve(num_keys + num_columns_right); + std::vector is_key_column_in_left_block(num_keys + num_columns_left, false); + + for (const std::string & key : parent.key_names_left) + { + size_t key_pos = left_sample_block.getPositionByName(key); + is_key_column_in_left_block[key_pos] = true; + /// Here we establish the mapping between key columns of the left- and right-side tables. + /// key_pos index is inserted in the position corresponding to key column in parent.blocks + /// (saved blocks of the right-side table) and points to the same key column + /// in the left_sample_block and thus in the result_sample_block. 
+ column_indices_keys_and_right.push_back(key_pos); + } for (size_t i = 0; i < num_keys + num_columns_left; ++i) { - const String & name = left_sample_block.safeGetByPosition(i).name; - - auto found_key_column = std::find(parent.key_names_left.begin(), parent.key_names_left.end(), name); - if (parent.key_names_left.end() == found_key_column) - column_numbers_left.push_back(i); - else - column_numbers_keys_and_right.push_back(found_key_column - parent.key_names_left.begin()); + if (!is_key_column_in_left_block[i]) + column_indices_left.push_back(i); } for (size_t i = 0; i < num_columns_right; ++i) - column_numbers_keys_and_right.push_back(num_keys + num_columns_left + i); + column_indices_keys_and_right.push_back(num_keys + num_columns_left + i); /// If use_nulls, convert left columns to Nullable. if (parent.use_nulls) { for (size_t i = 0; i < num_columns_left; ++i) { - convertColumnToNullable(result_sample_block.getByPosition(column_numbers_left[i])); + convertColumnToNullable(result_sample_block.getByPosition(column_indices_left[i])); } } @@ -1050,9 +1054,14 @@ private: size_t max_block_size; Block result_sample_block; - ColumnNumbers column_numbers_left; - ColumnNumbers column_numbers_keys_and_right; + /// Indices of columns in result_sample_block that come from the left-side table (except key columns). + ColumnNumbers column_indices_left; + /// Indices of key columns in result_sample_block or columns that come from the right-side table. + /// Order is significant: it is the same as the order of columns in the blocks of the right-side table that are saved in parent.blocks. + ColumnNumbers column_indices_keys_and_right; + /// Columns of the current output block corresponding to column_indices_left. ColumnPlainPtrs columns_left; + /// Columns of the current output block corresponding to column_indices_keys_and_right. 
ColumnPlainPtrs columns_keys_and_right; std::unique_ptr> position; /// type erasure @@ -1063,19 +1072,19 @@ private: { Block block = result_sample_block.cloneEmpty(); - size_t num_columns_left = column_numbers_left.size(); - size_t num_columns_right = column_numbers_keys_and_right.size(); + size_t num_columns_left = column_indices_left.size(); + size_t num_columns_right = column_indices_keys_and_right.size(); for (size_t i = 0; i < num_columns_left; ++i) { - auto & column_with_type_and_name = block.safeGetByPosition(column_numbers_left[i]); + auto & column_with_type_and_name = block.safeGetByPosition(column_indices_left[i]); column_with_type_and_name.column = column_with_type_and_name.type->createColumn(); columns_left[i] = column_with_type_and_name.column.get(); } for (size_t i = 0; i < num_columns_right; ++i) { - auto & column_with_type_and_name = block.safeGetByPosition(column_numbers_keys_and_right[i]); + auto & column_with_type_and_name = block.safeGetByPosition(column_indices_keys_and_right[i]); column_with_type_and_name.column = column_with_type_and_name.type->createColumn(); columns_keys_and_right[i] = column_with_type_and_name.column.get(); columns_keys_and_right[i]->reserve(column_with_type_and_name.column->size()); diff --git a/dbms/src/Interpreters/Join.h b/dbms/src/Interpreters/Join.h index 75979f1ecd8..a63a4b37acf 100644 --- a/dbms/src/Interpreters/Join.h +++ b/dbms/src/Interpreters/Join.h @@ -360,9 +360,9 @@ private: ASTTableJoin::Kind kind; ASTTableJoin::Strictness strictness; - /// Names of key columns (columns for equi-JOIN) in "left" table. + /// Names of key columns (columns for equi-JOIN) in "left" table (in the order they appear in USING clause). const Names key_names_left; - /// Names of key columns (columns for equi-JOIN) in "right" table. + /// Names of key columns (columns for equi-JOIN) in "right" table (in the order they appear in USING clause). const Names key_names_right; /// Substitute NULLs for non-JOINed rows. 
@@ -387,7 +387,9 @@ private: Sizes key_sizes; + /// Block with columns from the right-side table except key columns. Block sample_block_with_columns_to_add; + /// Block with key columns in the same order they appear in the right-side table. Block sample_block_with_keys; Poco::Logger * log; diff --git a/dbms/src/Interpreters/Limits.h b/dbms/src/Interpreters/Limits.h index c8192f587da..994bf213cdb 100644 --- a/dbms/src/Interpreters/Limits.h +++ b/dbms/src/Interpreters/Limits.h @@ -9,22 +9,22 @@ namespace DB { -/** Ограничения при выполнении запроса - часть настроек. - * Используются, чтобы обеспечить более безопасное исполнение запросов из пользовательского интерфейса. - * В основном, ограничения проверяются на каждый блок (а не на каждую строку). То есть, ограничения могут быть немного нарушены. - * Почти все ограничения действуют только на SELECT-ы. - * Почти все ограничения действуют на каждый поток по отдельности. +/** Limits during query execution are part of the settings. + * Used to provide a more safe execution of queries from the user interface. + * Basically, constraints are checked for each block (not every row). That is, the limits can be slightly violated. + * Almost all limits apply only to SELECTs. + * Almost all limits apply to each thread individually. */ struct Limits { - /** Перечисление ограничений: тип, имя, значение по-умолчанию. - * По-умолчанию: всё не ограничено, кроме довольно слабых ограничений на глубину рекурсии и размер выражений. + /** Enumeration of limits: type, name, default value. + * By default: everything is unlimited, except for rather weak restrictions on the depth of recursion and the size of the expressions. */ #define APPLY_FOR_LIMITS(M) \ - /** Ограничения на чтение из самых "глубоких" источников. \ - * То есть, только в самом глубоком подзапросе. \ - * При чтении с удалённого сервера, проверяется только на удалённом сервере. \ + /** Limits on reading from the most "deep" sources. \ + * That is, only in the deepest subquery. 
\ + * When reading from a remote server, it is only checked on a remote server. \ */ \ M(SettingUInt64, max_rows_to_read, 0) \ M(SettingUInt64, max_bytes_to_read, 0) \ @@ -39,20 +39,20 @@ struct Limits M(SettingOverflowMode, sort_overflow_mode, OverflowMode::THROW) \ M(SettingUInt64, max_bytes_before_external_sort, 0) \ \ - /** Ограничение на размер результата. \ - * Проверяются также для подзапросов и на удалённых серверах. \ + /** Limits on result size. \ + * Are also checked for subqueries and on remote servers. \ */ \ M(SettingUInt64, max_result_rows, 0) \ M(SettingUInt64, max_result_bytes, 0) \ M(SettingOverflowMode, result_overflow_mode, OverflowMode::THROW) \ \ - /* TODO: Проверять также при слиянии и финализации агрегатных функций. */ \ + /* TODO: Check also when merging and finalizing aggregate functions. */ \ M(SettingSeconds, max_execution_time, 0) \ M(SettingOverflowMode, timeout_overflow_mode, OverflowMode::THROW) \ \ - /** В строчках в секунду. */ \ + /** In rows per second. */ \ M(SettingUInt64, min_execution_speed, 0) \ - /** Проверять, что скорость не слишком низкая, после прошествия указанного времени. */ \ + /** Check that the speed is not too low after the specified time has elapsed. */ \ M(SettingSeconds, timeout_before_checking_execution_speed, 0) \ \ M(SettingUInt64, max_columns_to_read, 0) \ @@ -61,42 +61,42 @@ struct Limits \ M(SettingUInt64, max_subquery_depth, 100) \ M(SettingUInt64, max_pipeline_depth, 1000) \ - M(SettingUInt64, max_ast_depth, 1000) /** Проверяются не во время парсинга, */ \ - M(SettingUInt64, max_ast_elements, 50000) /** а уже после парсинга запроса. */ \ + M(SettingUInt64, max_ast_depth, 1000) /** Checked not during parsing, */ \ + M(SettingUInt64, max_ast_elements, 50000) /** but after parsing the request. */ \ \ - /** 0 - можно всё. 1 - только запросы на чтение. 2 - только запросы на чтение, а также изменение настроек, кроме настройки readonly. */ \ + /** 0 - everything is allowed. 1 - only read requests. 
2 - only read requests, as well as changing settings, except for the readonly setting. */ \ M(SettingUInt64, readonly, 0) \ \ - /** Ограничения для максимального размера множества, получающегося при выполнении секции IN. */ \ + /** Limits for the maximum size of the set resulting from the execution of the IN section. */ \ M(SettingUInt64, max_rows_in_set, 0) \ M(SettingUInt64, max_bytes_in_set, 0) \ M(SettingOverflowMode, set_overflow_mode, OverflowMode::THROW) \ \ - /** Ограничения для максимального размера множества, получающегося при выполнении секции IN. */ \ + /** Limits for the maximum size of the set obtained by executing the IN section. */ \ M(SettingUInt64, max_rows_in_join, 0) \ M(SettingUInt64, max_bytes_in_join, 0) \ M(SettingOverflowMode, join_overflow_mode, OverflowMode::THROW) \ \ - /** Ограничения для максимального размера передаваемой внешней таблицы, получающейся при выполнении секции GLOBAL IN/JOIN. */ \ + /** Limits for the maximum size of the transmitted external table obtained when the GLOBAL IN/JOIN section is executed. */ \ M(SettingUInt64, max_rows_to_transfer, 0) \ M(SettingUInt64, max_bytes_to_transfer, 0) \ M(SettingOverflowMode, transfer_overflow_mode, OverflowMode::THROW) \ \ - /** Ограничения для максимального размера запоминаемого состояния при выполнении DISTINCT. */ \ + /** Limits for the maximum size of the stored state when executing DISTINCT. */ \ M(SettingUInt64, max_rows_in_distinct, 0) \ M(SettingUInt64, max_bytes_in_distinct, 0) \ M(SettingOverflowMode, distinct_overflow_mode, OverflowMode::THROW) \ \ - /** Максимальное использование памяти при обработке запроса. 0 - не ограничено. */ \ - M(SettingUInt64, max_memory_usage, 0) /* На один запрос */ \ - /* Суммарно на одновременно выполняющиеся запросы одного пользователя */ \ + /** Maximum memory usage when processing a request. 0 - not bounded. 
*/ \ + M(SettingUInt64, max_memory_usage, 0) /* For one query */ \ + /* Totally for concurrently running queries of one user */ \ M(SettingUInt64, max_memory_usage_for_user, 0) \ - /* Суммарно на все одновременно выполняющиеся запросы */ \ + /* Totally for all concurrent queries */ \ M(SettingUInt64, max_memory_usage_for_all_queries, 0) \ \ - /** Максимальная скорость обмена данными по сети в байтах в секунду. 0 - не ограничена. */ \ + /** The maximum speed of data exchange over the network in bytes per second. 0 - not bounded. */ \ M(SettingUInt64, max_network_bandwidth, 0) \ - /** Максимальное количество байт на приём или передачу по сети, в рамках запроса. */ \ + /** The maximum number of bytes to receive or transmit over the network, as part of the query. */ \ M(SettingUInt64, max_network_bytes, 0) \ #define DECLARE(TYPE, NAME, DEFAULT) \ @@ -106,7 +106,7 @@ struct Limits #undef DECLARE - /// Установить настройку по имени. + /// Set setting by name. bool trySet(const String & name, const Field & value) { #define TRY_SET(TYPE, NAME, DEFAULT) \ @@ -122,7 +122,7 @@ struct Limits #undef TRY_SET } - /// Установить настройку по имени. Прочитать сериализованное в бинарном виде значение из буфера (для межсерверного взаимодействия). + /// Set the setting by name. Read the binary serialized value from the buffer (for server-to-server interaction). bool trySet(const String & name, ReadBuffer & buf) { #define TRY_SET(TYPE, NAME, DEFAULT) \ @@ -138,7 +138,7 @@ struct Limits #undef TRY_SET } - /// Пропустить сериализованное в бинарном виде значение из буфера. + /// Skip the binary-serialized value from the buffer. bool tryIgnore(const String & name, ReadBuffer & buf) { #define TRY_IGNORE(TYPE, NAME, DEFAULT) \ @@ -154,7 +154,7 @@ struct Limits #undef TRY_IGNORE } - /** Установить настройку по имени. Прочитать значение в текстовом виде из строки (например, из конфига, или из параметра URL). + /** Set the setting by name. 
Read the value in text form from a string (for example, from a config, or from a URL parameter). */ bool trySet(const String & name, const String & value) { @@ -174,7 +174,7 @@ struct Limits private: friend struct Settings; - /// Записать все настройки в буфер. (В отличие от соответствующего метода в Settings, пустая строка на конце не пишется). + /// Write all the settings to the buffer. (Unlike the corresponding method in Settings, the empty line on the end is not written). void serialize(WriteBuffer & buf) const { #define WRITE(TYPE, NAME, DEFAULT) \ diff --git a/dbms/src/Interpreters/Settings.h b/dbms/src/Interpreters/Settings.h index 4d696ddfec7..5653fe484a4 100644 --- a/dbms/src/Interpreters/Settings.h +++ b/dbms/src/Interpreters/Settings.h @@ -28,52 +28,52 @@ struct Settings */ #define APPLY_FOR_SETTINGS(M) \ - /** При записи данных, для сжатия выделяется буфер размером max_compress_block_size. При переполнении буфера или если в буфер */ \ - /** записано данных больше или равно, чем min_compress_block_size, то при очередной засечке, данные так же будут сжиматься */ \ - /** В результате, для маленьких столбцов (числа 1-8 байт), при index_granularity = 8192, размер блока будет 64 KБ. */ \ - /** А для больших столбцов (Title - строка ~100 байт), размер блока будет ~819 КБ. */ \ - /** За счёт этого, коэффициент сжатия почти не ухудшится. */ \ + /** When writing data, a buffer of max_compress_block_size size is allocated for compression. When the buffer overflows or if into the buffer */ \ + /** written data is greater than or equal to min_compress_block_size, then with the next mark, the data will also be compressed */ \ + /** As a result, for small columns (around 1-8 bytes), with index_granularity = 8192, the block size will be 64 KB. */ \ + /** And for large columns (Title - string ~100 bytes), the block size will be ~819 KB. */ \ + /** Due to this, the compression ratio almost does not get worse. 
*/ \ M(SettingUInt64, min_compress_block_size, DEFAULT_MIN_COMPRESS_BLOCK_SIZE) \ M(SettingUInt64, max_compress_block_size, DEFAULT_MAX_COMPRESS_BLOCK_SIZE) \ - /** Максимальный размер блока для чтения */ \ + /** Maximum block size for reading */ \ M(SettingUInt64, max_block_size, DEFAULT_BLOCK_SIZE) \ - /** Максимальный размер блока для вставки, если мы управляем формированием блоков для вставки. */ \ + /** The maximum block size for insertion, if we control the creation of blocks for insertion. */ \ M(SettingUInt64, max_insert_block_size, DEFAULT_INSERT_BLOCK_SIZE) \ /** Squash blocks passed to INSERT query to specified size in rows, if blocks are not big enough. */ \ M(SettingUInt64, min_insert_block_size_rows, DEFAULT_INSERT_BLOCK_SIZE) \ /** Squash blocks passed to INSERT query to specified size in bytes, if blocks are not big enough. */ \ M(SettingUInt64, min_insert_block_size_bytes, (DEFAULT_INSERT_BLOCK_SIZE * 256)) \ - /** Максимальное количество потоков выполнения запроса. По-умолчанию - определять автоматически. */ \ + /** The maximum number of threads to execute the request. By default, it is determined automatically. */ \ M(SettingMaxThreads, max_threads, 0) \ - /** Максимальный размер буфера для чтения из файловой системы. */ \ + /** The maximum size of the buffer to read from the file system. */ \ M(SettingUInt64, max_read_buffer_size, DBMS_DEFAULT_BUFFER_SIZE) \ - /** Максимальное количество соединений при распределённой обработке одного запроса (должно быть больше, чем max_threads). */ \ + /** The maximum number of connections for distributed processing of one query (should be greater than max_threads). 
*/ \ M(SettingUInt64, max_distributed_connections, DEFAULT_MAX_DISTRIBUTED_CONNECTIONS) \ - /** Какую часть запроса можно прочитать в оперативку для парсинга (оставшиеся данные для INSERT, если есть, считываются позже) */ \ + /** Which part of the query can be read into RAM for parsing (the remaining data for INSERT, if any, is read later) */ \ M(SettingUInt64, max_query_size, DEFAULT_MAX_QUERY_SIZE) \ - /** Интервал в микросекундах для проверки, не запрошена ли остановка выполнения запроса, и отправки прогресса. */ \ + /** The interval in microseconds to check if the request is cancelled, and to send progress info. */ \ M(SettingUInt64, interactive_delay, DEFAULT_INTERACTIVE_DELAY) \ M(SettingSeconds, connect_timeout, DBMS_DEFAULT_CONNECT_TIMEOUT_SEC) \ - /** Если следует выбрать одну из рабочих реплик. */ \ + /** If you should select one of the working replicas. */ \ M(SettingMilliseconds, connect_timeout_with_failover_ms, DBMS_DEFAULT_CONNECT_TIMEOUT_WITH_FAILOVER_MS) \ M(SettingSeconds, receive_timeout, DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC) \ M(SettingSeconds, send_timeout, DBMS_DEFAULT_SEND_TIMEOUT_SEC) \ - /** Время ожидания в очереди запросов, если количество одновременно выполняющихся запросов превышает максимальное. */ \ + /** The wait time in the request queue, if the number of concurrent requests exceeds the maximum. */ \ M(SettingMilliseconds, queue_max_wait_ms, DEFAULT_QUERIES_QUEUE_WAIT_TIME_MS) \ - /** Блокироваться в цикле ожидания запроса в сервере на указанное количество секунд. */ \ + /** Block at the query wait cycle on the server for the specified number of seconds. */ \ M(SettingUInt64, poll_interval, DBMS_DEFAULT_POLL_INTERVAL) \ - /** Максимальное количество соединений с одним удалённым сервером в пуле. */ \ + /** Maximum number of connections with one remote server in the pool. */ \ M(SettingUInt64, distributed_connections_pool_size, DBMS_DEFAULT_DISTRIBUTED_CONNECTIONS_POOL_SIZE) \ - /** Максимальное количество попыток соединения с репликами. 
*/ \ + /** The maximum number of attempts to connect to replicas. */ \ M(SettingUInt64, connections_with_failover_max_tries, DBMS_CONNECTION_POOL_WITH_FAILOVER_DEFAULT_MAX_TRIES) \ - /** Считать минимумы и максимумы столбцов результата. Они могут выводиться в JSON-форматах. */ \ + /** Calculate minimums and maximums of the result columns. They can be output in JSON-formats. */ \ M(SettingBool, extremes, false) \ - /** Использовать ли кэш разжатых блоков. */ \ + /** Whether to use the cache of uncompressed blocks. */ \ M(SettingBool, use_uncompressed_cache, true) \ - /** Следует ли отменять выполняющийся запрос с таким же id, как новый. */ \ + /** Whether the running request should be canceled with the same id as the new one. */ \ M(SettingBool, replace_running_query, false) \ - /** Количество потоков, выполняющих фоновую работу для таблиц (например, слияние в merge tree). \ - * TODO: Сейчас применяется только при запуске сервера. Можно сделать изменяемым динамически. */ \ + /** Number of threads performing background work for tables (for example, merging in merge tree). \ + * TODO: Now only applies when the server is started. You can make it dynamically variable. */ \ M(SettingUInt64, background_pool_size, DBMS_DEFAULT_BACKGROUND_POOL_SIZE) \ \ /** Sleep time for StorageDistributed DirectoryMonitors in case there is no work or exception has been thrown */ \ @@ -82,9 +82,9 @@ struct Settings /** Allows disabling WHERE to PREWHERE optimization in SELECT queries from MergeTree */ \ M(SettingBool, optimize_move_to_prewhere, true) \ \ - /** Ожидать выполнения действий по манипуляции с партициями. 0 - не ждать, 1 - ждать выполнения только у себя, 2 - ждать всех. */ \ + /** Wait for actions to manipulate the partitions. 0 - do not wait, 1 - wait for execution only of itself, 2 - wait for everyone. */ \ M(SettingUInt64, replication_alter_partitions_sync, 1) \ - /** Ожидать выполнения действий по изменению структуры таблицы в течение указанного количества секунд. 
0 - ждать неограниченное время. */ \ + /** Wait for actions to change the table structure within the specified number of seconds. 0 - wait unlimited time. */ \ M(SettingUInt64, replication_alter_columns_timeout, 60) \ \ M(SettingLoadBalancing, load_balancing, LoadBalancing::RANDOM) \ @@ -92,78 +92,78 @@ struct Settings M(SettingTotalsMode, totals_mode, TotalsMode::AFTER_HAVING_EXCLUSIVE) \ M(SettingFloat, totals_auto_threshold, 0.5) \ \ - /** Включена ли компиляция запросов. */ \ + /** Whether query compilation is enabled. */ \ M(SettingBool, compile, false) \ - /** Количество одинаковых по структуре запросов перед тем, как инициируется их компиляция. */ \ + /** The number of structurally identical queries before they are compiled. */ \ M(SettingUInt64, min_count_to_compile, 3) \ - /** При каком количестве ключей, начинает использоваться двухуровневая агрегация. 0 - порог не выставлен. */ \ + /** From what number of keys, a two-level aggregation starts. 0 - the threshold is not set. */ \ M(SettingUInt64, group_by_two_level_threshold, 100000) \ - /** При каком размере состояния агрегации в байтах, начинает использоваться двухуровневая агрегация. 0 - порог не выставлен. \ - * Двухуровневая агрегация начинает использоваться при срабатывании хотя бы одного из порогов. */ \ + /** From what size of the aggregation state in bytes, a two-level aggregation begins to be used. 0 - the threshold is not set. \ + * Two-level aggregation is used when at least one of the thresholds is triggered. */ \ M(SettingUInt64, group_by_two_level_threshold_bytes, 100000000) \ - /** Включён ли экономный по памяти режим распределённой агрегации. */ \ + /** Is the memory-saving mode of distributed aggregation enabled. */ \ M(SettingBool, distributed_aggregation_memory_efficient, false) \ /** Number of threads to use for merge intermediate aggregation results in memory efficient mode. When bigger, then more memory is consumed. \ * 0 means - same as 'max_threads'. 
*/ \ M(SettingUInt64, aggregation_memory_efficient_merge_threads, 0) \ \ - /** Максимальное количество используемых реплик каждого шарда при выполнении запроса */ \ + /** The maximum number of replicas of each shard used when executing the query */ \ M(SettingUInt64, max_parallel_replicas, 1) \ M(SettingUInt64, parallel_replicas_count, 0) \ M(SettingUInt64, parallel_replica_offset, 0) \ \ - /** Тихо пропускать недоступные шарды. */ \ + /** Silently skip unavailable shards. */ \ M(SettingBool, skip_unavailable_shards, false) \ \ - /** Не мерджить состояния агрегации с разных серверов при распределённой обработке запроса \ - * - на случай, когда доподлинно известно, что на разных шардах разные ключи. \ + /** Do not merge aggregation states from different servers for distributed query processing \ + * - in case it is for certain that there are different keys on different shards. \ */ \ M(SettingBool, distributed_group_by_no_merge, false) \ \ - /** Тонкие настройки для чтения из MergeTree */ \ + /** Advanced settings for reading from MergeTree */ \ \ - /** Если из одного файла читается хотя бы столько строк, чтение можно распараллелить. */ \ + /** If at least as many lines are read from one file, the reading can be parallelized. */ \ M(SettingUInt64, merge_tree_min_rows_for_concurrent_read, (20 * 8192)) \ - /** Можно пропускать чтение более чем стольки строк ценой одного seek по файлу. */ \ + /** You can skip reading more than that number of rows at the price of one seek per file. */ \ M(SettingUInt64, merge_tree_min_rows_for_seek, 0) \ - /** Если отрезок индекса может содержать нужные ключи, делим его на столько частей и рекурсивно проверяем их. */ \ + /** If the index segment can contain the required keys, divide it into as many parts and recursively check them. */ \ M(SettingUInt64, merge_tree_coarse_index_granularity, 8) \ - /** Максимальное количество строк на запрос, для использования кэша разжатых данных. Если запрос большой - кэш не используется. 
\ - * (Чтобы большие запросы не вымывали кэш.) */ \ + /** The maximum number of rows per request, to use the cache of uncompressed data. If the request is large, the cache is not used. \ + * (For large queries not to flush out the cache.) */ \ M(SettingUInt64, merge_tree_max_rows_to_use_cache, (1024 * 1024)) \ \ - /** Распределять чтение из MergeTree по потокам равномерно, обеспечивая стабильное среднее время исполнения каждого потока в пределах одного чтения. */ \ + /** Distribute read from MergeTree over threads evenly, ensuring stable average execution time of each thread within one read operation. */ \ M(SettingBool, merge_tree_uniform_read_distribution, true) \ \ - /** Минимальная длина выражения expr = x1 OR ... expr = xN для оптимизации */ \ + /** The minimum length of the expression `expr = x1 OR ... expr = xN` for optimization */ \ M(SettingUInt64, optimize_min_equality_disjunction_chain_length, 3) \ \ - /** Минимальное количество байт для операций ввода/ввывода минуя кэш страниц. 0 - отключено. */ \ + /** The minimum number of bytes for input/output operations is bypassing the page cache. 0 - disabled. */ \ M(SettingUInt64, min_bytes_to_use_direct_io, 0) \ \ - /** Кидать исключение, если есть индекс, и он не используется. */ \ + /** Throw an exception if there is an index, and it is not used. */ \ M(SettingBool, force_index_by_date, 0) \ M(SettingBool, force_primary_key, 0) \ \ - /** В запросе INSERT с указанием столбцов, заполнять значения по-умолчанию только для столбцов с явными DEFAULT-ами. */ \ + /** In the INSERT query with specified columns, fill in the default values ​​only for columns with explicit DEFAULTs. */ \ M(SettingBool, strict_insert_defaults, 0) \ \ - /** В случае превышения максимального размера mark_cache, удалять только записи, старше чем mark_cache_min_lifetime секунд. */ \ + /** If the maximum size of mark_cache is exceeded, delete only records older than mark_cache_min_lifetime seconds. 
*/ \ M(SettingUInt64, mark_cache_min_lifetime, 10000) \ \ - /** Позволяет использовать больше источников, чем количество потоков - для более равномерного распределения работы по потокам. \ - * Предполагается, что это временное решение, так как можно будет в будущем сделать количество источников равное количеству потоков, \ - * но чтобы каждый источник динамически выбирал себе доступную работу. \ + /** Allows you to use more sources than the number of threads - to more evenly distribute work across threads. \ + * It is assumed that this is a temporary solution, since it will be possible in the future to make the number of sources equal to the number of threads, \ + * but for each source to dynamically select available work for itself. \ */ \ M(SettingFloat, max_streams_to_max_threads_ratio, 1) \ \ - /** Позволяет выбирать метод сжатия данных при записи */\ + /** Allows you to select the method of data compression when writing */ \ M(SettingCompressionMethod, network_compression_method, CompressionMethod::LZ4) \ \ - /** Приоритет запроса. 1 - самый высокий, больше - ниже; 0 - не использовать приоритеты. */ \ + /** Priority of the query. 1 - the highest, higher value - lower priority; 0 - do not use priorities. */ \ M(SettingUInt64, priority, 0) \ \ - /** Логгировать запросы и писать лог в системную таблицу. */ \ + /** Log requests and write the log to the system table. */ \ M(SettingBool, log_queries, 0) \ \ /** If query length is greater than specified threshold (in bytes), then cut query when writing to query log. \ @@ -171,48 +171,48 @@ struct Settings */ \ M(SettingUInt64, log_queries_cut_to_length, 100000) \ \ - /** Как выполняются распределённые подзапросы внутри секций IN или JOIN? */ \ + /** How are distributed subqueries performed inside IN or JOIN sections? */ \ M(SettingDistributedProductMode, distributed_product_mode, DistributedProductMode::DENY) \ \ - /** Схема выполнения GLOBAL-подзапросов. */ \ + /** The scheme for executing GLOBAL subqueries. 
*/ \ M(SettingGlobalSubqueriesMethod, global_subqueries_method, GlobalSubqueriesMethod::PUSH) \ \ - /** Максимальное количество одновременно выполняющихся запросов на одного user-а. */ \ + /** The maximum number of concurrent requests per user. */ \ M(SettingUInt64, max_concurrent_queries_for_user, 0) \ \ - /** Для запросов INSERT в реплицируемую таблицу, ждать записи на указанное число реплик и лианеризовать добавление данных. 0 - отключено. */ \ + /** For INSERT queries in the replicated table, wait writing for the specified number of replicas and linearize the addition of the data. 0 - disabled. */ \ M(SettingUInt64, insert_quorum, 0) \ M(SettingMilliseconds, insert_quorum_timeout, 600000) \ - /** Для запросов SELECT из реплицируемой таблицы, кидать исключение, если на реплике нет куска, записанного с кворумом; \ - * не читать куски, которые ещё не были записаны с кворумом. */ \ + /** For SELECT queries from the replicated table, throw an exception if the replica does not have a chunk written with the quorum; \ + * do not read the parts that have not yet been written with the quorum. */ \ M(SettingUInt64, select_sequential_consistency, 0) \ - /** Максимальное количество различных шардов и максимальное количество реплик одного шарда в функции remote. */ \ + /** The maximum number of different shards and the maximum number of replicas of one shard in the `remote` function. */ \ M(SettingUInt64, table_function_remote_max_addresses, 1000) \ - /** Маскимальное количество потоков при распределённой обработке одного запроса */ \ + /** Maximum number of threads for distributed processing of one query */ \ M(SettingUInt64, max_distributed_processing_threads, 8) \ \ - /** Настройки понижения числа потоков в случае медленных чтений. */ \ - /** Обращать внимания только на чтения, занявшие не меньше такого количества времени. */ \ + /** Settings to reduce the number of threads in case of slow reads. */ \ + /** Pay attention only to readings that took at least that much time. 
*/ \ M(SettingMilliseconds, read_backoff_min_latency_ms, 1000) \ - /** Считать события, когда пропускная способность меньше стольки байт в секунду. */ \ + /** Count events when the bandwidth is less than that many bytes per second. */ \ M(SettingUInt64, read_backoff_max_throughput, 1048576) \ - /** Не обращать внимания на событие, если от предыдущего прошло меньше стольки-то времени. */ \ + /** Do not pay attention to the event, if the previous one has passed less than a certain amount of time. */ \ M(SettingMilliseconds, read_backoff_min_interval_between_events_ms, 1000) \ - /** Количество событий, после которого количество потоков будет уменьшено. */ \ + /** The number of events after which the number of threads will be reduced. */ \ M(SettingUInt64, read_backoff_min_events, 2) \ \ - /** В целях тестирования exception safety - кидать исключение при каждом выделении памяти с указанной вероятностью. */ \ + /** For testing of `exception safety` - throw an exception every time you allocate memory with the specified probability. */ \ M(SettingFloat, memory_tracker_fault_probability, 0.) 
\ \ - /** Сжимать результат, если клиент по HTTP сказал, что он понимает данные, сжатые методом gzip или deflate */ \ + /** Compress the result if the client over HTTP said that it understands data compressed by gzip or deflate */ \ M(SettingBool, enable_http_compression, 0) \ - /** Уровень сжатия - используется, если клиент по HTTP сказал, что он понимает данные, сжатые методом gzip или deflate */ \ + /** Compression level - used if the client on HTTP said that it understands data compressed by gzip or deflate */ \ M(SettingInt64, http_zlib_compression_level, 3) \ \ - /** При разжатии данных POST от клиента, сжатых родным форматом, не проверять чексуммы */ \ + /** If you uncompress the POST data from the client compressed by the native format, do not check the checksum */ \ M(SettingBool, http_native_compression_disable_checksumming_on_decompress, 0) \ \ - /** Таймаут в секундах */ \ + /** Timeout in seconds */ \ M(SettingUInt64, resharding_barrier_timeout, 300) \ \ /** What aggregate function to use for implementation of count(DISTINCT ...) */ \ @@ -301,7 +301,7 @@ struct Settings void set(const String & name, const String & value); /** Set multiple settings from "profile" (in server configuration file (users.xml), profiles contain groups of multiple settings). - * Профиль также может быть установлен с помощью функций set, как настройка profile. + * The profile can also be set using the `set` functions, like the profile setting. */ void setProfile(const String & profile_name, Poco::Util::AbstractConfiguration & config); diff --git a/dbms/src/Interpreters/SystemLog.h b/dbms/src/Interpreters/SystemLog.h index ede68910638..904d59db099 100644 --- a/dbms/src/Interpreters/SystemLog.h +++ b/dbms/src/Interpreters/SystemLog.h @@ -80,8 +80,9 @@ public: */ void add(const LogElement & element) { - /// We could lock here in case of queue overflow. Maybe better to throw an exception or even don't do logging in that case. 
- queue.push({false, element}); + /// Without try we could block here in case of queue overflow. + if (!queue.tryPush({false, element})) + LOG_ERROR(log, "SystemLog queue is full"); } private: @@ -215,7 +216,7 @@ void SystemLog::flush() { try { - LOG_TRACE(log, "Flushing query log"); + LOG_TRACE(log, "Flushing system log"); if (!is_prepared) /// BTW, flush method is called from single thread. prepareTable(); @@ -223,6 +224,10 @@ void SystemLog::flush() Block block = LogElement::createBlock(); for (const LogElement & elem : data) elem.appendToBlock(block); + + /// Clear queue early, because insertion to the table could lead to generation of more log entrites + /// and pushing them to already full queue will lead to deadlock. + data.clear(); /// We write to table indirectly, using InterpreterInsertQuery. /// This is needed to support DEFAULT-columns in table. @@ -242,10 +247,9 @@ void SystemLog::flush() catch (...) { tryLogCurrentException(__PRETTY_FUNCTION__); + /// In case of exception, also clean accumulated data - to avoid locking. + data.clear(); } - - /// In case of exception, also clean accumulated data - to avoid locking. 
- data.clear(); } diff --git a/dbms/src/Interpreters/tests/CMakeLists.txt b/dbms/src/Interpreters/tests/CMakeLists.txt index 3fdce76f906..c143dc3fc94 100644 --- a/dbms/src/Interpreters/tests/CMakeLists.txt +++ b/dbms/src/Interpreters/tests/CMakeLists.txt @@ -8,7 +8,7 @@ add_executable (create_query create_query.cpp) target_link_libraries (create_query dbms) add_executable (select_query select_query.cpp) -target_link_libraries (select_query dbms storages_system) +target_link_libraries (select_query dbms clickhouse_storages_system) add_executable (aggregate aggregate.cpp) target_link_libraries (aggregate dbms) diff --git a/dbms/src/Server/CMakeLists.txt b/dbms/src/Server/CMakeLists.txt index 7335b1d874b..3449db60e1a 100644 --- a/dbms/src/Server/CMakeLists.txt +++ b/dbms/src/Server/CMakeLists.txt @@ -8,10 +8,10 @@ add_library(clickhouse-server StatusFile.cpp ReplicasStatusHandler.cpp ) -target_link_libraries(clickhouse-server daemon storages_system clickhouse_functions) +target_link_libraries(clickhouse-server daemon clickhouse_storages_system clickhouse_functions clickhouse_aggregate_functions) add_library(clickhouse-local LocalServer.cpp) -target_link_libraries(clickhouse-local dbms clickhouse_functions) +target_link_libraries(clickhouse-local dbms clickhouse_functions clickhouse_aggregate_functions) add_library(clickhouse-extract-from-config ExtractFromConfig.cpp) target_link_libraries(clickhouse-extract-from-config dbms) diff --git a/dbms/src/Server/LocalServer.cpp b/dbms/src/Server/LocalServer.cpp index ff22dd84bba..507bd503aa7 100644 --- a/dbms/src/Server/LocalServer.cpp +++ b/dbms/src/Server/LocalServer.cpp @@ -22,6 +22,7 @@ #include #include "StatusFile.h" #include +#include namespace DB @@ -266,6 +267,7 @@ try /// Don't initilaize DateLUT registerFunctions(); + registerAggregateFunctions(); /// Maybe useless if (config().has("macros")) diff --git a/dbms/src/Server/Server.cpp b/dbms/src/Server/Server.cpp index 1554cbf8837..9ef1a22ee94 100644 --- 
a/dbms/src/Server/Server.cpp +++ b/dbms/src/Server/Server.cpp @@ -39,6 +39,8 @@ #endif #include +#include + namespace DB { @@ -212,6 +214,7 @@ int Server::main(const std::vector & args) Logger * log = &logger(); registerFunctions(); + registerAggregateFunctions(); /** Context contains all that query execution is dependent: * settings, available functions, data types, aggregate functions, databases... @@ -443,10 +446,12 @@ int Server::main(const std::vector & args) listen_hosts.emplace_back(config().getString(key)); } + bool try_listen = false; if (listen_hosts.empty()) { listen_hosts.emplace_back("::1"); listen_hosts.emplace_back("127.0.0.1"); + try_listen = true; } auto make_socket_address = [&](const std::string & host, std::uint16_t port) { @@ -457,7 +462,6 @@ int Server::main(const std::vector & args) } catch (const Poco::Net::DNSException & e) { - /// Better message when IPv6 is disabled on host. if (e.code() == EAI_FAMILY #if defined(EAI_ADDRFAMILY) || e.code() == EAI_ADDRFAMILY @@ -466,9 +470,9 @@ int Server::main(const std::vector & args) { LOG_ERROR(log, "Cannot resolve listen_host (" << host << "), error: " << e.message() << ". " - << "If it is an IPv6 address and your host has disabled IPv6, then consider to " - << "specify IPv4 address to listen in element of configuration " - << "file. Example: 0.0.0.0"); + "If it is an IPv6 address and your host has disabled IPv6, then consider to " + "specify IPv4 address to listen in element of configuration " + "file. Example: 0.0.0.0"); } throw; @@ -479,76 +483,92 @@ int Server::main(const std::vector & args) for (const auto & listen_host : listen_hosts) { /// For testing purposes, user may omit tcp_port or http_port or https_port in configuration file. 
- - /// HTTP - if (config().has("http_port")) + try { - Poco::Net::SocketAddress http_socket_address = make_socket_address(listen_host, config().getInt("http_port")); - Poco::Net::ServerSocket http_socket(http_socket_address); - http_socket.setReceiveTimeout(settings.receive_timeout); - http_socket.setSendTimeout(settings.send_timeout); + /// HTTP + if (config().has("http_port")) + { + Poco::Net::SocketAddress http_socket_address = make_socket_address(listen_host, config().getInt("http_port")); + Poco::Net::ServerSocket http_socket(http_socket_address); + http_socket.setReceiveTimeout(settings.receive_timeout); + http_socket.setSendTimeout(settings.send_timeout); - servers.emplace_back(new Poco::Net::HTTPServer( - new HTTPRequestHandlerFactory(*this, "HTTPHandler-factory"), server_pool, http_socket, http_params)); + servers.emplace_back(new Poco::Net::HTTPServer( + new HTTPRequestHandlerFactory(*this, "HTTPHandler-factory"), server_pool, http_socket, http_params)); - LOG_INFO(log, "Listening http://" + http_socket_address.toString()); + LOG_INFO(log, "Listening http://" + http_socket_address.toString()); + } + + /// HTTPS + if (config().has("https_port")) + { + #if Poco_NetSSL_FOUND + std::call_once(ssl_init_once, SSLInit); + Poco::Net::SocketAddress http_socket_address = make_socket_address(listen_host, config().getInt("https_port")); + Poco::Net::SecureServerSocket http_socket(http_socket_address); + http_socket.setReceiveTimeout(settings.receive_timeout); + http_socket.setSendTimeout(settings.send_timeout); + + servers.emplace_back(new Poco::Net::HTTPServer( + new HTTPRequestHandlerFactory(*this, "HTTPHandler-factory"), server_pool, http_socket, http_params)); + + LOG_INFO(log, "Listening https://" + http_socket_address.toString()); + #else + throw Exception{"https protocol disabled because poco library built without NetSSL support.", + ErrorCodes::SUPPORT_IS_DISABLED}; + #endif + } + + /// TCP + if (config().has("tcp_port")) + { + Poco::Net::SocketAddress 
tcp_address = make_socket_address(listen_host, config().getInt("tcp_port")); + Poco::Net::ServerSocket tcp_socket(tcp_address); + tcp_socket.setReceiveTimeout(settings.receive_timeout); + tcp_socket.setSendTimeout(settings.send_timeout); + servers.emplace_back( + new Poco::Net::TCPServer(new TCPConnectionFactory(*this), server_pool, tcp_socket, new Poco::Net::TCPServerParams)); + + LOG_INFO(log, "Listening tcp: " + tcp_address.toString()); + } + + /// At least one of TCP and HTTP servers must be created. + if (servers.empty()) + throw Exception("No 'tcp_port' and 'http_port' is specified in configuration file.", ErrorCodes::NO_ELEMENTS_IN_CONFIG); + + /// Interserver IO HTTP + if (config().has("interserver_http_port")) + { + Poco::Net::SocketAddress interserver_address = make_socket_address(listen_host, config().getInt("interserver_http_port")); + Poco::Net::ServerSocket interserver_io_http_socket(interserver_address); + interserver_io_http_socket.setReceiveTimeout(settings.receive_timeout); + interserver_io_http_socket.setSendTimeout(settings.send_timeout); + servers.emplace_back(new Poco::Net::HTTPServer( + new HTTPRequestHandlerFactory(*this, "InterserverIOHTTPHandler-factory"), + server_pool, + interserver_io_http_socket, + http_params)); + + LOG_INFO(log, "Listening interserver: " + interserver_address.toString()); + } + } + catch (const Poco::Net::NetException & e) + { + if (try_listen && e.code() == POCO_EPROTONOSUPPORT) + LOG_ERROR(log, "Listen [" << listen_host << "]: " << e.what() << ": " << e.message() + << " If it is an IPv6 or IPv4 address and your host has disabled IPv6 or IPv4, then consider to " + "specify not disabled IPv4 or IPv6 address to listen in element of configuration " + "file. Example for disabled IPv6: 0.0.0.0 ." 
+ " Example for disabled IPv4: ::"); + else + throw; } - /// HTTPS - if (config().has("https_port")) - { -#if Poco_NetSSL_FOUND - std::call_once(ssl_init_once, SSLInit); - Poco::Net::SocketAddress http_socket_address = make_socket_address(listen_host, config().getInt("https_port")); - Poco::Net::SecureServerSocket http_socket(http_socket_address); - http_socket.setReceiveTimeout(settings.receive_timeout); - http_socket.setSendTimeout(settings.send_timeout); - - servers.emplace_back(new Poco::Net::HTTPServer( - new HTTPRequestHandlerFactory(*this, "HTTPHandler-factory"), server_pool, http_socket, http_params)); - - LOG_INFO(log, "Listening https://" + http_socket_address.toString()); -#else - throw Exception{"https protocol disabled because poco library built without NetSSL support.", - ErrorCodes::SUPPORT_IS_DISABLED}; -#endif - } - - /// TCP - if (config().has("tcp_port")) - { - Poco::Net::SocketAddress tcp_address = make_socket_address(listen_host, config().getInt("tcp_port")); - Poco::Net::ServerSocket tcp_socket(tcp_address); - tcp_socket.setReceiveTimeout(settings.receive_timeout); - tcp_socket.setSendTimeout(settings.send_timeout); - servers.emplace_back( - new Poco::Net::TCPServer(new TCPConnectionFactory(*this), server_pool, tcp_socket, new Poco::Net::TCPServerParams)); - - LOG_INFO(log, "Listening tcp: " + tcp_address.toString()); - } - - - /// At least one of TCP and HTTP servers must be created. 
- if (servers.empty()) - throw Exception("No 'tcp_port' and 'http_port' is specified in configuration file.", ErrorCodes::NO_ELEMENTS_IN_CONFIG); - - /// Interserver IO HTTP - if (config().has("interserver_http_port")) - { - Poco::Net::SocketAddress interserver_address = make_socket_address(listen_host, config().getInt("interserver_http_port")); - Poco::Net::ServerSocket interserver_io_http_socket(interserver_address); - interserver_io_http_socket.setReceiveTimeout(settings.receive_timeout); - interserver_io_http_socket.setSendTimeout(settings.send_timeout); - servers.emplace_back(new Poco::Net::HTTPServer( - new HTTPRequestHandlerFactory(*this, "InterserverIOHTTPHandler-factory"), - server_pool, - interserver_io_http_socket, - http_params)); - - LOG_INFO(log, "Listening interserver: " + interserver_address.toString()); - } } + if (servers.empty()) + throw Exception("No servers started (add valid listen_host and 'tcp_port' or 'http_port' to configuration file.)", ErrorCodes::NO_ELEMENTS_IN_CONFIG); + for (auto & server : servers) server->start(); @@ -600,7 +620,7 @@ int Server::main(const std::vector & args) LOG_DEBUG( log, "Closed connections." << (current_connections ? " But " + std::to_string(current_connections) + " remains." - + " Tip: To increase wait time add to config: 60 ." 
: "")); + " Tip: To increase wait time add to config: 60" : "")); main_config_reloader.reset(); users_config_reloader.reset(); diff --git a/dbms/src/Storages/ColumnsDescription.cpp b/dbms/src/Storages/ColumnsDescription.cpp index 207f4f4cddc..c663643c049 100644 --- a/dbms/src/Storages/ColumnsDescription.cpp +++ b/dbms/src/Storages/ColumnsDescription.cpp @@ -104,7 +104,7 @@ ColumnsDescription ColumnsDescription::parse(const String & str) ASTPtr default_expr; Expected expected{}; - auto begin = default_expr_str.data(); + const char * begin = default_expr_str.data(); const auto end = begin + default_expr_str.size(); const char * max_parsed_pos = begin; if (!expr_parser.parse(begin, end, default_expr, max_parsed_pos, expected)) diff --git a/dbms/src/Storages/Distributed/DistributedBlockOutputStream.cpp b/dbms/src/Storages/Distributed/DistributedBlockOutputStream.cpp index 2e1f4e8b2bc..99ded795581 100644 --- a/dbms/src/Storages/Distributed/DistributedBlockOutputStream.cpp +++ b/dbms/src/Storages/Distributed/DistributedBlockOutputStream.cpp @@ -11,11 +11,12 @@ #include #include -#include #include -#include #include +#include + +#include #include @@ -124,8 +125,8 @@ void DistributedBlockOutputStream::writeToLocal(const Block & block, const size_ void DistributedBlockOutputStream::writeToShard(const Block & block, const std::vector & dir_names) { /** tmp directory is used to ensure atomicity of transactions - * and keep monitor thread out from reading incomplete data - */ + * and keep monitor thread out from reading incomplete data + */ std::string first_file_tmp_path{}; auto first = true; @@ -140,7 +141,7 @@ void DistributedBlockOutputStream::writeToShard(const Block & block, const std:: if (Poco::File(path).createDirectory()) storage.requireDirectoryMonitor(dir_name); - const auto & file_name = toString(Increment{path + "increment.txt"}.get(true)) + ".bin"; + const auto & file_name = toString(storage.file_names_increment.get()) + ".bin"; const auto & block_file_path = 
path + file_name; /** on first iteration write block to a temporary directory for subsequent hardlinking to ensure diff --git a/dbms/src/Storages/MergeTree/DataPartsExchange.cpp b/dbms/src/Storages/MergeTree/DataPartsExchange.cpp index 4ef09d16191..33dbdf13860 100644 --- a/dbms/src/Storages/MergeTree/DataPartsExchange.cpp +++ b/dbms/src/Storages/MergeTree/DataPartsExchange.cpp @@ -215,10 +215,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::fetchPartImpl( Poco::File part_file(part_path); if (part_file.exists()) - { - LOG_ERROR(log, "Directory " + part_path + " already exists. Removing."); - part_file.remove(true); - } + throw Exception("Directory " + part_path + " already exists.", ErrorCodes::DIRECTORY_ALREADY_EXISTS); CurrentMetrics::Increment metric_increment{CurrentMetrics::ReplicatedFetch}; diff --git a/dbms/src/Storages/MergeTree/MergeTreeData.cpp b/dbms/src/Storages/MergeTree/MergeTreeData.cpp index efea7b58feb..55a7d56e7b5 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeData.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreeData.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -445,7 +446,7 @@ void MergeTreeData::loadDataParts(bool skip_sanity_checks) } } - calculateColumnSizes(); + calculateColumnSizesImpl(); LOG_DEBUG(log, "Loaded data parts (" << data_parts.size() << " items)"); } @@ -1578,7 +1579,7 @@ MergeTreeData::MutableDataPartPtr MergeTreeData::loadPartAndFixMetadata(const St } -void MergeTreeData::calculateColumnSizes() +void MergeTreeData::calculateColumnSizesImpl() { column_sizes.clear(); @@ -1591,7 +1592,7 @@ void MergeTreeData::addPartContributionToColumnSizes(const DataPartPtr & part) const auto & files = part->checksums.files; /// TODO This method doesn't take into account columns with multiple files. 
- for (const auto & column : *columns) + for (const auto & column : getColumnsList()) { const auto escaped_name = escapeForFileName(column.name); const auto bin_file_name = escaped_name + ".bin"; @@ -1611,6 +1612,14 @@ void MergeTreeData::addPartContributionToColumnSizes(const DataPartPtr & part) } } +static inline void logSubtract(size_t & from, size_t value, Logger * log, const String & variable) +{ + if (value > from) + LOG_ERROR(log, "Possibly incorrect subtraction: " << from << " - " << value << " = " << from - value << ", variable " << variable); + + from -= value; +} + void MergeTreeData::removePartContributionToColumnSizes(const DataPartPtr & part) { const auto & files = part->checksums.files; @@ -1627,12 +1636,12 @@ void MergeTreeData::removePartContributionToColumnSizes(const DataPartPtr & part if (files.count(bin_file_name)) { const auto & bin_file_checksums = files.at(bin_file_name); - column_size.data_compressed -= bin_file_checksums.file_size; - column_size.data_uncompressed -= bin_file_checksums.uncompressed_size; + logSubtract(column_size.data_compressed, bin_file_checksums.file_size, log, bin_file_name + ".file_size"); + logSubtract(column_size.data_uncompressed, bin_file_checksums.uncompressed_size, log, bin_file_name + ".uncompressed_size"); } if (files.count(mrk_file_name)) - column_size.marks -= files.at(mrk_file_name).file_size; + logSubtract(column_size.marks, files.at(mrk_file_name).file_size, log, mrk_file_name + ".file_size"); } } diff --git a/dbms/src/Storages/MergeTree/MergeTreeData.h b/dbms/src/Storages/MergeTree/MergeTreeData.h index 78d48bdf7bd..a7f56476468 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeData.h +++ b/dbms/src/Storages/MergeTree/MergeTreeData.h @@ -446,9 +446,16 @@ public: for (const auto & col : column_sizes) total_size += col.second.getTotalCompressedSize(); + return total_size; } + void recalculateColumnSizes() + { + std::lock_guard lock{data_parts_mutex}; + calculateColumnSizesImpl(); + } + /// For 
ATTACH/DETACH/DROP/RESHARD PARTITION. static String getMonthName(const Field & partition); static String getMonthName(DayNum_t month); @@ -535,7 +542,7 @@ private: ExpressionActionsPtr & out_expression, NameToNameMap & out_rename_map, bool & out_force_update_metadata) const; /// Calculates column sizes in compressed form for the current state of data_parts. Call with data_parts mutex locked. - void calculateColumnSizes(); + void calculateColumnSizesImpl(); /// Adds or subtracts the contribution of the part to compressed column sizes. void addPartContributionToColumnSizes(const DataPartPtr & part); void removePartContributionToColumnSizes(const DataPartPtr & part); diff --git a/dbms/src/Storages/MergeTree/MergeTreeDataMerger.cpp b/dbms/src/Storages/MergeTree/MergeTreeDataMerger.cpp index 61b3a6a1626..45a4bcb22e9 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeDataMerger.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreeDataMerger.cpp @@ -21,9 +21,11 @@ #include #include #include -#include +#include #include +#include + #include #include #include diff --git a/dbms/src/Storages/MergeTree/MergeTreeDataPart.cpp b/dbms/src/Storages/MergeTree/MergeTreeDataPart.cpp index a995be61138..9584b8ace19 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeDataPart.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreeDataPart.cpp @@ -565,11 +565,9 @@ void MergeTreeDataPart::loadColumns(bool require) throw Exception("No columns.txt in part " + name, ErrorCodes::NO_FILE_IN_DATA_PART); /// If there is no file with a list of columns, write it down. 
- for (const NameAndTypePair & column : *storage.columns) - { + for (const NameAndTypePair & column : storage.getColumnsList()) if (Poco::File(getFullPath() + escapeForFileName(column.name) + ".bin").exists()) columns.push_back(column); - } if (columns.empty()) throw Exception("No columns in part " + name, ErrorCodes::NO_FILE_IN_DATA_PART); diff --git a/dbms/src/Storages/MergeTree/MergeTreeDataSelectExecutor.h b/dbms/src/Storages/MergeTree/MergeTreeDataSelectExecutor.h index a19cde60f2d..af834757dea 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeDataSelectExecutor.h +++ b/dbms/src/Storages/MergeTree/MergeTreeDataSelectExecutor.h @@ -28,7 +28,7 @@ public: QueryProcessingStage::Enum & processed_stage, size_t max_block_size, unsigned threads, - size_t * inout_part_index, /// If not nullptr, from this counter values are taken ​​for the virtual column _part_index. + size_t * inout_part_index, /// If not nullptr, from this counter values are taken for the virtual column _part_index. Int64 max_block_number_to_read) const; private: diff --git a/dbms/src/Storages/MergeTree/MergeTreeSettings.h b/dbms/src/Storages/MergeTree/MergeTreeSettings.h index a030af6cc36..edbf3044783 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeSettings.h +++ b/dbms/src/Storages/MergeTree/MergeTreeSettings.h @@ -3,6 +3,7 @@ #include #include #include +#include namespace DB diff --git a/dbms/src/Storages/MergeTree/MergedBlockOutputStream.h b/dbms/src/Storages/MergeTree/MergedBlockOutputStream.h index 01da66151f6..92a0dda7013 100644 --- a/dbms/src/Storages/MergeTree/MergedBlockOutputStream.h +++ b/dbms/src/Storages/MergeTree/MergedBlockOutputStream.h @@ -136,7 +136,7 @@ public: private: void init(); - /** If `permutation` is given, it rearranges the values ​​in the columns when writing. + /** If `permutation` is given, it rearranges the values in the columns when writing. * This is necessary to not keep the whole block in the RAM to sort it. 
*/ void writeImpl(const Block & block, const IColumn::Permutation * permutation); diff --git a/dbms/src/Storages/MergeTree/ReplicatedMergeTreeAlterThread.cpp b/dbms/src/Storages/MergeTree/ReplicatedMergeTreeAlterThread.cpp index ec623d644eb..f69ccd3642f 100644 --- a/dbms/src/Storages/MergeTree/ReplicatedMergeTreeAlterThread.cpp +++ b/dbms/src/Storages/MergeTree/ReplicatedMergeTreeAlterThread.cpp @@ -199,6 +199,9 @@ void ReplicatedMergeTreeAlterThread::run() transaction->commit(); } + /// Columns sizes could be quietly changed in case of MODIFY/ADD COLUMN + storage.data.recalculateColumnSizes(); + /// The same for non-replicated data. if (storage.unreplicated_data) { @@ -216,6 +219,8 @@ void ReplicatedMergeTreeAlterThread::run() transaction->commit(); } + + storage.unreplicated_data->recalculateColumnSizes(); } /// List of columns for a specific replica. diff --git a/dbms/src/Storages/MergeTree/ReplicatedMergeTreeLogEntry.h b/dbms/src/Storages/MergeTree/ReplicatedMergeTreeLogEntry.h index ef255ff2ca7..077be41a693 100644 --- a/dbms/src/Storages/MergeTree/ReplicatedMergeTreeLogEntry.h +++ b/dbms/src/Storages/MergeTree/ReplicatedMergeTreeLogEntry.h @@ -63,7 +63,7 @@ struct ReplicatedMergeTreeLogEntryData /// The name of resulting part. /// For DROP_RANGE, the name of a non-existent part. You need to remove all the parts covered by it. String new_part_name; - String block_id; /// For parts of level zero, the block identifier for deduplication (node ​​name in /blocks /). + String block_id; /// For parts of level zero, the block identifier for deduplication (node name in /blocks/). 
Strings parts_to_merge; bool deduplicate = false; /// Do deduplicate on merge diff --git a/dbms/src/Storages/MergeTree/ReplicatedMergeTreeRestartingThread.h b/dbms/src/Storages/MergeTree/ReplicatedMergeTreeRestartingThread.h index 42f192ae909..4feff1b0443 100644 --- a/dbms/src/Storages/MergeTree/ReplicatedMergeTreeRestartingThread.h +++ b/dbms/src/Storages/MergeTree/ReplicatedMergeTreeRestartingThread.h @@ -14,7 +14,7 @@ class StorageReplicatedMergeTree; /** Initializes ZK session. - * Exposes ephemeral nodes. It sets the node values ​​that are required for replica detection. + * Exposes ephemeral nodes. It sets the node values that are required for replica detection. * Starts participation in the leader selection. Starts all background threads. * Then monitors whether the session has expired. And if it expired, it will reinitialize it. */ diff --git a/dbms/src/Storages/StorageBuffer.h b/dbms/src/Storages/StorageBuffer.h index 21c5f0a957f..90608253dec 100644 --- a/dbms/src/Storages/StorageBuffer.h +++ b/dbms/src/Storages/StorageBuffer.h @@ -85,7 +85,7 @@ public: void rename(const String & new_path_to_db, const String & new_database_name, const String & new_table_name) override { name = new_table_name; } bool supportsSampling() const override { return true; } - bool supportsPrewhere() const override { return true; } + bool supportsPrewhere() const override { return false; } bool supportsFinal() const override { return true; } bool supportsIndexForIn() const override { return true; } bool supportsParallelReplicas() const override { return true; } diff --git a/dbms/src/Storages/StorageDistributed.cpp b/dbms/src/Storages/StorageDistributed.cpp index 9f9f34da749..3b2093be68b 100644 --- a/dbms/src/Storages/StorageDistributed.cpp +++ b/dbms/src/Storages/StorageDistributed.cpp @@ -38,6 +38,8 @@ #include +#include + namespace DB { @@ -53,29 +55,70 @@ namespace ErrorCodes namespace { - /// select query has database and table names as AST pointers - /// Creates a copy of 
query, changes database and table names. - inline ASTPtr rewriteSelectQuery(const ASTPtr & query, const std::string & database, const std::string & table) + +/// select query has database and table names as AST pointers +/// Creates a copy of query, changes database and table names. +ASTPtr rewriteSelectQuery(const ASTPtr & query, const std::string & database, const std::string & table) +{ + auto modified_query_ast = query->clone(); + typeid_cast(*modified_query_ast).replaceDatabaseAndTable(database, table); + return modified_query_ast; +} + +/// insert query has database and table names as bare strings +/// Creates a copy of query, changes the database and table names. +ASTPtr rewriteInsertQuery(const ASTPtr & query, const std::string & database, const std::string & table) +{ + auto modified_query_ast = query->clone(); + + auto & actual_query = typeid_cast(*modified_query_ast); + actual_query.database = database; + actual_query.table = table; + /// make sure query is not INSERT SELECT + actual_query.select = nullptr; + + return modified_query_ast; +} + +/// Calculate maximum number in file names in directory and all subdirectories. +/// To ensure global order of data blocks yet to be sent across server restarts. 
+UInt64 getMaximumFileNumber(const std::string & path) +{ + UInt64 res = 0; + + boost::filesystem::recursive_directory_iterator begin(path); + boost::filesystem::recursive_directory_iterator end; + for (auto it = begin; it != end; ++it) { - auto modified_query_ast = query->clone(); - typeid_cast(*modified_query_ast).replaceDatabaseAndTable(database, table); - return modified_query_ast; + const auto & path = it->path(); + + if (it->status().type() != boost::filesystem::regular_file || !endsWith(path.filename().string(), ".bin")) + continue; + + UInt64 num = 0; + try + { + num = parse(path.filename().stem().string()); + } + catch (Exception & e) + { + e.addMessage("Unexpected file name " + path.filename().string() + " found at " + path.parent_path().string() + ", should have numeric base name."); + throw; + } + + if (num > res) + res = num; } - /// insert query has database and table names as bare strings - /// Creates a copy of query, changes the database and table names. - inline ASTPtr rewriteInsertQuery(const ASTPtr & query, const std::string & database, const std::string & table) - { - auto modified_query_ast = query->clone(); + return res; +} - auto & actual_query = typeid_cast(*modified_query_ast); - actual_query.database = database; - actual_query.table = table; - /// make sure query is not INSERT SELECT - actual_query.select = nullptr; +void initializeFileNamesIncrement(const std::string & path, SimpleIncrement & increment) +{ + if (!path.empty()) + increment.set(getMaximumFileNumber(path)); +} - return modified_query_ast; - } } @@ -96,6 +139,7 @@ StorageDistributed::StorageDistributed( path(data_path_.empty() ? "" : (data_path_ + escapeForFileName(name) + '/')) { createDirectoryMonitors(); + initializeFileNamesIncrement(path, file_names_increment); } @@ -120,6 +164,7 @@ StorageDistributed::StorageDistributed( path(data_path_.empty() ? 
"" : (data_path_ + escapeForFileName(name) + '/')) { createDirectoryMonitors(); + initializeFileNamesIncrement(path, file_names_increment); } @@ -443,10 +488,11 @@ void StorageDistributed::createDirectoryMonitors() Poco::File{path}.createDirectory(); - Poco::DirectoryIterator end; - for (Poco::DirectoryIterator it{path}; it != end; ++it) - if (it->isDirectory()) - createDirectoryMonitor(it.name()); + boost::filesystem::directory_iterator begin(path); + boost::filesystem::directory_iterator end; + for (auto it = begin; it != end; ++it) + if (it->status().type() == boost::filesystem::directory_file) + createDirectoryMonitor(it->path().filename().string()); } diff --git a/dbms/src/Storages/StorageDistributed.h b/dbms/src/Storages/StorageDistributed.h index db6c3417ef4..4cac100568b 100644 --- a/dbms/src/Storages/StorageDistributed.h +++ b/dbms/src/Storages/StorageDistributed.h @@ -3,6 +3,7 @@ #include #include +#include #include #include #include @@ -159,6 +160,9 @@ private: String path; /// Can be empty if data_path_ is empty. In this case, a directory for the data to be sent is not created. std::unordered_map> directory_monitors; + + /// Used for global monotonic ordering of files to send. 
+ SimpleIncrement file_names_increment; }; } diff --git a/dbms/src/Storages/StorageFactory.cpp b/dbms/src/Storages/StorageFactory.cpp index 0639c700a82..a9223385c78 100644 --- a/dbms/src/Storages/StorageFactory.cpp +++ b/dbms/src/Storages/StorageFactory.cpp @@ -153,7 +153,7 @@ static void appendGraphitePattern(const Context & context, else if (key == "function") { /// TODO Not only Float64 - pattern.function = context.getAggregateFunctionFactory().get( + pattern.function = AggregateFunctionFactory::instance().get( config.getString(config_element + ".function"), { std::make_shared() }); } else if (startsWith(key, "retention")) @@ -249,9 +249,14 @@ StoragePtr StorageFactory::get( bool attach, bool has_force_restore_data_flag) const { - checkAllTypesAreAllowedInTable(*columns); - checkAllTypesAreAllowedInTable(materialized_columns); - checkAllTypesAreAllowedInTable(alias_columns); + /// Check for some special types, that are not allowed to be stored in tables. Example: NULL data type. + /// Exception: any type is allowed in View, because plain (non-materialized) View does not store anything itself. + if (name != "View") + { + checkAllTypesAreAllowedInTable(*columns); + checkAllTypesAreAllowedInTable(materialized_columns); + checkAllTypesAreAllowedInTable(alias_columns); + } if (name == "Log") { diff --git a/dbms/src/Storages/StorageMerge.cpp b/dbms/src/Storages/StorageMerge.cpp index 68be2361613..b1a6e1a1298 100644 --- a/dbms/src/Storages/StorageMerge.cpp +++ b/dbms/src/Storages/StorageMerge.cpp @@ -11,6 +11,7 @@ #include #include #include +#include namespace DB @@ -176,6 +177,14 @@ BlockInputStreams StorageMerge::read( else if (processed_stage_in_source_table != processed_stage_in_source_tables.value()) throw Exception("Source tables for Merge table are processing data up to different stages", ErrorCodes::INCOMPATIBLE_SOURCE_TABLES); + + /// Subordinary tables could have different but convertible types, like numeric types of different width. 
+ /// We must return streams with structure equals to structure of Merge table. + for (auto & stream : source_streams) + { + /// will throw if some columns not convertible + stream = std::make_shared(context, stream, table->getSampleBlock(), getSampleBlock()); + } } else { @@ -199,7 +208,13 @@ BlockInputStreams StorageMerge::read( throw Exception("Source tables for Merge table are processing data up to different stages", ErrorCodes::INCOMPATIBLE_SOURCE_TABLES); - return streams.empty() ? std::make_shared() : streams.front(); + auto stream = streams.empty() ? std::make_shared() : streams.front(); + if (!streams.empty()) + { + /// will throw if some columns not convertible + stream = std::make_shared(context, stream, table->getSampleBlock(), getSampleBlock()); + } + return stream; })); } diff --git a/dbms/src/Storages/StorageMergeTree.cpp b/dbms/src/Storages/StorageMergeTree.cpp index 4bb34060a10..c2e8e3037df 100644 --- a/dbms/src/Storages/StorageMergeTree.cpp +++ b/dbms/src/Storages/StorageMergeTree.cpp @@ -131,6 +131,7 @@ BlockOutputStreamPtr StorageMergeTree::write(ASTPtr query, const Settings & sett bool StorageMergeTree::checkTableCanBeDropped() const { + const_cast(getData()).recalculateColumnSizes(); context.checkTableCanBeDropped(database_name, table_name, getData().getTotalCompressedSize()); return true; } @@ -243,6 +244,9 @@ void StorageMergeTree::alter( for (auto & transaction : transactions) transaction->commit(); + /// Columns sizes could be changed + data.recalculateColumnSizes(); + if (primary_key_is_modified) data.loadDataParts(false); } diff --git a/dbms/src/Storages/StorageMergeTree.h b/dbms/src/Storages/StorageMergeTree.h index ea8af476c46..6d4ab228f00 100644 --- a/dbms/src/Storages/StorageMergeTree.h +++ b/dbms/src/Storages/StorageMergeTree.h @@ -8,7 +8,7 @@ #include #include #include -#include +#include namespace DB diff --git a/dbms/src/Storages/StorageReplicatedMergeTree.cpp b/dbms/src/Storages/StorageReplicatedMergeTree.cpp index 
d40e4bf529b..4a2bbc0801e 100644 --- a/dbms/src/Storages/StorageReplicatedMergeTree.cpp +++ b/dbms/src/Storages/StorageReplicatedMergeTree.cpp @@ -2954,6 +2954,7 @@ void StorageReplicatedMergeTree::attachPartition(ASTPtr query, const Field & fie bool StorageReplicatedMergeTree::checkTableCanBeDropped() const { /// Consider only synchronized data + const_cast(getData()).recalculateColumnSizes(); context.checkTableCanBeDropped(database_name, table_name, getData().getTotalCompressedSize()); return true; } diff --git a/dbms/src/Storages/System/CMakeLists.txt b/dbms/src/Storages/System/CMakeLists.txt index 49deb422b83..dae153e4fb1 100644 --- a/dbms/src/Storages/System/CMakeLists.txt +++ b/dbms/src/Storages/System/CMakeLists.txt @@ -1,4 +1,4 @@ include(${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake) add_headers_and_sources(storages_system .) -add_library(storages_system ${storages_system_headers} ${storages_system_sources}) +add_library(clickhouse_storages_system ${storages_system_headers} ${storages_system_sources}) diff --git a/dbms/src/Storages/System/StorageSystemFunctions.cpp b/dbms/src/Storages/System/StorageSystemFunctions.cpp index e9c7075f11f..98cf8cbe9be 100644 --- a/dbms/src/Storages/System/StorageSystemFunctions.cpp +++ b/dbms/src/Storages/System/StorageSystemFunctions.cpp @@ -49,7 +49,7 @@ BlockInputStreams StorageSystemFunctions::read( column_is_aggregate.column->insert(UInt64(0)); } - const auto & aggregate_functions = context.getAggregateFunctionFactory().aggregate_functions; + const auto & aggregate_functions = AggregateFunctionFactory::instance().aggregate_functions; for (const auto & it : aggregate_functions) { column_name.column->insert(it.first); diff --git a/dbms/src/Storages/System/StorageSystemParts.cpp b/dbms/src/Storages/System/StorageSystemParts.cpp index 6c747f08075..eb687a18598 100644 --- a/dbms/src/Storages/System/StorageSystemParts.cpp +++ b/dbms/src/Storages/System/StorageSystemParts.cpp @@ -19,26 +19,27 @@ 
StorageSystemParts::StorageSystemParts(const std::string & name_) : name(name_), columns { - {"partition", std::make_shared()}, - {"name", std::make_shared()}, - {"replicated", std::make_shared()}, - {"active", std::make_shared()}, - {"marks", std::make_shared()}, - {"bytes", std::make_shared()}, - {"modification_time", std::make_shared()}, - {"remove_time", std::make_shared()}, + {"partition", std::make_shared()}, + {"name", std::make_shared()}, + {"replicated", std::make_shared()}, + {"active", std::make_shared()}, + {"marks", std::make_shared()}, + {"rows", std::make_shared()}, + {"bytes", std::make_shared()}, + {"modification_time", std::make_shared()}, + {"remove_time", std::make_shared()}, {"refcount", std::make_shared()}, {"min_date", std::make_shared()}, {"max_date", std::make_shared()}, {"min_block_number", std::make_shared()}, {"max_block_number", std::make_shared()}, - {"level", std::make_shared()}, + {"level", std::make_shared()}, {"primary_key_bytes_in_memory", std::make_shared()}, {"primary_key_bytes_in_memory_allocated", std::make_shared()}, - {"database", std::make_shared()}, - {"table", std::make_shared()}, - {"engine", std::make_shared()}, + {"database", std::make_shared()}, + {"table", std::make_shared()}, + {"engine", std::make_shared()}, } { } @@ -231,6 +232,7 @@ BlockInputStreams StorageSystemParts::read( block.getByPosition(i++).column->insert(replicated); block.getByPosition(i++).column->insert(static_cast(!need[replicated][0] || active_parts.count(part))); block.getByPosition(i++).column->insert(part->size); + block.getByPosition(i++).column->insert(part->getExactSizeRows()); block.getByPosition(i++).column->insert(static_cast(part->size_in_bytes)); block.getByPosition(i++).column->insert(part->modification_time); block.getByPosition(i++).column->insert(part->remove_time); diff --git a/dbms/src/Storages/tests/CMakeLists.txt b/dbms/src/Storages/tests/CMakeLists.txt index bd0897f969d..663ea09be6e 100644 --- 
a/dbms/src/Storages/tests/CMakeLists.txt +++ b/dbms/src/Storages/tests/CMakeLists.txt @@ -1,7 +1,7 @@ include_directories (${CMAKE_CURRENT_BINARY_DIR}) add_executable (system_numbers system_numbers.cpp) -target_link_libraries (system_numbers dbms storages_system) +target_link_libraries (system_numbers dbms clickhouse_storages_system) add_executable (storage_log storage_log.cpp) target_link_libraries (storage_log dbms) diff --git a/dbms/tests/clickhouse-test b/dbms/tests/clickhouse-test index 9be8227d5b6..5cb094333fe 100755 --- a/dbms/tests/clickhouse-test +++ b/dbms/tests/clickhouse-test @@ -160,7 +160,7 @@ def main(args): report_testcase.append(stderr_element) print(stderr) - if 'Connection refused' in stderr or 'Attempt to read after eof' in stderr: + if args.stop and ('Connection refused' in stderr or 'Attempt to read after eof' in stderr) and not 'Received exception from server' in stderr: SERVER_DIED = True elif stderr: @@ -235,6 +235,7 @@ if __name__ == '__main__': group.add_argument('--no-zookeeper', action = 'store_false', default = None, dest = 'zookeeper', help = 'Do not run zookeeper related tests') group.add_argument('--shard', action = 'store_true', default = None, dest = 'shard', help = 'Run sharding related tests (required to clickhouse-server listen 127.0.0.2 127.0.0.3)') group.add_argument('--no-shard', action = 'store_false', default = None, dest = 'shard', help = 'Do not run shard related tests') + group.add_argument('--stop', action = 'store_true', default = None, dest = 'stop', help = 'Stop on network errors ') args = parser.parse_args() diff --git a/dbms/tests/queries/0_stateless/00203_full_join.reference b/dbms/tests/queries/0_stateless/00203_full_join.reference index 0b56ddea8cf..eedd5818063 100644 --- a/dbms/tests/queries/0_stateless/00203_full_join.reference +++ b/dbms/tests/queries/0_stateless/00203_full_join.reference @@ -38,3 +38,8 @@ Hello [0,1,2] 3 4 5 +1 2 3 aaa +2 3 4 bbb ccc +5 6 7 ddd +2 3 4 bbb ccc +5 6 7 ddd diff --git 
a/dbms/tests/queries/0_stateless/00203_full_join.sql b/dbms/tests/queries/0_stateless/00203_full_join.sql index 8a262a821ab..19304f29268 100644 --- a/dbms/tests/queries/0_stateless/00203_full_join.sql +++ b/dbms/tests/queries/0_stateless/00203_full_join.sql @@ -9,3 +9,16 @@ SELECT k, x FROM (SELECT arrayJoin([1, 2, 3]) AS k, 'Hello' AS x) ANY RIGHT JOIN SELECT k, y FROM (SELECT arrayJoin([1, 2, 3]) AS k, 'Hello' AS x) ANY RIGHT JOIN (SELECT range(k) AS y, arrayJoin([3, 4, 5]) AS k) USING k WHERE k < 10 ORDER BY k; SELECT x, y FROM (SELECT arrayJoin([1, 2, 3]) AS k, 'Hello' AS x) ANY RIGHT JOIN (SELECT range(k) AS y, arrayJoin([3, 4, 5]) AS k) USING k WHERE k < 10 ORDER BY k; SELECT k FROM (SELECT arrayJoin([1, 2, 3]) AS k, 'Hello' AS x) ANY RIGHT JOIN (SELECT range(k) AS y, arrayJoin([3, 4, 5]) AS k) USING k WHERE k < 10 ORDER BY k; + +DROP TABLE IF EXISTS t1; +DROP TABLE IF EXISTS t2; + +CREATE TABLE t1 (k1 UInt32, k2 UInt32, k3 UInt32, val_t1 String) ENGINE=TinyLog; +CREATE TABLE t2 (val_t2 String, k3 UInt32, k2_alias UInt32, k1 UInt32) ENGINE=TinyLog; + +INSERT INTO t1 VALUES (1, 2, 3, 'aaa'), (2, 3, 4, 'bbb'); +INSERT INTO t2 VALUES ('ccc', 4, 3, 2), ('ddd', 7, 6, 5); + +SELECT k1, k2, k3, val_t1, val_t2 FROM t1 ANY FULL JOIN t2 USING (k3, k1, k2 AS k2_alias) ORDER BY k1, k2, k3; + +SELECT k1, k2, k3, val_t1, val_t2 FROM t1 ANY RIGHT JOIN t2 USING (k3, k1, k2 AS k2_alias) ORDER BY k1, k2, k3; diff --git a/dbms/tests/queries/0_stateless/00296_multiple_attaches_zookeeper.reference b/dbms/tests/queries/0_stateless/00296_multiple_attaches_zookeeper.reference index 34908ef6081..ca486aed769 100644 --- a/dbms/tests/queries/0_stateless/00296_multiple_attaches_zookeeper.reference +++ b/dbms/tests/queries/0_stateless/00296_multiple_attaches_zookeeper.reference @@ -43,3 +43,4 @@ 4 4 4 +4 diff --git a/dbms/tests/queries/0_stateless/00296_multiple_attaches_zookeeper.sql b/dbms/tests/queries/0_stateless/00296_multiple_attaches_zookeeper.sql index 3f4fcf02fc6..d38de63a995 
100644 --- a/dbms/tests/queries/0_stateless/00296_multiple_attaches_zookeeper.sql +++ b/dbms/tests/queries/0_stateless/00296_multiple_attaches_zookeeper.sql @@ -63,6 +63,8 @@ SELECT count() FROM system.parts WHERE database = 'test' AND table = 'r1' AND ac SELECT count() FROM system.parts WHERE database = 'test' AND table = 'r2' AND active; -- 4 SELECT count() FROM system.parts WHERE database = 'test' AND table = 'r3' AND active; -- 4 +SELECT sum(rows) FROM system.parts WHERE database = 'test' AND table = 'r1' AND active; -- 4 + DROP TABLE test.r1; DROP TABLE test.r2; DROP TABLE test.r3; diff --git a/dbms/tests/queries/0_stateless/00428_partition.reference b/dbms/tests/queries/0_stateless/00428_partition.reference index e69de29bb2d..788600df41e 100644 --- a/dbms/tests/queries/0_stateless/00428_partition.reference +++ b/dbms/tests/queries/0_stateless/00428_partition.reference @@ -0,0 +1,6 @@ +5 +5 +5 +5 +31,1,2 +1,2,3 diff --git a/dbms/tests/queries/0_stateless/00428_partition.sh b/dbms/tests/queries/0_stateless/00428_partition.sh new file mode 100755 index 00000000000..08705d3d24c --- /dev/null +++ b/dbms/tests/queries/0_stateless/00428_partition.sh @@ -0,0 +1,41 @@ +#!/bin/bash +set -e +# Not found column date in block. There are only columns: x. + +# Test 1. 
Complex test checking columns.txt + +chl="clickhouse-client -q" +ch_dir=`clickhouse --extract-from-config -c /etc/clickhouse-server/config.xml -k path` + +$chl "DROP TABLE IF EXISTS test.partition_428" +$chl "CREATE TABLE test.partition_428 (p Date, k Int8, v1 Int8 MATERIALIZED k + 1) ENGINE = MergeTree(p, k, 1)" +$chl "INSERT INTO test.partition_428 (p, k) VALUES(toDate(31), 1)" +$chl "INSERT INTO test.partition_428 (p, k) VALUES(toDate(1), 2)" + +for part in `$chl "SELECT name FROM system.parts WHERE database='test' AND table='partition_428'"`; do + cat $ch_dir/data/test/partition_428/$part/columns.txt | wc -l # 2 header lines + 3 columns +done + +$chl "ALTER TABLE test.partition_428 DETACH PARTITION 197001" +$chl "ALTER TABLE test.partition_428 ATTACH PARTITION 197001" + +for part in `$chl "SELECT name FROM system.parts WHERE database='test' AND table='partition_428'"`; do + cat $ch_dir/data/test/partition_428/$part/columns.txt | wc -l # 2 header lines + 3 columns +done + +$chl "ALTER TABLE test.partition_428 MODIFY COLUMN v1 Int8" +$chl "OPTIMIZE TABLE test.partition_428" + +$chl "SELECT toUInt16(p), k, v1 FROM test.partition_428 ORDER BY k FORMAT CSV" +$chl "DROP TABLE test.partition_428" + +# Test 2. 
Simple test + +$chl "drop table if exists test.partition_428" +$chl "create table test.partition_428 (date MATERIALIZED toDate(0), x UInt64, sample_key MATERIALIZED intHash64(x)) ENGINE=MergeTree(date,sample_key,(date,x,sample_key),8192)" +$chl "insert into test.partition_428 ( x ) VALUES ( now() )" +$chl "insert into test.partition_428 ( x ) VALUES ( now()+1 )" +$chl "alter table test.partition_428 detach partition 197001" +$chl "alter table test.partition_428 attach partition 197001" +$chl "optimize table test.partition_428" +$chl "drop table test.partition_428" diff --git a/dbms/tests/queries/0_stateless/00428_partition.sql b/dbms/tests/queries/0_stateless/00428_partition.sql deleted file mode 100644 index 7ff9bbd7c19..00000000000 --- a/dbms/tests/queries/0_stateless/00428_partition.sql +++ /dev/null @@ -1,8 +0,0 @@ --- Not found column date in block. There are only columns: x. -create table test.partition_428 (date MATERIALIZED toDate(0), x UInt64, sample_key MATERIALIZED intHash64(x)) ENGINE=MergeTree(date,sample_key,(date,x,sample_key),8192); -insert into test.partition_428 ( x ) VALUES ( now() ); -insert into test.partition_428 ( x ) VALUES ( now()+1 ); -alter table test.partition_428 detach partition 197001; -alter table test.partition_428 attach partition 197001; -optimize table test.partition_428; -drop table test.partition_428; diff --git a/dbms/tests/queries/0_stateless/00432_aggregate_function_scalars_and_constants.reference b/dbms/tests/queries/0_stateless/00432_aggregate_function_scalars_and_constants.reference index 2b71732c082..6087cae7ec5 100644 --- a/dbms/tests/queries/0_stateless/00432_aggregate_function_scalars_and_constants.reference +++ b/dbms/tests/queries/0_stateless/00432_aggregate_function_scalars_and_constants.reference @@ -27,3 +27,6 @@ 1 0 + +1 +2 diff --git a/dbms/tests/queries/0_stateless/00432_aggregate_function_scalars_and_constants.sql b/dbms/tests/queries/0_stateless/00432_aggregate_function_scalars_and_constants.sql index 
8235c6af5e9..e8d9704c3a9 100644 --- a/dbms/tests/queries/0_stateless/00432_aggregate_function_scalars_and_constants.sql +++ b/dbms/tests/queries/0_stateless/00432_aggregate_function_scalars_and_constants.sql @@ -38,3 +38,20 @@ SELECT arrayReduce('groupUniqArrayMergeIf', SELECT ''; SELECT arrayReduce('avgState', [0]) IN (arrayReduce('avgState', [0, 1]), arrayReduce('avgState', [0])); SELECT arrayReduce('avgState', [0]) IN (arrayReduce('avgState', [0, 1]), arrayReduce('avgState', [1])); + +SELECT ''; +SELECT arrayReduce('uniqExactMerge', + [arrayReduce('uniqExactMergeState', + [ + arrayReduce('uniqExactState', [12345678901]), + arrayReduce('uniqExactState', [12345678901]) + ]) + ]); + +SELECT arrayReduce('uniqExactMerge', + [arrayReduce('uniqExactMergeState', + [ + arrayReduce('uniqExactState', [12345678901]), + arrayReduce('uniqExactState', [12345678902]) + ]) + ]); diff --git a/dbms/tests/queries/0_stateless/00453_cast_enum.reference b/dbms/tests/queries/0_stateless/00453_cast_enum.reference new file mode 100644 index 00000000000..5918d812cfe --- /dev/null +++ b/dbms/tests/queries/0_stateless/00453_cast_enum.reference @@ -0,0 +1,4 @@ +session 2017-01-01 0 +session 2017-01-01 1 +pageview 2017-01-01 0 +pageview 2017-01-01 1 diff --git a/dbms/tests/queries/0_stateless/00453_cast_enum.sql b/dbms/tests/queries/0_stateless/00453_cast_enum.sql new file mode 100644 index 00000000000..95f66129fb5 --- /dev/null +++ b/dbms/tests/queries/0_stateless/00453_cast_enum.sql @@ -0,0 +1,14 @@ +DROP TABLE IF EXISTS test.cast_enums; +CREATE TABLE test.cast_enums +( + type Enum8('session' = 1, 'pageview' = 2, 'click' = 3), + date Date, + id UInt64 +) ENGINE = MergeTree(date, (type, date, id), 8192); + +INSERT INTO test.cast_enums SELECT 'session' AS type, toDate('2017-01-01') AS date, number AS id FROM system.numbers LIMIT 2; +INSERT INTO test.cast_enums SELECT 2 AS type, toDate('2017-01-01') AS date, number AS id FROM system.numbers LIMIT 2; + +SELECT type, date, id FROM 
test.cast_enums ORDER BY type, id; + +DROP TABLE IF EXISTS test.cast_enums; diff --git a/dbms/tests/queries/0_stateless/00453_top_k.reference b/dbms/tests/queries/0_stateless/00453_top_k.reference new file mode 100644 index 00000000000..1a768b03965 --- /dev/null +++ b/dbms/tests/queries/0_stateless/00453_top_k.reference @@ -0,0 +1 @@ +[0,1,2,3,4,5,6,7,8,9] diff --git a/dbms/tests/queries/0_stateless/00453_top_k.sql b/dbms/tests/queries/0_stateless/00453_top_k.sql new file mode 100644 index 00000000000..1f79a8c5393 --- /dev/null +++ b/dbms/tests/queries/0_stateless/00453_top_k.sql @@ -0,0 +1 @@ +SELECT topK(10)(n) FROM (SELECT if(number % 100 < 10, number % 10, number) AS n FROM system.numbers LIMIT 100000); \ No newline at end of file diff --git a/dbms/tests/queries/0_stateless/00458_merge_type_cast.reference b/dbms/tests/queries/0_stateless/00458_merge_type_cast.reference new file mode 100644 index 00000000000..196ab90f4a9 --- /dev/null +++ b/dbms/tests/queries/0_stateless/00458_merge_type_cast.reference @@ -0,0 +1,33 @@ + UInt32 | UInt64 + = 1: +1 +1 + 1: +1 +1 + 4294967290: +4294967290 +4294967290 + 4294967299: +4294967299 + Int64 | UInt64 + 1: +1 +1 + -1: + Int32 | UInt64 +1 +1 +2147483650 + String | FixedString(16) +1 +1 + DateTime | UInt64 +1 +1 + Array(UInt32) | Array(UInt64) +[1] +[1] +[4294967290] +[4294967290] +[4294967299] diff --git a/dbms/tests/queries/0_stateless/00458_merge_type_cast.sql b/dbms/tests/queries/0_stateless/00458_merge_type_cast.sql new file mode 100644 index 00000000000..6c4a7bd7661 --- /dev/null +++ b/dbms/tests/queries/0_stateless/00458_merge_type_cast.sql @@ -0,0 +1,137 @@ + +SELECT ' UInt32 | UInt64 '; + +DROP TABLE IF EXISTS test.u32; +DROP TABLE IF EXISTS test.u64; +DROP TABLE IF EXISTS test.merge_32_64; + +CREATE TABLE test.u32 (x UInt32) ENGINE = Memory; +CREATE TABLE test.u64 (x UInt64) ENGINE = Memory; +CREATE TABLE test.merge_32_64 (x UInt64) ENGINE = Merge(test, 'u32|u64'); + +INSERT INTO test.u32 VALUES (1); +INSERT INTO 
test.u64 VALUES (1); + +INSERT INTO test.u32 VALUES (4294967290); +INSERT INTO test.u64 VALUES (4294967290); +--now inserts 3. maybe need out of range check? +--INSERT INTO test.u32 VALUES (4294967299); +INSERT INTO test.u64 VALUES (4294967299); + +select ' = 1:'; +SELECT x FROM test.merge_32_64 WHERE x = 1; +select ' 1:'; +SELECT x FROM test.merge_32_64 WHERE x IN (1); +select ' 4294967290:'; +SELECT x FROM test.merge_32_64 WHERE x IN (4294967290); +select ' 4294967299:'; +SELECT x FROM test.merge_32_64 WHERE x IN (4294967299); +--select ' -1: '; +--SELECT x FROM test.merge_32_64 WHERE x IN (-1); + +DROP TABLE test.u32; +DROP TABLE test.u64; +DROP TABLE test.merge_32_64; + + +SELECT ' Int64 | UInt64 '; + +DROP TABLE IF EXISTS test.s64; +DROP TABLE IF EXISTS test.u64; +DROP TABLE IF EXISTS test.merge_s64_u64; + +CREATE TABLE test.s64 (x Int64) ENGINE = Memory; +CREATE TABLE test.u64 (x UInt64) ENGINE = Memory; +CREATE TABLE test.merge_s64_u64 (x UInt64) ENGINE = Merge(test, 's64|u64'); + +INSERT INTO test.s64 VALUES (1); +INSERT INTO test.s64 VALUES (-1); +INSERT INTO test.u64 VALUES (1); + +select ' 1:'; +SELECT x FROM test.merge_s64_u64 WHERE x IN (1); +select ' -1: '; +SELECT x FROM test.merge_s64_u64 WHERE x IN (-1); + +DROP TABLE test.s64; +DROP TABLE test.u64; +DROP TABLE test.merge_s64_u64; + + +SELECT ' Int32 | UInt64 '; + +DROP TABLE IF EXISTS test.one; +DROP TABLE IF EXISTS test.two; +DROP TABLE IF EXISTS test.merge_one_two; + +CREATE TABLE test.one (x Int32) ENGINE = Memory; +CREATE TABLE test.two (x UInt64) ENGINE = Memory; +CREATE TABLE test.merge_one_two (x UInt64) ENGINE = Merge(test, 'one|two'); + +INSERT INTO test.one VALUES (1); +INSERT INTO test.two VALUES (1); + +INSERT INTO test.one VALUES (2147483650); +INSERT INTO test.two VALUES (2147483650); + +SELECT * FROM test.merge_one_two WHERE x IN (1); +SELECT x FROM test.merge_one_two WHERE x IN (2147483650); +SELECT x FROM test.merge_one_two WHERE x IN (-1); + + +SELECT ' String | FixedString(16) 
'; + +DROP TABLE IF EXISTS test.one; +DROP TABLE IF EXISTS test.two; +DROP TABLE IF EXISTS test.merge_one_two; + +CREATE TABLE test.one (x String) ENGINE = Memory; +CREATE TABLE test.two (x FixedString(16)) ENGINE = Memory; +CREATE TABLE test.merge_one_two (x String) ENGINE = Merge(test, 'one|two'); + +INSERT INTO test.one VALUES ('1'); +INSERT INTO test.two VALUES ('1'); + +SELECT * FROM test.merge_one_two WHERE x IN ('1'); + + +SELECT ' DateTime | UInt64 '; + +DROP TABLE IF EXISTS test.one; +DROP TABLE IF EXISTS test.two; +DROP TABLE IF EXISTS test.merge_one_two; + +CREATE TABLE test.one (x DateTime) ENGINE = Memory; +CREATE TABLE test.two (x UInt64) ENGINE = Memory; +CREATE TABLE test.merge_one_two (x UInt64) ENGINE = Merge(test, 'one|two'); + +INSERT INTO test.one VALUES (1); +INSERT INTO test.two VALUES (1); + +SELECT * FROM test.merge_one_two WHERE x IN (1); + + +SELECT ' Array(UInt32) | Array(UInt64) '; + +DROP TABLE IF EXISTS test.one; +DROP TABLE IF EXISTS test.two; +DROP TABLE IF EXISTS test.merge_one_two; + +CREATE TABLE test.one (x Array(UInt32)) ENGINE = Memory; +CREATE TABLE test.two (x Array(UInt64)) ENGINE = Memory; +CREATE TABLE test.merge_one_two (x Array(UInt64)) ENGINE = Merge(test, 'one|two'); + +INSERT INTO test.one VALUES ([1]); +INSERT INTO test.two VALUES ([1]); +INSERT INTO test.one VALUES ([4294967290]); +INSERT INTO test.two VALUES ([4294967290]); +INSERT INTO test.one VALUES ([4294967299]); +INSERT INTO test.two VALUES ([4294967299]); + +SELECT x FROM test.merge_one_two WHERE x IN (1); +SELECT x FROM test.merge_one_two WHERE x IN (4294967290); +SELECT x FROM test.merge_one_two WHERE x IN (4294967299); + +DROP TABLE IF EXISTS test.one; +DROP TABLE IF EXISTS test.two; +DROP TABLE IF EXISTS test.merge_one_two; diff --git a/dbms/tests/queries/0_stateless/00459_group_array_insert_at.reference b/dbms/tests/queries/0_stateless/00459_group_array_insert_at.reference new file mode 100644 index 00000000000..f55b099b52e --- /dev/null +++ 
b/dbms/tests/queries/0_stateless/00459_group_array_insert_at.reference @@ -0,0 +1,25 @@ +['0','','1','','2','','3','','4','','5','','6','','7','','8','','9'] +['0','-','1','-','2','-','3','-','4','-','5','-','6','-','7','-','8','-','9'] +[[],[123],[0],[123],[0,1],[123],[0,1,2],[123],[0,1,2,3],[123],[0,1,2,3,4],[123],[0,1,2,3,4,5],[123],[0,1,2,3,4,5,6],[123],[0,1,2,3,4,5,6,7],[123],[0,1,2,3,4,5,6,7,8]] +0 [0] +1 [0,1] +2 [0,0,2] +3 [0,0,0,3] +4 [0,0,0,0,4] +5 [0,0,0,0,0,5] +6 [0,0,0,0,0,0,6] +7 [0,0,0,0,0,0,0,7] +8 [0,0,0,0,0,0,0,0,8] +9 [0,0,0,0,0,0,0,0,0,9] +0 0 +0 ['0','-','-','-','-','-','-','-','-','-'] +1 ['-','1','-','-','-','-','-','-','-','-'] +2 ['-','-','2','-','-','-','-','-','-','-'] +3 ['-','-','-','3','-','-','-','-','-','-'] +4 ['-','-','-','-','4','-','-','-','-','-'] +5 ['-','-','-','-','-','5','-','-','-','-'] +6 ['-','-','-','-','-','-','6','-','-','-'] +7 ['-','-','-','-','-','-','-','7','-','-'] +8 ['-','-','-','-','-','-','-','-','8','-'] +9 ['-','-','-','-','-','-','-','-','-','9'] +10 ['-','-','-','-','-','-','-','-','-','-'] diff --git a/dbms/tests/queries/0_stateless/00459_group_array_insert_at.sql b/dbms/tests/queries/0_stateless/00459_group_array_insert_at.sql new file mode 100644 index 00000000000..b692038e9c5 --- /dev/null +++ b/dbms/tests/queries/0_stateless/00459_group_array_insert_at.sql @@ -0,0 +1,6 @@ +SELECT groupArrayInsertAt(toString(number), number * 2) FROM (SELECT * FROM system.numbers LIMIT 10); +SELECT groupArrayInsertAt('-')(toString(number), number * 2) FROM (SELECT * FROM system.numbers LIMIT 10); +SELECT groupArrayInsertAt([123])(range(number), number * 2) FROM (SELECT * FROM system.numbers LIMIT 10); +SELECT number, groupArrayInsertAt(number, number) FROM (SELECT * FROM system.numbers LIMIT 10) GROUP BY number ORDER BY number; +SELECT k, ignore(groupArrayInsertAt(x, x)) FROM (SELECT dummy AS k, randConstant() % 10 AS x FROM remote('127.0.0.{1,1}', system.one)) GROUP BY k ORDER BY k; +SELECT k, groupArrayInsertAt('-', 
10)(toString(x), x) FROM (SELECT number AS k, number AS x FROM system.numbers LIMIT 11) GROUP BY k ORDER BY k; \ No newline at end of file diff --git a/debian/control b/debian/control index 58f7fc2b08c..a91d321d09d 100644 --- a/debian/control +++ b/debian/control @@ -1,9 +1,15 @@ Source: clickhouse Priority: optional Maintainer: Alexey Milovidov -Build-Depends: debhelper (>= 9), cmake, gcc-6, g++-6, - libicu-dev, libreadline-dev, default-libmysqlclient-dev | libmysqlclient-dev, unixodbc-dev, - libglib2.0-dev, libltdl-dev, libssl-dev +Build-Depends: debhelper (>= 9), + cmake, + gcc-6, g++-6, + default-libmysqlclient-dev | libmysqlclient-dev, + libicu-dev, + libltdl-dev, + libreadline-dev, + libssl-dev, + unixodbc-dev Standards-Version: 3.8.0 Section: libs diff --git a/debian/rules b/debian/rules index 0679aa3f968..064919cd19c 100755 --- a/debian/rules +++ b/debian/rules @@ -2,7 +2,7 @@ # -*- makefile -*- # Uncomment this to turn on verbose mode. -export DH_VERBOSE=1 +#export DH_VERBOSE=1 # -pie only for static mode export DEB_BUILD_MAINT_OPTIONS=hardening=+all,-pie @@ -33,6 +33,9 @@ CXX := $(DEB_HOST_GNU_TYPE)-$(DEB_CXX) endif CMAKE_FLAGS ?= -DCMAKE_CXX_COMPILER=`which $(CXX)` -DCMAKE_C_COMPILER=`which $(CC)` -DENABLE_TESTS=0 $(CMAKE_FLAGS_ADD) +ifndef DH_VERBOSE + CMAKE_FLAGS += -DCMAKE_VERBOSE_MAKEFILE=0 +endif %: dh $@ --parallel --buildsystem=cmake --builddirectory=$(BUILDDIR) diff --git a/doc/build_freebsd.sh b/doc/build_freebsd.sh index 92fd419323d..230e1793449 100755 --- a/doc/build_freebsd.sh +++ b/doc/build_freebsd.sh @@ -1,35 +1,46 @@ #!/bin/sh -# How to build ClickHouse under freebsd 11+ -# [temporary solution before port created] +# How to build ClickHouse under freebsd 11+ + +# Variant 1: Use pkg: +# pkg install databases/clickhouse + +# Variant 2: Use ports: +# make -C /usr/ports/databases/clickhouse install clean + +# Run server: +# echo clickhouse_enable="YES" >> /etc/rc.conf.local +# service clickhouse restart + + +# Variant 3: Manual build: # pkg 
install -y curl sudo # curl https://raw.githubusercontent.com/yandex/ClickHouse/master/doc/build_freebsd.sh | sh -# install compiler and libs -sudo pkg install git cmake bash mysql57-client icu libltdl unixODBC google-perftools +# install compiler and libs +sudo pkg install devel/git devel/cmake shells/bash devel/icu devel/libltdl databases/unixODBC devel/google-perftools devel/libzookeeper devel/libdouble-conversion archivers/zstd archivers/liblz4 devel/sparsehash devel/re2 -# install testing only stuff if you want: -sudo pkg install python py27-lxml py27-termcolor curl perl5 +# install testing only stuff if you want: +sudo pkg install lang/python devel/py-lxml devel/py-termcolor ftp/curl perl5 -# Checkout ClickHouse sources +# If you want ODBC support: Check UNIXODBC option: +# make -C /usr/ports/devel/poco config reinstall + +# Checkout ClickHouse sources git clone https://github.com/yandex/ClickHouse.git -# Build! +# Build! mkdir -p ClickHouse/build cd ClickHouse/build -cmake .. -DUSE_INTERNAL_GPERFTOOLS_LIBRARY=0 -# WIP: variant with libs from ports: -# sudo pkg install devel/boost-libs devel/libzookeeper devel/libdouble-conversion archivers/zstd archivers/liblz4 devel/sparsehash devel/re2 -# Check UNIXODBC option: -# make -C /usr/ports/devel/poco config reinstall -# cmake .. -DUNBUNDLED=1 -DUSE_STATIC_LIBRARIES=0 -DNO_WERROR=1 +cmake .. -DUNBUNDLED=1 -DUSE_STATIC_LIBRARIES=0 -DNO_WERROR=1 -DUSE_INTERNAL_BOOST_LIBRARY=1 +# build with boost 1.64 from ports temporary broken make -C dbms/src/Server -j $(nproc || sysctl -n hw.ncpu || echo 2) cd ../.. 
-# run server: +# Run server: # ClickHouse/build/dbms/src/Server/clickhouse --server --config-file=ClickHouse/dbms/src/Server/config.xml & -# run client: +# Run client: # ClickHouse/build/dbms/src/Server/clickhouse --client diff --git a/doc/example_datasets/1_ontime.txt b/doc/example_datasets/1_ontime.txt index 752e242faae..3b9f96de503 100644 --- a/doc/example_datasets/1_ontime.txt +++ b/doc/example_datasets/1_ontime.txt @@ -9,11 +9,11 @@ http://nickmakos.blogspot.ru/2012/08/analyzing-air-traffic-performance-with.html 1. https://github.com/Percona-Lab/ontime-airline-performance/blob/master/download.sh -for s in `seq 1987 2015` +for s in `seq 1987 2017` do for m in `seq 1 12` do -wget http://tsdata.bts.gov/PREZIP/On_Time_On_Time_Performance_${s}_${m}.zip +wget http://transtats.bts.gov/PREZIP/On_Time_On_Time_Performance_${s}_${m}.zip done done diff --git a/doc/presentations/evolution/LICENSE.md b/doc/presentations/evolution/LICENSE.md deleted file mode 100644 index bd3449d3576..00000000000 --- a/doc/presentations/evolution/LICENSE.md +++ /dev/null @@ -1,21 +0,0 @@ -# The MIT License - -Copyright © 2010–2015 Vadim Makeev, http://pepelsbey.net/ - -Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - ---- - -# Лицензия MIT - -Copyright © 2010–2015 Вадим Макеев, http://pepelsbey.net/ - -Данная лицензия разрешает лицам, получившим копию данного программного обеспечения и сопутствующей документации (в дальнейшем именуемыми «Программное Обеспечение»), безвозмездно использовать Программное Обеспечение без ограничений, включая неограниченное право на использование, копирование, изменение, добавление, публикацию, распространение, сублицензирование и/или продажу копий Программного Обеспечения, также как и лицам, которым предоставляется данное Программное Обеспечение, при соблюдении следующих условий: - -Указанное выше уведомление об авторском праве и данные условия должны быть включены во все копии или значимые части данного Программного Обеспечения. - -ДАННОЕ ПРОГРАММНОЕ ОБЕСПЕЧЕНИЕ ПРЕДОСТАВЛЯЕТСЯ «КАК ЕСТЬ», БЕЗ КАКИХ-ЛИБО ГАРАНТИЙ, ЯВНО ВЫРАЖЕННЫХ ИЛИ ПОДРАЗУМЕВАЕМЫХ, ВКЛЮЧАЯ, НО НЕ ОГРАНИЧИВАЯСЬ ГАРАНТИЯМИ ТОВАРНОЙ ПРИГОДНОСТИ, СООТВЕТСТВИЯ ПО ЕГО КОНКРЕТНОМУ НАЗНАЧЕНИЮ И ОТСУТСТВИЯ НАРУШЕНИЙ ПРАВ. НИ В КАКОМ СЛУЧАЕ АВТОРЫ ИЛИ ПРАВООБЛАДАТЕЛИ НЕ НЕСУТ ОТВЕТСТВЕННОСТИ ПО ИСКАМ О ВОЗМЕЩЕНИИ УЩЕРБА, УБЫТКОВ ИЛИ ДРУГИХ ТРЕБОВАНИЙ ПО ДЕЙСТВУЮЩИМ КОНТРАКТАМ, ДЕЛИКТАМ ИЛИ ИНОМУ, ВОЗНИКШИМ ИЗ, ИМЕЮЩИМ ПРИЧИНОЙ ИЛИ СВЯЗАННЫМ С ПРОГРАММНЫМ ОБЕСПЕЧЕНИЕМ ИЛИ ИСПОЛЬЗОВАНИЕМ ПРОГРАММНОГО ОБЕСПЕЧЕНИЯ ИЛИ ИНЫМИ ДЕЙСТВИЯМИ С ПРОГРАММНЫМ ОБЕСПЕЧЕНИЕМ. diff --git a/doc/presentations/evolution/index_en.html b/doc/presentations/evolution/index_en.html deleted file mode 100644 index dab4fd4e883..00000000000 --- a/doc/presentations/evolution/index_en.html +++ /dev/null @@ -1,344 +0,0 @@ - - - - Evolution of data structures in Yandex.Metrica - - - - - - -
-

Evolution of data structures in Yandex.Metrica

-
- -
-

Evolution of data structures
in Yandex.Metrica

-
- -
-

Introduction

-
- -
-

About me

-

Alexey, developer of ClickHouse.

-

I have been working on the data processing engine of Yandex.Metrica since 2008.

-
- -
-

About Yandex

-

Yandex is one of the largest internet companies in Europe
operating Russia’s most popular search engine.

-
- -
-

About Yandex.Metrica

-

Yandex.Metrica (https://metrica.yandex.com/) is a service for web analytics.

-

Largest in Russia, second largest in the world (just after Google Analytics).

-

-

We are processing ~25 billion events (page views, conversions, etc.).

-

We must generate and show reports in realtime.

-
- -
- -
- -
-

History

-
- -
-

How to store data?

-

Big data processing is not a problem.

-

The challenge is how to store data in a way that allows both:

-

- efficient ingestion of click stream in realtime;

-

- efficient generation of reports;

-

 

-

Let's review our historical solutions first...

-
- -
-

MySQL (MyISAM) 2008-2011

-

We have had about 50 predefined report types.

-

We create a table for each of them.

-

Each table has primary key in form of:

-

    site_id, date, key -> aggregated statistics.

-

The data was inserted in mini-batches of aggregated deltas,
using ON DUPLICATE KEY UPDATE.

-

 

-

... but this just doesn't work.

-
- -
-

Data locality on disk (artistic view)

-

The main concern is data locality.

- -
- -
-

MySQL (MyISAM) 2008-2011

-

We use HDD (rotational drives).
We cannot afford petabytes of SSDs.

-

Each seek is ~12 ms of latency,
usually no more than 1000 random reads/second in RAID array.

- -

Time to read data from disk array is dependent on:
- number of seeks;
- total amount of data;

- -

Example: read 100 000 rows, randomly scattered on disk:
- at least 100 seconds in worst case.
User won't wait hundred seconds for the report.

- -

The only way to read data from a disk array in an appropriate amount of time is to minimize the number of seeks by maintaining data locality.

-
- -
-

MySQL (MyISAM) 2008-2011

-

Fundamental problem:

- -

Data is inserted almost in time order: -
- each second we have new portion data for this second; -
- but data for different web sites comes in random order in a stream;

- -

Data is selected by ranges for specified web site and date period: -
- in ranges of completely different order;

-
- -
-

MySQL (MyISAM) 2008-2011

-

MyISAM stores data in MYD and MYI files.
MYD contains data almost in order of insertion.
MYI contains B-tree index that maps a key to offset in MYD file.

-

Insertion of data is almost fine.
But selecting data by a range of the primary key was impractical.

-

Nevertheless, we made it work by:

-

- - tricky partitioning;
- - organizing data in few generations with different partitioning scheme;
- - moving data between tables by scheduled scripts;
- - report generation becomes ugly UNION ALL queries.

-
- -
-

MySQL (MyISAM) 2008-2011

-

As of 2011 we were storing about 580 billion rows in MyISAM tables.

-

We were not satisfied by performance and maintenance cost:
- Example: page title report loading time, 90% quantile was more than 10 seconds.

-

... After all, everything was converted and deleted.

-
- -
-

Metrage, 2010-2015

-

(Specialized data structure, developed specially for aggregation of data and report generation).

-

To maintain data locality, we need
to constantly reorder data by primary key.

-

We cannot maintain desired order at INSERT time, nor on SELECT time;
we must do it in background.

-

Obviously: we need an LSM-tree!

-
- -
-

Metrage, 2010-2015

-

Metrage: Metrica + Aggregated statistics.

-

We have created custom data structure for that purpose.

-

In 2010, there was no LevelDB.
We just got some insights from article about TokuDB.

-

Metrage is designed for the purpose of realtime data aggregation:
- - row in Metrage table is custom C++ struct with update and merge methods.

-

Example: a row in Metrage table could contain a HyperLogLog.

-

Data in Metrage is aggregated:
- on insertion, in batches;
- during background compaction;
- on the fly, during report generation.

-
- -
-

Metrage, 2010-2015

-

Everything was working fine.
The problem of data locality was solved.
Reports were loading quickly.

-

As of 2015 we stored 3.37 trillion rows in Metrage
and used 39 * 2 servers for this.

-

But we have had just ~50 pre-defined reports.

-

No customization or drill-down was possible.

-

The user wants to slice and dice every report by every dimension!

-

 

-

... and we have developed just another custom data structure.

-
- -
-

The report builder, 2010

-

We had quickly made a prototype of so-called "report builder".

-

This was in 2010. It was just a simple specialized column-oriented data structure.

-

It worked fine, and we understood which direction was the right one to go in.

-

We need a good column-oriented DBMS.

-
- -
-

Why column-oriented?

-

This is how "traditional" row-oriented databases work:

-

-
- -
-

Why column-oriented?

-

And this is how column-oriented databases work:

-

-
- -
-

Why column-oriented?

-

Hypothesis:

-

If we have good enough column-oriented DBMS,
we could store all our data in non-aggregated form
(raw pageviews and sessions) and generate all the reports on the fly,
to allow infinite customization.

-

To check this hypothesis, we started to evaluate existing solutions.

-

MonetDB, InfiniDB, Infobright and so on...

-

No appropriate solutions existed in 2010.

-
- -
-

ClickHouse

-

As an experimental project, we started to develop
our own column-oriented DBMS: ClickHouse.

-

In 2012 it was in production state.

-

In 2014 we re-launched Yandex.Metrica as Metrica 2.

-

All data is stored in ClickHouse and in non-aggregated form
and every report is generated on the fly.

-

In Metrica 2 the user could create their own report with
- - custom dimensions, metrics, filters, user-centric segmentation...
- - and to dig through data to the detail of individual visitors.

-
- -
-

ClickHouse

-

The main target for ClickHouse is query execution speed.

-

In Yandex.Metrika, users could analyze data for their web sites of any volume.

-

The biggest classifieds and e-commerce sites with hundreds of millions of PV/day are using Yandex.Metrika (e.g. ru.aliexpress.com).

-

In contrast to GA*, in Yandex.Metrika, you could get data reports for large web sites without sampling.

-

As data is processed on the fly, ClickHouse must be able to crunch all those pageviews in sub-second time.

-

* in Google Analytics you could get reports without sampling only in "premium" version.

-
- -
-

The main cluster of Yandex.Metrica

-
    -
  • 20.6 trillions of rows (as of Feb 2017)
  • -
  • 450 servers
  • -
  • total throughput of query processing is up to two terabytes per second
  • -
-

* If you want to try ClickHouse, one server or VM is enough.

-
- -
-

ClickHouse

-
    -
  • column-oriented
  • -
  • distributed
  • -
  • linearly scalable
  • -
  • fault-tolerant
  • -
  • data ingestion in realtime
  • -
  • realtime (sub-second) queries
  • -
  • support of SQL dialect + extensions
    (arrays, nested data types, domain-specific functions, approximate query execution)
  • -
-
- -
-

Open-source (since June 2016)

-

We think ClickHouse is too good to be used solely by Yandex.

-

We made it open-source. License: Apache 2.0.

-

https://github.com/yandex/ClickHouse/

-
- -
-

Open-source

-

More than 100 companies are already using ClickHouse.

-

Examples: Mail.ru, Cloudflare, Kaspersky...

-
- -
-

When to use ClickHouse

-

For well structured, clean, immutable events.

-

 

-

Click stream. Web analytics. Adv. networks. RTB. E-commerce.

-

Analytics for online games. Sensor and monitoring data. Telecom data.

-
- -
-

When not to use ClickHouse

-

OLTP
ClickHouse doesn't have an UPDATE statement or full-featured transactions.

-

Key-Value
If you want high load of small single-row queries, please use another system.

-

Blob-store, document oriented
ClickHouse is intended for vast amount of fine-grained data.

-

Over-normalized data
It is better to build a single wide fact table with pre-joined dimensions.

-
- -
-

ClickHouse vs. Spark

-

https://www.percona.com/blog/2017/02/13/clickhouse-new-opensource-columnar-database/

- -
- -
-

ClickHouse vs. PrestoDB

-

Ömer Osman Koçak:

- «When we evaluated ClickHouse the results were great compared to Prestodb. Even though the columnar storage optimizations for ORC and Clickhouse is quite similar, Clickhouse uses CPU and Memory resources more efficiently (Presto also uses vectorized execution but cannot take advantage of hardware level optimizations such as SIMD instruction sets because it's written in Java so that's fair) so we also wanted to add support for Clickhouse for our open-source analytics platform Rakam (https://github.com/rakam-io/rakam)»

-
- -
-

ClickHouse vs. InfiniDB

-

«结论:clickhouse速度更快!»

-

«In conclusion, ClickHouse is faster!»

-

http://verynull.com/2016/08/22/infinidb与clickhouse对比/

-

-
- -
-

Why is ClickHouse so fast?

-

 

-

— we just cannot make it slower.

-

Yandex.Metrica must work.

-
- -
-

Why is ClickHouse so fast?

-

Algorithmic optimizations.

-

MergeTree, locality of data on disk
— fast range queries.

-

Example: uniqCombined function is a combination of three different data structures, used for different ranges of cardinalities.

-

Low-level optimizations.

-

Example: vectorized query execution.

-

Specialization and attention to detail.

-

Example: we have 17 different algorithms for GROUP BY. The best one is selected for your query.

-
- -
-

How to connect to ClickHouse

-

HTTP REST

-

clickhouse-client

-

JDBC

-

 

-

Python, PHP, Go, Perl, Ruby, Node.JS, R, .NET

-

 

-

Web UI: https://github.com/smi2/clickhouse-frontend

-

Redash, Zeppelin, Superset, Grafana, PowerBI - these work to some extent

-
- -
-

Community

-

Web site: https://clickhouse.yandex/

-

Google groups: https://groups.google.com/forum/#!forum/clickhouse

-

Mailing list: clickhouse-feedback@yandex-team.com

-

Telegram chat: https://telegram.me/clickhouse_en and https://telegram.me/clickhouse_ru (now with 403 members)

-

GitHub: https://github.com/yandex/ClickHouse/

-

 

-

+ meetups. Moscow, Saint-Petersburg... International meetups will be announced this year.

-
- -
-

 

-

Thank you. Questions.

-
- -
-

Bonus

- -
- -
-

ClickHouse vs. typical row-oriented DBMS

-

Itai Shirav:

«I haven't made a rigorous comparison, but I did convert a time-series table with 9 million rows from Postgres to ClickHouse.

-

Under ClickHouse queries run about 100 times faster, and the table takes 20 times less disk space. Which is pretty amazing if you ask me».

-
- -
-

ClickHouse for sensor data

-

-
- -
-

ClickHouse vs. Greenplum

-

-
- -
- - - diff --git a/doc/presentations/evolution/pictures/column_oriented.gif b/doc/presentations/evolution/pictures/column_oriented.gif deleted file mode 100644 index 15f4b12e697..00000000000 Binary files a/doc/presentations/evolution/pictures/column_oriented.gif and /dev/null differ diff --git a/doc/presentations/evolution/pictures/data_locality.png b/doc/presentations/evolution/pictures/data_locality.png deleted file mode 100644 index 7719eb504b6..00000000000 Binary files a/doc/presentations/evolution/pictures/data_locality.png and /dev/null differ diff --git a/doc/presentations/evolution/pictures/example_french.png b/doc/presentations/evolution/pictures/example_french.png deleted file mode 100644 index f941291164c..00000000000 Binary files a/doc/presentations/evolution/pictures/example_french.png and /dev/null differ diff --git a/doc/presentations/evolution/pictures/greenplum.png b/doc/presentations/evolution/pictures/greenplum.png deleted file mode 100644 index e919a45dadc..00000000000 Binary files a/doc/presentations/evolution/pictures/greenplum.png and /dev/null differ diff --git a/doc/presentations/evolution/pictures/infinidb_cn.png b/doc/presentations/evolution/pictures/infinidb_cn.png deleted file mode 100644 index 957c392a448..00000000000 Binary files a/doc/presentations/evolution/pictures/infinidb_cn.png and /dev/null differ diff --git a/doc/presentations/evolution/pictures/kaspersky.png b/doc/presentations/evolution/pictures/kaspersky.png deleted file mode 100644 index f8aae1da9ee..00000000000 Binary files a/doc/presentations/evolution/pictures/kaspersky.png and /dev/null differ diff --git a/doc/presentations/evolution/pictures/metrika2.png b/doc/presentations/evolution/pictures/metrika2.png deleted file mode 100644 index f53e308e55b..00000000000 Binary files a/doc/presentations/evolution/pictures/metrika2.png and /dev/null differ diff --git a/doc/presentations/evolution/pictures/metrika_market_share.png 
b/doc/presentations/evolution/pictures/metrika_market_share.png deleted file mode 100644 index 9817998acc7..00000000000 Binary files a/doc/presentations/evolution/pictures/metrika_market_share.png and /dev/null differ diff --git a/doc/presentations/evolution/pictures/row_oriented.gif b/doc/presentations/evolution/pictures/row_oriented.gif deleted file mode 100644 index 53daa20f322..00000000000 Binary files a/doc/presentations/evolution/pictures/row_oriented.gif and /dev/null differ diff --git a/doc/presentations/evolution/pictures/spark.png b/doc/presentations/evolution/pictures/spark.png deleted file mode 100644 index 3ae61297631..00000000000 Binary files a/doc/presentations/evolution/pictures/spark.png and /dev/null differ diff --git a/doc/presentations/evolution/pictures/yandex_office.jpg b/doc/presentations/evolution/pictures/yandex_office.jpg deleted file mode 100644 index 1a0d20050ce..00000000000 Binary files a/doc/presentations/evolution/pictures/yandex_office.jpg and /dev/null differ diff --git a/doc/presentations/evolution/shower/shower.min.js b/doc/presentations/evolution/shower/shower.min.js deleted file mode 100644 index 449843ac45d..00000000000 --- a/doc/presentations/evolution/shower/shower.min.js +++ /dev/null @@ -1,8 +0,0 @@ -/** - * Core for Shower HTML presentation engine - * shower-core v2.0.7, https://github.com/shower/core - * @copyright 2010–2016 Vadim Makeev, http://pepelsbey.net/ - * @license MIT - */ -!function(a){var b,c={NOT_RESOLVED:"NOT_RESOLVED",IN_RESOLVING:"IN_RESOLVING",RESOLVED:"RESOLVED"},d=function(){var l={trackCircularDependencies:!0,allowMultipleDeclarations:!0},m={},n=!1,o=[],p=function(a,d,e){e||(e=d,d=[]);var f=m[a];f||(f=m[a]={name:a,decl:b}),f.decl={name:a,prev:f.decl,fn:e,state:c.NOT_RESOLVED,deps:d,dependents:[],exports:b}},q=function(b,c,d){"string"==typeof b&&(b=[b]),n||(n=!0,k(v)),o.push({deps:b,cb:function(b,f){f?(d||e)(f):c.apply(a,b)}})},r=function(a){var b=m[a];return 
b?c[b.decl.state]:"NOT_DEFINED"},s=function(a){return!!m[a]},t=function(a){for(var b in a)a.hasOwnProperty(b)&&(l[b]=a[b])},u=function(){var a,b={};for(var c in m)m.hasOwnProperty(c)&&(a=m[c],(b[a.decl.state]||(b[a.decl.state]=[])).push(c));return b},v=function(){n=!1,w()},w=function(){var a,b=o,c=0;for(o=[];a=b[c++];)x(null,a.deps,[],a.cb)},x=function(a,b,c,d){var e=b.length;e||d([]);for(var g,h,i=[],j=function(a,b){if(b)return void d(null,b);if(!--e){for(var c,f=[],g=0;c=i[g++];)f.push(c.exports);d(f)}},k=0,l=e;k ")+'"')},h=function(a){return Error('Declaration of module "'+a.name+'" has already been provided')},i=function(a){return Error('Multiple declarations of module "'+a.name+'" have been detected')},j=function(a,b){for(var c,d=0;c=b[d++];)if(a===c)return!0;return!1},k=function(){var b=[],c=function(a){return 1===b.push(a)},d=function(){var a=b,c=0,d=b.length;for(b=[];c=0&&!b.defaultPrevented();){var d=a[c];d&&(d.context?d.callback.call(d.context,b):d.callback(b)),c--}}}),a(e)}),shower.modules.define("Plugins",["Emitter","util.extend"],function(a,b,c){function d(a){this.events=new b({context:this}),this._showerGlobal=a,this._showerInstances=a.getInited(),this._plugins={},this._instances=[],a.events.on("init",this._onShowerInit,this)}c(d.prototype,{destroy:function(){this._showerGlobal.events.off("init",this._onShowerInit,this),this._plugins=null},add:function(a,b){if(this._plugins.hasOwnProperty(a))throw new Error("Plugin "+a+" already exist.");return this._requireAndAdd({name:a,options:b}),this},remove:function(a){if(!this._plugins.hasOwnProperty(a))throw new Error("Plugin "+a+" not found.");return delete this._plugins[a],this.events.emit("remove",{name:a}),this},get:function(a,b){var c,d=this._plugins[a];if(d&&b)for(var 
e=0,f=this._instances.length;e=0;e--)if(d[e].getId()===a){b=d[e],c=e;break}return{slide:b,index:c}},_onSlideActivate:function(a){window.location.hash=a.get("slide").getId(),this._setTitle()},_onContainerSlideModeChange:function(){this._setTitle(),this.save()},_isSlideMode:function(){return this._shower.container.isSlideMode()},_onPopstate:function(){var a,b=this._shower,c=window.location.hash.substr(1),d=b.player.getCurrentSlide(),e=b.player.getCurrentSlideIndex();this._isSlideMode()&&e===-1?b.player.go(0):e===-1&&""!==window.location.hash&&b.player.go(0),d&&c!==d.getId()&&(a=this._getSlideById(c),b.player.go(a.index))},_setTitle:function(){var a=document.title,b=this._isSlideMode(),c=this._shower.player.getCurrentSlide();if(b&&c){var d=c.getTitle();document.title=d?d+" — "+this._documentTitle:this._documentTitle}else this._documentTitle!==a&&(document.title=this._documentTitle)}}),a(e)}),shower.modules.define("shower.Player",["Emitter","util.bound","util.extend"],function(a,b,c,d){function e(a){this.events=new b({context:this,parent:a.events}),this._shower=a,this._showerListeners=null,this._playerListeners=null,this._currentSlideNumber=-1,this._currentSlide=null,this.init()}d(e.prototype,{init:function(){this._showerListeners=this._shower.events.group().on("slideadd",this._onSlideAdd,this).on("slideremove",this._onSlideRemove,this).on("slidemodeenter",this._onSlideModeEnter,this),this._playerListeners=this.events.group().on("prev",this._onPrev,this).on("next",this._onNext,this),document.addEventListener("keydown",c(this,"_onKeyDown"))},destroy:function(){this._showerListeners.offAll(),this._playerListeners.offAll(),document.removeEventListener("keydown",c(this,"_onKeyDown")),this._currentSlide=null,this._currentSlideNumber=null,this._shower=null},next:function(){return this.events.emit("next"),this},prev:function(){return this.events.emit("prev"),this},first:function(){return this.go(0),this},last:function(){return 
this.go(this._shower.getSlidesCount()-1),this},go:function(a){"number"!=typeof a&&(a=this._shower.getSlideIndex(a));var b=this._shower.getSlidesCount(),c=this._currentSlide;return a!=this._currentSlideNumber&&a=0&&(c&&c.isActive()&&c.deactivate(),c=this._shower.get(a),this._currentSlide=c,this._currentSlideNumber=a,c.isActive()||c.activate(),this.events.emit("activate",{index:a,slide:c})),this},getCurrentSlide:function(){return this._currentSlide},getCurrentSlideIndex:function(){return this._currentSlideNumber},_onPrev:function(){this._changeSlide(this._currentSlideNumber-1)},_onNext:function(){this._changeSlide(this._currentSlideNumber+1)},_changeSlide:function(a){this.go(a)},_onSlideAdd:function(a){var b=a.get("slide");b.events.on("activate",this._onSlideActivate,this)},_onSlideRemove:function(a){var b=a.get("slide");b.events.off("activate",this._onSlideActivate,this)},_onSlideActivate:function(a){var b=a.get("slide"),c=this._shower.getSlideIndex(b);this.go(c)},_onKeyDown:function(a){if(this._shower.isHotkeysEnabled()&&!/^(?:button|input|select|textarea)$/i.test(a.target.tagName))switch(this.events.emit("keydown",{event:a}),a.which){case 33:case 38:case 37:case 72:case 75:if(a.altKey||a.ctrlKey||a.metaKey)return;a.preventDefault(),this.prev();break;case 34:case 40:case 39:case 76:case 74:if(a.altKey||a.ctrlKey||a.metaKey)return;a.preventDefault(),this.next();break;case 36:a.preventDefault(),this.first();break;case 35:a.preventDefault(),this.last();break;case 32:this._shower.container.isSlideMode()&&(a.shiftKey?this.prev():this.next())}},_onSlideModeEnter:function(){this._currentSlide||this.go(0)}}),a(e)}),shower.modules.define("shower.slidesParser",["Slide"],function(a,b){function c(a,c){var d=a.querySelectorAll(c);return d=Array.prototype.slice.call(d),d.map(function(a,c){var d=new b(a);return 
a.id||(a.id=c+1),d})}a(c)}),shower.modules.define("Slide",["shower.defaultOptions","Emitter","Options","slide.Layout","slide.layoutFactory","util.Store","util.extend"],function(a,b,c,d,e,f,g,h){function i(a,b,e){this.events=new c,this.options=new d(b),this.layout=null,this.state=new g({visited:0,index:null},e),this._content=a,this._isVisited=this.state.get("visited")>0,this._isActive=!1,this.init()}h(i.prototype,{init:function(){this.layout="string"==typeof this._content?new f.createLayout({content:this._content}):new e(this._content,this.options),this.layout.setParent(this),this._setupListeners()},destroy:function(){this._clearListeners(),this._isActive=null,this.options=null,this.layout.destroy()},activate:function(){this._isActive=!0;var a=this.state.get("visited");return this.state.set("visited",++a),this.events.emit("activate",{slide:this}),this},deactivate:function(){return this._isActive=!1,this.events.emit("deactivate",{slide:this}),this},isActive:function(){return this._isActive},isVisited:function(){return this.state.get("visited")>0},getTitle:function(){return this.layout.getTitle()},setTitle:function(a){return this.layout.setTitle(a),this},getId:function(){return this.layout.getElement().id},getContent:function(){return this.layout.getContent()},_setupListeners:function(){this.layoutListeners=this.layout.events.group().on("click",this._onSlideClick,this)},_clearListeners:function(){this.layoutListeners.offAll()},_onSlideClick:function(){this.activate(),this.events.emit("click",{slide:this})}}),a(i)}),shower.modules.define("slide.Layout",["Options","shower.defaultOptions","Emitter","util.bound","util.extend"],function(a,b,c,d,e,f){function g(a,e){this.options=new b({title_element_selector:c.slide_title_element_selector,active_classname:c.slide_active_classname,visited_classname:c.slide_visited_classname},e),this.events=new d,this._element=a,this._parent=null,this._parentElement=null,this.init()}f(g.prototype,{init:function(){var 
a=this._element.parentNode;a?this._parentElement=a:this.setParentElement(a)},destroy:function(){this.setParent(null)},setParent:function(a){this._parent!=a&&(this._clearListeners(),this._parent=a,this._parent&&this._setupListeners(),this.events.emit("parentchange",{parent:a}))},getParent:function(){return this._parent},setParentElement:function(a){a!=this._parentElement&&(this._parentElement=a,a.appendChild(this._element),this.events.emit("parentelementchange",{parentElement:a}))},getParentElement:function(){return this._parentElement},getElement:function(){return this._element},setTitle:function(a){var b=this.options.get("title_element_selector"),c=this._element.querySelector(b);c?c.innerHTML=a:(c=document.createElement(b),c.innerHTML=a,this._element.insertBefore(c,this._element.firstChild))},getTitle:function(){var a=this.options.get("title_element_selector"),b=this._element.querySelector(a);return b?b.textContent:null},getData:function(a){var b=this._element;return b.dataset?b.dataset[a]:b.getAttribute("data-"+a)},getContent:function(){return this._element.innerHTML},_setupListeners:function(){this._slideListeners=this._parent.events.group().on("activate",this._onSlideActivate,this).on("deactivate",this._onSlideDeactivate,this),this._element.addEventListener("click",e(this,"_onSlideClick"),!1)},_clearListeners:function(){this._slideListeners&&this._slideListeners.offAll(),this._element.removeEventListener("click",e(this,"_onSlideClick"))},_onSlideActivate:function(){this._element.classList.add(this.options.get("active_classname"))},_onSlideDeactivate:function(){var a=this._element.classList;a.remove(this.options.get("active_classname")),a.add(this.options.get("visited_classname"))},_onSlideClick:function(){this.events.emit("click")}}),a(g)}),shower.modules.define("slide.layoutFactory",["slide.Layout","util.extend"],function(a,b,c){var d={};c(d,{createLayout:function(a){a=a||{};var e=d._createElement(c({content:"",contentType:"slide"},a));return new 
b(e)},_createElement:function(a){var b=document.createElement("section");return b.innerHTML=a.content,b.classList.add(a.contentType),b}}),a(d)}),shower.modules.define("util.bound",function(a){function b(a,b){return a["__bound_"+b]||(a["__bound_"+b]=a[b].bind(a))}a(b)}),shower.modules.define("util.extend",function(a){function b(a){if(!a)throw new Error("util.extend: Target not found");return"undefined"==typeof Object.assign?c.apply(null,arguments):Object.assign.apply(null,arguments)}function c(a){for(var b=1,c=arguments.length;b0&&(a.preventDefault(),this.prev())},_go:function(){for(var a=0,b=this._elements.length;awindow.innerWidth/2?c.player.next():c.player.prev()),d||f.activate())},_onTouchMove:function(a){this._shower.container.isSlideMode()&&a.preventDefault()},_getSlideByElement:function(a){for(var b=this._shower.getSlides(),c=null,d=0,e=b.length;d` of your presentation. - -## PDF - -Ribbon could be exported to PDF by printing it from the list mode in Chrome or Opera browsers. See [printing documentation](https://github.com/shower/shower/blob/master/docs/printing-en.md) for more options. - -## Development - -If you want to adjust theme for your needs: - -1. Fork this repository and clone it to your local machine. -2. Install dependencies: `npm install`. -3. Start a local server with watcher: `npm run dev` or just `gulp` if you have it installed globally. -4. Edit your files and see changes in the opened browser. - -To take part in Ribbon development please read [contributing guidelines](CONTRIBUTING.md) first and [file an issue](https://github.com/shower/shower/issues/new) before sending any pull request. - ---- -Licensed under [MIT License](LICENSE.md). 
diff --git a/doc/presentations/evolution/shower/themes/ribbon/index.html b/doc/presentations/evolution/shower/themes/ribbon/index.html deleted file mode 100644 index 98850917e05..00000000000 --- a/doc/presentations/evolution/shower/themes/ribbon/index.html +++ /dev/null @@ -1,304 +0,0 @@ - - - - Ribbon theme for Shower - - - - - - -
-

Presentation Title

-

Yours Truly, Famous Inc.

-
-
-

Slide Header

-

Echo Park 8-bit sustainable umami deep v Kickstarter. DIY cliche typewriter brunch, Odd Future sriracha pickled aesthetic. Farm-to-table bespoke fingerstache, kale chips umami brunch letterpress.

-

Whatever authentic disrupt, you probably haven't heard of them direct trade mlkshk Etsy. Gluten-free roof party plaid four loko quinoa.

-

Echo Park 8-bit sustainable umami deep v Kickstarter.

-
-
-

Inline Elements

-

Retro meh brunch aesthetic Cosby sweater Shoreditch. Banksy Tumblr sriracha, flexitarian pug chia master cleanse vinyl wayfarers fanny pack bespoke Helvetica roof party. Messenger bag retro cred Portland next level. Yr stumptown Schlitz Carles deep v small batch. Hella sustainable messenger bag, leggings skateboard literally1 bicycle rights H20 mumblecore banh mi DIY VHS. Semiotics four loko street art asymmetrical. Asymmetrical <paleo> you probably haven’t heard of.

-
-

Retro meh brunch aesthetic Cosby sweater Shoreditch. Banksy Tumblr sriracha, flexitarian pug chia master cleanse vinyl wayfarers fanny pack bespoke Helvetica roof party. Messenger bag retro cred Portland next level. Yr stumptown Schlitz Carles deep v small batch. Hella sustainable messenger bag, leggings skateboard literally1 bicycle rights H20 mumblecore banh mi DIY VHS. Semiotics four loko street art asymmetrical. Asymmetrical <paleo> you probably haven’t heard of.

-
-
-
-

Quotes

-
-

Flannel bicycle rights locavore selfies skateboard. Authentic fanny pack paleo four loko bespoke. Artisan tattooed chia XOXO ennui, lomo disrupt 8-bit art party Tumblr scenester.

-
-
-
-

Post-ironic fashion axe flexitarian, Tonx narwhal messenger bag Tumblr. Portland gentrify deep v kale chips literally.

-
-
Yours Truly
-
-
-
-

Nested Lists

-
    -
  1. Literally viral vegan, ugh drinking vinegar photo booth
  2. -
  3. Wes Anderson chillwave Marfa pour-over Etsy banh mi
  4. -
  5. Ethnic polaroid lo-fi iPhone ennui -
      -
    • Yr wayfarers before they sold out Kickstarter asymmetrical
    • -
    • Irony flexitarian readymade quinoa, kogi bespoke meggings narwhal
    • -
    • Skateboard Etsy twee artisan Echo Park
    • -
    -
  6. -
  7. Tonx kitsch fingerstache readymade, retro single-origin coffee
  8. -
-
-
-

Block Lists

-
    -
  • Retro meh brunch aesthetic Cosby sweater Shoreditch. Banksy Tumblr sriracha, flexitarian pug chia master cleanse vinyl wayfarers fanny pack.
  • -
  • Messenger bag retro cred Portland next level. Yr stumptown Schlitz Carles deep v small batch. Hella sustainable messenger bag.
  • -
  • Leggings skateboard literally1 bicycle rights H20 mumblecore banh mi DIY VHS. Semiotics four loko street art asymmetrical. Asymmetrical paleo you probably haven’t heard of.
  • -
-
-
-

Latin and Cyrillic List Bullets

-
    -
  • Occupy locavore blog, mustache you probably haven't heard of them
  • -
  • Skateboard pork belly aesthetic hoodie selfies brunch
  • -
  • Food truck gluten-free disrupt Portland
  • -
-
    -
  • Helvetica narwhal drinking vinegar chillwave, post-ironic ennui
  • -
  • Cray pug paleo retro, Echo Park narwhal Wes Anderson
  • -
  • Disrupt Williamsburg fixie, shabby chic bicycle rights hashtag kogi
  • -
-
-
-

Two Columns

-

Echo Park 8-bit sustainable umami deep v Kickstarter. DIY cliche typewriter brunch, Odd Future sriracha pickled aesthetic. Farm-to-table bespoke fingerstache, kale chips umami brunch American Apparel letterpress. Whatever authentic disrupt, you probably haven't heard of them direct trade mlkshk Etsy. Gluten-free roof party plaid American Apparel four loko quinoa. Echo Park 8-bit sustainable.

-
-
-

Three Columns

-

Echo Park 8-bit sustainable umami deep v Kickstarter. DIY cliche typewriter brunch, Odd Future sriracha pickled aesthetic. Farm-to-table bespoke fingerstache, kale chips umami brunch American Apparel letterpress. Whatever authentic disrupt, you probably haven't heard of them direct trade mlkshk Etsy. Gluten-free roof party plaid American Apparel four loko quinoa. Echo Park 8-bit sustainable.

-
-
-

Simple Table

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
GentrifyTweeArtisanBanksy
MessengerMixtapeSmall batchBicycle rights
MeggingsFreeganRetro biodieselSlow-carb
VeganOccupyNormcorePut a bird on it
Next levelSelfiesSustainableOrganic
UmamiAsymmetricalKeytarCraft beer
BiodieselHaven’t heardSkateboardFarm-to-table
-
-
-

Striped Table

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
GentrifyTweeArtisanBanksy
MessengerMixtapeSmall batchBicycle rights
MeggingsFreeganRetro biodieselSlow-carb
VeganOccupyNormcorePut a bird on it
Next levelSelfiesSustainableOrganic
UmamiAsymmetricalKeytarCraft beer
BiodieselHaven’t heardSkateboardFarm-to-table
-
-
-

Plain Code Listing

-
<html lang="en">
-<head> <!--Comment-->
-    <title>Shower</title>
-    <meta charset="UTF-8">
-    <link rel="stylesheet" href="screen.css">
-    <script src="script.js"></script>
-</head>
-
-
-

Numbered Code Listing

-
-            <html lang="en">
-            <head> <!--Comment-->
-                <title>Shower</title>
-                <meta charset="UTF-8">
-                <link rel="stylesheet" href="screen.css">
-                <script src="script.js"></script>
-            </head>
-        
-
-
-

Highlighted Code Lines

-
-            <html lang="en">
-            <head> <!--Comment-->
-                <title>Shower</title>
-                <meta charset="UTF-8">
-                <link rel="stylesheet" href="screen.css">
-                <script src="script.js"></script>
-            </head>
-        
-
-
-

Hidden Code Steps

-
-            <html lang="en">
-            <head> <!--Comment-->
-                <title>Shower</title>
-                <meta charset="UTF-8">
-                <link rel="stylesheet" href="screen.css">
-                <script src="script.js"></script>
-            </head>
-        
-
-
-

Highlighted Code Steps

-
-            <html lang="en">
-            <head> <!--Comment-->
-                <title>Shower</title>
-                <meta charset="UTF-8">
-                <link rel="stylesheet" href="screen.css">
-                <script src="script.js"></script>
-            </head>
-        
-
-
-

Grid Slide

-
-
-

White Slide

-
-
-

Black Slide

-
-
-

Shout

-
-
-

Multiline
Shout

-
-
-

Linked Shout

-
-
-

Growing Shout

-
-
-

Shrinking Shout

-
-
-
- -
Copyright © 2016 Yours Truly, Famous Inc.
-
-
-
- -
-
- -
-
- - - - - - - - - -
-
-

Timer

-
-
-

List Navigation

-
    -
  1. Ennui keffiyeh thundercats
  2. - - - - -
-

Before they sold out master

-
-
- - - - - diff --git a/doc/presentations/evolution/shower/themes/ribbon/pictures/canvas.png b/doc/presentations/evolution/shower/themes/ribbon/pictures/canvas.png deleted file mode 100644 index 6ddd30154f2..00000000000 Binary files a/doc/presentations/evolution/shower/themes/ribbon/pictures/canvas.png and /dev/null differ diff --git a/doc/presentations/evolution/shower/themes/ribbon/pictures/exact.png b/doc/presentations/evolution/shower/themes/ribbon/pictures/exact.png deleted file mode 100644 index b27251c57cb..00000000000 Binary files a/doc/presentations/evolution/shower/themes/ribbon/pictures/exact.png and /dev/null differ diff --git a/doc/presentations/evolution/shower/themes/ribbon/pictures/square.png b/doc/presentations/evolution/shower/themes/ribbon/pictures/square.png deleted file mode 100644 index 62cb2384a5f..00000000000 Binary files a/doc/presentations/evolution/shower/themes/ribbon/pictures/square.png and /dev/null differ diff --git a/doc/presentations/evolution/shower/themes/ribbon/pictures/tall.png b/doc/presentations/evolution/shower/themes/ribbon/pictures/tall.png deleted file mode 100644 index fbc9f09a2ab..00000000000 Binary files a/doc/presentations/evolution/shower/themes/ribbon/pictures/tall.png and /dev/null differ diff --git a/doc/presentations/evolution/shower/themes/ribbon/pictures/wide.png b/doc/presentations/evolution/shower/themes/ribbon/pictures/wide.png deleted file mode 100644 index 1e83b0ac7ad..00000000000 Binary files a/doc/presentations/evolution/shower/themes/ribbon/pictures/wide.png and /dev/null differ diff --git a/doc/presentations/evolution/shower/themes/ribbon/styles/presentation_links.html b/doc/presentations/evolution/shower/themes/ribbon/styles/presentation_links.html deleted file mode 100644 index 74981d8eac5..00000000000 --- a/doc/presentations/evolution/shower/themes/ribbon/styles/presentation_links.html +++ /dev/null @@ -1,2 +0,0 @@ 
-http://www.slideshare.net/AlexeyMilovidov1/clickhouse-69616890/AlexeyMilovidov1/clickhouse-69616890 -file:///home/milovidov/work/Presentation/shower/index.html#cover diff --git a/doc/presentations/evolution/shower/themes/ribbon/styles/screen-16x10.css b/doc/presentations/evolution/shower/themes/ribbon/styles/screen-16x10.css deleted file mode 100644 index 5ea77cc9961..00000000000 --- a/doc/presentations/evolution/shower/themes/ribbon/styles/screen-16x10.css +++ /dev/null @@ -1,204 +0,0 @@ -/** - * Ribbon theme for Shower HTML presentation engine - * shower-ribbon v2.0.8, https://github.com/shower/ribbon - * @copyright 2010–2016 Vadim Makeev, http://pepelsbey.net/ - * @license MIT - */ -@charset "UTF-8"; - - @font-face { - font-family: 'Yandex Sans Text Web'; - src: url(https://yastatic.net/adv-www/_/yy5JveR58JFkc97waf-xp0i6_jM.eot); - src: url(https://yastatic.net/adv-www/_/yy5JveR58JFkc97waf-xp0i6_jM.eot?#iefix) format('embedded-opentype'), - url(https://yastatic.net/adv-www/_/CYblzLEXzCqQIvrYs7QKQe2omRk.woff2) format('woff2'), - url(https://yastatic.net/adv-www/_/pUcnOdRwl83MvPPzrNomhyletnA.woff) format('woff'), - url(https://yastatic.net/adv-www/_/vNFEmXOcGYKJ4AAidUprHWoXrLU.ttf) format('truetype'), - url(https://yastatic.net/adv-www/_/0w7OcWZM_QLP8x-LQUXFOgXO6dE.svg#YandexSansTextWeb-Bold) format('svg'); - font-weight: 700; - font-style: normal; - font-stretch: normal - } - - @font-face { - font-family: 'Yandex Sans Text Web'; - src: url(https://yastatic.net/adv-www/_/LI6l3L2RqcgxBe2pXmuUha37czQ.eot); - src: url(https://yastatic.net/adv-www/_/LI6l3L2RqcgxBe2pXmuUha37czQ.eot?#iefix) format('embedded-opentype'), - url(https://yastatic.net/adv-www/_/z3MYElcut0R2MF_Iw1RDNrstgYs.woff2) format('woff2'), - url(https://yastatic.net/adv-www/_/1jvKJ_-hCXl3s7gmFl-y_-UHTaI.woff) format('woff'), - url(https://yastatic.net/adv-www/_/9nzjfpCR2QHvK1EzHpDEIoVFGuY.ttf) format('truetype'), - 
url(https://yastatic.net/adv-www/_/gwyBTpxSwkFCF1looxqs6JokKls.svg#YandexSansTextWeb-Regular) format('svg'); - font-weight: 400; - font-style: normal; - font-stretch: normal - } - - @font-face { - font-family: 'Yandex Sans Text Web'; - src: url(https://yastatic.net/adv-www/_/ayAFYoY8swgBLhq_I56tKj2JftU.eot); - src: url(https://yastatic.net/adv-www/_/ayAFYoY8swgBLhq_I56tKj2JftU.eot?#iefix) format('embedded-opentype'), - url(https://yastatic.net/adv-www/_/lGQcYklLVV0hyvz1HFmFsUTj8_0.woff2) format('woff2'), - url(https://yastatic.net/adv-www/_/f0AAJ9GJ4iiwEmhG-7PWMHk6vUY.woff) format('woff'), - url(https://yastatic.net/adv-www/_/4UDe4nlVvgEJ-VmLWNVq3SxCsA.ttf) format('truetype'), - url(https://yastatic.net/adv-www/_/EKLr1STNokPqxLAQa_RyN82pL98.svg#YandexSansTextWeb-Light) format('svg'); - font-weight: 300; - font-style: normal; - font-stretch: normal - } - - @font-face { - font-family: 'Yandex Sans Display Web'; - src: url(https://yastatic.net/adv-www/_/H63jN0veW07XQUIA2317lr9UIm8.eot); - src: url(https://yastatic.net/adv-www/_/H63jN0veW07XQUIA2317lr9UIm8.eot?#iefix) format('embedded-opentype'), - url(https://yastatic.net/adv-www/_/sUYVCPUAQE7ExrvMS7FoISoO83s.woff2) format('woff2'), - url(https://yastatic.net/adv-www/_/v2Sve_obH3rKm6rKrtSQpf-eB7U.woff) format('woff'), - url(https://yastatic.net/adv-www/_/PzD8hWLMunow5i3RfJ6WQJAL7aI.ttf) format('truetype'), - url(https://yastatic.net/adv-www/_/lF_KG5g4tpQNlYIgA0e77fBSZ5s.svg#YandexSansDisplayWeb-Regular) format('svg'); - font-weight: 400; - font-style: normal; - font-stretch: normal - } - - @font-face { - font-family: 'Yandex Sans Display Web'; - src: url(https://yastatic.net/adv-www/_/g8_MyyKVquSZ3xEL6tarK__V9Vw.eot); - src: url(https://yastatic.net/adv-www/_/g8_MyyKVquSZ3xEL6tarK__V9Vw.eot?#iefix) format('embedded-opentype'), - url(https://yastatic.net/adv-www/_/LGiRvlfqQHlWR9YKLhsw5e7KGNA.woff2) format('woff2'), - url(https://yastatic.net/adv-www/_/40vXwNl4eYYMgteIVgLP49dwmfc.woff) format('woff'), - 
url(https://yastatic.net/adv-www/_/X6zG5x_wO8-AtwJ-vDLJcKC5228.ttf) format('truetype'), - url(https://yastatic.net/adv-www/_/ZKhaR0m08c8CRRL77GtFKoHcLYA.svg#YandexSansDisplayWeb-Light) format('svg'); - font-weight: 300; - font-style: normal; - font-stretch: normal - } - -*,::after,::before{box-sizing:border-box} -a,abbr,acronym,address,applet,article,aside,audio,b,big,blockquote,body,canvas,caption,center,cite,code,dd,del,details,dfn,div,dl,dt,em,embed,fieldset,figcaption,figure,footer,form,h1,h2,h3,h4,h5,h6,header,hgroup,html,i,iframe,img,ins,kbd,label,legend,li,mark,menu,nav,object,ol,output,p,pre,q,ruby,s,samp,section,small,span,strike,strong,sub,summary,sup,table,tbody,td,tfoot,th,thead,time,tr,tt,u,ul,var,video{margin:0;padding:0;border:0;font:inherit;vertical-align:baseline} -article,aside,details,figcaption,figure,footer,header,hgroup,main,menu,nav,section{display:block} -.caption p,body{line-height:1} -p {line-height: 1} -ol,ul{list-style:none} -blockquote,q{quotes:none} -blockquote::after,blockquote::before,q::after,q::before{content:none} -table{border-collapse:collapse;border-spacing:0} -a{text-decoration:none} -@page{margin:0;size:1024px 640px} -.shower{color:#000;counter-reset:slide;font:25px/2 Yandex Sans Display Web,sans-serif;-webkit-print-color-adjust:exact;-webkit-text-size-adjust:none;-moz-text-size-adjust:none;-ms-text-size-adjust:none} -@media print{.shower{text-rendering:geometricPrecision} -} -.caption{font-size:25px;display:none;margin-top:-.2em;padding:0 1em .93em;width:100%;color:#3c3d40;text-shadow:0 1px 0 #8d8e90} -@media (min-width:1174px){.caption{font-size:50px} -} -@media (min-width:2348px){.caption{font-size:100px} -} -.caption h1{padding-bottom:.15em;font:1em/2 Yandex Sans Display Web,sans-serif} -.caption p{font-size:.6em} -.caption a{color:#4b86c2;text-shadow:0 -1px 0 #1f3f60} -.slide{position:relative;z-index:1;overflow:hidden;padding:20px 100px 0;width:1024px;height:640px;background:#fff;font-size:25px} - 
-/*.slide::after{position:absolute;top:0;right:100px;padding-top:15px;width:50px;height:100px;background:url(../images/ribbon.svg) no-repeat;color:#fff;counter-increment:slide;content:counter(slide);text-align:center}*/ - -.slide h1{vertical-align:middle; color:#000;font:400 50px/2 Yandex Sans Display Web,sans-serif} -.slide h2{margin-bottom:34px;color:#000;font:400 50px/2 Yandex Sans Display Web,sans-serif} -.slide p{margin-bottom:1em} -.slide p.note{color:#979a9e} -.slide a{background:-webkit-linear-gradient(bottom,currentColor .09em,transparent .09em) repeat-x;background:linear-gradient(to top,currentColor .09em,transparent .09em) repeat-x;color:#4b86c2} -.slide b,.slide strong{font-weight:700} -.slide blockquote,.slide dfn,.slide em,.slide i{font-style:italic} -.slide code,.slide kbd,.slide mark,.slide samp{padding:.1em .3em;border-radius:.2em} -.slide code,.slide kbd,.slide samp{background:rgba(88,90,94,.1);line-height:1;font-family:PT Mono,monospace,monospace} -.slide mark{background:#fafaa2} -.slide sub,.slide sup{position:relative;line-height:0;font-size:75%} -.slide sub{bottom:-.25em} -.slide sup{top:-.5em} -.slide blockquote::before{position:absolute;margin:-.15em 0 0 -.43em;color:#ccc;line-height:1;font-size:8em;content:'\201C'} -.slide blockquote+figcaption{margin:-1em 0 1em;font-style:italic;font-weight:700} -.slide ol,.slide ul{margin-bottom:0em;counter-reset:list} -.slide ol li,.slide ul li{page-break-inside:avoid;text-indent:-2em} -.slide ol li::before,.slide ul li::before{display:inline-block;width:2em;color:#979a9e;text-align:right} -.slide ol ol,.slide ol ul,.slide ul ol,.slide ul ul{margin-bottom:0;margin-left:2em} -.slide ul>li::before{padding-right:.5em;content:'•'} -.slide ul>li:lang(ru)::before{content:'—'} -.slide ol>li::before{padding-right:.4em;counter-increment:list;content:counter(list) "."} -.slide table{margin-left:-100px;margin-bottom:1em;width:calc(100% + 100px + 100px)} -.slide table td:first-child,.slide table 
th:first-child{padding-left:96px} -.slide table td:last-child,.slide table th:last-child{padding-right:96px} -.slide table th{text-align:left;font-weight:700} -.slide table tr:not(:last-of-type)>*{background:-webkit-linear-gradient(bottom,rgba(88,90,94,.5) .055em,transparent .055em) repeat-x;background:linear-gradient(to top,rgba(88,90,94,.5) .055em,transparent .055em) repeat-x} -.slide table.striped tr:nth-child(even){background:rgba(88,90,94,.1)} -.slide table.striped tr>*{background-image:none} -.slide pre{margin-bottom:1em;counter-reset:code;white-space:normal} -.slide pre code{display:block;margin-left:-100px;padding:0 0 0 100px;width:calc(100% + 100px + 100px);border-radius:0;background:0 0;line-height:2;white-space:pre;-moz-tab-size:4;-o-tab-size:4;tab-size:4} -.slide pre code:not(:only-child).mark{background:rgba(88,90,94,.1)} -.slide pre code:not(:only-child)::before{position:absolute;margin-left:-2em;color:#979a9e;counter-increment:code;content:counter(code,decimal-leading-zero) "."} -.slide pre mark{position:relative;z-index:-1;margin:0 -.3em} -.slide pre mark.important{background:#c00;color:#fff} -.slide pre .comment{color:#999} -.slide footer{position:absolute;right:0;bottom:-640px;left:0;display:none;padding:41px 100px 8px;background:#fbfbba;box-shadow:0 1px 0 #fafaa2 inset;-webkit-transition:bottom .3s;transition:bottom .3s} -.slide footer mark{background:rgba(255,255,255,.8)} -.slide:hover>footer{bottom:0} -.slide.grid{background-image:url(../images/grid.png);-ms-interpolation-mode:nearest-neighbor;image-rendering:-webkit-optimize-contrast;image-rendering:-moz-crisp-edges;image-rendering:pixelated} -@media (-webkit-min-device-pixel-ratio:2),(min-resolution:2dppx){.slide.grid{background-image:url(../images/grid@2x.png);background-size:1024px auto} -} -.slide.black{background-color:#000} -.slide.black::after,.slide.white::after{visibility:hidden} -.slide.white{background-color:#fff} -.slide .double,.slide 
.triple{-webkit-column-gap:75px;-moz-column-gap:75px;column-gap:75px;-webkit-hyphens:auto;-ms-hyphens:auto;hyphens:auto} -.slide .double{-webkit-column-count:2;-moz-column-count:2;column-count:2} -.slide .triple{-webkit-column-count:3;-moz-column-count:3;column-count:3} -.slide .shout{position:absolute;top:50%;left:0;width:100%;text-align:center;line-height:1;font-size:150px;-webkit-transform:translateY(-50%);transform:translateY(-50%)} -.slide .shout a{background:-webkit-linear-gradient(bottom,currentColor .11em,transparent .11em) repeat-x;background:linear-gradient(to top,currentColor .11em,transparent .11em) repeat-x} -.slide .cover{z-index:-1;max-width:100%;max-height:100%} -.slide .cover.w,.slide .cover.width{width:100%;max-height:none} -.slide .cover.h,.slide .cover.height{height:100%;max-width:none} -.slide .cover+figcaption{position:absolute;bottom:20px;right:10px;font-size:12px;opacity:.7;-webkit-transform-origin:0 100%;transform-origin:0 100%;-webkit-transform:translateX(100%) rotate(-90deg);transform:translateX(100%) rotate(-90deg)} -.slide .cover+figcaption.white{color:#fff} -.slide .cover+figcaption a{color:currentcolor} -.slide .cover,.slide .place{position:absolute;top:50%;left:50%;-webkit-transform:translate(-50%,-50%);transform:translate(-50%,-50%)} -.slide .place.b.l,.slide .place.b.r,.slide .place.bottom.left,.slide .place.bottom.right,.slide .place.t.l,.slide .place.t.r,.slide .place.top.left,.slide .place.top.right{-webkit-transform:none;transform:none} -.slide .place.b,.slide .place.bottom,.slide .place.t,.slide .place.top{-webkit-transform:translate(-50%,0);transform:translate(-50%,0)} -.slide .place.l,.slide .place.left,.slide .place.r,.slide .place.right{-webkit-transform:translate(0,-50%);transform:translate(0,-50%)} -.slide .place.t,.slide .place.t.r,.slide .place.top,.slide .place.top.left,.slide .place.top.right{top:0} -.slide .place.r,.slide .place.right{right:0;left:auto} -.slide .place.b,.slide .place.b.l,.slide .place.b.r,.slide 
.place.bottom,.slide .place.bottom.left,.slide .place.bottom.right{top:auto;bottom:0} -.slide .place.l,.slide .place.left{left:0} -.progress{left:-20px;bottom:0;z-index:1;display:none;width:0;height:0;box-sizing:content-box;border:10px solid #4b86c2;border-right-color:transparent;-webkit-transition:width .2s linear;transition:width .2s linear;clip:rect(10px,1044px,20px,20px)} -.progress[style*='100%']{padding-left:10px} -.badge,.badge a,.progress{position:absolute} -.badge{font-size:10px;top:0;z-index:1;overflow:hidden;display:none;width:9em;height:9em;right:0;visibility:hidden} -@media (min-width:1174px){.badge{font-size:20px} -} -@media (min-width:2348px){.badge{font-size:40px} -} -.badge a{right:-50%;bottom:50%;left:-50%;visibility:visible;background:#4b86c2;color:#fff;text-align:center;line-height:2;-webkit-transform-origin:50% 100%;transform-origin:50% 100%;-webkit-transform:rotate(45deg);transform:rotate(45deg)} -.region{display:none} -@media screen{.shower.list{padding-top:25px;width:100%;display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-flex-wrap:wrap;-ms-flex-wrap:wrap;flex-wrap:wrap;background:#585a5e;position:absolute;clip:rect(0,auto,auto,0)} -} -@media screen and (min-width:1174px){.shower.list{padding-top:50px} -} -@media screen and (min-width:2348px){.shower.list{padding-top:100px} -} -@media screen{.shower.list .caption{display:block} -.shower.list .slide{-webkit-transform-origin:0 0;transform-origin:0 0;margin:0 -768px -455px 25px;-webkit-transform:scale(.25);transform:scale(.25);border-radius:2px;box-shadow:0 20px 50px rgba(0,0,0,.3)} -} -@media screen and (min-width:1174px){.shower.list .slide{margin:0 -512px -270px 50px;-webkit-transform:scale(.5);transform:scale(.5)} -} -@media screen and (min-width:2348px){.shower.list .slide{margin:0 0 100px 100px;-webkit-transform:scale(1);transform:scale(1)} -} -@media screen{.shower.list .slide:hover{box-shadow:0 0 0 20px rgba(0,0,0,.1),0 20px 50px rgba(0,0,0,.3)} 
-.shower.list .slide:target{box-shadow:0 0 0 1px #376da3,0 0 0 20px #4b86c2,0 20px 50px rgba(0,0,0,.3)} -.shower.list .slide *{pointer-events:none} -.shower.list .badge,.shower.list .slide footer{display:block} -.shower.full{position:absolute;top:50%;left:50%;overflow:hidden;margin:-320px 0 0 -512px;width:1024px;height:640px;background:#000} -.shower.full .slide{position:absolute;top:0;left:0;margin-left:-150%;visibility:hidden} -.shower.full .slide:target{margin:0;visibility:visible} -.shower.full .slide pre code:not(:only-child).mark.next{visibility:visible;background:0 0} -.shower.full .slide pre code:not(:only-child).mark.next.active{background:rgba(88,90,94,.1)} -.shower.full .slide .next{visibility:hidden} -.shower.full .slide .next.active{visibility:visible} -.shower.full .slide .shout.grow,.shower.full .slide .shout.shrink{opacity:0;-webkit-transition:.4s ease-out;transition:.4s ease-out;-webkit-transition-property:opacity,-webkit-transform;transition-property:opacity,transform;transition-property:opacity,transform,-webkit-transform} -.shower.full .slide .shout.grow{-webkit-transform:scale(.1) translateY(-50%);transform:scale(.1) translateY(-50%)} -.shower.full .slide .shout.shrink{-webkit-transform:scale(10) translateY(-50%);transform:scale(10) translateY(-50%)} -.shower.full .slide:target .shout.grow,.shower.full .slide:target .shout.shrink{opacity:1;-webkit-transform:scale(1) translateY(-50%);transform:scale(1) translateY(-50%)} -.shower.full .progress{display:block;-webkit-transform:translateZ(0);transform:translateZ(0)} -.shower.full .region{position:absolute;clip:rect(0 0 0 0);overflow:hidden;margin:-1px;padding:0;width:1px;height:1px;border:none;display:block} -} diff --git a/doc/presentations/evolution/shower/themes/ribbon/styles/screen-4x3.css b/doc/presentations/evolution/shower/themes/ribbon/styles/screen-4x3.css deleted file mode 100644 index 6648b972c30..00000000000 --- a/doc/presentations/evolution/shower/themes/ribbon/styles/screen-4x3.css 
+++ /dev/null @@ -1,7 +0,0 @@ -/** - * Ribbon theme for Shower HTML presentation engine - * shower-ribbon v2.0.8, https://github.com/shower/ribbon - * @copyright 2010–2016 Vadim Makeev, http://pepelsbey.net/ - * @license MIT - */ -@charset "UTF-8";@font-face{font-family:PT Sans;src:url(../fonts/pt-sans-regular.woff) format("woff")}@font-face{font-weight:700;font-family:PT Sans;src:url(../fonts/pt-sans-bold.woff) format("woff")}@font-face{font-style:italic;font-family:PT Sans;src:url(../fonts/pt-sans-italic.woff) format("woff")}@font-face{font-style:italic;font-weight:700;font-family:PT Sans;src:url(../fonts/pt-sans-bold-italic.woff) format("woff")}@font-face{font-family:PT Sans Narrow;font-weight:700;src:url(../fonts/pt-sans-narrow-bold.woff) format("woff")}@font-face{font-family:PT Mono;src:url(../fonts/pt-mono-regular.woff) format("woff")}*,::after,::before{box-sizing:border-box}a,abbr,acronym,address,applet,article,aside,audio,b,big,blockquote,body,canvas,caption,center,cite,code,dd,del,details,dfn,div,dl,dt,em,embed,fieldset,figcaption,figure,footer,form,h1,h2,h3,h4,h5,h6,header,hgroup,html,i,iframe,img,ins,kbd,label,legend,li,mark,menu,nav,object,ol,output,p,pre,q,ruby,s,samp,section,small,span,strike,strong,sub,summary,sup,table,tbody,td,tfoot,th,thead,time,tr,tt,u,ul,var,video{margin:0;padding:0;border:0;font:inherit;vertical-align:baseline}article,aside,details,figcaption,figure,footer,header,hgroup,main,menu,nav,section{display:block}.caption p,body{line-height:1}ol,ul{list-style:none}blockquote,q{quotes:none}blockquote::after,blockquote::before,q::after,q::before{content:none}table{border-collapse:collapse;border-spacing:0}a{text-decoration:none}@page{margin:0;size:1024px 768px}.shower{color:#000;counter-reset:slide;font:25px/2 PT Sans,sans-serif;-webkit-print-color-adjust:exact;-webkit-text-size-adjust:none;-moz-text-size-adjust:none;-ms-text-size-adjust:none}@media 
print{.shower{text-rendering:geometricPrecision}}.caption{font-size:25px;display:none;margin-top:-.2em;padding:0 1em .93em;width:100%;color:#3c3d40;text-shadow:0 1px 0 #8d8e90}@media (min-width:1174px){.caption{font-size:50px}}@media (min-width:2348px){.caption{font-size:100px}}.caption h1{padding-bottom:.15em;font:700 1em/1 PT Sans Narrow,sans-serif}.caption p{font-size:.6em}.caption a{color:#4b86c2;text-shadow:0 -1px 0 #1f3f60}.slide{position:relative;z-index:1;overflow:hidden;padding:106px 100px 0;width:1024px;height:768px;background:#fff;font-size:25px}.slide::after{position:absolute;top:0;right:100px;padding-top:15px;width:50px;height:100px;background:url(../images/ribbon.svg) no-repeat;color:#fff;counter-increment:slide;content:counter(slide);text-align:center}.slide h2{margin-bottom:34px;color:#585a5e;font:700 50px/1 PT Sans Narrow,sans-serif}.slide p{margin-bottom:1em}.slide p.note{color:#979a9e}.slide a{background:-webkit-linear-gradient(bottom,currentColor .09em,transparent .09em) repeat-x;background:linear-gradient(to top,currentColor .09em,transparent .09em) repeat-x;color:#4b86c2}.slide b,.slide strong{font-weight:700}.slide blockquote,.slide dfn,.slide em,.slide i{font-style:italic}.slide code,.slide kbd,.slide mark,.slide samp{padding:.1em .3em;border-radius:.2em}.slide code,.slide kbd,.slide samp{background:rgba(88,90,94,.1);line-height:1;font-family:PT Mono,monospace,monospace}.slide mark{background:#fafaa2}.slide sub,.slide sup{position:relative;line-height:0;font-size:75%}.slide sub{bottom:-.25em}.slide sup{top:-.5em}.slide blockquote::before{position:absolute;margin:-.15em 0 0 -.43em;color:#ccc;line-height:1;font-size:8em;content:'\201C'}.slide blockquote+figcaption{margin:-1em 0 1em;font-style:italic;font-weight:700}.slide ol,.slide ul{margin-bottom:1em;counter-reset:list}.slide ol li,.slide ul li{page-break-inside:avoid;text-indent:-2em}.slide ol li::before,.slide ul 
li::before{display:inline-block;width:2em;color:#979a9e;text-align:right}.slide ol ol,.slide ol ul,.slide ul ol,.slide ul ul{margin-bottom:0;margin-left:2em}.slide ul>li::before{padding-right:.5em;content:'•'}.slide ul>li:lang(ru)::before{content:'—'}.slide ol>li::before{padding-right:.4em;counter-increment:list;content:counter(list) "."}.slide table{margin-left:-100px;margin-bottom:1em;width:calc(100% + 100px + 100px)}.slide table td:first-child,.slide table th:first-child{padding-left:96px}.slide table td:last-child,.slide table th:last-child{padding-right:96px}.slide table th{text-align:left;font-weight:700}.slide table tr:not(:last-of-type)>*{background:-webkit-linear-gradient(bottom,rgba(88,90,94,.5) .055em,transparent .055em) repeat-x;background:linear-gradient(to top,rgba(88,90,94,.5) .055em,transparent .055em) repeat-x}.slide table.striped tr:nth-child(even){background:rgba(88,90,94,.1)}.slide table.striped tr>*{background-image:none}.slide pre{margin-bottom:1em;counter-reset:code;white-space:normal}.slide pre code{display:block;margin-left:-100px;padding:0 0 0 100px;width:calc(100% + 100px + 100px);border-radius:0;background:0 0;line-height:2;white-space:pre;-moz-tab-size:4;-o-tab-size:4;tab-size:4}.slide pre code:not(:only-child).mark{background:rgba(88,90,94,.1)}.slide pre code:not(:only-child)::before{position:absolute;margin-left:-2em;color:#979a9e;counter-increment:code;content:counter(code,decimal-leading-zero) "."}.slide pre mark{position:relative;z-index:-1;margin:0 -.3em}.slide pre mark.important{background:#c00;color:#fff}.slide pre .comment{color:#999}.slide footer{position:absolute;right:0;bottom:-768px;left:0;display:none;padding:41px 100px 8px;background:#fbfbba;box-shadow:0 1px 0 #fafaa2 inset;-webkit-transition:bottom .3s;transition:bottom .3s}.slide footer 
mark{background:rgba(255,255,255,.8)}.slide:hover>footer{bottom:0}.slide.grid{background-image:url(../images/grid.png);-ms-interpolation-mode:nearest-neighbor;image-rendering:-webkit-optimize-contrast;image-rendering:-moz-crisp-edges;image-rendering:pixelated}@media (-webkit-min-device-pixel-ratio:2),(min-resolution:2dppx){.slide.grid{background-image:url(../images/grid@2x.png);background-size:1024px auto}}.slide.black{background-color:#000}.slide.black::after,.slide.white::after{visibility:hidden}.slide.white{background-color:#fff}.slide .double,.slide .triple{-webkit-column-gap:75px;-moz-column-gap:75px;column-gap:75px;-webkit-hyphens:auto;-ms-hyphens:auto;hyphens:auto}.slide .double{-webkit-column-count:2;-moz-column-count:2;column-count:2}.slide .triple{-webkit-column-count:3;-moz-column-count:3;column-count:3}.slide .shout{position:absolute;top:50%;left:0;width:100%;text-align:center;line-height:1;font-size:150px;-webkit-transform:translateY(-50%);transform:translateY(-50%)}.slide .shout a{background:-webkit-linear-gradient(bottom,currentColor .11em,transparent .11em) repeat-x;background:linear-gradient(to top,currentColor .11em,transparent .11em) repeat-x}.slide .cover{z-index:-1;max-width:100%;max-height:100%}.slide .cover.w,.slide .cover.width{width:100%;max-height:none}.slide .cover.h,.slide .cover.height{height:100%;max-width:none}.slide .cover+figcaption{position:absolute;bottom:20px;right:10px;font-size:12px;opacity:.7;-webkit-transform-origin:0 100%;transform-origin:0 100%;-webkit-transform:translateX(100%) rotate(-90deg);transform:translateX(100%) rotate(-90deg)}.slide .cover+figcaption.white{color:#fff}.slide .cover+figcaption a{color:currentcolor}.slide .cover,.slide .place{position:absolute;top:50%;left:50%;-webkit-transform:translate(-50%,-50%);transform:translate(-50%,-50%)}.slide .place.b.l,.slide .place.b.r,.slide .place.bottom.left,.slide .place.bottom.right,.slide .place.t.l,.slide .place.t.r,.slide .place.top.left,.slide 
.place.top.right{-webkit-transform:none;transform:none}.slide .place.b,.slide .place.bottom,.slide .place.t,.slide .place.top{-webkit-transform:translate(-50%,0);transform:translate(-50%,0)}.slide .place.l,.slide .place.left,.slide .place.r,.slide .place.right{-webkit-transform:translate(0,-50%);transform:translate(0,-50%)}.slide .place.t,.slide .place.t.r,.slide .place.top,.slide .place.top.left,.slide .place.top.right{top:0}.slide .place.r,.slide .place.right{right:0;left:auto}.slide .place.b,.slide .place.b.l,.slide .place.b.r,.slide .place.bottom,.slide .place.bottom.left,.slide .place.bottom.right{top:auto;bottom:0}.slide .place.l,.slide .place.left{left:0}.progress{left:-20px;bottom:0;z-index:1;display:none;width:0;height:0;box-sizing:content-box;border:10px solid #4b86c2;border-right-color:transparent;-webkit-transition:width .2s linear;transition:width .2s linear;clip:rect(10px,1044px,20px,20px)}.progress[style*='100%']{padding-left:10px}.badge,.badge a,.progress{position:absolute}.badge{font-size:10px;top:0;z-index:1;overflow:hidden;display:none;width:9em;height:9em;right:0;visibility:hidden}@media (min-width:1174px){.badge{font-size:20px}}@media (min-width:2348px){.badge{font-size:40px}}.badge a{right:-50%;bottom:50%;left:-50%;visibility:visible;background:#4b86c2;color:#fff;text-align:center;line-height:2;-webkit-transform-origin:50% 100%;transform-origin:50% 100%;-webkit-transform:rotate(45deg);transform:rotate(45deg)}.region{display:none}@media screen{.shower.list{padding-top:25px;width:100%;display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-flex-wrap:wrap;-ms-flex-wrap:wrap;flex-wrap:wrap;background:#585a5e;position:absolute;clip:rect(0,auto,auto,0)}}@media screen and (min-width:1174px){.shower.list{padding-top:50px}}@media screen and (min-width:2348px){.shower.list{padding-top:100px}}@media screen{.shower.list .caption{display:block}.shower.list .slide{-webkit-transform-origin:0 0;transform-origin:0 0;margin:0 -768px 
-551px 25px;-webkit-transform:scale(.25);transform:scale(.25);border-radius:2px;box-shadow:0 20px 50px rgba(0,0,0,.3)}}@media screen and (min-width:1174px){.shower.list .slide{margin:0 -512px -334px 50px;-webkit-transform:scale(.5);transform:scale(.5)}}@media screen and (min-width:2348px){.shower.list .slide{margin:0 0 100px 100px;-webkit-transform:scale(1);transform:scale(1)}}@media screen{.shower.list .slide:hover{box-shadow:0 0 0 20px rgba(0,0,0,.1),0 20px 50px rgba(0,0,0,.3)}.shower.list .slide:target{box-shadow:0 0 0 1px #376da3,0 0 0 20px #4b86c2,0 20px 50px rgba(0,0,0,.3)}.shower.list .slide *{pointer-events:none}.shower.list .badge,.shower.list .slide footer{display:block}.shower.full{position:absolute;top:50%;left:50%;overflow:hidden;margin:-384px 0 0 -512px;width:1024px;height:768px;background:#000}.shower.full .slide{position:absolute;top:0;left:0;margin-left:-150%;visibility:hidden}.shower.full .slide:target{margin:0;visibility:visible}.shower.full .slide pre code:not(:only-child).mark.next{visibility:visible;background:0 0}.shower.full .slide pre code:not(:only-child).mark.next.active{background:rgba(88,90,94,.1)}.shower.full .slide .next{visibility:hidden}.shower.full .slide .next.active{visibility:visible}.shower.full .slide .shout.grow,.shower.full .slide .shout.shrink{opacity:0;-webkit-transition:.4s ease-out;transition:.4s ease-out;-webkit-transition-property:opacity,-webkit-transform;transition-property:opacity,transform;transition-property:opacity,transform,-webkit-transform}.shower.full .slide .shout.grow{-webkit-transform:scale(.1) translateY(-50%);transform:scale(.1) translateY(-50%)}.shower.full .slide .shout.shrink{-webkit-transform:scale(10) translateY(-50%);transform:scale(10) translateY(-50%)}.shower.full .slide:target .shout.grow,.shower.full .slide:target .shout.shrink{opacity:1;-webkit-transform:scale(1) translateY(-50%);transform:scale(1) translateY(-50%)}.shower.full 
.progress{display:block;-webkit-transform:translateZ(0);transform:translateZ(0)}.shower.full .region{position:absolute;clip:rect(0 0 0 0);overflow:hidden;margin:-1px;padding:0;width:1px;height:1px;border:none;display:block}} \ No newline at end of file diff --git a/doc/presentations/group_by/LICENSE.md b/doc/presentations/group_by/LICENSE.md deleted file mode 100644 index bd3449d3576..00000000000 --- a/doc/presentations/group_by/LICENSE.md +++ /dev/null @@ -1,21 +0,0 @@ -# The MIT License - -Copyright © 2010–2015 Vadim Makeev, http://pepelsbey.net/ - -Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- ---- - -# Лицензия MIT - -Copyright © 2010–2015 Вадим Макеев, http://pepelsbey.net/ - -Данная лицензия разрешает лицам, получившим копию данного программного обеспечения и сопутствующей документации (в дальнейшем именуемыми «Программное Обеспечение»), безвозмездно использовать Программное Обеспечение без ограничений, включая неограниченное право на использование, копирование, изменение, добавление, публикацию, распространение, сублицензирование и/или продажу копий Программного Обеспечения, также как и лицам, которым предоставляется данное Программное Обеспечение, при соблюдении следующих условий: - -Указанное выше уведомление об авторском праве и данные условия должны быть включены во все копии или значимые части данного Программного Обеспечения. - -ДАННОЕ ПРОГРАММНОЕ ОБЕСПЕЧЕНИЕ ПРЕДОСТАВЛЯЕТСЯ «КАК ЕСТЬ», БЕЗ КАКИХ-ЛИБО ГАРАНТИЙ, ЯВНО ВЫРАЖЕННЫХ ИЛИ ПОДРАЗУМЕВАЕМЫХ, ВКЛЮЧАЯ, НО НЕ ОГРАНИЧИВАЯСЬ ГАРАНТИЯМИ ТОВАРНОЙ ПРИГОДНОСТИ, СООТВЕТСТВИЯ ПО ЕГО КОНКРЕТНОМУ НАЗНАЧЕНИЮ И ОТСУТСТВИЯ НАРУШЕНИЙ ПРАВ. НИ В КАКОМ СЛУЧАЕ АВТОРЫ ИЛИ ПРАВООБЛАДАТЕЛИ НЕ НЕСУТ ОТВЕТСТВЕННОСТИ ПО ИСКАМ О ВОЗМЕЩЕНИИ УЩЕРБА, УБЫТКОВ ИЛИ ДРУГИХ ТРЕБОВАНИЙ ПО ДЕЙСТВУЮЩИМ КОНТРАКТАМ, ДЕЛИКТАМ ИЛИ ИНОМУ, ВОЗНИКШИМ ИЗ, ИМЕЮЩИМ ПРИЧИНОЙ ИЛИ СВЯЗАННЫМ С ПРОГРАММНЫМ ОБЕСПЕЧЕНИЕМ ИЛИ ИСПОЛЬЗОВАНИЕМ ПРОГРАММНОГО ОБЕСПЕЧЕНИЯ ИЛИ ИНЫМИ ДЕЙСТВИЯМИ С ПРОГРАММНЫМ ОБЕСПЕЧЕНИЕМ. diff --git a/doc/presentations/group_by/README.md b/doc/presentations/group_by/README.md deleted file mode 100644 index 4c555351be1..00000000000 --- a/doc/presentations/group_by/README.md +++ /dev/null @@ -1,62 +0,0 @@ -# Shower Presentation Template [![Build Status](https://travis-ci.org/shower/shower.svg?branch=master)](https://travis-ci.org/shower/shower) - -Shower logo - -> Shower ['ʃəuə] noun. A person or thing that shows. - -1. Built on HTML, CSS and vanilla JavaScript -2. Works in all modern browsers -3. Themes are separated from engine -4. Modular and extensible -5. Fully keyboard accessible -6. 
Printable to PDF - -[See it in action](http://shwr.me/). Includes [Ribbon](https://github.com/shower/ribbon/) and [Material](https://github.com/shower/material/) themes, and [core](https://github.com/shower/core/) with plugins. - -Follow [@shower_me](https://twitter.com/shower_me) for support and updates, [file an issue](https://github.com/shower/shower/issues/new) if you have any. - -## Quick Start - -1. Download and unzip [template archive](http://shwr.me/shower.zip) -2. Open `index.html` and start creating your presentation - -## Advanced - -1. [Fork](https://github.com/shower/shower/fork) this repository -2. Go to fork setting and rename it: `shower` → `my-slides` -2. Clone it to your computer: `git clone git@github.com:username/my-slides.git` -3. Go to `my-slides` folder and install dependencies: `npm install` -4. Start creating your presentation - -Once you’re done you can build a clean copy of your slides: - - npm run prepare - -And you’ll find your presentation in `prepared` folder with only needed files in it. You can also run `npm run archive` to get the same files in `archive.zip`. But there’s more! You can easily publish your presentation online by running: - - npm run publish - -And you’ll have your slides published on `http://username.github.io/my-slides/`. - -## Usage Examples - -- [Installable Web Apps](http://pepelsbey.net/pres/web-apps/) -- [Clear and Sharp](http://pepelsbey.net/pres/clear-and-sharp/) -- [CSS Management](http://pepelsbey.net/pres/knife-train/) -- [Push it!](http://pepelsbey.net/pres/push-it/) -- [Pre-fixes](http://pepelsbey.net/pres/pre-fixes/) -- [Web In Curves](http://pepelsbey.net/pres/web-in-curves/) -- [Sense Coding](http://pepelsbey.net/pres/sense-coding/) - -## Browser Support - -Latest stable versions of Chrome, Internet Explorer, Firefox, Opera and Safari are supported. - -## Contributing - -You’re always welcome to contribute. Fork project, make changes and send it as pull request. 
But it’s better to file an [issue](https://github.com/shower/shower/issues) with your idea first. Read [contributing rules](CONTRIBUTING.md) for more details. - -Main contributors in historical order: [pepelsbey](https://github.com/pepelsbey), [jahson](https://github.com/jahson), [miripiruni](https://github.com/miripiruni), [kizu](https://github.com/kizu), [artpolikarpov](https://github.com/artpolikarpov), [tonyganch](https://github.com/tonyganch), [zloylos](https://github.com/zloylos). - ---- -Licensed under [MIT License](LICENSE.md). diff --git a/doc/presentations/group_by/index.html b/doc/presentations/group_by/index.html deleted file mode 100644 index 2298b2ac38a..00000000000 --- a/doc/presentations/group_by/index.html +++ /dev/null @@ -1,559 +0,0 @@ - - - - Параллельный и распределённый GROUP BY - - - - - - -
-

Параллельный и распределённый GROUP BY

-
- -
-

Параллельный и распределённый GROUP BY

-
- -
-

Обо мне

-

Алексей, разработчик ClickHouse.

-

С 2008 занимался движком обработки данных в Яндекс.Метрике.

-
- -
-

 

-

ClickHouse — это аналитическая СУБД.

-

Один запрос — много данных на входе, мало на выходе.

-

Данные нужно агрегировать на лету.

-
- -
-

Метрика 2.0

- -
- -
-

Пример запроса

-

SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u
- FROM hits
- WHERE MobilePhoneModel != ''
- GROUP BY MobilePhoneModel
- ORDER BY u DESC

-
- -
-

 

-

Чтобы быстро обрабатывать запросы, данные необходимо:

-
    -
  • быстро читать;
  • -
  • быстро считать.
  • -
-

 

-

Конвейер выполнения запроса:

-

— фильтрация, JOIN, агрегация, сортировка...

-
- -
-

Как тестировать производительность?

-

Бенчмарки должны быть:

-
    -
  • на реальных данных;
  • -
  • на разных наборах данных, покрывающих разные кейсы;
  • -
  • воспроизводимые;
  • -
  • автоматизированные.
  • -
-
- -
-

Пример бенчмарка (не лучшего)

-
/** Выполнять так:
-for file in MobilePhoneModel PageCharset Params URLDomain UTMSource Referer URL Title; do
- for size in 30000 100000 300000 1000000 5000000; do
-  echo
-  BEST_METHOD=0
-  BEST_RESULT=0
-  for method in {1..10}; do
-   echo -ne $file $size $method '';
-   TOTAL_ELEMS=0
-   for i in {0..1000}; do
-    TOTAL_ELEMS=$(( $TOTAL_ELEMS + $size ))
-    if [[ $TOTAL_ELEMS -gt 25000000 ]]; then break; fi
-    ./hash_map_string_3 $size $method < ${file}.bin 2>&1 |
-     grep HashMap | grep -oE '[0-9\.]+ elem';
-   done | awk -W interactive '{ if ($1 > x) { x = $1 }; printf(".") } END { print x }' |
-    tee /tmp/hash_map_string_3_res;
-   CUR_RESULT=$(cat /tmp/hash_map_string_3_res | tr -d '.')
-   if [[ $CUR_RESULT -gt $BEST_RESULT ]]; then
-    BEST_METHOD=$method
-    BEST_RESULT=$CUR_RESULT
-   fi;
-  done;
-  echo Best: $BEST_METHOD - $BEST_RESULT
- done;
-done
-*/
-
- -
-

Агрегация

-
- -
-

Одна машина, одно ядро

-
- -
-

Плохой способ

-

Читаем данные в массив; сортируем по ключу; -идём по группам ключей и считаем агрегатные функции.

-

Достоинства:

-

+ простота интерфейса агрегатных функций; -+ возможность более эффективной реализации агрегатных функций; -+ можно запускать произвольные скрипты для reduce в режиме streaming.

-

Недостатки:

-

− пусть N — общее число данных, а M — количество ключей; -Отвратительно работает при N > M — в типичном случае. -Тратится O(N) оперативки на промежуточные данные вместо O(M).

-
- -
-

Хороший способ

-

Читаем данные, кладём в ассоциативный массив

-

key tuple -> states of aggregate functions

-

обновляем состояния агрегатных функций.

-
- -
-

Какой ассоциативный массив?

-

Lookup-таблица. Хэш-таблица.

-

Бинарное дерево. Skip-лист. B-дерево.

-

Трай. Трай+хэш-таблица...

-
- -
-

Бинарное дерево

-

− слишком большой оверхед на элемент;

-

− отвратительная кэш-локальность;

-

− вообще тормозит.

-
- -
-

Skip-лист. Трай. B-дерево...

-

− вообще для другой задачи;

-
- -
-

Lookup-таблица

-

+ прекрасно для агрегации по числовым ключам не более ~16 бит;

-

− не подходит для чуть более сложных случаев.

-
- -
-

Хэш-таблица

-

+ моя любимая структура данных;

-

− много деталей.

-
- -
-

Трай+хэш-таблица

-

+ иногда кое-что в этом есть, см. далее;

-
- -
-

Одна машина, много ядер

-
- -
-

1. Тривиальный способ

- -

Разные потоки читают разные данные по мере возможности. -Агрегируют независимо в свои локальные хэш-таблицы. -Когда все данные прочитаны, мержим все хэш-таблицы в одну. -Например, идём по всем локальным хэш-таблицам кроме первой - и перекладываем всё в первую. - -Фаза чтения данных и предварительной агрегации распараллеливается. -Фаза мержа выполняется последовательно. - -Пусть N — общее число данных, а M — количество ключей. -O(M) работы выполняется последовательно - и при большом M (кардинальность GROUP BY) - работа плохо распараллеливается. - -Достоинства: тривиально. - -Недостатки: не масштабируется при большой кардинальности.

-
- - -
-

2. Partitioning способ

- -

Для каждого блока данных, выполняем агрегацию в две стадии: - -Стадия 1. -Разные потоки будут обрабатывать разные куски блока, какие успеют. -В каждом потоке, с помощью отдельной хэш-функции, - хэшируем ключ в номер потока и запоминаем его. - - hash: key -> bucket_num - -Стадия 2. -Каждый поток идёт по всему блоку данных - и берёт для агрегации только строки с нужным номером корзины. - -Модификация: можно всё в одну стадию — тогда каждый поток -будет вычислять хэш-функцию от всех строк заново: - подходит, если это дёшево. -

- -
-

-Достоинства: -+ хорошо масштабируется при большой кардинальности - и равномерном распределении ключей; -+ идейная простота. - -Недостатки: -− если объём данных распределён неравномерно по ключам, - то стадия 2 плохо масштабируется. -Это типичный случай. -Почти всегда объём данных по ключам распределён по power law. - -Ещё недостатки: -− если размер блока маленький, то получается слишком - мелко-гранулированная многопоточность: - большой оверхед на синхронизацию; -− если размер блока большой, то плохая кэш-локальность; -− на второй стадии, часть memory bandwidth умножается на число потоков; -− нужно вычислять ещё одну хэш-функцию, - она должна быть независима от той, что в хэш-таблице;

-
- - -
-

3. Параллельный мерж хэш-таблиц

- -

Отресайзим полученные в разных потоках хэш-таблицы к одному размеру. -Разобьём их неявно на разные подмножества ключей. -В разных потоках будем мержить соответствующие - подмножества ключей хэш-таблиц. - -Рисунок на доске. - -Недостаток: -− очень сложный код.

-
- - -
-

4. Ordered мерж хэш-таблиц

- -

Для open addressing linear probing хэш-таблиц, или для chaining хэш-таблиц, -данные в хэш-таблице расположены почти упорядоченно -по остатку от деления хэш-функции на размер хэш-таблицы -— с точностью до цепочек разрешения коллизий. - -Отресайзим полученные в разных потоках хэш-таблицы к одному размеру. -Сделаем ordered iterator, который будет - перебирать данные в хэш-таблице в фиксированном порядке. - -Объём работы на итерирование: - количество цепочек разрешения коллизий * средний квадрат длин цепочек. - -Сделаем merging iterator, который с помощью heap (priority queue) - будет перебирать все хэш-таблицы разом. -

- -
-

Достоинства: - -+ не нужно никуда перекладывать элементы: мерж делается inplace. - -+ бонус: подходит для внешней памяти. - - -Недостатки: - -− отвратительно сложный код; - -− для open addressing linear probing хэш-таблиц, - средний квадрат длин цепочек разрешения коллизий слишком большой; - -− priority queue тормозит; - -− стадия мержа не распараллеливается* - - -* — можно совместить с предыдущим способом.

-
- - -
-

5. Robin Hood ordered мерж хэш-таблиц

- -

Если использовать Robin Hood хэш-таблицу, то данные -(за исключением O(1) граничных цепочек разрешения коллизий) -будут полностью упорядочены -по остатку от деления хэш-функции на размер хэш-таблицы. - -Достоинства: -+ вроде бы красивый алгоритм. -+ бонус: подходит для внешней памяти. - -Недостатки: -− вынуждает использовать robin-hood probing; -− priority queue тормозит; -− стадия мержа не распараллеливается*

-
- - -
-

6. Shared хэш-таблица под mutex-ом

- -

Достоинства: очень просто. - -Недостатки: отрицательная масштабируемость.

-
- - -
-

7. Много мелких хэш-таблиц под разными mutex-ами

- -

В какую класть — выбирается с помощью отдельной хэш-функции. - -Недостатки: - -− в типичном случае данные распределены сильно неравномерно, - и потоки будут конкурировать на одной горячей корзине. - -− в случае маленькой хэш-таблицы, слишком тормозит. - -Достоинства: если данные почему-то распределены равномерно, - то кое-как масштабируется.

-
- - -
-

8. Shared хэш-таблица и в каждой ячейке spin-lock

- -

Недостатки: - -− spin-lock — это очень опасно; - очень сложно тестировать производительность; - вы обязательно сделаете отстой. - -− в типичном случае данные распределены сильно неравномерно, - и потоки будут конкурировать на одной горячей ячейке.

-
- - -
-

9. Lock free shared хэш-таблица

- -

Недостатки: - -− lock free хэш-таблицы либо нельзя ресайзить, либо они очень сложные; - -− в типичном случае данные распределены сильно неравномерно, - и потоки будут конкурировать на одной горячей ячейке: - false sharing, тормоза. - -− сложный код, много инструкций, всё тормозит; - -− я вообще недолюбливаю lock-free алгоритмы.

-
- - -
-

10. Shared хэш-таблица + thread local хэш-таблицы

- -

Пытаемся положить в shared хэш-таблицу путём блокирования ячейки; -если ячейка уже заблокирована — кладём в локальную хэш-таблицу. - -Тогда горячие ключи попадут в локальные хэш-таблицы. -Локальные хэш-таблицы будут маленькими. -В конце мержим все локальные хэш-таблицы в глобальную. - -Дополнения: можно сначала смотреть - на наличие ключа в локальной хэш-таблице. - -Достоинства: -+ отлично масштабируется; -+ сравнительно простая реализация. - -Недостатки: -− много лукапов, много инструкций — в целом довольно медленно. - -Даже несмотря на то, что thread local хэш-таблица - зачастую ещё и cache local.

-
- - -
-

11. Two-level хэш-таблица

- -

На первой стадии, в каждом потоке независимо -кладём данные в свои num_buckets = 256 хэш-таблиц, -хранящих разные ключи. - -В какую из них класть (номер корзины) -определяется другой хэш-функцией, -либо отдельным байтом хэш-функции. - -Имеем num_threads * num_buckets хэш-таблиц. - -На второй стадии мержим состояния -num_threads * num_buckets хэш-таблиц -в одни num_buckets хэш-таблиц, -распараллеливая мерж по bucket-ам. -

- -
-

-Достоинства: - -+ отлично масштабируется; -+ простота реализации; - -+ бонус: ресайзы хэш-таблиц амортизируются; - -+ бонус: на халяву получаем в результате partitioning, - который полезен для других стадий конвейера. - -+ бонус: подходит для внешней памяти. - -Недостатки: - -− при большой кардинальности, во время мержа - делается до такого же объёма работ как на первой стадии; - -− при маленькой кардинальности, - слишком много отдельных хэш-таблиц; - -− при маленькой кардинальности, - работает несколько медленнее тривиального способа;

-
- - -
-

12. Тривиальный + two-level хэш-таблица

- -

Используем тривиальный способ. - -Когда разных ключей много, конвертируем в two-level.

-

-Именно такой способ используется в ClickHouse :) -

-
- - -
-

Много машин, много ядер

- -

На разных машинах расположены части данных, -которые надо обработать. - -Отличия от shared memory: - -— почти отсутствует возможность work stealing; -— нужно явно передавать данные по сети.

-
- - -
-

1. Тривиальный способ

- -

Передаём промежуточные результаты на сервер-инициатор запроса. -Последовательно кладём всё в одну хэш-таблицу. - -Достоинства: - -+ тривиально; -+ хорошо масштабируется при маленькой кардинальности. - -Недостатки: - -− при большой кардинальности не масштабируется; -− требуется оперативка на весь результат.

-
- - -
-

2. Ordered merge

- -

Передаём промежуточные результаты на сервер-инициатор запроса -в заданном порядке. Мержим. - -Достоинства: -+ тратится O(1) оперативки; - -Недостатки: -− при большой кардинальности не масштабируется; -− мерж сортированных потоков (heap) — это медленная операция; -− требуется либо сортировать результаты на удалённых серверах, - либо использовать один из тех fancy алгоритмов выше.

-
- - -
-

3. Partitioned merge

- -

Передаём промежуточные результаты на сервер-инициатор запроса, -разбитыми на отдельные согласованные корзины-партиции, -в заданном порядке корзин. - -Мержим по одной или по несколько корзин одновременно. - -Достоинства: -+ тратится до в num_buckets раз меньше оперативки, чем размер результата; -+ можно легко распараллелить, мержа сразу несколько корзин - — отлично масштабируется по ядрам. - -Недостатки: -− мерж делается на одном сервере — инициаторе запроса - — эта стадия не масштабируется по серверам.

- -

-Именно такой способ используется в ClickHouse :) -

-
- - -
-

4. Reshuffle + partitioned merge

- -

На удалённых серверах получаем промежуточные результаты, -разбитые на согласованные партиции. - -Затем передаём партиции между серверами так, -чтобы на разных серверах были разные партиции, -а данные одной партиции оказались на одном сервере. - -Мержим на всех серверах параллельно, да ещё и используя многие ядра. - -Достоинства: -+ прекрасно масштабируется; -+ при INSERT SELECT, результат можно - вовсе не передавать на сервер-инициатор, - а сразу сохранить в распределённую таблицу на кластере. - -Недостатки: -− сложная координация серверов;

-
- - -
-

Всё

- -

Можно задавать вопросы.

-
- - - -
- - - diff --git a/doc/presentations/group_by/pictures/metrika2.png b/doc/presentations/group_by/pictures/metrika2.png deleted file mode 100644 index 3ee37e98fc6..00000000000 Binary files a/doc/presentations/group_by/pictures/metrika2.png and /dev/null differ diff --git a/doc/presentations/group_by/shower/shower.min.js b/doc/presentations/group_by/shower/shower.min.js deleted file mode 100644 index 449843ac45d..00000000000 --- a/doc/presentations/group_by/shower/shower.min.js +++ /dev/null @@ -1,8 +0,0 @@ -/** - * Core for Shower HTML presentation engine - * shower-core v2.0.7, https://github.com/shower/core - * @copyright 2010–2016 Vadim Makeev, http://pepelsbey.net/ - * @license MIT - */ -!function(a){var b,c={NOT_RESOLVED:"NOT_RESOLVED",IN_RESOLVING:"IN_RESOLVING",RESOLVED:"RESOLVED"},d=function(){var l={trackCircularDependencies:!0,allowMultipleDeclarations:!0},m={},n=!1,o=[],p=function(a,d,e){e||(e=d,d=[]);var f=m[a];f||(f=m[a]={name:a,decl:b}),f.decl={name:a,prev:f.decl,fn:e,state:c.NOT_RESOLVED,deps:d,dependents:[],exports:b}},q=function(b,c,d){"string"==typeof b&&(b=[b]),n||(n=!0,k(v)),o.push({deps:b,cb:function(b,f){f?(d||e)(f):c.apply(a,b)}})},r=function(a){var b=m[a];return b?c[b.decl.state]:"NOT_DEFINED"},s=function(a){return!!m[a]},t=function(a){for(var b in a)a.hasOwnProperty(b)&&(l[b]=a[b])},u=function(){var a,b={};for(var c in m)m.hasOwnProperty(c)&&(a=m[c],(b[a.decl.state]||(b[a.decl.state]=[])).push(c));return b},v=function(){n=!1,w()},w=function(){var a,b=o,c=0;for(o=[];a=b[c++];)x(null,a.deps,[],a.cb)},x=function(a,b,c,d){var e=b.length;e||d([]);for(var g,h,i=[],j=function(a,b){if(b)return void d(null,b);if(!--e){for(var c,f=[],g=0;c=i[g++];)f.push(c.exports);d(f)}},k=0,l=e;k ")+'"')},h=function(a){return Error('Declaration of module "'+a.name+'" has already been provided')},i=function(a){return Error('Multiple declarations of module "'+a.name+'" have been detected')},j=function(a,b){for(var 
c,d=0;c=b[d++];)if(a===c)return!0;return!1},k=function(){var b=[],c=function(a){return 1===b.push(a)},d=function(){var a=b,c=0,d=b.length;for(b=[];c=0&&!b.defaultPrevented();){var d=a[c];d&&(d.context?d.callback.call(d.context,b):d.callback(b)),c--}}}),a(e)}),shower.modules.define("Plugins",["Emitter","util.extend"],function(a,b,c){function d(a){this.events=new b({context:this}),this._showerGlobal=a,this._showerInstances=a.getInited(),this._plugins={},this._instances=[],a.events.on("init",this._onShowerInit,this)}c(d.prototype,{destroy:function(){this._showerGlobal.events.off("init",this._onShowerInit,this),this._plugins=null},add:function(a,b){if(this._plugins.hasOwnProperty(a))throw new Error("Plugin "+a+" already exist.");return this._requireAndAdd({name:a,options:b}),this},remove:function(a){if(!this._plugins.hasOwnProperty(a))throw new Error("Plugin "+a+" not found.");return delete this._plugins[a],this.events.emit("remove",{name:a}),this},get:function(a,b){var c,d=this._plugins[a];if(d&&b)for(var e=0,f=this._instances.length;e=0;e--)if(d[e].getId()===a){b=d[e],c=e;break}return{slide:b,index:c}},_onSlideActivate:function(a){window.location.hash=a.get("slide").getId(),this._setTitle()},_onContainerSlideModeChange:function(){this._setTitle(),this.save()},_isSlideMode:function(){return this._shower.container.isSlideMode()},_onPopstate:function(){var a,b=this._shower,c=window.location.hash.substr(1),d=b.player.getCurrentSlide(),e=b.player.getCurrentSlideIndex();this._isSlideMode()&&e===-1?b.player.go(0):e===-1&&""!==window.location.hash&&b.player.go(0),d&&c!==d.getId()&&(a=this._getSlideById(c),b.player.go(a.index))},_setTitle:function(){var a=document.title,b=this._isSlideMode(),c=this._shower.player.getCurrentSlide();if(b&&c){var d=c.getTitle();document.title=d?d+" — "+this._documentTitle:this._documentTitle}else 
this._documentTitle!==a&&(document.title=this._documentTitle)}}),a(e)}),shower.modules.define("shower.Player",["Emitter","util.bound","util.extend"],function(a,b,c,d){function e(a){this.events=new b({context:this,parent:a.events}),this._shower=a,this._showerListeners=null,this._playerListeners=null,this._currentSlideNumber=-1,this._currentSlide=null,this.init()}d(e.prototype,{init:function(){this._showerListeners=this._shower.events.group().on("slideadd",this._onSlideAdd,this).on("slideremove",this._onSlideRemove,this).on("slidemodeenter",this._onSlideModeEnter,this),this._playerListeners=this.events.group().on("prev",this._onPrev,this).on("next",this._onNext,this),document.addEventListener("keydown",c(this,"_onKeyDown"))},destroy:function(){this._showerListeners.offAll(),this._playerListeners.offAll(),document.removeEventListener("keydown",c(this,"_onKeyDown")),this._currentSlide=null,this._currentSlideNumber=null,this._shower=null},next:function(){return this.events.emit("next"),this},prev:function(){return this.events.emit("prev"),this},first:function(){return this.go(0),this},last:function(){return this.go(this._shower.getSlidesCount()-1),this},go:function(a){"number"!=typeof a&&(a=this._shower.getSlideIndex(a));var b=this._shower.getSlidesCount(),c=this._currentSlide;return a!=this._currentSlideNumber&&a=0&&(c&&c.isActive()&&c.deactivate(),c=this._shower.get(a),this._currentSlide=c,this._currentSlideNumber=a,c.isActive()||c.activate(),this.events.emit("activate",{index:a,slide:c})),this},getCurrentSlide:function(){return this._currentSlide},getCurrentSlideIndex:function(){return this._currentSlideNumber},_onPrev:function(){this._changeSlide(this._currentSlideNumber-1)},_onNext:function(){this._changeSlide(this._currentSlideNumber+1)},_changeSlide:function(a){this.go(a)},_onSlideAdd:function(a){var b=a.get("slide");b.events.on("activate",this._onSlideActivate,this)},_onSlideRemove:function(a){var 
b=a.get("slide");b.events.off("activate",this._onSlideActivate,this)},_onSlideActivate:function(a){var b=a.get("slide"),c=this._shower.getSlideIndex(b);this.go(c)},_onKeyDown:function(a){if(this._shower.isHotkeysEnabled()&&!/^(?:button|input|select|textarea)$/i.test(a.target.tagName))switch(this.events.emit("keydown",{event:a}),a.which){case 33:case 38:case 37:case 72:case 75:if(a.altKey||a.ctrlKey||a.metaKey)return;a.preventDefault(),this.prev();break;case 34:case 40:case 39:case 76:case 74:if(a.altKey||a.ctrlKey||a.metaKey)return;a.preventDefault(),this.next();break;case 36:a.preventDefault(),this.first();break;case 35:a.preventDefault(),this.last();break;case 32:this._shower.container.isSlideMode()&&(a.shiftKey?this.prev():this.next())}},_onSlideModeEnter:function(){this._currentSlide||this.go(0)}}),a(e)}),shower.modules.define("shower.slidesParser",["Slide"],function(a,b){function c(a,c){var d=a.querySelectorAll(c);return d=Array.prototype.slice.call(d),d.map(function(a,c){var d=new b(a);return a.id||(a.id=c+1),d})}a(c)}),shower.modules.define("Slide",["shower.defaultOptions","Emitter","Options","slide.Layout","slide.layoutFactory","util.Store","util.extend"],function(a,b,c,d,e,f,g,h){function i(a,b,e){this.events=new c,this.options=new d(b),this.layout=null,this.state=new g({visited:0,index:null},e),this._content=a,this._isVisited=this.state.get("visited")>0,this._isActive=!1,this.init()}h(i.prototype,{init:function(){this.layout="string"==typeof this._content?new f.createLayout({content:this._content}):new e(this._content,this.options),this.layout.setParent(this),this._setupListeners()},destroy:function(){this._clearListeners(),this._isActive=null,this.options=null,this.layout.destroy()},activate:function(){this._isActive=!0;var a=this.state.get("visited");return this.state.set("visited",++a),this.events.emit("activate",{slide:this}),this},deactivate:function(){return 
this._isActive=!1,this.events.emit("deactivate",{slide:this}),this},isActive:function(){return this._isActive},isVisited:function(){return this.state.get("visited")>0},getTitle:function(){return this.layout.getTitle()},setTitle:function(a){return this.layout.setTitle(a),this},getId:function(){return this.layout.getElement().id},getContent:function(){return this.layout.getContent()},_setupListeners:function(){this.layoutListeners=this.layout.events.group().on("click",this._onSlideClick,this)},_clearListeners:function(){this.layoutListeners.offAll()},_onSlideClick:function(){this.activate(),this.events.emit("click",{slide:this})}}),a(i)}),shower.modules.define("slide.Layout",["Options","shower.defaultOptions","Emitter","util.bound","util.extend"],function(a,b,c,d,e,f){function g(a,e){this.options=new b({title_element_selector:c.slide_title_element_selector,active_classname:c.slide_active_classname,visited_classname:c.slide_visited_classname},e),this.events=new d,this._element=a,this._parent=null,this._parentElement=null,this.init()}f(g.prototype,{init:function(){var a=this._element.parentNode;a?this._parentElement=a:this.setParentElement(a)},destroy:function(){this.setParent(null)},setParent:function(a){this._parent!=a&&(this._clearListeners(),this._parent=a,this._parent&&this._setupListeners(),this.events.emit("parentchange",{parent:a}))},getParent:function(){return this._parent},setParentElement:function(a){a!=this._parentElement&&(this._parentElement=a,a.appendChild(this._element),this.events.emit("parentelementchange",{parentElement:a}))},getParentElement:function(){return this._parentElement},getElement:function(){return this._element},setTitle:function(a){var b=this.options.get("title_element_selector"),c=this._element.querySelector(b);c?c.innerHTML=a:(c=document.createElement(b),c.innerHTML=a,this._element.insertBefore(c,this._element.firstChild))},getTitle:function(){var a=this.options.get("title_element_selector"),b=this._element.querySelector(a);return 
b?b.textContent:null},getData:function(a){var b=this._element;return b.dataset?b.dataset[a]:b.getAttribute("data-"+a)},getContent:function(){return this._element.innerHTML},_setupListeners:function(){this._slideListeners=this._parent.events.group().on("activate",this._onSlideActivate,this).on("deactivate",this._onSlideDeactivate,this),this._element.addEventListener("click",e(this,"_onSlideClick"),!1)},_clearListeners:function(){this._slideListeners&&this._slideListeners.offAll(),this._element.removeEventListener("click",e(this,"_onSlideClick"))},_onSlideActivate:function(){this._element.classList.add(this.options.get("active_classname"))},_onSlideDeactivate:function(){var a=this._element.classList;a.remove(this.options.get("active_classname")),a.add(this.options.get("visited_classname"))},_onSlideClick:function(){this.events.emit("click")}}),a(g)}),shower.modules.define("slide.layoutFactory",["slide.Layout","util.extend"],function(a,b,c){var d={};c(d,{createLayout:function(a){a=a||{};var e=d._createElement(c({content:"",contentType:"slide"},a));return new b(e)},_createElement:function(a){var b=document.createElement("section");return b.innerHTML=a.content,b.classList.add(a.contentType),b}}),a(d)}),shower.modules.define("util.bound",function(a){function b(a,b){return a["__bound_"+b]||(a["__bound_"+b]=a[b].bind(a))}a(b)}),shower.modules.define("util.extend",function(a){function b(a){if(!a)throw new Error("util.extend: Target not found");return"undefined"==typeof Object.assign?c.apply(null,arguments):Object.assign.apply(null,arguments)}function c(a){for(var b=1,c=arguments.length;b0&&(a.preventDefault(),this.prev())},_go:function(){for(var a=0,b=this._elements.length;awindow.innerWidth/2?c.player.next():c.player.prev()),d||f.activate())},_onTouchMove:function(a){this._shower.container.isSlideMode()&&a.preventDefault()},_getSlideByElement:function(a){for(var b=this._shower.getSlides(),c=null,d=0,e=b.length;d` of your presentation. 
- -## PDF - -Ribbon could be exported to PDF by printing it from the list mode in Chrome or Opera browsers. See [printing documentation](https://github.com/shower/shower/blob/master/docs/printing-en.md) for more options. - -## Development - -If you want to adjust theme for your needs: - -1. Fork this repository and clone it to your local machine. -2. Install dependencies: `npm install`. -3. Start a local server with watcher: `npm run dev` or just `gulp` if you have it installed globally. -4. Edit your files and see changes in the opened browser. - -To take part in Ribbon development please read [contributing guidelines](CONTRIBUTING.md) first and [file an issue](https://github.com/shower/shower/issues/new) before sending any pull request. - ---- -Licensed under [MIT License](LICENSE.md). diff --git a/doc/presentations/group_by/shower/themes/ribbon/index.html b/doc/presentations/group_by/shower/themes/ribbon/index.html deleted file mode 100644 index 98850917e05..00000000000 --- a/doc/presentations/group_by/shower/themes/ribbon/index.html +++ /dev/null @@ -1,304 +0,0 @@ - - - - Ribbon theme for Shower - - - - - - -
-

Presentation Title

-

Yours Truly, Famous Inc.

-
-
-

Slide Header

-

Echo Park 8-bit sustainable umami deep v Kickstarter. DIY cliche typewriter brunch, Odd Future sriracha pickled aesthetic. Farm-to-table bespoke fingerstache, kale chips umami brunch letterpress.

-

Whatever authentic disrupt, you probably haven't heard of them direct trade mlkshk Etsy. Gluten-free roof party plaid four loko quinoa.

-

Echo Park 8-bit sustainable umami deep v Kickstarter.

-
-
-

Inline Elements

-

Retro meh brunch aesthetic Cosby sweater Shoreditch. Banksy Tumblr sriracha, flexitarian pug chia master cleanse vinyl wayfarers fanny pack bespoke Helvetica roof party. Messenger bag retro cred Portland next level. Yr stumptown Schlitz Carles deep v small batch. Hella sustainable messenger bag, leggings skateboard literally1 bicycle rights H20 mumblecore banh mi DIY VHS. Semiotics four loko street art asymmetrical. Asymmetrical <paleo> you probably haven’t heard of.

-
-

Retro meh brunch aesthetic Cosby sweater Shoreditch. Banksy Tumblr sriracha, flexitarian pug chia master cleanse vinyl wayfarers fanny pack bespoke Helvetica roof party. Messenger bag retro cred Portland next level. Yr stumptown Schlitz Carles deep v small batch. Hella sustainable messenger bag, leggings skateboard literally1 bicycle rights H20 mumblecore banh mi DIY VHS. Semiotics four loko street art asymmetrical. Asymmetrical <paleo> you probably haven’t heard of.

-
-
-
-

Quotes

-
-

Flannel bicycle rights locavore selfies skateboard. Authentic fanny pack paleo four loko bespoke. Artisan tattooed chia XOXO ennui, lomo disrupt 8-bit art party Tumblr scenester.

-
-
-
-

Post-ironic fashion axe flexitarian, Tonx narwhal messenger bag Tumblr. Portland gentrify deep v kale chips literally.

-
-
Yours Truly
-
-
-
-

Nested Lists

-
    -
  1. Literally viral vegan, ugh drinking vinegar photo booth
  2. -
  3. Wes Anderson chillwave Marfa pour-over Etsy banh mi
  4. -
  5. Ethnic polaroid lo-fi iPhone ennui -
      -
    • Yr wayfarers before they sold out Kickstarter asymmetrical
    • -
    • Irony flexitarian readymade quinoa, kogi bespoke meggings narwhal
    • -
    • Skateboard Etsy twee artisan Echo Park
    • -
    -
  6. -
  7. Tonx kitsch fingerstache readymade, retro single-origin coffee
  8. -
-
-
-

Block Lists

-
    -
  • Retro meh brunch aesthetic Cosby sweater Shoreditch. Banksy Tumblr sriracha, flexitarian pug chia master cleanse vinyl wayfarers fanny pack.
  • -
  • Messenger bag retro cred Portland next level. Yr stumptown Schlitz Carles deep v small batch. Hella sustainable messenger bag.
  • -
  • Leggings skateboard literally1 bicycle rights H20 mumblecore banh mi DIY VHS. Semiotics four loko street art asymmetrical. Asymmetrical paleo you probably haven’t heard of.
  • -
-
-
-

Latin and Cyrillic List Bullets

-
    -
  • Occupy locavore blog, mustache you probably haven't heard of them
  • -
  • Skateboard pork belly aesthetic hoodie selfies brunch
  • -
  • Food truck gluten-free disrupt Portland
  • -
-
    -
  • Helvetica narwhal drinking vinegar chillwave, post-ironic ennui
  • -
  • Cray pug paleo retro, Echo Park narwhal Wes Anderson
  • -
  • Disrupt Williamsburg fixie, shabby chic bicycle rights hashtag kogi
  • -
-
-
-

Two Columns

-

Echo Park 8-bit sustainable umami deep v Kickstarter. DIY cliche typewriter brunch, Odd Future sriracha pickled aesthetic. Farm-to-table bespoke fingerstache, kale chips umami brunch American Apparel letterpress. Whatever authentic disrupt, you probably haven't heard of them direct trade mlkshk Etsy. Gluten-free roof party plaid American Apparel four loko quinoa. Echo Park 8-bit sustainable.

-
-
-

Three Columns

-

Echo Park 8-bit sustainable umami deep v Kickstarter. DIY cliche typewriter brunch, Odd Future sriracha pickled aesthetic. Farm-to-table bespoke fingerstache, kale chips umami brunch American Apparel letterpress. Whatever authentic disrupt, you probably haven't heard of them direct trade mlkshk Etsy. Gluten-free roof party plaid American Apparel four loko quinoa. Echo Park 8-bit sustainable.

-
-
-

Simple Table

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
GentrifyTweeArtisanBanksy
MessengerMixtapeSmall batchBicycle rights
MeggingsFreeganRetro biodieselSlow-carb
VeganOccupyNormcorePut a bird on it
Next levelSelfiesSustainableOrganic
UmamiAsymmetricalKeytarCraft beer
BiodieselHaven’t heardSkateboardFarm-to-table
-
-
-

Striped Table

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
GentrifyTweeArtisanBanksy
MessengerMixtapeSmall batchBicycle rights
MeggingsFreeganRetro biodieselSlow-carb
VeganOccupyNormcorePut a bird on it
Next levelSelfiesSustainableOrganic
UmamiAsymmetricalKeytarCraft beer
BiodieselHaven’t heardSkateboardFarm-to-table
-
-
-

Plain Code Listing

-
<html lang="en">
-<head> <!--Comment-->
-    <title>Shower</title>
-    <meta charset="UTF-8">
-    <link rel="stylesheet" href="screen.css">
-    <script src="script.js"></script>
-</head>
-
-
-

Numbered Code Listing

-
-            <html lang="en">
-            <head> <!--Comment-->
-                <title>Shower</title>
-                <meta charset="UTF-8">
-                <link rel="stylesheet" href="screen.css">
-                <script src="script.js"></script>
-            </head>
-        
-
-
-

Highlighted Code Lines

-
-            <html lang="en">
-            <head> <!--Comment-->
-                <title>Shower</title>
-                <meta charset="UTF-8">
-                <link rel="stylesheet" href="screen.css">
-                <script src="script.js"></script>
-            </head>
-        
-
-
-

Hidden Code Steps

-
-            <html lang="en">
-            <head> <!--Comment-->
-                <title>Shower</title>
-                <meta charset="UTF-8">
-                <link rel="stylesheet" href="screen.css">
-                <script src="script.js"></script>
-            </head>
-        
-
-
-

Highlighted Code Steps

-
-            <html lang="en">
-            <head> <!--Comment-->
-                <title>Shower</title>
-                <meta charset="UTF-8">
-                <link rel="stylesheet" href="screen.css">
-                <script src="script.js"></script>
-            </head>
-        
-
-
-

Grid Slide

-
-
-

White Slide

-
-
-

Black Slide

-
-
-

Shout

-
-
-

Multiline
Shout

-
-
-

Linked Shout

-
-
-

Growing Shout

-
-
-

Shrinking Shout

-
-
-
- -
Copyright © 2016 Yours Truly, Famous Inc.
-
-
-
- -
-
- -
-
- - - - - - - - - -
-
-

Timer

-
-
-

List Navigation

-
    -
  1. Ennui keffiyeh thundercats
  2. - - - - -
-

Before they sold out master

-
-
- - - - - diff --git a/doc/presentations/group_by/shower/themes/ribbon/pictures/canvas.png b/doc/presentations/group_by/shower/themes/ribbon/pictures/canvas.png deleted file mode 100644 index 6ddd30154f2..00000000000 Binary files a/doc/presentations/group_by/shower/themes/ribbon/pictures/canvas.png and /dev/null differ diff --git a/doc/presentations/group_by/shower/themes/ribbon/pictures/exact.png b/doc/presentations/group_by/shower/themes/ribbon/pictures/exact.png deleted file mode 100644 index b27251c57cb..00000000000 Binary files a/doc/presentations/group_by/shower/themes/ribbon/pictures/exact.png and /dev/null differ diff --git a/doc/presentations/group_by/shower/themes/ribbon/pictures/square.png b/doc/presentations/group_by/shower/themes/ribbon/pictures/square.png deleted file mode 100644 index 62cb2384a5f..00000000000 Binary files a/doc/presentations/group_by/shower/themes/ribbon/pictures/square.png and /dev/null differ diff --git a/doc/presentations/group_by/shower/themes/ribbon/pictures/tall.png b/doc/presentations/group_by/shower/themes/ribbon/pictures/tall.png deleted file mode 100644 index fbc9f09a2ab..00000000000 Binary files a/doc/presentations/group_by/shower/themes/ribbon/pictures/tall.png and /dev/null differ diff --git a/doc/presentations/group_by/shower/themes/ribbon/pictures/wide.png b/doc/presentations/group_by/shower/themes/ribbon/pictures/wide.png deleted file mode 100644 index 1e83b0ac7ad..00000000000 Binary files a/doc/presentations/group_by/shower/themes/ribbon/pictures/wide.png and /dev/null differ diff --git a/doc/presentations/group_by/shower/themes/ribbon/styles/presentation_links.html b/doc/presentations/group_by/shower/themes/ribbon/styles/presentation_links.html deleted file mode 100644 index 74981d8eac5..00000000000 --- a/doc/presentations/group_by/shower/themes/ribbon/styles/presentation_links.html +++ /dev/null @@ -1,2 +0,0 @@ -http://www.slideshare.net/AlexeyMilovidov1/clickhouse-69616890/AlexeyMilovidov1/clickhouse-69616890 
-file:///home/milovidov/work/Presentation/shower/index.html#cover diff --git a/doc/presentations/group_by/shower/themes/ribbon/styles/screen-16x10.css b/doc/presentations/group_by/shower/themes/ribbon/styles/screen-16x10.css deleted file mode 100644 index 5bd31e4f4c4..00000000000 --- a/doc/presentations/group_by/shower/themes/ribbon/styles/screen-16x10.css +++ /dev/null @@ -1,204 +0,0 @@ -/** - * Ribbon theme for Shower HTML presentation engine - * shower-ribbon v2.0.8, https://github.com/shower/ribbon - * @copyright 2010–2016 Vadim Makeev, http://pepelsbey.net/ - * @license MIT - */ -@charset "UTF-8"; - - @font-face { - font-family: 'Yandex Sans Text Web'; - src: url(https://yastatic.net/adv-www/_/yy5JveR58JFkc97waf-xp0i6_jM.eot); - src: url(https://yastatic.net/adv-www/_/yy5JveR58JFkc97waf-xp0i6_jM.eot?#iefix) format('embedded-opentype'), - url(https://yastatic.net/adv-www/_/CYblzLEXzCqQIvrYs7QKQe2omRk.woff2) format('woff2'), - url(https://yastatic.net/adv-www/_/pUcnOdRwl83MvPPzrNomhyletnA.woff) format('woff'), - url(https://yastatic.net/adv-www/_/vNFEmXOcGYKJ4AAidUprHWoXrLU.ttf) format('truetype'), - url(https://yastatic.net/adv-www/_/0w7OcWZM_QLP8x-LQUXFOgXO6dE.svg#YandexSansTextWeb-Bold) format('svg'); - font-weight: 700; - font-style: normal; - font-stretch: normal - } - - @font-face { - font-family: 'Yandex Sans Text Web'; - src: url(https://yastatic.net/adv-www/_/LI6l3L2RqcgxBe2pXmuUha37czQ.eot); - src: url(https://yastatic.net/adv-www/_/LI6l3L2RqcgxBe2pXmuUha37czQ.eot?#iefix) format('embedded-opentype'), - url(https://yastatic.net/adv-www/_/z3MYElcut0R2MF_Iw1RDNrstgYs.woff2) format('woff2'), - url(https://yastatic.net/adv-www/_/1jvKJ_-hCXl3s7gmFl-y_-UHTaI.woff) format('woff'), - url(https://yastatic.net/adv-www/_/9nzjfpCR2QHvK1EzHpDEIoVFGuY.ttf) format('truetype'), - url(https://yastatic.net/adv-www/_/gwyBTpxSwkFCF1looxqs6JokKls.svg#YandexSansTextWeb-Regular) format('svg'); - font-weight: 400; - font-style: normal; - font-stretch: normal - } - - @font-face 
{ - font-family: 'Yandex Sans Text Web'; - src: url(https://yastatic.net/adv-www/_/ayAFYoY8swgBLhq_I56tKj2JftU.eot); - src: url(https://yastatic.net/adv-www/_/ayAFYoY8swgBLhq_I56tKj2JftU.eot?#iefix) format('embedded-opentype'), - url(https://yastatic.net/adv-www/_/lGQcYklLVV0hyvz1HFmFsUTj8_0.woff2) format('woff2'), - url(https://yastatic.net/adv-www/_/f0AAJ9GJ4iiwEmhG-7PWMHk6vUY.woff) format('woff'), - url(https://yastatic.net/adv-www/_/4UDe4nlVvgEJ-VmLWNVq3SxCsA.ttf) format('truetype'), - url(https://yastatic.net/adv-www/_/EKLr1STNokPqxLAQa_RyN82pL98.svg#YandexSansTextWeb-Light) format('svg'); - font-weight: 300; - font-style: normal; - font-stretch: normal - } - - @font-face { - font-family: 'Yandex Sans Display Web'; - src: url(https://yastatic.net/adv-www/_/H63jN0veW07XQUIA2317lr9UIm8.eot); - src: url(https://yastatic.net/adv-www/_/H63jN0veW07XQUIA2317lr9UIm8.eot?#iefix) format('embedded-opentype'), - url(https://yastatic.net/adv-www/_/sUYVCPUAQE7ExrvMS7FoISoO83s.woff2) format('woff2'), - url(https://yastatic.net/adv-www/_/v2Sve_obH3rKm6rKrtSQpf-eB7U.woff) format('woff'), - url(https://yastatic.net/adv-www/_/PzD8hWLMunow5i3RfJ6WQJAL7aI.ttf) format('truetype'), - url(https://yastatic.net/adv-www/_/lF_KG5g4tpQNlYIgA0e77fBSZ5s.svg#YandexSansDisplayWeb-Regular) format('svg'); - font-weight: 400; - font-style: normal; - font-stretch: normal - } - - @font-face { - font-family: 'Yandex Sans Display Web'; - src: url(https://yastatic.net/adv-www/_/g8_MyyKVquSZ3xEL6tarK__V9Vw.eot); - src: url(https://yastatic.net/adv-www/_/g8_MyyKVquSZ3xEL6tarK__V9Vw.eot?#iefix) format('embedded-opentype'), - url(https://yastatic.net/adv-www/_/LGiRvlfqQHlWR9YKLhsw5e7KGNA.woff2) format('woff2'), - url(https://yastatic.net/adv-www/_/40vXwNl4eYYMgteIVgLP49dwmfc.woff) format('woff'), - url(https://yastatic.net/adv-www/_/X6zG5x_wO8-AtwJ-vDLJcKC5228.ttf) format('truetype'), - url(https://yastatic.net/adv-www/_/ZKhaR0m08c8CRRL77GtFKoHcLYA.svg#YandexSansDisplayWeb-Light) format('svg'); - 
font-weight: 300; - font-style: normal; - font-stretch: normal - } - -*,::after,::before{box-sizing:border-box} -a,abbr,acronym,address,applet,article,aside,audio,b,big,blockquote,body,canvas,caption,center,cite,code,dd,del,details,dfn,div,dl,dt,em,embed,fieldset,figcaption,figure,footer,form,h1,h2,h3,h4,h5,h6,header,hgroup,html,i,iframe,img,ins,kbd,label,legend,li,mark,menu,nav,object,ol,output,p,pre,q,ruby,s,samp,section,small,span,strike,strong,sub,summary,sup,table,tbody,td,tfoot,th,thead,time,tr,tt,u,ul,var,video{margin:0;padding:0;border:0;font:inherit;vertical-align:baseline} -article,aside,details,figcaption,figure,footer,header,hgroup,main,menu,nav,section{display:block} -.caption p,body{line-height:1} -p {line-height: 1; white-space: pre;} -ol,ul{list-style:none} -blockquote,q{quotes:none} -blockquote::after,blockquote::before,q::after,q::before{content:none} -table{border-collapse:collapse;border-spacing:0} -a{text-decoration:none} -@page{margin:0;size:1024px 640px} -.shower{color:#000;counter-reset:slide;font:25px/2 Yandex Sans Display Web,sans-serif;-webkit-print-color-adjust:exact;-webkit-text-size-adjust:none;-moz-text-size-adjust:none;-ms-text-size-adjust:none} -@media print{.shower{text-rendering:geometricPrecision} -} -.caption{font-size:25px;display:none;margin-top:-.2em;padding:0 1em .93em;width:100%;color:#3c3d40;text-shadow:0 1px 0 #8d8e90} -@media (min-width:1174px){.caption{font-size:50px} -} -@media (min-width:2348px){.caption{font-size:100px} -} -.caption h1{padding-bottom:.15em;font:1em/2 Yandex Sans Display Web,sans-serif} -.caption p{font-size:.6em} -.caption a{color:#4b86c2;text-shadow:0 -1px 0 #1f3f60} -.slide{position:relative;z-index:1;overflow:hidden;padding:20px 100px 0;width:1024px;height:640px;background:#fff;font-size:25px} - -/*.slide::after{position:absolute;top:0;right:100px;padding-top:15px;width:50px;height:100px;background:url(../images/ribbon.svg) 
no-repeat;color:#fff;counter-increment:slide;content:counter(slide);text-align:center}*/ - -.slide h1{vertical-align:middle; color:#000;font:400 50px/2 Yandex Sans Display Web,sans-serif} -.slide h2{margin-bottom:34px;color:#000;font:400 50px/2 Yandex Sans Display Web,sans-serif} -.slide p{margin-bottom:1em} -.slide p.note{color:#979a9e} -.slide a{background:-webkit-linear-gradient(bottom,currentColor .09em,transparent .09em) repeat-x;background:linear-gradient(to top,currentColor .09em,transparent .09em) repeat-x;color:#4b86c2} -.slide b,.slide strong{font-weight:700} -.slide blockquote,.slide dfn,.slide em,.slide i{font-style:italic} -.slide code,.slide kbd,.slide mark,.slide samp{padding:.1em .3em;border-radius:.2em} -.slide code,.slide kbd,.slide samp{background:rgba(88,90,94,.1);line-height:1;font-family:PT Mono,monospace,monospace} -.slide mark{background:#fafaa2} -.slide sub,.slide sup{position:relative;line-height:0;font-size:75%} -.slide sub{bottom:-.25em} -.slide sup{top:-.5em} -.slide blockquote::before{position:absolute;margin:-.15em 0 0 -.43em;color:#ccc;line-height:1;font-size:8em;content:'\201C'} -.slide blockquote+figcaption{margin:-1em 0 1em;font-style:italic;font-weight:700} -.slide ol,.slide ul{margin-bottom:0em;counter-reset:list} -.slide ol li,.slide ul li{page-break-inside:avoid;text-indent:-2em} -.slide ol li::before,.slide ul li::before{display:inline-block;width:2em;color:#979a9e;text-align:right} -.slide ol ol,.slide ol ul,.slide ul ol,.slide ul ul{margin-bottom:0;margin-left:2em} -.slide ul>li::before{padding-right:.5em;content:'•'} -.slide ul>li:lang(ru)::before{content:'—'} -.slide ol>li::before{padding-right:.4em;counter-increment:list;content:counter(list) "."} -.slide table{margin-left:-100px;margin-bottom:1em;width:calc(100% + 100px + 100px)} -.slide table td:first-child,.slide table th:first-child{padding-left:96px} -.slide table td:last-child,.slide table th:last-child{padding-right:96px} -.slide table 
th{text-align:left;font-weight:700} -.slide table tr:not(:last-of-type)>*{background:-webkit-linear-gradient(bottom,rgba(88,90,94,.5) .055em,transparent .055em) repeat-x;background:linear-gradient(to top,rgba(88,90,94,.5) .055em,transparent .055em) repeat-x} -.slide table.striped tr:nth-child(even){background:rgba(88,90,94,.1)} -.slide table.striped tr>*{background-image:none} -.slide pre{margin-bottom:1em;counter-reset:code;white-space:normal} -.slide pre code{display:block;margin-left:-100px;padding:0 0 0 100px;width:calc(100% + 100px + 100px);border-radius:0;background:0 0;line-height:2;white-space:pre;-moz-tab-size:4;-o-tab-size:4;tab-size:4} -.slide pre code:not(:only-child).mark{background:rgba(88,90,94,.1)} -.slide pre code:not(:only-child)::before{position:absolute;margin-left:-2em;color:#979a9e;counter-increment:code;content:counter(code,decimal-leading-zero) "."} -.slide pre mark{position:relative;z-index:-1;margin:0 -.3em} -.slide pre mark.important{background:#c00;color:#fff} -.slide pre .comment{color:#999} -.slide footer{position:absolute;right:0;bottom:-640px;left:0;display:none;padding:41px 100px 8px;background:#fbfbba;box-shadow:0 1px 0 #fafaa2 inset;-webkit-transition:bottom .3s;transition:bottom .3s} -.slide footer mark{background:rgba(255,255,255,.8)} -.slide:hover>footer{bottom:0} -.slide.grid{background-image:url(../images/grid.png);-ms-interpolation-mode:nearest-neighbor;image-rendering:-webkit-optimize-contrast;image-rendering:-moz-crisp-edges;image-rendering:pixelated} -@media (-webkit-min-device-pixel-ratio:2),(min-resolution:2dppx){.slide.grid{background-image:url(../images/grid@2x.png);background-size:1024px auto} -} -.slide.black{background-color:#000} -.slide.black::after,.slide.white::after{visibility:hidden} -.slide.white{background-color:#fff} -.slide .double,.slide .triple{-webkit-column-gap:75px;-moz-column-gap:75px;column-gap:75px;-webkit-hyphens:auto;-ms-hyphens:auto;hyphens:auto} -.slide 
.double{-webkit-column-count:2;-moz-column-count:2;column-count:2} -.slide .triple{-webkit-column-count:3;-moz-column-count:3;column-count:3} -.slide .shout{position:absolute;top:50%;left:0;width:100%;text-align:center;line-height:1;font-size:150px;-webkit-transform:translateY(-50%);transform:translateY(-50%)} -.slide .shout a{background:-webkit-linear-gradient(bottom,currentColor .11em,transparent .11em) repeat-x;background:linear-gradient(to top,currentColor .11em,transparent .11em) repeat-x} -.slide .cover{z-index:-1;max-width:100%;max-height:100%} -.slide .cover.w,.slide .cover.width{width:100%;max-height:none} -.slide .cover.h,.slide .cover.height{height:100%;max-width:none} -.slide .cover+figcaption{position:absolute;bottom:20px;right:10px;font-size:12px;opacity:.7;-webkit-transform-origin:0 100%;transform-origin:0 100%;-webkit-transform:translateX(100%) rotate(-90deg);transform:translateX(100%) rotate(-90deg)} -.slide .cover+figcaption.white{color:#fff} -.slide .cover+figcaption a{color:currentcolor} -.slide .cover,.slide .place{position:absolute;top:50%;left:50%;-webkit-transform:translate(-50%,-50%);transform:translate(-50%,-50%)} -.slide .place.b.l,.slide .place.b.r,.slide .place.bottom.left,.slide .place.bottom.right,.slide .place.t.l,.slide .place.t.r,.slide .place.top.left,.slide .place.top.right{-webkit-transform:none;transform:none} -.slide .place.b,.slide .place.bottom,.slide .place.t,.slide .place.top{-webkit-transform:translate(-50%,0);transform:translate(-50%,0)} -.slide .place.l,.slide .place.left,.slide .place.r,.slide .place.right{-webkit-transform:translate(0,-50%);transform:translate(0,-50%)} -.slide .place.t,.slide .place.t.r,.slide .place.top,.slide .place.top.left,.slide .place.top.right{top:0} -.slide .place.r,.slide .place.right{right:0;left:auto} -.slide .place.b,.slide .place.b.l,.slide .place.b.r,.slide .place.bottom,.slide .place.bottom.left,.slide .place.bottom.right{top:auto;bottom:0} -.slide .place.l,.slide .place.left{left:0} 
-.progress{left:-20px;bottom:0;z-index:1;display:none;width:0;height:0;box-sizing:content-box;border:10px solid #4b86c2;border-right-color:transparent;-webkit-transition:width .2s linear;transition:width .2s linear;clip:rect(10px,1044px,20px,20px)} -.progress[style*='100%']{padding-left:10px} -.badge,.badge a,.progress{position:absolute} -.badge{font-size:10px;top:0;z-index:1;overflow:hidden;display:none;width:9em;height:9em;right:0;visibility:hidden} -@media (min-width:1174px){.badge{font-size:20px} -} -@media (min-width:2348px){.badge{font-size:40px} -} -.badge a{right:-50%;bottom:50%;left:-50%;visibility:visible;background:#4b86c2;color:#fff;text-align:center;line-height:2;-webkit-transform-origin:50% 100%;transform-origin:50% 100%;-webkit-transform:rotate(45deg);transform:rotate(45deg)} -.region{display:none} -@media screen{.shower.list{padding-top:25px;width:100%;display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-flex-wrap:wrap;-ms-flex-wrap:wrap;flex-wrap:wrap;background:#585a5e;position:absolute;clip:rect(0,auto,auto,0)} -} -@media screen and (min-width:1174px){.shower.list{padding-top:50px} -} -@media screen and (min-width:2348px){.shower.list{padding-top:100px} -} -@media screen{.shower.list .caption{display:block} -.shower.list .slide{-webkit-transform-origin:0 0;transform-origin:0 0;margin:0 -768px -455px 25px;-webkit-transform:scale(.25);transform:scale(.25);border-radius:2px;box-shadow:0 20px 50px rgba(0,0,0,.3)} -} -@media screen and (min-width:1174px){.shower.list .slide{margin:0 -512px -270px 50px;-webkit-transform:scale(.5);transform:scale(.5)} -} -@media screen and (min-width:2348px){.shower.list .slide{margin:0 0 100px 100px;-webkit-transform:scale(1);transform:scale(1)} -} -@media screen{.shower.list .slide:hover{box-shadow:0 0 0 20px rgba(0,0,0,.1),0 20px 50px rgba(0,0,0,.3)} -.shower.list .slide:target{box-shadow:0 0 0 1px #376da3,0 0 0 20px #4b86c2,0 20px 50px rgba(0,0,0,.3)} -.shower.list .slide 
*{pointer-events:none} -.shower.list .badge,.shower.list .slide footer{display:block} -.shower.full{position:absolute;top:50%;left:50%;overflow:hidden;margin:-320px 0 0 -512px;width:1024px;height:640px;background:#000} -.shower.full .slide{position:absolute;top:0;left:0;margin-left:-150%;visibility:hidden} -.shower.full .slide:target{margin:0;visibility:visible} -.shower.full .slide pre code:not(:only-child).mark.next{visibility:visible;background:0 0} -.shower.full .slide pre code:not(:only-child).mark.next.active{background:rgba(88,90,94,.1)} -.shower.full .slide .next{visibility:hidden} -.shower.full .slide .next.active{visibility:visible} -.shower.full .slide .shout.grow,.shower.full .slide .shout.shrink{opacity:0;-webkit-transition:.4s ease-out;transition:.4s ease-out;-webkit-transition-property:opacity,-webkit-transform;transition-property:opacity,transform;transition-property:opacity,transform,-webkit-transform} -.shower.full .slide .shout.grow{-webkit-transform:scale(.1) translateY(-50%);transform:scale(.1) translateY(-50%)} -.shower.full .slide .shout.shrink{-webkit-transform:scale(10) translateY(-50%);transform:scale(10) translateY(-50%)} -.shower.full .slide:target .shout.grow,.shower.full .slide:target .shout.shrink{opacity:1;-webkit-transform:scale(1) translateY(-50%);transform:scale(1) translateY(-50%)} -.shower.full .progress{display:block;-webkit-transform:translateZ(0);transform:translateZ(0)} -.shower.full .region{position:absolute;clip:rect(0 0 0 0);overflow:hidden;margin:-1px;padding:0;width:1px;height:1px;border:none;display:block} -} diff --git a/doc/presentations/group_by/shower/themes/ribbon/styles/screen-4x3.css b/doc/presentations/group_by/shower/themes/ribbon/styles/screen-4x3.css deleted file mode 100644 index 6648b972c30..00000000000 --- a/doc/presentations/group_by/shower/themes/ribbon/styles/screen-4x3.css +++ /dev/null @@ -1,7 +0,0 @@ -/** - * Ribbon theme for Shower HTML presentation engine - * shower-ribbon v2.0.8, 
https://github.com/shower/ribbon - * @copyright 2010–2016 Vadim Makeev, http://pepelsbey.net/ - * @license MIT - */ -@charset "UTF-8";@font-face{font-family:PT Sans;src:url(../fonts/pt-sans-regular.woff) format("woff")}@font-face{font-weight:700;font-family:PT Sans;src:url(../fonts/pt-sans-bold.woff) format("woff")}@font-face{font-style:italic;font-family:PT Sans;src:url(../fonts/pt-sans-italic.woff) format("woff")}@font-face{font-style:italic;font-weight:700;font-family:PT Sans;src:url(../fonts/pt-sans-bold-italic.woff) format("woff")}@font-face{font-family:PT Sans Narrow;font-weight:700;src:url(../fonts/pt-sans-narrow-bold.woff) format("woff")}@font-face{font-family:PT Mono;src:url(../fonts/pt-mono-regular.woff) format("woff")}*,::after,::before{box-sizing:border-box}a,abbr,acronym,address,applet,article,aside,audio,b,big,blockquote,body,canvas,caption,center,cite,code,dd,del,details,dfn,div,dl,dt,em,embed,fieldset,figcaption,figure,footer,form,h1,h2,h3,h4,h5,h6,header,hgroup,html,i,iframe,img,ins,kbd,label,legend,li,mark,menu,nav,object,ol,output,p,pre,q,ruby,s,samp,section,small,span,strike,strong,sub,summary,sup,table,tbody,td,tfoot,th,thead,time,tr,tt,u,ul,var,video{margin:0;padding:0;border:0;font:inherit;vertical-align:baseline}article,aside,details,figcaption,figure,footer,header,hgroup,main,menu,nav,section{display:block}.caption p,body{line-height:1}ol,ul{list-style:none}blockquote,q{quotes:none}blockquote::after,blockquote::before,q::after,q::before{content:none}table{border-collapse:collapse;border-spacing:0}a{text-decoration:none}@page{margin:0;size:1024px 768px}.shower{color:#000;counter-reset:slide;font:25px/2 PT Sans,sans-serif;-webkit-print-color-adjust:exact;-webkit-text-size-adjust:none;-moz-text-size-adjust:none;-ms-text-size-adjust:none}@media print{.shower{text-rendering:geometricPrecision}}.caption{font-size:25px;display:none;margin-top:-.2em;padding:0 1em .93em;width:100%;color:#3c3d40;text-shadow:0 1px 0 #8d8e90}@media 
(min-width:1174px){.caption{font-size:50px}}@media (min-width:2348px){.caption{font-size:100px}}.caption h1{padding-bottom:.15em;font:700 1em/1 PT Sans Narrow,sans-serif}.caption p{font-size:.6em}.caption a{color:#4b86c2;text-shadow:0 -1px 0 #1f3f60}.slide{position:relative;z-index:1;overflow:hidden;padding:106px 100px 0;width:1024px;height:768px;background:#fff;font-size:25px}.slide::after{position:absolute;top:0;right:100px;padding-top:15px;width:50px;height:100px;background:url(../images/ribbon.svg) no-repeat;color:#fff;counter-increment:slide;content:counter(slide);text-align:center}.slide h2{margin-bottom:34px;color:#585a5e;font:700 50px/1 PT Sans Narrow,sans-serif}.slide p{margin-bottom:1em}.slide p.note{color:#979a9e}.slide a{background:-webkit-linear-gradient(bottom,currentColor .09em,transparent .09em) repeat-x;background:linear-gradient(to top,currentColor .09em,transparent .09em) repeat-x;color:#4b86c2}.slide b,.slide strong{font-weight:700}.slide blockquote,.slide dfn,.slide em,.slide i{font-style:italic}.slide code,.slide kbd,.slide mark,.slide samp{padding:.1em .3em;border-radius:.2em}.slide code,.slide kbd,.slide samp{background:rgba(88,90,94,.1);line-height:1;font-family:PT Mono,monospace,monospace}.slide mark{background:#fafaa2}.slide sub,.slide sup{position:relative;line-height:0;font-size:75%}.slide sub{bottom:-.25em}.slide sup{top:-.5em}.slide blockquote::before{position:absolute;margin:-.15em 0 0 -.43em;color:#ccc;line-height:1;font-size:8em;content:'\201C'}.slide blockquote+figcaption{margin:-1em 0 1em;font-style:italic;font-weight:700}.slide ol,.slide ul{margin-bottom:1em;counter-reset:list}.slide ol li,.slide ul li{page-break-inside:avoid;text-indent:-2em}.slide ol li::before,.slide ul li::before{display:inline-block;width:2em;color:#979a9e;text-align:right}.slide ol ol,.slide ol ul,.slide ul ol,.slide ul ul{margin-bottom:0;margin-left:2em}.slide ul>li::before{padding-right:.5em;content:'•'}.slide ul>li:lang(ru)::before{content:'—'}.slide 
ol>li::before{padding-right:.4em;counter-increment:list;content:counter(list) "."}.slide table{margin-left:-100px;margin-bottom:1em;width:calc(100% + 100px + 100px)}.slide table td:first-child,.slide table th:first-child{padding-left:96px}.slide table td:last-child,.slide table th:last-child{padding-right:96px}.slide table th{text-align:left;font-weight:700}.slide table tr:not(:last-of-type)>*{background:-webkit-linear-gradient(bottom,rgba(88,90,94,.5) .055em,transparent .055em) repeat-x;background:linear-gradient(to top,rgba(88,90,94,.5) .055em,transparent .055em) repeat-x}.slide table.striped tr:nth-child(even){background:rgba(88,90,94,.1)}.slide table.striped tr>*{background-image:none}.slide pre{margin-bottom:1em;counter-reset:code;white-space:normal}.slide pre code{display:block;margin-left:-100px;padding:0 0 0 100px;width:calc(100% + 100px + 100px);border-radius:0;background:0 0;line-height:2;white-space:pre;-moz-tab-size:4;-o-tab-size:4;tab-size:4}.slide pre code:not(:only-child).mark{background:rgba(88,90,94,.1)}.slide pre code:not(:only-child)::before{position:absolute;margin-left:-2em;color:#979a9e;counter-increment:code;content:counter(code,decimal-leading-zero) "."}.slide pre mark{position:relative;z-index:-1;margin:0 -.3em}.slide pre mark.important{background:#c00;color:#fff}.slide pre .comment{color:#999}.slide footer{position:absolute;right:0;bottom:-768px;left:0;display:none;padding:41px 100px 8px;background:#fbfbba;box-shadow:0 1px 0 #fafaa2 inset;-webkit-transition:bottom .3s;transition:bottom .3s}.slide footer mark{background:rgba(255,255,255,.8)}.slide:hover>footer{bottom:0}.slide.grid{background-image:url(../images/grid.png);-ms-interpolation-mode:nearest-neighbor;image-rendering:-webkit-optimize-contrast;image-rendering:-moz-crisp-edges;image-rendering:pixelated}@media (-webkit-min-device-pixel-ratio:2),(min-resolution:2dppx){.slide.grid{background-image:url(../images/grid@2x.png);background-size:1024px 
auto}}.slide.black{background-color:#000}.slide.black::after,.slide.white::after{visibility:hidden}.slide.white{background-color:#fff}.slide .double,.slide .triple{-webkit-column-gap:75px;-moz-column-gap:75px;column-gap:75px;-webkit-hyphens:auto;-ms-hyphens:auto;hyphens:auto}.slide .double{-webkit-column-count:2;-moz-column-count:2;column-count:2}.slide .triple{-webkit-column-count:3;-moz-column-count:3;column-count:3}.slide .shout{position:absolute;top:50%;left:0;width:100%;text-align:center;line-height:1;font-size:150px;-webkit-transform:translateY(-50%);transform:translateY(-50%)}.slide .shout a{background:-webkit-linear-gradient(bottom,currentColor .11em,transparent .11em) repeat-x;background:linear-gradient(to top,currentColor .11em,transparent .11em) repeat-x}.slide .cover{z-index:-1;max-width:100%;max-height:100%}.slide .cover.w,.slide .cover.width{width:100%;max-height:none}.slide .cover.h,.slide .cover.height{height:100%;max-width:none}.slide .cover+figcaption{position:absolute;bottom:20px;right:10px;font-size:12px;opacity:.7;-webkit-transform-origin:0 100%;transform-origin:0 100%;-webkit-transform:translateX(100%) rotate(-90deg);transform:translateX(100%) rotate(-90deg)}.slide .cover+figcaption.white{color:#fff}.slide .cover+figcaption a{color:currentcolor}.slide .cover,.slide .place{position:absolute;top:50%;left:50%;-webkit-transform:translate(-50%,-50%);transform:translate(-50%,-50%)}.slide .place.b.l,.slide .place.b.r,.slide .place.bottom.left,.slide .place.bottom.right,.slide .place.t.l,.slide .place.t.r,.slide .place.top.left,.slide .place.top.right{-webkit-transform:none;transform:none}.slide .place.b,.slide .place.bottom,.slide .place.t,.slide .place.top{-webkit-transform:translate(-50%,0);transform:translate(-50%,0)}.slide .place.l,.slide .place.left,.slide .place.r,.slide .place.right{-webkit-transform:translate(0,-50%);transform:translate(0,-50%)}.slide .place.t,.slide .place.t.r,.slide .place.top,.slide .place.top.left,.slide 
.place.top.right{top:0}.slide .place.r,.slide .place.right{right:0;left:auto}.slide .place.b,.slide .place.b.l,.slide .place.b.r,.slide .place.bottom,.slide .place.bottom.left,.slide .place.bottom.right{top:auto;bottom:0}.slide .place.l,.slide .place.left{left:0}.progress{left:-20px;bottom:0;z-index:1;display:none;width:0;height:0;box-sizing:content-box;border:10px solid #4b86c2;border-right-color:transparent;-webkit-transition:width .2s linear;transition:width .2s linear;clip:rect(10px,1044px,20px,20px)}.progress[style*='100%']{padding-left:10px}.badge,.badge a,.progress{position:absolute}.badge{font-size:10px;top:0;z-index:1;overflow:hidden;display:none;width:9em;height:9em;right:0;visibility:hidden}@media (min-width:1174px){.badge{font-size:20px}}@media (min-width:2348px){.badge{font-size:40px}}.badge a{right:-50%;bottom:50%;left:-50%;visibility:visible;background:#4b86c2;color:#fff;text-align:center;line-height:2;-webkit-transform-origin:50% 100%;transform-origin:50% 100%;-webkit-transform:rotate(45deg);transform:rotate(45deg)}.region{display:none}@media screen{.shower.list{padding-top:25px;width:100%;display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-flex-wrap:wrap;-ms-flex-wrap:wrap;flex-wrap:wrap;background:#585a5e;position:absolute;clip:rect(0,auto,auto,0)}}@media screen and (min-width:1174px){.shower.list{padding-top:50px}}@media screen and (min-width:2348px){.shower.list{padding-top:100px}}@media screen{.shower.list .caption{display:block}.shower.list .slide{-webkit-transform-origin:0 0;transform-origin:0 0;margin:0 -768px -551px 25px;-webkit-transform:scale(.25);transform:scale(.25);border-radius:2px;box-shadow:0 20px 50px rgba(0,0,0,.3)}}@media screen and (min-width:1174px){.shower.list .slide{margin:0 -512px -334px 50px;-webkit-transform:scale(.5);transform:scale(.5)}}@media screen and (min-width:2348px){.shower.list .slide{margin:0 0 100px 100px;-webkit-transform:scale(1);transform:scale(1)}}@media screen{.shower.list 
.slide:hover{box-shadow:0 0 0 20px rgba(0,0,0,.1),0 20px 50px rgba(0,0,0,.3)}.shower.list .slide:target{box-shadow:0 0 0 1px #376da3,0 0 0 20px #4b86c2,0 20px 50px rgba(0,0,0,.3)}.shower.list .slide *{pointer-events:none}.shower.list .badge,.shower.list .slide footer{display:block}.shower.full{position:absolute;top:50%;left:50%;overflow:hidden;margin:-384px 0 0 -512px;width:1024px;height:768px;background:#000}.shower.full .slide{position:absolute;top:0;left:0;margin-left:-150%;visibility:hidden}.shower.full .slide:target{margin:0;visibility:visible}.shower.full .slide pre code:not(:only-child).mark.next{visibility:visible;background:0 0}.shower.full .slide pre code:not(:only-child).mark.next.active{background:rgba(88,90,94,.1)}.shower.full .slide .next{visibility:hidden}.shower.full .slide .next.active{visibility:visible}.shower.full .slide .shout.grow,.shower.full .slide .shout.shrink{opacity:0;-webkit-transition:.4s ease-out;transition:.4s ease-out;-webkit-transition-property:opacity,-webkit-transform;transition-property:opacity,transform;transition-property:opacity,transform,-webkit-transform}.shower.full .slide .shout.grow{-webkit-transform:scale(.1) translateY(-50%);transform:scale(.1) translateY(-50%)}.shower.full .slide .shout.shrink{-webkit-transform:scale(10) translateY(-50%);transform:scale(10) translateY(-50%)}.shower.full .slide:target .shout.grow,.shower.full .slide:target .shout.shrink{opacity:1;-webkit-transform:scale(1) translateY(-50%);transform:scale(1) translateY(-50%)}.shower.full .progress{display:block;-webkit-transform:translateZ(0);transform:translateZ(0)}.shower.full .region{position:absolute;clip:rect(0 0 0 0);overflow:hidden;margin:-1px;padding:0;width:1px;height:1px;border:none;display:block}} \ No newline at end of file diff --git a/doc/presentations/meetup3/LICENSE.md b/doc/presentations/meetup3/LICENSE.md deleted file mode 100644 index bd3449d3576..00000000000 --- a/doc/presentations/meetup3/LICENSE.md +++ /dev/null @@ -1,21 +0,0 @@ 
-# The MIT License - -Copyright © 2010–2015 Vadim Makeev, http://pepelsbey.net/ - -Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - ---- - -# Лицензия MIT - -Copyright © 2010–2015 Вадим Макеев, http://pepelsbey.net/ - -Данная лицензия разрешает лицам, получившим копию данного программного обеспечения и сопутствующей документации (в дальнейшем именуемыми «Программное Обеспечение»), безвозмездно использовать Программное Обеспечение без ограничений, включая неограниченное право на использование, копирование, изменение, добавление, публикацию, распространение, сублицензирование и/или продажу копий Программного Обеспечения, также как и лицам, которым предоставляется данное Программное Обеспечение, при соблюдении следующих условий: - -Указанное выше уведомление об авторском праве и данные условия должны быть включены во все копии или значимые части данного Программного Обеспечения. 
- -ДАННОЕ ПРОГРАММНОЕ ОБЕСПЕЧЕНИЕ ПРЕДОСТАВЛЯЕТСЯ «КАК ЕСТЬ», БЕЗ КАКИХ-ЛИБО ГАРАНТИЙ, ЯВНО ВЫРАЖЕННЫХ ИЛИ ПОДРАЗУМЕВАЕМЫХ, ВКЛЮЧАЯ, НО НЕ ОГРАНИЧИВАЯСЬ ГАРАНТИЯМИ ТОВАРНОЙ ПРИГОДНОСТИ, СООТВЕТСТВИЯ ПО ЕГО КОНКРЕТНОМУ НАЗНАЧЕНИЮ И ОТСУТСТВИЯ НАРУШЕНИЙ ПРАВ. НИ В КАКОМ СЛУЧАЕ АВТОРЫ ИЛИ ПРАВООБЛАДАТЕЛИ НЕ НЕСУТ ОТВЕТСТВЕННОСТИ ПО ИСКАМ О ВОЗМЕЩЕНИИ УЩЕРБА, УБЫТКОВ ИЛИ ДРУГИХ ТРЕБОВАНИЙ ПО ДЕЙСТВУЮЩИМ КОНТРАКТАМ, ДЕЛИКТАМ ИЛИ ИНОМУ, ВОЗНИКШИМ ИЗ, ИМЕЮЩИМ ПРИЧИНОЙ ИЛИ СВЯЗАННЫМ С ПРОГРАММНЫМ ОБЕСПЕЧЕНИЕМ ИЛИ ИСПОЛЬЗОВАНИЕМ ПРОГРАММНОГО ОБЕСПЕЧЕНИЯ ИЛИ ИНЫМИ ДЕЙСТВИЯМИ С ПРОГРАММНЫМ ОБЕСПЕЧЕНИЕМ. diff --git a/doc/presentations/meetup3/clickhouse_for_analysts.pdf b/doc/presentations/meetup3/clickhouse_for_analysts.pdf deleted file mode 100644 index 6008d3dfc27..00000000000 Binary files a/doc/presentations/meetup3/clickhouse_for_analysts.pdf and /dev/null differ diff --git a/doc/presentations/meetup3/index.html b/doc/presentations/meetup3/index.html deleted file mode 100644 index 7b0bb2c38ec..00000000000 --- a/doc/presentations/meetup3/index.html +++ /dev/null @@ -1,174 +0,0 @@ - - - - ClickHouse meetup в Санкт-Петербурге - - - - - - -
-

ClickHouse meetup в Санкт-Петербурге

-
- -
-

ClickHouse meetup
в Санкт-Петербурге

-
- -
-

ClickHouse: настоящее и будущее

-
- -
-

Команда

-

Теперь 5 разработчиков.

-

- - - - - -

-
- -
- -

В предыдущих сериях

- -

— HTTP и executable источники;

-

— оптимизация мержей, вертикальный мерж;

-

— трассировка распределённых запросов;

-

— clickhouse-local;

-

— операторы BETWEEN, ||;

-

— функции UUID - text;

- -
-
- -

Новое в языке запросов

- -

— KILL QUERY;

-

— LIMIT BY;

-

— SELECT INTO OUTFILE;

- -
-
- -

Интерфейсы

- -

— возможность получить прогресс выполнения в HTTP заголовках;

-

— возможность пропуска ошибок в текстовых форматах;

-

— правильные коды ответа в HTTP интерфейсе;

- -
-
- -

Сборка

- -

— «правильная» сборка и пакеты;

-

— Таблица system.build_options;

- -
-
- -

Словари

- -

— производительность кэшируемых внешних словарей;

-

— инструментирование кэшируемых внешних словарей;

-

— HTTPS словари;

- -
-
- -

Инструментирование

- -

— информация об использовании памяти под индекс;

-

— информация о размере столбцов в несжатом виде;

-

— метрики по потреблению оперативки кэшами;

-

— метрики про мержи;

- -
-
- -

Оптимизации

- -

— оптимизация DISTINCT;

-

— производительность gzip в HTTP интерфейсе;

-

— оптимизация mark cache;

- -
-
- -

Функции

- -

— правильная логика сравнений, least, greatest;

-

— groupUniqArray для всех типов данных;

-

— decodeURLComponent;

- -
-
- -

Что-то ещё

- -

— защита от случайного DROP TABLE;

-

— use_client_time_zone; timezone в конфиге;

-

— fsync_metadata;

- -
-
- -

Сообщество

- -

— интеграция с Grafana, Redash, Apache Zeppelin, Superset;

-

— правильные пакеты для CentOS, RHEL, GosLinux;

-

— драйвер native протокола для Go и C++;

-

— возможность передавать заголовки X-ClickHouse-*;

-

— бенчмарки NYC Taxi, Percona (Spark);

-

— бенчмарк Greenplum;

-

— англоязычный Telegram чат;

-

— встречи и доклады (Брюссель, Париж);

- -
- -
-

ClickHouse vs. Spark

-

https://www.percona.com/blog/2017/02/13/clickhouse-new-opensource-columnar-database/

- -
- -
-

ClickHouse vs. Greenplum

-

-
- -
- -

TODO (март-апрель 2017)

- -

— распределённые DDL запросы;

-

— конфиги в ZooKeeper;

-

— полная поддержка NULL;

- -
-
- -

TODO (весна-лето 2017)

- -

— работоспособность ODBC драйвера под Windows;

-

— переделать анализ запроса: правильная поддержка JOIN;

- -
- -
- -

Дополнительно

- -

job-clickhouse@yandex-team.ru

- -
- -
- - - diff --git a/doc/presentations/meetup3/pictures/greenplum.png b/doc/presentations/meetup3/pictures/greenplum.png deleted file mode 100644 index e919a45dadc..00000000000 Binary files a/doc/presentations/meetup3/pictures/greenplum.png and /dev/null differ diff --git a/doc/presentations/meetup3/pictures/milovidov.jpg b/doc/presentations/meetup3/pictures/milovidov.jpg deleted file mode 100644 index eb0317f8608..00000000000 Binary files a/doc/presentations/meetup3/pictures/milovidov.jpg and /dev/null differ diff --git a/doc/presentations/meetup3/pictures/proller.jpg b/doc/presentations/meetup3/pictures/proller.jpg deleted file mode 100644 index 02b1daa1d6c..00000000000 Binary files a/doc/presentations/meetup3/pictures/proller.jpg and /dev/null differ diff --git a/doc/presentations/meetup3/pictures/spark.png b/doc/presentations/meetup3/pictures/spark.png deleted file mode 100644 index 3ae61297631..00000000000 Binary files a/doc/presentations/meetup3/pictures/spark.png and /dev/null differ diff --git a/doc/presentations/meetup3/pictures/stanly.jpg b/doc/presentations/meetup3/pictures/stanly.jpg deleted file mode 100644 index fb406f156ba..00000000000 Binary files a/doc/presentations/meetup3/pictures/stanly.jpg and /dev/null differ diff --git a/doc/presentations/meetup3/pictures/vludv.jpg b/doc/presentations/meetup3/pictures/vludv.jpg deleted file mode 100644 index ebe6db87126..00000000000 Binary files a/doc/presentations/meetup3/pictures/vludv.jpg and /dev/null differ diff --git a/doc/presentations/meetup3/pictures/ztlpn.jpg b/doc/presentations/meetup3/pictures/ztlpn.jpg deleted file mode 100644 index a2931b91585..00000000000 Binary files a/doc/presentations/meetup3/pictures/ztlpn.jpg and /dev/null differ diff --git a/doc/presentations/meetup3/shower/shower.min.js b/doc/presentations/meetup3/shower/shower.min.js deleted file mode 100644 index 449843ac45d..00000000000 --- a/doc/presentations/meetup3/shower/shower.min.js +++ /dev/null @@ -1,8 +0,0 @@ -/** - * Core for 
Shower HTML presentation engine - * shower-core v2.0.7, https://github.com/shower/core - * @copyright 2010–2016 Vadim Makeev, http://pepelsbey.net/ - * @license MIT - */ -!function(a){var b,c={NOT_RESOLVED:"NOT_RESOLVED",IN_RESOLVING:"IN_RESOLVING",RESOLVED:"RESOLVED"},d=function(){var l={trackCircularDependencies:!0,allowMultipleDeclarations:!0},m={},n=!1,o=[],p=function(a,d,e){e||(e=d,d=[]);var f=m[a];f||(f=m[a]={name:a,decl:b}),f.decl={name:a,prev:f.decl,fn:e,state:c.NOT_RESOLVED,deps:d,dependents:[],exports:b}},q=function(b,c,d){"string"==typeof b&&(b=[b]),n||(n=!0,k(v)),o.push({deps:b,cb:function(b,f){f?(d||e)(f):c.apply(a,b)}})},r=function(a){var b=m[a];return b?c[b.decl.state]:"NOT_DEFINED"},s=function(a){return!!m[a]},t=function(a){for(var b in a)a.hasOwnProperty(b)&&(l[b]=a[b])},u=function(){var a,b={};for(var c in m)m.hasOwnProperty(c)&&(a=m[c],(b[a.decl.state]||(b[a.decl.state]=[])).push(c));return b},v=function(){n=!1,w()},w=function(){var a,b=o,c=0;for(o=[];a=b[c++];)x(null,a.deps,[],a.cb)},x=function(a,b,c,d){var e=b.length;e||d([]);for(var g,h,i=[],j=function(a,b){if(b)return void d(null,b);if(!--e){for(var c,f=[],g=0;c=i[g++];)f.push(c.exports);d(f)}},k=0,l=e;k ")+'"')},h=function(a){return Error('Declaration of module "'+a.name+'" has already been provided')},i=function(a){return Error('Multiple declarations of module "'+a.name+'" have been detected')},j=function(a,b){for(var c,d=0;c=b[d++];)if(a===c)return!0;return!1},k=function(){var b=[],c=function(a){return 1===b.push(a)},d=function(){var a=b,c=0,d=b.length;for(b=[];c=0&&!b.defaultPrevented();){var d=a[c];d&&(d.context?d.callback.call(d.context,b):d.callback(b)),c--}}}),a(e)}),shower.modules.define("Plugins",["Emitter","util.extend"],function(a,b,c){function d(a){this.events=new 
b({context:this}),this._showerGlobal=a,this._showerInstances=a.getInited(),this._plugins={},this._instances=[],a.events.on("init",this._onShowerInit,this)}c(d.prototype,{destroy:function(){this._showerGlobal.events.off("init",this._onShowerInit,this),this._plugins=null},add:function(a,b){if(this._plugins.hasOwnProperty(a))throw new Error("Plugin "+a+" already exist.");return this._requireAndAdd({name:a,options:b}),this},remove:function(a){if(!this._plugins.hasOwnProperty(a))throw new Error("Plugin "+a+" not found.");return delete this._plugins[a],this.events.emit("remove",{name:a}),this},get:function(a,b){var c,d=this._plugins[a];if(d&&b)for(var e=0,f=this._instances.length;e=0;e--)if(d[e].getId()===a){b=d[e],c=e;break}return{slide:b,index:c}},_onSlideActivate:function(a){window.location.hash=a.get("slide").getId(),this._setTitle()},_onContainerSlideModeChange:function(){this._setTitle(),this.save()},_isSlideMode:function(){return this._shower.container.isSlideMode()},_onPopstate:function(){var a,b=this._shower,c=window.location.hash.substr(1),d=b.player.getCurrentSlide(),e=b.player.getCurrentSlideIndex();this._isSlideMode()&&e===-1?b.player.go(0):e===-1&&""!==window.location.hash&&b.player.go(0),d&&c!==d.getId()&&(a=this._getSlideById(c),b.player.go(a.index))},_setTitle:function(){var a=document.title,b=this._isSlideMode(),c=this._shower.player.getCurrentSlide();if(b&&c){var d=c.getTitle();document.title=d?d+" — "+this._documentTitle:this._documentTitle}else this._documentTitle!==a&&(document.title=this._documentTitle)}}),a(e)}),shower.modules.define("shower.Player",["Emitter","util.bound","util.extend"],function(a,b,c,d){function e(a){this.events=new 
b({context:this,parent:a.events}),this._shower=a,this._showerListeners=null,this._playerListeners=null,this._currentSlideNumber=-1,this._currentSlide=null,this.init()}d(e.prototype,{init:function(){this._showerListeners=this._shower.events.group().on("slideadd",this._onSlideAdd,this).on("slideremove",this._onSlideRemove,this).on("slidemodeenter",this._onSlideModeEnter,this),this._playerListeners=this.events.group().on("prev",this._onPrev,this).on("next",this._onNext,this),document.addEventListener("keydown",c(this,"_onKeyDown"))},destroy:function(){this._showerListeners.offAll(),this._playerListeners.offAll(),document.removeEventListener("keydown",c(this,"_onKeyDown")),this._currentSlide=null,this._currentSlideNumber=null,this._shower=null},next:function(){return this.events.emit("next"),this},prev:function(){return this.events.emit("prev"),this},first:function(){return this.go(0),this},last:function(){return this.go(this._shower.getSlidesCount()-1),this},go:function(a){"number"!=typeof a&&(a=this._shower.getSlideIndex(a));var b=this._shower.getSlidesCount(),c=this._currentSlide;return a!=this._currentSlideNumber&&a=0&&(c&&c.isActive()&&c.deactivate(),c=this._shower.get(a),this._currentSlide=c,this._currentSlideNumber=a,c.isActive()||c.activate(),this.events.emit("activate",{index:a,slide:c})),this},getCurrentSlide:function(){return this._currentSlide},getCurrentSlideIndex:function(){return this._currentSlideNumber},_onPrev:function(){this._changeSlide(this._currentSlideNumber-1)},_onNext:function(){this._changeSlide(this._currentSlideNumber+1)},_changeSlide:function(a){this.go(a)},_onSlideAdd:function(a){var b=a.get("slide");b.events.on("activate",this._onSlideActivate,this)},_onSlideRemove:function(a){var b=a.get("slide");b.events.off("activate",this._onSlideActivate,this)},_onSlideActivate:function(a){var 
b=a.get("slide"),c=this._shower.getSlideIndex(b);this.go(c)},_onKeyDown:function(a){if(this._shower.isHotkeysEnabled()&&!/^(?:button|input|select|textarea)$/i.test(a.target.tagName))switch(this.events.emit("keydown",{event:a}),a.which){case 33:case 38:case 37:case 72:case 75:if(a.altKey||a.ctrlKey||a.metaKey)return;a.preventDefault(),this.prev();break;case 34:case 40:case 39:case 76:case 74:if(a.altKey||a.ctrlKey||a.metaKey)return;a.preventDefault(),this.next();break;case 36:a.preventDefault(),this.first();break;case 35:a.preventDefault(),this.last();break;case 32:this._shower.container.isSlideMode()&&(a.shiftKey?this.prev():this.next())}},_onSlideModeEnter:function(){this._currentSlide||this.go(0)}}),a(e)}),shower.modules.define("shower.slidesParser",["Slide"],function(a,b){function c(a,c){var d=a.querySelectorAll(c);return d=Array.prototype.slice.call(d),d.map(function(a,c){var d=new b(a);return a.id||(a.id=c+1),d})}a(c)}),shower.modules.define("Slide",["shower.defaultOptions","Emitter","Options","slide.Layout","slide.layoutFactory","util.Store","util.extend"],function(a,b,c,d,e,f,g,h){function i(a,b,e){this.events=new c,this.options=new d(b),this.layout=null,this.state=new g({visited:0,index:null},e),this._content=a,this._isVisited=this.state.get("visited")>0,this._isActive=!1,this.init()}h(i.prototype,{init:function(){this.layout="string"==typeof this._content?new f.createLayout({content:this._content}):new e(this._content,this.options),this.layout.setParent(this),this._setupListeners()},destroy:function(){this._clearListeners(),this._isActive=null,this.options=null,this.layout.destroy()},activate:function(){this._isActive=!0;var a=this.state.get("visited");return this.state.set("visited",++a),this.events.emit("activate",{slide:this}),this},deactivate:function(){return this._isActive=!1,this.events.emit("deactivate",{slide:this}),this},isActive:function(){return this._isActive},isVisited:function(){return this.state.get("visited")>0},getTitle:function(){return 
this.layout.getTitle()},setTitle:function(a){return this.layout.setTitle(a),this},getId:function(){return this.layout.getElement().id},getContent:function(){return this.layout.getContent()},_setupListeners:function(){this.layoutListeners=this.layout.events.group().on("click",this._onSlideClick,this)},_clearListeners:function(){this.layoutListeners.offAll()},_onSlideClick:function(){this.activate(),this.events.emit("click",{slide:this})}}),a(i)}),shower.modules.define("slide.Layout",["Options","shower.defaultOptions","Emitter","util.bound","util.extend"],function(a,b,c,d,e,f){function g(a,e){this.options=new b({title_element_selector:c.slide_title_element_selector,active_classname:c.slide_active_classname,visited_classname:c.slide_visited_classname},e),this.events=new d,this._element=a,this._parent=null,this._parentElement=null,this.init()}f(g.prototype,{init:function(){var a=this._element.parentNode;a?this._parentElement=a:this.setParentElement(a)},destroy:function(){this.setParent(null)},setParent:function(a){this._parent!=a&&(this._clearListeners(),this._parent=a,this._parent&&this._setupListeners(),this.events.emit("parentchange",{parent:a}))},getParent:function(){return this._parent},setParentElement:function(a){a!=this._parentElement&&(this._parentElement=a,a.appendChild(this._element),this.events.emit("parentelementchange",{parentElement:a}))},getParentElement:function(){return this._parentElement},getElement:function(){return this._element},setTitle:function(a){var b=this.options.get("title_element_selector"),c=this._element.querySelector(b);c?c.innerHTML=a:(c=document.createElement(b),c.innerHTML=a,this._element.insertBefore(c,this._element.firstChild))},getTitle:function(){var a=this.options.get("title_element_selector"),b=this._element.querySelector(a);return b?b.textContent:null},getData:function(a){var b=this._element;return b.dataset?b.dataset[a]:b.getAttribute("data-"+a)},getContent:function(){return 
this._element.innerHTML},_setupListeners:function(){this._slideListeners=this._parent.events.group().on("activate",this._onSlideActivate,this).on("deactivate",this._onSlideDeactivate,this),this._element.addEventListener("click",e(this,"_onSlideClick"),!1)},_clearListeners:function(){this._slideListeners&&this._slideListeners.offAll(),this._element.removeEventListener("click",e(this,"_onSlideClick"))},_onSlideActivate:function(){this._element.classList.add(this.options.get("active_classname"))},_onSlideDeactivate:function(){var a=this._element.classList;a.remove(this.options.get("active_classname")),a.add(this.options.get("visited_classname"))},_onSlideClick:function(){this.events.emit("click")}}),a(g)}),shower.modules.define("slide.layoutFactory",["slide.Layout","util.extend"],function(a,b,c){var d={};c(d,{createLayout:function(a){a=a||{};var e=d._createElement(c({content:"",contentType:"slide"},a));return new b(e)},_createElement:function(a){var b=document.createElement("section");return b.innerHTML=a.content,b.classList.add(a.contentType),b}}),a(d)}),shower.modules.define("util.bound",function(a){function b(a,b){return a["__bound_"+b]||(a["__bound_"+b]=a[b].bind(a))}a(b)}),shower.modules.define("util.extend",function(a){function b(a){if(!a)throw new Error("util.extend: Target not found");return"undefined"==typeof Object.assign?c.apply(null,arguments):Object.assign.apply(null,arguments)}function c(a){for(var b=1,c=arguments.length;b0&&(a.preventDefault(),this.prev())},_go:function(){for(var a=0,b=this._elements.length;awindow.innerWidth/2?c.player.next():c.player.prev()),d||f.activate())},_onTouchMove:function(a){this._shower.container.isSlideMode()&&a.preventDefault()},_getSlideByElement:function(a){for(var b=this._shower.getSlides(),c=null,d=0,e=b.length;d` of your presentation. - -## PDF - -Ribbon could be exported to PDF by printing it from the list mode in Chrome or Opera browsers. 
See [printing documentation](https://github.com/shower/shower/blob/master/docs/printing-en.md) for more options. - -## Development - -If you want to adjust theme for your needs: - -1. Fork this repository and clone it to your local machine. -2. Install dependencies: `npm install`. -3. Start a local server with watcher: `npm run dev` or just `gulp` if you have it installed globally. -4. Edit your files and see changes in the opened browser. - -To take part in Ribbon development please read [contributing guidelines](CONTRIBUTING.md) first and [file an issue](https://github.com/shower/shower/issues/new) before sending any pull request. - ---- -Licensed under [MIT License](LICENSE.md). diff --git a/doc/presentations/meetup3/shower/themes/ribbon/index.html b/doc/presentations/meetup3/shower/themes/ribbon/index.html deleted file mode 100644 index 98850917e05..00000000000 --- a/doc/presentations/meetup3/shower/themes/ribbon/index.html +++ /dev/null @@ -1,304 +0,0 @@ - - - - Ribbon theme for Shower - - - - - - -
-

Presentation Title

-

Yours Truly, Famous Inc.

-
-
-

Slide Header

-

Echo Park 8-bit sustainable umami deep v Kickstarter. DIY cliche typewriter brunch, Odd Future sriracha pickled aesthetic. Farm-to-table bespoke fingerstache, kale chips umami brunch letterpress.

-

Whatever authentic disrupt, you probably haven't heard of them direct trade mlkshk Etsy. Gluten-free roof party plaid four loko quinoa.

-

Echo Park 8-bit sustainable umami deep v Kickstarter.

-
-
-

Inline Elements

-

Retro meh brunch aesthetic Cosby sweater Shoreditch. Banksy Tumblr sriracha, flexitarian pug chia master cleanse vinyl wayfarers fanny pack bespoke Helvetica roof party. Messenger bag retro cred Portland next level. Yr stumptown Schlitz Carles deep v small batch. Hella sustainable messenger bag, leggings skateboard literally1 bicycle rights H20 mumblecore banh mi DIY VHS. Semiotics four loko street art asymmetrical. Asymmetrical <paleo> you probably haven’t heard of.

-
-

Retro meh brunch aesthetic Cosby sweater Shoreditch. Banksy Tumblr sriracha, flexitarian pug chia master cleanse vinyl wayfarers fanny pack bespoke Helvetica roof party. Messenger bag retro cred Portland next level. Yr stumptown Schlitz Carles deep v small batch. Hella sustainable messenger bag, leggings skateboard literally1 bicycle rights H20 mumblecore banh mi DIY VHS. Semiotics four loko street art asymmetrical. Asymmetrical <paleo> you probably haven’t heard of.

-
-
-
-

Quotes

-
-

Flannel bicycle rights locavore selfies skateboard. Authentic fanny pack paleo four loko bespoke. Artisan tattooed chia XOXO ennui, lomo disrupt 8-bit art party Tumblr scenester.

-
-
-
-

Post-ironic fashion axe flexitarian, Tonx narwhal messenger bag Tumblr. Portland gentrify deep v kale chips literally.

-
-
Yours Truly
-
-
-
-

Nested Lists

-
    -
  1. Literally viral vegan, ugh drinking vinegar photo booth
  2. -
  3. Wes Anderson chillwave Marfa pour-over Etsy banh mi
  4. -
  5. Ethnic polaroid lo-fi iPhone ennui -
      -
    • Yr wayfarers before they sold out Kickstarter asymmetrical
    • -
    • Irony flexitarian readymade quinoa, kogi bespoke meggings narwhal
    • -
    • Skateboard Etsy twee artisan Echo Park
    • -
    -
  6. -
  7. Tonx kitsch fingerstache readymade, retro single-origin coffee
  8. -
-
-
-

Block Lists

-
    -
  • Retro meh brunch aesthetic Cosby sweater Shoreditch. Banksy Tumblr sriracha, flexitarian pug chia master cleanse vinyl wayfarers fanny pack.
  • -
  • Messenger bag retro cred Portland next level. Yr stumptown Schlitz Carles deep v small batch. Hella sustainable messenger bag.
  • -
  • Leggings skateboard literally1 bicycle rights H20 mumblecore banh mi DIY VHS. Semiotics four loko street art asymmetrical. Asymmetrical paleo you probably haven’t heard of.
  • -
-
-
-

Latin and Cyrillic List Bullets

-
    -
  • Occupy locavore blog, mustache you probably haven't heard of them
  • -
  • Skateboard pork belly aesthetic hoodie selfies brunch
  • -
  • Food truck gluten-free disrupt Portland
  • -
-
    -
  • Helvetica narwhal drinking vinegar chillwave, post-ironic ennui
  • -
  • Cray pug paleo retro, Echo Park narwhal Wes Anderson
  • -
  • Disrupt Williamsburg fixie, shabby chic bicycle rights hashtag kogi
  • -
-
-
-

Two Columns

-

Echo Park 8-bit sustainable umami deep v Kickstarter. DIY cliche typewriter brunch, Odd Future sriracha pickled aesthetic. Farm-to-table bespoke fingerstache, kale chips umami brunch American Apparel letterpress. Whatever authentic disrupt, you probably haven't heard of them direct trade mlkshk Etsy. Gluten-free roof party plaid American Apparel four loko quinoa. Echo Park 8-bit sustainable.

-
-
-

Three Columns

-

Echo Park 8-bit sustainable umami deep v Kickstarter. DIY cliche typewriter brunch, Odd Future sriracha pickled aesthetic. Farm-to-table bespoke fingerstache, kale chips umami brunch American Apparel letterpress. Whatever authentic disrupt, you probably haven't heard of them direct trade mlkshk Etsy. Gluten-free roof party plaid American Apparel four loko quinoa. Echo Park 8-bit sustainable.

-
-
-

Simple Table

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
GentrifyTweeArtisanBanksy
MessengerMixtapeSmall batchBicycle rights
MeggingsFreeganRetro biodieselSlow-carb
VeganOccupyNormcorePut a bird on it
Next levelSelfiesSustainableOrganic
UmamiAsymmetricalKeytarCraft beer
BiodieselHaven’t heardSkateboardFarm-to-table
-
-
-

Striped Table

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
GentrifyTweeArtisanBanksy
MessengerMixtapeSmall batchBicycle rights
MeggingsFreeganRetro biodieselSlow-carb
VeganOccupyNormcorePut a bird on it
Next levelSelfiesSustainableOrganic
UmamiAsymmetricalKeytarCraft beer
BiodieselHaven’t heardSkateboardFarm-to-table
-
-
-

Plain Code Listing

-
<html lang="en">
-<head> <!--Comment-->
-    <title>Shower</title>
-    <meta charset="UTF-8">
-    <link rel="stylesheet" href="screen.css">
-    <script src="script.js"></script>
-</head>
-
-
-

Numbered Code Listing

-
-            <html lang="en">
-            <head> <!--Comment-->
-                <title>Shower</title>
-                <meta charset="UTF-8">
-                <link rel="stylesheet" href="screen.css">
-                <script src="script.js"></script>
-            </head>
-        
-
-
-

Highlighted Code Lines

-
-            <html lang="en">
-            <head> <!--Comment-->
-                <title>Shower</title>
-                <meta charset="UTF-8">
-                <link rel="stylesheet" href="screen.css">
-                <script src="script.js"></script>
-            </head>
-        
-
-
-

Hidden Code Steps

-
-            <html lang="en">
-            <head> <!--Comment-->
-                <title>Shower</title>
-                <meta charset="UTF-8">
-                <link rel="stylesheet" href="screen.css">
-                <script src="script.js"></script>
-            </head>
-        
-
-
-

Highlighted Code Steps

-
-            <html lang="en">
-            <head> <!--Comment-->
-                <title>Shower</title>
-                <meta charset="UTF-8">
-                <link rel="stylesheet" href="screen.css">
-                <script src="script.js"></script>
-            </head>
-        
-
-
-

Grid Slide

-
-
-

White Slide

-
-
-

Black Slide

-
-
-

Shout

-
-
-

Multiline
Shout

-
-
-

Linked Shout

-
-
-

Growing Shout

-
-
-

Shrinking Shout

-
-
-
- -
Copyright © 2016 Yours Truly, Famous Inc.
-
-
-
- -
-
- -
-
- - - - - - - - - -
-
-

Timer

-
-
-

List Navigation

-
    -
  1. Ennui keffiyeh thundercats
  2. - - - - -
-

Before they sold out master

-
-
- - - - - diff --git a/doc/presentations/meetup3/shower/themes/ribbon/pictures/canvas.png b/doc/presentations/meetup3/shower/themes/ribbon/pictures/canvas.png deleted file mode 100644 index 6ddd30154f2..00000000000 Binary files a/doc/presentations/meetup3/shower/themes/ribbon/pictures/canvas.png and /dev/null differ diff --git a/doc/presentations/meetup3/shower/themes/ribbon/pictures/exact.png b/doc/presentations/meetup3/shower/themes/ribbon/pictures/exact.png deleted file mode 100644 index b27251c57cb..00000000000 Binary files a/doc/presentations/meetup3/shower/themes/ribbon/pictures/exact.png and /dev/null differ diff --git a/doc/presentations/meetup3/shower/themes/ribbon/pictures/square.png b/doc/presentations/meetup3/shower/themes/ribbon/pictures/square.png deleted file mode 100644 index 62cb2384a5f..00000000000 Binary files a/doc/presentations/meetup3/shower/themes/ribbon/pictures/square.png and /dev/null differ diff --git a/doc/presentations/meetup3/shower/themes/ribbon/pictures/tall.png b/doc/presentations/meetup3/shower/themes/ribbon/pictures/tall.png deleted file mode 100644 index fbc9f09a2ab..00000000000 Binary files a/doc/presentations/meetup3/shower/themes/ribbon/pictures/tall.png and /dev/null differ diff --git a/doc/presentations/meetup3/shower/themes/ribbon/pictures/wide.png b/doc/presentations/meetup3/shower/themes/ribbon/pictures/wide.png deleted file mode 100644 index 1e83b0ac7ad..00000000000 Binary files a/doc/presentations/meetup3/shower/themes/ribbon/pictures/wide.png and /dev/null differ diff --git a/doc/presentations/meetup3/shower/themes/ribbon/styles/presentation_links.html b/doc/presentations/meetup3/shower/themes/ribbon/styles/presentation_links.html deleted file mode 100644 index 74981d8eac5..00000000000 --- a/doc/presentations/meetup3/shower/themes/ribbon/styles/presentation_links.html +++ /dev/null @@ -1,2 +0,0 @@ -http://www.slideshare.net/AlexeyMilovidov1/clickhouse-69616890/AlexeyMilovidov1/clickhouse-69616890 
-file:///home/milovidov/work/Presentation/shower/index.html#cover diff --git a/doc/presentations/meetup3/shower/themes/ribbon/styles/screen-16x10.css b/doc/presentations/meetup3/shower/themes/ribbon/styles/screen-16x10.css deleted file mode 100644 index d21f190feea..00000000000 --- a/doc/presentations/meetup3/shower/themes/ribbon/styles/screen-16x10.css +++ /dev/null @@ -1,204 +0,0 @@ -/** - * Ribbon theme for Shower HTML presentation engine - * shower-ribbon v2.0.8, https://github.com/shower/ribbon - * @copyright 2010–2016 Vadim Makeev, http://pepelsbey.net/ - * @license MIT - */ -@charset "UTF-8"; - - @font-face { - font-family: 'Yandex Sans Text Web'; - src: url(https://yastatic.net/adv-www/_/yy5JveR58JFkc97waf-xp0i6_jM.eot); - src: url(https://yastatic.net/adv-www/_/yy5JveR58JFkc97waf-xp0i6_jM.eot?#iefix) format('embedded-opentype'), - url(https://yastatic.net/adv-www/_/CYblzLEXzCqQIvrYs7QKQe2omRk.woff2) format('woff2'), - url(https://yastatic.net/adv-www/_/pUcnOdRwl83MvPPzrNomhyletnA.woff) format('woff'), - url(https://yastatic.net/adv-www/_/vNFEmXOcGYKJ4AAidUprHWoXrLU.ttf) format('truetype'), - url(https://yastatic.net/adv-www/_/0w7OcWZM_QLP8x-LQUXFOgXO6dE.svg#YandexSansTextWeb-Bold) format('svg'); - font-weight: 700; - font-style: normal; - font-stretch: normal - } - - @font-face { - font-family: 'Yandex Sans Text Web'; - src: url(https://yastatic.net/adv-www/_/LI6l3L2RqcgxBe2pXmuUha37czQ.eot); - src: url(https://yastatic.net/adv-www/_/LI6l3L2RqcgxBe2pXmuUha37czQ.eot?#iefix) format('embedded-opentype'), - url(https://yastatic.net/adv-www/_/z3MYElcut0R2MF_Iw1RDNrstgYs.woff2) format('woff2'), - url(https://yastatic.net/adv-www/_/1jvKJ_-hCXl3s7gmFl-y_-UHTaI.woff) format('woff'), - url(https://yastatic.net/adv-www/_/9nzjfpCR2QHvK1EzHpDEIoVFGuY.ttf) format('truetype'), - url(https://yastatic.net/adv-www/_/gwyBTpxSwkFCF1looxqs6JokKls.svg#YandexSansTextWeb-Regular) format('svg'); - font-weight: 400; - font-style: normal; - font-stretch: normal - } - - @font-face { 
- font-family: 'Yandex Sans Text Web'; - src: url(https://yastatic.net/adv-www/_/ayAFYoY8swgBLhq_I56tKj2JftU.eot); - src: url(https://yastatic.net/adv-www/_/ayAFYoY8swgBLhq_I56tKj2JftU.eot?#iefix) format('embedded-opentype'), - url(https://yastatic.net/adv-www/_/lGQcYklLVV0hyvz1HFmFsUTj8_0.woff2) format('woff2'), - url(https://yastatic.net/adv-www/_/f0AAJ9GJ4iiwEmhG-7PWMHk6vUY.woff) format('woff'), - url(https://yastatic.net/adv-www/_/4UDe4nlVvgEJ-VmLWNVq3SxCsA.ttf) format('truetype'), - url(https://yastatic.net/adv-www/_/EKLr1STNokPqxLAQa_RyN82pL98.svg#YandexSansTextWeb-Light) format('svg'); - font-weight: 300; - font-style: normal; - font-stretch: normal - } - - @font-face { - font-family: 'Yandex Sans Display Web'; - src: url(https://yastatic.net/adv-www/_/H63jN0veW07XQUIA2317lr9UIm8.eot); - src: url(https://yastatic.net/adv-www/_/H63jN0veW07XQUIA2317lr9UIm8.eot?#iefix) format('embedded-opentype'), - url(https://yastatic.net/adv-www/_/sUYVCPUAQE7ExrvMS7FoISoO83s.woff2) format('woff2'), - url(https://yastatic.net/adv-www/_/v2Sve_obH3rKm6rKrtSQpf-eB7U.woff) format('woff'), - url(https://yastatic.net/adv-www/_/PzD8hWLMunow5i3RfJ6WQJAL7aI.ttf) format('truetype'), - url(https://yastatic.net/adv-www/_/lF_KG5g4tpQNlYIgA0e77fBSZ5s.svg#YandexSansDisplayWeb-Regular) format('svg'); - font-weight: 400; - font-style: normal; - font-stretch: normal - } - - @font-face { - font-family: 'Yandex Sans Display Web'; - src: url(https://yastatic.net/adv-www/_/g8_MyyKVquSZ3xEL6tarK__V9Vw.eot); - src: url(https://yastatic.net/adv-www/_/g8_MyyKVquSZ3xEL6tarK__V9Vw.eot?#iefix) format('embedded-opentype'), - url(https://yastatic.net/adv-www/_/LGiRvlfqQHlWR9YKLhsw5e7KGNA.woff2) format('woff2'), - url(https://yastatic.net/adv-www/_/40vXwNl4eYYMgteIVgLP49dwmfc.woff) format('woff'), - url(https://yastatic.net/adv-www/_/X6zG5x_wO8-AtwJ-vDLJcKC5228.ttf) format('truetype'), - url(https://yastatic.net/adv-www/_/ZKhaR0m08c8CRRL77GtFKoHcLYA.svg#YandexSansDisplayWeb-Light) format('svg'); - 
font-weight: 300; - font-style: normal; - font-stretch: normal - } - -*,::after,::before{box-sizing:border-box} -a,abbr,acronym,address,applet,article,aside,audio,b,big,blockquote,body,canvas,caption,center,cite,code,dd,del,details,dfn,div,dl,dt,em,embed,fieldset,figcaption,figure,footer,form,h1,h2,h3,h4,h5,h6,header,hgroup,html,i,iframe,img,ins,kbd,label,legend,li,mark,menu,nav,object,ol,output,p,pre,q,ruby,s,samp,section,small,span,strike,strong,sub,summary,sup,table,tbody,td,tfoot,th,thead,time,tr,tt,u,ul,var,video{margin:0;padding:0;border:0;font:inherit;vertical-align:baseline} -article,aside,details,figcaption,figure,footer,header,hgroup,main,menu,nav,section{display:block} -.caption p,body{line-height:1} -p {line-height: 1} -ol,ul{list-style:none} -blockquote,q{quotes:none} -blockquote::after,blockquote::before,q::after,q::before{content:none} -table{border-collapse:collapse;border-spacing:0} -a{text-decoration:none} -@page{margin:0;size:1280px 720px} -.shower{color:#000;counter-reset:slide;font:25px/2 Yandex Sans Display Web,sans-serif;-webkit-print-color-adjust:exact;-webkit-text-size-adjust:none;-moz-text-size-adjust:none;-ms-text-size-adjust:none} -@media print{.shower{text-rendering:geometricPrecision} -} -.caption{font-size:25px;display:none;margin-top:-.2em;padding:0 1em .93em;width:100%;color:#3c3d40;text-shadow:0 1px 0 #8d8e90} -@media (min-width:1174px){.caption{font-size:50px} -} -@media (min-width:2348px){.caption{font-size:100px} -} -.caption h1{padding-bottom:.15em;font:1em/2 Yandex Sans Display Web,sans-serif} -.caption p{font-size:.6em} -.caption a{color:#4b86c2;text-shadow:0 -1px 0 #1f3f60} -.slide{position:relative;z-index:1;overflow:hidden;padding:20px 100px 0;width:1024px;height:640px;background:#fff;font-size:25px} - -/*.slide::after{position:absolute;top:0;right:100px;padding-top:15px;width:50px;height:100px;background:url(../images/ribbon.svg) no-repeat;color:#fff;counter-increment:slide;content:counter(slide);text-align:center}*/ - 
-.slide h1{vertical-align:middle; color:#000;font:400 50px/2 Yandex Sans Display Web,sans-serif} -.slide h2{margin-bottom:34px;color:#000;font:400 50px/2 Yandex Sans Display Web,sans-serif} -.slide p{margin-bottom:1em} -.slide p.note{color:#979a9e} -.slide a{background:-webkit-linear-gradient(bottom,currentColor .09em,transparent .09em) repeat-x;background:linear-gradient(to top,currentColor .09em,transparent .09em) repeat-x;color:#4b86c2} -.slide b,.slide strong{font-weight:700} -.slide blockquote,.slide dfn,.slide em,.slide i{font-style:italic} -.slide code,.slide kbd,.slide mark,.slide samp{padding:.1em .3em;border-radius:.2em} -.slide code,.slide kbd,.slide samp{background:rgba(88,90,94,.1);line-height:1;font-family:PT Mono,monospace,monospace} -.slide mark{background:#fafaa2} -.slide sub,.slide sup{position:relative;line-height:0;font-size:75%} -.slide sub{bottom:-.25em} -.slide sup{top:-.5em} -.slide blockquote::before{position:absolute;margin:-.15em 0 0 -.43em;color:#ccc;line-height:1;font-size:8em;content:'\201C'} -.slide blockquote+figcaption{margin:-1em 0 1em;font-style:italic;font-weight:700} -.slide ol,.slide ul{margin-bottom:0em;counter-reset:list} -.slide ol li,.slide ul li{page-break-inside:avoid;text-indent:-2em} -.slide ol li::before,.slide ul li::before{display:inline-block;width:2em;color:#979a9e;text-align:right} -.slide ol ol,.slide ol ul,.slide ul ol,.slide ul ul{margin-bottom:0;margin-left:2em} -.slide ul>li::before{padding-right:.5em;content:'•'} -.slide ul>li:lang(ru)::before{content:'—'} -.slide ol>li::before{padding-right:.4em;counter-increment:list;content:counter(list) "."} -.slide table{margin-left:-100px;margin-bottom:1em;width:calc(100% + 100px + 100px)} -.slide table td:first-child,.slide table th:first-child{padding-left:96px} -.slide table td:last-child,.slide table th:last-child{padding-right:96px} -.slide table th{text-align:left;font-weight:700} -.slide table 
tr:not(:last-of-type)>*{background:-webkit-linear-gradient(bottom,rgba(88,90,94,.5) .055em,transparent .055em) repeat-x;background:linear-gradient(to top,rgba(88,90,94,.5) .055em,transparent .055em) repeat-x} -.slide table.striped tr:nth-child(even){background:rgba(88,90,94,.1)} -.slide table.striped tr>*{background-image:none} -.slide pre{margin-bottom:1em;counter-reset:code;white-space:normal} -.slide pre code{display:block;margin-left:-100px;padding:0 0 0 100px;width:calc(100% + 100px + 100px);border-radius:0;background:0 0;line-height:2;white-space:pre;-moz-tab-size:4;-o-tab-size:4;tab-size:4} -.slide pre code:not(:only-child).mark{background:rgba(88,90,94,.1)} -.slide pre code:not(:only-child)::before{position:absolute;margin-left:-2em;color:#979a9e;counter-increment:code;content:counter(code,decimal-leading-zero) "."} -.slide pre mark{position:relative;z-index:-1;margin:0 -.3em} -.slide pre mark.important{background:#c00;color:#fff} -.slide pre .comment{color:#999} -.slide footer{position:absolute;right:0;bottom:-640px;left:0;display:none;padding:41px 100px 8px;background:#fbfbba;box-shadow:0 1px 0 #fafaa2 inset;-webkit-transition:bottom .3s;transition:bottom .3s} -.slide footer mark{background:rgba(255,255,255,.8)} -.slide:hover>footer{bottom:0} -.slide.grid{background-image:url(../images/grid.png);-ms-interpolation-mode:nearest-neighbor;image-rendering:-webkit-optimize-contrast;image-rendering:-moz-crisp-edges;image-rendering:pixelated} -@media (-webkit-min-device-pixel-ratio:2),(min-resolution:2dppx){.slide.grid{background-image:url(../images/grid@2x.png);background-size:1024px auto} -} -.slide.black{background-color:#000} -.slide.black::after,.slide.white::after{visibility:hidden} -.slide.white{background-color:#fff} -.slide .double,.slide .triple{-webkit-column-gap:75px;-moz-column-gap:75px;column-gap:75px;-webkit-hyphens:auto;-ms-hyphens:auto;hyphens:auto} -.slide .double{-webkit-column-count:2;-moz-column-count:2;column-count:2} -.slide 
.triple{-webkit-column-count:3;-moz-column-count:3;column-count:3} -.slide .shout{position:absolute;top:50%;left:0;width:100%;text-align:center;line-height:1;font-size:150px;-webkit-transform:translateY(-50%);transform:translateY(-50%)} -.slide .shout a{background:-webkit-linear-gradient(bottom,currentColor .11em,transparent .11em) repeat-x;background:linear-gradient(to top,currentColor .11em,transparent .11em) repeat-x} -.slide .cover{z-index:-1;max-width:100%;max-height:100%} -.slide .cover.w,.slide .cover.width{width:100%;max-height:none} -.slide .cover.h,.slide .cover.height{height:100%;max-width:none} -.slide .cover+figcaption{position:absolute;bottom:20px;right:10px;font-size:12px;opacity:.7;-webkit-transform-origin:0 100%;transform-origin:0 100%;-webkit-transform:translateX(100%) rotate(-90deg);transform:translateX(100%) rotate(-90deg)} -.slide .cover+figcaption.white{color:#fff} -.slide .cover+figcaption a{color:currentcolor} -.slide .cover,.slide .place{position:absolute;top:50%;left:50%;-webkit-transform:translate(-50%,-50%);transform:translate(-50%,-50%)} -.slide .place.b.l,.slide .place.b.r,.slide .place.bottom.left,.slide .place.bottom.right,.slide .place.t.l,.slide .place.t.r,.slide .place.top.left,.slide .place.top.right{-webkit-transform:none;transform:none} -.slide .place.b,.slide .place.bottom,.slide .place.t,.slide .place.top{-webkit-transform:translate(-50%,0);transform:translate(-50%,0)} -.slide .place.l,.slide .place.left,.slide .place.r,.slide .place.right{-webkit-transform:translate(0,-50%);transform:translate(0,-50%)} -.slide .place.t,.slide .place.t.r,.slide .place.top,.slide .place.top.left,.slide .place.top.right{top:0} -.slide .place.r,.slide .place.right{right:0;left:auto} -.slide .place.b,.slide .place.b.l,.slide .place.b.r,.slide .place.bottom,.slide .place.bottom.left,.slide .place.bottom.right{top:auto;bottom:0} -.slide .place.l,.slide .place.left{left:0} 
-.progress{left:-20px;bottom:0;z-index:1;display:none;width:0;height:0;box-sizing:content-box;border:10px solid #4b86c2;border-right-color:transparent;-webkit-transition:width .2s linear;transition:width .2s linear;clip:rect(10px,1044px,20px,20px)} -.progress[style*='100%']{padding-left:10px} -.badge,.badge a,.progress{position:absolute} -.badge{font-size:10px;top:0;z-index:1;overflow:hidden;display:none;width:9em;height:9em;right:0;visibility:hidden} -@media (min-width:1174px){.badge{font-size:20px} -} -@media (min-width:2348px){.badge{font-size:40px} -} -.badge a{right:-50%;bottom:50%;left:-50%;visibility:visible;background:#4b86c2;color:#fff;text-align:center;line-height:2;-webkit-transform-origin:50% 100%;transform-origin:50% 100%;-webkit-transform:rotate(45deg);transform:rotate(45deg)} -.region{display:none} -@media screen{.shower.list{padding-top:25px;width:100%;display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-flex-wrap:wrap;-ms-flex-wrap:wrap;flex-wrap:wrap;background:#585a5e;position:absolute;clip:rect(0,auto,auto,0)} -} -@media screen and (min-width:1174px){.shower.list{padding-top:50px} -} -@media screen and (min-width:2348px){.shower.list{padding-top:100px} -} -@media screen{.shower.list .caption{display:block} -.shower.list .slide{-webkit-transform-origin:0 0;transform-origin:0 0;margin:0 -768px -455px 25px;-webkit-transform:scale(.25);transform:scale(.25);border-radius:2px;box-shadow:0 20px 50px rgba(0,0,0,.3)} -} -@media screen and (min-width:1174px){.shower.list .slide{margin:0 -512px -270px 50px;-webkit-transform:scale(.5);transform:scale(.5)} -} -@media screen and (min-width:2348px){.shower.list .slide{margin:0 0 100px 100px;-webkit-transform:scale(1);transform:scale(1)} -} -@media screen{.shower.list .slide:hover{box-shadow:0 0 0 20px rgba(0,0,0,.1),0 20px 50px rgba(0,0,0,.3)} -.shower.list .slide:target{box-shadow:0 0 0 1px #376da3,0 0 0 20px #4b86c2,0 20px 50px rgba(0,0,0,.3)} -.shower.list .slide 
*{pointer-events:none} -.shower.list .badge,.shower.list .slide footer{display:block} -.shower.full{position:absolute;top:50%;left:50%;overflow:hidden;margin:-320px 0 0 -512px;width:1024px;height:640px;background:#000} -.shower.full .slide{position:absolute;top:0;left:0;margin-left:-150%;visibility:hidden} -.shower.full .slide:target{margin:0;visibility:visible} -.shower.full .slide pre code:not(:only-child).mark.next{visibility:visible;background:0 0} -.shower.full .slide pre code:not(:only-child).mark.next.active{background:rgba(88,90,94,.1)} -.shower.full .slide .next{visibility:hidden} -.shower.full .slide .next.active{visibility:visible} -.shower.full .slide .shout.grow,.shower.full .slide .shout.shrink{opacity:0;-webkit-transition:.4s ease-out;transition:.4s ease-out;-webkit-transition-property:opacity,-webkit-transform;transition-property:opacity,transform;transition-property:opacity,transform,-webkit-transform} -.shower.full .slide .shout.grow{-webkit-transform:scale(.1) translateY(-50%);transform:scale(.1) translateY(-50%)} -.shower.full .slide .shout.shrink{-webkit-transform:scale(10) translateY(-50%);transform:scale(10) translateY(-50%)} -.shower.full .slide:target .shout.grow,.shower.full .slide:target .shout.shrink{opacity:1;-webkit-transform:scale(1) translateY(-50%);transform:scale(1) translateY(-50%)} -.shower.full .progress{display:block;-webkit-transform:translateZ(0);transform:translateZ(0)} -.shower.full .region{position:absolute;clip:rect(0 0 0 0);overflow:hidden;margin:-1px;padding:0;width:1px;height:1px;border:none;display:block} -} diff --git a/doc/presentations/meetup3/shower/themes/ribbon/styles/screen-4x3.css b/doc/presentations/meetup3/shower/themes/ribbon/styles/screen-4x3.css deleted file mode 100644 index 6648b972c30..00000000000 --- a/doc/presentations/meetup3/shower/themes/ribbon/styles/screen-4x3.css +++ /dev/null @@ -1,7 +0,0 @@ -/** - * Ribbon theme for Shower HTML presentation engine - * shower-ribbon v2.0.8, 
https://github.com/shower/ribbon - * @copyright 2010–2016 Vadim Makeev, http://pepelsbey.net/ - * @license MIT - */ -@charset "UTF-8";@font-face{font-family:PT Sans;src:url(../fonts/pt-sans-regular.woff) format("woff")}@font-face{font-weight:700;font-family:PT Sans;src:url(../fonts/pt-sans-bold.woff) format("woff")}@font-face{font-style:italic;font-family:PT Sans;src:url(../fonts/pt-sans-italic.woff) format("woff")}@font-face{font-style:italic;font-weight:700;font-family:PT Sans;src:url(../fonts/pt-sans-bold-italic.woff) format("woff")}@font-face{font-family:PT Sans Narrow;font-weight:700;src:url(../fonts/pt-sans-narrow-bold.woff) format("woff")}@font-face{font-family:PT Mono;src:url(../fonts/pt-mono-regular.woff) format("woff")}*,::after,::before{box-sizing:border-box}a,abbr,acronym,address,applet,article,aside,audio,b,big,blockquote,body,canvas,caption,center,cite,code,dd,del,details,dfn,div,dl,dt,em,embed,fieldset,figcaption,figure,footer,form,h1,h2,h3,h4,h5,h6,header,hgroup,html,i,iframe,img,ins,kbd,label,legend,li,mark,menu,nav,object,ol,output,p,pre,q,ruby,s,samp,section,small,span,strike,strong,sub,summary,sup,table,tbody,td,tfoot,th,thead,time,tr,tt,u,ul,var,video{margin:0;padding:0;border:0;font:inherit;vertical-align:baseline}article,aside,details,figcaption,figure,footer,header,hgroup,main,menu,nav,section{display:block}.caption p,body{line-height:1}ol,ul{list-style:none}blockquote,q{quotes:none}blockquote::after,blockquote::before,q::after,q::before{content:none}table{border-collapse:collapse;border-spacing:0}a{text-decoration:none}@page{margin:0;size:1024px 768px}.shower{color:#000;counter-reset:slide;font:25px/2 PT Sans,sans-serif;-webkit-print-color-adjust:exact;-webkit-text-size-adjust:none;-moz-text-size-adjust:none;-ms-text-size-adjust:none}@media print{.shower{text-rendering:geometricPrecision}}.caption{font-size:25px;display:none;margin-top:-.2em;padding:0 1em .93em;width:100%;color:#3c3d40;text-shadow:0 1px 0 #8d8e90}@media 
(min-width:1174px){.caption{font-size:50px}}@media (min-width:2348px){.caption{font-size:100px}}.caption h1{padding-bottom:.15em;font:700 1em/1 PT Sans Narrow,sans-serif}.caption p{font-size:.6em}.caption a{color:#4b86c2;text-shadow:0 -1px 0 #1f3f60}.slide{position:relative;z-index:1;overflow:hidden;padding:106px 100px 0;width:1024px;height:768px;background:#fff;font-size:25px}.slide::after{position:absolute;top:0;right:100px;padding-top:15px;width:50px;height:100px;background:url(../images/ribbon.svg) no-repeat;color:#fff;counter-increment:slide;content:counter(slide);text-align:center}.slide h2{margin-bottom:34px;color:#585a5e;font:700 50px/1 PT Sans Narrow,sans-serif}.slide p{margin-bottom:1em}.slide p.note{color:#979a9e}.slide a{background:-webkit-linear-gradient(bottom,currentColor .09em,transparent .09em) repeat-x;background:linear-gradient(to top,currentColor .09em,transparent .09em) repeat-x;color:#4b86c2}.slide b,.slide strong{font-weight:700}.slide blockquote,.slide dfn,.slide em,.slide i{font-style:italic}.slide code,.slide kbd,.slide mark,.slide samp{padding:.1em .3em;border-radius:.2em}.slide code,.slide kbd,.slide samp{background:rgba(88,90,94,.1);line-height:1;font-family:PT Mono,monospace,monospace}.slide mark{background:#fafaa2}.slide sub,.slide sup{position:relative;line-height:0;font-size:75%}.slide sub{bottom:-.25em}.slide sup{top:-.5em}.slide blockquote::before{position:absolute;margin:-.15em 0 0 -.43em;color:#ccc;line-height:1;font-size:8em;content:'\201C'}.slide blockquote+figcaption{margin:-1em 0 1em;font-style:italic;font-weight:700}.slide ol,.slide ul{margin-bottom:1em;counter-reset:list}.slide ol li,.slide ul li{page-break-inside:avoid;text-indent:-2em}.slide ol li::before,.slide ul li::before{display:inline-block;width:2em;color:#979a9e;text-align:right}.slide ol ol,.slide ol ul,.slide ul ol,.slide ul ul{margin-bottom:0;margin-left:2em}.slide ul>li::before{padding-right:.5em;content:'•'}.slide ul>li:lang(ru)::before{content:'—'}.slide 
ol>li::before{padding-right:.4em;counter-increment:list;content:counter(list) "."}.slide table{margin-left:-100px;margin-bottom:1em;width:calc(100% + 100px + 100px)}.slide table td:first-child,.slide table th:first-child{padding-left:96px}.slide table td:last-child,.slide table th:last-child{padding-right:96px}.slide table th{text-align:left;font-weight:700}.slide table tr:not(:last-of-type)>*{background:-webkit-linear-gradient(bottom,rgba(88,90,94,.5) .055em,transparent .055em) repeat-x;background:linear-gradient(to top,rgba(88,90,94,.5) .055em,transparent .055em) repeat-x}.slide table.striped tr:nth-child(even){background:rgba(88,90,94,.1)}.slide table.striped tr>*{background-image:none}.slide pre{margin-bottom:1em;counter-reset:code;white-space:normal}.slide pre code{display:block;margin-left:-100px;padding:0 0 0 100px;width:calc(100% + 100px + 100px);border-radius:0;background:0 0;line-height:2;white-space:pre;-moz-tab-size:4;-o-tab-size:4;tab-size:4}.slide pre code:not(:only-child).mark{background:rgba(88,90,94,.1)}.slide pre code:not(:only-child)::before{position:absolute;margin-left:-2em;color:#979a9e;counter-increment:code;content:counter(code,decimal-leading-zero) "."}.slide pre mark{position:relative;z-index:-1;margin:0 -.3em}.slide pre mark.important{background:#c00;color:#fff}.slide pre .comment{color:#999}.slide footer{position:absolute;right:0;bottom:-768px;left:0;display:none;padding:41px 100px 8px;background:#fbfbba;box-shadow:0 1px 0 #fafaa2 inset;-webkit-transition:bottom .3s;transition:bottom .3s}.slide footer mark{background:rgba(255,255,255,.8)}.slide:hover>footer{bottom:0}.slide.grid{background-image:url(../images/grid.png);-ms-interpolation-mode:nearest-neighbor;image-rendering:-webkit-optimize-contrast;image-rendering:-moz-crisp-edges;image-rendering:pixelated}@media (-webkit-min-device-pixel-ratio:2),(min-resolution:2dppx){.slide.grid{background-image:url(../images/grid@2x.png);background-size:1024px 
auto}}.slide.black{background-color:#000}.slide.black::after,.slide.white::after{visibility:hidden}.slide.white{background-color:#fff}.slide .double,.slide .triple{-webkit-column-gap:75px;-moz-column-gap:75px;column-gap:75px;-webkit-hyphens:auto;-ms-hyphens:auto;hyphens:auto}.slide .double{-webkit-column-count:2;-moz-column-count:2;column-count:2}.slide .triple{-webkit-column-count:3;-moz-column-count:3;column-count:3}.slide .shout{position:absolute;top:50%;left:0;width:100%;text-align:center;line-height:1;font-size:150px;-webkit-transform:translateY(-50%);transform:translateY(-50%)}.slide .shout a{background:-webkit-linear-gradient(bottom,currentColor .11em,transparent .11em) repeat-x;background:linear-gradient(to top,currentColor .11em,transparent .11em) repeat-x}.slide .cover{z-index:-1;max-width:100%;max-height:100%}.slide .cover.w,.slide .cover.width{width:100%;max-height:none}.slide .cover.h,.slide .cover.height{height:100%;max-width:none}.slide .cover+figcaption{position:absolute;bottom:20px;right:10px;font-size:12px;opacity:.7;-webkit-transform-origin:0 100%;transform-origin:0 100%;-webkit-transform:translateX(100%) rotate(-90deg);transform:translateX(100%) rotate(-90deg)}.slide .cover+figcaption.white{color:#fff}.slide .cover+figcaption a{color:currentcolor}.slide .cover,.slide .place{position:absolute;top:50%;left:50%;-webkit-transform:translate(-50%,-50%);transform:translate(-50%,-50%)}.slide .place.b.l,.slide .place.b.r,.slide .place.bottom.left,.slide .place.bottom.right,.slide .place.t.l,.slide .place.t.r,.slide .place.top.left,.slide .place.top.right{-webkit-transform:none;transform:none}.slide .place.b,.slide .place.bottom,.slide .place.t,.slide .place.top{-webkit-transform:translate(-50%,0);transform:translate(-50%,0)}.slide .place.l,.slide .place.left,.slide .place.r,.slide .place.right{-webkit-transform:translate(0,-50%);transform:translate(0,-50%)}.slide .place.t,.slide .place.t.r,.slide .place.top,.slide .place.top.left,.slide 
.place.top.right{top:0}.slide .place.r,.slide .place.right{right:0;left:auto}.slide .place.b,.slide .place.b.l,.slide .place.b.r,.slide .place.bottom,.slide .place.bottom.left,.slide .place.bottom.right{top:auto;bottom:0}.slide .place.l,.slide .place.left{left:0}.progress{left:-20px;bottom:0;z-index:1;display:none;width:0;height:0;box-sizing:content-box;border:10px solid #4b86c2;border-right-color:transparent;-webkit-transition:width .2s linear;transition:width .2s linear;clip:rect(10px,1044px,20px,20px)}.progress[style*='100%']{padding-left:10px}.badge,.badge a,.progress{position:absolute}.badge{font-size:10px;top:0;z-index:1;overflow:hidden;display:none;width:9em;height:9em;right:0;visibility:hidden}@media (min-width:1174px){.badge{font-size:20px}}@media (min-width:2348px){.badge{font-size:40px}}.badge a{right:-50%;bottom:50%;left:-50%;visibility:visible;background:#4b86c2;color:#fff;text-align:center;line-height:2;-webkit-transform-origin:50% 100%;transform-origin:50% 100%;-webkit-transform:rotate(45deg);transform:rotate(45deg)}.region{display:none}@media screen{.shower.list{padding-top:25px;width:100%;display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-flex-wrap:wrap;-ms-flex-wrap:wrap;flex-wrap:wrap;background:#585a5e;position:absolute;clip:rect(0,auto,auto,0)}}@media screen and (min-width:1174px){.shower.list{padding-top:50px}}@media screen and (min-width:2348px){.shower.list{padding-top:100px}}@media screen{.shower.list .caption{display:block}.shower.list .slide{-webkit-transform-origin:0 0;transform-origin:0 0;margin:0 -768px -551px 25px;-webkit-transform:scale(.25);transform:scale(.25);border-radius:2px;box-shadow:0 20px 50px rgba(0,0,0,.3)}}@media screen and (min-width:1174px){.shower.list .slide{margin:0 -512px -334px 50px;-webkit-transform:scale(.5);transform:scale(.5)}}@media screen and (min-width:2348px){.shower.list .slide{margin:0 0 100px 100px;-webkit-transform:scale(1);transform:scale(1)}}@media screen{.shower.list 
.slide:hover{box-shadow:0 0 0 20px rgba(0,0,0,.1),0 20px 50px rgba(0,0,0,.3)}.shower.list .slide:target{box-shadow:0 0 0 1px #376da3,0 0 0 20px #4b86c2,0 20px 50px rgba(0,0,0,.3)}.shower.list .slide *{pointer-events:none}.shower.list .badge,.shower.list .slide footer{display:block}.shower.full{position:absolute;top:50%;left:50%;overflow:hidden;margin:-384px 0 0 -512px;width:1024px;height:768px;background:#000}.shower.full .slide{position:absolute;top:0;left:0;margin-left:-150%;visibility:hidden}.shower.full .slide:target{margin:0;visibility:visible}.shower.full .slide pre code:not(:only-child).mark.next{visibility:visible;background:0 0}.shower.full .slide pre code:not(:only-child).mark.next.active{background:rgba(88,90,94,.1)}.shower.full .slide .next{visibility:hidden}.shower.full .slide .next.active{visibility:visible}.shower.full .slide .shout.grow,.shower.full .slide .shout.shrink{opacity:0;-webkit-transition:.4s ease-out;transition:.4s ease-out;-webkit-transition-property:opacity,-webkit-transform;transition-property:opacity,transform;transition-property:opacity,transform,-webkit-transform}.shower.full .slide .shout.grow{-webkit-transform:scale(.1) translateY(-50%);transform:scale(.1) translateY(-50%)}.shower.full .slide .shout.shrink{-webkit-transform:scale(10) translateY(-50%);transform:scale(10) translateY(-50%)}.shower.full .slide:target .shout.grow,.shower.full .slide:target .shout.shrink{opacity:1;-webkit-transform:scale(1) translateY(-50%);transform:scale(1) translateY(-50%)}.shower.full .progress{display:block;-webkit-transform:translateZ(0);transform:translateZ(0)}.shower.full .region{position:absolute;clip:rect(0 0 0 0);overflow:hidden;margin:-1px;padding:0;width:1px;height:1px;border:none;display:block}} \ No newline at end of file diff --git a/doc/presentations/meetup4/LICENSE.md b/doc/presentations/meetup4/LICENSE.md deleted file mode 100644 index bd3449d3576..00000000000 --- a/doc/presentations/meetup4/LICENSE.md +++ /dev/null @@ -1,21 +0,0 @@ 
-# The MIT License - -Copyright © 2010–2015 Vadim Makeev, http://pepelsbey.net/ - -Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - ---- - -# Лицензия MIT - -Copyright © 2010–2015 Вадим Макеев, http://pepelsbey.net/ - -Данная лицензия разрешает лицам, получившим копию данного программного обеспечения и сопутствующей документации (в дальнейшем именуемыми «Программное Обеспечение»), безвозмездно использовать Программное Обеспечение без ограничений, включая неограниченное право на использование, копирование, изменение, добавление, публикацию, распространение, сублицензирование и/или продажу копий Программного Обеспечения, также как и лицам, которым предоставляется данное Программное Обеспечение, при соблюдении следующих условий: - -Указанное выше уведомление об авторском праве и данные условия должны быть включены во все копии или значимые части данного Программного Обеспечения. 
- -ДАННОЕ ПРОГРАММНОЕ ОБЕСПЕЧЕНИЕ ПРЕДОСТАВЛЯЕТСЯ «КАК ЕСТЬ», БЕЗ КАКИХ-ЛИБО ГАРАНТИЙ, ЯВНО ВЫРАЖЕННЫХ ИЛИ ПОДРАЗУМЕВАЕМЫХ, ВКЛЮЧАЯ, НО НЕ ОГРАНИЧИВАЯСЬ ГАРАНТИЯМИ ТОВАРНОЙ ПРИГОДНОСТИ, СООТВЕТСТВИЯ ПО ЕГО КОНКРЕТНОМУ НАЗНАЧЕНИЮ И ОТСУТСТВИЯ НАРУШЕНИЙ ПРАВ. НИ В КАКОМ СЛУЧАЕ АВТОРЫ ИЛИ ПРАВООБЛАДАТЕЛИ НЕ НЕСУТ ОТВЕТСТВЕННОСТИ ПО ИСКАМ О ВОЗМЕЩЕНИИ УЩЕРБА, УБЫТКОВ ИЛИ ДРУГИХ ТРЕБОВАНИЙ ПО ДЕЙСТВУЮЩИМ КОНТРАКТАМ, ДЕЛИКТАМ ИЛИ ИНОМУ, ВОЗНИКШИМ ИЗ, ИМЕЮЩИМ ПРИЧИНОЙ ИЛИ СВЯЗАННЫМ С ПРОГРАММНЫМ ОБЕСПЕЧЕНИЕМ ИЛИ ИСПОЛЬЗОВАНИЕМ ПРОГРАММНОГО ОБЕСПЕЧЕНИЯ ИЛИ ИНЫМИ ДЕЙСТВИЯМИ С ПРОГРАММНЫМ ОБЕСПЕЧЕНИЕМ. diff --git a/doc/presentations/meetup4/clickhouse_for_analysts.pdf b/doc/presentations/meetup4/clickhouse_for_analysts.pdf deleted file mode 100644 index 6008d3dfc27..00000000000 Binary files a/doc/presentations/meetup4/clickhouse_for_analysts.pdf and /dev/null differ diff --git a/doc/presentations/meetup4/index.html b/doc/presentations/meetup4/index.html deleted file mode 100644 index 59f71b1ca7e..00000000000 --- a/doc/presentations/meetup4/index.html +++ /dev/null @@ -1,341 +0,0 @@ - - - - ClickHouse meetup в Новосибирске - - - - - - -
-

ClickHouse meetup в Новосибирске

-
- -
-

ClickHouse meetup
в Новосибирске

-
- -
-

ClickHouse: настоящее и будущее

-
- -
-

Что такое ClickHouse?

-

ClickHouse - distributed analytical column-oriented DBMS

-
- -
-

Почему column-oriented?

-

Так работают row-oriented системы:

-

-
-
-

Почему column-oriented?

-

Так работают column-oriented системы:

-

-
-
-

Почему ClickHouse?

-

Ничего готового не подошло.

-

Тогда мы сделали ClickHouse.

-

«Эволюция структур данных в Яндекс.Метрике»

-

https://habrahabr.ru/company/yandex/blog/273305/

-
-
-

Метрика 2.0

- -
-
-

Коротко

-
    -
  • column-oriented
  • -
  • линейная масштабируемость
  • -
  • отказоустойчивость
  • -
  • загрузка данных в реальном времени
  • -
  • онлайн (sub-second) запросы
  • -
  • поддержка диалекта SQL + расширения
    (массивы, вложенные структуры данных, domain-specific функции, сэмплирование)
  • -
-
-
-

Основной кластер Метрики

-
    -
  • >20 триллионов строк
  • -
  • 460 серверов
  • -
  • скорость обработки данных до двух терабайт в секунду
  • -
-

* Если вы хотите попробовать ClickHouse, достаточно и одного сервера.

-
-
-

ClickHouse в Яндексе

-

Нам удалось сделать систему сравнительно удобной.

-

С самого начала мы имели подробную документацию.

-

В течение пары лет ClickHouse распространился по другим отделам Яндекса.

-

Почта, Маркет, Директ, Вебмастер, AdFox, Инфраструктура, Бизнес аналитика...

-

Есть случаи, когда аналитики самостоятельно устанавливали ClickHouse на виртуальные машины и успешно использовали без каких-либо вопросов.

-
-
-

Open-source

-

Потом мы решили — ClickHouse слишком хорошая система, чтобы нам одним на нём сидеть.

-

Чтобы было веселее, надо подсадить на ClickHouse людей снаружи, пусть радуются. Решили сделать open-source.

-
-
-

Open-source

-

Лицензия Apache 2.0 — минимум ограничений.

-

Цель — максимальное распространение продукта.

-

Мы хотим, чтобы продуктом Яндекса пользовались по всему миру.

-

См. “Яндекс открывает ClickHouse”

-

https://habrahabr.ru/company/yandex/blog/303282/

-
-
-

Когда надо использовать ClickHouse

-

Хорошо структурированные, очищенные, неизменяемые события.

-

 

-

Click stream. Веб-аналитика. Рекламные сети. RTB. E-commerce.

-

Аналитика онлайн игр. Данные сенсоров и мониторингов. Телеком данные.

-

Финансовые транзакции. Биржевая аналитика.

-
-
-

Когда не надо использовать ClickHouse

-

OLTP
В ClickHouse нет UPDATE и полноценных транзакций.

-

Key-Value
Если нужны частые запросы на обновление по ключу, используйте другое решение.

-

Blob-store, document oriented
ClickHouse предназначен для большого количества мелко-гранулированных данных.

-

Излишне нормализованные данные
Лучше сделать широкую таблицу фактов.

-
-
-

Почему ClickHouse такой быстрый?

-

 

-

— от безысходности.

-

Яндекс.Метрика должна работать.

-
-
-

Почему ClickHouse такой быстрый?

-

Алгоритмическая оптимизация.

-

MergeTree, локальность расположения данных на диске
— быстрые диапазонные запросы.

-

Пример: функция uniqCombined состоит из комбинации трёх различных структур данных, подходящих под разные диапазоны кардинальностей.

-

Низкоуровневая оптимизация.

-

Пример: vectorized query execution.

-

Специализация и внимание к деталям.

-

Пример: у нас есть 17 разных алгоритмов выполнения GROUP BY. Для вашего запроса выбирается лучший.

-
- - -
-

Что нового в ClickHouse

-
- -
-

Команда

-

Теперь 5 разработчиков.

-

- - - - - -

-
- -
- -

В предыдущих сериях

- -

— HTTP и executable источники;

-

— оптимизация мержей, вертикальный мерж;

-

— трассировка распределённых запросов;

-

— clickhouse-local;

-

— операторы BETWEEN, ||;

-

— функции UUID - text;

- -
-
- -

Новое в языке запросов

- -

— KILL QUERY;

-

— LIMIT BY;

-

— SELECT INTO OUTFILE;

- -
-
- -

Сборка

- -

— «правильная» сборка и пакеты;

-

— Таблица system.build_options;

- -
-
- -

Интерфейсы

- -

— возможность получить прогресс выполнения в HTTP заголовках;

-

— возможность пропуска ошибок в текстовых форматах;

-

— правильные коды ответа в HTTP интерфейсе;

- -
-
- -

Словари

- -

— производительность кэшируемых внешних словарей;

-

— инструментирование кэшируемых внешних словарей;

-

— HTTPS словари;

- -
-
- -

Инструментирование

- -

— информация об использовании памяти под индекс;

-

— информация о размере столбцов в несжатом виде;

-

— метрики по потреблению оперативки кэшами;

-

— метрики про мержи;

- -
-
- -

Оптимизации

- -

— оптимизация DISTINCT;

-

— производительность gzip в HTTP интерфейсе;

-

— оптимизация mark cache;

- -
-
- -

Функции

- -

— правильная логика сравнений, least, greatest;

-

— groupUniqArray для всех типов данных;

-

— decodeURLComponent;

- -
-
- -

Что-то ещё

- -

— защита от случайного DROP TABLE;

-

— use_client_time_zone; timezone в конфиге;

-

— fsync_metadata;

- -
-
- -

Сообщество

- -

— интеграция с Grafana, Redash, Apache Zeppelin, Superset;

-

— правильные пакеты для CentOS, RHEL, GosLinux;

-

— драйвер native протокола для Go и C++;

-

— возможность передавать заголовки X-ClickHouse-*;

-

— бенчмарки NYC Taxi, Percona (Spark);

-

— бенчмарк Greenplum;

-

— англоязычный Telegram чат;

-

— встречи и доклады (Париж - февраль, Сан-Франциско - апрель);

- -
- -
-

ClickHouse vs. Spark

-

https://www.percona.com/blog/2017/02/13/clickhouse-new-opensource-columnar-database/

- -
- -
-

ClickHouse vs. Greenplum

-

-
- -
- -

TODO (март-апрель 2017)

- -

— распределённые DDL запросы;

-

конфиги в ZooKeeper;

-

— полная поддержка NULL;

- -
-
- -

TODO (весна-лето 2017)

- -

— работоспособность ODBC драйвера под Windows;

-

— переделать анализ запроса: правильная поддержка JOIN;

- -
- -
-

Сообщество

-

Сайт: https://clickhouse.yandex/

-

Google groups: https://groups.google.com/forum/#!forum/clickhouse

-

Рассылка: clickhouse-feedback@yandex-team.com

-

Telegram чат: https://telegram.me/clickhouse_en and https://telegram.me/clickhouse_ru (уже 500 участников)

-

GitHub: https://github.com/yandex/ClickHouse/

-

 

-

+ встречи. Москва, Санкт-Петербург, Новосибирск... Далее: Екатеринбург, Киев, Минск, Сан-Франциско...

-
- -
-

Бонус

-
- - -
-

Подключение к ClickHouse

-

HTTP REST

-

clickhouse-client

-

JDBC

-

 

-

Python, PHP, Go, Perl, Ruby, Node.JS, R, .NET

-

 

-

Web UI: https://github.com/smi2/clickhouse-frontend

-
- -
-

ClickHouse vs. typical row-oriented DBMS

-

Itai Shirav:

«I haven't made a rigorous comparison, but I did convert a time-series table with 9 million rows from Postgres to ClickHouse.

-

Under ClickHouse queries run about 100 times faster, and the table takes 20 times less disk space. Which is pretty amazing if you ask me».

-
-
-

 

-

Bao Dang:

«Obviously, ClickHouse outperformed PostgreSQL at any metric».

-

https://github.com/AnalyticsGo/AnalyticsGo/issues/1

-
-
-

ClickHouse vs. Vertica

-

Timur Shenkao:

«ClickHouse is extremely fast at simple SELECTs without joins, much faster than Vertica».

-
-
-

ClickHouse vs. PrestoDB

-

Ömer Osman Koçak:

- «When we evaluated ClickHouse the results were great compared to Prestodb. Even though the columnar storage optimizations for ORC and Clickhouse is quite similar, Clickhouse uses CPU and Memory resources more efficiently (Presto also uses vectorized execution but cannot take advantage of hardware level optimizations such as SIMD instruction sets because it's written in Java so that's fair) so we also wanted to add support for Clickhouse for our open-source analytics platform Rakam (https://github.com/rakam-io/rakam)»

-
-
-

ClickHouse vs. Spark

-

«Я потестировал Clickhouse, по скорости просто отлично = намного быстрее spark на одной машине (у меня получилось порядка 3x, но еще буду сравнивать). Кроме того compression получается лучше».

-
-
-

ClickHouse vs. Google BigQuery

-

«ClickHouse показывает сравнимую скорость на таком запросе за 30 дней и в 8 раз быстрее (!) на таком запросе. В планах есть протестировать и другие запросы, еще не добрались.

Скорость выполнения запросов стабильна. В Google BigQuery в период пиковых нагрузок, например в 4:00 p.m. PDT или в начале месяца, время выполнения запросов может заметно увеличиваться».

-
-
-

ClickHouse vs. Druid

-

«В этом году мы развернули сборку на основе Druid — Imply Analytics Platform, а также Tranquility, и уже приготовились запускать в продакшн… Но после выхода ClickHouse сразу отказались от Druid, хотя потратили два месяца на его изучение и внедрение».

-

https://habrahabr.ru/company/smi2/blog/314558/

-
-
-

ClickHouse vs. InfiniDB

-

«结论:clickhouse速度更快!»

-

«In conclusion, ClickHouse is faster!»

-

http://verynull.com/2016/08/22/infinidb与clickhouse对比/

-

-
-
-

ClickHouse for sensor data

-

-
- -
- - - diff --git a/doc/presentations/meetup4/pictures/column_oriented.gif b/doc/presentations/meetup4/pictures/column_oriented.gif deleted file mode 100644 index 15f4b12e697..00000000000 Binary files a/doc/presentations/meetup4/pictures/column_oriented.gif and /dev/null differ diff --git a/doc/presentations/meetup4/pictures/greenplum.png b/doc/presentations/meetup4/pictures/greenplum.png deleted file mode 100644 index e919a45dadc..00000000000 Binary files a/doc/presentations/meetup4/pictures/greenplum.png and /dev/null differ diff --git a/doc/presentations/meetup4/pictures/infinidb_cn.png b/doc/presentations/meetup4/pictures/infinidb_cn.png deleted file mode 100644 index 957c392a448..00000000000 Binary files a/doc/presentations/meetup4/pictures/infinidb_cn.png and /dev/null differ diff --git a/doc/presentations/meetup4/pictures/kaspersky.png b/doc/presentations/meetup4/pictures/kaspersky.png deleted file mode 100644 index f8aae1da9ee..00000000000 Binary files a/doc/presentations/meetup4/pictures/kaspersky.png and /dev/null differ diff --git a/doc/presentations/meetup4/pictures/metrika2.png b/doc/presentations/meetup4/pictures/metrika2.png deleted file mode 100644 index 3ee37e98fc6..00000000000 Binary files a/doc/presentations/meetup4/pictures/metrika2.png and /dev/null differ diff --git a/doc/presentations/meetup4/pictures/milovidov.jpg b/doc/presentations/meetup4/pictures/milovidov.jpg deleted file mode 100644 index eb0317f8608..00000000000 Binary files a/doc/presentations/meetup4/pictures/milovidov.jpg and /dev/null differ diff --git a/doc/presentations/meetup4/pictures/proller.jpg b/doc/presentations/meetup4/pictures/proller.jpg deleted file mode 100644 index 02b1daa1d6c..00000000000 Binary files a/doc/presentations/meetup4/pictures/proller.jpg and /dev/null differ diff --git a/doc/presentations/meetup4/pictures/row_oriented.gif b/doc/presentations/meetup4/pictures/row_oriented.gif deleted file mode 100644 index 53daa20f322..00000000000 Binary files 
a/doc/presentations/meetup4/pictures/row_oriented.gif and /dev/null differ diff --git a/doc/presentations/meetup4/pictures/spark.png b/doc/presentations/meetup4/pictures/spark.png deleted file mode 100644 index 3ae61297631..00000000000 Binary files a/doc/presentations/meetup4/pictures/spark.png and /dev/null differ diff --git a/doc/presentations/meetup4/pictures/stanly.jpg b/doc/presentations/meetup4/pictures/stanly.jpg deleted file mode 100644 index fb406f156ba..00000000000 Binary files a/doc/presentations/meetup4/pictures/stanly.jpg and /dev/null differ diff --git a/doc/presentations/meetup4/pictures/vludv.jpg b/doc/presentations/meetup4/pictures/vludv.jpg deleted file mode 100644 index ebe6db87126..00000000000 Binary files a/doc/presentations/meetup4/pictures/vludv.jpg and /dev/null differ diff --git a/doc/presentations/meetup4/pictures/ztlpn.jpg b/doc/presentations/meetup4/pictures/ztlpn.jpg deleted file mode 100644 index a2931b91585..00000000000 Binary files a/doc/presentations/meetup4/pictures/ztlpn.jpg and /dev/null differ diff --git a/doc/presentations/meetup4/shower/shower.min.js b/doc/presentations/meetup4/shower/shower.min.js deleted file mode 100644 index 449843ac45d..00000000000 --- a/doc/presentations/meetup4/shower/shower.min.js +++ /dev/null @@ -1,8 +0,0 @@ -/** - * Core for Shower HTML presentation engine - * shower-core v2.0.7, https://github.com/shower/core - * @copyright 2010–2016 Vadim Makeev, http://pepelsbey.net/ - * @license MIT - */ -!function(a){var b,c={NOT_RESOLVED:"NOT_RESOLVED",IN_RESOLVING:"IN_RESOLVING",RESOLVED:"RESOLVED"},d=function(){var l={trackCircularDependencies:!0,allowMultipleDeclarations:!0},m={},n=!1,o=[],p=function(a,d,e){e||(e=d,d=[]);var f=m[a];f||(f=m[a]={name:a,decl:b}),f.decl={name:a,prev:f.decl,fn:e,state:c.NOT_RESOLVED,deps:d,dependents:[],exports:b}},q=function(b,c,d){"string"==typeof b&&(b=[b]),n||(n=!0,k(v)),o.push({deps:b,cb:function(b,f){f?(d||e)(f):c.apply(a,b)}})},r=function(a){var b=m[a];return 
b?c[b.decl.state]:"NOT_DEFINED"},s=function(a){return!!m[a]},t=function(a){for(var b in a)a.hasOwnProperty(b)&&(l[b]=a[b])},u=function(){var a,b={};for(var c in m)m.hasOwnProperty(c)&&(a=m[c],(b[a.decl.state]||(b[a.decl.state]=[])).push(c));return b},v=function(){n=!1,w()},w=function(){var a,b=o,c=0;for(o=[];a=b[c++];)x(null,a.deps,[],a.cb)},x=function(a,b,c,d){var e=b.length;e||d([]);for(var g,h,i=[],j=function(a,b){if(b)return void d(null,b);if(!--e){for(var c,f=[],g=0;c=i[g++];)f.push(c.exports);d(f)}},k=0,l=e;k ")+'"')},h=function(a){return Error('Declaration of module "'+a.name+'" has already been provided')},i=function(a){return Error('Multiple declarations of module "'+a.name+'" have been detected')},j=function(a,b){for(var c,d=0;c=b[d++];)if(a===c)return!0;return!1},k=function(){var b=[],c=function(a){return 1===b.push(a)},d=function(){var a=b,c=0,d=b.length;for(b=[];c=0&&!b.defaultPrevented();){var d=a[c];d&&(d.context?d.callback.call(d.context,b):d.callback(b)),c--}}}),a(e)}),shower.modules.define("Plugins",["Emitter","util.extend"],function(a,b,c){function d(a){this.events=new b({context:this}),this._showerGlobal=a,this._showerInstances=a.getInited(),this._plugins={},this._instances=[],a.events.on("init",this._onShowerInit,this)}c(d.prototype,{destroy:function(){this._showerGlobal.events.off("init",this._onShowerInit,this),this._plugins=null},add:function(a,b){if(this._plugins.hasOwnProperty(a))throw new Error("Plugin "+a+" already exist.");return this._requireAndAdd({name:a,options:b}),this},remove:function(a){if(!this._plugins.hasOwnProperty(a))throw new Error("Plugin "+a+" not found.");return delete this._plugins[a],this.events.emit("remove",{name:a}),this},get:function(a,b){var c,d=this._plugins[a];if(d&&b)for(var 
e=0,f=this._instances.length;e=0;e--)if(d[e].getId()===a){b=d[e],c=e;break}return{slide:b,index:c}},_onSlideActivate:function(a){window.location.hash=a.get("slide").getId(),this._setTitle()},_onContainerSlideModeChange:function(){this._setTitle(),this.save()},_isSlideMode:function(){return this._shower.container.isSlideMode()},_onPopstate:function(){var a,b=this._shower,c=window.location.hash.substr(1),d=b.player.getCurrentSlide(),e=b.player.getCurrentSlideIndex();this._isSlideMode()&&e===-1?b.player.go(0):e===-1&&""!==window.location.hash&&b.player.go(0),d&&c!==d.getId()&&(a=this._getSlideById(c),b.player.go(a.index))},_setTitle:function(){var a=document.title,b=this._isSlideMode(),c=this._shower.player.getCurrentSlide();if(b&&c){var d=c.getTitle();document.title=d?d+" — "+this._documentTitle:this._documentTitle}else this._documentTitle!==a&&(document.title=this._documentTitle)}}),a(e)}),shower.modules.define("shower.Player",["Emitter","util.bound","util.extend"],function(a,b,c,d){function e(a){this.events=new b({context:this,parent:a.events}),this._shower=a,this._showerListeners=null,this._playerListeners=null,this._currentSlideNumber=-1,this._currentSlide=null,this.init()}d(e.prototype,{init:function(){this._showerListeners=this._shower.events.group().on("slideadd",this._onSlideAdd,this).on("slideremove",this._onSlideRemove,this).on("slidemodeenter",this._onSlideModeEnter,this),this._playerListeners=this.events.group().on("prev",this._onPrev,this).on("next",this._onNext,this),document.addEventListener("keydown",c(this,"_onKeyDown"))},destroy:function(){this._showerListeners.offAll(),this._playerListeners.offAll(),document.removeEventListener("keydown",c(this,"_onKeyDown")),this._currentSlide=null,this._currentSlideNumber=null,this._shower=null},next:function(){return this.events.emit("next"),this},prev:function(){return this.events.emit("prev"),this},first:function(){return this.go(0),this},last:function(){return 
this.go(this._shower.getSlidesCount()-1),this},go:function(a){"number"!=typeof a&&(a=this._shower.getSlideIndex(a));var b=this._shower.getSlidesCount(),c=this._currentSlide;return a!=this._currentSlideNumber&&a=0&&(c&&c.isActive()&&c.deactivate(),c=this._shower.get(a),this._currentSlide=c,this._currentSlideNumber=a,c.isActive()||c.activate(),this.events.emit("activate",{index:a,slide:c})),this},getCurrentSlide:function(){return this._currentSlide},getCurrentSlideIndex:function(){return this._currentSlideNumber},_onPrev:function(){this._changeSlide(this._currentSlideNumber-1)},_onNext:function(){this._changeSlide(this._currentSlideNumber+1)},_changeSlide:function(a){this.go(a)},_onSlideAdd:function(a){var b=a.get("slide");b.events.on("activate",this._onSlideActivate,this)},_onSlideRemove:function(a){var b=a.get("slide");b.events.off("activate",this._onSlideActivate,this)},_onSlideActivate:function(a){var b=a.get("slide"),c=this._shower.getSlideIndex(b);this.go(c)},_onKeyDown:function(a){if(this._shower.isHotkeysEnabled()&&!/^(?:button|input|select|textarea)$/i.test(a.target.tagName))switch(this.events.emit("keydown",{event:a}),a.which){case 33:case 38:case 37:case 72:case 75:if(a.altKey||a.ctrlKey||a.metaKey)return;a.preventDefault(),this.prev();break;case 34:case 40:case 39:case 76:case 74:if(a.altKey||a.ctrlKey||a.metaKey)return;a.preventDefault(),this.next();break;case 36:a.preventDefault(),this.first();break;case 35:a.preventDefault(),this.last();break;case 32:this._shower.container.isSlideMode()&&(a.shiftKey?this.prev():this.next())}},_onSlideModeEnter:function(){this._currentSlide||this.go(0)}}),a(e)}),shower.modules.define("shower.slidesParser",["Slide"],function(a,b){function c(a,c){var d=a.querySelectorAll(c);return d=Array.prototype.slice.call(d),d.map(function(a,c){var d=new b(a);return 
a.id||(a.id=c+1),d})}a(c)}),shower.modules.define("Slide",["shower.defaultOptions","Emitter","Options","slide.Layout","slide.layoutFactory","util.Store","util.extend"],function(a,b,c,d,e,f,g,h){function i(a,b,e){this.events=new c,this.options=new d(b),this.layout=null,this.state=new g({visited:0,index:null},e),this._content=a,this._isVisited=this.state.get("visited")>0,this._isActive=!1,this.init()}h(i.prototype,{init:function(){this.layout="string"==typeof this._content?new f.createLayout({content:this._content}):new e(this._content,this.options),this.layout.setParent(this),this._setupListeners()},destroy:function(){this._clearListeners(),this._isActive=null,this.options=null,this.layout.destroy()},activate:function(){this._isActive=!0;var a=this.state.get("visited");return this.state.set("visited",++a),this.events.emit("activate",{slide:this}),this},deactivate:function(){return this._isActive=!1,this.events.emit("deactivate",{slide:this}),this},isActive:function(){return this._isActive},isVisited:function(){return this.state.get("visited")>0},getTitle:function(){return this.layout.getTitle()},setTitle:function(a){return this.layout.setTitle(a),this},getId:function(){return this.layout.getElement().id},getContent:function(){return this.layout.getContent()},_setupListeners:function(){this.layoutListeners=this.layout.events.group().on("click",this._onSlideClick,this)},_clearListeners:function(){this.layoutListeners.offAll()},_onSlideClick:function(){this.activate(),this.events.emit("click",{slide:this})}}),a(i)}),shower.modules.define("slide.Layout",["Options","shower.defaultOptions","Emitter","util.bound","util.extend"],function(a,b,c,d,e,f){function g(a,e){this.options=new b({title_element_selector:c.slide_title_element_selector,active_classname:c.slide_active_classname,visited_classname:c.slide_visited_classname},e),this.events=new d,this._element=a,this._parent=null,this._parentElement=null,this.init()}f(g.prototype,{init:function(){var 
a=this._element.parentNode;a?this._parentElement=a:this.setParentElement(a)},destroy:function(){this.setParent(null)},setParent:function(a){this._parent!=a&&(this._clearListeners(),this._parent=a,this._parent&&this._setupListeners(),this.events.emit("parentchange",{parent:a}))},getParent:function(){return this._parent},setParentElement:function(a){a!=this._parentElement&&(this._parentElement=a,a.appendChild(this._element),this.events.emit("parentelementchange",{parentElement:a}))},getParentElement:function(){return this._parentElement},getElement:function(){return this._element},setTitle:function(a){var b=this.options.get("title_element_selector"),c=this._element.querySelector(b);c?c.innerHTML=a:(c=document.createElement(b),c.innerHTML=a,this._element.insertBefore(c,this._element.firstChild))},getTitle:function(){var a=this.options.get("title_element_selector"),b=this._element.querySelector(a);return b?b.textContent:null},getData:function(a){var b=this._element;return b.dataset?b.dataset[a]:b.getAttribute("data-"+a)},getContent:function(){return this._element.innerHTML},_setupListeners:function(){this._slideListeners=this._parent.events.group().on("activate",this._onSlideActivate,this).on("deactivate",this._onSlideDeactivate,this),this._element.addEventListener("click",e(this,"_onSlideClick"),!1)},_clearListeners:function(){this._slideListeners&&this._slideListeners.offAll(),this._element.removeEventListener("click",e(this,"_onSlideClick"))},_onSlideActivate:function(){this._element.classList.add(this.options.get("active_classname"))},_onSlideDeactivate:function(){var a=this._element.classList;a.remove(this.options.get("active_classname")),a.add(this.options.get("visited_classname"))},_onSlideClick:function(){this.events.emit("click")}}),a(g)}),shower.modules.define("slide.layoutFactory",["slide.Layout","util.extend"],function(a,b,c){var d={};c(d,{createLayout:function(a){a=a||{};var e=d._createElement(c({content:"",contentType:"slide"},a));return new 
b(e)},_createElement:function(a){var b=document.createElement("section");return b.innerHTML=a.content,b.classList.add(a.contentType),b}}),a(d)}),shower.modules.define("util.bound",function(a){function b(a,b){return a["__bound_"+b]||(a["__bound_"+b]=a[b].bind(a))}a(b)}),shower.modules.define("util.extend",function(a){function b(a){if(!a)throw new Error("util.extend: Target not found");return"undefined"==typeof Object.assign?c.apply(null,arguments):Object.assign.apply(null,arguments)}function c(a){for(var b=1,c=arguments.length;b0&&(a.preventDefault(),this.prev())},_go:function(){for(var a=0,b=this._elements.length;awindow.innerWidth/2?c.player.next():c.player.prev()),d||f.activate())},_onTouchMove:function(a){this._shower.container.isSlideMode()&&a.preventDefault()},_getSlideByElement:function(a){for(var b=this._shower.getSlides(),c=null,d=0,e=b.length;d` of your presentation. - -## PDF - -Ribbon could be exported to PDF by printing it from the list mode in Chrome or Opera browsers. See [printing documentation](https://github.com/shower/shower/blob/master/docs/printing-en.md) for more options. - -## Development - -If you want to adjust theme for your needs: - -1. Fork this repository and clone it to your local machine. -2. Install dependencies: `npm install`. -3. Start a local server with watcher: `npm run dev` or just `gulp` if you have it installed globally. -4. Edit your files and see changes in the opened browser. - -To take part in Ribbon development please read [contributing guidelines](CONTRIBUTING.md) first and [file an issue](https://github.com/shower/shower/issues/new) before sending any pull request. - ---- -Licensed under [MIT License](LICENSE.md). 
diff --git a/doc/presentations/meetup4/shower/themes/ribbon/index.html b/doc/presentations/meetup4/shower/themes/ribbon/index.html deleted file mode 100644 index 98850917e05..00000000000 --- a/doc/presentations/meetup4/shower/themes/ribbon/index.html +++ /dev/null @@ -1,304 +0,0 @@ - - - - Ribbon theme for Shower - - - - - - -
-

Presentation Title

-

Yours Truly, Famous Inc.

-
-
-

Slide Header

-

Echo Park 8-bit sustainable umami deep v Kickstarter. DIY cliche typewriter brunch, Odd Future sriracha pickled aesthetic. Farm-to-table bespoke fingerstache, kale chips umami brunch letterpress.

-

Whatever authentic disrupt, you probably haven't heard of them direct trade mlkshk Etsy. Gluten-free roof party plaid four loko quinoa.

-

Echo Park 8-bit sustainable umami deep v Kickstarter.

-
-
-

Inline Elements

-

Retro meh brunch aesthetic Cosby sweater Shoreditch. Banksy Tumblr sriracha, flexitarian pug chia master cleanse vinyl wayfarers fanny pack bespoke Helvetica roof party. Messenger bag retro cred Portland next level. Yr stumptown Schlitz Carles deep v small batch. Hella sustainable messenger bag, leggings skateboard literally1 bicycle rights H20 mumblecore banh mi DIY VHS. Semiotics four loko street art asymmetrical. Asymmetrical <paleo> you probably haven’t heard of.

-
-

Retro meh brunch aesthetic Cosby sweater Shoreditch. Banksy Tumblr sriracha, flexitarian pug chia master cleanse vinyl wayfarers fanny pack bespoke Helvetica roof party. Messenger bag retro cred Portland next level. Yr stumptown Schlitz Carles deep v small batch. Hella sustainable messenger bag, leggings skateboard literally1 bicycle rights H20 mumblecore banh mi DIY VHS. Semiotics four loko street art asymmetrical. Asymmetrical <paleo> you probably haven’t heard of.

-
-
-
-

Quotes

-
-

Flannel bicycle rights locavore selfies skateboard. Authentic fanny pack paleo four loko bespoke. Artisan tattooed chia XOXO ennui, lomo disrupt 8-bit art party Tumblr scenester.

-
-
-
-

Post-ironic fashion axe flexitarian, Tonx narwhal messenger bag Tumblr. Portland gentrify deep v kale chips literally.

-
-
Yours Truly
-
-
-
-

Nested Lists

-
    -
  1. Literally viral vegan, ugh drinking vinegar photo booth
  2. -
  3. Wes Anderson chillwave Marfa pour-over Etsy banh mi
  4. -
  5. Ethnic polaroid lo-fi iPhone ennui -
      -
    • Yr wayfarers before they sold out Kickstarter asymmetrical
    • -
    • Irony flexitarian readymade quinoa, kogi bespoke meggings narwhal
    • -
    • Skateboard Etsy twee artisan Echo Park
    • -
    -
  6. -
  7. Tonx kitsch fingerstache readymade, retro single-origin coffee
  8. -
-
-
-

Block Lists

-
    -
  • Retro meh brunch aesthetic Cosby sweater Shoreditch. Banksy Tumblr sriracha, flexitarian pug chia master cleanse vinyl wayfarers fanny pack.
  • -
  • Messenger bag retro cred Portland next level. Yr stumptown Schlitz Carles deep v small batch. Hella sustainable messenger bag.
  • -
  • Leggings skateboard literally1 bicycle rights H20 mumblecore banh mi DIY VHS. Semiotics four loko street art asymmetrical. Asymmetrical paleo you probably haven’t heard of.
  • -
-
-
-

Latin and Cyrillic List Bullets

-
    -
  • Occupy locavore blog, mustache you probably haven't heard of them
  • -
  • Skateboard pork belly aesthetic hoodie selfies brunch
  • -
  • Food truck gluten-free disrupt Portland
  • -
-
    -
  • Helvetica narwhal drinking vinegar chillwave, post-ironic ennui
  • -
  • Cray pug paleo retro, Echo Park narwhal Wes Anderson
  • -
  • Disrupt Williamsburg fixie, shabby chic bicycle rights hashtag kogi
  • -
-
-
-

Two Columns

-

Echo Park 8-bit sustainable umami deep v Kickstarter. DIY cliche typewriter brunch, Odd Future sriracha pickled aesthetic. Farm-to-table bespoke fingerstache, kale chips umami brunch American Apparel letterpress. Whatever authentic disrupt, you probably haven't heard of them direct trade mlkshk Etsy. Gluten-free roof party plaid American Apparel four loko quinoa. Echo Park 8-bit sustainable.

-
-
-

Three Columns

-

Echo Park 8-bit sustainable umami deep v Kickstarter. DIY cliche typewriter brunch, Odd Future sriracha pickled aesthetic. Farm-to-table bespoke fingerstache, kale chips umami brunch American Apparel letterpress. Whatever authentic disrupt, you probably haven't heard of them direct trade mlkshk Etsy. Gluten-free roof party plaid American Apparel four loko quinoa. Echo Park 8-bit sustainable.

-
-
-

Simple Table

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
GentrifyTweeArtisanBanksy
MessengerMixtapeSmall batchBicycle rights
MeggingsFreeganRetro biodieselSlow-carb
VeganOccupyNormcorePut a bird on it
Next levelSelfiesSustainableOrganic
UmamiAsymmetricalKeytarCraft beer
BiodieselHaven’t heardSkateboardFarm-to-table
-
-
-

Striped Table

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
GentrifyTweeArtisanBanksy
MessengerMixtapeSmall batchBicycle rights
MeggingsFreeganRetro biodieselSlow-carb
VeganOccupyNormcorePut a bird on it
Next levelSelfiesSustainableOrganic
UmamiAsymmetricalKeytarCraft beer
BiodieselHaven’t heardSkateboardFarm-to-table
-
-
-

Plain Code Listing

-
<html lang="en">
-<head> <!--Comment-->
-    <title>Shower</title>
-    <meta charset="UTF-8">
-    <link rel="stylesheet" href="screen.css">
-    <script src="script.js"></script>
-</head>
-
-
-

Numbered Code Listing

-
-            <html lang="en">
-            <head> <!--Comment-->
-                <title>Shower</title>
-                <meta charset="UTF-8">
-                <link rel="stylesheet" href="screen.css">
-                <script src="script.js"></script>
-            </head>
-        
-
-
-

Highlighted Code Lines

-
-            <html lang="en">
-            <head> <!--Comment-->
-                <title>Shower</title>
-                <meta charset="UTF-8">
-                <link rel="stylesheet" href="screen.css">
-                <script src="script.js"></script>
-            </head>
-        
-
-
-

Hidden Code Steps

-
-            <html lang="en">
-            <head> <!--Comment-->
-                <title>Shower</title>
-                <meta charset="UTF-8">
-                <link rel="stylesheet" href="screen.css">
-                <script src="script.js"></script>
-            </head>
-        
-
-
-

Highlighted Code Steps

-
-            <html lang="en">
-            <head> <!--Comment-->
-                <title>Shower</title>
-                <meta charset="UTF-8">
-                <link rel="stylesheet" href="screen.css">
-                <script src="script.js"></script>
-            </head>
-        
-
-
-

Grid Slide

-
-
-

White Slide

-
-
-

Black Slide

-
-
-

Shout

-
-
-

Multiline
Shout

-
-
-

Linked Shout

-
-
-

Growing Shout

-
-
-

Shrinking Shout

-
-
-
- -
Copyright © 2016 Yours Truly, Famous Inc.
-
-
-
- -
-
- -
-
- - - - - - - - - -
-
-

Timer

-
-
-

List Navigation

-
    -
  1. Ennui keffiyeh thundercats
  2. - - - - -
-

Before they sold out master

-
-
- - - - - diff --git a/doc/presentations/meetup4/shower/themes/ribbon/pictures/canvas.png b/doc/presentations/meetup4/shower/themes/ribbon/pictures/canvas.png deleted file mode 100644 index 6ddd30154f2..00000000000 Binary files a/doc/presentations/meetup4/shower/themes/ribbon/pictures/canvas.png and /dev/null differ diff --git a/doc/presentations/meetup4/shower/themes/ribbon/pictures/exact.png b/doc/presentations/meetup4/shower/themes/ribbon/pictures/exact.png deleted file mode 100644 index b27251c57cb..00000000000 Binary files a/doc/presentations/meetup4/shower/themes/ribbon/pictures/exact.png and /dev/null differ diff --git a/doc/presentations/meetup4/shower/themes/ribbon/pictures/square.png b/doc/presentations/meetup4/shower/themes/ribbon/pictures/square.png deleted file mode 100644 index 62cb2384a5f..00000000000 Binary files a/doc/presentations/meetup4/shower/themes/ribbon/pictures/square.png and /dev/null differ diff --git a/doc/presentations/meetup4/shower/themes/ribbon/pictures/tall.png b/doc/presentations/meetup4/shower/themes/ribbon/pictures/tall.png deleted file mode 100644 index fbc9f09a2ab..00000000000 Binary files a/doc/presentations/meetup4/shower/themes/ribbon/pictures/tall.png and /dev/null differ diff --git a/doc/presentations/meetup4/shower/themes/ribbon/pictures/wide.png b/doc/presentations/meetup4/shower/themes/ribbon/pictures/wide.png deleted file mode 100644 index 1e83b0ac7ad..00000000000 Binary files a/doc/presentations/meetup4/shower/themes/ribbon/pictures/wide.png and /dev/null differ diff --git a/doc/presentations/meetup4/shower/themes/ribbon/styles/presentation_links.html b/doc/presentations/meetup4/shower/themes/ribbon/styles/presentation_links.html deleted file mode 100644 index 74981d8eac5..00000000000 --- a/doc/presentations/meetup4/shower/themes/ribbon/styles/presentation_links.html +++ /dev/null @@ -1,2 +0,0 @@ -http://www.slideshare.net/AlexeyMilovidov1/clickhouse-69616890/AlexeyMilovidov1/clickhouse-69616890 
-file:///home/milovidov/work/Presentation/shower/index.html#cover diff --git a/doc/presentations/meetup4/shower/themes/ribbon/styles/screen-16x10.css b/doc/presentations/meetup4/shower/themes/ribbon/styles/screen-16x10.css deleted file mode 100644 index d21f190feea..00000000000 --- a/doc/presentations/meetup4/shower/themes/ribbon/styles/screen-16x10.css +++ /dev/null @@ -1,204 +0,0 @@ -/** - * Ribbon theme for Shower HTML presentation engine - * shower-ribbon v2.0.8, https://github.com/shower/ribbon - * @copyright 2010–2016 Vadim Makeev, http://pepelsbey.net/ - * @license MIT - */ -@charset "UTF-8"; - - @font-face { - font-family: 'Yandex Sans Text Web'; - src: url(https://yastatic.net/adv-www/_/yy5JveR58JFkc97waf-xp0i6_jM.eot); - src: url(https://yastatic.net/adv-www/_/yy5JveR58JFkc97waf-xp0i6_jM.eot?#iefix) format('embedded-opentype'), - url(https://yastatic.net/adv-www/_/CYblzLEXzCqQIvrYs7QKQe2omRk.woff2) format('woff2'), - url(https://yastatic.net/adv-www/_/pUcnOdRwl83MvPPzrNomhyletnA.woff) format('woff'), - url(https://yastatic.net/adv-www/_/vNFEmXOcGYKJ4AAidUprHWoXrLU.ttf) format('truetype'), - url(https://yastatic.net/adv-www/_/0w7OcWZM_QLP8x-LQUXFOgXO6dE.svg#YandexSansTextWeb-Bold) format('svg'); - font-weight: 700; - font-style: normal; - font-stretch: normal - } - - @font-face { - font-family: 'Yandex Sans Text Web'; - src: url(https://yastatic.net/adv-www/_/LI6l3L2RqcgxBe2pXmuUha37czQ.eot); - src: url(https://yastatic.net/adv-www/_/LI6l3L2RqcgxBe2pXmuUha37czQ.eot?#iefix) format('embedded-opentype'), - url(https://yastatic.net/adv-www/_/z3MYElcut0R2MF_Iw1RDNrstgYs.woff2) format('woff2'), - url(https://yastatic.net/adv-www/_/1jvKJ_-hCXl3s7gmFl-y_-UHTaI.woff) format('woff'), - url(https://yastatic.net/adv-www/_/9nzjfpCR2QHvK1EzHpDEIoVFGuY.ttf) format('truetype'), - url(https://yastatic.net/adv-www/_/gwyBTpxSwkFCF1looxqs6JokKls.svg#YandexSansTextWeb-Regular) format('svg'); - font-weight: 400; - font-style: normal; - font-stretch: normal - } - - @font-face { 
- font-family: 'Yandex Sans Text Web'; - src: url(https://yastatic.net/adv-www/_/ayAFYoY8swgBLhq_I56tKj2JftU.eot); - src: url(https://yastatic.net/adv-www/_/ayAFYoY8swgBLhq_I56tKj2JftU.eot?#iefix) format('embedded-opentype'), - url(https://yastatic.net/adv-www/_/lGQcYklLVV0hyvz1HFmFsUTj8_0.woff2) format('woff2'), - url(https://yastatic.net/adv-www/_/f0AAJ9GJ4iiwEmhG-7PWMHk6vUY.woff) format('woff'), - url(https://yastatic.net/adv-www/_/4UDe4nlVvgEJ-VmLWNVq3SxCsA.ttf) format('truetype'), - url(https://yastatic.net/adv-www/_/EKLr1STNokPqxLAQa_RyN82pL98.svg#YandexSansTextWeb-Light) format('svg'); - font-weight: 300; - font-style: normal; - font-stretch: normal - } - - @font-face { - font-family: 'Yandex Sans Display Web'; - src: url(https://yastatic.net/adv-www/_/H63jN0veW07XQUIA2317lr9UIm8.eot); - src: url(https://yastatic.net/adv-www/_/H63jN0veW07XQUIA2317lr9UIm8.eot?#iefix) format('embedded-opentype'), - url(https://yastatic.net/adv-www/_/sUYVCPUAQE7ExrvMS7FoISoO83s.woff2) format('woff2'), - url(https://yastatic.net/adv-www/_/v2Sve_obH3rKm6rKrtSQpf-eB7U.woff) format('woff'), - url(https://yastatic.net/adv-www/_/PzD8hWLMunow5i3RfJ6WQJAL7aI.ttf) format('truetype'), - url(https://yastatic.net/adv-www/_/lF_KG5g4tpQNlYIgA0e77fBSZ5s.svg#YandexSansDisplayWeb-Regular) format('svg'); - font-weight: 400; - font-style: normal; - font-stretch: normal - } - - @font-face { - font-family: 'Yandex Sans Display Web'; - src: url(https://yastatic.net/adv-www/_/g8_MyyKVquSZ3xEL6tarK__V9Vw.eot); - src: url(https://yastatic.net/adv-www/_/g8_MyyKVquSZ3xEL6tarK__V9Vw.eot?#iefix) format('embedded-opentype'), - url(https://yastatic.net/adv-www/_/LGiRvlfqQHlWR9YKLhsw5e7KGNA.woff2) format('woff2'), - url(https://yastatic.net/adv-www/_/40vXwNl4eYYMgteIVgLP49dwmfc.woff) format('woff'), - url(https://yastatic.net/adv-www/_/X6zG5x_wO8-AtwJ-vDLJcKC5228.ttf) format('truetype'), - url(https://yastatic.net/adv-www/_/ZKhaR0m08c8CRRL77GtFKoHcLYA.svg#YandexSansDisplayWeb-Light) format('svg'); - 
font-weight: 300; - font-style: normal; - font-stretch: normal - } - -*,::after,::before{box-sizing:border-box} -a,abbr,acronym,address,applet,article,aside,audio,b,big,blockquote,body,canvas,caption,center,cite,code,dd,del,details,dfn,div,dl,dt,em,embed,fieldset,figcaption,figure,footer,form,h1,h2,h3,h4,h5,h6,header,hgroup,html,i,iframe,img,ins,kbd,label,legend,li,mark,menu,nav,object,ol,output,p,pre,q,ruby,s,samp,section,small,span,strike,strong,sub,summary,sup,table,tbody,td,tfoot,th,thead,time,tr,tt,u,ul,var,video{margin:0;padding:0;border:0;font:inherit;vertical-align:baseline} -article,aside,details,figcaption,figure,footer,header,hgroup,main,menu,nav,section{display:block} -.caption p,body{line-height:1} -p {line-height: 1} -ol,ul{list-style:none} -blockquote,q{quotes:none} -blockquote::after,blockquote::before,q::after,q::before{content:none} -table{border-collapse:collapse;border-spacing:0} -a{text-decoration:none} -@page{margin:0;size:1280px 720px} -.shower{color:#000;counter-reset:slide;font:25px/2 Yandex Sans Display Web,sans-serif;-webkit-print-color-adjust:exact;-webkit-text-size-adjust:none;-moz-text-size-adjust:none;-ms-text-size-adjust:none} -@media print{.shower{text-rendering:geometricPrecision} -} -.caption{font-size:25px;display:none;margin-top:-.2em;padding:0 1em .93em;width:100%;color:#3c3d40;text-shadow:0 1px 0 #8d8e90} -@media (min-width:1174px){.caption{font-size:50px} -} -@media (min-width:2348px){.caption{font-size:100px} -} -.caption h1{padding-bottom:.15em;font:1em/2 Yandex Sans Display Web,sans-serif} -.caption p{font-size:.6em} -.caption a{color:#4b86c2;text-shadow:0 -1px 0 #1f3f60} -.slide{position:relative;z-index:1;overflow:hidden;padding:20px 100px 0;width:1024px;height:640px;background:#fff;font-size:25px} - -/*.slide::after{position:absolute;top:0;right:100px;padding-top:15px;width:50px;height:100px;background:url(../images/ribbon.svg) no-repeat;color:#fff;counter-increment:slide;content:counter(slide);text-align:center}*/ - 
-.slide h1{vertical-align:middle; color:#000;font:400 50px/2 Yandex Sans Display Web,sans-serif} -.slide h2{margin-bottom:34px;color:#000;font:400 50px/2 Yandex Sans Display Web,sans-serif} -.slide p{margin-bottom:1em} -.slide p.note{color:#979a9e} -.slide a{background:-webkit-linear-gradient(bottom,currentColor .09em,transparent .09em) repeat-x;background:linear-gradient(to top,currentColor .09em,transparent .09em) repeat-x;color:#4b86c2} -.slide b,.slide strong{font-weight:700} -.slide blockquote,.slide dfn,.slide em,.slide i{font-style:italic} -.slide code,.slide kbd,.slide mark,.slide samp{padding:.1em .3em;border-radius:.2em} -.slide code,.slide kbd,.slide samp{background:rgba(88,90,94,.1);line-height:1;font-family:PT Mono,monospace,monospace} -.slide mark{background:#fafaa2} -.slide sub,.slide sup{position:relative;line-height:0;font-size:75%} -.slide sub{bottom:-.25em} -.slide sup{top:-.5em} -.slide blockquote::before{position:absolute;margin:-.15em 0 0 -.43em;color:#ccc;line-height:1;font-size:8em;content:'\201C'} -.slide blockquote+figcaption{margin:-1em 0 1em;font-style:italic;font-weight:700} -.slide ol,.slide ul{margin-bottom:0em;counter-reset:list} -.slide ol li,.slide ul li{page-break-inside:avoid;text-indent:-2em} -.slide ol li::before,.slide ul li::before{display:inline-block;width:2em;color:#979a9e;text-align:right} -.slide ol ol,.slide ol ul,.slide ul ol,.slide ul ul{margin-bottom:0;margin-left:2em} -.slide ul>li::before{padding-right:.5em;content:'•'} -.slide ul>li:lang(ru)::before{content:'—'} -.slide ol>li::before{padding-right:.4em;counter-increment:list;content:counter(list) "."} -.slide table{margin-left:-100px;margin-bottom:1em;width:calc(100% + 100px + 100px)} -.slide table td:first-child,.slide table th:first-child{padding-left:96px} -.slide table td:last-child,.slide table th:last-child{padding-right:96px} -.slide table th{text-align:left;font-weight:700} -.slide table 
tr:not(:last-of-type)>*{background:-webkit-linear-gradient(bottom,rgba(88,90,94,.5) .055em,transparent .055em) repeat-x;background:linear-gradient(to top,rgba(88,90,94,.5) .055em,transparent .055em) repeat-x} -.slide table.striped tr:nth-child(even){background:rgba(88,90,94,.1)} -.slide table.striped tr>*{background-image:none} -.slide pre{margin-bottom:1em;counter-reset:code;white-space:normal} -.slide pre code{display:block;margin-left:-100px;padding:0 0 0 100px;width:calc(100% + 100px + 100px);border-radius:0;background:0 0;line-height:2;white-space:pre;-moz-tab-size:4;-o-tab-size:4;tab-size:4} -.slide pre code:not(:only-child).mark{background:rgba(88,90,94,.1)} -.slide pre code:not(:only-child)::before{position:absolute;margin-left:-2em;color:#979a9e;counter-increment:code;content:counter(code,decimal-leading-zero) "."} -.slide pre mark{position:relative;z-index:-1;margin:0 -.3em} -.slide pre mark.important{background:#c00;color:#fff} -.slide pre .comment{color:#999} -.slide footer{position:absolute;right:0;bottom:-640px;left:0;display:none;padding:41px 100px 8px;background:#fbfbba;box-shadow:0 1px 0 #fafaa2 inset;-webkit-transition:bottom .3s;transition:bottom .3s} -.slide footer mark{background:rgba(255,255,255,.8)} -.slide:hover>footer{bottom:0} -.slide.grid{background-image:url(../images/grid.png);-ms-interpolation-mode:nearest-neighbor;image-rendering:-webkit-optimize-contrast;image-rendering:-moz-crisp-edges;image-rendering:pixelated} -@media (-webkit-min-device-pixel-ratio:2),(min-resolution:2dppx){.slide.grid{background-image:url(../images/grid@2x.png);background-size:1024px auto} -} -.slide.black{background-color:#000} -.slide.black::after,.slide.white::after{visibility:hidden} -.slide.white{background-color:#fff} -.slide .double,.slide .triple{-webkit-column-gap:75px;-moz-column-gap:75px;column-gap:75px;-webkit-hyphens:auto;-ms-hyphens:auto;hyphens:auto} -.slide .double{-webkit-column-count:2;-moz-column-count:2;column-count:2} -.slide 
.triple{-webkit-column-count:3;-moz-column-count:3;column-count:3} -.slide .shout{position:absolute;top:50%;left:0;width:100%;text-align:center;line-height:1;font-size:150px;-webkit-transform:translateY(-50%);transform:translateY(-50%)} -.slide .shout a{background:-webkit-linear-gradient(bottom,currentColor .11em,transparent .11em) repeat-x;background:linear-gradient(to top,currentColor .11em,transparent .11em) repeat-x} -.slide .cover{z-index:-1;max-width:100%;max-height:100%} -.slide .cover.w,.slide .cover.width{width:100%;max-height:none} -.slide .cover.h,.slide .cover.height{height:100%;max-width:none} -.slide .cover+figcaption{position:absolute;bottom:20px;right:10px;font-size:12px;opacity:.7;-webkit-transform-origin:0 100%;transform-origin:0 100%;-webkit-transform:translateX(100%) rotate(-90deg);transform:translateX(100%) rotate(-90deg)} -.slide .cover+figcaption.white{color:#fff} -.slide .cover+figcaption a{color:currentcolor} -.slide .cover,.slide .place{position:absolute;top:50%;left:50%;-webkit-transform:translate(-50%,-50%);transform:translate(-50%,-50%)} -.slide .place.b.l,.slide .place.b.r,.slide .place.bottom.left,.slide .place.bottom.right,.slide .place.t.l,.slide .place.t.r,.slide .place.top.left,.slide .place.top.right{-webkit-transform:none;transform:none} -.slide .place.b,.slide .place.bottom,.slide .place.t,.slide .place.top{-webkit-transform:translate(-50%,0);transform:translate(-50%,0)} -.slide .place.l,.slide .place.left,.slide .place.r,.slide .place.right{-webkit-transform:translate(0,-50%);transform:translate(0,-50%)} -.slide .place.t,.slide .place.t.r,.slide .place.top,.slide .place.top.left,.slide .place.top.right{top:0} -.slide .place.r,.slide .place.right{right:0;left:auto} -.slide .place.b,.slide .place.b.l,.slide .place.b.r,.slide .place.bottom,.slide .place.bottom.left,.slide .place.bottom.right{top:auto;bottom:0} -.slide .place.l,.slide .place.left{left:0} 
-.progress{left:-20px;bottom:0;z-index:1;display:none;width:0;height:0;box-sizing:content-box;border:10px solid #4b86c2;border-right-color:transparent;-webkit-transition:width .2s linear;transition:width .2s linear;clip:rect(10px,1044px,20px,20px)} -.progress[style*='100%']{padding-left:10px} -.badge,.badge a,.progress{position:absolute} -.badge{font-size:10px;top:0;z-index:1;overflow:hidden;display:none;width:9em;height:9em;right:0;visibility:hidden} -@media (min-width:1174px){.badge{font-size:20px} -} -@media (min-width:2348px){.badge{font-size:40px} -} -.badge a{right:-50%;bottom:50%;left:-50%;visibility:visible;background:#4b86c2;color:#fff;text-align:center;line-height:2;-webkit-transform-origin:50% 100%;transform-origin:50% 100%;-webkit-transform:rotate(45deg);transform:rotate(45deg)} -.region{display:none} -@media screen{.shower.list{padding-top:25px;width:100%;display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-flex-wrap:wrap;-ms-flex-wrap:wrap;flex-wrap:wrap;background:#585a5e;position:absolute;clip:rect(0,auto,auto,0)} -} -@media screen and (min-width:1174px){.shower.list{padding-top:50px} -} -@media screen and (min-width:2348px){.shower.list{padding-top:100px} -} -@media screen{.shower.list .caption{display:block} -.shower.list .slide{-webkit-transform-origin:0 0;transform-origin:0 0;margin:0 -768px -455px 25px;-webkit-transform:scale(.25);transform:scale(.25);border-radius:2px;box-shadow:0 20px 50px rgba(0,0,0,.3)} -} -@media screen and (min-width:1174px){.shower.list .slide{margin:0 -512px -270px 50px;-webkit-transform:scale(.5);transform:scale(.5)} -} -@media screen and (min-width:2348px){.shower.list .slide{margin:0 0 100px 100px;-webkit-transform:scale(1);transform:scale(1)} -} -@media screen{.shower.list .slide:hover{box-shadow:0 0 0 20px rgba(0,0,0,.1),0 20px 50px rgba(0,0,0,.3)} -.shower.list .slide:target{box-shadow:0 0 0 1px #376da3,0 0 0 20px #4b86c2,0 20px 50px rgba(0,0,0,.3)} -.shower.list .slide 
*{pointer-events:none} -.shower.list .badge,.shower.list .slide footer{display:block} -.shower.full{position:absolute;top:50%;left:50%;overflow:hidden;margin:-320px 0 0 -512px;width:1024px;height:640px;background:#000} -.shower.full .slide{position:absolute;top:0;left:0;margin-left:-150%;visibility:hidden} -.shower.full .slide:target{margin:0;visibility:visible} -.shower.full .slide pre code:not(:only-child).mark.next{visibility:visible;background:0 0} -.shower.full .slide pre code:not(:only-child).mark.next.active{background:rgba(88,90,94,.1)} -.shower.full .slide .next{visibility:hidden} -.shower.full .slide .next.active{visibility:visible} -.shower.full .slide .shout.grow,.shower.full .slide .shout.shrink{opacity:0;-webkit-transition:.4s ease-out;transition:.4s ease-out;-webkit-transition-property:opacity,-webkit-transform;transition-property:opacity,transform;transition-property:opacity,transform,-webkit-transform} -.shower.full .slide .shout.grow{-webkit-transform:scale(.1) translateY(-50%);transform:scale(.1) translateY(-50%)} -.shower.full .slide .shout.shrink{-webkit-transform:scale(10) translateY(-50%);transform:scale(10) translateY(-50%)} -.shower.full .slide:target .shout.grow,.shower.full .slide:target .shout.shrink{opacity:1;-webkit-transform:scale(1) translateY(-50%);transform:scale(1) translateY(-50%)} -.shower.full .progress{display:block;-webkit-transform:translateZ(0);transform:translateZ(0)} -.shower.full .region{position:absolute;clip:rect(0 0 0 0);overflow:hidden;margin:-1px;padding:0;width:1px;height:1px;border:none;display:block} -} diff --git a/doc/presentations/meetup4/shower/themes/ribbon/styles/screen-4x3.css b/doc/presentations/meetup4/shower/themes/ribbon/styles/screen-4x3.css deleted file mode 100644 index 6648b972c30..00000000000 --- a/doc/presentations/meetup4/shower/themes/ribbon/styles/screen-4x3.css +++ /dev/null @@ -1,7 +0,0 @@ -/** - * Ribbon theme for Shower HTML presentation engine - * shower-ribbon v2.0.8, 
https://github.com/shower/ribbon - * @copyright 2010–2016 Vadim Makeev, http://pepelsbey.net/ - * @license MIT - */ -@charset "UTF-8";@font-face{font-family:PT Sans;src:url(../fonts/pt-sans-regular.woff) format("woff")}@font-face{font-weight:700;font-family:PT Sans;src:url(../fonts/pt-sans-bold.woff) format("woff")}@font-face{font-style:italic;font-family:PT Sans;src:url(../fonts/pt-sans-italic.woff) format("woff")}@font-face{font-style:italic;font-weight:700;font-family:PT Sans;src:url(../fonts/pt-sans-bold-italic.woff) format("woff")}@font-face{font-family:PT Sans Narrow;font-weight:700;src:url(../fonts/pt-sans-narrow-bold.woff) format("woff")}@font-face{font-family:PT Mono;src:url(../fonts/pt-mono-regular.woff) format("woff")}*,::after,::before{box-sizing:border-box}a,abbr,acronym,address,applet,article,aside,audio,b,big,blockquote,body,canvas,caption,center,cite,code,dd,del,details,dfn,div,dl,dt,em,embed,fieldset,figcaption,figure,footer,form,h1,h2,h3,h4,h5,h6,header,hgroup,html,i,iframe,img,ins,kbd,label,legend,li,mark,menu,nav,object,ol,output,p,pre,q,ruby,s,samp,section,small,span,strike,strong,sub,summary,sup,table,tbody,td,tfoot,th,thead,time,tr,tt,u,ul,var,video{margin:0;padding:0;border:0;font:inherit;vertical-align:baseline}article,aside,details,figcaption,figure,footer,header,hgroup,main,menu,nav,section{display:block}.caption p,body{line-height:1}ol,ul{list-style:none}blockquote,q{quotes:none}blockquote::after,blockquote::before,q::after,q::before{content:none}table{border-collapse:collapse;border-spacing:0}a{text-decoration:none}@page{margin:0;size:1024px 768px}.shower{color:#000;counter-reset:slide;font:25px/2 PT Sans,sans-serif;-webkit-print-color-adjust:exact;-webkit-text-size-adjust:none;-moz-text-size-adjust:none;-ms-text-size-adjust:none}@media print{.shower{text-rendering:geometricPrecision}}.caption{font-size:25px;display:none;margin-top:-.2em;padding:0 1em .93em;width:100%;color:#3c3d40;text-shadow:0 1px 0 #8d8e90}@media 
(min-width:1174px){.caption{font-size:50px}}@media (min-width:2348px){.caption{font-size:100px}}.caption h1{padding-bottom:.15em;font:700 1em/1 PT Sans Narrow,sans-serif}.caption p{font-size:.6em}.caption a{color:#4b86c2;text-shadow:0 -1px 0 #1f3f60}.slide{position:relative;z-index:1;overflow:hidden;padding:106px 100px 0;width:1024px;height:768px;background:#fff;font-size:25px}.slide::after{position:absolute;top:0;right:100px;padding-top:15px;width:50px;height:100px;background:url(../images/ribbon.svg) no-repeat;color:#fff;counter-increment:slide;content:counter(slide);text-align:center}.slide h2{margin-bottom:34px;color:#585a5e;font:700 50px/1 PT Sans Narrow,sans-serif}.slide p{margin-bottom:1em}.slide p.note{color:#979a9e}.slide a{background:-webkit-linear-gradient(bottom,currentColor .09em,transparent .09em) repeat-x;background:linear-gradient(to top,currentColor .09em,transparent .09em) repeat-x;color:#4b86c2}.slide b,.slide strong{font-weight:700}.slide blockquote,.slide dfn,.slide em,.slide i{font-style:italic}.slide code,.slide kbd,.slide mark,.slide samp{padding:.1em .3em;border-radius:.2em}.slide code,.slide kbd,.slide samp{background:rgba(88,90,94,.1);line-height:1;font-family:PT Mono,monospace,monospace}.slide mark{background:#fafaa2}.slide sub,.slide sup{position:relative;line-height:0;font-size:75%}.slide sub{bottom:-.25em}.slide sup{top:-.5em}.slide blockquote::before{position:absolute;margin:-.15em 0 0 -.43em;color:#ccc;line-height:1;font-size:8em;content:'\201C'}.slide blockquote+figcaption{margin:-1em 0 1em;font-style:italic;font-weight:700}.slide ol,.slide ul{margin-bottom:1em;counter-reset:list}.slide ol li,.slide ul li{page-break-inside:avoid;text-indent:-2em}.slide ol li::before,.slide ul li::before{display:inline-block;width:2em;color:#979a9e;text-align:right}.slide ol ol,.slide ol ul,.slide ul ol,.slide ul ul{margin-bottom:0;margin-left:2em}.slide ul>li::before{padding-right:.5em;content:'•'}.slide ul>li:lang(ru)::before{content:'—'}.slide 
ol>li::before{padding-right:.4em;counter-increment:list;content:counter(list) "."}.slide table{margin-left:-100px;margin-bottom:1em;width:calc(100% + 100px + 100px)}.slide table td:first-child,.slide table th:first-child{padding-left:96px}.slide table td:last-child,.slide table th:last-child{padding-right:96px}.slide table th{text-align:left;font-weight:700}.slide table tr:not(:last-of-type)>*{background:-webkit-linear-gradient(bottom,rgba(88,90,94,.5) .055em,transparent .055em) repeat-x;background:linear-gradient(to top,rgba(88,90,94,.5) .055em,transparent .055em) repeat-x}.slide table.striped tr:nth-child(even){background:rgba(88,90,94,.1)}.slide table.striped tr>*{background-image:none}.slide pre{margin-bottom:1em;counter-reset:code;white-space:normal}.slide pre code{display:block;margin-left:-100px;padding:0 0 0 100px;width:calc(100% + 100px + 100px);border-radius:0;background:0 0;line-height:2;white-space:pre;-moz-tab-size:4;-o-tab-size:4;tab-size:4}.slide pre code:not(:only-child).mark{background:rgba(88,90,94,.1)}.slide pre code:not(:only-child)::before{position:absolute;margin-left:-2em;color:#979a9e;counter-increment:code;content:counter(code,decimal-leading-zero) "."}.slide pre mark{position:relative;z-index:-1;margin:0 -.3em}.slide pre mark.important{background:#c00;color:#fff}.slide pre .comment{color:#999}.slide footer{position:absolute;right:0;bottom:-768px;left:0;display:none;padding:41px 100px 8px;background:#fbfbba;box-shadow:0 1px 0 #fafaa2 inset;-webkit-transition:bottom .3s;transition:bottom .3s}.slide footer mark{background:rgba(255,255,255,.8)}.slide:hover>footer{bottom:0}.slide.grid{background-image:url(../images/grid.png);-ms-interpolation-mode:nearest-neighbor;image-rendering:-webkit-optimize-contrast;image-rendering:-moz-crisp-edges;image-rendering:pixelated}@media (-webkit-min-device-pixel-ratio:2),(min-resolution:2dppx){.slide.grid{background-image:url(../images/grid@2x.png);background-size:1024px 
auto}}.slide.black{background-color:#000}.slide.black::after,.slide.white::after{visibility:hidden}.slide.white{background-color:#fff}.slide .double,.slide .triple{-webkit-column-gap:75px;-moz-column-gap:75px;column-gap:75px;-webkit-hyphens:auto;-ms-hyphens:auto;hyphens:auto}.slide .double{-webkit-column-count:2;-moz-column-count:2;column-count:2}.slide .triple{-webkit-column-count:3;-moz-column-count:3;column-count:3}.slide .shout{position:absolute;top:50%;left:0;width:100%;text-align:center;line-height:1;font-size:150px;-webkit-transform:translateY(-50%);transform:translateY(-50%)}.slide .shout a{background:-webkit-linear-gradient(bottom,currentColor .11em,transparent .11em) repeat-x;background:linear-gradient(to top,currentColor .11em,transparent .11em) repeat-x}.slide .cover{z-index:-1;max-width:100%;max-height:100%}.slide .cover.w,.slide .cover.width{width:100%;max-height:none}.slide .cover.h,.slide .cover.height{height:100%;max-width:none}.slide .cover+figcaption{position:absolute;bottom:20px;right:10px;font-size:12px;opacity:.7;-webkit-transform-origin:0 100%;transform-origin:0 100%;-webkit-transform:translateX(100%) rotate(-90deg);transform:translateX(100%) rotate(-90deg)}.slide .cover+figcaption.white{color:#fff}.slide .cover+figcaption a{color:currentcolor}.slide .cover,.slide .place{position:absolute;top:50%;left:50%;-webkit-transform:translate(-50%,-50%);transform:translate(-50%,-50%)}.slide .place.b.l,.slide .place.b.r,.slide .place.bottom.left,.slide .place.bottom.right,.slide .place.t.l,.slide .place.t.r,.slide .place.top.left,.slide .place.top.right{-webkit-transform:none;transform:none}.slide .place.b,.slide .place.bottom,.slide .place.t,.slide .place.top{-webkit-transform:translate(-50%,0);transform:translate(-50%,0)}.slide .place.l,.slide .place.left,.slide .place.r,.slide .place.right{-webkit-transform:translate(0,-50%);transform:translate(0,-50%)}.slide .place.t,.slide .place.t.r,.slide .place.top,.slide .place.top.left,.slide 
.place.top.right{top:0}.slide .place.r,.slide .place.right{right:0;left:auto}.slide .place.b,.slide .place.b.l,.slide .place.b.r,.slide .place.bottom,.slide .place.bottom.left,.slide .place.bottom.right{top:auto;bottom:0}.slide .place.l,.slide .place.left{left:0}.progress{left:-20px;bottom:0;z-index:1;display:none;width:0;height:0;box-sizing:content-box;border:10px solid #4b86c2;border-right-color:transparent;-webkit-transition:width .2s linear;transition:width .2s linear;clip:rect(10px,1044px,20px,20px)}.progress[style*='100%']{padding-left:10px}.badge,.badge a,.progress{position:absolute}.badge{font-size:10px;top:0;z-index:1;overflow:hidden;display:none;width:9em;height:9em;right:0;visibility:hidden}@media (min-width:1174px){.badge{font-size:20px}}@media (min-width:2348px){.badge{font-size:40px}}.badge a{right:-50%;bottom:50%;left:-50%;visibility:visible;background:#4b86c2;color:#fff;text-align:center;line-height:2;-webkit-transform-origin:50% 100%;transform-origin:50% 100%;-webkit-transform:rotate(45deg);transform:rotate(45deg)}.region{display:none}@media screen{.shower.list{padding-top:25px;width:100%;display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-flex-wrap:wrap;-ms-flex-wrap:wrap;flex-wrap:wrap;background:#585a5e;position:absolute;clip:rect(0,auto,auto,0)}}@media screen and (min-width:1174px){.shower.list{padding-top:50px}}@media screen and (min-width:2348px){.shower.list{padding-top:100px}}@media screen{.shower.list .caption{display:block}.shower.list .slide{-webkit-transform-origin:0 0;transform-origin:0 0;margin:0 -768px -551px 25px;-webkit-transform:scale(.25);transform:scale(.25);border-radius:2px;box-shadow:0 20px 50px rgba(0,0,0,.3)}}@media screen and (min-width:1174px){.shower.list .slide{margin:0 -512px -334px 50px;-webkit-transform:scale(.5);transform:scale(.5)}}@media screen and (min-width:2348px){.shower.list .slide{margin:0 0 100px 100px;-webkit-transform:scale(1);transform:scale(1)}}@media screen{.shower.list 
.slide:hover{box-shadow:0 0 0 20px rgba(0,0,0,.1),0 20px 50px rgba(0,0,0,.3)}.shower.list .slide:target{box-shadow:0 0 0 1px #376da3,0 0 0 20px #4b86c2,0 20px 50px rgba(0,0,0,.3)}.shower.list .slide *{pointer-events:none}.shower.list .badge,.shower.list .slide footer{display:block}.shower.full{position:absolute;top:50%;left:50%;overflow:hidden;margin:-384px 0 0 -512px;width:1024px;height:768px;background:#000}.shower.full .slide{position:absolute;top:0;left:0;margin-left:-150%;visibility:hidden}.shower.full .slide:target{margin:0;visibility:visible}.shower.full .slide pre code:not(:only-child).mark.next{visibility:visible;background:0 0}.shower.full .slide pre code:not(:only-child).mark.next.active{background:rgba(88,90,94,.1)}.shower.full .slide .next{visibility:hidden}.shower.full .slide .next.active{visibility:visible}.shower.full .slide .shout.grow,.shower.full .slide .shout.shrink{opacity:0;-webkit-transition:.4s ease-out;transition:.4s ease-out;-webkit-transition-property:opacity,-webkit-transform;transition-property:opacity,transform;transition-property:opacity,transform,-webkit-transform}.shower.full .slide .shout.grow{-webkit-transform:scale(.1) translateY(-50%);transform:scale(.1) translateY(-50%)}.shower.full .slide .shout.shrink{-webkit-transform:scale(10) translateY(-50%);transform:scale(10) translateY(-50%)}.shower.full .slide:target .shout.grow,.shower.full .slide:target .shout.shrink{opacity:1;-webkit-transform:scale(1) translateY(-50%);transform:scale(1) translateY(-50%)}.shower.full .progress{display:block;-webkit-transform:translateZ(0);transform:translateZ(0)}.shower.full .region{position:absolute;clip:rect(0 0 0 0);overflow:hidden;margin:-1px;padding:0;width:1px;height:1px;border:none;display:block}} \ No newline at end of file diff --git a/doc/presentations/tbd/LICENSE.md b/doc/presentations/tbd/LICENSE.md deleted file mode 100644 index bd3449d3576..00000000000 --- a/doc/presentations/tbd/LICENSE.md +++ /dev/null @@ -1,21 +0,0 @@ -# The MIT 
License - -Copyright © 2010–2015 Vadim Makeev, http://pepelsbey.net/ - -Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - ---- - -# Лицензия MIT - -Copyright © 2010–2015 Вадим Макеев, http://pepelsbey.net/ - -Данная лицензия разрешает лицам, получившим копию данного программного обеспечения и сопутствующей документации (в дальнейшем именуемыми «Программное Обеспечение»), безвозмездно использовать Программное Обеспечение без ограничений, включая неограниченное право на использование, копирование, изменение, добавление, публикацию, распространение, сублицензирование и/или продажу копий Программного Обеспечения, также как и лицам, которым предоставляется данное Программное Обеспечение, при соблюдении следующих условий: - -Указанное выше уведомление об авторском праве и данные условия должны быть включены во все копии или значимые части данного Программного Обеспечения. 
- -ДАННОЕ ПРОГРАММНОЕ ОБЕСПЕЧЕНИЕ ПРЕДОСТАВЛЯЕТСЯ «КАК ЕСТЬ», БЕЗ КАКИХ-ЛИБО ГАРАНТИЙ, ЯВНО ВЫРАЖЕННЫХ ИЛИ ПОДРАЗУМЕВАЕМЫХ, ВКЛЮЧАЯ, НО НЕ ОГРАНИЧИВАЯСЬ ГАРАНТИЯМИ ТОВАРНОЙ ПРИГОДНОСТИ, СООТВЕТСТВИЯ ПО ЕГО КОНКРЕТНОМУ НАЗНАЧЕНИЮ И ОТСУТСТВИЯ НАРУШЕНИЙ ПРАВ. НИ В КАКОМ СЛУЧАЕ АВТОРЫ ИЛИ ПРАВООБЛАДАТЕЛИ НЕ НЕСУТ ОТВЕТСТВЕННОСТИ ПО ИСКАМ О ВОЗМЕЩЕНИИ УЩЕРБА, УБЫТКОВ ИЛИ ДРУГИХ ТРЕБОВАНИЙ ПО ДЕЙСТВУЮЩИМ КОНТРАКТАМ, ДЕЛИКТАМ ИЛИ ИНОМУ, ВОЗНИКШИМ ИЗ, ИМЕЮЩИМ ПРИЧИНОЙ ИЛИ СВЯЗАННЫМ С ПРОГРАММНЫМ ОБЕСПЕЧЕНИЕМ ИЛИ ИСПОЛЬЗОВАНИЕМ ПРОГРАММНОГО ОБЕСПЕЧЕНИЯ ИЛИ ИНЫМИ ДЕЙСТВИЯМИ С ПРОГРАММНЫМ ОБЕСПЕЧЕНИЕМ. diff --git a/doc/presentations/tbd/index.html b/doc/presentations/tbd/index.html deleted file mode 100644 index 0aaa3d7f18f..00000000000 --- a/doc/presentations/tbd/index.html +++ /dev/null @@ -1,220 +0,0 @@ - - - - Немного про ClickHouse - - - - - - -
-

Немного про ClickHouse

-
-
-

Немного про ClickHouse

-
-
-

Обо мне

-

Алексей, разработчик ClickHouse.

-

С 2008 занимался движком обработки данных в Яндекс.Метрике.

-
-
-

История

-

Яндекс.Метрика — сервис веб-аналитики.

-

В России первый, в мире — второй.

-

-

Ежедневно приходит ~25 млрд. событий.

-

Надо показывать отчёты в реальном времени.

-
-
-

Старая Метрика (2008–2014)

-

Всё отлично работало. Пользователь мог получить примерно 50 разных отчётов.

-

Но есть проблема. Нам хочется большего. Чтобы каждый отчёт был сколь угодно кастомизируемым.

-
-
-

Конструктор отчётов

-

Быстро сделали прототип и на его основе реализовали "Конструктор отчётов".

-

Это 2010 год.

-

Стало понятно, куда двигаться дальше.

-

Нам нужна хорошая column-oriented DBMS.

-
-
-

Почему column-oriented?

-

Так работают row-oriented системы:

-

-
-
-

Почему column-oriented?

-

Так работают column-oriented системы:

-

-
-
-

Почему ClickHouse?

-

Ничего готового не подошло.

-

Тогда мы сделали ClickHouse.

-

«Эволюция структур данных в Яндекс.Метрике»

-

https://habrahabr.ru/company/yandex/blog/273305/

-
-
-

Метрика 2.0

- -
-
-

Коротко

-
    -
  • column-oriented
  • -
  • линейная масштабируемость
  • -
  • отказоустойчивость
  • -
  • загрузка данных в реальном времени
  • -
  • онлайн (sub-second) запросы
  • -
  • поддержка диалекта SQL + расширения
    (массивы, вложенные структуры данных, domain-specific функции, сэмплирование)
  • -
-
-
-

Основной кластер Метрики

-
    -
  • 18.3 триллионов строк
  • -
  • 426 серверов
  • -
  • скорость обработки данных до двух терабайт в секунду
  • -
-

* Если вы хотите попробовать ClickHouse, достаточно и одного сервера.

-
-
-

ClickHouse в Яндексе

-

Нам удалось сделать систему сравнительно удобной.

-

С самого начала мы имели подробную документацию.

-

В течение пары лет ClickHouse распространился по другим отделам Яндекса.

-

Почта, Маркет, Директ, Вебмастер, AdFox, Инфраструктура, Бизнес аналитика...

-

Есть случаи, когда аналитики самостоятельно устанавливали ClickHouse на виртуальные машины и успешно использовали без каких-либо вопросов.

-
-
-

Open-source

-

Потом мы решили — ClickHouse слишком хорошая система, чтобы нам одним на нём сидеть.

-

Чтобы было веселее, надо подсадить на ClickHouse людей снаружи, пусть радуются. Решили сделать open-source.

-
-
-

Open-source

-

Лицензия Apache 2.0 — минимум ограничений.

-

Цель — максимальное распространение продукта.

-

Мы хотим, чтобы продуктом Яндекса пользовались по всему миру.

-

См. “Яндекс открывает ClickHouse”

-

https://habrahabr.ru/company/yandex/blog/303282/

-
-
-

Когда надо использовать ClickHouse

-

Хорошо структурированные, очищенные, неизменяемые события.

-

 

-

Click stream. Веб-аналитика. Рекламные сети. RTB. E-commerce.

-

Аналитика онлайн игр. Данные сенсоров и мониторингов. Телеком данные.

-

Финансовые транзакции. Биржевая аналитика.

-
-
-

Когда не надо использовать ClickHouse

-

OLTP
В ClickHouse нет UPDATE и полноценных транзакций.

-

Key-Value
Если нужны частые запросы на обновление по ключу, используйте другое решение.

-

Blob-store, document oriented
ClickHouse предназначен для большого количества мелко-гранулированных данных.

-

Излишне нормализованные данные
Лучше сделать широкую таблицу фактов.

-
-
-

Почему ClickHouse такой быстрый?

-

 

-

— от безысходности.

-

Яндекс.Метрика должна работать.

-
-
-

Почему ClickHouse такой быстрый?

-

Алгоритмическая оптимизация.

-

MergeTree, локальность расположения данных на диске
— быстрые диапазонные запросы.

-

Пример: функция uniqCombined состоит из комбинации трёх различных структур данных, подходящих под разные диапазоны кардинальностей.

-

Низкоуровневая оптимизация.

-

Пример: vectorized query execution.

-

Специализация и внимание к деталям.

-

Пример: у нас есть 17 разных алгоритмов выполнения GROUP BY. Для вашего запроса выбирается лучший.

-
- -
-

ClickHouse vs. typical row-oriented DBMS

-

Itai Shirav:

«I haven't made a rigorous comparison, but I did convert a time-series table with 9 million rows from Postgres to ClickHouse.

-

Under ClickHouse queries run about 100 times faster, and the table takes 20 times less disk space. Which is pretty amazing if you ask me».

-
-
-

 

-

Bao Dang:

«Obviously, ClickHouse outperformed PostgreSQL at any metric».

-

https://github.com/AnalyticsGo/AnalyticsGo/issues/1

-
-
-

ClickHouse vs. Vertica

-

Timur Shenkao:

«ClickHouse is extremely fast at simple SELECTs without joins, much faster than Vertica».

-
-
-

ClickHouse vs. PrestoDB

-

Ömer Osman Koçak:

- «When we evaluated ClickHouse the results were great compared to Prestodb. Even though the columnar storage optimizations for ORC and Clickhouse is quite similar, Clickhouse uses CPU and Memory resources more efficiently (Presto also uses vectorized execution but cannot take advantage of hardware level optimizations such as SIMD instruction sets because it's written in Java so that's fair) so we also wanted to add support for Clickhouse for our open-source analytics platform Rakam (https://github.com/rakam-io/rakam)»

-
-
-

ClickHouse vs. Spark

-

«Я потестировал Clickhouse, по скорости просто отлично = намного быстрее spark на одной машине (у меня получилось порядка 3x, но еще буду сравнивать). Кроме того compression получается лучше».

-
-
-

ClickHouse vs. Google BigQuery

-

«ClickHouse показывает сравнимую скорость на таком запросе за 30 дней и в 8 раз быстрее (!) на таком запросе. В планах есть протестировать и другие запросы, еще не добрались.

Скорость выполнения запросов стабильна. В Google BigQuery в период пиковых нагрузок, например в 4:00 p.m. PDT или в начале месяца, время выполнения запросов может заметно увеличиваться».

-
-
-

ClickHouse vs. Druid

-

«В этом году мы развернули сборку на основе Druid — Imply Analytics Platform, а также Tranquility, и уже приготовились запускать в продакшн… Но после выхода ClickHouse сразу отказались от Druid, хотя потратили два месяца на его изучение и внедрение».

-

https://habrahabr.ru/company/smi2/blog/314558/

-
-
-

ClickHouse vs. InfiniDB

-

«结论:clickhouse速度更快!»

-

«In conclusion, ClickHouse is faster!»

-

http://verynull.com/2016/08/22/infinidb与clickhouse对比/

-

-
-
-

ClickHouse for sensor data

-

-
-
-

ClickHouse vs. Greenplum

-

-

На самом деле всё сложнее.

-
- -
-

Подключение к ClickHouse

-

HTTP REST

-

clickhouse-client

-

JDBC

-

 

-

Python, PHP, Go, Perl, Ruby, Node.JS, R, .NET

-

 

-

Web UI: https://github.com/smi2/clickhouse-frontend

-
-
-

Сообщество

-

Официальный сайт: https://clickhouse.yandex/

-

Google группа: https://groups.google.com/forum/#!forum/clickhouse

-

Рассылка: clickhouse-feedback@yandex-team.com

-

Чат в Telegram: https://telegram.me/clickhouse_ru

-

GitHub: https://github.com/yandex/ClickHouse/

-

 

-

+ митапы. Следите за анонсами.

-
-
-

 

-

Как запустить ClickHouse своими силами
и выиграть джекпот:

-

https://habrahabr.ru/company/smi2/blog/314558/

-
- -
-

 

-

Начните использовать ClickHouse сегодня!

-

Спасибо. Задавайте вопросы.

-
- -
- - - diff --git a/doc/presentations/tbd/index_en.html b/doc/presentations/tbd/index_en.html deleted file mode 100644 index 4e86af5a5ae..00000000000 --- a/doc/presentations/tbd/index_en.html +++ /dev/null @@ -1,223 +0,0 @@ - - - - Briefly about ClickHouse - - - - - - -
-

Briefly about ClickHouse

-
-
-

Briefly about ClickHouse

-
-
-

About me

-

Alexey, developer of ClickHouse.

-

I work on data processing engine of Yandex.Metrica since 2008.

-
-
-

The history

-

Yandex.Metrica (https://metrica.yandex.com/) is a service for web analytics.

-

Largest in Russia, second largest in the world (just after Google Analytics).

-

-

We are processing about ~25 billions of events (page views, conversions, etc).

-

We must generate and show reports in realtime.

-
-
-

The old Metrica (RIP 2008–2014)

-

Everything was working fine. User could show about 50 different reports.

-

But there was a problem. We want more than just 50 pre-defined reports. We need to make every report infinitely customizable. The user must be able to slice and dice, and drill down every report from summary up to show single visitors.

-
-
-

The report builder

-

We had quickly made a prototype of so-called "report builder".

-

This was 2010 year. It was just simple specialized column-oriented data structure.

-

It worked fine and we got understanding, what the right direction to go.

-

We need good column-oriented DBMS.

-
-
-

Why column-oriented?

-

This is how "traditional" row-oriented databases work:

-

-
-
-

Why column-oriented?

-

And this is how column-oriented databases work:

-

-
-
-

Why ClickHouse?

-

In 2011 there was nothing suitable in the marked. In fact there is nothing comparable even now.

-

Then we developed ClickHouse.

-

See nice article «Evolution of data structures in Yandex.Metrica»

-

https://habrahabr.ru/company/yandex/blog/273305/

-

The article is in russian. Use machine translation. Also there is third-party translation to chinese, baidu for it.

-
-
-

The Metrica 2.0

- -
-
-

Briefly

-
    -
  • column-oriented
  • -
  • linearly scalable
  • -
  • fault-tolerant
  • -
  • data ingestion in realtime
  • -
  • realtime (sub-second) queries
  • -
  • support of SQL dialect + extensions
    (arrays, nested data types, domain-specific functions, approximate query execution)
  • -
-
-
-

The main cluster of Yandex.Metrica

-
    -
  • 18.3 trillions of rows (as of Nov 2016)
  • -
  • 426 servers
  • -
  • total throughput of query processing is up to two terabytes per second
  • -
-

* If you want to try ClickHouse, one server or VM is enough.

-
-
-

ClickHouse in Yandex

-

Surprisingly, ClickHouse appears to be rather convenient and handy for usage.

-

We have descriptive documentation from the beginning.

-

In about two years, many other departments in Yandex had started to use ClickHouse in production.

-

Yandex.Mail, Comparison shopping, Ads, Webmaster tools, Infrastructure monitoring, Business analytics, etc...

-

There was even cases, when single analysts install ClickHouse on their VMs and started to use it without any questions.

-
-
-

Open-source

-

Then we decided — ClickHouse is just too good to be used solely by Yandex.

-

To just have more fun, we need to make more companies and people around the world using ClickHouse, to let them be happy. We decided to be open-source.

-
-
-

Open-source

-

Apache 2.0 licence — very unrestrictive.

-

The goal — maximum widespread of ClickHouse.

-

We want for product by Yandex to be used everywhere.

-

See “Yandex open-sourced ClickHouse”

-

https://habrahabr.ru/company/yandex/blog/303282/

-

Article is also in russian, but you may just check corresponding Hacker's news thread

-
-
-

When to use ClickHouse

-

For well structured, clean, immutable events.

-

 

-

Click stream. Web analytics. Adv. networks. RTB. E-commerce.

-

Analytics for online games. Sensor and monitoring data. Telecom data.

-

Stock exchanges. Financial transactions.

-
-
-

When not to use ClickHouse

-

OLTP
ClickHouse doesn't have UPDATE statement and full-featured transactions.

-

Key-Value
If you want high load of small single-row queries, please use another system.

-

Blob-store, document oriented
ClickHouse is intended for vast amount of fine-grained data.

-

Over-normalized data
Better to make up single wide fact table with pre-joined dimensions.

-
-
-

Why ClickHouse is so fast?

-

 

-

— we just cannot make it slower.

-

Yandex.Metrica must work.

-
-
-

Why ClickHouse is so fast?

-

Algorithmic optimizations.

-

MergeTree, locality of data on disk
— fast range queries.

-

Example: uniqCombined function is a combination of three different data structures, used for different ranges of cardinalities.

-

Low-level optimizations.

-

Example: vectorized query execution.

-

Specialization and attention to detail.

-

Example: we have 17 different algorithms for GROUP BY. Best one is selected for your query.

-
- -
-

ClickHouse vs. typical row-oriented DBMS

-

Itai Shirav:

«I haven't made a rigorous comparison, but I did convert a time-series table with 9 million rows from Postgres to ClickHouse.

-

Under ClickHouse queries run about 100 times faster, and the table takes 20 times less disk space. Which is pretty amazing if you ask me».

-
-
-

 

-

Bao Dang:

«Obviously, ClickHouse outperformed PostgreSQL at any metric».

-

https://github.com/AnalyticsGo/AnalyticsGo/issues/1

-
-
-

ClickHouse vs. Vertica

-

Timur Shenkao:

«ClickHouse is extremely fast at simple SELECTs without joins, much faster than Vertica».

-
-
-

ClickHouse vs. PrestoDB

-

Ömer Osman Koçak:

- «When we evaluated ClickHouse the results were great compared to Prestodb. Even though the columnar storage optimizations for ORC and Clickhouse is quite similar, Clickhouse uses CPU and Memory resources more efficiently (Presto also uses vectorized execution but cannot take advantage of hardware level optimizations such as SIMD instruction sets because it's written in Java so that's fair) so we also wanted to add support for Clickhouse for our open-source analytics platform Rakam (https://github.com/rakam-io/rakam)»

-
-
-

ClickHouse vs. Spark

-

«Я потестировал Clickhouse, по скорости просто отлично = намного быстрее spark на одной машине (у меня получилось порядка 3x, но еще буду сравнивать). Кроме того compression получается лучше».

-
-
-

ClickHouse vs. Google BigQuery

-

«ClickHouse показывает сравнимую скорость на таком запросе за 30 дней и в 8 раз быстрее (!) на таком запросе. В планах есть протестировать и другие запросы, еще не добрались.

Скорость выполнения запросов стабильна. В Google BigQuery в период пиковых нагрузок, например в 4:00 p.m. PDT или в начале месяца, время выполнения запросов может заметно увеличиваться».

-
-
-

ClickHouse vs. Druid

-

«В этом году мы развернули сборку на основе Druid — Imply Analytics Platform, а также Tranquility, и уже приготовились запускать в продакшн… Но после выхода ClickHouse сразу отказались от Druid, хотя потратили два месяца на его изучение и внедрение».

-

https://habrahabr.ru/company/smi2/blog/314558/

-
-
-

ClickHouse vs. InfiniDB

-

«结论:clickhouse速度更快!»

-

«In conclusion, ClickHouse is faster!»

-

http://verynull.com/2016/08/22/infinidb与clickhouse对比/

-

-
-
-

ClickHouse for sensor data

-

-
-
-

ClickHouse vs. Greenplum

-

-

In fact, things are not so simple, there are many details.

-
- -
-

How to connect to ClickHouse

-

HTTP REST

-

clickhouse-client

-

JDBC

-

 

-

Python, PHP, Go, Perl, Ruby, Node.JS, R, .NET

-

 

-

Web UI: https://github.com/smi2/clickhouse-frontend

-

Redash, Zeppelin, Grafana, PowerBI - somewhat works

-
-
-

Community

-

Web site: https://clickhouse.yandex/

-

Google groups: https://groups.google.com/forum/#!forum/clickhouse

-

Maillist: clickhouse-feedback@yandex-team.com

-

Telegram chat: https://telegram.me/clickhouse_en and https://telegram.me/clickhouse_ru (now 308 members)

-

GitHub: https://github.com/yandex/ClickHouse/

-

 

-

+ meetups. Moscow, Saint-Petersburg... International meetups (Berlin, Paris) will be announced this year.

-
-
-

 

-

How to start using ClickHouse and win jackpot:

-

https://habrahabr.ru/company/smi2/blog/314558/

-
- -
-

 

-

More than 100 companies are already using ClickHouse in production. What about you? Start to use ClickHouse today!

-

Thank you. Questions.

-
- -
- - - diff --git a/doc/presentations/tbd/pictures/column_oriented.gif b/doc/presentations/tbd/pictures/column_oriented.gif deleted file mode 100644 index 15f4b12e697..00000000000 Binary files a/doc/presentations/tbd/pictures/column_oriented.gif and /dev/null differ diff --git a/doc/presentations/tbd/pictures/greenplum.png b/doc/presentations/tbd/pictures/greenplum.png deleted file mode 100644 index e919a45dadc..00000000000 Binary files a/doc/presentations/tbd/pictures/greenplum.png and /dev/null differ diff --git a/doc/presentations/tbd/pictures/infinidb_cn.png b/doc/presentations/tbd/pictures/infinidb_cn.png deleted file mode 100644 index 957c392a448..00000000000 Binary files a/doc/presentations/tbd/pictures/infinidb_cn.png and /dev/null differ diff --git a/doc/presentations/tbd/pictures/kaspersky.png b/doc/presentations/tbd/pictures/kaspersky.png deleted file mode 100644 index f8aae1da9ee..00000000000 Binary files a/doc/presentations/tbd/pictures/kaspersky.png and /dev/null differ diff --git a/doc/presentations/tbd/pictures/metrika2.png b/doc/presentations/tbd/pictures/metrika2.png deleted file mode 100644 index 3ee37e98fc6..00000000000 Binary files a/doc/presentations/tbd/pictures/metrika2.png and /dev/null differ diff --git a/doc/presentations/tbd/pictures/metrika_market_share.png b/doc/presentations/tbd/pictures/metrika_market_share.png deleted file mode 100644 index 9817998acc7..00000000000 Binary files a/doc/presentations/tbd/pictures/metrika_market_share.png and /dev/null differ diff --git a/doc/presentations/tbd/pictures/row_oriented.gif b/doc/presentations/tbd/pictures/row_oriented.gif deleted file mode 100644 index 53daa20f322..00000000000 Binary files a/doc/presentations/tbd/pictures/row_oriented.gif and /dev/null differ diff --git a/doc/presentations/tbd/shower/shower.min.js b/doc/presentations/tbd/shower/shower.min.js deleted file mode 100644 index 449843ac45d..00000000000 --- a/doc/presentations/tbd/shower/shower.min.js +++ /dev/null @@ -1,8 +0,0 @@ 
-/** - * Core for Shower HTML presentation engine - * shower-core v2.0.7, https://github.com/shower/core - * @copyright 2010–2016 Vadim Makeev, http://pepelsbey.net/ - * @license MIT - */ -!function(a){var b,c={NOT_RESOLVED:"NOT_RESOLVED",IN_RESOLVING:"IN_RESOLVING",RESOLVED:"RESOLVED"},d=function(){var l={trackCircularDependencies:!0,allowMultipleDeclarations:!0},m={},n=!1,o=[],p=function(a,d,e){e||(e=d,d=[]);var f=m[a];f||(f=m[a]={name:a,decl:b}),f.decl={name:a,prev:f.decl,fn:e,state:c.NOT_RESOLVED,deps:d,dependents:[],exports:b}},q=function(b,c,d){"string"==typeof b&&(b=[b]),n||(n=!0,k(v)),o.push({deps:b,cb:function(b,f){f?(d||e)(f):c.apply(a,b)}})},r=function(a){var b=m[a];return b?c[b.decl.state]:"NOT_DEFINED"},s=function(a){return!!m[a]},t=function(a){for(var b in a)a.hasOwnProperty(b)&&(l[b]=a[b])},u=function(){var a,b={};for(var c in m)m.hasOwnProperty(c)&&(a=m[c],(b[a.decl.state]||(b[a.decl.state]=[])).push(c));return b},v=function(){n=!1,w()},w=function(){var a,b=o,c=0;for(o=[];a=b[c++];)x(null,a.deps,[],a.cb)},x=function(a,b,c,d){var e=b.length;e||d([]);for(var g,h,i=[],j=function(a,b){if(b)return void d(null,b);if(!--e){for(var c,f=[],g=0;c=i[g++];)f.push(c.exports);d(f)}},k=0,l=e;k ")+'"')},h=function(a){return Error('Declaration of module "'+a.name+'" has already been provided')},i=function(a){return Error('Multiple declarations of module "'+a.name+'" have been detected')},j=function(a,b){for(var c,d=0;c=b[d++];)if(a===c)return!0;return!1},k=function(){var b=[],c=function(a){return 1===b.push(a)},d=function(){var a=b,c=0,d=b.length;for(b=[];c=0&&!b.defaultPrevented();){var d=a[c];d&&(d.context?d.callback.call(d.context,b):d.callback(b)),c--}}}),a(e)}),shower.modules.define("Plugins",["Emitter","util.extend"],function(a,b,c){function d(a){this.events=new 
b({context:this}),this._showerGlobal=a,this._showerInstances=a.getInited(),this._plugins={},this._instances=[],a.events.on("init",this._onShowerInit,this)}c(d.prototype,{destroy:function(){this._showerGlobal.events.off("init",this._onShowerInit,this),this._plugins=null},add:function(a,b){if(this._plugins.hasOwnProperty(a))throw new Error("Plugin "+a+" already exist.");return this._requireAndAdd({name:a,options:b}),this},remove:function(a){if(!this._plugins.hasOwnProperty(a))throw new Error("Plugin "+a+" not found.");return delete this._plugins[a],this.events.emit("remove",{name:a}),this},get:function(a,b){var c,d=this._plugins[a];if(d&&b)for(var e=0,f=this._instances.length;e=0;e--)if(d[e].getId()===a){b=d[e],c=e;break}return{slide:b,index:c}},_onSlideActivate:function(a){window.location.hash=a.get("slide").getId(),this._setTitle()},_onContainerSlideModeChange:function(){this._setTitle(),this.save()},_isSlideMode:function(){return this._shower.container.isSlideMode()},_onPopstate:function(){var a,b=this._shower,c=window.location.hash.substr(1),d=b.player.getCurrentSlide(),e=b.player.getCurrentSlideIndex();this._isSlideMode()&&e===-1?b.player.go(0):e===-1&&""!==window.location.hash&&b.player.go(0),d&&c!==d.getId()&&(a=this._getSlideById(c),b.player.go(a.index))},_setTitle:function(){var a=document.title,b=this._isSlideMode(),c=this._shower.player.getCurrentSlide();if(b&&c){var d=c.getTitle();document.title=d?d+" — "+this._documentTitle:this._documentTitle}else this._documentTitle!==a&&(document.title=this._documentTitle)}}),a(e)}),shower.modules.define("shower.Player",["Emitter","util.bound","util.extend"],function(a,b,c,d){function e(a){this.events=new 
b({context:this,parent:a.events}),this._shower=a,this._showerListeners=null,this._playerListeners=null,this._currentSlideNumber=-1,this._currentSlide=null,this.init()}d(e.prototype,{init:function(){this._showerListeners=this._shower.events.group().on("slideadd",this._onSlideAdd,this).on("slideremove",this._onSlideRemove,this).on("slidemodeenter",this._onSlideModeEnter,this),this._playerListeners=this.events.group().on("prev",this._onPrev,this).on("next",this._onNext,this),document.addEventListener("keydown",c(this,"_onKeyDown"))},destroy:function(){this._showerListeners.offAll(),this._playerListeners.offAll(),document.removeEventListener("keydown",c(this,"_onKeyDown")),this._currentSlide=null,this._currentSlideNumber=null,this._shower=null},next:function(){return this.events.emit("next"),this},prev:function(){return this.events.emit("prev"),this},first:function(){return this.go(0),this},last:function(){return this.go(this._shower.getSlidesCount()-1),this},go:function(a){"number"!=typeof a&&(a=this._shower.getSlideIndex(a));var b=this._shower.getSlidesCount(),c=this._currentSlide;return a!=this._currentSlideNumber&&a=0&&(c&&c.isActive()&&c.deactivate(),c=this._shower.get(a),this._currentSlide=c,this._currentSlideNumber=a,c.isActive()||c.activate(),this.events.emit("activate",{index:a,slide:c})),this},getCurrentSlide:function(){return this._currentSlide},getCurrentSlideIndex:function(){return this._currentSlideNumber},_onPrev:function(){this._changeSlide(this._currentSlideNumber-1)},_onNext:function(){this._changeSlide(this._currentSlideNumber+1)},_changeSlide:function(a){this.go(a)},_onSlideAdd:function(a){var b=a.get("slide");b.events.on("activate",this._onSlideActivate,this)},_onSlideRemove:function(a){var b=a.get("slide");b.events.off("activate",this._onSlideActivate,this)},_onSlideActivate:function(a){var 
b=a.get("slide"),c=this._shower.getSlideIndex(b);this.go(c)},_onKeyDown:function(a){if(this._shower.isHotkeysEnabled()&&!/^(?:button|input|select|textarea)$/i.test(a.target.tagName))switch(this.events.emit("keydown",{event:a}),a.which){case 33:case 38:case 37:case 72:case 75:if(a.altKey||a.ctrlKey||a.metaKey)return;a.preventDefault(),this.prev();break;case 34:case 40:case 39:case 76:case 74:if(a.altKey||a.ctrlKey||a.metaKey)return;a.preventDefault(),this.next();break;case 36:a.preventDefault(),this.first();break;case 35:a.preventDefault(),this.last();break;case 32:this._shower.container.isSlideMode()&&(a.shiftKey?this.prev():this.next())}},_onSlideModeEnter:function(){this._currentSlide||this.go(0)}}),a(e)}),shower.modules.define("shower.slidesParser",["Slide"],function(a,b){function c(a,c){var d=a.querySelectorAll(c);return d=Array.prototype.slice.call(d),d.map(function(a,c){var d=new b(a);return a.id||(a.id=c+1),d})}a(c)}),shower.modules.define("Slide",["shower.defaultOptions","Emitter","Options","slide.Layout","slide.layoutFactory","util.Store","util.extend"],function(a,b,c,d,e,f,g,h){function i(a,b,e){this.events=new c,this.options=new d(b),this.layout=null,this.state=new g({visited:0,index:null},e),this._content=a,this._isVisited=this.state.get("visited")>0,this._isActive=!1,this.init()}h(i.prototype,{init:function(){this.layout="string"==typeof this._content?new f.createLayout({content:this._content}):new e(this._content,this.options),this.layout.setParent(this),this._setupListeners()},destroy:function(){this._clearListeners(),this._isActive=null,this.options=null,this.layout.destroy()},activate:function(){this._isActive=!0;var a=this.state.get("visited");return this.state.set("visited",++a),this.events.emit("activate",{slide:this}),this},deactivate:function(){return this._isActive=!1,this.events.emit("deactivate",{slide:this}),this},isActive:function(){return this._isActive},isVisited:function(){return this.state.get("visited")>0},getTitle:function(){return 
this.layout.getTitle()},setTitle:function(a){return this.layout.setTitle(a),this},getId:function(){return this.layout.getElement().id},getContent:function(){return this.layout.getContent()},_setupListeners:function(){this.layoutListeners=this.layout.events.group().on("click",this._onSlideClick,this)},_clearListeners:function(){this.layoutListeners.offAll()},_onSlideClick:function(){this.activate(),this.events.emit("click",{slide:this})}}),a(i)}),shower.modules.define("slide.Layout",["Options","shower.defaultOptions","Emitter","util.bound","util.extend"],function(a,b,c,d,e,f){function g(a,e){this.options=new b({title_element_selector:c.slide_title_element_selector,active_classname:c.slide_active_classname,visited_classname:c.slide_visited_classname},e),this.events=new d,this._element=a,this._parent=null,this._parentElement=null,this.init()}f(g.prototype,{init:function(){var a=this._element.parentNode;a?this._parentElement=a:this.setParentElement(a)},destroy:function(){this.setParent(null)},setParent:function(a){this._parent!=a&&(this._clearListeners(),this._parent=a,this._parent&&this._setupListeners(),this.events.emit("parentchange",{parent:a}))},getParent:function(){return this._parent},setParentElement:function(a){a!=this._parentElement&&(this._parentElement=a,a.appendChild(this._element),this.events.emit("parentelementchange",{parentElement:a}))},getParentElement:function(){return this._parentElement},getElement:function(){return this._element},setTitle:function(a){var b=this.options.get("title_element_selector"),c=this._element.querySelector(b);c?c.innerHTML=a:(c=document.createElement(b),c.innerHTML=a,this._element.insertBefore(c,this._element.firstChild))},getTitle:function(){var a=this.options.get("title_element_selector"),b=this._element.querySelector(a);return b?b.textContent:null},getData:function(a){var b=this._element;return b.dataset?b.dataset[a]:b.getAttribute("data-"+a)},getContent:function(){return 
this._element.innerHTML},_setupListeners:function(){this._slideListeners=this._parent.events.group().on("activate",this._onSlideActivate,this).on("deactivate",this._onSlideDeactivate,this),this._element.addEventListener("click",e(this,"_onSlideClick"),!1)},_clearListeners:function(){this._slideListeners&&this._slideListeners.offAll(),this._element.removeEventListener("click",e(this,"_onSlideClick"))},_onSlideActivate:function(){this._element.classList.add(this.options.get("active_classname"))},_onSlideDeactivate:function(){var a=this._element.classList;a.remove(this.options.get("active_classname")),a.add(this.options.get("visited_classname"))},_onSlideClick:function(){this.events.emit("click")}}),a(g)}),shower.modules.define("slide.layoutFactory",["slide.Layout","util.extend"],function(a,b,c){var d={};c(d,{createLayout:function(a){a=a||{};var e=d._createElement(c({content:"",contentType:"slide"},a));return new b(e)},_createElement:function(a){var b=document.createElement("section");return b.innerHTML=a.content,b.classList.add(a.contentType),b}}),a(d)}),shower.modules.define("util.bound",function(a){function b(a,b){return a["__bound_"+b]||(a["__bound_"+b]=a[b].bind(a))}a(b)}),shower.modules.define("util.extend",function(a){function b(a){if(!a)throw new Error("util.extend: Target not found");return"undefined"==typeof Object.assign?c.apply(null,arguments):Object.assign.apply(null,arguments)}function c(a){for(var b=1,c=arguments.length;b0&&(a.preventDefault(),this.prev())},_go:function(){for(var a=0,b=this._elements.length;awindow.innerWidth/2?c.player.next():c.player.prev()),d||f.activate())},_onTouchMove:function(a){this._shower.container.isSlideMode()&&a.preventDefault()},_getSlideByElement:function(a){for(var b=this._shower.getSlides(),c=null,d=0,e=b.length;d` of your presentation. - -## PDF - -Ribbon could be exported to PDF by printing it from the list mode in Chrome or Opera browsers. 
See [printing documentation](https://github.com/shower/shower/blob/master/docs/printing-en.md) for more options. - -## Development - -If you want to adjust theme for your needs: - -1. Fork this repository and clone it to your local machine. -2. Install dependencies: `npm install`. -3. Start a local server with watcher: `npm run dev` or just `gulp` if you have it installed globally. -4. Edit your files and see changes in the opened browser. - -To take part in Ribbon development please read [contributing guidelines](CONTRIBUTING.md) first and [file an issue](https://github.com/shower/shower/issues/new) before sending any pull request. - ---- -Licensed under [MIT License](LICENSE.md). diff --git a/doc/presentations/tbd/shower/themes/ribbon/index.html b/doc/presentations/tbd/shower/themes/ribbon/index.html deleted file mode 100644 index 98850917e05..00000000000 --- a/doc/presentations/tbd/shower/themes/ribbon/index.html +++ /dev/null @@ -1,304 +0,0 @@ - - - - Ribbon theme for Shower - - - - - - -
-

Presentation Title

-

Yours Truly, Famous Inc.

-
-
-

Slide Header

-

Echo Park 8-bit sustainable umami deep v Kickstarter. DIY cliche typewriter brunch, Odd Future sriracha pickled aesthetic. Farm-to-table bespoke fingerstache, kale chips umami brunch letterpress.

-

Whatever authentic disrupt, you probably haven't heard of them direct trade mlkshk Etsy. Gluten-free roof party plaid four loko quinoa.

-

Echo Park 8-bit sustainable umami deep v Kickstarter.

-
-
-

Inline Elements

-

Retro meh brunch aesthetic Cosby sweater Shoreditch. Banksy Tumblr sriracha, flexitarian pug chia master cleanse vinyl wayfarers fanny pack bespoke Helvetica roof party. Messenger bag retro cred Portland next level. Yr stumptown Schlitz Carles deep v small batch. Hella sustainable messenger bag, leggings skateboard literally1 bicycle rights H20 mumblecore banh mi DIY VHS. Semiotics four loko street art asymmetrical. Asymmetrical <paleo> you probably haven’t heard of.

-
-

Retro meh brunch aesthetic Cosby sweater Shoreditch. Banksy Tumblr sriracha, flexitarian pug chia master cleanse vinyl wayfarers fanny pack bespoke Helvetica roof party. Messenger bag retro cred Portland next level. Yr stumptown Schlitz Carles deep v small batch. Hella sustainable messenger bag, leggings skateboard literally1 bicycle rights H20 mumblecore banh mi DIY VHS. Semiotics four loko street art asymmetrical. Asymmetrical <paleo> you probably haven’t heard of.

-
-
-
-

Quotes

-
-

Flannel bicycle rights locavore selfies skateboard. Authentic fanny pack paleo four loko bespoke. Artisan tattooed chia XOXO ennui, lomo disrupt 8-bit art party Tumblr scenester.

-
-
-
-

Post-ironic fashion axe flexitarian, Tonx narwhal messenger bag Tumblr. Portland gentrify deep v kale chips literally.

-
-
Yours Truly
-
-
-
-

Nested Lists

-
    -
  1. Literally viral vegan, ugh drinking vinegar photo booth
  2. -
  3. Wes Anderson chillwave Marfa pour-over Etsy banh mi
  4. -
  5. Ethnic polaroid lo-fi iPhone ennui -
      -
    • Yr wayfarers before they sold out Kickstarter asymmetrical
    • -
    • Irony flexitarian readymade quinoa, kogi bespoke meggings narwhal
    • -
    • Skateboard Etsy twee artisan Echo Park
    • -
    -
  6. -
  7. Tonx kitsch fingerstache readymade, retro single-origin coffee
  8. -
-
-
-

Block Lists

-
    -
  • Retro meh brunch aesthetic Cosby sweater Shoreditch. Banksy Tumblr sriracha, flexitarian pug chia master cleanse vinyl wayfarers fanny pack.
  • -
  • Messenger bag retro cred Portland next level. Yr stumptown Schlitz Carles deep v small batch. Hella sustainable messenger bag.
  • -
  • Leggings skateboard literally1 bicycle rights H20 mumblecore banh mi DIY VHS. Semiotics four loko street art asymmetrical. Asymmetrical paleo you probably haven’t heard of.
  • -
-
-
-

Latin and Cyrillic List Bullets

-
    -
  • Occupy locavore blog, mustache you probably haven't heard of them
  • -
  • Skateboard pork belly aesthetic hoodie selfies brunch
  • -
  • Food truck gluten-free disrupt Portland
  • -
-
    -
  • Helvetica narwhal drinking vinegar chillwave, post-ironic ennui
  • -
  • Cray pug paleo retro, Echo Park narwhal Wes Anderson
  • -
  • Disrupt Williamsburg fixie, shabby chic bicycle rights hashtag kogi
  • -
-
-
-

Two Columns

-

Echo Park 8-bit sustainable umami deep v Kickstarter. DIY cliche typewriter brunch, Odd Future sriracha pickled aesthetic. Farm-to-table bespoke fingerstache, kale chips umami brunch American Apparel letterpress. Whatever authentic disrupt, you probably haven't heard of them direct trade mlkshk Etsy. Gluten-free roof party plaid American Apparel four loko quinoa. Echo Park 8-bit sustainable.

-
-
-

Three Columns

-

Echo Park 8-bit sustainable umami deep v Kickstarter. DIY cliche typewriter brunch, Odd Future sriracha pickled aesthetic. Farm-to-table bespoke fingerstache, kale chips umami brunch American Apparel letterpress. Whatever authentic disrupt, you probably haven't heard of them direct trade mlkshk Etsy. Gluten-free roof party plaid American Apparel four loko quinoa. Echo Park 8-bit sustainable.

-
-
-

Simple Table

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
GentrifyTweeArtisanBanksy
MessengerMixtapeSmall batchBicycle rights
MeggingsFreeganRetro biodieselSlow-carb
VeganOccupyNormcorePut a bird on it
Next levelSelfiesSustainableOrganic
UmamiAsymmetricalKeytarCraft beer
BiodieselHaven’t heardSkateboardFarm-to-table
-
-
-

Striped Table

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
GentrifyTweeArtisanBanksy
MessengerMixtapeSmall batchBicycle rights
MeggingsFreeganRetro biodieselSlow-carb
VeganOccupyNormcorePut a bird on it
Next levelSelfiesSustainableOrganic
UmamiAsymmetricalKeytarCraft beer
BiodieselHaven’t heardSkateboardFarm-to-table
-
-
-

Plain Code Listing

-
<html lang="en">
-<head> <!--Comment-->
-    <title>Shower</title>
-    <meta charset="UTF-8">
-    <link rel="stylesheet" href="screen.css">
-    <script src="script.js"></script>
-</head>
-
-
-

Numbered Code Listing

-
-            <html lang="en">
-            <head> <!--Comment-->
-                <title>Shower</title>
-                <meta charset="UTF-8">
-                <link rel="stylesheet" href="screen.css">
-                <script src="script.js"></script>
-            </head>
-        
-
-
-

Highlighted Code Lines

-
-            <html lang="en">
-            <head> <!--Comment-->
-                <title>Shower</title>
-                <meta charset="UTF-8">
-                <link rel="stylesheet" href="screen.css">
-                <script src="script.js"></script>
-            </head>
-        
-
-
-

Hidden Code Steps

-
-            <html lang="en">
-            <head> <!--Comment-->
-                <title>Shower</title>
-                <meta charset="UTF-8">
-                <link rel="stylesheet" href="screen.css">
-                <script src="script.js"></script>
-            </head>
-        
-
-
-

Highlighted Code Steps

-
-            <html lang="en">
-            <head> <!--Comment-->
-                <title>Shower</title>
-                <meta charset="UTF-8">
-                <link rel="stylesheet" href="screen.css">
-                <script src="script.js"></script>
-            </head>
-        
-
-
-

Grid Slide

-
-
-

White Slide

-
-
-

Black Slide

-
-
-

Shout

-
-
-

Multiline
Shout

-
-
-

Linked Shout

-
-
-

Growing Shout

-
-
-

Shrinking Shout

-
-
-
- -
Copyright © 2016 Yours Truly, Famous Inc.
-
-
-
- -
-
- -
-
- - - - - - - - - -
-
-

Timer

-
-
-

List Navigation

-
    -
  1. Ennui keffiyeh thundercats
  2. - - - - -
-

Before they sold out master

-
-
- - - - - diff --git a/doc/presentations/tbd/shower/themes/ribbon/pictures/canvas.png b/doc/presentations/tbd/shower/themes/ribbon/pictures/canvas.png deleted file mode 100644 index 6ddd30154f2..00000000000 Binary files a/doc/presentations/tbd/shower/themes/ribbon/pictures/canvas.png and /dev/null differ diff --git a/doc/presentations/tbd/shower/themes/ribbon/pictures/exact.png b/doc/presentations/tbd/shower/themes/ribbon/pictures/exact.png deleted file mode 100644 index b27251c57cb..00000000000 Binary files a/doc/presentations/tbd/shower/themes/ribbon/pictures/exact.png and /dev/null differ diff --git a/doc/presentations/tbd/shower/themes/ribbon/pictures/square.png b/doc/presentations/tbd/shower/themes/ribbon/pictures/square.png deleted file mode 100644 index 62cb2384a5f..00000000000 Binary files a/doc/presentations/tbd/shower/themes/ribbon/pictures/square.png and /dev/null differ diff --git a/doc/presentations/tbd/shower/themes/ribbon/pictures/tall.png b/doc/presentations/tbd/shower/themes/ribbon/pictures/tall.png deleted file mode 100644 index fbc9f09a2ab..00000000000 Binary files a/doc/presentations/tbd/shower/themes/ribbon/pictures/tall.png and /dev/null differ diff --git a/doc/presentations/tbd/shower/themes/ribbon/pictures/wide.png b/doc/presentations/tbd/shower/themes/ribbon/pictures/wide.png deleted file mode 100644 index 1e83b0ac7ad..00000000000 Binary files a/doc/presentations/tbd/shower/themes/ribbon/pictures/wide.png and /dev/null differ diff --git a/doc/presentations/tbd/shower/themes/ribbon/styles/presentation_links.html b/doc/presentations/tbd/shower/themes/ribbon/styles/presentation_links.html deleted file mode 100644 index 74981d8eac5..00000000000 --- a/doc/presentations/tbd/shower/themes/ribbon/styles/presentation_links.html +++ /dev/null @@ -1,2 +0,0 @@ -http://www.slideshare.net/AlexeyMilovidov1/clickhouse-69616890/AlexeyMilovidov1/clickhouse-69616890 -file:///home/milovidov/work/Presentation/shower/index.html#cover diff --git 
a/doc/presentations/tbd/shower/themes/ribbon/styles/screen-16x10.css b/doc/presentations/tbd/shower/themes/ribbon/styles/screen-16x10.css deleted file mode 100644 index 5ea77cc9961..00000000000 --- a/doc/presentations/tbd/shower/themes/ribbon/styles/screen-16x10.css +++ /dev/null @@ -1,204 +0,0 @@ -/** - * Ribbon theme for Shower HTML presentation engine - * shower-ribbon v2.0.8, https://github.com/shower/ribbon - * @copyright 2010–2016 Vadim Makeev, http://pepelsbey.net/ - * @license MIT - */ -@charset "UTF-8"; - - @font-face { - font-family: 'Yandex Sans Text Web'; - src: url(https://yastatic.net/adv-www/_/yy5JveR58JFkc97waf-xp0i6_jM.eot); - src: url(https://yastatic.net/adv-www/_/yy5JveR58JFkc97waf-xp0i6_jM.eot?#iefix) format('embedded-opentype'), - url(https://yastatic.net/adv-www/_/CYblzLEXzCqQIvrYs7QKQe2omRk.woff2) format('woff2'), - url(https://yastatic.net/adv-www/_/pUcnOdRwl83MvPPzrNomhyletnA.woff) format('woff'), - url(https://yastatic.net/adv-www/_/vNFEmXOcGYKJ4AAidUprHWoXrLU.ttf) format('truetype'), - url(https://yastatic.net/adv-www/_/0w7OcWZM_QLP8x-LQUXFOgXO6dE.svg#YandexSansTextWeb-Bold) format('svg'); - font-weight: 700; - font-style: normal; - font-stretch: normal - } - - @font-face { - font-family: 'Yandex Sans Text Web'; - src: url(https://yastatic.net/adv-www/_/LI6l3L2RqcgxBe2pXmuUha37czQ.eot); - src: url(https://yastatic.net/adv-www/_/LI6l3L2RqcgxBe2pXmuUha37czQ.eot?#iefix) format('embedded-opentype'), - url(https://yastatic.net/adv-www/_/z3MYElcut0R2MF_Iw1RDNrstgYs.woff2) format('woff2'), - url(https://yastatic.net/adv-www/_/1jvKJ_-hCXl3s7gmFl-y_-UHTaI.woff) format('woff'), - url(https://yastatic.net/adv-www/_/9nzjfpCR2QHvK1EzHpDEIoVFGuY.ttf) format('truetype'), - url(https://yastatic.net/adv-www/_/gwyBTpxSwkFCF1looxqs6JokKls.svg#YandexSansTextWeb-Regular) format('svg'); - font-weight: 400; - font-style: normal; - font-stretch: normal - } - - @font-face { - font-family: 'Yandex Sans Text Web'; - src: 
url(https://yastatic.net/adv-www/_/ayAFYoY8swgBLhq_I56tKj2JftU.eot); - src: url(https://yastatic.net/adv-www/_/ayAFYoY8swgBLhq_I56tKj2JftU.eot?#iefix) format('embedded-opentype'), - url(https://yastatic.net/adv-www/_/lGQcYklLVV0hyvz1HFmFsUTj8_0.woff2) format('woff2'), - url(https://yastatic.net/adv-www/_/f0AAJ9GJ4iiwEmhG-7PWMHk6vUY.woff) format('woff'), - url(https://yastatic.net/adv-www/_/4UDe4nlVvgEJ-VmLWNVq3SxCsA.ttf) format('truetype'), - url(https://yastatic.net/adv-www/_/EKLr1STNokPqxLAQa_RyN82pL98.svg#YandexSansTextWeb-Light) format('svg'); - font-weight: 300; - font-style: normal; - font-stretch: normal - } - - @font-face { - font-family: 'Yandex Sans Display Web'; - src: url(https://yastatic.net/adv-www/_/H63jN0veW07XQUIA2317lr9UIm8.eot); - src: url(https://yastatic.net/adv-www/_/H63jN0veW07XQUIA2317lr9UIm8.eot?#iefix) format('embedded-opentype'), - url(https://yastatic.net/adv-www/_/sUYVCPUAQE7ExrvMS7FoISoO83s.woff2) format('woff2'), - url(https://yastatic.net/adv-www/_/v2Sve_obH3rKm6rKrtSQpf-eB7U.woff) format('woff'), - url(https://yastatic.net/adv-www/_/PzD8hWLMunow5i3RfJ6WQJAL7aI.ttf) format('truetype'), - url(https://yastatic.net/adv-www/_/lF_KG5g4tpQNlYIgA0e77fBSZ5s.svg#YandexSansDisplayWeb-Regular) format('svg'); - font-weight: 400; - font-style: normal; - font-stretch: normal - } - - @font-face { - font-family: 'Yandex Sans Display Web'; - src: url(https://yastatic.net/adv-www/_/g8_MyyKVquSZ3xEL6tarK__V9Vw.eot); - src: url(https://yastatic.net/adv-www/_/g8_MyyKVquSZ3xEL6tarK__V9Vw.eot?#iefix) format('embedded-opentype'), - url(https://yastatic.net/adv-www/_/LGiRvlfqQHlWR9YKLhsw5e7KGNA.woff2) format('woff2'), - url(https://yastatic.net/adv-www/_/40vXwNl4eYYMgteIVgLP49dwmfc.woff) format('woff'), - url(https://yastatic.net/adv-www/_/X6zG5x_wO8-AtwJ-vDLJcKC5228.ttf) format('truetype'), - url(https://yastatic.net/adv-www/_/ZKhaR0m08c8CRRL77GtFKoHcLYA.svg#YandexSansDisplayWeb-Light) format('svg'); - font-weight: 300; - font-style: normal; - font-stretch: 
normal - } - -*,::after,::before{box-sizing:border-box} -a,abbr,acronym,address,applet,article,aside,audio,b,big,blockquote,body,canvas,caption,center,cite,code,dd,del,details,dfn,div,dl,dt,em,embed,fieldset,figcaption,figure,footer,form,h1,h2,h3,h4,h5,h6,header,hgroup,html,i,iframe,img,ins,kbd,label,legend,li,mark,menu,nav,object,ol,output,p,pre,q,ruby,s,samp,section,small,span,strike,strong,sub,summary,sup,table,tbody,td,tfoot,th,thead,time,tr,tt,u,ul,var,video{margin:0;padding:0;border:0;font:inherit;vertical-align:baseline} -article,aside,details,figcaption,figure,footer,header,hgroup,main,menu,nav,section{display:block} -.caption p,body{line-height:1} -p {line-height: 1} -ol,ul{list-style:none} -blockquote,q{quotes:none} -blockquote::after,blockquote::before,q::after,q::before{content:none} -table{border-collapse:collapse;border-spacing:0} -a{text-decoration:none} -@page{margin:0;size:1024px 640px} -.shower{color:#000;counter-reset:slide;font:25px/2 Yandex Sans Display Web,sans-serif;-webkit-print-color-adjust:exact;-webkit-text-size-adjust:none;-moz-text-size-adjust:none;-ms-text-size-adjust:none} -@media print{.shower{text-rendering:geometricPrecision} -} -.caption{font-size:25px;display:none;margin-top:-.2em;padding:0 1em .93em;width:100%;color:#3c3d40;text-shadow:0 1px 0 #8d8e90} -@media (min-width:1174px){.caption{font-size:50px} -} -@media (min-width:2348px){.caption{font-size:100px} -} -.caption h1{padding-bottom:.15em;font:1em/2 Yandex Sans Display Web,sans-serif} -.caption p{font-size:.6em} -.caption a{color:#4b86c2;text-shadow:0 -1px 0 #1f3f60} -.slide{position:relative;z-index:1;overflow:hidden;padding:20px 100px 0;width:1024px;height:640px;background:#fff;font-size:25px} - -/*.slide::after{position:absolute;top:0;right:100px;padding-top:15px;width:50px;height:100px;background:url(../images/ribbon.svg) no-repeat;color:#fff;counter-increment:slide;content:counter(slide);text-align:center}*/ - -.slide h1{vertical-align:middle; color:#000;font:400 
50px/2 Yandex Sans Display Web,sans-serif} -.slide h2{margin-bottom:34px;color:#000;font:400 50px/2 Yandex Sans Display Web,sans-serif} -.slide p{margin-bottom:1em} -.slide p.note{color:#979a9e} -.slide a{background:-webkit-linear-gradient(bottom,currentColor .09em,transparent .09em) repeat-x;background:linear-gradient(to top,currentColor .09em,transparent .09em) repeat-x;color:#4b86c2} -.slide b,.slide strong{font-weight:700} -.slide blockquote,.slide dfn,.slide em,.slide i{font-style:italic} -.slide code,.slide kbd,.slide mark,.slide samp{padding:.1em .3em;border-radius:.2em} -.slide code,.slide kbd,.slide samp{background:rgba(88,90,94,.1);line-height:1;font-family:PT Mono,monospace,monospace} -.slide mark{background:#fafaa2} -.slide sub,.slide sup{position:relative;line-height:0;font-size:75%} -.slide sub{bottom:-.25em} -.slide sup{top:-.5em} -.slide blockquote::before{position:absolute;margin:-.15em 0 0 -.43em;color:#ccc;line-height:1;font-size:8em;content:'\201C'} -.slide blockquote+figcaption{margin:-1em 0 1em;font-style:italic;font-weight:700} -.slide ol,.slide ul{margin-bottom:0em;counter-reset:list} -.slide ol li,.slide ul li{page-break-inside:avoid;text-indent:-2em} -.slide ol li::before,.slide ul li::before{display:inline-block;width:2em;color:#979a9e;text-align:right} -.slide ol ol,.slide ol ul,.slide ul ol,.slide ul ul{margin-bottom:0;margin-left:2em} -.slide ul>li::before{padding-right:.5em;content:'•'} -.slide ul>li:lang(ru)::before{content:'—'} -.slide ol>li::before{padding-right:.4em;counter-increment:list;content:counter(list) "."} -.slide table{margin-left:-100px;margin-bottom:1em;width:calc(100% + 100px + 100px)} -.slide table td:first-child,.slide table th:first-child{padding-left:96px} -.slide table td:last-child,.slide table th:last-child{padding-right:96px} -.slide table th{text-align:left;font-weight:700} -.slide table tr:not(:last-of-type)>*{background:-webkit-linear-gradient(bottom,rgba(88,90,94,.5) .055em,transparent .055em) 
repeat-x;background:linear-gradient(to top,rgba(88,90,94,.5) .055em,transparent .055em) repeat-x} -.slide table.striped tr:nth-child(even){background:rgba(88,90,94,.1)} -.slide table.striped tr>*{background-image:none} -.slide pre{margin-bottom:1em;counter-reset:code;white-space:normal} -.slide pre code{display:block;margin-left:-100px;padding:0 0 0 100px;width:calc(100% + 100px + 100px);border-radius:0;background:0 0;line-height:2;white-space:pre;-moz-tab-size:4;-o-tab-size:4;tab-size:4} -.slide pre code:not(:only-child).mark{background:rgba(88,90,94,.1)} -.slide pre code:not(:only-child)::before{position:absolute;margin-left:-2em;color:#979a9e;counter-increment:code;content:counter(code,decimal-leading-zero) "."} -.slide pre mark{position:relative;z-index:-1;margin:0 -.3em} -.slide pre mark.important{background:#c00;color:#fff} -.slide pre .comment{color:#999} -.slide footer{position:absolute;right:0;bottom:-640px;left:0;display:none;padding:41px 100px 8px;background:#fbfbba;box-shadow:0 1px 0 #fafaa2 inset;-webkit-transition:bottom .3s;transition:bottom .3s} -.slide footer mark{background:rgba(255,255,255,.8)} -.slide:hover>footer{bottom:0} -.slide.grid{background-image:url(../images/grid.png);-ms-interpolation-mode:nearest-neighbor;image-rendering:-webkit-optimize-contrast;image-rendering:-moz-crisp-edges;image-rendering:pixelated} -@media (-webkit-min-device-pixel-ratio:2),(min-resolution:2dppx){.slide.grid{background-image:url(../images/grid@2x.png);background-size:1024px auto} -} -.slide.black{background-color:#000} -.slide.black::after,.slide.white::after{visibility:hidden} -.slide.white{background-color:#fff} -.slide .double,.slide .triple{-webkit-column-gap:75px;-moz-column-gap:75px;column-gap:75px;-webkit-hyphens:auto;-ms-hyphens:auto;hyphens:auto} -.slide .double{-webkit-column-count:2;-moz-column-count:2;column-count:2} -.slide .triple{-webkit-column-count:3;-moz-column-count:3;column-count:3} -.slide 
.shout{position:absolute;top:50%;left:0;width:100%;text-align:center;line-height:1;font-size:150px;-webkit-transform:translateY(-50%);transform:translateY(-50%)} -.slide .shout a{background:-webkit-linear-gradient(bottom,currentColor .11em,transparent .11em) repeat-x;background:linear-gradient(to top,currentColor .11em,transparent .11em) repeat-x} -.slide .cover{z-index:-1;max-width:100%;max-height:100%} -.slide .cover.w,.slide .cover.width{width:100%;max-height:none} -.slide .cover.h,.slide .cover.height{height:100%;max-width:none} -.slide .cover+figcaption{position:absolute;bottom:20px;right:10px;font-size:12px;opacity:.7;-webkit-transform-origin:0 100%;transform-origin:0 100%;-webkit-transform:translateX(100%) rotate(-90deg);transform:translateX(100%) rotate(-90deg)} -.slide .cover+figcaption.white{color:#fff} -.slide .cover+figcaption a{color:currentcolor} -.slide .cover,.slide .place{position:absolute;top:50%;left:50%;-webkit-transform:translate(-50%,-50%);transform:translate(-50%,-50%)} -.slide .place.b.l,.slide .place.b.r,.slide .place.bottom.left,.slide .place.bottom.right,.slide .place.t.l,.slide .place.t.r,.slide .place.top.left,.slide .place.top.right{-webkit-transform:none;transform:none} -.slide .place.b,.slide .place.bottom,.slide .place.t,.slide .place.top{-webkit-transform:translate(-50%,0);transform:translate(-50%,0)} -.slide .place.l,.slide .place.left,.slide .place.r,.slide .place.right{-webkit-transform:translate(0,-50%);transform:translate(0,-50%)} -.slide .place.t,.slide .place.t.r,.slide .place.top,.slide .place.top.left,.slide .place.top.right{top:0} -.slide .place.r,.slide .place.right{right:0;left:auto} -.slide .place.b,.slide .place.b.l,.slide .place.b.r,.slide .place.bottom,.slide .place.bottom.left,.slide .place.bottom.right{top:auto;bottom:0} -.slide .place.l,.slide .place.left{left:0} -.progress{left:-20px;bottom:0;z-index:1;display:none;width:0;height:0;box-sizing:content-box;border:10px solid 
#4b86c2;border-right-color:transparent;-webkit-transition:width .2s linear;transition:width .2s linear;clip:rect(10px,1044px,20px,20px)} -.progress[style*='100%']{padding-left:10px} -.badge,.badge a,.progress{position:absolute} -.badge{font-size:10px;top:0;z-index:1;overflow:hidden;display:none;width:9em;height:9em;right:0;visibility:hidden} -@media (min-width:1174px){.badge{font-size:20px} -} -@media (min-width:2348px){.badge{font-size:40px} -} -.badge a{right:-50%;bottom:50%;left:-50%;visibility:visible;background:#4b86c2;color:#fff;text-align:center;line-height:2;-webkit-transform-origin:50% 100%;transform-origin:50% 100%;-webkit-transform:rotate(45deg);transform:rotate(45deg)} -.region{display:none} -@media screen{.shower.list{padding-top:25px;width:100%;display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-flex-wrap:wrap;-ms-flex-wrap:wrap;flex-wrap:wrap;background:#585a5e;position:absolute;clip:rect(0,auto,auto,0)} -} -@media screen and (min-width:1174px){.shower.list{padding-top:50px} -} -@media screen and (min-width:2348px){.shower.list{padding-top:100px} -} -@media screen{.shower.list .caption{display:block} -.shower.list .slide{-webkit-transform-origin:0 0;transform-origin:0 0;margin:0 -768px -455px 25px;-webkit-transform:scale(.25);transform:scale(.25);border-radius:2px;box-shadow:0 20px 50px rgba(0,0,0,.3)} -} -@media screen and (min-width:1174px){.shower.list .slide{margin:0 -512px -270px 50px;-webkit-transform:scale(.5);transform:scale(.5)} -} -@media screen and (min-width:2348px){.shower.list .slide{margin:0 0 100px 100px;-webkit-transform:scale(1);transform:scale(1)} -} -@media screen{.shower.list .slide:hover{box-shadow:0 0 0 20px rgba(0,0,0,.1),0 20px 50px rgba(0,0,0,.3)} -.shower.list .slide:target{box-shadow:0 0 0 1px #376da3,0 0 0 20px #4b86c2,0 20px 50px rgba(0,0,0,.3)} -.shower.list .slide *{pointer-events:none} -.shower.list .badge,.shower.list .slide footer{display:block} 
-.shower.full{position:absolute;top:50%;left:50%;overflow:hidden;margin:-320px 0 0 -512px;width:1024px;height:640px;background:#000} -.shower.full .slide{position:absolute;top:0;left:0;margin-left:-150%;visibility:hidden} -.shower.full .slide:target{margin:0;visibility:visible} -.shower.full .slide pre code:not(:only-child).mark.next{visibility:visible;background:0 0} -.shower.full .slide pre code:not(:only-child).mark.next.active{background:rgba(88,90,94,.1)} -.shower.full .slide .next{visibility:hidden} -.shower.full .slide .next.active{visibility:visible} -.shower.full .slide .shout.grow,.shower.full .slide .shout.shrink{opacity:0;-webkit-transition:.4s ease-out;transition:.4s ease-out;-webkit-transition-property:opacity,-webkit-transform;transition-property:opacity,transform;transition-property:opacity,transform,-webkit-transform} -.shower.full .slide .shout.grow{-webkit-transform:scale(.1) translateY(-50%);transform:scale(.1) translateY(-50%)} -.shower.full .slide .shout.shrink{-webkit-transform:scale(10) translateY(-50%);transform:scale(10) translateY(-50%)} -.shower.full .slide:target .shout.grow,.shower.full .slide:target .shout.shrink{opacity:1;-webkit-transform:scale(1) translateY(-50%);transform:scale(1) translateY(-50%)} -.shower.full .progress{display:block;-webkit-transform:translateZ(0);transform:translateZ(0)} -.shower.full .region{position:absolute;clip:rect(0 0 0 0);overflow:hidden;margin:-1px;padding:0;width:1px;height:1px;border:none;display:block} -} diff --git a/doc/presentations/tbd/shower/themes/ribbon/styles/screen-4x3.css b/doc/presentations/tbd/shower/themes/ribbon/styles/screen-4x3.css deleted file mode 100644 index 6648b972c30..00000000000 --- a/doc/presentations/tbd/shower/themes/ribbon/styles/screen-4x3.css +++ /dev/null @@ -1,7 +0,0 @@ -/** - * Ribbon theme for Shower HTML presentation engine - * shower-ribbon v2.0.8, https://github.com/shower/ribbon - * @copyright 2010–2016 Vadim Makeev, http://pepelsbey.net/ - * @license MIT - */ 
-@charset "UTF-8";@font-face{font-family:PT Sans;src:url(../fonts/pt-sans-regular.woff) format("woff")}@font-face{font-weight:700;font-family:PT Sans;src:url(../fonts/pt-sans-bold.woff) format("woff")}@font-face{font-style:italic;font-family:PT Sans;src:url(../fonts/pt-sans-italic.woff) format("woff")}@font-face{font-style:italic;font-weight:700;font-family:PT Sans;src:url(../fonts/pt-sans-bold-italic.woff) format("woff")}@font-face{font-family:PT Sans Narrow;font-weight:700;src:url(../fonts/pt-sans-narrow-bold.woff) format("woff")}@font-face{font-family:PT Mono;src:url(../fonts/pt-mono-regular.woff) format("woff")}*,::after,::before{box-sizing:border-box}a,abbr,acronym,address,applet,article,aside,audio,b,big,blockquote,body,canvas,caption,center,cite,code,dd,del,details,dfn,div,dl,dt,em,embed,fieldset,figcaption,figure,footer,form,h1,h2,h3,h4,h5,h6,header,hgroup,html,i,iframe,img,ins,kbd,label,legend,li,mark,menu,nav,object,ol,output,p,pre,q,ruby,s,samp,section,small,span,strike,strong,sub,summary,sup,table,tbody,td,tfoot,th,thead,time,tr,tt,u,ul,var,video{margin:0;padding:0;border:0;font:inherit;vertical-align:baseline}article,aside,details,figcaption,figure,footer,header,hgroup,main,menu,nav,section{display:block}.caption p,body{line-height:1}ol,ul{list-style:none}blockquote,q{quotes:none}blockquote::after,blockquote::before,q::after,q::before{content:none}table{border-collapse:collapse;border-spacing:0}a{text-decoration:none}@page{margin:0;size:1024px 768px}.shower{color:#000;counter-reset:slide;font:25px/2 PT Sans,sans-serif;-webkit-print-color-adjust:exact;-webkit-text-size-adjust:none;-moz-text-size-adjust:none;-ms-text-size-adjust:none}@media print{.shower{text-rendering:geometricPrecision}}.caption{font-size:25px;display:none;margin-top:-.2em;padding:0 1em .93em;width:100%;color:#3c3d40;text-shadow:0 1px 0 #8d8e90}@media (min-width:1174px){.caption{font-size:50px}}@media (min-width:2348px){.caption{font-size:100px}}.caption 
h1{padding-bottom:.15em;font:700 1em/1 PT Sans Narrow,sans-serif}.caption p{font-size:.6em}.caption a{color:#4b86c2;text-shadow:0 -1px 0 #1f3f60}.slide{position:relative;z-index:1;overflow:hidden;padding:106px 100px 0;width:1024px;height:768px;background:#fff;font-size:25px}.slide::after{position:absolute;top:0;right:100px;padding-top:15px;width:50px;height:100px;background:url(../images/ribbon.svg) no-repeat;color:#fff;counter-increment:slide;content:counter(slide);text-align:center}.slide h2{margin-bottom:34px;color:#585a5e;font:700 50px/1 PT Sans Narrow,sans-serif}.slide p{margin-bottom:1em}.slide p.note{color:#979a9e}.slide a{background:-webkit-linear-gradient(bottom,currentColor .09em,transparent .09em) repeat-x;background:linear-gradient(to top,currentColor .09em,transparent .09em) repeat-x;color:#4b86c2}.slide b,.slide strong{font-weight:700}.slide blockquote,.slide dfn,.slide em,.slide i{font-style:italic}.slide code,.slide kbd,.slide mark,.slide samp{padding:.1em .3em;border-radius:.2em}.slide code,.slide kbd,.slide samp{background:rgba(88,90,94,.1);line-height:1;font-family:PT Mono,monospace,monospace}.slide mark{background:#fafaa2}.slide sub,.slide sup{position:relative;line-height:0;font-size:75%}.slide sub{bottom:-.25em}.slide sup{top:-.5em}.slide blockquote::before{position:absolute;margin:-.15em 0 0 -.43em;color:#ccc;line-height:1;font-size:8em;content:'\201C'}.slide blockquote+figcaption{margin:-1em 0 1em;font-style:italic;font-weight:700}.slide ol,.slide ul{margin-bottom:1em;counter-reset:list}.slide ol li,.slide ul li{page-break-inside:avoid;text-indent:-2em}.slide ol li::before,.slide ul li::before{display:inline-block;width:2em;color:#979a9e;text-align:right}.slide ol ol,.slide ol ul,.slide ul ol,.slide ul ul{margin-bottom:0;margin-left:2em}.slide ul>li::before{padding-right:.5em;content:'•'}.slide ul>li:lang(ru)::before{content:'—'}.slide ol>li::before{padding-right:.4em;counter-increment:list;content:counter(list) "."}.slide 
table{margin-left:-100px;margin-bottom:1em;width:calc(100% + 100px + 100px)}.slide table td:first-child,.slide table th:first-child{padding-left:96px}.slide table td:last-child,.slide table th:last-child{padding-right:96px}.slide table th{text-align:left;font-weight:700}.slide table tr:not(:last-of-type)>*{background:-webkit-linear-gradient(bottom,rgba(88,90,94,.5) .055em,transparent .055em) repeat-x;background:linear-gradient(to top,rgba(88,90,94,.5) .055em,transparent .055em) repeat-x}.slide table.striped tr:nth-child(even){background:rgba(88,90,94,.1)}.slide table.striped tr>*{background-image:none}.slide pre{margin-bottom:1em;counter-reset:code;white-space:normal}.slide pre code{display:block;margin-left:-100px;padding:0 0 0 100px;width:calc(100% + 100px + 100px);border-radius:0;background:0 0;line-height:2;white-space:pre;-moz-tab-size:4;-o-tab-size:4;tab-size:4}.slide pre code:not(:only-child).mark{background:rgba(88,90,94,.1)}.slide pre code:not(:only-child)::before{position:absolute;margin-left:-2em;color:#979a9e;counter-increment:code;content:counter(code,decimal-leading-zero) "."}.slide pre mark{position:relative;z-index:-1;margin:0 -.3em}.slide pre mark.important{background:#c00;color:#fff}.slide pre .comment{color:#999}.slide footer{position:absolute;right:0;bottom:-768px;left:0;display:none;padding:41px 100px 8px;background:#fbfbba;box-shadow:0 1px 0 #fafaa2 inset;-webkit-transition:bottom .3s;transition:bottom .3s}.slide footer mark{background:rgba(255,255,255,.8)}.slide:hover>footer{bottom:0}.slide.grid{background-image:url(../images/grid.png);-ms-interpolation-mode:nearest-neighbor;image-rendering:-webkit-optimize-contrast;image-rendering:-moz-crisp-edges;image-rendering:pixelated}@media (-webkit-min-device-pixel-ratio:2),(min-resolution:2dppx){.slide.grid{background-image:url(../images/grid@2x.png);background-size:1024px 
auto}}.slide.black{background-color:#000}.slide.black::after,.slide.white::after{visibility:hidden}.slide.white{background-color:#fff}.slide .double,.slide .triple{-webkit-column-gap:75px;-moz-column-gap:75px;column-gap:75px;-webkit-hyphens:auto;-ms-hyphens:auto;hyphens:auto}.slide .double{-webkit-column-count:2;-moz-column-count:2;column-count:2}.slide .triple{-webkit-column-count:3;-moz-column-count:3;column-count:3}.slide .shout{position:absolute;top:50%;left:0;width:100%;text-align:center;line-height:1;font-size:150px;-webkit-transform:translateY(-50%);transform:translateY(-50%)}.slide .shout a{background:-webkit-linear-gradient(bottom,currentColor .11em,transparent .11em) repeat-x;background:linear-gradient(to top,currentColor .11em,transparent .11em) repeat-x}.slide .cover{z-index:-1;max-width:100%;max-height:100%}.slide .cover.w,.slide .cover.width{width:100%;max-height:none}.slide .cover.h,.slide .cover.height{height:100%;max-width:none}.slide .cover+figcaption{position:absolute;bottom:20px;right:10px;font-size:12px;opacity:.7;-webkit-transform-origin:0 100%;transform-origin:0 100%;-webkit-transform:translateX(100%) rotate(-90deg);transform:translateX(100%) rotate(-90deg)}.slide .cover+figcaption.white{color:#fff}.slide .cover+figcaption a{color:currentcolor}.slide .cover,.slide .place{position:absolute;top:50%;left:50%;-webkit-transform:translate(-50%,-50%);transform:translate(-50%,-50%)}.slide .place.b.l,.slide .place.b.r,.slide .place.bottom.left,.slide .place.bottom.right,.slide .place.t.l,.slide .place.t.r,.slide .place.top.left,.slide .place.top.right{-webkit-transform:none;transform:none}.slide .place.b,.slide .place.bottom,.slide .place.t,.slide .place.top{-webkit-transform:translate(-50%,0);transform:translate(-50%,0)}.slide .place.l,.slide .place.left,.slide .place.r,.slide .place.right{-webkit-transform:translate(0,-50%);transform:translate(0,-50%)}.slide .place.t,.slide .place.t.r,.slide .place.top,.slide .place.top.left,.slide 
.place.top.right{top:0}.slide .place.r,.slide .place.right{right:0;left:auto}.slide .place.b,.slide .place.b.l,.slide .place.b.r,.slide .place.bottom,.slide .place.bottom.left,.slide .place.bottom.right{top:auto;bottom:0}.slide .place.l,.slide .place.left{left:0}.progress{left:-20px;bottom:0;z-index:1;display:none;width:0;height:0;box-sizing:content-box;border:10px solid #4b86c2;border-right-color:transparent;-webkit-transition:width .2s linear;transition:width .2s linear;clip:rect(10px,1044px,20px,20px)}.progress[style*='100%']{padding-left:10px}.badge,.badge a,.progress{position:absolute}.badge{font-size:10px;top:0;z-index:1;overflow:hidden;display:none;width:9em;height:9em;right:0;visibility:hidden}@media (min-width:1174px){.badge{font-size:20px}}@media (min-width:2348px){.badge{font-size:40px}}.badge a{right:-50%;bottom:50%;left:-50%;visibility:visible;background:#4b86c2;color:#fff;text-align:center;line-height:2;-webkit-transform-origin:50% 100%;transform-origin:50% 100%;-webkit-transform:rotate(45deg);transform:rotate(45deg)}.region{display:none}@media screen{.shower.list{padding-top:25px;width:100%;display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-flex-wrap:wrap;-ms-flex-wrap:wrap;flex-wrap:wrap;background:#585a5e;position:absolute;clip:rect(0,auto,auto,0)}}@media screen and (min-width:1174px){.shower.list{padding-top:50px}}@media screen and (min-width:2348px){.shower.list{padding-top:100px}}@media screen{.shower.list .caption{display:block}.shower.list .slide{-webkit-transform-origin:0 0;transform-origin:0 0;margin:0 -768px -551px 25px;-webkit-transform:scale(.25);transform:scale(.25);border-radius:2px;box-shadow:0 20px 50px rgba(0,0,0,.3)}}@media screen and (min-width:1174px){.shower.list .slide{margin:0 -512px -334px 50px;-webkit-transform:scale(.5);transform:scale(.5)}}@media screen and (min-width:2348px){.shower.list .slide{margin:0 0 100px 100px;-webkit-transform:scale(1);transform:scale(1)}}@media screen{.shower.list 
.slide:hover{box-shadow:0 0 0 20px rgba(0,0,0,.1),0 20px 50px rgba(0,0,0,.3)}.shower.list .slide:target{box-shadow:0 0 0 1px #376da3,0 0 0 20px #4b86c2,0 20px 50px rgba(0,0,0,.3)}.shower.list .slide *{pointer-events:none}.shower.list .badge,.shower.list .slide footer{display:block}.shower.full{position:absolute;top:50%;left:50%;overflow:hidden;margin:-384px 0 0 -512px;width:1024px;height:768px;background:#000}.shower.full .slide{position:absolute;top:0;left:0;margin-left:-150%;visibility:hidden}.shower.full .slide:target{margin:0;visibility:visible}.shower.full .slide pre code:not(:only-child).mark.next{visibility:visible;background:0 0}.shower.full .slide pre code:not(:only-child).mark.next.active{background:rgba(88,90,94,.1)}.shower.full .slide .next{visibility:hidden}.shower.full .slide .next.active{visibility:visible}.shower.full .slide .shout.grow,.shower.full .slide .shout.shrink{opacity:0;-webkit-transition:.4s ease-out;transition:.4s ease-out;-webkit-transition-property:opacity,-webkit-transform;transition-property:opacity,transform;transition-property:opacity,transform,-webkit-transform}.shower.full .slide .shout.grow{-webkit-transform:scale(.1) translateY(-50%);transform:scale(.1) translateY(-50%)}.shower.full .slide .shout.shrink{-webkit-transform:scale(10) translateY(-50%);transform:scale(10) translateY(-50%)}.shower.full .slide:target .shout.grow,.shower.full .slide:target .shout.shrink{opacity:1;-webkit-transform:scale(1) translateY(-50%);transform:scale(1) translateY(-50%)}.shower.full .progress{display:block;-webkit-transform:translateZ(0);transform:translateZ(0)}.shower.full .region{position:absolute;clip:rect(0 0 0 0);overflow:hidden;margin:-1px;padding:0;width:1px;height:1px;border:none;display:block}} \ No newline at end of file diff --git a/docs/en/access_rights.rst b/docs/en/access_rights.rst new file mode 100644 index 00000000000..a2e0c8da2cb --- /dev/null +++ b/docs/en/access_rights.rst @@ -0,0 +1,69 @@ +Access rights +============= +Users 
and access rights are set up in the user config. This is usually ``users.xml``. + +Users are recorded in the ``users`` section. Let's look at part of the ``users.xml`` file: + +.. code-block:: xml + + + + + + + + + + + + + default + + default + + + + + + + web + default + + +Here we can see that two users are declared: ``default`` and ``web``. We added the ``web`` user ourselves. +The ``default`` user is chosen in cases when the username is not passed, so this user must be present in the config file. The ``default`` user is also used for distributed query processing - the system accesses remote servers under this username. So the ``default`` user must have an empty password and must not have substantial restrictions or quotas - otherwise, distributed queries will fail. + +The password is specified in plain text directly in the config. In this regard, you should not consider these passwords as providing security against potential malicious attacks. Rather, they are necessary for protection from Yandex employees. + +A list of networks is specified that access is allowed from. In this example, the list of networks for both users is loaded from a separate file (``/etc/metrika.xml``) containing the ``networks`` substitution. Here is a fragment of it: + +.. code-block:: xml + + + ... + + ::/64 + 93.111.222.128/26 + 2a02:6b8:0:111::/64 + ... + + + +We could have defined this list of networks directly in ``users.xml``, or in a file in the ``users.d`` directory (for more information, see the section "Configuration files"). + +The config includes comments explaining how to open access from everywhere. + +For use in production, only specify IP elements (IP addresses and their masks), since using ``host`` and ``host_regexp`` might cause extra latency. + +Next the user settings profile is specified (see the section "Settings profiles"). You can specify the default profile, ``default``. The profile can have any name. You can specify the same profile for different users. 
The most important thing you can write in the settings profile is ``readonly`` set to ``1``, which provides read-only access. + +After this, the quota is defined (see the section "Quotas"). You can specify the default quota, ``default``. It is set in the config by default so that it only counts resource usage, but does not restrict it. The quota can have any name. You can specify the same quota for different users - in this case, resource usage is calculated for each user individually. diff --git a/docs/en/agg_functions/index.rst b/docs/en/agg_functions/index.rst new file mode 100644 index 00000000000..17bef2c7669 --- /dev/null +++ b/docs/en/agg_functions/index.rst @@ -0,0 +1,284 @@ +Aggregate functions +================== + +count() +------- +Counts the number of rows. Accepts zero arguments and returns UInt64. +The syntax COUNT(DISTINCT x) is not supported. The separate 'uniq' aggregate function exists for this purpose. + +A 'SELECT count() FROM table' query is not optimized, because the number of entries in the table is not stored separately. It will select some small column from the table and count the number of values in it. + +any(x) +------ +Selects the first encountered value. +The query can be executed in any order and even in a different order each time, so the result of this function is indeterminate. +To get a determinate result, you can use the 'min' or 'max' function instead of 'any'. + +In some cases, you can rely on the order of execution. This applies to cases when SELECT comes from a subquery that uses ORDER BY. + +When a SELECT query has the GROUP BY clause or at least one aggregate function, ClickHouse (in contrast to MySQL) requires that all expressions in the SELECT, HAVING, and ORDER BY clauses be calculated from keys or from aggregate functions. That is, each column selected from the table must be used either in keys, or inside aggregate functions. To get behavior like in MySQL, you can put the other columns in the 'any' aggregate function. 
+ +anyLast(x) +---------- +Selects the last value encountered. +The result is just as indeterminate as for the 'any' function. + +min(x) +------ +Calculates the minimum. + +max(x) +------ +Calculates the maximum. + +argMin(arg, val) +---------------- +Calculates the 'arg' value for a minimal 'val' value. If there are several different values of 'arg' for minimal values of 'val', the first of these values encountered is output. + +argMax(arg, val) +---------------- +Calculates the 'arg' value for a maximum 'val' value. If there are several different values of 'arg' for maximum values of 'val', the first of these values encountered is output. + +sum(x) +------- +Calculates the sum. +Only works for numbers. + +avg(x) +------ +Calculates the average. +Only works for numbers. +The result is always Float64. + +uniq(x) +-------- +Calculates the approximate number of different values of the argument. Works for numbers, strings, dates, and dates with times. + +Uses an adaptive sampling algorithm: for the calculation state, it uses a sample of element hash values with a size up to 65535. +Compared with the widely known HyperLogLog algorithm, this algorithm is less effective in terms of accuracy and memory consumption (even up to proportionality), but it is adaptive. This means that with fairly high accuracy, it consumes less memory during simultaneous computation of cardinality for a large number of data sets whose cardinality has power law distribution (i.e. in cases when most of the data sets are small). This algorithm is also very accurate for data sets with small cardinality (up to 65536) and very efficient on CPU (when computing not too many of these functions, using 'uniq' is almost as fast as using other aggregate functions). + +There is no compensation for the bias of an estimate, so for large data sets the results are systematically deflated. This function is normally used for computing the number of unique visitors in Yandex.Metrica, so this bias does not play a role.
+ +The result is determinate (it doesn't depend on the order of query execution). + +uniqCombined(x) +--------------- +Approximately computes the number of different values of the argument. Works for numbers, strings, dates, dates with times, multiple arguments, and tuple arguments. + +A combination of three algorithms is used: an array, a hash table and HyperLogLog with an error correction table. The memory consumption is several times smaller than that of the uniq function, and the accuracy is several times higher. The speed of operation is slightly lower than that of the uniq function, but sometimes it can be even higher - in the case of distributed requests, in which a large number of aggregation states are transmitted over the network. The maximum state size is 96 KiB (HyperLogLog of 2^17 6-bit cells). + +The result is deterministic (it does not depend on the order of query execution). + +The uniqCombined function is a good default choice for calculating the number of different values. + +uniqHLL12(x) +------------ +Uses the HyperLogLog algorithm to approximate the number of different values of the argument. It uses 2^12 5-bit cells. The size of the state is slightly more than 2.5 KB. + +The result is determinate (it doesn't depend on the order of query execution). + +In most cases, use the 'uniq' function. You should only use this function if you understand its advantages well. + +uniqExact(x) +------------ +Calculates the number of different values of the argument, exactly. +There is no reason to fear approximations, so it's better to use the 'uniq' function. +You should use the 'uniqExact' function if you definitely need an exact result. + +The 'uniqExact' function uses more memory than the 'uniq' function, because the size of the state has unbounded growth as the number of different values increases. + +groupArray(x) +------------- +Creates an array of argument values. +Values can be added to the array in any (indeterminate) order.
+ +In some cases, you can rely on the order of execution. This applies to cases when SELECT comes from a subquery that uses ORDER BY. + +groupUniqArray(x) +----------------- +Creates an array from different argument values. Memory consumption is the same as for the 'uniqExact' function. + +quantile(level)(x) +------------------ +Approximates the 'level' quantile. 'level' is a constant, a floating-point number from 0 to 1. We recommend using a 'level' value in the range of 0.01 .. 0.99. +Don't use a 'level' value equal to 0 or 1 - use the 'min' and 'max' functions for these cases. + +The algorithm is the same as for the 'median' function. Actually, 'quantile' and 'median' are internally the same function. You can use the 'quantile' function without parameters - in this case, it calculates the median, and you can use the 'median' function with parameters - in this case, it calculates the quantile of the set level. + +When using multiple 'quantile' and 'median' functions with different levels in a query, the internal states are not combined (that is, the query works less efficiently than it could). In this case, use the 'quantiles' function. + +quantileDeterministic(level)(x, determinator) +--------------------------------------------- +Calculates the quantile of 'level' using the same algorithm as the 'medianDeterministic' function. + + +quantileTiming(level)(x) +------------------------ +Calculates the quantile of 'level' using the same algorithm as the 'medianTiming' function. + +quantileTimingWeighted(level)(x, weight) +---------------------------------------- +Calculates the quantile of 'level' using the same algorithm as the 'medianTimingWeighted' function. + +quantileExact(level)(x) +----------------------- +Computes the level quantile exactly. To do this, all passed values are added to an array, which is then partially sorted. Therefore, the function consumes O(n) memory, where n is the number of passed values. However, for a small number of values, the function is very effective.
+ +quantileExactWeighted(level)(x, weight) +--------------------------------------- +Computes the level quantile exactly. In this case, each value is taken into account with the weight 'weight' - as if it were present 'weight' times. The arguments of the function can be considered as histograms, where the value "x" corresponds to a "column" of the histogram with the height 'weight', and the function itself can be considered as the summation of histograms. + +The algorithm uses a hash table. Because of this, if the passed values are often repeated, the function consumes less RAM than quantileExact. You can use this function instead of quantileExact, specifying the number 1 as the weight. + +quantileTDigest(level)(x) +------------------------- +Computes the level quantile approximately, using the t-digest algorithm. The maximum error is 1%. The memory consumption per state is proportional to the logarithm of the number of passed values. + +The performance of the function is lower than that of quantile and quantileTiming. In terms of the ratio of state size to accuracy, the function is significantly better than quantile. + +The result depends on the order in which the query is executed, and is nondeterministic. + +median +------ +Approximates the median. Also see the similar 'quantile' function. +Works for numbers, dates, and dates with times. +For numbers it returns Float64, for dates - a date, and for dates with times - a date with time. + +Uses reservoir sampling with a reservoir size up to 8192. +If necessary, the result is output with linear approximation from the two neighboring values. +This algorithm proved to be more practical than another well-known algorithm - QDigest. + +The result depends on the order of running the query, and is nondeterministic. + +quantiles(level1, level2, ...)(x) +--------------------------------- +Approximates quantiles of all specified levels. +The result is an array containing the corresponding number of values.
+ +varSamp(x) +---------- +Calculates the value Σ((x - x̅)²) / (n - 1), where 'n' is the sample size and 'x̅' is the average value of 'x'. + +It represents an unbiased estimate of the variance of a random variable, if the values passed to the function are a sample of this random variable. + +Returns Float64. If n <= 1, it returns +∞. + +varPop(x) +--------- +Calculates the value Σ((x - x̅)²) / n, where 'n' is the sample size and 'x̅' is the average value of 'x'. + +In other words, the variance of a set of values. Returns Float64. + +stddevSamp(x) +------------- +The result is equal to the square root of 'varSamp(x)'. + + +stddevPop(x) +------------ +The result is equal to the square root of 'varPop(x)'. + + +covarSamp(x, y) +--------------- +Calculates the value of Σ((x - x̅)(y - y̅)) / (n - 1). + +Returns Float64. If n <= 1, it returns +∞. + +covarPop(x, y) +-------------- +Calculates the value of Σ((x - x̅)(y - y̅)) / n. + +corr(x, y) +---------- +Calculates the Pearson correlation coefficient: Σ((x - x̅)(y - y̅)) / sqrt(Σ((x - x̅)²) * Σ((y - y̅)²)). + +Parametric aggregate functions +============================== +Some aggregate functions can accept not only argument columns (used for compression), but also a set of parameters - constants for initialization. The syntax is two pairs of brackets instead of one. The first is for parameters, and the second is for arguments. + +sequenceMatch(pattern)(time, cond1, cond2, ...) +----------------------------------------------- +Pattern matching for event chains. + +'pattern' is a string containing a pattern to match. The pattern is similar to a regular expression. +'time' is the event time of the DateTime type. +'cond1, cond2 ...' are from one to 32 arguments of the UInt8 type that indicate whether an event condition was met. + +The function collects a sequence of events in RAM. Then it checks whether this sequence matches the pattern. +It returns UInt8 - 0 if the pattern isn't matched, or 1 if it matches.
+ +Example: sequenceMatch('(?1).*(?2)')(EventTime, URL LIKE '%company%', URL LIKE '%cart%') +- whether there was a chain of events in which pages whose address contains 'company' were visited earlier than pages whose address contains 'cart'. + +This is a degenerate example. You could write it using other aggregate functions: +minIf(EventTime, URL LIKE '%company%') < maxIf(EventTime, URL LIKE '%cart%'). +However, there is no such solution for more complex situations. + +Pattern syntax: +``(?1)`` - Reference to a condition (any number in place of 1). +``.*`` - Any number of events. +``(?t>=1800)`` - Time condition. +Any quantity of any type of events is allowed over the specified time. +The operators <, >, <= may be used instead of >=. +Any number may be specified in place of 1800. + +Events that occur during the same second may be put in the chain in any order. This may affect the result of the function. + +sequenceCount(pattern)(time, cond1, cond2, ...) +----------------------------------------------- +Similar to the sequenceMatch function, but instead of returning whether a chain of events exists, it returns a UInt64 - the number of chains found. +Chains are searched without overlapping. That is, the following chain can start only after the end of the previous one. + +uniqUpTo(N)(x) +-------------- +Calculates the number of different argument values, if it is less than or equal to N. +If the number of different argument values is greater than N, it returns N + 1. + +Recommended for use with small Ns, up to 10. The maximum N value is 100. + +For the state of an aggregate function, it uses an amount of memory equal to 1 + N * (the size of one value) bytes. +For strings, it stores a non-cryptographic hash of 8 bytes. That is, the calculation is approximated for strings. + +It works as fast as possible, except for cases when a large N value is used and the number of unique values is slightly less than N.
+ +Usage example: +Problem: Generate a report that shows only keywords that produced at least 5 unique users. +Solution: Write in the query ``GROUP BY SearchPhrase HAVING uniqUpTo(4)(UserID) >= 5`` + +Aggregate function combinators +============================== +The name of an aggregate function can have a suffix appended to it. This changes the way the aggregate function works. +There are ``If``, ``Array``, ``State``, ``Merge`` and ``MergeState`` combinators. See the sections below. + +-If combinator. Conditional aggregate functions +----------------------------------------------- +The suffix ``-If`` can be appended to the name of any aggregate function. In this case, the aggregate function accepts an extra argument - a condition (UInt8 type). The aggregate function processes only the rows that trigger the condition. If the condition was not triggered even once, it returns a default value (usually zeros or empty strings). + +Examples: ``sumIf(column, cond)``, ``countIf(cond)``, ``avgIf(x, cond)``, ``quantilesTimingIf(level1, level2)(x, cond)``, ``argMinIf(arg, val, cond)`` and so on. + +You can use conditional aggregate functions to calculate aggregates for multiple conditions at once, without using subqueries and JOINs. +For example, in Yandex.Metrica, we use conditional aggregate functions for implementing segment comparison functionality. + +-Array combinator. Aggregate functions for array arguments +---------------------------------------------------------- +The -Array suffix can be appended to any aggregate function. In this case, the aggregate function takes arguments of the 'Array(T)' type (arrays) instead of 'T' type arguments. If the aggregate function accepts multiple arguments, these must be arrays of equal lengths. When processing arrays, the aggregate function works like the original aggregate function across all array elements. + +Example 1: ``sumArray(arr)`` - Totals all the elements of all 'arr' arrays. In this example, it could have been written more simply: sum(arraySum(arr)). + +Example 2: ``uniqArray(arr)`` - Count the number of unique elements in all 'arr' arrays.
This could be done in an easier way: ``uniq(arrayJoin(arr))``, but it's not always possible to add 'arrayJoin' to a query. + +The ``-If`` and ``-Array`` combinators can be used together. However, 'Array' must come first, then 'If'. +Examples: ``uniqArrayIf(arr, cond)``, ``quantilesTimingArrayIf(level1, level2)(arr, cond)``. Due to this order, the 'cond' argument can't be an array. + +-State combinator +----------------- +If this combinator is used, the aggregate function returns not the finished value (for example, in the case of the uniq function, the number of unique values), but the intermediate aggregation state (for example, in the case of the uniq function, a hash table for calculating the number of unique values). This state has the type AggregateFunction(...) and can be used for further processing or can be stored in a table for subsequent pre-aggregation - see the sections "AggregatingMergeTree" and "functions for working with intermediate aggregation states". + +-Merge combinator +----------------- +If this combinator is used, the aggregate function takes the intermediate aggregation state as an argument, combines these states to finish the aggregation, and returns the finished value. + +-MergeState combinator +---------------------- +Merges the intermediate aggregation states in the same way as the -Merge combinator, but returns not the finished value but an intermediate aggregation state, similar to the -State combinator. diff --git a/docs/en/conf.py b/docs/en/conf.py new file mode 100644 index 00000000000..2f49322f3e5 --- /dev/null +++ b/docs/en/conf.py @@ -0,0 +1,260 @@ +# -*- coding: utf-8 -*- +# +# ClickHouse documentation build configuration file, created by +# sphinx-quickstart on Tue Mar 21 13:05:32 2017. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default.
+ +import sys +import os + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +#sys.path.insert(0, os.path.abspath('.')) + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +#needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.mathjax', +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix of source filenames. +source_suffix = '.rst' + +# The encoding of source files. +#source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = u'ClickHouse' +copyright = u'2017, Alexey Milovidov' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '1' +# The full version, including alpha/beta/rc tags. +release = '1' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +#language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = [] + +# The reST default role (used for this markup: `text`) to use for all +# documents. 
+#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + +# If true, keep warnings as "system message" paragraphs in the built documents. +#keep_warnings = False + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +html_theme = 'default' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +#html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +#html_theme_path = [] + +# The name for this set of Sphinx documents. If None, it defaults to +# " v documentation". +#html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +#html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +#html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +#html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". 
+html_static_path = ['_static'] + +# Add any extra paths that contain custom files (such as robots.txt or +# .htaccess) here, relative to this directory. These files are copied +# directly to the root of the documentation. +#html_extra_path = [] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +#html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +#html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +#html_additional_pages = {} + +# If false, no module index is generated. +#html_domain_indices = True + +# If false, no index is generated. +#html_use_index = True + +# If true, the index is split into individual pages for each letter. +#html_split_index = False + +# If true, links to the reST sources are added to the pages. +#html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +#html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +#html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +#html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +#html_file_suffix = None + +# Output file base name for HTML help builder. +htmlhelp_basename = 'ClickHousedoc' + + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { +# The paper size ('letterpaper' or 'a4paper'). +#'papersize': 'letterpaper', + +# The font size ('10pt', '11pt' or '12pt'). 
+#'pointsize': '10pt', + +# Additional stuff for the LaTeX preamble. +#'preamble': '', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + ('index', 'ClickHouse.tex', u'ClickHouse Documentation', + u'Alexey Milovidov', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +#latex_use_parts = False + +# If true, show page references after internal links. +#latex_show_pagerefs = False + +# If true, show URL addresses after external links. +#latex_show_urls = False + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. +#latex_domain_indices = True + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + ('index', 'clickhouse', u'ClickHouse Documentation', + [u'Alexey Milovidov'], 1) +] + +# If true, show URL addresses after external links. +#man_show_urls = False + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + ('index', 'ClickHouse', u'ClickHouse Documentation', + u'Alexey Milovidov', 'ClickHouse', 'One line description of project.', + 'Miscellaneous'), +] + +# Documents to append as an appendix to all manuals. +#texinfo_appendices = [] + +# If false, no module index is generated. +#texinfo_domain_indices = True + +# How to display URL addresses: 'footnote', 'no', or 'inline'. 
+#texinfo_show_urls = 'footnote' + +# If true, do not generate a @detailmenu in the "Top" node's menu. +#texinfo_no_detailmenu = False diff --git a/docs/en/configuration_files.rst b/docs/en/configuration_files.rst new file mode 100644 index 00000000000..cf35ecdeef1 --- /dev/null +++ b/docs/en/configuration_files.rst @@ -0,0 +1,21 @@ +Configuration files +====================== + +The main server config file is ``config.xml``. It resides in the ``/etc/clickhouse-server/`` directory. + +Certain settings can be overridden in the ``*.xml`` and ``*.conf`` files from the ``conf.d`` and ``config.d`` directories next to the config. + +The ``replace`` and ``remove`` attributes can be specified for the elements of these config files. +If neither is specified, it combines the contents of elements recursively, replacing values of duplicate children. +If ``replace`` is specified, it replaces the entire element with the specified one. +If ``remove`` is specified, it deletes the element. + +The config can also define "substitutions". If an element has the ``incl`` attribute, the corresponding substitution from the file will be used as the value. By default, the path to the file with substitutions is ``/etc/metrika.xml``. This can be changed in the config in the ``include_from`` element. The substitution values are specified in ``/yandex/substitution_name`` elements of this file. + +You can also perform substitutions from ZooKeeper nodes. To do that add the ``from_zk="/path/to/node"`` attribute to a config element. Element contents will be substituted with the contents of the /path/to/node ZooKeeper node. The ZooKeeper node can contain a whole XML subtree, and it will be inserted as a child of the substituted node. + +The 'config.xml' file can specify a separate config with user settings, profiles, and quotas. The relative path to this config is set in the 'users_config' element. By default, it is 'users.xml'. 
If 'users_config' is omitted, the user settings, profiles, and quotas are specified directly in ``config.xml``. For ``users_config``, overrides and substitutions may also exist in files from the ``users_config.d`` directory (for example, ``users.d``). + +For each config file, the server also generates ``file-preprocessed.xml`` files on launch. These files contain all the completed substitutions and overrides, and they are intended for informational use. If ZooKeeper substitutions were used in a config file and the ZooKeeper is unavailable during server startup, the configuration is loaded from the respective preprocessed file. + +The server tracks changes to config files and files and ZooKeeper nodes that were used for substitutions and overrides and reloads users and clusters configurations in runtime. That is, you can add or change users, clusters and their settings without relaunching the server. diff --git a/docs/en/data_types/array.rst b/docs/en/data_types/array.rst new file mode 100644 index 00000000000..51df7d27a4a --- /dev/null +++ b/docs/en/data_types/array.rst @@ -0,0 +1,5 @@ +Array(T) +-------- + +Array of T-type items. The T type can be any type, including an array. +We don't recommend using multidimensional arrays, because they are not well supported (for example, you can't store multidimensional arrays in tables with engines from MergeTree family). diff --git a/docs/en/data_types/boolean.rst b/docs/en/data_types/boolean.rst new file mode 100644 index 00000000000..0c6189fefad --- /dev/null +++ b/docs/en/data_types/boolean.rst @@ -0,0 +1,4 @@ +Boolean +--------------- + +There is no separate type for boolean values. For them, the type UInt8 is used, in which only the values 0 and 1 are used. diff --git a/docs/en/data_types/date.rst b/docs/en/data_types/date.rst new file mode 100644 index 00000000000..679374ae521 --- /dev/null +++ b/docs/en/data_types/date.rst @@ -0,0 +1,7 @@ +Date +---- + +A date. 
Stored in two bytes as the number of days since 1970-01-01 (unsigned). Allows storing values from just after the beginning of the Unix Epoch to the upper threshold defined by a constant at the compilation stage (currently, this is until the year 2038, but it may be expanded to 2106). +The minimum value is output as 0000-00-00. + +The date is stored without the time zone. diff --git a/docs/en/data_types/datetime.rst b/docs/en/data_types/datetime.rst new file mode 100644 index 00000000000..6f299b67028 --- /dev/null +++ b/docs/en/data_types/datetime.rst @@ -0,0 +1,16 @@ +DateTime +-------- + +Date with time. Stored in four bytes as a Unix timestamp (unsigned). Allows storing values in the same range as for the Date type. The minimal value is output as 0000-00-00 00:00:00. The time is stored with accuracy up to one second (without leap seconds). + + +Time zones +~~~~~~~~~~~~~ + +The date with time is converted from text (divided into component parts) to binary and back, using the system's time zone at the time the client or server starts. In text format, information about daylight savings is lost. + +Note that by default the client adopts the server time zone at the beginning of the session. You can change this behaviour with the --use_client_time_zone command line switch. + +Supports only those time zones that never had the time differ from UTC for a partial number of hours (without leap seconds) over the entire time range you will be working with. + +So when working with a textual date (for example, when saving text dumps), keep in mind that there may be ambiguity during changes for daylight savings time, and there may be problems matching data if the time zone changed. diff --git a/docs/en/data_types/enum.rst b/docs/en/data_types/enum.rst new file mode 100644 index 00000000000..fbdf7a0de3d --- /dev/null +++ b/docs/en/data_types/enum.rst @@ -0,0 +1,28 @@ +Enum +---- + +Enum8 or Enum16. A set of enumerated string values that are stored as Int8 or Int16. 
+ +Example: + +:: + Enum8('hello' = 1, 'world' = 2) +- This data type has two possible values - 'hello' and 'world'. + +The numeric values must be within -128..127 for ``Enum8`` and -32768..32767 for ``Enum16``. Every member of the enum must also have different numbers. The empty string is a valid value. The numbers do not need to be sequential and can be in any order. The order does not matter. + +In memory, the data is stored in the same way as the numeric types ``Int8`` and ``Int16``. +When reading in text format, the string is read and the corresponding numeric value is looked up. An exception will be thrown if it is not found. +When writing in text format, the stored number is looked up and the corresponding string is written out. An exception will be thrown if the number does not correspond to a known value. +In binary format, the information is saved in the same way as ``Int8`` and ``Int16``. +The implicit default value for an Enum is the value having the smallest numeric value. + +In ORDER BY, GROUP BY, IN, DISTINCT, etc. Enums behave like the numeric value. e.g. they will be sorted by the numeric value in an ``ORDER BY``. Equality and comparison operators behave like they do on the underlying numeric value. + +Enum values cannot be compared to numbers, they must be compared to a string. If the string compared to is not a valid value for the Enum, an exception will be thrown. The ``IN`` operator is supported with the Enum on the left hand side and a set of strings on the right hand side. + +Most numeric and string operations are not defined for Enum values, e.g. adding a number to an Enum or concatenating a string to an Enum. However, the toString function can be used to convert the Enum to its string value. Enum values are also convertible to numeric types using the ``toT`` function where ``T`` is a numeric type. When ``T`` corresponds to the enum's underlying numeric type, this conversion is zero-cost. 
+ +It is possible to add new members to the ``Enum`` using ``ALTER``. If the only change is to the set of values, the operation will be almost instant. It is also possible to remove members of the Enum using ALTER. Removing members is only safe if the removed value has never been used in the table. As a safeguard, changing the numeric value of a previously defined Enum member will throw an exception. + +Using ``ALTER``, it is possible to change an ``Enum8`` to an ``Enum16`` or vice versa - just like changing an ``Int8`` to ``Int16``. diff --git a/docs/en/data_types/fixedstring.rst b/docs/en/data_types/fixedstring.rst new file mode 100644 index 00000000000..c58adb1ba13 --- /dev/null +++ b/docs/en/data_types/fixedstring.rst @@ -0,0 +1,10 @@ +FixedString(N) +-------------- + +A fixed-length string of N bytes (not characters or code points). N must be a strictly positive natural number. +When server reads a string (as an input passed in INSERT query, for example) that contains fewer bytes, the string is padded to N bytes by appending null bytes at the right. +When server reads a string that contains more bytes, an error message is returned. +When server writes a string (as an output of SELECT query, for example), null bytes are not trimmed off of the end of the string, but are output. +Note that this behavior differs from MySQL behavior for the CHAR type (where strings are padded with spaces, and the spaces are removed for output). + +Fewer functions can work with the FixedString(N) type than with String, so it is less convenient to use. diff --git a/docs/en/data_types/float.rst b/docs/en/data_types/float.rst new file mode 100644 index 00000000000..8259f221813 --- /dev/null +++ b/docs/en/data_types/float.rst @@ -0,0 +1,7 @@ +Float32, Float64 +---------------- + +Floating-point numbers are just like 'float' and 'double' in the C language. +In contrast to standard SQL, floating-point numbers support 'inf', '-inf', and even 'nan's. 
+See the notes on sorting nans in "ORDER BY clause". +We do not recommend storing floating-point numbers in tables. diff --git a/docs/en/data_types/index.rst b/docs/en/data_types/index.rst new file mode 100644 index 00000000000..0af43b6db52 --- /dev/null +++ b/docs/en/data_types/index.rst @@ -0,0 +1,8 @@ +Data types +=========== + +.. toctree:: + :glob: + + * + */index diff --git a/docs/en/data_types/int_uint.rst b/docs/en/data_types/int_uint.rst new file mode 100644 index 00000000000..636b64c783b --- /dev/null +++ b/docs/en/data_types/int_uint.rst @@ -0,0 +1,40 @@ +UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64 +-------------------------------------------------------- + +Fixed-length integers, with or without a sign. + +Int ranges +""""""""""""" + +.. table:: + + +--------+----------------------+-----------------------+ + | Type | From | To | + +========+======================+=======================+ + | Int8 | -128 | 127 | + +--------+----------------------+-----------------------+ + | Int16 | -32768 | 32767 | + +--------+----------------------+-----------------------+ + | Int32 | -2147483648 | 2147483647 | + +--------+----------------------+-----------------------+ + | Int64 | -9223372036854775808 | 9223372036854775807 | + +--------+----------------------+-----------------------+ + + + +UInt ranges +"""""""""""""" + +..
table:: + + +--------+----------------------+-----------------------+ + | Type | From | To | + +========+======================+=======================+ + | UInt8 | 0 | 255 | + +--------+----------------------+-----------------------+ + | UInt16 | 0 | 65535 | + +--------+----------------------+-----------------------+ + | UInt32 | 0 | 4294967295 | + +--------+----------------------+-----------------------+ + | UInt64 | 0 | 18446744073709551615 | + +--------+----------------------+-----------------------+ diff --git a/docs/en/data_types/nested_data_structures/aggregatefunction.rst b/docs/en/data_types/nested_data_structures/aggregatefunction.rst new file mode 100644 index 00000000000..bb38fed92b2 --- /dev/null +++ b/docs/en/data_types/nested_data_structures/aggregatefunction.rst @@ -0,0 +1,4 @@ +AggregateFunction(name, types_of_arguments...) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The intermediate state of an aggregate function. To get it, use aggregate functions with the '-State' suffix. For more information, see "AggregatingMergeTree". diff --git a/docs/en/data_types/nested_data_structures/index.rst b/docs/en/data_types/nested_data_structures/index.rst new file mode 100644 index 00000000000..8be688424ce --- /dev/null +++ b/docs/en/data_types/nested_data_structures/index.rst @@ -0,0 +1,7 @@ +Nested data structures +-------------------------- + +.. toctree:: + :glob: + + * diff --git a/docs/en/data_types/nested_data_structures/nested.rst b/docs/en/data_types/nested_data_structures/nested.rst new file mode 100644 index 00000000000..b1d3b1b0174 --- /dev/null +++ b/docs/en/data_types/nested_data_structures/nested.rst @@ -0,0 +1,94 @@ +Nested(Name1 Type1, Name2 Type2, ...) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +A nested data structure is like a nested table. The parameters of a nested data structure - the column names and types - are specified the same way as in a CREATE query. Each table row can correspond to any number of rows in a nested data structure.
+ +Example: + +.. code-block:: sql + + CREATE TABLE test.visits + ( + CounterID UInt32, + StartDate Date, + Sign Int8, + IsNew UInt8, + VisitID UInt64, + UserID UInt64, + ... + Goals Nested + ( + ID UInt32, + Serial UInt32, + EventTime DateTime, + Price Int64, + OrderID String, + CurrencyID UInt32 + ), + ... + ) ENGINE = CollapsingMergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192, Sign) + +This example declares the 'Goals' nested data structure, which contains data about conversions (goals reached). Each row in the 'visits' table can correspond to zero or any number of conversions. + +Only a single nesting level is supported. Nested structure columns with array type are equivalent to multidimensional arrays and thus their support is limited (storing such columns in tables with engines from MergeTree family is not supported). + +In most cases, when working with a nested data structure, its individual columns are specified. To do this, the column names are separated by a dot. These columns make up an array of matching types. All the column arrays of a single nested data structure have the same length. + +Example: + +.. 
code-block:: sql + + SELECT + Goals.ID, + Goals.EventTime + FROM test.visits + WHERE CounterID = 101500 AND length(Goals.ID) < 5 + LIMIT 10 + + ┌─Goals.ID───────────────────────┬─Goals.EventTime───────────────────────────────────────────────────────────────────────────┐ + │ [1073752,591325,591325] │ ['2014-03-17 16:38:10','2014-03-17 16:38:48','2014-03-17 16:42:27'] │ + │ [1073752] │ ['2014-03-17 00:28:25'] │ + │ [1073752] │ ['2014-03-17 10:46:20'] │ + │ [1073752,591325,591325,591325] │ ['2014-03-17 13:59:20','2014-03-17 22:17:55','2014-03-17 22:18:07','2014-03-17 22:18:51'] │ + │ [] │ [] │ + │ [1073752,591325,591325] │ ['2014-03-17 11:37:06','2014-03-17 14:07:47','2014-03-17 14:36:21'] │ + │ [] │ [] │ + │ [] │ [] │ + │ [591325,1073752] │ ['2014-03-17 00:46:05','2014-03-17 00:46:05'] │ + │ [1073752,591325,591325,591325] │ ['2014-03-17 13:28:33','2014-03-17 13:30:26','2014-03-17 18:51:21','2014-03-17 18:51:45'] │ + └────────────────────────────────┴───────────────────────────────────────────────────────────────────────────────────────────┘ + +It is easiest to think of a nested data structure as a set of multiple column arrays of the same length. + +The only place where a SELECT query can specify the name of an entire nested data structure instead of individual columns is the ARRAY JOIN clause. For more information, see "ARRAY JOIN clause". Example: + +.. 
code-block:: sql + + SELECT + Goal.ID, + Goal.EventTime + FROM test.visits + ARRAY JOIN Goals AS Goal + WHERE CounterID = 101500 AND length(Goals.ID) < 5 + LIMIT 10 + + ┌─Goal.ID─┬──────Goal.EventTime─┐ + │ 1073752 │ 2014-03-17 16:38:10 │ + │ 591325 │ 2014-03-17 16:38:48 │ + │ 591325 │ 2014-03-17 16:42:27 │ + │ 1073752 │ 2014-03-17 00:28:25 │ + │ 1073752 │ 2014-03-17 10:46:20 │ + │ 1073752 │ 2014-03-17 13:59:20 │ + │ 591325 │ 2014-03-17 22:17:55 │ + │ 591325 │ 2014-03-17 22:18:07 │ + │ 591325 │ 2014-03-17 22:18:51 │ + │ 1073752 │ 2014-03-17 11:37:06 │ + └─────────┴─────────────────────┘ + +You can't perform SELECT for an entire nested data structure. You can only explicitly list individual columns that are part of it. + +For an INSERT query, you should pass all the component column arrays of a nested data structure separately (as if they were individual column arrays). During insertion, the system checks that they have the same length. + +For a DESCRIBE query, the columns in a nested data structure are listed separately in the same way. + +The ALTER query is very limited for elements in a nested data structure. diff --git a/docs/en/data_types/special_data_types/expression.rst b/docs/en/data_types/special_data_types/expression.rst new file mode 100644 index 00000000000..3cd57b38746 --- /dev/null +++ b/docs/en/data_types/special_data_types/expression.rst @@ -0,0 +1,4 @@ +Expression +~~~~~~~~~~ + +Used for representing lambda expressions in high-order functions. diff --git a/docs/en/data_types/special_data_types/index.rst b/docs/en/data_types/special_data_types/index.rst new file mode 100644 index 00000000000..5a63f005328 --- /dev/null +++ b/docs/en/data_types/special_data_types/index.rst @@ -0,0 +1,9 @@ +Special data types +---------------------- + +Special data type values can't be saved to a table or output in results, but are used as the intermediate result of running a query. + +.. 
toctree:: + :glob: + + * diff --git a/docs/en/data_types/special_data_types/set.rst b/docs/en/data_types/special_data_types/set.rst new file mode 100644 index 00000000000..0a48c339984 --- /dev/null +++ b/docs/en/data_types/special_data_types/set.rst @@ -0,0 +1,4 @@ +Set +~~~ + +Used for the right half of an IN expression. diff --git a/docs/en/data_types/string.rst b/docs/en/data_types/string.rst new file mode 100644 index 00000000000..46b0631e24b --- /dev/null +++ b/docs/en/data_types/string.rst @@ -0,0 +1,14 @@ +String +------ + +Strings of an arbitrary length. The length is not limited. The value can contain an arbitrary set of bytes, including null bytes. +The String type replaces the types VARCHAR, BLOB, CLOB, and others from other DBMSs. + + +Encodings +~~~~~~~~~ + +ClickHouse doesn't have the concept of encodings. Strings can contain an arbitrary set of bytes, which are stored and output as-is. +If you need to store texts, we recommend using UTF-8 encoding. At the very least, if your terminal uses UTF-8 (as recommended), you can read and write your values without making conversions. +Similarly, certain functions for working with strings have separate variations that work under the assumption that the string contains a set of bytes representing a UTF-8 encoded text. +For example, the 'length' function calculates the string length in bytes, while the 'lengthUTF8' function calculates the string length in Unicode code points, assuming that the value is UTF-8 encoded. diff --git a/docs/en/data_types/tuple.rst b/docs/en/data_types/tuple.rst new file mode 100644 index 00000000000..d53cd8daa14 --- /dev/null +++ b/docs/en/data_types/tuple.rst @@ -0,0 +1,6 @@ +Tuple(T1, T2, ...) +------------------ + +Tuples can't be written to tables (other than Memory tables). They are used for temporary column grouping. Columns can be grouped when an IN expression is used in a query, and for specifying certain formal parameters of lambda functions.
For more information, see "IN operators" and "Higher order functions". + +Tuples can be output as the result of running a query. In this case, for text formats other than JSON*, values are comma-separated in brackets. In JSON* formats, tuples are output as arrays (in square brackets). diff --git a/docs/en/dicts/external_dicts.rst b/docs/en/dicts/external_dicts.rst new file mode 100644 index 00000000000..39b59501da7 --- /dev/null +++ b/docs/en/dicts/external_dicts.rst @@ -0,0 +1,361 @@ +External dictionaries +=============== + +It is possible to add your own dictionaries from various data sources. The data source for a dictionary can be a file in the local file system, the ClickHouse server, or a MySQL server. +A dictionary can be stored completely in RAM and updated regularly, or it can be partially cached in RAM and dynamically load missing values. + +The configuration of external dictionaries is in a separate file or files specified in the 'dictionaries_config' configuration parameter. +This parameter contains the absolute or relative path to the file with the dictionary configuration. A relative path is relative to the directory with the server config file. The path can contain wildcards * and ?, in which case all matching files are found. Example: dictionaries/*.xml. + +The dictionary configuration, as well as the set of files with the configuration, can be updated without restarting the server. The server checks updates every 5 seconds. This means that dictionaries can be enabled dynamically. + +Dictionaries can be created when starting the server, or at first use. This is defined by the 'dictionaries_lazy_load' parameter in the main server config file. This parameter is optional, 'true' by default. If set to 'true', each dictionary is created at first use. If dictionary creation failed, the function that was using the dictionary throws an exception. 
If 'false', all dictionaries are created when the server starts, and if there is an error, the server shuts down. + +The dictionary config file has the following format: + +.. code-block:: xml + + + Optional element with any content; completely ignored. + + + + + os + + + + + + + /opt/dictionaries/os.tsv + + TabSeparated + + + + + + + + + + cat /opt/dictionaries/os.tsv + + TabSeparated + + + + + + http://[::1]/os.tsv + + TabSeparated + + + + + + + 300 + 360 + + + + + + + + + + + + + + + + Id + + + + + Name + + String + + + + + + ParentID + UInt64 + 0 + + true + + true + + + + + +The dictionary identifier (key attribute) should be a number that fits into UInt64. Also, you can use arbitrary tuples as keys (see section "Dictionaries with complex keys"). Note: you can use complex keys consisting of just one element. This allows using e.g. Strings as dictionary keys. + +There are six ways to store dictionaries in memory. + +flat +----- +This is the most effective method. It works if all keys are smaller than ``500,000``. If a larger key is discovered when creating the dictionary, an exception is thrown and the dictionary is not created. The dictionary is loaded to RAM in its entirety. The dictionary uses the amount of memory proportional to maximum key value. With the limit of 500,000, memory consumption is not likely to be high. All types of sources are supported. When updating, data (from a file or from a table) is read in its entirety. + +hashed +------- +This method is slightly less effective than the first one. The dictionary is also loaded to RAM in its entirety, and can contain any number of items with any identifiers. In practice, it makes sense to use up to tens of millions of items, while there is enough RAM. +All types of sources are supported. When updating, data (from a file or from a table) is read in its entirety. + +cache +------- +This is the least effective method. It is appropriate if the dictionary doesn't fit in RAM. 
It is a cache of a fixed number of cells, where frequently-used data can be located. MySQL, ClickHouse, executable, http sources are supported, but file sources are not supported. +When searching a dictionary, the cache is searched first. For each data block, all keys not found in the cache (or expired keys) are collected in a package, which is sent to the source with the query ``SELECT attrs... FROM db.table WHERE id IN (k1, k2, ...)``. The received data is then written to the cache. + +range_hashed +------------ +The table lists some data for date ranges, for each key. This layout makes it possible to extract this data for a given key and a given date. + +Example: in the table there are discounts for each advertiser in the form: +:: + advertiser id discount start date end date value + 123 2015-01-01 2015-01-15 0.15 + 123 2015-01-16 2015-01-31 0.25 + 456 2015-01-01 2015-01-15 0.05 + +To use it, specify layout = range_hashed. +When using such a layout, the structure should have the elements range_min, range_max. + +Example: + +.. code-block:: xml + + + + Id + + + first + + + last + + ... + +These columns must be of type Date. Other types are not yet supported. +The columns indicate a closed date range. + +To work with such dictionaries, dictGetT functions must take one more argument - the date: + +``dictGetT('dict_name', 'attr_name', id, date)`` + +The function returns the value for this id and for the date range that includes the passed date. If the id is not found, or no range containing the date is found for this id, the default value for the dictionary is returned. + +If there are overlapping ranges, then any suitable one can be used. + +If the range boundary is NULL or is an incorrect date (1900-01-01, 2039-01-01), then the range should be considered open. The range can be open on both sides. + +In the RAM, the data is presented as a hash table with a value in the form of an ordered array of ranges and their corresponding values. + +Example of a dictionary by ranges: + +..
code-block:: xml + + + + xxx + + + xxx + 3306 + xxx + + xxx + 1 + + dicts + xxx
+
+ + + 300 + 360 + + + + + + + Abcdef + + + StartDate + + + EndDate + + + XXXType + String + + + +
+
+ +ip_trie +------- +The table stores IP prefixes for each key (IP address), which makes it possible to map IP addresses to metadata such as ASN or threat score. + +Example: in the table there are prefixes mapped to an AS number and a country: +:: + prefix asn cca2 + 202.79.32.0/20 17501 NP + 2620:0:870::/48 3856 US + 2a02:6b8:1::/48 13238 RU + 2001:db8::/32 65536 ZZ + + +When using such a layout, the structure should have the "key" element. + +Example: + +.. code-block:: xml + + + + + prefix + String + + + + asn + UInt32 + + + + cca2 + String + ?? + + ... + +The key must have only one attribute of type String, containing a valid IP prefix. Other types are not yet supported. + +For querying, the same functions (dictGetT with tuple) as for complex key dictionaries have to be used: + +``dictGetT('dict_name', 'attr_name', tuple(ip))`` + +The function accepts either UInt32 for an IPv4 address or FixedString(16) for an IPv6 address in wire format: + +``dictGetString('prefix', 'asn', tuple(IPv6StringToNum('2001:db8::1')))`` + +No other type is supported. The function returns the attribute for the prefix matching the given IP address. If there are overlapping prefixes, the most specific one is returned. + +The data is currently stored in a bitwise trie, which has to fit in memory. + +complex_key_hashed +------------------ + +The same as ``hashed``, but for complex keys. + +complex_key_cache +----------------- + +The same as ``cache``, but for complex keys. + +Notes +---------- + +We recommend using the ``flat`` method when possible, or ``hashed``. The speed of the dictionaries is impeccable with this type of memory storage. + +Use the cache method only in cases when it is unavoidable. The speed of the cache depends strongly on correct settings and the usage scenario. A cache type dictionary only works normally for high enough hit rates (recommended 99% and higher). You can view the average hit rate in the system.dictionaries table. Set a large enough cache size.
You will need to experiment to find the right number of cells - select a value, use a query to get the cache completely full, look at the memory consumption (this information is in the system.dictionaries table), then proportionally increase the number of cells so that a reasonable amount of memory is consumed. We recommend MySQL as the source for the cache, because ClickHouse doesn't handle requests with random reads very well. + +In all cases, performance is better if you call the function for working with a dictionary after ``GROUP BY``, and if the attribute being fetched is marked as injective. For a dictionary cache, performance improves if you call the function after LIMIT. To do this, you can use a subquery with LIMIT, and call the function with the dictionary from the outside. + +An attribute is called injective if different attribute values correspond to different keys. So when ``GROUP BY`` uses a function that fetches an attribute value by the key, this function is automatically taken out of ``GROUP BY``. + +When updating dictionaries from a file, first the file modification time is checked, and it is loaded only if the file has changed. +When updating from MySQL, for flat and hashed dictionaries, first a ``SHOW TABLE STATUS`` query is made, and the table update time is checked. If it is not NULL, it is compared to the stored time. This works for MyISAM tables, but for InnoDB tables the update time is unknown, so loading from InnoDB is performed on each update. + +For cache dictionaries, the expiration (lifetime) of data in the cache can be set. If more time than 'lifetime' has passed since loading the data in a cell, the cell's value is not used, and it is re-requested the next time it needs to be used. + +If a dictionary couldn't be loaded even once, an attempt to use it throws an exception. +If an error occurred during a request to a cached source, an exception is thrown. +Dictionary updates (other than loading for first use) do not block queries. 
During updates, the old version of a dictionary is used. If an error occurs during an update, the error is written to the server log, and queries continue using the old version of dictionaries. + +You can view the list of external dictionaries and their status in the system.dictionaries table. + +To use external dictionaries, see the section "Functions for working with external dictionaries". + +Note that you can convert values for a small dictionary by specifying all the contents of the dictionary directly in a ``SELECT`` query (see the section "transform function"). This functionality is not related to external dictionaries. + +Dictionaries with complex keys +------------------------------ + +You can use tuples consisting of fields of arbitrary types as keys. Configure your dictionary with ``complex_key_hashed`` or ``complex_key_cache`` layout in this case. + +Key structure is configured not in the ``<id>`` element but in the ``<key>`` element. Fields of the key tuple are configured analogously to dictionary attributes. Example: + +.. code-block:: xml + + + + + field1 + String + + + field2 + UInt32 + + ... + + ... + + +When using such a dictionary, use a Tuple of field values as a key in dictGet* functions. Example: ``dictGetString('dict_name', 'attr_name', tuple('field1_value', 123))``. diff --git a/docs/en/dicts/index.rst b/docs/en/dicts/index.rst new file mode 100644 index 00000000000..4ed265c26c4 --- /dev/null +++ b/docs/en/dicts/index.rst @@ -0,0 +1,11 @@ +Dictionaries +============ + +A dictionary is a mapping (key -> attributes) that can be used in a query as functions. You can think of this as a more convenient and efficient type of JOIN with dimension tables. + +There are built-in (internal) and add-on (external) dictionaries. + +..
toctree:: + :glob: + + * diff --git a/docs/en/dicts/internal_dicts.rst b/docs/en/dicts/internal_dicts.rst new file mode 100644 index 00000000000..18a15c0245b --- /dev/null +++ b/docs/en/dicts/internal_dicts.rst @@ -0,0 +1,45 @@ +Internal dictionaries +--------------------- + +ClickHouse contains a built-in feature for working with a geobase. + +This allows you to: + * Use a region's ID to get its name in the desired language. + * Use a region's ID to get the ID of a city, area, federal district, country, or continent. + * Check whether a region is part of another region. + * Get a chain of parent regions. + +All the functions support "translocality," the ability to simultaneously use different perspectives on region ownership. For more information, see the section "Functions for working with Yandex.Metrica dictionaries". + +The internal dictionaries are disabled in the default package. +To enable them, uncomment the parameters ``path_to_regions_hierarchy_file`` and ``path_to_regions_names_files`` in the server config file. + +The geobase is loaded from text files. +If you are a Yandex employee, use the following instructions to create them: +https://github.yandex-team.ru/raw/Metrika/ClickHouse_private/master/doc/create_embedded_geobase_dictionaries.txt + +Put the regions_hierarchy*.txt files in the path_to_regions_hierarchy_file directory. This configuration parameter must contain the path to the regions_hierarchy.txt file (the default regional hierarchy), and the other files (regions_hierarchy_ua.txt) must be located in the same directory. + +Put the regions_names_*.txt files in the path_to_regions_names_files directory. + +You can also create these files yourself. The file format is as follows: + +``regions_hierarchy*.txt``: TabSeparated (no header), columns: + * Region ID (UInt32) + * Parent region ID (UInt32) + * Region type (UInt8): 1 - continent, 3 - country, 4 - federal district, 5 - region, 6 - city; other types don't have values.
+ * Population (UInt32) - Optional column. + +``regions_names_*.txt``: TabSeparated (no header), columns: + * Region ID (UInt32) + * Region name (String) - Can't contain tabs or line breaks, even escaped ones. + +A flat array is used for storing in RAM. For this reason, IDs shouldn't be more than a million. + +Dictionaries can be updated without the server restart. However, the set of available dictionaries is not updated. For updates, the file modification times are checked. If a file has changed, the dictionary is updated. +The interval to check for changes is configured in the 'builtin_dictionaries_reload_interval' parameter. +Dictionary updates (other than loading at first use) do not block queries. During updates, queries use the old versions of dictionaries. If an error occurs during an update, the error is written to the server log, while queries continue using the old version of dictionaries. + +We recommend periodically updating the dictionaries with the geobase. During an update, generate new files and write them to a separate location. When everything is ready, rename them to the files used by the server. + +There are also functions for working with OS identifiers and Yandex.Metrica search engines, but they shouldn't be used. diff --git a/docs/en/external_data.rst b/docs/en/external_data.rst new file mode 100644 index 00000000000..6021cdecb2c --- /dev/null +++ b/docs/en/external_data.rst @@ -0,0 +1,56 @@ +External data for query processing +==================================== + +ClickHouse allows sending a server the data that is needed for processing a query, together with a SELECT query. This data is put in a temporary table (see the section "Temporary tables") and can be used in the query (for example, in IN operators). + +For example, if you have a text file with important user identifiers, you can upload it to the server along with a query that uses filtration by this list. 
+ +If you need to run more than one query with a large volume of external data, don't use this feature. It is better to upload the data to the DB ahead of time. + +External data can be uploaded using the command-line client (in non-interactive mode), or using the HTTP interface. + +In the command-line client, you can specify a parameters section in the format +:: + --external --file=... [--name=...] [--format=...] [--types=...|--structure=...] + +You may have multiple sections like this, for the number of tables being transmitted. + +**--external** - Marks the beginning of the section. +**--file** - Path to the file with the table dump, or ``-``, which refers to stdin +Only a single table can be retrieved from stdin. + +The following parameters are optional: +**--name** - Name of the table. If omitted, ``_data`` is used. +**--format** - Data format in the file. If omitted, ``TabSeparated`` is used. + +One of the following parameters is required: +**--types** - A comma-separated list of column types. For example, ``UInt64,String``. Columns will be named ``_1``, ``_2``, ... +**--structure** - Table structure, in the format ``UserID UInt64, URL String``. Defines the column names and types. + +The files specified in ``file`` will be parsed by the format specified in ``format``, using the data types specified in ``types`` or ``structure``. The table will be uploaded to the server and accessible there as a temporary table with the name ``name``. 
+ +Examples: +:: + echo -ne "1\n2\n3\n" | clickhouse-client --query="SELECT count() FROM test.visits WHERE TraficSourceID IN _data" --external --file=- --types=Int8 + 849897 + cat /etc/passwd | sed 's/:/\t/g' | clickhouse-client --query="SELECT shell, count() AS c FROM passwd GROUP BY shell ORDER BY c DESC" --external --file=- --name=passwd --structure='login String, unused String, uid UInt16, gid UInt16, comment String, home String, shell String' + /bin/sh 20 + /bin/false 5 + /bin/bash 4 + /usr/sbin/nologin 1 + /bin/sync 1 + +When using the HTTP interface, external data is passed in the multipart/form-data format. Each table is transmitted as a separate file. The table name is taken from the file name. The 'query_string' passes the parameters 'name_format', 'name_types', and 'name_structure', where name is the name of the table that these parameters correspond to. The meaning of the parameters is the same as when using the command-line client. + +Example: +:: + cat /etc/passwd | sed 's/:/\t/g' > passwd.tsv + + curl -F 'passwd=@passwd.tsv;' 'http://localhost:8123/?query=SELECT+shell,+count()+AS+c+FROM+passwd+GROUP+BY+shell+ORDER+BY+c+DESC&passwd_structure=login+String,+unused+String,+uid+UInt16,+gid+UInt16,+comment+String,+home+String,+shell+String' + /bin/sh 20 + /bin/false 5 + /bin/bash 4 + /usr/sbin/nologin 1 + /bin/sync 1 + +For distributed query processing, the temporary tables are sent to all the remote servers. diff --git a/docs/en/formats/blocktabseparated.rst b/docs/en/formats/blocktabseparated.rst new file mode 100644 index 00000000000..989804dffff --- /dev/null +++ b/docs/en/formats/blocktabseparated.rst @@ -0,0 +1,9 @@ +BlockTabSeparated +----------------- + +Data is not written by row, but by column and block. +Each block consists of parts of columns, each of which is written on a separate line. +The values are tab-separated. The last value in a column part is followed by a line break instead of a tab. +Blocks are separated by a double line break. 
+The rest of the rules are the same as in the TabSeparated format. +This format is only appropriate for outputting a query result, not for parsing. diff --git a/docs/en/formats/csv.rst b/docs/en/formats/csv.rst new file mode 100644 index 00000000000..5f3d1133a7a --- /dev/null +++ b/docs/en/formats/csv.rst @@ -0,0 +1,10 @@ +CSV +---- + +Comma separated values (`RFC <https://tools.ietf.org/html/rfc4180>`_). + +String values are output in double quotes. A double quote inside a string is output as two consecutive double quotes. These are all the escaping rules. Date and DateTime values are output in double quotes. Numbers are output without quotes. Fields are delimited by commas. Rows are delimited by unix newlines (LF). Arrays are output in the following way: first, the array is serialized to a String (as in TabSeparated or Values formats), and then the String value is output in double quotes. Tuples are narrowed and serialized as separate columns. + +During parsing, values could be enclosed or not enclosed in quotes. Both single and double quotes are supported. In particular, Strings could be represented without quotes - in that case, they are parsed up to comma or newline (CR or LF). Contrary to RFC, in case of parsing strings without quotes, leading and trailing spaces and tabs are ignored. As the line delimiter, the Unix (LF), Windows (CR LF) and Mac OS Classic (LF CR) variants are all supported. + +CSV format supports output of totals and extremes similar to TabSeparated format. diff --git a/docs/en/formats/csvwithnames.rst b/docs/en/formats/csvwithnames.rst new file mode 100644 index 00000000000..524adbccf37 --- /dev/null +++ b/docs/en/formats/csvwithnames.rst @@ -0,0 +1,4 @@ +CSVWithNames +------------ + +Also contains header, similar to ``TabSeparatedWithNames``.
diff --git a/docs/en/formats/index.rst b/docs/en/formats/index.rst new file mode 100644 index 00000000000..2947bd2d93e --- /dev/null +++ b/docs/en/formats/index.rst @@ -0,0 +1,9 @@ +Formats +======= + +The format determines how data is given (written by server as output) to you after SELECTs, and how it is accepted (read by server as input) for INSERTs. + +.. toctree:: + :glob: + + * diff --git a/docs/en/formats/json.rst b/docs/en/formats/json.rst new file mode 100644 index 00000000000..b02a8ebcacb --- /dev/null +++ b/docs/en/formats/json.rst @@ -0,0 +1,85 @@ +JSON +----- + +Outputs data in JSON format. Besides data tables, it also outputs column names and types, along with some additional information - the total number of output rows, and the number of rows that could have been output if there weren't a LIMIT. Example: + +.. code-block:: sql + + SELECT SearchPhrase, count() AS c FROM test.hits GROUP BY SearchPhrase WITH TOTALS ORDER BY c DESC LIMIT 5 FORMAT JSON + + { + "meta": + [ + { + "name": "SearchPhrase", + "type": "String" + }, + { + "name": "c", + "type": "UInt64" + } + ], + + "data": + [ + { + "SearchPhrase": "", + "c": "8267016" + }, + { + "SearchPhrase": "интерьер ванной комнаты", + "c": "2166" + }, + { + "SearchPhrase": "яндекс", + "c": "1655" + }, + { + "SearchPhrase": "весна 2014 мода", + "c": "1549" + }, + { + "SearchPhrase": "фриформ фото", + "c": "1480" + } + ], + + "totals": + { + "SearchPhrase": "", + "c": "8873898" + }, + + "extremes": + { + "min": + { + "SearchPhrase": "", + "c": "1480" + }, + "max": + { + "SearchPhrase": "", + "c": "8267016" + } + }, + + "rows": 5, + + "rows_before_limit_at_least": 141137 + } + +JSON is compatible with JavaScript. For this purpose, certain symbols are additionally escaped: the forward slash ``/`` is escaped as ``\/``; alternative line breaks ``U+2028`` and ``U+2029``, which don't work in some browsers, are escaped as \uXXXX-sequences. 
ASCII control characters are escaped: backspace, form feed, line feed, carriage return, and horizontal tab as ``\b``, ``\f``, ``\n``, ``\r``, and ``\t`` respectively, along with the rest of the bytes from the range 00-1F using \uXXXX-sequences. Invalid UTF-8 sequences are changed to the replacement character ``�`` and, thus, the output text will consist of valid UTF-8 sequences. UInt64 and Int64 numbers are output in double quotes for compatibility with JavaScript. + +``rows`` - The total number of output rows. + +``rows_before_limit_at_least`` - The minimal number of rows there would have been without a LIMIT. Output only if the query contains LIMIT. + +If the query contains GROUP BY, ``rows_before_limit_at_least`` is the exact number of rows there would have been without a LIMIT. + +``totals`` - Total values (when using WITH TOTALS). + +``extremes`` - Extreme values (when extremes is set to 1). + +This format is only appropriate for outputting a query result, not for parsing. +See JSONEachRow format for INSERT queries. diff --git a/docs/en/formats/jsoncompact.rst b/docs/en/formats/jsoncompact.rst new file mode 100644 index 00000000000..30303904b28 --- /dev/null +++ b/docs/en/formats/jsoncompact.rst @@ -0,0 +1,44 @@ +JSONCompact +----------- + +Differs from ``JSON`` only in that data rows are output in arrays, not in objects. + +Example: +:: + { + "meta": + [ + { + "name": "SearchPhrase", + "type": "String" + }, + { + "name": "c", + "type": "UInt64" + } + ], + + "data": + [ + ["", "8267016"], + ["bath interiors", "2166"], + ["yandex", "1655"], + ["spring 2014 fashion", "1549"], + ["freeform photo", "1480"] + ], + + "totals": ["","8873898"], + + "extremes": + { + "min": ["","1480"], + "max": ["","8267016"] + }, + + "rows": 5, + + "rows_before_limit_at_least": 141137 + } + +This format is only appropriate for outputting a query result, not for parsing. +See ``JSONEachRow`` format for INSERT queries. 
diff --git a/docs/en/formats/jsoneachrow.rst b/docs/en/formats/jsoneachrow.rst new file mode 100644 index 00000000000..b13d9f378d9 --- /dev/null +++ b/docs/en/formats/jsoneachrow.rst @@ -0,0 +1,23 @@ +JSONEachRow +----------- + +If put in SELECT query, displays data in newline delimited JSON (JSON objects separated by \\n character) format. +If put in INSERT query, expects this kind of data as input. +:: + {"SearchPhrase":"","count()":"8267016"} + {"SearchPhrase":"bathroom interior","count()":"2166"} + {"SearchPhrase":"yandex","count()":"1655"} + {"SearchPhrase":"spring 2014 fashion","count()":"1549"} + {"SearchPhrase":"free-form photo","count()":"1480"} + {"SearchPhrase":"Angelina Jolie","count()":"1245"} + {"SearchPhrase":"omsk","count()":"1112"} + {"SearchPhrase":"photos of dog breeds","count()":"1091"} + {"SearchPhrase":"curtain design","count()":"1064"} + {"SearchPhrase":"baku","count()":"1000"} + +Unlike JSON format, there are no replacements of invalid UTF-8 sequences. There can be arbitrary amount of bytes in a line. +This is done in order to avoid data loss during formatting. Values are displayed analogous to JSON format. + +In INSERT queries JSON data can be supplied with arbitrary order of columns (JSON key-value pairs). It is also possible to omit values in which case the default value of the column is inserted. N.B. when using JSONEachRow format, complex default values are not supported, so when omitting a column its value will be zeros or empty string depending on its type. + +Space characters between JSON objects are skipped. Between objects there can be a comma which is ignored. Newline character is not a mandatory separator for objects. diff --git a/docs/en/formats/native.rst b/docs/en/formats/native.rst new file mode 100644 index 00000000000..bea3eae72d8 --- /dev/null +++ b/docs/en/formats/native.rst @@ -0,0 +1,6 @@ +Native +------ + +The most efficient format. Data is written and read by blocks in binary format. 
For each block, the number of rows, number of columns, column names and types, and parts of columns in this block are recorded one after another. In other words, this format is "columnar" - it doesn't convert columns to rows. This is the format used in the native interface for interaction between servers, for using the command-line client, and for C++ clients. + +You can use this format to quickly generate dumps that can only be read by the ClickHouse DBMS. It doesn't make sense to work with this format yourself. diff --git a/docs/en/formats/null.rst b/docs/en/formats/null.rst new file mode 100644 index 00000000000..a3ea6d40e51 --- /dev/null +++ b/docs/en/formats/null.rst @@ -0,0 +1,4 @@ +Null +---- + +Nothing is output. However, the query is processed, and when using the command-line client, data is transmitted to the client. This is used for tests, including productivity testing. Obviously, this format is only appropriate for outputting a query result, not for parsing. diff --git a/docs/en/formats/pretty.rst b/docs/en/formats/pretty.rst new file mode 100644 index 00000000000..daefa8f8da0 --- /dev/null +++ b/docs/en/formats/pretty.rst @@ -0,0 +1,34 @@ +Pretty +------ + +Writes data as Unicode-art tables, also using ANSI-escape sequences for setting colors in the terminal. +A full grid of the table is drawn, and each row occupies two lines in the terminal. Each result block is output as a separate table. This is necessary so that blocks can be output without buffering results (buffering would be necessary in order to pre-calculate the visible width of all the values). +To avoid dumping too much data to the terminal, only the first 10,000 rows are printed. If the number of rows is greater than or equal to 10,000, the message "Showed first 10,000" is printed. +This format is only appropriate for outputting a query result, not for parsing. + +The Pretty format supports outputting total values (when using WITH TOTALS) and extremes (when 'extremes' is set to 1). 
In these cases, total values and extreme values are output after the main data, in separate tables. Example (shown for the PrettyCompact format): + +.. code-block:: sql + + SELECT EventDate, count() AS c FROM test.hits GROUP BY EventDate WITH TOTALS ORDER BY EventDate FORMAT PrettyCompact + + ┌──EventDate─┬───────c─┐ + │ 2014-03-17 │ 1406958 │ + │ 2014-03-18 │ 1383658 │ + │ 2014-03-19 │ 1405797 │ + │ 2014-03-20 │ 1353623 │ + │ 2014-03-21 │ 1245779 │ + │ 2014-03-22 │ 1031592 │ + │ 2014-03-23 │ 1046491 │ + └────────────┴─────────┘ + + Totals: + ┌──EventDate─┬───────c─┐ + │ 0000-00-00 │ 8873898 │ + └────────────┴─────────┘ + + Extremes: + ┌──EventDate─┬───────c─┐ + │ 2014-03-17 │ 1031592 │ + │ 2014-03-23 │ 1406958 │ + └────────────┴─────────┘ diff --git a/docs/en/formats/prettycompact.rst b/docs/en/formats/prettycompact.rst new file mode 100644 index 00000000000..bae790f1aaf --- /dev/null +++ b/docs/en/formats/prettycompact.rst @@ -0,0 +1,4 @@ +PrettyCompact +------------- + +Differs from ``Pretty`` in that the grid is drawn between rows and the result is more compact. This format is used by default in the command-line client in interactive mode. diff --git a/docs/en/formats/prettycompactmonoblock.rst b/docs/en/formats/prettycompactmonoblock.rst new file mode 100644 index 00000000000..35e18b045b1 --- /dev/null +++ b/docs/en/formats/prettycompactmonoblock.rst @@ -0,0 +1,4 @@ +PrettyCompactMonoBlock +---------------------- + +Differs from ``PrettyCompact`` in that up to 10,000 rows are buffered, then output as a single table, not by blocks. diff --git a/docs/en/formats/prettynoescapes.rst b/docs/en/formats/prettynoescapes.rst new file mode 100644 index 00000000000..954485f4d6f --- /dev/null +++ b/docs/en/formats/prettynoescapes.rst @@ -0,0 +1,18 @@ +PrettyNoEscapes +--------------- + +Differs from Pretty in that ANSI-escape sequences aren't used. This is necessary for displaying this format in a browser, as well as for using the 'watch' command-line utility. 
+ +Example: +:: + watch -n1 "clickhouse-client --query='SELECT * FROM system.events FORMAT PrettyCompactNoEscapes'" + +You can use the HTTP interface for displaying in the browser. + +PrettyCompactNoEscapes +---------------------- +The same. + +PrettySpaceNoEscapes +-------------------- +The same. diff --git a/docs/en/formats/prettyspace.rst b/docs/en/formats/prettyspace.rst new file mode 100644 index 00000000000..3d8aec8a934 --- /dev/null +++ b/docs/en/formats/prettyspace.rst @@ -0,0 +1,4 @@ +PrettySpace +----------- + +Differs from ``PrettyCompact`` in that whitespace (space characters) is used instead of the grid. diff --git a/docs/en/formats/rowbinary.rst b/docs/en/formats/rowbinary.rst new file mode 100644 index 00000000000..b87a27706a8 --- /dev/null +++ b/docs/en/formats/rowbinary.rst @@ -0,0 +1,13 @@ +RowBinary +--------- + +Writes data by row in binary format. Rows and values are listed consecutively, without separators. +This format is less efficient than the Native format, since it is row-based. + +Numbers are written in little-endian, fixed-width form. For example, UInt64 takes 8 bytes. +DateTime is written as a UInt32 holding the unix timestamp value. +Date is written as a UInt16 holding the number of days since 1970-01-01. +String is written as its length in varint (unsigned `LEB128 <https://en.wikipedia.org/wiki/LEB128>`_) format, followed by the bytes of the string. +FixedString is written as just its bytes. +Array is written as its length in varint (unsigned `LEB128 <https://en.wikipedia.org/wiki/LEB128>`_) format, followed by all elements, contiguously. + diff --git a/docs/en/formats/tabseparated.rst b/docs/en/formats/tabseparated.rst new file mode 100644 index 00000000000..685251a050c --- /dev/null +++ b/docs/en/formats/tabseparated.rst @@ -0,0 +1,55 @@ +TabSeparated +------------ + +In TabSeparated format, data is written by row. Each row contains values separated by tabs. Each value is followed by a tab, except the last value in the row, which is followed by a line break. Strictly Unix line breaks are assumed everywhere.
The last row also must contain a line break at the end. Values are written in text format, without enclosing quotation marks, and with special characters escaped. + +Numbers are written in decimal form. Numbers may contain an extra "+" symbol at the beginning (but it is not recorded during an output). Non-negative numbers can't contain the negative sign. When parsing, it is allowed to parse an empty string as a zero, or (for signed types) a string consisting of just a minus sign as a zero. Numbers that do not fit into the corresponding data type may be parsed as a different number, without an error message. + +Floating-point numbers are formatted in decimal form. The dot is used as the decimal separator. Exponential entries are supported, as are 'inf', '+inf', '-inf', and 'nan'. An entry of floating-point numbers may begin or end with a decimal point. +During formatting, accuracy may be lost on floating-point numbers. +During parsing, a result is not necessarily the nearest machine-representable number. + +Dates are formatted in YYYY-MM-DD format and parsed in the same format, but with any characters as separators. +DateTimes are formatted in the format YYYY-MM-DD hh:mm:ss and parsed in the same format, but with any characters as separators. +This all occurs in the system time zone at the time the client or server starts (depending on which one formats data). For DateTimes, daylight saving time is not specified. So if a dump has times during daylight saving time, the dump does not unequivocally match the data, and parsing will select one of the two times. +During a parsing operation, incorrect dates and dates with times can be parsed with natural overflow or as null dates and times, without an error message. + +As an exception, parsing DateTime is also supported in Unix timestamp format, if it consists of exactly 10 decimal digits. The result is not time zone-dependent. The formats ``YYYY-MM-DD hh:mm:ss`` and ``NNNNNNNNNN`` are differentiated automatically. 
+ +Strings are parsed and formatted with backslash-escaped special characters. The following escape sequences are used while formatting: ``\b``, ``\f``, ``\r``, ``\n``, ``\t``, ``\0``, ``\'``, and ``\\``. For parsing, also supported \a, \v and \xHH (hex escape sequence) and any sequences of the type \c where c is any character (these sequences are converted to c). This means that parsing supports formats where a line break can be written as \n or as \ and a line break. For example, the string 'Hello world' with a line break between the words instead of a space can be retrieved in any of the following variations: +:: + Hello\nworld + + Hello\ + world + +The second variant is supported because MySQL uses it when writing tab-separated dumps. + +Only a small set of symbols are escaped. You can easily stumble onto a string value that your terminal will ruin in output. + +Minimum set of symbols that you must escape in TabSeparated format is tab, newline (LF) and backslash. + +Arrays are formatted as a list of comma-separated values in square brackets. Number items in the array are formatted as normally, but dates, dates with times, and strings are formatted in single quotes with the same escaping rules as above. + +The TabSeparated format is convenient for processing data using custom programs and scripts. It is used by default in the HTTP interface, and in the command-line client's batch mode. This format also allows transferring data between different DBMSs. For example, you can get a dump from MySQL and upload it to ClickHouse, or vice versa. + +The TabSeparated format supports outputting total values (when using WITH TOTALS) and extreme values (when 'extremes' is set to 1). In these cases, the total values and extremes are output after the main data. The main result, total values, and extremes are separated from each other by an empty line. 
Example: + +``SELECT EventDate, count() AS c FROM test.hits GROUP BY EventDate WITH TOTALS ORDER BY EventDate FORMAT TabSeparated`` + +.. code-block:: sql + + 2014-03-17 1406958 + 2014-03-18 1383658 + 2014-03-19 1405797 + 2014-03-20 1353623 + 2014-03-21 1245779 + 2014-03-22 1031592 + 2014-03-23 1046491 + + 0000-00-00 8873898 + + 2014-03-17 1031592 + 2014-03-23 1406958 + +It's also available as ``TSV``. diff --git a/docs/en/formats/tabseparatedraw.rst b/docs/en/formats/tabseparatedraw.rst new file mode 100644 index 00000000000..9b8db1346e1 --- /dev/null +++ b/docs/en/formats/tabseparatedraw.rst @@ -0,0 +1,7 @@ +TabSeparatedRaw +--------------- + +Differs from the ``TabSeparated`` format in that the rows are formatted without escaping. +This format is only appropriate for outputting a query result, but not for parsing data to insert into a table. + +It's also available as ``TSVRaw``. diff --git a/docs/en/formats/tabseparatedwithnames.rst b/docs/en/formats/tabseparatedwithnames.rst new file mode 100644 index 00000000000..17c75719bef --- /dev/null +++ b/docs/en/formats/tabseparatedwithnames.rst @@ -0,0 +1,8 @@ +TabSeparatedWithNames +--------------------- + +Differs from the TabSeparated format in that the column names are output in the first row. +For parsing, the first row is completely ignored. You can't use column names to determine their position or to check their correctness. +(Support for using header while parsing could be added in future.) + +It's also available as ``TSVWithNames``. diff --git a/docs/en/formats/tabseparatedwithnamesandtypes.rst b/docs/en/formats/tabseparatedwithnamesandtypes.rst new file mode 100644 index 00000000000..b903dcb339a --- /dev/null +++ b/docs/en/formats/tabseparatedwithnamesandtypes.rst @@ -0,0 +1,7 @@ +TabSeparatedWithNamesAndTypes +----------------------------- + +Differs from the ``TabSeparated`` format in that the column names are output to the first row, while the column types are in the second row. 
+For parsing, the first and second rows are completely ignored. + +It's also available as ``TSVWithNamesAndTypes``. diff --git a/docs/en/formats/tskv.rst b/docs/en/formats/tskv.rst new file mode 100644 index 00000000000..2a86d63ebea --- /dev/null +++ b/docs/en/formats/tskv.rst @@ -0,0 +1,19 @@ +TSKV +----- + +Similar to TabSeparated, but displays data in name=value format. Names are displayed just as in TabSeparated. Additionally, a ``=`` symbol is displayed. +:: + SearchPhrase= count()=8267016 + SearchPhrase=bathroom interior count()=2166 + SearchPhrase=yandex count()=1655 + SearchPhrase=spring 2014 fashion count()=1549 + SearchPhrase=free-form photo count()=1480 + SearchPhrase=Angelina Jolie count()=1245 + SearchPhrase=omsk count()=1112 + SearchPhrase=photos of dog breeds count()=1091 + SearchPhrase=curtain design count()=1064 + SearchPhrase=baku count()=1000 + +In case of many small columns this format is obviously not effective and there usually is no reason to use it. This format is supported because it is used for some cases in Yandex. + +Format is supported both for input and output. In INSERT queries data can be supplied with arbitrary order of columns. It is also possible to omit values in which case the default value of the column is inserted. N.B. when using TSKV format, complex default values are not supported, so when omitting a column its value will be zeros or empty string depending on its type. diff --git a/docs/en/formats/values.rst b/docs/en/formats/values.rst new file mode 100644 index 00000000000..51fc6716325 --- /dev/null +++ b/docs/en/formats/values.rst @@ -0,0 +1,9 @@ +Values +------ + +Prints every row in parentheses. Rows are separated by commas. There is no comma after the last row. The values inside the parentheses are also comma-separated. Numbers are output in decimal format without quotes. Arrays are output in square brackets. Strings, dates, and dates with times are output in quotes. 
Escaping rules and parsing are same as in the TabSeparated format. During formatting, extra spaces aren't inserted, but during parsing, they are allowed and skipped (except for spaces inside array values, which are not allowed). + +Minimum set of symbols that you must escape in Values format is single quote and backslash. + +This is the format that is used in ``INSERT INTO t VALUES`` ... +But you can also use it for query result. diff --git a/docs/en/formats/vertical.rst b/docs/en/formats/vertical.rst new file mode 100644 index 00000000000..fa09851255e --- /dev/null +++ b/docs/en/formats/vertical.rst @@ -0,0 +1,5 @@ +Vertical +-------- + +Prints each value on a separate line with the column name specified. This format is convenient for printing just one or a few rows, if each row consists of a large number of columns. +This format is only appropriate for outputting a query result, not for parsing. diff --git a/docs/en/formats/xml.rst b/docs/en/formats/xml.rst new file mode 100644 index 00000000000..ed6145d0a1c --- /dev/null +++ b/docs/en/formats/xml.rst @@ -0,0 +1,74 @@ +XML +---- + +XML format is supported only for displaying data, not for INSERTS. Example: + +.. code-block:: xml + + <?xml version='1.0' encoding='UTF-8' ?> + <result> + <meta> + <columns> + <column> + <name>SearchPhrase</name> + <type>String</type> + </column> + <column> + <name>count()</name> + <type>UInt64</type> + </column> + </columns> + </meta> + <data> + <row> + <SearchPhrase></SearchPhrase> + <field>8267016</field> + </row> + <row> + <SearchPhrase>bathroom interior</SearchPhrase> + <field>2166</field> + </row> + <row> + <SearchPhrase>yandex</SearchPhrase> + <field>1655</field> + </row> + <row> + <SearchPhrase>spring 2014 fashion</SearchPhrase> + <field>1549</field> + </row> + <row> + <SearchPhrase>free-form photo</SearchPhrase> + <field>1480</field> + </row> + <row> + <SearchPhrase>Angelina Jolie</SearchPhrase> + <field>1245</field> + </row> + <row> + <SearchPhrase>omsk</SearchPhrase> + <field>1112</field> + </row> + <row> + <SearchPhrase>photos of dog breeds</SearchPhrase> + <field>1091</field> + </row> + <row> + <SearchPhrase>curtain design</SearchPhrase> + <field>1064</field> + </row> + <row> + <SearchPhrase>baku</SearchPhrase> + <field>1000</field> + </row> + </data> + <rows>10</rows> + <rows_before_limit_at_least>141137</rows_before_limit_at_least> + </result> + +If name of a column contains some unacceptable character, field is used as a name. In other aspects XML uses JSON structure. +As in case of JSON, invalid UTF-8 sequences are replaced by replacement character � so displayed text will only contain valid UTF-8 sequences. + +In string values ``<`` and ``&`` are displayed as ``&lt;`` and ``&amp;``. + +Arrays are displayed as ``<array><elem>Hello</elem><elem>World</elem>...</array>``, +and tuples as ``<tuple><elem>Hello</elem><elem>World</elem>...</tuple>``.
diff --git a/docs/en/functions/arithmetic_functions.rst b/docs/en/functions/arithmetic_functions.rst new file mode 100644 index 00000000000..9e26a8e1a60 --- /dev/null +++ b/docs/en/functions/arithmetic_functions.rst @@ -0,0 +1,64 @@ +Arithmetic functions +====================== + +For all arithmetic functions, the result type is calculated as the smallest number type that the result fits in, if there is such a type. The minimum is taken simultaneously based on the number of bits, whether it is signed, and whether it is floating-point. If there are not enough bits, the type with the largest number of bits is taken. + +Example + +.. code-block:: sql + + :) SELECT toTypeName(0), toTypeName(0 + 0), toTypeName(0 + 0 + 0), toTypeName(0 + 0 + 0 + 0) + + ┌─toTypeName(0)─┬─toTypeName(plus(0, 0))─┬─toTypeName(plus(plus(0, 0), 0))─┬─toTypeName(plus(plus(plus(0, 0), 0), 0))─┐ + │ UInt8 │ UInt16 │ UInt32 │ UInt64 │ + └───────────────┴────────────────────────┴─────────────────────────────────┴──────────────────────────────────────────┘ + +Arithmetic functions work for any pair of types from UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float32, or Float64. + +Overflow is produced the same way as in C++. + + +plus(a, b), a + b operator +-------------------------- +Calculates the sum of the numbers. +You can also add whole numbers to a date or a date with time. In the case of a date, adding a whole number means adding the corresponding number of days. For a date with time, it means adding the corresponding number of seconds. + +minus(a, b), a - b operator +--------------------------- +Calculates the difference. The result is always signed. + +You can also subtract whole numbers from a date or a date with time. The idea is the same - see above for 'plus'. + +multiply(a, b), a * b operator +------------------------------ +Calculates the product of the numbers. + +divide(a, b), a / b operator +----------------------------- +Calculates the quotient of the numbers.
The result type is always a floating-point type. +It is not integer division. For integer division, use the 'intDiv' function. +When dividing by zero you get 'inf', '-inf', or 'nan'. + +intDiv(a, b) +------------ +Calculates the quotient of the numbers. Divides into integers, rounding down (by the absolute value). +When dividing by zero or when dividing a minimal negative number by minus one, an exception is thrown. + +intDivOrZero(a, b) +------------------ +Differs from 'intDiv' in that it returns zero when dividing by zero or when dividing a minimal negative number by minus one. + +modulo(a, b), a % b operator +---------------------------- +Calculates the remainder after division. +If arguments are floating-point numbers, they are pre-converted to integers by dropping the decimal portion. The remainder is taken in the same sense as in C++. Truncated division is used for negative numbers. +An exception is thrown when dividing by zero or when dividing a minimal negative number by minus one. + +negate(a), -a operator +---------------------- +Calculates a number with the reverse sign. The result is always signed. + +abs(a) +------ +Calculates the absolute value of the number 'a'. That is, if a< 0, it returns -a. +For unsigned types, it doesn't do anything. For signed integer types, it returns an unsigned number. diff --git a/docs/en/functions/array_functions.rst b/docs/en/functions/array_functions.rst new file mode 100644 index 00000000000..bf557d1af2e --- /dev/null +++ b/docs/en/functions/array_functions.rst @@ -0,0 +1,176 @@ +Functions for working with arrays +----------------------------- + +empty +~~~~~ +Returns 1 for an empty array, or 0 for a non-empty array. +The result type is UInt8. +The function also works for strings. + +notEmpty +~~~~~~~~ +Returns 0 for an empty array, or 1 for a non-empty array. +The result type is UInt8. +The function also works for strings. + +length +~~~~~~ +Returns the number of items in the array. +The result type is UInt64. 
+The function also works for strings. + +emptyArrayUInt8, emptyArrayUInt16, emptyArrayUInt32, emptyArrayUInt64 +~~~~~~~~~~~~~~ + +emptyArrayInt8, emptyArrayInt16, emptyArrayInt32, emptyArrayInt64 +~~~~~~~~~~~~~~~ + +emptyArrayFloat32, emptyArrayFloat64 +~~~~~~~~~~~~~~~ + +emptyArrayDate, emptyArrayDateTime +~~~~~~~~~~~~~~ + +emptyArrayString +~~~~~~~~~~~~ +Accepts zero arguments and returns an empty array of the appropriate type. + +emptyArrayToSingle +~~~~~~~~~~~~~~ +Accepts an empty array as argument and returns an array of one element equal to the default value. + +range(N) +~~~~~~~ +Returns an array of numbers from 0 to N-1. +Just in case, an exception is thrown if arrays with a total length of more than 100,000,000 elements are created in a data block. + +array(x1, ...), operator [x1, ...] +~~~~~~~~~~~~ +Creates an array from the function arguments. +The arguments must be constants and have types that have the smallest common type. At least one argument must be passed, because otherwise it isn't clear which type of array to create. That is, you can't use this function to create an empty array (to do that, use the 'emptyArray*' function described above). +Returns an 'Array(T)' type result, where 'T' is the smallest common type out of the passed arguments. + +arrayElement(arr, n), operator arr[n] +~~~~~~~~~~~~ +Get the element with the index 'n' from the array 'arr'. +'n' should be any integer type. +Indexes in an array begin from one. +Negative indexes are supported - in this case, it selects the corresponding element numbered from the end. For example, 'arr[-1]' is the last item in the array. + +If the index goes beyond the array bounds: +- if both arguments are constants, an exception is thrown. +- otherwise, a default value is returned (0 for numbers, an empty string for strings, etc.). + +has(arr, elem) +~~~~~~~~~~~ +Checks whether the 'arr' array has the 'elem' element. +Returns 0 if the element is not in the array, or 1 if it is.
+'elem' must be a constant. + +indexOf(arr, x) +~~~~~~~~~~ +Returns the index of the 'x' element (starting from 1) if it is in the array, or 0 if it is not. + +countEqual(arr, x) +~~~~~~~~ +Returns the number of elements in the array equal to 'x'. Equivalent to ``arrayCount(elem -> elem = x, arr)``. + +arrayEnumerate(arr) +~~~~~~~~~ +Returns the array ``[1, 2, 3, ..., length(arr)]`` + +This function is normally used together with ARRAY JOIN. It allows counting something just once for each array after applying ARRAY JOIN. Example: + +.. code-block:: sql + + SELECT + count() AS Reaches, + countIf(num = 1) AS Hits + FROM test.hits + ARRAY JOIN + GoalsReached, + arrayEnumerate(GoalsReached) AS num + WHERE CounterID = 160656 + LIMIT 10 + + ┌─Reaches─┬──Hits─┐ + │ 95606 │ 31406 │ + └─────────┴───────┘ + +In this example, Reaches is the number of conversions (the rows received after applying ARRAY JOIN), and Hits is the number of pageviews (rows before ARRAY JOIN). In this particular case, you can get the same result in an easier way: + +.. code-block:: sql + + SELECT + sum(length(GoalsReached)) AS Reaches, + count() AS Hits + FROM test.hits + WHERE (CounterID = 160656) AND notEmpty(GoalsReached) + + ┌─Reaches─┬──Hits─┐ + │ 95606 │ 31406 │ + └─────────┴───────┘ + +This function can also be used in higher-order functions. For example, you can use it to get array indexes for elements that match a condition. + +arrayEnumerateUniq(arr, ...) +~~~~~~~~~~ +Returns an array the same size as the source array, indicating for each element what its position is among elements with the same value. +For example: ``arrayEnumerateUniq([10, 20, 10, 30]) = [1, 1, 2, 1]``. + +This function is useful when using ARRAY JOIN and aggregation of array elements. Example: + +..
code-block:: sql + + SELECT + Goals.ID AS GoalID, + sum(Sign) AS Reaches, + sumIf(Sign, num = 1) AS Visits + FROM test.visits + ARRAY JOIN + Goals, + arrayEnumerateUniq(Goals.ID) AS num + WHERE CounterID = 160656 + GROUP BY GoalID + ORDER BY Reaches DESC + LIMIT 10 + + ┌──GoalID─┬─Reaches─┬─Visits─┐ + │ 53225 │ 3214 │ 1097 │ + │ 2825062 │ 3188 │ 1097 │ + │ 56600 │ 2803 │ 488 │ + │ 1989037 │ 2401 │ 365 │ + │ 2830064 │ 2396 │ 910 │ + │ 1113562 │ 2372 │ 373 │ + │ 3270895 │ 2262 │ 812 │ + │ 1084657 │ 2262 │ 345 │ + │ 56599 │ 2260 │ 799 │ + │ 3271094 │ 2256 │ 812 │ + └─────────┴─────────┴────────┘ + +In this example, each goal ID has a calculation of the number of conversions (each element in the Goals nested data structure is a goal that was reached, which we refer to as a conversion) and the number of sessions. +Without ARRAY JOIN, we would have counted the number of sessions as ``sum(Sign)``. But in this particular case, the rows were multiplied by the nested Goals structure, so in order to count each session one time after this, +we apply a condition to the value of the ``arrayEnumerateUniq(Goals.ID)`` function. + +The arrayEnumerateUniq function can take multiple arrays of the same size as arguments. In this case, uniqueness is considered for tuples of elements in the same positions in all the arrays. + +.. code-block:: sql + + SELECT arrayEnumerateUniq([1, 1, 1, 2, 2, 2], [1, 1, 2, 1, 1, 2]) AS res + + ┌─res───────────┐ + │ [1,2,1,1,2,1] │ + └───────────────┘ + +This is necessary when using ARRAY JOIN with a nested data structure and further aggregation across multiple elements in this structure. + +arrayUniq(arr, ...) +~~~~~~~~~~~~~~~~~~~ +If a single array is passed, returns a number of unique elements in that array. +If multiple arrays of the same size are passed as arguments to the function, returns a number of unique tuples of elements in the same positions in all the arrays. 
+ +If you need an array of the unique elements, you can use ``arrayReduce('groupUniqArray', arr)``. + +arrayJoin(arr) +~~~~~~~~ +A special function. See the section "arrayJoin function". diff --git a/docs/en/functions/array_join.rst b/docs/en/functions/array_join.rst new file mode 100644 index 00000000000..4b78d1fe42e --- /dev/null +++ b/docs/en/functions/array_join.rst @@ -0,0 +1,30 @@ +arrayJoin function +--------------- +This is a very unusual function. + +Normal functions don't change a set of rows, but just change the values in each row (map). Aggregate functions compress a set of rows (fold or reduce). +The 'arrayJoin' function takes each row and generates a set of rows (unfold). + +This function takes an array as an argument, and propagates the source row to multiple rows for the number of elements in the array. +All the values in columns are simply copied, except the values in the column where this function is applied - it is replaced with the corresponding array value. + +A query can use multiple 'arrayJoin' functions. In this case, the transformation is performed multiple times. + +Note the ARRAY JOIN syntax in the SELECT query, which provides broader possibilities. + +Example: + +.. code-block:: sql + + :) SELECT arrayJoin([1, 2, 3] AS src) AS dst, 'Hello', src + + SELECT + arrayJoin([1, 2, 3] AS src) AS dst, + 'Hello', + src + + ┌─dst─┬─\'Hello\'─┬─src─────┐ + │ 1 │ Hello │ [1,2,3] │ + │ 2 │ Hello │ [1,2,3] │ + │ 3 │ Hello │ [1,2,3] │ + └─────┴───────────┴─────────┘ diff --git a/docs/en/functions/bit_functions.rst b/docs/en/functions/bit_functions.rst new file mode 100644 index 00000000000..410776fb596 --- /dev/null +++ b/docs/en/functions/bit_functions.rst @@ -0,0 +1,24 @@ +Bit functions +--------------- + +Bit functions work for any pair of types from UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float32, or Float64. + +The result type is an integer with bits equal to the maximum bits of its arguments. 
If at least one of the arguments is signed, the result is a signed number. If an argument is a floating-point number, it is cast to Int64. + +bitAnd(a, b) +~~~~~~~~~~~~ + +bitOr(a, b) +~~~~~~~~~~~ + +bitXor(a, b) +~~~~~~~~~~~~ + +bitNot(a) +~~~~~~~~~ + +bitShiftLeft(a, b) +~~~~~~~~~~~~~~~~~~ + +bitShiftRight(a, b) +~~~~~~~~~~~~~~~~~~ diff --git a/docs/en/functions/comparison_functions.rst b/docs/en/functions/comparison_functions.rst new file mode 100644 index 00000000000..bed301bcc30 --- /dev/null +++ b/docs/en/functions/comparison_functions.rst @@ -0,0 +1,36 @@ +Comparison functions +------------------ + +Comparison functions always return 0 or 1 (UInt8). + +The following types can be compared: + * numbers + * strings and fixed strings + * dates + * dates with times + +within each group, but not between different groups. + +For example, you can't compare a date with a string. You have to use a function to convert the string to a date, or vice versa. + +Strings are compared by bytes. A shorter string is smaller than all strings that start with it and that contain at least one more character. + +Note: before version 1.1.54134 signed and unsigned numbers were compared the same way as in C++. That is, you could get an incorrect result in such cases: SELECT 9223372036854775807 > -1. From version 1.1.54134, the behavior has changed and numbers are compared mathematically correctly.
+ +equals, a = b and a == b operator +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +notEquals, a != b and a <> b operator +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +less, < operator +~~~~~~~~~~~~~~~~~ + +greater, > operator +~~~~~~~~~~~~~~~~~~~ + +lessOrEquals, <= operator +~~~~~~~~~~~~~~~~~~~~~~~~ + +greaterOrEquals, >= operator +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/en/functions/conditional_functions.rst b/docs/en/functions/conditional_functions.rst new file mode 100644 index 00000000000..afe64f8e27e --- /dev/null +++ b/docs/en/functions/conditional_functions.rst @@ -0,0 +1,7 @@ +Conditional functions +------------- + +if(cond, then, else), operator cond ? then : else +~~~~~~~~~~~~~~~~~ +Returns 'then' if 'cond != 0', or 'else' if 'cond = 0'. +'cond' must be UInt8, and 'then' and 'else' must be of types that have a smallest common type. diff --git a/docs/en/functions/date_time_functions.rst b/docs/en/functions/date_time_functions.rst new file mode 100644 index 00000000000..1a93c7a725a --- /dev/null +++ b/docs/en/functions/date_time_functions.rst @@ -0,0 +1,141 @@ +Functions for working with dates and times +-------------------------------------- + +Time Zone Support + +All functions for working with dates and times for which it makes sense can take a second, optional argument - the time zone name. Example: Asia/Yekaterinburg. In this case, they do not use the local time zone (the default), but the specified one. +.. code-block:: sql + SELECT + toDateTime('2016-06-15 23:00:00') AS time, + toDate(time) AS date_local, + toDate(time, 'Asia/Yekaterinburg') AS date_yekat, + toString(time, 'US/Samoa') AS time_samoa + + ┌────────────────time─┬─date_local─┬─date_yekat─┬─time_samoa──────────┐ + │ 2016-06-15 23:00:00 │ 2016-06-15 │ 2016-06-16 │ 2016-06-15 09:00:00 │ + └─────────────────────┴────────────┴────────────┴─────────────────────┘ +Only time zones that differ from UTC by a whole number of hours are supported.
+ +toYear +~~~~~~~ +Converts a date or date with time to a UInt16 number containing the year number (AD). + +toMonth +~~~~~~~ +Converts a date or date with time to a UInt8 number containing the month number (1-12). + +toDayOfMonth +~~~~~~~ +Converts a date or date with time to a UInt8 number containing the number of the day of the month (1-31). + +toDayOfWeek +~~~~~~~ +Converts a date or date with time to a UInt8 number containing the number of the day of the week (Monday is 1, and Sunday is 7). + +toHour +~~~~~~~ +Converts a date with time to a UInt8 number containing the number of the hour in 24-hour time (0-23). +This function assumes that if clocks are moved ahead, it is by one hour and occurs at 2 a.m., and if clocks are moved back, it is by one hour and occurs at 3 a.m. (which is not always true - even in Moscow the clocks were once changed at a different time). + +toMinute +~~~~~~~ +Converts a date with time to a UInt8 number containing the number of the minute of the hour (0-59). + +toSecond +~~~~~~~ +Converts a date with time to a UInt8 number containing the number of the second in the minute (0-59). +Leap seconds are not accounted for. + +toStartOfDay +~~~~~~~ +Rounds down a date with time to the start of the day. + +toMonday +~~~~~~~ +Rounds down a date or date with time to the nearest Monday. +Returns the date. + +toStartOfMonth +~~~~~~~ +Rounds down a date or date with time to the first day of the month. +Returns the date. + +toStartOfQuarter +~~~~~~~ +Rounds down a date or date with time to the first day of the quarter. +The first day of the quarter is either 1 January, 1 April, 1 July, or 1 October. Returns the date. + +toStartOfYear +~~~~~~~ +Rounds down a date or date with time to the first day of the year. +Returns the date. + +toStartOfMinute +~~~~~~~ +Rounds down a date with time to the start of the minute. + +toStartOfFiveMinute +~~~~~~~ +Rounds down a date with time to the start of the 5 minute (00:00, 00:05, 00:10...). 
+ +toStartOfHour +~~~~~~~ +Rounds down a date with time to the start of the hour. + +toTime +~~~~~~~ +Converts a date with time to some fixed date, while preserving the time. + +toRelativeYearNum +~~~~~~~ +Converts a date with time or date to the number of the year, starting from a certain fixed point in the past. + +toRelativeMonthNum +~~~~~~~ +Converts a date with time or date to the number of the month, starting from a certain fixed point in the past. + +toRelativeWeekNum +~~~~~~~ +Converts a date with time or date to the number of the week, starting from a certain fixed point in the past. + +toRelativeDayNum +~~~~~~~ +Converts a date with time or date to the number of the day, starting from a certain fixed point in the past. + +toRelativeHourNum +~~~~~~~ +Converts a date with time or date to the number of the hour, starting from a certain fixed point in the past. + +toRelativeMinuteNum +~~~~~~~ +Converts a date with time or date to the number of the minute, starting from a certain fixed point in the past. + +toRelativeSecondNum +~~~~~~~ +Converts a date with time or date to the number of the second, starting from a certain fixed point in the past. + +now +~~~~~~~ +Accepts zero arguments and returns the current time at one of the moments of request execution. +This function returns a constant, even if the request took a long time to complete. + +today +~~~~~~~ +Accepts zero arguments and returns the current date at one of the moments of request execution. +The same as 'toDate(now())'. + +yesterday +~~~~~~~ +Accepts zero arguments and returns yesterday's date at one of the moments of request execution. +The same as 'today() - 1'. + +timeSlot +~~~~~~~ +Rounds the time to the half hour. +This function is specific to Yandex.Metrica, since half an hour is the minimum amount of time for breaking a session into two sessions if a counter shows a single user's consecutive pageviews that differ in time by strictly more than this amount. 
This means that tuples (the counter number, user ID, and time slot) can be used to search for pageviews that are included in the corresponding session. + +timeSlots(StartTime, Duration) +~~~~~~~ +For a time interval starting at 'StartTime' and continuing for 'Duration' seconds, it returns an array of moments in time, consisting of points from this interval rounded down to the half hour. +For example, timeSlots(toDateTime('2012-01-01 12:20:00'), toUInt32(600)) = [toDateTime('2012-01-01 12:00:00'), toDateTime('2012-01-01 12:30:00')]. +This is necessary for searching for pageviews in the corresponding session. diff --git a/docs/en/functions/encoding_functions.rst b/docs/en/functions/encoding_functions.rst new file mode 100644 index 00000000000..6cb33f45360 --- /dev/null +++ b/docs/en/functions/encoding_functions.rst @@ -0,0 +1,31 @@ +Encoding functions +-------- + +hex +~~~~~ +Accepts a string, number, date, or date with time. Returns a string containing the argument's hexadecimal representation. Uses uppercase letters A-F. +Doesn't use ``0x`` prefixes or ``h`` suffixes. +For strings, all bytes are simply encoded as two hexadecimal digits. Numbers are converted to big endian ("human readable") format. +For numbers, leading zeros are trimmed, but only by entire bytes. +For example, ``hex(1) = '01'``. Dates are encoded as the number of days since the beginning of the Unix Epoch. Dates with times are encoded as the number of seconds since the beginning of the Unix Epoch. + +unhex(str) +~~~~~~~ +Accepts a string containing any number of hexadecimal digits, and returns a string containing the corresponding bytes. Supports both uppercase and lowercase letters A-F. The number of hexadecimal digits doesn't have to be even. If it is odd, the last digit is interpreted as the least significant half of the 00-0F byte. If the argument string contains anything other than hexadecimal digits, some implementation-defined result is returned (an exception isn't thrown).
+If you want to convert the result to a number, you can use the functions 'reverse' and 'reinterpretAsType' + +UUIDStringToNum(str) +~~~~~~~ +Accepts a string containing the UUID in the text format (``123e4567-e89b-12d3-a456-426655440000``). Returns a binary representation of the UUID in ``FixedString(16)``. + +UUIDNumToString(str) +~~~~~~~~ +Accepts a FixedString(16) value containing the UUID in the binary format. Returns a readable string containing the UUID in the text format. + +bitmaskToList(num) +~~~~~~~ +Accepts an integer. Returns a string containing the list of powers of two that total the source number when summed. They are comma-separated without spaces in text format, in ascending order. + +bitmaskToArray(num) +~~~~~~~~~ +Accepts an integer. Returns an array of UInt64 numbers containing the list of powers of two that total the source number when summed. Numbers in the array are in ascending order. diff --git a/docs/en/functions/ext_dict_functions.rst b/docs/en/functions/ext_dict_functions.rst new file mode 100644 index 00000000000..6a3d3743df3 --- /dev/null +++ b/docs/en/functions/ext_dict_functions.rst @@ -0,0 +1,43 @@ +Functions for working with external dictionaries +------- +For more information, see the section "External dictionaries". + +dictGetUInt8, dictGetUInt16, dictGetUInt32, dictGetUInt64 +~~~~~~~~~ + +dictGetInt8, dictGetInt16, dictGetInt32, dictGetInt64 +~~~~~~~~~~ + +dictGetFloat32, dictGetFloat64 +~~~~~~~~~ + +dictGetDate, dictGetDateTime +~~~~~~~ + +dictGetString +~~~~~~ +``dictGetT('dict_name', 'attr_name', id)`` +- Gets the value of the 'attr_name' attribute from the 'dict_name' dictionary by the 'id' key. +'dict_name' and 'attr_name' are constant strings. +'id' must be UInt64. +If the 'id' key is not in the dictionary, it returns the default value set in the dictionary definition. 
+ +dictGetTOrDefault +~~~~~~~~ +``dictGetTOrDefault('dict_name', 'attr_name', id, default)`` +Similar to the dictGetT functions, but the default value is taken from the last argument of the function. + +dictIsIn +~~~~~~ +``dictIsIn('dict_name', child_id, ancestor_id)`` +- For the 'dict_name' hierarchical dictionary, finds out whether the 'child_id' key is located inside 'ancestor_id' (or matches 'ancestor_id'). Returns UInt8. + +dictGetHierarchy +~~~~~~~~ +``dictGetHierarchy('dict_name', id)`` +- For the 'dict_name' hierarchical dictionary, returns an array of dictionary keys starting from 'id' and continuing along the chain of parent elements. Returns Array(UInt64). + +dictHas +~~~~~~ +``dictHas('dict_name', id)`` +- Checks whether a key is present in the dictionary. Returns a UInt8 value: 0 if the key is absent, or 1 if it is present. diff --git a/docs/en/functions/hash_functions.rst b/docs/en/functions/hash_functions.rst new file mode 100644 index 00000000000..96fa80a8323 --- /dev/null +++ b/docs/en/functions/hash_functions.rst @@ -0,0 +1,69 @@ +Hash functions +------------- +Hash functions can be used for deterministic pseudo-random shuffling of elements. + + +halfMD5 +~~~~~~ +Calculates the MD5 from a string. Then it takes the first 8 bytes of the hash and interprets them as UInt64 in big endian. +Accepts a String-type argument. Returns UInt64. +This function works fairly slowly (5 million short strings per second per processor core). +If you don't need MD5 in particular, use the 'sipHash64' function instead. + +MD5 +~~~ +Calculates the MD5 from a string and returns the resulting set of bytes as FixedString(16). +If you don't need MD5 in particular, but you need a decent cryptographic 128-bit hash, use the 'sipHash128' function instead. +If you need the same result as the 'md5sum' utility produces, write ``lower(hex(MD5(s)))``. + +sipHash64 +~~~~~~~ +Calculates SipHash from a string. +Accepts a String-type argument. Returns UInt64.
+SipHash is a cryptographic hash function. It works at least three times faster than MD5. For more information, see https://131002.net/siphash/ + +sipHash128 +~~~~~ +Calculates SipHash from a string. +Accepts a String-type argument. Returns FixedString(16). +Differs from sipHash64 in that the final xor-folding state is only done up to 128 bits. + +cityHash64 +~~~~~ +Calculates CityHash64 from a string or a similar hash function for any number of any type of arguments. +For String-type arguments, CityHash is used. This is a fast non-cryptographic hash function for strings with decent quality. +For other types of arguments, a decent implementation-specific fast non-cryptographic hash function is used. +If multiple arguments are passed, the function is calculated using the same rules and chain combinations using the CityHash combinator. +For example, you can compute the checksum of an entire table with accuracy up to the row order: ``SELECT sum(cityHash64(*)) FROM table``. + +intHash32 +~~~~~ +Calculates a 32-bit hash code from any type of integer. +This is a relatively fast non-cryptographic hash function of average quality for numbers. + +intHash64 +~~~~~ +Calculates a 64-bit hash code from any type of integer. +It works faster than intHash32. Average quality. + +SHA1 +~~~~ + +SHA224 +~~~~~ + +SHA256 +~~~~~ +Calculates SHA-1, SHA-224, or SHA-256 from a string and returns the resulting set of bytes as FixedString(20), FixedString(28), or FixedString(32). +The function works fairly slowly (SHA-1 processes about 5 million short strings per second per processor core, while SHA-224 and SHA-256 process about 2.2 million). +We recommend using this function only in cases when you need a specific hash function and you can't select it. +Even in these cases, we recommend applying the function offline and pre-calculating values when inserting them into the table, instead of applying it in SELECTS. 
+ +URLHash(url[, N]) +~~~~~~~~ +A fast, decent-quality non-cryptographic hash function for a string obtained from a URL using some type of normalization. + +``URLHash(s)`` - Calculates a hash from a string without one of the trailing symbols ``/``, ``?`` or ``#`` at the end, if present. + +``URLHash(s, N)`` - Calculates a hash from a string up to the N level in the URL hierarchy, without one of the trailing symbols ``/``, ``?`` or ``#`` at the end, if present. +Levels are the same as in URLHierarchy. This function is specific to Yandex.Metrica. diff --git a/docs/en/functions/higher_order_functions.rst b/docs/en/functions/higher_order_functions.rst new file mode 100644 index 00000000000..38503138cc7 --- /dev/null +++ b/docs/en/functions/higher_order_functions.rst @@ -0,0 +1,67 @@ +Higher-order functions +----------------------- + +-> operator, lambda(params, expr) function +~~~~~~~~~~~~~~ +Allows describing a lambda function for passing to a higher-order function. The left side of the arrow has a formal parameter - any ID, or multiple formal parameters - any IDs in a tuple. The right side of the arrow has an expression that can use these formal parameters, as well as any table columns. + +Examples: ``x -> 2 * x, str -> str != Referer.`` + +Higher-order functions can only accept lambda functions as their functional argument. + +A lambda function that accepts multiple arguments can be passed to a higher-order function. In this case, the higher-order function is passed several arrays of identical length that these arguments will correspond to. + +For all functions other than 'arrayMap' and 'arrayFilter', the first argument (the lambda function) can be omitted. In this case, the identity mapping is assumed. + +arrayMap(func, arr1, ...) +~~~~~~~~~~~~ +Returns an array obtained from the original one by applying the 'func' function to each element in the 'arr' array. + +arrayFilter(func, arr1, ...) 
+~~~~~~~~~~~~~ +Returns an array containing only the elements in 'arr1' for which 'func' returns something other than 0. + +Examples: + +.. code-block:: sql + + SELECT arrayFilter(x -> x LIKE '%World%', ['Hello', 'abc World']) AS res + + ┌─res───────────┐ + │ ['abc World'] │ + └───────────────┘ + + SELECT + arrayFilter( + (i, x) -> x LIKE '%World%', + arrayEnumerate(arr), + ['Hello', 'abc World'] AS arr) + AS res + + ┌─res─┐ + │ [2] │ + └─────┘ + +arrayCount([func,] arr1, ...) +~~~~~~~~~ +Returns the number of elements in 'arr' for which 'func' returns something other than 0. If 'func' is not specified, it returns the number of non-zero items in the array. + +arrayExists([func,] arr1, ...) +~~~~~~~~~~ +Returns 1 if there is at least one element in 'arr' for which 'func' returns something other than 0. Otherwise, it returns 0. + +arrayAll([func,] arr1, ...) +~~~~~~~~~ +Returns 1 if 'func' returns something other than 0 for all the elements in 'arr'. Otherwise, it returns 0. + +arraySum([func,] arr1, ...) +~~~~~~~~~~~ +Returns the sum of the 'func' values. If the function is omitted, it just returns the sum of the array elements. + +arrayFirst(func, arr1, ...) +~~~~~~~~~ +Returns the first element in the 'arr1' array for which 'func' returns something other than 0. + +arrayFirstIndex(func, arr1, ...) +~~~~~~~ +Returns the index of the first element in the 'arr1' array for which 'func' returns something other than 0. diff --git a/docs/en/functions/in_functions.rst b/docs/en/functions/in_functions.rst new file mode 100644 index 00000000000..34c00540fd0 --- /dev/null +++ b/docs/en/functions/in_functions.rst @@ -0,0 +1,18 @@ +Functions for implementing the IN operator +--------------- + +in, notIn, globalIn, globalNotIn +~~~~~~~~~~~~~ +See the section "IN operators". + +tuple(x, y, ...), operator (x, y, ...) +~~~~~~~~~~~~~ +A function that allows grouping multiple columns. +For columns with the types T1, T2, ..., it returns a Tuple(T1, T2, ...) 
type tuple containing these columns. There is no cost to execute the function. +Tuples are normally used as intermediate values for an argument of IN operators, or for creating a list of formal parameters of lambda functions. Tuples can't be written to a table. + +tupleElement(tuple, n), operator x.N +~~~~~~~~~~~ +A function that allows getting columns from a tuple. +'N' is the column index, starting from 1. 'N' must be a constant. 'N' must be a strictly positive integer no greater than the size of the tuple. +There is no cost to execute the function. diff --git a/docs/en/functions/index.rst b/docs/en/functions/index.rst new file mode 100644 index 00000000000..542c6e53145 --- /dev/null +++ b/docs/en/functions/index.rst @@ -0,0 +1,71 @@ +Functions +======= + +There are at least* two types of functions - regular functions (they are just called "functions") and aggregate functions. These are completely different concepts. Regular functions work as if they are applied to each row separately (for each row, the result of the function doesn't depend on the other rows). Aggregate functions accumulate a set of values from various rows (i.e. they depend on the entire set of rows). + +In this section we discuss regular functions. For aggregate functions, see the section "Aggregate functions". +* - There is a third type of function that the 'arrayJoin' function belongs to; table functions can also be mentioned separately. + + + +.. toctree:: + :glob: + + * + */index + + +Strong typing +~~~~~~~~~~~~~~~~~ +In contrast to standard SQL, ClickHouse has strong typing. In other words, it doesn't make implicit conversions between types. Each function works for a specific set of types. This means that sometimes you need to use type conversion functions. + +Common subexpression elimination +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +All expressions in a query that have the same AST (the same record or same result of syntactic parsing) are considered to have identical values.
Such expressions are merged and executed once. Identical subqueries are also eliminated this way. + +Types of results +~~~~~~~~~~~~~~~ +All functions return a single value as the result (not several values, and not zero values). The type of result is usually defined only by the types of arguments, not by the values. Exceptions are the tupleElement function (the a.N operator), and the toFixedString function. + +Constants +~~~~~~~~~ +For simplicity, certain functions can only work with constants for some arguments. For example, the right argument of the LIKE operator must be a constant. +Almost all functions return a constant for constant arguments. The exception is functions that generate random numbers. +The 'now' function returns different values for queries that were run at different times, but the result is considered a constant, since constancy is only important within a single query. +A constant expression is also considered a constant (for example, the right half of the LIKE operator can be constructed from multiple constants). + +Functions can be implemented in different ways for constant and non-constant arguments (different code is executed). But the results for a constant and for a true column containing only the same value should match each other. + +Immutability +~~~~~~~~~~~~~~ + +Functions can't change the values of their arguments - any changes are returned as the result. Thus, the result of calculating separate functions does not depend on the order in which the functions are written in the query. + + +Error handling +~~~~~~~~~~~~~~~~ + +Some functions might throw an exception if the data is invalid. In this case, the query is canceled and an error text is returned to the client. For distributed processing, when an exception occurs on one of the servers, the other servers also attempt to abort the query.
+ + +Evaluation of argument expressions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In almost all programming languages, one of the arguments might not be evaluated for certain operators. This is usually for the operators ``&&``, ``||``, ``?:``. +But in ClickHouse, arguments of functions (operators) are always evaluated. This is because entire parts of columns are evaluated at once, instead of calculating each row separately. + +Performing functions for distributed query processing +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +For distributed query processing, as many stages of query processing as possible are performed on remote servers, and the rest of the stages (merging intermediate results and everything after that) are performed on the requestor server. + +This means that functions can be performed on different servers. +For example, in the query ``SELECT f(sum(g(x))) FROM distributed_table GROUP BY h(y)``, +- if distributed_table has at least two shards, the functions ``g`` and ``h`` are performed on remote servers, and the function ``f`` - is performed on the requestor server. +- if distributed_table has only one shard, all the functions ``f``, ``g``, and ``h`` are performed on this shard's server. + +The result of a function usually doesn't depend on which server it is performed on. However, sometimes this is important. +For example, functions that work with dictionaries use the dictionary that exists on the server they are running on. +Another example is the hostName function, which returns the name of the server it is running on in order to make GROUP BY by servers in a SELECT query. + +If a function in a query is performed on the requestor server, but you need to perform it on remote servers, you can wrap it in an 'any' aggregate function or add it to a key in GROUP BY. 
diff --git a/docs/en/functions/ip_address_functions.rst b/docs/en/functions/ip_address_functions.rst new file mode 100644 index 00000000000..a5a9866f192 --- /dev/null +++ b/docs/en/functions/ip_address_functions.rst @@ -0,0 +1,101 @@ +Functions for working with IP addresses +------------------------- + +IPv4NumToString(num) +~~~~~~~~~~~~~ +Takes a UInt32 number. Interprets it as an IPv4 address in big endian. Returns a string containing the corresponding IPv4 address in the format A.B.C.d (dot-separated numbers in decimal form). + +IPv4StringToNum(s) +~~~~~~~~ +The reverse function of IPv4NumToString. If the IPv4 address has an invalid format, it returns 0. + +IPv4NumToStringClassC(num) +~~~~~~~~~~~ +Similar to IPv4NumToString, but using ``xxx`` instead of the last octet. + +Example: + +.. code-block:: sql + + SELECT + IPv4NumToStringClassC(ClientIP) AS k, + count() AS c + FROM test.hits + GROUP BY k + ORDER BY c DESC + LIMIT 10 + + ┌─k──────────────┬─────c─┐ + │ 83.149.9.xxx │ 26238 │ + │ 217.118.81.xxx │ 26074 │ + │ 213.87.129.xxx │ 25481 │ + │ 83.149.8.xxx │ 24984 │ + │ 217.118.83.xxx │ 22797 │ + │ 78.25.120.xxx │ 22354 │ + │ 213.87.131.xxx │ 21285 │ + │ 78.25.121.xxx │ 20887 │ + │ 188.162.65.xxx │ 19694 │ + │ 83.149.48.xxx │ 17406 │ + └────────────────┴───────┘ + +Since using ``'xxx'`` is highly unusual, this may be changed in the future. We recommend that you don't rely on the exact format of this fragment. + +IPv6NumToString(x) +~~~~~~~~~~~~ +Accepts a FixedString(16) value containing the IPv6 address in binary format. Returns a string containing this address in text format. +IPv6-mapped IPv4 addresses are output in the format ``::ffff:111.222.33.44``. Examples: + +.. 
code-block:: sql + + SELECT IPv6NumToString(toFixedString(unhex('2A0206B8000000000000000000000011'), 16)) AS addr + + ┌─addr─────────┐ + │ 2a02:6b8::11 │ + └──────────────┘ + SELECT + IPv6NumToString(ClientIP6 AS k), + count() AS c + FROM hits_all + WHERE EventDate = today() AND substring(ClientIP6, 1, 12) != unhex('00000000000000000000FFFF') + GROUP BY k + ORDER BY c DESC + LIMIT 10 + + ┌─IPv6NumToString(ClientIP6)──────────────┬─────c─┐ + │ 2a02:2168:aaa:bbbb::2 │ 24695 │ + │ 2a02:2698:abcd:abcd:abcd:abcd:8888:5555 │ 22408 │ + │ 2a02:6b8:0:fff::ff │ 16389 │ + │ 2a01:4f8:111:6666::2 │ 16016 │ + │ 2a02:2168:888:222::1 │ 15896 │ + │ 2a01:7e00::ffff:ffff:ffff:222 │ 14774 │ + │ 2a02:8109:eee:ee:eeee:eeee:eeee:eeee │ 14443 │ + │ 2a02:810b:8888:888:8888:8888:8888:8888 │ 14345 │ + │ 2a02:6b8:0:444:4444:4444:4444:4444 │ 14279 │ + │ 2a01:7e00::ffff:ffff:ffff:ffff │ 13880 │ + └─────────────────────────────────────────┴───────┘ + SELECT + IPv6NumToString(ClientIP6 AS k), + count() AS c + FROM hits_all + WHERE EventDate = today() + GROUP BY k + ORDER BY c DESC + LIMIT 10 + + ┌─IPv6NumToString(ClientIP6)─┬──────c─┐ + │ ::ffff:94.26.111.111 │ 747440 │ + │ ::ffff:37.143.222.4 │ 529483 │ + │ ::ffff:5.166.111.99 │ 317707 │ + │ ::ffff:46.38.11.77 │ 263086 │ + │ ::ffff:79.105.111.111 │ 186611 │ + │ ::ffff:93.92.111.88 │ 176773 │ + │ ::ffff:84.53.111.33 │ 158709 │ + │ ::ffff:217.118.11.22 │ 154004 │ + │ ::ffff:217.118.11.33 │ 148449 │ + │ ::ffff:217.118.11.44 │ 148243 │ + └────────────────────────────┴────────┘ + +IPv6StringToNum(s) +~~~~~~~~ +The reverse function of IPv6NumToString. If the IPv6 address has an invalid format, it returns a string of null bytes. +HEX can be uppercase or lowercase. diff --git a/docs/en/functions/json_functions.rst b/docs/en/functions/json_functions.rst new file mode 100644 index 00000000000..1ee15d94deb --- /dev/null +++ b/docs/en/functions/json_functions.rst @@ -0,0 +1,51 @@ +Functions for working with JSON. 
+------------------- +In Yandex.Metrica, JSON is passed by users as session parameters. There are several functions for working with this JSON. (Although in most of the cases, the JSONs are additionally pre-processed, and the resulting values are put in separate columns in their processed format.) All these functions are based on strong assumptions about what the JSON can be, but they try to do as little as possible to get the job done. + +The following assumptions are made: + #. The field name (function argument) must be a constant. + #. The field name is somehow canonically encoded in JSON. For example, ``visitParamHas('{"abc":"def"}', 'abc') = 1``, but ``visitParamHas('{"\\u0061\\u0062\\u0063":"def"}', 'abc') = 0`` + #. Fields are searched for on any nesting level, indiscriminately. If there are multiple matching fields, the first occurrence is used. + #. JSON doesn't have space characters outside of string literals. + +visitParamHas(params, name) +~~~~~~~ +Checks whether there is a field with the 'name' name. + +visitParamExtractUInt(params, name) +~~~~~~~~~ +Parses UInt64 from the value of the field named 'name'. If this is a string field, it tries to parse a number from the beginning of the string. If the field doesn't exist, or it exists but doesn't contain a number, it returns 0. + +visitParamExtractInt(params, name) +~~~~~~~ +The same as for Int64. + +visitParamExtractFloat(params, name) +~~~~~~~ +The same as for Float64. + +visitParamExtractBool(params, name) +~~~~~~~~ +Parses a true/false value. The result is UInt8. + +visitParamExtractRaw(params, name) +~~~~~~~ +Returns the value of a field, including separators. + +Examples: +:: + visitParamExtractRaw('{"abc":"\\n\\u0000"}', 'abc') = '"\\n\\u0000"' + visitParamExtractRaw('{"abc":{"def":[1,2,3]}}', 'abc') = '{"def":[1,2,3]}' + +visitParamExtractString(params, name) +~~~~~~~~~~~ +Parses the string in double quotes. The value is unescaped. If unescaping failed, it returns an empty string.
+ +Examples: +:: + visitParamExtractString('{"abc":"\\n\\u0000"}', 'abc') = '\n\0' + visitParamExtractString('{"abc":"\\u263a"}', 'abc') = '☺' + visitParamExtractString('{"abc":"\\u263"}', 'abc') = '' + visitParamExtractString('{"abc":"hello}', 'abc') = '' + +Currently, there is no support for code points not from the basic multilingual plane written in the format ``\uXXXX\uYYYY`` (they are converted to CESU-8 instead of UTF-8). diff --git a/docs/en/functions/logical_functions.rst b/docs/en/functions/logical_functions.rst new file mode 100644 index 00000000000..229f32edcfa --- /dev/null +++ b/docs/en/functions/logical_functions.rst @@ -0,0 +1,19 @@ +Logical functions +------------------ + +Logical functions accept any numeric types, but return a UInt8 number equal to 0 or 1. + +Zero as an argument is considered "false," while any non-zero value is considered "true". + +and, AND operator +~~~~~~~~~~~~~~~~~ + +or, OR operator +~~~~~~~~~~~~~~~ + +not, NOT operator +~~~~~~~~~~~~~~~ + +xor +~~~~~~~~~~~~~~~ + diff --git a/docs/en/functions/math_functions.rst b/docs/en/functions/math_functions.rst new file mode 100644 index 00000000000..100ad9dd21b --- /dev/null +++ b/docs/en/functions/math_functions.rst @@ -0,0 +1,98 @@ +Mathematical functions +--------------- +All the functions return a Float64 number. The accuracy of the result is close to the maximum precision possible, but the result might not coincide with the machine representable number nearest to the corresponding real number. + +e() +~~~~ +Accepts zero arguments and returns a Float64 number close to the e number. + +pi() +~~~~ +Accepts zero arguments and returns a Float64 number close to π. + +exp(x) +~~~~~ +Accepts a numeric argument and returns a Float64 number close to the exponent of the argument. + +log(x) +~~~~~~ +Accepts a numeric argument and returns a Float64 number close to the natural logarithm of the argument. + +exp2(x) +~~~~~~~ +Accepts a numeric argument and returns a Float64 number close to 2^x.
+ +log2(x) +~~~~~ +Accepts a numeric argument and returns a Float64 number close to the binary logarithm of the argument. + +exp10(x) +~~~~~~~ +Accepts a numeric argument and returns a Float64 number close to 10^x. + +log10(x) +~~~~~~~ +Accepts a numeric argument and returns a Float64 number close to the decimal logarithm of the argument. + +sqrt(x) +~~~~~~~~ +Accepts a numeric argument and returns a Float64 number close to the square root of the argument. + +cbrt(x) +~~~~~~~ +Accepts a numeric argument and returns a Float64 number close to the cubic root of the argument. + +erf(x) +~~~~~~~ + +If 'x' is non-negative, then erf(x / σ√2) is the probability that a random variable having a normal distribution with standard deviation 'σ' takes a value that differs from the expected value by less than 'x'. + +Example (three sigma rule): + +.. code-block:: sql + + SELECT erf(3 / sqrt(2)) + + ┌─erf(divide(3, sqrt(2)))─┐ + │ 0.9973002039367398 │ + └─────────────────────────┘ + +erfc(x) +~~~~~~ +Accepts a numeric argument and returns a Float64 number close to 1 - erf(x), but without loss of precision for large 'x' values. + +lgamma(x) +~~~~~~~ +The logarithm of the gamma function. + +tgamma(x) +~~~~~~ +Gamma function. + +sin(x) +~~~~~ +The sine. + +cos(x) +~~~~~ +The cosine. + +tan(x) +~~~~~~ +The tangent. + +asin(x) +~~~~~~ +The arc sine. + +acos(x) +~~~~~~ +The arc cosine. + +atan(x) +~~~~~ +The arc tangent. + +pow(x, y) +~~~~~~~ +x^y. diff --git a/docs/en/functions/other_functions.rst b/docs/en/functions/other_functions.rst new file mode 100644 index 00000000000..be5947c9048 --- /dev/null +++ b/docs/en/functions/other_functions.rst @@ -0,0 +1,268 @@ +Other functions +------------- + +hostName() +~~~~~~~ +Returns a string with the name of the host that this function was performed on. For distributed processing, this is the name of the remote server host, if the function is performed on a remote server.
+ +visibleWidth(x) +~~~~~~~~~ +Calculates the approximate width when outputting values to the console in text format (tab-separated). This function is used by the system for implementing Pretty formats. + +toTypeName(x) +~~~~~~~~ +Gets the type name. Returns a string containing the type name of the passed argument. + +blockSize() +~~~~~~~~ +Gets the size of the block. +In ClickHouse, queries are always run on blocks (sets of column parts). This function allows getting the size of the block that you called it for. + +materialize(x) +~~~~~~~~ +Turns a constant into a full column containing just one value. +In ClickHouse, full columns and constants are represented differently in memory. Functions work differently for constant arguments and normal arguments (different code is executed), although the result is almost always the same. This function is for debugging this behavior. + +ignore(...) +~~~~~~~ +A function that accepts any arguments and always returns 0. +However, the argument is still calculated. This can be used for benchmarks. + +sleep(seconds) +~~~~~~~~~ +Sleeps 'seconds' seconds on each data block. You can specify an integer or a floating-point number. + +currentDatabase() +~~~~~~~~~~ +Returns the name of the current database. +You can use this function in table engine parameters in a CREATE TABLE query where you need to specify the database. + +isFinite(x) +~~~~~~~ +Accepts Float32 and Float64 and returns UInt8 equal to 1 if the argument is not infinite and not a NaN, otherwise 0. + +isInfinite(x) +~~~~~~~ +Accepts Float32 and Float64 and returns UInt8 equal to 1 if the argument is infinite, otherwise 0. +Note that 0 is returned for a NaN. + +isNaN(x) +~~~~~ +Accepts Float32 and Float64 and returns UInt8 equal to 1 if the argument is a NaN, otherwise 0. + +hasColumnInTable('database', 'table', 'column') +~~~~~~~~ +Accepts constant String columns - database name, table name and column name.
Returns constant UInt8 value, equal to 1 if column exists, +otherwise 0. +If table doesn't exist than exception is thrown. +For elements of nested data structure function checks existence of column. For nested data structure 0 is returned. + +bar +~~~~~ +Allows building a unicode-art diagram. + +``bar(x, min, max, width)`` - Draws a band with a width proportional to (x - min) and equal to 'width' characters when x == max. +``min, max`` - Integer constants. The value must fit in Int64. +``width`` - Constant, positive number, may be a fraction. + +The band is drawn with accuracy to one eighth of a symbol. Example: + +.. code-block:: sql + + SELECT + toHour(EventTime) AS h, + count() AS c, + bar(c, 0, 600000, 20) AS bar + FROM test.hits + GROUP BY h + ORDER BY h ASC + + ┌──h─┬──────c─┬─bar────────────────┐ + │ 0 │ 292907 │ █████████▋ │ + │ 1 │ 180563 │ ██████ │ + │ 2 │ 114861 │ ███▋ │ + │ 3 │ 85069 │ ██▋ │ + │ 4 │ 68543 │ ██▎ │ + │ 5 │ 78116 │ ██▌ │ + │ 6 │ 113474 │ ███▋ │ + │ 7 │ 170678 │ █████▋ │ + │ 8 │ 278380 │ █████████▎ │ + │ 9 │ 391053 │ █████████████ │ + │ 10 │ 457681 │ ███████████████▎ │ + │ 11 │ 493667 │ ████████████████▍ │ + │ 12 │ 509641 │ ████████████████▊ │ + │ 13 │ 522947 │ █████████████████▍ │ + │ 14 │ 539954 │ █████████████████▊ │ + │ 15 │ 528460 │ █████████████████▌ │ + │ 16 │ 539201 │ █████████████████▊ │ + │ 17 │ 523539 │ █████████████████▍ │ + │ 18 │ 506467 │ ████████████████▊ │ + │ 19 │ 520915 │ █████████████████▎ │ + │ 20 │ 521665 │ █████████████████▍ │ + │ 21 │ 542078 │ ██████████████████ │ + │ 22 │ 493642 │ ████████████████▍ │ + │ 23 │ 400397 │ █████████████▎ │ + └────┴────────┴────────────────────┘ + +transform +~~~~~~~ +Transforms a value according to the explicitly defined mapping of some elements to other ones. +There are two variations of this function: + +1. ``transform(x, array_from, array_to, default)`` + +``x`` - What to transform + +``array_from`` - Constant array of values for converting. 
+ +``array_to`` - Constant array of values to convert the values in 'from' to. + +``default`` - Constant. Which value to use if 'x' is not equal to one of the values in 'from' + +``'array_from'`` and ``'array_to'`` are arrays of the same size. + +Types: + +``transform(T, Array(T), Array(U), U) -> U`` + +``'T'`` and ``'U'`` can be numeric, string, or Date or DateTime types. +Where the same letter is indicated (T or U), for numeric types these might not be matching types, but types that have a common type. +For example, the first argument can have the Int64 type, while the second has the Array(UInt16) type. + +If the 'x' value is equal to one of the elements in the 'array_from' array, it returns the existing element (that is numbered the same) from the 'array_to' array. Otherwise, it returns 'default'. If there are multiple matching elements in 'array_from', it returns one of the matches. + +Example: + +.. code-block:: sql + + SELECT + transform(SearchEngineID, [2, 3], ['Яндекс', 'Google'], 'Остальные') AS title, + count() AS c + FROM test.hits + WHERE SearchEngineID != 0 + GROUP BY title + ORDER BY c DESC + + ┌─title─────┬──────c─┐ + │ Яндекс │ 498635 │ + │ Google │ 229872 │ + │ Остальные │ 104472 │ + └───────────┴────────┘ + + +2. ``transform(x, array_from, array_to)`` + +Differs from the first variation in that the 'default' argument is omitted. +If the 'x' value is equal to one of the elements in the 'array_from' array, it returns the matching element (that is numbered the same) from the 'array_to' array. Otherwise, it returns 'x'. + +Types: + +``transform(T, Array(T), Array(T)) -> T`` + +Example: + +..
code-block:: sql + + SELECT + transform(domain(Referer), ['yandex.ru', 'google.ru', 'vk.com'], ['www.yandex', 'ввв.яндекс.рф', 'example.com']) AS s, + count() AS c + FROM test.hits + GROUP BY domain(Referer) + ORDER BY count() DESC + LIMIT 10 + + ┌─s──────────────┬───────c─┐ + │ │ 2906259 │ + │ www.yandex │ 867767 │ + │ ███████.ru │ 313599 │ + │ mail.yandex.ru │ 107147 │ + │ ввв.яндекс.рф │ 105668 │ + │ ██████.ru │ 100355 │ + │ █████████.ru │ 65040 │ + │ news.yandex.ru │ 64515 │ + │ ██████.net │ 59141 │ + │ example.com │ 57316 │ + └────────────────┴─────────┘ + +formatReadableSize(x) +~~~~~~~~~~~ +Gets a size (number of bytes). Returns a string that contains rounded size with the suffix (KiB, MiB etc.). + +Example: + +.. code-block:: sql + + SELECT + arrayJoin([1, 1024, 1024*1024, 192851925]) AS filesize_bytes, + formatReadableSize(filesize_bytes) AS filesize + + ┌─filesize_bytes─┬─filesize───┐ + │ 1 │ 1.00 B │ + │ 1024 │ 1.00 KiB │ + │ 1048576 │ 1.00 MiB │ + │ 192851925 │ 183.92 MiB │ + └────────────────┴────────────┘ + +least(a, b) +~~~~~~ +Returns the least element of a and b. + +greatest(a, b) +~~~~~~~~ +Returns the greatest element of a and b + +uptime() +~~~~~~ +Returns server's uptime in seconds. + +version() +~~~~~~~ +Returns server's version as a string. + +rowNumberInAllBlocks() +~~~~~~~~~~ +Returns an incremental row number within all blocks that were processed by this function. + +runningDifference(x) +~~~~~~~~ +Calculates the difference between consecutive values in the data block. +Result of the function depends on the order of the data in the blocks. + +It works only inside of the each processed block of data. Data splitting in the blocks is not explicitly controlled by the user. +If you specify ORDER BY in subquery and call runningDifference outside of it, you could get an expected result. + +Example: + +.. 
code-block:: sql + + SELECT + EventID, + EventTime, + runningDifference(EventTime) AS delta + FROM + ( + SELECT + EventID, + EventTime + FROM events + WHERE EventDate = '2016-11-24' + ORDER BY EventTime ASC + LIMIT 5 + ) + + ┌─EventID─┬───────────EventTime─┬─delta─┐ + │ 1106 │ 2016-11-24 00:00:04 │ 0 │ + │ 1107 │ 2016-11-24 00:00:05 │ 1 │ + │ 1108 │ 2016-11-24 00:00:05 │ 0 │ + │ 1109 │ 2016-11-24 00:00:09 │ 4 │ + │ 1110 │ 2016-11-24 00:00:10 │ 1 │ + └─────────┴─────────────────────┴───────┘ + +MACNumToString(num) +~~~~~~~~~~~~~ +Takes a UInt64 number. Interprets it as a MAC address in big endian. Returns a string containing the corresponding MAC address in the format AA:BB:CC:DD:EE:FF (colon-separated numbers in hexadecimal form). + +MACStringToNum(s) +~~~~~~~~ +The reverse function of MACNumToString. If the MAC address has an invalid format, it returns 0. + \ No newline at end of file diff --git a/docs/en/functions/random_functions.rst b/docs/en/functions/random_functions.rst new file mode 100644 index 00000000000..330686df5f4 --- /dev/null +++ b/docs/en/functions/random_functions.rst @@ -0,0 +1,17 @@ +Functions for generating pseudo-random numbers +---------------------- +Non-cryptographic generators of pseudo-random numbers are used. + +All the functions accept zero arguments or one argument. +If an argument is passed, it can be any type, and its value is not used for anything. +The only purpose of this argument is to prevent common subexpression elimination, so that two different instances of the same function return different columns with different random numbers. + +rand +~~~~ +Returns a pseudo-random UInt32 number, evenly distributed among all UInt32-type numbers. +Uses a linear congruential generator. + +rand64 +~~~~ +Returns a pseudo-random UInt64 number, evenly distributed among all UInt64-type numbers. +Uses a linear congruential generator.
diff --git a/docs/en/functions/rounding_functions.rst b/docs/en/functions/rounding_functions.rst new file mode 100644 index 00000000000..b4be1e121e8 --- /dev/null +++ b/docs/en/functions/rounding_functions.rst @@ -0,0 +1,38 @@ +Rounding functions +---------------- + +floor(x[, N]) +~~~~~~~ +Returns the largest round number that is less than or equal to 'x'. +A round number is a multiple of 1 / 10^N, or the nearest number of the appropriate data type if 1 / 10^N isn't exact. +'N' is an integer constant, optional parameter. By default it is zero, which means to round to an integer. +'N' may be negative. + +Examples: ``floor(123.45, 1) = 123.4, floor(123.45, -1) = 120``. + +'x' is any numeric type. The result is a number of the same type. +For integer arguments, it makes sense to round with a negative 'N' value (for non-negative 'N', the function doesn't do anything). +If rounding causes overflow (for example, ``floor(-128, -1)``), an implementation-specific result is returned. + +ceil(x[, N]) +~~~~~~ +Returns the smallest round number that is greater than or equal to 'x'. In every other way, it is the same as the 'floor' function (see above). + +round(x[, N]) +~~~~~~~ +Returns the round number nearest to 'x', which may be less than, greater than, or equal to 'x'. +If 'x' is exactly in the middle between the nearest round numbers, one of them is returned (implementation-specific). +The number '-0.' may or may not be considered round (implementation-specific). +In every other way, this function is the same as 'floor' and 'ceil' described above. + +roundToExp2(num) +~~~~~~~~ +Accepts a number. If the number is less than one, it returns 0. Otherwise, it rounds the number down to the nearest (whole non-negative) power of two. + +roundDuration(num) +~~~~~~~~ +Accepts a number. If the number is less than one, it returns 0. Otherwise, it rounds the number down to numbers from the set: 1, 10, 30, 60, 120, 180, 240, 300, 600, 1200, 1800, 3600, 7200, 18000, 36000.
This function is specific to Yandex.Metrica and used for implementing the report on session length. + +roundAge(num) +~~~~~~~ +Accepts a number. If the number is less than 18, it returns 0. Otherwise, it rounds the number down to numbers from the set: 18, 25, 35, 45. This function is specific to Yandex.Metrica and used for implementing the report on user age. diff --git a/docs/en/functions/splitting_merging_functions.rst b/docs/en/functions/splitting_merging_functions.rst new file mode 100644 index 00000000000..40dd5b46b53 --- /dev/null +++ b/docs/en/functions/splitting_merging_functions.rst @@ -0,0 +1,23 @@ +Functions for splitting and merging strings and arrays +---------------- + +splitByChar(separator, s) +~~~~~~~~~~~~ +Splits a string into substrings, using 'separator' as the separator. +'separator' must be a string constant consisting of exactly one character. +Returns an array of selected substrings. Empty substrings may be selected if the separator occurs at the beginning or end of the string, or if there are multiple consecutive separators. + +splitByString(separator, s) +~~~~~~~~~~~ +The same as above, but it uses a string of multiple characters as the separator. The string must be non-empty. + +arrayStringConcat(arr[, separator]) +~~~~~~~~~~~~~ +Concatenates strings from the array elements, using 'separator' as the separator. +'separator' is a string constant, an optional parameter. By default it is an empty string. +Returns a string. + +alphaTokens(s) +~~~~~~~~~~ +Selects substrings of consecutive bytes from the range a-z and A-Z. +Returns an array of selected substrings. diff --git a/docs/en/functions/string_functions.rst b/docs/en/functions/string_functions.rst new file mode 100644 index 00000000000..201dc3b612c --- /dev/null +++ b/docs/en/functions/string_functions.rst @@ -0,0 +1,74 @@ +Functions for working with strings +------------------------------ + +empty +~~~~~ +Returns 1 for an empty string or 0 for a non-empty string. 
+The result type is UInt8. +A string is considered non-empty if it contains at least one byte, even if this is a space or a null byte. +The function also works for arrays. + +notEmpty +~~~~~~~~ +Returns 0 for an empty string or 1 for a non-empty string. +The result type is UInt8. +The function also works for arrays. + +length +~~~~~~ +Returns the length of a string in bytes (not in characters, and not in code points). +The result type is UInt64. +The function also works for arrays. + +lengthUTF8 +~~~~~~~~~~ +Returns the length of a string in Unicode code points (not in characters), assuming that the string contains a set of bytes that make up UTF-8 encoded text. If this assumption is not met, it returns some result (it doesn't throw an exception). +The result type is UInt64. + +lower +~~~~~~ +Converts ASCII Latin symbols in a string to lowercase. + +upper +~~~~~ +Converts ASCII Latin symbols in a string to uppercase. + +lowerUTF8 +~~~~~~~~~ +Converts a string to lowercase, assuming the string contains a set of bytes that make up a UTF-8 encoded text. It doesn't detect the language. So for Turkish the result might not be exactly correct. +If length of UTF-8 sequence is different for upper and lower case of code point, then result for that code point could be incorrect. +If value contains invalid UTF-8, the behavior is unspecified. + +upperUTF8 +~~~~~~~~~ +Converts a string to uppercase, assuming the string contains a set of bytes that make up a UTF-8 encoded text. It doesn't detect the language. So for Turkish the result might not be exactly correct. +If length of UTF-8 sequence is different for upper and lower case of code point, then result for that code point could be incorrect. +If value contains invalid UTF-8, the behavior is unspecified. + +reverse +~~~~~~~ +Reverses the string (as a sequence of bytes). + +reverseUTF8 +~~~~~~~~~~~ +Reverses a sequence of Unicode code points, assuming that the string contains a set of bytes representing a UTF-8 text. 
Otherwise, it does something else (it doesn't throw an exception). + +concat(s1, s2, ...) +~~~~~~~~~~~~~~~~~~~ +Concatenates strings from the function arguments, without a separator. + +substring(s, offset, length) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Returns a substring starting with the byte from the 'offset' index that is 'length' bytes long. Character indexing starts from one (as in standard SQL). The 'offset' and 'length' arguments must be constants. + +substringUTF8(s, offset, length) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The same as 'substring', but for Unicode code points. Works under the assumption that the string contains a set of bytes representing a UTF-8 encoded text. If this assumption is not met, it returns some result (it doesn't throw an exception). + +appendTrailingCharIfAbsent(s, c) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +If the ``s`` string is non-empty and does not contain the ``c`` character at the end, it appends the ``c`` character to the end. + +convertCharset(s, from, to) +~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Returns a string with the data s (encoded as from charset) that was converted to the to charset. diff --git a/docs/en/functions/string_replace_functions.rst b/docs/en/functions/string_replace_functions.rst new file mode 100644 index 00000000000..a857a52a0ef --- /dev/null +++ b/docs/en/functions/string_replace_functions.rst @@ -0,0 +1,70 @@ +Functions for searching and replacing in strings +--------------------------------- + +replaceOne(haystack, pattern, replacement) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Replaces the first occurrence, if it exists, of the 'pattern' substring in 'haystack' with the 'replacement' substring. +Hereafter, 'pattern' and 'replacement' must be constants. + +replaceAll(haystack, pattern, replacement) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Replaces all occurrences of the 'pattern' substring in 'haystack' with the 'replacement' substring. 
+ +replaceRegexpOne(haystack, pattern, replacement) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Replacement using the 'pattern' regular expression. A re2 regular expression. Replaces only the first occurrence, if it exists. +A pattern can be specified as 'replacement'. This pattern can include substitutions ``\0``-``\9``. +The substitution ``\0`` includes the entire match of the regular expression. +The substitutions ``\1``-``\9`` include the subpattern corresponding to the number. +In order to specify the ``\`` symbol in a pattern, you must use a ``\`` symbol to escape it. +Also keep in mind that a string literal requires an extra escape. + +Example 1. Converting the date to American format: + +.. code-block:: sql + + SELECT DISTINCT + EventDate, + replaceRegexpOne(toString(EventDate), '(\\d{4})-(\\d{2})-(\\d{2})', '\\2/\\3/\\1') AS res + FROM test.hits + LIMIT 7 + FORMAT TabSeparated + + 2014-03-17 03/17/2014 + 2014-03-18 03/18/2014 + 2014-03-19 03/19/2014 + 2014-03-20 03/20/2014 + 2014-03-21 03/21/2014 + 2014-03-22 03/22/2014 + 2014-03-23 03/23/2014 + +Example 2. Copy the string ten times: + +.. code-block:: sql + + SELECT replaceRegexpOne('Hello, World!', '.*', '\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0') AS res + + ┌─res────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ + │ Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World! │ + └────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ + +replaceRegexpAll(haystack, pattern, replacement) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +This does the same thing, but replaces all the occurrences. Example: + +.. code-block:: sql + SELECT replaceRegexpAll('Hello, World!', '.', '\\0\\0') AS res + + ┌─res────────────────────────┐ + │ HHeelllloo,, WWoorrlldd!! 
│ + └────────────────────────────┘ + +As an exception, if a regular expression worked on an empty substring, the replacement is not made more than once. +Example: + +.. code-block:: sql + SELECT replaceRegexpAll('Hello, World!', '^', 'here: ') AS res + + ┌─res─────────────────┐ + │ here: Hello, World! │ + └─────────────────────┘ diff --git a/docs/en/functions/string_search_functions.rst b/docs/en/functions/string_search_functions.rst new file mode 100644 index 00000000000..fe8a8f732ed --- /dev/null +++ b/docs/en/functions/string_search_functions.rst @@ -0,0 +1,52 @@ +Functions for searching strings +------------------------ +The search is case-sensitive in all these functions. +The search substring or regular expression must be a constant in all these functions. + +position(haystack, needle) +~~~~~~~~~~~~~~~~~~~~~~~~~~ +Searches for the 'needle' substring in the 'haystack' string. +Returns the position (in bytes) of the found substring, starting from 1, or returns 0 if the substring was not found. +There's also positionCaseInsensitive function. + +positionUTF8(haystack, needle) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The same as 'position', but the position is returned in Unicode code points. Works under the assumption that the string contains a set of bytes representing a UTF-8 encoded text. If this assumption is not met, it returns some result (it doesn't throw an exception). +There's also positionCaseInsensitiveUTF8 function. + +match(haystack, pattern) +~~~~~~~~~~~~~~~~~~~~~~~~ +Checks whether the string matches the 'pattern' regular expression. +The regular expression is re2. +Returns 0 if it doesn't match, or 1 if it matches. + +Note that the backslash symbol (``\``) is used for escaping in the regular expression. The same symbol is used for escaping in string literals. +So in order to escape the symbol in a regular expression, you must write two backslashes (``\\``) in a string literal. + +The regular expression works with the string as if it is a set of bytes. 
+The regular expression can't contain null bytes. +For patterns to search for substrings in a string, it is better to use LIKE or 'position', since they work much faster. + +extract(haystack, pattern) +~~~~~~~~~~~~~~~~~~~~~~~~~~ +Extracts a fragment of a string using a regular expression. If 'haystack' doesn't match the 'pattern' regex, an empty string is returned. If the regex doesn't contain subpatterns, it takes the fragment that matches the entire regex. Otherwise, it takes the fragment that matches the first subpattern. + +extractAll(haystack, pattern) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Extracts all the fragments of a string using a regular expression. If 'haystack' doesn't match the 'pattern' regex, an empty string is returned. Returns an array of strings consisting of all matches to the regex. In general, the behavior is the same as the 'extract' function (it takes the first subpattern, or the entire expression if there isn't a subpattern). + +like(haystack, pattern), operator haystack LIKE pattern +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Checks whether a string matches a simple regular expression. The regular expression can contain the metasymbols ``%`` and ``_``. + +``%`` indicates any quantity of any bytes (including zero characters). + +``_`` indicates any one byte. + +Use the backslash (``\``) for escaping metasymbols. See the note on escaping in the description of the 'match' function. + +For regular expressions like ``%needle%``, the code is more optimal and works as fast as the 'position' function. For other regular expressions, the code is the same as for the 'match' function. + +notLike(haystack, pattern), operator haystack NOT LIKE pattern +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The same thing as 'like', but negative. 
diff --git a/docs/en/functions/type_conversion_functions.rst b/docs/en/functions/type_conversion_functions.rst new file mode 100644 index 00000000000..6c26660a184 --- /dev/null +++ b/docs/en/functions/type_conversion_functions.rst @@ -0,0 +1,120 @@ +Type conversion functions +---------------------------- + +toUInt8, toUInt16, toUInt32, toUInt64 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +toInt8, toInt16, toInt32, toInt64 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +toFloat32, toFloat64 +~~~~~~~~~~~~~~~~~~~~ + +toUInt8OrZero, toUInt16OrZero, toUInt32OrZero, toUInt64OrZero, toInt8OrZero, toInt16OrZero, toInt32OrZero, toInt64OrZero, toFloat32OrZero, toFloat64OrZero +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +toDate, toDateTime +~~~~~~~~~~~~~~~~~~ + +toString +~~~~~~~~ +Functions for converting between numbers, strings (but not fixed strings), dates, and dates with times. All these functions accept one argument. + +When converting to or from a string, the value is formatted or parsed using the same rules as for the TabSeparated format (and almost all other text formats). If the string can't be parsed, an exception is thrown and the request is canceled. + +When converting dates to numbers or vice versa, the date corresponds to the number of days since the beginning of the Unix epoch. +When converting dates with times to numbers or vice versa, the date with time corresponds to the number of seconds since the beginning of the Unix epoch. + +Formats of date and date with time for toDate/toDateTime functions are defined as follows: +:: + YYYY-MM-DD + YYYY-MM-DD hh:mm:ss + +As an exception, if converting from UInt32, Int32, UInt64, or Int64 type numbers to Date, and if the number is greater than or equal to 65536, the number is interpreted as a Unix timestamp (and not as the number of days) and is rounded to the date. 
This allows support for the common occurrence of writing 'toDate(unix_timestamp)', which otherwise would be an error and would require writing the more cumbersome 'toDate(toDateTime(unix_timestamp))'. + +Conversion between a date and date with time is performed the natural way: by adding a null time or dropping the time. + +Conversion between numeric types uses the same rules as assignments between different numeric types in C++. + +To do transformations on DateTime in a given time zone, pass a second argument with the time zone name: + +.. code-block:: sql + + SELECT + now() AS now_local, + toString(now(), 'Asia/Yekaterinburg') AS now_yekat + + ┌───────────now_local─┬─now_yekat───────────┐ + │ 2016-06-15 00:11:21 │ 2016-06-15 02:11:21 │ + └─────────────────────┴─────────────────────┘ + +To format DateTime in a given time zone: +:: + toString(now(), 'Asia/Yekaterinburg') + +To get the unix timestamp for a string with datetime in a specified time zone: +:: + toUnixTimestamp('2000-01-01 00:00:00', 'Asia/Yekaterinburg') + +toFixedString(s, N) +~~~~~~~~~~~~~~~~~~~~ +Converts a String type argument to a FixedString(N) type (a string with fixed length N). N must be a constant. If the string has fewer bytes than N, it is padded with null bytes to the right. If the string has more bytes than N, an exception is thrown. + +toStringCutToZero(s) +~~~~~~~~~~~~~~~~~~~~ +Accepts a String or FixedString argument. Returns a String that is truncated at the first null byte. + +Example: + +.. 
code-block:: sql + + :) SELECT toFixedString('foo', 8) AS s, toStringCutToZero(s) AS s_cut + + ┌─s─────────────┬─s_cut─┐ + │ foo\0\0\0\0\0 │ foo │ + └───────────────┴───────┘ + + :) SELECT toFixedString('foo\0bar', 8) AS s, toStringCutToZero(s) AS s_cut + + ┌─s──────────┬─s_cut─┐ + │ foo\0bar\0 │ foo │ + └────────────┴───────┘ + +reinterpretAsUInt8, reinterpretAsUInt16, reinterpretAsUInt32, reinterpretAsUInt64 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +reinterpretAsInt8, reinterpretAsInt16, reinterpretAsInt32, reinterpretAsInt64 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +reinterpretAsFloat32, reinterpretAsFloat64 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +reinterpretAsDate, reinterpretAsDateTime +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +These functions accept a string and interpret the bytes placed at the beginning of the string as a number in host order (little endian). If the string isn't long enough, the functions work as if the string is padded with the necessary number of null bytes. If the string is longer than needed, the extra bytes are ignored. A date is interpreted as the number of days since the beginning of the Unix Epoch, and a date with time is interpreted as the number of seconds since the beginning of the Unix Epoch. + +reinterpretAsString +~~~~~~~~~~~~~~~~~~~ +This function accepts a number or date or date with time, and returns a string containing bytes representing the corresponding value in host order (little endian). Null bytes are dropped from the end. For example, a UInt32 type value of 255 is a string that is one byte long. + +CAST(x, t) +~~~~~~~~~~ +Casts x to the t data type. +The syntax ``CAST(x AS t)`` is also supported. + +Example: + +.. 
code-block:: sql + + SELECT + '2016-06-15 23:00:00' AS timestamp, + CAST(timestamp AS DateTime) AS datetime, + CAST(timestamp AS Date) AS date, + CAST(timestamp, 'String') AS string, + CAST(timestamp, 'FixedString(22)') AS fixed_string + + ┌─timestamp───────────┬────────────datetime─┬───────date─┬─string──────────────┬─fixed_string──────────────┐ + │ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00 │ 2016-06-15 │ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00\0\0\0 │ + └─────────────────────┴─────────────────────┴────────────┴─────────────────────┴───────────────────────────┘ + +Casting to FixedString(N) works only for String and FixedString(N). diff --git a/docs/en/functions/url_functions.rst b/docs/en/functions/url_functions.rst new file mode 100644 index 00000000000..64bf0768dad --- /dev/null +++ b/docs/en/functions/url_functions.rst @@ -0,0 +1,118 @@ +Functions for working with URLs +------------------------ + +All these functions don't follow the RFC. They are maximally simplified for improved performance. + +Functions that extract part of a URL. +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +If there isn't anything similar in a URL, an empty string is returned. + +protocol +"""""""" +- Selects the protocol. Examples: http, ftp, mailto, magnet... + +domain +""""""" +- Selects the domain. + +domainWithoutWWW +"""""""""""" +- Selects the domain and removes no more than one 'www.' from the beginning of it, if present. + +topLevelDomain +""""""""""" +- Selects the top-level domain. Example: .ru. + +firstSignificantSubdomain +"""""""""""""" +- Selects the "first significant subdomain". This is a non-standard concept specific to Yandex.Metrica. The first significant subdomain is a second-level domain if it is 'com', 'net', 'org', or 'co'. Otherwise, it is a third-level domain. For example, firstSignificantSubdomain('https://news.yandex.ru/') = 'yandex', firstSignificantSubdomain('https://news.yandex.com.tr/') = 'yandex'. 
The list of "insignificant" second-level domains and other implementation details may change in the future. + +cutToFirstSignificantSubdomain +"""""""""""""""" +- Selects the part of the domain that includes top-level subdomains up to the "first significant subdomain" (see the explanation above). + +For example, ``cutToFirstSignificantSubdomain('https://news.yandex.com.tr/') = 'yandex.com.tr'``. + +path +"""" +- Selects the path. Example: ``/top/news.html``. The path does not include the query-string. + +pathFull +""""""" +- The same as above, but including query-string and fragment. Example: /top/news.html?page=2#comments + +queryString +""""""""" +- Selects the query-string. Example: page=1&lr=213. query-string does not include the first question mark, or # and everything that comes after #. + +fragment +"""""" +- Selects the fragment identifier. fragment does not include the first number sign (#). + +queryStringAndFragment +""""""""" +- Selects the query-string and fragment identifier. Example: page=1#29390. + +extractURLParameter(URL, name) +""""""""" +- Selects the value of the 'name' parameter in the URL, if present. Otherwise, selects an empty string. If there are many parameters with this name, it returns the first occurrence. This function works under the assumption that the parameter name is encoded in the URL in exactly the same way as in the argument passed. + +extractURLParameters(URL) +"""""""""" +- Gets an array of name=value strings corresponding to the URL parameters. The values are not decoded in any way. + +extractURLParameterNames(URL) +"""""""" +- Gets an array of name strings corresponding to the names of URL parameters. The values are not decoded in any way. + +URLHierarchy(URL) +""""""""" +- Gets an array containing the URL trimmed to the ``/``, ``?`` characters in the path and query-string. Consecutive separator characters are counted as one. The cut is made in the position after all the consecutive separator characters. 
Example: + +URLPathHierarchy(URL) +"""""""" +- The same thing, but without the protocol and host in the result. The / element (root) is not included. Example: +This function is used for implementing tree-view reports by URL in Yandex.Metrica. +:: + URLPathHierarchy('https://example.com/browse/CONV-6788') = + [ + '/browse/', + '/browse/CONV-6788' + ] + +decodeURLComponent(URL) +""""""""""" +Returns a URL-decoded URL. +Example: + +.. code-block:: sql + + :) SELECT decodeURLComponent('http://127.0.0.1:8123/?query=SELECT%201%3B') AS DecodedURL; + + ┌─DecodedURL─────────────────────────────┐ + │ http://127.0.0.1:8123/?query=SELECT 1; │ + └────────────────────────────────────────┘ + +Functions that remove part of a URL. +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +If the URL doesn't have anything similar, the URL remains unchanged. + +cutWWW +""""" +Removes no more than one 'www.' from the beginning of the URL's domain, if present. + +cutQueryString +"""""" +Removes the query-string. The question mark is also removed.. + +cutFragment +"""""""" +Removes the fragment identifier. The number sign is also removed. + +cutQueryStringAndFragment +"""""""""" +Removes the query-string and fragment identifier. The question mark and number sign are also removed. + +cutURLParameter(URL, name) +"""""""""" +Removes the URL parameter named 'name', if present. This function works under the assumption that the parameter name is encoded in the URL exactly the same way as in the passed argument. diff --git a/docs/en/functions/ym_dict_functions.rst b/docs/en/functions/ym_dict_functions.rst new file mode 100644 index 00000000000..cf9da55d5ab --- /dev/null +++ b/docs/en/functions/ym_dict_functions.rst @@ -0,0 +1,119 @@ +Functions for working with Yandex.Metrica dictionaries +---------------- +In order for the functions below to work, the server config must specify the paths and addresses for getting all the Yandex.Metrica dictionaries. 
The dictionaries are loaded at the first call of any of these functions. If the reference lists can't be loaded, an exception is thrown. + +For information about creating reference lists, see the section "Dictionaries". + +Multiple geobases +~~~~~~~~~ +ClickHouse supports working with multiple alternative geobases (regional hierarchies) simultaneously, in order to support various perspectives on which countries certain regions belong to. + +The 'clickhouse-server' config specifies the file with the regional hierarchy: +``/opt/geo/regions_hierarchy.txt`` + +Besides this file, it also searches for files nearby that have the _ symbol and any suffix appended to the name (before the file extension). +For example, it will also find the file ``/opt/geo/regions_hierarchy_ua.txt``, if present. + +``ua`` is called the dictionary key. For a dictionary without a suffix, the key is an empty string. + +All the dictionaries are re-loaded in runtime (once every certain number of seconds, as defined in the builtin_dictionaries_reload_interval config parameter, or once an hour by default). However, the list of available dictionaries is defined one time, when the server starts. + +All functions for working with regions have an optional argument at the end - the dictionary key. It is indicated as the geobase. +Example: +:: + regionToCountry(RegionID) - Uses the default dictionary: /opt/geo/regions_hierarchy.txt + regionToCountry(RegionID, '') - Uses the default dictionary: /opt/geo/regions_hierarchy.txt + regionToCountry(RegionID, 'ua') - Uses the dictionary for the 'ua' key: /opt/geo/regions_hierarchy_ua.txt + +regionToCity(id[, geobase]) +~~~~~~~~ +Accepts a UInt32 number - the region ID from the Yandex geobase. If this region is a city or part of a city, it returns the region ID for the appropriate city. Otherwise, returns 0. + +regionToArea(id[, geobase]) +~~~~~~~~ +Converts a region to an area (type 5 in the geobase). 
In every other way, this function is the same as 'regionToCity'. + +.. code-block:: sql + + SELECT DISTINCT regionToName(regionToArea(toUInt32(number), 'ua'), 'en') + FROM system.numbers + LIMIT 15 + + ┌─regionToName(regionToArea(toUInt32(number), \'ua\'), \'en\')─┐ + │ │ + │ Moscow and Moscow region │ + │ Saint-Petersburg and Leningradskaya oblast │ + │ Belgorod District │ + │ Ivanovo district │ + │ Kaluga District │ + │ Kostroma District │ + │ Kursk District │ + │ Lipetsk District │ + │ Orel District │ + │ Ryazhan District │ + │ Smolensk District │ + │ Tambov District │ + │ Tver District │ + │ Tula District │ + └──────────────────────────────────────────────────────────────┘ + +regionToDistrict(id[, geobase]) +~~~~~~~~~ +Converts a region to a federal district (type 4 in the geobase). In every other way, this function is the same as 'regionToCity'. + +.. code-block:: sql + + SELECT DISTINCT regionToName(regionToDistrict(toUInt32(number), 'ua'), 'en') + FROM system.numbers + LIMIT 15 + + ┌─regionToName(regionToDistrict(toUInt32(number), \'ua\'), \'en\')─┐ + │ │ + │ Central │ + │ Northwest │ + │ South │ + │ North Kavkaz │ + │ Volga Region │ + │ Ural │ + │ Siberian │ + │ Far East │ + │ Scotland │ + │ Faroe Islands │ + │ Flemish Region │ + │ Brussels-Capital Region │ + │ Wallonia │ + │ Federation of Bosnia and Herzegovina │ + └──────────────────────────────────────────────────────────────────┘ + +regionToCountry(id[, geobase]) +~~~~~~~~~ +Converts a region to a country. In every other way, this function is the same as 'regionToCity'. +Example: ``regionToCountry(toUInt32(213)) = 225`` converts ``Moscow (213)`` to ``Russia (225)``. + +regionToContinent(id[, geobase]) +~~~~~~~~~ +Converts a region to a continent. In every other way, this function is the same as 'regionToCity'. +Example: ``regionToContinent(toUInt32(213)) = 10001`` converts Moscow (213) to Eurasia (10001). + +regionToPopulation(id[, geobase]) +~~~~~~~~ +Gets the population for a region. 
+The population can be recorded in files with the geobase. See the section "External dictionaries". +If the population is not recorded for the region, it returns 0. +In the Yandex geobase, the population might be recorded for child regions, but not for parent regions. + +regionIn(lhs, rhs[, geobase]) +~~~~~~~~~~ +Checks whether a 'lhs' region belongs to a 'rhs' region. Returns a UInt8 number equal to 1 if it belongs, or 0 if it doesn't belong. +The relationship is reflexive - any region also belongs to itself. + +regionHierarchy(id[, geobase]) +~~~~~~~~~ +Accepts a UInt32 number - the region ID from the Yandex geobase. Returns an array of region IDs consisting of the passed region and all parents along the chain. +Example: ``regionHierarchy(toUInt32(213)) = [213,1,3,225,10001,10000]``. + +regionToName(id[, lang]) +~~~~~~~~ +Accepts a UInt32 number - the region ID from the Yandex geobase. A string with the name of the language can be passed as a second argument. Supported languages are: ru, en, ua, uk, by, kz, tr. If the second argument is omitted, the language 'ru' is used. If the language is not supported, an exception is thrown. Returns a string - the name of the region in the corresponding language. If the region with the specified ID doesn't exist, an empty string is returned. + +``ua`` and ``uk`` mean the same thing - Ukrainian. diff --git a/docs/en/getting_started.md b/docs/en/getting_started.md new file mode 100644 index 00000000000..b3097545f58 --- /dev/null +++ b/docs/en/getting_started.md @@ -0,0 +1,133 @@ +Начало работы +============= + +Системные требования +----------------- + +Система некроссплатформенная. Требуется ОС Linux Ubuntu не более старая, чем Precise (12.04); архитектура x86_64 с поддержкой набора инструкций SSE 4.2. +Для проверки наличия SSE 4.2, выполните: +:: + grep -q sse4_2 /proc/cpuinfo && echo "SSE 4.2 supported" || echo "SSE 4.2 not supported" + +Рекомендуется использовать Ubuntu Trusty или Ubuntu Xenial или Ubuntu Precise. 
+Терминал должен работать в кодировке UTF-8 (как по умолчанию в Ubuntu). + +Установка +----------------- + +В целях тестирования и разработки, система может быть установлена на один сервер или на рабочий компьютер. + +Установка из пакетов +~~~~~~~~~~~~~~~~~~~~ + +Пропишите в `/etc/apt/sources.list` (или в отдельный файл `/etc/apt/sources.list.d/clickhouse.list`) репозитории: +:: + deb http://repo.yandex.ru/clickhouse/trusty stable main + +На других версиях Ubuntu, замените `trusty` на `xenial` или `precise`. + +Затем выполните: +:: + sudo apt-key adv --keyserver keyserver.ubuntu.com --recv E0C56BD4 # optional + sudo apt-get update + sudo apt-get install clickhouse-client clickhouse-server-common + +Также можно скачать и установить пакеты вручную, отсюда: +http://repo.yandex.ru/clickhouse/trusty/pool/main/c/clickhouse/, +http://repo.yandex.ru/clickhouse/xenial/pool/main/c/clickhouse/, +http://repo.yandex.ru/clickhouse/precise/pool/main/c/clickhouse/. + +ClickHouse содержит настройки ограничения доступа. Они расположены в файле users.xml (рядом с config.xml). +По умолчанию, разрешён доступ отовсюду для пользователя default без пароля. См. секцию users/default/networks. +Подробнее смотрите в разделе "конфигурационные файлы". + +Установка из исходников +~~~~~~~~~~~~~~~~~~~~~~~ +Для сборки воспользуйтесь инструкцией: build.md + +Вы можете собрать пакеты и установить их. +Также вы можете использовать программы без установки пакетов. + +Клиент: dbms/src/Client/ +Сервер: dbms/src/Server/ + +Для сервера создаёте директории с данными, например: +:: + /opt/clickhouse/data/default/ + /opt/clickhouse/metadata/default/ + +(Настраивается в конфиге сервера.) +Сделайте chown под нужного пользователя. + +Обратите внимание на путь к логам в конфиге сервера (src/dbms/src/Server/config.xml). 
+ +Другие методы установки +~~~~~~~~~~~~~~~~~~~~~~~ +Docker образ: https://hub.docker.com/r/yandex/clickhouse-server/ + +Gentoo overlay: https://github.com/kmeaw/clickhouse-overlay + + +Запуск +------- + +Для запуска сервера (в качестве демона), выполните: +:: + sudo service clickhouse-server start + +Смотрите логи в директории `/var/log/clickhouse-server/` + +Если сервер не стартует - проверьте правильность конфигурации в файле `/etc/clickhouse-server/config.xml` + +Также можно запустить сервер из консоли: +:: + clickhouse-server --config-file=/etc/clickhouse-server/config.xml + +При этом, лог будет выводиться в консоль - удобно для разработки. +Если конфигурационный файл лежит в текущей директории, то указывать параметр --config-file не требуется - по умолчанию будет использован файл ./config.xml + +Соединиться с сервером можно с помощью клиента командной строки: +:: + clickhouse-client + +Параметры по умолчанию обозначают - соединяться с localhost:9000, от имени пользователя default без пароля. +Клиент может быть использован для соединения с удалённым сервером. Пример: +:: + clickhouse-client --host=example.com + +Подробнее смотри раздел "Клиент командной строки". + +Проверим работоспособность системы: +:: + milovidov@milovidov-Latitude-E6320:~/work/metrica/src/dbms/src/Client$ ./clickhouse-client + ClickHouse client version 0.0.18749. + Connecting to localhost:9000. + Connected to ClickHouse server version 0.0.18749. + + :) SELECT 1 + + SELECT 1 + + ┌─1─┐ + │ 1 │ + └───┘ + + 1 rows in set. Elapsed: 0.003 sec. + + :) + +Поздравляю, система работает! + +Тестовые данные +--------------- +Если вы сотрудник Яндекса, вы можете воспользоваться тестовыми данными Яндекс.Метрики для изучения возможностей системы. +Как загрузить тестовые данные, написано здесь. + +Если вы внешний пользователь системы, вы можете воспользоваться использовать общедоступные данные, способы загрузки которых указаны здесь. 
+ +Если возникли вопросы +--------------------- +Если вы являетесь сотрудником Яндекса, обращайтесь на внутреннюю рассылку по ClickHouse. +Вы можете подписаться на эту рассылку, чтобы получать анонсы, быть в курсе нововведений, а также видеть вопросы, которые возникают у других пользователей. + +Иначе вы можете задавать вопросы на Stackoverflow или участвовать в обсуждениях на Google Groups. Также вы можете отправить приватное сообщение для разработчиков по адресу clickhouse-feedback@yandex-team.com. diff --git a/docs/en/getting_started.rst b/docs/en/getting_started.rst new file mode 100644 index 00000000000..c91fe20979a --- /dev/null +++ b/docs/en/getting_started.rst @@ -0,0 +1,130 @@ +Getting started +============= + +System requirements +----------------- + +This is not a cross-platform system. It requires Linux Ubuntu Precise (12.04) or newer, x86_64 architecture with SSE 4.2 instruction set. +To test for SSE 4.2 support, do: +:: + grep -q sse4_2 /proc/cpuinfo && echo "SSE 4.2 supported" || echo "SSE 4.2 not supported" + +We recommend using Ubuntu Trusty or Ubuntu Xenial or Ubuntu Precise. +The terminal must use UTF-8 encoding (the default in Ubuntu). + +Installation +----------------- + +For testing and development, the system can be installed on a single server or on a desktop computer. + +Installing from packages +~~~~~~~~~~~~~~~~~~~~ + +In `/etc/apt/sources.list` (or in a separate `/etc/apt/sources.list.d/clickhouse.list` file), add the repository: +:: + deb http://repo.yandex.ru/clickhouse/trusty stable main + +For other Ubuntu versions, replace `trusty` with `xenial` or `precise`. 
+ +Then run: +:: + sudo apt-key adv --keyserver keyserver.ubuntu.com --recv E0C56BD4 # optional + sudo apt-get update + sudo apt-get install clickhouse-client clickhouse-server-common + +You can also download and install packages manually from here: +http://repo.yandex.ru/clickhouse/trusty/pool/main/c/clickhouse/, +http://repo.yandex.ru/clickhouse/xenial/pool/main/c/clickhouse/, +http://repo.yandex.ru/clickhouse/precise/pool/main/c/clickhouse/. + +ClickHouse contains access restriction settings. They are located in the 'users.xml' file (next to 'config.xml'). +By default, access is allowed from everywhere for the default user without a password. See 'users/default/networks'. For more information, see the section "Configuration files". + +Installing from source +~~~~~~~~~~~~~~~~~~~~~~~ +To build, follow the instructions in build.md (for Linux) or in build_osx.md (for Mac OS X). + +You can compile packages and install them. You can also use programs without installing packages. +:: + Client: dbms/src/Client/ + Server: dbms/src/Server/ + +For the server, create directories for the data, such as: +:: + /opt/clickhouse/data/default/ + /opt/clickhouse/metadata/default/ + +(Configured in the server config.) +Run 'chown' for the desired user. + +Note the path to logs in the server config (src/dbms/src/Server/config.xml). 
+ +Other methods of installation +~~~~~~~~~~~~~~~~~~~~~~~ +The Docker image is located here: https://hub.docker.com/r/yandex/clickhouse-server/ + +There is Gentoo overlay located here: https://github.com/kmeaw/clickhouse-overlay + + +Launch +------- + +To start the server (as a daemon), run: +:: + sudo service clickhouse-server start + +View the logs in the catalog `/var/log/clickhouse-server/` + +If the server doesn't start, check the configurations in the file `/etc/clickhouse-server/config.xml` + +You can also launch the server from the console: +:: + clickhouse-server --config-file=/etc/clickhouse-server/config.xml + +In this case, the log will be printed to the console, which is convenient during development. If the configuration file is in the current directory, you don't need to specify the '--config-file' parameter. By default, it uses './config.xml'. + +You can use the command-line client to connect to the server: +:: + clickhouse-client + +The default parameters indicate connecting with localhost:9000 on behalf of the user 'default' without a password. +The client can be used for connecting to a remote server. For example: +:: + clickhouse-client --host=example.com + +For more information, see the section "Command-line client". + +Checking the system: +:: + milovidov@milovidov-Latitude-E6320:~/work/metrica/src/dbms/src/Client$ ./clickhouse-client + ClickHouse client version 0.0.18749. + Connecting to localhost:9000. + Connected to ClickHouse server version 0.0.18749. + + :) SELECT 1 + + SELECT 1 + + ┌─1─┐ + │ 1 │ + └───┘ + + 1 rows in set. Elapsed: 0.003 sec. + + :) + +Congratulations, it works! + +Test data +--------------- +If you are Yandex employee, you can use Yandex.Metrica test data to explore the system's capabilities. You can find instructions for using the test data here. + +Otherwise, you could use one of available public datasets, described here. 
+ + +If you have questions +--------------------- +If you are a Yandex employee, use the internal ClickHouse mailing list. +You can subscribe to this list to get announcements, information on new developments, and questions that other users have. + +Otherwise, you can ask questions on Stack Overflow, join the discussion on Google Groups, or send a private message to the developers at clickhouse-feedback@yandex-team.com. diff --git a/docs/en/index.rst b/docs/en/index.rst new file mode 100644 index 00000000000..6e8752ca6d3 --- /dev/null +++ b/docs/en/index.rst @@ -0,0 +1,25 @@ + +Documentation +----------------- + +.. toctree:: + :maxdepth: 6 + + introduction/index + getting_started + interfaces/index + query_language/index + external_data + table_engines/index + system_tables/index + table_functions/index + formats/index + data_types/index + operators/index + functions/index + agg_functions/index + dicts/index + settings/index + configuration_files + access_rights + quotas diff --git a/docs/en/interfaces/cli.rst b/docs/en/interfaces/cli.rst new file mode 100644 index 00000000000..ff5b2d3f481 --- /dev/null +++ b/docs/en/interfaces/cli.rst @@ -0,0 +1,93 @@ +Command-line client +----------------------- +To work from the command line, you can use clickhouse-client: +:: + $ clickhouse-client + ClickHouse client version 0.0.26176. + Connecting to localhost:9000. + Connected to ClickHouse server version 0.0.26176. + + :) SELECT 1 + + +The ``clickhouse-client`` program accepts the following parameters, which are all optional: + +``--host, -h`` - The server name, by default - localhost. +You can use either the name or the IPv4 or IPv6 address. + +``--port`` - The port to connect to, by default - 9000. +Note that the HTTP interface and the native interface use different ports. + +``--user, -u`` - The username, by default - default. + +``--password`` - The password, by default - empty string. + +``--query, -q`` - Query to process when using non-interactive mode. 
+ +``--database, -d`` - Select the current default database, by default - the current DB from the server settings (by default, the 'default' DB). + +``--multiline, -m`` - If specified, allow multiline queries (do not send request on Enter). + +``--multiquery, -n`` - If specified, allow processing multiple queries separated by semicolons. +Only works in non-interactive mode. + +``--format, -f`` - Use the specified default format to output the result. +``--vertical, -E`` - If specified, use the Vertical format by default to output the result. This is the same as '--format=Vertical'. In this format, each value is printed on a separate line, which is helpful when displaying wide tables. +``--time, -t`` - If specified, print the query execution time to 'stderr' in non-interactive mode. +``--stacktrace`` - If specified, also print the stack trace if an exception occurs. +``--config-file`` - Name of the configuration file that has additional settings or changed defaults for the settings listed above. +By default, files are searched for in this order: +./clickhouse-client.xml +~/.clickhouse-client/config.xml +/etc/clickhouse-client/config.xml +Settings are only taken from the first file found. + +You can also specify any settings that will be used for processing queries. For example, ``clickhouse-client --max_threads=1``. For more information, see the section "Settings". + +The client can be used in interactive and non-interactive (batch) mode. +To use batch mode, specify the 'query' parameter, or send data to 'stdin' (it verifies that 'stdin' is not a terminal), or both. +Similar to the HTTP interface, when using the 'query' parameter and sending data to 'stdin', the request is a concatenation of the 'query' parameter, a line break, and the data in 'stdin'. This is convenient for large INSERT queries. 
+ +Examples for insert data via clickhouse-client: +:: + echo -ne "1, 'some text', '2016-08-14 00:00:00'\n2, 'some more text', '2016-08-14 00:00:01'" | clickhouse-client --database=test --query="INSERT INTO test FORMAT CSV"; + + cat <<_EOF | clickhouse-client --database=test --query="INSERT INTO test FORMAT CSV"; + 3, 'some text', '2016-08-14 00:00:00' + 4, 'some more text', '2016-08-14 00:00:01' + _EOF + + cat file.csv | clickhouse-client --database=test --query="INSERT INTO test FORMAT CSV"; + + +In batch mode, the default data format is TabSeparated. You can set the format in the FORMAT clause of the query. + +By default, you can only process a single query in batch mode. To make multiple queries from a "script," use the 'multiquery' parameter. This works for all queries except INSERT. Query results are output consecutively without additional separators. +Similarly, to process a large number of queries, you can run 'clickhouse-client' for each query. Note that it may take tens of milliseconds to launch the 'clickhouse-client' program. + +In interactive mode, you get a command line where you can enter queries. + +If 'multiline' is not specified (the default): +To run a query, press Enter. The semicolon is not necessary at the end of the query. To enter a multiline query, enter a backslash ``\`` before the line break - after you press Enter, you will be asked to enter the next line of the query. + +If 'multiline' is specified: +To run a query, end it with a semicolon and press Enter. If the semicolon was omitted at the end of the entered line, you will be asked to enter the next line of the query. + +You can specify ``\G`` instead of or after the semicolon. This indicates using Vertical format. In this format, each value is printed on a separate line, which is convenient for wide tables. This unusual feature was added for compatibility with the MySQL CLI. 
+ +The command line is based on 'readline' (and 'history') (or 'libedit', or even nothing, depending on build). In other words, it uses the familiar keyboard shortcuts and keeps a history. The history is written to ~/.clickhouse-client-history. + +By default, the format used is PrettyCompact. You can change the format in the FORMAT clause of the query, or by specifying '\G' at the end of the query, using the '--format' or '--vertical' argument in the command line, or using the client configuration file. + +To exit the client, press Ctrl+D (or Ctrl+C), or enter one of the following: +"exit", "quit", "logout", "учше", "йгше", "дщпщге", "exit;", "quit;", "logout;", "учшеж", "йгшеж", "дщпщгеж", "q", "й", "\q", "\Q", ":q", "\й", "\Й", "Жй" + +When processing a query, the client shows: +#. Progress, which is updated no more than 10 times per second (by default). For quick queries, the progress might not have time to be displayed. +#. The formatted query after parsing, for debugging. +#. The result in the specified format. +#. The number of lines in the result, the time passed, and the average speed of query processing. + +To cancel a lengthy query, press Ctrl+C. However, you will still need to wait a little for the server to abort the request. It is not possible to cancel a query at certain stages. If you don't wait and press Ctrl+C a second time, the client will exit. + +The command-line client allows passing external data (external temporary tables) for querying. For more information, see the section "External data for query processing". diff --git a/docs/en/interfaces/http_interface.rst b/docs/en/interfaces/http_interface.rst new file mode 100644 index 00000000000..016226f188c --- /dev/null +++ b/docs/en/interfaces/http_interface.rst @@ -0,0 +1,204 @@ +HTTP interface +============== + +The HTTP interface lets you use ClickHouse on any platform from any programming language. We use it for working from Java and Perl, as well as shell scripts. 
In other departments, the HTTP interface is used from Perl, Python, and Go. The HTTP interface is more limited than the native interface, but it has better compatibility. + +By default, clickhouse-server listens for HTTP on port 8123 (this can be changed in the config). +If you make a GET / request without parameters, it returns the string "Ok" (with a line break at the end). You can use this in health-check scripts. + +.. code-block:: bash + + $ curl 'http://localhost:8123/' + Ok. + +Send the request as a URL 'query' parameter, or as a POST. Or send the beginning of the request in the 'query' parameter, and the rest in the POST (we'll explain later why this is necessary). URL length is limited by 16KB, this limit should be taken into account when sending long queries in the 'query' parameter. + +If successful, you receive the 200 response code and the result in the response body. +If an error occurs, you receive the 500 response code and an error description text in the response body. + +When using the GET method, 'readonly' is set. In other words, for queries that modify data, you can only use the POST method. You can send the query itself either in the POST body, or in the URL parameter. + +Examples: + +.. code-block:: bash + + $ curl 'http://localhost:8123/?query=SELECT%201' + 1 + + $ wget -O- -q 'http://localhost:8123/?query=SELECT 1' + 1 + + $ GET 'http://localhost:8123/?query=SELECT 1' + 1 + + $ echo -ne 'GET /?query=SELECT%201 HTTP/1.0\r\n\r\n' | nc localhost 8123 + HTTP/1.0 200 OK + Connection: Close + Date: Fri, 16 Nov 2012 19:21:50 GMT + + 1 + +As you can see, curl is not very convenient because spaces have to be URL-escaped. Although wget escapes everything on its own, we don't recommend it because it doesn't work well over HTTP 1.1 when using keep-alive and Transfer-Encoding: chunked. + +.. 
code-block:: bash + + $ echo 'SELECT 1' | curl 'http://localhost:8123/' --data-binary @- + 1 + + $ echo 'SELECT 1' | curl 'http://localhost:8123/?query=' --data-binary @- + 1 + + $ echo '1' | curl 'http://localhost:8123/?query=SELECT' --data-binary @- + 1 + +If part of the query is sent in the parameter, and part in the POST, a line break is inserted between these two data parts. +Example (this won't work): + +.. code-block:: bash + + $ echo 'ECT 1' | curl 'http://localhost:8123/?query=SEL' --data-binary @- + Code: 59, e.displayText() = DB::Exception: Syntax error: failed at position 0: SEL + ECT 1 + , expected One of: SHOW TABLES, SHOW DATABASES, SELECT, INSERT, CREATE, ATTACH, RENAME, DROP, DETACH, USE, SET, OPTIMIZE., e.what() = DB::Exception + +By default, data is returned in TabSeparated format (for more information, see the "Formats" section). +You use the FORMAT clause of the query to request any other format. + +.. code-block:: bash + + $ echo 'SELECT 1 FORMAT Pretty' | curl 'http://localhost:8123/?' --data-binary @- + ┏━━━┓ + ┃ 1 ┃ + ┡━━━┩ + │ 1 │ + └───┘ + +The POST method of transmitting data is necessary for INSERT queries. In this case, you can write the beginning of the query in the URL parameter, and use POST to pass the data to insert. The data to insert could be, for example, a tab-separated dump from MySQL. In this way, the INSERT query replaces LOAD DATA LOCAL INFILE from MySQL. + +Examples: + +Creating a table: + +.. code-block:: bash + + echo 'CREATE TABLE t (a UInt8) ENGINE = Memory' | POST 'http://localhost:8123/' + +Using the familiar INSERT query for data insertion: + +.. code-block:: bash + + echo 'INSERT INTO t VALUES (1),(2),(3)' | POST 'http://localhost:8123/' + +Data can be sent separately from the query: + +.. code-block:: bash + + echo '(4),(5),(6)' | POST 'http://localhost:8123/?query=INSERT INTO t VALUES' + +You can specify any data format. The 'Values' format is the same as what is used when writing INSERT INTO t VALUES: + +.. 
code-block:: bash + + echo '(7),(8),(9)' | POST 'http://localhost:8123/?query=INSERT INTO t FORMAT Values' + +To insert data from a tab-separated dump, specify the corresponding format: + +.. code-block:: bash + + echo -ne '10\n11\n12\n' | POST 'http://localhost:8123/?query=INSERT INTO t FORMAT TabSeparated' + +Reading the table contents. Data is output in random order due to parallel query processing: + +.. code-block:: bash + + $ GET 'http://localhost:8123/?query=SELECT a FROM t' + 7 + 8 + 9 + 10 + 11 + 12 + 1 + 2 + 3 + 4 + 5 + 6 + +Deleting the table. + +.. code-block:: bash + + POST 'http://localhost:8123/?query=DROP TABLE t' + +For successful requests that don't return a data table, an empty response body is returned. + +You can use compression when transmitting data. The compressed data has a non-standard format, and you will need to use a special compressor program to work with it (`sudo apt-get install compressor-metrika-yandex`). + +If you specified 'compress=1' in the URL, the server will compress the data it sends you. +If you specified 'decompress=1' in the URL, the server will decompress the same data that you pass in the POST method. + +You can use this to reduce network traffic when transmitting a large amount of data, or for creating dumps that are immediately compressed. + +You can use the 'database' URL parameter to specify the default database. + +.. code-block:: bash + + $ echo 'SELECT number FROM numbers LIMIT 10' | curl 'http://localhost:8123/?database=system' --data-binary @- + 0 + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + +By default, the database that is registered in the server settings is used as the default database. By default, this is the database called 'default'. Alternatively, you can always specify the database using a dot before the table name. + +The username and password can be indicated in one of three ways: + +1. Using HTTP Basic Authentication. Example: :: + + echo 'SELECT 1' | curl 'http://user:password@localhost:8123/' -d @- + +2. 
In the 'user' and 'password' URL parameters. Example: :: + + echo 'SELECT 1' | curl 'http://localhost:8123/?user=user&password=password' -d @- + +3. Using 'X-ClickHouse-User' and 'X-ClickHouse-Key' headers. Example: :: + + echo 'SELECT 1' | curl -H "X-ClickHouse-User: user" -H "X-ClickHouse-Key: password" 'http://localhost:8123/' -d @- + + +If the user name is not indicated, the username 'default' is used. If the password is not indicated, an empty password is used. +You can also use the URL parameters to specify any settings for processing a single query, or entire profiles of settings. Example: +`http://localhost:8123/?profile=web&max_rows_to_read=1000000000&query=SELECT+1` + +For more information, see the section "Settings". + +.. code-block:: bash + + $ echo 'SELECT number FROM system.numbers LIMIT 10' | curl 'http://localhost:8123/?' --data-binary @- + 0 + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + +For information about other parameters, see the section "SET". + +In contrast to the native interface, the HTTP interface does not support the concept of sessions or session settings, does not allow aborting a query (to be exact, it allows this in only a few cases), and does not show the progress of query processing. Parsing and data formatting are performed on the server side, and using the network might be ineffective. + +The optional 'query_id' parameter can be passed as the query ID (any string). For more information, see the section "Settings, replace_running_query". + +The optional 'quota_key' parameter can be passed as the quota key (any string). It can also be passed as 'X-ClickHouse-Quota' header. For more information, see the section "Quotas". + +The HTTP interface allows passing external data (external temporary tables) for querying. For more information, see the section "External data for query processing". 
diff --git a/docs/en/interfaces/index.rst b/docs/en/interfaces/index.rst new file mode 100644 index 00000000000..e681fc3dffb --- /dev/null +++ b/docs/en/interfaces/index.rst @@ -0,0 +1,9 @@ +Interfaces +========== + +To explore the system's capabilities, load data into tables, or make manual queries, use the clickhouse-client program. + +.. toctree:: + :glob: + + * diff --git a/docs/en/interfaces/jdbc.rst b/docs/en/interfaces/jdbc.rst new file mode 100644 index 00000000000..57cbcb6fdfe --- /dev/null +++ b/docs/en/interfaces/jdbc.rst @@ -0,0 +1,4 @@ +JDBC driver +------------ + +There is an official JDBC driver for ClickHouse. See `here `_ . diff --git a/docs/en/interfaces/tcp.rst b/docs/en/interfaces/tcp.rst new file mode 100644 index 00000000000..be87c68d927 --- /dev/null +++ b/docs/en/interfaces/tcp.rst @@ -0,0 +1,4 @@ +Native interface (TCP) +---------------------- + +The native interface is used in the "clickhouse-client" command-line client, for interaction between servers with distributed query processing, and also in C++ programs. We will only cover the command-line client. diff --git a/docs/en/interfaces/third-party_client_libraries.rst b/docs/en/interfaces/third-party_client_libraries.rst new file mode 100644 index 00000000000..4b7310948f3 --- /dev/null +++ b/docs/en/interfaces/third-party_client_libraries.rst @@ -0,0 +1,30 @@ +Third-party client libraries +-------------------------------------- + +There exist third-party client libraries for ClickHouse: + +* Python + - `infi.clickhouse_orm `_ + - `sqlalchemy-clickhouse `_ +* PHP + - `clickhouse-php-client `_ + - `PhpClickHouseClient `_ + - `phpClickHouse `_ +* Go + - `clickhouse `_ + - `go-clickhouse `_ +* NodeJs + - `clickhouse `_ + - `node-clickhouse `_ +* Perl + - `perl-DBD-ClickHouse `_ + - `HTTP-ClickHouse `_ + - `AnyEvent-ClickHouse `_ +* Ruby + - `clickhouse `_ +* R + - `clickhouse-r `_ +* .NET + - `ClickHouse-Net `_ + +These libraries were not tested by us. Ordering is arbitrary. 
diff --git a/docs/en/interfaces/third-party_gui.rst b/docs/en/interfaces/third-party_gui.rst new file mode 100644 index 00000000000..f122ce8e904 --- /dev/null +++ b/docs/en/interfaces/third-party_gui.rst @@ -0,0 +1,15 @@ +Third-party GUI +------------------------------ + +There is an `open source project Tabix `_ by the company SMI2, which implements a graphical web interface for ClickHouse. + +Tabix key features: +- works with ClickHouse from the browser directly, without installing additional software; +- a query editor that supports ClickHouse SQL syntax highlighting, auto-completion for all objects (including dictionaries), and context-sensitive help for built-in functions; +- graphs, charts, and geo-referencing for mapping query results; +- an interactive pivot table designer for query results; +- graphical tools for analyzing ClickHouse; +- two color themes: light and dark. + + +`Tabix documentation `_ diff --git a/docs/en/introduction/distinctive_features.rst b/docs/en/introduction/distinctive_features.rst new file mode 100644 index 00000000000..4ac206a3057 --- /dev/null +++ b/docs/en/introduction/distinctive_features.rst @@ -0,0 +1,62 @@ +Distinctive features of ClickHouse +=================================== + +1. True column-oriented DBMS. +--------------------------------- +In a true column-oriented DBMS, there isn't any "garbage" stored with the values. For example, constant-length values must be supported, to avoid storing their length "number" next to the values. As an example, a billion UInt8-type values should actually consume around 1 GB uncompressed, or this will strongly affect the CPU use. It is very important to store data compactly (without any "garbage") even when uncompressed, since the speed of decompression (CPU usage) depends mainly on the volume of uncompressed data. 
+ +This is worth noting because there are systems that can store values of separate columns separately, but that can't effectively process analytical queries due to their optimization for other scenarios. Example are HBase, BigTable, Cassandra, and HyperTable. In these systems, you will get throughput around a hundred thousand rows per second, but not hundreds of millions of rows per second. + +Also note that ClickHouse is a DBMS, not a single database. ClickHouse allows creating tables and databases in runtime, loading data, and running queries without reconfiguring and restarting the server. + +2. Data compression. +----------------- +Some column-oriented DBMSs (InfiniDB CE and MonetDB) do not use data compression. However, data compression really improves performance. + +3. Disk storage of data. +---------------------------- +Many column-oriented DBMSs (SAP HANA, and Google PowerDrill) can only work in RAM. But even on thousands of servers, the RAM is too small for storing all the pageviews and sessions in Yandex.Metrica. + +4. Parallel processing on multiple cores. +--------------------------------------------------------------- +Large queries are parallelized in a natural way. + +5. Distributed processing on multiple servers. +----------------------------------------------- +Almost none of the columnar DBMSs listed above have support for distributed processing. +In ClickHouse, data can reside on different shards. Each shard can be a group of replicas that are used for fault tolerance. The query is processed on all the shards in parallel. This is transparent for the user. + +6. SQL support. +--------------- +If you are familiar with standard SQL, we can't really talk about SQL support. +NULLs are not supported. All the functions have different names. However, this is a declarative query language based on SQL that can't be differentiated from SQL in many instances. +JOINs are supported. Subqueries are supported in FROM, IN, JOIN clauses; and scalar subqueries. 
+Correlated subqueries are not supported. + +7. Vector engine. +----------------- +Data is not only stored by columns, but is processed by vectors - parts of columns. This allows us to achieve high CPU performance. + +8. Real-time data updates. +----------------------- +ClickHouse supports primary key tables. In order to quickly perform queries on the range of the primary key, the data is sorted incrementally using the merge tree. Due to this, data can continually be added to the table. There is no locking when adding data. + +9. Indexes. +----------------- +Having a primary key allows, for example, extracting data for specific clients (Metrica counters) for a specific time range, with a latency of less than several dozen milliseconds. + +10. Suitable for online queries. +------------------ +This lets us use the system as the back-end for a web interface. Low latency means queries can be processed without delay, while the Yandex.Metrica interface page is loading (in online mode). + +11. Support for approximated calculations. +----------------- + +#. The system contains aggregate functions for approximated calculation of the number of distinct values, medians, and quantiles. +#. Supports running a query based on a part (sample) of data and getting an approximated result. In this case, proportionally less data is retrieved from the disk. +#. Supports running an aggregation for a limited number of random keys, instead of for all keys. Under certain conditions for key distribution in the data, this provides a reasonably accurate result while using fewer resources. + +12. Data replication and support for data integrity on replicas. +----------------- +Uses asynchronous multimaster replication. After being written to any available replica, data is distributed to all the remaining replicas. The system maintains identical data on different replicas. Data is restored automatically after a failure, or using a "button" for complex cases. 
+For more information, see the section "Data replication". diff --git a/docs/en/introduction/features.rst b/docs/en/introduction/features.rst new file mode 100644 index 00000000000..3f462ac7140 --- /dev/null +++ b/docs/en/introduction/features.rst @@ -0,0 +1,6 @@ +ClickHouse features that can be considered disadvantages +------------------------------------------------------------ + +#. No transactions. +#. For aggregation, query results must fit in the RAM on a single server. However, the volume of source data for a query may be indefinitely large. +#. Lack of full-fledged UPDATE/DELETE implementation. diff --git a/docs/en/introduction/index.rst b/docs/en/introduction/index.rst new file mode 100644 index 00000000000..ead7d9e53f5 --- /dev/null +++ b/docs/en/introduction/index.rst @@ -0,0 +1,13 @@ +Introduction +========= + +.. toctree:: + :glob: + + what_is_clickhouse + distinctive_features + features + ya_metrika_task + use_case + possible_silly_questions + performance diff --git a/docs/en/introduction/performance.rst b/docs/en/introduction/performance.rst new file mode 100644 index 00000000000..9340c3a72bf --- /dev/null +++ b/docs/en/introduction/performance.rst @@ -0,0 +1,21 @@ +Performance +=================== +According to internal testing results, ClickHouse shows the best performance for comparable operating scenarios among systems of its class that were available for testing. This includes the highest throughput for long queries, and the lowest latency on short queries. Testing results are shown on this page. + +Throughput for a single large query +------------------------------- +Throughput can be measured in rows per second or in megabytes per second. If the data is placed in the page cache, a query that is not too complex is processed on modern hardware at a speed of approximately 2-10 GB/s of uncompressed data on a single server (for the simplest cases, the speed may reach 30 GB/s). 
If data is not placed in the page cache, the speed depends on the disk subsystem and the data compression rate. For example, if the disk subsystem allows reading data at 400 MB/s, and the data compression rate is 3, the speed will be around 1.2 GB/s. To get the speed in rows per second, divide the speed in bytes per second by the total size of the columns used in the query. For example, if 10 bytes of columns are extracted, the speed will be around 100-200 million rows per second. + +The processing speed increases almost linearly for distributed processing, but only if the number of rows resulting from aggregation or sorting is not too large. + +Latency when processing short queries. +-------------------- +If a query uses a primary key and does not select too many rows to process (hundreds of thousands), and does not use too many columns, we can expect less than 50 milliseconds of latency (single digits of milliseconds in the best case) if data is placed in the page cache. Otherwise, latency is calculated from the number of seeks. If you use rotating drives, for a system that is not overloaded, the latency is calculated by this formula: seek time (10 ms) * number of columns queried * number of data parts. + +Throughput when processing a large quantity of short queries. +-------------------- +Under the same conditions, ClickHouse can handle several hundred queries per second on a single server (up to several thousand in the best case). Since this scenario is not typical for analytical DBMSs, we recommend expecting a maximum of 100 queries per second. + +Performance on data insertion. +------------------ +We recommend inserting data in packets of at least 1000 rows, or no more than a single request per second. When inserting to a MergeTree table from a tab-separated dump, the insertion speed will be from 50 to 200 MB/s. If the inserted rows are around 1 Kb in size, the speed will be from 50,000 to 200,000 rows per second. 
If the rows are small, the performance will be higher in rows per second (on Yandex Banner System data -> 500,000 rows per second, on Graphite data -> 1,000,000 rows per second). To improve performance, you can make multiple INSERT queries in parallel, and performance will increase linearly. diff --git a/docs/en/introduction/possible_silly_questions.rst b/docs/en/introduction/possible_silly_questions.rst new file mode 100644 index 00000000000..fdf4bad0922 --- /dev/null +++ b/docs/en/introduction/possible_silly_questions.rst @@ -0,0 +1,19 @@ +Possible silly questions +----------------------- + +1. Why not use systems like map-reduce? +""""""""""""""""""" + +Systems like map-reduce are distributed computing systems, where the reduce phase is performed using distributed sorting. +Regarding this aspect, map-reduce is similar to other systems like YAMR, Hadoop, YT. + +These systems are not suitable for online queries because of latency, so they can't be used as a back-end for a web interface. +Systems like this are also not suitable for real-time updates. +Distributed sorting is not an optimal solution for reduce operations if the result of the operation and all intermediate results, should they exist, fit in the RAM of a single server, as usually happens in the case of online analytical queries. +In this case the optimal way to perform reduce operations is by using a hash-table. A common optimization method for map-reduce tasks is the combine operation (partial reduce), which uses hash-tables in memory. This optimization is done by the user manually. +Distributed sorting is the main reason for long latencies of simple map-reduce jobs. + +Systems similar to map-reduce enable running any code on the cluster. But for OLAP use-cases declarative query languages are better suited, as they allow investigations to be carried out faster. For example, for Hadoop there are Hive and Pig. 
There are others: Cloudera Impala, Shark (deprecated) and Spark SQL for Spark, Presto, Apache Drill. +However, performance of such tasks is highly sub-optimal compared to the performance of specialized systems and relatively high latency does not allow the use of these systems as a backend for the web interface. + +YT allows you to store separate groups of columns. But YT is not a truly columnar storage system, as the system has no fixed length data types (so you can efficiently store a number without "garbage"), and there is no vector engine. Tasks in YT are performed by arbitrary code in streaming mode, so can not be sufficiently optimized (up to hundreds of millions of lines per second per server). In 2014-2016 YT is to develop "dynamic table sorting" functionality using Merge Tree, strongly typed values ​​and SQL-like language support. Dynamically sorted tables are not suited for OLAP tasks, since the data is stored in rows. Query language development in YT is still in incubating phase, which does not allow it to focus on this functionality. YT developers are considering dynamically sorted tables for use in OLTP and Key-Value scenarios. diff --git a/docs/en/introduction/use_case.rst b/docs/en/introduction/use_case.rst new file mode 100644 index 00000000000..a7bbf51cc5f --- /dev/null +++ b/docs/en/introduction/use_case.rst @@ -0,0 +1,13 @@ +Usage in Yandex.Metrica and other Yandex services +------------------------------------------ + +ClickHouse is used for multiple purposes in Yandex.Metrica. Its main task is to build reports in online mode using non-aggregated data. It uses a cluster of 374 servers, which store over 20.3 trillion rows in the database. The volume of compressed data, without counting duplication and replication, is about 2 PB. The volume of uncompressed data (in TSV format) would be approximately 17 PB. + +ClickHouse is also used for: + * Storing WebVisor data. + * Processing intermediate data. + * Building global reports with Analytics. 
+ * Running queries for debugging the Metrica engine. + * Analyzing logs from the API and the user interface. + +ClickHouse has at least a dozen installations in other Yandex services: in search verticals, Market, Direct, business analytics, mobile development, AdFox, personal services, and others. diff --git a/docs/en/introduction/what_is_clickhouse.rst b/docs/en/introduction/what_is_clickhouse.rst new file mode 100644 index 00000000000..06d4857e1d7 --- /dev/null +++ b/docs/en/introduction/what_is_clickhouse.rst @@ -0,0 +1,112 @@ +What is ClickHouse? +==================== + +ClickHouse is a columnar DBMS for OLAP. + +In a "normal" row-oriented DBMS, data is stored in this order: +:: + 5123456789123456789 1 Eurobasket - Greece - Bosnia and Herzegovina - example.com 1 2011-09-01 01:03:02 6274717 1294101174 11409 612345678912345678 0 33 6 http://www.example.com/basketball/team/123/match/456789.html http://www.example.com/basketball/team/123/match/987654.html 0 1366 768 32 10 3183 0 0 13 0\0 1 1 0 0 2011142 -1 0 0 01321 613 660 2011-09-01 08:01:17 0 0 0 0 utf-8 1466 0 0 0 5678901234567890123 277789954 0 0 0 0 0 + 5234985259563631958 0 Consulting, Tax assessment, Accounting, Law 1 2011-09-01 01:03:02 6320881 2111222333 213 6458937489576391093 0 3 2 http://www.example.ru/ 0 800 600 16 10 2 153.1 0 0 10 63 1 1 0 0 2111678 000 0 588 368 240 2011-09-01 01:03:17 4 0 60310 0 windows-1251 1466 0 000 778899001 0 0 0 0 0 +... + +In other words, all the values related to a row are stored next to each other. Examples of a row-oriented DBMS are MySQL, Postgres, MS SQL Server, and others. 
+ +In a column-oriented DBMS, data is stored like this: +:: + WatchID: 5385521489354350662 5385521490329509958 5385521489953706054 5385521490476781638 5385521490583269446 5385521490218868806 5385521491437850694 5385521491090174022 5385521490792669254 5385521490420695110 5385521491532181574 5385521491559694406 5385521491459625030 5385521492275175494 5385521492781318214 5385521492710027334 5385521492955615302 5385521493708759110 5385521494506434630 5385521493104611398 + JavaEnable: 1 0 1 0 0 0 1 0 1 1 1 1 1 1 0 1 0 0 1 1 + Title: Yandex Announcements - Investor Relations - Yandex Yandex — Contact us — Moscow Yandex — Mission Ru Yandex — History — History of Yandex Yandex Financial Releases - Investor Relations - Yandex Yandex — Locations Yandex Board of Directors - Corporate Governance - Yandex Yandex — Technologies + GoodEvent: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 + EventTime: 2016-05-18 05:19:20 2016-05-18 08:10:20 2016-05-18 07:38:00 2016-05-18 01:13:08 2016-05-18 00:04:06 2016-05-18 04:21:30 2016-05-18 00:34:16 2016-05-18 07:35:49 2016-05-18 11:41:59 2016-05-18 01:13:32 + +These examples only show the order that data is arranged in. +The values from different columns are stored separately, and data from the same column is stored together. +Examples of a column-oriented DBMS: ``Vertica``, ``Paraccel (Actian Matrix) (Amazon Redshift)``, ``Sybase IQ``, ``Exasol``, ``Infobright``, ``InfiniDB``, ``MonetDB (VectorWise) (Actian Vector)``, ``LucidDB``, ``SAP HANA``, ``Google Dremel``, ``Google PowerDrill``, ``Druid``, ``kdb+``, and so on. + +Different orders for storing data are better suited to different scenarios. 
+The data access scenario refers to what queries are made, how often, and in what proportion; how much data is read for each type of query - rows, columns, and bytes; the relationship between reading and updating data; the working size of the data and how locally it is used; whether transactions are used, and how isolated they are; requirements for data replication and logical integrity; requirements for latency and throughput for each type of query, and so on. + +The higher the load on the system, the more important it is to customize the system to the scenario, and the more specific this customization becomes. There is no system that is equally well-suited to significantly different scenarios. If a system is adaptable to a wide set of scenarios, under a high load, the system will handle all the scenarios equally poorly, or will work well for just one of the scenarios. + +We'll say that the following is true for the OLAP (online analytical processing) scenario: + +* The vast majority of requests are for read access. +* Data is updated in fairly large batches (> 1000 rows), not by single rows; or it is not updated at all. +* Data is added to the DB but is not modified. +* For reads, quite a large number of rows are extracted from the DB, but only a small subset of columns. +* Tables are "wide," meaning they contain a large number of columns. +* Queries are relatively rare (usually hundreds of queries per server or less per second). +* For simple queries, latencies around 50 ms are allowed. +* Column values are fairly small - numbers and short strings (for example, 60 bytes per URL). +* Requires high throughput when processing a single query (up to billions of rows per second per server). +* There are no transactions. +* Low requirements for data consistency. +* There is one large table per query. All tables are small, except for one. +* A query result is significantly smaller than the source data. That is, data is filtered or aggregated. 
 The result fits in a single server's RAM. + +It is easy to see that the OLAP scenario is very different from other popular scenarios (such as OLTP or Key-Value access). So it doesn't make sense to try to use an OLTP or Key-Value DB for processing analytical queries if you want to get decent performance. For example, if you try to use MongoDB or Elliptics for analytics, you will get very poor performance compared to OLAP databases. + +Column-oriented databases are better suited to OLAP scenarios (at least 100 times better in processing speed for most queries), for the following reasons: + +1. For I/O. +1.1. For an analytical query, only a small number of table columns need to be read. In a column-oriented database, you can read just the data you need. For example, if you need 5 columns out of 100, you can expect a 20-fold reduction in I/O. +1.2. Since data is read in packets, it is easier to compress. Data in columns is also easier to compress. This further reduces the I/O volume. +1.3. Due to the reduced I/O, more data fits in the system cache. + +For example, the query "count the number of records for each advertising platform" requires reading one "advertising platform ID" column, which takes up 1 byte uncompressed. If most of the traffic was not from advertising platforms, you can expect at least 10-fold compression of this column. When using a quick compression algorithm, data decompression is possible at a speed of at least several gigabytes of uncompressed data per second. In other words, this query can be processed at a speed of approximately several billion rows per second on a single server. This speed is actually achieved in practice. + +Example: +:: + milovidov@████████.yandex.ru:~$ clickhouse-client + ClickHouse client version 0.0.52053. + Connecting to localhost:9000. + Connected to ClickHouse server version 0.0.52053. 
+ + :) SELECT CounterID, count() FROM hits GROUP BY CounterID ORDER BY count() DESC LIMIT 20 + + SELECT + CounterID, + count() + FROM hits + GROUP BY CounterID + ORDER BY count() DESC + LIMIT 20 + + ┌─CounterID─┬──count()─┐ + │ 114208 │ 56057344 │ + │ 115080 │ 51619590 │ + │ 3228 │ 44658301 │ + │ 38230 │ 42045932 │ + │ 145263 │ 42042158 │ + │ 91244 │ 38297270 │ + │ 154139 │ 26647572 │ + │ 150748 │ 24112755 │ + │ 242232 │ 21302571 │ + │ 338158 │ 13507087 │ + │ 62180 │ 12229491 │ + │ 82264 │ 12187441 │ + │ 232261 │ 12148031 │ + │ 146272 │ 11438516 │ + │ 168777 │ 11403636 │ + │ 4120072 │ 11227824 │ + │ 10938808 │ 10519739 │ + │ 74088 │ 9047015 │ + │ 115079 │ 8837972 │ + │ 337234 │ 8205961 │ + └───────────┴──────────┘ + + 20 rows in set. Elapsed: 0.153 sec. Processed 1.00 billion rows, 4.00 GB (6.53 billion rows/s., 26.10 GB/s.) + + :) + +2. For CPU. +Since executing a query requires processing a large number of rows, it helps to dispatch all operations for entire vectors instead of for separate rows, or to implement the query engine so that there is almost no dispatching cost. If you don't do this, with any half-decent disk subsystem, the query interpreter inevitably stalls the CPU. +It makes sense to both store data in columns and process it, when possible, by columns. + +There are two ways to do this: +1. A vector engine. All operations are written for vectors, instead of for separate values. This means you don't need to call operations very often, and dispatching costs are negligible. Operation code contains an optimized internal cycle. +2. Code generation. The code generated for the query has all the indirect calls in it. + +This is not done in "normal" databases, because it doesn't make sense when running simple queries. However, there are exceptions. For example, MemSQL uses code generation to reduce latency when processing SQL queries. (For comparison, analytical DBMSs require optimization of throughput, not latency.) 
+ +Note that for CPU efficiency, the query language must be declarative (SQL or MDX), or at least a vector (J, K). The query should only contain implicit loops, allowing for optimization. diff --git a/docs/en/introduction/ya_metrika_task.rst b/docs/en/introduction/ya_metrika_task.rst new file mode 100644 index 00000000000..6e98d268aa5 --- /dev/null +++ b/docs/en/introduction/ya_metrika_task.rst @@ -0,0 +1,26 @@ +The Yandex.Metrica task +---------------------------------- + +We need to get custom reports based on hits and sessions, with custom segments set by the user. Data for the reports is updated in real-time. Queries must be run immediately (in online mode). We must be able to build reports for any time period. Complex aggregates must be calculated, such as the number of unique visitors. +At this time (April 2014), Yandex.Metrica receives approximately 12 billion events (pageviews and mouse clicks) daily. All these events must be stored in order to build custom reports. A single query may require scanning hundreds of millions of rows over a few seconds, or millions of rows in no more than a few hundred milliseconds. + +Aggregated and non-aggregated data +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +There is a popular opinion that in order to effectively calculate statistics, you must aggregate data, since this reduces the volume of data. + +But data aggregation is a very limited solution, for the following reasons: + +* You must have a pre-defined list of reports the user will need. The user can't make custom reports. +* When aggregating a large quantity of keys, the volume of data is not reduced, and aggregation is useless. +* For a large number of reports, there are too many aggregation variations (combinatorial explosion). +* When aggregating keys with high cardinality (such as URLs), the volume of data is not reduced by much (less than twofold). For this reason, the volume of data with aggregation might grow instead of shrink. 
+* Users will not view all the reports we calculate for them. A large portion of calculations are useless. +* The logical integrity of data may be violated for various aggregations. + +If we do not aggregate anything and work with non-aggregated data, this might actually reduce the volume of calculations. + +However, with aggregation, a significant part of the work is taken offline and completed relatively calmly. In contrast, online calculations require calculating as fast as possible, since the user is waiting for the result. + +Yandex.Metrica has a specialized system for aggregating data called Metrage, which is used for the majority of reports. Starting in 2009, Yandex.Metrica also used a specialized OLAP database for non-aggregated data called OLAPServer, which was previously used for the report builder. OLAPServer worked well for non-aggregated data, but it had many restrictions that did not allow it to be used for all reports as desired. These included the lack of support for data types (only numbers), and the inability to incrementally update data in real-time (it could only be done by rewriting data daily). OLAPServer is not a DBMS, but a specialized DB. + +To remove the limitations of OLAPServer and solve the problem of working with non-aggregated data for all reports, we developed the ClickHouse DBMS. diff --git a/docs/en/operators/index.rst b/docs/en/operators/index.rst new file mode 100644 index 00000000000..45db6586c4d --- /dev/null +++ b/docs/en/operators/index.rst @@ -0,0 +1,138 @@ +Operators +========= + +All operators are transformed to the corresponding functions at the query parsing stage, in accordance with their precedence and associativity. + +Access operators +----------------- + +``a[N]`` - Access to an array element, arrayElement(a, N) function. + +``a.N`` - Access to a tuple element, tupleElement(a, N) function. 
+ +Numeric negation operator +---------------------------- + +``-a`` - negate(a) function + +Multiplication and division operators +------------------------------------- + +``a * b`` - multiply(a, b) function + +``a / b`` - divide(a, b) function + +``a % b`` - modulo(a, b) function + +Addition and subtraction operators +---------------------------------- + +``a + b`` - plus(a, b) function + +``a - b`` - minus(a, b) function + +Comparison operators +-------------------- + +``a = b`` - equals(a, b) function + +``a == b`` - equals(a, b) function + +``a != b`` - notEquals(a, b) function + +``a <> b`` - notEquals(a, b) function + +``a <= b`` - lessOrEquals(a, b) function + +``a >= b`` - greaterOrEquals(a, b) function + +``a < b`` - less(a, b) function + +``a > b`` - greater(a, b) function + +``a LIKE s`` - like(a, s) function + +``a NOT LIKE s`` - notLike(a, s) function + +``a BETWEEN b AND c`` - equivalent to a >= b AND a <= c + + +Operators for working with data sets +------------------------------------ + +*See the section "IN operators".* + +``a IN ...`` - in(a, b) function + +``a NOT IN ...`` - notIn(a, b) function + +``a GLOBAL IN ...`` - globalIn(a, b) function + +``a GLOBAL NOT IN ...`` - globalNotIn(a, b) function + + + +Logical negation operator +------------------------------ + +``NOT a`` - ``not(a)`` function + + +Logical "AND" operator +------------------------- + +``a AND b`` - function ``and(a, b)`` + + +Logical "OR" operator +--------------------------- + +``a OR b`` - function ``or(a, b)`` + +Conditional operator +-------------------- + +``a ? b : c`` - function ``if(a, b, c)`` + +Conditional expression +---------------------- + +.. code-block:: sql + + CASE [x] + WHEN a THEN b + [WHEN ... THEN ...] + ELSE c + END + +If x is given - transform(x, [a, ...], [b, ...], c). Otherwise, multiIf(a, b, ..., c). 
+ +String concatenation operator +----------------------------- + +``s1 || s2`` - concat(s1, s2) function + +Lambda creation operator +---------------------------------- + +``x -> expr`` - lambda(x, expr) function + +The following operators do not have a priority, since they are brackets: + +Array creation operator +-------------------------- + +``[x1, ...]`` - array(x1, ...) function + +Tuple creation operator +------------------------- +``(x1, x2, ...)`` - tuple(x1, x2, ...) function + + +Associativity +---------------- + +All binary operators have left associativity. For example, ``'1 + 2 + 3'`` is transformed to ``'plus(plus(1, 2), 3)'``. +Sometimes this doesn't work the way you expect. For example, ``'SELECT 4 > 3 > 2'`` results in ``0``. + +For efficiency, the 'and' and 'or' functions accept any number of arguments. The corresponding chains of AND and OR operators are transformed to a single call of these functions. diff --git a/docs/en/query_language/clickhouse_local.rst b/docs/en/query_language/clickhouse_local.rst new file mode 100644 index 00000000000..7121d2b7473 --- /dev/null +++ b/docs/en/query_language/clickhouse_local.rst @@ -0,0 +1,4 @@ +clickhouse-local +-------------------------- + +The ``clickhouse-local`` application can quickly process local files that store tables, without the need to deploy and configure clickhouse-server ... diff --git a/docs/en/query_language/index.rst b/docs/en/query_language/index.rst new file mode 100644 index 00000000000..7d11bcfe297 --- /dev/null +++ b/docs/en/query_language/index.rst @@ -0,0 +1,7 @@ +Query language +============== + +.. toctree:: + :glob: + + * diff --git a/docs/en/query_language/queries.rst b/docs/en/query_language/queries.rst new file mode 100644 index 00000000000..055c2fb618c --- /dev/null +++ b/docs/en/query_language/queries.rst @@ -0,0 +1,1402 @@ +Queries +------- + +CREATE DATABASE +~~~~~~~~~~~~~~~ +Creates the 'db_name' database. 
+:: + CREATE DATABASE [IF NOT EXISTS] db_name + +A database is just a directory for tables. +If "IF NOT EXISTS" is included, the query won't return an error if the database already exists. + +CREATE TABLE +~~~~~~~~~~~~ +The ``CREATE TABLE`` query can have several forms. +:: + CREATE [TEMPORARY] TABLE [IF NOT EXISTS] [db.]name + ( + name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], + name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2], + ... + ) ENGINE = engine + +Creates a table named 'name' in the 'db' database or the current database if 'db' is not set, with the structure specified in brackets and the 'engine' engine. The structure of the table is a list of column descriptions. If indexes are supported by the engine, they are indicated as parameters for the table engine. + +A column description is ``name type`` in the simplest case. For example: ``RegionID UInt32``. +Expressions can also be defined for default values (see below). +:: + CREATE [TEMPORARY] TABLE [IF NOT EXISTS] [db.]name AS [db2.]name2 [ENGINE = engine] + +Creates a table with the same structure as another table. You can specify a different engine for the table. If the engine is not specified, the same engine will be used as for the 'db2.name2' table. +:: + CREATE [TEMPORARY] TABLE [IF NOT EXISTS] [db.]name ENGINE = engine AS SELECT ... + +Creates a table with a structure like the result of the ``SELECT`` query, with the 'engine' engine, and fills it with data from SELECT. + +In all cases, if IF NOT EXISTS is specified, the query won't return an error if the table already exists. In this case, the query won't do anything. + +Default values +""""""""""""""""""""" +The column description can specify an expression for a default value, in one of the following ways: +``DEFAULT expr``, ``MATERIALIZED expr``, ``ALIAS expr``. +Example: ``URLDomain String DEFAULT domain(URL)``. 
+ +If an expression for the default value is not defined, the default values will be set to zeros for numbers, empty strings for strings, empty arrays for arrays, and 0000-00-00 for dates or 0000-00-00 00:00:00 for dates with time. NULLs are not supported. + +If the default expression is defined, the column type is optional. If there isn't an explicitly defined type, the default expression type is used. Example: ``EventDate DEFAULT toDate(EventTime)`` - the 'Date' type will be used for the 'EventDate' column. + +If the data type and default expression are defined explicitly, this expression will be cast to the specified type using type casting functions. Example: ``Hits UInt32 DEFAULT 0`` means the same thing as ``Hits UInt32 DEFAULT toUInt32(0)``. + +Default expressions may be defined as an arbitrary expression from table constants and columns. When creating and changing the table structure, it checks that expressions don't contain loops. For INSERT, it checks that expressions are resolvable - that all columns they can be calculated from have been passed. + +``DEFAULT expr`` + +Normal default value. If the INSERT query doesn't specify the corresponding column, it will be filled in by computing the corresponding expression. + +``MATERIALIZED expr`` + +Materialized expression. Such a column can't be specified for INSERT, because it is always calculated. +For an INSERT without a list of columns, these columns are not considered. +In addition, this column is not substituted when using an asterisk in a SELECT query. This is to preserve the invariant that the dump obtained using SELECT * can be inserted back into the table using INSERT without specifying the list of columns. + +``ALIAS expr`` + +Synonym. Such a column isn't stored in the table at all. +Its values can't be inserted in a table, and it is not substituted when using an asterisk in a SELECT query. +It can be used in SELECTs if the alias is expanded during query parsing. 
+ +When using the ALTER query to add new columns, old data for these columns is not written. Instead, when reading old data that does not have values for the new columns, expressions are computed on the fly by default. However, if running the expressions requires different columns that are not indicated in the query, these columns will additionally be read, but only for the blocks of data that need it. + +If you add a new column to a table but later change its default expression, the values used for old data will change (for data where values were not stored on the disk). Note that when running background merges, data for columns that are missing in one of the merging parts is written to the merged part. + +It is not possible to set default values for elements in nested data structures. + +Temporary tables +""""""""""""""""" +In all cases, if TEMPORARY is specified, a temporary table will be created. Temporary tables have the following characteristics: +- Temporary tables disappear when the session ends, including if the connection is lost. +- A temporary table is created with the Memory engine. The other table engines are not supported. +- The DB can't be specified for a temporary table. It is created outside of databases. +- If a temporary table has the same name as another one and a query specifies the table name without specifying the DB, the temporary table will be used. +- For distributed query processing, temporary tables used in a query are passed to remote servers. + +In most cases, temporary tables are not created manually, but when using external data for a query, or for distributed (GLOBAL) IN. For more information, see the appropriate sections. + +CREATE VIEW +~~~~~~~~~~~~ +``CREATE [MATERIALIZED] VIEW [IF NOT EXISTS] [db.]name [ENGINE = engine] [POPULATE] AS SELECT ...`` + +Creates a view. There are two types of views: normal and MATERIALIZED. + +Normal views don't store any data, but just perform a read from another table. 
 In other words, a normal view is nothing more than a saved query. When reading from a view, this saved query is used as a subquery in the FROM clause. + +As an example, assume you've created a view: +:: + CREATE VIEW view AS SELECT ... +and written a query: +:: + SELECT a, b, c FROM view + +This query is fully equivalent to using the subquery: +:: + SELECT a, b, c FROM (SELECT ...) + +Materialized views store data transformed by the corresponding SELECT query. + +When creating a materialized view, you can specify ENGINE - the table engine for storing data. By default, it uses the same engine as for the table that the SELECT query is made from. + +A materialized view is arranged as follows: when inserting data to the table specified in SELECT, part of the inserted data is converted by this SELECT query, and the result is inserted in the view. + +If you specify POPULATE, the existing table data is inserted in the view when creating it, as if making a CREATE TABLE ... AS SELECT ... query. Otherwise, the view contains only the data inserted in the table after creating the view. We don't recommend using POPULATE, since data inserted in the table during the view creation will not be inserted in it. + +The SELECT query can contain DISTINCT, GROUP BY, ORDER BY, LIMIT ... Note that the corresponding conversions are performed independently on each block of inserted data. For example, if GROUP BY is set, data is aggregated during insertion, but only within a single packet of inserted data. The data won't be further aggregated. The exception is when using an ENGINE that independently performs data aggregation, such as SummingMergeTree. + +The execution of ALTER queries on materialized views has not been fully developed, so they might be inconvenient. + +Views look the same as normal tables. For example, they are listed in the result of the SHOW TABLES query. + +There isn't a separate query for deleting views. To delete a view, use DROP TABLE. 
+ +ATTACH +~~~~~~ +The query is exactly the same as CREATE, except +- The word ATTACH is used instead of CREATE. +- The query doesn't create data on the disk, but assumes that data is already in the appropriate places, and just adds information about the table to the server. +After executing an ATTACH query, the server will know about the existence of the table. + +This query is used when starting the server. The server stores table metadata as files with ATTACH queries, which it simply runs at launch (with the exception of system tables, which are explicitly created on the server). + +DROP +~~~~ +This query has two types: ``DROP DATABASE`` and ``DROP TABLE``. +:: + DROP DATABASE [IF EXISTS] db + +Deletes all tables inside the 'db' database, then deletes the 'db' database itself. +If IF EXISTS is specified, it doesn't return an error if the database doesn't exist. +:: + DROP TABLE [IF EXISTS] [db.]name + +Deletes the table. +If IF EXISTS is specified, it doesn't return an error if the table doesn't exist or the database doesn't exist. + +DETACH +~~~~~~ +Deletes information about the table from the server. The server stops knowing about the table's existence. +:: + DETACH TABLE [IF EXISTS] [db.]name + +This does not delete the table's data or metadata. On the next server launch, the server will read the metadata and find out about the table again. Similarly, a "detached" table can be re-attached using the ATTACH query (with the exception of system tables, which do not have metadata stored for them). + +There is no DETACH DATABASE query. + +RENAME +~~~~~~ +Renames one or more tables. +:: + RENAME TABLE [db11.]name11 TO [db12.]name12, [db21.]name21 TO [db22.]name22, ... + + All tables are renamed under global locking. Renaming tables is a light operation. If you indicated another database after TO, the table will be moved to this database. However, the directories with databases must reside in the same file system (otherwise, an error is returned). 
+ +ALTER +~~~~~ +The ALTER query is only supported for *MergeTree type tables, as well as for Merge and Distributed types. The query has several variations. + +Column manipulations +"""""""""""""""""""""""" +Lets you change the table structure. +:: + ALTER TABLE [db].name ADD|DROP|MODIFY COLUMN ... + +In the query, specify a list of one or more comma-separated actions. Each action is an operation on a column. + +The following actions are supported: +:: + ADD COLUMN name [type] [default_expr] [AFTER name_after] + +Adds a new column to the table with the specified name, type, and default expression (see the section "Default expressions"). If you specify 'AFTER name_after' (the name of another column), the column is added after the specified one in the list of table columns. Otherwise, the column is added to the end of the table. Note that there is no way to add a column to the beginning of a table. For a chain of actions, 'name_after' can be the name of a column that is added in one of the previous actions. + +Adding a column just changes the table structure, without performing any actions with data. The data doesn't appear on the disk after ALTER. If the data is missing for a column when reading from the table, it is filled in with default values (by performing the default expression if there is one, or using zeros or empty strings). The column appears on the disk after merging data parts (see MergeTree). + +This approach allows us to complete the ALTER query instantly, without increasing the volume of old data. + +.. code-block:: sql + + DROP COLUMN name + +Deletes the column with the name 'name'. + +Deletes data from the file system. Since this deletes entire files, the query is completed almost instantly. + +.. code-block:: sql + + MODIFY COLUMN name [type] [default_expr] + +Changes the 'name' column's type to 'type' and/or the default expression to 'default_expr'. When changing the type, values are converted as if the 'toType' function were applied to them. 
+ +If only the default expression is changed, the query doesn't do anything complex, and is completed almost instantly. + +Changing the column type is the only complex action - it changes the contents of files with data. For large tables, this may take a long time. + +There are several stages of execution: +- Preparing temporary (new) files with modified data. +- Renaming old files. +- Renaming the temporary (new) files to the old names. +- Deleting the old files. + +Only the first stage takes time. If there is a failure at this stage, the data is not changed. +If there is a failure during one of the successive stages, data can be restored manually. The exception is if the old files were deleted from the file system but the data for the new files did not get written to the disk and was lost. + +There is no support for changing the column type in arrays and nested data structures. + +The ALTER query lets you create and delete separate elements (columns) in nested data structures, but not whole nested data structures. To add a nested data structure, you can add columns with a name like 'name.nested_name' and the type 'Array(T)'. A nested data structure is equivalent to multiple array columns with a name that has the same prefix before the dot. + +There is no support for deleting of columns in the primary key or the sampling key (columns that are in the ENGINE expression). Changing the type of columns in the primary key is allowed only if such change doesn't entail changing the actual data (e.g. adding the value to an Enum or changing the type from DateTime to UInt32 is allowed). + +If the ALTER query is not sufficient for making the table changes you need, you can create a new table, copy the data to it using the INSERT SELECT query, then switch the tables using the RENAME query and delete the old table. + +The ALTER query blocks all reads and writes for the table. 
In other words, if a long SELECT is running at the time of the ALTER query, the ALTER query will wait for the SELECT to complete. At the same time, all new queries to the same table will wait while this ALTER is running. + +For tables that don't store data themselves (Merge and Distributed), ALTER just changes the table structure, and does not change the structure of subordinate tables. For example, when running ALTER for a Distributed table, you will also need to run ALTER for the tables on all remote servers. + +The ALTER query for changing columns is replicated. The instructions are saved in ZooKeeper, then each replica applies them. All ALTER queries are run in the same order. The query waits for the appropriate actions to be completed on the other replicas. However, a query to change columns in a replicated table can be interrupted, and all actions will be performed asynchronously. + +Manipulations with partitions and parts +"""""""""""""""""""""""""""""""""" +Only works for tables in the MergeTree family. The following operations are available: + +* ``DETACH PARTITION`` - Move a partition to the 'detached' directory and forget it. +* ``DROP PARTITION`` - Delete a partition. +* ``ATTACH PART|PARTITION`` - Add a new part or partition from the 'detached' directory to the table. +* ``FREEZE PARTITION`` - Create a backup of a partition. +* ``FETCH PARTITION`` - Download a partition from another server. + +Each type of query is covered separately below. + +A partition in a table is data for a single calendar month. This is determined by the values of the date key specified in the table engine parameters. Each month's data is stored separately in order to simplify manipulations with this data. + +A "part" in the table is part of the data from a single partition, sorted by the primary key. + +You can use the ``system.parts`` table to view the set of table parts and partitions: +:: + SELECT * FROM system.parts WHERE active + +``active`` - Only count active parts. 
 Inactive parts are, for example, source parts remaining after merging to a larger part - these parts are deleted approximately 10 minutes after merging. + +Another way to view a set of parts and partitions is to go into the directory with table data. +The directory with data is +/var/lib/clickhouse/data/database/table/, +where /var/lib/clickhouse/ is the path to ClickHouse data, 'database' is the database name, and 'table' is the table name. Example: +:: + $ ls -l /var/lib/clickhouse/data/test/visits/ + total 48 + drwxrwxrwx 2 clickhouse clickhouse 20480 мая 13 02:58 20140317_20140323_2_2_0 + drwxrwxrwx 2 clickhouse clickhouse 20480 мая 13 02:58 20140317_20140323_4_4_0 + drwxrwxrwx 2 clickhouse clickhouse 4096 мая 13 02:55 detached + -rw-rw-rw- 1 clickhouse clickhouse 2 мая 13 02:58 increment.txt + +Here ``20140317_20140323_2_2_0`` and ``20140317_20140323_4_4_0`` are the directories of the parts. + +Let's look at the name of the first part: ``20140317_20140323_2_2_0``. + * ``20140317`` - minimum date of part data + * ``20140323`` - maximum date of part data + * ``2`` - minimum number of the data block + * ``2`` - maximum number of the data block + * ``0`` - part level - depth of the merge tree that formed it + +Each part corresponds to a single partition and contains data for a single month. +201403 - The partition name. A partition is a set of parts for a single month. + +On an operating server, you can't manually change the set of parts or their data on the file system, since the server won't know about it. For non-replicated tables, you can do this when the server is stopped, but we don't recommend it. For replicated tables, the set of parts can't be changed in any case. + +The 'detached' directory contains parts that are not used by the server - detached from the table using the ALTER ... DETACH query. Parts that are damaged are also moved to this directory, instead of deleting them. 
 You can add, delete, or modify the data in the 'detached' directory at any time - the server won't know about this until you make the ALTER TABLE ... ATTACH query. +:: + ALTER TABLE [db.]table DETACH PARTITION 'name' + +Move all data for partitions named 'name' to the 'detached' directory and forget about them. +The partition name is specified in YYYYMM format. It can be indicated in single quotes or without them. + +After the query is executed, you can do whatever you want with the data in the 'detached' directory — delete it from the file system, or just leave it. + +The query is replicated - data will be moved to the 'detached' directory and forgotten on all replicas. The query can only be sent to a leader replica. To find out if a replica is a leader, run a SELECT query against the 'system.replicas' system table. Alternatively, it is easier to make a query on all replicas, and all except one will throw an exception. +:: + ALTER TABLE [db.]table DROP PARTITION 'name' + +Similar to the DETACH operation. Deletes data from the table. Data parts will be tagged as inactive and will be completely deleted in approximately 10 minutes. The query is replicated - data will be deleted on all replicas. +:: + ALTER TABLE [db.]table ATTACH PARTITION|PART 'name' + +Adds data to the table from the 'detached' directory. + +It is possible to add data for an entire partition or a separate part. For a part, specify the full name of the part in single quotes. + +The query is replicated. Each replica checks whether there is data in the 'detached' directory. If there is data, it checks the integrity, verifies that it matches the data on the server that initiated the query, and then adds it if everything is correct. If not, it downloads data from the query requestor replica, or from another replica where the data has already been added. + +So you can put data in the 'detached' directory on one replica, and use the ALTER ... ATTACH query to add it to the table on all replicas. 
+:: + ALTER TABLE [db.]table FREEZE PARTITION 'name' + +Creates a local backup of one or multiple partitions. The name can be the full name of the partition (for example, 201403), or its prefix (for example, 2014) - then the backup will be created for all the corresponding partitions. + +The query does the following: for a data snapshot at the time of execution, it creates hardlinks to table data in the directory /var/lib/clickhouse/shadow/N/... +/var/lib/clickhouse/ is the working ClickHouse directory from the config. +N is the incremental number of the backup. + +``/var/lib/clickhouse/`` - working directory of ClickHouse from config file. +``N`` - incremental number of backup. + +The same structure of directories is created inside the backup as inside ``/var/lib/clickhouse/``. +It also performs 'chmod' for all files, forbidding writes to them. + +The backup is created almost instantly (but first it waits for current queries to the corresponding table to finish running). At first, the backup doesn't take any space on the disk. As the system works, the backup can take disk space, as data is modified. If the backup is made for old enough data, it won't take space on the disk. + +After creating the backup, data from ``/var/lib/clickhouse/shadow/`` can be copied to the remote server and then deleted on the local server. The entire backup process is performed without stopping the server. + +The ``ALTER ... FREEZE PARTITION`` query is not replicated. A local backup is only created on the local server. + +As an alternative, you can manually copy data from the ``/var/lib/clickhouse/data/database/table`` directory. But if you do this while the server is running, race conditions are possible when copying directories with files being added or changed, and the backup may be inconsistent. You can do this if the server isn't running - then the resulting data will be the same as after the ALTER TABLE t FREEZE PARTITION query. + +``ALTER TABLE ...
FREEZE PARTITION`` only copies data, not table metadata. To make a backup of table metadata, copy the file ``/var/lib/clickhouse/metadata/database/table.sql``. + +To restore from a backup: + +* Use the CREATE query to create the table if it doesn't exist. The query can be taken from an .sql file (replace ATTACH in it with CREATE). +* Copy data from the ``data/database/table/`` directory inside the backup to the ``/var/lib/clickhouse/data/database/table/detached/`` directory. +* Run ``ALTER TABLE ... ATTACH PARTITION YYYYMM`` queries where ``YYYYMM`` is the month, for every month. + +In this way, data from the backup will be added to the table. +Restoring from a backup doesn't require stopping the server. + +Backups and replication +""""""""""""""""""""""" +Replication provides protection from device failures. If all data disappeared on one of your replicas, follow the instructions in the "Restoration after failure" section to restore it. + +For protection from device failures, you must use replication. For more information about replication, see the section "Data replication". + +Backups protect against human error (accidentally deleting data, deleting the wrong data or in the wrong cluster, or corrupting data). For high-volume databases, it can be difficult to copy backups to remote servers. In such cases, to protect from human error, you can keep a backup on the same server (it will reside in /var/lib/clickhouse/shadow/). +:: + ALTER TABLE [db.]table FETCH PARTITION 'name' FROM 'path-in-zookeeper' + +This query only works for replicated tables. + +It downloads the specified partition from the shard that has its ZooKeeper path specified in the FROM clause, then puts it in the 'detached' directory for the specified table. + +Although the query is called ALTER TABLE, it does not change the table structure, and does not immediately change the data available in the table. + +Data is placed in the 'detached' directory. You can use the ALTER TABLE ...
ATTACH query to attach the data. + +The path to ZooKeeper is specified in the FROM clause. For example, ``/clickhouse/tables/01-01/visits``. +Before downloading, the system checks that the partition exists and the table structure matches. The most appropriate replica is selected automatically from the healthy replicas. + +The ALTER ... FETCH PARTITION query is not replicated. The partition will be downloaded to the 'detached' directory only on the local server. Note that if after this you use the ALTER TABLE ... ATTACH query to add data to the table, the data will be added on all replicas (on one of the replicas it will be added from the 'detached' directory, and on the rest it will be loaded from neighboring replicas). + +Synchronicity of ALTER queries +"""""""""""""""""""""""""""""" +For non-replicated tables, all ALTER queries are performed synchronously. For replicated tables, the query just adds instructions for the appropriate actions to ZooKeeper, and the actions themselves are performed as soon as possible. However, the query can wait for these actions to be completed on all the replicas. + +For ``ALTER ... ATTACH|DETACH|DROP`` queries, you can use the ``'replication_alter_partitions_sync'`` setting to set up waiting. +Possible values: 0 - do not wait, 1 - wait for own completion (default), 2 - wait for all. + +SHOW DATABASES +~~~~~~~~~~~~~~ + +.. code-block:: sql + + SHOW DATABASES [INTO OUTFILE filename] [FORMAT format] + +Prints a list of all databases. +This query is identical to the query ``SELECT name FROM system.databases [INTO OUTFILE filename] [FORMAT format]``. +See the section "Formats". + +SHOW TABLES +~~~~~~~~~~~ + +.. code-block:: sql + + SHOW TABLES [FROM db] [LIKE 'pattern'] [INTO OUTFILE filename] [FORMAT format] + +Outputs a list of + +* tables from the current database, or from the 'db' database if "FROM db" is specified. +* all tables, or tables whose name matches the pattern, if "LIKE 'pattern'" is specified.
+ +The query is identical to the query SELECT name FROM system.tables +WHERE database = 'db' [AND name LIKE 'pattern'] [INTO OUTFILE filename] [FORMAT format] +See the section "LIKE operator". + +SHOW PROCESSLIST +~~~~~~~~~~~~~~~~ + +.. code-block:: sql + + SHOW PROCESSLIST [INTO OUTFILE filename] [FORMAT format] + +Outputs a list of queries currently being processed, other than SHOW PROCESSLIST queries. + +Prints a table containing the columns: + +**user** is the user who made the query. Keep in mind that for distributed processing, queries are sent to remote servers under the 'default' user. SHOW PROCESSLIST shows the username for a specific query, not for a query that this query initiated. + +**address** is the name of the host that the query was sent from. For distributed processing, on remote servers, this is the name of the query requestor host. To track where a distributed query was originally made from, look at SHOW PROCESSLIST on the query requestor server. + +**elapsed** - The execution time, in seconds. Queries are output in order of decreasing execution time. + +**rows_read**, **bytes_read** - How many rows and bytes of uncompressed data were read when processing the query. For distributed processing, data is totaled from all the remote servers. This is the data used for restrictions and quotas. + +**memory_usage** - Current RAM usage in bytes. See the setting 'max_memory_usage'. + +**query** - The query itself. In INSERT queries, the data for insertion is not output. + +**query_id** - The query identifier. Non-empty only if it was explicitly defined by the user. For distributed processing, the query ID is not passed to remote servers. + +This query is exactly the same as: SELECT * FROM system.processes [INTO OUTFILE filename] [FORMAT format]. + +Tip (execute in the console): +``watch -n1 "clickhouse-client --query='SHOW PROCESSLIST'"`` + +SHOW CREATE TABLE +~~~~~~~~~~~~~~~~~ + +.. 
code-block:: sql + + SHOW CREATE TABLE [db.]table [INTO OUTFILE filename] [FORMAT format] + +Returns a single String-type 'statement' column, which contains a single value - the CREATE query used for creating the specified table. + +DESCRIBE TABLE +~~~~~~~~~~~~~~ + +.. code-block:: sql + + DESC|DESCRIBE TABLE [db.]table [INTO OUTFILE filename] [FORMAT format] + +Returns two String-type columns: 'name' and 'type', which indicate the names and types of columns in the specified table. + +Nested data structures are output in "expanded" format. Each column is shown separately, with the name after a dot. + +EXISTS +~~~~~~ + +.. code-block:: sql + + EXISTS TABLE [db.]name [INTO OUTFILE filename] [FORMAT format] + +Returns a single UInt8-type column, which contains the single value 0 if the table or database doesn't exist, or 1 if the table exists in the specified database. + +USE +~~~ + +.. code-block:: sql + + USE db + +Lets you set the current database for the session. +The current database is used for searching for tables if the database is not explicitly defined in the query with a dot before the table name. +This query can't be made when using the HTTP protocol, since there is no concept of a session. + +SET +~~~ + +.. code-block:: sql + + SET [GLOBAL] param = value + +Lets you set the 'param' setting to 'value'. You can also make all the settings from the specified settings profile in a single query. To do this, specify 'profile' as the setting name. For more information, see the section "Settings". The setting is made for the session, or for the server (globally) if GLOBAL is specified. +When making a global setting, the setting is not applied to sessions already running, including the current session. It will only be used for new sessions. + +Settings made using SET GLOBAL have a lower priority compared with settings made in the config file in the user profile. In other words, user settings can't be overridden by SET GLOBAL. 
+ +When the server is restarted, global settings made using SET GLOBAL are lost. +To make settings that persist after a server restart, you can only use the server's config file. (This can't be done using a SET query.) + +OPTIMIZE +~~~~~~~~ + +.. code-block:: sql + + OPTIMIZE TABLE [db.]name [PARTITION partition] [FINAL] + +Asks the table engine to do something for optimization. +Supported only by \*MergeTree engines, in which this query initiates an unscheduled merge of data parts. +If ``PARTITION`` is specified, then only the specified partition will be optimized. +If ``FINAL`` is specified, then optimization will be performed even if the data inside the partition is already optimized (i.e. all data is in a single part). + +INSERT +~~~~~~ +This query has several variations. + +.. code-block:: sql + + INSERT INTO [db.]table [(c1, c2, c3)] VALUES (v11, v12, v13), (v21, v22, v23), ... + +Inserts rows with the listed values in the 'table' table. +This query is exactly the same as: + +.. code-block:: sql + + INSERT INTO [db.]table [(c1, c2, c3)] FORMAT Values (v11, v12, v13), (v21, v22, v23), ... + +.. code-block:: sql + + INSERT INTO [db.]table [(c1, c2, c3)] FORMAT format ... + +Inserts data in any specified format. +The data itself comes after 'format', after all whitespace characters up to the first line break if there is one and including it, or after all whitespace characters if there isn't a line break. We recommend writing data starting from the next line (this is important if the data starts with whitespace characters). + +Example: + +.. code-block:: sql + + INSERT INTO t FORMAT TabSeparated + 11 Hello, world! + 22 Qwerty + +For more information about data formats, see the section "Formats". The "Interfaces" section describes how to insert data separately from the query when using the command-line client or the HTTP interface. + +The query may optionally specify a list of columns for insertion. In this case, the default values are written to the other columns.
+Default values are calculated from DEFAULT expressions specified in table definitions, or, if the DEFAULT is not explicitly defined, zeros and empty strings are used. If the 'strict_insert_default' setting is set to 1, all the columns that do not have explicit DEFAULTS must be specified in the query. + +.. code-block:: sql + + INSERT INTO [db.]table [(c1, c2, c3)] SELECT ... + +Inserts the result of the SELECT query into a table. +The names and data types of the SELECT result must exactly match the table structure that data is inserted into, or the specified list of columns. +To change column names, use synonyms (AS) in the SELECT query. +To change data types, use type conversion functions (see the section "Functions"). + +None of the data formats allows using expressions as values. +In other words, you can't write INSERT INTO t VALUES (now(), 1 + 1, DEFAULT). + +There is no support for other data part modification queries: +UPDATE, DELETE, REPLACE, MERGE, UPSERT, INSERT UPDATE. +However, you can delete old data using ALTER TABLE ... DROP PARTITION. + + +SELECT +~~~~~~ + +His Highness, the SELECT query. + +.. code-block:: sql + + SELECT [DISTINCT] expr_list + [FROM [db.]table | (subquery) | table_function] [FINAL] + [SAMPLE sample_coeff] + [ARRAY JOIN ...] + [GLOBAL] ANY|ALL INNER|LEFT JOIN (subquery)|table USING columns_list + [PREWHERE expr] + [WHERE expr] + [GROUP BY expr_list] [WITH TOTALS] + [HAVING expr] + [ORDER BY expr_list] + [LIMIT [n, ]m] + [UNION ALL ...] + [INTO OUTFILE filename] + [FORMAT format] + +All the clauses are optional, except for the required list of expressions immediately after SELECT. +The clauses below are described in almost the same order as in the query execution conveyor. + +If the query omits the DISTINCT, GROUP BY, and ORDER BY clauses and the IN and JOIN subqueries, the query will be completely stream processed, using O(1) amount of RAM. 
+Otherwise, the query may consume too much RAM, if appropriate restrictions are not defined (max_memory_usage, max_rows_to_group_by, max_rows_to_sort, max_rows_in_distinct, max_bytes_in_distinct, max_rows_in_set, max_bytes_in_set, max_rows_in_join, max_bytes_in_join, max_bytes_before_external_sort, max_bytes_before_external_group_by). For more information, see the section "Settings". It is possible to use external sorting (saving temporary tables to a disk) and external aggregation. Merge join is not implemented. + +FROM clause +""""""""""" + +If the FROM clause is omitted, data will be read from the 'system.one' table. +The 'system.one' table contains exactly one row (this table fulfills the same purpose as the DUAL table found in other DBMSs). + +The FROM clause specifies the table to read data from, or a subquery, or a table function; ARRAY JOIN and the regular JOIN may also be included (see below). + +Instead of a table, the SELECT subquery may be specified in brackets. In this case, the subquery processing pipeline will be built into the processing pipeline of an external query. +In contrast to standard SQL, a synonym does not need to be specified after a subquery. For compatibility, it is possible to write 'AS name' after a subquery, but the specified name isn't used anywhere. + +A table function may be specified instead of a table. For more information, see the section "Table functions". + +To execute a query, all the columns listed in the query are extracted from the appropriate table. Any columns not needed for the external query are thrown out of the subqueries. +If a query does not list any columns (for example, SELECT count() FROM t), some column is extracted from the table anyway (the smallest one is preferred), in order to calculate the number of rows. + +The FINAL modifier can be used only for a SELECT from a CollapsingMergeTree table. When you specify FINAL, data is selected fully "collapsed". 
Keep in mind that using FINAL leads to a selection that includes columns related to the primary key, in addition to the columns specified in the SELECT. Additionally, the query will be executed in a single stream, and data will be merged during query execution. This means that when using FINAL, the query is processed more slowly. In most cases, you should avoid using FINAL. For more information, see the section "CollapsingMergeTree engine". + +SAMPLE clause +""""""""""""" + +The SAMPLE clause allows for approximate query processing. +Approximate query processing is only supported by MergeTree* type tables, and only if the sampling expression was specified during table creation (see the section "MergeTree engine"). + +SAMPLE has the format ``SAMPLE k``, where 'k' is a decimal number from 0 to 1, or ``SAMPLE n``, where 'n' is a sufficiently large integer. + +In the first case, the query will be executed on a fraction 'k' of the data. For example, ``SAMPLE 0.1`` runs the query on 10% of the data. +In the second case, the query will be executed on a sample of no more than 'n' rows. For example, ``SAMPLE 10000000`` runs the query on a maximum of 10,000,000 rows. + +Example: + +.. code-block:: sql + + SELECT + Title, + count() * 10 AS PageViews + FROM hits_distributed + SAMPLE 0.1 + WHERE + CounterID = 34 + AND toDate(EventDate) >= toDate('2013-01-29') + AND toDate(EventDate) <= toDate('2013-02-04') + AND NOT DontCountHits + AND NOT Refresh + AND Title != '' + GROUP BY Title + ORDER BY PageViews DESC LIMIT 1000 + +In this example, the query is executed on a sample of 0.1 (10%) of the data. Values of aggregate functions are not corrected automatically, so to get an approximate result, the value 'count()' is manually multiplied by 10.
+ +When using something like ``SAMPLE 10000000``, there isn't any information about which relative percent of data was processed or what the aggregate functions should be multiplied by, so this method of writing is not always appropriate to the situation. + +A sample with a relative coefficient is "consistent": if we look at all possible data that could be in the table, a sample (when using a single sampling expression specified during table creation) with the same coefficient always selects the same subset of possible data. In other words, a sample from different tables on different servers at different times is made the same way. + +For example, a sample of user IDs takes rows with the same subset of all the possible user IDs from different tables. This allows using the sample in subqueries in the IN clause, as well as for manually correlating results of different queries with samples. + +ARRAY JOIN clause +""""""""""""""""" + +Allows executing JOIN with an array or nested data structure. The intent is similar to the 'arrayJoin' function, but its functionality is broader. + +ARRAY JOIN is essentially INNER JOIN with an array. Example: + +.. code-block:: sql + + :) CREATE TABLE arrays_test (s String, arr Array(UInt8)) ENGINE = Memory + + CREATE TABLE arrays_test + ( + s String, + arr Array(UInt8) + ) ENGINE = Memory + + Ok. + + 0 rows in set. Elapsed: 0.001 sec. + + :) INSERT INTO arrays_test VALUES ('Hello', [1,2]), ('World', [3,4,5]), ('Goodbye', []) + + INSERT INTO arrays_test VALUES + + Ok. + + 3 rows in set. Elapsed: 0.001 sec. + + :) SELECT * FROM arrays_test + + SELECT * + FROM arrays_test + + ┌─s───────┬─arr─────┐ + │ Hello │ [1,2] │ + │ World │ [3,4,5] │ + │ Goodbye │ [] │ + └─────────┴─────────┘ + + 3 rows in set. Elapsed: 0.001 sec. 
+ + :) SELECT s, arr FROM arrays_test ARRAY JOIN arr + + SELECT s, arr + FROM arrays_test + ARRAY JOIN arr + + ┌─s─────┬─arr─┐ + │ Hello │ 1 │ + │ Hello │ 2 │ + │ World │ 3 │ + │ World │ 4 │ + │ World │ 5 │ + └───────┴─────┘ + + 5 rows in set. Elapsed: 0.001 sec. + +An alias can be specified for an array in the ARRAY JOIN clause. In this case, an array item can be accessed by this alias, but the array itself by the original name. Example: + +.. code-block:: sql + + :) SELECT s, arr, a FROM arrays_test ARRAY JOIN arr AS a + + SELECT s, arr, a + FROM arrays_test + ARRAY JOIN arr AS a + + ┌─s─────┬─arr─────┬─a─┐ + │ Hello │ [1,2] │ 1 │ + │ Hello │ [1,2] │ 2 │ + │ World │ [3,4,5] │ 3 │ + │ World │ [3,4,5] │ 4 │ + │ World │ [3,4,5] │ 5 │ + └───────┴─────────┴───┘ + + 5 rows in set. Elapsed: 0.001 sec. + +Multiple arrays of the same size can be comma-separated in the ARRAY JOIN clause. In this case, JOIN is performed with them simultaneously (the direct sum, not the direct product). +Example: + +.. code-block:: sql + + :) SELECT s, arr, a, num, mapped FROM arrays_test ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num, arrayMap(x -> x + 1, arr) AS mapped + + SELECT s, arr, a, num, mapped + FROM arrays_test + ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num, arrayMap(lambda(tuple(x), plus(x, 1)), arr) AS mapped + + ┌─s─────┬─arr─────┬─a─┬─num─┬─mapped─┐ + │ Hello │ [1,2] │ 1 │ 1 │ 2 │ + │ Hello │ [1,2] │ 2 │ 2 │ 3 │ + │ World │ [3,4,5] │ 3 │ 1 │ 4 │ + │ World │ [3,4,5] │ 4 │ 2 │ 5 │ + │ World │ [3,4,5] │ 5 │ 3 │ 6 │ + └───────┴─────────┴───┴─────┴────────┘ + + 5 rows in set. Elapsed: 0.002 sec. 
+ + :) SELECT s, arr, a, num, arrayEnumerate(arr) FROM arrays_test ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num + + SELECT s, arr, a, num, arrayEnumerate(arr) + FROM arrays_test + ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num + + ┌─s─────┬─arr─────┬─a─┬─num─┬─arrayEnumerate(arr)─┐ + │ Hello │ [1,2] │ 1 │ 1 │ [1,2] │ + │ Hello │ [1,2] │ 2 │ 2 │ [1,2] │ + │ World │ [3,4,5] │ 3 │ 1 │ [1,2,3] │ + │ World │ [3,4,5] │ 4 │ 2 │ [1,2,3] │ + │ World │ [3,4,5] │ 5 │ 3 │ [1,2,3] │ + └───────┴─────────┴───┴─────┴─────────────────────┘ + + 5 rows in set. Elapsed: 0.002 sec. + +ARRAY JOIN also works with nested data structures. Example: + +.. code-block:: sql + + :) CREATE TABLE nested_test (s String, nest Nested(x UInt8, y UInt32)) ENGINE = Memory + + CREATE TABLE nested_test + ( + s String, + nest Nested( + x UInt8, + y UInt32) + ) ENGINE = Memory + + Ok. + + 0 rows in set. Elapsed: 0.006 sec. + + :) INSERT INTO nested_test VALUES ('Hello', [1,2], [10,20]), ('World', [3,4,5], [30,40,50]), ('Goodbye', [], []) + + INSERT INTO nested_test VALUES + + Ok. + + 3 rows in set. Elapsed: 0.001 sec. + + :) SELECT * FROM nested_test + + SELECT * + FROM nested_test + + ┌─s───────┬─nest.x──┬─nest.y─────┐ + │ Hello │ [1,2] │ [10,20] │ + │ World │ [3,4,5] │ [30,40,50] │ + │ Goodbye │ [] │ [] │ + └─────────┴─────────┴────────────┘ + + 3 rows in set. Elapsed: 0.001 sec. + + :) SELECT s, nest.x, nest.y FROM nested_test ARRAY JOIN nest + + SELECT s, `nest.x`, `nest.y` + FROM nested_test + ARRAY JOIN nest + + ┌─s─────┬─nest.x─┬─nest.y─┐ + │ Hello │ 1 │ 10 │ + │ Hello │ 2 │ 20 │ + │ World │ 3 │ 30 │ + │ World │ 4 │ 40 │ + │ World │ 5 │ 50 │ + └───────┴────────┴────────┘ + + 5 rows in set. Elapsed: 0.001 sec. + +When specifying names of nested data structures in ARRAY JOIN, the meaning is the same as ARRAY JOIN with all the array elements that it consists of. Example: + +.. 
code-block:: sql + + :) SELECT s, nest.x, nest.y FROM nested_test ARRAY JOIN nest.x, nest.y + + SELECT s, `nest.x`, `nest.y` + FROM nested_test + ARRAY JOIN `nest.x`, `nest.y` + + ┌─s─────┬─nest.x─┬─nest.y─┐ + │ Hello │ 1 │ 10 │ + │ Hello │ 2 │ 20 │ + │ World │ 3 │ 30 │ + │ World │ 4 │ 40 │ + │ World │ 5 │ 50 │ + └───────┴────────┴────────┘ + + 5 rows in set. Elapsed: 0.001 sec. + +This variation also makes sense: + +.. code-block:: sql + + :) SELECT s, nest.x, nest.y FROM nested_test ARRAY JOIN nest.x + + SELECT s, `nest.x`, `nest.y` + FROM nested_test + ARRAY JOIN `nest.x` + + ┌─s─────┬─nest.x─┬─nest.y─────┐ + │ Hello │ 1 │ [10,20] │ + │ Hello │ 2 │ [10,20] │ + │ World │ 3 │ [30,40,50] │ + │ World │ 4 │ [30,40,50] │ + │ World │ 5 │ [30,40,50] │ + └───────┴────────┴────────────┘ + + 5 rows in set. Elapsed: 0.001 sec. + +An alias may be used for a nested data structure, in order to select either the JOIN result or the source array. Example: + +.. code-block:: sql + + :) SELECT s, n.x, n.y, nest.x, nest.y FROM nested_test ARRAY JOIN nest AS n + + SELECT s, `n.x`, `n.y`, `nest.x`, `nest.y` + FROM nested_test + ARRAY JOIN nest AS n + + ┌─s─────┬─n.x─┬─n.y─┬─nest.x──┬─nest.y─────┐ + │ Hello │ 1 │ 10 │ [1,2] │ [10,20] │ + │ Hello │ 2 │ 20 │ [1,2] │ [10,20] │ + │ World │ 3 │ 30 │ [3,4,5] │ [30,40,50] │ + │ World │ 4 │ 40 │ [3,4,5] │ [30,40,50] │ + │ World │ 5 │ 50 │ [3,4,5] │ [30,40,50] │ + └───────┴─────┴─────┴─────────┴────────────┘ + + 5 rows in set. Elapsed: 0.001 sec. + +Example of using the arrayEnumerate function: + +.. 
code-block:: sql + + :) SELECT s, n.x, n.y, nest.x, nest.y, num FROM nested_test ARRAY JOIN nest AS n, arrayEnumerate(nest.x) AS num + + SELECT s, `n.x`, `n.y`, `nest.x`, `nest.y`, num + FROM nested_test + ARRAY JOIN nest AS n, arrayEnumerate(`nest.x`) AS num + + ┌─s─────┬─n.x─┬─n.y─┬─nest.x──┬─nest.y─────┬─num─┐ + │ Hello │ 1 │ 10 │ [1,2] │ [10,20] │ 1 │ + │ Hello │ 2 │ 20 │ [1,2] │ [10,20] │ 2 │ + │ World │ 3 │ 30 │ [3,4,5] │ [30,40,50] │ 1 │ + │ World │ 4 │ 40 │ [3,4,5] │ [30,40,50] │ 2 │ + │ World │ 5 │ 50 │ [3,4,5] │ [30,40,50] │ 3 │ + └───────┴─────┴─────┴─────────┴────────────┴─────┘ + + 5 rows in set. Elapsed: 0.002 sec. + +The query can only specify a single ARRAY JOIN clause. + +The corresponding conversion can be performed before the WHERE/PREWHERE clause (if its result is needed in this clause), or after completing WHERE/PREWHERE (to reduce the volume of calculations). + +JOIN clause +""""""""""" +The normal JOIN, which is not related to ARRAY JOIN described above. + +.. code-block:: sql + + [GLOBAL] ANY|ALL INNER|LEFT [OUTER] JOIN (subquery)|table USING columns_list + +Performs joins with data from the subquery. At the beginning of query execution, the subquery specified after JOIN is run, and its result is saved in memory. Then it is read from the "left" table specified in the FROM clause, and while it is being read, for each of the read rows from the "left" table, rows are selected from the subquery results table (the "right" table) that meet the condition for matching the values of the columns specified in USING. + +The table name can be specified instead of a subquery. This is equivalent to the 'SELECT * FROM table' subquery, except in a special case when the table has the Join engine - an array prepared for joining. + +All columns that are not needed for the JOIN are deleted from the subquery. 
+ +There are several types of JOINs: + +INNER or LEFT - the type: +If INNER is specified, the result will contain only those rows that have a matching row in the right table. +If LEFT is specified, any rows in the left table that don't have matching rows in the right table will have the right-table columns filled with default values - zeros or empty strings. LEFT OUTER may be written instead of LEFT; the word OUTER does not affect anything. + +ANY or ALL - strictness: +If ANY is specified and there are multiple matching rows in the right table, only the first one will be joined. +If ALL is specified and there are multiple matching rows in the right table, the data will be multiplied by the number of these rows. + +Using ALL corresponds to the normal JOIN semantics from standard SQL. +Using ANY is more efficient. If the right table has only one matching row, the results of ANY and ALL are the same. You must specify either ANY or ALL (neither of them is selected by default). + +GLOBAL - distribution: + +When using a normal ``JOIN``, the query is sent to remote servers. Subqueries are run on each of them in order to make the right table, and the join is performed with this table. In other words, the right table is formed on each server separately. + +When using ``GLOBAL ... JOIN``, first the requestor server runs a subquery to calculate the right table. This temporary table is passed to each remote server, and queries are run on them using the temporary data that was transmitted. + +Be careful when using GLOBAL JOINs. For more information, see the section "Distributed subqueries" below. + +Any combination of JOINs is possible. For example, ``GLOBAL ANY LEFT OUTER JOIN``. + +When running JOINs, there is no optimization of the order of execution in relation to other stages of the query. The join (a search in the right table) is run before filtering in WHERE and before aggregation. In order to explicitly set the order of execution, we recommend joining a subquery with a subquery. + +Example: + +..
code-block:: sql + + SELECT + CounterID, + hits, + visits + FROM + ( + SELECT + CounterID, + count() AS hits + FROM test.hits + GROUP BY CounterID + ) ANY LEFT JOIN + ( + SELECT + CounterID, + sum(Sign) AS visits + FROM test.visits + GROUP BY CounterID + ) USING CounterID + ORDER BY hits DESC + LIMIT 10 + + ┌─CounterID─┬───hits─┬─visits─┐ + │ 1143050 │ 523264 │ 13665 │ + │ 731962 │ 475698 │ 102716 │ + │ 722545 │ 337212 │ 108187 │ + │ 722889 │ 252197 │ 10547 │ + │ 2237260 │ 196036 │ 9522 │ + │ 23057320 │ 147211 │ 7689 │ + │ 722818 │ 90109 │ 17847 │ + │ 48221 │ 85379 │ 4652 │ + │ 19762435 │ 77807 │ 7026 │ + │ 722884 │ 77492 │ 11056 │ + └───────────┴────────┴────────┘ + +Subqueries don't allow you to set names or use them for referencing a column from a specific subquery. +The columns specified in USING must have the same names in both subqueries, and the other columns must be named differently. You can use aliases to change the names of columns in subqueries (the example uses the aliases 'hits' and 'visits'). + +The USING clause specifies one or more columns to join, which establishes the equality of these columns. The list of columns is set without brackets. More complex join conditions are not supported. + +The right table (the subquery result) resides in RAM. If there isn't enough memory, you can't run a JOIN. + +Only one JOIN can be specified in a query (on a single level). To run multiple JOINs, you can put them in subqueries. + +Each time a query is run with the same JOIN, the subquery is run again - the result is not cached. To avoid this, use the special 'Join' table engine, which is a prepared array for joining that is always in RAM. For more information, see the section "Table engines, Join". + +In some cases, it is more efficient to use IN instead of JOIN. Among the various types of JOINs, the most efficient is ANY LEFT JOIN, then ANY INNER JOIN. The least efficient are ALL LEFT JOIN and ALL INNER JOIN. 
+ +If you need a JOIN for joining with dimension tables (these are relatively small tables that contain dimension properties, such as names for advertising campaigns), a JOIN might not be very convenient due to the bulky syntax and the fact that the right table is re-accessed for every query. For such cases, there is an "external dictionaries" feature that you should use instead of JOIN. For more information, see the section "External dictionaries". + +WHERE clause +"""""""""""" + +If there is a WHERE clause, it must contain an expression with the UInt8 type. This is usually an expression with comparison and logical operators. +This expression will be used for filtering data before all other transformations. + +If indexes are supported by the database table engine, the expression is evaluated on the ability to use indexes. + +PREWHERE clause +""""""""""""""" + +This clause has the same meaning as the WHERE clause. The difference is in which data is read from the table. When using PREWHERE, first only the columns necessary for executing PREWHERE are read. Then the other columns are read that are needed for running the query, but only those blocks where the PREWHERE expression is true. + +It makes sense to use PREWHERE if there are filtration conditions that are not suitable for indexes that are used by a minority of the columns in the query, but that provide strong data filtration. This reduces the volume of data to read. + +For example, it is useful to write PREWHERE for queries that extract a large number of columns, but that only have filtration for a few columns. + +PREWHERE is only supported by *MergeTree tables. + +A query may simultaneously specify PREWHERE and WHERE. In this case, PREWHERE precedes WHERE. + +Keep in mind that it does not make much sense for PREWHERE to only specify those columns that have an index, because when using an index, only the data blocks that match the index are read. 
+ +If the 'optimize_move_to_prewhere' setting is set to 1 and PREWHERE is omitted, the system uses heuristics to automatically move parts of expressions from WHERE to PREWHERE. + +GROUP BY clause +""""""""""""""" + +This is one of the most important parts of a column-oriented DBMS. + +If there is a GROUP BY clause, it must contain a list of expressions. Each expression will be referred to here as a "key". +All the expressions in the SELECT, HAVING, and ORDER BY clauses must be calculated from keys or from aggregate functions. In other words, each column selected from the table must be used either in keys or inside aggregate functions. + +If a query contains only table columns inside aggregate functions, the GROUP BY clause can be omitted, and aggregation by an empty set of keys is assumed. + +Example: + +.. code-block:: sql + + SELECT + count(), + median(FetchTiming > 60 ? 60 : FetchTiming), + count() - sum(Refresh) + FROM hits + +However, in contrast to standard SQL, if the table doesn't have any rows (either there aren't any at all, or there aren't any after using WHERE to filter), an empty result is returned, rather than a single row containing the initial values of aggregate functions. + +As opposed to MySQL (and conforming to standard SQL), you can't get some value of some column that is not in a key or aggregate function (except constant expressions). To work around this, you can use the 'any' aggregate function (get the first encountered value) or 'min/max'. + +Example: + +.. code-block:: sql + + SELECT + domainWithoutWWW(URL) AS domain, + count(), + any(Title) AS title -- for each domain, get the first page title encountered + FROM hits + GROUP BY domain + +For every different key value encountered, GROUP BY calculates a set of aggregate function values. + +GROUP BY is not supported for array columns. + +A constant can't be specified as an argument for aggregate functions. Example: sum(1).
Instead of this, you can get rid of the constant. Example: ``count()``. + +WITH TOTALS modifier +^^^^^^^^^^^^^^^^^^^^^^^ + +If the WITH TOTALS modifier is specified, another row will be calculated. This row will have key columns containing default values (zeros or empty lines), and columns of aggregate functions with the values calculated across all the rows (the "total" values). + +This extra row is output in JSON*, TabSeparated*, and Pretty* formats, separately from the other rows. In the other formats, this row is not output. + +In JSON* formats, this row is output as a separate 'totals' field. In TabSeparated formats, the row comes after the main result, preceded by an empty row (after the other data). In Pretty formats, the row is output as a separate table after the main result. + +``WITH TOTALS`` can be run in different ways when HAVING is present. The behavior depends on the 'totals_mode' setting. +By default, totals_mode = 'before_having'. In this case, 'totals' is calculated across all rows, including the ones that don't pass through HAVING and 'max_rows_to_group_by'. + +The other alternatives include only the rows that pass through HAVING in 'totals', and behave differently with the setting 'max_rows_to_group_by' and 'group_by_overflow_mode = 'any''. + +``after_having_exclusive`` - Don't include rows that didn't pass through ``'max_rows_to_group_by'``. In other words, 'totals' will have less than or the same number of rows as it would if 'max_rows_to_group_by' were omitted. + +``after_having_inclusive`` - Include all the rows that didn't pass through ``'max_rows_to_group_by'`` in 'totals'. In other words, 'totals' will have more than or the same number of rows as it would if 'max_rows_to_group_by' were omitted. + +``after_having_auto`` - Count the number of rows that passed through HAVING. If it is more than a certain amount (by default, 50%), include all the rows that didn't pass through 'max_rows_to_group_by' in 'totals'. 
Otherwise, do not include them. + +``totals_auto_threshold`` - By default, 0.5 is the coefficient for ``after_having_auto``. + +If 'max_rows_to_group_by' and 'group_by_overflow_mode = 'any'' are not used, all variations of 'after_having' are the same, and you can use any of them (for example, 'after_having_auto'). + +You can use WITH TOTALS in subqueries, including subqueries in the JOIN clause. In this case, the respective total values are combined. + +external memory GROUP BY +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +It is possible to turn on spilling temporary data to disk to limit memory consumption during the execution of GROUP BY. Value of ``max_bytes_before_external_group_by`` setting determines the maximum memory consumption before temporary data is dumped to the file system. If it is 0 (the default value), the feature is turned off. + +When using ``max_bytes_before_external_group_by`` it is advisable to set ``max_memory_usage`` to an approximately twice greater value. The reason for this is that aggregation is executed in two stages: reading and generation of intermediate data (1) and merging of intermediate data (2). Spilling data to the filesystem can be performed only on stage 1. If the spilling did not happen, then stage 2 could consume up to the same amount of memory as stage 1. + +For example: if ``max_memory_usage`` is equal to 10000000000 and you want to use external aggregation, it makes sense to set ``max_bytes_before_external_group_by`` to 10000000000 and ``max_memory_usage`` to 20000000000. If dumping data to the file system happened at least once during the execution, maximum memory consumption would be just a little bit higher than ``max_bytes_before_external_group_by``. + +During distributed query execution external aggregation is performed on the remote servers. If you want the memory consumption on the originating server to be small, set ``distributed_aggregation_memory_efficient`` to 1. 
If ``distributed_aggregation_memory_efficient`` is turned on then during merging of the dumped data and also during merging of the query results from the remote servers, total memory consumption is no more than 1/256 * number of threads of the total amount of memory. + +If external aggregation is turned on and total memory consumption was less than ``max_bytes_before_external_group_by`` (meaning that no spilling took place), the query performance is the same as when external aggregation is turned off. If some data was dumped, then execution time will be several times longer (approximately 3x). + +If you have an ORDER BY clause with some small LIMIT after a GROUP BY, then ORDER BY will not consume significant amount of memory. But if no LIMIT is provided, don't forget to turn on external sorting (``max_bytes_before_external_sort``). + +LIMIT N BY modifier +^^^^^^^^^^^^^^^^^^^^^^ + +LIMIT ``N`` BY ``COLUMNS`` allows you to restrict top ``N`` rows per each group of ``COLUMNS``. ``LIMIT N BY`` is unrelated to ``LIMIT`` clause. Key for ``LIMIT N BY`` could contain arbitrary number of columns or expressions. + +Example: + +.. code-block:: sql + + SELECT + domainWithoutWWW(URL) AS domain, + domainWithoutWWW(REFERRER_URL) AS referrer, + device_type, + count() cnt + FROM hits + GROUP BY domain, referrer, device_type + ORDER BY cnt DESC + LIMIT 5 BY domain, device_type + LIMIT 100 + +will select top 5 referrers for each domain - device type pair, total number of rows - 100. + +HAVING clause +""""""""""""" + +Allows filtering the result received after GROUP BY, similar to the WHERE clause. +WHERE and HAVING differ in that WHERE is performed before aggregation (GROUP BY), while HAVING is performed after it. If aggregation is not performed, HAVING can't be used. + +ORDER BY clause +""""""""""""""" + +The ORDER BY clause contains a list of expressions, which can each be assigned DESC or ASC (the sorting direction). If the direction is not specified, ASC is assumed. 
ASC is sorted in ascending order, and DESC in descending order. The sorting direction applies to a single expression, not to the entire list. Example: ``ORDER BY Visits DESC, SearchPhrase`` + +For sorting by String values, you can specify collation (comparison). Example: ``ORDER BY SearchPhrase COLLATE 'tr'`` - for sorting by keyword in ascending order, using the Turkish alphabet, case insensitive, assuming that strings are UTF-8 encoded. COLLATE can be specified or not for each expression in ORDER BY independently. If ASC or DESC is specified, COLLATE is specified after it. When using COLLATE, sorting is always case-insensitive. + +We only recommend using COLLATE for final sorting of a small number of rows, since sorting with COLLATE is less efficient than normal sorting by bytes. + +Rows that have identical values for the list of sorting expressions are output in an arbitrary order, which can also be nondeterministic (different each time). +If the ORDER BY clause is omitted, the order of the rows is also undefined, and may be nondeterministic as well. + +When floating point numbers are sorted, NaNs are separate from the other values. Regardless of the sorting order, NaNs come at the end. In other words, for ascending sorting they are placed as if they are larger than all the other numbers, while for descending sorting they are placed as if they are smaller than the rest. + +Less RAM is used if a small enough LIMIT is specified in addition to ORDER BY. Otherwise, the amount of memory spent is proportional to the volume of data for sorting. For distributed query processing, if GROUP BY is omitted, sorting is partially done on remote servers, and the results are merged on the requestor server. This means that for distributed sorting, the volume of data to sort can be greater than the amount of memory on a single server. + +If there is not enough RAM, it is possible to perform sorting in external memory (creating temporary files on a disk). 
Use the setting max_bytes_before_external_sort for this purpose. If it is set to 0 (the default), external sorting is disabled. If it is enabled, when the volume of data to sort reaches the specified number of bytes, the collected data is sorted and dumped into a temporary file. After all data is read, all the sorted files are merged and the results are output. Files are written to the /var/lib/clickhouse/tmp/ directory in the config (by default, but you can use the 'tmp_path' parameter to change this setting). + +Running a query may use more memory than ``'max_bytes_before_external_sort'``. For this reason, this setting must have a value significantly smaller than 'max_memory_usage'. As an example, if your server has 128 GB of RAM and you need to run a single query, set 'max_memory_usage' to 100 GB, and 'max_bytes_before_external_sort' to 80 GB. + +External sorting works much less effectively than sorting in RAM. + +SELECT clause +""""""""""""" + +The expressions specified in the SELECT clause are analyzed after the calculations for all the clauses listed above are completed. +More specifically, expressions are analyzed that are above the aggregate functions, if there are any aggregate functions. The aggregate functions and everything below them are calculated during aggregation (GROUP BY). These expressions work as if they are applied to separate rows in the result. + +DISTINCT clause +""""""""""""""" + +If DISTINCT is specified, only a single row will remain out of all the sets of fully matching rows in the result. +The result will be the same as if GROUP BY were specified across all the fields specified in SELECT without aggregate functions. But there are several differences from GROUP BY: + +- DISTINCT can be applied together with GROUP BY. +- When ORDER BY is omitted and LIMIT is defined, the query stops running immediately after the required number of different rows has been read. In this case, using DISTINCT is much more optimal. 
+- Data blocks are output as they are processed, without waiting for the entire query to finish running. + +DISTINCT is not supported if SELECT has at least one array column. + +LIMIT clause +"""""""""""" + +LIMIT m allows you to select the first 'm' rows from the result. +LIMIT n, m allows you to select the first 'm' rows from the result after skipping the first 'n' rows. + +'n' and 'm' must be non-negative integers. + +If there isn't an ORDER BY clause that explicitly sorts results, the result may be arbitrary and nondeterministic. + +UNION ALL clause +"""""""""""""""" + +You can use UNION ALL to combine any number of queries. Example: + +.. code-block:: sql + + SELECT CounterID, 1 AS table, toInt64(count()) AS c + FROM test.hits + GROUP BY CounterID + + UNION ALL + + SELECT CounterID, 2 AS table, sum(Sign) AS c + FROM test.visits + GROUP BY CounterID + HAVING c > 0 + +Only UNION ALL is supported. The regular UNION (UNION DISTINCT) is not supported. If you need UNION DISTINCT, you can write SELECT DISTINCT from a subquery containing UNION ALL. + +Queries that are parts of UNION ALL can be run simultaneously, and their results can be mixed together. + +The structure of results (the number and type of columns) must match for the queries, but the column names can differ. In this case, the column names for the final result will be taken from the first query. + +Queries that are parts of UNION ALL can't be enclosed in brackets. ORDER BY and LIMIT are applied to separate queries, not to the final result. If you need to apply a conversion to the final result, you can put all the queries with UNION ALL in a subquery in the FROM clause. + +INTO OUTFILE clause +""""""""""""""""""" + +Add ``INTO OUTFILE`` filename clause (where filename is a string literal) to redirect query output to a file filename. +In contrast to MySQL the file is created on a client host. The query will fail if a file with the same filename already exists. 
+INTO OUTFILE is available in the command-line client and clickhouse-local (a query sent via HTTP interface will fail). + +Default output format is TabSeparated (the same as in the batch mode of command-line client). + +FORMAT clause +""""""""""""" +Specify 'FORMAT format' to get data in any specified format. +You can use this for convenience, or for creating dumps. For more information, see the section "Formats". +If the FORMAT clause is omitted, the default format is used, which depends on both the settings and the interface used for accessing the DB. For the HTTP interface and the command-line client in batch mode, the default format is TabSeparated. For the command-line client in interactive mode, the default format is PrettyCompact (it has attractive and compact tables). + +When using the command-line client, data is passed to the client in an internal efficient format. The client independently interprets the FORMAT clause of the query and formats the data itself (thus relieving the network and the server from the load). + +IN operators +"""""""""""" + +The ``IN``, ``NOT IN``, ``GLOBAL IN``, and ``GLOBAL NOT IN`` operators are covered separately, since their functionality is quite rich. + +The left side of the operator is either a single column or a tuple. + +Examples: + +.. code-block:: sql + + SELECT UserID IN (123, 456) FROM ... + SELECT (CounterID, UserID) IN ((34, 123), (101500, 456)) FROM ... + +If the left side is a single column that is in the index, and the right side is a set of constants, the system uses the index for processing the query. + +Don't list too many values explicitly (i.e. millions). If a data set is large, put it in a temporary table (for example, see the section "External data for query processing"), then use a subquery. + +The right side of the operator can be a set of constant expressions, a set of tuples with constant expressions (shown in the examples above), or the name of a database table or SELECT subquery in brackets. 
+ +If the right side of the operator is the name of a table (for example, ``UserID IN users``), this is equivalent to the subquery ``UserID IN (SELECT * FROM users)``. Use this when working with external data that is sent along with the query. For example, the query can be sent together with a set of user IDs loaded to the 'users' temporary table, which should be filtered. + +If the right side of the operator is a table name that has the Set engine (a prepared data set that is always in RAM), the data set will not be created over again for each query. + +The subquery may specify more than one column for filtering tuples. +Example: + +.. code-block:: sql + + SELECT (CounterID, UserID) IN (SELECT CounterID, UserID FROM ...) FROM ... + +The columns to the left and right of the ``IN`` operator should have the same type. + +The IN operator and subquery may occur in any part of the query, including in aggregate functions and lambda functions. +Example: + +.. code-block:: sql + + SELECT + EventDate, + avg(UserID IN + ( + SELECT UserID + FROM test.hits + WHERE EventDate = toDate('2014-03-17') + )) AS ratio + FROM test.hits + GROUP BY EventDate + ORDER BY EventDate ASC + + ┌──EventDate─┬────ratio─┐ + │ 2014-03-17 │ 1 │ + │ 2014-03-18 │ 0.807696 │ + │ 2014-03-19 │ 0.755406 │ + │ 2014-03-20 │ 0.723218 │ + │ 2014-03-21 │ 0.697021 │ + │ 2014-03-22 │ 0.647851 │ + │ 2014-03-23 │ 0.648416 │ + └────────────┴──────────┘ + +- for each day after March 17th, count the percentage of pageviews made by users who visited the site on March 17th. +A subquery in the IN clause is always run just one time on a single server. There are no dependent subqueries. + +Distributed subqueries +""""""""""""""""""""""""" + +There are two versions of INs with subqueries (and for JOINs): the regular ``IN`` / ``JOIN``, and ``GLOBAL IN`` / ``GLOBAL JOIN``. They differ in how they are run for distributed query processing. 
+ +When using the regular ``IN``, the query is sent to remote servers, and each of them runs the subqueries in the IN or JOIN clause. + +When using ``GLOBAL IN`` / ``GLOBAL JOIN``, first all the subqueries for ``GLOBAL IN`` / ``GLOBAL JOIN`` are run, and the results are collected in temporary tables. Then the temporary tables are sent to each remote server, where the queries are run using this temporary data. + +For a non-distributed query, use the regular ``IN`` / ``JOIN``. + +Be careful when using subqueries in the ``IN`` / ``JOIN`` clauses for distributed query processing. + +Let's look at some examples. Assume that each server in the cluster has a normal local_table. Each server also has a **distributed_table** table with the Distributed type, which looks at all the servers in the cluster. + +For a query to the **distributed_table**, the query will be sent to all the remote servers and run on them using the **local_table**. + +For example, the query + +``SELECT uniq(UserID) FROM distributed_table`` + +will be sent to all the remote servers as + +``SELECT uniq(UserID) FROM local_table`` + +and run on each of them in parallel, until it reaches the stage where intermediate results can be combined. Then the intermediate results will be returned to the requestor server and merged on it, and the final result will be sent to the client. + +Now let's examine a query with IN: + +.. code-block:: sql + + SELECT uniq(UserID) FROM distributed_table WHERE CounterID = 101500 AND UserID IN (SELECT UserID FROM local_table WHERE CounterID = 34) + +- calculates the overlap in the audiences of two websites. + +This query will be sent to all the remote servers as + +.. code-block:: sql + + SELECT uniq(UserID) FROM local_table WHERE CounterID = 101500 AND UserID IN (SELECT UserID FROM local_table WHERE CounterID = 34) + +In other words, the data set in the IN clause will be collected on each server independently, only across the data that is stored locally on each of the servers. 
+ +This will work correctly and optimally if you are prepared for this case and have spread data across the cluster servers such that the data for a single UserID resides entirely on a single server. In this case, all the necessary data will be available locally on each server. Otherwise, the result will be inaccurate. We refer to this variation of the query as "local IN". + +To correct how the query works when data is spread randomly across the cluster servers, you could specify **distributed_table** inside a subquery. The query would look like this: + +.. code-block:: sql + + SELECT uniq(UserID) FROM distributed_table WHERE CounterID = 101500 AND UserID IN (SELECT UserID FROM distributed_table WHERE CounterID = 34) + +This query will be sent to all remote servers as + +.. code-block:: sql + SELECT uniq(UserID) FROM local_table WHERE CounterID = 101500 AND UserID IN (SELECT UserID FROM distributed_table WHERE CounterID = 34) + +Each of the remote servers will start running the subquery. Since the subquery uses a distributed table, each remote server will re-send the subquery to every remote server, as + +.. code-block:: sql + + SELECT UserID FROM local_table WHERE CounterID = 34 + +For example, if you have a cluster of 100 servers, executing the entire query will require 10,000 elementary requests, which is generally considered unacceptable. + +In such cases, you should always use ``GLOBAL IN`` instead of ``IN``. Let's look at how it works for the query + +.. code-block:: sql + + SELECT uniq(UserID) FROM distributed_table WHERE CounterID = 101500 AND UserID GLOBAL IN (SELECT UserID FROM distributed_table WHERE CounterID = 34) + +The requestor server will execute the subquery + +.. code-block:: sql + + SELECT UserID FROM distributed_table WHERE CounterID = 34 + +and the result will be put in a temporary table in RAM. Then a query will be sent to each remote server as + +.. 
code-block:: sql + + SELECT uniq(UserID) FROM local_table WHERE CounterID = 101500 AND UserID GLOBAL IN _data1 + +and the temporary table '_data1' will be sent to every remote server together with the query (the name of the temporary table is implementation-defined). + +This is more optimal than using the normal IN. However, keep the following points in mind: + +#. When creating a temporary table, data is not made unique. To reduce the volume of data transmitted over the network, specify DISTINCT in the subquery. (You don't need to do this for a normal IN.) +#. The temporary table will be sent to all the remote servers. Transmission does not account for network topology. For example, if 10 remote servers reside in a datacenter that is very remote in relation to the requestor server, the data will be sent 10 times over the channel to the remote datacenter. Try to avoid large data sets when using GLOBAL IN. +#. When transmitting data to remote servers, restrictions on network bandwidth are not configurable. You might overload the network. +#. Try to distribute data across servers so that you don't need to use GLOBAL IN on a regular basis. +#. If you need to use GLOBAL IN often, plan the location of the ClickHouse cluster so that in each datacenter, there will be at least one replica of each shard, and there is a fast network between them - for possibility to process query with transferring data only inside datacenter. + +It also makes sense to specify a local table in the GLOBAL IN clause, in case this local table is only available on the requestor server and you want to use data from it on remote servers. + +Extreme values +"""""""""""""""""""""" + +In addition to results, you can also get minimum and maximum values for the results columns. To do this, set the 'extremes' setting to '1'. Minimums and maximums are calculated for numeric types, dates, and dates with times. For other columns, the default values are output. 
+ +An extra two rows are calculated - the minimums and maximums, respectively. These extra two rows are output in JSON*, TabSeparated*, and Pretty* formats, separate from the other rows. They are not output for other formats. + +In JSON* formats, the extreme values are output in a separate 'extremes' field. In TabSeparated formats, the row comes after the main result, and after 'totals' if present. It is preceded by an empty row (after the other data). In Pretty formats, the row is output as a separate table after the main result, and after 'totals' if present. + +Extreme values are calculated for rows that have passed through LIMIT. However, when using 'LIMIT offset, size', the rows before 'offset' are included in 'extremes'. In stream requests, the result may also include a small number of rows that passed through LIMIT. + +Notes +""""""""" + +The GROUP BY and ORDER BY clauses do not support positional arguments. This contradicts MySQL, but conforms to standard SQL. +For example, ``'GROUP BY 1, 2'`` will be interpreted as grouping by constants (i.e. aggregation of all rows into one). + +You can use synonyms (AS aliases) in any part of a query. + +You can put an asterisk in any part of a query instead of an expression. When the query is analyzed, the asterisk is expanded to a list of all table columns (excluding the ``MATERIALIZED`` and ALIAS columns). There are only a few cases when using an asterisk is justified: +* When creating a table dump. +* For tables containing just a few columns, such as system tables. +* For getting information about what columns are in a table. In this case, set ``'LIMIT 1'``. But it is better to use the ``DESC TABLE`` query. +* When there is strong filtration on a small number of columns using ``PREWHERE``. +* In subqueries (since columns that aren't needed for the external query are excluded from subqueries). 
+In all other cases, we don't recommend using the asterisk, since it only gives you the drawbacks of a columnar DBMS instead of the advantages. + +KILL QUERY +~~~~~~~~~~ + +.. code-block:: sql + + KILL QUERY WHERE <where expression to SELECT FROM system.processes query> [SYNC|ASYNC|TEST] [FORMAT format] + +Tries to terminate currently executing queries. +The queries to be terminated are selected from the ``system.processes`` table according to the expression after the WHERE keyword. + +Examples: + +.. code-block:: sql + + KILL QUERY WHERE query_id='2-857d-4a57-9ee0-327da5d60a90' + +Finishes all queries with specified query_id. + +.. code-block:: sql + + KILL QUERY WHERE user='username' SYNC + +Synchronously finishes all queries of user ``username``. + +Readonly users can kill only their own queries. diff --git a/docs/en/query_language/syntax.rst b/docs/en/query_language/syntax.rst new file mode 100644 index 00000000000..0ed259cd008 --- /dev/null +++ b/docs/en/query_language/syntax.rst @@ -0,0 +1,98 @@ +Syntax +--------- + +There are two types of parsers in the system: a full SQL parser (a recursive descent parser), and a data format parser (a fast stream parser). In all cases except the INSERT query, only the full SQL parser is used. +The INSERT query uses both parsers: +:: + INSERT INTO t VALUES (1, 'Hello, world'), (2, 'abc'), (3, 'def') + +The ``INSERT INTO t VALUES`` fragment is parsed by the full parser, and the data ``(1, 'Hello, world'), (2, 'abc'), (3, 'def')`` is parsed by the fast stream parser. +Data can have any format. When a query is received, the server reads no more than 'max_query_size' bytes of the request into RAM (by default, 1 MB), and the rest is stream parsed. This means the system doesn't have problems with large INSERT queries, like MySQL does. + +When using the Values format in an ``INSERT`` query, it may seem that data is parsed the same as expressions in a SELECT query, but this is not true. The Values format is much more limited. + +Next we will cover the full parser.
For more information about format parsers, see the section "Formats". + +Spaces +~~~~~~~ +There may be any number of space symbols between syntactical constructions (including the beginning and end of a query). Space symbols include the space, tab, line break, CR, and form feed. + +Comments +~~~~~~~~~~~ +SQL-style and C-style comments are supported. +SQL-style comments: from ``--`` to the end of the line. The space after ``--`` can be omitted. +C-style comments: from ``/*`` to ``*/``. These comments can be multiline. Spaces are not required here, either. + +Keywords +~~~~~~~~~~~~~~ +Keywords (such as SELECT) are not case-sensitive. Everything else (column names, functions, and so on), in contrast to standard SQL, is case-sensitive. Keywords are not reserved (they are just parsed as keywords in the corresponding context). + +Identifiers +~~~~~~~~~~~~~~ +Identifiers (column names, functions, and data types) can be quoted or non-quoted. +Non-quoted identifiers start with a Latin letter or underscore, and continue with a Latin letter, underscore, or number. In other words, they must match the regex ``^[a-zA-Z_][0-9a-zA-Z_]*$``. Examples: ``x``, ``_1``, ``X_y__Z123_``. +Quoted identifiers are placed in backticks ```id``` (the same as in MySQL), and can indicate any set of bytes (non-empty). In addition, symbols (for example, the backtick) inside this type of identifier can be backslash-escaped. Escaping rules are the same as for string literals (see below). +We recommend using identifiers that do not need to be quoted. + +Literals +~~~~~~~~ +There are numeric literals, string literals, and compound literals. + +Numeric literals +""""""""""""""""" +The parser tries to interpret a numeric literal: +- first as a 64-bit unsigned number, using the 'strtoull' function. +- if unsuccessful, as a 64-bit signed number, using the 'strtoll' function. +- if unsuccessful, as a floating-point number using the 'strtod' function. +- otherwise, an error is returned.
+ +The corresponding value will have the smallest type that the value fits in. +For example, 1 is parsed as UInt8, but 256 is parsed as UInt16. For more information, see "Data types". + +Examples: ``1``, ``18446744073709551615``, ``0xDEADBEEF``, ``01``, ``0.1``, ``1e100``, ``-1e-100``, ``inf``, ``nan``. + +String literals +"""""""""""""""""" +Only string literals in single quotes are supported. The enclosed characters can be backslash-escaped. The following escape sequences have special meanings: ``\b``, ``\f``, ``\r``, ``\n``, ``\t``, ``\0``, ``\a``, ``\v``, ``\xHH``. In all other cases, escape sequences like ``\c``, where c is any character, are transformed to c. This means that the sequences ``\'`` and ``\\`` can be used. The value will have the String type. + +The minimum set of symbols that must be escaped in a string literal is ``'`` and ``\``. + +Compound literals +"""""""""""""""""" +Constructions are supported for arrays: ``[1, 2, 3]`` and tuples: ``(1, 'Hello, world!', 2)``. +Actually, these are not literals, but expressions with the array creation operator and the tuple creation operator, respectively. For more information, see the section "Operators". +An array must consist of at least one item, and a tuple must have at least two items. +Tuples have a special purpose for use in the IN clause of a SELECT query. Tuples can be obtained as the result of a query, but they can't be saved to a database (with the exception of Memory-type tables). + +Functions +~~~~~~~~~ +Functions are written like an identifier with a list of arguments (possibly empty) in brackets. In contrast to standard SQL, the brackets are required, even for an empty arguments list. Example: ``now()``. +There are regular and aggregate functions (see the section "Aggregate functions"). Some aggregate functions can contain two lists of arguments in brackets. Example: ``quantile(0.9)(x)``.
These aggregate functions are called "parametric" functions, and the arguments in the first list are called "parameters". The syntax of aggregate functions without parameters is the same as for regular functions. + +Operators +~~~~~~~~~ +Operators are converted to their corresponding functions during query parsing, taking their priority and associativity into account. +For example, the expression ``1 + 2 * 3 + 4`` is transformed to ``plus(plus(1, multiply(2, 3)), 4)``. +For more information, see the section "Operators" below. + +Data types and database table engines +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Data types and table engines in the ``CREATE`` query are written the same way as identifiers or functions. In other words, they may or may not contain an arguments list in brackets. For more information, see the sections "Data types," "Table engines," and "CREATE". + +Synonyms +~~~~~~~~ +In the SELECT query, expressions can specify synonyms using the AS keyword. Any expression is placed to the left of AS. The identifier name for the synonym is placed to the right of AS. As opposed to standard SQL, synonyms can be declared not only at the top level of expressions: +:: + SELECT (1 AS n) + 2, n + +In contrast to standard SQL, synonyms can be used in all parts of a query, not just ``SELECT``. + +Asterisk +~~~~~~~~~ +In a ``SELECT`` query, an asterisk can replace the expression. For more information, see the section "SELECT". + +Expressions +~~~~~~~~~~~ +An expression is a function, identifier, literal, application of an operator, expression in brackets, subquery, or asterisk. It can also contain a synonym. +A list of expressions is one or more expressions separated by commas. +Functions and operators, in turn, can have expressions as arguments.
diff --git a/docs/en/quotas.rst b/docs/en/quotas.rst new file mode 100644 index 00000000000..6b7be9b0e49 --- /dev/null +++ b/docs/en/quotas.rst @@ -0,0 +1,95 @@ +Quotas +====== + +Quotas allow you to limit resource usage over a period of time, or simply track the use of resources. +Quotas are set up in the user config. This is usually ``users.xml``. + +The system also has a feature for limiting the complexity of a single query (see the section "Restrictions on query complexity"). + +In contrast to query complexity restrictions, quotas: + * place restrictions on a set of queries that can be run over a period of time, instead of limiting a single query. + * account for resources spent on all remote servers for distributed query processing. + +Let's look at the section of the ``users.xml`` file that defines quotas. + +.. code-block:: xml + + <!-- Quotas. --> + <quotas> + <!-- Quota name. --> + <default> + <!-- Restrictions for a time period. You can set multiple time intervals with various restrictions. --> + <interval> + <!-- Length of time. --> + <duration>3600</duration> + + <!-- Unlimited. Just collect data for the specified time interval. --> + <queries>0</queries> + <errors>0</errors> + <result_rows>0</result_rows> + <read_rows>0</read_rows> + <execution_time>0</execution_time> + </interval> + </default> + +By default, the quota just tracks resource consumption for each hour, without limiting usage. + +.. code-block:: xml + + <statbox> + <!-- Restrictions for a time period. You can set multiple time intervals with various restrictions. --> + <interval> + <!-- Length of time. --> + <duration>3600</duration> + <queries>1000</queries> + <errors>100</errors> + <result_rows>1000000000</result_rows> + <read_rows>100000000000</read_rows> + <execution_time>900</execution_time> + </interval> + <interval> + <duration>86400</duration> + <queries>10000</queries> + <errors>1000</errors> + <result_rows>5000000000</result_rows> + <read_rows>500000000000</read_rows> + <execution_time>7200</execution_time> + </interval> + </statbox> + +For the ``statbox`` quota, restrictions are set for every hour and for every 24 hours (86,400 seconds). The time interval is counted starting from an implementation-defined fixed moment in time. In other words, the 24-hour interval doesn't necessarily begin at midnight. + +When the interval ends, all collected values are cleared. For the next hour, the quota calculation starts over. + +Let's examine the amounts that can be restricted: + +``queries`` - The overall number of queries. + +``errors`` - The number of queries that threw exceptions. + +``result_rows`` - The total number of rows output in results. + +``read_rows`` - The total number of source rows retrieved from tables for running a query, on all remote servers. + +``execution_time`` - The total time of query execution, in seconds (wall time).
+ +If the limit is exceeded for at least one time interval, an exception is thrown with a text about which restriction was exceeded, for which interval, and when the new interval begins (when queries can be sent again). + +Quotas can use the "quota key" feature in order to report on resources for multiple keys independently. Here is an example of this: + +.. code-block:: xml + + + + + + +The quota is assigned to users in the ``users`` section of the config. See the section "Access rights". + +For distributed query processing, the accumulated amounts are stored on the requestor server. So if the user goes to another server, the quota there will "start over". + +When the server is restarted, quotas are reset. diff --git a/docs/en/settings/index.rst b/docs/en/settings/index.rst new file mode 100644 index 00000000000..b35c1d18805 --- /dev/null +++ b/docs/en/settings/index.rst @@ -0,0 +1,9 @@ +Settings +========== + +In this section, we review settings that you can make using a SET query or in a config file. Remember that these settings can be set for a session or globally. Settings that can only be made in the server config file are not covered here. + +.. toctree:: + :glob: + + * diff --git a/docs/en/settings/query_complexity.rst b/docs/en/settings/query_complexity.rst new file mode 100644 index 00000000000..0b583acbb1c --- /dev/null +++ b/docs/en/settings/query_complexity.rst @@ -0,0 +1,173 @@ +Restrictions on query complexity +===================== +Restrictions on query complexity are part of the settings. +They are used in order to provide safer execution from the user interface. +Almost all the restrictions only apply to SELECTs. +For distributed query processing, restrictions are applied on each server separately. + +Restrictions on the "maximum amount of something" can take the value 0, which means "unrestricted". +Most restrictions also have an 'overflow_mode' setting, meaning what to do when the limit is exceeded. 
+It can take one of two values: 'throw' or 'break'. Restrictions on aggregation (``group_by_overflow_mode``) also have the value ``any``. + +``throw`` - Throw an exception (default). + +``break`` - Stop executing the query and return the partial result, as if the source data ran out. + +``any`` (only for group_by_overflow_mode) - Continuing aggregation for the keys that got into the set, but don't add new keys to the set. + +readonly +------- +If set to 0, allows to run any queries. +If set to 1, allows to run only queries that don't change data or settings (e.g. SELECT or SHOW). INSERT and SET are forbidden. +If set to 2, allows to run queries that don't change data (SELECT, SHOW) and allows to change settings (SET). + +After you set the read-only mode, you won't be able to disable it in the current session. + +When using the GET method in the HTTP interface, 'readonly = 1' is set automatically. In other words, for queries that modify data, you can only use the POST method. You can send the query itself either in the POST body, or in the URL parameter. + +max_memory_usage +-------------- +The maximum amount of memory consumption when running a query on a single server. By default, 10 GB. + +The setting doesn't consider the volume of available memory or the total volume of memory on the machine. +The restriction applies to a single query within a single server. +You can use SHOW PROCESSLIST to see the current memory consumption for each query. +In addition, the peak memory consumption is tracked for each query and written to the log. + +Certain cases of memory consumption are not tracked: + * Large constants (for example, a very long string constant). + * The states of 'groupArray' aggregate functions, and also 'quantile' (it is tracked for 'quantileTiming'). + +Memory consumption is not fully considered for aggregate function states ``min``, ``max``, ``any``, ``anyLast``, ``argMin``, and ``argMax`` from String and Array arguments. 
+ +max_rows_to_read +--------------- +The following restrictions can be checked on each block (instead of on each row). That is, the restrictions can be broken a little. +When running a query in multiple threads, the following restrictions apply to each thread separately. + +Maximum number of rows that can be read from a table when running a query. + +max_bytes_to_read +------------- +Maximum number of bytes (uncompressed data) that can be read from a table when running a query. + +read_overflow_mode +------------- +What to do when the volume of data read exceeds one of the limits: ``throw`` or ``break``. ``By default, throw``. + +max_rows_to_group_by +------------- +Maximum number of unique keys received from aggregation. This setting lets you limit memory consumption when aggregating. + +group_by_overflow_mode +--------------- +What to do when the number of unique keys for aggregation exceeds the limit: ``throw``, ``break``, or ``any``. ``By default, throw``. +Using the 'any' value lets you run an approximation of GROUP BY. The quality of this approximation depends on the statistical nature of the data. + +max_rows_to_sort +-------------- +Maximum number of rows before sorting. This allows you to limit memory consumption when sorting. + +max_bytes_to_sort +------------- +Maximum number of bytes before sorting. + +sort_overflow_mode +------------ +What to do if the number of rows received before sorting exceeds one of the limits: ``throw`` or ``break``. ``By default, throw``. + +max_result_rows +------------- +Limit on the number of rows in the result. Also checked for subqueries, and on remote servers when running parts of a distributed query. + +max_result_bytes +------------- +Limit on the number of bytes in the result. The same as the previous setting. + +result_overflow_mode +-------------- +What to do if the volume of the result exceeds one of the limits: ``throw`` or ``break``. By default, throw. +Using ``break`` is similar to using ``LIMIT``. 
+ +max_execution_time +-------------- +Maximum query execution time in seconds. +At this time, it is not checked for one of the sorting stages, or when merging and finalizing aggregate functions. + +timeout_overflow_mode +--------------- +What to do if the query is run longer than ``max_execution_time``: ``throw`` or ``break``. ``By default, throw``. + +min_execution_speed +-------------- +Minimal execution speed in rows per second. Checked on every data block when ``timeout_before_checking_execution_speed`` expires. If the execution speed is lower, an exception is thrown. + +timeout_before_checking_execution_speed +--------------- +Checks that execution speed is not too slow (no less than ``min_execution_speed``), after the specified time in seconds has expired. + +max_columns_to_read +-------------- +Maximum number of columns that can be read from a table in a single query. If a query requires reading a greater number of columns, it throws an exception. + +max_temporary_columns +---------------- +Maximum number of temporary columns that must be kept in RAM at the same time when running a query, including constant columns. If there are more temporary columns than this, it throws an exception. + +max_temporary_non_const_columns +--------------------- +The same thing as 'max_temporary_columns', but without counting constant columns. +Note that constant columns are formed fairly often when running a query, but they require approximately zero computing resources. + +max_subquery_depth +------------- +Maximum nesting depth of subqueries. If subqueries are deeper, an exception is thrown. ``By default, 100``. + +max_pipeline_depth +----------- +Maximum pipeline depth. Corresponds to the number of transformations that each data block goes through during query processing. Counted within the limits of a single server. If the pipeline depth is greater, an exception is thrown. By default, 1000. + +max_ast_depth +----------- +Maximum nesting depth of a query syntactic tree. 
If exceeded, an exception is thrown. At this time, it isn't checked during parsing, but only after parsing the query. That is, a syntactic tree that is too deep can be created during parsing, but the query will fail. By default, 1000. + +max_ast_elements +----------- +Maximum number of elements in a query syntactic tree. If exceeded, an exception is thrown. +In the same way as the previous setting, it is checked only after parsing the query. ``By default, 10,000``. + +max_rows_in_set +---------- +Maximum number of rows for a data set in the IN clause created from a subquery. + +max_bytes_in_set +----------- +Maximum number of bytes (uncompressed data) used by a set in the IN clause created from a subquery. + +set_overflow_mode +----------- +What to do when the amount of data exceeds one of the limits: ``throw`` or ``break``. ``By default, throw``. + +max_rows_in_distinct +----------- +Maximum number of different rows when using DISTINCT. + +max_bytes_in_distinct +-------------- +Maximum number of bytes used by a hash table when using DISTINCT. + +distinct_overflow_mode +------------ +What to do when the amount of data exceeds one of the limits: ``throw`` or ``break``. ``By default, throw``. + +max_rows_to_transfer +----------- +Maximum number of rows that can be passed to a remote server or saved in a temporary table when using GLOBAL IN. + +max_bytes_to_transfer +----------- +Maximum number of bytes (uncompressed data) that can be passed to a remote server or saved in a temporary table when using GLOBAL IN. + +transfer_overflow_mode +--------- +What to do when the amount of data exceeds one of the limits: ``throw`` or ``break``. ``By default, throw``. diff --git a/docs/en/settings/settings.rst b/docs/en/settings/settings.rst new file mode 100644 index 00000000000..f8b40f9593c --- /dev/null +++ b/docs/en/settings/settings.rst @@ -0,0 +1,208 @@ +max_block_size +-------------- +In ClickHouse, data is processed by blocks (sets of column parts). 
The internal processing cycles for a single block are efficient enough, but there are noticeable expenditures on each block. 'max_block_size' is a recommendation for what size of block (in number of rows) to load from tables. The block size shouldn't be too small, so that the expenditures on each block remain negligible, but not too large, so that the query with LIMIT that is completed after the first block is processed quickly, so that too much memory isn't consumed when extracting a large number of columns in multiple threads, and so that at least some cache locality is preserved. + +By default, it is 65,536. + +Blocks the size of 'max_block_size' are not always loaded from the table. If it is obvious that less data needs to be retrieved, a smaller block is processed. + +max_insert_block_size +--------------------- +The size of blocks to form for insertion into a table. +This setting only applies in cases when the server forms the blocks. +For example, for an INSERT via the HTTP interface, the server parses the data format and forms blocks of the specified size. +But when using clickhouse-client, the client parses the data itself, and the ``max_insert_block_size`` setting on the server doesn't affect the size of the inserted blocks. +The setting also doesn't have a purpose when using INSERT SELECT, since data is inserted in the same blocks that are formed after SELECT. + +By default, it is 1,048,576. + +This is slightly more than 'max_block_size'. The reason for this is because certain table engines (*MergeTree) form a data part on the disk for each inserted block, which is a fairly large entity. Similarly, *MergeTree tables sort data during insertion, and a large enough block size allows sorting more data in RAM. + +max_threads +----------- +The maximum number of query processing threads +- excluding threads for retrieving data from remote servers (see the ``max_distributed_connections`` parameter).
+ +This parameter applies to threads that perform the same stages of the query execution pipeline in parallel. +For example, if reading from a table, evaluating expressions with functions, filtering with WHERE and pre-aggregating for GROUP BY can all be done in parallel using at least ``max_threads`` number of threads, then 'max_threads' are used. + +By default, ``8``. + +If less than one SELECT query is normally run on a server at a time, set this parameter to a value slightly less than the actual number of processor cores. + +For queries that are completed quickly because of a LIMIT, you can set a lower ``max_threads``. For example, if the necessary number of entries are located in every block and ``max_threads = 8``, 8 blocks are retrieved, although it would have been enough to read just one. + +The smaller the ``max_threads`` value, the less memory is consumed. + +max_compress_block_size +----------- +The maximum size of blocks of uncompressed data before compressing for writing to a table. By default, ``1,048,576 (1 MiB)``. If the size is reduced, the compression rate is significantly reduced, the compression and decompression speed increases slightly due to cache locality, and memory consumption is reduced. There usually isn't any reason to change this setting. + +Don't confuse blocks for compression (a chunk of memory consisting of bytes) and blocks for query processing (a set of rows from a table). + +min_compress_block_size +-------------- +For *MergeTree tables. In order to reduce latency when processing queries, a block is compressed when writing the next mark if its size is at least ``min_compress_block_size``. By default, 65,536. + +The actual size of the block, if the uncompressed data is less than ``max_compress_block_size``, is no less than this value and no less than the volume of data for one mark. + +Let's look at an example. Assume that ``index_granularity`` was set to 8192 during table creation.
+ +We are writing a UInt32-type column (4 bytes per value). When writing 8192 rows, the total will be 32 KB of data. Since ``min_compress_block_size = 65,536``, a compressed block will be formed for every two marks. + +We are writing a URL column with the String type (average size of 60 bytes per value). When writing 8192 rows, the average will be slightly less than 500 KB of data. Since this is more than 65,536, a compressed block will be formed for each mark. In this case, when reading data from the disk in the range of a single mark, extra data won't be decompressed. + +There usually isn't any reason to change this setting. + +max_query_size +----------- +The maximum part of a query that can be taken to RAM for parsing with the SQL parser. +The INSERT query also contains data for INSERT that is processed by a separate stream parser (that consumes O(1) RAM), which is not included in this restriction. + +``By default, 256 KiB.`` + +interactive_delay +------------- +The interval in microseconds for checking whether request execution has been canceled and sending the progress. +By default, 100,000 (check for canceling and send progress ten times per second). + +connect_timeout +----------- + +receive_timeout +--------- + +send_timeout +--------- +Timeouts in seconds on the socket used for communicating with the client. +``By default, 10, 300, 300.`` + +poll_interval +---------- +Lock in a wait loop for the specified number of seconds. +``By default, 10``. + +max_distributed_connections +---------------- +The maximum number of simultaneous connections with remote servers for distributed processing of a single query to a single Distributed table. We recommend setting a value no less than the number of servers in the cluster. + +``By default, 100.`` + +The following parameters are only used when creating Distributed tables (and when launching a server), so there is no reason to change them at runtime. 
+ +distributed_connections_pool_size +------------------- +The maximum number of simultaneous connections with remote servers for distributed processing of all queries to a single Distributed table. We recommend setting a value no less than the number of servers in the cluster. + +``By default, 128.`` + +connect_timeout_with_failover_ms +---------------- +The timeout in milliseconds for connecting to a remote server for a Distributed table engine, if the 'shard' and 'replica' sections are used in the cluster definition. +If unsuccessful, several attempts are made to connect to various replicas. + +``By default, 50.`` + +connections_with_failover_max_tries +---------------- +The maximum number of connection attempts with each replica, for the Distributed table engine. + +``By default, 3.`` + +extremes +----- +Whether to count extreme values (the minimums and maximums in columns of a query result). +Accepts 0 or 1. By default, 0 (disabled). +For more information, see the section "Extreme values". + +use_uncompressed_cache +---------- +Whether to use a cache of uncompressed blocks. Accepts 0 or 1. By default, 0 (disabled). +The uncompressed cache (only for tables in the MergeTree family) allows significantly reducing latency and increasing throughput when working with a large number of short queries. Enable this setting for users who send frequent short requests. Also pay attention to the ``uncompressed_cache_size`` configuration parameter (only set in the config file) - the size of uncompressed cache blocks. +By default, it is 8 GiB. The uncompressed cache is filled in as needed; the least-used data is automatically deleted. + +For queries that read at least a somewhat large volume of data (one million rows or more), the uncompressed cache is disabled automatically in order to save space for truly small queries. So you can keep the ``use_uncompressed_cache`` setting always set to 1. 
+ +replace_running_query +----------- +When using the HTTP interface, the 'query_id' parameter can be passed. This is any string that serves as the query identifier. +If a query from the same user with the same 'query_id' already exists at this time, the behavior depends on the 'replace_running_query' parameter. + +``0 (default)`` - Throw an exception (don't allow the query to run if a query with the same 'query_id' is already running). +``1`` - Cancel the old query and start running the new one. + +Yandex.Metrica uses this parameter set to 1 for implementing suggestions for segmentation conditions. After entering the next character, if the old query hasn't finished yet, it should be canceled. + +load_balancing +----------- +Which replicas (among healthy replicas) to preferably send a query to (on the first attempt) for distributed processing. + +random (default) +~~~~~~~~~~~~~~~~ +The number of errors is counted for each replica. The query is sent to the replica with the fewest errors, and if there are several of these, to any one of them. +Disadvantages: Server proximity is not accounted for; if the replicas have different data, you will also get different data. + +nearest_hostname +~~~~~~~~~ +The number of errors is counted for each replica. Every 5 minutes, the number of errors is integrally divided by 2. Thus, the number of errors is calculated for a recent time with exponential smoothing. If there is one replica with a minimal number of errors (i.e. errors occurred recently on the other replicas), the query is sent to it. If there are multiple replicas with the same minimal number of errors, the query is sent to the replica with a host name that is most similar to the server's host name in the config file (for the number of different characters in identical positions, up to the minimum length of both host names).
+ +As an example, example01-01-1 and example01-01-2.yandex.ru are different in one position, while example01-01-1 and example01-02-2 differ in two places. +This method might seem a little stupid, but it doesn't use external data about network topology, and it doesn't compare IP addresses, which would be complicated for our IPv6 addresses. + +Thus, if there are equivalent replicas, the closest one by name is preferred. +We can also assume that when sending a query to the same server, in the absence of failures, a distributed query will also go to the same servers. So even if different data is placed on the replicas, the query will return mostly the same results. + +in_order +~~~~~~~ +Replicas are accessed in the same order as they are specified. The number of errors does not matter. This method is appropriate when you know exactly which replica is preferable. + +totals_mode +----------- +How to calculate TOTALS when HAVING is present, as well as when max_rows_to_group_by and group_by_overflow_mode = 'any' are present. +See the section "WITH TOTALS modifier". + +totals_auto_threshold +-------------- +The threshold for ``totals_mode = 'auto'``. +See the section "WITH TOTALS modifier". + +default_sample +---------- +A floating-point number from 0 to 1. By default, 1. +Allows setting a default sampling coefficient for all SELECT queries. +(For tables that don't support sampling, an exception will be thrown.) +If set to 1, default sampling is not performed. + +max_parallel_replicas +--------------- +The maximum number of replicas of each shard used when the query is executed. +For consistency (to get different parts of the same partition), this option only works for the specified sampling key. +The lag of the replicas is not controlled. + +compile +------- +Enable query compilation. The default is 0 (disabled). + +Compilation is provided for only part of the request processing pipeline - for the first aggregation step (GROUP BY). 
+In the event that this part of the pipeline was compiled, the query can work faster, by unrolling short loops and inlining the aggregate function calls. The maximum performance increase (up to four times in rare cases) is achieved on queries with several simple aggregate functions. Typically, the performance gain is negligible. In very rare cases, the request may be slowed down. + +min_count_to_compile +--------------- +The number of times a compiled piece of code could have been used before its compilation is actually performed. The default is 3. +In case the value is zero, the compilation is executed synchronously, and the request will wait for the compilation process to finish before continuing. This can be used for testing; otherwise, use values starting with 1. Typically, compilation takes about 5-10 seconds. +If the value is 1 or more, the compilation is performed asynchronously, in a separate thread. When the result is ready, it is used immediately, including by requests that are already running. + +The compiled code is required for each different combination of aggregate functions used in the query and the type of keys in GROUP BY. +The compilation results are saved in the build directory as .so files. The number of compilation results is unlimited, since they do not take up much space. When the server is restarted, the old results will be used, except after a server update, in which case the old results are deleted. + +input_format_skip_unknown_fields +---------------- +If the parameter is true, INSERT operation will skip columns with unknown names from input. +Otherwise, an exception is thrown; this is the default behavior. +The parameter works only for JSONEachRow and TSKV input formats. + +output_format_json_quote_64bit_integers +----------------- +If the parameter is true (default value), UInt64 and Int64 numbers are printed as quoted strings in all JSON output formats.
+Such behavior is compatible with most JavaScript interpreters that stores all numbers as double-precision floating point numbers. +Otherwise, they are printed as regular numbers. diff --git a/docs/en/settings/settings_profiles.rst b/docs/en/settings/settings_profiles.rst new file mode 100644 index 00000000000..914d69f928b --- /dev/null +++ b/docs/en/settings/settings_profiles.rst @@ -0,0 +1,50 @@ +Settings profiles +================ +A settings profile is a collection of settings grouped under the same name. Each ClickHouse user has a profile. +To apply all the settings in a profile, set 'profile'. Example: +:: + SET profile = 'web' + +- Load the 'web' profile. That is, set all the options belonging to the 'web' profile. + +Settings profiles are declared in the user config file. This is normally 'users.xml'. +Example: + +.. code-block:: xml + + + + + + + 8 + + + + 1000000000 + 100000000000 + 1000000 + any + 1000000 + 1000000000 + 100000 + 100000000 + break + 600 + 1000000 + 15 + 25 + 100 + 50 + 2 + 25 + 50 + 100 + 1 + + + + +In the example, two profiles are set: ``default`` and ``web``. The ``default`` profile has a special purpose - it must always be present and is applied when starting the server. In other words, the ``default`` profile contains default settings. The ``web`` profile is a regular profile that can be set using the SET query or using a URL parameter in an HTTP query. + +Settings profiles can inherit from each other. To use inheritance, indicate the 'profile' setting before the other settings that are listed in the profile. diff --git a/docs/en/system_tables/index.rst b/docs/en/system_tables/index.rst new file mode 100644 index 00000000000..6e5047db547 --- /dev/null +++ b/docs/en/system_tables/index.rst @@ -0,0 +1,13 @@ +System tables +========== + +System tables are used for implementing part of the system's functionality, and for providing access to information about how the system is working. 
+You can't delete a system table (but you can perform DETACH). +System tables don't have files with data on the disk or files with metadata. The server creates all the system tables when it starts. +System tables are read-only. +System tables are located in the 'system' database. + +.. toctree:: + :glob: + + * diff --git a/docs/en/system_tables/system.asynchronous_metrics.rst b/docs/en/system_tables/system.asynchronous_metrics.rst new file mode 100644 index 00000000000..3fa9b49403b --- /dev/null +++ b/docs/en/system_tables/system.asynchronous_metrics.rst @@ -0,0 +1,5 @@ +system.asynchronous_metrics +--------------------------- + +Like system.events, but shows information about events that are currently executing and resources that are currently being consumed. +Example: The number of currently executing SELECT queries; memory consumption of the system. diff --git a/docs/en/system_tables/system.clusters.rst b/docs/en/system_tables/system.clusters.rst new file mode 100644 index 00000000000..9995a3bebc2 --- /dev/null +++ b/docs/en/system_tables/system.clusters.rst @@ -0,0 +1,14 @@ +system.clusters +--------------- + +Contains information about clusters available in the config file and the servers in them. +Columns: +:: + cluster String - Cluster name. + shard_num UInt32 - Number of a shard in the cluster, starting from 1. + shard_weight UInt32 - Relative weight of a shard when writing data. + replica_num UInt32 - Number of a replica in the shard, starting from 1. + host_name String - Host name as specified in the config. + host_address String - Host's IP address obtained from DNS. + port UInt16 - The port used to access the server. + user String - The username to use for connecting to the server. diff --git a/docs/en/system_tables/system.columns.rst b/docs/en/system_tables/system.columns.rst new file mode 100644 index 00000000000..88104a6a323 --- /dev/null +++ b/docs/en/system_tables/system.columns.rst @@ -0,0 +1,13 @@ +system.columns +-------------- + +Contains information about the columns in all tables.
+You can use this table to get information similar to ``DESCRIBE TABLE``, but for multiple tables at once. +:: + database String - Name of the database the table is located in. + table String - Table name. + name String - Column name. + type String - Column type. + default_type String - Expression type (DEFAULT, MATERIALIZED, ALIAS) for the default value, or an empty string if it is not defined. + default_expression String - Expression for the default value, or an empty string if it is not defined. + diff --git a/docs/en/system_tables/system.databases.rst b/docs/en/system_tables/system.databases.rst new file mode 100644 index 00000000000..62e55862cb8 --- /dev/null +++ b/docs/en/system_tables/system.databases.rst @@ -0,0 +1,6 @@ +system.databases +---------------- + +This table contains a single String column called 'name' - the name of a database. +Each database that the server knows about has a corresponding entry in the table. +This system table is used for implementing the ``SHOW DATABASES`` query. diff --git a/docs/en/system_tables/system.dictionaries.rst b/docs/en/system_tables/system.dictionaries.rst new file mode 100644 index 00000000000..9db602f4758 --- /dev/null +++ b/docs/en/system_tables/system.dictionaries.rst @@ -0,0 +1,22 @@ +system.dictionaries +------------------- + +Contains information about external dictionaries. + +Columns: +:: + name String - Dictionary name. + type String - Dictionary type: Flat, Hashed, Cache. + origin String - Path to the config file where the dictionary is described. + attribute.names Array(String) - Array of attribute names provided by the dictionary. + attribute.types Array(String) - Corresponding array of attribute types provided by the dictionary. + has_hierarchy UInt8 - Whether the dictionary is hierarchical. + bytes_allocated UInt64 - The amount of RAM used by the dictionary. + hit_rate Float64 - For cache dictionaries, the percent of usage for which the value was in the cache. 
+ element_count UInt64 - The number of items stored in the dictionary. + load_factor Float64 - The filled percentage of the dictionary (for a hashed dictionary, it is the filled percentage of the hash table). + creation_time DateTime - Time of the creation or last successful reload of the dictionary. + last_exception String - Text of an error that occurred when creating or reloading the dictionary, if the dictionary couldn't be created. + source String - Text describing the data source for the dictionary. + +Note that the amount of memory used by the dictionary is not proportional to the number of items stored in it. For example, for flat and cache dictionaries, all the memory cells are pre-assigned, regardless of how full the dictionary actually is. diff --git a/docs/en/system_tables/system.events.rst b/docs/en/system_tables/system.events.rst new file mode 100644 index 00000000000..feaf96a1f44 --- /dev/null +++ b/docs/en/system_tables/system.events.rst @@ -0,0 +1,6 @@ +system.events +------------- + +Contains information about the number of events that have occurred in the system. This is used for profiling and monitoring purposes. +Example: The number of processed SELECT queries. +Columns: 'event String' - the event name, and 'value UInt64' - the quantity. diff --git a/docs/en/system_tables/system.functions.rst b/docs/en/system_tables/system.functions.rst new file mode 100644 index 00000000000..38aef943e93 --- /dev/null +++ b/docs/en/system_tables/system.functions.rst @@ -0,0 +1,9 @@ +system.functions +---------------- + +Contains information about normal and aggregate functions. +Columns: + +:: + name String - Function name. + is_aggregate UInt8 - Whether it is an aggregate function.
diff --git a/docs/en/system_tables/system.merges.rst b/docs/en/system_tables/system.merges.rst new file mode 100644 index 00000000000..3b5591883da --- /dev/null +++ b/docs/en/system_tables/system.merges.rst @@ -0,0 +1,18 @@ +system.merges +------------- +Contains information about merges currently in process for tables in the MergeTree family. + +Columns: +:: + database String - Name of the database the table is located in. + table String - Name of the table. + elapsed Float64 - Time in seconds since the merge started. + progress Float64 - Percent of progress made, from 0 to 1. + num_parts UInt64 - Number of parts to merge. + result_part_name String - Name of the part that will be formed as the result of the merge. + total_size_bytes_compressed UInt64 - Total size of compressed data in the parts being merged. + total_size_marks UInt64 - Total number of marks in the parts being merged. + bytes_read_uncompressed UInt64 - Amount of bytes read, decompressed. + rows_read UInt64 - Number of rows read. + bytes_written_uncompressed UInt64 - Amount of bytes written, uncompressed. + rows_written UInt64 - Number of rows written. diff --git a/docs/en/system_tables/system.metrics.rst b/docs/en/system_tables/system.metrics.rst new file mode 100644 index 00000000000..dee53b399e6 --- /dev/null +++ b/docs/en/system_tables/system.metrics.rst @@ -0,0 +1,2 @@ +system.metrics +-------------- diff --git a/docs/en/system_tables/system.numbers.rst b/docs/en/system_tables/system.numbers.rst new file mode 100644 index 00000000000..baaef83daab --- /dev/null +++ b/docs/en/system_tables/system.numbers.rst @@ -0,0 +1,6 @@ +system.numbers +-------------- + +This table contains a single UInt64 column named 'number' that contains almost all the natural numbers starting from zero. +You can use this table for tests, or if you need to do a brute force search. +Reads from this table are not parallelized. 
diff --git a/docs/en/system_tables/system.numbers_mt.rst b/docs/en/system_tables/system.numbers_mt.rst new file mode 100644 index 00000000000..b5569748b2a --- /dev/null +++ b/docs/en/system_tables/system.numbers_mt.rst @@ -0,0 +1,5 @@ +system.numbers_mt +----------------- + +The same as 'system.numbers' but reads are parallelized. The numbers can be returned in any order. +Used for tests. diff --git a/docs/en/system_tables/system.one.rst b/docs/en/system_tables/system.one.rst new file mode 100644 index 00000000000..8a4c1821fb8 --- /dev/null +++ b/docs/en/system_tables/system.one.rst @@ -0,0 +1,6 @@ +system.one +---------- + +This table contains a single row with a single 'dummy' UInt8 column containing the value 0. +This table is used if a SELECT query doesn't specify the FROM clause. +This is similar to the DUAL table found in other DBMSs. diff --git a/docs/en/system_tables/system.parts.rst b/docs/en/system_tables/system.parts.rst new file mode 100644 index 00000000000..9f7d34694a8 --- /dev/null +++ b/docs/en/system_tables/system.parts.rst @@ -0,0 +1,18 @@ +system.parts +------------ +Contains information about parts of a table in the MergeTree family. + +Columns: +:: + database String - Name of the database where the table that this part belongs to is located. + table String - Name of the table that this part belongs to. + engine String - Name of the table engine, without parameters. + partition String - Name of the partition, in the format YYYYMM. + name String - Name of the part. + replicated UInt8 - Whether the part belongs to replicated data. + active UInt8 - Whether the part is used in a table, or is no longer needed and will be deleted soon. Inactive parts remain after merging. + marks UInt64 - Number of marks - multiply by the index granularity (usually 8192) to get the approximate number of rows in the part. + bytes UInt64 - Number of bytes when compressed. + modification_time DateTime - Time the directory with the part was modified. 
Usually corresponds to the part's creation time. + remove_time DateTime - For inactive parts only - the time when the part became inactive. + refcount UInt32 - The number of places where the part is used. A value greater than 2 indicates that this part participates in queries or merges. diff --git a/docs/en/system_tables/system.processes.rst b/docs/en/system_tables/system.processes.rst new file mode 100644 index 00000000000..a082631b589 --- /dev/null +++ b/docs/en/system_tables/system.processes.rst @@ -0,0 +1,23 @@ +system.processes +---------------- + +This system table is used for implementing the ``SHOW PROCESSLIST`` query. +Columns: +:: + user String - Name of the user who made the request. For distributed query processing, this is the user who helped the requestor server send the query to this server, not the user who made the distributed request on the requestor server. + + address String - The IP address the request was made from. The same for distributed processing. + + elapsed Float64 - The time in seconds since request execution started. + + rows_read UInt64 - The number of rows read from the table. For distributed processing, on the requestor server, this is the total for all remote servers. + + bytes_read UInt64 - The number of uncompressed bytes read from the table. For distributed processing, on the requestor server, this is the total for all remote servers. + + total_rows_approx UInt64 - The approximation of the total number of rows that should be read. For distributed processing, on the requestor server, this is the total for all remote servers. It can be updated during request processing, when new sources to process become known. + + memory_usage UInt64 - How much memory the request uses. It might not include some types of dedicated memory. + + query String - The query text. For INSERT, it doesn't include the data to insert. + + query_id String - Query ID, if defined. 
diff --git a/docs/en/system_tables/system.replicas.rst b/docs/en/system_tables/system.replicas.rst new file mode 100644 index 00000000000..851a9990b38 --- /dev/null +++ b/docs/en/system_tables/system.replicas.rst @@ -0,0 +1,114 @@ +system.replicas +--------------- + +Contains information and status for replicated tables residing on the local server. This table can be used for monitoring. The table contains a row for every Replicated* table. + +Example: + +.. code-block:: sql + + SELECT * + FROM system.replicas + WHERE table = 'visits' + FORMAT Vertical + + Row 1: + ────── + database: merge + table: visits + engine: ReplicatedCollapsingMergeTree + is_leader: 1 + is_readonly: 0 + is_session_expired: 0 + future_parts: 1 + parts_to_check: 0 + zookeeper_path: /clickhouse/tables/01-06/visits + replica_name: example01-06-1.yandex.ru + replica_path: /clickhouse/tables/01-06/visits/replicas/example01-06-1.yandex.ru + columns_version: 9 + queue_size: 1 + inserts_in_queue: 0 + merges_in_queue: 1 + log_max_index: 596273 + log_pointer: 596274 + total_replicas: 2 + active_replicas: 2 + +Columns: +:: + database: Database name. + table: Table name. + engine: Table engine name. + + is_leader: Whether the replica is the leader. + Only one replica can be the leader at a time. The leader is responsible for selecting background merges to perform. + Note that writes can be performed to any replica that is available and has a session in ZK, regardless of whether it is a leader. + + is_readonly: Whether the replica is in read-only mode. + This mode is turned on if the config doesn't have sections with ZK, if an unknown error occurred when reinitializing sessions in ZK, and during session reinitialization in ZK. + + is_session_expired: Whether the session with ZK has expired. + Basically the same as 'is_readonly'. + + future_parts: The number of data parts that will appear as the result of INSERTs or merges that haven't been done yet.
+ + parts_to_check: The number of data parts in the queue for verification. + A part is put in the verification queue if there is suspicion that it might be damaged. + + zookeeper_path: Path to table data in ZK. + replica_name: Replica name in ZK. Different replicas of the same table have different names. + replica_path: Path to replica data in ZK. The same as concatenating 'zookeeper_path/replicas/replica_name'. + + columns_version: Version number of the table structure. Indicates how many times ALTER was performed. If replicas have different versions, it means some replicas haven't made all of the ALTERs yet. + + queue_size: Size of the queue for operations waiting to be performed. Operations include inserting blocks of data, merges, and certain other actions. It usually coincides with 'future_parts'. + + inserts_in_queue: Number of inserts of blocks of data that need to be made. Insertions are usually replicated fairly quickly. If this number is large, it means something is wrong. + + merges_in_queue: The number of merges waiting to be made. Sometimes merges are lengthy, so this value may be greater than one for a long time. + + The next 4 columns have a non-zero value only where there is an active session with ZK. + + log_max_index: Maximum entry number in the log of general activity. + log_pointer: Maximum entry number from the log of general activity that the replica copied to its queue for execution, plus one. + If log_pointer is much smaller than log_max_index, something is wrong. + + total_replicas: The total number of known replicas of this table. + active_replicas: The number of replicas of this table that have a session in ZK (i.e., the number of functioning replicas). + +If you request all the columns, the table may work a bit slowly, since several reads from ZK are made for each row. +If you don't request the last 4 columns (log_max_index, log_pointer, total_replicas, active_replicas), the table works quickly.
+ +For example, you can check that everything is working correctly like this: + +.. code-block:: sql + + SELECT + database, + table, + is_leader, + is_readonly, + is_session_expired, + future_parts, + parts_to_check, + columns_version, + queue_size, + inserts_in_queue, + merges_in_queue, + log_max_index, + log_pointer, + total_replicas, + active_replicas + FROM system.replicas + WHERE + is_readonly + OR is_session_expired + OR future_parts > 20 + OR parts_to_check > 10 + OR queue_size > 20 + OR inserts_in_queue > 10 + OR log_max_index - log_pointer > 10 + OR total_replicas < 2 + OR active_replicas < total_replicas + +If this query doesn't return anything, it means that everything is fine. diff --git a/docs/en/system_tables/system.settings.rst b/docs/en/system_tables/system.settings.rst new file mode 100644 index 00000000000..f47845e6d31 --- /dev/null +++ b/docs/en/system_tables/system.settings.rst @@ -0,0 +1,25 @@ +system.settings +--------------- + +Contains information about settings that are currently in use (i.e. used for executing the query you are using to read from the system.settings table). + +Columns: +:: + name String - Setting name. + value String - Setting value. + changed UInt8 - Whether the setting was explicitly defined in the config or explicitly changed. + +Example: + +.. 
code-block:: sql + + SELECT * + FROM system.settings + WHERE changed + + ┌─name───────────────────┬─value───────┬─changed─┐ + │ max_threads │ 8 │ 1 │ + │ use_uncompressed_cache │ 0 │ 1 │ + │ load_balancing │ random │ 1 │ + │ max_memory_usage │ 10000000000 │ 1 │ + └────────────────────────┴─────────────┴─────────┘ diff --git a/docs/en/system_tables/system.tables.rst b/docs/en/system_tables/system.tables.rst new file mode 100644 index 00000000000..0e7536f9ec9 --- /dev/null +++ b/docs/en/system_tables/system.tables.rst @@ -0,0 +1,7 @@ +system.tables +------------- + +This table contains the String columns 'database', 'name', and 'engine' and DateTime column metadata_modification_time. +Each table that the server knows about is entered in the 'system.tables' table. +There is an issue: table engines are specified without parameters. +This system table is used for implementing SHOW TABLES queries. diff --git a/docs/en/system_tables/system.zookeeper.rst b/docs/en/system_tables/system.zookeeper.rst new file mode 100644 index 00000000000..6cb7f41b40d --- /dev/null +++ b/docs/en/system_tables/system.zookeeper.rst @@ -0,0 +1,69 @@ +system.zookeeper +---------------- + +Allows reading data from the ZooKeeper cluster defined in the config. +The query must have a 'path' equality condition in the WHERE clause. This is the path in ZooKeeper for the children that you want to get data for. + +Query SELECT * FROM system.zookeeper WHERE path = '/clickhouse' outputs data for all children on the /clickhouse node. +To output data for all root nodes, write path = '/'. +If the path specified in 'path' doesn't exist, an exception will be thrown. + +Columns: +:: + name String - Name of the node. + path String - Path to the node. + value String - Value of the node. + dataLength Int32 - Size of the value. + numChildren Int32 - Number of children. + czxid Int64 - ID of the transaction that created the node. + mzxid Int64 - ID of the transaction that last changed the node. 
+ pzxid Int64 - ID of the transaction that last added or removed children. + ctime DateTime - Time of node creation. + mtime DateTime - Time of the last node modification. + version Int32 - Node version - the number of times the node was changed. + cversion Int32 - Number of added or removed children. + aversion Int32 - Number of changes to ACL. + ephemeralOwner Int64 - For ephemeral nodes, the ID of the session that owns this node. + +Example: + +.. code-block:: sql + + SELECT * + FROM system.zookeeper + WHERE path = '/clickhouse/tables/01-08/visits/replicas' + FORMAT Vertical + + Row 1: + ────── + name: example01-08-1.yandex.ru + value: + czxid: 932998691229 + mzxid: 932998691229 + ctime: 2015-03-27 16:49:51 + mtime: 2015-03-27 16:49:51 + version: 0 + cversion: 47 + aversion: 0 + ephemeralOwner: 0 + dataLength: 0 + numChildren: 7 + pzxid: 987021031383 + path: /clickhouse/tables/01-08/visits/replicas + + Row 2: + ────── + name: example01-08-2.yandex.ru + value: + czxid: 933002738135 + mzxid: 933002738135 + ctime: 2015-03-27 16:57:01 + mtime: 2015-03-27 16:57:01 + version: 0 + cversion: 37 + aversion: 0 + ephemeralOwner: 0 + dataLength: 0 + numChildren: 7 + pzxid: 987021252247 + path: /clickhouse/tables/01-08/visits/replicas diff --git a/docs/en/table_engines/aggregatingmergetree.rst b/docs/en/table_engines/aggregatingmergetree.rst new file mode 100644 index 00000000000..ec4e6d8bd8a --- /dev/null +++ b/docs/en/table_engines/aggregatingmergetree.rst @@ -0,0 +1,82 @@ +AggregatingMergeTree +-------------------- + +This engine differs from ``MergeTree`` in that the merge combines the states of aggregate functions stored in the table for rows with the same primary key value. + +In order for this to work, it uses the AggregateFunction data type and the -State and -Merge modifiers for aggregate functions. Let's examine it more closely. + +There is an AggregateFunction data type, which is a parametric data type. 
As parameters, the name of the aggregate function is passed, then the types of its arguments. +Examples: + +.. code-block:: sql + + CREATE TABLE t + ( + column1 AggregateFunction(uniq, UInt64), + column2 AggregateFunction(anyIf, String, UInt8), + column3 AggregateFunction(quantiles(0.5, 0.9), UInt64) + ) ENGINE = ... + +This type of column stores the state of an aggregate function. + +To get this type of value, use aggregate functions with the 'State' suffix. +Example: uniqState(UserID), quantilesState(0.5, 0.9)(SendTiming) - in contrast to the corresponding 'uniq' and 'quantiles' functions, these functions return the state, rather than the prepared value. In other words, they return an AggregateFunction type value. + +An AggregateFunction type value can't be output in Pretty formats. In other formats, these types of values are output as implementation-specific binary data. The AggregateFunction type values are not intended for output or saving in a dump. + +The only useful thing you can do with AggregateFunction type values is combine the states and get a result, which essentially means to finish aggregation. Aggregate functions with the 'Merge' suffix are used for this purpose. +Example: uniqMerge(UserIDState), where UserIDState has the AggregateFunction type. + +In other words, an aggregate function with the 'Merge' suffix takes a set of states, combines them, and returns the result. +As an example, these two queries return the same result: + +.. code-block:: sql + + SELECT uniq(UserID) FROM table + + SELECT uniqMerge(state) FROM (SELECT uniqState(UserID) AS state FROM table GROUP BY RegionID) + +There is an AggregatingMergeTree engine. Its job during a merge is to combine the states of aggregate functions from different table rows with the same primary key value. + +You can't use a normal INSERT to insert a row in a table containing AggregateFunction columns, because you can't explicitly define the AggregateFunction value. 
Instead, use INSERT SELECT with '-State' aggregate functions for inserting data. + +With SELECT from an AggregatingMergeTree table, use GROUP BY and aggregate functions with the '-Merge' modifier in order to complete data aggregation. + +You can use AggregatingMergeTree tables for incremental data aggregation, including for aggregated materialized views. + +Example: +Creating a materialized AggregatingMergeTree view that tracks the 'test.visits' table: + +.. code-block:: sql + + CREATE MATERIALIZED VIEW test.basic + ENGINE = AggregatingMergeTree(StartDate, (CounterID, StartDate), 8192) + AS SELECT + CounterID, + StartDate, + sumState(Sign) AS Visits, + uniqState(UserID) AS Users + FROM test.visits + GROUP BY CounterID, StartDate; + +Inserting data in the 'test.visits' table. Data will also be inserted in the view, where it will be aggregated: + +.. code-block:: sql + + INSERT INTO test.visits ... + +Performing SELECT from the view using GROUP BY to finish data aggregation: + +.. code-block:: sql + + SELECT + StartDate, + sumMerge(Visits) AS Visits, + uniqMerge(Users) AS Users + FROM test.basic + GROUP BY StartDate + ORDER BY StartDate; + +You can create a materialized view like this and assign a normal view to it that finishes data aggregation. + +Note that in most cases, using AggregatingMergeTree is not justified, since queries can be run efficiently enough on non-aggregated data. diff --git a/docs/en/table_engines/buffer.rst b/docs/en/table_engines/buffer.rst new file mode 100644 index 00000000000..d8dcc944ff7 --- /dev/null +++ b/docs/en/table_engines/buffer.rst @@ -0,0 +1,56 @@ +Buffer +------ + +Buffers the data to write in RAM, periodically flushing it to another table. During the read operation, data is read from the buffer and the other table simultaneously. +:: + Buffer(database, table, num_layers, min_time, max_time, min_rows, max_rows, min_bytes, max_bytes) + +Engine parameters: +database, table - The table to flush data to. 
Instead of the database name, you can use a constant expression that returns a string. +num_layers - The level of parallelism. Physically, the table will be represented as 'num_layers' of independent buffers. The recommended value is 16. +min_time, max_time, min_rows, max_rows, min_bytes, and max_bytes are conditions for flushing data from the buffer. + +Data is flushed from the buffer and written to the destination table if all the 'min' conditions or at least one 'max' condition are met. +min_time, max_time - Condition for the time in seconds from the moment of the first write to the buffer. +min_rows, max_rows - Condition for the number of rows in the buffer. +min_bytes, max_bytes - Condition for the number of bytes in the buffer. + +During the write operation, data is inserted into one of the 'num_layers' buffers, chosen at random. Or, if the data part to insert is large enough (greater than 'max_rows' or 'max_bytes'), it is written directly to the destination table, omitting the buffer. + +The conditions for flushing the data are calculated separately for each of the 'num_layers' buffers. For example, if num_layers = 16 and max_bytes = 100000000, the maximum RAM consumption is 1.6 GB. + +Example: +:: + CREATE TABLE merge.hits_buffer AS merge.hits ENGINE = Buffer(merge, hits, 16, 10, 100, 10000, 1000000, 10000000, 100000000) + +Creating a 'merge.hits_buffer' table with the same structure as 'merge.hits' and using the Buffer engine. When writing to this table, data is buffered in RAM and later written to the 'merge.hits' table. 16 buffers are created. The data in each of them is flushed if either 100 seconds have passed, or one million rows have been written, or 100 MB of data have been written; or if simultaneously 10 seconds have passed and 10,000 rows and 10 MB of data have been written. For example, if just one row has been written, after 100 seconds it will be flushed, no matter what. But if many rows have been written, the data will be flushed sooner.
+ +When the server is stopped, with DROP TABLE or DETACH TABLE, buffer data is also flushed to the destination table. + +You can set empty strings in single quotation marks for the database and table name. This indicates the absence of a destination table. In this case, when the data flush conditions are reached, the buffer is simply cleared. This may be useful for keeping a window of data in memory. + +When reading from a Buffer table, data is processed both from the buffer and from the destination table (if there is one). +Note that Buffer tables do not support an index. In other words, data in the buffer is fully scanned, which might be slow for large buffers. (For data in a subordinate table, the index it supports will be used.) + +If the set of columns in the Buffer table doesn't match the set of columns in a subordinate table, a subset of columns that exist in both tables is inserted. + +If the types don't match for one of the columns in the Buffer table and a subordinate table, an error message is entered in the server log and the buffer is cleared. +The same thing happens if the subordinate table doesn't exist when the buffer is flushed. + +If you need to run ALTER for a subordinate table and the Buffer table, we recommend first deleting the Buffer table, running ALTER for the subordinate table, then creating the Buffer table again. + +If the server is restarted abnormally, the data in the buffer is lost. + +PREWHERE, FINAL and SAMPLE do not work correctly for Buffer tables. These conditions are passed to the destination table, but are not used for processing data in the buffer. Because of this, we recommend only using the Buffer table for writing, while reading from the destination table. + +When adding data to a Buffer, one of the buffers is locked. This causes delays if a read operation is simultaneously being performed from the table.
+ +Data that is inserted to a Buffer table may end up in the subordinate table in a different order and in different blocks. Because of this, a Buffer table is difficult to use for writing to a CollapsingMergeTree correctly. To avoid problems, you can set 'num_layers' to 1. + +If the destination table is replicated, some expected characteristics of replicated tables are lost when writing to a Buffer table. The random changes to the order of rows and sizes of data parts cause data deduplication to quit working, which means it is not possible to have a reliable 'exactly once' write to replicated tables. + +Due to these disadvantages, we can only recommend using a Buffer table in rare cases. + +A Buffer table is used when too many INSERTs are received from a large number of servers over a unit of time and data can't be buffered before insertion, which means the INSERTs can't run fast enough. + +Note that it doesn't make sense to insert data one row at a time, even for Buffer tables. This will only produce a speed of a few thousand rows per second, while inserting larger blocks of data can produce over a million rows per second (see the section "Performance"). diff --git a/docs/en/table_engines/collapsingmergetree.rst b/docs/en/table_engines/collapsingmergetree.rst new file mode 100644 index 00000000000..e2bacf2512e --- /dev/null +++ b/docs/en/table_engines/collapsingmergetree.rst @@ -0,0 +1,33 @@ +CollapsingMergeTree +------------------- + +This engine differs from MergeTree in that it allows automatic deletion, or "collapsing" certain pairs of rows when merging. + +Yandex.Metrica has normal logs (such as hit logs) and change logs. Change logs are used for incrementally calculating statistics on data that is constantly changing. Examples are the log of session changes, or logs of changes to user histories. Sessions are constantly changing in Yandex.Metrica. For example, the number of hits per session increases. 
We refer to changes in any object as a pair (?old values, ?new values). Old values may be missing if the object was created. New values may be missing if the object was deleted. If the object was changed, but existed previously and was not deleted, both values are present. In the change log, one or two entries are made for each change. Each entry contains all the attributes that the object has, plus a special attribute for differentiating between the old and new values. When objects change, only the new entries are added to the change log, and the existing ones are not touched. + +The change log makes it possible to incrementally calculate almost any statistics. To do this, we need to consider "new" rows with a plus sign, and "old" rows with a minus sign. In other words, incremental calculation is possible for all statistics whose algebraic structure contains an operation for taking the inverse of an element. This is true of most statistics. We can also calculate "idempotent" statistics, such as the number of unique visitors, since the unique visitors are not deleted when making changes to sessions. + +This is the main concept that allows Yandex.Metrica to work in real time. + +CollapsingMergeTree accepts an additional parameter - the name of an Int8-type column that contains the row's "sign". Example: + +.. code-block:: sql + + CollapsingMergeTree(EventDate, (CounterID, EventDate, intHash32(UniqID), VisitID), 8192, Sign) + +Here, 'Sign' is a column containing -1 for "old" values and 1 for "new" values. + +When merging, each group of consecutive identical primary key values (columns for sorting data) is reduced to no more than one row with the column value 'sign_column = -1' (the "negative row") and no more than one row with the column value 'sign_column = 1' (the "positive row"). In other words, entries from the change log are collapsed. + +If the number of positive and negative rows matches, the first negative row and the last positive row are written. 
+If there is one more positive row than negative rows, only the last positive row is written. +If there is one more negative row than positive rows, only the first negative row is written. +Otherwise, there will be a logical error and none of the rows will be written. (A logical error can occur if the same section of the log was accidentally inserted more than once. The error is just recorded in the server log, and the merge continues.) + +Thus, collapsing should not change the results of calculating statistics. +Changes are gradually collapsed so that in the end only the last value of almost every object is left. +Compared to MergeTree, the CollapsingMergeTree engine allows a multifold reduction of data volume. + +There are several ways to get completely "collapsed" data from a CollapsingMergeTree table: + #. Write a query with GROUP BY and aggregate functions that accounts for the sign. For example, to calculate quantity, write 'sum(Sign)' instead of 'count()'. To calculate the sum of something, write 'sum(Sign * x)' instead of 'sum(x)', and so on, and also add 'HAVING sum(Sign) > 0'. Not all amounts can be calculated this way. For example, the aggregate functions 'min' and 'max' can't be rewritten. + #. If you must extract data without aggregation (for example, to check whether rows are present whose newest values match certain conditions), you can use the FINAL modifier for the FROM clause. This approach is significantly less efficient. diff --git a/docs/en/table_engines/distributed.rst b/docs/en/table_engines/distributed.rst new file mode 100644 index 00000000000..ae428887a99 --- /dev/null +++ b/docs/en/table_engines/distributed.rst @@ -0,0 +1,111 @@ +Distributed +----------- + +**The Distributed engine does not store data itself**, but allows distributed query processing on multiple servers. +Reading is automatically parallelized. During a read, the table indexes on remote servers are used, if there are any. 
+The Distributed engine accepts parameters: the cluster name in the server's config file, the name of a remote database, the name of a remote table, and (optionally) a sharding key. +Example: +:: + Distributed(logs, default, hits[, sharding_key]) + +- Data will be read from all servers in the 'logs' cluster, from the 'default.hits' table located on every server in the cluster. +Data is not only read, but is partially processed on the remote servers (to the extent that this is possible). +For example, for a query with GROUP BY, data will be aggregated on remote servers, and the intermediate states of aggregate functions will be sent to the requestor server. Then data will be further aggregated. + +Instead of the database name, you can use a constant expression that returns a string. For example, ``currentDatabase()``. + +logs - The cluster name in the server's config file. + +Clusters are set like this: + +.. code-block:: xml + + <remote_servers> + <logs> + <shard> + <!-- Optional. Shard weight when writing data. By default, 1. --> + <weight>1</weight> + <!-- Optional. Whether to write data to just one of the replicas. By default, false - write data to all of the replicas. --> + <internal_replication>false</internal_replication> + <replica> + <host>example01-01-1</host> + <port>9000</port> + </replica> + <replica> + <host>example01-01-2</host> + <port>9000</port> + </replica> + </shard> + <shard> + <weight>2</weight> + <internal_replication>false</internal_replication> + <replica> + <host>example01-02-1</host> + <port>9000</port> + </replica> + <replica> + <host>example01-02-2</host> + <port>9000</port> + </replica> + </shard> + </logs> + </remote_servers> + +Here a cluster is defined with the name 'logs' that consists of two shards, each of which contains two replicas. Shards refer to the servers that contain different parts of the data (in order to read all the data, you must access all the shards). +Replicas are duplicating servers (in order to read all the data, you can access the data on any one of the replicas). + +For each server, there are several parameters: mandatory: ``'host'``, ``'port'``, and optional: ``'user'``, ``'password'``. + * ``host`` - address of remote server. May be specified as domain name or IPv4 or IPv6 address. If you specify domain, server will perform DNS lookup at startup, and result will be cached till server shutdown. If DNS request is failed, server won't start. If you are changing DNS records, restart the server for new records to take effect.
+ * ``port`` - TCP-port for interserver communication (tcp_port in configuration file, usually 9000). Don't get confused with http_port. + * ``user`` - user name to connect to remote server. By default user is 'default'. This user must have access rights to connect to remote server. Access rights are managed in users.xml configuration file. For additional info, consider "Access rights" section. + * ``password`` - password to log in to remote server, in plaintext. Default is empty string. + +When specifying replicas, one of the available replicas will be selected for each of the shards when reading. You can configure the algorithm for load balancing (the preference for which replica to access) - see the 'load_balancing' setting. +If the connection with the server is not established, there will be an attempt to connect with a short timeout. If the connection failed, the next replica will be selected, and so on for all the replicas. If the connection attempt failed for all the replicas, the attempt will be repeated the same way, several times. +This works in favor of resiliency, but does not provide complete fault tolerance: a remote server might accept the connection, but might not work, or work poorly. + +You can specify just one of the shards (in this case, query processing should be called remote, rather than distributed) or up to any number of shards. In each shard, you can specify from one to any number of replicas. You can specify a different number of replicas for each shard. + +You can specify as many clusters as you wish in the configuration. + +To view your clusters, use the 'system.clusters' table. + +The Distributed engine allows working with a cluster like a local server. However, the cluster is inextensible: you must write its configuration in the server config file (even better, for all the cluster's servers). 
+ +There is no support for Distributed tables that look at other Distributed tables (except in cases when a Distributed table only has one shard). As an alternative, make the Distributed table look at the "final" tables. + +The Distributed engine requires writing clusters to the config file. Clusters from config are updated on the fly, it does not require server restart. If you need to send a query to an unknown set of shards and replicas each time, you don't need to create a Distributed table - use the 'remote' table function instead. See the section "Table functions". + +There are two methods for writing data to a cluster: + +First, you can define which servers to write which data to, and perform the write directly on each shard. In other words, perform INSERT in the tables that the distributed table "looks at". +This is the most flexible solution - you can use any sharding scheme, which could be non-trivial due to the requirements of the subject area. +This is also the most optimal solution, since data can be written to different shards completely independently. + +Second, you can perform INSERT in a Distributed table. In this case, the table will distribute the inserted data across servers itself. +In order to write to a Distributed table, it must have a sharding key set (the last parameter). In addition, if there is only one shard, the write operation works without specifying the sharding key, since it doesn't have any meaning in this case. + +Each shard can have a weight defined in the config file. By default, the weight is equal to one. Data is distributed across shards in the amount proportional to the shard weight. For example, if there are two shards and the first has a weight of 9 while the second has a weight of 10, the first will be sent 9 / 19 parts of the rows, and the second will be sent 10 / 19. + +Each shard can have the 'internal_replication' parameter defined in the config file. 
+ +If this parameter is set to 'true', the write operation selects the first healthy replica and writes data to it. Use this alternative if the Distributed table "looks at" replicated tables. In other words, if the table where data will be written is going to replicate them itself. + +If it is set to 'false' (the default), data is written to all replicas. In essence, this means that the Distributed table replicates data itself. This is worse than using replicated tables, because the consistency of replicas is not checked, and over time they will contain slightly different data. + +To select the shard that a row of data is sent to, the sharding expression is evaluated, and its remainder is taken from dividing it by the total weight of the shards. The row is sent to the shard that corresponds to the half-interval of the remainders from 'prev_weights' to 'prev_weights + weight', where 'prev_weights' is the total weight of the shards with smaller numbers, and 'weight' is the weight of this shard. For example, if there are two shards, and the first has a weight of 9 while the second has a weight of 10, the row will be sent to the first shard for the remainders from the range [0, 9), and to the second for the remainders from the range [9, 19). + +The sharding expression can be any expression from constants and table columns that returns an integer. For example, you can use the expression 'rand()' for random distribution of data, or 'UserID' for distribution by the remainder from dividing the user's ID (then the data of a single user will reside on a single shard, which simplifies running IN and JOIN by users). If one of the columns is not distributed evenly enough, you can wrap it in a hash function: intHash64(UserID). + +A simple remainder from division is a limited solution for sharding and isn't always appropriate. It works for medium and large volumes of data (dozens of servers), but not for very large volumes of data (hundreds of servers or more).
In the latter case, use the sharding scheme required by the subject area, rather than using entries in Distributed tables. + +When using Replicated tables, it is possible to reshard data - look at "Resharding" section. But in many cases, better to do without it. SELECT queries are sent to all the shards, and work regardless of how data is distributed across the shards (they can be distributed completely randomly). When you add a new shard, you don't have to transfer the old data to it. You can write new data with a heavier weight - the data will be distributed slightly unevenly, but queries will work correctly and efficiently. + +You should be concerned about the sharding scheme in the following cases: +- Queries are used that require joining data (IN or JOIN) by a specific key. If data is sharded by this key, you can use local IN or JOIN instead of GLOBAL IN or GLOBAL JOIN, which is much more efficient. +- A large number of servers is used (hundreds or more) with a large number of small queries (queries of individual clients - websites, advertisers, or partners). In order for the small queries to not affect the entire cluster, it makes sense to locate data for a single client on a single shard. Alternatively, as we've done in Yandex.Metrica, you can set up bi-level sharding: divide the entire cluster into "layers", where a layer may consist of multiple shards. Data for a single client is located on a single layer, but shards can be added to a layer as necessary, and data is randomly distributed within them. Distributed tables are created for each layer, and a single shared distributed table is created for global queries. + +Data is written asynchronously. For an INSERT to a Distributed table, the data block is just written to the local file system. The data is sent to the remote servers in the background as soon as possible. 
You should check whether data is sent successfully by checking the list of files (data waiting to be sent) in the table directory: +/var/lib/clickhouse/data/database/table/. + +If the server ceased to exist or had a rough restart (for example, after a device failure) after an INSERT to a Distributed table, the inserted data might be lost. If a damaged data part is detected in the table directory, it is transferred to the 'broken' subdirectory and no longer used. diff --git a/docs/en/table_engines/file.rst b/docs/en/table_engines/file.rst new file mode 100644 index 00000000000..817c1ce8912 --- /dev/null +++ b/docs/en/table_engines/file.rst @@ -0,0 +1,4 @@ +File(InputFormat) +----------------- + +The data source is a file that stores data in one of the supported input formats (TabSeparated, Native, etc.) ... diff --git a/docs/en/table_engines/index.rst b/docs/en/table_engines/index.rst new file mode 100644 index 00000000000..c8e972a89f6 --- /dev/null +++ b/docs/en/table_engines/index.rst @@ -0,0 +1,18 @@ +Table engines +============= + +The table engine (type of table) determines: + - How and where data is stored - where to write it to, and where to read it from. + - Which queries are supported, and how. + - Concurrent data access. + - Use of indexes, if present. + - Whether multithreaded request execution is possible. + - Data replication. + - When reading data, the engine is only required to extract the necessary set of columns. However, in some cases, the query may be partially processed inside the table engine. + +Note that for most serious tasks, you should use engines from the MergeTree family. + +.. toctree:: + :glob: + + * diff --git a/docs/en/table_engines/join.rst b/docs/en/table_engines/join.rst new file mode 100644 index 00000000000..342a6f59a01 --- /dev/null +++ b/docs/en/table_engines/join.rst @@ -0,0 +1,14 @@ +Join +---- + +A prepared data structure for JOIN that is always located in RAM.
+:: + Join(ANY|ALL, LEFT|INNER, k1[, k2, ...]) + +Engine parameters: ``ANY``|``ALL`` - strictness, and ``LEFT``|``INNER`` - the type. These parameters are set without quotes and must match the JOIN that the table will be used for. k1, k2, ... are the key columns from the USING clause that the join will be made on. + +The table can't be used for GLOBAL JOINs. + +You can use INSERT to add data to the table, similar to the Set engine. For ANY, data for duplicated keys will be ignored. For ALL, it will be counted. You can't perform SELECT directly from the table. The only way to retrieve data is to use it as the "right-hand" table for JOIN. + +Storing data on the disk is the same as for the Set engine. diff --git a/docs/en/table_engines/log.rst b/docs/en/table_engines/log.rst new file mode 100644 index 00000000000..49f74c29b62 --- /dev/null +++ b/docs/en/table_engines/log.rst @@ -0,0 +1,5 @@ +Log +---- + +Log differs from TinyLog in that a small file of "marks" resides with the column files. These marks are written on every data block and contain offsets - where to start reading the file in order to skip the specified number of rows. This makes it possible to read table data in multiple threads. For concurrent data access, the read operations can be performed simultaneously, while write operations block reads and each other. +The Log engine does not support indexes. Similarly, if writing to a table failed, the table is broken, and reading from it returns an error. The Log engine is appropriate for temporary data, write-once tables, and for testing or demonstration purposes. diff --git a/docs/en/table_engines/materializedview.rst b/docs/en/table_engines/materializedview.rst new file mode 100644 index 00000000000..91950082a80 --- /dev/null +++ b/docs/en/table_engines/materializedview.rst @@ -0,0 +1,4 @@ +MaterializedView +----------------- + +Used for implementing materialized views (for more information, see ``CREATE MATERIALIZED VIEW``). 
For storing data, it uses a different engine that was specified when creating the view. When reading from a table, it just uses this engine. diff --git a/docs/en/table_engines/memory.rst b/docs/en/table_engines/memory.rst new file mode 100644 index 00000000000..07b01933615 --- /dev/null +++ b/docs/en/table_engines/memory.rst @@ -0,0 +1,11 @@ +Memory +------ + +The Memory engine stores data in RAM, in uncompressed form. Data is stored in exactly the same form as it is received when read. In other words, reading from this table is completely free. +Concurrent data access is synchronized. Locks are short: read and write operations don't block each other. +Indexes are not supported. Reading is parallelized. +Maximal productivity (over 10 GB/sec) is reached on simple queries, because there is no reading from the disk, decompressing, or deserializing data. (We should note that in many cases, the productivity of the MergeTree engine is almost as high.) +When restarting a server, data disappears from the table and the table becomes empty. +Normally, using this table engine is not justified. However, it can be used for tests, and for tasks where maximum speed is required on a relatively small number of rows (up to approximately 100,000,000). + +The Memory engine is used by the system for temporary tables with external query data (see the section "External data for processing a query"), and for implementing GLOBAL IN (see the section "IN operators"). diff --git a/docs/en/table_engines/merge.rst b/docs/en/table_engines/merge.rst new file mode 100644 index 00000000000..591bd5e5ce9 --- /dev/null +++ b/docs/en/table_engines/merge.rst @@ -0,0 +1,35 @@ +Merge +----- + +The Merge engine (not to be confused with MergeTree) does not store data itself, but allows reading from any number of other tables simultaneously. +Reading is automatically parallelized. Writing to a table is not supported. When reading, the indexes of tables that are actually being read are used, if they exist. 
+The Merge engine accepts parameters: the database name and a regular expression for tables. Example: +:: + Merge(hits, '^WatchLog') + +- Data will be read from the tables in the 'hits' database with names that match the regex ``'^WatchLog'``. + +Instead of the database name, you can use a constant expression that returns a string. For example, ``currentDatabase()``. + +Regular expressions are re2 (similar to PCRE), case-sensitive. See the notes about escaping symbols in regular expressions in the "match" section. + +When selecting tables to read, the Merge table itself will not be selected, even if it matches the regex. This is to avoid loops. +It is possible to create two Merge tables that will endlessly try to read each others' data. But don't do this. + +The typical way to use the Merge engine is for working with a large number of TinyLog tables as if with a single table. + +Virtual columns +~~~~~~~~~~~~~~~~~~~ + +Virtual columns are columns that are provided by the table engine, regardless of the table definition. In other words, these columns are not specified in CREATE TABLE, but they are accessible for SELECT. + +Virtual columns differ from normal columns in the following ways: + - They are not specified in table definitions. + - Data can't be added to them with ``INSERT``. + - When using ``INSERT`` without specifying the list of columns, virtual columns are ignored. + - They are not selected when using the asterisk (``SELECT *``). + - Virtual columns are not shown in ``SHOW CREATE TABLE`` and ``DESC TABLE`` queries. + +A Merge table contains the virtual column **_table** of the String type. (If the table already has a '_table' column, the virtual column is named '_table1', and if it already has '_table1', it is named '_table2', and so on.) It contains the name of the table that data was read from. 
+ +If the WHERE or PREWHERE clause contains conditions for the '_table' column that do not depend on other table columns (as one of the conjunction elements, or as an entire expression), these conditions are used as an index. The conditions are performed on a data set of table names to read data from, and the read operation will be performed from only those tables that the condition was triggered on. diff --git a/docs/en/table_engines/mergetree.rst b/docs/en/table_engines/mergetree.rst new file mode 100644 index 00000000000..15f37bb4f1a --- /dev/null +++ b/docs/en/table_engines/mergetree.rst @@ -0,0 +1,64 @@ +MergeTree +--------- + +The MergeTree engine supports an index by primary key and by date, and provides the possibility to update data in real time. +This is the most advanced table engine in ClickHouse. Don't confuse it with the Merge engine. + +The engine accepts parameters: the name of a Date type column containing the date, a sampling expression (optional), a tuple that defines the table's primary key, and the index granularity. +Example: + +Example without sampling support: +:: + MergeTree(EventDate, (CounterID, EventDate), 8192) + +Example with sampling support: +:: + MergeTree(EventDate, intHash32(UserID), (CounterID, EventDate, intHash32(UserID)), 8192) + +A MergeTree type table must have a separate column containing the date. In this example, it is the 'EventDate' column. The type of the date column must be 'Date' (not 'DateTime'). + +The primary key may be a tuple from any expressions (usually this is just a tuple of columns), or a single expression. + +The sampling expression (optional) can be any expression. It must also be present in the primary key. The example uses a hash of user IDs to pseudo-randomly disperse data in the table for each CounterID and EventDate. In other words, when using the SAMPLE clause in a query, you get an evenly pseudo-random sample of data for a subset of users. + +The table is implemented as a set of parts. 
Each part is sorted by the primary key. In addition, each part has the minimum and maximum date assigned. When inserting in the table, a new sorted part is created. The merge process is periodically initiated in the background. When merging, several parts are selected, usually the smallest ones, and then merged into one large sorted part. + +In other words, incremental sorting occurs when inserting to the table. Merging is implemented so that the table always consists of a small number of sorted parts, and the merge itself doesn't do too much work. + +During insertion, data belonging to different months is separated into different parts. The parts that correspond to different months are never combined. The purpose of this is to provide local data modification (for ease in backups). + +Parts are combined up to a certain size threshold, so there aren't any merges that are too long. + +For each part, an index file is also written. The index file contains the primary key value for every 'index_granularity' row in the table. In other words, this is an abbreviated index of sorted data. + +For columns, "marks" are also written to each 'index_granularity' row so that data can be read in a specific range. + +When reading from a table, the SELECT query is analyzed for whether indexes can be used. An index can be used if the WHERE or PREWHERE clause has an expression (as one of the conjunction elements, or entirely) that represents an equality or inequality comparison operation, or if it has IN above columns that are in the primary key or date, or Boolean operators over them. + +Thus, it is possible to quickly run queries on one or many ranges of the primary key. In the example given, queries will work quickly for a specific counter, for a specific counter and range of dates, for a specific counter and date, for multiple counters and a range of dates, and so on. + +.. 
code-block:: sql + + SELECT count() FROM table WHERE EventDate = toDate(now()) AND CounterID = 34 + SELECT count() FROM table WHERE EventDate = toDate(now()) AND (CounterID = 34 OR CounterID = 42) + SELECT count() FROM table WHERE ((EventDate >= toDate('2014-01-01') AND EventDate <= toDate('2014-01-31')) OR EventDate = toDate('2014-05-01')) AND CounterID IN (101500, 731962, 160656) AND (CounterID = 101500 OR EventDate != toDate('2014-05-01')) + +All of these cases will use the index by date and by primary key. The index is used even for complex expressions. Reading from the table is organized so that using the index can't be slower than a full scan. + +In this example, the index can't be used: + +.. code-block:: sql + + SELECT count() FROM table WHERE CounterID = 34 OR URL LIKE '%upyachka%' + +The index by date only allows reading those parts that contain dates from the desired range. However, a data part may contain data for many dates (up to an entire month), while within a single part the data is ordered by the primary key, which might not contain the date as the first column. Because of this, using a query with only a date condition that does not specify the primary key prefix will cause more data to be read than for a single date. + +For concurrent table access, we use multi-versioning. In other words, when a table is simultaneously read and updated, data is read from a set of parts that is current at the time of the query. There are no lengthy locks. Inserts do not get in the way of read operations. + +Reading from a table is automatically parallelized. + +The OPTIMIZE query is supported, which calls an extra merge step. + +You can use a single large table and continually add data to it in small chunks - this is what MergeTree is intended for. + +Data replication is possible for all types of tables in the MergeTree family (see the section "Data replication"). 
diff --git a/docs/en/table_engines/null.rst b/docs/en/table_engines/null.rst new file mode 100644 index 00000000000..ee3260f8bf2 --- /dev/null +++ b/docs/en/table_engines/null.rst @@ -0,0 +1,6 @@ +Null +---- + +When writing to a Null table, data is ignored. When reading from a Null table, the response is empty. + +However, you can create a materialized view on a Null table, so the data written to the table will end up in the view. diff --git a/docs/en/table_engines/replacingmergetree.rst b/docs/en/table_engines/replacingmergetree.rst new file mode 100644 index 00000000000..3635cb99d58 --- /dev/null +++ b/docs/en/table_engines/replacingmergetree.rst @@ -0,0 +1,19 @@ +ReplacingMergeTree +------------------ + +This engine differs from ``MergeTree`` in that it can deduplicate data by primary key while merging. + +For ReplacingMergeTree, the last parameter is the optional name of a 'version' column. While merging, for all rows with the same primary key, only one row is kept: the last row if the version column was not specified, or the last row with the maximum version value if it was specified. + +The version column must be of a UInt family type, ``Date``, or ``DateTime``. + +.. code-block:: sql + + ReplacingMergeTree(EventDate, (OrderID, EventDate, BannerID, ...), 8192, ver) + +Please note that data is deduplicated only during merges. Merges are processed in the background. The exact time of a merge is unspecified and you cannot rely on it. Some part of the data might not be merged at all. While you can trigger an extra merge with the OPTIMIZE query, this is not recommended, as OPTIMIZE will read and write a vast amount of data. + +This table engine is suitable for background removal of duplicate data to save space, but it does not guarantee the absence of duplicates.
+ +*Developed for the special needs of a department other than Yandex.Metrica.* + diff --git a/docs/en/table_engines/replication.rst b/docs/en/table_engines/replication.rst new file mode 100644 index 00000000000..a2dfb20dab6 --- /dev/null +++ b/docs/en/table_engines/replication.rst @@ -0,0 +1,175 @@ +Data replication +----------------- + +ReplicatedMergeTree +~~~~~~~~~~~~~~~~~~~ + +ReplicatedCollapsingMergeTree +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +ReplicatedAggregatingMergeTree +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +ReplicatedSummingMergeTree +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Replication is only supported for tables in the MergeTree family. Replication works at the level of an individual table, not the entire server. A server can store both replicated and non-replicated tables at the same time. + +INSERT and ALTER are replicated (for more information, see ALTER). Compressed data is replicated, not query texts. +The CREATE, DROP, ATTACH, DETACH, and RENAME queries are not replicated. In other words, they belong to a single server. The CREATE TABLE query creates a new replicatable table on the server where the query is run. If this table already exists on other servers, it adds a new replica. The DROP TABLE query deletes the replica located on the server where the query is run. The RENAME query renames the table on one of the replicas. In other words, replicated tables can have different names on different replicas. + +Replication is not related to sharding in any way. Replication works independently on each shard. + +Replication is an optional feature. To use replication, set the addresses of the ZooKeeper cluster in the config file. Example: + +.. code-block:: xml + + <zookeeper> + <node index="1"> + <host>example1</host> + <port>2181</port> + </node> + <node index="2"> + <host>example2</host> + <port>2181</port> + </node> + <node index="3"> + <host>example3</host> + <port>2181</port> + </node> + </zookeeper> + +**Use ZooKeeper version 3.4.5 or later.** For example, the version in the Ubuntu Precise package is too old.
+ +You can specify any existing ZooKeeper cluster - the system will use a directory on it for its own data (the directory is specified when creating a replicatable table). + +If ZooKeeper isn't set in the config file, you can't create replicated tables, and any existing replicated tables will be read-only. + +ZooKeeper isn't used for SELECT queries. In other words, replication doesn't affect the performance of SELECT queries - they work just as fast as for non-replicated tables. + +For each INSERT query (more precisely, for each inserted block of data; an INSERT query contains a single block, or one block per max_insert_block_size = 1048576 rows), approximately ten entries are made in ZooKeeper in several transactions. This leads to slightly longer latencies for INSERT compared to non-replicated tables. But if you follow the recommendations to insert data in batches of no more than one INSERT per second, it doesn't create any problems. An entire ClickHouse cluster that uses one ZooKeeper cluster for coordination handles a total of several hundred INSERTs per second. The throughput on data inserts (the number of rows per second) is just as high as for non-replicated data. + +For very large clusters, you can use different ZooKeeper clusters for different shards. However, this hasn't proven necessary on the Yandex.Metrica cluster (approximately 300 servers). + +Replication is asynchronous and multi-master. INSERT queries (as well as ALTER) can be sent to any available server. Data is inserted on this server, then sent to the other servers. Because it is asynchronous, recently inserted data appears on the other replicas with some latency. If a part of the replicas is not available, the data on them is written when they become available. If a replica is available, the latency is the amount of time it takes to transfer the block of compressed data over the network. + +There are no quorum writes.
You can't write data with confirmation that it was received by more than one replica. If you write a batch of data to one replica and the server with this data ceases to exist before the data has time to get to the other replicas, this data will be lost. + +Each block of data is written atomically. The INSERT query is divided into blocks up to max_insert_block_size = 1048576 rows. In other words, if the INSERT query has fewer than 1048576 rows, it is made atomically. + +Blocks of data are deduplicated. For multiple writes of the same data block (data blocks of the same size containing the same rows in the same order), the block is only written once. This is done so that, after a network failure when the client application doesn't know whether the data was written to the DB, the INSERT query can simply be repeated. It doesn't matter which replica INSERTs were sent to with identical data - INSERTs are idempotent. This only works for the last 100 blocks inserted in a table. + +During replication, only the source data to insert is transferred over the network. Further data transformation (merging) is coordinated and performed on all the replicas in the same way. This minimizes network usage, which means that replication works well when replicas reside in different datacenters. (Note that duplicating data in different datacenters is the main goal of replication.) + +You can have any number of replicas of the same data. Yandex.Metrica uses double replication in production. Each server uses RAID-5 or RAID-6, and RAID-10 in some cases. This is a relatively reliable and convenient solution. + +The system monitors data synchronicity on replicas and is able to recover after a failure. Failover is automatic (for small differences in data) or semi-automatic (when data differs too much, which may indicate a configuration error). + +Creating replicated tables +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``'Replicated'`` prefix is added to the table engine name.
For example, ``ReplicatedMergeTree``. + +Two parameters are also added in the beginning of the parameters list - the path to the table in ZooKeeper, and the replica name in ZooKeeper. + +Example: +:: + ReplicatedMergeTree('/clickhouse/tables/{layer}-{shard}/hits', '{replica}', EventDate, intHash32(UserID), (CounterID, EventDate, intHash32(UserID), EventTime), 8192) + +As the example shows, these parameters can contain substitutions in curly brackets. The substituted values are taken from the 'macros' section of the config file. Example: + +.. code-block:: xml + + <macros> + <layer>05</layer> + <shard>02</shard> + <replica>example05-02-1.yandex.ru</replica> + </macros> + +The path to the table in ZooKeeper should be unique for each replicated table. Tables on different shards should have different paths. +In this case, the path consists of the following parts: + +``/clickhouse/tables/`` - is the common prefix. We recommend using exactly this one. + +``{layer}-{shard}`` - is the shard identifier. In this example it consists of two parts, since the Yandex.Metrica cluster uses bi-level sharding. For most tasks, you can leave just the {shard} substitution, which will be expanded to the shard identifier. + +``hits`` - is the name of the node for the table in ZooKeeper. It is a good idea to make it the same as the table name. It is defined explicitly, because in contrast to the table name, it doesn't change after a RENAME query. + +The replica name identifies different replicas of the same table. You can use the server name for this, as in the example. The name only needs to be unique within each shard. + +You can define everything explicitly instead of using substitutions. This might be convenient for testing and for configuring small clusters, but it is inconvenient when working with large clusters. + +Run CREATE TABLE on each replica. This query creates a new replicated table, or adds a new replica to an existing one.
+ +If you add a new replica after the table already contains some data on other replicas, the data will be copied from the other replicas to the new one after running the query. In other words, the new replica syncs itself with the others. + +To delete a replica, run DROP TABLE. However, only one replica is deleted - the one that resides on the server where you run the query. + +Recovery after failures +~~~~~~~~~~~~~~~~~~~~~~~~~ + +If ZooKeeper is unavailable when a server starts, replicated tables switch to read-only mode. The system periodically attempts to connect to ZooKeeper. + +If ZooKeeper is unavailable during an INSERT, or an error occurs when interacting with ZooKeeper, an exception is thrown. + +After connecting to ZooKeeper, the system checks whether the set of data in the local file system matches the expected set of data (ZooKeeper stores this information). If there are minor inconsistencies, the system resolves them by syncing data with the replicas. + +If the system detects broken data parts (with the wrong size of files) or unrecognized parts (parts written to the file system but not recorded in ZooKeeper), it moves them to the 'detached' subdirectory (they are not deleted). Any missing parts are copied from the replicas. + +Note that ClickHouse does not perform any destructive actions such as automatically deleting a large amount of data. + +When the server starts (or establishes a new session with ZooKeeper), it only checks the quantity and sizes of all files. If the file sizes match but bytes have been changed somewhere in the middle, this is not detected immediately, but only when attempting to read the data for a SELECT query. The query throws an exception about a non-matching checksum or size of a compressed block. In this case, data parts are added to the verification queue and copied from the replicas if necessary. + +If the local set of data differs too much from the expected one, a safety mechanism is triggered. 
The server enters this in the log and refuses to launch. The reason for this is that this case may indicate a configuration error, such as if a replica on a shard was accidentally configured like a replica on a different shard. However, the thresholds for this mechanism are set fairly low, and this situation might occur during normal failure recovery. In this case, data is restored semi-automatically - by "pushing a button". + +To start recovery, create the node ``/path_to_table/replica_name/flags/force_restore_data`` in ZooKeeper with any content or run command to recover all replicated tables: +:: + sudo -u clickhouse touch /var/lib/clickhouse/flags/force_restore_data + +Then launch the server. On start, the server deletes these flags and starts recovery. + +Recovery after complete data loss +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If all data and metadata disappeared from one of the servers, follow these steps for recovery: + +#. Install ClickHouse on the server. Define substitutions correctly in the config file that contains the shard identifier and replicas, if you use them. +#. If you had unreplicated tables that must be manually duplicated on the servers, copy their data from a replica (in the directory /var/lib/clickhouse/data/db_name/table_name/). +#. Copy table definitions located in /var/lib/clickhouse/metadata/. from a replica. If a shard or replica identifier is defined explicitly in the table definitions, correct it so that it corresponds to this replica. (Alternatively, launch the server and make all the ATTACH TABLE queries that should have been in the .sql files in /var/lib/clickhouse/metadata/.) +#. Create the ``/path_to_table/replica_name/flags/force_restore_data`` node in ZooKeeper with any content or run command to recover all replicated tables: ``sudo -u clickhouse touch /var/lib/clickhouse/flags/force_restore_data`` + +Then launch the server (restart it if it is already running). Data will be downloaded from replicas. 
+ +An alternative recovery option is to delete information about the lost replica from ZooKeeper ( ``/path_to_table/replica_name``), then create the replica again as described in "Creating replicated tables". + +There is no restriction on network bandwidth during recovery. Keep this in mind if you are restoring many replicas at once. + +Converting from MergeTree to ReplicatedMergeTree +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +From here on, we use ``MergeTree`` to refer to all the table engines in the ``MergeTree`` family, including ``ReplicatedMergeTree``. + +If you had a MergeTree table that was manually replicated, you can convert it to a replicatable table. You might need to do this if you have already collected a large amount of data in a MergeTree table and now you want to enable replication. + +If the data differs on various replicas, first sync it, or delete this data on all the replicas except one. + +Rename the existing MergeTree table, then create a ReplicatedMergeTree table with the old name. +Move the data from the old table to the 'detached' subdirectory inside the directory with the new table data (``/var/lib/clickhouse/data/db_name/table_name/``). +Then run ALTER TABLE ATTACH PART on one of the replicas to add these data parts to the working set. + +If exactly the same parts exist on the other replicas, they are added to the working set on them. If not, the parts are downloaded from the replica that has them. + +Converting from ReplicatedMergeTree to MergeTree +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Create a MergeTree table with a different name. Move all the data from the directory with the ReplicatedMergeTree table data to the new table's data directory. Then delete the ReplicatedMergeTree table and restart the server. + +If you want to get rid of a ReplicatedMergeTree table without launching the server: + * Delete the corresponding .sql file in the metadata directory (``/var/lib/clickhouse/metadata/``). 
+ * Delete the corresponding path in ZooKeeper (``/path_to_table/replica_name``). +After this, you can launch the server, create a MergeTree table, move the data to its directory, and then restart the server. + +Recovery when metadata in the ZooKeeper cluster is lost or damaged +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If you lost ZooKeeper, you can save data by moving it to an unreplicated table as described above. diff --git a/docs/en/table_engines/resharding.rst b/docs/en/table_engines/resharding.rst new file mode 100644 index 00000000000..6987d844453 --- /dev/null +++ b/docs/en/table_engines/resharding.rst @@ -0,0 +1,79 @@ +Resharding +---------- + +.. code-block:: sql + + ALTER TABLE t RESHARD [COPY] [PARTITION partition] TO cluster description USING sharding key + +The query works only for Replicated tables and for Distributed tables that look at Replicated tables. + +When executed, the query first checks its own correctness and the sufficiency of free space on the nodes, then writes a task to do to ZooKeeper at some path. The remaining work is done asynchronously. + +To use resharding, you must specify the ZooKeeper path of the task queue in the configuration file: + +.. code-block:: xml + + + /clickhouse/task_queue + + +When an ``ALTER TABLE t RESHARD`` query is run, the node in ZooKeeper is created if it does not exist. + +The cluster description is a list of shards with weights for distributing the data. +A shard is specified as the address of a table in ZooKeeper. Example: /clickhouse/tables/01-03/hits +The relative weight of a shard (optional, default is 1) can be specified after the WEIGHT keyword. +Example: + +.. code-block:: sql + + ALTER TABLE merge.hits + RESHARD PARTITION 201501 + TO + '/clickhouse/tables/01-01/hits' WEIGHT 1, + '/clickhouse/tables/01-02/hits' WEIGHT 2, + '/clickhouse/tables/01-03/hits' WEIGHT 1, + '/clickhouse/tables/01-04/hits' WEIGHT 1 + USING UserID + +The sharding key (``UserID`` in the example) has the same semantics as for Distributed tables.
You can specify ``rand()`` as the sharding key for random distribution of data. + +When the query is run, it checks: + * identity of table structure on all shards. + * availability of free space on the local node in the amount of the partition size in bytes, with an additional 10% reserve. + * availability of free space on all replicas of all specified shards, except the local replica, if it exists, in the amount of the partition size times the ratio of the shard weight to the total weight of all shards, with an additional 10% reserve. + +Next, asynchronous processing of the query consists of the following steps: + #. Splitting the partition into parts on the local node. + All parts forming the partition are merged and, at the same time, split into several parts according to the sharding key. + The result is placed in the /reshard directory inside the table data directory. + Source parts are not modified, and the whole process does not interfere with the table's working data set. + + #. Copying all parts to remote nodes (to each replica of corresponding shard). + + #. Execution of queries ``ALTER TABLE t DROP PARTITION`` on local node and ``ALTER TABLE t ATTACH PARTITION`` on all shards. + Note: this operation is not atomic. There is a point in time when the user could see that data is absent. + + When the ``COPY`` keyword is specified, source data is not removed. It is suitable for copying data from one cluster to another while changing the sharding scheme at the same time. + + #. Removing temporary data from local node. + +When there are multiple resharding queries, their tasks are performed sequentially. + +The query in the example reshards a single partition. +If you don't specify a partition in the query, then tasks to reshard all partitions will be created. Example: + +.. code-block:: sql + + ALTER TABLE merge.hits + RESHARD + TO ... + +When resharding Distributed tables, each shard will be resharded (corresponding query is sent to each shard). + +You can reshard a Distributed table to itself or to another table.
+ +Resharding is intended for "old" data: if a partition being resharded is modified while the job is running, the task for that partition is cancelled. + +On each server, resharding is done in a single thread. This is done so as not to disturb normal query processing. + +As of June 2016, resharding is in "beta" state: it was tested only for small data sets - up to 5 TB. diff --git a/docs/en/table_engines/set.rst b/docs/en/table_engines/set.rst new file mode 100644 index 00000000000..178678ecb0a --- /dev/null +++ b/docs/en/table_engines/set.rst @@ -0,0 +1,11 @@ +Set +---- + +A data set that is always in RAM. It is intended for use on the right side of the IN operator (see the section "IN operators"). + +You can use INSERT to insert data in the table. New elements will be added to the data set, while duplicates will be ignored. +But you can't perform SELECT from the table. The only way to retrieve data is by using it in the right half of the IN operator. + +Data is always located in RAM. For INSERT, the blocks of inserted data are also written to the directory of tables on the disk. When starting the server, this data is loaded to RAM. In other words, after restarting, the data remains in place. + +For a rough server restart, the block of data on the disk might be lost or damaged. In the latter case, you may need to manually delete the file with damaged data. diff --git a/docs/en/table_engines/summingmergetree.rst b/docs/en/table_engines/summingmergetree.rst new file mode 100644 index 00000000000..c57944cc8a1 --- /dev/null +++ b/docs/en/table_engines/summingmergetree.rst @@ -0,0 +1,40 @@ +SummingMergeTree +---------------- + +This engine differs from ``MergeTree`` in that it totals data while merging. + +.. code-block:: sql + + SummingMergeTree(EventDate, (OrderID, EventDate, BannerID, ...), 8192) + +The columns to total are implicit. When merging, all rows with the same primary key value (in the example, OrderID, EventDate, BannerID, ...)
have their values totaled in numeric columns that are not part of the primary key. + +.. code-block:: sql + + SummingMergeTree(EventDate, (OrderID, EventDate, BannerID, ...), 8192, (Shows, Clicks, Cost, ...)) + +The columns to total are set explicitly (the last parameter - Shows, Clicks, Cost, ...). When merging, all rows with the same primary key value have their values totaled in the specified columns. The specified columns also must be numeric and must not be part of the primary key. + +If the values were null in all of these columns, the row is deleted. (The exception is cases when the data part would not have any rows left in it.) + +For the other columns that are not part of the primary key, the first value that occurs is selected when merging. + +Summation is not performed for a read operation. If it is necessary, write the appropriate GROUP BY. + +In addition, a table can have nested data structures that are processed in a special way. +If the name of a nested table ends in 'Map' and it contains at least two columns that meet the following criteria: + + * for the first column, numeric ((U)IntN, Date, DateTime), we'll refer to it as 'key' + * for the other columns, arithmetic ((U)IntN, Float32/64), we'll refer to them as '(values...)' +then this nested table is interpreted as a mapping of key => (values...), and when merging its rows, the elements of two data sets are merged by 'key' with a summation of the corresponding (values...). + +Examples: +:: + [(1, 100)] + [(2, 150)] -> [(1, 100), (2, 150)] + [(1, 100)] + [(1, 150)] -> [(1, 250)] + [(1, 100)] + [(1, 150), (2, 150)] -> [(1, 250), (2, 150)] + [(1, 100), (2, 150)] + [(1, -100)] -> [(2, 150)] + +For nested data structures, you don't need to specify the columns as a list of columns for totaling. + +This table engine is not particularly useful. Remember that when saving just pre-aggregated data, you lose some of the system's advantages.
diff --git a/docs/en/table_engines/tinylog.rst b/docs/en/table_engines/tinylog.rst new file mode 100644 index 00000000000..d4f10110f45 --- /dev/null +++ b/docs/en/table_engines/tinylog.rst @@ -0,0 +1,18 @@ +TinyLog +------- + +The simplest table engine, which stores data on a disk. +Each column is stored in a separate compressed file. +When writing, data is appended to the end of files. + +Concurrent data access is not restricted in any way: + - If you are simultaneously reading from a table and writing to it in a different query, the read operation will complete with an error. + - If you are writing to a table in multiple queries simultaneously, the data will be broken. + +The typical way to use this table is write-once: first just write the data one time, then read it as many times as needed. +Queries are executed in a single stream. In other words, this engine is intended for relatively small tables (recommended up to 1,000,000 rows). +It makes sense to use this table engine if you have many small tables, since it is simpler than the Log engine (fewer files need to be opened). +The situation when you have a large number of small tables guarantees poor productivity, but may already be used when working with another DBMS, and you may find it easier to switch to using TinyLog types of tables. +Indexes are not supported. + +In Yandex.Metrica, TinyLog tables are used for intermediary data that is processed in small batches. diff --git a/docs/en/table_engines/view.rst b/docs/en/table_engines/view.rst new file mode 100644 index 00000000000..3b66eab39b2 --- /dev/null +++ b/docs/en/table_engines/view.rst @@ -0,0 +1,4 @@ +View +----- + +Used for implementing views (for more information, see the ``CREATE VIEW`` query). It does not store data, but only stores the specified ``SELECT`` query. When reading from a table, it runs this query (and deletes all unnecessary columns from the query). 
diff --git a/docs/en/table_functions/index.rst b/docs/en/table_functions/index.rst new file mode 100644 index 00000000000..afada73fe25 --- /dev/null +++ b/docs/en/table_functions/index.rst @@ -0,0 +1,11 @@ +Table functions +================= + +Table functions can be specified in the FROM clause instead of the database and table names. +Table functions can only be used if 'readonly' is not set. +Table functions aren't related to other functions. + +.. toctree:: + :glob: + + * diff --git a/docs/en/table_functions/merge.rst b/docs/en/table_functions/merge.rst new file mode 100644 index 00000000000..f4e1246a363 --- /dev/null +++ b/docs/en/table_functions/merge.rst @@ -0,0 +1,6 @@ +merge +----- + +``merge(db_name, 'tables_regexp')`` creates a temporary Merge table. For more information, see the section "Table engines, Merge". + +The table structure is taken from the first table encountered that matches the regular expression. diff --git a/docs/en/table_functions/remote.rst b/docs/en/table_functions/remote.rst new file mode 100644 index 00000000000..9fafc4f99d8 --- /dev/null +++ b/docs/en/table_functions/remote.rst @@ -0,0 +1,63 @@ +remote +------ + +``remote('addresses_expr', db, table[, 'user'[, 'password']])`` + +or + +``remote('addresses_expr', db.table[, 'user'[, 'password']])`` + +- Allows accessing a remote server without creating a Distributed table. + +``addresses_expr`` - An expression that generates addresses of remote servers. + +This may be just one server address. The server address is host:port, or just the host. The host can be specified as the server name, or as the IPv4 or IPv6 address. An IPv6 address is specified in square brackets. The port is the TCP port on the remote server. If the port is omitted, it uses tcp_port from the server's config file (by default, 9000). + +Note: As an exception, when specifying an IPv6 address, the port is required. 
+ +Examples: +:: + example01-01-1 + example01-01-1:9000 + localhost + 127.0.0.1 + [::]:9000 + [2a02:6b8:0:1111::11]:9000 + +Multiple addresses can be comma-separated. In this case, the query goes to all the specified addresses (like to shards with different data) and uses distributed processing. + +Example: +:: + example01-01-1,example01-02-1 + +Part of the expression can be specified in curly brackets. The previous example can be written as follows: +:: + example01-0{1,2}-1 + +Curly brackets can contain a range of numbers separated by two dots (non-negative integers). In this case, the range is expanded to a set of values that generate shard addresses. If the first number starts with zero, the values are formed with the same zero alignment. +The previous example can be written as follows: +:: + example01-{01..02}-1 + +If you have multiple pairs of curly brackets, it generates the direct product of the corresponding sets. + +Addresses and fragments in curly brackets can be separated by the pipe (|) symbol. In this case, the corresponding sets of addresses are interpreted as replicas, and the query will be sent to the first healthy replica. The replicas are evaluated in the order currently set in the 'load_balancing' setting. + +Example: +:: + example01-{01..02}-{1|2} + +This example specifies two shards that each have two replicas. + +The number of addresses generated is limited by a constant. Right now this is 1000 addresses. + +Using the 'remote' table function is less optimal than creating a Distributed table, because in this case, the server connection is re-established for every request. In addition, if host names are set, the names are resolved, and errors are not counted when working with various replicas. When processing a large number of queries, always create the Distributed table ahead of time, and don't use the 'remote' table function. 
+ +The 'remote' table function can be useful in the following cases: + * Accessing a specific server for data comparison, debugging, and testing. + * Queries between various ClickHouse clusters for research purposes. + * Infrequent distributed requests that are made manually. + * Distributed requests where the set of servers is re-defined each time. + +The username can be omitted. In this case, the 'default' username is used. +The password can be omitted. In this case, an empty password is used. diff --git a/docs/ru/access_rights.rst b/docs/ru/access_rights.rst index 66fb1811729..0987d4d8465 100644 --- a/docs/ru/access_rights.rst +++ b/docs/ru/access_rights.rst @@ -27,7 +27,7 @@