mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-25 00:52:02 +00:00
Merge remote-tracking branch 'origin/master' into s3-uri-virtual-hosted-style-support
This commit is contained in:
commit
f9c98dd689
202
.gitignore
vendored
202
.gitignore
vendored
@ -18,7 +18,7 @@
|
||||
/docs/website
|
||||
/docs/venv/
|
||||
/docs/tools/venv/
|
||||
/docs/tools/translate/venv/
|
||||
/docs/tools/translate/venv
|
||||
/docs/tools/translate/output.md
|
||||
/docs/en/single.md
|
||||
/docs/ru/single.md
|
||||
@ -73,100 +73,100 @@ contrib/libpoco/Poco/
|
||||
contrib/libpoco/bin/
|
||||
contrib/libpoco/cmake_uninstall.cmake
|
||||
contrib/libre2/re2_st/
|
||||
dbms/Client/clickhouse-benchmark
|
||||
dbms/Client/clickhouse-client
|
||||
dbms/Client/tests/test-connect
|
||||
dbms/Common/tests/arena_with_free_lists
|
||||
dbms/Common/tests/auto_array
|
||||
dbms/Common/tests/compact_array
|
||||
dbms/Common/tests/hash_table
|
||||
dbms/Common/tests/hashes_test
|
||||
dbms/Common/tests/int_hashes_perf
|
||||
dbms/Common/tests/lru_cache
|
||||
dbms/Common/tests/parallel_aggregation
|
||||
dbms/Common/tests/parallel_aggregation2
|
||||
dbms/Common/tests/radix_sort
|
||||
dbms/Common/tests/shell_command_test
|
||||
dbms/Common/tests/simple_cache
|
||||
dbms/Common/tests/sip_hash
|
||||
dbms/Common/tests/sip_hash_perf
|
||||
dbms/Common/tests/small_table
|
||||
dbms/Core/tests/exception
|
||||
dbms/Core/tests/field
|
||||
dbms/Core/tests/rvo_test
|
||||
dbms/Core/tests/string_pool
|
||||
dbms/DataStreams/tests/aggregating_stream
|
||||
dbms/DataStreams/tests/block_tab_separated_streams
|
||||
dbms/DataStreams/tests/collapsing_sorted_stream
|
||||
dbms/DataStreams/tests/expression_stream
|
||||
dbms/DataStreams/tests/filter_stream
|
||||
dbms/DataStreams/tests/filter_stream_hitlog
|
||||
dbms/DataStreams/tests/fork_streams
|
||||
dbms/DataStreams/tests/glue_streams
|
||||
dbms/DataStreams/tests/json_streams
|
||||
dbms/DataStreams/tests/native_streams
|
||||
dbms/DataStreams/tests/sorting_stream
|
||||
dbms/DataStreams/tests/tab_separated_streams
|
||||
dbms/DataStreams/tests/union_stream
|
||||
dbms/DataStreams/tests/union_stream2
|
||||
dbms/DataTypes/tests/data_type_string
|
||||
dbms/DataTypes/tests/data_types_number_fixed
|
||||
dbms/Functions/tests/functions_arithmetic
|
||||
dbms/Functions/tests/logical_functions_performance
|
||||
dbms/Functions/tests/number_traits
|
||||
dbms/IO/tests/async_write
|
||||
dbms/IO/tests/cached_compressed_read_buffer
|
||||
dbms/IO/tests/compressed_buffer
|
||||
dbms/IO/tests/hashing_read_buffer
|
||||
dbms/IO/tests/hashing_write_buffer
|
||||
dbms/IO/tests/io_and_exceptions
|
||||
dbms/IO/tests/io_operators
|
||||
dbms/IO/tests/mempbrk
|
||||
dbms/IO/tests/o_direct_and_dirty_pages
|
||||
dbms/IO/tests/parse_int_perf
|
||||
dbms/IO/tests/parse_int_perf2
|
||||
dbms/IO/tests/read_buffer
|
||||
dbms/IO/tests/read_buffer_aio
|
||||
dbms/IO/tests/read_buffer_perf
|
||||
dbms/IO/tests/read_escaped_string
|
||||
dbms/IO/tests/read_float_perf
|
||||
dbms/IO/tests/read_write_int
|
||||
dbms/IO/tests/valid_utf8
|
||||
dbms/IO/tests/valid_utf8_perf
|
||||
dbms/IO/tests/var_uint
|
||||
dbms/IO/tests/write_buffer
|
||||
dbms/IO/tests/write_buffer_aio
|
||||
dbms/IO/tests/write_buffer_perf
|
||||
dbms/Interpreters/tests/address_patterns
|
||||
dbms/Interpreters/tests/aggregate
|
||||
dbms/Interpreters/tests/compiler_test
|
||||
dbms/Interpreters/tests/create_query
|
||||
dbms/Interpreters/tests/expression
|
||||
dbms/Interpreters/tests/expression_analyzer
|
||||
dbms/Interpreters/tests/hash_map
|
||||
dbms/Interpreters/tests/hash_map2
|
||||
dbms/Interpreters/tests/hash_map3
|
||||
dbms/Interpreters/tests/hash_map_string
|
||||
dbms/Interpreters/tests/hash_map_string_2
|
||||
dbms/Interpreters/tests/hash_map_string_3
|
||||
dbms/Interpreters/tests/hash_map_string_small
|
||||
dbms/Interpreters/tests/in_join_subqueries_preprocessor
|
||||
dbms/Interpreters/tests/logical_expressions_optimizer
|
||||
dbms/Interpreters/tests/select_query
|
||||
dbms/Interpreters/tests/two_level_hash_map
|
||||
dbms/Interpreters/tests/users
|
||||
dbms/Parsers/tests/create_parser
|
||||
dbms/Parsers/tests/select_parser
|
||||
dbms/Server/clickhouse-server
|
||||
dbms/Server/clickhouse-server.init
|
||||
dbms/Storages/tests/hit_log
|
||||
dbms/Storages/tests/merge_tree
|
||||
dbms/Storages/tests/part_checker
|
||||
dbms/Storages/tests/part_name
|
||||
dbms/Storages/tests/pk_condition
|
||||
dbms/Storages/tests/seek_speed_test
|
||||
dbms/Storages/tests/storage_log
|
||||
dbms/Storages/tests/system_numbers
|
||||
src/Client/clickhouse-benchmark
|
||||
src/Client/clickhouse-client
|
||||
src/Client/tests/test-connect
|
||||
src/Common/tests/arena_with_free_lists
|
||||
src/Common/tests/auto_array
|
||||
src/Common/tests/compact_array
|
||||
src/Common/tests/hash_table
|
||||
src/Common/tests/hashes_test
|
||||
src/Common/tests/int_hashes_perf
|
||||
src/Common/tests/lru_cache
|
||||
src/Common/tests/parallel_aggregation
|
||||
src/Common/tests/parallel_aggregation2
|
||||
src/Common/tests/radix_sort
|
||||
src/Common/tests/shell_command_test
|
||||
src/Common/tests/simple_cache
|
||||
src/Common/tests/sip_hash
|
||||
src/Common/tests/sip_hash_perf
|
||||
src/Common/tests/small_table
|
||||
src/Core/tests/exception
|
||||
src/Core/tests/field
|
||||
src/Core/tests/rvo_test
|
||||
src/Core/tests/string_pool
|
||||
src/DataStreams/tests/aggregating_stream
|
||||
src/DataStreams/tests/block_tab_separated_streams
|
||||
src/DataStreams/tests/collapsing_sorted_stream
|
||||
src/DataStreams/tests/expression_stream
|
||||
src/DataStreams/tests/filter_stream
|
||||
src/DataStreams/tests/filter_stream_hitlog
|
||||
src/DataStreams/tests/fork_streams
|
||||
src/DataStreams/tests/glue_streams
|
||||
src/DataStreams/tests/json_streams
|
||||
src/DataStreams/tests/native_streams
|
||||
src/DataStreams/tests/sorting_stream
|
||||
src/DataStreams/tests/tab_separated_streams
|
||||
src/DataStreams/tests/union_stream
|
||||
src/DataStreams/tests/union_stream2
|
||||
src/DataTypes/tests/data_type_string
|
||||
src/DataTypes/tests/data_types_number_fixed
|
||||
src/Functions/tests/functions_arithmetic
|
||||
src/Functions/tests/logical_functions_performance
|
||||
src/Functions/tests/number_traits
|
||||
src/IO/tests/async_write
|
||||
src/IO/tests/cached_compressed_read_buffer
|
||||
src/IO/tests/compressed_buffer
|
||||
src/IO/tests/hashing_read_buffer
|
||||
src/IO/tests/hashing_write_buffer
|
||||
src/IO/tests/io_and_exceptions
|
||||
src/IO/tests/io_operators
|
||||
src/IO/tests/mempbrk
|
||||
src/IO/tests/o_direct_and_dirty_pages
|
||||
src/IO/tests/parse_int_perf
|
||||
src/IO/tests/parse_int_perf2
|
||||
src/IO/tests/read_buffer
|
||||
src/IO/tests/read_buffer_aio
|
||||
src/IO/tests/read_buffer_perf
|
||||
src/IO/tests/read_escaped_string
|
||||
src/IO/tests/read_float_perf
|
||||
src/IO/tests/read_write_int
|
||||
src/IO/tests/valid_utf8
|
||||
src/IO/tests/valid_utf8_perf
|
||||
src/IO/tests/var_uint
|
||||
src/IO/tests/write_buffer
|
||||
src/IO/tests/write_buffer_aio
|
||||
src/IO/tests/write_buffer_perf
|
||||
src/Interpreters/tests/address_patterns
|
||||
src/Interpreters/tests/aggregate
|
||||
src/Interpreters/tests/compiler_test
|
||||
src/Interpreters/tests/create_query
|
||||
src/Interpreters/tests/expression
|
||||
src/Interpreters/tests/expression_analyzer
|
||||
src/Interpreters/tests/hash_map
|
||||
src/Interpreters/tests/hash_map2
|
||||
src/Interpreters/tests/hash_map3
|
||||
src/Interpreters/tests/hash_map_string
|
||||
src/Interpreters/tests/hash_map_string_2
|
||||
src/Interpreters/tests/hash_map_string_3
|
||||
src/Interpreters/tests/hash_map_string_small
|
||||
src/Interpreters/tests/in_join_subqueries_preprocessor
|
||||
src/Interpreters/tests/logical_expressions_optimizer
|
||||
src/Interpreters/tests/select_query
|
||||
src/Interpreters/tests/two_level_hash_map
|
||||
src/Interpreters/tests/users
|
||||
src/Parsers/tests/create_parser
|
||||
src/Parsers/tests/select_parser
|
||||
src/Server/clickhouse-server
|
||||
src/Server/clickhouse-server.init
|
||||
src/Storages/tests/hit_log
|
||||
src/Storages/tests/merge_tree
|
||||
src/Storages/tests/part_checker
|
||||
src/Storages/tests/part_name
|
||||
src/Storages/tests/pk_condition
|
||||
src/Storages/tests/seek_speed_test
|
||||
src/Storages/tests/storage_log
|
||||
src/Storages/tests/system_numbers
|
||||
libs/libcommon/src/revision.h
|
||||
libs/libcommon/src/tests/date_lut2
|
||||
libs/libcommon/src/tests/date_lut3
|
||||
@ -184,15 +184,15 @@ libs/libzkutil/src/tests/zkutil_zookeeper_holder
|
||||
utils/zookeeper-create-entry-to-download-part/zookeeper-create-entry-to-download-part
|
||||
utils/zookeeper-dump-tree/zookeeper-dump-tree
|
||||
utils/zookeeper-remove-by-list/zookeeper-remove-by-list
|
||||
dbms/Storages/tests/remove_symlink_directory
|
||||
src/Storages/tests/remove_symlink_directory
|
||||
libs/libcommon/src/tests/json_test
|
||||
utils/compressor/zstd_test
|
||||
utils/wikistat-loader/wikistat-loader
|
||||
dbms/Common/tests/pod_array
|
||||
src/Common/tests/pod_array
|
||||
|
||||
dbms/Server/data/*
|
||||
dbms/Server/metadata/*
|
||||
dbms/Server/status
|
||||
src/Server/data/*
|
||||
src/Server/metadata/*
|
||||
src/Server/status
|
||||
config-9001.xml
|
||||
|
||||
*-preprocessed.xml
|
||||
@ -242,7 +242,7 @@ website/package-lock.json
|
||||
*/.DS_Store
|
||||
|
||||
# Ignore files for locally disabled tests
|
||||
/dbms/queries/**/*.disabled
|
||||
/src/queries/**/*.disabled
|
||||
|
||||
# cquery cache
|
||||
/.cquery-cache
|
||||
|
@ -31,11 +31,11 @@ build:
|
||||
- docker pull $CI_REGISTRY/yandex/clickhouse-builder
|
||||
- docker run --rm --volumes-from "${HOSTNAME}-build" --workdir "${CI_PROJECT_DIR}" --env CI_PROJECT_DIR=${CI_PROJECT_DIR} $CI_REGISTRY/yandex/clickhouse-builder /build_gitlab_ci.sh
|
||||
# You can upload your binary to nexus
|
||||
- curl -v --keepalive-time 60 --keepalive --user "$NEXUS_USER:$NEXUS_PASSWORD" -XPUT "http://$NEXUS_HOST/repository/binaries/$CI_PROJECT_NAME" --upload-file ./dbms/Server/clickhouse
|
||||
- curl -v --keepalive-time 60 --keepalive --user "$NEXUS_USER:$NEXUS_PASSWORD" -XPUT "http://$NEXUS_HOST/repository/binaries/$CI_PROJECT_NAME" --upload-file ./src/Server/clickhouse
|
||||
# Or download artifacts from gitlab
|
||||
artifacts:
|
||||
paths:
|
||||
- ./dbms/Server/clickhouse
|
||||
- ./src/Server/clickhouse
|
||||
expire_in: 1 day
|
||||
tags:
|
||||
- docker
|
||||
|
@ -384,8 +384,8 @@ set(ConfigIncludePath ${CMAKE_CURRENT_BINARY_DIR}/includes/configs CACHE INTERNA
|
||||
include_directories(${ConfigIncludePath})
|
||||
|
||||
add_subdirectory (base)
|
||||
add_subdirectory (dbms)
|
||||
add_subdirectory (programs)
|
||||
add_subdirectory (src)
|
||||
add_subdirectory (tests)
|
||||
add_subdirectory (utils)
|
||||
|
||||
|
@ -16,7 +16,7 @@ ClickHouse is an open-source column-oriented database management system that all
|
||||
## Upcoming Events
|
||||
|
||||
* [ClickHouse Online Meetup (in Russian)](https://events.yandex.ru/events/click-house-onlajn-vs-03-04-2020) on April 3, 2020.
|
||||
* [Talk on Saint HighLoad++ (online in Russian)](https://www.highload.ru/spb/2020/abstracts/6647) on April 6, 2020.
|
||||
* [ClickHouse in Avito (online in Russian)](https://avitotech.timepad.ru/event/1290051/) on April 9, 2020.
|
||||
* [ClickHouse Workshop in Novosibirsk](https://2020.codefest.ru/lecture/1628) on TBD date.
|
||||
* [Talks on Saint HighLoad++ in St. Petersburg](https://www.highload.ru/spb/2020/abstracts/6647) on TBD date.
|
||||
* [Yandex C++ Open-Source Sprints in Moscow](https://events.yandex.ru/events/otkrytyj-kod-v-yandek-28-03-2020) on TBD date.
|
||||
|
@ -12,7 +12,6 @@
|
||||
#include <unistd.h>
|
||||
|
||||
#include <typeinfo>
|
||||
#include <sys/time.h>
|
||||
#include <sys/resource.h>
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
|
@ -10,7 +10,7 @@ if (ENABLE_CLANG_TIDY)
|
||||
if (CLANG_TIDY_PATH)
|
||||
message(STATUS "Using clang-tidy: ${CLANG_TIDY_PATH}. The checks will be run during build process. See the .clang-tidy file at the root directory to configure the checks.")
|
||||
set (USE_CLANG_TIDY 1)
|
||||
# The variable CMAKE_CXX_CLANG_TIDY will be set inside dbms and base directories with non third-party code.
|
||||
# The variable CMAKE_CXX_CLANG_TIDY will be set inside src and base directories with non third-party code.
|
||||
# set (CMAKE_CXX_CLANG_TIDY "${CLANG_TIDY_PATH}")
|
||||
else ()
|
||||
message(STATUS "clang-tidy is not found. This is normal - the tool is used only for static code analysis and not essential for build.")
|
||||
|
@ -1,5 +1,5 @@
|
||||
set(DIVIDE_INCLUDE_DIR ${ClickHouse_SOURCE_DIR}/contrib/libdivide)
|
||||
set(DBMS_INCLUDE_DIR ${ClickHouse_SOURCE_DIR}/dbms ${ClickHouse_BINARY_DIR}/dbms)
|
||||
set(DBMS_INCLUDE_DIR ${ClickHouse_SOURCE_DIR}/src ${ClickHouse_BINARY_DIR}/src)
|
||||
set(DOUBLE_CONVERSION_CONTRIB_INCLUDE_DIR ${ClickHouse_SOURCE_DIR}/contrib/double-conversion)
|
||||
set(METROHASH_CONTRIB_INCLUDE_DIR ${ClickHouse_SOURCE_DIR}/contrib/libmetrohash/src)
|
||||
set(PCG_RANDOM_INCLUDE_DIR ${ClickHouse_SOURCE_DIR}/contrib/libpcg-random/include)
|
||||
|
@ -38,7 +38,7 @@
|
||||
|
||||
/* Implementation: */
|
||||
|
||||
static int _libcpiud_errno = ERR_OK;
|
||||
_Thread_local int _libcpiud_errno = ERR_OK;
|
||||
|
||||
int set_error(cpu_error_t err)
|
||||
{
|
||||
|
@ -1,8 +0,0 @@
|
||||
# TODO: make separate lib datastream, block, ...
|
||||
#include(${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake)
|
||||
#add_headers_and_sources(clickhouse_client .)
|
||||
#add_library(clickhouse_client ${clickhouse_client_headers} ${clickhouse_client_sources})
|
||||
#target_link_libraries (clickhouse_client clickhouse_common_io ${Poco_Net_LIBRARY})
|
||||
#target_include_directories (clickhouse_client PRIVATE ${DBMS_INCLUDE_DIR})
|
||||
|
||||
add_subdirectory(tests)
|
@ -1,709 +0,0 @@
|
||||
#include "FunctionsStringSearch.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <Columns/ColumnFixedString.h>
|
||||
#include <DataTypes/DataTypeFixedString.h>
|
||||
#include <Functions/FunctionFactory.h>
|
||||
#include <Functions/FunctionsMultiStringPosition.h>
|
||||
#include <Functions/FunctionsMultiStringSearch.h>
|
||||
#include <Functions/Regexps.h>
|
||||
#include <IO/WriteHelpers.h>
|
||||
#include <Poco/UTF8String.h>
|
||||
#include <Common/Volnitsky.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int ILLEGAL_COLUMN;
|
||||
}
|
||||
/** Implementation details for functions of 'position' family depending on ASCII/UTF8 and case sensitiveness.
|
||||
*/
|
||||
struct PositionCaseSensitiveASCII
|
||||
{
|
||||
/// For searching single substring inside big-enough contiguous chunk of data. Coluld have slightly expensive initialization.
|
||||
using SearcherInBigHaystack = Volnitsky;
|
||||
|
||||
/// For search many substrings in one string
|
||||
using MultiSearcherInBigHaystack = MultiVolnitsky;
|
||||
|
||||
/// For searching single substring, that is different each time. This object is created for each row of data. It must have cheap initialization.
|
||||
using SearcherInSmallHaystack = LibCASCIICaseSensitiveStringSearcher;
|
||||
|
||||
static SearcherInBigHaystack createSearcherInBigHaystack(const char * needle_data, size_t needle_size, size_t haystack_size_hint)
|
||||
{
|
||||
return SearcherInBigHaystack(needle_data, needle_size, haystack_size_hint);
|
||||
}
|
||||
|
||||
static SearcherInSmallHaystack createSearcherInSmallHaystack(const char * needle_data, size_t needle_size)
|
||||
{
|
||||
return SearcherInSmallHaystack(needle_data, needle_size);
|
||||
}
|
||||
|
||||
static MultiSearcherInBigHaystack createMultiSearcherInBigHaystack(const std::vector<StringRef> & needles)
|
||||
{
|
||||
return MultiSearcherInBigHaystack(needles);
|
||||
}
|
||||
|
||||
/// Number of code points between 'begin' and 'end' (this has different behaviour for ASCII and UTF-8).
|
||||
static size_t countChars(const char * begin, const char * end) { return end - begin; }
|
||||
|
||||
/// Convert string to lowercase. Only for case-insensitive search.
|
||||
/// Implementation is permitted to be inefficient because it is called for single string.
|
||||
static void toLowerIfNeed(std::string &) { }
|
||||
};
|
||||
|
||||
struct PositionCaseInsensitiveASCII
|
||||
{
|
||||
/// `Volnitsky` is not used here, because one person has measured that this is better. It will be good if you question it.
|
||||
using SearcherInBigHaystack = ASCIICaseInsensitiveStringSearcher;
|
||||
using MultiSearcherInBigHaystack = MultiVolnitskyCaseInsensitive;
|
||||
using SearcherInSmallHaystack = LibCASCIICaseInsensitiveStringSearcher;
|
||||
|
||||
static SearcherInBigHaystack createSearcherInBigHaystack(const char * needle_data, size_t needle_size, size_t /*haystack_size_hint*/)
|
||||
{
|
||||
return SearcherInBigHaystack(needle_data, needle_size);
|
||||
}
|
||||
|
||||
static SearcherInSmallHaystack createSearcherInSmallHaystack(const char * needle_data, size_t needle_size)
|
||||
{
|
||||
return SearcherInSmallHaystack(needle_data, needle_size);
|
||||
}
|
||||
|
||||
static MultiSearcherInBigHaystack createMultiSearcherInBigHaystack(const std::vector<StringRef> & needles)
|
||||
{
|
||||
return MultiSearcherInBigHaystack(needles);
|
||||
}
|
||||
|
||||
static size_t countChars(const char * begin, const char * end) { return end - begin; }
|
||||
|
||||
static void toLowerIfNeed(std::string & s) { std::transform(std::begin(s), std::end(s), std::begin(s), tolower); }
|
||||
};
|
||||
|
||||
struct PositionCaseSensitiveUTF8
|
||||
{
|
||||
using SearcherInBigHaystack = VolnitskyUTF8;
|
||||
using MultiSearcherInBigHaystack = MultiVolnitskyUTF8;
|
||||
using SearcherInSmallHaystack = LibCASCIICaseSensitiveStringSearcher;
|
||||
|
||||
static SearcherInBigHaystack createSearcherInBigHaystack(const char * needle_data, size_t needle_size, size_t haystack_size_hint)
|
||||
{
|
||||
return SearcherInBigHaystack(needle_data, needle_size, haystack_size_hint);
|
||||
}
|
||||
|
||||
static SearcherInSmallHaystack createSearcherInSmallHaystack(const char * needle_data, size_t needle_size)
|
||||
{
|
||||
return SearcherInSmallHaystack(needle_data, needle_size);
|
||||
}
|
||||
|
||||
static MultiSearcherInBigHaystack createMultiSearcherInBigHaystack(const std::vector<StringRef> & needles)
|
||||
{
|
||||
return MultiSearcherInBigHaystack(needles);
|
||||
}
|
||||
|
||||
static size_t countChars(const char * begin, const char * end)
|
||||
{
|
||||
size_t res = 0;
|
||||
for (auto it = begin; it != end; ++it)
|
||||
if (!UTF8::isContinuationOctet(static_cast<UInt8>(*it)))
|
||||
++res;
|
||||
return res;
|
||||
}
|
||||
|
||||
static void toLowerIfNeed(std::string &) { }
|
||||
};
|
||||
|
||||
struct PositionCaseInsensitiveUTF8
|
||||
{
|
||||
using SearcherInBigHaystack = VolnitskyCaseInsensitiveUTF8;
|
||||
using MultiSearcherInBigHaystack = MultiVolnitskyCaseInsensitiveUTF8;
|
||||
using SearcherInSmallHaystack = UTF8CaseInsensitiveStringSearcher; /// TODO Very suboptimal.
|
||||
|
||||
static SearcherInBigHaystack createSearcherInBigHaystack(const char * needle_data, size_t needle_size, size_t haystack_size_hint)
|
||||
{
|
||||
return SearcherInBigHaystack(needle_data, needle_size, haystack_size_hint);
|
||||
}
|
||||
|
||||
static SearcherInSmallHaystack createSearcherInSmallHaystack(const char * needle_data, size_t needle_size)
|
||||
{
|
||||
return SearcherInSmallHaystack(needle_data, needle_size);
|
||||
}
|
||||
|
||||
static MultiSearcherInBigHaystack createMultiSearcherInBigHaystack(const std::vector<StringRef> & needles)
|
||||
{
|
||||
return MultiSearcherInBigHaystack(needles);
|
||||
}
|
||||
|
||||
static size_t countChars(const char * begin, const char * end)
|
||||
{
|
||||
size_t res = 0;
|
||||
for (auto it = begin; it != end; ++it)
|
||||
if (!UTF8::isContinuationOctet(static_cast<UInt8>(*it)))
|
||||
++res;
|
||||
return res;
|
||||
}
|
||||
|
||||
static void toLowerIfNeed(std::string & s) { Poco::UTF8::toLowerInPlace(s); }
|
||||
};
|
||||
|
||||
template <typename Impl>
|
||||
struct PositionImpl
|
||||
{
|
||||
static constexpr bool use_default_implementation_for_constants = false;
|
||||
|
||||
using ResultType = UInt64;
|
||||
|
||||
/// Find one substring in many strings.
|
||||
static void vectorConstant(
|
||||
const ColumnString::Chars & data, const ColumnString::Offsets & offsets, const std::string & needle, PaddedPODArray<UInt64> & res)
|
||||
{
|
||||
const UInt8 * begin = data.data();
|
||||
const UInt8 * pos = begin;
|
||||
const UInt8 * end = pos + data.size();
|
||||
|
||||
/// Current index in the array of strings.
|
||||
size_t i = 0;
|
||||
|
||||
typename Impl::SearcherInBigHaystack searcher = Impl::createSearcherInBigHaystack(needle.data(), needle.size(), end - pos);
|
||||
|
||||
/// We will search for the next occurrence in all strings at once.
|
||||
while (pos < end && end != (pos = searcher.search(pos, end - pos)))
|
||||
{
|
||||
/// Determine which index it refers to.
|
||||
while (begin + offsets[i] <= pos)
|
||||
{
|
||||
res[i] = 0;
|
||||
++i;
|
||||
}
|
||||
|
||||
/// We check that the entry does not pass through the boundaries of strings.
|
||||
if (pos + needle.size() < begin + offsets[i])
|
||||
res[i] = 1 + Impl::countChars(reinterpret_cast<const char *>(begin + offsets[i - 1]), reinterpret_cast<const char *>(pos));
|
||||
else
|
||||
res[i] = 0;
|
||||
|
||||
pos = begin + offsets[i];
|
||||
++i;
|
||||
}
|
||||
|
||||
if (i < res.size())
|
||||
memset(&res[i], 0, (res.size() - i) * sizeof(res[0]));
|
||||
}
|
||||
|
||||
/// Search for substring in string.
|
||||
static void constantConstant(std::string data, std::string needle, UInt64 & res)
|
||||
{
|
||||
Impl::toLowerIfNeed(data);
|
||||
Impl::toLowerIfNeed(needle);
|
||||
|
||||
res = data.find(needle);
|
||||
if (res == std::string::npos)
|
||||
res = 0;
|
||||
else
|
||||
res = 1 + Impl::countChars(data.data(), data.data() + res);
|
||||
}
|
||||
|
||||
/// Search each time for a different single substring inside each time different string.
|
||||
static void vectorVector(
|
||||
const ColumnString::Chars & haystack_data,
|
||||
const ColumnString::Offsets & haystack_offsets,
|
||||
const ColumnString::Chars & needle_data,
|
||||
const ColumnString::Offsets & needle_offsets,
|
||||
PaddedPODArray<UInt64> & res)
|
||||
{
|
||||
ColumnString::Offset prev_haystack_offset = 0;
|
||||
ColumnString::Offset prev_needle_offset = 0;
|
||||
|
||||
size_t size = haystack_offsets.size();
|
||||
|
||||
for (size_t i = 0; i < size; ++i)
|
||||
{
|
||||
size_t needle_size = needle_offsets[i] - prev_needle_offset - 1;
|
||||
size_t haystack_size = haystack_offsets[i] - prev_haystack_offset - 1;
|
||||
|
||||
if (0 == needle_size)
|
||||
{
|
||||
/// An empty string is always at the very beginning of `haystack`.
|
||||
res[i] = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
/// It is assumed that the StringSearcher is not very difficult to initialize.
|
||||
typename Impl::SearcherInSmallHaystack searcher = Impl::createSearcherInSmallHaystack(
|
||||
reinterpret_cast<const char *>(&needle_data[prev_needle_offset]),
|
||||
needle_offsets[i] - prev_needle_offset - 1); /// zero byte at the end
|
||||
|
||||
/// searcher returns a pointer to the found substring or to the end of `haystack`.
|
||||
size_t pos = searcher.search(&haystack_data[prev_haystack_offset], &haystack_data[haystack_offsets[i] - 1])
|
||||
- &haystack_data[prev_haystack_offset];
|
||||
|
||||
if (pos != haystack_size)
|
||||
{
|
||||
res[i] = 1
|
||||
+ Impl::countChars(
|
||||
reinterpret_cast<const char *>(&haystack_data[prev_haystack_offset]),
|
||||
reinterpret_cast<const char *>(&haystack_data[prev_haystack_offset + pos]));
|
||||
}
|
||||
else
|
||||
res[i] = 0;
|
||||
}
|
||||
|
||||
prev_haystack_offset = haystack_offsets[i];
|
||||
prev_needle_offset = needle_offsets[i];
|
||||
}
|
||||
}
|
||||
|
||||
/// Find many substrings in single string.
|
||||
static void constantVector(
|
||||
const String & haystack,
|
||||
const ColumnString::Chars & needle_data,
|
||||
const ColumnString::Offsets & needle_offsets,
|
||||
PaddedPODArray<UInt64> & res)
|
||||
{
|
||||
// NOTE You could use haystack indexing. But this is a rare case.
|
||||
|
||||
ColumnString::Offset prev_needle_offset = 0;
|
||||
|
||||
size_t size = needle_offsets.size();
|
||||
|
||||
for (size_t i = 0; i < size; ++i)
|
||||
{
|
||||
size_t needle_size = needle_offsets[i] - prev_needle_offset - 1;
|
||||
|
||||
if (0 == needle_size)
|
||||
{
|
||||
res[i] = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
typename Impl::SearcherInSmallHaystack searcher = Impl::createSearcherInSmallHaystack(
|
||||
reinterpret_cast<const char *>(&needle_data[prev_needle_offset]), needle_offsets[i] - prev_needle_offset - 1);
|
||||
|
||||
size_t pos = searcher.search(
|
||||
reinterpret_cast<const UInt8 *>(haystack.data()),
|
||||
reinterpret_cast<const UInt8 *>(haystack.data()) + haystack.size())
|
||||
- reinterpret_cast<const UInt8 *>(haystack.data());
|
||||
|
||||
if (pos != haystack.size())
|
||||
{
|
||||
res[i] = 1 + Impl::countChars(haystack.data(), haystack.data() + pos);
|
||||
}
|
||||
else
|
||||
res[i] = 0;
|
||||
}
|
||||
|
||||
prev_needle_offset = needle_offsets[i];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename... Args>
|
||||
static void vectorFixedConstant(Args &&...)
|
||||
{
|
||||
throw Exception("Functions 'position' don't support FixedString haystack argument", ErrorCodes::ILLEGAL_COLUMN);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Impl>
|
||||
struct MultiSearchAllPositionsImpl
|
||||
{
|
||||
using ResultType = UInt64;
|
||||
|
||||
static void vectorConstant(
|
||||
const ColumnString::Chars & haystack_data,
|
||||
const ColumnString::Offsets & haystack_offsets,
|
||||
const std::vector<StringRef> & needles,
|
||||
PaddedPODArray<UInt64> & res)
|
||||
{
|
||||
auto res_callback = [](const UInt8 * start, const UInt8 * end) -> UInt64
|
||||
{
|
||||
return 1 + Impl::countChars(reinterpret_cast<const char *>(start), reinterpret_cast<const char *>(end));
|
||||
};
|
||||
|
||||
auto searcher = Impl::createMultiSearcherInBigHaystack(needles);
|
||||
|
||||
const size_t haystack_string_size = haystack_offsets.size();
|
||||
const size_t needles_size = needles.size();
|
||||
|
||||
/// Something can be uninitialized after the search itself
|
||||
std::fill(res.begin(), res.end(), 0);
|
||||
|
||||
while (searcher.hasMoreToSearch())
|
||||
{
|
||||
size_t prev_offset = 0;
|
||||
for (size_t j = 0, from = 0; j < haystack_string_size; ++j, from += needles_size)
|
||||
{
|
||||
const auto * haystack = &haystack_data[prev_offset];
|
||||
const auto * haystack_end = haystack + haystack_offsets[j] - prev_offset - 1;
|
||||
searcher.searchOneAll(haystack, haystack_end, res.data() + from, res_callback);
|
||||
prev_offset = haystack_offsets[j];
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Impl>
|
||||
struct MultiSearchImpl
|
||||
{
|
||||
using ResultType = UInt8;
|
||||
static constexpr bool is_using_hyperscan = false;
|
||||
/// Variable for understanding, if we used offsets for the output, most
|
||||
/// likely to determine whether the function returns ColumnVector of ColumnArray.
|
||||
static constexpr bool is_column_array = false;
|
||||
static auto getReturnType() { return std::make_shared<DataTypeNumber<ResultType>>(); }
|
||||
|
||||
static void vectorConstant(
|
||||
const ColumnString::Chars & haystack_data,
|
||||
const ColumnString::Offsets & haystack_offsets,
|
||||
const std::vector<StringRef> & needles,
|
||||
PaddedPODArray<UInt8> & res,
|
||||
[[maybe_unused]] PaddedPODArray<UInt64> & offsets)
|
||||
{
|
||||
auto searcher = Impl::createMultiSearcherInBigHaystack(needles);
|
||||
const size_t haystack_string_size = haystack_offsets.size();
|
||||
res.resize(haystack_string_size);
|
||||
size_t iteration = 0;
|
||||
while (searcher.hasMoreToSearch())
|
||||
{
|
||||
size_t prev_offset = 0;
|
||||
for (size_t j = 0; j < haystack_string_size; ++j)
|
||||
{
|
||||
const auto * haystack = &haystack_data[prev_offset];
|
||||
const auto * haystack_end = haystack + haystack_offsets[j] - prev_offset - 1;
|
||||
if (iteration == 0 || !res[j])
|
||||
res[j] = searcher.searchOne(haystack, haystack_end);
|
||||
prev_offset = haystack_offsets[j];
|
||||
}
|
||||
++iteration;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Impl>
|
||||
struct MultiSearchFirstPositionImpl
|
||||
{
|
||||
using ResultType = UInt64;
|
||||
static constexpr bool is_using_hyperscan = false;
|
||||
/// Variable for understanding, if we used offsets for the output, most
|
||||
/// likely to determine whether the function returns ColumnVector of ColumnArray.
|
||||
static constexpr bool is_column_array = false;
|
||||
static auto getReturnType() { return std::make_shared<DataTypeNumber<ResultType>>(); }
|
||||
|
||||
static void vectorConstant(
|
||||
const ColumnString::Chars & haystack_data,
|
||||
const ColumnString::Offsets & haystack_offsets,
|
||||
const std::vector<StringRef> & needles,
|
||||
PaddedPODArray<UInt64> & res,
|
||||
[[maybe_unused]] PaddedPODArray<UInt64> & offsets)
|
||||
{
|
||||
auto res_callback = [](const UInt8 * start, const UInt8 * end) -> UInt64
|
||||
{
|
||||
return 1 + Impl::countChars(reinterpret_cast<const char *>(start), reinterpret_cast<const char *>(end));
|
||||
};
|
||||
auto searcher = Impl::createMultiSearcherInBigHaystack(needles);
|
||||
const size_t haystack_string_size = haystack_offsets.size();
|
||||
res.resize(haystack_string_size);
|
||||
size_t iteration = 0;
|
||||
while (searcher.hasMoreToSearch())
|
||||
{
|
||||
size_t prev_offset = 0;
|
||||
for (size_t j = 0; j < haystack_string_size; ++j)
|
||||
{
|
||||
const auto * haystack = &haystack_data[prev_offset];
|
||||
const auto * haystack_end = haystack + haystack_offsets[j] - prev_offset - 1;
|
||||
if (iteration == 0 || res[j] == 0)
|
||||
res[j] = searcher.searchOneFirstPosition(haystack, haystack_end, res_callback);
|
||||
else
|
||||
{
|
||||
UInt64 result = searcher.searchOneFirstPosition(haystack, haystack_end, res_callback);
|
||||
if (result != 0)
|
||||
res[j] = std::min(result, res[j]);
|
||||
}
|
||||
prev_offset = haystack_offsets[j];
|
||||
}
|
||||
++iteration;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Impl>
|
||||
struct MultiSearchFirstIndexImpl
|
||||
{
|
||||
using ResultType = UInt64;
|
||||
static constexpr bool is_using_hyperscan = false;
|
||||
/// Variable for understanding, if we used offsets for the output, most
|
||||
/// likely to determine whether the function returns ColumnVector of ColumnArray.
|
||||
static constexpr bool is_column_array = false;
|
||||
static auto getReturnType() { return std::make_shared<DataTypeNumber<ResultType>>(); }
|
||||
|
||||
static void vectorConstant(
|
||||
const ColumnString::Chars & haystack_data,
|
||||
const ColumnString::Offsets & haystack_offsets,
|
||||
const std::vector<StringRef> & needles,
|
||||
PaddedPODArray<UInt64> & res,
|
||||
[[maybe_unused]] PaddedPODArray<UInt64> & offsets)
|
||||
{
|
||||
auto searcher = Impl::createMultiSearcherInBigHaystack(needles);
|
||||
const size_t haystack_string_size = haystack_offsets.size();
|
||||
res.resize(haystack_string_size);
|
||||
size_t iteration = 0;
|
||||
while (searcher.hasMoreToSearch())
|
||||
{
|
||||
size_t prev_offset = 0;
|
||||
for (size_t j = 0; j < haystack_string_size; ++j)
|
||||
{
|
||||
const auto * haystack = &haystack_data[prev_offset];
|
||||
const auto * haystack_end = haystack + haystack_offsets[j] - prev_offset - 1;
|
||||
/// hasMoreToSearch traverse needles in increasing order
|
||||
if (iteration == 0 || res[j] == 0)
|
||||
res[j] = searcher.searchOneFirstIndex(haystack, haystack_end);
|
||||
prev_offset = haystack_offsets[j];
|
||||
}
|
||||
++iteration;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
/** Token search the string, means that needle must be surrounded by some separator chars, like whitespace or puctuation.
|
||||
*/
|
||||
template <typename TokenSearcher, bool negate_result = false>
|
||||
struct HasTokenImpl
|
||||
{
|
||||
using ResultType = UInt8;
|
||||
|
||||
static constexpr bool use_default_implementation_for_constants = true;
|
||||
|
||||
static void vectorConstant(
|
||||
const ColumnString::Chars & data, const ColumnString::Offsets & offsets, const std::string & pattern, PaddedPODArray<UInt8> & res)
|
||||
{
|
||||
if (offsets.empty())
|
||||
return;
|
||||
|
||||
const UInt8 * begin = data.data();
|
||||
const UInt8 * pos = begin;
|
||||
const UInt8 * end = pos + data.size();
|
||||
|
||||
/// The current index in the array of strings.
|
||||
size_t i = 0;
|
||||
|
||||
TokenSearcher searcher(pattern.data(), pattern.size(), end - pos);
|
||||
|
||||
/// We will search for the next occurrence in all rows at once.
|
||||
while (pos < end && end != (pos = searcher.search(pos, end - pos)))
|
||||
{
|
||||
/// Let's determine which index it refers to.
|
||||
while (begin + offsets[i] <= pos)
|
||||
{
|
||||
res[i] = negate_result;
|
||||
++i;
|
||||
}
|
||||
|
||||
/// We check that the entry does not pass through the boundaries of strings.
|
||||
if (pos + pattern.size() < begin + offsets[i])
|
||||
res[i] = !negate_result;
|
||||
else
|
||||
res[i] = negate_result;
|
||||
|
||||
pos = begin + offsets[i];
|
||||
++i;
|
||||
}
|
||||
|
||||
/// Tail, in which there can be no substring.
|
||||
if (i < res.size())
|
||||
memset(&res[i], negate_result, (res.size() - i) * sizeof(res[0]));
|
||||
}
|
||||
|
||||
template <typename... Args>
|
||||
static void vectorVector(Args &&...)
|
||||
{
|
||||
throw Exception("Function 'hasToken' does not support non-constant needle argument", ErrorCodes::ILLEGAL_COLUMN);
|
||||
}
|
||||
|
||||
/// Search different needles in single haystack.
|
||||
template <typename... Args>
|
||||
static void constantVector(Args &&...)
|
||||
{
|
||||
throw Exception("Function 'hasToken' does not support non-constant needle argument", ErrorCodes::ILLEGAL_COLUMN);
|
||||
}
|
||||
|
||||
template <typename... Args>
|
||||
static void vectorFixedConstant(Args &&...)
|
||||
{
|
||||
throw Exception("Functions 'hasToken' don't support FixedString haystack argument", ErrorCodes::ILLEGAL_COLUMN);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
struct NamePosition
|
||||
{
|
||||
static constexpr auto name = "position";
|
||||
};
|
||||
struct NamePositionUTF8
|
||||
{
|
||||
static constexpr auto name = "positionUTF8";
|
||||
};
|
||||
struct NamePositionCaseInsensitive
|
||||
{
|
||||
static constexpr auto name = "positionCaseInsensitive";
|
||||
};
|
||||
struct NamePositionCaseInsensitiveUTF8
|
||||
{
|
||||
static constexpr auto name = "positionCaseInsensitiveUTF8";
|
||||
};
|
||||
struct NameMultiSearchAllPositions
|
||||
{
|
||||
static constexpr auto name = "multiSearchAllPositions";
|
||||
};
|
||||
struct NameMultiSearchAllPositionsUTF8
|
||||
{
|
||||
static constexpr auto name = "multiSearchAllPositionsUTF8";
|
||||
};
|
||||
struct NameMultiSearchAllPositionsCaseInsensitive
|
||||
{
|
||||
static constexpr auto name = "multiSearchAllPositionsCaseInsensitive";
|
||||
};
|
||||
struct NameMultiSearchAllPositionsCaseInsensitiveUTF8
|
||||
{
|
||||
static constexpr auto name = "multiSearchAllPositionsCaseInsensitiveUTF8";
|
||||
};
|
||||
struct NameMultiSearchAny
|
||||
{
|
||||
static constexpr auto name = "multiSearchAny";
|
||||
};
|
||||
struct NameMultiSearchAnyUTF8
|
||||
{
|
||||
static constexpr auto name = "multiSearchAnyUTF8";
|
||||
};
|
||||
struct NameMultiSearchAnyCaseInsensitive
|
||||
{
|
||||
static constexpr auto name = "multiSearchAnyCaseInsensitive";
|
||||
};
|
||||
struct NameMultiSearchAnyCaseInsensitiveUTF8
|
||||
{
|
||||
static constexpr auto name = "multiSearchAnyCaseInsensitiveUTF8";
|
||||
};
|
||||
struct NameMultiSearchFirstIndex
|
||||
{
|
||||
static constexpr auto name = "multiSearchFirstIndex";
|
||||
};
|
||||
struct NameMultiSearchFirstIndexUTF8
|
||||
{
|
||||
static constexpr auto name = "multiSearchFirstIndexUTF8";
|
||||
};
|
||||
struct NameMultiSearchFirstIndexCaseInsensitive
|
||||
{
|
||||
static constexpr auto name = "multiSearchFirstIndexCaseInsensitive";
|
||||
};
|
||||
struct NameMultiSearchFirstIndexCaseInsensitiveUTF8
|
||||
{
|
||||
static constexpr auto name = "multiSearchFirstIndexCaseInsensitiveUTF8";
|
||||
};
|
||||
struct NameMultiSearchFirstPosition
|
||||
{
|
||||
static constexpr auto name = "multiSearchFirstPosition";
|
||||
};
|
||||
struct NameMultiSearchFirstPositionUTF8
|
||||
{
|
||||
static constexpr auto name = "multiSearchFirstPositionUTF8";
|
||||
};
|
||||
struct NameMultiSearchFirstPositionCaseInsensitive
|
||||
{
|
||||
static constexpr auto name = "multiSearchFirstPositionCaseInsensitive";
|
||||
};
|
||||
struct NameMultiSearchFirstPositionCaseInsensitiveUTF8
|
||||
{
|
||||
static constexpr auto name = "multiSearchFirstPositionCaseInsensitiveUTF8";
|
||||
};
|
||||
|
||||
struct NameHasToken
|
||||
{
|
||||
static constexpr auto name = "hasToken";
|
||||
};
|
||||
|
||||
struct NameHasTokenCaseInsensitive
|
||||
{
|
||||
static constexpr auto name = "hasTokenCaseInsensitive";
|
||||
};
|
||||
|
||||
|
||||
using FunctionPosition = FunctionsStringSearch<PositionImpl<PositionCaseSensitiveASCII>, NamePosition>;
|
||||
using FunctionPositionUTF8 = FunctionsStringSearch<PositionImpl<PositionCaseSensitiveUTF8>, NamePositionUTF8>;
|
||||
using FunctionPositionCaseInsensitive = FunctionsStringSearch<PositionImpl<PositionCaseInsensitiveASCII>, NamePositionCaseInsensitive>;
|
||||
using FunctionPositionCaseInsensitiveUTF8
|
||||
= FunctionsStringSearch<PositionImpl<PositionCaseInsensitiveUTF8>, NamePositionCaseInsensitiveUTF8>;
|
||||
|
||||
using FunctionMultiSearchAllPositions
|
||||
= FunctionsMultiStringPosition<MultiSearchAllPositionsImpl<PositionCaseSensitiveASCII>, NameMultiSearchAllPositions>;
|
||||
using FunctionMultiSearchAllPositionsUTF8
|
||||
= FunctionsMultiStringPosition<MultiSearchAllPositionsImpl<PositionCaseSensitiveUTF8>, NameMultiSearchAllPositionsUTF8>;
|
||||
using FunctionMultiSearchAllPositionsCaseInsensitive
|
||||
= FunctionsMultiStringPosition<MultiSearchAllPositionsImpl<PositionCaseInsensitiveASCII>, NameMultiSearchAllPositionsCaseInsensitive>;
|
||||
using FunctionMultiSearchAllPositionsCaseInsensitiveUTF8 = FunctionsMultiStringPosition<
|
||||
MultiSearchAllPositionsImpl<PositionCaseInsensitiveUTF8>,
|
||||
NameMultiSearchAllPositionsCaseInsensitiveUTF8>;
|
||||
|
||||
using FunctionMultiSearch = FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseSensitiveASCII>, NameMultiSearchAny>;
|
||||
using FunctionMultiSearchUTF8 = FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseSensitiveUTF8>, NameMultiSearchAnyUTF8>;
|
||||
using FunctionMultiSearchCaseInsensitive
|
||||
= FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseInsensitiveASCII>, NameMultiSearchAnyCaseInsensitive>;
|
||||
using FunctionMultiSearchCaseInsensitiveUTF8
|
||||
= FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseInsensitiveUTF8>, NameMultiSearchAnyCaseInsensitiveUTF8>;
|
||||
|
||||
using FunctionMultiSearchFirstIndex
|
||||
= FunctionsMultiStringSearch<MultiSearchFirstIndexImpl<PositionCaseSensitiveASCII>, NameMultiSearchFirstIndex>;
|
||||
using FunctionMultiSearchFirstIndexUTF8
|
||||
= FunctionsMultiStringSearch<MultiSearchFirstIndexImpl<PositionCaseSensitiveUTF8>, NameMultiSearchFirstIndexUTF8>;
|
||||
using FunctionMultiSearchFirstIndexCaseInsensitive
|
||||
= FunctionsMultiStringSearch<MultiSearchFirstIndexImpl<PositionCaseInsensitiveASCII>, NameMultiSearchFirstIndexCaseInsensitive>;
|
||||
using FunctionMultiSearchFirstIndexCaseInsensitiveUTF8
|
||||
= FunctionsMultiStringSearch<MultiSearchFirstIndexImpl<PositionCaseInsensitiveUTF8>, NameMultiSearchFirstIndexCaseInsensitiveUTF8>;
|
||||
|
||||
using FunctionMultiSearchFirstPosition
|
||||
= FunctionsMultiStringSearch<MultiSearchFirstPositionImpl<PositionCaseSensitiveASCII>, NameMultiSearchFirstPosition>;
|
||||
using FunctionMultiSearchFirstPositionUTF8
|
||||
= FunctionsMultiStringSearch<MultiSearchFirstPositionImpl<PositionCaseSensitiveUTF8>, NameMultiSearchFirstPositionUTF8>;
|
||||
using FunctionMultiSearchFirstPositionCaseInsensitive
|
||||
= FunctionsMultiStringSearch<MultiSearchFirstPositionImpl<PositionCaseInsensitiveASCII>, NameMultiSearchFirstPositionCaseInsensitive>;
|
||||
using FunctionMultiSearchFirstPositionCaseInsensitiveUTF8 = FunctionsMultiStringSearch<
|
||||
MultiSearchFirstPositionImpl<PositionCaseInsensitiveUTF8>,
|
||||
NameMultiSearchFirstPositionCaseInsensitiveUTF8>;
|
||||
|
||||
using FunctionHasToken = FunctionsStringSearch<HasTokenImpl<VolnitskyCaseSensitiveToken, false>, NameHasToken>;
|
||||
using FunctionHasTokenCaseInsensitive
|
||||
= FunctionsStringSearch<HasTokenImpl<VolnitskyCaseInsensitiveToken, false>, NameHasTokenCaseInsensitive>;
|
||||
|
||||
void registerFunctionsStringSearch(FunctionFactory & factory)
|
||||
{
|
||||
factory.registerFunction<FunctionPosition>(FunctionFactory::CaseInsensitive);
|
||||
factory.registerFunction<FunctionPositionUTF8>();
|
||||
factory.registerFunction<FunctionPositionCaseInsensitive>();
|
||||
factory.registerFunction<FunctionPositionCaseInsensitiveUTF8>();
|
||||
|
||||
factory.registerFunction<FunctionMultiSearchAllPositions>();
|
||||
factory.registerFunction<FunctionMultiSearchAllPositionsUTF8>();
|
||||
factory.registerFunction<FunctionMultiSearchAllPositionsCaseInsensitive>();
|
||||
factory.registerFunction<FunctionMultiSearchAllPositionsCaseInsensitiveUTF8>();
|
||||
|
||||
factory.registerFunction<FunctionMultiSearch>();
|
||||
factory.registerFunction<FunctionMultiSearchUTF8>();
|
||||
factory.registerFunction<FunctionMultiSearchCaseInsensitive>();
|
||||
factory.registerFunction<FunctionMultiSearchCaseInsensitiveUTF8>();
|
||||
|
||||
factory.registerFunction<FunctionMultiSearchFirstIndex>();
|
||||
factory.registerFunction<FunctionMultiSearchFirstIndexUTF8>();
|
||||
factory.registerFunction<FunctionMultiSearchFirstIndexCaseInsensitive>();
|
||||
factory.registerFunction<FunctionMultiSearchFirstIndexCaseInsensitiveUTF8>();
|
||||
|
||||
factory.registerFunction<FunctionMultiSearchFirstPosition>();
|
||||
factory.registerFunction<FunctionMultiSearchFirstPositionUTF8>();
|
||||
factory.registerFunction<FunctionMultiSearchFirstPositionCaseInsensitive>();
|
||||
factory.registerFunction<FunctionMultiSearchFirstPositionCaseInsensitiveUTF8>();
|
||||
|
||||
factory.registerFunction<FunctionHasToken>();
|
||||
factory.registerFunction<FunctionHasTokenCaseInsensitive>();
|
||||
|
||||
factory.registerAlias("locate", NamePosition::name, FunctionFactory::CaseInsensitive);
|
||||
}
|
||||
}
|
@ -23,7 +23,7 @@ It builds only binaries, not packages.
|
||||
|
||||
For example, run server:
|
||||
```
|
||||
cd $(git rev-parse --show-toplevel)/dbms/Server
|
||||
cd $(git rev-parse --show-toplevel)/src/Server
|
||||
$(git rev-parse --show-toplevel)/docker/builder/programs/clickhouse server --config-file $(git rev-parse --show-toplevel)/programs/server/config.xml
|
||||
```
|
||||
|
||||
|
@ -17,7 +17,7 @@ cmake .. -LA -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DSANITIZE=$SANITIZER $CMAKE_FLAGS
|
||||
ninja
|
||||
ccache --show-stats ||:
|
||||
mv ./programs/clickhouse* /output
|
||||
mv ./dbms/unit_tests_dbms /output
|
||||
mv ./src/unit_tests_dbms /output
|
||||
find . -name '*.so' -print -exec mv '{}' /output \;
|
||||
find . -name '*.so.*' -print -exec mv '{}' /output \;
|
||||
|
||||
|
@ -97,10 +97,6 @@ function run_tests
|
||||
touch "$x"
|
||||
done
|
||||
|
||||
# FIXME remove some broken long tests
|
||||
rm "$test_prefix"/{IPv4,IPv6,modulo,parse_engine_file,number_formatting_formats,select_format}.xml ||:
|
||||
|
||||
test_files=$(ls "$test_prefix"/*.xml)
|
||||
|
||||
# FIXME a quick crutch to bring the run time down for the unstable tests --
|
||||
# if some performance tests xmls were changed in a PR, run only these ones.
|
||||
@ -111,7 +107,7 @@ function run_tests
|
||||
# and not always correct (e.g. when the reference SHA is really old and
|
||||
# has some other differences to the tested SHA, besides the one introduced
|
||||
# by the PR).
|
||||
test_files_override=$(sed "s/dbms\/tests\/performance/${test_prefix//\//\\/}/" changed-tests.txt)
|
||||
test_files_override=$(sed "s/tests\/performance/${test_prefix//\//\\/}/" changed-tests.txt)
|
||||
if [ "$test_files_override" != "" ]
|
||||
then
|
||||
test_files=$test_files_override
|
||||
@ -126,6 +122,17 @@ function run_tests
|
||||
test_files=$(ls "$test_prefix"/$CHPC_TEST_GLOB.xml)
|
||||
fi
|
||||
|
||||
if [ "$test_files" == "" ]
|
||||
then
|
||||
# FIXME remove some broken long tests
|
||||
for test_name in {IPv4,IPv6,modulo,parse_engine_file,number_formatting_formats,select_format,arithmetic,cryptographic_hashes,logical_functions_{medium,small}}
|
||||
do
|
||||
printf "$test_name\tMarked as broken (see compare.sh)\n" >> skipped-tests.tsv
|
||||
rm "$test_prefix/$test_name.xml" ||:
|
||||
done
|
||||
test_files=$(ls "$test_prefix"/*.xml)
|
||||
fi
|
||||
|
||||
# Run the tests.
|
||||
test_name="<none>"
|
||||
for test in $test_files
|
||||
@ -275,9 +282,11 @@ create table test_times_tsv engine File(TSV, 'test-times.tsv') as
|
||||
from test_time join wall_clock using test
|
||||
order by avg_real_per_query desc;
|
||||
|
||||
create table all_queries_tsv engine File(TSV, 'all-queries.tsv') as
|
||||
select left, right, diff, rd, test, query
|
||||
from queries order by rd[3] desc;
|
||||
create table all_tests_tsv engine File(TSV, 'all-queries.tsv') as
|
||||
select left, right, diff,
|
||||
floor(left > right ? left / right : right / left, 3),
|
||||
rd, test, query
|
||||
from queries order by test, query;
|
||||
" 2> >(head -2 >> report-errors.rep) ||:
|
||||
|
||||
for version in {right,left}
|
||||
@ -397,7 +406,7 @@ unset IFS
|
||||
|
||||
# Remember that grep sets error code when nothing is found, hence the bayan
|
||||
# operator.
|
||||
grep -H -m2 '\(Exception\|Error\):[^:]' ./*-err.log | sed 's/:/\t/' > run-errors.tsv ||:
|
||||
grep -H -m2 -i '\(Exception\|Error\):[^:]' ./*-err.log | sed 's/:/\t/' > run-errors.tsv ||:
|
||||
}
|
||||
|
||||
case "$stage" in
|
||||
@ -429,6 +438,7 @@ case "$stage" in
|
||||
"report")
|
||||
time report ||:
|
||||
|
||||
time "$script_dir/report.py" --report=all-queries > all-queries.html 2> >(head -2 >> report-errors.rep) ||:
|
||||
time "$script_dir/report.py" > report.html
|
||||
;&
|
||||
esac
|
||||
|
@ -90,17 +90,23 @@ export PYTHONIOENCODING=utf-8
|
||||
# Use a default number of runs if not told otherwise
|
||||
export CHPC_RUNS=${CHPC_RUNS:-7}
|
||||
|
||||
# By default, use the main comparison script from the tested package, so that we
|
||||
# can change it in PRs.
|
||||
script_path="right/scripts"
|
||||
if [ -v CHPC_LOCAL_SCRIPT ]
|
||||
then
|
||||
script_path=".."
|
||||
fi
|
||||
|
||||
# Even if we have some errors, try our best to save the logs.
|
||||
set +e
|
||||
|
||||
# Use main comparison script from the tested package, so that we can change it
|
||||
# in PRs.
|
||||
# Older version use 'kill 0', so put the script into a separate process group
|
||||
# FIXME remove set +m in April 2020
|
||||
set +m
|
||||
{ \
|
||||
time ../download.sh "$REF_PR" "$REF_SHA" "$PR_TO_TEST" "$SHA_TO_TEST" && \
|
||||
time stage=configure right/scripts/compare.sh ; \
|
||||
time stage=configure "$script_path"/compare.sh ; \
|
||||
} 2>&1 | ts "$(printf '%%Y-%%m-%%d %%H:%%M:%%S\t')" | tee compare.log
|
||||
set -m
|
||||
|
||||
|
@ -1,5 +1,6 @@
|
||||
#!/usr/bin/python3
|
||||
|
||||
import argparse
|
||||
import ast
|
||||
import collections
|
||||
import csv
|
||||
@ -8,6 +9,11 @@ import os
|
||||
import sys
|
||||
import traceback
|
||||
|
||||
parser = argparse.ArgumentParser(description='Create performance test report')
|
||||
parser.add_argument('--report', default='main', choices=['main', 'all-queries'],
|
||||
help='Which report to build')
|
||||
args = parser.parse_args()
|
||||
|
||||
report_errors = []
|
||||
error_tests = 0
|
||||
slow_average_tests = 0
|
||||
@ -16,7 +22,7 @@ slower_queries = 0
|
||||
unstable_queries = 0
|
||||
very_unstable_queries = 0
|
||||
|
||||
print("""
|
||||
header_template = """
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<style>
|
||||
@ -56,7 +62,7 @@ tr:nth-child(odd) td {{filter: brightness(95%);}}
|
||||
<div class="main">
|
||||
|
||||
<h1>ClickHouse performance comparison</h1>
|
||||
""".format())
|
||||
"""
|
||||
|
||||
table_anchor = 0
|
||||
row_anchor = 0
|
||||
@ -133,195 +139,252 @@ def printSimpleTable(caption, columns, rows):
|
||||
print(tableRow(row))
|
||||
print(tableEnd())
|
||||
|
||||
printSimpleTable('Tested commits', ['Old', 'New'],
|
||||
[['<pre>{}</pre>'.format(x) for x in
|
||||
[open('left-commit.txt').read(),
|
||||
open('right-commit.txt').read()]]])
|
||||
if args.report == 'main':
|
||||
print(header_template.format())
|
||||
|
||||
def print_changes():
|
||||
rows = tsvRows('changed-perf.tsv')
|
||||
if not rows:
|
||||
return
|
||||
printSimpleTable('Tested commits', ['Old', 'New'],
|
||||
[['<pre>{}</pre>'.format(x) for x in
|
||||
[open('left-commit.txt').read(),
|
||||
open('right-commit.txt').read()]]])
|
||||
|
||||
global faster_queries, slower_queries
|
||||
def print_changes():
|
||||
rows = tsvRows('changed-perf.tsv')
|
||||
if not rows:
|
||||
return
|
||||
|
||||
print(tableStart('Changes in performance'))
|
||||
columns = [
|
||||
'Old, s', # 0
|
||||
'New, s', # 1
|
||||
'Relative difference (new - old)/old', # 2
|
||||
'Randomization distribution quantiles \
|
||||
[5%, 50%, 95%, 99%]', # 3
|
||||
'Test', # 4
|
||||
'Query', # 5
|
||||
global faster_queries, slower_queries
|
||||
|
||||
print(tableStart('Changes in performance'))
|
||||
columns = [
|
||||
'Old, s', # 0
|
||||
'New, s', # 1
|
||||
'Relative difference (new - old)/old', # 2
|
||||
'Randomization distribution quantiles \
|
||||
[5%, 50%, 95%, 99%]', # 3
|
||||
'Test', # 4
|
||||
'Query', # 5
|
||||
]
|
||||
|
||||
print(tableHeader(columns))
|
||||
|
||||
attrs = ['' for c in columns]
|
||||
for row in rows:
|
||||
if float(row[2]) < 0.:
|
||||
faster_queries += 1
|
||||
attrs[2] = 'style="background: #adbdff"'
|
||||
else:
|
||||
slower_queries += 1
|
||||
attrs[2] = 'style="background: #ffb0a0"'
|
||||
|
||||
print(tableRow(row, attrs))
|
||||
|
||||
print(tableEnd())
|
||||
|
||||
print_changes()
|
||||
|
||||
slow_on_client_rows = tsvRows('slow-on-client.tsv')
|
||||
error_tests += len(slow_on_client_rows)
|
||||
printSimpleTable('Slow on client',
|
||||
['Client time, s', 'Server time, s', 'Ratio', 'Query'],
|
||||
slow_on_client_rows)
|
||||
|
||||
def print_unstable_queries():
|
||||
global unstable_queries
|
||||
global very_unstable_queries
|
||||
|
||||
unstable_rows = tsvRows('unstable-queries.tsv')
|
||||
if not unstable_rows:
|
||||
return
|
||||
|
||||
unstable_queries += len(unstable_rows)
|
||||
|
||||
columns = [
|
||||
'Old, s', #0
|
||||
'New, s', #1
|
||||
'Relative difference (new - old)/old', #2
|
||||
'Randomization distribution quantiles [5%, 50%, 95%, 99%]', #3
|
||||
'Test', #4
|
||||
'Query' #5
|
||||
]
|
||||
|
||||
print(tableHeader(columns))
|
||||
print(tableStart('Unstable queries'))
|
||||
print(tableHeader(columns))
|
||||
|
||||
attrs = ['' for c in columns]
|
||||
for row in rows:
|
||||
if float(row[2]) < 0.:
|
||||
faster_queries += 1
|
||||
attrs[2] = 'style="background: #adbdff"'
|
||||
else:
|
||||
slower_queries += 1
|
||||
attrs[2] = 'style="background: #ffb0a0"'
|
||||
attrs = ['' for c in columns]
|
||||
for r in unstable_rows:
|
||||
rd = ast.literal_eval(r[3])
|
||||
# Note the zero-based array index, this is rd[3] in SQL.
|
||||
if rd[2] > 0.2:
|
||||
very_unstable_queries += 1
|
||||
attrs[3] = 'style="background: #ffb0a0"'
|
||||
else:
|
||||
attrs[3] = ''
|
||||
|
||||
print(tableRow(row, attrs))
|
||||
print(tableRow(r, attrs))
|
||||
|
||||
print(tableEnd())
|
||||
print(tableEnd())
|
||||
|
||||
print_changes()
|
||||
print_unstable_queries()
|
||||
|
||||
slow_on_client_rows = tsvRows('slow-on-client.tsv')
|
||||
error_tests += len(slow_on_client_rows)
|
||||
printSimpleTable('Slow on client',
|
||||
['Client time, s', 'Server time, s', 'Ratio', 'Query'],
|
||||
slow_on_client_rows)
|
||||
run_error_rows = tsvRows('run-errors.tsv')
|
||||
error_tests += len(run_error_rows)
|
||||
printSimpleTable('Run errors', ['Test', 'Error'], run_error_rows)
|
||||
|
||||
def print_unstable_queries():
|
||||
global unstable_queries
|
||||
global very_unstable_queries
|
||||
skipped_tests_rows = tsvRows('skipped-tests.tsv')
|
||||
printSimpleTable('Skipped tests', ['Test', 'Reason'], skipped_tests_rows)
|
||||
|
||||
unstable_rows = tsvRows('unstable-queries.tsv')
|
||||
if not unstable_rows:
|
||||
return
|
||||
printSimpleTable('Tests with most unstable queries',
|
||||
['Test', 'Unstable', 'Changed perf', 'Total not OK'],
|
||||
tsvRows('bad-tests.tsv'))
|
||||
|
||||
unstable_queries += len(unstable_rows)
|
||||
def print_test_times():
|
||||
global slow_average_tests
|
||||
rows = tsvRows('test-times.tsv')
|
||||
if not rows:
|
||||
return
|
||||
|
||||
columns = [
|
||||
'Old, s', #0
|
||||
'New, s', #1
|
||||
'Relative difference (new - old)/old', #2
|
||||
'Randomization distribution quantiles [5%, 50%, 95%, 99%]', #3
|
||||
'Test', #4
|
||||
'Query' #5
|
||||
]
|
||||
columns = [
|
||||
'Test', #0
|
||||
'Wall clock time, s', #1
|
||||
'Total client time, s', #2
|
||||
'Total queries', #3
|
||||
'Ignored short queries', #4
|
||||
'Longest query<br>(sum for all runs), s', #5
|
||||
'Avg wall clock time<br>(sum for all runs), s', #6
|
||||
'Shortest query<br>(sum for all runs), s', #7
|
||||
]
|
||||
|
||||
print(tableStart('Unstable queries'))
|
||||
print(tableHeader(columns))
|
||||
print(tableStart('Test times'))
|
||||
print(tableHeader(columns))
|
||||
|
||||
attrs = ['' for c in columns]
|
||||
for r in unstable_rows:
|
||||
rd = ast.literal_eval(r[3])
|
||||
# Note the zero-based array index, this is rd[3] in SQL.
|
||||
if rd[2] > 0.2:
|
||||
very_unstable_queries += 1
|
||||
attrs[3] = 'style="background: #ffb0a0"'
|
||||
else:
|
||||
attrs[3] = ''
|
||||
attrs = ['' for c in columns]
|
||||
for r in rows:
|
||||
if float(r[6]) > 22:
|
||||
# FIXME should be 15s max -- investigate parallel_insert
|
||||
slow_average_tests += 1
|
||||
attrs[6] = 'style="background: #ffb0a0"'
|
||||
else:
|
||||
attrs[6] = ''
|
||||
|
||||
print(tableRow(r, attrs))
|
||||
if float(r[5]) > 30:
|
||||
slow_average_tests += 1
|
||||
attrs[5] = 'style="background: #ffb0a0"'
|
||||
else:
|
||||
attrs[5] = ''
|
||||
|
||||
print(tableEnd())
|
||||
print(tableRow(r, attrs))
|
||||
|
||||
print_unstable_queries()
|
||||
print(tableEnd())
|
||||
|
||||
run_error_rows = tsvRows('run-errors.tsv')
|
||||
error_tests += len(run_error_rows)
|
||||
printSimpleTable('Run errors', ['Test', 'Error'], run_error_rows)
|
||||
print_test_times()
|
||||
|
||||
skipped_tests_rows = tsvRows('skipped-tests.tsv')
|
||||
printSimpleTable('Skipped tests', ['Test', 'Reason'], skipped_tests_rows)
|
||||
|
||||
printSimpleTable('Tests with most unstable queries',
|
||||
['Test', 'Unstable', 'Changed perf', 'Total not OK'],
|
||||
tsvRows('bad-tests.tsv'))
|
||||
|
||||
def print_test_times():
|
||||
global slow_average_tests
|
||||
rows = tsvRows('test-times.tsv')
|
||||
if not rows:
|
||||
return
|
||||
|
||||
columns = [
|
||||
'Test', #0
|
||||
'Wall clock time, s', #1
|
||||
'Total client time, s', #2
|
||||
'Total queries', #3
|
||||
'Ignored short queries', #4
|
||||
'Longest query<br>(sum for all runs), s', #5
|
||||
'Avg wall clock time<br>(sum for all runs), s', #6
|
||||
'Shortest query<br>(sum for all runs), s', #7
|
||||
]
|
||||
|
||||
print(tableStart('Test times'))
|
||||
print(tableHeader(columns))
|
||||
|
||||
attrs = ['' for c in columns]
|
||||
for r in rows:
|
||||
if float(r[6]) > 22:
|
||||
# FIXME should be 15s max -- investigate parallel_insert
|
||||
slow_average_tests += 1
|
||||
attrs[6] = 'style="background: #ffb0a0"'
|
||||
else:
|
||||
attrs[6] = ''
|
||||
|
||||
if float(r[5]) > 30:
|
||||
slow_average_tests += 1
|
||||
attrs[5] = 'style="background: #ffb0a0"'
|
||||
else:
|
||||
attrs[5] = ''
|
||||
|
||||
print(tableRow(r, attrs))
|
||||
|
||||
print(tableEnd())
|
||||
|
||||
print_test_times()
|
||||
|
||||
# Add the errors reported by various steps of comparison script
|
||||
report_errors += [l.strip() for l in open('report-errors.rep')]
|
||||
if len(report_errors):
|
||||
print(tableStart('Errors while building the report'))
|
||||
print(tableHeader(['Error']))
|
||||
for x in report_errors:
|
||||
print(tableRow([x]))
|
||||
print(tableEnd())
|
||||
# Add the errors reported by various steps of comparison script
|
||||
report_errors += [l.strip() for l in open('report-errors.rep')]
|
||||
if len(report_errors):
|
||||
print(tableStart('Errors while building the report'))
|
||||
print(tableHeader(['Error']))
|
||||
for x in report_errors:
|
||||
print(tableRow([x]))
|
||||
print(tableEnd())
|
||||
|
||||
|
||||
print("""
|
||||
<p class="links">
|
||||
<a href="output.7z">Test output</a>
|
||||
<a href="compare.log">Log</a>
|
||||
</p>
|
||||
</body>
|
||||
</html>
|
||||
""")
|
||||
print("""
|
||||
<p class="links">
|
||||
<a href="output.7z">Test output</a>
|
||||
<a href="all-queries.html">All queries</a>
|
||||
<a href="compare.log">Log</a>
|
||||
</p>
|
||||
</body>
|
||||
</html>
|
||||
""")
|
||||
|
||||
status = 'success'
|
||||
message = 'See the report'
|
||||
message_array = []
|
||||
status = 'success'
|
||||
message = 'See the report'
|
||||
message_array = []
|
||||
|
||||
if slow_average_tests:
|
||||
status = 'failure'
|
||||
message_array.append(str(slow_average_tests) + ' too long')
|
||||
if slow_average_tests:
|
||||
status = 'failure'
|
||||
message_array.append(str(slow_average_tests) + ' too long')
|
||||
|
||||
if faster_queries:
|
||||
message_array.append(str(faster_queries) + ' faster')
|
||||
if faster_queries:
|
||||
message_array.append(str(faster_queries) + ' faster')
|
||||
|
||||
if slower_queries:
|
||||
status = 'failure'
|
||||
message_array.append(str(slower_queries) + ' slower')
|
||||
if slower_queries:
|
||||
status = 'failure'
|
||||
message_array.append(str(slower_queries) + ' slower')
|
||||
|
||||
if unstable_queries:
|
||||
message_array.append(str(unstable_queries) + ' unstable')
|
||||
if unstable_queries:
|
||||
message_array.append(str(unstable_queries) + ' unstable')
|
||||
|
||||
if very_unstable_queries:
|
||||
status = 'failure'
|
||||
if very_unstable_queries:
|
||||
status = 'failure'
|
||||
|
||||
error_tests += slow_average_tests
|
||||
if error_tests:
|
||||
status = 'failure'
|
||||
message_array.append(str(error_tests) + ' errors')
|
||||
error_tests += slow_average_tests
|
||||
if error_tests:
|
||||
status = 'failure'
|
||||
message_array.append(str(error_tests) + ' errors')
|
||||
|
||||
if message_array:
|
||||
message = ', '.join(message_array)
|
||||
if message_array:
|
||||
message = ', '.join(message_array)
|
||||
|
||||
if report_errors:
|
||||
status = 'failure'
|
||||
message = 'Errors while building the report.'
|
||||
if report_errors:
|
||||
status = 'failure'
|
||||
message = 'Errors while building the report.'
|
||||
|
||||
print("""
|
||||
<!--status: {status}-->
|
||||
<!--message: {message}-->
|
||||
""".format(status=status, message=message))
|
||||
print("""
|
||||
<!--status: {status}-->
|
||||
<!--message: {message}-->
|
||||
""".format(status=status, message=message))
|
||||
|
||||
elif args.report == 'all-queries':
|
||||
|
||||
print(header_template.format())
|
||||
|
||||
printSimpleTable('Tested commits', ['Old', 'New'],
|
||||
[['<pre>{}</pre>'.format(x) for x in
|
||||
[open('left-commit.txt').read(),
|
||||
open('right-commit.txt').read()]]])
|
||||
|
||||
def print_all_queries():
|
||||
rows = tsvRows('all-queries.tsv')
|
||||
if not rows:
|
||||
return
|
||||
|
||||
columns = [
|
||||
'Old, s', #0
|
||||
'New, s', #1
|
||||
'Relative difference (new - old)/old', #2
|
||||
'Times speedup/slowdown', #3
|
||||
'Randomization distribution quantiles \
|
||||
[5%, 50%, 95%, 99%]', #4
|
||||
'Test', #5
|
||||
'Query', #6
|
||||
]
|
||||
|
||||
print(tableStart('All query times'))
|
||||
print(tableHeader(columns))
|
||||
|
||||
attrs = ['' for c in columns]
|
||||
for r in rows:
|
||||
if float(r[2]) > 0.05:
|
||||
attrs[3] = 'style="background: #ffb0a0"'
|
||||
elif float(r[2]) < -0.05:
|
||||
attrs[3] = 'style="background: #adbdff"'
|
||||
else:
|
||||
attrs[3] = ''
|
||||
|
||||
print(tableRow(r, attrs))
|
||||
|
||||
print(tableEnd())
|
||||
|
||||
print_all_queries()
|
||||
|
||||
print("""
|
||||
<p class="links">
|
||||
<a href="output.7z">Test output</a>
|
||||
<a href="report.html">Main report</a>
|
||||
<a href="compare.log">Log</a>
|
||||
</p>
|
||||
</body>
|
||||
</html>
|
||||
""")
|
||||
|
@ -1 +0,0 @@
|
||||
../../../CHANGELOG.md
|
@ -7,7 +7,7 @@
|
||||
|
||||
[Yandex Managed Service for ClickHouse](https://cloud.yandex.com/services/managed-clickhouse?utm_source=referrals&utm_medium=clickhouseofficialsite&utm_campaign=link3) provides the following key features:
|
||||
|
||||
- Fully managed ZooKeeper service for [ClickHouse replication](../operations/table_engines/replication.md)
|
||||
- Fully managed ZooKeeper service for [ClickHouse replication](../engines/table_engines/mergetree_family/replication.md)
|
||||
- Multiple storage type choices
|
||||
- Replicas in different availability zones
|
||||
- Encryption and isolation
|
||||
|
5
docs/en/commercial/index.md
Normal file
5
docs/en/commercial/index.md
Normal file
@ -0,0 +1,5 @@
|
||||
---
|
||||
toc_folder_title: Commercial
|
||||
toc_priority: 70
|
||||
---
|
||||
|
@ -1,5 +0,0 @@
|
||||
# Set {#set}
|
||||
|
||||
Used for the right half of an [IN](../../query_language/select.md#select-in-operators) expression.
|
||||
|
||||
[Original article](https://clickhouse.tech/docs/en/data_types/special_data_types/set/) <!--hide-->
|
@ -1,3 +1,8 @@
|
||||
---
|
||||
toc_priority: 62
|
||||
toc_title: Overview of ClickHouse Architecture
|
||||
---
|
||||
|
||||
# Overview of ClickHouse Architecture {#overview-of-clickhouse-architecture}
|
||||
|
||||
ClickHouse is a true column-oriented DBMS. Data is stored by columns and during the execution of arrays (vectors or chunks of columns). Whenever possible, operations are dispatched on arrays, rather than on individual values. It is called “vectorized query execution,” and it helps lower the cost of actual data processing.
|
||||
@ -115,7 +120,7 @@ There are ordinary functions and aggregate functions. For aggregate functions, s
|
||||
|
||||
Ordinary functions don’t change the number of rows – they work as if they are processing each row independently. In fact, functions are not called for individual rows, but for `Block`’s of data to implement vectorized query execution.
|
||||
|
||||
There are some miscellaneous functions, like [blockSize](../query_language/functions/other_functions.md#function-blocksize), [rowNumberInBlock](../query_language/functions/other_functions.md#function-rownumberinblock), and [runningAccumulate](../query_language/functions/other_functions.md#function-runningaccumulate), that exploit block processing and violate the independence of rows.
|
||||
There are some miscellaneous functions, like [blockSize](../sql_reference/functions/other_functions.md#function-blocksize), [rowNumberInBlock](../sql_reference/functions/other_functions.md#function-rownumberinblock), and [runningAccumulate](../sql_reference/functions/other_functions.md#function-runningaccumulate), that exploit block processing and violate the independence of rows.
|
||||
|
||||
ClickHouse has strong typing, so there’s no implicit type conversion. If a function doesn’t support a specific combination of types, it throws an exception. But functions can work (be overloaded) for many different combinations of types. For example, the `plus` function (to implement the `+` operator) works for any combination of numeric types: `UInt8` + `Float32`, `UInt16` + `Int8`, and so on. Also, some variadic functions can accept any number of arguments, such as the `concat` function.
|
||||
|
||||
|
@ -1,6 +1,11 @@
|
||||
---
|
||||
toc_priority: 63
|
||||
toc_title: Browse ClickHouse Source Code
|
||||
---
|
||||
|
||||
# Browse ClickHouse Source Code {#browse-clickhouse-source-code}
|
||||
|
||||
You can use **Woboq** online code browser available [here](https://clickhouse-test-reports.s3.yandex.net/codebrowser/html_report///ClickHouse/dbms/index.html). It provides code navigation and semantic highlighting, search and indexing. The code snapshot is updated daily.
|
||||
You can use **Woboq** online code browser available [here](https://clickhouse-test-reports.s3.yandex.net/codebrowser/html_report///ClickHouse/src/index.html). It provides code navigation and semantic highlighting, search and indexing. The code snapshot is updated daily.
|
||||
|
||||
Also, you can browse sources on [GitHub](https://github.com/ClickHouse/ClickHouse) as usual.
|
||||
|
||||
|
@ -1,3 +1,8 @@
|
||||
---
|
||||
toc_priority: 64
|
||||
toc_title: How to Build ClickHouse on Linux
|
||||
---
|
||||
|
||||
# How to Build ClickHouse for Development {#how-to-build-clickhouse-for-development}
|
||||
|
||||
The following tutorial is based on the Ubuntu Linux system.
|
||||
|
@ -1,3 +1,8 @@
|
||||
---
|
||||
toc_priority: 67
|
||||
toc_title: How to Build ClickHouse on Linux for AARCH64 (ARM64)
|
||||
---
|
||||
|
||||
# How to Build ClickHouse on Linux for AARCH64 (ARM64) architecture {#how-to-build-clickhouse-on-linux-for-aarch64-arm64-architecture}
|
||||
|
||||
This is for the case when you have Linux machine and want to use it to build `clickhouse` binary that will run on another Linux machine with AARCH64 CPU architecture. This is intended for continuous integration checks that run on Linux servers.
|
||||
|
@ -1,3 +1,8 @@
|
||||
---
|
||||
toc_priority: 66
|
||||
toc_title: How to Build ClickHouse on Linux for Mac OS X
|
||||
---
|
||||
|
||||
# How to Build ClickHouse on Linux for Mac OS X {#how-to-build-clickhouse-on-linux-for-mac-os-x}
|
||||
|
||||
This is for the case when you have Linux machine and want to use it to build `clickhouse` binary that will run on OS X. This is intended for continuous integration checks that run on Linux servers. If you want to build ClickHouse directly on Mac OS X, then proceed with [another instruction](build_osx.md).
|
||||
|
@ -1,3 +1,8 @@
|
||||
---
|
||||
toc_priority: 65
|
||||
toc_title: How to Build ClickHouse on Mac OS X
|
||||
---
|
||||
|
||||
# How to Build ClickHouse on Mac OS X {#how-to-build-clickhouse-on-mac-os-x}
|
||||
|
||||
Build should work on Mac OS X 10.15 (Catalina)
|
||||
|
@ -1,7 +1,12 @@
|
||||
---
|
||||
toc_priority: 70
|
||||
toc_title: Third-Party Libraries Used
|
||||
---
|
||||
|
||||
# Third-Party Libraries Used {#third-party-libraries-used}
|
||||
|
||||
| Library | License |
|
||||
|---------------------|----------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
|-------------|--------------------------------------------------------------------------------------|
|
||||
| base64 | [BSD 2-Clause License](https://github.com/aklomp/base64/blob/a27c565d1b6c676beaf297fe503c4518185666f7/LICENSE) |
|
||||
| boost | [Boost Software License 1.0](https://github.com/ClickHouse-Extras/boost-extra/blob/6883b40449f378019aec792f9983ce3afc7ff16e/LICENSE_1_0.txt) |
|
||||
| brotli | [MIT](https://github.com/google/brotli/blob/master/LICENSE) |
|
||||
|
@ -1,3 +1,8 @@
|
||||
---
|
||||
toc_priority: 61
|
||||
toc_title: The Beginner ClickHouse Developer Instruction
|
||||
---
|
||||
|
||||
Building of ClickHouse is supported on Linux, FreeBSD and Mac OS X.
|
||||
|
||||
# If you use Windows {#if-you-use-windows}
|
||||
|
@ -1,3 +1,10 @@
|
||||
---
|
||||
toc_folder_title: Development
|
||||
toc_hidden: true
|
||||
toc_priority: 58
|
||||
toc_title: hidden
|
||||
---
|
||||
|
||||
# ClickHouse Development {#clickhouse-development}
|
||||
|
||||
[Original article](https://clickhouse.tech/docs/en/development/) <!--hide-->
|
||||
|
@ -1,3 +1,8 @@
|
||||
---
|
||||
toc_priority: 68
|
||||
toc_title: How to Write C++ Code
|
||||
---
|
||||
|
||||
# How to Write C++ Code {#how-to-write-c-code}
|
||||
|
||||
## General Recommendations {#general-recommendations}
|
||||
|
@ -1,3 +1,8 @@
|
||||
---
|
||||
toc_priority: 69
|
||||
toc_title: How to Run ClickHouse Tests
|
||||
---
|
||||
|
||||
# ClickHouse Testing {#clickhouse-testing}
|
||||
|
||||
## Functional Tests {#functional-tests}
|
||||
@ -55,7 +60,7 @@ If you want to improve performance of ClickHouse in some scenario, and if improv
|
||||
|
||||
## Test Tools And Scripts {#test-tools-and-scripts}
|
||||
|
||||
Some programs in `tests` directory are not prepared tests, but are test tools. For example, for `Lexer` there is a tool `dbms/Parsers/tests/lexer` that just do tokenization of stdin and writes colorized result to stdout. You can use these kind of tools as a code examples and for exploration and manual testing.
|
||||
Some programs in `tests` directory are not prepared tests, but are test tools. For example, for `Lexer` there is a tool `src/Parsers/tests/lexer` that just do tokenization of stdin and writes colorized result to stdout. You can use these kind of tools as a code examples and for exploration and manual testing.
|
||||
|
||||
You can also place pair of files `.sh` and `.reference` along with the tool to run it on some predefined input - then script result can be compared to `.reference` file. These kind of tests are not automated.
|
||||
|
||||
|
@ -1,8 +1,14 @@
|
||||
---
|
||||
toc_folder_title: Database Engines
|
||||
toc_priority: 27
|
||||
toc_title: Introduction
|
||||
---
|
||||
|
||||
# Database Engines {#database-engines}
|
||||
|
||||
Database engines allow you to work with tables.
|
||||
|
||||
By default, ClickHouse uses its native database engine, which provides configurable [table engines](../operations/table_engines/index.md) and an [SQL dialect](../query_language/syntax.md).
|
||||
By default, ClickHouse uses its native database engine, which provides configurable [table engines](../../engines/table_engines/index.md) and an [SQL dialect](../../sql_reference/syntax.md).
|
||||
|
||||
You can also use the following database engines:
|
||||
|
@ -1,3 +1,8 @@
|
||||
---
|
||||
toc_priority: 31
|
||||
toc_title: Lazy
|
||||
---
|
||||
|
||||
# Lazy {#lazy}
|
||||
|
||||
Keeps tables in RAM only `expiration_time_in_seconds` seconds after last access. Can be used only with \*Log tables.
|
@ -1,4 +1,9 @@
|
||||
# MySQL {#mysql}
|
||||
---
|
||||
toc_priority: 30
|
||||
toc_title: MySQL
|
||||
---
|
||||
|
||||
# Mysql {#mysql}
|
||||
|
||||
Allows to connect to databases on a remote MySQL server and perform `INSERT` and `SELECT` queries to exchange data between ClickHouse and MySQL.
|
||||
|
||||
@ -26,27 +31,27 @@ ENGINE = MySQL('host:port', 'database', 'user', 'password')
|
||||
|
||||
## Data Types Support {#data_types-support}
|
||||
|
||||
| MySQL | ClickHouse |
|
||||
|----------------------------------|---------------------------------------------|
|
||||
| UNSIGNED TINYINT | [UInt8](../data_types/int_uint.md) |
|
||||
| TINYINT | [Int8](../data_types/int_uint.md) |
|
||||
| UNSIGNED SMALLINT | [UInt16](../data_types/int_uint.md) |
|
||||
| SMALLINT | [Int16](../data_types/int_uint.md) |
|
||||
| UNSIGNED INT, UNSIGNED MEDIUMINT | [UInt32](../data_types/int_uint.md) |
|
||||
| INT, MEDIUMINT | [Int32](../data_types/int_uint.md) |
|
||||
| UNSIGNED BIGINT | [UInt64](../data_types/int_uint.md) |
|
||||
| BIGINT | [Int64](../data_types/int_uint.md) |
|
||||
| FLOAT | [Float32](../data_types/float.md) |
|
||||
| DOUBLE | [Float64](../data_types/float.md) |
|
||||
| DATE | [Date](../data_types/date.md) |
|
||||
| DATETIME, TIMESTAMP | [DateTime](../data_types/datetime.md) |
|
||||
| BINARY | [FixedString](../data_types/fixedstring.md) |
|
||||
| MySQL | ClickHouse |
|
||||
|----------------------|--------------------------------------|
|
||||
| UNSIGNED TINYINT | [UInt8](../../sql_reference/data_types/int_uint.md) |
|
||||
| TINYINT | [Int8](../../sql_reference/data_types/int_uint.md) |
|
||||
| UNSIGNED SMALLINT | [UInt16](../../sql_reference/data_types/int_uint.md) |
|
||||
| SMALLINT | [Int16](../../sql_reference/data_types/int_uint.md) |
|
||||
| UNSIGNED INT, UNSIGNED MEDIUMINT | [UInt32](../../sql_reference/data_types/int_uint.md) |
|
||||
| INT, MEDIUMINT | [Int32](../../sql_reference/data_types/int_uint.md) |
|
||||
| UNSIGNED BIGINT | [UInt64](../../sql_reference/data_types/int_uint.md) |
|
||||
| BIGINT | [Int64](../../sql_reference/data_types/int_uint.md) |
|
||||
| FLOAT | [Float32](../../sql_reference/data_types/float.md) |
|
||||
| DOUBLE | [Float64](../../sql_reference/data_types/float.md) |
|
||||
| DATE | [Date](../../sql_reference/data_types/date.md) |
|
||||
| DATETIME, TIMESTAMP | [DateTime](../../sql_reference/data_types/datetime.md) |
|
||||
| BINARY | [FixedString](../../sql_reference/data_types/fixedstring.md) |
|
||||
|
||||
All other MySQL data types are converted into [String](../data_types/string.md).
|
||||
All other MySQL data types are converted into [String](../../sql_reference/data_types/string.md).
|
||||
|
||||
[Nullable](../data_types/nullable.md) is supported.
|
||||
[Nullable](../../sql_reference/data_types/nullable.md) is supported.
|
||||
|
||||
## Examples of Use {#examples-of-use}
|
||||
## Examples Of Use {#examples-of-use}
|
||||
|
||||
Table in MySQL:
|
||||
|
||||
@ -64,11 +69,11 @@ mysql> insert into mysql_table (`int_id`, `float`) VALUES (1,2);
|
||||
Query OK, 1 row affected (0,00 sec)
|
||||
|
||||
mysql> select * from mysql_table;
|
||||
+--------+-------+
|
||||
+------+-----+
|
||||
| int_id | value |
|
||||
+--------+-------+
|
||||
+------+-----+
|
||||
| 1 | 2 |
|
||||
+--------+-------+
|
||||
+------+-----+
|
||||
1 row in set (0,00 sec)
|
||||
```
|
||||
|
5
docs/en/engines/index.md
Normal file
5
docs/en/engines/index.md
Normal file
@ -0,0 +1,5 @@
|
||||
---
|
||||
toc_folder_title: Engines
|
||||
toc_priority: 25
|
||||
---
|
||||
|
83
docs/en/engines/table_engines/index.md
Normal file
83
docs/en/engines/table_engines/index.md
Normal file
@ -0,0 +1,83 @@
|
||||
---
|
||||
toc_folder_title: Table Engines
|
||||
toc_priority: 26
|
||||
toc_title: Introduction
|
||||
---
|
||||
|
||||
# Table Engines {#table_engines}
|
||||
|
||||
The table engine (type of table) determines:
|
||||
|
||||
- How and where data is stored, where to write it to, and where to read it from.
|
||||
- Which queries are supported, and how.
|
||||
- Concurrent data access.
|
||||
- Use of indexes, if present.
|
||||
- Whether multithreaded request execution is possible.
|
||||
- Data replication parameters.
|
||||
|
||||
## Engine Families {#engine-families}
|
||||
|
||||
### Mergetree {#mergetree}
|
||||
|
||||
The most universal and functional table engines for high-load tasks. The property shared by these engines is quick data insertion with subsequent background data processing. `MergeTree` family engines support data replication (with [Replicated\*](mergetree_family/replication.md) versions of engines), partitioning, and other features not supported in other engines.
|
||||
|
||||
Engines in the family:
|
||||
|
||||
- [MergeTree](mergetree_family/mergetree.md)
|
||||
- [ReplacingMergeTree](mergetree_family/replacingmergetree.md)
|
||||
- [SummingMergeTree](mergetree_family/summingmergetree.md)
|
||||
- [AggregatingMergeTree](mergetree_family/aggregatingmergetree.md)
|
||||
- [CollapsingMergeTree](mergetree_family/collapsingmergetree.md)
|
||||
- [VersionedCollapsingMergeTree](mergetree_family/versionedcollapsingmergetree.md)
|
||||
- [GraphiteMergeTree](mergetree_family/graphitemergetree.md)
|
||||
|
||||
### Log {#log}
|
||||
|
||||
Lightweight [engines](log_family/index.md) with minimum functionality. They’re the most effective when you need to quickly write many small tables (up to approximately 1 million rows) and read them later as a whole.
|
||||
|
||||
Engines in the family:
|
||||
|
||||
- [TinyLog](log_family/tinylog.md)
|
||||
- [StripeLog](log_family/stripelog.md)
|
||||
- [Log](log_family/log.md)
|
||||
|
||||
### Integration Engines {#integration-engines}
|
||||
|
||||
Engines for communicating with other data storage and processing systems.
|
||||
|
||||
Engines in the family:
|
||||
|
||||
- [Kafka](integrations/kafka.md)
|
||||
- [MySQL](integrations/mysql.md)
|
||||
- [ODBC](integrations/odbc.md)
|
||||
- [JDBC](integrations/jdbc.md)
|
||||
- [HDFS](integrations/hdfs.md)
|
||||
|
||||
### Special Engines {#special-engines}
|
||||
|
||||
Engines in the family:
|
||||
|
||||
- [Distributed](special/distributed.md)
|
||||
- [MaterializedView](special/materializedview.md)
|
||||
- [Dictionary](special/dictionary.md)
|
||||
- [Merge](special/merge.md)
|
||||
- [File](special/file.md)
|
||||
- [Null](special/null.md)
|
||||
- [Set](special/set.md)
|
||||
- [Join](special/join.md)
|
||||
- [URL](special/url.md)
|
||||
- [View](special/view.md)
|
||||
- [Memory](special/memory.md)
|
||||
- [Buffer](special/buffer.md)
|
||||
|
||||
## Virtual Columns {#table_engines-virtual-columns}
|
||||
|
||||
Virtual column is an integral table engine attribute that is defined in the engine source code.
|
||||
|
||||
You shouldn’t specify virtual columns in the `CREATE TABLE` query and you can’t see them in `SHOW CREATE TABLE` and `DESCRIBE TABLE` query results. Virtual columns are also read-only, so you can’t insert data into virtual columns.
|
||||
|
||||
To select data from a virtual column, you must specify its name in the `SELECT` query. `SELECT *` doesn’t return values from virtual columns.
|
||||
|
||||
If you create a table with a column that has the same name as one of the table virtual columns, the virtual column becomes inaccessible. We don’t recommend doing this. To help avoid conflicts, virtual column names are usually prefixed with an underscore.
|
||||
|
||||
[Original article](https://clickhouse.tech/docs/en/operations/table_engines/) <!--hide-->
|
@ -1,7 +1,12 @@
|
||||
---
|
||||
toc_priority: 36
|
||||
toc_title: HDFS
|
||||
---
|
||||
|
||||
# HDFS {#table_engines-hdfs}
|
||||
|
||||
This engine provides integration with [Apache Hadoop](https://en.wikipedia.org/wiki/Apache_Hadoop) ecosystem by allowing to manage data on [HDFS](https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/HdfsDesign.html)via ClickHouse. This engine is similar
|
||||
to the [File](file.md) and [URL](url.md) engines, but provides Hadoop-specific features.
|
||||
to the [File](../special/file.md) and [URL](../special/url.md) engines, but provides Hadoop-specific features.
|
||||
|
||||
## Usage {#usage}
|
||||
|
||||
@ -13,7 +18,7 @@ The `URI` parameter is the whole file URI in HDFS.
|
||||
The `format` parameter specifies one of the available file formats. To perform
|
||||
`SELECT` queries, the format must be supported for input, and to perform
|
||||
`INSERT` queries – for output. The available formats are listed in the
|
||||
[Formats](../../interfaces/formats.md#formats) section.
|
||||
[Formats](../../../interfaces/formats.md#formats) section.
|
||||
The path part of `URI` may contain globs. In this case the table would be readonly.
|
||||
|
||||
**Example:**
|
||||
@ -60,7 +65,7 @@ Multiple path components can have globs. For being processed file should exists
|
||||
- `{some_string,another_string,yet_another_one}` — Substitutes any of strings `'some_string', 'another_string', 'yet_another_one'`.
|
||||
- `{N..M}` — Substitutes any number in range from N to M including both borders.
|
||||
|
||||
Constructions with `{}` are similar to the [remote](../../query_language/table_functions/remote.md) table function.
|
||||
Constructions with `{}` are similar to the [remote](../../../sql_reference/table_functions/remote.md) table function.
|
||||
|
||||
**Example**
|
||||
|
||||
@ -111,6 +116,6 @@ CREARE TABLE big_table (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9
|
||||
|
||||
**See Also**
|
||||
|
||||
- [Virtual columns](https://clickhouse.tech/docs/en/operations/table_engines/#table_engines-virtual_columns)
|
||||
- [Virtual columns](../index.md#table_engines-virtual_columns)
|
||||
|
||||
[Original article](https://clickhouse.tech/docs/en/operations/table_engines/hdfs/) <!--hide-->
|
5
docs/en/engines/table_engines/integrations/index.md
Normal file
5
docs/en/engines/table_engines/integrations/index.md
Normal file
@ -0,0 +1,5 @@
|
||||
---
|
||||
toc_folder_title: Integrations
|
||||
toc_priority: 30
|
||||
---
|
||||
|
@ -1,10 +1,15 @@
|
||||
---
|
||||
toc_priority: 34
|
||||
toc_title: JDBC
|
||||
---
|
||||
|
||||
# JDBC {#table-engine-jdbc}
|
||||
|
||||
Allows ClickHouse to connect to external databases via [JDBC](https://en.wikipedia.org/wiki/Java_Database_Connectivity).
|
||||
|
||||
To implement the JDBC connection, ClickHouse uses the separate program [clickhouse-jdbc-bridge](https://github.com/alex-krash/clickhouse-jdbc-bridge) that should run as a daemon.
|
||||
|
||||
This engine supports the [Nullable](../../data_types/nullable.md) data type.
|
||||
This engine supports the [Nullable](../../../sql_reference/data_types/nullable.md) data type.
|
||||
|
||||
## Creating a Table {#creating-a-table}
|
||||
|
||||
@ -44,11 +49,11 @@ mysql> insert into test (`int_id`, `float`) VALUES (1,2);
|
||||
Query OK, 1 row affected (0,00 sec)
|
||||
|
||||
mysql> select * from test;
|
||||
+--------+--------------+-------+----------------+
|
||||
+------+----------+-----+----------+
|
||||
| int_id | int_nullable | float | float_nullable |
|
||||
+--------+--------------+-------+----------------+
|
||||
+------+----------+-----+----------+
|
||||
| 1 | NULL | 2 | NULL |
|
||||
+--------+--------------+-------+----------------+
|
||||
+------+----------+-----+----------+
|
||||
1 row in set (0,00 sec)
|
||||
```
|
||||
|
||||
@ -78,6 +83,6 @@ FROM jdbc_table
|
||||
|
||||
## See Also {#see-also}
|
||||
|
||||
- [JDBC table function](../../query_language/table_functions/jdbc.md).
|
||||
- [JDBC table function](../../../sql_reference/table_functions/jdbc.md).
|
||||
|
||||
[Original article](https://clickhouse.tech/docs/en/operations/table_engines/jdbc/) <!--hide-->
|
@ -1,3 +1,8 @@
|
||||
---
|
||||
toc_priority: 32
|
||||
toc_title: Kafka
|
||||
---
|
||||
|
||||
# Kafka {#kafka}
|
||||
|
||||
This engine works with [Apache Kafka](http://kafka.apache.org/).
|
||||
@ -33,7 +38,7 @@ Required parameters:
|
||||
- `kafka_broker_list` – A comma-separated list of brokers (for example, `localhost:9092`).
|
||||
- `kafka_topic_list` – A list of Kafka topics.
|
||||
- `kafka_group_name` – A group of Kafka consumers. Reading margins are tracked for each group separately. If you don’t want messages to be duplicated in the cluster, use the same group name everywhere.
|
||||
- `kafka_format` – Message format. Uses the same notation as the SQL `FORMAT` function, such as `JSONEachRow`. For more information, see the [Formats](../../interfaces/formats.md) section.
|
||||
- `kafka_format` – Message format. Uses the same notation as the SQL `FORMAT` function, such as `JSONEachRow`. For more information, see the [Formats](../../../interfaces/formats.md) section.
|
||||
|
||||
Optional parameters:
|
||||
|
||||
@ -123,7 +128,7 @@ Example:
|
||||
SELECT level, sum(total) FROM daily GROUP BY level;
|
||||
```
|
||||
|
||||
To improve performance, received messages are grouped into blocks the size of [max\_insert\_block\_size](../settings/settings.md#settings-max_insert_block_size). If the block wasn’t formed within [stream\_flush\_interval\_ms](../settings/settings.md) milliseconds, the data will be flushed to the table regardless of the completeness of the block.
|
||||
To improve performance, received messages are grouped into blocks the size of [max\_insert\_block\_size](../../../operations/server_configuration_parameters/settings.md#settings-max_insert_block_size). If the block wasn’t formed within [stream\_flush\_interval\_ms](../../../operations/server_configuration_parameters/settings.md) milliseconds, the data will be flushed to the table regardless of the completeness of the block.
|
||||
|
||||
To stop receiving topic data or to change the conversion logic, detach the materialized view:
|
||||
|
||||
@ -164,6 +169,6 @@ For a list of possible configuration options, see the [librdkafka configuration
|
||||
|
||||
**See Also**
|
||||
|
||||
- [Virtual columns](index.md#table_engines-virtual_columns)
|
||||
- [Virtual columns](../index.md#table_engines-virtual_columns)
|
||||
|
||||
[Original article](https://clickhouse.tech/docs/en/operations/table_engines/kafka/) <!--hide-->
|
@ -1,4 +1,9 @@
|
||||
# MySQL {#mysql}
|
||||
---
|
||||
toc_priority: 33
|
||||
toc_title: MySQL
|
||||
---
|
||||
|
||||
# Mysql {#mysql}
|
||||
|
||||
The MySQL engine allows you to perform `SELECT` queries on data that is stored on a remote MySQL server.
|
||||
|
||||
@ -13,12 +18,12 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
|
||||
) ENGINE = MySQL('host:port', 'database', 'table', 'user', 'password'[, replace_query, 'on_duplicate_clause']);
|
||||
```
|
||||
|
||||
See a detailed description of the [CREATE TABLE](../../query_language/create.md#create-table-query) query.
|
||||
See a detailed description of the [CREATE TABLE](../../../sql_reference/statements/create.md#create-table-query) query.
|
||||
|
||||
The table structure can differ from the original MySQL table structure:
|
||||
|
||||
- Column names should be the same as in the original MySQL table, but you can use just some of these columns and in any order.
|
||||
- Column types may differ from those in the original MySQL table. ClickHouse tries to [cast](../../query_language/functions/type_conversion_functions.md#type_conversion_function-cast) values to the ClickHouse data types.
|
||||
- Column types may differ from those in the original MySQL table. ClickHouse tries to [cast](../../../sql_reference/functions/type_conversion_functions.md#type_conversion_function-cast) values to the ClickHouse data types.
|
||||
|
||||
**Engine Parameters**
|
||||
|
||||
@ -61,11 +66,11 @@ mysql> insert into test (`int_id`, `float`) VALUES (1,2);
|
||||
Query OK, 1 row affected (0,00 sec)
|
||||
|
||||
mysql> select * from test;
|
||||
+--------+--------------+-------+----------------+
|
||||
+------+----------+-----+----------+
|
||||
| int_id | int_nullable | float | float_nullable |
|
||||
+--------+--------------+-------+----------------+
|
||||
+------+----------+-----+----------+
|
||||
| 1 | NULL | 2 | NULL |
|
||||
+--------+--------------+-------+----------------+
|
||||
+------+----------+-----+----------+
|
||||
1 row in set (0,00 sec)
|
||||
```
|
||||
|
||||
@ -92,7 +97,7 @@ SELECT * FROM mysql_table
|
||||
|
||||
## See Also {#see-also}
|
||||
|
||||
- [The ‘mysql’ table function](../../query_language/table_functions/mysql.md)
|
||||
- [Using MySQL as a source of external dictionary](../../query_language/dicts/external_dicts_dict_sources.md#dicts-external_dicts_dict_sources-mysql)
|
||||
- [The ‘mysql’ table function](../../../sql_reference/table_functions/mysql.md)
|
||||
- [Using MySQL as a source of external dictionary](../../../sql_reference/dictionaries/external_dictionaries/external_dicts_dict_sources.md#dicts-external_dicts_dict_sources-mysql)
|
||||
|
||||
[Original article](https://clickhouse.tech/docs/en/operations/table_engines/mysql/) <!--hide-->
|
@ -1,10 +1,15 @@
|
||||
---
|
||||
toc_priority: 35
|
||||
toc_title: ODBC
|
||||
---
|
||||
|
||||
# ODBC {#table-engine-odbc}
|
||||
|
||||
Allows ClickHouse to connect to external databases via [ODBC](https://en.wikipedia.org/wiki/Open_Database_Connectivity).
|
||||
|
||||
To safely implement ODBC connections, ClickHouse uses a separate program `clickhouse-odbc-bridge`. If the ODBC driver is loaded directly from `clickhouse-server`, driver problems can crash the ClickHouse server. ClickHouse automatically starts `clickhouse-odbc-bridge` when it is required. The ODBC bridge program is installed from the same package as the `clickhouse-server`.
|
||||
|
||||
This engine supports the [Nullable](../../data_types/nullable.md) data type.
|
||||
This engine supports the [Nullable](../../../sql_reference/data_types/nullable.md) data type.
|
||||
|
||||
## Creating a Table {#creating-a-table}
|
||||
|
||||
@ -18,12 +23,12 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
|
||||
ENGINE = ODBC(connection_settings, external_database, external_table)
|
||||
```
|
||||
|
||||
See a detailed description of the [CREATE TABLE](../../query_language/create.md#create-table-query) query.
|
||||
See a detailed description of the [CREATE TABLE](../../../sql_reference/statements/create.md#create-table-query) query.
|
||||
|
||||
The table structure can differ from the source table structure:
|
||||
|
||||
- Column names should be the same as in the source table, but you can use just some of these columns and in any order.
|
||||
- Column types may differ from those in the source table. ClickHouse tries to [cast](../../query_language/functions/type_conversion_functions.md#type_conversion_function-cast) values to the ClickHouse data types.
|
||||
- Column types may differ from those in the source table. ClickHouse tries to [cast](../../../sql_reference/functions/type_conversion_functions.md#type_conversion_function-cast) values to the ClickHouse data types.
|
||||
|
||||
**Engine Parameters**
|
||||
|
||||
@ -67,7 +72,7 @@ You can check the connection using the `isql` utility from the unixODBC installa
|
||||
|
||||
``` bash
|
||||
$ isql -v mysqlconn
|
||||
+---------------------------------------+
|
||||
+-------------------------+
|
||||
| Connected! |
|
||||
| |
|
||||
...
|
||||
@ -88,11 +93,11 @@ mysql> insert into test (`int_id`, `float`) VALUES (1,2);
|
||||
Query OK, 1 row affected (0,00 sec)
|
||||
|
||||
mysql> select * from test;
|
||||
+--------+--------------+-------+----------------+
|
||||
+------+----------+-----+----------+
|
||||
| int_id | int_nullable | float | float_nullable |
|
||||
+--------+--------------+-------+----------------+
|
||||
+------+----------+-----+----------+
|
||||
| 1 | NULL | 2 | NULL |
|
||||
+--------+--------------+-------+----------------+
|
||||
+------+----------+-----+----------+
|
||||
1 row in set (0,00 sec)
|
||||
```
|
||||
|
||||
@ -119,7 +124,7 @@ SELECT * FROM odbc_t
|
||||
|
||||
## See Also {#see-also}
|
||||
|
||||
- [ODBC external dictionaries](../../query_language/dicts/external_dicts_dict_sources.md#dicts-external_dicts_dict_sources-odbc)
|
||||
- [ODBC table function](../../query_language/table_functions/odbc.md)
|
||||
- [ODBC external dictionaries](../../../sql_reference/dictionaries/external_dictionaries/external_dicts_dict_sources.md#dicts-external_dicts_dict_sources-odbc)
|
||||
- [ODBC table function](../../../sql_reference/table_functions/odbc.md)
|
||||
|
||||
[Original article](https://clickhouse.tech/docs/en/operations/table_engines/odbc/) <!--hide-->
|
5
docs/en/engines/table_engines/log_family/index.md
Normal file
5
docs/en/engines/table_engines/log_family/index.md
Normal file
@ -0,0 +1,5 @@
|
||||
---
|
||||
toc_folder_title: Log Family
|
||||
toc_priority: 29
|
||||
---
|
||||
|
@ -1,3 +1,8 @@
|
||||
---
|
||||
toc_priority: 33
|
||||
toc_title: Log
|
||||
---
|
||||
|
||||
# Log {#log}
|
||||
|
||||
Engine belongs to the family of log engines. See the common properties of log engines and their differences in the [Log Engine Family](log_family.md) article.
|
@ -1,3 +1,8 @@
|
||||
---
|
||||
toc_priority: 31
|
||||
toc_title: Introduction
|
||||
---
|
||||
|
||||
# Log Engine Family {#log-engine-family}
|
||||
|
||||
These engines were developed for scenarios when you need to quickly write many small tables (up to about 1 million rows) and read them later as a whole.
|
||||
@ -8,7 +13,7 @@ Engines of the family:
|
||||
- [Log](log.md)
|
||||
- [TinyLog](tinylog.md)
|
||||
|
||||
## Common properties {#common-properties}
|
||||
## Common Properties {#common-properties}
|
||||
|
||||
Engines:
|
||||
|
||||
@ -20,7 +25,7 @@ Engines:
|
||||
|
||||
During `INSERT` queries, the table is locked, and other queries for reading and writing data both wait for the table to unlock. If there are no data writing queries, any number of data reading queries can be performed concurrently.
|
||||
|
||||
- Do not support [mutation](../../query_language/alter.md#alter-mutations) operations.
|
||||
- Do not support [mutation](../../../sql_reference/statements/alter.md#alter-mutations) operations.
|
||||
|
||||
- Do not support indexes.
|
||||
|
@ -1,4 +1,9 @@
|
||||
# StripeLog {#stripelog}
|
||||
---
|
||||
toc_priority: 32
|
||||
toc_title: StripeLog
|
||||
---
|
||||
|
||||
# Stripelog {#stripelog}
|
||||
|
||||
This engine belongs to the family of log engines. See the common properties of log engines and their differences in the [Log Engine Family](log_family.md) article.
|
||||
|
||||
@ -15,7 +20,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
|
||||
) ENGINE = StripeLog
|
||||
```
|
||||
|
||||
See the detailed description of the [CREATE TABLE](../../query_language/create.md#create-table-query) query.
|
||||
See the detailed description of the [CREATE TABLE](../../../sql_reference/statements/create.md#create-table-query) query.
|
||||
|
||||
## Writing the Data {#table_engines-stripelog-writing-the-data}
|
||||
|
||||
@ -32,7 +37,7 @@ The `StripeLog` engine does not support the `ALTER UPDATE` and `ALTER DELETE` op
|
||||
|
||||
The file with marks allows ClickHouse to parallelize the reading of data. This means that a `SELECT` query returns rows in an unpredictable order. Use the `ORDER BY` clause to sort rows.
|
||||
|
||||
## Example of Use {#table_engines-stripelog-example-of-use}
|
||||
## Example Of Use {#table_engines-stripelog-example-of-use}
|
||||
|
||||
Creating a table:
|
||||
|
@ -1,3 +1,8 @@
|
||||
---
|
||||
toc_priority: 34
|
||||
toc_title: TinyLog
|
||||
---
|
||||
|
||||
# TinyLog {#tinylog}
|
||||
|
||||
The engine belongs to the log engine family. See [Log Engine Family](log_family.md) for common properties of log engines and their differences.
|
@ -1,10 +1,15 @@
|
||||
# AggregatingMergeTree {#aggregatingmergetree}
|
||||
---
|
||||
toc_priority: 35
|
||||
toc_title: AggregatingMergeTree
|
||||
---
|
||||
|
||||
# Aggregatingmergetree {#aggregatingmergetree}
|
||||
|
||||
The engine inherits from [MergeTree](mergetree.md#table_engines-mergetree), altering the logic for data parts merging. ClickHouse replaces all rows with the same primary key (or more accurately, with the same [sorting key](mergetree.md)) with a single row (within a one data part) that stores a combination of states of aggregate functions.
|
||||
|
||||
You can use `AggregatingMergeTree` tables for incremental data aggregation, including for aggregated materialized views.
|
||||
|
||||
The engine processes all columns with [AggregateFunction](../../data_types/nested_data_structures/aggregatefunction.md) type.
|
||||
The engine processes all columns with [AggregateFunction](../../../sql_reference/data_types/aggregatefunction.md) type.
|
||||
|
||||
It is appropriate to use `AggregatingMergeTree` if it reduces the number of rows by orders.
|
||||
|
||||
@ -24,7 +29,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
|
||||
[SETTINGS name=value, ...]
|
||||
```
|
||||
|
||||
For a description of request parameters, see [request description](../../query_language/create.md).
|
||||
For a description of request parameters, see [request description](../../../sql_reference/statements/create.md).
|
||||
|
||||
**Query clauses**
|
||||
|
||||
@ -51,12 +56,12 @@ All of the parameters have the same meaning as in `MergeTree`.
|
||||
|
||||
## SELECT and INSERT {#select-and-insert}
|
||||
|
||||
To insert data, use [INSERT SELECT](../../query_language/insert_into.md) query with aggregate -State- functions.
|
||||
To insert data, use [INSERT SELECT](../../../sql_reference/statements/insert_into.md) query with aggregate -State- functions.
|
||||
When selecting data from `AggregatingMergeTree` table, use `GROUP BY` clause and the same aggregate functions as when inserting data, but using `-Merge` suffix.
|
||||
|
||||
In the results of `SELECT` query, the values of `AggregateFunction` type have implementation-specific binary representation for all of the ClickHouse output formats. If dump data into, for example, `TabSeparated` format with `SELECT` query then this dump can be loaded back using `INSERT` query.
|
||||
|
||||
## Example of an Aggregated Materialized View {#example-of-an-aggregated-materialized-view}
|
||||
## Example Of an Aggregated Materialized View {#example-of-an-aggregated-materialized-view}
|
||||
|
||||
`AggregatingMergeTree` materialized view that watches the `test.visits` table:
|
||||
|
@ -1,4 +1,9 @@
|
||||
# CollapsingMergeTree {#table_engine-collapsingmergetree}
|
||||
---
|
||||
toc_priority: 36
|
||||
toc_title: CollapsingMergeTree
|
||||
---
|
||||
|
||||
# Collapsingmergetree {#table_engine-collapsingmergetree}
|
||||
|
||||
The engine inherits from [MergeTree](mergetree.md) and adds the logic of rows collapsing to data parts merge algorithm.
|
||||
|
||||
@ -21,7 +26,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
|
||||
[SETTINGS name=value, ...]
|
||||
```
|
||||
|
||||
For a description of query parameters, see [query description](../../query_language/create.md).
|
||||
For a description of query parameters, see [query description](../../../sql_reference/statements/create.md).
|
||||
|
||||
**CollapsingMergeTree Parameters**
|
||||
|
||||
@ -132,7 +137,7 @@ The aggregates `count`, `sum` and `avg` could be calculated this way. The aggreg
|
||||
|
||||
If you need to extract data without aggregation (for example, to check whether rows are present whose newest values match certain conditions), you can use the `FINAL` modifier for the `FROM` clause. This approach is significantly less efficient.
|
||||
|
||||
## Example of use {#example-of-use}
|
||||
## Example Of Use {#example-of-use}
|
||||
|
||||
Example data:
|
||||
|
||||
@ -222,7 +227,7 @@ SELECT * FROM UAct FINAL
|
||||
|
||||
This way of selecting the data is very inefficient. Don’t use it for big tables.
|
||||
|
||||
## Example of another approach {#example-of-another-approach}
|
||||
## Example Of Another Approach {#example-of-another-approach}
|
||||
|
||||
Example data:
|
||||
|
@ -1,6 +1,11 @@
|
||||
---
|
||||
toc_priority: 32
|
||||
toc_title: Custom Partitioning Key
|
||||
---
|
||||
|
||||
# Custom Partitioning Key {#custom-partitioning-key}
|
||||
|
||||
Partitioning is available for the [MergeTree](mergetree.md) family tables (including [replicated](replication.md) tables). [Materialized views](materializedview.md) based on MergeTree tables support partitioning, as well.
|
||||
Partitioning is available for the [MergeTree](mergetree.md) family tables (including [replicated](replication.md) tables). [Materialized views](../special/materializedview.md) based on MergeTree tables support partitioning, as well.
|
||||
|
||||
A partition is a logical combination of records in a table by a specified criterion. You can set a partition by an arbitrary criterion, such as by month, by day, or by event type. Each partition is stored separately to simplify manipulations of this data. When accessing the data, ClickHouse uses the smallest subset of partitions possible.
|
||||
|
||||
@ -33,7 +38,7 @@ When inserting new data to a table, this data is stored as a separate part (chun
|
||||
!!! info "Info"
|
||||
A merge only works for data parts that have the same value for the partitioning expression. This means **you shouldn’t make overly granular partitions** (more than about a thousand partitions). Otherwise, the `SELECT` query performs poorly because of an unreasonably large number of files in the file system and open file descriptors.
|
||||
|
||||
Use the [system.parts](../system_tables.md#system_tables-parts) table to view the table parts and partitions. For example, let’s assume that we have a `visits` table with partitioning by month. Let’s perform the `SELECT` query for the `system.parts` table:
|
||||
Use the [system.parts](../../../operations/system_tables.md#system_tables-parts) table to view the table parts and partitions. For example, let’s assume that we have a `visits` table with partitioning by month. Let’s perform the `SELECT` query for the `system.parts` table:
|
||||
|
||||
``` sql
|
||||
SELECT
|
||||
@ -72,7 +77,7 @@ Let’s break down the name of the first part: `201901_1_3_1`:
|
||||
|
||||
The `active` column shows the status of the part. `1` is active; `0` is inactive. The inactive parts are, for example, source parts remaining after merging to a larger part. The corrupted data parts are also indicated as inactive.
|
||||
|
||||
As you can see in the example, there are several separated parts of the same partition (for example, `201901_1_3_1` and `201901_1_9_2`). This means that these parts are not merged yet. ClickHouse merges the inserted parts of data periodically, approximately 15 minutes after inserting. In addition, you can perform a non-scheduled merge using the [OPTIMIZE](../../query_language/misc.md#misc_operations-optimize) query. Example:
|
||||
As you can see in the example, there are several separated parts of the same partition (for example, `201901_1_3_1` and `201901_1_9_2`). This means that these parts are not merged yet. ClickHouse merges the inserted parts of data periodically, approximately 15 minutes after inserting. In addition, you can perform a non-scheduled merge using the [OPTIMIZE](../../../sql_reference/statements/misc.md#misc_operations-optimize) query. Example:
|
||||
|
||||
``` sql
|
||||
OPTIMIZE TABLE visits PARTITION 201902;
|
||||
@ -111,10 +116,10 @@ drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 1 16:48 detached
|
||||
|
||||
The folders ‘201901\_1\_1\_0’, ‘201901\_1\_7\_1’ and so on are the directories of the parts. Each part relates to a corresponding partition and contains data just for a certain month (the table in this example has partitioning by month).
|
||||
|
||||
The `detached` directory contains parts that were detached from the table using the [DETACH](#alter_detach-partition) query. The corrupted parts are also moved to this directory, instead of being deleted. The server does not use the parts from the `detached` directory. You can add, delete, or modify the data in this directory at any time – the server will not know about this until you run the [ATTACH](../../query_language/alter.md#alter_attach-partition) query.
|
||||
The `detached` directory contains parts that were detached from the table using the [DETACH](#alter_detach-partition) query. The corrupted parts are also moved to this directory, instead of being deleted. The server does not use the parts from the `detached` directory. You can add, delete, or modify the data in this directory at any time – the server will not know about this until you run the [ATTACH](../../../sql_reference/statements/alter.md#alter_attach-partition) query.
|
||||
|
||||
Note that on the operating server, you cannot manually change the set of parts or their data on the file system, since the server will not know about it. For non-replicated tables, you can do this when the server is stopped, but it isn’t recommended. For replicated tables, the set of parts cannot be changed in any case.
|
||||
|
||||
ClickHouse allows you to perform operations with the partitions: delete them, copy from one table to another, or create a backup. See the list of all operations in the section [Manipulations With Partitions and Parts](../../query_language/alter.md#alter_manipulations-with-partitions).
|
||||
ClickHouse allows you to perform operations with the partitions: delete them, copy from one table to another, or create a backup. See the list of all operations in the section [Manipulations With Partitions and Parts](../../../sql_reference/statements/alter.md#alter_manipulations-with-partitions).
|
||||
|
||||
[Original article](https://clickhouse.tech/docs/en/operations/table_engines/custom_partitioning_key/) <!--hide-->
|
@ -1,4 +1,9 @@
|
||||
# GraphiteMergeTree {#graphitemergetree}
|
||||
---
|
||||
toc_priority: 38
|
||||
toc_title: GraphiteMergeTree
|
||||
---
|
||||
|
||||
# Graphitemergetree {#graphitemergetree}
|
||||
|
||||
This engine is designed for thinning and aggregating/averaging (rollup) [Graphite](http://graphite.readthedocs.io/en/latest/index.html) data. It may be helpful to developers who want to use ClickHouse as a data store for Graphite.
|
||||
|
||||
@ -23,7 +28,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
|
||||
[SETTINGS name=value, ...]
|
||||
```
|
||||
|
||||
See a detailed description of the [CREATE TABLE](../../query_language/create.md#create-table-query) query.
|
||||
See a detailed description of the [CREATE TABLE](../../../sql_reference/statements/create.md#create-table-query) query.
|
||||
|
||||
A table for the Graphite data should have the following columns for the following data:
|
||||
|
||||
@ -72,9 +77,9 @@ All of the parameters excepting `config_section` have the same meaning as in `Me
|
||||
|
||||
</details>
|
||||
|
||||
## Rollup configuration {#rollup-configuration}
|
||||
## Rollup Configuration {#rollup-configuration}
|
||||
|
||||
The settings for rollup are defined by the [graphite\_rollup](../server_settings/settings.md#server_settings-graphite_rollup) parameter in the server configuration. The name of the parameter could be any. You can create several configurations and use them for different tables.
|
||||
The settings for rollup are defined by the [graphite\_rollup](../../../operations/server_configuration_parameters/settings.md#server_configuration_parameters-graphite_rollup) parameter in the server configuration. The name of the parameter could be any. You can create several configurations and use them for different tables.
|
||||
|
||||
Rollup configuration structure:
|
||||
|
5
docs/en/engines/table_engines/mergetree_family/index.md
Normal file
5
docs/en/engines/table_engines/mergetree_family/index.md
Normal file
@ -0,0 +1,5 @@
|
||||
---
|
||||
toc_folder_title: MergeTree Family
|
||||
toc_priority: 28
|
||||
---
|
||||
|
@ -1,4 +1,9 @@
|
||||
# MergeTree {#table_engines-mergetree}
|
||||
---
|
||||
toc_priority: 30
|
||||
toc_title: MergeTree
|
||||
---
|
||||
|
||||
# Mergetree {#table_engines-mergetree}
|
||||
|
||||
The `MergeTree` engine and other engines of this family (`*MergeTree`) are the most robust ClickHouse table engines.
|
||||
|
||||
@ -23,7 +28,7 @@ Main features:
|
||||
If necessary, you can set the data sampling method in the table.
|
||||
|
||||
!!! info "Info"
|
||||
The [Merge](merge.md) engine does not belong to the `*MergeTree` family.
|
||||
The [Merge](../special/merge.md) engine does not belong to the `*MergeTree` family.
|
||||
|
||||
## Creating a Table {#table_engine-mergetree-creating-a-table}
|
||||
|
||||
@ -44,7 +49,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
|
||||
[SETTINGS name=value, ...]
|
||||
```
|
||||
|
||||
For a description of parameters, see the [CREATE query description](../../query_language/create.md).
|
||||
For a description of parameters, see the [CREATE query description](../../../sql_reference/statements/create.md).
|
||||
|
||||
!!! note "Note"
|
||||
`INDEX` is an experimental feature, see [Data Skipping Indexes](#table_engine-mergetree-data_skipping-indexes).
|
||||
@ -55,7 +60,7 @@ For a description of parameters, see the [CREATE query description](../../query_
|
||||
|
||||
- `PARTITION BY` — The [partitioning key](custom_partitioning_key.md).
|
||||
|
||||
For partitioning by month, use the `toYYYYMM(date_column)` expression, where `date_column` is a column with a date of the type [Date](../../data_types/date.md). The partition names here have the `"YYYYMM"` format.
|
||||
For partitioning by month, use the `toYYYYMM(date_column)` expression, where `date_column` is a column with a date of the type [Date](../../../sql_reference/data_types/date.md). The partition names here have the `"YYYYMM"` format.
|
||||
|
||||
- `ORDER BY` — The sorting key.
|
||||
|
||||
@ -83,7 +88,7 @@ For a description of parameters, see the [CREATE query description](../../query_
|
||||
- `index_granularity` — Maximum number of data rows between the marks of an index. Default value: 8192. See [Data Storage](#mergetree-data-storage).
|
||||
- `index_granularity_bytes` — Maximum size of data granules in bytes. Default value: 10Mb. To restrict the granule size only by number of rows, set to 0 (not recommended). See [Data Storage](#mergetree-data-storage).
|
||||
- `enable_mixed_granularity_parts` — Enables or disables transitioning to control the granule size with the `index_granularity_bytes` setting. Before version 19.11, there was only the `index_granularity` setting for restricting granule size. The `index_granularity_bytes` setting improves ClickHouse performance when selecting data from tables with big rows (tens and hundreds of megabytes). If you have tables with big rows, you can enable this setting for the tables to improve the efficiency of `SELECT` queries.
|
||||
- `use_minimalistic_part_header_in_zookeeper` — Storage method of the data parts headers in ZooKeeper. If `use_minimalistic_part_header_in_zookeeper=1`, then ZooKeeper stores less data. For more information, see the [setting description](../server_settings/settings.md#server-settings-use_minimalistic_part_header_in_zookeeper) in “Server configuration parameters”.
|
||||
- `use_minimalistic_part_header_in_zookeeper` — Storage method of the data parts headers in ZooKeeper. If `use_minimalistic_part_header_in_zookeeper=1`, then ZooKeeper stores less data. For more information, see the [setting description](../../../operations/server_configuration_parameters/settings.md#server-settings-use_minimalistic_part_header_in_zookeeper) in “Server configuration parameters”.
|
||||
- `min_merge_bytes_to_use_direct_io` — The minimum data volume for merge operation that is required for using direct I/O access to the storage disk. When merging data parts, ClickHouse calculates the total storage volume of all the data to be merged. If the volume exceeds `min_merge_bytes_to_use_direct_io` bytes, ClickHouse reads and writes the data to the storage disk using the direct I/O interface (`O_DIRECT` option). If `min_merge_bytes_to_use_direct_io = 0`, then direct I/O is disabled. Default value: `10 * 1024 * 1024 * 1024` bytes.
|
||||
<a name="mergetree_setting-merge_with_ttl_timeout"></a>
|
||||
- `merge_with_ttl_timeout` — Minimum delay in seconds before repeating a merge with TTL. Default value: 86400 (1 day).
|
||||
@ -99,7 +104,7 @@ ENGINE MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDa
|
||||
|
||||
In the example, we set partitioning by month.
|
||||
|
||||
We also set an expression for sampling as a hash by the user ID. This allows you to pseudorandomize the data in the table for each `CounterID` and `EventDate`. If you define a [SAMPLE](../../query_language/select.md#select-sample-clause) clause when selecting the data, ClickHouse will return an evenly pseudorandom data sample for a subset of users.
|
||||
We also set an expression for sampling as a hash by the user ID. This allows you to pseudorandomize the data in the table for each `CounterID` and `EventDate`. If you define a [SAMPLE](../../../sql_reference/statements/select.md#select-sample-clause) clause when selecting the data, ClickHouse will return an evenly pseudorandom data sample for a subset of users.
|
||||
|
||||
The `index_granularity` setting can be omitted because 8192 is the default value.
|
||||
|
||||
@ -121,9 +126,9 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
|
||||
|
||||
**MergeTree() Parameters**
|
||||
|
||||
- `date-column` — The name of a column of the [Date](../../data_types/date.md) type. ClickHouse automatically creates partitions by month based on this column. The partition names are in the `"YYYYMM"` format.
|
||||
- `date-column` — The name of a column of the [Date](../../../sql_reference/data_types/date.md) type. ClickHouse automatically creates partitions by month based on this column. The partition names are in the `"YYYYMM"` format.
|
||||
- `sampling_expression` — An expression for sampling.
|
||||
- `(primary, key)` — Primary key. Type: [Tuple()](../../data_types/tuple.md)
|
||||
- `(primary, key)` — Primary key. Type: [Tuple()](../../../sql_reference/data_types/tuple.md)
|
||||
- `index_granularity` — The granularity of an index. The number of data rows between the “marks” of an index. The value 8192 is appropriate for most tasks.
|
||||
|
||||
**Example**
|
||||
@ -147,11 +152,11 @@ Each data part is logically divided into granules. A granule is the smallest ind
|
||||
|
||||
The granule size is restricted by the `index_granularity` and `index_granularity_bytes` settings of the table engine. The number of rows in a granule lays in the `[1, index_granularity]` range, depending on the size of the rows. The size of a granule can exceed `index_granularity_bytes` if the size of a single row is greater than the value of the setting. In this case, the size of the granule equals the size of the row.
|
||||
|
||||
## Primary Keys and Indexes in Queries {#primary-keys-and-indexes-in-queries}
|
||||
## Primary Keys and Indexes In Queries {#primary-keys-and-indexes-in-queries}
|
||||
|
||||
Take the `(CounterID, Date)` primary key as an example. In this case, the sorting and index can be illustrated as follows:
|
||||
|
||||
Whole data: [-------------------------------------------------------------------------]
|
||||
Whole data: [---------------------------------------------]
|
||||
CounterID: [aaaaaaaaaaaaaaaaaabbbbcdeeeeeeeeeeeeefgggggggghhhhhhhhhiiiiiiiiikllllllll]
|
||||
Date: [1111111222222233331233211111222222333211111112122222223111112223311122333]
|
||||
Marks: | | | | | | | | | | |
|
||||
@ -193,7 +198,7 @@ The number of columns in the primary key is not explicitly limited. Depending on
|
||||
|
||||
A long primary key will negatively affect the insert performance and memory consumption, but extra columns in the primary key do not affect ClickHouse performance during `SELECT` queries.
|
||||
|
||||
### Choosing a Primary Key that Differs from the Sorting Key {#choosing-a-primary-key-that-differs-from-the-sorting-key}
|
||||
### Choosing a Primary Key That Differs From the Sorting Key {#choosing-a-primary-key-that-differs-from-the-sorting-key}
|
||||
|
||||
It is possible to specify a primary key (an expression with values that are written in the index file for each mark) that is different from the sorting key (an expression for sorting the rows in data parts). In this case the primary key expression tuple must be a prefix of the sorting key expression tuple.
|
||||
|
||||
@ -202,9 +207,9 @@ This feature is helpful when using the [SummingMergeTree](summingmergetree.md) a
|
||||
|
||||
In this case it makes sense to leave only a few columns in the primary key that will provide efficient range scans and add the remaining dimension columns to the sorting key tuple.
|
||||
|
||||
[ALTER](../../query_language/alter.md) of the sorting key is a lightweight operation because when a new column is simultaneously added to the table and to the sorting key, existing data parts don’t need to be changed. Since the old sorting key is a prefix of the new sorting key and there is no data in the newly added column, the data is sorted by both the old and new sorting keys at the moment of table modification.
|
||||
[ALTER](../../../sql_reference/statements/alter.md) of the sorting key is a lightweight operation because when a new column is simultaneously added to the table and to the sorting key, existing data parts don’t need to be changed. Since the old sorting key is a prefix of the new sorting key and there is no data in the newly added column, the data is sorted by both the old and new sorting keys at the moment of table modification.
|
||||
|
||||
### Use of Indexes and Partitions in Queries {#use-of-indexes-and-partitions-in-queries}
|
||||
### Use Of Indexes and Partitions In Queries {#use-of-indexes-and-partitions-in-queries}
|
||||
|
||||
For `SELECT` queries, ClickHouse analyzes whether an index can be used. An index can be used if the `WHERE/PREWHERE` clause has an expression (as one of the conjunction elements, or entirely) that represents an equality or inequality comparison operation, or if it has `IN` or `LIKE` with a fixed prefix on columns or expressions that are in the primary key or partitioning key, or on certain partially repetitive functions of these columns, or logical relationships of these expressions.
|
||||
|
||||
@ -232,11 +237,11 @@ In the example below, the index can’t be used.
|
||||
SELECT count() FROM table WHERE CounterID = 34 OR URL LIKE '%upyachka%'
|
||||
```
|
||||
|
||||
To check whether ClickHouse can use the index when running a query, use the settings [force\_index\_by\_date](../settings/settings.md#settings-force_index_by_date) and [force\_primary\_key](../settings/settings.md).
|
||||
To check whether ClickHouse can use the index when running a query, use the settings [force\_index\_by\_date](../../../operations/settings/settings.md#settings-force_index_by_date) and [force\_primary\_key](../../../operations/settings/settings.md).
|
||||
|
||||
The key for partitioning by month allows reading only those data blocks which contain dates from the proper range. In this case, the data block may contain data for many dates (up to an entire month). Within a block, data is sorted by primary key, which might not contain the date as the first column. Because of this, using a query with only a date condition that does not specify the primary key prefix will cause more data to be read than for a single date.
|
||||
|
||||
### Use of Index for Partially-Monotonic Primary Keys {#use-of-index-for-partially-monotonic-primary-keys}
|
||||
### Use Of Index For Partially-monotonic Primary Keys {#use-of-index-for-partially-monotonic-primary-keys}
|
||||
|
||||
Consider, for example, the days of the month. They form a [monotonic sequence](https://en.wikipedia.org/wiki/Monotonic_function) for one month, but not monotonic for more extended periods. This is a partially-monotonic sequence. If a user creates the table with partially-monotonic primary key, ClickHouse creates a sparse index as usual. When a user selects data from this kind of table, ClickHouse analyzes the query conditions. If the user wants to get data between two marks of the index and both these marks fall within one month, ClickHouse can use the index in this particular case because it can calculate the distance between the parameters of a query and index marks.
|
||||
|
||||
@ -244,7 +249,7 @@ ClickHouse cannot use an index if the values of the primary key in the query par
|
||||
|
||||
ClickHouse uses this logic not only for days of the month sequences, but for any primary key that represents a partially-monotonic sequence.
|
||||
|
||||
### Data Skipping Indexes (Experimental) {#table_engine-mergetree-data_skipping-indexes}
|
||||
### Data Skipping Indexes (experimental) {#table_engine-mergetree-data_skipping-indexes}
|
||||
|
||||
The index declaration is in the columns section of the `CREATE` query.
|
||||
|
||||
@ -278,7 +283,7 @@ SELECT count() FROM table WHERE s < 'z'
|
||||
SELECT count() FROM table WHERE u64 * i32 == 10 AND u64 * length(s) >= 1234
|
||||
```
|
||||
|
||||
#### Available Types of Indices {#available-types-of-indices}
|
||||
#### Available Types Of Indices {#available-types-of-indices}
|
||||
|
||||
- `minmax`
|
||||
|
||||
@ -307,7 +312,7 @@ SELECT count() FROM table WHERE u64 * i32 == 10 AND u64 * length(s) >= 1234
|
||||
|
||||
Supported data types: `Int*`, `UInt*`, `Float*`, `Enum`, `Date`, `DateTime`, `String`, `FixedString`, `Array`, `LowCardinality`, `Nullable`.
|
||||
|
||||
The following functions can use it: [equals](../../query_language/functions/comparison_functions.md), [notEquals](../../query_language/functions/comparison_functions.md), [in](../../query_language/functions/in_functions.md), [notIn](../../query_language/functions/in_functions.md), [has](../../query_language/functions/array_functions.md).
|
||||
The following functions can use it: [equals](../../../sql_reference/functions/comparison_functions.md), [notEquals](../../../sql_reference/functions/comparison_functions.md), [in](../../../sql_reference/functions/in_functions.md), [notIn](../../../sql_reference/functions/in_functions.md), [has](../../../sql_reference/functions/array_functions.md).
|
||||
|
||||
<!-- -->
|
||||
|
||||
@ -323,24 +328,24 @@ Conditions in the `WHERE` clause contains calls of the functions that operate wi
|
||||
|
||||
The `set` index can be used with all functions. Function subsets for other indexes are shown in the table below.
|
||||
|
||||
| Function (operator) / Index | primary key | minmax | ngrambf\_v1 | tokenbf\_v1 | bloom\_filter |
|
||||
|----------------------------------------------------------------------------------------------------------|-------------|--------|-------------|-------------|---------------|
|
||||
| [equals (=, ==)](../../query_language/functions/comparison_functions.md#function-equals) | ✔ | ✔ | ✔ | ✔ | ✔ |
|
||||
| [notEquals(!=, \<\>)](../../query_language/functions/comparison_functions.md#function-notequals) | ✔ | ✔ | ✔ | ✔ | ✔ |
|
||||
| [like](../../query_language/functions/string_search_functions.md#function-like) | ✔ | ✔ | ✔ | ✗ | ✗ |
|
||||
| [notLike](../../query_language/functions/string_search_functions.md#function-notlike) | ✔ | ✔ | ✔ | ✗ | ✗ |
|
||||
| [startsWith](../../query_language/functions/string_functions.md#startswith) | ✔ | ✔ | ✔ | ✔ | ✗ |
|
||||
| [endsWith](../../query_language/functions/string_functions.md#endswith) | ✗ | ✗ | ✔ | ✔ | ✗ |
|
||||
| [multiSearchAny](../../query_language/functions/string_search_functions.md#function-multisearchany) | ✗ | ✗ | ✔ | ✗ | ✗ |
|
||||
| [in](../../query_language/functions/in_functions.md#in-functions) | ✔ | ✔ | ✔ | ✔ | ✔ |
|
||||
| [notIn](../../query_language/functions/in_functions.md#in-functions) | ✔ | ✔ | ✔ | ✔ | ✔ |
|
||||
| [less (\<)](../../query_language/functions/comparison_functions.md#function-less) | ✔ | ✔ | ✗ | ✗ | ✗ |
|
||||
| [greater (\>)](../../query_language/functions/comparison_functions.md#function-greater) | ✔ | ✔ | ✗ | ✗ | ✗ |
|
||||
| [lessOrEquals (\<=)](../../query_language/functions/comparison_functions.md#function-lessorequals) | ✔ | ✔ | ✗ | ✗ | ✗ |
|
||||
| [greaterOrEquals (\>=)](../../query_language/functions/comparison_functions.md#function-greaterorequals) | ✔ | ✔ | ✗ | ✗ | ✗ |
|
||||
| [empty](../../query_language/functions/array_functions.md#function-empty) | ✔ | ✔ | ✗ | ✗ | ✗ |
|
||||
| [notEmpty](../../query_language/functions/array_functions.md#function-notempty) | ✔ | ✔ | ✗ | ✗ | ✗ |
|
||||
| hasToken | ✗ | ✗ | ✗ | ✔ | ✗ |
|
||||
| Function (operator) / Index | primary key | minmax | ngrambf\_v1 | tokenbf\_v1 | bloom\_filter |
|
||||
|------------------------------------------------------------------|---------|------|---------|---------|---------|
|
||||
| [equals (=, ==)](../../../sql_reference/functions/comparison_functions.md#function-equals) | ✔ | ✔ | ✔ | ✔ | ✔ |
|
||||
| [notEquals(!=, \<\>)](../../../sql_reference/functions/comparison_functions.md#function-notequals) | ✔ | ✔ | ✔ | ✔ | ✔ |
|
||||
| [like](../../../sql_reference/functions/string_search_functions.md#function-like) | ✔ | ✔ | ✔ | ✗ | ✗ |
|
||||
| [notLike](../../../sql_reference/functions/string_search_functions.md#function-notlike) | ✔ | ✔ | ✔ | ✗ | ✗ |
|
||||
| [startsWith](../../../sql_reference/functions/string_functions.md#startswith) | ✔ | ✔ | ✔ | ✔ | ✗ |
|
||||
| [endsWith](../../../sql_reference/functions/string_functions.md#endswith) | ✗ | ✗ | ✔ | ✔ | ✗ |
|
||||
| [multiSearchAny](../../../sql_reference/functions/string_search_functions.md#function-multisearchany) | ✗ | ✗ | ✔ | ✗ | ✗ |
|
||||
| [in](../../../sql_reference/functions/in_functions.md#in-functions) | ✔ | ✔ | ✔ | ✔ | ✔ |
|
||||
| [notIn](../../../sql_reference/functions/in_functions.md#in-functions) | ✔ | ✔ | ✔ | ✔ | ✔ |
|
||||
| [less (\<)](../../../sql_reference/functions/comparison_functions.md#function-less) | ✔ | ✔ | ✗ | ✗ | ✗ |
|
||||
| [greater (\>)](../../../sql_reference/functions/comparison_functions.md#function-greater) | ✔ | ✔ | ✗ | ✗ | ✗ |
|
||||
| [lessOrEquals (\<=)](../../../sql_reference/functions/comparison_functions.md#function-lessorequals) | ✔ | ✔ | ✗ | ✗ | ✗ |
|
||||
| [greaterOrEquals (\>=)](../../../sql_reference/functions/comparison_functions.md#function-greaterorequals) | ✔ | ✔ | ✗ | ✗ | ✗ |
|
||||
| [empty](../../../sql_reference/functions/array_functions.md#function-empty) | ✔ | ✔ | ✗ | ✗ | ✗ |
|
||||
| [notEmpty](../../../sql_reference/functions/array_functions.md#function-notempty) | ✔ | ✔ | ✗ | ✗ | ✗ |
|
||||
| hasToken | ✗ | ✗ | ✗ | ✔ | ✗ |
|
||||
|
||||
Functions with a constant argument that is less than ngram size can’t be used by `ngrambf_v1` for query optimization.
|
||||
|
||||
@ -365,13 +370,13 @@ For concurrent table access, we use multi-versioning. In other words, when a tab
|
||||
|
||||
Reading from a table is automatically parallelized.
|
||||
|
||||
## TTL for Columns and Tables {#table_engine-mergetree-ttl}
|
||||
## TTL For Columns and Tables {#table_engine-mergetree-ttl}
|
||||
|
||||
Determines the lifetime of values.
|
||||
|
||||
The `TTL` clause can be set for the whole table and for each individual column. Table-level TTL can also specify logic of automatic move of data between disks and volumes.
|
||||
|
||||
Expressions must evaluate to [Date](../../data_types/date.md) or [DateTime](../../data_types/datetime.md) data type.
|
||||
Expressions must evaluate to [Date](../../../sql_reference/data_types/date.md) or [DateTime](../../../sql_reference/data_types/datetime.md) data type.
|
||||
|
||||
Example:
|
||||
|
||||
@ -380,7 +385,7 @@ TTL time_column
|
||||
TTL time_column + interval
|
||||
```
|
||||
|
||||
To define `interval`, use [time interval](../../query_language/operators.md#operators-datetime) operators.
|
||||
To define `interval`, use [time interval](../../../sql_reference/operators.md#operators-datetime) operators.
|
||||
|
||||
``` sql
|
||||
TTL date_time + INTERVAL 1 MONTH
|
||||
@ -471,26 +476,24 @@ Data with an expired TTL is removed when ClickHouse merges data parts.
|
||||
|
||||
When ClickHouse sees that data is expired, it performs an off-schedule merge. To control the frequency of such merges, you can set [merge\_with\_ttl\_timeout](#mergetree_setting-merge_with_ttl_timeout). If the value is too low, it will perform many off-schedule merges that may consume a lot of resources.
|
||||
|
||||
If you perform the `SELECT` query between merges, you may get expired data. To avoid it, use the [OPTIMIZE](../../query_language/misc.md#misc_operations-optimize) query before `SELECT`.
|
||||
If you perform the `SELECT` query between merges, you may get expired data. To avoid it, use the [OPTIMIZE](../../../sql_reference/statements/misc.md#misc_operations-optimize) query before `SELECT`.
|
||||
|
||||
[Original article](https://clickhouse.tech/docs/en/operations/table_engines/mergetree/) <!--hide-->
|
||||
|
||||
## Using Multiple Block Devices for Data Storage {#table_engine-mergetree-multiple-volumes}
|
||||
## Using Multiple Block Devices For Data Storage {#table_engine-mergetree-multiple-volumes}
|
||||
|
||||
### Introduction {#introduction}
|
||||
|
||||
`MergeTree` family table engines can store data on multiple block devices. For example, it can be useful when the data of a certain table are implicitly split into “hot” and “cold”. The most recent data is regularly requested but requires only a small amount of space. On the contrary, the fat-tailed historical data is requested rarely. If several disks are available, the “hot” data may be located on fast disks (for example, NVMe SSDs or in memory), while the “cold” data - on relatively slow ones (for example, HDD).
|
||||
|
||||
Data part is the minimum movable unit for `MergeTree`-engine tables. The data belonging to one part are stored on one disk. Data parts can be moved between disks in the background (according to user settings) as well as by means of the [ALTER](../../query_language/alter.md#alter_move-partition) queries.
|
||||
Data part is the minimum movable unit for `MergeTree`-engine tables. The data belonging to one part are stored on one disk. Data parts can be moved between disks in the background (according to user settings) as well as by means of the [ALTER](../../../sql_reference/statements/alter.md#alter_move-partition) queries.
|
||||
|
||||
### Terms {#terms}
|
||||
|
||||
- Disk — Block device mounted to the filesystem.
|
||||
- Default disk — Disk that stores the path specified in the [path](../server_settings/settings.md#server_settings-path) server setting.
|
||||
- Default disk — Disk that stores the path specified in the [path](../../../operations/server_configuration_parameters/settings.md#server_configuration_parameters-path) server setting.
|
||||
- Volume — Ordered set of equal disks (similar to [JBOD](https://en.wikipedia.org/wiki/Non-RAID_drive_architectures)).
|
||||
- Storage policy — Set of volumes and the rules for moving data between them.
|
||||
|
||||
The names given to the described entities can be found in the system tables, [system.storage\_policies](../system_tables.md#system_tables-storage_policies) and [system.disks](../system_tables.md#system_tables-disks). To apply one of the configured storage policies for a table, use the `storage_policy` setting of `MergeTree`-engine family tables.
|
||||
The names given to the described entities can be found in the system tables, [system.storage\_policies](../../../operations/system_tables.md#system_tables-storage_policies) and [system.disks](../../../operations/system_tables.md#system_tables-disks). To apply one of the configured storage policies for a table, use the `storage_policy` setting of `MergeTree`-engine family tables.
|
||||
|
||||
### Configuration {#table_engine-mergetree-multiple-volumes-configure}
|
||||
|
||||
@ -625,9 +628,9 @@ The `default` storage policy implies using only one volume, which consists of on
|
||||
In the case of `MergeTree` tables, data is getting to disk in different ways:
|
||||
|
||||
- As a result of an insert (`INSERT` query).
|
||||
- During background merges and [mutations](../../query_language/alter.md#alter-mutations).
|
||||
- During background merges and [mutations](../../../sql_reference/statements/alter.md#alter-mutations).
|
||||
- When downloading from another replica.
|
||||
- As a result of partition freezing [ALTER TABLE … FREEZE PARTITION](../../query_language/alter.md#alter_freeze-partition).
|
||||
- As a result of partition freezing [ALTER TABLE … FREEZE PARTITION](../../../sql_reference/statements/alter.md#alter_freeze-partition).
|
||||
|
||||
In all these cases except for mutations and partition freezing, a part is stored on a volume and a disk according to the given storage policy:
|
||||
|
||||
@ -637,9 +640,9 @@ In all these cases except for mutations and partition freezing, a part is stored
|
||||
Under the hood, mutations and partition freezing make use of [hard links](https://en.wikipedia.org/wiki/Hard_link). Hard links between different disks are not supported, therefore in such cases the resulting parts are stored on the same disks as the initial ones.
|
||||
|
||||
In the background, parts are moved between volumes on the basis of the amount of free space (`move_factor` parameter) according to the order the volumes are declared in the configuration file.
|
||||
Data is never transferred from the last one and into the first one. One may use system tables [system.part\_log](../system_tables.md#system_tables-part-log) (field `type = MOVE_PART`) and [system.parts](../system_tables.md#system_tables-parts) (fields `path` and `disk`) to monitor background moves. Also, the detailed information can be found in server logs.
|
||||
Data is never transferred from the last one and into the first one. One may use system tables [system.part\_log](../../../operations/system_tables.md#system_tables-part-log) (field `type = MOVE_PART`) and [system.parts](../../../operations/system_tables.md#system_tables-parts) (fields `path` and `disk`) to monitor background moves. Also, the detailed information can be found in server logs.
|
||||
|
||||
User can force moving a part or a partition from one volume to another using the query [ALTER TABLE … MOVE PART\|PARTITION … TO VOLUME\|DISK …](../../query_language/alter.md#alter_move-partition), all the restrictions for background operations are taken into account. The query initiates a move on its own and does not wait for background operations to be completed. User will get an error message if not enough free space is available or if any of the required conditions are not met.
|
||||
User can force moving a part or a partition from one volume to another using the query [ALTER TABLE … MOVE PART\|PARTITION … TO VOLUME\|DISK …](../../../sql_reference/statements/alter.md#alter_move-partition), all the restrictions for background operations are taken into account. The query initiates a move on its own and does not wait for background operations to be completed. User will get an error message if not enough free space is available or if any of the required conditions are not met.
|
||||
|
||||
Moving data does not interfere with data replication. Therefore, different storage policies can be specified for the same table on different replicas.
|
||||
|
@ -1,4 +1,9 @@
|
||||
# ReplacingMergeTree {#replacingmergetree}
|
||||
---
|
||||
toc_priority: 33
|
||||
toc_title: ReplacingMergeTree
|
||||
---
|
||||
|
||||
# Replacingmergetree {#replacingmergetree}
|
||||
|
||||
The engine differs from [MergeTree](mergetree.md#table_engines-mergetree) in that it removes duplicate entries with the same primary key value (or more accurately, with the same [sorting key](mergetree.md) value).
|
||||
|
||||
@ -22,7 +27,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
|
||||
[SETTINGS name=value, ...]
|
||||
```
|
||||
|
||||
For a description of request parameters, see [request description](../../query_language/create.md).
|
||||
For a description of request parameters, see [request description](../../../sql_reference/statements/create.md).
|
||||
|
||||
**ReplacingMergeTree Parameters**
|
||||
|
@ -1,3 +1,8 @@
|
||||
---
|
||||
toc_priority: 31
|
||||
toc_title: Data Replication
|
||||
---
|
||||
|
||||
# Data Replication {#table_engines-replication}
|
||||
|
||||
Replication is only supported for tables in the MergeTree family:
|
||||
@ -14,7 +19,7 @@ Replication works at the level of an individual table, not the entire server. A
|
||||
|
||||
Replication does not depend on sharding. Each shard has its own independent replication.
|
||||
|
||||
Compressed data for `INSERT` and `ALTER` queries is replicated (for more information, see the documentation for [ALTER](../../query_language/alter.md#query_language_queries_alter)).
|
||||
Compressed data for `INSERT` and `ALTER` queries is replicated (for more information, see the documentation for [ALTER](../../../sql_reference/statements/alter.md#query_language_queries_alter)).
|
||||
|
||||
`CREATE`, `DROP`, `ATTACH`, `DETACH` and `RENAME` queries are executed on a single server and are not replicated:
|
||||
|
||||
@ -24,7 +29,7 @@ Compressed data for `INSERT` and `ALTER` queries is replicated (for more informa
|
||||
|
||||
ClickHouse uses [Apache ZooKeeper](https://zookeeper.apache.org) for storing replicas meta information. Use ZooKeeper version 3.4.5 or newer.
|
||||
|
||||
To use replication, set parameters in the [zookeeper](../server_settings/settings.md#server-settings_zookeeper) server configuration section.
|
||||
To use replication, set parameters in the [zookeeper](../../../operations/server_configuration_parameters/settings.md#server-settings_zookeeper) server configuration section.
|
||||
|
||||
!!! attention "Attention"
|
||||
Don’t neglect the security setting. ClickHouse supports the `digest` [ACL scheme](https://zookeeper.apache.org/doc/current/zookeeperProgrammers.html#sc_ZooKeeperAccessControl) of the ZooKeeper security subsystem.
|
||||
@ -52,7 +57,7 @@ You can specify any existing ZooKeeper cluster and the system will use a directo
|
||||
|
||||
If ZooKeeper isn’t set in the config file, you can’t create replicated tables, and any existing replicated tables will be read-only.
|
||||
|
||||
ZooKeeper is not used in `SELECT` queries because replication does not affect the performance of `SELECT` and queries run just as fast as they do for non-replicated tables. When querying distributed replicated tables, ClickHouse behavior is controlled by the settings [max\_replica\_delay\_for\_distributed\_queries](../settings/settings.md#settings-max_replica_delay_for_distributed_queries) and [fallback\_to\_stale\_replicas\_for\_distributed\_queries](../settings/settings.md#settings-fallback_to_stale_replicas_for_distributed_queries).
|
||||
ZooKeeper is not used in `SELECT` queries because replication does not affect the performance of `SELECT` and queries run just as fast as they do for non-replicated tables. When querying distributed replicated tables, ClickHouse behavior is controlled by the settings [max\_replica\_delay\_for\_distributed\_queries](../../../operations/settings/settings.md#settings-max_replica_delay_for_distributed_queries) and [fallback\_to\_stale\_replicas\_for\_distributed\_queries](../../../operations/settings/settings.md#settings-fallback_to_stale_replicas_for_distributed_queries).
|
||||
|
||||
For each `INSERT` query, approximately ten entries are added to ZooKeeper through several transactions. (To be more precise, this is for each inserted block of data; an INSERT query contains one block or one block per `max_insert_block_size = 1048576` rows.) This leads to slightly longer latencies for `INSERT` compared to non-replicated tables. But if you follow the recommendations to insert data in batches of no more than one `INSERT` per second, it doesn’t create any problems. The entire ClickHouse cluster used for coordinating one ZooKeeper cluster has a total of several hundred `INSERTs` per second. The throughput on data inserts (the number of rows per second) is just as high as for non-replicated data.
|
||||
|
||||
@ -64,7 +69,7 @@ By default, an INSERT query waits for confirmation of writing the data from only
|
||||
|
||||
Each block of data is written atomically. The INSERT query is divided into blocks up to `max_insert_block_size = 1048576` rows. In other words, if the `INSERT` query has less than 1048576 rows, it is made atomically.
|
||||
|
||||
Data blocks are deduplicated. For multiple writes of the same data block (data blocks of the same size containing the same rows in the same order), the block is only written once. The reason for this is in case of network failures when the client application doesn’t know if the data was written to the DB, so the `INSERT` query can simply be repeated. It doesn’t matter which replica INSERTs were sent to with identical data. `INSERTs` are idempotent. Deduplication parameters are controlled by [merge\_tree](../server_settings/settings.md#server_settings-merge_tree) server settings.
|
||||
Data blocks are deduplicated. For multiple writes of the same data block (data blocks of the same size containing the same rows in the same order), the block is only written once. The reason for this is in case of network failures when the client application doesn’t know if the data was written to the DB, so the `INSERT` query can simply be repeated. It doesn’t matter which replica INSERTs were sent to with identical data. `INSERTs` are idempotent. Deduplication parameters are controlled by [merge\_tree](../../../operations/server_configuration_parameters/settings.md#server_configuration_parameters-merge_tree) server settings.
|
||||
|
||||
During replication, only the source data to insert is transferred over the network. Further data transformation (merging) is coordinated and performed on all the replicas in the same way. This minimizes network usage, which means that replication works well when replicas reside in different datacenters. (Note that duplicating data in different datacenters is the main goal of replication.)
|
||||
|
||||
@ -181,7 +186,7 @@ An alternative recovery option is to delete information about the lost replica f
|
||||
|
||||
There is no restriction on network bandwidth during recovery. Keep this in mind if you are restoring many replicas at once.
|
||||
|
||||
## Converting from MergeTree to ReplicatedMergeTree {#converting-from-mergetree-to-replicatedmergetree}
|
||||
## Converting From Mergetree To Replicatedmergetree {#converting-from-mergetree-to-replicatedmergetree}
|
||||
|
||||
We use the term `MergeTree` to refer to all table engines in the `MergeTree family`, the same as for `ReplicatedMergeTree`.
|
||||
|
||||
@ -193,7 +198,7 @@ Rename the existing MergeTree table, then create a `ReplicatedMergeTree` table w
|
||||
Move the data from the old table to the `detached` subdirectory inside the directory with the new table data (`/var/lib/clickhouse/data/db_name/table_name/`).
|
||||
Then run `ALTER TABLE ATTACH PARTITION` on one of the replicas to add these data parts to the working set.
|
||||
|
||||
## Converting from ReplicatedMergeTree to MergeTree {#converting-from-replicatedmergetree-to-mergetree}
|
||||
## Converting From Replicatedmergetree To Mergetree {#converting-from-replicatedmergetree-to-mergetree}
|
||||
|
||||
Create a MergeTree table with a different name. Move all the data from the directory with the `ReplicatedMergeTree` table data to the new table’s data directory. Then delete the `ReplicatedMergeTree` table and restart the server.
|
||||
|
||||
@ -204,7 +209,7 @@ If you want to get rid of a `ReplicatedMergeTree` table without launching the se
|
||||
|
||||
After this, you can launch the server, create a `MergeTree` table, move the data to its directory, and then restart the server.
|
||||
|
||||
## Recovery When Metadata in The ZooKeeper Cluster is Lost or Damaged {#recovery-when-metadata-in-the-zookeeper-cluster-is-lost-or-damaged}
|
||||
## Recovery When Metadata In The Zookeeper Cluster Is Lost or Damaged {#recovery-when-metadata-in-the-zookeeper-cluster-is-lost-or-damaged}
|
||||
|
||||
If the data in ZooKeeper was lost or damaged, you can save data by moving it to an unreplicated table as described above.
|
||||
|
@ -1,4 +1,9 @@
|
||||
# SummingMergeTree {#summingmergetree}
|
||||
---
|
||||
toc_priority: 34
|
||||
toc_title: SummingMergeTree
|
||||
---
|
||||
|
||||
# Summingmergetree {#summingmergetree}
|
||||
|
||||
The engine inherits from [MergeTree](mergetree.md#table_engines-mergetree). The difference is that when merging data parts for `SummingMergeTree` tables ClickHouse replaces all the rows with the same primary key (or more accurately, with the same [sorting key](mergetree.md)) with one row which contains summarized values for the columns with the numeric data type. If the sorting key is composed in a way that a single key value corresponds to large number of rows, this significantly reduces storage volume and speeds up data selection.
|
||||
|
||||
@ -19,7 +24,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
|
||||
[SETTINGS name=value, ...]
|
||||
```
|
||||
|
||||
For a description of request parameters, see [request description](../../query_language/create.md).
|
||||
For a description of request parameters, see [request description](../../../sql_reference/statements/create.md).
|
||||
|
||||
**Parameters of SummingMergeTree**
|
||||
|
||||
@ -91,9 +96,9 @@ SELECT key, sum(value) FROM summtt GROUP BY key
|
||||
|
||||
When data are inserted into a table, they are saved as-is. ClickHouse merges the inserted parts of data periodically and this is when rows with the same primary key are summed and replaced with one for each resulting part of data.
|
||||
|
||||
ClickHouse can merge the data parts so that different resulting parts of data can consist of rows with the same primary key, i.e. the summation will be incomplete. Therefore (`SELECT`) an aggregate function [sum()](../../query_language/agg_functions/reference.md#agg_function-sum) and `GROUP BY` clause should be used in a query as described in the example above.
|
||||
ClickHouse can merge the data parts so that different resulting parts of data can consist of rows with the same primary key, i.e. the summation will be incomplete. Therefore (`SELECT`) an aggregate function [sum()](../../../sql_reference/aggregate_functions/reference.md#agg_function-sum) and `GROUP BY` clause should be used in a query as described in the example above.
|
||||
|
||||
### Common rules for summation {#common-rules-for-summation}
|
||||
### Common Rules For Summation {#common-rules-for-summation}
|
||||
|
||||
The values in the columns with the numeric data type are summarized. The set of columns is defined by the parameter `columns`.
|
||||
|
||||
@ -103,9 +108,9 @@ If column is not in the primary key and is not summarized, an arbitrary value is
|
||||
|
||||
The values are not summarized for columns in the primary key.
|
||||
|
||||
### The Summation in the AggregateFunction Columns {#the-summation-in-the-aggregatefunction-columns}
|
||||
### The Summation In the Aggregatefunction Columns {#the-summation-in-the-aggregatefunction-columns}
|
||||
|
||||
For columns of [AggregateFunction type](../../data_types/nested_data_structures/aggregatefunction.md) ClickHouse behaves as [AggregatingMergeTree](aggregatingmergetree.md) engine aggregating according to the function.
|
||||
For columns of [AggregateFunction type](../../../sql_reference/data_types/aggregatefunction.md) ClickHouse behaves as [AggregatingMergeTree](aggregatingmergetree.md) engine aggregating according to the function.
|
||||
|
||||
### Nested Structures {#nested-structures}
|
||||
|
||||
@ -127,7 +132,7 @@ Examples:
|
||||
[(1, 100), (2, 150)] + [(1, -100)] -> [(2, 150)]
|
||||
```
|
||||
|
||||
When requesting data, use the [sumMap(key, value)](../../query_language/agg_functions/reference.md) function for aggregation of `Map`.
|
||||
When requesting data, use the [sumMap(key, value)](../../../sql_reference/aggregate_functions/reference.md) function for aggregation of `Map`.
|
||||
|
||||
For nested data structure, you do not need to specify its columns in the tuple of columns for summation.
|
||||
|
@ -1,4 +1,9 @@
|
||||
# VersionedCollapsingMergeTree {#versionedcollapsingmergetree}
|
||||
---
|
||||
toc_priority: 37
|
||||
toc_title: VersionedCollapsingMergeTree
|
||||
---
|
||||
|
||||
# Versionedcollapsingmergetree {#versionedcollapsingmergetree}
|
||||
|
||||
This engine:
|
||||
|
||||
@ -24,7 +29,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
|
||||
[SETTINGS name=value, ...]
|
||||
```
|
||||
|
||||
For a description of query parameters, see the [query description](../../query_language/create.md).
|
||||
For a description of query parameters, see the [query description](../../../sql_reference/statements/create.md).
|
||||
|
||||
**Engine Parameters**
|
||||
|
||||
@ -136,7 +141,7 @@ The aggregates `count`, `sum` and `avg` can be calculated this way. The aggregat
|
||||
|
||||
If you need to extract the data with “collapsing” but without aggregation (for example, to check whether rows are present whose newest values match certain conditions), you can use the `FINAL` modifier for the `FROM` clause. This approach is inefficient and should not be used with large tables.
|
||||
|
||||
## Example of Use {#example-of-use}
|
||||
## Example Of Use {#example-of-use}
|
||||
|
||||
Example data:
|
||||
|
@ -1,3 +1,8 @@
|
||||
---
|
||||
toc_priority: 45
|
||||
toc_title: Buffer
|
||||
---
|
||||
|
||||
# Buffer {#buffer}
|
||||
|
||||
Buffers the data to write in RAM, periodically flushing it to another table. During the read operation, data is read from the buffer and the other table simultaneously.
|
@ -1,6 +1,11 @@
|
||||
---
|
||||
toc_priority: 35
|
||||
toc_title: Dictionary
|
||||
---
|
||||
|
||||
# Dictionary {#dictionary}
|
||||
|
||||
The `Dictionary` engine displays the [dictionary](../../query_language/dicts/external_dicts.md) data as a ClickHouse table.
|
||||
The `Dictionary` engine displays the [dictionary](../../../sql_reference/dictionaries/external_dictionaries/external_dicts.md) data as a ClickHouse table.
|
||||
|
||||
As an example, consider a dictionary of `products` with the following configuration:
|
||||
|
||||
@ -57,7 +62,7 @@ WHERE name = 'products'
|
||||
└──────────┴──────┴────────┴─────────────────┴─────────────────┴─────────────────┴───────────────┴─────────────────┘
|
||||
```
|
||||
|
||||
You can use the [dictGet\*](../../query_language/functions/ext_dict_functions.md#ext_dict_functions) function to get the dictionary data in this format.
|
||||
You can use the [dictGet\*](../../../sql_reference/functions/ext_dict_functions.md#ext_dict_functions) function to get the dictionary data in this format.
|
||||
|
||||
This view isn’t helpful when you need to get raw data, or when performing a `JOIN` operation. For these cases, you can use the `Dictionary` engine, which displays the dictionary data in a table.
|
||||
|
@ -1,3 +1,8 @@
|
||||
---
|
||||
toc_priority: 33
|
||||
toc_title: Distributed
|
||||
---
|
||||
|
||||
# Distributed {#distributed}
|
||||
|
||||
**Tables with Distributed engine do not store any data by themselves**, but allow distributed query processing on multiple servers.
|
||||
@ -18,7 +23,7 @@ The Distributed engine accepts parameters:
|
||||
See also:
|
||||
|
||||
- `insert_distributed_sync` setting
|
||||
- [MergeTree](mergetree.md#table_engine-mergetree-multiple-volumes) for the examples
|
||||
- [MergeTree](../mergetree_family/mergetree.md#table_engine-mergetree-multiple-volumes) for the examples
|
||||
|
||||
Example:
|
||||
|
||||
@ -79,12 +84,12 @@ Cluster names must not contain dots.
|
||||
The parameters `host`, `port`, and optionally `user`, `password`, `secure`, `compression` are specified for each server:
|
||||
- `host` – The address of the remote server. You can use either the domain or the IPv4 or IPv6 address. If you specify the domain, the server makes a DNS request when it starts, and the result is stored as long as the server is running. If the DNS request fails, the server doesn’t start. If you change the DNS record, restart the server.
|
||||
- `port` – The TCP port for messenger activity (‘tcp\_port’ in the config, usually set to 9000). Do not confuse it with http\_port.
|
||||
- `user` – Name of the user for connecting to a remote server. Default value: default. This user must have access to connect to the specified server. Access is configured in the users.xml file. For more information, see the section [Access rights](../../operations/access_rights.md).
|
||||
- `user` – Name of the user for connecting to a remote server. Default value: default. This user must have access to connect to the specified server. Access is configured in the users.xml file. For more information, see the section [Access rights](../../../operations/access_rights.md).
|
||||
- `password` – The password for connecting to a remote server (not masked). Default value: empty string.
|
||||
- `secure` - Use ssl for connection, usually you also should define `port` = 9440. Server should listen on <tcp_port_secure>9440</tcp_port_secure> and have correct certificates.
|
||||
- `compression` - Use data compression. Default value: true.
|
||||
|
||||
When specifying replicas, one of the available replicas will be selected for each of the shards when reading. You can configure the algorithm for load balancing (the preference for which replica to access) – see the [load\_balancing](../settings/settings.md#settings-load_balancing) setting.
|
||||
When specifying replicas, one of the available replicas will be selected for each of the shards when reading. You can configure the algorithm for load balancing (the preference for which replica to access) – see the [load\_balancing](../../../operations/settings/settings.md#settings-load_balancing) setting.
|
||||
If the connection with the server is not established, there will be an attempt to connect with a short timeout. If the connection failed, the next replica will be selected, and so on for all the replicas. If the connection attempt failed for all the replicas, the attempt will be repeated the same way, several times.
|
||||
This works in favour of resiliency, but does not provide complete fault tolerance: a remote server might accept the connection, but might not work, or work poorly.
|
||||
|
||||
@ -96,7 +101,7 @@ To view your clusters, use the ‘system.clusters’ table.
|
||||
|
||||
The Distributed engine allows working with a cluster like a local server. However, the cluster is inextensible: you must write its configuration in the server config file (even better, for all the cluster’s servers).
|
||||
|
||||
The Distributed engine requires writing clusters to the config file. Clusters from the config file are updated on the fly, without restarting the server. If you need to send a query to an unknown set of shards and replicas each time, you don’t need to create a Distributed table – use the ‘remote’ table function instead. See the section [Table functions](../../query_language/table_functions/index.md).
|
||||
The Distributed engine requires writing clusters to the config file. Clusters from the config file are updated on the fly, without restarting the server. If you need to send a query to an unknown set of shards and replicas each time, you don’t need to create a Distributed table – use the ‘remote’ table function instead. See the section [Table functions](../../../sql_reference/table_functions/index.md).
|
||||
|
||||
There are two methods for writing data to a cluster:
|
||||
|
||||
@ -125,18 +130,18 @@ You should be concerned about the sharding scheme in the following cases:
|
||||
- Queries are used that require joining data (IN or JOIN) by a specific key. If data is sharded by this key, you can use local IN or JOIN instead of GLOBAL IN or GLOBAL JOIN, which is much more efficient.
|
||||
- A large number of servers is used (hundreds or more) with a large number of small queries (queries of individual clients - websites, advertisers, or partners). In order for the small queries to not affect the entire cluster, it makes sense to locate data for a single client on a single shard. Alternatively, as we’ve done in Yandex.Metrica, you can set up bi-level sharding: divide the entire cluster into “layers”, where a layer may consist of multiple shards. Data for a single client is located on a single layer, but shards can be added to a layer as necessary, and data is randomly distributed within them. Distributed tables are created for each layer, and a single shared distributed table is created for global queries.
|
||||
|
||||
Data is written asynchronously. When inserted in the table, the data block is just written to the local file system. The data is sent to the remote servers in the background as soon as possible. The period for sending data is managed by the [distributed\_directory\_monitor\_sleep\_time\_ms](../settings/settings.md#distributed_directory_monitor_sleep_time_ms) and [distributed\_directory\_monitor\_max\_sleep\_time\_ms](../settings/settings.md#distributed_directory_monitor_max_sleep_time_ms) settings. The `Distributed` engine sends each file with inserted data separately, but you can enable batch sending of files with the [distributed\_directory\_monitor\_batch\_inserts](../settings/settings.md#distributed_directory_monitor_batch_inserts) setting. This setting improves cluster performance by better utilizing local server and network resources. You should check whether data is sent successfully by checking the list of files (data waiting to be sent) in the table directory: `/var/lib/clickhouse/data/database/table/`.
|
||||
Data is written asynchronously. When inserted in the table, the data block is just written to the local file system. The data is sent to the remote servers in the background as soon as possible. The period for sending data is managed by the [distributed\_directory\_monitor\_sleep\_time\_ms](../../../operations/settings/settings.md#distributed_directory_monitor_sleep_time_ms) and [distributed\_directory\_monitor\_max\_sleep\_time\_ms](../../../operations/settings/settings.md#distributed_directory_monitor_max_sleep_time_ms) settings. The `Distributed` engine sends each file with inserted data separately, but you can enable batch sending of files with the [distributed\_directory\_monitor\_batch\_inserts](../../../operations/settings/settings.md#distributed_directory_monitor_batch_inserts) setting. This setting improves cluster performance by better utilizing local server and network resources. You should check whether data is sent successfully by checking the list of files (data waiting to be sent) in the table directory: `/var/lib/clickhouse/data/database/table/`.
|
||||
|
||||
If the server ceased to exist or had a rough restart (for example, after a device failure) after an INSERT to a Distributed table, the inserted data might be lost. If a damaged data part is detected in the table directory, it is transferred to the ‘broken’ subdirectory and no longer used.
|
||||
|
||||
When the max\_parallel\_replicas option is enabled, query processing is parallelized across all replicas within a single shard. For more information, see the section [max\_parallel\_replicas](../settings/settings.md#settings-max_parallel_replicas).
|
||||
When the max\_parallel\_replicas option is enabled, query processing is parallelized across all replicas within a single shard. For more information, see the section [max\_parallel\_replicas](../../../operations/settings/settings.md#settings-max_parallel_replicas).
|
||||
|
||||
## Virtual Columns {#virtual-columns}
|
||||
|
||||
- `_shard_num` — Contains the `shard_num` (from `system.clusters`). Type: [UInt32](../../data_types/int_uint.md).
|
||||
- `_shard_num` — Contains the `shard_num` (from `system.clusters`). Type: [UInt32](../../../sql_reference/data_types/int_uint.md).
|
||||
|
||||
!!! note "Note"
|
||||
Since [`remote`](../../query_language/table_functions/remote.md)/`cluster` table functions internally create temporary instance of the same Distributed engine, `_shard_num` is available there too.
|
||||
Since [`remote`](../../../sql_reference/table_functions/remote.md)/`cluster` table functions internally create temporary instance of the same Distributed engine, `_shard_num` is available there too.
|
||||
|
||||
**See Also**
|
||||
|
@ -1,3 +1,8 @@
|
||||
---
|
||||
toc_priority: 34
|
||||
toc_title: External data
|
||||
---
|
||||
|
||||
# External Data for Query Processing {#external-data-for-query-processing}
|
||||
|
||||
ClickHouse allows sending a server the data that is needed for processing a query, together with a SELECT query. This data is put in a temporary table (see the section “Temporary tables”) and can be used in the query (for example, in IN operators).
|
@ -1,7 +1,12 @@
|
||||
---
|
||||
toc_priority: 37
|
||||
toc_title: File
|
||||
---
|
||||
|
||||
# File {#table_engines-file}
|
||||
|
||||
The File table engine keeps the data in a file in one of the supported [file
|
||||
formats](../../interfaces/formats.md#formats) (TabSeparated, Native, etc.).
|
||||
formats](../../../interfaces/formats.md#formats) (TabSeparated, Native, etc.).
|
||||
|
||||
Usage examples:
|
||||
|
||||
@ -9,7 +14,7 @@ Usage examples:
|
||||
- Convert data from one format to another.
|
||||
- Updating data in ClickHouse via editing a file on a disk.
|
||||
|
||||
## Usage in ClickHouse Server {#usage-in-clickhouse-server}
|
||||
## Usage In Clickhouse Server {#usage-in-clickhouse-server}
|
||||
|
||||
``` sql
|
||||
File(Format)
|
||||
@ -18,13 +23,13 @@ File(Format)
|
||||
The `Format` parameter specifies one of the available file formats. To perform
|
||||
`SELECT` queries, the format must be supported for input, and to perform
|
||||
`INSERT` queries – for output. The available formats are listed in the
|
||||
[Formats](../../interfaces/formats.md#formats) section.
|
||||
[Formats](../../../interfaces/formats.md#formats) section.
|
||||
|
||||
ClickHouse does not allow to specify filesystem path for`File`. It will use folder defined by [path](../server_settings/settings.md) setting in server configuration.
|
||||
ClickHouse does not allow to specify filesystem path for`File`. It will use folder defined by [path](../../../operations/server_configuration_parameters/settings.md) setting in server configuration.
|
||||
|
||||
When creating table using `File(Format)` it creates empty subdirectory in that folder. When data is written to that table, it’s put into `data.Format` file in that subdirectory.
|
||||
|
||||
You may manually create this subfolder and file in server filesystem and then [ATTACH](../../query_language/misc.md) it to table information with matching name, so you can query data from that file.
|
||||
You may manually create this subfolder and file in server filesystem and then [ATTACH](../../../sql_reference/statements/misc.md) it to table information with matching name, so you can query data from that file.
|
||||
|
||||
!!! warning "Warning"
|
||||
Be careful with this functionality, because ClickHouse does not keep track of external changes to such files. The result of simultaneous writes via ClickHouse and outside of ClickHouse is undefined.
|
||||
@ -60,16 +65,16 @@ SELECT * FROM file_engine_table
|
||||
└──────┴───────┘
|
||||
```
|
||||
|
||||
## Usage in Clickhouse-local {#usage-in-clickhouse-local}
|
||||
## Usage In Clickhouse-local {#usage-in-clickhouse-local}
|
||||
|
||||
In [clickhouse-local](../utils/clickhouse-local.md) File engine accepts file path in addition to `Format`. Default input/output streams can be specified using numeric or human-readable names like `0` or `stdin`, `1` or `stdout`.
|
||||
In [clickhouse-local](../../../operations/utilities/clickhouse-local.md) File engine accepts file path in addition to `Format`. Default input/output streams can be specified using numeric or human-readable names like `0` or `stdin`, `1` or `stdout`.
|
||||
**Example:**
|
||||
|
||||
``` bash
|
||||
$ echo -e "1,2\n3,4" | clickhouse-local -q "CREATE TABLE table (a Int64, b Int64) ENGINE = File(CSV, stdin); SELECT a, b FROM table; DROP TABLE table"
|
||||
```
|
||||
|
||||
## Details of Implementation {#details-of-implementation}
|
||||
## Details Of Implementation {#details-of-implementation}
|
||||
|
||||
- Multiple `SELECT` queries can be performed concurrently, but `INSERT` queries will wait each other.
|
||||
- Supported creating new file by `INSERT` query.
|
@ -1,4 +1,9 @@
|
||||
# GenerateRandom {#table_engines-generate}
|
||||
---
|
||||
toc_priority: 46
|
||||
toc_title: GenerateRandom
|
||||
---
|
||||
|
||||
# Generaterandom {#table_engines-generate}
|
||||
|
||||
The GenerateRandom table engine produces random data for given table schema.
|
||||
|
||||
@ -7,7 +12,7 @@ Usage examples:
|
||||
- Use in test to populate reproducible large table.
|
||||
- Generate random input for fuzzing tests.
|
||||
|
||||
## Usage in ClickHouse Server {#usage-in-clickhouse-server}
|
||||
## Usage In Clickhouse Server {#usage-in-clickhouse-server}
|
||||
|
||||
``` sql
|
||||
ENGINE = GenerateRandom(random_seed, max_string_length, max_array_length)
|
||||
@ -18,7 +23,7 @@ array columns and strings correspondingly in generated data.
|
||||
|
||||
Generate table engine supports only `SELECT` queries.
|
||||
|
||||
It supports all [DataTypes](../../data_types/index.md) that can be stored in a table except `LowCardinality` and `AggregateFunction`.
|
||||
It supports all [DataTypes](../../../sql_reference/data_types/index.md) that can be stored in a table except `LowCardinality` and `AggregateFunction`.
|
||||
|
||||
**Example:**
|
||||
|
||||
@ -42,7 +47,7 @@ SELECT * FROM generate_engine_table LIMIT 3
|
||||
└──────┴────────────┘
|
||||
```
|
||||
|
||||
## Details of Implementation {#details-of-implementation}
|
||||
## Details Of Implementation {#details-of-implementation}
|
||||
|
||||
- Not supported:
|
||||
- `ALTER`
|
5
docs/en/engines/table_engines/special/index.md
Normal file
5
docs/en/engines/table_engines/special/index.md
Normal file
@ -0,0 +1,5 @@
|
||||
---
|
||||
toc_folder_title: Special
|
||||
toc_priority: 31
|
||||
---
|
||||
|
@ -1,6 +1,11 @@
|
||||
---
|
||||
toc_priority: 40
|
||||
toc_title: Join
|
||||
---
|
||||
|
||||
# Join {#join}
|
||||
|
||||
Prepared data structure for using in [JOIN](../../query_language/select.md#select-join) operations.
|
||||
Prepared data structure for using in [JOIN](../../../sql_reference/statements/select.md#select-join) operations.
|
||||
|
||||
## Creating a Table {#creating-a-table}
|
||||
|
||||
@ -12,12 +17,12 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
|
||||
) ENGINE = Join(join_strictness, join_type, k1[, k2, ...])
|
||||
```
|
||||
|
||||
See the detailed description of the [CREATE TABLE](../../query_language/create.md#create-table-query) query.
|
||||
See the detailed description of the [CREATE TABLE](../../../sql_reference/statements/create.md#create-table-query) query.
|
||||
|
||||
**Engine Parameters**
|
||||
|
||||
- `join_strictness` – [JOIN strictness](../../query_language/select.md#select-join-strictness).
|
||||
- `join_type` – [JOIN type](../../query_language/select.md#select-join-types).
|
||||
- `join_strictness` – [JOIN strictness](../../../sql_reference/statements/select.md#select-join-strictness).
|
||||
- `join_type` – [JOIN type](../../../sql_reference/statements/select.md#select-join-types).
|
||||
- `k1[, k2, ...]` – Key columns from the `USING` clause that the `JOIN` operation is made with.
|
||||
|
||||
Enter `join_strictness` and `join_type` parameters without quotes, for example, `Join(ANY, LEFT, col1)`. They must match the `JOIN` operation that the table will be used for. If the parameters don’t match, ClickHouse doesn’t throw an exception and may return incorrect data.
|
||||
@ -79,21 +84,21 @@ You can use `INSERT` queries to add data to the `Join`-engine tables. If the tab
|
||||
You cannot perform a `SELECT` query directly from the table. Instead, use one of the following methods:
|
||||
|
||||
- Place the table to the right side in a `JOIN` clause.
|
||||
- Call the [joinGet](../../query_language/functions/other_functions.md#joinget) function, which lets you extract data from the table the same way as from a dictionary.
|
||||
- Call the [joinGet](../../../sql_reference/functions/other_functions.md#joinget) function, which lets you extract data from the table the same way as from a dictionary.
|
||||
|
||||
### Limitations and Settings {#join-limitations-and-settings}
|
||||
|
||||
When creating a table, the following settings are applied:
|
||||
|
||||
- [join\_use\_nulls](../settings/settings.md#join_use_nulls)
|
||||
- [max\_rows\_in\_join](../settings/query_complexity.md#settings-max_rows_in_join)
|
||||
- [max\_bytes\_in\_join](../settings/query_complexity.md#settings-max_bytes_in_join)
|
||||
- [join\_overflow\_mode](../settings/query_complexity.md#settings-join_overflow_mode)
|
||||
- [join\_any\_take\_last\_row](../settings/settings.md#settings-join_any_take_last_row)
|
||||
- [join\_use\_nulls](../../../operations/settings/settings.md#join_use_nulls)
|
||||
- [max\_rows\_in\_join](../../../operations/settings/query_complexity.md#settings-max_rows_in_join)
|
||||
- [max\_bytes\_in\_join](../../../operations/settings/query_complexity.md#settings-max_bytes_in_join)
|
||||
- [join\_overflow\_mode](../../../operations/settings/query_complexity.md#settings-join_overflow_mode)
|
||||
- [join\_any\_take\_last\_row](../../../operations/settings/settings.md#settings-join_any_take_last_row)
|
||||
|
||||
The `Join`-engine tables can’t be used in `GLOBAL JOIN` operations.
|
||||
|
||||
The `Join`-engine allows use [join\_use\_nulls](../settings/settings.md#join_use_nulls) setting in the `CREATE TABLE` statement. And [SELECT](../../query_language/select.md) query allows use `join_use_nulls` too. If you have different `join_use_nulls` settings, you can get an error joining table. It depends on kind of JOIN. When you use [joinGet](../../query_language/functions/other_functions.md#joinget) function, you have to use the same `join_use_nulls` setting in `CRATE TABLE` and `SELECT` statements.
|
||||
The `Join`-engine allows use [join\_use\_nulls](../../../operations/settings/settings.md#join_use_nulls) setting in the `CREATE TABLE` statement. And [SELECT](../../../sql_reference/statements/select.md) query allows use `join_use_nulls` too. If you have different `join_use_nulls` settings, you can get an error joining table. It depends on kind of JOIN. When you use [joinGet](../../../sql_reference/functions/other_functions.md#joinget) function, you have to use the same `join_use_nulls` setting in `CRATE TABLE` and `SELECT` statements.
|
||||
|
||||
## Data Storage {#data-storage}
|
||||
|
10
docs/en/engines/table_engines/special/materializedview.md
Normal file
10
docs/en/engines/table_engines/special/materializedview.md
Normal file
@ -0,0 +1,10 @@
|
||||
---
|
||||
toc_priority: 43
|
||||
toc_title: MaterializedView
|
||||
---
|
||||
|
||||
# Materializedview {#materializedview}
|
||||
|
||||
Used for implementing materialized views (for more information, see [CREATE TABLE](../../../sql_reference/statements/create.md)). For storing data, it uses a different engine that was specified when creating the view. When reading from a table, it just uses this engine.
|
||||
|
||||
[Original article](https://clickhouse.tech/docs/en/operations/table_engines/materializedview/) <!--hide-->
|
@ -1,3 +1,8 @@
|
||||
---
|
||||
toc_priority: 44
|
||||
toc_title: Memory
|
||||
---
|
||||
|
||||
# Memory {#memory}
|
||||
|
||||
The Memory engine stores data in RAM, in uncompressed form. Data is stored in exactly the same form as it is received when read. In other words, reading from this table is completely free.
|
@ -1,3 +1,8 @@
|
||||
---
|
||||
toc_priority: 36
|
||||
toc_title: Merge
|
||||
---
|
||||
|
||||
# Merge {#merge}
|
||||
|
||||
The `Merge` engine (not to be confused with `MergeTree`) does not store data itself, but allows reading from any number of other tables simultaneously.
|
||||
@ -52,7 +57,7 @@ FROM WatchLog
|
||||
|
||||
## Virtual Columns {#virtual-columns}
|
||||
|
||||
- `_table` — Contains the name of the table from which data was read. Type: [String](../../data_types/string.md).
|
||||
- `_table` — Contains the name of the table from which data was read. Type: [String](../../../sql_reference/data_types/string.md).
|
||||
|
||||
You can set the constant conditions on `_table` in the `WHERE/PREWHERE` clause (for example, `WHERE _table='xyz'`). In this case the read operation is performed only for that tables where the condition on `_table` is satisfied, so the `_table` column acts as an index.
|
||||
|
@ -1,3 +1,8 @@
|
||||
---
|
||||
toc_priority: 38
|
||||
toc_title: 'Null'
|
||||
---
|
||||
|
||||
# Null {#null}
|
||||
|
||||
When writing to a Null table, data is ignored. When reading from a Null table, the response is empty.
|
@ -1,3 +1,8 @@
|
||||
---
|
||||
toc_priority: 39
|
||||
toc_title: Set
|
||||
---
|
||||
|
||||
# Set {#set}
|
||||
|
||||
A data set that is always in RAM. It is intended for use on the right side of the IN operator (see the section “IN operators”).
|
@ -1,13 +1,18 @@
|
||||
---
|
||||
toc_priority: 41
|
||||
toc_title: URL
|
||||
---
|
||||
|
||||
# URL(URL, Format) {#table_engines-url}
|
||||
|
||||
Manages data on a remote HTTP/HTTPS server. This engine is similar
|
||||
to the [File](file.md) engine.
|
||||
|
||||
## Using the engine in the ClickHouse server {#using-the-engine-in-the-clickhouse-server}
|
||||
## Using the Engine In the Clickhouse Server {#using-the-engine-in-the-clickhouse-server}
|
||||
|
||||
The `format` must be one that ClickHouse can use in
|
||||
`SELECT` queries and, if necessary, in `INSERTs`. For the full list of supported formats, see
|
||||
[Formats](../../interfaces/formats.md#formats).
|
||||
[Formats](../../../interfaces/formats.md#formats).
|
||||
|
||||
The `URL` must conform to the structure of a Uniform Resource Locator. The specified URL must point to a server
|
||||
that uses HTTP or HTTPS. This does not require any
|
||||
@ -17,7 +22,7 @@ additional headers for getting a response from the server.
|
||||
respectively. For processing `POST` requests, the remote server must support
|
||||
[Chunked transfer encoding](https://en.wikipedia.org/wiki/Chunked_transfer_encoding).
|
||||
|
||||
You can limit the maximum number of HTTP GET redirect hops using the [max\_http\_get\_redirects](../settings/settings.md#setting-max_http_get_redirects) setting.
|
||||
You can limit the maximum number of HTTP GET redirect hops using the [max\_http\_get\_redirects](../../../operations/settings/settings.md#setting-max_http_get_redirects) setting.
|
||||
|
||||
**Example:**
|
||||
|
||||
@ -64,7 +69,7 @@ SELECT * FROM url_engine_table
|
||||
└───────┴───────┘
|
||||
```
|
||||
|
||||
## Details of Implementation {#details-of-implementation}
|
||||
## Details Of Implementation {#details-of-implementation}
|
||||
|
||||
- Reads and writes can be parallel
|
||||
- Not supported:
|
@ -1,3 +1,8 @@
|
||||
---
|
||||
toc_priority: 42
|
||||
toc_title: View
|
||||
---
|
||||
|
||||
# View {#table_engines-view}
|
||||
|
||||
Used for implementing views (for more information, see the `CREATE VIEW query`). It does not store data, but only stores the specified `SELECT` query. When reading from a table, it runs this query (and deletes all unnecessary columns from the query).
|
@ -1,3 +1,8 @@
|
||||
---
|
||||
toc_priority: 78
|
||||
toc_title: General Questions
|
||||
---
|
||||
|
||||
# General Questions {#general-questions}
|
||||
|
||||
## Why Not Use Something Like MapReduce? {#why-not-use-something-like-mapreduce}
|
||||
@ -40,7 +45,7 @@ SELECT * FROM table INTO OUTFILE 'file' FORMAT CSV
|
||||
|
||||
### Using a File-Engine Table {#using-a-file-engine-table}
|
||||
|
||||
See [File](../operations/table_engines/file.md).
|
||||
See [File](../engines/table_engines/special/file.md).
|
||||
|
||||
### Using Command-Line Redirection {#using-command-line-redirection}
|
||||
|
||||
|
5
docs/en/faq/index.md
Normal file
5
docs/en/faq/index.md
Normal file
@ -0,0 +1,5 @@
|
||||
---
|
||||
toc_folder_title: F.A.Q.
|
||||
toc_priority: 76
|
||||
---
|
||||
|
@ -1,3 +1,8 @@
|
||||
---
|
||||
toc_priority: 17
|
||||
toc_title: AMPLab Big Data Benchmark
|
||||
---
|
||||
|
||||
# AMPLab Big Data Benchmark {#amplab-big-data-benchmark}
|
||||
|
||||
See https://amplab.cs.berkeley.edu/benchmark/
|
||||
|
@ -1,3 +1,8 @@
|
||||
---
|
||||
toc_priority: 19
|
||||
toc_title: Terabyte Click Logs from Criteo
|
||||
---
|
||||
|
||||
# Terabyte of Click Logs from Criteo {#terabyte-of-click-logs-from-criteo}
|
||||
|
||||
Download the data from http://labs.criteo.com/downloads/download-terabyte-click-logs/
|
||||
|
@ -1,3 +1,9 @@
|
||||
---
|
||||
toc_folder_title: Example Datasets
|
||||
toc_priority: 12
|
||||
toc_title: Introduction
|
||||
---
|
||||
|
||||
# Example Datasets
|
||||
|
||||
This section describes how to obtain example datasets and import them into ClickHouse.
|
||||
|
@ -1,3 +1,8 @@
|
||||
---
|
||||
toc_priority: 21
|
||||
toc_title: Yandex.Metrica Data
|
||||
---
|
||||
|
||||
# Anonymized Yandex.Metrica Data {#anonymized-yandex-metrica-data}
|
||||
|
||||
Dataset consists of two tables containing anonymized data about hits (`hits_v1`) and visits (`visits_v1`) of Yandex.Metrica. You can read more about Yandex.Metrica in [ClickHouse history](../../introduction/history.md) section.
|
||||
|
@ -1,3 +1,8 @@
|
||||
---
|
||||
toc_priority: 16
|
||||
toc_title: New York Taxi Data
|
||||
---
|
||||
|
||||
# New York Taxi Data {#new-york-taxi-data}
|
||||
|
||||
This dataset can be obtained in two ways:
|
||||
@ -375,7 +380,7 @@ We ran queries using a client located in a Yandex datacenter in Finland on a clu
|
||||
## Summary {#summary}
|
||||
|
||||
| servers | Q1 | Q2 | Q3 | Q4 |
|
||||
|---------|-------|-------|-------|-------|
|
||||
|-------|-----|-----|-----|-----|
|
||||
| 1 | 0.490 | 1.224 | 2.104 | 3.593 |
|
||||
| 3 | 0.212 | 0.438 | 0.733 | 1.241 |
|
||||
| 140 | 0.028 | 0.043 | 0.051 | 0.072 |
|
||||
|
@ -1,3 +1,8 @@
|
||||
---
|
||||
toc_priority: 15
|
||||
toc_title: OnTime
|
||||
---
|
||||
|
||||
# OnTime {#ontime}
|
||||
|
||||
This dataset can be obtained in two ways:
|
||||
|
@ -1,3 +1,8 @@
|
||||
---
|
||||
toc_priority: 20
|
||||
toc_title: Star Schema Benchmark
|
||||
---
|
||||
|
||||
# Star Schema Benchmark {#star-schema-benchmark}
|
||||
|
||||
Compiling dbgen:
|
||||
|
@ -1,3 +1,8 @@
|
||||
---
|
||||
toc_priority: 18
|
||||
toc_title: WikiStat
|
||||
---
|
||||
|
||||
# WikiStat {#wikistat}
|
||||
|
||||
See: http://dumps.wikimedia.org/other/pagecounts-raw/
|
||||
|
@ -1,3 +1,10 @@
|
||||
---
|
||||
toc_folder_title: Getting Started
|
||||
toc_hidden: true
|
||||
toc_priority: 8
|
||||
toc_title: hidden
|
||||
---
|
||||
|
||||
# Getting Started {#getting-started}
|
||||
|
||||
If you are new to ClickHouse and want to get a hands-on feeling of its performance, first of all, you need to go through the [installation process](install.md). After that you can:
|
||||
|
@ -1,3 +1,8 @@
|
||||
---
|
||||
toc_priority: 11
|
||||
toc_title: Installation
|
||||
---
|
||||
|
||||
# Installation {#installation}
|
||||
|
||||
## System Requirements {#system-requirements}
|
||||
|
@ -1,3 +1,8 @@
|
||||
---
|
||||
toc_priority: 14
|
||||
toc_title: Playground
|
||||
---
|
||||
|
||||
# ClickHouse Playground {#clickhouse-playground}
|
||||
|
||||
[ClickHouse Playground](https://play.clickhouse.tech?file=welcome) allows people to experiment with ClickHouse by running queries instantly, without setting up their server or cluster.
|
||||
@ -27,7 +32,7 @@ You can make queries to playground using any HTTP client, for example [curl](htt
|
||||
More information about software products that support ClickHouse is available [here](../interfaces/index.md).
|
||||
|
||||
| Parameter | Value |
|
||||
|:----------|:--------------------------------------|
|
||||
|:------|:------------------------|
|
||||
| Endpoint | https://play-api.clickhouse.tech:8443 |
|
||||
| User | `playground` |
|
||||
| Password | `clickhouse` |
|
||||
|
@ -1,3 +1,8 @@
|
||||
---
|
||||
toc_priority: 12
|
||||
toc_title: Tutorial
|
||||
---
|
||||
|
||||
# ClickHouse Tutorial {#clickhouse-tutorial}
|
||||
|
||||
## What to Expect from This Tutorial? {#what-to-expect-from-this-tutorial}
|
||||
@ -99,11 +104,11 @@ As in most databases management systems, ClickHouse logically groups tables into
|
||||
clickhouse-client --query "CREATE DATABASE IF NOT EXISTS tutorial"
|
||||
```
|
||||
|
||||
Syntax for creating tables is way more complicated compared to databases (see [reference](../query_language/create.md). In general `CREATE TABLE` statement has to specify three key things:
|
||||
Syntax for creating tables is way more complicated compared to databases (see [reference](../sql_reference/statements/create.md). In general `CREATE TABLE` statement has to specify three key things:
|
||||
|
||||
1. Name of table to create.
|
||||
2. Table schema, i.e. list of columns and their [data types](../data_types/index.md).
|
||||
3. [Table engine](../operations/table_engines/index.md) and it’s settings, which determines all the details on how queries to this table will be physically executed.
|
||||
2. Table schema, i.e. list of columns and their [data types](../sql_reference/data_types/index.md).
|
||||
3. [Table engine](../engines/table_engines/index.md) and it’s settings, which determines all the details on how queries to this table will be physically executed.
|
||||
|
||||
Yandex.Metrica is a web analytics service, and sample dataset doesn’t cover its full functionality, so there are only two tables to create:
|
||||
|
||||
@ -455,11 +460,11 @@ SETTINGS index_granularity = 8192
|
||||
|
||||
You can execute those queries using the interactive mode of `clickhouse-client` (just launch it in a terminal without specifying a query in advance) or try some [alternative interface](../interfaces/index.md) if you want.
|
||||
|
||||
As we can see, `hits_v1` uses the [basic MergeTree engine](../operations/table_engines/mergetree.md), while the `visits_v1` uses the [Collapsing](../operations/table_engines/collapsingmergetree.md) variant.
|
||||
As we can see, `hits_v1` uses the [basic MergeTree engine](../engines/table_engines/mergetree_family/mergetree.md), while the `visits_v1` uses the [Collapsing](../engines/table_engines/mergetree_family/collapsingmergetree.md) variant.
|
||||
|
||||
### Import Data {#import-data}
|
||||
|
||||
Data import to ClickHouse is done via [INSERT INTO](../query_language/insert_into.md) query like in many other SQL databases. However, data is usually provided in one of the [supported serialization formats](../interfaces/formats.md) instead of `VALUES` clause (which is also supported).
|
||||
Data import to ClickHouse is done via [INSERT INTO](../sql_reference/statements/insert_into.md) query like in many other SQL databases. However, data is usually provided in one of the [supported serialization formats](../interfaces/formats.md) instead of `VALUES` clause (which is also supported).
|
||||
|
||||
The files we downloaded earlier are in tab-separated format, so here’s how to import them via console client:
|
||||
|
||||
@ -524,9 +529,9 @@ ClickHouse cluster is a homogenous cluster. Steps to set up:
|
||||
1. Install ClickHouse server on all machines of the cluster
|
||||
2. Set up cluster configs in configuration files
|
||||
3. Create local tables on each instance
|
||||
4. Create a [Distributed table](../operations/table_engines/distributed.md)
|
||||
4. Create a [Distributed table](../engines/table_engines/special/distributed.md)
|
||||
|
||||
[Distributed table](../operations/table_engines/distributed.md) is actually a kind of “view” to local tables of ClickHouse cluster. SELECT query from a distributed table executes using resources of all cluster’s shards. You may specify configs for multiple clusters and create multiple distributed tables providing views to different clusters.
|
||||
[Distributed table](../engines/table_engines/special/distributed.md) is actually a kind of “view” to local tables of ClickHouse cluster. SELECT query from a distributed table executes using resources of all cluster’s shards. You may specify configs for multiple clusters and create multiple distributed tables providing views to different clusters.
|
||||
|
||||
Example config for a cluster with three shards, one replica each:
|
||||
|
||||
@ -568,16 +573,16 @@ CREATE TABLE tutorial.hits_all AS tutorial.hits_local
|
||||
ENGINE = Distributed(perftest_3shards_1replicas, tutorial, hits_local, rand());
|
||||
```
|
||||
|
||||
A common practice is to create similar Distributed tables on all machines of the cluster. It allows running distributed queries on any machine of the cluster. Also there’s an alternative option to create temporary distributed table for a given SELECT query using [remote](../query_language/table_functions/remote.md) table function.
|
||||
A common practice is to create similar Distributed tables on all machines of the cluster. It allows running distributed queries on any machine of the cluster. Also there’s an alternative option to create temporary distributed table for a given SELECT query using [remote](../sql_reference/table_functions/remote.md) table function.
|
||||
|
||||
Let’s run [INSERT SELECT](../query_language/insert_into.md) into the Distributed table to spread the table to multiple servers.
|
||||
Let’s run [INSERT SELECT](../sql_reference/statements/insert_into.md) into the Distributed table to spread the table to multiple servers.
|
||||
|
||||
``` sql
|
||||
INSERT INTO tutorial.hits_all SELECT * FROM tutorial.hits_v1;
|
||||
```
|
||||
|
||||
!!! warning "Notice"
|
||||
This approach is not suitable for the sharding of large tables. There’s a separate tool [clickhouse-copier](../operations/utils/clickhouse-copier.md) that can re-shard arbitrary large tables.
|
||||
This approach is not suitable for the sharding of large tables. There’s a separate tool [clickhouse-copier](../operations/utilities/clickhouse-copier.md) that can re-shard arbitrary large tables.
|
||||
|
||||
As you could expect, computationally heavy queries run N times faster if they utilize 3 servers instead of one.
|
||||
|
||||
@ -653,7 +658,7 @@ ENGINE = ReplcatedMergeTree(
|
||||
...
|
||||
```
|
||||
|
||||
Here we use [ReplicatedMergeTree](../operations/table_engines/replication.md) table engine. In parameters we specify ZooKeeper path containing shard and replica identifiers.
|
||||
Here we use [ReplicatedMergeTree](../engines/table_engines/mergetree_family/replication.md) table engine. In parameters we specify ZooKeeper path containing shard and replica identifiers.
|
||||
|
||||
``` sql
|
||||
INSERT INTO tutorial.hits_replica SELECT * FROM tutorial.hits_local;
|
||||
|
@ -1,3 +1,8 @@
|
||||
---
|
||||
toc_priority: 41
|
||||
toc_title: Applying CatBoost Models
|
||||
---
|
||||
|
||||
# Applying a Catboost Model in ClickHouse {#applying-catboost-model-in-clickhouse}
|
||||
|
||||
[CatBoost](https://catboost.ai) is a free and open-source gradient boosting library developed at [Yandex](https://yandex.com/company/) for machine learning.
|
||||
@ -109,7 +114,7 @@ FROM amazon_train
|
||||
|
||||
+-count()-+
|
||||
| 65538 |
|
||||
+---------+
|
||||
+-------+
|
||||
```
|
||||
|
||||
## 3. Integrate CatBoost into ClickHouse {#integrate-catboost-into-clickhouse}
|
||||
@ -178,7 +183,7 @@ LIMIT 10
|
||||
```
|
||||
|
||||
!!! note "Note"
|
||||
Function [modelEvaluate](../query_language/functions/other_functions.md#function-modelevaluate) returns tuple with per-class raw predictions for multiclass models.
|
||||
Function [modelEvaluate](../sql_reference/functions/other_functions.md#function-modelevaluate) returns tuple with per-class raw predictions for multiclass models.
|
||||
|
||||
Let’s predict the probability:
|
||||
|
||||
@ -201,7 +206,7 @@ LIMIT 10
|
||||
```
|
||||
|
||||
!!! note "Note"
|
||||
More info about [exp()](../query_language/functions/math_functions.md) function.
|
||||
More info about [exp()](../sql_reference/functions/math_functions.md) function.
|
||||
|
||||
Let’s calculate LogLoss on the sample:
|
||||
|
||||
@ -227,6 +232,6 @@ FROM
|
||||
```
|
||||
|
||||
!!! note "Note"
|
||||
More info about [avg()](../query_language/agg_functions/reference.md#agg_function-avg) and [log()](../query_language/functions/math_functions.md) functions.
|
||||
More info about [avg()](../sql_reference/aggregate_functions/reference.md#agg_function-avg) and [log()](../sql_reference/functions/math_functions.md) functions.
|
||||
|
||||
[Original article](https://clickhouse.tech/docs/en/guides/apply_catboost_model/) <!--hide-->
|
||||
|
@ -1,3 +1,9 @@
|
||||
---
|
||||
toc_folder_title: Guides
|
||||
toc_priority: 38
|
||||
toc_title: Overview
|
||||
---
|
||||
|
||||
# ClickHouse Guides {#clickhouse-guides}
|
||||
|
||||
List of detailed step-by-step instructions that help to solve various tasks using ClickHouse:
|
||||
|
@ -1,3 +1,8 @@
|
||||
---
|
||||
toc_priority: 3
|
||||
toc_title: Overview
|
||||
---
|
||||
|
||||
# What is ClickHouse? {#what-is-clickhouse}
|
||||
|
||||
ClickHouse is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).
|
||||
@ -5,7 +10,7 @@ ClickHouse is a column-oriented database management system (DBMS) for online ana
|
||||
In a “normal” row-oriented DBMS, data is stored in this order:
|
||||
|
||||
| Row | WatchID | JavaEnable | Title | GoodEvent | EventTime |
|
||||
|-----|-------------|------------|--------------------|-----------|---------------------|
|
||||
|---|---------|--------|------------|-------|-------------|
|
||||
| \#0 | 89354350662 | 1 | Investor Relations | 1 | 2016-05-18 05:19:20 |
|
||||
| \#1 | 90329509958 | 0 | Contact us | 1 | 2016-05-18 08:10:20 |
|
||||
| \#2 | 89953706054 | 1 | Mission | 1 | 2016-05-18 07:38:00 |
|
||||
@ -18,7 +23,7 @@ Examples of a row-oriented DBMS are MySQL, Postgres, and MS SQL Server.
|
||||
In a column-oriented DBMS, data is stored like this:
|
||||
|
||||
| Row: | \#0 | \#1 | \#2 | \#N |
|
||||
|-------------|---------------------|---------------------|---------------------|-----|
|
||||
|---------|-------------|-------------|-------------|---|
|
||||
| WatchID: | 89354350662 | 90329509958 | 89953706054 | … |
|
||||
| JavaEnable: | 1 | 0 | 1 | … |
|
||||
| Title: | Investor Relations | Contact us | Mission | … |
|
||||
|
@ -1,3 +1,8 @@
|
||||
---
|
||||
toc_priority: 17
|
||||
toc_title: Command-Line Client
|
||||
---
|
||||
|
||||
# Command-line Client {#command-line-client}
|
||||
|
||||
ClickHouse provides a native command-line client: `clickhouse-client`. The client supports command-line options and configuration files. For more information, see [Configuring](#interfaces_cli_configuration).
|
||||
@ -82,7 +87,7 @@ Format a query as usual, then place the values that you want to pass from the ap
|
||||
```
|
||||
|
||||
- `name` — Placeholder identifier. In the console client it should be used in app parameters as `--param_<name> = value`.
|
||||
- `data type` — [Data type](../data_types/index.md) of the app parameter value. For example, a data structure like `(integer, ('string', integer))` can have the `Tuple(UInt8, Tuple(String, UInt8))` data type (you can also use another [integer](../data_types/int_uint.md) types).
|
||||
- `data type` — [Data type](../sql_reference/data_types/index.md) of the app parameter value. For example, a data structure like `(integer, ('string', integer))` can have the `Tuple(UInt8, Tuple(String, UInt8))` data type (you can also use another [integer](../sql_reference/data_types/int_uint.md) types).
|
||||
|
||||
#### Example {#example}
|
||||
|
||||
|
@ -1,3 +1,8 @@
|
||||
---
|
||||
toc_priority: 24
|
||||
toc_title: C++ Client Library
|
||||
---
|
||||
|
||||
# C++ Client Library {#c-client-library}
|
||||
|
||||
See README at [clickhouse-cpp](https://github.com/ClickHouse/clickhouse-cpp) repository.
|
||||
|
@ -1,3 +1,8 @@
|
||||
---
|
||||
toc_priority: 21
|
||||
toc_title: Input and Output Formats
|
||||
---
|
||||
|
||||
# Formats for Input and Output Data {#formats}
|
||||
|
||||
ClickHouse can accept and return data in various formats. A format supported for input can be used to parse the data provided to `INSERT`s, to perform `SELECT`s from a file-backed table such as File, URL or HDFS, or to read an external dictionary. A format supported for output can be used to arrange the
|
||||
@ -6,7 +11,7 @@ results of a `SELECT`, and to perform `INSERT`s into a file-backed table.
|
||||
The supported formats are:
|
||||
|
||||
| Format | Input | Output |
|
||||
|-----------------------------------------------------------------|-------|--------|
|
||||
|---------------------------------------|-----|------|
|
||||
| [TabSeparated](#tabseparated) | ✔ | ✔ |
|
||||
| [TabSeparatedRaw](#tabseparatedraw) | ✗ | ✔ |
|
||||
| [TabSeparatedWithNames](#tabseparatedwithnames) | ✔ | ✔ |
|
||||
@ -103,9 +108,9 @@ Only a small set of symbols are escaped. You can easily stumble onto a string va
|
||||
|
||||
Arrays are written as a list of comma-separated values in square brackets. Numeric items in the array are formatted as normal. `Date` and `DateTime` types are written in single quotes. Strings are written in single quotes with the same escaping rules as above.
|
||||
|
||||
[NULL](../query_language/syntax.md) is formatted as `\N`.
|
||||
[NULL](../sql_reference/syntax.md) is formatted as `\N`.
|
||||
|
||||
Each element of [Nested](../data_types/nested_data_structures/nested.md) structures is represented as array.
|
||||
Each element of [Nested](../sql_reference/data_types/nested_data_structures/nested.md) structures is represented as array.
|
||||
|
||||
For example:
|
||||
|
||||
@ -325,7 +330,7 @@ SearchPhrase=curtain designs count()=1064
|
||||
SearchPhrase=baku count()=1000
|
||||
```
|
||||
|
||||
[NULL](../query_language/syntax.md) is formatted as `\N`.
|
||||
[NULL](../sql_reference/syntax.md) is formatted as `\N`.
|
||||
|
||||
``` sql
|
||||
SELECT * FROM t_null FORMAT TSKV
|
||||
@ -457,7 +462,7 @@ If the query contains GROUP BY, rows\_before\_limit\_at\_least is the exact numb
|
||||
|
||||
This format is only appropriate for outputting a query result, but not for parsing (retrieving data to insert in a table).
|
||||
|
||||
ClickHouse supports [NULL](../query_language/syntax.md), which is displayed as `null` in the JSON output.
|
||||
ClickHouse supports [NULL](../sql_reference/syntax.md), which is displayed as `null` in the JSON output.
|
||||
|
||||
See also the [JSONEachRow](#jsoneachrow) format.
|
||||
|
||||
@ -534,7 +539,7 @@ ClickHouse ignores spaces between elements and commas after the objects. You can
|
||||
|
||||
**Omitted values processing**
|
||||
|
||||
ClickHouse substitutes omitted values with the default values for the corresponding [data types](../data_types/index.md).
|
||||
ClickHouse substitutes omitted values with the default values for the corresponding [data types](../sql_reference/data_types/index.md).
|
||||
|
||||
If `DEFAULT expr` is specified, ClickHouse uses different substitution rules depending on the [input\_format\_defaults\_for\_omitted\_fields](../operations/settings/settings.md#session_settings-input_format_defaults_for_omitted_fields) setting.
|
||||
|
||||
@ -579,7 +584,7 @@ Unlike the [JSON](#json) format, there is no substitution of invalid UTF-8 seque
|
||||
|
||||
### Usage of Nested Structures {#jsoneachrow-nested}
|
||||
|
||||
If you have a table with [Nested](../data_types/nested_data_structures/nested.md) data type columns, you can insert JSON data with the same structure. Enable this feature with the [input\_format\_import\_nested\_json](../operations/settings/settings.md#settings-input_format_import_nested_json) setting.
|
||||
If you have a table with [Nested](../sql_reference/data_types/nested_data_structures/nested.md) data type columns, you can insert JSON data with the same structure. Enable this feature with the [input\_format\_import\_nested\_json](../operations/settings/settings.md#settings-input_format_import_nested_json) setting.
|
||||
|
||||
For example, consider the following table:
|
||||
|
||||
@ -653,7 +658,7 @@ Outputs data as Unicode-art tables, also using ANSI-escape sequences for setting
|
||||
A full grid of the table is drawn, and each row occupies two lines in the terminal.
|
||||
Each result block is output as a separate table. This is necessary so that blocks can be output without buffering results (buffering would be necessary in order to pre-calculate the visible width of all the values).
|
||||
|
||||
[NULL](../query_language/syntax.md) is output as `ᴺᵁᴸᴸ`.
|
||||
[NULL](../sql_reference/syntax.md) is output as `ᴺᵁᴸᴸ`.
|
||||
|
||||
Example (shown for the [PrettyCompact](#prettycompact) format):
|
||||
|
||||
@ -757,7 +762,7 @@ FixedString is represented simply as a sequence of bytes.
|
||||
|
||||
Array is represented as a varint length (unsigned [LEB128](https://en.wikipedia.org/wiki/LEB128)), followed by successive elements of the array.
|
||||
|
||||
For [NULL](../query_language/syntax.md#null-literal) support, an additional byte containing 1 or 0 is added before each [Nullable](../data_types/nullable.md) value. If 1, then the value is `NULL` and this byte is interpreted as a separate value. If 0, the value after the byte is not `NULL`.
|
||||
For [NULL](../sql_reference/syntax.md#null-literal) support, an additional byte containing 1 or 0 is added before each [Nullable](../sql_reference/data_types/nullable.md) value. If 1, then the value is `NULL` and this byte is interpreted as a separate value. If 0, the value after the byte is not `NULL`.
|
||||
|
||||
## RowBinaryWithNamesAndTypes {#rowbinarywithnamesandtypes}
|
||||
|
||||
@ -769,7 +774,7 @@ Similar to [RowBinary](#rowbinary), but with added header:
|
||||
|
||||
## Values {#data-format-values}
|
||||
|
||||
Prints every row in brackets. Rows are separated by commas. There is no comma after the last row. The values inside the brackets are also comma-separated. Numbers are output in a decimal format without quotes. Arrays are output in square brackets. Strings, dates, and dates with times are output in quotes. Escaping rules and parsing are similar to the [TabSeparated](#tabseparated) format. During formatting, extra spaces aren’t inserted, but during parsing, they are allowed and skipped (except for spaces inside array values, which are not allowed). [NULL](../query_language/syntax.md) is represented as `NULL`.
|
||||
Prints every row in brackets. Rows are separated by commas. There is no comma after the last row. The values inside the brackets are also comma-separated. Numbers are output in a decimal format without quotes. Arrays are output in square brackets. Strings, dates, and dates with times are output in quotes. Escaping rules and parsing are similar to the [TabSeparated](#tabseparated) format. During formatting, extra spaces aren’t inserted, but during parsing, they are allowed and skipped (except for spaces inside array values, which are not allowed). [NULL](../sql_reference/syntax.md) is represented as `NULL`.
|
||||
|
||||
The minimum set of characters that you need to escape when passing data in Values format: single quotes and backslashes.
|
||||
|
||||
@ -781,7 +786,7 @@ See also: [input\_format\_values\_interpret\_expressions](../operations/settings
|
||||
|
||||
Prints each value on a separate line with the column name specified. This format is convenient for printing just one or a few rows if each row consists of a large number of columns.
|
||||
|
||||
[NULL](../query_language/syntax.md) is output as `ᴺᵁᴸᴸ`.
|
||||
[NULL](../sql_reference/syntax.md) is output as `ᴺᵁᴸᴸ`.
|
||||
|
||||
Example:
|
||||
|
||||
@ -960,7 +965,7 @@ message MessageType {
|
||||
```
|
||||
|
||||
ClickHouse tries to find a column named `x.y.z` (or `x_y_z` or `X.y_Z` and so on).
|
||||
Nested messages are suitable to input or output a [nested data structures](../data_types/nested_data_structures/nested.md).
|
||||
Nested messages are suitable for inputting or outputting [nested data structures](../sql_reference/data_types/nested_data_structures/nested.md).
|
||||
|
||||
Default values defined in a protobuf schema like this
|
||||
|
||||
@ -972,7 +977,7 @@ message MessageType {
|
||||
}
|
||||
```
|
||||
|
||||
are not applied; the [table defaults](../query_language/create.md#create-default-values) are used instead of them.
|
||||
are not applied; the [table defaults](../sql_reference/statements/create.md#create-default-values) are used instead of them.
|
||||
|
||||
ClickHouse inputs and outputs protobuf messages in the `length-delimited` format.
|
||||
This means that before every message, its length should be written as a [varint](https://developers.google.com/protocol-buffers/docs/encoding#varints).
|
||||
@ -986,23 +991,23 @@ ClickHouse Avro format supports reading and writing [Avro data files](http://avr
|
||||
|
||||
### Data Types Matching {#data_types-matching}
|
||||
|
||||
The table below shows supported data types and how they match ClickHouse [data types](../data_types/index.md) in `INSERT` and `SELECT` queries.
|
||||
The table below shows supported data types and how they match ClickHouse [data types](../sql_reference/data_types/index.md) in `INSERT` and `SELECT` queries.
|
||||
|
||||
| Avro data type `INSERT` | ClickHouse data type | Avro data type `SELECT` |
|
||||
|---------------------------------------------|-------------------------------------------------------------------------------------------|------------------------------|
|
||||
| `boolean`, `int`, `long`, `float`, `double` | [Int(8\|16\|32)](../data_types/int_uint.md), [UInt(8\|16\|32)](../data_types/int_uint.md) | `int` |
|
||||
| `boolean`, `int`, `long`, `float`, `double` | [Int64](../data_types/int_uint.md), [UInt64](../data_types/int_uint.md) | `long` |
|
||||
| `boolean`, `int`, `long`, `float`, `double` | [Float32](../data_types/float.md) | `float` |
|
||||
| `boolean`, `int`, `long`, `float`, `double` | [Float64](../data_types/float.md) | `double` |
|
||||
| `bytes`, `string`, `fixed`, `enum` | [String](../data_types/string.md) | `bytes` |
|
||||
| `bytes`, `string`, `fixed` | [FixedString(N)](../data_types/fixedstring.md) | `fixed(N)` |
|
||||
| `enum` | [Enum(8\|16)](../data_types/enum.md) | `enum` |
|
||||
| `array(T)` | [Array(T)](../data_types/array.md) | `array(T)` |
|
||||
| `union(null, T)`, `union(T, null)` | [Nullable(T)](../data_types/date.md) | `union(null, T)` |
|
||||
| `null` | [Nullable(Nothing)](../data_types/special_data_types/nothing.md) | `null` |
|
||||
| `int (date)` \* | [Date](../data_types/date.md) | `int (date)` \* |
|
||||
| `long (timestamp-millis)` \* | [DateTime64(3)](../data_types/datetime.md) | `long (timestamp-millis)` \* |
|
||||
| `long (timestamp-micros)` \* | [DateTime64(6)](../data_types/datetime.md) | `long (timestamp-micros)` \* |
|
||||
|---------------------------|-------------------------------------------------------|------------------|
|
||||
| `boolean`, `int`, `long`, `float`, `double` | [Int(8\|16\|32)](../sql_reference/data_types/int_uint.md), [UInt(8\|16\|32)](../sql_reference/data_types/int_uint.md) | `int` |
|
||||
| `boolean`, `int`, `long`, `float`, `double` | [Int64](../sql_reference/data_types/int_uint.md), [UInt64](../sql_reference/data_types/int_uint.md) | `long` |
|
||||
| `boolean`, `int`, `long`, `float`, `double` | [Float32](../sql_reference/data_types/float.md) | `float` |
|
||||
| `boolean`, `int`, `long`, `float`, `double` | [Float64](../sql_reference/data_types/float.md) | `double` |
|
||||
| `bytes`, `string`, `fixed`, `enum` | [String](../sql_reference/data_types/string.md) | `bytes` |
|
||||
| `bytes`, `string`, `fixed` | [FixedString(N)](../sql_reference/data_types/fixedstring.md) | `fixed(N)` |
|
||||
| `enum` | [Enum(8\|16)](../sql_reference/data_types/enum.md) | `enum` |
|
||||
| `array(T)` | [Array(T)](../sql_reference/data_types/array.md) | `array(T)` |
|
||||
| `union(null, T)`, `union(T, null)` | [Nullable(T)](../sql_reference/data_types/date.md) | `union(null, T)` |
|
||||
| `null` | [Nullable(Nothing)](../sql_reference/data_types/special_data_types/nothing.md) | `null` |
|
||||
| `int (date)` \* | [Date](../sql_reference/data_types/date.md) | `int (date)` \* |
|
||||
| `long (timestamp-millis)` \* | [DateTime64(3)](../sql_reference/data_types/datetime.md) | `long (timestamp-millis)` \* |
|
||||
| `long (timestamp-micros)` \* | [DateTime64(6)](../sql_reference/data_types/datetime.md) | `long (timestamp-micros)` \* |
|
||||
|
||||
\* [Avro logical types](http://avro.apache.org/docs/current/spec.html#Logical+Types)
|
||||
|
||||
@ -1056,7 +1061,7 @@ Same as [Avro](#data-format-avro)
|
||||
|
||||
### Usage {#usage}
|
||||
|
||||
To quickly verify schema resolution you can use [kafkacat](https://github.com/edenhill/kafkacat) with [clickhouse-local](../operations/utils/clickhouse-local.md):
|
||||
To quickly verify schema resolution you can use [kafkacat](https://github.com/edenhill/kafkacat) with [clickhouse-local](../operations/utilities/clickhouse-local.md):
|
||||
|
||||
``` bash
|
||||
$ kafkacat -b kafka-broker -C -t topic1 -o beginning -f '%s' -c 3 | clickhouse-local --input-format AvroConfluent --format_avro_schema_registry_url 'http://schema-registry' -S "field1 Int64, field2 String" -q 'select * from table'
|
||||
@ -1065,7 +1070,7 @@ $ kafkacat -b kafka-broker -C -t topic1 -o beginning -f '%s' -c 3 | clickhouse-
|
||||
3 c
|
||||
```
|
||||
|
||||
To use `AvroConfluent` with [Kafka](../operations/table_engines/kafka.md):
|
||||
To use `AvroConfluent` with [Kafka](../engines/table_engines/integrations/kafka.md):
|
||||
|
||||
``` sql
|
||||
CREATE TABLE topic1_stream
|
||||
@ -1094,25 +1099,25 @@ SELECT * FROM topic1_stream;
|
||||
|
||||
### Data Types Matching {#data_types-matching-2}
|
||||
|
||||
The table below shows supported data types and how they match ClickHouse [data types](../data_types/index.md) in `INSERT` and `SELECT` queries.
|
||||
The table below shows supported data types and how they match ClickHouse [data types](../sql_reference/data_types/index.md) in `INSERT` and `SELECT` queries.
|
||||
|
||||
| Parquet data type (`INSERT`) | ClickHouse data type | Parquet data type (`SELECT`) |
|
||||
|------------------------------|---------------------------------------------|------------------------------|
|
||||
| `UINT8`, `BOOL` | [UInt8](../data_types/int_uint.md) | `UINT8` |
|
||||
| `INT8` | [Int8](../data_types/int_uint.md) | `INT8` |
|
||||
| `UINT16` | [UInt16](../data_types/int_uint.md) | `UINT16` |
|
||||
| `INT16` | [Int16](../data_types/int_uint.md) | `INT16` |
|
||||
| `UINT32` | [UInt32](../data_types/int_uint.md) | `UINT32` |
|
||||
| `INT32` | [Int32](../data_types/int_uint.md) | `INT32` |
|
||||
| `UINT64` | [UInt64](../data_types/int_uint.md) | `UINT64` |
|
||||
| `INT64` | [Int64](../data_types/int_uint.md) | `INT64` |
|
||||
| `FLOAT`, `HALF_FLOAT` | [Float32](../data_types/float.md) | `FLOAT` |
|
||||
| `DOUBLE` | [Float64](../data_types/float.md) | `DOUBLE` |
|
||||
| `DATE32` | [Date](../data_types/date.md) | `UINT16` |
|
||||
| `DATE64`, `TIMESTAMP` | [DateTime](../data_types/datetime.md) | `UINT32` |
|
||||
| `STRING`, `BINARY` | [String](../data_types/string.md) | `STRING` |
|
||||
| — | [FixedString](../data_types/fixedstring.md) | `STRING` |
|
||||
| `DECIMAL` | [Decimal](../data_types/decimal.md) | `DECIMAL` |
|
||||
|------------------|---------------------------|------------------|
|
||||
| `UINT8`, `BOOL` | [UInt8](../sql_reference/data_types/int_uint.md) | `UINT8` |
|
||||
| `INT8` | [Int8](../sql_reference/data_types/int_uint.md) | `INT8` |
|
||||
| `UINT16` | [UInt16](../sql_reference/data_types/int_uint.md) | `UINT16` |
|
||||
| `INT16` | [Int16](../sql_reference/data_types/int_uint.md) | `INT16` |
|
||||
| `UINT32` | [UInt32](../sql_reference/data_types/int_uint.md) | `UINT32` |
|
||||
| `INT32` | [Int32](../sql_reference/data_types/int_uint.md) | `INT32` |
|
||||
| `UINT64` | [UInt64](../sql_reference/data_types/int_uint.md) | `UINT64` |
|
||||
| `INT64` | [Int64](../sql_reference/data_types/int_uint.md) | `INT64` |
|
||||
| `FLOAT`, `HALF_FLOAT` | [Float32](../sql_reference/data_types/float.md) | `FLOAT` |
|
||||
| `DOUBLE` | [Float64](../sql_reference/data_types/float.md) | `DOUBLE` |
|
||||
| `DATE32` | [Date](../sql_reference/data_types/date.md) | `UINT16` |
|
||||
| `DATE64`, `TIMESTAMP` | [DateTime](../sql_reference/data_types/datetime.md) | `UINT32` |
|
||||
| `STRING`, `BINARY` | [String](../sql_reference/data_types/string.md) | `STRING` |
|
||||
| — | [FixedString](../sql_reference/data_types/fixedstring.md) | `STRING` |
|
||||
| `DECIMAL` | [Decimal](../sql_reference/data_types/decimal.md) | `DECIMAL` |
|
||||
|
||||
ClickHouse supports configurable precision of `Decimal` type. The `INSERT` query treats the Parquet `DECIMAL` type as the ClickHouse `Decimal128` type.
|
||||
|
||||
@ -1134,7 +1139,7 @@ You can select data from a ClickHouse table and save them into some file in the
|
||||
$ clickhouse-client --query="SELECT * FROM {some_table} FORMAT Parquet" > {some_file.pq}
|
||||
```
|
||||
|
||||
To exchange data with Hadoop, you can use [HDFS table engine](../operations/table_engines/hdfs.md).
|
||||
To exchange data with Hadoop, you can use [HDFS table engine](../engines/table_engines/integrations/hdfs.md).
|
||||
|
||||
## ORC {#data-format-orc}
|
||||
|
||||
@ -1142,24 +1147,24 @@ To exchange data with Hadoop, you can use [HDFS table engine](../operations/tabl
|
||||
|
||||
### Data Types Matching {#data_types-matching-3}
|
||||
|
||||
The table below shows supported data types and how they match ClickHouse [data types](../data_types/index.md) in `INSERT` queries.
|
||||
The table below shows supported data types and how they match ClickHouse [data types](../sql_reference/data_types/index.md) in `INSERT` queries.
|
||||
|
||||
| ORC data type (`INSERT`) | ClickHouse data type |
|
||||
|--------------------------|---------------------------------------|
|
||||
| `UINT8`, `BOOL` | [UInt8](../data_types/int_uint.md) |
|
||||
| `INT8` | [Int8](../data_types/int_uint.md) |
|
||||
| `UINT16` | [UInt16](../data_types/int_uint.md) |
|
||||
| `INT16` | [Int16](../data_types/int_uint.md) |
|
||||
| `UINT32` | [UInt32](../data_types/int_uint.md) |
|
||||
| `INT32` | [Int32](../data_types/int_uint.md) |
|
||||
| `UINT64` | [UInt64](../data_types/int_uint.md) |
|
||||
| `INT64` | [Int64](../data_types/int_uint.md) |
|
||||
| `FLOAT`, `HALF_FLOAT` | [Float32](../data_types/float.md) |
|
||||
| `DOUBLE` | [Float64](../data_types/float.md) |
|
||||
| `DATE32` | [Date](../data_types/date.md) |
|
||||
| `DATE64`, `TIMESTAMP` | [DateTime](../data_types/datetime.md) |
|
||||
| `STRING`, `BINARY` | [String](../data_types/string.md) |
|
||||
| `DECIMAL` | [Decimal](../data_types/decimal.md) |
|
||||
|----------------|-------------------------|
|
||||
| `UINT8`, `BOOL` | [UInt8](../sql_reference/data_types/int_uint.md) |
|
||||
| `INT8` | [Int8](../sql_reference/data_types/int_uint.md) |
|
||||
| `UINT16` | [UInt16](../sql_reference/data_types/int_uint.md) |
|
||||
| `INT16` | [Int16](../sql_reference/data_types/int_uint.md) |
|
||||
| `UINT32` | [UInt32](../sql_reference/data_types/int_uint.md) |
|
||||
| `INT32` | [Int32](../sql_reference/data_types/int_uint.md) |
|
||||
| `UINT64` | [UInt64](../sql_reference/data_types/int_uint.md) |
|
||||
| `INT64` | [Int64](../sql_reference/data_types/int_uint.md) |
|
||||
| `FLOAT`, `HALF_FLOAT` | [Float32](../sql_reference/data_types/float.md) |
|
||||
| `DOUBLE` | [Float64](../sql_reference/data_types/float.md) |
|
||||
| `DATE32` | [Date](../sql_reference/data_types/date.md) |
|
||||
| `DATE64`, `TIMESTAMP` | [DateTime](../sql_reference/data_types/datetime.md) |
|
||||
| `STRING`, `BINARY` | [String](../sql_reference/data_types/string.md) |
|
||||
| `DECIMAL` | [Decimal](../sql_reference/data_types/decimal.md) |
|
||||
|
||||
ClickHouse supports configurable precision of the `Decimal` type. The `INSERT` query treats the ORC `DECIMAL` type as the ClickHouse `Decimal128` type.
|
||||
|
||||
@ -1175,7 +1180,7 @@ You can insert ORC data from a file into ClickHouse table by the following comma
|
||||
$ cat filename.orc | clickhouse-client --query="INSERT INTO some_table FORMAT ORC"
|
||||
```
|
||||
|
||||
To exchange data with Hadoop, you can use [HDFS table engine](../operations/table_engines/hdfs.md).
|
||||
To exchange data with Hadoop, you can use [HDFS table engine](../engines/table_engines/integrations/hdfs.md).
|
||||
|
||||
## Format Schema {#formatschema}
|
||||
|
||||
@ -1191,7 +1196,7 @@ can contain an absolute path or a path relative to the current directory on the
|
||||
If you use the client in the [batch mode](../interfaces/cli.md#cli_usage), the path to the schema must be relative due to security reasons.
|
||||
|
||||
If you input or output data via the [HTTP interface](../interfaces/http.md) the file name specified in the format schema
|
||||
should be located in the directory specified in [format\_schema\_path](../operations/server_settings/settings.md#server_settings-format_schema_path)
|
||||
should be located in the directory specified in [format\_schema\_path](../operations/server_configuration_parameters/settings.md#server_configuration_parameters-format_schema_path)
|
||||
in the server configuration.
|
||||
|
||||
## Skipping Errors {#skippingerrors}
|
||||
|
@ -1,10 +1,15 @@
|
||||
---
|
||||
toc_priority: 19
|
||||
toc_title: HTTP Interface
|
||||
---
|
||||
|
||||
# HTTP Interface {#http-interface}
|
||||
|
||||
The HTTP interface lets you use ClickHouse on any platform from any programming language. We use it for working from Java and Perl, as well as shell scripts. In other departments, the HTTP interface is used from Perl, Python, and Go. The HTTP interface is more limited than the native interface, but it has better compatibility.
|
||||
|
||||
By default, clickhouse-server listens for HTTP on port 8123 (this can be changed in the config).
|
||||
|
||||
If you make a GET / request without parameters, it returns 200 response code and the string which defined in [http\_server\_default\_response](../operations/server_settings/settings.md#server_settings-http_server_default_response) default value “Ok.” (with a line feed at the end)
|
||||
If you make a GET / request without parameters, it returns the 200 response code and the string defined in [http\_server\_default\_response](../operations/server_configuration_parameters/settings.md#server_configuration_parameters-http_server_default_response), which by default is “Ok.” (with a line feed at the end)
|
||||
|
||||
``` bash
|
||||
$ curl 'http://localhost:8123/'
|
||||
|
@ -1,3 +1,9 @@
|
||||
---
|
||||
toc_folder_title: Interfaces
|
||||
toc_priority: 14
|
||||
toc_title: Introduction
|
||||
---
|
||||
|
||||
# Interfaces {#interfaces}
|
||||
|
||||
ClickHouse provides two network interfaces (both can be optionally wrapped in TLS for additional security):
|
||||
|
@ -1,3 +1,8 @@
|
||||
---
|
||||
toc_priority: 22
|
||||
toc_title: JDBC Driver
|
||||
---
|
||||
|
||||
# JDBC Driver {#jdbc-driver}
|
||||
|
||||
- **[Official driver](https://github.com/ClickHouse/clickhouse-jdbc)**
|
||||
|
@ -1,6 +1,11 @@
|
||||
---
|
||||
toc_priority: 20
|
||||
toc_title: MySQL Interface
|
||||
---
|
||||
|
||||
# MySQL interface {#mysql-interface}
|
||||
|
||||
ClickHouse supports MySQL wire protocol. It can be enabled by [mysql\_port](../operations/server_settings/settings.md#server_settings-mysql_port) setting in configuration file:
|
||||
ClickHouse supports MySQL wire protocol. It can be enabled by [mysql\_port](../operations/server_configuration_parameters/settings.md#server_configuration_parameters-mysql_port) setting in configuration file:
|
||||
|
||||
``` xml
|
||||
<mysql_port>9004</mysql_port>
|
||||
|
@ -1,3 +1,8 @@
|
||||
---
|
||||
toc_priority: 23
|
||||
toc_title: ODBC Driver
|
||||
---
|
||||
|
||||
# ODBC Driver {#odbc-driver}
|
||||
|
||||
- [Official driver](https://github.com/ClickHouse/clickhouse-odbc).
|
||||
|
@ -1,5 +1,10 @@
|
||||
---
|
||||
toc_priority: 18
|
||||
toc_title: Native Interface (TCP)
|
||||
---
|
||||
|
||||
# Native Interface (TCP) {#native-interface-tcp}
|
||||
|
||||
The native protocol is used in the [command-line client](cli.md), for inter-server communication during distributed query processing, and also in other C++ programs. Unfortunately, native ClickHouse protocol does not have formal specification yet, but it can be reverse-engineered from ClickHouse source code (starting [around here](https://github.com/ClickHouse/ClickHouse/tree/master/dbms/Client)) and/or by intercepting and analyzing TCP traffic.
|
||||
The native protocol is used in the [command-line client](cli.md), for inter-server communication during distributed query processing, and also in other C++ programs. Unfortunately, native ClickHouse protocol does not have formal specification yet, but it can be reverse-engineered from ClickHouse source code (starting [around here](https://github.com/ClickHouse/ClickHouse/tree/master/src/Client)) and/or by intercepting and analyzing TCP traffic.
|
||||
|
||||
[Original article](https://clickhouse.tech/docs/en/interfaces/tcp/) <!--hide-->
|
||||
|
@ -1,3 +1,8 @@
|
||||
---
|
||||
toc_priority: 26
|
||||
toc_title: Client Libraries
|
||||
---
|
||||
|
||||
# Client Libraries from Third-party Developers {#client-libraries-from-third-party-developers}
|
||||
|
||||
!!! warning "Disclaimer"
|
||||
|
5
docs/en/interfaces/third-party/gui.md
vendored
5
docs/en/interfaces/third-party/gui.md
vendored
@ -1,3 +1,8 @@
|
||||
---
|
||||
toc_priority: 28
|
||||
toc_title: Visual Interfaces
|
||||
---
|
||||
|
||||
# Visual Interfaces from Third-party Developers {#visual-interfaces-from-third-party-developers}
|
||||
|
||||
## Open-Source {#open-source}
|
||||
|
5
docs/en/interfaces/third-party/index.md
vendored
Normal file
5
docs/en/interfaces/third-party/index.md
vendored
Normal file
@ -0,0 +1,5 @@
|
||||
---
|
||||
toc_folder_title: Third-Party
|
||||
toc_priority: 24
|
||||
---
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user