Merge remote-tracking branch 'upstream/master' into HEAD

This commit is contained in:
Anton Popov 2022-03-14 16:28:35 +00:00
commit 36ec379aeb
492 changed files with 8738 additions and 8589 deletions

17
.github/codecov.yml vendored
View File

@ -1,17 +0,0 @@
codecov:
max_report_age: "off"
strict_yaml_branch: "master"
ignore:
- "contrib"
- "docs"
- "benchmark"
- "tests"
- "docker"
- "debian"
- "cmake"
comment: false
github_checks:
annotations: false

View File

@ -1,43 +0,0 @@
# This workflow checks out code, performs an Anchore container image
# vulnerability and compliance scan, and integrates the results with
# GitHub Advanced Security code scanning feature. For more information on
# the Anchore scan action usage and parameters, see
# https://github.com/anchore/scan-action. For more information on
# Anchore container image scanning in general, see
# https://docs.anchore.com.
name: Docker Container Scan (clickhouse-server)
env:
# Force the stdout and stderr streams to be unbuffered
PYTHONUNBUFFERED: 1
"on":
pull_request:
paths:
- docker/server/Dockerfile
- .github/workflows/anchore-analysis.yml
schedule:
- cron: '0 21 * * *'
jobs:
Anchore-Build-Scan:
runs-on: ubuntu-latest
steps:
- name: Checkout the code
uses: actions/checkout@v2
- name: Build the Docker image
run: |
cd docker/server
perl -pi -e 's|=\$version||g' Dockerfile
docker build . --file Dockerfile --tag localbuild/testimage:latest
- name: Run the local Anchore scan action itself with GitHub Advanced Security code scanning integration enabled
uses: anchore/scan-action@v2
id: scan
with:
image: "localbuild/testimage:latest"
acs-report-enable: true
- name: Upload Anchore Scan Report
uses: github/codeql-action/upload-sarif@v1
with:
sarif_file: ${{ steps.scan.outputs.sarif }}

116
.gitmodules vendored
View File

@ -1,6 +1,6 @@
[submodule "contrib/poco"]
path = contrib/poco
url = https://github.com/ClickHouse-Extras/poco.git
url = https://github.com/ClickHouse/poco.git
branch = clickhouse
[submodule "contrib/zstd"]
path = contrib/zstd
@ -10,13 +10,13 @@
url = https://github.com/lz4/lz4.git
[submodule "contrib/librdkafka"]
path = contrib/librdkafka
url = https://github.com/ClickHouse-Extras/librdkafka.git
url = https://github.com/ClickHouse/librdkafka.git
[submodule "contrib/cctz"]
path = contrib/cctz
url = https://github.com/ClickHouse-Extras/cctz.git
url = https://github.com/ClickHouse/cctz.git
[submodule "contrib/zlib-ng"]
path = contrib/zlib-ng
url = https://github.com/ClickHouse-Extras/zlib-ng.git
url = https://github.com/ClickHouse/zlib-ng.git
branch = clickhouse-2.0.x
[submodule "contrib/googletest"]
path = contrib/googletest
@ -32,51 +32,51 @@
url = https://github.com/google/re2.git
[submodule "contrib/llvm"]
path = contrib/llvm
url = https://github.com/ClickHouse-Extras/llvm
url = https://github.com/ClickHouse/llvm
[submodule "contrib/mariadb-connector-c"]
path = contrib/mariadb-connector-c
url = https://github.com/ClickHouse-Extras/mariadb-connector-c.git
url = https://github.com/ClickHouse/mariadb-connector-c.git
[submodule "contrib/jemalloc"]
path = contrib/jemalloc
url = https://github.com/jemalloc/jemalloc.git
[submodule "contrib/unixodbc"]
path = contrib/unixodbc
url = https://github.com/ClickHouse-Extras/UnixODBC.git
url = https://github.com/ClickHouse/UnixODBC.git
[submodule "contrib/protobuf"]
path = contrib/protobuf
url = https://github.com/ClickHouse-Extras/protobuf.git
url = https://github.com/ClickHouse/protobuf.git
branch = v3.13.0.1
[submodule "contrib/boost"]
path = contrib/boost
url = https://github.com/ClickHouse-Extras/boost.git
url = https://github.com/ClickHouse/boost.git
[submodule "contrib/base64"]
path = contrib/base64
url = https://github.com/ClickHouse-Extras/Turbo-Base64.git
url = https://github.com/ClickHouse/Turbo-Base64.git
[submodule "contrib/arrow"]
path = contrib/arrow
url = https://github.com/ClickHouse-Extras/arrow.git
url = https://github.com/ClickHouse/arrow.git
branch = blessed/release-6.0.1
[submodule "contrib/thrift"]
path = contrib/thrift
url = https://github.com/apache/thrift.git
[submodule "contrib/libhdfs3"]
path = contrib/libhdfs3
url = https://github.com/ClickHouse-Extras/libhdfs3.git
url = https://github.com/ClickHouse/libhdfs3.git
[submodule "contrib/libxml2"]
path = contrib/libxml2
url = https://github.com/GNOME/libxml2.git
[submodule "contrib/libgsasl"]
path = contrib/libgsasl
url = https://github.com/ClickHouse-Extras/libgsasl.git
url = https://github.com/ClickHouse/libgsasl.git
[submodule "contrib/libcxx"]
path = contrib/libcxx
url = https://github.com/ClickHouse-Extras/libcxx.git
url = https://github.com/ClickHouse/libcxx.git
[submodule "contrib/libcxxabi"]
path = contrib/libcxxabi
url = https://github.com/ClickHouse-Extras/libcxxabi.git
url = https://github.com/ClickHouse/libcxxabi.git
[submodule "contrib/snappy"]
path = contrib/snappy
url = https://github.com/ClickHouse-Extras/snappy.git
url = https://github.com/ClickHouse/snappy.git
[submodule "contrib/cppkafka"]
path = contrib/cppkafka
url = https://github.com/mfontanini/cppkafka.git
@ -85,95 +85,95 @@
url = https://github.com/google/brotli.git
[submodule "contrib/h3"]
path = contrib/h3
url = https://github.com/ClickHouse-Extras/h3
url = https://github.com/ClickHouse/h3
[submodule "contrib/hyperscan"]
path = contrib/hyperscan
url = https://github.com/ClickHouse-Extras/hyperscan.git
url = https://github.com/ClickHouse/hyperscan.git
[submodule "contrib/libunwind"]
path = contrib/libunwind
url = https://github.com/ClickHouse-Extras/libunwind.git
url = https://github.com/ClickHouse/libunwind.git
[submodule "contrib/simdjson"]
path = contrib/simdjson
url = https://github.com/simdjson/simdjson.git
[submodule "contrib/rapidjson"]
path = contrib/rapidjson
url = https://github.com/ClickHouse-Extras/rapidjson
url = https://github.com/ClickHouse/rapidjson
[submodule "contrib/fastops"]
path = contrib/fastops
url = https://github.com/ClickHouse-Extras/fastops
url = https://github.com/ClickHouse/fastops
[submodule "contrib/orc"]
path = contrib/orc
url = https://github.com/ClickHouse-Extras/orc
url = https://github.com/ClickHouse/orc
[submodule "contrib/sparsehash-c11"]
path = contrib/sparsehash-c11
url = https://github.com/sparsehash/sparsehash-c11.git
[submodule "contrib/grpc"]
path = contrib/grpc
url = https://github.com/ClickHouse-Extras/grpc.git
url = https://github.com/ClickHouse/grpc.git
branch = v1.33.2
[submodule "contrib/aws"]
path = contrib/aws
url = https://github.com/ClickHouse-Extras/aws-sdk-cpp.git
url = https://github.com/ClickHouse/aws-sdk-cpp.git
[submodule "aws-c-event-stream"]
path = contrib/aws-c-event-stream
url = https://github.com/ClickHouse-Extras/aws-c-event-stream.git
url = https://github.com/ClickHouse/aws-c-event-stream.git
[submodule "aws-c-common"]
path = contrib/aws-c-common
url = https://github.com/ClickHouse-Extras/aws-c-common.git
url = https://github.com/ClickHouse/aws-c-common.git
[submodule "aws-checksums"]
path = contrib/aws-checksums
url = https://github.com/ClickHouse-Extras/aws-checksums.git
url = https://github.com/ClickHouse/aws-checksums.git
[submodule "contrib/curl"]
path = contrib/curl
url = https://github.com/curl/curl.git
[submodule "contrib/icudata"]
path = contrib/icudata
url = https://github.com/ClickHouse-Extras/icudata.git
url = https://github.com/ClickHouse/icudata.git
[submodule "contrib/icu"]
path = contrib/icu
url = https://github.com/unicode-org/icu.git
[submodule "contrib/flatbuffers"]
path = contrib/flatbuffers
url = https://github.com/ClickHouse-Extras/flatbuffers.git
url = https://github.com/ClickHouse/flatbuffers.git
[submodule "contrib/replxx"]
path = contrib/replxx
url = https://github.com/ClickHouse-Extras/replxx.git
url = https://github.com/ClickHouse/replxx.git
[submodule "contrib/avro"]
path = contrib/avro
url = https://github.com/ClickHouse-Extras/avro.git
url = https://github.com/ClickHouse/avro.git
ignore = untracked
[submodule "contrib/msgpack-c"]
path = contrib/msgpack-c
url = https://github.com/msgpack/msgpack-c
[submodule "contrib/libcpuid"]
path = contrib/libcpuid
url = https://github.com/ClickHouse-Extras/libcpuid.git
url = https://github.com/ClickHouse/libcpuid.git
[submodule "contrib/openldap"]
path = contrib/openldap
url = https://github.com/ClickHouse-Extras/openldap.git
url = https://github.com/ClickHouse/openldap.git
[submodule "contrib/AMQP-CPP"]
path = contrib/AMQP-CPP
url = https://github.com/ClickHouse-Extras/AMQP-CPP.git
url = https://github.com/ClickHouse/AMQP-CPP.git
[submodule "contrib/cassandra"]
path = contrib/cassandra
url = https://github.com/ClickHouse-Extras/cpp-driver.git
url = https://github.com/ClickHouse/cpp-driver.git
branch = clickhouse
[submodule "contrib/libuv"]
path = contrib/libuv
url = https://github.com/ClickHouse-Extras/libuv.git
url = https://github.com/ClickHouse/libuv.git
branch = clickhouse
[submodule "contrib/fmtlib"]
path = contrib/fmtlib
url = https://github.com/fmtlib/fmt.git
[submodule "contrib/sentry-native"]
path = contrib/sentry-native
url = https://github.com/ClickHouse-Extras/sentry-native.git
url = https://github.com/ClickHouse/sentry-native.git
[submodule "contrib/krb5"]
path = contrib/krb5
url = https://github.com/ClickHouse-Extras/krb5
url = https://github.com/ClickHouse/krb5
[submodule "contrib/cyrus-sasl"]
path = contrib/cyrus-sasl
url = https://github.com/ClickHouse-Extras/cyrus-sasl
url = https://github.com/ClickHouse/cyrus-sasl
branch = cyrus-sasl-2.1
[submodule "contrib/croaring"]
path = contrib/croaring
@ -184,7 +184,7 @@
url = https://github.com/danlark1/miniselect
[submodule "contrib/rocksdb"]
path = contrib/rocksdb
url = https://github.com/ClickHouse-Extras/rocksdb.git
url = https://github.com/ClickHouse/rocksdb.git
[submodule "contrib/xz"]
path = contrib/xz
url = https://github.com/xz-mirror/xz
@ -194,53 +194,53 @@
branch = lts_2021_11_02
[submodule "contrib/dragonbox"]
path = contrib/dragonbox
url = https://github.com/ClickHouse-Extras/dragonbox.git
url = https://github.com/ClickHouse/dragonbox.git
[submodule "contrib/fast_float"]
path = contrib/fast_float
url = https://github.com/fastfloat/fast_float
[submodule "contrib/libpq"]
path = contrib/libpq
url = https://github.com/ClickHouse-Extras/libpq
url = https://github.com/ClickHouse/libpq
[submodule "contrib/boringssl"]
path = contrib/boringssl
url = https://github.com/ClickHouse-Extras/boringssl.git
url = https://github.com/ClickHouse/boringssl.git
branch = MergeWithUpstream
[submodule "contrib/NuRaft"]
path = contrib/NuRaft
url = https://github.com/ClickHouse-Extras/NuRaft.git
url = https://github.com/ClickHouse/NuRaft.git
[submodule "contrib/nanodbc"]
path = contrib/nanodbc
url = https://github.com/ClickHouse-Extras/nanodbc.git
url = https://github.com/ClickHouse/nanodbc.git
[submodule "contrib/datasketches-cpp"]
path = contrib/datasketches-cpp
url = https://github.com/ClickHouse-Extras/datasketches-cpp.git
url = https://github.com/ClickHouse/datasketches-cpp.git
[submodule "contrib/yaml-cpp"]
path = contrib/yaml-cpp
url = https://github.com/ClickHouse-Extras/yaml-cpp.git
url = https://github.com/ClickHouse/yaml-cpp.git
[submodule "contrib/cld2"]
path = contrib/cld2
url = https://github.com/ClickHouse-Extras/cld2.git
url = https://github.com/ClickHouse/cld2.git
[submodule "contrib/libstemmer_c"]
path = contrib/libstemmer_c
url = https://github.com/ClickHouse-Extras/libstemmer_c.git
url = https://github.com/ClickHouse/libstemmer_c.git
[submodule "contrib/wordnet-blast"]
path = contrib/wordnet-blast
url = https://github.com/ClickHouse-Extras/wordnet-blast.git
url = https://github.com/ClickHouse/wordnet-blast.git
[submodule "contrib/lemmagen-c"]
path = contrib/lemmagen-c
url = https://github.com/ClickHouse-Extras/lemmagen-c.git
url = https://github.com/ClickHouse/lemmagen-c.git
[submodule "contrib/libpqxx"]
path = contrib/libpqxx
url = https://github.com/ClickHouse-Extras/libpqxx.git
url = https://github.com/ClickHouse/libpqxx.git
[submodule "contrib/sqlite-amalgamation"]
path = contrib/sqlite-amalgamation
url = https://github.com/azadkuh/sqlite-amalgamation
[submodule "contrib/s2geometry"]
path = contrib/s2geometry
url = https://github.com/ClickHouse-Extras/s2geometry.git
url = https://github.com/ClickHouse/s2geometry.git
[submodule "contrib/bzip2"]
path = contrib/bzip2
url = https://github.com/ClickHouse-Extras/bzip2.git
url = https://github.com/ClickHouse/bzip2.git
[submodule "contrib/magic_enum"]
path = contrib/magic_enum
url = https://github.com/Neargye/magic_enum
@ -249,16 +249,16 @@
url = https://github.com/google/libprotobuf-mutator
[submodule "contrib/sysroot"]
path = contrib/sysroot
url = https://github.com/ClickHouse-Extras/sysroot.git
url = https://github.com/ClickHouse/sysroot.git
[submodule "contrib/nlp-data"]
path = contrib/nlp-data
url = https://github.com/ClickHouse-Extras/nlp-data.git
url = https://github.com/ClickHouse/nlp-data.git
[submodule "contrib/hive-metastore"]
path = contrib/hive-metastore
url = https://github.com/ClickHouse-Extras/hive-metastore
url = https://github.com/ClickHouse/hive-metastore
[submodule "contrib/azure"]
path = contrib/azure
url = https://github.com/ClickHouse-Extras/azure-sdk-for-cpp.git
url = https://github.com/ClickHouse/azure-sdk-for-cpp.git
[submodule "contrib/minizip-ng"]
path = contrib/minizip-ng
url = https://github.com/zlib-ng/minizip-ng

View File

@ -248,7 +248,9 @@ endif()
if (CMAKE_BUILD_TYPE_UC STREQUAL "DEBUG")
set(USE_DEBUG_HELPERS ON)
endif()
option(USE_DEBUG_HELPERS "Enable debug helpers" ${USE_DEBUG_HELPERS})
option(BUILD_STANDALONE_KEEPER "Build keeper as small standalone binary" OFF)
# Create BuildID when using lld. For other linkers it is created by default.
if (LINKER_NAME MATCHES "lld$")
@ -263,6 +265,11 @@ if (OBJCOPY_PATH AND YANDEX_OFFICIAL_BUILD AND (NOT CMAKE_TOOLCHAIN_FILE))
set (USE_BINARY_HASH 1)
endif ()
# Allows to build stripped binary in a separate directory
if (OBJCOPY_PATH AND READELF_PATH)
set(BUILD_STRIPPED_BINARIES_PREFIX "" CACHE STRING "Build stripped binaries with debug info in separate directory")
endif()
cmake_host_system_information(RESULT AVAILABLE_PHYSICAL_MEMORY QUERY AVAILABLE_PHYSICAL_MEMORY) # Not available under freebsd
@ -285,8 +292,13 @@ include(cmake/cpu_features.cmake)
set (COMPILER_FLAGS "${COMPILER_FLAGS} -fasynchronous-unwind-tables")
# Reproducible builds
set (COMPILER_FLAGS "${COMPILER_FLAGS} -ffile-prefix-map=${CMAKE_SOURCE_DIR}=.")
set (CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} -ffile-prefix-map=${CMAKE_SOURCE_DIR}=.")
# If turned `ON`, remap file source paths in debug info, predefined preprocessor macros and __builtin_FILE().
option(ENABLE_BUILD_PATH_MAPPING "Enable remap file source paths in debug info, predefined preprocessor macros and __builtin_FILE(). It's to generate reproducible builds. See https://reproducible-builds.org/docs/build-path" ON)
if (ENABLE_BUILD_PATH_MAPPING)
set (COMPILER_FLAGS "${COMPILER_FLAGS} -ffile-prefix-map=${CMAKE_SOURCE_DIR}=.")
set (CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} -ffile-prefix-map=${CMAKE_SOURCE_DIR}=.")
endif()
if (${CMAKE_VERSION} VERSION_LESS "3.12.4")
# CMake < 3.12 doesn't support setting 20 as a C++ standard version.

View File

@ -18,7 +18,7 @@
#include "Common/config_version.h"
#include <Common/config.h>
#if USE_SENTRY
#if USE_SENTRY && !defined(KEEPER_STANDALONE_BUILD)
# include <sentry.h>
# include <stdio.h>

View File

@ -1,5 +1,13 @@
include("${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake")
add_headers_and_sources(loggers .)
# Standard version depends on DBMS and works with text log
add_library(loggers ${loggers_sources} ${loggers_headers})
target_compile_definitions(loggers PUBLIC WITH_TEXT_LOG=1)
target_link_libraries(loggers PRIVATE dbms clickhouse_common_io)
target_include_directories(loggers PUBLIC ..)
# Lightweight version doesn't work with textlog and also doesn't depend on DBMS
add_library(loggers_no_text_log ${loggers_sources} ${loggers_headers})
target_link_libraries(loggers_no_text_log PRIVATE clickhouse_common_io)
target_include_directories(loggers PUBLIC ..)

View File

@ -9,7 +9,11 @@
#include <Poco/ConsoleChannel.h>
#include <Poco/Logger.h>
#include <Poco/Net/RemoteSyslogChannel.h>
#include <Interpreters/TextLog.h>
#ifdef WITH_TEXT_LOG
#include <Interpreters/TextLog.h>
#endif
#include <filesystem>
namespace fs = std::filesystem;
@ -30,17 +34,21 @@ static std::string createDirectory(const std::string & file)
return path;
};
#ifdef WITH_TEXT_LOG
void Loggers::setTextLog(std::shared_ptr<DB::TextLog> log, int max_priority)
{
text_log = log;
text_log_max_priority = max_priority;
}
#endif
void Loggers::buildLoggers(Poco::Util::AbstractConfiguration & config, Poco::Logger & logger /*_root*/, const std::string & cmd_name)
{
#ifdef WITH_TEXT_LOG
if (split)
if (auto log = text_log.lock())
split->addTextLog(log, text_log_max_priority);
#endif
auto current_logger = config.getString("logger", "");
if (config_logger == current_logger) //-V1051

View File

@ -7,10 +7,12 @@
#include <Poco/Util/Application.h>
#include "OwnSplitChannel.h"
#ifdef WITH_TEXT_LOG
namespace DB
{
class TextLog;
}
#endif
namespace Poco::Util
{
@ -27,7 +29,9 @@ public:
/// Close log files. On next log write files will be reopened.
void closeLogs(Poco::Logger & logger);
#ifdef WITH_TEXT_LOG
void setTextLog(std::shared_ptr<DB::TextLog> log, int max_priority);
#endif
private:
Poco::AutoPtr<Poco::FileChannel> log_file;
@ -37,8 +41,10 @@ private:
/// Previous value of logger element in config. It is used to reinitialize loggers whenever the value changed.
std::string config_logger;
#ifdef WITH_TEXT_LOG
std::weak_ptr<DB::TextLog> text_log;
int text_log_max_priority = -1;
#endif
Poco::AutoPtr<DB::OwnSplitChannel> split;
};

View File

@ -20,10 +20,13 @@ namespace DB
{
void OwnSplitChannel::log(const Poco::Message & msg)
{
#ifdef WITH_TEXT_LOG
auto logs_queue = CurrentThread::getInternalTextLogsQueue();
if (channels.empty() && (logs_queue == nullptr || msg.getPriority() > logs_queue->max_priority))
return;
#endif
if (auto * masker = SensitiveDataMasker::getInstance())
{
@ -86,6 +89,7 @@ void OwnSplitChannel::logSplit(const Poco::Message & msg)
channel.first->log(msg); // ordinary child
}
#ifdef WITH_TEXT_LOG
auto logs_queue = CurrentThread::getInternalTextLogsQueue();
/// Log to "TCP queue" if message is not too noisy
@ -137,6 +141,7 @@ void OwnSplitChannel::logSplit(const Poco::Message & msg)
if (text_log_locked)
text_log_locked->add(elem);
}
#endif
}
@ -145,12 +150,14 @@ void OwnSplitChannel::addChannel(Poco::AutoPtr<Poco::Channel> channel, const std
channels.emplace(name, ExtendedChannelPtrPair(std::move(channel), dynamic_cast<ExtendedLogChannel *>(channel.get())));
}
#ifdef WITH_TEXT_LOG
void OwnSplitChannel::addTextLog(std::shared_ptr<DB::TextLog> log, int max_priority)
{
std::lock_guard<std::mutex> lock(text_log_mutex);
text_log = log;
text_log_max_priority.store(max_priority, std::memory_order_relaxed);
}
#endif
void OwnSplitChannel::setLevel(const std::string & name, int level)
{

View File

@ -7,10 +7,12 @@
#include <Poco/Channel.h>
#include "ExtendedLogChannel.h"
#ifdef WITH_TEXT_LOG
namespace DB
{
class TextLog;
}
#endif
namespace DB
{
@ -25,7 +27,9 @@ public:
/// Adds a child channel
void addChannel(Poco::AutoPtr<Poco::Channel> channel, const std::string & name);
#ifdef WITH_TEXT_LOG
void addTextLog(std::shared_ptr<DB::TextLog> log, int max_priority);
#endif
void setLevel(const std::string & name, int level);
@ -40,8 +44,10 @@ private:
std::mutex text_log_mutex;
#ifdef WITH_TEXT_LOG
std::weak_ptr<DB::TextLog> text_log;
std::atomic<int> text_log_max_priority = -1;
#endif
};
}

25
cmake/strip.sh Executable file
View File

@ -0,0 +1,25 @@
#!/usr/bin/env bash
BINARY_PATH=$1
BINARY_NAME=$(basename $BINARY_PATH)
DESTINATION_STRIPPED_DIR=$2
OBJCOPY_PATH=${3:objcopy}
READELF_PATH=${4:readelf}
BUILD_ID=$($READELF_PATH -n $1 | sed -n '/Build ID/ { s/.*: //p; q; }')
BUILD_ID_PREFIX=${BUILD_ID:0:2}
BUILD_ID_SUFFIX=${BUILD_ID:2}
TEMP_BINARY_PATH="${BINARY_PATH}_temp"
DESTINATION_DEBUG_INFO_DIR="$DESTINATION_STRIPPED_DIR/lib/debug/.build-id"
DESTINATION_STRIP_BINARY_DIR="$DESTINATION_STRIPPED_DIR/bin"
mkdir -p "$DESTINATION_DEBUG_INFO_DIR/$BUILD_ID_PREFIX"
mkdir -p "$DESTINATION_STRIP_BINARY_DIR"
$OBJCOPY_PATH --only-keep-debug "$BINARY_PATH" "$DESTINATION_DEBUG_INFO_DIR/$BUILD_ID_PREFIX/$BUILD_ID_SUFFIX.debug"
touch "$TEMP_BINARY_PATH"
$OBJCOPY_PATH --add-gnu-debuglink "$DESTINATION_DEBUG_INFO_DIR/$BUILD_ID_PREFIX/$BUILD_ID_SUFFIX.debug" "$BINARY_PATH" "$TEMP_BINARY_PATH"
$OBJCOPY_PATH --strip-all "$TEMP_BINARY_PATH" "$DESTINATION_STRIP_BINARY_DIR/$BINARY_NAME"
rm -f "$TEMP_BINARY_PATH"

26
cmake/strip_binary.cmake Normal file
View File

@ -0,0 +1,26 @@
macro(clickhouse_strip_binary)
set(oneValueArgs TARGET DESTINATION_DIR BINARY_PATH)
cmake_parse_arguments(STRIP "" "${oneValueArgs}" "" ${ARGN})
if (NOT DEFINED STRIP_TARGET)
message(FATAL_ERROR "A target name must be provided for stripping binary")
endif()
if (NOT DEFINED STRIP_BINARY_PATH)
message(FATAL_ERROR "A binary path name must be provided for stripping binary")
endif()
if (NOT DEFINED STRIP_DESTINATION_DIR)
message(FATAL_ERROR "Destination directory for stripped binary must be provided")
endif()
add_custom_command(TARGET ${STRIP_TARGET} POST_BUILD
COMMAND bash ${ClickHouse_SOURCE_DIR}/cmake/strip.sh ${STRIP_BINARY_PATH} ${STRIP_DESTINATION_DIR} ${OBJCOPY_PATH} ${READELF_PATH}
COMMENT "Stripping clickhouse binary" VERBATIM
)
install(PROGRAMS ${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET} DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse)
install(DIRECTORY ${STRIP_DESTINATION_DIR}/lib/debug DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT clickhouse)
endmacro()

View File

@ -169,3 +169,33 @@ if (OBJCOPY_PATH)
else ()
message (FATAL_ERROR "Cannot find objcopy.")
endif ()
# Readelf (FIXME copypaste)
if (COMPILER_GCC)
find_program (READELF_PATH NAMES "llvm-readelf" "llvm-readelf-13" "llvm-readelf-12" "llvm-readelf-11" "readelf")
else ()
find_program (READELF_PATH NAMES "llvm-readelf-${COMPILER_VERSION_MAJOR}" "llvm-readelf" "readelf")
endif ()
if (NOT READELF_PATH AND OS_DARWIN)
find_program (BREW_PATH NAMES "brew")
if (BREW_PATH)
execute_process (COMMAND ${BREW_PATH} --prefix llvm ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE LLVM_PREFIX)
if (LLVM_PREFIX)
find_program (READELF_PATH NAMES "llvm-readelf" PATHS "${LLVM_PREFIX}/bin" NO_DEFAULT_PATH)
endif ()
if (NOT READELF_PATH)
execute_process (COMMAND ${BREW_PATH} --prefix binutils ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE BINUTILS_PREFIX)
if (BINUTILS_PREFIX)
find_program (READELF_PATH NAMES "readelf" PATHS "${BINUTILS_PREFIX}/bin" NO_DEFAULT_PATH)
endif ()
endif ()
endif ()
endif ()
if (READELF_PATH)
message (STATUS "Using readelf: ${READELF_PATH}")
else ()
message (FATAL_ERROR "Cannot find readelf.")
endif ()

2
contrib/icu vendored

@ -1 +1 @@
Subproject commit faa2f9f9e1fe74c5ed00eba371d2830134cdbea1
Subproject commit a56dde820dc35665a66f2e9ee8ba58e75049b668

View File

@ -212,7 +212,9 @@ set(ICUUC_SOURCES
"${ICU_SOURCE_DIR}/common/ubiditransform.cpp"
"${ICU_SOURCE_DIR}/common/pluralmap.cpp"
"${ICU_SOURCE_DIR}/common/static_unicode_sets.cpp"
"${ICU_SOURCE_DIR}/common/restrace.cpp")
"${ICU_SOURCE_DIR}/common/restrace.cpp"
"${ICU_SOURCE_DIR}/common/emojiprops.cpp"
"${ICU_SOURCE_DIR}/common/lstmbe.cpp")
set(ICUI18N_SOURCES
"${ICU_SOURCE_DIR}/i18n/ucln_in.cpp"
@ -398,7 +400,6 @@ set(ICUI18N_SOURCES
"${ICU_SOURCE_DIR}/i18n/sharedbreakiterator.cpp"
"${ICU_SOURCE_DIR}/i18n/scientificnumberformatter.cpp"
"${ICU_SOURCE_DIR}/i18n/dayperiodrules.cpp"
"${ICU_SOURCE_DIR}/i18n/nounit.cpp"
"${ICU_SOURCE_DIR}/i18n/number_affixutils.cpp"
"${ICU_SOURCE_DIR}/i18n/number_compact.cpp"
"${ICU_SOURCE_DIR}/i18n/number_decimalquantity.cpp"
@ -446,12 +447,21 @@ set(ICUI18N_SOURCES
"${ICU_SOURCE_DIR}/i18n/formattedvalue.cpp"
"${ICU_SOURCE_DIR}/i18n/formattedval_iterimpl.cpp"
"${ICU_SOURCE_DIR}/i18n/formattedval_sbimpl.cpp"
"${ICU_SOURCE_DIR}/i18n/formatted_string_builder.cpp")
"${ICU_SOURCE_DIR}/i18n/formatted_string_builder.cpp"
"${ICU_SOURCE_DIR}/i18n/measunit_extra.cpp"
"${ICU_SOURCE_DIR}/i18n/number_symbolswrapper.cpp"
"${ICU_SOURCE_DIR}/i18n/number_usageprefs.cpp"
"${ICU_SOURCE_DIR}/i18n/numrange_capi.cpp"
"${ICU_SOURCE_DIR}/i18n/pluralranges.cpp"
"${ICU_SOURCE_DIR}/i18n/units_complexconverter.cpp"
"${ICU_SOURCE_DIR}/i18n/units_converter.cpp"
"${ICU_SOURCE_DIR}/i18n/units_data.cpp"
"${ICU_SOURCE_DIR}/i18n/units_router.cpp")
file(GENERATE OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/empty.cpp" CONTENT " ")
enable_language(ASM)
set(ICUDATA_SOURCES
"${ICUDATA_SOURCE_DIR}/icudt66l_dat.S"
"${ICUDATA_SOURCE_DIR}/icudt70l_dat.S"
"${CMAKE_CURRENT_BINARY_DIR}/empty.cpp" # Without this cmake can incorrectly detects library type (OBJECT) instead of SHARED/STATIC
)

2
contrib/icudata vendored

@ -1 +1 @@
Subproject commit f020820388e3faafb44cc643574a2d563dfde572
Subproject commit 72d9a4a7febc904e2b0a534ccb25ae40fac5f1e5

View File

@ -4,12 +4,21 @@
extern "C" {
#endif
#if !defined(__clang__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wredundant-decls"
#endif
#include <jemalloc/jemalloc_defs.h>
#include <jemalloc/jemalloc_rename.h>
#include <jemalloc/jemalloc_macros.h>
#include <jemalloc/jemalloc_protos.h>
#include <jemalloc/jemalloc_typedefs.h>
#if !defined(__clang__)
#pragma GCC diagnostic pop
#endif
#ifdef __cplusplus
}
#endif

View File

@ -1,4 +1,8 @@
if (APPLE OR NOT ARCH_AMD64 OR SANITIZE STREQUAL "undefined")
# During cross-compilation in our CI we have to use llvm-tblgen and other building tools
# tools to be build for host architecture and everything else for target architecture (e.g. AArch64)
# Possible workaround is to use llvm-tblgen from some package...
# But lets just enable LLVM for native builds
if (CMAKE_CROSSCOMPILING OR SANITIZE STREQUAL "undefined")
set (ENABLE_EMBEDDED_COMPILER_DEFAULT OFF)
else()
set (ENABLE_EMBEDDED_COMPILER_DEFAULT ON)
@ -22,9 +26,6 @@ set (LLVM_LIBRARY_DIRS "${ClickHouse_BINARY_DIR}/contrib/llvm/llvm")
set (REQUIRED_LLVM_LIBRARIES
LLVMExecutionEngine
LLVMRuntimeDyld
LLVMX86CodeGen
LLVMX86Desc
LLVMX86Info
LLVMAsmPrinter
LLVMDebugInfoDWARF
LLVMGlobalISel
@ -56,6 +57,12 @@ set (REQUIRED_LLVM_LIBRARIES
LLVMDemangle
)
if (ARCH_AMD64)
list(APPEND REQUIRED_LLVM_LIBRARIES LLVMX86Info LLVMX86Desc LLVMX86CodeGen)
elseif (ARCH_AARCH64)
list(APPEND REQUIRED_LLVM_LIBRARIES LLVMAArch64Info LLVMAArch64Desc LLVMAArch64CodeGen)
endif ()
#function(llvm_libs_all REQUIRED_LLVM_LIBRARIES)
# llvm_map_components_to_libnames (result all)
# if (USE_STATIC_LIBRARIES OR NOT "LLVM" IN_LIST result)

View File

@ -42,6 +42,9 @@ COPY prepare_hive_data.sh /
COPY demo_data.txt /
ENV PATH=/apache-hive-2.3.9-bin/bin:/hadoop-3.1.0/bin:/hadoop-3.1.0/sbin:$PATH
RUN service ssh start && sed s/HOSTNAME/$HOSTNAME/ /hadoop-3.1.0/etc/hadoop/core-site.xml.template > /hadoop-3.1.0/etc/hadoop/core-site.xml && hdfs namenode -format
RUN apt install -y python3 python3-pip
RUN pip3 install flask requests
COPY http_api_server.py /
COPY start.sh /

View File

@ -0,0 +1,70 @@
import os
import subprocess
import datetime
from flask import Flask, flash, request, redirect, url_for
def run_command(command, wait=False):
print("{} - execute shell command:{}".format(datetime.datetime.now(), command))
lines = []
p = subprocess.Popen(command,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
shell=True)
if wait:
for l in iter(p.stdout.readline, b''):
lines.append(l)
p.poll()
return (lines, p.returncode)
else:
return(iter(p.stdout.readline, b''), 0)
UPLOAD_FOLDER = './'
ALLOWED_EXTENSIONS = {'txt', 'sh'}
app = Flask(__name__)
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
@app.route('/')
def hello_world():
return 'Hello World'
def allowed_file(filename):
return '.' in filename and \
filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
@app.route('/upload', methods=['GET', 'POST'])
def upload_file():
if request.method == 'POST':
# check if the post request has the file part
if 'file' not in request.files:
flash('No file part')
return redirect(request.url)
file = request.files['file']
# If the user does not select a file, the browser submits an
# empty file without a filename.
if file.filename == '':
flash('No selected file')
return redirect(request.url)
if file and allowed_file(file.filename):
filename = file.filename
file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))
return redirect(url_for('upload_file', name=filename))
return '''
<!doctype html>
<title>Upload new File</title>
<h1>Upload new File</h1>
<form method=post enctype=multipart/form-data>
<input type=file name=file>
<input type=submit value=Upload>
</form>
'''
@app.route('/run', methods=['GET', 'POST'])
def parse_request():
data = request.data # data is empty
run_command(data, wait=True)
return 'Ok'
if __name__ == '__main__':
app.run(port=5011)

View File

@ -2,5 +2,9 @@
hive -e "create database test"
hive -e "create table test.demo(id string, score int) PARTITIONED BY(day string) ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'; create table test.demo_orc(id string, score int) PARTITIONED BY(day string) ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde' STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'; "
hive -e "create table test.parquet_demo(id string, score int) PARTITIONED BY(day string, hour string) ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'"
hive -e "create table test.demo_text(id string, score int, day string)row format delimited fields terminated by ','; load data local inpath '/demo_data.txt' into table test.demo_text "
hive -e "set hive.exec.dynamic.partition.mode=nonstrict;insert into test.demo partition(day) select * from test.demo_text; insert into test.demo_orc partition(day) select * from test.demo_text"
hive -e "set hive.exec.dynamic.partition.mode=nonstrict;insert into test.demo partition(day) select * from test.demo_text; insert into test.demo_orc partition(day) select * from test.demo_text"
hive -e "set hive.exec.dynamic.partition.mode=nonstrict;insert into test.parquet_demo partition(day, hour) select id, score, day, '00' as hour from test.demo;"
hive -e "set hive.exec.dynamic.partition.mode=nonstrict;insert into test.parquet_demo partition(day, hour) select id, score, day, '01' as hour from test.demo;"

View File

@ -1,6 +1,5 @@
service ssh start
sed s/HOSTNAME/$HOSTNAME/ /hadoop-3.1.0/etc/hadoop/core-site.xml.template > /hadoop-3.1.0/etc/hadoop/core-site.xml
hadoop namenode -format
start-all.sh
service mysql start
mysql -u root -e "CREATE USER \"test\"@\"localhost\" IDENTIFIED BY \"test\""
@ -9,4 +8,4 @@ schematool -initSchema -dbType mysql
#nohup hiveserver2 &
nohup hive --service metastore &
bash /prepare_hive_data.sh
while true; do sleep 1000; done
python3 http_api_server.py

View File

@ -13,6 +13,17 @@ COPY s3downloader /s3downloader
ENV S3_URL="https://clickhouse-datasets.s3.yandex.net"
ENV DATASETS="hits visits"
ENV EXPORT_S3_STORAGE_POLICIES=1
# Download Minio-related binaries
RUN arch=${TARGETARCH:-amd64} \
&& wget "https://dl.min.io/server/minio/release/linux-${arch}/minio" \
&& chmod +x ./minio \
&& wget "https://dl.min.io/client/mc/release/linux-${arch}/mc" \
&& chmod +x ./mc
ENV MINIO_ROOT_USER="clickhouse"
ENV MINIO_ROOT_PASSWORD="clickhouse"
COPY setup_minio.sh /
COPY run.sh /
CMD ["/bin/bash", "/run.sh"]

View File

@ -17,6 +17,8 @@ ln -s /usr/share/clickhouse-test/clickhouse-test /usr/bin/clickhouse-test
# install test configs
/usr/share/clickhouse-test/config/install.sh
./setup_minio.sh
function start()
{
if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then
@ -93,6 +95,8 @@ else
clickhouse-client --query "SHOW TABLES FROM test"
clickhouse-client --query "RENAME TABLE datasets.hits_v1 TO test.hits"
clickhouse-client --query "RENAME TABLE datasets.visits_v1 TO test.visits"
clickhouse-client --query "CREATE TABLE test.hits_s3 (WatchID UInt64, JavaEnable UInt8, Title String, GoodEvent Int16, EventTime DateTime, EventDate Date, CounterID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RegionID UInt32, UserID UInt64, CounterClass Int8, OS UInt8, UserAgent UInt8, URL String, Referer String, URLDomain String, RefererDomain String, Refresh UInt8, IsRobot UInt8, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), ResolutionWidth UInt16, ResolutionHeight UInt16, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, FlashMinor2 String, NetMajor UInt8, NetMinor UInt8, UserAgentMajor UInt16, UserAgentMinor FixedString(2), CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, MobilePhone UInt8, MobilePhoneModel String, Params String, IPNetworkID UInt32, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, IsArtifical UInt8, WindowClientWidth UInt16, WindowClientHeight UInt16, ClientTimeZone Int16, ClientEventTime DateTime, SilverlightVersion1 UInt8, SilverlightVersion2 UInt8, SilverlightVersion3 UInt32, SilverlightVersion4 UInt16, PageCharset String, CodeVersion UInt32, IsLink UInt8, IsDownload UInt8, IsNotBounce UInt8, FUniqID UInt64, HID UInt32, IsOldCounter UInt8, IsEvent UInt8, IsParameter UInt8, DontCountHits UInt8, WithHash UInt8, HitColor FixedString(1), UTCEventTime DateTime, Age UInt8, Sex UInt8, Income UInt8, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), RemoteIP UInt32, RemoteIP6 FixedString(16), WindowName Int32, OpenerName Int32, HistoryLength Int16, BrowserLanguage FixedString(2), BrowserCountry FixedString(2), SocialNetwork String, SocialAction String, HTTPError UInt16, SendTiming Int32, DNSTiming Int32, ConnectTiming Int32, ResponseStartTiming Int32, ResponseEndTiming Int32, FetchTiming Int32, RedirectTiming Int32, DOMInteractiveTiming Int32, DOMContentLoadedTiming Int32, DOMCompleteTiming Int32, LoadEventStartTiming Int32, LoadEventEndTiming Int32, NSToDOMContentLoadedTiming Int32, FirstPaintTiming Int32, RedirectCount Int8, SocialSourceNetworkID UInt8, SocialSourcePage String, ParamPrice Int64, ParamOrderID String, ParamCurrency FixedString(3), ParamCurrencyID UInt16, GoalsReached Array(UInt32), OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, RefererHash UInt64, URLHash UInt64, CLID UInt32, YCLID UInt64, ShareService String, ShareURL String, ShareTitle String, ParsedParams Nested(Key1 String, Key2 String, Key3 String, Key4 String, Key5 String, ValueDouble Float64), IslandID FixedString(16), RequestNum UInt32, RequestTry UInt8) ENGINE = MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate, intHash32(UserID)) SAMPLE BY intHash32(UserID) SETTINGS index_granularity = 8192, storage_policy='s3_cache'"
clickhouse-client --query "INSERT INTO test.hits_s3 SELECT * FROM test.hits"
fi
clickhouse-client --query "SHOW TABLES FROM test"

View File

@ -0,0 +1,66 @@
#!/bin/bash
# TODO: Make this file shared with stateless tests
#
# Usage for local run:
#
# ./docker/test/stateful/setup_minio.sh ./tests/
#
set -e -x -a -u
ls -lha
mkdir -p ./minio_data
if [ ! -f ./minio ]; then
echo 'MinIO binary not found, downloading...'
BINARY_TYPE=$(uname -s | tr '[:upper:]' '[:lower:]')
wget "https://dl.min.io/server/minio/release/${BINARY_TYPE}-amd64/minio" \
&& chmod +x ./minio \
&& wget "https://dl.min.io/client/mc/release/${BINARY_TYPE}-amd64/mc" \
&& chmod +x ./mc
fi
MINIO_ROOT_USER=${MINIO_ROOT_USER:-clickhouse}
MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD:-clickhouse}
./minio server --address ":11111" ./minio_data &
while ! curl -v --silent http://localhost:11111 2>&1 | grep AccessDenied
do
echo "Trying to connect to minio"
sleep 1
done
lsof -i :11111
sleep 5
./mc alias set clickminio http://localhost:11111 clickhouse clickhouse
./mc admin user add clickminio test testtest
./mc admin policy set clickminio readwrite user=test
./mc mb clickminio/test
# Upload data to Minio. By default after unpacking all tests will in
# /usr/share/clickhouse-test/queries
TEST_PATH=${1:-/usr/share/clickhouse-test}
MINIO_DATA_PATH=${TEST_PATH}/queries/1_stateful/data_minio
# Iterating over globs will cause redudant FILE variale to be a path to a file, not a filename
# shellcheck disable=SC2045
for FILE in $(ls "${MINIO_DATA_PATH}"); do
echo "$FILE";
./mc cp "${MINIO_DATA_PATH}"/"$FILE" clickminio/test/"$FILE";
done
mkdir -p ~/.aws
cat <<EOT >> ~/.aws/credentials
[default]
aws_access_key_id=clickhouse
aws_secret_access_key=clickhouse
EOT

View File

@ -60,6 +60,7 @@ RUN arch=${TARGETARCH:-amd64} \
ENV MINIO_ROOT_USER="clickhouse"
ENV MINIO_ROOT_PASSWORD="clickhouse"
ENV EXPORT_S3_STORAGE_POLICIES=1
COPY run.sh /
COPY setup_minio.sh /

View File

@ -29,5 +29,6 @@ COPY run.sh /
ENV DATASETS="hits visits"
ENV S3_URL="https://clickhouse-datasets.s3.yandex.net"
ENV EXPORT_S3_STORAGE_POLICIES=1
CMD ["/bin/bash", "/run.sh"]

View File

@ -173,6 +173,8 @@ quit
configure
./setup_minio.sh
start
# shellcheck disable=SC2086 # No quotes because I want to split it into words.
@ -188,6 +190,8 @@ clickhouse-client --query "SHOW TABLES FROM datasets"
clickhouse-client --query "SHOW TABLES FROM test"
clickhouse-client --query "RENAME TABLE datasets.hits_v1 TO test.hits"
clickhouse-client --query "RENAME TABLE datasets.visits_v1 TO test.visits"
clickhouse-client --query "CREATE TABLE test.hits_s3 (WatchID UInt64, JavaEnable UInt8, Title String, GoodEvent Int16, EventTime DateTime, EventDate Date, CounterID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RegionID UInt32, UserID UInt64, CounterClass Int8, OS UInt8, UserAgent UInt8, URL String, Referer String, URLDomain String, RefererDomain String, Refresh UInt8, IsRobot UInt8, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), ResolutionWidth UInt16, ResolutionHeight UInt16, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, FlashMinor2 String, NetMajor UInt8, NetMinor UInt8, UserAgentMajor UInt16, UserAgentMinor FixedString(2), CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, MobilePhone UInt8, MobilePhoneModel String, Params String, IPNetworkID UInt32, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, IsArtifical UInt8, WindowClientWidth UInt16, WindowClientHeight UInt16, ClientTimeZone Int16, ClientEventTime DateTime, SilverlightVersion1 UInt8, SilverlightVersion2 UInt8, SilverlightVersion3 UInt32, SilverlightVersion4 UInt16, PageCharset String, CodeVersion UInt32, IsLink UInt8, IsDownload UInt8, IsNotBounce UInt8, FUniqID UInt64, HID UInt32, IsOldCounter UInt8, IsEvent UInt8, IsParameter UInt8, DontCountHits UInt8, WithHash UInt8, HitColor FixedString(1), UTCEventTime DateTime, Age UInt8, Sex UInt8, Income UInt8, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), RemoteIP UInt32, RemoteIP6 FixedString(16), WindowName Int32, OpenerName Int32, HistoryLength Int16, BrowserLanguage FixedString(2), BrowserCountry FixedString(2), SocialNetwork String, SocialAction String, HTTPError UInt16, SendTiming Int32, DNSTiming Int32, ConnectTiming Int32, ResponseStartTiming Int32, ResponseEndTiming Int32, FetchTiming Int32, RedirectTiming Int32, DOMInteractiveTiming Int32, DOMContentLoadedTiming Int32, DOMCompleteTiming Int32, LoadEventStartTiming Int32, LoadEventEndTiming Int32, NSToDOMContentLoadedTiming Int32, FirstPaintTiming Int32, RedirectCount Int8, SocialSourceNetworkID UInt8, SocialSourcePage String, ParamPrice Int64, ParamOrderID String, ParamCurrency FixedString(3), ParamCurrencyID UInt16, GoalsReached Array(UInt32), OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, RefererHash UInt64, URLHash UInt64, CLID UInt32, YCLID UInt64, ShareService String, ShareURL String, ShareTitle String, ParsedParams Nested(Key1 String, Key2 String, Key3 String, Key4 String, Key5 String, ValueDouble Float64), IslandID FixedString(16), RequestNum UInt32, RequestTry UInt8) ENGINE = MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate, intHash32(UserID)) SAMPLE BY intHash32(UserID) SETTINGS index_granularity = 8192, storage_policy='s3_cache'"
clickhouse-client --query "INSERT INTO test.hits_s3 SELECT * FROM test.hits"
clickhouse-client --query "SHOW TABLES FROM test"
./stress --hung-check --drop-databases --output-folder test_output --skip-func-tests "$SKIP_TESTS_OPTION" \

View File

@ -38,7 +38,7 @@ Writing the docs is extremely useful for project's users and developers, and gro
The documentation contains information about all the aspects of the ClickHouse lifecycle: developing, testing, installing, operating, and using. The base language of the documentation is English. The English version is the most actual. All other languages are supported as much as they can by contributors from different countries.
At the moment, [documentation](https://clickhouse.com/docs) exists in English, Russian, Chinese, Japanese, and Farsi. We store the documentation besides the ClickHouse source code in the [GitHub repository](https://github.com/ClickHouse/ClickHouse/tree/master/docs).
At the moment, [documentation](https://clickhouse.com/docs) exists in English, Russian, Chinese, Japanese. We store the documentation besides the ClickHouse source code in the [GitHub repository](https://github.com/ClickHouse/ClickHouse/tree/master/docs).
Each language lays in the corresponding folder. Files that are not translated from English are the symbolic links to the English ones.

View File

@ -1,11 +1,11 @@
sudo apt-get install apt-transport-https ca-certificates dirmngr
sudo apt-get install -y apt-transport-https ca-certificates dirmngr
sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 8919F6BD2B48D754
echo "deb https://packages.clickhouse.com/deb stable main/" | sudo tee \
echo "deb https://packages.clickhouse.com/deb stable main" | sudo tee \
/etc/apt/sources.list.d/clickhouse.list
sudo apt-get update
sudo apt-get install -y clickhouse-server clickhouse-client
sudo service clickhouse-server start
clickhouse-client # or "clickhouse-client --password" if you set up a password.
clickhouse-client # or "clickhouse-client --password" if you've set up a password.

View File

@ -190,15 +190,3 @@ Runs randomly generated queries to catch program errors. If it fails, ask a main
## Performance Tests
Measure changes in query performance. This is the longest check that takes just below 6 hours to run. The performance test report is described in detail [here](https://github.com/ClickHouse/ClickHouse/tree/master/docker/test/performance-comparison#how-to-read-the-report).
# QA
> What is a `Task (private network)` item on status pages?
It's a link to the Yandex's internal job system. Yandex employees can see the check's start time and its more verbose status.
> Where the tests are run
Somewhere on Yandex internal infrastructure.

View File

@ -40,8 +40,8 @@ The list of third-party libraries:
| grpc | [Apache](https://github.com/ClickHouse-Extras/grpc/blob/60c986e15cae70aade721d26badabab1f822fdd6/LICENSE) |
| h3 | [Apache](https://github.com/ClickHouse-Extras/h3/blob/c7f46cfd71fb60e2fefc90e28abe81657deff735/LICENSE) |
| hyperscan | [Boost](https://github.com/ClickHouse-Extras/hyperscan/blob/e9f08df0213fc637aac0a5bbde9beeaeba2fe9fa/LICENSE) |
| icu | [Public Domain](https://github.com/unicode-org/icu/blob/faa2f9f9e1fe74c5ed00eba371d2830134cdbea1/icu4c/LICENSE) |
| icudata | [Public Domain](https://github.com/ClickHouse-Extras/icudata/blob/f020820388e3faafb44cc643574a2d563dfde572/LICENSE) |
| icu | [Public Domain](https://github.com/unicode-org/icu/blob/a56dde820dc35665a66f2e9ee8ba58e75049b668/icu4c/LICENSE) |
| icudata | [Public Domain](https://github.com/ClickHouse-Extras/icudata/blob/72d9a4a7febc904e2b0a534ccb25ae40fac5f1e5/LICENSE) |
| jemalloc | [BSD 2-clause](https://github.com/ClickHouse-Extras/jemalloc/blob/e6891d9746143bf2cf617493d880ba5a0b9a3efd/COPYING) |
| krb5 | [MIT](https://github.com/ClickHouse-Extras/krb5/blob/5149dea4e2be0f67707383d2682b897c14631374/src/lib/gssapi/LICENSE) |
| libc-headers | [LGPL](https://github.com/ClickHouse-Extras/libc-headers/blob/a720b7105a610acbd7427eea475a5b6810c151eb/LICENSE) |

View File

@ -243,7 +243,7 @@ List of tasks: https://github.com/ClickHouse/ClickHouse/issues?q=is%3Aopen+is%3A
## Test Data {#test-data}
Developing ClickHouse often requires loading realistic datasets. It is particularly important for performance testing. We have a specially prepared set of anonymized data from Yandex.Metrica. It requires additionally some 3GB of free disk space. Note that this data is not required to accomplish most of the development tasks.
Developing ClickHouse often requires loading realistic datasets. It is particularly important for performance testing. We have a specially prepared set of anonymized data of web analytics. It requires additionally some 3GB of free disk space. Note that this data is not required to accomplish most of the development tasks.
sudo apt install wget xz-utils
@ -270,7 +270,7 @@ Navigate to your fork repository in GitHubs UI. If you have been developing i
A pull request can be created even if the work is not completed yet. In this case please put the word “WIP” (work in progress) at the beginning of the title, it can be changed later. This is useful for cooperative reviewing and discussion of changes as well as for running all of the available tests. It is important that you provide a brief description of your changes, it will later be used for generating release changelogs.
Testing will commence as soon as Yandex employees label your PR with a tag “can be tested”. The results of some first checks (e.g. code style) will come in within several minutes. Build check results will arrive within half an hour. And the main set of tests will report itself within an hour.
Testing will commence as soon as ClickHouse employees label your PR with a tag “can be tested”. The results of some first checks (e.g. code style) will come in within several minutes. Build check results will arrive within half an hour. And the main set of tests will report itself within an hour.
The system will prepare ClickHouse binary builds for your pull request individually. To retrieve these builds click the “Details” link next to “ClickHouse build check” entry in the list of checks. There you will find direct links to the built .deb packages of ClickHouse which you can deploy even on your production servers (if you have no fear).

View File

@ -404,9 +404,9 @@ enum class CompressionMethod
};
```
**15.** All names must be in English. Transliteration of Russian words is not allowed.
**15.** All names must be in English. Transliteration of Hebrew words is not allowed.
not Stroka
not T_PAAMAYIM_NEKUDOTAYIM
**16.** Abbreviations are acceptable if they are well known (when you can easily find the meaning of the abbreviation in Wikipedia or in a search engine).

View File

@ -11,7 +11,7 @@ Functional tests are the most simple and convenient to use. Most of ClickHouse f
Each functional test sends one or multiple queries to the running ClickHouse server and compares the result with reference.
Tests are located in `queries` directory. There are two subdirectories: `stateless` and `stateful`. Stateless tests run queries without any preloaded test data - they often create small synthetic datasets on the fly, within the test itself. Stateful tests require preloaded test data from Yandex.Metrica and it is available to general public.
Tests are located in `queries` directory. There are two subdirectories: `stateless` and `stateful`. Stateless tests run queries without any preloaded test data - they often create small synthetic datasets on the fly, within the test itself. Stateful tests require preloaded test data from CLickHouse and it is available to general public.
Each test can be one of two types: `.sql` and `.sh`. `.sql` test is the simple SQL script that is piped to `clickhouse-client --multiquery --testmode`. `.sh` test is a script that is run by itself. SQL tests are generally preferable to `.sh` tests. You should use `.sh` tests only when you have to test some feature that cannot be exercised from pure SQL, such as piping some input data into `clickhouse-client` or testing `clickhouse-local`.
@ -133,44 +133,6 @@ If the system clickhouse-server is already running and you do not want to stop i
`clickhouse` binary has almost no dependencies and works across wide range of Linux distributions. To quick and dirty test your changes on a server, you can simply `scp` your fresh built `clickhouse` binary to your server and then run it as in examples above.
## Testing Environment {#testing-environment}
Before publishing release as stable we deploy it on testing environment. Testing environment is a cluster that process 1/39 part of [Yandex.Metrica](https://metrica.yandex.com/) data. We share our testing environment with Yandex.Metrica team. ClickHouse is upgraded without downtime on top of existing data. We look at first that data is processed successfully without lagging from realtime, the replication continue to work and there is no issues visible to Yandex.Metrica team. First check can be done in the following way:
``` sql
SELECT hostName() AS h, any(version()), any(uptime()), max(UTCEventTime), count() FROM remote('example01-01-{1..3}t', merge, hits) WHERE EventDate >= today() - 2 GROUP BY h ORDER BY h;
```
In some cases we also deploy to testing environment of our friend teams in Yandex: Market, Cloud, etc. Also we have some hardware servers that are used for development purposes.
## Load Testing {#load-testing}
After deploying to testing environment we run load testing with queries from production cluster. This is done manually.
Make sure you have enabled `query_log` on your production cluster.
Collect query log for a day or more:
``` bash
$ clickhouse-client --query="SELECT DISTINCT query FROM system.query_log WHERE event_date = today() AND query LIKE '%ym:%' AND query NOT LIKE '%system.query_log%' AND type = 2 AND is_initial_query" > queries.tsv
```
This is a way complicated example. `type = 2` will filter queries that are executed successfully. `query LIKE '%ym:%'` is to select relevant queries from Yandex.Metrica. `is_initial_query` is to select only queries that are initiated by client, not by ClickHouse itself (as parts of distributed query processing).
`scp` this log to your testing cluster and run it as following:
``` bash
$ clickhouse benchmark --concurrency 16 < queries.tsv
```
(probably you also want to specify a `--user`)
Then leave it for a night or weekend and go take a rest.
You should check that `clickhouse-server` does not crash, memory footprint is bounded and performance not degrading over time.
Precise query execution timings are not recorded and not compared due to high variability of queries and environment.
## Build Tests {#build-tests}
Build tests allow to check that build is not broken on various alternative configurations and on some foreign systems. These tests are automated as well.
@ -259,13 +221,13 @@ Thread Fuzzer (please don't mix up with Thread Sanitizer) is another kind of fuz
## Security Audit
People from Yandex Security Team did some basic overview of ClickHouse capabilities from the security standpoint.
Our Security Team did some basic overview of ClickHouse capabilities from the security standpoint.
## Static Analyzers {#static-analyzers}
We run `clang-tidy` on per-commit basis. `clang-static-analyzer` checks are also enabled. `clang-tidy` is also used for some style checks.
We have evaluated `clang-tidy`, `Coverity`, `cppcheck`, `PVS-Studio`, `tscancode`, `CodeQL`. You will find instructions for usage in `tests/instructions/` directory. Also you can read [the article in russian](https://habr.com/company/yandex/blog/342018/).
We have evaluated `clang-tidy`, `Coverity`, `cppcheck`, `PVS-Studio`, `tscancode`, `CodeQL`. You will find instructions for usage in `tests/instructions/` directory.
If you use `CLion` as an IDE, you can leverage some `clang-tidy` checks out of the box.
@ -310,12 +272,6 @@ Alternatively you can try `uncrustify` tool to reformat your code. Configuration
We also use `codespell` to find typos in code. It is automated as well.
## Metrica B2B Tests {#metrica-b2b-tests}
Each ClickHouse release is tested with Yandex Metrica and AppMetrica engines. Testing and stable versions of ClickHouse are deployed on VMs and run with a small copy of Metrica engine that is processing fixed sample of input data. Then results of two instances of Metrica engine are compared together.
These tests are automated by separate team. Due to high number of moving parts, tests are fail most of the time by completely unrelated reasons, that are very difficult to figure out. Most likely these tests have negative value for us. Nevertheless these tests was proved to be useful in about one or two times out of hundreds.
## Test Coverage {#test-coverage}
We also track test coverage but only for functional tests and only for clickhouse-server. It is performed on daily basis.

View File

@ -76,7 +76,7 @@ When working with the `MaterializedMySQL` database engine, [ReplacingMergeTree](
| FLOAT | [Float32](../../sql-reference/data-types/float.md) |
| DOUBLE | [Float64](../../sql-reference/data-types/float.md) |
| DECIMAL, NEWDECIMAL | [Decimal](../../sql-reference/data-types/decimal.md) |
| DATE, NEWDATE | [Date](../../sql-reference/data-types/date.md) |
| DATE, NEWDATE | [Date32](../../sql-reference/data-types/date32.md) |
| DATETIME, TIMESTAMP | [DateTime](../../sql-reference/data-types/datetime.md) |
| DATETIME2, TIMESTAMP2 | [DateTime64](../../sql-reference/data-types/datetime64.md) |
| YEAR | [UInt16](../../sql-reference/data-types/int-uint.md) |

View File

@ -49,6 +49,8 @@ ENGINE = MySQL('host:port', ['database' | database], 'user', 'password')
All other MySQL data types are converted into [String](../../sql-reference/data-types/string.md).
Because of the ClickHouse date type has a different range from the MySQL date range,If the MySQL date type is out of the range of ClickHouse date, you can use the setting mysql_datatypes_support_level to modify the mapping from the MySQL date type to the Clickhouse date type: date2Date32 (convert MySQL's date type to ClickHouse Date32) or date2String(convert MySQL's date type to ClickHouse String,this is usually used when your mysql data is less than 1925) or default(convert MySQL's date type to ClickHouse Date).
[Nullable](../../sql-reference/data-types/nullable.md) is supported.
## Global Variables Support {#global-variables-support}

View File

@ -26,7 +26,7 @@ CREATE TABLE s3_engine_table (name String, value UInt32)
``` sql
CREATE TABLE s3_engine_table (name String, value UInt32)
ENGINE=S3('https://storage.yandexcloud.net/my-test-bucket-768/test-data.csv.gz', 'CSV', 'gzip')
ENGINE=S3('https://clickhouse-public-datasets.s3.amazonaws.com/my-test-bucket-768/test-data.csv.gz', 'CSV', 'gzip')
SETTINGS input_format_with_names_use_header = 0;
INSERT INTO s3_engine_table VALUES ('one', 1), ('two', 2), ('three', 3);
@ -75,19 +75,19 @@ Create table with files named `file-000.csv`, `file-001.csv`, … , `file-999.cs
``` sql
CREATE TABLE big_table (name String, value UInt32)
ENGINE = S3('https://storage.yandexcloud.net/my-bucket/my_folder/file-{000..999}.csv', 'CSV');
ENGINE = S3('https://clickhouse-public-datasets.s3.amazonaws.com/my-bucket/my_folder/file-{000..999}.csv', 'CSV');
```
**Example with wildcards 2**
Suppose we have several files in CSV format with the following URIs on S3:
- 'https://storage.yandexcloud.net/my-bucket/some_folder/some_file_1.csv'
- 'https://storage.yandexcloud.net/my-bucket/some_folder/some_file_2.csv'
- 'https://storage.yandexcloud.net/my-bucket/some_folder/some_file_3.csv'
- 'https://storage.yandexcloud.net/my-bucket/another_folder/some_file_1.csv'
- 'https://storage.yandexcloud.net/my-bucket/another_folder/some_file_2.csv'
- 'https://storage.yandexcloud.net/my-bucket/another_folder/some_file_3.csv'
- 'https://clickhouse-public-datasets.s3.amazonaws.com/my-bucket/some_folder/some_file_1.csv'
- 'https://clickhouse-public-datasets.s3.amazonaws.com/my-bucket/some_folder/some_file_2.csv'
- 'https://clickhouse-public-datasets.s3.amazonaws.com/my-bucket/some_folder/some_file_3.csv'
- 'https://clickhouse-public-datasets.s3.amazonaws.com/my-bucket/another_folder/some_file_1.csv'
- 'https://clickhouse-public-datasets.s3.amazonaws.com/my-bucket/another_folder/some_file_2.csv'
- 'https://clickhouse-public-datasets.s3.amazonaws.com/my-bucket/another_folder/some_file_3.csv'
There are several ways to make a table consisting of all six files:
@ -96,21 +96,21 @@ There are several ways to make a table consisting of all six files:
``` sql
CREATE TABLE table_with_range (name String, value UInt32)
ENGINE = S3('https://storage.yandexcloud.net/my-bucket/{some,another}_folder/some_file_{1..3}', 'CSV');
ENGINE = S3('https://clickhouse-public-datasets.s3.amazonaws.com/my-bucket/{some,another}_folder/some_file_{1..3}', 'CSV');
```
2. Take all files with `some_file_` prefix (there should be no extra files with such prefix in both folders):
``` sql
CREATE TABLE table_with_question_mark (name String, value UInt32)
ENGINE = S3('https://storage.yandexcloud.net/my-bucket/{some,another}_folder/some_file_?', 'CSV');
ENGINE = S3('https://clickhouse-public-datasets.s3.amazonaws.com/my-bucket/{some,another}_folder/some_file_?', 'CSV');
```
3. Take all the files in both folders (all files should satisfy format and schema described in query):
``` sql
CREATE TABLE table_with_asterisk (name String, value UInt32)
ENGINE = S3('https://storage.yandexcloud.net/my-bucket/{some,another}_folder/*', 'CSV');
ENGINE = S3('https://clickhouse-public-datasets.s3.amazonaws.com/my-bucket/{some,another}_folder/*', 'CSV');
```
## S3-related Settings {#settings}
@ -142,7 +142,7 @@ The following settings can be specified in configuration file for given endpoint
``` xml
<s3>
<endpoint-name>
<endpoint>https://storage.yandexcloud.net/my-test-bucket-768/</endpoint>
<endpoint>https://clickhouse-public-datasets.s3.amazonaws.com/my-test-bucket-768/</endpoint>
<!-- <access_key_id>ACCESS_KEY_ID</access_key_id> -->
<!-- <secret_access_key>SECRET_ACCESS_KEY</secret_access_key> -->
<!-- <region>us-west-1</region> -->

View File

@ -55,27 +55,28 @@ WHERE table = 'visits'
```
``` text
┌─partition─┬─name───────────┬─active─┐
│ 201901 │ 201901_1_3_1 │ 0 │
│ 201901 │ 201901_1_9_2 │ 1 │
│ 201901 │ 201901_8_8_0 │ 0 │
│ 201901 │ 201901_9_9_0 │ 0 │
│ 201902 │ 201902_4_6_1 │ 1 │
│ 201902 │ 201902_10_10_0 │ 1 │
│ 201902 │ 201902_11_11_0 │ 1 │
└───────────┴────────────────┴────────┘
┌─partition─┬─name──────────────┬─active─┐
│ 201901 │ 201901_1_3_1 │ 0 │
│ 201901 │ 201901_1_9_2_11 │ 1 │
│ 201901 │ 201901_8_8_0 │ 0 │
│ 201901 │ 201901_9_9_0 │ 0 │
│ 201902 │ 201902_4_6_1_11 │ 1 │
│ 201902 │ 201902_10_10_0_11 │ 1 │
│ 201902 │ 201902_11_11_0_11 │ 1 │
└───────────┴───────────────────┴────────┘
```
The `partition` column contains the names of the partitions. There are two partitions in this example: `201901` and `201902`. You can use this column value to specify the partition name in [ALTER … PARTITION](../../../sql-reference/statements/alter/partition.md) queries.
The `name` column contains the names of the partition data parts. You can use this column to specify the name of the part in the [ALTER ATTACH PART](../../../sql-reference/statements/alter/partition.md#alter_attach-partition) query.
Lets break down the name of the first part: `201901_1_3_1`:
Lets break down the name of the part: `201901_1_9_2_11`:
- `201901` is the partition name.
- `1` is the minimum number of the data block.
- `3` is the maximum number of the data block.
- `1` is the chunk level (the depth of the merge tree it is formed from).
- `9` is the maximum number of the data block.
- `2` is the chunk level (the depth of the merge tree it is formed from).
- `11` is the mutation version (if a part mutated)
!!! info "Info"
The parts of old-type tables have the name: `20190117_20190123_2_2_0` (minimum date - maximum date - minimum block number - maximum block number - level).
@ -89,16 +90,16 @@ OPTIMIZE TABLE visits PARTITION 201902;
```
``` text
┌─partition─┬─name───────────┬─active─┐
│ 201901 │ 201901_1_3_1 │ 0 │
│ 201901 │ 201901_1_9_2 │ 1 │
│ 201901 │ 201901_8_8_0 │ 0 │
│ 201901 │ 201901_9_9_0 │ 0 │
│ 201902 │ 201902_4_6_1 │ 0 │
│ 201902 │ 201902_4_11_2 │ 1 │
│ 201902 │ 201902_10_10_0 │ 0 │
│ 201902 │ 201902_11_11_0 │ 0 │
└───────────┴────────────────┴────────┘
┌─partition─┬─name─────────────┬─active─┐
│ 201901 │ 201901_1_3_1 │ 0 │
│ 201901 │ 201901_1_9_2_11 │ 1 │
│ 201901 │ 201901_8_8_0 │ 0 │
│ 201901 │ 201901_9_9_0 │ 0 │
│ 201902 │ 201902_4_6_1 │ 0 │
│ 201902 │ 201902_4_11_2_11 │ 1 │
│ 201902 │ 201902_10_10_0 │ 0 │
│ 201902 │ 201902_11_11_0 │ 0 │
└───────────┴──────────────────┴────────┘
```
Inactive parts will be deleted approximately 10 minutes after merging.
@ -109,12 +110,12 @@ Another way to view a set of parts and partitions is to go into the directory of
/var/lib/clickhouse/data/default/visits$ ls -l
total 40
drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 1 16:48 201901_1_3_1
drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 5 16:17 201901_1_9_2
drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 5 16:17 201901_1_9_2_11
drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 5 15:52 201901_8_8_0
drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 5 15:52 201901_9_9_0
drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 5 16:17 201902_10_10_0
drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 5 16:17 201902_11_11_0
drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 5 16:19 201902_4_11_2
drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 5 16:19 201902_4_11_2_11
drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 5 12:09 201902_4_6_1
drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 1 16:48 detached
```

View File

@ -802,7 +802,7 @@ Configuration markup:
<disks>
<s3>
<type>s3</type>
<endpoint>https://storage.yandexcloud.net/my-bucket/root-path/</endpoint>
<endpoint>https://clickhouse-public-datasets.s3.amazonaws.com/my-bucket/root-path/</endpoint>
<access_key_id>your_access_key_id</access_key_id>
<secret_access_key>your_secret_access_key</secret_access_key>
<region></region>
@ -856,7 +856,7 @@ S3 disk can be configured as `main` or `cold` storage:
<disks>
<s3>
<type>s3</type>
<endpoint>https://storage.yandexcloud.net/my-bucket/root-path/</endpoint>
<endpoint>https://clickhouse-public-datasets.s3.amazonaws.com/my-bucket/root-path/</endpoint>
<access_key_id>your_access_key_id</access_key_id>
<secret_access_key>your_secret_access_key</secret_access_key>
</s3>

View File

@ -97,7 +97,7 @@ ZooKeeper is not used in `SELECT` queries because replication does not affect th
For each `INSERT` query, approximately ten entries are added to ZooKeeper through several transactions. (To be more precise, this is for each inserted block of data; an INSERT query contains one block or one block per `max_insert_block_size = 1048576` rows.) This leads to slightly longer latencies for `INSERT` compared to non-replicated tables. But if you follow the recommendations to insert data in batches of no more than one `INSERT` per second, it does not create any problems. The entire ClickHouse cluster used for coordinating one ZooKeeper cluster has a total of several hundred `INSERTs` per second. The throughput on data inserts (the number of rows per second) is just as high as for non-replicated data.
For very large clusters, you can use different ZooKeeper clusters for different shards. However, this hasnt proven necessary on the Yandex.Metrica cluster (approximately 300 servers).
For very large clusters, you can use different ZooKeeper clusters for different shards. However, from our experience this has not proven necessary based on production clusters with approximately 300 servers.
Replication is asynchronous and multi-master. `INSERT` queries (as well as `ALTER`) can be sent to any available server. Data is inserted on the server where the query is run, and then it is copied to the other servers. Because it is asynchronous, recently inserted data appears on the other replicas with some latency. If part of the replicas are not available, the data is written when they become available. If a replica is available, the latency is the amount of time it takes to transfer the block of compressed data over the network. The number of threads performing background tasks for replicated tables can be set by [background_schedule_pool_size](../../../operations/settings/settings.md#background_schedule_pool_size) setting.
@ -111,7 +111,7 @@ Data blocks are deduplicated. For multiple writes of the same data block (data b
During replication, only the source data to insert is transferred over the network. Further data transformation (merging) is coordinated and performed on all the replicas in the same way. This minimizes network usage, which means that replication works well when replicas reside in different datacenters. (Note that duplicating data in different datacenters is the main goal of replication.)
You can have any number of replicas of the same data. Yandex.Metrica uses double replication in production. Each server uses RAID-5 or RAID-6, and RAID-10 in some cases. This is a relatively reliable and convenient solution.
You can have any number of replicas of the same data. Based on our experiences, a relatively reliable and convenient solution could use double replication in production, with each server using RAID-5 or RAID-6 (and RAID-10 in some cases).
The system monitors data synchronicity on replicas and is able to recover after a failure. Failover is automatic (for small differences in data) or semi-automatic (when data differs too much, which may indicate a configuration error).
@ -163,7 +163,7 @@ Example:
<macros>
<layer>05</layer>
<shard>02</shard>
<replica>example05-02-1.yandex.ru</replica>
<replica>example05-02-1</replica>
</macros>
```
@ -172,7 +172,7 @@ In this case, the path consists of the following parts:
`/clickhouse/tables/` is the common prefix. We recommend using exactly this one.
`{layer}-{shard}` is the shard identifier. In this example it consists of two parts, since the Yandex.Metrica cluster uses bi-level sharding. For most tasks, you can leave just the {shard} substitution, which will be expanded to the shard identifier.
`{layer}-{shard}` is the shard identifier. In this example it consists of two parts, since the example cluster uses bi-level sharding. For most tasks, you can leave just the {shard} substitution, which will be expanded to the shard identifier.
`table_name` is the name of the node for the table in ZooKeeper. It is a good idea to make it the same as the table name. It is defined explicitly, because in contrast to the table name, it does not change after a RENAME query.
*HINT*: you could add a database name in front of `table_name` as well. E.g. `db_name.table_name`

View File

@ -197,7 +197,7 @@ A simple remainder from the division is a limited solution for sharding and isn
You should be concerned about the sharding scheme in the following cases:
- Queries are used that require joining data (`IN` or `JOIN`) by a specific key. If data is sharded by this key, you can use local `IN` or `JOIN` instead of `GLOBAL IN` or `GLOBAL JOIN`, which is much more efficient.
- A large number of servers is used (hundreds or more) with a large number of small queries, for example, queries for data of individual clients (e.g. websites, advertisers, or partners). In order for the small queries to not affect the entire cluster, it makes sense to locate data for a single client on a single shard. Alternatively, as weve done in Yandex.Metrica, you can set up bi-level sharding: divide the entire cluster into “layers”, where a layer may consist of multiple shards. Data for a single client is located on a single layer, but shards can be added to a layer as necessary, and data is randomly distributed within them. `Distributed` tables are created for each layer, and a single shared distributed table is created for global queries.
- A large number of servers is used (hundreds or more) with a large number of small queries, for example, queries for data of individual clients (e.g. websites, advertisers, or partners). In order for the small queries to not affect the entire cluster, it makes sense to locate data for a single client on a single shard. Alternatively, you can set up bi-level sharding: divide the entire cluster into “layers”, where a layer may consist of multiple shards. Data for a single client is located on a single layer, but shards can be added to a layer as necessary, and data is randomly distributed within them. `Distributed` tables are created for each layer, and a single shared distributed table is created for global queries.
Data is written asynchronously. When inserted in the table, the data block is just written to the local file system. The data is sent to the remote servers in the background as soon as possible. The periodicity for sending data is managed by the [distributed_directory_monitor_sleep_time_ms](../../../operations/settings/settings.md#distributed_directory_monitor_sleep_time_ms) and [distributed_directory_monitor_max_sleep_time_ms](../../../operations/settings/settings.md#distributed_directory_monitor_max_sleep_time_ms) settings. The `Distributed` engine sends each file with inserted data separately, but you can enable batch sending of files with the [distributed_directory_monitor_batch_inserts](../../../operations/settings/settings.md#distributed_directory_monitor_batch_inserts) setting. This setting improves cluster performance by better utilizing local server and network resources. You should check whether data is sent successfully by checking the list of files (data waiting to be sent) in the table directory: `/var/lib/clickhouse/data/database/table/`. The number of threads performing background tasks can be set by [background_distributed_schedule_pool_size](../../../operations/settings/settings.md#background_distributed_schedule_pool_size) setting.

View File

@ -6,7 +6,7 @@ toc_priority: 110
# Why Not Use Something Like MapReduce? {#why-not-use-something-like-mapreduce}
We can refer to systems like MapReduce as distributed computing systems in which the reduce operation is based on distributed sorting. The most common open-source solution in this class is [Apache Hadoop](http://hadoop.apache.org). Yandex uses its in-house solution, YT.
We can refer to systems like MapReduce as distributed computing systems in which the reduce operation is based on distributed sorting. The most common open-source solution in this class is [Apache Hadoop](http://hadoop.apache.org). Large IT companies often have proprietary in-house solutions.
These systems arent appropriate for online queries due to their high latency. In other words, they cant be used as the back-end for a web interface. These types of systems arent useful for real-time data updates. Distributed sorting isnt the best way to perform reduce operations if the result of the operation and all the intermediate results (if there are any) are located in the RAM of a single server, which is usually the case for online queries. In such a case, a hash table is an optimal way to perform reduce operations. A common approach to optimizing map-reduce tasks is pre-aggregation (partial reduce) using a hash table in RAM. The user performs this optimization manually. Distributed sorting is one of the main causes of reduced performance when running simple map-reduce tasks.

View File

@ -9,7 +9,7 @@ toc_priority: 11
This question usually arises when people see official ClickHouse t-shirts. They have large words **“ClickHouse не тормозит”** on the front.
Before ClickHouse became open-source, it has been developed as an in-house storage system by the largest Russian IT company, [Yandex](https://yandex.com/company/). Thats why it initially got its slogan in Russian, which is “не тормозит” (pronounced as “ne tormozit”). After the open-source release we first produced some of those t-shirts for events in Russia and it was a no-brainer to use the slogan as-is.
Before ClickHouse became open-source, it has been developed as an in-house storage system by the largest Russian IT company, Yandex. Thats why it initially got its slogan in Russian, which is “не тормозит” (pronounced as “ne tormozit”). After the open-source release we first produced some of those t-shirts for events in Russia and it was a no-brainer to use the slogan as-is.
One of the following batches of those t-shirts was supposed to be given away on events outside of Russia and we tried to make the English version of the slogan. Unfortunately, the Russian language is kind of elegant in terms of expressing stuff and there was a restriction of limited space on a t-shirt, so we failed to come up with good enough translation (most options appeared to be either long or inaccurate) and decided to keep the slogan in Russian even on t-shirts produced for international events. It appeared to be a great decision because people all over the world get positively surprised and curious when they see it.

View File

@ -11,7 +11,7 @@ This section describes how to obtain example datasets and import them into Click
The list of documented datasets:
- [GitHub Events](../../getting-started/example-datasets/github-events.md)
- [Anonymized Yandex.Metrica Dataset](../../getting-started/example-datasets/metrica.md)
- [Anonymized Web Analytics Dataset](../../getting-started/example-datasets/metrica.md)
- [Recipes](../../getting-started/example-datasets/recipes.md)
- [Star Schema Benchmark](../../getting-started/example-datasets/star-schema.md)
- [WikiStat](../../getting-started/example-datasets/wikistat.md)

View File

@ -1,11 +1,11 @@
---
toc_priority: 15
toc_title: Yandex.Metrica Data
toc_title: Web Analytics Data
---
# Anonymized Yandex.Metrica Data {#anonymized-yandex-metrica-data}
# Anonymized Web Analytics Data {#anonymized-web-analytics-data}
Dataset consists of two tables containing anonymized data about hits (`hits_v1`) and visits (`visits_v1`) of Yandex.Metrica. You can read more about Yandex.Metrica in [ClickHouse history](../../introduction/history.md) section.
Dataset consists of two tables containing anonymized web analytics data with hits (`hits_v1`) and visits (`visits_v1`).
The dataset consists of two tables, either of them can be downloaded as a compressed `tsv.xz` file or as prepared partitions. In addition to that, an extended version of the `hits` table containing 100 million rows is available as TSV at https://datasets.clickhouse.com/hits/tsv/hits_100m_obfuscated_v1.tsv.xz and as prepared partitions at https://datasets.clickhouse.com/hits/partitions/hits_100m_obfuscated_v1.tar.xz.
@ -73,6 +73,6 @@ clickhouse-client --query "SELECT COUNT(*) FROM datasets.visits_v1"
## Example Queries {#example-queries}
[ClickHouse tutorial](../../getting-started/tutorial.md) is based on Yandex.Metrica dataset and the recommended way to get started with this dataset is to just go through tutorial.
[The ClickHouse tutorial](../../getting-started/tutorial.md) is based on this web analytics dataset, and the recommended way to get started with this dataset is to go through the tutorial.
Additional examples of queries to these tables can be found among [stateful tests](https://github.com/ClickHouse/ClickHouse/tree/master/tests/queries/1_stateful) of ClickHouse (they are named `test.hits` and `test.visits` there).

View File

@ -375,7 +375,7 @@ Q3: 0.051 sec.
Q4: 0.072 sec.
In this case, the query processing time is determined above all by network latency.
We ran queries using a client located in a Yandex datacenter in Finland on a cluster in Russia, which added about 20 ms of latency.
We ran queries using a client located in a different datacenter than where the cluster was located, which added about 20 ms of latency.
## Summary {#summary}

View File

@ -5,11 +5,12 @@ toc_title: Playground
# ClickHouse Playground {#clickhouse-playground}
!!! warning "Warning"
This service is deprecated and will be replaced in foreseeable future.
[ClickHouse Playground](https://play.clickhouse.com) allows people to experiment with ClickHouse by running queries instantly, without setting up their server or cluster.
Several example datasets are available in Playground as well as sample queries that show ClickHouse features. Theres also a selection of ClickHouse LTS releases to experiment with.
ClickHouse Playground gives the experience of m2.small [Managed Service for ClickHouse](https://cloud.yandex.com/services/managed-clickhouse) instance (4 vCPU, 32 GB RAM) hosted in [Yandex.Cloud](https://cloud.yandex.com/). More information about [cloud providers](../commercial/cloud.md).
You can make queries to Playground using any HTTP client, for example [curl](https://curl.haxx.se) or [wget](https://www.gnu.org/software/wget/), or set up a connection using [JDBC](../interfaces/jdbc.md) or [ODBC](../interfaces/odbc.md) drivers. More information about software products that support ClickHouse is available [here](../interfaces/index.md).
## Credentials {#credentials}
@ -56,11 +57,3 @@ TCP endpoint example with [CLI](../interfaces/cli.md):
``` bash
clickhouse client --secure -h play-api.clickhouse.com --port 9440 -u playground --password clickhouse -q "SELECT 'Play ClickHouse\!'"
```
## Implementation Details {#implementation-details}
ClickHouse Playground web interface makes requests via ClickHouse [HTTP API](../interfaces/http.md).
The Playground backend is just a ClickHouse cluster without any additional server-side application. As mentioned above, ClickHouse HTTPS and TCP/TLS endpoints are also publicly available as a part of the Playground, both are proxied through [Cloudflare Spectrum](https://www.cloudflare.com/products/cloudflare-spectrum/) to add an extra layer of protection and improved global connectivity.
!!! warning "Warning"
Exposing the ClickHouse server to the public internet in any other situation is **strongly not recommended**. Make sure it listens only on a private network and is covered by a properly configured firewall.

View File

@ -80,7 +80,7 @@ clickhouse-client --query='INSERT INTO table FORMAT TabSeparated' < data.tsv
## Import Sample Dataset {#import-sample-dataset}
Now its time to fill our ClickHouse server with some sample data. In this tutorial, well use the anonymized data of Yandex.Metrica, the first service that runs ClickHouse in production way before it became open-source (more on that in [history section](../introduction/history.md)). There are [multiple ways to import Yandex.Metrica dataset](../getting-started/example-datasets/metrica.md), and for the sake of the tutorial, well go with the most realistic one.
Now its time to fill our ClickHouse server with some sample data. In this tutorial, well use some anonymized web analytics data. There are [multiple ways to import the dataset](../getting-started/example-datasets/metrica.md), and for the sake of the tutorial, well go with the most realistic one.
### Download and Extract Table Data {#download-and-extract-table-data}
@ -105,7 +105,7 @@ Syntax for creating tables is way more complicated compared to databases (see [r
2. Table schema, i.e. list of columns and their [data types](../sql-reference/data-types/index.md).
3. [Table engine](../engines/table-engines/index.md) and its settings, which determines all the details on how queries to this table will be physically executed.
Yandex.Metrica is a web analytics service, and sample dataset does not cover its full functionality, so there are only two tables to create:
There are two tables to create:
- `hits` is a table with each action done by all users on all websites covered by the service.
- `visits` is a table that contains pre-built sessions instead of individual actions.
@ -533,19 +533,19 @@ Example config for a cluster with three shards, one replica each:
<perftest_3shards_1replicas>
<shard>
<replica>
<host>example-perftest01j.yandex.ru</host>
<host>example-perftest01j</host>
<port>9000</port>
</replica>
</shard>
<shard>
<replica>
<host>example-perftest02j.yandex.ru</host>
<host>example-perftest02j</host>
<port>9000</port>
</replica>
</shard>
<shard>
<replica>
<host>example-perftest03j.yandex.ru</host>
<host>example-perftest03j</host>
<port>9000</port>
</replica>
</shard>
@ -591,15 +591,15 @@ Example config for a cluster of one shard containing three replicas:
<perftest_1shards_3replicas>
<shard>
<replica>
<host>example-perftest01j.yandex.ru</host>
<host>example-perftest01j</host>
<port>9000</port>
</replica>
<replica>
<host>example-perftest02j.yandex.ru</host>
<host>example-perftest02j</host>
<port>9000</port>
</replica>
<replica>
<host>example-perftest03j.yandex.ru</host>
<host>example-perftest03j</host>
<port>9000</port>
</replica>
</shard>
@ -617,15 +617,15 @@ ZooKeeper locations are specified in the configuration file:
``` xml
<zookeeper>
<node>
<host>zoo01.yandex.ru</host>
<host>zoo01</host>
<port>2181</port>
</node>
<node>
<host>zoo02.yandex.ru</host>
<host>zoo02</host>
<port>2181</port>
</node>
<node>
<host>zoo03.yandex.ru</host>
<host>zoo03</host>
<port>2181</port>
</node>
</zookeeper>

View File

@ -5,7 +5,7 @@ toc_title: Applying CatBoost Models
# Applying a Catboost Model in ClickHouse {#applying-catboost-model-in-clickhouse}
[CatBoost](https://catboost.ai) is a free and open-source gradient boosting library developed at [Yandex](https://yandex.com/company/) for machine learning.
[CatBoost](https://catboost.ai) is a free and open-source gradient boosting library developed at Yandex for machine learning.
With this instruction, you will learn to apply pre-trained models in ClickHouse by running model inference from SQL.

View File

@ -300,7 +300,7 @@ Result:
<tr> <th>Search phrase</th> <th>Count</th> </tr>
<tr> <td></td> <td>8267016</td> </tr>
<tr> <td>bathroom interior design</td> <td>2166</td> </tr>
<tr> <td>yandex</td> <td>1655</td> </tr>
<tr> <td>clickhouse</td> <td>1655</td> </tr>
<tr> <td>spring 2014 fashion</td> <td>1549</td> </tr>
<tr> <td>freeform photos</td> <td>1480</td> </tr>
</table>
@ -371,7 +371,7 @@ Similar to TabSeparated, but outputs a value in name=value format. Names are esc
``` text
SearchPhrase= count()=8267016
SearchPhrase=bathroom interior design count()=2166
SearchPhrase=yandex count()=1655
SearchPhrase=clickhouse count()=1655
SearchPhrase=2014 spring fashion count()=1549
SearchPhrase=freeform photos count()=1480
SearchPhrase=angelina jolie count()=1245
@ -1060,7 +1060,7 @@ XML format is suitable only for output, not for parsing. Example:
<field>2166</field>
</row>
<row>
<SearchPhrase>yandex</SearchPhrase>
<SearchPhrase>clickhouse</SearchPhrase>
<field>1655</field>
</row>
<row>

View File

@ -12,7 +12,7 @@ ClickHouse provides three network interfaces (they can be optionally wrapped in
- [Native TCP](../interfaces/tcp.md), which has less overhead.
- [gRPC](grpc.md).
In most cases it is recommended to use appropriate tool or library instead of interacting with those directly. Officially supported by Yandex are the following:
In most cases it is recommended to use an appropriate tool or library instead of interacting with those directly. The following are officially supported by ClickHouse:
- [Command-line client](../interfaces/cli.md)
- [JDBC driver](../interfaces/jdbc.md)

View File

@ -6,7 +6,7 @@ toc_title: Integrations
# Integration Libraries from Third-party Developers {#integration-libraries-from-third-party-developers}
!!! warning "Disclaimer"
Yandex does **not** maintain the tools and libraries listed below and havent done any extensive testing to ensure their quality.
ClickHouse, Inc. does **not** maintain the tools and libraries listed below and havent done extensive testing to ensure their quality.
## Infrastructure Products {#infrastructure-products}

View File

@ -5,7 +5,7 @@ toc_title: Performance
# Performance {#performance}
According to internal testing results at Yandex, ClickHouse shows the best performance (both the highest throughput for long queries and the lowest latency on short queries) for comparable operating scenarios among systems of its class that were available for testing. You can view the test results on a [separate page](https://clickhouse.com/benchmark/dbms/).
ClickHouse shows the best performance (both the highest throughput for long queries and the lowest latency on short queries) for comparable operating scenarios among systems of its class that were available for testing. You can view the test results on a [separate page](https://clickhouse.com/benchmark/dbms/).
Numerous independent benchmarks came to similar conclusions. They are not difficult to find using an internet search, or you can see [our small collection of related links](https://clickhouse.com/#independent-benchmarks).

View File

@ -0,0 +1,132 @@
---
toc_priority: 69
toc_title: "Named connections"
---
# Storing details for connecting to external sources in configuration files {#named-collections}
Details for connecting to external sources (dictionaries, tables, table functions) can be saved
in configuration files and thus simplify the creation of objects and hide credentials
from users with only SQL access.
Parameters can be set in XML `<format>CSV</format>` and overridden in SQL `, format = 'TSV'`.
The parameters in SQL can be overridden using format `key` = `value`: `compression_method = 'gzip'`.
Named connections are stored in the `config.xml` file of the ClickHouse server in the `<named_collections>` section and are applied when ClickHouse starts.
Example of configuration:
```xml
$ cat /etc/clickhouse-server/config.d/named_collections.xml
<clickhouse>
<named_collections>
...
</named_collections>
</clickhouse>
```
## Named connections for accessing S3.
The description of parameters see [s3 Table Function](../sql-reference/table-functions/s3.md).
Example of configuration:
```xml
<clickhouse>
<named_collections>
<s3_mydata>
<access_key_id>AKIAIOSFODNN7EXAMPLE</access_key_id>
<secret_access_key> wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY</secret_access_key>
<format>CSV</format>
</s3_mydata>
</named_collections>
</clickhouse>
```
### An example of using named connections with the s3 function
```sql
INSERT INTO FUNCTION s3(s3_mydata, url = 'https://s3.us-east-1.amazonaws.com/yourbucket/mydata/test_file.tsv.gz',
format = 'TSV', structure = 'number UInt64', compression_method = 'gzip')
SELECT * FROM numbers(10000);
SELECT count()
FROM s3(s3_mydata, url = 'https://s3.us-east-1.amazonaws.com/yourbucket/mydata/test_file.tsv.gz')
┌─count()─┐
│ 10000 │
└─────────┘
1 rows in set. Elapsed: 0.279 sec. Processed 10.00 thousand rows, 90.00 KB (35.78 thousand rows/s., 322.02 KB/s.)
```
### An example of using named connections with an S3 table
```sql
CREATE TABLE s3_engine_table (number Int64)
ENGINE=S3(s3_mydata, url='https://s3.us-east-1.amazonaws.com/yourbucket/mydata/test_file.tsv.gz', format = 'TSV')
SETTINGS input_format_with_names_use_header = 0;
SELECT * FROM s3_engine_table LIMIT 3;
┌─number─┐
│ 0 │
│ 1 │
│ 2 │
└────────┘
```
## Named connections for accessing MySQL database.
The description of parameters see [mysql](../sql-reference/table-functions/mysql.md).
Example of configuration:
```xml
<clickhouse>
<named_collections>
<mymysql>
<user>myuser</user>
<password>mypass</password>
<host>127.0.0.1</host>
<port>3306</port>
<database>test</database>
<connection_pool_size>8</connection_pool_size>
<on_duplicate_clause>1</on_duplicate_clause>
<replace_query>1</replace_query>
</mymysql>
</named_collections>
</clickhouse>
```
### An example of using named connections with the mysql function
```sql
SELECT count() FROM mysql(mymysql, table = 'test');
┌─count()─┐
│ 3 │
└─────────┘
```
### An example of using named connections with an MySQL table
```sql
CREATE TABLE mytable(A Int64) ENGINE = MySQL(mymysql, table = 'test', connection_pool_size=3, replace_query=0);
SELECT count() FROM mytable;
┌─count()─┐
│ 3 │
└─────────┘
```
### An example of using named with an external dictionary with source MySQL
```sql
CREATE DICTIONARY dict (A Int64, B String)
PRIMARY KEY A
SOURCE(MYSQL(NAME mymysql TABLE 'source'))
LIFETIME(MIN 1 MAX 2)
LAYOUT(HASHED());
SELECT dictGet('dict', 'B', 2);
┌─dictGet('dict', 'B', 2)─┐
│ two │
└─────────────────────────┘
```

View File

@ -59,7 +59,7 @@ wget https://raw.githubusercontent.com/ClickHouse/ClickHouse/master/benchmark/cl
chmod a+x benchmark-new.sh
wget https://raw.githubusercontent.com/ClickHouse/ClickHouse/master/benchmark/clickhouse/queries.sql
```
3. Download test data according to the [Yandex.Metrica dataset](../getting-started/example-datasets/metrica.md) instruction (“hits” table containing 100 million rows).
3. Download the [web analytics dataset](../getting-started/example-datasets/metrica.md) (“hits” table containing 100 million rows).
```bash
wget https://datasets.clickhouse.com/hits/partitions/hits_100m_obfuscated_v1.tar.xz
tar xvf hits_100m_obfuscated_v1.tar.xz -C .
@ -78,6 +78,6 @@ mv hits_100m_obfuscated_v1/* .
```bash
./benchmark-new.sh hits_100m_obfuscated
```
7. Send the numbers and the info about your hardware configuration to clickhouse-feedback@yandex-team.com
7. Send the numbers and the info about your hardware configuration to feedback@clickhouse.com
All the results are published here: https://clickhouse.com/benchmark/hardware/

View File

@ -101,7 +101,7 @@ Quotas can use the “quota key” feature to report on resources for multiple k
<web_global>
<!-- keyed The quota_key "key" is passed in the query parameter,
and the quota is tracked separately for each key value.
For example, you can pass a Yandex.Metrica username as the key,
For example, you can pass a username of your service as the key,
so the quota will be counted separately for each username.
Using keys makes sense only if quota_key is transmitted by the program, not by a user.

View File

@ -410,7 +410,7 @@ Useful for breaking away from a specific network interface.
**Example**
``` xml
<interserver_http_host>example.yandex.ru</interserver_http_host>
<interserver_http_host>example.clickhouse.com</interserver_http_host>
```
## interserver_https_port {#interserver-https-port}
@ -430,7 +430,7 @@ Similar to `interserver_http_host`, except that this hostname can be used by oth
**Example**
``` xml
<interserver_https_host>example.yandex.ru</interserver_https_host>
<interserver_https_host>example.clickhouse.com</interserver_https_host>
```
## interserver_http_credentials {#server-settings-interserver-http-credentials}
@ -1247,7 +1247,7 @@ The time zone is necessary for conversions between String and DateTime formats w
**Example**
``` xml
<timezone>Europe/Moscow</timezone>
<timezone>Asia/Istanbul</timezone>
```
## tcp_port {#server_configuration_parameters-tcp_port}

View File

@ -256,7 +256,7 @@ Possible values:
Default value: 161061273600 (150 GB).
The merge scheduler periodically analyzes the sizes and number of parts in partitions, and if there is enough free resources in the pool, it starts background merges. Merges occur until the total size of the source parts is less than `max_bytes_to_merge_at_max_space_in_pool`.
The merge scheduler periodically analyzes the sizes and number of parts in partitions, and if there is enough free resources in the pool, it starts background merges. Merges occur until the total size of the source parts is larger than `max_bytes_to_merge_at_max_space_in_pool`.
Merges initiated by [OPTIMIZE FINAL](../../sql-reference/statements/optimize.md) ignore `max_bytes_to_merge_at_max_space_in_pool` and merge parts only taking into account available resources (free disk's space) until one part remains in the partition.
@ -346,7 +346,7 @@ Default value: `0`.
**Usage**
The value of the `min_bytes_to_rebalance_partition_over_jbod` setting should be less than the value of the [max_bytes_to_merge_at_max_space_in_pool](../../operations/settings/merge-tree-settings.md#max-bytes-to-merge-at-max-space-in-pool) setting. Otherwise, ClickHouse throws an exception.
The value of the `min_bytes_to_rebalance_partition_over_jbod` setting should not be less than the value of the [max_bytes_to_merge_at_max_space_in_pool](../../operations/settings/merge-tree-settings.md#max-bytes-to-merge-at-max-space-in-pool) / 1024. Otherwise, ClickHouse throws an exception.
## detach_not_byte_identical_parts {#detach_not_byte_identical_parts}

View File

@ -1326,7 +1326,7 @@ If a query from the same user with the same query_id already exists at thi
`1` Cancel the old query and start running the new one.
Yandex.Metrica uses this parameter set to 1 for implementing suggestions for segmentation conditions. After entering the next character, if the old query hasnt finished yet, it should be cancelled.
Set this parameter to 1 for implementing suggestions for segmentation conditions. After entering the next character, if the old query hasnt finished yet, it should be cancelled.
## replace_running_query_max_wait_ms {#replace-running-query-max-wait-ms}
@ -1380,7 +1380,7 @@ load_balancing = nearest_hostname
The number of errors is counted for each replica. Every 5 minutes, the number of errors is integrally divided by 2. Thus, the number of errors is calculated for a recent time with exponential smoothing. If there is one replica with a minimal number of errors (i.e. errors occurred recently on the other replicas), the query is sent to it. If there are multiple replicas with the same minimal number of errors, the query is sent to the replica with a hostname that is most similar to the servers hostname in the config file (for the number of different characters in identical positions, up to the minimum length of both hostnames).
For instance, example01-01-1 and example01-01-2.yandex.ru are different in one position, while example01-01-1 and example01-02-2 differ in two places.
For instance, example01-01-1 and example01-01-2 are different in one position, while example01-01-1 and example01-02-2 differ in two places.
This method might seem primitive, but it does not require external data about network topology, and it does not compare IP addresses, which would be complicated for our IPv6 addresses.
Thus, if there are equivalent replicas, the closest one by name is preferred.

View File

@ -33,7 +33,7 @@ Client section in `config.xml` will look like:
Add Zookeeper to ClickHouse config with some cluster and macros:
``` xml
<yandex>
<clickhouse>
<zookeeper>
<node>
<host>localhost</host>
@ -41,7 +41,7 @@ Add Zookeeper to ClickHouse config with some cluster and macros:
<secure>1</secure>
</node>
</zookeeper>
</yandex>
</clickhouse>
```
Start `clickhouse-server`. In logs you should see:

View File

@ -14,18 +14,18 @@ Columns:
```
```text
┌─name─────────────────────┬─is_aggregate─┬─case_insensitive─┬─alias_to─┐
sumburConsistentHash │ 0 │ 0 │ │
yandexConsistentHash │ 0 │ 0 │ │
demangle │ 0 │ 0 │ │
addressToLine │ 0 │ 0 │ │
JSONExtractRaw │ 0 │ 0 │ │
JSONExtractKeysAndValues │ 0 │ 0 │ │
JSONExtract │ 0 │ 0 │ │
JSONExtractString │ 0 │ 0 │ │
JSONExtractFloat │ 0 │ 0 │ │
JSONExtractInt │ 0 │ 0 │ │
└──────────────────────────┴──────────────┴──────────────────┴──────────┘
┌─name──────────────────┬─is_aggregate─┬─case_insensitive─┬─alias_to─┬─create_query─┬─origin─┐
logTrace │ 0 │ 0 │ │ │ System │
aes_decrypt_mysql │ 0 │ 0 │ │ │ System │
aes_encrypt_mysql │ 0 │ 0 │ │ │ System │
decrypt │ 0 │ 0 │ │ │ System │
encrypt │ 0 │ 0 │ │ │ System │
toBool │ 0 │ 0 │ │ │ System │
windowID │ 0 │ 0 │ │ │ System │
hopStart │ 0 │ 0 │ │ │ System │
hop │ 0 │ 0 │ │ │ System │
snowflakeToDateTime64 │ 0 │ 0 │ │ │ System │
└───────────────────────┴──────────────┴──────────────────┴──────────┴──────────────┴────────┘
10 rows in set. Elapsed: 0.002 sec.
```

View File

@ -65,13 +65,13 @@ Row 1:
──────
database: merge
table: visits_v2
replica_name: mtgiga001-1t.metrika.yandex.net
replica_name: mtgiga001-1t
position: 15
node_name: queue-0009325559
type: MERGE_PARTS
create_time: 2020-12-07 14:04:21
required_quorum: 0
source_replica: mtgiga001-1t.metrika.yandex.net
source_replica: mtgiga001-1t
new_part_name: 20201130_121373_121384_2
parts_to_merge: ['20201130_121373_121378_1','20201130_121379_121379_0','20201130_121380_121380_0','20201130_121381_121381_0','20201130_121382_121382_0','20201130_121383_121383_0','20201130_121384_121384_0']
is_detach: 0

View File

@ -40,7 +40,7 @@ FORMAT Vertical
``` text
Row 1:
──────
name: example01-08-1.yandex.ru
name: example01-08-1
value:
czxid: 932998691229
mzxid: 932998691229
@ -57,7 +57,7 @@ path: /clickhouse/tables/01-08/visits/replicas
Row 2:
──────
name: example01-08-2.yandex.ru
name: example01-08-2
value:
czxid: 933002738135
mzxid: 933002738135

View File

@ -139,7 +139,7 @@ With the default settings, ZooKeeper is a time bomb:
This bomb must be defused.
The ZooKeeper (3.5.1) configuration below is used in the Yandex.Metrica production environment as of May 20, 2017:
The ZooKeeper (3.5.1) configuration below is used in a large production environment:
zoo.cfg:

View File

@ -13,7 +13,7 @@ The suffix -If can be appended to the name of any aggregate function. In this ca
Examples: `sumIf(column, cond)`, `countIf(cond)`, `avgIf(x, cond)`, `quantilesTimingIf(level1, level2)(x, cond)`, `argMinIf(arg, val, cond)` and so on.
With conditional aggregate functions, you can calculate aggregates for several conditions at once, without using subqueries and `JOIN`s. For example, in Yandex.Metrica, conditional aggregate functions are used to implement the segment comparison functionality.
With conditional aggregate functions, you can calculate aggregates for several conditions at once, without using subqueries and `JOIN`s. For example, conditional aggregate functions can be used to implement the segment comparison functionality.
## -Array {#agg-functions-combinator-array}

View File

@ -40,7 +40,7 @@ When inserting data into ClickHouse, you can use different formats of date and t
``` sql
CREATE TABLE dt
(
`timestamp` DateTime('Europe/Moscow'),
`timestamp` DateTime('Asia/Istanbul'),
`event_id` UInt8
)
ENGINE = TinyLog;
@ -61,13 +61,13 @@ SELECT * FROM dt;
└─────────────────────┴──────────┘
```
- When inserting datetime as an integer, it is treated as Unix Timestamp (UTC). `1546300800` represents `'2019-01-01 00:00:00'` UTC. However, as `timestamp` column has `Europe/Moscow` (UTC+3) timezone specified, when outputting as string the value will be shown as `'2019-01-01 03:00:00'`
- When inserting string value as datetime, it is treated as being in column timezone. `'2019-01-01 00:00:00'` will be treated as being in `Europe/Moscow` timezone and saved as `1546290000`.
- When inserting datetime as an integer, it is treated as Unix Timestamp (UTC). `1546300800` represents `'2019-01-01 00:00:00'` UTC. However, as `timestamp` column has `Asia/Istanbul` (UTC+3) timezone specified, when outputting as string the value will be shown as `'2019-01-01 03:00:00'`
- When inserting string value as datetime, it is treated as being in column timezone. `'2019-01-01 00:00:00'` will be treated as being in `Asia/Istanbul` timezone and saved as `1546290000`.
**2.** Filtering on `DateTime` values
``` sql
SELECT * FROM dt WHERE timestamp = toDateTime('2019-01-01 00:00:00', 'Europe/Moscow')
SELECT * FROM dt WHERE timestamp = toDateTime('2019-01-01 00:00:00', 'Asia/Istanbul')
```
``` text
@ -91,12 +91,12 @@ SELECT * FROM dt WHERE timestamp = '2019-01-01 00:00:00'
**3.** Getting a time zone for a `DateTime`-type column:
``` sql
SELECT toDateTime(now(), 'Europe/Moscow') AS column, toTypeName(column) AS x
SELECT toDateTime(now(), 'Asia/Istanbul') AS column, toTypeName(column) AS x
```
``` text
┌──────────────column─┬─x─────────────────────────┐
│ 2019-10-16 04:12:04 │ DateTime('Europe/Moscow') │
│ 2019-10-16 04:12:04 │ DateTime('Asia/Istanbul') │
└─────────────────────┴───────────────────────────┘
```
@ -105,7 +105,7 @@ SELECT toDateTime(now(), 'Europe/Moscow') AS column, toTypeName(column) AS x
``` sql
SELECT
toDateTime(timestamp, 'Europe/London') as lon_time,
toDateTime(timestamp, 'Europe/Moscow') as mos_time
toDateTime(timestamp, 'Asia/Istanbul') as mos_time
FROM dt
```

View File

@ -27,7 +27,7 @@ Supported range of values: \[1925-01-01 00:00:00, 2283-11-11 23:59:59.99999999\]
``` sql
CREATE TABLE dt
(
`timestamp` DateTime64(3, 'Europe/Moscow'),
`timestamp` DateTime64(3, 'Asia/Istanbul'),
`event_id` UInt8
)
ENGINE = TinyLog;
@ -48,13 +48,13 @@ SELECT * FROM dt;
└─────────────────────────┴──────────┘
```
- When inserting datetime as an integer, it is treated as an appropriately scaled Unix Timestamp (UTC). `1546300800000` (with precision 3) represents `'2019-01-01 00:00:00'` UTC. However, as `timestamp` column has `Europe/Moscow` (UTC+3) timezone specified, when outputting as a string the value will be shown as `'2019-01-01 03:00:00'`.
- When inserting string value as datetime, it is treated as being in column timezone. `'2019-01-01 00:00:00'` will be treated as being in `Europe/Moscow` timezone and stored as `1546290000000`.
- When inserting datetime as an integer, it is treated as an appropriately scaled Unix Timestamp (UTC). `1546300800000` (with precision 3) represents `'2019-01-01 00:00:00'` UTC. However, as `timestamp` column has `Asia/Istanbul` (UTC+3) timezone specified, when outputting as a string the value will be shown as `'2019-01-01 03:00:00'`.
- When inserting string value as datetime, it is treated as being in column timezone. `'2019-01-01 00:00:00'` will be treated as being in `Asia/Istanbul` timezone and stored as `1546290000000`.
2. Filtering on `DateTime64` values
``` sql
SELECT * FROM dt WHERE timestamp = toDateTime64('2019-01-01 00:00:00', 3, 'Europe/Moscow');
SELECT * FROM dt WHERE timestamp = toDateTime64('2019-01-01 00:00:00', 3, 'Asia/Istanbul');
```
``` text
@ -68,12 +68,12 @@ Unlike `DateTime`, `DateTime64` values are not converted from `String` automatic
3. Getting a time zone for a `DateTime64`-type value:
``` sql
SELECT toDateTime64(now(), 3, 'Europe/Moscow') AS column, toTypeName(column) AS x;
SELECT toDateTime64(now(), 3, 'Asia/Istanbul') AS column, toTypeName(column) AS x;
```
``` text
┌──────────────────column─┬─x──────────────────────────────┐
│ 2019-10-16 04:12:04.000 │ DateTime64(3, 'Europe/Moscow') │
│ 2019-10-16 04:12:04.000 │ DateTime64(3, 'Asia/Istanbul') │
└─────────────────────────┴────────────────────────────────┘
```
@ -82,7 +82,7 @@ SELECT toDateTime64(now(), 3, 'Europe/Moscow') AS column, toTypeName(column) AS
``` sql
SELECT
toDateTime64(timestamp, 3, 'Europe/London') as lon_time,
toDateTime64(timestamp, 3, 'Europe/Moscow') as mos_time
toDateTime64(timestamp, 3, 'Asia/Istanbul') as mos_time
FROM dt;
```

View File

@ -14,7 +14,7 @@ This allows you to:
- Check whether a region is part of another region.
- Get a chain of parent regions.
All the functions support “translocality,” the ability to simultaneously use different perspectives on region ownership. For more information, see the section “Functions for working with Yandex.Metrica dictionaries”.
All the functions support “translocality,” the ability to simultaneously use different perspectives on region ownership. For more information, see the section “Functions for working with web analytics dictionaries”.
The internal dictionaries are disabled in the default package.
To enable them, uncomment the parameters `path_to_regions_hierarchy_file` and `path_to_regions_names_files` in the server configuration file.
@ -48,5 +48,3 @@ Dictionary updates (other than loading at first use) do not block queries. Durin
We recommend periodically updating the dictionaries with the geobase. During an update, generate new files and write them to a separate location. When everything is ready, rename them to the files used by the server.
There are also functions for working with OS identifiers and Yandex.Metrica search engines, but they shouldnt be used.

View File

@ -358,13 +358,13 @@ Query with timezone:
``` sql
WITH toDateTime64('2020-01-01 10:20:30.999', 3) AS dt64
SELECT toStartOfSecond(dt64, 'Europe/Moscow');
SELECT toStartOfSecond(dt64, 'Asia/Istanbul');
```
Result:
``` text
┌─toStartOfSecond(dt64, 'Europe/Moscow')─┐
┌─toStartOfSecond(dt64, 'Asia/Istanbul')─┐
│ 2020-01-01 13:20:30.000 │
└────────────────────────────────────────┘
```
@ -560,13 +560,13 @@ Result:
Query with the specified timezone:
```sql
SELECT now(), date_trunc('hour', now(), 'Europe/Moscow');
SELECT now(), date_trunc('hour', now(), 'Asia/Istanbul');
```
Result:
```text
┌───────────────now()─┬─date_trunc('hour', now(), 'Europe/Moscow')─┐
┌───────────────now()─┬─date_trunc('hour', now(), 'Asia/Istanbul')─┐
│ 2020-09-28 10:46:26 │ 2020-09-28 13:00:00 │
└─────────────────────┴────────────────────────────────────────────┘
```
@ -871,13 +871,13 @@ Result:
Query with the specified timezone:
``` sql
SELECT now('Europe/Moscow');
SELECT now('Asia/Istanbul');
```
Result:
``` text
┌─now('Europe/Moscow')─┐
┌─now('Asia/Istanbul')─┐
│ 2020-10-17 10:42:23 │
└──────────────────────┘
```
@ -895,7 +895,6 @@ The same as today() - 1.
## timeSlot {#timeslot}
Rounds the time to the half hour.
This function is specific to Yandex.Metrica, since half an hour is the minimum amount of time for breaking a session into two sessions if a tracking tag shows a single users consecutive pageviews that differ in time by strictly more than this amount. This means that tuples (the tag ID, user ID, and time slot) can be used to search for pageviews that are included in the corresponding session.
## toYYYYMM {#toyyyymm}

View File

@ -220,7 +220,7 @@ Result:
A fast, decent-quality non-cryptographic hash function for a string obtained from a URL using some type of normalization.
`URLHash(s)` Calculates a hash from a string without one of the trailing symbols `/`,`?` or `#` at the end, if present.
`URLHash(s, N)` Calculates a hash from a string up to the N level in the URL hierarchy, without one of the trailing symbols `/`,`?` or `#` at the end, if present.
Levels are the same as in URLHierarchy. This function is specific to Yandex.Metrica.
Levels are the same as in URLHierarchy.
## farmFingerprint64 {#farmfingerprint64}

View File

@ -5,9 +5,7 @@ toc_title: JSON
# Functions for Working with JSON {#functions-for-working-with-json}
In Yandex.Metrica, JSON is transmitted by users as session parameters. There are some special functions for working with this JSON. (Although in most of the cases, the JSONs are additionally pre-processed, and the resulting values are put in separate columns in their processed format.) All these functions are based on strong assumptions about what the JSON can be, but they try to do as little as possible to get the job done.
The following assumptions are made:
ClickHouse has special functions for working with this JSON. The `visitParam` functions make strong assumptions about what the JSON can be, but they try to do as little as possible to get the job done. The following assumptions are made:
1. The field name (function argument) must be a constant.
2. The field name is somehow canonically encoded in JSON. For example: `visitParamHas('{"abc":"def"}', 'abc') = 1`, but `visitParamHas('{"\\u0061\\u0062\\u0063":"def"}', 'abc') = 0`

View File

@ -189,11 +189,11 @@ Accepts a number. If the number is less than one, it returns 0. Otherwise, it ro
## roundDuration(num) {#rounddurationnum}
Accepts a number. If the number is less than one, it returns 0. Otherwise, it rounds the number down to numbers from the set: 1, 10, 30, 60, 120, 180, 240, 300, 600, 1200, 1800, 3600, 7200, 18000, 36000. This function is specific to Yandex.Metrica and used for implementing the report on session length.
Accepts a number. If the number is less than one, it returns 0. Otherwise, it rounds the number down to numbers from the set: 1, 10, 30, 60, 120, 180, 240, 300, 600, 1200, 1800, 3600, 7200, 18000, 36000. This function was specifically implemented for a web analytics use case for reporting on session lengths.
## roundAge(num) {#roundagenum}
Accepts a number. If the number is less than 18, it returns 0. Otherwise, it rounds the number down to a number from the set: 18, 25, 35, 45, 55. This function is specific to Yandex.Metrica and used for implementing the report on user age.
Accepts a number. If the number is less than 18, it returns 0. Otherwise, it rounds the number down to a number from the set: 18, 25, 35, 45, 55.
## roundDown(num, arr) {#rounddownnum-arr}

View File

@ -1012,7 +1012,7 @@ Result:
Query:
``` sql
SELECT parseDateTimeBestEffort('Sat, 18 Aug 2018 07:22:16 GMT', 'Europe/Moscow')
SELECT parseDateTimeBestEffort('Sat, 18 Aug 2018 07:22:16 GMT', 'Asia/Istanbul')
AS parseDateTimeBestEffort;
```
@ -1206,7 +1206,7 @@ Result:
Query:
``` sql
SELECT parseDateTimeBestEffortUSOrNull('02-10-2021 21:12:57 GMT', 'Europe/Moscow') AS parseDateTimeBestEffortUSOrNull;
SELECT parseDateTimeBestEffortUSOrNull('02-10-2021 21:12:57 GMT', 'Asia/Istanbul') AS parseDateTimeBestEffortUSOrNull;
```
Result:
@ -1292,7 +1292,7 @@ Result:
Query:
``` sql
SELECT parseDateTimeBestEffortUSOrZero('02-10-2021 21:12:57 GMT', 'Europe/Moscow') AS parseDateTimeBestEffortUSOrZero;
SELECT parseDateTimeBestEffortUSOrZero('02-10-2021 21:12:57 GMT', 'Asia/Istanbul') AS parseDateTimeBestEffortUSOrZero;
```
Result:
@ -1362,7 +1362,7 @@ SELECT parseDateTime64BestEffort('2021-01-01 01:01:00.12346') AS a, toTypeName(a
UNION ALL
SELECT parseDateTime64BestEffort('2021-01-01 01:01:00.12346',6) AS a, toTypeName(a) AS t
UNION ALL
SELECT parseDateTime64BestEffort('2021-01-01 01:01:00.12346',3,'Europe/Moscow') AS a, toTypeName(a) AS t
SELECT parseDateTime64BestEffort('2021-01-01 01:01:00.12346',3,'Asia/Istanbul') AS a, toTypeName(a) AS t
FORMAT PrettyCompactMonoBlock;
```
@ -1373,7 +1373,7 @@ Result:
│ 2021-01-01 01:01:00.123000 │ DateTime64(3) │
│ 2021-01-01 00:00:00.000000 │ DateTime64(3) │
│ 2021-01-01 01:01:00.123460 │ DateTime64(6) │
│ 2020-12-31 22:01:00.123000 │ DateTime64(3, 'Europe/Moscow') │
│ 2020-12-31 22:01:00.123000 │ DateTime64(3, 'Asia/Istanbul') │
└────────────────────────────┴────────────────────────────────┘
```

View File

@ -34,7 +34,7 @@ The URL can be specified with or without a scheme. Examples:
``` text
svn+ssh://some.svn-hosting.com:80/repo/trunk
some.svn-hosting.com:80/repo/trunk
https://yandex.com/time/
https://clickhouse.com/time/
```
For these examples, the `domain` function returns the following results:
@ -42,7 +42,7 @@ For these examples, the `domain` function returns the following results:
``` text
some.svn-hosting.com
some.svn-hosting.com
yandex.com
clickhouse.com
```
**Returned values**
@ -85,7 +85,7 @@ The URL can be specified with or without a scheme. Examples:
``` text
svn+ssh://some.svn-hosting.com:80/repo/trunk
some.svn-hosting.com:80/repo/trunk
https://yandex.com/time/
https://clickhouse.com/time/
```
**Returned values**
@ -109,7 +109,7 @@ SELECT topLevelDomain('svn+ssh://www.some.svn-hosting.com:80/repo/trunk');
### firstSignificantSubdomain {#firstsignificantsubdomain}
Returns the “first significant subdomain”. This is a non-standard concept specific to Yandex.Metrica. The first significant subdomain is a second-level domain if it is com, net, org, or co. Otherwise, it is a third-level domain. For example, `firstSignificantSubdomain (https://news.yandex.ru/) = yandex, firstSignificantSubdomain (https://news.yandex.com.tr/) = yandex`. The list of “insignificant” second-level domains and other implementation details may change in the future.
Returns the “first significant subdomain”. The first significant subdomain is a second-level domain if it is com, net, org, or co. Otherwise, it is a third-level domain. For example, `firstSignificantSubdomain (https://news.clickhouse.com/) = clickhouse, firstSignificantSubdomain (https://news.clickhouse.com.tr/) = clickhouse`. The list of “insignificant” second-level domains and other implementation details may change in the future.
### cutToFirstSignificantSubdomain {#cuttofirstsignificantsubdomain}
@ -117,7 +117,7 @@ Returns the part of the domain that includes top-level subdomains up to the “f
For example:
- `cutToFirstSignificantSubdomain('https://news.yandex.com.tr/') = 'yandex.com.tr'`.
- `cutToFirstSignificantSubdomain('https://news.clickhouse.com.tr/') = 'clickhouse.com.tr'`.
- `cutToFirstSignificantSubdomain('www.tr') = 'tr'`.
- `cutToFirstSignificantSubdomain('tr') = ''`.
@ -127,7 +127,7 @@ Returns the part of the domain that includes top-level subdomains up to the “f
For example:
- `cutToFirstSignificantSubdomain('https://news.yandex.com.tr/') = 'yandex.com.tr'`.
- `cutToFirstSignificantSubdomain('https://news.clickhouse.com.tr/') = 'clickhouse.com.tr'`.
- `cutToFirstSignificantSubdomain('www.tr') = 'www.tr'`.
- `cutToFirstSignificantSubdomain('tr') = ''`.
@ -335,7 +335,7 @@ Returns an array containing the URL, truncated at the end by the symbols /,? in
### URLPathHierarchy(URL) {#urlpathhierarchyurl}
The same as above, but without the protocol and host in the result. The / element (root) is not included. Example: the function is used to implement tree reports the URL in Yandex. Metric.
The same as above, but without the protocol and host in the result. The / element (root) is not included.
``` text
URLPathHierarchy('https://example.com/browse/CONV-6788') =

View File

@ -254,7 +254,7 @@ You can work with dates without using `INTERVAL`, just by adding or subtracting
Examples:
``` sql
SELECT toDateTime('2014-10-26 00:00:00', 'Europe/Moscow') AS time, time + 60 * 60 * 24 AS time_plus_24_hours, time + toIntervalDay(1) AS time_plus_1_day;
SELECT toDateTime('2014-10-26 00:00:00', 'Asia/Istanbul') AS time, time + 60 * 60 * 24 AS time_plus_24_hours, time + toIntervalDay(1) AS time_plus_1_day;
```
``` text

View File

@ -26,11 +26,11 @@ A table with the specified structure for reading or writing data in the specifie
**Examples**
Selecting the first two rows from the table from S3 file `https://storage.yandexcloud.net/my-test-bucket-768/data.csv`:
Selecting the first two rows from the table from S3 file `https://clickhouse-public-datasets.s3.amazonaws.com/my-test-bucket-768/data.csv`:
``` sql
SELECT *
FROM s3('https://storage.yandexcloud.net/my-test-bucket-768/data.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 UInt32')
FROM s3('https://clickhouse-public-datasets.s3.amazonaws.com/my-test-bucket-768/data.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 UInt32')
LIMIT 2;
```
@ -45,7 +45,7 @@ The similar but from file with `gzip` compression:
``` sql
SELECT *
FROM s3('https://storage.yandexcloud.net/my-test-bucket-768/data.csv.gz', 'CSV', 'column1 UInt32, column2 UInt32, column3 UInt32', 'gzip')
FROM s3('https://clickhouse-public-datasets.s3.amazonaws.com/my-test-bucket-768/data.csv.gz', 'CSV', 'column1 UInt32, column2 UInt32, column3 UInt32', 'gzip')
LIMIT 2;
```
@ -60,20 +60,20 @@ LIMIT 2;
Suppose that we have several files with following URIs on S3:
- 'https://storage.yandexcloud.net/my-test-bucket-768/some_prefix/some_file_1.csv'
- 'https://storage.yandexcloud.net/my-test-bucket-768/some_prefix/some_file_2.csv'
- 'https://storage.yandexcloud.net/my-test-bucket-768/some_prefix/some_file_3.csv'
- 'https://storage.yandexcloud.net/my-test-bucket-768/some_prefix/some_file_4.csv'
- 'https://storage.yandexcloud.net/my-test-bucket-768/another_prefix/some_file_1.csv'
- 'https://storage.yandexcloud.net/my-test-bucket-768/another_prefix/some_file_2.csv'
- 'https://storage.yandexcloud.net/my-test-bucket-768/another_prefix/some_file_3.csv'
- 'https://storage.yandexcloud.net/my-test-bucket-768/another_prefix/some_file_4.csv'
- 'https://clickhouse-public-datasets.s3.amazonaws.com/my-test-bucket-768/some_prefix/some_file_1.csv'
- 'https://clickhouse-public-datasets.s3.amazonaws.com/my-test-bucket-768/some_prefix/some_file_2.csv'
- 'https://clickhouse-public-datasets.s3.amazonaws.com/my-test-bucket-768/some_prefix/some_file_3.csv'
- 'https://clickhouse-public-datasets.s3.amazonaws.com/my-test-bucket-768/some_prefix/some_file_4.csv'
- 'https://clickhouse-public-datasets.s3.amazonaws.com/my-test-bucket-768/another_prefix/some_file_1.csv'
- 'https://clickhouse-public-datasets.s3.amazonaws.com/my-test-bucket-768/another_prefix/some_file_2.csv'
- 'https://clickhouse-public-datasets.s3.amazonaws.com/my-test-bucket-768/another_prefix/some_file_3.csv'
- 'https://clickhouse-public-datasets.s3.amazonaws.com/my-test-bucket-768/another_prefix/some_file_4.csv'
Count the amount of rows in files ending with numbers from 1 to 3:
``` sql
SELECT count(*)
FROM s3('https://storage.yandexcloud.net/my-test-bucket-768/{some,another}_prefix/some_file_{1..3}.csv', 'CSV', 'name String, value UInt32')
FROM s3('https://clickhouse-public-datasets.s3.amazonaws.com/my-test-bucket-768/{some,another}_prefix/some_file_{1..3}.csv', 'CSV', 'name String, value UInt32')
```
``` text
@ -86,7 +86,7 @@ Count the total amount of rows in all files in these two directories:
``` sql
SELECT count(*)
FROM s3('https://storage.yandexcloud.net/my-test-bucket-768/{some,another}_prefix/*', 'CSV', 'name String, value UInt32')
FROM s3('https://clickhouse-public-datasets.s3.amazonaws.com/my-test-bucket-768/{some,another}_prefix/*', 'CSV', 'name String, value UInt32')
```
``` text
@ -102,7 +102,7 @@ Count the total amount of rows in files named `file-000.csv`, `file-001.csv`,
``` sql
SELECT count(*)
FROM s3('https://storage.yandexcloud.net/my-test-bucket-768/big_prefix/file-{000..999}.csv', 'CSV', 'name String, value UInt32');
FROM s3('https://clickhouse-public-datasets.s3.amazonaws.com/my-test-bucket-768/big_prefix/file-{000..999}.csv', 'CSV', 'name String, value UInt32');
```
``` text
@ -114,14 +114,14 @@ FROM s3('https://storage.yandexcloud.net/my-test-bucket-768/big_prefix/file-{000
Insert data into file `test-data.csv.gz`:
``` sql
INSERT INTO FUNCTION s3('https://storage.yandexcloud.net/my-test-bucket-768/test-data.csv.gz', 'CSV', 'name String, value UInt32', 'gzip')
INSERT INTO FUNCTION s3('https://clickhouse-public-datasets.s3.amazonaws.com/my-test-bucket-768/test-data.csv.gz', 'CSV', 'name String, value UInt32', 'gzip')
VALUES ('test-data', 1), ('test-data-2', 2);
```
Insert data into file `test-data.csv.gz` from existing table:
``` sql
INSERT INTO FUNCTION s3('https://storage.yandexcloud.net/my-test-bucket-768/test-data.csv.gz', 'CSV', 'name String, value UInt32', 'gzip')
INSERT INTO FUNCTION s3('https://clickhouse-public-datasets.s3.amazonaws.com/my-test-bucket-768/test-data.csv.gz', 'CSV', 'name String, value UInt32', 'gzip')
SELECT name, value FROM existing_table;
```

View File

@ -926,7 +926,7 @@ toc_title: '2018'
#### Backward Incompatible Changes: {#backward-incompatible-changes-10}
- Removed the `distributed_ddl_allow_replicated_alter` option. This behavior is enabled by default.
- Removed the `strict_insert_defaults` setting. If you were using this functionality, write to `clickhouse-feedback@yandex-team.com`.
- Removed the `strict_insert_defaults` setting. If you were using this functionality, write to `feedback@clickhouse.com`.
- Removed the `UnsortedMergeTree` engine.
### ClickHouse Release 1.1.54343, 2018-02-05 {#clickhouse-release-1-1-54343-2018-02-05}
@ -953,7 +953,7 @@ This release contains bug fixes for the previous release 1.1.54337:
- Added support for storage of multi-dimensional arrays and tuples (`Tuple` data type) in tables.
- Support for table functions for `DESCRIBE` and `INSERT` queries. Added support for subqueries in `DESCRIBE`. Examples: `DESC TABLE remote('host', default.hits)`; `DESC TABLE (SELECT 1)`; `INSERT INTO TABLE FUNCTION remote('host', default.hits)`. Support for `INSERT INTO TABLE` in addition to `INSERT INTO`.
- Improved support for time zones. The `DateTime` data type can be annotated with the timezone that is used for parsing and formatting in text formats. Example: `DateTime('Europe/Moscow')`. When timezones are specified in functions for `DateTime` arguments, the return type will track the timezone, and the value will be displayed as expected.
- Improved support for time zones. The `DateTime` data type can be annotated with the timezone that is used for parsing and formatting in text formats. Example: `DateTime('Asia/Istanbul')`. When timezones are specified in functions for `DateTime` arguments, the return type will track the timezone, and the value will be displayed as expected.
- Added the functions `toTimeZone`, `timeDiff`, `toQuarter`, `toRelativeQuarterNum`. The `toRelativeHour`/`Minute`/`Second` functions can take a value of type `Date` as an argument. The `now` function name is case-sensitive.
- Added the `toStartOfFifteenMinutes` function (Kirill Shvakov).
- Added the `clickhouse format` tool for formatting queries.
@ -1049,7 +1049,7 @@ This release contains bug fixes for the previous release 1.1.54337:
- The `runningIncome` function was renamed to `runningDifferenceStartingWithFirstvalue` to avoid confusion.
- Removed the `FROM ARRAY JOIN arr` syntax when ARRAY JOIN is specified directly after FROM with no table (Amos Bird).
- Removed the `BlockTabSeparated` format that was used solely for demonstration purposes.
- Changed the state format for aggregate functions `varSamp`, `varPop`, `stddevSamp`, `stddevPop`, `covarSamp`, `covarPop`, `corr`. If you have stored states of these aggregate functions in tables (using the `AggregateFunction` data type or materialized views with corresponding states), please write to clickhouse-feedback@yandex-team.com.
- Changed the state format for aggregate functions `varSamp`, `varPop`, `stddevSamp`, `stddevPop`, `covarSamp`, `covarPop`, `corr`. If you have stored states of these aggregate functions in tables (using the `AggregateFunction` data type or materialized views with corresponding states), please write to feedback@clickhouse.com.
- In previous server versions there was an undocumented feature: if an aggregate function depends on parameters, you can still specify it without parameters in the AggregateFunction data type. Example: `AggregateFunction(quantiles, UInt64)` instead of `AggregateFunction(quantiles(0.5, 0.9), UInt64)`. This feature was lost. Although it was undocumented, we plan to support it again in future releases.
- Enum data types cannot be used in min/max aggregate functions. This ability will be returned in the next release.

View File

@ -1039,7 +1039,7 @@ toc_title: '2019'
- Fix insert and select query to MySQL engine with MySQL style identifier quoting. [#5704](https://github.com/ClickHouse/ClickHouse/pull/5704) ([Winter Zhang](https://github.com/zhang2014))
- Now `CHECK TABLE` query can work with MergeTree engine family. It returns check status and message if any for each part (or file in case of simplier engines). Also, fix bug in fetch of a broken part. [#5865](https://github.com/ClickHouse/ClickHouse/pull/5865) ([alesapin](https://github.com/alesapin))
- Fix SPLIT_SHARED_LIBRARIES runtime [#5793](https://github.com/ClickHouse/ClickHouse/pull/5793) ([Danila Kutenin](https://github.com/danlark1))
- Fixed time zone initialization when `/etc/localtime` is a relative symlink like `../usr/share/zoneinfo/Europe/Moscow` [#5922](https://github.com/ClickHouse/ClickHouse/pull/5922) ([alexey-milovidov](https://github.com/alexey-milovidov))
- Fixed time zone initialization when `/etc/localtime` is a relative symlink like `../usr/share/zoneinfo/Asia/Istanbul` [#5922](https://github.com/ClickHouse/ClickHouse/pull/5922) ([alexey-milovidov](https://github.com/alexey-milovidov))
- clickhouse-copier: Fix use-after free on shutdown [#5752](https://github.com/ClickHouse/ClickHouse/pull/5752) ([proller](https://github.com/proller))
- Updated `simdjson`. Fixed the issue that some invalid JSONs with zero bytes successfully parse. [#5938](https://github.com/ClickHouse/ClickHouse/pull/5938) ([alexey-milovidov](https://github.com/alexey-milovidov))
- Fix shutdown of SystemLogs [#5802](https://github.com/ClickHouse/ClickHouse/pull/5802) ([Anton Popov](https://github.com/CurtizJ))

View File

@ -130,7 +130,7 @@ SELECT queries are sent to all the shards and work regardless of how data is dis
次の場合は、シャーディングスキームについて心配する必要があります:
- 特定のキーによるデータの結合(INまたはJOIN)を必要とするクエリが使用されます。 このキーによってデータがシャードされる場合は、GLOBAL INまたはGLOBAL JOINの代わりにlocal INまたはJOINを使用できます。
- 多数のサーバーが、多数の小さなクエリ(個々のクライアント-ウェブサイト、広告主、またはパートナーのクエリ)で使用されます(数百以上)。 小さなクエリがクラスタ全体に影響を与えないようにするには、単一のシャード上の単一のクライアントのデータを検索することが理にかなってい また、我々はYandexのでやったように。Metricaでは、biレベルのシャーディングを設定できます:クラスタ全体を次のように分割します “layers” ここで、レイヤーは複数のシャードで構成されます。 単一のクライアントのデータは単一のレイヤー上にありますが、必要に応じてシャードをレイヤーに追加することができ、データはランダムに分散されます。 分散テーブルはレイヤごとに作成され、グローバルクエリ用に単一の共有分散テーブルが作成されます。
- 多数のサーバーが、多数の小さなクエリ(個々のクライアント-ウェブサイト、広告主、またはパートナーのクエリ)で使用されます(数百以上)。 小さなクエリがクラスタ全体に影響を与えないようにするには、単一のシャード上の単一のクライアントのデータを検索することが理にかなってい また レベルのシャーディングを設定できます:クラスタ全体を次のように分割します “layers” ここで、レイヤーは複数のシャードで構成されます。 単一のクライアントのデータは単一のレイヤー上にありますが、必要に応じてシャードをレイヤーに追加することができ、データはランダムに分散されます。 分散テーブルはレイヤごとに作成され、グローバルクエリ用に単一の共有分散テーブルが作成されます。
データは非同期に書き込まれます。 テーブルに挿入すると、データブロックはローカルファイルシステムに書き込まれます。 データはできるだけ早くバックグラウンドでリモートサーバーに送信されます。 データを送信するための期間は、 [distributed_directory_monitor_sleep_time_ms](../../../operations/settings/settings.md#distributed_directory_monitor_sleep_time_ms) と [distributed_directory_monitor_max_sleep_time_ms](../../../operations/settings/settings.md#distributed_directory_monitor_max_sleep_time_ms) 設定。 その `Distributed` エンジンは、挿入されたデータを含む各ファイルを別々に送信しますが、 [distributed_directory_monitor_batch_inserts](../../../operations/settings/settings.md#distributed_directory_monitor_batch_inserts) 設定。 この設定の改善にクラスターの性能をより一層の活用地域のサーバやネットワーク資源です。 を確認しておきましょうか否かのデータが正常に送信されるチェックリストファイル(データまたは間に-をはさんだ)はテーブルディレクトリ: `/var/lib/clickhouse/data/database/table/`.

View File

@ -21,7 +21,7 @@ toc_title: "ニューヨークタクシー"
ファイルの中には、無効な行が含まれている場合があり、以下のように修正することができます。
``` bash
sed -E '/(.*,){18,}/d' data/yellow_tripdata_2010-02.csv > data/yellow_tripdata_2010-02.csv_
sed -E '/(.*,){18,}/d' data/yellow_tripdata_2010-02.csv > data/yellow_tripdata_2010-02.csv_Yand
sed -E '/(.*,){18,}/d' data/yellow_tripdata_2010-03.csv > data/yellow_tripdata_2010-03.csv_
mv data/yellow_tripdata_2010-02.csv_ data/yellow_tripdata_2010-02.csv
mv data/yellow_tripdata_2010-03.csv_ data/yellow_tripdata_2010-03.csv
@ -378,7 +378,6 @@ Q3:0.051秒
Q4:0.072秒
この場合、クエリの処理時間は、ネットワークのレイテンシによって決定されます。
フィンランドのYandexデータセンターにあるクライアントをロシアのクラスター上に置いてクエリを実行したところ、約20ミリ秒のレイテンシが追加されました。
## サマリ {#summary}

View File

@ -5,11 +5,12 @@ toc_title: Playground
# ClickHouse Playground {#clickhouse-playground}
!!! warning "Warning"
This service is deprecated and will be replaced in foreseeable future.
[ClickHouse Playground](https://play.clickhouse.com) では、サーバーやクラスタを設定することなく、即座にクエリを実行して ClickHouse を試すことができます。
いくつかの例のデータセットは、Playground だけでなく、ClickHouse の機能を示すサンプルクエリとして利用可能です. また、 ClickHouse の LTS リリースで試すこともできます。
ClickHouse Playground は、[Yandex.Cloud](https://cloud.yandex.com/)にホストされている m2.small [Managed Service for ClickHouse](https://cloud.yandex.com/services/managed-clickhouse) インスタンス(4 vCPU, 32 GB RAM) で提供されています。クラウドプロバイダの詳細情報については[こちら](../commercial/cloud.md)。
任意の HTTP クライアントを使用してプレイグラウンドへのクエリを作成することができます。例えば[curl](https://curl.haxx.se)、[wget](https://www.gnu.org/software/wget/)、[JDBC](../interfaces/jdbc.md)または[ODBC](../interfaces/odbc.md)ドライバを使用して接続を設定します。
ClickHouse をサポートするソフトウェア製品の詳細情報は[こちら](../interfaces/index.md)をご覧ください。
@ -59,14 +60,3 @@ curl "https://play-api.clickhouse.com:8443/?query=SELECT+'Play+ClickHouse\!';&us
``` bash
clickhouse client --secure -h play-api.clickhouse.com --port 9440 -u playground --password clickhouse -q "SELECT 'Play ClickHouse\!'"
```
## 実装の詳細 {#implementation-details}
ClickHouse PlaygroundのWebインタフェースは、ClickHouse [HTTP API](../interfaces/http.md)を介してリクエストを行います。
Playgroundのバックエンドは、追加のサーバーサイドのアプリケーションを伴わない、ただのClickHouseクラスタです。
上記のように, ClickHouse HTTPSとTCP/TLSのエンドポイントは Playground の一部としても公開されており、
いずれも、上記の保護とよりよいグローバルな接続のためのレイヤを追加するために、[Cloudflare Spectrum](https://www.cloudflare.com/products/cloudflare-spectrum/) を介してプロキシされています。
!!! warning "注意"
いかなる場合においても、インターネットにClickHouseサーバを公開することは **非推奨です**
プライベートネットワーク上でのみ接続を待機し、適切に設定されたファイアウォールによって保護されていることを確認してください。

View File

@ -547,19 +547,19 @@ ClickHouseクラスタは均質なクラスタ(homogenous cluster)です。セ
<perftest_3shards_1replicas>
<shard>
<replica>
<host>example-perftest01j.yandex.ru</host>
<host>example-perftest01j</host>
<port>9000</port>
</replica>
</shard>
<shard>
<replica>
<host>example-perftest02j.yandex.ru</host>
<host>example-perftest02j</host>
<port>9000</port>
</replica>
</shard>
<shard>
<replica>
<host>example-perftest03j.yandex.ru</host>
<host>example-perftest03j</host>
<port>9000</port>
</replica>
</shard>
@ -607,15 +607,15 @@ INSERT INTO tutorial.hits_all SELECT * FROM tutorial.hits_v1;
<perftest_1shards_3replicas>
<shard>
<replica>
<host>example-perftest01j.yandex.ru</host>
<host>example-perftest01j</host>
<port>9000</port>
</replica>
<replica>
<host>example-perftest02j.yandex.ru</host>
<host>example-perftest02j</host>
<port>9000</port>
</replica>
<replica>
<host>example-perftest03j.yandex.ru</host>
<host>example-perftest03j</host>
<port>9000</port>
</replica>
</shard>
@ -637,15 +637,15 @@ ZooKeeperの場所は設定ファイルで指定します:
``` xml
<zookeeper>
<node>
<host>zoo01.yandex.ru</host>
<host>zoo01</host>
<port>2181</port>
</node>
<node>
<host>zoo02.yandex.ru</host>
<host>zoo02</host>
<port>2181</port>
</node>
<node>
<host>zoo03.yandex.ru</host>
<host>zoo03</host>
<port>2181</port>
</node>
</zookeeper>

View File

@ -8,7 +8,7 @@ toc_title: "\u7D71\u5408"
# サードパーティ開発者からの統合ライブラリ {#integration-libraries-from-third-party-developers}
!!! warning "免責事項"
Yandexのは **ない** 以下のツールとライブラリを維持し、その品質を確保するための広範なテストを行っていません。
ClickHouse, Inc.のは **ない** 以下のツールとライブラリを維持し、その品質を確保するための広範なテストを行っていません。
## インフラ製品 {#infrastructure-products}

View File

@ -694,7 +694,7 @@ UTCタイムゾーンまたは地理的位置(たとえば、Africa/Abidjan)のI
**例**
``` xml
<timezone>Europe/Moscow</timezone>
<timezone>Asia/Istanbul</timezone>
```
## tcp_port {#server_configuration_parameters-tcp_port}

View File

@ -40,7 +40,7 @@ ClickHouseにデータを挿入するときは、データの値に応じて、
``` sql
CREATE TABLE dt
(
`timestamp` DateTime('Europe/Moscow'),
`timestamp` DateTime('Asia/Istanbul'),
`event_id` UInt8
)
ENGINE = TinyLog;
@ -61,13 +61,13 @@ SELECT * FROM dt;
└─────────────────────┴──────────┘
```
- Datetimeを整数として挿入する場合は、Unix Timestamp(UTC)として扱われます。 `1546300800` を表す `'2019-01-01 00:00:00'` UTC しかし、 `timestamp` 列は `Europe/Moscow` (UTC+3)タイムゾーンが指定されている場合、文字列として出力すると、値は次のように表示されます `'2019-01-01 03:00:00'`
- 文字列値をdatetimeとして挿入すると、列タイムゾーンにあるものとして扱われます。 `'2019-01-01 00:00:00'` であるとして扱われます `Europe/Moscow` タイムゾーンとして保存 `1546290000`.
- Datetimeを整数として挿入する場合は、Unix Timestamp(UTC)として扱われます。 `1546300800` を表す `'2019-01-01 00:00:00'` UTC しかし、 `timestamp` 列は `Asia/Istanbul` (UTC+3)タイムゾーンが指定されている場合、文字列として出力すると、値は次のように表示されます `'2019-01-01 03:00:00'`
- 文字列値をdatetimeとして挿入すると、列タイムゾーンにあるものとして扱われます。 `'2019-01-01 00:00:00'` であるとして扱われます `Asia/Istanbul` タイムゾーンとして保存 `1546290000`.
**2.** フィルタリング `DateTime`
``` sql
SELECT * FROM dt WHERE timestamp = toDateTime('2019-01-01 00:00:00', 'Europe/Moscow')
SELECT * FROM dt WHERE timestamp = toDateTime('2019-01-01 00:00:00', 'Asia/Istanbul')
```
``` text
@ -91,12 +91,12 @@ SELECT * FROM dt WHERE timestamp = '2019-01-01 00:00:00'
**3.** Aのタイムゾーンの取得 `DateTime`-タイプ列:
``` sql
SELECT toDateTime(now(), 'Europe/Moscow') AS column, toTypeName(column) AS x
SELECT toDateTime(now(), 'Asia/Istanbul') AS column, toTypeName(column) AS x
```
``` text
┌──────────────column─┬─x─────────────────────────┐
│ 2019-10-16 04:12:04 │ DateTime('Europe/Moscow') │
│ 2019-10-16 04:12:04 │ DateTime('Asia/Istanbul') │
└─────────────────────┴───────────────────────────┘
```
@ -105,7 +105,7 @@ SELECT toDateTime(now(), 'Europe/Moscow') AS column, toTypeName(column) AS x
``` sql
SELECT
toDateTime(timestamp, 'Europe/London') as lon_time,
toDateTime(timestamp, 'Europe/Moscow') as mos_time
toDateTime(timestamp, 'Asia/Istanbul') as mos_time
FROM dt
```

View File

@ -28,7 +28,7 @@ DateTime64(precision, [timezone])
``` sql
CREATE TABLE dt
(
`timestamp` DateTime64(3, 'Europe/Moscow'),
`timestamp` DateTime64(3, 'Asia/Istanbul'),
`event_id` UInt8
)
ENGINE = TinyLog
@ -49,13 +49,13 @@ SELECT * FROM dt
└─────────────────────────┴──────────┘
```
- Datetimeを整数として挿入する場合、適切にスケーリングされたUnixタイムスタンプ(UTC)として扱われます。 `1546300800000` 精度3でを表します `'2019-01-01 00:00:00'` UTC しかし、 `timestamp` 列は `Europe/Moscow` (UTC+3)タイムゾーンが指定されている場合、文字列として出力すると、値は次のように表示されます `'2019-01-01 03:00:00'`
- 文字列値をdatetimeとして挿入すると、列タイムゾーンにあるものとして扱われます。 `'2019-01-01 00:00:00'` であるとして扱われます `Europe/Moscow` タイムゾーンとして保存 `1546290000000`.
- Datetimeを整数として挿入する場合、適切にスケーリングされたUnixタイムスタンプ(UTC)として扱われます。 `1546300800000` 精度3でを表します `'2019-01-01 00:00:00'` UTC しかし、 `timestamp` 列は `Asia/Istanbul` (UTC+3)タイムゾーンが指定されている場合、文字列として出力すると、値は次のように表示されます `'2019-01-01 03:00:00'`
- 文字列値をdatetimeとして挿入すると、列タイムゾーンにあるものとして扱われます。 `'2019-01-01 00:00:00'` であるとして扱われます `Asia/Istanbul` タイムゾーンとして保存 `1546290000000`.
**2.** フィルタリング `DateTime64`
``` sql
SELECT * FROM dt WHERE timestamp = toDateTime64('2019-01-01 00:00:00', 3, 'Europe/Moscow')
SELECT * FROM dt WHERE timestamp = toDateTime64('2019-01-01 00:00:00', 3, 'Asia/Istanbul')
```
``` text
@ -69,12 +69,12 @@ SELECT * FROM dt WHERE timestamp = toDateTime64('2019-01-01 00:00:00', 3, 'Europ
**3.** Aのタイムゾーンの取得 `DateTime64`-タイプ値:
``` sql
SELECT toDateTime64(now(), 3, 'Europe/Moscow') AS column, toTypeName(column) AS x
SELECT toDateTime64(now(), 3, 'Asia/Istanbul') AS column, toTypeName(column) AS x
```
``` text
┌──────────────────column─┬─x──────────────────────────────┐
│ 2019-10-16 04:12:04.000 │ DateTime64(3, 'Europe/Moscow') │
│ 2019-10-16 04:12:04.000 │ DateTime64(3, 'Asia/Istanbul') │
└─────────────────────────┴────────────────────────────────┘
```
@ -83,7 +83,7 @@ SELECT toDateTime64(now(), 3, 'Europe/Moscow') AS column, toTypeName(column) AS
``` sql
SELECT
toDateTime64(timestamp, 3, 'Europe/London') as lon_time,
toDateTime64(timestamp, 3, 'Europe/Moscow') as mos_time
toDateTime64(timestamp, 3, 'Asia/Istanbul') as mos_time
FROM dt
```

View File

@ -460,7 +460,7 @@ AS parseDateTimeBestEffort;
クエリ:
``` sql
SELECT parseDateTimeBestEffort('Sat, 18 Aug 2018 07:22:16 GMT', 'Europe/Moscow')
SELECT parseDateTimeBestEffort('Sat, 18 Aug 2018 07:22:16 GMT', 'Asia/Istanbul')
AS parseDateTimeBestEffort
```

Binary file not shown.

Before

Width:  |  Height:  |  Size: 43 KiB

View File

@ -1 +0,0 @@
<svg xmlns="http://www.w3.org/2000/svg" width="54" height="48" markdown="1" viewBox="0 0 9 8"><style>.o{fill:#fc0}.r{fill:red}</style><path d="M0,7 h1 v1 h-1 z" class="r"/><path d="M0,0 h1 v7 h-1 z" class="o"/><path d="M2,0 h1 v8 h-1 z" class="o"/><path d="M4,0 h1 v8 h-1 z" class="o"/><path d="M6,0 h1 v8 h-1 z" class="o"/><path d="M8,3.25 h1 v1.5 h-1 z" class="o"/></svg>

Before

Width:  |  Height:  |  Size: 373 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 26 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 38 KiB

View File

@ -1,94 +0,0 @@
---
toc_priority: 0
toc_title: 목차
---
# ClickHouse란? {#what-is-clickhouse}
ClickHouse® 는 query의 온라인 분석 처리(OLAP)를 위한 열 지향(column-oriented) 데이터베이스 관리 시스템(DBMS)입니다.
"보통의" 행 지향(row-oriented) DMBS에서는 데이터가 다음과 같은 순서로 저장됩니다.
| row | WatchID | JavaEnable | Title | GoodEvent | EventTime |
|-----|-------------|------------|--------------------|-----------|---------------------|
| #0 | 89354350662 | 1 | Investor Relations | 1 | 2016-05-18 05:19:20 |
| #1 | 90329509958 | 0 | Contact us | 1 | 2016-05-18 08:10:20 |
| #2 | 89953706054 | 1 | Mission | 1 | 2016-05-18 07:38:00 |
| #N | … | … | … | … | … |
즉, 행과 관련된 모든 값들은 물리적으로 나란히 저장됩니다.
행 지향(row-oriented) DMBS의 예시로는 MySQL, Postgres, 그리고 MS SQL 서버 등이 있습니다.
열 지향 (column-oriented) DBMS에서는 데이터가 아래와 같은 방식으로 저장됩니다:
| Row: | #0 | #1 | #2 | #N |
|-------------|---------------------|---------------------|---------------------|-----|
| WatchID: | 89354350662 | 90329509958 | 89953706054 | … |
| JavaEnable: | 1 | 0 | 1 | … |
| Title: | Investor Relations | Contact us | Mission | … |
| GoodEvent: | 1 | 1 | 1 | … |
| EventTime: | 2016-05-18 05:19:20 | 2016-05-18 08:10:20 | 2016-05-18 07:38:00 | … |
이 예에서는 데이터가 정렬된 순서만을 보여줍니다. 다른 열의 값들은 서로 분리되어 저장되고, 같은 열의 정보들은 함께 저장됩니다.
열 지향(column-oriented) DBMS 의 종류는 Vertica, Paraccel (Actian Matrix and Amazon Redshift), Sybase IQ, Exasol, Infobright, InfiniDB, MonetDB (VectorWise and Actian Vector), LucidDB, SAP HANA, Google Dremel, Google PowerDrill, Druid, 그리고 kdb+ 등이 있습니다.
데이터를 저장하기 위한 서로 다른 순서는 다른 시나리오에 더 적합합니다. 데이터 접근 시나리오는 쿼리가 수행되는 빈도, 비율 및 비율을 나타내거나, 각 쿼리 유형(행, 열 및 바이트)에 대해 읽은 데이터의 양 데이터 읽기와 업데이트 사이의 관계, 데이터의 작업 크기 및 로컬에서 사용되는 방법 트랜잭션이 사용되는지 여부, 트랜잭션이 얼마나 격리되어 있는지, 데이터 복제 및 논리적 무결성에 대한 요구 사항, 각 쿼리 유형에 대한 대기 시간 및 처리량 요구 사항 등이 있습니다.
시스템의 부하가 높을수록 사용 시나리오의 요구 사항에 맞게 시스템 설정을 사용자 지정하는 것이 더 중요하며 이 사용자 지정은 더욱 세분화됩니다. 상당히 다른 시나리오에 똑같이 적합한 시스템은 없습니다. 만약 높은 부하에서 시스템이 넓은 시나리오 집합에 대해 적응한다면 시스템은 모든 시나리오를 모두 제대로 처리하지 못하거나 가능한 시나리오 중 하나 또는 몇 개에 대해서만 잘 작동할 것입니다.
## OLAP 시나리오의 중요 속성들 {#key-properties-of-olap-scenario}
- 요청(request)의 대부분은 읽기 접근에 관한 것입니다.
- 데이터는 단일 행이 아니라 상당히 큰 일괄 처리(\> 1000개 행)로 업데이트됩니다. 또는 전혀 업데이트되지 않습니다.
- 데이터는 DB에 추가되지만 수정되지는 않습니다.
- 읽기의 경우 DB에서 상당히 많은 수의 행이 추출되지만 열은 일부만 추출됩니다.
- 테이블은 "넓습니다". 이는 열의 수가 많다는 것을 의미합니다.
- 쿼리는 상대적으로 드뭅니다(일반적으로 서버당 수백 또는 초당 쿼리 미만).
- 간단한 쿼리의 경우 약 50ms의 대기 시간이 허용됩니다.
- 열 값은 숫자와 짧은 문자열(예: URL당 60바이트)과 같이 상당히 작습니다
- 단일 쿼리를 처리할 때 높은 처리량이 필요합니다(서버당 초당 최대 수십억 행).
- 트랜잭션이 필요하지 않습니다.
- 데이터 일관성에 대한 요구 사항이 낮습니다.
- 쿼리당 하나의 큰 테이블이 존재하고 하나를 제외한 모든 테이블은 작습니다.
- 쿼리 결과가 원본 데이터보다 훨씬 작습니다. 즉, 데이터가 필터링되거나 집계되므로 결과가 단일 서버의 RAM에 꼭 들어맞습니다.
OLAP 시나리오가 다른 일반적인 시나리오(OLTP 또는 키-값 액세스와 같은)와 매우 다르다는 것을 쉽게 알 수 있습니다. 따라서 적절한 성능을 얻으려면 분석 쿼리를 처리하기 위해 OLTP 또는 키-값 DB를 사용하는 것은 의미가 없습니다. 예를 들어 분석에 MongoDB나 Redis를 사용하려고 하면 OLAP 데이터베이스에 비해 성능이 매우 저하됩니다.
## 왜 열 지향 데이터베이스가 OLAP 시나리오에 적합한가{#why-column-oriented-databases-work-better-in-the-olap-scenario}
열 지향(column-oriented) 데이터베이스는 OLAP 시나리오에 더 적합합니다. 대부분의 쿼리를 처리하는 데 있어서 행 지향(row-oriented) 데이터베이스보다 100배 이상 빠릅니다. 그 이유는 아래에 자세히 설명되어 있지만 사실은 시각적으로 더 쉽게 설명할 수 있습니다.
**행 지향 DBMS**
![Row-oriented](images/row-oriented.gif#)
**열 지향 DBMS**
![Column-oriented](images/column-oriented.gif#)
차이가 보이시나요?
### 입출력 {#inputoutput}
1. 분석 쿼리의 경우 적은 수의 테이블 열만 읽어야 합니다. 열 지향 데이터베이스에서는 필요한 데이터만 읽을 수 있습니다. 예를 들어 100개 중 5개의 열이 필요한 경우 I/O가 20배 감소할 것으로 예상할 수 있습니다.
2. 데이터는 패킷으로 읽히므로 압축하기가 더 쉽습니다. 열의 데이터도 압축하기 쉽습니다. 이것은 I/O의 볼륨을 더욱 감소시킵니다.
3. 감소된 I/O로 인해 시스템 캐시에 더 많은 데이터가 들어갑니다.
예를 들어, "각 광고 플랫폼에 대한 레코드 수 계산" 쿼리는 압축되지 않은 1바이트를 차지하는 하나의 "광고 플랫폼 ID" 열을 읽어야 합니다. 트래픽의 대부분이 광고 플랫폼에서 발생하지 않은 경우 이 열의 최소 10배 압축을 기대할 수 있습니다. 빠른 압축 알고리즘을 사용하면 초당 최소 몇 기가바이트의 압축되지 않은 데이터의 속도로 데이터 압축 해제가 가능합니다. 즉, 이 쿼리는 단일 서버에서 초당 약 수십억 행의 속도로 처리될 수 있습니다. 이 속도는 정말 실제로 달성됩니다.
### CPU {#cpu}
쿼리를 수행하려면 많은 행을 처리해야 하므로 별도의 행이 아닌 전체 벡터에 대한 모든 연산을 디스패치하거나 쿼리 엔진을 구현하여 디스패치 비용이 거의 들지 않습니다. 반쯤 괜찮은 디스크 하위 시스템에서 이렇게 하지 않으면 쿼리 인터프리터가 불가피하게 CPU를 정지시킵니다. 데이터를 열에 저장하고 가능한 경우 열별로 처리하는 것이 좋습니다.
이를 수행하기위한 두가지 방법이 있습니다.
1. 벡터 엔진. 모든 연산은 별도의 값 대신 벡터에 대해 작성됩니다. 즉, 작업을 자주 호출할 필요가 없으며 파견 비용도 무시할 수 있습니다. 작업 코드에는 최적화된 내부 주기가 포함되어 있습니다.
2. 코드 생성. 쿼리에 대해 생성된 코드에는 모든 간접 호출이 있습니다.
이것은 단순한 쿼리를 실행할 때 의미가 없기 때문에 "일반" 데이터베이스에서는 수행되지 않습니다. 그러나 예외가 있습니다. 예를 들어 MemSQL은 코드 생성을 사용하여 SQL 쿼리를 처리할 때 대기 시간을 줄입니다. (비교되게, 분석 DBMS는 대기 시간이 아닌 처리량 최적화가 필요합니다.)
CPU 효율성을 위해 쿼리 언어는 선언적(SQL 또는 MDX)이거나 최소한 벡터(J, K)여야 합니다. 쿼리는 최적화를 허용하는 암시적 루프만 포함해야 합니다.
{## [원문](https://clickhouse.com/docs/en/) ##}

View File

@ -53,15 +53,15 @@ WHERE table = 'visits'
```
``` text
┌─partition─┬─name───────────┬─active─┐
│ 201901 │ 201901_1_3_1 │ 0 │
│ 201901 │ 201901_1_9_2 │ 1 │
│ 201901 │ 201901_8_8_0 │ 0 │
│ 201901 │ 201901_9_9_0 │ 0 │
│ 201902 │ 201902_4_6_1 │ 1 │
│ 201902 │ 201902_10_10_0 │ 1 │
│ 201902 │ 201902_11_11_0 │ 1 │
└───────────┴────────────────┴────────┘
┌─partition─┬─name──────────────┬─active─┐
│ 201901 │ 201901_1_3_1 │ 0 │
│ 201901 │ 201901_1_9_2_11 │ 1 │
│ 201901 │ 201901_8_8_0 │ 0 │
│ 201901 │ 201901_9_9_0 │ 0 │
│ 201902 │ 201902_4_6_1_11 │ 1 │
│ 201902 │ 201902_10_10_0_11 │ 1 │
│ 201902 │ 201902_11_11_0_11 │ 1 │
└───────────┴───────────────────┴────────┘
```
Столбец `partition` содержит имена всех партиций таблицы. Таблица `visits` из нашего примера содержит две партиции: `201901` и `201902`. Используйте значения из этого столбца в запросах [ALTER … PARTITION](../../../sql-reference/statements/alter/partition.md).
@ -70,12 +70,13 @@ WHERE table = 'visits'
Столбец `active` отображает состояние куска. `1` означает, что кусок активен; `0` неактивен. К неактивным можно отнести куски, оставшиеся после слияния данных. Поврежденные куски также отображаются как неактивные. Неактивные куски удаляются приблизительно через 10 минут после того, как было выполнено слияние.
Рассмотрим детальнее имя первого куска `201901_1_3_1`:
Рассмотрим детальнее имя куска `201901_1_9_2_11`:
- `201901` имя партиции;
- `1` минимальный номер блока данных;
- `3` максимальный номер блока данных;
- `1` уровень куска (глубина дерева слияний, которыми этот кусок образован).
- `9` максимальный номер блока данных;
- `2` уровень куска (глубина дерева слияний, которыми этот кусок образован).
- `11` - версия мутации (если парт мутировал)
!!! info "Info"
Названия кусков для таблиц старого типа образуются следующим образом: `20190117_20190123_2_2_0` (минимальная дата _ максимальная дата _ номер минимального блока _ номер максимального блока _ уровень).
@ -89,16 +90,16 @@ OPTIMIZE TABLE visits PARTITION 201902;
```
``` text
┌─partition─┬─name───────────┬─active─┐
│ 201901 │ 201901_1_3_1 │ 0 │
│ 201901 │ 201901_1_9_2 │ 1 │
│ 201901 │ 201901_8_8_0 │ 0 │
│ 201901 │ 201901_9_9_0 │ 0 │
│ 201902 │ 201902_4_6_1 │ 0 │
│ 201902 │ 201902_4_11_2 │ 1 │
│ 201902 │ 201902_10_10_0 │ 0 │
│ 201902 │ 201902_11_11_0 │ 0 │
└───────────┴────────────────┴────────┘
┌─partition─┬─name─────────────┬─active─┐
│ 201901 │ 201901_1_3_1 │ 0 │
│ 201901 │ 201901_1_9_2_11 │ 1 │
│ 201901 │ 201901_8_8_0 │ 0 │
│ 201901 │ 201901_9_9_0 │ 0 │
│ 201902 │ 201902_4_6_1 │ 0 │
│ 201902 │ 201902_4_11_2_11 │ 1 │
│ 201902 │ 201902_10_10_0 │ 0 │
│ 201902 │ 201902_11_11_0 │ 0 │
└───────────┴──────────────────┴────────┘
```
Неактивные куски будут удалены примерно через 10 минут после слияния.
@ -109,12 +110,12 @@ OPTIMIZE TABLE visits PARTITION 201902;
/var/lib/clickhouse/data/default/visits$ ls -l
total 40
drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 1 16:48 201901_1_3_1
drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 5 16:17 201901_1_9_2
drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 5 16:17 201901_1_9_2_11
drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 5 15:52 201901_8_8_0
drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 5 15:52 201901_9_9_0
drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 5 16:17 201902_10_10_0
drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 5 16:17 201902_11_11_0
drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 5 16:19 201902_4_11_2
drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 5 16:19 201902_4_11_2_11
drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 5 12:09 201902_4_6_1
drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 1 16:48 detached
```

View File

@ -128,7 +128,7 @@ logs - имя кластера в конфигурационном файле с
Беспокоиться о схеме шардирования имеет смысл в следующих случаях:
- используются запросы, требующие соединение данных (IN, JOIN) по определённому ключу - тогда если данные шардированы по этому ключу, то можно использовать локальные IN, JOIN вместо GLOBAL IN, GLOBAL JOIN, что кардинально более эффективно.
- используется большое количество серверов (сотни и больше) и большое количество маленьких запросов (запросы отдельных клиентов - сайтов, рекламодателей, партнёров) - тогда, для того, чтобы маленькие запросы не затрагивали весь кластер, имеет смысл располагать данные одного клиента на одном шарде, или (вариант, который используется в Яндекс.Метрике) сделать двухуровневое шардирование: разбить весь кластер на «слои», где слой может состоять из нескольких шардов; данные для одного клиента располагаются на одном слое, но в один слой можно по мере необходимости добавлять шарды, в рамках которых данные распределены произвольным образом; создаются распределённые таблицы на каждый слой и одна общая распределённая таблица для глобальных запросов.
- используется большое количество серверов (сотни и больше) и большое количество маленьких запросов (запросы отдельных клиентов - сайтов, рекламодателей, партнёров) - тогда, для того, чтобы маленькие запросы не затрагивали весь кластер, имеет смысл располагать данные одного клиента на одном шарде, или сделать двухуровневое шардирование: разбить весь кластер на «слои», где слой может состоять из нескольких шардов; данные для одного клиента располагаются на одном слое, но в один слой можно по мере необходимости добавлять шарды, в рамках которых данные распределены произвольным образом; создаются распределённые таблицы на каждый слой и одна общая распределённая таблица для глобальных запросов.
Запись данных осуществляется полностью асинхронно. При вставке в таблицу, блок данных сначала записывается в файловую систему. Затем, в фоновом режиме отправляются на удалённые серверы при первой возможности. Период отправки регулируется настройками [distributed_directory_monitor_sleep_time_ms](../../../operations/settings/settings.md#distributed_directory_monitor_sleep_time_ms) и [distributed_directory_monitor_max_sleep_time_ms](../../../operations/settings/settings.md#distributed_directory_monitor_max_sleep_time_ms). Движок таблиц `Distributed` отправляет каждый файл со вставленными данными отдельно, но можно включить пакетную отправку данных настройкой [distributed_directory_monitor_batch_inserts](../../../operations/settings/settings.md#distributed_directory_monitor_batch_inserts). Эта настройка улучшает производительность кластера за счет более оптимального использования ресурсов сервера-отправителя и сети. Необходимо проверять, что данные отправлены успешно, для этого проверьте список файлов (данных, ожидающих отправки) в каталоге таблицы `/var/lib/clickhouse/data/database/table/`. Количество потоков для выполнения фоновых задач можно задать с помощью настройки [background_distributed_schedule_pool_size](../../../operations/settings/settings.md#background_distributed_schedule_pool_size).

View File

@ -6,7 +6,7 @@ toc_priority: 110
# Почему бы не использовать системы типа MapReduce? {#why-not-use-something-like-mapreduce}
Системами типа MapReduce будем называть системы распределённых вычислений, в которых операция свёртки реализована на основе распределённой сортировки. Наиболее распространённое решение с открытым кодом в данном классе — [Apache Hadoop](http://hadoop.apache.org). Яндекс пользуется собственным решением — YT.
Системами типа MapReduce будем называть системы распределённых вычислений, в которых операция свёртки реализована на основе распределённой сортировки. Наиболее распространённое решение с открытым кодом в данном классе — [Apache Hadoop](http://hadoop.apache.org). В крупных IT компаниях вроде Google или Яндекс часто используются собственные закрытые решения.
Такие системы не подходят для онлайн запросов в силу слишком большой задержки. То есть не могут быть использованы в качестве бэкенда для веб-интерфейса. Также эти системы не подходят для обновления данных в реальном времени. Распределённая сортировка является не оптимальным способом для выполнения операции свёртки в случае запросов, выполняющихся в режиме онлайн, потому что результат выполнения операции и все промежуточные результаты (если такие есть) помещаются в оперативную память на одном сервере. В таком случае оптимальным способом выполнения операции свёртки является хеш-таблица. Частым способом оптимизации "map-reduce" задач является предагрегация (частичная свёртка) с использованием хеш-таблицы в оперативной памяти. Пользователь делает эту оптимизацию в ручном режиме. Распределённая сортировка — основная причина тормозов при выполнении несложных задач типа "map-reduce".

View File

@ -380,7 +380,7 @@ Q3: 0.051 sec.
Q4: 0.072 sec.
В этом случае, время выполнения запросов определяется в первую очередь сетевыми задержками.
Мы выполняли запросы с помощью клиента, расположенного в дата-центре Яндекса в Мянтсяля (Финляндия), на кластер в России, что добавляет порядка 20 мс задержки.
Мы выполняли запросы с помощью клиента, расположенного в другом дата-центре, не там где кластер, что добавляет порядка 20 мс задержки.
## Резюме {#reziume}

View File

@ -5,11 +5,12 @@ toc_title: Playground
# ClickHouse Playground {#clickhouse-playground}
!!! warning "Warning"
This service is deprecated and will be replaced in foreseeable future.
[ClickHouse Playground](https://play.clickhouse.com) позволяет пользователям экспериментировать с ClickHouse, мгновенно выполняя запросы без настройки своего сервера или кластера.
В Playground доступны несколько тестовых массивов данных, а также примеры запросов, которые показывают возможности ClickHouse. Кроме того, вы можете выбрать LTS релиз ClickHouse, который хотите протестировать.
ClickHouse Playground дает возможность поработать с [Managed Service for ClickHouse](https://cloud.yandex.com/services/managed-clickhouse) в конфигурации m2.small (4 vCPU, 32 ГБ ОЗУ), которую предосталяет [Яндекс.Облако](https://cloud.yandex.com/). Дополнительную информацию об облачных провайдерах читайте в разделе [Поставщики облачных услуг ClickHouse](../commercial/cloud.md).
Вы можете отправлять запросы к Playground с помощью любого HTTP-клиента, например [curl](https://curl.haxx.se) или [wget](https://www.gnu.org/software/wget/), также можно установить соединение с помощью драйверов [JDBC](../interfaces/jdbc.md) или [ODBC](../interfaces/odbc.md). Более подробная информация о программных продуктах, поддерживающих ClickHouse, доступна [здесь](../interfaces/index.md).
## Параметры доступа {#credentials}
@ -54,11 +55,3 @@ curl "https://play-api.clickhouse.com:8443/?query=SELECT+'Play+ClickHouse\!';&us
``` bash
clickhouse client --secure -h play-api.clickhouse.com --port 9440 -u playground --password clickhouse -q "SELECT 'Play ClickHouse\!'"
```
## Детали реализации {#implementation-details}
Веб-интерфейс ClickHouse Playground выполняет запросы через ClickHouse [HTTP API](../interfaces/http.md).
Бэкэнд Playground - это кластер ClickHouse без дополнительных серверных приложений. Как упоминалось выше, способы подключения по HTTPS и TCP/TLS общедоступны как часть Playground. Они проксируются через [Cloudflare Spectrum](https://www.cloudflare.com/products/cloudflare-spectrum/) для добавления дополнительного уровня защиты и улучшенного глобального подключения.
!!! warning "Предупреждение"
Открывать сервер ClickHouse для публичного доступа в любой другой ситуации **настоятельно не рекомендуется**. Убедитесь, что он настроен только на частную сеть и защищен брандмауэром.

View File

@ -6,7 +6,7 @@ toc_title: "Библиотеки для интеграции от сторонн
# Библиотеки для интеграции от сторонних разработчиков {#biblioteki-dlia-integratsii-ot-storonnikh-razrabotchikov}
!!! warning "Disclaimer"
Яндекс не занимается поддержкой перечисленных ниже инструментов и библиотек и не проводит тщательного тестирования для проверки их качества.
ClickHouse, Inc. не занимается поддержкой перечисленных ниже инструментов и библиотек и не проводит тщательного тестирования для проверки их качества.
## Инфраструктурные продукты {#infrastrukturnye-produkty}

View File

@ -0,0 +1,132 @@
---
toc_priority: 69
toc_title: "Именованные соединения"
---
# Хранение реквизитов для подключения к внешним источникам в конфигурационных файлах {#named-collections}
Реквизиты для подключения к внешним источникам (словарям, таблицам, табличным функциям) можно сохранить
в конфигурационных файлах и таким образом упростить создание объектов и скрыть реквизиты (пароли)
от пользователей, имеющих только SQL доступ.
Параметры можно задать в XML `<format>CSV</format>` и переопределить в SQL `, format = 'TSV'`.
При использовании именованных соединений, параметры в SQL задаются в формате `ключ` = `значение`: `compression_method = 'gzip'`.
Именованные соединения хранятся в файле `config.xml` сервера ClickHouse в секции `<named_collections>` и применяются при старте ClickHouse.
Пример конфигурации:
```xml
$ cat /etc/clickhouse-server/config.d/named_collections.xml
<clickhouse>
<named_collections>
...
</named_collections>
</clickhouse>
```
## Именованные соединения для доступа к S3.
Описание параметров смотри [Табличная Функция S3](../sql-reference/table-functions/s3.md).
Пример конфигурации:
```xml
<clickhouse>
<named_collections>
<s3_mydata>
<access_key_id>AKIAIOSFODNN7EXAMPLE</access_key_id>
<secret_access_key> wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY</secret_access_key>
<format>CSV</format>
</s3_mydata>
</named_collections>
</clickhouse>
```
### Пример использования именованных соединений с функцией s3
```sql
INSERT INTO FUNCTION s3(s3_mydata, url = 'https://s3.us-east-1.amazonaws.com/yourbucket/mydata/test_file.tsv.gz',
format = 'TSV', structure = 'number UInt64', compression_method = 'gzip')
SELECT * FROM numbers(10000);
SELECT count()
FROM s3(s3_mydata, url = 'https://s3.us-east-1.amazonaws.com/yourbucket/mydata/test_file.tsv.gz')
┌─count()─┐
│ 10000 │
└─────────┘
1 rows in set. Elapsed: 0.279 sec. Processed 10.00 thousand rows, 90.00 KB (35.78 thousand rows/s., 322.02 KB/s.)
```
### Пример использования именованных соединений с таблицей S3
```sql
CREATE TABLE s3_engine_table (number Int64)
ENGINE=S3(s3_mydata, url='https://s3.us-east-1.amazonaws.com/yourbucket/mydata/test_file.tsv.gz', format = 'TSV')
SETTINGS input_format_with_names_use_header = 0;
SELECT * FROM s3_engine_table LIMIT 3;
┌─number─┐
│ 0 │
│ 1 │
│ 2 │
└────────┘
```
## Пример использования именованных соединений с базой данных MySQL.
Описание параметров смотри [mysql](../sql-reference/table-functions/mysql.md).
Пример конфигурации:
```xml
<clickhouse>
<named_collections>
<mymysql>
<user>myuser</user>
<password>mypass</password>
<host>127.0.0.1</host>
<port>3306</port>
<database>test</database>
<connection_pool_size>8</connection_pool_size>
<on_duplicate_clause>1</on_duplicate_clause>
<replace_query>1</replace_query>
</mymysql>
</named_collections>
</clickhouse>
```
### Пример использования именованных соединений с табличной функцией mysql
```sql
SELECT count() FROM mysql(mymysql, table = 'test');
┌─count()─┐
│ 3 │
└─────────┘
```
### Пример использования именованных таблицей с движком mysql
```sql
CREATE TABLE mytable(A Int64) ENGINE = MySQL(mymysql, table = 'test', connection_pool_size=3, replace_query=0);
SELECT count() FROM mytable;
┌─count()─┐
│ 3 │
└─────────┘
```
### Пример использования именованных с внешним словарем с источником mysql
```sql
CREATE DICTIONARY dict (A Int64, B String)
PRIMARY KEY A
SOURCE(MYSQL(NAME mymysql TABLE 'source'))
LIFETIME(MIN 1 MAX 2)
LAYOUT(HASHED());
SELECT dictGet('dict', 'B', 2);
┌─dictGet('dict', 'B', 2)─┐
│ two │
└─────────────────────────┘
```

Some files were not shown because too many files have changed in this diff Show More