Merge branch 'master' of github.com:ClickHouse/ClickHouse into BLAKE3
Commit: 44591b79ff
.gitmodules (vendored): 6 changed lines
@@ -217,6 +217,9 @@
[submodule "contrib/yaml-cpp"]
path = contrib/yaml-cpp
url = https://github.com/ClickHouse-Extras/yaml-cpp.git
[submodule "contrib/cld2"]
path = contrib/cld2
url = https://github.com/ClickHouse-Extras/cld2.git
[submodule "contrib/libstemmer_c"]
path = contrib/libstemmer_c
url = https://github.com/ClickHouse-Extras/libstemmer_c.git
@@ -247,6 +250,9 @@
[submodule "contrib/sysroot"]
path = contrib/sysroot
url = https://github.com/ClickHouse-Extras/sysroot.git
[submodule "contrib/nlp-data"]
path = contrib/nlp-data
url = https://github.com/ClickHouse-Extras/nlp-data.git
[submodule "contrib/hive-metastore"]
path = contrib/hive-metastore
url = https://github.com/ClickHouse-Extras/hive-metastore
LICENSE: 4 changed lines
@@ -1,4 +1,4 @@
Copyright 2016-2021 ClickHouse, Inc.
Copyright 2016-2022 ClickHouse, Inc.

Apache License
Version 2.0, January 2004
@@ -188,7 +188,7 @@ Copyright 2016-2021 ClickHouse, Inc.
same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright 2016-2021 ClickHouse, Inc.
Copyright 2016-2022 ClickHouse, Inc.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
contrib/CMakeLists.txt (vendored): 2 changed lines
@@ -141,6 +141,8 @@ if (ENABLE_NLP)
add_contrib (libstemmer-c-cmake libstemmer_c)
add_contrib (wordnet-blast-cmake wordnet-blast)
add_contrib (lemmagen-c-cmake lemmagen-c)
add_contrib (nlp-data-cmake nlp-data)
add_contrib (cld2-cmake cld2)
endif()

add_contrib (sqlite-cmake sqlite-amalgamation)
contrib/cld2 (new vendored submodule): 1 changed line
@@ -0,0 +1 @@
Subproject commit bc6d493a2f64ed1fc1c4c4b4294a542a04e04217
contrib/cld2-cmake/CMakeLists.txt (new file): 33 lines
@@ -0,0 +1,33 @@
set (LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/cld2")

set (SRCS
"${LIBRARY_DIR}/internal/cldutil.cc"
"${LIBRARY_DIR}/internal/compact_lang_det.cc"
"${LIBRARY_DIR}/internal/cldutil_shared.cc"
"${LIBRARY_DIR}/internal/compact_lang_det_hint_code.cc"
"${LIBRARY_DIR}/internal/compact_lang_det_impl.cc"
"${LIBRARY_DIR}/internal/debug.cc"
"${LIBRARY_DIR}/internal/fixunicodevalue.cc"
"${LIBRARY_DIR}/internal/generated_entities.cc"
"${LIBRARY_DIR}/internal/generated_language.cc"
"${LIBRARY_DIR}/internal/generated_ulscript.cc"
"${LIBRARY_DIR}/internal/getonescriptspan.cc"
"${LIBRARY_DIR}/internal/lang_script.cc"
"${LIBRARY_DIR}/internal/offsetmap.cc"
"${LIBRARY_DIR}/internal/scoreonescriptspan.cc"
"${LIBRARY_DIR}/internal/tote.cc"
"${LIBRARY_DIR}/internal/utf8statetable.cc"
"${LIBRARY_DIR}/internal/cld_generated_cjk_uni_prop_80.cc"
"${LIBRARY_DIR}/internal/cld2_generated_cjk_compatible.cc"
"${LIBRARY_DIR}/internal/cld_generated_cjk_delta_bi_4.cc"
"${LIBRARY_DIR}/internal/generated_distinct_bi_0.cc"
"${LIBRARY_DIR}/internal/cld2_generated_quadchrome_2.cc"
"${LIBRARY_DIR}/internal/cld2_generated_deltaoctachrome.cc"
"${LIBRARY_DIR}/internal/cld2_generated_distinctoctachrome.cc"
"${LIBRARY_DIR}/internal/cld_generated_score_quad_octa_2.cc"
)
add_library(_cld2 ${SRCS})
set_property(TARGET _cld2 PROPERTY POSITION_INDEPENDENT_CODE ON)
target_compile_options (_cld2 PRIVATE -Wno-reserved-id-macro -Wno-c++11-narrowing)
target_include_directories(_cld2 SYSTEM BEFORE PUBLIC "${LIBRARY_DIR}/public")
add_library(ch_contrib::cld2 ALIAS _cld2)
contrib/nlp-data (new vendored submodule): 1 changed line
@@ -0,0 +1 @@
Subproject commit 5591f91f5e748cba8fb9ef81564176feae774853
contrib/nlp-data-cmake/CMakeLists.txt (new file): 15 lines
@@ -0,0 +1,15 @@
include(${ClickHouse_SOURCE_DIR}/cmake/embed_binary.cmake)

set(LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/nlp-data")

add_library (_nlp_data INTERFACE)

clickhouse_embed_binaries(
TARGET nlp_dictionaries
RESOURCE_DIR "${LIBRARY_DIR}"
RESOURCES charset.zst tonality_ru.zst programming.zst
)

add_dependencies(_nlp_data nlp_dictionaries)
target_link_libraries(_nlp_data INTERFACE "-Wl,${WHOLE_ARCHIVE} $<TARGET_FILE:nlp_dictionaries> -Wl,${NO_WHOLE_ARCHIVE}")
add_library(ch_contrib::nlp_data ALIAS _nlp_data)
@@ -65,7 +65,12 @@ do
# check if variable not empty
[ -z "$dir" ] && continue
# ensure directories exist
if ! mkdir -p "$dir"; then
if [ "$DO_CHOWN" = "1" ]; then
mkdir="mkdir"
else
mkdir="$gosu mkdir"
fi
if ! $mkdir -p "$dir"; then
echo "Couldn't create necessary directory: $dir"
exit 1
fi
@@ -78,15 +78,21 @@ When working with the `MaterializedMySQL` database engine, [ReplacingMergeTree](
| DATE, NEWDATE | [Date](../../sql-reference/data-types/date.md) |
| DATETIME, TIMESTAMP | [DateTime](../../sql-reference/data-types/datetime.md) |
| DATETIME2, TIMESTAMP2 | [DateTime64](../../sql-reference/data-types/datetime64.md) |
| YEAR | [UInt16](../../sql-reference/data-types/int-uint.md) |
| TIME | [Int64](../../sql-reference/data-types/int-uint.md) |
| ENUM | [Enum](../../sql-reference/data-types/enum.md) |
| STRING | [String](../../sql-reference/data-types/string.md) |
| VARCHAR, VAR_STRING | [String](../../sql-reference/data-types/string.md) |
| BLOB | [String](../../sql-reference/data-types/string.md) |
| GEOMETRY | [String](../../sql-reference/data-types/string.md) |
| BINARY | [FixedString](../../sql-reference/data-types/fixedstring.md) |
| BIT | [UInt64](../../sql-reference/data-types/int-uint.md) |
| SET | [UInt64](../../sql-reference/data-types/int-uint.md) |

[Nullable](../../sql-reference/data-types/nullable.md) is supported.

The data of TIME type in MySQL is converted to microseconds in ClickHouse.

Other types are not supported. If MySQL table contains a column of such type, ClickHouse throws exception "Unhandled data type" and stops replication.

## Specifics and Recommendations {#specifics-and-recommendations}
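Editor's note (not part of the diff): the TIME mapping in the table above stores the value as a signed count of microseconds in an Int64 column. A minimal sketch of that arithmetic, mirroring the decoder added to MySQLReplication.cpp later in this commit (the function name here is illustrative):

    #include <cstdint>
    #include <cstdlib>

    // Convert a broken-down MySQL TIME value into the signed microsecond count
    // that the MaterializedMySQL engine stores in an Int64 column.
    int64_t mysqlTimeToMicroseconds(int64_t hours, int64_t minutes, int64_t seconds,
                                    int64_t fractional_microseconds, bool negative)
    {
        int64_t value = (hours * 3600 + minutes * 60 + seconds) * 1000000
            + std::llabs(fractional_microseconds);
        return negative ? -value : value;
    }

For example, MySQL's minimum TIME value '-838:59:59' maps to -(838 * 3600 + 59 * 60 + 59) * 1000000 microseconds.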
@@ -27,6 +27,7 @@ toc_title: Client Libraries
- Go
- [clickhouse](https://github.com/kshvakov/clickhouse/)
- [go-clickhouse](https://github.com/roistat/go-clickhouse)
- [chconn](https://github.com/vahid-sohrabloo/chconn)
- [mailrugo-clickhouse](https://github.com/mailru/go-clickhouse)
- [golang-clickhouse](https://github.com/leprosus/golang-clickhouse)
- Swift
@@ -43,7 +43,7 @@ User host is a host from which a connection to ClickHouse server could be established...
- `HOST ANY` — User can connect from any location. This is a default option.
- `HOST LOCAL` — User can connect only locally.
- `HOST NAME 'fqdn'` — User host can be specified as FQDN. For example, `HOST NAME 'mysite.com'`.
- `HOST NAME REGEXP 'regexp'` — You can use [pcre](http://www.pcre.org/) regular expressions when specifying user hosts. For example, `HOST NAME REGEXP '.*\.mysite\.com'`.
- `HOST REGEXP 'regexp'` — You can use [pcre](http://www.pcre.org/) regular expressions when specifying user hosts. For example, `HOST REGEXP '.*\.mysite\.com'`.
- `HOST LIKE 'template'` — Allows you to use the [LIKE](../../../sql-reference/functions/string-search-functions.md#function-like) operator to filter the user hosts. For example, `HOST LIKE '%'` is equivalent to `HOST ANY`, `HOST LIKE '%.mysite.com'` filters all the hosts in the `mysite.com` domain.

Another way of specifying host is to use `@` syntax following the username. Examples:
@@ -43,7 +43,7 @@ CREATE USER [IF NOT EXISTS | OR REPLACE] name1 [ON CLUSTER cluster_name1]
- `HOST ANY` — Пользователь может подключиться с любого хоста. Используется по умолчанию.
- `HOST LOCAL` — Пользователь может подключиться только локально.
- `HOST NAME 'fqdn'` — Хост задается через FQDN. Например, `HOST NAME 'mysite.com'`.
- `HOST NAME REGEXP 'regexp'` — Позволяет использовать регулярные выражения [pcre](http://www.pcre.org/), чтобы задать хосты. Например, `HOST NAME REGEXP '.*\.mysite\.com'`.
- `HOST REGEXP 'regexp'` — Позволяет использовать регулярные выражения [pcre](http://www.pcre.org/), чтобы задать хосты. Например, `HOST REGEXP '.*\.mysite\.com'`.
- `HOST LIKE 'template'` — Позволяет использовать оператор [LIKE](../../functions/string-search-functions.md#function-like) для фильтрации хостов. Например, `HOST LIKE '%'` эквивалентен `HOST ANY`; `HOST LIKE '%.mysite.com'` разрешает подключение со всех хостов в домене `mysite.com`.

Также, чтобы задать хост, вы можете использовать `@` вместе с именем пользователя. Примеры:
@@ -62,7 +62,7 @@ def build_for_lang(lang, args):
strict=True,
theme=theme_cfg,
nav=blog_nav,
copyright='©2016–2021 ClickHouse, Inc.',
copyright='©2016–2022 ClickHouse, Inc.',
use_directory_urls=True,
repo_name='ClickHouse/ClickHouse',
repo_url='https://github.com/ClickHouse/ClickHouse/',
@@ -97,10 +97,6 @@ def build_for_lang(lang, args):
with open(os.path.join(args.blog_output_dir, lang, 'rss.xml'), 'w') as f:
f.write(rss_template.render({'config': raw_config}))

# TODO: AMP for blog
# if not args.skip_amp:
#     amp.build_amp(lang, args, cfg)

logging.info(f'Finished building {lang} blog')

except exceptions.ConfigurationError as e:
@@ -1 +0,0 @@
../../../en/faq/general/index.md
docs/zh/faq/general/index.md (new file): 27 lines
@@ -0,0 +1,27 @@
---
title: ClickHouse 有关常见问题
toc_hidden_folder: true
toc_priority: 1
toc_title: General
---

# ClickHouse 有关常见问题 {#general-questions}

常见问题:

- [什么是 ClickHouse?](../../index.md#what-is-clickhouse)
- [为何 ClickHouse 如此迅捷?](../../faq/general/why-clickhouse-is-so-fast.md)
- [谁在使用 ClickHouse?](../../faq/general/who-is-using-clickhouse.md)
- [“ClickHouse” 有什么含义?](../../faq/general/dbms-naming.md)
- [ “Не тормозит” 有什么含义?](../../faq/general/ne-tormozit.md)
- [什么是 OLAP?](../../faq/general/olap.md)
- [什么是列存储数据库?](../../faq/general/columnar-database.md)
- [为何不使用 MapReduce等技术?](../../faq/general/mapreduce.md)
- [我如何为 ClickHouse贡献代码?](../../faq/general/how-do-i-contribute-code-to-clickhouse.md)



!!! info "没找到您需要的内容?"
请查阅 [其他 F.A.Q. 类别](../../faq/index.md) 或者从左侧导航栏浏览其他文档

{## [原始文档](https://clickhouse.com/docs/en/faq/general/) ##}
@@ -1 +0,0 @@
../../../en/faq/general/mapreduce.md
docs/zh/faq/general/mapreduce.md (new file): 13 lines
@@ -0,0 +1,13 @@
---
title: 为何不使用 MapReduce等技术?
toc_hidden: true
toc_priority: 110
---

# 为何不使用 MapReduce等技术? {#why-not-use-something-like-mapreduce}

我们可以将MapReduce这样的系统称为分布式计算系统,其中的reduce操作是基于分布式排序的。这个领域中最常见的开源解决方案是[Apache Hadoop](http://hadoop.apache.org)。Yandex使用其内部解决方案YT。

这些系统不适合用于在线查询,因为它们的延迟很大。换句话说,它们不能被用作网页界面的后端。这些类型的系统对于实时数据更新并不是很有用。如果操作的结果和所有中间结果(如果有的话)都位于单个服务器的内存中,那么分布式排序就不是执行reduce操作的最佳方式,这通常是在线查询的情况。在这种情况下,哈希表是执行reduce操作的最佳方式。优化map-reduce任务的一种常见方法是使用内存中的哈希表进行预聚合(部分reduce)。用户手动执行此优化。在运行简单的map-reduce任务时,分布式排序是导致性能下降的主要原因之一。

大多数MapReduce实现允许你在集群中执行任意代码。但是声明性查询语言更适合于OLAP,以便快速运行实验。例如,Hadoop有Hive和Pig。还可以考虑使用Cloudera Impala或Shark(已经过时了)来支持Spark,以及Spark SQL、Presto和Apache Drill。与专门的系统相比,运行这些任务的性能是非常不理想的,但是相对较高的延迟使得使用这些系统作为web界面的后端是不现实的。
@@ -19,6 +19,7 @@ toc_priority: 76
- [什么是 OLAP?](../faq/general/olap.md)
- [什么是列存储数据库?](../faq/general/columnar-database.md)
- [为何不使用 MapReduce等技术?](../faq/general/mapreduce.md)
- [我如何为 ClickHouse贡献代码?](../faq/general/how-do-i-contribute-code-to-clickhouse.md)
- **[应用案例](../faq/use-cases/index.md)**
- [我能把 ClickHouse 作为时序数据库来使用吗?](../faq/use-cases/time-series.md)
- [我能把 ClickHouse 作为 key-value 键值存储吗?](../faq/use-cases/key-value.md)
@@ -1 +0,0 @@
../../../en/faq/use-cases/time-series.md
docs/zh/faq/use-cases/time-series.md (new file): 21 lines
@@ -0,0 +1,21 @@
---
title: 我能把 ClickHouse 当做时序数据库来使用吗?
toc_hidden: true
toc_priority: 101
---

# 我能把 ClickHouse 当做时序数据库来使用吗? {#can-i-use-clickhouse-as-a-time-series-database}

ClickHouse是一个通用的数据存储解决方案[OLAP](../../faq/general/olap.md)的工作负载,而有许多专门的时间序列数据库管理系统。然而,ClickHouse的[专注于查询执行速度](../../faq/general/why-clickhouse-is-so-fast.md)使得它在许多情况下的性能优于专门的系统。关于这个话题有很多独立的基准,所以我们不打算在这里进行论述。相反,让我们将重点放在ClickHouse的重要功能(如果这是你的用例)上。



首先,有 **[specialized codecs](../../sql-reference/statements/create/table.md#create-query-specialized-codecs)**,这是典型的时间序列。无论是常见的算法,如“DoubleDelta”和“Gorilla”,或特定的ClickHouse 数据类型如“T64”。



其次,时间序列查询通常只访问最近的数据,比如一天或一周以前的数据。使用具有快速nVME/SSD驱动器和高容量HDD驱动器的服务器是有意义的。ClickHouse [TTL](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes)特性允许配置在快速硬盘上保持新鲜的热数据,并随着数据的老化逐渐移动到较慢的硬盘上。如果您的需求需要,也可以汇总或删除更旧的数据。



尽管这与ClickHouse存储和处理原始数据的理念相违背,但你可以使用[materialized views](../../sql-reference/statements/create/view.md)来适应更紧迫的延迟或成本需求。
@@ -364,7 +364,9 @@ int mainEntryClickHouseInstall(int argc, char ** argv)
"clickhouse-git-import",
"clickhouse-compressor",
"clickhouse-format",
"clickhouse-extract-from-config"
"clickhouse-extract-from-config",
"clickhouse-keeper",
"clickhouse-keeper-converter",
};

for (const auto & tool : tools)
@@ -330,8 +330,6 @@ int Keeper::main(const std::vector<std::string> & /*args*/)

DB::ServerUUID::load(path + "/uuid", log);

const Settings & settings = global_context->getSettingsRef();

std::string include_from_path = config().getString("include_from", "/etc/metrika.xml");

GlobalThreadPool::initialize(
@@ -377,8 +375,8 @@ int Keeper::main(const std::vector<std::string> & /*args*/)
{
Poco::Net::ServerSocket socket;
auto address = socketBindListen(socket, listen_host, port);
socket.setReceiveTimeout(settings.receive_timeout);
socket.setSendTimeout(settings.send_timeout);
socket.setReceiveTimeout(config().getUInt64("keeper_server.socket_receive_timeout_sec", DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC));
socket.setSendTimeout(config().getUInt64("keeper_server.socket_send_timeout_sec", DBMS_DEFAULT_SEND_TIMEOUT_SEC));
servers->emplace_back(
listen_host,
port_name,
@@ -393,8 +391,8 @@ int Keeper::main(const std::vector<std::string> & /*args*/)
#if USE_SSL
Poco::Net::SecureServerSocket socket;
auto address = socketBindListen(socket, listen_host, port, /* secure = */ true);
socket.setReceiveTimeout(settings.receive_timeout);
socket.setSendTimeout(settings.send_timeout);
socket.setReceiveTimeout(config().getUInt64("keeper_server.socket_receive_timeout_sec", DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC));
socket.setSendTimeout(config().getUInt64("keeper_server.socket_send_timeout_sec", DBMS_DEFAULT_SEND_TIMEOUT_SEC));
servers->emplace_back(
listen_host,
secure_port_name,
@@ -967,6 +967,83 @@ if (ThreadFuzzer::instance().isEffective())
},
/* already_loaded = */ false); /// Reload it right now (initial loading)

const auto listen_hosts = getListenHosts(config());
const auto listen_try = getListenTry(config());

if (config().has("keeper_server"))
{
#if USE_NURAFT
//// If we don't have configured connection probably someone trying to use clickhouse-server instead
//// of clickhouse-keeper, so start synchronously.
bool can_initialize_keeper_async = false;

if (has_zookeeper) /// We have configured connection to some zookeeper cluster
{
/// If we cannot connect to some other node from our cluster then we have to wait our Keeper start
/// synchronously.
can_initialize_keeper_async = global_context->tryCheckClientConnectionToMyKeeperCluster();
}
/// Initialize keeper RAFT.
global_context->initializeKeeperDispatcher(can_initialize_keeper_async);
FourLetterCommandFactory::registerCommands(*global_context->getKeeperDispatcher());

for (const auto & listen_host : listen_hosts)
{
/// TCP Keeper
const char * port_name = "keeper_server.tcp_port";
createServer(
config(), listen_host, port_name, listen_try, /* start_server: */ false,
servers_to_start_before_tables,
[&](UInt16 port) -> ProtocolServerAdapter
{
Poco::Net::ServerSocket socket;
auto address = socketBindListen(socket, listen_host, port);
socket.setReceiveTimeout(config().getUInt64("keeper_server.socket_receive_timeout_sec", DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC));
socket.setSendTimeout(config().getUInt64("keeper_server.socket_send_timeout_sec", DBMS_DEFAULT_SEND_TIMEOUT_SEC));
return ProtocolServerAdapter(
listen_host,
port_name,
"Keeper (tcp): " + address.toString(),
std::make_unique<TCPServer>(
new KeeperTCPHandlerFactory(*this, false), server_pool, socket));
});

const char * secure_port_name = "keeper_server.tcp_port_secure";
createServer(
config(), listen_host, secure_port_name, listen_try, /* start_server: */ false,
servers_to_start_before_tables,
[&](UInt16 port) -> ProtocolServerAdapter
{
#if USE_SSL
Poco::Net::SecureServerSocket socket;
auto address = socketBindListen(socket, listen_host, port, /* secure = */ true);
socket.setReceiveTimeout(config().getUInt64("keeper_server.socket_receive_timeout_sec", DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC));
socket.setSendTimeout(config().getUInt64("keeper_server.socket_send_timeout_sec", DBMS_DEFAULT_SEND_TIMEOUT_SEC));
return ProtocolServerAdapter(
listen_host,
secure_port_name,
"Keeper with secure protocol (tcp_secure): " + address.toString(),
std::make_unique<TCPServer>(
new KeeperTCPHandlerFactory(*this, true), server_pool, socket));
#else
UNUSED(port);
throw Exception{"SSL support for TCP protocol is disabled because Poco library was built without NetSSL support.",
ErrorCodes::SUPPORT_IS_DISABLED};
#endif
});
}
#else
throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "ClickHouse server built without NuRaft library. Cannot use internal coordination.");
#endif

}

for (auto & server : servers_to_start_before_tables)
{
server.start();
LOG_INFO(log, "Listening for {}", server.getDescription());
}

auto & access_control = global_context->getAccessControl();
if (config().has("custom_settings_prefixes"))
access_control.setCustomSettingsPrefixes(config().getString("custom_settings_prefixes"));
@@ -1075,83 +1152,6 @@ if (ThreadFuzzer::instance().isEffective())
/// try set up encryption. There are some errors in config, error will be printed and server wouldn't start.
CompressionCodecEncrypted::Configuration::instance().load(config(), "encryption_codecs");

const auto listen_hosts = getListenHosts(config());
const auto listen_try = getListenTry(config());

if (config().has("keeper_server"))
{
#if USE_NURAFT
//// If we don't have configured connection probably someone trying to use clickhouse-server instead
//// of clickhouse-keeper, so start synchronously.
bool can_initialize_keeper_async = false;

if (has_zookeeper) /// We have configured connection to some zookeeper cluster
{
/// If we cannot connect to some other node from our cluster then we have to wait our Keeper start
/// synchronously.
can_initialize_keeper_async = global_context->tryCheckClientConnectionToMyKeeperCluster();
}
/// Initialize keeper RAFT.
global_context->initializeKeeperDispatcher(can_initialize_keeper_async);
FourLetterCommandFactory::registerCommands(*global_context->getKeeperDispatcher());

for (const auto & listen_host : listen_hosts)
{
/// TCP Keeper
const char * port_name = "keeper_server.tcp_port";
createServer(
config(), listen_host, port_name, listen_try, /* start_server: */ false,
servers_to_start_before_tables,
[&](UInt16 port) -> ProtocolServerAdapter
{
Poco::Net::ServerSocket socket;
auto address = socketBindListen(socket, listen_host, port);
socket.setReceiveTimeout(settings.receive_timeout);
socket.setSendTimeout(settings.send_timeout);
return ProtocolServerAdapter(
listen_host,
port_name,
"Keeper (tcp): " + address.toString(),
std::make_unique<TCPServer>(
new KeeperTCPHandlerFactory(*this, false), server_pool, socket));
});

const char * secure_port_name = "keeper_server.tcp_port_secure";
createServer(
config(), listen_host, secure_port_name, listen_try, /* start_server: */ false,
servers_to_start_before_tables,
[&](UInt16 port) -> ProtocolServerAdapter
{
#if USE_SSL
Poco::Net::SecureServerSocket socket;
auto address = socketBindListen(socket, listen_host, port, /* secure = */ true);
socket.setReceiveTimeout(settings.receive_timeout);
socket.setSendTimeout(settings.send_timeout);
return ProtocolServerAdapter(
listen_host,
secure_port_name,
"Keeper with secure protocol (tcp_secure): " + address.toString(),
std::make_unique<TCPServer>(
new KeeperTCPHandlerFactory(*this, true), server_pool, socket));
#else
UNUSED(port);
throw Exception{"SSL support for TCP protocol is disabled because Poco library was built without NetSSL support.",
ErrorCodes::SUPPORT_IS_DISABLED};
#endif
});
}
#else
throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "ClickHouse server built without NuRaft library. Cannot use internal coordination.");
#endif

}

for (auto & server : servers_to_start_before_tables)
{
server.start();
LOG_INFO(log, "Listening for {}", server.getDescription());
}

SCOPE_EXIT({
/// Stop reloading of the main config. This must be done before `global_context->shutdown()` because
/// otherwise the reloading may pass a changed config to some destroyed parts of ContextSharedPart.
@@ -506,6 +506,7 @@ if (ENABLE_NLP)
dbms_target_link_libraries (PUBLIC ch_contrib::stemmer)
dbms_target_link_libraries (PUBLIC ch_contrib::wnb)
dbms_target_link_libraries (PUBLIC ch_contrib::lemmagen)
dbms_target_link_libraries (PUBLIC ch_contrib::nlp_data)
endif()

if (TARGET ch_contrib::bzip2)
@@ -558,3 +559,4 @@ if (ENABLE_TESTS)

add_check(unit_tests_dbms)
endif ()

src/Common/FrequencyHolder.h (new file): 252 lines
@@ -0,0 +1,252 @@
#pragma once

#include <Common/Arena.h>
#include <Common/getResource.h>
#include <Common/HashTable/HashMap.h>
#include <Common/StringUtils/StringUtils.h>
#include <IO/ReadBufferFromFile.h>
#include <IO/ReadBufferFromString.h>
#include <IO/ReadHelpers.h>
#include <IO/readFloatText.h>
#include <IO/ZstdInflatingReadBuffer.h>

#include <base/StringRef.h>
#include <base/logger_useful.h>

#include <string_view>
#include <unordered_map>

namespace DB
{

namespace ErrorCodes
{
extern const int FILE_DOESNT_EXIST;
}

/// FrequencyHolder class is responsible for storing and loading dictionaries
/// needed for text classification functions:
///
/// 1. detectLanguageUnknown
/// 2. detectCharset
/// 3. detectTonality
/// 4. detectProgrammingLanguage

class FrequencyHolder
{

public:
struct Language
{
String name;
HashMap<StringRef, Float64> map;
};

struct Encoding
{
String name;
String lang;
HashMap<UInt16, Float64> map;
};

public:
using Map = HashMap<StringRef, Float64>;
using Container = std::vector<Language>;
using EncodingMap = HashMap<UInt16, Float64>;
using EncodingContainer = std::vector<Encoding>;

static FrequencyHolder & getInstance()
{
static FrequencyHolder instance;
return instance;
}

void loadEncodingsFrequency()
{
Poco::Logger * log = &Poco::Logger::get("EncodingsFrequency");

LOG_TRACE(log, "Loading embedded charset frequencies");

auto resource = getResource("charset.zst");
if (resource.empty())
throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "There is no embedded charset frequencies");

String line;
UInt16 bigram;
Float64 frequency;
String charset_name;

auto buf = std::make_unique<ReadBufferFromMemory>(resource.data(), resource.size());
ZstdInflatingReadBuffer in(std::move(buf));

while (!in.eof())
{
readString(line, in);
in.ignore();

if (line.empty())
continue;

ReadBufferFromString buf_line(line);

// Start loading a new charset
if (line.starts_with("// "))
{
// Skip "// "
buf_line.ignore(3);
readString(charset_name, buf_line);

/* In our dictionary we have lines with form: <Language>_<Charset>
* If we need to find language of data, we return <Language>
* If we need to find charset of data, we return <Charset>.
*/
size_t sep = charset_name.find('_');

Encoding enc;
enc.lang = charset_name.substr(0, sep);
enc.name = charset_name.substr(sep + 1);
encodings_freq.push_back(std::move(enc));
}
else
{
readIntText(bigram, buf_line);
buf_line.ignore();
readFloatText(frequency, buf_line);

encodings_freq.back().map[bigram] = frequency;
}
}
LOG_TRACE(log, "Charset frequencies was added, charsets count: {}", encodings_freq.size());
}


void loadEmotionalDict()
{
Poco::Logger * log = &Poco::Logger::get("EmotionalDict");
LOG_TRACE(log, "Loading embedded emotional dictionary");

auto resource = getResource("tonality_ru.zst");
if (resource.empty())
throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "There is no embedded emotional dictionary");

String line;
String word;
Float64 tonality;
size_t count = 0;

auto buf = std::make_unique<ReadBufferFromMemory>(resource.data(), resource.size());
ZstdInflatingReadBuffer in(std::move(buf));

while (!in.eof())
{
readString(line, in);
in.ignore();

if (line.empty())
continue;

ReadBufferFromString buf_line(line);

readStringUntilWhitespace(word, buf_line);
buf_line.ignore();
readFloatText(tonality, buf_line);

StringRef ref{string_pool.insert(word.data(), word.size()), word.size()};
emotional_dict[ref] = tonality;
++count;
}
LOG_TRACE(log, "Emotional dictionary was added. Word count: {}", std::to_string(count));
}


void loadProgrammingFrequency()
{
Poco::Logger * log = &Poco::Logger::get("ProgrammingFrequency");

LOG_TRACE(log, "Loading embedded programming languages frequencies loading");

auto resource = getResource("programming.zst");
if (resource.empty())
throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "There is no embedded programming languages frequencies");

String line;
String bigram;
Float64 frequency;
String programming_language;

auto buf = std::make_unique<ReadBufferFromMemory>(resource.data(), resource.size());
ZstdInflatingReadBuffer in(std::move(buf));

while (!in.eof())
{
readString(line, in);
in.ignore();

if (line.empty())
continue;

ReadBufferFromString buf_line(line);

// Start loading a new language
if (line.starts_with("// "))
{
// Skip "// "
buf_line.ignore(3);
readString(programming_language, buf_line);

Language lang;
lang.name = programming_language;
programming_freq.push_back(std::move(lang));
}
else
{
readStringUntilWhitespace(bigram, buf_line);
buf_line.ignore();
readFloatText(frequency, buf_line);

StringRef ref{string_pool.insert(bigram.data(), bigram.size()), bigram.size()};
programming_freq.back().map[ref] = frequency;
}
}
LOG_TRACE(log, "Programming languages frequencies was added");
}

const Map & getEmotionalDict()
{
std::lock_guard lock(mutex);
if (emotional_dict.empty())
loadEmotionalDict();

return emotional_dict;
}


const EncodingContainer & getEncodingsFrequency()
{
std::lock_guard lock(mutex);
if (encodings_freq.empty())
loadEncodingsFrequency();

return encodings_freq;
}

const Container & getProgrammingFrequency()
{
std::lock_guard lock(mutex);
if (programming_freq.empty())
loadProgrammingFrequency();

return programming_freq;
}


private:
Arena string_pool;

Map emotional_dict;
Container programming_freq;
EncodingContainer encodings_freq;

std::mutex mutex;
};
}
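Editor's note (not part of the diff): FrequencyHolder is a lazily-initialized singleton, so callers reach the embedded dictionaries through its accessors, roughly like this (illustrative usage only, under the API shown above):

    // Each accessor decompresses its embedded .zst resource on first use
    // and returns a reference to the cached frequency table.
    const auto & tonality = DB::FrequencyHolder::getInstance().getEmotionalDict();
    const auto & charsets = DB::FrequencyHolder::getInstance().getEncodingsFrequency();
    const auto & prog_langs = DB::FrequencyHolder::getInstance().getProgrammingFrequency();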
@@ -291,6 +291,15 @@ public:

size_t getIntervalsSize() const { return intervals_size; }

size_t getSizeInBytes() const
{
size_t nodes_size_in_bytes = nodes.size() * sizeof(Node);
size_t intervals_size_in_bytes = sorted_intervals.size() * sizeof(IntervalWithValue);
size_t result = nodes_size_in_bytes + intervals_size_in_bytes;

return result;
}

private:
struct Node
{
@@ -24,7 +24,6 @@ namespace DB

namespace ErrorCodes
{
extern const int UNSUPPORTED_PARAMETER;
extern const int BAD_ARGUMENTS;
}

@@ -34,9 +33,12 @@ namespace ErrorCodes
*/


struct StringSearcherBase
class StringSearcherBase
{
public:
bool force_fallback = false;
#ifdef __SSE2__
protected:
static constexpr auto n = sizeof(__m128i);
const int page_size = ::getPageSize();

@@ -53,7 +55,7 @@ template <bool CaseSensitive, bool ASCII> class StringSearcher;

/// Case-insensitive UTF-8 searcher
template <>
class StringSearcher<false, false> : private StringSearcherBase
class StringSearcher<false, false> : public StringSearcherBase
{
private:
using UTF8SequenceBuffer = uint8_t[6];
@@ -119,11 +121,14 @@ public:
size_t length_u = UTF8::convertCodePointToUTF8(first_u_u32, u_seq, sizeof(u_seq));

if (length_l != length_u)
throw Exception{"UTF8 sequences with different lowercase and uppercase lengths are not supported", ErrorCodes::UNSUPPORTED_PARAMETER};
force_fallback = true;
}

l = l_seq[0];
u = u_seq[0];

if (force_fallback)
return;
}

#ifdef __SSE4_1__
@@ -158,7 +163,10 @@ public:

/// @note Unicode standard states it is a rare but possible occasion
if (!(dst_l_len == dst_u_len && dst_u_len == src_len))
throw Exception{"UTF8 sequences with different lowercase and uppercase lengths are not supported", ErrorCodes::UNSUPPORTED_PARAMETER};
{
force_fallback = true;
return;
}
}

cache_actual_len += src_len;
@@ -199,9 +207,10 @@ public:
if (Poco::Unicode::toLower(*haystack_code_point) != Poco::Unicode::toLower(*needle_code_point))
break;

/// @note assuming sequences for lowercase and uppercase have exact same length (that is not always true)
const auto len = UTF8::seqLength(*haystack_pos);
auto len = UTF8::seqLength(*haystack_pos);
haystack_pos += len;

len = UTF8::seqLength(*needle_pos);
needle_pos += len;
}

@@ -213,7 +222,7 @@ public:
{

#ifdef __SSE4_1__
if (pageSafe(pos))
if (pageSafe(pos) && !force_fallback)
{
const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pos));
const auto v_against_l = _mm_cmpeq_epi8(v_haystack, cachel);
@@ -262,7 +271,7 @@ public:
while (haystack < haystack_end)
{
#ifdef __SSE4_1__
if (haystack + n <= haystack_end && pageSafe(haystack))
if (haystack + n <= haystack_end && pageSafe(haystack) && !force_fallback)
{
const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(haystack));
const auto v_against_l = _mm_cmpeq_epi8(v_haystack, patl);
@@ -339,7 +348,7 @@ public:

/// Case-insensitive ASCII searcher
template <>
class StringSearcher<false, true> : private StringSearcherBase
class StringSearcher<false, true> : public StringSearcherBase
{
private:
/// string to be searched for
@@ -541,7 +550,7 @@ public:

/// Case-sensitive searcher (both ASCII and UTF-8)
template <bool ASCII>
class StringSearcher<true, ASCII> : private StringSearcherBase
class StringSearcher<true, ASCII> : public StringSearcherBase
{
private:
/// string to be searched for
@@ -725,7 +734,7 @@ public:
// Any value outside of basic ASCII (>=128) is considered a non-separator symbol, hence UTF-8 strings
// should work just fine. But any Unicode whitespace is not considered a token separtor.
template <typename StringSearcher>
class TokenSearcher
class TokenSearcher : public StringSearcherBase
{
StringSearcher searcher;
size_t needle_size;
@@ -809,7 +818,7 @@ using ASCIICaseInsensitiveTokenSearcher = TokenSearcher<ASCIICaseInsensitiveStri
* It is required that strings are zero-terminated.
*/

struct LibCASCIICaseSensitiveStringSearcher
struct LibCASCIICaseSensitiveStringSearcher : public StringSearcherBase
{
const char * const needle;

@@ -833,7 +842,7 @@ struct LibCASCIICaseSensitiveStringSearcher
}
};

struct LibCASCIICaseInsensitiveStringSearcher
struct LibCASCIICaseInsensitiveStringSearcher : public StringSearcherBase
{
const char * const needle;

@@ -372,7 +372,7 @@ public:
, fallback{VolnitskyTraits::isFallbackNeedle(needle_size, haystack_size_hint)}
, fallback_searcher{needle_, needle_size}
{
if (fallback)
if (fallback || fallback_searcher.force_fallback)
return;

hash = std::unique_ptr<VolnitskyTraits::Offset[]>(new VolnitskyTraits::Offset[VolnitskyTraits::hash_size]{});
@@ -393,7 +393,7 @@ public:

const auto haystack_end = haystack + haystack_size;

if (fallback || haystack_size <= needle_size)
if (fallback || haystack_size <= needle_size || fallback_searcher.force_fallback)
return fallback_searcher.search(haystack, haystack_end);

/// Let's "apply" the needle to the haystack and compare the n-gram from the end of the needle.
@@ -16,7 +16,15 @@ using MYSQL_ROW = char**;
struct st_mysql_field;
using MYSQL_FIELD = st_mysql_field;

enum struct enum_field_types;
enum struct enum_field_types { MYSQL_TYPE_DECIMAL, MYSQL_TYPE_TINY,
MYSQL_TYPE_SHORT, MYSQL_TYPE_LONG,
MYSQL_TYPE_FLOAT, MYSQL_TYPE_DOUBLE,
MYSQL_TYPE_NULL, MYSQL_TYPE_TIMESTAMP,
MYSQL_TYPE_LONGLONG, MYSQL_TYPE_INT24,
MYSQL_TYPE_DATE, MYSQL_TYPE_TIME,
MYSQL_TYPE_DATETIME, MYSQL_TYPE_YEAR,
MYSQL_TYPE_NEWDATE, MYSQL_TYPE_VARCHAR,
MYSQL_TYPE_BIT };

#endif

@@ -204,6 +204,7 @@ namespace MySQLReplication
case MYSQL_TYPE_DATE:
case MYSQL_TYPE_DATETIME:
case MYSQL_TYPE_NEWDATE:
case MYSQL_TYPE_YEAR:
{
/// No data here.
column_meta.emplace_back(0);
@@ -214,7 +215,9 @@
case MYSQL_TYPE_DOUBLE:
case MYSQL_TYPE_TIMESTAMP2:
case MYSQL_TYPE_DATETIME2:
case MYSQL_TYPE_TIME2:
case MYSQL_TYPE_BLOB:
case MYSQL_TYPE_GEOMETRY:
{
column_meta.emplace_back(UInt16(meta[pos]));
pos += 1;
@@ -432,6 +435,98 @@ namespace MySQLReplication
row.push_back(Field(date_day_number.toUnderType()));
break;
}
case MYSQL_TYPE_YEAR: {
Int16 val = 0;
payload.readStrict(reinterpret_cast<char *>(&val), 1);
row.push_back(Field{UInt16{static_cast<UInt16>(val + 1900)}});
break;
}
case MYSQL_TYPE_TIME2:
{
UInt64 uintpart = 0UL;
Int32 frac = 0U;
Int64 ltime;
Int64 intpart;
switch (meta)
{
case 0:
{
readBigEndianStrict(payload, reinterpret_cast<char *>(&uintpart), 3);
intpart = uintpart - 0x800000L;
ltime = intpart << 24;
break;
}
case 1:
case 2:
{
readBigEndianStrict(payload, reinterpret_cast<char *>(&uintpart), 3);
intpart = uintpart - 0x800000L;
readBigEndianStrict(payload, reinterpret_cast<char *>(&frac), 1);
if (intpart < 0 && frac > 0)
{
intpart ++;
frac -= 0x100;
}
frac = frac * 10000;
ltime = intpart << 24;
break;
}
case 3:
case 4:
{
readBigEndianStrict(payload, reinterpret_cast<char *>(&uintpart), 3);
intpart = uintpart - 0x800000L;
readBigEndianStrict(payload, reinterpret_cast<char *>(&frac), 2);
if (intpart < 0 && frac > 0)
{
intpart ++;
frac -= 0x10000;
}
frac = frac * 100;
ltime = intpart << 24;
break;
}
case 5:
case 6:
{
readBigEndianStrict(payload, reinterpret_cast<char *>(&uintpart), 6);
intpart = uintpart - 0x800000000000L;
ltime = intpart;
frac = std::abs(intpart % (1L << 24));
break;
}
default:
{
readBigEndianStrict(payload, reinterpret_cast<char *>(&uintpart), 3);
intpart = uintpart - 0x800000L;
ltime = intpart << 24;
break;
}
}
Int64 hh, mm, ss;
bool negative = false;
if (intpart == 0)
{
hh = 0;
mm = 0;
ss = 0;
}
else
{
if (ltime < 0) negative= true;
UInt64 ultime = std::abs(ltime);
intpart = ultime >> 24;
hh = (intpart >> 12) % (1 << 10);
mm = (intpart >> 6) % (1 << 6);
ss = intpart % (1 << 6);
}

Int64 time_micro = 0;
time_micro = (hh * 3600 + mm * 60 + ss) * 1000000 + std::abs(frac);
if (negative) time_micro = - time_micro;
row.push_back(Field{Int64{time_micro}});
break;
}
case MYSQL_TYPE_DATETIME2:
{
Int64 val = 0;
@@ -585,6 +680,14 @@ namespace MySQLReplication
}
break;
}
case MYSQL_TYPE_SET:
{
UInt32 size = (meta & 0xff);
Bitmap bitmap1;
readBitmap(payload, bitmap1, size);
row.push_back(Field{UInt64{bitmap1.to_ulong()}});
break;
}
case MYSQL_TYPE_BIT:
{
UInt32 bits = ((meta >> 8) * 8) + (meta & 0xff);
@@ -631,6 +734,7 @@ namespace MySQLReplication
row.push_back(Field{String{val}});
break;
}
case MYSQL_TYPE_GEOMETRY:
case MYSQL_TYPE_BLOB:
{
UInt32 size = 0;
@@ -92,5 +92,7 @@ void registerDataTypeString(DataTypeFactory & factory)
factory.registerAlias("BINARY LARGE OBJECT", "String", DataTypeFactory::CaseInsensitive);
factory.registerAlias("BINARY VARYING", "String", DataTypeFactory::CaseInsensitive);
factory.registerAlias("VARBINARY", "String", DataTypeFactory::CaseInsensitive);
factory.registerAlias("GEOMETRY", "String", DataTypeFactory::CaseInsensitive); //mysql

}
}
@@ -86,7 +86,10 @@ void registerDataTypeNumbers(DataTypeFactory & factory)
factory.registerAlias("INT UNSIGNED", "UInt32", DataTypeFactory::CaseInsensitive);
factory.registerAlias("INTEGER UNSIGNED", "UInt32", DataTypeFactory::CaseInsensitive);
factory.registerAlias("BIGINT UNSIGNED", "UInt64", DataTypeFactory::CaseInsensitive);
factory.registerAlias("BIT", "UInt64", DataTypeFactory::CaseInsensitive);
factory.registerAlias("BIT", "UInt64", DataTypeFactory::CaseInsensitive); /// MySQL
factory.registerAlias("SET", "UInt64", DataTypeFactory::CaseInsensitive); /// MySQL
factory.registerAlias("YEAR", "UInt16", DataTypeFactory::CaseInsensitive);
factory.registerAlias("TIME", "Int64", DataTypeFactory::CaseInsensitive);
}

}
@@ -523,6 +523,7 @@ inline bool isBool(const DataTypePtr & data_type)
template <typename DataType> constexpr bool IsDataTypeDecimal = false;
template <typename DataType> constexpr bool IsDataTypeNumber = false;
template <typename DataType> constexpr bool IsDataTypeDateOrDateTime = false;
template <typename DataType> constexpr bool IsDataTypeEnum = false;

template <typename DataType> constexpr bool IsDataTypeDecimalOrNumber = IsDataTypeDecimal<DataType> || IsDataTypeNumber<DataType>;

@@ -547,4 +548,9 @@ template <> inline constexpr bool IsDataTypeDateOrDateTime<DataTypeDate32> = tru
template <> inline constexpr bool IsDataTypeDateOrDateTime<DataTypeDateTime> = true;
template <> inline constexpr bool IsDataTypeDateOrDateTime<DataTypeDateTime64> = true;

template <typename T>
class DataTypeEnum;

template <typename T> inline constexpr bool IsDataTypeEnum<DataTypeEnum<T>> = true;

}
@@ -17,6 +17,7 @@
#include <Databases/MySQL/MaterializeMetadata.h>
#include <Processors/Sources/MySQLSource.h>
#include <IO/ReadBufferFromString.h>
#include <IO/Operators.h>
#include <Interpreters/Context.h>
#include <Interpreters/executeQuery.h>
#include <Storages/StorageMergeTree.h>
@@ -315,6 +316,47 @@ getTableOutput(const String & database_name, const String & table_name, ContextM
return std::move(res.pipeline);
}

static inline String reWriteMysqlQueryColumn(mysqlxx::Pool::Entry & connection, const String & database_name, const String & table_name, const Settings & global_settings)
{
Block tables_columns_sample_block
{
{ std::make_shared<DataTypeString>(), "column_name" },
{ std::make_shared<DataTypeString>(), "column_type" }
};

const String & query = "SELECT COLUMN_NAME AS column_name, COLUMN_TYPE AS column_type FROM INFORMATION_SCHEMA.COLUMNS"
" WHERE TABLE_SCHEMA = '" + backQuoteIfNeed(database_name) +
"' AND TABLE_NAME = '" + backQuoteIfNeed(table_name) + "' ORDER BY ORDINAL_POSITION";

StreamSettings mysql_input_stream_settings(global_settings, false, true);
auto mysql_source = std::make_unique<MySQLSource>(connection, query, tables_columns_sample_block, mysql_input_stream_settings);

Block block;
WriteBufferFromOwnString query_columns;
QueryPipeline pipeline(std::move(mysql_source));
PullingPipelineExecutor executor(pipeline);
while (executor.pull(block))
{
const auto & column_name_col = *block.getByPosition(0).column;
const auto & column_type_col = *block.getByPosition(1).column;
size_t rows = block.rows();
for (size_t i = 0; i < rows; ++i)
{
String column_name = column_name_col[i].safeGet<String>();
String column_type = column_type_col[i].safeGet<String>();
//we can do something special conversion to guarantee select results is the same as the binlog parse results
if (column_type.starts_with("set"))
{
query_columns << (backQuote(column_name) + " + 0");
} else
query_columns << backQuote(column_name);
query_columns << ",";
}
}
String query_columns_str = query_columns.str();
return query_columns_str.substr(0, query_columns_str.length() - 1);
}

static inline void dumpDataForTables(
mysqlxx::Pool::Entry & connection, const std::unordered_map<String, String> & need_dumping_tables,
const String & query_prefix, const String & database_name, const String & mysql_database_name,
@@ -334,9 +376,10 @@ static inline void dumpDataForTables(

auto pipeline = getTableOutput(database_name, table_name, query_context);
StreamSettings mysql_input_stream_settings(context->getSettingsRef());
auto input = std::make_unique<MySQLSource>(
connection, "SELECT * FROM " + backQuoteIfNeed(mysql_database_name) + "." + backQuoteIfNeed(table_name),
pipeline.getHeader(), mysql_input_stream_settings);
String mysql_select_all_query = "SELECT " + reWriteMysqlQueryColumn(connection, mysql_database_name, table_name, context->getSettings()) + " FROM "
+ backQuoteIfNeed(mysql_database_name) + "." + backQuoteIfNeed(table_name);
LOG_INFO(&Poco::Logger::get("MaterializedMySQLSyncThread(" + database_name + ")"), "mysql_select_all_query is {}", mysql_select_all_query);
auto input = std::make_unique<MySQLSource>(connection, mysql_select_all_query, pipeline.getHeader(), mysql_input_stream_settings);
auto counting = std::make_shared<CountingTransform>(pipeline.getHeader());
Pipe pipe(std::move(input));
pipe.addTransform(counting);
@@ -60,8 +60,8 @@ private:
const auto & attributes_types_to_read = coordinator->getAttributesTypesToRead();
const auto & attributes_default_values_columns = coordinator->getAttributesDefaultValuesColumns();

const auto & dictionary = coordinator->getDictionary();
auto attributes_columns = dictionary->getColumns(
const auto & read_columns_func = coordinator->getReadColumnsFunc();
auto attributes_columns = read_columns_func(
attributes_names_to_read,
attributes_types_to_read,
key_columns,
@@ -19,6 +19,8 @@ class DictionarySourceCoordinator final : public shared_ptr_helper<DictionarySou

public:

using ReadColumnsFunc = std::function<Columns (const Strings &, const DataTypes &, const Columns &, const DataTypes &, const Columns &)>;

Pipe read(size_t num_streams);

private:
@@ -31,6 +33,15 @@ private:
: dictionary(std::move(dictionary_))
, key_columns_with_type(std::move(key_columns_with_type_))
, max_block_size(max_block_size_)
, read_columns_func([this](
const Strings & attribute_names,
const DataTypes & result_types,
const Columns & key_columns,
const DataTypes & key_types,
const Columns & default_values_columns)
{
return dictionary->getColumns(attribute_names, result_types, key_columns, key_types, default_values_columns);
})
{
initialize(column_names);
}
@@ -45,6 +56,31 @@ private:
, key_columns_with_type(std::move(key_columns_with_type_))
, data_columns_with_type(std::move(data_columns_with_type_))
, max_block_size(max_block_size_)
, read_columns_func([this](
const Strings & attribute_names,
const DataTypes & result_types,
const Columns & key_columns,
const DataTypes & key_types,
const Columns & default_values_columns)
{
return dictionary->getColumns(attribute_names, result_types, key_columns, key_types, default_values_columns);
})
{
initialize(column_names);
}

explicit DictionarySourceCoordinator(
std::shared_ptr<const IDictionary> dictionary_,
const Names & column_names,
ColumnsWithTypeAndName && key_columns_with_type_,
ColumnsWithTypeAndName && data_columns_with_type_,
size_t max_block_size_,
ReadColumnsFunc read_columns_func_)
: dictionary(std::move(dictionary_))
, key_columns_with_type(std::move(key_columns_with_type_))
, data_columns_with_type(std::move(data_columns_with_type_))
, max_block_size(max_block_size_)
, read_columns_func(std::move(read_columns_func_))
{
initialize(column_names);
}
@@ -61,6 +97,8 @@ private:

const std::vector<ColumnPtr> & getAttributesDefaultValuesColumns() const { return attributes_default_values_columns; }

const ReadColumnsFunc & getReadColumnsFunc() const { return read_columns_func; }

const std::shared_ptr<const IDictionary> & getDictionary() const { return dictionary; }

void initialize(const Names & column_names);
@@ -79,6 +117,8 @@ private:
std::vector<ColumnPtr> attributes_default_values_columns;

const size_t max_block_size;
ReadColumnsFunc read_columns_func;

std::atomic<size_t> parallel_read_block_index = 0;
};

@@ -382,7 +382,8 @@ std::vector<DictionaryAttribute> DictionaryStructure::getAttributes(

void DictionaryStructure::parseRangeConfiguration(const Poco::Util::AbstractConfiguration & config, const std::string & structure_prefix)
{
const char * range_default_type = "Date";
static constexpr auto range_default_type = "Date";

if (config.has(structure_prefix + ".range_min"))
range_min.emplace(makeDictionaryTypedSpecialAttribute(config, structure_prefix + ".range_min", range_default_type));

@@ -395,7 +396,10 @@ void DictionaryStructure::parseRangeConfiguration(const Poco::Util::AbstractConf
"Dictionary structure should have both 'range_min' and 'range_max' either specified or not.");
}

if (range_min && range_max && !range_min->type->equals(*range_max->type))
if (!range_min)
return;

if (!range_min->type->equals(*range_max->type))
{
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Dictionary structure 'range_min' and 'range_max' should have same type, "
@@ -405,15 +409,20 @@ void DictionaryStructure::parseRangeConfiguration(const Poco::Util::AbstractConf
range_max->type->getName());
}

if (range_min && !range_min->type->isValueRepresentedByInteger())
WhichDataType range_type(range_min->type);

bool valid_range = range_type.isInt() || range_type.isUInt() || range_type.isDecimal() || range_type.isFloat() || range_type.isEnum()
|| range_type.isDate() || range_type.isDate32() || range_type.isDateTime() || range_type.isDateTime64();

if (!valid_range)
{
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Dictionary structure type of 'range_min' and 'range_max' should be an integer, Date, DateTime, or Enum."
"Dictionary structure type of 'range_min' and 'range_max' should be an Integer, Float, Decimal, Date, Date32, DateTime DateTime64, or Enum."
" Actual 'range_min' and 'range_max' type is {}",
range_min->type->getName());
}

if ((range_min && !range_min->expression.empty()) || (range_max && !range_max->expression.empty()))
if (!range_min->expression.empty() || !range_max->expression.empty())
has_expressions = true;
}

(File diff suppressed because it is too large.)
@@ -19,7 +19,18 @@
namespace DB
{

using RangeStorageType = Int64;
enum class RangeHashedDictionaryLookupStrategy : uint8_t
{
min,
max
};

struct RangeHashedDictionaryConfiguration
{
bool convert_null_range_bound_to_open;
RangeHashedDictionaryLookupStrategy lookup_strategy;
bool require_nonempty;
};

template <DictionaryKeyType dictionary_key_type>
class RangeHashedDictionary final : public IDictionary
@@ -31,11 +42,17 @@ public:
const StorageID & dict_id_,
const DictionaryStructure & dict_struct_,
DictionarySourcePtr source_ptr_,
const DictionaryLifetime dict_lifetime_,
bool require_nonempty_,
DictionaryLifetime dict_lifetime_,
RangeHashedDictionaryConfiguration configuration_,
BlockPtr update_field_loaded_block_ = nullptr);

std::string getTypeName() const override { return "RangeHashed"; }
std::string getTypeName() const override
{
if constexpr (dictionary_key_type == DictionaryKeyType::Simple)
return "RangeHashed";
else
return "ComplexKeyRangeHashed";
}

size_t getBytesAllocated() const override { return bytes_allocated; }

@@ -57,7 +74,15 @@ public:

std::shared_ptr<const IExternalLoadable> clone() const override
{
return std::make_shared<RangeHashedDictionary>(getDictionaryID(), dict_struct, source_ptr->clone(), dict_lifetime, require_nonempty, update_field_loaded_block);
auto result = std::make_shared<RangeHashedDictionary>(
getDictionaryID(),
dict_struct,
source_ptr->clone(),
dict_lifetime,
configuration,
update_field_loaded_block);

return result;
}

DictionarySourcePtr getSource() const override { return source_ptr; }
@@ -76,7 +101,7 @@ public:
DictionarySpecialKeyType getSpecialKeyType() const override { return DictionarySpecialKeyType::Range;}

ColumnPtr getColumn(
const std::string& attribute_name,
const std::string & attribute_name,
const DataTypePtr & result_type,
const Columns & key_columns,
const DataTypes & key_types,
@@ -88,52 +113,90 @@ public:

private:

using RangeInterval = Interval<RangeStorageType>;
template <typename RangeStorageType>
using IntervalMap = IntervalMap<Interval<RangeStorageType>, size_t>;

template <typename T>
using Values = IntervalMap<RangeInterval, std::optional<T>>;
template <typename RangeStorageType>
using KeyAttributeContainerType = std::conditional_t<
dictionary_key_type == DictionaryKeyType::Simple,
HashMap<UInt64, IntervalMap<RangeStorageType>, DefaultHash<UInt64>>,
HashMapWithSavedHash<StringRef, IntervalMap<RangeStorageType>, DefaultHash<StringRef>>>;

template <typename Value>
using CollectionType = std::conditional_t<
dictionary_key_type == DictionaryKeyType::Simple,
HashMap<UInt64, Values<Value>, DefaultHash<UInt64>>,
HashMapWithSavedHash<StringRef, Values<Value>, DefaultHash<StringRef>>>;

using NoAttributesCollectionType = std::conditional_t<
dictionary_key_type == DictionaryKeyType::Simple,
HashMap<UInt64, IntervalSet<RangeInterval>>,
HashMapWithSavedHash<StringRef, IntervalSet<RangeInterval>>>;
using AttributeContainerType = std::conditional_t<std::is_same_v<Value, Array>, std::vector<Value>, PaddedPODArray<Value>>;

struct Attribute final
{
public:
AttributeUnderlyingType type;
bool is_nullable;

std::variant<
CollectionType<UInt8>,
CollectionType<UInt16>,
CollectionType<UInt32>,
CollectionType<UInt64>,
CollectionType<UInt128>,
CollectionType<UInt256>,
CollectionType<Int8>,
CollectionType<Int16>,
CollectionType<Int32>,
CollectionType<Int64>,
CollectionType<Int128>,
CollectionType<Int256>,
CollectionType<Decimal32>,
CollectionType<Decimal64>,
CollectionType<Decimal128>,
CollectionType<Decimal256>,
CollectionType<DateTime64>,
CollectionType<Float32>,
CollectionType<Float64>,
CollectionType<UUID>,
CollectionType<StringRef>,
CollectionType<Array>>
maps;
AttributeContainerType<UInt8>,
AttributeContainerType<UInt16>,
AttributeContainerType<UInt32>,
AttributeContainerType<UInt64>,
AttributeContainerType<UInt128>,
AttributeContainerType<UInt256>,
AttributeContainerType<Int8>,
AttributeContainerType<Int16>,
AttributeContainerType<Int32>,
AttributeContainerType<Int64>,
AttributeContainerType<Int128>,
AttributeContainerType<Int256>,
AttributeContainerType<Decimal32>,
AttributeContainerType<Decimal64>,
AttributeContainerType<Decimal128>,
AttributeContainerType<Decimal256>,
AttributeContainerType<DateTime64>,
AttributeContainerType<Float32>,
AttributeContainerType<Float64>,
AttributeContainerType<UUID>,
AttributeContainerType<StringRef>,
AttributeContainerType<Array>>
container;

std::optional<std::vector<bool>> is_value_nullable;
};

template <typename RangeStorageType>
struct InvalidIntervalWithKey
{
KeyType key;
Interval<RangeStorageType> interval;
size_t attribute_value_index;
};

template <typename RangeStorageType>
using InvalidIntervalsContainerType = PaddedPODArray<InvalidIntervalWithKey<RangeStorageType>>;

template <template<typename> typename ContainerType>
using RangeStorageTypeContainer = std::variant<
ContainerType<UInt8>,
ContainerType<UInt16>,
ContainerType<UInt32>,
ContainerType<UInt64>,
ContainerType<UInt128>,
ContainerType<UInt256>,
ContainerType<Int8>,
ContainerType<Int16>,
ContainerType<Int32>,
ContainerType<Int64>,
ContainerType<Int128>,
ContainerType<Int256>,
ContainerType<Decimal32>,
ContainerType<Decimal64>,
|
||||
ContainerType<Decimal128>,
|
||||
ContainerType<Decimal256>,
|
||||
ContainerType<DateTime64>,
|
||||
ContainerType<Float32>,
|
||||
ContainerType<Float64>,
|
||||
ContainerType<UUID>>;
|
||||
|
||||
struct KeyAttribute final
|
||||
{
|
||||
RangeStorageTypeContainer<KeyAttributeContainerType> container;
|
||||
|
||||
RangeStorageTypeContainer<InvalidIntervalsContainerType> invalid_intervals_container;
|
||||
|
||||
};
|
||||
|
||||
void createAttributes();
|
||||
@ -151,43 +214,31 @@ private:
|
||||
ValueSetter && set_value,
|
||||
DefaultValueExtractor & default_value_extractor) const;
|
||||
|
||||
ColumnPtr getColumnInternal(
|
||||
const std::string & attribute_name,
|
||||
const DataTypePtr & result_type,
|
||||
const PaddedPODArray<UInt64> & key_to_index) const;
|
||||
|
||||
template <typename AttributeType, bool is_nullable, typename ValueSetter>
|
||||
void getItemsInternalImpl(
|
||||
const Attribute & attribute,
|
||||
const PaddedPODArray<UInt64> & key_to_index,
|
||||
ValueSetter && set_value) const;
|
||||
|
||||
void updateData();
|
||||
|
||||
void blockToAttributes(const Block & block);
|
||||
|
||||
void buildAttributeIntervalTrees();
|
||||
|
||||
template <typename T>
|
||||
void setAttributeValueImpl(Attribute & attribute, KeyType key, const RangeInterval & interval, const Field & value);
|
||||
|
||||
void setAttributeValue(Attribute & attribute, KeyType key, const RangeInterval & interval, const Field & value);
|
||||
|
||||
template <typename RangeType>
|
||||
void getKeysAndDates(
|
||||
PaddedPODArray<KeyType> & keys,
|
||||
PaddedPODArray<RangeType> & start_dates,
|
||||
PaddedPODArray<RangeType> & end_dates) const;
|
||||
|
||||
template <typename T, typename RangeType>
|
||||
void getKeysAndDates(
|
||||
const Attribute & attribute,
|
||||
PaddedPODArray<KeyType> & keys,
|
||||
PaddedPODArray<RangeType> & start_dates,
|
||||
PaddedPODArray<RangeType> & end_dates) const;
|
||||
|
||||
template <typename RangeType>
|
||||
PaddedPODArray<Int64> makeDateKeys(
|
||||
const PaddedPODArray<RangeType> & block_start_dates,
|
||||
const PaddedPODArray<RangeType> & block_end_dates) const;
|
||||
void setAttributeValue(Attribute & attribute, const Field & value);
|
||||
|
||||
const DictionaryStructure dict_struct;
|
||||
const DictionarySourcePtr source_ptr;
|
||||
const DictionaryLifetime dict_lifetime;
|
||||
const bool require_nonempty;
|
||||
const RangeHashedDictionaryConfiguration configuration;
|
||||
BlockPtr update_field_loaded_block;
|
||||
|
||||
std::vector<Attribute> attributes;
|
||||
Arena complex_key_arena;
|
||||
KeyAttribute key_attribute;
|
||||
|
||||
size_t bytes_allocated = 0;
|
||||
size_t element_count = 0;
|
||||
@ -195,7 +246,6 @@ private:
|
||||
mutable std::atomic<size_t> query_count{0};
|
||||
mutable std::atomic<size_t> found_count{0};
|
||||
Arena string_arena;
|
||||
NoAttributesCollectionType no_attributes_container;
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -29,6 +29,7 @@ namespace ErrorCodes
|
||||
extern const int CANNOT_TRUNCATE_FILE;
|
||||
extern const int CANNOT_UNLINK;
|
||||
extern const int CANNOT_RMDIR;
|
||||
extern const int BAD_ARGUMENTS;
|
||||
}
|
||||
|
||||
std::mutex DiskLocal::reservation_mutex;
|
||||
@ -458,10 +459,16 @@ void registerDiskLocal(DiskFactory & factory)
|
||||
const Poco::Util::AbstractConfiguration & config,
|
||||
const String & config_prefix,
|
||||
ContextPtr context,
|
||||
const DisksMap & /*map*/) -> DiskPtr {
|
||||
const DisksMap & map) -> DiskPtr {
|
||||
String path;
|
||||
UInt64 keep_free_space_bytes;
|
||||
loadDiskLocalConfig(name, config, config_prefix, context, path, keep_free_space_bytes);
|
||||
|
||||
for (const auto & [disk_name, disk_ptr] : map)
|
||||
{
|
||||
if (path == disk_ptr->getPath())
|
||||
throw Exception("Disk " + name + " and Disk " + disk_name + " cannot have the same path" + " (" + path + ")", ErrorCodes::BAD_ARGUMENTS);
|
||||
}
|
||||
return std::make_shared<DiskLocal>(name, path, keep_free_space_bytes);
|
||||
};
|
||||
factory.registerDiskType("local", creator);
|
||||
|
@ -80,6 +80,10 @@ endif()
|
||||
|
||||
target_link_libraries(clickhouse_functions PRIVATE ch_contrib::lz4)
|
||||
|
||||
if (ENABLE_NLP)
|
||||
target_link_libraries(clickhouse_functions PRIVATE ch_contrib::cld2)
|
||||
endif()
|
||||
|
||||
if (TARGET ch_contrib::h3)
|
||||
target_link_libraries (clickhouse_functions PRIVATE ch_contrib::h3)
|
||||
endif()
|
||||
|
142
src/Functions/FunctionsCharsetClassification.cpp
Normal file
@ -0,0 +1,142 @@
|
||||
#include <Common/FrequencyHolder.h>
|
||||
#include <Functions/FunctionFactory.h>
|
||||
#include <Functions/FunctionsTextClassification.h>
|
||||
|
||||
#include <memory>
|
||||
#include <unordered_map>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
/* Determine language and charset of text data. For each text, we build the distribution of byte bigrams.
|
||||
* Then we use marked-up dictionaries with byte-bigram distributions of various languages and charsets.
|
||||
* Using a naive Bayesian classifier, we find the most likely charset or language and return it.
|
||||
*/
|
||||
|
||||
template <bool detect_language>
|
||||
struct CharsetClassificationImpl
|
||||
{
|
||||
/* We need to solve the zero-frequency problem for the Naive Bayes classifier.
|
||||
* If a bigram from the text is not found in the dictionary, we assume that the probability of encountering it is 1e-06.
|
||||
* 1e-06 is the minimal value in our marked-up dictionary.
|
||||
*/
|
||||
static constexpr Float64 zero_frequency = 1e-06;
|
||||
|
||||
/// If the data size is bigger than this, behaviour is unspecified for this function.
|
||||
static constexpr size_t max_string_size = 1u << 15;
|
||||
|
||||
static ALWAYS_INLINE inline Float64 naiveBayes(
|
||||
const FrequencyHolder::EncodingMap & standard,
|
||||
const HashMap<UInt16, UInt64> & model,
|
||||
Float64 max_result)
|
||||
{
|
||||
Float64 res = 0;
|
||||
for (const auto & el : model)
|
||||
{
|
||||
/// Try to find bigram in the dictionary.
|
||||
const auto * it = standard.find(el.getKey());
|
||||
if (it != standard.end())
|
||||
{
|
||||
res += el.getMapped() * log(it->getMapped());
|
||||
} else
|
||||
{
|
||||
res += el.getMapped() * log(zero_frequency);
|
||||
}
|
||||
/// If at some step the result has become less than the current maximum, then it makes no sense to count it fully.
|
||||
if (res < max_result)
|
||||
{
|
||||
return res;
|
||||
}
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
/// Count how many times each bigram occurs in the text.
|
||||
static ALWAYS_INLINE inline void calculateStats(
|
||||
const UInt8 * data,
|
||||
const size_t size,
|
||||
HashMap<UInt16, UInt64> & model)
|
||||
{
|
||||
UInt16 hash = 0;
|
||||
for (size_t i = 0; i < size; ++i)
|
||||
{
|
||||
hash <<= 8;
|
||||
hash += *(data + i);
|
||||
++model[hash];
|
||||
}
|
||||
}
|
||||
|
||||
static void vector(
|
||||
const ColumnString::Chars & data,
|
||||
const ColumnString::Offsets & offsets,
|
||||
ColumnString::Chars & res_data,
|
||||
ColumnString::Offsets & res_offsets)
|
||||
{
|
||||
const auto & encodings_freq = FrequencyHolder::getInstance().getEncodingsFrequency();
|
||||
|
||||
if (detect_language)
|
||||
/// 2 chars for ISO code + 1 zero byte
|
||||
res_data.reserve(offsets.size() * 3);
|
||||
else
|
||||
/// Mean charset length is 8
|
||||
res_data.reserve(offsets.size() * 8);
|
||||
|
||||
res_offsets.resize(offsets.size());
|
||||
|
||||
size_t res_offset = 0;
|
||||
|
||||
for (size_t i = 0; i < offsets.size(); ++i)
|
||||
{
|
||||
const UInt8 * str = data.data() + offsets[i - 1];
|
||||
const size_t str_len = offsets[i] - offsets[i - 1] - 1;
|
||||
|
||||
std::string_view res;
|
||||
|
||||
HashMap<UInt16, UInt64> model;
|
||||
calculateStats(str, str_len, model);
|
||||
|
||||
/// Go through the dictionary and find the charset with the highest weight
|
||||
Float64 max_result = log(zero_frequency) * (max_string_size);
|
||||
for (const auto & item : encodings_freq)
|
||||
{
|
||||
Float64 score = naiveBayes(item.map, model, max_result);
|
||||
if (max_result < score)
|
||||
{
|
||||
max_result = score;
|
||||
res = detect_language ? item.lang : item.name;
|
||||
}
|
||||
}
|
||||
|
||||
res_data.resize(res_offset + res.size() + 1);
|
||||
memcpy(&res_data[res_offset], res.data(), res.size());
|
||||
|
||||
res_data[res_offset + res.size()] = 0;
|
||||
res_offset += res.size() + 1;
|
||||
|
||||
res_offsets[i] = res_offset;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
struct NameDetectCharset
|
||||
{
|
||||
static constexpr auto name = "detectCharset";
|
||||
};
|
||||
|
||||
struct NameDetectLanguageUnknown
|
||||
{
|
||||
static constexpr auto name = "detectLanguageUnknown";
|
||||
};
|
||||
|
||||
|
||||
using FunctionDetectCharset = FunctionTextClassificationString<CharsetClassificationImpl<false>, NameDetectCharset>;
|
||||
using FunctionDetectLanguageUnknown = FunctionTextClassificationString<CharsetClassificationImpl<true>, NameDetectLanguageUnknown>;
|
||||
|
||||
void registerFunctionDetectCharset(FunctionFactory & factory)
|
||||
{
|
||||
factory.registerFunction<FunctionDetectCharset>();
|
||||
factory.registerFunction<FunctionDetectLanguageUnknown>();
|
||||
}
|
||||
|
||||
}
|
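The charset/language classifier added in this file builds a histogram of byte bigrams per input string and scores every candidate with a sum of count * log(frequency), falling back to the 1e-06 constant for bigrams missing from the dictionary; the best-scoring candidate wins. A minimal self-contained sketch of that scoring step, using hypothetical names and plain STL containers instead of ClickHouse's HashMap (an illustration, not code from this commit):

#include <cmath>
#include <cstdint>
#include <string>
#include <unordered_map>

/// Histogram of byte bigrams: the high byte of the key is the previous character, the low byte the current one.
std::unordered_map<uint16_t, uint64_t> bigramHistogram(const std::string & text)
{
    std::unordered_map<uint16_t, uint64_t> model;
    uint16_t hash = 0;
    for (unsigned char c : text)
    {
        hash = static_cast<uint16_t>((hash << 8) | c);
        ++model[hash];
    }
    return model;
}

/// Log-likelihood of the text under one candidate's bigram frequency table,
/// with a small constant for bigrams missing from the table (zero-frequency smoothing).
double naiveBayesScore(
    const std::unordered_map<uint16_t, double> & candidate_freq,
    const std::unordered_map<uint16_t, uint64_t> & model,
    double zero_frequency = 1e-6)
{
    double res = 0;
    for (const auto & [bigram, count] : model)
    {
        auto it = candidate_freq.find(bigram);
        res += count * std::log(it != candidate_freq.end() ? it->second : zero_frequency);
    }
    return res;
}

The committed naiveBayes additionally stops summing as soon as the running score falls below the best score seen so far, which is sound because each per-bigram term is a log-probability and therefore non-positive.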
@ -1772,6 +1772,12 @@ private:
|
||||
}
|
||||
}
|
||||
|
||||
if constexpr (std::is_same_v<ToDataType, DataTypeString>)
|
||||
{
|
||||
if (from_type->getCustomSerialization())
|
||||
return ConvertImplGenericToString<ColumnString>::execute(arguments, result_type, input_rows_count);
|
||||
}
|
||||
|
||||
bool done;
|
||||
if constexpr (to_string_or_fixed_string)
|
||||
{
|
||||
@ -3409,7 +3415,7 @@ private:
|
||||
return false;
|
||||
};
|
||||
|
||||
auto make_custom_serialization_wrapper = [&](const auto & types) -> bool
|
||||
auto make_custom_serialization_wrapper = [&](const auto & types) -> bool
|
||||
{
|
||||
using Types = std::decay_t<decltype(types)>;
|
||||
using ToDataType = typename Types::RightType;
|
||||
|
231
src/Functions/FunctionsLanguageClassification.cpp
Normal file
@ -0,0 +1,231 @@
|
||||
#include "config_functions.h"
|
||||
|
||||
#if USE_NLP
|
||||
|
||||
#include <Columns/ColumnMap.h>
|
||||
#include <Columns/ColumnArray.h>
|
||||
#include <Columns/ColumnString.h>
|
||||
#include <Columns/ColumnsNumber.h>
|
||||
#include <Common/isValidUTF8.h>
|
||||
#include <DataTypes/DataTypeMap.h>
|
||||
#include <DataTypes/DataTypeString.h>
|
||||
#include <DataTypes/DataTypeTuple.h>
|
||||
#include <DataTypes/DataTypesNumber.h>
|
||||
#include <Functions/FunctionHelpers.h>
|
||||
#include <Functions/FunctionFactory.h>
|
||||
#include <Functions/FunctionsTextClassification.h>
|
||||
#include <Interpreters/Context.h>
|
||||
|
||||
#include <compact_lang_det.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
/* Determine language of Unicode UTF-8 text.
|
||||
* Uses the cld2 library https://github.com/CLD2Owners/cld2
|
||||
*/
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
|
||||
extern const int ILLEGAL_COLUMN;
|
||||
extern const int SUPPORT_IS_DISABLED;
|
||||
}
|
||||
|
||||
struct FunctionDetectLanguageImpl
|
||||
{
|
||||
static ALWAYS_INLINE inline std::string_view codeISO(std::string_view code_string)
|
||||
{
|
||||
if (code_string.ends_with("-Latn"))
|
||||
code_string.remove_suffix(code_string.size() - 5);
|
||||
|
||||
if (code_string.ends_with("-Hant"))
|
||||
code_string.remove_suffix(code_string.size() - 5);
|
||||
|
||||
// Old deprecated codes
|
||||
if (code_string == "iw")
|
||||
return "he";
|
||||
|
||||
if (code_string == "jw")
|
||||
return "jv";
|
||||
|
||||
if (code_string == "in")
|
||||
return "id";
|
||||
|
||||
if (code_string == "mo")
|
||||
return "ro";
|
||||
|
||||
// Some languages do not have 2 letter codes, for example code for Cebuano is ceb
|
||||
if (code_string.size() != 2)
|
||||
return "other";
|
||||
|
||||
return code_string;
|
||||
}
|
||||
|
||||
static void vector(
|
||||
const ColumnString::Chars & data,
|
||||
const ColumnString::Offsets & offsets,
|
||||
ColumnString::Chars & res_data,
|
||||
ColumnString::Offsets & res_offsets)
|
||||
{
|
||||
/// Constant 3 is based on the fact that in general we need 2 characters for ISO code + 1 zero byte
|
||||
res_data.reserve(offsets.size() * 3);
|
||||
res_offsets.resize(offsets.size());
|
||||
|
||||
bool is_reliable;
|
||||
size_t res_offset = 0;
|
||||
|
||||
for (size_t i = 0; i < offsets.size(); ++i)
|
||||
{
|
||||
const UInt8 * str = data.data() + offsets[i - 1];
|
||||
const size_t str_len = offsets[i] - offsets[i - 1] - 1;
|
||||
|
||||
std::string_view res;
|
||||
|
||||
if (UTF8::isValidUTF8(str, str_len))
|
||||
{
|
||||
auto lang = CLD2::DetectLanguage(reinterpret_cast<const char *>(str), str_len, true, &is_reliable);
|
||||
res = codeISO(LanguageCode(lang));
|
||||
}
|
||||
else
|
||||
{
|
||||
res = "un";
|
||||
}
|
||||
|
||||
res_data.resize(res_offset + res.size() + 1);
|
||||
memcpy(&res_data[res_offset], res.data(), res.size());
|
||||
|
||||
res_data[res_offset + res.size()] = 0;
|
||||
res_offset += res.size() + 1;
|
||||
|
||||
res_offsets[i] = res_offset;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
class FunctionDetectLanguageMixed : public IFunction
|
||||
{
|
||||
public:
|
||||
static constexpr auto name = "detectLanguageMixed";
|
||||
|
||||
/// Number of top results
|
||||
static constexpr auto top_N = 3;
|
||||
|
||||
static FunctionPtr create(ContextPtr context)
|
||||
{
|
||||
if (!context->getSettingsRef().allow_experimental_nlp_functions)
|
||||
throw Exception(ErrorCodes::SUPPORT_IS_DISABLED,
|
||||
"Natural language processing function '{}' is experimental. Set `allow_experimental_nlp_functions` setting to enable it", name);
|
||||
|
||||
return std::make_shared<FunctionDetectLanguageMixed>();
|
||||
}
|
||||
|
||||
String getName() const override { return name; }
|
||||
|
||||
size_t getNumberOfArguments() const override { return 1; }
|
||||
|
||||
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
|
||||
|
||||
bool useDefaultImplementationForConstants() const override { return true; }
|
||||
|
||||
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
|
||||
{
|
||||
if (!isString(arguments[0]))
|
||||
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
|
||||
"Illegal type {} of argument of function {}. Must be String.",
|
||||
arguments[0]->getName(), getName());
|
||||
|
||||
return std::make_shared<DataTypeMap>(std::make_shared<DataTypeString>(), std::make_shared<DataTypeFloat32>());
|
||||
}
|
||||
|
||||
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override
|
||||
{
|
||||
const auto & column = arguments[0].column;
|
||||
const ColumnString * col = checkAndGetColumn<ColumnString>(column.get());
|
||||
|
||||
if (!col)
|
||||
throw Exception(
|
||||
"Illegal columns " + arguments[0].column->getName() + " of arguments of function " + getName(),
|
||||
ErrorCodes::ILLEGAL_COLUMN);
|
||||
|
||||
const auto & input_data = col->getChars();
|
||||
const auto & input_offsets = col->getOffsets();
|
||||
|
||||
/// Create and fill the result map.
|
||||
|
||||
const auto & result_type_map = static_cast<const DataTypeMap &>(*result_type);
|
||||
const DataTypePtr & key_type = result_type_map.getKeyType();
|
||||
const DataTypePtr & value_type = result_type_map.getValueType();
|
||||
|
||||
MutableColumnPtr keys_data = key_type->createColumn();
|
||||
MutableColumnPtr values_data = value_type->createColumn();
|
||||
MutableColumnPtr offsets = DataTypeNumber<IColumn::Offset>().createColumn();
|
||||
|
||||
size_t total_elements = input_rows_count * top_N;
|
||||
keys_data->reserve(total_elements);
|
||||
values_data->reserve(total_elements);
|
||||
offsets->reserve(input_rows_count);
|
||||
|
||||
bool is_reliable;
|
||||
CLD2::Language result_lang_top3[top_N];
|
||||
int32_t pc[top_N];
|
||||
int bytes[top_N];
|
||||
|
||||
IColumn::Offset current_offset = 0;
|
||||
for (size_t i = 0; i < input_rows_count; ++i)
|
||||
{
|
||||
const UInt8 * str = input_data.data() + input_offsets[i - 1];
|
||||
const size_t str_len = input_offsets[i] - input_offsets[i - 1] - 1;
|
||||
|
||||
if (UTF8::isValidUTF8(str, str_len))
|
||||
{
|
||||
CLD2::DetectLanguageSummary(reinterpret_cast<const char *>(str), str_len, true, result_lang_top3, pc, bytes, &is_reliable);
|
||||
|
||||
for (size_t j = 0; j < top_N; ++j)
|
||||
{
|
||||
if (pc[j] == 0)
|
||||
break;
|
||||
|
||||
auto res_str = FunctionDetectLanguageImpl::codeISO(LanguageCode(result_lang_top3[j]));
|
||||
Float32 res_float = static_cast<Float32>(pc[j]) / 100;
|
||||
|
||||
keys_data->insertData(res_str.data(), res_str.size());
|
||||
values_data->insertData(reinterpret_cast<const char *>(&res_float), sizeof(res_float));
|
||||
++current_offset;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::string_view res_str = "un";
|
||||
Float32 res_float = 0;
|
||||
|
||||
keys_data->insertData(res_str.data(), res_str.size());
|
||||
values_data->insertData(reinterpret_cast<const char *>(&res_float), sizeof(res_float));
|
||||
++current_offset;
|
||||
}
|
||||
offsets->insert(current_offset);
|
||||
}
|
||||
|
||||
auto nested_column = ColumnArray::create(
|
||||
ColumnTuple::create(Columns{std::move(keys_data), std::move(values_data)}),
|
||||
std::move(offsets));
|
||||
|
||||
return ColumnMap::create(nested_column);
|
||||
}
|
||||
};
|
||||
|
||||
struct NameDetectLanguage
|
||||
{
|
||||
static constexpr auto name = "detectLanguage";
|
||||
};
|
||||
|
||||
|
||||
using FunctionDetectLanguage = FunctionTextClassificationString<FunctionDetectLanguageImpl, NameDetectLanguage>;
|
||||
|
||||
void registerFunctionsDetectLanguage(FunctionFactory & factory)
|
||||
{
|
||||
factory.registerFunction<FunctionDetectLanguage>();
|
||||
factory.registerFunction<FunctionDetectLanguageMixed>();
|
||||
}
|
||||
|
||||
}
|
||||
#endif
|
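detectLanguage and detectLanguageMixed report ISO-639-1 codes, so the codeISO helper above normalizes what cld2 returns: script suffixes such as -Latn/-Hant are cut off, a few deprecated two-letter codes are mapped to their modern equivalents, anything that is not a plain two-letter code becomes "other", and invalid UTF-8 input is reported as "un". A small sketch of just the deprecated-code mapping, with a hypothetical function name and no cld2 dependency (illustration only):

#include <string_view>

/// Map a few deprecated ISO-639-1 codes to their current equivalents;
/// codes that are not exactly two letters long are reported as "other".
std::string_view normalizeLanguageCode(std::string_view code)
{
    if (code == "iw")
        return "he";   /// Hebrew
    if (code == "jw")
        return "jv";   /// Javanese
    if (code == "in")
        return "id";   /// Indonesian
    if (code == "mo")
        return "ro";   /// Moldavian -> Romanian
    if (code.size() != 2)
        return "other";
    return code;
}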
120
src/Functions/FunctionsProgrammingClassification.cpp
Normal file
@ -0,0 +1,120 @@
|
||||
#include <Common/FrequencyHolder.h>
|
||||
#include <Common/StringUtils/StringUtils.h>
|
||||
#include <Functions/FunctionFactory.h>
|
||||
#include <Functions/FunctionsTextClassification.h>
|
||||
|
||||
#include <unordered_map>
|
||||
#include <string_view>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
/**
|
||||
* Determine the programming language from the source code.
|
||||
* We calculate all the unigrams and bigrams of commands in the source code.
|
||||
* Then, using a marked-up dictionary with weights of command unigrams and bigrams for various programming languages,
|
||||
* we find the programming language with the highest total weight and return it.
|
||||
*/
|
||||
struct FunctionDetectProgrammingLanguageImpl
|
||||
{
|
||||
/// Calculate total weight
|
||||
static ALWAYS_INLINE inline Float64 stateMachine(
|
||||
const FrequencyHolder::Map & standard,
|
||||
const std::unordered_map<String, Float64> & model)
|
||||
{
|
||||
Float64 res = 0;
|
||||
for (const auto & el : model)
|
||||
{
|
||||
/// Try to find each n-gram in dictionary
|
||||
const auto * it = standard.find(el.first);
|
||||
if (it != standard.end())
|
||||
res += el.second * it->getMapped();
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
static void vector(
|
||||
const ColumnString::Chars & data,
|
||||
const ColumnString::Offsets & offsets,
|
||||
ColumnString::Chars & res_data,
|
||||
ColumnString::Offsets & res_offsets)
|
||||
{
|
||||
const auto & programming_freq = FrequencyHolder::getInstance().getProgrammingFrequency();
|
||||
|
||||
/// Constant 5 is arbitrary
|
||||
res_data.reserve(offsets.size() * 5);
|
||||
res_offsets.resize(offsets.size());
|
||||
|
||||
size_t res_offset = 0;
|
||||
|
||||
for (size_t i = 0; i < offsets.size(); ++i)
|
||||
{
|
||||
const UInt8 * str = data.data() + offsets[i - 1];
|
||||
const size_t str_len = offsets[i] - offsets[i - 1] - 1;
|
||||
|
||||
std::unordered_map<String, Float64> data_freq;
|
||||
StringRef prev_command;
|
||||
StringRef command;
|
||||
|
||||
/// Select all commands from the string
|
||||
for (size_t ind = 0; ind < str_len; ++ind)
|
||||
{
|
||||
/// Assume that all commands are split by spaces
|
||||
if (isWhitespaceASCII(str[ind]))
|
||||
continue;
|
||||
|
||||
size_t prev_ind = ind;
|
||||
while (ind < str_len && !isWhitespaceASCII(str[ind]))
|
||||
++ind;
|
||||
|
||||
command = {str + prev_ind, ind - prev_ind};
|
||||
|
||||
/// We add both unigrams and bigrams to later search for them in the dictionary
|
||||
if (prev_command.data)
|
||||
data_freq[prev_command.toString() + command.toString()] += 1;
|
||||
|
||||
data_freq[command.toString()] += 1;
|
||||
prev_command = command;
|
||||
}
|
||||
|
||||
std::string_view res;
|
||||
Float64 max_result = 0;
|
||||
/// Iterate over all programming languages and find the language with the highest weight
|
||||
for (const auto & item : programming_freq)
|
||||
{
|
||||
Float64 result = stateMachine(item.map, data_freq);
|
||||
if (result > max_result)
|
||||
{
|
||||
max_result = result;
|
||||
res = item.name;
|
||||
}
|
||||
}
|
||||
/// If all weights are zero, then we assume that the language is undefined
|
||||
if (res.empty())
|
||||
res = "Undefined";
|
||||
|
||||
res_data.resize(res_offset + res.size() + 1);
|
||||
memcpy(&res_data[res_offset], res.data(), res.size());
|
||||
|
||||
res_data[res_offset + res.size()] = 0;
|
||||
res_offset += res.size() + 1;
|
||||
|
||||
res_offsets[i] = res_offset;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
struct NameDetectProgrammingLanguage
|
||||
{
|
||||
static constexpr auto name = "detectProgrammingLanguage";
|
||||
};
|
||||
|
||||
|
||||
using FunctionDetectProgrammingLanguage = FunctionTextClassificationString<FunctionDetectProgrammingLanguageImpl, NameDetectProgrammingLanguage>;
|
||||
|
||||
void registerFunctionDetectProgrammingLanguage(FunctionFactory & factory)
|
||||
{
|
||||
factory.registerFunction<FunctionDetectProgrammingLanguage>();
|
||||
}
|
||||
|
||||
}
|
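The programming-language detector splits the source text on ASCII whitespace into "commands", counts both single commands and adjacent pairs, and scores each candidate language as the dot product of those counts with the language's dictionary weights; the highest-scoring language wins, or "Undefined" when every score is zero. A rough standalone sketch of the tokenizing and weighting steps, with hypothetical names and plain STL types (not code from this commit):

#include <sstream>
#include <string>
#include <unordered_map>

/// Count unigrams and bigrams of whitespace-separated tokens ("commands").
std::unordered_map<std::string, double> commandNGrams(const std::string & source)
{
    std::unordered_map<std::string, double> freq;
    std::istringstream in(source);
    std::string prev;
    std::string token;
    while (in >> token)
    {
        if (!prev.empty())
            freq[prev + token] += 1;   /// bigram of adjacent commands, concatenated as in the committed code
        freq[token] += 1;              /// unigram
        prev = token;
    }
    return freq;
}

/// Total weight of the text under one language's dictionary of n-gram weights.
double languageWeight(
    const std::unordered_map<std::string, double> & dictionary,
    const std::unordered_map<std::string, double> & freq)
{
    double res = 0;
    for (const auto & [ngram, count] : freq)
        if (auto it = dictionary.find(ngram); it != dictionary.end())
            res += count * it->second;
    return res;
}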
122
src/Functions/FunctionsTextClassification.h
Normal file
@ -0,0 +1,122 @@
|
||||
#pragma once
|
||||
|
||||
#include <Columns/ColumnString.h>
|
||||
#include <Columns/ColumnVector.h>
|
||||
#include <DataTypes/DataTypesNumber.h>
|
||||
#include <Functions/FunctionHelpers.h>
|
||||
#include <Functions/IFunction.h>
|
||||
#include <Interpreters/Context_fwd.h>
|
||||
#include <Functions/FunctionFactory.h>
|
||||
#include <Interpreters/Context.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
/// Functions for text classification with different result types
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
|
||||
extern const int ILLEGAL_COLUMN;
|
||||
extern const int SUPPORT_IS_DISABLED;
|
||||
}
|
||||
|
||||
template <typename Impl, typename Name>
|
||||
class FunctionTextClassificationString : public IFunction
|
||||
{
|
||||
public:
|
||||
static constexpr auto name = Name::name;
|
||||
|
||||
static FunctionPtr create(ContextPtr context)
|
||||
{
|
||||
if (!context->getSettingsRef().allow_experimental_nlp_functions)
|
||||
throw Exception(ErrorCodes::SUPPORT_IS_DISABLED,
|
||||
"Natural language processing function '{}' is experimental. Set `allow_experimental_nlp_functions` setting to enable it", name);
|
||||
|
||||
return std::make_shared<FunctionTextClassificationString>();
|
||||
}
|
||||
|
||||
String getName() const override { return name; }
|
||||
|
||||
size_t getNumberOfArguments() const override { return 1; }
|
||||
|
||||
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
|
||||
|
||||
bool useDefaultImplementationForConstants() const override { return true; }
|
||||
|
||||
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
|
||||
{
|
||||
if (!isString(arguments[0]))
|
||||
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
|
||||
"Illegal type {} of argument of function {}. Must be String.",
|
||||
arguments[0]->getName(), getName());
|
||||
|
||||
return arguments[0];
|
||||
}
|
||||
|
||||
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & /*result_type*/, size_t /*input_rows_count*/) const override
|
||||
{
|
||||
const ColumnPtr & column = arguments[0].column;
|
||||
const ColumnString * col = checkAndGetColumn<ColumnString>(column.get());
|
||||
|
||||
if (!col)
|
||||
throw Exception(
|
||||
"Illegal column " + arguments[0].column->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_COLUMN);
|
||||
|
||||
auto col_res = ColumnString::create();
|
||||
Impl::vector(col->getChars(), col->getOffsets(), col_res->getChars(), col_res->getOffsets());
|
||||
return col_res;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Impl, typename Name>
|
||||
class FunctionTextClassificationFloat : public IFunction
|
||||
{
|
||||
public:
|
||||
static constexpr auto name = Name::name;
|
||||
|
||||
static FunctionPtr create(ContextPtr context)
|
||||
{
|
||||
if (!context->getSettingsRef().allow_experimental_nlp_functions)
|
||||
throw Exception(ErrorCodes::SUPPORT_IS_DISABLED,
|
||||
"Natural language processing function '{}' is experimental. Set `allow_experimental_nlp_functions` setting to enable it", name);
|
||||
|
||||
return std::make_shared<FunctionTextClassificationFloat>();
|
||||
}
|
||||
|
||||
String getName() const override { return name; }
|
||||
|
||||
size_t getNumberOfArguments() const override { return 1; }
|
||||
|
||||
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
|
||||
|
||||
bool useDefaultImplementationForConstants() const override { return true; }
|
||||
|
||||
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
|
||||
{
|
||||
if (!isString(arguments[0]))
|
||||
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
|
||||
"Illegal type {} of argument of function {}. Must be String.",
|
||||
arguments[0]->getName(), getName());
|
||||
|
||||
return std::make_shared<DataTypeFloat32>();
|
||||
}
|
||||
|
||||
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & /*result_type*/, size_t /*input_rows_count*/) const override
|
||||
{
|
||||
const ColumnPtr & column = arguments[0].column;
|
||||
const ColumnString * col = checkAndGetColumn<ColumnString>(column.get());
|
||||
|
||||
if (!col)
|
||||
throw Exception(
|
||||
"Illegal column " + arguments[0].column->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_COLUMN);
|
||||
|
||||
auto col_res = ColumnVector<Float32>::create();
|
||||
ColumnVector<Float32>::Container & vec_res = col_res->getData();
|
||||
vec_res.resize(col->size());
|
||||
|
||||
Impl::vector(col->getChars(), col->getOffsets(), vec_res);
|
||||
return col_res;
|
||||
}
|
||||
};
|
||||
|
||||
}
|
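FunctionsTextClassification.h factors out everything the classification functions share: the allow_experimental_nlp_functions gate in create, the single String argument check, and the per-column dispatch, so a concrete function only supplies an Impl with a vector method plus a name struct. A hedged sketch of how a new classifier would plug in, with invented names and a trivial body that returns a constant string per row (illustration only; the real classifiers in this commit follow the same shape):

/// Hypothetical minimal classifier: returns the constant string "unknown" for every row,
/// using the same resize/memcpy/zero-terminator pattern as the real implementations.
struct DetectNothingImpl
{
    static void vector(
        const ColumnString::Chars & /*data*/,
        const ColumnString::Offsets & offsets,
        ColumnString::Chars & res_data,
        ColumnString::Offsets & res_offsets)
    {
        static constexpr std::string_view res = "unknown";

        res_data.reserve(offsets.size() * (res.size() + 1));
        res_offsets.resize(offsets.size());

        size_t res_offset = 0;
        for (size_t i = 0; i < offsets.size(); ++i)
        {
            res_data.resize(res_offset + res.size() + 1);
            memcpy(&res_data[res_offset], res.data(), res.size());
            res_data[res_offset + res.size()] = 0;
            res_offset += res.size() + 1;
            res_offsets[i] = res_offset;
        }
    }
};

struct NameDetectNothing
{
    static constexpr auto name = "detectNothing";
};

using FunctionDetectNothing = FunctionTextClassificationString<DetectNothingImpl, NameDetectNothing>;

void registerFunctionDetectNothing(FunctionFactory & factory)
{
    factory.registerFunction<FunctionDetectNothing>();
}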
89
src/Functions/FunctionsTonalityClassification.cpp
Normal file
@ -0,0 +1,89 @@
|
||||
#include <Common/FrequencyHolder.h>
|
||||
#include <Common/StringUtils/StringUtils.h>
|
||||
#include <Functions/FunctionFactory.h>
|
||||
#include <Functions/FunctionsTextClassification.h>
|
||||
|
||||
#include <unordered_map>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
/**
|
||||
* Determines the sentiment of text data.
|
||||
* Uses a marked-up sentiment dictionary in which each word has a tonality ranging from -12 to 6.
|
||||
* For each text, we calculate the average sentiment value of its words and return it in the range [-1, 1].
|
||||
*/
|
||||
struct FunctionDetectTonalityImpl
|
||||
{
|
||||
static ALWAYS_INLINE inline Float32 detectTonality(
|
||||
const UInt8 * str,
|
||||
const size_t str_len,
|
||||
const FrequencyHolder::Map & emotional_dict)
|
||||
{
|
||||
Float64 weight = 0;
|
||||
UInt64 count_words = 0;
|
||||
|
||||
String word;
|
||||
/// Select all Russian words from the string
|
||||
for (size_t ind = 0; ind < str_len; ++ind)
|
||||
{
|
||||
/// Split words by whitespaces and punctuation signs
|
||||
if (isWhitespaceASCII(str[ind]) || isPunctuationASCII(str[ind]))
|
||||
continue;
|
||||
|
||||
while (ind < str_len && !(isWhitespaceASCII(str[ind]) || isPunctuationASCII(str[ind])))
|
||||
{
|
||||
word.push_back(str[ind]);
|
||||
++ind;
|
||||
}
|
||||
/// Try to find a Russian word in the tonality dictionary
|
||||
const auto * it = emotional_dict.find(word);
|
||||
if (it != emotional_dict.end())
|
||||
{
|
||||
count_words += 1;
|
||||
weight += it->getMapped();
|
||||
}
|
||||
word.clear();
|
||||
}
|
||||
|
||||
if (!count_words)
|
||||
return 0;
|
||||
|
||||
/// Calculate average value of tonality.
|
||||
/// Convert values -12..6 to -1..1
|
||||
if (weight > 0)
|
||||
return weight / count_words / 6;
|
||||
else
|
||||
return weight / count_words / 12;
|
||||
}
|
||||
|
||||
static void vector(
|
||||
const ColumnString::Chars & data,
|
||||
const ColumnString::Offsets & offsets,
|
||||
PaddedPODArray<Float32> & res)
|
||||
{
|
||||
const auto & emotional_dict = FrequencyHolder::getInstance().getEmotionalDict();
|
||||
|
||||
size_t size = offsets.size();
|
||||
size_t prev_offset = 0;
|
||||
for (size_t i = 0; i < size; ++i)
|
||||
{
|
||||
res[i] = detectTonality(data.data() + prev_offset, offsets[i] - 1 - prev_offset, emotional_dict);
|
||||
prev_offset = offsets[i];
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
struct NameDetectTonality
|
||||
{
|
||||
static constexpr auto name = "detectTonality";
|
||||
};
|
||||
|
||||
using FunctionDetectTonality = FunctionTextClassificationFloat<FunctionDetectTonalityImpl, NameDetectTonality>;
|
||||
|
||||
void registerFunctionDetectTonality(FunctionFactory & factory)
|
||||
{
|
||||
factory.registerFunction<FunctionDetectTonality>();
|
||||
}
|
||||
|
||||
}
|
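The tonality score is the average dictionary weight of the recognized words, which lies in [-12, 6]; positive averages are divided by 6 and negative ones by 12 so that both extremes map onto [-1, 1]. For example, three matched words with weights +3, +6 and +3 average 4 and are reported as 4 / 6 ≈ 0.67, while an average of -6 becomes -6 / 12 = -0.5. A tiny sketch of that normalization with a hypothetical helper name (not part of this commit):

#include <cstdint>

/// Map the average word tonality, which lies in [-12, 6], onto [-1, 1]:
/// positive averages are scaled by 1/6, negative ones by 1/12.
float normalizeTonality(double total_weight, uint64_t count_words)
{
    if (count_words == 0)
        return 0.0f;

    double average = total_weight / count_words;
    return static_cast<float>(average > 0 ? average / 6 : average / 12);
}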
@ -9,5 +9,6 @@
|
||||
#cmakedefine01 USE_S2_GEOMETRY
|
||||
#cmakedefine01 USE_FASTOPS
|
||||
#cmakedefine01 USE_BLAKE3
|
||||
#cmakedefine01 USE_NLP
|
||||
#cmakedefine01 USE_HYPERSCAN
|
||||
|
||||
|
@ -39,6 +39,9 @@ void registerFunctionEncodeXMLComponent(FunctionFactory &);
|
||||
void registerFunctionDecodeXMLComponent(FunctionFactory &);
|
||||
void registerFunctionExtractTextFromHTML(FunctionFactory &);
|
||||
void registerFunctionToStringCutToZero(FunctionFactory &);
|
||||
void registerFunctionDetectCharset(FunctionFactory &);
|
||||
void registerFunctionDetectTonality(FunctionFactory &);
|
||||
void registerFunctionDetectProgrammingLanguage(FunctionFactory &);
|
||||
|
||||
#if USE_BASE64
|
||||
void registerFunctionBase64Encode(FunctionFactory &);
|
||||
@ -50,6 +53,7 @@ void registerFunctionTryBase64Decode(FunctionFactory &);
|
||||
void registerFunctionStem(FunctionFactory &);
|
||||
void registerFunctionSynonyms(FunctionFactory &);
|
||||
void registerFunctionLemmatize(FunctionFactory &);
|
||||
void registerFunctionsDetectLanguage(FunctionFactory &);
|
||||
#endif
|
||||
|
||||
#if USE_ICU
|
||||
@ -91,6 +95,9 @@ void registerFunctionsString(FunctionFactory & factory)
|
||||
registerFunctionDecodeXMLComponent(factory);
|
||||
registerFunctionExtractTextFromHTML(factory);
|
||||
registerFunctionToStringCutToZero(factory);
|
||||
registerFunctionDetectCharset(factory);
|
||||
registerFunctionDetectTonality(factory);
|
||||
registerFunctionDetectProgrammingLanguage(factory);
|
||||
|
||||
#if USE_BASE64
|
||||
registerFunctionBase64Encode(factory);
|
||||
@ -102,6 +109,7 @@ void registerFunctionsString(FunctionFactory & factory)
|
||||
registerFunctionStem(factory);
|
||||
registerFunctionSynonyms(factory);
|
||||
registerFunctionLemmatize(factory);
|
||||
registerFunctionsDetectLanguage(factory);
|
||||
#endif
|
||||
|
||||
#if USE_ICU
|
||||
|
@ -108,6 +108,9 @@ static NamesAndTypesList getColumnsList(const ASTExpressionList * columns_defini
|
||||
data_type_function->name = type_name_upper + " UNSIGNED";
|
||||
}
|
||||
|
||||
if (type_name_upper == "SET")
|
||||
data_type_function->arguments.reset();
|
||||
|
||||
/// Transforms MySQL ENUM's list of strings to ClickHouse string-integer pairs
|
||||
/// For example ENUM('a', 'b', 'c') -> ENUM('a'=1, 'b'=2, 'c'=3)
|
||||
/// Elements on a position further than 32767 are assigned negative values, starting with -32768.
|
||||
|
@ -40,7 +40,8 @@ TEST(MySQLCreateRewritten, ColumnsDataType)
|
||||
{"TINYINT", "Int8"}, {"SMALLINT", "Int16"}, {"MEDIUMINT", "Int32"}, {"INT", "Int32"},
|
||||
{"INTEGER", "Int32"}, {"BIGINT", "Int64"}, {"FLOAT", "Float32"}, {"DOUBLE", "Float64"},
|
||||
{"VARCHAR(10)", "String"}, {"CHAR(10)", "String"}, {"Date", "Date"}, {"DateTime", "DateTime"},
|
||||
{"TIMESTAMP", "DateTime"}, {"BOOLEAN", "Bool"}, {"BIT", "UInt64"}
|
||||
{"TIMESTAMP", "DateTime"}, {"BOOLEAN", "Bool"}, {"BIT", "UInt64"}, {"SET", "UInt64"},
|
||||
{"YEAR", "UInt16"}, {"TIME", "Int64"}, {"GEOMETRY", "String"}
|
||||
};
|
||||
|
||||
for (const auto & [test_type, mapped_type] : test_types)
|
||||
|
@ -69,7 +69,14 @@ void ReplaceQueryParameterVisitor::visitQueryParameter(ASTPtr & ast)
|
||||
" because it isn't parsed completely: only {} of {} bytes was parsed: {}",
|
||||
value, type_name, ast_param.name, read_buffer.count(), value.size(), value.substr(0, read_buffer.count()));
|
||||
|
||||
ast = addTypeConversionToAST(std::make_shared<ASTLiteral>(temp_column[0]), type_name);
|
||||
Field literal;
|
||||
/// If data type has custom serialization, we should use CAST from String,
|
||||
/// because CAST from field may not work correctly (for example for type IPv6).
|
||||
if (data_type->getCustomSerialization())
|
||||
literal = value;
|
||||
else
|
||||
literal = temp_column[0];
|
||||
ast = addTypeConversionToAST(std::make_shared<ASTLiteral>(literal), type_name);
|
||||
|
||||
/// Keep the original alias.
|
||||
ast->setAlias(alias);
|
||||
|
@ -19,6 +19,7 @@
|
||||
#include <base/range.h>
|
||||
#include <base/logger_useful.h>
|
||||
#include <Processors/Sources/MySQLSource.h>
|
||||
#include <boost/algorithm/string.hpp>
|
||||
|
||||
|
||||
namespace DB
|
||||
@ -145,8 +146,7 @@ namespace
|
||||
break;
|
||||
case ValueType::vtUInt64:
|
||||
{
|
||||
//we don't have enum enum_field_types definition in mysqlxx/Types.h, so we use literal values directly here.
|
||||
if (static_cast<int>(mysql_type) == 16)
|
||||
if (mysql_type == enum_field_types::MYSQL_TYPE_BIT)
|
||||
{
|
||||
size_t n = value.size();
|
||||
UInt64 val = 0UL;
|
||||
@ -175,9 +175,32 @@ namespace
|
||||
read_bytes_size += 4;
|
||||
break;
|
||||
case ValueType::vtInt64:
|
||||
assert_cast<ColumnInt64 &>(column).insertValue(value.getInt());
|
||||
read_bytes_size += 8;
|
||||
{
|
||||
if (mysql_type == enum_field_types::MYSQL_TYPE_TIME)
|
||||
{
|
||||
String time_str(value.data(), value.size());
|
||||
bool negative = time_str.starts_with("-");
|
||||
if (negative) time_str = time_str.substr(1);
|
||||
std::vector<String> hhmmss;
|
||||
boost::split(hhmmss, time_str, [](char c) { return c == ':'; });
|
||||
Int64 v = 0;
|
||||
if (hhmmss.size() == 3)
|
||||
{
|
||||
v = (std::stoi(hhmmss[0]) * 3600 + std::stoi(hhmmss[1]) * 60 + std::stold(hhmmss[2])) * 1000000;
|
||||
}
|
||||
else
|
||||
throw Exception("Unsupported value format", ErrorCodes::NOT_IMPLEMENTED);
|
||||
if (negative) v = -v;
|
||||
assert_cast<ColumnInt64 &>(column).insertValue(v);
|
||||
read_bytes_size += value.size();
|
||||
}
|
||||
else
|
||||
{
|
||||
assert_cast<ColumnInt64 &>(column).insertValue(value.getInt());
|
||||
read_bytes_size += 8;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case ValueType::vtFloat32:
|
||||
assert_cast<ColumnFloat32 &>(column).insertValue(value.getDouble());
|
||||
read_bytes_size += 4;
|
||||
|
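The new vtInt64 branch above converts MySQL TIME values, which arrive as text such as "-838:59:59.5", into a signed microsecond count: (hh * 3600 + mm * 60 + ss.fraction) * 1000000, negated when the literal starts with '-'. A standalone sketch of that conversion with a hypothetical function name (the committed code does the same with boost::split inside the row loop):

#include <cstdint>
#include <stdexcept>
#include <string>
#include <vector>

/// Parse a MySQL TIME literal like "-838:59:59.5" into signed microseconds.
int64_t mysqlTimeToMicroseconds(std::string time_str)
{
    bool negative = !time_str.empty() && time_str.front() == '-';
    if (negative)
        time_str = time_str.substr(1);

    /// Split on ':' into hours, minutes and (possibly fractional) seconds.
    std::vector<std::string> hhmmss;
    size_t start = 0;
    for (size_t pos = time_str.find(':'); ; pos = time_str.find(':', start))
    {
        hhmmss.push_back(time_str.substr(start, pos - start));
        if (pos == std::string::npos)
            break;
        start = pos + 1;
    }

    if (hhmmss.size() != 3)
        throw std::invalid_argument("Unsupported TIME format: " + time_str);

    int64_t v = static_cast<int64_t>(
        (std::stoi(hhmmss[0]) * 3600 + std::stoi(hhmmss[1]) * 60 + std::stold(hhmmss[2])) * 1000000);

    return negative ? -v : v;
}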
@ -1123,7 +1123,7 @@ bool ReplicatedMergeTreeQueue::addFuturePartIfNotCoveredByThem(const String & pa
|
||||
|
||||
if (isNotCoveredByFuturePartsImpl(entry, part_name, reject_reason, lock))
|
||||
{
|
||||
CurrentlyExecuting::setActualPartName(entry, part_name, *this);
|
||||
CurrentlyExecuting::setActualPartName(entry, part_name, *this, lock);
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -1375,7 +1375,8 @@ Int64 ReplicatedMergeTreeQueue::getCurrentMutationVersion(const String & partiti
|
||||
}
|
||||
|
||||
|
||||
ReplicatedMergeTreeQueue::CurrentlyExecuting::CurrentlyExecuting(const ReplicatedMergeTreeQueue::LogEntryPtr & entry_, ReplicatedMergeTreeQueue & queue_)
|
||||
ReplicatedMergeTreeQueue::CurrentlyExecuting::CurrentlyExecuting(
|
||||
const ReplicatedMergeTreeQueue::LogEntryPtr & entry_, ReplicatedMergeTreeQueue & queue_, std::lock_guard<std::mutex> & /* state_lock */)
|
||||
: entry(entry_), queue(queue_)
|
||||
{
|
||||
if (entry->type == ReplicatedMergeTreeLogEntry::DROP_RANGE || entry->type == ReplicatedMergeTreeLogEntry::REPLACE_RANGE)
|
||||
@ -1397,8 +1398,11 @@ ReplicatedMergeTreeQueue::CurrentlyExecuting::CurrentlyExecuting(const Replicate
|
||||
}
|
||||
|
||||
|
||||
void ReplicatedMergeTreeQueue::CurrentlyExecuting::setActualPartName(ReplicatedMergeTreeQueue::LogEntry & entry,
|
||||
const String & actual_part_name, ReplicatedMergeTreeQueue & queue)
|
||||
void ReplicatedMergeTreeQueue::CurrentlyExecuting::setActualPartName(
|
||||
ReplicatedMergeTreeQueue::LogEntry & entry,
|
||||
const String & actual_part_name,
|
||||
ReplicatedMergeTreeQueue & queue,
|
||||
std::lock_guard<std::mutex> & /* state_lock */)
|
||||
{
|
||||
if (!entry.actual_new_part_name.empty())
|
||||
throw Exception("Entry actual part isn't empty yet. This is a bug.", ErrorCodes::LOGICAL_ERROR);
|
||||
@ -1477,7 +1481,7 @@ ReplicatedMergeTreeQueue::SelectedEntryPtr ReplicatedMergeTreeQueue::selectEntry
|
||||
}
|
||||
|
||||
if (entry)
|
||||
return std::make_shared<SelectedEntry>(entry, std::unique_ptr<CurrentlyExecuting>{ new CurrentlyExecuting(entry, *this) });
|
||||
return std::make_shared<SelectedEntry>(entry, std::unique_ptr<CurrentlyExecuting>{new CurrentlyExecuting(entry, *this, lock)});
|
||||
else
|
||||
return {};
|
||||
}
|
||||
|
@ -251,11 +251,18 @@ private:
|
||||
friend class ReplicatedMergeTreeQueue;
|
||||
|
||||
/// Created only in the selectEntryToProcess function. It is called under mutex.
|
||||
CurrentlyExecuting(const ReplicatedMergeTreeQueue::LogEntryPtr & entry_, ReplicatedMergeTreeQueue & queue_);
|
||||
CurrentlyExecuting(
|
||||
const ReplicatedMergeTreeQueue::LogEntryPtr & entry_,
|
||||
ReplicatedMergeTreeQueue & queue_,
|
||||
std::lock_guard<std::mutex> & state_lock);
|
||||
|
||||
/// In case of fetch, we determine actual part during the execution, so we need to update entry. It is called under state_mutex.
|
||||
static void setActualPartName(ReplicatedMergeTreeQueue::LogEntry & entry, const String & actual_part_name,
|
||||
ReplicatedMergeTreeQueue & queue);
|
||||
static void setActualPartName(
|
||||
ReplicatedMergeTreeQueue::LogEntry & entry,
|
||||
const String & actual_part_name,
|
||||
ReplicatedMergeTreeQueue & queue,
|
||||
std::lock_guard<std::mutex> & state_lock);
|
||||
|
||||
public:
|
||||
~CurrentlyExecuting();
|
||||
};
|
||||
|
@ -5,36 +5,66 @@ import json
|
||||
import logging
|
||||
import sys
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
import requests
|
||||
import requests # type: ignore
|
||||
|
||||
from ci_config import CI_CONFIG
|
||||
|
||||
DOWNLOAD_RETRIES_COUNT = 5
|
||||
|
||||
|
||||
def get_with_retries(
|
||||
url: str,
|
||||
retries: int = DOWNLOAD_RETRIES_COUNT,
|
||||
sleep: int = 3,
|
||||
**kwargs,
|
||||
) -> requests.Response:
|
||||
logging.info("Getting URL with %i and sleep %i in between: %s", retries, sleep, url)
|
||||
exc = None # type: Optional[Exception]
|
||||
for i in range(DOWNLOAD_RETRIES_COUNT):
|
||||
try:
|
||||
response = requests.get(url, **kwargs)
|
||||
response.raise_for_status()
|
||||
break
|
||||
except Exception as e:
|
||||
if i + 1 < DOWNLOAD_RETRIES_COUNT:
|
||||
logging.info("Exception '%s' while getting, retry %i", e, i + 1)
|
||||
time.sleep(sleep)
|
||||
|
||||
exc = e
|
||||
else:
|
||||
raise Exception(exc)
|
||||
|
||||
return response
|
||||
|
||||
|
||||
def get_build_name_for_check(check_name):
|
||||
return CI_CONFIG['tests_config'][check_name]['required_build']
|
||||
return CI_CONFIG["tests_config"][check_name]["required_build"]
|
||||
|
||||
|
||||
def get_build_urls(build_name, reports_path):
|
||||
for root, _, files in os.walk(reports_path):
|
||||
for f in files:
|
||||
if build_name in f :
|
||||
if build_name in f:
|
||||
logging.info("Found build report json %s", f)
|
||||
with open(os.path.join(root, f), 'r', encoding='utf-8') as file_handler:
|
||||
with open(os.path.join(root, f), "r", encoding="utf-8") as file_handler:
|
||||
build_report = json.load(file_handler)
|
||||
return build_report['build_urls']
|
||||
return build_report["build_urls"]
|
||||
return []
|
||||
|
||||
|
||||
def dowload_build_with_progress(url, path):
|
||||
logging.info("Downloading from %s to temp path %s", url, path)
|
||||
for i in range(DOWNLOAD_RETRIES_COUNT):
|
||||
try:
|
||||
with open(path, 'wb') as f:
|
||||
response = requests.get(url, stream=True)
|
||||
response.raise_for_status()
|
||||
total_length = response.headers.get('content-length')
|
||||
with open(path, "wb") as f:
|
||||
response = get_with_retries(url, retries=1, stream=True)
|
||||
total_length = response.headers.get("content-length")
|
||||
if total_length is None or int(total_length) == 0:
|
||||
logging.info("No content-length, will download file without progress")
|
||||
logging.info(
|
||||
"No content-length, will download file without progress"
|
||||
)
|
||||
f.write(response.content)
|
||||
else:
|
||||
dl = 0
|
||||
@ -46,32 +76,38 @@ def dowload_build_with_progress(url, path):
|
||||
if sys.stdout.isatty():
|
||||
done = int(50 * dl / total_length)
|
||||
percent = int(100 * float(dl) / total_length)
|
||||
eq_str = '=' * done
|
||||
space_str = ' ' * (50 - done)
|
||||
eq_str = "=" * done
|
||||
space_str = " " * (50 - done)
|
||||
sys.stdout.write(f"\r[{eq_str}{space_str}] {percent}%")
|
||||
sys.stdout.flush()
|
||||
break
|
||||
except Exception as ex:
|
||||
sys.stdout.write("\n")
|
||||
time.sleep(3)
|
||||
logging.info("Exception while downloading %s, retry %s", ex, i + 1)
|
||||
except Exception:
|
||||
if sys.stdout.isatty():
|
||||
sys.stdout.write("\n")
|
||||
if i + 1 < DOWNLOAD_RETRIES_COUNT:
|
||||
time.sleep(3)
|
||||
|
||||
if os.path.exists(path):
|
||||
os.remove(path)
|
||||
else:
|
||||
raise Exception(f"Cannot download dataset from {url}, all retries exceeded")
|
||||
|
||||
sys.stdout.write("\n")
|
||||
if sys.stdout.isatty():
|
||||
sys.stdout.write("\n")
|
||||
logging.info("Downloading finished")
|
||||
|
||||
|
||||
def download_builds(result_path, build_urls, filter_fn):
|
||||
for url in build_urls:
|
||||
if filter_fn(url):
|
||||
fname = os.path.basename(url.replace('%2B', '+').replace('%20', ' '))
|
||||
fname = os.path.basename(url.replace("%2B", "+").replace("%20", " "))
|
||||
logging.info("Will download %s to %s", fname, result_path)
|
||||
dowload_build_with_progress(url, os.path.join(result_path, fname))
|
||||
|
||||
def download_builds_filter(check_name, reports_path, result_path, filter_fn=lambda _: True):
|
||||
|
||||
def download_builds_filter(
|
||||
check_name, reports_path, result_path, filter_fn=lambda _: True
|
||||
):
|
||||
build_name = get_build_name_for_check(check_name)
|
||||
urls = get_build_urls(build_name, reports_path)
|
||||
print(urls)
|
||||
@ -81,17 +117,32 @@ def download_builds_filter(check_name, reports_path, result_path, filter_fn=lamb
|
||||
|
||||
download_builds(result_path, urls, filter_fn)
|
||||
|
||||
|
||||
def download_all_deb_packages(check_name, reports_path, result_path):
|
||||
download_builds_filter(check_name, reports_path, result_path, lambda x: x.endswith('deb'))
|
||||
download_builds_filter(
|
||||
check_name, reports_path, result_path, lambda x: x.endswith("deb")
|
||||
)
|
||||
|
||||
|
||||
def download_shared_build(check_name, reports_path, result_path):
|
||||
download_builds_filter(check_name, reports_path, result_path, lambda x: x.endswith('shared_build.tgz'))
|
||||
download_builds_filter(
|
||||
check_name, reports_path, result_path, lambda x: x.endswith("shared_build.tgz")
|
||||
)
|
||||
|
||||
|
||||
def download_unit_tests(check_name, reports_path, result_path):
|
||||
download_builds_filter(check_name, reports_path, result_path, lambda x: x.endswith('unit_tests_dbms'))
|
||||
download_builds_filter(
|
||||
check_name, reports_path, result_path, lambda x: x.endswith("unit_tests_dbms")
|
||||
)
|
||||
|
||||
|
||||
def download_clickhouse_binary(check_name, reports_path, result_path):
|
||||
download_builds_filter(check_name, reports_path, result_path, lambda x: x.endswith('clickhouse'))
|
||||
download_builds_filter(
|
||||
check_name, reports_path, result_path, lambda x: x.endswith("clickhouse")
|
||||
)
|
||||
|
||||
|
||||
def download_performance_build(check_name, reports_path, result_path):
|
||||
download_builds_filter(check_name, reports_path, result_path, lambda x: x.endswith('performance.tgz'))
|
||||
download_builds_filter(
|
||||
check_name, reports_path, result_path, lambda x: x.endswith("performance.tgz")
|
||||
)
|
||||
|
@ -5,22 +5,23 @@ import json
|
||||
import time
|
||||
|
||||
import jwt
|
||||
import requests
|
||||
import boto3
|
||||
import requests # type: ignore
|
||||
import boto3 # type: ignore
|
||||
|
||||
NEED_RERUN_OR_CANCELL_WORKFLOWS = {
|
||||
13241696, # PR
|
||||
15834118, # Docs
|
||||
15516108, # ReleaseCI
|
||||
15797242, # BackportPR
|
||||
"PullRequestCI",
|
||||
"Docs",
|
||||
"DocsRelease",
|
||||
"BackportPR",
|
||||
}
|
||||
|
||||
# https://docs.github.com/en/rest/reference/actions#cancel-a-workflow-run
|
||||
#
|
||||
API_URL = 'https://api.github.com/repos/ClickHouse/ClickHouse'
|
||||
API_URL = "https://api.github.com/repos/ClickHouse/ClickHouse"
|
||||
|
||||
MAX_RETRY = 5
|
||||
|
||||
|
||||
def get_installation_id(jwt_token):
|
||||
headers = {
|
||||
"Authorization": f"Bearer {jwt_token}",
|
||||
@ -29,29 +30,33 @@ def get_installation_id(jwt_token):
|
||||
response = requests.get("https://api.github.com/app/installations", headers=headers)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
return data[0]['id']
|
||||
return data[0]["id"]
|
||||
|
||||
|
||||
def get_access_token(jwt_token, installation_id):
|
||||
headers = {
|
||||
"Authorization": f"Bearer {jwt_token}",
|
||||
"Accept": "application/vnd.github.v3+json",
|
||||
}
|
||||
response = requests.post(f"https://api.github.com/app/installations/{installation_id}/access_tokens", headers=headers)
|
||||
response = requests.post(
|
||||
f"https://api.github.com/app/installations/{installation_id}/access_tokens",
|
||||
headers=headers,
|
||||
)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
return data['token']
|
||||
return data["token"]
|
||||
|
||||
|
||||
def get_key_and_app_from_aws():
|
||||
secret_name = "clickhouse_github_secret_key"
|
||||
session = boto3.session.Session()
|
||||
client = session.client(
|
||||
service_name='secretsmanager',
|
||||
service_name="secretsmanager",
|
||||
)
|
||||
get_secret_value_response = client.get_secret_value(
|
||||
SecretId=secret_name
|
||||
)
|
||||
data = json.loads(get_secret_value_response['SecretString'])
|
||||
return data['clickhouse-app-key'], int(data['clickhouse-app-id'])
|
||||
get_secret_value_response = client.get_secret_value(SecretId=secret_name)
|
||||
data = json.loads(get_secret_value_response["SecretString"])
|
||||
return data["clickhouse-app-key"], int(data["clickhouse-app-id"])
|
||||
|
||||
|
||||
def get_token_from_aws():
|
||||
private_key, app_id = get_key_and_app_from_aws()
|
||||
@ -65,6 +70,7 @@ def get_token_from_aws():
|
||||
installation_id = get_installation_id(encoded_jwt)
|
||||
return get_access_token(encoded_jwt, installation_id)
|
||||
|
||||
|
||||
def _exec_get_with_retry(url):
|
||||
for i in range(MAX_RETRY):
|
||||
try:
|
||||
@ -78,20 +84,25 @@ def _exec_get_with_retry(url):
|
||||
raise Exception("Cannot execute GET request with retries")
|
||||
|
||||
|
||||
WorkflowDescription = namedtuple('WorkflowDescription',
|
||||
['run_id', 'status', 'rerun_url', 'cancel_url'])
|
||||
WorkflowDescription = namedtuple(
|
||||
"WorkflowDescription", ["run_id", "status", "rerun_url", "cancel_url"]
|
||||
)
|
||||
|
||||
|
||||
def get_workflows_description_for_pull_request(pull_request_event):
|
||||
head_branch = pull_request_event['head']['ref']
|
||||
print("PR", pull_request_event['number'], "has head ref", head_branch)
|
||||
head_branch = pull_request_event["head"]["ref"]
|
||||
print("PR", pull_request_event["number"], "has head ref", head_branch)
|
||||
workflows_data = []
|
||||
workflows = _exec_get_with_retry(API_URL + f"/actions/runs?branch={head_branch}&event=pull_request&page=1")
|
||||
workflows_data += workflows['workflow_runs']
|
||||
workflows = _exec_get_with_retry(
|
||||
API_URL + f"/actions/runs?branch={head_branch}&event=pull_request&page=1"
|
||||
)
|
||||
workflows_data += workflows["workflow_runs"]
|
||||
i = 2
|
||||
while len(workflows['workflow_runs']) > 0:
|
||||
workflows = _exec_get_with_retry(API_URL + f"/actions/runs?branch={head_branch}&event=pull_request&page={i}")
|
||||
workflows_data += workflows['workflow_runs']
|
||||
while len(workflows["workflow_runs"]) > 0:
|
||||
workflows = _exec_get_with_retry(
|
||||
API_URL + f"/actions/runs?branch={head_branch}&event=pull_request&page={i}"
|
||||
)
|
||||
workflows_data += workflows["workflow_runs"]
|
||||
i += 1
|
||||
if i > 30:
|
||||
print("Too many workflows found")
|
||||
@ -99,29 +110,37 @@ def get_workflows_description_for_pull_request(pull_request_event):
|
||||
|
||||
workflow_descriptions = []
|
||||
for workflow in workflows_data:
|
||||
# unfortunately we cannot filter workflows from forks in request to API so doing it manually
|
||||
if (workflow['head_repository']['full_name'] == pull_request_event['head']['repo']['full_name']
|
||||
and workflow['workflow_id'] in NEED_RERUN_OR_CANCELL_WORKFLOWS):
|
||||
workflow_descriptions.append(WorkflowDescription(
|
||||
run_id=workflow['id'],
|
||||
status=workflow['status'],
|
||||
rerun_url=workflow['rerun_url'],
|
||||
cancel_url=workflow['cancel_url']))
|
||||
# unfortunately we cannot filter workflows from forks in request to API
|
||||
# so doing it manually
|
||||
if (
|
||||
workflow["head_repository"]["full_name"]
|
||||
== pull_request_event["head"]["repo"]["full_name"]
|
||||
and workflow["name"] in NEED_RERUN_OR_CANCELL_WORKFLOWS
|
||||
):
|
||||
workflow_descriptions.append(
|
||||
WorkflowDescription(
|
||||
run_id=workflow["id"],
|
||||
status=workflow["status"],
|
||||
rerun_url=workflow["rerun_url"],
|
||||
cancel_url=workflow["cancel_url"],
|
||||
)
|
||||
)
|
||||
|
||||
return workflow_descriptions
|
||||
|
||||
|
||||
def get_workflow_description(workflow_id):
|
||||
workflow = _exec_get_with_retry(API_URL + f"/actions/runs/{workflow_id}")
|
||||
return WorkflowDescription(
|
||||
run_id=workflow['id'],
|
||||
status=workflow['status'],
|
||||
rerun_url=workflow['rerun_url'],
|
||||
cancel_url=workflow['cancel_url'])
|
||||
run_id=workflow["id"],
|
||||
status=workflow["status"],
|
||||
rerun_url=workflow["rerun_url"],
|
||||
cancel_url=workflow["cancel_url"],
|
||||
)
|
||||
|
||||
|
||||
def _exec_post_with_retry(url, token):
|
||||
headers = {
|
||||
"Authorization": f"token {token}"
|
||||
}
|
||||
headers = {"Authorization": f"token {token}"}
|
||||
for i in range(MAX_RETRY):
|
||||
try:
|
||||
response = requests.post(url, headers=headers)
|
||||
@ -133,32 +152,34 @@ def _exec_post_with_retry(url, token):
|
||||
|
||||
raise Exception("Cannot execute POST request with retry")
|
||||
|
||||
|
||||
def exec_workflow_url(urls_to_cancel, token):
|
||||
for url in urls_to_cancel:
|
||||
print("Post for workflow workflow using url", url)
|
||||
_exec_post_with_retry(url, token)
|
||||
print("Workflow post finished")
|
||||
|
||||
|
||||
def main(event):
|
||||
token = get_token_from_aws()
|
||||
event_data = json.loads(event['body'])
|
||||
event_data = json.loads(event["body"])
|
||||
|
||||
print("Got event for PR", event_data['number'])
|
||||
action = event_data['action']
|
||||
print("Got action", event_data['action'])
|
||||
pull_request = event_data['pull_request']
|
||||
labels = { l['name'] for l in pull_request['labels'] }
|
||||
print("Got event for PR", event_data["number"])
|
||||
action = event_data["action"]
|
||||
print("Got action", event_data["action"])
|
||||
pull_request = event_data["pull_request"]
|
||||
labels = {label["name"] for label in pull_request["labels"]}
|
||||
print("PR has labels", labels)
|
||||
if action == 'closed' or 'do not test' in labels:
|
||||
if action == "closed" or "do not test" in labels:
|
||||
print("PR merged/closed or manually labeled 'do not test' will kill workflows")
|
||||
workflow_descriptions = get_workflows_description_for_pull_request(pull_request)
|
||||
urls_to_cancel = []
|
||||
for workflow_description in workflow_descriptions:
|
||||
if workflow_description.status != 'completed':
|
||||
if workflow_description.status != "completed":
|
||||
urls_to_cancel.append(workflow_description.cancel_url)
|
||||
print(f"Found {len(urls_to_cancel)} workflows to cancel")
|
||||
exec_workflow_url(urls_to_cancel, token)
|
||||
elif action == 'labeled' and 'can be tested' in labels:
|
||||
elif action == "labeled" and "can be tested" in labels:
|
||||
print("PR marked with can be tested label, rerun workflow")
|
||||
workflow_descriptions = get_workflows_description_for_pull_request(pull_request)
|
||||
if not workflow_descriptions:
|
||||
@ -168,7 +189,7 @@ def main(event):
|
||||
sorted_workflows = list(sorted(workflow_descriptions, key=lambda x: x.run_id))
|
||||
most_recent_workflow = sorted_workflows[-1]
|
||||
print("Latest workflow", most_recent_workflow)
|
||||
if most_recent_workflow.status != 'completed':
|
||||
if most_recent_workflow.status != "completed":
|
||||
print("Latest workflow is not completed, cancelling")
|
||||
exec_workflow_url([most_recent_workflow.cancel_url], token)
|
||||
print("Cancelled")
|
||||
@ -176,7 +197,7 @@ def main(event):
|
||||
for _ in range(30):
|
||||
latest_workflow_desc = get_workflow_description(most_recent_workflow.run_id)
|
||||
print("Checking latest workflow", latest_workflow_desc)
|
||||
if latest_workflow_desc.status in ('completed', 'cancelled'):
|
||||
if latest_workflow_desc.status in ("completed", "cancelled"):
|
||||
print("Finally latest workflow done, going to rerun")
|
||||
exec_workflow_url([most_recent_workflow.rerun_url], token)
|
||||
print("Rerun finished, exiting")
|
||||
@ -187,5 +208,6 @@ def main(event):
|
||||
else:
|
||||
print("Nothing to do")
|
||||
|
||||
|
||||
def handler(event, _):
|
||||
main(event)
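For reference, a minimal local sketch of the kind of API Gateway event this handler receives; the payload below is hypothetical and only its shape matters. Actually calling handler() would also need AWS credentials and a GitHub token, so the sketch just decodes the body the way main() does:
import json

sample_event = {
    "body": json.dumps(
        {
            "action": "labeled",
            "number": 12345,  # hypothetical PR number
            "pull_request": {
                "labels": [{"name": "can be tested"}],
                "head": {"repo": {"full_name": "contributor/ClickHouse"}},
            },
        }
    )
}

# Mirrors the first steps of main(): decode the body and read action/labels.
event_data = json.loads(sample_event["body"])
labels = {label["name"] for label in event_data["pull_request"]["labels"]}
print(event_data["action"], event_data["number"], labels)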
|
||||
|
@ -2,28 +2,51 @@
|
||||
import json
|
||||
import os
|
||||
|
||||
import requests # type: ignore
|
||||
from unidiff import PatchSet # type: ignore
|
||||
|
||||
from env_helper import GITHUB_REPOSITORY, GITHUB_SERVER_URL, GITHUB_RUN_ID, GITHUB_EVENT_PATH
|
||||
from build_download_helper import get_with_retries
|
||||
from env_helper import (
|
||||
GITHUB_REPOSITORY,
|
||||
GITHUB_SERVER_URL,
|
||||
GITHUB_RUN_ID,
|
||||
GITHUB_EVENT_PATH,
|
||||
)
|
||||
|
||||
DIFF_IN_DOCUMENTATION_EXT = [
|
||||
".html",
|
||||
".md",
|
||||
".yml",
|
||||
".txt",
|
||||
".css",
|
||||
".js",
|
||||
".xml",
|
||||
".ico",
|
||||
".conf",
|
||||
".svg",
|
||||
".png",
|
||||
".jpg",
|
||||
".py",
|
||||
".sh",
|
||||
".json",
|
||||
]
|
||||
RETRY_SLEEP = 0
|
||||
|
||||
DIFF_IN_DOCUMENTATION_EXT = [".html", ".md", ".yml", ".txt", ".css", ".js", ".xml", ".ico", ".conf", ".svg", ".png",
|
||||
".jpg", ".py", ".sh", ".json"]
|
||||
|
||||
def get_pr_for_commit(sha, ref):
|
||||
if not ref:
|
||||
return None
|
||||
try_get_pr_url = f"https://api.github.com/repos/{GITHUB_REPOSITORY}/commits/{sha}/pulls"
|
||||
try_get_pr_url = (
|
||||
f"https://api.github.com/repos/{GITHUB_REPOSITORY}/commits/{sha}/pulls"
|
||||
)
|
||||
try:
|
||||
response = requests.get(try_get_pr_url)
|
||||
response.raise_for_status()
|
||||
response = get_with_retries(try_get_pr_url, sleep=RETRY_SLEEP)
|
||||
data = response.json()
|
||||
if len(data) > 1:
|
||||
print("Got more than one pr for commit", sha)
|
||||
for pr in data:
|
||||
# refs for pushes look like refs/heads/XX
|
||||
# refs for PRs look like XX
|
||||
if pr['head']['ref'] in ref:
|
||||
if pr["head"]["ref"] in ref:
|
||||
return pr
|
||||
print("Cannot find PR with required ref", ref, "returning first one")
|
||||
first_pr = data[0]
|
||||
@ -35,15 +58,22 @@ def get_pr_for_commit(sha, ref):
|
||||
|
||||
class PRInfo:
|
||||
default_event = {
|
||||
'commits': 1,
|
||||
'before': 'HEAD~',
|
||||
'after': 'HEAD',
|
||||
'ref': None,
|
||||
}
|
||||
def __init__(self, github_event=None, need_orgs=False, need_changed_files=False, labels_from_api=False):
|
||||
"commits": 1,
|
||||
"before": "HEAD~",
|
||||
"after": "HEAD",
|
||||
"ref": None,
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
github_event=None,
|
||||
need_orgs=False,
|
||||
need_changed_files=False,
|
||||
pr_event_from_api=False,
|
||||
):
|
||||
if not github_event:
|
||||
if GITHUB_EVENT_PATH:
|
||||
with open(GITHUB_EVENT_PATH, 'r', encoding='utf-8') as event_file:
|
||||
with open(GITHUB_EVENT_PATH, "r", encoding="utf-8") as event_file:
|
||||
github_event = json.load(event_file)
|
||||
else:
|
||||
github_event = PRInfo.default_event.copy()
|
||||
@ -51,22 +81,34 @@ class PRInfo:
|
||||
self.changed_files = set([])
|
||||
self.body = ""
|
||||
ref = github_event.get("ref", "refs/head/master")
|
||||
if ref and ref.startswith('refs/heads/'):
|
||||
if ref and ref.startswith("refs/heads/"):
|
||||
ref = ref[11:]
|
||||
|
||||
# workflow completed event, used for PRs only
|
||||
if 'action' in github_event and github_event['action'] == 'completed':
|
||||
self.sha = github_event['workflow_run']['head_sha']
|
||||
prs_for_sha = requests.get(f"https://api.github.com/repos/{GITHUB_REPOSITORY}/commits/{self.sha}/pulls").json()
|
||||
if "action" in github_event and github_event["action"] == "completed":
|
||||
self.sha = github_event["workflow_run"]["head_sha"]
|
||||
prs_for_sha = get_with_retries(
|
||||
f"https://api.github.com/repos/{GITHUB_REPOSITORY}/commits/{self.sha}"
|
||||
"/pulls",
|
||||
sleep=RETRY_SLEEP,
|
||||
).json()
|
||||
if len(prs_for_sha) != 0:
|
||||
github_event['pull_request'] = prs_for_sha[0]
|
||||
github_event["pull_request"] = prs_for_sha[0]
|
||||
|
||||
if 'pull_request' in github_event: # pull request and other similar events
|
||||
self.number = github_event['pull_request']['number']
|
||||
if 'after' in github_event:
|
||||
self.sha = github_event['after']
|
||||
if "pull_request" in github_event: # pull request and other similar events
|
||||
self.number = github_event["pull_request"]["number"]
|
||||
if pr_event_from_api:
|
||||
response = get_with_retries(
|
||||
f"https://api.github.com/repos/{GITHUB_REPOSITORY}"
|
||||
f"/pulls/{self.number}",
|
||||
sleep=RETRY_SLEEP,
|
||||
)
|
||||
github_event["pull_request"] = response.json()
|
||||
|
||||
if "after" in github_event:
|
||||
self.sha = github_event["after"]
|
||||
else:
|
||||
self.sha = github_event['pull_request']['head']['sha']
|
||||
self.sha = github_event["pull_request"]["head"]["sha"]
|
||||
|
||||
repo_prefix = f"{GITHUB_SERVER_URL}/{GITHUB_REPOSITORY}"
|
||||
self.task_url = f"{repo_prefix}/actions/runs/{GITHUB_RUN_ID or '0'}"
|
||||
@ -75,35 +117,35 @@ class PRInfo:
|
||||
self.commit_html_url = f"{repo_prefix}/commits/{self.sha}"
|
||||
self.pr_html_url = f"{repo_prefix}/pull/{self.number}"
|
||||
|
||||
self.base_ref = github_event['pull_request']['base']['ref']
|
||||
self.base_name = github_event['pull_request']['base']['repo']['full_name']
|
||||
self.head_ref = github_event['pull_request']['head']['ref']
|
||||
self.head_name = github_event['pull_request']['head']['repo']['full_name']
|
||||
self.body = github_event['pull_request']['body']
|
||||
self.base_ref = github_event["pull_request"]["base"]["ref"]
|
||||
self.base_name = github_event["pull_request"]["base"]["repo"]["full_name"]
|
||||
self.head_ref = github_event["pull_request"]["head"]["ref"]
|
||||
self.head_name = github_event["pull_request"]["head"]["repo"]["full_name"]
|
||||
self.body = github_event["pull_request"]["body"]
|
||||
self.labels = {
|
||||
label["name"] for label in github_event["pull_request"]["labels"]
|
||||
}
|
||||
|
||||
if labels_from_api:
|
||||
response = requests.get(f"https://api.github.com/repos/{GITHUB_REPOSITORY}/issues/{self.number}/labels")
|
||||
self.labels = {l['name'] for l in response.json()}
|
||||
else:
|
||||
self.labels = {l['name'] for l in github_event['pull_request']['labels']}
|
||||
|
||||
self.user_login = github_event['pull_request']['user']['login']
|
||||
self.user_login = github_event["pull_request"]["user"]["login"]
|
||||
self.user_orgs = set([])
|
||||
if need_orgs:
|
||||
user_orgs_response = requests.get(github_event['pull_request']['user']['organizations_url'])
|
||||
user_orgs_response = get_with_retries(
|
||||
github_event["pull_request"]["user"]["organizations_url"],
|
||||
sleep=RETRY_SLEEP,
|
||||
)
|
||||
if user_orgs_response.ok:
|
||||
response_json = user_orgs_response.json()
|
||||
self.user_orgs = set(org['id'] for org in response_json)
|
||||
self.user_orgs = set(org["id"] for org in response_json)
|
||||
|
||||
self.diff_url = github_event['pull_request']['diff_url']
|
||||
elif 'commits' in github_event:
|
||||
self.sha = github_event['after']
|
||||
pull_request = get_pr_for_commit(self.sha, github_event['ref'])
|
||||
self.diff_url = github_event["pull_request"]["diff_url"]
|
||||
elif "commits" in github_event:
|
||||
self.sha = github_event["after"]
|
||||
pull_request = get_pr_for_commit(self.sha, github_event["ref"])
|
||||
repo_prefix = f"{GITHUB_SERVER_URL}/{GITHUB_REPOSITORY}"
|
||||
self.task_url = f"{repo_prefix}/actions/runs/{GITHUB_RUN_ID or '0'}"
|
||||
self.commit_html_url = f"{repo_prefix}/commits/{self.sha}"
|
||||
self.repo_full_name = GITHUB_REPOSITORY
|
||||
if pull_request is None or pull_request['state'] == 'closed':
|
||||
if pull_request is None or pull_request["state"] == "closed":
|
||||
# it's merged PR to master
|
||||
self.number = 0
|
||||
self.labels = {}
|
||||
@ -112,25 +154,25 @@ class PRInfo:
|
||||
self.base_name = self.repo_full_name
|
||||
self.head_ref = ref
|
||||
self.head_name = self.repo_full_name
|
||||
self.diff_url = \
|
||||
f"https://api.github.com/repos/{GITHUB_REPOSITORY}/compare/{github_event['before']}...{self.sha}"
|
||||
self.diff_url = (
|
||||
f"https://api.github.com/repos/{GITHUB_REPOSITORY}/"
|
||||
f"compare/{github_event['before']}...{self.sha}"
|
||||
)
|
||||
else:
|
||||
self.number = pull_request['number']
|
||||
if labels_from_api:
|
||||
response = requests.get(f"https://api.github.com/repos/{GITHUB_REPOSITORY}/issues/{self.number}/labels")
|
||||
self.labels = {l['name'] for l in response.json()}
|
||||
else:
|
||||
self.labels = {l['name'] for l in pull_request['labels']}
|
||||
self.labels = {label["name"] for label in pull_request["labels"]}
|
||||
|
||||
self.base_ref = pull_request['base']['ref']
|
||||
self.base_name = pull_request['base']['repo']['full_name']
|
||||
self.head_ref = pull_request['head']['ref']
|
||||
self.head_name = pull_request['head']['repo']['full_name']
|
||||
self.pr_html_url = pull_request['html_url']
|
||||
if 'pr-backport' in self.labels:
|
||||
self.diff_url = f"https://github.com/{GITHUB_REPOSITORY}/compare/master...{self.head_ref}.diff"
|
||||
self.base_ref = pull_request["base"]["ref"]
|
||||
self.base_name = pull_request["base"]["repo"]["full_name"]
|
||||
self.head_ref = pull_request["head"]["ref"]
|
||||
self.head_name = pull_request["head"]["repo"]["full_name"]
|
||||
self.pr_html_url = pull_request["html_url"]
|
||||
if "pr-backport" in self.labels:
|
||||
self.diff_url = (
|
||||
f"https://github.com/{GITHUB_REPOSITORY}/"
|
||||
f"compare/master...{self.head_ref}.diff"
|
||||
)
|
||||
else:
|
||||
self.diff_url = pull_request['diff_url']
|
||||
self.diff_url = pull_request["diff_url"]
|
||||
else:
|
||||
print(json.dumps(github_event, sort_keys=True, indent=4))
|
||||
self.sha = os.getenv("GITHUB_SHA")
|
||||
@ -153,24 +195,27 @@ class PRInfo:
|
||||
if not self.diff_url:
|
||||
raise Exception("Diff URL cannot be find for event")
|
||||
|
||||
response = requests.get(self.diff_url)
|
||||
response = get_with_retries(
|
||||
self.diff_url,
|
||||
sleep=RETRY_SLEEP,
|
||||
)
|
||||
response.raise_for_status()
|
||||
if 'commits' in self.event and self.number == 0:
|
||||
if "commits" in self.event and self.number == 0:
|
||||
diff = response.json()
|
||||
|
||||
if 'files' in diff:
|
||||
self.changed_files = [f['filename'] for f in diff['files']]
|
||||
if "files" in diff:
|
||||
self.changed_files = [f["filename"] for f in diff["files"]]
|
||||
else:
|
||||
diff_object = PatchSet(response.text)
|
||||
self.changed_files = {f.path for f in diff_object}
|
||||
|
||||
def get_dict(self):
|
||||
return {
|
||||
'sha': self.sha,
|
||||
'number': self.number,
|
||||
'labels': self.labels,
|
||||
'user_login': self.user_login,
|
||||
'user_orgs': self.user_orgs,
|
||||
"sha": self.sha,
|
||||
"number": self.number,
|
||||
"labels": self.labels,
|
||||
"user_login": self.user_login,
|
||||
"user_orgs": self.user_orgs,
|
||||
}
|
||||
|
||||
def has_changes_in_documentation(self):
|
||||
@ -181,49 +226,63 @@ class PRInfo:
|
||||
|
||||
for f in self.changed_files:
|
||||
_, ext = os.path.splitext(f)
|
||||
path_in_docs = 'docs' in f
|
||||
path_in_website = 'website' in f
|
||||
if (ext in DIFF_IN_DOCUMENTATION_EXT and (path_in_docs or path_in_website)) or 'docker/docs' in f:
|
||||
path_in_docs = "docs" in f
|
||||
path_in_website = "website" in f
|
||||
if (
|
||||
ext in DIFF_IN_DOCUMENTATION_EXT and (path_in_docs or path_in_website)
|
||||
) or "docker/docs" in f:
|
||||
return True
|
||||
return False
|
||||
|
||||
def can_skip_builds_and_use_version_from_master(self):
|
||||
if 'force tests' in self.labels:
|
||||
# TODO: See a broken loop
|
||||
if "force tests" in self.labels:
|
||||
return False
|
||||
|
||||
if self.changed_files is None or not self.changed_files:
|
||||
return False
|
||||
|
||||
for f in self.changed_files:
|
||||
if (not f.startswith('tests/queries')
|
||||
or not f.startswith('tests/integration')
|
||||
or not f.startswith('tests/performance')):
|
||||
# TODO: this logic is broken, should be fixed before using
|
||||
if (
|
||||
not f.startswith("tests/queries")
|
||||
or not f.startswith("tests/integration")
|
||||
or not f.startswith("tests/performance")
|
||||
):
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def can_skip_integration_tests(self):
|
||||
if 'force tests' in self.labels:
|
||||
# TODO: See a broken loop
|
||||
if "force tests" in self.labels:
|
||||
return False
|
||||
|
||||
if self.changed_files is None or not self.changed_files:
|
||||
return False
|
||||
|
||||
for f in self.changed_files:
|
||||
if not f.startswith('tests/queries') or not f.startswith('tests/performance'):
|
||||
# TODO: this logic is broken, should be fixed before using
|
||||
if not f.startswith("tests/queries") or not f.startswith(
|
||||
"tests/performance"
|
||||
):
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def can_skip_functional_tests(self):
|
||||
if 'force tests' in self.labels:
|
||||
# TODO: See a broken loop
|
||||
if "force tests" in self.labels:
|
||||
return False
|
||||
|
||||
if self.changed_files is None or not self.changed_files:
|
||||
return False
|
||||
|
||||
for f in self.changed_files:
|
||||
if not f.startswith('tests/integration') or not f.startswith('tests/performance'):
|
||||
# TODO: this logic is broken, should be fixed before using
|
||||
if not f.startswith("tests/integration") or not f.startswith(
|
||||
"tests/performance"
|
||||
):
|
||||
return False
|
||||
|
||||
return True
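A quick illustration of why the can_skip_* filters carry the "logic is broken" TODOs: a path can start with at most one of the prefixes, so the negated-or condition is true for every changed file and each method returns False immediately. A standalone sketch (the path is made up):
f = "tests/queries/0_stateless/0001_select.sql"

# Condition as written above: true for any path, so the loop always hits return False.
print(not f.startswith("tests/queries") or not f.startswith("tests/performance"))  # True

# What was presumably intended: flag files that fall outside all test directories.
test_dirs = ("tests/queries", "tests/integration", "tests/performance")
print(not any(f.startswith(d) for d in test_dirs))  # False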
|
||||
|
@ -204,7 +204,7 @@ def check_pr_description(pr_info):
|
||||
if __name__ == "__main__":
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
pr_info = PRInfo(need_orgs=True, labels_from_api=True)
|
||||
pr_info = PRInfo(need_orgs=True, pr_event_from_api=True)
|
||||
can_run, description = should_run_checks_for_pr(pr_info)
|
||||
gh = Github(get_best_robot_token())
|
||||
commit = get_commit(gh, pr_info.sha)
|
||||
@ -212,6 +212,9 @@ if __name__ == "__main__":
|
||||
description_report = check_pr_description(pr_info)[:139]
|
||||
if description_report:
|
||||
print("::notice ::Cannot run, description does not match the template")
|
||||
logging.info(
|
||||
"PR body doesn't match the template: (start)\n%s\n(end)", pr_info.body
|
||||
)
|
||||
url = (
|
||||
f"{GITHUB_SERVER_URL}/{GITHUB_REPOSITORY}/"
|
||||
"blob/master/.github/PULL_REQUEST_TEMPLATE.md?plain=1"
|
||||
|
@ -0,0 +1 @@
|
||||
#!/usr/bin/env python3
|
@ -0,0 +1,36 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<clickhouse>
|
||||
<keeper_server>
|
||||
<tcp_port>9181</tcp_port>
|
||||
<server_id>1</server_id>
|
||||
<log_storage_path>/var/lib/clickhouse/coordination/log</log_storage_path>
|
||||
<snapshot_storage_path>/var/lib/clickhouse/coordination/snapshots</snapshot_storage_path>
|
||||
<coordination_settings>
|
||||
<operation_timeout_ms>5000</operation_timeout_ms>
|
||||
<raft_logs_level>trace</raft_logs_level>
|
||||
<session_timeout_ms>10000</session_timeout_ms>
|
||||
</coordination_settings>
|
||||
<raft_configuration>
|
||||
<server>
|
||||
<can_become_leader>true</can_become_leader>
|
||||
<hostname>node1</hostname>
|
||||
<id>1</id>
|
||||
<port>2888</port>
|
||||
<priority>1</priority>
|
||||
</server>
|
||||
</raft_configuration>
|
||||
</keeper_server>
|
||||
|
||||
<user_directories>
|
||||
<replicated>
|
||||
<zookeeper_path>/clickhouse/access</zookeeper_path>
|
||||
</replicated>
|
||||
</user_directories>
|
||||
|
||||
<zookeeper>
|
||||
<node index="1">
|
||||
<host>node1</host>
|
||||
<port>9181</port>
|
||||
</node>
|
||||
</zookeeper>
|
||||
</clickhouse>
|
21
tests/integration/test_keeper_and_access_storage/test.py
Normal file
@ -0,0 +1,21 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import pytest
|
||||
|
||||
from helpers.cluster import ClickHouseCluster
|
||||
|
||||
cluster = ClickHouseCluster(__file__)
|
||||
|
||||
node1 = cluster.add_instance('node1', main_configs=['configs/keeper.xml'], stay_alive=True)
|
||||
|
||||
# test that server is able to start
|
||||
@pytest.fixture(scope="module")
|
||||
def started_cluster():
|
||||
try:
|
||||
cluster.start()
|
||||
yield cluster
|
||||
finally:
|
||||
cluster.shutdown()
|
||||
|
||||
def test_create_replicated(started_cluster):
|
||||
assert node1.query("SELECT 1") == "1\n"
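The test above only verifies that the server starts with Keeper and a replicated user directory; a possible follow-up check (not part of this commit) would exercise the replicated access storage itself, reusing the same fixture:
def test_create_user(started_cluster):
    # Hypothetical extra assertion: access entities go through the replicated
    # user directory configured in keeper.xml.
    node1.query("CREATE USER IF NOT EXISTS test_user")
    assert "test_user" in node1.query("SHOW USERS")
    node1.query("DROP USER test_user")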
|
@ -1141,14 +1141,14 @@ def materialized_database_support_all_kinds_of_mysql_datatype(clickhouse_node, m
|
||||
`v19` datetime(6) DEFAULT CURRENT_TIMESTAMP(6),
|
||||
`v20` TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||
`v21` TIMESTAMP(6) DEFAULT CURRENT_TIMESTAMP(6),
|
||||
/* todo support */
|
||||
# `v22` YEAR,
|
||||
# `v23` TIME,
|
||||
# `v24` TIME(3),
|
||||
# `v25` GEOMETRY,
|
||||
`v22` YEAR,
|
||||
`v23` TIME,
|
||||
`v24` TIME(6),
|
||||
`v25` GEOMETRY,
|
||||
`v26` bit(4),
|
||||
/* todo support */
|
||||
# `v27` JSON DEFAULT NULL,
|
||||
# `v28` set('a', 'c', 'f', 'd', 'e', 'b'),
|
||||
`v28` set('a', 'c', 'f', 'd', 'e', 'b'),
|
||||
`v29` mediumint(4) unsigned NOT NULL DEFAULT '0',
|
||||
`v30` varbinary(255) DEFAULT NULL COMMENT 'varbinary support',
|
||||
`v31` binary(200) DEFAULT NULL,
|
||||
@ -1158,8 +1158,9 @@ def materialized_database_support_all_kinds_of_mysql_datatype(clickhouse_node, m
|
||||
""")
|
||||
|
||||
mysql_node.query("""
|
||||
INSERT INTO test_database_datatype.t1 (v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v26, v29, v30, v31, v32) values
|
||||
(1, 11, 9223372036854775807, -1, 1, 11, 18446744073709551615, -1.1, 1.1, -1.111, 1.111, 1.1111, '2021-10-06', 'text', 'varchar', 'BLOB', '2021-10-06 18:32:57', '2021-10-06 18:32:57.482786', '2021-10-06 18:32:57', '2021-10-06 18:32:57.482786', b'1010', 11, 'varbinary', 'binary', 'RED');
|
||||
INSERT INTO test_database_datatype.t1 (v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v28, v29, v30, v31, v32) values
|
||||
(1, 11, 9223372036854775807, -1, 1, 11, 18446744073709551615, -1.1, 1.1, -1.111, 1.111, 1.1111, '2021-10-06', 'text', 'varchar', 'BLOB', '2021-10-06 18:32:57',
|
||||
'2021-10-06 18:32:57.482786', '2021-10-06 18:32:57', '2021-10-06 18:32:57.482786', '2021', '838:59:59', '838:59:59.000000', ST_GeometryFromText('point(0.0 0.0)'), b'1010', 'a', 11, 'varbinary', 'binary', 'RED');
|
||||
""")
|
||||
clickhouse_node.query(
|
||||
"CREATE DATABASE test_database_datatype ENGINE = MaterializeMySQL('{}:3306', 'test_database_datatype', 'root', 'clickhouse')".format(
|
||||
@ -1167,14 +1168,18 @@ def materialized_database_support_all_kinds_of_mysql_datatype(clickhouse_node, m
|
||||
|
||||
check_query(clickhouse_node, "SELECT name FROM system.tables WHERE database = 'test_database_datatype'", "t1\n")
|
||||
# full synchronization check
|
||||
check_query(clickhouse_node, "SELECT v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v26, v29, v30, v32 FROM test_database_datatype.t1 FORMAT TSV",
|
||||
"1\t1\t11\t9223372036854775807\t-1\t1\t11\t18446744073709551615\t-1.1\t1.1\t-1.111\t1.111\t1.1111\t2021-10-06\ttext\tvarchar\tBLOB\t2021-10-06 18:32:57\t2021-10-06 18:32:57.482786\t2021-10-06 18:32:57\t2021-10-06 18:32:57.482786\t10\t11\tvarbinary\tRED\n")
|
||||
check_query(clickhouse_node, "SELECT v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, hex(v25), v26, v28, v29, v30, v32 FROM test_database_datatype.t1 FORMAT TSV",
|
||||
"1\t1\t11\t9223372036854775807\t-1\t1\t11\t18446744073709551615\t-1.1\t1.1\t-1.111\t1.111\t1.1111\t2021-10-06\ttext\tvarchar\tBLOB\t2021-10-06 18:32:57\t2021-10-06 18:32:57.482786\t2021-10-06 18:32:57" +
|
||||
"\t2021-10-06 18:32:57.482786\t2021\t3020399000000\t3020399000000\t00000000010100000000000000000000000000000000000000\t10\t1\t11\tvarbinary\tRED\n")
|
||||
|
||||
mysql_node.query("""
|
||||
INSERT INTO test_database_datatype.t1 (v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v26, v29, v30, v31, v32) values
|
||||
(2, 22, 9223372036854775807, -2, 2, 22, 18446744073709551615, -2.2, 2.2, -2.22, 2.222, 2.2222, '2021-10-07', 'text', 'varchar', 'BLOB', '2021-10-07 18:32:57', '2021-10-07 18:32:57.482786', '2021-10-07 18:32:57', '2021-10-07 18:32:57.482786', b'1011', 22, 'varbinary', 'binary', 'GREEN' );
|
||||
INSERT INTO test_database_datatype.t1 (v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v28, v29, v30, v31, v32) values
|
||||
(2, 22, 9223372036854775807, -2, 2, 22, 18446744073709551615, -2.2, 2.2, -2.22, 2.222, 2.2222, '2021-10-07', 'text', 'varchar', 'BLOB', '2021-10-07 18:32:57',
|
||||
'2021-10-07 18:32:57.482786', '2021-10-07 18:32:57', '2021-10-07 18:32:57.482786', '2021', '-838:59:59', '-12:59:58.000001', ST_GeometryFromText('point(120.153576 30.287459)'), b'1011', 'a,c', 22, 'varbinary', 'binary', 'GREEN' );
|
||||
""")
|
||||
# increment synchronization check
|
||||
check_query(clickhouse_node, "SELECT v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v26, v29, v30, v32 FROM test_database_datatype.t1 ORDER BY v1 FORMAT TSV",
|
||||
"1\t1\t11\t9223372036854775807\t-1\t1\t11\t18446744073709551615\t-1.1\t1.1\t-1.111\t1.111\t1.1111\t2021-10-06\ttext\tvarchar\tBLOB\t2021-10-06 18:32:57\t2021-10-06 18:32:57.482786\t2021-10-06 18:32:57\t2021-10-06 18:32:57.482786\t10\t11\tvarbinary\tRED\n" +
|
||||
"2\t2\t22\t9223372036854775807\t-2\t2\t22\t18446744073709551615\t-2.2\t2.2\t-2.22\t2.222\t2.2222\t2021-10-07\ttext\tvarchar\tBLOB\t2021-10-07 18:32:57\t2021-10-07 18:32:57.482786\t2021-10-07 18:32:57\t2021-10-07 18:32:57.482786\t11\t22\tvarbinary\tGREEN\n")
|
||||
check_query(clickhouse_node, "SELECT v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, hex(v25), v26, v28, v29, v30, v32 FROM test_database_datatype.t1 FORMAT TSV",
|
||||
"1\t1\t11\t9223372036854775807\t-1\t1\t11\t18446744073709551615\t-1.1\t1.1\t-1.111\t1.111\t1.1111\t2021-10-06\ttext\tvarchar\tBLOB\t2021-10-06 18:32:57\t2021-10-06 18:32:57.482786\t2021-10-06 18:32:57\t2021-10-06 18:32:57.482786" +
|
||||
"\t2021\t3020399000000\t3020399000000\t00000000010100000000000000000000000000000000000000\t10\t1\t11\tvarbinary\tRED\n" +
|
||||
"2\t2\t22\t9223372036854775807\t-2\t2\t22\t18446744073709551615\t-2.2\t2.2\t-2.22\t2.222\t2.2222\t2021-10-07\ttext\tvarchar\tBLOB\t2021-10-07 18:32:57\t2021-10-07 18:32:57.482786\t2021-10-07 18:32:57\t2021-10-07 18:32:57.482786" +
|
||||
"\t2021\t-3020399000000\t-46798000001\t000000000101000000D55C6E30D4095E40DCF0BBE996493E40\t11\t3\t22\tvarbinary\tGREEN\n")
|
||||
|
20
tests/performance/classification.xml
Normal file
@ -0,0 +1,20 @@
|
||||
<test>
|
||||
<settings>
|
||||
<allow_experimental_nlp_functions>1</allow_experimental_nlp_functions>
|
||||
</settings>
|
||||
|
||||
<preconditions>
|
||||
<table_exists>hits_100m_single</table_exists>
|
||||
</preconditions>
|
||||
|
||||
<query>SELECT detectLanguage(SearchPhrase) FROM hits_100m_single FORMAT Null</query>
|
||||
<query>SELECT detectLanguageMixed(SearchPhrase) FROM hits_100m_single FORMAT Null</query>
|
||||
<query>SELECT detectTonality(SearchPhrase) FROM hits_100m_single FORMAT Null</query>
|
||||
|
||||
<!-- Input is not really correct for these functions,
|
||||
but at least it gives us some idea about their performance -->
|
||||
<query>SELECT detectProgrammingLanguage(SearchPhrase) FROM hits_100m_single FORMAT Null</query>
|
||||
<query>SELECT detectLanguageUnknown(SearchPhrase) FROM hits_100m_single FORMAT Null</query>
|
||||
<query>SELECT detectCharset(SearchPhrase) FROM hits_100m_single FORMAT Null</query>
|
||||
|
||||
</test>
|
@ -90,3 +90,31 @@
|
||||
21
|
||||
22
|
||||
23
|
||||
6
|
||||
7
|
||||
7
|
||||
5
|
||||
6
|
||||
7
|
||||
8
|
||||
9
|
||||
10
|
||||
11
|
||||
12
|
||||
13
|
||||
14
|
||||
15
|
||||
16
|
||||
17
|
||||
5
|
||||
6
|
||||
7
|
||||
8
|
||||
9
|
||||
10
|
||||
11
|
||||
12
|
||||
13
|
||||
14
|
||||
15
|
||||
16
|
||||
|
@ -93,3 +93,34 @@ SELECT position(concat(' иголка.ру', arrayStringConcat
|
||||
SELECT position(concat(' иголка.ру', arrayStringConcat(arrayMap(x -> ' ', range(20000)))), 'иголка.ру') AS res;
|
||||
SELECT position(concat(' иголка.ру', arrayStringConcat(arrayMap(x -> ' ', range(20000)))), 'иголка.ру') AS res;
|
||||
SELECT position(concat(' иголка.ру', arrayStringConcat(arrayMap(x -> ' ', range(20000)))), 'иголка.ру') AS res;
|
||||
|
||||
SELECT positionCaseInsensitiveUTF8(materialize('test ß test'), 'ß') AS res;
|
||||
SELECT positionCaseInsensitiveUTF8(materialize('test AaßAa test'), 'aßa') AS res;
|
||||
SELECT positionCaseInsensitiveUTF8(materialize('test A1ß2a test'), '1ß2') AS res;
|
||||
SELECT positionCaseInsensitiveUTF8(materialize('xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest'), 'aa1ẞ1Yzßxßß1ßß1BC') AS res;
|
||||
|
||||
SELECT positionCaseInsensitiveUTF8(materialize(concat('test a1ßAa test', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'a1ẞaa') AS res;
|
||||
SELECT positionCaseInsensitiveUTF8(materialize(concat(' test a1ßAa test', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'a1ẞaa') AS res;
|
||||
SELECT positionCaseInsensitiveUTF8(materialize(concat(' test a1ßAa test', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'a1ẞaa') AS res;
|
||||
SELECT positionCaseInsensitiveUTF8(materialize(concat(' test a1ßAa test', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'a1ẞaa') AS res;
|
||||
SELECT positionCaseInsensitiveUTF8(materialize(concat(' test a1ßAa test', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'a1ẞaa') AS res;
|
||||
SELECT positionCaseInsensitiveUTF8(materialize(concat(' test a1ßAa test', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'a1ẞaa') AS res;
|
||||
SELECT positionCaseInsensitiveUTF8(materialize(concat(' test a1ßAa test', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'a1ẞaa') AS res;
|
||||
SELECT positionCaseInsensitiveUTF8(materialize(concat(' test a1ßAa test', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'a1ẞaa') AS res;
|
||||
SELECT positionCaseInsensitiveUTF8(materialize(concat(' test a1ßAa test', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'a1ẞaa') AS res;
|
||||
SELECT positionCaseInsensitiveUTF8(materialize(concat(' test a1ßAa test', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'a1ẞaa') AS res;
|
||||
SELECT positionCaseInsensitiveUTF8(materialize(concat(' test a1ßAa test', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'a1ẞaa') AS res;
|
||||
SELECT positionCaseInsensitiveUTF8(materialize(concat(' test a1ßAa test', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'a1ẞaa') AS res;
|
||||
|
||||
SELECT positionCaseInsensitiveUTF8(materialize(concat('xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'aa1ẞ1Yzßxßß1ßß1BC') AS res;
|
||||
SELECT positionCaseInsensitiveUTF8(materialize(concat(' xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'aa1ẞ1Yzßxßß1ßß1BC') AS res;
|
||||
SELECT positionCaseInsensitiveUTF8(materialize(concat(' xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'aa1ẞ1Yzßxßß1ßß1BC') AS res;
|
||||
SELECT positionCaseInsensitiveUTF8(materialize(concat(' xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'aa1ẞ1Yzßxßß1ßß1BC') AS res;
|
||||
SELECT positionCaseInsensitiveUTF8(materialize(concat(' xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'aa1ẞ1Yzßxßß1ßß1BC') AS res;
|
||||
SELECT positionCaseInsensitiveUTF8(materialize(concat(' xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'aa1ẞ1Yzßxßß1ßß1BC') AS res;
|
||||
SELECT positionCaseInsensitiveUTF8(materialize(concat(' xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'aa1ẞ1Yzßxßß1ßß1BC') AS res;
|
||||
SELECT positionCaseInsensitiveUTF8(materialize(concat(' xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'aa1ẞ1Yzßxßß1ßß1BC') AS res;
|
||||
SELECT positionCaseInsensitiveUTF8(materialize(concat(' xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'aa1ẞ1Yzßxßß1ßß1BC') AS res;
|
||||
SELECT positionCaseInsensitiveUTF8(materialize(concat(' xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'aa1ẞ1Yzßxßß1ßß1BC') AS res;
|
||||
SELECT positionCaseInsensitiveUTF8(materialize(concat(' xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'aa1ẞ1Yzßxßß1ßß1BC') AS res;
|
||||
SELECT positionCaseInsensitiveUTF8(materialize(concat(' xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'aa1ẞ1Yzßxßß1ßß1BC') AS res;
|
||||
|
@ -1,3 +1,2 @@
|
||||
SELECT positionCaseInsensitiveUTF8('иголка.ру', 'иголка.р<>\0') AS res;
|
||||
SELECT positionCaseInsensitiveUTF8('иголка.ру', randomString(rand() % 100)) FROM system.numbers; -- { serverError 2 }
|
||||
SELECT sum(ignore(positionCaseInsensitiveUTF8('иголка.ру', randomString(rand() % 2)))) FROM numbers(1000000);
|
||||
|
15
tests/queries/0_stateless/02133_classification.reference
Normal file
@ -0,0 +1,15 @@
|
||||
ru
|
||||
en
|
||||
fr
|
||||
ja
|
||||
zh
|
||||
un
|
||||
{'ja':0.62,'fr':0.36}
|
||||
{'ko':0.98}
|
||||
{}
|
||||
ISO-8859-1
|
||||
en
|
||||
0.465
|
||||
-0.28823888
|
||||
0.050505556
|
||||
C++
|
23
tests/queries/0_stateless/02133_classification.sql
Normal file
@ -0,0 +1,23 @@
|
||||
-- Tags: no-fasttest
|
||||
-- Tag no-fasttest: depends on cld2 and nlp-data
|
||||
|
||||
SET allow_experimental_nlp_functions = 1;
|
||||
|
||||
SELECT detectLanguage('Они сошлись. Волна и камень, Стихи и проза, лед и пламень, Не столь различны меж собой.');
|
||||
SELECT detectLanguage('Sweet are the uses of adversity which, like the toad, ugly and venomous, wears yet a precious jewel in his head.');
|
||||
SELECT detectLanguage('A vaincre sans peril, on triomphe sans gloire.');
|
||||
SELECT detectLanguage('二兎を追う者は一兎をも得ず');
|
||||
SELECT detectLanguage('有情饮水饱,无情食饭饥。');
|
||||
SELECT detectLanguage('*****///// _____ ,,,,,,,, .....');
|
||||
SELECT detectLanguageMixed('二兎を追う者は一兎をも得ず二兎を追う者は一兎をも得ず A vaincre sans peril, on triomphe sans gloire.');
|
||||
SELECT detectLanguageMixed('어디든 가치가 있는 곳으로 가려면 지름길은 없다');
|
||||
SELECT detectLanguageMixed('*****///// _____ ,,,,,,,, .....');
|
||||
|
||||
SELECT detectCharset('Plain English');
|
||||
SELECT detectLanguageUnknown('Plain English');
|
||||
|
||||
SELECT detectTonality('милая кошка');
|
||||
SELECT detectTonality('ненависть к людям');
|
||||
SELECT detectTonality('обычная прогулка по ближайшему парку');
|
||||
|
||||
SELECT detectProgrammingLanguage('#include <iostream>');
|
@ -3,3 +3,5 @@ DefaultValue
|
||||
1
|
||||
0
|
||||
0 15 20 Value
|
||||
0 10 0 Value
|
||||
0 15 10 Value
|
||||
|
@ -38,3 +38,7 @@ PolygonDictionary
|
||||
1
|
||||
0
|
||||
[[[(0,0),(0,1),(1,1),(1,0)]]]
|
||||
RangeHashedDictionary
|
||||
0 0 1
|
||||
1
|
||||
0
|
||||
|
@ -170,7 +170,7 @@ CREATE TABLE 02183_range_dictionary_source_table
|
||||
)
|
||||
ENGINE = TinyLog;
|
||||
|
||||
INSERT INTO 02183_range_dictionary_source_table VALUES(1, 0, 1);
|
||||
INSERT INTO 02183_range_dictionary_source_table VALUES(0, 0, 1);
|
||||
|
||||
DROP DICTIONARY IF EXISTS 02183_range_dictionary;
|
||||
CREATE DICTIONARY 02183_range_dictionary
|
||||
@ -185,7 +185,10 @@ LAYOUT(RANGE_HASHED())
|
||||
RANGE(MIN start MAX end)
|
||||
LIFETIME(0);
|
||||
|
||||
SELECT * FROM 02183_range_dictionary; -- {serverError 1}
|
||||
SELECT 'RangeHashedDictionary';
|
||||
SELECT * FROM 02183_range_dictionary;
|
||||
SELECT dictHas('02183_range_dictionary', 0, 0);
|
||||
SELECT dictHas('02183_range_dictionary', 0, 2);
|
||||
|
||||
DROP DICTIONARY 02183_range_dictionary;
|
||||
DROP TABLE 02183_range_dictionary_source_table;
|
||||
|
2
tests/queries/0_stateless/02184_ipv6_parsing.reference
Normal file
@ -0,0 +1,2 @@
|
||||
2001:db9:85a3::8a2e:370:7334
|
||||
2001:db8:85a3::8a2e:370:7334
|
11
tests/queries/0_stateless/02184_ipv6_parsing.sh
Executable file
@ -0,0 +1,11 @@
|
||||
#!/usr/bin/env bash
|
||||
# Tags: no-parallel, no-fasttest
|
||||
|
||||
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
# shellcheck source=../shell_config.sh
|
||||
. "$CURDIR"/../shell_config.sh
|
||||
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "select toString(toIPv6('2001:db9:85a3::8a2e:370:7334'))"
|
||||
$CLICKHOUSE_CLIENT --param_var 2001:db8:85a3::8a2e:370:7334 -q "select {var:IPv6}"
|
||||
|
@ -0,0 +1,3 @@
|
||||
1 0 18446744073709551615 value0 value1 value2
|
||||
('value0','value1','value2')
|
||||
1
|
@ -0,0 +1,36 @@
|
||||
DROP TABLE IF EXISTS 02184_range_dictionary_source_table;
|
||||
CREATE TABLE 02184_range_dictionary_source_table
|
||||
(
|
||||
id UInt64,
|
||||
start UInt64,
|
||||
end UInt64,
|
||||
value_0 String,
|
||||
value_1 String,
|
||||
value_2 String
|
||||
)
|
||||
ENGINE = TinyLog;
|
||||
|
||||
INSERT INTO 02184_range_dictionary_source_table VALUES (1, 0, 18446744073709551615, 'value0', 'value1', 'value2');
|
||||
|
||||
DROP DICTIONARY IF EXISTS 02184_range_dictionary;
|
||||
CREATE DICTIONARY 02184_range_dictionary
|
||||
(
|
||||
id UInt64,
|
||||
start UInt64,
|
||||
end UInt64,
|
||||
value_0 String,
|
||||
value_1 String,
|
||||
value_2 String
|
||||
)
|
||||
PRIMARY KEY id
|
||||
SOURCE(CLICKHOUSE(TABLE '02184_range_dictionary_source_table'))
|
||||
LAYOUT(RANGE_HASHED())
|
||||
RANGE(MIN start MAX end)
|
||||
LIFETIME(0);
|
||||
|
||||
SELECT * FROM 02184_range_dictionary;
|
||||
SELECT dictGet('02184_range_dictionary', ('value_0', 'value_1', 'value_2'), 1, 18446744073709551615);
|
||||
SELECT dictHas('02184_range_dictionary', 1, 18446744073709551615);
|
||||
|
||||
DROP DICTIONARY 02184_range_dictionary;
|
||||
DROP TABLE 02184_range_dictionary_source_table;
|
@ -0,0 +1,22 @@
|
||||
Source table
|
||||
0 \N 5000 Value0
|
||||
0 5001 10000 Value1
|
||||
0 10001 \N Value2
|
||||
Dictionary convert_null_range_bound_to_open = 1
|
||||
0 5001 10000 Value1
|
||||
0 0 5000 Value0
|
||||
0 10001 18446744073709551615 Value2
|
||||
Value0
|
||||
Value1
|
||||
Value2
|
||||
1
|
||||
1
|
||||
1
|
||||
Dictionary convert_null_range_bound_to_open = 0
|
||||
0 5001 10000 Value1
|
||||
DefaultValue
|
||||
Value1
|
||||
DefaultValue
|
||||
0
|
||||
1
|
||||
0
|
@ -0,0 +1,63 @@
|
||||
DROP TABLE IF EXISTS 02185_range_dictionary_source_table;
|
||||
CREATE TABLE 02185_range_dictionary_source_table
|
||||
(
|
||||
id UInt64,
|
||||
start Nullable(UInt64),
|
||||
end Nullable(UInt64),
|
||||
value String
|
||||
)
|
||||
ENGINE = TinyLog;
|
||||
|
||||
INSERT INTO 02185_range_dictionary_source_table VALUES (0, NULL, 5000, 'Value0'), (0, 5001, 10000, 'Value1'), (0, 10001, NULL, 'Value2');
|
||||
|
||||
SELECT 'Source table';
|
||||
SELECT * FROM 02185_range_dictionary_source_table;
|
||||
|
||||
DROP DICTIONARY IF EXISTS 02185_range_dictionary;
|
||||
CREATE DICTIONARY 02185_range_dictionary
|
||||
(
|
||||
id UInt64,
|
||||
start UInt64,
|
||||
end UInt64,
|
||||
value String DEFAULT 'DefaultValue'
|
||||
)
|
||||
PRIMARY KEY id
|
||||
SOURCE(CLICKHOUSE(TABLE '02185_range_dictionary_source_table'))
|
||||
LAYOUT(RANGE_HASHED(convert_null_range_bound_to_open 1))
|
||||
RANGE(MIN start MAX end)
|
||||
LIFETIME(0);
|
||||
|
||||
SELECT 'Dictionary convert_null_range_bound_to_open = 1';
|
||||
SELECT * FROM 02185_range_dictionary;
|
||||
SELECT dictGet('02185_range_dictionary', 'value', 0, 0);
|
||||
SELECT dictGet('02185_range_dictionary', 'value', 0, 5001);
|
||||
SELECT dictGet('02185_range_dictionary', 'value', 0, 10001);
|
||||
SELECT dictHas('02185_range_dictionary', 0, 0);
|
||||
SELECT dictHas('02185_range_dictionary', 0, 5001);
|
||||
SELECT dictHas('02185_range_dictionary', 0, 10001);
|
||||
|
||||
DROP DICTIONARY 02185_range_dictionary;
|
||||
|
||||
CREATE DICTIONARY 02185_range_dictionary
|
||||
(
|
||||
id UInt64,
|
||||
start UInt64,
|
||||
end UInt64,
|
||||
value String DEFAULT 'DefaultValue'
|
||||
)
|
||||
PRIMARY KEY id
|
||||
SOURCE(CLICKHOUSE(TABLE '02185_range_dictionary_source_table'))
|
||||
LAYOUT(RANGE_HASHED(convert_null_range_bound_to_open 0))
|
||||
RANGE(MIN start MAX end)
|
||||
LIFETIME(0);
|
||||
|
||||
SELECT 'Dictionary convert_null_range_bound_to_open = 0';
|
||||
SELECT * FROM 02185_range_dictionary;
|
||||
SELECT dictGet('02185_range_dictionary', 'value', 0, 0);
|
||||
SELECT dictGet('02185_range_dictionary', 'value', 0, 5001);
|
||||
SELECT dictGet('02185_range_dictionary', 'value', 0, 10001);
|
||||
SELECT dictHas('02185_range_dictionary', 0, 0);
|
||||
SELECT dictHas('02185_range_dictionary', 0, 5001);
|
||||
SELECT dictHas('02185_range_dictionary', 0, 10001);
|
||||
|
||||
DROP TABLE 02185_range_dictionary_source_table;
|
@ -0,0 +1,18 @@
|
||||
Source table
|
||||
1 2020-01-01 2100-01-01 Value0
|
||||
1 2020-01-02 2100-01-01 Value1
|
||||
1 2020-01-03 2100-01-01 Value2
|
||||
Dictionary .range_lookup_strategy = min
|
||||
1 2020-01-01 2100-01-01 Value0
|
||||
1 2020-01-02 2100-01-01 Value1
|
||||
1 2020-01-03 2100-01-01 Value2
|
||||
Value0
|
||||
Value0
|
||||
Value0
|
||||
Dictionary .range_lookup_strategy = max
|
||||
1 2020-01-01 2100-01-01 Value0
|
||||
1 2020-01-02 2100-01-01 Value1
|
||||
1 2020-01-03 2100-01-01 Value2
|
||||
Value0
|
||||
Value1
|
||||
Value2
|
@ -0,0 +1,64 @@
|
||||
DROP TABLE IF EXISTS 02186_range_dictionary_source_table;
|
||||
CREATE TABLE 02186_range_dictionary_source_table
|
||||
(
|
||||
id UInt64,
|
||||
start Date,
|
||||
end Date,
|
||||
value String
|
||||
)
|
||||
Engine = TinyLog;
|
||||
|
||||
INSERT INTO 02186_range_dictionary_source_table VALUES (1, '2020-01-01', '2100-01-01', 'Value0');
|
||||
INSERT INTO 02186_range_dictionary_source_table VALUES (1, '2020-01-02', '2100-01-01', 'Value1');
|
||||
INSERT INTO 02186_range_dictionary_source_table VALUES (1, '2020-01-03', '2100-01-01', 'Value2');
|
||||
|
||||
SELECT 'Source table';
|
||||
SELECT * FROM 02186_range_dictionary_source_table;
|
||||
|
||||
DROP DICTIONARY IF EXISTS 02186_range_dictionary;
|
||||
CREATE DICTIONARY 02186_range_dictionary
|
||||
(
|
||||
id UInt64,
|
||||
start Date,
|
||||
end Date,
|
||||
value String
|
||||
)
|
||||
PRIMARY KEY id
|
||||
SOURCE(CLICKHOUSE(TABLE '02186_range_dictionary_source_table'))
|
||||
LAYOUT(RANGE_HASHED(range_lookup_strategy 'min'))
|
||||
RANGE(MIN start MAX end)
|
||||
LIFETIME(0);
|
||||
|
||||
SELECT 'Dictionary .range_lookup_strategy = min';
|
||||
|
||||
SELECT * FROM 02186_range_dictionary;
|
||||
|
||||
select dictGet('02186_range_dictionary', 'value', toUInt64(1), toDate('2020-01-01'));
|
||||
select dictGet('02186_range_dictionary', 'value', toUInt64(1), toDate('2020-01-02'));
|
||||
select dictGet('02186_range_dictionary', 'value', toUInt64(1), toDate('2020-01-03'));
|
||||
|
||||
DROP DICTIONARY 02186_range_dictionary;
|
||||
|
||||
CREATE DICTIONARY 02186_range_dictionary
|
||||
(
|
||||
id UInt64,
|
||||
start Date,
|
||||
end Date,
|
||||
value String
|
||||
)
|
||||
PRIMARY KEY id
|
||||
SOURCE(CLICKHOUSE(TABLE '02186_range_dictionary_source_table'))
|
||||
LAYOUT(RANGE_HASHED(range_lookup_strategy 'max'))
|
||||
RANGE(MIN start MAX end)
|
||||
LIFETIME(0);
|
||||
|
||||
SELECT 'Dictionary .range_lookup_strategy = max';
|
||||
|
||||
SELECT * FROM 02186_range_dictionary;
|
||||
|
||||
select dictGet('02186_range_dictionary', 'value', toUInt64(1), toDate('2020-01-01'));
|
||||
select dictGet('02186_range_dictionary', 'value', toUInt64(1), toDate('2020-01-02'));
|
||||
select dictGet('02186_range_dictionary', 'value', toUInt64(1), toDate('2020-01-03'));
|
||||
|
||||
DROP DICTIONARY 02186_range_dictionary;
|
||||
DROP TABLE 02186_range_dictionary_source_table;
|
288
utils/c++expr
Executable file
@ -0,0 +1,288 @@
|
||||
#!/usr/bin/env bash
|
||||
set -e
|
||||
|
||||
usage() {
|
||||
cat <<EOF >&2
|
||||
USAGE: c++expr [-c CXX | -C | -I] [-i INCLUDE] [-b STEPS] [-t TESTS] [-o FILE] [-O CXX_OPTS...] [-g 'GLOBAL CODE'] 'MAIN CODE'
|
||||
OPTIONS:
|
||||
-c CXX use specified c++ compiler
|
||||
-C use cmake
|
||||
-I integrate into ClickHouse build tree in current directory
|
||||
-i INC add #include <INC>
|
||||
-l LIB link against LIB (only for -I or -C)
|
||||
-b STEPS_NUM make program to benchmark specified code snippet and run tests with STEPS_NUM each
|
||||
-b perf-top run infinite benchmark and show perf top
|
||||
-t TESTS_NUM make program to benchmark specified code snippet and run TESTS_NUM tests
|
||||
-o FILE do not run, just save binary executable file
|
||||
-O CXX_OPTS forward option to the compiler (e.g. -O "-O3 -std=c++20")
|
||||
EOF
|
||||
exit 1
|
||||
}
|
||||
|
||||
SOURCE_FILE=main.cpp
|
||||
GLOBAL=
|
||||
OUTPUT_EXECUTABLE=
|
||||
INCS="vector iostream typeinfo cstdlib cmath sys/time.h"
|
||||
LIBS=""
|
||||
BENCHMARK_STEPS=0
|
||||
RUN_PERFTOP=
|
||||
BENCHMARK_TESTS=5
|
||||
USE_CMAKE=
|
||||
USE_CLICKHOUSE=
|
||||
CXX=g++
|
||||
CXX_OPTS=
|
||||
CMD_PARAMS=
|
||||
|
||||
#
|
||||
# Parse command line
|
||||
#
|
||||
|
||||
if [ "$1" == "--help" ]; then usage; fi
|
||||
while getopts "vc:CIi:l:b:t:o:O:g:" OPT; do
|
||||
case "$OPT" in
|
||||
v) set -x; ;;
|
||||
c) CXX="$OPTARG"; ;;
|
||||
C) USE_CMAKE=y; ;;
|
||||
I) USE_CLICKHOUSE=y; LIBS="$LIBS clickhouse_common_io"; ;;
|
||||
i) INCS="$INCS $OPTARG"; ;;
|
||||
l) LIBS="$LIBS $OPTARG"; ;;
|
||||
b) if [ "$OPTARG" = perf-top ]; then BENCHMARK_STEPS=-1; RUN_PERFTOP=y; else BENCHMARK_STEPS="$OPTARG"; fi; ;;
|
||||
t) BENCHMARK_TESTS="$OPTARG"; ;;
|
||||
o) OUTPUT_EXECUTABLE="$OPTARG"; ;;
|
||||
O) CXX_OPTS="$CXX_OPTS $OPTARG"; ;;
|
||||
g) GLOBAL="$OPTARG"; ;;
|
||||
esac
|
||||
done
|
||||
shift $(( $OPTIND - 1 ))
|
||||
|
||||
#
|
||||
# Positional arguments
|
||||
#
|
||||
|
||||
EXPR=$1
|
||||
shift
|
||||
|
||||
if [ -z "$EXPR" ]; then usage; fi
|
||||
|
||||
#
|
||||
# Arguments forwarded to program should go after main code and before --
|
||||
#
|
||||
|
||||
while [ -n "$1" ] && [ "$1" != "--" ]; do
|
||||
CMD_PARAMS="$CMD_PARAMS $1"
|
||||
shift
|
||||
done
|
||||
if [ "$1" == "--" ]; then shift; fi
|
||||
|
||||
#
|
||||
# Setup workdir
|
||||
#
|
||||
|
||||
find_clickhouse_root () {
|
||||
local DIR="`pwd`"
|
||||
while [ $DIR != "/" ]; do
|
||||
if [ ! -e "$DIR/CMakeLists.txt" ]; then
|
||||
echo "error: $DIR has no CMakeLists.txt"
|
||||
return 1
|
||||
fi
|
||||
if grep "project(ClickHouse)" "$DIR/CMakeLists.txt" >/dev/null 2>&1; then
|
||||
echo $DIR
|
||||
return 0
|
||||
fi
|
||||
DIR="`dirname $DIR`"
|
||||
done
|
||||
echo "error: unable to find Clickhouse root folder"
|
||||
return 1
|
||||
}
|
||||
|
||||
find_clickhouse_build () {
|
||||
local CLICKHOUSE_ROOT="`find_clickhouse_root`"
|
||||
if [ -e "$CLICKHOUSE_ROOT/build/CMakeCache.txt" ]; then
|
||||
echo "$CLICKHOUSE_ROOT/build"
|
||||
return 0
|
||||
fi
|
||||
echo "error: $CLICKHOUSE_ROOT/build/CMakeCache.txt doesn't exist"
|
||||
return 1
|
||||
}
|
||||
|
||||
CALL_DIR=`pwd`
|
||||
EXECUTABLE=cppexpr_$$
|
||||
EXECUTABLE_DIR=.
|
||||
|
||||
if [ -n "$USE_CLICKHOUSE" ]; then
|
||||
SUBDIR=cppexpr_$$
|
||||
WORKDIR=$CALL_DIR/$SUBDIR
|
||||
if [ ! -e $CALL_DIR/CMakeLists.txt ]; then
|
||||
echo "error: $CALL_DIR/CMakeLists.txt is required for integration" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
CLICKHOUSE_ROOT="`find_clickhouse_root`"
|
||||
BUILD_ROOT="`find_clickhouse_build`"
|
||||
CLICKHOUSE_PATH="${WORKDIR/$CLICKHOUSE_ROOT}"
|
||||
EXECUTABLE_DIR="${BUILD_ROOT}${CLICKHOUSE_PATH}"
|
||||
|
||||
if [ -z "$CLICKHOUSE_ROOT" ] || [ -z "$BUILD_ROOT" ] || [ -z "$CLICKHOUSE_PATH" ]; then
|
||||
echo "error: unable to locate ClickHouse" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
cp $CALL_DIR/CMakeLists.txt $CALL_DIR/CMakeLists.txt.backup.$$
|
||||
echo "add_subdirectory ($SUBDIR)" >>$CALL_DIR/CMakeLists.txt
|
||||
cleanup() {
|
||||
mv $CALL_DIR/CMakeLists.txt.backup.$$ $CALL_DIR/CMakeLists.txt
|
||||
rm -rf $WORKDIR
|
||||
rm -rf ${BUILD_ROOT}${CLICKHOUSE_PATH}
|
||||
}
|
||||
else
|
||||
WORKDIR=/var/tmp/cppexpr_$$
|
||||
cleanup() {
|
||||
rm -rf $WORKDIR
|
||||
}
|
||||
fi
|
||||
|
||||
mkdir -p $WORKDIR
|
||||
cd $WORKDIR
|
||||
|
||||
#
|
||||
# Generate CMakeLists.txt
|
||||
#
|
||||
if [ -n "$USE_CMAKE" ]; then
|
||||
cat <<EOF >>CMakeLists.txt
|
||||
project(CppExpr)
|
||||
SET(PROJECT_NAME CppExpr)
|
||||
SET(CMAKE_INCLUDE_CURRENT_DIR TRUE)
|
||||
cmake_minimum_required(VERSION 2.8)
|
||||
set(CMAKE_CXX_FLAGS -fPIC)
|
||||
set(CMAKE_C_FLAGS -fPIC)
|
||||
set(CMAKE_BUILD_TYPE Release)
|
||||
set(SOURCES $SOURCE_FILE)
|
||||
add_executable($EXECUTABLE \${SOURCES})
|
||||
EOF
|
||||
fi
|
||||
|
||||
#
|
||||
# Generate CMakeLists.txt for integration
|
||||
#
|
||||
if [ -n "$USE_CLICKHOUSE" ]; then
|
||||
cat <<EOF >>CMakeLists.txt
|
||||
add_executable($EXECUTABLE $SOURCE_FILE)
|
||||
EOF
|
||||
fi
|
||||
|
||||
#
|
||||
# Add libraries to CMakeLists.txt
|
||||
#
|
||||
if [ -n "$LIBS" ]; then
|
||||
cat <<EOF >>CMakeLists.txt
|
||||
target_link_libraries($EXECUTABLE PRIVATE $LIBS)
|
||||
EOF
|
||||
fi
|
||||
|
||||
#
|
||||
# Generate source code
|
||||
#
|
||||
>$SOURCE_FILE
|
||||
for INC in $INCS; do
|
||||
echo "#include <$INC>" >> $SOURCE_FILE
|
||||
done
|
||||
cat <<EOF >>$SOURCE_FILE
|
||||
|
||||
#define OUT(expr) std::cout << #expr << " -> " << (expr) << std::endl;
|
||||
size_t max_tests = $BENCHMARK_TESTS;
|
||||
size_t max_steps = $BENCHMARK_STEPS;
|
||||
$GLOBAL
|
||||
int main(int argc, char** argv) {
|
||||
(void)argc; (void)argv;
|
||||
try {
|
||||
EOF
|
||||
|
||||
if [ $BENCHMARK_STEPS -eq 0 ]; then
|
||||
cat <<EOF >>$SOURCE_FILE
|
||||
$EXPR
|
||||
EOF
|
||||
else
|
||||
cat <<EOF >>$SOURCE_FILE
|
||||
std::cout << "Steps per test: " << max_steps << std::endl;
|
||||
if (max_steps == 0) max_steps = 1;
|
||||
double total = 0.0;
|
||||
for (size_t test = 0; test < max_tests; test++) {
|
||||
timeval beg, end;
|
||||
gettimeofday(&beg, nullptr);
|
||||
for (size_t step = 0; step < max_steps; step++) {
|
||||
asm volatile("" ::: "memory");
|
||||
$EXPR
|
||||
}
|
||||
gettimeofday(&end, nullptr);
|
||||
double interval = (end.tv_sec - beg.tv_sec)*1e6 + (end.tv_usec - beg.tv_usec);
|
||||
std::cout << "Test #" << test << ": " << interval / max_steps << " us\t" << max_steps * 1e6 / interval << " sps" << std::endl;
|
||||
total += interval;
|
||||
}
|
||||
std::cout << "Average: " << total / max_tests / max_steps << " us\t" << max_steps * 1e6 / (total / max_tests) << " sps" << std::endl;
|
||||
EOF
|
||||
fi
|
||||
|
||||
cat <<EOF >>$SOURCE_FILE
|
||||
return 0;
|
||||
} catch (std::exception& e) {
|
||||
std::cerr << "unhandled exception (" << typeid(e).name() << "):" << e.what() << std::endl;
|
||||
} catch (...) {
|
||||
std::cerr << "unknown unhandled exception\n";
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
#ifdef OUT
|
||||
#undef OUT
|
||||
#endif
|
||||
EOF
|
||||
|
||||
#
|
||||
# Compile
|
||||
#
|
||||
if [ -n "$USE_CMAKE" ]; then
|
||||
if ! (cmake . && make); then
|
||||
cat -n $SOURCE_FILE
|
||||
cleanup
|
||||
exit 1
|
||||
fi
|
||||
elif [ -n "$USE_CLICKHOUSE" ]; then
|
||||
if ! (cd $BUILD_ROOT && ninja $EXECUTABLE) >stdout.log 2>stderr.log; then
|
||||
cat stdout.log
|
||||
cat stderr.log >&2
|
||||
cat -n $SOURCE_FILE
|
||||
cleanup
|
||||
exit 1
|
||||
fi
|
||||
else
|
||||
RET=0
|
||||
$CXX $CXX_OPTS -I$CALL_DIR -o $EXECUTABLE $SOURCE_FILE || RET=$?
|
||||
if [ $RET -ne 0 ]; then
|
||||
cat -n $SOURCE_FILE
|
||||
cleanup
|
||||
exit $RET
|
||||
fi
|
||||
fi
|
||||
|
||||
#
|
||||
# Execute
|
||||
#
|
||||
RET=0
|
||||
if [ -z "$OUTPUT_EXECUTABLE" ]; then
|
||||
if [ -z "$RUN_PERFTOP" ]; then
|
||||
"$@" $EXECUTABLE_DIR/$EXECUTABLE $CMD_PARAMS || RET=$?
|
||||
else
|
||||
"$@" $EXECUTABLE_DIR/$EXECUTABLE $CMD_PARAMS &
|
||||
PID=$!
|
||||
perf top -p $PID
|
||||
kill $PID
|
||||
fi
|
||||
else
|
||||
cp $EXECUTABLE_DIR/$EXECUTABLE $CALL_DIR/$OUTPUT_EXECUTABLE
|
||||
fi
|
||||
|
||||
#
|
||||
# Cleanup
|
||||
#
|
||||
cleanup
|
||||
echo "Exit code: $RET"
|
||||
exit $RET
|
@ -3,6 +3,7 @@ title: 'Evolution of Data Structures in Yandex.Metrica'
|
||||
image: 'https://blog-images.clickhouse.com/en/2016/evolution-of-data-structures-in-yandex-metrica/main.jpg'
|
||||
date: '2016-12-13'
|
||||
tags: ['Yandex.Metrica', 'data structures', 'LSM tree', 'columnar storage']
|
||||
author: 'Alexey Milovidov'
|
||||
---
|
||||
|
||||
[Yandex.Metrica](https://metrica.yandex.com/) takes in a stream of data representing events that took place on sites or on apps. Our task is to keep this data and present it in an analyzable form. The real challenge lies in trying to determine what form the processed results should be saved in so that they are easy to work with. During the development process, we had to completely change our approach to data storage organization several times. We started with MyISAM tables, then used LSM-trees, and eventually came up with a column-oriented database, ClickHouse.
|
||||
@ -104,5 +105,3 @@ Effective hardware utilization is very important to us. In our experience, when
|
||||
To maximize efficiency, it's important to customize your solution to meet the needs of a specific type of workload. There is no data structure that copes well with completely different scenarios. For example, it's clear that key-value databases don't work for analytical queries. The greater the load on the system, the narrower the specialization required. One should not be afraid to use completely different data structures for different tasks.
|
||||
|
||||
We were able to set things up so that Yandex.Metrica's hardware was relatively inexpensive. This has allowed us to offer the service free of charge to even very large sites and mobile apps, even larger than Yandex's own, while competitors typically start asking for a paid subscription plan.
|
||||
|
||||
|
||||
|
@ -3,6 +3,7 @@ title: 'Yandex Opensources ClickHouse'
|
||||
image: 'https://blog-images.clickhouse.com/en/2016/yandex-opensources-clickhouse/main.jpg'
|
||||
date: '2016-06-15'
|
||||
tags: ['announcement', 'GitHub', 'license']
|
||||
author: 'Alexey Milovidov'
|
||||
---
|
||||
|
||||
Today [analytical DBMS ClickHouse](https://clickhouse.com/) initially developed internally at Yandex, became available to everyone. Source code is published on [GitHub](https://github.com/ClickHouse/ClickHouse) under Apache 2.0 license.
|
||||
|
@ -3,6 +3,7 @@ title: 'ClickHouse at Data@Scale 2017'
|
||||
image: 'https://blog-images.clickhouse.com/en/2017/clickhouse-at-data-scale-2017/main.jpg'
|
||||
date: '2017-06-15'
|
||||
tags: ['conference', 'Seattle', 'USA', 'America', 'events']
|
||||
author: 'Alexey Milovidov'
|
||||
---
|
||||
|
||||
![iframe](https://www.youtube.com/embed/bSyQahMVZ7w)
|
||||
|
@ -3,6 +3,7 @@ title: 'How to speed up LZ4 decompression in ClickHouse?'
|
||||
image: 'https://blog-images.clickhouse.com/en/2019/how-to-speed-up-lz4-decompression-in-clickhouse/main.jpg'
|
||||
date: '2019-06-25'
|
||||
tags: ['performance', 'lz4', 'article', 'decompression']
|
||||
author: 'Alexey Milovidov'
|
||||
---
|
||||
|
||||
When you run queries in [ClickHouse](https://clickhouse.com/), you might notice that the profiler often shows the `LZ_decompress_fast` function near the top. What is going on? This question had us wondering how to choose the best compression algorithm.
|
||||
|
@ -3,6 +3,7 @@ title: 'Five Methods For Database Obfuscation'
|
||||
image: 'https://blog-images.clickhouse.com/en/2020/five-methods-for-database-obfuscation/main.jpg'
|
||||
date: '2020-01-27'
|
||||
tags: ['article', 'obfuscation']
|
||||
author: 'Alexey Milovidov'
|
||||
---
|
||||
|
||||
ClickHouse users already know that its biggest advantage is its high-speed processing of analytical queries. But claims like this need to be confirmed with reliable performance testing.
|
||||
|
@ -3,6 +3,7 @@ title: 'Package Repository Behind CDN'
|
||||
image: 'https://blog-images.clickhouse.com/en/2020/package-repository-behind-cdn/main.jpg'
|
||||
date: '2020-07-02'
|
||||
tags: ['article', 'CDN', 'Cloudflare', 'repository', 'deb', 'rpm', 'tgz']
|
||||
author: 'Ivan Blinkov'
|
||||
---
|
||||
|
||||
At its initial open-source launch, ClickHouse packages were published at an independent repository implemented on Yandex infrastructure. We'd love to use the default repositories of Linux distributions, but, unfortunately, they have their own strict rules on third-party library usage and software compilation options. These rules happen to contradict how ClickHouse is produced. In 2018 ClickHouse was added to the [official Debian repository](https://packages.debian.org/sid/clickhouse-server) as an experiment, but it didn't get much traction. Adapting to those rules ended up producing something closer to a demo version of ClickHouse, with crippled performance and limited features.
|
||||
@ -68,4 +69,3 @@ Or you can take a look at all key charts for `repo.clickhouse.com` together on a
|
||||
* CDN is a must-have if you want people from all over the world to download some artifacts that you produce. Beware the huge pay-for-traffic bills from most CDN providers though.
|
||||
* Generic technical system metrics and drill-downs are a good starting point, but not always enough.
|
||||
* Serverless is not a myth. Nowadays it is indeed possible to build useful products by just integrating various infrastructure services together, without any dedicated servers to take care of.
|
||||
|
||||
|
@ -2,7 +2,7 @@
|
||||
title: 'Running ClickHouse on an Android phone'
|
||||
image: 'https://blog-images.clickhouse.com/en/2020/pixel-benchmark/main.jpg'
|
||||
date: '2020-07-16'
|
||||
author: '[Alexander Kuzmenkov](https://github.com/akuzm)'
|
||||
author: 'Alexander Kuzmenkov'
|
||||
tags: ['Android', 'benchmark', 'experiment']
|
||||
---
|
||||
|
||||
|
@ -2,7 +2,7 @@
|
||||
title: 'The ClickHouse Community'
|
||||
image: 'https://blog-images.clickhouse.com/en/2020/the-clickhouse-community/clickhouse-community-history.png'
|
||||
date: '2020-12-10'
|
||||
author: '[Robert Hodges](https://github.com/hodgesrm)'
|
||||
author: 'Robert Hodges'
|
||||
tags: ['community', 'open source', 'telegram', 'meetup']
|
||||
---
|
||||
|
||||
|
@ -2,7 +2,7 @@
|
||||
title: 'Introducing ClickHouse, Inc.'
|
||||
image: 'https://blog-images.clickhouse.com/en/2021/clickhouse-inc/home.png'
|
||||
date: '2021-09-20'
|
||||
author: '[Alexey Milovidov](https://github.com/alexey-milovidov)'
|
||||
author: 'Alexey Milovidov'
|
||||
tags: ['company', 'incorporation', 'yandex', 'community']
|
||||
---
|
||||
|
||||
|
@ -2,7 +2,7 @@
title: 'ClickHouse Moscow Meetup October 19, 2021'
image: 'https://blog-images.clickhouse.com/en/2021/clickhouse-october-moscow-meetup/featured.jpg'
date: '2021-11-11'
-author: '[Rich Raposa](https://github.com/rfraposa)'
+author: 'Rich Raposa'
tags: ['company', 'community']
---
@ -2,7 +2,7 @@
title: 'ClickHouse raises a $250M Series B at a $2B valuation...and we are hiring'
image: 'https://blog-images.clickhouse.com/en/2021/clickhouse-raises-250m-series-b/featured.jpg'
date: '2021-10-28'
-author: '[Dorota Szeremeta](https://www.linkedin.com/in/dorota-szeremeta-a849b7/)'
+author: 'Dorota Szeremeta'
tags: ['company', 'investment']
---
@ -2,7 +2,7 @@
title: 'ClickHouse v21.10 Released'
image: 'https://blog-images.clickhouse.com/en/2021/clickhouse-v21-10/featured.jpg'
date: '2021-10-14'
-author: '[Rich Raposa](https://github.com/rfraposa), [Alexey Milovidov](https://github.com/alexey-milovidov)'
+author: 'Rich Raposa, Alexey Milovidov'
tags: ['company', 'community']
---
@ -2,7 +2,7 @@
title: 'ClickHouse v21.11 Released'
image: 'https://blog-images.clickhouse.com/en/2021/clickhouse-v21-11/featured-dog.jpg'
date: '2021-11-11'
-author: '[Rich Raposa](https://github.com/rfraposa), [Alexey Milovidov](https://github.com/alexey-milovidov)'
+author: 'Rich Raposa, Alexey Milovidov'
tags: ['company', 'community']
---
@ -2,7 +2,7 @@
title: 'What''s New in ClickHouse 21.12'
image: 'https://blog-images.clickhouse.com/en/2021/clickhouse-v21-12/featured.jpg'
date: '2021-12-16'
-author: '[Alexey Milovidov](https://github.com/alexey-milovidov), [Christoph Wurm](https://github.com/cwurm)'
+author: 'Alexey Milovidov, Christoph Wurm'
tags: ['company', 'community']
---
@ -2,7 +2,7 @@
title: 'The Tests Are Passing, Why Would I Read The Diff Again?'
image: 'https://blog-images.clickhouse.com/en/2021/code-review/two-ducks.jpg'
date: '2021-04-14'
-author: '[Alexander Kuzmenkov](https://github.com/akuzm)'
+author: 'Alexander Kuzmenkov'
tags: ['code review', 'development']
---
@ -2,7 +2,7 @@
title: 'Fuzzing ClickHouse'
image: 'https://blog-images.clickhouse.com/en/2021/fuzzing-clickhouse/some-checks-were-not-successful.png'
date: '2021-03-11'
-author: '[Alexander Kuzmenkov](https://github.com/akuzm)'
+author: 'Alexander Kuzmenkov'
tags: ['fuzzing', 'testing']
---
@ -56,6 +56,3 @@ To see for yourself how the fuzzer works, you only need the normal ClickHouse cl
## Other Fuzzers
The AST-based fuzzer we discussed is only one of the many kinds of fuzzers we have in ClickHouse. There is a [talk](https://www.youtube.com/watch?v=GbmK84ZwSeI&t=4481s) (in Russian, [slides are here](https://presentations.clickhouse.com/cpp_siberia_2021/)) by Alexey Milovidov that explores all the fuzzers we have. Another interesting recent development is the application of the pivoted query synthesis technique, implemented in [SQLancer](https://github.com/sqlancer/sqlancer), to ClickHouse. The authors are going to give [a talk about this](https://heisenbug-piter.ru/2021/spb/talks/nr1cwknssdodjkqgzsbvh/) soon, so stay tuned.
_2021-03-11 [Alexander Kuzmenkov](https://github.com/akuzm)_
@ -2,7 +2,7 @@
title: 'How to Enable Predictive Capabilities in Clickhouse Databases'
image: 'https://blog-images.clickhouse.com/en/2021/mindsdb-enables-predictive-capabilities-in-clickHouse/featured.png'
date: '2021-12-14'
-author: '[Ilya Yatsishin](https://github.com/qoega)'
+author: 'Ilya Yatsishin'
tags: ['company', 'how-to', 'MindsDB']
---
@ -2,7 +2,7 @@
title: 'Testing the Performance of ClickHouse'
image: 'https://blog-images.clickhouse.com/en/2021/performance-testing-1/chebu-crop.jpg'
date: '2021-08-19'
-author: '[Alexander Kuzmenkov](https://github.com/akuzm)'
+author: 'Alexander Kuzmenkov'
tags: ['testing', 'performance']
---
@ -2,7 +2,7 @@
title: 'A journey to io_uring, AIO and modern storage devices'
image: 'https://blog-images.clickhouse.com/en/2021/reading-from-external-memory/all-single-read.png'
date: '2021-03-09'
-author: '[Ruslan Savchenko](https://github.com/savrus)'
+author: 'Ruslan Savchenko'
tags: ['Linux', 'benchmark', 'experiment']
---
@ -67,4 +67,3 @@ We see that solid state device latencies are far better than HDD. For a single r
So, how about testing modern IO interfaces in Linux? Continue reading the [full article](https://arxiv.org/pdf/2102.11198).
2021-03-09 [Ruslan Savchenko](https://github.com/savrus)
Some files were not shown because too many files have changed in this diff.