Mirror of https://github.com/ClickHouse/ClickHouse.git (synced 2024-11-21 23:21:59 +00:00)

Commit c9a1c9a896: Merge branch 'master' into system_on_cluster

.gitmodules (vendored): 6 changed lines
@@ -217,6 +217,9 @@
[submodule "contrib/yaml-cpp"]
    path = contrib/yaml-cpp
    url = https://github.com/ClickHouse-Extras/yaml-cpp.git
[submodule "contrib/cld2"]
    path = contrib/cld2
    url = https://github.com/ClickHouse-Extras/cld2.git
[submodule "contrib/libstemmer_c"]
    path = contrib/libstemmer_c
    url = https://github.com/ClickHouse-Extras/libstemmer_c.git
@@ -247,6 +250,9 @@
[submodule "contrib/sysroot"]
    path = contrib/sysroot
    url = https://github.com/ClickHouse-Extras/sysroot.git
[submodule "contrib/nlp-data"]
    path = contrib/nlp-data
    url = https://github.com/ClickHouse-Extras/nlp-data.git
[submodule "contrib/hive-metastore"]
    path = contrib/hive-metastore
    url = https://github.com/ClickHouse-Extras/hive-metastore
@@ -247,8 +247,6 @@ endif()

if (CMAKE_BUILD_TYPE_UC STREQUAL "DEBUG")
    set(USE_DEBUG_HELPERS ON)
else ()
    set(USE_DEBUG_HELPERS ON)
endif()
option(USE_DEBUG_HELPERS "Enable debug helpers" ${USE_DEBUG_HELPERS})
LICENSE: 4 changed lines

@@ -1,4 +1,4 @@
Copyright 2016-2021 ClickHouse, Inc.
Copyright 2016-2022 ClickHouse, Inc.

                                 Apache License
                           Version 2.0, January 2004
@@ -188,7 +188,7 @@ Copyright 2016-2021 ClickHouse, Inc.
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright 2016-2021 ClickHouse, Inc.
   Copyright 2016-2022 ClickHouse, Inc.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
@@ -2,7 +2,9 @@

#include <iostream>
#include <string_view>
#include <algorithm>

#include <cassert>
#include <string.h>
#include <unistd.h>
#include <sys/select.h>
@@ -34,13 +36,37 @@ bool hasInputData()
    return select(1, &fds, nullptr, nullptr, &timeout) == 1;
}

struct NoCaseCompare
{
    bool operator()(const std::string & str1, const std::string & str2)
    {
        return std::lexicographical_compare(begin(str1), end(str1), begin(str2), end(str2), [](const char c1, const char c2)
        {
            return std::tolower(c1) < std::tolower(c2);
        });
    }
};

using Words = std::vector<std::string>;
template <class Compare>
void addNewWords(Words & to, const Words & from, Compare comp)
{
    size_t old_size = to.size();
    size_t new_size = old_size + from.size();

    to.reserve(new_size);
    to.insert(to.end(), from.begin(), from.end());
    auto middle = to.begin() + old_size;
    std::inplace_merge(to.begin(), middle, to.end(), comp);

    auto last_unique = std::unique(to.begin(), to.end());
    to.erase(last_unique, to.end());
}

std::optional<LineReader::Suggest::WordsRange> LineReader::Suggest::getCompletions(const String & prefix, size_t prefix_length) const
{
    if (!ready)
        return std::nullopt;
}

replxx::Replxx::completions_t LineReader::Suggest::getCompletions(const String & prefix, size_t prefix_length)
{
    std::string_view last_word;

    auto last_word_pos = prefix.find_last_of(word_break_characters);
@@ -48,21 +74,45 @@ std::optional<LineReader::Suggest::WordsRange> LineReader::Suggest::getCompletio
        last_word = prefix;
    else
        last_word = std::string_view(prefix).substr(last_word_pos + 1, std::string::npos);

    /// last_word can be empty.

    std::pair<Words::const_iterator, Words::const_iterator> range;

    std::lock_guard lock(mutex);

    /// Only perform case sensitive completion when the prefix string contains any uppercase characters
    if (std::none_of(prefix.begin(), prefix.end(), [&](auto c) { return c >= 'A' && c <= 'Z'; }))
        return std::equal_range(
        range = std::equal_range(
            words_no_case.begin(), words_no_case.end(), last_word, [prefix_length](std::string_view s, std::string_view prefix_searched)
            {
                return strncasecmp(s.data(), prefix_searched.data(), prefix_length) < 0;
            });
    else
        return std::equal_range(words.begin(), words.end(), last_word, [prefix_length](std::string_view s, std::string_view prefix_searched)
        range = std::equal_range(words.begin(), words.end(), last_word, [prefix_length](std::string_view s, std::string_view prefix_searched)
        {
            return strncmp(s.data(), prefix_searched.data(), prefix_length) < 0;
        });

    return replxx::Replxx::completions_t(range.first, range.second);
}

void LineReader::Suggest::addWords(Words && new_words)
{
    Words new_words_no_case = new_words;
    if (!new_words.empty())
    {
        std::sort(new_words.begin(), new_words.end());
        std::sort(new_words_no_case.begin(), new_words_no_case.end(), NoCaseCompare{});
    }

    {
        std::lock_guard lock(mutex);
        addNewWords(words, new_words, std::less<std::string>{});
        addNewWords(words_no_case, new_words_no_case, NoCaseCompare{});
    }

    assert(std::is_sorted(words.begin(), words.end()));
    assert(std::is_sorted(words_no_case.begin(), words_no_case.end(), NoCaseCompare{}));
}

LineReader::LineReader(const String & history_file_path_, bool multiline_, Patterns extenders_, Patterns delimiters_)
@@ -1,10 +1,12 @@
#pragma once

#include <base/types.h>

#include <mutex>
#include <atomic>
#include <vector>
#include <optional>
#include <replxx.hxx>

#include <base/types.h>

class LineReader
{
@@ -12,14 +14,16 @@ public:
    struct Suggest
    {
        using Words = std::vector<std::string>;
        using WordsRange = std::pair<Words::const_iterator, Words::const_iterator>;

        /// Get vector for the matched range of words if any.
        replxx::Replxx::completions_t getCompletions(const String & prefix, size_t prefix_length);
        void addWords(Words && new_words);

    private:
        Words words;
        Words words_no_case;
        std::atomic<bool> ready{false};

        /// Get iterators for the matched range of words if any.
        std::optional<WordsRange> getCompletions(const String & prefix, size_t prefix_length) const;
        std::mutex mutex;
    };

    using Patterns = std::vector<const char *>;
@@ -133,7 +133,7 @@ void convertHistoryFile(const std::string & path, replxx::Replxx & rx)
}

ReplxxLineReader::ReplxxLineReader(
    const Suggest & suggest,
    Suggest & suggest,
    const String & history_file_path_,
    bool multiline_,
    Patterns extenders_,
@@ -179,9 +179,7 @@ ReplxxLineReader::ReplxxLineReader(

    auto callback = [&suggest] (const String & context, size_t context_size)
    {
        if (auto range = suggest.getCompletions(context, context_size))
            return Replxx::completions_t(range->first, range->second);
        return Replxx::completions_t();
        return suggest.getCompletions(context, context_size);
    };

    rx.set_completion_callback(callback);
@@ -9,7 +9,7 @@ class ReplxxLineReader : public LineReader
{
public:
    ReplxxLineReader(
        const Suggest & suggest,
        Suggest & suggest,
        const String & history_file_path,
        bool multiline,
        Patterns extenders_,
@@ -1,59 +1,80 @@
#include <sys/auxv.h>
#include "atomic.h"
#include <unistd.h> // __environ
#include <sys/auxv.h>
#include <fcntl.h> // open
#include <sys/stat.h> // O_RDONLY
#include <unistd.h> // read, close
#include <stdlib.h> // ssize_t
#include <stdio.h> // perror, fprintf
#include <link.h> // ElfW
#include <errno.h>

// We don't have libc struct available here. Compute aux vector manually.
static unsigned long * __auxv = NULL;
static unsigned long __auxv_secure = 0;
#define ARRAY_SIZE(a) sizeof((a))/sizeof((a[0]))

static size_t __find_auxv(unsigned long type)
{
    size_t i;
    for (i = 0; __auxv[i]; i += 2)
    {
        if (__auxv[i] == type)
            return i + 1;
    }
    return (size_t) -1;
}
// We don't have libc struct available here.
// Compute aux vector manually (from /proc/self/auxv).
//
// Right now there is only 51 AT_* constants,
// so 64 should be enough until this implementation will be replaced with musl.
static unsigned long __auxv[64];
static unsigned long __auxv_secure = 0;

unsigned long __getauxval(unsigned long type)
{
    if (type == AT_SECURE)
        return __auxv_secure;

    if (__auxv)
    if (type >= ARRAY_SIZE(__auxv))
    {
        size_t index = __find_auxv(type);
        if (index != ((size_t) -1))
            return __auxv[index];
    }

    errno = ENOENT;
    return 0;
}

static void * volatile getauxval_func;

static unsigned long __auxv_init(unsigned long type)
{
    if (!__environ)
    {
        // __environ is not initialized yet so we can't initialize __auxv right now.
        // That's normally occurred only when getauxval() is called from some sanitizer's internal code.
        errno = ENOENT;
        return 0;
    }

    // Initialize __auxv and __auxv_secure.
    size_t i;
    for (i = 0; __environ[i]; i++);
    __auxv = (unsigned long *) (__environ + i + 1);
    return __auxv[type];
}

    size_t secure_idx = __find_auxv(AT_SECURE);
    if (secure_idx != ((size_t) -1))
        __auxv_secure = __auxv[secure_idx];
static void * volatile getauxval_func;

ssize_t __retry_read(int fd, void *buf, size_t count)
{
    for (;;)
    {
        ssize_t ret = read(fd, buf, count);
        if (ret == -1)
        {
            if (errno == EINTR)
                continue;
            perror("Cannot read /proc/self/auxv");
            abort();
        }
        return ret;
    }
}
static unsigned long __auxv_init(unsigned long type)
{
    // od -t dL /proc/self/auxv
    int fd = open("/proc/self/auxv", O_RDONLY);
    if (fd == -1) {
        perror("Cannot read /proc/self/auxv (likely kernel is too old or procfs is not mounted)");
        abort();
    }

    ElfW(auxv_t) aux;

    /// NOTE: sizeof(aux) is very small (less then PAGE_SIZE), so partial read should not be possible.
    _Static_assert(sizeof(aux) < 4096, "Unexpected sizeof(aux)");
    while (__retry_read(fd, &aux, sizeof(aux)) == sizeof(aux))
    {
        if (aux.a_type >= ARRAY_SIZE(__auxv))
        {
            fprintf(stderr, "AT_* is out of range: %li (maximum allowed is %zu)\n", aux.a_type, ARRAY_SIZE(__auxv));
            abort();
        }
        __auxv[aux.a_type] = aux.a_un.a_val;
    }
    close(fd);

    // AT_SECURE
    __auxv_secure = __getauxval(AT_SECURE);

    // Now we've initialized __auxv, next time getauxval() will only call __get_auxval().
    a_cas_p(&getauxval_func, (void *)__auxv_init, (void *)__getauxval);
contrib/CMakeLists.txt (vendored): 2 changed lines

@@ -140,6 +140,8 @@ if (ENABLE_NLP)
    add_contrib (libstemmer-c-cmake libstemmer_c)
    add_contrib (wordnet-blast-cmake wordnet-blast)
    add_contrib (lemmagen-c-cmake lemmagen-c)
    add_contrib (nlp-data-cmake nlp-data)
    add_contrib (cld2-cmake cld2)
endif()

add_contrib (sqlite-cmake sqlite-amalgamation)
contrib/cld2 (vendored submodule): 1 changed line

@@ -0,0 +1 @@
Subproject commit bc6d493a2f64ed1fc1c4c4b4294a542a04e04217
contrib/cld2-cmake/CMakeLists.txt (new file): 33 lines

@@ -0,0 +1,33 @@
set (LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/cld2")

set (SRCS
    "${LIBRARY_DIR}/internal/cldutil.cc"
    "${LIBRARY_DIR}/internal/compact_lang_det.cc"
    "${LIBRARY_DIR}/internal/cldutil_shared.cc"
    "${LIBRARY_DIR}/internal/compact_lang_det_hint_code.cc"
    "${LIBRARY_DIR}/internal/compact_lang_det_impl.cc"
    "${LIBRARY_DIR}/internal/debug.cc"
    "${LIBRARY_DIR}/internal/fixunicodevalue.cc"
    "${LIBRARY_DIR}/internal/generated_entities.cc"
    "${LIBRARY_DIR}/internal/generated_language.cc"
    "${LIBRARY_DIR}/internal/generated_ulscript.cc"
    "${LIBRARY_DIR}/internal/getonescriptspan.cc"
    "${LIBRARY_DIR}/internal/lang_script.cc"
    "${LIBRARY_DIR}/internal/offsetmap.cc"
    "${LIBRARY_DIR}/internal/scoreonescriptspan.cc"
    "${LIBRARY_DIR}/internal/tote.cc"
    "${LIBRARY_DIR}/internal/utf8statetable.cc"
    "${LIBRARY_DIR}/internal/cld_generated_cjk_uni_prop_80.cc"
    "${LIBRARY_DIR}/internal/cld2_generated_cjk_compatible.cc"
    "${LIBRARY_DIR}/internal/cld_generated_cjk_delta_bi_4.cc"
    "${LIBRARY_DIR}/internal/generated_distinct_bi_0.cc"
    "${LIBRARY_DIR}/internal/cld2_generated_quadchrome_2.cc"
    "${LIBRARY_DIR}/internal/cld2_generated_deltaoctachrome.cc"
    "${LIBRARY_DIR}/internal/cld2_generated_distinctoctachrome.cc"
    "${LIBRARY_DIR}/internal/cld_generated_score_quad_octa_2.cc"
)
add_library(_cld2 ${SRCS})
set_property(TARGET _cld2 PROPERTY POSITION_INDEPENDENT_CODE ON)
target_compile_options (_cld2 PRIVATE -Wno-reserved-id-macro -Wno-c++11-narrowing)
target_include_directories(_cld2 SYSTEM BEFORE PUBLIC "${LIBRARY_DIR}/public")
add_library(ch_contrib::cld2 ALIAS _cld2)
contrib/nlp-data (vendored submodule): 1 changed line

@@ -0,0 +1 @@
Subproject commit 5591f91f5e748cba8fb9ef81564176feae774853
contrib/nlp-data-cmake/CMakeLists.txt (new file): 15 lines

@@ -0,0 +1,15 @@
include(${ClickHouse_SOURCE_DIR}/cmake/embed_binary.cmake)

set(LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/nlp-data")

add_library (_nlp_data INTERFACE)

clickhouse_embed_binaries(
    TARGET nlp_dictionaries
    RESOURCE_DIR "${LIBRARY_DIR}"
    RESOURCES charset.zst tonality_ru.zst programming.zst
)

add_dependencies(_nlp_data nlp_dictionaries)
target_link_libraries(_nlp_data INTERFACE "-Wl,${WHOLE_ARCHIVE} $<TARGET_FILE:nlp_dictionaries> -Wl,${NO_WHOLE_ARCHIVE}")
add_library(ch_contrib::nlp_data ALIAS _nlp_data)
contrib/orc (vendored): 2 changed lines

@@ -1 +1 @@
Subproject commit 0a936f6bbdb9303308973073f8623b5a8d82eae1
Subproject commit f9a393ed2433a60034795284f82d093b348f2102
@@ -65,7 +65,12 @@ do
    # check if variable not empty
    [ -z "$dir" ] && continue
    # ensure directories exist
    if ! mkdir -p "$dir"; then
    if [ "$DO_CHOWN" = "1" ]; then
        mkdir="mkdir"
    else
        mkdir="$gosu mkdir"
    fi
    if ! $mkdir -p "$dir"; then
        echo "Couldn't create necessary directory: $dir"
        exit 1
    fi
@@ -27,6 +27,7 @@ toc_title: Client Libraries
- Go
    - [clickhouse](https://github.com/kshvakov/clickhouse/)
    - [go-clickhouse](https://github.com/roistat/go-clickhouse)
    - [chconn](https://github.com/vahid-sohrabloo/chconn)
    - [mailrugo-clickhouse](https://github.com/mailru/go-clickhouse)
    - [golang-clickhouse](https://github.com/leprosus/golang-clickhouse)
- Swift
@@ -43,7 +43,7 @@ User host is a host from which a connection to ClickHouse server could be establ
- `HOST ANY` — User can connect from any location. This is a default option.
- `HOST LOCAL` — User can connect only locally.
- `HOST NAME 'fqdn'` — User host can be specified as FQDN. For example, `HOST NAME 'mysite.com'`.
- `HOST NAME REGEXP 'regexp'` — You can use [pcre](http://www.pcre.org/) regular expressions when specifying user hosts. For example, `HOST NAME REGEXP '.*\.mysite\.com'`.
- `HOST REGEXP 'regexp'` — You can use [pcre](http://www.pcre.org/) regular expressions when specifying user hosts. For example, `HOST REGEXP '.*\.mysite\.com'`.
- `HOST LIKE 'template'` — Allows you to use the [LIKE](../../../sql-reference/functions/string-search-functions.md#function-like) operator to filter the user hosts. For example, `HOST LIKE '%'` is equivalent to `HOST ANY`, `HOST LIKE '%.mysite.com'` filters all the hosts in the `mysite.com` domain.

Another way of specifying host is to use `@` syntax following the username. Examples:
@@ -43,7 +43,7 @@ CREATE USER [IF NOT EXISTS | OR REPLACE] name1 [ON CLUSTER cluster_name1]
- `HOST ANY` — The user can connect from any host. This is the default.
- `HOST LOCAL` — The user can connect only locally.
- `HOST NAME 'fqdn'` — The host is specified as an FQDN. For example, `HOST NAME 'mysite.com'`.
- `HOST NAME REGEXP 'regexp'` — Allows using [pcre](http://www.pcre.org/) regular expressions to specify hosts. For example, `HOST NAME REGEXP '.*\.mysite\.com'`.
- `HOST REGEXP 'regexp'` — Allows using [pcre](http://www.pcre.org/) regular expressions to specify hosts. For example, `HOST REGEXP '.*\.mysite\.com'`.
- `HOST LIKE 'template'` — Allows using the [LIKE](../../functions/string-search-functions.md#function-like) operator to filter hosts. For example, `HOST LIKE '%'` is equivalent to `HOST ANY`; `HOST LIKE '%.mysite.com'` allows connections from all hosts in the `mysite.com` domain.

You can also use `@` together with the username to specify the host. Examples:
@@ -62,7 +62,7 @@ def build_for_lang(lang, args):
            strict=True,
            theme=theme_cfg,
            nav=blog_nav,
            copyright='©2016–2021 ClickHouse, Inc.',
            copyright='©2016–2022 ClickHouse, Inc.',
            use_directory_urls=True,
            repo_name='ClickHouse/ClickHouse',
            repo_url='https://github.com/ClickHouse/ClickHouse/',
@@ -97,10 +97,6 @@ def build_for_lang(lang, args):
        with open(os.path.join(args.blog_output_dir, lang, 'rss.xml'), 'w') as f:
            f.write(rss_template.render({'config': raw_config}))

        # TODO: AMP for blog
        # if not args.skip_amp:
        #     amp.build_amp(lang, args, cfg)

        logging.info(f'Finished building {lang} blog')

    except exceptions.ConfigurationError as e:
@@ -1 +0,0 @@
../../../en/faq/general/index.md

docs/zh/faq/general/index.md (new file): 27 lines

@@ -0,0 +1,27 @@
---
title: General questions about ClickHouse
toc_hidden_folder: true
toc_priority: 1
toc_title: General
---

# General questions about ClickHouse {#general-questions}

Questions:

- [What is ClickHouse?](../../index.md#what-is-clickhouse)
- [Why is ClickHouse so fast?](../../faq/general/why-clickhouse-is-so-fast.md)
- [Who is using ClickHouse?](../../faq/general/who-is-using-clickhouse.md)
- [What does "ClickHouse" mean?](../../faq/general/dbms-naming.md)
- [What does "Не тормозит" mean?](../../faq/general/ne-tormozit.md)
- [What is OLAP?](../../faq/general/olap.md)
- [What is a columnar database?](../../faq/general/columnar-database.md)
- [Why not use something like MapReduce?](../../faq/general/mapreduce.md)
- [How do I contribute code to ClickHouse?](../../faq/general/how-do-i-contribute-code-to-clickhouse.md)

!!! info "Don't see what you're looking for?"
    Check [other F.A.Q. categories](../../faq/index.md) or browse the rest of the documentation in the left sidebar.

{## [Original article](https://clickhouse.com/docs/en/faq/general/) ##}
@@ -1 +0,0 @@
../../../en/faq/general/mapreduce.md

docs/zh/faq/general/mapreduce.md (new file): 13 lines

@@ -0,0 +1,13 @@
---
title: Why not use something like MapReduce?
toc_hidden: true
toc_priority: 110
---

# Why not use something like MapReduce? {#why-not-use-something-like-mapreduce}

Systems like MapReduce can be described as distributed computing systems in which the reduce operation is based on distributed sorting. The most common open-source solution in this class is [Apache Hadoop](http://hadoop.apache.org). Yandex uses its in-house solution, YT.

These systems are not suitable for online queries because of their high latency; in other words, they cannot be used as the backend for a web interface, and they are not very useful for real-time data updates. Distributed sorting is not the best way to perform a reduce operation when the result of the operation and all intermediate results (if any) fit in the memory of a single server, which is usually the case for online queries; in that case, a hash table is the best way to perform the reduce. A common way to optimize map-reduce jobs is pre-aggregation (a partial reduce) using an in-memory hash table, an optimization the user performs manually. Distributed sorting is one of the main causes of degraded performance when running simple map-reduce jobs.

Most MapReduce implementations let you run arbitrary code on the cluster, but a declarative query language is better suited to OLAP for running experiments quickly. For example, Hadoop has Hive and Pig; there are also Cloudera Impala and Shark (now outdated) for Spark, as well as Spark SQL, Presto, and Apache Drill. The performance of such tasks is highly suboptimal compared to specialized systems, and their relatively high latency makes it unrealistic to use them as the backend for a web interface.
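As an editorial aside, the following is a minimal C++ sketch of the in-memory pre-aggregation (partial reduce) that the paragraph above describes; the event names and counting logic are purely illustrative and are not part of the ClickHouse sources being diffed here.

// Partial reduce with an in-memory hash table: counts are pre-aggregated
// per key locally, instead of relying on a distributed sort to group keys.
#include <cstdio>
#include <string>
#include <unordered_map>
#include <vector>

int main()
{
    std::vector<std::string> events = {"click", "view", "click", "click", "view"};

    // Aggregate locally into a hash map (the "partial reduce" step).
    std::unordered_map<std::string, size_t> counts;
    for (const auto & event : events)
        ++counts[event];

    // In a real map-reduce job, only these partial sums would be shuffled/merged.
    for (const auto & [key, count] : counts)
        std::printf("%s: %zu\n", key.c_str(), count);
    return 0;
}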
@@ -19,6 +19,7 @@ toc_priority: 76
    - [What is OLAP?](../faq/general/olap.md)
    - [What is a columnar database?](../faq/general/columnar-database.md)
    - [Why not use something like MapReduce?](../faq/general/mapreduce.md)
    - [How do I contribute code to ClickHouse?](../faq/general/how-do-i-contribute-code-to-clickhouse.md)
- **[Use cases](../faq/use-cases/index.md)**
    - [Can I use ClickHouse as a time-series database?](../faq/use-cases/time-series.md)
    - [Can I use ClickHouse as a key-value storage?](../faq/use-cases/key-value.md)
@@ -364,7 +364,9 @@ int mainEntryClickHouseInstall(int argc, char ** argv)
        "clickhouse-git-import",
        "clickhouse-compressor",
        "clickhouse-format",
        "clickhouse-extract-from-config"
        "clickhouse-extract-from-config",
        "clickhouse-keeper",
        "clickhouse-keeper-converter",
    };

    for (const auto & tool : tools)
@@ -327,6 +327,7 @@ std::string LocalServer::getInitialCreateTableQuery()
    {
        /// Use Unix tools stdin naming convention
        table_file = "stdin";
        format_from_file_name = FormatFactory::instance().getFormatFromFileDescriptor(STDIN_FILENO);
    }
    else
    {
@@ -17,15 +17,11 @@ class AggregateFunctionSimpleState final : public IAggregateFunctionHelper<Aggre
{
private:
    AggregateFunctionPtr nested_func;
    DataTypes arguments;
    Array params;

public:
    AggregateFunctionSimpleState(AggregateFunctionPtr nested_, const DataTypes & arguments_, const Array & params_)
        : IAggregateFunctionHelper<AggregateFunctionSimpleState>(arguments_, params_)
        , nested_func(nested_)
        , arguments(arguments_)
        , params(params_)
    {
    }

@@ -35,18 +31,19 @@ public:
    {
        DataTypeCustomSimpleAggregateFunction::checkSupportedFunctions(nested_func);

        // Need to make a clone because it'll be customized.
        auto storage_type = DataTypeFactory::instance().get(nested_func->getReturnType()->getName());

        // Need to make a clone to avoid recursive reference.
        auto storage_type_out = DataTypeFactory::instance().get(nested_func->getReturnType()->getName());
        // Need to make a new function with promoted argument types because SimpleAggregates requires arg_type = return_type.
        AggregateFunctionProperties properties;
        auto function
            = AggregateFunctionFactory::instance().get(nested_func->getName(), {storage_type}, nested_func->getParameters(), properties);
            = AggregateFunctionFactory::instance().get(nested_func->getName(), {storage_type_out}, nested_func->getParameters(), properties);

        // Need to make a clone because it'll be customized.
        auto storage_type_arg = DataTypeFactory::instance().get(nested_func->getReturnType()->getName());
        DataTypeCustomNamePtr custom_name
            = std::make_unique<DataTypeCustomSimpleAggregateFunction>(function, DataTypes{nested_func->getReturnType()}, params);
        storage_type->setCustomization(std::make_unique<DataTypeCustomDesc>(std::move(custom_name), nullptr));
        return storage_type;
            = std::make_unique<DataTypeCustomSimpleAggregateFunction>(function, DataTypes{nested_func->getReturnType()}, parameters);
        storage_type_arg->setCustomization(std::make_unique<DataTypeCustomDesc>(std::move(custom_name), nullptr));
        return storage_type_arg;
    }

    bool isVersioned() const override
@@ -20,13 +20,12 @@ class AggregateFunctionState final : public IAggregateFunctionHelper<AggregateFu
{
private:
    AggregateFunctionPtr nested_func;
    DataTypes arguments;
    Array params;

public:
    AggregateFunctionState(AggregateFunctionPtr nested_, const DataTypes & arguments_, const Array & params_)
        : IAggregateFunctionHelper<AggregateFunctionState>(arguments_, params_)
        , nested_func(nested_), arguments(arguments_), params(params_) {}
        , nested_func(nested_)
    {}

    String getName() const override
    {
@@ -506,6 +506,7 @@ if (ENABLE_NLP)
    dbms_target_link_libraries (PUBLIC ch_contrib::stemmer)
    dbms_target_link_libraries (PUBLIC ch_contrib::wnb)
    dbms_target_link_libraries (PUBLIC ch_contrib::lemmagen)
    dbms_target_link_libraries (PUBLIC ch_contrib::nlp_data)
endif()

if (TARGET ch_contrib::bzip2)
@@ -558,3 +559,4 @@ if (ENABLE_TESTS)

    add_check(unit_tests_dbms)
endif ()
@@ -48,6 +48,7 @@
#include <Parsers/ASTQueryWithOutput.h>
#include <Parsers/ASTLiteral.h>
#include <Parsers/ASTIdentifier.h>
#include <Parsers/ASTColumnDeclaration.h>

#include <Processors/Formats/Impl/NullFormat.h>
#include <Processors/Formats/IInputFormat.h>
@@ -552,6 +553,25 @@ void ClientBase::initLogsOutputStream()
    }
}

void ClientBase::updateSuggest(const ASTCreateQuery & ast_create)
{
    std::vector<std::string> new_words;

    if (ast_create.database)
        new_words.push_back(ast_create.getDatabase());
    new_words.push_back(ast_create.getTable());

    if (ast_create.columns_list && ast_create.columns_list->columns)
    {
        for (const auto & elem : ast_create.columns_list->columns->children)
        {
            if (const auto * column = elem->as<ASTColumnDeclaration>())
                new_words.push_back(column->name);
        }
    }

    suggest->addWords(std::move(new_words));
}

void ClientBase::processTextAsSingleQuery(const String & full_query)
{
@@ -565,6 +585,18 @@ void ClientBase::processTextAsSingleQuery(const String & full_query)

    String query_to_execute;

    /// Query will be parsed before checking the result because error does not
    /// always means a problem, i.e. if table already exists, and it is no a
    /// huge problem if suggestion will be added even on error, since this is
    /// just suggestion.
    if (auto * create = parsed_query->as<ASTCreateQuery>())
    {
        /// Do not update suggest, until suggestion will be ready
        /// (this will avoid extra complexity)
        if (suggest)
            updateSuggest(*create);
    }

    // An INSERT query may have the data that follow query text. Remove the
    /// Send part of query without data, because data will be sent separately.
    auto * insert = parsed_query->as<ASTInsertQuery>();
@@ -1463,7 +1495,6 @@ void ClientBase::runInteractive()
    /// Initialize DateLUT here to avoid counting time spent here as query execution time.
    const auto local_tz = DateLUT::instance().getTimeZone();

    std::optional<Suggest> suggest;
    suggest.emplace();
    if (load_suggestions)
    {
@@ -136,6 +136,8 @@ private:
    void readArguments(int argc, char ** argv, Arguments & common_arguments, std::vector<Arguments> & external_tables_arguments);
    void parseAndCheckOptions(OptionsDescription & options_description, po::variables_map & options, Arguments & arguments);

    void updateSuggest(const ASTCreateQuery & ast_create);

protected:
    bool is_interactive = false; /// Use either interactive line editing interface or batch mode.
    bool is_multiquery = false;
@@ -144,6 +146,8 @@ protected:
    bool echo_queries = false; /// Print queries before execution in batch mode.
    bool ignore_error = false; /// In case of errors, don't print error message, continue to next query. Only applicable for non-interactive mode.
    bool print_time_to_stderr = false; /// Output execution time to stderr in batch mode.

    std::optional<Suggest> suggest;
    bool load_suggestions = false;

    std::vector<String> queries_files; /// If not empty, queries will be read from these files
@@ -29,19 +29,21 @@ namespace ErrorCodes
Suggest::Suggest()
{
    /// Keywords may be not up to date with ClickHouse parser.
    words = {"CREATE", "DATABASE", "IF", "NOT", "EXISTS", "TEMPORARY", "TABLE", "ON", "CLUSTER", "DEFAULT",
        "MATERIALIZED", "ALIAS", "ENGINE", "AS", "VIEW", "POPULATE", "SETTINGS", "ATTACH", "DETACH", "DROP",
        "RENAME", "TO", "ALTER", "ADD", "MODIFY", "CLEAR", "COLUMN", "AFTER", "COPY", "PROJECT",
        "PRIMARY", "KEY", "CHECK", "PARTITION", "PART", "FREEZE", "FETCH", "FROM", "SHOW", "INTO",
        "OUTFILE", "FORMAT", "TABLES", "DATABASES", "LIKE", "PROCESSLIST", "CASE", "WHEN", "THEN", "ELSE",
        "END", "DESCRIBE", "DESC", "USE", "SET", "OPTIMIZE", "FINAL", "DEDUPLICATE", "INSERT", "VALUES",
        "SELECT", "DISTINCT", "SAMPLE", "ARRAY", "JOIN", "GLOBAL", "LOCAL", "ANY", "ALL", "INNER",
        "LEFT", "RIGHT", "FULL", "OUTER", "CROSS", "USING", "PREWHERE", "WHERE", "GROUP", "BY",
        "WITH", "TOTALS", "HAVING", "ORDER", "COLLATE", "LIMIT", "UNION", "AND", "OR", "ASC",
        "IN", "KILL", "QUERY", "SYNC", "ASYNC", "TEST", "BETWEEN", "TRUNCATE", "USER", "ROLE",
        "PROFILE", "QUOTA", "POLICY", "ROW", "GRANT", "REVOKE", "OPTION", "ADMIN", "EXCEPT", "REPLACE",
        "IDENTIFIED", "HOST", "NAME", "READONLY", "WRITABLE", "PERMISSIVE", "FOR", "RESTRICTIVE", "RANDOMIZED",
        "INTERVAL", "LIMITS", "ONLY", "TRACKING", "IP", "REGEXP", "ILIKE"};
    addWords({
        "CREATE", "DATABASE", "IF", "NOT", "EXISTS", "TEMPORARY", "TABLE", "ON", "CLUSTER", "DEFAULT",
        "MATERIALIZED", "ALIAS", "ENGINE", "AS", "VIEW", "POPULATE", "SETTINGS", "ATTACH", "DETACH", "DROP",
        "RENAME", "TO", "ALTER", "ADD", "MODIFY", "CLEAR", "COLUMN", "AFTER", "COPY", "PROJECT",
        "PRIMARY", "KEY", "CHECK", "PARTITION", "PART", "FREEZE", "FETCH", "FROM", "SHOW", "INTO",
        "OUTFILE", "FORMAT", "TABLES", "DATABASES", "LIKE", "PROCESSLIST", "CASE", "WHEN", "THEN", "ELSE",
        "END", "DESCRIBE", "DESC", "USE", "SET", "OPTIMIZE", "FINAL", "DEDUPLICATE", "INSERT", "VALUES",
        "SELECT", "DISTINCT", "SAMPLE", "ARRAY", "JOIN", "GLOBAL", "LOCAL", "ANY", "ALL", "INNER",
        "LEFT", "RIGHT", "FULL", "OUTER", "CROSS", "USING", "PREWHERE", "WHERE", "GROUP", "BY",
        "WITH", "TOTALS", "HAVING", "ORDER", "COLLATE", "LIMIT", "UNION", "AND", "OR", "ASC",
        "IN", "KILL", "QUERY", "SYNC", "ASYNC", "TEST", "BETWEEN", "TRUNCATE", "USER", "ROLE",
        "PROFILE", "QUOTA", "POLICY", "ROW", "GRANT", "REVOKE", "OPTION", "ADMIN", "EXCEPT", "REPLACE",
        "IDENTIFIED", "HOST", "NAME", "READONLY", "WRITABLE", "PERMISSIVE", "FOR", "RESTRICTIVE", "RANDOMIZED",
        "INTERVAL", "LIMITS", "ONLY", "TRACKING", "IP", "REGEXP", "ILIKE",
    });
}

static String getLoadSuggestionQuery(Int32 suggestion_limit, bool basic_suggestion)
@@ -124,18 +126,6 @@ void Suggest::load(ContextPtr context, const ConnectionParameters & connection_p
        }

        /// Note that keyword suggestions are available even if we cannot load data from server.

        std::sort(words.begin(), words.end());
        words_no_case = words;
        std::sort(words_no_case.begin(), words_no_case.end(), [](const std::string & str1, const std::string & str2)
        {
            return std::lexicographical_compare(begin(str1), end(str1), begin(str2), end(str2), [](const char char1, const char char2)
            {
                return std::tolower(char1) < std::tolower(char2);
            });
        });

        ready = true;
    });
}

@@ -190,8 +180,14 @@ void Suggest::fillWordsFromBlock(const Block & block)
    const ColumnString & column = typeid_cast<const ColumnString &>(*block.getByPosition(0).column);

    size_t rows = block.rows();

    Words new_words;
    new_words.reserve(rows);
    for (size_t i = 0; i < rows; ++i)
        words.emplace_back(column.getDataAt(i).toString());
    {
        new_words.emplace_back(column.getDataAt(i).toString());
    }
    addWords(std::move(new_words));
}

template
src/Common/FrequencyHolder.h (new file): 252 lines

@@ -0,0 +1,252 @@
#pragma once

#include <Common/Arena.h>
#include <Common/getResource.h>
#include <Common/HashTable/HashMap.h>
#include <Common/StringUtils/StringUtils.h>
#include <IO/ReadBufferFromFile.h>
#include <IO/ReadBufferFromString.h>
#include <IO/ReadHelpers.h>
#include <IO/readFloatText.h>
#include <IO/ZstdInflatingReadBuffer.h>

#include <base/StringRef.h>
#include <base/logger_useful.h>

#include <string_view>
#include <unordered_map>

namespace DB
{

namespace ErrorCodes
{
    extern const int FILE_DOESNT_EXIST;
}

/// FrequencyHolder class is responsible for storing and loading dictionaries
/// needed for text classification functions:
///
/// 1. detectLanguageUnknown
/// 2. detectCharset
/// 3. detectTonality
/// 4. detectProgrammingLanguage

class FrequencyHolder
{

public:
    struct Language
    {
        String name;
        HashMap<StringRef, Float64> map;
    };

    struct Encoding
    {
        String name;
        String lang;
        HashMap<UInt16, Float64> map;
    };

public:
    using Map = HashMap<StringRef, Float64>;
    using Container = std::vector<Language>;
    using EncodingMap = HashMap<UInt16, Float64>;
    using EncodingContainer = std::vector<Encoding>;

    static FrequencyHolder & getInstance()
    {
        static FrequencyHolder instance;
        return instance;
    }

    void loadEncodingsFrequency()
    {
        Poco::Logger * log = &Poco::Logger::get("EncodingsFrequency");

        LOG_TRACE(log, "Loading embedded charset frequencies");

        auto resource = getResource("charset.zst");
        if (resource.empty())
            throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "There is no embedded charset frequencies");

        String line;
        UInt16 bigram;
        Float64 frequency;
        String charset_name;

        auto buf = std::make_unique<ReadBufferFromMemory>(resource.data(), resource.size());
        ZstdInflatingReadBuffer in(std::move(buf));

        while (!in.eof())
        {
            readString(line, in);
            in.ignore();

            if (line.empty())
                continue;

            ReadBufferFromString buf_line(line);

            // Start loading a new charset
            if (line.starts_with("// "))
            {
                // Skip "// "
                buf_line.ignore(3);
                readString(charset_name, buf_line);

                /* In our dictionary we have lines with form: <Language>_<Charset>
                 * If we need to find language of data, we return <Language>
                 * If we need to find charset of data, we return <Charset>.
                 */
                size_t sep = charset_name.find('_');

                Encoding enc;
                enc.lang = charset_name.substr(0, sep);
                enc.name = charset_name.substr(sep + 1);
                encodings_freq.push_back(std::move(enc));
            }
            else
            {
                readIntText(bigram, buf_line);
                buf_line.ignore();
                readFloatText(frequency, buf_line);

                encodings_freq.back().map[bigram] = frequency;
            }
        }
        LOG_TRACE(log, "Charset frequencies was added, charsets count: {}", encodings_freq.size());
    }


    void loadEmotionalDict()
    {
        Poco::Logger * log = &Poco::Logger::get("EmotionalDict");
        LOG_TRACE(log, "Loading embedded emotional dictionary");

        auto resource = getResource("tonality_ru.zst");
        if (resource.empty())
            throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "There is no embedded emotional dictionary");

        String line;
        String word;
        Float64 tonality;
        size_t count = 0;

        auto buf = std::make_unique<ReadBufferFromMemory>(resource.data(), resource.size());
        ZstdInflatingReadBuffer in(std::move(buf));

        while (!in.eof())
        {
            readString(line, in);
            in.ignore();

            if (line.empty())
                continue;

            ReadBufferFromString buf_line(line);

            readStringUntilWhitespace(word, buf_line);
            buf_line.ignore();
            readFloatText(tonality, buf_line);

            StringRef ref{string_pool.insert(word.data(), word.size()), word.size()};
            emotional_dict[ref] = tonality;
            ++count;
        }
        LOG_TRACE(log, "Emotional dictionary was added. Word count: {}", std::to_string(count));
    }


    void loadProgrammingFrequency()
    {
        Poco::Logger * log = &Poco::Logger::get("ProgrammingFrequency");

        LOG_TRACE(log, "Loading embedded programming languages frequencies loading");

        auto resource = getResource("programming.zst");
        if (resource.empty())
            throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "There is no embedded programming languages frequencies");

        String line;
        String bigram;
        Float64 frequency;
        String programming_language;

        auto buf = std::make_unique<ReadBufferFromMemory>(resource.data(), resource.size());
        ZstdInflatingReadBuffer in(std::move(buf));

        while (!in.eof())
        {
            readString(line, in);
            in.ignore();

            if (line.empty())
                continue;

            ReadBufferFromString buf_line(line);

            // Start loading a new language
            if (line.starts_with("// "))
            {
                // Skip "// "
                buf_line.ignore(3);
                readString(programming_language, buf_line);

                Language lang;
                lang.name = programming_language;
                programming_freq.push_back(std::move(lang));
            }
            else
            {
                readStringUntilWhitespace(bigram, buf_line);
                buf_line.ignore();
                readFloatText(frequency, buf_line);

                StringRef ref{string_pool.insert(bigram.data(), bigram.size()), bigram.size()};
                programming_freq.back().map[ref] = frequency;
            }
        }
        LOG_TRACE(log, "Programming languages frequencies was added");
    }

    const Map & getEmotionalDict()
    {
        std::lock_guard lock(mutex);
        if (emotional_dict.empty())
            loadEmotionalDict();

        return emotional_dict;
    }


    const EncodingContainer & getEncodingsFrequency()
    {
        std::lock_guard lock(mutex);
        if (encodings_freq.empty())
            loadEncodingsFrequency();

        return encodings_freq;
    }

    const Container & getProgrammingFrequency()
    {
        std::lock_guard lock(mutex);
        if (programming_freq.empty())
            loadProgrammingFrequency();

        return programming_freq;
    }


private:
    Arena string_pool;

    Map emotional_dict;
    Container programming_freq;
    EncodingContainer encodings_freq;

    std::mutex mutex;
};
}
@@ -291,6 +291,15 @@ public:

    size_t getIntervalsSize() const { return intervals_size; }

    size_t getSizeInBytes() const
    {
        size_t nodes_size_in_bytes = nodes.size() * sizeof(Node);
        size_t intervals_size_in_bytes = sorted_intervals.size() * sizeof(IntervalWithValue);
        size_t result = nodes_size_in_bytes + intervals_size_in_bytes;

        return result;
    }

private:
    struct Node
    {
@@ -1,5 +1,6 @@
#pragma once

#include <cstdint>
#include <Common/VariableContext.h>

/// To be able to avoid MEMORY_LIMIT_EXCEEDED Exception in destructors:

@@ -1,5 +1,6 @@
#pragma once

#include <cstdint>
#include <Common/VariableContext.h>

/// To be able to temporarily stop memory tracking from current thread.
@@ -24,7 +24,6 @@ namespace DB

namespace ErrorCodes
{
    extern const int UNSUPPORTED_PARAMETER;
    extern const int BAD_ARGUMENTS;
}

@@ -34,9 +33,12 @@ namespace ErrorCodes
  */

struct StringSearcherBase
class StringSearcherBase
{
public:
    bool force_fallback = false;
#ifdef __SSE2__
protected:
    static constexpr auto n = sizeof(__m128i);
    const int page_size = ::getPageSize();

@@ -53,7 +55,7 @@ template <bool CaseSensitive, bool ASCII> class StringSearcher;

/// Case-insensitive UTF-8 searcher
template <>
class StringSearcher<false, false> : private StringSearcherBase
class StringSearcher<false, false> : public StringSearcherBase
{
private:
    using UTF8SequenceBuffer = uint8_t[6];
@@ -119,11 +121,14 @@ public:
            size_t length_u = UTF8::convertCodePointToUTF8(first_u_u32, u_seq, sizeof(u_seq));

            if (length_l != length_u)
                throw Exception{"UTF8 sequences with different lowercase and uppercase lengths are not supported", ErrorCodes::UNSUPPORTED_PARAMETER};
                force_fallback = true;
        }

        l = l_seq[0];
        u = u_seq[0];

        if (force_fallback)
            return;
    }

#ifdef __SSE4_1__
@@ -158,7 +163,10 @@ public:

                /// @note Unicode standard states it is a rare but possible occasion
                if (!(dst_l_len == dst_u_len && dst_u_len == src_len))
                    throw Exception{"UTF8 sequences with different lowercase and uppercase lengths are not supported", ErrorCodes::UNSUPPORTED_PARAMETER};
                {
                    force_fallback = true;
                    return;
                }
            }

            cache_actual_len += src_len;
@@ -199,9 +207,10 @@ public:
            if (Poco::Unicode::toLower(*haystack_code_point) != Poco::Unicode::toLower(*needle_code_point))
                break;

            /// @note assuming sequences for lowercase and uppercase have exact same length (that is not always true)
            const auto len = UTF8::seqLength(*haystack_pos);
            auto len = UTF8::seqLength(*haystack_pos);
            haystack_pos += len;

            len = UTF8::seqLength(*needle_pos);
            needle_pos += len;
        }

@@ -213,7 +222,7 @@ public:
    {

#ifdef __SSE4_1__
        if (pageSafe(pos))
        if (pageSafe(pos) && !force_fallback)
        {
            const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pos));
            const auto v_against_l = _mm_cmpeq_epi8(v_haystack, cachel);
@@ -262,7 +271,7 @@ public:
        while (haystack < haystack_end)
        {
#ifdef __SSE4_1__
            if (haystack + n <= haystack_end && pageSafe(haystack))
            if (haystack + n <= haystack_end && pageSafe(haystack) && !force_fallback)
            {
                const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(haystack));
                const auto v_against_l = _mm_cmpeq_epi8(v_haystack, patl);
@@ -339,7 +348,7 @@ public:

/// Case-insensitive ASCII searcher
template <>
class StringSearcher<false, true> : private StringSearcherBase
class StringSearcher<false, true> : public StringSearcherBase
{
private:
    /// string to be searched for
@@ -541,7 +550,7 @@ public:

/// Case-sensitive searcher (both ASCII and UTF-8)
template <bool ASCII>
class StringSearcher<true, ASCII> : private StringSearcherBase
class StringSearcher<true, ASCII> : public StringSearcherBase
{
private:
    /// string to be searched for
@@ -725,7 +734,7 @@ public:
// Any value outside of basic ASCII (>=128) is considered a non-separator symbol, hence UTF-8 strings
// should work just fine. But any Unicode whitespace is not considered a token separtor.
template <typename StringSearcher>
class TokenSearcher
class TokenSearcher : public StringSearcherBase
{
    StringSearcher searcher;
    size_t needle_size;
@@ -809,7 +818,7 @@ using ASCIICaseInsensitiveTokenSearcher = TokenSearcher<ASCIICaseInsensitiveStri
  * It is required that strings are zero-terminated.
  */

struct LibCASCIICaseSensitiveStringSearcher
struct LibCASCIICaseSensitiveStringSearcher : public StringSearcherBase
{
    const char * const needle;

@@ -833,7 +842,7 @@ struct LibCASCIICaseSensitiveStringSearcher
    }
};

struct LibCASCIICaseInsensitiveStringSearcher
struct LibCASCIICaseInsensitiveStringSearcher : public StringSearcherBase
{
    const char * const needle;
src/Common/SystemLogBase.cpp (new file): 177 lines

@@ -0,0 +1,177 @@
#include <Interpreters/AsynchronousMetricLog.h>
#include <Interpreters/CrashLog.h>
#include <Interpreters/MetricLog.h>
#include <Interpreters/OpenTelemetrySpanLog.h>
#include <Interpreters/PartLog.h>
#include <Interpreters/QueryLog.h>
#include <Interpreters/QueryThreadLog.h>
#include <Interpreters/QueryViewsLog.h>
#include <Interpreters/SessionLog.h>
#include <Interpreters/TextLog.h>
#include <Interpreters/TraceLog.h>
#include <Interpreters/ZooKeeperLog.h>

#include <Common/MemoryTrackerBlockerInThread.h>
#include <Common/SystemLogBase.h>

#include <base/logger_useful.h>
#include <base/scope_guard.h>

namespace DB
{

namespace ErrorCodes
{
    extern const int TIMEOUT_EXCEEDED;
}

namespace
{
    constexpr size_t DBMS_SYSTEM_LOG_QUEUE_SIZE = 1048576;
}

void ISystemLog::stopFlushThread()
{
    {
        std::lock_guard lock(mutex);

        if (!saving_thread.joinable())
        {
            return;
        }

        if (is_shutdown)
        {
            return;
        }

        is_shutdown = true;

        /// Tell thread to shutdown.
        flush_event.notify_all();
    }

    saving_thread.join();
}

void ISystemLog::startup()
{
    std::lock_guard lock(mutex);
    saving_thread = ThreadFromGlobalPool([this] { savingThreadFunction(); });
}

static thread_local bool recursive_add_call = false;

template <typename LogElement>
void SystemLogBase<LogElement>::add(const LogElement & element)
{
    /// It is possible that the method will be called recursively.
    /// Better to drop these events to avoid complications.
    if (recursive_add_call)
        return;
    recursive_add_call = true;
    SCOPE_EXIT({ recursive_add_call = false; });

    /// Memory can be allocated while resizing on queue.push_back.
    /// The size of allocation can be in order of a few megabytes.
    /// But this should not be accounted for query memory usage.
    /// Otherwise the tests like 01017_uniqCombined_memory_usage.sql will be flacky.
    MemoryTrackerBlockerInThread temporarily_disable_memory_tracker(VariableContext::Global);

    /// Should not log messages under mutex.
    bool queue_is_half_full = false;

    {
        std::unique_lock lock(mutex);

        if (is_shutdown)
            return;

        if (queue.size() == DBMS_SYSTEM_LOG_QUEUE_SIZE / 2)
        {
            queue_is_half_full = true;

            // The queue more than half full, time to flush.
            // We only check for strict equality, because messages are added one
            // by one, under exclusive lock, so we will see each message count.
            // It is enough to only wake the flushing thread once, after the message
            // count increases past half available size.
            const uint64_t queue_end = queue_front_index + queue.size();
            if (requested_flush_up_to < queue_end)
                requested_flush_up_to = queue_end;

            flush_event.notify_all();
        }

        if (queue.size() >= DBMS_SYSTEM_LOG_QUEUE_SIZE)
        {
            // Ignore all further entries until the queue is flushed.
            // Log a message about that. Don't spam it -- this might be especially
            // problematic in case of trace log. Remember what the front index of the
            // queue was when we last logged the message. If it changed, it means the
            // queue was flushed, and we can log again.
            if (queue_front_index != logged_queue_full_at_index)
            {
                logged_queue_full_at_index = queue_front_index;

                // TextLog sets its logger level to 0, so this log is a noop and
                // there is no recursive logging.
                lock.unlock();
                LOG_ERROR(log, "Queue is full for system log '{}' at {}", demangle(typeid(*this).name()), queue_front_index);
            }

            return;
        }

        queue.push_back(element);
    }

    if (queue_is_half_full)
        LOG_INFO(log, "Queue is half full for system log '{}'.", demangle(typeid(*this).name()));
}

template <typename LogElement>
void SystemLogBase<LogElement>::flush(bool force)
{
    uint64_t this_thread_requested_offset;

    {
        std::unique_lock lock(mutex);

        if (is_shutdown)
            return;

        this_thread_requested_offset = queue_front_index + queue.size();

        // Publish our flush request, taking care not to overwrite the requests
        // made by other threads.
        is_force_prepare_tables |= force;
        requested_flush_up_to = std::max(requested_flush_up_to, this_thread_requested_offset);

        flush_event.notify_all();
    }

    LOG_DEBUG(log, "Requested flush up to offset {}", this_thread_requested_offset);

    // Use an arbitrary timeout to avoid endless waiting. 60s proved to be
    // too fast for our parallel functional tests, probably because they
    // heavily load the disk.
    const int timeout_seconds = 180;
    std::unique_lock lock(mutex);
    bool result = flush_event.wait_for(lock, std::chrono::seconds(timeout_seconds), [&]
    {
        return flushed_up_to >= this_thread_requested_offset && !is_force_prepare_tables;
    });

    if (!result)
    {
        throw Exception(
            "Timeout exceeded (" + toString(timeout_seconds) + " s) while flushing system log '" + demangle(typeid(*this).name()) + "'.",
            ErrorCodes::TIMEOUT_EXCEEDED);
    }
}

#define INSTANTIATE_SYSTEM_LOG_BASE(ELEMENT) template class SystemLogBase<ELEMENT>;
SYSTEM_LOG_ELEMENTS(INSTANTIATE_SYSTEM_LOG_BASE)

}
src/Common/SystemLogBase.h (new file): 109 lines

@@ -0,0 +1,109 @@
#pragma once

#include <atomic>
#include <condition_variable>
#include <memory>
#include <thread>
#include <vector>
#include <base/types.h>

#include <Interpreters/Context_fwd.h>
#include <Parsers/IAST_fwd.h>
#include <Storages/IStorage_fwd.h>
#include <Common/ThreadPool.h>

#define SYSTEM_LOG_ELEMENTS(M) \
    M(AsynchronousMetricLogElement) \
    M(CrashLogElement) \
    M(MetricLogElement) \
    M(OpenTelemetrySpanLogElement) \
    M(PartLogElement) \
    M(QueryLogElement) \
    M(QueryThreadLogElement) \
    M(QueryViewsLogElement) \
    M(SessionLogElement) \
    M(TraceLogElement) \
    M(ZooKeeperLogElement) \
    M(TextLogElement)

namespace Poco
{
class Logger;
namespace Util
{
class AbstractConfiguration;
}
}

namespace DB
{

struct StorageID;

class ISystemLog
{
public:
    virtual String getName() = 0;
    //// force -- force table creation (used for SYSTEM FLUSH LOGS)
    virtual void flush(bool force = false) = 0;
    virtual void prepareTable() = 0;

    /// Start the background thread.
    virtual void startup();

    /// Stop the background flush thread before destructor. No more data will be written.
    virtual void shutdown() = 0;

    virtual ~ISystemLog() = default;

    virtual void savingThreadFunction() = 0;

protected:
    ThreadFromGlobalPool saving_thread;

    /// Data shared between callers of add()/flush()/shutdown(), and the saving thread
    std::mutex mutex;

    bool is_shutdown = false;
    std::condition_variable flush_event;

    void stopFlushThread();
};

template <typename LogElement>
class SystemLogBase : public ISystemLog
{
public:
    using Self = SystemLogBase;

    /** Append a record into log.
      * Writing to table will be done asynchronously and in case of failure, record could be lost.
      */
    void add(const LogElement & element);

    /// Flush data in the buffer to disk
    void flush(bool force) override;

    String getName() override { return LogElement::name(); }

protected:
    Poco::Logger * log;

    // Queue is bounded. But its size is quite large to not block in all normal cases.
    std::vector<LogElement> queue;
    // An always-incrementing index of the first message currently in the queue.
    // We use it to give a global sequential index to every message, so that we
    // can wait until a particular message is flushed. This is used to implement
    // synchronous log flushing for SYSTEM FLUSH LOGS.
    uint64_t queue_front_index = 0;
    // A flag that says we must create the tables even if the queue is empty.
    bool is_force_prepare_tables = false;
    // Requested to flush logs up to this index, exclusive
    uint64_t requested_flush_up_to = 0;
    // Flushed log up to this index, exclusive
    uint64_t flushed_up_to = 0;
    // Logged overflow message at this queue front index
    uint64_t logged_queue_full_at_index = -1;
};

}
@@ -372,7 +372,7 @@ public:
        , fallback{VolnitskyTraits::isFallbackNeedle(needle_size, haystack_size_hint)}
        , fallback_searcher{needle_, needle_size}
    {
        if (fallback)
        if (fallback || fallback_searcher.force_fallback)
            return;

        hash = std::unique_ptr<VolnitskyTraits::Offset[]>(new VolnitskyTraits::Offset[VolnitskyTraits::hash_size]{});
@@ -393,7 +393,7 @@ public:

        const auto haystack_end = haystack + haystack_size;

        if (fallback || haystack_size <= needle_size)
        if (fallback || haystack_size <= needle_size || fallback_searcher.force_fallback)
            return fallback_searcher.search(haystack, haystack_end);

        /// Let's "apply" the needle to the haystack and compare the n-gram from the end of the needle.
@@ -3,12 +3,7 @@ include("${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake")
add_headers_and_sources(clickhouse_common_zookeeper .)

# for clickhouse server
#
# NOTE: this library depends from Interpreters (DB::SystemLog<DB::ZooKeeperLogElement>::add),
# and so it should be STATIC because otherwise:
# - it will either fail to compile with -Wl,--unresolved-symbols=report-all
# - or it will report errors at runtime
add_library(clickhouse_common_zookeeper STATIC ${clickhouse_common_zookeeper_headers} ${clickhouse_common_zookeeper_sources})
add_library(clickhouse_common_zookeeper ${clickhouse_common_zookeeper_headers} ${clickhouse_common_zookeeper_sources})
target_compile_definitions (clickhouse_common_zookeeper PRIVATE -DZOOKEEPER_LOG)
target_link_libraries (clickhouse_common_zookeeper
    PUBLIC
@ -554,7 +554,7 @@ class IColumn;
|
||||
/** Experimental functions */ \
|
||||
M(Bool, allow_experimental_funnel_functions, false, "Enable experimental functions for funnel analysis.", 0) \
|
||||
M(Bool, allow_experimental_nlp_functions, false, "Enable experimental functions for natural language processing.", 0) \
|
||||
|
||||
M(String, insert_deduplication_token, "", "If not empty, used for duplicate detection instead of data digest", 0) \
|
||||
// End of COMMON_SETTINGS
|
||||
// Please add settings related to formats into the FORMAT_FACTORY_SETTINGS and move obsolete settings to OBSOLETE_SETTINGS.
|
||||
|
||||
|
@ -32,6 +32,7 @@ namespace ErrorCodes
|
||||
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
|
||||
extern const int SIZES_OF_COLUMNS_IN_TUPLE_DOESNT_MATCH;
|
||||
extern const int ILLEGAL_INDEX;
|
||||
extern const int LOGICAL_ERROR;
|
||||
}
|
||||
|
||||
|
||||
@ -156,8 +157,19 @@ MutableColumnPtr DataTypeTuple::createColumn() const
|
||||
|
||||
MutableColumnPtr DataTypeTuple::createColumn(const ISerialization & serialization) const
|
||||
{
|
||||
const auto & element_serializations =
|
||||
assert_cast<const SerializationTuple &>(serialization).getElementsSerializations();
|
||||
/// If we read subcolumn of nested Tuple, it may be wrapped to SerializationNamed
|
||||
/// several times to allow to reconstruct the substream path name.
|
||||
/// Here we don't need substream path name, so we drop first several wrapper serializations.
|
||||
|
||||
const auto * current_serialization = &serialization;
|
||||
while (const auto * serialization_named = typeid_cast<const SerializationNamed *>(current_serialization))
|
||||
current_serialization = serialization_named->getNested().get();
|
||||
|
||||
const auto * serialization_tuple = typeid_cast<const SerializationTuple *>(current_serialization);
|
||||
if (!serialization_tuple)
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected serialization to create column of type Tuple");
|
||||
|
||||
const auto & element_serializations = serialization_tuple->getElementsSerializations();
|
||||
|
||||
size_t size = elems.size();
|
||||
assert(element_serializations.size() == size);
|
||||
|
@ -523,6 +523,7 @@ inline bool isBool(const DataTypePtr & data_type)
|
||||
template <typename DataType> constexpr bool IsDataTypeDecimal = false;
|
||||
template <typename DataType> constexpr bool IsDataTypeNumber = false;
|
||||
template <typename DataType> constexpr bool IsDataTypeDateOrDateTime = false;
|
||||
template <typename DataType> constexpr bool IsDataTypeEnum = false;
|
||||
|
||||
template <typename DataType> constexpr bool IsDataTypeDecimalOrNumber = IsDataTypeDecimal<DataType> || IsDataTypeNumber<DataType>;
|
||||
|
||||
@ -547,4 +548,9 @@ template <> inline constexpr bool IsDataTypeDateOrDateTime<DataTypeDate32> = tru
|
||||
template <> inline constexpr bool IsDataTypeDateOrDateTime<DataTypeDateTime> = true;
|
||||
template <> inline constexpr bool IsDataTypeDateOrDateTime<DataTypeDateTime64> = true;
|
||||
|
||||
template <typename T>
|
||||
class DataTypeEnum;
|
||||
|
||||
template <typename T> inline constexpr bool IsDataTypeEnum<DataTypeEnum<T>> = true;
|
||||
|
||||
}
|
||||
|
@ -5,6 +5,11 @@
|
||||
namespace DB
|
||||
{
|
||||
|
||||
/// Serialization wrapper that acts like nested serialization,
|
||||
/// but adds a passed name to the substream path like the
|
||||
/// read column was the tuple element with this name.
|
||||
/// It's used while reading subcolumns of complex types.
|
||||
/// In particular while reading components of named tuples.
|
||||
class SerializationNamed final : public SerializationWrapper
|
||||
{
|
||||
private:
|
||||
|
@ -60,8 +60,8 @@ private:
|
||||
const auto & attributes_types_to_read = coordinator->getAttributesTypesToRead();
|
||||
const auto & attributes_default_values_columns = coordinator->getAttributesDefaultValuesColumns();
|
||||
|
||||
const auto & dictionary = coordinator->getDictionary();
|
||||
auto attributes_columns = dictionary->getColumns(
|
||||
const auto & read_columns_func = coordinator->getReadColumnsFunc();
|
||||
auto attributes_columns = read_columns_func(
|
||||
attributes_names_to_read,
|
||||
attributes_types_to_read,
|
||||
key_columns,
|
||||
|
@ -19,6 +19,8 @@ class DictionarySourceCoordinator final : public shared_ptr_helper<DictionarySou
|
||||
|
||||
public:
|
||||
|
||||
using ReadColumnsFunc = std::function<Columns (const Strings &, const DataTypes &, const Columns &, const DataTypes &, const Columns &)>;
|
||||
|
||||
Pipe read(size_t num_streams);
|
||||
|
||||
private:
|
||||
@ -31,6 +33,15 @@ private:
|
||||
: dictionary(std::move(dictionary_))
|
||||
, key_columns_with_type(std::move(key_columns_with_type_))
|
||||
, max_block_size(max_block_size_)
|
||||
, read_columns_func([this](
|
||||
const Strings & attribute_names,
|
||||
const DataTypes & result_types,
|
||||
const Columns & key_columns,
|
||||
const DataTypes & key_types,
|
||||
const Columns & default_values_columns)
|
||||
{
|
||||
return dictionary->getColumns(attribute_names, result_types, key_columns, key_types, default_values_columns);
|
||||
})
|
||||
{
|
||||
initialize(column_names);
|
||||
}
|
||||
@ -45,6 +56,31 @@ private:
|
||||
, key_columns_with_type(std::move(key_columns_with_type_))
|
||||
, data_columns_with_type(std::move(data_columns_with_type_))
|
||||
, max_block_size(max_block_size_)
|
||||
, read_columns_func([this](
|
||||
const Strings & attribute_names,
|
||||
const DataTypes & result_types,
|
||||
const Columns & key_columns,
|
||||
const DataTypes & key_types,
|
||||
const Columns & default_values_columns)
|
||||
{
|
||||
return dictionary->getColumns(attribute_names, result_types, key_columns, key_types, default_values_columns);
|
||||
})
|
||||
{
|
||||
initialize(column_names);
|
||||
}
|
||||
|
||||
explicit DictionarySourceCoordinator(
|
||||
std::shared_ptr<const IDictionary> dictionary_,
|
||||
const Names & column_names,
|
||||
ColumnsWithTypeAndName && key_columns_with_type_,
|
||||
ColumnsWithTypeAndName && data_columns_with_type_,
|
||||
size_t max_block_size_,
|
||||
ReadColumnsFunc read_columns_func_)
|
||||
: dictionary(std::move(dictionary_))
|
||||
, key_columns_with_type(std::move(key_columns_with_type_))
|
||||
, data_columns_with_type(std::move(data_columns_with_type_))
|
||||
, max_block_size(max_block_size_)
|
||||
, read_columns_func(std::move(read_columns_func_))
|
||||
{
|
||||
initialize(column_names);
|
||||
}
|
||||
@ -61,6 +97,8 @@ private:
|
||||
|
||||
const std::vector<ColumnPtr> & getAttributesDefaultValuesColumns() const { return attributes_default_values_columns; }
|
||||
|
||||
const ReadColumnsFunc & getReadColumnsFunc() const { return read_columns_func; }
|
||||
|
||||
const std::shared_ptr<const IDictionary> & getDictionary() const { return dictionary; }
|
||||
|
||||
void initialize(const Names & column_names);
|
||||
@ -79,6 +117,8 @@ private:
|
||||
std::vector<ColumnPtr> attributes_default_values_columns;
|
||||
|
||||
const size_t max_block_size;
|
||||
ReadColumnsFunc read_columns_func;
|
||||
|
||||
std::atomic<size_t> parallel_read_block_index = 0;
|
||||
};
|
||||
|
||||
|
@ -382,7 +382,8 @@ std::vector<DictionaryAttribute> DictionaryStructure::getAttributes(
|
||||
|
||||
void DictionaryStructure::parseRangeConfiguration(const Poco::Util::AbstractConfiguration & config, const std::string & structure_prefix)
|
||||
{
|
||||
const char * range_default_type = "Date";
|
||||
static constexpr auto range_default_type = "Date";
|
||||
|
||||
if (config.has(structure_prefix + ".range_min"))
|
||||
range_min.emplace(makeDictionaryTypedSpecialAttribute(config, structure_prefix + ".range_min", range_default_type));
|
||||
|
||||
@ -395,7 +396,10 @@ void DictionaryStructure::parseRangeConfiguration(const Poco::Util::AbstractConf
|
||||
"Dictionary structure should have both 'range_min' and 'range_max' either specified or not.");
|
||||
}
|
||||
|
||||
if (range_min && range_max && !range_min->type->equals(*range_max->type))
|
||||
if (!range_min)
|
||||
return;
|
||||
|
||||
if (!range_min->type->equals(*range_max->type))
|
||||
{
|
||||
throw Exception(ErrorCodes::BAD_ARGUMENTS,
|
||||
"Dictionary structure 'range_min' and 'range_max' should have same type, "
|
||||
@ -405,15 +409,20 @@ void DictionaryStructure::parseRangeConfiguration(const Poco::Util::AbstractConf
|
||||
range_max->type->getName());
|
||||
}
|
||||
|
||||
if (range_min && !range_min->type->isValueRepresentedByInteger())
|
||||
WhichDataType range_type(range_min->type);
|
||||
|
||||
bool valid_range = range_type.isInt() || range_type.isUInt() || range_type.isDecimal() || range_type.isFloat() || range_type.isEnum()
|
||||
|| range_type.isDate() || range_type.isDate32() || range_type.isDateTime() || range_type.isDateTime64();
|
||||
|
||||
if (!valid_range)
|
||||
{
|
||||
throw Exception(ErrorCodes::BAD_ARGUMENTS,
|
||||
"Dictionary structure type of 'range_min' and 'range_max' should be an integer, Date, DateTime, or Enum."
|
||||
"Dictionary structure type of 'range_min' and 'range_max' should be an Integer, Float, Decimal, Date, Date32, DateTime DateTime64, or Enum."
|
||||
" Actual 'range_min' and 'range_max' type is {}",
|
||||
range_min->type->getName());
|
||||
}
|
||||
|
||||
if ((range_min && !range_min->expression.empty()) || (range_max && !range_max->expression.empty()))
|
||||
if (!range_min->expression.empty() || !range_max->expression.empty())
|
||||
has_expressions = true;
|
||||
}
|
||||
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -19,7 +19,18 @@
namespace DB
{

using RangeStorageType = Int64;
enum class RangeHashedDictionaryLookupStrategy : uint8_t
{
min,
max
};

struct RangeHashedDictionaryConfiguration
{
bool convert_null_range_bound_to_open;
RangeHashedDictionaryLookupStrategy lookup_strategy;
bool require_nonempty;
};

template <DictionaryKeyType dictionary_key_type>
class RangeHashedDictionary final : public IDictionary
@ -31,11 +42,17 @@ public:
const StorageID & dict_id_,
const DictionaryStructure & dict_struct_,
DictionarySourcePtr source_ptr_,
const DictionaryLifetime dict_lifetime_,
bool require_nonempty_,
DictionaryLifetime dict_lifetime_,
RangeHashedDictionaryConfiguration configuration_,
BlockPtr update_field_loaded_block_ = nullptr);

std::string getTypeName() const override { return "RangeHashed"; }
std::string getTypeName() const override
{
if constexpr (dictionary_key_type == DictionaryKeyType::Simple)
return "RangeHashed";
else
return "ComplexKeyRangeHashed";
}

size_t getBytesAllocated() const override { return bytes_allocated; }

@ -57,7 +74,15 @@ public:

std::shared_ptr<const IExternalLoadable> clone() const override
{
return std::make_shared<RangeHashedDictionary>(getDictionaryID(), dict_struct, source_ptr->clone(), dict_lifetime, require_nonempty, update_field_loaded_block);
auto result = std::make_shared<RangeHashedDictionary>(
getDictionaryID(),
dict_struct,
source_ptr->clone(),
dict_lifetime,
configuration,
update_field_loaded_block);

return result;
}

DictionarySourcePtr getSource() const override { return source_ptr; }
@ -76,7 +101,7 @@ public:
DictionarySpecialKeyType getSpecialKeyType() const override { return DictionarySpecialKeyType::Range;}

ColumnPtr getColumn(
const std::string& attribute_name,
const std::string & attribute_name,
const DataTypePtr & result_type,
const Columns & key_columns,
const DataTypes & key_types,
@ -88,52 +113,90 @@ public:

private:

using RangeInterval = Interval<RangeStorageType>;
template <typename RangeStorageType>
using IntervalMap = IntervalMap<Interval<RangeStorageType>, size_t>;

template <typename T>
using Values = IntervalMap<RangeInterval, std::optional<T>>;
template <typename RangeStorageType>
using KeyAttributeContainerType = std::conditional_t<
dictionary_key_type == DictionaryKeyType::Simple,
HashMap<UInt64, IntervalMap<RangeStorageType>, DefaultHash<UInt64>>,
HashMapWithSavedHash<StringRef, IntervalMap<RangeStorageType>, DefaultHash<StringRef>>>;

template <typename Value>
using CollectionType = std::conditional_t<
dictionary_key_type == DictionaryKeyType::Simple,
HashMap<UInt64, Values<Value>, DefaultHash<UInt64>>,
HashMapWithSavedHash<StringRef, Values<Value>, DefaultHash<StringRef>>>;

using NoAttributesCollectionType = std::conditional_t<
dictionary_key_type == DictionaryKeyType::Simple,
HashMap<UInt64, IntervalSet<RangeInterval>>,
HashMapWithSavedHash<StringRef, IntervalSet<RangeInterval>>>;
using AttributeContainerType = std::conditional_t<std::is_same_v<Value, Array>, std::vector<Value>, PaddedPODArray<Value>>;

struct Attribute final
{
public:
AttributeUnderlyingType type;
bool is_nullable;

std::variant<
CollectionType<UInt8>,
CollectionType<UInt16>,
CollectionType<UInt32>,
CollectionType<UInt64>,
CollectionType<UInt128>,
CollectionType<UInt256>,
CollectionType<Int8>,
CollectionType<Int16>,
CollectionType<Int32>,
CollectionType<Int64>,
CollectionType<Int128>,
CollectionType<Int256>,
CollectionType<Decimal32>,
CollectionType<Decimal64>,
CollectionType<Decimal128>,
CollectionType<Decimal256>,
CollectionType<DateTime64>,
CollectionType<Float32>,
CollectionType<Float64>,
CollectionType<UUID>,
CollectionType<StringRef>,
CollectionType<Array>>
maps;
AttributeContainerType<UInt8>,
AttributeContainerType<UInt16>,
AttributeContainerType<UInt32>,
AttributeContainerType<UInt64>,
AttributeContainerType<UInt128>,
AttributeContainerType<UInt256>,
AttributeContainerType<Int8>,
AttributeContainerType<Int16>,
AttributeContainerType<Int32>,
AttributeContainerType<Int64>,
AttributeContainerType<Int128>,
AttributeContainerType<Int256>,
AttributeContainerType<Decimal32>,
AttributeContainerType<Decimal64>,
AttributeContainerType<Decimal128>,
AttributeContainerType<Decimal256>,
AttributeContainerType<DateTime64>,
AttributeContainerType<Float32>,
AttributeContainerType<Float64>,
AttributeContainerType<UUID>,
AttributeContainerType<StringRef>,
AttributeContainerType<Array>>
container;

std::optional<std::vector<bool>> is_value_nullable;
};

template <typename RangeStorageType>
struct InvalidIntervalWithKey
{
KeyType key;
Interval<RangeStorageType> interval;
size_t attribute_value_index;
};

template <typename RangeStorageType>
using InvalidIntervalsContainerType = PaddedPODArray<InvalidIntervalWithKey<RangeStorageType>>;

template <template<typename> typename ContainerType>
using RangeStorageTypeContainer = std::variant<
ContainerType<UInt8>,
ContainerType<UInt16>,
ContainerType<UInt32>,
ContainerType<UInt64>,
ContainerType<UInt128>,
ContainerType<UInt256>,
ContainerType<Int8>,
ContainerType<Int16>,
ContainerType<Int32>,
ContainerType<Int64>,
ContainerType<Int128>,
ContainerType<Int256>,
ContainerType<Decimal32>,
ContainerType<Decimal64>,
ContainerType<Decimal128>,
ContainerType<Decimal256>,
ContainerType<DateTime64>,
ContainerType<Float32>,
ContainerType<Float64>,
ContainerType<UUID>>;

struct KeyAttribute final
{
RangeStorageTypeContainer<KeyAttributeContainerType> container;

RangeStorageTypeContainer<InvalidIntervalsContainerType> invalid_intervals_container;

};

void createAttributes();
@ -151,43 +214,31 @@ private:
ValueSetter && set_value,
DefaultValueExtractor & default_value_extractor) const;

ColumnPtr getColumnInternal(
const std::string & attribute_name,
const DataTypePtr & result_type,
const PaddedPODArray<UInt64> & key_to_index) const;

template <typename AttributeType, bool is_nullable, typename ValueSetter>
void getItemsInternalImpl(
const Attribute & attribute,
const PaddedPODArray<UInt64> & key_to_index,
ValueSetter && set_value) const;

void updateData();

void blockToAttributes(const Block & block);

void buildAttributeIntervalTrees();

template <typename T>
void setAttributeValueImpl(Attribute & attribute, KeyType key, const RangeInterval & interval, const Field & value);

void setAttributeValue(Attribute & attribute, KeyType key, const RangeInterval & interval, const Field & value);

template <typename RangeType>
void getKeysAndDates(
PaddedPODArray<KeyType> & keys,
PaddedPODArray<RangeType> & start_dates,
PaddedPODArray<RangeType> & end_dates) const;

template <typename T, typename RangeType>
void getKeysAndDates(
const Attribute & attribute,
PaddedPODArray<KeyType> & keys,
PaddedPODArray<RangeType> & start_dates,
PaddedPODArray<RangeType> & end_dates) const;

template <typename RangeType>
PaddedPODArray<Int64> makeDateKeys(
const PaddedPODArray<RangeType> & block_start_dates,
const PaddedPODArray<RangeType> & block_end_dates) const;
void setAttributeValue(Attribute & attribute, const Field & value);

const DictionaryStructure dict_struct;
const DictionarySourcePtr source_ptr;
const DictionaryLifetime dict_lifetime;
const bool require_nonempty;
const RangeHashedDictionaryConfiguration configuration;
BlockPtr update_field_loaded_block;

std::vector<Attribute> attributes;
Arena complex_key_arena;
KeyAttribute key_attribute;

size_t bytes_allocated = 0;
size_t element_count = 0;
@ -195,7 +246,6 @@ private:
mutable std::atomic<size_t> query_count{0};
mutable std::atomic<size_t> found_count{0};
Arena string_arena;
NoAttributesCollectionType no_attributes_container;
};

}

@ -29,6 +29,7 @@ namespace ErrorCodes
extern const int CANNOT_TRUNCATE_FILE;
extern const int CANNOT_UNLINK;
extern const int CANNOT_RMDIR;
extern const int BAD_ARGUMENTS;
}

std::mutex DiskLocal::reservation_mutex;
@ -458,10 +459,16 @@ void registerDiskLocal(DiskFactory & factory)
const Poco::Util::AbstractConfiguration & config,
const String & config_prefix,
ContextPtr context,
const DisksMap & /*map*/) -> DiskPtr {
const DisksMap & map) -> DiskPtr {
String path;
UInt64 keep_free_space_bytes;
loadDiskLocalConfig(name, config, config_prefix, context, path, keep_free_space_bytes);

for (const auto & [disk_name, disk_ptr] : map)
{
if (path == disk_ptr->getPath())
throw Exception("Disk " + name + " and Disk " + disk_name + " cannot have the same path" + " (" + path + ")", ErrorCodes::BAD_ARGUMENTS);
}
return std::make_shared<DiskLocal>(name, path, keep_free_space_bytes);
};
factory.registerDiskType("local", creator);

@ -13,6 +13,8 @@
#include <Processors/Formats/Impl/ValuesBlockInputFormat.h>
#include <Poco/URI.h>
#include <Common/Exception.h>
#include <fcntl.h>
#include <unistd.h>

#include <boost/algorithm/string/case_conv.hpp>

@ -431,6 +433,9 @@ void FormatFactory::registerFileExtension(const String & extension, const String

String FormatFactory::getFormatFromFileName(String file_name, bool throw_if_not_found)
{
if (file_name == "stdin")
return getFormatFromFileDescriptor(STDIN_FILENO);

CompressionMethod compression_method = chooseCompressionMethod(file_name, "");
if (CompressionMethod::None != compression_method)
{
@ -459,6 +464,25 @@ String FormatFactory::getFormatFromFileName(String file_name, bool throw_if_not_
return it->second;
}

String FormatFactory::getFormatFromFileDescriptor(int fd)
{
#ifdef OS_LINUX
char buf[32] = {'\0'};
snprintf(buf, sizeof(buf), "/proc/self/fd/%d", fd);
char file_path[PATH_MAX] = {'\0'};
if (readlink(buf, file_path, sizeof(file_path) - 1) != -1)
return getFormatFromFileName(file_path, false);
return "";
#elif defined(__APPLE__)
char file_path[PATH_MAX] = {'\0'};
if (fcntl(fd, F_GETPATH, file_path) != -1)
return getFormatFromFileName(file_path, false);
return "";
#else
return "";
#endif
}

void FormatFactory::registerFileSegmentationEngine(const String & name, FileSegmentationEngine file_segmentation_engine)
{
auto & target = dict[name].file_segmentation_engine;

@ -187,6 +187,7 @@ public:
/// Register file extension for format
void registerFileExtension(const String & extension, const String & format_name);
String getFormatFromFileName(String file_name, bool throw_if_not_found = false);
String getFormatFromFileDescriptor(int fd);

/// Register schema readers for format its name.
void registerSchemaReader(const String & name, SchemaReaderCreator schema_reader_creator);

@ -17,7 +17,12 @@ namespace ErrorCodes
extern const int BAD_ARGUMENTS;
}

ColumnsDescription readSchemaFromFormat(const String & format_name, const std::optional<FormatSettings> & format_settings, ReadBufferCreator read_buffer_creator, ContextPtr context)
ColumnsDescription readSchemaFromFormat(
const String & format_name,
const std::optional<FormatSettings> & format_settings,
ReadBufferCreator read_buffer_creator,
ContextPtr context,
std::unique_ptr<ReadBuffer> & buf_out)
{
NamesAndTypesList names_and_types;
if (FormatFactory::instance().checkIfFormatHasExternalSchemaReader(format_name))
@ -34,11 +39,11 @@ ColumnsDescription readSchemaFromFormat(const String & format_name, const std::o
}
else if (FormatFactory::instance().checkIfFormatHasSchemaReader(format_name))
{
auto read_buf = read_buffer_creator();
if (read_buf->eof())
buf_out = read_buffer_creator();
if (buf_out->eof())
throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot extract table structure from {} format file, file is empty", format_name);

auto schema_reader = FormatFactory::instance().getSchemaReader(format_name, *read_buf, context, format_settings);
auto schema_reader = FormatFactory::instance().getSchemaReader(format_name, *buf_out, context, format_settings);
try
{
names_and_types = schema_reader->readSchema();
@ -54,6 +59,12 @@ ColumnsDescription readSchemaFromFormat(const String & format_name, const std::o
return ColumnsDescription(names_and_types);
}

ColumnsDescription readSchemaFromFormat(const String & format_name, const std::optional<FormatSettings> & format_settings, ReadBufferCreator read_buffer_creator, ContextPtr context)
{
std::unique_ptr<ReadBuffer> buf_out;
return readSchemaFromFormat(format_name, format_settings, read_buffer_creator, context, buf_out);
}

DataTypePtr generalizeDataType(DataTypePtr type)
{
WhichDataType which(type);

@ -15,7 +15,19 @@ namespace DB
/// If format doesn't have any schema reader or a schema reader
/// couldn't determine the schema, an exception will be thrown.
using ReadBufferCreator = std::function<std::unique_ptr<ReadBuffer>()>;
ColumnsDescription readSchemaFromFormat(const String & format_name, const std::optional<FormatSettings> & format_settings, ReadBufferCreator read_buffer_creator, ContextPtr context);
ColumnsDescription readSchemaFromFormat(
const String & format_name,
const std::optional<FormatSettings> & format_settings,
ReadBufferCreator read_buffer_creator,
ContextPtr context);

/// If ReadBuffer is created, it will be written to buf_out.
ColumnsDescription readSchemaFromFormat(
const String & format_name,
const std::optional<FormatSettings> & format_settings,
ReadBufferCreator read_buffer_creator,
ContextPtr context,
std::unique_ptr<ReadBuffer> & buf_out);

/// Convert type to the most general type:
/// - IntN, UIntN, FloatN, Decimal -> Float64

@ -76,6 +76,10 @@ endif()

target_link_libraries(clickhouse_functions PRIVATE ch_contrib::lz4)

if (ENABLE_NLP)
target_link_libraries(clickhouse_functions PRIVATE ch_contrib::cld2)
endif()

if (TARGET ch_contrib::h3)
target_link_libraries (clickhouse_functions PRIVATE ch_contrib::h3)
endif()

@ -18,6 +18,7 @@ namespace ErrorCodes
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int DECIMAL_OVERFLOW;
extern const int ILLEGAL_COLUMN;
}

/// Cast DateTime64 to Int64 representation narrowed down (or scaled up) to any scale value defined in Impl.
@ -108,8 +109,8 @@ public:
if (arguments.size() < 1 || arguments.size() > 2)
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} takes one or two arguments", name);

if (!typeid_cast<const DataTypeInt64 *>(arguments[0].type.get()))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "The first argument for function {} must be Int64", name);
if (!isInteger(arguments[0].type))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "The first argument for function {} must be integer", name);

std::string timezone;
if (arguments.size() == 2)
@ -118,21 +119,48 @@ public:
return std::make_shared<DataTypeDateTime64>(target_scale, timezone);
}

ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
template <typename T>
bool executeType(auto & result_column, const ColumnsWithTypeAndName & arguments, size_t input_rows_count) const
{
const auto & src = arguments[0];
const auto & col = *src.column;

auto res_column = ColumnDecimal<DateTime64>::create(input_rows_count, target_scale);
auto & result_data = res_column->getData();
if (!checkAndGetColumn<ColumnVector<T>>(col))
return 0;

const auto & source_data = typeid_cast<const ColumnInt64 &>(col).getData();
auto & result_data = result_column->getData();

const auto & source_data = typeid_cast<const ColumnVector<T> &>(col).getData();

for (size_t i = 0; i < input_rows_count; ++i)
result_data[i] = source_data[i];

return res_column;
return 1;
}

ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
{
auto result_column = ColumnDecimal<DateTime64>::create(input_rows_count, target_scale);

if (!((executeType<UInt8>(result_column, arguments, input_rows_count))
|| (executeType<UInt16>(result_column, arguments, input_rows_count))
|| (executeType<UInt32>(result_column, arguments, input_rows_count))
|| (executeType<UInt32>(result_column, arguments, input_rows_count))
|| (executeType<UInt64>(result_column, arguments, input_rows_count))
|| (executeType<Int8>(result_column, arguments, input_rows_count))
|| (executeType<Int16>(result_column, arguments, input_rows_count))
|| (executeType<Int32>(result_column, arguments, input_rows_count))
|| (executeType<Int64>(result_column, arguments, input_rows_count))))
{
throw Exception(ErrorCodes::ILLEGAL_COLUMN,
"Illegal column {} of first argument of function {}",
arguments[0].column->getName(),
getName());
}

return result_column;
}

};

}
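The executeType<T> helper above follows the usual pattern of probing the concrete column type and returning false so the caller can try the next candidate. A minimal standalone sketch of that dispatch idea, with hypothetical names and std::variant standing in for IColumn (the real code uses checkAndGetColumn):

    #include <cstdint>
    #include <variant>
    #include <vector>

    // AnyColumn is a stand-in for a type-erased column; each alternative is one concrete element type.
    using AnyColumn = std::variant<std::vector<int32_t>, std::vector<int64_t>, std::vector<uint32_t>>;

    template <typename T>
    bool convertIfType(const AnyColumn & col, std::vector<int64_t> & result)
    {
        const auto * concrete = std::get_if<std::vector<T>>(&col);
        if (!concrete)
            return false;                      // not this type, let the caller try the next one
        result.assign(concrete->begin(), concrete->end());
        return true;
    }

    bool convert(const AnyColumn & col, std::vector<int64_t> & result)
    {
        return convertIfType<int32_t>(col, result)
            || convertIfType<int64_t>(col, result)
            || convertIfType<uint32_t>(col, result);   // no match at all means "illegal column"
    }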
142
src/Functions/FunctionsCharsetClassification.cpp
Normal file
@ -0,0 +1,142 @@
#include <Common/FrequencyHolder.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionsTextClassification.h>

#include <memory>
#include <unordered_map>

namespace DB
{

/* Determine language and charset of text data. For each text, we build the distribution of bigrams bytes.
 * Then we use marked-up dictionaries with distributions of bigram bytes of various languages and charsets.
 * Using a naive Bayesian classifier, find the most likely charset and language and return it
 */

template <bool detect_language>
struct CharsetClassificationImpl
{
/* We need to solve zero-frequency problem for Naive Bayes Classifier
 * If the bigram is not found in the text, we assume that the probability of its meeting is 1e-06.
 * 1e-06 is minimal value in our marked-up dictionary.
 */
static constexpr Float64 zero_frequency = 1e-06;

/// If the data size is bigger than this, behaviour is unspecified for this function.
static constexpr size_t max_string_size = 1u << 15;

static ALWAYS_INLINE inline Float64 naiveBayes(
const FrequencyHolder::EncodingMap & standard,
const HashMap<UInt16, UInt64> & model,
Float64 max_result)
{
Float64 res = 0;
for (const auto & el : model)
{
/// Try to find bigram in the dictionary.
const auto * it = standard.find(el.getKey());
if (it != standard.end())
{
res += el.getMapped() * log(it->getMapped());
} else
{
res += el.getMapped() * log(zero_frequency);
}
/// If at some step the result has become less than the current maximum, then it makes no sense to count it fully.
if (res < max_result)
{
return res;
}
}
return res;
}

/// Count how many times each bigram occurs in the text.
static ALWAYS_INLINE inline void calculateStats(
const UInt8 * data,
const size_t size,
HashMap<UInt16, UInt64> & model)
{
UInt16 hash = 0;
for (size_t i = 0; i < size; ++i)
{
hash <<= 8;
hash += *(data + i);
++model[hash];
}
}

static void vector(
const ColumnString::Chars & data,
const ColumnString::Offsets & offsets,
ColumnString::Chars & res_data,
ColumnString::Offsets & res_offsets)
{
const auto & encodings_freq = FrequencyHolder::getInstance().getEncodingsFrequency();

if (detect_language)
/// 2 chars for ISO code + 1 zero byte
res_data.reserve(offsets.size() * 3);
else
/// Mean charset length is 8
res_data.reserve(offsets.size() * 8);

res_offsets.resize(offsets.size());

size_t res_offset = 0;

for (size_t i = 0; i < offsets.size(); ++i)
{
const UInt8 * str = data.data() + offsets[i - 1];
const size_t str_len = offsets[i] - offsets[i - 1] - 1;

std::string_view res;

HashMap<UInt16, UInt64> model;
calculateStats(str, str_len, model);

/// Go through the dictionary and find the charset with the highest weight
Float64 max_result = log(zero_frequency) * (max_string_size);
for (const auto & item : encodings_freq)
{
Float64 score = naiveBayes(item.map, model, max_result);
if (max_result < score)
{
max_result = score;
res = detect_language ? item.lang : item.name;
}
}

res_data.resize(res_offset + res.size() + 1);
memcpy(&res_data[res_offset], res.data(), res.size());

res_data[res_offset + res.size()] = 0;
res_offset += res.size() + 1;

res_offsets[i] = res_offset;
}
}
};


struct NameDetectCharset
{
static constexpr auto name = "detectCharset";
};

struct NameDetectLanguageUnknown
{
static constexpr auto name = "detectLanguageUnknown";
};


using FunctionDetectCharset = FunctionTextClassificationString<CharsetClassificationImpl<false>, NameDetectCharset>;
using FunctionDetectLanguageUnknown = FunctionTextClassificationString<CharsetClassificationImpl<true>, NameDetectLanguageUnknown>;

void registerFunctionDetectCharset(FunctionFactory & factory)
{
factory.registerFunction<FunctionDetectCharset>();
factory.registerFunction<FunctionDetectLanguageUnknown>();
}

}
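The comment at the top of this file describes the whole algorithm: build a byte-bigram histogram per string, then score it against each charset's frequency table as a log-likelihood sum with a zero-frequency floor, and pick the best-scoring charset. A simplified standalone illustration of that scoring, with placeholder tables instead of FrequencyHolder (not the actual sources):

    #include <cmath>
    #include <cstdint>
    #include <map>
    #include <string>

    using Histogram = std::map<uint16_t, uint64_t>;

    // Count every pair of adjacent bytes (the "bigram" hash rolls one byte at a time).
    Histogram bigramHistogram(const std::string & text)
    {
        Histogram model;
        uint16_t hash = 0;
        for (unsigned char c : text)
        {
            hash = static_cast<uint16_t>((hash << 8) | c);
            ++model[hash];
        }
        return model;
    }

    // Naive Bayes log-likelihood of the histogram under one charset's bigram frequencies.
    double naiveBayesScore(const std::map<uint16_t, double> & charset_freq, const Histogram & model)
    {
        constexpr double zero_frequency = 1e-6;   // floor for unseen bigrams
        double score = 0;
        for (const auto & [bigram, count] : model)
        {
            auto it = charset_freq.find(bigram);
            double p = (it != charset_freq.end()) ? it->second : zero_frequency;
            score += count * std::log(p);
        }
        return score;   // the charset with the largest score wins
    }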
231
src/Functions/FunctionsLanguageClassification.cpp
Normal file
@ -0,0 +1,231 @@
#include "config_functions.h"

#if USE_NLP

#include <Columns/ColumnMap.h>
#include <Columns/ColumnArray.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnsNumber.h>
#include <Common/isValidUTF8.h>
#include <DataTypes/DataTypeMap.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypesNumber.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionsTextClassification.h>
#include <Interpreters/Context.h>

#include <compact_lang_det.h>

namespace DB
{
/* Determine language of Unicode UTF-8 text.
 * Uses the cld2 library https://github.com/CLD2Owners/cld2
 */

namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int ILLEGAL_COLUMN;
extern const int SUPPORT_IS_DISABLED;
}

struct FunctionDetectLanguageImpl
{
static ALWAYS_INLINE inline std::string_view codeISO(std::string_view code_string)
{
if (code_string.ends_with("-Latn"))
code_string.remove_suffix(code_string.size() - 5);

if (code_string.ends_with("-Hant"))
code_string.remove_suffix(code_string.size() - 5);

// Old deprecated codes
if (code_string == "iw")
return "he";

if (code_string == "jw")
return "jv";

if (code_string == "in")
return "id";

if (code_string == "mo")
return "ro";

// Some languages do not have 2 letter codes, for example code for Cebuano is ceb
if (code_string.size() != 2)
return "other";

return code_string;
}

static void vector(
const ColumnString::Chars & data,
const ColumnString::Offsets & offsets,
ColumnString::Chars & res_data,
ColumnString::Offsets & res_offsets)
{
/// Constant 3 is based on the fact that in general we need 2 characters for ISO code + 1 zero byte
res_data.reserve(offsets.size() * 3);
res_offsets.resize(offsets.size());

bool is_reliable;
size_t res_offset = 0;

for (size_t i = 0; i < offsets.size(); ++i)
{
const UInt8 * str = data.data() + offsets[i - 1];
const size_t str_len = offsets[i] - offsets[i - 1] - 1;

std::string_view res;

if (UTF8::isValidUTF8(str, str_len))
{
auto lang = CLD2::DetectLanguage(reinterpret_cast<const char *>(str), str_len, true, &is_reliable);
res = codeISO(LanguageCode(lang));
}
else
{
res = "un";
}

res_data.resize(res_offset + res.size() + 1);
memcpy(&res_data[res_offset], res.data(), res.size());

res_data[res_offset + res.size()] = 0;
res_offset += res.size() + 1;

res_offsets[i] = res_offset;
}
}
};

class FunctionDetectLanguageMixed : public IFunction
{
public:
static constexpr auto name = "detectLanguageMixed";

/// Number of top results
static constexpr auto top_N = 3;

static FunctionPtr create(ContextPtr context)
{
if (!context->getSettingsRef().allow_experimental_nlp_functions)
throw Exception(ErrorCodes::SUPPORT_IS_DISABLED,
"Natural language processing function '{}' is experimental. Set `allow_experimental_nlp_functions` setting to enable it", name);

return std::make_shared<FunctionDetectLanguageMixed>();
}

String getName() const override { return name; }

size_t getNumberOfArguments() const override { return 1; }

bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }

bool useDefaultImplementationForConstants() const override { return true; }

DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
if (!isString(arguments[0]))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"Illegal type {} of argument of function {}. Must be String.",
arguments[0]->getName(), getName());

return std::make_shared<DataTypeMap>(std::make_shared<DataTypeString>(), std::make_shared<DataTypeFloat32>());
}

ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override
{
const auto & column = arguments[0].column;
const ColumnString * col = checkAndGetColumn<ColumnString>(column.get());

if (!col)
throw Exception(
"Illegal columns " + arguments[0].column->getName() + " of arguments of function " + getName(),
ErrorCodes::ILLEGAL_COLUMN);

const auto & input_data = col->getChars();
const auto & input_offsets = col->getOffsets();

/// Create and fill the result map.

const auto & result_type_map = static_cast<const DataTypeMap &>(*result_type);
const DataTypePtr & key_type = result_type_map.getKeyType();
const DataTypePtr & value_type = result_type_map.getValueType();

MutableColumnPtr keys_data = key_type->createColumn();
MutableColumnPtr values_data = value_type->createColumn();
MutableColumnPtr offsets = DataTypeNumber<IColumn::Offset>().createColumn();

size_t total_elements = input_rows_count * top_N;
keys_data->reserve(total_elements);
values_data->reserve(total_elements);
offsets->reserve(input_rows_count);

bool is_reliable;
CLD2::Language result_lang_top3[top_N];
int32_t pc[top_N];
int bytes[top_N];

IColumn::Offset current_offset = 0;
for (size_t i = 0; i < input_rows_count; ++i)
{
const UInt8 * str = input_data.data() + input_offsets[i - 1];
const size_t str_len = input_offsets[i] - input_offsets[i - 1] - 1;

if (UTF8::isValidUTF8(str, str_len))
{
CLD2::DetectLanguageSummary(reinterpret_cast<const char *>(str), str_len, true, result_lang_top3, pc, bytes, &is_reliable);

for (size_t j = 0; j < top_N; ++j)
{
if (pc[j] == 0)
break;

auto res_str = FunctionDetectLanguageImpl::codeISO(LanguageCode(result_lang_top3[j]));
Float32 res_float = static_cast<Float32>(pc[j]) / 100;

keys_data->insertData(res_str.data(), res_str.size());
values_data->insertData(reinterpret_cast<const char *>(&res_float), sizeof(res_float));
++current_offset;
}
}
else
{
std::string_view res_str = "un";
Float32 res_float = 0;

keys_data->insertData(res_str.data(), res_str.size());
values_data->insertData(reinterpret_cast<const char *>(&res_float), sizeof(res_float));
++current_offset;
}
offsets->insert(current_offset);
}

auto nested_column = ColumnArray::create(
ColumnTuple::create(Columns{std::move(keys_data), std::move(values_data)}),
std::move(offsets));

return ColumnMap::create(nested_column);
}
};

struct NameDetectLanguage
{
static constexpr auto name = "detectLanguage";
};


using FunctionDetectLanguage = FunctionTextClassificationString<FunctionDetectLanguageImpl, NameDetectLanguage>;

void registerFunctionsDetectLanguage(FunctionFactory & factory)
{
factory.registerFunction<FunctionDetectLanguage>();
factory.registerFunction<FunctionDetectLanguageMixed>();
}

}
#endif
120
src/Functions/FunctionsProgrammingClassification.cpp
Normal file
@ -0,0 +1,120 @@
#include <Common/FrequencyHolder.h>
#include <Common/StringUtils/StringUtils.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionsTextClassification.h>

#include <unordered_map>
#include <string_view>

namespace DB
{

/**
 * Determine the programming language from the source code.
 * We calculate all the unigrams and bigrams of commands in the source code.
 * Then using a marked-up dictionary with weights of unigrams and bigrams of commands for various programming languages
 * Find the biggest weight of the programming language and return it
 */
struct FunctionDetectProgrammingLanguageImpl
{
/// Calculate total weight
static ALWAYS_INLINE inline Float64 stateMachine(
const FrequencyHolder::Map & standard,
const std::unordered_map<String, Float64> & model)
{
Float64 res = 0;
for (const auto & el : model)
{
/// Try to find each n-gram in dictionary
const auto * it = standard.find(el.first);
if (it != standard.end())
res += el.second * it->getMapped();
}
return res;
}

static void vector(
const ColumnString::Chars & data,
const ColumnString::Offsets & offsets,
ColumnString::Chars & res_data,
ColumnString::Offsets & res_offsets)
{
const auto & programming_freq = FrequencyHolder::getInstance().getProgrammingFrequency();

/// Constant 5 is arbitrary
res_data.reserve(offsets.size() * 5);
res_offsets.resize(offsets.size());

size_t res_offset = 0;

for (size_t i = 0; i < offsets.size(); ++i)
{
const UInt8 * str = data.data() + offsets[i - 1];
const size_t str_len = offsets[i] - offsets[i - 1] - 1;

std::unordered_map<String, Float64> data_freq;
StringRef prev_command;
StringRef command;

/// Select all commands from the string
for (size_t ind = 0; ind < str_len; ++ind)
{
/// Assume that all commands are split by spaces
if (isWhitespaceASCII(str[ind]))
continue;

size_t prev_ind = ind;
while (ind < str_len && !isWhitespaceASCII(str[ind]))
++ind;

command = {str + prev_ind, ind - prev_ind};

/// We add both unigrams and bigrams to later search for them in the dictionary
if (prev_command.data)
data_freq[prev_command.toString() + command.toString()] += 1;

data_freq[command.toString()] += 1;
prev_command = command;
}

std::string_view res;
Float64 max_result = 0;
/// Iterate over all programming languages and find the language with the highest weight
for (const auto & item : programming_freq)
{
Float64 result = stateMachine(item.map, data_freq);
if (result > max_result)
{
max_result = result;
res = item.name;
}
}
/// If all weights are zero, then we assume that the language is undefined
if (res.empty())
res = "Undefined";

res_data.resize(res_offset + res.size() + 1);
memcpy(&res_data[res_offset], res.data(), res.size());

res_data[res_offset + res.size()] = 0;
res_offset += res.size() + 1;

res_offsets[i] = res_offset;
}
}
};

struct NameDetectProgrammingLanguage
{
static constexpr auto name = "detectProgrammingLanguage";
};


using FunctionDetectProgrammingLanguage = FunctionTextClassificationString<FunctionDetectProgrammingLanguageImpl, NameDetectProgrammingLanguage>;

void registerFunctionDetectProgrammingLanguage(FunctionFactory & factory)
{
factory.registerFunction<FunctionDetectProgrammingLanguage>();
}

}
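The header comment of this file summarizes the technique: split the source text into whitespace-separated "commands", count unigrams and adjacent-pair bigrams, and take the language whose weight table gives the largest dot product with those counts. A simplified standalone illustration, with a placeholder weight table instead of FrequencyHolder (not the actual sources):

    #include <sstream>
    #include <string>
    #include <unordered_map>

    // Build unigram and bigram counts from whitespace-separated tokens.
    std::unordered_map<std::string, double> tokenFeatures(const std::string & source)
    {
        std::unordered_map<std::string, double> freq;
        std::istringstream in(source);
        std::string prev, token;
        while (in >> token)
        {
            freq[token] += 1;                 // unigram
            if (!prev.empty())
                freq[prev + token] += 1;      // bigram of adjacent tokens
            prev = token;
        }
        return freq;
    }

    // Score one language: dot product of its marked-up weights with the observed counts.
    double languageScore(const std::unordered_map<std::string, double> & weights,
                         const std::unordered_map<std::string, double> & features)
    {
        double score = 0;
        for (const auto & [ngram, count] : features)
            if (auto it = weights.find(ngram); it != weights.end())
                score += count * it->second;
        return score;   // highest score wins; all-zero means "Undefined"
    }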
122
src/Functions/FunctionsTextClassification.h
Normal file
@ -0,0 +1,122 @@
#pragma once

#include <Columns/ColumnString.h>
#include <Columns/ColumnVector.h>
#include <DataTypes/DataTypesNumber.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/IFunction.h>
#include <Interpreters/Context_fwd.h>
#include <Functions/FunctionFactory.h>
#include <Interpreters/Context.h>

namespace DB
{
/// Functions for text classification with different result types

namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int ILLEGAL_COLUMN;
extern const int SUPPORT_IS_DISABLED;
}

template <typename Impl, typename Name>
class FunctionTextClassificationString : public IFunction
{
public:
static constexpr auto name = Name::name;

static FunctionPtr create(ContextPtr context)
{
if (!context->getSettingsRef().allow_experimental_nlp_functions)
throw Exception(ErrorCodes::SUPPORT_IS_DISABLED,
"Natural language processing function '{}' is experimental. Set `allow_experimental_nlp_functions` setting to enable it", name);

return std::make_shared<FunctionTextClassificationString>();
}

String getName() const override { return name; }

size_t getNumberOfArguments() const override { return 1; }

bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }

bool useDefaultImplementationForConstants() const override { return true; }

DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
if (!isString(arguments[0]))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"Illegal type {} of argument of function {}. Must be String.",
arguments[0]->getName(), getName());

return arguments[0];
}

ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & /*result_type*/, size_t /*input_rows_count*/) const override
{
const ColumnPtr & column = arguments[0].column;
const ColumnString * col = checkAndGetColumn<ColumnString>(column.get());

if (!col)
throw Exception(
"Illegal column " + arguments[0].column->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_COLUMN);

auto col_res = ColumnString::create();
Impl::vector(col->getChars(), col->getOffsets(), col_res->getChars(), col_res->getOffsets());
return col_res;
}
};

template <typename Impl, typename Name>
class FunctionTextClassificationFloat : public IFunction
{
public:
static constexpr auto name = Name::name;

static FunctionPtr create(ContextPtr context)
{
if (!context->getSettingsRef().allow_experimental_nlp_functions)
throw Exception(ErrorCodes::SUPPORT_IS_DISABLED,
"Natural language processing function '{}' is experimental. Set `allow_experimental_nlp_functions` setting to enable it", name);

return std::make_shared<FunctionTextClassificationFloat>();
}

String getName() const override { return name; }

size_t getNumberOfArguments() const override { return 1; }

bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }

bool useDefaultImplementationForConstants() const override { return true; }

DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
if (!isString(arguments[0]))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"Illegal type {} of argument of function {}. Must be String.",
arguments[0]->getName(), getName());

return std::make_shared<DataTypeFloat32>();
}

ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & /*result_type*/, size_t /*input_rows_count*/) const override
{
const ColumnPtr & column = arguments[0].column;
const ColumnString * col = checkAndGetColumn<ColumnString>(column.get());

if (!col)
throw Exception(
"Illegal column " + arguments[0].column->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_COLUMN);

auto col_res = ColumnVector<Float32>::create();
ColumnVector<Float32>::Container & vec_res = col_res->getData();
vec_res.resize(col->size());

Impl::vector(col->getChars(), col->getOffsets(), vec_res);
return col_res;
}
};

}
89
src/Functions/FunctionsTonalityClassification.cpp
Normal file
@ -0,0 +1,89 @@
#include <Common/FrequencyHolder.h>
#include <Common/StringUtils/StringUtils.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionsTextClassification.h>

#include <unordered_map>

namespace DB
{

/**
 * Determines the sentiment of text data.
 * Uses a marked-up sentiment dictionary, each word has a tonality ranging from -12 to 6.
 * For each text, calculate the average sentiment value of its words and return it in range [-1,1]
 */
struct FunctionDetectTonalityImpl
{
static ALWAYS_INLINE inline Float32 detectTonality(
const UInt8 * str,
const size_t str_len,
const FrequencyHolder::Map & emotional_dict)
{
Float64 weight = 0;
UInt64 count_words = 0;

String word;
/// Select all Russian words from the string
for (size_t ind = 0; ind < str_len; ++ind)
{
/// Split words by whitespaces and punctuation signs
if (isWhitespaceASCII(str[ind]) || isPunctuationASCII(str[ind]))
continue;

while (ind < str_len && !(isWhitespaceASCII(str[ind]) || isPunctuationASCII(str[ind])))
{
word.push_back(str[ind]);
++ind;
}
/// Try to find a russian word in the tonality dictionary
const auto * it = emotional_dict.find(word);
if (it != emotional_dict.end())
{
count_words += 1;
weight += it->getMapped();
}
word.clear();
}

if (!count_words)
return 0;

/// Calculate average value of tonality.
/// Convert values -12..6 to -1..1
if (weight > 0)
return weight / count_words / 6;
else
return weight / count_words / 12;
}

static void vector(
const ColumnString::Chars & data,
const ColumnString::Offsets & offsets,
PaddedPODArray<Float32> & res)
{
const auto & emotional_dict = FrequencyHolder::getInstance().getEmotionalDict();

size_t size = offsets.size();
size_t prev_offset = 0;
for (size_t i = 0; i < size; ++i)
{
res[i] = detectTonality(data.data() + prev_offset, offsets[i] - 1 - prev_offset, emotional_dict);
prev_offset = offsets[i];
}
}
};

struct NameDetectTonality
{
static constexpr auto name = "detectTonality";
};

using FunctionDetectTonality = FunctionTextClassificationFloat<FunctionDetectTonalityImpl, NameDetectTonality>;

void registerFunctionDetectTonality(FunctionFactory & factory)
{
factory.registerFunction<FunctionDetectTonality>();
}

}
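Because the dictionary weights in this file live in the asymmetric range [-12, 6], the averaging step divides a positive average by 6 and a negative one by 12 so the result lands in [-1, 1]. A small standalone sketch of just that normalization, with made-up numbers for illustration:

    // Standalone sketch of the normalization above: positive averages are scaled by 1/6,
    // negative ones by 1/12, mapping the average word weight into [-1, 1].
    float normalizeTonality(double total_weight, unsigned long word_count)
    {
        if (word_count == 0)
            return 0;
        double average = total_weight / word_count;
        return static_cast<float>(average > 0 ? average / 6 : average / 12);
    }

    // Example with made-up weights: three words scored +3, +6 and -1 sum to 8,
    // so normalizeTonality(8, 3) returns about 0.44; an all-negative text trends toward -1.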
@ -8,4 +8,5 @@
#cmakedefine01 USE_H3
#cmakedefine01 USE_S2_GEOMETRY
#cmakedefine01 USE_FASTOPS
#cmakedefine01 USE_NLP
#cmakedefine01 USE_HYPERSCAN

@ -39,6 +39,9 @@ void registerFunctionEncodeXMLComponent(FunctionFactory &);
void registerFunctionDecodeXMLComponent(FunctionFactory &);
void registerFunctionExtractTextFromHTML(FunctionFactory &);
void registerFunctionToStringCutToZero(FunctionFactory &);
void registerFunctionDetectCharset(FunctionFactory &);
void registerFunctionDetectTonality(FunctionFactory &);
void registerFunctionDetectProgrammingLanguage(FunctionFactory &);

#if USE_BASE64
void registerFunctionBase64Encode(FunctionFactory &);
@ -50,6 +53,7 @@ void registerFunctionTryBase64Decode(FunctionFactory &);
void registerFunctionStem(FunctionFactory &);
void registerFunctionSynonyms(FunctionFactory &);
void registerFunctionLemmatize(FunctionFactory &);
void registerFunctionsDetectLanguage(FunctionFactory &);
#endif

#if USE_ICU
@ -91,6 +95,9 @@ void registerFunctionsString(FunctionFactory & factory)
registerFunctionDecodeXMLComponent(factory);
registerFunctionExtractTextFromHTML(factory);
registerFunctionToStringCutToZero(factory);
registerFunctionDetectCharset(factory);
registerFunctionDetectTonality(factory);
registerFunctionDetectProgrammingLanguage(factory);

#if USE_BASE64
registerFunctionBase64Encode(factory);
@ -102,6 +109,7 @@ void registerFunctionsString(FunctionFactory & factory)
registerFunctionStem(factory);
registerFunctionSynonyms(factory);
registerFunctionLemmatize(factory);
registerFunctionsDetectLanguage(factory);
#endif

#if USE_ICU

@ -35,11 +35,9 @@ namespace ClusterProxy

SelectStreamFactory::SelectStreamFactory(
const Block & header_,
QueryProcessingStage::Enum processed_stage_,
bool has_virtual_shard_num_column_)
: header(header_),
processed_stage{processed_stage_},
has_virtual_shard_num_column(has_virtual_shard_num_column_)
QueryProcessingStage::Enum processed_stage_)
: header(header_)
, processed_stage{processed_stage_}
{
}

@ -102,19 +100,15 @@ void SelectStreamFactory::createForShard(
Shards & remote_shards,
UInt32 shard_count)
{
auto modified_query_ast = query_ast->clone();
if (has_virtual_shard_num_column)
VirtualColumnUtils::rewriteEntityInAst(modified_query_ast, "_shard_num", shard_info.shard_num, "toUInt32");

auto emplace_local_stream = [&]()
{
local_plans.emplace_back(createLocalPlan(modified_query_ast, header, context, processed_stage, shard_info.shard_num, shard_count));
local_plans.emplace_back(createLocalPlan(query_ast, header, context, processed_stage, shard_info.shard_num, shard_count));
};

auto emplace_remote_stream = [&](bool lazy = false, UInt32 local_delay = 0)
{
remote_shards.emplace_back(Shard{
.query = modified_query_ast,
.query = query_ast,
.header = header,
.shard_num = shard_info.shard_num,
.num_replicas = shard_info.getAllNodeCount(),

@ -16,8 +16,7 @@ class SelectStreamFactory final : public IStreamFactory
public:
SelectStreamFactory(
const Block & header_,
QueryProcessingStage::Enum processed_stage_,
bool has_virtual_shard_num_column_);
QueryProcessingStage::Enum processed_stage_);

void createForShard(
const Cluster::ShardInfo & shard_info,
@ -32,8 +31,6 @@ public:
private:
const Block header;
QueryProcessingStage::Enum processed_stage;

bool has_virtual_shard_num_column = false;
};

}

@ -32,15 +32,12 @@
#include <base/scope_guard.h>


#define DBMS_SYSTEM_LOG_QUEUE_SIZE 1048576

namespace DB
{

namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
extern const int TIMEOUT_EXCEEDED;
extern const int LOGICAL_ERROR;
}

@ -114,13 +111,12 @@ std::shared_ptr<TSystemLog> createSystemLog(
return std::make_shared<TSystemLog>(context, database, table, engine, flush_interval_milliseconds);
}

}


///
/// ISystemLog
///
ASTPtr ISystemLog::getCreateTableQueryClean(const StorageID & table_id, ContextPtr context)
/// returns CREATE TABLE query, but with removed:
/// - UUID
/// - SETTINGS (for MergeTree)
/// That way it can be used to compare with the SystemLog::getCreateTableQuery()
ASTPtr getCreateTableQueryClean(const StorageID & table_id, ContextPtr context)
{
DatabasePtr database = DatabaseCatalog::instance().getDatabase(table_id.database_name);
ASTPtr old_ast = database->getCreateTableQuery(table_id.table_name, context);
@ -135,37 +131,8 @@ ASTPtr ISystemLog::getCreateTableQueryClean(const StorageID & table_id, ContextP
return old_ast;
}

void ISystemLog::stopFlushThread()
{
{
std::lock_guard lock(mutex);

if (!saving_thread.joinable())
{
return;
}

if (is_shutdown)
{
return;
}

is_shutdown = true;

/// Tell thread to shutdown.
flush_event.notify_all();
}

saving_thread.join();
}
void ISystemLog::startup()
|
||||
{
|
||||
std::lock_guard lock(mutex);
|
||||
saving_thread = ThreadFromGlobalPool([this] { savingThreadFunction(); });
|
||||
}
|
||||
|
||||
|
||||
///
|
||||
/// SystemLogs
|
||||
///
|
||||
@ -270,77 +237,6 @@ SystemLog<LogElement>::SystemLog(
|
||||
log = &Poco::Logger::get("SystemLog (" + database_name_ + "." + table_name_ + ")");
|
||||
}
|
||||
|
||||
|
||||
static thread_local bool recursive_add_call = false;
|
||||
|
||||
template <typename LogElement>
|
||||
void SystemLog<LogElement>::add(const LogElement & element)
|
||||
{
|
||||
/// It is possible that the method will be called recursively.
|
||||
/// Better to drop these events to avoid complications.
|
||||
if (recursive_add_call)
|
||||
return;
|
||||
recursive_add_call = true;
|
||||
SCOPE_EXIT({ recursive_add_call = false; });
|
||||
|
||||
/// Memory can be allocated while resizing on queue.push_back.
|
||||
/// The size of allocation can be in order of a few megabytes.
|
||||
/// But this should not be accounted for query memory usage.
|
||||
/// Otherwise tests like 01017_uniqCombined_memory_usage.sql will be flaky.
|
||||
MemoryTrackerBlockerInThread temporarily_disable_memory_tracker(VariableContext::Global);
|
||||
|
||||
/// Should not log messages under mutex.
|
||||
bool queue_is_half_full = false;
|
||||
|
||||
{
|
||||
std::unique_lock lock(mutex);
|
||||
|
||||
if (is_shutdown)
|
||||
return;
|
||||
|
||||
if (queue.size() == DBMS_SYSTEM_LOG_QUEUE_SIZE / 2)
|
||||
{
|
||||
queue_is_half_full = true;
|
||||
|
||||
// The queue is more than half full, time to flush.
|
||||
// We only check for strict equality, because messages are added one
|
||||
// by one, under exclusive lock, so we will see each message count.
|
||||
// It is enough to only wake the flushing thread once, after the message
|
||||
// count increases past half available size.
|
||||
const uint64_t queue_end = queue_front_index + queue.size();
|
||||
if (requested_flush_up_to < queue_end)
|
||||
requested_flush_up_to = queue_end;
|
||||
|
||||
flush_event.notify_all();
|
||||
}
|
||||
|
||||
if (queue.size() >= DBMS_SYSTEM_LOG_QUEUE_SIZE)
|
||||
{
|
||||
// Ignore all further entries until the queue is flushed.
|
||||
// Log a message about that. Don't spam it -- this might be especially
|
||||
// problematic in case of trace log. Remember what the front index of the
|
||||
// queue was when we last logged the message. If it changed, it means the
|
||||
// queue was flushed, and we can log again.
|
||||
if (queue_front_index != logged_queue_full_at_index)
|
||||
{
|
||||
logged_queue_full_at_index = queue_front_index;
|
||||
|
||||
// TextLog sets its logger level to 0, so this log is a noop and
|
||||
// there is no recursive logging.
|
||||
lock.unlock();
|
||||
LOG_ERROR(log, "Queue is full for system log '{}' at {}", demangle(typeid(*this).name()), queue_front_index);
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
queue.push_back(element);
|
||||
}
|
||||
|
||||
if (queue_is_half_full)
|
||||
LOG_INFO(log, "Queue is half full for system log '{}'.", demangle(typeid(*this).name()));
|
||||
}
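A rough, self-contained sketch of the queueing policy used by add() above (simplified types, hypothetical class, not the real SystemLog): the flushing thread is notified exactly once when the queue crosses the half-full mark, and entries are silently dropped once the queue is full.

#include <condition_variable>
#include <cstddef>
#include <mutex>
#include <string>
#include <vector>

/// Illustration of the add() policy: notify on half-full, drop on full.
class BoundedLogQueue
{
public:
    explicit BoundedLogQueue(size_t max_size_) : max_size(max_size_) {}

    void add(std::string element)
    {
        std::lock_guard lock(mutex);
        /// Strict equality: entries are added one by one under the lock,
        /// so the flushing thread is woken only once per fill-up.
        if (queue.size() == max_size / 2)
            flush_event.notify_all();
        if (queue.size() >= max_size)
            return;                      /// queue is full: drop the entry
        queue.push_back(std::move(element));
    }

private:
    const size_t max_size;
    std::mutex mutex;
    std::condition_variable flush_event;
    std::vector<std::string> queue;
};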
|
||||
|
||||
template <typename LogElement>
|
||||
void SystemLog<LogElement>::shutdown()
|
||||
{
|
||||
@ -351,48 +247,6 @@ void SystemLog<LogElement>::shutdown()
|
||||
table->flushAndShutdown();
|
||||
}
|
||||
|
||||
template <typename LogElement>
|
||||
void SystemLog<LogElement>::flush(bool force)
|
||||
{
|
||||
uint64_t this_thread_requested_offset;
|
||||
|
||||
{
|
||||
std::unique_lock lock(mutex);
|
||||
|
||||
if (is_shutdown)
|
||||
return;
|
||||
|
||||
this_thread_requested_offset = queue_front_index + queue.size();
|
||||
|
||||
// Publish our flush request, taking care not to overwrite the requests
|
||||
// made by other threads.
|
||||
is_force_prepare_tables |= force;
|
||||
requested_flush_up_to = std::max(requested_flush_up_to,
|
||||
this_thread_requested_offset);
|
||||
|
||||
flush_event.notify_all();
|
||||
}
|
||||
|
||||
LOG_DEBUG(log, "Requested flush up to offset {}",
|
||||
this_thread_requested_offset);
|
||||
|
||||
// Use an arbitrary timeout to avoid endless waiting. 60s proved to be
|
||||
// too fast for our parallel functional tests, probably because they
|
||||
// heavily load the disk.
|
||||
const int timeout_seconds = 180;
|
||||
std::unique_lock lock(mutex);
|
||||
bool result = flush_event.wait_for(lock, std::chrono::seconds(timeout_seconds),
|
||||
[&] { return flushed_up_to >= this_thread_requested_offset
|
||||
&& !is_force_prepare_tables; });
|
||||
|
||||
if (!result)
|
||||
{
|
||||
throw Exception("Timeout exceeded (" + toString(timeout_seconds) + " s) while flushing system log '" + demangle(typeid(*this).name()) + "'.",
|
||||
ErrorCodes::TIMEOUT_EXCEEDED);
|
||||
}
|
||||
}
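The synchronous part of flush() reduces to a condition-variable wait with a predicate and a hard timeout. A generic sketch of that pattern (standalone names, not the actual SystemLog members):

#include <chrono>
#include <condition_variable>
#include <cstdint>
#include <mutex>
#include <stdexcept>

/// Block until the saving thread reports progress past `requested_offset`, or give up after `timeout_seconds`.
void waitFlushed(std::mutex & mutex, std::condition_variable & flush_event,
                 const uint64_t & flushed_up_to, uint64_t requested_offset, int timeout_seconds = 180)
{
    std::unique_lock lock(mutex);
    bool flushed = flush_event.wait_for(lock, std::chrono::seconds(timeout_seconds),
        [&] { return flushed_up_to >= requested_offset; });
    if (!flushed)
        throw std::runtime_error("Timeout exceeded while flushing system log");
}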
|
||||
|
||||
|
||||
template <typename LogElement>
|
||||
void SystemLog<LogElement>::savingThreadFunction()
|
||||
{
|
||||
@ -625,17 +479,7 @@ ASTPtr SystemLog<LogElement>::getCreateTableQuery()
|
||||
return create;
|
||||
}
|
||||
|
||||
template class SystemLog<AsynchronousMetricLogElement>;
|
||||
template class SystemLog<CrashLogElement>;
|
||||
template class SystemLog<MetricLogElement>;
|
||||
template class SystemLog<OpenTelemetrySpanLogElement>;
|
||||
template class SystemLog<PartLogElement>;
|
||||
template class SystemLog<QueryLogElement>;
|
||||
template class SystemLog<QueryThreadLogElement>;
|
||||
template class SystemLog<QueryViewsLogElement>;
|
||||
template class SystemLog<SessionLogElement>;
|
||||
template class SystemLog<TraceLogElement>;
|
||||
template class SystemLog<ZooKeeperLogElement>;
|
||||
template class SystemLog<TextLogElement>;
|
||||
#define INSTANTIATE_SYSTEM_LOG(ELEMENT) template class SystemLog<ELEMENT>;
|
||||
SYSTEM_LOG_ELEMENTS(INSTANTIATE_SYSTEM_LOG)
|
||||
|
||||
}
|
||||
|
@ -1,34 +1,12 @@
|
||||
#pragma once
|
||||
|
||||
#include <thread>
|
||||
#include <atomic>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
#include <condition_variable>
|
||||
#include <boost/noncopyable.hpp>
|
||||
#include <Common/SystemLogBase.h>
|
||||
|
||||
#include <base/types.h>
|
||||
#include <Core/Defines.h>
|
||||
#include <Storages/IStorage_fwd.h>
|
||||
#include <Interpreters/Context_fwd.h>
|
||||
#include <Interpreters/StorageID.h>
|
||||
#include <Parsers/IAST_fwd.h>
|
||||
#include <Common/ThreadPool.h>
|
||||
|
||||
|
||||
namespace Poco
|
||||
{
|
||||
class Logger;
|
||||
namespace Util
|
||||
{
|
||||
class AbstractConfiguration;
|
||||
}
|
||||
}
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
|
||||
/** Allow to store structured log in system table.
|
||||
*
|
||||
* Logging is asynchronous. Data is put into queue from where it will be read by separate thread.
|
||||
@ -66,44 +44,6 @@ class QueryViewsLog;
|
||||
class ZooKeeperLog;
|
||||
class SessionLog;
|
||||
|
||||
|
||||
class ISystemLog
|
||||
{
|
||||
public:
|
||||
virtual String getName() = 0;
|
||||
//// force -- force table creation (used for SYSTEM FLUSH LOGS)
|
||||
virtual void flush(bool force = false) = 0;
|
||||
virtual void prepareTable() = 0;
|
||||
|
||||
/// Start the background thread.
|
||||
virtual void startup();
|
||||
|
||||
/// Stop the background flush thread before destructor. No more data will be written.
|
||||
virtual void shutdown() = 0;
|
||||
|
||||
virtual ~ISystemLog() = default;
|
||||
|
||||
virtual void savingThreadFunction() = 0;
|
||||
|
||||
/// returns CREATE TABLE query, but with removed:
|
||||
/// - UUID
|
||||
/// - SETTINGS (for MergeTree)
|
||||
/// That way it can be used to compare with the SystemLog::getCreateTableQuery()
|
||||
static ASTPtr getCreateTableQueryClean(const StorageID & table_id, ContextPtr context);
|
||||
|
||||
protected:
|
||||
ThreadFromGlobalPool saving_thread;
|
||||
|
||||
/// Data shared between callers of add()/flush()/shutdown(), and the saving thread
|
||||
std::mutex mutex;
|
||||
|
||||
bool is_shutdown = false;
|
||||
std::condition_variable flush_event;
|
||||
|
||||
void stopFlushThread();
|
||||
};
|
||||
|
||||
|
||||
/// System logs should be destroyed in destructor of the last Context and before tables,
|
||||
/// because SystemLog destruction makes insert query while flushing data into underlying tables
|
||||
struct SystemLogs
|
||||
@ -136,10 +76,11 @@ struct SystemLogs
|
||||
|
||||
|
||||
template <typename LogElement>
|
||||
class SystemLog : public ISystemLog, private boost::noncopyable, WithContext
|
||||
class SystemLog : public SystemLogBase<LogElement>, private boost::noncopyable, WithContext
|
||||
{
|
||||
public:
|
||||
using Self = SystemLog;
|
||||
using Base = SystemLogBase<LogElement>;
|
||||
|
||||
/** Parameter: table name where to write log.
|
||||
* If the table does not exist, it is created with the specified engine.
|
||||
@ -156,27 +97,23 @@ public:
|
||||
const String & storage_def_,
|
||||
size_t flush_interval_milliseconds_);
|
||||
|
||||
/** Append a record into log.
|
||||
* Writing to the table is done asynchronously, and in case of failure the record could be lost.
|
||||
*/
|
||||
void add(const LogElement & element);
|
||||
|
||||
void shutdown() override;
|
||||
|
||||
/// Flush data in the buffer to disk
|
||||
void flush(bool force) override;
|
||||
|
||||
String getName() override
|
||||
{
|
||||
return LogElement::name();
|
||||
}
|
||||
|
||||
ASTPtr getCreateTableQuery();
|
||||
|
||||
protected:
|
||||
Poco::Logger * log;
|
||||
using ISystemLog::mutex;
|
||||
using ISystemLog::is_shutdown;
|
||||
using ISystemLog::flush_event;
|
||||
using ISystemLog::stopFlushThread;
|
||||
using Base::log;
|
||||
using Base::queue;
|
||||
using Base::queue_front_index;
|
||||
using Base::is_force_prepare_tables;
|
||||
using Base::requested_flush_up_to;
|
||||
using Base::flushed_up_to;
|
||||
using Base::logged_queue_full_at_index;
|
||||
|
||||
private:
|
||||
|
||||
/* Saving thread data */
|
||||
const StorageID table_id;
|
||||
const String storage_def;
|
||||
@ -185,32 +122,17 @@ private:
|
||||
bool is_prepared = false;
|
||||
const size_t flush_interval_milliseconds;
|
||||
|
||||
// Queue is bounded. But its size is quite large to not block in all normal cases.
|
||||
std::vector<LogElement> queue;
|
||||
// An always-incrementing index of the first message currently in the queue.
|
||||
// We use it to give a global sequential index to every message, so that we
|
||||
// can wait until a particular message is flushed. This is used to implement
|
||||
// synchronous log flushing for SYSTEM FLUSH LOGS.
|
||||
uint64_t queue_front_index = 0;
|
||||
// A flag that says we must create the tables even if the queue is empty.
|
||||
bool is_force_prepare_tables = false;
|
||||
// Requested to flush logs up to this index, exclusive
|
||||
uint64_t requested_flush_up_to = 0;
|
||||
// Flushed log up to this index, exclusive
|
||||
uint64_t flushed_up_to = 0;
|
||||
// Logged overflow message at this queue front index
|
||||
uint64_t logged_queue_full_at_index = -1;
|
||||
|
||||
void savingThreadFunction() override;
|
||||
|
||||
/** Creates new table if it does not exist.
|
||||
* Renames old table if its structure is not suitable.
|
||||
* This cannot be done in constructor to avoid deadlock while renaming a table under locked Context when SystemLog object is created.
|
||||
*/
|
||||
void prepareTable() override;
|
||||
|
||||
void savingThreadFunction() override;
|
||||
|
||||
/// flushImpl can be executed only in saving_thread.
|
||||
void flushImpl(const std::vector<LogElement> & to_flush, uint64_t to_flush_end);
|
||||
ASTPtr getCreateTableQuery();
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -792,6 +792,39 @@ void markTupleLiteralsAsLegacy(ASTPtr & query)
|
||||
MarkTupleLiteralsAsLegacyVisitor(data).visit(query);
|
||||
}
|
||||
|
||||
/// Rewrite _shard_num -> shardNum() AS _shard_num
struct RewriteShardNum
{
struct Data
{
};

static bool needChildVisit(const ASTPtr & parent, const ASTPtr & /*child*/)
{
/// ON section should not be rewritten.
return typeid_cast<ASTTableJoin *>(parent.get()) == nullptr;
}

static void visit(ASTPtr & ast, Data &)
{
if (auto * identifier = typeid_cast<ASTIdentifier *>(ast.get()))
visit(*identifier, ast);
}

static void visit(ASTIdentifier & identifier, ASTPtr & ast)
{
if (identifier.shortName() != "_shard_num")
return;

String alias = identifier.tryGetAlias();
if (alias.empty())
alias = "_shard_num";
ast = makeASTFunction("shardNum");
ast->setAlias(alias);
}
};
using RewriteShardNumVisitor = InDepthNodeVisitor<RewriteShardNum, true>;

}
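Net effect of the visitor above: a bare `_shard_num` identifier becomes `shardNum()` with the original name kept as an alias, i.e. `SELECT _shard_num FROM dist` is treated as `SELECT shardNum() AS _shard_num FROM dist`. A toy, AST-free sketch of the same alias-preserving substitution (hypothetical types, illustration only):

#include <iostream>
#include <string>

/// Toy stand-in for the AST node: an expression plus an optional alias.
struct ToyColumnExpr
{
    std::string expression;
    std::string alias;
};

void rewriteShardNum(ToyColumnExpr & expr)
{
    if (expr.expression != "_shard_num")
        return;
    if (expr.alias.empty())
        expr.alias = "_shard_num";       /// keep the user-visible column name
    expr.expression = "shardNum()";
}

int main()
{
    ToyColumnExpr e{"_shard_num", ""};
    rewriteShardNum(e);
    std::cout << e.expression << " AS " << e.alias << '\n';   /// shardNum() AS _shard_num
}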
|
||||
|
||||
TreeRewriterResult::TreeRewriterResult(
|
||||
@ -962,6 +995,7 @@ void TreeRewriterResult::collectUsedColumns(const ASTPtr & query, bool is_select
|
||||
++it;
|
||||
}
|
||||
|
||||
has_virtual_shard_num = false;
|
||||
/// If there are virtual columns among the unknown columns. Remove them from the list of unknown and add
|
||||
/// in columns list, so that when further processing they are also considered.
|
||||
if (storage)
|
||||
@ -978,6 +1012,18 @@ void TreeRewriterResult::collectUsedColumns(const ASTPtr & query, bool is_select
|
||||
else
|
||||
++it;
|
||||
}
|
||||
|
||||
if (is_remote_storage)
|
||||
{
|
||||
for (const auto & name_type : storage_virtuals)
|
||||
{
|
||||
if (name_type.name == "_shard_num" && storage->isVirtualColumn("_shard_num", metadata_snapshot))
|
||||
{
|
||||
has_virtual_shard_num = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!unknown_required_source_columns.empty())
|
||||
@ -1165,6 +1211,13 @@ TreeRewriterResultPtr TreeRewriter::analyzeSelect(
|
||||
}
|
||||
}
|
||||
|
||||
/// Rewrite _shard_num to shardNum()
|
||||
if (result.has_virtual_shard_num)
|
||||
{
|
||||
RewriteShardNumVisitor::Data data_rewrite_shard_num;
|
||||
RewriteShardNumVisitor(data_rewrite_shard_num).visit(query);
|
||||
}
|
||||
|
||||
result.ast_join = select_query->join();
|
||||
|
||||
if (result.optimize_trivial_count)
|
||||
|
@ -70,6 +70,9 @@ struct TreeRewriterResult
|
||||
/// Cache isRemote() call for storage, because it may be too heavy.
|
||||
bool is_remote_storage = false;
|
||||
|
||||
/// Rewrite _shard_num to shardNum()
|
||||
bool has_virtual_shard_num = false;
|
||||
|
||||
/// Results of scalar sub queries
|
||||
Scalars scalars;
|
||||
|
||||
|
@ -61,6 +61,7 @@
|
||||
#include <Processors/Sources/WaitForAsyncInsertSource.h>
|
||||
|
||||
#include <base/EnumReflection.h>
|
||||
#include <base/demangle.h>
|
||||
|
||||
#include <random>
|
||||
|
||||
@ -659,7 +660,7 @@ static std::tuple<ASTPtr, BlockIO> executeQueryImpl(
|
||||
if (context->query_trace_context.trace_id != UUID())
|
||||
{
|
||||
auto * raw_interpreter_ptr = interpreter.get();
|
||||
std::string class_name(abi::__cxa_demangle(typeid(*raw_interpreter_ptr).name(), nullptr, nullptr, nullptr));
|
||||
std::string class_name(demangle(typeid(*raw_interpreter_ptr).name()));
|
||||
span = std::make_unique<OpenTelemetrySpanHolder>(class_name + "::execute()");
|
||||
}
|
||||
res = interpreter->execute();
|
||||
|
@ -32,6 +32,15 @@ void ASTFunction::appendColumnNameImpl(WriteBuffer & ostr) const
|
||||
if (name == "view")
|
||||
throw Exception("Table function view cannot be used as an expression", ErrorCodes::UNEXPECTED_EXPRESSION);
|
||||
|
||||
/// If function can be converted to literal it will be parsed as literal after formatting.
|
||||
/// In a distributed query it may lead to mismatched column names.
|
||||
/// To avoid it we check whether we can convert function to literal.
|
||||
if (auto literal = toLiteral())
|
||||
{
|
||||
literal->appendColumnName(ostr);
|
||||
return;
|
||||
}
|
||||
|
||||
writeString(name, ostr);
|
||||
|
||||
if (parameters)
|
||||
@ -111,31 +120,42 @@ void ASTFunction::updateTreeHashImpl(SipHash & hash_state) const
|
||||
IAST::updateTreeHashImpl(hash_state);
|
||||
}
|
||||
|
||||
template <typename Container>
|
||||
static ASTPtr createLiteral(const ASTs & arguments)
|
||||
{
|
||||
Container container;
|
||||
|
||||
for (const auto & arg : arguments)
|
||||
{
|
||||
if (const auto * literal = arg->as<ASTLiteral>())
|
||||
{
|
||||
container.push_back(literal->value);
|
||||
}
|
||||
else if (auto * func = arg->as<ASTFunction>())
|
||||
{
|
||||
if (auto func_literal = func->toLiteral())
|
||||
container.push_back(func_literal->as<ASTLiteral>()->value);
|
||||
else
|
||||
return {};
|
||||
}
|
||||
else
|
||||
/// One of the Array or Tuple arguments is not a literal
|
||||
return {};
|
||||
}
|
||||
|
||||
return std::make_shared<ASTLiteral>(container);
|
||||
}
|
||||
|
||||
ASTPtr ASTFunction::toLiteral() const
|
||||
{
|
||||
if (!arguments) return {};
|
||||
if (!arguments)
|
||||
return {};
|
||||
|
||||
if (name == "array")
|
||||
{
|
||||
Array array;
|
||||
return createLiteral<Array>(arguments->children);
|
||||
|
||||
for (const auto & arg : arguments->children)
|
||||
{
|
||||
if (auto * literal = arg->as<ASTLiteral>())
|
||||
array.push_back(literal->value);
|
||||
else if (auto * func = arg->as<ASTFunction>())
|
||||
{
|
||||
if (auto func_literal = func->toLiteral())
|
||||
array.push_back(func_literal->as<ASTLiteral>()->value);
|
||||
}
|
||||
else
|
||||
/// One of the Array arguments is not a literal
|
||||
return {};
|
||||
}
|
||||
|
||||
return std::make_shared<ASTLiteral>(array);
|
||||
}
|
||||
if (name == "tuple")
|
||||
return createLiteral<Tuple>(arguments->children);
|
||||
|
||||
return {};
|
||||
}
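The helper above folds an array(...) or tuple(...) call into a single literal only if every argument is itself a literal or can recursively be folded into one. A toy version of that recursion over a plain expression tree (hypothetical types, not ClickHouse ASTs):

#include <iostream>
#include <memory>
#include <optional>
#include <vector>

/// Toy expression node: either a literal int or a function call over children.
struct Expr
{
    std::optional<int> literal;
    std::vector<std::shared_ptr<Expr>> args;
};

/// Returns the collected literal values if every node is (recursively) a literal.
std::optional<std::vector<int>> toLiteral(const Expr & e)
{
    if (e.literal)
        return std::vector<int>{*e.literal};
    std::vector<int> out;
    for (const auto & arg : e.args)
    {
        auto nested = toLiteral(*arg);
        if (!nested)
            return std::nullopt;          /// one of the arguments is not a literal
        out.insert(out.end(), nested->begin(), nested->end());
    }
    return out;
}

int main()
{
    Expr leaf1{1, {}}, leaf2{2, {}};
    Expr arr{std::nullopt, {std::make_shared<Expr>(leaf1), std::make_shared<Expr>(leaf2)}};
    std::cout << toLiteral(arr)->size() << '\n';   /// 2: both arguments were literals
}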
|
||||
|
@ -217,7 +217,6 @@ public:
|
||||
/// Extract data from the backup and put it to the storage.
|
||||
virtual RestoreDataTasks restoreFromBackup(const BackupPtr & backup, const String & data_path_in_backup, const ASTs & partitions, ContextMutablePtr context);
|
||||
|
||||
protected:
|
||||
/// Returns whether the column is virtual - by default all columns are real.
|
||||
/// Initially reserved virtual column name may be shadowed by real column.
|
||||
bool isVirtualColumn(const String & column_name, const StorageMetadataPtr & metadata_snapshot) const;
|
||||
|
@ -1,6 +1,7 @@
|
||||
#include "IMergeTreeDataPart.h"
|
||||
|
||||
#include <optional>
|
||||
#include <string_view>
|
||||
#include <Core/Defines.h>
|
||||
#include <IO/HashingWriteBuffer.h>
|
||||
#include <IO/ReadBufferFromString.h>
|
||||
@ -1630,13 +1631,21 @@ UInt32 IMergeTreeDataPart::getNumberOfRefereneces() const
|
||||
}
|
||||
|
||||
|
||||
String IMergeTreeDataPart::getZeroLevelPartBlockID() const
|
||||
String IMergeTreeDataPart::getZeroLevelPartBlockID(std::string_view token) const
|
||||
{
|
||||
if (info.level != 0)
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Trying to get block id for non zero level part {}", name);
|
||||
|
||||
SipHash hash;
|
||||
checksums.computeTotalChecksumDataOnly(hash);
|
||||
if (token.empty())
|
||||
{
|
||||
checksums.computeTotalChecksumDataOnly(hash);
|
||||
}
|
||||
else
|
||||
{
|
||||
hash.update(token.data(), token.size());
|
||||
}
|
||||
|
||||
union
|
||||
{
|
||||
char bytes[16];
|
||||
|
@ -177,7 +177,8 @@ public:
|
||||
bool isEmpty() const { return rows_count == 0; }
|
||||
|
||||
/// Compute part block id for zero level part. Otherwise throws an exception.
|
||||
String getZeroLevelPartBlockID() const;
|
||||
/// If token is not empty, block id is calculated based on it instead of block data
|
||||
String getZeroLevelPartBlockID(std::string_view token) const;
|
||||
|
||||
const MergeTreeData & storage;
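The declaration above means the zero-level block id is now derived either from the part's data checksums (the default, token empty) or from the caller-supplied deduplication token alone. A simplified standalone sketch of that choice, with std::hash standing in for the real SipHash-based id (hypothetical helper, illustration only):

#include <functional>
#include <iostream>
#include <string>
#include <string_view>

/// Simplified: hash either the part data or the explicit token, never both.
std::string zeroLevelBlockId(std::string_view partition_id, std::string_view part_data, std::string_view token)
{
    const std::string_view source = token.empty() ? part_data : token;
    const size_t h = std::hash<std::string_view>{}(source);
    return std::string(partition_id) + "_" + std::to_string(h);
}

int main()
{
    /// With the same token, a retried INSERT produces the same block id and is deduplicated.
    std::cout << zeroLevelBlockId("202201", "rows...", "") << '\n';
    std::cout << zeroLevelBlockId("202201", "rows...", "user_token_0") << '\n';
}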
|
||||
|
||||
|
@ -2442,7 +2442,12 @@ MergeTreeData::DataPartsVector MergeTreeData::getActivePartsToReplace(
|
||||
}
|
||||
|
||||
|
||||
bool MergeTreeData::renameTempPartAndAdd(MutableDataPartPtr & part, SimpleIncrement * increment, Transaction * out_transaction, MergeTreeDeduplicationLog * deduplication_log)
|
||||
bool MergeTreeData::renameTempPartAndAdd(
|
||||
MutableDataPartPtr & part,
|
||||
SimpleIncrement * increment,
|
||||
Transaction * out_transaction,
|
||||
MergeTreeDeduplicationLog * deduplication_log,
|
||||
std::string_view deduplication_token)
|
||||
{
|
||||
if (out_transaction && &out_transaction->data != this)
|
||||
throw Exception("MergeTreeData::Transaction for one table cannot be used with another. It is a bug.",
|
||||
@ -2451,7 +2456,7 @@ bool MergeTreeData::renameTempPartAndAdd(MutableDataPartPtr & part, SimpleIncrem
|
||||
DataPartsVector covered_parts;
|
||||
{
|
||||
auto lock = lockParts();
|
||||
if (!renameTempPartAndReplace(part, increment, out_transaction, lock, &covered_parts, deduplication_log))
|
||||
if (!renameTempPartAndReplace(part, increment, out_transaction, lock, &covered_parts, deduplication_log, deduplication_token))
|
||||
return false;
|
||||
}
|
||||
if (!covered_parts.empty())
|
||||
@ -2463,8 +2468,13 @@ bool MergeTreeData::renameTempPartAndAdd(MutableDataPartPtr & part, SimpleIncrem
|
||||
|
||||
|
||||
bool MergeTreeData::renameTempPartAndReplace(
|
||||
MutableDataPartPtr & part, SimpleIncrement * increment, Transaction * out_transaction,
|
||||
std::unique_lock<std::mutex> & lock, DataPartsVector * out_covered_parts, MergeTreeDeduplicationLog * deduplication_log)
|
||||
MutableDataPartPtr & part,
|
||||
SimpleIncrement * increment,
|
||||
Transaction * out_transaction,
|
||||
std::unique_lock<std::mutex> & lock,
|
||||
DataPartsVector * out_covered_parts,
|
||||
MergeTreeDeduplicationLog * deduplication_log,
|
||||
std::string_view deduplication_token)
|
||||
{
|
||||
if (out_transaction && &out_transaction->data != this)
|
||||
throw Exception("MergeTreeData::Transaction for one table cannot be used with another. It is a bug.",
|
||||
@ -2526,7 +2536,7 @@ bool MergeTreeData::renameTempPartAndReplace(
|
||||
/// deduplication.
|
||||
if (deduplication_log)
|
||||
{
|
||||
String block_id = part->getZeroLevelPartBlockID();
|
||||
String block_id = part->getZeroLevelPartBlockID(deduplication_token);
|
||||
auto res = deduplication_log->addPart(block_id, part_info);
|
||||
if (!res.second)
|
||||
{
|
||||
|
@ -492,7 +492,12 @@ public:
|
||||
/// active set later with out_transaction->commit()).
|
||||
/// Else, commits the part immediately.
|
||||
/// Returns true if part was added. Returns false if part is covered by bigger part.
|
||||
bool renameTempPartAndAdd(MutableDataPartPtr & part, SimpleIncrement * increment = nullptr, Transaction * out_transaction = nullptr, MergeTreeDeduplicationLog * deduplication_log = nullptr);
|
||||
bool renameTempPartAndAdd(
|
||||
MutableDataPartPtr & part,
|
||||
SimpleIncrement * increment = nullptr,
|
||||
Transaction * out_transaction = nullptr,
|
||||
MergeTreeDeduplicationLog * deduplication_log = nullptr,
|
||||
std::string_view deduplication_token = std::string_view());
|
||||
|
||||
/// The same as renameTempPartAndAdd but the block range of the part can contain existing parts.
|
||||
/// Returns all parts covered by the added part (in ascending order).
|
||||
@ -502,9 +507,13 @@ public:
|
||||
|
||||
/// Low-level version of previous one, doesn't lock mutex
|
||||
bool renameTempPartAndReplace(
|
||||
MutableDataPartPtr & part, SimpleIncrement * increment, Transaction * out_transaction, DataPartsLock & lock,
|
||||
DataPartsVector * out_covered_parts = nullptr, MergeTreeDeduplicationLog * deduplication_log = nullptr);
|
||||
|
||||
MutableDataPartPtr & part,
|
||||
SimpleIncrement * increment,
|
||||
Transaction * out_transaction,
|
||||
DataPartsLock & lock,
|
||||
DataPartsVector * out_covered_parts = nullptr,
|
||||
MergeTreeDeduplicationLog * deduplication_log = nullptr,
|
||||
std::string_view deduplication_token = std::string_view());
|
||||
|
||||
/// Remove parts from working set immediately (without wait for background
|
||||
/// process). Transfer part state to temporary. Have very limited usage only
|
||||
|
@ -18,6 +18,7 @@ void MergeTreeSink::onStart()
void MergeTreeSink::consume(Chunk chunk)
{
auto block = getHeader().cloneWithColumns(chunk.detachColumns());
String block_dedup_token;

auto part_blocks = storage.writer.splitBlockIntoParts(block, max_parts_per_block, metadata_snapshot, context);
for (auto & current_block : part_blocks)
@ -31,8 +32,20 @@ void MergeTreeSink::consume(Chunk chunk)
if (!part)
continue;

if (storage.getDeduplicationLog())
{
const String & dedup_token = context->getSettingsRef().insert_deduplication_token;
if (!dedup_token.empty())
{
/// multiple blocks can be inserted within the same insert query
/// an ordinal number is added to dedup token to generate a distinctive block id for each block
block_dedup_token = fmt::format("{}_{}", dedup_token, chunk_dedup_seqnum);
++chunk_dedup_seqnum;
}
}

/// Part can be deduplicated, so increment counters and add to part log only if it's really added
if (storage.renameTempPartAndAdd(part, &storage.increment, nullptr, storage.getDeduplicationLog()))
if (storage.renameTempPartAndAdd(part, &storage.increment, nullptr, storage.getDeduplicationLog(), block_dedup_token))
{
PartLog::addNewPart(storage.getContext(), part, watch.elapsed());
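A small illustration of the token scheme in this hunk: every block split out of one INSERT gets the user-supplied token plus a running ordinal, so the blocks cannot collide on a single deduplication id (sketch only; the fmt call mirrors the patch).

#include <cstdint>
#include <iostream>
#include <string>
#include <fmt/format.h>

int main()
{
    const std::string dedup_token = "my_insert";   /// example value of insert_deduplication_token
    uint64_t chunk_dedup_seqnum = 0;

    for (int block = 0; block < 3; ++block)
    {
        std::string block_dedup_token = fmt::format("{}_{}", dedup_token, chunk_dedup_seqnum);
        ++chunk_dedup_seqnum;
        std::cout << block_dedup_token << '\n';    /// my_insert_0, my_insert_1, my_insert_2
    }
}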
|
||||
|
||||
|
@ -36,6 +36,7 @@ private:
|
||||
StorageMetadataPtr metadata_snapshot;
|
||||
size_t max_parts_per_block;
|
||||
ContextPtr context;
|
||||
uint64_t chunk_dedup_seqnum = 0; /// input chunk ordinal number in case of dedup token
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -1123,7 +1123,7 @@ bool ReplicatedMergeTreeQueue::addFuturePartIfNotCoveredByThem(const String & pa
|
||||
|
||||
if (isNotCoveredByFuturePartsImpl(entry, part_name, reject_reason, lock))
|
||||
{
|
||||
CurrentlyExecuting::setActualPartName(entry, part_name, *this);
|
||||
CurrentlyExecuting::setActualPartName(entry, part_name, *this, lock);
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -1375,7 +1375,8 @@ Int64 ReplicatedMergeTreeQueue::getCurrentMutationVersion(const String & partiti
|
||||
}
|
||||
|
||||
|
||||
ReplicatedMergeTreeQueue::CurrentlyExecuting::CurrentlyExecuting(const ReplicatedMergeTreeQueue::LogEntryPtr & entry_, ReplicatedMergeTreeQueue & queue_)
|
||||
ReplicatedMergeTreeQueue::CurrentlyExecuting::CurrentlyExecuting(
|
||||
const ReplicatedMergeTreeQueue::LogEntryPtr & entry_, ReplicatedMergeTreeQueue & queue_, std::lock_guard<std::mutex> & /* state_lock */)
|
||||
: entry(entry_), queue(queue_)
|
||||
{
|
||||
if (entry->type == ReplicatedMergeTreeLogEntry::DROP_RANGE || entry->type == ReplicatedMergeTreeLogEntry::REPLACE_RANGE)
|
||||
@ -1397,8 +1398,11 @@ ReplicatedMergeTreeQueue::CurrentlyExecuting::CurrentlyExecuting(const Replicate
|
||||
}
|
||||
|
||||
|
||||
void ReplicatedMergeTreeQueue::CurrentlyExecuting::setActualPartName(ReplicatedMergeTreeQueue::LogEntry & entry,
|
||||
const String & actual_part_name, ReplicatedMergeTreeQueue & queue)
|
||||
void ReplicatedMergeTreeQueue::CurrentlyExecuting::setActualPartName(
|
||||
ReplicatedMergeTreeQueue::LogEntry & entry,
|
||||
const String & actual_part_name,
|
||||
ReplicatedMergeTreeQueue & queue,
|
||||
std::lock_guard<std::mutex> & /* state_lock */)
|
||||
{
|
||||
if (!entry.actual_new_part_name.empty())
|
||||
throw Exception("Entry actual part isn't empty yet. This is a bug.", ErrorCodes::LOGICAL_ERROR);
|
||||
@ -1477,7 +1481,7 @@ ReplicatedMergeTreeQueue::SelectedEntryPtr ReplicatedMergeTreeQueue::selectEntry
|
||||
}
|
||||
|
||||
if (entry)
|
||||
return std::make_shared<SelectedEntry>(entry, std::unique_ptr<CurrentlyExecuting>{ new CurrentlyExecuting(entry, *this) });
|
||||
return std::make_shared<SelectedEntry>(entry, std::unique_ptr<CurrentlyExecuting>{new CurrentlyExecuting(entry, *this, lock)});
|
||||
else
|
||||
return {};
|
||||
}
|
||||
|
@ -251,11 +251,18 @@ private:
|
||||
friend class ReplicatedMergeTreeQueue;
|
||||
|
||||
/// Created only in the selectEntryToProcess function. It is called under mutex.
|
||||
CurrentlyExecuting(const ReplicatedMergeTreeQueue::LogEntryPtr & entry_, ReplicatedMergeTreeQueue & queue_);
|
||||
CurrentlyExecuting(
|
||||
const ReplicatedMergeTreeQueue::LogEntryPtr & entry_,
|
||||
ReplicatedMergeTreeQueue & queue_,
|
||||
std::lock_guard<std::mutex> & state_lock);
|
||||
|
||||
/// In case of fetch, we determine actual part during the execution, so we need to update entry. It is called under state_mutex.
|
||||
static void setActualPartName(ReplicatedMergeTreeQueue::LogEntry & entry, const String & actual_part_name,
|
||||
ReplicatedMergeTreeQueue & queue);
|
||||
static void setActualPartName(
|
||||
ReplicatedMergeTreeQueue::LogEntry & entry,
|
||||
const String & actual_part_name,
|
||||
ReplicatedMergeTreeQueue & queue,
|
||||
std::lock_guard<std::mutex> & state_lock);
|
||||
|
||||
public:
|
||||
~CurrentlyExecuting();
|
||||
};
|
||||
|
@ -160,8 +160,16 @@ void ReplicatedMergeTreeSink::consume(Chunk chunk)
|
||||
{
|
||||
/// We add the hash from the data and partition identifier to deduplication ID.
|
||||
/// That is, do not insert the same data to the same partition twice.
|
||||
block_id = part->getZeroLevelPartBlockID();
|
||||
|
||||
String block_dedup_token = context->getSettingsRef().insert_deduplication_token;
|
||||
if (!block_dedup_token.empty())
|
||||
{
|
||||
/// multiple blocks can be inserted within the same insert query
|
||||
/// an ordinal number is added to dedup token to generate a distinctive block id for each block
|
||||
block_dedup_token += fmt::format("_{}", chunk_dedup_seqnum);
|
||||
++chunk_dedup_seqnum;
|
||||
}
|
||||
block_id = part->getZeroLevelPartBlockID(block_dedup_token);
|
||||
LOG_DEBUG(log, "Wrote block with ID '{}', {} rows", block_id, current_block.block.rows());
|
||||
}
|
||||
else
|
||||
|
@ -82,13 +82,14 @@ private:
|
||||
|
||||
bool is_attach = false;
|
||||
bool quorum_parallel = false;
|
||||
bool deduplicate = true;
|
||||
const bool deduplicate = true;
|
||||
bool last_block_is_duplicate = false;
|
||||
|
||||
using Logger = Poco::Logger;
|
||||
Poco::Logger * log;
|
||||
|
||||
ContextPtr context;
|
||||
UInt64 chunk_dedup_seqnum = 0; /// input chunk ordinal number in case of dedup token
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -308,7 +308,7 @@ NamesAndTypesList StorageDistributed::getVirtuals() const
|
||||
NameAndTypePair("_part_uuid", std::make_shared<DataTypeUUID>()),
|
||||
NameAndTypePair("_partition_id", std::make_shared<DataTypeString>()),
|
||||
NameAndTypePair("_sample_factor", std::make_shared<DataTypeFloat64>()),
|
||||
NameAndTypePair("_shard_num", std::make_shared<DataTypeUInt32>()),
|
||||
NameAndTypePair("_shard_num", std::make_shared<DataTypeUInt32>()), /// deprecated
|
||||
};
|
||||
}
|
||||
|
||||
@ -605,8 +605,8 @@ Pipe StorageDistributed::read(
|
||||
|
||||
void StorageDistributed::read(
|
||||
QueryPlan & query_plan,
|
||||
const Names & column_names,
|
||||
const StorageMetadataPtr & metadata_snapshot,
|
||||
const Names &,
|
||||
const StorageMetadataPtr &,
|
||||
SelectQueryInfo & query_info,
|
||||
ContextPtr local_context,
|
||||
QueryProcessingStage::Enum processed_stage,
|
||||
@ -635,10 +635,6 @@ void StorageDistributed::read(
|
||||
return;
|
||||
}
|
||||
|
||||
bool has_virtual_shard_num_column = std::find(column_names.begin(), column_names.end(), "_shard_num") != column_names.end();
|
||||
if (has_virtual_shard_num_column && !isVirtualColumn("_shard_num", metadata_snapshot))
|
||||
has_virtual_shard_num_column = false;
|
||||
|
||||
StorageID main_table = StorageID::createEmpty();
|
||||
if (!remote_table_function_ptr)
|
||||
main_table = StorageID{remote_database, remote_table};
|
||||
@ -646,8 +642,7 @@ void StorageDistributed::read(
|
||||
ClusterProxy::SelectStreamFactory select_stream_factory =
|
||||
ClusterProxy::SelectStreamFactory(
|
||||
header,
|
||||
processed_stage,
|
||||
has_virtual_shard_num_column);
|
||||
processed_stage);
|
||||
|
||||
ClusterProxy::executeQuery(
|
||||
query_plan, header, processed_stage,
|
||||
|
@ -218,8 +218,33 @@ Strings StorageFile::getPathsList(const String & table_path, const String & user
|
||||
return paths;
|
||||
}
|
||||
|
||||
ColumnsDescription StorageFile::getTableStructureFromFileDescriptor(ContextPtr context)
|
||||
{
|
||||
/// If we want to read schema from file descriptor we should create
|
||||
/// a read buffer from fd, create a checkpoint, read some data required
|
||||
/// for schema inference, rollback to checkpoint and then use the created
|
||||
/// peekable read buffer on the first read from storage. It's needed because
|
||||
/// in case of file descriptor we have a stream of data and we cannot
|
||||
/// start reading data from the beginning after reading some data for
|
||||
/// schema inference.
|
||||
auto read_buffer_creator = [&]()
|
||||
{
|
||||
/// We will use PeekableReadBuffer to create a checkpoint, so we need a place
|
||||
/// where we can store the original read buffer.
|
||||
read_buffer_from_fd = createReadBuffer("", true, getName(), table_fd, compression_method, context);
|
||||
auto read_buf = std::make_unique<PeekableReadBuffer>(*read_buffer_from_fd);
|
||||
read_buf->setCheckpoint();
|
||||
return read_buf;
|
||||
};
|
||||
|
||||
ColumnsDescription StorageFile::getTableStructureFromData(
|
||||
auto columns = readSchemaFromFormat(format_name, format_settings, read_buffer_creator, context, peekable_read_buffer_from_fd);
|
||||
if (peekable_read_buffer_from_fd)
|
||||
/// If we have created read buffer in readSchemaFromFormat we should rollback to checkpoint.
|
||||
assert_cast<PeekableReadBuffer *>(peekable_read_buffer_from_fd.get())->rollbackToCheckpoint();
|
||||
return columns;
|
||||
}
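The idea behind the checkpoint/rollback dance above in miniature: read ahead from a non-seekable source to infer the schema, then rewind so the first real read still starts at the beginning. A stripped-down sketch with an in-memory buffer standing in for the fd-backed PeekableReadBuffer (hypothetical class, illustration only):

#include <cstddef>
#include <iostream>
#include <string>

/// Minimal stand-in for a peekable buffer: remember a checkpoint, read ahead, roll back.
class ToyPeekableBuffer
{
public:
    explicit ToyPeekableBuffer(std::string data_) : data(std::move(data_)) {}

    void setCheckpoint() { checkpoint = pos; }
    void rollbackToCheckpoint() { pos = checkpoint; }

    std::string read(size_t n)
    {
        std::string chunk = data.substr(pos, n);
        pos += chunk.size();
        return chunk;
    }

private:
    std::string data;
    size_t pos = 0;
    size_t checkpoint = 0;
};

int main()
{
    ToyPeekableBuffer buf("id\tname\n1\tAlice\n");
    buf.setCheckpoint();
    std::cout << "peeked for schema inference: " << buf.read(8) << '\n';
    buf.rollbackToCheckpoint();          /// the first real read starts from the beginning again
    std::cout << "actual read: " << buf.read(8) << '\n';
}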
|
||||
|
||||
ColumnsDescription StorageFile::getTableStructureFromFile(
|
||||
const String & format,
|
||||
const std::vector<String> & paths,
|
||||
const String & compression_method,
|
||||
@ -272,8 +297,6 @@ StorageFile::StorageFile(int table_fd_, CommonArguments args)
|
||||
throw Exception("Using file descriptor as source of storage isn't allowed for server daemons", ErrorCodes::DATABASE_ACCESS_DENIED);
|
||||
if (args.format_name == "Distributed")
|
||||
throw Exception("Distributed format is allowed only with explicit file path", ErrorCodes::INCORRECT_FILE_NAME);
|
||||
if (args.columns.empty())
|
||||
throw Exception("Automatic schema inference is not allowed when using file descriptor as source of storage", ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE);
|
||||
|
||||
is_db_table = false;
|
||||
use_table_fd = true;
|
||||
@ -323,9 +346,15 @@ void StorageFile::setStorageMetadata(CommonArguments args)
|
||||
|
||||
if (args.format_name == "Distributed" || args.columns.empty())
|
||||
{
|
||||
auto columns = getTableStructureFromData(format_name, paths, compression_method, format_settings, args.getContext());
|
||||
if (!args.columns.empty() && args.columns != columns)
|
||||
throw Exception("Table structure and file structure are different", ErrorCodes::INCOMPATIBLE_COLUMNS);
|
||||
ColumnsDescription columns;
|
||||
if (use_table_fd)
|
||||
columns = getTableStructureFromFileDescriptor(args.getContext());
|
||||
else
|
||||
{
|
||||
columns = getTableStructureFromFile(format_name, paths, compression_method, format_settings, args.getContext());
|
||||
if (!args.columns.empty() && args.columns != columns)
|
||||
throw Exception("Table structure and file structure are different", ErrorCodes::INCOMPATIBLE_COLUMNS);
|
||||
}
|
||||
storage_metadata.setColumns(columns);
|
||||
}
|
||||
else
|
||||
@ -397,11 +426,13 @@ public:
|
||||
ContextPtr context_,
|
||||
UInt64 max_block_size_,
|
||||
FilesInfoPtr files_info_,
|
||||
ColumnsDescription columns_description_)
|
||||
ColumnsDescription columns_description_,
|
||||
std::unique_ptr<ReadBuffer> read_buf_)
|
||||
: SourceWithProgress(getBlockForSource(storage_, metadata_snapshot_, columns_description_, files_info_))
|
||||
, storage(std::move(storage_))
|
||||
, metadata_snapshot(metadata_snapshot_)
|
||||
, files_info(std::move(files_info_))
|
||||
, read_buf(std::move(read_buf_))
|
||||
, columns_description(std::move(columns_description_))
|
||||
, context(context_)
|
||||
, max_block_size(max_block_size_)
|
||||
@ -443,7 +474,8 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
read_buf = createReadBuffer(current_path, storage->use_table_fd, storage->getName(), storage->table_fd, storage->compression_method, context);
|
||||
if (!read_buf)
|
||||
read_buf = createReadBuffer(current_path, storage->use_table_fd, storage->getName(), storage->table_fd, storage->compression_method, context);
|
||||
|
||||
auto get_block_for_format = [&]() -> Block
|
||||
{
|
||||
@ -589,7 +621,7 @@ Pipe StorageFile::read(
|
||||
};
|
||||
|
||||
pipes.emplace_back(std::make_shared<StorageFileSource>(
|
||||
this_ptr, metadata_snapshot, context, max_block_size, files_info, get_columns_for_format()));
|
||||
this_ptr, metadata_snapshot, context, max_block_size, files_info, get_columns_for_format(), std::move(peekable_read_buffer_from_fd)));
|
||||
}
|
||||
|
||||
return Pipe::unitePipes(std::move(pipes));
|
||||
|
@ -71,7 +71,9 @@ public:
|
||||
|
||||
bool supportsPartitionBy() const override { return true; }
|
||||
|
||||
static ColumnsDescription getTableStructureFromData(
|
||||
ColumnsDescription getTableStructureFromFileDescriptor(ContextPtr context);
|
||||
|
||||
static ColumnsDescription getTableStructureFromFile(
|
||||
const String & format,
|
||||
const std::vector<String> & paths,
|
||||
const String & compression_method,
|
||||
@ -122,6 +124,11 @@ private:
|
||||
String path_for_partitioned_write;
|
||||
|
||||
bool is_path_with_globs = false;
|
||||
|
||||
/// These buffers are needed for schema inference when data source
|
||||
/// is file descriptor. See getTableStructureFromFileDescriptor.
|
||||
std::unique_ptr<ReadBuffer> read_buffer_from_fd;
|
||||
std::unique_ptr<ReadBuffer> peekable_read_buffer_from_fd;
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -40,7 +40,6 @@ StorageSQLite::StorageSQLite(
|
||||
, WithContext(context_->getGlobalContext())
|
||||
, remote_table_name(remote_table_name_)
|
||||
, database_path(database_path_)
|
||||
, global_context(context_)
|
||||
, sqlite_db(sqlite_db_)
|
||||
, log(&Poco::Logger::get("StorageSQLite (" + table_id_.table_name + ")"))
|
||||
{
|
||||
|
@ -48,7 +48,6 @@ public:
|
||||
private:
|
||||
String remote_table_name;
|
||||
String database_path;
|
||||
ContextPtr global_context;
|
||||
SQLitePtr sqlite_db;
|
||||
Poco::Logger * log;
|
||||
};
|
||||
|
@ -38,7 +38,7 @@ ColumnsDescription TableFunctionFile::getActualTableStructure(ContextPtr context
|
||||
{
|
||||
size_t total_bytes_to_read = 0;
|
||||
Strings paths = StorageFile::getPathsList(filename, context->getUserFilesPath(), context, total_bytes_to_read);
|
||||
return StorageFile::getTableStructureFromData(format, paths, compression_method, std::nullopt, context);
|
||||
return StorageFile::getTableStructureFromFile(format, paths, compression_method, std::nullopt, context);
|
||||
}
|
||||
|
||||
return parseColumnsListFromString(structure, context);
|
||||
|
@ -5,36 +5,66 @@ import json
|
||||
import logging
|
||||
import sys
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
import requests
|
||||
import requests # type: ignore
|
||||
|
||||
from ci_config import CI_CONFIG
|
||||
|
||||
DOWNLOAD_RETRIES_COUNT = 5
|
||||
|
||||
|
||||
def get_with_retries(
|
||||
url: str,
|
||||
retries: int = DOWNLOAD_RETRIES_COUNT,
|
||||
sleep: int = 3,
|
||||
**kwargs,
|
||||
) -> requests.Response:
|
||||
logging.info("Getting URL with %i and sleep %i in between: %s", retries, sleep, url)
|
||||
exc = None # type: Optional[Exception]
|
||||
for i in range(DOWNLOAD_RETRIES_COUNT):
|
||||
try:
|
||||
response = requests.get(url, **kwargs)
|
||||
response.raise_for_status()
|
||||
break
|
||||
except Exception as e:
|
||||
if i + 1 < DOWNLOAD_RETRIES_COUNT:
|
||||
logging.info("Exception '%s' while getting, retry %i", e, i + 1)
|
||||
time.sleep(sleep)
|
||||
|
||||
exc = e
|
||||
else:
|
||||
raise Exception(exc)
|
||||
|
||||
return response
|
||||
|
||||
|
||||
def get_build_name_for_check(check_name):
|
||||
return CI_CONFIG['tests_config'][check_name]['required_build']
|
||||
return CI_CONFIG["tests_config"][check_name]["required_build"]
|
||||
|
||||
|
||||
def get_build_urls(build_name, reports_path):
|
||||
for root, _, files in os.walk(reports_path):
|
||||
for f in files:
|
||||
if build_name in f :
|
||||
if build_name in f:
|
||||
logging.info("Found build report json %s", f)
|
||||
with open(os.path.join(root, f), 'r', encoding='utf-8') as file_handler:
|
||||
with open(os.path.join(root, f), "r", encoding="utf-8") as file_handler:
|
||||
build_report = json.load(file_handler)
|
||||
return build_report['build_urls']
|
||||
return build_report["build_urls"]
|
||||
return []
|
||||
|
||||
|
||||
def dowload_build_with_progress(url, path):
|
||||
logging.info("Downloading from %s to temp path %s", url, path)
|
||||
for i in range(DOWNLOAD_RETRIES_COUNT):
|
||||
try:
|
||||
with open(path, 'wb') as f:
|
||||
response = requests.get(url, stream=True)
|
||||
response.raise_for_status()
|
||||
total_length = response.headers.get('content-length')
|
||||
with open(path, "wb") as f:
|
||||
response = get_with_retries(url, retries=1, stream=True)
|
||||
total_length = response.headers.get("content-length")
|
||||
if total_length is None or int(total_length) == 0:
|
||||
logging.info("No content-length, will download file without progress")
|
||||
logging.info(
|
||||
"No content-length, will download file without progress"
|
||||
)
|
||||
f.write(response.content)
|
||||
else:
|
||||
dl = 0
|
||||
@ -46,32 +76,38 @@ def dowload_build_with_progress(url, path):
|
||||
if sys.stdout.isatty():
|
||||
done = int(50 * dl / total_length)
|
||||
percent = int(100 * float(dl) / total_length)
|
||||
eq_str = '=' * done
|
||||
space_str = ' ' * (50 - done)
|
||||
eq_str = "=" * done
|
||||
space_str = " " * (50 - done)
|
||||
sys.stdout.write(f"\r[{eq_str}{space_str}] {percent}%")
|
||||
sys.stdout.flush()
|
||||
break
|
||||
except Exception as ex:
|
||||
sys.stdout.write("\n")
|
||||
time.sleep(3)
|
||||
logging.info("Exception while downloading %s, retry %s", ex, i + 1)
|
||||
except Exception:
|
||||
if sys.stdout.isatty():
|
||||
sys.stdout.write("\n")
|
||||
if i + 1 < DOWNLOAD_RETRIES_COUNT:
|
||||
time.sleep(3)
|
||||
|
||||
if os.path.exists(path):
|
||||
os.remove(path)
|
||||
else:
|
||||
raise Exception(f"Cannot download dataset from {url}, all retries exceeded")
|
||||
|
||||
sys.stdout.write("\n")
|
||||
if sys.stdout.isatty():
|
||||
sys.stdout.write("\n")
|
||||
logging.info("Downloading finished")
|
||||
|
||||
|
||||
def download_builds(result_path, build_urls, filter_fn):
|
||||
for url in build_urls:
|
||||
if filter_fn(url):
|
||||
fname = os.path.basename(url.replace('%2B', '+').replace('%20', ' '))
|
||||
fname = os.path.basename(url.replace("%2B", "+").replace("%20", " "))
|
||||
logging.info("Will download %s to %s", fname, result_path)
|
||||
dowload_build_with_progress(url, os.path.join(result_path, fname))
|
||||
|
||||
def download_builds_filter(check_name, reports_path, result_path, filter_fn=lambda _: True):
|
||||
|
||||
def download_builds_filter(
|
||||
check_name, reports_path, result_path, filter_fn=lambda _: True
|
||||
):
|
||||
build_name = get_build_name_for_check(check_name)
|
||||
urls = get_build_urls(build_name, reports_path)
|
||||
print(urls)
|
||||
@ -81,17 +117,32 @@ def download_builds_filter(check_name, reports_path, result_path, filter_fn=lamb
|
||||
|
||||
download_builds(result_path, urls, filter_fn)
|
||||
|
||||
|
||||
def download_all_deb_packages(check_name, reports_path, result_path):
|
||||
download_builds_filter(check_name, reports_path, result_path, lambda x: x.endswith('deb'))
|
||||
download_builds_filter(
|
||||
check_name, reports_path, result_path, lambda x: x.endswith("deb")
|
||||
)
|
||||
|
||||
|
||||
def download_shared_build(check_name, reports_path, result_path):
|
||||
download_builds_filter(check_name, reports_path, result_path, lambda x: x.endswith('shared_build.tgz'))
|
||||
download_builds_filter(
|
||||
check_name, reports_path, result_path, lambda x: x.endswith("shared_build.tgz")
|
||||
)
|
||||
|
||||
|
||||
def download_unit_tests(check_name, reports_path, result_path):
|
||||
download_builds_filter(check_name, reports_path, result_path, lambda x: x.endswith('unit_tests_dbms'))
|
||||
download_builds_filter(
|
||||
check_name, reports_path, result_path, lambda x: x.endswith("unit_tests_dbms")
|
||||
)
|
||||
|
||||
|
||||
def download_clickhouse_binary(check_name, reports_path, result_path):
|
||||
download_builds_filter(check_name, reports_path, result_path, lambda x: x.endswith('clickhouse'))
|
||||
download_builds_filter(
|
||||
check_name, reports_path, result_path, lambda x: x.endswith("clickhouse")
|
||||
)
|
||||
|
||||
|
||||
def download_performance_build(check_name, reports_path, result_path):
|
||||
download_builds_filter(check_name, reports_path, result_path, lambda x: x.endswith('performance.tgz'))
|
||||
download_builds_filter(
|
||||
check_name, reports_path, result_path, lambda x: x.endswith("performance.tgz")
|
||||
)
|
||||
|
@ -2,28 +2,51 @@
|
||||
import json
|
||||
import os
|
||||
|
||||
import requests # type: ignore
|
||||
from unidiff import PatchSet # type: ignore
|
||||
|
||||
from env_helper import GITHUB_REPOSITORY, GITHUB_SERVER_URL, GITHUB_RUN_ID, GITHUB_EVENT_PATH
|
||||
from build_download_helper import get_with_retries
|
||||
from env_helper import (
|
||||
GITHUB_REPOSITORY,
|
||||
GITHUB_SERVER_URL,
|
||||
GITHUB_RUN_ID,
|
||||
GITHUB_EVENT_PATH,
|
||||
)
|
||||
|
||||
DIFF_IN_DOCUMENTATION_EXT = [
|
||||
".html",
|
||||
".md",
|
||||
".yml",
|
||||
".txt",
|
||||
".css",
|
||||
".js",
|
||||
".xml",
|
||||
".ico",
|
||||
".conf",
|
||||
".svg",
|
||||
".png",
|
||||
".jpg",
|
||||
".py",
|
||||
".sh",
|
||||
".json",
|
||||
]
|
||||
RETRY_SLEEP = 0
|
||||
|
||||
DIFF_IN_DOCUMENTATION_EXT = [".html", ".md", ".yml", ".txt", ".css", ".js", ".xml", ".ico", ".conf", ".svg", ".png",
|
||||
".jpg", ".py", ".sh", ".json"]
|
||||
|
||||
def get_pr_for_commit(sha, ref):
|
||||
if not ref:
|
||||
return None
|
||||
try_get_pr_url = f"https://api.github.com/repos/{GITHUB_REPOSITORY}/commits/{sha}/pulls"
|
||||
try_get_pr_url = (
|
||||
f"https://api.github.com/repos/{GITHUB_REPOSITORY}/commits/{sha}/pulls"
|
||||
)
|
||||
try:
|
||||
response = requests.get(try_get_pr_url)
|
||||
response.raise_for_status()
|
||||
response = get_with_retries(try_get_pr_url, sleep=RETRY_SLEEP)
|
||||
data = response.json()
|
||||
if len(data) > 1:
|
||||
print("Got more than one pr for commit", sha)
|
||||
for pr in data:
|
||||
# refs for pushes look like refs/heads/XX
# refs for PRs look like XX
|
||||
if pr['head']['ref'] in ref:
|
||||
if pr["head"]["ref"] in ref:
|
||||
return pr
|
||||
print("Cannot find PR with required ref", ref, "returning first one")
|
||||
first_pr = data[0]
|
||||
@ -35,15 +58,22 @@ def get_pr_for_commit(sha, ref):
|
||||
|
||||
class PRInfo:
|
||||
default_event = {
|
||||
'commits': 1,
|
||||
'before': 'HEAD~',
|
||||
'after': 'HEAD',
|
||||
'ref': None,
|
||||
}
|
||||
def __init__(self, github_event=None, need_orgs=False, need_changed_files=False, labels_from_api=False):
|
||||
"commits": 1,
|
||||
"before": "HEAD~",
|
||||
"after": "HEAD",
|
||||
"ref": None,
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
github_event=None,
|
||||
need_orgs=False,
|
||||
need_changed_files=False,
|
||||
pr_event_from_api=False,
|
||||
):
|
||||
if not github_event:
|
||||
if GITHUB_EVENT_PATH:
|
||||
with open(GITHUB_EVENT_PATH, 'r', encoding='utf-8') as event_file:
|
||||
with open(GITHUB_EVENT_PATH, "r", encoding="utf-8") as event_file:
|
||||
github_event = json.load(event_file)
|
||||
else:
|
||||
github_event = PRInfo.default_event.copy()
|
||||
@ -51,22 +81,34 @@ class PRInfo:
|
||||
self.changed_files = set([])
|
||||
self.body = ""
|
||||
ref = github_event.get("ref", "refs/head/master")
|
||||
if ref and ref.startswith('refs/heads/'):
|
||||
if ref and ref.startswith("refs/heads/"):
|
||||
ref = ref[11:]
|
||||
|
||||
# workflow completed event, used for PRs only
|
||||
if 'action' in github_event and github_event['action'] == 'completed':
|
||||
self.sha = github_event['workflow_run']['head_sha']
|
||||
prs_for_sha = requests.get(f"https://api.github.com/repos/{GITHUB_REPOSITORY}/commits/{self.sha}/pulls").json()
|
||||
if "action" in github_event and github_event["action"] == "completed":
|
||||
self.sha = github_event["workflow_run"]["head_sha"]
|
||||
prs_for_sha = get_with_retries(
|
||||
f"https://api.github.com/repos/{GITHUB_REPOSITORY}/commits/{self.sha}"
|
||||
"/pulls",
|
||||
sleep=RETRY_SLEEP,
|
||||
).json()
|
||||
if len(prs_for_sha) != 0:
|
||||
github_event['pull_request'] = prs_for_sha[0]
|
||||
github_event["pull_request"] = prs_for_sha[0]
|
||||
|
||||
if 'pull_request' in github_event: # pull request and other similar events
|
||||
self.number = github_event['pull_request']['number']
|
||||
if 'after' in github_event:
|
||||
self.sha = github_event['after']
|
||||
if "pull_request" in github_event: # pull request and other similar events
|
||||
self.number = github_event["pull_request"]["number"]
|
||||
if pr_event_from_api:
|
||||
response = get_with_retries(
|
||||
f"https://api.github.com/repos/{GITHUB_REPOSITORY}"
|
||||
f"/pulls/{self.number}",
|
||||
sleep=RETRY_SLEEP,
|
||||
)
|
||||
github_event["pull_request"] = response.json()
|
||||
|
||||
if "after" in github_event:
|
||||
self.sha = github_event["after"]
|
||||
else:
|
||||
self.sha = github_event['pull_request']['head']['sha']
self.sha = github_event["pull_request"]["head"]["sha"]

repo_prefix = f"{GITHUB_SERVER_URL}/{GITHUB_REPOSITORY}"
self.task_url = f"{repo_prefix}/actions/runs/{GITHUB_RUN_ID or '0'}"
@ -75,35 +117,35 @@ class PRInfo:
self.commit_html_url = f"{repo_prefix}/commits/{self.sha}"
self.pr_html_url = f"{repo_prefix}/pull/{self.number}"

self.base_ref = github_event['pull_request']['base']['ref']
self.base_name = github_event['pull_request']['base']['repo']['full_name']
self.head_ref = github_event['pull_request']['head']['ref']
self.head_name = github_event['pull_request']['head']['repo']['full_name']
self.body = github_event['pull_request']['body']
self.base_ref = github_event["pull_request"]["base"]["ref"]
self.base_name = github_event["pull_request"]["base"]["repo"]["full_name"]
self.head_ref = github_event["pull_request"]["head"]["ref"]
self.head_name = github_event["pull_request"]["head"]["repo"]["full_name"]
self.body = github_event["pull_request"]["body"]
self.labels = {
label["name"] for label in github_event["pull_request"]["labels"]
}

if labels_from_api:
response = requests.get(f"https://api.github.com/repos/{GITHUB_REPOSITORY}/issues/{self.number}/labels")
self.labels = {l['name'] for l in response.json()}
else:
self.labels = {l['name'] for l in github_event['pull_request']['labels']}

self.user_login = github_event['pull_request']['user']['login']
self.user_login = github_event["pull_request"]["user"]["login"]
self.user_orgs = set([])
if need_orgs:
user_orgs_response = requests.get(github_event['pull_request']['user']['organizations_url'])
user_orgs_response = get_with_retries(
github_event["pull_request"]["user"]["organizations_url"],
sleep=RETRY_SLEEP,
)
if user_orgs_response.ok:
response_json = user_orgs_response.json()
self.user_orgs = set(org['id'] for org in response_json)
self.user_orgs = set(org["id"] for org in response_json)

self.diff_url = github_event['pull_request']['diff_url']
elif 'commits' in github_event:
self.sha = github_event['after']
pull_request = get_pr_for_commit(self.sha, github_event['ref'])
self.diff_url = github_event["pull_request"]["diff_url"]
elif "commits" in github_event:
self.sha = github_event["after"]
pull_request = get_pr_for_commit(self.sha, github_event["ref"])
repo_prefix = f"{GITHUB_SERVER_URL}/{GITHUB_REPOSITORY}"
self.task_url = f"{repo_prefix}/actions/runs/{GITHUB_RUN_ID or '0'}"
self.commit_html_url = f"{repo_prefix}/commits/{self.sha}"
self.repo_full_name = GITHUB_REPOSITORY
if pull_request is None or pull_request['state'] == 'closed':
if pull_request is None or pull_request["state"] == "closed":
# it's merged PR to master
self.number = 0
self.labels = {}
@ -112,25 +154,25 @@ class PRInfo:
self.base_name = self.repo_full_name
self.head_ref = ref
self.head_name = self.repo_full_name
self.diff_url = \
f"https://api.github.com/repos/{GITHUB_REPOSITORY}/compare/{github_event['before']}...{self.sha}"
self.diff_url = (
f"https://api.github.com/repos/{GITHUB_REPOSITORY}/"
f"compare/{github_event['before']}...{self.sha}"
)
else:
self.number = pull_request['number']
if labels_from_api:
response = requests.get(f"https://api.github.com/repos/{GITHUB_REPOSITORY}/issues/{self.number}/labels")
self.labels = {l['name'] for l in response.json()}
else:
self.labels = {l['name'] for l in pull_request['labels']}
self.labels = {label["name"] for label in pull_request["labels"]}

self.base_ref = pull_request['base']['ref']
self.base_name = pull_request['base']['repo']['full_name']
self.head_ref = pull_request['head']['ref']
self.head_name = pull_request['head']['repo']['full_name']
self.pr_html_url = pull_request['html_url']
if 'pr-backport' in self.labels:
self.diff_url = f"https://github.com/{GITHUB_REPOSITORY}/compare/master...{self.head_ref}.diff"
self.base_ref = pull_request["base"]["ref"]
self.base_name = pull_request["base"]["repo"]["full_name"]
self.head_ref = pull_request["head"]["ref"]
self.head_name = pull_request["head"]["repo"]["full_name"]
self.pr_html_url = pull_request["html_url"]
if "pr-backport" in self.labels:
self.diff_url = (
f"https://github.com/{GITHUB_REPOSITORY}/"
f"compare/master...{self.head_ref}.diff"
)
else:
self.diff_url = pull_request['diff_url']
self.diff_url = pull_request["diff_url"]
else:
print(json.dumps(github_event, sort_keys=True, indent=4))
self.sha = os.getenv("GITHUB_SHA")
@ -153,24 +195,27 @@ class PRInfo:
if not self.diff_url:
raise Exception("Diff URL cannot be find for event")

response = requests.get(self.diff_url)
response = get_with_retries(
self.diff_url,
sleep=RETRY_SLEEP,
)
response.raise_for_status()
if 'commits' in self.event and self.number == 0:
if "commits" in self.event and self.number == 0:
diff = response.json()

if 'files' in diff:
self.changed_files = [f['filename'] for f in diff['files']]
if "files" in diff:
self.changed_files = [f["filename"] for f in diff["files"]]
else:
diff_object = PatchSet(response.text)
self.changed_files = {f.path for f in diff_object}

def get_dict(self):
return {
'sha': self.sha,
'number': self.number,
'labels': self.labels,
'user_login': self.user_login,
'user_orgs': self.user_orgs,
"sha": self.sha,
"number": self.number,
"labels": self.labels,
"user_login": self.user_login,
"user_orgs": self.user_orgs,
}

def has_changes_in_documentation(self):
@ -181,49 +226,63 @@ class PRInfo:

for f in self.changed_files:
_, ext = os.path.splitext(f)
path_in_docs = 'docs' in f
path_in_website = 'website' in f
if (ext in DIFF_IN_DOCUMENTATION_EXT and (path_in_docs or path_in_website)) or 'docker/docs' in f:
path_in_docs = "docs" in f
path_in_website = "website" in f
if (
ext in DIFF_IN_DOCUMENTATION_EXT and (path_in_docs or path_in_website)
) or "docker/docs" in f:
return True
return False

def can_skip_builds_and_use_version_from_master(self):
if 'force tests' in self.labels:
# TODO: See a broken loop
if "force tests" in self.labels:
return False

if self.changed_files is None or not self.changed_files:
return False

for f in self.changed_files:
if (not f.startswith('tests/queries')
or not f.startswith('tests/integration')
or not f.startswith('tests/performance')):
# TODO: this logic is broken, should be fixed before using
if (
not f.startswith("tests/queries")
or not f.startswith("tests/integration")
or not f.startswith("tests/performance")
):
return False

return True

def can_skip_integration_tests(self):
if 'force tests' in self.labels:
# TODO: See a broken loop
if "force tests" in self.labels:
return False

if self.changed_files is None or not self.changed_files:
return False

for f in self.changed_files:
if not f.startswith('tests/queries') or not f.startswith('tests/performance'):
# TODO: this logic is broken, should be fixed before using
if not f.startswith("tests/queries") or not f.startswith(
"tests/performance"
):
return False

return True

def can_skip_functional_tests(self):
if 'force tests' in self.labels:
# TODO: See a broken loop
if "force tests" in self.labels:
return False

if self.changed_files is None or not self.changed_files:
return False

for f in self.changed_files:
if not f.startswith('tests/integration') or not f.startswith('tests/performance'):
# TODO: this logic is broken, should be fixed before using
if not f.startswith("tests/integration") or not f.startswith(
"tests/performance"
):
return False

return True
@ -204,7 +204,7 @@ def check_pr_description(pr_info):
if __name__ == "__main__":
logging.basicConfig(level=logging.INFO)

pr_info = PRInfo(need_orgs=True, labels_from_api=True)
pr_info = PRInfo(need_orgs=True, pr_event_from_api=True)
can_run, description = should_run_checks_for_pr(pr_info)
gh = Github(get_best_robot_token())
commit = get_commit(gh, pr_info.sha)
@ -212,6 +212,9 @@ if __name__ == "__main__":
description_report = check_pr_description(pr_info)[:139]
if description_report:
print("::notice ::Cannot run, description does not match the template")
logging.info(
"PR body doesn't match the template: (start)\n%s\n(end)", pr_info.body
)
url = (
f"{GITHUB_SERVER_URL}/{GITHUB_REPOSITORY}/"
"blob/master/.github/PULL_REQUEST_TEMPLATE.md?plain=1"
20
tests/performance/classification.xml
Normal file
@ -0,0 +1,20 @@
<test>
<settings>
<allow_experimental_nlp_functions>1</allow_experimental_nlp_functions>
</settings>

<preconditions>
<table_exists>hits_100m_single</table_exists>
</preconditions>

<query>SELECT detectLanguage(SearchPhrase) FROM hits_100m_single FORMAT Null</query>
<query>SELECT detectLanguageMixed(SearchPhrase) FROM hits_100m_single FORMAT Null</query>
<query>SELECT detectTonality(SearchPhrase) FROM hits_100m_single FORMAT Null</query>

<!-- Input is not really correct for these functions,
but at least it gives us some idea about their performance -->
<query>SELECT detectProgrammingLanguage(SearchPhrase) FROM hits_100m_single FORMAT Null</query>
<query>SELECT detectLanguageUnknown(SearchPhrase) FROM hits_100m_single FORMAT Null</query>
<query>SELECT detectCharset(SearchPhrase) FROM hits_100m_single FORMAT Null</query>

</test>
@ -90,3 +90,31 @@
21
22
23
6
7
7
5
6
7
8
9
10
11
12
13
14
15
16
17
5
6
7
8
9
10
11
12
13
14
15
16
@ -93,3 +93,34 @@ SELECT position(concat(' иголка.ру', arrayStringConcat
SELECT position(concat(' иголка.ру', arrayStringConcat(arrayMap(x -> ' ', range(20000)))), 'иголка.ру') AS res;
SELECT position(concat(' иголка.ру', arrayStringConcat(arrayMap(x -> ' ', range(20000)))), 'иголка.ру') AS res;
SELECT position(concat(' иголка.ру', arrayStringConcat(arrayMap(x -> ' ', range(20000)))), 'иголка.ру') AS res;
SELECT positionCaseInsensitiveUTF8(materialize('test ß test'), 'ß') AS res;
SELECT positionCaseInsensitiveUTF8(materialize('test AaßAa test'), 'aßa') AS res;
SELECT positionCaseInsensitiveUTF8(materialize('test A1ß2a test'), '1ß2') AS res;
SELECT positionCaseInsensitiveUTF8(materialize('xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest'), 'aa1ẞ1Yzßxßß1ßß1BC') AS res;
SELECT positionCaseInsensitiveUTF8(materialize(concat('test a1ßAa test', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'a1ẞaa') AS res;
SELECT positionCaseInsensitiveUTF8(materialize(concat(' test a1ßAa test', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'a1ẞaa') AS res;
SELECT positionCaseInsensitiveUTF8(materialize(concat(' test a1ßAa test', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'a1ẞaa') AS res;
SELECT positionCaseInsensitiveUTF8(materialize(concat(' test a1ßAa test', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'a1ẞaa') AS res;
SELECT positionCaseInsensitiveUTF8(materialize(concat(' test a1ßAa test', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'a1ẞaa') AS res;
SELECT positionCaseInsensitiveUTF8(materialize(concat(' test a1ßAa test', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'a1ẞaa') AS res;
SELECT positionCaseInsensitiveUTF8(materialize(concat(' test a1ßAa test', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'a1ẞaa') AS res;
SELECT positionCaseInsensitiveUTF8(materialize(concat(' test a1ßAa test', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'a1ẞaa') AS res;
SELECT positionCaseInsensitiveUTF8(materialize(concat(' test a1ßAa test', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'a1ẞaa') AS res;
SELECT positionCaseInsensitiveUTF8(materialize(concat(' test a1ßAa test', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'a1ẞaa') AS res;
SELECT positionCaseInsensitiveUTF8(materialize(concat(' test a1ßAa test', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'a1ẞaa') AS res;
SELECT positionCaseInsensitiveUTF8(materialize(concat(' test a1ßAa test', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'a1ẞaa') AS res;
SELECT positionCaseInsensitiveUTF8(materialize(concat('xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'aa1ẞ1Yzßxßß1ßß1BC') AS res;
SELECT positionCaseInsensitiveUTF8(materialize(concat(' xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'aa1ẞ1Yzßxßß1ßß1BC') AS res;
SELECT positionCaseInsensitiveUTF8(materialize(concat(' xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'aa1ẞ1Yzßxßß1ßß1BC') AS res;
SELECT positionCaseInsensitiveUTF8(materialize(concat(' xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'aa1ẞ1Yzßxßß1ßß1BC') AS res;
SELECT positionCaseInsensitiveUTF8(materialize(concat(' xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'aa1ẞ1Yzßxßß1ßß1BC') AS res;
SELECT positionCaseInsensitiveUTF8(materialize(concat(' xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'aa1ẞ1Yzßxßß1ßß1BC') AS res;
SELECT positionCaseInsensitiveUTF8(materialize(concat(' xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'aa1ẞ1Yzßxßß1ßß1BC') AS res;
SELECT positionCaseInsensitiveUTF8(materialize(concat(' xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'aa1ẞ1Yzßxßß1ßß1BC') AS res;
SELECT positionCaseInsensitiveUTF8(materialize(concat(' xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'aa1ẞ1Yzßxßß1ßß1BC') AS res;
SELECT positionCaseInsensitiveUTF8(materialize(concat(' xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'aa1ẞ1Yzßxßß1ßß1BC') AS res;
SELECT positionCaseInsensitiveUTF8(materialize(concat(' xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'aa1ẞ1Yzßxßß1ßß1BC') AS res;
SELECT positionCaseInsensitiveUTF8(materialize(concat(' xẞyyaa1ẞ1yzẞXẞẞ1ẞẞ1bctest', arrayStringConcat(arrayMap(x -> ' ', range(20000))))), 'aa1ẞ1Yzßxßß1ßß1BC') AS res;
@ -1,34 +1,94 @@
-- { echoOn }

-- remote(system.one)
SELECT 'remote(system.one)';
remote(system.one)
SELECT * FROM remote('127.0.0.1', system.one);
0
SELECT * FROM remote('127.0.0.{1,2}', system.one);
0
0
0
SELECT _shard_num, * FROM remote('127.0.0.1', system.one);
1 0
SELECT _shard_num, * FROM remote('127.0.0.{1,2}', system.one) order by _shard_num;
1 0
2 0
SELECT _shard_num, * FROM remote('127.0.0.{1,2}', system.one) WHERE _shard_num = 1;
1 0
-- dist_1 using test_shard_localhost
SELECT 'dist_1';
dist_1
1
1 10
10
SELECT _shard_num FROM dist_1 order by _shard_num;
1
1
SELECT _shard_num FROM dist_1 order by _shard_num;
1
1
SELECT _shard_num, key FROM dist_1 order by _shard_num;
1 10
1 20
SELECT key FROM dist_1;
10
20
SELECT _shard_num FROM dist_1 order by _shard_num;
1
1
SELECT _shard_num, key FROM dist_1 order by _shard_num, key;
1 10
1 20
SELECT key FROM dist_1;
10
20
-- dist_2 using test_cluster_two_shards_localhost
SELECT 'dist_2';
dist_2
SELECT _shard_num FROM dist_2 order by _shard_num;
1
2
SELECT _shard_num FROM dist_2 order by _shard_num;
1
2
SELECT _shard_num, key FROM dist_2 order by _shard_num, key;
1 100
2 100
SELECT key FROM dist_2;
100
100
-- multiple _shard_num
SELECT 'remote(Distributed)';
remote(Distributed)
SELECT _shard_num, key FROM remote('127.0.0.1', currentDatabase(), dist_2) order by _shard_num, key;
1 100
1 100
2 100
-- JOIN system.clusters
SELECT 'JOIN system.clusters';
JOIN system.clusters
SELECT a._shard_num, a.key, b.host_name, b.host_address IN ('::1', '127.0.0.1'), b.port
FROM (SELECT *, _shard_num FROM dist_1) a
JOIN system.clusters b
ON a._shard_num = b.shard_num
WHERE b.cluster = 'test_cluster_two_shards_localhost';
1 10 localhost 1 9000
1 20 localhost 1 9000
SELECT _shard_num, key, b.host_name, b.host_address IN ('::1', '127.0.0.1'), b.port
FROM dist_1 a
JOIN system.clusters b
ON _shard_num = b.shard_num
WHERE b.cluster = 'test_cluster_two_shards_localhost'; -- { serverError 403 }
SELECT 'Rewrite with alias';
Rewrite with alias
SELECT a._shard_num, key FROM dist_1 a;
1 10
1 20
-- the same with JOIN, just in case
SELECT a._shard_num, a.key, b.host_name, b.host_address IN ('::1', '127.0.0.1'), b.port
FROM dist_1 a
JOIN system.clusters b
ON a._shard_num = b.shard_num
WHERE b.cluster = 'test_cluster_two_shards_localhost'; -- { serverError 47; }
SELECT 'dist_3';
dist_3
SELECT * FROM dist_3;
100 foo
SELECT _shard_num, * FROM dist_3 order by _shard_num;
foo 100 foo
@ -3,6 +3,28 @@
-- make the order static
SET max_threads = 1;

DROP TABLE IF EXISTS mem1;
DROP TABLE IF EXISTS mem2;
DROP TABLE IF EXISTS mem3;
DROP TABLE IF EXISTS dist_1;
DROP TABLE IF EXISTS dist_2;
DROP TABLE IF EXISTS dist_3;

CREATE TABLE mem1 (key Int) Engine=Memory();
INSERT INTO mem1 VALUES (10);
CREATE TABLE dist_1 AS mem1 Engine=Distributed(test_shard_localhost, currentDatabase(), mem1);
INSERT INTO dist_1 VALUES (20);

CREATE TABLE mem2 (key Int) Engine=Memory();
INSERT INTO mem2 VALUES (100);
CREATE TABLE dist_2 AS mem2 Engine=Distributed(test_cluster_two_shards_localhost, currentDatabase(), mem2);

CREATE TABLE mem3 (key Int, _shard_num String) Engine=Memory();
INSERT INTO mem3 VALUES (100, 'foo');
CREATE TABLE dist_3 AS mem3 Engine=Distributed(test_shard_localhost, currentDatabase(), mem3);

-- { echoOn }

-- remote(system.one)
SELECT 'remote(system.one)';
SELECT * FROM remote('127.0.0.1', system.one);
@ -13,27 +35,20 @@ SELECT _shard_num, * FROM remote('127.0.0.{1,2}', system.one) WHERE _shard_num =

-- dist_1 using test_shard_localhost
SELECT 'dist_1';
CREATE TABLE mem1 (key Int) Engine=Memory();
CREATE TABLE dist_1 AS mem1 Engine=Distributed(test_shard_localhost, currentDatabase(), mem1);
SELECT _shard_num FROM dist_1 order by _shard_num;

INSERT INTO mem1 VALUES (10);
SELECT _shard_num FROM dist_1 order by _shard_num;
SELECT _shard_num, key FROM dist_1 order by _shard_num;
SELECT key FROM dist_1;

INSERT INTO dist_1 VALUES (20);
SELECT _shard_num FROM dist_1 order by _shard_num;
SELECT _shard_num, key FROM dist_1 order by _shard_num, key;
SELECT key FROM dist_1;

-- dist_2 using test_cluster_two_shards_localhost
SELECT 'dist_2';
CREATE TABLE mem2 (key Int) Engine=Memory();
CREATE TABLE dist_2 AS mem2 Engine=Distributed(test_cluster_two_shards_localhost, currentDatabase(), mem2);
SELECT _shard_num FROM dist_2 order by _shard_num;

INSERT INTO mem2 VALUES (100);
SELECT _shard_num FROM dist_2 order by _shard_num;
SELECT _shard_num, key FROM dist_2 order by _shard_num, key;
SELECT key FROM dist_2;
@ -57,8 +72,8 @@ JOIN system.clusters b
ON _shard_num = b.shard_num
WHERE b.cluster = 'test_cluster_two_shards_localhost'; -- { serverError 403 }

-- rewrite does not work with aliases, hence Missing columns (47)
SELECT a._shard_num, key FROM dist_1 a; -- { serverError 47; }
SELECT 'Rewrite with alias';
SELECT a._shard_num, key FROM dist_1 a;
-- the same with JOIN, just in case
SELECT a._shard_num, a.key, b.host_name, b.host_address IN ('::1', '127.0.0.1'), b.port
FROM dist_1 a
@ -67,8 +82,5 @@ ON a._shard_num = b.shard_num
WHERE b.cluster = 'test_cluster_two_shards_localhost'; -- { serverError 47; }

SELECT 'dist_3';
CREATE TABLE mem3 (key Int, _shard_num String) Engine=Memory();
CREATE TABLE dist_3 AS mem3 Engine=Distributed(test_shard_localhost, currentDatabase(), mem3);
INSERT INTO mem3 VALUES (100, 'foo');
SELECT * FROM dist_3;
SELECT _shard_num, * FROM dist_3 order by _shard_num;
@ -3,3 +3,7 @@ UTC 1234567891011 2009-02-13 23:31:31.011 1970-01-15 06:56:07.891011 1970-01-01
Asia/Makassar 1234567891011 2009-02-14 07:31:31.011 1970-01-15 14:56:07.891011 1970-01-01 08:20:34.567891011 DateTime64(9, \'Asia/Makassar\')
non-const column
1234567891011 2009-02-13 23:31:31.011 1970-01-15 06:56:07.891011 1970-01-01 00:20:34.567891011
upper range bound
9904447342 2283-11-10 19:22:22.123 2283-11-10 19:22:22.123456 1925-01-01 00:00:00.586094827
lower range bound
-1420066799 1925-01-01 01:00:01.123 1925-01-01 01:00:01.123456 1925-01-01 01:00:01.123456789
@ -42,4 +42,30 @@ SELECT
i64,
fromUnixTimestamp64Milli(i64, tz),
fromUnixTimestamp64Micro(i64, tz),
fromUnixTimestamp64Nano(i64, tz) as dt64;
fromUnixTimestamp64Nano(i64, tz) as dt64;

SELECT 'upper range bound';
WITH
9904447342 AS timestamp,
CAST(9904447342123 AS Int64) AS milli,
CAST(9904447342123456 AS Int64) AS micro,
CAST(9904447342123456789 AS Int64) AS nano,
'UTC' AS tz
SELECT
timestamp,
fromUnixTimestamp64Milli(milli, tz),
fromUnixTimestamp64Micro(micro, tz),
fromUnixTimestamp64Nano(nano, tz);

SELECT 'lower range bound';
WITH
-1420066799 AS timestamp,
CAST(-1420066799123 AS Int64) AS milli,
CAST(-1420066799123456 AS Int64) AS micro,
CAST(-1420066799123456789 AS Int64) AS nano,
'UTC' AS tz
SELECT
timestamp,
fromUnixTimestamp64Milli(milli, tz),
fromUnixTimestamp64Micro(micro, tz),
fromUnixTimestamp64Nano(nano, tz);
@ -1,14 +1,31 @@
-- { echo }
with anySimpleState(number) as c select toTypeName(c), c from numbers(1);
SimpleAggregateFunction(any, UInt64) 0
with anyLastSimpleState(number) as c select toTypeName(c), c from numbers(1);
SimpleAggregateFunction(anyLast, UInt64) 0
with minSimpleState(number) as c select toTypeName(c), c from numbers(1);
SimpleAggregateFunction(min, UInt64) 0
with maxSimpleState(number) as c select toTypeName(c), c from numbers(1);
SimpleAggregateFunction(max, UInt64) 0
with sumSimpleState(number) as c select toTypeName(c), c from numbers(1);
SimpleAggregateFunction(sum, UInt64) 0
with sumWithOverflowSimpleState(number) as c select toTypeName(c), c from numbers(1);
SimpleAggregateFunction(sumWithOverflow, UInt64) 0
with groupBitAndSimpleState(number) as c select toTypeName(c), c from numbers(1);
SimpleAggregateFunction(groupBitAnd, UInt64) 0
with groupBitOrSimpleState(number) as c select toTypeName(c), c from numbers(1);
SimpleAggregateFunction(groupBitOr, UInt64) 0
with groupBitXorSimpleState(number) as c select toTypeName(c), c from numbers(1);
SimpleAggregateFunction(groupBitXor, UInt64) 0
with sumMapSimpleState(([number], [number])) as c select toTypeName(c), c from numbers(1);
SimpleAggregateFunction(sumMap, Tuple(Array(UInt64), Array(UInt64))) ([],[])
with minMapSimpleState(([number], [number])) as c select toTypeName(c), c from numbers(1);
SimpleAggregateFunction(minMap, Tuple(Array(UInt64), Array(UInt64))) ([0],[0])
with maxMapSimpleState(([number], [number])) as c select toTypeName(c), c from numbers(1);
SimpleAggregateFunction(maxMap, Tuple(Array(UInt64), Array(UInt64))) ([0],[0])
with groupArrayArraySimpleState([number]) as c select toTypeName(c), c from numbers(1);
SimpleAggregateFunction(groupArrayArray, Array(UInt64)) [0]
with groupUniqArrayArraySimpleState([number]) as c select toTypeName(c), c from numbers(1);
SimpleAggregateFunction(groupUniqArrayArray, Array(UInt64)) [0]
-- non-SimpleAggregateFunction
with countSimpleState(number) as c select toTypeName(c), c from numbers(1); -- { serverError 36 }
Some files were not shown because too many files have changed in this diff