Merge branch 'ClickHouse:master' into fix/signal_pipe_buf_size
Commit: 4894e93e05
.gitmodules (vendored): 6 changes

@@ -217,6 +217,9 @@
[submodule "contrib/yaml-cpp"]
    path = contrib/yaml-cpp
    url = https://github.com/ClickHouse-Extras/yaml-cpp.git
[submodule "contrib/cld2"]
    path = contrib/cld2
    url = https://github.com/ClickHouse-Extras/cld2.git
[submodule "contrib/libstemmer_c"]
    path = contrib/libstemmer_c
    url = https://github.com/ClickHouse-Extras/libstemmer_c.git
@@ -247,6 +250,9 @@
[submodule "contrib/sysroot"]
    path = contrib/sysroot
    url = https://github.com/ClickHouse-Extras/sysroot.git
[submodule "contrib/nlp-data"]
    path = contrib/nlp-data
    url = https://github.com/ClickHouse-Extras/nlp-data.git
[submodule "contrib/hive-metastore"]
    path = contrib/hive-metastore
    url = https://github.com/ClickHouse-Extras/hive-metastore
@@ -247,8 +247,6 @@ endif()

if (CMAKE_BUILD_TYPE_UC STREQUAL "DEBUG")
    set(USE_DEBUG_HELPERS ON)
else ()
    set(USE_DEBUG_HELPERS ON)
endif()
option(USE_DEBUG_HELPERS "Enable debug helpers" ${USE_DEBUG_HELPERS})

@@ -403,17 +401,6 @@ else ()
    option(WERROR "Enable -Werror compiler option" ON)
endif ()

if (WERROR)
    # Don't pollute CMAKE_CXX_FLAGS with -Werror as it will break some CMake checks.
    # Instead, adopt modern cmake usage requirement.
    target_compile_options(global-libs INTERFACE "-Werror")
endif ()

# Make this extra-checks for correct library dependencies.
if (OS_LINUX AND NOT SANITIZE)
    target_link_options(global-libs INTERFACE "-Wl,--no-undefined")
endif ()

# Increase stack size on Musl. We need big stack for our recursive-descend parser.
if (USE_MUSL)
    set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-z,stack-size=2097152")
@@ -421,6 +408,7 @@ endif ()

include(cmake/dbms_glob_sources.cmake)

add_library(global-group INTERFACE)
if (OS_LINUX OR OS_ANDROID)
    include(cmake/linux/default_libs.cmake)
elseif (OS_DARWIN)
@@ -428,6 +416,18 @@ elseif (OS_DARWIN)
elseif (OS_FREEBSD)
    include(cmake/freebsd/default_libs.cmake)
endif ()
link_libraries(global-group)

if (WERROR)
    # Don't pollute CMAKE_CXX_FLAGS with -Werror as it will break some CMake checks.
    # Instead, adopt modern cmake usage requirement.
    target_compile_options(global-group INTERFACE "-Werror")
endif ()

# Make this extra-checks for correct library dependencies.
if (OS_LINUX AND NOT SANITIZE)
    target_link_options(global-group INTERFACE "-Wl,--no-undefined")
endif ()

######################################
### Add targets below this comment ###
LICENSE: 4 changes

@@ -1,4 +1,4 @@
Copyright 2016-2021 ClickHouse, Inc.
Copyright 2016-2022 ClickHouse, Inc.

                              Apache License
                        Version 2.0, January 2004
@@ -188,7 +188,7 @@ Copyright 2016-2021 ClickHouse, Inc.
    same "printed page" as the copyright notice for easier
    identification within third-party archives.

Copyright 2016-2021 ClickHouse, Inc.
Copyright 2016-2022 ClickHouse, Inc.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -2,7 +2,9 @@

#include <iostream>
#include <string_view>
#include <algorithm>

#include <cassert>
#include <string.h>
#include <unistd.h>
#include <sys/select.h>
@@ -34,13 +36,37 @@ bool hasInputData()
    return select(1, &fds, nullptr, nullptr, &timeout) == 1;
}

struct NoCaseCompare
{
    bool operator()(const std::string & str1, const std::string & str2)
    {
        return std::lexicographical_compare(begin(str1), end(str1), begin(str2), end(str2), [](const char c1, const char c2)
        {
            return std::tolower(c1) < std::tolower(c2);
        });
    }
};

using Words = std::vector<std::string>;
template <class Compare>
void addNewWords(Words & to, const Words & from, Compare comp)
{
    size_t old_size = to.size();
    size_t new_size = old_size + from.size();

    to.reserve(new_size);
    to.insert(to.end(), from.begin(), from.end());
    auto middle = to.begin() + old_size;
    std::inplace_merge(to.begin(), middle, to.end(), comp);

    auto last_unique = std::unique(to.begin(), to.end());
    to.erase(last_unique, to.end());
}

std::optional<LineReader::Suggest::WordsRange> LineReader::Suggest::getCompletions(const String & prefix, size_t prefix_length) const
{
    if (!ready)
        return std::nullopt;
}

replxx::Replxx::completions_t LineReader::Suggest::getCompletions(const String & prefix, size_t prefix_length)
{
    std::string_view last_word;

    auto last_word_pos = prefix.find_last_of(word_break_characters);
@@ -48,21 +74,45 @@ std::optional<LineReader::Suggest::WordsRange> LineReader::Suggest::getCompletio
        last_word = prefix;
    else
        last_word = std::string_view(prefix).substr(last_word_pos + 1, std::string::npos);

    /// last_word can be empty.

    std::pair<Words::const_iterator, Words::const_iterator> range;

    std::lock_guard lock(mutex);

    /// Only perform case sensitive completion when the prefix string contains any uppercase characters
    if (std::none_of(prefix.begin(), prefix.end(), [&](auto c) { return c >= 'A' && c <= 'Z'; }))
        return std::equal_range(
        range = std::equal_range(
            words_no_case.begin(), words_no_case.end(), last_word, [prefix_length](std::string_view s, std::string_view prefix_searched)
            {
                return strncasecmp(s.data(), prefix_searched.data(), prefix_length) < 0;
            });
    else
        return std::equal_range(words.begin(), words.end(), last_word, [prefix_length](std::string_view s, std::string_view prefix_searched)
        range = std::equal_range(words.begin(), words.end(), last_word, [prefix_length](std::string_view s, std::string_view prefix_searched)
        {
            return strncmp(s.data(), prefix_searched.data(), prefix_length) < 0;
        });

    return replxx::Replxx::completions_t(range.first, range.second);
}

void LineReader::Suggest::addWords(Words && new_words)
{
    Words new_words_no_case = new_words;
    if (!new_words.empty())
    {
        std::sort(new_words.begin(), new_words.end());
        std::sort(new_words_no_case.begin(), new_words_no_case.end(), NoCaseCompare{});
    }

    {
        std::lock_guard lock(mutex);
        addNewWords(words, new_words, std::less<std::string>{});
        addNewWords(words_no_case, new_words_no_case, NoCaseCompare{});
    }

    assert(std::is_sorted(words.begin(), words.end()));
    assert(std::is_sorted(words_no_case.begin(), words_no_case.end(), NoCaseCompare{}));
}

LineReader::LineReader(const String & history_file_path_, bool multiline_, Patterns extenders_, Patterns delimiters_)
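The completion logic above keeps two sorted dictionaries (case-sensitive and case-insensitive) and folds new batches in with `std::inplace_merge`, then answers prefix queries with `std::equal_range`. A minimal standalone sketch of that merge-and-lookup pattern, outside of ClickHouse's `LineReader` types (all names here are illustrative):

```cpp
#include <algorithm>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>

using Words = std::vector<std::string>;

// Merge a batch into an already-sorted, deduplicated word list.
void addNewWords(Words & to, Words from)
{
    std::sort(from.begin(), from.end());
    size_t old_size = to.size();
    to.insert(to.end(), from.begin(), from.end());
    std::inplace_merge(to.begin(), to.begin() + old_size, to.end());
    to.erase(std::unique(to.begin(), to.end()), to.end());
}

int main()
{
    Words words = {"ALTER", "CREATE", "SELECT"};
    addNewWords(words, {"DROP", "CREATE", "INSERT"});

    // Prefix lookup over the sorted list, the same equal_range idea the
    // completion callback uses (case-sensitive variant only).
    std::string prefix = "CR";
    auto range = std::equal_range(
        words.begin(), words.end(), prefix,
        [len = prefix.size()](const std::string & a, const std::string & b)
        { return strncmp(a.data(), b.data(), len) < 0; });

    for (auto it = range.first; it != range.second; ++it)
        std::cout << *it << '\n'; // prints: CREATE
}
```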
@@ -1,10 +1,12 @@
#pragma once

#include <base/types.h>

#include <mutex>
#include <atomic>
#include <vector>
#include <optional>
#include <replxx.hxx>

#include <base/types.h>

class LineReader
{
@@ -12,14 +14,16 @@ public:
    struct Suggest
    {
        using Words = std::vector<std::string>;
        using WordsRange = std::pair<Words::const_iterator, Words::const_iterator>;

        /// Get vector for the matched range of words if any.
        replxx::Replxx::completions_t getCompletions(const String & prefix, size_t prefix_length);
        void addWords(Words && new_words);

    private:
        Words words;
        Words words_no_case;
        std::atomic<bool> ready{false};

        /// Get iterators for the matched range of words if any.
        std::optional<WordsRange> getCompletions(const String & prefix, size_t prefix_length) const;
        std::mutex mutex;
    };

    using Patterns = std::vector<const char *>;
@@ -133,7 +133,7 @@ void convertHistoryFile(const std::string & path, replxx::Replxx & rx)
}

ReplxxLineReader::ReplxxLineReader(
    const Suggest & suggest,
    Suggest & suggest,
    const String & history_file_path_,
    bool multiline_,
    Patterns extenders_,
@@ -179,9 +179,7 @@ ReplxxLineReader::ReplxxLineReader(

    auto callback = [&suggest] (const String & context, size_t context_size)
    {
        if (auto range = suggest.getCompletions(context, context_size))
            return Replxx::completions_t(range->first, range->second);
        return Replxx::completions_t();
        return suggest.getCompletions(context, context_size);
    };

    rx.set_completion_callback(callback);
@@ -9,7 +9,7 @@ class ReplxxLineReader : public LineReader
{
public:
    ReplxxLineReader(
        const Suggest & suggest,
        Suggest & suggest,
        const String & history_file_path,
        bool multiline,
        Patterns extenders_,
@@ -24,14 +24,10 @@ find_package(Threads REQUIRED)

include (cmake/find/cxx.cmake)

add_library(global-group INTERFACE)

target_link_libraries(global-group INTERFACE
    $<TARGET_PROPERTY:global-libs,INTERFACE_LINK_LIBRARIES>
)

link_libraries(global-group)

# FIXME: remove when all contribs will get custom cmake lists
install(
    TARGETS global-group global-libs
@@ -25,14 +25,10 @@ find_package(Threads REQUIRED)
include (cmake/find/unwind.cmake)
include (cmake/find/cxx.cmake)

add_library(global-group INTERFACE)

target_link_libraries(global-group INTERFACE
    $<TARGET_PROPERTY:global-libs,INTERFACE_LINK_LIBRARIES>
)

link_libraries(global-group)

# FIXME: remove when all contribs will get custom cmake lists
install(
    TARGETS global-group global-libs
@@ -45,15 +45,12 @@ endif ()
include (cmake/find/unwind.cmake)
include (cmake/find/cxx.cmake)

add_library(global-group INTERFACE)
target_link_libraries(global-group INTERFACE
    -Wl,--start-group
    $<TARGET_PROPERTY:global-libs,INTERFACE_LINK_LIBRARIES>
    -Wl,--end-group
)

link_libraries(global-group)

# FIXME: remove when all contribs will get custom cmake lists
install(
    TARGETS global-group global-libs
contrib/CMakeLists.txt (vendored): 2 changes

@@ -140,6 +140,8 @@ if (ENABLE_NLP)
    add_contrib (libstemmer-c-cmake libstemmer_c)
    add_contrib (wordnet-blast-cmake wordnet-blast)
    add_contrib (lemmagen-c-cmake lemmagen-c)
    add_contrib (nlp-data-cmake nlp-data)
    add_contrib (cld2-cmake cld2)
endif()

add_contrib (sqlite-cmake sqlite-amalgamation)
contrib/cld2 (new vendored submodule): 1 change

@@ -0,0 +1 @@
Subproject commit bc6d493a2f64ed1fc1c4c4b4294a542a04e04217
contrib/cld2-cmake/CMakeLists.txt (new file): 33 lines

@@ -0,0 +1,33 @@
set (LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/cld2")

set (SRCS
    "${LIBRARY_DIR}/internal/cldutil.cc"
    "${LIBRARY_DIR}/internal/compact_lang_det.cc"
    "${LIBRARY_DIR}/internal/cldutil_shared.cc"
    "${LIBRARY_DIR}/internal/compact_lang_det_hint_code.cc"
    "${LIBRARY_DIR}/internal/compact_lang_det_impl.cc"
    "${LIBRARY_DIR}/internal/debug.cc"
    "${LIBRARY_DIR}/internal/fixunicodevalue.cc"
    "${LIBRARY_DIR}/internal/generated_entities.cc"
    "${LIBRARY_DIR}/internal/generated_language.cc"
    "${LIBRARY_DIR}/internal/generated_ulscript.cc"
    "${LIBRARY_DIR}/internal/getonescriptspan.cc"
    "${LIBRARY_DIR}/internal/lang_script.cc"
    "${LIBRARY_DIR}/internal/offsetmap.cc"
    "${LIBRARY_DIR}/internal/scoreonescriptspan.cc"
    "${LIBRARY_DIR}/internal/tote.cc"
    "${LIBRARY_DIR}/internal/utf8statetable.cc"
    "${LIBRARY_DIR}/internal/cld_generated_cjk_uni_prop_80.cc"
    "${LIBRARY_DIR}/internal/cld2_generated_cjk_compatible.cc"
    "${LIBRARY_DIR}/internal/cld_generated_cjk_delta_bi_4.cc"
    "${LIBRARY_DIR}/internal/generated_distinct_bi_0.cc"
    "${LIBRARY_DIR}/internal/cld2_generated_quadchrome_2.cc"
    "${LIBRARY_DIR}/internal/cld2_generated_deltaoctachrome.cc"
    "${LIBRARY_DIR}/internal/cld2_generated_distinctoctachrome.cc"
    "${LIBRARY_DIR}/internal/cld_generated_score_quad_octa_2.cc"
)
add_library(_cld2 ${SRCS})
set_property(TARGET _cld2 PROPERTY POSITION_INDEPENDENT_CODE ON)
target_compile_options (_cld2 PRIVATE -Wno-reserved-id-macro -Wno-c++11-narrowing)
target_include_directories(_cld2 SYSTEM BEFORE PUBLIC "${LIBRARY_DIR}/public")
add_library(ch_contrib::cld2 ALIAS _cld2)
contrib/nlp-data (new vendored submodule): 1 change

@@ -0,0 +1 @@
Subproject commit 5591f91f5e748cba8fb9ef81564176feae774853
contrib/nlp-data-cmake/CMakeLists.txt (new file): 15 lines

@@ -0,0 +1,15 @@
include(${ClickHouse_SOURCE_DIR}/cmake/embed_binary.cmake)

set(LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/nlp-data")

add_library (_nlp_data INTERFACE)

clickhouse_embed_binaries(
    TARGET nlp_dictionaries
    RESOURCE_DIR "${LIBRARY_DIR}"
    RESOURCES charset.zst tonality_ru.zst programming.zst
)

add_dependencies(_nlp_data nlp_dictionaries)
target_link_libraries(_nlp_data INTERFACE "-Wl,${WHOLE_ARCHIVE} $<TARGET_FILE:nlp_dictionaries> -Wl,${NO_WHOLE_ARCHIVE}")
add_library(ch_contrib::nlp_data ALIAS _nlp_data)
contrib/orc (vendored submodule): 2 changes

@@ -1 +1 @@
Subproject commit 0a936f6bbdb9303308973073f8623b5a8d82eae1
Subproject commit f9a393ed2433a60034795284f82d093b348f2102
@@ -65,7 +65,12 @@ do
    # check if variable not empty
    [ -z "$dir" ] && continue
    # ensure directories exist
    if ! mkdir -p "$dir"; then
    if [ "$DO_CHOWN" = "1" ]; then
        mkdir="mkdir"
    else
        mkdir="$gosu mkdir"
    fi
    if ! $mkdir -p "$dir"; then
        echo "Couldn't create necessary directory: $dir"
        exit 1
    fi
@@ -78,15 +78,21 @@ When working with the `MaterializedMySQL` database engine, [ReplacingMergeTree](
| DATE, NEWDATE | [Date](../../sql-reference/data-types/date.md) |
| DATETIME, TIMESTAMP | [DateTime](../../sql-reference/data-types/datetime.md) |
| DATETIME2, TIMESTAMP2 | [DateTime64](../../sql-reference/data-types/datetime64.md) |
| YEAR | [UInt16](../../sql-reference/data-types/int-uint.md) |
| TIME | [Int64](../../sql-reference/data-types/int-uint.md) |
| ENUM | [Enum](../../sql-reference/data-types/enum.md) |
| STRING | [String](../../sql-reference/data-types/string.md) |
| VARCHAR, VAR_STRING | [String](../../sql-reference/data-types/string.md) |
| BLOB | [String](../../sql-reference/data-types/string.md) |
| GEOMETRY | [String](../../sql-reference/data-types/string.md) |
| BINARY | [FixedString](../../sql-reference/data-types/fixedstring.md) |
| BIT | [UInt64](../../sql-reference/data-types/int-uint.md) |
| SET | [UInt64](../../sql-reference/data-types/int-uint.md) |

[Nullable](../../sql-reference/data-types/nullable.md) is supported.

The data of TIME type in MySQL is converted to microseconds in ClickHouse.

Other types are not supported. If MySQL table contains a column of such type, ClickHouse throws exception "Unhandled data type" and stops replication.

## Specifics and Recommendations {#specifics-and-recommendations}
@@ -27,6 +27,7 @@ toc_title: Client Libraries
- Go
    - [clickhouse](https://github.com/kshvakov/clickhouse/)
    - [go-clickhouse](https://github.com/roistat/go-clickhouse)
    - [chconn](https://github.com/vahid-sohrabloo/chconn)
    - [mailrugo-clickhouse](https://github.com/mailru/go-clickhouse)
    - [golang-clickhouse](https://github.com/leprosus/golang-clickhouse)
- Swift
@@ -43,7 +43,7 @@ User host is a host from which a connection to ClickHouse server could be establ
- `HOST ANY` — User can connect from any location. This is a default option.
- `HOST LOCAL` — User can connect only locally.
- `HOST NAME 'fqdn'` — User host can be specified as FQDN. For example, `HOST NAME 'mysite.com'`.
- `HOST NAME REGEXP 'regexp'` — You can use [pcre](http://www.pcre.org/) regular expressions when specifying user hosts. For example, `HOST NAME REGEXP '.*\.mysite\.com'`.
- `HOST REGEXP 'regexp'` — You can use [pcre](http://www.pcre.org/) regular expressions when specifying user hosts. For example, `HOST REGEXP '.*\.mysite\.com'`.
- `HOST LIKE 'template'` — Allows you to use the [LIKE](../../../sql-reference/functions/string-search-functions.md#function-like) operator to filter the user hosts. For example, `HOST LIKE '%'` is equivalent to `HOST ANY`, `HOST LIKE '%.mysite.com'` filters all the hosts in the `mysite.com` domain.

Another way of specifying host is to use `@` syntax following the username. Examples:
@@ -3,14 +3,14 @@ toc_priority: 53
toc_title: USE
---

# USE 语句 {#use}
# USE Statement {#use}

``` sql
USE db
```

用于设置会话的当前数据库。
Lets you set the current database for the session.

如果查询语句中没有在表名前面以加点的方式指明数据库名, 则用当前数据库进行搜索。
The current database is used for searching for tables if the database is not explicitly defined in the query with a dot before the table name.

使用 HTTP 协议时无法进行此查询,因为没有会话的概念。
This query can’t be made when using the HTTP protocol, since there is no concept of a session.
docs/ko/images/column-oriented.gif (new binary file, 43 KiB): binary file not shown
docs/ko/images/logo.svg (new file, 373 B): 1 line

@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" width="54" height="48" markdown="1" viewBox="0 0 9 8"><style>.o{fill:#fc0}.r{fill:red}</style><path d="M0,7 h1 v1 h-1 z" class="r"/><path d="M0,0 h1 v7 h-1 z" class="o"/><path d="M2,0 h1 v8 h-1 z" class="o"/><path d="M4,0 h1 v8 h-1 z" class="o"/><path d="M6,0 h1 v8 h-1 z" class="o"/><path d="M8,3.25 h1 v1.5 h-1 z" class="o"/></svg>

docs/ko/images/play.png (new binary file, 26 KiB): binary file not shown
docs/ko/images/row-oriented.gif (new binary file, 38 KiB): binary file not shown
docs/ko/index.md (new file): 94 lines

@@ -0,0 +1,94 @@
---
toc_priority: 0
toc_title: 목차
---

# ClickHouse란? {#what-is-clickhouse}

ClickHouse® 는 query의 온라인 분석 처리(OLAP)를 위한 열 지향(column-oriented) 데이터베이스 관리 시스템(DBMS)입니다.

"보통의" 행 지향(row-oriented) DMBS에서는 데이터가 다음과 같은 순서로 저장됩니다.

| row | WatchID     | JavaEnable | Title              | GoodEvent | EventTime           |
|-----|-------------|------------|--------------------|-----------|---------------------|
| #0  | 89354350662 | 1          | Investor Relations | 1         | 2016-05-18 05:19:20 |
| #1  | 90329509958 | 0          | Contact us         | 1         | 2016-05-18 08:10:20 |
| #2  | 89953706054 | 1          | Mission            | 1         | 2016-05-18 07:38:00 |
| #N  | …           | …          | …                  | …         | …                   |

즉, 행과 관련된 모든 값들은 물리적으로 나란히 저장됩니다.

행 지향(row-oriented) DMBS의 예시로는 MySQL, Postgres, 그리고 MS SQL 서버 등이 있습니다.

열 지향 (column-oriented) DBMS에서는 데이터가 아래와 같은 방식으로 저장됩니다:

| Row:        | #0                  | #1                  | #2                  | #N  |
|-------------|---------------------|---------------------|---------------------|-----|
| WatchID:    | 89354350662         | 90329509958         | 89953706054         | …   |
| JavaEnable: | 1                   | 0                   | 1                   | …   |
| Title:      | Investor Relations  | Contact us          | Mission             | …   |
| GoodEvent:  | 1                   | 1                   | 1                   | …   |
| EventTime:  | 2016-05-18 05:19:20 | 2016-05-18 08:10:20 | 2016-05-18 07:38:00 | …   |

이 예에서는 데이터가 정렬된 순서만을 보여줍니다. 다른 열의 값들은 서로 분리되어 저장되고, 같은 열의 정보들은 함께 저장됩니다.

열 지향(column-oriented) DBMS 의 종류는 Vertica, Paraccel (Actian Matrix and Amazon Redshift), Sybase IQ, Exasol, Infobright, InfiniDB, MonetDB (VectorWise and Actian Vector), LucidDB, SAP HANA, Google Dremel, Google PowerDrill, Druid, 그리고 kdb+ 등이 있습니다.

데이터를 저장하기 위한 서로 다른 순서는 다른 시나리오에 더 적합합니다. 데이터 접근 시나리오는 쿼리가 수행되는 빈도, 비율 및 비율을 나타내거나, 각 쿼리 유형(행, 열 및 바이트)에 대해 읽은 데이터의 양 데이터 읽기와 업데이트 사이의 관계, 데이터의 작업 크기 및 로컬에서 사용되는 방법 트랜잭션이 사용되는지 여부, 트랜잭션이 얼마나 격리되어 있는지, 데이터 복제 및 논리적 무결성에 대한 요구 사항, 각 쿼리 유형에 대한 대기 시간 및 처리량 요구 사항 등이 있습니다.

시스템의 부하가 높을수록 사용 시나리오의 요구 사항에 맞게 시스템 설정을 사용자 지정하는 것이 더 중요하며 이 사용자 지정은 더욱 세분화됩니다. 상당히 다른 시나리오에 똑같이 적합한 시스템은 없습니다. 만약 높은 부하에서 시스템이 넓은 시나리오 집합에 대해 적응한다면 시스템은 모든 시나리오를 모두 제대로 처리하지 못하거나 가능한 시나리오 중 하나 또는 몇 개에 대해서만 잘 작동할 것입니다.

## OLAP 시나리오의 중요 속성들 {#key-properties-of-olap-scenario}

- 요청(request)의 대부분은 읽기 접근에 관한 것입니다.
- 데이터는 단일 행이 아니라 상당히 큰 일괄 처리(\> 1000개 행)로 업데이트됩니다. 또는 전혀 업데이트되지 않습니다.
- 데이터는 DB에 추가되지만 수정되지는 않습니다.
- 읽기의 경우 DB에서 상당히 많은 수의 행이 추출되지만 열은 일부만 추출됩니다.
- 테이블은 "넓습니다". 이는 열의 수가 많다는 것을 의미합니다.
- 쿼리는 상대적으로 드뭅니다(일반적으로 서버당 수백 또는 초당 쿼리 미만).
- 간단한 쿼리의 경우 약 50ms의 대기 시간이 허용됩니다.
- 열 값은 숫자와 짧은 문자열(예: URL당 60바이트)과 같이 상당히 작습니다
- 단일 쿼리를 처리할 때 높은 처리량이 필요합니다(서버당 초당 최대 수십억 행).
- 트랜잭션이 필요하지 않습니다.
- 데이터 일관성에 대한 요구 사항이 낮습니다.
- 쿼리당 하나의 큰 테이블이 존재하고 하나를 제외한 모든 테이블은 작습니다.
- 쿼리 결과가 원본 데이터보다 훨씬 작습니다. 즉, 데이터가 필터링되거나 집계되므로 결과가 단일 서버의 RAM에 꼭 들어맞습니다.

OLAP 시나리오가 다른 일반적인 시나리오(OLTP 또는 키-값 액세스와 같은)와 매우 다르다는 것을 쉽게 알 수 있습니다. 따라서 적절한 성능을 얻으려면 분석 쿼리를 처리하기 위해 OLTP 또는 키-값 DB를 사용하는 것은 의미가 없습니다. 예를 들어 분석에 MongoDB나 Redis를 사용하려고 하면 OLAP 데이터베이스에 비해 성능이 매우 저하됩니다.

## 왜 열 지향 데이터베이스가 OLAP 시나리오에 적합한가{#why-column-oriented-databases-work-better-in-the-olap-scenario}

열 지향(column-oriented) 데이터베이스는 OLAP 시나리오에 더 적합합니다. 대부분의 쿼리를 처리하는 데 있어서 행 지향(row-oriented) 데이터베이스보다 100배 이상 빠릅니다. 그 이유는 아래에 자세히 설명되어 있지만 사실은 시각적으로 더 쉽게 설명할 수 있습니다.

**행 지향 DBMS**

![Row-oriented](images/row-oriented.gif#)

**열 지향 DBMS**

![Column-oriented](images/column-oriented.gif#)

차이가 보이시나요?

### 입출력 {#inputoutput}

1. 분석 쿼리의 경우 적은 수의 테이블 열만 읽어야 합니다. 열 지향 데이터베이스에서는 필요한 데이터만 읽을 수 있습니다. 예를 들어 100개 중 5개의 열이 필요한 경우 I/O가 20배 감소할 것으로 예상할 수 있습니다.
2. 데이터는 패킷으로 읽히므로 압축하기가 더 쉽습니다. 열의 데이터도 압축하기 쉽습니다. 이것은 I/O의 볼륨을 더욱 감소시킵니다.
3. 감소된 I/O로 인해 시스템 캐시에 더 많은 데이터가 들어갑니다.

예를 들어, "각 광고 플랫폼에 대한 레코드 수 계산" 쿼리는 압축되지 않은 1바이트를 차지하는 하나의 "광고 플랫폼 ID" 열을 읽어야 합니다. 트래픽의 대부분이 광고 플랫폼에서 발생하지 않은 경우 이 열의 최소 10배 압축을 기대할 수 있습니다. 빠른 압축 알고리즘을 사용하면 초당 최소 몇 기가바이트의 압축되지 않은 데이터의 속도로 데이터 압축 해제가 가능합니다. 즉, 이 쿼리는 단일 서버에서 초당 약 수십억 행의 속도로 처리될 수 있습니다. 이 속도는 정말 실제로 달성됩니다.

### CPU {#cpu}

쿼리를 수행하려면 많은 행을 처리해야 하므로 별도의 행이 아닌 전체 벡터에 대한 모든 연산을 디스패치하거나 쿼리 엔진을 구현하여 디스패치 비용이 거의 들지 않습니다. 반쯤 괜찮은 디스크 하위 시스템에서 이렇게 하지 않으면 쿼리 인터프리터가 불가피하게 CPU를 정지시킵니다. 데이터를 열에 저장하고 가능한 경우 열별로 처리하는 것이 좋습니다.

이를 수행하기위한 두가지 방법이 있습니다.

1. 벡터 엔진. 모든 연산은 별도의 값 대신 벡터에 대해 작성됩니다. 즉, 작업을 자주 호출할 필요가 없으며 파견 비용도 무시할 수 있습니다. 작업 코드에는 최적화된 내부 주기가 포함되어 있습니다.
2. 코드 생성. 쿼리에 대해 생성된 코드에는 모든 간접 호출이 있습니다.

이것은 단순한 쿼리를 실행할 때 의미가 없기 때문에 "일반" 데이터베이스에서는 수행되지 않습니다. 그러나 예외가 있습니다. 예를 들어 MemSQL은 코드 생성을 사용하여 SQL 쿼리를 처리할 때 대기 시간을 줄입니다. (비교되게, 분석 DBMS는 대기 시간이 아닌 처리량 최적화가 필요합니다.)

CPU 효율성을 위해 쿼리 언어는 선언적(SQL 또는 MDX)이거나 최소한 벡터(J, K)여야 합니다. 쿼리는 최적화를 허용하는 암시적 루프만 포함해야 합니다.

{## [원문](https://clickhouse.com/docs/en/) ##}
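The row-versus-column storage contrast described in the (Korean) page above can be sketched in a few lines of C++. This is an illustrative layout comparison only, not ClickHouse code; the column names are taken from the page's example table:

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// Row-oriented layout: all fields of a row are adjacent in memory.
struct Row
{
    uint64_t watch_id;
    uint8_t java_enable;
    // Title, GoodEvent, EventTime, ... dozens more columns in practice.
};

int main()
{
    std::vector<Row> rows(1000, Row{42, 1});      // array of structs

    // Column-oriented layout: each column is its own contiguous array.
    std::vector<uint8_t> java_enable(1000, 1);    // just one column

    // Counting rows with java_enable = 1 must stride over whole Row
    // objects in the row layout ...
    uint64_t count_rows = 0;
    for (const auto & r : rows)
        count_rows += r.java_enable;

    // ... but touches only 1000 contiguous bytes in the column layout,
    // which is why analytical scans read far less memory.
    uint64_t count_cols = 0;
    for (uint8_t v : java_enable)
        count_cols += v;

    std::cout << count_rows << ' ' << count_cols << '\n'; // 1000 1000
}
```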
@@ -43,7 +43,7 @@ CREATE USER [IF NOT EXISTS | OR REPLACE] name1 [ON CLUSTER cluster_name1]
- `HOST ANY` — Пользователь может подключиться с любого хоста. Используется по умолчанию.
- `HOST LOCAL` — Пользователь может подключиться только локально.
- `HOST NAME 'fqdn'` — Хост задается через FQDN. Например, `HOST NAME 'mysite.com'`.
- `HOST NAME REGEXP 'regexp'` — Позволяет использовать регулярные выражения [pcre](http://www.pcre.org/), чтобы задать хосты. Например, `HOST NAME REGEXP '.*\.mysite\.com'`.
- `HOST REGEXP 'regexp'` — Позволяет использовать регулярные выражения [pcre](http://www.pcre.org/), чтобы задать хосты. Например, `HOST REGEXP '.*\.mysite\.com'`.
- `HOST LIKE 'template'` — Позволяет использовать оператор [LIKE](../../functions/string-search-functions.md#function-like) для фильтрации хостов. Например, `HOST LIKE '%'` эквивалентен `HOST ANY`; `HOST LIKE '%.mysite.com'` разрешает подключение со всех хостов в домене `mysite.com`.

Также, чтобы задать хост, вы можете использовать `@` вместе с именем пользователя. Примеры:
@@ -62,7 +62,7 @@ def build_for_lang(lang, args):
        strict=True,
        theme=theme_cfg,
        nav=blog_nav,
        copyright='©2016–2021 ClickHouse, Inc.',
        copyright='©2016–2022 ClickHouse, Inc.',
        use_directory_urls=True,
        repo_name='ClickHouse/ClickHouse',
        repo_url='https://github.com/ClickHouse/ClickHouse/',
@@ -97,10 +97,6 @@ def build_for_lang(lang, args):
        with open(os.path.join(args.blog_output_dir, lang, 'rss.xml'), 'w') as f:
            f.write(rss_template.render({'config': raw_config}))

        # TODO: AMP for blog
        # if not args.skip_amp:
        #     amp.build_amp(lang, args, cfg)

        logging.info(f'Finished building {lang} blog')

    except exceptions.ConfigurationError as e:
@@ -1 +0,0 @@
../../../en/faq/general/dbms-naming.md

docs/zh/faq/general/dbms-naming.md (new file): 17 lines

@@ -0,0 +1,17 @@
---
title: "\u201CClickHouse\u201D 有什么含义?"
toc_hidden: true
toc_priority: 10
---

# “ClickHouse” 有什么含义? {#what-does-clickhouse-mean}

它是“**点击**流”和“数据**仓库**”的组合。它来自于Yandex最初的用例。在Metrica网站上,ClickHouse本应该保存人们在互联网上的所有点击记录,现在它仍然在做这项工作。你可以在[ClickHouse history](../../introduction/history.md)页面上阅读更多关于这个用例的信息。

这个由两部分组成的意思有两个结果:

- 唯一正确的写“Click**H**ouse”的方式是用大写H。
- 如果需要缩写,请使用“**CH**”。由于一些历史原因,缩写CK在中国也很流行,主要是因为中文中最早的一个关于ClickHouse的演讲使用了这种形式。

!!! info “有趣的事实”
    多年后ClickHouse闻名于世, 这种命名方法:结合各有深意的两个词被赞扬为最好的数据库命名方式, 卡内基梅隆大学数据库副教授[Andy Pavlo做的研究](https://www.cs.cmu.edu/~pavlo/blog/2020/03/on-naming-a-database-management-system.html) 。ClickHouse与Postgres共同获得“史上最佳数据库名”奖。
@@ -1 +0,0 @@
../../../en/faq/general/index.md

docs/zh/faq/general/index.md (new file): 27 lines

@@ -0,0 +1,27 @@
---
title: ClickHouse 有关常见问题
toc_hidden_folder: true
toc_priority: 1
toc_title: General
---

# ClickHouse 有关常见问题 {#general-questions}

常见问题:

- [什么是 ClickHouse?](../../index.md#what-is-clickhouse)
- [为何 ClickHouse 如此迅捷?](../../faq/general/why-clickhouse-is-so-fast.md)
- [谁在使用 ClickHouse?](../../faq/general/who-is-using-clickhouse.md)
- [“ClickHouse” 有什么含义?](../../faq/general/dbms-naming.md)
- [“Не тормозит” 有什么含义?](../../faq/general/ne-tormozit.md)
- [什么是 OLAP?](../../faq/general/olap.md)
- [什么是列存储数据库?](../../faq/general/columnar-database.md)
- [为何不使用 MapReduce等技术?](../../faq/general/mapreduce.md)
- [我如何为 ClickHouse贡献代码?](../../faq/general/how-do-i-contribute-code-to-clickhouse.md)

!!! info "没找到您需要的内容?"
    请查阅 [其他 F.A.Q. 类别](../../faq/index.md) 或者从左侧导航栏浏览其他文档

{## [原始文档](https://clickhouse.com/docs/en/faq/general/) ##}
@@ -1 +0,0 @@
../../../en/faq/general/mapreduce.md

docs/zh/faq/general/mapreduce.md (new file): 13 lines

@@ -0,0 +1,13 @@
---
title: 为何不使用 MapReduce等技术?
toc_hidden: true
toc_priority: 110
---

# 为何不使用 MapReduce等技术? {#why-not-use-something-like-mapreduce}

我们可以将MapReduce这样的系统称为分布式计算系统,其中的reduce操作是基于分布式排序的。这个领域中最常见的开源解决方案是[Apache Hadoop](http://hadoop.apache.org)。Yandex使用其内部解决方案YT。

这些系统不适合用于在线查询,因为它们的延迟很大。换句话说,它们不能被用作网页界面的后端。这些类型的系统对于实时数据更新并不是很有用。如果操作的结果和所有中间结果(如果有的话)都位于单个服务器的内存中,那么分布式排序就不是执行reduce操作的最佳方式,这通常是在线查询的情况。在这种情况下,哈希表是执行reduce操作的最佳方式。优化map-reduce任务的一种常见方法是使用内存中的哈希表进行预聚合(部分reduce)。用户手动执行此优化。在运行简单的map-reduce任务时,分布式排序是导致性能下降的主要原因之一。

大多数MapReduce实现允许你在集群中执行任意代码。但是声明性查询语言更适合于OLAP,以便快速运行实验。例如,Hadoop有Hive和Pig。还可以考虑使用Cloudera Impala或Shark(已经过时了)来支持Spark,以及Spark SQL、Presto和Apache Drill。与专门的系统相比,运行这些任务的性能是非常不理想的,但是相对较高的延迟使得使用这些系统作为web界面的后端是不现实的。
@@ -19,6 +19,7 @@ toc_priority: 76
    - [什么是 OLAP?](../faq/general/olap.md)
    - [什么是列存储数据库?](../faq/general/columnar-database.md)
    - [为何不使用 MapReduce等技术?](../faq/general/mapreduce.md)
    - [我如何为 ClickHouse贡献代码?](../faq/general/how-do-i-contribute-code-to-clickhouse.md)
- **[应用案例](../faq/use-cases/index.md)**
    - [我能把 ClickHouse 作为时序数据库来使用吗?](../faq/use-cases/time-series.md)
    - [我能把 ClickHouse 作为 key-value 键值存储吗?](../faq/use-cases/key-value.md)
@@ -1,59 +1,59 @@
---
toc_priority: 44
toc_title: "要求"
toc_title: "必备条件"
---

# 要求 {#requirements}
# 必备条件 {#requirements}

## CPU {#cpu}

对于从预构建的deb包进行安装,请使用具有x86_64架构并支持SSE4.2指令的CPU。 要使用不支持SSE4.2或具有AArch64或PowerPC64LE体系结构的处理器运行ClickHouse,您应该从源代码构建ClickHouse。
如果您使用预编译的DEB/RPM包安装ClickHouse,请使用支持SSE4.2指令集的x86_64架构的CPU。如果需要在不支持SSE4.2指令集的CPU上,或者在AArch64(ARM)和PowerPC64LE(IBM Power)架构上运行ClickHouse,您应该从源码编译ClickHouse。

ClickHouse实现并行数据处理并使用所有可用的硬件资源。 在选择处理器时,考虑到ClickHouse在具有大量内核但时钟速率较低的配置中的工作效率要高于具有较少内核和较高时钟速率的配置。 例如,具有2600MHz的16核心优于具有3600MHz的8核心。
ClickHouse实现了并行数据处理,处理时会使用所有的可用资源。在选择处理器时,请注意:ClickHouse在具有大量计算核、时钟频率稍低的平台上比计算核少、时钟频率高的平台上效率更高。例如,ClickHouse在16核 2.6GHz的CPU上运行速度高于8核 3.6GHz的CPU。

建议使用 **睿频加速** 和 **超线程** 技术。 它显着提高了典型工作负载的性能。
建议使用 **睿频加速** 和 **超线程** 技术。 它显着提高了正常工作负载的性能。

## RAM {#ram}

我们建议使用至少4GB的RAM来执行重要的查询。 ClickHouse服务器可以使用少得多的RAM运行,但它需要处理查询的内存。
我们建议使用至少4GB的内存来执行重要的查询。 ClickHouse服务器可以使用很少的内存运行,但它需要一定量的内存用于处理查询。

RAM所需的体积取决于:
ClickHouse所需内存取决于:

- 查询的复杂性。
- 查询中处理的数据量。
- 查询的复杂程度。
- 查询处理的数据量。

要计算所需的RAM体积,您应该估计临时数据的大小 [GROUP BY](../sql-reference/statements/select/group-by.md#select-group-by-clause), [DISTINCT](../sql-reference/statements/select/distinct.md#select-distinct), [JOIN](../sql-reference/statements/select/join.md#select-join) 和您使用的其他操作。
要计算所需的内存大小,您应该考虑用于[GROUP BY](../sql-reference/statements/select/group-by.md#select-group-by-clause)、[DISTINCT](../sql-reference/statements/select/distinct.md#select-distinct)、[JOIN](../sql-reference/statements/select/join.md#select-join) 和其他操作所需的临时数据量。

ClickHouse可以使用外部存储器来存储临时数据。看 [在外部存储器中分组](../sql-reference/statements/select/group-by.md#select-group-by-in-external-memory) 有关详细信息。
ClickHouse可以使用外部存储器来存储临时数据。详情请见[在外部存储器中分组](../sql-reference/statements/select/group-by.md#select-group-by-in-external-memory)。

## 交换文件 {#swap-file}

禁用生产环境的交换文件。
请在生产环境禁用交换文件。

## 存储子系统 {#storage-subsystem}

您需要有2GB的可用磁盘空间来安装ClickHouse。

数据所需的存储量应单独计算。 评估应包括:
数据所需的存储空间应单独计算。预估存储容量时请考虑:

- 估计数据量。
- 数据量

您可以采取数据的样本并从中获取行的平均大小。 然后将该值乘以计划存储的行数。
您可以对数据进行采样并计算每行的平均占用空间。然后将该值乘以计划存储的行数。

- 数据压缩系数。
- 数据压缩比

要估计数据压缩系数,请将数据的样本加载到ClickHouse中,并将数据的实际大小与存储的表的大小进行比较。 例如,点击流数据通常被压缩6-10倍。
要计算数据压缩比,请将样本数据写入ClickHouse,并将原始数据大小与ClickHouse实际存储的数据进行比较。例如,用户点击行为的原始数据压缩比通常为6-10。

要计算要存储的最终数据量,请将压缩系数应用于估计的数据量。 如果计划将数据存储在多个副本中,则将估计的量乘以副本数。
请将原始数据的大小除以压缩比来获得实际所需存储的大小。如果您打算将数据存放于几个副本中,请将存储容量乘上副本数。

## 网络 {#network}

如果可能的话,使用10G或更高级别的网络。
如果可能的话,请使用10G或更高级别的网络。

网络带宽对于处理具有大量中间结果数据的分布式查询至关重要。 此外,网络速度会影响复制过程。
网络带宽对于处理具有大量中间结果数据的分布式查询至关重要。此外,网络速度会影响复制过程。

## 软件 {#software}

ClickHouse主要是为Linux系列操作系统开发的。 推荐的Linux发行版是Ubuntu。 `tzdata` 软件包应安装在系统中。
ClickHouse主要是为Linux系列操作系统开发的。推荐的Linux发行版是Ubuntu。您需要检查`tzdata`(对于Ubuntu)软件包是否在安装ClickHouse之前已经安装。

ClickHouse也可以在其他操作系统系列中工作。 查看详细信息 [开始](../getting-started/index.md) 文档的部分。
ClickHouse也可以在其他操作系统系列中工作。详情请查看[开始](../getting-started/index.md)。
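The capacity estimate this page describes (average row size times row count, divided by the compression ratio, times the replica count) works out as follows. All figures below are hypothetical; only the 6-10x compression range for clickstream data comes from the page:

```cpp
#include <iostream>

int main()
{
    // Hypothetical workload: 10 billion rows at ~200 bytes per row raw.
    double rows = 10e9;
    double bytes_per_row = 200;
    double compression = 8;   // clickstream data typically compresses 6-10x
    double replicas = 3;

    double raw_bytes = rows * bytes_per_row;             // 2e12 B = 2 TB raw
    double stored = raw_bytes / compression * replicas;  // 0.25 TB per replica

    std::cout << stored / 1e12 << " TB\n";               // prints 0.75 TB
}
```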
@@ -1,23 +1,74 @@
---
toc_hidden_folder: true
toc_priority: 42
toc_title: INDEX
toc_priority: 35
toc_title: ALTER
---

# 操作数据跳过索引 {#manipulations-with-data-skipping-indices}
## ALTER {#query_language_queries_alter}

可以使用以下操作:
大多数 `ALTER TABLE` 查询修改表设置或数据:

- `ALTER TABLE [db].name ADD INDEX name expression TYPE type GRANULARITY value [FIRST|AFTER name]` - 向表元数据添加索引描述。
- [COLUMN](../../../sql-reference/statements/alter/column.md)
- [PARTITION](../../../sql-reference/statements/alter/partition.md)
- [DELETE](../../../sql-reference/statements/alter/delete.md)
- [UPDATE](../../../sql-reference/statements/alter/update.md)
- [ORDER BY](../../../sql-reference/statements/alter/order-by.md)
- [INDEX](../../../sql-reference/statements/alter/index/index.md)
- [CONSTRAINT](../../../sql-reference/statements/alter/constraint.md)
- [TTL](../../../sql-reference/statements/alter/ttl.md)

- `ALTER TABLE [db].name DROP INDEX name` - 从表元数据中删除索引描述并从磁盘中删除索引文件。
!!! note "备注"
    大多数 `ALTER TABLE` 查询只支持[\*MergeTree](../../../engines/table-engines/mergetree-family/index.md)表,以及[Merge](../../../engines/table-engines/special/merge.md)和[Distributed](../../../engines/table-engines/special/distributed.md)。

- `ALTER TABLE [db.]table MATERIALIZE INDEX name IN PARTITION partition_name` - 查询在分区`partition_name`中重建二级索引`name`。 操作为[mutation](../../../sql-reference/statements/alter/index.md#mutations).
这些 `ALTER` 语句操作视图:

前两个命令是轻量级的,它们只更改元数据或删除文件。
- [ALTER TABLE ... MODIFY QUERY](../../../sql-reference/statements/alter/view.md) — 修改一个 [Materialized view](../create/view.md#materialized) 结构.
- [ALTER LIVE VIEW](../../../sql-reference/statements/alter/view.md#alter-live-view) — 刷新一个 [Live view](../create/view.md#live-view).

Also, they are replicated, syncing indices metadata via ZooKeeper.
此外,它们会被复制,会通过ZooKeeper同步索引元数据。
这些 `ALTER` 语句修改与基于角色的访问控制相关的实体:

!!! note "注意"
    索引操作仅支持具有以下特征的表 [`*MergeTree`](../../../engines/table-engines/mergetree-family/mergetree.md)引擎 (包括[replicated](../../../engines/table-engines/mergetree-family/replication.md)).
- [USER](../../../sql-reference/statements/alter/user.md)
- [ROLE](../../../sql-reference/statements/alter/role.md)
- [QUOTA](../../../sql-reference/statements/alter/quota.md)
- [ROW POLICY](../../../sql-reference/statements/alter/row-policy.md)
- [SETTINGS PROFILE](../../../sql-reference/statements/alter/settings-profile.md)

[ALTER TABLE ... MODIFY COMMENT](../../../sql-reference/statements/alter/comment.md) 语句添加、修改或删除表中的注释,无论之前是否设置过。

## Mutations 突变 {#mutations}

用来操作表数据的ALTER查询是通过一种叫做“突变”的机制来实现的,最明显的是[ALTER TABLE … DELETE](../../../sql-reference/statements/alter/delete.md)和[ALTER TABLE … UPDATE](../../../sql-reference/statements/alter/update.md)。它们是异步的后台进程,类似于[MergeTree](../../../engines/table-engines/mergetree-family/index.md)表的合并,产生新的“突变”版本的部件。

对于 `*MergeTree` 表,通过重写整个数据部分来执行突变。没有原子性——一旦突变的部件准备好,部件就会被替换,并且在突变期间开始执行的 `SELECT` 查询将看到来自已经突变的部件的数据,以及来自尚未突变的部件的数据。

突变完全按照它们的产生顺序排列,并按此顺序应用于每个部分。突变还与“INSERT INTO”查询进行部分排序:在提交突变之前插入表中的数据将被突变,而在此之后插入的数据将不会被突变。注意,突变不会以任何方式阻止插入。

突变查询在添加突变条目后立即返回(对于复制表到ZooKeeper,对于非复制表到文件系统)。突变本身使用系统配置文件设置异步执行。要跟踪突变的进程,可以使用[`system.mutations`](../../../operations/system-tables/mutations.md#system_tables-mutations) 表。成功提交的变异将继续执行,即使ClickHouse服务器重新启动。没有办法回滚突变一旦提交,但如果突变卡住了,它可以取消与[`KILL MUTATION`](../../../sql-reference/statements/misc.md#kill-mutation) 查询。

完成突变的条目不会立即删除(保留条目的数量由 `finished_mutations_to_keep` 存储引擎参数决定)。删除旧的突变条目。

## ALTER 查询的同步性 {#synchronicity-of-alter-queries}

对于非复制表,所有的 `ALTER` 查询都是同步执行的。对于复制表,查询只是向“ZooKeeper”添加相应动作的指令,动作本身会尽快执行。但是,查询可以等待所有副本上的这些操作完成。

对于所有的“ALTER”查询,您可以使用[replication_alter_partitions_sync](../../../operations/settings/settings.md#replication-alter-partitions-sync)设置等待。

通过[replication_wait_for_inactive_replica_timeout](../../../operations/settings/settings.md#replication-wait-for-inactive-replica-timeout)设置,可以指定不活动的副本执行所有 `ALTER` 查询的等待时间(以秒为单位)。

!!! info "备注"
    对于所有的 `ALTER` 查询,如果 `replication_alter_partitions_sync = 2` 和一些副本的不激活时间超过时间(在 `replication_wait_for_inactive_replica_timeout` 设置中指定),那么将抛出一个异常 `UNFINISHED`。

对于 `ALTER TABLE ... UPDATE|DELETE` 查询由 [mutations_sync](../../../operations/settings/settings.md#mutations_sync) 设置定义的同步度。
@@ -1 +0,0 @@
../../../../../en/sql-reference/statements/alter/index/index.md

docs/zh/sql-reference/statements/alter/index/index.md (new file): 23 lines

@@ -0,0 +1,23 @@
---
toc_hidden_folder: true
toc_priority: 42
toc_title: INDEX
---

# 操作数据跳过索引 {#manipulations-with-data-skipping-indices}

可以使用以下操作:

- `ALTER TABLE [db].name ADD INDEX name expression TYPE type GRANULARITY value [FIRST|AFTER name]` - 向表元数据添加索引描述。

- `ALTER TABLE [db].name DROP INDEX name` - 从表元数据中删除索引描述并从磁盘中删除索引文件。

- `ALTER TABLE [db.]table MATERIALIZE INDEX name IN PARTITION partition_name` - 查询在分区`partition_name`中重建二级索引`name`。 操作为[mutation](../../../../sql-reference/statements/alter/index.md#mutations).

前两个命令是轻量级的,它们只更改元数据或删除文件。

Also, they are replicated, syncing indices metadata via ZooKeeper.
此外,它们会被复制,会通过ZooKeeper同步索引元数据。

!!! note "注意"
    索引操作仅支持具有以下特征的表 [`*MergeTree`](../../../../engines/table-engines/mergetree-family/mergetree.md)引擎 (包括[replicated](../../../../engines/table-engines/mergetree-family/replication.md)).
@@ -1 +0,0 @@
../../../en/sql-reference/statements/exists.md

docs/zh/sql-reference/statements/exists.md (new file): 12 lines

@@ -0,0 +1,12 @@
---
toc_priority: 45
toc_title: EXISTS
---

# EXISTS 语句 {#exists-statement}

``` sql
EXISTS [TEMPORARY] [TABLE|DICTIONARY] [db.]name [INTO OUTFILE filename] [FORMAT format]
```

返回一个单独的 `UInt8`类型的列,如果表或数据库不存在,则包含一个值 `0`,如果表在指定的数据库中存在,则包含一个值 `1`。
@@ -1 +0,0 @@
../../../en/sql-reference/statements/set.md

docs/zh/sql-reference/statements/set.md (new file): 23 lines

@@ -0,0 +1,23 @@
---
toc_priority: 50
toc_title: SET
---

# SET 语句 {#query-set}

``` sql
SET param = value
```

给当前会话的 `param` [配置项](../../operations/settings/index.md)赋值。你不能用这样的方式修改[服务器相关设置](../../operations/server-configuration-parameters/index.md)。

您还可以在单个查询中设置指定设置配置文件中的所有值。

``` sql
SET profile = 'profile-name-from-the-settings-file'
```

更多详情, 详见 [配置项](../../operations/settings/settings.md).
@@ -1 +0,0 @@
../../../en/sql-reference/statements/use.md

docs/zh/sql-reference/statements/use.md (new file): 16 lines

@@ -0,0 +1,16 @@
---
toc_priority: 53
toc_title: USE
---

# USE 语句 {#use}

``` sql
USE db
```

用于设置会话的当前数据库。

如果查询语句中没有在表名前面以加点的方式指明数据库名, 则用当前数据库进行搜索。

使用 HTTP 协议时无法进行此查询,因为没有会话的概念。
@@ -364,7 +364,9 @@ int mainEntryClickHouseInstall(int argc, char ** argv)
        "clickhouse-git-import",
        "clickhouse-compressor",
        "clickhouse-format",
        "clickhouse-extract-from-config"
        "clickhouse-extract-from-config",
        "clickhouse-keeper",
        "clickhouse-keeper-converter",
    };

    for (const auto & tool : tools)
@@ -327,6 +327,7 @@ std::string LocalServer::getInitialCreateTableQuery()
    {
        /// Use Unix tools stdin naming convention
        table_file = "stdin";
        format_from_file_name = FormatFactory::instance().getFormatFromFileDescriptor(STDIN_FILENO);
    }
    else
    {
@@ -72,7 +72,7 @@ private:
    using Base = AggregateFunctionNullBase<result_is_nullable, serialize_flag,
        AggregateFunctionIfNullUnary<result_is_nullable, serialize_flag>>;

    inline bool singleFilter(const IColumn ** columns, size_t row_num, size_t num_arguments) const
    inline bool singleFilter(const IColumn ** columns, size_t row_num) const
    {
        const IColumn * filter_column = columns[num_arguments - 1];

@@ -112,7 +112,7 @@ public:
    {
        const ColumnNullable * column = assert_cast<const ColumnNullable *>(columns[0]);
        const IColumn * nested_column = &column->getNestedColumn();
        if (!column->isNullAt(row_num) && singleFilter(columns, row_num, num_arguments))
        if (!column->isNullAt(row_num) && singleFilter(columns, row_num))
        {
            this->setFlag(place);
            this->nested_function->add(this->nestedPlace(place), &nested_column, row_num, arena);
@@ -17,15 +17,11 @@ class AggregateFunctionSimpleState final : public IAggregateFunctionHelper<Aggre
{
private:
    AggregateFunctionPtr nested_func;
    DataTypes arguments;
    Array params;

public:
    AggregateFunctionSimpleState(AggregateFunctionPtr nested_, const DataTypes & arguments_, const Array & params_)
        : IAggregateFunctionHelper<AggregateFunctionSimpleState>(arguments_, params_)
        , nested_func(nested_)
        , arguments(arguments_)
        , params(params_)
    {
    }

@@ -35,18 +31,19 @@ public:
    {
        DataTypeCustomSimpleAggregateFunction::checkSupportedFunctions(nested_func);

        // Need to make a clone because it'll be customized.
        auto storage_type = DataTypeFactory::instance().get(nested_func->getReturnType()->getName());

        // Need to make a clone to avoid recursive reference.
        auto storage_type_out = DataTypeFactory::instance().get(nested_func->getReturnType()->getName());
        // Need to make a new function with promoted argument types because SimpleAggregates requires arg_type = return_type.
        AggregateFunctionProperties properties;
        auto function
            = AggregateFunctionFactory::instance().get(nested_func->getName(), {storage_type}, nested_func->getParameters(), properties);
            = AggregateFunctionFactory::instance().get(nested_func->getName(), {storage_type_out}, nested_func->getParameters(), properties);

        // Need to make a clone because it'll be customized.
        auto storage_type_arg = DataTypeFactory::instance().get(nested_func->getReturnType()->getName());
        DataTypeCustomNamePtr custom_name
            = std::make_unique<DataTypeCustomSimpleAggregateFunction>(function, DataTypes{nested_func->getReturnType()}, params);
        storage_type->setCustomization(std::make_unique<DataTypeCustomDesc>(std::move(custom_name), nullptr));
        return storage_type;
            = std::make_unique<DataTypeCustomSimpleAggregateFunction>(function, DataTypes{nested_func->getReturnType()}, parameters);
        storage_type_arg->setCustomization(std::make_unique<DataTypeCustomDesc>(std::move(custom_name), nullptr));
        return storage_type_arg;
    }

    bool isVersioned() const override
@@ -20,13 +20,12 @@ class AggregateFunctionState final : public IAggregateFunctionHelper<AggregateFu
{
private:
    AggregateFunctionPtr nested_func;
    DataTypes arguments;
    Array params;

public:
    AggregateFunctionState(AggregateFunctionPtr nested_, const DataTypes & arguments_, const Array & params_)
        : IAggregateFunctionHelper<AggregateFunctionState>(arguments_, params_)
        , nested_func(nested_), arguments(arguments_), params(params_) {}
        , nested_func(nested_)
    {}

    String getName() const override
    {
@@ -226,7 +226,7 @@ public:
    {
        // FIXME why is storing NearestFieldType not enough, and we
        // have to check for decimals again here?
        UInt32 scale = static_cast<const ColumnDecimal<T> &>(key_column).getData().getScale();
        UInt32 scale = static_cast<const ColumnDecimal<T> &>(key_column).getScale();
        it = merged_maps.find(DecimalField<T>(key, scale));
    }
    else
@@ -251,7 +251,7 @@ public:

    if constexpr (is_decimal<T>)
    {
        UInt32 scale = static_cast<const ColumnDecimal<T> &>(key_column).getData().getScale();
        UInt32 scale = static_cast<const ColumnDecimal<T> &>(key_column).getScale();
        merged_maps.emplace(DecimalField<T>(key, scale), std::move(new_values));
    }
    else
@@ -506,6 +506,7 @@ if (ENABLE_NLP)
    dbms_target_link_libraries (PUBLIC ch_contrib::stemmer)
    dbms_target_link_libraries (PUBLIC ch_contrib::wnb)
    dbms_target_link_libraries (PUBLIC ch_contrib::lemmagen)
    dbms_target_link_libraries (PUBLIC ch_contrib::nlp_data)
endif()

if (TARGET ch_contrib::bzip2)
@@ -558,3 +559,4 @@ if (ENABLE_TESTS)

    add_check(unit_tests_dbms)
endif ()
@@ -48,6 +48,7 @@
#include <Parsers/ASTQueryWithOutput.h>
#include <Parsers/ASTLiteral.h>
#include <Parsers/ASTIdentifier.h>
#include <Parsers/ASTColumnDeclaration.h>

#include <Processors/Formats/Impl/NullFormat.h>
#include <Processors/Formats/IInputFormat.h>
@@ -552,6 +553,25 @@ void ClientBase::initLogsOutputStream()
    }
}

void ClientBase::updateSuggest(const ASTCreateQuery & ast_create)
{
    std::vector<std::string> new_words;

    if (ast_create.database)
        new_words.push_back(ast_create.getDatabase());
    new_words.push_back(ast_create.getTable());

    if (ast_create.columns_list && ast_create.columns_list->columns)
    {
        for (const auto & elem : ast_create.columns_list->columns->children)
        {
            if (const auto * column = elem->as<ASTColumnDeclaration>())
                new_words.push_back(column->name);
        }
    }

    suggest->addWords(std::move(new_words));
}

void ClientBase::processTextAsSingleQuery(const String & full_query)
{
@@ -565,6 +585,18 @@ void ClientBase::processTextAsSingleQuery(const String & full_query)

    String query_to_execute;

    /// Query will be parsed before checking the result because error does not
    /// always means a problem, i.e. if table already exists, and it is no a
    /// huge problem if suggestion will be added even on error, since this is
    /// just suggestion.
    if (auto * create = parsed_query->as<ASTCreateQuery>())
    {
        /// Do not update suggest, until suggestion will be ready
        /// (this will avoid extra complexity)
        if (suggest)
            updateSuggest(*create);
    }

    // An INSERT query may have the data that follow query text. Remove the
    /// Send part of query without data, because data will be sent separately.
    auto * insert = parsed_query->as<ASTInsertQuery>();
@@ -1463,7 +1495,6 @@ void ClientBase::runInteractive()
    /// Initialize DateLUT here to avoid counting time spent here as query execution time.
    const auto local_tz = DateLUT::instance().getTimeZone();

    std::optional<Suggest> suggest;
    suggest.emplace();
    if (load_suggestions)
    {
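The idea behind `updateSuggest` above is simply to walk the parsed `CREATE` statement and harvest every identifier it introduces for the completion dictionary. A toy sketch of that traversal with hand-rolled stand-in types (ClickHouse's real `ASTCreateQuery` is far richer; nothing here is its actual API):

```cpp
#include <iostream>
#include <string>
#include <vector>

// Stand-ins for the parsed AST; these are not ClickHouse's types.
struct ColumnDecl { std::string name; };
struct CreateQuery
{
    std::string database;  // empty if not specified
    std::string table;
    std::vector<ColumnDecl> columns;
};

// Collect every identifier the statement introduces.
std::vector<std::string> suggestWordsFrom(const CreateQuery & q)
{
    std::vector<std::string> words;
    if (!q.database.empty())
        words.push_back(q.database);
    words.push_back(q.table);
    for (const auto & col : q.columns)
        words.push_back(col.name);
    return words;
}

int main()
{
    CreateQuery q{"db1", "events", {{"watch_id"}, {"title"}}};
    for (const auto & w : suggestWordsFrom(q))
        std::cout << w << '\n'; // db1, events, watch_id, title
}
```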
@@ -136,6 +136,8 @@ private:
    void readArguments(int argc, char ** argv, Arguments & common_arguments, std::vector<Arguments> & external_tables_arguments);
    void parseAndCheckOptions(OptionsDescription & options_description, po::variables_map & options, Arguments & arguments);

    void updateSuggest(const ASTCreateQuery & ast_create);

protected:
    bool is_interactive = false; /// Use either interactive line editing interface or batch mode.
    bool is_multiquery = false;
@@ -144,6 +146,8 @@ protected:
    bool echo_queries = false; /// Print queries before execution in batch mode.
    bool ignore_error = false; /// In case of errors, don't print error message, continue to next query. Only applicable for non-interactive mode.
    bool print_time_to_stderr = false; /// Output execution time to stderr in batch mode.

    std::optional<Suggest> suggest;
    bool load_suggestions = false;

    std::vector<String> queries_files; /// If not empty, queries will be read from these files
@@ -29,19 +29,21 @@ namespace ErrorCodes
Suggest::Suggest()
{
    /// Keywords may be not up to date with ClickHouse parser.
    words = {"CREATE", "DATABASE", "IF", "NOT", "EXISTS", "TEMPORARY", "TABLE", "ON", "CLUSTER", "DEFAULT",
             "MATERIALIZED", "ALIAS", "ENGINE", "AS", "VIEW", "POPULATE", "SETTINGS", "ATTACH", "DETACH", "DROP",
             "RENAME", "TO", "ALTER", "ADD", "MODIFY", "CLEAR", "COLUMN", "AFTER", "COPY", "PROJECT",
             "PRIMARY", "KEY", "CHECK", "PARTITION", "PART", "FREEZE", "FETCH", "FROM", "SHOW", "INTO",
             "OUTFILE", "FORMAT", "TABLES", "DATABASES", "LIKE", "PROCESSLIST", "CASE", "WHEN", "THEN", "ELSE",
             "END", "DESCRIBE", "DESC", "USE", "SET", "OPTIMIZE", "FINAL", "DEDUPLICATE", "INSERT", "VALUES",
             "SELECT", "DISTINCT", "SAMPLE", "ARRAY", "JOIN", "GLOBAL", "LOCAL", "ANY", "ALL", "INNER",
             "LEFT", "RIGHT", "FULL", "OUTER", "CROSS", "USING", "PREWHERE", "WHERE", "GROUP", "BY",
             "WITH", "TOTALS", "HAVING", "ORDER", "COLLATE", "LIMIT", "UNION", "AND", "OR", "ASC",
             "IN", "KILL", "QUERY", "SYNC", "ASYNC", "TEST", "BETWEEN", "TRUNCATE", "USER", "ROLE",
             "PROFILE", "QUOTA", "POLICY", "ROW", "GRANT", "REVOKE", "OPTION", "ADMIN", "EXCEPT", "REPLACE",
             "IDENTIFIED", "HOST", "NAME", "READONLY", "WRITABLE", "PERMISSIVE", "FOR", "RESTRICTIVE", "RANDOMIZED",
             "INTERVAL", "LIMITS", "ONLY", "TRACKING", "IP", "REGEXP", "ILIKE"};
    addWords({
        "CREATE", "DATABASE", "IF", "NOT", "EXISTS", "TEMPORARY", "TABLE", "ON", "CLUSTER", "DEFAULT",
        "MATERIALIZED", "ALIAS", "ENGINE", "AS", "VIEW", "POPULATE", "SETTINGS", "ATTACH", "DETACH", "DROP",
        "RENAME", "TO", "ALTER", "ADD", "MODIFY", "CLEAR", "COLUMN", "AFTER", "COPY", "PROJECT",
        "PRIMARY", "KEY", "CHECK", "PARTITION", "PART", "FREEZE", "FETCH", "FROM", "SHOW", "INTO",
        "OUTFILE", "FORMAT", "TABLES", "DATABASES", "LIKE", "PROCESSLIST", "CASE", "WHEN", "THEN", "ELSE",
        "END", "DESCRIBE", "DESC", "USE", "SET", "OPTIMIZE", "FINAL", "DEDUPLICATE", "INSERT", "VALUES",
        "SELECT", "DISTINCT", "SAMPLE", "ARRAY", "JOIN", "GLOBAL", "LOCAL", "ANY", "ALL", "INNER",
        "LEFT", "RIGHT", "FULL", "OUTER", "CROSS", "USING", "PREWHERE", "WHERE", "GROUP", "BY",
        "WITH", "TOTALS", "HAVING", "ORDER", "COLLATE", "LIMIT", "UNION", "AND", "OR", "ASC",
        "IN", "KILL", "QUERY", "SYNC", "ASYNC", "TEST", "BETWEEN", "TRUNCATE", "USER", "ROLE",
        "PROFILE", "QUOTA", "POLICY", "ROW", "GRANT", "REVOKE", "OPTION", "ADMIN", "EXCEPT", "REPLACE",
        "IDENTIFIED", "HOST", "NAME", "READONLY", "WRITABLE", "PERMISSIVE", "FOR", "RESTRICTIVE", "RANDOMIZED",
        "INTERVAL", "LIMITS", "ONLY", "TRACKING", "IP", "REGEXP", "ILIKE",
    });
}

static String getLoadSuggestionQuery(Int32 suggestion_limit, bool basic_suggestion)
@@ -124,18 +126,6 @@ void Suggest::load(ContextPtr context, const ConnectionParameters & connection_p
        }

        /// Note that keyword suggestions are available even if we cannot load data from server.

        std::sort(words.begin(), words.end());
        words_no_case = words;
        std::sort(words_no_case.begin(), words_no_case.end(), [](const std::string & str1, const std::string & str2)
        {
            return std::lexicographical_compare(begin(str1), end(str1), begin(str2), end(str2), [](const char char1, const char char2)
            {
                return std::tolower(char1) < std::tolower(char2);
            });
        });

        ready = true;
    });
}

@@ -190,8 +180,14 @@ void Suggest::fillWordsFromBlock(const Block & block)
    const ColumnString & column = typeid_cast<const ColumnString &>(*block.getByPosition(0).column);

    size_t rows = block.rows();

    Words new_words;
    new_words.reserve(rows);
    for (size_t i = 0; i < rows; ++i)
        words.emplace_back(column.getDataAt(i).toString());
    {
        new_words.emplace_back(column.getDataAt(i).toString());
    }
    addWords(std::move(new_words));
}

template
@@ -1,5 +1,3 @@
#include <string.h> // memcpy

#include <Columns/ColumnArray.h>
#include <Columns/ColumnsNumber.h>
#include <Columns/ColumnString.h>
@@ -9,12 +7,7 @@
#include <Columns/ColumnsCommon.h>
#include <Columns/ColumnCompressed.h>
#include <Columns/MaskOperations.h>

#include <base/unaligned.h>
#include <base/sort.h>

#include <Processors/Transforms/ColumnGathererTransform.h>

#include <Common/Exception.h>
#include <Common/Arena.h>
#include <Common/SipHash.h>
@@ -22,6 +15,8 @@
#include <Common/assert_cast.h>
#include <Common/WeakHash.h>
#include <Common/HashTable/Hash.h>
#include <base/unaligned.h>
#include <cstring> // memcpy


namespace DB
@@ -127,18 +122,8 @@ size_t ColumnArray::size() const

Field ColumnArray::operator[](size_t n) const
{
    size_t offset = offsetAt(n);
    size_t size = sizeAt(n);

    if (size > max_array_size_as_field)
        throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Array of size {} is too large to be manipulated as single field, maximum size {}",
            size, max_array_size_as_field);

    Array res(size);

    for (size_t i = 0; i < size; ++i)
        res[i] = getData()[offset + i];

    Field res;
    get(n, res);
    return res;
}

@@ -152,11 +137,12 @@ void ColumnArray::get(size_t n, Field & res) const
        throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Array of size {} is too large to be manipulated as single field, maximum size {}",
            size, max_array_size_as_field);

    res = Array(size);
    res = Array();
    Array & res_arr = DB::get<Array &>(res);
    res_arr.reserve(size);

    for (size_t i = 0; i < size; ++i)
        getData().get(offset + i, res_arr[i]);
        res_arr.push_back(getData()[offset + i]);
}
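The `operator[]` rewrite above is the classic "delegate to the checked accessor" deduplication: the size-limit check and element materialization now live only in `get()`. A schematic version of the pattern, with toy types rather than the real `ColumnArray`:

```cpp
#include <iostream>
#include <stdexcept>
#include <vector>

struct ToyColumn
{
    std::vector<int> data;
    static constexpr size_t max_size_as_field = 1000;

    // The single place that validates and produces an element.
    void get(size_t n, int & res) const
    {
        if (n >= data.size() || data.size() > max_size_as_field)
            throw std::runtime_error("too large to be manipulated as single field");
        res = data[n];
    }

    // operator[] reuses get() instead of repeating the checks.
    int operator[](size_t n) const
    {
        int res;
        get(n, res);
        return res;
    }
};

int main()
{
    ToyColumn col{{10, 20, 30}};
    std::cout << col[1] << '\n'; // 20
}
```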
@ -32,12 +32,6 @@ namespace ErrorCodes
extern const int LOGICAL_ERROR;
}

template class DecimalPaddedPODArray<Decimal32>;
template class DecimalPaddedPODArray<Decimal64>;
template class DecimalPaddedPODArray<Decimal128>;
template class DecimalPaddedPODArray<Decimal256>;
template class DecimalPaddedPODArray<DateTime64>;

template <is_decimal T>
int ColumnDecimal<T>::compareAt(size_t n, size_t m, const IColumn & rhs_, int) const
{
@ -131,19 +125,6 @@ void ColumnDecimal<T>::updateHashFast(SipHash & hash) const
template <is_decimal T>
void ColumnDecimal<T>::getPermutation(bool reverse, size_t limit, int , IColumn::Permutation & res) const
{
#if 1 /// TODO: perf test
if (data.size() <= std::numeric_limits<UInt32>::max())
{
PaddedPODArray<UInt32> tmp_res;
permutation(reverse, limit, tmp_res);

res.resize(tmp_res.size());
for (size_t i = 0; i < tmp_res.size(); ++i)
res[i] = tmp_res[i];
return;
}
#endif

permutation(reverse, limit, res);
}

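The removed #if 1 block sorted the permutation through 32-bit indexes whenever the column fits, and only then widened to the full-size result. A minimal standalone sketch of the same idea, assuming std::vector in place of PaddedPODArray and a plain comparison sort (the real getPermutation also handles reverse order and limit):

#include <algorithm>
#include <cstdint>
#include <limits>
#include <numeric>
#include <vector>

/// Sort indexes as uint32_t while the column fits, then widen to uint64_t:
/// half-width indexes halve memory traffic during the sort itself.
template <typename T>
std::vector<uint64_t> sortedPermutation(const std::vector<T> & data)
{
    std::vector<uint64_t> res(data.size());
    if (data.size() <= std::numeric_limits<uint32_t>::max())
    {
        std::vector<uint32_t> tmp(data.size());
        std::iota(tmp.begin(), tmp.end(), 0);
        std::sort(tmp.begin(), tmp.end(), [&](uint32_t a, uint32_t b) { return data[a] < data[b]; });
        std::copy(tmp.begin(), tmp.end(), res.begin());
    }
    else
    {
        std::iota(res.begin(), res.end(), 0);
        std::sort(res.begin(), res.end(), [&](uint64_t a, uint64_t b) { return data[a] < data[b]; });
    }
    return res;
}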
@ -1,66 +1,21 @@
#pragma once

#include <cmath>

#include <base/sort.h>
#include <base/TypeName.h>
#include <Core/Field.h>
#include <Core/DecimalFunctions.h>
#include <Core/TypeId.h>
#include <Common/typeid_cast.h>
#include <Columns/ColumnVectorHelper.h>
#include <Columns/IColumn.h>
#include <Columns/IColumnImpl.h>
#include <Core/Field.h>
#include <Core/DecimalFunctions.h>
#include <Common/typeid_cast.h>
#include <base/sort.h>
#include <Core/TypeId.h>
#include <base/TypeName.h>

#include <cmath>


namespace DB
{

/// PaddedPODArray extended by Decimal scale
template <typename T>
class DecimalPaddedPODArray : public PaddedPODArray<T>
{
public:
using Base = PaddedPODArray<T>;
using Base::operator[];

DecimalPaddedPODArray(size_t size, UInt32 scale_)
: Base(size),
scale(scale_)
{}

DecimalPaddedPODArray(const DecimalPaddedPODArray & other)
: Base(other.begin(), other.end()),
scale(other.scale)
{}

DecimalPaddedPODArray(DecimalPaddedPODArray && other)
{
this->swap(other);
std::swap(scale, other.scale);
}

DecimalPaddedPODArray & operator=(DecimalPaddedPODArray && other)
{
this->swap(other);
std::swap(scale, other.scale);
return *this;
}

UInt32 getScale() const { return scale; }

private:
UInt32 scale;
};

/// Prevent implicit template instantiation of DecimalPaddedPODArray for common decimal types

extern template class DecimalPaddedPODArray<Decimal32>;
extern template class DecimalPaddedPODArray<Decimal64>;
extern template class DecimalPaddedPODArray<Decimal128>;
extern template class DecimalPaddedPODArray<Decimal256>;
extern template class DecimalPaddedPODArray<DateTime64>;

/// A ColumnVector for Decimals
template <is_decimal T>
class ColumnDecimal final : public COWHelper<ColumnVectorHelper, ColumnDecimal<T>>
@ -72,16 +27,16 @@ private:
public:
using ValueType = T;
using NativeT = typename T::NativeType;
using Container = DecimalPaddedPODArray<T>;
using Container = PaddedPODArray<T>;

private:
ColumnDecimal(const size_t n, UInt32 scale_)
: data(n, scale_),
: data(n),
scale(scale_)
{}

ColumnDecimal(const ColumnDecimal & src)
: data(src.data),
: data(src.data.begin(), src.data.end()),
scale(src.scale)
{}

@ -195,7 +150,7 @@ public:
const T & getElement(size_t n) const { return data[n]; }
T & getElement(size_t n) { return data[n]; }

UInt32 getScale() const {return scale;}
UInt32 getScale() const { return scale; }

protected:
Container data;
@ -206,8 +161,8 @@ protected:
{
size_t s = data.size();
res.resize(s);
for (U i = 0; i < s; ++i)
res[i] = i;
for (size_t i = 0; i < s; ++i)
res[i] = static_cast<U>(i);

auto sort_end = res.end();
if (limit && limit < s)
@ -4,8 +4,6 @@
#include <Processors/Transforms/ColumnGathererTransform.h>
#include <IO/WriteBufferFromString.h>
#include <IO/Operators.h>
#include <base/map.h>
#include <base/range.h>
#include <Common/typeid_cast.h>
#include <Common/assert_cast.h>
#include <Common/WeakHash.h>
@ -64,8 +62,9 @@ MutableColumnPtr ColumnMap::cloneResized(size_t new_size) const

Field ColumnMap::operator[](size_t n) const
{
auto array = DB::get<Array>((*nested)[n]);
return Map(std::make_move_iterator(array.begin()), std::make_move_iterator(array.end()));
Field res;
get(n, res);
return res;
}

void ColumnMap::get(size_t n, Field & res) const
@ -74,11 +73,12 @@ void ColumnMap::get(size_t n, Field & res) const
size_t offset = offsets[n - 1];
size_t size = offsets[n] - offsets[n - 1];

res = Map(size);
res = Map();
auto & map = DB::get<Map &>(res);
map.reserve(size);

for (size_t i = 0; i < size; ++i)
getNestedData().get(offset + i, map[i]);
map.push_back(getNestedData()[offset + i]);
}

bool ColumnMap::isDefaultAt(size_t n) const
@ -9,9 +9,6 @@
#include <Common/WeakHash.h>
#include <Common/assert_cast.h>
#include <Common/typeid_cast.h>
#include <base/sort.h>
#include <base/map.h>
#include <base/range.h>
#include <DataTypes/Serializations/SerializationInfoTuple.h>


@ -101,17 +98,21 @@ MutableColumnPtr ColumnTuple::cloneResized(size_t new_size) const

Field ColumnTuple::operator[](size_t n) const
{
return collections::map<Tuple>(columns, [n] (const auto & column) { return (*column)[n]; });
Field res;
get(n, res);
return res;
}

void ColumnTuple::get(size_t n, Field & res) const
{
const size_t tuple_size = columns.size();
Tuple tuple(tuple_size);
for (const auto i : collections::range(0, tuple_size))
columns[i]->get(n, tuple[i]);

res = tuple;
res = Tuple();
Tuple & res_tuple = DB::get<Tuple &>(res);
res_tuple.reserve(tuple_size);

for (size_t i = 0; i < tuple_size; ++i)
res_tuple.push_back((*columns[i])[n]);
}

bool ColumnTuple::isDefaultAt(size_t n) const
@ -483,7 +484,7 @@ void ColumnTuple::getExtremes(Field & min, Field & max) const
Tuple min_tuple(tuple_size);
Tuple max_tuple(tuple_size);

for (const auto i : collections::range(0, tuple_size))
for (size_t i = 0; i < tuple_size; ++i)
columns[i]->getExtremes(min_tuple[i], max_tuple[i]);

min = min_tuple;
@ -504,7 +505,7 @@ bool ColumnTuple::structureEquals(const IColumn & rhs) const
if (tuple_size != rhs_tuple->columns.size())
return false;

for (const auto i : collections::range(0, tuple_size))
for (size_t i = 0; i < tuple_size; ++i)
if (!columns[i]->structureEquals(*rhs_tuple->columns[i]))
return false;

252
src/Common/FrequencyHolder.h
Normal file
252
src/Common/FrequencyHolder.h
Normal file
@ -0,0 +1,252 @@
#pragma once

#include <Common/Arena.h>
#include <Common/getResource.h>
#include <Common/HashTable/HashMap.h>
#include <Common/StringUtils/StringUtils.h>
#include <IO/ReadBufferFromFile.h>
#include <IO/ReadBufferFromString.h>
#include <IO/ReadHelpers.h>
#include <IO/readFloatText.h>
#include <IO/ZstdInflatingReadBuffer.h>

#include <base/StringRef.h>
#include <base/logger_useful.h>

#include <string_view>
#include <unordered_map>

namespace DB
{

namespace ErrorCodes
{
extern const int FILE_DOESNT_EXIST;
}

/// FrequencyHolder class is responsible for storing and loading dictionaries
/// needed for text classification functions:
///
/// 1. detectLanguageUnknown
/// 2. detectCharset
/// 3. detectTonality
/// 4. detectProgrammingLanguage

class FrequencyHolder
{

public:
struct Language
{
String name;
HashMap<StringRef, Float64> map;
};

struct Encoding
{
String name;
String lang;
HashMap<UInt16, Float64> map;
};

public:
using Map = HashMap<StringRef, Float64>;
using Container = std::vector<Language>;
using EncodingMap = HashMap<UInt16, Float64>;
using EncodingContainer = std::vector<Encoding>;

static FrequencyHolder & getInstance()
{
static FrequencyHolder instance;
return instance;
}

void loadEncodingsFrequency()
{
Poco::Logger * log = &Poco::Logger::get("EncodingsFrequency");

LOG_TRACE(log, "Loading embedded charset frequencies");

auto resource = getResource("charset.zst");
if (resource.empty())
throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "There is no embedded charset frequencies");

String line;
UInt16 bigram;
Float64 frequency;
String charset_name;

auto buf = std::make_unique<ReadBufferFromMemory>(resource.data(), resource.size());
ZstdInflatingReadBuffer in(std::move(buf));

while (!in.eof())
{
readString(line, in);
in.ignore();

if (line.empty())
continue;

ReadBufferFromString buf_line(line);

// Start loading a new charset
if (line.starts_with("// "))
{
// Skip "// "
buf_line.ignore(3);
readString(charset_name, buf_line);

/* In our dictionary we have lines with form: <Language>_<Charset>
* If we need to find language of data, we return <Language>
* If we need to find charset of data, we return <Charset>.
*/
size_t sep = charset_name.find('_');

Encoding enc;
enc.lang = charset_name.substr(0, sep);
enc.name = charset_name.substr(sep + 1);
encodings_freq.push_back(std::move(enc));
}
else
{
readIntText(bigram, buf_line);
buf_line.ignore();
readFloatText(frequency, buf_line);

encodings_freq.back().map[bigram] = frequency;
}
}
LOG_TRACE(log, "Charset frequencies was added, charsets count: {}", encodings_freq.size());
}


void loadEmotionalDict()
{
Poco::Logger * log = &Poco::Logger::get("EmotionalDict");
LOG_TRACE(log, "Loading embedded emotional dictionary");

auto resource = getResource("tonality_ru.zst");
if (resource.empty())
throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "There is no embedded emotional dictionary");

String line;
String word;
Float64 tonality;
size_t count = 0;

auto buf = std::make_unique<ReadBufferFromMemory>(resource.data(), resource.size());
ZstdInflatingReadBuffer in(std::move(buf));

while (!in.eof())
{
readString(line, in);
in.ignore();

if (line.empty())
continue;

ReadBufferFromString buf_line(line);

readStringUntilWhitespace(word, buf_line);
buf_line.ignore();
readFloatText(tonality, buf_line);

StringRef ref{string_pool.insert(word.data(), word.size()), word.size()};
emotional_dict[ref] = tonality;
++count;
}
LOG_TRACE(log, "Emotional dictionary was added. Word count: {}", std::to_string(count));
}


void loadProgrammingFrequency()
{
Poco::Logger * log = &Poco::Logger::get("ProgrammingFrequency");

LOG_TRACE(log, "Loading embedded programming languages frequencies loading");

auto resource = getResource("programming.zst");
if (resource.empty())
throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "There is no embedded programming languages frequencies");

String line;
String bigram;
Float64 frequency;
String programming_language;

auto buf = std::make_unique<ReadBufferFromMemory>(resource.data(), resource.size());
ZstdInflatingReadBuffer in(std::move(buf));

while (!in.eof())
{
readString(line, in);
in.ignore();

if (line.empty())
continue;

ReadBufferFromString buf_line(line);

// Start loading a new language
if (line.starts_with("// "))
{
// Skip "// "
buf_line.ignore(3);
readString(programming_language, buf_line);

Language lang;
lang.name = programming_language;
programming_freq.push_back(std::move(lang));
}
else
{
readStringUntilWhitespace(bigram, buf_line);
buf_line.ignore();
readFloatText(frequency, buf_line);

StringRef ref{string_pool.insert(bigram.data(), bigram.size()), bigram.size()};
programming_freq.back().map[ref] = frequency;
}
}
LOG_TRACE(log, "Programming languages frequencies was added");
}

const Map & getEmotionalDict()
{
std::lock_guard lock(mutex);
if (emotional_dict.empty())
loadEmotionalDict();

return emotional_dict;
}


const EncodingContainer & getEncodingsFrequency()
{
std::lock_guard lock(mutex);
if (encodings_freq.empty())
loadEncodingsFrequency();

return encodings_freq;
}

const Container & getProgrammingFrequency()
{
std::lock_guard lock(mutex);
if (programming_freq.empty())
loadProgrammingFrequency();

return programming_freq;
}


private:
Arena string_pool;

Map emotional_dict;
Container programming_freq;
EncodingContainer encodings_freq;

std::mutex mutex;
};
}
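The getters above load each dictionary lazily, under the mutex, on first use. How a classifier might consume the tables is sketched below: a hypothetical scorer using std::unordered_map in place of ClickHouse's HashMap, with an illustrative sum-of-frequencies metric rather than the actual detect* implementations:

#include <string>
#include <unordered_map>
#include <vector>

struct LanguageFreq
{
    std::string name;
    std::unordered_map<std::string, double> map;  // bigram -> frequency
};

/// Score a sample against each per-language frequency table and keep the best;
/// FrequencyHolder::getInstance().getProgrammingFrequency() supplies the real tables.
std::string bestMatchingLanguage(
    const std::vector<LanguageFreq> & languages,
    const std::vector<std::string> & sample_bigrams)
{
    std::string best_name;
    double best_score = 0;
    for (const auto & language : languages)
    {
        double score = 0;
        for (const auto & bigram : sample_bigrams)
        {
            auto it = language.map.find(bigram);
            if (it != language.map.end())
                score += it->second;
        }
        if (score > best_score)
        {
            best_score = score;
            best_name = language.name;
        }
    }
    return best_name;
}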
@ -291,6 +291,15 @@ public:

size_t getIntervalsSize() const { return intervals_size; }

size_t getSizeInBytes() const
{
size_t nodes_size_in_bytes = nodes.size() * sizeof(Node);
size_t intervals_size_in_bytes = sorted_intervals.size() * sizeof(IntervalWithValue);
size_t result = nodes_size_in_bytes + intervals_size_in_bytes;

return result;
}

private:
struct Node
{
@ -1,5 +1,6 @@
#pragma once

#include <cstdint>
#include <Common/VariableContext.h>

/// To be able to avoid MEMORY_LIMIT_EXCEEDED Exception in destructors:
@ -1,5 +1,6 @@
#pragma once

#include <cstdint>
#include <Common/VariableContext.h>

/// To be able to temporarily stop memory tracking from current thread.
@ -41,6 +41,7 @@ private:

ObjectPtr object;
bool in_use = false;
std::atomic<bool> is_expired = false;
PoolBase & pool;
};

@ -87,6 +88,14 @@ public:
Object & operator*() & { return *data->data.object; }
const Object & operator*() const & { return *data->data.object; }

/**
* Expire an object to make it reallocated later.
*/
void expire()
{
data->data.is_expired = true;
}

bool isNull() const { return data == nullptr; }

PoolBase * getPool() const
@ -112,9 +121,22 @@ public:
while (true)
{
for (auto & item : items)
{
if (!item->in_use)
return Entry(*item);

{
if (likely(!item->is_expired))
{
return Entry(*item);
}
else
{
expireObject(item->object);
item->object = allocObject();
item->is_expired = false;
return Entry(*item);
}
}
}
if (items.size() < max_items)
{
ObjectPtr object = allocObject();
@ -139,6 +161,12 @@ public:
items.emplace_back(std::make_shared<PooledObject>(allocObject(), *this));
}

inline size_t size()
{
std::unique_lock lock(mutex);
return items.size();
}

private:
/** The maximum size of the pool. */
unsigned max_items;
@ -162,4 +190,5 @@ protected:

/** Creates a new object to put into the pool. */
virtual ObjectPtr allocObject() = 0;
virtual void expireObject(ObjectPtr) {}
};
@ -24,7 +24,6 @@ namespace DB

namespace ErrorCodes
{
extern const int UNSUPPORTED_PARAMETER;
extern const int BAD_ARGUMENTS;
}

@ -34,9 +33,12 @@ namespace ErrorCodes
*/


struct StringSearcherBase
class StringSearcherBase
{
public:
bool force_fallback = false;
#ifdef __SSE2__
protected:
static constexpr auto n = sizeof(__m128i);
const int page_size = ::getPageSize();

@ -53,7 +55,7 @@ template <bool CaseSensitive, bool ASCII> class StringSearcher;

/// Case-insensitive UTF-8 searcher
template <>
class StringSearcher<false, false> : private StringSearcherBase
class StringSearcher<false, false> : public StringSearcherBase
{
private:
using UTF8SequenceBuffer = uint8_t[6];
@ -119,11 +121,14 @@ public:
size_t length_u = UTF8::convertCodePointToUTF8(first_u_u32, u_seq, sizeof(u_seq));

if (length_l != length_u)
throw Exception{"UTF8 sequences with different lowercase and uppercase lengths are not supported", ErrorCodes::UNSUPPORTED_PARAMETER};
force_fallback = true;
}

l = l_seq[0];
u = u_seq[0];

if (force_fallback)
return;
}

#ifdef __SSE4_1__
@ -158,7 +163,10 @@ public:

/// @note Unicode standard states it is a rare but possible occasion
if (!(dst_l_len == dst_u_len && dst_u_len == src_len))
throw Exception{"UTF8 sequences with different lowercase and uppercase lengths are not supported", ErrorCodes::UNSUPPORTED_PARAMETER};
{
force_fallback = true;
return;
}
}

cache_actual_len += src_len;
@ -199,9 +207,10 @@ public:
if (Poco::Unicode::toLower(*haystack_code_point) != Poco::Unicode::toLower(*needle_code_point))
break;

/// @note assuming sequences for lowercase and uppercase have exact same length (that is not always true)
const auto len = UTF8::seqLength(*haystack_pos);
auto len = UTF8::seqLength(*haystack_pos);
haystack_pos += len;

len = UTF8::seqLength(*needle_pos);
needle_pos += len;
}

@ -213,7 +222,7 @@ public:
{

#ifdef __SSE4_1__
if (pageSafe(pos))
if (pageSafe(pos) && !force_fallback)
{
const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pos));
const auto v_against_l = _mm_cmpeq_epi8(v_haystack, cachel);
@ -262,7 +271,7 @@ public:
while (haystack < haystack_end)
{
#ifdef __SSE4_1__
if (haystack + n <= haystack_end && pageSafe(haystack))
if (haystack + n <= haystack_end && pageSafe(haystack) && !force_fallback)
{
const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(haystack));
const auto v_against_l = _mm_cmpeq_epi8(v_haystack, patl);
@ -339,7 +348,7 @@ public:

/// Case-insensitive ASCII searcher
template <>
class StringSearcher<false, true> : private StringSearcherBase
class StringSearcher<false, true> : public StringSearcherBase
{
private:
/// string to be searched for
@ -541,7 +550,7 @@ public:

/// Case-sensitive searcher (both ASCII and UTF-8)
template <bool ASCII>
class StringSearcher<true, ASCII> : private StringSearcherBase
class StringSearcher<true, ASCII> : public StringSearcherBase
{
private:
/// string to be searched for
@ -725,7 +734,7 @@ public:
// Any value outside of basic ASCII (>=128) is considered a non-separator symbol, hence UTF-8 strings
// should work just fine. But any Unicode whitespace is not considered a token separtor.
template <typename StringSearcher>
class TokenSearcher
class TokenSearcher : public StringSearcherBase
{
StringSearcher searcher;
size_t needle_size;
@ -809,7 +818,7 @@ using ASCIICaseInsensitiveTokenSearcher = TokenSearcher<ASCIICaseInsensitiveStri
* It is required that strings are zero-terminated.
*/

struct LibCASCIICaseSensitiveStringSearcher
struct LibCASCIICaseSensitiveStringSearcher : public StringSearcherBase
{
const char * const needle;

@ -833,7 +842,7 @@ struct LibCASCIICaseSensitiveStringSearcher
}
};

struct LibCASCIICaseInsensitiveStringSearcher
struct LibCASCIICaseInsensitiveStringSearcher : public StringSearcherBase
{
const char * const needle;

177
src/Common/SystemLogBase.cpp
Normal file
177
src/Common/SystemLogBase.cpp
Normal file
@ -0,0 +1,177 @@
#include <Interpreters/AsynchronousMetricLog.h>
#include <Interpreters/CrashLog.h>
#include <Interpreters/MetricLog.h>
#include <Interpreters/OpenTelemetrySpanLog.h>
#include <Interpreters/PartLog.h>
#include <Interpreters/QueryLog.h>
#include <Interpreters/QueryThreadLog.h>
#include <Interpreters/QueryViewsLog.h>
#include <Interpreters/SessionLog.h>
#include <Interpreters/TextLog.h>
#include <Interpreters/TraceLog.h>
#include <Interpreters/ZooKeeperLog.h>

#include <Common/MemoryTrackerBlockerInThread.h>
#include <Common/SystemLogBase.h>

#include <base/logger_useful.h>
#include <base/scope_guard.h>

namespace DB
{

namespace ErrorCodes
{
extern const int TIMEOUT_EXCEEDED;
}

namespace
{
constexpr size_t DBMS_SYSTEM_LOG_QUEUE_SIZE = 1048576;
}

void ISystemLog::stopFlushThread()
{
{
std::lock_guard lock(mutex);

if (!saving_thread.joinable())
{
return;
}

if (is_shutdown)
{
return;
}

is_shutdown = true;

/// Tell thread to shutdown.
flush_event.notify_all();
}

saving_thread.join();
}

void ISystemLog::startup()
{
std::lock_guard lock(mutex);
saving_thread = ThreadFromGlobalPool([this] { savingThreadFunction(); });
}

static thread_local bool recursive_add_call = false;

template <typename LogElement>
void SystemLogBase<LogElement>::add(const LogElement & element)
{
/// It is possible that the method will be called recursively.
/// Better to drop these events to avoid complications.
if (recursive_add_call)
return;
recursive_add_call = true;
SCOPE_EXIT({ recursive_add_call = false; });

/// Memory can be allocated while resizing on queue.push_back.
/// The size of allocation can be in order of a few megabytes.
/// But this should not be accounted for query memory usage.
/// Otherwise the tests like 01017_uniqCombined_memory_usage.sql will be flacky.
MemoryTrackerBlockerInThread temporarily_disable_memory_tracker(VariableContext::Global);

/// Should not log messages under mutex.
bool queue_is_half_full = false;

{
std::unique_lock lock(mutex);

if (is_shutdown)
return;

if (queue.size() == DBMS_SYSTEM_LOG_QUEUE_SIZE / 2)
{
queue_is_half_full = true;

// The queue more than half full, time to flush.
// We only check for strict equality, because messages are added one
// by one, under exclusive lock, so we will see each message count.
// It is enough to only wake the flushing thread once, after the message
// count increases past half available size.
const uint64_t queue_end = queue_front_index + queue.size();
if (requested_flush_up_to < queue_end)
requested_flush_up_to = queue_end;

flush_event.notify_all();
}

if (queue.size() >= DBMS_SYSTEM_LOG_QUEUE_SIZE)
{
// Ignore all further entries until the queue is flushed.
// Log a message about that. Don't spam it -- this might be especially
// problematic in case of trace log. Remember what the front index of the
// queue was when we last logged the message. If it changed, it means the
// queue was flushed, and we can log again.
if (queue_front_index != logged_queue_full_at_index)
{
logged_queue_full_at_index = queue_front_index;

// TextLog sets its logger level to 0, so this log is a noop and
// there is no recursive logging.
lock.unlock();
LOG_ERROR(log, "Queue is full for system log '{}' at {}", demangle(typeid(*this).name()), queue_front_index);
}

return;
}

queue.push_back(element);
}

if (queue_is_half_full)
LOG_INFO(log, "Queue is half full for system log '{}'.", demangle(typeid(*this).name()));
}

template <typename LogElement>
void SystemLogBase<LogElement>::flush(bool force)
{
uint64_t this_thread_requested_offset;

{
std::unique_lock lock(mutex);

if (is_shutdown)
return;

this_thread_requested_offset = queue_front_index + queue.size();

// Publish our flush request, taking care not to overwrite the requests
// made by other threads.
is_force_prepare_tables |= force;
requested_flush_up_to = std::max(requested_flush_up_to, this_thread_requested_offset);

flush_event.notify_all();
}

LOG_DEBUG(log, "Requested flush up to offset {}", this_thread_requested_offset);

// Use an arbitrary timeout to avoid endless waiting. 60s proved to be
// too fast for our parallel functional tests, probably because they
// heavily load the disk.
const int timeout_seconds = 180;
std::unique_lock lock(mutex);
bool result = flush_event.wait_for(lock, std::chrono::seconds(timeout_seconds), [&]
{
return flushed_up_to >= this_thread_requested_offset && !is_force_prepare_tables;
});

if (!result)
{
throw Exception(
"Timeout exceeded (" + toString(timeout_seconds) + " s) while flushing system log '" + demangle(typeid(*this).name()) + "'.",
ErrorCodes::TIMEOUT_EXCEEDED);
}
}

#define INSTANTIATE_SYSTEM_LOG_BASE(ELEMENT) template class SystemLogBase<ELEMENT>;
SYSTEM_LOG_ELEMENTS(INSTANTIATE_SYSTEM_LOG_BASE)

}
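The add()/flush() pair above coordinates through global sequence numbers: queue_front_index plus the queue size gives every record a monotone index, so a waiter only needs to compare its requested offset against flushed_up_to. A toy single-threaded model of that bookkeeping, with the saving thread reduced to one function (names here are illustrative):

#include <cstdint>
#include <vector>

/// Toy model of the flush bookkeeping: every element ever queued gets a
/// global index; a flush request records "flush up to here" and waiters
/// compare that offset against flushed_up_to.
struct FlushBookkeeping
{
    std::vector<int> queue;
    uint64_t queue_front_index = 0;      /// global index of queue[0]
    uint64_t requested_flush_up_to = 0;
    uint64_t flushed_up_to = 0;

    uint64_t add(int element)
    {
        queue.push_back(element);
        return queue_front_index + queue.size();  /// global end index
    }

    void requestFlush()
    {
        uint64_t queue_end = queue_front_index + queue.size();
        if (requested_flush_up_to < queue_end)
            requested_flush_up_to = queue_end;
    }

    void saveAll()  /// what the saving thread would do after draining the queue
    {
        queue_front_index += queue.size();
        queue.clear();
        flushed_up_to = queue_front_index;
    }

    bool isFlushed(uint64_t offset) const { return flushed_up_to >= offset; }
};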
109
src/Common/SystemLogBase.h
Normal file
109
src/Common/SystemLogBase.h
Normal file
@ -0,0 +1,109 @@
#pragma once

#include <atomic>
#include <condition_variable>
#include <memory>
#include <thread>
#include <vector>
#include <base/types.h>

#include <Interpreters/Context_fwd.h>
#include <Parsers/IAST_fwd.h>
#include <Storages/IStorage_fwd.h>
#include <Common/ThreadPool.h>

#define SYSTEM_LOG_ELEMENTS(M) \
M(AsynchronousMetricLogElement) \
M(CrashLogElement) \
M(MetricLogElement) \
M(OpenTelemetrySpanLogElement) \
M(PartLogElement) \
M(QueryLogElement) \
M(QueryThreadLogElement) \
M(QueryViewsLogElement) \
M(SessionLogElement) \
M(TraceLogElement) \
M(ZooKeeperLogElement) \
M(TextLogElement)

namespace Poco
{
class Logger;
namespace Util
{
class AbstractConfiguration;
}
}

namespace DB
{

struct StorageID;

class ISystemLog
{
public:
virtual String getName() = 0;
//// force -- force table creation (used for SYSTEM FLUSH LOGS)
virtual void flush(bool force = false) = 0;
virtual void prepareTable() = 0;

/// Start the background thread.
virtual void startup();

/// Stop the background flush thread before destructor. No more data will be written.
virtual void shutdown() = 0;

virtual ~ISystemLog() = default;

virtual void savingThreadFunction() = 0;

protected:
ThreadFromGlobalPool saving_thread;

/// Data shared between callers of add()/flush()/shutdown(), and the saving thread
std::mutex mutex;

bool is_shutdown = false;
std::condition_variable flush_event;

void stopFlushThread();
};

template <typename LogElement>
class SystemLogBase : public ISystemLog
{
public:
using Self = SystemLogBase;

/** Append a record into log.
* Writing to table will be done asynchronously and in case of failure, record could be lost.
*/
void add(const LogElement & element);

/// Flush data in the buffer to disk
void flush(bool force) override;

String getName() override { return LogElement::name(); }

protected:
Poco::Logger * log;

// Queue is bounded. But its size is quite large to not block in all normal cases.
std::vector<LogElement> queue;
// An always-incrementing index of the first message currently in the queue.
// We use it to give a global sequential index to every message, so that we
// can wait until a particular message is flushed. This is used to implement
// synchronous log flushing for SYSTEM FLUSH LOGS.
uint64_t queue_front_index = 0;
// A flag that says we must create the tables even if the queue is empty.
bool is_force_prepare_tables = false;
// Requested to flush logs up to this index, exclusive
uint64_t requested_flush_up_to = 0;
// Flushed log up to this index, exclusive
uint64_t flushed_up_to = 0;
// Logged overflow message at this queue front index
uint64_t logged_queue_full_at_index = -1;
};

}
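SYSTEM_LOG_ELEMENTS is an X-macro: SystemLogBase.cpp above applies it to INSTANTIATE_SYSTEM_LOG_BASE to stamp out one explicit instantiation of SystemLogBase per element type. A minimal sketch of the same pattern with hypothetical names:

#include <iostream>

// One list of items, reused for several expansions.
#define ITEMS(M) \
    M(Alpha)     \
    M(Beta)

template <typename T> struct Holder { };

struct Alpha { };
struct Beta { };

// Expansion 1: explicit template instantiations, as SystemLogBase.cpp does.
#define INSTANTIATE(ITEM) template struct Holder<ITEM>;
ITEMS(INSTANTIATE)
#undef INSTANTIATE

// Expansion 2: a different use of the same list, e.g. printing the names.
#define PRINT_NAME(ITEM) std::cout << #ITEM << '\n';
int main() { ITEMS(PRINT_NAME) }
#undef PRINT_NAME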
@ -372,7 +372,7 @@ public:
, fallback{VolnitskyTraits::isFallbackNeedle(needle_size, haystack_size_hint)}
, fallback_searcher{needle_, needle_size}
{
if (fallback)
if (fallback || fallback_searcher.force_fallback)
return;

hash = std::unique_ptr<VolnitskyTraits::Offset[]>(new VolnitskyTraits::Offset[VolnitskyTraits::hash_size]{});
@ -393,7 +393,7 @@ public:

const auto haystack_end = haystack + haystack_size;

if (fallback || haystack_size <= needle_size)
if (fallback || haystack_size <= needle_size || fallback_searcher.force_fallback)
return fallback_searcher.search(haystack, haystack_end);

/// Let's "apply" the needle to the haystack and compare the n-gram from the end of the needle.
@ -3,12 +3,7 @@ include("${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake")
add_headers_and_sources(clickhouse_common_zookeeper .)

# for clickhouse server
#
# NOTE: this library depends from Interpreters (DB::SystemLog<DB::ZooKeeperLogElement>::add),
# and so it should be STATIC because otherwise:
# - it will either fail to compile with -Wl,--unresolved-symbols=report-all
# - or it will report errors at runtime
add_library(clickhouse_common_zookeeper STATIC ${clickhouse_common_zookeeper_headers} ${clickhouse_common_zookeeper_sources})
add_library(clickhouse_common_zookeeper ${clickhouse_common_zookeeper_headers} ${clickhouse_common_zookeeper_sources})
target_compile_definitions (clickhouse_common_zookeeper PRIVATE -DZOOKEEPER_LOG)
target_link_libraries (clickhouse_common_zookeeper
PUBLIC
@ -16,7 +16,15 @@ using MYSQL_ROW = char**;
struct st_mysql_field;
using MYSQL_FIELD = st_mysql_field;

enum struct enum_field_types;
enum struct enum_field_types { MYSQL_TYPE_DECIMAL, MYSQL_TYPE_TINY,
MYSQL_TYPE_SHORT, MYSQL_TYPE_LONG,
MYSQL_TYPE_FLOAT, MYSQL_TYPE_DOUBLE,
MYSQL_TYPE_NULL, MYSQL_TYPE_TIMESTAMP,
MYSQL_TYPE_LONGLONG, MYSQL_TYPE_INT24,
MYSQL_TYPE_DATE, MYSQL_TYPE_TIME,
MYSQL_TYPE_DATETIME, MYSQL_TYPE_YEAR,
MYSQL_TYPE_NEWDATE, MYSQL_TYPE_VARCHAR,
MYSQL_TYPE_BIT };

#endif

52
src/Common/tests/gtest_poolbase.cpp
Normal file
52
src/Common/tests/gtest_poolbase.cpp
Normal file
@ -0,0 +1,52 @@
#include <memory>
#include <gtest/gtest.h>
#include <Common/PoolBase.h>
#include <Poco/Logger.h>
using namespace DB;

class PoolObject
{
public:
int x = 0;
};

class MyPoolBase : public PoolBase<PoolObject>
{
public:
using Object = PoolBase<PoolObject>::Object;
using ObjectPtr = std::shared_ptr<Object>;
using Ptr = PoolBase<PoolObject>::Ptr;

int last_destroy_value = 0;
MyPoolBase() : PoolBase<PoolObject>(100, &Poco::Logger::get("MyPoolBase")) { }

protected:
ObjectPtr allocObject() override { return std::make_shared<Object>(); }

void expireObject(ObjectPtr obj) override
{
LOG_TRACE(log, "expire object");
ASSERT_TRUE(obj->x == 100);
last_destroy_value = obj->x;
}
};

TEST(PoolBase, testDestroy1)
{
MyPoolBase pool;
{
auto obj_entry = pool.get(-1);
ASSERT_TRUE(!obj_entry.isNull());
obj_entry->x = 100;
obj_entry.expire();
}
ASSERT_EQ(1, pool.size());

{
auto obj_entry = pool.get(-1);
ASSERT_TRUE(!obj_entry.isNull());
ASSERT_EQ(obj_entry->x, 0);
ASSERT_EQ(1, pool.size());
}
ASSERT_EQ(100, pool.last_destroy_value);
}
@ -5,7 +5,6 @@
#include <Common/CurrentThread.h>
#include <base/logger_useful.h>
#include <chrono>
#include <base/scope_guard.h>


namespace DB
@ -246,7 +245,6 @@ void BackgroundSchedulePool::threadFunction()
setThreadName(thread_name.c_str());

attachToThreadGroup();
SCOPE_EXIT({ CurrentThread::detachQueryIfNotDetached(); });

while (!shutdown)
{
@ -273,7 +271,6 @@ void BackgroundSchedulePool::delayExecutionThreadFunction()
setThreadName((thread_name + "/D").c_str());

attachToThreadGroup();
SCOPE_EXIT({ CurrentThread::detachQueryIfNotDetached(); });

while (!shutdown)
{
@ -204,6 +204,7 @@ namespace MySQLReplication
case MYSQL_TYPE_DATE:
case MYSQL_TYPE_DATETIME:
case MYSQL_TYPE_NEWDATE:
case MYSQL_TYPE_YEAR:
{
/// No data here.
column_meta.emplace_back(0);
@ -214,7 +215,9 @@ namespace MySQLReplication
case MYSQL_TYPE_DOUBLE:
case MYSQL_TYPE_TIMESTAMP2:
case MYSQL_TYPE_DATETIME2:
case MYSQL_TYPE_TIME2:
case MYSQL_TYPE_BLOB:
case MYSQL_TYPE_GEOMETRY:
{
column_meta.emplace_back(UInt16(meta[pos]));
pos += 1;
@ -432,6 +435,98 @@ namespace MySQLReplication
row.push_back(Field(date_day_number.toUnderType()));
break;
}
case MYSQL_TYPE_YEAR: {
Int16 val = 0;
payload.readStrict(reinterpret_cast<char *>(&val), 1);
row.push_back(Field{UInt16{static_cast<UInt16>(val + 1900)}});
break;
}
case MYSQL_TYPE_TIME2:
{
UInt64 uintpart = 0UL;
Int32 frac = 0U;
Int64 ltime;
Int64 intpart;
switch (meta)
{
case 0:
{
readBigEndianStrict(payload, reinterpret_cast<char *>(&uintpart), 3);
intpart = uintpart - 0x800000L;
ltime = intpart << 24;
break;
}
case 1:
case 2:
{
readBigEndianStrict(payload, reinterpret_cast<char *>(&uintpart), 3);
intpart = uintpart - 0x800000L;
readBigEndianStrict(payload, reinterpret_cast<char *>(&frac), 1);
if (intpart < 0 && frac > 0)
{
intpart ++;
frac -= 0x100;
}
frac = frac * 10000;
ltime = intpart << 24;
break;
}
case 3:
case 4:
{
readBigEndianStrict(payload, reinterpret_cast<char *>(&uintpart), 3);
intpart = uintpart - 0x800000L;
readBigEndianStrict(payload, reinterpret_cast<char *>(&frac), 2);
if (intpart < 0 && frac > 0)
{
intpart ++;
frac -= 0x10000;
}
frac = frac * 100;
ltime = intpart << 24;
break;
}
case 5:
case 6:
{
readBigEndianStrict(payload, reinterpret_cast<char *>(&uintpart), 6);
intpart = uintpart - 0x800000000000L;
ltime = intpart;
frac = std::abs(intpart % (1L << 24));
break;
}
default:
{
readBigEndianStrict(payload, reinterpret_cast<char *>(&uintpart), 3);
intpart = uintpart - 0x800000L;
ltime = intpart << 24;
break;
}
}
Int64 hh, mm, ss;
bool negative = false;
if (intpart == 0)
{
hh = 0;
mm = 0;
ss = 0;
}
else
{
if (ltime < 0) negative= true;
UInt64 ultime = std::abs(ltime);
intpart = ultime >> 24;
hh = (intpart >> 12) % (1 << 10);
mm = (intpart >> 6) % (1 << 6);
ss = intpart % (1 << 6);
}

Int64 time_micro = 0;
time_micro = (hh * 3600 + mm * 60 + ss) * 1000000 + std::abs(frac);
if (negative) time_micro = - time_micro;
row.push_back(Field{Int64{time_micro}});
break;
}
case MYSQL_TYPE_DATETIME2:
{
Int64 val = 0;
@ -585,6 +680,14 @@ namespace MySQLReplication
}
break;
}
case MYSQL_TYPE_SET:
{
UInt32 size = (meta & 0xff);
Bitmap bitmap1;
readBitmap(payload, bitmap1, size);
row.push_back(Field{UInt64{bitmap1.to_ulong()}});
break;
}
case MYSQL_TYPE_BIT:
{
UInt32 bits = ((meta >> 8) * 8) + (meta & 0xff);
@ -631,6 +734,7 @@ namespace MySQLReplication
row.push_back(Field{String{val}});
break;
}
case MYSQL_TYPE_GEOMETRY:
case MYSQL_TYPE_BLOB:
{
UInt32 size = 0;
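The hour/minute/second extraction in the MYSQL_TYPE_TIME2 branch follows MySQL's packed binary layout: 24 low bits of fractional seconds, then 6 bits of seconds, 6 bits of minutes, and 10 bits of hours above them. The same field extraction as a standalone helper; a sketch assuming the sign has already been folded into ltime, as in the code above:

#include <cstdint>
#include <cstdlib>

/// Unpack hours/minutes/seconds from a MySQL TIME2 value: shift away the
/// 24 fractional-second bits, then peel 6-bit seconds, 6-bit minutes and
/// 10-bit hours from the integral part.
struct HMS { int64_t hh, mm, ss; };

HMS unpackTime2(int64_t ltime)
{
    uint64_t ultime = std::abs(ltime);
    int64_t intpart = ultime >> 24;
    return {
        static_cast<int64_t>((intpart >> 12) % (1 << 10)),
        static_cast<int64_t>((intpart >> 6) % (1 << 6)),
        static_cast<int64_t>(intpart % (1 << 6)),
    };
}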
@ -554,7 +554,7 @@ class IColumn;
/** Experimental functions */ \
M(Bool, allow_experimental_funnel_functions, false, "Enable experimental functions for funnel analysis.", 0) \
M(Bool, allow_experimental_nlp_functions, false, "Enable experimental functions for natural language processing.", 0) \

M(String, insert_deduplication_token, "", "If not empty, used for duplicate detection instead of data digest", 0) \
// End of COMMON_SETTINGS
// Please add settings related to formats into the FORMAT_FACTORY_SETTINGS and move obsolete settings to OBSOLETE_SETTINGS.

@ -92,5 +92,7 @@ void registerDataTypeString(DataTypeFactory & factory)
factory.registerAlias("BINARY LARGE OBJECT", "String", DataTypeFactory::CaseInsensitive);
factory.registerAlias("BINARY VARYING", "String", DataTypeFactory::CaseInsensitive);
factory.registerAlias("VARBINARY", "String", DataTypeFactory::CaseInsensitive);
factory.registerAlias("GEOMETRY", "String", DataTypeFactory::CaseInsensitive); //mysql

}
}
@ -32,6 +32,7 @@ namespace ErrorCodes
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int SIZES_OF_COLUMNS_IN_TUPLE_DOESNT_MATCH;
extern const int ILLEGAL_INDEX;
extern const int LOGICAL_ERROR;
}


@ -156,8 +157,19 @@ MutableColumnPtr DataTypeTuple::createColumn() const

MutableColumnPtr DataTypeTuple::createColumn(const ISerialization & serialization) const
{
const auto & element_serializations =
assert_cast<const SerializationTuple &>(serialization).getElementsSerializations();
/// If we read subcolumn of nested Tuple, it may be wrapped to SerializationNamed
/// several times to allow to reconstruct the substream path name.
/// Here we don't need substream path name, so we drop first several wrapper serializations.

const auto * current_serialization = &serialization;
while (const auto * serialization_named = typeid_cast<const SerializationNamed *>(current_serialization))
current_serialization = serialization_named->getNested().get();

const auto * serialization_tuple = typeid_cast<const SerializationTuple *>(current_serialization);
if (!serialization_tuple)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected serialization to create column of type Tuple");

const auto & element_serializations = serialization_tuple->getElementsSerializations();

size_t size = elems.size();
assert(element_serializations.size() == size);
@ -86,7 +86,10 @@ void registerDataTypeNumbers(DataTypeFactory & factory)
factory.registerAlias("INT UNSIGNED", "UInt32", DataTypeFactory::CaseInsensitive);
factory.registerAlias("INTEGER UNSIGNED", "UInt32", DataTypeFactory::CaseInsensitive);
factory.registerAlias("BIGINT UNSIGNED", "UInt64", DataTypeFactory::CaseInsensitive);
factory.registerAlias("BIT", "UInt64", DataTypeFactory::CaseInsensitive);
factory.registerAlias("BIT", "UInt64", DataTypeFactory::CaseInsensitive); /// MySQL
factory.registerAlias("SET", "UInt64", DataTypeFactory::CaseInsensitive); /// MySQL
factory.registerAlias("YEAR", "UInt16", DataTypeFactory::CaseInsensitive);
factory.registerAlias("TIME", "Int64", DataTypeFactory::CaseInsensitive);
}

}
@ -523,6 +523,7 @@ inline bool isBool(const DataTypePtr & data_type)
template <typename DataType> constexpr bool IsDataTypeDecimal = false;
template <typename DataType> constexpr bool IsDataTypeNumber = false;
template <typename DataType> constexpr bool IsDataTypeDateOrDateTime = false;
template <typename DataType> constexpr bool IsDataTypeEnum = false;

template <typename DataType> constexpr bool IsDataTypeDecimalOrNumber = IsDataTypeDecimal<DataType> || IsDataTypeNumber<DataType>;

@ -547,4 +548,9 @@ template <> inline constexpr bool IsDataTypeDateOrDateTime<DataTypeDate32> = tru
template <> inline constexpr bool IsDataTypeDateOrDateTime<DataTypeDateTime> = true;
template <> inline constexpr bool IsDataTypeDateOrDateTime<DataTypeDateTime64> = true;

template <typename T>
class DataTypeEnum;

template <typename T> inline constexpr bool IsDataTypeEnum<DataTypeEnum<T>> = true;

}
@ -37,10 +37,11 @@ void SerializationArray::deserializeBinary(Field & field, ReadBuffer & istr) con
{
size_t size;
readVarUInt(size, istr);
field = Array(size);
field = Array();
Array & arr = get<Array &>(field);
arr.reserve(size);
for (size_t i = 0; i < size; ++i)
nested->deserializeBinary(arr[i], istr);
nested->deserializeBinary(arr.emplace_back(), istr);
}


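This reshaping, replacing field = Array(size) with an empty Array plus reserve() and emplace_back(), recurs in the Map and Tuple serializations below: preallocating constructs size empty Fields only to overwrite them, and it exposes default-constructed elements if deserialization throws midway. Both shapes side by side; a sketch with std::vector standing in for Array and a stub reader in place of the nested serialization:

#include <vector>

struct Value { };                                  // stand-in for DB::Field
static Value readNext() { return Value{}; }        // stand-in for nested->deserializeBinary

// Before: pay for `size` default constructions, then assign over them.
std::vector<Value> deserializePreallocated(size_t size)
{
    std::vector<Value> arr(size);
    for (size_t i = 0; i < size; ++i)
        arr[i] = readNext();
    return arr;
}

// After: reserve once, construct each element in place exactly once.
std::vector<Value> deserializeIncremental(size_t size)
{
    std::vector<Value> arr;
    arr.reserve(size);
    for (size_t i = 0; i < size; ++i)
        arr.emplace_back(readNext());
    return arr;
}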
@ -53,13 +53,15 @@ void SerializationMap::deserializeBinary(Field & field, ReadBuffer & istr) const
{
size_t size;
readVarUInt(size, istr);
field = Map(size);
for (auto & elem : field.get<Map &>())
field = Map();
Map & map = field.get<Map &>();
map.reserve(size);
for (size_t i = 0; i < size; ++i)
{
Tuple tuple(2);
key->deserializeBinary(tuple[0], istr);
value->deserializeBinary(tuple[1], istr);
elem = std::move(tuple);
map.push_back(std::move(tuple));
}
}

@ -5,6 +5,11 @@
namespace DB
{

/// Serialization wrapper that acts like nested serialization,
/// but adds a passed name to the substream path like the
/// read column was the tuple element with this name.
/// It's used while reading subcolumns of complex types.
/// In particular while reading components of named tuples.
class SerializationNamed final : public SerializationWrapper
{
private:
@ -1,4 +1,3 @@
#include <base/range.h>
#include <DataTypes/Serializations/SerializationTuple.h>
#include <DataTypes/Serializations/SerializationInfoTuple.h>
#include <DataTypes/DataTypeTuple.h>
@ -44,11 +43,11 @@ void SerializationTuple::deserializeBinary(Field & field, ReadBuffer & istr) con
{
const size_t size = elems.size();

Tuple tuple(size);
for (const auto i : collections::range(0, size))
elems[i]->deserializeBinary(tuple[i], istr);

field = tuple;
field = Tuple();
Tuple & tuple = get<Tuple &>(field);
tuple.reserve(size);
for (size_t i = 0; i < size; ++i)
elems[i]->deserializeBinary(tuple.emplace_back(), istr);
}

void SerializationTuple::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const
@ -73,7 +72,7 @@ static void addElementSafe(size_t num_elems, IColumn & column, F && impl)

// Check that all columns now have the same size.
size_t new_size = column.size();
for (auto i : collections::range(1, num_elems))
for (size_t i = 1; i < num_elems; ++i)
{
const auto & element_column = extractElementColumn(column, i);
if (element_column.size() != new_size)
@ -87,7 +86,7 @@ static void addElementSafe(size_t num_elems, IColumn & column, F && impl)
}
catch (...)
{
for (const auto & i : collections::range(0, num_elems))
for (size_t i = 0; i < num_elems; ++i)
{
auto & element_column = extractElementColumn(column, i);
if (element_column.size() > old_size)
@ -102,7 +101,7 @@ void SerializationTuple::deserializeBinary(IColumn & column, ReadBuffer & istr)
{
addElementSafe(elems.size(), column, [&]
{
for (const auto & i : collections::range(0, elems.size()))
for (size_t i = 0; i < elems.size(); ++i)
elems[i]->deserializeBinary(extractElementColumn(column, i), istr);
});
}
@ -110,7 +109,7 @@ void SerializationTuple::deserializeBinary(IColumn & column, ReadBuffer & istr)
void SerializationTuple::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
writeChar('(', ostr);
for (const auto i : collections::range(0, elems.size()))
for (size_t i = 0; i < elems.size(); ++i)
{
if (i != 0)
writeChar(',', ostr);
@ -126,7 +125,7 @@ void SerializationTuple::deserializeText(IColumn & column, ReadBuffer & istr, co

addElementSafe(elems.size(), column, [&]
{
for (const auto i : collections::range(0, size))
for (size_t i = 0; i < size; ++i)
{
skipWhitespaceIfAny(istr);
if (i != 0)
@ -158,7 +157,7 @@ void SerializationTuple::serializeTextJSON(const IColumn & column, size_t row_nu
&& have_explicit_names)
{
writeChar('{', ostr);
for (const auto i : collections::range(0, elems.size()))
for (size_t i = 0; i < elems.size(); ++i)
{
if (i != 0)
{
@ -173,7 +172,7 @@ void SerializationTuple::serializeTextJSON(const IColumn & column, size_t row_nu
else
{
writeChar('[', ostr);
for (const auto i : collections::range(0, elems.size()))
for (size_t i = 0; i < elems.size(); ++i)
{
if (i != 0)
writeChar(',', ostr);
@ -195,7 +194,7 @@ void SerializationTuple::deserializeTextJSON(IColumn & column, ReadBuffer & istr
addElementSafe(elems.size(), column, [&]
{
// Require all elements but in arbitrary order.
for (auto i : collections::range(0, elems.size()))
for (size_t i = 0; i < elems.size(); ++i)
{
if (i > 0)
{
@ -226,7 +225,7 @@ void SerializationTuple::deserializeTextJSON(IColumn & column, ReadBuffer & istr

addElementSafe(elems.size(), column, [&]
{
for (const auto i : collections::range(0, size))
for (size_t i = 0; i < size; ++i)
{
skipWhitespaceIfAny(istr);
if (i != 0)
@ -246,7 +245,7 @@ void SerializationTuple::deserializeTextJSON(IColumn & column, ReadBuffer & istr
void SerializationTuple::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
writeCString("<tuple>", ostr);
for (const auto i : collections::range(0, elems.size()))
for (size_t i = 0; i < elems.size(); ++i)
{
writeCString("<elem>", ostr);
elems[i]->serializeTextXML(extractElementColumn(column, i), row_num, ostr, settings);
@ -257,7 +256,7 @@ void SerializationTuple::serializeTextXML(const IColumn & column, size_t row_num

void SerializationTuple::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
for (const auto i : collections::range(0, elems.size()))
for (size_t i = 0; i < elems.size(); ++i)
{
if (i != 0)
writeChar(settings.csv.tuple_delimiter, ostr);
@ -270,7 +269,7 @@ void SerializationTuple::deserializeTextCSV(IColumn & column, ReadBuffer & istr,
addElementSafe(elems.size(), column, [&]
{
const size_t size = elems.size();
for (const auto i : collections::range(0, size))
for (size_t i = 0; i < size; ++i)
{
if (i != 0)
{
@ -362,7 +361,7 @@ void SerializationTuple::serializeBinaryBulkWithMultipleStreams(
{
auto * tuple_state = checkAndGetState<SerializeBinaryBulkStateTuple>(state);

for (const auto i : collections::range(0, elems.size()))
for (size_t i = 0; i < elems.size(); ++i)
{
const auto & element_col = extractElementColumn(column, i);
elems[i]->serializeBinaryBulkWithMultipleStreams(element_col, offset, limit, settings, tuple_state->states[i]);
@ -382,7 +381,7 @@ void SerializationTuple::deserializeBinaryBulkWithMultipleStreams(
auto & column_tuple = assert_cast<ColumnTuple &>(*mutable_column);

settings.avg_value_size_hint = 0;
for (const auto i : collections::range(0, elems.size()))
for (size_t i = 0; i < elems.size(); ++i)
elems[i]->deserializeBinaryBulkWithMultipleStreams(column_tuple.getColumnPtr(i), limit, settings, tuple_state->states[i], cache);
}

@ -17,6 +17,7 @@
#include <Databases/MySQL/MaterializeMetadata.h>
#include <Processors/Sources/MySQLSource.h>
#include <IO/ReadBufferFromString.h>
#include <IO/Operators.h>
#include <Interpreters/Context.h>
#include <Interpreters/executeQuery.h>
#include <Storages/StorageMergeTree.h>
@ -315,6 +316,47 @@ getTableOutput(const String & database_name, const String & table_name, ContextM
return std::move(res.pipeline);
}

static inline String reWriteMysqlQueryColumn(mysqlxx::Pool::Entry & connection, const String & database_name, const String & table_name, const Settings & global_settings)
{
Block tables_columns_sample_block
{
{ std::make_shared<DataTypeString>(), "column_name" },
{ std::make_shared<DataTypeString>(), "column_type" }
};

const String & query = "SELECT COLUMN_NAME AS column_name, COLUMN_TYPE AS column_type FROM INFORMATION_SCHEMA.COLUMNS"
" WHERE TABLE_SCHEMA = '" + backQuoteIfNeed(database_name) +
"' AND TABLE_NAME = '" + backQuoteIfNeed(table_name) + "' ORDER BY ORDINAL_POSITION";

StreamSettings mysql_input_stream_settings(global_settings, false, true);
auto mysql_source = std::make_unique<MySQLSource>(connection, query, tables_columns_sample_block, mysql_input_stream_settings);

Block block;
WriteBufferFromOwnString query_columns;
QueryPipeline pipeline(std::move(mysql_source));
PullingPipelineExecutor executor(pipeline);
while (executor.pull(block))
{
const auto & column_name_col = *block.getByPosition(0).column;
const auto & column_type_col = *block.getByPosition(1).column;
size_t rows = block.rows();
for (size_t i = 0; i < rows; ++i)
{
String column_name = column_name_col[i].safeGet<String>();
String column_type = column_type_col[i].safeGet<String>();
//we can do something special conversion to guarantee select results is the same as the binlog parse results
if (column_type.starts_with("set"))
{
query_columns << (backQuote(column_name) + " + 0");
} else
query_columns << backQuote(column_name);
query_columns << ",";
}
}
String query_columns_str = query_columns.str();
return query_columns_str.substr(0, query_columns_str.length() - 1);
}

static inline void dumpDataForTables(
mysqlxx::Pool::Entry & connection, const std::unordered_map<String, String> & need_dumping_tables,
const String & query_prefix, const String & database_name, const String & mysql_database_name,
@ -334,9 +376,10 @@ static inline void dumpDataForTables(

auto pipeline = getTableOutput(database_name, table_name, query_context);
StreamSettings mysql_input_stream_settings(context->getSettingsRef());
auto input = std::make_unique<MySQLSource>(
connection, "SELECT * FROM " + backQuoteIfNeed(mysql_database_name) + "." + backQuoteIfNeed(table_name),
pipeline.getHeader(), mysql_input_stream_settings);
String mysql_select_all_query = "SELECT " + reWriteMysqlQueryColumn(connection, mysql_database_name, table_name, context->getSettings()) + " FROM "
+ backQuoteIfNeed(mysql_database_name) + "." + backQuoteIfNeed(table_name);
LOG_INFO(&Poco::Logger::get("MaterializedMySQLSyncThread(" + database_name + ")"), "mysql_select_all_query is {}", mysql_select_all_query);
auto input = std::make_unique<MySQLSource>(connection, mysql_select_all_query, pipeline.getHeader(), mysql_input_stream_settings);
auto counting = std::make_shared<CountingTransform>(pipeline.getHeader());
Pipe pipe(std::move(input));
pipe.addTransform(counting);
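For a hypothetical MySQL table db.t(id INT, flags SET('a','b')), the rewrite above produces a dump query along the lines of SELECT `id`,`flags` + 0 FROM db.t, so SET values arrive as the same numeric bitmap the binlog parser yields. The column-list construction in isolation; a sketch using std::string instead of ClickHouse's WriteBuffer helpers:

#include <string>
#include <utility>
#include <vector>

/// Build the SELECT column list: SET columns are read as `col` + 0 so the
/// server returns the numeric bitmap instead of the textual member list.
std::string buildColumnList(const std::vector<std::pair<std::string, std::string>> & name_and_type)
{
    std::string columns;
    for (const auto & [name, type] : name_and_type)
    {
        if (!columns.empty())
            columns += ",";
        if (type.rfind("set", 0) == 0)  // equivalent of type.starts_with("set")
            columns += "`" + name + "` + 0";
        else
            columns += "`" + name + "`";
    }
    return columns;
}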
@ -60,8 +60,8 @@ private:
const auto & attributes_types_to_read = coordinator->getAttributesTypesToRead();
const auto & attributes_default_values_columns = coordinator->getAttributesDefaultValuesColumns();

const auto & dictionary = coordinator->getDictionary();
auto attributes_columns = dictionary->getColumns(
const auto & read_columns_func = coordinator->getReadColumnsFunc();
auto attributes_columns = read_columns_func(
attributes_names_to_read,
attributes_types_to_read,
key_columns,
@ -19,6 +19,8 @@ class DictionarySourceCoordinator final : public shared_ptr_helper<DictionarySou
|
||||
|
||||
public:
|
||||
|
||||
using ReadColumnsFunc = std::function<Columns (const Strings &, const DataTypes &, const Columns &, const DataTypes &, const Columns &)>;
|
||||
|
||||
Pipe read(size_t num_streams);
|
||||
|
||||
private:
|
||||
@ -31,6 +33,15 @@ private:
|
||||
: dictionary(std::move(dictionary_))
|
||||
, key_columns_with_type(std::move(key_columns_with_type_))
|
||||
, max_block_size(max_block_size_)
|
||||
, read_columns_func([this](
|
||||
const Strings & attribute_names,
|
||||
const DataTypes & result_types,
|
||||
const Columns & key_columns,
|
||||
const DataTypes & key_types,
|
||||
const Columns & default_values_columns)
|
||||
{
|
||||
return dictionary->getColumns(attribute_names, result_types, key_columns, key_types, default_values_columns);
|
||||
})
|
||||
{
|
||||
initialize(column_names);
|
||||
}
|
||||
@ -45,6 +56,31 @@ private:
|
||||
, key_columns_with_type(std::move(key_columns_with_type_))
|
||||
, data_columns_with_type(std::move(data_columns_with_type_))
|
||||
, max_block_size(max_block_size_)
|
||||
, read_columns_func([this](
|
||||
const Strings & attribute_names,
|
||||
const DataTypes & result_types,
|
||||
const Columns & key_columns,
|
||||
const DataTypes & key_types,
|
||||
const Columns & default_values_columns)
|
||||
{
|
||||
return dictionary->getColumns(attribute_names, result_types, key_columns, key_types, default_values_columns);
|
||||
})
|
||||
{
|
||||
initialize(column_names);
|
||||
}
|
||||
|
||||
explicit DictionarySourceCoordinator(
|
||||
std::shared_ptr<const IDictionary> dictionary_,
|
||||
const Names & column_names,
|
||||
ColumnsWithTypeAndName && key_columns_with_type_,
|
||||
ColumnsWithTypeAndName && data_columns_with_type_,
|
||||
size_t max_block_size_,
|
||||
ReadColumnsFunc read_columns_func_)
|
||||
: dictionary(std::move(dictionary_))
|
||||
, key_columns_with_type(std::move(key_columns_with_type_))
|
||||
, data_columns_with_type(std::move(data_columns_with_type_))
|
||||
, max_block_size(max_block_size_)
|
||||
, read_columns_func(std::move(read_columns_func_))
|
||||
{
|
||||
initialize(column_names);
|
||||
}
|
||||
@ -61,6 +97,8 @@ private:
|
||||
|
||||
const std::vector<ColumnPtr> & getAttributesDefaultValuesColumns() const { return attributes_default_values_columns; }
|
||||
|
||||
const ReadColumnsFunc & getReadColumnsFunc() const { return read_columns_func; }
|
||||
|
||||
const std::shared_ptr<const IDictionary> & getDictionary() const { return dictionary; }
|
||||
|
||||
void initialize(const Names & column_names);
|
||||
@ -79,6 +117,8 @@ private:
|
||||
std::vector<ColumnPtr> attributes_default_values_columns;
|
||||
|
||||
const size_t max_block_size;
|
||||
ReadColumnsFunc read_columns_func;
|
||||
|
||||
std::atomic<size_t> parallel_read_block_index = 0;
|
||||
};
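The point of the new ReadColumnsFunc member above is that the coordinator no longer calls the dictionary directly; the read path is an injectable std::function. A stripped-down sketch of that decoupling pattern, with simplified stand-in types and made-up names rather than the real ClickHouse classes:

// Toy sketch of injecting a column reader through std::function.
#include <functional>
#include <iostream>
#include <string>
#include <vector>

using Columns = std::vector<std::string>; // stand-in for real column types
using ReadColumnsFunc = std::function<Columns(const std::vector<std::string> &)>;

struct Coordinator
{
    ReadColumnsFunc read_columns_func;
    Columns read(const std::vector<std::string> & names) { return read_columns_func(names); }
};

int main()
{
    // Tests or alternative sources can supply their own reader here.
    Coordinator coordinator{[](const std::vector<std::string> & names)
    {
        Columns result;
        for (const auto & name : names)
            result.push_back("column:" + name); // a fake "dictionary read"
        return result;
    }};

    for (const auto & col : coordinator.read({"value", "tag"}))
        std::cout << col << '\n';
}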

@ -382,7 +382,8 @@ std::vector<DictionaryAttribute> DictionaryStructure::getAttributes(

void DictionaryStructure::parseRangeConfiguration(const Poco::Util::AbstractConfiguration & config, const std::string & structure_prefix)
{
    const char * range_default_type = "Date";
    static constexpr auto range_default_type = "Date";

    if (config.has(structure_prefix + ".range_min"))
        range_min.emplace(makeDictionaryTypedSpecialAttribute(config, structure_prefix + ".range_min", range_default_type));

@ -395,7 +396,10 @@ void DictionaryStructure::parseRangeConfiguration(const Poco::Util::AbstractConf
            "Dictionary structure should have both 'range_min' and 'range_max' either specified or not.");
    }

    if (range_min && range_max && !range_min->type->equals(*range_max->type))
    if (!range_min)
        return;

    if (!range_min->type->equals(*range_max->type))
    {
        throw Exception(ErrorCodes::BAD_ARGUMENTS,
            "Dictionary structure 'range_min' and 'range_max' should have same type, "
@ -405,15 +409,20 @@ void DictionaryStructure::parseRangeConfiguration(const Poco::Util::AbstractConf
            range_max->type->getName());
    }

    if (range_min && !range_min->type->isValueRepresentedByInteger())
    WhichDataType range_type(range_min->type);

    bool valid_range = range_type.isInt() || range_type.isUInt() || range_type.isDecimal() || range_type.isFloat() || range_type.isEnum()
        || range_type.isDate() || range_type.isDate32() || range_type.isDateTime() || range_type.isDateTime64();

    if (!valid_range)
    {
        throw Exception(ErrorCodes::BAD_ARGUMENTS,
            "Dictionary structure type of 'range_min' and 'range_max' should be an integer, Date, DateTime, or Enum."
            "Dictionary structure type of 'range_min' and 'range_max' should be an Integer, Float, Decimal, Date, Date32, DateTime, DateTime64, or Enum."
            " Actual 'range_min' and 'range_max' type is {}",
            range_min->type->getName());
    }

    if ((range_min && !range_min->expression.empty()) || (range_max && !range_max->expression.empty()))
    if (!range_min->expression.empty() || !range_max->expression.empty())
        has_expressions = true;
}

File diff suppressed because it is too large
@ -19,7 +19,18 @@

namespace DB
{

using RangeStorageType = Int64;
enum class RangeHashedDictionaryLookupStrategy : uint8_t
{
    min,
    max
};

struct RangeHashedDictionaryConfiguration
{
    bool convert_null_range_bound_to_open;
    RangeHashedDictionaryLookupStrategy lookup_strategy;
    bool require_nonempty;
};

template <DictionaryKeyType dictionary_key_type>
class RangeHashedDictionary final : public IDictionary
@ -31,11 +42,17 @@ public:
        const StorageID & dict_id_,
        const DictionaryStructure & dict_struct_,
        DictionarySourcePtr source_ptr_,
        const DictionaryLifetime dict_lifetime_,
        bool require_nonempty_,
        DictionaryLifetime dict_lifetime_,
        RangeHashedDictionaryConfiguration configuration_,
        BlockPtr update_field_loaded_block_ = nullptr);

    std::string getTypeName() const override { return "RangeHashed"; }
    std::string getTypeName() const override
    {
        if constexpr (dictionary_key_type == DictionaryKeyType::Simple)
            return "RangeHashed";
        else
            return "ComplexKeyRangeHashed";
    }

    size_t getBytesAllocated() const override { return bytes_allocated; }

@ -57,7 +74,15 @@ public:

    std::shared_ptr<const IExternalLoadable> clone() const override
    {
        return std::make_shared<RangeHashedDictionary>(getDictionaryID(), dict_struct, source_ptr->clone(), dict_lifetime, require_nonempty, update_field_loaded_block);
        auto result = std::make_shared<RangeHashedDictionary>(
            getDictionaryID(),
            dict_struct,
            source_ptr->clone(),
            dict_lifetime,
            configuration,
            update_field_loaded_block);

        return result;
    }

    DictionarySourcePtr getSource() const override { return source_ptr; }
@ -76,7 +101,7 @@ public:
    DictionarySpecialKeyType getSpecialKeyType() const override { return DictionarySpecialKeyType::Range; }

    ColumnPtr getColumn(
        const std::string& attribute_name,
        const std::string & attribute_name,
        const DataTypePtr & result_type,
        const Columns & key_columns,
        const DataTypes & key_types,
@ -88,52 +113,90 @@ public:

private:

    using RangeInterval = Interval<RangeStorageType>;
    template <typename RangeStorageType>
    using IntervalMap = IntervalMap<Interval<RangeStorageType>, size_t>;

    template <typename T>
    using Values = IntervalMap<RangeInterval, std::optional<T>>;
    template <typename RangeStorageType>
    using KeyAttributeContainerType = std::conditional_t<
        dictionary_key_type == DictionaryKeyType::Simple,
        HashMap<UInt64, IntervalMap<RangeStorageType>, DefaultHash<UInt64>>,
        HashMapWithSavedHash<StringRef, IntervalMap<RangeStorageType>, DefaultHash<StringRef>>>;

    template <typename Value>
    using CollectionType = std::conditional_t<
        dictionary_key_type == DictionaryKeyType::Simple,
        HashMap<UInt64, Values<Value>, DefaultHash<UInt64>>,
        HashMapWithSavedHash<StringRef, Values<Value>, DefaultHash<StringRef>>>;

    using NoAttributesCollectionType = std::conditional_t<
        dictionary_key_type == DictionaryKeyType::Simple,
        HashMap<UInt64, IntervalSet<RangeInterval>>,
        HashMapWithSavedHash<StringRef, IntervalSet<RangeInterval>>>;
    using AttributeContainerType = std::conditional_t<std::is_same_v<Value, Array>, std::vector<Value>, PaddedPODArray<Value>>;

    struct Attribute final
    {
    public:
        AttributeUnderlyingType type;
        bool is_nullable;

        std::variant<
            CollectionType<UInt8>,
            CollectionType<UInt16>,
            CollectionType<UInt32>,
            CollectionType<UInt64>,
            CollectionType<UInt128>,
            CollectionType<UInt256>,
            CollectionType<Int8>,
            CollectionType<Int16>,
            CollectionType<Int32>,
            CollectionType<Int64>,
            CollectionType<Int128>,
            CollectionType<Int256>,
            CollectionType<Decimal32>,
            CollectionType<Decimal64>,
            CollectionType<Decimal128>,
            CollectionType<Decimal256>,
            CollectionType<DateTime64>,
            CollectionType<Float32>,
            CollectionType<Float64>,
            CollectionType<UUID>,
            CollectionType<StringRef>,
            CollectionType<Array>>
            maps;
            AttributeContainerType<UInt8>,
            AttributeContainerType<UInt16>,
            AttributeContainerType<UInt32>,
            AttributeContainerType<UInt64>,
            AttributeContainerType<UInt128>,
            AttributeContainerType<UInt256>,
            AttributeContainerType<Int8>,
            AttributeContainerType<Int16>,
            AttributeContainerType<Int32>,
            AttributeContainerType<Int64>,
            AttributeContainerType<Int128>,
            AttributeContainerType<Int256>,
            AttributeContainerType<Decimal32>,
            AttributeContainerType<Decimal64>,
            AttributeContainerType<Decimal128>,
            AttributeContainerType<Decimal256>,
            AttributeContainerType<DateTime64>,
            AttributeContainerType<Float32>,
            AttributeContainerType<Float64>,
            AttributeContainerType<UUID>,
            AttributeContainerType<StringRef>,
            AttributeContainerType<Array>>
            container;

        std::optional<std::vector<bool>> is_value_nullable;
    };

    template <typename RangeStorageType>
    struct InvalidIntervalWithKey
    {
        KeyType key;
        Interval<RangeStorageType> interval;
        size_t attribute_value_index;
    };

    template <typename RangeStorageType>
    using InvalidIntervalsContainerType = PaddedPODArray<InvalidIntervalWithKey<RangeStorageType>>;

    template <template<typename> typename ContainerType>
    using RangeStorageTypeContainer = std::variant<
        ContainerType<UInt8>,
        ContainerType<UInt16>,
        ContainerType<UInt32>,
        ContainerType<UInt64>,
        ContainerType<UInt128>,
        ContainerType<UInt256>,
        ContainerType<Int8>,
        ContainerType<Int16>,
        ContainerType<Int32>,
        ContainerType<Int64>,
        ContainerType<Int128>,
        ContainerType<Int256>,
        ContainerType<Decimal32>,
        ContainerType<Decimal64>,
        ContainerType<Decimal128>,
        ContainerType<Decimal256>,
        ContainerType<DateTime64>,
        ContainerType<Float32>,
        ContainerType<Float64>,
        ContainerType<UUID>>;

    struct KeyAttribute final
    {
        RangeStorageTypeContainer<KeyAttributeContainerType> container;

        RangeStorageTypeContainer<InvalidIntervalsContainerType> invalid_intervals_container;
    };

    void createAttributes();
@ -151,43 +214,31 @@ private:
        ValueSetter && set_value,
        DefaultValueExtractor & default_value_extractor) const;

    ColumnPtr getColumnInternal(
        const std::string & attribute_name,
        const DataTypePtr & result_type,
        const PaddedPODArray<UInt64> & key_to_index) const;

    template <typename AttributeType, bool is_nullable, typename ValueSetter>
    void getItemsInternalImpl(
        const Attribute & attribute,
        const PaddedPODArray<UInt64> & key_to_index,
        ValueSetter && set_value) const;

    void updateData();

    void blockToAttributes(const Block & block);

    void buildAttributeIntervalTrees();

    template <typename T>
    void setAttributeValueImpl(Attribute & attribute, KeyType key, const RangeInterval & interval, const Field & value);

    void setAttributeValue(Attribute & attribute, KeyType key, const RangeInterval & interval, const Field & value);

    template <typename RangeType>
    void getKeysAndDates(
        PaddedPODArray<KeyType> & keys,
        PaddedPODArray<RangeType> & start_dates,
        PaddedPODArray<RangeType> & end_dates) const;

    template <typename T, typename RangeType>
    void getKeysAndDates(
        const Attribute & attribute,
        PaddedPODArray<KeyType> & keys,
        PaddedPODArray<RangeType> & start_dates,
        PaddedPODArray<RangeType> & end_dates) const;

    template <typename RangeType>
    PaddedPODArray<Int64> makeDateKeys(
        const PaddedPODArray<RangeType> & block_start_dates,
        const PaddedPODArray<RangeType> & block_end_dates) const;
    void setAttributeValue(Attribute & attribute, const Field & value);

    const DictionaryStructure dict_struct;
    const DictionarySourcePtr source_ptr;
    const DictionaryLifetime dict_lifetime;
    const bool require_nonempty;
    const RangeHashedDictionaryConfiguration configuration;
    BlockPtr update_field_loaded_block;

    std::vector<Attribute> attributes;
    Arena complex_key_arena;
    KeyAttribute key_attribute;

    size_t bytes_allocated = 0;
    size_t element_count = 0;
@ -195,7 +246,6 @@ private:
    mutable std::atomic<size_t> query_count{0};
    mutable std::atomic<size_t> found_count{0};
    Arena string_arena;
    NoAttributesCollectionType no_attributes_container;
};

}
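A notable detail above is RangeStorageTypeContainer: a std::variant instantiated over a template-template parameter, so a single alias covers every (container, numeric type) pairing. A small self-contained sketch of the same trick, using made-up names and only the standard library:

// Illustrative only: variant over a template-template parameter (C++17).
#include <cstdint>
#include <iostream>
#include <variant>
#include <vector>

template <template <typename> typename ContainerType>
using NumericContainer = std::variant<ContainerType<uint8_t>, ContainerType<int64_t>, ContainerType<double>>;

template <typename T>
using Vec = std::vector<T>;

int main()
{
    // One alias, many concrete container/type combinations.
    NumericContainer<Vec> container = Vec<int64_t>{1, 2, 3};
    std::visit([](const auto & v) { std::cout << "size = " << v.size() << '\n'; }, container);
}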

@ -29,6 +29,7 @@ namespace ErrorCodes
    extern const int CANNOT_TRUNCATE_FILE;
    extern const int CANNOT_UNLINK;
    extern const int CANNOT_RMDIR;
    extern const int BAD_ARGUMENTS;
}

std::mutex DiskLocal::reservation_mutex;
@ -458,10 +459,16 @@ void registerDiskLocal(DiskFactory & factory)
        const Poco::Util::AbstractConfiguration & config,
        const String & config_prefix,
        ContextPtr context,
        const DisksMap & /*map*/) -> DiskPtr {
        const DisksMap & map) -> DiskPtr {
        String path;
        UInt64 keep_free_space_bytes;
        loadDiskLocalConfig(name, config, config_prefix, context, path, keep_free_space_bytes);

        for (const auto & [disk_name, disk_ptr] : map)
        {
            if (path == disk_ptr->getPath())
                throw Exception("Disk " + name + " and Disk " + disk_name + " cannot have the same path" + " (" + path + ")", ErrorCodes::BAD_ARGUMENTS);
        }
        return std::make_shared<DiskLocal>(name, path, keep_free_space_bytes);
    };
    factory.registerDiskType("local", creator);

@ -13,6 +13,8 @@
#include <Processors/Formats/Impl/ValuesBlockInputFormat.h>
#include <Poco/URI.h>
#include <Common/Exception.h>
#include <fcntl.h>
#include <unistd.h>

#include <boost/algorithm/string/case_conv.hpp>

@ -431,6 +433,9 @@ void FormatFactory::registerFileExtension(const String & extension, const String

String FormatFactory::getFormatFromFileName(String file_name, bool throw_if_not_found)
{
    if (file_name == "stdin")
        return getFormatFromFileDescriptor(STDIN_FILENO);

    CompressionMethod compression_method = chooseCompressionMethod(file_name, "");
    if (CompressionMethod::None != compression_method)
    {
@ -459,6 +464,25 @@ String FormatFactory::getFormatFromFileName(String file_name, bool throw_if_not_
    return it->second;
}

String FormatFactory::getFormatFromFileDescriptor(int fd)
{
#ifdef OS_LINUX
    char buf[32] = {'\0'};
    snprintf(buf, sizeof(buf), "/proc/self/fd/%d", fd);
    char file_path[PATH_MAX] = {'\0'};
    if (readlink(buf, file_path, sizeof(file_path) - 1) != -1)
        return getFormatFromFileName(file_path, false);
    return "";
#elif defined(__APPLE__)
    char file_path[PATH_MAX] = {'\0'};
    if (fcntl(fd, F_GETPATH, file_path) != -1)
        return getFormatFromFileName(file_path, false);
    return "";
#else
    return "";
#endif
}
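The Linux branch above recovers the file name behind a descriptor by reading the /proc/self/fd symlink, so the extension can drive format detection even for stdin. A self-contained sketch of the same technique in plain POSIX (no ClickHouse types; Linux only):

// Resolve a descriptor back to its path via /proc/self/fd.
#include <climits>
#include <cstdio>
#include <unistd.h>

int main()
{
    char link[32];
    std::snprintf(link, sizeof(link), "/proc/self/fd/%d", STDIN_FILENO);

    char path[PATH_MAX] = {'\0'};
    if (readlink(link, path, sizeof(path) - 1) != -1)
        std::printf("stdin resolves to %s\n", path); // e.g. "/tmp/data.csv" when run as `./a.out < /tmp/data.csv`
    return 0;
}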

void FormatFactory::registerFileSegmentationEngine(const String & name, FileSegmentationEngine file_segmentation_engine)
{
    auto & target = dict[name].file_segmentation_engine;

@ -187,6 +187,7 @@ public:
    /// Register file extension for format
    void registerFileExtension(const String & extension, const String & format_name);
    String getFormatFromFileName(String file_name, bool throw_if_not_found = false);
    String getFormatFromFileDescriptor(int fd);

    /// Register schema readers for a format by its name.
    void registerSchemaReader(const String & name, SchemaReaderCreator schema_reader_creator);

@ -17,7 +17,12 @@ namespace ErrorCodes
    extern const int BAD_ARGUMENTS;
}

ColumnsDescription readSchemaFromFormat(const String & format_name, const std::optional<FormatSettings> & format_settings, ReadBufferCreator read_buffer_creator, ContextPtr context)
ColumnsDescription readSchemaFromFormat(
    const String & format_name,
    const std::optional<FormatSettings> & format_settings,
    ReadBufferCreator read_buffer_creator,
    ContextPtr context,
    std::unique_ptr<ReadBuffer> & buf_out)
{
    NamesAndTypesList names_and_types;
    if (FormatFactory::instance().checkIfFormatHasExternalSchemaReader(format_name))
@ -34,11 +39,11 @@ ColumnsDescription readSchemaFromFormat(const String & format_name, const std::o
    }
    else if (FormatFactory::instance().checkIfFormatHasSchemaReader(format_name))
    {
        auto read_buf = read_buffer_creator();
        if (read_buf->eof())
        buf_out = read_buffer_creator();
        if (buf_out->eof())
            throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot extract table structure from {} format file, file is empty", format_name);

        auto schema_reader = FormatFactory::instance().getSchemaReader(format_name, *read_buf, context, format_settings);
        auto schema_reader = FormatFactory::instance().getSchemaReader(format_name, *buf_out, context, format_settings);
        try
        {
            names_and_types = schema_reader->readSchema();
@ -54,6 +59,12 @@ ColumnsDescription readSchemaFromFormat(const String & format_name, const std::o
    return ColumnsDescription(names_and_types);
}

ColumnsDescription readSchemaFromFormat(const String & format_name, const std::optional<FormatSettings> & format_settings, ReadBufferCreator read_buffer_creator, ContextPtr context)
{
    std::unique_ptr<ReadBuffer> buf_out;
    return readSchemaFromFormat(format_name, format_settings, read_buffer_creator, context, buf_out);
}
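The convenience overload above simply forwards to the five-argument version with a throwaway out-parameter, so callers that want to reuse the buffer consumed during schema inference can take the longer overload. A toy sketch of that overload pattern (made-up names, not the real API):

// Overload pair: full version exposes the buffer, short version discards it.
#include <iostream>
#include <memory>
#include <string>

std::string readSchema(const std::string & source, std::unique_ptr<std::string> & buf_out)
{
    buf_out = std::make_unique<std::string>(source); // the caller may reuse this "buffer"
    return "schema(" + source + ")";
}

std::string readSchema(const std::string & source)
{
    std::unique_ptr<std::string> buf_out;
    return readSchema(source, buf_out); // buffer silently discarded
}

int main()
{
    std::cout << readSchema("data.csv") << '\n';
}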

DataTypePtr generalizeDataType(DataTypePtr type)
{
    WhichDataType which(type);

@ -15,7 +15,19 @@ namespace DB
/// If format doesn't have any schema reader or a schema reader
/// couldn't determine the schema, an exception will be thrown.
using ReadBufferCreator = std::function<std::unique_ptr<ReadBuffer>()>;
ColumnsDescription readSchemaFromFormat(const String & format_name, const std::optional<FormatSettings> & format_settings, ReadBufferCreator read_buffer_creator, ContextPtr context);
ColumnsDescription readSchemaFromFormat(
    const String & format_name,
    const std::optional<FormatSettings> & format_settings,
    ReadBufferCreator read_buffer_creator,
    ContextPtr context);

/// If a ReadBuffer is created, it will be written to buf_out.
ColumnsDescription readSchemaFromFormat(
    const String & format_name,
    const std::optional<FormatSettings> & format_settings,
    ReadBufferCreator read_buffer_creator,
    ContextPtr context,
    std::unique_ptr<ReadBuffer> & buf_out);

/// Convert type to the most general type:
/// - IntN, UIntN, FloatN, Decimal -> Float64

@ -76,6 +76,10 @@ endif()

target_link_libraries(clickhouse_functions PRIVATE ch_contrib::lz4)

if (ENABLE_NLP)
    target_link_libraries(clickhouse_functions PRIVATE ch_contrib::cld2)
endif()

if (TARGET ch_contrib::h3)
    target_link_libraries (clickhouse_functions PRIVATE ch_contrib::h3)
endif()

@ -125,7 +125,7 @@ private:
    {
        const auto & src_data = col->getData();
        const size_t size = src_data.size();
        UInt32 scale = src_data.getScale();
        UInt32 scale = col->getScale();

        auto dst = ColumnVector<ReturnType>::create();
        auto & dst_data = dst->getData();

@ -18,6 +18,7 @@ namespace ErrorCodes
    extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
    extern const int DECIMAL_OVERFLOW;
    extern const int ILLEGAL_COLUMN;
}

/// Cast DateTime64 to Int64 representation narrowed down (or scaled up) to any scale value defined in Impl.
@ -108,8 +109,8 @@ public:
    if (arguments.size() < 1 || arguments.size() > 2)
        throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} takes one or two arguments", name);

    if (!typeid_cast<const DataTypeInt64 *>(arguments[0].type.get()))
        throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "The first argument for function {} must be Int64", name);
    if (!isInteger(arguments[0].type))
        throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "The first argument for function {} must be integer", name);

    std::string timezone;
    if (arguments.size() == 2)
@ -118,21 +119,48 @@ public:
        return std::make_shared<DataTypeDateTime64>(target_scale, timezone);
    }

    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
    template <typename T>
    bool executeType(auto & result_column, const ColumnsWithTypeAndName & arguments, size_t input_rows_count) const
    {
        const auto & src = arguments[0];
        const auto & col = *src.column;

        auto res_column = ColumnDecimal<DateTime64>::create(input_rows_count, target_scale);
        auto & result_data = res_column->getData();
        if (!checkAndGetColumn<ColumnVector<T>>(col))
            return 0;

        const auto & source_data = typeid_cast<const ColumnInt64 &>(col).getData();
        auto & result_data = result_column->getData();

        const auto & source_data = typeid_cast<const ColumnVector<T> &>(col).getData();

        for (size_t i = 0; i < input_rows_count; ++i)
            result_data[i] = source_data[i];

        return res_column;
        return 1;
    }

    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
    {
        auto result_column = ColumnDecimal<DateTime64>::create(input_rows_count, target_scale);

        if (!((executeType<UInt8>(result_column, arguments, input_rows_count))
            || (executeType<UInt16>(result_column, arguments, input_rows_count))
            || (executeType<UInt32>(result_column, arguments, input_rows_count))
            || (executeType<UInt64>(result_column, arguments, input_rows_count))
            || (executeType<Int8>(result_column, arguments, input_rows_count))
            || (executeType<Int16>(result_column, arguments, input_rows_count))
            || (executeType<Int32>(result_column, arguments, input_rows_count))
            || (executeType<Int64>(result_column, arguments, input_rows_count))))
        {
            throw Exception(ErrorCodes::ILLEGAL_COLUMN,
                "Illegal column {} of first argument of function {}",
                arguments[0].column->getName(),
                getName());
        }

        return result_column;
    }

};

}
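The executeImpl above walks an ||-chain of executeType<T> calls; each one checks the column's concrete type and returns false on mismatch, so the chain stops at the first match. A self-contained sketch of that dispatch pattern with simplified stand-in types (not the real IColumn hierarchy):

// Try-each-concrete-type dispatch over a polymorphic column.
#include <cstdint>
#include <iostream>
#include <memory>
#include <vector>

struct IColumn { virtual ~IColumn() = default; };

template <typename T>
struct ColumnVector : IColumn { std::vector<T> data; };

template <typename T>
bool executeType(const IColumn & col)
{
    const auto * concrete = dynamic_cast<const ColumnVector<T> *>(&col);
    if (!concrete)
        return false; // wrong concrete type: let the next candidate try
    std::cout << "matched a column of " << sizeof(T) << "-byte integers\n";
    return true;
}

int main()
{
    std::unique_ptr<IColumn> col = std::make_unique<ColumnVector<int32_t>>();
    if (!(executeType<uint8_t>(*col) || executeType<int32_t>(*col) || executeType<int64_t>(*col)))
        std::cout << "unsupported column type\n";
}
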
142
src/Functions/FunctionsCharsetClassification.cpp
Normal file
@ -0,0 +1,142 @@
#include <Common/FrequencyHolder.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionsTextClassification.h>

#include <memory>
#include <unordered_map>

namespace DB
{

/* Determine the language and charset of text data. For each text, we build the distribution of byte bigrams.
 * Then we compare it against marked-up dictionaries with byte-bigram distributions of various languages and charsets.
 * Using a naive Bayes classifier, we find the most likely charset and language and return it.
 */

template <bool detect_language>
struct CharsetClassificationImpl
{
    /* We need to solve the zero-frequency problem of the naive Bayes classifier:
     * if a bigram is not found in the model, we assume that the probability of encountering it is 1e-06,
     * the minimal value in our marked-up dictionary.
     */
    static constexpr Float64 zero_frequency = 1e-06;

    /// If the data size is bigger than this, the behaviour of this function is unspecified.
    static constexpr size_t max_string_size = 1u << 15;

    static ALWAYS_INLINE inline Float64 naiveBayes(
        const FrequencyHolder::EncodingMap & standard,
        const HashMap<UInt16, UInt64> & model,
        Float64 max_result)
    {
        Float64 res = 0;
        for (const auto & el : model)
        {
            /// Try to find the bigram in the dictionary.
            const auto * it = standard.find(el.getKey());
            if (it != standard.end())
                res += el.getMapped() * log(it->getMapped());
            else
                res += el.getMapped() * log(zero_frequency);

            /// If at some step the result has become less than the current maximum, it makes no sense to finish the computation.
            if (res < max_result)
                return res;
        }
        return res;
    }

    /// Count how many times each bigram occurs in the text.
    static ALWAYS_INLINE inline void calculateStats(
        const UInt8 * data,
        const size_t size,
        HashMap<UInt16, UInt64> & model)
    {
        UInt16 hash = 0;
        for (size_t i = 0; i < size; ++i)
        {
            hash <<= 8;
            hash += *(data + i);
            ++model[hash];
        }
    }

    static void vector(
        const ColumnString::Chars & data,
        const ColumnString::Offsets & offsets,
        ColumnString::Chars & res_data,
        ColumnString::Offsets & res_offsets)
    {
        const auto & encodings_freq = FrequencyHolder::getInstance().getEncodingsFrequency();

        if (detect_language)
            /// 2 chars for the ISO code + 1 zero byte
            res_data.reserve(offsets.size() * 3);
        else
            /// Mean charset name length is 8
            res_data.reserve(offsets.size() * 8);

        res_offsets.resize(offsets.size());

        size_t res_offset = 0;

        for (size_t i = 0; i < offsets.size(); ++i)
        {
            const UInt8 * str = data.data() + offsets[i - 1];
            const size_t str_len = offsets[i] - offsets[i - 1] - 1;

            std::string_view res;

            HashMap<UInt16, UInt64> model;
            calculateStats(str, str_len, model);

            /// Go through the dictionary and find the charset with the highest weight.
            Float64 max_result = log(zero_frequency) * (max_string_size);
            for (const auto & item : encodings_freq)
            {
                Float64 score = naiveBayes(item.map, model, max_result);
                if (max_result < score)
                {
                    max_result = score;
                    res = detect_language ? item.lang : item.name;
                }
            }

            res_data.resize(res_offset + res.size() + 1);
            memcpy(&res_data[res_offset], res.data(), res.size());

            res_data[res_offset + res.size()] = 0;
            res_offset += res.size() + 1;

            res_offsets[i] = res_offset;
        }
    }
};


struct NameDetectCharset
{
    static constexpr auto name = "detectCharset";
};

struct NameDetectLanguageUnknown
{
    static constexpr auto name = "detectLanguageUnknown";
};


using FunctionDetectCharset = FunctionTextClassificationString<CharsetClassificationImpl<false>, NameDetectCharset>;
using FunctionDetectLanguageUnknown = FunctionTextClassificationString<CharsetClassificationImpl<true>, NameDetectLanguageUnknown>;

void registerFunctionDetectCharset(FunctionFactory & factory)
{
    factory.registerFunction<FunctionDetectCharset>();
    factory.registerFunction<FunctionDetectLanguageUnknown>();
}

}
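To make the classifier above concrete, here is a self-contained sketch of the same two steps, assuming a toy frequency table in place of the trained FrequencyHolder dictionaries: count byte bigrams with the rolling 16-bit hash, then score the text with log-probabilities, substituting zero_frequency for unseen bigrams:

// Toy bigram naive Bayes scoring; illustrative, not ClickHouse code.
#include <cmath>
#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_map>

int main()
{
    const std::string text = "hello world";
    constexpr double zero_frequency = 1e-06;

    // Rolling 16-bit hash: previous byte in the high 8 bits, current byte in
    // the low 8 bits, i.e. a byte bigram per step.
    std::unordered_map<uint16_t, uint64_t> model;
    uint16_t hash = 0;
    for (unsigned char c : text)
    {
        hash = static_cast<uint16_t>((hash << 8) + c);
        ++model[hash];
    }

    // A made-up "charset" frequency table; real tables are trained offline.
    std::unordered_map<uint16_t, double> standard = {{('h' << 8) + 'e', 0.01}, {('l' << 8) + 'l', 0.02}};

    double score = 0;
    for (const auto & [bigram, count] : model)
    {
        auto it = standard.find(bigram);
        score += count * std::log(it != standard.end() ? it->second : zero_frequency);
    }
    std::cout << "log-likelihood: " << score << '\n';
}
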
@ -152,9 +152,11 @@ struct ConvertImpl
    if (const ColVecFrom * col_from = checkAndGetColumn<ColVecFrom>(named_from.column.get()))
    {
        typename ColVecTo::MutablePtr col_to = nullptr;

        if constexpr (IsDataTypeDecimal<ToDataType>)
        {
            UInt32 scale;

            if constexpr (std::is_same_v<Additions, AccurateConvertStrategyAdditions>
                || std::is_same_v<Additions, AccurateOrNullConvertStrategyAdditions>)
            {
@ -208,11 +210,11 @@ struct ConvertImpl
                bool convert_result = false;

                if constexpr (IsDataTypeDecimal<FromDataType> && IsDataTypeDecimal<ToDataType>)
                    convert_result = tryConvertDecimals<FromDataType, ToDataType>(vec_from[i], vec_from.getScale(), vec_to.getScale(), result);
                    convert_result = tryConvertDecimals<FromDataType, ToDataType>(vec_from[i], col_from->getScale(), col_to->getScale(), result);
                else if constexpr (IsDataTypeDecimal<FromDataType> && IsDataTypeNumber<ToDataType>)
                    convert_result = tryConvertFromDecimal<FromDataType, ToDataType>(vec_from[i], vec_from.getScale(), result);
                    convert_result = tryConvertFromDecimal<FromDataType, ToDataType>(vec_from[i], col_from->getScale(), result);
                else if constexpr (IsDataTypeNumber<FromDataType> && IsDataTypeDecimal<ToDataType>)
                    convert_result = tryConvertToDecimal<FromDataType, ToDataType>(vec_from[i], vec_to.getScale(), result);
                    convert_result = tryConvertToDecimal<FromDataType, ToDataType>(vec_from[i], col_to->getScale(), result);

                if (convert_result)
                    vec_to[i] = result;
@ -225,11 +227,11 @@ struct ConvertImpl
                else
                {
                    if constexpr (IsDataTypeDecimal<FromDataType> && IsDataTypeDecimal<ToDataType>)
                        vec_to[i] = convertDecimals<FromDataType, ToDataType>(vec_from[i], vec_from.getScale(), vec_to.getScale());
                        vec_to[i] = convertDecimals<FromDataType, ToDataType>(vec_from[i], col_from->getScale(), col_to->getScale());
                    else if constexpr (IsDataTypeDecimal<FromDataType> && IsDataTypeNumber<ToDataType>)
                        vec_to[i] = convertFromDecimal<FromDataType, ToDataType>(vec_from[i], vec_from.getScale());
                        vec_to[i] = convertFromDecimal<FromDataType, ToDataType>(vec_from[i], col_from->getScale());
                    else if constexpr (IsDataTypeNumber<FromDataType> && IsDataTypeDecimal<ToDataType>)
                        vec_to[i] = convertToDecimal<FromDataType, ToDataType>(vec_from[i], vec_to.getScale());
                        vec_to[i] = convertToDecimal<FromDataType, ToDataType>(vec_from[i], col_to->getScale());
                    else
                        throw Exception("Unsupported data type in conversion function", ErrorCodes::CANNOT_CONVERT_TYPE);
                }
@ -820,7 +822,7 @@ struct ConvertImpl<FromDataType, std::enable_if_t<!std::is_same_v<FromDataType,
        else if constexpr (std::is_same_v<FromDataType, DataTypeDateTime>)
            data_to.resize(size * (strlen("YYYY-MM-DD hh:mm:ss") + 1));
        else if constexpr (std::is_same_v<FromDataType, DataTypeDateTime64>)
            data_to.resize(size * (strlen("YYYY-MM-DD hh:mm:ss.") + vec_from.getScale() + 1));
            data_to.resize(size * (strlen("YYYY-MM-DD hh:mm:ss.") + col_from->getScale() + 1));
        else
            data_to.resize(size * 3); /// Arbitrary

@ -1169,7 +1171,7 @@ struct ConvertThroughParsing
            if constexpr (to_datetime64)
            {
                DateTime64 res = 0;
                parseDateTime64BestEffort(res, vec_to.getScale(), read_buffer, *local_time_zone, *utc_time_zone);
                parseDateTime64BestEffort(res, col_to->getScale(), read_buffer, *local_time_zone, *utc_time_zone);
                vec_to[i] = res;
            }
            else
@ -1184,7 +1186,7 @@ struct ConvertThroughParsing
            if constexpr (to_datetime64)
            {
                DateTime64 res = 0;
                parseDateTime64BestEffortUS(res, vec_to.getScale(), read_buffer, *local_time_zone, *utc_time_zone);
                parseDateTime64BestEffortUS(res, col_to->getScale(), read_buffer, *local_time_zone, *utc_time_zone);
                vec_to[i] = res;
            }
            else
@ -1199,12 +1201,12 @@ struct ConvertThroughParsing
            if constexpr (to_datetime64)
            {
                DateTime64 value = 0;
                readDateTime64Text(value, vec_to.getScale(), read_buffer, *local_time_zone);
                readDateTime64Text(value, col_to->getScale(), read_buffer, *local_time_zone);
                vec_to[i] = value;
            }
            else if constexpr (IsDataTypeDecimal<ToDataType>)
                SerializationDecimal<typename ToDataType::FieldType>::readText(
                    vec_to[i], read_buffer, ToDataType::maxPrecision(), vec_to.getScale());
                    vec_to[i], read_buffer, ToDataType::maxPrecision(), col_to->getScale());
            else
            {
                parseImpl<ToDataType>(vec_to[i], read_buffer, local_time_zone);
@ -1223,7 +1225,7 @@ struct ConvertThroughParsing
            if constexpr (to_datetime64)
            {
                DateTime64 res = 0;
                parsed = tryParseDateTime64BestEffort(res, vec_to.getScale(), read_buffer, *local_time_zone, *utc_time_zone);
                parsed = tryParseDateTime64BestEffort(res, col_to->getScale(), read_buffer, *local_time_zone, *utc_time_zone);
                vec_to[i] = res;
            }
            else
@ -1244,12 +1246,12 @@ struct ConvertThroughParsing
            if constexpr (to_datetime64)
            {
                DateTime64 value = 0;
                parsed = tryReadDateTime64Text(value, vec_to.getScale(), read_buffer, *local_time_zone);
                parsed = tryReadDateTime64Text(value, col_to->getScale(), read_buffer, *local_time_zone);
                vec_to[i] = value;
            }
            else if constexpr (IsDataTypeDecimal<ToDataType>)
                parsed = SerializationDecimal<typename ToDataType::FieldType>::tryReadText(
                    vec_to[i], read_buffer, ToDataType::maxPrecision(), vec_to.getScale());
                    vec_to[i], read_buffer, ToDataType::maxPrecision(), col_to->getScale());
            else
                parsed = tryParseImpl<ToDataType>(vec_to[i], read_buffer, local_time_zone);
        }
231
src/Functions/FunctionsLanguageClassification.cpp
Normal file
@ -0,0 +1,231 @@
#include "config_functions.h"

#if USE_NLP

#include <Columns/ColumnMap.h>
#include <Columns/ColumnArray.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnsNumber.h>
#include <Common/isValidUTF8.h>
#include <DataTypes/DataTypeMap.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypesNumber.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionsTextClassification.h>
#include <Interpreters/Context.h>

#include <compact_lang_det.h>

namespace DB
{
/* Determine the language of Unicode UTF-8 text.
 * Uses the cld2 library: https://github.com/CLD2Owners/cld2
 */

namespace ErrorCodes
{
    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
    extern const int ILLEGAL_COLUMN;
    extern const int SUPPORT_IS_DISABLED;
}

struct FunctionDetectLanguageImpl
{
    static ALWAYS_INLINE inline std::string_view codeISO(std::string_view code_string)
    {
        /// Strip script subtags, keeping only the bare language code.
        if (code_string.ends_with("-Latn"))
            code_string.remove_suffix(5);

        if (code_string.ends_with("-Hant"))
            code_string.remove_suffix(5);

        // Old deprecated codes
        if (code_string == "iw")
            return "he";

        if (code_string == "jw")
            return "jv";

        if (code_string == "in")
            return "id";

        if (code_string == "mo")
            return "ro";

        // Some languages do not have two-letter codes; for example, the code for Cebuano is "ceb".
        if (code_string.size() != 2)
            return "other";

        return code_string;
    }

    static void vector(
        const ColumnString::Chars & data,
        const ColumnString::Offsets & offsets,
        ColumnString::Chars & res_data,
        ColumnString::Offsets & res_offsets)
    {
        /// The constant 3 is based on the fact that in general we need 2 characters for the ISO code + 1 zero byte.
        res_data.reserve(offsets.size() * 3);
        res_offsets.resize(offsets.size());

        bool is_reliable;
        size_t res_offset = 0;

        for (size_t i = 0; i < offsets.size(); ++i)
        {
            const UInt8 * str = data.data() + offsets[i - 1];
            const size_t str_len = offsets[i] - offsets[i - 1] - 1;

            std::string_view res;

            if (UTF8::isValidUTF8(str, str_len))
            {
                auto lang = CLD2::DetectLanguage(reinterpret_cast<const char *>(str), str_len, true, &is_reliable);
                res = codeISO(LanguageCode(lang));
            }
            else
            {
                res = "un";
            }

            res_data.resize(res_offset + res.size() + 1);
            memcpy(&res_data[res_offset], res.data(), res.size());

            res_data[res_offset + res.size()] = 0;
            res_offset += res.size() + 1;

            res_offsets[i] = res_offset;
        }
    }
};

class FunctionDetectLanguageMixed : public IFunction
{
public:
    static constexpr auto name = "detectLanguageMixed";

    /// Number of top results
    static constexpr auto top_N = 3;

    static FunctionPtr create(ContextPtr context)
    {
        if (!context->getSettingsRef().allow_experimental_nlp_functions)
            throw Exception(ErrorCodes::SUPPORT_IS_DISABLED,
                "Natural language processing function '{}' is experimental. Set `allow_experimental_nlp_functions` setting to enable it", name);

        return std::make_shared<FunctionDetectLanguageMixed>();
    }

    String getName() const override { return name; }

    size_t getNumberOfArguments() const override { return 1; }

    bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }

    bool useDefaultImplementationForConstants() const override { return true; }

    DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
    {
        if (!isString(arguments[0]))
            throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
                "Illegal type {} of argument of function {}. Must be String.",
                arguments[0]->getName(), getName());

        return std::make_shared<DataTypeMap>(std::make_shared<DataTypeString>(), std::make_shared<DataTypeFloat32>());
    }

    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override
    {
        const auto & column = arguments[0].column;
        const ColumnString * col = checkAndGetColumn<ColumnString>(column.get());

        if (!col)
            throw Exception(
                "Illegal column " + arguments[0].column->getName() + " of argument of function " + getName(),
                ErrorCodes::ILLEGAL_COLUMN);

        const auto & input_data = col->getChars();
        const auto & input_offsets = col->getOffsets();

        /// Create and fill the result map.

        const auto & result_type_map = static_cast<const DataTypeMap &>(*result_type);
        const DataTypePtr & key_type = result_type_map.getKeyType();
        const DataTypePtr & value_type = result_type_map.getValueType();

        MutableColumnPtr keys_data = key_type->createColumn();
        MutableColumnPtr values_data = value_type->createColumn();
        MutableColumnPtr offsets = DataTypeNumber<IColumn::Offset>().createColumn();

        size_t total_elements = input_rows_count * top_N;
        keys_data->reserve(total_elements);
        values_data->reserve(total_elements);
        offsets->reserve(input_rows_count);

        bool is_reliable;
        CLD2::Language result_lang_top3[top_N];
        int32_t pc[top_N];
        int bytes[top_N];

        IColumn::Offset current_offset = 0;
        for (size_t i = 0; i < input_rows_count; ++i)
        {
            const UInt8 * str = input_data.data() + input_offsets[i - 1];
            const size_t str_len = input_offsets[i] - input_offsets[i - 1] - 1;

            if (UTF8::isValidUTF8(str, str_len))
            {
                CLD2::DetectLanguageSummary(reinterpret_cast<const char *>(str), str_len, true, result_lang_top3, pc, bytes, &is_reliable);

                for (size_t j = 0; j < top_N; ++j)
                {
                    if (pc[j] == 0)
                        break;

                    auto res_str = FunctionDetectLanguageImpl::codeISO(LanguageCode(result_lang_top3[j]));
                    Float32 res_float = static_cast<Float32>(pc[j]) / 100;

                    keys_data->insertData(res_str.data(), res_str.size());
                    values_data->insertData(reinterpret_cast<const char *>(&res_float), sizeof(res_float));
                    ++current_offset;
                }
            }
            else
            {
                std::string_view res_str = "un";
                Float32 res_float = 0;

                keys_data->insertData(res_str.data(), res_str.size());
                values_data->insertData(reinterpret_cast<const char *>(&res_float), sizeof(res_float));
                ++current_offset;
            }
            offsets->insert(current_offset);
        }

        auto nested_column = ColumnArray::create(
            ColumnTuple::create(Columns{std::move(keys_data), std::move(values_data)}),
            std::move(offsets));

        return ColumnMap::create(nested_column);
    }
};

struct NameDetectLanguage
{
    static constexpr auto name = "detectLanguage";
};


using FunctionDetectLanguage = FunctionTextClassificationString<FunctionDetectLanguageImpl, NameDetectLanguage>;

void registerFunctionsDetectLanguage(FunctionFactory & factory)
{
    factory.registerFunction<FunctionDetectLanguage>();
    factory.registerFunction<FunctionDetectLanguageMixed>();
}

}
#endif
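Assuming the remove_suffix(5) reading of codeISO above, a self-contained restatement showing the expected normalization on a few inputs (toy example, not ClickHouse code; requires C++20):

// Toy restatement of the ISO-code normalization with sample outputs.
#include <iostream>
#include <string_view>

std::string_view codeISO(std::string_view code)
{
    if (code.ends_with("-Latn") || code.ends_with("-Hant"))
        code.remove_suffix(5); // strip the 5-character script suffix

    if (code == "iw") return "he";
    if (code == "jw") return "jv";
    if (code == "in") return "id";
    if (code == "mo") return "ro";

    return code.size() == 2 ? code : "other";
}

int main()
{
    std::cout << codeISO("sr-Latn") << ' ' << codeISO("iw") << ' ' << codeISO("ceb") << '\n';
    // prints: sr he other
}
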
Some files were not shown because too many files have changed in this diff