Merge branch 'master' into crash-log

Alexey Milovidov 2020-07-31 16:12:53 +03:00
commit c3ad710b84
1539 changed files with 50873 additions and 12323 deletions

10
.gitmodules vendored

@ -49,7 +49,7 @@
url = https://github.com/ClickHouse-Extras/boost.git
[submodule "contrib/base64"]
path = contrib/base64
url = https://github.com/powturbo/Turbo-Base64.git
url = https://github.com/ClickHouse-Extras/Turbo-Base64.git
[submodule "contrib/arrow"]
path = contrib/arrow
url = https://github.com/apache/arrow
@ -76,7 +76,7 @@
url = https://github.com/google/snappy
[submodule "contrib/cppkafka"]
path = contrib/cppkafka
url = https://github.com/ClickHouse-Extras/cppkafka.git
url = https://github.com/mfontanini/cppkafka.git
[submodule "contrib/brotli"]
path = contrib/brotli
url = https://github.com/google/brotli.git
@ -174,3 +174,9 @@
[submodule "contrib/sentry-native"]
path = contrib/sentry-native
url = https://github.com/getsentry/sentry-native.git
[submodule "contrib/gcem"]
path = contrib/gcem
url = https://github.com/kthohr/gcem.git
[submodule "contrib/stats"]
path = contrib/stats
url = https://github.com/kthohr/stats.git

View File

@ -12,6 +12,20 @@ foreach(policy
endif()
endforeach()
# set default policy
foreach(default_policy_var_name
# make option() honor normal variables for BUILD_SHARED_LIBS:
# - re2
# - snappy
CMAKE_POLICY_DEFAULT_CMP0077
# Google Test from sources uses too old cmake, 2.6.x, and CMP0022 should
# set, to avoid using deprecated LINK_INTERFACE_LIBRARIES(_<CONFIG>)? over
# INTERFACE_LINK_LIBRARIES.
CMAKE_POLICY_DEFAULT_CMP0022
)
set(${default_policy_var_name} NEW)
endforeach()
project(ClickHouse)
include (cmake/arch.cmake)
@ -273,7 +287,7 @@ endif ()
include(cmake/dbms_glob_sources.cmake)
if (OS_LINUX)
if (OS_LINUX OR OS_ANDROID)
include(cmake/linux/default_libs.cmake)
elseif (OS_DARWIN)
include(cmake/darwin/default_libs.cmake)
@ -364,6 +378,7 @@ include (cmake/find/avro.cmake)
include (cmake/find/msgpack.cmake)
include (cmake/find/cassandra.cmake)
include (cmake/find/sentry.cmake)
include (cmake/find/stats.cmake)
find_contrib_lib(cityhash)
find_contrib_lib(farmhash)
@ -378,10 +393,6 @@ include (cmake/find/mysqlclient.cmake)
# When testing for memory leaks with Valgrind, don't link tcmalloc or jemalloc.
if (OS_LINUX AND NOT ENABLE_JEMALLOC)
message (WARNING "Non default allocator is disabled. This is not recommended for production Linux builds.")
endif ()
if (USE_OPENCL)
if (OS_DARWIN)
set(OPENCL_LINKER_FLAGS "-framework OpenCL")
@ -397,6 +408,10 @@ endif ()
add_subdirectory (contrib EXCLUDE_FROM_ALL)
if (NOT ENABLE_JEMALLOC)
message (WARNING "Non default allocator is disabled. This is not recommended for production builds.")
endif ()
macro (add_executable target)
# invoke built-in add_executable
# explicitly acquire and interpose malloc symbols by clickhouse_malloc

View File

@ -9,12 +9,11 @@ ClickHouse is an open-source column-oriented database management system that all
* [Documentation](https://clickhouse.tech/docs/en/) provides more in-depth information.
* [YouTube channel](https://www.youtube.com/c/ClickHouseDB) has a lot of content about ClickHouse in video format.
* [Slack](https://join.slack.com/t/clickhousedb/shared_invite/zt-d2zxkf9e-XyxDa_ucfPxzuH4SJIm~Ng) and [Telegram](https://telegram.me/clickhouse_en) allow to chat with ClickHouse users in real-time.
* [Blog](https://clickhouse.yandex/blog/en/) contains various ClickHouse-related articles, as well as announces and reports about events.
* [Blog](https://clickhouse.yandex/blog/en/) contains various ClickHouse-related articles, as well as announcements and reports about events.
* [Yandex.Messenger channel](https://yandex.ru/chat/#/join/20e380d9-c7be-4123-ab06-e95fb946975e) shares announcements and useful links in Russian.
* [Contacts](https://clickhouse.tech/#contacts) can help to get your questions answered if there are any.
* You can also [fill this form](https://clickhouse.tech/#meet) to meet Yandex ClickHouse team in person.
## Upcoming Events
* [ClickHouse for genetic data (in Russian)](https://cloud.yandex.ru/events/152) on July 14, 2020.
* [ClickHouse virtual office hours](https://www.eventbrite.com/e/clickhouse-july-virtual-meetup-tickets-111199787558) on July 15, 2020.
* [ClickHouse at ByteDance (in Chinese)](https://mp.weixin.qq.com/s/Em-HjPylO8D7WPui4RREAQ) on July 31, 2020.

View File

@ -18,5 +18,4 @@ currently being supported with security updates:
## Reporting a Vulnerability
To report a potential vulnerability in ClickHouse please use the security advisory feature of GitHub:
https://github.com/ClickHouse/ClickHouse/security/advisories
To report a potential vulnerability in ClickHouse please send the details about it to [clickhouse-feedback@yandex-team.com](mailto:clickhouse-feedback@yandex-team.com).

View File

@ -49,7 +49,7 @@ public:
struct Values
{
/// Least significant 32 bits from time_t at the beginning of the day.
/// If the unix timestamp of beginning of the day is negative (example: 1970-01-01 MSK, where time_t == -10800), then value is zero.
/// If the unix timestamp of beginning of the day is negative (example: 1970-01-01 MSK, where time_t == -10800), then value will overflow.
/// Change to time_t; change constants above; and recompile the sources if you need to support time after 2105 year.
UInt32 date;
@ -686,12 +686,17 @@ public:
inline time_t makeDateTime(UInt16 year, UInt8 month, UInt8 day_of_month, UInt8 hour, UInt8 minute, UInt8 second) const
{
size_t index = makeDayNum(year, month, day_of_month);
time_t time_offset = hour * 3600 + minute * 60 + second;
UInt32 time_offset = hour * 3600 + minute * 60 + second;
if (time_offset >= lut[index].time_at_offset_change)
time_offset -= lut[index].amount_of_offset_change;
return lut[index].date + time_offset;
UInt32 res = lut[index].date + time_offset;
if (unlikely(res > DATE_LUT_MAX))
return 0;
return res;
}
inline const Values & getValues(DayNum d) const { return lut[d]; }

View File

@ -16,6 +16,19 @@ void trim(String & s)
s.erase(std::find_if(s.rbegin(), s.rend(), [](int ch) { return !std::isspace(ch); }).base(), s.end());
}
// Uses a separate replxx::Replxx instance to avoid loading the history again into
// the current context (replxx::Replxx::history_load() re-loads the history from
// the file), since it would then overlap with the history of the current session
// (this makes the behavior compatible with other interpreters, e.g. bash).
void history_save(const String & history_file_path, const String & line)
{
replxx::Replxx rx_no_overlap;
rx_no_overlap.history_load(history_file_path);
rx_no_overlap.history_add(line);
rx_no_overlap.history_save(history_file_path);
}
}
ReplxxLineReader::ReplxxLineReader(
@ -101,6 +114,10 @@ LineReader::InputStatus ReplxxLineReader::readOneLine(const String & prompt)
void ReplxxLineReader::addToHistory(const String & line)
{
// Lock the history file to prevent inconsistent concurrent changes.
//
// replxx::Replxx::history_save() already has lockf(),
// but replxx::Replxx::history_load() does not,
// which is why flock() is added here.
bool locked = false;
if (flock(history_file_fd, LOCK_EX))
rx.print("Lock of history file failed: %s\n", strerror(errno));
@ -110,7 +127,7 @@ void ReplxxLineReader::addToHistory(const String & line)
rx.history_add(line);
// flush changes to the disk
rx.history_save(history_file_path);
history_save(history_file_path, line);
if (locked && 0 != flock(history_file_fd, LOCK_UN))
rx.print("Unlock of history file failed: %s\n", strerror(errno));

View File

@ -30,7 +30,7 @@ struct StringRef
constexpr StringRef(const CharT * data_, size_t size_) : data(reinterpret_cast<const char *>(data_)), size(size_) {}
StringRef(const std::string & s) : data(s.data()), size(s.size()) {}
constexpr StringRef(const std::string_view & s) : data(s.data()), size(s.size()) {}
constexpr explicit StringRef(const std::string_view & s) : data(s.data()), size(s.size()) {}
constexpr StringRef(const char * data_) : StringRef(std::string_view{data_}) {}
constexpr StringRef() = default;

View File

@ -1,6 +1,9 @@
#include <common/getThreadId.h>
#if defined(OS_LINUX)
#if defined(OS_ANDROID)
#include <sys/types.h>
#include <unistd.h>
#elif defined(OS_LINUX)
#include <unistd.h>
#include <syscall.h>
#elif defined(OS_FREEBSD)
@ -16,7 +19,9 @@ uint64_t getThreadId()
{
if (!current_tid)
{
#if defined(OS_LINUX)
#if defined(OS_ANDROID)
current_tid = gettid();
#elif defined(OS_LINUX)
current_tid = syscall(SYS_gettid); /// This call is always successful. - man gettid
#elif defined(OS_FREEBSD)
current_tid = pthread_getthreadid_np();

View File

@ -9,7 +9,6 @@
#include <string.h>
#include <signal.h>
#include <cxxabi.h>
#include <execinfo.h>
#include <unistd.h>
#include <typeinfo>

17
benchmark/omnisci/benchmark.sh Executable file

@ -0,0 +1,17 @@
#!/bin/bash
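# For every query in queries.sql: drop the OS page cache, restart the OmniSci
# server, wait until it answers a trivial query, then run the query three times.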
grep -v -P '^#' queries.sql | sed -e 's/{table}/hits/' | while read query; do
echo 3 | sudo tee /proc/sys/vm/drop_caches
sudo systemctl restart omnisci_server
for i in {1..1000}; do
/opt/omnisci/bin/omnisql -t -p HyperInteractive <<< "SELECT 1;" 2>&1 | grep -q '1 rows returned' && break;
sleep 0.1;
done
sleep 10;
echo "$query";
for i in {1..3}; do
/opt/omnisci/bin/omnisql -t -p HyperInteractive <<< "$query" 2>&1 | grep -P 'Exception:|Execution time:';
done;
done;

View File

@ -0,0 +1,332 @@
# Instructions for running the OmniSci benchmark on the web-analytics dataset
OmniSci (formerly named "MapD") is an open-source (open-core) in-memory analytical DBMS with support for GPU processing.
It can also run on CPU without a GPU, and it shows competitive performance on simple queries (for example, a simple aggregation on a single column).
# How to install
https://docs.omnisci.com/installation-and-configuration/installation/installing-on-ubuntu
# Caveats
- The dataset (at least the needed columns) must fit in memory.
- It does not support data compression (only dictionary encoding for strings).
- The first query execution is very slow because uncompressed data is read from disk.
- It does not support indexes for quick range queries.
- It does not support NOT NULL for data types.
- It does not support BLOB.
- No support for the UNSIGNED data type (which is OK according to the SQL standard).
- Lack of string processing functions.
- Strings are limited to 32767 bytes.
- GROUP BY on a text data type is supported only if it has dictionary encoding (see the sketch below).
`Exception: Cannot group by string columns which are not dictionary encoded`
- Some aggregate functions are not supported for strings at all.
`Aggregate on TEXT is not supported yet.`
- Sometimes I hit a bug where a query runs in an infinite loop and does not finish (after a retry it finishes successfully).
- One query took hours to execute even with retries.
- Sorting is slow and disabled with default settings for large result sets.
`Exception: Sorting the result would be too slow`
`Cast from dictionary-encoded string to none-encoded would be slow`
- There is an approximate count-distinct function, but its precision is not documented.
To enable sorting of large result sets, see:
https://stackoverflow.com/questions/62977734/omnissci-sorting-the-result-would-be-too-slow
The list of known issues is here:
https://github.com/omnisci/omniscidb/issues?q=is%3Aissue+author%3Aalexey-milovidov
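To illustrate the GROUP BY caveat above, here is a minimal sketch run through the same `omnisql` client as the benchmark; the table and column names are hypothetical:
```
# Hypothetical illustration; enc_demo and its columns are made-up names.
/opt/omnisci/bin/omnisql -t -p HyperInteractive <<'SQL'
CREATE TABLE enc_demo (s_dict TEXT ENCODING DICT, s_none TEXT ENCODING NONE);
SELECT s_dict, count(*) FROM enc_demo GROUP BY s_dict;
SELECT s_none, count(*) FROM enc_demo GROUP BY s_none;
SQL
# The second SELECT is expected to fail with:
# Exception: Cannot group by string columns which are not dictionary encoded
```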
# How to prepare data
Download the 100 million row dataset from here and insert it into ClickHouse:
https://clickhouse.tech/docs/en/getting-started/example-datasets/metrica/
Convert the CREATE TABLE query:
```
clickhouse-client --query "SHOW CREATE TABLE hits_100m" --format TSVRaw |
tr '`' '"' |
sed -r -e '
s/U?Int64/BIGINT/;
s/U?Int32/INTEGER/;
s/U?Int16/SMALLINT/;
s/U?Int8/TINYINT/;
s/DateTime/TIMESTAMP ENCODING FIXED(32)/;
s/ Date/ DATE ENCODING DAYS(16)/;
s/FixedString\(2\)/TEXT ENCODING DICT(16)/;
s/FixedString\(3\)/TEXT ENCODING DICT/;
s/FixedString\(\d+\)/TEXT ENCODING DICT/;
s/String/TEXT ENCODING DICT/;'
```
Then cut off the `ENGINE` part.
The resulting CREATE TABLE query:
```
CREATE TABLE hits
(
"WatchID" BIGINT,
"JavaEnable" TINYINT,
"Title" TEXT ENCODING DICT,
"GoodEvent" SMALLINT,
"EventTime" TIMESTAMP ENCODING FIXED(32),
"EventDate" DATE ENCODING DAYS(16),
"CounterID" INTEGER,
"ClientIP" INTEGER,
"RegionID" INTEGER,
"UserID" BIGINT,
"CounterClass" TINYINT,
"OS" TINYINT,
"UserAgent" TINYINT,
"URL" TEXT ENCODING DICT,
"Referer" TEXT ENCODING DICT,
"Refresh" TINYINT,
"RefererCategoryID" SMALLINT,
"RefererRegionID" INTEGER,
"URLCategoryID" SMALLINT,
"URLRegionID" INTEGER,
"ResolutionWidth" SMALLINT,
"ResolutionHeight" SMALLINT,
"ResolutionDepth" TINYINT,
"FlashMajor" TINYINT,
"FlashMinor" TINYINT,
"FlashMinor2" TEXT ENCODING DICT,
"NetMajor" TINYINT,
"NetMinor" TINYINT,
"UserAgentMajor" SMALLINT,
"UserAgentMinor" TEXT ENCODING DICT(16),
"CookieEnable" TINYINT,
"JavascriptEnable" TINYINT,
"IsMobile" TINYINT,
"MobilePhone" TINYINT,
"MobilePhoneModel" TEXT ENCODING DICT,
"Params" TEXT ENCODING DICT,
"IPNetworkID" INTEGER,
"TraficSourceID" TINYINT,
"SearchEngineID" SMALLINT,
"SearchPhrase" TEXT ENCODING DICT,
"AdvEngineID" TINYINT,
"IsArtifical" TINYINT,
"WindowClientWidth" SMALLINT,
"WindowClientHeight" SMALLINT,
"ClientTimeZone" SMALLINT,
"ClientEventTime" TIMESTAMP ENCODING FIXED(32),
"SilverlightVersion1" TINYINT,
"SilverlightVersion2" TINYINT,
"SilverlightVersion3" INTEGER,
"SilverlightVersion4" SMALLINT,
"PageCharset" TEXT ENCODING DICT,
"CodeVersion" INTEGER,
"IsLink" TINYINT,
"IsDownload" TINYINT,
"IsNotBounce" TINYINT,
"FUniqID" BIGINT,
"OriginalURL" TEXT ENCODING DICT,
"HID" INTEGER,
"IsOldCounter" TINYINT,
"IsEvent" TINYINT,
"IsParameter" TINYINT,
"DontCountHits" TINYINT,
"WithHash" TINYINT,
"HitColor" TEXT ENCODING DICT(8),
"LocalEventTime" TIMESTAMP ENCODING FIXED(32),
"Age" TINYINT,
"Sex" TINYINT,
"Income" TINYINT,
"Interests" SMALLINT,
"Robotness" TINYINT,
"RemoteIP" INTEGER,
"WindowName" INTEGER,
"OpenerName" INTEGER,
"HistoryLength" SMALLINT,
"BrowserLanguage" TEXT ENCODING DICT(16),
"BrowserCountry" TEXT ENCODING DICT(16),
"SocialNetwork" TEXT ENCODING DICT,
"SocialAction" TEXT ENCODING DICT,
"HTTPError" SMALLINT,
"SendTiming" INTEGER,
"DNSTiming" INTEGER,
"ConnectTiming" INTEGER,
"ResponseStartTiming" INTEGER,
"ResponseEndTiming" INTEGER,
"FetchTiming" INTEGER,
"SocialSourceNetworkID" TINYINT,
"SocialSourcePage" TEXT ENCODING DICT,
"ParamPrice" BIGINT,
"ParamOrderID" TEXT ENCODING DICT,
"ParamCurrency" TEXT ENCODING DICT,
"ParamCurrencyID" SMALLINT,
"OpenstatServiceName" TEXT ENCODING DICT,
"OpenstatCampaignID" TEXT ENCODING DICT,
"OpenstatAdID" TEXT ENCODING DICT,
"OpenstatSourceID" TEXT ENCODING DICT,
"UTMSource" TEXT ENCODING DICT,
"UTMMedium" TEXT ENCODING DICT,
"UTMCampaign" TEXT ENCODING DICT,
"UTMContent" TEXT ENCODING DICT,
"UTMTerm" TEXT ENCODING DICT,
"FromTag" TEXT ENCODING DICT,
"HasGCLID" TINYINT,
"RefererHash" BIGINT,
"URLHash" BIGINT,
"CLID" INTEGER
);
```
Convert the dataset, prepare the list of fields for SELECT:
```
clickhouse-client --query "SHOW CREATE TABLE hits_100m" --format TSVRaw |
tr '`' '"' |
sed -r -e '
s/"(\w+)" U?Int([0-9]+)/toInt\2(\1)/;
s/"(\w+)" (Fixed)?String(\([0-9]+\))?/toValidUTF8(toString(\1))/;
s/"(\w+)" \w+/\1/'
```
The resulting SELECT query for data preparation:
```
SELECT
toInt64(WatchID),
toInt8(JavaEnable),
toValidUTF8(toString(Title)),
toInt16(GoodEvent),
EventTime,
EventDate,
toInt32(CounterID),
toInt32(ClientIP),
toInt32(RegionID),
toInt64(UserID),
toInt8(CounterClass),
toInt8(OS),
toInt8(UserAgent),
toValidUTF8(toString(URL)),
toValidUTF8(toString(Referer)),
toInt8(Refresh),
toInt16(RefererCategoryID),
toInt32(RefererRegionID),
toInt16(URLCategoryID),
toInt32(URLRegionID),
toInt16(ResolutionWidth),
toInt16(ResolutionHeight),
toInt8(ResolutionDepth),
toInt8(FlashMajor),
toInt8(FlashMinor),
toValidUTF8(toString(FlashMinor2)),
toInt8(NetMajor),
toInt8(NetMinor),
toInt16(UserAgentMajor),
toValidUTF8(toString(UserAgentMinor)),
toInt8(CookieEnable),
toInt8(JavascriptEnable),
toInt8(IsMobile),
toInt8(MobilePhone),
toValidUTF8(toString(MobilePhoneModel)),
toValidUTF8(toString(Params)),
toInt32(IPNetworkID),
toInt8(TraficSourceID),
toInt16(SearchEngineID),
toValidUTF8(toString(SearchPhrase)),
toInt8(AdvEngineID),
toInt8(IsArtifical),
toInt16(WindowClientWidth),
toInt16(WindowClientHeight),
toInt16(ClientTimeZone),
ClientEventTime,
toInt8(SilverlightVersion1),
toInt8(SilverlightVersion2),
toInt32(SilverlightVersion3),
toInt16(SilverlightVersion4),
toValidUTF8(toString(PageCharset)),
toInt32(CodeVersion),
toInt8(IsLink),
toInt8(IsDownload),
toInt8(IsNotBounce),
toInt64(FUniqID),
toValidUTF8(toString(OriginalURL)),
toInt32(HID),
toInt8(IsOldCounter),
toInt8(IsEvent),
toInt8(IsParameter),
toInt8(DontCountHits),
toInt8(WithHash),
toValidUTF8(toString(HitColor)),
LocalEventTime,
toInt8(Age),
toInt8(Sex),
toInt8(Income),
toInt16(Interests),
toInt8(Robotness),
toInt32(RemoteIP),
toInt32(WindowName),
toInt32(OpenerName),
toInt16(HistoryLength),
toValidUTF8(toString(BrowserLanguage)),
toValidUTF8(toString(BrowserCountry)),
toValidUTF8(toString(SocialNetwork)),
toValidUTF8(toString(SocialAction)),
toInt16(HTTPError),
toInt32(SendTiming),
toInt32(DNSTiming),
toInt32(ConnectTiming),
toInt32(ResponseStartTiming),
toInt32(ResponseEndTiming),
toInt32(FetchTiming),
toInt8(SocialSourceNetworkID),
toValidUTF8(toString(SocialSourcePage)),
toInt64(ParamPrice),
toValidUTF8(toString(ParamOrderID)),
toValidUTF8(toString(ParamCurrency)),
toInt16(ParamCurrencyID),
toValidUTF8(toString(OpenstatServiceName)),
toValidUTF8(toString(OpenstatCampaignID)),
toValidUTF8(toString(OpenstatAdID)),
toValidUTF8(toString(OpenstatSourceID)),
toValidUTF8(toString(UTMSource)),
toValidUTF8(toString(UTMMedium)),
toValidUTF8(toString(UTMCampaign)),
toValidUTF8(toString(UTMContent)),
toValidUTF8(toString(UTMTerm)),
toValidUTF8(toString(FromTag)),
toInt8(HasGCLID),
toInt64(RefererHash),
toInt64(URLHash),
toInt32(CLID)
FROM hits_100m_obfuscated
INTO OUTFILE '/home/milovidov/example_datasets/hits_100m_obfuscated.csv'
FORMAT CSV;
```
Upload data to OmniSci:
```
/opt/omnisci/bin/omnisql -t -p HyperInteractive
```
Run the CREATE TABLE statement, then run:
```
COPY hits FROM '/home/milovidov/example_datasets/hits_100m_obfuscated.csv' WITH (HEADER = 'false');
```
Data loading took
```
336639 ms
```
on a server (Linux Ubuntu, Xeon E5-2560v2, 32 logical CPU, 128 GiB RAM, 8xHDD RAID-5, 40 TB).
Run benchmark:
```
./benchmark.sh
```
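The next step reads `log.txt`; capturing the benchmark output into that file is assumed to look something like this:
```
./benchmark.sh 2>&1 | tee log.txt
```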
Prepare the result to paste into JSON:
```
grep -oP 'Total time: \d+' log.txt |
grep -oP '\d+' |
awk '{
if (i % 3 == 0) { a = $1 }
else if (i % 3 == 1) { b = $1 }
else if (i % 3 == 2) { c = $1; print "[" a / 1000 ", " b / 1000 ", " c / 1000 "]," };
++i; }'
```
And fill in `[null, null, null]` for missing runs.
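For example, feeding the three "Total time" values of the first query in `log.txt` (shown below) through the same awk program yields one row of the resulting JSON:
```
printf 'Total time: 23471\nTotal time: 43\nTotal time: 35\n' |
grep -oP '\d+' |
awk '{
    if (i % 3 == 0) { a = $1 }
    else if (i % 3 == 1) { b = $1 }
    else if (i % 3 == 2) { c = $1; print "[" a / 1000 ", " b / 1000 ", " c / 1000 "]," };
    ++i; }'
# prints: [23.471, 0.043, 0.035],
```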

210
benchmark/omnisci/log.txt Normal file

@ -0,0 +1,210 @@
3
SELECT count(*) FROM hits;
Execution time: 23471 ms, Total time: 23471 ms
Execution time: 42 ms, Total time: 43 ms
Execution time: 35 ms, Total time: 35 ms
3
SELECT count(*) FROM hits WHERE AdvEngineID != 0;
Execution time: 17328 ms, Total time: 17329 ms
Execution time: 58 ms, Total time: 59 ms
Execution time: 57 ms, Total time: 59 ms
3
SELECT sum(AdvEngineID), count(*), avg(ResolutionWidth) FROM hits;
Execution time: 17309 ms, Total time: 17310 ms
Execution time: 115 ms, Total time: 115 ms
Execution time: 129 ms, Total time: 130 ms
3
SELECT sum(UserID) FROM hits;
Execution time: 26091 ms, Total time: 26091 ms
Execution time: 88 ms, Total time: 89 ms
Execution time: 71 ms, Total time: 72 ms
3
SELECT APPROX_COUNT_DISTINCT(UserID) FROM hits;
Execution time: 21720 ms, Total time: 21720 ms
Execution time: 364 ms, Total time: 364 ms
Execution time: 344 ms, Total time: 345 ms
3
SELECT APPROX_COUNT_DISTINCT(SearchPhrase) FROM hits;
Execution time: 19314 ms, Total time: 19315 ms
Execution time: 385 ms, Total time: 386 ms
Execution time: 382 ms, Total time: 382 ms
3
SELECT min(EventDate), max(EventDate) FROM hits;
Execution time: 19431 ms, Total time: 19432 ms
Execution time: 130 ms, Total time: 131 ms
Execution time: 147 ms, Total time: 148 ms
3
SELECT AdvEngineID, count(*) FROM hits WHERE AdvEngineID != 0 GROUP BY AdvEngineID ORDER BY count(*) DESC;
Execution time: 20660 ms, Total time: 20661 ms
Execution time: 63 ms, Total time: 64 ms
Execution time: 88 ms, Total time: 89 ms
3
SELECT RegionID, APPROX_COUNT_DISTINCT(UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10;
Execution time: 21364 ms, Total time: 21472 ms
Execution time: 1387 ms, Total time: 1504 ms
Execution time: 1443 ms, Total time: 1505 ms
3
SELECT RegionID, sum(AdvEngineID), count(*) AS c, avg(ResolutionWidth), APPROX_COUNT_DISTINCT(UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10;
Execution time: 22205 ms, Total time: 22285 ms
Execution time: 1590 ms, Total time: 1655 ms
Execution time: 1591 ms, Total time: 1658 ms
3
SELECT MobilePhoneModel, APPROX_COUNT_DISTINCT(UserID) AS u FROM hits WHERE MobilePhoneModel != '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;
Execution time: 22343 ms, Total time: 22344 ms
Execution time: 122 ms, Total time: 123 ms
Execution time: 117 ms, Total time: 118 ms
3
SELECT MobilePhone, MobilePhoneModel, APPROX_COUNT_DISTINCT(UserID) AS u FROM hits WHERE MobilePhoneModel != '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;
Execution time: 21681 ms, Total time: 21695 ms
Execution time: 299 ms, Total time: 310 ms
Execution time: 275 ms, Total time: 292 ms
3
SELECT SearchPhrase, count(*) AS c FROM hits WHERE SearchPhrase != '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
Execution time: 23346 ms, Total time: 23360 ms
Execution time: 613 ms, Total time: 631 ms
Execution time: 606 ms, Total time: 624 ms
3
SELECT SearchPhrase, APPROX_COUNT_DISTINCT(UserID) AS u FROM hits WHERE SearchPhrase != '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;
Execution time: 66014 ms, Total time: 68618 ms
Execution time: 44309 ms, Total time: 47296 ms
Execution time: 44019 ms, Total time: 46866 ms
3
SELECT SearchEngineID, SearchPhrase, count(*) AS c FROM hits WHERE SearchPhrase != '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10;
Execution time: 25853 ms, Total time: 25984 ms
Execution time: 2590 ms, Total time: 2728 ms
Execution time: 2652 ms, Total time: 2789 ms
3
SELECT UserID, count(*) FROM hits GROUP BY UserID ORDER BY count(*) DESC LIMIT 10;
Execution time: 26581 ms, Total time: 26953 ms
Execution time: 5843 ms, Total time: 6158 ms
Execution time: 5970 ms, Total time: 6286 ms
3
SELECT UserID, SearchPhrase, count(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY count(*) DESC LIMIT 10;
Execution time: 33007 ms, Total time: 33581 ms
Execution time: 9943 ms, Total time: 10509 ms
Execution time: 9470 ms, Total time: 10047 ms
3
SELECT UserID, SearchPhrase, count(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10;
Execution time: 39009 ms, Total time: 39575 ms
Execution time: 8151 ms, Total time: 8785 ms
Execution time: 8037 ms, Total time: 8665 ms
3
SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, count(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY count(*) DESC LIMIT 10;
Execution time: 56207 ms, Total time: 57764 ms
Execution time: 26653 ms, Total time: 28199 ms
Execution time: 25614 ms, Total time: 27336 ms
3
SELECT UserID FROM hits WHERE UserID = -6101065172474983726;
Execution time: 18975 ms, Total time: 18976 ms
Execution time: 136 ms, Total time: 136 ms
Execution time: 136 ms, Total time: 136 ms
3
SELECT count(*) FROM hits WHERE URL LIKE '%metrika%';
Execution time: 32444 ms, Total time: 32445 ms
Execution time: 125 ms, Total time: 126 ms
Execution time: 134 ms, Total time: 136 ms
3
SELECT SearchPhrase, min(URL), count(*) AS c FROM hits WHERE URL LIKE '%metrika%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
Exception: Aggregate on TEXT is not supported yet.
Exception: Aggregate on TEXT is not supported yet.
Exception: Aggregate on TEXT is not supported yet.
3
SELECT SearchPhrase, min(URL), min(Title), count(*) AS c, APPROX_COUNT_DISTINCT(UserID) FROM hits WHERE Title LIKE '%Яндекс%' AND URL NOT LIKE '%.yandex.%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
Exception: Aggregate on TEXT is not supported yet.
Exception: Aggregate on TEXT is not supported yet.
Exception: Aggregate on TEXT is not supported yet.
3
SELECT * FROM hits WHERE URL LIKE '%metrika%' ORDER BY EventTime LIMIT 10;
Execution time: 96163 ms, Total time: 96166 ms
Execution time: 312 ms, Total time: 314 ms
Execution time: 303 ms, Total time: 305 ms
3
SELECT SearchPhrase FROM hits WHERE SearchPhrase != '' ORDER BY EventTime LIMIT 10;
Execution time: 27493 ms, Total time: 27494 ms
Execution time: 216 ms, Total time: 216 ms
Execution time: 221 ms, Total time: 222 ms
3
SELECT SearchPhrase FROM hits WHERE SearchPhrase != '' ORDER BY SearchPhrase LIMIT 10;
Execution time: 38230 ms, Total time: 38308 ms
Execution time: 17175 ms, Total time: 17256 ms
Execution time: 17225 ms, Total time: 17310 ms
3
SELECT SearchPhrase FROM hits WHERE SearchPhrase != '' ORDER BY EventTime, SearchPhrase LIMIT 10;
Execution time: 115614 ms, Total time: 115714 ms
Execution time: 95944 ms, Total time: 96041 ms
Execution time: 94274 ms, Total time: 94383 ms
3
SELECT CounterID, avg(length(URL)) AS l, count(*) AS c FROM hits WHERE URL != '' GROUP BY CounterID HAVING c > 100000 ORDER BY l DESC LIMIT 25;
Execution time: 31775 ms, Total time: 31779 ms
Execution time: 2643 ms, Total time: 2647 ms
Execution time: 2933 ms, Total time: 2937 ms
3
SELECT domainWithoutWWW(Referer) AS key, avg(length(Referer)) AS l, count(*) AS c, min(Referer) FROM hits WHERE Referer != '' GROUP BY key HAVING c > 100000 ORDER BY l DESC LIMIT 25;
Exception: Exception occurred: org.apache.calcite.runtime.CalciteContextException: From line 1, column 8 to line 1, column 36: No match found for function signature domainWithoutWWW(<CHARACTER>)
Exception: Exception occurred: org.apache.calcite.runtime.CalciteContextException: From line 1, column 8 to line 1, column 36: No match found for function signature domainWithoutWWW(<CHARACTER>)
Exception: Exception occurred: org.apache.calcite.runtime.CalciteContextException: From line 1, column 8 to line 1, column 36: No match found for function signature domainWithoutWWW(<CHARACTER>)
3
SELECT sum(ResolutionWidth), sum(ResolutionWidth + 1), sum(ResolutionWidth + 2), sum(ResolutionWidth + 3), sum(ResolutionWidth + 4), sum(ResolutionWidth + 5), sum(ResolutionWidth + 6), sum(ResolutionWidth + 7), sum(ResolutionWidth + 8), sum(ResolutionWidth + 9), sum(ResolutionWidth + 10), sum(ResolutionWidth + 11), sum(ResolutionWidth + 12), sum(ResolutionWidth + 13), sum(ResolutionWidth + 14), sum(ResolutionWidth + 15), sum(ResolutionWidth + 16), sum(ResolutionWidth + 17), sum(ResolutionWidth + 18), sum(ResolutionWidth + 19), sum(ResolutionWidth + 20), sum(ResolutionWidth + 21), sum(ResolutionWidth + 22), sum(ResolutionWidth + 23), sum(ResolutionWidth + 24), sum(ResolutionWidth + 25), sum(ResolutionWidth + 26), sum(ResolutionWidth + 27), sum(ResolutionWidth + 28), sum(ResolutionWidth + 29), sum(ResolutionWidth + 30), sum(ResolutionWidth + 31), sum(ResolutionWidth + 32), sum(ResolutionWidth + 33), sum(ResolutionWidth + 34), sum(ResolutionWidth + 35), sum(ResolutionWidth + 36), sum(ResolutionWidth + 37), sum(ResolutionWidth + 38), sum(ResolutionWidth + 39), sum(ResolutionWidth + 40), sum(ResolutionWidth + 41), sum(ResolutionWidth + 42), sum(ResolutionWidth + 43), sum(ResolutionWidth + 44), sum(ResolutionWidth + 45), sum(ResolutionWidth + 46), sum(ResolutionWidth + 47), sum(ResolutionWidth + 48), sum(ResolutionWidth + 49), sum(ResolutionWidth + 50), sum(ResolutionWidth + 51), sum(ResolutionWidth + 52), sum(ResolutionWidth + 53), sum(ResolutionWidth + 54), sum(ResolutionWidth + 55), sum(ResolutionWidth + 56), sum(ResolutionWidth + 57), sum(ResolutionWidth + 58), sum(ResolutionWidth + 59), sum(ResolutionWidth + 60), sum(ResolutionWidth + 61), sum(ResolutionWidth + 62), sum(ResolutionWidth + 63), sum(ResolutionWidth + 64), sum(ResolutionWidth + 65), sum(ResolutionWidth + 66), sum(ResolutionWidth + 67), sum(ResolutionWidth + 68), sum(ResolutionWidth + 69), sum(ResolutionWidth + 70), sum(ResolutionWidth + 71), sum(ResolutionWidth + 72), sum(ResolutionWidth + 73), sum(ResolutionWidth + 74), sum(ResolutionWidth + 75), sum(ResolutionWidth + 76), sum(ResolutionWidth + 77), sum(ResolutionWidth + 78), sum(ResolutionWidth + 79), sum(ResolutionWidth + 80), sum(ResolutionWidth + 81), sum(ResolutionWidth + 82), sum(ResolutionWidth + 83), sum(ResolutionWidth + 84), sum(ResolutionWidth + 85), sum(ResolutionWidth + 86), sum(ResolutionWidth + 87), sum(ResolutionWidth + 88), sum(ResolutionWidth + 89) FROM hits;
Execution time: 28853 ms, Total time: 28854 ms
Execution time: 5654 ms, Total time: 5655 ms
Execution time: 5579 ms, Total time: 5581 ms
3
SELECT SearchEngineID, ClientIP, count(*) AS c, sum("Refresh"), avg(ResolutionWidth) FROM hits WHERE SearchPhrase != '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10;
Execution time: 31694 ms, Total time: 31925 ms
Execution time: 3872 ms, Total time: 4142 ms
Execution time: 3928 ms, Total time: 4162 ms
3
SELECT WatchID, ClientIP, count(*) AS c, sum("Refresh"), avg(ResolutionWidth) FROM hits WHERE SearchPhrase != '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;
Execution time: 43690 ms, Total time: 44297 ms
Execution time: 8221 ms, Total time: 8825 ms
Execution time: 8115 ms, Total time: 8711 ms
3
SELECT URL, count(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10;
Execution time: 29669 ms, Total time: 29715 ms
Execution time: 1623 ms, Total time: 1669 ms
Execution time: 1534 ms, Total time: 1586 ms
3
SELECT 1, URL, count(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10;
Execution time: 34860 ms, Total time: 35201 ms
Execution time: 7075 ms, Total time: 7414 ms
Execution time: 7164 ms, Total time: 7567 ms
3
SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, count(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10;
Execution time: 26467 ms, Total time: 26724 ms
Execution time: 5740 ms, Total time: 6026 ms
Execution time: 5667 ms, Total time: 5920 ms
3
SELECT URL, count(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND "Refresh" = 0 AND URL != '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10;
Execution time: 31899 ms, Total time: 31908 ms
Execution time: 1141 ms, Total time: 1154 ms
Execution time: 1155 ms, Total time: 1168 ms
3
SELECT Title, count(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND "Refresh" = 0 AND Title != '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10;
Execution time: 27991 ms, Total time: 27997 ms
Execution time: 719 ms, Total time: 724 ms
Execution time: 737 ms, Total time: 744 ms
3
SELECT URL, count(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND "Refresh" = 0 AND IsLink != 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 1000;
Execution time: 34651 ms, Total time: 34661 ms
Execution time: 1182 ms, Total time: 1200 ms
Execution time: 1142 ms, Total time: 1159 ms
3
SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, count(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND "Refresh" = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 1000;
Execution time: 30130 ms, Total time: 30136 ms
Execution time: 461 ms, Total time: 467 ms
Execution time: 445 ms, Total time: 451 ms
3
SELECT URLHash, EventDate, count(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND "Refresh" = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 686716256552154761 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 100;
Execution time: 19989 ms, Total time: 19991 ms
Execution time: 326 ms, Total time: 327 ms
Execution time: 325 ms, Total time: 326 ms
3
SELECT WindowClientWidth, WindowClientHeight, count(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND "Refresh" = 0 AND DontCountHits = 0 AND URLHash = 686716256552154761 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10000;
Execution time: 18658 ms, Total time: 18660 ms
Execution time: 265 ms, Total time: 266 ms
Execution time: 254 ms, Total time: 255 ms
3
SELECT DATE_TRUNC(minute, EventTime) AS "Minute", count(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-02' AND "Refresh" = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC(minute, EventTime) ORDER BY DATE_TRUNC(minute, EventTime);
Execution time: 25225 ms, Total time: 25227 ms
Execution time: 210 ms, Total time: 212 ms
Execution time: 199 ms, Total time: 200 ms

View File

@ -0,0 +1,43 @@
SELECT count(*) FROM {table};
SELECT count(*) FROM {table} WHERE AdvEngineID != 0;
SELECT sum(AdvEngineID), count(*), avg(ResolutionWidth) FROM {table};
SELECT sum(UserID) FROM {table};
SELECT APPROX_COUNT_DISTINCT(UserID) FROM {table};
SELECT APPROX_COUNT_DISTINCT(SearchPhrase) FROM {table};
SELECT min(EventDate), max(EventDate) FROM {table};
SELECT AdvEngineID, count(*) FROM {table} WHERE AdvEngineID != 0 GROUP BY AdvEngineID ORDER BY count(*) DESC;
SELECT RegionID, APPROX_COUNT_DISTINCT(UserID) AS u FROM {table} GROUP BY RegionID ORDER BY u DESC LIMIT 10;
SELECT RegionID, sum(AdvEngineID), count(*) AS c, avg(ResolutionWidth), APPROX_COUNT_DISTINCT(UserID) FROM {table} GROUP BY RegionID ORDER BY c DESC LIMIT 10;
SELECT MobilePhoneModel, APPROX_COUNT_DISTINCT(UserID) AS u FROM {table} WHERE MobilePhoneModel != '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;
SELECT MobilePhone, MobilePhoneModel, APPROX_COUNT_DISTINCT(UserID) AS u FROM {table} WHERE MobilePhoneModel != '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;
SELECT SearchPhrase, count(*) AS c FROM {table} WHERE SearchPhrase != '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
SELECT SearchPhrase, APPROX_COUNT_DISTINCT(UserID) AS u FROM {table} WHERE SearchPhrase != '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;
SELECT SearchEngineID, SearchPhrase, count(*) AS c FROM {table} WHERE SearchPhrase != '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10;
SELECT UserID, count(*) FROM {table} GROUP BY UserID ORDER BY count(*) DESC LIMIT 10;
SELECT UserID, SearchPhrase, count(*) FROM {table} GROUP BY UserID, SearchPhrase ORDER BY count(*) DESC LIMIT 10;
SELECT UserID, SearchPhrase, count(*) FROM {table} GROUP BY UserID, SearchPhrase LIMIT 10;
SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, count(*) FROM {table} GROUP BY UserID, m, SearchPhrase ORDER BY count(*) DESC LIMIT 10;
SELECT UserID FROM {table} WHERE UserID = -6101065172474983726;
SELECT count(*) FROM {table} WHERE URL LIKE '%metrika%';
SELECT SearchPhrase, min(URL), count(*) AS c FROM {table} WHERE URL LIKE '%metrika%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
SELECT SearchPhrase, min(URL), min(Title), count(*) AS c, APPROX_COUNT_DISTINCT(UserID) FROM {table} WHERE Title LIKE '%Яндекс%' AND URL NOT LIKE '%.yandex.%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
SELECT * FROM {table} WHERE URL LIKE '%metrika%' ORDER BY EventTime LIMIT 10;
SELECT SearchPhrase FROM {table} WHERE SearchPhrase != '' ORDER BY EventTime LIMIT 10;
SELECT SearchPhrase FROM {table} WHERE SearchPhrase != '' ORDER BY SearchPhrase LIMIT 10;
SELECT SearchPhrase FROM {table} WHERE SearchPhrase != '' ORDER BY EventTime, SearchPhrase LIMIT 10;
SELECT CounterID, avg(length(URL)) AS l, count(*) AS c FROM {table} WHERE URL != '' GROUP BY CounterID HAVING c > 100000 ORDER BY l DESC LIMIT 25;
SELECT domainWithoutWWW(Referer) AS key, avg(length(Referer)) AS l, count(*) AS c, min(Referer) FROM {table} WHERE Referer != '' GROUP BY key HAVING c > 100000 ORDER BY l DESC LIMIT 25;
SELECT sum(ResolutionWidth), sum(ResolutionWidth + 1), sum(ResolutionWidth + 2), sum(ResolutionWidth + 3), sum(ResolutionWidth + 4), sum(ResolutionWidth + 5), sum(ResolutionWidth + 6), sum(ResolutionWidth + 7), sum(ResolutionWidth + 8), sum(ResolutionWidth + 9), sum(ResolutionWidth + 10), sum(ResolutionWidth + 11), sum(ResolutionWidth + 12), sum(ResolutionWidth + 13), sum(ResolutionWidth + 14), sum(ResolutionWidth + 15), sum(ResolutionWidth + 16), sum(ResolutionWidth + 17), sum(ResolutionWidth + 18), sum(ResolutionWidth + 19), sum(ResolutionWidth + 20), sum(ResolutionWidth + 21), sum(ResolutionWidth + 22), sum(ResolutionWidth + 23), sum(ResolutionWidth + 24), sum(ResolutionWidth + 25), sum(ResolutionWidth + 26), sum(ResolutionWidth + 27), sum(ResolutionWidth + 28), sum(ResolutionWidth + 29), sum(ResolutionWidth + 30), sum(ResolutionWidth + 31), sum(ResolutionWidth + 32), sum(ResolutionWidth + 33), sum(ResolutionWidth + 34), sum(ResolutionWidth + 35), sum(ResolutionWidth + 36), sum(ResolutionWidth + 37), sum(ResolutionWidth + 38), sum(ResolutionWidth + 39), sum(ResolutionWidth + 40), sum(ResolutionWidth + 41), sum(ResolutionWidth + 42), sum(ResolutionWidth + 43), sum(ResolutionWidth + 44), sum(ResolutionWidth + 45), sum(ResolutionWidth + 46), sum(ResolutionWidth + 47), sum(ResolutionWidth + 48), sum(ResolutionWidth + 49), sum(ResolutionWidth + 50), sum(ResolutionWidth + 51), sum(ResolutionWidth + 52), sum(ResolutionWidth + 53), sum(ResolutionWidth + 54), sum(ResolutionWidth + 55), sum(ResolutionWidth + 56), sum(ResolutionWidth + 57), sum(ResolutionWidth + 58), sum(ResolutionWidth + 59), sum(ResolutionWidth + 60), sum(ResolutionWidth + 61), sum(ResolutionWidth + 62), sum(ResolutionWidth + 63), sum(ResolutionWidth + 64), sum(ResolutionWidth + 65), sum(ResolutionWidth + 66), sum(ResolutionWidth + 67), sum(ResolutionWidth + 68), sum(ResolutionWidth + 69), sum(ResolutionWidth + 70), sum(ResolutionWidth + 71), sum(ResolutionWidth + 72), sum(ResolutionWidth + 73), sum(ResolutionWidth + 74), sum(ResolutionWidth + 75), sum(ResolutionWidth + 76), sum(ResolutionWidth + 77), sum(ResolutionWidth + 78), sum(ResolutionWidth + 79), sum(ResolutionWidth + 80), sum(ResolutionWidth + 81), sum(ResolutionWidth + 82), sum(ResolutionWidth + 83), sum(ResolutionWidth + 84), sum(ResolutionWidth + 85), sum(ResolutionWidth + 86), sum(ResolutionWidth + 87), sum(ResolutionWidth + 88), sum(ResolutionWidth + 89) FROM {table};
SELECT SearchEngineID, ClientIP, count(*) AS c, sum("Refresh"), avg(ResolutionWidth) FROM {table} WHERE SearchPhrase != '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10;
SELECT WatchID, ClientIP, count(*) AS c, sum("Refresh"), avg(ResolutionWidth) FROM {table} WHERE SearchPhrase != '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;
#SELECT WatchID, ClientIP, count(*) AS c, sum("Refresh"), avg(ResolutionWidth) FROM {table} GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;
SELECT URL, count(*) AS c FROM {table} GROUP BY URL ORDER BY c DESC LIMIT 10;
SELECT 1, URL, count(*) AS c FROM {table} GROUP BY 1, URL ORDER BY c DESC LIMIT 10;
SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, count(*) AS c FROM {table} GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10;
SELECT URL, count(*) AS PageViews FROM {table} WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND "Refresh" = 0 AND URL != '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10;
SELECT Title, count(*) AS PageViews FROM {table} WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND "Refresh" = 0 AND Title != '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10;
SELECT URL, count(*) AS PageViews FROM {table} WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND "Refresh" = 0 AND IsLink != 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 1000;
SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, count(*) AS PageViews FROM {table} WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND "Refresh" = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 1000;
SELECT URLHash, EventDate, count(*) AS PageViews FROM {table} WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND "Refresh" = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 686716256552154761 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 100;
SELECT WindowClientWidth, WindowClientHeight, count(*) AS PageViews FROM {table} WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND "Refresh" = 0 AND DontCountHits = 0 AND URLHash = 686716256552154761 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10000;
SELECT DATE_TRUNC(minute, EventTime) AS "Minute", count(*) AS PageViews FROM {table} WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-02' AND "Refresh" = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC(minute, EventTime) ORDER BY DATE_TRUNC(minute, EventTime);

View File

@ -7,7 +7,7 @@
#
# Sets values of:
# OPENLDAP_FOUND - TRUE if found
# OPENLDAP_INCLUDE_DIR - path to the include directory
# OPENLDAP_INCLUDE_DIRS - paths to the include directories
# OPENLDAP_LIBRARIES - paths to the libldap and liblber libraries
# OPENLDAP_LDAP_LIBRARY - paths to the libldap library
# OPENLDAP_LBER_LIBRARY - paths to the liblber library
@ -28,11 +28,11 @@ if(OPENLDAP_USE_REENTRANT_LIBS)
endif()
if(OPENLDAP_ROOT_DIR)
find_path(OPENLDAP_INCLUDE_DIR NAMES "ldap.h" "lber.h" PATHS "${OPENLDAP_ROOT_DIR}" PATH_SUFFIXES "include" NO_DEFAULT_PATH)
find_path(OPENLDAP_INCLUDE_DIRS NAMES "ldap.h" "lber.h" PATHS "${OPENLDAP_ROOT_DIR}" PATH_SUFFIXES "include" NO_DEFAULT_PATH)
find_library(OPENLDAP_LDAP_LIBRARY NAMES "ldap${_r_suffix}" PATHS "${OPENLDAP_ROOT_DIR}" PATH_SUFFIXES "lib" NO_DEFAULT_PATH)
find_library(OPENLDAP_LBER_LIBRARY NAMES "lber" PATHS "${OPENLDAP_ROOT_DIR}" PATH_SUFFIXES "lib" NO_DEFAULT_PATH)
else()
find_path(OPENLDAP_INCLUDE_DIR NAMES "ldap.h" "lber.h")
find_path(OPENLDAP_INCLUDE_DIRS NAMES "ldap.h" "lber.h")
find_library(OPENLDAP_LDAP_LIBRARY NAMES "ldap${_r_suffix}")
find_library(OPENLDAP_LBER_LIBRARY NAMES "lber")
endif()
@ -44,10 +44,10 @@ set(OPENLDAP_LIBRARIES ${OPENLDAP_LDAP_LIBRARY} ${OPENLDAP_LBER_LIBRARY})
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(
OpenLDAP DEFAULT_MSG
OPENLDAP_INCLUDE_DIR OPENLDAP_LDAP_LIBRARY OPENLDAP_LBER_LIBRARY
OPENLDAP_INCLUDE_DIRS OPENLDAP_LDAP_LIBRARY OPENLDAP_LBER_LIBRARY
)
mark_as_advanced(OPENLDAP_INCLUDE_DIR OPENLDAP_LIBRARIES OPENLDAP_LDAP_LIBRARY OPENLDAP_LBER_LIBRARY)
mark_as_advanced(OPENLDAP_INCLUDE_DIRS OPENLDAP_LIBRARIES OPENLDAP_LDAP_LIBRARY OPENLDAP_LBER_LIBRARY)
if(OPENLDAP_USE_STATIC_LIBS)
set(CMAKE_FIND_LIBRARY_SUFFIXES ${_orig_CMAKE_FIND_LIBRARY_SUFFIXES})

View File

@ -1,9 +1,9 @@
# This strings autochanged from release_lib.sh:
SET(VERSION_REVISION 54436)
SET(VERSION_REVISION 54437)
SET(VERSION_MAJOR 20)
SET(VERSION_MINOR 6)
SET(VERSION_MINOR 7)
SET(VERSION_PATCH 1)
SET(VERSION_GITHASH efc57fb063b3fb4df968d916720ec4d4ced4642e)
SET(VERSION_DESCRIBE v20.6.1.1-prestable)
SET(VERSION_STRING 20.6.1.1)
SET(VERSION_GITHASH d64e51d1a78c1b53c33915ca0f75c97b2333844f)
SET(VERSION_DESCRIBE v20.7.1.1-prestable)
SET(VERSION_STRING 20.7.1.1)
# end of autochange

View File

@ -1,4 +1,5 @@
SET(ENABLE_AMQPCPP ${ENABLE_LIBRARIES})
option(ENABLE_AMQPCPP "Enable AMQP-CPP" ${ENABLE_LIBRARIES})
if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/AMQP-CPP/CMakeLists.txt")
message (WARNING "submodule contrib/AMQP-CPP is missing. to fix try run: \n git submodule update --init --recursive")
set (ENABLE_AMQPCPP 0)

View File

@ -1,3 +1,7 @@
option (ENABLE_GTEST_LIBRARY "Enable gtest library" ${ENABLE_LIBRARIES})
if (ENABLE_GTEST_LIBRARY)
option (USE_INTERNAL_GTEST_LIBRARY "Set to FALSE to use system Google Test instead of bundled" ${NOT_UNBUNDLED})
if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/googletest/googletest/CMakeLists.txt")
@ -28,4 +32,6 @@ if((GTEST_INCLUDE_DIRS AND GTEST_BOTH_LIBRARIES) OR GTEST_SRC_DIR)
set(USE_GTEST 1)
endif()
endif()
message (STATUS "Using gtest=${USE_GTEST}: ${GTEST_INCLUDE_DIRS} : ${GTEST_BOTH_LIBRARIES} : ${GTEST_SRC_DIR}")

View File

@ -16,11 +16,16 @@ if (ENABLE_LDAP)
set (OPENLDAP_USE_REENTRANT_LIBS 1)
if (NOT USE_INTERNAL_LDAP_LIBRARY)
if (APPLE AND NOT OPENLDAP_ROOT_DIR)
set (OPENLDAP_ROOT_DIR "/usr/local/opt/openldap")
endif ()
if (OPENLDAP_USE_STATIC_LIBS)
message (WARNING "Unable to use external static OpenLDAP libraries, falling back to the bundled version.")
set (USE_INTERNAL_LDAP_LIBRARY 1)
else ()
if (APPLE AND NOT OPENLDAP_ROOT_DIR)
set (OPENLDAP_ROOT_DIR "/usr/local/opt/openldap")
endif ()
find_package (OpenLDAP)
find_package (OpenLDAP)
endif ()
endif ()
if (NOT OPENLDAP_FOUND AND NOT MISSING_INTERNAL_LDAP_LIBRARY)
@ -54,7 +59,10 @@ if (ENABLE_LDAP)
else ()
set (USE_INTERNAL_LDAP_LIBRARY 1)
set (OPENLDAP_ROOT_DIR "${ClickHouse_SOURCE_DIR}/contrib/openldap")
set (OPENLDAP_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/openldap/include")
set (OPENLDAP_INCLUDE_DIRS
"${ClickHouse_SOURCE_DIR}/contrib/openldap-cmake/${_system_name}_${_system_processor}/include"
"${ClickHouse_SOURCE_DIR}/contrib/openldap/include"
)
# Below, 'ldap'/'ldap_r' and 'lber' will be resolved to
# the targets defined in contrib/openldap-cmake/CMakeLists.txt
if (OPENLDAP_USE_REENTRANT_LIBS)
@ -73,4 +81,4 @@ if (ENABLE_LDAP)
endif ()
endif ()
message (STATUS "Using ldap=${USE_LDAP}: ${OPENLDAP_INCLUDE_DIR} : ${OPENLDAP_LIBRARIES}")
message (STATUS "Using ldap=${USE_LDAP}: ${OPENLDAP_INCLUDE_DIRS} : ${OPENLDAP_LIBRARIES}")

View File

@ -1,3 +1,7 @@
option(ENABLE_GSASL_LIBRARY "Enable gsasl library" ${ENABLE_LIBRARIES})
if (ENABLE_GSASL_LIBRARY)
option (USE_INTERNAL_LIBGSASL_LIBRARY "Set to FALSE to use system libgsasl library instead of bundled" ${NOT_UNBUNDLED})
if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/libgsasl/src/gsasl.h")
@ -24,4 +28,6 @@ if(LIBGSASL_LIBRARY AND LIBGSASL_INCLUDE_DIR)
set (USE_LIBGSASL 1)
endif()
endif()
message (STATUS "Using libgsasl=${USE_LIBGSASL}: ${LIBGSASL_INCLUDE_DIR} : ${LIBGSASL_LIBRARY}")

View File

@ -1,3 +1,7 @@
option (ENABLE_MSGPACK "Enable msgpack library" ${ENABLE_LIBRARIES})
if (ENABLE_MSGPACK)
option (USE_INTERNAL_MSGPACK_LIBRARY "Set to FALSE to use system msgpack library instead of bundled" ${NOT_UNBUNDLED})
if (USE_INTERNAL_MSGPACK_LIBRARY)
@ -14,4 +18,10 @@ else()
find_path(MSGPACK_INCLUDE_DIR NAMES msgpack.hpp PATHS ${MSGPACK_INCLUDE_PATHS})
endif()
message(STATUS "Using msgpack: ${MSGPACK_INCLUDE_DIR}")
if (MSGPACK_INCLUDE_DIR)
set(USE_MSGPACK 1)
endif()
endif()
message(STATUS "Using msgpack=${USE_MSGPACK}: ${MSGPACK_INCLUDE_DIR}")

View File

@ -1,17 +1,8 @@
if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/simdjson/include/simdjson/jsonparser.h")
if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/simdjson/include/simdjson.h")
message (WARNING "submodule contrib/simdjson is missing. to fix try run: \n git submodule update --init --recursive")
return()
endif ()
if (NOT HAVE_SSE42)
message (WARNING "submodule contrib/simdjson requires support of SSE4.2 instructions")
return()
elseif (NOT HAVE_PCLMULQDQ)
message (WARNING "submodule contrib/simdjson requires support of PCLMULQDQ instructions")
return()
endif ()
option (USE_SIMDJSON "Use simdjson" ON)
set (SIMDJSON_LIBRARY "simdjson")
message(STATUS "Using simdjson=${USE_SIMDJSON}: ${SIMDJSON_LIBRARY}")
message(STATUS "Using simdjson=${USE_SIMDJSON}")

20
cmake/find/stats.cmake Normal file

@ -0,0 +1,20 @@
option(ENABLE_STATS "Enable StatsLib library" ${ENABLE_LIBRARIES})
if (ENABLE_STATS)
if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/stats")
message (WARNING "submodule contrib/stats is missing. to fix try run: \n git submodule update --init --recursive")
set (ENABLE_STATS 0)
set (USE_STATS 0)
elseif (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/gcem")
message (WARNING "submodule contrib/gcem is missing. to fix try run: \n git submodule update --init --recursive")
set (ENABLE_STATS 0)
set (USE_STATS 0)
else()
set(STATS_INCLUDE_DIR ${ClickHouse_SOURCE_DIR}/contrib/stats/include)
set(GCEM_INCLUDE_DIR ${ClickHouse_SOURCE_DIR}/contrib/gcem/include)
set (USE_STATS 1)
endif()
endif()
message (STATUS "Using stats=${USE_STATS} : ${STATS_INCLUDE_DIR}")
message (STATUS "Using gcem=${USE_STATS}: ${GCEM_INCLUDE_DIR}")

View File

@ -11,7 +11,12 @@ else ()
set (BUILTINS_LIBRARY "-lgcc")
endif ()
if (OS_ANDROID)
# pthread and rt are included in libc
set (DEFAULT_LIBS "${DEFAULT_LIBS} ${BUILTINS_LIBRARY} ${COVERAGE_OPTION} -lc -lm -ldl")
else ()
set (DEFAULT_LIBS "${DEFAULT_LIBS} ${BUILTINS_LIBRARY} ${COVERAGE_OPTION} -lc -lm -lrt -lpthread -ldl")
endif ()
message(STATUS "Default libraries: ${DEFAULT_LIBS}")
@ -35,7 +40,11 @@ add_library(global-libs INTERFACE)
set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED)
add_subdirectory(base/glibc-compatibility)
if (NOT OS_ANDROID)
# Our compatibility layer doesn't build under Android, many errors in musl.
add_subdirectory(base/glibc-compatibility)
endif ()
include (cmake/find/unwind.cmake)
include (cmake/find/cxx.cmake)

View File

@ -1,6 +1,11 @@
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
set (OS_LINUX 1)
add_definitions(-D OS_LINUX)
elseif (CMAKE_SYSTEM_NAME MATCHES "Android")
# This is a toy configuration and not in CI, so expect it to be broken.
# Use cmake flags such as: -DCMAKE_TOOLCHAIN_FILE=~/ch2/android-ndk-r21d/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=28
set (OS_ANDROID 1)
add_definitions(-D OS_ANDROID)
elseif (CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
set (OS_FREEBSD 1)
add_definitions(-D OS_FREEBSD)
@ -17,7 +22,7 @@ if (CMAKE_CROSSCOMPILING)
set (ENABLE_PARQUET OFF CACHE INTERNAL "")
set (ENABLE_ICU OFF CACHE INTERNAL "")
set (ENABLE_FASTOPS OFF CACHE INTERNAL "")
elseif (OS_LINUX)
elseif (OS_LINUX OR OS_ANDROID)
if (ARCH_AARCH64)
# FIXME: broken dependencies
set (ENABLE_PROTOBUF OFF CACHE INTERNAL "")

View File

@ -22,7 +22,7 @@ elseif (COMPILER_CLANG)
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS ${APPLE_CLANG_MINIMUM_VERSION})
message (FATAL_ERROR "AppleClang compiler version must be at least ${APPLE_CLANG_MINIMUM_VERSION} (Xcode ${XCODE_MINIMUM_VERSION}).")
elseif (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 11.0.0)
# char8_t is available staring (upstream vanilla) Clang 7, but prior to Clang 8,
# char8_t is available starting (upstream vanilla) Clang 7, but prior to Clang 8,
# it is not enabled by -std=c++20 and can be enabled with an explicit -fchar8_t.
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fchar8_t")
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fchar8_t")

View File

@ -44,13 +44,8 @@ endif ()
if (USE_INTERNAL_RE2_LIBRARY)
set(RE2_BUILD_TESTING 0 CACHE INTERNAL "")
function(re2_support)
# make option() honor normal variables for BUILD_SHARED_LIBS
set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
add_subdirectory (re2)
add_subdirectory (re2_st)
endfunction()
re2_support()
add_subdirectory (re2)
add_subdirectory (re2_st)
endif ()
if (USE_INTERNAL_DOUBLE_CONVERSION_LIBRARY)
@ -107,7 +102,7 @@ if (USE_INTERNAL_SSL_LIBRARY)
add_library(OpenSSL::SSL ALIAS ${OPENSSL_SSL_LIBRARY})
endif ()
if (ENABLE_LDAP AND USE_INTERNAL_LDAP_LIBRARY)
if (USE_INTERNAL_LDAP_LIBRARY)
add_subdirectory (openldap-cmake)
endif ()
@ -227,19 +222,11 @@ if (USE_INTERNAL_AVRO_LIBRARY)
endif()
if(USE_INTERNAL_GTEST_LIBRARY)
# Wrap into function because of CMAKE_POLICY_DEFAULT_CMP0022
function(googletest_support)
set(GOOGLETEST_VERSION 1.10.0) # master
# Google Test from sources uses too old cmake, 2.6.x, and CMP0022 should
# set, to avoid using deprecated LINK_INTERFACE_LIBRARIES(_<CONFIG>)? over
# INTERFACE_LINK_LIBRARIES.
set(CMAKE_POLICY_DEFAULT_CMP0022 NEW)
# Google Test from sources
add_subdirectory(${ClickHouse_SOURCE_DIR}/contrib/googletest/googletest ${CMAKE_CURRENT_BINARY_DIR}/googletest)
# avoid problems with <regexp.h>
target_compile_definitions (gtest INTERFACE GTEST_HAS_POSIX_RE=0)
endfunction()
googletest_support()
set(GOOGLETEST_VERSION 1.10.0) # master
# Google Test from sources
add_subdirectory(${ClickHouse_SOURCE_DIR}/contrib/googletest/googletest ${CMAKE_CURRENT_BINARY_DIR}/googletest)
# avoid problems with <regexp.h>
target_compile_definitions (gtest INTERFACE GTEST_HAS_POSIX_RE=0)
elseif(GTEST_SRC_DIR)
add_subdirectory(${GTEST_SRC_DIR}/googletest ${CMAKE_CURRENT_BINARY_DIR}/googletest)
target_compile_definitions(gtest INTERFACE GTEST_HAS_POSIX_RE=0)
@ -320,3 +307,7 @@ endif()
add_subdirectory (fmtlib-cmake)
if (USE_STATS)
add_subdirectory (stats-cmake)
add_subdirectory (gcem)
endif()

View File

@ -24,7 +24,7 @@ set (SRCS
add_library(amqp-cpp ${SRCS})
target_compile_options (amqp-cpp
PUBLIC
PRIVATE
-Wno-old-style-cast
-Wno-inconsistent-missing-destructor-override
-Wno-deprecated
@ -38,7 +38,7 @@ target_compile_options (amqp-cpp
-w
)
target_include_directories (amqp-cpp PUBLIC ${LIBRARY_DIR}/include)
target_include_directories (amqp-cpp SYSTEM PUBLIC ${LIBRARY_DIR}/include)
target_link_libraries (amqp-cpp PUBLIC ssl)

View File

@ -20,5 +20,7 @@
#define ARROW_VERSION_PATCH
#define ARROW_VERSION ((ARROW_VERSION_MAJOR * 1000) + ARROW_VERSION_MINOR) * 1000 + ARROW_VERSION_PATCH
/* #undef DOUBLE_CONVERSION_HAS_CASE_INSENSIBILITY */
#define ARROW_SO_VERSION ""
#define ARROW_FULL_SO_VERSION ""
/* #undef GRPCPP_PP_INCLUDE */

2
contrib/base64 vendored

@ -1 +1 @@
Subproject commit 95ba56a9b041f9933f5cd2bbb2ee4e083468c20a
Subproject commit af9b331f2b4f30b41c70f3a571ff904a8251c1d3

2
contrib/cppkafka vendored

@ -1 +1 @@
Subproject commit f555ee36aaa74d17ca0dab3ce472070a610b2966
Subproject commit b06e64ef5bffd636d918a742c689f69130c1dbab

2
contrib/fmtlib vendored

@ -1 +1 @@
Subproject commit 297c3b2ed551a4989826fc8c4780bf533e964bd9
Subproject commit c108ee1d590089ccf642fc85652b845924067af2

1
contrib/gcem vendored Submodule

@ -0,0 +1 @@
Subproject commit 8d4f1b5d76ea8f6ff12f3f4f34cda45424556b00

2
contrib/libhdfs3 vendored

@ -1 +1 @@
Subproject commit e2131aa752d7e95441e08f9a18304c1445f2576a
Subproject commit 1b666578c85094306b061352078022f6350bfab8

2
contrib/simdjson vendored

@ -1 +1 @@
Subproject commit 560f0742cc0895d00d78359dbdeb82064a24adb8
Subproject commit 1e4aa116e5a39e4ba23b9a93e6c7f048c5105b20

View File

@ -1,14 +1,6 @@
set(SIMDJSON_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/simdjson/include")
set(SIMDJSON_SRC_DIR "${SIMDJSON_INCLUDE_DIR}/../src")
set(SIMDJSON_SRC
${SIMDJSON_SRC_DIR}/document.cpp
${SIMDJSON_SRC_DIR}/error.cpp
${SIMDJSON_SRC_DIR}/implementation.cpp
${SIMDJSON_SRC_DIR}/jsonioutil.cpp
${SIMDJSON_SRC_DIR}/jsonminifier.cpp
${SIMDJSON_SRC_DIR}/stage1_find_marks.cpp
${SIMDJSON_SRC_DIR}/stage2_build_tape.cpp
)
set(SIMDJSON_SRC_DIR "${ClickHouse_SOURCE_DIR}/contrib/simdjson/src")
set(SIMDJSON_SRC ${SIMDJSON_SRC_DIR}/simdjson.cpp)
add_library(${SIMDJSON_LIBRARY} ${SIMDJSON_SRC})
target_include_directories(${SIMDJSON_LIBRARY} SYSTEM PUBLIC "${SIMDJSON_INCLUDE_DIR}" PRIVATE "${SIMDJSON_SRC_DIR}")
add_library(simdjson ${SIMDJSON_SRC})
target_include_directories(simdjson SYSTEM PUBLIC "${SIMDJSON_INCLUDE_DIR}" PRIVATE "${SIMDJSON_SRC_DIR}")

1
contrib/stats vendored Submodule

@ -0,0 +1 @@
Subproject commit b6dd459c10a88c7ea04693c007e9e35820c5d9ad

View File

@ -0,0 +1,9 @@
# stats is a header-only library of probability density functions,
# cumulative distribution functions, quantile functions, and random sampling methods.
set(STATS_INCLUDE_DIR ${ClickHouse_SOURCE_DIR}/contrib/stats/include)
set(GCEM_INCLUDE_DIR ${ClickHouse_SOURCE_DIR}/contrib/gcem/include)
add_library(stats INTERFACE)
target_include_directories(stats SYSTEM INTERFACE ${STATS_INCLUDE_DIR})
target_include_directories(stats SYSTEM INTERFACE ${GCEM_INCLUDE_DIR})

4
debian/changelog vendored

@ -1,5 +1,5 @@
clickhouse (20.6.1.1) unstable; urgency=low
clickhouse (20.7.1.1) unstable; urgency=low
* Modified source code
-- clickhouse-release <clickhouse-release@yandex-team.ru> Mon, 22 Jun 2020 20:40:23 +0300
-- clickhouse-release <clickhouse-release@yandex-team.ru> Mon, 13 Jul 2020 18:25:58 +0300

View File

@ -1,7 +1,7 @@
FROM ubuntu:18.04
ARG repository="deb https://repo.clickhouse.tech/deb/stable/ main/"
ARG version=20.6.1.*
ARG version=20.7.1.*
RUN apt-get update \
&& apt-get install --yes --no-install-recommends \

View File

@ -31,6 +31,10 @@
"name": "yandex/clickhouse-integration-test",
"dependent": []
},
"docker/test/fuzzer": {
"name": "yandex/clickhouse-fuzzer",
"dependent": []
},
"docker/test/performance-comparison": {
"name": "yandex/clickhouse-performance-comparison",
"dependent": []
@ -83,5 +87,21 @@
"docker/test/testflows/runner": {
"name": "yandex/clickhouse-testflows-runner",
"dependent": []
},
"docker/test/fasttest": {
"name": "yandex/clickhouse-fasttest",
"dependent": []
},
"docker/test/integration/s3_proxy": {
"name": "yandex/clickhouse-s3-proxy",
"dependent": []
},
"docker/test/integration/resolver": {
"name": "yandex/clickhouse-python-bottle",
"dependent": []
},
"docker/test/integration/helper_container": {
"name": "yandex/clickhouse-integration-helper",
"dependent": []
}
}

View File

@ -33,6 +33,25 @@ then
rm /output/clickhouse-odbc-bridge ||:
cp -r ../docker/test/performance-comparison /output/scripts ||:
# We have to know the revision that corresponds to this binary build.
# It is not the nominal SHA from pull/*/head, but the pull/*/merge, which is
# head merged to master by github, at some point after the PR is updated.
# There are some quirks to consider:
# - apparently the real SHA is not recorded in system.build_options;
# - it can change at any time as github pleases, so we can't just record
# the SHA and use it later, it might become inaccessible;
# - CI has an immutable snapshot of the repository that it uses for all checks
# for a given nominal SHA, but it is not accessible outside Yandex.
# This is why we add this repository snapshot from CI to the performance test
# package.
mkdir /output/ch
git -C /output/ch init --bare
git -C /output/ch remote add origin /build
git -C /output/ch fetch --no-tags --depth 50 origin HEAD:pr
git -C /output/ch fetch --no-tags --depth 50 origin master:master
git -C /output/ch reset --soft pr
git -C /output/ch log -5
fi
# May be set for split build or for performance test.

View File

@ -1,7 +1,7 @@
FROM ubuntu:20.04
ARG repository="deb https://repo.clickhouse.tech/deb/stable/ main/"
ARG version=20.6.1.*
ARG version=20.7.1.*
ARG gosu_ver=1.10
RUN apt-get update \

View File

@ -1,7 +1,7 @@
FROM ubuntu:18.04
ARG repository="deb https://repo.clickhouse.tech/deb/stable/ main/"
ARG version=20.6.1.*
ARG version=20.7.1.*
RUN apt-get update && \
apt-get install -y apt-transport-https dirmngr && \

View File

@ -0,0 +1,65 @@
# docker build -t yandex/clickhouse-fasttest .
FROM ubuntu:19.10
ARG odbc_driver_url="https://github.com/ClickHouse/clickhouse-odbc/releases/download/v1.1.4.20200302/clickhouse-odbc-1.1.4-Linux.tar.gz"
ENV COMMIT_SHA=''
ENV PULL_REQUEST_NUMBER=''
RUN apt-get --allow-unauthenticated update -y && apt-get install --yes wget gnupg
RUN wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add -
RUN echo "deb [trusted=yes] http://apt.llvm.org/eoan/ llvm-toolchain-eoan-10 main" >> /etc/apt/sources.list
RUN apt-get --allow-unauthenticated update -y \
&& env DEBIAN_FRONTEND=noninteractive \
apt-get --allow-unauthenticated install --yes --no-install-recommends \
bash \
fakeroot \
ccache \
software-properties-common \
apt-transport-https \
ca-certificates \
wget \
bash \
fakeroot \
cmake \
ccache \
llvm-10 \
clang-10 \
lld-10 \
clang-tidy-10 \
ninja-build \
gperf \
git \
tzdata \
gperf \
rename \
build-essential \
expect \
python \
python-lxml \
python-termcolor \
python-requests \
unixodbc \
qemu-user-static \
sudo \
moreutils \
curl \
brotli
RUN mkdir -p /tmp/clickhouse-odbc-tmp \
&& wget --quiet -O - ${odbc_driver_url} | tar --strip-components=1 -xz -C /tmp/clickhouse-odbc-tmp \
&& cp /tmp/clickhouse-odbc-tmp/lib64/*.so /usr/local/lib/ \
&& odbcinst -i -d -f /tmp/clickhouse-odbc-tmp/share/doc/clickhouse-odbc/config/odbcinst.ini.sample \
&& odbcinst -i -s -l -f /tmp/clickhouse-odbc-tmp/share/doc/clickhouse-odbc/config/odbc.ini.sample \
&& rm -rf /tmp/clickhouse-odbc-tmp
# This symlink is required by gcc to find the lld linker
RUN ln -s /usr/bin/lld-10 /usr/bin/ld.lld
ENV TZ=Europe/Moscow
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
COPY run.sh /
CMD ["/bin/bash", "/run.sh"]

134
docker/test/fasttest/run.sh Executable file
View File

@ -0,0 +1,134 @@
#!/bin/bash
set -x -e
ls -la
git clone https://github.com/ClickHouse/ClickHouse.git | ts '%Y-%m-%d %H:%M:%S' | tee /test_output/clone_log.txt
cd ClickHouse
CLICKHOUSE_DIR=`pwd`
if [ "$PULL_REQUEST_NUMBER" != "0" ]; then
if git fetch origin "+refs/pull/$PULL_REQUEST_NUMBER/merge"; then
git checkout FETCH_HEAD
echo 'Cloned merge head'
else
git fetch
git checkout $COMMIT_SHA
echo 'Checked out the commit'
fi
else
if [ "$COMMIT_SHA" != "" ]; then
git checkout $COMMIT_SHA
fi
fi
SUBMODULES_TO_UPDATE="contrib/boost contrib/zlib-ng contrib/libxml2 contrib/poco contrib/libunwind contrib/ryu contrib/fmtlib contrib/base64 contrib/cctz contrib/libcpuid contrib/double-conversion contrib/libcxx contrib/libcxxabi contrib/libc-headers contrib/lz4 contrib/zstd contrib/fastops contrib/rapidjson contrib/re2 contrib/sparsehash-c11"
git submodule update --init --recursive $SUBMODULES_TO_UPDATE | ts '%Y-%m-%d %H:%M:%S' | tee /test_output/submodule_log.txt
export CMAKE_LIBS_CONFIG="-DENABLE_LIBRARIES=0 -DENABLE_TESTS=0 -DENABLE_UTILS=0 -DENABLE_EMBEDDED_COMPILER=0 -DENABLE_THINLTO=0 -DUSE_UNWIND=1"
export CCACHE_DIR=/ccache
export CCACHE_BASEDIR=/ClickHouse
export CCACHE_NOHASHDIR=true
export CCACHE_COMPILERCHECK=content
export CCACHE_MAXSIZE=15G
ccache --show-stats ||:
ccache --zero-stats ||:
mkdir build
cd build
CLICKHOUSE_BUILD_DIR=`pwd`
cmake .. -DCMAKE_INSTALL_PREFIX=/usr -DCMAKE_CXX_COMPILER=clang++-10 -DCMAKE_C_COMPILER=clang-10 $CMAKE_LIBS_CONFIG | ts '%Y-%m-%d %H:%M:%S' | tee /test_output/cmake_log.txt
ninja clickhouse-bundle | ts '%Y-%m-%d %H:%M:%S' | tee /test_output/build_log.txt
ninja install | ts '%Y-%m-%d %H:%M:%S' | tee /test_output/install_log.txt
ccache --show-stats ||:
mkdir -p /etc/clickhouse-server
mkdir -p /etc/clickhouse-client
mkdir -p /etc/clickhouse-server/config.d
mkdir -p /etc/clickhouse-server/users.d
mkdir -p /var/log/clickhouse-server
cp $CLICKHOUSE_DIR/programs/server/config.xml /etc/clickhouse-server/
cp $CLICKHOUSE_DIR/programs/server/users.xml /etc/clickhouse-server/
mkdir -p /etc/clickhouse-server/dict_examples
ln -s /usr/share/clickhouse-test/config/ints_dictionary.xml /etc/clickhouse-server/dict_examples/
ln -s /usr/share/clickhouse-test/config/strings_dictionary.xml /etc/clickhouse-server/dict_examples/
ln -s /usr/share/clickhouse-test/config/decimals_dictionary.xml /etc/clickhouse-server/dict_examples/
ln -s /usr/share/clickhouse-test/config/zookeeper.xml /etc/clickhouse-server/config.d/
ln -s /usr/share/clickhouse-test/config/listen.xml /etc/clickhouse-server/config.d/
ln -s /usr/share/clickhouse-test/config/part_log.xml /etc/clickhouse-server/config.d/
ln -s /usr/share/clickhouse-test/config/text_log.xml /etc/clickhouse-server/config.d/
ln -s /usr/share/clickhouse-test/config/metric_log.xml /etc/clickhouse-server/config.d/
ln -s /usr/share/clickhouse-test/config/query_masking_rules.xml /etc/clickhouse-server/config.d/
ln -s /usr/share/clickhouse-test/config/log_queries.xml /etc/clickhouse-server/users.d/
ln -s /usr/share/clickhouse-test/config/readonly.xml /etc/clickhouse-server/users.d/
ln -s /usr/share/clickhouse-test/config/access_management.xml /etc/clickhouse-server/users.d/
ln -s /usr/share/clickhouse-test/config/ints_dictionary.xml /etc/clickhouse-server/
ln -s /usr/share/clickhouse-test/config/strings_dictionary.xml /etc/clickhouse-server/
ln -s /usr/share/clickhouse-test/config/decimals_dictionary.xml /etc/clickhouse-server/
ln -s /usr/share/clickhouse-test/config/macros.xml /etc/clickhouse-server/config.d/
ln -s /usr/share/clickhouse-test/config/disks.xml /etc/clickhouse-server/config.d/
#ln -s /usr/share/clickhouse-test/config/secure_ports.xml /etc/clickhouse-server/config.d/
ln -s /usr/share/clickhouse-test/config/clusters.xml /etc/clickhouse-server/config.d/
ln -s /usr/share/clickhouse-test/config/graphite.xml /etc/clickhouse-server/config.d/
ln -s /usr/share/clickhouse-test/config/server.key /etc/clickhouse-server/
ln -s /usr/share/clickhouse-test/config/server.crt /etc/clickhouse-server/
ln -s /usr/share/clickhouse-test/config/dhparam.pem /etc/clickhouse-server/
ln -sf /usr/share/clickhouse-test/config/client_config.xml /etc/clickhouse-client/config.xml
clickhouse-server --config /etc/clickhouse-server/config.xml --daemon
until clickhouse-client --query "SELECT 1"
do
sleep 0.1
done
TESTS_TO_SKIP="parquet avro h3 odbc mysql sha256 _orc_ arrow 01098_temporary_and_external_tables 01083_expressions_in_engine_arguments hdfs 00911_tautological_compare protobuf capnproto java_hash hashing secure 00490_special_line_separators_and_characters_outside_of_bmp 00436_convert_charset 00105_shard_collations 01354_order_by_tuple_collate_const 01292_create_user 01098_msgpack_format 00929_multi_match_edit_distance 00926_multimatch 00834_cancel_http_readonly_queries_on_client_close brotli parallel_alter 00302_http_compression 00417_kill_query 01294_lazy_database_concurrent 01193_metadata_loading base64 01031_mutations_interpreter_and_context json client 01305_replica_create_drop_zookeeper 01092_memory_profiler 01355_ilike 01281_unsucceeded_insert_select_queries_counter live_view limit_memory memory_limit memory_leak 00110_external_sort 00682_empty_parts_merge 00701_rollup 00109_shard_totals_after_having ddl_dictionaries 01251_dict_is_in_infinite_loop 01259_dictionary_custom_settings_ddl 01268_dictionary_direct_layout 01280_ssd_complex_key_dictionary 00652_replicated_mutations_zookeeper 01411_bayesian_ab_testing"
clickhouse-test -j 4 --no-long --testname --shard --zookeeper --skip $TESTS_TO_SKIP 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee /test_output/test_log.txt
kill_clickhouse () {
kill `ps ax | grep clickhouse-server | grep -v 'grep' | awk '{print $1}'` 2>/dev/null
for i in {1..10}
do
if ! kill -0 `ps ax | grep clickhouse-server | grep -v 'grep' | awk '{print $1}'`; then
echo "No clickhouse process"
break
else
echo "Process" `ps ax | grep clickhouse-server | grep -v 'grep' | awk '{print $1}'` "still alive"
sleep 10
fi
done
}
FAILED_TESTS=`grep 'FAIL\|TIMEOUT\|ERROR' /test_output/test_log.txt | awk 'BEGIN { ORS=" " }; { print substr($3, 1, length($3)-1) }'`
if [[ ! -z "$FAILED_TESTS" ]]; then
kill_clickhouse
clickhouse-server --config /etc/clickhouse-server/config.xml --daemon
until clickhouse-client --query "SELECT 1"
do
sleep 0.1
done
echo "Going to run again: $FAILED_TESTS"
clickhouse-test --no-long --testname --shard --zookeeper $FAILED_TESTS 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee -a /test_output/test_log.txt
else
echo "No failed tests"
fi
mv /var/log/clickhouse-server/* /test_output

View File

@ -0,0 +1,38 @@
# docker build -t yandex/clickhouse-fuzzer .
FROM ubuntu:18.04
ENV LANG=C.UTF-8
ENV TZ=Europe/Moscow
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
RUN apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get install --yes --no-install-recommends \
bash \
ca-certificates \
curl \
gdb \
git \
libc6-dbg \
moreutils \
ncdu \
p7zip-full \
parallel \
psmisc \
rsync \
tree \
tzdata \
vim \
wget \
&& apt-get autoremove --yes \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
COPY * /
SHELL ["/bin/bash", "-c"]
CMD set -o pipefail \
&& cd /workspace \
&& /run-fuzzer.sh 2>&1 | ts "$(printf '%%Y-%%m-%%d %%H:%%M:%%S\t')" | tee main.log
# docker run --network=host --volume <workspace>:/workspace -e PR_TO_TEST=<> -e SHA_TO_TEST=<> yandex/clickhouse-fuzzer

View File

@ -0,0 +1,7 @@
<yandex>
<profiles>
<default>
<max_execution_time>10</max_execution_time>
</default>
</profiles>
</yandex>

179
docker/test/fuzzer/run-fuzzer.sh Executable file
View File

@ -0,0 +1,179 @@
#!/bin/bash
set -eux
set -o pipefail
trap "exit" INT TERM
trap 'kill $(jobs -pr) ||:' EXIT
stage=${stage:-}
script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
echo "$script_dir"
repo_dir=ch
function clone
{
(
rm -rf ch ||:
mkdir ch
cd ch
git init
git remote add origin https://github.com/ClickHouse/ClickHouse
git fetch --depth=1 origin "$SHA_TO_TEST"
# If not master, try to fetch pull/.../{head,merge}
if [ "$PR_TO_TEST" != "0" ]
then
git fetch --depth=1 origin "refs/pull/$PR_TO_TEST/*:refs/heads/pull/$PR_TO_TEST/*"
fi
git checkout "$SHA_TO_TEST"
)
}
function download
{
# wget -O- -nv -nd -c "https://clickhouse-builds.s3.yandex.net/$PR_TO_TEST/$SHA_TO_TEST/clickhouse_build_check/performance/performance.tgz" \
# | tar --strip-components=1 -zxv
wget -nv -nd -c "https://clickhouse-builds.s3.yandex.net/$PR_TO_TEST/$SHA_TO_TEST/clickhouse_build_check/clang-10_debug_none_bundled_unsplitted_disable_False_binary/clickhouse"
chmod +x clickhouse
ln -s ./clickhouse ./clickhouse-server
ln -s ./clickhouse ./clickhouse-client
}
function configure
{
rm -rf db ||:
mkdir db ||:
cp -av "$repo_dir"/programs/server/config* db
cp -av "$repo_dir"/programs/server/user* db
# TODO figure out which ones are needed
cp -av "$repo_dir"/tests/config/listen.xml db/config.d
cp -av "$script_dir"/query-fuzzer-tweaks-users.xml db/users.d
}
function watchdog
{
sleep 3600
echo "Fuzzing run has timed out"
killall clickhouse-client ||:
for x in {1..10}
do
if ! pgrep -f clickhouse-client
then
break
fi
sleep 1
done
killall -9 clickhouse-client ||:
}
function fuzz
{
./clickhouse-server --config-file db/config.xml -- --path db 2>&1 | tail -10000 > server.log &
server_pid=$!
kill -0 $server_pid
while ! ./clickhouse-client --query "select 1" && kill -0 $server_pid ; do echo . ; sleep 1 ; done
./clickhouse-client --query "select 1"
kill -0 $server_pid
echo Server started
fuzzer_exit_code=0
./clickhouse-client --query-fuzzer-runs=1000 \
< <(for f in $(ls ch/tests/queries/0_stateless/*.sql | sort -R); do cat "$f"; echo ';'; done) \
> >(tail -10000 > fuzzer.log) \
2>&1 \
|| fuzzer_exit_code=$?
echo "Fuzzer exit code is $fuzzer_exit_code"
./clickhouse-client --query "select elapsed, query from system.processes" ||:
killall clickhouse-server ||:
for x in {1..10}
do
if ! pgrep -f clickhouse-server
then
break
fi
sleep 1
done
killall -9 clickhouse-server ||:
}
case "$stage" in
"")
;&
"clone")
time clone
if [ -v FUZZ_LOCAL_SCRIPT ]
then
# just fall through
echo Using the testing script from docker container
:
else
# Run the testing script from the repository
echo Using the testing script from the repository
export stage=download
time ch/docker/test/fuzzer/run-fuzzer.sh
# Keep the error code
exit $?
fi
;&
"download")
time download
;&
"configure")
time configure
;&
"fuzz")
# Start a watchdog that should kill the fuzzer on timeout.
# The shell won't kill the child sleep when we kill it, so we have to put it
# into a separate process group so that we can kill them all.
set -m
watchdog &
watchdog_pid=$!
set +m
# Check that the watchdog has started
kill -0 $watchdog_pid
fuzzer_exit_code=0
time fuzz || fuzzer_exit_code=$?
kill -- -$watchdog_pid ||:
# Debug
date
sleep 10
jobs
pstree -aspgT
# Make files with status and description we'll show for this check on Github
task_exit_code=$fuzzer_exit_code
if [ "$fuzzer_exit_code" == 143 ]
then
# SIGTERM -- the fuzzer was killed by timeout, which means a normal run.
echo "success" > status.txt
echo "OK" > description.txt
task_exit_code=0
elif [ "$fuzzer_exit_code" == 210 ]
then
# Lost connection to the server. This probably means that the server died
# with abort.
echo "failure" > status.txt
if ! grep -a "Received signal \|Logical error" server.log > description.txt
then
echo "Lost connection to server. See the logs" > description.txt
fi
else
# Something different -- maybe the fuzzer itself died? Don't grep the
# server log in this case, because we will find a message about normal
# server termination (Received signal 15), which is confusing.
echo "failure" > status.txt
echo "Fuzzer failed ($fuzzer_exit_code). See the logs" > description.txt
fi
exit $task_exit_code
;&
esac

View File

@ -25,7 +25,7 @@ RUN rm -rf \
RUN apt-get clean
# Install MySQL ODBC driver
RUN curl 'https://cdn.mysql.com//Downloads/Connector-ODBC/8.0/mysql-connector-odbc-8.0.18-linux-glibc2.12-x86-64bit.tar.gz' --output 'mysql-connector.tar.gz' && tar -xzf mysql-connector.tar.gz && cd mysql-connector-odbc-8.0.18-linux-glibc2.12-x86-64bit/lib && mv * /usr/local/lib && ln -s /usr/local/lib/libmyodbc8a.so /usr/lib/x86_64-linux-gnu/odbc/libmyodbc.so
RUN curl 'https://cdn.mysql.com//Downloads/Connector-ODBC/8.0/mysql-connector-odbc-8.0.21-linux-glibc2.12-x86-64bit.tar.gz' --output 'mysql-connector.tar.gz' && tar -xzf mysql-connector.tar.gz && cd mysql-connector-odbc-8.0.21-linux-glibc2.12-x86-64bit/lib && mv * /usr/local/lib && ln -s /usr/local/lib/libmyodbc8a.so /usr/lib/x86_64-linux-gnu/odbc/libmyodbc.so
ENV TZ=Europe/Moscow
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone

View File

@ -1,3 +1,4 @@
# docker build -t yandex/clickhouse-integration-helper .
# Helper docker container to run iptables without sudo
FROM alpine

View File

@ -1,4 +1,5 @@
# docker build -t yandex/clickhouse-python-bottle .
# Helper docker container to run python bottle apps
FROM python:3
RUN python -m pip install bottle
RUN python -m pip install bottle

View File

@ -76,4 +76,3 @@ VOLUME /var/lib/docker
EXPOSE 2375
ENTRYPOINT ["dockerd-entrypoint.sh"]
CMD ["sh", "-c", "pytest $PYTEST_OPTS"]

View File

@ -5,50 +5,38 @@ services:
image: minio/minio
volumes:
- data1-1:/data1
- ${MINIO_CERTS_DIR:-}:/certs
ports:
- "9001:9001"
environment:
MINIO_ACCESS_KEY: minio
MINIO_SECRET_KEY: minio123
command: server --address :9001 /data1-1
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:9001/minio/health/live"]
interval: 30s
timeout: 20s
retries: 3
MINIO_PROMETHEUS_AUTH_TYPE: public
command: server --address :9001 --certs-dir /certs /data1-1
depends_on:
- redirect
- resolver
- proxy1
- proxy2
# Redirects all requests to origin Minio.
redirect:
image: schmunk42/nginx-redirect
volumes:
- /nginx:/nginx
environment:
- SERVER_REDIRECT=minio1:9001
- SERVER_REDIRECT_CODE=307
- SERVER_ACCESS_LOG=/nginx/access.log
# HTTP proxies for Minio.
# HTTP proxies for Minio.
proxy1:
image: vimagick/tinyproxy
image: yandex/clickhouse-s3-proxy
ports:
- "4081:8888"
- "8080" # Redirect proxy port
- "80" # Reverse proxy port
- "443" # Reverse proxy port (secure)
proxy2:
image: vimagick/tinyproxy
image: yandex/clickhouse-s3-proxy
ports:
- "4082:8888"
- "8080"
- "80"
- "443"
# Empty container to run proxy resolver.
# Empty container to run proxy resolver.
resolver:
build:
context: ../../../docker/test/integration/
dockerfile: resolver/Dockerfile
network: host
image: yandex/clickhouse-python-bottle
ports:
- "4083:8080"
- "8080"
tty: true
depends_on:
- proxy1

View File

@ -0,0 +1,11 @@
# docker build -t yandex/clickhouse-s3-proxy .
FROM nginx:alpine
COPY run.sh /run.sh
COPY server.crt /etc/ssl/certs/server.crt
COPY server.key /etc/ssl/certs/server.key
COPY nginx.conf /etc/nginx/nginx.conf
RUN chmod +x /run.sh
CMD ["/run.sh"]

View File

@ -0,0 +1,59 @@
events {
use epoll;
worker_connections 128;
}
http {
# Docker DNS resolver
resolver 127.0.0.11;
map $http_x_forwarded_proto $redirect_scheme {
default $scheme;
https https;
}
# Redirect proxy
server {
listen 8080;
server_name proxy1 proxy2;
# To allow special characters in headers
ignore_invalid_headers off;
return 307 $redirect_scheme://${S3_HOST}:${S3_PORT}$request_uri;
}
# Reverse proxy
server {
listen 80;
listen 443 ssl;
server_name proxy1 proxy2;
ssl_certificate /etc/ssl/certs/server.crt;
ssl_certificate_key /etc/ssl/certs/server.key;
# To allow special characters in headers
ignore_invalid_headers off;
# Allow any size file to be uploaded.
# Set to a value such as 1000m to restrict uploads to a specific size
client_max_body_size 0;
# To disable buffering
proxy_buffering off;
location / {
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
proxy_set_header Host $http_host;
proxy_connect_timeout 300;
# Default is HTTP/1, keepalive is only enabled in HTTP/1.1
proxy_http_version 1.1;
proxy_set_header Connection "";
chunked_transfer_encoding off;
proxy_pass $scheme://${S3_HOST}:${S3_PORT};
proxy_ssl_verify off;
}
}
}

View File

@ -0,0 +1,15 @@
#!/usr/bin/env sh
if [ -z "$S3_HOST" ] ; then
S3_HOST='minio1'
fi
if [ -z "$S3_PORT" ] ; then
S3_PORT='9001'
fi
# Replace config placeholders with environment variables
sed -i "s|\${S3_HOST}|${S3_HOST}|" /etc/nginx/nginx.conf
sed -i "s|\${S3_PORT}|${S3_PORT}|" /etc/nginx/nginx.conf
exec nginx -g 'daemon off;'

View File

@ -0,0 +1,19 @@
-----BEGIN CERTIFICATE-----
MIIDBTCCAe2gAwIBAgIRANb2pr4HgR8YFwKNJMUSWiIwDQYJKoZIhvcNAQELBQAw
EjEQMA4GA1UEChMHQWNtZSBDbzAeFw0yMDA3MDkxODE1MDBaFw0yMTA3MDkxODE1
MDBaMBIxEDAOBgNVBAoTB0FjbWUgQ28wggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAw
ggEKAoIBAQC9ORgaBCx42ejp9PSjc0uvwH/hTB6yZvZB4S+wxbzzfeKomX/JBcFH
mGCIJJVjVV0rafv3vw+9f9u4wrZpN4HZKnVyz3mBXEA1WDvLTLV8n8zVyso1qbnf
F9Fa8wnk89b0xGWyM7jie7/cTIGMrgm7hIPaM2zDzFwIfIAqZ1AexC4vADIffF9r
cFLLjNHuv1uAc32jdfQEPluvmBMzGkz254+MabxZWIZjkYn70kNSZDoyFmMGafBt
kRTUPNq2+fGv/eLJ9Lxm3153Ja0sCyzLlEo9+/z4ERqM5zwWre4vcwfO63c5pcSC
zGw84teTpmDwSyiSR70TYJdtBGQqZvLZAgMBAAGjVjBUMA4GA1UdDwEB/wQEAwIC
pDATBgNVHSUEDDAKBggrBgEFBQcDATAPBgNVHRMBAf8EBTADAQH/MBwGA1UdEQQV
MBOCBm1pbmlvMYIJbG9jYWxob3N0MA0GCSqGSIb3DQEBCwUAA4IBAQAKU2LhvFFz
RFfUibt/WTj3rtUfKEBrQuUOYt2A8MTbC8pyEu+UJASTzunluUFze5zchEm1s3pZ
YRLcNwbJqLE6CzUxQ9b2iUhaeWuKrx4ZoPkY0uGiaXM/iKfVKTuNmhF2Sf/P4xUE
Pt19yQjpIhcicWQc37BBQFvnvy+n5wgHa/pgl1+QUvAa/fwYhF9S28xRLESzZepm
NMYysopV+YMaxcFa9SH44toXtXnvRWwVdEorlq1W3/AiJg8hDPzSa9UXLMjA968J
ONtn3qvwac9Ot53+QsXJdsMmDZLWGCi6I1w0ZQetpr/0ubaA1F3GdK9eB/S0thqU
l2VUgn3c/kKS
-----END CERTIFICATE-----

View File

@ -0,0 +1,28 @@
-----BEGIN PRIVATE KEY-----
MIIEvgIBADANBgkqhkiG9w0BAQEFAASCBKgwggSkAgEAAoIBAQC9ORgaBCx42ejp
9PSjc0uvwH/hTB6yZvZB4S+wxbzzfeKomX/JBcFHmGCIJJVjVV0rafv3vw+9f9u4
wrZpN4HZKnVyz3mBXEA1WDvLTLV8n8zVyso1qbnfF9Fa8wnk89b0xGWyM7jie7/c
TIGMrgm7hIPaM2zDzFwIfIAqZ1AexC4vADIffF9rcFLLjNHuv1uAc32jdfQEPluv
mBMzGkz254+MabxZWIZjkYn70kNSZDoyFmMGafBtkRTUPNq2+fGv/eLJ9Lxm3153
Ja0sCyzLlEo9+/z4ERqM5zwWre4vcwfO63c5pcSCzGw84teTpmDwSyiSR70TYJdt
BGQqZvLZAgMBAAECggEANe8oJ4I5CtlRwh3H/S7Hy/iaeqUvuroORwjghwpVqTGg
gV3/RlUVmkqceTG0QvP58n3rC9qxqdnfzvHw/FyN7lBj2a25fF3HD21u3aunrzX9
NJLwwAr4p9YqHjpX/6JhCrNQKVMEx8luDmTgKDETJRfIXVF7FvQQ53pVLcD03U+g
MgN61HBzfT5L0TLHoiKNQbVi+Wm1gw3zvb/a9Z1rULRZfIuKGM0bNNqRZt4rUUAV
QicklDR0Qv59jhr5Y/zjinKkqF8qudvUkaNT2JH1DLfXiAhuC0OQugMjYzNntQB4
hMhkqARnjuk/WPMvnXivnqx9o69BL5wyXIj3vD4fgQKBgQDVKaXAZJ5bo3VfcpLm
cyjtUuOzAxLU1bVGI0Hm1ARqeGVxSTypZLSX8xFi2n5Bvbgh/Y60aEac/1uKoXA9
gej1MT4hKpXyagrARx97E8zk5nf88kVxkiKUrifMjP2lDzHIYhdKk9R3SiV6gWvA
FoJtjBwFhJ6uWUPyry4nqFSENQKBgQDjP9k6CTZF0EnDqbADiQr7VKpebqhtLWRD
U0bQh/l57VrWqGksVOlivIJChP49q1H+hQ1YgfKIEDag8JJnf/inUSpVsw1ljAjv
knqNzn0Gdd9lTsiNGgqlCjhmWedkh4eO8uau479TwQc6gB4PQdLAFynQtt8Kk45P
GxdpRx4AlQKBgQCgxUGbYwhBC37aF1sObqrenBbajCXm2qxXEv6Ab0ZJWzb/g4I6
LJc8x3pEeZCiWsoG8Otxy/f+L2bGn049Rb8DNzmp4Cmp5SrorHvk4yE1P1IeOEgC
CXsFcnjYATrJBDXC8aCpgefMdOLhi71N6mxC3VrBGq5nxzHFVzTTelUMRQKBgQDa
yekhiCb5liy+tcuhy7qH+Z7BpjaATrh+XVoLgS5+5jeT/basmN/OUQH0e0iwJRaf
Poh30zynJT0DPDsobLwAkxN4SRg30Vf1GAjoKIqUwr2fMvfBafYfqbRdTmeKkTXB
OjlA3kKhp3GHMDxAojX+/Q4kRTx+WUwk+0dR88d99QKBgEiYrkSLjKXUFllDmVyp
HtlYKZiq5c33DA06SA2uVOprCdTbnbvP4WrgUsLGvqBcaPEd06fGGbvJWwUdnkXM
HNAkqSeUe5ueovidtoPdF+aPyxdGg3Z8551xOoHZFYrvgdZ4YMPcJrwQQsvWCcYP
GDnSoD8Xjd2LmekTpDBt5ZVz
-----END PRIVATE KEY-----

View File

@ -17,6 +17,7 @@ RUN apt-get update \
libc6-dbg \
moreutils \
ncdu \
numactl \
p7zip-full \
parallel \
psmisc \

View File

@ -36,6 +36,9 @@ Action required for every item -- these are errors that must be fixed. The error
#### Slow on client
Action required for every item -- these are errors that must be fixed. This table shows queries that take significantly longer to process on the client than on the server. A possible reason might be sending too much data to the client, e.g., a forgotten `format Null`.
#### Short queries not marked as short
Action required for every item -- these are errors that must be fixed. This table shows queries that are "short" but not explicitly marked as such. "Short" queries are too fast to compare meaningfully, because any change is drowned out by noise. We consider all queries that run faster than 0.02 s to be "short", and only check their performance if they become slower than this threshold. This automatic mode is probably not what you want, so increase the query run time to between 0.1 and 1 s so that performance can be compared. You do want the "short" mode for queries that complete "immediately", such as some varieties of `select count(*)`. Mark them as "short" explicitly by writing `<query short="1">...`. The value of the "short" attribute is evaluated as a Python expression after parameter substitution, so you can write something like `<query short="{column1} = {column2}">select count(*) from table where {column1} > {column2}</query>` to mark only a particular combination of parameters as short.
#### Partial queries
Action required for the cells marked in red. Shows the queries we are unable to run on an old server -- probably because they contain a new function. You should see this table when you add a new function and a performance test for it. Check that the run time and variance are acceptable (run time between 0.1 and 1 seconds, variance below 10%). If not, they will be highlighted in red.
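For illustration, here is a minimal sketch (in the spirit of the perf.py changes later in this diff) of how a substituted "short" expression could be evaluated; the parameter name and values are hypothetical:
# Minimal sketch: the short="..." value is parameter-substituted, then
# evaluated as a Python expression (the "scale" parameter is hypothetical).
short_attr = "{scale} < 100"
substituted = short_attr.format(scale="10")   # -> "10 < 100"
is_short = bool(eval(substituted))            # -> True, the query is treated as short
print(is_short)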

View File

@ -282,6 +282,7 @@ do
sed -n "s/^report-threshold\t/$test_name\t/p" < "$test_file" >> "analyze/report-thresholds.tsv"
sed -n "s/^skipped\t/$test_name\t/p" < "$test_file" >> "analyze/skipped-tests.tsv"
sed -n "s/^display-name\t/$test_name\t/p" < "$test_file" >> "analyze/query-display-names.tsv"
sed -n "s/^short\t/$test_name\t/p" < "$test_file" >> "analyze/marked-short-queries.tsv"
sed -n "s/^partial\t/$test_name\t/p" < "$test_file" >> "analyze/partial-queries.tsv"
done
unset IFS
@ -291,6 +292,9 @@ clickhouse-local --query "
create view query_runs as select * from file('analyze/query-runs.tsv', TSV,
'test text, query_index int, query_id text, version UInt8, time float');
-- Separately process 'partial' queries which we could only run on the new server
-- because they use new functions. We can't make normal stats for them, but still
-- have to show some stats so that the PR author can tweak them.
create view partial_queries as select test, query_index
from file('analyze/partial-queries.tsv', TSV,
'test text, query_index int, servers Array(int)');
@ -303,6 +307,7 @@ create table partial_query_times engine File(TSVWithNamesAndTypes,
group by test, query_index
;
-- Process queries that were run normally, on both servers.
create view left_query_log as select *
from file('left-query-log.tsv', TSVWithNamesAndTypes,
'$(cat "left-query-log.tsv.columns")');
@ -312,12 +317,17 @@ create view right_query_log as select *
'$(cat "right-query-log.tsv.columns")');
create view query_logs as
select *, 0 version from left_query_log
select 0 version, query_id, ProfileEvents.Names, ProfileEvents.Values,
query_duration_ms from left_query_log
union all
select *, 1 version from right_query_log
select 1 version, query_id, ProfileEvents.Names, ProfileEvents.Values,
query_duration_ms from right_query_log
;
create table query_run_metrics_full engine File(TSV, 'analyze/query-run-metrics-full.tsv')
-- This is a single source of truth on all metrics we have for query runs. The
-- metrics include ProfileEvents from system.query_log, and query run times
-- reported by the perf.py test runner.
create table query_run_metric_arrays engine File(TSV, 'analyze/query-run-metric-arrays.tsv')
as
with (
-- sumMapState with the list of all keys with '-0.' values. Negative zero is because
@ -349,18 +359,29 @@ create table query_run_metrics_full engine File(TSV, 'analyze/query-run-metrics-
where (test, query_index) not in partial_queries
;
create table query_run_metrics engine File(
-- This is just for convenience -- human-readable + easy to make plots.
create table query_run_metrics_denorm engine File(TSV, 'analyze/query-run-metrics-denorm.tsv')
as select test, query_index, metric_names, version, query_id, metric_values
from query_run_metric_arrays
array join metric_names, metric_values
order by test, query_index, metric_names, version, query_id
;
-- This is for statistical processing with eqmed.sql
create table query_run_metrics_for_stats engine File(
TSV, -- do not add header -- will parse with grep
'analyze/query-run-metrics.tsv')
'analyze/query-run-metrics-for-stats.tsv')
as select test, query_index, 0 run, version, metric_values
from query_run_metrics_full
from query_run_metric_arrays
order by test, query_index, run, version
;
-- This is the list of metric names, so that we can join them back after
-- statistical processing.
create table query_run_metric_names engine File(TSV, 'analyze/query-run-metric-names.tsv')
as select metric_names from query_run_metrics_full limit 1
as select metric_names from query_run_metric_arrays limit 1
;
"
" 2> >(tee -a analyze/errors.log 1>&2)
# This is a lateral join in bash... please forgive me.
# We don't have arrayPermute(), so I have to make random permutations with
@ -370,16 +391,16 @@ create table query_run_metric_names engine File(TSV, 'analyze/query-run-metric-n
# for each file. I do this in parallel using GNU parallel.
( set +x # do not bloat the log
IFS=$'\n'
for prefix in $(cut -f1,2 "analyze/query-run-metrics.tsv" | sort | uniq)
for prefix in $(cut -f1,2 "analyze/query-run-metrics-for-stats.tsv" | sort | uniq)
do
file="analyze/tmp/$(echo "$prefix" | sed 's/\t/_/g').tsv"
grep "^$prefix " "analyze/query-run-metrics.tsv" > "$file" &
grep "^$prefix " "analyze/query-run-metrics-for-stats.tsv" > "$file" &
printf "%s\0\n" \
"clickhouse-local \
--file \"$file\" \
--structure 'test text, query text, run int, version UInt8, metrics Array(float)' \
--query \"$(cat "$script_dir/eqmed.sql")\" \
>> \"analyze/query-reports.tsv\"" \
>> \"analyze/query-metric-stats.tsv\"" \
2>> analyze/errors.log \
>> analyze/commands.txt
done
@ -388,6 +409,33 @@ unset IFS
)
parallel --joblog analyze/parallel-log.txt --null < analyze/commands.txt 2>> analyze/errors.log
clickhouse-local --query "
-- Join the metric names back to the metric statistics we've calculated, and make
-- a denormalized table of them -- statistics for all metrics for all queries.
-- The WITH, ARRAY JOIN and CROSS JOIN do not like each other:
-- https://github.com/ClickHouse/ClickHouse/issues/11868
-- https://github.com/ClickHouse/ClickHouse/issues/11757
-- Because of this, we make a view with arrays first, and then apply all the
-- array joins.
create view query_metric_stat_arrays as
with (select * from file('analyze/query-run-metric-names.tsv',
TSV, 'n Array(String)')) as metric_name
select test, query_index, metric_name, left, right, diff, stat_threshold
from file('analyze/query-metric-stats.tsv', TSV, 'left Array(float),
right Array(float), diff Array(float), stat_threshold Array(float),
test text, query_index int') reports
order by test, query_index, metric_name
;
create table query_metric_stats_denorm engine File(TSVWithNamesAndTypes,
'analyze/query-metric-stats-denorm.tsv')
as select test, query_index, metric_name, left, right, diff, stat_threshold
from query_metric_stat_arrays
left array join metric_name, left, right, diff, stat_threshold
order by test, query_index, metric_name
;
" 2> >(tee -a analyze/errors.log 1>&2)
}
# Analyze results
@ -403,58 +451,46 @@ build_log_column_definitions
cat analyze/errors.log >> report/errors.log ||:
cat profile-errors.log >> report/errors.log ||:
short_query_threshold="0.02"
clickhouse-local --query "
create view query_display_names as select * from
file('analyze/query-display-names.tsv', TSV,
'test text, query_index int, query_display_name text')
;
create view partial_query_times as select * from
file('analyze/partial-query-times.tsv', TSVWithNamesAndTypes,
'test text, query_index int, time_stddev float, time_median float')
;
-- Report for partial queries that we could only run on the new server (e.g.
-- queries with new functions added in the tested PR).
create table partial_queries_report engine File(TSV, 'report/partial-queries-report.tsv')
as select floor(time_median, 3) m, floor(time_stddev / time_median, 3) v,
as select floor(time_median, 3) time,
floor(time_stddev / time_median, 3) relative_time_stddev,
test, query_index, query_display_name
from file('analyze/partial-query-times.tsv', TSVWithNamesAndTypes,
'test text, query_index int, time_stddev float, time_median float') t
from partial_query_times
join query_display_names using (test, query_index)
order by test, query_index
;
-- WITH, ARRAY JOIN and CROSS JOIN do not like each other:
-- https://github.com/ClickHouse/ClickHouse/issues/11868
-- https://github.com/ClickHouse/ClickHouse/issues/11757
-- Because of this, we make a view with arrays first, and then apply all the
-- array joins.
create view query_metric_stat_arrays as
with (select * from file('analyze/query-run-metric-names.tsv',
TSV, 'n Array(String)')) as metric_name
select metric_name, left, right, diff, stat_threshold, test, query_index,
query_display_name
from file ('analyze/query-reports.tsv', TSV, 'left Array(float),
right Array(float), diff Array(float), stat_threshold Array(float),
test text, query_index int') reports
left join query_display_names
on reports.test = query_display_names.test
and reports.query_index = query_display_names.query_index
;
create table query_metric_stats engine File(TSVWithNamesAndTypes,
'report/query-metric-stats.tsv')
as
select metric_name, left, right, diff, stat_threshold, test, query_index,
query_display_name
from query_metric_stat_arrays
left array join metric_name, left, right, diff, stat_threshold
create view query_metric_stats as
select * from file('analyze/query-metric-stats-denorm.tsv',
TSVWithNamesAndTypes,
'test text, query_index int, metric_name text, left float, right float,
diff float, stat_threshold float')
;
-- Main statistics for queries -- query time as reported in query log.
create table queries engine File(TSVWithNamesAndTypes, 'report/queries.tsv')
as select
-- FIXME Comparison mode doesn't make sense for queries that complete
-- immediately (on the same order of time as noise). We compute average
-- run time between old and new version, and if it is below a threshold,
-- we just skip the query. If there is a significant regression, the
-- average will be above threshold, we'll process it normally and will
-- detect the regression.
(left + right) / 2 < 0.02 as short,
-- Comparison mode doesn't make sense for queries that complete
-- immediately (on the same order of time as noise). If the query duration is
-- less than some threshold, we just skip it. If there is a significant
-- regression in such a query, the time will exceed the threshold, and we
-- will process it normally and detect the regression.
right < $short_query_threshold as short,
not short and abs(diff) > report_threshold and abs(diff) > stat_threshold as changed_fail,
not short and abs(diff) > report_threshold - 0.05 and abs(diff) > stat_threshold as changed_show,
@ -464,68 +500,33 @@ create table queries engine File(TSVWithNamesAndTypes, 'report/queries.tsv')
left, right, diff, stat_threshold,
if(report_threshold > 0, report_threshold, 0.10) as report_threshold,
test, query_index, query_display_name
query_metric_stats.test test, query_metric_stats.query_index query_index,
query_display_name
from query_metric_stats
left join file('analyze/report-thresholds.tsv', TSV,
'test text, report_threshold float') thresholds
on query_metric_stats.test = thresholds.test
left join query_display_names
on query_metric_stats.test = query_display_names.test
and query_metric_stats.query_index = query_display_names.query_index
where metric_name = 'server_time'
order by test, query_index, metric_name
;
-- keep the table in old format so that we can analyze new and old data together
create table queries_old_format engine File(TSVWithNamesAndTypes, 'queries.rep')
as select short, changed_fail, unstable_fail, left, right, diff,
stat_threshold, test, query_display_name query
from queries
;
-- save all test runs as JSON for the new comparison page
create table all_query_runs_json engine File(JSON, 'report/all-query-runs.json') as
select test, query_index, query_display_name query,
left, right, diff, stat_threshold, report_threshold,
versions_runs[1] runs_left, versions_runs[2] runs_right
from (
select
test, query_index,
groupArrayInsertAt(runs, version) versions_runs
from (
select
test, query_index, version,
groupArray(metrics[1]) runs
from file('analyze/query-run-metrics.tsv', TSV,
'test text, query_index int, run int, version UInt8, metrics Array(float)')
group by test, query_index, version
)
group by test, query_index
) runs
left join query_display_names
on runs.test = query_display_names.test
and runs.query_index = query_display_names.query_index
left join file('analyze/report-thresholds.tsv',
TSV, 'test text, report_threshold float') thresholds
on runs.test = thresholds.test
left join query_metric_stats
on runs.test = query_metric_stats.test
and runs.query_index = query_metric_stats.query_index
where
query_metric_stats.metric_name = 'server_time'
;
create table changed_perf_tsv engine File(TSV, 'report/changed-perf.tsv') as
select left, right, diff, stat_threshold, changed_fail, test, query_index, query_display_name
create table changed_perf_report engine File(TSV, 'report/changed-perf.tsv') as
select
left, right,
left > right
? '- ' || toString(floor(left / right, 3)) || 'x'
: '+ ' || toString(floor(right / left, 3)) || 'x',
diff, stat_threshold, changed_fail, test, query_index, query_display_name
from queries where changed_show order by abs(diff) desc;
create table unstable_queries_tsv engine File(TSV, 'report/unstable-queries.tsv') as
create table unstable_queries_report engine File(TSV, 'report/unstable-queries.tsv') as
select left, right, diff, stat_threshold, unstable_fail, test, query_index, query_display_name
from queries where unstable_show order by stat_threshold desc;
create table queries_for_flamegraph engine File(TSVWithNamesAndTypes,
'report/queries-for-flamegraph.tsv') as
select test, query_index from queries where unstable_show or changed_show
;
create table test_time_changes_tsv engine File(TSV, 'report/test-time-changes.tsv') as
create table test_time_changes engine File(TSV, 'report/test-time-changes.tsv') as
select test, queries, average_time_change from (
select test, count(*) queries,
sum(left) as left, sum(right) as right,
@ -536,22 +537,22 @@ create table test_time_changes_tsv engine File(TSV, 'report/test-time-changes.ts
)
;
create table unstable_tests_tsv engine File(TSV, 'report/unstable-tests.tsv') as
create table unstable_tests engine File(TSV, 'report/unstable-tests.tsv') as
select test, sum(unstable_show) total_unstable, sum(changed_show) total_changed
from queries
group by test
order by total_unstable + total_changed desc
;
create table test_perf_changes_tsv engine File(TSV, 'report/test-perf-changes.tsv') as
create table test_perf_changes_report engine File(TSV, 'report/test-perf-changes.tsv') as
select test,
queries,
coalesce(total_unstable, 0) total_unstable,
coalesce(total_changed, 0) total_changed,
total_unstable + total_changed total_bad,
coalesce(toString(floor(average_time_change, 3)), '??') average_time_change_str
from test_time_changes_tsv
full join unstable_tests_tsv
from test_time_changes
full join unstable_tests
using test
where (abs(average_time_change) > 0.05 and queries > 5)
or (total_bad > 0)
@ -559,28 +560,28 @@ create table test_perf_changes_tsv engine File(TSV, 'report/test-perf-changes.ts
settings join_use_nulls = 1
;
create table query_time engine Memory as select *
create view total_client_time_per_query as select *
from file('analyze/client-times.tsv', TSV,
'test text, query_index int, client float, server float');
create table wall_clock engine Memory as select *
from file('wall-clock-times.tsv', TSV, 'test text, real float, user float, system float');
create table slow_on_client_tsv engine File(TSV, 'report/slow-on-client.tsv') as
create table slow_on_client_report engine File(TSV, 'report/slow-on-client.tsv') as
select client, server, floor(client/server, 3) p, test, query_display_name
from query_time left join query_display_names using (test, query_index)
from total_client_time_per_query left join query_display_names using (test, query_index)
where p > 1.02 order by p desc;
create table wall_clock_time_per_test engine Memory as select *
from file('wall-clock-times.tsv', TSV, 'test text, real float, user float, system float');
create table test_time engine Memory as
select test, sum(client) total_client_time,
maxIf(client, not short) query_max,
minIf(client, not short) query_min,
count(*) queries, sum(short) short_queries
from query_time full join queries using (test, query_index)
from total_client_time_per_query full join queries using (test, query_index)
group by test;
create table test_times_tsv engine File(TSV, 'report/test-times.tsv') as
select wall_clock.test, real,
create table test_times_report engine File(TSV, 'report/test-times.tsv') as
select wall_clock_time_per_test.test, real,
floor(total_client_time, 3),
queries,
short_queries,
@ -590,23 +591,64 @@ create table test_times_tsv engine File(TSV, 'report/test-times.tsv') as
from test_time
-- wall clock times are also measured for skipped tests, so don't
-- do full join
left join wall_clock using test
left join wall_clock_time_per_test using test
order by avg_real_per_query desc;
-- report for all queries page, only main metric
create table all_tests_tsv engine File(TSV, 'report/all-queries.tsv') as
create table all_tests_report engine File(TSV, 'report/all-queries.tsv') as
select changed_fail, unstable_fail,
left, right, diff,
floor(left > right ? left / right : right / left, 3),
stat_threshold, test, query_index, query_display_name
left, right,
left > right
? '- ' || toString(floor(left / right, 3)) || 'x'
: '+ ' || toString(floor(right / left, 3)) || 'x',
diff, stat_threshold, test, query_index, query_display_name
from queries order by test, query_index;
-- queries for which we will build flamegraphs (see below)
create table queries_for_flamegraph engine File(TSVWithNamesAndTypes,
'report/queries-for-flamegraph.tsv') as
select test, query_index from queries where unstable_show or changed_show
;
-- List of queries that have 'short' duration, but are not marked as 'short' by
-- the test author (we report them).
create table unmarked_short_queries_report
engine File(TSV, 'report/unmarked-short-queries.tsv')
as select time, test, query_index, query_display_name
from (
select right time, test, query_index from queries where short
union all
select time_median, test, query_index from partial_query_times
where time_median < $short_query_threshold
) times
left join query_display_names
on times.test = query_display_names.test
and times.query_index = query_display_names.query_index
where (test, query_index) not in
(select * from file('analyze/marked-short-queries.tsv', TSV,
'test text, query_index int'))
order by test, query_index
;
--------------------------------------------------------------------------------
-- various compatibility data formats follow, not related to the main report
-- keep the table in old format so that we can analyze new and old data together
create table queries_old_format engine File(TSVWithNamesAndTypes, 'queries.rep')
as select short, changed_fail, unstable_fail, left, right, diff,
stat_threshold, test, query_display_name query
from queries
;
-- new report for all queries with all metrics (no page yet)
create table all_query_metrics_tsv engine File(TSV, 'report/all-query-metrics.tsv') as
select metric_name, left, right, diff,
floor(left > right ? left / right : right / left, 3),
stat_threshold, test, query_index, query_display_name
from query_metric_stats
left join query_display_names
on query_metric_stats.test = query_display_names.test
and query_metric_stats.query_index = query_display_names.query_index
order by test, query_index;
" 2> >(tee -a report/errors.log 1>&2)
@ -634,7 +676,8 @@ create view query_display_names as select * from
create table unstable_query_runs engine File(TSVWithNamesAndTypes,
'unstable-query-runs.$version.rep') as
select test, query_index, query_display_name, query_id
select query_runs.test test, query_runs.query_index query_index,
query_display_name, query_id
from query_runs
join queries_for_flamegraph on
query_runs.test = queries_for_flamegraph.test

View File

@ -6,7 +6,6 @@ trap 'kill $(jobs -pr) ||:' EXIT
mkdir db0 ||:
mkdir left ||:
mkdir right ||:
left_pr=$1
left_sha=$2
@ -24,7 +23,7 @@ dataset_paths["values"]="https://clickhouse-datasets.s3.yandex.net/values_with_e
function download
{
# Historically there were various path for the performance test package.
# Historically there were various paths for the performance test package.
# Test all of them.
for path in "https://clickhouse-builds.s3.yandex.net/$left_pr/$left_sha/"{,clickhouse_build_check/}"performance/performance.tgz"
do
@ -34,22 +33,13 @@ function download
fi
done
for path in "https://clickhouse-builds.s3.yandex.net/$right_pr/$right_sha/"{,clickhouse_build_check/}"performance/performance.tgz"
do
if curl --fail --head "$path"
then
right_path="$path"
fi
done
# might have the same version on left and right
if ! [ "$left_path" = "$right_path" ]
# Might have the same version on left and right (for testing).
if ! [ "$left_sha" = "$right_sha" ]
then
wget -nv -nd -c "$left_path" -O- | tar -C left --strip-components=1 -zxv &
wget -nv -nd -c "$right_path" -O- | tar -C right --strip-components=1 -zxv &
else
mkdir right ||:
wget -nv -nd -c "$left_path" -O- | tar -C left --strip-components=1 -zxv && cp -a left/* right &
mkdir left ||:
cp -a right/* left &
fi
for dataset_name in $datasets

View File

@ -1,38 +1,25 @@
#!/bin/bash
set -ex
chown nobody workspace output
chgrp nogroup workspace output
chmod 777 workspace output
cd workspace
# Fetch the repository to find and describe the compared revisions.
rm -rf ch ||:
time git clone --depth 50 --bare https://github.com/ClickHouse/ClickHouse ch
git -C ch fetch origin "$SHA_TO_TEST"
# Use the packaged repository to find the revision we will compare to.
function find_reference_sha
{
# If not master, try to fetch pull/.../{head,merge}
if [ "$PR_TO_TEST" != "0" ]
then
git -C ch fetch origin "refs/pull/$PR_TO_TEST/*:refs/heads/pull/$PR_TO_TEST/*"
fi
git -C right/ch log -1 origin/master
git -C right/ch log -1 pr
# Go back from the revision to be tested, trying to find the closest published
# testing release.
start_ref="$SHA_TO_TEST"~
# If we are testing a PR, and it merges with master successfully, we are
# building and testing not the nominal last SHA specified by pull/.../head
# and SHA_TO_TEST, but a revision that is merged with recent master, given
# by pull/.../merge ref.
# Master is the first parent of the pull/.../merge.
if git -C ch rev-parse "pull/$PR_TO_TEST/merge"
# testing release. The PR branch may be either pull/*/head which is the
# author's branch, or pull/*/merge, which is head merged with some master
# automatically by Github. We will use a merge base with master as a reference
# for testing (or some older commit). A caveat is that if we're testing the
# master, the merge base is the tested commit itself, so we have to step back
# once.
start_ref=$(git -C right/ch merge-base origin/master pr)
if [ "PR_TO_TEST" == "0" ]
then
start_ref="pull/$PR_TO_TEST/merge~"
start_ref=$start_ref~
fi
# Loop back to find a commit that actually has a published perf test package.
while :
do
# FIXME the original idea was to compare to a closest testing tag, which
@ -46,12 +33,12 @@ function find_reference_sha
echo Reference tag is "$ref_tag"
# We use annotated tags which have their own shas, so we have to further
# dereference the tag to get the commit it points to, hence the '~0' thing.
REF_SHA=$(git -C ch rev-parse "$ref_tag~0")
REF_SHA=$(git -C right/ch rev-parse "$ref_tag~0")
# FIXME sometimes we have testing tags on commits without published builds --
# normally these are documentation commits. Loop to skip them.
# Historically there were various path for the performance test package.
# Test all of them.
# FIXME sometimes we have testing tags on commits without published builds.
# Normally these are documentation commits. Loop to skip them.
# Historically there were various paths for the performance test package,
# test all of them.
unset found
for path in "https://clickhouse-builds.s3.yandex.net/0/$REF_SHA/"{,clickhouse_build_check/}"performance/performance.tgz"
do
@ -69,6 +56,24 @@ function find_reference_sha
REF_PR=0
}
chown nobody workspace output
chgrp nogroup workspace output
chmod 777 workspace output
cd workspace
# Download the package for the version we are going to test
for path in "https://clickhouse-builds.s3.yandex.net/$PR_TO_TEST/$SHA_TO_TEST/"{,clickhouse_build_check/}"performance/performance.tgz"
do
if curl --fail --head "$path"
then
right_path="$path"
fi
done
mkdir right
wget -nv -nd -c "$right_path" -O- | tar -C right --strip-components=1 -zxv
# Find reference revision if not specified explicitly
if [ "$REF_SHA" == "" ]; then find_reference_sha; fi
if [ "$REF_SHA" == "" ]; then echo Reference SHA is not specified ; exit 1 ; fi
@ -76,17 +81,14 @@ if [ "$REF_PR" == "" ]; then echo Reference PR is not specified ; exit 1 ; fi
# Show what we're testing
(
git -C ch log -1 --decorate "$REF_SHA" ||:
git -C right/ch log -1 --decorate "$REF_SHA" ||:
) | tee left-commit.txt
(
git -C ch log -1 --decorate "$SHA_TO_TEST" ||:
if git -C ch rev-parse "pull/$PR_TO_TEST/merge" &> /dev/null
then
echo
echo Real tested commit is:
git -C ch log -1 --decorate "pull/$PR_TO_TEST/merge"
fi
git -C right/ch log -1 --decorate "$SHA_TO_TEST" ||:
echo
echo Real tested commit is:
git -C right/ch log -1 --decorate "pr"
) | tee right-commit.txt
if [ "$PR_TO_TEST" != "0" ]
@ -94,8 +96,8 @@ then
# If the PR only changes the tests and nothing else, prepare a list of these
# tests for use by compare.sh. Compare to merge base, because master might be
# far in the future and have unrelated test changes.
base=$(git -C ch merge-base "$SHA_TO_TEST" master)
git -C ch diff --name-only "$base" "$SHA_TO_TEST" | tee changed-tests.txt
base=$(git -C right/ch merge-base pr origin/master)
git -C right/ch diff --name-only "$base" pr | tee changed-tests.txt
if grep -vq '^tests/performance' changed-tests.txt
then
# Have some other changes besides the tests, so truncate the test list,

View File

@ -37,21 +37,44 @@ available_parameters = {} # { 'table': ['hits_10m', 'hits_100m'], ... }
for e in subst_elems:
available_parameters[e.find('name').text] = [v.text for v in e.findall('values/value')]
# Take care to keep the order of queries -- sometimes we have DROP IF EXISTS
# Takes parallel lists of templates, substitutes them with all combos of
# parameters. The set of parameters is determined based on the first list.
# Note: keep the order of queries -- sometimes we have DROP IF EXISTS
# followed by CREATE in create queries section, so the order matters.
def substitute_parameters(query_templates):
result = []
for q in query_templates:
def substitute_parameters(query_templates, other_templates = []):
query_results = []
other_results = [[] for _ in other_templates]
for i, q in enumerate(query_templates):
keys = set(n for _, n, _, _ in string.Formatter().parse(q) if n)
values = [available_parameters[k] for k in keys]
result.extend([
q.format(**dict(zip(keys, values_combo)))
for values_combo in itertools.product(*values)])
return result
combos = itertools.product(*values)
for c in combos:
with_keys = dict(zip(keys, c))
query_results.append(q.format(**with_keys))
for j, t in enumerate(other_templates):
other_results[j].append(t[i].format(**with_keys))
if len(other_templates):
return query_results, other_results
else:
return query_results
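To make the expansion concrete, here is a minimal self-contained sketch of the same substitution idea; the "table" parameter and its values are hypothetical, not taken from a real test:
import itertools
import string

# Hypothetical parameter set; in the test runner this comes from the
# substitution elements of the test XML (subst_elems above).
available_parameters = {'table': ['hits_10m', 'hits_100m']}

def expand(template):
    # Same idea as substitute_parameters(): find the {keys} used by the
    # template and emit one query per combination of their values.
    keys = set(n for _, n, _, _ in string.Formatter().parse(template) if n)
    values = [available_parameters[k] for k in keys]
    return [template.format(**dict(zip(keys, combo)))
            for combo in itertools.product(*values)]

print(expand('select count(*) from {table}'))
# ['select count(*) from hits_10m', 'select count(*) from hits_100m']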
# Build a list of test queries, substituting parameters to query templates,
# and reporting the queries marked as short.
test_queries = []
for e in root.findall('query'):
new_queries = []
if 'short' in e.attrib:
new_queries, [is_short] = substitute_parameters([e.text], [[e.attrib['short']]])
for i, s in enumerate(is_short):
# Don't print this if we only need to print the queries.
if eval(s) and not args.print_queries:
print(f'short\t{i + len(test_queries)}')
else:
new_queries = substitute_parameters([e.text])
test_queries += new_queries
# Build a list of test queries, processing all substitutions
test_query_templates = [q.text for q in root.findall('query')]
test_queries = substitute_parameters(test_query_templates)
# If we're only asked to print the queries, do that and exit
if args.print_queries:
@ -166,7 +189,7 @@ for conn_index, c in enumerate(connections):
c.execute(q)
print(f'fill\t{conn_index}\t{c.last_query.elapsed}\t{tsv_escape(q)}')
# Run test queries
# Run test queries.
for query_index, q in enumerate(test_queries):
query_prefix = f'{test_name}.query{query_index}'

View File

@ -63,7 +63,48 @@ p.links a {{ padding: 5px; margin: 3px; background: #FFF; line-height: 2; white-
color: inherit;
text-decoration: none;
}}
tr:nth-child(odd) td {{filter: brightness(95%);}}
.all-query-times tr td:nth-child(1),
.all-query-times tr td:nth-child(2),
.all-query-times tr td:nth-child(3),
.all-query-times tr td:nth-child(4),
.all-query-times tr td:nth-child(5),
.all-query-times tr td:nth-child(7),
.changes-in-performance tr td:nth-child(1),
.changes-in-performance tr td:nth-child(2),
.changes-in-performance tr td:nth-child(3),
.changes-in-performance tr td:nth-child(4),
.changes-in-performance tr td:nth-child(5),
.changes-in-performance tr td:nth-child(7),
.unstable-queries tr td:nth-child(1),
.unstable-queries tr td:nth-child(2),
.unstable-queries tr td:nth-child(3),
.unstable-queries tr td:nth-child(4),
.unstable-queries tr td:nth-child(6),
.test-performance-changes tr td:nth-child(2),
.test-performance-changes tr td:nth-child(3),
.test-performance-changes tr td:nth-child(4),
.test-performance-changes tr td:nth-child(5),
.test-performance-changes tr td:nth-child(6),
.test-times tr td:nth-child(2),
.test-times tr td:nth-child(3),
.test-times tr td:nth-child(4),
.test-times tr td:nth-child(5),
.test-times tr td:nth-child(6),
.test-times tr td:nth-child(7),
.test-times tr td:nth-child(8),
.concurrent-benchmarks tr td:nth-child(2),
.concurrent-benchmarks tr td:nth-child(3),
.concurrent-benchmarks tr td:nth-child(4),
.concurrent-benchmarks tr td:nth-child(5),
.metric-changes tr td:nth-child(2),
.metric-changes tr td:nth-child(3),
.metric-changes tr td:nth-child(4),
.metric-changes tr td:nth-child(5)
{{ text-align: right; }}
</style>
<title>Clickhouse performance comparison</title>
</head>
@ -111,11 +152,14 @@ def tableHeader(r):
return tr(''.join([th(f) for f in r]))
def tableStart(title):
return """
<h2 id="{anchor}"><a class="cancela" href="#{anchor}">{title}</a></h2>
<table>""".format(
anchor = nextTableAnchor(),
title = title)
anchor = nextTableAnchor();
cls = '-'.join(title.lower().split(' ')[:3]);
return f"""
<h2 id="{anchor}">
<a class="cancela" href="#{anchor}">{title}</a>
</h2>
<table class="{cls}">
"""
def tableEnd():
return '</table>'
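As a quick illustration of the class naming this change introduces (the titles below are assumed to match the selectors added in the CSS block above), the first three words of a table title become its CSS class:
# Sketch of the class derivation used in tableStart() above.
def title_to_class(title):
    return '-'.join(title.lower().split(' ')[:3])

print(title_to_class('Changes in performance'))  # -> 'changes-in-performance'
print(title_to_class('Unstable queries'))        # -> 'unstable-queries'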
@ -196,6 +240,12 @@ if args.report == 'main':
['Client time,&nbsp;s', 'Server time,&nbsp;s', 'Ratio', 'Test', 'Query'],
slow_on_client_rows)
unmarked_short_rows = tsvRows('report/unmarked-short-queries.tsv')
error_tests += len(unmarked_short_rows)
printSimpleTable('Short queries not marked as short',
['New client time, s', 'Test', '#', 'Query'],
unmarked_short_rows)
def print_partial():
rows = tsvRows('report/partial-queries-report.tsv')
if not rows:
@ -232,12 +282,13 @@ if args.report == 'main':
columns = [
'Old,&nbsp;s', # 0
'New,&nbsp;s', # 1
'Relative difference (new&nbsp;&minus;&nbsp;old) / old', # 2
'p&nbsp;<&nbsp;0.001 threshold', # 3
# Failed # 4
'Test', # 5
'#', # 6
'Query', # 7
'Times speedup / slowdown', # 2
'Relative difference (new&nbsp;&minus;&nbsp;old) / old', # 3
'p&nbsp;<&nbsp;0.001 threshold', # 4
# Failed # 5
'Test', # 6
'#', # 7
'Query', # 8
]
print(tableHeader(columns))
@ -245,15 +296,15 @@ if args.report == 'main':
attrs = ['' for c in columns]
attrs[4] = None
for row in rows:
if int(row[4]):
if float(row[2]) < 0.:
if int(row[5]):
if float(row[3]) < 0.:
faster_queries += 1
attrs[2] = f'style="background: {color_good}"'
attrs[2] = attrs[3] = f'style="background: {color_good}"'
else:
slower_queries += 1
attrs[2] = f'style="background: {color_bad}"'
attrs[2] = attrs[3] = f'style="background: {color_bad}"'
else:
attrs[2] = ''
attrs[2] = attrs[3] = ''
print(tableRow(row, attrs))
@ -275,7 +326,7 @@ if args.report == 'main':
'Old,&nbsp;s', #0
'New,&nbsp;s', #1
'Relative difference (new&nbsp;-&nbsp;old)/old', #2
'p&nbsp;<&nbsp;0.001 threshold', #3
'p&nbsp;&lt;&nbsp;0.001 threshold', #3
# Failed #4
'Test', #5
'#', #6
@ -492,9 +543,9 @@ elif args.report == 'all-queries':
# Unstable #1
'Old,&nbsp;s', #2
'New,&nbsp;s', #3
'Relative difference (new&nbsp;&minus;&nbsp;old) / old', #4
'Times speedup / slowdown', #5
'p&nbsp;<&nbsp;0.001 threshold', #6
'Times speedup / slowdown', #4
'Relative difference (new&nbsp;&minus;&nbsp;old) / old', #5
'p&nbsp;&lt;&nbsp;0.001 threshold', #6
'Test', #7
'#', #8
'Query', #9
@ -513,12 +564,12 @@ elif args.report == 'all-queries':
attrs[6] = ''
if int(r[0]):
if float(r[4]) > 0.:
attrs[4] = f'style="background: {color_bad}"'
if float(r[5]) > 0.:
attrs[4] = attrs[5] = f'style="background: {color_bad}"'
else:
attrs[4] = f'style="background: {color_good}"'
attrs[4] = attrs[5] = f'style="background: {color_good}"'
else:
attrs[4] = ''
attrs[4] = attrs[5] = ''
if (float(r[2]) + float(r[3])) / 2 > allowed_single_run_time:
attrs[2] = f'style="background: {color_bad}"'


@ -55,18 +55,21 @@ ln -s /usr/share/clickhouse-test/config/ints_dictionary.xml /etc/clickhouse-serv
ln -s /usr/share/clickhouse-test/config/strings_dictionary.xml /etc/clickhouse-server/dict_examples/; \
ln -s /usr/share/clickhouse-test/config/decimals_dictionary.xml /etc/clickhouse-server/dict_examples/;
ln -s /usr/share/clickhouse-test/config/zookeeper.xml /etc/clickhouse-server/config.d/; \
ln -s /usr/share/clickhouse-test/config/listen.xml /etc/clickhouse-server/config.d/; \
ln -s /usr/share/clickhouse-test/config/part_log.xml /etc/clickhouse-server/config.d/; \
ln -s /usr/share/clickhouse-test/config/text_log.xml /etc/clickhouse-server/config.d/; \
ln -s /usr/share/clickhouse-test/config/metric_log.xml /etc/clickhouse-server/config.d/; \
ln -s /usr/share/clickhouse-test/config/query_masking_rules.xml /etc/clickhouse-server/config.d/; \
ln -s /usr/share/clickhouse-test/config/log_queries.xml /etc/clickhouse-server/users.d/; \
ln -s /usr/share/clickhouse-test/config/readonly.xml /etc/clickhouse-server/users.d/; \
ln -s /usr/share/clickhouse-test/config/ints_dictionary.xml /etc/clickhouse-server/; \
ln -s /usr/share/clickhouse-test/config/strings_dictionary.xml /etc/clickhouse-server/; \
ln -s /usr/share/clickhouse-test/config/decimals_dictionary.xml /etc/clickhouse-server/; \
ln -s /usr/share/clickhouse-test/config/macros.xml /etc/clickhouse-server/config.d/;
ln -s /usr/share/clickhouse-test/config/zookeeper.xml /etc/clickhouse-server/config.d/
ln -s /usr/share/clickhouse-test/config/listen.xml /etc/clickhouse-server/config.d/
ln -s /usr/share/clickhouse-test/config/part_log.xml /etc/clickhouse-server/config.d/
ln -s /usr/share/clickhouse-test/config/text_log.xml /etc/clickhouse-server/config.d/
ln -s /usr/share/clickhouse-test/config/metric_log.xml /etc/clickhouse-server/config.d/
ln -s /usr/share/clickhouse-test/config/log_queries.xml /etc/clickhouse-server/users.d/
ln -s /usr/share/clickhouse-test/config/readonly.xml /etc/clickhouse-server/users.d/
ln -s /usr/share/clickhouse-test/config/ints_dictionary.xml /etc/clickhouse-server/
ln -s /usr/share/clickhouse-test/config/strings_dictionary.xml /etc/clickhouse-server/
ln -s /usr/share/clickhouse-test/config/decimals_dictionary.xml /etc/clickhouse-server/
ln -s /usr/share/clickhouse-test/config/macros.xml /etc/clickhouse-server/config.d/
# Retain any pre-existing config and allow ClickHouse to load those if required
ln -s --backup=simple --suffix=_original.xml \
/usr/share/clickhouse-test/config/query_masking_rules.xml /etc/clickhouse-server/config.d/
service zookeeper start


@ -17,7 +17,6 @@ ln -s /usr/share/clickhouse-test/config/listen.xml /etc/clickhouse-server/config
ln -s /usr/share/clickhouse-test/config/part_log.xml /etc/clickhouse-server/config.d/
ln -s /usr/share/clickhouse-test/config/text_log.xml /etc/clickhouse-server/config.d/
ln -s /usr/share/clickhouse-test/config/metric_log.xml /etc/clickhouse-server/config.d/
ln -s /usr/share/clickhouse-test/config/query_masking_rules.xml /etc/clickhouse-server/config.d/
ln -s /usr/share/clickhouse-test/config/log_queries.xml /etc/clickhouse-server/users.d/
ln -s /usr/share/clickhouse-test/config/readonly.xml /etc/clickhouse-server/users.d/
ln -s /usr/share/clickhouse-test/config/access_management.xml /etc/clickhouse-server/users.d/
@ -33,6 +32,10 @@ ln -s /usr/share/clickhouse-test/config/server.key /etc/clickhouse-server/
ln -s /usr/share/clickhouse-test/config/server.crt /etc/clickhouse-server/
ln -s /usr/share/clickhouse-test/config/dhparam.pem /etc/clickhouse-server/
# Retain any pre-existing config and allow ClickHouse to load it if required
ln -s --backup=simple --suffix=_original.xml \
/usr/share/clickhouse-test/config/query_masking_rules.xml /etc/clickhouse-server/config.d/
if [[ -n "$USE_POLYMORPHIC_PARTS" ]] && [[ "$USE_POLYMORPHIC_PARTS" -eq 1 ]]; then
ln -s /usr/share/clickhouse-test/config/polymorphic_parts.xml /etc/clickhouse-server/config.d/
fi


@ -46,27 +46,30 @@ ln -s /usr/share/clickhouse-test/config/ints_dictionary.xml /etc/clickhouse-serv
ln -s /usr/share/clickhouse-test/config/strings_dictionary.xml /etc/clickhouse-server/dict_examples/; \
ln -s /usr/share/clickhouse-test/config/decimals_dictionary.xml /etc/clickhouse-server/dict_examples/;
ln -s /usr/share/clickhouse-test/config/zookeeper.xml /etc/clickhouse-server/config.d/; \
ln -s /usr/share/clickhouse-test/config/listen.xml /etc/clickhouse-server/config.d/; \
ln -s /usr/share/clickhouse-test/config/part_log.xml /etc/clickhouse-server/config.d/; \
ln -s /usr/share/clickhouse-test/config/text_log.xml /etc/clickhouse-server/config.d/; \
ln -s /usr/share/clickhouse-test/config/metric_log.xml /etc/clickhouse-server/config.d/; \
ln -s /usr/share/clickhouse-test/config/query_masking_rules.xml /etc/clickhouse-server/config.d/; \
ln -s /usr/share/clickhouse-test/config/log_queries.xml /etc/clickhouse-server/users.d/; \
ln -s /usr/share/clickhouse-test/config/readonly.xml /etc/clickhouse-server/users.d/; \
ln -s /usr/share/clickhouse-test/config/access_management.xml /etc/clickhouse-server/users.d/; \
ln -s /usr/share/clickhouse-test/config/ints_dictionary.xml /etc/clickhouse-server/; \
ln -s /usr/share/clickhouse-test/config/strings_dictionary.xml /etc/clickhouse-server/; \
ln -s /usr/share/clickhouse-test/config/decimals_dictionary.xml /etc/clickhouse-server/; \
ln -s /usr/share/clickhouse-test/config/macros.xml /etc/clickhouse-server/config.d/; \
ln -s /usr/share/clickhouse-test/config/disks.xml /etc/clickhouse-server/config.d/; \
ln -s /usr/share/clickhouse-test/config/secure_ports.xml /etc/clickhouse-server/config.d/; \
ln -s /usr/share/clickhouse-test/config/clusters.xml /etc/clickhouse-server/config.d/; \
ln -s /usr/share/clickhouse-test/config/graphite.xml /etc/clickhouse-server/config.d/; \
ln -s /usr/share/clickhouse-test/config/server.key /etc/clickhouse-server/; \
ln -s /usr/share/clickhouse-test/config/server.crt /etc/clickhouse-server/; \
ln -s /usr/share/clickhouse-test/config/dhparam.pem /etc/clickhouse-server/; \
ln -sf /usr/share/clickhouse-test/config/client_config.xml /etc/clickhouse-client/config.xml
ln -s /usr/share/clickhouse-test/config/zookeeper.xml /etc/clickhouse-server/config.d/
ln -s /usr/share/clickhouse-test/config/listen.xml /etc/clickhouse-server/config.d/
ln -s /usr/share/clickhouse-test/config/part_log.xml /etc/clickhouse-server/config.d/
ln -s /usr/share/clickhouse-test/config/text_log.xml /etc/clickhouse-server/config.d/
ln -s /usr/share/clickhouse-test/config/metric_log.xml /etc/clickhouse-server/config.d/
ln -s /usr/share/clickhouse-test/config/log_queries.xml /etc/clickhouse-server/users.d/
ln -s /usr/share/clickhouse-test/config/readonly.xml /etc/clickhouse-server/users.d/
ln -s /usr/share/clickhouse-test/config/access_management.xml /etc/clickhouse-server/users.d/
ln -s /usr/share/clickhouse-test/config/ints_dictionary.xml /etc/clickhouse-server/
ln -s /usr/share/clickhouse-test/config/strings_dictionary.xml /etc/clickhouse-server/
ln -s /usr/share/clickhouse-test/config/decimals_dictionary.xml /etc/clickhouse-server/
ln -s /usr/share/clickhouse-test/config/macros.xml /etc/clickhouse-server/config.d/
ln -s /usr/share/clickhouse-test/config/disks.xml /etc/clickhouse-server/config.d/
ln -s /usr/share/clickhouse-test/config/secure_ports.xml /etc/clickhouse-server/config.d/
ln -s /usr/share/clickhouse-test/config/clusters.xml /etc/clickhouse-server/config.d/
ln -s /usr/share/clickhouse-test/config/graphite.xml /etc/clickhouse-server/config.d/
ln -s /usr/share/clickhouse-test/config/server.key /etc/clickhouse-server/
ln -s /usr/share/clickhouse-test/config/server.crt /etc/clickhouse-server/
ln -s /usr/share/clickhouse-test/config/dhparam.pem /etc/clickhouse-server/
ln -sf /usr/share/clickhouse-test/config/client_config.xml /etc/clickhouse-client/config.xml
# Retain any pre-existing config and allow ClickHouse to load it if required
ln -s --backup=simple --suffix=_original.xml \
/usr/share/clickhouse-test/config/query_masking_rules.xml /etc/clickhouse-server/config.d/
service zookeeper start
sleep 5


@ -23,28 +23,7 @@ RUN apt-get update -y \
brotli
COPY ./stress /stress
COPY run.sh /
ENV DATASETS="hits visits"
CMD dpkg -i package_folder/clickhouse-common-static_*.deb; \
dpkg -i package_folder/clickhouse-common-static-dbg_*.deb; \
dpkg -i package_folder/clickhouse-server_*.deb; \
dpkg -i package_folder/clickhouse-client_*.deb; \
dpkg -i package_folder/clickhouse-test_*.deb; \
ln -s /usr/share/clickhouse-test/config/log_queries.xml /etc/clickhouse-server/users.d/; \
ln -s /usr/share/clickhouse-test/config/part_log.xml /etc/clickhouse-server/config.d/; \
echo "TSAN_OPTIONS='halt_on_error=1 history_size=7 ignore_noninstrumented_modules=1 verbosity=1'" >> /etc/environment; \
echo "UBSAN_OPTIONS='print_stacktrace=1'" >> /etc/environment; \
echo "ASAN_OPTIONS='malloc_context_size=10 verbosity=1 allocator_release_to_os_interval_ms=10000'" >> /etc/environment; \
service clickhouse-server start && sleep 5 \
&& /s3downloader --dataset-names $DATASETS \
&& chmod 777 -R /var/lib/clickhouse \
&& clickhouse-client --query "ATTACH DATABASE IF NOT EXISTS datasets ENGINE = Ordinary" \
&& clickhouse-client --query "CREATE DATABASE IF NOT EXISTS test" \
&& service clickhouse-server restart && sleep 5 \
&& clickhouse-client --query "SHOW TABLES FROM datasets" \
&& clickhouse-client --query "SHOW TABLES FROM test" \
&& clickhouse-client --query "RENAME TABLE datasets.hits_v1 TO test.hits" \
&& clickhouse-client --query "RENAME TABLE datasets.visits_v1 TO test.visits" \
&& clickhouse-client --query "SHOW TABLES FROM test" \
&& ./stress --output-folder test_output --skip-func-tests "$SKIP_TESTS_OPTION"
CMD ["/bin/bash", "/run.sh"]

docker/test/stress/run.sh (new executable file, 56 lines)

@ -0,0 +1,56 @@
#!/bin/bash
set -x
dpkg -i package_folder/clickhouse-common-static_*.deb
dpkg -i package_folder/clickhouse-common-static-dbg_*.deb
dpkg -i package_folder/clickhouse-server_*.deb
dpkg -i package_folder/clickhouse-client_*.deb
dpkg -i package_folder/clickhouse-test_*.deb
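# Wait until clickhouse-server answers a trivial query, giving up after about a minute (120 polls, 0.5 s apart).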
function wait_server()
{
counter=0
until clickhouse-client --query "SELECT 1"
do
if [ "$counter" -gt 120 ]
then
break
fi
sleep 0.5
counter=$(($counter + 1))
done
}
ln -s /usr/share/clickhouse-test/config/log_queries.xml /etc/clickhouse-server/users.d/
ln -s /usr/share/clickhouse-test/config/part_log.xml /etc/clickhouse-server/config.d/
echo "TSAN_OPTIONS='halt_on_error=1 history_size=7 ignore_noninstrumented_modules=1 verbosity=1'" >> /etc/environment
echo "UBSAN_OPTIONS='print_stacktrace=1'" >> /etc/environment
echo "ASAN_OPTIONS='malloc_context_size=10 verbosity=1 allocator_release_to_os_interval_ms=10000'" >> /etc/environment
service clickhouse-server start
wait_server
/s3downloader --dataset-names $DATASETS
chmod 777 -R /var/lib/clickhouse
clickhouse-client --query "ATTACH DATABASE IF NOT EXISTS datasets ENGINE = Ordinary"
clickhouse-client --query "CREATE DATABASE IF NOT EXISTS test"
service clickhouse-server restart
wait_server
clickhouse-client --query "SHOW TABLES FROM datasets"
clickhouse-client --query "SHOW TABLES FROM test"
clickhouse-client --query "RENAME TABLE datasets.hits_v1 TO test.hits"
clickhouse-client --query "RENAME TABLE datasets.visits_v1 TO test.visits"
clickhouse-client --query "SHOW TABLES FROM test"
./stress --output-folder test_output --skip-func-tests "$SKIP_TESTS_OPTION"
service clickhouse-server restart
wait_server
clickhouse-client --query "SELECT 'Server successfuly started'" > /test_output/alive_check.txt || echo 'Server failed to start' > /test_output/alive_check.txt


@ -41,15 +41,6 @@ def run_func_test(cmd, output_prefix, num_processes, skip_tests_option):
return pipes
def check_clickhouse_alive(cmd):
try:
logging.info("Checking ClickHouse still alive")
check_call("{} --query \"select 'Still alive'\"".format(cmd), shell=True)
return True
except:
return False
if __name__ == "__main__":
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
parser = argparse.ArgumentParser(description="ClickHouse script for running stresstest")
@ -65,29 +56,18 @@ if __name__ == "__main__":
args = parser.parse_args()
func_pipes = []
perf_process = None
try:
perf_process = run_perf_test(args.perf_test_cmd, args.perf_test_xml_path, args.output_folder)
func_pipes = run_func_test(args.test_cmd, args.output_folder, args.num_parallel, args.skip_func_tests)
perf_process = run_perf_test(args.perf_test_cmd, args.perf_test_xml_path, args.output_folder)
func_pipes = run_func_test(args.test_cmd, args.output_folder, args.num_parallel, args.skip_func_tests)
logging.info("Will wait functests to finish")
while True:
retcodes = []
for p in func_pipes:
if p.poll() is not None:
retcodes.append(p.returncode)
if len(retcodes) == len(func_pipes):
break
logging.info("Finished %s from %s processes", len(retcodes), len(func_pipes))
time.sleep(5)
logging.info("Will wait functests to finish")
while True:
retcodes = []
for p in func_pipes:
if p.poll() is not None:
retcodes.append(p.returncode)
if len(retcodes) == len(func_pipes):
break
logging.info("Finished %s from %s processes", len(retcodes), len(func_pipes))
time.sleep(5)
if not check_clickhouse_alive(args.client_cmd):
raise Exception("Stress failed, results in logs")
else:
logging.info("Stress is ok")
except Exception as ex:
raise ex
finally:
if os.path.exists(args.server_log_folder):
logging.info("Copying server log files")
for log_file in os.listdir(args.server_log_folder):
shutil.copy(os.path.join(args.server_log_folder, log_file), os.path.join(args.output_folder, log_file))
logging.info("Stress test finished")


@ -35,7 +35,7 @@ RUN apt-get update \
ENV TZ=Europe/Moscow
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
RUN pip3 install urllib3 testflows==1.6.24 docker-compose docker dicttoxml kazoo tzlocal
RUN pip3 install urllib3 testflows==1.6.39 docker-compose docker dicttoxml kazoo tzlocal
ENV DOCKER_CHANNEL stable
ENV DOCKER_VERSION 17.09.1-ce


@ -1,25 +0,0 @@
{
"checkYo": false,
"excludeFiles": [],
"fileExtensions": [],
"format": "auto",
"ignoreTags": [
"code",
"kbd",
"object",
"samp",
"script",
"style",
"var"
],
"maxRequests": 2,
"lang": "en,ru",
"report": ["console"],
"dictionary": [
"(C|c)lick(H|h)ouse",
"CatBoost",
"(Ш|ш)ард(ы|ов|а|у|е|ам|ирование|ированы|ах)?",
"логир(ование|уются|ования)?",
"конфиг(а|е|ом|у)"
]
}


@ -0,0 +1,24 @@
# Statement name (for example, SHOW USER)
Brief description of what the statement does.
Syntax:
```sql
Syntax of the statement.
```
## Other necessary sections of the description (Optional)
Examples of descriptions with a complicated structure:
- https://clickhouse.tech/docs/en/sql-reference/statements/grant/
- https://clickhouse.tech/docs/en/sql-reference/statements/revoke/
- https://clickhouse.tech/docs/en/sql-reference/statements/select/join/
## See Also (Optional)
Links to related topics as a list.
- [link](#)


@ -4,15 +4,14 @@ toc_priority: 70
toc_title: Introduction
---
# ClickHouse Commercial Services
# ClickHouse Commercial Services {#clickhouse-commercial-services}
This section is a directory of commercial service providers specializing in ClickHouse. They are independent companies not necessarily affiliated with Yandex.
Service categories:
- [Cloud](cloud.md)
- [Support](support.md)
- [Cloud](../commercial/cloud.md)
- [Support](../commercial/support.md)
!!! note "For service providers"
If you happen to represent one of them, feel free to open a pull request adding your company to the respective section (or even adding a new section if the service doesn't fit into existing categories). The easiest way to open a pull-request for documentation page is by using a “pencil” edit button in the top-right corner. If your service available in some local market, make sure to mention it in a localized documentation page as well (or at least point it out in a pull-request description).
If you happen to represent one of them, feel free to open a pull request adding your company to the respective section (or even adding a new section if the service doesn’t fit into existing categories). The easiest way to open a pull request for a documentation page is by using the “pencil” edit button in the top-right corner. If your service is available in some local market, make sure to mention it in a localized documentation page as well (or at least point it out in a pull-request description).


@ -5,7 +5,7 @@ toc_title: Architecture Overview
# Overview of ClickHouse Architecture {#overview-of-clickhouse-architecture}
ClickHouse is a true column-oriented DBMS. Data is stored by columns, and during the execution of arrays (vectors or chunks of columns). Whenever possible, operations are dispatched on arrays, rather than on individual values. It is called "vectorized query execution" and it helps lower the cost of actual data processing.
ClickHouse is a true column-oriented DBMS. Data is stored by columns, and during the execution of arrays (vectors or chunks of columns). Whenever possible, operations are dispatched on arrays, rather than on individual values. It is called “vectorized query execution” and it helps lower the cost of actual data processing.
> This idea is nothing new. It dates back to the `APL` (A programming language, 1957) and its descendants: `A +` (APL dialect), `J` (1990), `K` (1993), and `Q` (programming language from Kx Systems, 2003). Array programming is used in scientific data processing. Neither is this idea something new in relational databases: for example, it is used in the `VectorWise` system (also known as Actian Vector Analytic Database by Actian Corporation).
@ -21,11 +21,11 @@ Various `IColumn` implementations (`ColumnUInt8`, `ColumnString`, and so on) are
Nevertheless, it is possible to work with individual values as well. To represent an individual value, the `Field` is used. `Field` is just a discriminated union of `UInt64`, `Int64`, `Float64`, `String` and `Array`. `IColumn` has the `operator []` method to get the n-th value as a `Field`, and the `insert` method to append a `Field` to the end of a column. These methods are not very efficient, because they require dealing with temporary `Field` objects representing an individual value. There are more efficient methods, such as `insertFrom`, `insertRangeFrom`, and so on.
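A toy model may make that trade-off concrete. The sketch below is plain Python, not the real C++ interfaces; the class and method names only mirror the ones described in this section.

``` python
# Toy model of the IColumn / Field relationship (illustration only, not ClickHouse code).
class ColumnUInt64:
    def __init__(self, data=None):
        self.data = list(data or [])

    def __getitem__(self, n):                 # like IColumn::operator[]: returns a generic "Field"
        return ('UInt64', self.data[n])

    def insert(self, field):                  # like IColumn::insert(const Field &)
        _kind, value = field
        self.data.append(int(value))

    def insert_range_from(self, src, start, length):   # like insertRangeFrom: no temporary Fields
        self.data.extend(src.data[start:start + length])

src = ColumnUInt64(range(10))
dst = ColumnUInt64()
dst.insert(src[3])                            # slow path: one temporary Field per value
dst.insert_range_from(src, 0, 5)              # fast path: bulk copy of a range
print(dst.data)                               # [3, 0, 1, 2, 3, 4]
```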
`Field` doesn't have enough information about a specific data type for a table. For example, `UInt8`, `UInt16`, `UInt32`, and `UInt64` are all represented as `UInt64` in a `Field`.
`Field` doesn’t have enough information about a specific data type for a table. For example, `UInt8`, `UInt16`, `UInt32`, and `UInt64` are all represented as `UInt64` in a `Field`.
## Leaky Abstractions {#leaky-abstractions}
`IColumn` has methods for common relational transformations of data, but they don’t meet all needs. For example, `ColumnUInt64` doesn't have a method to calculate the sum of two columns, and `ColumnString` doesn't have a method to run a substring search. These countless routines are implemented outside of `IColumn`.
`IColumn` has methods for common relational transformations of data, but they don’t meet all needs. For example, `ColumnUInt64` doesn’t have a method to calculate the sum of two columns, and `ColumnString` doesn’t have a method to run a substring search. These countless routines are implemented outside of `IColumn`.
Various functions on columns can be implemented in a generic, non-efficient way using `IColumn` methods to extract `Field` values, or in a specialized way using knowledge of inner memory layout of data in a specific `IColumn` implementation. It is implemented by casting functions to a specific `IColumn` type and deal with internal representation directly. For example, `ColumnUInt64` has the `getData` method that returns a reference to an internal array, then a separate routine reads or fills that array directly. We have “leaky abstractions” to allow efficient specializations of various routines.
@ -35,7 +35,7 @@ Various functions on columns can be implemented in a generic, non-efficient way
`IDataType` and `IColumn` are only loosely related to each other. Different data types can be represented in memory by the same `IColumn` implementations. For example, `DataTypeUInt32` and `DataTypeDateTime` are both represented by `ColumnUInt32` or `ColumnConstUInt32`. In addition, the same data type can be represented by different `IColumn` implementations. For example, `DataTypeUInt8` can be represented by `ColumnUInt8` or `ColumnConstUInt8`.
`IDataType` only stores metadata. For instance, `DataTypeUInt8` doesn't store anything at all (except virtual pointer `vptr`) and `DataTypeFixedString` stores just `N` (the size of fixed-size strings).
`IDataType` only stores metadata. For instance, `DataTypeUInt8` doesn’t store anything at all (except virtual pointer `vptr`) and `DataTypeFixedString` stores just `N` (the size of fixed-size strings).
`IDataType` has helper methods for various data formats. Examples are methods to serialize a value with possible quoting, to serialize a value for JSON, and to serialize a value as part of the XML format. There is no direct correspondence to data formats. For example, the different data formats `Pretty` and `TabSeparated` can use the same `serializeTextEscaped` helper method from the `IDataType` interface.
@ -120,9 +120,9 @@ There are ordinary functions and aggregate functions. For aggregate functions, s
Ordinary functions don’t change the number of rows – they work as if they are processing each row independently. In fact, functions are not called for individual rows, but for `Block`s of data to implement vectorized query execution.
There are some miscellaneous functions, like [blockSize](../sql-reference/functions/other-functions.md#function-blocksize), [rowNumberInBlock](../sql-reference/functions/other-functions.md#function-rownumberinblock), and [runningAccumulate](../sql-reference/functions/other-functions.md#runningaccumulate), that exploit block processing and violate the independence of rows.
There are some miscellaneous functions, like [blockSize](../sql-reference/functions/other-functions.md#function-blocksize), [rowNumberInBlock](../sql-reference/functions/other-functions.md#function-rownumberinblock), and \[runningAccumulate\](../sql-reference/functions/other-functions.md\#runningaccumulate), that exploit block processing and violate the independence of rows.
ClickHouse has strong typing, so there’s no implicit type conversion. If a function doesn't support a specific combination of types, it throws an exception. But functions can work (be overloaded) for many different combinations of types. For example, the `plus` function (to implement the `+` operator) works for any combination of numeric types: `UInt8` + `Float32`, `UInt16` + `Int8`, and so on. Also, some variadic functions can accept any number of arguments, such as the `concat` function.
ClickHouse has strong typing, so there’s no implicit type conversion. If a function doesn’t support a specific combination of types, it throws an exception. But functions can work (be overloaded) for many different combinations of types. For example, the `plus` function (to implement the `+` operator) works for any combination of numeric types: `UInt8` + `Float32`, `UInt16` + `Int8`, and so on. Also, some variadic functions can accept any number of arguments, such as the `concat` function.
Implementing a function may be slightly inconvenient because a function explicitly dispatches supported data types and supported `IColumns`. For example, the `plus` function has code generated by instantiation of a C++ template for each combination of numeric types, and constant or non-constant left and right arguments.
@ -169,13 +169,13 @@ There is no global query plan for distributed query execution. Each node has its
`MergeTree` is a family of storage engines that supports indexing by primary key. The primary key can be an arbitrary tuple of columns or expressions. Data in a `MergeTree` table is stored in “parts”. Each part stores data in the primary key order, so data is ordered lexicographically by the primary key tuple. All the table columns are stored in separate `column.bin` files in these parts. The files consist of compressed blocks. Each block is usually from 64 KB to 1 MB of uncompressed data, depending on the average value size. The blocks consist of column values placed contiguously one after the other. Column values are in the same order for each column (the primary key defines the order), so when you iterate by many columns, you get values for the corresponding rows.
The primary key itself is “sparse”. It doesn't address every single row, but only some ranges of data. A separate `primary.idx` file has the value of the primary key for each N-th row, where N is called `index_granularity` (usually, N = 8192). Also, for each column, we have `column.mrk` files with “marks,” which are offsets to each N-th row in the data file. Each mark is a pair: the offset in the file to the beginning of the compressed block, and the offset in the decompressed block to the beginning of data. Usually, compressed blocks are aligned by marks, and the offset in the decompressed block is zero. Data for `primary.idx` always resides in memory, and data for `column.mrk` files is cached.
The primary key itself is “sparse”. It doesn’t address every single row, but only some ranges of data. A separate `primary.idx` file has the value of the primary key for each N-th row, where N is called `index_granularity` (usually, N = 8192). Also, for each column, we have `column.mrk` files with “marks,” which are offsets to each N-th row in the data file. Each mark is a pair: the offset in the file to the beginning of the compressed block, and the offset in the decompressed block to the beginning of data. Usually, compressed blocks are aligned by marks, and the offset in the decompressed block is zero. Data for `primary.idx` always resides in memory, and data for `column.mrk` files is cached.
When we are going to read something from a part in `MergeTree`, we look at `primary.idx` data and locate ranges that could contain requested data, then look at `column.mrk` data and calculate offsets for where to start reading those ranges. Because of sparseness, excess data may be read. ClickHouse is not suitable for a high load of simple point queries, because the entire range with `index_granularity` rows must be read for each key, and the entire compressed block must be decompressed for each column. We made the index sparse because we must be able to maintain trillions of rows per single server without noticeable memory consumption for the index. Also, because the primary key is sparse, it is not unique: it cannot check the existence of the key in the table at INSERT time. You could have many rows with the same key in a table.
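A toy model of that lookup (plain Python with hypothetical key values; the real `primary.idx`/`column.mrk` layout is binary and per-column) shows why only a few granules need to be touched for a key range.

``` python
import bisect

index_granularity = 8192                       # N: one index entry per N rows
# Sparse index: the primary-key value of every N-th row (keys are sorted within a part).
sparse_index = [0, 8192, 16384, 24576, 32768]  # hypothetical key values

def granules_for_range(key_lo, key_hi):
    """Granule numbers [first, last) that may contain keys in [key_lo, key_hi]."""
    first = max(bisect.bisect_right(sparse_index, key_lo) - 1, 0)
    last = bisect.bisect_right(sparse_index, key_hi)
    return first, last

first, last = granules_for_range(10000, 20000)
print(f'read granules [{first}, {last}), i.e. rows [{first * index_granularity}, {last * index_granularity})')
# read granules [1, 3), i.e. rows [8192, 24576)
```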
When you `INSERT` a bunch of data into `MergeTree`, that bunch is sorted by primary key order and forms a new part. There are background threads that periodically select some parts and merge them into a single sorted part to keep the number of parts relatively low. That’s why it is called `MergeTree`. Of course, merging leads to “write amplification”. All parts are immutable: they are only created and deleted, but not modified. When SELECT is executed, it holds a snapshot of the table (a set of parts). After merging, we also keep old parts for some time to make a recovery after failure easier, so if we see that some merged part is probably broken, we can replace it with its source parts.
`MergeTree` is not an LSM tree because it doesn't contain “memtable” and “log”: inserted data is written directly to the filesystem. This makes it suitable only to INSERT data in batches, not by individual row and not very frequently – about once per second is ok, but a thousand times a second is not. We did it this way for simplicity’s sake, and because we are already inserting data in batches in our applications.
`MergeTree` is not an LSM tree because it doesn’t contain “memtable” and “log”: inserted data is written directly to the filesystem. This makes it suitable only to INSERT data in batches, not by individual row and not very frequently – about once per second is ok, but a thousand times a second is not. We did it this way for simplicity’s sake, and because we are already inserting data in batches in our applications.
> MergeTree tables can only have one (primary) index: there aren’t any secondary indices. It would be nice to allow multiple physical representations under one logical table, for example, to store data in more than one physical order or even to allow representations with pre-aggregated data along with original data.
@ -187,7 +187,7 @@ Replication in ClickHouse can be configured on a per-table basis. You could have
Replication is implemented in the `ReplicatedMergeTree` storage engine. The path in `ZooKeeper` is specified as a parameter for the storage engine. All tables with the same path in `ZooKeeper` become replicas of each other: they synchronize their data and maintain consistency. Replicas can be added and removed dynamically simply by creating or dropping a table.
Replication uses an asynchronous multi-master scheme. You can insert data into any replica that has a session with `ZooKeeper`, and data is replicated to all other replicas asynchronously. Because ClickHouse doesn't support UPDATEs, replication is conflict-free. As there is no quorum acknowledgment of inserts, just-inserted data might be lost if one node fails.
Replication uses an asynchronous multi-master scheme. You can insert data into any replica that has a session with `ZooKeeper`, and data is replicated to all other replicas asynchronously. Because ClickHouse doesn’t support UPDATEs, replication is conflict-free. As there is no quorum acknowledgment of inserts, just-inserted data might be lost if one node fails.
Metadata for replication is stored in ZooKeeper. There is a replication log that lists what actions to do. Actions are: get part; merge parts; drop a partition, and so on. Each replica copies the replication log to its queue and then executes the actions from the queue. For example, on insertion, the “get the part” action is created in the log, and every replica downloads that part. Merges are coordinated between replicas to get byte-identical results. All parts are merged in the same way on all replicas. It is achieved by electing one replica as the leader, and that replica initiates merges and writes “merge parts” actions to the log.


@ -1,6 +1,6 @@
---
toc_priority: 71
toc_title: Source Code
toc_title: Source Code Browser
---
# Browse ClickHouse Source Code {#browse-clickhouse-source-code}


@ -1,6 +1,6 @@
---
toc_priority: 67
toc_title: How to Build ClickHouse on Linux for AARCH64 (ARM64)
toc_title: Build on Linux for AARCH64 (ARM64)
---
# How to Build ClickHouse on Linux for AARCH64 (ARM64) Architecture {#how-to-build-clickhouse-on-linux-for-aarch64-arm64-architecture}
@ -9,7 +9,7 @@ This is for the case when you have Linux machine and want to use it to build `cl
The cross-build for AARCH64 is based on the [Build instructions](../development/build.md), follow them first.
# Install Clang-8 {#install-clang-8}
## Install Clang-8 {#install-clang-8}
Follow the instructions from https://apt.llvm.org/ for your Ubuntu or Debian setup.
For example, in Ubuntu Bionic you can use the following commands:
@ -20,7 +20,7 @@ sudo apt-get update
sudo apt-get install clang-8
```
# Install Cross-Compilation Toolset {#install-cross-compilation-toolset}
## Install Cross-Compilation Toolset {#install-cross-compilation-toolset}
``` bash
cd ClickHouse
@ -29,7 +29,7 @@ wget 'https://developer.arm.com/-/media/Files/downloads/gnu-a/8.3-2019.03/binrel
tar xJf gcc-arm-8.3-2019.03-x86_64-aarch64-linux-gnu.tar.xz -C build-aarch64/cmake/toolchain/linux-aarch64 --strip-components=1
```
# Build ClickHouse {#build-clickhouse}
## Build ClickHouse {#build-clickhouse}
``` bash
cd ClickHouse


@ -1,6 +1,6 @@
---
toc_priority: 66
toc_title: How to Build ClickHouse on Linux for Mac OS X
toc_title: Build on Linux for Mac OS X
---
# How to Build ClickHouse on Linux for Mac OS X {#how-to-build-clickhouse-on-linux-for-mac-os-x}
@ -9,7 +9,7 @@ This is for the case when you have Linux machine and want to use it to build `cl
The cross-build for Mac OS X is based on the [Build instructions](../development/build.md), follow them first.
# Install Clang-8 {#install-clang-8}
## Install Clang-8 {#install-clang-8}
Follow the instructions from https://apt.llvm.org/ for your Ubuntu or Debian setup.
For example the commands for Bionic are like:
@ -19,7 +19,7 @@ sudo echo "deb [trusted=yes] http://apt.llvm.org/bionic/ llvm-toolchain-bionic-8
sudo apt-get install clang-8
```
# Install Cross-Compilation Toolset {#install-cross-compilation-toolset}
## Install Cross-Compilation Toolset {#install-cross-compilation-toolset}
Let’s remember the path where we install `cctools` as ${CCTOOLS}
@ -47,7 +47,7 @@ mkdir -p build-darwin/cmake/toolchain/darwin-x86_64
tar xJf MacOSX10.14.sdk.tar.xz -C build-darwin/cmake/toolchain/darwin-x86_64 --strip-components=1
```
# Build ClickHouse {#build-clickhouse}
## Build ClickHouse {#build-clickhouse}
``` bash
cd ClickHouse


@ -1,6 +1,6 @@
---
toc_priority: 65
toc_title: How to Build ClickHouse on Mac OS X
toc_title: Build on Mac OS X
---
# How to Build ClickHouse on Mac OS X {#how-to-build-clickhouse-on-mac-os-x}
@ -45,14 +45,12 @@ $ cd ..
## Caveats {#caveats}
If you intend to run clickhouse-server, make sure to increase the system’s maxfiles variable.
If you intend to run `clickhouse-server`, make sure to increase the system’s maxfiles variable.
!!! info "Note"
You’ll need to use sudo.
To do so, create the following file:
/Library/LaunchDaemons/limit.maxfiles.plist:
To do so, create the `/Library/LaunchDaemons/limit.maxfiles.plist` file with the following content:
``` xml
<?xml version="1.0" encoding="UTF-8"?>


@ -1,11 +1,9 @@
---
toc_priority: 64
toc_title: How to Build ClickHouse on Linux
toc_title: Build on Linux
---
# How to Build ClickHouse for Development {#how-to-build-clickhouse-for-development}
The following tutorial is based on the Ubuntu Linux system. With appropriate changes, it should also work on any other Linux distribution.
# How to Build ClickHouse on Linux {#how-to-build-clickhouse-for-development}
Supported platforms:
@ -13,7 +11,11 @@ Supported platforms:
- AArch64
- Power9 (experimental)
## Install Git, CMake, Python and Ninja {#install-git-cmake-python-and-ninja}
## Normal Build for Development on Ubuntu
The following tutorial is based on the Ubuntu Linux system. With appropriate changes, it should also work on any other Linux distribution.
### Install Git, CMake, Python and Ninja {#install-git-cmake-python-and-ninja}
``` bash
$ sudo apt-get install git cmake python ninja-build
@ -21,18 +23,18 @@ $ sudo apt-get install git cmake python ninja-build
Or cmake3 instead of cmake on older systems.
## Install GCC 9 {#install-gcc-9}
### Install GCC 9 {#install-gcc-9}
There are several ways to do this.
### Install from Repository {#install-from-repository}
#### Install from Repository {#install-from-repository}
On Ubuntu 19.10 or newer:
$ sudo apt-get update
$ sudo apt-get install gcc-9 g++-9
### Install from a PPA Package {#install-from-a-ppa-package}
#### Install from a PPA Package {#install-from-a-ppa-package}
On older Ubuntu:
@ -43,18 +45,18 @@ $ sudo apt-get update
$ sudo apt-get install gcc-9 g++-9
```
### Install from Sources {#install-from-sources}
#### Install from Sources {#install-from-sources}
See [utils/ci/build-gcc-from-sources.sh](https://github.com/ClickHouse/ClickHouse/blob/master/utils/ci/build-gcc-from-sources.sh)
## Use GCC 9 for Builds {#use-gcc-9-for-builds}
### Use GCC 9 for Builds {#use-gcc-9-for-builds}
``` bash
$ export CC=gcc-9
$ export CXX=g++-9
```
## Checkout ClickHouse Sources {#checkout-clickhouse-sources}
### Checkout ClickHouse Sources {#checkout-clickhouse-sources}
``` bash
$ git clone --recursive git@github.com:ClickHouse/ClickHouse.git
@ -66,7 +68,7 @@ or
$ git clone --recursive https://github.com/ClickHouse/ClickHouse.git
```
## Build ClickHouse {#build-clickhouse}
### Build ClickHouse {#build-clickhouse}
``` bash
$ cd ClickHouse
@ -79,7 +81,7 @@ $ ninja
To create an executable, run `ninja clickhouse`.
This will create the `programs/clickhouse` executable, which can be used with `client` or `server` arguments.
# How to Build ClickHouse on Any Linux {#how-to-build-clickhouse-on-any-linux}
## How to Build ClickHouse on Any Linux {#how-to-build-clickhouse-on-any-linux}
The build requires the following components:
@ -93,32 +95,58 @@ The build requires the following components:
If all the components are installed, you may build in the same way as the steps above.
Example for Ubuntu Eoan:
sudo apt update
sudo apt install git cmake ninja-build g++ python
git clone --recursive https://github.com/ClickHouse/ClickHouse.git
mkdir build && cd build
cmake ../ClickHouse
ninja
``` bash
sudo apt update
sudo apt install git cmake ninja-build g++ python
git clone --recursive https://github.com/ClickHouse/ClickHouse.git
mkdir build && cd build
cmake ../ClickHouse
ninja
```
Example for OpenSUSE Tumbleweed:
sudo zypper install git cmake ninja gcc-c++ python lld
git clone --recursive https://github.com/ClickHouse/ClickHouse.git
mkdir build && cd build
cmake ../ClickHouse
ninja
``` bash
sudo zypper install git cmake ninja gcc-c++ python lld
git clone --recursive https://github.com/ClickHouse/ClickHouse.git
mkdir build && cd build
cmake ../ClickHouse
ninja
```
Example for Fedora Rawhide:
``` bash
sudo yum update
yum --nogpg install git cmake make gcc-c++ python2
git clone --recursive https://github.com/ClickHouse/ClickHouse.git
mkdir build && cd build
cmake ../ClickHouse
make -j $(nproc)
```
sudo yum update
yum --nogpg install git cmake make gcc-c++ python2
git clone --recursive https://github.com/ClickHouse/ClickHouse.git
mkdir build && cd build
cmake ../ClickHouse
make -j $(nproc)
# You Don’t Have to Build ClickHouse {#you-dont-have-to-build-clickhouse}
## How to Build ClickHouse Debian Package {#how-to-build-clickhouse-debian-package}
### Install Git and Pbuilder {#install-git-and-pbuilder}
``` bash
$ sudo apt-get update
$ sudo apt-get install git python pbuilder debhelper lsb-release fakeroot sudo debian-archive-keyring debian-keyring
```
### Checkout ClickHouse Sources {#checkout-clickhouse-sources-1}
``` bash
$ git clone --recursive --branch master https://github.com/ClickHouse/ClickHouse.git
$ cd ClickHouse
```
### Run Release Script {#run-release-script}
``` bash
$ ./release
```
## You Don’t Have to Build ClickHouse {#you-dont-have-to-build-clickhouse}
ClickHouse is available in pre-built binaries and packages. Binaries are portable and can be run on any Linux flavour.
@ -126,26 +154,4 @@ They are built for stable, prestable and testing releases as long as for every c
To find the freshest build from `master`, go to the [commits page](https://github.com/ClickHouse/ClickHouse/commits/master), click on the first green checkmark or red cross near the commit, and click the “Details” link right after “ClickHouse Build Check”.
# How to Build ClickHouse Debian Package {#how-to-build-clickhouse-debian-package}
## Install Git and Pbuilder {#install-git-and-pbuilder}
``` bash
$ sudo apt-get update
$ sudo apt-get install git python pbuilder debhelper lsb-release fakeroot sudo debian-archive-keyring debian-keyring
```
## Checkout ClickHouse Sources {#checkout-clickhouse-sources-1}
``` bash
$ git clone --recursive --branch master https://github.com/ClickHouse/ClickHouse.git
$ cd ClickHouse
```
## Run Release Script {#run-release-script}
``` bash
$ ./release
```
[Original article](https://clickhouse.tech/docs/en/development/build/) <!--hide-->


@ -35,6 +35,7 @@ toc_title: Third-Party Libraries Used
| poco | [Boost Software License - Version 1.0](https://github.com/ClickHouse-Extras/poco/blob/fe5505e56c27b6ecb0dcbc40c49dc2caf4e9637f/LICENSE) |
| protobuf | [BSD 3-Clause License](https://github.com/ClickHouse-Extras/protobuf/blob/12735370922a35f03999afff478e1c6d7aa917a4/LICENSE) |
| re2 | [BSD 3-Clause License](https://github.com/google/re2/blob/7cf8b88e8f70f97fd4926b56aa87e7f53b2717e0/LICENSE) |
| sentry-native | [MIT License](https://github.com/getsentry/sentry-native/blob/master/LICENSE) |
| UnixODBC | [LGPL v2.1](https://github.com/ClickHouse-Extras/UnixODBC/tree/b0ad30f7f6289c12b76f04bfb9d466374bb32168) |
| zlib-ng | [Zlib License](https://github.com/ClickHouse-Extras/zlib-ng/blob/develop/LICENSE.md) |
| zstd | [BSD 3-Clause License](https://github.com/facebook/zstd/blob/dev/LICENSE) |


@ -1,6 +1,6 @@
---
toc_priority: 68
toc_title: How to Write C++ Code
toc_title: C++ Guide
---
# How to Write C++ Code {#how-to-write-c-code}


@ -1,6 +1,6 @@
---
toc_priority: 69
toc_title: How to Run ClickHouse Tests
toc_title: Testing
---
# ClickHouse Testing {#clickhouse-testing}
@ -25,12 +25,7 @@ Tests should use (create, drop, etc) only tables in `test` database that is assu
If you want to use distributed queries in functional tests, you can leverage `remote` table function with `127.0.0.{1..2}` addresses for the server to query itself; or you can use predefined test clusters in server configuration file like `test_shard_localhost`.
Some tests are marked with `zookeeper`, `shard` or `long` in their names.
`zookeeper` is for tests that are using ZooKeeper. `shard` is for tests that
requires server to listen `127.0.0.*`; `distributed` or `global` have the same
meaning. `long` is for tests that run slightly longer that one second. You can
disable these groups of tests using `--no-zookeeper`, `--no-shard` and
`--no-long` options, respectively.
Some tests are marked with `zookeeper`, `shard` or `long` in their names. `zookeeper` is for tests that are using ZooKeeper. `shard` is for tests that require the server to listen `127.0.0.*`; `distributed` or `global` have the same meaning. `long` is for tests that run slightly longer than one second. You can disable these groups of tests using the `--no-zookeeper`, `--no-shard` and `--no-long` options, respectively.
## Known Bugs {#known-bugs}
@ -153,11 +148,11 @@ Motivation:
Normally we release and run all tests on a single variant of ClickHouse build. But there are alternative build variants that are not thoroughly tested. Examples:
- build on FreeBSD;
- build on Debian with libraries from system packages;
- build with shared linking of libraries;
- build on AArch64 platform;
- build on PowerPc platform.
- build on FreeBSD
- build on Debian with libraries from system packages
- build with shared linking of libraries
- build on AArch64 platform
- build on PowerPc platform
For example, build with system packages is bad practice, because we cannot guarantee what exact version of packages a system will have. But this is really needed by Debian maintainers. For this reason we at least have to support this variant of build. Another example: shared linking is a common source of trouble, but it is needed for some enthusiasts.
@ -177,22 +172,22 @@ For production builds, gcc is used (it still generates slightly more efficient c
## Sanitizers {#sanitizers}
**Address sanitizer**.
### Address sanitizer
We run functional and integration tests under ASan on per-commit basis.
**Valgrind (Memcheck)**.
### Valgrind (Memcheck)
We run functional tests under Valgrind overnight. It takes multiple hours. Currently there is one known false positive in `re2` library, see [this article](https://research.swtch.com/sparse).
**Undefined behaviour sanitizer.**
### Undefined behaviour sanitizer
We run functional and integration tests under UBSan on a per-commit basis.
**Thread sanitizer**.
### Thread sanitizer
We run functional tests under TSan on a per-commit basis. We still don’t run integration tests under TSan on a per-commit basis.
**Memory sanitizer**.
### Memory sanitizer
Currently we still don’t use MSan.
**Debug allocator.**
### Debug allocator
Debug version of `jemalloc` is used for debug build.
## Fuzzing {#fuzzing}
@ -227,7 +222,7 @@ If you use `CLion` as an IDE, you can leverage some `clang-tidy` checks out of t
## Code Style {#code-style}
Code style rules are described [here](https://clickhouse.tech/docs/en/development/style/).
Code style rules are described [here](style.md).
To check for some common style violations, you can use `utils/check-style` script.


@ -5,11 +5,11 @@ toc_priority: 25
toc_title: hidden
---
# ClickHouse Engines
# ClickHouse Engines {#clickhouse-engines}
There are two key engine kinds in ClickHouse:
- [Table engines](table-engines/index.md)
- [Database engines](database-engines/index.md)
- [Table engines](../engines/table-engines/index.md)
- [Database engines](../engines/database-engines/index.md)
{## [Original article](https://clickhouse.tech/docs/en/engines/) ##}


@ -3,14 +3,14 @@ toc_folder_title: Integrations
toc_priority: 30
---
# Table Engines for Integrations
# Table Engines for Integrations {#table-engines-for-integrations}
ClickHouse provides various means for integrating with external systems, including table engines. Like with all other table engines, the configuration is done using `CREATE TABLE` or `ALTER TABLE` queries. Then from a user perspective, the configured integration looks like a normal table, but queries to it are proxied to the external system. This transparent querying is one of the key advantages of this approach over alternative integration methods, like external dictionaries or table functions, which require the use of custom query methods on each use.
List of supported integrations:
- [ODBC](odbc.md)
- [JDBC](jdbc.md)
- [MySQL](mysql.md)
- [HDFS](hdfs.md)
- [Kafka](kafka.md)
- [ODBC](../../../engines/table-engines/integrations/odbc.md)
- [JDBC](../../../engines/table-engines/integrations/jdbc.md)
- [MySQL](../../../engines/table-engines/integrations/mysql.md)
- [HDFS](../../../engines/table-engines/integrations/hdfs.md)
- [Kafka](../../../engines/table-engines/integrations/kafka.md)


@ -18,7 +18,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
) ENGINE = MySQL('host:port', 'database', 'table', 'user', 'password'[, replace_query, 'on_duplicate_clause']);
```
See a detailed description of the [CREATE TABLE](../../../sql-reference/statements/create.md#create-table-query) query.
See a detailed description of the [CREATE TABLE](../../../sql-reference/statements/create/table.md#create-table-query) query.
The table structure can differ from the original MySQL table structure:


@ -23,7 +23,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
ENGINE = ODBC(connection_settings, external_database, external_table)
```
See a detailed description of the [CREATE TABLE](../../../sql-reference/statements/create.md#create-table-query) query.
See a detailed description of the [CREATE TABLE](../../../sql-reference/statements/create/table.md#create-table-query) query.
The table structure can differ from the source table structure:


@ -0,0 +1,122 @@
---
toc_priority: 6
toc_title: RabbitMQ
---
# RabbitMQ Engine {#rabbitmq-engine}
This engine allows integrating ClickHouse with [RabbitMQ](https://www.rabbitmq.com).
RabbitMQ lets you:
- Publish or subscribe to data flows.
- Process streams as they become available.
## Creating a Table {#table_engine-rabbitmq-creating-a-table}
``` sql
CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
(
name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1],
name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2],
...
) ENGINE = RabbitMQ SETTINGS
rabbitmq_host_port = 'host:port',
rabbitmq_exchange_name = 'exchange_name',
rabbitmq_format = 'data_format'[,]
[rabbitmq_exchange_type = 'exchange_type',]
[rabbitmq_routing_key_list = 'key1,key2,...',]
[rabbitmq_row_delimiter = 'delimiter_symbol',]
[rabbitmq_num_consumers = N,]
[rabbitmq_num_queues = N,]
[rabbitmq_transactional_channel = 0]
```
Required parameters:
- `rabbitmq_host_port` – host:port (for example, `localhost:5672`).
- `rabbitmq_exchange_name` – RabbitMQ exchange name.
- `rabbitmq_format` – Message format. Uses the same notation as the SQL `FORMAT` function, such as `JSONEachRow`. For more information, see the [Formats](../../../interfaces/formats.md) section.
Optional parameters:
- `rabbitmq_exchange_type` – The type of RabbitMQ exchange: `direct`, `fanout`, `topic`, `headers`, `consistent-hash`. Default: `fanout`.
- `rabbitmq_routing_key_list` – A comma-separated list of routing keys.
- `rabbitmq_row_delimiter` – Delimiter character, which ends the message.
- `rabbitmq_num_consumers` – The number of consumers per table. Default: `1`. Specify more consumers if the throughput of one consumer is insufficient.
- `rabbitmq_num_queues` – The number of queues per consumer. Default: `1`. Specify more queues if the capacity of one queue per consumer is insufficient. A single queue can contain up to 50K messages at the same time.
- `rabbitmq_transactional_channel` – Wrap insert queries in transactions. Default: `0`.
Required configuration:
The RabbitMQ server configuration should be added using the ClickHouse config file.
``` xml
<rabbitmq>
<username>root</username>
<password>clickhouse</password>
</rabbitmq>
```
Example:
``` sql
CREATE TABLE queue (
key UInt64,
value UInt64
) ENGINE = RabbitMQ SETTINGS rabbitmq_host_port = 'localhost:5672',
rabbitmq_exchange_name = 'exchange1',
rabbitmq_format = 'JSONEachRow',
rabbitmq_num_consumers = 5;
```
## Description {#description}
`SELECT` is not particularly useful for reading messages (except for debugging), because each message can be read only once. It is more practical to create real-time threads using materialized views. To do this:
1. Use the engine to create a RabbitMQ consumer and consider it a data stream.
2. Create a table with the desired structure.
3. Create a materialized view that converts data from the engine and puts it into a previously created table.
When the `MATERIALIZED VIEW` joins the engine, it starts collecting data in the background. This allows you to continually receive messages from RabbitMQ and convert them to the required format using `SELECT`.
One RabbitMQ table can have as many materialized views as you like.
Data can be channeled based on `rabbitmq_exchange_type` and the specified `rabbitmq_routing_key_list`.
There can be no more than one exchange per table. One exchange can be shared between multiple tables - it enables routing into multiple tables at the same time.
Exchange type options:
- `direct` - Routing is based on exact matching of keys. Example table key list: `key1,key2,key3,key4,key5`, a message key can equal any of them.
- `fanout` - Routing to all tables (where exchange name is the same) regardless of the keys.
- `topic` - Routing is based on patterns with dot-separated keys. Examples: `*.logs`, `records.*.*.2020`, `*.2018,*.2019,*.2020`.
- `headers` - Routing is based on `key=value` matches with a setting `x-match=all` or `x-match=any`. Example table key list: `x-match=all,format=logs,type=report,year=2020`.
- `consistent-hash` - Data is evenly distributed between all bound tables (where exchange name is the same). Note that this exchange type must be enabled with RabbitMQ plugin: `rabbitmq-plugins enable rabbitmq_consistent_hash_exchange`.
If the exchange type is not specified, the default is `fanout`, and routing keys for data publishing must be randomized in the range `[1, num_consumers]` for every message/batch (or in the range `[1, num_consumers * num_queues]` if `rabbitmq_num_queues` is set). This table configuration works faster than any other, especially when the `rabbitmq_num_consumers` and/or `rabbitmq_num_queues` parameters are set.
If `rabbitmq_num_consumers` and/or `rabbitmq_num_queues` parameters are specified along with `rabbitmq_exchange_type`, then:
- `rabbitmq-consistent-hash-exchange` plugin must be enabled.
- `message_id` property of the published messages must be specified (unique for each message/batch).
Example:
``` sql
CREATE TABLE queue (
key UInt64,
value UInt64
) ENGINE = RabbitMQ SETTINGS rabbitmq_host_port = 'localhost:5672',
rabbitmq_exchange_name = 'exchange1',
rabbitmq_exchange_type = 'headers',
rabbitmq_routing_key_list = 'format=logs,type=report,year=2020',
rabbitmq_format = 'JSONEachRow',
rabbitmq_num_consumers = 5;
CREATE TABLE daily (key UInt64, value UInt64)
ENGINE = MergeTree();
CREATE MATERIALIZED VIEW consumer TO daily
AS SELECT key, value FROM queue;
SELECT key, value FROM daily ORDER BY key;
```


@ -1,6 +1,45 @@
---
toc_folder_title: Log Family
toc_priority: 29
toc_title: Introduction
---
# Log Engine Family {#log-engine-family}
These engines were developed for scenarios when you need to quickly write many small tables (up to about 1 million rows) and read them later as a whole.
Engines of the family:
- [StripeLog](../../../engines/table-engines/log-family/stripelog.md)
- [Log](../../../engines/table-engines/log-family/log.md)
- [TinyLog](../../../engines/table-engines/log-family/tinylog.md)
## Common Properties {#common-properties}
Engines:
- Store data on a disk.
- Append data to the end of file when writing.
- Support locks for concurrent data access.
During `INSERT` queries, the table is locked, and other queries for reading and writing data both wait for the table to unlock. If there are no data writing queries, any number of data reading queries can be performed concurrently.
- Do not support [mutations](../../../sql-reference/statements/alter/index.md#alter-mutations).
- Do not support indexes.
This means that `SELECT` queries for ranges of data are not efficient.
- Do not write data atomically.
You can get a table with corrupted data if something breaks the write operation, for example, abnormal server shutdown.
## Differences {#differences}
The `TinyLog` engine is the simplest in the family and provides the poorest functionality and lowest efficiency. The `TinyLog` engine doesn’t support parallel data reading by several threads in a single query. It reads data slower than other engines in the family that support parallel reading from a single query and it uses almost as many file descriptors as the `Log` engine because it stores each column in a separate file. Use it only in simple scenarios.
The `Log` and `StripeLog` engines support parallel data reading. When reading data, ClickHouse uses multiple threads. Each thread processes a separate data block. The `Log` engine uses a separate file for each column of the table. `StripeLog` stores all the data in one file. As a result, the `StripeLog` engine uses fewer file descriptors, but the `Log` engine provides higher efficiency when reading data.
[Original article](https://clickhouse.tech/docs/en/operations/table_engines/log_family/) <!--hide-->


@ -1,44 +0,0 @@
---
toc_priority: 31
toc_title: Introduction
---
# Log Engine Family {#log-engine-family}
These engines were developed for scenarios when you need to quickly write many small tables (up to about 1 million rows) and read them later as a whole.
Engines of the family:
- [StripeLog](../../../engines/table-engines/log-family/stripelog.md)
- [Log](../../../engines/table-engines/log-family/log.md)
- [TinyLog](../../../engines/table-engines/log-family/tinylog.md)
## Common Properties {#common-properties}
Engines:
- Store data on a disk.
- Append data to the end of file when writing.
- Support locks for concurrent data access.
During `INSERT` queries, the table is locked, and other queries for reading and writing data both wait for the table to unlock. If there are no data writing queries, any number of data reading queries can be performed concurrently.
- Do not support [mutation](../../../sql-reference/statements/alter.md#alter-mutations) operations.
- Do not support indexes.
This means that `SELECT` queries for ranges of data are not efficient.
- Do not write data atomically.
You can get a table with corrupted data if something breaks the write operation, for example, abnormal server shutdown.
## Differences {#differences}
The `TinyLog` engine is the simplest in the family and provides the poorest functionality and lowest efficiency. The `TinyLog` engine doesn’t support parallel data reading by several threads. It reads data slower than other engines in the family that support parallel reading and it uses almost as many descriptors as the `Log` engine because it stores each column in a separate file. Use it in simple low-load scenarios.
The `Log` and `StripeLog` engines support parallel data reading. When reading data, ClickHouse uses multiple threads. Each thread processes a separate data block. The `Log` engine uses a separate file for each column of the table. `StripeLog` stores all the data in one file. As a result, the `StripeLog` engine uses fewer descriptors in the operating system, but the `Log` engine provides higher efficiency when reading data.
[Original article](https://clickhouse.tech/docs/en/operations/table_engines/log_family/) <!--hide-->

View File

@ -5,7 +5,7 @@ toc_title: Log
# Log {#log}
Engine belongs to the family of log engines. See the common properties of log engines and their differences in the [Log Engine Family](../../../engines/table-engines/log-family/log-family.md) article.
The engine belongs to the family of log engines. See the common properties of log engines and their differences in the [Log Engine Family](../../../engines/table-engines/log-family/index.md) article.
Log differs from [TinyLog](../../../engines/table-engines/log-family/tinylog.md) in that a small file of “marks” resides with the column files. These marks are written on every data block and contain offsets that indicate where to start reading the file in order to skip the specified number of rows. This makes it possible to read table data in multiple threads.
For concurrent data access, the read operations can be performed simultaneously, while write operations block reads and each other.
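As a rough illustration, the marks allow a single query to read a `Log` table with several threads; `max_threads` is a standard query-level setting, and the table name is hypothetical:
``` sql
-- Each thread starts reading at a different mark (data block).
SELECT count()
FROM hits_log
SETTINGS max_threads = 4;
```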

View File

@ -5,7 +5,7 @@ toc_title: StripeLog
# Stripelog {#stripelog}
This engine belongs to the family of log engines. See the common properties of log engines and their differences in the [Log Engine Family](../../../engines/table-engines/log-family/log-family.md) article.
This engine belongs to the family of log engines. See the common properties of log engines and their differences in the [Log Engine Family](../../../engines/table-engines/log-family/index.md) article.
Use this engine in scenarios where you need to write many tables with small amounts of data (less than 1 million rows).
@ -20,7 +20,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
) ENGINE = StripeLog
```
See the detailed description of the [CREATE TABLE](../../../sql-reference/statements/create.md#create-table-query) query.
See the detailed description of the [CREATE TABLE](../../../sql-reference/statements/create/table.md#create-table-query) query.
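For example, a minimal sketch of such a table (the column names are illustrative):
``` sql
-- StripeLog stores all columns of every inserted block together in a single data file.
CREATE TABLE stripe_log_table
(
    timestamp DateTime,
    message_type String,
    message String
)
ENGINE = StripeLog;
```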
## Writing the Data {#table_engines-stripelog-writing-the-data}

View File

@ -5,7 +5,7 @@ toc_title: TinyLog
# TinyLog {#tinylog}
The engine belongs to the log engine family. See [Log Engine Family](../../../engines/table-engines/log-family/log-family.md) for common properties of log engines and their differences.
The engine belongs to the log engine family. See [Log Engine Family](../../../engines/table-engines/log-family/index.md) for common properties of log engines and their differences.
This table engine is typically used with the write-once method: write the data once, then read it as many times as necessary. For example, you can use `TinyLog`-type tables for intermediate data that is processed in small batches. Note that storing data in a large number of small tables is inefficient.
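A hedged sketch of the write-once pattern (the table name and data are made up for illustration):
``` sql
-- Write a small intermediate batch once...
CREATE TABLE batch_tmp
(
    id UInt64,
    payload String
)
ENGINE = TinyLog;

INSERT INTO batch_tmp SELECT number, toString(number) FROM system.numbers LIMIT 1000;

-- ...then read it back as many times as needed.
SELECT count() FROM batch_tmp;
```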

View File

@ -32,7 +32,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
[SETTINGS name=value, ...]
```
For a description of request parameters, see [request description](../../../sql-reference/statements/create.md).
For a description of request parameters, see [request description](../../../sql-reference/statements/create/table.md).
**Query clauses**

View File

@ -26,7 +26,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
[SETTINGS name=value, ...]
```
For a description of query parameters, see [query description](../../../sql-reference/statements/create.md).
For a description of query parameters, see [query description](../../../sql-reference/statements/create/table.md).
**CollapsingMergeTree Parameters**
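For orientation, a minimal sketch of a `CollapsingMergeTree` table with its `sign` parameter (the table and column names are hypothetical):
``` sql
-- `sign` marks the row kind: 1 for a "state" row, -1 for a "cancel" row that collapses it.
CREATE TABLE user_actions
(
    user_id UInt64,
    views UInt32,
    sign Int8
)
ENGINE = CollapsingMergeTree(sign)
ORDER BY user_id;
```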

View File

@ -77,7 +77,7 @@ Lets break down the name of the first part: `201901_1_3_1`:
The `active` column shows the status of the part. `1` is active; `0` is inactive. The inactive parts are, for example, source parts remaining after merging into a larger part. The corrupted data parts are also marked as inactive.
As you can see in the example, there are several separated parts of the same partition (for example, `201901_1_3_1` and `201901_1_9_2`). This means that these parts are not merged yet. ClickHouse merges the inserted parts of data periodically, approximately 15 minutes after inserting. In addition, you can perform a non-scheduled merge using the [OPTIMIZE](../../../sql-reference/statements/misc.md#misc_operations-optimize) query. Example:
As you can see in the example, there are several separate parts of the same partition (for example, `201901_1_3_1` and `201901_1_9_2`). This means that these parts are not merged yet. ClickHouse merges the inserted parts of data periodically, approximately 15 minutes after an insert. In addition, you can perform a non-scheduled merge using the [OPTIMIZE](../../../sql-reference/statements/optimize.md) query. Example:
``` sql
OPTIMIZE TABLE visits PARTITION 201902;
@ -116,10 +116,10 @@ drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 1 16:48 detached
The folders 201901\_1\_1\_0, 201901\_1\_7\_1 and so on are the directories of the parts. Each part relates to a corresponding partition and contains data just for a certain month (the table in this example has partitioning by month).
The `detached` directory contains parts that were detached from the table using the [DETACH](../../../sql-reference/statements/alter.md#alter_detach-partition) query. The corrupted parts are also moved to this directory, instead of being deleted. The server does not use the parts from the `detached` directory. You can add, delete, or modify the data in this directory at any time the server will not know about this until you run the [ATTACH](../../../sql-reference/statements/alter.md#alter_attach-partition) query.
The `detached` directory contains parts that were detached from the table using the [DETACH](../../../sql-reference/statements/alter/partition.md#alter_detach-partition) query. The corrupted parts are also moved to this directory, instead of being deleted. The server does not use the parts from the `detached` directory. You can add, delete, or modify the data in this directory at any time; the server will not know about this until you run the [ATTACH](../../../sql-reference/statements/alter/partition.md#alter_attach-partition) query.
Note that on a running server, you cannot manually change the set of parts or their data on the file system, since the server will not know about it. For non-replicated tables, you can do this when the server is stopped, but it isn’t recommended. For replicated tables, the set of parts cannot be changed in any case.
ClickHouse allows you to perform operations with the partitions: delete them, copy from one table to another, or create a backup. See the list of all operations in the section [Manipulations With Partitions and Parts](../../../sql-reference/statements/alter.md#alter_manipulations-with-partitions).
ClickHouse allows you to perform operations with the partitions: delete them, copy from one table to another, or create a backup. See the list of all operations in the section [Manipulations With Partitions and Parts](../../../sql-reference/statements/alter/partition.md#alter_manipulations-with-partitions).
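For instance, hedged sketches of such operations on the `visits` table from the example above:
``` sql
-- Detach a partition; its parts are moved to the detached directory.
ALTER TABLE visits DETACH PARTITION 201901;

-- Attach it back after inspecting or repairing the files on disk.
ALTER TABLE visits ATTACH PARTITION 201901;

-- Create a local backup of a partition (hard links under the shadow directory).
ALTER TABLE visits FREEZE PARTITION 201902;
```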
[Original article](https://clickhouse.tech/docs/en/operations/table_engines/custom_partitioning_key/) <!--hide-->

View File

@ -28,7 +28,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
[SETTINGS name=value, ...]
```
See a detailed description of the [CREATE TABLE](../../../sql-reference/statements/create.md#create-table-query) query.
See a detailed description of the [CREATE TABLE](../../../sql-reference/statements/create/table.md#create-table-query) query.
A table for the Graphite data should have the following columns for the following data:

View File

@ -1,6 +1,17 @@
---
toc_folder_title: MergeTree Family
toc_priority: 28
toc_title: Introduction
---
# MergeTree Engine Family {#mergetree-engine-family}
Table engines from the MergeTree family are the core of ClickHouse data storage capabilities. They provide most of the features for resilience and high-performance data retrieval: columnar storage, custom partitioning, a sparse primary index, secondary data-skipping indexes, and so on.
The base [MergeTree](../../../engines/table-engines/mergetree-family/mergetree.md) table engine can be considered the default table engine for single-node ClickHouse instances because it is versatile and practical for a wide range of use cases.
For production usage, [ReplicatedMergeTree](../../../engines/table-engines/mergetree-family/replication.md) is the way to go, because it adds high availability to all the features of the regular MergeTree engine. A bonus is automatic data deduplication on ingestion, so the software can safely retry an insert if there was a network issue.
All other engines of the MergeTree family add extra functionality for specific use cases, usually implemented as additional data manipulation in the background.
The main downside of MergeTree engines is that they are rather heavyweight, so the typical pattern is to use relatively few of them. If you need many small tables, for example for temporary data, consider the [Log engine family](../../../engines/table-engines/log-family/index.md).
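As a point of reference, a minimal sketch of a base MergeTree table (all names here are illustrative):
``` sql
-- Columnar storage with a sparse primary index on the sorting key and monthly partitions.
CREATE TABLE page_views
(
    event_date Date,
    user_id UInt64,
    url String
)
ENGINE = MergeTree
PARTITION BY toYYYYMM(event_date)
ORDER BY (event_date, user_id);
```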

View File

@ -49,7 +49,7 @@ ORDER BY expr
[SETTINGS name=value, ...]
```
For a description of parameters, see the [CREATE query description](../../../sql-reference/statements/create.md).
For a description of parameters, see the [CREATE query description](../../../sql-reference/statements/create/table.md).
### Query Clauses {#mergetree-query-clauses}
@ -96,6 +96,7 @@ For a description of parameters, see the [CREATE query description](../../../sql
- `write_final_mark` — Enables or disables writing the final index mark at the end of data part (after the last byte). Default value: 1. Dont turn it off.
- `merge_max_block_size` — Maximum number of rows in block for merge operations. Default value: 8192.
- `storage_policy` — Storage policy. See [Using Multiple Block Devices for Data Storage](#table_engine-mergetree-multiple-volumes).
- `min_bytes_for_wide_part`, `min_rows_for_wide_part` — Minimum number of bytes/rows in a data part that can be stored in `Wide` format. You can set one, both or none of these settings. See [Data Storage](#mergetree-data-storage).
**Example of Sections Setting**
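A sketch of how such sections might be combined (the column names and values are illustrative):
``` sql
CREATE TABLE example_settings
(
    EventDate Date,
    CounterID UInt32,
    UserID UInt64
)
ENGINE = MergeTree()
PARTITION BY toYYYYMM(EventDate)
ORDER BY (CounterID, EventDate)
SETTINGS index_granularity = 8192;
```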
@ -149,6 +150,10 @@ When data is inserted in a table, separate data parts are created and each of th
Data belonging to different partitions are separated into different parts. In the background, ClickHouse merges data parts for more efficient storage. Parts belonging to different partitions are not merged. The merge mechanism does not guarantee that all rows with the same primary key will be in the same data part.
Data parts can be stored in `Wide` or `Compact` format. In `Wide` format, each column is stored in a separate file in the filesystem; in `Compact` format, all columns are stored in one file. The `Compact` format can be used to increase the performance of small and frequent inserts.
The data storage format is controlled by the `min_bytes_for_wide_part` and `min_rows_for_wide_part` settings of the table engine. If the number of bytes or rows in a data part is less than the corresponding setting's value, the part is stored in `Compact` format. Otherwise it is stored in `Wide` format. If neither of these settings is set, data parts are stored in `Wide` format.
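For example, a hedged sketch that keeps small parts in `Compact` format (the table name and threshold are arbitrary):
``` sql
-- Parts smaller than ~10 MB are written in Compact format (all columns in one file).
CREATE TABLE compact_parts_example
(
    event_date Date,
    user_id UInt64
)
ENGINE = MergeTree
ORDER BY user_id
SETTINGS min_bytes_for_wide_part = 10485760;
```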
Each data part is logically divided into granules. A granule is the smallest indivisible data set that ClickHouse reads when selecting data. ClickHouse doesnt split rows or values, so each granule always contains an integer number of rows. The first row of a granule is marked with the value of the primary key for the row. For each data part, ClickHouse creates an index file that stores the marks. For each column, whether its in the primary key or not, ClickHouse also stores the same marks. These marks let you find data directly in column files.
The granule size is restricted by the `index_granularity` and `index_granularity_bytes` settings of the table engine. The number of rows in a granule lies in the `[1, index_granularity]` range, depending on the size of the rows. The size of a granule can exceed `index_granularity_bytes` if the size of a single row is greater than the value of the setting. In this case, the size of the granule equals the size of the row.
@ -212,7 +217,7 @@ This feature is helpful when using the [SummingMergeTree](../../../engines/table
In this case it makes sense to leave only a few columns in the primary key that will provide efficient range scans and add the remaining dimension columns to the sorting key tuple.
[ALTER](../../../sql-reference/statements/alter.md) of the sorting key is a lightweight operation because when a new column is simultaneously added to the table and to the sorting key, existing data parts dont need to be changed. Since the old sorting key is a prefix of the new sorting key and there is no data in the newly added column, the data is sorted by both the old and new sorting keys at the moment of table modification.
[ALTER](../../../sql-reference/statements/alter/index.md) of the sorting key is a lightweight operation because when a new column is simultaneously added to the table and to the sorting key, existing data parts dont need to be changed. Since the old sorting key is a prefix of the new sorting key and there is no data in the newly added column, the data is sorted by both the old and new sorting keys at the moment of table modification.
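For instance, a sketch of such a lightweight change (the table and column names are hypothetical):
``` sql
-- Add a new dimension column and append it to the sorting key in one statement;
-- existing data parts do not need to be rewritten.
ALTER TABLE page_views
    ADD COLUMN device_type UInt8,
    MODIFY ORDER BY (event_date, user_id, device_type);
```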
### Use of Indexes and Partitions in Queries {#use-of-indexes-and-partitions-in-queries}
@ -482,7 +487,7 @@ Data with an expired TTL is removed when ClickHouse merges data parts.
When ClickHouse sees that data is expired, it performs an off-schedule merge. To control the frequency of such merges, you can set `merge_with_ttl_timeout`. If the value is too low, it will perform many off-schedule merges that may consume a lot of resources.
If you perform the `SELECT` query between merges, you may get expired data. To avoid it, use the [OPTIMIZE](../../../sql-reference/statements/misc.md#misc_operations-optimize) query before `SELECT`.
If you perform the `SELECT` query between merges, you may get expired data. To avoid it, use the [OPTIMIZE](../../../sql-reference/statements/optimize.md) query before `SELECT`.
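For example (assuming a hypothetical table named `events_with_ttl`):
``` sql
-- Force merges so that rows with an expired TTL are removed before reading.
OPTIMIZE TABLE events_with_ttl FINAL;

SELECT count() FROM events_with_ttl;
```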
## Using Multiple Block Devices for Data Storage {#table_engine-mergetree-multiple-volumes}
@ -490,7 +495,7 @@ If you perform the `SELECT` query between merges, you may get expired data. To a
`MergeTree` family table engines can store data on multiple block devices. For example, it can be useful when the data of a certain table are implicitly split into “hot” and “cold”. The most recent data is regularly requested but requires only a small amount of space. In contrast, the bulky historical data is requested rarely. If several disks are available, the “hot” data may be located on fast disks (for example, NVMe SSDs or in memory), while the “cold” data may be located on relatively slow ones (for example, HDD).
Data part is the minimum movable unit for `MergeTree`-engine tables. The data belonging to one part are stored on one disk. Data parts can be moved between disks in the background (according to user settings) as well as by means of the [ALTER](../../../sql-reference/statements/alter.md#alter_move-partition) queries.
A data part is the minimum movable unit for `MergeTree`-engine tables. The data belonging to one part is stored on one disk. Data parts can be moved between disks in the background (according to user settings) as well as by means of the [ALTER](../../../sql-reference/statements/alter/partition.md#alter_move-partition) queries.
### Terms {#terms}
@ -636,9 +641,9 @@ The number of threads performing background moves of data parts can be changed b
In the case of `MergeTree` tables, data reaches the disk in different ways:
- As a result of an insert (`INSERT` query).
- During background merges and [mutations](../../../sql-reference/statements/alter.md#alter-mutations).
- During background merges and [mutations](../../../sql-reference/statements/alter/index.md#alter-mutations).
- When downloading from another replica.
- As a result of partition freezing [ALTER TABLE … FREEZE PARTITION](../../../sql-reference/statements/alter.md#alter_freeze-partition).
- As a result of partition freezing [ALTER TABLE … FREEZE PARTITION](../../../sql-reference/statements/alter/partition.md#alter_freeze-partition).
In all these cases except for mutations and partition freezing, a part is stored on a volume and a disk according to the given storage policy:
@ -650,7 +655,7 @@ Under the hood, mutations and partition freezing make use of [hard links](https:
In the background, parts are moved between volumes based on the amount of free space (the `move_factor` parameter), in the order the volumes are declared in the configuration file.
Data is never transferred from the last volume or into the first one. You can use the system tables [system.part\_log](../../../operations/system-tables/part_log.md#system_tables-part-log) (field `type = MOVE_PART`) and [system.parts](../../../operations/system-tables/parts.md#system_tables-parts) (fields `path` and `disk`) to monitor background moves. Detailed information can also be found in the server logs.
User can force moving a part or a partition from one volume to another using the query [ALTER TABLE … MOVE PART\|PARTITION … TO VOLUME\|DISK …](../../../sql-reference/statements/alter.md#alter_move-partition), all the restrictions for background operations are taken into account. The query initiates a move on its own and does not wait for background operations to be completed. User will get an error message if not enough free space is available or if any of the required conditions are not met.
Users can force moving a part or a partition from one volume to another with the [ALTER TABLE … MOVE PART\|PARTITION … TO VOLUME\|DISK …](../../../sql-reference/statements/alter/partition.md#alter_move-partition) query; all the restrictions for background operations are taken into account. The query initiates a move on its own and does not wait for background operations to complete. The user will get an error message if not enough free space is available or if any of the required conditions are not met.
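A hedged sketch, assuming the storage policy defines a volume named `cold`:
``` sql
-- Move one partition of the hypothetical page_views table to the slow volume.
ALTER TABLE page_views MOVE PARTITION 202001 TO VOLUME 'cold';
```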
Moving data does not interfere with data replication. Therefore, different storage policies can be specified for the same table on different replicas.

Some files were not shown because too many files have changed in this diff.