Alexey Vasiliev 2015-04-07 15:15:17 +03:00
commit 1eb7413b25
65 changed files with 2644 additions and 1169 deletions

View File

@ -0,0 +1,18 @@
#!/bin/bash
QUERIES_FILE="queries.sql"
TABLE=$1
TRIES=3
cat "$QUERIES_FILE" | sed "s/{table}/${TABLE}/g" | while read query; do
sync
echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null
echo -n "["
for i in $(seq 1 $TRIES); do
RES=$(clickhouse-client --time --format=Null --query="$query" 2>&1)
[[ "$?" == "0" ]] && echo -n "${RES}" || echo -n "null"
[[ "$i" != $TRIES ]] && echo -n ", "
done
echo "],"
done
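Invocation sketch (the file name benchmark-new.sh is an assumption; all the script needs is queries.sql in the current directory and the table name as its first argument):

./benchmark-new.sh hits_100m

For every query this prints one line of the form "[0.123, 0.110, 0.108]," — the three timings in seconds, with null substituted for a failed run.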

View File

@ -1,366 +0,0 @@
#!/bin/bash
test_table="hits_100m"
start_date="'2013-07-01'"
early_stop_date="'2013-07-02'"
stop_date="'2013-07-31'"
counter_id=34
function run_ck_server
{
sudo sh -c " ulimit -v 54000000; /etc/init.d/clickhouse-server restart"
}
# execute queries
function execute()
{
queries=("${@}")
queries_count=${#queries[@]}
if [ -z "$TIMES" ]; then
TIMES=1
fi
index=0
comment_re='\#.*'
while [ "$index" -lt "$queries_count" ]; do
query=${queries[$index]}
if [[ $query =~ $comment_re ]]; then
echo "$query"
echo
else
sync
sudo sh -c "echo 3 > /proc/sys/vm/drop_caches"
for i in $(seq $TIMES)
do
expect -f ./expect.tcl "$query"
ret=$?
if [ "$ret" != "0" ]; then
echo "Error: $ret"
#break
fi
# restart clickhouse if it is not running
if ! ps aux | grep -P '\d+ clickhouse-server' > /dev/null; then
run_ck_server
fi
done
fi
let "index = $index + 1"
echo "Ran $index queries." >&2
done
}
init_queries=(
# DB structure with array arguments
#"CREATE TABLE $test_table ( WatchID UInt64, JavaEnable UInt8, Title String, GoodEvent Int16, EventTime DateTime, EventDate Date, CounterID UInt32, ClientIP UInt32, RegionID UInt32, UserID UInt64, CounterClass Int8, OS UInt8, UserAgent UInt8, URL String, Referer String, Refresh UInt8, RefererCategoryID UInt16, RefererRegionID UInt32, URLCategoryID UInt16, URLRegionID UInt32, ResolutionWidth UInt16, ResolutionHeight UInt16, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, FlashMinor2 String, NetMajor UInt8, NetMinor UInt8, UserAgentMajor UInt16, UserAgentMinor FixedString(2), CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, MobilePhone UInt8, MobilePhoneModel String, Params String, IPNetworkID UInt32, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, IsArtifical UInt8, WindowClientWidth UInt16, WindowClientHeight UInt16, ClientTimeZone Int16, ClientEventTime DateTime, SilverlightVersion1 UInt8, SilverlightVersion2 UInt8, SilverlightVersion3 UInt32, SilverlightVersion4 UInt16, PageCharset String, CodeVersion UInt32, IsLink UInt8, IsDownload UInt8, IsNotBounce UInt8, FUniqID UInt64, OriginalURL String, HID UInt32, IsOldCounter UInt8, IsEvent UInt8, IsParameter UInt8, DontCountHits UInt8, WithHash UInt8, HitColor FixedString(1), LocalEventTime DateTime, Age UInt8, Sex UInt8, Income UInt8, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), RemoteIP UInt32, WindowName Int32, OpenerName Int32, HistoryLength Int16, BrowserLanguage FixedString(2), BrowserCountry FixedString(2), SocialNetwork String, SocialAction String, HTTPError UInt16, SendTiming UInt32, DNSTiming UInt32, ConnectTiming UInt32, ResponseStartTiming UInt32, ResponseEndTiming UInt32, FetchTiming UInt32, SocialSourceNetworkID UInt8, SocialSourcePage String, ParamPrice Int64, ParamOrderID String, ParamCurrency FixedString(3), ParamCurrencyID UInt16, GoalsReached Array(UInt32), OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, RefererHash UInt64, URLHash UInt64, CLID UInt32 ) ENGINE = MergeTree(EventDate, intHash32(UserID), tuple(CounterID, EventDate, intHash32(UserID), EventTime), 8192);"
# DB structure without array arguments
#"CREATE TABLE $test_table ( WatchID UInt64, JavaEnable UInt8, Title String, GoodEvent Int16, EventTime DateTime, EventDate Date, CounterID UInt32, ClientIP UInt32, RegionID UInt32, UserID UInt64, CounterClass Int8, OS UInt8, UserAgent UInt8, URL String, Referer String, Refresh UInt8, RefererCategoryID UInt16, RefererRegionID UInt32, URLCategoryID UInt16, URLRegionID UInt32, ResolutionWidth UInt16, ResolutionHeight UInt16, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, FlashMinor2 String, NetMajor UInt8, NetMinor UInt8, UserAgentMajor UInt16, UserAgentMinor FixedString(2), CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, MobilePhone UInt8, MobilePhoneModel String, Params String, IPNetworkID UInt32, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, IsArtifical UInt8, WindowClientWidth UInt16, WindowClientHeight UInt16, ClientTimeZone Int16, ClientEventTime DateTime, SilverlightVersion1 UInt8, SilverlightVersion2 UInt8, SilverlightVersion3 UInt32, SilverlightVersion4 UInt16, PageCharset String, CodeVersion UInt32, IsLink UInt8, IsDownload UInt8, IsNotBounce UInt8, FUniqID UInt64, OriginalURL String, HID UInt32, IsOldCounter UInt8, IsEvent UInt8, IsParameter UInt8, DontCountHits UInt8, WithHash UInt8, HitColor FixedString(1), LocalEventTime DateTime, Age UInt8, Sex UInt8, Income UInt8, Interests UInt16, Robotness UInt8, RemoteIP UInt32, WindowName Int32, OpenerName Int32, HistoryLength Int16, BrowserLanguage FixedString(2), BrowserCountry FixedString(2), SocialNetwork String, SocialAction String, HTTPError UInt16, SendTiming UInt32, DNSTiming UInt32, ConnectTiming UInt32, ResponseStartTiming UInt32, ResponseEndTiming UInt32, FetchTiming UInt32, SocialSourceNetworkID UInt8, SocialSourcePage String, ParamPrice Int64, ParamOrderID String, ParamCurrency FixedString(3), ParamCurrencyID UInt16, OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, RefererHash UInt64, URLHash UInt64, CLID UInt32 ) ENGINE = MergeTree(EventDate, intHash32(UserID), tuple(CounterID, EventDate, intHash32(UserID), EventTime), 8192);"
# modified table: unsigned 64-bit columns replaced by Int64
"CREATE TABLE $test_table ( WatchID Int64, JavaEnable UInt8, Title String, GoodEvent Int16, EventTime DateTime, EventDate Date, CounterID UInt32, ClientIP UInt32, RegionID UInt32, UserID Int64, CounterClass Int8, OS UInt8, UserAgent UInt8, URL String, Referer String, Refresh UInt8, RefererCategoryID UInt16, RefererRegionID UInt32, URLCategoryID UInt16, URLRegionID UInt32, ResolutionWidth UInt16, ResolutionHeight UInt16, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, FlashMinor2 String, NetMajor UInt8, NetMinor UInt8, UserAgentMajor UInt16, UserAgentMinor FixedString(2), CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, MobilePhone UInt8, MobilePhoneModel String, Params String, IPNetworkID UInt32, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, IsArtifical UInt8, WindowClientWidth UInt16, WindowClientHeight UInt16, ClientTimeZone Int16, ClientEventTime DateTime, SilverlightVersion1 UInt8, SilverlightVersion2 UInt8, SilverlightVersion3 UInt32, SilverlightVersion4 UInt16, PageCharset String, CodeVersion UInt32, IsLink UInt8, IsDownload UInt8, IsNotBounce UInt8, FUniqID Int64, OriginalURL String, HID UInt32, IsOldCounter UInt8, IsEvent UInt8, IsParameter UInt8, DontCountHits UInt8, WithHash UInt8, HitColor FixedString(1), LocalEventTime DateTime, Age UInt8, Sex UInt8, Income UInt8, Interests UInt16, Robotness UInt8, RemoteIP UInt32, WindowName Int32, OpenerName Int32, HistoryLength Int16, BrowserLanguage FixedString(2), BrowserCountry FixedString(2), SocialNetwork String, SocialAction String, HTTPError UInt16, SendTiming UInt32, DNSTiming UInt32, ConnectTiming UInt32, ResponseStartTiming UInt32, ResponseEndTiming UInt32, FetchTiming UInt32, SocialSourceNetworkID UInt8, SocialSourcePage String, ParamPrice Int64, ParamOrderID String, ParamCurrency FixedString(3), ParamCurrencyID UInt16, OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, RefererHash Int64, URLHash Int64, CLID UInt32, UserIDHash UInt64 ) ENGINE = MergeTree(EventDate, intHash32(UserID), tuple(CounterID, EventDate, intHash32(UserID), EventTime), 8192);"
)
test_queries=(
"SELECT count() FROM $test_table;"
"SELECT count() FROM $test_table WHERE AdvEngineID != 0;"
"SELECT sum(AdvEngineID), count(), avg(ResolutionWidth) FROM $test_table;"
"SELECT sum(UserID) FROM $test_table;"
"SELECT uniq(UserID) FROM $test_table;"
"SELECT uniq(SearchPhrase) FROM $test_table;"
"SELECT min(EventDate), max(EventDate) FROM $test_table;"
"SELECT AdvEngineID, count() FROM $test_table WHERE AdvEngineID != 0 GROUP BY AdvEngineID ORDER BY count() DESC;"
"#- мощная фильтрация. После фильтрации почти ничего не остаётся, но делаем ещё агрегацию.;"
"SELECT RegionID, uniq(UserID) AS u FROM $test_table GROUP BY RegionID ORDER BY u DESC LIMIT 10;"
"#- агрегация, среднее количество ключей.;"
"SELECT RegionID, sum(AdvEngineID), count() AS c, avg(ResolutionWidth), uniq(UserID) FROM $test_table GROUP BY RegionID ORDER BY c DESC LIMIT 10;"
"#- агрегация, среднее количество ключей, несколько агрегатных функций.;"
"SELECT MobilePhoneModel, uniq(UserID) AS u FROM $test_table WHERE MobilePhoneModel != '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;"
"#- мощная фильтрация по строкам, затем агрегация по строкам.;"
"SELECT MobilePhone, MobilePhoneModel, uniq(UserID) AS u FROM $test_table WHERE MobilePhoneModel != '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;"
"#- мощная фильтрация по строкам, затем агрегация по паре из числа и строки.;"
"SELECT SearchPhrase, count() AS c FROM $test_table WHERE SearchPhrase != '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;"
"#- средняя фильтрация по строкам, затем агрегация по строкам, большое количество ключей.;"
"SELECT SearchPhrase, uniq(UserID) AS u FROM $test_table WHERE SearchPhrase != '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;"
"#- агрегация чуть сложнее.;"
"SELECT SearchEngineID, SearchPhrase, count() AS c FROM $test_table WHERE SearchPhrase != '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10;"
"#- агрегация по числу и строке, большое количество ключей.;"
"SELECT UserID, count() FROM $test_table GROUP BY UserID ORDER BY count() DESC LIMIT 10;"
"#- агрегация по очень большому количеству ключей, может не хватить оперативки.;"
"SELECT UserID, SearchPhrase, count() FROM $test_table GROUP BY UserID, SearchPhrase ORDER BY count() DESC LIMIT 10;"
"#- ещё более сложная агрегация.;"
"SELECT UserID, SearchPhrase, count() FROM $test_table GROUP BY UserID, SearchPhrase LIMIT 10;"
"#- то же самое, но без сортировки.;"
"SELECT UserID, toMinute(EventTime) AS m, SearchPhrase, count() FROM $test_table GROUP BY UserID, m, SearchPhrase ORDER BY count() DESC LIMIT 10;"
"#- ещё более сложная агрегация, не стоит выполнять на больших таблицах.;"
"SELECT UserID FROM $test_table WHERE UserID = 12345678901234567890;"
"#- мощная фильтрация по столбцу типа UInt64.;"
"SELECT count() FROM $test_table WHERE URL LIKE '%metrika%';"
"#- фильтрация по поиску подстроки в строке.;"
"SELECT SearchPhrase, any(URL), count() AS c FROM $test_table WHERE URL LIKE '%metrika%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;"
"#- вынимаем большие столбцы, фильтрация по строке.;"
"SELECT SearchPhrase, any(URL), any(Title), count() AS c, uniq(UserID) FROM $test_table WHERE Title LIKE '%Яндекс%' AND URL NOT LIKE '%.yandex.%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;"
"#- чуть больше столбцы.;"
"SELECT * FROM $test_table WHERE URL LIKE '%metrika%' ORDER BY EventTime LIMIT 10;"
"#- плохой запрос - вынимаем все столбцы.;"
"SELECT SearchPhrase FROM $test_table WHERE SearchPhrase != '' ORDER BY EventTime LIMIT 10;"
"#- большая сортировка.;"
"SELECT SearchPhrase FROM $test_table WHERE SearchPhrase != '' ORDER BY SearchPhrase LIMIT 10;"
"#- большая сортировка по строкам.;"
"SELECT SearchPhrase FROM $test_table WHERE SearchPhrase != '' ORDER BY EventTime, SearchPhrase LIMIT 10;"
"#- большая сортировка по кортежу.;"
"SELECT CounterID, avg(length(URL)) AS l, count() AS c FROM $test_table WHERE URL != '' GROUP BY CounterID HAVING c > 100000 ORDER BY l DESC LIMIT 25;"
"#- считаем средние длины URL для крупных счётчиков.;"
"SELECT domainWithoutWWW(Referer) AS key, avg(length(Referer)) AS l, count() AS c, any(Referer) FROM $test_table WHERE Referer != '' GROUP BY key HAVING c > 100000 ORDER BY l DESC LIMIT 25;"
"#- то же самое, но с разбивкой по доменам.;"
"SELECT sum(ResolutionWidth), sum(ResolutionWidth + 1), sum(ResolutionWidth + 2), sum(ResolutionWidth + 3), sum(ResolutionWidth + 4), sum(ResolutionWidth + 5), sum(ResolutionWidth + 6), sum(ResolutionWidth + 7), sum(ResolutionWidth + 8), sum(ResolutionWidth + 9), sum(ResolutionWidth + 10), sum(ResolutionWidth + 11), sum(ResolutionWidth + 12), sum(ResolutionWidth + 13), sum(ResolutionWidth + 14), sum(ResolutionWidth + 15), sum(ResolutionWidth + 16), sum(ResolutionWidth + 17), sum(ResolutionWidth + 18), sum(ResolutionWidth + 19), sum(ResolutionWidth + 20), sum(ResolutionWidth + 21), sum(ResolutionWidth + 22), sum(ResolutionWidth + 23), sum(ResolutionWidth + 24), sum(ResolutionWidth + 25), sum(ResolutionWidth + 26), sum(ResolutionWidth + 27), sum(ResolutionWidth + 28), sum(ResolutionWidth + 29), sum(ResolutionWidth + 30), sum(ResolutionWidth + 31), sum(ResolutionWidth + 32), sum(ResolutionWidth + 33), sum(ResolutionWidth + 34), sum(ResolutionWidth + 35), sum(ResolutionWidth + 36), sum(ResolutionWidth + 37), sum(ResolutionWidth + 38), sum(ResolutionWidth + 39), sum(ResolutionWidth + 40), sum(ResolutionWidth + 41), sum(ResolutionWidth + 42), sum(ResolutionWidth + 43), sum(ResolutionWidth + 44), sum(ResolutionWidth + 45), sum(ResolutionWidth + 46), sum(ResolutionWidth + 47), sum(ResolutionWidth + 48), sum(ResolutionWidth + 49), sum(ResolutionWidth + 50), sum(ResolutionWidth + 51), sum(ResolutionWidth + 52), sum(ResolutionWidth + 53), sum(ResolutionWidth + 54), sum(ResolutionWidth + 55), sum(ResolutionWidth + 56), sum(ResolutionWidth + 57), sum(ResolutionWidth + 58), sum(ResolutionWidth + 59), sum(ResolutionWidth + 60), sum(ResolutionWidth + 61), sum(ResolutionWidth + 62), sum(ResolutionWidth + 63), sum(ResolutionWidth + 64), sum(ResolutionWidth + 65), sum(ResolutionWidth + 66), sum(ResolutionWidth + 67), sum(ResolutionWidth + 68), sum(ResolutionWidth + 69), sum(ResolutionWidth + 70), sum(ResolutionWidth + 71), sum(ResolutionWidth + 72), sum(ResolutionWidth + 73), sum(ResolutionWidth + 74), sum(ResolutionWidth + 75), sum(ResolutionWidth + 76), sum(ResolutionWidth + 77), sum(ResolutionWidth + 78), sum(ResolutionWidth + 79), sum(ResolutionWidth + 80), sum(ResolutionWidth + 81), sum(ResolutionWidth + 82), sum(ResolutionWidth + 83), sum(ResolutionWidth + 84), sum(ResolutionWidth + 85), sum(ResolutionWidth + 86), sum(ResolutionWidth + 87), sum(ResolutionWidth + 88), sum(ResolutionWidth + 89) FROM $test_table;"
"#- много тупых агрегатных функций.;"
"SELECT SearchEngineID, ClientIP, count() AS c, sum(Refresh), avg(ResolutionWidth) FROM $test_table WHERE SearchPhrase != '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10;"
"#- сложная агрегация, для больших таблиц может не хватить оперативки.;"
"SELECT WatchID, ClientIP, count() AS c, sum(Refresh), avg(ResolutionWidth) FROM $test_table WHERE SearchPhrase != '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;"
"#- агрегация по двум полям, которая ничего не агрегирует. Для больших таблиц выполнить не получится.;"
"SELECT WatchID, ClientIP, count() AS c, sum(Refresh), avg(ResolutionWidth) FROM $test_table GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;"
"#- то же самое, но ещё и без фильтрации.;"
"SELECT URL, count() AS c FROM $test_table GROUP BY URL ORDER BY c DESC LIMIT 10;"
"#- агрегация по URL.;"
"SELECT 1, URL, count() AS c FROM $test_table GROUP BY 1, URL ORDER BY c DESC LIMIT 10;"
"#- агрегация по URL и числу.;"
"SELECT ClientIP AS x, x - 1, x - 2, x - 3, count() AS c FROM $test_table GROUP BY x, x - 1, x - 2, x - 3 ORDER BY c DESC LIMIT 10;"
"SELECT
URL,
count() AS PageViews
FROM $test_table
WHERE
CounterID = $counter_id
AND EventDate >= toDate($start_date)
AND EventDate <= toDate($stop_date)
AND NOT DontCountHits
AND NOT Refresh
AND notEmpty(URL)
GROUP BY URL
ORDER BY PageViews DESC
LIMIT 10;"
"SELECT
Title,
count() AS PageViews
FROM $test_table
WHERE
CounterID = $counter_id
AND EventDate >= toDate($start_date)
AND EventDate <= toDate($stop_date)
AND NOT DontCountHits
AND NOT Refresh
AND notEmpty(Title)
GROUP BY Title
ORDER BY PageViews DESC
LIMIT 10;"
"SELECT
URL,
count() AS PageViews
FROM $test_table
WHERE
CounterID = $counter_id
AND EventDate >= toDate($start_date)
AND EventDate <= toDate($stop_date)
AND NOT Refresh
AND IsLink
AND NOT IsDownload
GROUP BY URL
ORDER BY PageViews DESC
LIMIT 1000;"
"SELECT
TraficSourceID,
SearchEngineID,
AdvEngineID,
((SearchEngineID = 0 AND AdvEngineID = 0) ? Referer : '') AS Src,
URL AS Dst,
count() AS PageViews
FROM $test_table
WHERE
CounterID = $counter_id
AND EventDate >= toDate($start_date)
AND EventDate <= toDate($stop_date)
AND NOT Refresh
GROUP BY
TraficSourceID,
SearchEngineID,
AdvEngineID,
Src,
Dst
ORDER BY PageViews DESC
LIMIT 1000;"
"SELECT
URLHash,
EventDate,
count() AS PageViews
FROM $test_table
WHERE
CounterID = $counter_id
AND EventDate >= toDate($start_date)
AND EventDate <= toDate($stop_date)
AND NOT Refresh
AND TraficSourceID IN (-1, 6)
AND RefererHash = halfMD5('http://example.ru/')
GROUP BY
URLHash,
EventDate
ORDER BY PageViews DESC
LIMIT 100000;"
"SELECT
WindowClientWidth,
WindowClientHeight,
count() AS PageViews
FROM $test_table
WHERE
CounterID = $counter_id
AND EventDate >= toDate($start_date)
AND EventDate <= toDate($stop_date)
AND NOT Refresh
AND NOT DontCountHits
AND URLHash = halfMD5('http://example.ru/')
GROUP BY
WindowClientWidth,
WindowClientHeight
ORDER BY PageViews DESC
LIMIT 10000;"
"SELECT
toStartOfMinute(EventTime) AS Minute,
count() AS PageViews
FROM $test_table
WHERE
CounterID = $counter_id
AND EventDate >= toDate($start_date)
AND EventDate <= toDate($early_stop_date)
AND NOT Refresh
AND NOT DontCountHits
GROUP BY
Minute
ORDER BY Minute;"
)
function test {
TIMES=3
execute "${test_queries[@]}"
}
function init {
execute "${init_queries[@]}"
}
function debug {
TIMES=3
debug_queries=(
)
execute "${debug_queries[@]}"
}
function usage {
cat <<EOF
usage: $0 options
This script runs benchmarks for ClickHouse
OPTIONS:
-h Show this message
-d Run debug queries
-i Init database
-p log_file Parse log file to columns with result
-t Run tests
EOF
}
function parse_log {
results=$(grep -P 'Elapsed: \d+\.\d+ ' "$1" | awk '{print $6}')
index=1
for res in $results
do
echo -n "$res "
let "index=$index % 3"
if [ "$index" == "0" ]; then
echo
fi
let "index=$index + 1"
done
}
if [ "$#" == "0" ]; then
usage
exit 0
fi
echo "Start date" $(date)
while getopts "hitdp:" OPTION
do
case $OPTION in
h)
usage
exit 0
;;
i)
init
;;
t)
test
;;
d)
debug
;;
p)
parse_log $OPTARG
;;
?)
usage
exit 0
;;
esac
done
echo "Stop date" $(date)

View File

@ -1,4 +0,0 @@
CONF_DIR=/home/kartavyy/benchmark/clickhouse
expect_file=$CONF_DIR/expect.tcl
test_file=$CONF_DIR/queries.sql
etc_init_d_service=/etc/init.d/clickhouse-server-metrika-yandex

View File

@ -1,13 +0,0 @@
#!/bin/expect
# Set timeout
set timeout 600
# Get arguments
set query [lindex $argv 0]
spawn clickhouse-client --multiline;
expect ":) "
send "$query;\r";
expect ":) "
send "quit";

View File

@ -1,109 +1,43 @@
SELECT count() FROM hits_10m;
SELECT count() FROM hits_10m WHERE AdvEngineID != 0;
SELECT sum(AdvEngineID), count(), avg(ResolutionWidth) FROM hits_10m ;
SELECT sum(UserID) FROM hits_10m ;
SELECT uniq(UserID) FROM hits_10m ;
SELECT uniq(SearchPhrase) FROM hits_10m ;
SELECT min(EventDate), max(EventDate) FROM hits_10m ;
SELECT AdvEngineID, count() FROM hits_10m WHERE AdvEngineID != 0 GROUP BY AdvEngineID ORDER BY count() DESC;
-- heavy filtering. Almost nothing is left after filtering, but we still run an aggregation.;
SELECT RegionID, uniq(UserID) AS u FROM hits_10m GROUP BY RegionID ORDER BY u DESC LIMIT 10;
-- aggregation with an average number of keys.;
SELECT RegionID, sum(AdvEngineID), count() AS c, avg(ResolutionWidth), uniq(UserID) FROM hits_10m GROUP BY RegionID ORDER BY c DESC LIMIT 10;
-- aggregation with an average number of keys and several aggregate functions.;
SELECT MobilePhoneModel, uniq(UserID) AS u FROM hits_10m WHERE MobilePhoneModel != '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;
-- heavy filtering on strings, then aggregation by strings.;
SELECT MobilePhone, MobilePhoneModel, uniq(UserID) AS u FROM hits_10m WHERE MobilePhoneModel != '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;
-- heavy filtering on strings, then aggregation by a pair of a number and a string.;
SELECT SearchPhrase, count() AS c FROM hits_10m WHERE SearchPhrase != '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
-- moderate filtering on strings, then aggregation by strings, a large number of keys.;
SELECT SearchPhrase, uniq(UserID) AS u FROM hits_10m WHERE SearchPhrase != '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;
-- slightly more complex aggregation.;
SELECT SearchEngineID, SearchPhrase, count() AS c FROM hits_10m WHERE SearchPhrase != '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10;
-- aggregation by a number and a string, a large number of keys.;
SELECT UserID, count() FROM hits_10m GROUP BY UserID ORDER BY count() DESC LIMIT 10;
-- aggregation over a very large number of keys, may run out of RAM.;
SELECT UserID, SearchPhrase, count() FROM hits_10m GROUP BY UserID, SearchPhrase ORDER BY count() DESC LIMIT 10;
-- even more complex aggregation.;
SELECT UserID, SearchPhrase, count() FROM hits_10m GROUP BY UserID, SearchPhrase LIMIT 10;
-- the same, but without sorting.;
SELECT UserID, toMinute(EventTime) AS m, SearchPhrase, count() FROM hits_10m GROUP BY UserID, m, SearchPhrase ORDER BY count() DESC LIMIT 10;
-- even more complex aggregation, should not be run on large tables.;
SELECT UserID FROM hits_10m WHERE UserID = 12345678901234567890;
-- heavy filtering on a UInt64 column.;
SELECT count() FROM hits_10m WHERE URL LIKE '%metrika%';
-- filtering by substring search within a string.;
SELECT SearchPhrase, any(URL), count() AS c FROM hits_10m WHERE URL LIKE '%metrika%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
-- fetching large columns, filtering by a string.;
SELECT SearchPhrase, any(URL), any(Title), count() AS c, uniq(UserID) FROM hits_10m WHERE Title LIKE '%Яндекс%' AND URL NOT LIKE '%.yandex.%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
-- slightly larger columns.;
SELECT * FROM hits_10m WHERE URL LIKE '%metrika%' ORDER BY EventTime LIMIT 10;
-- a bad query - fetching all columns.;
SELECT SearchPhrase FROM hits_10m WHERE SearchPhrase != '' ORDER BY EventTime LIMIT 10;
-- a big sort.;
SELECT SearchPhrase FROM hits_10m WHERE SearchPhrase != '' ORDER BY SearchPhrase LIMIT 10;
-- a big sort by strings.;
SELECT SearchPhrase FROM hits_10m WHERE SearchPhrase != '' ORDER BY EventTime, SearchPhrase LIMIT 10;
-- a big sort by a tuple.;
SELECT CounterID, avg(length(URL)) AS l, count() AS c FROM hits_10m WHERE URL != '' GROUP BY CounterID HAVING c > 100000 ORDER BY l DESC LIMIT 25;
-- computing the average URL length for large counters.;
SELECT domainWithoutWWW(Referer) AS key, avg(length(Referer)) AS l, count() AS c, any(Referer) FROM hits_10m WHERE Referer != '' GROUP BY key HAVING c > 100000 ORDER BY l DESC LIMIT 25;
-- the same, but with a breakdown by domain.;
SELECT sum(ResolutionWidth), sum(ResolutionWidth + 1), sum(ResolutionWidth + 2), sum(ResolutionWidth + 3), sum(ResolutionWidth + 4), sum(ResolutionWidth + 5), sum(ResolutionWidth + 6), sum(ResolutionWidth + 7), sum(ResolutionWidth + 8), sum(ResolutionWidth + 9), sum(ResolutionWidth + 10), sum(ResolutionWidth + 11), sum(ResolutionWidth + 12), sum(ResolutionWidth + 13), sum(ResolutionWidth + 14), sum(ResolutionWidth + 15), sum(ResolutionWidth + 16), sum(ResolutionWidth + 17), sum(ResolutionWidth + 18), sum(ResolutionWidth + 19), sum(ResolutionWidth + 20), sum(ResolutionWidth + 21), sum(ResolutionWidth + 22), sum(ResolutionWidth + 23), sum(ResolutionWidth + 24), sum(ResolutionWidth + 25), sum(ResolutionWidth + 26), sum(ResolutionWidth + 27), sum(ResolutionWidth + 28), sum(ResolutionWidth + 29), sum(ResolutionWidth + 30), sum(ResolutionWidth + 31), sum(ResolutionWidth + 32), sum(ResolutionWidth + 33), sum(ResolutionWidth + 34), sum(ResolutionWidth + 35), sum(ResolutionWidth + 36), sum(ResolutionWidth + 37), sum(ResolutionWidth + 38), sum(ResolutionWidth + 39), sum(ResolutionWidth + 40), sum(ResolutionWidth + 41), sum(ResolutionWidth + 42), sum(ResolutionWidth + 43), sum(ResolutionWidth + 44), sum(ResolutionWidth + 45), sum(ResolutionWidth + 46), sum(ResolutionWidth + 47), sum(ResolutionWidth + 48), sum(ResolutionWidth + 49), sum(ResolutionWidth + 50), sum(ResolutionWidth + 51), sum(ResolutionWidth + 52), sum(ResolutionWidth + 53), sum(ResolutionWidth + 54), sum(ResolutionWidth + 55), sum(ResolutionWidth + 56), sum(ResolutionWidth + 57), sum(ResolutionWidth + 58), sum(ResolutionWidth + 59), sum(ResolutionWidth + 60), sum(ResolutionWidth + 61), sum(ResolutionWidth + 62), sum(ResolutionWidth + 63), sum(ResolutionWidth + 64), sum(ResolutionWidth + 65), sum(ResolutionWidth + 66), sum(ResolutionWidth + 67), sum(ResolutionWidth + 68), sum(ResolutionWidth + 69), sum(ResolutionWidth + 70), sum(ResolutionWidth + 71), sum(ResolutionWidth + 72), sum(ResolutionWidth + 73), sum(ResolutionWidth + 74), sum(ResolutionWidth + 75), sum(ResolutionWidth + 76), sum(ResolutionWidth + 77), sum(ResolutionWidth + 78), sum(ResolutionWidth + 79), sum(ResolutionWidth + 80), sum(ResolutionWidth + 81), sum(ResolutionWidth + 82), sum(ResolutionWidth + 83), sum(ResolutionWidth + 84), sum(ResolutionWidth + 85), sum(ResolutionWidth + 86), sum(ResolutionWidth + 87), sum(ResolutionWidth + 88), sum(ResolutionWidth + 89) FROM hits_10m ;
-- many dumb aggregate functions.;
SELECT SearchEngineID, ClientIP, count() AS c, sum(Refresh), avg(ResolutionWidth) FROM hits_10m WHERE SearchPhrase != '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10;
-- complex aggregation, may run out of RAM on large tables.;
SELECT WatchID, ClientIP, count() AS c, sum(Refresh), avg(ResolutionWidth) FROM hits_10m WHERE SearchPhrase != '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;
-- aggregation by two fields that doesn't actually aggregate anything. It won't complete on large tables.;
SELECT WatchID, ClientIP, count() AS c, sum(Refresh), avg(ResolutionWidth) FROM hits_10m GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;
-- the same, but additionally without filtering.;
SELECT URL, count() AS c FROM hits_10m GROUP BY URL ORDER BY c DESC LIMIT 10;
-- aggregation by URL.;
SELECT 1, URL, count() AS c FROM hits_10m GROUP BY 1, URL ORDER BY c DESC LIMIT 10;
-- aggregation by URL and a number.;
SELECT ClientIP AS x, x - 1, x - 2, x - 3, count() AS c FROM hits_10m GROUP BY x, x - 1, x - 2, x - 3 ORDER BY c DESC LIMIT 10;
SELECT URL, count() AS PageViews FROM hits_10m WHERE CounterID = 34 AND EventDate >= toDate('2013-07-01') AND EventDate <= toDate('2013-07-31') AND NOT DontCountHits AND NOT Refresh AND notEmpty(URL) GROUP BY URL ORDER BY PageViews DESC LIMIT 10;
SELECT Title, count() AS PageViews FROM hits_10m WHERE CounterID = 34 AND EventDate >= toDate('2013-07-01') AND EventDate <= toDate('2013-07-31') AND NOT DontCountHits AND NOT Refresh AND notEmpty(Title) GROUP BY Title ORDER BY PageViews DESC LIMIT 10;
SELECT URL, count() AS PageViews FROM hits_10m WHERE CounterID = 34 AND EventDate >= toDate('2013-07-01') AND EventDate <= toDate('2013-07-31') AND NOT Refresh AND IsLink AND NOT IsDownload GROUP BY URL ORDER BY PageViews DESC LIMIT 1000;
SELECT TraficSourceID, SearchEngineID, AdvEngineID, ((SearchEngineID = 0 AND AdvEngineID = 0) ? Referer : '') AS Src, URL AS Dst, count() AS PageViews FROM hits_10m WHERE CounterID = 34 AND EventDate >= toDate('2013-07-01') AND EventDate <= toDate('2013-07-31') AND NOT Refresh GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 1000;
SELECT URLHash, EventDate, count() AS PageViews FROM hits_10m WHERE CounterID = 34 AND EventDate >= toDate('2013-07-01') AND EventDate <= toDate('2013-07-31') AND NOT Refresh AND TraficSourceID IN (-1, 6) AND RefererHash = halfMD5('http://example.ru/') GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 100;
SELECT WindowClientWidth, WindowClientHeight, count() AS PageViews FROM hits_10m WHERE CounterID = 34 AND EventDate >= toDate('2013-07-01') AND EventDate <= toDate('2013-07-31') AND NOT Refresh AND NOT DontCountHits AND URLHash = halfMD5('http://example.ru/') GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10000;
SELECT toStartOfMinute(EventTime) AS Minute, count() AS PageViews FROM hits_10m WHERE CounterID = 34 AND EventDate >= toDate('2013-07-01') AND EventDate <= toDate('2013-07-02') AND NOT Refresh AND NOT DontCountHits GROUP BY Minute ORDER BY Minute;
SELECT count() FROM {table};
SELECT count() FROM {table} WHERE AdvEngineID != 0;
SELECT sum(AdvEngineID), count(), avg(ResolutionWidth) FROM {table} ;
SELECT sum(UserID) FROM {table} ;
SELECT uniq(UserID) FROM {table} ;
SELECT uniq(SearchPhrase) FROM {table} ;
SELECT min(EventDate), max(EventDate) FROM {table} ;
SELECT AdvEngineID, count() FROM {table} WHERE AdvEngineID != 0 GROUP BY AdvEngineID ORDER BY count() DESC;
SELECT RegionID, uniq(UserID) AS u FROM {table} GROUP BY RegionID ORDER BY u DESC LIMIT 10;
SELECT RegionID, sum(AdvEngineID), count() AS c, avg(ResolutionWidth), uniq(UserID) FROM {table} GROUP BY RegionID ORDER BY c DESC LIMIT 10;
SELECT MobilePhoneModel, uniq(UserID) AS u FROM {table} WHERE MobilePhoneModel != '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;
SELECT MobilePhone, MobilePhoneModel, uniq(UserID) AS u FROM {table} WHERE MobilePhoneModel != '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;
SELECT SearchPhrase, count() AS c FROM {table} WHERE SearchPhrase != '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
SELECT SearchPhrase, uniq(UserID) AS u FROM {table} WHERE SearchPhrase != '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;
SELECT SearchEngineID, SearchPhrase, count() AS c FROM {table} WHERE SearchPhrase != '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10;
SELECT UserID, count() FROM {table} GROUP BY UserID ORDER BY count() DESC LIMIT 10;
SELECT UserID, SearchPhrase, count() FROM {table} GROUP BY UserID, SearchPhrase ORDER BY count() DESC LIMIT 10;
SELECT UserID, SearchPhrase, count() FROM {table} GROUP BY UserID, SearchPhrase LIMIT 10;
SELECT UserID, toMinute(EventTime) AS m, SearchPhrase, count() FROM {table} GROUP BY UserID, m, SearchPhrase ORDER BY count() DESC LIMIT 10;
SELECT UserID FROM {table} WHERE UserID = 12345678901234567890;
SELECT count() FROM {table} WHERE URL LIKE '%metrika%';
SELECT SearchPhrase, any(URL), count() AS c FROM {table} WHERE URL LIKE '%metrika%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
SELECT SearchPhrase, any(URL), any(Title), count() AS c, uniq(UserID) FROM {table} WHERE Title LIKE '%Яндекс%' AND URL NOT LIKE '%.yandex.%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
SELECT * FROM {table} WHERE URL LIKE '%metrika%' ORDER BY EventTime LIMIT 10;
SELECT SearchPhrase FROM {table} WHERE SearchPhrase != '' ORDER BY EventTime LIMIT 10;
SELECT SearchPhrase FROM {table} WHERE SearchPhrase != '' ORDER BY SearchPhrase LIMIT 10;
SELECT SearchPhrase FROM {table} WHERE SearchPhrase != '' ORDER BY EventTime, SearchPhrase LIMIT 10;
SELECT CounterID, avg(length(URL)) AS l, count() AS c FROM {table} WHERE URL != '' GROUP BY CounterID HAVING c > 100000 ORDER BY l DESC LIMIT 25;
SELECT domainWithoutWWW(Referer) AS key, avg(length(Referer)) AS l, count() AS c, any(Referer) FROM {table} WHERE Referer != '' GROUP BY key HAVING c > 100000 ORDER BY l DESC LIMIT 25;
SELECT sum(ResolutionWidth), sum(ResolutionWidth + 1), sum(ResolutionWidth + 2), sum(ResolutionWidth + 3), sum(ResolutionWidth + 4), sum(ResolutionWidth + 5), sum(ResolutionWidth + 6), sum(ResolutionWidth + 7), sum(ResolutionWidth + 8), sum(ResolutionWidth + 9), sum(ResolutionWidth + 10), sum(ResolutionWidth + 11), sum(ResolutionWidth + 12), sum(ResolutionWidth + 13), sum(ResolutionWidth + 14), sum(ResolutionWidth + 15), sum(ResolutionWidth + 16), sum(ResolutionWidth + 17), sum(ResolutionWidth + 18), sum(ResolutionWidth + 19), sum(ResolutionWidth + 20), sum(ResolutionWidth + 21), sum(ResolutionWidth + 22), sum(ResolutionWidth + 23), sum(ResolutionWidth + 24), sum(ResolutionWidth + 25), sum(ResolutionWidth + 26), sum(ResolutionWidth + 27), sum(ResolutionWidth + 28), sum(ResolutionWidth + 29), sum(ResolutionWidth + 30), sum(ResolutionWidth + 31), sum(ResolutionWidth + 32), sum(ResolutionWidth + 33), sum(ResolutionWidth + 34), sum(ResolutionWidth + 35), sum(ResolutionWidth + 36), sum(ResolutionWidth + 37), sum(ResolutionWidth + 38), sum(ResolutionWidth + 39), sum(ResolutionWidth + 40), sum(ResolutionWidth + 41), sum(ResolutionWidth + 42), sum(ResolutionWidth + 43), sum(ResolutionWidth + 44), sum(ResolutionWidth + 45), sum(ResolutionWidth + 46), sum(ResolutionWidth + 47), sum(ResolutionWidth + 48), sum(ResolutionWidth + 49), sum(ResolutionWidth + 50), sum(ResolutionWidth + 51), sum(ResolutionWidth + 52), sum(ResolutionWidth + 53), sum(ResolutionWidth + 54), sum(ResolutionWidth + 55), sum(ResolutionWidth + 56), sum(ResolutionWidth + 57), sum(ResolutionWidth + 58), sum(ResolutionWidth + 59), sum(ResolutionWidth + 60), sum(ResolutionWidth + 61), sum(ResolutionWidth + 62), sum(ResolutionWidth + 63), sum(ResolutionWidth + 64), sum(ResolutionWidth + 65), sum(ResolutionWidth + 66), sum(ResolutionWidth + 67), sum(ResolutionWidth + 68), sum(ResolutionWidth + 69), sum(ResolutionWidth + 70), sum(ResolutionWidth + 71), sum(ResolutionWidth + 72), sum(ResolutionWidth + 73), sum(ResolutionWidth + 74), sum(ResolutionWidth + 75), sum(ResolutionWidth + 76), sum(ResolutionWidth + 77), sum(ResolutionWidth + 78), sum(ResolutionWidth + 79), sum(ResolutionWidth + 80), sum(ResolutionWidth + 81), sum(ResolutionWidth + 82), sum(ResolutionWidth + 83), sum(ResolutionWidth + 84), sum(ResolutionWidth + 85), sum(ResolutionWidth + 86), sum(ResolutionWidth + 87), sum(ResolutionWidth + 88), sum(ResolutionWidth + 89) FROM {table};
SELECT SearchEngineID, ClientIP, count() AS c, sum(Refresh), avg(ResolutionWidth) FROM {table} WHERE SearchPhrase != '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10;
SELECT WatchID, ClientIP, count() AS c, sum(Refresh), avg(ResolutionWidth) FROM {table} WHERE SearchPhrase != '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;
SELECT WatchID, ClientIP, count() AS c, sum(Refresh), avg(ResolutionWidth) FROM {table} GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;
SELECT URL, count() AS c FROM {table} GROUP BY URL ORDER BY c DESC LIMIT 10;
SELECT 1, URL, count() AS c FROM {table} GROUP BY 1, URL ORDER BY c DESC LIMIT 10;
SELECT ClientIP AS x, x - 1, x - 2, x - 3, count() AS c FROM {table} GROUP BY x, x - 1, x - 2, x - 3 ORDER BY c DESC LIMIT 10;
SELECT URL, count() AS PageViews FROM {table} WHERE CounterID = 34 AND EventDate >= toDate('2013-07-01') AND EventDate <= toDate('2013-07-31') AND NOT DontCountHits AND NOT Refresh AND notEmpty(URL) GROUP BY URL ORDER BY PageViews DESC LIMIT 10;
SELECT Title, count() AS PageViews FROM {table} WHERE CounterID = 34 AND EventDate >= toDate('2013-07-01') AND EventDate <= toDate('2013-07-31') AND NOT DontCountHits AND NOT Refresh AND notEmpty(Title) GROUP BY Title ORDER BY PageViews DESC LIMIT 10;
SELECT URL, count() AS PageViews FROM {table} WHERE CounterID = 34 AND EventDate >= toDate('2013-07-01') AND EventDate <= toDate('2013-07-31') AND NOT Refresh AND IsLink AND NOT IsDownload GROUP BY URL ORDER BY PageViews DESC LIMIT 1000;
SELECT TraficSourceID, SearchEngineID, AdvEngineID, ((SearchEngineID = 0 AND AdvEngineID = 0) ? Referer : '') AS Src, URL AS Dst, count() AS PageViews FROM {table} WHERE CounterID = 34 AND EventDate >= toDate('2013-07-01') AND EventDate <= toDate('2013-07-31') AND NOT Refresh GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 1000;
SELECT URLHash, EventDate, count() AS PageViews FROM {table} WHERE CounterID = 34 AND EventDate >= toDate('2013-07-01') AND EventDate <= toDate('2013-07-31') AND NOT Refresh AND TraficSourceID IN (-1, 6) AND RefererHash = halfMD5('http://example.ru/') GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 100;
SELECT WindowClientWidth, WindowClientHeight, count() AS PageViews FROM {table} WHERE CounterID = 34 AND EventDate >= toDate('2013-07-01') AND EventDate <= toDate('2013-07-31') AND NOT Refresh AND NOT DontCountHits AND URLHash = halfMD5('http://example.ru/') GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10000;
SELECT toStartOfMinute(EventTime) AS Minute, count() AS PageViews FROM {table} WHERE CounterID = 34 AND EventDate >= toDate('2013-07-01') AND EventDate <= toDate('2013-07-02') AND NOT Refresh AND NOT DontCountHits GROUP BY Minute ORDER BY Minute;

View File

@ -1,18 +1,3 @@
path=/opt/dump/dump_0.3
db_name=hits_1b
num=1000000000
#!/bin/bash
dump_replaced=$path/dump_"$db_name"_replaced.tsv
dump_meshed=$path/dump_"$db_name"_meshed.tsv
dump_meshed_utf8=$path/dump_"$db_name"_meshed_utf8.tsv
clickhouse-client --query="SET GLOBAL max_block_size=100000"
clickhouse-client --query="SET GLOBAL max_threads=1"
clickhouse-client --query="SELECT toInt64(WatchID), JavaEnable, Title, GoodEvent, (EventTime < toDateTime('1971-01-01 00:00:00') ? toDateTime('1971-01-01 00:00:01') : EventTime), (EventDate < toDate('1971-01-01') ? toDate('1971-01-01') : EventDate), CounterID, ClientIP, RegionID, toInt64(UserID), CounterClass, OS, UserAgent, URL, Referer, Refresh, RefererCategoryID, RefererRegionID, URLCategoryID, URLRegionID, ResolutionWidth, ResolutionHeight, ResolutionDepth, FlashMajor, FlashMinor, FlashMinor2, NetMajor, NetMinor, UserAgentMajor, UserAgentMinor, CookieEnable, JavascriptEnable, IsMobile, MobilePhone, MobilePhoneModel, Params, IPNetworkID, TraficSourceID, SearchEngineID, SearchPhrase, AdvEngineID, IsArtifical, WindowClientWidth, WindowClientHeight, ClientTimeZone, (ClientEventTime < toDateTime('1971-01-01 00:00:01') ? toDateTime('1971-01-01 00:00:01') : ClientEventTime), SilverlightVersion1, SilverlightVersion2, SilverlightVersion3, SilverlightVersion4, PageCharset, CodeVersion, IsLink, IsDownload, IsNotBounce, toInt64(FUniqID), OriginalURL, HID, IsOldCounter, IsEvent, IsParameter, DontCountHits, WithHash, HitColor, (LocalEventTime < toDateTime('1971-01-01 00:00:01') ? toDateTime('1971-01-01 00:00:01') : LocalEventTime), Age, Sex, Income, Interests, Robotness, RemoteIP, WindowName, OpenerName, HistoryLength, BrowserLanguage, BrowserCountry, SocialNetwork, SocialAction, HTTPError, SendTiming, DNSTiming, ConnectTiming, ResponseStartTiming, ResponseEndTiming, FetchTiming, SocialSourceNetworkID, SocialSourcePage, ParamPrice, ParamOrderID, ParamCurrency, ParamCurrencyID, OpenstatServiceName, OpenstatCampaignID, OpenstatAdID, OpenstatSourceID, UTMSource, UTMMedium, UTMCampaign, UTMContent, UTMTerm, FromTag, HasGCLID, toInt64(RefererHash), toInt64(URLHash), CLID, toInt64(intHash32(UserID)) FROM hits_mt_test_1b LIMIT $num FORMAT TabSeparated" > $dump_replaced
/etc/init.d/clickhouse-server-metrika-yandex-ulimit restart
sudo nsort -format=maximum_size:65535 -k1 -T /opt -o $dump_meshed $dump_replaced
cat $dump_meshed | iconv -futf8 -tutf8//IGNORE 2>/dev/null 1> $dump_meshed_utf8
table=hits_10m; time clickhouse-client --max_bytes_before_external_sort=30000000000 --query="SELECT toInt64(WatchID), JavaEnable, Title, GoodEvent, (EventTime < toDateTime('1971-01-01 00:00:00') ? toDateTime('1971-01-01 00:00:01') : EventTime), (EventDate < toDate('1971-01-01') ? toDate('1971-01-01') : EventDate), CounterID, ClientIP, RegionID, toInt64(UserID), CounterClass, OS, UserAgent, URL, Referer, Refresh, RefererCategoryID, RefererRegionID, URLCategoryID, URLRegionID, ResolutionWidth, ResolutionHeight, ResolutionDepth, FlashMajor, FlashMinor, FlashMinor2, NetMajor, NetMinor, UserAgentMajor, UserAgentMinor, CookieEnable, JavascriptEnable, IsMobile, MobilePhone, MobilePhoneModel, Params, IPNetworkID, TraficSourceID, SearchEngineID, SearchPhrase, AdvEngineID, IsArtifical, WindowClientWidth, WindowClientHeight, ClientTimeZone, (ClientEventTime < toDateTime('1971-01-01 00:00:01') ? toDateTime('1971-01-01 00:00:01') : ClientEventTime), SilverlightVersion1, SilverlightVersion2, SilverlightVersion3, SilverlightVersion4, PageCharset, CodeVersion, IsLink, IsDownload, IsNotBounce, toInt64(FUniqID), OriginalURL, HID, IsOldCounter, IsEvent, IsParameter, DontCountHits, WithHash, HitColor, (LocalEventTime < toDateTime('1971-01-01 00:00:01') ? toDateTime('1971-01-01 00:00:01') : LocalEventTime), Age, Sex, Income, Interests, Robotness, RemoteIP, WindowName, OpenerName, HistoryLength, BrowserLanguage, BrowserCountry, SocialNetwork, SocialAction, HTTPError, SendTiming, DNSTiming, ConnectTiming, ResponseStartTiming, ResponseEndTiming, FetchTiming, SocialSourceNetworkID, SocialSourcePage, ParamPrice, ParamOrderID, ParamCurrency, ParamCurrencyID, OpenstatServiceName, OpenstatCampaignID, OpenstatAdID, OpenstatSourceID, UTMSource, UTMMedium, UTMCampaign, UTMContent, UTMTerm, FromTag, HasGCLID, toInt64(RefererHash), toInt64(URLHash), CLID FROM $table ORDER BY rand()" | corrector_utf8 > /opt/dumps/${table}_corrected.tsv

View File

@ -73,7 +73,7 @@ create table hits_10m
RemoteIP BIGINT,
WindowName INT,
OpenerName INT,
x HistoryLength SMALLINT,
HistoryLength SMALLINT,
BrowserLanguage CHAR(2),
BrowserCountry CHAR(2),
SocialNetwork VARCHAR(128),
@ -108,4 +108,4 @@ x HistoryLength SMALLINT,
UserIDHash BIGINT
);
load data infile '/opt/dump/dump_0.3/dump_hits_10m_meshed_utf8.tsv' into table hits_10m FIELDS TERMINATED BY '\t' ESCAPED BY '\\' ENCLOSED BY "NULL";
LOAD DATA INFILE '/opt/dump/dump_0.3/dump_hits_10m_meshed_utf8.tsv' INTO TABLE hits_10m FIELDS TERMINATED BY '\t' ESCAPED BY '\\' ENCLOSED BY "NULL";

View File

@ -1,26 +0,0 @@
#!/bin/bash
if [[ $# -ne 0 ]]; then
echo "usage: $0 takes no arguments; when free memory runs low it kills the process with the biggest memory consumption"
exit 1
fi
while true;
do
FREE_MEMORY_MB=$(free -m | sed -n '3,3p' | awk '{print $4}')
PID="$(ps -eF --sort -rss | sed -n '2,2p' | awk '{print $2}')"
NAME="$(ps -eF --sort -rss | sed -n '2,2p' | awk '{print $11}')"
SIZEGB="$(ps -eF --sort -rss | sed -n '2,2p' | awk '{print $6}')"
SIZEGB=$(($SIZEGB/1024/1024))
echo "Process id ="$PID" Size = "$SIZEGB" GB" "Free Memory = " $FREE_MEMORY_MB" MB"
if (( $FREE_MEMORY_MB < 512 ));
then echo "Killing the process with biggest memory consumption......"
sudo kill -9 $PID
echo "$(date) Killed the process with PID: $PID NAME: $NAME"
else
echo "SIZE has not yet exceeding"
fi
sleep 10
done

View File

@ -0,0 +1,20 @@
#!/bin/bash
QUERIES_FILE="queries.sql"
TABLE=$1
TRIES=3
cat "$QUERIES_FILE" | sed "s/{table}/${TABLE}/g" | while read query; do
sync
echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null
echo -n "["
for i in $(seq 1 $TRIES); do
RES=$(mysql -u root -h 127.0.0.1 -P 3306 --database=test -t -vvv -e "$query" 2>&1 | grep 'in set' | grep -oP '\d+\.\d+')
[[ "$?" == "0" ]] && echo -n "$RES" || echo -n "null"
[[ "$i" != $TRIES ]] && echo -n ", "
done
echo "],"
done
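The grep chain above picks the timing out of the summary line that mysql prints in -vvv mode; an illustrative example of the line being matched (the row count is made up):

10 rows in set (1.23 sec)

from which grep -oP '\d+\.\d+' extracts 1.23 as the reported query time.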

View File

@ -0,0 +1,139 @@
http://www.memsql.com/download/
http://docs.memsql.com/docs/latest/setup/setup_onprem.html
wget http://download.memsql.com/8d9f4c4d99a547baa40ba097b171bd15/memsql-3.2.x86_64.deb
scp memsql-3.2.x86_64.deb example05e:~
ssh example05e
sudo dpkg -i memsql-3.2.x86_64.deb
sudo mkdir /opt/memsql-data/
sudo cp -r /var/lib/memsql/data/* /opt/memsql-data/
sudo rm -rf /var/lib/memsql/data
sudo ln -s /opt/memsql-data /var/lib/memsql/data
sudo chown -R memsql /opt/memsql-data
sudo chown -R memsql /var/lib/memsql/data
sudo service memsql start
mysql -u root -h 127.0.0.1 -P 3306 --prompt="memsql> "
CREATE DATABASE test;
USE test;
CREATE TABLE hits_10m
(
WatchID BIGINT,
JavaEnable SMALLINT,
Title VARCHAR(1400),
GoodEvent SMALLINT,
EventTime TIMESTAMP,
EventDate DATE,
CounterID BIGINT,
ClientIP BIGINT,
RegionID BIGINT,
UserID BIGINT,
CounterClass TINYINT,
OS SMALLINT,
UserAgent SMALLINT,
URL VARCHAR(7800),
Referer VARCHAR(3125),
Refresh TINYINT,
RefererCategoryID INT,
RefererRegionID BIGINT,
URLCategoryID INT,
URLRegionID BIGINT,
ResolutionWidth INT,
ResolutionHeight INT,
ResolutionDepth SMALLINT,
FlashMajor SMALLINT,
FlashMinor SMALLINT,
FlashMinor2 VARCHAR(256),
NetMajor SMALLINT,
NetMinor SMALLINT,
UserAgentMajor INT,
UserAgentMinor CHAR(2),
CookieEnable SMALLINT,
JavascriptEnable SMALLINT,
IsMobile SMALLINT,
MobilePhone SMALLINT,
MobilePhoneModel VARCHAR(80),
Params VARCHAR(2925),
IPNetworkID BIGINT,
TraficSourceID SMALLINT,
SearchEngineID INT,
SearchPhrase VARCHAR(2008),
AdvEngineID SMALLINT,
IsArtifical SMALLINT,
WindowClientWidth INT,
WindowClientHeight INT,
ClientTimeZone INTEGER,
ClientEventTime TIMESTAMP,
SilverlightVersion1 SMALLINT,
SilverlightVersion2 SMALLINT,
SilverlightVersion3 BIGINT,
SilverlightVersion4 INT,
PageCharset VARCHAR(80),
CodeVersion BIGINT,
IsLink SMALLINT,
IsDownload SMALLINT,
IsNotBounce SMALLINT,
FUniqID BIGINT,
OriginalURL VARCHAR(8181),
HID BIGINT,
IsOldCounter SMALLINT,
IsEvent SMALLINT,
IsParameter SMALLINT,
DontCountHits SMALLINT,
WithHash SMALLINT,
HitColor CHAR(1),
LocalEventTime TIMESTAMP,
Age SMALLINT,
Sex SMALLINT,
Income SMALLINT,
Interests INT,
Robotness SMALLINT,
RemoteIP BIGINT,
WindowName INT,
OpenerName INT,
HistoryLength SMALLINT,
BrowserLanguage CHAR(2),
BrowserCountry CHAR(2),
SocialNetwork VARCHAR(128),
SocialAction VARCHAR(128),
HTTPError INT,
SendTiming BIGINT,
DNSTiming BIGINT,
ConnectTiming BIGINT,
ResponseStartTiming BIGINT,
ResponseEndTiming BIGINT,
FetchTiming BIGINT,
SocialSourceNetworkID SMALLINT,
SocialSourcePage VARCHAR(256),
ParamPrice BIGINT,
ParamOrderID VARCHAR(80),
ParamCurrency CHAR(3),
ParamCurrencyID INT,
OpenstatServiceName VARCHAR(80),
OpenstatCampaignID VARCHAR(512),
OpenstatAdID VARCHAR(80),
OpenstatSourceID VARCHAR(256),
UTMSource VARCHAR(256),
UTMMedium VARCHAR(256),
UTMCampaign VARCHAR(407),
UTMContent VARCHAR(256),
UTMTerm VARCHAR(437),
FromTag VARCHAR(428),
HasGCLID SMALLINT,
RefererHash BIGINT,
URLHash BIGINT,
CLID BIGINT,
INDEX ColumnStoreIndex USING CLUSTERED COLUMNSTORE (CounterID, EventDate, UserID, EventTime)
);
Table creation takes about 15 seconds.
LOAD DATA INFILE '/opt/dumps/hits_10m_corrected.tsv' INTO TABLE hits_10m;
Load time: 12 min 24.51 sec (13422 rows/sec).
Data size: 1 613 773 528 bytes.
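As a sanity check on the load rate: 12 min 24.51 sec is about 745 seconds, and 10 000 000 rows / 745 sec ≈ 13 400 rows/sec, consistent with the figure above.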

View File

@ -0,0 +1,43 @@
SELECT count(*) FROM hits_10m;
SELECT count(*) FROM hits_10m WHERE AdvEngineID != 0;
SELECT sum(AdvEngineID), count(*), avg(ResolutionWidth) FROM hits_10m;
SELECT sum(UserID) FROM hits_10m;
SELECT count(DISTINCT UserID) FROM hits_10m;
SELECT count(DISTINCT SearchPhrase) FROM hits_10m;
SELECT min(EventDate), max(EventDate) FROM hits_10m;
SELECT AdvEngineID, count(*) FROM hits_10m WHERE AdvEngineID != 0 GROUP BY AdvEngineID ORDER BY count(*) DESC;
SELECT RegionID, count(DISTINCT UserID) AS u FROM hits_10m GROUP BY RegionID ORDER BY u DESC LIMIT 10;
SELECT RegionID, sum(AdvEngineID), count(*) AS c, avg(ResolutionWidth), count(DISTINCT UserID) FROM hits_10m GROUP BY RegionID ORDER BY count(*) DESC LIMIT 10;
SELECT MobilePhoneModel, count(DISTINCT UserID) AS u FROM hits_10m WHERE MobilePhoneModel != '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;
SELECT MobilePhone, MobilePhoneModel, count(DISTINCT UserID) AS u FROM hits_10m WHERE MobilePhoneModel != '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;
SELECT SearchPhrase, count(*) FROM hits_10m WHERE SearchPhrase != '' GROUP BY SearchPhrase ORDER BY count(*) DESC LIMIT 10;
SELECT SearchPhrase, count(DISTINCT UserID) AS u FROM hits_10m WHERE SearchPhrase != '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;
SELECT SearchEngineID, SearchPhrase, count(*) FROM hits_10m WHERE SearchPhrase != '' GROUP BY SearchEngineID, SearchPhrase ORDER BY count(*) DESC LIMIT 10;
SELECT UserID, count(*) FROM hits_10m GROUP BY UserID ORDER BY count(*) DESC LIMIT 10;
SELECT UserID, SearchPhrase, count(*) FROM hits_10m GROUP BY UserID, SearchPhrase ORDER BY count(*) DESC LIMIT 10;
SELECT UserID, SearchPhrase, count(*) FROM hits_10m GROUP BY UserID, SearchPhrase LIMIT 10;
SELECT UserID, Minute(EventTime) AS m, SearchPhrase, count(*) FROM hits_10m GROUP BY UserID, m, SearchPhrase ORDER BY count(*) DESC LIMIT 10;
SELECT UserID FROM hits_10m WHERE UserID = 123456789;
SELECT count(*) FROM hits_10m WHERE URL LIKE '%metrika%';
SELECT SearchPhrase, MAX(URL), count(*) FROM hits_10m WHERE URL LIKE '%metrika%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY count(*) DESC LIMIT 10;
SELECT SearchPhrase, MAX(URL), MAX(Title), count(*) AS c, count(DISTINCT UserID) FROM hits_10m WHERE Title LIKE '%Яндекс%' AND URL NOT LIKE '%.yandex.%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY count(*) DESC LIMIT 10;
SELECT * FROM hits_10m WHERE URL LIKE '%metrika%' ORDER BY EventTime LIMIT 10;
SELECT SearchPhrase FROM hits_10m WHERE SearchPhrase != '' ORDER BY EventTime LIMIT 10;
SELECT SearchPhrase FROM hits_10m WHERE SearchPhrase != '' ORDER BY SearchPhrase LIMIT 10;
SELECT SearchPhrase FROM hits_10m WHERE SearchPhrase != '' ORDER BY EventTime, SearchPhrase LIMIT 10;
SELECT CounterID, avg(length(URL)) AS l, count(*) FROM hits_10m WHERE URL != '' GROUP BY CounterID HAVING count(*) > 100000 ORDER BY l DESC LIMIT 25;
SELECT SUBSTRING(SUBSTRING(Referer, POSITION('//' IN Referer) + 2), 1, GREATEST(0, POSITION('/' IN SUBSTRING(Referer, POSITION('//' IN Referer) + 2)) - 1)) AS k, avg(length(Referer)) AS l, count(*) AS c, MAX(Referer) FROM hits_10m WHERE Referer != '' GROUP BY k HAVING count(*) > 100000 ORDER BY l DESC LIMIT 25;
SELECT sum(ResolutionWidth), sum(ResolutionWidth + 1), sum(ResolutionWidth + 2), sum(ResolutionWidth + 3), sum(ResolutionWidth + 4), sum(ResolutionWidth + 5), sum(ResolutionWidth + 6), sum(ResolutionWidth + 7), sum(ResolutionWidth + 8), sum(ResolutionWidth + 9), sum(ResolutionWidth + 10), sum(ResolutionWidth + 11), sum(ResolutionWidth + 12), sum(ResolutionWidth + 13), sum(ResolutionWidth + 14), sum(ResolutionWidth + 15), sum(ResolutionWidth + 16), sum(ResolutionWidth + 17), sum(ResolutionWidth + 18), sum(ResolutionWidth + 19), sum(ResolutionWidth + 20), sum(ResolutionWidth + 21), sum(ResolutionWidth + 22), sum(ResolutionWidth + 23), sum(ResolutionWidth + 24), sum(ResolutionWidth + 25), sum(ResolutionWidth + 26), sum(ResolutionWidth + 27), sum(ResolutionWidth + 28), sum(ResolutionWidth + 29), sum(ResolutionWidth + 30), sum(ResolutionWidth + 31), sum(ResolutionWidth + 32), sum(ResolutionWidth + 33), sum(ResolutionWidth + 34), sum(ResolutionWidth + 35), sum(ResolutionWidth + 36), sum(ResolutionWidth + 37), sum(ResolutionWidth + 38), sum(ResolutionWidth + 39), sum(ResolutionWidth + 40), sum(ResolutionWidth + 41), sum(ResolutionWidth + 42), sum(ResolutionWidth + 43), sum(ResolutionWidth + 44), sum(ResolutionWidth + 45), sum(ResolutionWidth + 46), sum(ResolutionWidth + 47), sum(ResolutionWidth + 48), sum(ResolutionWidth + 49), sum(ResolutionWidth + 50), sum(ResolutionWidth + 51), sum(ResolutionWidth + 52), sum(ResolutionWidth + 53), sum(ResolutionWidth + 54), sum(ResolutionWidth + 55), sum(ResolutionWidth + 56), sum(ResolutionWidth + 57), sum(ResolutionWidth + 58), sum(ResolutionWidth + 59), sum(ResolutionWidth + 60), sum(ResolutionWidth + 61), sum(ResolutionWidth + 62), sum(ResolutionWidth + 63), sum(ResolutionWidth + 64), sum(ResolutionWidth + 65), sum(ResolutionWidth + 66), sum(ResolutionWidth + 67), sum(ResolutionWidth + 68), sum(ResolutionWidth + 69), sum(ResolutionWidth + 70), sum(ResolutionWidth + 71), sum(ResolutionWidth + 72), sum(ResolutionWidth + 73), sum(ResolutionWidth + 74), sum(ResolutionWidth + 75), sum(ResolutionWidth + 76), sum(ResolutionWidth + 77), sum(ResolutionWidth + 78), sum(ResolutionWidth + 79), sum(ResolutionWidth + 80), sum(ResolutionWidth + 81), sum(ResolutionWidth + 82), sum(ResolutionWidth + 83), sum(ResolutionWidth + 84), sum(ResolutionWidth + 85), sum(ResolutionWidth + 86), sum(ResolutionWidth + 87), sum(ResolutionWidth + 88), sum(ResolutionWidth + 89) FROM hits_10m;
SELECT SearchEngineID, ClientIP, count(*) AS c, sum(Refresh), avg(ResolutionWidth) FROM hits_10m WHERE SearchPhrase != '' GROUP BY SearchEngineID, ClientIP ORDER BY count(*) DESC LIMIT 10;
SELECT WatchID, ClientIP, count(*) AS c, sum(Refresh), avg(ResolutionWidth) FROM hits_10m WHERE SearchPhrase != '' GROUP BY WatchID, ClientIP ORDER BY count(*) DESC LIMIT 10;
SELECT WatchID, ClientIP, count(*) AS c, sum(Refresh), avg(ResolutionWidth) FROM hits_10m GROUP BY WatchID, ClientIP ORDER BY count(*) DESC LIMIT 10;
SELECT URL, count(*) FROM hits_10m GROUP BY URL ORDER BY count(*) DESC LIMIT 10;
SELECT 1, URL, count(*) FROM hits_10m GROUP BY 1, URL ORDER BY count(*) DESC LIMIT 10;
SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, count(*) FROM hits_10m GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY count(*) DESC LIMIT 10;
SELECT URL, count(*) AS PageViews FROM hits_10m WHERE CounterID = 34 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT DontCountHits AND NOT Refresh AND URL != '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10;
SELECT Title, count(*) AS PageViews FROM hits_10m WHERE CounterID = 34 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT DontCountHits AND NOT Refresh AND Title != '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10;
SELECT URL, count(*) AS PageViews FROM hits_10m WHERE CounterID = 34 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT Refresh AND IsLink AND NOT IsDownload GROUP BY URL ORDER BY PageViews DESC LIMIT 1000;
SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN SearchEngineID = 0 AND AdvEngineID = 0 THEN Referer ELSE '' END AS Src, URL AS Dst, count(*) AS PageViews FROM hits_10m WHERE CounterID = 34 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT Refresh GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 1000;
SELECT URLHash, EventDate, count(*) AS PageViews FROM hits_10m WHERE CounterID = 34 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT Refresh AND TraficSourceID IN (-1, 6) AND RefererHash = 6202628419148573758 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 100000;
SELECT WindowClientWidth, WindowClientHeight, count(*) AS PageViews FROM hits_10m WHERE CounterID = 34 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT Refresh AND NOT DontCountHits AND URLHash = 6202628419148573758 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10000;
SELECT EventTime - INTERVAL SECOND(EventTime) SECOND AS Minute, count(*) AS PageViews FROM hits_10m WHERE CounterID = 34 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-02') AND NOT Refresh AND NOT DontCountHits GROUP BY Minute ORDER BY Minute;

View File

@ -1,114 +0,0 @@
import argparse
import re
import sys
def log_to_rows(filename, pattern_select, time_pattern, pattern_ignore):
time_matcher = re.compile(time_pattern)
select_matcher = re.compile(pattern_select, re.IGNORECASE)
ignore_matcher = re.compile(pattern_ignore)
f = open(filename, 'r')
query = ''
raw_time = ''
for line in f:
if ignore_matcher.match(line):
continue
m = select_matcher.search(line)
if m :
if line != query:
query = line
sys.stdout.write("\n")
raw_time = raw_time + "\n"
m = time_matcher.search(line)
if m:
sec = 0
minute = 0
ms = 0
if 'min' in m.groupdict() and m.group('min'):
minute = float(m.group('min').replace(',','.'))
if 'sec' in m.groupdict() and m.group('sec'):
sec = float(m.group('sec').replace(',','.'))
if 'ms' in m.groupdict() and m.group('ms'):
ms = float(m.group('ms').replace(',', '.'))
sys.stdout.write( str(minute*60 + sec + ms/1000.) + " " )
raw_time = raw_time + " | " + m.group('time')
print
print " =======raw time====== \n" + raw_time
def process_log(filename, pattern_select, time_pattern, pattern_ignore, error_pattern):
time_matcher = re.compile(time_pattern)
select_matcher = re.compile(pattern_select, re.IGNORECASE)
ignore_matcher = re.compile(pattern_ignore)
error_matcher = re.compile(error_pattern, re.IGNORECASE)
f = open(filename, 'r')
query = ''
for line in f:
if error_matcher.match(line):
print line
continue
if ignore_matcher.match(line):
continue
m = select_matcher.search(line)
if m :
if line != query:
sys.stdout.flush()
query = line
print "\n\n"
print query
m = time_matcher.search(line)
if m:
sys.stdout.write(m.group('time') + " " )
def main():
parser = argparse.ArgumentParser(description="Process log files form different databases")
parser.add_argument('log_file', metavar = 'log_file', help = 'database log file')
parser.add_argument('db_name', metavar = 'db_name', help = ' database name one of clickhouse, vertica, infinidb, monetdb, infobright, hive (... more later)')
args = parser.parse_args()
log_file = args.log_file
db_name = args.db_name
time_pattern = ''
select_pattern = r'query: select '
ignore_pattern = r'#'
error_pattern = r'error .*'
if db_name == 'clickhouse':
time_pattern = r'(?P<time>(?P<sec>\d+.\d{3}) sec\.)'
select_pattern = r'query\: select '
ignore_pattern = r':\).*'
elif db_name == 'vertica' :
time_pattern = r'(?P<time>(?P<ms>\d+.\d+) ms\.)'
select_pattern = r'select '
ignore_pattern = r'(.*dbadmin=>|query:|.*Timing is on\.).*'
elif db_name == 'infinidb' :
time_pattern = r'(?P<time>(?:(?P<min>\d+) min )?(?P<sec>\d+.\d+) sec)'
ignore_pattern = r'Query OK, 0 rows affected \(0\.00 sec\)'
elif db_name == 'monetdb' :
time_pattern = r'tuples? \((?P<time>(?:(?P<min>\d+)m )?(?:(?P<sec>\d+.?\d+)s)?(?:(?P<ms>\d+.\d+)ms)?)\)'
elif db_name == 'infobright' :
time_pattern = r'(?P<time>(?:(?P<min>\d+) min ){0,1}(?P<sec>\d+.\d+) sec)'
elif db_name == 'hive':
time_pattern = r'Time taken\: (?P<time>(?:(?P<sec>\d+.?\d+) seconds))'
error_pattern = r'failed\: .*'
elif db_name == 'mysql':
time_pattern = r'(?P<time>(?:(?P<min>\d+) min )?(?P<sec>\d+.\d+) sec)'
else:
sys.exit("unknown db_name")
process_log(log_file, select_pattern, time_pattern, ignore_pattern, error_pattern )
log_to_rows(log_file, select_pattern, time_pattern, ignore_pattern )
main()
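For example, to extract timings from a Vertica session log (the log file name here is hypothetical):
python process_log.py vertica_session.log vertica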

View File

@ -0,0 +1,41 @@
Quick installation instructions
-------------------------------
Register on my.vertica.com
https://my.vertica.com/download-community-edition/
Download HP Vertica 7.1.1 Analytic Database Server, Debian or Ubuntu 14.04 version.
sudo apt-get install sysstat pstack mcelog
sudo dpkg -i vertica_7.1.1-0_amd64.deb
sudo sh -c "echo 'export TZ=Europe/Moscow' >> /home/dbadmin/.bash_profile"
sudo /opt/vertica/sbin/install_vertica --hosts=127.0.0.1 --failure-threshold=NONE
sudo mkdir /opt/vertica-data/
sudo chown dbadmin /opt/vertica-data/
sudo su dbadmin
/opt/vertica/bin/adminTools
configuration menu
create database
name: default
empty password
both directories: /opt/vertica-data/
main menu
exit
P.S. Note that Vertica doesn't support IPv6.
How to prepare data
-------------------
Prepare dumps with the create_dump.sh script for the tables hits_10m, hits_100m and hits_1000m. This takes about 5 hours in total (1m41.882s, 25m11.103s and 276m36.388s respectively).
Start the vsql command-line client.
/opt/vertica/bin/vsql -U dbadmin
Create the tables with the queries from hits_define_schema.sql.
Time to insert the data:
hits_10m: 91 sec.
hits_100m: 774 sec.
hits_1000m: 13769 sec.
Validate the number of rows with SELECT count(*).
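For example, a quick check for one of the tables (run the same for the other two):
/opt/vertica/bin/vsql -U dbadmin -c "SELECT count(*) FROM hits_10m;"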

View File

@ -0,0 +1,24 @@
#!/bin/bash
QUERIES_FILE="queries.sql"
TABLE=$1
TRIES=3
cat "$QUERIES_FILE" | sed "s/{table}/${TABLE}/g" | while read query; do
sync
echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null
echo -n "["
for i in $(seq 1 $TRIES); do
RES=$( (echo '\timing'; echo "$query") |
/opt/vertica/bin/vsql -U dbadmin |
grep -oP 'All rows formatted: [^ ]+ ms' |
ssed -R -e 's/^All rows formatted: ([\d,]+) ms$/\1/' |
tr ',' '.')
[[ "$?" == "0" ]] && echo -n "$(perl -e "print ${RES} / 1000")" || echo -n "null"
[[ "$i" != $TRIES ]] && echo -n ", "
done
echo "],"
done
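For example, to benchmark the hits_10m table (assuming the script above is saved as benchmark.sh):
./benchmark.sh hits_10m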

View File

@ -1,21 +0,0 @@
#!/bin/expect
# Set timeout
set timeout 600
# Get arguments
set query [lindex $argv 0]
spawn vsql -eU dbadmin
expect "dbadmin=>"
send "\\timing\r"
expect "dbadmin=>"
send "$query\r"
expect "dbadmin=>"
send "\\q\r"
expect eof

View File

@ -1,6 +1,6 @@
\timing
create table hits_10m_meshed
create table hits_10m
(
WatchID INTEGER,
JavaEnable INTEGER,
@ -106,17 +106,14 @@ create table hits_10m_meshed
HasGCLID INTEGER,
RefererHash INTEGER,
URLHash INTEGER,
CLID INTEGER,
UserIDHash INTEGER
) ORDER BY CounterID, EventDate, UserIDHash, EventTime;
CLID INTEGER
) ORDER BY CounterID, EventDate, UserID, EventTime;
\set input_file '''/opt/dumps/hits_10m_corrected.tsv'''
COPY hits_10m FROM :input_file DELIMITER E'\t' DIRECT;
\set input_file '''/opt/dump/dump_0.3/dump_hits_10m_meshed.tsv'''
COPY hits_10m_meshed FROM :input_file DELIMITER E'\t' DIRECT;
create table hits_100m_meshed
create table hits_100m
(
WatchID INTEGER,
JavaEnable INTEGER,
@ -222,17 +219,14 @@ create table hits_100m_meshed
HasGCLID INTEGER,
RefererHash INTEGER,
URLHash INTEGER,
CLID INTEGER,
UserIDHash INTEGER
) ORDER BY CounterID, EventDate, UserIDHash, EventTime;;
CLID INTEGER
) ORDER BY CounterID, EventDate, UserID, EventTime;
\set input_file '''/opt/dump/dump_0.3/dump_hits_100m_meshed.tsv'''
COPY hits_100m_meshed FROM :input_file DELIMITER E'\t' DIRECT;
\set input_file '''/opt/dumps/hits_100m_corrected.tsv'''
COPY hits_100m FROM :input_file DELIMITER E'\t' DIRECT;
create table hits_1b_meshed
create table hits_1000m
(
WatchID INTEGER,
JavaEnable INTEGER,
@ -338,10 +332,8 @@ create table hits_1b_meshed
HasGCLID INTEGER,
RefererHash INTEGER,
URLHash INTEGER,
CLID INTEGER,
UserIDHash INTEGER
) ORDER BY CounterID, EventDate, UserIDHash, EventTime;
CLID INTEGER
) ORDER BY CounterID, EventDate, UserID, EventTime;
\set input_file '''/opt/dump/dump_0.3/dump_hits_1b_meshed.tsv'''
COPY hits_1b_meshed FROM :input_file DELIMITER E'\t' DIRECT;
\set input_file '''/opt/dumps/hits_1000m_corrected.tsv'''
COPY hits_1000m FROM :input_file DELIMITER E'\t' DIRECT;

View File

@ -0,0 +1,43 @@
SELECT count(*) FROM {table};
SELECT count(*) FROM {table} WHERE AdvEngineID != 0;
SELECT sum(AdvEngineID), count(*), avg(ResolutionWidth) FROM {table};
SELECT sum_float(UserID) FROM {table};
SELECT COUNT(DISTINCT UserID) FROM {table};
SELECT COUNT(DISTINCT SearchPhrase) FROM {table};
SELECT min(EventDate), max(EventDate) FROM {table};
SELECT AdvEngineID, count(*) FROM {table} WHERE AdvEngineID != 0 GROUP BY AdvEngineID ORDER BY count(*) DESC;
SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM {table} GROUP BY RegionID ORDER BY u DESC LIMIT 10;
SELECT RegionID, sum(AdvEngineID), count(*) AS c, avg(ResolutionWidth), COUNT(DISTINCT UserID) FROM {table} GROUP BY RegionID ORDER BY count(*) DESC LIMIT 10;
SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM {table} WHERE MobilePhoneModel != '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;
SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM {table} WHERE MobilePhoneModel != '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;
SELECT SearchPhrase, count(*) FROM {table} WHERE SearchPhrase != '' GROUP BY SearchPhrase ORDER BY count(*) DESC LIMIT 10;
SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM {table} WHERE SearchPhrase != '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;
SELECT SearchEngineID, SearchPhrase, count(*) FROM {table} WHERE SearchPhrase != '' GROUP BY SearchEngineID, SearchPhrase ORDER BY count(*) DESC LIMIT 10;
SELECT UserID, count(*) FROM {table} GROUP BY UserID ORDER BY count(*) DESC LIMIT 10;
SELECT UserID, SearchPhrase, count(*) FROM {table} GROUP BY UserID, SearchPhrase ORDER BY count(*) DESC LIMIT 10;
SELECT UserID, SearchPhrase, count(*) FROM {table} GROUP BY UserID, SearchPhrase LIMIT 10;
SELECT UserID, Minute(EventTime) AS m, SearchPhrase, count(*) FROM {table} GROUP BY UserID, m, SearchPhrase ORDER BY count(*) DESC LIMIT 10;
SELECT UserID FROM {table} WHERE UserID = 12345678901234567890;
SELECT count(*) FROM {table} WHERE URL LIKE '%metrika%';
SELECT SearchPhrase, MAX(URL), count(*) FROM {table} WHERE URL LIKE '%metrika%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY count(*) DESC LIMIT 10;
SELECT SearchPhrase, MAX(URL), MAX(Title), count(*) AS c, COUNT(DISTINCT UserID) FROM {table} WHERE Title LIKE '%Яндекс%' AND URL NOT LIKE '%.yandex.%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY count(*) DESC LIMIT 10;
SELECT * FROM {table} WHERE URL LIKE '%metrika%' ORDER BY EventTime LIMIT 10;
SELECT SearchPhrase FROM {table} WHERE SearchPhrase != '' ORDER BY EventTime LIMIT 10;
SELECT SearchPhrase FROM {table} WHERE SearchPhrase != '' ORDER BY SearchPhrase LIMIT 10;
SELECT SearchPhrase FROM {table} WHERE SearchPhrase != '' ORDER BY EventTime, SearchPhrase LIMIT 10;
SELECT CounterID, avg(OCTET_LENGTH(URL)) AS l, count(*) FROM {table} WHERE URL != '' GROUP BY CounterID HAVING count(*) > 100000 ORDER BY l DESC LIMIT 25;
SELECT SUBSTRB(SUBSTRB(Referer, POSITIONB(Referer, '//') + 2), 1, GREATEST(0, POSITIONB(SUBSTRB(Referer, POSITIONB(Referer, '//') + 2), '/') - 1)) AS key, avg(OCTET_LENGTH(Referer)) AS l, count(*) AS c, MAX(Referer) FROM {table} WHERE Referer != '' GROUP BY key HAVING count(*) > 100000 ORDER BY l DESC LIMIT 25;
SELECT sum(ResolutionWidth), sum(ResolutionWidth + 1), sum(ResolutionWidth + 2), sum(ResolutionWidth + 3), sum(ResolutionWidth + 4), sum(ResolutionWidth + 5), sum(ResolutionWidth + 6), sum(ResolutionWidth + 7), sum(ResolutionWidth + 8), sum(ResolutionWidth + 9), sum(ResolutionWidth + 10), sum(ResolutionWidth + 11), sum(ResolutionWidth + 12), sum(ResolutionWidth + 13), sum(ResolutionWidth + 14), sum(ResolutionWidth + 15), sum(ResolutionWidth + 16), sum(ResolutionWidth + 17), sum(ResolutionWidth + 18), sum(ResolutionWidth + 19), sum(ResolutionWidth + 20), sum(ResolutionWidth + 21), sum(ResolutionWidth + 22), sum(ResolutionWidth + 23), sum(ResolutionWidth + 24), sum(ResolutionWidth + 25), sum(ResolutionWidth + 26), sum(ResolutionWidth + 27), sum(ResolutionWidth + 28), sum(ResolutionWidth + 29), sum(ResolutionWidth + 30), sum(ResolutionWidth + 31), sum(ResolutionWidth + 32), sum(ResolutionWidth + 33), sum(ResolutionWidth + 34), sum(ResolutionWidth + 35), sum(ResolutionWidth + 36), sum(ResolutionWidth + 37), sum(ResolutionWidth + 38), sum(ResolutionWidth + 39), sum(ResolutionWidth + 40), sum(ResolutionWidth + 41), sum(ResolutionWidth + 42), sum(ResolutionWidth + 43), sum(ResolutionWidth + 44), sum(ResolutionWidth + 45), sum(ResolutionWidth + 46), sum(ResolutionWidth + 47), sum(ResolutionWidth + 48), sum(ResolutionWidth + 49), sum(ResolutionWidth + 50), sum(ResolutionWidth + 51), sum(ResolutionWidth + 52), sum(ResolutionWidth + 53), sum(ResolutionWidth + 54), sum(ResolutionWidth + 55), sum(ResolutionWidth + 56), sum(ResolutionWidth + 57), sum(ResolutionWidth + 58), sum(ResolutionWidth + 59), sum(ResolutionWidth + 60), sum(ResolutionWidth + 61), sum(ResolutionWidth + 62), sum(ResolutionWidth + 63), sum(ResolutionWidth + 64), sum(ResolutionWidth + 65), sum(ResolutionWidth + 66), sum(ResolutionWidth + 67), sum(ResolutionWidth + 68), sum(ResolutionWidth + 69), sum(ResolutionWidth + 70), sum(ResolutionWidth + 71), sum(ResolutionWidth + 72), sum(ResolutionWidth + 73), sum(ResolutionWidth + 74), sum(ResolutionWidth + 75), sum(ResolutionWidth + 76), sum(ResolutionWidth + 77), sum(ResolutionWidth + 78), sum(ResolutionWidth + 79), sum(ResolutionWidth + 80), sum(ResolutionWidth + 81), sum(ResolutionWidth + 82), sum(ResolutionWidth + 83), sum(ResolutionWidth + 84), sum(ResolutionWidth + 85), sum(ResolutionWidth + 86), sum(ResolutionWidth + 87), sum(ResolutionWidth + 88), sum(ResolutionWidth + 89) FROM {table};
SELECT SearchEngineID, ClientIP, count(*) AS c, sum(Refresh), avg(ResolutionWidth) FROM {table} WHERE SearchPhrase != '' GROUP BY SearchEngineID, ClientIP ORDER BY count(*) DESC LIMIT 10;
SELECT WatchID, ClientIP, count(*) AS c, sum(Refresh), avg(ResolutionWidth) FROM {table} WHERE SearchPhrase != '' GROUP BY WatchID, ClientIP ORDER BY count(*) DESC LIMIT 10;
SELECT WatchID, ClientIP, count(*) AS c, sum(Refresh), avg(ResolutionWidth) FROM {table} GROUP BY WatchID, ClientIP ORDER BY count(*) DESC LIMIT 10;
SELECT URL, count(*) FROM {table} GROUP BY URL ORDER BY count(*) DESC LIMIT 10;
SELECT 1, URL, count(*) FROM {table} GROUP BY 1, URL ORDER BY count(*) DESC LIMIT 10;
SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, count(*) FROM {table} GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY count(*) DESC LIMIT 10;
SELECT URL, count(*) AS PageViews FROM {table} WHERE CounterID = 34 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT DontCountHits AND NOT Refresh AND URL != '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10;
SELECT Title, count(*) AS PageViews FROM {table} WHERE CounterID = 34 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT DontCountHits AND NOT Refresh AND Title != '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10;
SELECT URL, count(*) AS PageViews FROM {table} WHERE CounterID = 34 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT Refresh AND IsLink AND NOT IsDownload GROUP BY URL ORDER BY PageViews DESC LIMIT 1000;
SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN SearchEngineID = 0 AND AdvEngineID = 0 THEN Referer ELSE '' END AS Src, URL AS Dst, count(*) AS PageViews FROM {table} WHERE CounterID = 34 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT Refresh GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 1000;
SELECT URLHash, EventDate, count(*) AS PageViews FROM {table} WHERE CounterID = 34 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT Refresh AND TraficSourceID IN (-1, 6) AND RefererHash = 6202628419148573758 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 100000;
SELECT WindowClientWidth, WindowClientHeight, count(*) AS PageViews FROM {table} WHERE CounterID = 34 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT Refresh AND NOT DontCountHits AND URLHash = 6202628419148573758 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10000;
SELECT TIME_SLICE(EventTime, 1, 'MINUTE') AS Minute, count(*) AS PageViews FROM {table} WHERE CounterID = 34 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-02') AND NOT Refresh AND NOT DontCountHits GROUP BY Minute ORDER BY Minute;

View File

@ -1,111 +0,0 @@
SELECT count(*) FROM hits_100m_meshed;
SELECT count(*) FROM hits_100m_meshed WHERE AdvEngineID != 0;
SELECT sum(AdvEngineID), count(*), avg(ResolutionWidth) FROM hits_100m_meshed;
SELECT sum_float(UserID) FROM hits_100m_meshed;
SELECT count(DISTINCT UserID) FROM hits_100m_meshed;
SELECT count(DISTINCT SearchPhrase) FROM hits_100m_meshed;
SELECT min(EventDate), max(EventDate) FROM hits_100m_meshed;
SELECT AdvEngineID, count(*) FROM hits_100m_meshed WHERE AdvEngineID != 0 GROUP BY AdvEngineID ORDER BY count(*) DESC;
-- heavy filtering: almost nothing is left after the filter, but we still aggregate.;
SELECT RegionID, count(DISTINCT UserID) AS u FROM hits_100m_meshed GROUP BY RegionID ORDER BY u DESC LIMIT 10;
-- aggregation, a moderate number of keys.;
SELECT RegionID, sum(AdvEngineID), count(*) AS c, avg(ResolutionWidth), count(DISTINCT UserID) FROM hits_100m_meshed GROUP BY RegionID ORDER BY count(*) DESC LIMIT 10;
-- aggregation, a moderate number of keys, several aggregate functions.;
SELECT MobilePhoneModel, count(DISTINCT UserID) AS u FROM hits_100m_meshed WHERE MobilePhoneModel != '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;
-- heavy filtering on strings, then aggregation by strings.;
SELECT MobilePhone, MobilePhoneModel, count(DISTINCT UserID) AS u FROM hits_100m_meshed WHERE MobilePhoneModel != '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;
-- heavy filtering on strings, then aggregation by a (number, string) pair.;
SELECT SearchPhrase, count(*) FROM hits_100m_meshed WHERE SearchPhrase != '' GROUP BY SearchPhrase ORDER BY count(*) DESC LIMIT 10;
-- moderate filtering on strings, then aggregation by strings, a large number of keys.;
SELECT SearchPhrase, count(DISTINCT UserID) AS u FROM hits_100m_meshed WHERE SearchPhrase != '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;
-- slightly more complex aggregation.;
SELECT SearchEngineID, SearchPhrase, count(*) FROM hits_100m_meshed WHERE SearchPhrase != '' GROUP BY SearchEngineID, SearchPhrase ORDER BY count(*) DESC LIMIT 10;
-- aggregation by a number and a string, a large number of keys.;
SELECT UserID, count(*) FROM hits_100m_meshed GROUP BY UserID ORDER BY count(*) DESC LIMIT 10;
-- aggregation by a very large number of keys; may run out of RAM.;
SELECT UserID, SearchPhrase, count(*) FROM hits_100m_meshed GROUP BY UserID, SearchPhrase ORDER BY count(*) DESC LIMIT 10;
-- even more complex aggregation.;
SELECT UserID, SearchPhrase, count(*) FROM hits_100m_meshed GROUP BY UserID, SearchPhrase LIMIT 10;
-- the same, but without sorting.;
SELECT UserID, Minute(EventTime) AS m, SearchPhrase, count(*) FROM hits_100m_meshed GROUP BY UserID, m, SearchPhrase ORDER BY count(*) DESC LIMIT 10;
-- even more complex aggregation; not worth running on large tables.;
SELECT UserID FROM hits_100m_meshed WHERE UserID = 12345678901234567890;
-- heavy filtering on a UInt64 column.;
SELECT count(*) FROM hits_100m_meshed WHERE URL LIKE '%metrika%';
-- filtering by substring search in a string.;
SELECT SearchPhrase, MAX(URL), count(*) FROM hits_100m_meshed WHERE URL LIKE '%metrika%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY count(*) DESC LIMIT 10;
-- fetching large columns, filtering by a string.;
SELECT SearchPhrase, MAX(URL), MAX(Title), count(*) AS c, count(DISTINCT UserID) FROM hits_100m_meshed WHERE Title LIKE '%Яндекс%' AND URL NOT LIKE '%.yandex.%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY count(*) DESC LIMIT 10;
-- slightly larger columns.;
SELECT * FROM hits_100m_meshed WHERE URL LIKE '%metrika%' ORDER BY EventTime LIMIT 10;
-- a bad query: fetches all columns.;
SELECT SearchPhrase FROM hits_100m_meshed WHERE SearchPhrase != '' ORDER BY EventTime LIMIT 10;
-- a big sort.;
SELECT SearchPhrase FROM hits_100m_meshed WHERE SearchPhrase != '' ORDER BY SearchPhrase LIMIT 10;
-- a big sort by strings.;
SELECT SearchPhrase FROM hits_100m_meshed WHERE SearchPhrase != '' ORDER BY EventTime, SearchPhrase LIMIT 10;
-- a big sort by a tuple.;
SELECT CounterID, avg(length(URL)) AS l, count(*) FROM hits_100m_meshed WHERE URL != '' GROUP BY CounterID HAVING count(*) > 100000 ORDER BY l DESC LIMIT 25;
-- computing the average URL length for large counters.;
SELECT SUBSTRING(SUBSTRING(Referer, POSITION('//' IN Referer) + 2), 1, GREATEST(0, POSITION('/' IN SUBSTRING(Referer, POSITION('//' IN Referer) + 2)) - 1)) AS key, avg(length(Referer)) AS l, count(*) AS c, MAX(Referer) FROM hits_10m_meshed WHERE Referer != '' GROUP BY key HAVING count(*) > 100000 ORDER BY l DESC LIMIT 25;
-- the same, but broken down by domain.;
SELECT sum(ResolutionWidth), sum(ResolutionWidth + 1), sum(ResolutionWidth + 2), sum(ResolutionWidth + 3), sum(ResolutionWidth + 4), sum(ResolutionWidth + 5), sum(ResolutionWidth + 6), sum(ResolutionWidth + 7), sum(ResolutionWidth + 8), sum(ResolutionWidth + 9), sum(ResolutionWidth + 10), sum(ResolutionWidth + 11), sum(ResolutionWidth + 12), sum(ResolutionWidth + 13), sum(ResolutionWidth + 14), sum(ResolutionWidth + 15), sum(ResolutionWidth + 16), sum(ResolutionWidth + 17), sum(ResolutionWidth + 18), sum(ResolutionWidth + 19), sum(ResolutionWidth + 20), sum(ResolutionWidth + 21), sum(ResolutionWidth + 22), sum(ResolutionWidth + 23), sum(ResolutionWidth + 24), sum(ResolutionWidth + 25), sum(ResolutionWidth + 26), sum(ResolutionWidth + 27), sum(ResolutionWidth + 28), sum(ResolutionWidth + 29), sum(ResolutionWidth + 30), sum(ResolutionWidth + 31), sum(ResolutionWidth + 32), sum(ResolutionWidth + 33), sum(ResolutionWidth + 34), sum(ResolutionWidth + 35), sum(ResolutionWidth + 36), sum(ResolutionWidth + 37), sum(ResolutionWidth + 38), sum(ResolutionWidth + 39), sum(ResolutionWidth + 40), sum(ResolutionWidth + 41), sum(ResolutionWidth + 42), sum(ResolutionWidth + 43), sum(ResolutionWidth + 44), sum(ResolutionWidth + 45), sum(ResolutionWidth + 46), sum(ResolutionWidth + 47), sum(ResolutionWidth + 48), sum(ResolutionWidth + 49), sum(ResolutionWidth + 50), sum(ResolutionWidth + 51), sum(ResolutionWidth + 52), sum(ResolutionWidth + 53), sum(ResolutionWidth + 54), sum(ResolutionWidth + 55), sum(ResolutionWidth + 56), sum(ResolutionWidth + 57), sum(ResolutionWidth + 58), sum(ResolutionWidth + 59), sum(ResolutionWidth + 60), sum(ResolutionWidth + 61), sum(ResolutionWidth + 62), sum(ResolutionWidth + 63), sum(ResolutionWidth + 64), sum(ResolutionWidth + 65), sum(ResolutionWidth + 66), sum(ResolutionWidth + 67), sum(ResolutionWidth + 68), sum(ResolutionWidth + 69), sum(ResolutionWidth + 70), sum(ResolutionWidth + 71), sum(ResolutionWidth + 72), sum(ResolutionWidth + 73), sum(ResolutionWidth + 74), sum(ResolutionWidth + 75), sum(ResolutionWidth + 76), sum(ResolutionWidth + 77), sum(ResolutionWidth + 78), sum(ResolutionWidth + 79), sum(ResolutionWidth + 80), sum(ResolutionWidth + 81), sum(ResolutionWidth + 82), sum(ResolutionWidth + 83), sum(ResolutionWidth + 84), sum(ResolutionWidth + 85), sum(ResolutionWidth + 86), sum(ResolutionWidth + 87), sum(ResolutionWidth + 88), sum(ResolutionWidth + 89) FROM hits_100m_meshed;
-- lots of trivial aggregate functions.;
SELECT SearchEngineID, ClientIP, count(*) AS c, sum(Refresh), avg(ResolutionWidth) FROM hits_100m_meshed WHERE SearchPhrase != '' GROUP BY SearchEngineID, ClientIP ORDER BY count(*) DESC LIMIT 10;
-- complex aggregation; may run out of RAM on large tables.;
SELECT WatchID, ClientIP, count(*) AS c, sum(Refresh), avg(ResolutionWidth) FROM hits_100m_meshed WHERE SearchPhrase != '' GROUP BY WatchID, ClientIP ORDER BY count(*) DESC LIMIT 10;
-- aggregation by two fields that doesn't actually aggregate anything; won't complete on large tables.;
SELECT WatchID, ClientIP, count(*) AS c, sum(Refresh), avg(ResolutionWidth) FROM hits_100m_meshed GROUP BY WatchID, ClientIP ORDER BY count(*) DESC LIMIT 10;
-- the same, but additionally without filtering.;
SELECT URL, count(*) FROM hits_100m_meshed GROUP BY URL ORDER BY count(*) DESC LIMIT 10;
-- aggregation by URL.;
SELECT 1, URL, count(*) FROM hits_100m_meshed GROUP BY 1, URL ORDER BY count(*) DESC LIMIT 10;
-- aggregation by URL and a number.;
SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, count(*) FROM hits_100m_meshed GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY count(*) DESC LIMIT 10;
SELECT URL, count(*) AS PageViews FROM hits_100m_meshed WHERE CounterID = 34 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT DontCountHits AND NOT Refresh AND URL != '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10;
SELECT Title, count(*) AS PageViews FROM hits_100m_meshed WHERE CounterID = 34 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT DontCountHits AND NOT Refresh AND Title != '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10;
SELECT URL, count(*) AS PageViews FROM hits_100m_meshed WHERE CounterID = 34 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT Refresh AND IsLink AND NOT IsDownload GROUP BY URL ORDER BY PageViews DESC LIMIT 1000;
SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN SearchEngineID = 0 AND AdvEngineID = 0 THEN Referer ELSE '' END AS Src, URL AS Dst, count(*) AS PageViews FROM hits_100m_meshed WHERE CounterID = 34 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT Refresh GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 1000;
SELECT URLHash, EventDate, count(*) AS PageViews FROM hits_100m_meshed WHERE CounterID = 34 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT Refresh AND TraficSourceID IN (-1, 6) AND RefererHash = 6202628419148573758 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 100000;
SELECT WindowClientWidth, WindowClientHeight, count(*) AS PageViews FROM hits_100m_meshed WHERE CounterID = 34 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT Refresh AND NOT DontCountHits AND URLHash = 6202628419148573758 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10000;
SELECT TIME_SLICE(EventTime, 1, 'MINUTE') AS Minute, count(*) AS PageViews FROM hits_100m_meshed WHERE CounterID = 34 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-02') AND NOT Refresh AND NOT DontCountHits GROUP BY Minute ORDER BY Minute;

View File

@ -32,6 +32,8 @@ public:
return res.str();
}
RowInputStreamPtr & getRowInput() { return row_input; }
protected:
Block readImpl() override;

View File

@ -29,6 +29,35 @@ private:
bool with_names;
bool with_types;
DataTypes data_types;
/// For convenient diagnostics in case of an error.
size_t row_num = 0;
/// How many bytes have been read, not counting those still in the buffer.
size_t bytes_read_at_start_of_buffer_on_current_row = 0;
size_t bytes_read_at_start_of_buffer_on_prev_row = 0;
BufferBase::Position pos_of_current_row = nullptr;
BufferBase::Position pos_of_prev_row = nullptr;
/** This function is called in case of an exception during parsing.
* It re-parses the last two rows and prints detailed information about what is going on.
*/
void printDiagnosticInfo(WriteBuffer & out);
void updateDiagnosticInfo()
{
++row_num;
bytes_read_at_start_of_buffer_on_prev_row = bytes_read_at_start_of_buffer_on_current_row;
bytes_read_at_start_of_buffer_on_current_row = istr.count() - istr.offset();
pos_of_prev_row = pos_of_current_row;
pos_of_current_row = istr.position();
}
bool parseRowAndPrintDiagnosticInfo(WriteBuffer & out, size_t max_length_of_column_name, size_t max_length_of_data_type_name);
};
}

View File

@ -11,6 +11,7 @@
#include <DB/Columns/ColumnFixedString.h>
#include <DB/Columns/ColumnConst.h>
#include <DB/Functions/IFunction.h>
#include <statdaemons/ext/range.hpp>
namespace DB
@ -498,10 +499,11 @@ public:
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
if (!arguments[1].column)
throw Exception("Second argument for function " + getName() + " must be constant", ErrorCodes::ILLEGAL_COLUMN);
if (!typeid_cast<const DataTypeString *>(&*arguments[0].type))
throw Exception(getName() + " is only implemented for type String", ErrorCodes::NOT_IMPLEMENTED);
if (!typeid_cast<const DataTypeString *>(arguments[0].type.get()) &&
!typeid_cast<const DataTypeFixedString *>(arguments[0].type.get()))
throw Exception(getName() + " is only implemented for types String and FixedString", ErrorCodes::NOT_IMPLEMENTED);
size_t n = getSize(arguments[1]);
const size_t n = getSize(arguments[1]);
out_return_type = new DataTypeFixedString(n);
}
@ -523,7 +525,7 @@ public:
block.getByPosition(result).column = new ColumnConst<String>(column_const->size(), std::move(resized_string), new DataTypeFixedString(n));
}
else if(const ColumnString * column_string = typeid_cast<const ColumnString *>(&*column))
else if (const ColumnString * column_string = typeid_cast<const ColumnString *>(&*column))
{
ColumnFixedString * column_fixed = new ColumnFixedString(n);
ColumnPtr result_ptr = column_fixed;
@ -542,6 +544,26 @@ public:
}
block.getByPosition(result).column = result_ptr;
}
else if (const auto column_fixed_string = typeid_cast<const ColumnFixedString *>(column.get()))
{
const auto src_n = column_fixed_string->getN();
if (src_n > n)
throw Exception{
"String too long for type FixedString(" + toString(n) + ")",
ErrorCodes::TOO_LARGE_STRING_SIZE
};
const auto column_fixed = new ColumnFixedString{n};
block.getByPosition(result).column = column_fixed;
auto & out_chars = column_fixed->getChars();
const auto & in_chars = column_fixed_string->getChars();
const auto size = column_fixed_string->size();
out_chars.resize_fill(size * n);
for (const auto i : ext::range(0, size))
memcpy(&out_chars[i * n], &in_chars[i * src_n], src_n);
}
else
throw Exception("Unexpected column: " + column->getName(), ErrorCodes::ILLEGAL_COLUMN);
}

View File

@ -22,6 +22,7 @@
#include <DB/Columns/ColumnReplicated.h>
#include <DB/Common/UnicodeBar.h>
#include <DB/Functions/IFunction.h>
#include <statdaemons/ext/range.hpp>
namespace DB
@ -770,4 +771,117 @@ private:
}
};
template <typename Impl>
class FunctionNumericPredicate : public IFunction
{
public:
static constexpr auto name = Impl::name;
static IFunction * create(const Context &) { return new FunctionNumericPredicate; }
String getName() const override { return name; }
DataTypePtr getReturnType(const DataTypes & arguments) const override
{
const auto args_size = arguments.size();
if (args_size != 1)
throw Exception{
"Number of arguments for function " + getName() + " doesn't match: passed " +
toString(args_size) + ", should be 1",
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH
};
const auto arg = arguments.front().get();
if (!typeid_cast<const DataTypeUInt8 *>(arg) &&
!typeid_cast<const DataTypeUInt16 *>(arg) &&
!typeid_cast<const DataTypeUInt32 *>(arg) &&
!typeid_cast<const DataTypeUInt64 *>(arg) &&
!typeid_cast<const DataTypeInt8 *>(arg) &&
!typeid_cast<const DataTypeInt16 *>(arg) &&
!typeid_cast<const DataTypeInt32 *>(arg) &&
!typeid_cast<const DataTypeInt64 *>(arg) &&
!typeid_cast<const DataTypeFloat32 *>(arg) &&
!typeid_cast<const DataTypeFloat64 *>(arg))
throw Exception{
"Argument for function " + getName() + " must be numeric",
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT
};
return new DataTypeUInt8;
}
void execute(Block & block, const ColumnNumbers & arguments, const size_t result) override
{
const auto in = block.getByPosition(arguments.front()).column.get();
if (!execute<UInt8>(block, in, result) &&
!execute<UInt16>(block, in, result) &&
!execute<UInt32>(block, in, result) &&
!execute<UInt64>(block, in, result) &&
!execute<Int8>(block, in, result) &&
!execute<Int16>(block, in, result) &&
!execute<Int32>(block, in, result) &&
!execute<Int64>(block, in, result) &&
!execute<Float32>(block, in, result) &&
!execute<Float64>(block, in, result))
throw Exception{
"Illegal column " + in->getName() + " of first argument of function " + getName(),
ErrorCodes::ILLEGAL_COLUMN
};
}
template <typename T>
bool execute(Block & block, const IColumn * in_untyped, const size_t result) override
{
if (const auto in = typeid_cast<const ColumnVector<T> *>(in_untyped))
{
const auto size = in->size();
const auto out = new ColumnVector<UInt8>{size};
block.getByPosition(result).column = out;
const auto & in_data = in->getData();
auto & out_data = out->getData();
for (const auto i : ext::range(0, size))
out_data[i] = Impl::execute(in_data[i]);
return true;
}
else if (const auto in = typeid_cast<const ColumnConst<T> *>(in_untyped))
{
block.getByPosition(result).column = new ColumnConstUInt8{
in->size(),
Impl::execute(in->getData())
};
return true;
}
return false;
}
};
struct IsFiniteImpl
{
static constexpr auto name = "isFinite";
template <typename T> static bool execute(const T t) { return std::isfinite(t); }
};
struct IsInfiniteImpl
{
static constexpr auto name = "isInfinite";
template <typename T> static bool execute(const T t) { return std::isinf(t); }
};
struct IsNaNImpl
{
static constexpr auto name = "isNaN";
template <typename T> static bool execute(const T t) { return std::isnan(t); }
};
using FunctionIsFinite = FunctionNumericPredicate<IsFiniteImpl>;
using FunctionIsInfinite = FunctionNumericPredicate<IsInfiniteImpl>;
using FunctionIsNaN = FunctionNumericPredicate<IsNaNImpl>;
}
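A quick smoke test of the three new predicates against a running server (a sketch; any numeric expressions would do):
clickhouse-client --query="SELECT isFinite(1.0), isInfinite(1.0 / 0.0), isNaN(0.0 / 0.0)"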

View File

@ -395,14 +395,21 @@ struct MatchImpl
if (required_substring.empty())
{
size_t prev_offset = 0;
for (size_t i = 0; i < size; ++i)
if (!regexp->getRE2()) /// An empty regexp. It always matches.
{
res[i] = revert ^ regexp->getRE2()->Match(
re2_st::StringPiece(reinterpret_cast<const char *>(&data[prev_offset]), offsets[i] - prev_offset - 1),
0, offsets[i] - prev_offset - 1, re2_st::RE2::UNANCHORED, nullptr, 0);
memset(&res[0], 1, size * sizeof(res[0]));
}
else
{
size_t prev_offset = 0;
for (size_t i = 0; i < size; ++i)
{
res[i] = revert ^ regexp->getRE2()->Match(
re2_st::StringPiece(reinterpret_cast<const char *>(&data[prev_offset]), offsets[i] - prev_offset - 1),
0, offsets[i] - prev_offset - 1, re2_st::RE2::UNANCHORED, nullptr, 0);
prev_offset = offsets[i];
prev_offset = offsets[i];
}
}
}
else

View File

@ -349,29 +349,49 @@ inline void readDateText(mysqlxx::Date & date, ReadBuffer & buf)
}
/// in YYYY-MM-DD HH:MM:SS format, according to the current time zone
template <typename T>
inline T parse(const char * data, size_t size);
void readDateTimeTextFallback(time_t & datetime, ReadBuffer & buf);
/** In YYYY-MM-DD hh:mm:ss format, according to the current time zone.
* As an exception, parsing from a decimal number (a unix timestamp) is also supported.
*/
inline void readDateTimeText(time_t & datetime, ReadBuffer & buf)
{
char s[19];
size_t size = buf.read(s, 19);
if (19 != size)
/** Read 10 characters that might be a unix timestamp.
* Only 10-character unix timestamps are supported, i.e. from September 9, 2001 onwards.
* Then look at the fifth character. If it is a digit, parse a unix timestamp.
* If it is not a digit, parse YYYY-MM-DD hh:mm:ss.
*/
/// The optimistic case, when the whole value is certain to be in the buffer.
const char * s = buf.position();
if (s + 19 < buf.buffer().end())
{
s[size] = 0;
throw Exception(std::string("Cannot parse datetime ") + s, ErrorCodes::CANNOT_PARSE_DATETIME);
if (s[4] < '0' || s[4] > '9')
{
UInt16 year = (s[0] - '0') * 1000 + (s[1] - '0') * 100 + (s[2] - '0') * 10 + (s[3] - '0');
UInt8 month = (s[5] - '0') * 10 + (s[6] - '0');
UInt8 day = (s[8] - '0') * 10 + (s[9] - '0');
UInt8 hour = (s[11] - '0') * 10 + (s[12] - '0');
UInt8 minute = (s[14] - '0') * 10 + (s[15] - '0');
UInt8 second = (s[17] - '0') * 10 + (s[18] - '0');
if (unlikely(year == 0))
datetime = 0;
else
datetime = DateLUT::instance().makeDateTime(year, month, day, hour, minute, second);
buf.position() += 19;
}
else
readIntTextUnsafe(datetime, buf);
}
UInt16 year = (s[0] - '0') * 1000 + (s[1] - '0') * 100 + (s[2] - '0') * 10 + (s[3] - '0');
UInt8 month = (s[5] - '0') * 10 + (s[6] - '0');
UInt8 day = (s[8] - '0') * 10 + (s[9] - '0');
UInt8 hour = (s[11] - '0') * 10 + (s[12] - '0');
UInt8 minute = (s[14] - '0') * 10 + (s[15] - '0');
UInt8 second = (s[17] - '0') * 10 + (s[18] - '0');
if (unlikely(year == 0))
datetime = 0;
else
datetime = DateLUT::instance().makeDateTime(year, month, day, hour, minute, second);
readDateTimeTextFallback(datetime, buf);
}
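/** A usage sketch (illustration added here, not in the original source): when a DateTime
* value is read from text, e.g. by a TabSeparated INSERT, both "2013-07-15 12:34:56"
* and a 10-digit unix timestamp such as "1373884496" are accepted; the fifth character
* decides which branch is taken.
*/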
inline void readDateTimeText(mysqlxx::DateTime & datetime, ReadBuffer & buf)

View File

@ -262,9 +262,8 @@ public:
const FormatFactory & getFormatFactory() const { return shared->format_factory; }
const Dictionaries & getDictionaries() const;
const ExternalDictionaries & getExternalDictionaries() const;
void tryCreateDictionaries(bool throw_on_error = false) const;
void tryCreateExternalDictionaries(bool throw_on_error = false) const;
void tryCreateDictionaries() const;
void tryCreateExternalDictionaries() const;
InterserverIOHandler & getInterserverIOHandler() { return shared->interserver_io_handler; }
@ -342,6 +341,10 @@ public:
Compiler & getCompiler();
void shutdown() { shared->shutdown(); }
private:
const Dictionaries & getDictionariesImpl(bool throw_on_error) const;
const ExternalDictionaries & getExternalDictionariesImpl(bool throw_on_error) const;
};

View File

@ -64,42 +64,53 @@ private:
LOG_INFO(log, "Loading dictionaries.");
auto & config = Poco::Util::Application::instance().config();
bool was_exception = false;
try
if (config.has(TechDataHierarchy::required_key))
{
MultiVersion<TechDataHierarchy>::Version new_tech_data_hierarchy = new TechDataHierarchy;
tech_data_hierarchy.set(new_tech_data_hierarchy);
}
catch (...)
{
handleException(throw_on_error);
was_exception = true;
try
{
auto new_tech_data_hierarchy = std::make_unique<TechDataHierarchy>();
tech_data_hierarchy.set(new_tech_data_hierarchy.release());
}
catch (...)
{
handleException(throw_on_error);
was_exception = true;
}
}
try
{
MultiVersion<RegionsHierarchies>::Version new_regions_hierarchies = new RegionsHierarchies;
new_regions_hierarchies->reload();
regions_hierarchies.set(new_regions_hierarchies);
}
catch (...)
if (config.has(RegionsHierarchies::required_key))
{
handleException(throw_on_error);
was_exception = true;
try
{
auto new_regions_hierarchies = std::make_unique<RegionsHierarchies>();
new_regions_hierarchies->reload();
regions_hierarchies.set(new_regions_hierarchies.release());
}
catch (...)
{
handleException(throw_on_error);
was_exception = true;
}
}
try
if (config.has(RegionsNames::required_key))
{
MultiVersion<RegionsNames>::Version new_regions_names = new RegionsNames;
new_regions_names->reload();
regions_names.set(new_regions_names);
}
catch (...)
{
handleException(throw_on_error);
was_exception = true;
try
{
auto new_regions_names = std::make_unique<RegionsNames>();
new_regions_names->reload();
regions_names.set(new_regions_names.release());
}
catch (...)
{
handleException(throw_on_error);
was_exception = true;
}
}
if (!was_exception)

View File

@ -86,7 +86,7 @@ struct Settings
/** Whether query compilation is enabled. */ \
M(SettingBool, compile, false) \
/** The number of structurally identical queries before their compilation is initiated. */ \
M(SettingUInt64, min_count_to_compile, 0) \
M(SettingUInt64, min_count_to_compile, 3) \
/** The number of keys at which two-level aggregation starts being used. 0 means never use it. */ \
M(SettingUInt64, group_by_two_level_threshold, 100000) \
\

View File

@ -284,3 +284,27 @@ void NO_INLINE Aggregator::executeSpecializedWithoutKey(
}
}
/** The main code is compiled with gcc 4.9.
* But SpecializedAggregator is compiled with clang 3.6 into a .so file.
* This is done because gcc cannot be persuaded to inline the functions
* that were devirtualized, in this particular case, so performance ends up lower.
* Besides, clang is easier to distribute for deployment to servers.
*
* After switching from gcc 4.8 and gnu++1x to gcc 4.9 and gnu++1y,
* dlopen started failing with the error: undefined symbol: __cxa_pure_virtual
*
* Most likely this happens because the version of this symbol changed:
* gcc creates the symbol
* U __cxa_pure_virtual@@CXXABI_1.3
* in the .so, while clang creates the symbol
* U __cxa_pure_virtual
*
* But it does not matter to us how __cxa_pure_virtual is implemented,
* because it is not called during normal program operation,
* and if it is called, the program is guaranteed to be buggy anyway.
*
* Therefore, we can work around the problem as follows:
*/
extern "C" void __attribute__((__visibility__("default"), __noreturn__)) __cxa_pure_virtual() { abort(); };

View File

@ -30,26 +30,39 @@ public:
{
std::reverse(remaining_mark_ranges.begin(), remaining_mark_ranges.end());
/// inject columns required for defaults evaluation
const auto injected_columns = injectRequiredColumns(column_names);
/// insert injected columns into ordered columns list to avoid exception about different block structures
ordered_names.insert(std::end(ordered_names), std::begin(injected_columns), std::end(injected_columns));
Names pre_column_names;
if (prewhere_actions)
{
pre_column_names = prewhere_actions->getRequiredColumns();
/// @todo somehow decide which injected columns belong to PREWHERE, optimizing reads
pre_column_names.insert(std::end(pre_column_names),
std::begin(injected_columns), std::end(injected_columns));
if (pre_column_names.empty())
pre_column_names.push_back(column_names[0]);
NameSet pre_name_set(pre_column_names.begin(), pre_column_names.end());
const NameSet pre_name_set(pre_column_names.begin(), pre_column_names.end());
/// If the PREWHERE expression is not a table column, there is no need to return its column to the caller
/// (the storage is expected to return only table columns).
remove_prewhere_column = !pre_name_set.count(prewhere_column);
Names post_column_names;
for (const auto & name : column_names)
{
if (!pre_name_set.count(name))
post_column_names.push_back(name);
}
column_names = post_column_names;
}
column_name_set.insert(column_names.begin(), column_names.end());
/// will be used to distinguish between PREWHERE and WHERE columns when applying filter
column_name_set = NameSet{column_names.begin(), column_names.end()};
if (check_columns)
{
@ -111,47 +124,53 @@ protected:
/// We will call progressImpl ourselves.
void progress(const Progress & value) override {}
void injectRequiredColumns(NamesAndTypesList & columns) const {
std::set<NameAndTypePair> required_columns;
auto modified = false;
for (auto it = std::begin(columns); it != std::end(columns);)
/** If some of the requested columns are missing from the part,
* figure out which columns may need to be read additionally
* so that the DEFAULT expressions for these columns can be evaluated.
* Adds them to columns.
*/
NameSet injectRequiredColumns(Names & columns) const
{
NameSet required_columns{std::begin(columns), std::end(columns)};
NameSet injected_columns;
for (size_t i = 0; i < columns.size(); ++i)
{
required_columns.emplace(*it);
const auto & column_name = columns[i];
if (!owned_data_part->hasColumnFiles(it->name))
/// column has files and hence does not require evaluation
if (owned_data_part->hasColumnFiles(column_name))
continue;
const auto default_it = storage.column_defaults.find(column_name);
/// columns has no explicit default expression
if (default_it == std::end(storage.column_defaults))
continue;
/// collect identifiers required for evaluation
IdentifierNameSet identifiers;
default_it->second.expression->collectIdentifierNames(identifiers);
for (const auto & identifier : identifiers)
{
const auto default_it = storage.column_defaults.find(it->name);
if (default_it != std::end(storage.column_defaults))
if (storage.hasColumn(identifier))
{
IdentifierNameSet identifiers;
default_it->second.expression->collectIdentifierNames(identifiers);
for (const auto & identifier : identifiers)
/// ensure each column is added only once
if (required_columns.count(identifier) == 0)
{
if (storage.hasColumn(identifier))
{
NameAndTypePair column{identifier, storage.getDataTypeByName(identifier)};
if (required_columns.count(column) == 0)
{
it = columns.emplace(++it, std::move(column));
modified = true;
}
}
columns.emplace_back(identifier);
required_columns.emplace(identifier);
injected_columns.emplace(identifier);
}
if (modified)
continue;
}
}
++it;
}
if (modified)
columns = NamesAndTypesList{std::begin(required_columns), std::end(required_columns)};
return injected_columns;
}
Block readImpl() override
{
Block res;
@ -161,14 +180,12 @@ protected:
if (!reader)
{
injectRequiredColumns(columns);
injectRequiredColumns(pre_columns);
UncompressedCache * uncompressed_cache = use_uncompressed_cache ? storage.context.getUncompressedCache() : nullptr;
UncompressedCache * uncompressed_cache = use_uncompressed_cache ? storage.context.getUncompressedCache() : NULL;
reader.reset(new MergeTreeReader(path, owned_data_part, columns, uncompressed_cache, storage, all_mark_ranges));
if (prewhere_actions)
pre_reader.reset(new MergeTreeReader(path, owned_data_part, pre_columns, uncompressed_cache, storage,
all_mark_ranges));
pre_reader.reset(new MergeTreeReader(path, owned_data_part, pre_columns, uncompressed_cache, storage, all_mark_ranges));
}
if (prewhere_actions)
@ -191,7 +208,7 @@ protected:
if (range.begin == range.end)
remaining_mark_ranges.pop_back();
}
progressImpl(Progress(res.rows(), res.bytes()));
progressImpl(Progress(res.rowsInFirstColumn(), res.bytes()));
pre_reader->fillMissingColumns(res, ordered_names);
/// Evaluate the PREWHERE expression.
@ -204,8 +221,8 @@ protected:
size_t pre_bytes = res.bytes();
/** If the filter is a constant (for example, PREWHERE 1 is written),
* then we either return an empty block, or return the block unchanged.
*/
* then we either return an empty block, or return the block unchanged.
*/
if (ColumnConstUInt8 * column_const = typeid_cast<ColumnConstUInt8 *>(&*column))
{
if (!column_const->getData())
@ -295,7 +312,7 @@ protected:
else
throw Exception("Illegal type " + column->getName() + " of column for filter. Must be ColumnUInt8 or ColumnConstUInt8.", ErrorCodes::ILLEGAL_TYPE_OF_COLUMN_FOR_FILTER);
reader->fillMissingColumns(res, ordered_names);
reader->fillMissingColumnsAndReorder(res, ordered_names);
}
while (!remaining_mark_ranges.empty() && !res && !isCancelled());
}
@ -315,7 +332,7 @@ protected:
remaining_mark_ranges.pop_back();
}
progressImpl(Progress(res.rows(), res.bytes()));
progressImpl(Progress(res.rowsInFirstColumn(), res.bytes()));
reader->fillMissingColumns(res, ordered_names);
}
@ -356,8 +373,8 @@ private:
Logger * log;
/// requested column names in specific order as expected by other stages
const Names ordered_names;
/// column names in specific order as expected by other stages
Names ordered_names;
};
}

View File

@ -61,7 +61,8 @@ public:
/** If the columns are not present in the block, adds them; if they are present, appends the values read to them.
* Does not add columns for which there are no files. To add those, call fillMissingColumns.
* The block must contain either none of the columns from columns, or all of those for which files exist. */
* The block must contain either none of the columns from columns, or all of those for which files exist.
*/
void readRange(size_t from_mark, size_t to_mark, Block & res)
{
try
@ -128,8 +129,7 @@ public:
}
catch (const Exception & e)
{
if (e.code() != ErrorCodes::ALL_REQUESTED_COLUMNS_ARE_MISSING
&& e.code() != ErrorCodes::MEMORY_LIMIT_EXCEEDED)
if (e.code() != ErrorCodes::MEMORY_LIMIT_EXCEEDED)
{
storage.reportBrokenPart(part_name);
}
@ -187,111 +187,22 @@ public:
added_column = &columns.front();
}
/// Fills the columns that are missing from the block with default values.
/** Adds to the block the missing columns from ordered_names, filled with default values.
* The missing columns are added at the same positions as in ordered_names.
* If at least one column was added, all columns in the block are reordered as in ordered_names.
*/
void fillMissingColumns(Block & res, const Names & ordered_names)
{
try
{
/** For missing columns of a nested structure, we need to create not a column of empty arrays, but a column of arrays
* of the correct lengths.
* TODO: If only the absent columns were requested for some nested structure, empty arrays will be returned for them
* even if the part contains offsets for that nested structure. This can be fixed.
*/
fillMissingColumnsImpl(res, ordered_names, false);
}
/// First, remember the offset columns for all arrays in the block.
OffsetColumns offset_columns;
for (size_t i = 0; i < res.columns(); ++i)
{
const ColumnWithNameAndType & column = res.getByPosition(i);
if (const ColumnArray * array = typeid_cast<const ColumnArray *>(&*column.column))
{
String offsets_name = DataTypeNested::extractNestedTableName(column.name);
offset_columns[offsets_name] = array->getOffsetsColumn();
}
}
auto should_evaluate_defaults = false;
auto should_sort = false;
for (NamesAndTypesList::const_iterator it = columns.begin(); it != columns.end(); ++it)
{
/// insert default values only for columns without default expressions
if (!res.has(it->name))
{
should_sort = true;
if (storage.column_defaults.count(it->name) != 0)
{
should_evaluate_defaults = true;
continue;
}
ColumnWithNameAndType column;
column.name = it->name;
column.type = it->type;
String offsets_name = DataTypeNested::extractNestedTableName(column.name);
if (offset_columns.count(offsets_name))
{
ColumnPtr offsets_column = offset_columns[offsets_name];
DataTypePtr nested_type = typeid_cast<DataTypeArray &>(*column.type).getNestedType();
size_t nested_rows = offsets_column->empty() ? 0
: typeid_cast<ColumnUInt64 &>(*offsets_column).getData().back();
ColumnPtr nested_column = dynamic_cast<IColumnConst &>(*nested_type->createConstColumn(
nested_rows, nested_type->getDefault())).convertToFullColumn();
column.column = new ColumnArray(nested_column, offsets_column);
}
else
{
/** The constant column needs to be converted into a full one, because in some of the blocks (from other parts)
* it may be full (otherwise the interpreter might decide that it is constant everywhere).
*/
column.column = dynamic_cast<IColumnConst &>(*column.type->createConstColumn(
res.rows(), column.type->getDefault())).convertToFullColumn();
}
res.insert(column);
}
}
/// evaluate defaulted columns if necessary
if (should_evaluate_defaults)
evaluateMissingDefaults(res, columns, storage.column_defaults, storage.context);
/// remove added column to ensure same content among all blocks
if (added_column)
{
res.erase(0);
streams.erase(added_column->name);
columns.erase(std::begin(columns));
added_column = nullptr;
}
/// sort columns to ensure consistent order among all blocks
if (should_sort)
{
Block ordered_block;
for (const auto & name : ordered_names)
if (res.has(name))
ordered_block.insert(res.getByName(name));
if (res.columns() != ordered_block.columns())
throw Exception{
"Ordered block has different columns than original one:\n" +
ordered_block.dumpNames() + "\nvs.\n" + res.dumpNames(),
ErrorCodes::LOGICAL_ERROR
};
std::swap(res, ordered_block);
}
}
catch (const Exception & e)
{
/// Better diagnostics.
throw Exception(e.message() + '\n' + e.getStackTrace().toString()
+ "\n(while reading from part " + path + ")", e.code());
}
/** The same, but always reorders the columns in the block as in ordered_names
* (even if there were no missing columns).
*/
void fillMissingColumnsAndReorder(Block & res, const Names & ordered_names)
{
fillMissingColumnsImpl(res, ordered_names, true);
}
private:
@ -523,6 +434,111 @@ private:
}
}
}
void fillMissingColumnsImpl(Block & res, const Names & ordered_names, bool always_reorder)
{
try
{
/** For missing columns of a nested structure, we need to create not a column of empty arrays, but a column of arrays
* of the correct lengths.
* TODO: If only the absent columns were requested for some nested structure, empty arrays will be returned for them
* even if the part contains offsets for that nested structure. This can be fixed.
*/
/// First, remember the offset columns for all arrays in the block.
OffsetColumns offset_columns;
for (size_t i = 0; i < res.columns(); ++i)
{
const ColumnWithNameAndType & column = res.getByPosition(i);
if (const ColumnArray * array = typeid_cast<const ColumnArray *>(&*column.column))
{
String offsets_name = DataTypeNested::extractNestedTableName(column.name);
offset_columns[offsets_name] = array->getOffsetsColumn();
}
}
auto should_evaluate_defaults = false;
auto should_sort = always_reorder;
for (NamesAndTypesList::const_iterator it = columns.begin(); it != columns.end(); ++it)
{
/// insert default values only for columns without default expressions
if (!res.has(it->name))
{
should_sort = true;
if (storage.column_defaults.count(it->name) != 0)
{
should_evaluate_defaults = true;
continue;
}
ColumnWithNameAndType column;
column.name = it->name;
column.type = it->type;
String offsets_name = DataTypeNested::extractNestedTableName(column.name);
if (offset_columns.count(offsets_name))
{
ColumnPtr offsets_column = offset_columns[offsets_name];
DataTypePtr nested_type = typeid_cast<DataTypeArray &>(*column.type).getNestedType();
size_t nested_rows = offsets_column->empty() ? 0
: typeid_cast<ColumnUInt64 &>(*offsets_column).getData().back();
ColumnPtr nested_column = dynamic_cast<IColumnConst &>(*nested_type->createConstColumn(
nested_rows, nested_type->getDefault())).convertToFullColumn();
column.column = new ColumnArray(nested_column, offsets_column);
}
else
{
/** The constant column needs to be converted into a full one, because in some of the blocks (from other parts)
* it may be full (otherwise the interpreter might decide that it is constant everywhere).
*/
column.column = dynamic_cast<IColumnConst &>(*column.type->createConstColumn(
res.rows(), column.type->getDefault())).convertToFullColumn();
}
res.insert(column);
}
}
/// evaluate defaulted columns if necessary
if (should_evaluate_defaults)
evaluateMissingDefaults(res, columns, storage.column_defaults, storage.context);
/// remove added column to ensure same content among all blocks
if (added_column)
{
res.erase(0);
streams.erase(added_column->name);
columns.erase(std::begin(columns));
added_column = nullptr;
}
/// sort columns to ensure consistent order among all blocks
if (should_sort)
{
Block ordered_block;
for (const auto & name : ordered_names)
if (res.has(name))
ordered_block.insert(res.getByName(name));
if (res.columns() != ordered_block.columns())
throw Exception{
"Ordered block has different number of columns than original one:\n" +
ordered_block.dumpNames() + "\nvs.\n" + res.dumpNames(),
ErrorCodes::LOGICAL_ERROR};
std::swap(res, ordered_block);
}
}
catch (const Exception & e)
{
/// Better diagnostics.
throw Exception(e.message() + '\n' + e.getStackTrace().toString()
+ "\n(while reading from part " + path + ")", e.code());
}
}
};
}

dbms/scripts/README
View File

@ -0,0 +1,11 @@
# How to create dictionaries for region* functions:
# 1. You need access to host ███████████.yandex-team.ru.
# 2. Run the following commands:
curl 'http://███████████.yandex-team.ru/?fields=id,parent_id,type,population' | tail -n+2 > regions_hierarchy.txt
curl 'http://███████████.yandex-team.ru/?fields=id,parent_id,type,population&new_parents=977:187' | tail -n+2 > regions_hierarchy_ua.txt
curl 'http://███████████.yandex-team.ru/?fields=id,ru_name' | tail -n+2 > regions_names_ru.txt
curl 'http://███████████.yandex-team.ru/?fields=id,uk_name' | tail -n+2 > regions_names_ua.txt
curl 'http://███████████.yandex-team.ru/?fields=id,by_name' | tail -n+2 > regions_names_by.txt
curl 'http://███████████.yandex-team.ru/?fields=id,kz_name' | tail -n+2 > regions_names_kz.txt
curl 'http://███████████.yandex-team.ru/?fields=id,tr_name' | tail -n+2 > regions_names_tr.txt

View File

@ -1,31 +0,0 @@
#!/usr/bin/perl -w
use strict;
use warnings;
use geobase;
sub get_population {
my $key = shift;
my $depth = shift || 0;
return 0 if ($depth > 100);
my $current = int($Region{$key}->{zip_old} || 0); # zip_old, despite its name, contains the region's population.
return $current if ($current);
my $sum_of_children = 0;
for my $child (@{$Region{$key}->{chld}}) {
$sum_of_children += get_population($child, $depth + 1);
}
return $sum_of_children;
}
foreach my $key (keys %Region) {
print $key . "\t"
. ($Region{$key}->{parents}[-1] || 0) . "\t"
. ($Region{$key}->{type} || 0) . "\t"
. get_population($key) . "\n";
}

View File

@ -1,25 +0,0 @@
#!/usr/bin/perl -w
use strict;
use warnings;
use geobase;
my @languages = ('ru', 'en', 'ua', 'by', 'kz', 'tr');
my @output_files = map { open(my $output, ">:encoding(UTF-8)", "regions_names_" . $_ . ".txt") || die $!; $output } @languages;
my %outputs;
@outputs{@languages} = @output_files;
foreach my $key (keys %Region) {
foreach my $lang (@languages) {
my $field = ( $lang eq 'ru' ? 'name' : $lang . '_name' );
my $name = $Region{$key}->{$field};
if ($name) {
$name =~ s/^\s+//;
$name =~ s/\s+$//;
$name =~ s/(\t|\n)/ /g;
if ($name ne '') {
print { $outputs{$lang} } $key . "\t" . $name . "\n";
}
}
}
}

View File

@ -38,6 +38,8 @@
#include <DB/IO/ReadBufferFromIStream.h>
#include <DB/DataStreams/AsynchronousBlockInputStream.h>
#include <DB/DataStreams/BlockInputStreamFromRowInputStream.h>
#include <DB/DataStreams/TabSeparatedRowInputStream.h>
#include <DB/Parsers/ParserQuery.h>
#include <DB/Parsers/ASTSetQuery.h>
@ -57,6 +59,7 @@
#include <DB/Common/ExternalTable.h>
#include <DB/Common/UnicodeBar.h>
#include <DB/Common/formatReadable.h>
#include <DB/Columns/ColumnString.h>
/// http://en.wikipedia.org/wiki/ANSI_escape_code
@ -93,6 +96,7 @@ private:
};
bool is_interactive = true; /// Whether to use the readline interface or batch mode.
bool print_time_to_stderr = false; /// In non-interactive mode, print the query execution time to stderr.
bool stdin_is_not_tty = false; /// stdin is not a terminal.
winsize terminal_size {}; /// Terminal size, used for drawing the progress bar.
@ -188,14 +192,25 @@ private:
}
catch (const Exception & e)
{
bool print_stack_trace = config().getBool("stacktrace", false);
std::string text = e.displayText();
/** If the exception came from the server, the stack trace is embedded in the message text.
* If the exception happened on the client, the stack trace is stored separately.
*/
auto embedded_stack_trace_pos = text.find("Stack trace");
if (std::string::npos != embedded_stack_trace_pos && !print_stack_trace)
text.resize(embedded_stack_trace_pos);
std::cerr << "Code: " << e.code() << ". " << text << std::endl << std::endl;
/// If there is a stack trace from the server, do not print the client-side stack trace.
/// Also do not print a stack trace in case of network errors.
if (e.code() != ErrorCodes::NETWORK_ERROR
&& std::string::npos == text.find("Stack trace"))
if (print_stack_trace
&& e.code() != ErrorCodes::NETWORK_ERROR
&& std::string::npos == embedded_stack_trace_pos)
{
std::cerr << "Stack trace:" << std::endl
<< e.getStackTrace().toString();
@ -257,6 +272,9 @@ private:
if (is_interactive)
{
if (print_time_to_stderr)
throw Exception("time option could be specified only in non-interactive mode", ErrorCodes::BAD_ARGUMENTS);
/// Disable tab completion.
rl_bind_key('\t', rl_insert);
@ -557,6 +575,10 @@ private:
std::cout << std::endl << std::endl;
}
else if (print_time_to_stderr)
{
std::cerr << watch.elapsedSeconds() << "\n";
}
return true;
}
@ -679,13 +701,16 @@ private:
if (!insert->format.empty())
current_format = insert->format;
BlockInputStreamPtr block_std_in = new AsynchronousBlockInputStream(context.getFormatFactory().getInput(
current_format, buf, sample, insert_format_max_block_size, context.getDataTypeFactory()));
block_std_in->readPrefix();
BlockInputStreamPtr block_input = context.getFormatFactory().getInput(
current_format, buf, sample, insert_format_max_block_size, context.getDataTypeFactory());
BlockInputStreamPtr async_block_input = new AsynchronousBlockInputStream(block_input);
async_block_input->readPrefix();
while (true)
{
Block block = block_std_in->read();
Block block = async_block_input->read();
connection->sendData(block);
processed_rows += block.rows();
@ -693,7 +718,7 @@ private:
break;
}
block_std_in->readSuffix();
async_block_input->readSuffix();
}
@ -975,8 +1000,14 @@ private:
resetOutput();
got_exception = true;
std::string text = e.displayText();
auto embedded_stack_trace_pos = text.find("Stack trace");
if (std::string::npos != embedded_stack_trace_pos && !config().getBool("stacktrace", false))
text.resize(embedded_stack_trace_pos);
std::cerr << "Received exception from server:" << std::endl
<< "Code: " << e.code() << ". " << e.displayText();
<< "Code: " << e.code() << ". " << text;
}
@ -1022,7 +1053,10 @@ public:
("database,d", boost::program_options::value<std::string>(), "database")
("multiline,m", "multiline")
("multiquery,n", "multiquery")
("vertical,E", "vertical")
("format,f", boost::program_options::value<std::string>(), "default output format")
("vertical,E", "vertical output format, same as --format=Vertical or FORMAT Vertical or \\G at end of command")
("time,t", "print query execution time to stderr in non-interactive mode (for benchmarks)")
("stacktrace", "print stack traces of exceptions")
APPLY_FOR_SETTINGS(DECLARE_SETTING)
APPLY_FOR_LIMITS(DECLARE_LIMIT)
;
@ -1135,8 +1169,14 @@ public:
config().setBool("multiline", true);
if (options.count("multiquery"))
config().setBool("multiquery", true);
if (options.count("format"))
config().setString("format", options["format"].as<std::string>());
if (options.count("vertical"))
config().setBool("vertical", true);
if (options.count("stacktrace"))
config().setBool("stacktrace", true);
if (options.count("time"))
print_time_to_stderr = true;
}
};
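
Taken together, the new options support non-interactive invocations like these (a sketch assuming a local server; the queries are illustrative):

# Print only the query execution time, in seconds, to stderr (handy for benchmarks):
clickhouse-client --time --format=Null --query="SELECT count() FROM system.numbers LIMIT 1000000" >/dev/null
# Vertical output, equivalent to --format=Vertical, FORMAT Vertical, or \G at the end of the query:
clickhouse-client --vertical --query="SELECT 1 AS x, 2 AS y"
# Print the stack trace if the query throws:
clickhouse-client --stacktrace --query="SELECT unknown_column"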

View File

@ -1,6 +1,8 @@
#include <DB/IO/ReadHelpers.h>
#include <DB/IO/Operators.h>
#include <DB/DataStreams/TabSeparatedRowInputStream.h>
#include <DB/DataTypes/DataTypesNumberFixed.h>
namespace DB
@ -43,29 +45,327 @@ void TabSeparatedRowInputStream::readPrefix()
}
/** Check for a common mistake: using Windows-style line endings.
*/
static void checkForCarriageReturn(ReadBuffer & istr)
{
if (istr.position()[0] == '\r' || (istr.position() != istr.buffer().begin() && istr.position()[-1] == '\r'))
throw Exception("\nYou have carriage return (\\r, 0x0D, ASCII 13) at end of first row."
"\nIt's like your input data has DOS/Windows style line separators, that are illegal in TabSeparated format."
" You must transform your file to Unix format."
"\nBut if you really need carriage return at end of string value of last column, you need to escape it as \\r.",
ErrorCodes::INCORRECT_DATA);
}
bool TabSeparatedRowInputStream::read(Row & row)
{
updateDiagnosticInfo();
size_t size = data_types.size();
row.resize(size);
try
{
for (size_t i = 0; i < size; ++i)
{
if (i == 0 && istr.eof())
{
row.clear();
return false;
}
data_types[i]->deserializeTextEscaped(row[i], istr);
/// Skip the delimiters.
if (i + 1 == size)
{
if (!istr.eof())
{
if (unlikely(row_num == 1))
checkForCarriageReturn(istr);
assertString("\n", istr);
}
}
else
assertString("\t", istr);
}
}
catch (Exception & e)
{
String verbose_diagnostic;
{
WriteBufferFromString diagnostic_out(verbose_diagnostic);
printDiagnosticInfo(diagnostic_out);
}
e.addMessage("\n" + verbose_diagnostic);
throw;
}
return true;
}
void TabSeparatedRowInputStream::printDiagnosticInfo(WriteBuffer & out)
{
/// Detailed diagnostics can be printed only if the last and second-to-last rows are still in the read buffer.
size_t bytes_read_at_start_of_buffer = istr.count() - istr.offset();
if (bytes_read_at_start_of_buffer != bytes_read_at_start_of_buffer_on_prev_row)
{
out << "Could not print diagnostic info because two last rows aren't in buffer (rare case)\n";
return;
}
size_t max_length_of_column_name = 0;
for (size_t i = 0; i < sample.columns(); ++i)
if (sample.getByPosition(i).name.size() > max_length_of_column_name)
max_length_of_column_name = sample.getByPosition(i).name.size();
size_t max_length_of_data_type_name = 0;
for (size_t i = 0; i < sample.columns(); ++i)
if (sample.getByPosition(i).type->getName().size() > max_length_of_data_type_name)
max_length_of_data_type_name = sample.getByPosition(i).type->getName().size();
/// Rewind the read cursor to the beginning of the previous or current row and parse it all over again, this time printing detailed information.
if (pos_of_prev_row)
{
istr.position() = pos_of_prev_row;
out << "\nRow " << (row_num - 1) << ":\n";
if (!parseRowAndPrintDiagnosticInfo(out, max_length_of_column_name, max_length_of_data_type_name))
return;
}
else
{
if (!pos_of_current_row)
{
out << "Could not print diagnostic info because parsing of data hasn't started.\n";
return;
}
istr.position() = pos_of_current_row;
}
out << "\nRow " << row_num << ":\n";
parseRowAndPrintDiagnosticInfo(out, max_length_of_column_name, max_length_of_data_type_name);
out << "\n";
}
static void verbosePrintString(BufferBase::Position begin, BufferBase::Position end, WriteBuffer & out)
{
if (end == begin)
{
out << "<EMPTY>";
return;
}
out << "\"";
for (auto pos = begin; pos < end; ++pos)
{
switch (*pos)
{
case '\0':
out << "<ASCII NUL>";
break;
case '\b':
out << "<BACKSPACE>";
break;
case '\f':
out << "<FORM FEED>";
break;
case '\n':
out << "<LINE FEED>";
break;
case '\r':
out << "<CARRIAGE RETURN>";
break;
case '\t':
out << "<TAB>";
break;
case '\\':
out << "<BACKSLASH>";
break;
case '"':
out << "<DOUBLE QUOTE>";
break;
case '\'':
out << "<SINGLE QUOTE>";
break;
default:
{
if (*pos >= 0 && *pos < 32)
{
static const char * hex = "0123456789ABCDEF";
out << "<0x" << hex[*pos / 16] << hex[*pos % 16] << ">";
}
else
out << *pos;
}
}
}
out << "\"";
}
bool TabSeparatedRowInputStream::parseRowAndPrintDiagnosticInfo(
WriteBuffer & out, size_t max_length_of_column_name, size_t max_length_of_data_type_name)
{
size_t size = data_types.size();
for (size_t i = 0; i < size; ++i)
{
if (i == 0 && istr.eof())
{
row.clear();
out << "<End of stream>\n";
return false;
}
data_types[i]->deserializeTextEscaped(row[i], istr);
/// Skip the delimiters.
out << "Column " << i << ", " << std::string((i < 10 ? 2 : i < 100 ? 1 : 0), ' ')
<< "name: " << sample.getByPosition(i).name << ", " << std::string(max_length_of_column_name - sample.getByPosition(i).name.size(), ' ')
<< "type: " << data_types[i]->getName() << ", " << std::string(max_length_of_data_type_name - data_types[i]->getName().size(), ' ');
auto prev_position = istr.position();
std::exception_ptr exception;
Field field;
try
{
data_types[i]->deserializeTextEscaped(field, istr);
}
catch (...)
{
exception = std::current_exception();
}
auto curr_position = istr.position();
if (curr_position < prev_position)
throw Exception("Logical error: parsing is non-deterministic.", ErrorCodes::LOGICAL_ERROR);
if (data_types[i]->isNumeric())
{
/// An empty string where a number was expected.
if (curr_position == prev_position)
{
out << "ERROR: text ";
verbosePrintString(prev_position, std::min(prev_position + 10, istr.buffer().end()), out);
out << " is not like " << data_types[i]->getName() << "\n";
return false;
}
}
out << "parsed text: ";
verbosePrintString(prev_position, curr_position, out);
if (exception)
{
if (data_types[i]->getName() == "DateTime")
out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.\n";
else if (data_types[i]->getName() == "Date")
out << "ERROR: Date must be in YYYY-MM-DD format.\n";
else
out << "ERROR\n";
return false;
}
out << "\n";
if (data_types[i]->isNumeric())
{
if (*curr_position != '\n' && *curr_position != '\t')
{
out << "ERROR: garbage after " << data_types[i]->getName() << ": ";
verbosePrintString(curr_position, std::min(curr_position + 10, istr.buffer().end()), out);
out << "\n";
if (data_types[i]->getName() == "DateTime")
out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.\n";
else if (data_types[i]->getName() == "Date")
out << "ERROR: Date must be in YYYY-MM-DD format.\n";
return false;
}
}
if ( (typeid_cast<const DataTypeUInt8 *>(data_types[i].get()) && field.get<UInt64>() > std::numeric_limits<UInt8>::max())
|| (typeid_cast<const DataTypeUInt16 *>(data_types[i].get()) && field.get<UInt64>() > std::numeric_limits<UInt16>::max())
|| (typeid_cast<const DataTypeUInt32 *>(data_types[i].get()) && field.get<UInt64>() > std::numeric_limits<UInt32>::max())
|| (typeid_cast<const DataTypeInt8 *>(data_types[i].get())
&& (field.get<Int64>() > std::numeric_limits<Int8>::max() || field.get<Int64>() < std::numeric_limits<Int8>::min()))
|| (typeid_cast<const DataTypeInt16 *>(data_types[i].get())
&& (field.get<Int64>() > std::numeric_limits<Int16>::max() || field.get<Int64>() < std::numeric_limits<Int16>::min()))
|| (typeid_cast<const DataTypeInt32 *>(data_types[i].get())
&& (field.get<Int64>() > std::numeric_limits<Int32>::max() || field.get<Int64>() < std::numeric_limits<Int32>::min())))
{
out << "ERROR: parsed number is out of range of data type.\n";
return false;
}
/// Delimiters.
if (i + 1 == size)
{
if (!istr.eof())
assertString("\n", istr);
{
try
{
assertString("\n", istr);
}
catch (const DB::Exception &)
{
if (*istr.position() == '\t')
{
out << "ERROR: Tab found where line feed is expected."
" It's like your file has more columns than expected.\n"
"And if your file have right number of columns, maybe it have unescaped tab in value.\n";
}
else if (*istr.position() == '\r')
{
out << "ERROR: Carriage return found where line feed is expected."
" It's like your file has DOS/Windows style line separators, that is illegal in TabSeparated format.\n";
}
else
{
out << "ERROR: There is no line feed. ";
verbosePrintString(istr.position(), istr.position() + 1, out);
out << " found instead.\n";
}
return false;
}
}
}
else
assertString("\t", istr);
{
try
{
assertString("\t", istr);
}
catch (const DB::Exception &)
{
if (*istr.position() == '\n')
{
out << "ERROR: Line feed found where tab is expected."
" It's like your file has less columns than expected.\n"
"And if your file have right number of columns, maybe it have unescaped backslash in value before tab, which cause tab has escaped.\n";
}
else if (*istr.position() == '\r')
{
out << "ERROR: Carriage return found where tab is expected.\n";
}
else
{
out << "ERROR: There is no tab. ";
verbosePrintString(istr.position(), istr.position() + 1, out);
out << " found instead.\n";
}
return false;
}
}
}
return true;
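
The diagnostics above are easiest to see with deliberately malformed input; a hedged illustration (test.t and its columns d and s are hypothetical):

# A DOS-style line ending triggers the checkForCarriageReturn hint on the first row:
printf '2015-01-01\thello\r\n' | clickhouse-client --query="INSERT INTO test.t (d, s) FORMAT TabSeparated"
# An extra column: a tab is found where a line feed is expected, and the
# per-column report from parseRowAndPrintDiagnosticInfo is appended to the exception:
printf '2015-01-01\thello\textra\n' | clickhouse-client --query="INSERT INTO test.t (d, s) FORMAT TabSeparated"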

View File

@ -333,6 +333,10 @@ void registerFunctionsMiscellaneous(FunctionFactory & factory)
factory.registerFunction<FunctionIn<false, true>>();
factory.registerFunction<FunctionIn<true, false>>();
factory.registerFunction<FunctionIn<true, true>>();
factory.registerFunction<FunctionIsFinite>();
factory.registerFunction<FunctionIsInfinite>();
factory.registerFunction<FunctionIsNaN>();
}
}

View File

@ -69,7 +69,7 @@ void readString(String & s, ReadBuffer & buf)
*
* Uses SSE2, which gives roughly a 1.7x speedup (compared to a trivial loop)
* when parsing a typical tab-separated file with string values.
* SSE4.2 could have been used, but it is not yet supported on all of our servers.
* SSE4.2 could have been used, but at the time this code was written it was not supported on all of our servers (by now it is supported everywhere).
* There is no performance drop when parsing files with short strings.
*/
static inline const char * find_first_tab_lf_or_backslash(const char * begin, const char * end)
@ -232,6 +232,44 @@ void readBackQuotedString(String & s, ReadBuffer & buf)
}
void readDateTimeTextFallback(time_t & datetime, ReadBuffer & buf)
{
char s[19];
size_t size = buf.read(s, 10);
if (10 != size)
{
s[size] = 0;
throw Exception(std::string("Cannot parse datetime ") + s, ErrorCodes::CANNOT_PARSE_DATETIME);
}
if (s[4] < '0' || s[4] > '9')
{
size_t size = buf.read(&s[10], 9);
if (9 != size)
{
s[10 + size] = 0;
throw Exception(std::string("Cannot parse datetime ") + s, ErrorCodes::CANNOT_PARSE_DATETIME);
}
UInt16 year = (s[0] - '0') * 1000 + (s[1] - '0') * 100 + (s[2] - '0') * 10 + (s[3] - '0');
UInt8 month = (s[5] - '0') * 10 + (s[6] - '0');
UInt8 day = (s[8] - '0') * 10 + (s[9] - '0');
UInt8 hour = (s[11] - '0') * 10 + (s[12] - '0');
UInt8 minute = (s[14] - '0') * 10 + (s[15] - '0');
UInt8 second = (s[17] - '0') * 10 + (s[18] - '0');
if (unlikely(year == 0))
datetime = 0;
else
datetime = DateLUT::instance().makeDateTime(year, month, day, hour, minute, second);
}
else
datetime = parse<time_t>(s, 10);
}
void readException(Exception & e, ReadBuffer & buf, const String & additional_message)
{
int code = 0;
@ -239,7 +277,7 @@ void readException(Exception & e, ReadBuffer & buf, const String & additional_me
String message;
String stack_trace;
bool has_nested = false;
readBinary(code, buf);
readBinary(name, buf);
readBinary(message, buf);

View File

@ -218,6 +218,7 @@ void Compiler::compile(
" -I /usr/share/clickhouse/headers/libs/libcityhash/"
" -I /usr/share/clickhouse/headers/libs/libcommon/include/"
" -I /usr/share/clickhouse/headers/libs/libdouble-conversion/"
" -I /usr/share/clickhouse/headers/libs/libcpuid/include/"
" -I /usr/share/clickhouse/headers/libs/libmysqlxx/include/"
" -I /usr/share/clickhouse/headers/libs/libstatdaemons/include/"
" -I /usr/share/clickhouse/headers/libs/libstats/include/"

View File

@ -492,37 +492,51 @@ Context & Context::getGlobalContext()
const Dictionaries & Context::getDictionaries() const
{
Poco::ScopedLock<Poco::Mutex> lock(shared->mutex);
tryCreateDictionaries();
return *shared->dictionaries;
return getDictionariesImpl(false);
}
const ExternalDictionaries & Context::getExternalDictionaries() const
{
return getExternalDictionariesImpl(false);
}
const Dictionaries & Context::getDictionariesImpl(const bool throw_on_error) const
{
Poco::ScopedLock<Poco::Mutex> lock(shared->mutex);
tryCreateExternalDictionaries();
return *shared->external_dictionaries;
}
void Context::tryCreateDictionaries(const bool throw_on_error) const
{
if (!shared->dictionaries)
shared->dictionaries = new Dictionaries{throw_on_error};
return *shared->dictionaries;
}
void Context::tryCreateExternalDictionaries(const bool throw_on_error) const
const ExternalDictionaries & Context::getExternalDictionariesImpl(const bool throw_on_error) const
{
Poco::ScopedLock<Poco::Mutex> lock(shared->mutex);
if (!shared->external_dictionaries)
{
if (!this->global_context)
throw Exception("Logical error: there is no global context", ErrorCodes::LOGICAL_ERROR);
shared->external_dictionaries = new ExternalDictionaries{*this->global_context, throw_on_error};
}
return *shared->external_dictionaries;
}
void Context::tryCreateDictionaries() const
{
static_cast<void>(getDictionariesImpl(true));
}
void Context::tryCreateExternalDictionaries() const
{
static_cast<void>(getExternalDictionariesImpl(true));
}

View File

@ -25,6 +25,7 @@
#include <DB/Interpreters/ExpressionAnalyzer.h>
#include <DB/DataTypes/DataTypesNumberFixed.h>
#include <DB/DataTypes/DataTypeNested.h>
#include <DB/DataTypes/DataTypeFixedString.h>
namespace DB
@ -293,13 +294,31 @@ InterpreterCreateQuery::ColumnsAndDefaults InterpreterCreateQuery::parseColumns(
* 2. conversion of expression (1) to explicitly-specified type alias as column name */
if (col_decl.type)
{
const auto tmp_column_name = col_decl.name + "_tmp";
const auto & final_column_name = col_decl.name;
const auto conversion_function_name = "to" + columns.back().type->getName();
const auto tmp_column_name = final_column_name + "_tmp";
const auto data_type_ptr = columns.back().type.get();
default_expr_list->children.emplace_back(setAlias(
makeASTFunction(conversion_function_name, ASTPtr{new ASTIdentifier{{}, tmp_column_name}}),
final_column_name));
/// specific code for different data types, e.g. toFixedString(col, N) for DataTypeFixedString
if (const auto fixed_string = typeid_cast<const DataTypeFixedString *>(data_type_ptr))
{
const auto conversion_function_name = "toFixedString";
default_expr_list->children.emplace_back(setAlias(
makeASTFunction(
conversion_function_name,
ASTPtr{new ASTIdentifier{{}, tmp_column_name}},
ASTPtr{new ASTLiteral{{}, fixed_string->getN()}}),
final_column_name));
}
else
{
/// @todo fix for parametric types, results in broken code, i.e. toArray(ElementType)(col)
const auto conversion_function_name = "to" + data_type_ptr->getName();
default_expr_list->children.emplace_back(setAlias(
makeASTFunction(conversion_function_name, ASTPtr{new ASTIdentifier{{}, tmp_column_name}}),
final_column_name));
}
default_expr_list->children.emplace_back(setAlias(col_decl.default_expression->clone(), tmp_column_name));
}

View File

@ -111,6 +111,15 @@ bool ParserInsertQuery::parseImpl(Pos & pos, Pos end, ASTPtr & node, Expected &
ParserWhiteSpaceOrComments ws_without_nl(false);
ws_without_nl.ignore(pos, end);
if (pos != end && *pos == ';')
throw Exception("You have excessive ';' symbol before data for INSERT.\n"
"Example:\n\n"
"INSERT INTO t (x, y) FORMAT TabSeparated\n"
"1\tHello\n"
"2\tWorld\n"
"\n"
"Note that there is no ';' in first line.", ErrorCodes::SYNTAX_ERROR);
if (pos != end && *pos == '\n')
++pos;

View File

@ -8,6 +8,7 @@
#include <Yandex/ErrorHandlers.h>
#include <Yandex/Revision.h>
#include <statdaemons/ConfigProcessor.h>
#include <statdaemons/ext/scope_guard.hpp>
#include <memory>
#include <DB/Interpreters/loadMetadata.h>
@ -536,6 +537,25 @@ int Server::main(const std::vector<std::string> & args)
global_context->setCurrentDatabase(config().getString("default_database", "default"));
SCOPE_EXIT(
LOG_DEBUG(log, "Closed all connections.");
/** Ask all table engines to finish their background work.
* It is important to do this in advance, not in the Context destructor, because
* table engines may still be using the Context while they are being destroyed.
*/
LOG_INFO(log, "Shutting down storages.");
global_context->shutdown();
LOG_DEBUG(log, "Shutted down storages.");
/** Destroy the context explicitly: this is more convenient than doing it in the Server destructor, since the logger is still available.
* At this point, nobody else should own the shared part of the context.
*/
global_context.reset();
LOG_DEBUG(log, "Destroyed global context.");
);
{
const auto profile_events_transmitter = config().getBool("use_graphite", true)
? std::make_unique<ProfileEventsTransmitter>()
@ -609,54 +629,39 @@ int Server::main(const std::vector<std::string> & args)
if (olap_http_server)
olap_http_server->start();
LOG_INFO(log, "Ready for connections.");
SCOPE_EXIT(
LOG_DEBUG(log, "Received termination signal. Waiting for current connections to close.");
users_config_reloader.reset();
is_cancelled = true;
http_server.stop();
tcp_server.stop();
if (use_olap_server)
olap_http_server->stop();
);
/// try to load dictionaries immediately, throw on error and die
try
{
if (!config().getBool("dictionaries_lazy_load", true))
{
global_context->tryCreateDictionaries(true);
global_context->tryCreateExternalDictionaries(true);
global_context->tryCreateDictionaries();
global_context->tryCreateExternalDictionaries();
}
LOG_INFO(log, "Ready for connections.");
waitForTerminationRequest();
}
catch (...)
{
LOG_ERROR(log, "Caught exception while loading dictionaries.");
tryLogCurrentException(log);
throw;
}
LOG_DEBUG(log, "Received termination signal. Waiting for current connections to close.");
users_config_reloader.reset();
is_cancelled = true;
http_server.stop();
tcp_server.stop();
if (use_olap_server)
olap_http_server->stop();
}
LOG_DEBUG(log, "Closed all connections.");
/** Ask all table engines to finish their background work.
* It is important to do this in advance, not in the Context destructor, because
* table engines may still be using the Context while they are being destroyed.
*/
LOG_INFO(log, "Shutting down storages.");
global_context->shutdown();
LOG_DEBUG(log, "Shutted down storages.");
/** Destroy the context explicitly: this is more convenient than doing it in the Server destructor, since the logger is still available.
* At this point, nobody else should own the shared part of the context.
*/
global_context.reset();
LOG_DEBUG(log, "Destroyed global context.");
return Application::EXIT_OK;
}

View File

@ -3,6 +3,7 @@
#include <DB/DataTypes/DataTypesNumberFixed.h>
#include <DB/DataTypes/DataTypeNested.h>
#include <DB/DataTypes/DataTypeArray.h>
#include <DB/DataTypes/DataTypeFixedString.h>
#include <DB/Interpreters/Context.h>
#include <DB/Interpreters/ExpressionAnalyzer.h>
#include <DB/Parsers/ASTIdentifier.h>
@ -212,13 +213,31 @@ namespace DB
{
if (command.data_type)
{
const auto & column_name = command.column_name;
const auto tmp_column_name = column_name + "_tmp";
const auto conversion_function_name = "to" + command.data_type->getName();
const auto & final_column_name = command.column_name;
const auto tmp_column_name = final_column_name + "_tmp";
const auto data_type_ptr = command.data_type.get();
default_expr_list->children.emplace_back(setAlias(
makeASTFunction(conversion_function_name, ASTPtr{new ASTIdentifier{{}, tmp_column_name}}),
column_name));
/// specific code for different data types, e.g. toFixedString(col, N) for DataTypeFixedString
if (const auto fixed_string = typeid_cast<const DataTypeFixedString *>(data_type_ptr))
{
const auto conversion_function_name = "toFixedString";
default_expr_list->children.emplace_back(setAlias(
makeASTFunction(
conversion_function_name,
ASTPtr{new ASTIdentifier{{}, tmp_column_name}},
ASTPtr{new ASTLiteral{{}, fixed_string->getN()}}),
final_column_name));
}
else
{
/// @todo fix for parametric types, results in broken code, i.e. toArray(ElementType)(col)
const auto conversion_function_name = "to" + data_type_ptr->getName();
default_expr_list->children.emplace_back(setAlias(
makeASTFunction(conversion_function_name, ASTPtr{new ASTIdentifier{{}, tmp_column_name}}),
final_column_name));
}
default_expr_list->children.emplace_back(setAlias(command.default_expression->clone(), tmp_column_name));

View File

@ -13,6 +13,7 @@
#include <DB/IO/WriteBufferFromFile.h>
#include <DB/IO/CompressedReadBuffer.h>
#include <DB/DataTypes/DataTypeDate.h>
#include <DB/DataTypes/DataTypeFixedString.h>
#include <DB/Common/localBackup.h>
#include <DB/Functions/FunctionFactory.h>
@ -466,7 +467,8 @@ void MergeTreeData::createConvertExpression(const DataPartPtr & part, const Name
}
else
{
String new_type_name = new_types[column.name]->getName();
const auto new_type = new_types[column.name].get();
const String new_type_name = new_type->getName();
if (new_type_name != column.type->getName() &&
(!part || part->hasColumnFiles(column.name)))
@ -478,13 +480,31 @@ void MergeTreeData::createConvertExpression(const DataPartPtr & part, const Name
out_expression->addInput(ColumnWithNameAndType(nullptr, column.type, column.name));
const FunctionPtr & function = FunctionFactory::instance().get("to" + new_type_name, context);
Names out_names;
out_expression->add(ExpressionAction::applyFunction(function, Names(1, column.name)), out_names);
if (const auto fixed_string = typeid_cast<const DataTypeFixedString *>(new_type))
{
const auto width = fixed_string->getN();
const auto string_width_column = toString(width);
out_expression->addInput({ new ColumnConstUInt64{1, width}, new DataTypeUInt64, string_width_column });
const auto function = FunctionFactory::instance().get("toFixedString", context);
out_expression->add(ExpressionAction::applyFunction(function, Names{
column.name, string_width_column
}), out_names);
out_expression->add(ExpressionAction::removeColumn(string_width_column));
}
else
{
const FunctionPtr & function = FunctionFactory::instance().get("to" + new_type_name, context);
out_expression->add(ExpressionAction::applyFunction(function, Names{column.name}), out_names);
}
out_expression->add(ExpressionAction::removeColumn(column.name));
String escaped_expr = escapeForFileName(out_names[0]);
String escaped_column = escapeForFileName(column.name);
const String escaped_expr = escapeForFileName(out_names[0]);
const String escaped_column = escapeForFileName(column.name);
out_rename_map[escaped_expr + ".bin"] = escaped_column + ".bin";
out_rename_map[escaped_expr + ".mrk"] = escaped_column + ".mrk";
}
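
The same special case appears here for ALTER-driven conversions: a constant column holding the width N is added as an extra input, toFixedString(col, N) is applied, and the helper column is removed. A hypothetical ALTER that would exercise this path:

clickhouse-client --query="ALTER TABLE test.fs MODIFY COLUMN s FixedString(16)"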

View File

@ -227,12 +227,11 @@ class StorageChunkMerger::MergeTask
{
public:
MergeTask(const StorageChunkMerger & chunk_merger_, DB::Context & context_, Logger * log_)
:
shutdown_called(false),
chunk_merger(chunk_merger_),
context(context_),
log(log_),
merging(false)
: shutdown_called(false),
chunk_merger(chunk_merger_),
context(context_),
log(log_),
merging(false)
{
}
@ -551,6 +550,7 @@ bool StorageChunkMerger::MergeTask::mergeChunks(const Storages & chunks)
if (shutdown_called)
{
LOG_INFO(log, "Shutdown requested while merging chunks.");
output->writeSuffix();
new_storage.removeReference(); /// After this, the temporary data will be deleted.
return false;
}

View File

@ -164,7 +164,21 @@ StorageChunks::StorageChunks(
if (!attach)
reference_counter.add(1, true);
loadIndex();
_table_column_name = "_table" + VirtualColumnUtils::chooseSuffix(getColumnsList(), "_table");
try
{
loadIndex();
}
catch (Exception & e)
{
if (e.code() != ErrorCodes::SIZES_OF_MARKS_FILES_ARE_INCONSISTENT)
throw;
e.addMessage("Table " + name_ + " is broken and loaded as empty.");
tryLogCurrentException(__PRETTY_FUNCTION__);
return;
}
/// Create all tables of type ChunkRef. They must reside in the same database.
{
@ -180,8 +194,6 @@ StorageChunks::StorageChunks(
context.addTable(database_name, it->first, StorageChunkRef::create(it->first, context, database_name, name, true));
}
}
_table_column_name = "_table" + VirtualColumnUtils::chooseSuffix(getColumnsList(), "_table");
}
NameAndTypePair StorageChunks::getColumn(const String & column_name) const

View File

@ -103,13 +103,25 @@ public:
addStream(column.name, *column.type);
}
~LogBlockOutputStream() { writeSuffix(); }
~LogBlockOutputStream()
{
try
{
writeSuffix();
}
catch (...)
{
tryLogCurrentException(__PRETTY_FUNCTION__);
}
}
void write(const Block & block);
void writeSuffix();
private:
StorageLog & storage;
Poco::ScopedWriteRWLock lock;
bool done = false;
struct Stream
{
@ -362,6 +374,10 @@ void LogBlockOutputStream::write(const Block & block)
void LogBlockOutputStream::writeSuffix()
{
if (done)
return;
done = true;
/// Finish writing.
marks_stream.next();

View File

@ -86,13 +86,22 @@ public:
~TinyLogBlockOutputStream()
{
writeSuffix();
try
{
writeSuffix();
}
catch (...)
{
tryLogCurrentException(__PRETTY_FUNCTION__);
}
}
void write(const Block & block);
void writeSuffix();
private:
StorageTinyLog & storage;
bool done = false;
struct Stream
{
@ -349,6 +358,10 @@ void TinyLogBlockOutputStream::writeData(const String & name, const IDataType &
void TinyLogBlockOutputStream::writeSuffix()
{
if (done)
return;
done = true;
/// Finish writing.
for (FileStreams::iterator it = streams.begin(); it != streams.end(); ++it)
it->second->finalize();

File diff suppressed because it is too large

View File

@ -1,3 +0,0 @@
79628
79628
102851

View File

@ -1,5 +0,0 @@
/* Note that the queries are written as if the user does not understand the meaning of the _ symbol in a LIKE expression. */
SELECT count() FROM test.hits WHERE URL LIKE '%/avtomobili_s_probegom/_%__%__%__%';
SELECT count() FROM test.hits WHERE URL LIKE '/avtomobili_s_probegom/_%__%__%__%';
SELECT count() FROM test.hits WHERE URL LIKE '%_/avtomobili_s_probegom/_%__%__%__%';
SELECT count() FROM test.hits WHERE URL LIKE '%avtomobili%';

View File

@ -0,0 +1 @@
SELECT min(ts = toUInt32(toDateTime(toString(ts)))) FROM (SELECT 1000000000 + 1234 * number AS ts FROM system.numbers LIMIT 1000000);

View File

@ -0,0 +1,2 @@
2015-01-01 hello world
2015-01-01 hello1 xxx world1

View File

@ -0,0 +1,11 @@
DROP TABLE IF EXISTS test.prewhere;
CREATE TABLE test.prewhere (d Date, a String, b String) ENGINE = MergeTree(d, d, 8192);
INSERT INTO test.prewhere VALUES ('2015-01-01', 'hello', 'world');
ALTER TABLE test.prewhere ADD COLUMN a1 String AFTER a;
INSERT INTO test.prewhere VALUES ('2015-01-01', 'hello1', 'xxx', 'world1');
SELECT d, a, a1, b FROM test.prewhere PREWHERE a LIKE 'hello%' ORDER BY a1;
DROP TABLE test.prewhere;

View File

@ -0,0 +1 @@
2009-02-01 1234567890

View File

@ -0,0 +1,7 @@
DROP TABLE IF EXISTS test.default;
CREATE TABLE test.default (d Date DEFAULT toDate(t), t DateTime) ENGINE = MergeTree(d, t, 8192);
INSERT INTO test.default (t) VALUES ('1234567890');
SELECT toStartOfMonth(d), toUInt32(t) FROM test.default;
DROP TABLE test.default;

View File

@ -0,0 +1 @@
SELECT min(ts = toUInt32(toDateTime(toString(ts)))) FROM (SELECT 1000000000 + 1234 * number AS ts FROM system.numbers LIMIT 1000000);

View File

@ -0,0 +1,30 @@
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1

View File

@ -0,0 +1,33 @@
select isFinite(0) = 1;
select isFinite(1) = 1;
select isFinite(materialize(0)) = 1;
select isFinite(materialize(1)) = 1;
select isFinite(1/0) = 0;
select isFinite(-1/0) = 0;
select isFinite(0/0) = 0;
select isFinite(inf) = 0;
select isFinite(-inf) = 0;
select isFinite(nan) = 0;
select isInfinite(0) = 0;
select isInfinite(1) = 0;
select isInfinite(materialize(0)) = 0;
select isInfinite(materialize(1)) = 0;
select isInfinite(1/0) = 1;
select isInfinite(-1/0) = 1;
select isInfinite(0/0) = 0;
select isInfinite(inf) = 1;
select isInfinite(-inf) = 1;
select isInfinite(nan) = 0;
select isNaN(0) = 0;
select isNaN(1) = 0;
select isNaN(materialize(0)) = 0;
select isNaN(materialize(1)) = 0;
select isNaN(1/0) = 0;
select isNaN(-1/0) = 0;
select isNaN(0/0) = 1;
select isNaN(inf) = 0;
select isNaN(-inf) = 0;
select isNaN(nan) = 1;

View File

@ -0,0 +1,2 @@
1
1

View File

@ -0,0 +1,2 @@
SELECT match(materialize('Hello'), '');
SELECT match('Hello', '');

View File

@ -0,0 +1,16 @@
0
1
1
1
1
0
1
1
0
1
1
1
1
0
1
1

View File

@ -0,0 +1,18 @@
SELECT materialize('Hello') LIKE '';
SELECT materialize('Hello') LIKE '%';
SELECT materialize('Hello') LIKE '%%';
SELECT materialize('Hello') LIKE '%%%';
SELECT materialize('Hello') LIKE '%_%';
SELECT materialize('Hello') LIKE '_';
SELECT materialize('Hello') LIKE '_%';
SELECT materialize('Hello') LIKE '%_';
SELECT 'Hello' LIKE '';
SELECT 'Hello' LIKE '%';
SELECT 'Hello' LIKE '%%';
SELECT 'Hello' LIKE '%%%';
SELECT 'Hello' LIKE '%_%';
SELECT 'Hello' LIKE '_';
SELECT 'Hello' LIKE '_%';
SELECT 'Hello' LIKE '%_';