Merge pull request #7498 from zhang2014/docs/diff_script

add easy diff for document tracking

Commit b2f72c7857

docs/tools/easy_diff.py (new executable file, 146 lines)
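The new helper is meant to be run from the repository root. A minimal sketch of driving it from Python, where the `zh/data_types` argument is the example from the script's own docstring:

```python
import subprocess

# Diff the zh data_types docs against en and write the result to stdout
# instead of a pager ('zh/data_types' is the docstring's example path).
subprocess.check_call(['docs/tools/easy_diff.py', '--no-pager', 'zh/data_types'])
```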
@@ -0,0 +1,146 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os, sys
import argparse
import subprocess
import contextlib
from git import cmd
from tempfile import NamedTemporaryFile

SCRIPT_DESCRIPTION = '''
usage: ./easy_diff.py language/document path

Show the difference between a language document and an English document.

This script is based on the assumption that documents in other languages are fully synchronized with the en document at a commit.

For example:
    Execute:
        ./easy_diff.py --no-pager zh/data_types
    Output:
        Need translate document:~/ClickHouse/docs/en/data_types/uuid.md
        Need link document:~/ClickHouse/docs/en/data_types/decimal.md to ~/ClickHouse/docs/zh/data_types/decimal.md
        diff --git a/docs/en/data_types/domains/ipv6.md b/docs/en/data_types/domains/ipv6.md
        index 1bfbe3400b..e2abaff017 100644
        --- a/docs/en/data_types/domains/ipv6.md
        +++ b/docs/en/data_types/domains/ipv6.md
        @@ -4,13 +4,13 @@

         ### Basic Usage

        -``` sql
        +```sql
         CREATE TABLE hits (url String, from IPv6) ENGINE = MergeTree() ORDER BY url;

         DESCRIBE TABLE hits;
         ```

        -```
        +```text
         ┌─name─┬─type───┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┐
         │ url  │ String │              │                    │         │                  │
         │ from │ IPv6   │              │                    │         │                  │
        @@ -19,19 +19,19 @@ DESCRIBE TABLE hits;

         OR you can use `IPv6` domain as a key:

        -``` sql
        +```sql
         CREATE TABLE hits (url String, from IPv6) ENGINE = MergeTree() ORDER BY from;
        ... MORE

OPTIONS:
    -h, --help    show this help message and exit
    --no-pager    use stdout as difference result output
'''

SCRIPT_PATH = os.path.abspath(__file__)
CLICKHOUSE_REPO_HOME = os.path.join(os.path.dirname(SCRIPT_PATH), '..', '..')
SCRIPT_COMMAND_EXECUTOR = cmd.Git(CLICKHOUSE_REPO_HOME)

SCRIPT_COMMAND_PARSER = argparse.ArgumentParser(add_help=False)
SCRIPT_COMMAND_PARSER.add_argument('path', type=bytes, nargs='?', default=None)
SCRIPT_COMMAND_PARSER.add_argument('--no-pager', action='store_true', default=False)
SCRIPT_COMMAND_PARSER.add_argument('-h', '--help', action='store_true', default=False)


def execute(commands):
    # Run a raw git command inside the repository root.
    return SCRIPT_COMMAND_EXECUTOR.execute(commands)


def get_hash(file_name):
    # Hash of the last commit that touched the file.
    return execute(['git', 'log', '-n', '1', '--pretty=format:"%H"', file_name])


def diff_file(reference_file, working_file, out):
    if not os.path.exists(reference_file):
        raise RuntimeError('reference file [' + os.path.abspath(reference_file) + '] does not exist.')

    if os.path.islink(working_file):
        # A symlink to the en document means the page is not translated yet.
        out.writelines(["Need translate document:" + os.path.abspath(reference_file)])
    elif not os.path.exists(working_file):
        out.writelines(['Need link document: ' + os.path.abspath(reference_file) + ' to ' + os.path.abspath(working_file)])
    elif get_hash(working_file) != get_hash(reference_file):
        # The translation lags behind en: show what changed in en since the
        # commit that last touched the translated file.
        out.writelines([(execute(['git', 'diff', get_hash(working_file).strip('"'), reference_file]).encode('utf-8'))])

    return 0


def diff_directory(reference_directory, working_directory, out):
    if not os.path.isdir(reference_directory):
        return diff_file(reference_directory, working_directory, out)

    for list_item in os.listdir(reference_directory):
        working_item = os.path.join(working_directory, list_item)
        reference_item = os.path.join(reference_directory, list_item)
        # Parentheses matter here: compare the result of whichever branch ran.
        if (diff_file(reference_item, working_item, out) if os.path.isfile(reference_item)
                else diff_directory(reference_item, working_item, out)) != 0:
            return 1

    return 0


def find_language_doc(custom_document, other_language='en', children=[]):
    # Walk upwards from the given path until the docs root is reached, then
    # rebuild the same relative path under the other language.
    if len(custom_document) == 0:
        raise RuntimeError('The ' + os.path.join(custom_document, *children) + " is not in docs directory.")

    if os.path.samefile(os.path.join(CLICKHOUSE_REPO_HOME, 'docs'), custom_document):
        return os.path.join(CLICKHOUSE_REPO_HOME, 'docs', other_language, *children[1:])
    children.insert(0, os.path.split(custom_document)[1])
    return find_language_doc(os.path.split(custom_document)[0], other_language, children)


class ToPager:
    # Collects output in a temporary file, then opens it in GIT_PAGER on close.
    def __init__(self, temp_named_file):
        self.temp_named_file = temp_named_file

    def writelines(self, lines):
        self.temp_named_file.writelines(lines)

    def close(self):
        self.temp_named_file.flush()
        git_pager = execute(['git', 'var', 'GIT_PAGER'])
        subprocess.check_call([git_pager, self.temp_named_file.name])
        self.temp_named_file.close()


class ToStdOut:
    # Writes output straight to stdout (used with --no-pager).
    def __init__(self, system_stdout_stream):
        self.system_stdout_stream = system_stdout_stream

    def writelines(self, lines):
        self.system_stdout_stream.writelines(lines)

    def close(self):
        self.system_stdout_stream.flush()


if __name__ == '__main__':
    arguments = SCRIPT_COMMAND_PARSER.parse_args()
    if arguments.help or not arguments.path:
        sys.stdout.write(SCRIPT_DESCRIPTION)
        sys.exit(0)

    working_language = os.path.join(CLICKHOUSE_REPO_HOME, 'docs', arguments.path)
    with contextlib.closing(ToStdOut(sys.stdout) if arguments.no_pager else ToPager(NamedTemporaryFile('r+'))) as writer:
        sys.exit(diff_directory(find_language_doc(working_language), working_language, writer))
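The staleness check at the heart of `diff_file` compares the last commit that touched the translated page with the last commit that touched the English page. A minimal sketch of the same idea with plain `subprocess` instead of gitpython (the two document paths are illustrative):

```python
import subprocess

def last_commit(path):
    # Hash of the most recent commit that touched `path`.
    return subprocess.check_output(
        ['git', 'log', '-n', '1', '--pretty=format:%H', path])

# A translated page is assumed to be fully synchronized with the en page at
# the commit that last touched it, so differing hashes mean the en page has
# moved on and the translation needs review.
en = last_commit('docs/en/data_types/uuid.md')
zh = last_commit('docs/zh/data_types/uuid.md')
print('in sync' if en == zh else 'en changed since last sync')
```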
@@ -33,3 +33,4 @@ tornado==5.1
 typing==3.6.2
 Unidecode==1.0.23
 urllib3==1.24.2
+gitpython==2.1.14
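`gitpython` is the new dependency here: it provides the `git.cmd.Git` wrapper that easy_diff.py uses to shell out to git. A minimal sketch of that interface, assuming gitpython is installed:

```python
from git import cmd

# Bind a command runner to a repository root, then hand it a complete git
# command line, the same pattern as SCRIPT_COMMAND_EXECUTOR in easy_diff.py.
git_runner = cmd.Git('.')
print(git_runner.execute(['git', 'rev-parse', 'HEAD']))
```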
@@ -7,21 +7,21 @@
 Run the following commands in the console:
 
 ```bash
-sudo apt-get install s3cmd
-mkdir tiny; cd tiny;
-s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/tiny/ .
-cd ..
-mkdir 1node; cd 1node;
-s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/1node/ .
-cd ..
-mkdir 5nodes; cd 5nodes;
-s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/5nodes/ .
-cd ..
+$ sudo apt-get install s3cmd
+$ mkdir tiny; cd tiny;
+$ s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/tiny/ .
+$ cd ..
+$ mkdir 1node; cd 1node;
+$ s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/1node/ .
+$ cd ..
+$ mkdir 5nodes; cd 5nodes;
+$ s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/5nodes/ .
+$ cd ..
 ```
 
 Run the following queries in ClickHouse:
 
-``` sql
+```sql
 CREATE TABLE rankings_tiny
 (
     pageURL String,
@@ -86,12 +86,12 @@ CREATE TABLE uservisits_5nodes_on_single
 Go back to the console and run the following commands:
 
 ```bash
-for i in tiny/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_tiny FORMAT CSV"; done
-for i in tiny/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_tiny FORMAT CSV"; done
-for i in 1node/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_1node FORMAT CSV"; done
-for i in 1node/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_1node FORMAT CSV"; done
-for i in 5nodes/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_5nodes_on_single FORMAT CSV"; done
-for i in 5nodes/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_5nodes_on_single FORMAT CSV"; done
+$ for i in tiny/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_tiny FORMAT CSV"; done
+$ for i in tiny/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_tiny FORMAT CSV"; done
+$ for i in 1node/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_1node FORMAT CSV"; done
+$ for i in 1node/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_1node FORMAT CSV"; done
+$ for i in 5nodes/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_5nodes_on_single FORMAT CSV"; done
+$ for i in 5nodes/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_5nodes_on_single FORMAT CSV"; done
 ```
 
 A simple query example:
@@ -4,14 +4,14 @@
 Create a table for the raw data:
 
-``` sql
+```sql
 CREATE TABLE criteo_log (date Date, clicked UInt8, int1 Int32, int2 Int32, int3 Int32, int4 Int32, int5 Int32, int6 Int32, int7 Int32, int8 Int32, int9 Int32, int10 Int32, int11 Int32, int12 Int32, int13 Int32, cat1 String, cat2 String, cat3 String, cat4 String, cat5 String, cat6 String, cat7 String, cat8 String, cat9 String, cat10 String, cat11 String, cat12 String, cat13 String, cat14 String, cat15 String, cat16 String, cat17 String, cat18 String, cat19 String, cat20 String, cat21 String, cat22 String, cat23 String, cat24 String, cat25 String, cat26 String) ENGINE = Log
 ```
 
 Download the data:
 
 ```bash
-for i in {00..23}; do echo $i; zcat datasets/criteo/day_${i#0}.gz | sed -r 's/^/2000-01-'${i/00/24}'\t/' | clickhouse-client --host=example-perftest01j --query="INSERT INTO criteo_log FORMAT TabSeparated"; done
+$ for i in {00..23}; do echo $i; zcat datasets/criteo/day_${i#0}.gz | sed -r 's/^/2000-01-'${i/00/24}'\t/' | clickhouse-client --host=example-perftest01j --query="INSERT INTO criteo_log FORMAT TabSeparated"; done
 ```
 
 Create a table for the converted data:
@@ -65,7 +65,7 @@ CREATE TABLE criteo
 Convert the raw data from the first table and write it into the second one:
 
-``` sql
+```sql
 INSERT INTO criteo SELECT date, clicked, int1, int2, int3, int4, int5, int6, int7, int8, int9, int10, int11, int12, int13, reinterpretAsUInt32(unhex(cat1)) AS icat1, reinterpretAsUInt32(unhex(cat2)) AS icat2, reinterpretAsUInt32(unhex(cat3)) AS icat3, reinterpretAsUInt32(unhex(cat4)) AS icat4, reinterpretAsUInt32(unhex(cat5)) AS icat5, reinterpretAsUInt32(unhex(cat6)) AS icat6, reinterpretAsUInt32(unhex(cat7)) AS icat7, reinterpretAsUInt32(unhex(cat8)) AS icat8, reinterpretAsUInt32(unhex(cat9)) AS icat9, reinterpretAsUInt32(unhex(cat10)) AS icat10, reinterpretAsUInt32(unhex(cat11)) AS icat11, reinterpretAsUInt32(unhex(cat12)) AS icat12, reinterpretAsUInt32(unhex(cat13)) AS icat13, reinterpretAsUInt32(unhex(cat14)) AS icat14, reinterpretAsUInt32(unhex(cat15)) AS icat15, reinterpretAsUInt32(unhex(cat16)) AS icat16, reinterpretAsUInt32(unhex(cat17)) AS icat17, reinterpretAsUInt32(unhex(cat18)) AS icat18, reinterpretAsUInt32(unhex(cat19)) AS icat19, reinterpretAsUInt32(unhex(cat20)) AS icat20, reinterpretAsUInt32(unhex(cat21)) AS icat21, reinterpretAsUInt32(unhex(cat22)) AS icat22, reinterpretAsUInt32(unhex(cat23)) AS icat23, reinterpretAsUInt32(unhex(cat24)) AS icat24, reinterpretAsUInt32(unhex(cat25)) AS icat25, reinterpretAsUInt32(unhex(cat26)) AS icat26 FROM criteo_log;
 
 DROP TABLE criteo_log;
File diff suppressed because one or more lines are too long
@@ -1,6 +1,13 @@
 # Flight data
 
+The flight data can be obtained in two ways:
+
+- import from raw data
+- download prepared partitions
+
+## Import from raw data
+
 Download the data:
 
 ```bash
@@ -134,39 +141,75 @@ CREATE TABLE `ontime` (
 Load the data:
 
 ```bash
-for i in *.zip; do echo $i; unzip -cq $i '*.csv' | sed 's/\.00//g' | clickhouse-client --host=example-perftest01j --query="INSERT INTO ontime FORMAT CSVWithNames"; done
+$ for i in *.zip; do echo $i; unzip -cq $i '*.csv' | sed 's/\.00//g' | clickhouse-client --host=example-perftest01j --query="INSERT INTO ontime FORMAT CSVWithNames"; done
 ```
 
-Queries:
+## Download prepared partitions
+
+```bash
+$ curl -O https://clickhouse-datasets.s3.yandex.net/ontime/partitions/ontime.tar
+$ tar xvf ontime.tar -C /var/lib/clickhouse # path to ClickHouse data directory
+$ # check permissions of unpacked data, fix if required
+$ sudo service clickhouse-server restart
+$ clickhouse-client --query "select count(*) from datasets.ontime"
+```
+
+!!! info
+    To run the SQL queries below, you must use the full table name, `datasets.ontime`.
+
+## Queries:
 
 Q0.
 
 ```sql
-select avg(c1) from (select Year, Month, count(*) as c1 from ontime group by Year, Month);
+SELECT avg(c1)
+FROM
+(
+    SELECT Year, Month, count(*) AS c1
+    FROM ontime
+    GROUP BY Year, Month
+);
 ```
 
 Q1. The number of flights per day from 2000 to 2008
 
 ```sql
-SELECT DayOfWeek, count(*) AS c FROM ontime WHERE Year >= 2000 AND Year <= 2008 GROUP BY DayOfWeek ORDER BY c DESC;
+SELECT DayOfWeek, count(*) AS c
+FROM ontime
+WHERE Year>=2000 AND Year<=2008
+GROUP BY DayOfWeek
+ORDER BY c DESC;
 ```
 
 Q2. The number of flights delayed by more than 10 minutes, grouped by day of week, for 2000-2008
 
 ```sql
-SELECT DayOfWeek, count(*) AS c FROM ontime WHERE DepDelay>10 AND Year >= 2000 AND Year <= 2008 GROUP BY DayOfWeek ORDER BY c DESC
+SELECT DayOfWeek, count(*) AS c
+FROM ontime
+WHERE DepDelay>10 AND Year>=2000 AND Year<=2008
+GROUP BY DayOfWeek
+ORDER BY c DESC;
 ```
 
 Q3. The number of delays by airport for 2000-2008
 
 ```sql
-SELECT Origin, count(*) AS c FROM ontime WHERE DepDelay>10 AND Year >= 2000 AND Year <= 2008 GROUP BY Origin ORDER BY c DESC LIMIT 10
+SELECT Origin, count(*) AS c
+FROM ontime
+WHERE DepDelay>10 AND Year>=2000 AND Year<=2008
+GROUP BY Origin
+ORDER BY c DESC
+LIMIT 10;
 ```
 
 Q4. The number of delays by carrier for 2007
 
 ```sql
-SELECT Carrier, count(*) FROM ontime WHERE DepDelay>10 AND Year = 2007 GROUP BY Carrier ORDER BY count(*) DESC
+SELECT Carrier, count(*)
+FROM ontime
+WHERE DepDelay>10 AND Year=2007
+GROUP BY Carrier
+ORDER BY count(*) DESC;
 ```
 
 Q5. The percentage of delays by carrier for 2007
@@ -198,7 +241,11 @@ ORDER BY c3 DESC;
 Better version of the same query:
 
 ```sql
-SELECT Carrier, avg(DepDelay > 10) * 100 AS c3 FROM ontime WHERE Year = 2007 GROUP BY Carrier ORDER BY Carrier
+SELECT Carrier, avg(DepDelay>10)*100 AS c3
+FROM ontime
+WHERE Year=2007
+GROUP BY Carrier
+ORDER BY Carrier
 ```
 
 Q6. The same as the previous query, but with the range extended to 2000-2008
@@ -212,7 +259,7 @@ FROM
         count(*) AS c
     FROM ontime
     WHERE DepDelay>10
-        AND Year >= 2000 AND Year <= 2008
+        AND Year>=2000 AND Year<=2008
     GROUP BY Carrier
 )
 ANY INNER JOIN
@@ -221,7 +268,7 @@ ANY INNER JOIN
         Carrier,
         count(*) AS c2
     FROM ontime
-    WHERE Year >= 2000 AND Year <= 2008
+    WHERE Year>=2000 AND Year<=2008
    GROUP BY Carrier
 ) USING Carrier
 ORDER BY c3 DESC;
@@ -230,7 +277,11 @@ ORDER BY c3 DESC;
 Better version of the same query:
 
 ```sql
-SELECT Carrier, avg(DepDelay > 10) * 100 AS c3 FROM ontime WHERE Year >= 2000 AND Year <= 2008 GROUP BY Carrier ORDER BY Carrier
+SELECT Carrier, avg(DepDelay>10)*100 AS c3
+FROM ontime
+WHERE Year>=2000 AND Year<=2008
+GROUP BY Carrier
+ORDER BY Carrier;
 ```
 
 Q7. Percentage of flights delayed for more than 10 minutes, by year
@@ -254,41 +305,50 @@ ANY INNER JOIN
     from ontime
     GROUP BY Year
 ) USING (Year)
-ORDER BY Year
+ORDER BY Year;
 ```
 
 Better version of the same query:
 
 ```sql
-SELECT Year, avg(DepDelay > 10) FROM ontime GROUP BY Year ORDER BY Year
+SELECT Year, avg(DepDelay>10)
+FROM ontime
+GROUP BY Year
+ORDER BY Year;
 ```
 
 Q8. The most popular destinations by year
 
 ```sql
-SELECT DestCityName, uniqExact(OriginCityName) AS u FROM ontime WHERE Year >= 2000 and Year <= 2010 GROUP BY DestCityName ORDER BY u DESC LIMIT 10;
+SELECT DestCityName, uniqExact(OriginCityName) AS u
+FROM ontime
+WHERE Year >= 2000 and Year <= 2010
+GROUP BY DestCityName
+ORDER BY u DESC LIMIT 10;
 ```
 
 Q9.
 
 ```sql
-select Year, count(*) as c1 from ontime group by Year;
+SELECT Year, count(*) AS c1
+FROM ontime
+GROUP BY Year;
 ```
 
 Q10.
 
 ```sql
-select
-   min(Year), max(Year), Carrier, count(*) as cnt,
-   sum(ArrDelayMinutes>30) as flights_delayed,
-   round(sum(ArrDelayMinutes>30)/count(*),2) as rate
+SELECT
+   min(Year), max(Year), Carrier, count(*) AS cnt,
+   sum(ArrDelayMinutes>30) AS flights_delayed,
+   round(sum(ArrDelayMinutes>30)/count(*),2) AS rate
 FROM ontime
 WHERE
-   DayOfWeek not in (6,7) and OriginState not in ('AK', 'HI', 'PR', 'VI')
-   and DestState not in ('AK', 'HI', 'PR', 'VI')
-   and FlightDate < '2010-01-01'
+   DayOfWeek NOT IN (6,7) AND OriginState NOT IN ('AK', 'HI', 'PR', 'VI')
+   AND DestState NOT IN ('AK', 'HI', 'PR', 'VI')
+   AND FlightDate < '2010-01-01'
 GROUP by Carrier
-HAVING cnt > 100000 and max(Year) > 1990
+HAVING cnt>100000 and max(Year)>1990
 ORDER by rate DESC
 LIMIT 1000;
 ```
@@ -296,15 +356,39 @@ LIMIT 1000;
 Bonus:
 
 ```sql
-SELECT avg(cnt) FROM (SELECT Year,Month,count(*) AS cnt FROM ontime WHERE DepDel15=1 GROUP BY Year,Month)
+SELECT avg(cnt)
+FROM
+(
+    SELECT Year,Month,count(*) AS cnt
+    FROM ontime
+    WHERE DepDel15=1
+    GROUP BY Year,Month
+);
 
-select avg(c1) from (select Year,Month,count(*) as c1 from ontime group by Year,Month)
+SELECT avg(c1) FROM
+(
+    SELECT Year,Month,count(*) AS c1
+    FROM ontime
+    GROUP BY Year,Month
+);
 
-SELECT DestCityName, uniqExact(OriginCityName) AS u FROM ontime GROUP BY DestCityName ORDER BY u DESC LIMIT 10;
+SELECT DestCityName, uniqExact(OriginCityName) AS u
+FROM ontime
+GROUP BY DestCityName
+ORDER BY u DESC
+LIMIT 10;
 
-SELECT OriginCityName, DestCityName, count() AS c FROM ontime GROUP BY OriginCityName, DestCityName ORDER BY c DESC LIMIT 10;
+SELECT OriginCityName, DestCityName, count() AS c
+FROM ontime
+GROUP BY OriginCityName, DestCityName
+ORDER BY c DESC
+LIMIT 10;
 
-SELECT OriginCityName, count() AS c FROM ontime GROUP BY OriginCityName ORDER BY c DESC LIMIT 10;
+SELECT OriginCityName, count() AS c
+FROM ontime
+GROUP BY OriginCityName
+ORDER BY c DESC
+LIMIT 10;
 ```
 
 This benchmark was provided by Vadim Tkachenko. See:
@@ -1,26 +1,26 @@
 # Star Schema Benchmark
 
-Compiling dbgen:
+编译 dbgen:
 
-```
-git clone git@github.com:vadimtk/ssb-dbgen.git
-cd ssb-dbgen
-make
+```bash
+$ git clone git@github.com:vadimtk/ssb-dbgen.git
+$ cd ssb-dbgen
+$ make
 ```
 
-Generating data:
+开始生成数据:
 
-```
-./dbgen -s 1000 -T c
-./dbgen -s 1000 -T l
-./dbgen -s 1000 -T p
-./dbgen -s 1000 -T s
-./dbgen -s 1000 -T d
+```bash
+$ ./dbgen -s 1000 -T c
+$ ./dbgen -s 1000 -T l
+$ ./dbgen -s 1000 -T p
+$ ./dbgen -s 1000 -T s
+$ ./dbgen -s 1000 -T d
 ```
 
-Creating tables in ClickHouse:
+在ClickHouse中创建表结构:
 
-```
+```sql
 CREATE TABLE customer
 (
         C_CUSTKEY UInt32,
@@ -83,73 +83,85 @@ CREATE TABLE supplier
 ENGINE = MergeTree ORDER BY S_SUPPKEY;
 ```
 
-Inserting data:
+写入数据:
 
-```
-clickhouse-client --query "INSERT INTO customer FORMAT CSV" < customer.tbl
-clickhouse-client --query "INSERT INTO part FORMAT CSV" < part.tbl
-clickhouse-client --query "INSERT INTO supplier FORMAT CSV" < supplier.tbl
-clickhouse-client --query "INSERT INTO lineorder FORMAT CSV" < lineorder.tbl
+```bash
+$ clickhouse-client --query "INSERT INTO customer FORMAT CSV" < customer.tbl
+$ clickhouse-client --query "INSERT INTO part FORMAT CSV" < part.tbl
+$ clickhouse-client --query "INSERT INTO supplier FORMAT CSV" < supplier.tbl
+$ clickhouse-client --query "INSERT INTO lineorder FORMAT CSV" < lineorder.tbl
 ```
 
-Converting "star schema" to denormalized "flat schema":
+将“星型模型”转换为非规范化的“平面模型”:
 
-```
+```sql
 SET max_memory_usage = 20000000000, allow_experimental_multiple_joins_emulation = 1;
 
 CREATE TABLE lineorder_flat
 ENGINE = MergeTree
 PARTITION BY toYear(LO_ORDERDATE)
 ORDER BY (LO_ORDERDATE, LO_ORDERKEY) AS
-SELECT *
-FROM lineorder
-ANY INNER JOIN customer ON LO_CUSTKEY = C_CUSTKEY
-ANY INNER JOIN supplier ON LO_SUPPKEY = S_SUPPKEY
-ANY INNER JOIN part ON LO_PARTKEY = P_PARTKEY;
+SELECT l.*, c.*, s.*, p.*
+FROM lineorder l
+ANY INNER JOIN customer c ON (c.C_CUSTKEY = l.LO_CUSTKEY)
+ANY INNER JOIN supplier s ON (s.S_SUPPKEY = l.LO_SUPPKEY)
+ANY INNER JOIN part p ON (p.P_PARTKEY = l.LO_PARTKEY);
 
 ALTER TABLE lineorder_flat DROP COLUMN C_CUSTKEY, DROP COLUMN S_SUPPKEY, DROP COLUMN P_PARTKEY;
 ```
 
 Running the queries:
 
-```
 Q1.1
+```sql
 SELECT sum(LO_EXTENDEDPRICE * LO_DISCOUNT) AS revenue FROM lineorder_flat WHERE toYear(LO_ORDERDATE) = 1993 AND LO_DISCOUNT BETWEEN 1 AND 3 AND LO_QUANTITY < 25;
 
+```
 Q1.2
+```sql
 SELECT sum(LO_EXTENDEDPRICE * LO_DISCOUNT) AS revenue FROM lineorder_flat WHERE toYYYYMM(LO_ORDERDATE) = 199401 AND LO_DISCOUNT BETWEEN 4 AND 6 AND LO_QUANTITY BETWEEN 26 AND 35;
 
+```
 Q1.3
+```sql
 SELECT sum(LO_EXTENDEDPRICE * LO_DISCOUNT) AS revenue FROM lineorder_flat WHERE toISOWeek(LO_ORDERDATE) = 6 AND toYear(LO_ORDERDATE) = 1994 AND LO_DISCOUNT BETWEEN 5 AND 7 AND LO_QUANTITY BETWEEN 26 AND 35;
 
+```
 Q2.1
+```sql
 SELECT sum(LO_REVENUE), toYear(LO_ORDERDATE) AS year, P_BRAND FROM lineorder_flat WHERE P_CATEGORY = 'MFGR#12' AND S_REGION = 'AMERICA' GROUP BY year, P_BRAND ORDER BY year, P_BRAND;
 
+```
 Q2.2
+```sql
 SELECT sum(LO_REVENUE), toYear(LO_ORDERDATE) AS year, P_BRAND FROM lineorder_flat WHERE P_BRAND BETWEEN 'MFGR#2221' AND 'MFGR#2228' AND S_REGION = 'ASIA' GROUP BY year, P_BRAND ORDER BY year, P_BRAND;
 
+```
 Q2.3
+```sql
 SELECT sum(LO_REVENUE), toYear(LO_ORDERDATE) AS year, P_BRAND FROM lineorder_flat WHERE P_BRAND = 'MFGR#2239' AND S_REGION = 'EUROPE' GROUP BY year, P_BRAND ORDER BY year, P_BRAND;
 
+```
 Q3.1
+```sql
 SELECT C_NATION, S_NATION, toYear(LO_ORDERDATE) AS year, sum(LO_REVENUE) AS revenue FROM lineorder_flat WHERE C_REGION = 'ASIA' AND S_REGION = 'ASIA' AND year >= 1992 AND year <= 1997 GROUP BY C_NATION, S_NATION, year ORDER BY year asc, revenue desc;
 
+```
 Q3.2
+```sql
 SELECT C_CITY, S_CITY, toYear(LO_ORDERDATE) AS year, sum(LO_REVENUE) AS revenue FROM lineorder_flat WHERE C_NATION = 'UNITED STATES' AND S_NATION = 'UNITED STATES' AND year >= 1992 AND year <= 1997 GROUP BY C_CITY, S_CITY, year ORDER BY year asc, revenue desc;
 
+```
 Q3.3
+```sql
 SELECT C_CITY, S_CITY, toYear(LO_ORDERDATE) AS year, sum(LO_REVENUE) AS revenue FROM lineorder_flat WHERE (C_CITY = 'UNITED KI1' OR C_CITY = 'UNITED KI5') AND (S_CITY = 'UNITED KI1' OR S_CITY = 'UNITED KI5') AND year >= 1992 AND year <= 1997 GROUP BY C_CITY, S_CITY, year ORDER BY year asc, revenue desc;
 
+```
 Q3.4
+```sql
 SELECT C_CITY, S_CITY, toYear(LO_ORDERDATE) AS year, sum(LO_REVENUE) AS revenue FROM lineorder_flat WHERE (C_CITY = 'UNITED KI1' OR C_CITY = 'UNITED KI5') AND (S_CITY = 'UNITED KI1' OR S_CITY = 'UNITED KI5') AND toYYYYMM(LO_ORDERDATE) = '199712' GROUP BY C_CITY, S_CITY, year ORDER BY year asc, revenue desc;
 
+```
 Q4.1
+```sql
 SELECT toYear(LO_ORDERDATE) AS year, C_NATION, sum(LO_REVENUE - LO_SUPPLYCOST) AS profit FROM lineorder_flat WHERE C_REGION = 'AMERICA' AND S_REGION = 'AMERICA' AND (P_MFGR = 'MFGR#1' OR P_MFGR = 'MFGR#2') GROUP BY year, C_NATION ORDER BY year, C_NATION;
 
+```
 Q4.2
+```sql
 SELECT toYear(LO_ORDERDATE) AS year, S_NATION, P_CATEGORY, sum(LO_REVENUE - LO_SUPPLYCOST) AS profit FROM lineorder_flat WHERE C_REGION = 'AMERICA' AND S_REGION = 'AMERICA' AND (year = 1997 OR year = 1998) AND (P_MFGR = 'MFGR#1' OR P_MFGR = 'MFGR#2') GROUP BY year, S_NATION, P_CATEGORY ORDER BY year, S_NATION, P_CATEGORY;
 
+```
 Q4.3
+```sql
 SELECT toYear(LO_ORDERDATE) AS year, S_CITY, P_BRAND, sum(LO_REVENUE - LO_SUPPLYCOST) AS profit FROM lineorder_flat WHERE S_NATION = 'UNITED STATES' AND (year = 1997 OR year = 1998) AND P_CATEGORY = 'MFGR#14' GROUP BY year, S_CITY, P_BRAND ORDER BY year, S_CITY, P_BRAND;
 ```
@@ -4,7 +4,7 @@
 Create the table structure:
 
-``` sql
+```sql
 CREATE TABLE wikistat
 (
     date Date,

@@ -20,9 +20,9 @@ CREATE TABLE wikistat
 Load the data:
 
 ```bash
-for i in {2007..2016}; do for j in {01..12}; do echo $i-$j >&2; curl -sSL "http://dumps.wikimedia.org/other/pagecounts-raw/$i/$i-$j/" | grep -oE 'pagecounts-[0-9]+-[0-9]+\.gz'; done; done | sort | uniq | tee links.txt
-cat links.txt | while read link; do wget http://dumps.wikimedia.org/other/pagecounts-raw/$(echo $link | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})[0-9]{2}-[0-9]+\.gz/\1/')/$(echo $link | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})[0-9]{2}-[0-9]+\.gz/\1-\2/')/$link; done
-ls -1 /opt/wikistat/ | grep gz | while read i; do echo $i; gzip -cd /opt/wikistat/$i | ./wikistat-loader --time="$(echo -n $i | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})([0-9]{2})-([0-9]{2})([0-9]{2})([0-9]{2})\.gz/\1-\2-\3 \4-00-00/')" | clickhouse-client --query="INSERT INTO wikistat FORMAT TabSeparated"; done
+$ for i in {2007..2016}; do for j in {01..12}; do echo $i-$j >&2; curl -sSL "http://dumps.wikimedia.org/other/pagecounts-raw/$i/$i-$j/" | grep -oE 'pagecounts-[0-9]+-[0-9]+\.gz'; done; done | sort | uniq | tee links.txt
+$ cat links.txt | while read link; do wget http://dumps.wikimedia.org/other/pagecounts-raw/$(echo $link | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})[0-9]{2}-[0-9]+\.gz/\1/')/$(echo $link | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})[0-9]{2}-[0-9]+\.gz/\1-\2/')/$link; done
+$ ls -1 /opt/wikistat/ | grep gz | while read i; do echo $i; gzip -cd /opt/wikistat/$i | ./wikistat-loader --time="$(echo -n $i | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})([0-9]{2})-([0-9]{2})([0-9]{2})([0-9]{2})\.gz/\1-\2-\3 \4-00-00/')" | clickhouse-client --query="INSERT INTO wikistat FORMAT TabSeparated"; done
 ```
@@ -22,8 +22,8 @@ ClickHouse还可以在FreeBSD与Mac OS X上工作。同时它可以在不支持S
 Add the repository to `/etc/apt/sources.list` (or create a separate `/etc/apt/sources.list.d/clickhouse.list` file):
 
-```text
-deb http://repo.yandex.ru/clickhouse/deb/stable/ main/
+```bash
+$ deb http://repo.yandex.ru/clickhouse/deb/stable/ main/
 ```
 
 If you want to use the most recent testing version, replace 'stable' with 'testing'.

@@ -31,10 +31,10 @@ deb http://repo.yandex.ru/clickhouse/deb/stable/ main/
 Then run:
 
 ```bash
-sudo apt-get install dirmngr    # optional
-sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv E0C56BD4    # optional
-sudo apt-get update
-sudo apt-get install clickhouse-client clickhouse-server
+$ sudo apt-get install dirmngr    # optional
+$ sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv E0C56BD4    # optional
+$ sudo apt-get update
+$ sudo apt-get install clickhouse-client clickhouse-server
 ```
 
 You can also download and install the packages manually from <https://repo.yandex.ru/clickhouse/deb/stable/main/>.

@@ -43,16 +43,16 @@ ClickHouse包含访问控制配置,它们位于`users.xml`文件中(与'config
 By default, access is allowed from anywhere for the 'default' user, without a password; see 'user/default/networks'.
 For more information, see the section "Configuration files".
 
-### From RPM packages
+### Install for CentOS/RedHat
 
 The Yandex ClickHouse team recommends the official precompiled `rpm` packages for CentOS, RedHat, and all other rpm-based Linux distributions.
 
 First, you need to add the official repository:
 
 ```bash
-sudo yum install yum-utils
-sudo rpm --import https://repo.yandex.ru/clickhouse/CLICKHOUSE-KEY.GPG
-sudo yum-config-manager --add-repo https://repo.yandex.ru/clickhouse/rpm/stable/x86_64
+$ sudo yum install yum-utils
+$ sudo rpm --import https://repo.yandex.ru/clickhouse/CLICKHOUSE-KEY.GPG
+$ sudo yum-config-manager --add-repo https://repo.yandex.ru/clickhouse/rpm/stable/x86_64
 ```
 
 If you want to use the most recent version, replace `stable` with `testing` (recommended for testing environments).

@@ -60,12 +60,12 @@ sudo yum-config-manager --add-repo https://repo.yandex.ru/clickhouse/rpm/stable/
 Then run these commands to actually install the packages:
 
 ```bash
-sudo yum install clickhouse-server clickhouse-client
+$ sudo yum install clickhouse-server clickhouse-client
 ```
 
 You can also download and install the packages manually from <https://repo.yandex.ru/clickhouse/rpm/stable/x86_64>.
 
-### From Docker
+### Install using Docker
 
 To run ClickHouse inside Docker, follow the guide on [Docker Hub](https://hub.docker.com/r/yandex/clickhouse-server/). These images are built from the official `deb` packages.

@@ -136,18 +136,14 @@ milovidov@hostname:~/work/metrica/src/dbms/src/Client$ ./clickhouse-client
 ClickHouse client version 0.0.18749.
 Connecting to localhost:9000.
 Connected to ClickHouse server version 0.0.18749.
 
-```
+```sql
 :) SELECT 1
-
-SELECT 1
-
+```
+```text
 ┌─1─┐
 │ 1 │
 └───┘
 
 1 rows in set. Elapsed: 0.003 sec.
 
 :)
 ```
 
 **Congratulations, the system works!**