Merge pull request #7498 from zhang2014/docs/diff_script

add easy diff for document tracking
Merged by alexey-milovidov on 2019-11-01 18:34:04 +03:00, committed via GitHub. Commit b2f72c7857.
9 changed files with 388 additions and 132 deletions

docs/tools/easy_diff.py (executable file, 146 additions)

@@ -0,0 +1,146 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os, sys
import argparse
import subprocess
import contextlib
from git import cmd
from tempfile import NamedTemporaryFile

SCRIPT_DESCRIPTION = '''
usage: ./easy_diff.py language/document path

Show the difference between a language document and an English document.

This script is based on the assumption that documents in other languages are fully synchronized with the en document at some commit.

For example:
    Execute:
        ./easy_diff.py --no-pager zh/data_types
    Output:
        Need to translate document: ~/ClickHouse/docs/en/data_types/uuid.md
        Need to link document: ~/ClickHouse/docs/en/data_types/decimal.md to ~/ClickHouse/docs/zh/data_types/decimal.md
        diff --git a/docs/en/data_types/domains/ipv6.md b/docs/en/data_types/domains/ipv6.md
        index 1bfbe3400b..e2abaff017 100644
        --- a/docs/en/data_types/domains/ipv6.md
        +++ b/docs/en/data_types/domains/ipv6.md
        @@ -4,13 +4,13 @@
        ### Basic Usage
        -``` sql
        +```sql
        CREATE TABLE hits (url String, from IPv6) ENGINE = MergeTree() ORDER BY url;
        DESCRIBE TABLE hits;
        ```
        -```
        +```text
        name type default_type default_expression comment codec_expression
        url String
        from IPv6
        @@ -19,19 +19,19 @@ DESCRIBE TABLE hits;
        OR you can use `IPv6` domain as a key:
        -``` sql
        +```sql
        CREATE TABLE hits (url String, from IPv6) ENGINE = MergeTree() ORDER BY from;
        ... MORE

OPTIONS:
    -h, --help   show this help message and exit
    --no-pager   use stdout as difference result output
'''

SCRIPT_PATH = os.path.abspath(__file__)
CLICKHOUSE_REPO_HOME = os.path.join(os.path.dirname(SCRIPT_PATH), '..', '..')
SCRIPT_COMMAND_EXECUTOR = cmd.Git(CLICKHOUSE_REPO_HOME)

SCRIPT_COMMAND_PARSER = argparse.ArgumentParser(add_help=False)
SCRIPT_COMMAND_PARSER.add_argument('path', type=str, nargs='?', default=None)
SCRIPT_COMMAND_PARSER.add_argument('--no-pager', action='store_true', default=False)
SCRIPT_COMMAND_PARSER.add_argument('-h', '--help', action='store_true', default=False)


def execute(commands):
    return SCRIPT_COMMAND_EXECUTOR.execute(commands)


def get_hash(file_name):
    # Hash of the last commit that touched the file.
    return execute(['git', 'log', '-n', '1', '--pretty=format:"%H"', file_name])


def diff_file(reference_file, working_file, out):
    if not os.path.exists(reference_file):
        raise RuntimeError('reference file [' + os.path.abspath(reference_file) + '] does not exist.')

    if os.path.islink(working_file):
        out.writelines(["Need to translate document: " + os.path.abspath(reference_file)])
    elif not os.path.exists(working_file):
        out.writelines(['Need to link document: ' + os.path.abspath(reference_file) + ' to ' + os.path.abspath(working_file)])
    elif get_hash(working_file) != get_hash(reference_file):
        out.writelines([(execute(['git', 'diff', get_hash(working_file).strip('"'), reference_file]).encode('utf-8'))])
    return 0


def diff_directory(reference_directory, working_directory, out):
    if not os.path.isdir(reference_directory):
        return diff_file(reference_directory, working_directory, out)

    for list_item in os.listdir(reference_directory):
        working_item = os.path.join(working_directory, list_item)
        reference_item = os.path.join(reference_directory, list_item)
        # Parenthesized so that `!= 0` applies to whichever call ran,
        # not only to diff_directory().
        if (diff_file(reference_item, working_item, out) if os.path.isfile(reference_item)
                else diff_directory(reference_item, working_item, out)) != 0:
            return 1
    return 0


def find_language_doc(custom_document, other_language='en', children=None):
    children = children if children is not None else []  # avoid a shared mutable default
    if len(custom_document) == 0:
        raise RuntimeError('The ' + os.path.join(custom_document, *children) + ' is not in the docs directory.')

    if os.path.samefile(os.path.join(CLICKHOUSE_REPO_HOME, 'docs'), custom_document):
        return os.path.join(CLICKHOUSE_REPO_HOME, 'docs', other_language, *children[1:])
    children.insert(0, os.path.split(custom_document)[1])
    return find_language_doc(os.path.split(custom_document)[0], other_language, children)


class ToPager:
    def __init__(self, temp_named_file):
        self.temp_named_file = temp_named_file

    def writelines(self, lines):
        self.temp_named_file.writelines(lines)

    def close(self):
        self.temp_named_file.flush()
        git_pager = execute(['git', 'var', 'GIT_PAGER'])
        subprocess.check_call([git_pager, self.temp_named_file.name])
        self.temp_named_file.close()


class ToStdOut:
    def __init__(self, system_stdout_stream):
        self.system_stdout_stream = system_stdout_stream

    def writelines(self, lines):
        self.system_stdout_stream.writelines(lines)

    def close(self):
        self.system_stdout_stream.flush()


if __name__ == '__main__':
    arguments = SCRIPT_COMMAND_PARSER.parse_args()
    if arguments.help or not arguments.path:
        sys.stdout.write(SCRIPT_DESCRIPTION)
        sys.exit(0)

    working_language = os.path.join(CLICKHOUSE_REPO_HOME, 'docs', arguments.path)
    with contextlib.closing(ToStdOut(sys.stdout) if arguments.no_pager else ToPager(NamedTemporaryFile('r+'))) as writer:
        exit(diff_directory(find_language_doc(working_language), working_language, writer))
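For reviewers who want to try the script: the argument is a documentation subtree relative to `docs/`, as the usage text above describes. A hypothetical session might look like this (paths illustrative):

```bash
# Compare the zh data_types docs against the en originals, printing to stdout.
cd docs/tools
./easy_diff.py --no-pager zh/data_types

# The same comparison, paged through `git var GIT_PAGER` (usually less).
./easy_diff.py zh/data_types
```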


@@ -33,3 +33,4 @@ tornado==5.1
typing==3.6.2
Unidecode==1.0.23
urllib3==1.24.2
+gitpython==2.1.14
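The `gitpython` pin above is what the new script needs for its `from git import cmd` import. A quick smoke test that the dependency resolves, assuming this is the docs tooling requirements file:

```bash
# Hypothetical check: install the pinned dependency and exercise the same
# GitPython entry point the script uses (git.cmd.Git.execute).
pip install gitpython==2.1.14
python -c "from git import cmd; print(cmd.Git('.').execute(['git', '--version']))"
```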


@@ -7,21 +7,21 @@
Run the following commands in the console:
```bash
-sudo apt-get install s3cmd
-mkdir tiny; cd tiny;
-s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/tiny/ .
-cd ..
-mkdir 1node; cd 1node;
-s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/1node/ .
-cd ..
-mkdir 5nodes; cd 5nodes;
-s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/5nodes/ .
-cd ..
+$ sudo apt-get install s3cmd
+$ mkdir tiny; cd tiny;
+$ s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/tiny/ .
+$ cd ..
+$ mkdir 1node; cd 1node;
+$ s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/1node/ .
+$ cd ..
+$ mkdir 5nodes; cd 5nodes;
+$ s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/5nodes/ .
+$ cd ..
```
Run the following queries in ClickHouse:
-``` sql
+```sql
CREATE TABLE rankings_tiny
(
pageURL String,
@@ -86,12 +86,12 @@ CREATE TABLE uservisits_5nodes_on_single
Go back to the console and run the following commands:
```bash
-for i in tiny/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_tiny FORMAT CSV"; done
-for i in tiny/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_tiny FORMAT CSV"; done
-for i in 1node/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_1node FORMAT CSV"; done
-for i in 1node/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_1node FORMAT CSV"; done
-for i in 5nodes/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_5nodes_on_single FORMAT CSV"; done
-for i in 5nodes/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_5nodes_on_single FORMAT CSV"; done
+$ for i in tiny/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_tiny FORMAT CSV"; done
+$ for i in tiny/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_tiny FORMAT CSV"; done
+$ for i in 1node/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_1node FORMAT CSV"; done
+$ for i in 1node/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_1node FORMAT CSV"; done
+$ for i in 5nodes/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_5nodes_on_single FORMAT CSV"; done
+$ for i in 5nodes/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_5nodes_on_single FORMAT CSV"; done
```
Examples of simple queries:


@@ -4,14 +4,14 @@
Create a table for the raw data:
-``` sql
+```sql
CREATE TABLE criteo_log (date Date, clicked UInt8, int1 Int32, int2 Int32, int3 Int32, int4 Int32, int5 Int32, int6 Int32, int7 Int32, int8 Int32, int9 Int32, int10 Int32, int11 Int32, int12 Int32, int13 Int32, cat1 String, cat2 String, cat3 String, cat4 String, cat5 String, cat6 String, cat7 String, cat8 String, cat9 String, cat10 String, cat11 String, cat12 String, cat13 String, cat14 String, cat15 String, cat16 String, cat17 String, cat18 String, cat19 String, cat20 String, cat21 String, cat22 String, cat23 String, cat24 String, cat25 String, cat26 String) ENGINE = Log
```
Download the data:
```bash
-for i in {00..23}; do echo $i; zcat datasets/criteo/day_${i#0}.gz | sed -r 's/^/2000-01-'${i/00/24}'\t/' | clickhouse-client --host=example-perftest01j --query="INSERT INTO criteo_log FORMAT TabSeparated"; done
+$ for i in {00..23}; do echo $i; zcat datasets/criteo/day_${i#0}.gz | sed -r 's/^/2000-01-'${i/00/24}'\t/' | clickhouse-client --host=example-perftest01j --query="INSERT INTO criteo_log FORMAT TabSeparated"; done
```
Create a table for the transformed data:
@@ -65,7 +65,7 @@ CREATE TABLE criteo
Transform the raw data from the first table and write it into the second table:
-``` sql
+```sql
INSERT INTO criteo SELECT date, clicked, int1, int2, int3, int4, int5, int6, int7, int8, int9, int10, int11, int12, int13, reinterpretAsUInt32(unhex(cat1)) AS icat1, reinterpretAsUInt32(unhex(cat2)) AS icat2, reinterpretAsUInt32(unhex(cat3)) AS icat3, reinterpretAsUInt32(unhex(cat4)) AS icat4, reinterpretAsUInt32(unhex(cat5)) AS icat5, reinterpretAsUInt32(unhex(cat6)) AS icat6, reinterpretAsUInt32(unhex(cat7)) AS icat7, reinterpretAsUInt32(unhex(cat8)) AS icat8, reinterpretAsUInt32(unhex(cat9)) AS icat9, reinterpretAsUInt32(unhex(cat10)) AS icat10, reinterpretAsUInt32(unhex(cat11)) AS icat11, reinterpretAsUInt32(unhex(cat12)) AS icat12, reinterpretAsUInt32(unhex(cat13)) AS icat13, reinterpretAsUInt32(unhex(cat14)) AS icat14, reinterpretAsUInt32(unhex(cat15)) AS icat15, reinterpretAsUInt32(unhex(cat16)) AS icat16, reinterpretAsUInt32(unhex(cat17)) AS icat17, reinterpretAsUInt32(unhex(cat18)) AS icat18, reinterpretAsUInt32(unhex(cat19)) AS icat19, reinterpretAsUInt32(unhex(cat20)) AS icat20, reinterpretAsUInt32(unhex(cat21)) AS icat21, reinterpretAsUInt32(unhex(cat22)) AS icat22, reinterpretAsUInt32(unhex(cat23)) AS icat23, reinterpretAsUInt32(unhex(cat24)) AS icat24, reinterpretAsUInt32(unhex(cat25)) AS icat25, reinterpretAsUInt32(unhex(cat26)) AS icat26 FROM criteo_log;
DROP TABLE criteo_log;

File diff suppressed because one or more lines are too long


@@ -1,6 +1,13 @@
# Flight data
+The flight data can be obtained in the following two ways:
+- import from raw data
+- download prepared partitions
+## Import from raw data
Download the data:
```bash
@@ -134,39 +141,75 @@ CREATE TABLE `ontime` (
Load the data:
```bash
-for i in *.zip; do echo $i; unzip -cq $i '*.csv' | sed 's/\.00//g' | clickhouse-client --host=example-perftest01j --query="INSERT INTO ontime FORMAT CSVWithNames"; done
+$ for i in *.zip; do echo $i; unzip -cq $i '*.csv' | sed 's/\.00//g' | clickhouse-client --host=example-perftest01j --query="INSERT INTO ontime FORMAT CSVWithNames"; done
```
-Queries:
+## Download prepared partitions
+```bash
+$ curl -O https://clickhouse-datasets.s3.yandex.net/ontime/partitions/ontime.tar
+$ tar xvf ontime.tar -C /var/lib/clickhouse # path to ClickHouse data directory
+$ # check permissions of unpacked data, fix if required
+$ sudo service clickhouse-server restart
+$ clickhouse-client --query "select count(*) from datasets.ontime"
+```
+!!! info
+    To run the SQL queries below, you must use the full table name, `datasets.ontime`.
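To illustrate the note above: once the prepared partitions are unpacked, the queries below need the `datasets.` prefix. A sketch of Q0 run that way from the shell (assuming a local server):

```bash
# Q0 against the downloaded partitions; the table lives in the `datasets` database.
clickhouse-client --query "
    SELECT avg(c1)
    FROM (SELECT Year, Month, count(*) AS c1 FROM datasets.ontime GROUP BY Year, Month)"
```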
+## Queries:
Q0.
```sql
-select avg(c1) from (select Year, Month, count(*) as c1 from ontime group by Year, Month);
+SELECT avg(c1)
+FROM
+(
+    SELECT Year, Month, count(*) AS c1
+    FROM ontime
+    GROUP BY Year, Month
+);
```
Q1. The number of flights per day from 2000 to 2008:
```sql
-SELECT DayOfWeek, count(*) AS c FROM ontime WHERE Year >= 2000 AND Year <= 2008 GROUP BY DayOfWeek ORDER BY c DESC;
+SELECT DayOfWeek, count(*) AS c
+FROM ontime
+WHERE Year>=2000 AND Year<=2008
+GROUP BY DayOfWeek
+ORDER BY c DESC;
```
Q2. The number of flights delayed by more than 10 minutes each week, from 2000 to 2008:
```sql
-SELECT DayOfWeek, count(*) AS c FROM ontime WHERE DepDelay>10 AND Year >= 2000 AND Year <= 2008 GROUP BY DayOfWeek ORDER BY c DESC
+SELECT DayOfWeek, count(*) AS c
+FROM ontime
+WHERE DepDelay>10 AND Year>=2000 AND Year<=2008
+GROUP BY DayOfWeek
+ORDER BY c DESC;
```
Q3. The number of delays of more than 10 minutes by airport, from 2000 to 2008:
```sql
-SELECT Origin, count(*) AS c FROM ontime WHERE DepDelay>10 AND Year >= 2000 AND Year <= 2008 GROUP BY Origin ORDER BY c DESC LIMIT 10
+SELECT Origin, count(*) AS c
+FROM ontime
+WHERE DepDelay>10 AND Year>=2000 AND Year<=2008
+GROUP BY Origin
+ORDER BY c DESC
+LIMIT 10;
```
Q4. The number of delays of more than 10 minutes by carrier for 2007:
```sql
-SELECT Carrier, count(*) FROM ontime WHERE DepDelay>10 AND Year = 2007 GROUP BY Carrier ORDER BY count(*) DESC
+SELECT Carrier, count(*)
+FROM ontime
+WHERE DepDelay>10 AND Year=2007
+GROUP BY Carrier
+ORDER BY count(*) DESC;
```
Q5. The percentage of delays of more than 10 minutes by carrier for 2007:
@@ -198,7 +241,11 @@ ORDER BY c3 DESC;
A better version of the same query:
```sql
-SELECT Carrier, avg(DepDelay > 10) * 100 AS c3 FROM ontime WHERE Year = 2007 GROUP BY Carrier ORDER BY Carrier
+SELECT Carrier, avg(DepDelay>10)*100 AS c3
+FROM ontime
+WHERE Year=2007
+GROUP BY Carrier
+ORDER BY Carrier
```
Q6. The same as the previous query, but with the range broadened to the years 2000 to 2008:
@@ -212,7 +259,7 @@ FROM
count(*) AS c
FROM ontime
WHERE DepDelay>10
-AND Year >= 2000 AND Year <= 2008
+AND Year>=2000 AND Year<=2008
GROUP BY Carrier
)
ANY INNER JOIN
@@ -221,7 +268,7 @@ ANY INNER JOIN
Carrier,
count(*) AS c2
FROM ontime
-WHERE Year >= 2000 AND Year <= 2008
+WHERE Year>=2000 AND Year<=2008
GROUP BY Carrier
) USING Carrier
ORDER BY c3 DESC;
@@ -230,7 +277,11 @@ ORDER BY c3 DESC;
A better version of the same query:
```sql
-SELECT Carrier, avg(DepDelay > 10) * 100 AS c3 FROM ontime WHERE Year >= 2000 AND Year <= 2008 GROUP BY Carrier ORDER BY Carrier
+SELECT Carrier, avg(DepDelay>10)*100 AS c3
+FROM ontime
+WHERE Year>=2000 AND Year<=2008
+GROUP BY Carrier
+ORDER BY Carrier;
```
Q7. The percentage of flights delayed by more than 10 minutes, by year:
@@ -254,41 +305,50 @@ ANY INNER JOIN
from ontime
GROUP BY Year
) USING (Year)
-ORDER BY Year
+ORDER BY Year;
```
A better version of the same query:
```sql
-SELECT Year, avg(DepDelay > 10) FROM ontime GROUP BY Year ORDER BY Year
+SELECT Year, avg(DepDelay>10)
+FROM ontime
+GROUP BY Year
+ORDER BY Year;
```
Q8. The most popular destinations by year:
```sql
-SELECT DestCityName, uniqExact(OriginCityName) AS u FROM ontime WHERE Year >= 2000 and Year <= 2010 GROUP BY DestCityName ORDER BY u DESC LIMIT 10;
+SELECT DestCityName, uniqExact(OriginCityName) AS u
+FROM ontime
+WHERE Year >= 2000 and Year <= 2010
+GROUP BY DestCityName
+ORDER BY u DESC LIMIT 10;
```
Q9.
```sql
-select Year, count(*) as c1 from ontime group by Year;
+SELECT Year, count(*) AS c1
+FROM ontime
+GROUP BY Year;
```
Q10.
```sql
-select
-    min(Year), max(Year), Carrier, count(*) as cnt,
-    sum(ArrDelayMinutes>30) as flights_delayed,
-    round(sum(ArrDelayMinutes>30)/count(*),2) as rate
+SELECT
+    min(Year), max(Year), Carrier, count(*) AS cnt,
+    sum(ArrDelayMinutes>30) AS flights_delayed,
+    round(sum(ArrDelayMinutes>30)/count(*),2) AS rate
FROM ontime
WHERE
-    DayOfWeek not in (6,7) and OriginState not in ('AK', 'HI', 'PR', 'VI')
-    and DestState not in ('AK', 'HI', 'PR', 'VI')
-    and FlightDate < '2010-01-01'
+    DayOfWeek NOT IN (6,7) AND OriginState NOT IN ('AK', 'HI', 'PR', 'VI')
+    AND DestState NOT IN ('AK', 'HI', 'PR', 'VI')
+    AND FlightDate < '2010-01-01'
GROUP by Carrier
-HAVING cnt > 100000 and max(Year) > 1990
+HAVING cnt>100000 and max(Year)>1990
ORDER by rate DESC
LIMIT 1000;
```
@@ -296,15 +356,39 @@ LIMIT 1000;
Bonus:
```sql
-SELECT avg(cnt) FROM (SELECT Year,Month,count(*) AS cnt FROM ontime WHERE DepDel15=1 GROUP BY Year,Month)
+SELECT avg(cnt)
+FROM
+(
+    SELECT Year,Month,count(*) AS cnt
+    FROM ontime
+    WHERE DepDel15=1
+    GROUP BY Year,Month
+);
-select avg(c1) from (select Year,Month,count(*) as c1 from ontime group by Year,Month)
+SELECT avg(c1) FROM
+(
+    SELECT Year,Month,count(*) AS c1
+    FROM ontime
+    GROUP BY Year,Month
+);
-SELECT DestCityName, uniqExact(OriginCityName) AS u FROM ontime GROUP BY DestCityName ORDER BY u DESC LIMIT 10;
+SELECT DestCityName, uniqExact(OriginCityName) AS u
+FROM ontime
+GROUP BY DestCityName
+ORDER BY u DESC
+LIMIT 10;
-SELECT OriginCityName, DestCityName, count() AS c FROM ontime GROUP BY OriginCityName, DestCityName ORDER BY c DESC LIMIT 10;
+SELECT OriginCityName, DestCityName, count() AS c
+FROM ontime
+GROUP BY OriginCityName, DestCityName
+ORDER BY c DESC
+LIMIT 10;
-SELECT OriginCityName, count() AS c FROM ontime GROUP BY OriginCityName ORDER BY c DESC LIMIT 10;
+SELECT OriginCityName, count() AS c
+FROM ontime
+GROUP BY OriginCityName
+ORDER BY c DESC
+LIMIT 10;
```
This performance test was provided by Vadim Tkachenko. See:


@@ -1,26 +1,26 @@
# Star Schema Benchmark
-Compiling dbgen:
+Compile dbgen:
-```
-git clone git@github.com:vadimtk/ssb-dbgen.git
-cd ssb-dbgen
-make
+```bash
+$ git clone git@github.com:vadimtk/ssb-dbgen.git
+$ cd ssb-dbgen
+$ make
```
-Generating data:
+Start generating data:
-```
-./dbgen -s 1000 -T c
-./dbgen -s 1000 -T l
-./dbgen -s 1000 -T p
-./dbgen -s 1000 -T s
-./dbgen -s 1000 -T d
+```bash
+$ ./dbgen -s 1000 -T c
+$ ./dbgen -s 1000 -T l
+$ ./dbgen -s 1000 -T p
+$ ./dbgen -s 1000 -T s
+$ ./dbgen -s 1000 -T d
```
-Creating tables in ClickHouse:
+Create the table structures in ClickHouse:
-```
+```sql
CREATE TABLE customer
(
C_CUSTKEY UInt32,
@@ -83,73 +83,85 @@ CREATE TABLE supplier
ENGINE = MergeTree ORDER BY S_SUPPKEY;
```
-Inserting data:
+Write the data:
-```
-clickhouse-client --query "INSERT INTO customer FORMAT CSV" < customer.tbl
-clickhouse-client --query "INSERT INTO part FORMAT CSV" < part.tbl
-clickhouse-client --query "INSERT INTO supplier FORMAT CSV" < supplier.tbl
-clickhouse-client --query "INSERT INTO lineorder FORMAT CSV" < lineorder.tbl
+```bash
+$ clickhouse-client --query "INSERT INTO customer FORMAT CSV" < customer.tbl
+$ clickhouse-client --query "INSERT INTO part FORMAT CSV" < part.tbl
+$ clickhouse-client --query "INSERT INTO supplier FORMAT CSV" < supplier.tbl
+$ clickhouse-client --query "INSERT INTO lineorder FORMAT CSV" < lineorder.tbl
```
-Converting "star schema" to denormalized "flat schema":
+Convert the "star schema" into a denormalized "flat schema":
-```
+```sql
SET max_memory_usage = 20000000000, allow_experimental_multiple_joins_emulation = 1;
CREATE TABLE lineorder_flat
ENGINE = MergeTree
PARTITION BY toYear(LO_ORDERDATE)
ORDER BY (LO_ORDERDATE, LO_ORDERKEY) AS
-SELECT *
-FROM lineorder
-ANY INNER JOIN customer ON LO_CUSTKEY = C_CUSTKEY
-ANY INNER JOIN supplier ON LO_SUPPKEY = S_SUPPKEY
-ANY INNER JOIN part ON LO_PARTKEY = P_PARTKEY;
+SELECT l.*, c.*, s.*, p.*
+FROM lineorder l
+ANY INNER JOIN customer c ON (c.C_CUSTKEY = l.LO_CUSTKEY)
+ANY INNER JOIN supplier s ON (s.S_SUPPKEY = l.LO_SUPPKEY)
+ANY INNER JOIN part p ON (p.P_PARTKEY = l.LO_PARTKEY);
+ALTER TABLE lineorder_flat DROP COLUMN C_CUSTKEY, DROP COLUMN S_SUPPKEY, DROP COLUMN P_PARTKEY;
```
Running the queries:
```
Q1.1
```sql
SELECT sum(LO_EXTENDEDPRICE * LO_DISCOUNT) AS revenue FROM lineorder_flat WHERE toYear(LO_ORDERDATE) = 1993 AND LO_DISCOUNT BETWEEN 1 AND 3 AND LO_QUANTITY < 25;
```
Q1.2
```sql
SELECT sum(LO_EXTENDEDPRICE * LO_DISCOUNT) AS revenue FROM lineorder_flat WHERE toYYYYMM(LO_ORDERDATE) = 199401 AND LO_DISCOUNT BETWEEN 4 AND 6 AND LO_QUANTITY BETWEEN 26 AND 35;
```
Q1.3
```sql
SELECT sum(LO_EXTENDEDPRICE * LO_DISCOUNT) AS revenue FROM lineorder_flat WHERE toISOWeek(LO_ORDERDATE) = 6 AND toYear(LO_ORDERDATE) = 1994 AND LO_DISCOUNT BETWEEN 5 AND 7 AND LO_QUANTITY BETWEEN 26 AND 35;
```
Q2.1
```sql
SELECT sum(LO_REVENUE), toYear(LO_ORDERDATE) AS year, P_BRAND FROM lineorder_flat WHERE P_CATEGORY = 'MFGR#12' AND S_REGION = 'AMERICA' GROUP BY year, P_BRAND ORDER BY year, P_BRAND;
```
Q2.2
```sql
SELECT sum(LO_REVENUE), toYear(LO_ORDERDATE) AS year, P_BRAND FROM lineorder_flat WHERE P_BRAND BETWEEN 'MFGR#2221' AND 'MFGR#2228' AND S_REGION = 'ASIA' GROUP BY year, P_BRAND ORDER BY year, P_BRAND;
```
Q2.3
```sql
SELECT sum(LO_REVENUE), toYear(LO_ORDERDATE) AS year, P_BRAND FROM lineorder_flat WHERE P_BRAND = 'MFGR#2239' AND S_REGION = 'EUROPE' GROUP BY year, P_BRAND ORDER BY year, P_BRAND;
```
Q3.1
```sql
SELECT C_NATION, S_NATION, toYear(LO_ORDERDATE) AS year, sum(LO_REVENUE) AS revenue FROM lineorder_flat WHERE C_REGION = 'ASIA' AND S_REGION = 'ASIA' AND year >= 1992 AND year <= 1997 GROUP BY C_NATION, S_NATION, year ORDER BY year asc, revenue desc;
```
Q3.2
```sql
SELECT C_CITY, S_CITY, toYear(LO_ORDERDATE) AS year, sum(LO_REVENUE) AS revenue FROM lineorder_flat WHERE C_NATION = 'UNITED STATES' AND S_NATION = 'UNITED STATES' AND year >= 1992 AND year <= 1997 GROUP BY C_CITY, S_CITY, year ORDER BY year asc, revenue desc;
```
Q3.3
```sql
SELECT C_CITY, S_CITY, toYear(LO_ORDERDATE) AS year, sum(LO_REVENUE) AS revenue FROM lineorder_flat WHERE (C_CITY = 'UNITED KI1' OR C_CITY = 'UNITED KI5') AND (S_CITY = 'UNITED KI1' OR S_CITY = 'UNITED KI5') AND year >= 1992 AND year <= 1997 GROUP BY C_CITY, S_CITY, year ORDER BY year asc, revenue desc;
```
Q3.4
```sql
SELECT C_CITY, S_CITY, toYear(LO_ORDERDATE) AS year, sum(LO_REVENUE) AS revenue FROM lineorder_flat WHERE (C_CITY = 'UNITED KI1' OR C_CITY = 'UNITED KI5') AND (S_CITY = 'UNITED KI1' OR S_CITY = 'UNITED KI5') AND toYYYYMM(LO_ORDERDATE) = '199712' GROUP BY C_CITY, S_CITY, year ORDER BY year asc, revenue desc;
```
Q4.1
```sql
SELECT toYear(LO_ORDERDATE) AS year, C_NATION, sum(LO_REVENUE - LO_SUPPLYCOST) AS profit FROM lineorder_flat WHERE C_REGION = 'AMERICA' AND S_REGION = 'AMERICA' AND (P_MFGR = 'MFGR#1' OR P_MFGR = 'MFGR#2') GROUP BY year, C_NATION ORDER BY year, C_NATION;
```
Q4.2
```sql
SELECT toYear(LO_ORDERDATE) AS year, S_NATION, P_CATEGORY, sum(LO_REVENUE - LO_SUPPLYCOST) AS profit FROM lineorder_flat WHERE C_REGION = 'AMERICA' AND S_REGION = 'AMERICA' AND (year = 1997 OR year = 1998) AND (P_MFGR = 'MFGR#1' OR P_MFGR = 'MFGR#2') GROUP BY year, S_NATION, P_CATEGORY ORDER BY year, S_NATION, P_CATEGORY;
```
Q4.3
```sql
SELECT toYear(LO_ORDERDATE) AS year, S_CITY, P_BRAND, sum(LO_REVENUE - LO_SUPPLYCOST) AS profit FROM lineorder_flat WHERE S_NATION = 'UNITED STATES' AND (year = 1997 OR year = 1998) AND P_CATEGORY = 'MFGR#14' GROUP BY year, S_CITY, P_BRAND ORDER BY year, S_CITY, P_BRAND;
```


@@ -4,7 +4,7 @@
Create the table structure:
-``` sql
+```sql
CREATE TABLE wikistat
(
date Date,
@@ -20,9 +20,9 @@ CREATE TABLE wikistat
Load the data:
```bash
-for i in {2007..2016}; do for j in {01..12}; do echo $i-$j >&2; curl -sSL "http://dumps.wikimedia.org/other/pagecounts-raw/$i/$i-$j/" | grep -oE 'pagecounts-[0-9]+-[0-9]+\.gz'; done; done | sort | uniq | tee links.txt
-cat links.txt | while read link; do wget http://dumps.wikimedia.org/other/pagecounts-raw/$(echo $link | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})[0-9]{2}-[0-9]+\.gz/\1/')/$(echo $link | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})[0-9]{2}-[0-9]+\.gz/\1-\2/')/$link; done
-ls -1 /opt/wikistat/ | grep gz | while read i; do echo $i; gzip -cd /opt/wikistat/$i | ./wikistat-loader --time="$(echo -n $i | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})([0-9]{3})-([0-9]{2})([0-9]{2})([0-9]{2})\.gz/\1-\2-\3 \4-00-00/')" | clickhouse-client --query="INSERT INTO wikistat FORMAT TabSeparated"; done
+$ for i in {2007..2016}; do for j in {01..12}; do echo $i-$j >&2; curl -sSL "http://dumps.wikimedia.org/other/pagecounts-raw/$i/$i-$j/" | grep -oE 'pagecounts-[0-9]+-[0-9]+\.gz'; done; done | sort | uniq | tee links.txt
+$ cat links.txt | while read link; do wget http://dumps.wikimedia.org/other/pagecounts-raw/$(echo $link | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})[0-9]{2}-[0-9]+\.gz/\1/')/$(echo $link | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})[0-9]{2}-[0-9]+\.gz/\1-\2/')/$link; done
+$ ls -1 /opt/wikistat/ | grep gz | while read i; do echo $i; gzip -cd /opt/wikistat/$i | ./wikistat-loader --time="$(echo -n $i | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})([0-9]{2})-([0-9]{2})([0-9]{2})([0-9]{2})\.gz/\1-\2-\3 \4-00-00/')" | clickhouse-client --query="INSERT INTO wikistat FORMAT TabSeparated"; done
```


@@ -22,8 +22,8 @@ ClickHouse also works on FreeBSD and Mac OS X; it can also run on systems that do not support S
Add the repository to `/etc/apt/sources.list` (or create a separate `/etc/apt/sources.list.d/clickhouse.list` file):
-```text
-deb http://repo.yandex.ru/clickhouse/deb/stable/ main/
+```bash
+$ deb http://repo.yandex.ru/clickhouse/deb/stable/ main/
```
If you want to use the latest testing version, replace 'stable' with 'testing'.
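In other words, the testing variant of the line above would presumably read (same host, different channel):

```bash
# Repository line for the 'testing' channel instead of 'stable'.
deb http://repo.yandex.ru/clickhouse/deb/testing/ main/
```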
@@ -31,10 +31,10 @@ deb http://repo.yandex.ru/clickhouse/deb/stable/ main/
Then run:
```bash
-sudo apt-get install dirmngr # optional
-sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv E0C56BD4 # optional
-sudo apt-get update
-sudo apt-get install clickhouse-client clickhouse-server
+$ sudo apt-get install dirmngr # optional
+$ sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv E0C56BD4 # optional
+$ sudo apt-get update
+$ sudo apt-get install clickhouse-client clickhouse-server
```
You can also download and install the packages manually from here: <https://repo.yandex.ru/clickhouse/deb/stable/main/>
@@ -43,16 +43,16 @@ ClickHouse contains access restriction settings; they are located in the `users.xml` file (next to 'config
By default, access is allowed from anywhere for the default user, without a password. See "user/default/networks".
For more information, see the "Configuration files" section.
-### From RPM packages
+### Installation for CentOS/RedHat
The Yandex ClickHouse team recommends using the official precompiled `rpm` packages for CentOS, RedHat, and all other rpm-based Linux distributions.
First, you need to add the official repository:
```bash
-sudo yum install yum-utils
-sudo rpm --import https://repo.yandex.ru/clickhouse/CLICKHOUSE-KEY.GPG
-sudo yum-config-manager --add-repo https://repo.yandex.ru/clickhouse/rpm/stable/x86_64
+$ sudo yum install yum-utils
+$ sudo rpm --import https://repo.yandex.ru/clickhouse/CLICKHOUSE-KEY.GPG
+$ sudo yum-config-manager --add-repo https://repo.yandex.ru/clickhouse/rpm/stable/x86_64
```
If you want to use the most recent version, replace `stable` with `testing` (recommended for test environments).
@@ -60,12 +60,12 @@ sudo yum-config-manager --add-repo https://repo.yandex.ru/clickhouse/rpm/stable/
Then run these commands to actually install the packages:
```bash
-sudo yum install clickhouse-server clickhouse-client
+$ sudo yum install clickhouse-server clickhouse-client
```
You can also download and install the packages manually from here: <https://repo.yandex.ru/clickhouse/rpm/stable/x86_64>
-### From Docker
+### Installation with Docker
To run ClickHouse in Docker, follow the guide on [Docker Hub](https://hub.docker.com/r/yandex/clickhouse-server/). These images are built from the official `deb` packages.
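For orientation, the Docker Hub guide of that era boiled down to commands along these lines (image names per the linked page; flags may have changed since, so treat this as a sketch):

```bash
# Start a server container, then connect with the bundled client image.
docker run -d --name some-clickhouse-server --ulimit nofile=262144:262144 yandex/clickhouse-server
docker run -it --rm --link some-clickhouse-server:clickhouse-server yandex/clickhouse-client --host clickhouse-server
```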
@@ -136,18 +136,14 @@ milovidov@hostname:~/work/metrica/src/dbms/src/Client$ ./clickhouse-client
ClickHouse client version 0.0.18749.
Connecting to localhost:9000.
Connected to ClickHouse server version 0.0.18749.
```
```sql
:) SELECT 1
SELECT 1
```
```text
┌─1─┐
│ 1 │
└───┘
1 rows in set. Elapsed: 0.003 sec.
:)
```
**Congratulations, the system is up and running!**