diff --git a/docs/tools/easy_diff.py b/docs/tools/easy_diff.py
new file mode 100755
index 00000000000..2c7b5429994
--- /dev/null
+++ b/docs/tools/easy_diff.py
@@ -0,0 +1,146 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import os, sys
+import argparse
+import subprocess
+import contextlib
+from git import cmd
+from tempfile import NamedTemporaryFile
+
+SCRIPT_DESCRIPTION = '''
+    usage: ./easy_diff.py language/document path
+
+    Show the difference between a translated document and the corresponding English document.
+
+    This script assumes that each translated document was fully synchronized with the English document at some commit.
+
+    For example:
+        Execute:
+            ./easy_diff.py --no-pager zh/data_types
+        Output:
+            Need to translate document: ~/ClickHouse/docs/en/data_types/uuid.md
+            Need to link document: ~/ClickHouse/docs/en/data_types/decimal.md to ~/ClickHouse/docs/zh/data_types/decimal.md
+            diff --git a/docs/en/data_types/domains/ipv6.md b/docs/en/data_types/domains/ipv6.md
+            index 1bfbe3400b..e2abaff017 100644
+            --- a/docs/en/data_types/domains/ipv6.md
+            +++ b/docs/en/data_types/domains/ipv6.md
+            @@ -4,13 +4,13 @@
+
+             ### Basic Usage
+
+            -``` sql
+            +```sql
+             CREATE TABLE hits (url String, from IPv6) ENGINE = MergeTree() ORDER BY url;
+
+             DESCRIBE TABLE hits;
+             ```
+
+            -```
+            +```text
+             ┌─name─┬─type───┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┐
+             │ url  │ String │              │                    │         │                  │
+             │ from │ IPv6   │              │                    │         │                  │
+            @@ -19,19 +19,19 @@ DESCRIBE TABLE hits;
+
+             OR you can use `IPv6` domain as a key:
+
+            -``` sql
+            +```sql
+             CREATE TABLE hits (url String, from IPv6) ENGINE = MergeTree() ORDER BY from;
+            ... MORE
+
+    OPTIONS:
+        -h, --help    show this help message and exit
+        --no-pager    use stdout as difference result output
+'''
+
+SCRIPT_PATH = os.path.abspath(__file__)
+CLICKHOUSE_REPO_HOME = os.path.join(os.path.dirname(SCRIPT_PATH), '..', '..')
+SCRIPT_COMMAND_EXECUTOR = cmd.Git(CLICKHOUSE_REPO_HOME)
+
+SCRIPT_COMMAND_PARSER = argparse.ArgumentParser(add_help=False)
+SCRIPT_COMMAND_PARSER.add_argument('path', type=str, nargs='?', default=None)
+SCRIPT_COMMAND_PARSER.add_argument('--no-pager', action='store_true', default=False)
+SCRIPT_COMMAND_PARSER.add_argument('-h', '--help', action='store_true', default=False)
+
+
+def execute(commands):
+    return SCRIPT_COMMAND_EXECUTOR.execute(commands)
+
+
+def get_hash(file_name):
+    # Hash of the last commit that touched the file.
+    return execute(['git', 'log', '-n', '1', '--pretty=format:"%H"', file_name])
+
+
+def diff_file(reference_file, working_file, out):
+    if not os.path.exists(reference_file):
+        raise RuntimeError('reference file [' + os.path.abspath(reference_file) + '] does not exist.')
+
+    if os.path.islink(working_file):
+        # Still a symlink to the English document: not translated yet.
+        out.writelines(['Need to translate document: ' + os.path.abspath(reference_file) + '\n'])
+    elif not os.path.exists(working_file):
+        # Missing entirely: link it to the English document first.
+        out.writelines(['Need to link document: ' + os.path.abspath(reference_file) + ' to ' + os.path.abspath(working_file) + '\n'])
+    elif get_hash(working_file) != get_hash(reference_file):
+        # The English document changed after the translation was last touched:
+        # show what changed since the commit they were synchronized at.
+        out.writelines([execute(['git', 'diff', get_hash(working_file).strip('"'), reference_file]) + '\n'])
+
+    return 0
+
+
+def diff_directory(reference_directory, working_directory, out):
+    if not os.path.isdir(reference_directory):
+        return diff_file(reference_directory, working_directory, out)
+
+    for list_item in os.listdir(reference_directory):
+        working_item = os.path.join(working_directory, list_item)
+        reference_item = os.path.join(reference_directory, list_item)
+        diff = diff_file if os.path.isfile(reference_item) else diff_directory
+        if diff(reference_item, working_item, out) != 0:
+            return 1
+
+    return 0
+
+
+def find_language_doc(custom_document, other_language='en', children=None):
+    children = children if children is not None else []
+    if len(custom_document) == 0:
+        raise RuntimeError('The ' + os.path.join(custom_document, *children) + ' is not in the docs directory.')
+
+    if os.path.samefile(os.path.join(CLICKHOUSE_REPO_HOME, 'docs'), custom_document):
+        return os.path.join(CLICKHOUSE_REPO_HOME, 'docs', other_language, *children[1:])
+    children.insert(0, os.path.split(custom_document)[1])
+    return find_language_doc(os.path.split(custom_document)[0], other_language, children)
+
+
+class ToPager:
+    def __init__(self, temp_named_file):
+        self.temp_named_file = temp_named_file
+
+    def writelines(self, lines):
+        self.temp_named_file.writelines(lines)
+
+    def close(self):
+        self.temp_named_file.flush()
+        git_pager = execute(['git', 'var', 'GIT_PAGER'])
+        subprocess.check_call([git_pager, self.temp_named_file.name])
+        self.temp_named_file.close()
+
+
+class ToStdOut:
+    def __init__(self, system_stdout_stream):
+        self.system_stdout_stream = system_stdout_stream
+
+    def writelines(self, lines):
+        self.system_stdout_stream.writelines(lines)
+
+    def close(self):
+        self.system_stdout_stream.flush()
+
+
+if __name__ == '__main__':
+    arguments = SCRIPT_COMMAND_PARSER.parse_args()
+    if arguments.help or not arguments.path:
+        sys.stdout.write(SCRIPT_DESCRIPTION)
+        sys.exit(0)
+
+    working_language = os.path.join(CLICKHOUSE_REPO_HOME, 'docs', arguments.path)
+    with contextlib.closing(ToStdOut(sys.stdout) if arguments.no_pager else ToPager(NamedTemporaryFile('r+'))) as writer:
+        sys.exit(diff_directory(find_language_doc(working_language), working_language, writer))
diff --git a/docs/tools/requirements.txt b/docs/tools/requirements.txt
index 2c395da402c..d95ddee9452 100644
--- a/docs/tools/requirements.txt
+++ b/docs/tools/requirements.txt
@@ -33,3 +33,4 @@ tornado==5.1
 typing==3.6.2
 Unidecode==1.0.23
 urllib3==1.24.2
+gitpython==2.1.14
diff --git a/docs/zh/getting_started/example_datasets/amplab_benchmark.md b/docs/zh/getting_started/example_datasets/amplab_benchmark.md
index 415457c9403..5afd7dfd705 100644
--- a/docs/zh/getting_started/example_datasets/amplab_benchmark.md
+++ b/docs/zh/getting_started/example_datasets/amplab_benchmark.md
@@ -7,21 +7,21 @@
 在控制台运行以下命令:
 
 ```bash
-sudo apt-get install s3cmd
-mkdir tiny; cd tiny;
-s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/tiny/ .
-cd ..
-mkdir 1node; cd 1node;
-s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/1node/ .
-cd ..
-mkdir 5nodes; cd 5nodes;
-s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/5nodes/ .
-cd ..
+$ sudo apt-get install s3cmd
+$ mkdir tiny; cd tiny;
+$ s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/tiny/ .
+$ cd ..
+$ mkdir 1node; cd 1node;
+$ s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/1node/ .
+$ cd ..
+$ mkdir 5nodes; cd 5nodes;
+$ s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/5nodes/ .
+$ cd ..
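+# tiny、1node、5nodes 三个目录分别对应 AMPLab 基准测试的三种数据规模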
``` 在ClickHouse运行如下查询: -``` sql +```sql CREATE TABLE rankings_tiny ( pageURL String, @@ -86,12 +86,12 @@ CREATE TABLE uservisits_5nodes_on_single 回到控制台运行如下命令: ```bash -for i in tiny/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_tiny FORMAT CSV"; done -for i in tiny/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_tiny FORMAT CSV"; done -for i in 1node/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_1node FORMAT CSV"; done -for i in 1node/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_1node FORMAT CSV"; done -for i in 5nodes/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_5nodes_on_single FORMAT CSV"; done -for i in 5nodes/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_5nodes_on_single FORMAT CSV"; done +$ for i in tiny/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_tiny FORMAT CSV"; done +$ for i in tiny/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_tiny FORMAT CSV"; done +$ for i in 1node/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_1node FORMAT CSV"; done +$ for i in 1node/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_1node FORMAT CSV"; done +$ for i in 5nodes/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_5nodes_on_single FORMAT CSV"; done +$ for i in 5nodes/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_5nodes_on_single FORMAT CSV"; done ``` 简单的查询示例: diff --git a/docs/zh/getting_started/example_datasets/criteo.md b/docs/zh/getting_started/example_datasets/criteo.md index 9914bb8720c..3a86f630d7b 100644 --- a/docs/zh/getting_started/example_datasets/criteo.md +++ b/docs/zh/getting_started/example_datasets/criteo.md @@ -4,14 +4,14 @@ 创建原始数据对应的表结构: -``` sql +```sql CREATE TABLE criteo_log (date Date, clicked UInt8, int1 Int32, int2 Int32, int3 Int32, int4 Int32, int5 Int32, int6 Int32, int7 Int32, int8 Int32, int9 Int32, int10 Int32, int11 Int32, int12 Int32, int13 Int32, cat1 String, cat2 String, cat3 String, cat4 String, cat5 String, cat6 String, cat7 String, cat8 String, cat9 String, cat10 String, cat11 String, cat12 String, cat13 String, cat14 String, cat15 String, cat16 String, cat17 String, cat18 String, cat19 String, cat20 String, cat21 String, cat22 String, cat23 String, cat24 String, cat25 String, cat26 String) ENGINE = Log ``` 下载数据: ```bash -for i in {00..23}; do echo $i; zcat datasets/criteo/day_${i#0}.gz | sed -r 's/^/2000-01-'${i/00/24}'\t/' | clickhouse-client --host=example-perftest01j --query="INSERT INTO criteo_log FORMAT TabSeparated"; done +$ for i in {00..23}; do echo $i; zcat 
datasets/criteo/day_${i#0}.gz | sed -r 's/^/2000-01-'${i/00/24}'\t/' | clickhouse-client --host=example-perftest01j --query="INSERT INTO criteo_log FORMAT TabSeparated"; done ``` 创建转换后的数据对应的表结构: @@ -65,7 +65,7 @@ CREATE TABLE criteo 将第一张表中的原始数据转化写入到第二张表中去: -``` sql +```sql INSERT INTO criteo SELECT date, clicked, int1, int2, int3, int4, int5, int6, int7, int8, int9, int10, int11, int12, int13, reinterpretAsUInt32(unhex(cat1)) AS icat1, reinterpretAsUInt32(unhex(cat2)) AS icat2, reinterpretAsUInt32(unhex(cat3)) AS icat3, reinterpretAsUInt32(unhex(cat4)) AS icat4, reinterpretAsUInt32(unhex(cat5)) AS icat5, reinterpretAsUInt32(unhex(cat6)) AS icat6, reinterpretAsUInt32(unhex(cat7)) AS icat7, reinterpretAsUInt32(unhex(cat8)) AS icat8, reinterpretAsUInt32(unhex(cat9)) AS icat9, reinterpretAsUInt32(unhex(cat10)) AS icat10, reinterpretAsUInt32(unhex(cat11)) AS icat11, reinterpretAsUInt32(unhex(cat12)) AS icat12, reinterpretAsUInt32(unhex(cat13)) AS icat13, reinterpretAsUInt32(unhex(cat14)) AS icat14, reinterpretAsUInt32(unhex(cat15)) AS icat15, reinterpretAsUInt32(unhex(cat16)) AS icat16, reinterpretAsUInt32(unhex(cat17)) AS icat17, reinterpretAsUInt32(unhex(cat18)) AS icat18, reinterpretAsUInt32(unhex(cat19)) AS icat19, reinterpretAsUInt32(unhex(cat20)) AS icat20, reinterpretAsUInt32(unhex(cat21)) AS icat21, reinterpretAsUInt32(unhex(cat22)) AS icat22, reinterpretAsUInt32(unhex(cat23)) AS icat23, reinterpretAsUInt32(unhex(cat24)) AS icat24, reinterpretAsUInt32(unhex(cat25)) AS icat25, reinterpretAsUInt32(unhex(cat26)) AS icat26 FROM criteo_log; DROP TABLE criteo_log; diff --git a/docs/zh/getting_started/example_datasets/nyc_taxi.md b/docs/zh/getting_started/example_datasets/nyc_taxi.md index 16adae18120..338ac5ba0b7 100644 --- a/docs/zh/getting_started/example_datasets/nyc_taxi.md +++ b/docs/zh/getting_started/example_datasets/nyc_taxi.md @@ -1,5 +1,10 @@ # 纽约市出租车数据 +纽约市出租车数据有以下两个方式获取: + +从原始数据导入 +下载预处理好的分区数据 + ## 怎样导入原始数据 可以参考中的关于数据集结构描述与数据下载指令说明。 @@ -24,8 +29,8 @@ mv data/yellow_tripdata_2010-03.csv_ data/yellow_tripdata_2010-03.csv 您可以按如下方式检查下载的行数: -``` -time psql nyc-taxi-data -c "SELECT count(*) FROM trips;" +```bash +$ time psql nyc-taxi-data -c "SELECT count(*) FROM trips;" ## Count 1298979494 (1 row) @@ -39,7 +44,7 @@ PostgreSQL处理这些数据大概需要370GB的磁盘空间。 从PostgreSQL中导出数据: -``` sql +```sql COPY ( SELECT trips.id, @@ -114,7 +119,7 @@ COPY 在ClickHouse中创建临时表: -``` sql +```sql CREATE TABLE trips ( trip_id UInt32, @@ -173,8 +178,8 @@ dropoff_puma Nullable(String) 接下来,需要将字段转换为更正确的数据类型,并且在可能的情况下,消除NULL。 -``` -time clickhouse-client --query="INSERT INTO trips FORMAT TabSeparated" < trips.tsv +```bash +$ time clickhouse-client --query="INSERT INTO trips FORMAT TabSeparated" < trips.tsv real 75m56.214s ``` @@ -191,7 +196,7 @@ real 75m56.214s 创建表结构并写入数据: -``` +```sql CREATE TABLE trips_mergetree ENGINE = MergeTree(pickup_date, pickup_datetime, 8192) AS SELECT @@ -258,13 +263,10 @@ FROM trips 这个表需要使用126GB的磁盘空间。 +```sql +SELECT formatReadableSize(sum(bytes)) FROM system.parts WHERE table = 'trips_mergetree' AND active ``` -:) SELECT formatReadableSize(sum(bytes)) FROM system.parts WHERE table = 'trips_mergetree' AND active - -SELECT formatReadableSize(sum(bytes)) -FROM system.parts -WHERE (table = 'trips_mergetree') AND active - +```text ┌─formatReadableSize(sum(bytes))─┐ │ 126.18 GiB │ └────────────────────────────────┘ @@ -272,11 +274,26 @@ WHERE (table = 'trips_mergetree') AND active 除此之外,你还可以在MergeTree上运行OPTIMIZE查询来进行优化。但这不是必须的,因为即使在没有进行优化的情况下它的表现依然是很好的。 +## 下载预处理好的分区数据 + +```bash +$ curl -O 
https://clickhouse-datasets.s3.yandex.net/trips_mergetree/partitions/trips_mergetree.tar +$ tar xvf trips_mergetree.tar -C /var/lib/clickhouse # path to ClickHouse data directory +$ # check permissions of unpacked data, fix if required +$ sudo service clickhouse-server restart +$ clickhouse-client --query "select count(*) from datasets.trips_mergetree" +``` + +!!!info + 如果要运行下面的SQL查询,必须使用完整的表名, + `datasets.trips_mergetree`。 + + ## 单台服务器运行结果 Q1: -``` sql +```sql SELECT cab_type, count(*) FROM trips_mergetree GROUP BY cab_type ``` @@ -284,7 +301,7 @@ SELECT cab_type, count(*) FROM trips_mergetree GROUP BY cab_type Q2: -``` sql +```sql SELECT passenger_count, avg(total_amount) FROM trips_mergetree GROUP BY passenger_count ``` @@ -292,7 +309,7 @@ SELECT passenger_count, avg(total_amount) FROM trips_mergetree GROUP BY passenge Q3: -``` sql +```sql SELECT passenger_count, toYear(pickup_date) AS year, count(*) FROM trips_mergetree GROUP BY passenger_count, year ``` @@ -300,7 +317,7 @@ SELECT passenger_count, toYear(pickup_date) AS year, count(*) FROM trips_mergetr Q4: -``` sql +```sql SELECT passenger_count, toYear(pickup_date) AS year, round(trip_distance) AS distance, count(*) FROM trips_mergetree GROUP BY passenger_count, year, distance @@ -319,19 +336,19 @@ Two Intel(R) Xeon(R) CPU E5-2650 v2 @ 2.60GHz, 16 physical kernels total,128 GiB 在每台服务器中运行: -``` +```sql CREATE TABLE default.trips_mergetree_third ( trip_id UInt32, vendor_id Enum8('1' = 1, '2' = 2, 'CMT' = 3, 'VTS' = 4, 'DDS' = 5, 'B02512' = 10, 'B02598' = 11, 'B02617' = 12, 'B02682' = 13, 'B02764' = 14), pickup_date Date, pickup_datetime DateTime, dropoff_date Date, dropoff_datetime DateTime, store_and_fwd_flag UInt8, rate_code_id UInt8, pickup_longitude Float64, pickup_latitude Float64, dropoff_longitude Float64, dropoff_latitude Float64, passenger_count UInt8, trip_distance Float64, fare_amount Float32, extra Float32, mta_tax Float32, tip_amount Float32, tolls_amount Float32, ehail_fee Float32, improvement_surcharge Float32, total_amount Float32, payment_type_ Enum8('UNK' = 0, 'CSH' = 1, 'CRE' = 2, 'NOC' = 3, 'DIS' = 4), trip_type UInt8, pickup FixedString(25), dropoff FixedString(25), cab_type Enum8('yellow' = 1, 'green' = 2, 'uber' = 3), pickup_nyct2010_gid UInt8, pickup_ctlabel Float32, pickup_borocode UInt8, pickup_boroname Enum8('' = 0, 'Manhattan' = 1, 'Bronx' = 2, 'Brooklyn' = 3, 'Queens' = 4, 'Staten Island' = 5), pickup_ct2010 FixedString(6), pickup_boroct2010 FixedString(7), pickup_cdeligibil Enum8(' ' = 0, 'E' = 1, 'I' = 2), pickup_ntacode FixedString(4), pickup_ntaname Enum16('' = 0, 'Airport' = 1, 'Allerton-Pelham Gardens' = 2, 'Annadale-Huguenot-Prince\'s Bay-Eltingville' = 3, 'Arden Heights' = 4, 'Astoria' = 5, 'Auburndale' = 6, 'Baisley Park' = 7, 'Bath Beach' = 8, 'Battery Park City-Lower Manhattan' = 9, 'Bay Ridge' = 10, 'Bayside-Bayside Hills' = 11, 'Bedford' = 12, 'Bedford Park-Fordham North' = 13, 'Bellerose' = 14, 'Belmont' = 15, 'Bensonhurst East' = 16, 'Bensonhurst West' = 17, 'Borough Park' = 18, 'Breezy Point-Belle Harbor-Rockaway Park-Broad Channel' = 19, 'Briarwood-Jamaica Hills' = 20, 'Brighton Beach' = 21, 'Bronxdale' = 22, 'Brooklyn Heights-Cobble Hill' = 23, 'Brownsville' = 24, 'Bushwick North' = 25, 'Bushwick South' = 26, 'Cambria Heights' = 27, 'Canarsie' = 28, 'Carroll Gardens-Columbia Street-Red Hook' = 29, 'Central Harlem North-Polo Grounds' = 30, 'Central Harlem South' = 31, 'Charleston-Richmond Valley-Tottenville' = 32, 'Chinatown' = 33, 'Claremont-Bathgate' = 34, 'Clinton' = 35, 'Clinton Hill' = 
36, 'Co-op City' = 37, 'College Point' = 38, 'Corona' = 39, 'Crotona Park East' = 40, 'Crown Heights North' = 41, 'Crown Heights South' = 42, 'Cypress Hills-City Line' = 43, 'DUMBO-Vinegar Hill-Downtown Brooklyn-Boerum Hill' = 44, 'Douglas Manor-Douglaston-Little Neck' = 45, 'Dyker Heights' = 46, 'East Concourse-Concourse Village' = 47, 'East Elmhurst' = 48, 'East Flatbush-Farragut' = 49, 'East Flushing' = 50, 'East Harlem North' = 51, 'East Harlem South' = 52, 'East New York' = 53, 'East New York (Pennsylvania Ave)' = 54, 'East Tremont' = 55, 'East Village' = 56, 'East Williamsburg' = 57, 'Eastchester-Edenwald-Baychester' = 58, 'Elmhurst' = 59, 'Elmhurst-Maspeth' = 60, 'Erasmus' = 61, 'Far Rockaway-Bayswater' = 62, 'Flatbush' = 63, 'Flatlands' = 64, 'Flushing' = 65, 'Fordham South' = 66, 'Forest Hills' = 67, 'Fort Greene' = 68, 'Fresh Meadows-Utopia' = 69, 'Ft. Totten-Bay Terrace-Clearview' = 70, 'Georgetown-Marine Park-Bergen Beach-Mill Basin' = 71, 'Glen Oaks-Floral Park-New Hyde Park' = 72, 'Glendale' = 73, 'Gramercy' = 74, 'Grasmere-Arrochar-Ft. Wadsworth' = 75, 'Gravesend' = 76, 'Great Kills' = 77, 'Greenpoint' = 78, 'Grymes Hill-Clifton-Fox Hills' = 79, 'Hamilton Heights' = 80, 'Hammels-Arverne-Edgemere' = 81, 'Highbridge' = 82, 'Hollis' = 83, 'Homecrest' = 84, 'Hudson Yards-Chelsea-Flatiron-Union Square' = 85, 'Hunters Point-Sunnyside-West Maspeth' = 86, 'Hunts Point' = 87, 'Jackson Heights' = 88, 'Jamaica' = 89, 'Jamaica Estates-Holliswood' = 90, 'Kensington-Ocean Parkway' = 91, 'Kew Gardens' = 92, 'Kew Gardens Hills' = 93, 'Kingsbridge Heights' = 94, 'Laurelton' = 95, 'Lenox Hill-Roosevelt Island' = 96, 'Lincoln Square' = 97, 'Lindenwood-Howard Beach' = 98, 'Longwood' = 99, 'Lower East Side' = 100, 'Madison' = 101, 'Manhattanville' = 102, 'Marble Hill-Inwood' = 103, 'Mariner\'s Harbor-Arlington-Port Ivory-Graniteville' = 104, 'Maspeth' = 105, 'Melrose South-Mott Haven North' = 106, 'Middle Village' = 107, 'Midtown-Midtown South' = 108, 'Midwood' = 109, 'Morningside Heights' = 110, 'Morrisania-Melrose' = 111, 'Mott Haven-Port Morris' = 112, 'Mount Hope' = 113, 'Murray Hill' = 114, 'Murray Hill-Kips Bay' = 115, 'New Brighton-Silver Lake' = 116, 'New Dorp-Midland Beach' = 117, 'New Springville-Bloomfield-Travis' = 118, 'North Corona' = 119, 'North Riverdale-Fieldston-Riverdale' = 120, 'North Side-South Side' = 121, 'Norwood' = 122, 'Oakland Gardens' = 123, 'Oakwood-Oakwood Beach' = 124, 'Ocean Hill' = 125, 'Ocean Parkway South' = 126, 'Old Astoria' = 127, 'Old Town-Dongan Hills-South Beach' = 128, 'Ozone Park' = 129, 'Park Slope-Gowanus' = 130, 'Parkchester' = 131, 'Pelham Bay-Country Club-City Island' = 132, 'Pelham Parkway' = 133, 'Pomonok-Flushing Heights-Hillcrest' = 134, 'Port Richmond' = 135, 'Prospect Heights' = 136, 'Prospect Lefferts Gardens-Wingate' = 137, 'Queens Village' = 138, 'Queensboro Hill' = 139, 'Queensbridge-Ravenswood-Long Island City' = 140, 'Rego Park' = 141, 'Richmond Hill' = 142, 'Ridgewood' = 143, 'Rikers Island' = 144, 'Rosedale' = 145, 'Rossville-Woodrow' = 146, 'Rugby-Remsen Village' = 147, 'Schuylerville-Throgs Neck-Edgewater Park' = 148, 'Seagate-Coney Island' = 149, 'Sheepshead Bay-Gerritsen Beach-Manhattan Beach' = 150, 'SoHo-TriBeCa-Civic Center-Little Italy' = 151, 'Soundview-Bruckner' = 152, 'Soundview-Castle Hill-Clason Point-Harding Park' = 153, 'South Jamaica' = 154, 'South Ozone Park' = 155, 'Springfield Gardens North' = 156, 'Springfield Gardens South-Brookville' = 157, 'Spuyten Duyvil-Kingsbridge' = 158, 'St. 
Albans' = 159, 'Stapleton-Rosebank' = 160, 'Starrett City' = 161, 'Steinway' = 162, 'Stuyvesant Heights' = 163, 'Stuyvesant Town-Cooper Village' = 164, 'Sunset Park East' = 165, 'Sunset Park West' = 166, 'Todt Hill-Emerson Hill-Heartland Village-Lighthouse Hill' = 167, 'Turtle Bay-East Midtown' = 168, 'University Heights-Morris Heights' = 169, 'Upper East Side-Carnegie Hill' = 170, 'Upper West Side' = 171, 'Van Cortlandt Village' = 172, 'Van Nest-Morris Park-Westchester Square' = 173, 'Washington Heights North' = 174, 'Washington Heights South' = 175, 'West Brighton' = 176, 'West Concourse' = 177, 'West Farms-Bronx River' = 178, 'West New Brighton-New Brighton-St. George' = 179, 'West Village' = 180, 'Westchester-Unionport' = 181, 'Westerleigh' = 182, 'Whitestone' = 183, 'Williamsbridge-Olinville' = 184, 'Williamsburg' = 185, 'Windsor Terrace' = 186, 'Woodhaven' = 187, 'Woodlawn-Wakefield' = 188, 'Woodside' = 189, 'Yorkville' = 190, 'park-cemetery-etc-Bronx' = 191, 'park-cemetery-etc-Brooklyn' = 192, 'park-cemetery-etc-Manhattan' = 193, 'park-cemetery-etc-Queens' = 194, 'park-cemetery-etc-Staten Island' = 195), pickup_puma UInt16, dropoff_nyct2010_gid UInt8, dropoff_ctlabel Float32, dropoff_borocode UInt8, dropoff_boroname Enum8('' = 0, 'Manhattan' = 1, 'Bronx' = 2, 'Brooklyn' = 3, 'Queens' = 4, 'Staten Island' = 5), dropoff_ct2010 FixedString(6), dropoff_boroct2010 FixedString(7), dropoff_cdeligibil Enum8(' ' = 0, 'E' = 1, 'I' = 2), dropoff_ntacode FixedString(4), dropoff_ntaname Enum16('' = 0, 'Airport' = 1, 'Allerton-Pelham Gardens' = 2, 'Annadale-Huguenot-Prince\'s Bay-Eltingville' = 3, 'Arden Heights' = 4, 'Astoria' = 5, 'Auburndale' = 6, 'Baisley Park' = 7, 'Bath Beach' = 8, 'Battery Park City-Lower Manhattan' = 9, 'Bay Ridge' = 10, 'Bayside-Bayside Hills' = 11, 'Bedford' = 12, 'Bedford Park-Fordham North' = 13, 'Bellerose' = 14, 'Belmont' = 15, 'Bensonhurst East' = 16, 'Bensonhurst West' = 17, 'Borough Park' = 18, 'Breezy Point-Belle Harbor-Rockaway Park-Broad Channel' = 19, 'Briarwood-Jamaica Hills' = 20, 'Brighton Beach' = 21, 'Bronxdale' = 22, 'Brooklyn Heights-Cobble Hill' = 23, 'Brownsville' = 24, 'Bushwick North' = 25, 'Bushwick South' = 26, 'Cambria Heights' = 27, 'Canarsie' = 28, 'Carroll Gardens-Columbia Street-Red Hook' = 29, 'Central Harlem North-Polo Grounds' = 30, 'Central Harlem South' = 31, 'Charleston-Richmond Valley-Tottenville' = 32, 'Chinatown' = 33, 'Claremont-Bathgate' = 34, 'Clinton' = 35, 'Clinton Hill' = 36, 'Co-op City' = 37, 'College Point' = 38, 'Corona' = 39, 'Crotona Park East' = 40, 'Crown Heights North' = 41, 'Crown Heights South' = 42, 'Cypress Hills-City Line' = 43, 'DUMBO-Vinegar Hill-Downtown Brooklyn-Boerum Hill' = 44, 'Douglas Manor-Douglaston-Little Neck' = 45, 'Dyker Heights' = 46, 'East Concourse-Concourse Village' = 47, 'East Elmhurst' = 48, 'East Flatbush-Farragut' = 49, 'East Flushing' = 50, 'East Harlem North' = 51, 'East Harlem South' = 52, 'East New York' = 53, 'East New York (Pennsylvania Ave)' = 54, 'East Tremont' = 55, 'East Village' = 56, 'East Williamsburg' = 57, 'Eastchester-Edenwald-Baychester' = 58, 'Elmhurst' = 59, 'Elmhurst-Maspeth' = 60, 'Erasmus' = 61, 'Far Rockaway-Bayswater' = 62, 'Flatbush' = 63, 'Flatlands' = 64, 'Flushing' = 65, 'Fordham South' = 66, 'Forest Hills' = 67, 'Fort Greene' = 68, 'Fresh Meadows-Utopia' = 69, 'Ft. Totten-Bay Terrace-Clearview' = 70, 'Georgetown-Marine Park-Bergen Beach-Mill Basin' = 71, 'Glen Oaks-Floral Park-New Hyde Park' = 72, 'Glendale' = 73, 'Gramercy' = 74, 'Grasmere-Arrochar-Ft. 
Wadsworth' = 75, 'Gravesend' = 76, 'Great Kills' = 77, 'Greenpoint' = 78, 'Grymes Hill-Clifton-Fox Hills' = 79, 'Hamilton Heights' = 80, 'Hammels-Arverne-Edgemere' = 81, 'Highbridge' = 82, 'Hollis' = 83, 'Homecrest' = 84, 'Hudson Yards-Chelsea-Flatiron-Union Square' = 85, 'Hunters Point-Sunnyside-West Maspeth' = 86, 'Hunts Point' = 87, 'Jackson Heights' = 88, 'Jamaica' = 89, 'Jamaica Estates-Holliswood' = 90, 'Kensington-Ocean Parkway' = 91, 'Kew Gardens' = 92, 'Kew Gardens Hills' = 93, 'Kingsbridge Heights' = 94, 'Laurelton' = 95, 'Lenox Hill-Roosevelt Island' = 96, 'Lincoln Square' = 97, 'Lindenwood-Howard Beach' = 98, 'Longwood' = 99, 'Lower East Side' = 100, 'Madison' = 101, 'Manhattanville' = 102, 'Marble Hill-Inwood' = 103, 'Mariner\'s Harbor-Arlington-Port Ivory-Graniteville' = 104, 'Maspeth' = 105, 'Melrose South-Mott Haven North' = 106, 'Middle Village' = 107, 'Midtown-Midtown South' = 108, 'Midwood' = 109, 'Morningside Heights' = 110, 'Morrisania-Melrose' = 111, 'Mott Haven-Port Morris' = 112, 'Mount Hope' = 113, 'Murray Hill' = 114, 'Murray Hill-Kips Bay' = 115, 'New Brighton-Silver Lake' = 116, 'New Dorp-Midland Beach' = 117, 'New Springville-Bloomfield-Travis' = 118, 'North Corona' = 119, 'North Riverdale-Fieldston-Riverdale' = 120, 'North Side-South Side' = 121, 'Norwood' = 122, 'Oakland Gardens' = 123, 'Oakwood-Oakwood Beach' = 124, 'Ocean Hill' = 125, 'Ocean Parkway South' = 126, 'Old Astoria' = 127, 'Old Town-Dongan Hills-South Beach' = 128, 'Ozone Park' = 129, 'Park Slope-Gowanus' = 130, 'Parkchester' = 131, 'Pelham Bay-Country Club-City Island' = 132, 'Pelham Parkway' = 133, 'Pomonok-Flushing Heights-Hillcrest' = 134, 'Port Richmond' = 135, 'Prospect Heights' = 136, 'Prospect Lefferts Gardens-Wingate' = 137, 'Queens Village' = 138, 'Queensboro Hill' = 139, 'Queensbridge-Ravenswood-Long Island City' = 140, 'Rego Park' = 141, 'Richmond Hill' = 142, 'Ridgewood' = 143, 'Rikers Island' = 144, 'Rosedale' = 145, 'Rossville-Woodrow' = 146, 'Rugby-Remsen Village' = 147, 'Schuylerville-Throgs Neck-Edgewater Park' = 148, 'Seagate-Coney Island' = 149, 'Sheepshead Bay-Gerritsen Beach-Manhattan Beach' = 150, 'SoHo-TriBeCa-Civic Center-Little Italy' = 151, 'Soundview-Bruckner' = 152, 'Soundview-Castle Hill-Clason Point-Harding Park' = 153, 'South Jamaica' = 154, 'South Ozone Park' = 155, 'Springfield Gardens North' = 156, 'Springfield Gardens South-Brookville' = 157, 'Spuyten Duyvil-Kingsbridge' = 158, 'St. Albans' = 159, 'Stapleton-Rosebank' = 160, 'Starrett City' = 161, 'Steinway' = 162, 'Stuyvesant Heights' = 163, 'Stuyvesant Town-Cooper Village' = 164, 'Sunset Park East' = 165, 'Sunset Park West' = 166, 'Todt Hill-Emerson Hill-Heartland Village-Lighthouse Hill' = 167, 'Turtle Bay-East Midtown' = 168, 'University Heights-Morris Heights' = 169, 'Upper East Side-Carnegie Hill' = 170, 'Upper West Side' = 171, 'Van Cortlandt Village' = 172, 'Van Nest-Morris Park-Westchester Square' = 173, 'Washington Heights North' = 174, 'Washington Heights South' = 175, 'West Brighton' = 176, 'West Concourse' = 177, 'West Farms-Bronx River' = 178, 'West New Brighton-New Brighton-St. 
George' = 179, 'West Village' = 180, 'Westchester-Unionport' = 181, 'Westerleigh' = 182, 'Whitestone' = 183, 'Williamsbridge-Olinville' = 184, 'Williamsburg' = 185, 'Windsor Terrace' = 186, 'Woodhaven' = 187, 'Woodlawn-Wakefield' = 188, 'Woodside' = 189, 'Yorkville' = 190, 'park-cemetery-etc-Bronx' = 191, 'park-cemetery-etc-Brooklyn' = 192, 'park-cemetery-etc-Manhattan' = 193, 'park-cemetery-etc-Queens' = 194, 'park-cemetery-etc-Staten Island' = 195), dropoff_puma UInt16) ENGINE = MergeTree(pickup_date, pickup_datetime, 8192)
 ```
 
 在之前的服务器中运行:
 
-``` sql
+```sql
 CREATE TABLE trips_mergetree_x3 AS trips_mergetree_third ENGINE = Distributed(perftest, default, trips_mergetree_third, rand())
 ```
 
 运行如下查询重新分布数据:
 
-``` sql
+```sql
 INSERT INTO trips_mergetree_x3 SELECT * FROM trips_mergetree
 ```
diff --git a/docs/zh/getting_started/example_datasets/ontime.md b/docs/zh/getting_started/example_datasets/ontime.md
index ed81e2459e7..ec4053490a5 100644
--- a/docs/zh/getting_started/example_datasets/ontime.md
+++ b/docs/zh/getting_started/example_datasets/ontime.md
@@ -1,6 +1,13 @@
 # 航班飞行数据
 
+航班飞行数据可以通过以下两种方式获取:
+
+- 从原始数据导入
+- 下载预处理好的分区数据
+
+## 从原始数据导入
+
 下载数据:
 
 ```bash
@@ -134,39 +141,75 @@ CREATE TABLE `ontime` (
 加载数据:
 
 ```bash
-for i in *.zip; do echo $i; unzip -cq $i '*.csv' | sed 's/\.00//g' | clickhouse-client --host=example-perftest01j --query="INSERT INTO ontime FORMAT CSVWithNames"; done
+$ for i in *.zip; do echo $i; unzip -cq $i '*.csv' | sed 's/\.00//g' | clickhouse-client --host=example-perftest01j --query="INSERT INTO ontime FORMAT CSVWithNames"; done
 ```
 
-查询:
+## 下载预处理好的分区数据
+
+```bash
+$ curl -O https://clickhouse-datasets.s3.yandex.net/ontime/partitions/ontime.tar
+$ tar xvf ontime.tar -C /var/lib/clickhouse # path to ClickHouse data directory
+$ # check permissions of unpacked data, fix if required
+$ sudo service clickhouse-server restart
+$ clickhouse-client --query "select count(*) from datasets.ontime"
+```
+
+!!! info
+    如果要运行下面的SQL查询,必须使用完整的表名 `datasets.ontime`。
+
+## 查询
 
 Q0.
 
 ```sql
-select avg(c1) from (select Year, Month, count(*) as c1 from ontime group by Year, Month);
+SELECT avg(c1)
+FROM
+(
+    SELECT Year, Month, count(*) AS c1
+    FROM ontime
+    GROUP BY Year, Month
+);
 ```
 
 Q1. 查询从2000年到2008年每天的航班数
 
 ```sql
-SELECT DayOfWeek, count(*) AS c FROM ontime WHERE Year >= 2000 AND Year <= 2008 GROUP BY DayOfWeek ORDER BY c DESC;
+SELECT DayOfWeek, count(*) AS c
+FROM ontime
+WHERE Year>=2000 AND Year<=2008
+GROUP BY DayOfWeek
+ORDER BY c DESC;
 ```
 
 Q2. 查询从2000年到2008年每周延误超过10分钟的航班数。
 
 ```sql
-SELECT DayOfWeek, count(*) AS c FROM ontime WHERE DepDelay>10 AND Year >= 2000 AND Year <= 2008 GROUP BY DayOfWeek ORDER BY c DESC
+SELECT DayOfWeek, count(*) AS c
+FROM ontime
+WHERE DepDelay>10 AND Year>=2000 AND Year<=2008
+GROUP BY DayOfWeek
+ORDER BY c DESC;
 ```
 
 Q3. 查询2000年到2008年每个机场延误超过10分钟以上的次数
 
 ```sql
-SELECT Origin, count(*) AS c FROM ontime WHERE DepDelay>10 AND Year >= 2000 AND Year <= 2008 GROUP BY Origin ORDER BY c DESC LIMIT 10
+SELECT Origin, count(*) AS c
+FROM ontime
+WHERE DepDelay>10 AND Year>=2000 AND Year<=2008
+GROUP BY Origin
+ORDER BY c DESC
+LIMIT 10;
 ```
 
 Q4. 查询2007年各航空公司延误超过10分钟以上的次数
 
 ```sql
-SELECT Carrier, count(*) FROM ontime WHERE DepDelay>10 AND Year = 2007 GROUP BY Carrier ORDER BY count(*) DESC
+SELECT Carrier, count(*)
+FROM ontime
+WHERE DepDelay>10 AND Year=2007
+GROUP BY Carrier
+ORDER BY count(*) DESC;
 ```
 
 Q5. 查询2007年各航空公司延误超过10分钟以上的百分比
 
@@ -198,7 +241,11 @@ ORDER BY c3 DESC;
 更好的查询版本:
 
 ```sql
-SELECT Carrier, avg(DepDelay > 10) * 100 AS c3 FROM ontime WHERE Year = 2007 GROUP BY Carrier ORDER BY Carrier
+SELECT Carrier, avg(DepDelay>10)*100 AS c3
+FROM ontime
+WHERE Year=2007
+GROUP BY Carrier
+ORDER BY Carrier;
 ```
 
 Q6. 同上一个查询一致,只是查询范围扩大到2000年到2008年
 
@@ -212,7 +259,7 @@ FROM
     count(*) AS c
     FROM ontime
     WHERE DepDelay>10
-    AND Year >= 2000 AND Year <= 2008
+    AND Year>=2000 AND Year<=2008
     GROUP BY Carrier
 )
 ANY INNER JOIN
@@ -221,7 +268,7 @@ ANY INNER JOIN
     Carrier,
     count(*) AS c2
     FROM ontime
-    WHERE Year >= 2000 AND Year <= 2008
+    WHERE Year>=2000 AND Year<=2008
     GROUP BY Carrier
 ) USING Carrier
 ORDER BY c3 DESC;
@@ -230,7 +277,11 @@ ORDER BY c3 DESC;
 更好的查询版本:
 
 ```sql
-SELECT Carrier, avg(DepDelay > 10) * 100 AS c3 FROM ontime WHERE Year >= 2000 AND Year <= 2008 GROUP BY Carrier ORDER BY Carrier
+SELECT Carrier, avg(DepDelay>10)*100 AS c3
+FROM ontime
+WHERE Year>=2000 AND Year<=2008
+GROUP BY Carrier
+ORDER BY Carrier;
 ```
 
 Q7. 每年航班延误超过10分钟的百分比
 
@@ -254,41 +305,50 @@ ANY INNER JOIN
     from ontime
     GROUP BY Year
 ) USING (Year)
-ORDER BY Year
+ORDER BY Year;
 ```
 
 更好的查询版本:
 
 ```sql
-SELECT Year, avg(DepDelay > 10) FROM ontime GROUP BY Year ORDER BY Year
+SELECT Year, avg(DepDelay>10)
+FROM ontime
+GROUP BY Year
+ORDER BY Year;
 ```
 
 Q8. 每年更受人们喜爱的目的地
 
 ```sql
-SELECT DestCityName, uniqExact(OriginCityName) AS u FROM ontime WHERE Year >= 2000 and Year <= 2010 GROUP BY DestCityName ORDER BY u DESC LIMIT 10;
+SELECT DestCityName, uniqExact(OriginCityName) AS u
+FROM ontime
+WHERE Year>=2000 AND Year<=2010
+GROUP BY DestCityName
+ORDER BY u DESC
+LIMIT 10;
 ```
 
 Q9.
 
 ```sql
-select Year, count(*) as c1 from ontime group by Year;
+SELECT Year, count(*) AS c1
+FROM ontime
+GROUP BY Year;
 ```
 
 Q10.
 
 ```sql
-select
-    min(Year), max(Year), Carrier, count(*) as cnt,
-    sum(ArrDelayMinutes>30) as flights_delayed,
-    round(sum(ArrDelayMinutes>30)/count(*),2) as rate
+SELECT
+    min(Year), max(Year), Carrier, count(*) AS cnt,
+    sum(ArrDelayMinutes>30) AS flights_delayed,
+    round(sum(ArrDelayMinutes>30)/count(*),2) AS rate
 FROM ontime
 WHERE
-    DayOfWeek not in (6,7) and OriginState not in ('AK', 'HI', 'PR', 'VI')
-    and DestState not in ('AK', 'HI', 'PR', 'VI')
-    and FlightDate < '2010-01-01'
+    DayOfWeek NOT IN (6,7) AND OriginState NOT IN ('AK', 'HI', 'PR', 'VI')
+    AND DestState NOT IN ('AK', 'HI', 'PR', 'VI')
+    AND FlightDate < '2010-01-01'
 GROUP by Carrier
-HAVING cnt > 100000 and max(Year) > 1990
+HAVING cnt>100000 AND max(Year)>1990
 ORDER by rate DESC
 LIMIT 1000;
 ```
 
@@ -296,15 +356,39 @@ LIMIT 1000;
 Bonus:
 
 ```sql
-SELECT avg(cnt) FROM (SELECT Year,Month,count(*) AS cnt FROM ontime WHERE DepDel15=1 GROUP BY Year,Month)
+SELECT avg(cnt)
+FROM
+(
+    SELECT Year,Month,count(*) AS cnt
+    FROM ontime
+    WHERE DepDel15=1
+    GROUP BY Year,Month
+);
 
-select avg(c1) from (select Year,Month,count(*) as c1 from ontime group by Year,Month)
+SELECT avg(c1) FROM
+(
+    SELECT Year,Month,count(*) AS c1
+    FROM ontime
+    GROUP BY Year,Month
+);
 
-SELECT DestCityName, uniqExact(OriginCityName) AS u FROM ontime GROUP BY DestCityName ORDER BY u DESC LIMIT 10;
+SELECT DestCityName, uniqExact(OriginCityName) AS u
+FROM ontime
+GROUP BY DestCityName
+ORDER BY u DESC
+LIMIT 10;
 
-SELECT OriginCityName, DestCityName, count() AS c FROM ontime GROUP BY OriginCityName, DestCityName ORDER BY c DESC LIMIT 10;
+SELECT OriginCityName, DestCityName, count() AS c
+FROM ontime
+GROUP BY OriginCityName, DestCityName
+ORDER BY c DESC
+LIMIT 10;
 
-SELECT OriginCityName, count() AS c FROM ontime GROUP BY OriginCityName ORDER BY c DESC LIMIT 10;
+SELECT OriginCityName, count() AS c
+FROM ontime
+GROUP BY OriginCityName
+ORDER BY c DESC
+LIMIT 10;
 ```
 
 这个性能测试由Vadim Tkachenko提供。参考:
diff --git a/docs/zh/getting_started/example_datasets/star_schema.md b/docs/zh/getting_started/example_datasets/star_schema.md
index 1d8af3b29a5..865327b50ec 100644
--- a/docs/zh/getting_started/example_datasets/star_schema.md
+++ b/docs/zh/getting_started/example_datasets/star_schema.md
@@ -1,26 +1,26 @@
 # Star Schema Benchmark
 
-Compiling dbgen:
+编译 dbgen:
 
-```
-git clone git@github.com:vadimtk/ssb-dbgen.git
-cd ssb-dbgen
-make
+```bash
+$ git clone git@github.com:vadimtk/ssb-dbgen.git
+$ cd ssb-dbgen
+$ make
 ```
 
-Generating data:
+开始生成数据:
 
-```
-./dbgen -s 1000 -T c
-./dbgen -s 1000 -T l
-./dbgen -s 1000 -T p
-./dbgen -s 1000 -T s
-./dbgen -s 1000 -T d
+```bash
+$ ./dbgen -s 1000 -T c
+$ ./dbgen -s 1000 -T l
+$ ./dbgen -s 1000 -T p
+$ ./dbgen -s 1000 -T s
+$ ./dbgen -s 1000 -T d
 ```
 
-Creating tables in ClickHouse:
+在ClickHouse中创建表结构:
 
-```
+```sql
 CREATE TABLE customer
 (
         C_CUSTKEY       UInt32,
@@ -83,73 +83,85 @@ CREATE TABLE supplier
 ENGINE = MergeTree ORDER BY S_SUPPKEY;
 ```
 
-Inserting data:
+写入数据:
 
-```
-clickhouse-client --query "INSERT INTO customer FORMAT CSV" < customer.tbl
-clickhouse-client --query "INSERT INTO part FORMAT CSV" < part.tbl
-clickhouse-client --query "INSERT INTO supplier FORMAT CSV" < supplier.tbl
-clickhouse-client --query "INSERT INTO lineorder FORMAT CSV" < lineorder.tbl
+```bash
+$ clickhouse-client --query "INSERT INTO customer FORMAT CSV" < customer.tbl
+$ clickhouse-client --query "INSERT INTO part FORMAT CSV" < part.tbl
+$ clickhouse-client --query "INSERT INTO supplier FORMAT CSV" < supplier.tbl
+$ clickhouse-client --query "INSERT INTO lineorder FORMAT CSV" < lineorder.tbl
 ```
 
-Converting "star schema" to denormalized "flat schema":
+将“星型模型”转换为非规范化的“平面模型”:
 
-```
+```sql
 SET max_memory_usage = 20000000000, allow_experimental_multiple_joins_emulation = 1;
 
 CREATE TABLE lineorder_flat
 ENGINE = MergeTree
 PARTITION BY toYear(LO_ORDERDATE)
 ORDER BY (LO_ORDERDATE, LO_ORDERKEY) AS
-SELECT *
-FROM lineorder
-ANY INNER JOIN customer ON LO_CUSTKEY = C_CUSTKEY
-ANY INNER JOIN supplier ON LO_SUPPKEY = S_SUPPKEY
-ANY INNER JOIN part ON LO_PARTKEY = P_PARTKEY;
+SELECT l.*, c.*, s.*, p.*
+FROM lineorder l
+    ANY INNER JOIN customer c ON (c.C_CUSTKEY = l.LO_CUSTKEY)
+    ANY INNER JOIN supplier s ON (s.S_SUPPKEY = l.LO_SUPPKEY)
+    ANY INNER JOIN part p ON (p.P_PARTKEY = l.LO_PARTKEY);
 
 ALTER TABLE lineorder_flat DROP COLUMN C_CUSTKEY, DROP COLUMN S_SUPPKEY, DROP COLUMN P_PARTKEY;
 ```
 
-Running the queries:
+运行查询:
 
-```
 Q1.1
+```sql
 SELECT sum(LO_EXTENDEDPRICE * LO_DISCOUNT) AS revenue FROM lineorder_flat WHERE toYear(LO_ORDERDATE) = 1993 AND LO_DISCOUNT BETWEEN 1 AND 3 AND LO_QUANTITY < 25;
-
+```
 Q1.2
+```sql
 SELECT sum(LO_EXTENDEDPRICE * LO_DISCOUNT) AS revenue FROM lineorder_flat WHERE toYYYYMM(LO_ORDERDATE) = 199401 AND LO_DISCOUNT BETWEEN 4 AND 6 AND LO_QUANTITY BETWEEN 26 AND 35;
-
+```
 Q1.3
+```sql
 SELECT sum(LO_EXTENDEDPRICE * LO_DISCOUNT) AS revenue FROM lineorder_flat WHERE toISOWeek(LO_ORDERDATE) = 6 AND toYear(LO_ORDERDATE) = 1994 AND LO_DISCOUNT BETWEEN 5 AND 7 AND LO_QUANTITY BETWEEN 26 AND 35;
-
+```
 Q2.1
+```sql
 SELECT sum(LO_REVENUE), toYear(LO_ORDERDATE) AS year, P_BRAND FROM lineorder_flat WHERE P_CATEGORY = 'MFGR#12' AND S_REGION = 'AMERICA' GROUP BY year, P_BRAND ORDER BY year, P_BRAND;
-
+```
 Q2.2
+```sql
 SELECT sum(LO_REVENUE), toYear(LO_ORDERDATE) AS year, P_BRAND FROM lineorder_flat WHERE P_BRAND BETWEEN 'MFGR#2221' AND 'MFGR#2228' AND S_REGION = 'ASIA' GROUP BY year, P_BRAND ORDER BY year, P_BRAND;
-
+```
 Q2.3
+```sql
 SELECT sum(LO_REVENUE), toYear(LO_ORDERDATE) AS year, P_BRAND FROM lineorder_flat WHERE P_BRAND = 'MFGR#2239' AND S_REGION = 'EUROPE' GROUP BY year, P_BRAND ORDER BY year, P_BRAND;
-
+```
 Q3.1
+```sql
 SELECT C_NATION, S_NATION, toYear(LO_ORDERDATE) AS year, sum(LO_REVENUE) AS revenue FROM lineorder_flat WHERE C_REGION = 'ASIA' AND S_REGION = 'ASIA' AND year >= 1992 AND year <= 1997 GROUP BY C_NATION, S_NATION, year ORDER BY year asc, revenue desc;
-
+```
 Q3.2
+```sql
 SELECT C_CITY, S_CITY, toYear(LO_ORDERDATE) AS year, sum(LO_REVENUE) AS revenue FROM lineorder_flat WHERE C_NATION = 'UNITED STATES' AND S_NATION = 'UNITED STATES' AND year >= 1992 AND year <= 1997 GROUP BY C_CITY, S_CITY, year ORDER BY year asc, revenue desc;
-
+```
 Q3.3
+```sql
 SELECT C_CITY, S_CITY, toYear(LO_ORDERDATE) AS year, sum(LO_REVENUE) AS revenue FROM lineorder_flat WHERE (C_CITY = 'UNITED KI1' OR C_CITY = 'UNITED KI5') AND (S_CITY = 'UNITED KI1' OR S_CITY = 'UNITED KI5') AND year >= 1992 AND year <= 1997 GROUP BY C_CITY, S_CITY, year ORDER BY year asc, revenue desc;
-
+```
 Q3.4
+```sql
 SELECT C_CITY, S_CITY, toYear(LO_ORDERDATE) AS year, sum(LO_REVENUE) AS revenue FROM lineorder_flat WHERE (C_CITY = 'UNITED KI1' OR C_CITY = 'UNITED KI5') AND (S_CITY = 'UNITED KI1' OR S_CITY = 'UNITED KI5') AND toYYYYMM(LO_ORDERDATE) = '199712' GROUP BY C_CITY, S_CITY, year ORDER BY year asc, revenue desc;
-
+```
 Q4.1
+```sql
 SELECT toYear(LO_ORDERDATE) AS year, C_NATION, sum(LO_REVENUE - LO_SUPPLYCOST) AS profit FROM lineorder_flat WHERE C_REGION = 'AMERICA' AND S_REGION = 'AMERICA' AND (P_MFGR = 'MFGR#1' OR P_MFGR = 'MFGR#2') GROUP BY year, C_NATION ORDER BY year, C_NATION;
-
+```
 Q4.2
+```sql
 SELECT toYear(LO_ORDERDATE) AS year, S_NATION, P_CATEGORY, sum(LO_REVENUE - LO_SUPPLYCOST) AS profit FROM lineorder_flat WHERE C_REGION = 'AMERICA' AND S_REGION = 'AMERICA' AND (year = 1997 OR year = 1998) AND (P_MFGR = 'MFGR#1' OR P_MFGR = 'MFGR#2') GROUP BY year, S_NATION, P_CATEGORY ORDER BY year, S_NATION, P_CATEGORY;
-
+```
 Q4.3
+```sql
 SELECT toYear(LO_ORDERDATE) AS year, S_CITY, P_BRAND, sum(LO_REVENUE - LO_SUPPLYCOST) AS profit FROM lineorder_flat WHERE S_NATION = 'UNITED STATES' AND (year = 1997 OR year = 1998) AND P_CATEGORY = 'MFGR#14' GROUP BY year, S_CITY, P_BRAND ORDER BY year, S_CITY, P_BRAND;
 ```
diff --git a/docs/zh/getting_started/example_datasets/wikistat.md b/docs/zh/getting_started/example_datasets/wikistat.md
index c306c644551..ee3b800f47b 100644
--- a/docs/zh/getting_started/example_datasets/wikistat.md
+++ b/docs/zh/getting_started/example_datasets/wikistat.md
@@ -4,7 +4,7 @@
 创建表结构:
 
-``` sql
+```sql
 CREATE TABLE wikistat
 (
     date Date,
@@ -20,9 +20,9 @@ CREATE TABLE wikistat
 加载数据:
 
 ```bash
-for i in {2007..2016}; do for j in {01..12}; do echo $i-$j >&2; curl -sSL "http://dumps.wikimedia.org/other/pagecounts-raw/$i/$i-$j/" | grep -oE 'pagecounts-[0-9]+-[0-9]+\.gz'; done; done | sort | uniq | tee links.txt
-cat links.txt | while read link; do wget http://dumps.wikimedia.org/other/pagecounts-raw/$(echo $link | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})[0-9]{2}-[0-9]+\.gz/\1/')/$(echo $link | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})[0-9]{2}-[0-9]+\.gz/\1-\2/')/$link; done
-ls -1 /opt/wikistat/ | grep gz | while read i; do echo $i; gzip -cd /opt/wikistat/$i | ./wikistat-loader --time="$(echo -n $i | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})([0-9]{2})-([0-9]{2})([0-9]{2})([0-9]{2})\.gz/\1-\2-\3 \4-00-00/')" | clickhouse-client --query="INSERT INTO wikistat FORMAT TabSeparated"; done
+$ for i in {2007..2016}; do for j in {01..12}; do echo $i-$j >&2; curl -sSL "http://dumps.wikimedia.org/other/pagecounts-raw/$i/$i-$j/" | grep -oE 'pagecounts-[0-9]+-[0-9]+\.gz'; done; done | sort | uniq | tee links.txt
+$ cat links.txt | while read link; do wget http://dumps.wikimedia.org/other/pagecounts-raw/$(echo $link | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})[0-9]{2}-[0-9]+\.gz/\1/')/$(echo $link | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})[0-9]{2}-[0-9]+\.gz/\1-\2/')/$link; done
+$ ls -1 /opt/wikistat/ | grep gz | while read i; do echo $i; gzip -cd /opt/wikistat/$i | ./wikistat-loader --time="$(echo -n $i | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})([0-9]{2})-([0-9]{2})([0-9]{2})([0-9]{2})\.gz/\1-\2-\3 \4-00-00/')" | clickhouse-client --query="INSERT INTO wikistat FORMAT TabSeparated"; done
 ```
diff --git a/docs/zh/getting_started/index.md b/docs/zh/getting_started/index.md
index 5e1a5777292..b1c94600da0 100644
--- a/docs/zh/getting_started/index.md
+++ b/docs/zh/getting_started/index.md
@@ -22,8 +22,8 @@ ClickHouse还可以在FreeBSD与Mac OS X上工作。同时它可以在不支持S
 在`/etc/apt/sources.list` (或创建`/etc/apt/sources.list.d/clickhouse.list`文件)中添加仓库:
 
 ```text
 deb http://repo.yandex.ru/clickhouse/deb/stable/ main/
 ```
 
 如果你想使用最新的测试版本,请使用'testing'替换'stable'。
 
 然后运行:
 
 ```bash
-sudo apt-get install dirmngr    # optional
-sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv E0C56BD4    # optional
-sudo apt-get update
-sudo apt-get install clickhouse-client clickhouse-server
+$ sudo apt-get install dirmngr    # optional
+$ sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv E0C56BD4    # optional
+$ sudo apt-get update
+$ sudo apt-get install clickhouse-client clickhouse-server
 ```
 
 你也可以从这里手动下载安装包:。
 
@@ -43,16 +43,16 @@ ClickHouse包含访问控制配置,它们位于`users.xml`文件中(与'config
 默认情况下,允许从任何地方使用默认的‘default’用户无密码的访问ClickHouse。参考‘user/default/networks’。
 有关更多信息,请参考"Configuration files"部分。
 
-###来自RPM包
+### 为CentOS/RedHat安装
 
 Yandex ClickHouse团队建议使用官方预编译的`rpm`软件包,用于CentOS,RedHat和所有其他基于rpm的Linux发行版。
 
 首先,您需要添加官方存储库:
 
 ```bash
-sudo yum install yum-utils
-sudo rpm --import https://repo.yandex.ru/clickhouse/CLICKHOUSE-KEY.GPG
-sudo yum-config-manager --add-repo https://repo.yandex.ru/clickhouse/rpm/stable/x86_64
+$ sudo yum install yum-utils
+$ sudo rpm --import https://repo.yandex.ru/clickhouse/CLICKHOUSE-KEY.GPG
+$ sudo yum-config-manager --add-repo https://repo.yandex.ru/clickhouse/rpm/stable/x86_64
 ```
 
 如果您想使用最新版本,请将`stable`替换为`testing`(建议您在测试环境中使用)。
 
@@ -60,12 +60,12 @@ sudo yum-config-manager --add-repo https://repo.yandex.ru/clickhouse/rpm/stable/
 然后运行这些命令以实际安装包:
 
 ```bash
-sudo yum install clickhouse-server clickhouse-client
+$ sudo yum install clickhouse-server clickhouse-client
 ```
 
 您也可以从此处手动下载和安装软件包:。
 
-###来自Docker
+### 使用Docker安装
 
 要在Docker中运行ClickHouse,请遵循[Docker Hub](https://hub.docker.com/r/yandex/clickhouse-server/)上的指南。这些镜像使用官方的`deb`包构建。
 
@@ -136,18 +136,14 @@ milovidov@hostname:~/work/metrica/src/dbms/src/Client$ ./clickhouse-client
 ClickHouse client version 0.0.18749.
 Connecting to localhost:9000.
 Connected to ClickHouse server version 0.0.18749.
-
+```
+```sql
 :) SELECT 1
-
-SELECT 1
-
+```
+```text
 ┌─1─┐
 │ 1 │
 └───┘
-
-1 rows in set. Elapsed: 0.003 sec.
-
-:)
 ```
 
 **恭喜,系统已经工作了!**
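
To try the new tool against the pages touched above — a minimal sketch, assuming a full git checkout of the repository and that the updated `docs/tools/requirements.txt` has been installed (it now pulls in `gitpython`, which `easy_diff.py` imports):

```bash
$ pip install -r docs/tools/requirements.txt                # installs gitpython for easy_diff.py
$ docs/tools/easy_diff.py --no-pager zh/getting_started     # diff the zh pages against their en originals
```

With `--no-pager` the report is written to stdout; without it, the script collects the report in a temporary file and opens it through `git var GIT_PAGER`, as in the implementation above.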