Update references in docs

Commit 5a4466cec7 (parent 261806e897)
@@ -243,7 +243,7 @@ List of tasks: https://github.com/ClickHouse/ClickHouse/issues?q=is%3Aopen+is%3A
 ## Test Data {#test-data}

-Developing ClickHouse often requires loading realistic datasets. It is particularly important for performance testing. We have a specially prepared set of anonymized data from Yandex.Metrica. It requires additionally some 3GB of free disk space. Note that this data is not required to accomplish most of the development tasks.
+Developing ClickHouse often requires loading realistic datasets. It is particularly important for performance testing. We have a specially prepared set of anonymized web analytics data. It additionally requires some 3GB of free disk space. Note that this data is not required to accomplish most of the development tasks.

 sudo apt install wget xz-utils
@@ -259,7 +259,7 @@ Developing ClickHouse often requires loading realistic datasets. It is particula
 CREATE TABLE test.hits ( WatchID UInt64, JavaEnable UInt8, Title String, GoodEvent Int16, EventTime DateTime, EventDate Date, CounterID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RegionID UInt32, UserID UInt64, CounterClass Int8, OS UInt8, UserAgent UInt8, URL String, Referer String, URLDomain String, RefererDomain String, Refresh UInt8, IsRobot UInt8, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), ResolutionWidth UInt16, ResolutionHeight UInt16, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, FlashMinor2 String, NetMajor UInt8, NetMinor UInt8, UserAgentMajor UInt16, UserAgentMinor FixedString(2), CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, MobilePhone UInt8, MobilePhoneModel String, Params String, IPNetworkID UInt32, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, IsArtifical UInt8, WindowClientWidth UInt16, WindowClientHeight UInt16, ClientTimeZone Int16, ClientEventTime DateTime, SilverlightVersion1 UInt8, SilverlightVersion2 UInt8, SilverlightVersion3 UInt32, SilverlightVersion4 UInt16, PageCharset String, CodeVersion UInt32, IsLink UInt8, IsDownload UInt8, IsNotBounce UInt8, FUniqID UInt64, HID UInt32, IsOldCounter UInt8, IsEvent UInt8, IsParameter UInt8, DontCountHits UInt8, WithHash UInt8, HitColor FixedString(1), UTCEventTime DateTime, Age UInt8, Sex UInt8, Income UInt8, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), RemoteIP UInt32, RemoteIP6 FixedString(16), WindowName Int32, OpenerName Int32, HistoryLength Int16, BrowserLanguage FixedString(2), BrowserCountry FixedString(2), SocialNetwork String, SocialAction String, HTTPError UInt16, SendTiming Int32, DNSTiming Int32, ConnectTiming Int32, ResponseStartTiming Int32, ResponseEndTiming Int32, FetchTiming Int32, RedirectTiming Int32, DOMInteractiveTiming Int32, DOMContentLoadedTiming Int32, DOMCompleteTiming Int32, LoadEventStartTiming Int32, LoadEventEndTiming Int32, NSToDOMContentLoadedTiming Int32, FirstPaintTiming Int32, RedirectCount Int8, SocialSourceNetworkID UInt8, SocialSourcePage String, ParamPrice Int64, ParamOrderID String, ParamCurrency FixedString(3), ParamCurrencyID UInt16, GoalsReached Array(UInt32), OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, RefererHash UInt64, URLHash UInt64, CLID UInt32, YCLID UInt64, ShareService String, ShareURL String, ShareTitle String, `ParsedParams.Key1` Array(String), `ParsedParams.Key2` Array(String), `ParsedParams.Key3` Array(String), `ParsedParams.Key4` Array(String), `ParsedParams.Key5` Array(String), `ParsedParams.ValueDouble` Array(Float64), IslandID FixedString(16), RequestNum UInt32, RequestTry UInt8) ENGINE = MergeTree PARTITION BY toYYYYMM(EventDate) SAMPLE BY intHash32(UserID) ORDER BY (CounterID, EventDate, intHash32(UserID), EventTime);

-CREATE TABLE test.visits ( CounterID UInt32, StartDate Date, Sign Int8, IsNew UInt8, VisitID UInt64, UserID UInt64, StartTime DateTime, Duration UInt32, UTCStartTime DateTime, PageViews Int32, Hits Int32, IsBounce UInt8, Referer String, StartURL String, RefererDomain String, StartURLDomain String, EndURL String, LinkURL String, IsDownload UInt8, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, PlaceID Int32, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), IsYandex UInt8, GoalReachesDepth Int32, GoalReachesURL Int32, GoalReachesAny Int32, SocialSourceNetworkID UInt8, SocialSourcePage String, MobilePhoneModel String, ClientEventTime DateTime, RegionID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RemoteIP UInt32, RemoteIP6 FixedString(16), IPNetworkID UInt32, SilverlightVersion3 UInt32, CodeVersion UInt32, ResolutionWidth UInt16, ResolutionHeight UInt16, UserAgentMajor UInt16, UserAgentMinor UInt16, WindowClientWidth UInt16, WindowClientHeight UInt16, SilverlightVersion2 UInt8, SilverlightVersion4 UInt16, FlashVersion3 UInt16, FlashVersion4 UInt16, ClientTimeZone Int16, OS UInt8, UserAgent UInt8, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, NetMajor UInt8, NetMinor UInt8, MobilePhone UInt8, SilverlightVersion1 UInt8, Age UInt8, Sex UInt8, Income UInt8, JavaEnable UInt8, CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, BrowserLanguage UInt16, BrowserCountry UInt16, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), Params Array(String), `Goals.ID` Array(UInt32), `Goals.Serial` Array(UInt32), `Goals.EventTime` Array(DateTime), `Goals.Price` Array(Int64), `Goals.OrderID` Array(String), `Goals.CurrencyID` Array(UInt32), WatchIDs Array(UInt64), ParamSumPrice Int64, ParamCurrency FixedString(3), ParamCurrencyID UInt16, ClickLogID UInt64, ClickEventID Int32, ClickGoodEvent Int32, ClickEventTime DateTime, ClickPriorityID Int32, ClickPhraseID Int32, ClickPageID Int32, ClickPlaceID Int32, ClickTypeID Int32, ClickResourceID Int32, ClickCost UInt32, ClickClientIP UInt32, ClickDomainID UInt32, ClickURL String, ClickAttempt UInt8, ClickOrderID UInt32, ClickBannerID UInt32, ClickMarketCategoryID UInt32, ClickMarketPP UInt32, ClickMarketCategoryName String, ClickMarketPPName String, ClickAWAPSCampaignName String, ClickPageName String, ClickTargetType UInt16, ClickTargetPhraseID UInt64, ClickContextType UInt8, ClickSelectType Int8, ClickOptions String, ClickGroupBannerID Int32, OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, FirstVisit DateTime, PredLastVisit Date, LastVisit Date, TotalVisits UInt32, `TraficSource.ID` Array(Int8), `TraficSource.SearchEngineID` Array(UInt16), `TraficSource.AdvEngineID` Array(UInt8), `TraficSource.PlaceID` Array(UInt16), `TraficSource.SocialSourceNetworkID` Array(UInt8), `TraficSource.Domain` Array(String), `TraficSource.SearchPhrase` Array(String), `TraficSource.SocialSourcePage` Array(String), Attendance FixedString(16), CLID UInt32, YCLID UInt64, NormalizedRefererHash UInt64, SearchPhraseHash UInt64, RefererDomainHash UInt64, NormalizedStartURLHash UInt64, StartURLDomainHash UInt64, NormalizedEndURLHash UInt64, TopLevelDomain UInt64, URLScheme UInt64, OpenstatServiceNameHash UInt64, OpenstatCampaignIDHash UInt64, OpenstatAdIDHash UInt64, OpenstatSourceIDHash UInt64, UTMSourceHash UInt64, UTMMediumHash UInt64, UTMCampaignHash UInt64, UTMContentHash UInt64, UTMTermHash UInt64, FromHash UInt64, WebVisorEnabled UInt8, WebVisorActivity UInt32, `ParsedParams.Key1` Array(String), `ParsedParams.Key2` Array(String), `ParsedParams.Key3` Array(String), `ParsedParams.Key4` Array(String), `ParsedParams.Key5` Array(String), `ParsedParams.ValueDouble` Array(Float64), `Market.Type` Array(UInt8), `Market.GoalID` Array(UInt32), `Market.OrderID` Array(String), `Market.OrderPrice` Array(Int64), `Market.PP` Array(UInt32), `Market.DirectPlaceID` Array(UInt32), `Market.DirectOrderID` Array(UInt32), `Market.DirectBannerID` Array(UInt32), `Market.GoodID` Array(String), `Market.GoodName` Array(String), `Market.GoodQuantity` Array(Int32), `Market.GoodPrice` Array(Int64), IslandID FixedString(16)) ENGINE = CollapsingMergeTree(Sign) PARTITION BY toYYYYMM(StartDate) SAMPLE BY intHash32(UserID) ORDER BY (CounterID, StartDate, intHash32(UserID), VisitID);
+CREATE TABLE test.visits ( CounterID UInt32, StartDate Date, Sign Int8, IsNew UInt8, VisitID UInt64, UserID UInt64, StartTime DateTime, Duration UInt32, UTCStartTime DateTime, PageViews Int32, Hits Int32, IsBounce UInt8, Referer String, StartURL String, RefererDomain String, StartURLDomain String, EndURL String, LinkURL String, IsDownload UInt8, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, PlaceID Int32, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), IsClickHouse UInt8, GoalReachesDepth Int32, GoalReachesURL Int32, GoalReachesAny Int32, SocialSourceNetworkID UInt8, SocialSourcePage String, MobilePhoneModel String, ClientEventTime DateTime, RegionID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RemoteIP UInt32, RemoteIP6 FixedString(16), IPNetworkID UInt32, SilverlightVersion3 UInt32, CodeVersion UInt32, ResolutionWidth UInt16, ResolutionHeight UInt16, UserAgentMajor UInt16, UserAgentMinor UInt16, WindowClientWidth UInt16, WindowClientHeight UInt16, SilverlightVersion2 UInt8, SilverlightVersion4 UInt16, FlashVersion3 UInt16, FlashVersion4 UInt16, ClientTimeZone Int16, OS UInt8, UserAgent UInt8, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, NetMajor UInt8, NetMinor UInt8, MobilePhone UInt8, SilverlightVersion1 UInt8, Age UInt8, Sex UInt8, Income UInt8, JavaEnable UInt8, CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, BrowserLanguage UInt16, BrowserCountry UInt16, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), Params Array(String), `Goals.ID` Array(UInt32), `Goals.Serial` Array(UInt32), `Goals.EventTime` Array(DateTime), `Goals.Price` Array(Int64), `Goals.OrderID` Array(String), `Goals.CurrencyID` Array(UInt32), WatchIDs Array(UInt64), ParamSumPrice Int64, ParamCurrency FixedString(3), ParamCurrencyID UInt16, ClickLogID UInt64, ClickEventID Int32, ClickGoodEvent Int32, ClickEventTime DateTime, ClickPriorityID Int32, ClickPhraseID Int32, ClickPageID Int32, ClickPlaceID Int32, ClickTypeID Int32, ClickResourceID Int32, ClickCost UInt32, ClickClientIP UInt32, ClickDomainID UInt32, ClickURL String, ClickAttempt UInt8, ClickOrderID UInt32, ClickBannerID UInt32, ClickMarketCategoryID UInt32, ClickMarketPP UInt32, ClickMarketCategoryName String, ClickMarketPPName String, ClickAWAPSCampaignName String, ClickPageName String, ClickTargetType UInt16, ClickTargetPhraseID UInt64, ClickContextType UInt8, ClickSelectType Int8, ClickOptions String, ClickGroupBannerID Int32, OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, FirstVisit DateTime, PredLastVisit Date, LastVisit Date, TotalVisits UInt32, `TraficSource.ID` Array(Int8), `TraficSource.SearchEngineID` Array(UInt16), `TraficSource.AdvEngineID` Array(UInt8), `TraficSource.PlaceID` Array(UInt16), `TraficSource.SocialSourceNetworkID` Array(UInt8), `TraficSource.Domain` Array(String), `TraficSource.SearchPhrase` Array(String), `TraficSource.SocialSourcePage` Array(String), Attendance FixedString(16), CLID UInt32, YCLID UInt64, NormalizedRefererHash UInt64, SearchPhraseHash UInt64, RefererDomainHash UInt64, NormalizedStartURLHash UInt64, StartURLDomainHash UInt64, NormalizedEndURLHash UInt64, TopLevelDomain UInt64, URLScheme UInt64, OpenstatServiceNameHash UInt64, OpenstatCampaignIDHash UInt64, OpenstatAdIDHash UInt64, OpenstatSourceIDHash UInt64, UTMSourceHash UInt64, UTMMediumHash UInt64, UTMCampaignHash UInt64, UTMContentHash UInt64, UTMTermHash UInt64, FromHash UInt64, WebVisorEnabled UInt8, WebVisorActivity UInt32, `ParsedParams.Key1` Array(String), `ParsedParams.Key2` Array(String), `ParsedParams.Key3` Array(String), `ParsedParams.Key4` Array(String), `ParsedParams.Key5` Array(String), `ParsedParams.ValueDouble` Array(Float64), `Market.Type` Array(UInt8), `Market.GoalID` Array(UInt32), `Market.OrderID` Array(String), `Market.OrderPrice` Array(Int64), `Market.PP` Array(UInt32), `Market.DirectPlaceID` Array(UInt32), `Market.DirectOrderID` Array(UInt32), `Market.DirectBannerID` Array(UInt32), `Market.GoodID` Array(String), `Market.GoodName` Array(String), `Market.GoodQuantity` Array(Int32), `Market.GoodPrice` Array(Int64), IslandID FixedString(16)) ENGINE = CollapsingMergeTree(Sign) PARTITION BY toYYYYMM(StartDate) SAMPLE BY intHash32(UserID) ORDER BY (CounterID, StartDate, intHash32(UserID), VisitID);

 clickhouse-client --max_insert_block_size 100000 --query "INSERT INTO test.hits FORMAT TSV" < hits_v1.tsv
 clickhouse-client --max_insert_block_size 100000 --query "INSERT INTO test.visits FORMAT TSV" < visits_v1.tsv
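After the two inserts above finish, a quick sanity check confirms the import. A minimal sketch, assuming the `test.hits` and `test.visits` tables created above (the exact row counts depend on the dataset version):

``` sql
-- Non-zero counts indicate the TSV imports succeeded.
SELECT count() FROM test.hits;
SELECT count() FROM test.visits;
```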
@@ -270,7 +270,7 @@ Navigate to your fork repository in GitHub’s UI. If you have been developing i
 A pull request can be created even if the work is not completed yet. In this case please put the word “WIP” (work in progress) at the beginning of the title, it can be changed later. This is useful for cooperative reviewing and discussion of changes as well as for running all of the available tests. It is important that you provide a brief description of your changes, it will later be used for generating release changelogs.

-Testing will commence as soon as Yandex employees label your PR with a tag “can be tested”. The results of some first checks (e.g. code style) will come in within several minutes. Build check results will arrive within half an hour. And the main set of tests will report itself within an hour.
+Testing will commence as soon as ClickHouse employees label your PR with a tag “can be tested”. The results of some first checks (e.g. code style) will come in within several minutes. Build check results will arrive within half an hour. And the main set of tests will report itself within an hour.

 The system will prepare ClickHouse binary builds for your pull request individually. To retrieve these builds click the “Details” link next to “ClickHouse build check” entry in the list of checks. There you will find direct links to the built .deb packages of ClickHouse which you can deploy even on your production servers (if you have no fear).
@@ -11,7 +11,7 @@ Functional tests are the most simple and convenient to use. Most of ClickHouse f
 Each functional test sends one or multiple queries to the running ClickHouse server and compares the result with reference.

-Tests are located in `queries` directory. There are two subdirectories: `stateless` and `stateful`. Stateless tests run queries without any preloaded test data - they often create small synthetic datasets on the fly, within the test itself. Stateful tests require preloaded test data from Yandex.Metrica and it is available to general public.
+Tests are located in `queries` directory. There are two subdirectories: `stateless` and `stateful`. Stateless tests run queries without any preloaded test data - they often create small synthetic datasets on the fly, within the test itself. Stateful tests require preloaded web analytics test data, and it is available to the general public.

 Each test can be one of two types: `.sql` and `.sh`. `.sql` test is the simple SQL script that is piped to `clickhouse-client --multiquery --testmode`. `.sh` test is a script that is run by itself. SQL tests are generally preferable to `.sh` tests. You should use `.sh` tests only when you have to test some feature that cannot be exercised from pure SQL, such as piping some input data into `clickhouse-client` or testing `clickhouse-local`.
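For illustration, a stateless `.sql` test might look like the sketch below (the table name is hypothetical; real tests live under the `stateless` subdirectory and their output is compared against a stored reference file):

``` sql
-- A tiny synthetic dataset built on the fly, then queried;
-- the query output is what gets compared to the reference.
CREATE TABLE t (x UInt8) ENGINE = Memory;
INSERT INTO t VALUES (1), (2), (3);
SELECT sum(x) FROM t;
DROP TABLE t;
```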
@@ -135,13 +135,12 @@ If the system clickhouse-server is already running and you do not want to stop i
 ## Testing Environment {#testing-environment}

-Before publishing release as stable we deploy it on testing environment. Testing environment is a cluster that process 1/39 part of [Yandex.Metrica](https://metrica.yandex.com/) data. We share our testing environment with Yandex.Metrica team. ClickHouse is upgraded without downtime on top of existing data. We look at first that data is processed successfully without lagging from realtime, the replication continue to work and there is no issues visible to Yandex.Metrica team. First check can be done in the following way:
+Before publishing a release as stable, we deploy it to a testing environment. The testing environment is a cluster that processes 1/39 of the ClickHouse web analytics data. ClickHouse is upgraded without downtime on top of existing data. We first check that data is processed successfully without lagging behind realtime, that replication continues to work, and that no issues are visible to the ClickHouse team. The first check can be done in the following way:

 ``` sql
 SELECT hostName() AS h, any(version()), any(uptime()), max(UTCEventTime), count() FROM remote('example01-01-{1..3}t', merge, hits) WHERE EventDate >= today() - 2 GROUP BY h ORDER BY h;
 ```

-In some cases we also deploy to testing environment of our friend teams in Yandex: Market, Cloud, etc. Also we have some hardware servers that are used for development purposes.

 ## Load Testing {#load-testing}
@@ -155,7 +154,7 @@ Collect query log for a day or more:
 $ clickhouse-client --query="SELECT DISTINCT query FROM system.query_log WHERE event_date = today() AND query LIKE '%ym:%' AND query NOT LIKE '%system.query_log%' AND type = 2 AND is_initial_query" > queries.tsv
 ```

-This is a way complicated example. `type = 2` will filter queries that are executed successfully. `query LIKE '%ym:%'` is to select relevant queries from Yandex.Metrica. `is_initial_query` is to select only queries that are initiated by client, not by ClickHouse itself (as parts of distributed query processing).
+This is a somewhat complicated example. `type = 2` filters queries that are executed successfully. `query LIKE '%ym:%'` selects the relevant queries from the web analytics data. `is_initial_query` selects only queries that are initiated by the client, not by ClickHouse itself (as parts of distributed query processing).

 `scp` this log to your testing cluster and run it as following:
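Before replaying, the same filter can be run interactively to gauge how many queries a day produced. A sketch (the `%ym:%` pattern is the one from the command above and may not match your workload):

``` sql
-- Count the distinct successful, client-initiated queries that the
-- export command above would collect for replay.
SELECT count(DISTINCT query)
FROM system.query_log
WHERE event_date = today()
  AND query LIKE '%ym:%'
  AND query NOT LIKE '%system.query_log%'
  AND type = 2
  AND is_initial_query;
```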
@@ -259,13 +258,13 @@ Thread Fuzzer (please don't mix up with Thread Sanitizer) is another kind of fuz
 ## Security Audit

-People from Yandex Security Team did some basic overview of ClickHouse capabilities from the security standpoint.
+Our Security Team did a basic overview of ClickHouse capabilities from the security standpoint.

 ## Static Analyzers {#static-analyzers}

 We run `clang-tidy` on per-commit basis. `clang-static-analyzer` checks are also enabled. `clang-tidy` is also used for some style checks.

-We have evaluated `clang-tidy`, `Coverity`, `cppcheck`, `PVS-Studio`, `tscancode`, `CodeQL`. You will find instructions for usage in `tests/instructions/` directory. Also you can read [the article in russian](https://habr.com/company/yandex/blog/342018/).
+We have evaluated `clang-tidy`, `Coverity`, `cppcheck`, `PVS-Studio`, `tscancode`, `CodeQL`. You will find instructions for usage in the `tests/instructions/` directory.

 If you use `CLion` as an IDE, you can leverage some `clang-tidy` checks out of the box.
@@ -312,7 +311,7 @@ We also use `codespell` to find typos in code. It is automated as well.
 ## Metrica B2B Tests {#metrica-b2b-tests}

-Each ClickHouse release is tested with Yandex Metrica and AppMetrica engines. Testing and stable versions of ClickHouse are deployed on VMs and run with a small copy of Metrica engine that is processing fixed sample of input data. Then results of two instances of Metrica engine are compared together.
+Each ClickHouse release is tested with AppMetrica engines. Testing and stable versions of ClickHouse are deployed on VMs and run with a small copy of the Metrica engine that processes a fixed sample of input data. Then the results of the two instances of the Metrica engine are compared.

 These tests are automated by a separate team. Due to the high number of moving parts, the tests fail most of the time for completely unrelated reasons that are very difficult to figure out. Most likely these tests have negative value for us. Nevertheless, they have proved useful in about one or two cases out of hundreds.
@@ -26,7 +26,7 @@ CREATE TABLE s3_engine_table (name String, value UInt32)
 ``` sql
 CREATE TABLE s3_engine_table (name String, value UInt32)
-ENGINE=S3('https://storage.yandexcloud.net/my-test-bucket-768/test-data.csv.gz', 'CSV', 'gzip')
+ENGINE=S3('https://hostname/my-test-bucket-768/test-data.csv.gz', 'CSV', 'gzip')
 SETTINGS input_format_with_names_use_header = 0;

 INSERT INTO s3_engine_table VALUES ('one', 1), ('two', 2), ('three', 3);
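Reading data back from the S3-backed table works like any other table; a short sketch using the table defined in the hunk above:

``` sql
-- Should return two of the three rows inserted above.
SELECT name, value FROM s3_engine_table LIMIT 2;
```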
@@ -75,19 +75,19 @@ Create table with files named `file-000.csv`, `file-001.csv`, … , `file-999.cs
 ``` sql
 CREATE TABLE big_table (name String, value UInt32)
-ENGINE = S3('https://storage.yandexcloud.net/my-bucket/my_folder/file-{000..999}.csv', 'CSV');
+ENGINE = S3('https://hostname/my-bucket/my_folder/file-{000..999}.csv', 'CSV');
 ```

 **Example with wildcards 2**

 Suppose we have several files in CSV format with the following URIs on S3:

-- 'https://storage.yandexcloud.net/my-bucket/some_folder/some_file_1.csv'
-- 'https://storage.yandexcloud.net/my-bucket/some_folder/some_file_2.csv'
-- 'https://storage.yandexcloud.net/my-bucket/some_folder/some_file_3.csv'
-- 'https://storage.yandexcloud.net/my-bucket/another_folder/some_file_1.csv'
-- 'https://storage.yandexcloud.net/my-bucket/another_folder/some_file_2.csv'
-- 'https://storage.yandexcloud.net/my-bucket/another_folder/some_file_3.csv'
+- 'https://hostname/my-bucket/some_folder/some_file_1.csv'
+- 'https://hostname/my-bucket/some_folder/some_file_2.csv'
+- 'https://hostname/my-bucket/some_folder/some_file_3.csv'
+- 'https://hostname/my-bucket/another_folder/some_file_1.csv'
+- 'https://hostname/my-bucket/another_folder/some_file_2.csv'
+- 'https://hostname/my-bucket/another_folder/some_file_3.csv'

 There are several ways to make a table consisting of all six files:
@@ -96,21 +96,21 @@ There are several ways to make a table consisting of all six files:
 ``` sql
 CREATE TABLE table_with_range (name String, value UInt32)
-ENGINE = S3('https://storage.yandexcloud.net/my-bucket/{some,another}_folder/some_file_{1..3}', 'CSV');
+ENGINE = S3('https://hostname/my-bucket/{some,another}_folder/some_file_{1..3}', 'CSV');
 ```

 2. Take all files with `some_file_` prefix (there should be no extra files with such prefix in both folders):

 ``` sql
 CREATE TABLE table_with_question_mark (name String, value UInt32)
-ENGINE = S3('https://storage.yandexcloud.net/my-bucket/{some,another}_folder/some_file_?', 'CSV');
+ENGINE = S3('https://hostname/my-bucket/{some,another}_folder/some_file_?', 'CSV');
 ```

 3. Take all the files in both folders (all files should satisfy format and schema described in query):

 ``` sql
 CREATE TABLE table_with_asterisk (name String, value UInt32)
-ENGINE = S3('https://storage.yandexcloud.net/my-bucket/{some,another}_folder/*', 'CSV');
+ENGINE = S3('https://hostname/my-bucket/{some,another}_folder/*', 'CSV');
 ```

 ## S3-related Settings {#settings}
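All three wildcard spellings in the hunk above should resolve to the same six files; a quick equivalence check (a sketch, assuming the files exist and match the declared schema):

``` sql
-- The three counts should be identical, since each table
-- matches the same six CSV files.
SELECT count() FROM table_with_range;
SELECT count() FROM table_with_question_mark;
SELECT count() FROM table_with_asterisk;
```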
@@ -142,7 +142,7 @@ The following settings can be specified in configuration file for given endpoint
 ``` xml
 <s3>
     <endpoint-name>
-        <endpoint>https://storage.yandexcloud.net/my-test-bucket-768/</endpoint>
+        <endpoint>https://hostname/my-test-bucket-768/</endpoint>
         <!-- <access_key_id>ACCESS_KEY_ID</access_key_id> -->
         <!-- <secret_access_key>SECRET_ACCESS_KEY</secret_access_key> -->
         <!-- <region>us-west-1</region> -->
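A table whose URL falls under the configured `<endpoint>` prefix can then rely on those per-endpoint settings instead of inline credentials. A sketch under that assumption (the bucket URL mirrors the hypothetical `<endpoint>` above):

``` sql
-- Credentials and other options come from the matching
-- <endpoint> entry in the server configuration.
CREATE TABLE s3_with_endpoint_config (name String, value UInt32)
ENGINE = S3('https://hostname/my-test-bucket-768/data.csv', 'CSV');
```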
@@ -802,7 +802,7 @@ Configuration markup:
 <disks>
     <s3>
         <type>s3</type>
-        <endpoint>https://storage.yandexcloud.net/my-bucket/root-path/</endpoint>
+        <endpoint>https://hostname/my-bucket/root-path/</endpoint>
         <access_key_id>your_access_key_id</access_key_id>
         <secret_access_key>your_secret_access_key</secret_access_key>
         <region></region>
@@ -856,7 +856,7 @@ S3 disk can be configured as `main` or `cold` storage:
 <disks>
     <s3>
         <type>s3</type>
-        <endpoint>https://storage.yandexcloud.net/my-bucket/root-path/</endpoint>
+        <endpoint>https://hostname/my-bucket/root-path/</endpoint>
         <access_key_id>your_access_key_id</access_key_id>
         <secret_access_key>your_secret_access_key</secret_access_key>
     </s3>
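Once the S3 disk participates in a storage policy, a table opts in through its settings. A sketch (the policy name `s3_main` is hypothetical and must match a `<policies>` entry defined alongside the disks):

``` sql
-- Parts of this table are written to the S3-backed disk selected
-- by the hypothetical 's3_main' storage policy.
CREATE TABLE s3_backed_table (id UInt64, payload String)
ENGINE = MergeTree
ORDER BY id
SETTINGS storage_policy = 's3_main';
```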
@@ -97,7 +97,7 @@ ZooKeeper is not used in `SELECT` queries because replication does not affect th
 For each `INSERT` query, approximately ten entries are added to ZooKeeper through several transactions. (To be more precise, this is for each inserted block of data; an INSERT query contains one block or one block per `max_insert_block_size = 1048576` rows.) This leads to slightly longer latencies for `INSERT` compared to non-replicated tables. But if you follow the recommendations to insert data in batches of no more than one `INSERT` per second, it does not create any problems. One ZooKeeper cluster used for coordinating an entire ClickHouse cluster handles a total of several hundred `INSERTs` per second. The throughput on data inserts (the number of rows per second) is just as high as for non-replicated data.

-For very large clusters, you can use different ZooKeeper clusters for different shards. However, this hasn’t proven necessary on the Yandex.Metrica cluster (approximately 300 servers).
+For very large clusters, you can use different ZooKeeper clusters for different shards. However, in our experience this has not proven necessary even on production clusters of approximately 300 servers.

 Replication is asynchronous and multi-master. `INSERT` queries (as well as `ALTER`) can be sent to any available server. Data is inserted on the server where the query is run, and then it is copied to the other servers. Because it is asynchronous, recently inserted data appears on the other replicas with some latency. If part of the replicas are not available, the data is written when they become available. If a replica is available, the latency is the amount of time it takes to transfer the block of compressed data over the network. The number of threads performing background tasks for replicated tables can be set by [background_schedule_pool_size](../../../operations/settings/settings.md#background_schedule_pool_size) setting.
@@ -111,7 +111,7 @@ Data blocks are deduplicated. For multiple writes of the same data block (data b
 During replication, only the source data to insert is transferred over the network. Further data transformation (merging) is coordinated and performed on all the replicas in the same way. This minimizes network usage, which means that replication works well when replicas reside in different datacenters. (Note that duplicating data in different datacenters is the main goal of replication.)

-You can have any number of replicas of the same data. Yandex.Metrica uses double replication in production. Each server uses RAID-5 or RAID-6, and RAID-10 in some cases. This is a relatively reliable and convenient solution.
+You can have any number of replicas of the same data. In our experience, a relatively reliable and convenient production setup is double replication, with each server using RAID-5 or RAID-6 (and RAID-10 in some cases).

 The system monitors data synchronicity on replicas and is able to recover after a failure. Failover is automatic (for small differences in data) or semi-automatic (when data differs too much, which may indicate a configuration error).
@@ -163,7 +163,7 @@ Example:
 <macros>
     <layer>05</layer>
     <shard>02</shard>
-    <replica>example05-02-1.yandex.ru</replica>
+    <replica>example05-02-1</replica>
 </macros>
 ```
@@ -172,7 +172,7 @@ In this case, the path consists of the following parts:
 `/clickhouse/tables/` is the common prefix. We recommend using exactly this one.

-`{layer}-{shard}` is the shard identifier. In this example it consists of two parts, since the Yandex.Metrica cluster uses bi-level sharding. For most tasks, you can leave just the {shard} substitution, which will be expanded to the shard identifier.
+`{layer}-{shard}` is the shard identifier. In this example it consists of two parts, since the example cluster uses bi-level sharding. For most tasks, you can leave just the {shard} substitution, which will be expanded to the shard identifier.

 `table_name` is the name of the node for the table in ZooKeeper. It is a good idea to make it the same as the table name. It is defined explicitly, because in contrast to the table name, it does not change after a RENAME query.
 *HINT*: you could add a database name in front of `table_name` as well. E.g. `db_name.table_name`
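Putting the macros and path parts together, a replicated table definition might look like the sketch below (columns are illustrative; `{layer}`, `{shard}`, and `{replica}` expand from the macros section shown earlier):

``` sql
-- The ZooKeeper path and replica name come from macros, so the
-- same DDL can be executed unchanged on every replica.
CREATE TABLE table_name (EventDate Date, CounterID UInt32)
ENGINE = ReplicatedMergeTree('/clickhouse/tables/{layer}-{shard}/table_name', '{replica}')
PARTITION BY toYYYYMM(EventDate)
ORDER BY (CounterID, EventDate);
```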
@@ -6,7 +6,7 @@ toc_priority: 110
 # Why Not Use Something Like MapReduce? {#why-not-use-something-like-mapreduce}

-We can refer to systems like MapReduce as distributed computing systems in which the reduce operation is based on distributed sorting. The most common open-source solution in this class is [Apache Hadoop](http://hadoop.apache.org). Large IT companies like Google or Yandex often have proprietary in-house solutions.
+We can refer to systems like MapReduce as distributed computing systems in which the reduce operation is based on distributed sorting. The most common open-source solution in this class is [Apache Hadoop](http://hadoop.apache.org). Large IT companies often have proprietary in-house solutions.

 These systems aren’t appropriate for online queries due to their high latency. In other words, they can’t be used as the back-end for a web interface. These types of systems aren’t useful for real-time data updates. Distributed sorting isn’t the best way to perform reduce operations if the result of the operation and all the intermediate results (if there are any) are located in the RAM of a single server, which is usually the case for online queries. In such a case, a hash table is an optimal way to perform reduce operations. A common approach to optimizing map-reduce tasks is pre-aggregation (partial reduce) using a hash table in RAM. The user performs this optimization manually. Distributed sorting is one of the main causes of reduced performance when running simple map-reduce tasks.
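The in-RAM hash-table reduce described above is what an ordinary `GROUP BY` already performs in ClickHouse; a toy sketch of the equivalent of a word-count map-reduce job, on synthetic data:

``` sql
-- The expression list plays the role of "map"; the hash
-- aggregation behind GROUP BY plays the role of "reduce".
SELECT arrayJoin(splitByChar(' ', 'a b a c b a')) AS word, count() AS c
GROUP BY word
ORDER BY c DESC;
```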
@@ -9,7 +9,7 @@ toc_priority: 11
 This question usually arises when people see official ClickHouse t-shirts. They have large words **“ClickHouse не тормозит”** on the front.

-Before ClickHouse became open-source, it has been developed as an in-house storage system by the largest Russian IT company, [Yandex](https://yandex.com/company/). That’s why it initially got its slogan in Russian, which is “не тормозит” (pronounced as “ne tormozit”). After the open-source release we first produced some of those t-shirts for events in Russia and it was a no-brainer to use the slogan as-is.
+Before ClickHouse became open-source, it was developed as an in-house storage system by the largest Russian IT company, Yandex. That’s why it initially got its slogan in Russian, which is “не тормозит” (pronounced as “ne tormozit”). After the open-source release we first produced some of those t-shirts for events in Russia and it was a no-brainer to use the slogan as-is.

 One of the following batches of those t-shirts was supposed to be given away on events outside of Russia and we tried to make the English version of the slogan. Unfortunately, the Russian language is kind of elegant in terms of expressing stuff and there was a restriction of limited space on a t-shirt, so we failed to come up with good enough translation (most options appeared to be either long or inaccurate) and decided to keep the slogan in Russian even on t-shirts produced for international events. It appeared to be a great decision because people all over the world get positively surprised and curious when they see it.
@@ -11,7 +11,7 @@ This section describes how to obtain example datasets and import them into Click
 The list of documented datasets:

 - [GitHub Events](../../getting-started/example-datasets/github-events.md)
-- [Anonymized Yandex.Metrica Dataset](../../getting-started/example-datasets/metrica.md)
+- [Anonymized Web Analytics Dataset](../../getting-started/example-datasets/metrica.md)
 - [Recipes](../../getting-started/example-datasets/recipes.md)
 - [Star Schema Benchmark](../../getting-started/example-datasets/star-schema.md)
 - [WikiStat](../../getting-started/example-datasets/wikistat.md)
@@ -1,11 +1,11 @@
 ---
 toc_priority: 15
-toc_title: Yandex.Metrica Data
+toc_title: Web Analytics Data
 ---

-# Anonymized Yandex.Metrica Data {#anonymized-yandex-metrica-data}
+# Anonymized Web Analytics Data {#anonymized-web-analytics-data}

-Dataset consists of two tables containing anonymized data about hits (`hits_v1`) and visits (`visits_v1`) of Yandex.Metrica. You can read more about Yandex.Metrica in [ClickHouse history](../../introduction/history.md) section.
+The dataset consists of two tables containing anonymized web analytics data with hits (`hits_v1`) and visits (`visits_v1`).

 The dataset consists of two tables, each of which can be downloaded as a compressed `tsv.xz` file or as prepared partitions. In addition to that, an extended version of the `hits` table containing 100 million rows is available as TSV at https://datasets.clickhouse.com/hits/tsv/hits_100m_obfuscated_v1.tsv.xz and as prepared partitions at https://datasets.clickhouse.com/hits/partitions/hits_100m_obfuscated_v1.tar.xz.
@@ -63,7 +63,7 @@ md5sum visits_v1.tsv
 # Checksum should be equal to: 6dafe1a0f24e59e3fc2d0fed85601de6
 # now create table
 clickhouse-client --query "CREATE DATABASE IF NOT EXISTS datasets"
-clickhouse-client --query "CREATE TABLE datasets.visits_v1 ( CounterID UInt32, StartDate Date, Sign Int8, IsNew UInt8, VisitID UInt64, UserID UInt64, StartTime DateTime, Duration UInt32, UTCStartTime DateTime, PageViews Int32, Hits Int32, IsBounce UInt8, Referer String, StartURL String, RefererDomain String, StartURLDomain String, EndURL String, LinkURL String, IsDownload UInt8, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, PlaceID Int32, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), IsYandex UInt8, GoalReachesDepth Int32, GoalReachesURL Int32, GoalReachesAny Int32, SocialSourceNetworkID UInt8, SocialSourcePage String, MobilePhoneModel String, ClientEventTime DateTime, RegionID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RemoteIP UInt32, RemoteIP6 FixedString(16), IPNetworkID UInt32, SilverlightVersion3 UInt32, CodeVersion UInt32, ResolutionWidth UInt16, ResolutionHeight UInt16, UserAgentMajor UInt16, UserAgentMinor UInt16, WindowClientWidth UInt16, WindowClientHeight UInt16, SilverlightVersion2 UInt8, SilverlightVersion4 UInt16, FlashVersion3 UInt16, FlashVersion4 UInt16, ClientTimeZone Int16, OS UInt8, UserAgent UInt8, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, NetMajor UInt8, NetMinor UInt8, MobilePhone UInt8, SilverlightVersion1 UInt8, Age UInt8, Sex UInt8, Income UInt8, JavaEnable UInt8, CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, BrowserLanguage UInt16, BrowserCountry UInt16, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), Params Array(String), Goals Nested(ID UInt32, Serial UInt32, EventTime DateTime, Price Int64, OrderID String, CurrencyID UInt32), WatchIDs Array(UInt64), ParamSumPrice Int64, ParamCurrency FixedString(3), ParamCurrencyID UInt16, ClickLogID UInt64, ClickEventID Int32, ClickGoodEvent Int32, ClickEventTime DateTime, ClickPriorityID Int32, ClickPhraseID Int32, ClickPageID Int32, ClickPlaceID Int32, ClickTypeID Int32, ClickResourceID Int32, ClickCost UInt32, ClickClientIP UInt32, ClickDomainID UInt32, ClickURL String, ClickAttempt UInt8, ClickOrderID UInt32, ClickBannerID UInt32, ClickMarketCategoryID UInt32, ClickMarketPP UInt32, ClickMarketCategoryName String, ClickMarketPPName String, ClickAWAPSCampaignName String, ClickPageName String, ClickTargetType UInt16, ClickTargetPhraseID UInt64, ClickContextType UInt8, ClickSelectType Int8, ClickOptions String, ClickGroupBannerID Int32, OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, FirstVisit DateTime, PredLastVisit Date, LastVisit Date, TotalVisits UInt32, TraficSource Nested(ID Int8, SearchEngineID UInt16, AdvEngineID UInt8, PlaceID UInt16, SocialSourceNetworkID UInt8, Domain String, SearchPhrase String, SocialSourcePage String), Attendance FixedString(16), CLID UInt32, YCLID UInt64, NormalizedRefererHash UInt64, SearchPhraseHash UInt64, RefererDomainHash UInt64, NormalizedStartURLHash UInt64, StartURLDomainHash UInt64, NormalizedEndURLHash UInt64, TopLevelDomain UInt64, URLScheme UInt64, OpenstatServiceNameHash UInt64, OpenstatCampaignIDHash UInt64, OpenstatAdIDHash UInt64, OpenstatSourceIDHash UInt64, UTMSourceHash UInt64, UTMMediumHash UInt64, UTMCampaignHash UInt64, UTMContentHash UInt64, UTMTermHash UInt64, FromHash UInt64, WebVisorEnabled UInt8, WebVisorActivity UInt32, ParsedParams Nested(Key1 String, Key2 String, Key3 String, Key4 String, Key5 String, ValueDouble Float64), Market Nested(Type UInt8, GoalID UInt32, OrderID String, OrderPrice Int64, PP UInt32, DirectPlaceID UInt32, DirectOrderID UInt32, DirectBannerID UInt32, GoodID String, GoodName String, GoodQuantity Int32, GoodPrice Int64), IslandID FixedString(16)) ENGINE = CollapsingMergeTree(Sign) PARTITION BY toYYYYMM(StartDate) ORDER BY (CounterID, StartDate, intHash32(UserID), VisitID) SAMPLE BY intHash32(UserID) SETTINGS index_granularity = 8192"
+clickhouse-client --query "CREATE TABLE datasets.visits_v1 ( CounterID UInt32, StartDate Date, Sign Int8, IsNew UInt8, VisitID UInt64, UserID UInt64, StartTime DateTime, Duration UInt32, UTCStartTime DateTime, PageViews Int32, Hits Int32, IsBounce UInt8, Referer String, StartURL String, RefererDomain String, StartURLDomain String, EndURL String, LinkURL String, IsDownload UInt8, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, PlaceID Int32, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), IsClickHouse UInt8, GoalReachesDepth Int32, GoalReachesURL Int32, GoalReachesAny Int32, SocialSourceNetworkID UInt8, SocialSourcePage String, MobilePhoneModel String, ClientEventTime DateTime, RegionID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RemoteIP UInt32, RemoteIP6 FixedString(16), IPNetworkID UInt32, SilverlightVersion3 UInt32, CodeVersion UInt32, ResolutionWidth UInt16, ResolutionHeight UInt16, UserAgentMajor UInt16, UserAgentMinor UInt16, WindowClientWidth UInt16, WindowClientHeight UInt16, SilverlightVersion2 UInt8, SilverlightVersion4 UInt16, FlashVersion3 UInt16, FlashVersion4 UInt16, ClientTimeZone Int16, OS UInt8, UserAgent UInt8, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, NetMajor UInt8, NetMinor UInt8, MobilePhone UInt8, SilverlightVersion1 UInt8, Age UInt8, Sex UInt8, Income UInt8, JavaEnable UInt8, CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, BrowserLanguage UInt16, BrowserCountry UInt16, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), Params Array(String), Goals Nested(ID UInt32, Serial UInt32, EventTime DateTime, Price Int64, OrderID String, CurrencyID UInt32), WatchIDs Array(UInt64), ParamSumPrice Int64, ParamCurrency FixedString(3), ParamCurrencyID UInt16, ClickLogID UInt64, ClickEventID Int32, ClickGoodEvent Int32, ClickEventTime DateTime, ClickPriorityID Int32, ClickPhraseID Int32, ClickPageID Int32, ClickPlaceID Int32, ClickTypeID Int32, ClickResourceID Int32, ClickCost UInt32, ClickClientIP UInt32, ClickDomainID UInt32, ClickURL String, ClickAttempt UInt8, ClickOrderID UInt32, ClickBannerID UInt32, ClickMarketCategoryID UInt32, ClickMarketPP UInt32, ClickMarketCategoryName String, ClickMarketPPName String, ClickAWAPSCampaignName String, ClickPageName String, ClickTargetType UInt16, ClickTargetPhraseID UInt64, ClickContextType UInt8, ClickSelectType Int8, ClickOptions String, ClickGroupBannerID Int32, OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, FirstVisit DateTime, PredLastVisit Date, LastVisit Date, TotalVisits UInt32, TraficSource Nested(ID Int8, SearchEngineID UInt16, AdvEngineID UInt8, PlaceID UInt16, SocialSourceNetworkID UInt8, Domain String, SearchPhrase String, SocialSourcePage String), Attendance FixedString(16), CLID UInt32, YCLID UInt64, NormalizedRefererHash UInt64, SearchPhraseHash UInt64, RefererDomainHash UInt64, NormalizedStartURLHash UInt64, StartURLDomainHash UInt64, NormalizedEndURLHash UInt64, TopLevelDomain UInt64, URLScheme UInt64, OpenstatServiceNameHash UInt64, OpenstatCampaignIDHash UInt64, OpenstatAdIDHash UInt64, OpenstatSourceIDHash UInt64, UTMSourceHash UInt64, UTMMediumHash UInt64, UTMCampaignHash UInt64, UTMContentHash UInt64, UTMTermHash UInt64, FromHash UInt64, WebVisorEnabled UInt8, WebVisorActivity UInt32, ParsedParams Nested(Key1 String, Key2 String, Key3 String, Key4 String, Key5 String, ValueDouble Float64), Market Nested(Type UInt8, GoalID UInt32, OrderID String, OrderPrice Int64, PP UInt32, DirectPlaceID UInt32, DirectOrderID UInt32, DirectBannerID UInt32, GoodID String, GoodName String, GoodQuantity Int32, GoodPrice Int64), IslandID FixedString(16)) ENGINE = CollapsingMergeTree(Sign) PARTITION BY toYYYYMM(StartDate) ORDER BY (CounterID, StartDate, intHash32(UserID), VisitID) SAMPLE BY intHash32(UserID) SETTINGS index_granularity = 8192"
 # import data
 cat visits_v1.tsv | clickhouse-client --query "INSERT INTO datasets.visits_v1 FORMAT TSV" --max_insert_block_size=100000
 # optionally you can optimize table
@@ -73,6 +73,6 @@ clickhouse-client --query "SELECT COUNT(*) FROM datasets.visits_v1"
 ## Example Queries {#example-queries}

-[ClickHouse tutorial](../../getting-started/tutorial.md) is based on Yandex.Metrica dataset and the recommended way to get started with this dataset is to just go through tutorial.
+[The ClickHouse tutorial](../../getting-started/tutorial.md) is based on this web analytics dataset, and the recommended way to get started with this dataset is to go through the tutorial.

 Additional examples of queries to these tables can be found among [stateful tests](https://github.com/ClickHouse/ClickHouse/tree/master/tests/queries/1_stateful) of ClickHouse (they are named `test.hits` and `test.visits` there).
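As a first query against the freshly loaded table, a Sign-aware aggregate is a good fit, since `visits_v1` uses CollapsingMergeTree (a sketch, assuming the `datasets.visits_v1` table created above):

``` sql
-- sum(Sign), not count(), gives the logical number of visits,
-- because collapsed row pairs cancel out.
SELECT StartDate, sum(Sign) AS visits
FROM datasets.visits_v1
GROUP BY StartDate
ORDER BY StartDate
LIMIT 10;
```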
@@ -80,7 +80,7 @@ clickhouse-client --query='INSERT INTO table FORMAT TabSeparated' < data.tsv
 ## Import Sample Dataset {#import-sample-dataset}

-Now it’s time to fill our ClickHouse server with some sample data. In this tutorial, we’ll use the anonymized data of Yandex.Metrica, the first service that runs ClickHouse in production way before it became open-source (more on that in [history section](../introduction/history.md)). There are [multiple ways to import Yandex.Metrica dataset](../getting-started/example-datasets/metrica.md), and for the sake of the tutorial, we’ll go with the most realistic one.
+Now it’s time to fill our ClickHouse server with some sample data. In this tutorial, we’ll use some anonymized web analytics data. There are [multiple ways to import the dataset](../getting-started/example-datasets/metrica.md), and for the sake of the tutorial, we’ll go with the most realistic one.

 ### Download and Extract Table Data {#download-and-extract-table-data}
@@ -105,7 +105,7 @@ Syntax for creating tables is way more complicated compared to databases (see [r
 2. Table schema, i.e. list of columns and their [data types](../sql-reference/data-types/index.md).
 3. [Table engine](../engines/table-engines/index.md) and its settings, which determines all the details on how queries to this table will be physically executed.

-Yandex.Metrica is a web analytics service, and sample dataset does not cover its full functionality, so there are only two tables to create:
+There are two tables to create:

 - `hits` is a table with each action done by all users on all websites covered by the service.
 - `visits` is a table that contains pre-built sessions instead of individual actions.
@@ -287,7 +287,7 @@ CREATE TABLE tutorial.visits_v1
     `URLCategories` Array(UInt16),
     `URLRegions` Array(UInt32),
     `RefererRegions` Array(UInt32),
-    `IsYandex` UInt8,
+    `IsClickHouse` UInt8,
     `GoalReachesDepth` Int32,
     `GoalReachesURL` Int32,
     `GoalReachesAny` Int32,
@@ -512,7 +512,7 @@ SELECT
     sumIf(Sign, has(Goals.ID, 1105530)) AS goal_visits,
     (100. * goal_visits) / visits AS goal_percent
 FROM tutorial.visits_v1
-WHERE (CounterID = 912887) AND (toYYYYMM(StartDate) = 201403) AND (domain(StartURL) = 'yandex.ru')
+WHERE (CounterID = 912887) AND (toYYYYMM(StartDate) = 201403) AND (domain(StartURL) = 'http://public_search')
 ```

 ## Cluster Deployment {#cluster-deployment}
@@ -533,19 +533,19 @@ Example config for a cluster with three shards, one replica each:
 <perftest_3shards_1replicas>
     <shard>
         <replica>
-            <host>example-perftest01j.yandex.ru</host>
+            <host>example-perftest01j</host>
             <port>9000</port>
         </replica>
     </shard>
     <shard>
         <replica>
-            <host>example-perftest02j.yandex.ru</host>
+            <host>example-perftest02j</host>
             <port>9000</port>
         </replica>
     </shard>
    <shard>
         <replica>
-            <host>example-perftest03j.yandex.ru</host>
+            <host>example-perftest03j</host>
             <port>9000</port>
         </replica>
     </shard>
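With this cluster defined, a natural next step is a `Distributed` table over it; a sketch, assuming a local table `tutorial.hits_local` already exists on every shard:

``` sql
-- Queries against hits_all fan out to hits_local on each of the
-- three shards; rand() spreads inserted rows across the shards.
CREATE TABLE tutorial.hits_all AS tutorial.hits_local
ENGINE = Distributed(perftest_3shards_1replicas, tutorial, hits_local, rand());
```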
@@ -591,15 +591,15 @@ Example config for a cluster of one shard containing three replicas:
 <perftest_1shards_3replicas>
     <shard>
         <replica>
-            <host>example-perftest01j.yandex.ru</host>
+            <host>example-perftest01j</host>
             <port>9000</port>
         </replica>
         <replica>
-            <host>example-perftest02j.yandex.ru</host>
+            <host>example-perftest02j</host>
             <port>9000</port>
         </replica>
         <replica>
-            <host>example-perftest03j.yandex.ru</host>
+            <host>example-perftest03j</host>
             <port>9000</port>
         </replica>
     </shard>
@@ -617,15 +617,15 @@ ZooKeeper locations are specified in the configuration file:
 ``` xml
 <zookeeper>
     <node>
-        <host>zoo01.yandex.ru</host>
+        <host>zoo01</host>
         <port>2181</port>
     </node>
     <node>
-        <host>zoo02.yandex.ru</host>
+        <host>zoo02</host>
         <port>2181</port>
     </node>
     <node>
-        <host>zoo03.yandex.ru</host>
+        <host>zoo03</host>
         <port>2181</port>
     </node>
 </zookeeper>
@@ -5,7 +5,7 @@ toc_title: Applying CatBoost Models
 # Applying a Catboost Model in ClickHouse {#applying-catboost-model-in-clickhouse}

-[CatBoost](https://catboost.ai) is a free and open-source gradient boosting library developed at [Yandex](https://yandex.com/company/) for machine learning.
+[CatBoost](https://catboost.ai) is a free and open-source gradient boosting library developed at Yandex for machine learning.

 With this instruction, you will learn to apply pre-trained models in ClickHouse by running model inference from SQL.
@@ -300,7 +300,7 @@ Result:
 <tr> <th>Search phrase</th> <th>Count</th> </tr>
 <tr> <td></td> <td>8267016</td> </tr>
 <tr> <td>bathroom interior design</td> <td>2166</td> </tr>
-<tr> <td>yandex</td> <td>1655</td> </tr>
+<tr> <td>clickhouse</td> <td>1655</td> </tr>
 <tr> <td>spring 2014 fashion</td> <td>1549</td> </tr>
 <tr> <td>freeform photos</td> <td>1480</td> </tr>
 </table>
@@ -371,7 +371,7 @@ Similar to TabSeparated, but outputs a value in name=value format. Names are esc
 ``` text
 SearchPhrase= count()=8267016
 SearchPhrase=bathroom interior design count()=2166
-SearchPhrase=yandex count()=1655
+SearchPhrase=clickhouse count()=1655
 SearchPhrase=2014 spring fashion count()=1549
 SearchPhrase=freeform photos count()=1480
 SearchPhrase=angelina jolie count()=1245
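Output like the TSKV sample above is produced by the usual top-search-phrases query with a `FORMAT` clause; a sketch, assuming a `hits_v1`-style table with a `SearchPhrase` column:

``` sql
-- FORMAT TSKV emits one name=value pair per column per row.
SELECT SearchPhrase, count()
FROM datasets.hits_v1
GROUP BY SearchPhrase
ORDER BY count() DESC
LIMIT 5
FORMAT TSKV;
```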
@@ -1060,7 +1060,7 @@ XML format is suitable only for output, not for parsing. Example:
         <field>2166</field>
     </row>
     <row>
-        <SearchPhrase>yandex</SearchPhrase>
+        <SearchPhrase>clickhouse</SearchPhrase>
         <field>1655</field>
     </row>
     <row>
@@ -12,7 +12,7 @@ ClickHouse provides three network interfaces (they can be optionally wrapped in
 - [Native TCP](../interfaces/tcp.md), which has less overhead.
 - [gRPC](grpc.md).

-In most cases it is recommended to use appropriate tool or library instead of interacting with those directly. Officially supported by Yandex are the following:
+In most cases it is recommended to use an appropriate tool or library instead of interacting with those directly. The following are officially supported by ClickHouse:

 - [Command-line client](../interfaces/cli.md)
 - [JDBC driver](../interfaces/jdbc.md)
@@ -5,7 +5,7 @@ toc_title: Performance
 # Performance {#performance}

-According to internal testing results at Yandex, ClickHouse shows the best performance (both the highest throughput for long queries and the lowest latency on short queries) for comparable operating scenarios among systems of its class that were available for testing. You can view the test results on a [separate page](https://clickhouse.com/benchmark/dbms/).
+ClickHouse shows the best performance (both the highest throughput for long queries and the lowest latency on short queries) for comparable operating scenarios among systems of its class that were available for testing. You can view the test results on a [separate page](https://clickhouse.com/benchmark/dbms/).

 Numerous independent benchmarks came to similar conclusions. They are not difficult to find using an internet search, or you can see [our small collection of related links](https://clickhouse.com/#independent-benchmarks).
@@ -59,7 +59,7 @@ wget https://raw.githubusercontent.com/ClickHouse/ClickHouse/master/benchmark/cl
 chmod a+x benchmark-new.sh
 wget https://raw.githubusercontent.com/ClickHouse/ClickHouse/master/benchmark/clickhouse/queries.sql
 ```
-3. Download test data according to the [Yandex.Metrica dataset](../getting-started/example-datasets/metrica.md) instruction (“hits” table containing 100 million rows).
+3. Download the [web analytics dataset](../getting-started/example-datasets/metrica.md) (“hits” table containing 100 million rows).
 ```bash
 wget https://datasets.clickhouse.com/hits/partitions/hits_100m_obfuscated_v1.tar.xz
 tar xvf hits_100m_obfuscated_v1.tar.xz -C .
@@ -78,6 +78,6 @@ mv hits_100m_obfuscated_v1/* .
 ```bash
 ./benchmark-new.sh hits_100m_obfuscated
 ```
-7. Send the numbers and the info about your hardware configuration to clickhouse-feedback@yandex-team.com
+7. Send the numbers and the info about your hardware configuration to feedback@clickhouse.com

 All the results are published here: https://clickhouse.com/benchmark/hardware/
@@ -101,7 +101,7 @@ Quotas can use the “quota key” feature to report on resources for multiple k
 <web_global>
     <!-- keyed – The quota_key "key" is passed in the query parameter,
         and the quota is tracked separately for each key value.
-        For example, you can pass a Yandex.Metrica username as the key,
+        For example, you can pass a username as the key,
         so the quota will be counted separately for each username.
         Using keys makes sense only if quota_key is transmitted by the program, not by a user.
@@ -410,7 +410,7 @@ Useful for breaking away from a specific network interface.
 **Example**

 ``` xml
-<interserver_http_host>example.yandex.ru</interserver_http_host>
+<interserver_http_host>example.clickhouse.com</interserver_http_host>
 ```

 ## interserver_https_port {#interserver-https-port}
@@ -430,7 +430,7 @@ Similar to `interserver_http_host`, except that this hostname can be used by oth
 **Example**

 ``` xml
-<interserver_https_host>example.yandex.ru</interserver_https_host>
+<interserver_https_host>example.clickhouse.com</interserver_https_host>
 ```

 ## interserver_http_credentials {#server-settings-interserver-http-credentials}
@@ -1326,7 +1326,7 @@ If a query from the same user with the same ‘query_id’ already exists at thi
 `1` – Cancel the old query and start running the new one.

-Yandex.Metrica uses this parameter set to 1 for implementing suggestions for segmentation conditions. After entering the next character, if the old query hasn’t finished yet, it should be cancelled.
+Set this parameter to 1 when implementing suggestions for segmentation conditions: after entering the next character, if the old query hasn’t finished yet, it should be cancelled.

 ## replace_running_query_max_wait_ms {#replace-running-query-max-wait-ms}
@@ -1380,7 +1380,7 @@ load_balancing = nearest_hostname

The number of errors is counted for each replica. Every 5 minutes, the number of errors is integrally divided by 2. Thus, the number of errors is calculated for a recent time with exponential smoothing. If there is one replica with a minimal number of errors (i.e. errors occurred recently on the other replicas), the query is sent to it. If there are multiple replicas with the same minimal number of errors, the query is sent to the replica with a hostname that is most similar to the server’s hostname in the config file (for the number of different characters in identical positions, up to the minimum length of both hostnames).

-For instance, example01-01-1 and example01-01-2.yandex.ru are different in one position, while example01-01-1 and example01-02-2 differ in two places.
+For instance, example01-01-1 and example01-01-2 are different in one position, while example01-01-1 and example01-02-2 differ in two places.
This method might seem primitive, but it does not require external data about network topology, and it does not compare IP addresses, which would be complicated for our IPv6 addresses.

Thus, if there are equivalent replicas, the closest one by name is preferred.

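To try this policy in a session (only the setting name comes from the text above):

``` sql
-- Route queries to the replica whose hostname is closest, by the
-- character-difference measure described above, to this server's hostname.
SET load_balancing = 'nearest_hostname';
```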
@@ -33,7 +33,7 @@ Client section in `config.xml` will look like:
Add Zookeeper to ClickHouse config with some cluster and macros:

``` xml
-<yandex>
+<clickhouse>
    <zookeeper>
        <node>
            <host>localhost</host>
@@ -41,7 +41,7 @@ Add Zookeeper to ClickHouse config with some cluster and macros:
            <secure>1</secure>
        </node>
    </zookeeper>
-</yandex>
+</clickhouse>
```

Start `clickhouse-server`. In logs you should see:
@@ -14,18 +14,18 @@ Columns:
-```
+```text
-┌─name─────────────────────┬─is_aggregate─┬─case_insensitive─┬─alias_to─┐
-│ sumburConsistentHash     │            0 │                0 │          │
-│ yandexConsistentHash     │            0 │                0 │          │
-│ demangle                 │            0 │                0 │          │
-│ addressToLine            │            0 │                0 │          │
-│ JSONExtractRaw           │            0 │                0 │          │
-│ JSONExtractKeysAndValues │            0 │                0 │          │
-│ JSONExtract              │            0 │                0 │          │
-│ JSONExtractString        │            0 │                0 │          │
-│ JSONExtractFloat         │            0 │                0 │          │
-│ JSONExtractInt           │            0 │                0 │          │
-└──────────────────────────┴──────────────┴──────────────────┴──────────┘
+┌─name──────────────────┬─is_aggregate─┬─case_insensitive─┬─alias_to─┬─create_query─┬─origin─┐
+│ logTrace              │            0 │                0 │          │              │ System │
+│ aes_decrypt_mysql     │            0 │                0 │          │              │ System │
+│ aes_encrypt_mysql     │            0 │                0 │          │              │ System │
+│ decrypt               │            0 │                0 │          │              │ System │
+│ encrypt               │            0 │                0 │          │              │ System │
+│ toBool                │            0 │                0 │          │              │ System │
+│ windowID              │            0 │                0 │          │              │ System │
+│ hopStart              │            0 │                0 │          │              │ System │
+│ hop                   │            0 │                0 │          │              │ System │
+│ snowflakeToDateTime64 │            0 │                0 │          │              │ System │
+└───────────────────────┴──────────────┴──────────────────┴──────────┴──────────────┴────────┘

10 rows in set. Elapsed: 0.002 sec.
```
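A query of this shape produces listings like the above (a sketch; the two listings differ because `system.functions` later gained the `create_query` and `origin` columns):

``` sql
SELECT name, is_aggregate, case_insensitive, alias_to
FROM system.functions
LIMIT 10;
```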
@@ -65,13 +65,13 @@ Row 1:
──────
database: merge
table: visits_v2
-replica_name: mtgiga001-1t.metrika.yandex.net
+replica_name: mtgiga001-1t
position: 15
node_name: queue-0009325559
type: MERGE_PARTS
create_time: 2020-12-07 14:04:21
required_quorum: 0
-source_replica: mtgiga001-1t.metrika.yandex.net
+source_replica: mtgiga001-1t
new_part_name: 20201130_121373_121384_2
parts_to_merge: ['20201130_121373_121378_1','20201130_121379_121379_0','20201130_121380_121380_0','20201130_121381_121381_0','20201130_121382_121382_0','20201130_121383_121383_0','20201130_121384_121384_0']
is_detach: 0

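Rows like this can be fetched with a query of the following shape (the table name is taken from the sample row above):

``` sql
SELECT *
FROM system.replication_queue
WHERE table = 'visits_v2'
LIMIT 1
FORMAT Vertical;
```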
@@ -40,7 +40,7 @@ FORMAT Vertical
``` text
Row 1:
──────
-name: example01-08-1.yandex.ru
+name: example01-08-1
value:
czxid: 932998691229
mzxid: 932998691229
@@ -57,7 +57,7 @@ path: /clickhouse/tables/01-08/visits/replicas

Row 2:
──────
-name: example01-08-2.yandex.ru
+name: example01-08-2
value:
czxid: 933002738135
mzxid: 933002738135

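The output above comes from a query of this shape (the path is the one shown in the sample):

``` sql
SELECT *
FROM system.zookeeper
WHERE path = '/clickhouse/tables/01-08/visits/replicas'
FORMAT Vertical;
```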
@@ -139,7 +139,7 @@ With the default settings, ZooKeeper is a time bomb:

This bomb must be defused.

-The ZooKeeper (3.5.1) configuration below is used in the Yandex.Metrica production environment as of May 20, 2017:
+The ZooKeeper (3.5.1) configuration below is used in a large production environment:

zoo.cfg:
@@ -13,7 +13,7 @@ The suffix -If can be appended to the name of any aggregate function. In this case, the aggregate function accepts an extra argument – a condition

Examples: `sumIf(column, cond)`, `countIf(cond)`, `avgIf(x, cond)`, `quantilesTimingIf(level1, level2)(x, cond)`, `argMinIf(arg, val, cond)` and so on.

-With conditional aggregate functions, you can calculate aggregates for several conditions at once, without using subqueries and `JOIN`s. For example, in Yandex.Metrica, conditional aggregate functions are used to implement the segment comparison functionality.
+With conditional aggregate functions, you can calculate aggregates for several conditions at once, without using subqueries and `JOIN`s. For example, conditional aggregate functions can be used to implement the segment comparison functionality.

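For instance, two segments can be compared in a single pass (a sketch; `hits`, `Duration` and `IsMobile` are illustrative names, not from this page):

``` sql
SELECT
    avgIf(Duration, IsMobile = 1) AS avg_mobile,  -- average over one segment
    avgIf(Duration, IsMobile = 0) AS avg_desktop  -- average over the other
FROM hits;
```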
## -Array {#agg-functions-combinator-array}
@@ -14,7 +14,7 @@ This allows you to:
- Check whether a region is part of another region.
- Get a chain of parent regions.

-All the functions support “translocality,” the ability to simultaneously use different perspectives on region ownership. For more information, see the section “Functions for working with Yandex.Metrica dictionaries”.
+All the functions support “translocality,” the ability to simultaneously use different perspectives on region ownership. For more information, see the section “Functions for working with web analytics dictionaries”.

The internal dictionaries are disabled in the default package.
To enable them, uncomment the parameters `path_to_regions_hierarchy_file` and `path_to_regions_names_files` in the server configuration file.

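Once enabled, the region functions can be exercised like this (a sketch; 213 is an arbitrary geobase region id):

``` sql
-- Walk up from a region to its country and resolve the country's name.
SELECT regionToName(regionToCountry(213));
```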
@@ -48,5 +48,5 @@ Dictionary updates (other than loading at first use) do not block queries. During updates, queries use the old versions of dictionaries

We recommend periodically updating the dictionaries with the geobase. During an update, generate new files and write them to a separate location. When everything is ready, rename them to the files used by the server.

-There are also functions for working with OS identifiers and Yandex.Metrica search engines, but they shouldn’t be used.
+There are also functions for working with OS identifiers and search engines, but they shouldn’t be used.
@@ -895,7 +895,6 @@ The same as ‘today() - 1’.
## timeSlot {#timeslot}

Rounds the time to the half hour.
-This function is specific to Yandex.Metrica, since half an hour is the minimum amount of time for breaking a session into two sessions if a tracking tag shows a single user’s consecutive pageviews that differ in time by strictly more than this amount. This means that tuples (the tag ID, user ID, and time slot) can be used to search for pageviews that are included in the corresponding session.

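A quick check of the rounding (not from the original page):

``` sql
SELECT timeSlot(toDateTime('2024-06-01 12:20:00'));
-- Returns 2024-06-01 12:00:00: the time is rounded down to the half hour.
```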
## toYYYYMM {#toyyyymm}
@@ -220,7 +220,7 @@ Result:
A fast, decent-quality non-cryptographic hash function for a string obtained from a URL using some type of normalization.
`URLHash(s)` – Calculates a hash from a string without one of the trailing symbols `/`,`?` or `#` at the end, if present.
`URLHash(s, N)` – Calculates a hash from a string up to the N level in the URL hierarchy, without one of the trailing symbols `/`,`?` or `#` at the end, if present.
-Levels are the same as in URLHierarchy. This function is specific to Yandex.Metrica.
+Levels are the same as in URLHierarchy.

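A small sanity check that follows from the definition above (illustrative):

``` sql
-- A single trailing '/' is ignored, so both calls hash the same string.
SELECT URLHash('https://example.com/a') = URLHash('https://example.com/a/') AS same;
-- same = 1
```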
## farmFingerprint64 {#farmfingerprint64}
@@ -5,7 +5,7 @@ toc_title: JSON

# Functions for Working with JSON {#functions-for-working-with-json}

-In Yandex.Metrica, JSON is transmitted by users as session parameters. There are some special functions for working with this JSON. (Although in most of the cases, the JSONs are additionally pre-processed, and the resulting values are put in separate columns in their processed format.) All these functions are based on strong assumptions about what the JSON can be, but they try to do as little as possible to get the job done.
+ClickHouse has special functions for working with this JSON. All the JSON functions are based on strong assumptions about what the JSON can be, but they try to do as little as possible to get the job done.

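One of these functions in action (the input document is illustrative):

``` sql
-- Extract a string field from a small JSON document.
SELECT visitParamExtractString('{"user":"alice"}', 'user');
-- Returns 'alice'.
```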
The following assumptions are made:
@@ -189,11 +189,11 @@ Accepts a number. If the number is less than one, it returns 0. Otherwise, it rounds the number down

## roundDuration(num) {#rounddurationnum}

-Accepts a number. If the number is less than one, it returns 0. Otherwise, it rounds the number down to numbers from the set: 1, 10, 30, 60, 120, 180, 240, 300, 600, 1200, 1800, 3600, 7200, 18000, 36000. This function is specific to Yandex.Metrica and used for implementing the report on session length.
+Accepts a number. If the number is less than one, it returns 0. Otherwise, it rounds the number down to numbers from the set: 1, 10, 30, 60, 120, 180, 240, 300, 600, 1200, 1800, 3600, 7200, 18000, 36000.

## roundAge(num) {#roundagenum}

-Accepts a number. If the number is less than 18, it returns 0. Otherwise, it rounds the number down to a number from the set: 18, 25, 35, 45, 55. This function is specific to Yandex.Metrica and used for implementing the report on user age.
+Accepts a number. If the number is less than 18, it returns 0. Otherwise, it rounds the number down to a number from the set: 18, 25, 35, 45, 55.

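Both sets in one query (a quick illustration, not from the original page):

``` sql
SELECT roundDuration(230) AS dur, roundAge(30) AS age;
-- dur = 180 (rounded down within 1, 10, 30, ..., 36000)
-- age = 25  (rounded down within 18, 25, 35, 45, 55)
```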
## roundDown(num, arr) {#rounddownnum-arr}
@@ -34,7 +34,7 @@ The URL can be specified with or without a scheme. Examples:
``` text
svn+ssh://some.svn-hosting.com:80/repo/trunk
some.svn-hosting.com:80/repo/trunk
-https://yandex.com/time/
+https://clickhouse.com/time/
```

For these examples, the `domain` function returns the following results:
@@ -42,7 +42,7 @@ For these examples, the `domain` function returns the following results:
``` text
some.svn-hosting.com
some.svn-hosting.com
-yandex.com
+clickhouse.com
```

**Returned values**
@@ -85,7 +85,7 @@ The URL can be specified with or without a scheme. Examples:
``` text
svn+ssh://some.svn-hosting.com:80/repo/trunk
some.svn-hosting.com:80/repo/trunk
-https://yandex.com/time/
+https://clickhouse.com/time/
```

**Returned values**
@@ -109,7 +109,7 @@ SELECT topLevelDomain('svn+ssh://www.some.svn-hosting.com:80/repo/trunk');

### firstSignificantSubdomain {#firstsignificantsubdomain}

-Returns the “first significant subdomain”. This is a non-standard concept specific to Yandex.Metrica. The first significant subdomain is a second-level domain if it is ‘com’, ‘net’, ‘org’, or ‘co’. Otherwise, it is a third-level domain. For example, `firstSignificantSubdomain (‘https://news.yandex.ru/’) = ‘yandex’, firstSignificantSubdomain (‘https://news.yandex.com.tr/’) = ‘yandex’`. The list of “insignificant” second-level domains and other implementation details may change in the future.
+Returns the “first significant subdomain”. The first significant subdomain is a second-level domain if it is ‘com’, ‘net’, ‘org’, or ‘co’. Otherwise, it is a third-level domain. For example, `firstSignificantSubdomain (‘https://news.clickhouse.com/’) = ‘clickhouse’, firstSignificantSubdomain (‘https://news.clickhouse.com.tr/’) = ‘clickhouse’`. The list of “insignificant” second-level domains and other implementation details may change in the future.

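The example above in runnable form:

``` sql
SELECT firstSignificantSubdomain('https://news.clickhouse.com.tr/');
-- Returns 'clickhouse'.
```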
### cutToFirstSignificantSubdomain {#cuttofirstsignificantsubdomain}
@@ -117,7 +117,7 @@ Returns the part of the domain that includes top-level subdomains up to the “first significant subdomain”

For example:

-- `cutToFirstSignificantSubdomain('https://news.yandex.com.tr/') = 'yandex.com.tr'`.
+- `cutToFirstSignificantSubdomain('https://news.clickhouse.com.tr/') = 'clickhouse.com.tr'`.
- `cutToFirstSignificantSubdomain('www.tr') = 'tr'`.
- `cutToFirstSignificantSubdomain('tr') = ''`.

@@ -127,7 +127,7 @@ Returns the part of the domain that includes top-level subdomains up to the “first significant subdomain”

For example:

-- `cutToFirstSignificantSubdomain('https://news.yandex.com.tr/') = 'yandex.com.tr'`.
+- `cutToFirstSignificantSubdomain('https://news.clickhouse.com.tr/') = 'clickhouse.com.tr'`.
- `cutToFirstSignificantSubdomain('www.tr') = 'www.tr'`.
- `cutToFirstSignificantSubdomain('tr') = ''`.

@@ -335,7 +335,7 @@ Returns an array containing the URL, truncated at the end by the symbols /,? in the path and query string

### URLPathHierarchy(URL) {#urlpathhierarchyurl}

-The same as above, but without the protocol and host in the result. The / element (root) is not included. Example: the function is used to implement tree reports the URL in Yandex. Metric.
+The same as above, but without the protocol and host in the result. The / element (root) is not included.

``` text
URLPathHierarchy('https://example.com/browse/CONV-6788') =
@@ -26,11 +26,11 @@ A table with the specified structure for reading or writing data in the specified file

**Examples**

-Selecting the first two rows from the table from S3 file `https://storage.yandexcloud.net/my-test-bucket-768/data.csv`:
+Selecting the first two rows from the table from S3 file `https://hostname/my-test-bucket-768/data.csv`:

``` sql
SELECT *
-FROM s3('https://storage.yandexcloud.net/my-test-bucket-768/data.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 UInt32')
+FROM s3('https://hostname/my-test-bucket-768/data.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 UInt32')
LIMIT 2;
```

@@ -45,7 +45,7 @@ The similar but from file with `gzip` compression:

``` sql
SELECT *
-FROM s3('https://storage.yandexcloud.net/my-test-bucket-768/data.csv.gz', 'CSV', 'column1 UInt32, column2 UInt32, column3 UInt32', 'gzip')
+FROM s3('https://hostname/my-test-bucket-768/data.csv.gz', 'CSV', 'column1 UInt32, column2 UInt32, column3 UInt32', 'gzip')
LIMIT 2;
```

@@ -60,20 +60,20 @@ LIMIT 2;

Suppose that we have several files with following URIs on S3:

-- 'https://storage.yandexcloud.net/my-test-bucket-768/some_prefix/some_file_1.csv'
-- 'https://storage.yandexcloud.net/my-test-bucket-768/some_prefix/some_file_2.csv'
-- 'https://storage.yandexcloud.net/my-test-bucket-768/some_prefix/some_file_3.csv'
-- 'https://storage.yandexcloud.net/my-test-bucket-768/some_prefix/some_file_4.csv'
-- 'https://storage.yandexcloud.net/my-test-bucket-768/another_prefix/some_file_1.csv'
-- 'https://storage.yandexcloud.net/my-test-bucket-768/another_prefix/some_file_2.csv'
-- 'https://storage.yandexcloud.net/my-test-bucket-768/another_prefix/some_file_3.csv'
-- 'https://storage.yandexcloud.net/my-test-bucket-768/another_prefix/some_file_4.csv'
+- 'https://hostname/my-test-bucket-768/some_prefix/some_file_1.csv'
+- 'https://hostname/my-test-bucket-768/some_prefix/some_file_2.csv'
+- 'https://hostname/my-test-bucket-768/some_prefix/some_file_3.csv'
+- 'https://hostname/my-test-bucket-768/some_prefix/some_file_4.csv'
+- 'https://hostname/my-test-bucket-768/another_prefix/some_file_1.csv'
+- 'https://hostname/my-test-bucket-768/another_prefix/some_file_2.csv'
+- 'https://hostname/my-test-bucket-768/another_prefix/some_file_3.csv'
+- 'https://hostname/my-test-bucket-768/another_prefix/some_file_4.csv'

Count the amount of rows in files ending with numbers from 1 to 3:

``` sql
SELECT count(*)
-FROM s3('https://storage.yandexcloud.net/my-test-bucket-768/{some,another}_prefix/some_file_{1..3}.csv', 'CSV', 'name String, value UInt32')
+FROM s3('https://hostname/my-test-bucket-768/{some,another}_prefix/some_file_{1..3}.csv', 'CSV', 'name String, value UInt32')
```

``` text
@@ -86,7 +86,7 @@ Count the total amount of rows in all files in these two directories:

``` sql
SELECT count(*)
-FROM s3('https://storage.yandexcloud.net/my-test-bucket-768/{some,another}_prefix/*', 'CSV', 'name String, value UInt32')
+FROM s3('https://hostname/my-test-bucket-768/{some,another}_prefix/*', 'CSV', 'name String, value UInt32')
```

``` text
@@ -102,7 +102,7 @@ Count the total amount of rows in files named `file-000.csv`, `file-001.csv`,

``` sql
SELECT count(*)
-FROM s3('https://storage.yandexcloud.net/my-test-bucket-768/big_prefix/file-{000..999}.csv', 'CSV', 'name String, value UInt32');
+FROM s3('https://hostname/my-test-bucket-768/big_prefix/file-{000..999}.csv', 'CSV', 'name String, value UInt32');
```

``` text
@@ -114,14 +114,14 @@ FROM s3('https://storage.yandexcloud.net/my-test-bucket-768/big_prefix/file-{000..999}.csv', 'CSV', 'name String, value UInt32');

Insert data into file `test-data.csv.gz`:

``` sql
-INSERT INTO FUNCTION s3('https://storage.yandexcloud.net/my-test-bucket-768/test-data.csv.gz', 'CSV', 'name String, value UInt32', 'gzip')
+INSERT INTO FUNCTION s3('https://hostname/my-test-bucket-768/test-data.csv.gz', 'CSV', 'name String, value UInt32', 'gzip')
VALUES ('test-data', 1), ('test-data-2', 2);
```

Insert data into file `test-data.csv.gz` from existing table:

``` sql
-INSERT INTO FUNCTION s3('https://storage.yandexcloud.net/my-test-bucket-768/test-data.csv.gz', 'CSV', 'name String, value UInt32', 'gzip')
+INSERT INTO FUNCTION s3('https://hostname/my-test-bucket-768/test-data.csv.gz', 'CSV', 'name String, value UInt32', 'gzip')
SELECT name, value FROM existing_table;
```

@@ -926,7 +926,7 @@ toc_title: '2018'
#### Backward Incompatible Changes: {#backward-incompatible-changes-10}

- Removed the `distributed_ddl_allow_replicated_alter` option. This behavior is enabled by default.
-- Removed the `strict_insert_defaults` setting. If you were using this functionality, write to `clickhouse-feedback@yandex-team.com`.
+- Removed the `strict_insert_defaults` setting. If you were using this functionality, write to `feedback@clickhouse.com`.
- Removed the `UnsortedMergeTree` engine.

### ClickHouse Release 1.1.54343, 2018-02-05 {#clickhouse-release-1-1-54343-2018-02-05}
@@ -1049,7 +1049,7 @@ This release contains bug fixes for the previous release 1.1.54337:
- The `runningIncome` function was renamed to `runningDifferenceStartingWithFirstvalue` to avoid confusion.
- Removed the `FROM ARRAY JOIN arr` syntax when ARRAY JOIN is specified directly after FROM with no table (Amos Bird).
- Removed the `BlockTabSeparated` format that was used solely for demonstration purposes.
-- Changed the state format for aggregate functions `varSamp`, `varPop`, `stddevSamp`, `stddevPop`, `covarSamp`, `covarPop`, `corr`. If you have stored states of these aggregate functions in tables (using the `AggregateFunction` data type or materialized views with corresponding states), please write to clickhouse-feedback@yandex-team.com.
+- Changed the state format for aggregate functions `varSamp`, `varPop`, `stddevSamp`, `stddevPop`, `covarSamp`, `covarPop`, `corr`. If you have stored states of these aggregate functions in tables (using the `AggregateFunction` data type or materialized views with corresponding states), please write to feedback@clickhouse.com.
- In previous server versions there was an undocumented feature: if an aggregate function depends on parameters, you can still specify it without parameters in the AggregateFunction data type. Example: `AggregateFunction(quantiles, UInt64)` instead of `AggregateFunction(quantiles(0.5, 0.9), UInt64)`. This feature was lost. Although it was undocumented, we plan to support it again in future releases.
- Enum data types cannot be used in min/max aggregate functions. This ability will be returned in the next release.

@@ -295,7 +295,7 @@ CREATE TABLE tutorial.visits_v1
    `URLCategories` Array(UInt16),
    `URLRegions` Array(UInt32),
    `RefererRegions` Array(UInt32),
-    `IsYandex` UInt8,
+    `IsClickHouse` UInt8,
    `GoalReachesDepth` Int32,
    `GoalReachesURL` Int32,
    `GoalReachesAny` Int32,
@@ -523,7 +523,7 @@ SELECT
    sumIf(Sign, has(Goals.ID, 1105530)) AS goal_visits,
    (100. * goal_visits) / visits AS goal_percent
FROM tutorial.visits_v1
-WHERE (CounterID = 912887) AND (toYYYYMM(StartDate) = 201403) AND (domain(StartURL) = 'yandex.ru')
+WHERE (CounterID = 912887) AND (toYYYYMM(StartDate) = 201403) AND (domain(StartURL) = 'http://public_search')
```

## Cluster Deployment {#cluster-deployment}
|
||||
<perftest_3shards_1replicas>
|
||||
<shard>
|
||||
<replica>
|
||||
<host>example-perftest01j.yandex.ru</host>
|
||||
<host>example-perftest01j</host>
|
||||
<port>9000</port>
|
||||
</replica>
|
||||
</shard>
|
||||
<shard>
|
||||
<replica>
|
||||
<host>example-perftest02j.yandex.ru</host>
|
||||
<host>example-perftest02j</host>
|
||||
<port>9000</port>
|
||||
</replica>
|
||||
</shard>
|
||||
<shard>
|
||||
<replica>
|
||||
<host>example-perftest03j.yandex.ru</host>
|
||||
<host>example-perftest03j</host>
|
||||
<port>9000</port>
|
||||
</replica>
|
||||
</shard>
|
||||
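With this cluster defined, a Distributed table can fan queries out across the three shards (a sketch; `tutorial.hits_local` is assumed to exist on every node):

``` sql
-- Reads and writes through hits_all are routed to the shards of
-- perftest_3shards_1replicas; rand() spreads inserts across the shards.
CREATE TABLE tutorial.hits_all AS tutorial.hits_local
ENGINE = Distributed(perftest_3shards_1replicas, tutorial, hits_local, rand());
```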
@@ -607,15 +607,15 @@ INSERT INTO tutorial.hits_all SELECT * FROM tutorial.hits_v1;
<perftest_1shards_3replicas>
    <shard>
        <replica>
-            <host>example-perftest01j.yandex.ru</host>
+            <host>example-perftest01j</host>
            <port>9000</port>
        </replica>
        <replica>
-            <host>example-perftest02j.yandex.ru</host>
+            <host>example-perftest02j</host>
            <port>9000</port>
        </replica>
        <replica>
-            <host>example-perftest03j.yandex.ru</host>
+            <host>example-perftest03j</host>
            <port>9000</port>
        </replica>
    </shard>
@@ -637,15 +637,15 @@ The location of ZooKeeper is specified in the configuration file:
``` xml
<zookeeper>
    <node>
-        <host>zoo01.yandex.ru</host>
+        <host>zoo01</host>
        <port>2181</port>
    </node>
    <node>
-        <host>zoo02.yandex.ru</host>
+        <host>zoo02</host>
        <port>2181</port>
    </node>
    <node>
-        <host>zoo03.yandex.ru</host>
+        <host>zoo03</host>
        <port>2181</port>
    </node>
</zookeeper>