From 7d7a2d0ddef17115fd935405248d0c8d3fc8615f Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 12 Aug 2020 07:00:04 +0300 Subject: [PATCH] Added instruction to reproduce the benchmark --- benchmark/monetdb/instruction.md | 356 +++++++++++++++++++++++++++++++ benchmark/monetdb/usability.md | 8 +- 2 files changed, 363 insertions(+), 1 deletion(-) create mode 100644 benchmark/monetdb/instruction.md diff --git a/benchmark/monetdb/instruction.md b/benchmark/monetdb/instruction.md new file mode 100644 index 00000000000..b8426d845a9 --- /dev/null +++ b/benchmark/monetdb/instruction.md @@ -0,0 +1,356 @@ +Go to https://www.monetdb.org/ + +Download now. +Latest binary releases. +Ubuntu & Debian. + +https://www.monetdb.org/downloads/deb/ + +Go to the server where you want to install MonetDB. +``` +$ sudo mcedit /etc/apt/sources.list.d/monetdb.list +``` +Write: +``` +deb https://dev.monetdb.org/downloads/deb/ bionic monetdb +``` + +``` +$ wget --output-document=- https://www.monetdb.org/downloads/MonetDB-GPG-KEY | sudo apt-key add - + +$ sudo apt update +$ sudo apt install monetdb5-sql monetdb-client + +$ sudo systemctl enable monetdbd +$ sudo systemctl start monetdbd +$ sudo usermod -a -G monetdb $USER +``` + +Log out and log back in to your server. + +Tutorial: +https://www.monetdb.org/Documentation/UserGuide/Tutorial + +Creating the database: + +``` +$ sudo mkdir /opt/monetdb +$ sudo chmod 777 /opt/monetdb +$ monetdbd create /opt/monetdb + +$ monetdbd start /opt/monetdb +cannot remove socket files +``` + +Now you have to stop MonetDB, copy the contents of `/var/monetdb5` to `/opt/monetdb` and replace `/var/monetdb5` with a symlink to `/opt/monetdb`. This is necessary because I don't have free space in `/var` and creating the database directly in `/opt` did not succeed. + +Start MonetDB again. 
+ +``` +$ sudo systemctl start monetdbd +``` + +``` +$ monetdb create test +created database in maintenance mode: test + +$ monetdb release test +taken database out of maintenance mode: test +``` + +Run client: +``` +$ mclient -u monetdb -d test +``` + +Type password: monetdb + +``` +CREATE TABLE hits +( + "WatchID" BIGINT, + "JavaEnable" TINYINT, + "Title" TEXT, + "GoodEvent" SMALLINT, + "EventTime" TIMESTAMP, + "EventDate" Date, + "CounterID" INTEGER, + "ClientIP" INTEGER, + "RegionID" INTEGER, + "UserID" BIGINT, + "CounterClass" TINYINT, + "OS" TINYINT, + "UserAgent" TINYINT, + "URL" TEXT, + "Referer" TEXT, + "Refresh" TINYINT, + "RefererCategoryID" SMALLINT, + "RefererRegionID" INTEGER, + "URLCategoryID" SMALLINT, + "URLRegionID" INTEGER, + "ResolutionWidth" SMALLINT, + "ResolutionHeight" SMALLINT, + "ResolutionDepth" TINYINT, + "FlashMajor" TINYINT, + "FlashMinor" TINYINT, + "FlashMinor2" TEXT, + "NetMajor" TINYINT, + "NetMinor" TINYINT, + "UserAgentMajor" SMALLINT, + "UserAgentMinor" TEXT, + "CookieEnable" TINYINT, + "JavascriptEnable" TINYINT, + "IsMobile" TINYINT, + "MobilePhone" TINYINT, + "MobilePhoneModel" TEXT, + "Params" TEXT, + "IPNetworkID" INTEGER, + "TraficSourceID" TINYINT, + "SearchEngineID" SMALLINT, + "SearchPhrase" TEXT, + "AdvEngineID" TINYINT, + "IsArtifical" TINYINT, + "WindowClientWidth" SMALLINT, + "WindowClientHeight" SMALLINT, + "ClientTimeZone" SMALLINT, + "ClientEventTime" TIMESTAMP, + "SilverlightVersion1" TINYINT, + "SilverlightVersion2" TINYINT, + "SilverlightVersion3" INTEGER, + "SilverlightVersion4" SMALLINT, + "PageCharset" TEXT, + "CodeVersion" INTEGER, + "IsLink" TINYINT, + "IsDownload" TINYINT, + "IsNotBounce" TINYINT, + "FUniqID" BIGINT, + "OriginalURL" TEXT, + "HID" INTEGER, + "IsOldCounter" TINYINT, + "IsEvent" TINYINT, + "IsParameter" TINYINT, + "DontCountHits" TINYINT, + "WithHash" TINYINT, + "HitColor" TEXT, + "LocalEventTime" TIMESTAMP, + "Age" TINYINT, + "Sex" TINYINT, + "Income" TINYINT, + "Interests" SMALLINT, + 
"Robotness" TINYINT, + "RemoteIP" INTEGER, + "WindowName" INTEGER, + "OpenerName" INTEGER, + "HistoryLength" SMALLINT, + "BrowserLanguage" TEXT, + "BrowserCountry" TEXT, + "SocialNetwork" TEXT, + "SocialAction" TEXT, + "HTTPError" SMALLINT, + "SendTiming" INTEGER, + "DNSTiming" INTEGER, + "ConnectTiming" INTEGER, + "ResponseStartTiming" INTEGER, + "ResponseEndTiming" INTEGER, + "FetchTiming" INTEGER, + "SocialSourceNetworkID" TINYINT, + "SocialSourcePage" TEXT, + "ParamPrice" BIGINT, + "ParamOrderID" TEXT, + "ParamCurrency" TEXT, + "ParamCurrencyID" SMALLINT, + "OpenstatServiceName" TEXT, + "OpenstatCampaignID" TEXT, + "OpenstatAdID" TEXT, + "OpenstatSourceID" TEXT, + "UTMSource" TEXT, + "UTMMedium" TEXT, + "UTMCampaign" TEXT, + "UTMContent" TEXT, + "UTMTerm" TEXT, + "FromTag" TEXT, + "HasGCLID" TINYINT, + "RefererHash" BIGINT, + "URLHash" BIGINT, + "CLID" INTEGER +); +``` + +# How to prepare data + +Download the 100 million rows dataset from here and insert into ClickHouse: +https://clickhouse.tech/docs/en/getting-started/example-datasets/metrica/ + +Create the dataset from ClickHouse: + +``` +SELECT + toInt64(WatchID) = -9223372036854775808 ? -9223372036854775807 : toInt64(WatchID), + toInt8(JavaEnable) = -128 ? -127 : toInt8(JavaEnable), + toValidUTF8(toString(Title)), + toInt16(GoodEvent) = -32768 ? -32767 : toInt16(GoodEvent), + EventTime, + EventDate, + toInt32(CounterID) = -2147483648 ? -2147483647 : toInt32(CounterID), + toInt32(ClientIP) = -2147483648 ? -2147483647 : toInt32(ClientIP), + toInt32(RegionID) = -2147483648 ? -2147483647 : toInt32(RegionID), + toInt64(UserID) = -9223372036854775808 ? -9223372036854775807 : toInt64(UserID), + toInt8(CounterClass) = -128 ? -127 : toInt8(CounterClass), + toInt8(OS) = -128 ? -127 : toInt8(OS), + toInt8(UserAgent) = -128 ? -127 : toInt8(UserAgent), + toValidUTF8(toString(URL)), + toValidUTF8(toString(Referer)), + toInt8(Refresh) = -128 ? -127 : toInt8(Refresh), + toInt16(RefererCategoryID) = -32768 ? 
-32767 : toInt16(RefererCategoryID), + toInt32(RefererRegionID) = -2147483648 ? -2147483647 : toInt32(RefererRegionID), + toInt16(URLCategoryID) = -32768 ? -32767 : toInt16(URLCategoryID), + toInt32(URLRegionID) = -2147483648 ? -2147483647 : toInt32(URLRegionID), + toInt16(ResolutionWidth) = -32768 ? -32767 : toInt16(ResolutionWidth), + toInt16(ResolutionHeight) = -32768 ? -32767 : toInt16(ResolutionHeight), + toInt8(ResolutionDepth) = -128 ? -127 : toInt8(ResolutionDepth), + toInt8(FlashMajor) = -128 ? -127 : toInt8(FlashMajor), + toInt8(FlashMinor) = -128 ? -127 : toInt8(FlashMinor), + toValidUTF8(toString(FlashMinor2)), + toInt8(NetMajor) = -128 ? -127 : toInt8(NetMajor), + toInt8(NetMinor) = -128 ? -127 : toInt8(NetMinor), + toInt16(UserAgentMajor) = -32768 ? -32767 : toInt16(UserAgentMajor), + toValidUTF8(toString(UserAgentMinor)), + toInt8(CookieEnable) = -128 ? -127 : toInt8(CookieEnable), + toInt8(JavascriptEnable) = -128 ? -127 : toInt8(JavascriptEnable), + toInt8(IsMobile) = -128 ? -127 : toInt8(IsMobile), + toInt8(MobilePhone) = -128 ? -127 : toInt8(MobilePhone), + toValidUTF8(toString(MobilePhoneModel)), + toValidUTF8(toString(Params)), + toInt32(IPNetworkID) = -2147483648 ? -2147483647 : toInt32(IPNetworkID), + toInt8(TraficSourceID) = -128 ? -127 : toInt8(TraficSourceID), + toInt16(SearchEngineID) = -32768 ? -32767 : toInt16(SearchEngineID), + toValidUTF8(toString(SearchPhrase)), + toInt8(AdvEngineID) = -128 ? -127 : toInt8(AdvEngineID), + toInt8(IsArtifical) = -128 ? -127 : toInt8(IsArtifical), + toInt16(WindowClientWidth) = -32768 ? -32767 : toInt16(WindowClientWidth), + toInt16(WindowClientHeight) = -32768 ? -32767 : toInt16(WindowClientHeight), + toInt16(ClientTimeZone) = -32768 ? -32767 : toInt16(ClientTimeZone), + ClientEventTime, + toInt8(SilverlightVersion1) = -128 ? -127 : toInt8(SilverlightVersion1), + toInt8(SilverlightVersion2) = -128 ? -127 : toInt8(SilverlightVersion2), + toInt32(SilverlightVersion3) = -2147483648 ? 
-2147483647 : toInt32(SilverlightVersion3), + toInt16(SilverlightVersion4) = -32768 ? -32767 : toInt16(SilverlightVersion4), + toValidUTF8(toString(PageCharset)), + toInt32(CodeVersion) = -2147483648 ? -2147483647 : toInt32(CodeVersion), + toInt8(IsLink) = -128 ? -127 : toInt8(IsLink), + toInt8(IsDownload) = -128 ? -127 : toInt8(IsDownload), + toInt8(IsNotBounce) = -128 ? -127 : toInt8(IsNotBounce), + toInt64(FUniqID) = -9223372036854775808 ? -9223372036854775807 : toInt64(FUniqID), + toValidUTF8(toString(OriginalURL)), + toInt32(HID) = -2147483648 ? -2147483647 : toInt32(HID), + toInt8(IsOldCounter) = -128 ? -127 : toInt8(IsOldCounter), + toInt8(IsEvent) = -128 ? -127 : toInt8(IsEvent), + toInt8(IsParameter) = -128 ? -127 : toInt8(IsParameter), + toInt8(DontCountHits) = -128 ? -127 : toInt8(DontCountHits), + toInt8(WithHash) = -128 ? -127 : toInt8(WithHash), + toValidUTF8(toString(HitColor)), + LocalEventTime, + toInt8(Age) = -128 ? -127 : toInt8(Age), + toInt8(Sex) = -128 ? -127 : toInt8(Sex), + toInt8(Income) = -128 ? -127 : toInt8(Income), + toInt16(Interests) = -32768 ? -32767 : toInt16(Interests), + toInt8(Robotness) = -128 ? -127 : toInt8(Robotness), + toInt32(RemoteIP) = -2147483648 ? -2147483647 : toInt32(RemoteIP), + toInt32(WindowName) = -2147483648 ? -2147483647 : toInt32(WindowName), + toInt32(OpenerName) = -2147483648 ? -2147483647 : toInt32(OpenerName), + toInt16(HistoryLength) = -32768 ? -32767 : toInt16(HistoryLength), + toValidUTF8(toString(BrowserLanguage)), + toValidUTF8(toString(BrowserCountry)), + toValidUTF8(toString(SocialNetwork)), + toValidUTF8(toString(SocialAction)), + toInt16(HTTPError) = -32768 ? -32767 : toInt16(HTTPError), + toInt32(SendTiming) = -2147483648 ? -2147483647 : toInt32(SendTiming), + toInt32(DNSTiming) = -2147483648 ? -2147483647 : toInt32(DNSTiming), + toInt32(ConnectTiming) = -2147483648 ? -2147483647 : toInt32(ConnectTiming), + toInt32(ResponseStartTiming) = -2147483648 ? 
-2147483647 : toInt32(ResponseStartTiming), + toInt32(ResponseEndTiming) = -2147483648 ? -2147483647 : toInt32(ResponseEndTiming), + toInt32(FetchTiming) = -2147483648 ? -2147483647 : toInt32(FetchTiming), + toInt8(SocialSourceNetworkID) = -128 ? -127 : toInt8(SocialSourceNetworkID), + toValidUTF8(toString(SocialSourcePage)), + toInt64(ParamPrice) = -9223372036854775808 ? -9223372036854775807 : toInt64(ParamPrice), + toValidUTF8(toString(ParamOrderID)), + toValidUTF8(toString(ParamCurrency)), + toInt16(ParamCurrencyID) = -32768 ? -32767 : toInt16(ParamCurrencyID), + toValidUTF8(toString(OpenstatServiceName)), + toValidUTF8(toString(OpenstatCampaignID)), + toValidUTF8(toString(OpenstatAdID)), + toValidUTF8(toString(OpenstatSourceID)), + toValidUTF8(toString(UTMSource)), + toValidUTF8(toString(UTMMedium)), + toValidUTF8(toString(UTMCampaign)), + toValidUTF8(toString(UTMContent)), + toValidUTF8(toString(UTMTerm)), + toValidUTF8(toString(FromTag)), + toInt8(HasGCLID) = -128 ? -127 : toInt8(HasGCLID), + toInt64(RefererHash) = -9223372036854775808 ? -9223372036854775807 : toInt64(RefererHash), + toInt64(URLHash) = -9223372036854775808 ? -9223372036854775807 : toInt64(URLHash), + toInt32(CLID) = -2147483648 ? -2147483647 : toInt32(CLID) +FROM hits_100m_obfuscated +INTO OUTFILE '/home/milovidov/example_datasets/hits_100m_obfuscated_monetdb.tsv' +FORMAT TSV; +``` + +Note that MonetDB does not support the most negative value of each integer type (for example, -128 for TINYINT), so we have to convert such values by adding one. +This makes it impossible to store every value of a 64-bit identifier in BIGINT: the most negative value is lost. +Maybe it's a trick to optimize NULLs? + +Upload the data: + +``` +$ mclient -u monetdb -d test +``` + +Type password: monetdb + +``` +COPY INTO hits FROM '/home/milovidov/example_datasets/hits_100m_obfuscated_monetdb.tsv' USING DELIMITERS '\t'; +``` + +It takes 28 minutes 02 seconds on a server (Linux Ubuntu, Xeon E5-2560v2, 32 logical CPU, 128 GiB RAM, 8xHDD RAID-5, 40 TB). +It is roughly 60 000 rows per second. 
+ +Validate the data: + +``` +SELECT count(*) FROM hits; +``` + +Create an index: + +``` +CREATE INDEX hits_idx ON hits ("CounterID", "EventDate"); +``` + +(it takes 5 seconds) + +Run the benchmark: + +``` +./benchmark.sh | tee log.txt +``` + +You can find the log in `log.txt` file. + +Postprocess data: + +``` +grep clk log.txt | awk '{ if ($3 == "ms") { print $2 / 1000; } else if ($3 == "sec") { print $2 } else { print } }' +``` + +Then replace values with "min" (minutes) timing manually and save to `tmp.txt`. +Then process to JSON format: + +``` +awk '{ + if (i % 3 == 0) { a = $1 } + else if (i % 3 == 1) { b = $1 } + else if (i % 3 == 2) { c = $1; print "[" a ", " b ", " c "]," }; + ++i; }' < tmp.txt +``` + +And paste to `/website/benchmark/dbms/results/005_monetdb.json` in the repository. diff --git a/benchmark/monetdb/usability.md b/benchmark/monetdb/usability.md index c1f35b6bd4e..1f39b3a434c 100644 --- a/benchmark/monetdb/usability.md +++ b/benchmark/monetdb/usability.md @@ -1205,4 +1205,10 @@ How to prepare the benchmark report: `grep clk log.txt | awk '{ if ($3 == "ms") { print $2 / 1000; } else if ($3 == "sec") { print $2 } else { print } }'` - +``` +awk '{ + if (i % 3 == 0) { a = $1 } + else if (i % 3 == 1) { b = $1 } + else if (i % 3 == 2) { c = $1; print "[" a ", " b ", " c "]," }; + ++i; }' < tmp.txt +```