mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-25 00:52:02 +00:00
Added instruction
This commit is contained in:
parent
023540f238
commit
10346964ed
@ -1,8 +1,65 @@
|
|||||||
clickhouse-client --query "SHOW CREATE TABLE hits_100m" --format TSVRaw | tr '`' '"' | sed -r -e 's/U?Int64/BIGINT/; s/U?Int32/INTEGER/; s/U?Int16/SMALLINT/; s/U?Int8/TINYINT/; s/DateTime/TIMESTAMP ENCODING FIXED(32)/; s/Date/DATE ENCODING DAYS(16)/; s/FixedString\(2\)/TEXT ENCODING DICT(16)/; s/FixedString\(3\)/TEXT ENCODING DICT/; s/FixedString\(\d+\)/TEXT ENCODING DICT/; s/String/TEXT ENCODING DICT/;'
|
# Instructions for running the OmniSci benchmark on the web-analytics dataset
|
||||||
|
|
||||||
clickhouse-client --query "SHOW CREATE TABLE hits_100m" --format TSVRaw | tr '`' '"' | sed -r -e 's/"(\w+)" U?Int([0-9]+)/toInt\2(\1)/; s/"(\w+)" (Fixed)?String(\([0-9]+\))?/toValidUTF8(toString(\1))/; s/"(\w+)" \w+/\1/'
|
OmniSci (formerly named "MapD") is an open-source (open-core) in-memory analytical DBMS with support for GPU processing.
|
||||||
|
It can also run on a CPU without a GPU. It can show competitive performance on simple queries (e.g. a simple aggregation on a single column).
|
||||||
|
|
||||||
|
# How to install
|
||||||
|
|
||||||
|
https://docs.omnisci.com/installation-and-configuration/installation/installing-on-ubuntu
|
||||||
|
|
||||||
|
# Caveats
|
||||||
|
|
||||||
|
- The dataset (or at least the needed columns) must fit in memory.
|
||||||
|
- It does not support data compression (only dictionary encoding for strings).
|
||||||
|
- First query execution is very slow because uncompressed data is read from disk.
|
||||||
|
- It does not support index for quick range queries.
|
||||||
|
- It does not support NOT NULL for data types.
|
||||||
|
- It does not support BLOB.
|
||||||
|
- No support for UNSIGNED data types (which is acceptable, as they are not part of the SQL standard).
|
||||||
|
- Lack of string processing functions.
|
||||||
|
- Strings are limited to 32767 bytes.
|
||||||
|
- GROUP BY on text data type is supported only if it has dictionary encoding.
|
||||||
|
`Exception: Cannot group by string columns which are not dictionary encoded`
|
||||||
|
- Some aggregate functions are not supported for strings at all.
|
||||||
|
`Aggregate on TEXT is not supported yet.`
|
||||||
|
- Sometimes I hit a bug where a query runs in an infinite loop and does not finish (after a retry it finishes successfully).
|
||||||
|
- One query took hours to execute, even with retries.
|
||||||
|
- Sorting is slow and is disabled by default settings for large result sets.
|
||||||
|
`Exception: Sorting the result would be too slow`
|
||||||
|
`Cast from dictionary-encoded string to none-encoded would be slow`
|
||||||
|
|
||||||
|
To enable sorting of large result sets, see:
|
||||||
|
https://stackoverflow.com/questions/62977734/omnissci-sorting-the-result-would-be-too-slow
|
||||||
|
|
||||||
|
The list of known issues is here:
|
||||||
|
https://github.com/omnisci/omniscidb/issues?q=is%3Aissue+author%3Aalexey-milovidov
|
||||||
|
|
||||||
|
# How to prepare data
|
||||||
|
|
||||||
|
Download the 100-million-row dataset from here and insert it into ClickHouse:
|
||||||
|
https://clickhouse.tech/docs/en/getting-started/example-datasets/metrica/
|
||||||
|
|
||||||
|
Convert the CREATE TABLE query:
|
||||||
|
|
||||||
|
```
|
||||||
|
clickhouse-client --query "SHOW CREATE TABLE hits_100m" --format TSVRaw |
|
||||||
|
tr '`' '"' |
|
||||||
|
sed -r -e '
|
||||||
|
s/U?Int64/BIGINT/;
|
||||||
|
s/U?Int32/INTEGER/;
|
||||||
|
s/U?Int16/SMALLINT/;
|
||||||
|
s/U?Int8/TINYINT/;
|
||||||
|
s/DateTime/TIMESTAMP ENCODING FIXED(32)/;
|
||||||
|
s/ Date/ DATE ENCODING DAYS(16)/;
|
||||||
|
s/FixedString\(2\)/TEXT ENCODING DICT(16)/;
|
||||||
|
s/FixedString\(3\)/TEXT ENCODING DICT/;
|
||||||
|
s/FixedString\(\d+\)/TEXT ENCODING DICT/;
|
||||||
|
s/String/TEXT ENCODING DICT/;'
|
||||||
|
```
|
||||||
|
Then remove the `ENGINE` part from the resulting statement.
|
||||||
|
|
||||||
|
The resulting CREATE TABLE query:
|
||||||
|
```
|
||||||
CREATE TABLE hits
|
CREATE TABLE hits
|
||||||
(
|
(
|
||||||
"WatchID" BIGINT,
|
"WatchID" BIGINT,
|
||||||
@ -111,7 +168,22 @@ CREATE TABLE hits
|
|||||||
"URLHash" BIGINT,
|
"URLHash" BIGINT,
|
||||||
"CLID" INTEGER
|
"CLID" INTEGER
|
||||||
);
|
);
|
||||||
|
```
|
||||||
|
|
||||||
|
Convert the dataset, prepare the list of fields for SELECT:
|
||||||
|
|
||||||
|
```
|
||||||
|
clickhouse-client --query "SHOW CREATE TABLE hits_100m" --format TSVRaw |
|
||||||
|
tr '`' '"' |
|
||||||
|
sed -r -e '
|
||||||
|
s/"(\w+)" U?Int([0-9]+)/toInt\2(\1)/;
|
||||||
|
s/"(\w+)" (Fixed)?String(\([0-9]+\))?/toValidUTF8(toString(\1))/;
|
||||||
|
s/"(\w+)" \w+/\1/'
|
||||||
|
```
|
||||||
|
|
||||||
|
The resulting SELECT query for data preparation:
|
||||||
|
|
||||||
|
```
|
||||||
SELECT
|
SELECT
|
||||||
toInt64(WatchID),
|
toInt64(WatchID),
|
||||||
toInt8(JavaEnable),
|
toInt8(JavaEnable),
|
||||||
@ -221,11 +293,39 @@ SELECT
|
|||||||
FROM hits_100m_obfuscated
|
FROM hits_100m_obfuscated
|
||||||
INTO OUTFILE '/home/milovidov/example_datasets/hits_100m_obfuscated.csv'
|
INTO OUTFILE '/home/milovidov/example_datasets/hits_100m_obfuscated.csv'
|
||||||
FORMAT CSV;
|
FORMAT CSV;
|
||||||
|
```
|
||||||
|
|
||||||
|
Upload data to OmniSci:
|
||||||
|
```
|
||||||
|
/opt/omnisci/bin/omnisql -t -p HyperInteractive
|
||||||
|
```
|
||||||
|
Run CREATE TABLE statement, then run:
|
||||||
|
```
|
||||||
COPY hits FROM '/home/milovidov/example_datasets/hits_100m_obfuscated.csv' WITH (HEADER = 'false');
|
COPY hits FROM '/home/milovidov/example_datasets/hits_100m_obfuscated.csv' WITH (HEADER = 'false');
|
||||||
|
```
|
||||||
|
|
||||||
|
Data loading took
|
||||||
|
```
|
||||||
336639 ms
|
336639 ms
|
||||||
|
```
|
||||||
|
on a server (Linux Ubuntu, Xeon E5-2560v2, 32 logical CPU, 128 GiB RAM, 8xHDD RAID-5, 40 TB).
|
||||||
|
|
||||||
|
Run benchmark:
|
||||||
|
|
||||||
|
```
|
||||||
./benchmark.sh
|
./benchmark.sh
|
||||||
|
```
|
||||||
|
|
||||||
grep -oP 'Total time: \d+' log.txt | grep -oP '\d+' | awk '{ if (i % 3 == 0) { a = $1 } else if (i % 3 == 1) { b = $1 } else if (i % 3 == 2) { c = $1; print "[" a / 1000 ", " b / 1000 ", " c / 1000 "]," }; ++i; }'
|
Prepare the result to paste into JSON:
|
||||||
|
|
||||||
|
```
|
||||||
|
grep -oP 'Total time: \d+' log.txt |
|
||||||
|
grep -oP '\d+' |
|
||||||
|
awk '{
|
||||||
|
if (i % 3 == 0) { a = $1 }
|
||||||
|
else if (i % 3 == 1) { b = $1 }
|
||||||
|
else if (i % 3 == 2) { c = $1; print "[" a / 1000 ", " b / 1000 ", " c / 1000 "]," };
|
||||||
|
++i; }'
|
||||||
|
```
|
||||||
|
|
||||||
|
And fill out `[null, null, null]` for missing runs.
|
||||||
|
Loading…
Reference in New Issue
Block a user