Merge remote-tracking branch 'upstream/master' into fix27

proller 2019-12-27 20:57:47 +03:00
commit 7fd8452d56
10 changed files with 164 additions and 39 deletions

View File

@@ -261,6 +261,10 @@ void PointInPolygonWithGrid<CoordinateType>::buildGrid()
     for (size_t row = 0; row < grid_size; ++row)
     {
+#pragma GCC diagnostic push
+#if !__clang__
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#endif
         CoordinateType y_min = min_corner.y() + row * cell_height;
         CoordinateType y_max = min_corner.y() + (row + 1) * cell_height;
@@ -268,6 +272,7 @@ void PointInPolygonWithGrid<CoordinateType>::buildGrid()
         {
             CoordinateType x_min = min_corner.x() + col * cell_width;
             CoordinateType x_max = min_corner.x() + (col + 1) * cell_width;
+#pragma GCC diagnostic pop
             Box cell_box(Point(x_min, y_min), Point(x_max, y_max));
             Polygon cell_bound;

View File

@@ -429,21 +429,30 @@ IProcessor::Status SortingAggregatedTransform::prepare()
                 continue;
             }

-            all_finished = false;
+            //all_finished = false;
             in->setNeeded();

             if (!in->hasData())
             {
                 need_data = true;
+                all_finished = false;
                 continue;
             }

             auto chunk = in->pull();
+            addChunk(std::move(chunk), input_num);
+
+            if (in->isFinished())
+            {
+                is_input_finished[input_num] = true;
+            }
+            else
+            {
                 /// If chunk was pulled, then we need data from this port.
                 need_data = true;
-            addChunk(std::move(chunk), input_num);
+                all_finished = false;
+            }
         }

         if (pushed_to_output)

View File

@@ -1,9 +1,17 @@
 # docker build -t yandex/clickhouse-performance-comparison .
-FROM alpine
-RUN apk update && apk add --no-cache bash wget python3 python3-dev g++
-RUN pip3 --no-cache-dir install clickhouse_driver
-RUN apk del g++ python3-dev
+FROM ubuntu:18.04
+RUN apt-get update \
+    && apt-get install --yes --no-install-recommends \
+        p7zip-full bash ncdu wget python3 python3-pip python3-dev g++ \
+    && pip3 --no-cache-dir install clickhouse_driver \
+    && apt-get purge --yes python3-dev g++ \
+    && apt-get autoremove --yes \
+    && apt-get clean

 COPY * /
+CMD /entrypoint.sh
+
+# docker run --network=host --volume <workspace>:/workspace --volume=<output>:/output -e LEFT_PR=<> -e LEFT_SHA=<> -e RIGHT_PR=<> -e RIGHT_SHA=<> yandex/clickhouse-performance-comparison

View File

@@ -6,8 +6,6 @@ trap "kill 0" EXIT
 script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

-mkdir left ||:
-mkdir right ||:
 mkdir db0 ||:

 left_pr=$1
@@ -18,19 +16,21 @@ right_sha=$4
 function download
 {
+    rm -r left ||:
+    mkdir left ||:
+    rm -r right ||:
+    mkdir right ||:
     la="$left_pr-$left_sha.tgz"
     ra="$right_pr-$right_sha.tgz"
-    wget -nd -c "https://clickhouse-builds.s3.yandex.net/$left_pr/$left_sha/performance/performance.tgz" -O "$la" && tar -C left --strip-components=1 -zxvf "$la" &
-    wget -nd -c "https://clickhouse-builds.s3.yandex.net/$right_pr/$right_sha/performance/performance.tgz" -O "$ra" && tar -C right --strip-components=1 -zxvf "$ra" &
-    cd db0 && wget -nd -c "https://s3.mds.yandex.net/clickhouse-private-datasets/hits_10m_single/partitions/hits_10m_single.tar" && tar -xvf hits_10m_single.tar &
-    cd db0 && wget -nd -c "https://s3.mds.yandex.net/clickhouse-private-datasets/hits_100m_single/partitions/hits_100m_single.tar" && tar -xvf hits_100m_single.tar &
-    cd db0 && wget -nd -c "https://clickhouse-datasets.s3.yandex.net/hits/partitions/hits_v1.tar" && tar -xvf hits_v1.tar &
-    cd db0 && wget -nd -c "https://clickhouse-datasets.s3.yandex.net/visits/partitions/visits_v1.tar" && tar -xvf visits_v1.tar &
+    wget -q -nd -c "https://clickhouse-builds.s3.yandex.net/$left_pr/$left_sha/performance/performance.tgz" -O "$la" && tar -C left --strip-components=1 -zxvf "$la" &
+    wget -q -nd -c "https://clickhouse-builds.s3.yandex.net/$right_pr/$right_sha/performance/performance.tgz" -O "$ra" && tar -C right --strip-components=1 -zxvf "$ra" &
+    cd db0 && wget -q -nd -c "https://s3.mds.yandex.net/clickhouse-private-datasets/hits_10m_single/partitions/hits_10m_single.tar" && tar -xvf hits_10m_single.tar &
+    cd db0 && wget -q -nd -c "https://s3.mds.yandex.net/clickhouse-private-datasets/hits_100m_single/partitions/hits_100m_single.tar" && tar -xvf hits_100m_single.tar &
+    cd db0 && wget -q -nd -c "https://clickhouse-datasets.s3.yandex.net/hits/partitions/hits_v1.tar" && tar -xvf hits_v1.tar &
     wait

     # Use hardlinks instead of copying
+    rm -r left/db ||:
+    rm -r right/db ||:
     cp -al db0/ left/db/
     cp -al db0/ right/db/
 }
@@ -40,16 +40,26 @@ function configure
 {
     sed -i 's/<tcp_port>9000/<tcp_port>9001/g' right/config/config.xml
-    cat > right/config/config.d/perf-test-tweaks.xml <<EOF
+    cat > right/config/config.d/zz-perf-test-tweaks.xml <<EOF
 <yandex>
     <logger>
         <console>true</console>
     </logger>
-    <text_log remove="remove"/>
+    <text_log remove="remove">
+        <table remove="remove"/>
+    </text_log>
+    <metric_log remove="remove">
+        <table remove="remove"/>
+    </metric_log>
 </yandex>
 EOF
-    cp right/config/config.d/perf-test-tweaks.xml left/config/config.d/perf-test-tweaks.xml
+    cp right/config/config.d/zz-perf-test-tweaks.xml left/config/config.d/zz-perf-test-tweaks.xml
+
+    rm left/config/config.d/metric_log.xml ||:
+    rm left/config/config.d/text_log.xml ||:
+    rm right/config/config.d/metric_log.xml ||:
+    rm right/config/config.d/text_log.xml ||:
 }

 configure
@@ -78,6 +88,11 @@ function restart
     while ! right/clickhouse client --port 9001 --query "select 1" ; do kill -0 $right_pid ; echo . ; sleep 1 ; done
     echo right ok
+
+    right/clickhouse client --port 9001 --query "create database test" ||:
+    right/clickhouse client --port 9001 --query "rename table datasets.hits_v1 to test.hits" ||:
+    left/clickhouse client --port 9000 --query "create database test" ||:
+    left/clickhouse client --port 9000 --query "rename table datasets.hits_v1 to test.hits" ||:
 }

 restart
@@ -90,13 +105,14 @@ function run_tests
     for test in left/performance/*.xml
     do
         test_name=$(basename $test ".xml")
-        "$script_dir/perf.py" "$test" > "$test_name-raw.tsv" || continue
+        "$script_dir/perf.py" "$test" > "$test_name-raw.tsv" 2> "$test_name-err.log" || continue
         right/clickhouse local --file "$test_name-raw.tsv" --structure 'query text, run int, version UInt32, time float' --query "$(cat $script_dir/eqmed.sql)" > "$test_name-report.tsv"
     done
 }

 run_tests

 # Analyze results
-result_structure="fail int, left float, right float, diff float, rd Array(float), query text"
+result_structure="left float, right float, diff float, rd Array(float), query text"
 right/clickhouse local --file '*-report.tsv' -S "$result_structure" --query "select * from table where rd[3] > 0.05 order by rd[3] desc" > flap-prone.tsv
-right/clickhouse local --file '*-report.tsv' -S "$result_structure" --query "select * from table where diff > 0.05 and diff > rd[3] order by diff desc" > failed.tsv
+right/clickhouse local --file '*-report.tsv' -S "$result_structure" --query "select * from table where diff > 0.05 and diff > rd[3] order by diff desc" > bad-perf.tsv
+grep Exception:[^:] *-err.log > run-errors.log
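The two `clickhouse local` filters above encode the verdict logic: a query whose third run-to-run deviation quantile `rd[3]` exceeds 5% is too noisy to judge, and a slowdown only counts as bad when it exceeds both 5% and that noise level. A rough Python sketch of the same rule, with invented row values (field names follow `result_structure`; ClickHouse arrays are 1-based, so `rd[3]` becomes `rd[2]` here):

```python
# Invented example rows: (left, right, diff, rd, query).
rows = [
    (0.105, 0.121, 0.15, [0.01, 0.02, 0.03], "SELECT count() FROM test.hits"),
    (0.230, 0.232, 0.01, [0.04, 0.06, 0.09], "SELECT uniq(UserID) FROM test.hits"),
]

# flap-prone.tsv: timing too noisy to trust a verdict (rd[3] > 0.05).
flap_prone = [r for r in rows if r[3][2] > 0.05]

# bad-perf.tsv: the difference is large and clears the noise floor.
bad_perf = [r for r in rows if r[2] > 0.05 and r[2] > r[3][2]]

print(flap_prone)  # second row: too noisy
print(bad_perf)    # first row: a real-looking slowdown
```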

View File

@@ -0,0 +1,8 @@
+#!/bin/bash
+cd /workspace
+
+../compare.sh $LEFT_PR $LEFT_SHA $RIGHT_PR $RIGHT_SHA > compare.log 2>&1
+
+7z a /output/output.7z *.log *.tsv
+cp compare.log /output

View File

@@ -38,4 +38,4 @@ from
     group by query
     ) original_medians_array
 where rd.query = original_medians_array.query
-order by fail desc, rd_quantiles_percent[3] asc;
+order by rd_quantiles_percent[3] desc;

View File

@@ -15,8 +15,13 @@ root = tree.getroot()
 # Check main metric
 main_metric_element = root.find('main_metric/*')
-if main_metric_element and main_metric_element.tag != 'min_time':
-    raise Exception('Only the min_time main metric is supported. This test uses \'{}\''.format(main_metric))
+if main_metric_element is not None and main_metric_element.tag != 'min_time':
+    raise Exception('Only the min_time main metric is supported. This test uses \'{}\''.format(main_metric_element.tag))
+
+# FIXME another way to detect infinite tests. They should have an appropriate main_metric but sometimes they don't.
+infinite_sign = root.find('.//average_speed_not_changing_for_ms')
+if infinite_sign is not None:
+    raise Exception('Looks like the test is infinite (sign 1)')

 # Open connections
 servers = [{'host': 'localhost', 'port': 9000, 'client_name': 'left'}, {'host': 'localhost', 'port': 9001, 'client_name': 'right'}]
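The switch to `is not None` is not cosmetic: `xml.etree.ElementTree` elements test as falsy when they have no child elements, so the old `if main_metric_element and ...` silently skipped a found-but-childless element (the error message now also prints the actual tag via `main_metric_element.tag`). A standalone illustration of the pitfall:

```python
import xml.etree.ElementTree as ET

root = ET.fromstring('<test><main_metric><min_time/></main_metric></test>')
elem = root.find('main_metric/*')  # finds the childless <min_time/> element

print(elem is not None)  # True  -- the element exists
print(bool(elem))        # False -- no children, so truth-testing misleads
                         # (newer Pythons warn about exactly this)
```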
@@ -24,12 +29,9 @@ connections = [clickhouse_driver.Client(**server) for server in servers]
 # Check tables that should exist
 tables = [e.text for e in root.findall('preconditions/table_exists')]
-if tables:
+for t in tables:
     for c in connections:
-        tables_list = ", ".join("'{}'".format(t) for t in tables)
-        res = c.execute("select t from values('t text', {}) anti join system.tables on database = currentDatabase() and name = t".format(tables_list))
-        if res:
-            raise Exception('Some tables are not found: {}'.format(res))
+        res = c.execute("select 1 from {}".format(t))

 # Apply settings
 settings = root.findall('settings/*')
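The rewritten precondition check leans on the server itself: selecting from a missing table makes `clickhouse_driver` raise, which aborts the run before any timing queries execute. A minimal sketch of the same idea, assuming the two servers that compare.sh starts on ports 9000 and 9001 (the table name and the `limit 1`, added to keep the probe cheap, are my own):

```python
import clickhouse_driver

connections = [clickhouse_driver.Client(host='localhost', port=9000),
               clickhouse_driver.Client(host='localhost', port=9001)]

for t in ['test.hits']:  # hypothetical precondition table
    for c in connections:
        # Raises (e.g. "Table ... doesn't exist") if the table is missing,
        # failing the test early on either server.
        c.execute('select 1 from {} limit 1'.format(t))
```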
@@ -76,6 +78,9 @@ for c in connections:
         c.execute(q)

 # Run test queries
+def tsv_escape(s):
+    return s.replace('\\', '\\\\').replace('\t', '\\t').replace('\n', '\\n').replace('\r','')
+
 test_query_templates = [q.text for q in root.findall('query')]
 test_queries = substitute_parameters(test_query_templates, parameter_combinations)
@@ -83,7 +88,7 @@ for q in test_queries:
     for run in range(0, 7):
         for conn_index, c in enumerate(connections):
             res = c.execute(q)
-            print(q + '\t' + str(run) + '\t' + str(conn_index) + '\t' + str(c.last_query.elapsed))
+            print(tsv_escape(q) + '\t' + str(run) + '\t' + str(conn_index) + '\t' + str(c.last_query.elapsed))

 # Run drop queries
 drop_query_templates = [q.text for q in root.findall('drop_query')]
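perf.py writes one query per TSV row, so the new `tsv_escape` matters whenever a test query contains tabs or newlines: unescaped, they would shear a row apart and break the `clickhouse local` parsing downstream. A quick self-contained check of the function as committed:

```python
def tsv_escape(s):
    return s.replace('\\', '\\\\').replace('\t', '\\t').replace('\n', '\\n').replace('\r','')

q = "SELECT 'a\tb'\nFROM test.hits\r"
print(tsv_escape(q))  # SELECT 'a\tb'\nFROM test.hits  -- one physical line
```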

View File

@@ -21,4 +21,36 @@ If you use Oracle through the ODBC driver as a source of external dictionaries,
 NLS_LANG=RUSSIAN_RUSSIA.UTF8
 ```
+
+## How to export data from ClickHouse to a file?
+
+### Using INTO OUTFILE Clause
+
+Add the [INTO OUTFILE](../query_language/select/#into-outfile-clause) clause to your query.
+
+For example:
+
+```sql
+SELECT * FROM table INTO OUTFILE 'file'
+```
+
+By default, ClickHouse uses the [TabSeparated](../interfaces/formats.md#tabseparated) format for output data. To select the [data format](../interfaces/formats.md), use the [FORMAT clause](../query_language/select/#format-clause).
+
+For example:
+
+```sql
+SELECT * FROM table INTO OUTFILE 'file' FORMAT CSV
+```
+
+### Using a File-Engine Table
+
+See [File](../operations/table_engines/file.md).
+
+### Using Command-Line Redirection
+
+```bash
+$ clickhouse-client --query "SELECT * from table" > result.txt
+```
+
+See [clickhouse-client](../interfaces/cli.md).
+
 [Original article](https://clickhouse.yandex/docs/en/faq/general/) <!--hide-->

View File

@@ -29,6 +29,7 @@ The supported formats are:
 | [PrettySpace](#prettyspace) | ✗ | ✔ |
 | [Protobuf](#protobuf) | ✔ | ✔ |
 | [Parquet](#data-format-parquet) | ✔ | ✔ |
+| [ORC](#data-format-orc) | ✔ | ✗ |
 | [RowBinary](#rowbinary) | ✔ | ✔ |
 | [RowBinaryWithNamesAndTypes](#rowbinarywithnamesandtypes) | ✔ | ✔ |
 | [Native](#native) | ✔ | ✔ |
@@ -954,16 +955,57 @@ Data types of a ClickHouse table columns can differ from the corresponding field
 You can insert Parquet data from a file into ClickHouse table by the following command:

 ```bash
-cat {filename} | clickhouse-client --query="INSERT INTO {some_table} FORMAT Parquet"
+$ cat {filename} | clickhouse-client --query="INSERT INTO {some_table} FORMAT Parquet"
 ```

 You can select data from a ClickHouse table and save them into some file in the Parquet format by the following command:

-```sql
-clickhouse-client --query="SELECT * FROM {some_table} FORMAT Parquet" > {some_file.pq}
+```bash
+$ clickhouse-client --query="SELECT * FROM {some_table} FORMAT Parquet" > {some_file.pq}
 ```

-To exchange data with the Hadoop, you can use [HDFS table engine](../operations/table_engines/hdfs.md).
+To exchange data with Hadoop, you can use [HDFS table engine](../operations/table_engines/hdfs.md).
+
+## ORC {#data-format-orc}
+
+[Apache ORC](https://orc.apache.org/) is a columnar storage format widespread in the Hadoop ecosystem. ClickHouse supports only read operations for this format.
+
+### Data Types Matching
+
+The table below shows supported data types and how they match ClickHouse [data types](../data_types/index.md) in `INSERT` queries.
+
+| ORC data type (`INSERT`) | ClickHouse data type |
+| ------------------------ | -------------------- |
+| `UINT8`, `BOOL` | [UInt8](../data_types/int_uint.md) |
+| `INT8` | [Int8](../data_types/int_uint.md) |
+| `UINT16` | [UInt16](../data_types/int_uint.md) |
+| `INT16` | [Int16](../data_types/int_uint.md) |
+| `UINT32` | [UInt32](../data_types/int_uint.md) |
+| `INT32` | [Int32](../data_types/int_uint.md) |
+| `UINT64` | [UInt64](../data_types/int_uint.md) |
+| `INT64` | [Int64](../data_types/int_uint.md) |
+| `FLOAT`, `HALF_FLOAT` | [Float32](../data_types/float.md) |
+| `DOUBLE` | [Float64](../data_types/float.md) |
+| `DATE32` | [Date](../data_types/date.md) |
+| `DATE64`, `TIMESTAMP` | [DateTime](../data_types/datetime.md) |
+| `STRING`, `BINARY` | [String](../data_types/string.md) |
+| `DECIMAL` | [Decimal](../data_types/decimal.md) |
+
+ClickHouse supports configurable precision of the `Decimal` type. The `INSERT` query treats the ORC `DECIMAL` type as the ClickHouse `Decimal128` type.
+
+Unsupported ORC data types: `TIME32`, `FIXED_SIZE_BINARY`, `JSON`, `UUID`, `ENUM`.
+
+Data types of ClickHouse table columns can differ from the corresponding fields of the ORC data inserted. When inserting data, ClickHouse interprets data types according to the table above and then [casts](../query_language/functions/type_conversion_functions/#type_conversion_function-cast) the data to the data type set for the ClickHouse table column.
+
+### Inserting Data
+
+You can insert ORC data from a file into a ClickHouse table by the following command:
+
+```bash
+$ cat {filename} | clickhouse-client --query="INSERT INTO {some_table} FORMAT ORC"
+```
+
+To exchange data with Hadoop, you can use [HDFS table engine](../operations/table_engines/hdfs.md).

 ## Format Schema {#formatschema}

View File

@@ -1120,7 +1120,7 @@ The structure of results (the number and type of columns) must match for the queries.
 Queries that are parts of UNION ALL can't be enclosed in brackets. ORDER BY and LIMIT are applied to separate queries, not to the final result. If you need to apply a conversion to the final result, you can put all the queries with UNION ALL in a subquery in the FROM clause.

-### INTO OUTFILE Clause
+### INTO OUTFILE Clause {#into-outfile-clause}

 Add the `INTO OUTFILE filename` clause (where filename is a string literal) to redirect query output to the specified file.

 In contrast to MySQL, the file is created on the client side. The query will fail if a file with the same filename already exists.
@@ -1128,7 +1128,7 @@ This functionality is available in the command-line client and clickhouse-local.
 The default output format is TabSeparated (the same as in the command-line client batch mode).

-### FORMAT Clause
+### FORMAT Clause {#format-clause}

 Specify 'FORMAT format' to get data in any specified format.
 You can use this for convenience, or for creating dumps.