Merge branch 'master' into sparse-serialization

Anton Popov 2021-11-02 06:26:18 +03:00 committed by GitHub
commit 1628f50e51
121 changed files with 3091 additions and 1456 deletions


@ -207,6 +207,35 @@ jobs:
docker kill $(docker ps -q) ||:
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr $TEMP_PATH
StressTestDebug:
needs: [BuilderDebDebug]
runs-on: [self-hosted, stress-tester]
steps:
- name: Download json reports
uses: actions/download-artifact@v2
with:
path: ${{runner.temp}}/reports_dir
- name: Check out repository code
uses: actions/checkout@v2
- name: Stress test
env:
TEMP_PATH: ${{runner.temp}}/stress_debug
REPORTS_PATH: ${{runner.temp}}/reports_dir
CHECK_NAME: 'Stress tests (debug, actions)'
REPO_COPY: ${{runner.temp}}/stress_debug/ClickHouse
REQUIRED_BUILD_NUMBER: 7
run: |
sudo rm -fr $TEMP_PATH
mkdir -p $TEMP_PATH
cp -r $GITHUB_WORKSPACE $TEMP_PATH
cd $REPO_COPY/tests/ci
python3 stress_check.py "$CHECK_NAME" $REQUIRED_BUILD_NUMBER
- name: Cleanup
if: always()
run: |
docker kill $(docker ps -q) ||:
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr $TEMP_PATH
FastTest:
needs: DockerHubPush
if: ${{ !contains(github.event.pull_request.labels.*.name, 'pr-documentation') && !contains(github.event.pull_request.labels.*.name, 'pr-doc-fix') }}
@ -231,7 +260,7 @@ jobs:
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr $TEMP_PATH
FinishCheck:
needs: [StyleCheck, DockerHubPush, CheckLabels, BuilderReport, FastTest, FunctionalStatelessTestDebug, FunctionalStatefulTestDebug, DocsCheck]
needs: [StyleCheck, DockerHubPush, CheckLabels, BuilderReport, FastTest, FunctionalStatelessTestDebug, FunctionalStatefulTestDebug, DocsCheck, StressTestDebug]
runs-on: [self-hosted, style-checker]
steps:
- name: Check out repository code

.gitmodules vendored

@ -76,7 +76,7 @@
url = https://github.com/ClickHouse-Extras/libcxxabi.git
[submodule "contrib/snappy"]
path = contrib/snappy
url = https://github.com/google/snappy
url = https://github.com/ClickHouse-Extras/snappy.git
[submodule "contrib/cppkafka"]
path = contrib/cppkafka
url = https://github.com/mfontanini/cppkafka.git


@ -2,11 +2,11 @@
# NOTE: has nothing common with DBMS_TCP_PROTOCOL_VERSION,
# only DBMS_TCP_PROTOCOL_VERSION should be incremented on protocol changes.
SET(VERSION_REVISION 54456)
SET(VERSION_REVISION 54457)
SET(VERSION_MAJOR 21)
SET(VERSION_MINOR 11)
SET(VERSION_MINOR 12)
SET(VERSION_PATCH 1)
SET(VERSION_GITHASH 7a4a0b0edef0ad6e0aa662cd3b90c3f4acf796e7)
SET(VERSION_DESCRIBE v21.11.1.1-prestable)
SET(VERSION_STRING 21.11.1.1)
SET(VERSION_GITHASH 503a418dedf0011e9040c3a1b6913e0b5488be4c)
SET(VERSION_DESCRIBE v21.12.1.1-prestable)
SET(VERSION_STRING 21.12.1.1)
# end of autochange

contrib/snappy vendored

@ -1 +1 @@
Subproject commit 3f194acb57e0487531c96b97af61dcbd025a78a3
Subproject commit fb057edfed820212076239fd32cb2ff23e9016bf

debian/changelog vendored

@ -1,5 +1,5 @@
clickhouse (21.11.1.1) unstable; urgency=low
clickhouse (21.12.1.1) unstable; urgency=low
* Modified source code
-- clickhouse-release <clickhouse-release@yandex-team.ru> Thu, 09 Sep 2021 12:03:26 +0300
-- clickhouse-release <clickhouse-release@yandex-team.ru> Tue, 02 Nov 2021 00:56:42 +0300


@ -1,7 +1,7 @@
FROM ubuntu:18.04
ARG repository="deb https://repo.clickhouse.com/deb/stable/ main/"
ARG version=21.11.1.*
ARG version=21.12.1.*
RUN sed -i 's|http://archive|http://ru.archive|g' /etc/apt/sources.list


@ -1,7 +1,7 @@
FROM ubuntu:20.04
ARG repository="deb https://repo.clickhouse.com/deb/stable/ main/"
ARG version=21.11.1.*
ARG version=21.12.1.*
ARG gosu_ver=1.10
# set non-empty deb_location_url url to create a docker image


@ -1,7 +1,7 @@
FROM ubuntu:18.04
ARG repository="deb https://repo.clickhouse.com/deb/stable/ main/"
ARG version=21.11.1.*
ARG version=21.12.1.*
RUN apt-get update && \
apt-get install -y apt-transport-https dirmngr && \


@ -634,7 +634,7 @@ create view query_display_names as select * from
create view partial_query_times as select * from
file('analyze/partial-query-times.tsv', TSVWithNamesAndTypes,
'test text, query_index int, time_stddev float, time_median float')
'test text, query_index int, time_stddev float, time_median double')
;
-- Report for partial queries that we could only run on the new server (e.g.


@ -26,4 +26,6 @@ COPY ./stress /stress
COPY run.sh /
ENV DATASETS="hits visits"
ENV S3_URL="https://clickhouse-datasets.s3.yandex.net"
CMD ["/bin/bash", "/run.sh"]


@ -112,7 +112,7 @@ configure
start
# shellcheck disable=SC2086 # No quotes because I want to split it into words.
/s3downloader --dataset-names $DATASETS
/s3downloader --url-prefix "$S3_URL" --dataset-names $DATASETS
chmod 777 -R /var/lib/clickhouse
clickhouse-client --query "ATTACH DATABASE IF NOT EXISTS datasets ENGINE = Ordinary"
clickhouse-client --query "CREATE DATABASE IF NOT EXISTS test"


@ -16,10 +16,13 @@ The supported formats are:
| [TabSeparatedRaw](#tabseparatedraw) | ✔ | ✔ |
| [TabSeparatedWithNames](#tabseparatedwithnames) | ✔ | ✔ |
| [TabSeparatedWithNamesAndTypes](#tabseparatedwithnamesandtypes) | ✔ | ✔ |
| [TabSeparatedRawWithNames](#tabseparatedrawwithnames) | ✔ | ✔ |
| [TabSeparatedRawWithNamesAndTypes](#tabseparatedrawwithnamesandtypes) | ✔ | ✔ |
| [Template](#format-template) | ✔ | ✔ |
| [TemplateIgnoreSpaces](#templateignorespaces) | ✔ | ✗ |
| [CSV](#csv) | ✔ | ✔ |
| [CSVWithNames](#csvwithnames) | ✔ | ✔ |
| [CSVWithNamesAndTypes](#csvwithnamesandtypes) | ✔ | ✔ |
| [CustomSeparated](#format-customseparated) | ✔ | ✔ |
| [Values](#data-format-values) | ✔ | ✔ |
| [Vertical](#vertical) | ✗ | ✔ |
@ -33,8 +36,10 @@ The supported formats are:
| [JSONStringsEachRow](#jsonstringseachrow) | ✔ | ✔ |
| [JSONStringsEachRowWithProgress](#jsonstringseachrowwithprogress) | ✗ | ✔ |
| [JSONCompactEachRow](#jsoncompacteachrow) | ✔ | ✔ |
| [JSONCompactEachRowWithNames](#jsoncompacteachrowwithnames) | ✔ | ✔ |
| [JSONCompactEachRowWithNamesAndTypes](#jsoncompacteachrowwithnamesandtypes) | ✔ | ✔ |
| [JSONCompactStringsEachRow](#jsoncompactstringseachrow) | ✔ | ✔ |
| [JSONCompactStringsEachRowWithNames](#jsoncompactstringseachrowwithnames) | ✔ | ✔ |
| [JSONCompactStringsEachRowWithNamesAndTypes](#jsoncompactstringseachrowwithnamesandtypes) | ✔ | ✔ |
| [TSKV](#tskv) | ✔ | ✔ |
| [Pretty](#pretty) | ✗ | ✔ |
@ -51,6 +56,7 @@ The supported formats are:
| [ArrowStream](#data-format-arrow-stream) | ✔ | ✔ |
| [ORC](#data-format-orc) | ✔ | ✔ |
| [RowBinary](#rowbinary) | ✔ | ✔ |
| [RowBinaryWithNames](#rowbinarywithnames) | ✔ | ✔ |
| [RowBinaryWithNamesAndTypes](#rowbinarywithnamesandtypes) | ✔ | ✔ |
| [Native](#native) | ✔ | ✔ |
| [Null](#null) | ✗ | ✔ |
@ -126,6 +132,9 @@ Arrays are written as a list of comma-separated values in square brackets. Numbe
[NULL](../sql-reference/syntax.md) is formatted as `\N`.
If setting [input_format_tsv_empty_as_default](../operations/settings/settings.md#settings-input_format_tsv_empty_as_default) is enabled,
empty input fields are replaced with default values. For complex default expressions [input_format_defaults_for_omitted_fields](../operations/settings/settings.md#settings-input_format_defaults_for_omitted_fields) must be enabled too.
Each element of [Nested](../sql-reference/data-types/nested-data-structures/nested.md) structures is represented as an array.
For example:
@ -164,17 +173,35 @@ This format is also available under the name `TSVRaw`.
## TabSeparatedWithNames {#tabseparatedwithnames}
Differs from the `TabSeparated` format in that the column names are written in the first row.
During parsing, the first row is expected to contain the column names. You can use column names to determine their position and to check their correctness.
If the setting [input_format_with_names_use_header](../operations/settings/settings.md#settings-input_format_with_names_use_header) is set to 1,
the columns from the input data will be mapped to the columns of the table by their names; columns with unknown names will be skipped if the setting [input_format_skip_unknown_fields](../operations/settings/settings.md#settings-input_format_skip_unknown_fields) is set to 1.
Otherwise, the first row will be skipped.
This format is also available under the name `TSVWithNames`.
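To make the name-based mapping concrete, here is a minimal C++ sketch (illustrative names only, not the actual ClickHouse implementation) of how header names can be mapped to table column positions, skipping unknown names when `input_format_skip_unknown_fields` is enabled:
```cpp
#include <iostream>
#include <optional>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>

// Hypothetical helper: map header names from the input to table column positions.
// Unknown names become std::nullopt and their values are skipped,
// mirroring input_format_skip_unknown_fields = 1.
std::vector<std::optional<size_t>> mapHeaderToTable(
    const std::vector<std::string> & header_names,
    const std::unordered_map<std::string, size_t> & table_columns,
    bool skip_unknown)
{
    std::vector<std::optional<size_t>> mapping;
    mapping.reserve(header_names.size());
    for (const auto & name : header_names)
    {
        auto it = table_columns.find(name);
        if (it != table_columns.end())
            mapping.emplace_back(it->second);   // known column: remember its position
        else if (skip_unknown)
            mapping.emplace_back(std::nullopt); // unknown column: the value will be skipped
        else
            throw std::runtime_error("Unknown column in header: " + name);
    }
    return mapping;
}

int main()
{
    std::unordered_map<std::string, size_t> table{{"id", 0}, {"name", 1}};
    auto mapping = mapHeaderToTable({"name", "extra", "id"}, table, /*skip_unknown=*/ true);
    for (const auto & m : mapping)
        std::cout << (m ? std::to_string(*m) : std::string("skip")) << '\n'; // prints: 1, skip, 0
}
```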
## TabSeparatedWithNamesAndTypes {#tabseparatedwithnamesandtypes}
Differs from the `TabSeparated` format in that the column names are written to the first row, while the column types are in the second row.
During parsing, the first and second rows are completely ignored.
The first row with names is processed the same way as in `TabSeparatedWithNames` format.
If the setting [input_format_with_types_use_header](../operations/settings/settings.md#settings-input_format_with_types_use_header) is set to 1,
the types from the input data will be compared with the types of the corresponding columns of the table. Otherwise, the second row will be skipped.
This format is also available under the name `TSVWithNamesAndTypes`.
## TabSeparatedRawWithNames {#tabseparatedrawwithnames}
Differs from `TabSeparatedWithNames` format in that the rows are written without escaping.
When parsing with this format, tabs or line feeds are not allowed inside a field.
This format is also available under the name `TSVRawWithNames`.
## TabSeparatedRawWithNamesAndTypes {#tabseparatedrawwithnamesandtypes}
Differs from `TabSeparatedWithNamesAndTypes` format in that the rows are written without escaping.
When parsing with this format, tabs or line feeds are not allowed inside a field.
This format is also available under the name `TSVRawWithNamesAndTypes`.
## Template {#format-template}
This format allows specifying a custom format string with placeholders for values with a specified escaping rule.
@ -195,7 +222,7 @@ where `delimiter_i` is a delimiter between values (`$` symbol can be escaped as
- `Raw` (without escaping, similarly to `TSVRaw`)
- `None` (no escaping rule, see further)
If an escaping rule is omitted, then `None` will be used. `XML` and `Raw` are suitable only for output.
If an escaping rule is omitted, then `None` will be used. `XML` is suitable only for output.
So, for the following format string:
@ -375,9 +402,8 @@ $ clickhouse-client --format_csv_delimiter="|" --query="INSERT INTO test.csv FOR
When parsing, all values can be parsed either with or without quotes. Both double and single quotes are supported. Rows can also be arranged without quotes. In this case, they are parsed up to the delimiter character or line feed (CR or LF). In violation of the RFC, when parsing rows without quotes, the leading and trailing spaces and tabs are ignored. For the line feed, Unix (LF), Windows (CR LF) and Mac OS Classic (CR) types are all supported.
Empty unquoted input values are replaced with default values for the respective columns, if
[input_format_defaults_for_omitted_fields](../operations/settings/settings.md#session_settings-input_format_defaults_for_omitted_fields)
is enabled.
If setting [input_format_csv_empty_as_default](../operations/settings/settings.md#settings-input_format_csv_empty_as_default) is enabled,
empty unquoted input values are replaced with default values. For complex default expressions [input_format_defaults_for_omitted_fields](../operations/settings/settings.md#settings-input_format_defaults_for_omitted_fields) must be enabled too.
`NULL` is formatted as `\N` or `NULL` or an empty unquoted string (see settings [input_format_csv_unquoted_null_literal_as_null](../operations/settings/settings.md#settings-input_format_csv_unquoted_null_literal_as_null) and [input_format_defaults_for_omitted_fields](../operations/settings/settings.md#session_settings-input_format_defaults_for_omitted_fields)).
@ -385,7 +411,11 @@ The CSV format supports the output of totals and extremes the same way as `TabSe
## CSVWithNames {#csvwithnames}
Also prints the header row, similar to [TabSeparatedWithNames](#tabseparatedwithnames).
Also prints the header row with column names, similar to [TabSeparatedWithNames](#tabseparatedwithnames).
## CSVWithNamesAndTypes {#csvwithnamesandtypes}
Also prints two header rows with column names and types, similar to [TabSeparatedWithNamesAndTypes](#tabseparatedwithnamesandtypes).
## CustomSeparated {#format-customseparated}
@ -657,10 +687,21 @@ Differs from `JSONEachRow`/`JSONStringsEachRow` in that ClickHouse will also yie
{"progress":{"read_rows":"3","read_bytes":"24","written_rows":"0","written_bytes":"0","total_rows_to_read":"3"}}
```
## JSONCompactEachRowWithNames {#jsoncompacteachrowwithnames}
Differs from `JSONCompactEachRow` format in that it also prints the header row with column names, similar to [TabSeparatedWithNames](#tabseparatedwithnames).
## JSONCompactEachRowWithNamesAndTypes {#jsoncompacteachrowwithnamesandtypes}
Differs from `JSONCompactEachRow` format in that it also prints two header rows with column names and types, similar to [TabSeparatedWithNamesAndTypes](#tabseparatedwithnamesandtypes).
## JSONCompactStringsEachRowWithNames {#jsoncompactstringseachrowwithnames}
Differs from `JSONCompactStringsEachRow` in that it also prints the header row with column names, similar to [TabSeparatedWithNames](#tabseparatedwithnames).
## JSONCompactStringsEachRowWithNamesAndTypes {#jsoncompactstringseachrowwithnamesandtypes}
Differs from `JSONCompactEachRow`/`JSONCompactStringsEachRow` in that the column names and types are written as the first two rows.
Differs from `JSONCompactStringsEachRow` in that it also prints two header rows with column names and types, similar to [TabSeparatedWithNamesAndTypes](#tabseparatedwithnamesandtypes).
```json
["'hello'", "multiply(42, number)", "range(5)"]
@ -910,6 +951,13 @@ Array is represented as a varint length (unsigned [LEB128](https://en.wikipedia.
For [NULL](../sql-reference/syntax.md#null-literal) support, an additional byte containing 1 or 0 is added before each [Nullable](../sql-reference/data-types/nullable.md) value. If 1, then the value is `NULL` and this byte is interpreted as a separate value. If 0, the value after the byte is not `NULL`.
## RowBinaryWithNames {#rowbinarywithnames}
Similar to [RowBinary](#rowbinary), but with added header:
- [LEB128](https://en.wikipedia.org/wiki/LEB128)-encoded number of columns (N)
- N `String`s specifying column names
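As a rough illustration (a standalone sketch, not ClickHouse code), this header can be read as a LEB128 varint column count followed by N names, where each name is itself a varint length followed by that many raw bytes:
```cpp
#include <cstdint>
#include <istream>
#include <string>
#include <vector>

// Read an unsigned LEB128 varint (sketch: no EOF or overflow handling).
uint64_t readVarUInt(std::istream & in)
{
    uint64_t value = 0;
    unsigned shift = 0;
    while (true)
    {
        int byte = in.get();
        value |= static_cast<uint64_t>(byte & 0x7F) << shift;
        if (!(byte & 0x80))
            return value;
        shift += 7;
    }
}

// Read the RowBinaryWithNames header: a varint column count, then N names,
// each encoded as a varint length followed by raw bytes.
std::vector<std::string> readRowBinaryWithNamesHeader(std::istream & in)
{
    uint64_t columns = readVarUInt(in);
    std::vector<std::string> names(columns);
    for (auto & name : names)
    {
        uint64_t length = readVarUInt(in);
        name.resize(length);
        if (length > 0)
            in.read(&name[0], static_cast<std::streamsize>(length));
    }
    return names;
}
```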
## RowBinaryWithNamesAndTypes {#rowbinarywithnamesandtypes}
Similar to [RowBinary](#rowbinary), but with added header:


@ -399,7 +399,7 @@ Default value: 1.
## input_format_defaults_for_omitted_fields {#session_settings-input_format_defaults_for_omitted_fields}
When performing `INSERT` queries, replace omitted input column values with default values of the respective columns. This option only applies to [JSONEachRow](../../interfaces/formats.md#jsoneachrow), [CSV](../../interfaces/formats.md#csv) and [TabSeparated](../../interfaces/formats.md#tabseparated) formats.
When performing `INSERT` queries, replace omitted input column values with default values of the respective columns. This option only applies to [JSONEachRow](../../interfaces/formats.md#jsoneachrow), [CSV](../../interfaces/formats.md#csv), [TabSeparated](../../interfaces/formats.md#tabseparated) formats and formats with `WithNames`/`WithNamesAndTypes` suffixes.
!!! note "Note"
When this option is enabled, extended table metadata are sent from server to client. It consumes additional computing resources on the server and can reduce performance.
@ -417,6 +417,12 @@ When enabled, replace empty input fields in TSV with default values. For complex
Disabled by default.
## input_format_csv_empty_as_default {#settings-input-format-csv-empty-as-default}
When enabled, replace empty input fields in CSV with default values. For complex default expressions `input_format_defaults_for_omitted_fields` must be enabled too.
Enabled by default.
## input_format_tsv_enum_as_number {#settings-input_format_tsv_enum_as_number}
Enables or disables parsing enum values as enum ids for TSV input format.
@ -540,8 +546,40 @@ To improve insert performance, we recommend disabling this check if you are sure
Supported formats:
- [CSVWithNames](../../interfaces/formats.md#csvwithnames)
- [TabSeparatedWithNames](../../interfaces/formats.md#tabseparatedwithnames)
- [CSVWithNames](../../interfaces/formats.md#csvwithnames)
- [CSVWithNamesAndTypes](../../interfaces/formats.md#csvwithnamesandtypes)
- [TabSeparatedWithNames](../../interfaces/formats.md#tabseparatedwithnames)
- [TabSeparatedWithNamesAndTypes](../../interfaces/formats.md#tabseparatedwithnamesandtypes)
- [JSONCompactEachRowWithNames](../../interfaces/formats.md#jsoncompacteachrowwithnames)
- [JSONCompactEachRowWithNamesAndTypes](../../interfaces/formats.md#jsoncompacteachrowwithnamesandtypes)
- [JSONCompactStringsEachRowWithNames](../../interfaces/formats.md#jsoncompactstringseachrowwithnames)
- [JSONCompactStringsEachRowWithNamesAndTypes](../../interfaces/formats.md#jsoncompactstringseachrowwithnamesandtypes)
- [RowBinaryWithNames](../../interfaces/formats.md#rowbinarywithnames)
- [RowBinaryWithNamesAndTypes](../../interfaces/formats.md#rowbinarywithnamesandtypes)
Possible values:
- 0 — Disabled.
- 1 — Enabled.
Default value: 1.
## input_format_with_types_use_header {#settings-input-format-with-types-use-header}
Controls whether the format parser should check if the data types from the input data match the data types of the corresponding columns in the target table.
Supported formats:
- [CSVWithNames](../../interfaces/formats.md#csvwithnames)
- [CSVWithNamesAndTypes](../../interfaces/formats.md#csvwithnamesandtypes)
- [TabSeparatedWithNames](../../interfaces/formats.md#tabseparatedwithnames)
- [TabSeparatedWithNamesAndTypes](../../interfaces/formats.md#tabseparatedwithnamesandtypes)
- [JSONCompactEachRowWithNames](../../interfaces/formats.md#jsoncompacteachrowwithnames)
- [JSONCompactEachRowWithNamesAndTypes](../../interfaces/formats.md#jsoncompacteachrowwithnamesandtypes)
- [JSONCompactStringsEachRowWithNames](../../interfaces/formats.md#jsoncompactstringseachrowwithnames)
- [JSONCompactStringsEachRowWithNamesAndTypes](../../interfaces/formats.md#jsoncompactstringseachrowwithnamesandtypes)
- [RowBinaryWithNames](../../interfaces/formats.md#rowbinarywithnames)
- [RowBinaryWithNamesAndTypes](../../interfaces/formats.md#rowbinarywithnamesandtypes)
Possible values:


@ -8,7 +8,7 @@ toc_title: DATABASE
Creates a new database.
``` sql
CREATE DATABASE [IF NOT EXISTS] db_name [ON CLUSTER cluster] [ENGINE = engine(...)]
CREATE DATABASE [IF NOT EXISTS] db_name [ON CLUSTER cluster] [ENGINE = engine(...)] [COMMENT 'Comment']
```
## Clauses {#clauses}
@ -26,4 +26,33 @@ ClickHouse creates the `db_name` database on all the servers of a specified clus
### ENGINE {#engine}
[MySQL](../../../engines/database-engines/mysql.md) allows you to retrieve data from the remote MySQL server. By default, ClickHouse uses its own [database engine](../../../engines/database-engines/index.md). There's also a [lazy](../../../engines/database-engines/lazy.md) engine.
By default, ClickHouse uses its own [Atomic](../../../engines/database-engines/atomic.md) database engine. There are also [Lazy](../../../engines/database-engines/lazy.md), [MySQL](../../../engines/database-engines/mysql.md), [PostgreSQL](../../../engines/database-engines/postgresql.md), [MaterializedMySQL](../../../engines/database-engines/materialized-mysql.md), [MaterializedPostgreSQL](../../../engines/database-engines/materialized-postgresql.md), [Replicated](../../../engines/database-engines/replicated.md), [SQLite](../../../engines/database-engines/sqlite.md).
### COMMENT {#comment}
You can add a comment to the database when creating it.
The comment is supported for all database engines.
**Syntax**
``` sql
CREATE DATABASE db_name ENGINE = engine(...) COMMENT 'Comment'
```
**Example**
Query:
``` sql
CREATE DATABASE db_comment ENGINE = Memory COMMENT 'The temporary database';
SELECT name, comment FROM system.databases WHERE name = 'db_comment';
```
Result:
```text
┌─name───────┬─comment────────────────┐
│ db_comment │ The temporary database │
└────────────┴────────────────────────┘
```


@ -8,27 +8,51 @@ toc_title: "База данных"
Creates a database.
``` sql
CREATE DATABASE [IF NOT EXISTS] db_name [ON CLUSTER cluster] [ENGINE = engine(...)]
CREATE DATABASE [IF NOT EXISTS] db_name [ON CLUSTER cluster] [ENGINE = engine(...)] [COMMENT 'Comment']
```
### Clauses {#sektsii}
## Clauses {#clauses}
- `IF NOT EXISTS`
### IF NOT EXISTS {#if-not-exists}
If a database named `db_name` already exists, ClickHouse does not create the database and:
- Does not throw an exception if the clause is specified.
- Throws an exception if the clause is not specified.
If a database named `db_name` already exists, ClickHouse does not create the database and:
- `ON CLUSTER`
- Does not throw an exception if the clause is specified.
- Throws an exception if the clause is not specified.
ClickHouse creates the `db_name` database on all the servers of the specified cluster.
### ON CLUSTER {#on-cluster}
- `ENGINE`
ClickHouse creates the `db_name` database on all the servers of the specified cluster. For more information, see the [Distributed DDL](../../../sql-reference/distributed-ddl.md) section.
- MySQL
### ENGINE {#engine}
Allows retrieving data from a remote MySQL server.
By default, ClickHouse uses its own [Atomic](../../../engines/database-engines/atomic.md) database engine. There are also the [Lazy](../../../engines/database-engines/lazy.md), [MySQL](../../../engines/database-engines/mysql.md), [PostgreSQL](../../../engines/database-engines/postgresql.md), [MaterializedMySQL](../../../engines/database-engines/materialized-mysql.md), [MaterializedPostgreSQL](../../../engines/database-engines/materialized-postgresql.md), [Replicated](../../../engines/database-engines/replicated.md), [SQLite](../../../engines/database-engines/sqlite.md) database engines.
By default, ClickHouse uses its own database engine.
### COMMENT {#comment}
<!--hide-->
You can add a comment to the database when creating it.
The comment is supported for all database engines.
**Syntax**
``` sql
CREATE DATABASE db_name ENGINE = engine(...) COMMENT 'Comment'
```
**Example**
Query:
``` sql
CREATE DATABASE db_comment ENGINE = Memory COMMENT 'The temporary database';
SELECT name, comment FROM system.databases WHERE name = 'db_comment';
```
Result:
```text
┌─name───────┬─comment────────────────┐
│ db_comment │ The temporary database │
└────────────┴────────────────────────┘
```


@ -1007,9 +1007,6 @@ void Client::addOptions(OptionsDescription & options_description)
("max_client_network_bandwidth", po::value<int>(), "the maximum speed of data exchange over the network for the client in bytes per second.")
("compression", po::value<bool>(), "enable or disable compression")
("log-level", po::value<std::string>(), "client log level")
("server_logs_file", po::value<std::string>(), "put server logs into specified file")
("query-fuzzer-runs", po::value<int>()->default_value(0), "After executing every SELECT query, do random mutations in it and run again specified number of times. This is used for testing to discover unexpected corner cases.")
("interleave-queries-file", po::value<std::vector<std::string>>()->multitoken(),
"file path with queries to execute before every file from 'queries-file'; multiple files can be specified (--queries-file file1 file2...); this is needed to enable more aggressive fuzzing of newly added tests (see 'query-fuzzer-runs' option)")
@ -1125,8 +1122,6 @@ void Client::processOptions(const OptionsDescription & options_description,
max_client_network_bandwidth = options["max_client_network_bandwidth"].as<int>();
if (options.count("compression"))
config().setBool("compression", options["compression"].as<bool>());
if (options.count("server_logs_file"))
server_logs_file = options["server_logs_file"].as<std::string>();
if (options.count("no-warnings"))
config().setBool("no-warnings", true);


@ -4,6 +4,7 @@
#include <Poco/String.h>
#include <Poco/Logger.h>
#include <Poco/NullChannel.h>
#include <Poco/SimpleFileChannel.h>
#include <Databases/DatabaseMemory.h>
#include <Storages/System/attachSystemTables.h>
#include <Storages/System/attachInformationSchemaTables.h>
@ -182,23 +183,6 @@ void LocalServer::initialize(Poco::Util::Application & self)
auto loaded_config = config_processor.loadConfig();
config().add(loaded_config.configuration.duplicate(), PRIO_DEFAULT, false);
}
if (config().has("logger.console") || config().has("logger.level") || config().has("logger.log"))
{
// force enable logging
config().setString("logger", "logger");
// sensitive data rules are not used here
buildLoggers(config(), logger(), "clickhouse-local");
}
else
{
// Turn off server logging to stderr
if (!config().has("verbose"))
{
Poco::Logger::root().setLevel("none");
Poco::Logger::root().setChannel(Poco::AutoPtr<Poco::NullChannel>(new Poco::NullChannel()));
}
}
}
@ -497,6 +481,35 @@ void LocalServer::processConfig()
}
print_stack_trace = config().getBool("stacktrace", false);
auto logging = (config().has("logger.console")
|| config().has("logger.level")
|| config().has("log-level")
|| config().has("logger.log"));
auto file_logging = config().has("server_logs_file");
if (is_interactive && logging && !file_logging)
throw Exception("For interactive mode logging is allowed only with --server_logs_file option",
ErrorCodes::BAD_ARGUMENTS);
if (file_logging)
{
auto level = Poco::Logger::parseLevel(config().getString("log-level", "trace"));
Poco::Logger::root().setLevel(level);
Poco::Logger::root().setChannel(Poco::AutoPtr<Poco::SimpleFileChannel>(new Poco::SimpleFileChannel(server_logs_file)));
}
else if (logging)
{
// force enable logging
config().setString("logger", "logger");
// sensitive data rules are not used here
buildLoggers(config(), logger(), "clickhouse-local");
}
else
{
Poco::Logger::root().setLevel("none");
Poco::Logger::root().setChannel(Poco::AutoPtr<Poco::NullChannel>(new Poco::NullChannel()));
}
shared_context = Context::createShared();
global_context = Context::createGlobal(shared_context.get());


@ -1612,9 +1612,13 @@ void ClientBase::init(int argc, char ** argv)
("disable_suggestion,A", "Disable loading suggestion data. Note that suggestion data is loaded asynchronously through a second connection to ClickHouse server. Also it is reasonable to disable suggestion if you want to paste a query with TAB characters. Shorthand option -A is for those who get used to mysql client.")
("time,t", "print query execution time to stderr in non-interactive mode (for benchmarks)")
("echo", "in batch mode, print query before execution")
("verbose", "print query and other debugging info")
("log-level", po::value<std::string>(), "log level")
("server_logs_file", po::value<std::string>(), "put server logs into specified file")
("multiline,m", "multiline")
("multiquery,n", "multiquery")
@ -1701,6 +1705,8 @@ void ClientBase::init(int argc, char ** argv)
config().setBool("verbose", true);
if (options.count("log-level"))
Poco::Logger::root().setLevel(options["log-level"].as<std::string>());
if (options.count("server_logs_file"))
server_logs_file = options["server_logs_file"].as<std::string>();
if (options.count("hardware-utilization"))
progress_indication.print_hardware_utilization = true;


@ -593,6 +593,7 @@
M(623, CAPN_PROTO_BAD_CAST) \
M(624, BAD_FILE_TYPE) \
M(625, IO_SETUP_ERROR) \
M(626, CANNOT_SKIP_UNKNOWN_FIELD) \
\
M(999, KEEPER_EXCEPTION) \
M(1000, POCO_EXCEPTION) \


@ -38,6 +38,11 @@ void FileChecker::setPath(const String & file_info_path_)
files_info_path = file_info_path_;
}
String FileChecker::getPath() const
{
return files_info_path;
}
void FileChecker::update(const String & full_file_path)
{
bool exists = disk->exists(full_file_path);


@ -13,7 +13,9 @@ class FileChecker
{
public:
FileChecker(DiskPtr disk_, const String & file_info_path_);
void setPath(const String & file_info_path_);
String getPath() const;
void update(const String & full_file_path);
void setEmpty(const String & full_file_path);


@ -588,6 +588,17 @@ DataTypes Block::getDataTypes() const
return res;
}
Names Block::getDataTypeNames() const
{
Names res;
res.reserve(columns());
for (const auto & elem : data)
res.push_back(elem.type->getName());
return res;
}
bool blocksHaveEqualStructure(const Block & lhs, const Block & rhs)
{


@ -90,6 +90,7 @@ public:
NamesAndTypesList getNamesAndTypesList() const;
Names getNames() const;
DataTypes getDataTypes() const;
Names getDataTypeNames() const;
/// Returns number of rows from first column in block, not equal to nullptr. If no columns, returns 0.
size_t rows() const;


@ -556,10 +556,12 @@ class IColumn;
M(Bool, input_format_csv_unquoted_null_literal_as_null, false, "Consider unquoted NULL literal as \\N", 0) \
M(Bool, input_format_csv_enum_as_number, false, "Treat inserted enum values in CSV formats as enum indices \\N", 0) \
M(Bool, input_format_csv_arrays_as_nested_csv, false, R"(When reading Array from CSV, expect that its elements were serialized in nested CSV and then put into string. Example: "[""Hello"", ""world"", ""42"""" TV""]". Braces around array can be omitted.)", 0) \
M(Bool, input_format_skip_unknown_fields, false, "Skip columns with unknown names from input data (it works for JSONEachRow, CSVWithNames, TSVWithNames and TSKV formats).", 0) \
M(Bool, input_format_with_names_use_header, true, "For TSVWithNames and CSVWithNames input formats this controls whether format parser is to assume that column data appear in the input exactly as they are specified in the header.", 0) \
M(Bool, input_format_skip_unknown_fields, false, "Skip columns with unknown names from input data (it works for JSONEachRow, -WithNames, -WithNamesAndTypes and TSKV formats).", 0) \
M(Bool, input_format_with_names_use_header, true, "For -WithNames input formats this controls whether format parser is to assume that column data appear in the input exactly as they are specified in the header.", 0) \
M(Bool, input_format_with_types_use_header, true, "For -WithNamesAndTypes input formats this controls whether format parser should check if data types from the input match data types from the header.", 0) \
M(Bool, input_format_import_nested_json, false, "Map nested JSON data to nested tables (it works for JSONEachRow format).", 0) \
M(Bool, input_format_defaults_for_omitted_fields, true, "For input data calculate default expressions for omitted fields (it works for JSONEachRow, CSV and TSV formats).", IMPORTANT) \
M(Bool, input_format_defaults_for_omitted_fields, true, "For input data calculate default expressions for omitted fields (it works for JSONEachRow, -WithNames, -WithNamesAndTypes formats).", IMPORTANT) \
M(Bool, input_format_csv_empty_as_default, true, "Treat empty fields in CSV input as default values.", 0) \
M(Bool, input_format_tsv_empty_as_default, false, "Treat empty fields in TSV input as default values.", 0) \
M(Bool, input_format_tsv_enum_as_number, false, "Treat inserted enum values in TSV formats as enum indices \\N", 0) \
M(Bool, input_format_null_as_default, true, "For text input formats initialize null fields with default values if data type of this field is not nullable", 0) \


@ -52,6 +52,7 @@ public:
bool onlyNull() const override { return false; }
bool lowCardinality() const override { return true; }
bool supportsSparseSerialization() const override { return false; }
bool isLowCardinalityNullable() const override { return dictionary_type->isNullable(); }
static MutableColumnUniquePtr createColumnUnique(const IDataType & keys_type);
static MutableColumnUniquePtr createColumnUnique(const IDataType & keys_type, MutableColumnPtr && keys);


@ -278,6 +278,9 @@ public:
virtual bool lowCardinality() const { return false; }
/// Checks if this type is LowCardinality(Nullable(...))
virtual bool isLowCardinalityNullable() const { return false; }
/// Strings, Numbers, Date, DateTime, Nullable
virtual bool canBeInsideLowCardinality() const { return false; }


@ -3,6 +3,7 @@
#include <Columns/IColumn.h>
#include <IO/WriteHelpers.h>
#include <IO/Operators.h>
#include <IO/ReadBufferFromString.h>
#include <Common/escapeForFileName.h>
#include <DataTypes/NestedUtils.h>
#include <base/EnumReflection.h>
@ -249,6 +250,20 @@ bool ISerialization::isSpecialCompressionAllowed(const SubstreamPath & path)
return true;
}
void ISerialization::deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
String field;
/// Read until \t or \n.
readString(field, istr);
ReadBufferFromString buf(field);
deserializeWholeText(column, buf, settings);
}
void ISerialization::serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
serializeText(column, row_num, ostr, settings);
}
size_t ISerialization::getArrayLevel(const SubstreamPath & path)
{
size_t level = 0;


@ -318,6 +318,14 @@ public:
serializeText(column, row_num, ostr, settings);
}
/** Text deserialization without escaping and quoting. Reads all data until first \n or \t
* into a temporary string and then calls deserializeWholeText. It was implemented this way
* because this function is rarely used and because proper implementation requires a lot of
* additional code in data types serialization and ReadHelpers.
*/
virtual void deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const;
virtual void serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const;
static String getFileNameForStream(const NameAndTypePair & column, const SubstreamPath & path);
static String getFileNameForStream(const String & name_in_storage, const SubstreamPath & path);
static String getSubcolumnNameForStream(const SubstreamPath & path);


@ -163,7 +163,7 @@ void SerializationFixedString::deserializeTextQuoted(IColumn & column, ReadBuffe
void SerializationFixedString::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
{
read(*this, column, [&istr](ColumnFixedString::Chars & data) { readStringInto(data, istr); });
read(*this, column, [&istr](ColumnFixedString::Chars & data) { readStringUntilEOFInto(data, istr); });
}


@ -784,6 +784,7 @@ void SerializationLowCardinality::serializeTextJSON(const IColumn & column, size
{
serializeImpl(column, row_num, &ISerialization::serializeTextJSON, ostr, settings);
}
void SerializationLowCardinality::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
deserializeImpl(column, &ISerialization::deserializeTextJSON, istr, settings);
@ -794,6 +795,16 @@ void SerializationLowCardinality::serializeTextXML(const IColumn & column, size_
serializeImpl(column, row_num, &ISerialization::serializeTextXML, ostr, settings);
}
void SerializationLowCardinality::deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
deserializeImpl(column, &ISerialization::deserializeTextRaw, istr, settings);
}
void SerializationLowCardinality::serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
serializeImpl(column, row_num, &ISerialization::serializeTextRaw, ostr, settings);
}
template <typename... Params, typename... Args>
void SerializationLowCardinality::serializeImpl(
const IColumn & column, size_t row_num, SerializationLowCardinality::SerializeFunctionPtr<Params...> func, Args &&... args) const


@ -63,6 +63,8 @@ public:
void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
void deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
void serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
private:
template <typename ... Params>


@ -12,7 +12,6 @@
#include <IO/WriteBuffer.h>
#include <IO/WriteHelpers.h>
#include <IO/ConcatReadBuffer.h>
#include <Common/typeid_cast.h>
#include <Common/assert_cast.h>
namespace DB
@ -261,8 +260,36 @@ void SerializationNullable::deserializeTextEscaped(IColumn & column, ReadBuffer
deserializeTextEscapedImpl<void>(column, istr, settings, nested);
}
void SerializationNullable::deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
deserializeTextRawImpl<void>(column, istr, settings, nested);
}
void SerializationNullable::serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
const ColumnNullable & col = assert_cast<const ColumnNullable &>(column);
if (col.isNullAt(row_num))
writeString(settings.tsv.null_representation, ostr);
else
nested->serializeTextRaw(col.getNestedColumn(), row_num, ostr, settings);
}
template<typename ReturnType>
ReturnType SerializationNullable::deserializeTextRawImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested)
{
return deserializeTextEscapedAndRawImpl<ReturnType, false>(column, istr, settings, nested);
}
template<typename ReturnType>
ReturnType SerializationNullable::deserializeTextEscapedImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings,
const SerializationPtr & nested)
{
return deserializeTextEscapedAndRawImpl<ReturnType, true>(column, istr, settings, nested);
}
template<typename ReturnType, bool escaped>
ReturnType SerializationNullable::deserializeTextEscapedAndRawImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings,
const SerializationPtr & nested)
{
/// Little tricky, because we cannot discriminate null from first character.
@ -272,7 +299,13 @@ ReturnType SerializationNullable::deserializeTextEscapedImpl(IColumn & column, R
/// This is not null, surely.
return safeDeserialize<ReturnType>(column, *nested,
[] { return false; },
[&nested, &istr, &settings] (IColumn & nested_column) { nested->deserializeTextEscaped(nested_column, istr, settings); });
[&nested, &istr, &settings] (IColumn & nested_column)
{
if constexpr (escaped)
nested->deserializeTextEscaped(nested_column, istr, settings);
else
nested->deserializeTextRaw(nested_column, istr, settings);
});
}
else
{
@ -298,7 +331,10 @@ ReturnType SerializationNullable::deserializeTextEscapedImpl(IColumn & column, R
{
/// We could step back to consume backslash again.
--istr.position();
nested->deserializeTextEscaped(nested_column, istr, settings);
if constexpr (escaped)
nested->deserializeTextEscaped(nested_column, istr, settings);
else
nested->deserializeTextRaw(nested_column, istr, settings);
}
else
{
@ -306,7 +342,10 @@ ReturnType SerializationNullable::deserializeTextEscapedImpl(IColumn & column, R
ReadBufferFromMemory prefix("\\", 1);
ConcatReadBuffer prepended_istr(prefix, istr);
nested->deserializeTextEscaped(nested_column, prepended_istr, settings);
if constexpr (escaped)
nested->deserializeTextEscaped(nested_column, prepended_istr, settings);
else
nested->deserializeTextRaw(nested_column, prepended_istr, settings);
/// Synchronise cursor position in original buffer.
@ -512,5 +551,6 @@ template bool SerializationNullable::deserializeTextEscapedImpl<bool>(IColumn &
template bool SerializationNullable::deserializeTextQuotedImpl<bool>(IColumn & column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested);
template bool SerializationNullable::deserializeTextCSVImpl<bool>(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested);
template bool SerializationNullable::deserializeTextJSONImpl<bool>(IColumn & column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested);
template bool SerializationNullable::deserializeTextRawImpl<bool>(IColumn & column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested);
}


@ -71,6 +71,9 @@ public:
void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
void deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
void serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
/// If ReturnType is bool, check for NULL and deserialize value into non-nullable column (and return true) or insert default value of nested type (and return false)
/// If ReturnType is void, deserialize Nullable(T)
template <typename ReturnType = bool>
@ -83,6 +86,10 @@ public:
static ReturnType deserializeTextCSVImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested);
template <typename ReturnType = bool>
static ReturnType deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested);
template <typename ReturnType = bool>
static ReturnType deserializeTextRawImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested);
template <typename ReturnType = bool, bool escaped>
static ReturnType deserializeTextEscapedAndRawImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested);
private:
struct SubcolumnCreator : public ISubcolumnCreator


@ -245,7 +245,7 @@ static inline void read(IColumn & column, Reader && reader)
void SerializationString::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
{
read(column, [&](ColumnString::Chars & data) { readStringInto(data, istr); });
read(column, [&](ColumnString::Chars & data) { readStringUntilEOFInto(data, istr); });
}


@ -56,7 +56,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
format_settings.csv.allow_single_quotes = settings.format_csv_allow_single_quotes;
format_settings.csv.crlf_end_of_line = settings.output_format_csv_crlf_end_of_line;
format_settings.csv.delimiter = settings.format_csv_delimiter;
format_settings.csv.empty_as_default = settings.input_format_defaults_for_omitted_fields;
format_settings.csv.empty_as_default = settings.input_format_csv_empty_as_default;
format_settings.csv.input_format_enum_as_number = settings.input_format_csv_enum_as_number;
format_settings.csv.null_representation = settings.output_format_csv_null_representation;
format_settings.csv.unquoted_null_literal_as_null = settings.input_format_csv_unquoted_null_literal_as_null;
@ -107,10 +107,12 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
format_settings.values.deduce_templates_of_expressions = settings.input_format_values_deduce_templates_of_expressions;
format_settings.values.interpret_expressions = settings.input_format_values_interpret_expressions;
format_settings.with_names_use_header = settings.input_format_with_names_use_header;
format_settings.with_types_use_header = settings.input_format_with_types_use_header;
format_settings.write_statistics = settings.output_format_write_statistics;
format_settings.arrow.low_cardinality_as_dictionary = settings.output_format_arrow_low_cardinality_as_dictionary;
format_settings.arrow.import_nested = settings.input_format_arrow_import_nested;
format_settings.orc.import_nested = settings.input_format_orc_import_nested;
format_settings.defaults_for_omitted_fields = settings.input_format_defaults_for_omitted_fields;
format_settings.capn_proto.enum_comparing_mode = settings.format_capn_proto_enum_comparising_mode;
/// Validate avro_schema_registry_url with RemoteHostFilter when non-empty and in Server context


@ -68,7 +68,6 @@ public:
size_t row)>;
private:
using InputCreatorFunc = InputFormatPtr(
ReadBuffer & buf,
const Block & header,


@ -25,10 +25,12 @@ struct FormatSettings
bool skip_unknown_fields = false;
bool with_names_use_header = false;
bool with_types_use_header = false;
bool write_statistics = true;
bool import_nested_json = false;
bool null_as_default = true;
bool decimal_trailing_zeros = false;
bool defaults_for_omitted_fields = true;
enum class DateTimeInputFormat
{


@ -1,5 +1,8 @@
#include <IO/ReadHelpers.h>
#include <Formats/JSONEachRowUtils.h>
#include <IO/ReadBufferFromString.h>
#include <DataTypes/Serializations/SerializationNullable.h>
#include <base/find_symbols.h>
namespace DB
@ -10,7 +13,8 @@ namespace ErrorCodes
extern const int LOGICAL_ERROR;
}
std::pair<bool, size_t> fileSegmentationEngineJSONEachRowImpl(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size)
template <const char opening_bracket, const char closing_bracket>
static std::pair<bool, size_t> fileSegmentationEngineJSONEachRowImpl(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size, size_t min_rows)
{
skipWhitespaceIfAny(in);
@ -19,7 +23,7 @@ std::pair<bool, size_t> fileSegmentationEngineJSONEachRowImpl(ReadBuffer & in, D
bool quotes = false;
size_t number_of_rows = 0;
while (loadAtPosition(in, memory, pos) && (balance || memory.size() + static_cast<size_t>(pos - in.position()) < min_chunk_size))
while (loadAtPosition(in, memory, pos) && (balance || memory.size() + static_cast<size_t>(pos - in.position()) < min_chunk_size || number_of_rows < min_rows))
{
const auto current_object_size = memory.size() + static_cast<size_t>(pos - in.position());
if (current_object_size > 10 * min_chunk_size)
@ -50,19 +54,19 @@ std::pair<bool, size_t> fileSegmentationEngineJSONEachRowImpl(ReadBuffer & in, D
}
else
{
pos = find_first_symbols<'{', '}', '\\', '"'>(pos, in.buffer().end());
pos = find_first_symbols<opening_bracket, closing_bracket, '\\', '"'>(pos, in.buffer().end());
if (pos > in.buffer().end())
throw Exception("Position in buffer is out of bounds. There must be a bug.", ErrorCodes::LOGICAL_ERROR);
else if (pos == in.buffer().end())
continue;
else if (*pos == '{')
else if (*pos == opening_bracket)
{
++balance;
++pos;
}
else if (*pos == '}')
else if (*pos == closing_bracket)
{
--balance;
++pos;
@ -88,6 +92,16 @@ std::pair<bool, size_t> fileSegmentationEngineJSONEachRowImpl(ReadBuffer & in, D
return {loadAtPosition(in, memory, pos), number_of_rows};
}
std::pair<bool, size_t> fileSegmentationEngineJSONEachRow(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size)
{
return fileSegmentationEngineJSONEachRowImpl<'{', '}'>(in, memory, min_chunk_size, 1);
}
std::pair<bool, size_t> fileSegmentationEngineJSONCompactEachRow(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size, size_t min_rows)
{
return fileSegmentationEngineJSONEachRowImpl<'[', ']'>(in, memory, min_chunk_size, min_rows);
}
bool nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl(ReadBuffer & buf)
{
/// For JSONEachRow we can safely skip whitespace characters
@ -95,4 +109,37 @@ bool nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl(ReadBuffer & buf)
return buf.eof() || *buf.position() == '[';
}
bool readFieldImpl(ReadBuffer & in, IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, const String & column_name, const FormatSettings & format_settings, bool yield_strings)
{
try
{
bool as_nullable = format_settings.null_as_default && !type->isNullable() && !type->isLowCardinalityNullable();
if (yield_strings)
{
String str;
readJSONString(str, in);
ReadBufferFromString buf(str);
if (as_nullable)
return SerializationNullable::deserializeWholeTextImpl(column, buf, format_settings, serialization);
serialization->deserializeWholeText(column, buf, format_settings);
return true;
}
if (as_nullable)
return SerializationNullable::deserializeTextJSONImpl(column, in, format_settings, serialization);
serialization->deserializeTextJSON(column, in, format_settings);
return true;
}
catch (Exception & e)
{
e.addMessage("(while reading the value of key " + column_name + ")");
throw;
}
}
}


@ -1,5 +1,6 @@
#pragma once
#include <Formats/FormatSettings.h>
#include <IO/BufferWithOwnMemory.h>
#include <IO/ReadBuffer.h>
#include <utility>
@ -7,8 +8,11 @@
namespace DB
{
std::pair<bool, size_t> fileSegmentationEngineJSONEachRowImpl(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size);
std::pair<bool, size_t> fileSegmentationEngineJSONEachRow(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size);
std::pair<bool, size_t> fileSegmentationEngineJSONCompactEachRow(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size, size_t min_rows);
bool nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl(ReadBuffer & buf);
bool readFieldImpl(ReadBuffer & in, IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, const String & column_name, const FormatSettings & format_settings, bool yield_strings);
}


@ -13,6 +13,7 @@ void registerFileSegmentationEngineCSV(FormatFactory & factory);
void registerFileSegmentationEngineJSONEachRow(FormatFactory & factory);
void registerFileSegmentationEngineRegexp(FormatFactory & factory);
void registerFileSegmentationEngineJSONAsString(FormatFactory & factory);
void registerFileSegmentationEngineJSONCompactEachRow(FormatFactory & factory);
/// Formats for both input/output.
@ -87,6 +88,7 @@ void registerFormats()
registerFileSegmentationEngineJSONEachRow(factory);
registerFileSegmentationEngineRegexp(factory);
registerFileSegmentationEngineJSONAsString(factory);
registerFileSegmentationEngineJSONCompactEachRow(factory);
registerInputFormatNative(factory);
registerOutputFormatNative(factory);


@ -0,0 +1,13 @@
#include <Formats/registerWithNamesAndTypes.h>
namespace DB
{
void registerWithNamesAndTypes(const std::string & base_format_name, RegisterWithNamesAndTypesFunc register_func)
{
register_func(base_format_name, false, false);
register_func(base_format_name + "WithNames", true, false);
register_func(base_format_name + "WithNamesAndTypes", true, true);
}
}
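For context, the intended usage pattern (seen later in this diff for `RowBinary`) is to pass a lambda that receives the derived format name and the `with_names`/`with_types` flags. A self-contained sketch of that pattern, with the helper re-declared locally so it compiles on its own:
```cpp
#include <functional>
#include <iostream>
#include <string>

// Re-declared here so the sketch is self-contained;
// in ClickHouse this helper lives in Formats/registerWithNamesAndTypes.h.
using RegisterWithNamesAndTypesFunc = std::function<void(const std::string & format_name, bool with_names, bool with_types)>;

void registerWithNamesAndTypes(const std::string & base_format_name, RegisterWithNamesAndTypesFunc register_func)
{
    register_func(base_format_name, false, false);
    register_func(base_format_name + "WithNames", true, false);
    register_func(base_format_name + "WithNamesAndTypes", true, true);
}

int main()
{
    // A format supplies one lambda and gets all three name variants registered.
    registerWithNamesAndTypes("RowBinary", [](const std::string & name, bool with_names, bool with_types)
    {
        std::cout << "registering " << name
                  << " with_names=" << with_names
                  << " with_types=" << with_types << '\n';
    });
}
```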


@ -0,0 +1,12 @@
#pragma once
#include <string>
#include <functional>
namespace DB
{
using RegisterWithNamesAndTypesFunc = std::function<void(const std::string & format_name, bool with_names, bool with_types)>;
void registerWithNamesAndTypes(const std::string & base_format_name, RegisterWithNamesAndTypesFunc register_func);
}


@ -16,16 +16,11 @@ struct ColumnMapping
using OptionalIndexes = std::vector<std::optional<size_t>>;
OptionalIndexes column_indexes_for_input_fields;
/// Tracks which columns we have read in a single read() call.
/// For columns that are never read, it is initialized to false when we
/// read the file header, and never changed afterwards.
/// For other columns, it is updated on each read() call.
std::vector<UInt8> read_columns;
/// The list of column indexes that are not presented in input data.
std::vector<UInt8> not_presented_columns;
/// Whether we have any columns that are not read from file at all,
/// and must be always initialized with defaults.
bool have_always_default_columns{false};
/// The list of column names in input data. Needed for better exception messages.
std::vector<String> names_of_columns;
};
using ColumnMappingPtr = std::shared_ptr<ColumnMapping>;


@ -2,79 +2,96 @@
#include <IO/ReadHelpers.h>
#include <Processors/Formats/Impl/BinaryRowInputFormat.h>
#include <Formats/FormatFactory.h>
#include <Formats/registerWithNamesAndTypes.h>
#include <DataTypes/DataTypeFactory.h>
namespace DB
{
BinaryRowInputFormat::BinaryRowInputFormat(ReadBuffer & in_, Block header, Params params_, bool with_names_, bool with_types_)
: IRowInputFormat(std::move(header), in_, params_), with_names(with_names_), with_types(with_types_)
namespace ErrorCodes
{
extern const int CANNOT_SKIP_UNKNOWN_FIELD;
}
BinaryRowInputFormat::BinaryRowInputFormat(ReadBuffer & in_, Block header, Params params_, bool with_names_, bool with_types_, const FormatSettings & format_settings_)
: RowInputFormatWithNamesAndTypes(std::move(header), in_, std::move(params_), with_names_, with_types_, format_settings_)
{
}
bool BinaryRowInputFormat::readRow(MutableColumns & columns, RowReadExtension &)
std::vector<String> BinaryRowInputFormat::readHeaderRow()
{
if (in->eof())
return false;
std::vector<String> fields;
String field;
for (size_t i = 0; i < read_columns; ++i)
{
readStringBinary(field, *in);
fields.push_back(field);
}
return fields;
}
size_t num_columns = columns.size();
for (size_t i = 0; i < num_columns; ++i)
serializations[i]->deserializeBinary(*columns[i], *in);
std::vector<String> BinaryRowInputFormat::readNames()
{
readVarUInt(read_columns, *in);
return readHeaderRow();
}
std::vector<String> BinaryRowInputFormat::readTypes()
{
auto types = readHeaderRow();
for (const auto & type_name : types)
read_data_types.push_back(DataTypeFactory::instance().get(type_name));
return types;
}
bool BinaryRowInputFormat::readField(IColumn & column, const DataTypePtr & /*type*/, const SerializationPtr & serialization, bool /*is_last_file_column*/, const String & /*column_name*/)
{
serialization->deserializeBinary(column, *in);
return true;
}
void BinaryRowInputFormat::readPrefix()
void BinaryRowInputFormat::skipHeaderRow()
{
/// NOTE: The header is completely ignored. This can be easily improved.
UInt64 columns = 0;
String tmp;
if (with_names || with_types)
{
readVarUInt(columns, *in);
}
if (with_names)
{
for (size_t i = 0; i < columns; ++i)
{
readStringBinary(tmp, *in);
}
}
if (with_types)
{
for (size_t i = 0; i < columns; ++i)
{
readStringBinary(tmp, *in);
}
}
for (size_t i = 0; i < read_columns; ++i)
readStringBinary(tmp, *in);
}
void BinaryRowInputFormat::skipNames()
{
readVarUInt(read_columns, *in);
skipHeaderRow();
}
void BinaryRowInputFormat::skipTypes()
{
skipHeaderRow();
}
void BinaryRowInputFormat::skipField(size_t file_column)
{
if (file_column >= read_data_types.size())
throw Exception(ErrorCodes::CANNOT_SKIP_UNKNOWN_FIELD, "Cannot skip unknown field in RowBinaryWithNames format, because its type is unknown");
Field field;
read_data_types[file_column]->getDefaultSerialization()->deserializeBinary(field, *in);
}
void registerInputFormatRowBinary(FormatFactory & factory)
{
factory.registerInputFormat("RowBinary", [](
ReadBuffer & buf,
const Block & sample,
const IRowInputFormat::Params & params,
const FormatSettings &)
auto register_func = [&](const String & format_name, bool with_names, bool with_types)
{
return std::make_shared<BinaryRowInputFormat>(buf, sample, params, false, false);
});
factory.registerInputFormat(format_name, [with_names, with_types](
ReadBuffer & buf,
const Block & sample,
const IRowInputFormat::Params & params,
const FormatSettings & settings)
{
return std::make_shared<BinaryRowInputFormat>(buf, sample, params, with_names, with_types, settings);
});
};
factory.registerInputFormat("RowBinaryWithNamesAndTypes", [](
ReadBuffer & buf,
const Block & sample,
const IRowInputFormat::Params & params,
const FormatSettings &)
{
return std::make_shared<BinaryRowInputFormat>(buf, sample, params, true, true);
});
registerWithNamesAndTypes("RowBinary", register_func);
}
}


@ -2,6 +2,7 @@
#include <Core/Block.h>
#include <Processors/Formats/IRowInputFormat.h>
#include <Processors/Formats/RowInputFormatWithNamesAndTypes.h>
namespace DB
@ -12,19 +13,33 @@ class ReadBuffer;
/** A stream for inputting data in a binary line-by-line format.
*/
class BinaryRowInputFormat : public IRowInputFormat
class BinaryRowInputFormat : public RowInputFormatWithNamesAndTypes
{
public:
BinaryRowInputFormat(ReadBuffer & in_, Block header, Params params_, bool with_names_, bool with_types_);
bool readRow(MutableColumns & columns, RowReadExtension &) override;
void readPrefix() override;
BinaryRowInputFormat(ReadBuffer & in_, Block header, Params params_, bool with_names_, bool with_types_, const FormatSettings & format_settings_);
String getName() const override { return "BinaryRowInputFormat"; }
/// RowInputFormatWithNamesAndTypes implements logic with DiagnosticInfo, but
/// in this format we cannot provide any DiagnosticInfo, because here we have
/// just binary data.
std::string getDiagnosticInfo() override { return {}; }
private:
bool with_names;
bool with_types;
bool readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String & column_name) override;
void skipField(size_t file_column) override;
void skipNames() override;
void skipTypes() override;
void skipHeaderRow();
std::vector<String> readNames() override;
std::vector<String> readTypes() override;
std::vector<String> readHeaderRow();
/// Data types read from input data.
DataTypes read_data_types;
UInt64 read_columns = 0;
};
}
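Judging by skipHeaderRow() and skipNames() in the implementation above, the RowBinaryWithNamesAndTypes header is a varint column count followed by length-prefixed column-name strings and then type-name strings, with RowBinary-encoded rows after that. A rough sketch that builds such a header; writeVarUInt/writeStringBinary here are local stand-ins for ClickHouse's IO helpers, and the LEB128-style encoding is an assumption.

    #include <cstdint>
    #include <string>
    #include <vector>

    // Unsigned LEB128-style varint, mimicking the assumed encoding behind readVarUInt above.
    static void writeVarUInt(uint64_t x, std::string & out)
    {
        while (x >= 0x80)
        {
            out.push_back(static_cast<char>((x & 0x7F) | 0x80));
            x >>= 7;
        }
        out.push_back(static_cast<char>(x));
    }

    // Length-prefixed string, the counterpart of readStringBinary above.
    static void writeStringBinary(const std::string & s, std::string & out)
    {
        writeVarUInt(s.size(), out);
        out += s;
    }

    // Column count, then all names, then all types; RowBinary-encoded rows would follow.
    std::string makeHeader(const std::vector<std::string> & names, const std::vector<std::string> & types)
    {
        std::string out;
        writeVarUInt(names.size(), out);
        for (const auto & name : names)
            writeStringBinary(name, out);
        for (const auto & type : types)
            writeStringBinary(type, out);
        return out;
    }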

View File

@ -4,6 +4,7 @@
#include <DataTypes/IDataType.h>
#include <Processors/Formats/Impl/BinaryRowOutputFormat.h>
#include <Formats/FormatFactory.h>
#include <Formats/registerWithNamesAndTypes.h>
namespace DB
@ -49,23 +50,19 @@ void BinaryRowOutputFormat::writeField(const IColumn & column, const ISerializat
void registerOutputFormatRowBinary(FormatFactory & factory)
{
factory.registerOutputFormat("RowBinary", [](
WriteBuffer & buf,
const Block & sample,
const RowOutputFormatParams & params,
const FormatSettings &)
auto register_func = [&](const String & format_name, bool with_names, bool with_types)
{
return std::make_shared<BinaryRowOutputFormat>(buf, sample, false, false, params);
});
factory.registerOutputFormat(format_name, [with_names, with_types](
WriteBuffer & buf,
const Block & sample,
const RowOutputFormatParams & params,
const FormatSettings &)
{
return std::make_shared<BinaryRowOutputFormat>(buf, sample, with_names, with_types, params);
});
};
factory.registerOutputFormat("RowBinaryWithNamesAndTypes", [](
WriteBuffer & buf,
const Block & sample,
const RowOutputFormatParams & params,
const FormatSettings &)
{
return std::make_shared<BinaryRowOutputFormat>(buf, sample, true, true, params);
});
registerWithNamesAndTypes("RowBinary", register_func);
}
}

View File

@ -3,12 +3,12 @@
#include <IO/Operators.h>
#include <Formats/verbosePrintString.h>
#include <Processors/Formats/Impl/CSVRowInputFormat.h>
#include <Formats/registerWithNamesAndTypes.h>
#include <Formats/FormatFactory.h>
#include <Processors/Formats/Impl/CSVRowInputFormat.h>
#include <DataTypes/Serializations/SerializationNullable.h>
#include <DataTypes/DataTypeNothing.h>
namespace DB
{
@ -19,62 +19,21 @@ namespace ErrorCodes
extern const int LOGICAL_ERROR;
}
CSVRowInputFormat::CSVRowInputFormat(const Block & header_, ReadBuffer & in_, const Params & params_,
bool with_names_, const FormatSettings & format_settings_)
: RowInputFormatWithDiagnosticInfo(header_, in_, params_)
, with_names(with_names_)
, format_settings(format_settings_)
CSVRowInputFormat::CSVRowInputFormat(
const Block & header_,
ReadBuffer & in_,
const Params & params_,
bool with_names_,
bool with_types_,
const FormatSettings & format_settings_)
: RowInputFormatWithNamesAndTypes(header_, in_, params_, with_names_, with_types_, format_settings_)
{
const String bad_delimiters = " \t\"'.UL";
if (bad_delimiters.find(format_settings.csv.delimiter) != String::npos)
throw Exception(String("CSV format may not work correctly with delimiter '") + format_settings.csv.delimiter +
"'. Try use CustomSeparated format instead.", ErrorCodes::BAD_ARGUMENTS);
const auto & sample = getPort().getHeader();
size_t num_columns = sample.columns();
data_types.resize(num_columns);
column_indexes_by_names.reserve(num_columns);
for (size_t i = 0; i < num_columns; ++i)
{
const auto & column_info = sample.getByPosition(i);
data_types[i] = column_info.type;
column_indexes_by_names.emplace(column_info.name, i);
}
}
/// Map an input file column to a table column, based on its name.
void CSVRowInputFormat::addInputColumn(const String & column_name)
{
const auto column_it = column_indexes_by_names.find(column_name);
if (column_it == column_indexes_by_names.end())
{
if (format_settings.skip_unknown_fields)
{
column_mapping->column_indexes_for_input_fields.push_back(std::nullopt);
return;
}
throw Exception(
"Unknown field found in CSV header: '" + column_name + "' " +
"at position " + std::to_string(column_mapping->column_indexes_for_input_fields.size()) +
"\nSet the 'input_format_skip_unknown_fields' parameter explicitly to ignore and proceed",
ErrorCodes::INCORRECT_DATA
);
}
const auto column_index = column_it->second;
if (column_mapping->read_columns[column_index])
throw Exception("Duplicate field found while parsing CSV header: " + column_name, ErrorCodes::INCORRECT_DATA);
column_mapping->read_columns[column_index] = true;
column_mapping->column_indexes_for_input_fields.emplace_back(column_index);
String("CSV format may not work correctly with delimiter '") + format_settings.csv.delimiter
+ "'. Try use CustomSeparated format instead.",
ErrorCodes::BAD_ARGUMENTS);
}
static void skipEndOfLine(ReadBuffer & in)
@ -100,29 +59,6 @@ static void skipEndOfLine(ReadBuffer & in)
throw Exception("Expected end of line", ErrorCodes::INCORRECT_DATA);
}
static void skipDelimiter(ReadBuffer & in, const char delimiter, bool is_last_column)
{
if (is_last_column)
{
if (in.eof())
return;
/// we support the extra delimiter at the end of the line
if (*in.position() == delimiter)
{
++in.position();
if (in.eof())
return;
}
skipEndOfLine(in);
}
else
assertChar(delimiter, in);
}
/// Skip `whitespace` symbols allowed in CSV.
static inline void skipWhitespacesAndTabs(ReadBuffer & in)
{
@ -132,255 +68,138 @@ static inline void skipWhitespacesAndTabs(ReadBuffer & in)
++in.position();
}
static void skipRow(ReadBuffer & in, const FormatSettings::CSV & settings, size_t num_columns)
void CSVRowInputFormat::skipFieldDelimiter()
{
String tmp;
for (size_t i = 0; i < num_columns; ++i)
{
skipWhitespacesAndTabs(in);
readCSVString(tmp, in, settings);
skipWhitespacesAndTabs(in);
skipDelimiter(in, settings.delimiter, i + 1 == num_columns);
}
skipWhitespacesAndTabs(*in);
assertChar(format_settings.csv.delimiter, *in);
}
void CSVRowInputFormat::setupAllColumnsByTableSchema()
String CSVRowInputFormat::readFieldIntoString()
{
const auto & header = getPort().getHeader();
column_mapping->read_columns.assign(header.columns(), true);
column_mapping->column_indexes_for_input_fields.resize(header.columns());
for (size_t i = 0; i < column_mapping->column_indexes_for_input_fields.size(); ++i)
column_mapping->column_indexes_for_input_fields[i] = i;
skipWhitespacesAndTabs(*in);
String field;
readCSVString(field, *in, format_settings.csv);
return field;
}
void CSVRowInputFormat::readPrefix()
void CSVRowInputFormat::skipField()
{
/// In this format, we assume that if the first string field contains a BOM as a value, it will be written in quotes,
/// so a BOM at the beginning of the stream cannot be confused with a BOM in the first string value, and it is safe to skip it.
skipBOMIfExists(*in);
size_t num_columns = data_types.size();
const auto & header = getPort().getHeader();
/// This is a bit of abstraction leakage, but we have almost the same code in other places.
/// Thus, we check if this InputFormat is working with the "real" beginning of the data in case of parallel parsing.
if (with_names && getCurrentUnitNumber() == 0)
{
/// This CSV file has a header row with column names. Depending on the
/// settings, use it or skip it.
if (format_settings.with_names_use_header)
{
/// Look at the file header to see which columns we have there.
/// The missing columns are filled with defaults.
column_mapping->read_columns.assign(header.columns(), false);
do
{
String column_name;
skipWhitespacesAndTabs(*in);
readCSVString(column_name, *in, format_settings.csv);
skipWhitespacesAndTabs(*in);
addInputColumn(column_name);
}
while (checkChar(format_settings.csv.delimiter, *in));
skipDelimiter(*in, format_settings.csv.delimiter, true);
for (auto read_column : column_mapping->read_columns)
{
if (!read_column)
{
column_mapping->have_always_default_columns = true;
break;
}
}
return;
}
else
{
skipRow(*in, format_settings.csv, num_columns);
setupAllColumnsByTableSchema();
}
}
else if (!column_mapping->is_set)
setupAllColumnsByTableSchema();
readFieldIntoString();
}
bool CSVRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & ext)
void CSVRowInputFormat::skipRowEndDelimiter()
{
skipWhitespacesAndTabs(*in);
if (in->eof())
return false;
return;
updateDiagnosticInfo();
/// we support the extra delimiter at the end of the line
if (*in->position() == format_settings.csv.delimiter)
++in->position();
/// Track whether we have to fill any columns in this row with default
/// values. If not, we return an empty column mask to the caller, so that
/// it doesn't have to check it.
bool have_default_columns = column_mapping->have_always_default_columns;
skipWhitespacesAndTabs(*in);
if (in->eof())
return;
ext.read_columns.assign(column_mapping->read_columns.size(), true);
const auto delimiter = format_settings.csv.delimiter;
for (size_t file_column = 0; file_column < column_mapping->column_indexes_for_input_fields.size(); ++file_column)
{
const auto & table_column = column_mapping->column_indexes_for_input_fields[file_column];
const bool is_last_file_column = file_column + 1 == column_mapping->column_indexes_for_input_fields.size();
if (table_column)
{
skipWhitespacesAndTabs(*in);
ext.read_columns[*table_column] = readField(*columns[*table_column], data_types[*table_column],
serializations[*table_column], is_last_file_column);
if (!ext.read_columns[*table_column])
have_default_columns = true;
skipWhitespacesAndTabs(*in);
}
else
{
/// We never read this column from the file, just skip it.
String tmp;
readCSVString(tmp, *in, format_settings.csv);
}
skipDelimiter(*in, delimiter, is_last_file_column);
}
if (have_default_columns)
{
for (size_t i = 0; i < column_mapping->read_columns.size(); i++)
{
if (!column_mapping->read_columns[i])
{
/// The column value for this row is going to be overwritten
/// with default by the caller, but the general assumption is
/// that the column size increases for each row, so we have
/// to insert something. Since we do not care about the exact
/// value, we do not have to use the default value specified by
/// the data type, and can just use IColumn::insertDefault().
columns[i]->insertDefault();
ext.read_columns[i] = false;
}
}
}
return true;
skipEndOfLine(*in);
}
bool CSVRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out)
void CSVRowInputFormat::skipHeaderRow()
{
do
{
skipField();
skipWhitespacesAndTabs(*in);
}
while (checkChar(format_settings.csv.delimiter, *in));
skipRowEndDelimiter();
}
std::vector<String> CSVRowInputFormat::readHeaderRow()
{
std::vector<String> fields;
do
{
fields.push_back(readFieldIntoString());
skipWhitespacesAndTabs(*in);
}
while (checkChar(format_settings.csv.delimiter, *in));
skipRowEndDelimiter();
return fields;
}
bool CSVRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out)
{
const char delimiter = format_settings.csv.delimiter;
for (size_t file_column = 0; file_column < column_mapping->column_indexes_for_input_fields.size(); ++file_column)
try
{
if (file_column == 0 && in->eof())
{
out << "<End of stream>\n";
return false;
}
skipWhitespacesAndTabs(*in);
if (column_mapping->column_indexes_for_input_fields[file_column].has_value())
assertChar(delimiter, *in);
}
catch (const DB::Exception &)
{
if (*in->position() == '\n' || *in->position() == '\r')
{
const auto & header = getPort().getHeader();
size_t col_idx = column_mapping->column_indexes_for_input_fields[file_column].value();
if (!deserializeFieldAndPrintDiagnosticInfo(header.getByPosition(col_idx).name, data_types[col_idx], *columns[col_idx],
out, file_column))
return false;
out << "ERROR: Line feed found where delimiter (" << delimiter << ") is expected."
" It's like your file has less columns than expected.\n"
"And if your file has the right number of columns, maybe it has unescaped quotes in values.\n";
}
else
{
static const String skipped_column_str = "<SKIPPED COLUMN>";
static const DataTypePtr skipped_column_type = std::make_shared<DataTypeNothing>();
static const MutableColumnPtr skipped_column = skipped_column_type->createColumn();
if (!deserializeFieldAndPrintDiagnosticInfo(skipped_column_str, skipped_column_type, *skipped_column, out, file_column))
return false;
}
skipWhitespacesAndTabs(*in);
/// Delimiters
if (file_column + 1 == column_mapping->column_indexes_for_input_fields.size())
{
if (in->eof())
return false;
/// we support the extra delimiter at the end of the line
if (*in->position() == delimiter)
{
++in->position();
if (in->eof())
break;
}
if (!in->eof() && *in->position() != '\n' && *in->position() != '\r')
{
out << "ERROR: There is no line feed. ";
verbosePrintString(in->position(), in->position() + 1, out);
out << " found instead.\n"
" It's like your file has more columns than expected.\n"
"And if your file has the right number of columns, maybe it has an unquoted string value with a comma.\n";
return false;
}
skipEndOfLine(*in);
}
else
{
try
{
assertChar(delimiter, *in);
}
catch (const DB::Exception &)
{
if (*in->position() == '\n' || *in->position() == '\r')
{
out << "ERROR: Line feed found where delimiter (" << delimiter << ") is expected."
" It's like your file has less columns than expected.\n"
"And if your file has the right number of columns, maybe it has unescaped quotes in values.\n";
}
else
{
out << "ERROR: There is no delimiter (" << delimiter << "). ";
verbosePrintString(in->position(), in->position() + 1, out);
out << " found instead.\n";
}
return false;
}
out << "ERROR: There is no delimiter (" << delimiter << "). ";
verbosePrintString(in->position(), in->position() + 1, out);
out << " found instead.\n";
}
return false;
}
return true;
}
bool CSVRowInputFormat::parseRowEndWithDiagnosticInfo(WriteBuffer & out)
{
skipWhitespacesAndTabs(*in);
if (in->eof())
return true;
/// we support the extra delimiter at the end of the line
if (*in->position() == format_settings.csv.delimiter)
{
++in->position();
skipWhitespacesAndTabs(*in);
if (in->eof())
return true;
}
if (!in->eof() && *in->position() != '\n' && *in->position() != '\r')
{
out << "ERROR: There is no line feed. ";
verbosePrintString(in->position(), in->position() + 1, out);
out << " found instead.\n"
" It's like your file has more columns than expected.\n"
"And if your file has the right number of columns, maybe it has an unquoted string value with a comma.\n";
return false;
}
skipEndOfLine(*in);
return true;
}
void CSVRowInputFormat::syncAfterError()
{
skipToNextLineOrEOF(*in);
}
void CSVRowInputFormat::tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column)
bool CSVRowInputFormat::readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String & /*column_name*/)
{
const auto & index = column_mapping->column_indexes_for_input_fields[file_column];
if (index)
{
const bool is_last_file_column = file_column + 1 == column_mapping->column_indexes_for_input_fields.size();
readField(column, type, serializations[*index], is_last_file_column);
}
else
{
String tmp;
readCSVString(tmp, *in, format_settings.csv);
}
}
skipWhitespacesAndTabs(*in);
bool CSVRowInputFormat::readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column)
{
const bool at_delimiter = !in->eof() && *in->position() == format_settings.csv.delimiter;
const bool at_last_column_line_end = is_last_file_column
&& (in->eof() || *in->position() == '\n' || *in->position() == '\r');
@ -399,7 +218,7 @@ bool CSVRowInputFormat::readField(IColumn & column, const DataTypePtr & type, co
column.insertDefault();
return false;
}
else if (format_settings.null_as_default && !type->isNullable())
else if (format_settings.null_as_default && !type->isNullable() && !type->isLowCardinalityNullable())
{
/// If value is null but type is not nullable then use default value instead.
return SerializationNullable::deserializeTextCSVImpl(column, *in, format_settings, serialization);
@ -412,31 +231,24 @@ bool CSVRowInputFormat::readField(IColumn & column, const DataTypePtr & type, co
}
}
void CSVRowInputFormat::resetParser()
{
RowInputFormatWithDiagnosticInfo::resetParser();
column_mapping->column_indexes_for_input_fields.clear();
column_mapping->read_columns.clear();
column_mapping->have_always_default_columns = false;
}
void registerInputFormatCSV(FormatFactory & factory)
{
for (bool with_names : {false, true})
auto register_func = [&](const String & format_name, bool with_names, bool with_types)
{
factory.registerInputFormat(with_names ? "CSVWithNames" : "CSV", [=](
factory.registerInputFormat(format_name, [with_names, with_types](
ReadBuffer & buf,
const Block & sample,
IRowInputFormat::Params params,
const FormatSettings & settings)
{
return std::make_shared<CSVRowInputFormat>(sample, buf, params, with_names, settings);
return std::make_shared<CSVRowInputFormat>(sample, buf, std::move(params), with_names, with_types, settings);
});
}
};
registerWithNamesAndTypes("CSV", register_func);
}
static std::pair<bool, size_t> fileSegmentationEngineCSVImpl(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size)
static std::pair<bool, size_t> fileSegmentationEngineCSVImpl(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size, size_t min_rows)
{
char * pos = in.position();
bool quotes = false;
@ -476,7 +288,7 @@ static std::pair<bool, size_t> fileSegmentationEngineCSVImpl(ReadBuffer & in, DB
else if (*pos == '\n')
{
++number_of_rows;
if (memory.size() + static_cast<size_t>(pos - in.position()) >= min_chunk_size)
if (memory.size() + static_cast<size_t>(pos - in.position()) >= min_chunk_size && number_of_rows >= min_rows)
need_more_data = false;
++pos;
if (loadAtPosition(in, memory, pos) && *pos == '\r')
@ -484,7 +296,7 @@ static std::pair<bool, size_t> fileSegmentationEngineCSVImpl(ReadBuffer & in, DB
}
else if (*pos == '\r')
{
if (memory.size() + static_cast<size_t>(pos - in.position()) >= min_chunk_size)
if (memory.size() + static_cast<size_t>(pos - in.position()) >= min_chunk_size && number_of_rows >= min_rows)
need_more_data = false;
++pos;
if (loadAtPosition(in, memory, pos) && *pos == '\n')
@ -502,8 +314,16 @@ static std::pair<bool, size_t> fileSegmentationEngineCSVImpl(ReadBuffer & in, DB
void registerFileSegmentationEngineCSV(FormatFactory & factory)
{
factory.registerFileSegmentationEngine("CSV", &fileSegmentationEngineCSVImpl);
factory.registerFileSegmentationEngine("CSVWithNames", &fileSegmentationEngineCSVImpl);
auto register_func = [&](const String & format_name, bool with_names, bool with_types)
{
size_t min_rows = 1 + int(with_names) + int(with_types);
factory.registerFileSegmentationEngine(format_name, [min_rows](ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size)
{
return fileSegmentationEngineCSVImpl(in, memory, min_chunk_size, min_rows);
});
};
registerWithNamesAndTypes("CSV", register_func);
}
}
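The new file segmentation registration passes min_rows = 1 + with_names + with_types into fileSegmentationEngineCSVImpl. Here is a tiny standalone sketch of that cut-off condition (illustrative function, not ClickHouse code): a parallel-parsing segment may only end once it is large enough and holds at least min_rows rows, so a segment carrying the name/type header rows also carries at least one data row.

    #include <cstddef>

    static bool segmentMayEnd(size_t bytes_in_segment, size_t rows_in_segment,
                              size_t min_chunk_size, bool with_names, bool with_types)
    {
        const size_t min_rows = 1 + static_cast<size_t>(with_names) + static_cast<size_t>(with_types);
        return bytes_in_segment >= min_chunk_size && rows_in_segment >= min_rows;
    }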

View File

@ -4,7 +4,7 @@
#include <unordered_map>
#include <Core/Block.h>
#include <Processors/Formats/RowInputFormatWithDiagnosticInfo.h>
#include <Processors/Formats/RowInputFormatWithNamesAndTypes.h>
#include <Formats/FormatSettings.h>
@ -14,41 +14,44 @@ namespace DB
/** A stream for inputting data in csv format.
* Does not conform with https://tools.ietf.org/html/rfc4180 because it skips spaces and tabs between values.
*/
class CSVRowInputFormat : public RowInputFormatWithDiagnosticInfo
class CSVRowInputFormat : public RowInputFormatWithNamesAndTypes
{
public:
/** with_names - the first line is a header with column names
* with_types - the next line is a header with type names
*/
CSVRowInputFormat(const Block & header_, ReadBuffer & in_, const Params & params_,
bool with_names_, const FormatSettings & format_settings_);
bool with_names_, bool with_types_, const FormatSettings & format_settings_);
String getName() const override { return "CSVRowInputFormat"; }
bool readRow(MutableColumns & columns, RowReadExtension & ext) override;
void readPrefix() override;
bool allowSyncAfterError() const override { return true; }
void syncAfterError() override;
void resetParser() override;
private:
/// These fields are computed in the constructor.
bool with_names;
const FormatSettings format_settings;
DataTypes data_types;
using IndexesMap = std::unordered_map<String, size_t>;
IndexesMap column_indexes_by_names;
void addInputColumn(const String & column_name);
void setupAllColumnsByTableSchema();
bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) override;
void tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column) override;
bool parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) override;
bool parseRowEndWithDiagnosticInfo(WriteBuffer & out) override;
bool isGarbageAfterField(size_t, ReadBuffer::Position pos) override
{
return *pos != '\n' && *pos != '\r' && *pos != format_settings.csv.delimiter && *pos != ' ' && *pos != '\t';
}
bool readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column);
bool readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String & column_name) override;
void skipField(size_t /*file_column*/) override { skipField(); }
void skipField();
void skipHeaderRow();
void skipNames() override { skipHeaderRow(); }
void skipTypes() override { skipHeaderRow(); }
void skipFieldDelimiter() override;
void skipRowEndDelimiter() override;
std::vector<String> readHeaderRow();
std::vector<String> readNames() override { return readHeaderRow(); }
std::vector<String> readTypes() override { return readHeaderRow(); }
String readFieldIntoString();
};
}

View File

@ -1,5 +1,6 @@
#include <Processors/Formats/Impl/CSVRowOutputFormat.h>
#include <Formats/FormatFactory.h>
#include <Formats/registerWithNamesAndTypes.h>
#include <IO/WriteHelpers.h>
@ -8,8 +9,8 @@ namespace DB
{
CSVRowOutputFormat::CSVRowOutputFormat(WriteBuffer & out_, const Block & header_, bool with_names_, const RowOutputFormatParams & params_, const FormatSettings & format_settings_)
: IRowOutputFormat(header_, out_, params_), with_names(with_names_), format_settings(format_settings_)
CSVRowOutputFormat::CSVRowOutputFormat(WriteBuffer & out_, const Block & header_, bool with_names_, bool with_types_, const RowOutputFormatParams & params_, const FormatSettings & format_settings_)
: IRowOutputFormat(header_, out_, params_), with_names(with_names_), with_types(with_types_), format_settings(format_settings_)
{
const auto & sample = getPort(PortKind::Main).getHeader();
size_t columns = sample.columns();
@ -18,25 +19,27 @@ CSVRowOutputFormat::CSVRowOutputFormat(WriteBuffer & out_, const Block & header_
data_types[i] = sample.safeGetByPosition(i).type;
}
void CSVRowOutputFormat::writeLine(const std::vector<String> & values)
{
for (size_t i = 0; i < values.size(); ++i)
{
writeCSVString(values[i], out);
if (i + 1 == values.size())
writeRowEndDelimiter();
else
writeFieldDelimiter();
}
}
void CSVRowOutputFormat::doWritePrefix()
{
const auto & sample = getPort(PortKind::Main).getHeader();
size_t columns = sample.columns();
if (with_names)
{
for (size_t i = 0; i < columns; ++i)
{
writeCSVString(sample.safeGetByPosition(i).name, out);
writeLine(sample.getNames());
char delimiter = format_settings.csv.delimiter;
if (i + 1 == columns)
delimiter = '\n';
writeChar(delimiter, out);
}
}
if (with_types)
writeLine(sample.getDataTypeNames());
}
@ -72,18 +75,20 @@ void CSVRowOutputFormat::writeBeforeExtremes()
void registerOutputFormatCSV(FormatFactory & factory)
{
for (bool with_names : {false, true})
auto register_func = [&](const String & format_name, bool with_names, bool with_types)
{
factory.registerOutputFormat(with_names ? "CSVWithNames" : "CSV", [=](
WriteBuffer & buf,
const Block & sample,
const RowOutputFormatParams & params,
const FormatSettings & format_settings)
factory.registerOutputFormat(format_name, [with_names, with_types](
WriteBuffer & buf,
const Block & sample,
const RowOutputFormatParams & params,
const FormatSettings & format_settings)
{
return std::make_shared<CSVRowOutputFormat>(buf, sample, with_names, params, format_settings);
return std::make_shared<CSVRowOutputFormat>(buf, sample, with_names, with_types, params, format_settings);
});
factory.markOutputFormatSupportsParallelFormatting(with_names ? "CSVWithNames" : "CSV");
}
factory.markOutputFormatSupportsParallelFormatting(format_name);
};
registerWithNamesAndTypes("CSV", register_func);
}
}
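With both with_names and with_types set, doWritePrefix() above emits two CSV header rows via writeLine() before any data. A small self-contained sketch of that prefix, assuming columns (id UInt32, s String) and the default ',' delimiter; escaping of quotes inside values is omitted here, unlike the real writeCSVString.

    #include <iostream>
    #include <string>
    #include <vector>

    static void writeCsvHeaderLine(const std::vector<std::string> & values, char delimiter)
    {
        for (size_t i = 0; i < values.size(); ++i)
        {
            std::cout << '"' << values[i] << '"';                      // header cells are always quoted
            std::cout << (i + 1 == values.size() ? '\n' : delimiter);  // row-end vs field delimiter
        }
    }

    int main()
    {
        writeCsvHeaderLine({"id", "s"}, ',');            // with_names row:  "id","s"
        writeCsvHeaderLine({"UInt32", "String"}, ',');   // with_types row:  "UInt32","String"
    }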

View File

@ -20,7 +20,7 @@ public:
/** with_names - output a header with column names on the first line
* with_types - output a header with type names on the next line
*/
CSVRowOutputFormat(WriteBuffer & out_, const Block & header_, bool with_names_, const RowOutputFormatParams & params_, const FormatSettings & format_settings_);
CSVRowOutputFormat(WriteBuffer & out_, const Block & header_, bool with_names_, bool with_types, const RowOutputFormatParams & params_, const FormatSettings & format_settings_);
String getName() const override { return "CSVRowOutputFormat"; }
@ -38,9 +38,11 @@ public:
return String("text/csv; charset=UTF-8; header=") + (with_names ? "present" : "absent");
}
protected:
private:
void writeLine(const std::vector<String> & values);
bool with_names;
bool with_types;
const FormatSettings format_settings;
DataTypes data_types;
};

View File

@ -185,7 +185,7 @@ void registerInputFormatJSONAsString(FormatFactory & factory)
void registerFileSegmentationEngineJSONAsString(FormatFactory & factory)
{
factory.registerFileSegmentationEngine("JSONAsString", &fileSegmentationEngineJSONEachRowImpl);
factory.registerFileSegmentationEngine("JSONAsString", &fileSegmentationEngineJSONEachRow);
}
void registerNonTrivialPrefixAndSuffixCheckerJSONAsString(FormatFactory & factory)

View File

@ -1,8 +1,11 @@
#include <IO/ReadHelpers.h>
#include <IO/ReadBufferFromString.h>
#include <Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.h>
#include <IO/ReadHelpers.h>
#include <IO/Operators.h>
#include <Formats/FormatFactory.h>
#include <Formats/verbosePrintString.h>
#include <Formats/JSONEachRowUtils.h>
#include <Formats/registerWithNamesAndTypes.h>
#include <DataTypes/NestedUtils.h>
#include <DataTypes/Serializations/SerializationNullable.h>
@ -12,183 +15,39 @@ namespace DB
namespace ErrorCodes
{
extern const int INCORRECT_DATA;
extern const int CANNOT_READ_ALL_DATA;
}
JSONCompactEachRowRowInputFormat::JSONCompactEachRowRowInputFormat(ReadBuffer & in_,
const Block & header_,
Params params_,
const FormatSettings & format_settings_,
bool with_names_,
bool yield_strings_)
: IRowInputFormat(header_, in_, std::move(params_)), format_settings(format_settings_), with_names(with_names_), yield_strings(yield_strings_)
JSONCompactEachRowRowInputFormat::JSONCompactEachRowRowInputFormat(
const Block & header_,
ReadBuffer & in_,
Params params_,
bool with_names_,
bool with_types_,
bool yield_strings_,
const FormatSettings & format_settings_)
: RowInputFormatWithNamesAndTypes(header_, in_, std::move(params_), with_names_, with_types_, format_settings_)
, yield_strings(yield_strings_)
{
const auto & sample = getPort().getHeader();
size_t num_columns = sample.columns();
data_types.resize(num_columns);
column_indexes_by_names.reserve(num_columns);
for (size_t i = 0; i < num_columns; ++i)
{
const auto & column_info = sample.getByPosition(i);
data_types[i] = column_info.type;
column_indexes_by_names.emplace(column_info.name, i);
}
}
void JSONCompactEachRowRowInputFormat::resetParser()
void JSONCompactEachRowRowInputFormat::skipRowStartDelimiter()
{
IRowInputFormat::resetParser();
column_indexes_for_input_fields.clear();
not_seen_columns.clear();
}
void JSONCompactEachRowRowInputFormat::readPrefix()
{
/// In this format, BOM at beginning of stream cannot be confused with value, so it is safe to skip it.
skipBOMIfExists(*in);
if (with_names)
{
size_t num_columns = getPort().getHeader().columns();
read_columns.assign(num_columns, false);
assertChar('[', *in);
do
{
skipWhitespaceIfAny(*in);
String column_name;
readJSONString(column_name, *in);
addInputColumn(column_name);
skipWhitespaceIfAny(*in);
}
while (checkChar(',', *in));
assertChar(']', *in);
skipEndOfLine();
/// Type checking
assertChar('[', *in);
for (size_t i = 0; i < column_indexes_for_input_fields.size(); ++i)
{
skipWhitespaceIfAny(*in);
String data_type;
readJSONString(data_type, *in);
if (column_indexes_for_input_fields[i] &&
data_types[*column_indexes_for_input_fields[i]]->getName() != data_type)
{
throw Exception(
"Type of '" + getPort().getHeader().getByPosition(*column_indexes_for_input_fields[i]).name
+ "' must be " + data_types[*column_indexes_for_input_fields[i]]->getName() +
", not " + data_type,
ErrorCodes::INCORRECT_DATA
);
}
if (i != column_indexes_for_input_fields.size() - 1)
assertChar(',', *in);
skipWhitespaceIfAny(*in);
}
assertChar(']', *in);
}
else
{
size_t num_columns = getPort().getHeader().columns();
read_columns.assign(num_columns, true);
column_indexes_for_input_fields.resize(num_columns);
for (size_t i = 0; i < num_columns; ++i)
{
column_indexes_for_input_fields[i] = i;
}
}
for (size_t i = 0; i < read_columns.size(); ++i)
{
if (!read_columns[i])
{
not_seen_columns.emplace_back(i);
}
}
}
void JSONCompactEachRowRowInputFormat::addInputColumn(const String & column_name)
{
names_of_columns.emplace_back(column_name);
const auto column_it = column_indexes_by_names.find(column_name);
if (column_it == column_indexes_by_names.end())
{
if (format_settings.skip_unknown_fields)
{
column_indexes_for_input_fields.push_back(std::nullopt);
return;
}
throw Exception(
"Unknown field found in JSONCompactEachRow header: '" + column_name + "' " +
"at position " + std::to_string(column_indexes_for_input_fields.size()) +
"\nSet the 'input_format_skip_unknown_fields' parameter explicitly to ignore and proceed",
ErrorCodes::INCORRECT_DATA
);
}
const auto column_index = column_it->second;
if (read_columns[column_index])
throw Exception("Duplicate field found while parsing JSONCompactEachRow header: " + column_name, ErrorCodes::INCORRECT_DATA);
read_columns[column_index] = true;
column_indexes_for_input_fields.emplace_back(column_index);
}
bool JSONCompactEachRowRowInputFormat::readRow(DB::MutableColumns &columns, DB::RowReadExtension &ext)
{
skipEndOfLine();
if (in->eof())
return false;
size_t num_columns = columns.size();
read_columns.assign(num_columns, false);
skipWhitespaceIfAny(*in);
assertChar('[', *in);
for (size_t file_column = 0; file_column < column_indexes_for_input_fields.size(); ++file_column)
{
const auto & table_column = column_indexes_for_input_fields[file_column];
if (table_column)
{
readField(*table_column, columns);
}
else
{
skipJSONField(*in, StringRef(names_of_columns[file_column]));
}
}
skipWhitespaceIfAny(*in);
if (in->eof())
throw ParsingException("Unexpected end of stream while parsing JSONCompactEachRow format", ErrorCodes::CANNOT_READ_ALL_DATA);
if (file_column + 1 != column_indexes_for_input_fields.size())
{
assertChar(',', *in);
skipWhitespaceIfAny(*in);
}
}
void JSONCompactEachRowRowInputFormat::skipFieldDelimiter()
{
skipWhitespaceIfAny(*in);
assertChar(',', *in);
}
void JSONCompactEachRowRowInputFormat::skipRowEndDelimiter()
{
skipWhitespaceIfAny(*in);
assertChar(']', *in);
for (const auto & name : not_seen_columns)
columns[name]->insertDefault();
ext.read_columns = read_columns;
return true;
}
void JSONCompactEachRowRowInputFormat::skipEndOfLine()
{
skipWhitespaceIfAny(*in);
if (!in->eof() && (*in->position() == ',' || *in->position() == ';'))
++in->position();
@ -196,39 +55,55 @@ void JSONCompactEachRowRowInputFormat::skipEndOfLine()
skipWhitespaceIfAny(*in);
}
void JSONCompactEachRowRowInputFormat::readField(size_t index, MutableColumns & columns)
String JSONCompactEachRowRowInputFormat::readFieldIntoString()
{
try
skipWhitespaceIfAny(*in);
String field;
readJSONString(field, *in);
return field;
}
void JSONCompactEachRowRowInputFormat::skipField(size_t file_column)
{
skipWhitespaceIfAny(*in);
skipJSONField(*in, column_mapping->names_of_columns[file_column]);
}
void JSONCompactEachRowRowInputFormat::skipHeaderRow()
{
skipRowStartDelimiter();
size_t i = 0;
do
{
read_columns[index] = true;
const auto & type = data_types[index];
const auto & serialization = serializations[index];
if (yield_strings)
{
String str;
readJSONString(str, *in);
ReadBufferFromString buf(str);
if (format_settings.null_as_default && !type->isNullable())
read_columns[index] = SerializationNullable::deserializeWholeTextImpl(*columns[index], buf, format_settings, serialization);
else
serialization->deserializeWholeText(*columns[index], buf, format_settings);
}
else
{
if (format_settings.null_as_default && !type->isNullable())
read_columns[index] = SerializationNullable::deserializeTextJSONImpl(*columns[index], *in, format_settings, serialization);
else
serialization->deserializeTextJSON(*columns[index], *in, format_settings);
}
if (i >= column_mapping->names_of_columns.size())
throw Exception(ErrorCodes::INCORRECT_DATA, "The number of columns in a row differs from the number of column names");
skipField(i++);
skipWhitespaceIfAny(*in);
}
catch (Exception & e)
while (checkChar(',', *in));
skipRowEndDelimiter();
}
std::vector<String> JSONCompactEachRowRowInputFormat::readHeaderRow()
{
skipRowStartDelimiter();
std::vector<String> fields;
do
{
e.addMessage("(while reading the value of key " + getPort().getHeader().getByPosition(index).name + ")");
throw;
fields.push_back(readFieldIntoString());
skipWhitespaceIfAny(*in);
}
while (checkChar(',', *in));
skipRowEndDelimiter();
return fields;
}
bool JSONCompactEachRowRowInputFormat::readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool /*is_last_file_column*/, const String & column_name)
{
skipWhitespaceIfAny(*in);
return readFieldImpl(*in, column, type, serialization, column_name, format_settings, yield_strings);
}
void JSONCompactEachRowRowInputFormat::syncAfterError()
@ -236,43 +111,112 @@ void JSONCompactEachRowRowInputFormat::syncAfterError()
skipToUnescapedNextLineOrEOF(*in);
}
bool JSONCompactEachRowRowInputFormat::parseRowStartWithDiagnosticInfo(WriteBuffer & out)
{
skipWhitespaceIfAny(*in);
if (!checkChar('[', *in))
{
out << "ERROR: There is no '[' before the row.\n";
return false;
}
return true;
}
bool JSONCompactEachRowRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out)
{
try
{
skipWhitespaceIfAny(*in);
assertChar(',', *in);
}
catch (const DB::Exception &)
{
if (*in->position() == ']')
{
out << "ERROR: Closing parenthesis (']') found where comma is expected."
" It's like your file has less columns than expected.\n"
"And if your file has the right number of columns, maybe it has unescaped quotes in values.\n";
}
else
{
out << "ERROR: There is no comma. ";
verbosePrintString(in->position(), in->position() + 1, out);
out << " found instead.\n";
}
return false;
}
return true;
}
bool JSONCompactEachRowRowInputFormat::parseRowEndWithDiagnosticInfo(WriteBuffer & out)
{
skipWhitespaceIfAny(*in);
if (in->eof())
{
out << "ERROR: Unexpected end of file. ']' expected at the end of row.";
return false;
}
if (!checkChar(']', *in))
{
out << "ERROR: There is no closing parenthesis (']') at the end of the row. ";
verbosePrintString(in->position(), in->position() + 1, out);
out << " found instead.\n";
return false;
}
skipWhitespaceIfAny(*in);
if (in->eof())
return true;
if ((*in->position() == ',' || *in->position() == ';'))
++in->position();
skipWhitespaceIfAny(*in);
return true;
}
void registerInputFormatJSONCompactEachRow(FormatFactory & factory)
{
factory.registerInputFormat("JSONCompactEachRow", [](
ReadBuffer & buf,
const Block & sample,
IRowInputFormat::Params params,
const FormatSettings & settings)
for (bool yield_strings : {true, false})
{
return std::make_shared<JSONCompactEachRowRowInputFormat>(buf, sample, std::move(params), settings, false, false);
});
auto register_func = [&](const String & format_name, bool with_names, bool with_types)
{
factory.registerInputFormat(format_name, [with_names, with_types, yield_strings](
ReadBuffer & buf,
const Block & sample,
IRowInputFormat::Params params,
const FormatSettings & settings)
{
return std::make_shared<JSONCompactEachRowRowInputFormat>(sample, buf, std::move(params), with_names, with_types, yield_strings, settings);
});
};
factory.registerInputFormat("JSONCompactEachRowWithNamesAndTypes", [](
ReadBuffer & buf,
const Block & sample,
IRowInputFormat::Params params,
const FormatSettings & settings)
{
return std::make_shared<JSONCompactEachRowRowInputFormat>(buf, sample, std::move(params), settings, true, false);
});
registerWithNamesAndTypes(yield_strings ? "JSONCompactStringsEachRow" : "JSONCompactEachRow", register_func);
}
}
factory.registerInputFormat("JSONCompactStringsEachRow", [](
ReadBuffer & buf,
const Block & sample,
IRowInputFormat::Params params,
const FormatSettings & settings)
void registerFileSegmentationEngineJSONCompactEachRow(FormatFactory & factory)
{
auto register_func = [&](const String & format_name, bool with_names, bool with_types)
{
return std::make_shared<JSONCompactEachRowRowInputFormat>(buf, sample, std::move(params), settings, false, true);
});
/// When the first one or two rows contain names and/or types, we need to read
/// at least one more row of actual data, so set the minimum number of rows for
/// the segmentation engine according to the with_names and with_types parameters.
size_t min_rows = 1 + int(with_names) + int(with_types);
factory.registerFileSegmentationEngine(format_name, [min_rows](ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size)
{
return fileSegmentationEngineJSONCompactEachRow(in, memory, min_chunk_size, min_rows);
});
};
factory.registerInputFormat("JSONCompactStringsEachRowWithNamesAndTypes", [](
ReadBuffer & buf,
const Block & sample,
IRowInputFormat::Params params,
const FormatSettings & settings)
{
return std::make_shared<JSONCompactEachRowRowInputFormat>(buf, sample, std::move(params), settings, true, true);
});
registerWithNamesAndTypes("JSONCompactEachRow", register_func);
registerWithNamesAndTypes("JSONCompactStringsEachRow", register_func);
}
}
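readFieldImpl() comes from Formats/JSONEachRowUtils and is not shown in this diff; judging by the removed readField() above, the yield_strings flag decides whether a value arrives as a bare JSON token or wrapped in a JSON string that has to be re-parsed as whole text. A self-contained illustration of that distinction, using plain std parsing rather than the ClickHouse serializations:

    #include <cassert>
    #include <string>

    static long parseField(const std::string & token, bool yield_strings)
    {
        if (yield_strings)
        {
            // JSONCompactStringsEachRow: the token is a JSON string such as "\"42\"";
            // strip the quotes and parse the payload as whole text.
            assert(token.size() >= 2 && token.front() == '"' && token.back() == '"');
            return std::stol(token.substr(1, token.size() - 2));
        }
        // JSONCompactEachRow: the token is a bare JSON value such as 42.
        return std::stol(token);
    }

    int main()
    {
        assert(parseField("42", /*yield_strings=*/ false) == 42);
        assert(parseField("\"42\"", /*yield_strings=*/ true) == 42);
    }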

View File

@ -1,7 +1,7 @@
#pragma once
#include <Core/Block.h>
#include <Processors/Formats/IRowInputFormat.h>
#include <Processors/Formats/RowInputFormatWithNamesAndTypes.h>
#include <Formats/FormatSettings.h>
#include <Common/HashTable/HashMap.h>
@ -17,49 +17,47 @@ class ReadBuffer;
* - JSONCompactStringsEachRowWithNamesAndTypes
*
*/
class JSONCompactEachRowRowInputFormat : public IRowInputFormat
class JSONCompactEachRowRowInputFormat : public RowInputFormatWithNamesAndTypes
{
public:
JSONCompactEachRowRowInputFormat(
ReadBuffer & in_,
const Block & header_,
ReadBuffer & in_,
Params params_,
const FormatSettings & format_settings_,
bool with_names_,
bool yield_strings_);
bool with_types_,
bool yield_strings_,
const FormatSettings & format_settings_);
String getName() const override { return "JSONCompactEachRowRowInputFormat"; }
void readPrefix() override;
bool readRow(MutableColumns & columns, RowReadExtension & ext) override;
bool allowSyncAfterError() const override { return true; }
void syncAfterError() override;
void resetParser() override;
private:
void addInputColumn(const String & column_name);
void skipEndOfLine();
void readField(size_t index, MutableColumns & columns);
bool parseRowStartWithDiagnosticInfo(WriteBuffer & out) override;
bool parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) override;
bool parseRowEndWithDiagnosticInfo(WriteBuffer & out) override;
bool isGarbageAfterField(size_t, ReadBuffer::Position pos) override
{
return *pos != ',' && *pos != ']' && *pos != ' ' && *pos != '\t';
}
const FormatSettings format_settings;
bool readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String & column_name) override;
using IndexesMap = std::unordered_map<String, size_t>;
IndexesMap column_indexes_by_names;
void skipField(size_t file_column) override;
void skipHeaderRow();
void skipNames() override { skipHeaderRow(); }
void skipTypes() override { skipHeaderRow(); }
void skipRowStartDelimiter() override;
void skipFieldDelimiter() override;
void skipRowEndDelimiter() override;
using OptionalIndexes = std::vector<std::optional<size_t>>;
OptionalIndexes column_indexes_for_input_fields;
std::vector<String> readHeaderRow();
std::vector<String> readNames() override { return readHeaderRow(); }
std::vector<String> readTypes() override { return readHeaderRow(); }
String readFieldIntoString();
DataTypes data_types;
std::vector<UInt8> read_columns;
std::vector<size_t> not_seen_columns;
/// This is for the correct exceptions in skipping unknown fields.
std::vector<String> names_of_columns;
/// For *WithNamesAndTypes formats.
bool with_names;
/// For JSONCompactString* formats.
bool yield_strings;
};

View File

@ -2,6 +2,7 @@
#include <IO/WriteBufferValidUTF8.h>
#include <Processors/Formats/Impl/JSONCompactEachRowRowOutputFormat.h>
#include <Formats/FormatFactory.h>
#include <Formats/registerWithNamesAndTypes.h>
namespace DB
@ -13,12 +14,10 @@ JSONCompactEachRowRowOutputFormat::JSONCompactEachRowRowOutputFormat(WriteBuffer
const RowOutputFormatParams & params_,
const FormatSettings & settings_,
bool with_names_,
bool with_types_,
bool yield_strings_)
: IRowOutputFormat(header_, out_, params_), settings(settings_), with_names(with_names_), yield_strings(yield_strings_)
: IRowOutputFormat(header_, out_, params_), settings(settings_), with_names(with_names_), with_types(with_types_), yield_strings(yield_strings_)
{
const auto & sample = getPort(PortKind::Main).getHeader();
NamesAndTypesList columns(sample.getNamesAndTypesList());
fields.assign(columns.begin(), columns.end());
}
@ -57,39 +56,40 @@ void JSONCompactEachRowRowOutputFormat::writeTotals(const Columns & columns, siz
{
writeChar('\n', out);
size_t num_columns = columns.size();
writeChar('[', out);
writeRowStartDelimiter();
for (size_t i = 0; i < num_columns; ++i)
{
if (i != 0)
JSONCompactEachRowRowOutputFormat::writeFieldDelimiter();
writeFieldDelimiter();
JSONCompactEachRowRowOutputFormat::writeField(*columns[i], *serializations[i], row_num);
writeField(*columns[i], *serializations[i], row_num);
}
writeCString("]\n", out);
writeRowEndDelimiter();
}
void JSONCompactEachRowRowOutputFormat::writeLine(const std::vector<String> & values)
{
writeRowStartDelimiter();
for (size_t i = 0; i < values.size(); ++i)
{
writeChar('\"', out);
writeString(values[i], out);
writeChar('\"', out);
if (i != values.size() - 1)
writeFieldDelimiter();
}
writeRowEndDelimiter();
}
void JSONCompactEachRowRowOutputFormat::doWritePrefix()
{
const auto & header = getPort(PortKind::Main).getHeader();
if (with_names)
{
writeChar('[', out);
for (size_t i = 0; i < fields.size(); ++i)
{
writeChar('\"', out);
writeString(fields[i].name, out);
writeChar('\"', out);
if (i != fields.size() - 1)
writeCString(", ", out);
}
writeCString("]\n[", out);
for (size_t i = 0; i < fields.size(); ++i)
{
writeJSONString(fields[i].type->getName(), out, settings);
if (i != fields.size() - 1)
writeCString(", ", out);
}
writeCString("]\n", out);
}
writeLine(header.getNames());
if (with_types)
writeLine(header.getDataTypeNames());
}
void JSONCompactEachRowRowOutputFormat::consumeTotals(DB::Chunk chunk)
@ -100,45 +100,24 @@ void JSONCompactEachRowRowOutputFormat::consumeTotals(DB::Chunk chunk)
void registerOutputFormatJSONCompactEachRow(FormatFactory & factory)
{
factory.registerOutputFormat("JSONCompactEachRow", [](
WriteBuffer & buf,
const Block & sample,
const RowOutputFormatParams & params,
const FormatSettings & format_settings)
for (bool yield_strings : {false, true})
{
return std::make_shared<JSONCompactEachRowRowOutputFormat>(buf, sample, params, format_settings, false, false);
});
factory.markOutputFormatSupportsParallelFormatting("JSONCompactEachRow");
auto register_func = [&](const String & format_name, bool with_names, bool with_types)
{
factory.registerOutputFormat(format_name, [yield_strings, with_names, with_types](
WriteBuffer & buf,
const Block & sample,
const RowOutputFormatParams & params,
const FormatSettings & format_settings)
{
return std::make_shared<JSONCompactEachRowRowOutputFormat>(buf, sample, params, format_settings, with_names, with_types, yield_strings);
});
factory.registerOutputFormat("JSONCompactEachRowWithNamesAndTypes", [](
WriteBuffer &buf,
const Block &sample,
const RowOutputFormatParams & params,
const FormatSettings &format_settings)
{
return std::make_shared<JSONCompactEachRowRowOutputFormat>(buf, sample, params, format_settings, true, false);
});
factory.markOutputFormatSupportsParallelFormatting("JSONCompactEachRowWithNamesAndTypes");
factory.markOutputFormatSupportsParallelFormatting(format_name);
};
factory.registerOutputFormat("JSONCompactStringsEachRow", [](
WriteBuffer & buf,
const Block & sample,
const RowOutputFormatParams & params,
const FormatSettings & format_settings)
{
return std::make_shared<JSONCompactEachRowRowOutputFormat>(buf, sample, params, format_settings, false, true);
});
factory.markOutputFormatSupportsParallelFormatting("JSONCompactStringsEachRow");
factory.registerOutputFormat("JSONCompactStringsEachRowWithNamesAndTypes", [](
WriteBuffer &buf,
const Block &sample,
const RowOutputFormatParams & params,
const FormatSettings &format_settings)
{
return std::make_shared<JSONCompactEachRowRowOutputFormat>(buf, sample, params, format_settings, true, true);
});
factory.markOutputFormatSupportsParallelFormatting("JSONCompactStringsEachRowWithNamesAndTypes");
registerWithNamesAndTypes(yield_strings ? "JSONCompactStringsEachRow" : "JSONCompactEachRow", register_func);
}
}
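For the WithNames/WithNamesAndTypes variants, doWritePrefix() above uses writeLine() to emit the column names and type names as JSON arrays of quoted strings, one per line, before the data rows. A minimal sketch of those header lines, assuming columns (id UInt32, s String):

    #include <iostream>
    #include <string>
    #include <vector>

    static void writeHeaderLine(const std::vector<std::string> & values)
    {
        std::cout << '[';
        for (size_t i = 0; i < values.size(); ++i)
        {
            std::cout << '"' << values[i] << '"';
            if (i + 1 != values.size())
                std::cout << ", ";   // this format's field delimiter
        }
        std::cout << "]\n";
    }

    int main()
    {
        writeHeaderLine({"id", "s"});            // with_names row:  ["id", "s"]
        writeHeaderLine({"UInt32", "String"});   // with_types row:  ["UInt32", "String"]
    }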

View File

@ -21,15 +21,14 @@ public:
const RowOutputFormatParams & params_,
const FormatSettings & settings_,
bool with_names_,
bool with_types_,
bool yield_strings_);
String getName() const override { return "JSONCompactEachRowRowOutputFormat"; }
void doWritePrefix() override;
void writeBeforeTotals() override {}
void writeTotals(const Columns & columns, size_t row_num) override;
void writeAfterTotals() override {}
void writeField(const IColumn & column, const ISerialization & serialization, size_t row_num) override;
void writeFieldDelimiter() override;
@ -42,11 +41,11 @@ protected:
void consumeExtremes(Chunk) override {}
private:
void writeLine(const std::vector<String> & values);
FormatSettings settings;
NamesAndTypes fields;
bool with_names;
bool with_types;
bool yield_strings;
};
}

View File

@ -136,37 +136,10 @@ void JSONEachRowRowInputFormat::readField(size_t index, MutableColumns & columns
if (seen_columns[index])
throw Exception("Duplicate field found while parsing JSONEachRow format: " + columnName(index), ErrorCodes::INCORRECT_DATA);
try
{
seen_columns[index] = read_columns[index] = true;
const auto & type = getPort().getHeader().getByPosition(index).type;
const auto & serialization = serializations[index];
if (yield_strings)
{
String str;
readJSONString(str, *in);
ReadBufferFromString buf(str);
if (format_settings.null_as_default && !type->isNullable())
read_columns[index] = SerializationNullable::deserializeWholeTextImpl(*columns[index], buf, format_settings, serialization);
else
serialization->deserializeWholeText(*columns[index], buf, format_settings);
}
else
{
if (format_settings.null_as_default && !type->isNullable())
read_columns[index] = SerializationNullable::deserializeTextJSONImpl(*columns[index], *in, format_settings, serialization);
else
serialization->deserializeTextJSON(*columns[index], *in, format_settings);
}
}
catch (Exception & e)
{
e.addMessage("(while reading the value of key " + columnName(index) + ")");
throw;
}
seen_columns[index] = true;
const auto & type = getPort().getHeader().getByPosition(index).type;
const auto & serialization = serializations[index];
read_columns[index] = readFieldImpl(*in, *columns[index], type, serialization, columnName(index), format_settings, yield_strings);
}
inline bool JSONEachRowRowInputFormat::advanceToNextKey(size_t key_index)
@ -282,8 +255,13 @@ bool JSONEachRowRowInputFormat::readRow(MutableColumns & columns, RowReadExtensi
if (!seen_columns[i])
header.getByPosition(i).type->insertDefaultInto(*columns[i]);
/// return info about defaults set
ext.read_columns = read_columns;
/// Return info about defaults set.
/// If defaults_for_omitted_fields is set to 0, we should just leave already inserted defaults.
if (format_settings.defaults_for_omitted_fields)
ext.read_columns = read_columns;
else
ext.read_columns.assign(read_columns.size(), true);
return true;
}
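A standalone sketch of the mask logic added above (plain std types, not the ClickHouse API): read_columns records which fields were actually present in the row, and when defaults_for_omitted_fields is disabled, absent columns are reported as read so the caller keeps the plain type defaults already inserted instead of evaluating the table's DEFAULT expressions.

    #include <cassert>
    #include <cstdint>
    #include <vector>

    static std::vector<uint8_t> reportReadColumns(const std::vector<uint8_t> & read_columns,
                                                  bool defaults_for_omitted_fields)
    {
        if (defaults_for_omitted_fields)
            return read_columns;                               // caller recomputes DEFAULTs for omitted columns
        return std::vector<uint8_t>(read_columns.size(), 1);   // pretend everything was read; keep type defaults
    }

    int main()
    {
        std::vector<uint8_t> read = {1, 0, 1};
        assert(reportReadColumns(read, true) == read);
        assert((reportReadColumns(read, false) == std::vector<uint8_t>{1, 1, 1}));
    }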
@ -355,8 +333,8 @@ void registerInputFormatJSONEachRow(FormatFactory & factory)
void registerFileSegmentationEngineJSONEachRow(FormatFactory & factory)
{
factory.registerFileSegmentationEngine("JSONEachRow", &fileSegmentationEngineJSONEachRowImpl);
factory.registerFileSegmentationEngine("JSONStringsEachRow", &fileSegmentationEngineJSONEachRowImpl);
factory.registerFileSegmentationEngine("JSONEachRow", &fileSegmentationEngineJSONEachRow);
factory.registerFileSegmentationEngine("JSONStringsEachRow", &fileSegmentationEngineJSONEachRow);
}
void registerNonTrivialPrefixAndSuffixCheckerJSONEachRow(FormatFactory & factory)

View File

@ -60,7 +60,7 @@ RegexpRowInputFormat::ColumnFormat RegexpRowInputFormat::stringToFormat(const St
bool RegexpRowInputFormat::readField(size_t index, MutableColumns & columns)
{
const auto & type = getPort().getHeader().getByPosition(index).type;
bool parse_as_nullable = format_settings.null_as_default && !type->isNullable();
bool parse_as_nullable = format_settings.null_as_default && !type->isNullable() && !type->isLowCardinalityNullable();
bool read = true;
ReadBuffer field_buf(const_cast<char *>(matched_fields[index].data()), matched_fields[index].size(), 0);
try
@ -94,9 +94,9 @@ bool RegexpRowInputFormat::readField(size_t index, MutableColumns & columns)
break;
case ColumnFormat::Raw:
if (parse_as_nullable)
read = SerializationNullable::deserializeWholeTextImpl(*columns[index], field_buf, format_settings, serialization);
read = SerializationNullable::deserializeTextRawImpl(*columns[index], field_buf, format_settings, serialization);
else
serialization->deserializeWholeText(*columns[index], field_buf, format_settings);
serialization->deserializeTextRaw(*columns[index], field_buf, format_settings);
break;
default:
break;

View File

@ -143,7 +143,7 @@ bool TSKVRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & ex
seen_columns[index] = read_columns[index] = true;
const auto & type = getPort().getHeader().getByPosition(index).type;
const auto & serialization = serializations[index];
if (format_settings.null_as_default && !type->isNullable())
if (format_settings.null_as_default && !type->isNullable() && !type->isLowCardinalityNullable())
read_columns[index] = SerializationNullable::deserializeTextEscapedImpl(*columns[index], *in, format_settings, serialization);
else
serialization->deserializeTextEscaped(*columns[index], *in, format_settings);
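The same tightening appears in the CSV and Regexp hunks above: a NULL may only be turned into a default value when the column type cannot hold NULL itself, and that check now also covers LowCardinality(Nullable(T)). A standalone sketch of the condition, using illustrative flags rather than the real type objects:

    #include <cassert>

    static bool parseAsNullable(bool null_as_default, bool is_nullable, bool is_low_cardinality_nullable)
    {
        return null_as_default && !is_nullable && !is_low_cardinality_nullable;
    }

    int main()
    {
        assert(parseAsNullable(true, false, false));   // Int32: NULL becomes the type default
        assert(!parseAsNullable(true, true, false));   // Nullable(Int32): NULL is stored as NULL
        assert(!parseAsNullable(true, false, true));   // LowCardinality(Nullable(Int32)): NULL is stored as NULL
    }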

View File

@ -8,7 +8,7 @@ namespace DB
{
TSKVRowOutputFormat::TSKVRowOutputFormat(WriteBuffer & out_, const Block & header, const RowOutputFormatParams & params_, const FormatSettings & format_settings_)
: TabSeparatedRowOutputFormat(out_, header, false, false, params_, format_settings_)
: TabSeparatedRowOutputFormat(out_, header, false, false, false, params_, format_settings_)
{
const auto & sample = getPort(PortKind::Main).getHeader();
NamesAndTypesList columns(sample.getNamesAndTypesList());

View File

@ -1,58 +0,0 @@
#pragma once
#include <Core/Block.h>
#include <Formats/FormatSettings.h>
#include <IO/ReadBufferFromString.h>
#include <Processors/Formats/RowInputFormatWithDiagnosticInfo.h>
namespace DB
{
/** A stream to input data in tsv format, but without escaping individual values.
* It only supports columns without '\n' or '\t'
*/
class TabSeparatedRawRowInputFormat : public TabSeparatedRowInputFormat
{
public:
/** with_names - the first line is the header with the names of the columns
* with_types - on the next line header with type names
*/
TabSeparatedRawRowInputFormat(
const Block & header_,
ReadBuffer & in_,
const Params & params_,
bool with_names_,
bool with_types_,
const FormatSettings & format_settings_)
: TabSeparatedRowInputFormat(header_, in_, params_, with_names_, with_types_, format_settings_)
{
}
String getName() const override { return "TabSeparatedRawRowInputFormat"; }
bool readField(IColumn & column, const DataTypePtr &, const SerializationPtr & serialization, bool) override
{
String tmp;
while (!in->eof())
{
char * pos = find_first_symbols<'\n', '\t'>(in->position(), in->buffer().end());
tmp.append(in->position(), pos - in->position());
in->position() = pos;
if (pos == in->buffer().end())
in->next();
else
break;
}
ReadBufferFromString cell(tmp);
serialization->deserializeWholeText(column, cell, format_settings);
return true;
}
};
}

View File

@ -1,35 +0,0 @@
#pragma once
#include <Formats/FormatSettings.h>
#include <Processors/Formats/Impl/TabSeparatedRowOutputFormat.h>
namespace DB
{
/** A stream for outputting data in tsv format, but without escaping individual values.
* (That is, the output is irreversible.)
*/
class TabSeparatedRawRowOutputFormat : public TabSeparatedRowOutputFormat
{
public:
TabSeparatedRawRowOutputFormat(
WriteBuffer & out_,
const Block & header_,
bool with_names_,
bool with_types_,
const RowOutputFormatParams & params_,
const FormatSettings & format_settings_)
: TabSeparatedRowOutputFormat(out_, header_, with_names_, with_types_, params_, format_settings_)
{
}
String getName() const override { return "TabSeparatedRawRowOutputFormat"; }
void writeField(const IColumn & column, const ISerialization & serialization, size_t row_num) override
{
serialization.serializeText(column, row_num, out, format_settings);
}
};
}
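The two deleted headers above carried the "raw" TSV behaviour: a field is everything up to the next tab or newline, with no unescaping on input and no escaping on output (the logic presumably moves behind the *TextRaw serializations seen in the Regexp hunk). A self-contained sketch of that field-extraction rule:

    #include <cassert>
    #include <string>

    // Return the raw field starting at pos: everything up to the next '\t' or '\n', kept verbatim.
    static std::string readRawTsvField(const std::string & buf, size_t & pos)
    {
        size_t end = buf.find_first_of("\t\n", pos);
        if (end == std::string::npos)
            end = buf.size();
        std::string field = buf.substr(pos, end - pos);
        pos = end;   // the caller consumes the delimiter itself
        return field;
    }

    int main()
    {
        std::string row = "a\\tb\tsecond\n";   // the backslash-t stays literal: raw means no unescaping
        size_t pos = 0;
        assert(readRawTsvField(row, pos) == "a\\tb");
    }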

View File

@ -1,13 +1,11 @@
#include <IO/ReadHelpers.h>
#include <IO/WriteBufferFromString.h>
#include <IO/BufferWithOwnMemory.h>
#include <IO/Operators.h>
#include <IO/BufferWithOwnMemory.h>
#include <Processors/Formats/Impl/TabSeparatedRowInputFormat.h>
#include <Processors/Formats/Impl/TabSeparatedRawRowInputFormat.h>
#include <Formats/verbosePrintString.h>
#include <Formats/FormatFactory.h>
#include <DataTypes/DataTypeNothing.h>
#include <Formats/registerWithNamesAndTypes.h>
#include <DataTypes/DataTypeLowCardinality.h>
#include <DataTypes/Serializations/SerializationNullable.h>
@ -20,19 +18,6 @@ namespace ErrorCodes
extern const int LOGICAL_ERROR;
}
static void skipTSVRow(ReadBuffer & in, const size_t num_columns)
{
NullOutput null_sink;
for (size_t i = 0; i < num_columns; ++i)
{
readEscapedStringInto(null_sink, in);
assertChar(i == num_columns - 1 ? '\n' : '\t', in);
}
}
/** Check for a common error case - usage of Windows line feed.
*/
static void checkForCarriageReturn(ReadBuffer & in)
@ -45,188 +30,73 @@ static void checkForCarriageReturn(ReadBuffer & in)
ErrorCodes::INCORRECT_DATA);
}
TabSeparatedRowInputFormat::TabSeparatedRowInputFormat(const Block & header_, ReadBuffer & in_, const Params & params_,
bool with_names_, bool with_types_, const FormatSettings & format_settings_)
: RowInputFormatWithDiagnosticInfo(header_, in_, params_), with_names(with_names_), with_types(with_types_), format_settings(format_settings_)
TabSeparatedRowInputFormat::TabSeparatedRowInputFormat(
const Block & header_,
ReadBuffer & in_,
const Params & params_,
bool with_names_,
bool with_types_,
bool is_raw_,
const FormatSettings & format_settings_)
: RowInputFormatWithNamesAndTypes(header_, in_, params_, with_names_, with_types_, format_settings_), is_raw(is_raw_)
{
const auto & sample = getPort().getHeader();
size_t num_columns = sample.columns();
data_types.resize(num_columns);
column_indexes_by_names.reserve(num_columns);
for (size_t i = 0; i < num_columns; ++i)
{
const auto & column_info = sample.getByPosition(i);
data_types[i] = column_info.type;
column_indexes_by_names.emplace(column_info.name, i);
}
column_mapping->column_indexes_for_input_fields.reserve(num_columns);
column_mapping->read_columns.assign(num_columns, false);
}
void TabSeparatedRowInputFormat::setupAllColumnsByTableSchema()
void TabSeparatedRowInputFormat::skipFieldDelimiter()
{
const auto & header = getPort().getHeader();
column_mapping->read_columns.assign(header.columns(), true);
column_mapping->column_indexes_for_input_fields.resize(header.columns());
for (size_t i = 0; i < column_mapping->column_indexes_for_input_fields.size(); ++i)
column_mapping->column_indexes_for_input_fields[i] = i;
assertChar('\t', *in);
}
void TabSeparatedRowInputFormat::addInputColumn(const String & column_name)
{
const auto column_it = column_indexes_by_names.find(column_name);
if (column_it == column_indexes_by_names.end())
{
if (format_settings.skip_unknown_fields)
{
column_mapping->column_indexes_for_input_fields.push_back(std::nullopt);
return;
}
throw Exception(
"Unknown field found in TSV header: '" + column_name + "' " +
"at position " + std::to_string(column_mapping->column_indexes_for_input_fields.size()) +
"\nSet the 'input_format_skip_unknown_fields' parameter explicitly to ignore and proceed",
ErrorCodes::INCORRECT_DATA
);
}
const auto column_index = column_it->second;
if (column_mapping->read_columns[column_index])
throw Exception("Duplicate field found while parsing TSV header: " + column_name, ErrorCodes::INCORRECT_DATA);
column_mapping->read_columns[column_index] = true;
column_mapping->column_indexes_for_input_fields.emplace_back(column_index);
}
void TabSeparatedRowInputFormat::fillUnreadColumnsWithDefaults(MutableColumns & columns, RowReadExtension & row_read_extension)
{
/// It is safe to memorize this on the first run - the format guarantees this does not change
if (unlikely(row_num == 1))
{
columns_to_fill_with_default_values.clear();
for (size_t index = 0; index < column_mapping->read_columns.size(); ++index)
if (column_mapping->read_columns[index] == 0)
columns_to_fill_with_default_values.push_back(index);
}
for (const auto column_index : columns_to_fill_with_default_values)
{
data_types[column_index]->insertDefaultInto(*columns[column_index]);
row_read_extension.read_columns[column_index] = false;
}
}
void TabSeparatedRowInputFormat::readPrefix()
{
if (with_names || with_types || data_types.at(0)->textCanContainOnlyValidUTF8())
{
/// In this format, we assume that column name or type cannot contain BOM,
/// so, if format has header,
/// then BOM at beginning of stream cannot be confused with name or type of field, and it is safe to skip it.
skipBOMIfExists(*in);
}
/// This is a bit of abstraction leakage, but we have almost the same code in other places.
/// Thus, we check if this InputFormat is working with the "real" beginning of the data in case of parallel parsing.
if (with_names && getCurrentUnitNumber() == 0)
{
if (format_settings.with_names_use_header)
{
String column_name;
for (;;)
{
readEscapedString(column_name, *in);
if (!checkChar('\t', *in))
{
/// Check last column for \r before adding it, otherwise an error will be:
/// "Unknown field found in TSV header"
checkForCarriageReturn(*in);
addInputColumn(column_name);
break;
}
else
addInputColumn(column_name);
}
if (!in->eof())
{
assertChar('\n', *in);
}
}
else
{
setupAllColumnsByTableSchema();
skipTSVRow(*in, column_mapping->column_indexes_for_input_fields.size());
}
}
else if (!column_mapping->is_set)
setupAllColumnsByTableSchema();
if (with_types)
{
skipTSVRow(*in, column_mapping->column_indexes_for_input_fields.size());
}
}
bool TabSeparatedRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & ext)
void TabSeparatedRowInputFormat::skipRowEndDelimiter()
{
if (in->eof())
return false;
return;
updateDiagnosticInfo();
if (unlikely(row_num <= 1))
checkForCarriageReturn(*in);
ext.read_columns.assign(column_mapping->read_columns.size(), true);
for (size_t file_column = 0; file_column < column_mapping->column_indexes_for_input_fields.size(); ++file_column)
{
const auto & column_index = column_mapping->column_indexes_for_input_fields[file_column];
const bool is_last_file_column = file_column + 1 == column_mapping->column_indexes_for_input_fields.size();
if (column_index)
{
const auto & type = data_types[*column_index];
ext.read_columns[*column_index] = readField(*columns[*column_index], type, serializations[*column_index], is_last_file_column);
}
else
{
NullOutput null_sink;
readEscapedStringInto(null_sink, *in);
}
/// skip separators
if (file_column + 1 < column_mapping->column_indexes_for_input_fields.size())
{
assertChar('\t', *in);
}
else if (!in->eof())
{
if (unlikely(row_num == 1))
checkForCarriageReturn(*in);
assertChar('\n', *in);
}
}
fillUnreadColumnsWithDefaults(columns, ext);
return true;
assertChar('\n', *in);
}
String TabSeparatedRowInputFormat::readFieldIntoString()
{
String field;
readEscapedString(field, *in);
return field;
}
void TabSeparatedRowInputFormat::skipField()
{
NullOutput null_sink;
readEscapedStringInto(null_sink, *in);
}
void TabSeparatedRowInputFormat::skipHeaderRow()
{
do
{
skipField();
}
while (checkChar('\t', *in));
skipRowEndDelimiter();
}
std::vector<String> TabSeparatedRowInputFormat::readHeaderRow()
{
std::vector<String> fields;
do
{
fields.push_back(readFieldIntoString());
}
while (checkChar('\t', *in));
skipRowEndDelimiter();
return fields;
}
bool TabSeparatedRowInputFormat::readField(IColumn & column, const DataTypePtr & type,
const SerializationPtr & serialization, bool is_last_file_column)
const SerializationPtr & serialization, bool is_last_file_column, const String & /*column_name*/)
{
const bool at_delimiter = !is_last_file_column && !in->eof() && *in->position() == '\t';
const bool at_last_column_line_end = is_last_file_column && (in->eof() || *in->position() == '\n');
@ -236,137 +106,110 @@ bool TabSeparatedRowInputFormat::readField(IColumn & column, const DataTypePtr &
column.insertDefault();
return false;
}
else if (format_settings.null_as_default && !type->isNullable())
bool as_nullable = format_settings.null_as_default && !type->isNullable() && !type->isLowCardinalityNullable();
if (is_raw)
{
if (as_nullable)
return SerializationNullable::deserializeTextRawImpl(column, *in, format_settings, serialization);
serialization->deserializeTextRaw(column, *in, format_settings);
return true;
}
if (as_nullable)
return SerializationNullable::deserializeTextEscapedImpl(column, *in, format_settings, serialization);
serialization->deserializeTextEscaped(column, *in, format_settings);
return true;
}
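The null_as_default branch above decides, per field, whether a literal \N in the input should populate the column with its default value instead of failing on a non-nullable type. A minimal illustrative sketch of that decision (not the actual SerializationNullable code; the helper name is invented, and it assumes the usual DB namespace plus IO/ReadHelpers.h):
// Hypothetical sketch: turn "\N" into a default value for a non-nullable
// column when null_as_default is enabled; otherwise parse the escaped field normally.
static bool readEscapedFieldOrDefault(
    IColumn & column, ReadBuffer & in,
    const FormatSettings & settings, const SerializationPtr & serialization)
{
    if (checkString("\\N", in))        /// NULL literal in escaped TSV
    {
        column.insertDefault();        /// substitute the column type's default
        return false;                  /// report that no real value was read
    }
    serialization->deserializeTextEscaped(column, in, settings);
    return true;
}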
bool TabSeparatedRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out)
bool TabSeparatedRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out)
{
for (size_t file_column = 0; file_column < column_mapping->column_indexes_for_input_fields.size(); ++file_column)
try
{
if (file_column == 0 && in->eof())
assertChar('\t', *in);
}
catch (const DB::Exception &)
{
if (*in->position() == '\n')
{
out << "<End of stream>\n";
return false;
out << "ERROR: Line feed found where tab is expected."
" It's like your file has less columns than expected.\n"
"And if your file has the right number of columns, "
"maybe it has an unescaped backslash in value before tab, which causes the tab to be escaped.\n";
}
if (column_mapping->column_indexes_for_input_fields[file_column].has_value())
else if (*in->position() == '\r')
{
const auto & header = getPort().getHeader();
size_t col_idx = column_mapping->column_indexes_for_input_fields[file_column].value();
if (!deserializeFieldAndPrintDiagnosticInfo(header.getByPosition(col_idx).name, data_types[col_idx], *columns[col_idx],
out, file_column))
return false;
out << "ERROR: Carriage return found where tab is expected.\n";
}
else
{
static const String skipped_column_str = "<SKIPPED COLUMN>";
static const DataTypePtr skipped_column_type = std::make_shared<DataTypeNothing>();
static const MutableColumnPtr skipped_column = skipped_column_type->createColumn();
if (!deserializeFieldAndPrintDiagnosticInfo(skipped_column_str, skipped_column_type, *skipped_column, out, file_column))
return false;
}
/// Delimiters
if (file_column + 1 == column_mapping->column_indexes_for_input_fields.size())
{
if (!in->eof())
{
try
{
assertChar('\n', *in);
}
catch (const DB::Exception &)
{
if (*in->position() == '\t')
{
out << "ERROR: Tab found where line feed is expected."
" It's like your file has more columns than expected.\n"
"And if your file has the right number of columns, maybe it has an unescaped tab in a value.\n";
}
else if (*in->position() == '\r')
{
out << "ERROR: Carriage return found where line feed is expected."
" It's like your file has DOS/Windows style line separators, that is illegal in TabSeparated format.\n";
}
else
{
out << "ERROR: There is no line feed. ";
verbosePrintString(in->position(), in->position() + 1, out);
out << " found instead.\n";
}
return false;
}
}
}
else
{
try
{
assertChar('\t', *in);
}
catch (const DB::Exception &)
{
if (*in->position() == '\n')
{
out << "ERROR: Line feed found where tab is expected."
" It's like your file has less columns than expected.\n"
"And if your file has the right number of columns, "
"maybe it has an unescaped backslash in value before tab, which causes the tab to be escaped.\n";
}
else if (*in->position() == '\r')
{
out << "ERROR: Carriage return found where tab is expected.\n";
}
else
{
out << "ERROR: There is no tab. ";
verbosePrintString(in->position(), in->position() + 1, out);
out << " found instead.\n";
}
return false;
}
out << "ERROR: There is no tab. ";
verbosePrintString(in->position(), in->position() + 1, out);
out << " found instead.\n";
}
return false;
}
return true;
}
void TabSeparatedRowInputFormat::tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column)
bool TabSeparatedRowInputFormat::parseRowEndWithDiagnosticInfo(WriteBuffer & out)
{
const auto & index = column_mapping->column_indexes_for_input_fields[file_column];
if (index)
{
bool can_be_parsed_as_null = removeLowCardinality(type)->isNullable();
if (in->eof())
return true;
// Check for a NULL value when the type is not nullable. For simplicity, don't cross the buffer boundary, so some cases may be missed.
if (!can_be_parsed_as_null && !in->eof())
try
{
assertChar('\n', *in);
}
catch (const DB::Exception &)
{
if (*in->position() == '\t')
{
if (*in->position() == '\\' && in->available() >= 2)
out << "ERROR: Tab found where line feed is expected."
" It's like your file has more columns than expected.\n"
"And if your file has the right number of columns, maybe it has an unescaped tab in a value.\n";
}
else if (*in->position() == '\r')
{
out << "ERROR: Carriage return found where line feed is expected."
" It's like your file has DOS/Windows style line separators, that is illegal in TabSeparated format.\n";
}
else
{
out << "ERROR: There is no line feed. ";
verbosePrintString(in->position(), in->position() + 1, out);
out << " found instead.\n";
}
return false;
}
return true;
}
void TabSeparatedRowInputFormat::checkNullValueForNonNullable(DataTypePtr type)
{
bool can_be_parsed_as_null = type->isNullable() || type->isLowCardinalityNullable() || format_settings.null_as_default;
// Check for a NULL value when the type is not nullable. For simplicity, don't cross the buffer boundary, so some cases may be missed.
if (!can_be_parsed_as_null && !in->eof())
{
if (*in->position() == '\\' && in->available() >= 2)
{
++in->position();
if (*in->position() == 'N')
{
++in->position();
if (*in->position() == 'N')
{
++in->position();
throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected NULL value of not Nullable type {}", type->getName());
}
else
{
--in->position();
}
throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected NULL value of not Nullable type {}", type->getName());
}
else
{
--in->position();
}
}
const bool is_last_file_column = file_column + 1 == column_mapping->column_indexes_for_input_fields.size();
readField(column, type, serializations[*index], is_last_file_column);
}
else
{
NullOutput null_sink;
readEscapedStringInto(null_sink, *in);
}
}
@ -375,67 +218,28 @@ void TabSeparatedRowInputFormat::syncAfterError()
skipToUnescapedNextLineOrEOF(*in);
}
void TabSeparatedRowInputFormat::resetParser()
{
RowInputFormatWithDiagnosticInfo::resetParser();
const auto & sample = getPort().getHeader();
column_mapping->read_columns.assign(sample.columns(), false);
column_mapping->column_indexes_for_input_fields.clear();
columns_to_fill_with_default_values.clear();
}
void registerInputFormatTabSeparated(FormatFactory & factory)
{
for (const auto * name : {"TabSeparated", "TSV"})
for (bool is_raw : {false, true})
{
factory.registerInputFormat(name, [](
ReadBuffer & buf,
const Block & sample,
IRowInputFormat::Params params,
const FormatSettings & settings)
auto register_func = [&](const String & format_name, bool with_names, bool with_types)
{
return std::make_shared<TabSeparatedRowInputFormat>(sample, buf, params, false, false, settings);
});
}
factory.registerInputFormat(format_name, [with_names, with_types, is_raw](
ReadBuffer & buf,
const Block & sample,
IRowInputFormat::Params params,
const FormatSettings & settings)
{
return std::make_shared<TabSeparatedRowInputFormat>(sample, buf, std::move(params), with_names, with_types, is_raw, settings);
});
};
for (const auto * name : {"TabSeparatedRaw", "TSVRaw"})
{
factory.registerInputFormat(name, [](
ReadBuffer & buf,
const Block & sample,
IRowInputFormat::Params params,
const FormatSettings & settings)
{
return std::make_shared<TabSeparatedRawRowInputFormat>(sample, buf, params, false, false, settings);
});
}
for (const auto * name : {"TabSeparatedWithNames", "TSVWithNames"})
{
factory.registerInputFormat(name, [](
ReadBuffer & buf,
const Block & sample,
IRowInputFormat::Params params,
const FormatSettings & settings)
{
return std::make_shared<TabSeparatedRowInputFormat>(sample, buf, params, true, false, settings);
});
}
for (const auto * name : {"TabSeparatedWithNamesAndTypes", "TSVWithNamesAndTypes"})
{
factory.registerInputFormat(name, [](
ReadBuffer & buf,
const Block & sample,
IRowInputFormat::Params params,
const FormatSettings & settings)
{
return std::make_shared<TabSeparatedRowInputFormat>(sample, buf, params, true, true, settings);
});
registerWithNamesAndTypes(is_raw ? "TabSeparatedRaw" : "TabSeparated", register_func);
registerWithNamesAndTypes(is_raw ? "TSVRaw" : "TSV", register_func);
}
}
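registerWithNamesAndTypes (from Formats/registerWithNamesAndTypes.h) is the helper that expands one base name into the plain, WithNames and WithNamesAndTypes variants. A plausible sketch of what it does, for orientation only (details of the real helper may differ):
// Sketch: invoke the registration callback for the base format and its two
// suffixed variants; the callback signature matches register_func above.
void registerWithNamesAndTypes(
    const String & base_format_name,
    std::function<void(const String & format_name, bool with_names, bool with_types)> register_func)
{
    register_func(base_format_name, false, false);
    register_func(base_format_name + "WithNames", true, false);
    register_func(base_format_name + "WithNamesAndTypes", true, true);
}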
static std::pair<bool, size_t> fileSegmentationEngineTabSeparatedImpl(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size)
static std::pair<bool, size_t> fileSegmentationEngineTabSeparatedImpl(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size, bool is_raw, size_t min_rows)
{
bool need_more_data = true;
char * pos = in.position();
@ -443,13 +247,18 @@ static std::pair<bool, size_t> fileSegmentationEngineTabSeparatedImpl(ReadBuffer
while (loadAtPosition(in, memory, pos) && need_more_data)
{
pos = find_first_symbols<'\\', '\r', '\n'>(pos, in.buffer().end());
if (is_raw)
pos = find_first_symbols<'\r', '\n'>(pos, in.buffer().end());
else
pos = find_first_symbols<'\\', '\r', '\n'>(pos, in.buffer().end());
if (pos > in.buffer().end())
throw Exception("Position in buffer is out of bounds. There must be a bug.", ErrorCodes::LOGICAL_ERROR);
else if (pos == in.buffer().end())
throw Exception("Position in buffer is out of bounds. There must be a bug.", ErrorCodes::LOGICAL_ERROR);
if (pos == in.buffer().end())
continue;
else if (*pos == '\\')
if (!is_raw && *pos == '\\')
{
++pos;
if (loadAtPosition(in, memory, pos))
@ -460,7 +269,7 @@ static std::pair<bool, size_t> fileSegmentationEngineTabSeparatedImpl(ReadBuffer
if (*pos == '\n')
++number_of_rows;
if (memory.size() + static_cast<size_t>(pos - in.position()) >= min_chunk_size)
if ((memory.size() + static_cast<size_t>(pos - in.position()) >= min_chunk_size) && number_of_rows >= min_rows)
need_more_data = false;
++pos;
}
@ -473,11 +282,29 @@ static std::pair<bool, size_t> fileSegmentationEngineTabSeparatedImpl(ReadBuffer
void registerFileSegmentationEngineTabSeparated(FormatFactory & factory)
{
// We can use the same segmentation engine for TSKV.
for (const auto & name : {"TabSeparated", "TSV", "TSKV", "TabSeparatedWithNames", "TSVWithNames"})
for (bool is_raw : {false, true})
{
factory.registerFileSegmentationEngine(name, &fileSegmentationEngineTabSeparatedImpl);
auto register_func = [&](const String & format_name, bool with_names, bool with_types)
{
size_t min_rows = 1 + int(with_names) + int(with_types);
factory.registerFileSegmentationEngine(format_name, [is_raw, min_rows](ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size)
{
return fileSegmentationEngineTabSeparatedImpl(in, memory, min_chunk_size, is_raw, min_rows);
});
};
registerWithNamesAndTypes(is_raw ? "TSVRaw" : "TSV", register_func);
registerWithNamesAndTypes(is_raw ? "TabSeparatedRaw" : "TabSeparated", register_func);
}
// We can use the same segmentation engine for TSKV.
factory.registerFileSegmentationEngine("TSKV", [](
ReadBuffer & in,
DB::Memory<> & memory,
size_t min_chunk_size)
{
return fileSegmentationEngineTabSeparatedImpl(in, memory, min_chunk_size, false, 1);
});
}
}
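The new min_rows argument makes the segmentation engine keep at least 1 + with_names + with_types rows in a chunk before it may be cut, presumably so that in parallel parsing the header row(s) never end up in a chunk without any data row. The resulting minimums per variant (illustrative helper, not part of the patch):
// Minimum rows per chunk: one data row plus the optional names row
// and the optional types row.
static size_t minRowsForVariant(bool with_names, bool with_types)
{
    return 1 + static_cast<size_t>(with_names) + static_cast<size_t>(with_types);
}
// TSV / TabSeparated        -> 1
// TSVWithNames              -> 2
// TSVWithNamesAndTypes      -> 3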

View File

@ -2,7 +2,7 @@
#include <Core/Block.h>
#include <Formats/FormatSettings.h>
#include <Processors/Formats/RowInputFormatWithDiagnosticInfo.h>
#include <Processors/Formats/RowInputFormatWithNamesAndTypes.h>
namespace DB
@ -10,46 +10,43 @@ namespace DB
/** A stream to input data in tsv format.
*/
class TabSeparatedRowInputFormat : public RowInputFormatWithDiagnosticInfo
class TabSeparatedRowInputFormat : public RowInputFormatWithNamesAndTypes
{
public:
/** with_names - the first line is the header with the names of the columns
* with_types - the next line is the header with type names
*/
TabSeparatedRowInputFormat(const Block & header_, ReadBuffer & in_, const Params & params_,
bool with_names_, bool with_types_, const FormatSettings & format_settings_);
bool with_names_, bool with_types_, bool is_raw, const FormatSettings & format_settings_);
String getName() const override { return "TabSeparatedRowInputFormat"; }
bool readRow(MutableColumns & columns, RowReadExtension &) override;
void readPrefix() override;
bool allowSyncAfterError() const override { return true; }
void syncAfterError() override;
void resetParser() override;
protected:
bool with_names;
bool with_types;
const FormatSettings format_settings;
virtual bool readField(IColumn & column, const DataTypePtr & type,
const SerializationPtr & serialization, bool is_last_file_column);
private:
DataTypes data_types;
bool is_raw;
using IndexesMap = std::unordered_map<String, size_t>;
IndexesMap column_indexes_by_names;
bool readField(IColumn & column, const DataTypePtr & type,
const SerializationPtr & serialization, bool is_last_file_column, const String & column_name) override;
std::vector<size_t> columns_to_fill_with_default_values;
void skipField(size_t /*file_column*/) override { skipField(); }
void skipField();
void skipHeaderRow();
void skipNames() override { skipHeaderRow(); }
void skipTypes() override { skipHeaderRow(); }
void skipFieldDelimiter() override;
void skipRowEndDelimiter() override;
void addInputColumn(const String & column_name);
void setupAllColumnsByTableSchema();
void fillUnreadColumnsWithDefaults(MutableColumns & columns, RowReadExtension & row_read_extension);
std::vector<String> readHeaderRow();
std::vector<String> readNames() override { return readHeaderRow(); }
std::vector<String> readTypes() override { return readHeaderRow(); }
String readFieldIntoString();
bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) override;
void tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column) override;
void checkNullValueForNonNullable(DataTypePtr type) override;
bool parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) override;
bool parseRowEndWithDiagnosticInfo(WriteBuffer & out) override;
bool isGarbageAfterField(size_t, ReadBuffer::Position pos) override { return *pos != '\n' && *pos != '\t'; }
};

View File

@ -1,6 +1,6 @@
#include <Processors/Formats/Impl/TabSeparatedRowOutputFormat.h>
#include <Processors/Formats/Impl/TabSeparatedRawRowOutputFormat.h>
#include <Formats/FormatFactory.h>
#include <Formats/registerWithNamesAndTypes.h>
#include <IO/WriteHelpers.h>
@ -11,41 +11,43 @@ TabSeparatedRowOutputFormat::TabSeparatedRowOutputFormat(
const Block & header_,
bool with_names_,
bool with_types_,
bool is_raw_,
const RowOutputFormatParams & params_,
const FormatSettings & format_settings_)
: IRowOutputFormat(header_, out_, params_), with_names(with_names_), with_types(with_types_), format_settings(format_settings_)
: IRowOutputFormat(header_, out_, params_), with_names(with_names_), with_types(with_types_), is_raw(is_raw_), format_settings(format_settings_)
{
}
void TabSeparatedRowOutputFormat::writeLine(const std::vector<String> & values)
{
for (size_t i = 0; i < values.size(); ++i)
{
writeEscapedString(values[i], out);
if (i + 1 == values.size())
writeRowEndDelimiter();
else
writeFieldDelimiter();
}
}
void TabSeparatedRowOutputFormat::doWritePrefix()
{
const auto & header = getPort(PortKind::Main).getHeader();
size_t columns = header.columns();
if (with_names)
{
for (size_t i = 0; i < columns; ++i)
{
writeEscapedString(header.safeGetByPosition(i).name, out);
writeChar(i == columns - 1 ? '\n' : '\t', out);
}
}
writeLine(header.getNames());
if (with_types)
{
for (size_t i = 0; i < columns; ++i)
{
writeEscapedString(header.safeGetByPosition(i).type->getName(), out);
writeChar(i == columns - 1 ? '\n' : '\t', out);
}
}
writeLine(header.getDataTypeNames());
}
void TabSeparatedRowOutputFormat::writeField(const IColumn & column, const ISerialization & serialization, size_t row_num)
{
serialization.serializeTextEscaped(column, row_num, out, format_settings);
if (is_raw)
serialization.serializeTextRaw(column, row_num, out, format_settings);
else
serialization.serializeTextEscaped(column, row_num, out, format_settings);
}
@ -75,56 +77,24 @@ void TabSeparatedRowOutputFormat::writeBeforeExtremes()
void registerOutputFormatTabSeparated(FormatFactory & factory)
{
for (const auto * name : {"TabSeparated", "TSV"})
for (bool is_raw : {false, true})
{
factory.registerOutputFormat(name, [](
WriteBuffer & buf,
const Block & sample,
const RowOutputFormatParams & params,
const FormatSettings & settings)
auto register_func = [&](const String & format_name, bool with_names, bool with_types)
{
return std::make_shared<TabSeparatedRowOutputFormat>(buf, sample, false, false, params, settings);
});
factory.markOutputFormatSupportsParallelFormatting(name);
}
factory.registerOutputFormat(format_name, [is_raw, with_names, with_types](
WriteBuffer & buf,
const Block & sample,
const RowOutputFormatParams & params,
const FormatSettings & settings)
{
return std::make_shared<TabSeparatedRowOutputFormat>(buf, sample, with_names, with_types, is_raw, params, settings);
});
for (const auto * name : {"TabSeparatedRaw", "TSVRaw"})
{
factory.registerOutputFormat(name, [](
WriteBuffer & buf,
const Block & sample,
const RowOutputFormatParams & params,
const FormatSettings & settings)
{
return std::make_shared<TabSeparatedRawRowOutputFormat>(buf, sample, false, false, params, settings);
});
factory.markOutputFormatSupportsParallelFormatting(name);
}
factory.markOutputFormatSupportsParallelFormatting(format_name);
};
for (const auto * name : {"TabSeparatedWithNames", "TSVWithNames"})
{
factory.registerOutputFormat(name, [](
WriteBuffer & buf,
const Block & sample,
const RowOutputFormatParams & params,
const FormatSettings & settings)
{
return std::make_shared<TabSeparatedRowOutputFormat>(buf, sample, true, false, params, settings);
});
factory.markOutputFormatSupportsParallelFormatting(name);
}
for (const auto * name : {"TabSeparatedWithNamesAndTypes", "TSVWithNamesAndTypes"})
{
factory.registerOutputFormat(name, [](
WriteBuffer & buf,
const Block & sample,
const RowOutputFormatParams & params,
const FormatSettings & settings)
{
return std::make_shared<TabSeparatedRowOutputFormat>(buf, sample, true, true, params, settings);
});
factory.markOutputFormatSupportsParallelFormatting(name);
registerWithNamesAndTypes(is_raw ? "TSVRaw" : "TSV", register_func);
registerWithNamesAndTypes(is_raw ? "TabSeparatedRaw" : "TabSeparated", register_func);
}
}

View File

@ -23,6 +23,7 @@ public:
const Block & header_,
bool with_names_,
bool with_types_,
bool is_raw_,
const RowOutputFormatParams & params_,
const FormatSettings & format_settings_);
@ -39,10 +40,13 @@ public:
/// https://www.iana.org/assignments/media-types/text/tab-separated-values
String getContentType() const override { return "text/tab-separated-values; charset=UTF-8"; }
protected:
private:
void writeLine(const std::vector<String> & values);
bool with_names;
bool with_types;
bool is_raw;
protected:
const FormatSettings format_settings;
};

View File

@ -130,7 +130,7 @@ void TemplateBlockOutputFormat::serializeField(const IColumn & column, const ISe
serialization.serializeTextXML(column, row_num, out, settings);
break;
case ColumnFormat::Raw:
serialization.serializeText(column, row_num, out, settings);
serialization.serializeTextRaw(column, row_num, out, settings);
break;
default:
__builtin_unreachable();

View File

@ -45,8 +45,8 @@ TemplateRowInputFormat::TemplateRowInputFormat(const Block & header_, ReadBuffer
}
else
{
if (format.formats[i] == ColumnFormat::Xml || format.formats[i] == ColumnFormat::Raw)
format.throwInvalidFormat("XML and Raw deserialization is not supported", i);
if (format.formats[i] == ColumnFormat::Xml)
format.throwInvalidFormat("XML deserialization is not supported", i);
}
}
@ -54,8 +54,8 @@ TemplateRowInputFormat::TemplateRowInputFormat(const Block & header_, ReadBuffer
std::vector<UInt8> column_in_format(header_.columns(), false);
for (size_t i = 0; i < row_format.columnsCount(); ++i)
{
if (row_format.formats[i] == ColumnFormat::Xml || row_format.formats[i] == ColumnFormat::Raw)
row_format.throwInvalidFormat("XML and Raw deserialization is not supported", i);
if (row_format.formats[i] == ColumnFormat::Xml)
row_format.throwInvalidFormat("XML deserialization is not supported", i);
if (row_format.format_idx_to_column_idx[i])
{
@ -194,7 +194,7 @@ bool TemplateRowInputFormat::deserializeField(const DataTypePtr & type,
{
ColumnFormat col_format = row_format.formats[file_column];
bool read = true;
bool parse_as_nullable = settings.null_as_default && !type->isNullable();
bool parse_as_nullable = settings.null_as_default && !type->isNullable() && !type->isLowCardinalityNullable();
try
{
switch (col_format)
@ -226,6 +226,12 @@ bool TemplateRowInputFormat::deserializeField(const DataTypePtr & type,
else
serialization->deserializeTextJSON(column, buf, settings);
break;
case ColumnFormat::Raw:
if (parse_as_nullable)
read = SerializationNullable::deserializeTextRawImpl(column, buf, settings, serialization);
else
serialization->deserializeTextRaw(column, buf, settings);
break;
default:
__builtin_unreachable();
}

View File

@ -180,7 +180,7 @@ bool ValuesBlockInputFormat::tryReadValue(IColumn & column, size_t column_idx)
bool read = true;
const auto & type = types[column_idx];
const auto & serialization = serializations[column_idx];
if (format_settings.null_as_default && !type->isNullable())
if (format_settings.null_as_default && !type->isNullable() && !type->isLowCardinalityNullable())
read = SerializationNullable::deserializeTextQuotedImpl(column, *buf, format_settings, serialization);
else
serialization->deserializeTextQuoted(column, *buf, format_settings);
@ -421,7 +421,7 @@ bool ValuesBlockInputFormat::parseExpression(IColumn & column, size_t column_idx
Field value = convertFieldToType(expression_value, type, value_raw.second.get());
/// Check that we are indeed allowed to insert a NULL.
if (value.isNull() && !type.isNullable())
if (value.isNull() && !type.isNullable() && !type.isLowCardinalityNullable())
{
if (format_settings.null_as_default)
{

View File

@ -0,0 +1,260 @@
#include <Processors/Formats/RowInputFormatWithNamesAndTypes.h>
#include <DataTypes/DataTypeNothing.h>
#include <IO/ReadHelpers.h>
#include <IO/Operators.h>
namespace DB
{
namespace ErrorCodes
{
extern const int INCORRECT_DATA;
}
RowInputFormatWithNamesAndTypes::RowInputFormatWithNamesAndTypes(
const Block & header_,
ReadBuffer & in_,
const Params & params_,
bool with_names_,
bool with_types_,
const FormatSettings & format_settings_)
: RowInputFormatWithDiagnosticInfo(header_, in_, params_), format_settings(format_settings_), with_names(with_names_), with_types(with_types_)
{
const auto & sample = getPort().getHeader();
size_t num_columns = sample.columns();
data_types.resize(num_columns);
column_indexes_by_names.reserve(num_columns);
for (size_t i = 0; i < num_columns; ++i)
{
const auto & column_info = sample.getByPosition(i);
data_types[i] = column_info.type;
column_indexes_by_names.emplace(column_info.name, i);
}
}
void RowInputFormatWithNamesAndTypes::setupAllColumnsByTableSchema()
{
const auto & header = getPort().getHeader();
column_mapping->column_indexes_for_input_fields.resize(header.columns());
column_mapping->names_of_columns = header.getNames();
for (size_t i = 0; i < column_mapping->column_indexes_for_input_fields.size(); ++i)
column_mapping->column_indexes_for_input_fields[i] = i;
}
void RowInputFormatWithNamesAndTypes::addInputColumn(const String & column_name, std::vector<bool> & read_columns)
{
column_mapping->names_of_columns.push_back(column_name);
const auto column_it = column_indexes_by_names.find(column_name);
if (column_it == column_indexes_by_names.end())
{
if (format_settings.skip_unknown_fields)
{
column_mapping->column_indexes_for_input_fields.push_back(std::nullopt);
return;
}
throw Exception(
ErrorCodes::INCORRECT_DATA,
"Unknown field found in {} header: '{}' at position {}\nSet the 'input_format_skip_unknown_fields' parameter explicitly to ignore and proceed",
getName(), column_name, column_mapping->column_indexes_for_input_fields.size());
}
const auto column_index = column_it->second;
if (read_columns[column_index])
throw Exception("Duplicate field found while parsing TSV header: " + column_name, ErrorCodes::INCORRECT_DATA);
read_columns[column_index] = true;
column_mapping->column_indexes_for_input_fields.emplace_back(column_index);
}
void RowInputFormatWithNamesAndTypes::readPrefix()
{
if (with_names || with_types || data_types.at(0)->textCanContainOnlyValidUTF8())
{
/// We assume that column name or type cannot contain BOM, so, if format has header,
/// then BOM at beginning of stream cannot be confused with name or type of field, and it is safe to skip it.
skipBOMIfExists(*in);
}
/// This is a bit of abstraction leakage, but we need it in parallel parsing:
/// we check if this InputFormat is working with the "real" beginning of the data.
if (with_names && getCurrentUnitNumber() == 0)
{
if (format_settings.with_names_use_header)
{
std::vector<bool> read_columns(data_types.size(), false);
auto column_names = readNames();
for (const auto & name : column_names)
addInputColumn(name, read_columns);
for (size_t i = 0; i != read_columns.size(); ++i)
{
if (!read_columns[i])
column_mapping->not_presented_columns.push_back(i);
}
}
else
{
setupAllColumnsByTableSchema();
skipNames();
}
}
else if (!column_mapping->is_set)
setupAllColumnsByTableSchema();
if (with_types && getCurrentUnitNumber() == 0)
{
if (format_settings.with_types_use_header)
{
auto types = readTypes();
if (types.size() != column_mapping->column_indexes_for_input_fields.size())
throw Exception(
ErrorCodes::INCORRECT_DATA,
"The number of data types differs from the number of column names in input data");
/// Check that types from input matches types from header.
for (size_t i = 0; i < types.size(); ++i)
{
if (column_mapping->column_indexes_for_input_fields[i] &&
data_types[*column_mapping->column_indexes_for_input_fields[i]]->getName() != types[i])
{
throw Exception(
ErrorCodes::INCORRECT_DATA,
"Type of '{}' must be {}, not {}",
getPort().getHeader().getByPosition(*column_mapping->column_indexes_for_input_fields[i]).name,
data_types[*column_mapping->column_indexes_for_input_fields[i]]->getName(), types[i]);
}
}
}
else
skipTypes();
}
}
void RowInputFormatWithNamesAndTypes::insertDefaultsForNotSeenColumns(MutableColumns & columns, RowReadExtension & ext)
{
for (auto index : column_mapping->not_presented_columns)
{
columns[index]->insertDefault();
ext.read_columns[index] = false;
}
}
bool RowInputFormatWithNamesAndTypes::readRow(MutableColumns & columns, RowReadExtension & ext)
{
if (in->eof())
return false;
updateDiagnosticInfo();
skipRowStartDelimiter();
ext.read_columns.resize(data_types.size());
for (size_t file_column = 0; file_column < column_mapping->column_indexes_for_input_fields.size(); ++file_column)
{
const auto & column_index = column_mapping->column_indexes_for_input_fields[file_column];
const bool is_last_file_column = file_column + 1 == column_mapping->column_indexes_for_input_fields.size();
if (column_index)
ext.read_columns[*column_index] = readField(
*columns[*column_index],
data_types[*column_index],
serializations[*column_index],
is_last_file_column,
column_mapping->names_of_columns[file_column]);
else
skipField(file_column);
if (!is_last_file_column)
skipFieldDelimiter();
}
skipRowEndDelimiter();
insertDefaultsForNotSeenColumns(columns, ext);
/// If defaults_for_omitted_fields is set to 0, we should leave already inserted defaults.
if (!format_settings.defaults_for_omitted_fields)
ext.read_columns.assign(ext.read_columns.size(), true);
return true;
}
void RowInputFormatWithNamesAndTypes::resetParser()
{
RowInputFormatWithDiagnosticInfo::resetParser();
column_mapping->column_indexes_for_input_fields.clear();
column_mapping->not_presented_columns.clear();
column_mapping->names_of_columns.clear();
}
void RowInputFormatWithNamesAndTypes::tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column)
{
const auto & index = column_mapping->column_indexes_for_input_fields[file_column];
if (index)
{
checkNullValueForNonNullable(type);
const bool is_last_file_column = file_column + 1 == column_mapping->column_indexes_for_input_fields.size();
readField(column, type, serializations[*index], is_last_file_column, column_mapping->names_of_columns[file_column]);
}
else
{
skipField(file_column);
}
}
bool RowInputFormatWithNamesAndTypes::parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out)
{
if (in->eof())
{
out << "<End of stream>\n";
return false;
}
if (!parseRowStartWithDiagnosticInfo(out))
return false;
for (size_t file_column = 0; file_column < column_mapping->column_indexes_for_input_fields.size(); ++file_column)
{
if (column_mapping->column_indexes_for_input_fields[file_column].has_value())
{
const auto & header = getPort().getHeader();
size_t col_idx = column_mapping->column_indexes_for_input_fields[file_column].value();
if (!deserializeFieldAndPrintDiagnosticInfo(header.getByPosition(col_idx).name, data_types[col_idx], *columns[col_idx], out, file_column))
return false;
}
else
{
static const String skipped_column_str = "<SKIPPED COLUMN>";
static const DataTypePtr skipped_column_type = std::make_shared<DataTypeNothing>();
static const MutableColumnPtr skipped_column = skipped_column_type->createColumn();
if (!deserializeFieldAndPrintDiagnosticInfo(skipped_column_str, skipped_column_type, *skipped_column, out, file_column))
return false;
}
/// Delimiters
if (file_column + 1 != column_mapping->column_indexes_for_input_fields.size())
{
if (!parseFieldDelimiterWithDiagnosticInfo(out))
return false;
}
}
return parseRowEndWithDiagnosticInfo(out);
}
void registerFileSegmentationEngineForFormatWithNamesAndTypes(
FormatFactory & factory, const String & base_format_name, FormatFactory::FileSegmentationEngine segmentation_engine)
{
factory.registerFileSegmentationEngine(base_format_name, segmentation_engine);
factory.registerFileSegmentationEngine(base_format_name + "WithNames", segmentation_engine);
factory.registerFileSegmentationEngine(base_format_name + "WithNamesAndTypes", segmentation_engine);
}
}

View File

@ -0,0 +1,85 @@
#pragma once
#include <Processors/Formats/RowInputFormatWithDiagnosticInfo.h>
#include <Formats/FormatSettings.h>
#include <Formats/FormatFactory.h>
namespace DB
{
/// Base class for input formats with -WithNames and -WithNamesAndTypes suffixes.
/// It accepts 2 parameters in constructor - with_names and with_types and implements
/// input format depending on them:
/// - if with_names is true, it will expect that the first row of data contains column
/// names. If the setting input_format_with_names_use_header is set to 1, column mapping
/// will be performed.
/// - if with_types is true, it will expect that the second row of data contains column
/// types. If the setting input_format_with_types_use_header is set to 1, the types from the input
/// will be compared with the types from the header.
/// It's important that this class first reads/skips names and only
/// then reads/skips types, so you can rely on this invariant.
class RowInputFormatWithNamesAndTypes : public RowInputFormatWithDiagnosticInfo
{
public:
/** with_names - in the first line the header with column names
* with_types - the second line is the header with type names
*/
RowInputFormatWithNamesAndTypes(
const Block & header_,
ReadBuffer & in_,
const Params & params_,
bool with_names_, bool with_types_, const FormatSettings & format_settings_);
bool readRow(MutableColumns & columns, RowReadExtension & ext) override;
void readPrefix() override;
void resetParser() override;
protected:
/// Read single field from input. Return false if there was no real value and we inserted default value.
virtual bool readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String & column_name) = 0;
/// Skip single field, it's used to skip unknown columns.
virtual void skipField(size_t file_column) = 0;
/// Skip the whole row with names.
virtual void skipNames() = 0;
/// Skip the whole row with types.
virtual void skipTypes() = 0;
/// Skip delimiters, if any.
virtual void skipRowStartDelimiter() {}
virtual void skipFieldDelimiter() {}
virtual void skipRowEndDelimiter() {}
/// Methods for parsing with diagnostic info.
virtual void checkNullValueForNonNullable(DataTypePtr) {}
virtual bool parseRowStartWithDiagnosticInfo(WriteBuffer &) { return true; }
virtual bool parseFieldDelimiterWithDiagnosticInfo(WriteBuffer &) { return true; }
virtual bool parseRowEndWithDiagnosticInfo(WriteBuffer &) { return true;}
bool isGarbageAfterField(size_t, ReadBuffer::Position) override {return false; }
/// Read row with names and return the list of them.
virtual std::vector<String> readNames() = 0;
/// Read row with types and return the list of them.
virtual std::vector<String> readTypes() = 0;
const FormatSettings format_settings;
DataTypes data_types;
private:
bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) override;
void tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column) override;
void setupAllColumnsByTableSchema();
void addInputColumn(const String & column_name, std::vector<bool> & read_columns);
void insertDefaultsForNotSeenColumns(MutableColumns & columns, RowReadExtension & ext);
bool with_names;
bool with_types;
std::unordered_map<String, size_t> column_indexes_by_names;
};
void registerFileSegmentationEngineForFormatWithNamesAndTypes(
FormatFactory & factory, const String & base_format_name, FormatFactory::FileSegmentationEngine segmentation_engine);
}
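To illustrate how this base class is meant to be subclassed, here is a minimal, hypothetical pipe-separated format; names and details are invented and escaping is simplified, but it shows which virtuals a concrete format provides (assumes <IO/ReadHelpers.h> for readEscapedString / readEscapedStringInto / NullOutput / assertChar / checkChar):
// Hypothetical example, not part of the patch.
class PipeSeparatedRowInputFormat : public RowInputFormatWithNamesAndTypes
{
public:
    PipeSeparatedRowInputFormat(const Block & header_, ReadBuffer & in_, const Params & params_,
                                bool with_names_, bool with_types_, const FormatSettings & settings_)
        : RowInputFormatWithNamesAndTypes(header_, in_, params_, with_names_, with_types_, settings_) {}
    String getName() const override { return "PipeSeparatedRowInputFormat"; }
private:
    bool readField(IColumn & column, const DataTypePtr & /*type*/, const SerializationPtr & serialization,
                   bool /*is_last_file_column*/, const String & /*column_name*/) override
    {
        serialization->deserializeTextEscaped(column, *in, format_settings);
        return true;
    }
    void skipField(size_t /*file_column*/) override { NullOutput sink; readEscapedStringInto(sink, *in); }
    void skipNames() override { skipHeaderRow(); }
    void skipTypes() override { skipHeaderRow(); }
    void skipFieldDelimiter() override { assertChar('|', *in); }
    void skipRowEndDelimiter() override { assertChar('\n', *in); }
    std::vector<String> readNames() override { return readHeaderRow(); }
    std::vector<String> readTypes() override { return readHeaderRow(); }
    void skipHeaderRow() { do { skipField(0); } while (checkChar('|', *in)); skipRowEndDelimiter(); }
    std::vector<String> readHeaderRow()
    {
        std::vector<String> fields;
        do
        {
            String field;
            readEscapedString(field, *in);
            fields.push_back(field);
        } while (checkChar('|', *in));
        skipRowEndDelimiter();
        return fields;
    }
};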

View File

@ -218,7 +218,7 @@ bool IStorage::isStaticStorage() const
return false;
}
BackupEntries IStorage::backup(const ASTs &, ContextPtr) const
BackupEntries IStorage::backup(const ASTs &, ContextPtr)
{
throw Exception("Table engine " + getName() + " doesn't support backups", ErrorCodes::NOT_IMPLEMENTED);
}

View File

@ -205,7 +205,7 @@ public:
NameDependencies getDependentViewsByColumn(ContextPtr context) const;
/// Prepares entries to backup data of the storage.
virtual BackupEntries backup(const ASTs & partitions, ContextPtr context) const;
virtual BackupEntries backup(const ASTs & partitions, ContextPtr context);
/// Extract data from the backup and put it to the storage.
virtual RestoreDataTasks restoreFromBackup(const BackupPtr & backup, const String & data_path_in_backup, const ASTs & partitions, ContextMutablePtr context);

View File

@ -333,8 +333,9 @@ const KeyCondition::AtomMap KeyCondition::atom_map
[] (RPNElement & out, const Field &)
{
out.function = RPNElement::FUNCTION_IS_NULL;
// When using NULL_LAST, isNull means [+Inf, +Inf]
out.range = Range(Field(POSITIVE_INFINITY));
// isNull means +Inf (NULLS_LAST) or -Inf (NULLS_FIRST),
// which is equivalent to not in Range (-Inf, +Inf)
out.range = Range();
return true;
}
}
@ -2002,7 +2003,10 @@ BoolMask KeyCondition::checkInHyperrectangle(
/// No need to apply monotonic functions as nulls are kept.
bool intersects = element.range.intersectsRange(*key_range);
bool contains = element.range.containsRange(*key_range);
rpn_stack.emplace_back(intersects, !contains);
if (element.function == RPNElement::FUNCTION_IS_NULL)
rpn_stack.back() = !rpn_stack.back();
}
else if (
element.function == RPNElement::FUNCTION_IN_SET

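The two hunks above change how IS NULL is evaluated against the primary key: the atom now stores the whole non-NULL range (-Inf, +Inf) and the mask computed for that range is negated, so the result no longer depends on whether NULLs sort first or last. A rough illustration of the negation step, assuming BoolMask negation simply swaps its two flags:
// Illustrative only: IS NULL as the complement of "value lies in the
// non-NULL range (-Inf, +Inf)".
struct BoolMask
{
    bool can_be_true;
    bool can_be_false;
    BoolMask operator!() const { return {can_be_false, can_be_true}; }   /// assumed semantics
};
BoolMask checkIsNull(bool intersects, bool contains)
{
    BoolMask in_non_null_range{intersects, !contains};   /// the generic range check above
    return !in_non_null_range;                           /// IS NULL == NOT (in non-NULL range)
}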
View File

@ -3500,7 +3500,7 @@ Pipe MergeTreeData::alterPartition(
}
BackupEntries MergeTreeData::backup(const ASTs & partitions, ContextPtr local_context) const
BackupEntries MergeTreeData::backup(const ASTs & partitions, ContextPtr local_context)
{
DataPartsVector data_parts;
if (partitions.empty())
@ -3522,7 +3522,7 @@ BackupEntries MergeTreeData::backupDataParts(const DataPartsVector & data_parts)
auto temp_dir_it = temp_dirs.find(disk);
if (temp_dir_it == temp_dirs.end())
temp_dir_it = temp_dirs.emplace(disk, std::make_shared<TemporaryFileOnDisk>(disk, "tmp_backup_")).first;
temp_dir_it = temp_dirs.emplace(disk, std::make_shared<TemporaryFileOnDisk>(disk, "tmp/backup_")).first;
auto temp_dir_owner = temp_dir_it->second;
fs::path temp_dir = temp_dir_owner->getPath();

View File

@ -630,7 +630,7 @@ public:
TableLockHolder & table_lock_holder);
/// Prepares entries to backup data of the storage.
BackupEntries backup(const ASTs & partitions, ContextPtr context) const override;
BackupEntries backup(const ASTs & partitions, ContextPtr context) override;
static BackupEntries backupDataParts(const DataPartsVector & data_parts);
/// Extract data from the backup and put it to the storage.

View File

@ -12,6 +12,7 @@
#include <IO/ReadHelpers.h>
#include <IO/WriteBufferFromFileBase.h>
#include <IO/WriteHelpers.h>
#include <IO/copyData.h>
#include <Compression/CompressedReadBuffer.h>
#include <Compression/CompressedWriteBuffer.h>
@ -28,6 +29,11 @@
#include <QueryPipeline/Pipe.h>
#include <Processors/Sinks/SinkToStorage.h>
#include <Backups/BackupEntryFromImmutableFile.h>
#include <Backups/BackupEntryFromSmallFile.h>
#include <Backups/IBackup.h>
#include <Disks/TemporaryFileOnDisk.h>
#include <cassert>
#include <chrono>
@ -47,6 +53,7 @@ namespace ErrorCodes
extern const int SIZES_OF_MARKS_FILES_ARE_INCONSISTENT;
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int INCORRECT_FILE_NAME;
extern const int NOT_IMPLEMENTED;
}
/// NOTE: The lock `StorageLog::rwlock` is NOT kept locked while reading,
@ -880,6 +887,162 @@ IStorage::ColumnSizeByName StorageLog::getColumnSizes() const
}
BackupEntries StorageLog::backup(const ASTs & partitions, ContextPtr context)
{
if (!partitions.empty())
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Table engine {} doesn't support partitions", getName());
auto lock_timeout = getLockTimeout(context);
loadMarks(lock_timeout);
ReadLock lock{rwlock, lock_timeout};
if (!lock)
throw Exception("Lock timeout exceeded", ErrorCodes::TIMEOUT_EXCEEDED);
if (!num_data_files || !file_checker.getFileSize(data_files[INDEX_WITH_REAL_ROW_COUNT].path))
return {};
auto temp_dir_owner = std::make_shared<TemporaryFileOnDisk>(disk, "tmp/backup_");
auto temp_dir = temp_dir_owner->getPath();
disk->createDirectories(temp_dir);
BackupEntries backup_entries;
/// *.bin
for (const auto & data_file : data_files)
{
/// We make a copy of the data file because it can be changed later in write() or in truncate().
String data_file_name = fileName(data_file.path);
String temp_file_path = temp_dir + "/" + data_file_name;
disk->copy(data_file.path, disk, temp_file_path);
backup_entries.emplace_back(
data_file_name,
std::make_unique<BackupEntryFromImmutableFile>(
disk, temp_file_path, file_checker.getFileSize(data_file.path), std::nullopt, temp_dir_owner));
}
/// __marks.mrk
if (use_marks_file)
{
/// We make a copy of the data file because it can be changed later in write() or in truncate().
String marks_file_name = fileName(marks_file_path);
String temp_file_path = temp_dir + "/" + marks_file_name;
disk->copy(marks_file_path, disk, temp_file_path);
backup_entries.emplace_back(
marks_file_name,
std::make_unique<BackupEntryFromImmutableFile>(
disk, temp_file_path, file_checker.getFileSize(marks_file_path), std::nullopt, temp_dir_owner));
}
/// sizes.json
String files_info_path = file_checker.getPath();
backup_entries.emplace_back(fileName(files_info_path), std::make_unique<BackupEntryFromSmallFile>(disk, files_info_path));
/// columns.txt
backup_entries.emplace_back(
"columns.txt", std::make_unique<BackupEntryFromMemory>(getInMemoryMetadata().getColumns().getAllPhysical().toString()));
/// count.txt
if (use_marks_file)
{
size_t num_rows = data_files[INDEX_WITH_REAL_ROW_COUNT].marks.empty() ? 0 : data_files[INDEX_WITH_REAL_ROW_COUNT].marks.back().rows;
backup_entries.emplace_back("count.txt", std::make_unique<BackupEntryFromMemory>(toString(num_rows)));
}
return backup_entries;
}
RestoreDataTasks StorageLog::restoreFromBackup(const BackupPtr & backup, const String & data_path_in_backup, const ASTs & partitions, ContextMutablePtr context)
{
if (!partitions.empty())
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Table engine {} doesn't support partitions", getName());
auto restore_task = [this, backup, data_path_in_backup, context]()
{
auto lock_timeout = getLockTimeout(context);
WriteLock lock{rwlock, lock_timeout};
if (!lock)
throw Exception("Lock timeout exceeded", ErrorCodes::TIMEOUT_EXCEEDED);
if (!num_data_files)
return;
/// Load the marks if not loaded yet. We have to do that now because we're going to update these marks.
loadMarks(lock);
/// If there were no files, save zero file sizes to be able to rollback in case of error.
saveFileSizes(lock);
try
{
/// Append data files.
for (const auto & data_file : data_files)
{
String file_path_in_backup = data_path_in_backup + fileName(data_file.path);
auto backup_entry = backup->read(file_path_in_backup);
auto in = backup_entry->getReadBuffer();
auto out = disk->writeFile(data_file.path, max_compress_block_size, WriteMode::Append);
copyData(*in, *out);
}
if (use_marks_file)
{
/// Append marks.
size_t num_extra_marks = 0;
String file_path_in_backup = data_path_in_backup + fileName(marks_file_path);
size_t file_size = backup->getSize(file_path_in_backup);
if (file_size % (num_data_files * sizeof(Mark)) != 0)
throw Exception("Size of marks file is inconsistent", ErrorCodes::SIZES_OF_MARKS_FILES_ARE_INCONSISTENT);
num_extra_marks = file_size / (num_data_files * sizeof(Mark));
size_t num_marks = data_files[0].marks.size();
for (auto & data_file : data_files)
data_file.marks.reserve(num_marks + num_extra_marks);
std::vector<size_t> old_data_sizes;
std::vector<size_t> old_num_rows;
old_data_sizes.resize(num_data_files);
old_num_rows.resize(num_data_files);
for (size_t i = 0; i != num_data_files; ++i)
{
old_data_sizes[i] = file_checker.getFileSize(data_files[i].path);
old_num_rows[i] = num_marks ? data_files[i].marks[num_marks - 1].rows : 0;
}
auto backup_entry = backup->read(file_path_in_backup);
auto marks_rb = backup_entry->getReadBuffer();
for (size_t i = 0; i != num_extra_marks; ++i)
{
for (size_t j = 0; j != num_data_files; ++j)
{
Mark mark;
mark.read(*marks_rb);
mark.rows += old_num_rows[j]; /// Adjust the number of rows.
mark.offset += old_data_sizes[j]; /// Adjust the offset.
data_files[j].marks.push_back(mark);
}
}
}
/// Finish writing.
saveMarks(lock);
saveFileSizes(lock);
}
catch (...)
{
/// Rollback partial writes.
file_checker.repair();
removeUnsavedMarks(lock);
throw;
}
};
return {restore_task};
}
void registerStorageLog(StorageFactory & factory)
{
StorageFactory::StorageFeatures features{

View File

@ -51,6 +51,9 @@ public:
bool supportsSubcolumns() const override { return true; }
ColumnSizeByName getColumnSizes() const override;
BackupEntries backup(const ASTs & partitions, ContextPtr context) override;
RestoreDataTasks restoreFromBackup(const BackupPtr & backup, const String & data_path_in_backup, const ASTs & partitions, ContextMutablePtr context) override;
protected:
/** Attach the table with the appropriate name, along the appropriate path (with / at the end),
* (the correctness of names and paths is not verified)

View File

@ -9,10 +9,12 @@
#include <Common/Exception.h>
#include <IO/WriteBufferFromFileBase.h>
#include <Compression/CompressedReadBuffer.h>
#include <Compression/CompressedReadBufferFromFile.h>
#include <Compression/CompressedWriteBuffer.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <IO/copyData.h>
#include <Formats/NativeReader.h>
#include <Formats/NativeWriter.h>
@ -33,6 +35,13 @@
#include <Processors/Sinks/SinkToStorage.h>
#include <QueryPipeline/Pipe.h>
#include <Backups/BackupEntryFromImmutableFile.h>
#include <Backups/BackupEntryFromSmallFile.h>
#include <Backups/IBackup.h>
#include <Disks/TemporaryFileOnDisk.h>
#include <base/insertAtEnd.h>
#include <cassert>
@ -44,6 +53,7 @@ namespace ErrorCodes
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int INCORRECT_FILE_NAME;
extern const int TIMEOUT_EXCEEDED;
extern const int NOT_IMPLEMENTED;
}
@ -481,6 +491,134 @@ void StorageStripeLog::saveFileSizes(const WriteLock & /* already locked for wri
}
BackupEntries StorageStripeLog::backup(const ASTs & partitions, ContextPtr context)
{
if (!partitions.empty())
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Table engine {} doesn't support partitions", getName());
auto lock_timeout = getLockTimeout(context);
loadIndices(lock_timeout);
ReadLock lock{rwlock, lock_timeout};
if (!lock)
throw Exception("Lock timeout exceeded", ErrorCodes::TIMEOUT_EXCEEDED);
if (!file_checker.getFileSize(data_file_path))
return {};
auto temp_dir_owner = std::make_shared<TemporaryFileOnDisk>(disk, "tmp/backup_");
auto temp_dir = temp_dir_owner->getPath();
disk->createDirectories(temp_dir);
BackupEntries backup_entries;
/// data.bin
{
/// We make a copy of the data file because it can be changed later in write() or in truncate().
String data_file_name = fileName(data_file_path);
String temp_file_path = temp_dir + "/" + data_file_name;
disk->copy(data_file_path, disk, temp_file_path);
backup_entries.emplace_back(
data_file_name,
std::make_unique<BackupEntryFromImmutableFile>(
disk, temp_file_path, file_checker.getFileSize(data_file_path), std::nullopt, temp_dir_owner));
}
/// index.mrk
{
/// We make a copy of the data file because it can be changed later in write() or in truncate().
String index_file_name = fileName(index_file_path);
String temp_file_path = temp_dir + "/" + index_file_name;
disk->copy(index_file_path, disk, temp_file_path);
backup_entries.emplace_back(
index_file_name,
std::make_unique<BackupEntryFromImmutableFile>(
disk, temp_file_path, file_checker.getFileSize(index_file_path), std::nullopt, temp_dir_owner));
}
/// sizes.json
String files_info_path = file_checker.getPath();
backup_entries.emplace_back(fileName(files_info_path), std::make_unique<BackupEntryFromSmallFile>(disk, files_info_path));
/// columns.txt
backup_entries.emplace_back(
"columns.txt", std::make_unique<BackupEntryFromMemory>(getInMemoryMetadata().getColumns().getAllPhysical().toString()));
/// count.txt
size_t num_rows = 0;
for (const auto & block : indices.blocks)
num_rows += block.num_rows;
backup_entries.emplace_back("count.txt", std::make_unique<BackupEntryFromMemory>(toString(num_rows)));
return backup_entries;
}
RestoreDataTasks StorageStripeLog::restoreFromBackup(const BackupPtr & backup, const String & data_path_in_backup, const ASTs & partitions, ContextMutablePtr context)
{
if (!partitions.empty())
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Table engine {} doesn't support partitions", getName());
auto restore_task = [this, backup, data_path_in_backup, context]()
{
WriteLock lock{rwlock, getLockTimeout(context)};
if (!lock)
throw Exception("Lock timeout exceeded", ErrorCodes::TIMEOUT_EXCEEDED);
/// Load the indices if not loaded yet. We have to do that now because we're going to update these indices.
loadIndices(lock);
/// If there were no files, save zero file sizes to be able to rollback in case of error.
saveFileSizes(lock);
try
{
/// Append the data file.
auto old_data_size = file_checker.getFileSize(data_file_path);
{
String file_path_in_backup = data_path_in_backup + fileName(data_file_path);
auto backup_entry = backup->read(file_path_in_backup);
auto in = backup_entry->getReadBuffer();
auto out = disk->writeFile(data_file_path, max_compress_block_size, WriteMode::Append);
copyData(*in, *out);
}
/// Append the index.
String index_path_in_backup = data_path_in_backup + fileName(index_file_path);
if (backup->exists(index_path_in_backup))
{
IndexForNativeFormat extra_indices;
auto backup_entry = backup->read(index_path_in_backup);
auto index_in = backup_entry->getReadBuffer();
CompressedReadBuffer index_compressed_in{*index_in};
extra_indices.read(index_compressed_in);
/// Adjust the offsets.
for (auto & block : extra_indices.blocks)
{
for (auto & column : block.columns)
column.location.offset_in_compressed_file += old_data_size;
}
insertAtEnd(indices.blocks, std::move(extra_indices.blocks));
}
/// Finish writing.
saveIndices(lock);
saveFileSizes(lock);
}
catch (...)
{
/// Rollback partial writes.
file_checker.repair();
removeUnsavedIndices(lock);
throw;
}
};
return {restore_task};
}
void registerStorageStripeLog(StorageFactory & factory)
{
StorageFactory::StorageFeatures features{

View File

@ -50,6 +50,9 @@ public:
void truncate(const ASTPtr &, const StorageMetadataPtr &, ContextPtr, TableExclusiveLockHolder&) override;
BackupEntries backup(const ASTs & partitions, ContextPtr context) override;
RestoreDataTasks restoreFromBackup(const BackupPtr & backup, const String & data_path_in_backup, const ASTs & partitions, ContextMutablePtr context) override;
protected:
StorageStripeLog(
DiskPtr disk_,
@ -92,7 +95,7 @@ private:
const size_t max_compress_block_size;
std::shared_timed_mutex rwlock;
mutable std::shared_timed_mutex rwlock;
Poco::Logger * log;
};

View File

@ -26,6 +26,7 @@ const char * auto_contributors[] {
"Aleksandra (Ася)",
"Aleksandr Karo",
"Aleksandrov Vladimir",
"Aleksandr Shalimov",
"alekseik1",
"Aleksei Levushkin",
"Aleksei Semiglazov",
@ -36,6 +37,7 @@ const char * auto_contributors[] {
"Alexander Avdonkin",
"Alexander Bezpiatov",
"Alexander Burmak",
"Alexander Chashnikov",
"Alexander Ermolaev",
"Alexander Gololobov",
"Alexander GQ Gerasiov",
@ -101,9 +103,11 @@ const char * auto_contributors[] {
"ana-uvarova",
"AnaUvarova",
"Andr0901",
"andrc1901",
"Andreas Hunkeler",
"AndreevDm",
"Andrei Bodrov",
"Andrei Ch",
"Andrei Chulkov",
"andrei-karpliuk",
"Andrei Nekrashevich",
@ -127,6 +131,7 @@ const char * auto_contributors[] {
"Anmol Arora",
"Anna",
"Anna Shakhova",
"anneji",
"anneji-dev",
"annvsh",
"anrodigina",
@ -181,6 +186,7 @@ const char * auto_contributors[] {
"BanyRule",
"Baudouin Giard",
"BayoNet",
"bbkas",
"benamazing",
"benbiti",
"Benjamin Naecker",
@ -190,6 +196,7 @@ const char * auto_contributors[] {
"bharatnc",
"Big Elephant",
"Bill",
"BiteTheDDDDt",
"BlahGeek",
"blazerer",
"bluebirddm",
@ -220,6 +227,7 @@ const char * auto_contributors[] {
"Chao Wang",
"chasingegg",
"chengy8934",
"chenjian",
"chenqi",
"chenxing-xc",
"chenxing.xc",
@ -232,7 +240,9 @@ const char * auto_contributors[] {
"Ciprian Hacman",
"Clement Rodriguez",
"Clément Rodriguez",
"ClickHouse Admin",
"cn-ds",
"Cody Baker",
"Colum",
"comunodi",
"Constantin S. Pan",
@ -343,6 +353,7 @@ const char * auto_contributors[] {
"fastio",
"favstovol",
"FawnD2",
"Federico Ceratto",
"FeehanG",
"feihengye",
"felixoid",
@ -372,6 +383,7 @@ const char * auto_contributors[] {
"fuwhu",
"Fu Zhe",
"fuzhe1989",
"fuzzERot",
"Gagan Arneja",
"Gao Qiang",
"g-arslan",
@ -407,6 +419,7 @@ const char * auto_contributors[] {
"heng zhao",
"hermano",
"hexiaoting",
"hhell",
"Hiroaki Nakamura",
"hotid",
"huangzhaowei",
@ -456,6 +469,7 @@ const char * auto_contributors[] {
"Ivan Kush",
"Ivan Kushnarenko",
"Ivan Lezhankin",
"Ivan Milov",
"Ivan Remen",
"Ivan Starkov",
"ivanzhukov",
@ -470,6 +484,7 @@ const char * auto_contributors[] {
"jasine",
"Jason",
"Jason Keirstead",
"jasperzhu",
"javartisan",
"javi",
"javi santana",
@ -483,6 +498,7 @@ const char * auto_contributors[] {
"Jiang Tao",
"jianmei zhang",
"jkuklis",
"João Figueiredo",
"Jochen Schalanda",
"John",
"John Hummel",
@ -492,9 +508,11 @@ const char * auto_contributors[] {
"jyz0309",
"Kang Liu",
"Karl Pietrzak",
"karnevil13",
"keenwolf",
"Keiji Yoshida",
"Ken Chen",
"Ken MacInnis",
"Kevin Chiang",
"Kevin Michel",
"kevin wan",
@ -509,6 +527,7 @@ const char * auto_contributors[] {
"KochetovNicolai",
"kolsys",
"Konstantin Grabar",
"Konstantin Ilchenko",
"Konstantin Lebedev",
"Konstantin Malanchev",
"Konstantin Podshumok",
@ -529,6 +548,7 @@ const char * auto_contributors[] {
"l1tsolaiki",
"lalex",
"Latysheva Alexandra",
"laurieliyang",
"lehasm",
"Léo Ercolanelli",
"Leonardo Cecchi",
@ -539,6 +559,9 @@ const char * auto_contributors[] {
"levushkin aleksej",
"levysh",
"Lewinma",
"lhuang0928",
"lhuang09287750",
"liang.huang",
"liangqian",
"libenwang",
"lichengxiang",
@ -593,6 +616,7 @@ const char * auto_contributors[] {
"mastertheknife",
"Matthew Peveler",
"Matwey V. Kornilov",
"Mátyás Jani",
"Max",
"Max Akhmedov",
"Max Bruce",
@ -613,11 +637,13 @@ const char * auto_contributors[] {
"maxkuzn",
"maxulan",
"Max Vetrov",
"MaxWk",
"Mc.Spring",
"mehanizm",
"MeiK",
"melin",
"memo",
"Memo",
"meo",
"meoww-bot",
"mergify[bot]",
@ -636,6 +662,7 @@ const char * auto_contributors[] {
"Miguel Fernández",
"miha-g",
"Mihail Fandyushin",
"mikael",
"Mikahil Nacharov",
"Mike",
"Mike F",
@ -658,6 +685,8 @@ const char * auto_contributors[] {
"millb",
"Misko Lee",
"mnkonkova",
"mo-avatar",
"Mohamad Fadhil",
"Mohammad Hossein Sekhavat",
"morty",
"moscas",
@ -698,6 +727,7 @@ const char * auto_contributors[] {
"nikitamikhaylov",
"Nikita Mikhaylov",
"Nikita Orlov",
"Nikita Tikhomirov",
"Nikita Vasilev",
"Nikolai Kochetov",
"Nikolai Sorokin",
@ -739,15 +769,18 @@ const char * auto_contributors[] {
"Paramtamtam",
"Patrick Zippenfenig",
"Pavel",
"Pavel Cheremushkin",
"Pavel Kartaviy",
"Pavel Kartavyy",
"Pavel Kovalenko",
"Pavel Kruglov",
"Pavel Litvinenko",
"Pavel Medvedev",
"Pavel Patrin",
"Pavel Yakunin",
"Pavlo Bashynskiy",
"Pawel Rog",
"pawelsz-rb",
"pdv-ru",
"Peng Jian",
"Persiyanov Dmitriy Andreevich",
@ -770,6 +803,7 @@ const char * auto_contributors[] {
"qianmoQ",
"quid",
"Quid37",
"quoctan132",
"r1j1k",
"Rafael David Tinoco",
"rainbowsysu",
@ -789,6 +823,7 @@ const char * auto_contributors[] {
"robot-clickhouse",
"robot-metrika-test",
"rodrigargar",
"Rohit Agarwal",
"Romain Neutron",
"roman",
"Roman Bug",
@ -816,6 +851,7 @@ const char * auto_contributors[] {
"Saulius Valatka",
"sdk2",
"Sean Haynes",
"Sébastien",
"Sébastien Launay",
"serebrserg",
"Sergei Bocharov",
@ -840,10 +876,12 @@ const char * auto_contributors[] {
"sev7e0",
"SevaCode",
"sevirov",
"Seyed Mehrshad Hosseini",
"sfod",
"shangshujie",
"shedx",
"Sherry Wang",
"Shoh Jahon",
"Silviu Caragea",
"Simeon Emanuilov",
"Simon Liu",
@ -887,16 +925,23 @@ const char * auto_contributors[] {
"Taleh Zaliyev",
"Tangaev",
"tao jiang",
"Tatiana",
"Tatiana Kirillova",
"tavplubix",
"TCeason",
"Teja",
"Teja Srivastasa",
"Tema Novikov",
"templarzq",
"Tentoshka",
"terrylin",
"The-Alchemist",
"Thomas Berdy",
"Thom O'Connor",
"tianzhou",
"Tiaonmmn",
"tiger.yan",
"Tigran Khudaverdyan",
"tison",
"TiunovNN",
"Tobias Adamson",
@ -934,7 +979,9 @@ const char * auto_contributors[] {
"Veloman Yunkan",
"Veniamin Gvozdikov",
"Veselkov Konstantin",
"vesslanjin",
"vgocoder",
"Viachaslau Boben",
"vic",
"vicdashkov",
"vicgao",
@ -950,6 +997,7 @@ const char * auto_contributors[] {
"Vitaliy Zakaznikov",
"Vitaly",
"Vitaly Baranov",
"Vitaly Orlov",
"Vitaly Samigullin",
"Vitaly Stoyan",
"vitstn",
@ -980,6 +1028,7 @@ const char * auto_contributors[] {
"vzakaznikov",
"wangchao",
"Wang Fenjin",
"WangZengrui",
"weeds085490",
"Weiqing Xu",
"William Shallum",
@ -994,6 +1043,7 @@ const char * auto_contributors[] {
"Yangkuan Liu",
"yangshuai",
"Yatsishin Ilya",
"yeer",
"Yegor Andreenko",
"Yegor Levankov",
"ygrek",
@ -1023,6 +1073,7 @@ const char * auto_contributors[] {
"ywill3",
"zamulla",
"zhang2014",
"zhanglistar",
"zhangshengyu",
"zhangxiao018",
"zhangxiao871",
@ -1051,6 +1102,7 @@ const char * auto_contributors[] {
"Смитюх Вячеслав",
"Сундуков Алексей",
"万康",
"凌涛",
"吴健",
"小路",
"张中南",
@ -1058,8 +1110,10 @@ const char * auto_contributors[] {
"张风啸",
"徐炘",
"曲正鹏",
"木木夕120",
"未来星___费",
"极客青年",
"枢木",
"董海镔",
"谢磊",
"贾顺名(Jarvis)",

View File

@ -79,7 +79,7 @@ def list_runners(access_token):
def group_runners_by_tag(listed_runners):
result = {}
RUNNER_TYPE_LABELS = ['style-checker', 'builder', 'func-tester']
RUNNER_TYPE_LABELS = ['style-checker', 'builder', 'func-tester', 'stress-tester']
for runner in listed_runners:
for tag in runner.tags:
if tag in RUNNER_TYPE_LABELS:
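Only the opening lines of group_runners_by_tag are visible in this hunk; the change itself just adds 'stress-tester' to the recognized runner labels. A minimal sketch of how such a grouping could complete — bucketing each runner under the first recognized label it carries — is shown below; the setdefault pattern and the 'unlabeled' fallback key are illustrative assumptions, not part of the diff.

def group_runners_by_tag(listed_runners):
    result = {}
    RUNNER_TYPE_LABELS = ['style-checker', 'builder', 'func-tester', 'stress-tester']
    for runner in listed_runners:
        for tag in runner.tags:
            if tag in RUNNER_TYPE_LABELS:
                # Bucket the runner under its recognized type label.
                result.setdefault(tag, []).append(runner)
                break
        else:
            # Illustrative catch-all for runners without a known type label.
            result.setdefault('unlabeled', []).append(runner)
    return result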

275
tests/ci/stress_check.py Normal file
View File

@ -0,0 +1,275 @@
#!/usr/bin/env python3
import csv
import logging
import subprocess
import os
import json
import time
import sys
from github import Github
import requests
from report import create_test_html_report
from s3_helper import S3Helper
from get_robot_token import get_best_robot_token
from pr_info import PRInfo
DOWNLOAD_RETRIES_COUNT = 5
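# Stream a single build artifact from `url` into `path`, retrying up to DOWNLOAD_RETRIES_COUNT
# times and drawing a simple progress bar when stdout is a TTY.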
def dowload_build_with_progress(url, path):
logging.info("Downloading from %s to temp path %s", url, path)
for i in range(DOWNLOAD_RETRIES_COUNT):
try:
with open(path, 'wb') as f:
response = requests.get(url, stream=True)
response.raise_for_status()
total_length = response.headers.get('content-length')
if total_length is None or int(total_length) == 0:
logging.info("No content-length, will download file without progress")
f.write(response.content)
else:
dl = 0
total_length = int(total_length)
logging.info("Content length is %ld bytes", total_length)
for data in response.iter_content(chunk_size=4096):
dl += len(data)
f.write(data)
if sys.stdout.isatty():
done = int(50 * dl / total_length)
percent = int(100 * float(dl) / total_length)
eq_str = '=' * done
space_str = ' ' * (50 - done)
sys.stdout.write(f"\r[{eq_str}{space_str}] {percent}%")
sys.stdout.flush()
break
except Exception as ex:
sys.stdout.write("\n")
time.sleep(3)
logging.info("Exception while downloading %s, retry %s", ex, i + 1)
if os.path.exists(path):
os.remove(path)
else:
raise Exception(f"Cannot download dataset from {url}, all retries exceeded")
sys.stdout.write("\n")
logging.info("Downloading finished")
def download_builds(result_path, build_urls):
for url in build_urls:
if url.endswith('.deb'):
fname = os.path.basename(url)
logging.info("Will download %s to %s", fname, result_path)
dowload_build_with_progress(url, os.path.join(result_path, fname))
def get_build_config(build_number, repo_path):
ci_config_path = os.path.join(repo_path, "tests/ci/ci_config.json")
with open(ci_config_path, 'r', encoding='utf-8') as ci_config:
config_dict = json.load(ci_config)
return config_dict['build_config'][build_number]
def get_build_urls(build_config_str, reports_path):
for root, _, files in os.walk(reports_path):
for f in files:
if build_config_str in f:
logging.info("Found build report json %s", f)
with open(os.path.join(root, f), 'r', encoding='utf-8') as file_handler:
build_report = json.load(file_handler)
return build_report['build_urls']
return []
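# Flatten a build_config entry into the underscore-joined token used in build report file names,
# e.g. (hypothetical values) "clang-13_relwithdebuginfo_none_bundled_unsplitted_notidy_without_coverage_deb".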
def build_config_to_string(build_config):
if build_config["package-type"] == "performance":
return "performance"
return "_".join([
build_config['compiler'],
build_config['build-type'] if build_config['build-type'] else "relwithdebuginfo",
build_config['sanitizer'] if build_config['sanitizer'] else "none",
build_config['bundled'],
build_config['splitted'],
"tidy" if build_config['tidy'] == "enable" else "notidy",
"with_coverage" if build_config['with_coverage'] else "without_coverage",
build_config['package-type'],
])
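# Compose the `docker run` command that mounts the package, result and server-log folders
# into the stress-test image.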
def get_run_command(build_path, result_folder, server_log_folder, image):
cmd = "docker run -e S3_URL='https://clickhouse-datasets.s3.amazonaws.com' " + \
f"--volume={build_path}:/package_folder " \
f"--volume={result_folder}:/test_output " \
f"--volume={server_log_folder}:/var/log/clickhouse-server {image}"
return cmd
def process_logs(s3_client, additional_logs, s3_path_prefix):
additional_urls = []
for log_path in additional_logs:
if log_path:
additional_urls.append(
s3_client.upload_test_report_to_s3(
log_path,
s3_path_prefix + "/" + os.path.basename(log_path)))
return additional_urls
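# Upload the raw log and extra artifacts to S3, render the HTML report for this check
# and return the report URL.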
def upload_results(s3_client, pr_number, commit_sha, test_results, raw_log, additional_files, check_name):
additional_files = [raw_log] + additional_files
s3_path_prefix = f"{pr_number}/{commit_sha}/" + check_name.lower().replace(' ', '_').replace('(', '_').replace(')', '_').replace(',', '_')
additional_urls = process_logs(s3_client, additional_files, s3_path_prefix)
branch_url = "https://github.com/ClickHouse/ClickHouse/commits/master"
branch_name = "master"
if pr_number != 0:
branch_name = f"PR #{pr_number}"
branch_url = f"https://github.com/ClickHouse/ClickHouse/pull/{pr_number}"
commit_url = f"https://github.com/ClickHouse/ClickHouse/commit/{commit_sha}"
task_url = f"https://github.com/ClickHouse/ClickHouse/actions/runs/{os.getenv('GITHUB_RUN_ID')}"
raw_log_url = additional_urls[0]
additional_urls.pop(0)
html_report = create_test_html_report(check_name, test_results, raw_log_url, task_url, branch_url, branch_name, commit_url, additional_urls, True)
with open('report.html', 'w', encoding='utf-8') as f:
f.write(html_report)
url = s3_client.upload_test_report_to_s3('report.html', s3_path_prefix + ".html")
logging.info("Search result in url %s", url)
return url
def get_commit(gh, commit_sha):
repo = gh.get_repo(os.getenv("GITHUB_REPOSITORY", "ClickHouse/ClickHouse"))
commit = repo.get_commit(commit_sha)
return commit
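# Gather files from the result and server-log folders, then read check_status.tsv (overall state
# and description) and test_results.tsv (per-test rows) produced inside the container.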
def process_results(result_folder, server_log_path, run_log_path):
test_results = []
additional_files = []
# Just upload all files from result_folder.
# If task provides processed results, then it's responsible for content of result_folder.
if os.path.exists(result_folder):
test_files = [f for f in os.listdir(result_folder) if os.path.isfile(os.path.join(result_folder, f))]
additional_files = [os.path.join(result_folder, f) for f in test_files]
if os.path.exists(server_log_path):
server_log_files = [f for f in os.listdir(server_log_path) if os.path.isfile(os.path.join(server_log_path, f))]
additional_files = additional_files + [os.path.join(server_log_path, f) for f in server_log_files]
additional_files.append(run_log_path)
status_path = os.path.join(result_folder, "check_status.tsv")
if not os.path.exists(status_path):
return "failure", "check_status.tsv doesn't exists", test_results, additional_files
logging.info("Found check_status.tsv")
with open(status_path, 'r', encoding='utf-8') as status_file:
status = list(csv.reader(status_file, delimiter='\t'))
if len(status) != 1 or len(status[0]) != 2:
return "error", "Invalid check_status.tsv", test_results, additional_files
state, description = status[0][0], status[0][1]
results_path = os.path.join(result_folder, "test_results.tsv")
with open(results_path, 'r', encoding='utf-8') as results_file:
test_results = list(csv.reader(results_file, delimiter='\t'))
if len(test_results) == 0:
raise Exception("Empty results")
return state, description, test_results, additional_files
if __name__ == "__main__":
logging.basicConfig(level=logging.INFO)
temp_path = os.getenv("TEMP_PATH", os.path.abspath("."))
repo_path = os.getenv("REPO_COPY", os.path.abspath("../../"))
reports_path = os.getenv("REPORTS_PATH", "./reports")
check_name = sys.argv[1]
build_number = int(sys.argv[2])
if not os.path.exists(temp_path):
os.makedirs(temp_path)
with open(os.getenv('GITHUB_EVENT_PATH'), 'r', encoding='utf-8') as event_file:
event = json.load(event_file)
pr_info = PRInfo(event)
gh = Github(get_best_robot_token())
images_path = None
for root, _, files in os.walk(reports_path):
for f in files:
if f == 'changed_images.json':
images_path = os.path.join(root, 'changed_images.json')
break
image_name = "clickhouse/stress-test"
docker_image = image_name
if images_path and os.path.exists(images_path):
logging.info("Images file exists")
with open(images_path, 'r', encoding='utf-8') as images_fd:
images = json.load(images_fd)
logging.info("Got images %s", images)
if image_name in images:
docker_image += ':' + images[image_name]
else:
logging.info("Images file not found")
for i in range(10):
try:
logging.info("Pulling image %s", docker_image)
subprocess.check_output(f"docker pull {docker_image}", stderr=subprocess.STDOUT, shell=True)
break
except Exception as ex:
time.sleep(i * 3)
logging.info("Got execption pulling docker %s", ex)
else:
raise Exception(f"Cannot pull dockerhub for image docker pull {docker_image}")
build_config = get_build_config(build_number, repo_path)
build_config_str = build_config_to_string(build_config)
urls = get_build_urls(build_config_str, reports_path)
if not urls:
raise Exception("No build URLs found")
packages_path = os.path.join(temp_path, "packages")
if not os.path.exists(packages_path):
os.makedirs(packages_path)
server_log_path = os.path.join(temp_path, "server_log")
if not os.path.exists(server_log_path):
os.makedirs(server_log_path)
result_path = os.path.join(temp_path, "result_path")
if not os.path.exists(result_path):
os.makedirs(result_path)
run_log_path = os.path.join(temp_path, "runlog.log")
download_builds(packages_path, urls)
run_command = get_run_command(packages_path, result_path, server_log_path, docker_image)
logging.info("Going to run func tests: %s", run_command)
with open(run_log_path, 'w', encoding='utf-8') as log:
with subprocess.Popen(run_command, shell=True, stderr=log, stdout=log) as process:
retcode = process.wait()
if retcode == 0:
logging.info("Run successfully")
else:
logging.info("Run failed")
subprocess.check_call(f"sudo chown -R ubuntu:ubuntu {temp_path}", shell=True)
s3_helper = S3Helper('https://s3.amazonaws.com')
state, description, test_results, additional_logs = process_results(result_path, server_log_path, run_log_path)
report_url = upload_results(s3_helper, pr_info.number, pr_info.sha, test_results, run_log_path, additional_logs, check_name)
print(f"::notice ::Report url: {report_url}")
commit = get_commit(gh, pr_info.sha)
commit.create_status(context=check_name, description=description, state=state, target_url=report_url)
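The entry point above takes the check name and a build number from argv and its working directories from the environment. A purely illustrative invocation (paths are placeholders, and the GitHub event file and robot-token plumbing that the CI environment normally provides are assumed to be in place) might look like:

TEMP_PATH=/tmp/stress_check \
REPO_COPY=/tmp/stress_check/ClickHouse \
REPORTS_PATH=/tmp/reports \
GITHUB_EVENT_PATH=/tmp/event.json \
python3 stress_check.py "<check name>" <build number>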

View File

@ -0,0 +1,20 @@
#!/usr/bin/env bash
set -euo pipefail
echo "Running init script"
export DEBIAN_FRONTEND=noninteractive
export RUNNER_HOME=/home/ubuntu/actions-runner
echo "Receiving token"
export RUNNER_TOKEN=`/usr/local/bin/aws ssm get-parameter --name github_runner_registration_token --with-decryption --output text --query Parameter.Value`
export RUNNER_URL="https://github.com/ClickHouse"
# Note: the EC2 instance-metadata service is always available at the fixed link-local address 169.254.169.254
export INSTANCE_ID=`curl -s http://169.254.169.254/latest/meta-data/instance-id`
cd $RUNNER_HOME
echo "Going to configure runner"
sudo -u ubuntu ./config.sh --url $RUNNER_URL --token $RUNNER_TOKEN --name $INSTANCE_ID --runnergroup Default --labels 'self-hosted,Linux,X64,func-tester' --work _work
echo "Run"
sudo -u ubuntu ./run.sh

View File

@ -0,0 +1,20 @@
#!/usr/bin/env bash
set -euo pipefail
echo "Running init script"
export DEBIAN_FRONTEND=noninteractive
export RUNNER_HOME=/home/ubuntu/actions-runner
echo "Receiving token"
export RUNNER_TOKEN=`/usr/local/bin/aws ssm get-parameter --name github_runner_registration_token --with-decryption --output text --query Parameter.Value`
export RUNNER_URL="https://github.com/ClickHouse"
# Note: the EC2 instance-metadata service is always available at the fixed link-local address 169.254.169.254
export INSTANCE_ID=`curl -s http://169.254.169.254/latest/meta-data/instance-id`
cd $RUNNER_HOME
echo "Going to configure runner"
sudo -u ubuntu ./config.sh --url $RUNNER_URL --token $RUNNER_TOKEN --name $INSTANCE_ID --runnergroup Default --labels 'self-hosted,Linux,X64,stress-tester' --work _work
echo "Run"
sudo -u ubuntu ./run.sh

View File

@ -6,9 +6,11 @@ cluster = ClickHouseCluster(__file__)
instance = cluster.add_instance('instance')
def create_and_fill_table():
def create_and_fill_table(engine="MergeTree"):
if engine == "MergeTree":
engine = "MergeTree ORDER BY y PARTITION BY x%10"
instance.query("CREATE DATABASE test")
instance.query("CREATE TABLE test.table(x UInt32, y String) ENGINE=MergeTree ORDER BY y PARTITION BY x%10")
instance.query(f"CREATE TABLE test.table(x UInt32, y String) ENGINE={engine}")
instance.query("INSERT INTO test.table SELECT number, toString(number) FROM numbers(100)")
@ -36,9 +38,11 @@ def new_backup_name():
return f"test-backup-{backup_id_counter}"
def test_restore_table():
@pytest.mark.parametrize("engine", ["MergeTree", "Log", "TinyLog", "StripeLog"])
def test_restore_table(engine):
backup_name = new_backup_name()
create_and_fill_table()
create_and_fill_table(engine=engine)
assert instance.query("SELECT count(), sum(x) FROM test.table") == "100\t4950\n"
instance.query(f"BACKUP TABLE test.table TO '{backup_name}'")
@ -50,9 +54,10 @@ def test_restore_table():
assert instance.query("SELECT count(), sum(x) FROM test.table") == "100\t4950\n"
def test_restore_table_into_existing_table():
@pytest.mark.parametrize("engine", ["MergeTree", "Log", "TinyLog", "StripeLog"])
def test_restore_table_into_existing_table(engine):
backup_name = new_backup_name()
create_and_fill_table()
create_and_fill_table(engine=engine)
assert instance.query("SELECT count(), sum(x) FROM test.table") == "100\t4950\n"
instance.query(f"BACKUP TABLE test.table TO '{backup_name}'")

View File

@ -2,7 +2,7 @@ import pytest
from helpers.cluster import ClickHouseCluster
cluster = ClickHouseCluster(__file__)
cluster = ClickHouseCluster(__file__, name="aggregate_fixed_key")
node1 = cluster.add_instance('node1', with_zookeeper=True, image='yandex/clickhouse-server', tag='21.3', with_installed_binary=True)
node2 = cluster.add_instance('node2', with_zookeeper=True, image='yandex/clickhouse-server')
node3 = cluster.add_instance('node3', with_zookeeper=True, image='yandex/clickhouse-server')

View File

@ -2850,7 +2850,7 @@ def test_kafka_formats_with_broken_message(kafka_cluster):
# broken message
b'\x05\x02\x69\x64\x07\x62\x6c\x6f\x63\x6b\x4e\x6f\x04\x76\x61\x6c\x31\x04\x76\x61\x6c\x32\x04\x76\x61\x6c\x33\x05\x49\x6e\x74\x36\x34\x06\x53\x74\x72\x69\x6e\x67\x06\x53\x74\x72\x69\x6e\x67\x07\x46\x6c\x6f\x61\x74\x33\x32\x05\x55\x49\x6e\x74\x38\x00\x00\x00\x00\x00\x00\x00\x00\x03\x42\x41\x44\x02\x41\x4d\x00\x00\x00\x3f\x01',
],
'expected':'{"raw_message":"0502696407626C6F636B4E6F0476616C310476616C320476616C3305496E74363406537472696E6706537472696E6707466C6F617433320555496E743800000000000000000342414402414D0000003F01","error":"Cannot read all data. Bytes read: 9. Bytes expected: 65.: (at row 1)\\n"}',
'expected':'{"raw_message":"0502696407626C6F636B4E6F0476616C310476616C320476616C3305496E74363406537472696E6706537472696E6707466C6F617433320555496E743800000000000000000342414402414D0000003F01","error":"Type of \'blockNo\' must be UInt16, not String"}',
'printable':False,
},
'ORC': {

View File

@ -1,6 +1,10 @@
"Hello, ""World""",123,"[1,2,3]",456,"['abc','def']","Newline
here"
"x","y","z","a","b"
"Hello, ""World""",123,"[1,2,3]",456,"['abc','def']","Newline
here"
"x","y","z","a","b"
"String","UInt8","Array(UInt8)","Tuple(UInt16, Array(String))","String"
"Hello, ""World""",123,"[1,2,3]",456,"['abc','def']","Newline
here"
0,"0","[]","2000-01-01","2000-01-01 00:00:00"

View File

@ -1,3 +1,4 @@
SELECT 'Hello, "World"' AS x, 123 AS y, [1, 2, 3] AS z, (456, ['abc', 'def']) AS a, 'Newline\nhere' AS b FORMAT CSVWithNames;
SELECT 'Hello, "World"' AS x, 123 AS y, [1, 2, 3] AS z, (456, ['abc', 'def']) AS a, 'Newline\nhere' AS b FORMAT CSV;
SELECT 'Hello, "World"' AS x, 123 AS y, [1, 2, 3] AS z, (456, ['abc', 'def']) AS a, 'Newline\nhere' AS b FORMAT CSVWithNames;
SELECT 'Hello, "World"' AS x, 123 AS y, [1, 2, 3] AS z, (456, ['abc', 'def']) AS a, 'Newline\nhere' AS b FORMAT CSVWithNamesAndTypes;
SELECT number, toString(number), range(number), toDate('2000-01-01') + number, toDateTime('2000-01-01 00:00:00') + number FROM system.numbers LIMIT 10 FORMAT CSV;

View File

@ -13,7 +13,7 @@ Hello "world", 789 ,2016-01-03
"Hello
world", 100, 2016-01-04,
default,,
default-eof,,' | $CLICKHOUSE_CLIENT --input_format_defaults_for_omitted_fields=1 --query="INSERT INTO csv FORMAT CSV";
default-eof,,' | $CLICKHOUSE_CLIENT --input_format_defaults_for_omitted_fields=1 --input_format_csv_empty_as_default=1 --query="INSERT INTO csv FORMAT CSV";
$CLICKHOUSE_CLIENT --query="SELECT * FROM csv ORDER BY d";
$CLICKHOUSE_CLIENT --query="DROP TABLE csv";
@ -33,7 +33,7 @@ $CLICKHOUSE_CLIENT --query="CREATE TABLE csv (t Nullable(DateTime('Europe/Moscow
echo 'NULL, NULL
"2016-01-01 01:02:03",NUL
"2016-01-02 01:02:03",Nhello' | $CLICKHOUSE_CLIENT --input_format_csv_unquoted_null_literal_as_null=1 --query="INSERT INTO csv FORMAT CSV";
"2016-01-02 01:02:03",Nhello' | $CLICKHOUSE_CLIENT --input_format_csv_unquoted_null_literal_as_null=1 --input_format_csv_empty_as_default=1 --query="INSERT INTO csv FORMAT CSV";
$CLICKHOUSE_CLIENT --query="SELECT * FROM csv ORDER BY s NULLS LAST";
$CLICKHOUSE_CLIENT --query="DROP TABLE csv";

View File

@ -23,3 +23,11 @@ cv bn m","","as""df'gh","",456,"2016-01-02"
"as""df'gh","","zx
cv bn m","",789,"2016-01-04"
"qwe,rty","","","",9876543210,"2016-01-03"
==== check raw ====
"qwe,rty","as""df'gh","","zx
cv bn m",123,"2016-01-01"
"as""df\'gh","","zx
cv bn m","qwe,rty",456,"2016-01-02"
"zx\cv\bn m","qwe,rty","as""df'gh","",789,"2016-01-04"
"","zx
cv bn m","qwe,rty","as""df'gh",9876543210,"2016-01-03"

View File

@ -1,4 +1,6 @@
#!/usr/bin/env bash
# Tags: no-parallel
# shellcheck disable=SC2016,SC2028
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
@ -50,6 +52,30 @@ format_template_rows_between_delimiter = ','";
$CLICKHOUSE_CLIENT --query="SELECT * FROM template2 ORDER BY n FORMAT CSV";
echo "==== check raw ===="
echo -ne '{prefix} \n${data}\n $$ suffix $$\n' > "$CURDIR"/00938_template_input_format_resultset.tmp
echo -ne 'n:\t${n:Escaped}, s1:\t${0:Raw}\t, s2:\t${1:Quoted}, s3:\t${s3:JSON}, s4:\t${3:CSV}, d:\t${d:Escaped}\t' > "$CURDIR"/00938_template_input_format_row.tmp
$CLICKHOUSE_CLIENT --query="TRUNCATE TABLE template1";
echo "{prefix}"' '"
n: 123, s1: qwe,rty , s2: 'as\"df\\'gh', s3: \"\", s4: \"zx
cv bn m\", d: 2016-01-01 ;
n: 456, s1: as\"df\\'gh , s2: '', s3: \"zx\\ncv\\tbn m\", s4: \"qwe,rty\", d: 2016-01-02 ;
n: 9876543210, s1: , s2: 'zx\\ncv\\tbn m', s3: \"qwe,rty\", s4: \"as\"\"df'gh\", d: 2016-01-03 ;
n: 789, s1: zx\cv\bn m , s2: 'qwe,rty', s3: \"as\\\"df'gh\", s4: \"\", d: 2016-01-04"$'\t'"
$ suffix $" | $CLICKHOUSE_CLIENT --query="INSERT INTO template1 FORMAT Template SETTINGS \
format_template_resultset = '$CURDIR/00938_template_input_format_resultset.tmp', \
format_template_row = '$CURDIR/00938_template_input_format_row.tmp', \
format_template_rows_between_delimiter = ';\n'";
$CLICKHOUSE_CLIENT --query="SELECT * FROM template1 ORDER BY n FORMAT CSV";
$CLICKHOUSE_CLIENT --query="DROP TABLE template1";
$CLICKHOUSE_CLIENT --query="DROP TABLE template2";
rm "$CURDIR"/00938_template_input_format_resultset.tmp "$CURDIR"/00938_template_input_format_row.tmp

View File

@ -12,6 +12,11 @@
[1, "a"]
[2, "b"]
[3, "c"]
----------
["value", "name"]
[1, "a"]
[2, "b"]
[3, "c"]
4
["name", "c"]
["String", "UInt64"]
@ -31,17 +36,33 @@
8
["first", 1, 2, 0]
["second", 2, 0, 6]
["first", 1, 2, 0]
["second", 2, 0, 6]
9
["first", 1, 2, 8]
["second", 2, 32, 6]
["first", 1, 2, 8]
["second", 2, 32, 6]
10
["first", 1, 16, 8]
["second", 2, 32, 8]
["first", 1, 16, 8]
["second", 2, 32, 8]
11
["v1", "v2", "v3", "v4"]
["String", "UInt8", "UInt16", "UInt8"]
["", 2, 3, 1]
["", 2, 3, 1]
----------
["v1", "v2", "v3", "v4"]
["", 2, 3, 1]
["", 2, 3, 1]
12
["v1", "n.id", "n.name"]
["UInt8", "Array(UInt8)", "Array(String)"]
[16, [15,16,0], ["first","second","third"]]
[16, [15,16,0], ["first","second","third"]]
----------
["v1", "n.id", "n.name"]
[16, [15,16,0], ["first","second","third"]]
[16, [15,16,0], ["first","second","third"]]

View File

@ -10,8 +10,10 @@ SELECT 2;
/* Check Totals */
SELECT name, count() AS c FROM test_table GROUP BY name WITH TOTALS ORDER BY name FORMAT JSONCompactEachRow;
SELECT 3;
/* Check JSONCompactEachRowWithNamesAndTypes Output */
/* Check JSONCompactEachRowWithNames and JSONCompactEachRowWithNamesAndTypes Output */
SELECT * FROM test_table FORMAT JSONCompactEachRowWithNamesAndTypes;
SELECT '----------';
SELECT * FROM test_table FORMAT JSONCompactEachRowWithNames;
SELECT 4;
/* Check Totals */
SELECT name, count() AS c FROM test_table GROUP BY name WITH TOTALS ORDER BY name FORMAT JSONCompactEachRowWithNamesAndTypes;
@ -35,30 +37,39 @@ INSERT INTO test_table_2 FORMAT JSONCompactEachRow [16, [15, 16, null], ["first"
SELECT * FROM test_table_2 FORMAT JSONCompactEachRow;
TRUNCATE TABLE test_table_2;
SELECT 8;
/* Check JSONCompactEachRowWithNamesAndTypes Output */
/* Check JSONCompactEachRowWithNames and JSONCompactEachRowWithNamesAndTypes Input */
SET input_format_null_as_default = 0;
INSERT INTO test_table FORMAT JSONCompactEachRowWithNamesAndTypes ["v1", "v2", "v3", "v4"]["String","UInt8","UInt16","UInt8"]["first", 1, "2", null]["second", 2, null, 6];
INSERT INTO test_table FORMAT JSONCompactEachRowWithNames ["v1", "v2", "v3", "v4"]["first", 1, "2", null]["second", 2, null, 6];
SELECT * FROM test_table FORMAT JSONCompactEachRow;
TRUNCATE TABLE test_table;
SELECT 9;
/* Check input_format_null_as_default = 1 */
SET input_format_null_as_default = 1;
INSERT INTO test_table FORMAT JSONCompactEachRowWithNamesAndTypes ["v1", "v2", "v3", "v4"]["String","UInt8","UInt16","UInt8"]["first", 1, "2", null] ["second", 2, null, 6];
INSERT INTO test_table FORMAT JSONCompactEachRowWithNames ["v1", "v2", "v3", "v4"]["first", 1, "2", null] ["second", 2, null, 6];
SELECT * FROM test_table FORMAT JSONCompactEachRow;
SELECT 10;
/* Check Header */
TRUNCATE TABLE test_table;
SET input_format_skip_unknown_fields = 1;
INSERT INTO test_table FORMAT JSONCompactEachRowWithNamesAndTypes ["v1", "v2", "invalid_column"]["String", "UInt8", "UInt8"]["first", 1, 32]["second", 2, "64"];
INSERT INTO test_table FORMAT JSONCompactEachRowWithNames ["v1", "v2", "invalid_column"]["first", 1, 32]["second", 2, "64"];
SELECT * FROM test_table FORMAT JSONCompactEachRow;
SELECT 11;
TRUNCATE TABLE test_table;
INSERT INTO test_table FORMAT JSONCompactEachRowWithNamesAndTypes ["v4", "v2", "v3"]["UInt8", "UInt8", "UInt16"][1, 2, 3]
INSERT INTO test_table FORMAT JSONCompactEachRowWithNames ["v4", "v2", "v3"][1, 2, 3]
SELECT * FROM test_table FORMAT JSONCompactEachRowWithNamesAndTypes;
SELECT '----------';
SELECT * FROM test_table FORMAT JSONCompactEachRowWithNames;
SELECT 12;
/* Check Nested */
INSERT INTO test_table_2 FORMAT JSONCompactEachRowWithNamesAndTypes ["v1", "n.id", "n.name"]["UInt8", "Array(UInt8)", "Array(String)"][16, [15, 16, null], ["first", "second", "third"]];
INSERT INTO test_table_2 FORMAT JSONCompactEachRowWithNames ["v1", "n.id", "n.name"][16, [15, 16, null], ["first", "second", "third"]];
SELECT * FROM test_table_2 FORMAT JSONCompactEachRowWithNamesAndTypes;
SELECT '----------';
SELECT * FROM test_table_2 FORMAT JSONCompactEachRowWithNames;
DROP TABLE IF EXISTS test_table;
DROP TABLE IF EXISTS test_table_2;

View File

@ -1,5 +1,5 @@
CSV
Column 2, name: d, type: Decimal(18, 10), parsed text: "123456789"ERROR
Column 2, name: d, type: Decimal(18, 10), parsed text: " 123456789"ERROR
ERROR: garbage after DateTime: "7, <DOUBLE QUOTE>Hello<DOUBLE QUOTE>"
ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.
ERROR: There is no line feed. "1" found instead.
@ -28,3 +28,14 @@ ERROR: There is no delimiter before field 1: expected "<TAB>", got "7<TAB>Hello<
ERROR: There is no delimiter after last field: expected "<LINE FEED>", got "<TAB>1<LINE FEED>"
ERROR: There is no delimiter after last field: expected "<LINE FEED>", got "Hello<LINE FEED>"
Column 0, name: t, type: DateTime, ERROR: text "<LINE FEED>" is not like DateTime
JSONCompactEachRow
Column 2, name: d, type: Decimal(18, 10), parsed text: " 123456789"ERROR
Column 0, name: t, type: DateTime, parsed text: "<DOUBLE QUOTE>2020-04-21 12:34:56"ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.
ERROR: garbage after DateTime: "7, <DOUBLE QUOTE>Hello<DOUBLE QUOTE>"
ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.
ERROR: There is no closing parenthesis (']') at the end of the row. "," found instead.
Column 1, name: s, type: String, parsed text: <EMPTY>ERROR
ERROR: There is no '[' before the row.
ERROR: garbage after Decimal(18, 10): ";<LINE FEED>"
ERROR: There is no comma. ";" found instead.
ERROR: Closing parenthesis (']') found where comma is expected. It's like your file has less columns than expected.

View File

@ -38,3 +38,19 @@ echo -e '2020-04-21 12:34:567\tHello\t123456789' | "${PARSER[@]}" 2>&1| grep "ER
echo -e '2020-04-21 12:34:56\tHello\t12345678\t1' | "${PARSER[@]}" 2>&1| grep "ERROR"
echo -e '2020-04-21 12:34:56\t\t123Hello' | "${PARSER[@]}" 2>&1| grep "ERROR"
echo -e '2020-04-21 12:34:56\tHello\t12345678\n' | "${PARSER[@]}" 2>&1| grep "ERROR"
PARSER=(${CLICKHOUSE_LOCAL} --query 'SELECT t, s, d FROM table' --structure 't DateTime, s String, d Decimal64(10)' --input-format JSONCompactEachRow)
echo '["2020-04-21 12:34:56", "Hello", 12345678]' | "${PARSER[@]}" 2>&1| grep "ERROR" || echo "JSONCompactEachRow"
echo '["2020-04-21 12:34:56", "Hello", 123456789]' | "${PARSER[@]}" 2>&1| grep "ERROR"
echo '["2020-04-21 12:34:567", "Hello", 123456789]' | "${PARSER[@]}" 2>&1| grep "ERROR"
echo '["2020-04-21 12:34:56"7, "Hello", 123456789]' | "${PARSER[@]}" 2>&1| grep "ERROR"
echo '["2020-04-21 12:34:56", "Hello", 12345678,1]' | "${PARSER[@]}" 2>&1| grep "ERROR"
echo '["2020-04-21 12:34:56",,123Hello]' | "${PARSER[@]}" 2>&1| grep "ERROR"
echo -e '["2020-04-21 12:34:56", "Hello", 12345678\n]' | "${PARSER[@]}" 2>&1| grep "ERROR"
echo -e '"2020-04-21 12:34:56", "Hello", 12345678]' | "${PARSER[@]}" 2>&1| grep "ERROR"
echo -e '["2020-04-21 12:34:56", "Hello", 12345678;' | "${PARSER[@]}" 2>&1| grep "ERROR"
echo -e '["2020-04-21 12:34:56", "Hello", 12345678' | "${PARSER[@]}" 2>&1| grep "ERROR"
echo -e '["2020-04-21 12:34:56", "Hello", 12345678\n' | "${PARSER[@]}" 2>&1| grep "ERROR"
echo -e '["2020-04-21 12:34:56", "Hello"; 12345678\n' | "${PARSER[@]}" 2>&1| grep "ERROR"
echo -e '["2020-04-21 12:34:56", "Hello"\n' | "${PARSER[@]}" 2>&1| grep "ERROR"
echo -e '["2020-04-21 12:34:56", "Hello"]' | "${PARSER[@]}" 2>&1| grep "ERROR"

View File

@ -7,7 +7,21 @@ number
UInt64
0
1
TSVRawWithNames
number
0
1
TSVRawWithNamesAndTypes
number
UInt64
0
1
CSVWithNames
"number"
0
1
CSVWithNamesAndTypes
"number"
"UInt64"
0
1

View File

@ -15,5 +15,14 @@ ${CLICKHOUSE_LOCAL} "${opts[@]}" --format TSVWithNames
echo 'TSVWithNamesAndTypes'
${CLICKHOUSE_LOCAL} "${opts[@]}" --format TSVWithNamesAndTypes
echo 'TSVRawWithNames'
${CLICKHOUSE_LOCAL} "${opts[@]}" --format TSVWithNames
echo 'TSVRawWithNamesAndTypes'
${CLICKHOUSE_LOCAL} "${opts[@]}" --format TSVWithNamesAndTypes
echo 'CSVWithNames'
${CLICKHOUSE_LOCAL} "${opts[@]}" --format CSVWithNames
echo 'CSVWithNamesAndTypes'
${CLICKHOUSE_LOCAL} "${opts[@]}" --format CSVWithNamesAndTypes

View File

@ -2,6 +2,11 @@ zero rows
TSVWithNames
TSVWithNamesAndTypes
CSVWithNames
CSVWithNamesAndTypes
JSONCompactEachRowWithNames
JSONCompactEachRowWithNamesAndTypes
JSONCompactStringsEachRow
JSONCompactStringsEachRowWithNamesAndTypes
multi clickhouse-local one file
TSVWithNames
0
@ -15,3 +20,23 @@ CSVWithNames
0
0
0
CSVWithNamesAndTypes
0
0
0
JSONCompactEachRowWithNames
0
0
0
JSONCompactEachRowWithNamesAndTypes
0
0
0
JSONCompactStringsEachRow
0
0
0
JSONCompactStringsEachRowWithNamesAndTypes
0
0
0

View File

@ -1,4 +1,5 @@
#!/usr/bin/env bash
# Tags: no-parallel
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
@ -6,26 +7,26 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# zero rows
echo 'zero rows'
for format in TSVWithNames TSVWithNamesAndTypes CSVWithNames; do
for format in TSVWithNames TSVWithNamesAndTypes CSVWithNames CSVWithNamesAndTypes JSONCompactEachRowWithNames JSONCompactEachRowWithNamesAndTypes JSONCompactStringsEachRow JSONCompactStringsEachRowWithNamesAndTypes; do
echo $format
${CLICKHOUSE_LOCAL} --query="
CREATE TABLE ${format}_01375 ENGINE File($format, '01375_$format.tsv') AS SELECT * FROM numbers(1) WHERE number < 0;
CREATE TABLE ${format}_01375 ENGINE File($format, '01375_$format') AS SELECT * FROM numbers(1) WHERE number < 0;
SELECT * FROM ${format}_01375;
DROP TABLE ${format}_01375;
"
rm 01375_$format.tsv
rm 01375_$format
done
# run multiple times to the same file
echo 'multi clickhouse-local one file'
for format in TSVWithNames TSVWithNamesAndTypes CSVWithNames; do
for format in TSVWithNames TSVWithNamesAndTypes CSVWithNames CSVWithNamesAndTypes JSONCompactEachRowWithNames JSONCompactEachRowWithNamesAndTypes JSONCompactStringsEachRow JSONCompactStringsEachRowWithNamesAndTypes; do
echo $format
for _ in {1..2}; do
${CLICKHOUSE_LOCAL} --query="
CREATE TABLE ${format}_01375 ENGINE File($format, '01375_$format.tsv') AS SELECT * FROM numbers(1);
CREATE TABLE ${format}_01375 ENGINE File($format, '01375_$format') AS SELECT * FROM numbers(1);
SELECT * FROM ${format}_01375;
DROP TABLE ${format}_01375;
"
done
rm 01375_$format.tsv
rm 01375_$format
done

Some files were not shown because too many files have changed in this diff.